{-# LANGUAGE RecordWildCards #-}{-# LANGUAGE PatternGuards #-}moduleData.CRF.Chain1.Constrained.Train(CRF(..),train)whereimportControl.Applicative((<$>),(<*>))importSystem.IO(hSetBuffering,stdout,BufferMode(..))importData.Binary(Binary,put,get)importqualifiedData.SetasSimportqualifiedData.MapasMimportqualifiedData.VectorasVimportqualifiedNumeric.SGDasSGDimportqualifiedNumeric.SGD.LogSignedasLimportData.CRF.Chain1.Constrained.Dataset.InternalimportData.CRF.Chain1.Constrained.Dataset.External(SentL,unknown,unDist)importData.CRF.Chain1.Constrained.Dataset.Codec(mkCodec,Codec,encodeDataL,encodeLabels)importData.CRF.Chain1.Constrained.Feature(Feature,featuresIn)importData.CRF.Chain1.Constrained.Model(Model(..),mkModel,FeatIx(..),featToJustInt)importData.CRF.Chain1.Constrained.Inference(accuracy,expectedFeaturesIn)-- | A conditional random field model with additional codec used for-- data encoding.dataCRFab=CRF{-- | The codec is used to transform data into internal representation,-- where each observation and each label is represented by a unique-- integer number.codec::Codecab,-- | The actual model, which is a map from 'Feature's to potentials.model::Model}instance(Orda,Ordb,Binarya,Binaryb)=>Binary(CRFab)whereputCRF{..}=putcodec>>putmodelget=CRF<$>get<*>get-- | Train the CRF using the stochastic gradient descent method.-- The resulting model will contain features extracted with-- the user supplied extraction function.-- You can use the functions provided by the "Data.CRF.Chain1.Feature.Present"-- and "Data.CRF.Chain1.Feature.Hidden" modules for this purpose.-- When the evaluation data 'IO' action is 'Just', the iterative-- training process will notify the user about the current accuracy-- on the evaluation part every full iteration over the training part.-- TODO: Accept custom r0 construction function.train::(Orda,Ordb)=>SGD.SgdArgs-- ^ Args for SGD->IO[SentLab]-- ^ Training data 'IO' action->Maybe(IO[SentLab])-- ^ Maybe evalation data->(AVecLb->[(Xs,Ys)]->[Feature])-- ^ Feature selection->IO(CRFab)-- ^ Resulting modeltrainsgdArgstrainIOevalIO'MaybeextractFeats=dohSetBufferingstdoutNoBuffering(_codec,trainData)<-mkCodec<$>trainIO_r0<-encodeLabels_codec.S.toList.unkSet<$>trainIOevalDataM<-caseevalIO'MaybeofJustevalIO->Just.encodeDataL_codec<$>evalIONothing->returnNothingletfeats=extractFeats_r0trainDatacrf=(mkModelfeats){r0=_r0}para<-SGD.sgdMsgdArgs(notifysgdArgscrftrainDataevalDataM)(gradOncrf)(V.fromListtrainData)(valuescrf)return$CRF_codec(crf{values=para})-- | Collect labels assigned to unknown words (with empty list-- of potential interpretations).unkSet::Ordb=>[SentLab]->S.SetbunkSet=S.fromList.concatMaponSentwhereonSent=concatMaponWordonWordword|unknown(fstword)=M.keys.unDist.snd$word|otherwise=[]gradOn::Model->SGD.Para->(Xs,Ys)->SGD.GradgradOncrfpara(xs,ys)=SGD.fromLogList$[(featToJustIntcurrfeat,L.fromPosval)|(feat,val)<-featuresInxsys]++[(ix,L.fromNegval)|(FeatIxix,val)<-expectedFeaturesIncurrxs]wherecurr=crf{values=para}notify::SGD.SgdArgs->Model->[(Xs,Ys)]->Maybe[(Xs,Ys)]->SGD.Para->Int->IO()notifySGD.SgdArgs{..}crftrainDataevalDataMparak|doneTotalk==doneTotal(k-1)=putStr"."|JustdataSet<-evalDataM=doletx=accuracy(crf{values=para})dataSetputStrLn("\n"++"["++show(doneTotalk)++"] f = "++showx)|otherwise=putStrLn("\n"++"["++show(doneTotalk)++"] f = #")wheredoneTotal::Int->IntdoneTotal=floor.donedone::Int->Doubledonei=fromIntegral(i*batchSize)/fromIntegraltrainSizetrainSize=lengthtrainData