---------------------------------------------------------------------------
-- |
-- Module      : Math.Statistics.Dirichlet.Density
-- Copyright   : (c) 2009-2012 Felipe Lessa
-- License     : BSD3
--
-- Maintainer  : felipe.lessa@gmail.com
-- Stability   : experimental
-- Portability : portable
--
---------------------------------------------------------------------------

module Math.Statistics.Dirichlet.Density
    ( DirichletDensity (..)
    , empty
    , fromList
    , toList
    , derive
    , cost
    ) where

import qualified Data.Vector as V
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Unboxed as U
import Control.DeepSeq (NFData (..))
import Numeric.GSL.Special.Gamma (lngamma)
import Numeric.GSL.Special.Psi (psi)

import Math.Statistics.Dirichlet.Options
import Math.Statistics.Dirichlet.Util

-- | A Dirichlet density.
newtype DirichletDensity = DD {unDD :: U.Vector Double}
    deriving (Eq)

instance Show DirichletDensity where
    showsPrec prec (DD v) =
        showParen (prec > 10) $
        showString "fromList " .
        showsPrec 11 (U.toList v)

instance Read DirichletDensity where
    -- Accept an optional surrounding pair of parentheses, then
    -- parse the "fromList [..]" form produced by 'showsPrec'.
    readsPrec p ('(':xs) = let (ys, ')':zs) = break (== ')') xs
                           in map (\(x,s) -> (x, s ++ zs)) $ readsPrec p ys
    readsPrec p xs = let [("fromList", list)] = lex xs
                     in map (\(x,s) -> (fromList x, s)) $ readsPrec p list

instance NFData DirichletDensity where
    -- The alphas live in an unboxed vector, which is already fully
    -- evaluated, so WHNF of the constructor suffices.
    rnf DD {} = ()

-- | @empty n x@ is an \"empty\" Dirichlet density with size
-- @n@ and all alphas set to @x@.
empty :: Int -> Double -> DirichletDensity
empty = (DD .) . U.replicate
{-# INLINE empty #-}

-- | @fromList xs@ constructs a Dirichlet density from a list of
-- alpha values.
fromList :: [Double] -> DirichletDensity
fromList = DD . U.fromList
{-# INLINE fromList #-}

-- | @toList d@ deconstructs a Dirichlet density to a list of
-- alpha values.
toList :: DirichletDensity -> [Double]
toList (DD xs) = U.toList xs
{-# INLINE toList #-}

-- | Derive a Dirichlet density using a maximum likelihood method
-- as described by Karplus et al (equation 26).  All training
-- vectors should have the same length, however this is not
-- verified.
derive :: DirichletDensity -> Predicate -> StepSize
       -> TrainingVectors -> Result DirichletDensity
derive (DD initial) (Pred maxIter' minDelta_ deltaSteps' _ _)
       (Step step) trainingData
    | V.length trainingData == 0 = err "empty training data"
    | U.length initial < 1       = err "empty initial vector"
    | maxIter' < 1               = err "non-positive maxIter"
    | minDelta_ < 0              = err "negative minDelta"
    | deltaSteps' < 1            = err "non-positive deltaSteps"
    | step <= 0                  = err "non-positive step"
    | step >= 1                  = err "step greater than one"
    | otherwise                  = train
    where
      err = error . ("Dirichlet.derive: " ++)

      -- Compensate the different deltaSteps.
      !minDelta' = minDelta_ * fromIntegral deltaSteps'

      -- Number of training sequences.
      !trainingSize = fromIntegral $ V.length trainingData

      -- Sums of each training sequence.
      trainingSums :: U.Vector Double
      !trainingSums = G.unstream $ G.stream $ V.map U.sum trainingData

      -- Functions that work on the alphas only (and not their logs).
      calcSumAs = U.sum . snd . U.unzip
      finish    = DD . snd . U.unzip

      -- Start training in the zero-th iteration and with
      -- infinite initial cost.  Each alpha is paired with its
      -- logarithm, which is the quantity actually updated by
      -- the gradient step.
      train = train' 1 infinity (U.sum initial) $
              U.map (\x -> (log x, x)) initial

      train' !iter !oldCost !sumAs !alphas =
          -- Reestimate alpha's.
          let !alphas'  = U.imap calculateAlphas alphas
              !psiSumAs = psi sumAs
              !psiSums  = U.sum $ U.map (\sumT -> psi $ sumT + sumAs)
                                  trainingSums
              calculateAlphas !i (!w, !a) =
                  let !s1 = trainingSize * (psiSumAs - psi a)
                      !s2 = V.sum $ V.map (\t -> psi $ t U.! i + a)
                                    trainingData
                      !w' = w + step * a * (s1 + s2 - psiSums)
                      !a' = exp w'
                  in (w', a')

              -- Recalculate constants.  The cost is only
              -- recomputed every deltaSteps' iterations.
              !sumAs'   = calcSumAs alphas'
              !calcCost = iter `mod` deltaSteps' == 0
              !cost'    = if calcCost then newCost else oldCost
                  where newCost = costWorker (snd $ U.unzip alphas') sumAs'
                                             trainingData trainingSums
              !delta    = abs (cost' - oldCost)

          -- Verify convergence.  Even with MaxIter we only stop
          -- iterating if the delta was calculated.  Otherwise we
          -- wouldn't be able to tell the caller why the delta was
          -- still big when we reached the limit.
          in case (calcCost, delta <= minDelta', iter >= maxIter') of
               (True, True, _) -> Result Delta   iter delta cost' $
                                  finish alphas'
               (True, _, True) -> Result MaxIter iter delta cost' $
                                  finish alphas'
               _               -> train' (iter+1) cost' sumAs' alphas'

-- | Cost function for deriving a Dirichlet density (equation
-- 18).  This function is minimized by 'derive'.
cost :: TrainingVectors -> DirichletDensity -> Double
cost tv (DD arr) = costWorker arr (U.sum arr) tv $
                   G.unstream $ G.stream $ V.map U.sum tv

-- | 'cost' needs to calculate the sum of all training vectors.
-- This function avoids recalculating this quantity in 'derive'
-- multiple times.  It is used by both 'cost' and 'derive'.
costWorker :: U.Vector Double -> Double -> TrainingVectors
           -> U.Vector Double -> Double
costWorker !alphas !sumAs !trainingData !trainingSums =
    let !lngammaSumAs = lngamma sumAs
        f t = U.sum $ U.zipWith w t alphas
            where w t_i a_i = lngamma (t_i + a_i)
                            - lngamma (t_i + 1)
                            - lngamma a_i
        g sumT = lngamma (sumT + 1) - lngamma (sumT + sumAs)
    in negate $ (V.sum $ V.map f trainingData)
              + (U.sum $ U.map g trainingSums)
              + lngammaSumAs * fromIntegral (U.length trainingSums)
{-# INLINE costWorker #-}