{-# LANGUAGE RankNTypes #-}{-# LANGUAGE GeneralizedNewtypeDeriving #-}{-# LANGUAGE StandaloneDeriving #-}{-# LANGUAGE RecordWildCards #-}{-# LANGUAGE DeriveDataTypeable #-}-- | This program compares two Infernal covariance models with each other.-- Based on the Infernal CM scoring mechanism, a Link sequence and Link score-- are calculated. The Link sequence is defined as the sequence scoring highest-- in both models simultanuously.---- The complete algorithm is described in:---- "Christian Höner zu Siederdissen, and Ivo L. Hofacker. 2010. Discriminatory-- power of RNA family models. Bioinformatics 26, no. 18: 453–59."---- <http://bioinformatics.oxfordjournals.org/content/26/18/i453.long>-------- NOTE always use coverage analysis to find out, if we really used all code-- paths (in long models, if a path is not taken, there is a bug)-- NOTE when comparing hits with cmsearch, use the following commandline:---- cmsearch --no-null3 --cyk --fil-no-hmm --fil-no-qdb---- --no-null3 : important, the test sequence is so short that null3 can easily-- generate scores that are way off! remember, we are interested in a sequence-- that is typically embedded in something large---- --fil-no-hmm, --fil-no-qdb: do not use heuristics for speedup, they-- sometimes hide results (in at least one case)---- (--toponly): if the comparison was done onesided---- (-g): if you want to compare globallymoduleBioInf.CMComparewhereimportControl.Arrow(first,second,(***))importControl.LensimportControl.MonadimportData.Array.IArrayimportData.List(maximumBy,nub,sort)importqualifiedData.MapasMimportSystem.Console.CmdArgsimportSystem.Environment(getArgs)importText.PrintfimportBiobase.PrimaryimportBiobase.SElab.CMimportBiobase.SElab.CM.ImportimportBiobase.SElab.Types-- * optimization functions-- | Type of the optimization functions.typeOpta=(CM->StateID->a-- E,CM->StateID->BitScore->a->a-- lbegin,CM->StateID->BitScore->a->a-- S,CM->StateID->BitScore->a->a-- D,CM->StateID->BitScore->(Char,Char,BitScore)->a->a-- MP,CM->StateID->BitScore->(Char,BitScore)->a->a-- ML,CM->StateID->BitScore->(Char,BitScore)->a->a-- IL,CM->StateID->BitScore->(Char,BitScore)->a->a-- MR,CM->StateID->BitScore->(Char,BitScore)->a->a-- IR,CM->StateID->a->a->a-- B,[(a,a)]->[(a,a)]-- optimization,a->String-- finalize, make pretty for output)-- | Calculates the cyk optimal score over both models.cykMaxiMin::OptBitScorecykMaxiMin=(end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize)whereend__=0lbegin__ts=t+sstart__ts=t+sdelete__ts=t+smatchP__t(_,_,e)s=t+e+smatchL__t(_,e)s=t+e+sinsertL__t(_,e)s=t+e+smatchR__t(_,e)s=t+e+sinsertR__t(_,e)s=t+e+sbranch__st=s+topt[]=[]optxs=[maximumBy(\(a,b)(c,d)->(minab)`compare`(mincd))xs]-- (xs `using` parList rdeepseq)]finalizes=shows-- | Return the nucleotide sequence leading to the score. uses an optional-- endmarker to denote end states. the string is the same for both models. this-- is the only Opt function, currently, for which this is true.rnaString::Bool->Opt[Char]rnaStringendmarker=(end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize)whereend__=['N'|endmarker]lbegin___s=sstart___s=sdelete___s=smatchP___(k1,k2,_)s=[k1]++s++[k2]matchL___(k,_)s=k:sinsertL___(k,_)s=k:smatchR___(k,_)s=s++[k]insertR___(k,_)s=s++[k]branch__st=s++topt=idfinalizes=ifendmarkerthenconcatMapfselseconcatMapshowsfx|x=='N'="_"|otherwise=showx-- | Dotbracket notation, again with an endmarker, to see the secondary-- structure corresponding to the rnastring.dotBracket::Bool->OptStringdotBracketendmarker=(end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize)whereend__=['_'|endmarker]lbegin___s=sstart___s=sdelete___s=smatchP____s="("++s++")"matchL____s='.':sinsertL____s=',':smatchR____s=s++"."insertR____s=s++","branch__st=s++topt=idfinalizes=s-- | Show the nodes which were visited to get the score. the last node can-- occur multiple times. if it does, local end transitions were used.visitedNodes::Opt[NodeID]visitedNodes=(end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize)whereendcmk=[((cm^.states)M.!k)^.nodeID]lbegincmk_s=sstartcmk_s=((cm^.states)M.!k)^.nodeID:sdeletecmk_s=((cm^.states)M.!k)^.nodeID:smatchPcmk__s=((cm^.states)M.!k)^.nodeID:smatchLcmk__s=((cm^.states)M.!k)^.nodeID:sinsertLcmk__s=((cm^.states)M.!k)^.nodeID:smatchRcmk__s=((cm^.states)M.!k)^.nodeID:sinsertRcmk__s=((cm^.states)M.!k)^.nodeID:sbranchcmkst=((cm^.states)M.!k)^.nodeID:(s++t)opt=id-- NOTE do not sort, do not nub !finalizexs=(show$mapunNodeIDxs)-- NOTE do not sort, do not nub !-- | Detailed output of the different states, that were visited.extendedOutput::OptStringextendedOutput=(end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize)whereendcmsid=printf"E %5d %5d"(unStateIDsid)(unNodeID$((cm^.states)M.!sid)^.nodeID)lbegincmsidts=printf"lbegin %5d %5d %7.3f \n%s"(unStateIDsid)(unNodeID$((cm^.states)M.!sid)^.nodeID)(unBitScoret)sstartcmsidts=printf"S %5d %5d %7.3f \n%s"(unStateIDsid)(unNodeID$((cm^.states)M.!sid)^.nodeID)(unBitScoret)sdeletecmsidts=printf"D %5d %5d %7.3f \n%s"(unStateIDsid)(unNodeID$((cm^.states)M.!sid)^.nodeID)(unBitScoret)smatchPcmsidt(k1,k2,e)s=printf"MP %5d %5d %7.3f %7.3f %1s %1s\n%s"(unStateIDsid)(unNodeID$((cm^.states)M.!sid)^.nodeID)(unBitScoret)(unBitScoree)(showk1)(showk2)smatchLcmsidt(k,e)s=printf"ML %5d %5d %7.3f %7.3f %1s\n%s"(unStateIDsid)(unNodeID$((cm^.states)M.!sid)^.nodeID)(unBitScoret)(unBitScoree)(showk)sinsertLcmsidt(k,e)s=printf"IL %5d %5d %7.3f %7.3f %1s\n%s"(unStateIDsid)(unNodeID$((cm^.states)M.!sid)^.nodeID)(unBitScoret)(unBitScoree)(showk)smatchRcmsidt(k,e)s=printf"MR %5d %5d %7.3f %7.3f %1s\n%s"(unStateIDsid)(unNodeID$((cm^.states)M.!sid)^.nodeID)(unBitScoret)(unBitScoree)(showk)sinsertRcmsidt(k,e)s=printf"IR %5d %5d %7.3f %7.3f %1s\n%s"(unStateIDsid)(unNodeID$((cm^.states)M.!sid)^.nodeID)(unBitScoret)(unBitScoree)(showk)sbranchcmsidst=printf"B %5d %5d\n%s\n%s"(unStateIDsid)(unNodeID$((cm^.states)M.!sid)^.nodeID)stopt=idfinalizes="\nLabel State Node Trans Emis\n\n"++s-- | Algebra product operation.(<*>)::Eqa=>Opta->Optb->Opt(a,b)algA<*>algB=(end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize)where(endA,lbeginA,startA,deleteA,matchPA,matchLA,insertLA,matchRA,insertRA,branchA,optA,finalizeA)=algA(endB,lbeginB,startB,deleteB,matchPB,matchLB,insertLB,matchRB,insertRB,branchB,optB,finalizeB)=algBendcmk=(endAcmk,endBcmk)lbegincmkt(sA,sB)=(lbeginAcmktsA,lbeginBcmktsB)startcmkt(sA,sB)=(startAcmktsA,startBcmktsB)deletecmkt(sA,sB)=(deleteAcmktsA,deleteBcmktsB)matchPcmkte(sA,sB)=(matchPAcmktesA,matchPBcmktesB)matchLcmkte(sA,sB)=(matchLAcmktesA,matchLBcmktesB)insertLcmkte(sA,sB)=(insertLAcmktesA,insertLBcmktesB)matchRcmkte(sA,sB)=(matchRAcmktesA,matchRBcmktesB)insertRcmkte(sA,sB)=(insertRAcmktesA,insertRBcmktesB)branchcmk(sA,sB)(tA,tB)=(branchAcmksAtA,branchBcmksBtB)optxs=[((xl1,xl2),(xr1,xr2))|(xl1,xr1)<-nub$optA[(yl1,yr1)|((yl1,yl2),(yr1,yr2))<-xs],(xl2,xr2)<-optB[(yl2,yr2)|((yl1,yl2),(yr1,yr2))<-xs,(yl1,yr1)==(xl1,xr1)]]finalize(sA,sB)=finalizeAsA++"\n"++finalizeBsB-- * The grammar for CM comparison.-- | Recursion in two CMs simultanously.recurse::Bool->Opta->CM->CM->Array(StateID,StateID)[(a,a)]recursefastIns(end,lbegin,start,delete,matchP,matchL,insertL,matchR,insertR,branch,opt,finalize)m1m2=locarrwherelock1k2|otherwise=opt$dor<-arr!(k1,k2)return$(lbeginm1k1lb1***lbeginm2k2lb2)rwherelb1=M.findWithDefault(BitScore(-10000))k1(m1^.localBegin)lb2=M.findWithDefault(BitScore(-10000))k2(m2^.localBegin)reck1k2=letxyz=rec'k1k2inxyz-- traceShow ("rec",k1,((m1^.states) M.! k1) ^. stateType,k2,((m2^.states) M.! k2) ^. stateType) xyzrec'k1k2--|t1==E&&t2==E=[(endm1k1,endm2k2)]--|t1==S&&t2==S=opt$do(c1,tr1)<-s1^.transitions++[(ls1,le1)](c2,tr2)<-s2^.transitions++[(ls2,le2)]r<-arr!(c1,c2)return$(startm1k1tr1***startm2k2tr2)r|t1==D&&t2==D=opt$do(c1,tr1)<-s1^.transitions++[(ls1,le1)](c2,tr2)<-s2^.transitions++[(ls2,le2)]r<-arr!(c1,c2)return$(deletem1k1tr1***deletem2k2tr2)r-- match pair emitting states|t1==MP&&t2==MP=opt$do(c1,tr1)<-s1^.transitions++[(ls1,le1)](c2,tr2)<-s2^.transitions++[(ls2,le2)](e1,e2)<-zip(s1^.emits^.pair)(s2^.emits^.pair)r<-arr!(c1,c2)return$(matchPm1k1tr1e1***matchPm2k2tr2e2)r-- match left emitting states|t1`elem`lstates&&t2`elem`lstates=opt$do(c1,tr1)<-s1^.transitions++[(ls1,le1)](c2,tr2)<-s2^.transitions++[(ls2,le2)]guard$(notfastIns&&(c1/=k1||c2/=k2))||(fastIns&&c1/=k1&&c2/=k2)(e1,e2)<-zip(s1^.emits^.single)(s2^.emits^.single)r<-arr!(c1,c2)letf=ift1==MLthenmatchLelseinsertLletg=ift2==MLthenmatchLelseinsertLreturn$(fm1k1tr1e1***gm2k2tr2e2)r-- match right emitting states|t1`elem`rstates&&t2`elem`rstates=opt$do(c1,tr1)<-s1^.transitions++[(ls1,le1)](c2,tr2)<-s2^.transitions++[(ls2,le2)]guard$(notfastIns&&(c1/=k1||c2/=k2))||(fastIns&&c1/=k1&&c2/=k2)(e1,e2)<-zip(s1^.emits^.single)(s2^.emits^.single)r<-arr!(c1,c2)letf=ift1==MRthenmatchRelseinsertRletg=ift2==MRthenmatchRelseinsertRreturn$(fm1k1tr1e1***gm2k2tr2e2)r-- if one state is E, we can only delete states, except for another S state, which will go into local end-- it is not possible to use an emitting state on the right as those would require emitting on the left, too!|t1==E&&t2`elem`[D,S]=opt$do(c2,tr2)<-s2^.transitions++[(ls2,le2)]r<-arr!(k1,c2)return$ift2==Dthensecond(deletem2k2tr2)relsesecond(startm2k2tr2)r-- the other way around with D,E|t1`elem`[D,S]&&t2==E=opt$do(c1,tr1)<-s1^.transitions++[(ls1,le2)]r<-arr!(c1,k2)return$ift1==Dthenfirst(deletem1k1tr1)relsefirst(startm1k1tr1)r-- two branching states|t1==B&&t2==B=opt$let[(l1,_),(r1,_)]=s1^.transitions[(l2,_),(r2,_)]=s2^.transitionsin-- both branches are matcheddo(s1,s2)<-arr!(l1,l2)-- left branch (m1,m2)(t1,t2)<-arr!(r1,r2)-- right branch (m1,m2)return(branchm1k1s1t1,branchm2k2s2t2)-- (m1,m2)++do(t1,s2)<-arr!(r1,l2)-- match right branch of m1 with left branch of m2-- local ends for other branchesx<-arr!(ls1,ls2)let(s1,t2)=(deletem1l1le1***deletem2l2le2)xreturn(branchm1k1s1t1,branchm2k2s2t2)++do(s1,t2)<-arr!(l1,r2)x<-arr!(ls1,ls2)let(t1,s2)=(deletem1l1le1***deletem2l2le2)xreturn(branchm1k1s1t1,branchm2k2s2t2)-- branch - non-branch|t1==B&&t2/=B=opt$let[(l,_),(r,_)]=s1^.transitionsindo(s1,s2)<-arr!(l,k2)-- left branch and m2x<-arr!(ls1,ls2)-- dont do anything for ls2, since we do not have to-- delete a branch in model 2.let(t1,t2)=first(deletem1rle1)xreturn(branchm1k1s1t1,branchm2k2s2t2)++do(t1,t2)<-arr!(r,k2)-- right branch and m2x<-arr!(ls1,ls2)let(s1,s2)=first(deletem1lle1)x-- delete left branch in m1return(branchm1k1s1t1,branchm2k2s2t2)-- branch - non-branch|t1/=B&&t2==B=opt$let[(l,_),(r,_)]=s2^.transitionsindo(s1,s2)<-arr!(k1,l)x<-arr!(ls1,ls2)let(t1,t2)=second(deletem2rle2)xreturn(branchm1k1s1t1,branchm2k2s2t2)++do(t1,t2)<-arr!(k1,r)x<-arr!(ls1,ls2)let(s1,s2)=second(deletem2lle2)xreturn(branchm1k1s1t1,branchm2k2s2t2)-- S state versus any|t1==S=opt$do(c1,tr1)<-s1^.transitions++[(ls1,le1)]r<-arr!(c1,k2)return$first(startm1k1tr1)r-- S state versus any|t2==S=opt$do(c2,tr2)<-s2^.transitions++[(ls2,le2)]r<-arr!(k1,c2)return$second(startm2k2tr2)r--|otherwise=[]wheres1=(m1^.states)M.!k1s2=(m2^.states)M.!k2t1=s1^.stateTypet2=s2^.stateTypele1=M.findWithDefault(BitScore(-10000))k1(m1^.localEnd)le2=M.findWithDefault(BitScore(-10000))k2(m2^.localEnd)ls1=sn1ls2=sn2lstates=[ML,IL]rstates=[MR,IR]locarr=(array((0,0),(sn1,sn2))[((k1,k2),lock1k2)|k1<-[0..sn1],k2<-[0..sn2]])arr=(array((0,0),(sn1,sn2))[((k1,k2),reck1k2)|k1<-[0..sn1],k2<-[0..sn2]])`asTypeOf`locarrsn1=fst.M.findMax$m1^.statessn2=fst.M.findMax$m2^.states