{-# OPTIONS #-}

-- ------------------------------------------------------------

module Holumbus.Crawler.Types
where

import           Control.DeepSeq
import           Control.Monad.Reader
import           Control.Monad.State
import           Control.Monad.ReaderStateIO

import           Data.Binary                    ( Binary )
import qualified Data.Binary                    as B    -- else naming conflict with put and get from Monad.State
import           Data.Function.Selector

import           Holumbus.Crawler.Constants
import           Holumbus.Crawler.URIs
import           Holumbus.Crawler.RobotTypes
import           Holumbus.Crawler.XmlArrows     ( checkDocumentStatus )

import           Text.XML.HXT.Core
import           Text.XML.HXT.Curl
import qualified Text.XML.HXT.Arrow.XmlState.RunIOStateArrow    as HXT ( theSysConfigComp )
import qualified Text.XML.HXT.Arrow.XmlState.TypeDefs           as HXT ( theInputOptions )

import           System.Log.Logger              ( Priority(..) )

-- ------------------------------------------------------------

-- | The action to combine the result of a single document with the accumulator for the overall crawler result.
-- This combining function runs in the IO monad to enable storing parts of the result externally,
-- but it is not a CrawlerAction, else parallel crawling with forkIO would no longer be applicable.

type AccumulateDocResult a r    = (URI, a) -> r -> IO r

-- | The folding operator for merging partial results when working with mapFold and parallel crawling

type MergeDocResults r          = r -> r -> IO r

-- | The operator for saving intermediate results

type SavePartialResults r       = FilePath -> r -> IO r

-- | The extractor function for a single document

type ProcessDocument a          = IOSArrow XmlTree a

-- | The crawler action monad

type CrawlerAction a r          = ReaderStateIO (CrawlerConfig a r) (CrawlerState r)
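-- A minimal usage sketch, not part of the original module: the names below
-- are illustrative. The simplest useful instantiation of the two result
-- operators collects every per-document value in a list and merges partial
-- lists by appending.

-- | Illustrative accumulator: cons each (URI, value) pair onto the result list.

accuToListExample               :: AccumulateDocResult a [(URI, a)]
accuToListExample p rs          = return (p : rs)

-- | Illustrative fold operator: merge two partial result lists.

mergeListsExample               :: MergeDocResults [(URI, a)]
mergeListsExample rs1 rs2       = return (rs1 ++ rs2)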
-- | The crawler configuration record

data CrawlerConfig a r          = CrawlerConfig
                                  { cc_sysConfig        :: SysConfig
                                  , cc_preRefsFilter    :: IOSArrow XmlTree XmlTree
                                  , cc_processRefs      :: IOSArrow XmlTree URI
                                  , cc_preDocFilter     :: IOSArrow XmlTree XmlTree
                                  , cc_processDoc       :: ProcessDocument a
                                  , cc_accumulate       :: AccumulateDocResult a r      -- result accumulation runs in the IO monad to allow storing parts externally
                                  , cc_fold             :: MergeDocResults r
                                  , cc_followRef        :: URI -> Bool
                                  , cc_addRobotsTxt     :: CrawlerConfig a r -> AddRobotsAction
                                  , cc_clickLevel       :: ! Int
                                  , cc_maxNoOfDocs      :: ! Int
                                  , cc_maxParDocs       :: ! Int
                                  , cc_maxParThreads    :: ! Int
                                  , cc_saveIntervall    :: ! Int
                                  , cc_savePathPrefix   :: ! String
                                  , cc_savePreAction    :: FilePath -> CrawlerAction a r ()     -- SavePartialResults r
                                  , cc_traceLevel       :: ! Priority
                                  , cc_traceLevelHxt    :: ! Priority
                                  }

-- | The crawler state record

data CrawlerState r             = CrawlerState
                                  { cs_toBeProcessed    :: ! URIsWithLevel
                                  , cs_alreadyProcessed :: ! URIs
                                  , cs_robots           :: ! Robots     -- is part of the state, it will grow during crawling
                                  , cs_noOfDocs         :: ! Int        -- stop crawling when this counter reaches 0, (-1) means unlimited # of docs
                                  , cs_noOfDocsSaved    :: ! Int
                                  , cs_listOfDocsSaved  :: ! [Int]
                                  , cs_resultAccu       :: ! r          -- evaluate accumulated result, else memory leaks show up
                                  , cs_resultInit       :: ! r          -- the initial value for folding results
                                  }
                                  deriving (Show)

instance (NFData r) => NFData (CrawlerState r) where
    rnf CrawlerState { cs_toBeProcessed    = a
                     , cs_alreadyProcessed = b
                     , cs_robots           = c
                     , cs_noOfDocs         = d
                     , cs_noOfDocsSaved    = e
                     , cs_listOfDocsSaved  = f
                     , cs_resultAccu       = g
                     , cs_resultInit       = h
                     }          = rnf a `seq` rnf b `seq` rnf c `seq` rnf d `seq`
                                  rnf e `seq` rnf f `seq` rnf g `seq` rnf h

instance (XmlPickler r) => XmlPickler (CrawlerState r) where
    xpickle                     = xpElem "crawler-state" $
                                  xpWrap ( \ ((d, e, f), (a, b, c, g, h)) -> CrawlerState a b c d e f g h
                                         , \ (CrawlerState a b c d e f g h) -> ((d, e, f), (a, b, c, g, h))
                                         ) $
                                  xpPair ( xpTriple
                                           ( xpAttr "no-of-docs"       xpPrim )
                                           ( xpAttr "no-of-docs-saved" xpPrim )
                                           ( xpAttr "list-of-docs-saved" $ xpList $ xpElem "saved-at" $ xpPrim )
                                         )
                                         ( xp5Tuple
                                           ( xpElem "to-be-processed"   $ xpURIsWithLevel )
                                           ( xpElem "already-processed" $ xpURIs )
                                           xpRobots
                                           xpickle
                                           xpickle
                                         )
        where
        xpURIs                  = xpWrap ( fromListURIs, toListURIs ) $
                                  xpList $ xpElem "doc" $ xpAttr "href" $ xpText
        xpURIsWithLevel         = xpWrap ( fromListURIs', toListURIs' ) $
                                  xpList $ xpElem "doc" $
                                  xpPair ( xpAttr "href" $ xpText )
                                         ( xpAttr "clicklevel" xpInt )

-- ------------------------------------------------------------

-- | selector functions for CrawlerState

theToBeProcessed                :: Selector (CrawlerState r) URIsWithLevel
theToBeProcessed                = S cs_toBeProcessed    (\ x s -> s { cs_toBeProcessed = x })

theAlreadyProcessed             :: Selector (CrawlerState r) URIs
theAlreadyProcessed             = S cs_alreadyProcessed (\ x s -> s { cs_alreadyProcessed = x })

theRobots                       :: Selector (CrawlerState r) Robots
theRobots                       = S cs_robots           (\ x s -> s { cs_robots = x })

theNoOfDocs                     :: Selector (CrawlerState r) Int
theNoOfDocs                     = S cs_noOfDocs         (\ x s -> s { cs_noOfDocs = x })

theNoOfDocsSaved                :: Selector (CrawlerState r) Int
theNoOfDocsSaved                = S cs_noOfDocsSaved    (\ x s -> s { cs_noOfDocsSaved = x })

theListOfDocsSaved              :: Selector (CrawlerState r) [Int]
theListOfDocsSaved              = S cs_listOfDocsSaved  (\ x s -> s { cs_listOfDocsSaved = x })

theResultAccu                   :: Selector (CrawlerState r) r
theResultAccu                   = S cs_resultAccu       (\ x s -> s { cs_resultAccu = x })

theResultInit                   :: Selector (CrawlerState r) r
theResultInit                   = S cs_resultInit       (\ x s -> s { cs_resultInit = x })
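-- A usage sketch, not part of the original module: a selector bundles a
-- getter and a setter, so chgS (from Data.Function.Selector) applies an
-- update function through it, e.g. bumping the document counter of a state.

-- | Illustrative helper: count one more processed document.

incrNoOfDocsExample             :: CrawlerState r -> CrawlerState r
incrNoOfDocsExample             = chgS theNoOfDocs (+ 1)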
-- | selector functions for CrawlerConfig

theSysConfig                    :: Selector (CrawlerConfig a r) SysConfig
theSysConfig                    = S cc_sysConfig        (\ x s -> s { cc_sysConfig = x })

theTraceLevel                   :: Selector (CrawlerConfig a r) Priority
theTraceLevel                   = S cc_traceLevel       (\ x s -> s { cc_traceLevel = x })

theTraceLevelHxt                :: Selector (CrawlerConfig a r) Priority
theTraceLevelHxt                = S cc_traceLevelHxt    (\ x s -> s { cc_traceLevelHxt = x })

theClickLevel                   :: Selector (CrawlerConfig a r) Int
theClickLevel                   = S cc_clickLevel       (\ x s -> s { cc_clickLevel = x })

theMaxNoOfDocs                  :: Selector (CrawlerConfig a r) Int
theMaxNoOfDocs                  = S cc_maxNoOfDocs      (\ x s -> s { cc_maxNoOfDocs = x })

theMaxParDocs                   :: Selector (CrawlerConfig a r) Int
theMaxParDocs                   = S cc_maxParDocs       (\ x s -> s { cc_maxParDocs = x })

theMaxParThreads                :: Selector (CrawlerConfig a r) Int
theMaxParThreads                = S cc_maxParThreads    (\ x s -> s { cc_maxParThreads = x })

theSaveIntervall                :: Selector (CrawlerConfig a r) Int
theSaveIntervall                = S cc_saveIntervall    (\ x s -> s { cc_saveIntervall = x })

theSavePathPrefix               :: Selector (CrawlerConfig a r) String
theSavePathPrefix               = S cc_savePathPrefix   (\ x s -> s { cc_savePathPrefix = x })

theSavePreAction                :: Selector (CrawlerConfig a r) (FilePath -> CrawlerAction a r ())      -- (SavePartialResults r)
theSavePreAction                = S cc_savePreAction    (\ x s -> s { cc_savePreAction = x })

theFollowRef                    :: Selector (CrawlerConfig a r) (URI -> Bool)
theFollowRef                    = S cc_followRef        (\ x s -> s { cc_followRef = x })

theAddRobotsAction              :: Selector (CrawlerConfig a r) (CrawlerConfig a r -> AddRobotsAction)
theAddRobotsAction              = S cc_addRobotsTxt     (\ x s -> s { cc_addRobotsTxt = x })

theAccumulateOp                 :: Selector (CrawlerConfig a r) (AccumulateDocResult a r)
theAccumulateOp                 = S cc_accumulate       (\ x s -> s { cc_accumulate = x })

theFoldOp                       :: Selector (CrawlerConfig a r) (MergeDocResults r)
theFoldOp                       = S cc_fold             (\ x s -> s { cc_fold = x })

thePreRefsFilter                :: Selector (CrawlerConfig a r) (IOSArrow XmlTree XmlTree)
thePreRefsFilter                = S cc_preRefsFilter    (\ x s -> s { cc_preRefsFilter = x })

theProcessRefs                  :: Selector (CrawlerConfig a r) (IOSArrow XmlTree URI)
theProcessRefs                  = S cc_processRefs      (\ x s -> s { cc_processRefs = x })

thePreDocFilter                 :: Selector (CrawlerConfig a r) (IOSArrow XmlTree XmlTree)
thePreDocFilter                 = S cc_preDocFilter     (\ x s -> s { cc_preDocFilter = x })

theProcessDoc                   :: Selector (CrawlerConfig a r) (IOSArrow XmlTree a)
theProcessDoc                   = S cc_processDoc       (\ x s -> s { cc_processDoc = x })

-- ------------------------------------------------------------

-- a rather boring default crawler configuration

defaultCrawlerConfig            :: AccumulateDocResult a r -> MergeDocResults r -> CrawlerConfig a r
defaultCrawlerConfig op op2     = CrawlerConfig
                                  { cc_sysConfig        = withCurl
                                                          [ (curl_user_agent,      defaultCrawlerName)
                                                          , (curl_max_time,        show $ (60 * 1000 :: Int))  -- whole transaction for reading a document must complete within 60,000 milliseconds
                                                          , (curl_connect_timeout, show $ (10 :: Int))         -- connection must be established within 10 seconds
                                                          ]
                                  , cc_preRefsFilter    = this                  -- no preprocessing for refs extraction
                                  , cc_processRefs      = none                  -- don't extract refs
                                  , cc_preDocFilter     = checkDocumentStatus   -- default: in case of errors throw away any contents
                                  , cc_processDoc       = none                  -- no document processing at all
                                  , cc_accumulate       = op                    -- combining function for result accumulating
                                  , cc_fold             = op2
                                  , cc_followRef        = const False           -- do not follow any refs
                                  , cc_addRobotsTxt     = const $ const return  -- do not add robots.txt evaluation
                                  , cc_saveIntervall    = (-1)                  -- never save an intermediate state
                                  , cc_savePathPrefix   = "/tmp/hc-"            -- the prefix for filenames into which intermediate states are saved
                                  , cc_savePreAction    = const $ return ()     -- no action before saving state
                                  , cc_clickLevel       = maxBound              -- click level set to infinity
                                  , cc_maxNoOfDocs      = (-1)                  -- maximum # of docs to be crawled, -1 means unlimited
                                  , cc_maxParDocs       = 20                    -- maximum # of docs crawled in parallel
                                  , cc_maxParThreads    = 5                     -- maximum # of threads running in parallel
                                  , cc_traceLevel       = NOTICE                -- trace level
                                  , cc_traceLevelHxt    = WARNING               -- trace level for hxt
                                  }

theInputOptions                 :: Selector (CrawlerConfig a r) Attributes
theInputOptions                 = theSysConfig
                                  >>> HXT.theSysConfigComp HXT.theInputOptions

theCrawlerName                  :: Selector (CrawlerConfig a r) String
theCrawlerName                  = theInputOptions
                                  >>> S { getS = lookupDef defaultCrawlerName curl_user_agent
                                        , setS = addEntry curl_user_agent
                                        }

theMaxTime                      :: Selector (CrawlerConfig a r) Int
theMaxTime                      = theInputOptions
                                  >>> S { getS = read . lookupDef "0" curl_max_time
                                        , setS = addEntry curl_max_time . show . (`max` 1)
                                        }

theConnectTimeout               :: Selector (CrawlerConfig a r) Int
theConnectTimeout               = theInputOptions
                                  >>> S { getS = read . lookupDef "0" curl_connect_timeout
                                        , setS = addEntry curl_connect_timeout . show . (`max` 1)
                                        }
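-- A configuration sketch, not part of the original module: it assumes the
-- illustrative accuToListExample and mergeListsExample operators from above
-- and adjusts the default configuration with the setter combinators defined
-- below (at most 1000 docs, 20 in parallel on 5 threads, save every 100 docs).

exampleConfig                   :: CrawlerConfig a [(URI, a)]
exampleConfig                   = setCrawlerTraceLevel NOTICE WARNING
                                  >>> setCrawlerMaxDocs 1000 20 5
                                  >>> setCrawlerSaveConf 100 "/tmp/hc-example-"
                                  $ defaultCrawlerConfig accuToListExample mergeListsExample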
-- ------------------------------------------------------------

-- | Add attributes for accessing documents

addSysConfig                    :: SysConfig -> CrawlerConfig a r -> CrawlerConfig a r
addSysConfig cf                 = chgS theSysConfig (>>> cf)

-- | Insert a robots no follow filter before thePreRefsFilter

addRobotsNoFollow               :: CrawlerConfig a r -> CrawlerConfig a r
addRobotsNoFollow               = chgS thePreRefsFilter (robotsNoFollow >>>)

-- | Insert a robots no index filter before thePreDocFilter

addRobotsNoIndex                :: CrawlerConfig a r -> CrawlerConfig a r
addRobotsNoIndex                = chgS thePreDocFilter (robotsNoIndex >>>)

-- | Set the log levels

setCrawlerTraceLevel            :: Priority -> Priority -> CrawlerConfig a r -> CrawlerConfig a r
setCrawlerTraceLevel l lx       = setS theTraceLevel l
                                  >>> setS theTraceLevelHxt lx

-- | Set the save interval in the config

setCrawlerSaveConf              :: Int -> String -> CrawlerConfig a r -> CrawlerConfig a r
setCrawlerSaveConf i f          = setS theSaveIntervall i
                                  >>> setS theSavePathPrefix f

-- | Set the action performed before saving the crawler state

setCrawlerSaveAction            :: (FilePath -> CrawlerAction a r ()) -> CrawlerConfig a r -> CrawlerConfig a r
setCrawlerSaveAction f          = setS theSavePreAction f

-- | Set the max # of steps (clicks) to reach a document

setCrawlerClickLevel            :: Int -> CrawlerConfig a r -> CrawlerConfig a r
setCrawlerClickLevel mcl        = setS theClickLevel mcl

-- | Set the max # of documents to be crawled
-- and the max # of documents crawled in parallel

setCrawlerMaxDocs               :: Int -> Int -> Int -> CrawlerConfig a r -> CrawlerConfig a r
setCrawlerMaxDocs mxd mxp mxt   = setS theMaxNoOfDocs mxd
                                  >>> setS theMaxParDocs mxp
                                  >>> setS theMaxParThreads mxt

-- | Set the pre hook filter executed before the hrefs are collected

setCrawlerPreRefsFilter         :: IOSArrow XmlTree XmlTree -> CrawlerConfig a r -> CrawlerConfig a r
setCrawlerPreRefsFilter f       = setS thePreRefsFilter f

-- ------------------------------------------------------------

instance (Binary r) => Binary (CrawlerState r) where
    put s                       = do
                                  B.put (getS theToBeProcessed s)
                                  B.put (getS theAlreadyProcessed s)
                                  B.put (getS theRobots s)
                                  B.put (getS theNoOfDocs s)
                                  B.put (getS theNoOfDocsSaved s)
                                  B.put (getS theListOfDocsSaved s)
                                  B.put (getS theResultAccu s)
                                  B.put (getS theResultInit s)
    get                         = do
                                  tbp <- B.get
                                  alp <- B.get
                                  rbt <- B.get
                                  mxd <- B.get
                                  mxs <- B.get
                                  lsd <- B.get
                                  acc <- B.get
                                  ini <- B.get
                                  return $ CrawlerState
                                             { cs_toBeProcessed    = tbp
                                             , cs_alreadyProcessed = alp
                                             , cs_robots           = rbt
                                             , cs_noOfDocs         = mxd
                                             , cs_noOfDocsSaved    = mxs
                                             , cs_listOfDocsSaved  = lsd
                                             , cs_resultAccu       = acc
                                             , cs_resultInit       = ini
                                             }

putCrawlerState                 :: (Binary r) => CrawlerState r -> B.Put
putCrawlerState                 = B.put

getCrawlerState                 :: (Binary r) => B.Get (CrawlerState r)
getCrawlerState                 = B.get

initCrawlerState                :: r -> CrawlerState r
initCrawlerState r              = CrawlerState
                                  { cs_toBeProcessed    = emptyURIs
                                  , cs_alreadyProcessed = emptyURIs
                                  , cs_robots           = emptyRobots
                                  , cs_noOfDocs         = 0
                                  , cs_noOfDocsSaved    = 0
                                  , cs_listOfDocsSaved  = []
                                  , cs_resultAccu       = r
                                  , cs_resultInit       = r
                                  }

-- ------------------------------------------------------------
--
-- basic crawler actions

-- | Load a component from the crawler configuration

getConf                         :: Selector (CrawlerConfig a r) v -> CrawlerAction a r v
getConf                         = asks . getS

getState                        :: Selector (CrawlerState r) v -> CrawlerAction a r v
getState                        = gets . getS

putState                        :: Selector (CrawlerState r) v -> v -> CrawlerAction a r ()
putState sel                    = modify . setS sel

modifyState                     :: Selector (CrawlerState r) v -> (v -> v) -> CrawlerAction a r ()
modifyState sel                 = modify . chgS sel

modifyStateIO                   :: Selector (CrawlerState r) v -> (v -> IO v) -> CrawlerAction a r ()
modifyStateIO sel               = modifyIO . chgM sel

-- ------------------------------------------------------------
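-- A usage sketch of the basic actions, not part of the original module: read
-- the document limit from the configuration, count one more document in the
-- state, and report whether the limit has been reached.

maxDocsReachedExample           :: CrawlerAction a r Bool
maxDocsReachedExample           = do
                                  mx <- getConf theMaxNoOfDocs
                                  modifyState theNoOfDocs (+ 1)
                                  n  <- getState theNoOfDocs
                                  return (mx >= 0 && n >= mx)

-- ------------------------------------------------------------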