{-# OPTIONS #-}-- ------------------------------------------------------------moduleHolumbus.Crawler.HtmlwhereimportData.Function.SelectorimportData.ListimportData.MaybeimportHolumbus.Crawler.TypesimportHolumbus.Crawler.URIsimportSystem.FilePathimportText.XML.HXT.Corehiding(when,getState){- just for debugging
import qualified Debug.Trace as D
-- -}-- ------------------------------------------------------------defaultHtmlCrawlerConfig::AccumulateDocResultar->MergeDocResultsr->CrawlerConfigardefaultHtmlCrawlerConfigopop2=(setStheSysConfig(withValidateno>>>withParseHTMLyes>>>withInputEncodingisoLatin1>>>withWarningsno>>>withIgnoreNoneXmlContentsyes)>>>setSthePreRefsFilterthis>>>setStheProcessRefsgetHtmlReferences$defaultCrawlerConfigopop2)-- -------------------------------------------------------------- | Collect all HTML references to other documents within a, frame and iframe elementsgetHtmlReferences::ArrowXmla=>aXmlTreeURIgetHtmlReferences=fromLA(getRefs$<computeDocBase)wheregetRefsbase=deep(hasNameWith((`elem`["a","frame","iframe"]).localPart))>>>(getAttrValue0"href"<+>getAttrValue0"src")>>^toAbsRefbasegetDocReferences::ArrowXmla=>aXmlTreeURIgetDocReferences=fromLA(getRefs$<computeDocBase)wheregetRefsbase=multiselRefs>>^toAbsRefbasewherehasLocNamen=hasNameWith((==n).localPart)selRefenan=hasLocNameen:->getAttrValue0anselRefs=choiceA$map(uncurryselRef)names++[appletRefs,objectRefs,this:->none]names=[("img","src"),("input","src")-- input type="image" scr="...",("link","href"),("script","src")]appletRefs=hasLocName"applet":->(getAppRef$<getAppBase)wheregetAppBase=(getAttrValue0"codebase"`withDefault`".")>>^toAbsRefbasegetAppRefab=getAttrValue0"code">>^toAbsRefabobjectRefs=hasLocName"object":->none-- TODO-- | construct an absolute URI by a base URI and a possibly relative URItoAbsRef::URI->URI->URItoAbsRefbaseref=(expandURIStringref-- here >>> is normal function composition>>>fromMayberef>>>removeFragment)basewhereremoveFragmentr|"#"`isPrefixOf`path=reverse.tail$path|otherwise=rwherepath=dropWhile(/='#').reverse$r-- -------------------------------------------------------------- | Compute the base URI of a HTML page with respect to a possibly-- given base element in the head element of a html page.---- Stolen from Uwe Schmidt, http:\/\/www.haskell.org\/haskellwiki\/HXT-- and then stolen back again by Uwe from Holumbus.UtilitycomputeDocBase::ArrowXmla=>aXmlTreeStringcomputeDocBase=(((getByPath["html","head","base"]>>>getAttrValue"href"-- and compute document base with transfer uri and base)&&&getAttrValuetransferURI)>>>expandURI)`orElse`getAttrValuetransferURI-- the default: take the transfer uri-- ------------------------------------------------------------getByPath::ArrowXmla=>[String]->aXmlTreeXmlTreegetByPath=seqA.map(\n->getChildren>>>hasNamen)getHtmlTitle::ArrowXmla=>aXmlTreeStringgetHtmlTitle=getAllText$getByPath["html","head","title"]getHtmlPlainText::ArrowXmla=>aXmlTreeStringgetHtmlPlainText=getAllText$getByPath["html","body"]getAllText::ArrowXmla=>aXmlTreeXmlTree->aXmlTreeStringgetAllTextgetText'=(getText'>>>(fromLA$deepgetText)>>^(" "++)-- text parts are separated by a space)>.(concat>>>normalizeWS)-- normalize SpaceisHtmlContents::ArrowXmla=>aXmlTreeXmlTreeisHtmlContents=(getAttrValuetransferMimeType>>>isA(`elem`[text_html,application_xhtml]))`guards`thisisPdfContents::ArrowXmla=>aXmlTreeXmlTreeisPdfContents=(getAttrValuetransferMimeType>>>isA(==application_pdf))`guards`thisgetTitleOrDocName::ArrowXmla=>aXmlTreeStringgetTitleOrDocName=(getHtmlTitle>>>isA(not.null))`orElse`(getAttrValuetransferURI>>^takeFileName)isElemWithAttr::ArrowXmla=>String->String->(String->Bool)->aXmlTreeXmlTreeisElemWithAttrenanav=isElem>>>hasNameen>>>hasAttrValueanav-- ------------------------------------------------------------application_pdf::Stringapplication_pdf="application/pdf"-- -------------------------------------------------------------- | normalize whitespace by splitting a text into words and joining this together with unwordsnormalizeWS::String->StringnormalizeWS=words>>>unwords-- | take the first n chars of a string, if the input-- is too long the cut off is indicated by \"...\" at the endlimitLength::Int->String->StringlimitLengthns|lengths'<=n=s|otherwise=take(n-3)s'++"..."wheres'=take(n+1)s-- ------------------------------------------------------------