{-# OPTIONS #-}

-- ------------------------------------------------------------

module Holumbus.Crawler.Util
where

import           Control.Applicative         ( liftA2 )

import           Data.List

import qualified Text.Regex.XMLSchema.String as RE

-- ------------------------------------------------------------

-- | Create a temp file name.
--
-- The result is the prefix @s@ followed by the decimal text of @i@
-- (as produced by 'show'), left-padded with @\'0\'@ to at least @n@
-- characters.
--
-- @n@ is a /minimum/ width: when @show i@ is longer than @n@ characters
-- the text is kept in full. Cutting it down to @n@ characters would map
-- different indices to the same file name.
--
-- No special handling is done for negative @i@; the sign is simply part
-- of the padded text.

mkTmpFile :: Int -> String -> Int -> String
mkTmpFile n s i = s ++ padding ++ digits
  where
    digits  = show i
    -- 'replicate' yields the empty list for counts <= 0,
    -- so numbers that are already wide enough get no padding
    padding = replicate (n - length digits) '0'

-- ------------------------------------------------------------

-- | Simple predicate generator for filtering of URIs.
--
-- If the first predicate (@isAllowed@) holds and the second (@isDenied@)
-- does not hold, the resulting predicate holds.
-- This can be used for constructing simple URL filters.

simpleFollowRef :: (String -> Bool) -> (String -> Bool) -> (String -> Bool)
simpleFollowRef isAllowed isDenied
  -- 'liftA2' in the function applicative feeds the same argument
  -- to both predicates and combines the two results with '&&'
  = liftA2 (&&) isAllowed (not . isDenied)

-- | A convenience function that takes two lists of strings in regexp syntax.
--
-- The first list contains the patterns for the allowed strings,
-- the second one the patterns for the strings to deny.
-- Each non-empty list is joined into a single alternative
-- @(p1|p2|...)@ and the string to be tested is matched against
-- both regular expressions via 'match'.
--
-- An empty list of allowed patterns allows every string,
-- an empty list of denied patterns denies none.

simpleFollowRef' :: [String] -> [String] -> (String -> Bool)
simpleFollowRef' allowed denied
  = simpleFollowRef allowed' denied'
  where
    -- combine all patterns into one parenthesized alternative
    mkAlt :: [String] -> String
    mkAlt rs = "(" ++ intercalate "|" rs ++ ")"

    allowed'
      | null allowed = const True
      | otherwise    = match $ mkAlt allowed

    denied'
      | null denied  = const False
      | otherwise    = match $ mkAlt denied

-- ------------------------------------------------------------

-- | Match a string against a regular expression given as a string.
--
-- The expression is checked and parsed by 'parseRE', the result is
-- handed to @RE.matchRE@.

match :: String -> String -> Bool
match re = RE.matchRE (parseRE re)

-- | @RE.sed@ with a preceding syntax check of the regular expression.
--
-- 'parseRE' is only evaluated (via 'seq') for its error on a malformed
-- expression; @RE.sed@ itself is still given the expression as a string.

sed :: (String -> String) -> String -> String -> String
sed edit re = parseRE re `seq` RE.sed edit re

-- | @RE.split@ with a preceding syntax check of the regular expression,
-- done in the same way as in 'sed'.

split :: String -> String -> (String, String)
split re = parseRE re `seq` RE.split re

-- | @RE.tokenize@ with a preceding syntax check of the regular expression,
-- done in the same way as in 'sed'.

tokenize :: String -> String -> [String]
tokenize re = parseRE re `seq` RE.tokenize re

-- ------------------------------------------------------------

-- | Parse a regular expression with @RE.parseRegex@.
--
-- When @RE.isZero@ holds for the parse result, 'error' is called with
-- a message containing the source text of the expression and the text
-- delivered by @RE.errRegex@. Otherwise the parsed expression is returned.

parseRE :: String -> RE.Regex
parseRE re = check . RE.parseRegex $ re
  where
    check re'
      | RE.isZero re' = error $ "\nsyntax error in regexp: " ++ re ++ "\n" ++ RE.errRegex re'
      | otherwise     = re'

-- ------------------------------------------------------------