{-# LANGUAGE TemplateHaskell, QuasiQuotes, TupleSections, ViewPatterns #-}------------------------------------------------------------------------------- |-- Module : Text.Regex.PCRE.Rex-- Copyright : (c) Michael Sloan 2011---- Maintainer : Michael Sloan (mgsloan@gmail.com)-- Stability : unstable-- Portability : unportable-- -- This module provides a template Haskell quasiquoter for regular-- expressions, which provides the following features:-- -- 1) Compile-time checking that the regular expression is valid.-- -- 2) Arity of resulting tuple based on the number of selected capture patterns-- in the regular expression.---- 3) Allows for the inline interpolation of mapping functions :: String -> a.---- 4) Precompiles the regular expression at compile time, by calling into the-- PCRE library and storing a 'ByteString' literal representation of its state.---- 5) Compile-time configurable to use different PCRE options, turn off-- precompilation, use 'ByteString's, or set a default mapping expression.---- Since this is a quasiquoter library that generates code using view patterns,-- the following extensions are required:---- > {-# LANGUAGE TemplateHaskell, QuasiQuotes, ViewPatterns #-}---- Here's a silly example which parses peano numbers of the form Z, S Z,-- S S Z, etc. The \s+ means that it is not sensitive to the quantity or type-- of seperating whitespace. (these examples can also be found in Test.hs)---- > peano :: String -> Maybe Int-- > peano = [rex|^(?{ length . filter (=='S') } \s* (?:S\s+)*Z)\s*$|]---- > *Main> peano "Z"-- > Just 0-- > *Main> peano "S Z"-- > Just 1-- > *Main> peano "S S Z"-- > Just 2-- > *Main> peano "S S S Z"-- > Just 3-- > *Main> peano "invalid"-- > Nothing---- The token \"(?{\" introduces a capture group which has a mapping applied to-- the -- result - in this case \"length . filter (=='S')\". If the ?{ ... }-- are omitted, then the capture group is not taken as part of the results of-- the match. If the contents of the ?{ ... } is omitted, then 'id' is assumed:---- > parsePair :: String -> Maybe (String, String)-- > parsePair = [rex|^<\s* (?{ }[^\s,>]+) \s*,\s* (?{ }[^\s,>]+) \s*>$|]---- The following example is derived from-- http://www.regular-expressions.info/dates.html---- > parseDate :: String -> Maybe (Int, Int, Int)-- > parseDate [rex|^(?{ read -> y }(?:19|20)\d\d)[- /.]-- > (?{ read -> m }0[1-9]|1[012])[- /.]-- > (?{ read -> d }0[1-9]|[12][0-9]|3[01])$|]-- > | (d > 30 && (m `elem` [4, 6, 9, 11]))-- > || (m == 2 &&-- > (d == 29 && not (mod y 4 == 0 && (mod y 100 /= 0 || mod y 400 == 0)))-- > || (d > 29)) = Nothing-- > | otherwise = Just (y, m, d)-- > parseDate _ = Nothing---- The above example makes use of the regex quasi-quoter as a pattern matcher.-- The interpolated Haskell patterns are used to construct an implicit view-- pattern out of the inlined ones. The above pattern is expanded to the-- equivalent:---- > parseDate ([rex|^(?{ read }(?:19|20)\d\d)[- /.]-- > (?{ read }0[1-9]|1[012])[- /.]-- > (?{ read }0[1-9]|[12][0-9]|3[01])$|]-- > -> Just (y, m, d))------ Caveat: Since haskell-src-exts does not support parsing view-patterns, the-- above is implemented as a relatively naive split on \"->\". It presumes that-- the last \"->\" in the interpolated pattern seperates the pattern from an-- expression on the left. This allows for lambdas to be present in the-- expression, but prevents nesting view patterns.---- There are also a few other inelegances:---- 1) PCRE captures, unlike .NET regular expressions, yield the last capture-- made by a particular pattern. So, for example, (...)*, will only yield one-- match for '...'. Ideally these would be detected and yield an implicit [a].---- 2) Patterns with disjunction between captures ((?{f}a) | (?{g}b)) will-- provide the empty string to one of f / g. In the case of pattern-- expressions, it would be convenient to be able to map multiple captures into-- a single variable / pattern, preferring the first non-empty option. The-- general logic for this is a bit complicated, and postponed for a later-- release.---- Since pcre-light is a wrapper over a C API, the most efficient interface is-- ByteStrings, as it does not natively speak Haskell lists. The [rex| ... ]-- quasiquoter implicitely packs the input into a bystestring, and unpacks the-- results to strings before providing them to your mappers. The 'brex'-- 'QuasiQuoter' is provided for this purpose. You can also define your own-- 'QuasiQuoter' - the definitions of the default configurations are as follows:---- > rex = rexWithConf $ defaultRexConf-- > brex = rexWithConf $ defaultRexConf { rexByteString = True } -- >-- > defaultRexConf = RexConf False True "id" [PCRE.extended] []---- The first @False@ specifies to use @String@ rather than 'ByteString'. The-- @True@ argument specifies to use precompilation. -- The-- string following is the default mapping expression, used when omitted.-- Due to GHC staging restrictions, your configuration will need to be in a-- different module than its usage.---- Inspired by Matt Morrow's regexqq package:-- <http://hackage.haskell.org/packages/archive/regexqq/latest/doc/html/src/Text-Regex-PCRE-QQ.html>---- And code from Erik Charlebois's interpolatedstring-qq package:-- <http://hackage.haskell.org/packages/archive/interpolatedstring-qq/latest/doc/html/Text-InterpolatedString-QQ.html>-------------------------------------------------------------------------------moduleText.Regex.PCRE.Rex(-- * Quasiquotersrex,brex-- * Configurable QuasiQuoter,rexWithConf,RexConf(..),defaultRexConf-- * Utility,makeQuasiMultiline-- * Used by Generated Code,maybeRead,padRight)whereimportText.Regex.PCRE.PrecompileimportqualifiedText.Regex.PCRE.LightasPCREimportControl.Applicative((<$>))importControl.Arrow(first)importControl.Monad(liftM)importData.ByteString.Char8(pack,unpack,empty)importData.List(find)importData.List.Split(split,onSublist)importData.Maybe(catMaybes,listToMaybe,fromJust,isJust)importData.Char(isSpace)importSystem.IO.Unsafe(unsafePerformIO)importLanguage.Haskell.THimportLanguage.Haskell.TH.QuoteimportLanguage.Haskell.Meta.Parse{- TODO:
* Benchmark
* Target Text.Regex.Base ?
* Add unit tests
-}dataRexConf=RexConf{rexByteString::Bool,rexCompiled::Bool,rexView::String,rexPCREOpts::[PCRE.PCREOption],rexPCREExecOpts::[PCRE.PCREExecOption]}-- | Default regular expression quasiquoter for 'String's and 'ByteString's,-- respectively.rex,brex::QuasiQuoterrex=rexWithConf$defaultRexConfbrex=rexWithConf$defaultRexConf{rexByteString=True}-- | This is a 'QuasiQuoter' transformer, which allows for a whitespace-sensitive-- quasi-quoter to be broken over multiple lines. The default 'rex' and-- 'brex' functions do not need this as they are already whitespace insensitive.-- However, if you create your own configuration, which omits the 'PCRE.extended'-- parameter, then this could be useful. The leading space of each line is-- ignored, and all newlines removed.makeQuasiMultiline::QuasiQuoter->QuasiQuotermakeQuasiMultiline(QuasiQuoterabcd)=QuasiQuoter(a.pre)(b.pre)(c.pre)(d.pre)wherepre=concat.(\(x:xs)->x:map(dropWhileisSpace)xs).lines-- | Default rex configuration, which specifies that the regexes operate on-- strings, don't postprocess the matched patterns, and use 'PCRE.extended'.-- This setting causes whitespace to be nonsemantic, and ignores # comments.defaultRexConf::RexConfdefaultRexConf=RexConfFalseFalse"id"[PCRE.extended][]-- | A configureable regular-expression QuasiQuoter. Takes the options to pass-- to the PCRE engine, along with 'Bool's to flag 'ByteString' usage and-- non-compilation respecively. The provided 'String' indicates which mapping-- function to use, when one is omitted - \"(?{} ...)\".rexWithConf::RexConf->QuasiQuoterrexWithConfconf=QuasiQuoter(makeExpconf.parseIt)(makePatconf.parseIt)undefinedundefined-- Template Haskell Code Generation--------------------------------------------------------------------------------- Creates the template haskell Exp which corresponds to the parsed interpolated-- regex. This particular code mainly just handles making "read" the-- default for captures which lack a parser definition, and defaulting to making-- the parser that doesn't existmakeExp::RexConf->ParseChunks->ExpQmakeExpconf(cnt,pat,exs)=buildExpconfcntpatexs'whereexs'=map(\ix->liftM(processExpconf.snd)$find((==ix).fst)exs)[0..cnt]-- Creates the template haskell Pat which corresponds to the parsed interpolated-- regex. As well as handling the aforementioned defaulting considerations, this-- turns per-capture view patterns into a single tuple-resulting view pattern.-- -- E.g. [reg| ... (?{e1 -> v1} ...) ... (?{e2 -> v2} ...) ... |] becomes-- [reg| ... (?{e1} ...) ... (?{e2} ...) ... |] -> (v1, v2)makePat::RexConf->ParseChunks->PatQmakePatconf(cnt,pat,exs)=doviewExp<-buildExpconfcntpat$map(liftMfst)viewsreturn.ViewPviewExp.(\xs->ConP(mkName"Just")[TupPxs]).mapsnd$catMaybesviewswhereviews::[Maybe(Exp,Pat)]views=map(\ix->liftM(processView.snd)$find((==ix).fst)exs)[0..cnt]processView::String->(Exp,Pat)processViewxs=casesplitFromBack2((split.onSublist)"->"xs)of(_,[r])->onSpacer(error$"blank pattern in view: "++r)((processExpconf"",).processPat)-- View pattern(l,[_,r])->(processExpconf$concatl,processPatr)-- Included so that Haskell doesn't warn about non-exhaustive patterns-- (even though the above are exhaustive in this context)_->undefined-- Here's where the main meat of the template haskell is generated. Given the-- number of captures, the pattern string, and a list of capture expressions,-- yields the template Haskell Exp which parses a string into a tuple.buildExp::RexConf->Int->String->[MaybeExp]->ExpQbuildExpconfcntpatxs=[|letr=$(get_regex)in$(process).(flip$PCRE.matchr)$(liftRS$rexPCREExecOptsconf).$(ifrexByteStringconfthen[|id|]else[|pack|])|]whereliftRSx=[|readshown|]whereshown=showx--TODO: make sure this takes advantage of bytestring fusion stuff - is-- the right pack / unpack. Or use XOverloadedStringsget_regex|rexCompiledconf=[|unsafePerformIO(regexFromTable$!$(table_bytes))|]|otherwise=[|PCRE.compile(packpat)$(liftRSpcreOpts)|]table_bytes=[|pack$(LitE.StringL.unpack<$>runIOtable_string)|]table_string=forceMaybeMsg"Error while getting PCRE compiled representation\n"<$>precompile(packpat)pcreOptspcreOpts=rexPCREOptsconfprocess=case(nullvs,rexByteStringconf)of(True,_)->[|liftM(const())|](_,False)->[|liftM($(returnmaps).padRight""pad.mapunpack)|](_,True)->[|liftM($(returnmaps).padRightemptypad)|]pad=cnt+2maps=LamE[ListP.(WildP:)$mapVarPvs].TupE.map(uncurryAppE)-- filter out all "Nothing" exprs.map(firstfromJust).filter(isJust.fst)-- [(Expr, Variable applied to)].zipxs$mapVarEvsvs=[mkName$"v"++showi|i<-[0..cnt]]-- Parse a Haskell expression into a template Haskell ExpprocessExp::RexConf->String->ExpprocessExpconfxs=forceEitherMsg("Error while parsing capture mapper `"++xs++"'").parseExp$onSpacexs(rexViewconf)id-- Parse a Haskell pattern match into a template Haskell Pat, yielding Nothing-- for patterns which consist of just whitespace.processPat::String->PatprocessPatxs=forceEitherMsg("Error while parsing capture pattern `"++xs++"'")$parsePatxs-- Parsing-------------------------------------------------------------------------------typeParseChunk=EitherString(Int,String)typeParseChunks=(Int,String,[(Int,String)])-- Postprocesses the results of the chunk-wise parse output, into the pattern to-- be pased to the regex engine, and the interpolated parseIt::String->ParseChunksparseItxs=(cnt,concat[x|Leftx<-results],[(i,x)|Right(i,x)<-results])where(cnt,results)=parseRegex(filter(`notElem`"\r\n")xs)""(-1)-- A pair of mutually-recursive functions, one for processing the quotation-- and the other for the anti-quotation.-- TODO: add check for erroneous { }parseRegex::String->String->Int->(Int,[ParseChunk])parseRegexinpsix=caseinpof-- Disallow branch-reset capture.('(':'?':'|':_)->error"Branch reset pattern (?| not allowed in quasi-quoted regex."-- Ignore non-capturing parens / handle backslash escaping.('\\':'\\':xs)->parseRegexxs("\\\\"++s)ix('\\':'(':xs)->parseRegexxs(")\\"++s)ix('\\':')':xs)->parseRegexxs("(\\"++s)ix('(':'?':':':xs)->parseRegexxs(":?("++s)ix-- Anti-quote for processing a capture group.('(':'?':'{':xs)->mapSnd((Left$reverse('(':s)):)$parseHaskellxs""(ix+1)-- Keep track of how many capture groups we've seen.('(':xs)->parseRegexxs('(':s)(ix+1)-- Consume the regular expression contents.(x:xs)->parseRegexxs(x:s)ix[]->(ix,[Left$reverses])parseHaskell::String->String->Int->(Int,[ParseChunk])parseHaskellinpsix=caseinpof-- Escape } in the Haskell splice using a backslash.('\\':'}':xs)->parseHaskellxs('}':s)ix-- Capture accumulated antiquote, and continue parsing regex literal.('}':xs)->mapSnd((Right(ix,reverses)):)$parseRegexxs""ix-- Consume the antiquoute contents, appending to a reverse accumulator.(x:xs)->parseHaskellxs(x:s)ix[]->error"Rex haskell splice terminator, }, never found"-- Utils--------------------------------------------------------------------------------- | A possibly useful utility function - yields 'Just' x when there is a-- valid parse, and 'Nothing' otherwise.maybeRead::(Reada)=>String->MaybeamaybeRead=fmapfst.listToMaybe.readssplitFromBack::Int->[a]->([a],[a])splitFromBackixs=(reverseb,reversea)where(a,b)=splitAti$reversexsonSpace::String->a->(String->a)->aonSpacesxf|allisSpaces=x|otherwise=fs-- | Given a desired list-length, if the passed list is too short, it is padded-- with the given element. Otherwise, it trims.padRight::a->Int->[a]->[a]padRight_0_=[]padRightvi[]=replicateivpadRightvi(x:xs)=x:padRightv(i-1)xsmapSnd::(t->t2)->(t1,t)->(t1,t2)mapSndf(x,y)=(x,fy)-- From MissingH{- | Like 'forceMaybe', but lets you customize the error message raised if
Nothing is supplied. -}forceMaybeMsg::String->Maybea->aforceMaybeMsgmsgNothing=errormsgforceMaybeMsg_(Justx)=x{- | Like 'forceEither', but can raise a specific message with the error. -}forceEitherMsg::Showe=>String->Eitherea->aforceEitherMsgmsg(Leftx)=error$msg++": "++showxforceEitherMsg_(Rightx)=x