{-# LANGUAGE CPP #-}---------------------------------------------------------------------- |-- Module : Text.Regex.PCRE.Light-- Copyright: Copyright (c) 2007-2008, Don Stewart-- License : BSD3---- Maintainer: Don Stewart <dons@galois.com>-- Stability : experimental-- Portability: H98 + CPP------------------------------------------------------------------------ -- A simple, portable binding to perl-compatible regular expressions-- (PCRE) via strict ByteStrings.--moduleText.Regex.PCRE.Light(-- * The abstract PCRE Regex typeRegex-- * ByteString interface,compile,compileM,match-- * Regex types and constructors externally visible-- ** PCRE compile-time bit flags,PCREOption,anchored,auto_callout{-, bsr_anycrlf-}{-, bsr_unicode-},caseless,dollar_endonly,dotall,dupnames,extended,extra,firstline,multiline{-, newline_any-}{-, newline_anycrlf-},newline_cr,newline_crlf,newline_lf,no_auto_capture,ungreedy,utf8,no_utf8_check-- ** PCRE exec-time bit flags,PCREExecOption,exec_anchored{-, exec_newline_any -}{-, exec_newline_anycrlf -},exec_newline_cr,exec_newline_crlf,exec_newline_lf,exec_notbol,exec_noteol,exec_notempty,exec_no_utf8_check,exec_partial)whereimportText.Regex.PCRE.Light.Base-- StringsimportqualifiedData.ByteStringasS#if __GLASGOW_HASKELL__ >= 608importqualifiedData.ByteString.InternalasSimportqualifiedData.ByteString.UnsafeasS#elseimportqualifiedData.ByteString.BaseasS#endif-- ForeignsimportForeignimportForeign.PtrimportForeign.C.TypesimportForeign.C.StringimportForeign.StorableimportForeign.Marshal.Alloc-- | 'compile'---- Compile a perl-compatible regular expression stored in a strict bytestring.---- An example---- > let r = compile (pack "^(b+|a){1,2}?bc") []---- Or using GHC's -XOverloadedStrings flag, and importing-- Data.ByteString.Char8, we can avoid the pack:---- > let r = compile "^(b+|a){1,2}?bc" []---- If the regular expression is invalid, an exception is thrown.-- If this is unsuitable, 'compileM' is availlable, which returns failure -- in a monad.---- To do case insentive matching,---- > compile "^(b+|a){1,2}?bc" [caseless]---- Other flags are documented below.---- The resulting abstract regular expression can be passed to 'match'-- for matching against a subject string.---- The arguments are:---- * 'pat': A ByteString containing the regular expression to be compiled. ---- * 'flags', optional bit flags. If 'Nothing' is provided, defaults are used.---- Valid compile-time flags are:---- * 'anchored' - Force pattern anchoring---- * 'auto_callout' - Compile automatic callouts---- * 'bsr_anycrlf' - \\R matches only CR, LF, or CRLF---- * 'bsr_unicode' - \\R matches all Unicode line endings---- * 'caseless' - Do caseless matching---- * 'dollar_endonly' - '$' not to match newline at end---- * 'dotall' - matches anything including NL---- * 'dupnames' - Allow duplicate names for subpatterns---- * 'extended' - Ignore whitespace and # comments---- * 'extra' - PCRE extra features (not much use currently)---- * 'firstline' - Force matching to be before newline---- * 'multiline' - '^' and '$' match newlines within data---- * 'newline_any' - Recognize any Unicode newline sequence---- * 'newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences---- * 'newline_cr' - Set CR as the newline sequence---- * 'newline_crlf' - Set CRLF as the newline sequence---- * 'newline_lf' - Set LF as the newline sequence---- * 'no_auto_capture' - Disable numbered capturing parentheses (named ones available)---- * 'ungreedy' - Invert greediness of quantifiers---- * 'utf8' - Run in UTF-8 mode---- * 'no_utf8_check' - Do not check the pattern for UTF-8 validity---- The regex is allocated via malloc on the C side, and will be-- deallocated by the runtime when the Haskell value representing it-- goes out of scope.---- See 'man pcreapi for more details.---- Caveats: patterns with embedded nulls, such as "\0*" seem to be-- mishandled, as this won't currently match the subject "\0\0\0".--compile::S.ByteString->[PCREOption]->Regexcompileso=casecompileMsoofRightr->rLefte->error("Text.Regex.PCRE.Light: Error in regex: "++e)-------------------------------------------------------------------------- | 'compileM'-- A safe version of 'compile' with failure wrapped in an Either.---- Examples,---- > > compileM ".*" [] :: Either String Regex-- > Right (Regex 0x000000004bb5b980 ".*")---- > > compileM "*" [] :: Either String Regex-- > Left "nothing to repeat"--compileM::S.ByteString->[PCREOption]->EitherStringRegexcompileMstros=unsafePerformIO$S.useAsCStringstr$\pattern->doalloca$\errptr->doalloca$\erroffset->dopcre_ptr<-c_pcre_compilepattern(combineOptionsos)errptrerroffsetnullPtrifpcre_ptr==nullPtrthendoerr<-peekCString=<<peekerrptrreturn(Lefterr)elsedoreg<-newForeignPtrfinalizerFreepcre_ptr-- release with free()return(Right(Regexregstr))-- Possible improvements: an 'IsString' instance could be defined-- for 'Regex', which would allow the compiler to insert calls to-- 'compile' based on the type:---- The following would be valid:---- > match "a.*b" "abcdef" []---- and equivalent to:---- > match (either error id (compile "a.*b")) "abcdef" []-- | 'match'---- Matches a compiled regular expression against a given subject string,-- using a matching algorithm that is similar to Perl's. If the subject-- string doesn't match the regular expression, 'Nothing' is returned,-- otherwise the portion of the string that matched is returned, along-- with any captured subpatterns.---- The arguments are:---- * 'regex', a PCRE regular expression value produced by compile---- * 'subject', the subject string to match against---- * 'options', an optional set of exec-time flags to exec.---- Available runtime options are:---- * 'exec_anchored' - Match only at the first position---- * 'exec_newline_any' - Recognize any Unicode newline sequence---- * 'exec_newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences---- * 'exec_newline_cr' - Set CR as the newline sequence---- * 'exec_newline_crlf' - Set CRLF as the newline sequence---- * 'exec_newline_lf' - Set LF as the newline sequence---- * 'exec_notbol' - Subject is not the beginning of a line---- * 'exec_noteol' - Subject is not the end of a line---- * 'exec_notempty' - An empty string is not a valid match---- * 'exec_no_utf8_check' - Do not check the subject for UTF-8---- * 'exec_partial' - Return PCRE_ERROR_PARTIAL for a partial match---- The result value, and any captured subpatterns, are returned.-- If the regex is invalid, or the subject string is empty, Nothing-- is returned.--match::Regex->S.ByteString->[PCREExecOption]->Maybe[S.ByteString]match(Regexpcre_fp_)subjectos=unsafePerformIO$dowithForeignPtrpcre_fp$\pcre_ptr->don_capt<-capturedCountpcre_ptr-- The smallest size for ovector that will allow for n captured-- substrings, in addition to the offsets of the substring-- matched by the whole pattern, is (n+1)*3. (man pcreapi)letovec_size=(n_capt+1)*3ovec_bytes=ovec_size*size_of_cintallocaBytesovec_bytes$\ovec->dolet(str_fp,off,len)=S.toForeignPtrsubjectwithForeignPtrstr_fp$\cstr->dor<-c_pcre_execpcre_ptrnullPtr(cstr`plusPtr`off)-- may contain binary zero bytes.(fromIntegrallen)0(combineExecOptionsos)ovec(fromIntegralovec_size)ifr<0-- errors, or error_no_matchthenreturnNothingelseletloopnoacc=ifn==rthenreturn(Just(reverseacc))elsedoi<-peekElemOffovec$!oj<-peekElemOffovec(o+1)lets=substringijsubjects`seq`loop(n+1)(o+2)(s:acc)inloop00[]-- The first two-thirds of ovec is used to pass back captured-- substrings When a match is successful, information about captured-- substrings is returned in pairs of integers, starting at the-- beginning of ovector, and continuing up to two-thirds of its length at-- the most. The first pair, ovector[0] and ovector[1], identify the-- portion of the subject string matched by the entire pattern. The next-- pair is used for the first capturing subpattern, and so on. The-- value returned by pcre_exec() is one more than the highest num- bered-- pair that has been set. For example, if two sub- strings have been-- captured, the returned value is 3. where-- The first element of a pair is set to the offset of the first-- character in a substring, and the second is set to the offset of the-- first character after the end of a substring.substring::CInt->CInt->S.ByteString->S.ByteStringsubstringxy_|x==y=S.empty-- XXX an unset subpatternsubstringabs=end-- note that we're not checking...wherestart=S.unsafeDrop(fromIntegrala)send=S.unsafeTake(fromIntegral(b-a))start-- use pcre_info to work out how many substrings to reserve space forcapturedCount::PtrPCRE->IOIntcapturedCountregex_ptr=alloca$\n_ptr->do-- (st :: Ptr CInt)c_pcre_fullinforegex_ptrnullPtrinfo_capturecountn_ptrreturn.fromIntegral=<<peek(n_ptr::PtrCInt)