{-
Copyright 2009-2011 Mario Blazevic
This file is part of the Streaming Component Combinators (SCC) project.
The SCC project is free software: you can redistribute it and/or modify it under the terms of the GNU General Public
License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later
version.
SCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with SCC. If not, see
<http://www.gnu.org/licenses/>.
-}-- | Module "XML" defines primitives and combinators for parsing and manipulating XML.{-# LANGUAGE PatternGuards, FlexibleContexts, MultiParamTypeClasses, ScopedTypeVariables, Rank2Types #-}{-# OPTIONS_HADDOCK hide #-}moduleControl.Concurrent.SCC.XML(-- * Parsing XMLxmlTokens,parseXMLTokens,expandXMLEntity,XMLToken(..),-- * XML splittersxmlElement,xmlElementContent,xmlElementName,xmlAttribute,xmlAttributeName,xmlAttributeValue,xmlElementHavingTagWith)whereimportPreludehiding(takeWhile)importControl.Applicative(Alternative((<|>)))importControl.Arrow((>>>))importControl.Monad(when)importData.CharimportData.Maybe(mapMaybe)importData.Monoid(Monoid(..))importData.List(find)importData.Text(Text,pack,unpack,singleton)importqualifiedData.TextasTextimportNumeric(readDec,readHex)importData.Functor.Contravariant.Ticker(andThen,tickOne,tickWhile)importText.ParserCombinators.Incremental(Parser,anyToken,satisfy,many0,takeWhile,takeWhile1,string,option,skip,lookAhead,notFollowedBy,mapIncremental,(><),(<<|>))importControl.Monad.Coroutine(Coroutine,sequentialBinder)importControl.Concurrent.SCC.StreamsimportControl.Concurrent.SCC.Typeshiding(Parser)importControl.Concurrent.SCC.Coercions(coerce)importControl.Concurrent.SCC.Combinators(parserToSplitter,findsTrueIn)dataXMLToken=StartTag|EndTag|EmptyTag|ElementName|AttributeName|AttributeValue|EntityReference|EntityName|ProcessingInstruction|ProcessingInstructionText|Comment|CommentText|StartMarkedSectionCDATA|EndMarkedSection|DoctypeDeclaration|ErrorTokenStringderiving(Eq,Show)-- | Converts an XML entity name into the text value it represents: @expandXMLEntity \"lt\" = \"<\"@.expandXMLEntity::String->StringexpandXMLEntity"lt"="<"expandXMLEntity"gt"=">"expandXMLEntity"quot"="\""expandXMLEntity"apos"="'"expandXMLEntity"amp"="&"expandXMLEntity('#':'x':codePoint)=[chr(fst$head$readHexcodePoint)]expandXMLEntity('#':codePoint)=[chr(fst$head$readDeccodePoint)]expandXMLEntitye=error("String \""++e++"\" is not a built-in entity name.")newtypeXMLStream=XMLStream{chunk::[MarkupXMLTokenText]}deriving(Show)instanceMonoidXMLStreamwheremempty=XMLStream[]l`mappend`XMLStream[]=lXMLStream[]`mappend`r=rXMLStreaml`mappend`XMLStreamr@((Contentrc):rt)=caselastlofContentlc->XMLStream(initl++Content(mappendlcrc):rt)_->XMLStream(l++r)XMLStreaml`mappend`XMLStreamr=XMLStream(l++r)xmlParser::ParserStringXMLStreamxmlParser=many0(xmlContent<|>xmlMarkup)wherexmlContent=mapContent$takeWhile1(\x->x/="<"&&x/="&")xmlMarkup=(string"<">>((startTag<|>endTag<|>processingInstruction<|>declaration)<<|>return(XMLStream[Markup$PointerrorUnescapedContentLT,Content(singleton'<')])))<|>entityReference"&"startTag=return(XMLStream[Markup(StartStartTag),Content(singleton'<'),Markup(StartElementName)])><name><return(XMLStream[Markup(EndElementName)])><whiteSpace><attributes><option(string"/">>return(XMLStream[Markup(PointEmptyTag),Content(singleton'/')]))><whiteSpace><(string">">>return(XMLStream[Content(singleton'>'),Markup(EndStartTag)])<<|>return(XMLStream[Markup$PointunterminatedStartTag,Markup$EndStartTag]))entityReferences=strings>>(return(XMLStream[Markup(StartEntityReference),Content(packs),Markup(StartEntityName)])><name><(string";">>return(XMLStream[Markup(EndEntityName),Content(singleton';'),Markup(EndEntityReference)]))<<|>return(XMLStream[Markup$Point$errorBadEntityReference,Content(packs)]))attributes=many0(attribute><whiteSpace)attribute=return(XMLStream[Markup(StartAttributeName)])><name><return(XMLStream[Markup(EndAttributeName)])><(mapContent(string"=")<<|>(fmap(\x->XMLStream[Markup$Point$errorBadAttributex])anyToken><whiteSpace><option(mapContent$string"=")))><((string"\""<|>string"\'")>>=\quote->return(XMLStream[Content$packquote,Markup(StartAttributeValue)])><mapContent(takeWhile(/=quote))><return(XMLStream[Markup(EndAttributeValue),Content$packquote])><skip(stringquote)<<|>(anyToken>>=\q->return(XMLStream[Markup$Point$errorBadQuoteCharacterq,Content$packquote])))endTag=(string"/">>return(XMLStream[Markup(StartEndTag),Content(pack"</"),Markup(StartElementName)]))><name><return(XMLStream[Markup(EndElementName)])><whiteSpace><(string">">>return(XMLStream[Content(singleton'>'),Markup(EndEndTag)])<<|>return(XMLStream[Markup$PointunterminatedEndTag,Markup(EndEndTag)]))processingInstruction=(string"?">>return(XMLStream[Markup(StartProcessingInstruction),Content(pack"<?"),Markup(StartProcessingInstructionText)]))><upto"?>"><(string"?>">>return(XMLStream[Markup(EndProcessingInstructionText),Content(pack"?>"),Markup(EndProcessingInstruction)])<<|>return(XMLStream[Markup$PointunterminatedProcessingInstruction]))declaration=string"!">>((comment<|>cdataMarkedSection<|>doctypeDeclaration)<<|>return(XMLStream[Markup$Point$errorBadDeclarationType,Content(pack"<")]))comment=(string"--">>return(XMLStream[Markup(StartComment),Content(pack"<!--"),Markup(StartCommentText)]))><upto"-->"><(string"-->">>return(XMLStream[Markup(EndCommentText),Content(pack"-->"),Markup(EndComment)])<<|>return(XMLStream[Markup$PointunterminatedComment]))cdataMarkedSection=(string"[CDATA[">>return(XMLStream[Markup(StartStartMarkedSectionCDATA),Content(pack"<![CDATA["),Markup(EndStartMarkedSectionCDATA)]))><upto"]]>"><(string"]]>">>return(XMLStream[Markup(StartEndMarkedSection),Content(pack"]]>"),Markup(EndEndMarkedSection)])<<|>return(XMLStream[Markup$PointunterminatedMarkedSection]))doctypeDeclaration=(string"DOCTYPE">>return(XMLStream[Markup(StartDoctypeDeclaration),Content(pack"<!DOCTYPE")]))><whiteSpace><(name><whiteSpace><option((mapContent(string"SYSTEM")<|>mapContent(string"PUBLIC")><whiteSpace><literal)><whiteSpace><literal><whiteSpace)><option(mapContent(string"[")><whiteSpace><many0((markupDeclaration<|>comment<|>processingInstruction<|>entityReference"%")><whiteSpace)><mapContent(string"]")><whiteSpace)><mapContent(string">")<<|>return(XMLStream[Markup(PointerrorMalformedDoctypeDeclaration)]))><return(XMLStream[Markup(EndDoctypeDeclaration)])literal=(string"\""<|>string"\'")>>=\quote->return(XMLStream[Content$packquote])><mapContent(takeWhile(/=quote))><return(XMLStream[Content$packquote])><skip(stringquote)markupDeclaration=mapContent(string"<!")><(many0(mapContent(takeWhile1(\x->x/=">"&&x/="\""&&x/="\'"))<|>literal)><mapContent(string">")<<|>return(XMLStream[Markup$PointunterminatedMarkupDeclaration]))name=mapContent(takeWhile1(isNameChar.head))mapContent=mapIncremental(XMLStream.(:[]).Content.pack)whiteSpace=mapContent(takeWhile(isSpace.head))uptoend@(lead:_)=mapContent(many0(takeWhile1(/=[lead])<<|>notFollowedBy(stringend)><anyToken))errorBadQuoteCharacterq=ErrorToken("Invalid quote character "++showq)errorBadAttributex=ErrorToken("Invalid character "++showx++" following attribute name")errorBadEntityReference=ErrorToken"Invalid entity reference."errorBadDeclarationType=ErrorToken"The \"<!\" sequence must be followed by \"[CDATA[\" or \"--\"."errorMalformedDoctypeDeclaration=ErrorToken"Malformed DOCTYPE declaration."errorUnescapedContentLT=ErrorToken"Unescaped character '<' in content"unterminatedComment=ErrorToken"Unterminated comment."unterminatedMarkedSection=ErrorToken"Unterminated marked section."unterminatedMarkupDeclaration=ErrorToken"Unterminated markup declaration."unterminatedStartTag=ErrorToken"Missing '>' at the end of start tag."unterminatedEndTag=ErrorToken"Missing '>' at the end of end tag."unterminatedProcessingInstruction=ErrorToken"Unterminated processing instruction."isNameStartx=isLetterx||x=='_'isNameCharx=isAlphaNumx||x=='_'||x=='-'||x==':'-- | XML markup splitter wrapping 'parseXMLTokens'.xmlTokens::Monadm=>SplittermChar(BoundaryXMLToken)xmlTokens=parserToSplitter(parseXMLTokens>>>statelessTransducerunpackContent)whereunpackContent::MarkupXMLTokenText->[MarkupXMLTokenChar]unpackContent(Markupb)=[Markupb]unpackContent(Contentc)=mapContent(unpackc)-- | The XML token parser. This parser converts plain text to parsed text, which is a precondition for using the-- remaining XML components.parseXMLTokens::Monadm=>TransducermChar(MarkupXMLTokenText)parseXMLTokens=Transducer(pourParsed(mapIncrementalchunkxmlParser))dispatchOnString::forallmadr.(Monadm,AncestorFunctorad)=>SourcemaChar->(String->Coroutinedmr)->[(String,String->Coroutinedmr)]->CoroutinedmrdispatchOnStringsourcefailurefullCases=dispatchfullCasesidwheredispatchcasesconsumed=casefind(null.fst)casesofJust(~"",rhs)->rhs(consumed"")Nothing->getsource>>=maybe(failure(consumed""))(\x->casemapMaybe(startingWithx)casesof[]->failure(consumed[x])subcases->dispatch(subcases++fullCases)(consumed.(x:)))startingWithx~(y:rest,rhs)|x==y=Just(rest,rhs)|otherwise=NothinggetElementName::forallmad.(Monadm,AncestorFunctorad)=>Sourcema(MarkupXMLTokenText)->([MarkupXMLTokenText]->[MarkupXMLTokenText])->Coroutinedm([MarkupXMLTokenText],MaybeText)getElementNamesourcef=getsource>>=maybe(return(f[],Nothing))(\x->letf'=f.(x:)incasexofMarkup(StartElementName)->getRestOfRegionElementNamesourcef'idMarkup(PointErrorToken{})->getElementNamesourcef'Content{}->getElementNamesourcef'_->error("Expected an ElementName, received "++showx))getRestOfRegion::forallmad.(Monadm,AncestorFunctorad)=>XMLToken->Sourcema(MarkupXMLTokenText)->([MarkupXMLTokenText]->[MarkupXMLTokenText])->(Text->Text)->Coroutinedm([MarkupXMLTokenText],MaybeText)getRestOfRegiontokensourcefg=getWhileisContentsource>>=\content->getsource>>=\x->casexofJusty@(MarkupEnd{})->return(f(content++[y]),Just(g$Text.concat$mapfromContentcontent))_->error("Expected rest of "++showtoken++", received "++showx)pourRestOfRegion::forallma1a2a3d.(Monadm,AncestorFunctora1d,AncestorFunctora2d,AncestorFunctora3d)=>XMLToken->Sourcema1(MarkupXMLTokenText)->Sinkma2(MarkupXMLTokenText)->Sinkma3(MarkupXMLTokenText)->CoroutinedmBoolpourRestOfRegiontokensourcesinkendSink=pourWhileisContentsourcesink>>getsource>>=maybe(returnFalse)(\x->casexofMarkup(Endtoken')|token==token'->putendSinkx>>returnTrue_->error("Expected rest of "++showtoken++", received "++showx))getRestOfStartTag::forallmad.(Monadm,AncestorFunctorad)=>Sourcema(MarkupXMLTokenText)->Coroutinedm([MarkupXMLTokenText],Bool)getRestOfStartTagsource=dorest<-getWhilenotEndTagsourceend<-getsourcecaseendofNothing->return(rest,False)Juste@(Markup(EndStartTag))->return(rest++[e],True)Juste@(Markup(PointEmptyTag))->getRestOfStartTagsource>>=\(rest',_)->return(rest++(e:rest'),False)_->error"getWhile returned early!"wherenotEndTag(Markup(EndStartTag))=FalsenotEndTag(Markup(PointEmptyTag))=FalsenotEndTag_=TruegetRestOfEndTag::forallmad.(Monadm,AncestorFunctorad)=>Sourcema(MarkupXMLTokenText)->Coroutinedm[MarkupXMLTokenText]getRestOfEndTagsource=getWhile(/=Markup(EndEndTag))source>>=\tokens->getsource>>=maybe(error"No end to the end tag!")(return.(tokens++).(:[]))findEndTag::forallma1a2a3d.(Monadm,AncestorFunctora1d,AncestorFunctora2d,AncestorFunctora3d)=>Sourcema1(MarkupXMLTokenText)->Sinkma2(MarkupXMLTokenText)->Sinkma3(MarkupXMLTokenText)->Text->Coroutinedm()findEndTagsourcesinkendSinkname=findTagwherefindTag=pourWhilenoTagStartsourcesink>>getsource>>=maybe(return())consumeOnenoTagStart(Markup(StartStartTag))=FalsenoTagStart(Markup(StartEndTag))=FalsenoTagStart_=TrueconsumeOnex@(Markup(StartEndTag))=do(tokens,mn)<-getElementNamesource(x:)maybe(return())(\name'->getRestOfEndTagsource>>=\rest->ifname==name'thenputList(tokens++rest)endSink>>return()elseputList(tokens++rest)sink>>findTag)mnconsumeOnex@(Markup(StartStartTag))=do(tokens,mn)<-getElementNamesource(x:)maybe(return())(\name'->do(rest,hasContent)<-getRestOfStartTagsource_<-putList(tokens++rest)sinkwhenhasContent(findEndTagsourcesinksinkname')findTag)mnconsumeOne_=error"pourWhile returned early!"findStartTag::forallma1a2d.(Monadm,AncestorFunctora1d,AncestorFunctora2d)=>Sourcema1(MarkupXMLTokenText)->Sinkma2(MarkupXMLTokenText)->Coroutinedm(Maybe(MarkupXMLTokenText))findStartTagsourcesink=pourWhile(/=Markup(StartStartTag))sourcesink>>getsource-- | Splits all top-level elements with all their content to /true/, all other input to /false/.xmlElement::Monadm=>Splitterm(MarkupXMLTokenText)()xmlElement=Splitter$\sourcetruefalseedge->letsplit0=findStartTagsourcefalse>>=maybe(return[])(\x->doputedge()puttruex(tokens,mn)<-getElementNamesourceidmaybe(putListtokenstrue)(\name->do(rest,hasContent)<-getRestOfStartTagsource_<-putList(tokens++rest)trueifhasContentthensplit1nameelsesplit0)mn)split1name=findEndTagsourcetruetruename>>split0insplit0>>return()-- | Splits the content of all top-level elements to /true/, their tags and intervening input to /false/.xmlElementContent::Monadm=>Splitterm(MarkupXMLTokenText)()xmlElementContent=Splitter$\sourcetruefalseedge->letsplit0=findStartTagsourcefalse>>=maybe(return[])(\x->doputfalsex(tokens,mn)<-getElementNamesourceidmaybe(putListtokensfalse)(\name->do(rest,hasContent)<-getRestOfStartTagsource_<-putList(tokens++rest)falseifhasContentthenputedge()>>split1nameelsesplit0)mn)split1name=findEndTagsourcetruefalsename>>split0insplit0>>return()-- | Similiar to @('Control.Concurrent.SCC.Combinators.having' 'element')@, except it runs the argument splitter-- only on each element's start tag, not on the entire element with its content.xmlElementHavingTagWith::forallmb.Monadm=>Splitterm(MarkupXMLTokenText)b->Splitterm(MarkupXMLTokenText)bxmlElementHavingTagWithtest=isolateSplitter$\sourcetruefalseedge->letsplit0=findStartTagsourcefalse>>=maybe(return())(\x->do(tokens,mn)<-getElementNamesource(x:)maybe(return())(\name->do(rest,hasContent)<-getRestOfStartTagsourcelettag=tokens++rest(_,found)<-pipe(putListtag)(findsTrueIntest)casefoundofJustmb->maybe(return())(putedge)mb>>putListtagtrue>>split1hasContenttruenameNothing->putListtagfalse>>split1hasContentfalsename)mn)split1hasContentsinkname=whenhasContent(findEndTagsourcesinksinkname)>>split0insplit0-- | Splits every attribute specification to /true/, everything else to /false/.xmlAttribute::Monadm=>Splitterm(MarkupXMLTokenText)()xmlAttribute=Splitter$\sourcetruefalseedge->letsplit0=getWith(\x->casexofMarkup(StartAttributeName)->doputedge()puttruexpourRestOfRegionAttributeNamesourcetruetrue>>=flipwhensplit1_->putfalsex>>split0)sourcesplit1=getWith(\x->casexofMarkup(StartAttributeValue)->puttruex>>pourRestOfRegionAttributeValuesourcetruetrue>>=flipwhensplit0_->puttruex>>split1)sourceinsplit0-- | Splits every element name, including the names of nested elements and names in end tags, to /true/, all the rest of-- input to /false/.xmlElementName::Monadm=>Splitterm(MarkupXMLTokenText)()xmlElementName=Splitter(splitSimpleRegionsElementName)-- | Splits every attribute name to /true/, all the rest of input to /false/.xmlAttributeName::Monadm=>Splitterm(MarkupXMLTokenText)()xmlAttributeName=Splitter(splitSimpleRegionsAttributeName)-- | Splits every attribute value, excluding the quote delimiters, to /true/, all the rest of input to /false/.xmlAttributeValue::Monadm=>Splitterm(MarkupXMLTokenText)()xmlAttributeValue=Splitter(splitSimpleRegionsAttributeValue)splitSimpleRegions::Monadm=>XMLToken->OpenSplitterma1a2a3a4d(MarkupXMLTokenText)()()splitSimpleRegionstokensourcetruefalseedge=split0wheresplit0=getWithconsumeOnesourceconsumeOnex@(Markup(Starttoken'))|token==token'=putfalsex>>putedge()>>pourRestOfRegiontokensourcetruefalse>>=flipwhensplit0consumeOnex=putfalsex>>split0isContent::Markupbx->BoolisContentContent{}=TrueisContent_=FalsefromContent::Markupbx->xfromContent(Contentx)=xfromContent_=error"fromContent expects Content!"