---------------------------------------------------------------------- |-- Module : Text.XML.Light.Input-- Copyright : (c) Galois, Inc. 2007-- License : BSD3---- Maintainer: Iavor S. Diatchki <diatchki@galois.com>-- Stability : provisional-- Portability: portable---- Lightweight XML parsing--moduleText.XML.Light.Input(parseXML,parseXMLDoc)whereimportText.XML.Light.TypesimportText.XML.Light.ProcimportText.XML.Light.Output(tagEnd)importData.Char(isSpace)importData.List(isPrefixOf)importNumeric(readHex)-- | parseXMLDoc, parse a XMLl document to maybe an elementparseXMLDoc::String->MaybeElementparseXMLDocxs=strip(parseXMLxs)wherestripcs=caseonlyElemscsofe:es|"?xml"`isPrefixOf`qName(elNamee)->strip(mapElemes)|otherwise->Juste_->Nothing-- | parseXML to a list of content chunksparseXML::String->[Content]parseXMLxs=parse$tokens$preprocessxs------------------------------------------------------------------------parse::[Token]->[Content]parse[]=[]parsets=let(es,_,ts1)=nodes([],Nothing)[]tsines++parsets1-- Information about namespaces.-- The first component is a map that associates prefixes to URIs,-- the second is the URI for the default namespace, if one was provided.typeNSInfo=([(String,String)],MaybeString)nodes::NSInfo->[QName]->[Token]->([Content],[QName],[Token])nodesnsps(TokCRefref:ts)=let(es,qs,ts1)=nodesnspstsin(CRefref:es,qs,ts1)nodesnsps(TokTexttxt:ts)=let(es,qs,ts1)=nodesnspsts(more,es1)=caseesofTextcd:es1'|cdVerbatimcd==cdVerbatimtxt->(cdDatacd,es1')_->([],es)in(Texttxt{cdData=cdDatatxt++more}:es1,qs,ts1)nodescur_infops(TokStartptasempty:ts)=(node:siblings,open,toks)wherenew_name=annotNamenew_infotnew_info=foldraddNScur_infoasnode=ElemElement{elLine=Justp,elName=new_name,elAttribs=map(annotAttrnew_info)as,elContent=children}(children,(siblings,open,toks))|empty=([],nodescur_infopsts)|otherwise=let(es1,qs1,ts1)=nodesnew_info(new_name:ps)tsin(es1,caseqs1of[]->nodescur_infopsts1_:qs3->([],qs3,ts1))nodesnsps(TokEndpt:ts)=lett1=annotNamenstincasebreak(t1==)psof(as,_:_)->([],as,ts)-- Unknown closing tag. Insert as text.(_,[])->let(es,qs,ts1)=nodesnspstsin(TextCData{cdLine=Justp,cdVerbatim=CDataText,cdData=tagEndt""}:es,qs,ts1)nodes_ps[]=([],ps,[])annotName::NSInfo->QName->QNameannotName(namespaces,def_ns)n=n{qURI=maybedef_ns(`lookup`namespaces)(qPrefixn)}annotAttr::NSInfo->Attr->AttrannotAttrnsa@(Attr{attrKey=k})=case(qPrefixk,qNamek)of-- Do not apply the default name-space to unqualified-- attributes. See Section 6.2 of <http://www.w3.org/TR/REC-xml-names>.(Nothing,_)->a_->a{attrKey=annotNamensk}addNS::Attr->NSInfo->NSInfoaddNS(Attr{attrKey=key,attrVal=val})(ns,def)=case(qPrefixkey,qNamekey)of(Nothing,"xmlns")->(ns,ifnullvalthenNothingelseJustval)(Just"xmlns",k)->((k,val):ns,def)_->(ns,def)-- Lexer -----------------------------------------------------------------------typeLChar=(Line,Char)typeLString=[LChar]dataToken=TokStartLineQName[Attr]Bool-- is empty?|TokEndLineQName|TokCRefString|TokTextCDataderivingShowtokens::String->[Token]tokens=tokens'.linenumber1tokens'::LString->[Token]tokens'((_,'<'):c@(_,'!'):cs)=specialccstokens'((_,'<'):cs)=tag(dropSpacecs)-- we are being nice heretokens'[]=[]tokens'cs@((l,_):_)=let(as,bs)=breakn('<'==)csinmapcvt(decode_textas)++tokens'bs-- XXX: Note, some of the lines might be a bit inacuaratewherecvt(TxtBitx)=TokTextCData{cdLine=Justl,cdVerbatim=CDataText,cdData=x}cvt(CRefBitx)=casecref_to_charxofJustc->TokTextCData{cdLine=Justl,cdVerbatim=CDataText,cdData=[c]}Nothing->TokCRefxspecial::LChar->LString->[Token]special_((_,'-'):(_,'-'):cs)=skipcswhereskip((_,'-'):(_,'-'):(_,'>'):ds)=tokens'dsskip(_:ds)=skipdsskip[]=[]-- unterminated commentspecialc((_,'['):(_,'C'):(_,'D'):(_,'A'):(_,'T'):(_,'A'):(_,'['):cs)=let(xs,ts)=cdatacsinTokTextCData{cdLine=Just(fstc),cdVerbatim=CDataVerbatim,cdData=xs}:tokens'tswherecdata((_,']'):(_,']'):(_,'>'):ds)=([],ds)cdata((_,d):ds)=let(xs,ys)=cdatadsin(d:xs,ys)cdata[]=([],[])specialccs=let(xs,ts)=munch""0csinTokTextCData{cdLine=Just(fstc),cdVerbatim=CDataRaw,cdData='<':'!':(reversexs)}:tokens'tswheremunchaccnesting((_,'>'):ds)|nesting==(0::Int)=('>':acc,ds)|otherwise=munch('>':acc)(nesting-1)dsmunchaccnesting((_,'<'):ds)=munch('<':acc)(nesting+1)dsmunchaccn((_,x):ds)=munch(x:acc)ndsmunchacc_[]=(acc,[])-- unterminated DTD markup--special c cs = tag (c : cs) -- invalid specials are processed as tagsqualName::LString->(QName,LString)qualNamexs=let(as,bs)=breaknendNamexs(q,n)=casebreak(':'==)asof(q1,_:n1)->(Justq1,n1)_->(Nothing,as)in(QName{qURI=Nothing,qPrefix=q,qName=n},bs)whereendNamex=isSpacex||x=='='||x=='>'||x=='/'tag::LString->[Token]tag((p,'/'):cs)=let(n,ds)=qualName(dropSpacecs)inTokEndpn:casedsof(_,'>'):es->tokens'es-- tag was not properly closed..._->tokens'dstag[]=[]tagcs=let(n,ds)=qualNamecs(as,b,ts)=attribs(dropSpaceds)inTokStart(fst(headcs))nasb:tsattribs::LString->([Attr],Bool,[Token])attribscs=casecsof(_,'>'):ds->([],False,tokens'ds)(_,'/'):ds->([],True,casedsof(_,'>'):es->tokens'es-- insert missing > ..._->tokens'ds)(_,'?'):(_,'>'):ds->([],True,tokens'ds)-- doc ended within a tag..[]->([],False,[])_->let(a,cs1)=attribcs(as,b,ts)=attribscs1in(a:as,b,ts)attrib::LString->(Attr,LString)attribcs=let(ks,cs1)=qualNamecs(vs,cs2)=attr_val(dropSpacecs1)in((Attrks(decode_attrvs)),dropSpacecs2)attr_val::LString->(String,LString)attr_val((_,'='):cs)=string(dropSpacecs)attr_valcs=("",cs)dropSpace::LString->LStringdropSpace=dropWhile(isSpace.snd)-- | Match the value for an attribute. For malformed XML we do-- our best to guess the programmer's intention.string::LString->(String,LString)string((_,'"'):cs)=break'('"'==)cs-- Allow attributes to be enclosed between ' '.string((_,'\''):cs)=break'('\''==)cs-- Allow attributes that are not enclosed by anything.stringcs=breakneoscswhereeosx=isSpacex||x=='>'||x=='/'break'::(a->Bool)->[(b,a)]->([a],[(b,a)])break'pxs=let(as,bs)=breaknpxsin(as,casebsof[]->[]_:cs->cs)breakn::(a->Bool)->[(b,a)]->([a],[(b,a)])breaknpl=(mapsndas,bs)where(as,bs)=break(p.snd)ldecode_attr::String->Stringdecode_attrcs=concatMapcvt(decode_textcs)wherecvt(TxtBitx)=xcvt(CRefBitx)=casecref_to_charxofJustc->[c]Nothing->'&':x++";"dataTxt=TxtBitString|CRefBitStringderivingShowdecode_text::[Char]->[Txt]decode_textxs@('&':cs)=casebreak(';'==)csof(as,_:bs)->CRefBitas:decode_textbs_->[TxtBitxs]decode_text[]=[]decode_textcs=let(as,bs)=break('&'==)csinTxtBitas:decode_textbscref_to_char::[Char]->MaybeCharcref_to_charcs=casecsof'#':ds->num_escds"lt"->Just'<'"gt"->Just'>'"amp"->Just'&'"apos"->Just'\''"quot"->Just'"'_->Nothingnum_esc::String->MaybeCharnum_esccs=casecsof'x':ds->check(readHexds)_->check(readscs)wherecheck[(n,"")]=cvt_charncheck_=Nothingcvt_char::Int->MaybeCharcvt_charx|fromEnum(minBound::Char)<=x&&x<=fromEnum(maxBound::Char)=Just(toEnumx)|otherwise=Nothingpreprocess::String->Stringpreprocess('\r':'\n':cs)='\n':preprocesscspreprocess('\r':cs)='\n':preprocesscspreprocess(c:cs)=c:preprocesscspreprocess[]=[]linenumber::Line->String->LStringlinenumber_[]=[]linenumbern('\n':s)=n'`seq`((n,'\n'):linenumbern's)wheren'=n+1linenumbern(c:s)=(n,c):linenumberns