{-
Copyright (C) 2010 Paul Rivier <paul*rivier#demotera*com> | tr '*#' '.@'
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}{- |
Module : Text.Pandoc.Readers.Textile
Copyright : Copyright (C) 2010-2011 Paul Rivier and John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : Paul Rivier <paul*rivier#demotera*com>
Stability : alpha
Portability : portable
Conversion from Textile to 'Pandoc' document, based on the spec
available at http://redcloth.org/textile.Implemented and parsed:
- Paragraphs
- Code blocks
- Lists
- blockquote
- Inlines : strong, emph, cite, code, deleted, superscript,
subscript, links
- footnotes
Implemented but discarded:
- HTML-specific and CSS-specific attributes
Left to be implemented:
- dimension sign
- all caps
- continued blocks (ex bq..)
TODO : refactor common patterns across readers :
- autolink
- smartPunctuation
- more ...
-}moduleText.Pandoc.Readers.Textile(readTextile)whereimportText.Pandoc.DefinitionimportText.Pandoc.SharedimportText.Pandoc.ParsingimportText.Pandoc.Readers.HTML(htmlTag,isInlineTag,isBlockTag)importText.ParserCombinators.ParsecimportText.HTML.TagSoup.MatchimportData.Char(digitToInt,isLetter)importControl.Monad(guard,liftM)-- | Parse a Textile text and return a Pandoc document.readTextile::ParserState-- ^ Parser state, including options for parser->String-- ^ String to parse (assuming @'\n'@ line endings)->PandocreadTextilestates=(readWithparseTextile)state(s++"\n\n")---- Constants and data structure definitions---- | Special chars border strings parsingspecialChars::[Char]specialChars="\\[]<>*#_@~-+^&,.;:!?|\"'%()"-- | Generate a Pandoc ADT from a textile documentparseTextile::GenParserCharParserStatePandocparseTextile=do-- textile allows raw HTML and does smart punctuation by defaultupdateState(\state->state{stateParseRaw=True,stateSmart=True})manyblanklinestartPos<-getPosition-- go through once just to get list of reference keys and notes-- docMinusKeys is the raw document with blanks where the keys/notes were...letfirstPassParser=noteBlock<|>lineClumpmanyTillfirstPassParsereof>>=setInput.concatsetPositionstartPosst'<-getStateletreversedNotes=stateNotesst'updateState$\s->s{stateNotes=reversereversedNotes}-- now parse it for real...blocks<-parseBlocksreturn$Pandoc(Meta[][][])blocks-- FIXMEnoteMarker::GenParserCharParserState[Char]noteMarker=skipManyspaceChar>>string"fn">>manyTilldigit(char'.')noteBlock::GenParserCharParserState[Char]noteBlock=try$dostartPos<-getPositionref<-noteMarkeroptionalblanklinecontents<-liftMunlines$many1TillanyLine(blanklines<|>noteBlock)endPos<-getPositionletnewnote=(ref,contents++"\n")st<-getStateletoldnotes=stateNotesstupdateState$\s->s{stateNotes=newnote:oldnotes}-- return blanks so line count isn't affectedreturn$replicate(sourceLineendPos-sourceLinestartPos)'\n'-- | Parse document blocksparseBlocks::GenParserCharParserState[Block]parseBlocks=manyTillblockeof-- | Block parsers list tried in definition orderblockParsers::[GenParserCharParserStateBlock]blockParsers=[codeBlock,header,blockQuote,hrule,anyList,rawHtmlBlock,maybeExplicitBlock"table"table,maybeExplicitBlock"p"para,nullBlock]-- | Any block in the order of definition of blockParsersblock::GenParserCharParserStateBlockblock=choiceblockParsers<?>"block"codeBlock::GenParserCharParserStateBlockcodeBlock=codeBlockBc<|>codeBlockPrecodeBlockBc::GenParserCharParserStateBlockcodeBlockBc=try$dostring"bc. "contents<-manyTillanyLineblanklinesreturn$CodeBlock("",[],[])$unlinescontents-- | Code Blocks in Textile are between <pre> and </pre>codeBlockPre::GenParserCharParserStateBlockcodeBlockPre=try$dohtmlTag(tagOpen(=="pre")null)result'<-manyTillanyChar(try$htmlTag(tagClose(=="pre"))>>blockBreak)-- drop leading newline if anyletresult''=caseresult'of'\n':xs->xs_->result'-- drop trailing newline if anyletresult'''=casereverseresult''of'\n':_->initresult''_->result''return$CodeBlock("",[],[])result'''-- | Header of the form "hN. content" with N in 1..6header::GenParserCharParserStateBlockheader=try$dochar'h'level<-oneOf"123456">>=return.digitToIntoptionalattributeschar'.'whitespacename<-manyTillinlineblockBreakreturn$Headerlevel(normalizeSpacesname)-- | Blockquote of the form "bq. content"blockQuote::GenParserCharParserStateBlockblockQuote=try$dostring"bq"optionalattributeschar'.'whitespacepara>>=return.BlockQuote.(:[])-- Horizontal rulehrule::GenParserCharstBlockhrule=try$doskipSpacesstart<-oneOf"-*"count2(skipSpaces>>charstart)skipMany(spaceChar<|>charstart)newlineoptionalblanklinesreturnHorizontalRule-- Lists handling-- | Can be a bullet list or an ordered list. This implementation is-- strict in the nesting, sublist must start at exactly "parent depth-- plus one"anyList::GenParserCharParserStateBlockanyList=try$dol<-anyListAtDepth1blanklinesreturnl-- | This allow one type of list to be nested into an other type,-- provided correct nestinganyListAtDepth::Int->GenParserCharParserStateBlockanyListAtDepthdepth=choice[bulletListAtDepthdepth,orderedListAtDepthdepth,definitionList]-- | Bullet List of given depth, depth being the number of leading '*'bulletListAtDepth::Int->GenParserCharParserStateBlockbulletListAtDepthdepth=try$doitems<-many1(bulletListItemAtDepthdepth)return(BulletListitems)-- | Bullet List Item of given depth, depth being the number of-- leading '*'bulletListItemAtDepth::Int->GenParserCharParserState[Block]bulletListItemAtDepthdepth=try$docountdepth(char'*')optionalattributeswhitespacep<-inlines>>=return.Plainsublist<-option[](anyListAtDepth(depth+1)>>=return.(:[]))return(p:sublist)-- | Ordered List of given depth, depth being the number of-- leading '#'orderedListAtDepth::Int->GenParserCharParserStateBlockorderedListAtDepthdepth=try$doitems<-many1(orderedListItemAtDepthdepth)return(OrderedList(1,DefaultStyle,DefaultDelim)items)-- | Ordered List Item of given depth, depth being the number of-- leading '#'orderedListItemAtDepth::Int->GenParserCharParserState[Block]orderedListItemAtDepthdepth=try$docountdepth(char'#')optionalattributeswhitespacep<-inlines>>=return.Plainsublist<-option[](anyListAtDepth(depth+1)>>=return.(:[]))return(p:sublist)-- | A definition list is a set of consecutive definition itemsdefinitionList::GenParserCharParserStateBlockdefinitionList=try$doitems<-many1definitionListItemreturn$DefinitionListitems-- | A definition list item in textile begins with '- ', followed by-- the term defined, then spaces and ":=". The definition follows, on-- the same single line, or spaned on multiple line, after a line-- break.definitionListItem::GenParserCharParserState([Inline],[[Block]])definitionListItem=try$dostring"- "term<-many1Tillinline(try(whitespace>>string":="))def<-inlineDef<|>multilineDefreturn(term,def)whereinlineDef::GenParserCharParserState[[Block]]inlineDef=liftM(\d->[[Plaind]])$try(whitespace>>inlines)multilineDef::GenParserCharParserState[[Block]]multilineDef=try$dooptionalwhitespace>>newlines<-many1TillanyChar(try(string"=:">>newline))-- this ++ "\n\n" does not look very goodds<-parseFromStringparseBlocks(s++"\n\n")return[ds]-- | This terminates a block such as a paragraph. Because of raw html-- blocks support, we have to lookAhead for a rawHtmlBlock.blockBreak::GenParserCharParserState()blockBreak=try(newline>>blanklines>>return())<|>(lookAheadrawHtmlBlock>>return())-- | A raw Html Block, optionally followed by blanklinesrawHtmlBlock::GenParserCharParserStateBlockrawHtmlBlock=try$do(_,b)<-htmlTagisBlockTagoptionalblanklinesreturn$RawBlock"html"b-- | In textile, paragraphs are separated by blank lines.para::GenParserCharParserStateBlockpara=try$docontent<-manyTillinlineblockBreakreturn$Para$normalizeSpacescontent-- Tables-- | A table cell spans until a pipe |tableCell::GenParserCharParserStateTableCelltableCell=doc<-many1(noneOf"|\n")content<-parseFromString(many1inline)creturn$[Plain$normalizeSpacescontent]-- | A table row is made of many table cellstableRow::GenParserCharParserState[TableCell]tableRow=try$dochar'|'cells<-endBy1tableCell(char'|')newlinereturncells-- | Many table rowstableRows::GenParserCharParserState[[TableCell]]tableRows=many1tableRow-- | Table headers are made of cells separated by a tag "|_."tableHeaders::GenParserCharParserState[TableCell]tableHeaders=try$doletseparator=(try$string"|_.")separatorheaders<-sepBy1tableCellseparatorchar'|'newlinereturnheaders-- | A table with an optional header. Current implementation can-- handle tables with and without header, but will parse cells-- alignment attributes as content.table::GenParserCharParserStateBlocktable=try$doheaders<-option[]tableHeadersrows<-tableRowsblanklinesletnbOfCols=max(lengthheaders)(length$headrows)return$Table[](replicatenbOfColsAlignDefault)(replicatenbOfCols0.0)headersrows-- | Blocks like 'p' and 'table' do not need explicit block tag.-- However, they can be used to set HTML/CSS attributes when needed.maybeExplicitBlock::String-- ^ block tag name->GenParserCharParserStateBlock-- ^ implicit block->GenParserCharParserStateBlockmaybeExplicitBlocknameblk=try$dooptional$try$stringname>>optionalattributes>>char'.'>>((trywhitespace)<|>endline)blk------------ Inlines------------ | Any inline elementinline::GenParserCharParserStateInlineinline=choiceinlineParsers<?>"inline"-- | List of consecutive inlines before a newlineinlines::GenParserCharParserState[Inline]inlines=manyTillinlinenewline-- | Inline parsers tried in orderinlineParsers::[GenParserCharParserStateInline]inlineParsers=[autoLink,str,whitespace,endline,code,htmlSpan,rawHtmlInline,note,simpleInline(string"??")(Cite[]),simpleInline(string"**")Strong,simpleInline(string"__")Emph,simpleInline(char'*')Strong,simpleInline(char'_')Emph,simpleInline(char'-')Strikeout,simpleInline(char'^')Superscript,simpleInline(char'~')Subscript,link,image,mark,smartPunctuationinline,symbol]-- | Trademark, registered, copyrightmark::GenParserCharstInlinemark=try$char'('>>(trytm<|>tryreg<|>copy)reg::GenParserCharstInlinereg=dooneOf"Rr"char')'return$Str"\174"tm::GenParserCharstInlinetm=dooneOf"Tt"oneOf"Mm"char')'return$Str"\8482"copy::GenParserCharstInlinecopy=dooneOf"Cc"char')'return$Str"\169"note::GenParserCharParserStateInlinenote=try$dochar'['ref<-many1digitchar']'state<-getStateletnotes=stateNotesstatecaselookuprefnotesofNothing->fail"note not found"Justraw->liftMNote$parseFromStringparseBlocksraw-- | Any stringstr::GenParserCharParserStateInlinestr=doxs<-many1(noneOf(specialChars++"\t\n "))optional$try$dolookAhead(char'(')notFollowedBy'markgetInput>>=setInput.(' ':)-- add space before acronym explanation-- parse a following hyphen if followed by a letter-- (this prevents unwanted interpretation as starting a strikeout section)result<-optionxs$try$dochar'-'next<-lookAheadletterguard$isLetter(lastxs)||isLetternextreturn$xs++"-"return$Strresult-- | Textile allows HTML span infos, we discard themhtmlSpan::GenParserCharParserStateInlinehtmlSpan=try$dochar'%'_<-attributescontent<-manyTillanyChar(char'%')return$Strcontent-- | Some number of space charswhitespace::GenParserCharParserStateInlinewhitespace=many1spaceChar>>returnSpace<?>"whitespace"-- | In Textile, an isolated endline character is a line breakendline::GenParserCharParserStateInlineendline=try$donewline>>notFollowedByblanklinereturnLineBreakrawHtmlInline::GenParserCharParserStateInlinerawHtmlInline=liftM(RawInline"html".snd)$htmlTagisInlineTag-- | Textile standard link syntax is "label":targetlink::GenParserCharParserStateInlinelink=try$doname<-surrounded(char'"')inlinechar':'url<-manyTill(anyChar)(lookAhead$(space<|>try(oneOf".;,:">>(space<|>newline))))return$Linkname(url,"")-- | Detect plain links to http or email.autoLink::GenParserCharParserStateInlineautoLink=do(orig,src)<-(tryuri<|>tryemailAddress)return$Link[Strorig](src,"")-- | image embeddingimage::GenParserCharParserStateInlineimage=try$dochar'!'>>notFollowedByspacesrc<-manyTillanyChar(lookAhead$oneOf"!(")alt<-option""(try$(char'('>>manyTillanyChar(char')')))char'!'return$Image[Stralt](src,alt)-- | Any special symbol defined in specialCharssymbol::GenParserCharParserStateInlinesymbol=doresult<-oneOfspecialCharsreturn$Str[result]-- | Inline codecode::GenParserCharParserStateInlinecode=code1<|>code2code1::GenParserCharParserStateInlinecode1=surrounded(char'@')anyChar>>=return.CodenullAttrcode2::GenParserCharParserStateInlinecode2=dohtmlTag(tagOpen(=="tt")null)result'<-manyTillanyChar(try$htmlTag$tagClose(=="tt"))return$CodenullAttrresult'-- | Html / CSS attributesattributes::GenParserCharParserStateStringattributes=choice[enclosed(char'(')(char')')anyChar,enclosed(char'{')(char'}')anyChar,enclosed(char'[')(char']')anyChar]-- | Parses material surrounded by a parser.surrounded::GenParserCharstt-- ^ surrounding parser->GenParserCharsta-- ^ content parser (to be used repeatedly)->GenParserCharst[a]surroundedborder=enclosedborderborder-- | Inlines are most of the time of the same formsimpleInline::GenParserCharParserStatet-- ^ surrounding parser->([Inline]->Inline)-- ^ Inline constructor->GenParserCharParserStateInline-- ^ content parser (to be used repeatedly)simpleInlineborderconstruct=surroundedborder(inlineWithAttribute)>>=return.construct.normalizeSpaceswhereinlineWithAttribute=(try$optionalattributes)>>inline