A WORKING RTF TO HTML CONVERTER IN PHP

-

May 12, 2014

In a recent project, I desperately needed an RTF to HTML converter written in PHP. Googling around turned up some matches, but I could not get them to work properly. Also, one of them called passthru() to use an RTF2HTML executable, which is something I didn’t want. I was looking for an RTF2HTML converter written purely in PHP.

Since I couldn’t find anything ready-made, I sat down and coded one up myself. It’s short, and it works, implementing the subset of RTF tags that you’ll need in HTML and ignoring the rest. As it turns out, the RTF format isn’t that complicated when you really look at it, but it isn’t something you code a parser for in 15 minutes either.

How to use it

Include the file rtf.php somewhere in your project. Then do this:

$reader = new RtfReader();
$rtf = file_get_contents("test.rtf"); // or use a string
$reader->Parse($rtf);
$formatter = new RtfHtml();
echo $formatter->Format($reader->root);

The code

/**
 * RTF parser/formatter
 *
 * This code reads RTF files and formats the RTF data to HTML.
 *
 * PHP version 5
 *
 * @author Alexander van Oostenrijk
 * @copyright 2014 Alexander van Oostenrijk
 * @license GNU
 * @version 1
 * @link http://www.independent-software.com
 *
 * Sample of use:
 *
 * $reader = new RtfReader();
 * $rtf = file_get_contents("itc.rtf"); // or use a string
 * $reader->Parse($rtf);
 * //$reader->root->dump(); // to see what the reader read
 * $formatter = new RtfHtml();
 * echo $formatter->Format($reader->root);
 */

/**
 * Common base of all nodes in the parsed RTF tree.
 */
class RtfElement
{
    /**
     * Echo the indentation used by the dump() methods.
     *
     * @param int $level Indentation level; two &nbsp; entities are written per level.
     */
    protected function Indent($level)
    {
        $count = (int) ceil($level * 2);
        if ($count > 0) {
            echo str_repeat("&nbsp;", $count);
        }
    }
}

/**
 * A brace-delimited RTF group holding an ordered list of child elements.
 */
class RtfGroup extends RtfElement
{
    /** @var RtfGroup|null Enclosing group, or null for the root group. */
    public $parent;

    /** @var RtfElement[] Child groups, control words, control symbols and text runs. */
    public $children;

    public function __construct()
    {
        $this->parent = null;
        $this->children = array();
    }

    /**
     * Name of the control word that opens this group.
     *
     * @return string|null The word, or null when the group is empty or does
     *                     not start with a control word.
     */
    public function GetType()
    {
        if (count($this->children) == 0) {
            return null;
        }

        $child = $this->children[0];
        if (!($child instanceof RtfControlWord)) {
            return null;
        }

        return $child->word;
    }

    /**
     * Whether this group starts with the control symbol "\*".
     *
     * @return bool|null Null when the group is empty or does not start with
     *                   a control symbol, otherwise the result of the check.
     */
    public function IsDestination()
    {
        if (count($this->children) == 0) {
            return null;
        }

        $child = $this->children[0];
        if (!($child instanceof RtfControlSymbol)) {
            return null;
        }

        return $child->symbol == '*';
    }

    /**
     * Whether dump() and the HTML formatter skip this group: font table,
     * colour table, stylesheet, info block, pictures and "\*" destinations.
     *
     * @return bool
     */
    public function IsIgnorable()
    {
        $type = $this->GetType();

        if ($type !== null) {
            if ($type == "fonttbl" || $type == "colortbl" || $type == "stylesheet" || $type == "info") {
                return true;
            }
            // Skip any pictures:
            if (strncmp($type, "pict", 4) === 0) {
                return true;
            }
        }

        return (bool) $this->IsDestination();
    }

    /**
     * Echo an HTML rendering of this group and its children for debugging.
     *
     * @param int $level Indentation level of this group.
     */
    public function dump($level = 0)
    {
        echo "<div>";
        $this->Indent($level);
        echo "{";
        echo "</div>";

        foreach ($this->children as $child) {
            if ($child instanceof RtfGroup && $child->IsIgnorable()) {
                continue;
            }
            $child->dump($level + 2);
        }

        echo "<div>";
        $this->Indent($level);
        echo "}";
        echo "</div>";
    }
}

/**
 * A control word such as \b or \fs24, with its numeric parameter.
 */
class RtfControlWord extends RtfElement
{
    /** @var string Letters of the control word, without the backslash. */
    public $word;

    /** @var int Numeric parameter; the reader stores 1 when none was written. */
    public $parameter;

    public function dump($level = 0)
    {
        echo "<div style='color:green'>";
        $this->Indent($level);
        echo "WORD {$this->word} ({$this->parameter})";
        echo "</div>";
    }
}

/**
 * A control symbol: a backslash followed by a single non-letter character.
 */
class RtfControlSymbol extends RtfElement
{
    /** @var string The character following the backslash. */
    public $symbol;

    /** @var int For \' the decoded two-digit hex value, otherwise 0. */
    public $parameter = 0;

    public function dump($level = 0)
    {
        echo "<div style='color:blue'>";
        $this->Indent($level);
        echo "SYMBOL {$this->symbol} ({$this->parameter})";
        echo "</div>";
    }
}

/**
 * A run of plain text.
 */
class RtfText extends RtfElement
{
    /** @var string Text with \\, \{ and \} already unescaped. */
    public $text;

    public function dump($level = 0)
    {
        echo "<div style='color:red'>";
        $this->Indent($level);
        // The dump is written into an HTML page, so markup characters are escaped.
        echo "TEXT " . htmlspecialchars((string) $this->text, ENT_QUOTES, 'ISO-8859-1');
        echo "</div>";
    }
}

/**
 * Parses an RTF string into a tree of RtfGroup / RtfControlWord /
 * RtfControlSymbol / RtfText elements, reachable through $root.
 */
class RtfReader
{
    /** @var RtfGroup|null Root of the parsed tree; null until a group was read. */
    public $root = null;

    // Parser working state. These used to be created on the fly (and were
    // therefore public); they are declared here with the same visibility.

    /** @var string The RTF source being parsed. */
    public $rtf = "";

    /** @var int Offset of the next character to read. */
    public $pos = 0;

    /** @var int Length of $rtf in bytes. */
    public $len = 0;

    /** @var string Most recently read character; "" once the input is exhausted. */
    public $char = "";

    /** @var RtfGroup|null Group currently being filled. */
    public $group = null;

    /**
     * Read the next character into $char and advance $pos.
     *
     * Past the end of the input $char becomes "" while $pos still advances,
     * so the "$pos--" roll-backs used by the parse methods stay balanced.
     */
    protected function GetChar()
    {
        $this->char = ($this->pos < $this->len) ? $this->rtf[$this->pos] : "";
        $this->pos++;
    }

    /**
     * Append an element to the current group. Content that appears outside
     * of any group is dropped.
     */
    protected function AddChild($element)
    {
        if ($this->group !== null) {
            $this->group->children[] = $element;
        }
    }

    protected function ParseStartGroup()
    {
        $group = new RtfGroup();

        if ($this->root === null) {
            // The first group read becomes the root of the tree.
            $this->root = $group;
            $this->group = $group;
            return;
        }

        if ($this->group !== null) {
            $group->parent = $this->group;
            $this->group->children[] = $group;
        }
        // When the root group has already been closed, the new group is left
        // detached: its content is parsed but does not end up in the tree.
        $this->group = $group;
    }

    /**
     * Whether $char is an ASCII letter (A-Z or a-z).
     */
    protected function is_letter()
    {
        $code = ord($this->char);
        if ($code >= 65 && $code <= 90) {
            return TRUE;
        }
        if ($code >= 97 && $code <= 122) {
            return TRUE;
        }
        return FALSE;
    }

    /**
     * Whether $char is an ASCII digit (0-9).
     */
    protected function is_digit()
    {
        $code = ord($this->char);
        return ($code >= 48 && $code <= 57) ? TRUE : FALSE;
    }

    protected function ParseEndGroup()
    {
        // Return to the enclosing group; an unmatched "}" is ignored.
        if ($this->group !== null) {
            $this->group = $this->group->parent;
        }
    }

    protected function ParseControlWord()
    {
        // Read the letters that make up the word.
        $this->GetChar();
        $word = "";
        while ($this->is_letter()) {
            $word .= $this->char;
            $this->GetChar();
        }

        // Read parameter (if any) consisting of digits.
        // Parameter may be negative.
        $parameter = null;
        $negative = false;
        if ($this->char == '-') {
            $this->GetChar();
            $negative = true;
        }
        while ($this->is_digit()) {
            if ($parameter === null) {
                $parameter = 0;
            }
            $parameter = $parameter * 10 + (int) $this->char;
            $this->GetChar();
        }
        if ($parameter === null) {
            $parameter = 1;
        }
        if ($negative) {
            $parameter = -$parameter;
        }

        if ($word == "u") {
            // \uN is followed by a fallback character, which is consumed.
            if ($this->char == '\\') {
                if (substr($this->rtf, $this->pos, 1) == "'") {
                    // The fallback is written as \'hh: skip the quote and both hex digits.
                    $this->pos += 3;
                } else {
                    // Start of another control; leave it for the main loop.
                    $this->pos--;
                }
            } elseif ($this->char == '{' || $this->char == '}') {
                // Never swallow a brace, or the group nesting would break.
                $this->pos--;
            }
        } else {
            // If the current character is a space, then it is a delimiter
            // and it is consumed. Otherwise it belongs to the next item in
            // the text, so put the character back.
            if ($this->char != ' ') {
                $this->pos--;
            }
        }

        $rtfword = new RtfControlWord();
        $rtfword->word = $word;
        $rtfword->parameter = $parameter;
        $this->AddChild($rtfword);
    }

    protected function ParseControlSymbol()
    {
        // Read symbol (one character only).
        $this->GetChar();
        $symbol = $this->char;

        // Symbols ordinarily have no parameter. However,
        // if this is \', then it is followed by a 2-digit hex-code:
        $parameter = 0;
        if ($symbol == '\'') {
            $this->GetChar();
            $hex = $this->char;
            $this->GetChar();
            $parameter = hexdec($hex . $this->char);
        }

        $rtfsymbol = new RtfControlSymbol();
        $rtfsymbol->symbol = $symbol;
        $rtfsymbol->parameter = $parameter;
        $this->AddChild($rtfsymbol);
    }

    protected function ParseControl()
    {
        // Beginning of an RTF control word or control symbol.
        // Look ahead by one character to see if it starts with
        // a letter (control word) or another symbol (control symbol):
        $this->GetChar();
        $this->pos--;

        if ($this->is_letter()) {
            $this->ParseControlWord();
        } else {
            $this->ParseControlSymbol();
        }
    }

    protected function ParseText()
    {
        // Parse plain text up to backslash or brace, unless escaped.
        // On entry $char already holds the first character of the run.
        $text = "";
        $terminate = false;

        do {
            if ($this->char == '\\') {
                // Perform lookahead to see if this is really an escape sequence.
                $this->GetChar();
                if ($this->char == '\\' || $this->char == '{' || $this->char == '}') {
                    // Escaped literal: keep the character once.
                    $text .= $this->char;
                } else {
                    // Not an escape. Roll back to the backslash.
                    $this->pos = $this->pos - 2;
                    $terminate = true;
                }
            } elseif ($this->char == '{' || $this->char == '}') {
                // A brace ends the run and is left for the main loop.
                $this->pos--;
                $terminate = true;
            } else {
                $text .= $this->char;
            }

            if (!$terminate) {
                if ($this->pos >= $this->len) {
                    // Input exhausted; the current character is already stored.
                    break;
                }
                $this->GetChar();
            }
        } while (!$terminate);

        $rtftext = new RtfText();
        $rtftext->text = $text;
        $this->AddChild($rtftext);
    }

    /**
     * Parse an RTF document. Afterwards $root holds the outermost group, or
     * null when the input contained no group at all.
     *
     * @param string $rtf RTF source.
     */
    public function Parse($rtf)
    {
        $this->rtf = (string) $rtf;
        $this->pos = 0;
        $this->len = strlen($this->rtf);
        $this->char = "";
        $this->group = null;
        $this->root = null;

        while ($this->pos < $this->len) {
            // Read next character:
            $this->GetChar();

            // Ignore \r and \n
            if ($this->char == "\n" || $this->char == "\r") {
                continue;
            }

            // What type of character is this?
            switch ($this->char) {
                case '{':
                    $this->ParseStartGroup();
                    break;
                case '}':
                    $this->ParseEndGroup();
                    break;
                case '\\':
                    $this->ParseControl();
                    break;
                default:
                    $this->ParseText();
                    break;
            }
        }
    }
}

/**
 * Character formatting in effect at one point of the document.
 */
class RtfState
{
    public $bold;
    public $italic;
    public $underline;
    public $end_underline;
    public $strike;
    public $hidden;

    /** @var int|float Font size in pixels; 0 means "not set". */
    public $fontsize;

    public function __construct()
    {
        $this->Reset();
    }

    /**
     * Switch all formatting off.
     */
    public function Reset()
    {
        $this->bold = false;
        $this->italic = false;
        $this->underline = false;
        $this->end_underline = false;
        $this->strike = false;
        $this->hidden = false;
        $this->fontsize = 0;
    }
}

/**
 * Renders a tree produced by RtfReader as HTML.
 */
class RtfHtml
{
    // Formatter working state. These used to be created on the fly (and
    // were therefore public); they are declared with the same visibility.

    /** @var string HTML produced so far. */
    public $output = "";

    /** @var RtfState[] Stack of formatting states, one per open group. */
    public $states = array();

    /** @var RtfState|null State applying to the element being formatted. */
    public $state = null;

    /** @var int|null High surrogate of a \u pair awaiting its low surrogate. */
    protected $high_surrogate = null;

    /**
     * Convert a parsed RTF tree to HTML.
     *
     * @param RtfGroup|null $root Root group, normally RtfReader::$root.
     * @return string HTML; empty when $root is not a group.
     */
    public function Format($root)
    {
        $this->output = "";
        $this->high_surrogate = null;

        // Create a stack of states and put an initial standard state onto it:
        $this->states = array();
        $this->state = new RtfState();
        array_push($this->states, $this->state);

        // RtfReader leaves its root at null when the input held no group.
        if ($root instanceof RtfGroup) {
            $this->FormatGroup($root);
        }

        return $this->output;
    }

    protected function FormatGroup($group)
    {
        // Can we ignore this group?
        if ($group->IsIgnorable()) {
            return;
        }

        // Push a new state onto the stack, so that formatting changes made
        // inside the group end with it:
        $this->state = clone $this->state;
        array_push($this->states, $this->state);

        foreach ($group->children as $child) {
            if ($child instanceof RtfGroup) {
                $this->FormatGroup($child);
            } elseif ($child instanceof RtfControlWord) {
                $this->FormatControlWord($child);
            } elseif ($child instanceof RtfControlSymbol) {
                $this->FormatControlSymbol($child);
            } elseif ($child instanceof RtfText) {
                $this->FormatText($child);
            }
        }

        // Pop state from stack.
        array_pop($this->states);
        $this->state = $this->states[count($this->states) - 1];
    }

    protected function FormatControlWord($word)
    {
        switch ($word->word) {
            // Formatting:
            case "plain":
                $this->state->Reset();
                break;
            case "b":
                $this->state->bold = $word->parameter;
                break;
            case "i":
                $this->state->italic = $word->parameter;
                break;
            case "ul":
                $this->state->underline = $word->parameter;
                $this->state->end_underline = false;
                break;
            case "ulnone":
                // Ending the underline must also clear the underline flag,
                // otherwise the two settings contradict each other.
                $this->state->underline = false;
                $this->state->end_underline = $word->parameter;
                break;
            case "strike":
                $this->state->strike = $word->parameter;
                break;
            case "v":
                $this->state->hidden = $word->parameter;
                break;
            case "fs":
                $this->state->fontsize = ceil(($word->parameter / 24) * 16);
                break;
            case "par":
                $this->output .= "<p>";
                break;

            // Characters:
            case "lquote":
                $this->output .= "&lsquo;";
                break;
            case "rquote":
                $this->output .= "&rsquo;";
                break;
            case "ldblquote":
                $this->output .= "&ldquo;";
                break;
            case "rdblquote":
                $this->output .= "&rdquo;";
                break;
            case "emdash":
                $this->output .= "&mdash;";
                break;
            case "endash":
                $this->output .= "&ndash;";
                break;
            case "bullet":
                $this->output .= "&bull;";
                break;
            case "u":
                $this->FormatUnicode($word->parameter);
                break;
        }
    }

    /**
     * Emit the character of a \uN control word as a numeric entity.
     *
     * Negative parameters are treated as 16-bit values wrapped around, and
     * a high/low surrogate pair is combined into one code point. Values
     * that cannot be rendered produce the &loz; placeholder.
     *
     * @param int $parameter Parameter of the \u control word.
     */
    protected function FormatUnicode($parameter)
    {
        $code = (int) $parameter;
        if ($code < 0) {
            $code += 65536;
        }

        if ($code >= 0xD800 && $code <= 0xDBFF) {
            // High surrogate: wait for the low half before emitting anything.
            $this->high_surrogate = $code;
            return;
        }

        if ($code >= 0xDC00 && $code <= 0xDFFF) {
            if ($this->high_surrogate === null) {
                // Low surrogate without its partner.
                $this->output .= "&loz;";
                return;
            }
            $code = 0x10000 + (($this->high_surrogate - 0xD800) << 10) + ($code - 0xDC00);
        }
        $this->high_surrogate = null;

        if ($code >= 32 && $code <= 0x10FFFF) {
            $this->output .= "&#{$code};";
        } else {
            $this->output .= "&loz;";
        }
    }

    /**
     * Open a <span> carrying the CSS for the current state.
     */
    protected function BeginState()
    {
        $state = $this->state;
        $span = "";

        if ($state->bold) {
            $span .= "font-weight:bold;";
        }
        if ($state->italic) {
            $span .= "font-style:italic;";
        }

        // Underline and strike-through share one CSS property, so they are
        // written as a single declaration.
        $decorations = array();
        if ($state->underline) {
            $decorations[] = "underline";
        }
        if ($state->strike) {
            $decorations[] = "line-through";
        }
        if (count($decorations) > 0) {
            $span .= "text-decoration:" . implode(" ", $decorations) . ";";
        } elseif ($state->end_underline) {
            $span .= "text-decoration:none;";
        }

        if ($state->hidden) {
            $span .= "display:none;";
        }
        if ($state->fontsize != 0) {
            $span .= "font-size: {$state->fontsize}px;";
        }

        $this->output .= "<span style='{$span}'>";
    }

    protected function EndState()
    {
        $this->output .= "</span>";
    }

    protected function FormatControlSymbol($symbol)
    {
        $char = $symbol->symbol;

        if ($char === '\'') {
            // \'hh: a single byte given as two hex digits.
            $html = htmlentities(chr($symbol->parameter), ENT_QUOTES, 'ISO-8859-1');
        } elseif ($char === '\\' || $char === '{' || $char === '}') {
            // Escaped literal that the reader saw at the start of a text run.
            $html = $char;
        } elseif ($char === '~') {
            $html = "&nbsp;";
        } elseif ($char === '_') {
            // Non-breaking hyphen.
            $html = "&#8209;";
        } elseif ($char === '-') {
            // Optional hyphen.
            $html = "&shy;";
        } else {
            return;
        }

        $this->BeginState();
        $this->output .= $html;
        $this->EndState();
    }

    protected function FormatText($text)
    {
        $this->BeginState();
        // The text comes from the document, so markup characters are escaped.
        // ISO-8859-1 is a single-byte charset: all other bytes pass through
        // unchanged.
        $this->output .= htmlspecialchars((string) $text->text, ENT_QUOTES, 'ISO-8859-1');
        $this->EndState();
    }
}