Meta

iTextSharp slightly smarter text extraction strategy

iTextSharp’s SimpleTextExtractionStrategy is great, but it is simple, as the name implies. It can detect new lines pretty well, but it pays no attention to the order of the lines themselves. If your PDF isn’t written top to bottom (as many PDFs aren’t), you’ll get everything out of order. The code below is a modified version of the current SimpleTextExtractionStrategy found here. Instead of just appending text to a master buffer and inserting a newline every time a different Y coordinate is found, it keeps a separate buffer for each Y coordinate in a sorted dictionary and appends each piece of text to the buffer for its line. There are actually several flaws in this logic, but so far it has been working pretty well for me, at least in comparison to the old way.

I’ve removed many of the comments from the source; see the original link above for details on the various fields and methods. Any comments that remain below describe my alterations to the original.

using iTextSharp.text.pdf.parser;
using System;
using System.Collections.Generic;
using System.Text;
/*
* $Id: SimpleTextExtractionStrategy.cs 318 2012-02-27 22:46:07Z psoares33 $
*
* This file is part of the iText project.
* Copyright (c) 1998-2012 1T3XT BVBA
* Authors: Kevin Day, Bruno Lowagie, Paulo Soares, et al.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License version 3
* as published by the Free Software Foundation with the addition of the
* following permission added to Section 15 as permitted in Section 7(a):
* FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
* 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Affero General Public License for more details.
* You should have received a copy of the GNU Affero General Public License
* along with this program; if not, see http://www.gnu.org/licenses or write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA, 02110-1301 USA, or download the license from the following URL:
* http://itextpdf.com/terms-of-use/
*
* The interactive user interfaces in modified source and object code versions
* of this program must display Appropriate Legal Notices, as required under
* Section 5 of the GNU Affero General Public License.
*
* In accordance with Section 7(b) of the GNU Affero General Public License,
* you must retain the producer line in every PDF that is created or manipulated
* using iText.
*
* You can be released from the requirements of the license by purchasing
* a commercial license. Buying such a license is mandatory as soon as you
* develop commercial activities involving the iText software without
* disclosing the source code of your own applications.
* These activities include: offering paid services to customers as an ASP,
* serving PDFs on the fly in a web application, shipping iText with a closed
* source product.
*
* For more information, please contact iText Software Corp. at this
* address: sales@itextpdf.com
*/
/// <summary>
/// Text extraction strategy that files every rendered text chunk under a line keyed by the
/// (truncated) Y coordinate of its baseline start point, and returns those lines ordered from
/// the highest Y coordinate to the lowest, regardless of the order the chunks were rendered in.
/// </summary>
public class TopToBottomTextExtractionStrategy : ITextExtractionStrategy {
    //A chunk whose squared distance from the previous chunk's baseline is at most this stays on the previous line
    private const float SameLineThreshold = 1f;

    private Vector lastStart;
    private Vector lastEnd;
    //Key the previous chunk was actually stored under. Reusing this (instead of re-deriving a key from
    //lastStart's Y value) keeps a run of chunks on one line even when their Y values straddle an integer.
    private int lastLineKey;
    //Store each line individually. A SortedDictionary keeps the lines ordered by key.
    private readonly SortedDictionary<int, StringBuilder> results = new SortedDictionary<int, StringBuilder>();

    //Constructor and some methods that aren't used
    public TopToBottomTextExtractionStrategy() { }
    public virtual void BeginTextBlock() { }
    public virtual void EndTextBlock() { }
    public virtual void RenderImage(ImageRenderInfo renderInfo) { }

    /// <summary>
    /// Joins the collected lines into one block of text.
    /// </summary>
    /// <returns>Every collected line in ascending key order, each followed by a line terminator.</returns>
    public virtual String GetResultantText() {
        StringBuilder buf = new StringBuilder();
        //Keys are negated Y values, so ascending key order is top of the page first
        foreach (var line in results) {
            buf.AppendLine(line.Value.ToString());
        }
        return buf.ToString();
    }

    /// <summary>
    /// Appends the text of a rendered chunk to the line it belongs to, inserting a single space
    /// in front of it when the gap to the previously rendered chunk looks like a word break.
    /// </summary>
    /// <param name="renderInfo">Chunk being rendered; its baseline, text and single-space width are read.</param>
    public virtual void RenderText(TextRenderInfo renderInfo) {
        bool firstRender = results.Count == 0;
        LineSegment segment = renderInfo.GetBaseline();
        Vector start = segment.GetStartPoint();
        Vector end = segment.GetEndPoint();
        string text = renderInfo.GetText();

        //Use the Y value of the start of the baseline for the key.
        //Hack: PDFs start with zero at the bottom so our keys would be upside down. Negating the key cheats this.
        int currentLineKey = -(int)start[1];

        //If we've detected that we're still on the same line as the previous chunk, reuse that chunk's key
        if (!firstRender && IsOnLastBaseline(start)) {
            currentLineKey = lastLineKey;
        }

        //If this line hasn't been used before add a new line to our collection
        StringBuilder line;
        if (!results.TryGetValue(currentLineKey, out line)) {
            line = new StringBuilder();
            results.Add(currentLineKey, line);
        }

        //Insert a space between blocks of text if it appears there should be
        if (!firstRender &&                   //First pass never needs a leading space
            line.Length != 0 &&               //Don't append a space to the beginning of a line
            line[line.Length - 1] != ' ' &&   //Don't append if the current buffer ends in a space already
            text.Length > 0 &&                //Don't append if the new text is empty
            text[0] != ' ') {                 //Don't append if the new text starts with a space
            //Calculate the distance between the end of the previous block and the start of this one
            float spacing = lastEnd.Subtract(start).Length;
            //If it "looks" like it should be a space
            if (spacing > renderInfo.GetSingleSpaceWidth() / 2f) {
                line.Append(' ');
            }
        }

        //Add the text to the line in our collection
        line.Append(text);

        lastStart = start;
        lastEnd = end;
        lastLineKey = currentLineKey;
    }

    //True when point lies within SameLineThreshold (squared distance) of the line through the previous chunk's baseline
    private bool IsOnLastBaseline(Vector point) {
        Vector direction = lastEnd.Subtract(lastStart);
        float directionLengthSquared = direction.LengthSquared;
        //A zero-length previous baseline defines no line to measure against; treat the chunk as a new line
        //instead of relying on the NaN/Infinity that the division below would produce.
        if (!(directionLengthSquared > 0f)) {
            return false;
        }
        float dist = direction.Cross(lastStart.Subtract(point)).LengthSquared / directionLengthSquared;
        return dist <= SameLineThreshold;
    }
}