Simple ways to extract Tabular Content from PDFs
Whether you are a digital marketing firm or a garment wholesaler, setting your digital base and keeping your books digitalized has become imperative. Several sectors today function on digital documents only. Amongst all the digital copies, the most common type is PDF. Due to its strict security standards, it becomes difficult for companies to procure selected data from the documents.
Companies find it difficult to search within documents, to analyze trends based on the data saved within PDF documents, and to process paperwork in bulk. .NET experts at DEV IT have come up with efficient solutions to all of the above problems. Read this article to know more.
How to extract tables of contents (TOC) from PDFs?
To extract tabular data from PDFs, you may leverage the following libraries:
- iTextSharp: Open-source library used to extract the text and the font style of the content.
- HtmlAgilityPack: Open-source library available to process HTML nodes/Tags.
Steps to extract TOC:
1. Extract HTML from the PDF using iTextSharp:
- Use TextWithFontExtractionStategy class to get the font style and size information of the text from the PDF. TextWithFontExtractionStategy Class is given as a reference at the end of the blog.
- Prepare custom HTML for PDF and add page number for each page.
- Method: refer to GetPdfHTMLWithPageNo(string pdfPath)
- The above method will return the HTML from the PDF with font style and size.
2. Extract all the headers highlighted in bold using HtmlAgilityPack
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
- Use HtmlAgilityPack and load your HTML into an HtmlDocument instance, created as shown above.
- Loop through each page and node to get the content highlighted in bold. Refer to GetPageWiseHaderList(Stream HTMLPath)
- Alternatively, to extract the content that has a greater font size than the regular content, you may follow the same step.
Using the above steps, you can get the header contents from the PDF. You can also auto-tag, bookmark important keywords or content using the same TOC.
Extracting data from PDFs is easy yet technical. If you are someone who requires external help to fetch the necessary data from your digital documents, get in touch with a DEV IT expert here.
Code Snippet:
/// <summary>
/// Reads a PDF file and builds an HTML document in which the text of every page that
/// yields non-blank text is wrapped in a
/// <c>&lt;div class='PageStart' PageNo=N&gt;</c> element, N being the 1-based page number.
/// The page text is produced by <c>TextWithFontExtractionStategy</c>.
/// </summary>
/// <param name="pdfPath">Path of the PDF file to read.</param>
/// <returns>A <see cref="StringBuilder"/> holding the generated HTML.</returns>
/// <exception cref="FileNotFoundException">
/// Thrown when <paramref name="pdfPath"/> does not point to an existing file.
/// </exception>
public StringBuilder GetPdfHTMLWithPageNo(string pdfPath)
{
    if (!File.Exists(pdfPath))
    {
        // The message used to be a string literal broken across two physical lines,
        // which does not compile; the line break is now explicit.
        throw new FileNotFoundException(
            "fileName not found..." + Environment.NewLine + "Filename ==> " + pdfPath,
            pdfPath);
    }

    // There is intentionally no catch-all wrapper here: the previous
    // "catch (Exception ex) { throw ex; }" only reset the stack trace.
    using (PdfReader reader = new PdfReader(pdfPath))
    {
        // NOTE(review): unethicalreading is accessed through the type; presumably 'false'
        // keeps iTextSharp honoring the document's permission flags - confirm.
        PdfReader.unethicalreading = false;

        StringBuilder html = new StringBuilder();
        html.Append("<HTML>" + Environment.NewLine);

        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            string text;
            try
            {
                // A fresh strategy per page so that no span state leaks between pages.
                TextWithFontExtractionStategy strategy = new TextWithFontExtractionStategy();
                text = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page, strategy);
            }
            catch (Exception ex)
            {
                // Best effort: one unreadable page must not abort the whole document,
                // but the failure is traced instead of being swallowed silently.
                System.Diagnostics.Trace.TraceWarning(
                    "Text extraction failed for page {0} of '{1}': {2}", page, pdfPath, ex);
                continue;
            }

            if (string.IsNullOrWhiteSpace(text))
            {
                // Pages without extractable text produce no <div>.
                continue;
            }

            html.Append("<div class='PageStart' PageNo=" + page.ToString() + ">" + Environment.NewLine);
            // The text is appended as-is. The former round-trip through Encoding.Default
            // replaced every character outside the ANSI code page with '?'.
            html.Append(text);
            html.Append("</div>" + Environment.NewLine);
        }

        html.Append("</HTML>" + Environment.NewLine);
        return html;
    }
}
/// <summary>
/// Text extraction strategy that renders the text it receives as HTML: every run of text
/// sharing the same baseline, font name and font size is wrapped in a
/// <c>&lt;span style="font-family:...;font-size:..."&gt;</c> element, and a
/// <c>&lt;br /&gt;</c> is emitted whenever the baseline changes.
/// </summary>
public class TextWithFontExtractionStategy : iTextSharp.text.pdf.parser.ITextExtractionStrategy
{
    // HTML buffer
    private readonly StringBuilder result = new StringBuilder();

    // Properties of the most recently rendered chunk; lastBaseLine == null means
    // that nothing has been rendered yet.
    private Vector lastBaseLine;
    private string lastFont;
    private float lastFontSize;

    // Names for the integer text render modes compared against GetTextRenderMode().
    private enum TextRenderMode
    {
        FillText = 0,
        StrokeText = 1,
        FillThenStrokeText = 2,
        Invisible = 3,
        FillTextAndAddToPathForClipping = 4,
        StrokeTextAndAddToPathForClipping = 5,
        FillThenStrokeTextAndAddToPathForClipping = 6,
        AddTextToPathForClipping = 7
    }

    /// <summary>
    /// Appends one chunk of text to the HTML buffer, opening a new span (and closing the
    /// previous one) when the baseline, font name or font size differs from the last chunk.
    /// </summary>
    /// <param name="renderInfo">Render information of the text chunk.</param>
    public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
    {
        string curFont = renderInfo.GetFont().PostscriptFontName;

        // Check if faux bold is used: fill-then-stroke text is tagged as "-Bold"
        // so that it can later be found by font name.
        if (renderInfo.GetTextRenderMode() == (int)TextRenderMode.FillThenStrokeText)
        {
            curFont += "-Bold";
        }

        // This code assumes that if the baseline changes then we're on a new line.
        Vector curBaseline = renderInfo.GetBaseline().GetStartPoint();
        Vector topRight = renderInfo.GetAscentLine().GetEndPoint();

        // The font size is taken as the height of the box spanned by the start of the
        // baseline and the end of the ascent line.
        iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(
            curBaseline[Vector.I1], curBaseline[Vector.I2], topRight[Vector.I1], topRight[Vector.I2]);
        float curFontSize = rect.Height;

        bool isFirstChunk = this.lastBaseLine == null;
        bool baselineChanged = !isFirstChunk && curBaseline[Vector.I2] != this.lastBaseLine[Vector.I2];

        // See if something has changed: the baseline, the font or the font size.
        if (isFirstChunk || baselineChanged || curFontSize != this.lastFontSize || curFont != this.lastFont)
        {
            // If we've put down at least one span tag, close it.
            if (!isFirstChunk)
            {
                this.result.AppendLine("</span>");
            }

            // If the baseline has changed then insert a line break.
            if (baselineChanged)
            {
                this.result.AppendLine("<br />");
            }

            // Create an HTML tag with appropriate styles.
            // NOTE(review): the font size is formatted with the current culture; this is
            // kept because whoever parses this style must agree on the format - confirm
            // before switching to the invariant culture.
            this.result.AppendFormat("<span style=\"font-family:{0};font-size:{1}\">", curFont, curFontSize);
        }

        // Append the current text.
        // NOTE(review): the text is not HTML-encoded; '<' or '&' in the PDF text end up
        // verbatim in the markup.
        this.result.Append(renderInfo.GetText());

        // Remember the properties of this chunk for the next call.
        this.lastBaseLine = curBaseline;
        this.lastFontSize = curFontSize;
        this.lastFont = curFont;
    }

    /// <summary>
    /// Returns the HTML collected so far, with the last open span closed.
    /// </summary>
    /// <returns>The generated HTML, or an empty string when nothing was rendered.</returns>
    public string GetResultantText()
    {
        if (this.result.Length == 0)
        {
            return string.Empty;
        }

        // Anything written always leaves one span open. The closing tag is added to the
        // returned string only, not to the buffer, so calling this method more than once
        // no longer piles up extra "</span>" tags.
        return this.result.ToString() + "</span>";
    }

    // Not needed
    public void BeginTextBlock() { }
    public void EndTextBlock() { }
    public void RenderImage(ImageRenderInfo renderInfo) { }
}
/// <summary>
/// Loads the HTML from <paramref name="HTMLPath"/> and collects, page by page, the text
/// of every styled node that looks like a header: its <c>style</c> attribute either
/// contains one of the values returned by <c>GetBoldStyleDictionary()</c>, or declares a
/// font size that rounds to 10 or more.
/// </summary>
/// <param name="HTMLPath">
/// Stream containing the HTML to analyse. The stream is read but not closed here.
/// </param>
/// <returns>
/// The headers found, each with the page number taken from the <c>pageno</c> attribute
/// of its page element; empty when the document contains no page element.
/// </returns>
public List<Header> GetPageWiseHaderList(Stream HTMLPath)
{
    List<Header> lstHeaders = new List<Header>();
    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.Load(HTMLPath);

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var pageNodes = doc.DocumentNode.SelectNodes("//*[@*[contains(., 'PageStart')]]");
    if (pageNodes == null)
    {
        return lstHeaders;
    }

    List<string> lstBoldStyles = this.GetBoldStyleDictionary() ?? new List<string>();

    foreach (var page in pageNodes)
    {
        // Elements matched by the XPath but lacking a usable page number are skipped
        // instead of failing with a NullReferenceException/FormatException.
        var pageNoAttribute = page.Attributes.FirstOrDefault(x => x.Name == "pageno");
        int pageNumber;
        if (pageNoAttribute == null || !int.TryParse(pageNoAttribute.Value, out pageNumber))
        {
            continue;
        }

        var styledNodes = page.SelectNodes(".//*[@*[contains(., 'font-family')]]");
        if (styledNodes == null)
        {
            continue;
        }

        foreach (var node in styledNodes)
        {
            var styleAttribute = node.Attributes.FirstOrDefault(x => x.Name == "style");
            if (styleAttribute == null)
            {
                continue;
            }

            string style = styleAttribute.Value;
            bool hasBoldStyle = lstBoldStyles.Any(x => x != null && style.Contains(x));

            // The font-size test used to sit inside the loop over the bold styles, so it
            // was never evaluated when that list was empty; it is now independent of it.
            if (hasBoldStyle || HasHeaderFontSize(style))
            {
                lstHeaders.Add(new Header { PageNo = pageNumber, HeaderValue = node.InnerText });
            }
        }
    }

    return lstHeaders;
}

// Returns true when the style declares a "font-size:" whose value rounds to 10 or more.
private static bool HasHeaderFontSize(string style)
{
    const string fontSizeKey = "font-size:";

    int keyIndex = style.IndexOf(fontSizeKey, StringComparison.OrdinalIgnoreCase);
    if (keyIndex < 0)
    {
        return false;
    }

    string value = style.Substring(keyIndex + fontSizeKey.Length);
    int terminator = value.IndexOf(';');
    if (terminator >= 0)
    {
        value = value.Substring(0, terminator);
    }

    // Parsed with the current culture, like the Convert.ToDouble call this replaces.
    double fontSize;
    if (!double.TryParse(value.Trim(), out fontSize))
    {
        return false;
    }

    // Math.Round uses the same round-half-to-even rule as Convert.ToInt32(double),
    // so the threshold is unchanged, but it cannot overflow.
    return Math.Round(fontSize) >= 10;
}
/// <summary>
/// A header found in a document, together with the page it was found on.
/// </summary>
public class Header
{
    private int pageNo;
    private string headerValue;

    /// <summary>Number of the page the header belongs to.</summary>
    public int PageNo
    {
        get { return this.pageNo; }
        set { this.pageNo = value; }
    }

    /// <summary>Text of the header.</summary>
    public string HeaderValue
    {
        get { return this.headerValue; }
        set { this.headerValue = value; }
    }
}