This sample shows how to extract text from a single page or from an entire PDF document.
Use the PdfDocument.GetText() or PdfPage.GetText() methods to extract text in plain text format. You can also use the PdfCanvas.GetTextData() method to extract text chunks with their coordinates.
Alternatively, use the PdfDocument.GetTextWithFormatting() and PdfPage.GetTextWithFormatting() methods, which extract text with formatting. Formatting means that the relative positions of all text pieces are kept after extraction, so the extracted text looks more readable. Extracting text with formatting may be especially useful for PDF documents with tabular data.
using System.Diagnostics;
using System.IO;

namespace BitMiracle.Docotic.Pdf.Samples
{
    /// <summary>
    /// Sample that extracts text from a PDF document and from its first page,
    /// saves each result to a text file and opens that file.
    /// </summary>
    public static class ExtractText
    {
        /// <summary>
        /// Opens "Sample data/jfif3.pdf" and produces three text files:
        /// the plain text of the whole document, the text of the whole
        /// document with formatting, and the plain text of the first page.
        /// </summary>
        public static void Main()
        {
            // NOTE:
            // When used in trial mode, the library imposes some restrictions.
            // Please visit http://bitmiracle.com/pdf-library/trial-restrictions.aspx
            // for more information.

            using (PdfDocument pdf = new PdfDocument("Sample data/jfif3.pdf"))
            {
                // Extract plain text from document
                SaveAndOpen("Document text.txt", pdf.GetText());

                // Extract text with formatting from document
                SaveAndOpen("Document text with formatting.txt", pdf.GetTextWithFormatting());

                // Extract plain text from first page
                SaveAndOpen("First page text.txt", pdf.Pages[0].GetText());
            }
        }

        /// <summary>
        /// Writes <paramref name="text"/> to <paramref name="fileName"/>
        /// (creating or overwriting the file) and then asks the operating
        /// system shell to open the file.
        /// </summary>
        /// <param name="fileName">Path of the text file to write and open.</param>
        /// <param name="text">Extracted text to store in the file.</param>
        private static void SaveAndOpen(string fileName, string text)
        {
            File.WriteAllText(fileName, text);

            // UseShellExecute must be requested explicitly: it defaults to true on
            // .NET Framework but to false on .NET Core / .NET 5+, where starting a
            // document (rather than an executable) would otherwise throw.
            Process.Start(new ProcessStartInfo(fileName) { UseShellExecute = true });
        }
    }
}
Imports System.Diagnostics
Imports System.IO
Imports BitMiracle.Docotic.Pdf

Namespace BitMiracle.Docotic.Pdf.Samples
    ''' <summary>
    ''' Sample that extracts text from a PDF document and from its first page,
    ''' saves each result to a text file and opens that file.
    ''' </summary>
    Public NotInheritable Class ExtractText
        ''' <summary>
        ''' Opens "Sample data/jfif3.pdf" and produces three text files:
        ''' the plain text of the whole document, the text of the whole
        ''' document with formatting, and the plain text of the first page.
        ''' </summary>
        Public Shared Sub Main()
            ' NOTE:
            ' When used in trial mode, the library imposes some restrictions.
            ' Please visit http://bitmiracle.com/pdf-library/trial-restrictions.aspx
            ' for more information.

            Using pdf As New PdfDocument("Sample data/jfif3.pdf")
                ' Extract plain text from document
                SaveAndOpen("Document text.txt", pdf.GetText())

                ' Extract text with formatting from document
                SaveAndOpen("Document text with formatting.txt", pdf.GetTextWithFormatting())

                ' Extract plain text from first page
                SaveAndOpen("First page text.txt", pdf.Pages(0).GetText())
            End Using
        End Sub

        ''' <summary>
        ''' Writes <paramref name="text"/> to <paramref name="fileName"/>
        ''' (creating or overwriting the file) and then asks the operating
        ''' system shell to open the file.
        ''' </summary>
        ''' <param name="fileName">Path of the text file to write and open.</param>
        ''' <param name="text">Extracted text to store in the file.</param>
        Private Shared Sub SaveAndOpen(ByVal fileName As String, ByVal text As String)
            File.WriteAllText(fileName, text)

            ' UseShellExecute must be requested explicitly: it defaults to True on
            ' .NET Framework but to False on .NET Core / .NET 5+, where starting a
            ' document (rather than an executable) would otherwise throw.
            Process.Start(New ProcessStartInfo(fileName) With {.UseShellExecute = True})
        End Sub
    End Class
End Namespace