Extract text

Docotic.Pdf Library Help > Samples > Text > Extract text

This sample shows how to extract text from a single page or from an entire PDF document.

Use the PdfDocument.GetText() or PdfPage.GetText() method to extract text in plain text format. You can also use the PdfCanvas.GetTextData() method to extract text chunks together with their coordinates.

Alternatively, use the PdfDocument.GetTextWithFormatting() and PdfPage.GetTextWithFormatting() methods. These methods extract text with formatting, which means that the relative positions of the text are preserved after extraction, so the result is more readable. Extracting text with formatting is especially useful for PDF documents that contain tabular data.

C#
using System.Diagnostics;
using System.IO;

namespace BitMiracle.Docotic.Pdf.Samples
{
    /// <summary>
    /// Sample that extracts text from a PDF document and from its first page,
    /// writes each result to a text file and passes that file to
    /// <see cref="Process.Start(string)"/>.
    /// </summary>
    public static class ExtractText
    {
        /// <summary>
        /// Entry point of the sample. Produces three files in the current directory:
        /// the plain text of the whole document, the text of the whole document
        /// with formatting, and the plain text of the first page.
        /// </summary>
        public static void Main()
        {
            // NOTE: 
            // When used in trial mode, the library imposes some restrictions.
            // Please visit http://bitmiracle.com/pdf-library/trial-restrictions.aspx
            // for more information.

            // The using statement guarantees the document is disposed even when
            // text extraction, file writing or Process.Start throws.
            using (PdfDocument pdf = new PdfDocument("Sample data/jfif3.pdf"))
            {
                // Extract plain text from document
                string documentTextFile = "Document text.txt";
                using (StreamWriter writer = new StreamWriter(documentTextFile))
                {
                    writer.Write(pdf.GetText());
                }

                Process.Start(documentTextFile);

                // Extract text with formatting from document
                string documentTextFormattedFile = "Document text with formatting.txt";
                using (StreamWriter writer = new StreamWriter(documentTextFormattedFile))
                {
                    writer.Write(pdf.GetTextWithFormatting());
                }

                Process.Start(documentTextFormattedFile);

                // Extract plain text from first page
                string firstPageTextFile = "First page text.txt";
                using (StreamWriter writer = new StreamWriter(firstPageTextFile))
                {
                    writer.Write(pdf.Pages[0].GetText());
                }

                Process.Start(firstPageTextFile);
            }
        }
    }
}
VB.NET
Imports System.Diagnostics
Imports System.IO

Imports BitMiracle.Docotic.Pdf

Namespace BitMiracle.Docotic.Pdf.Samples
    ''' <summary>
    ''' Sample that extracts text from a PDF document and from its first page,
    ''' writes each result to a text file and passes that file to Process.Start.
    ''' </summary>
    Public NotInheritable Class ExtractText
        ''' <summary>
        ''' Entry point of the sample. Produces three files in the current directory:
        ''' the plain text of the whole document, the text of the whole document
        ''' with formatting, and the plain text of the first page.
        ''' </summary>
        Public Shared Sub Main()
            ' NOTE: 
            ' When used in trial mode, the library imposes some restrictions.
            ' Please visit http://bitmiracle.com/pdf-library/trial-restrictions.aspx
            ' for more information.

            ' The Using block guarantees the document is disposed even when
            ' text extraction, file writing or Process.Start throws.
            Using pdf As New PdfDocument("Sample data/jfif3.pdf")
                ' Extract plain text from document
                Dim documentTextFile As String = "Document text.txt"
                Using writer As New StreamWriter(documentTextFile)
                    writer.Write(pdf.GetText())
                End Using

                Process.Start(documentTextFile)

                ' Extract text with formatting from document
                Dim documentTextFormattedFile As String = "Document text with formatting.txt"
                Using writer As New StreamWriter(documentTextFormattedFile)
                    writer.Write(pdf.GetTextWithFormatting())
                End Using

                Process.Start(documentTextFormattedFile)

                ' Extract plain text from first page
                Dim firstPageTextFile As String = "First page text.txt"
                Using writer As New StreamWriter(firstPageTextFile)
                    writer.Write(pdf.Pages(0).GetText())
                End Using

                Process.Start(firstPageTextFile)
            End Using
        End Sub
    End Class
End Namespace