Extract text

Docotic.Pdf Library Help > Samples > Text > Extract text
Docotic.Pdf documentation

This sample shows how to extract text from a single page or from an entire PDF document.

Use the PdfDocument.GetText() or PdfPage.GetText() methods to extract text in plain text format. You can also use the PdfCanvas.GetTextData() method to extract text chunks together with their coordinates.

Alternative methods are PdfDocument.GetTextWithFormatting() and PdfPage.GetTextWithFormatting(). These methods extract text with formatting. Formatting here means that the relative positions of the text pieces are kept after extraction, so the extracted text looks more readable. Extracting text with formatting may be especially useful for PDF documents with tabular data.

C# 
using System.Diagnostics;
using System.IO;

namespace BitMiracle.Docotic.Pdf.Samples
{
    /// <summary>
    /// Sample that extracts text from a PDF document in three ways (plain text of the
    /// whole document, formatted text of the whole document, plain text of the first
    /// page), saves each result to a text file and opens that file.
    /// </summary>
    public static class ExtractText
    {
        /// <summary>
        /// Entry point of the sample. Reads "Sample data/jfif3.pdf" and produces three
        /// text files in the current working directory.
        /// </summary>
        public static void Main()
        {
            // NOTE: 
            // When used in trial mode, the library imposes some restrictions.
            // Please visit http://bitmiracle.com/pdf-library/trial-restrictions.aspx
            // for more information.

            using (PdfDocument pdf = new PdfDocument("Sample data/jfif3.pdf"))
            {
                // Extract plain text from document
                SaveAndOpen("Document text.txt", pdf.GetText());

                // Extract text with formatting from document
                SaveAndOpen("Document text with formatting.txt", pdf.GetTextWithFormatting());

                // Extract plain text from first page
                SaveAndOpen("First page text.txt", pdf.Pages[0].GetText());
            }
        }

        /// <summary>
        /// Writes <paramref name="text"/> to the file at <paramref name="filePath"/>
        /// (overwriting any existing file) and then opens the file.
        /// </summary>
        /// <param name="filePath">Path of the text file to create.</param>
        /// <param name="text">Text to store in the file.</param>
        private static void SaveAndOpen(string filePath, string text)
        {
            using (StreamWriter writer = new StreamWriter(filePath))
            {
                writer.Write(text);
            }

            // Opening a document (rather than an executable) requires shell execution.
            // UseShellExecute defaults to true on .NET Framework but to false on
            // .NET Core / .NET 5+, where Process.Start(filePath) would throw; setting
            // it explicitly makes the sample work on both.
            ProcessStartInfo startInfo = new ProcessStartInfo(filePath)
            {
                UseShellExecute = true
            };
            Process.Start(startInfo);
        }
    }
}
Visual Basic 
Imports System.Diagnostics
Imports System.IO

Imports BitMiracle.Docotic.Pdf

Namespace BitMiracle.Docotic.Pdf.Samples
    ''' <summary>
    ''' Sample that extracts text from a PDF document in three ways (plain text of the
    ''' whole document, formatted text of the whole document, plain text of the first
    ''' page), saves each result to a text file and opens that file.
    ''' </summary>
    Public NotInheritable Class ExtractText
        ''' <summary>
        ''' Entry point of the sample. Reads "Sample data/jfif3.pdf" and produces three
        ''' text files in the current working directory.
        ''' </summary>
        Public Shared Sub Main()
            ' NOTE: 
            ' When used in trial mode, the library imposes some restrictions.
            ' Please visit http://bitmiracle.com/pdf-library/trial-restrictions.aspx
            ' for more information.

            Using pdf As New PdfDocument("Sample data/jfif3.pdf")

                ' Extract plain text from document
                SaveAndOpen("Document text.txt", pdf.GetText())

                ' Extract text with formatting from document
                SaveAndOpen("Document text with formatting.txt", pdf.GetTextWithFormatting())

                ' Extract plain text from first page
                SaveAndOpen("First page text.txt", pdf.Pages(0).GetText())
            End Using
        End Sub

        ''' <summary>
        ''' Writes <paramref name="text"/> to the file at <paramref name="filePath"/>
        ''' (overwriting any existing file) and then opens the file.
        ''' </summary>
        ''' <param name="filePath">Path of the text file to create.</param>
        ''' <param name="text">Text to store in the file.</param>
        Private Shared Sub SaveAndOpen(ByVal filePath As String, ByVal text As String)
            Using writer As New StreamWriter(filePath)
                writer.Write(text)
            End Using

            ' Opening a document (rather than an executable) requires shell execution.
            ' UseShellExecute defaults to True on .NET Framework but to False on
            ' .NET Core / .NET 5+, where Process.Start(filePath) would throw; setting
            ' it explicitly makes the sample work on both.
            Dim startInfo As New ProcessStartInfo(filePath) With {.UseShellExecute = True}
            Process.Start(startInfo)
        End Sub
    End Class
End Namespace