Extract text from link target

Docotic.Pdf Library Help > Samples > Pages and Navigation > Extract text from link target

This sample shows how to retrieve information about a link and extract text starting from the link's target.

Links in a PDF document are represented as PdfActionArea objects. A PdfActionArea object can be used to get the page that hosts the action area and the bounds of the area. PdfActionArea also exposes the Action property. For links, this property contains an instance of PdfGoToAction. The PdfGoToAction object, through its View property, provides the page the link points to.

To extract text from the link's target page, we first get the top offset on the target page and then retrieve all text on that page whose coordinates are below the top offset.

Please note that the trial version of Docotic.Pdf loads only odd pages when a document is opened, so you may find that some expected links and pages are missing. The licensed version doesn't have this limitation and loads all pages and links.

C# 
using System;
using System.Drawing;
using System.Text;
using System.Windows.Forms;

namespace BitMiracle.Docotic.Pdf.Samples
{
    public static class TextFromLink
    {
        /// <summary>
        /// Read-only description of a link: the go-to action behind it, its position
        /// in the document widgets collection, its bounds, and the indexes of the page
        /// that hosts the link and of the page the link points to.
        /// </summary>
        private class LinkInfo
        {
            /// <summary>Go-to action taken from the action area.</summary>
            public readonly PdfGoToAction Action;

            /// <summary>Index value supplied by the caller (the sample passes the widget index).</summary>
            public readonly int Index = -1;

            /// <summary>Bounding box of the action area.</summary>
            public readonly PdfRectangle Bounds;

            /// <summary>Result of <c>pdf.IndexOf</c> for the page that owns the action area.</summary>
            public readonly int OwnerPageNumber = -1;

            /// <summary>Result of <c>pdf.IndexOf</c> for the page referenced by the action's view.</summary>
            public readonly int TargetPageNumber = -1;

            /// <summary>
            /// Collects link details from the given action area.
            /// </summary>
            /// <param name="pdf">Document used to resolve page indexes.</param>
            /// <param name="actionArea">Action area whose action is expected to be a PdfGoToAction.</param>
            /// <param name="index">Index of the action area, stored as-is.</param>
            /// <exception cref="ArgumentNullException">
            /// <paramref name="pdf"/> or <paramref name="actionArea"/> is null.
            /// </exception>
            /// <exception cref="ArgumentException">
            /// The action of <paramref name="actionArea"/> is not a PdfGoToAction.
            /// </exception>
            public LinkInfo(PdfDocument pdf, PdfActionArea actionArea, int index)
            {
                // The reported parameter name must match the actual parameter ("pdf"),
                // otherwise the exception points callers at a non-existent argument.
                if (pdf == null)
                    throw new ArgumentNullException("pdf");

                if (actionArea == null)
                    throw new ArgumentNullException("actionArea");

                Action = actionArea.Action as PdfGoToAction;
                if (Action == null)
                    throw new ArgumentException("Action area doesn't contain link", "actionArea");

                Index = index;
                Bounds = actionArea.BoundingBox;
                OwnerPageNumber = pdf.IndexOf(actionArea.Owner);

                // NOTE(review): Action.View.Page is not checked for null here; the sample's
                // getFirstLink only constructs LinkInfo for links with a non-null target page.
                TargetPageNumber = pdf.IndexOf(Action.View.Page);
            }
        }

        /// <summary>
        /// Sample entry point. Opens the sample document, looks up the first link with a
        /// target page, builds a description of that link together with the text extracted
        /// from its target, then opens the PDF file and shows the description in a message box.
        /// </summary>
        public static void Main()
        {
            // NOTE: 
            // When used in trial mode, the library imposes some restrictions.
            // Please visit http://bitmiracle.com/pdf-library/trial-restrictions.aspx
            // for more information.

            const string samplePath = @"Sample Data\Link.pdf";

            using (PdfDocument document = new PdfDocument(samplePath))
            {
                LinkInfo firstLink = getFirstLink(document);
                if (firstLink == null)
                {
                    MessageBox.Show("Document doesn't contain links!");
                    return;
                }

                StringBuilder report = new StringBuilder();

                // Facts about the link itself.
                report.Append("Link's index in document widgets collection: ").Append(firstLink.Index).AppendLine();
                report.Append("Number of page with link: ").Append(firstLink.OwnerPageNumber).AppendLine();
                report.Append("Link bounds: ").AppendLine(firstLink.Bounds.ToString());
                report.Append("Link points to page # ").Append(firstLink.TargetPageNumber).AppendLine();
                report.AppendLine("NOTE: All page numbers are zero-based. If you use trial version of Docotic.Pdf then even pages of original document are not loaded.");

                // Text found at the link target.
                report.AppendLine();
                report.AppendLine("Text from link:");
                report.AppendLine(getTextFromLink(firstLink.Action));

                // Open the source document so the reported values can be compared with it.
                System.Diagnostics.Process.Start(samplePath);
                MessageBox.Show(report.ToString());
            }
        }

        /// <summary>
        /// Scans the document widgets in order and returns information about the first
        /// action area whose action is a PdfGoToAction with a non-null target page.
        /// </summary>
        /// <param name="pdf">Document whose Widgets collection is scanned.</param>
        /// <returns>A LinkInfo for the first matching widget, or null when none matches.</returns>
        private static LinkInfo getFirstLink(PdfDocument pdf)
        {
            for (int position = 0; position < pdf.Widgets.Count; position++)
            {
                PdfActionArea area = pdf.Widgets[position] as PdfActionArea;
                if (area == null)
                    continue;

                PdfGoToAction goTo = area.Action as PdfGoToAction;
                if (goTo == null)
                    continue;

                // links that point to an absent page are ignored
                if (goTo.View.Page == null)
                    continue;

                return new LinkInfo(pdf, area, position);
            }

            return null;
        }

        /// <summary>
        /// Concatenates text chunks from the link's target page, skipping every chunk whose
        /// Position.Y is less than the view's Top minus a small tolerance.
        /// </summary>
        /// <param name="linkAction">Go-to action whose view identifies the target page and top offset.</param>
        /// <returns>
        /// The kept chunks, each followed by a single space; an empty string when the
        /// action has no target page.
        /// </returns>
        private static string getTextFromLink(PdfGoToAction linkAction)
        {
            // small reserve for text start vertical position
            const float tolerance = 5.0f;

            PdfPage destinationPage = linkAction.View.Page;
            if (destinationPage == null)
                return String.Empty;

            StringBuilder collected = new StringBuilder();
            PdfCollection<PdfTextData> pageChunks = destinationPage.Canvas.GetTextData();

            foreach (PdfTextData chunk in pageChunks)
            {
                // A chunk is dropped only when its Y is strictly less than (Top - tolerance).
                bool beforeTarget = chunk.Position.Y < linkAction.View.Top - tolerance;
                if (!beforeTarget)
                    collected.Append(chunk.Text).Append(" ");
            }

            return collected.ToString();
        }
    }
}
Visual Basic 
Imports System
Imports System.Drawing
Imports System.Text
Imports System.Windows.Forms

Imports BitMiracle.Docotic.Pdf

Namespace BitMiracle.Docotic.Samples
    Public NotInheritable Class TextFromLink
        ''' <summary>
        ''' Read-only description of a link: the go-to action behind it, its position
        ''' in the document widgets collection, its bounds, and the indexes of the page
        ''' that hosts the link and of the page the link points to.
        ''' </summary>
        Private Class LinkInfo
            ''' <summary>Go-to action taken from the action area.</summary>
            Public ReadOnly Action As PdfGoToAction

            ''' <summary>Index value supplied by the caller (the sample passes the widget index).</summary>
            Public ReadOnly Index As Integer = -1

            ''' <summary>Bounding box of the action area.</summary>
            Public ReadOnly Bounds As PdfRectangle

            ''' <summary>Result of pdf.IndexOf for the page that owns the action area.</summary>
            Public ReadOnly OwnerPageNumber As Integer = -1

            ''' <summary>Result of pdf.IndexOf for the page referenced by the action's view.</summary>
            Public ReadOnly TargetPageNumber As Integer = -1

            ''' <summary>
            ''' Collects link details from the given action area.
            ''' </summary>
            ''' <param name="pdf">Document used to resolve page indexes.</param>
            ''' <param name="actionArea">Action area whose action is expected to be a PdfGoToAction.</param>
            ''' <param name="linkIndex">Index of the action area, stored as-is.</param>
            ''' <exception cref="ArgumentNullException">pdf or actionArea is Nothing.</exception>
            ''' <exception cref="ArgumentException">The action of actionArea is not a PdfGoToAction.</exception>
            Public Sub New(ByVal pdf As PdfDocument, ByVal actionArea As PdfActionArea, ByVal linkIndex As Integer)
                ' The reported parameter name must match the actual parameter ("pdf"),
                ' otherwise the exception points callers at a non-existent argument.
                If pdf Is Nothing Then
                    Throw New ArgumentNullException("pdf")
                End If

                If actionArea Is Nothing Then
                    Throw New ArgumentNullException("actionArea")
                End If

                Action = TryCast(actionArea.Action, PdfGoToAction)
                If Action Is Nothing Then
                    Throw New ArgumentException("Action area doesn't contain link", "actionArea")
                End If

                Index = linkIndex
                Bounds = actionArea.BoundingBox
                OwnerPageNumber = pdf.IndexOf(actionArea.Owner)

                ' NOTE(review): Action.View.Page is not checked for Nothing here; the sample's
                ' getFirstLink only constructs LinkInfo for links with a target page.
                TargetPageNumber = pdf.IndexOf(Action.View.Page)
            End Sub
        End Class

        ''' <summary>
        ''' Sample entry point. Opens the sample document, looks up the first link with a
        ''' target page, builds a description of that link together with the text extracted
        ''' from its target, then opens the PDF file and shows the description in a message box.
        ''' </summary>
        Public Shared Sub Main()
            ' NOTE: 
            ' When used in trial mode, the library imposes some restrictions.
            ' Please visit http://bitmiracle.com/pdf-library/trial-restrictions.aspx
            ' for more information.

            Const samplePath As String = "Sample Data\Link.pdf"

            Using document As New PdfDocument(samplePath)
                Dim firstLink As LinkInfo = getFirstLink(document)
                If firstLink Is Nothing Then
                    MessageBox.Show("Document doesn't contain links!")
                    Return
                End If

                Dim report As New StringBuilder()

                ' Facts about the link itself.
                report.Append("Link's index in document widgets collection: ").AppendLine(firstLink.Index.ToString())
                report.Append("Number of page with link: ").AppendLine(firstLink.OwnerPageNumber.ToString())
                report.Append("Link bounds: ").AppendLine(firstLink.Bounds.ToString())
                report.Append("Link points to page # ").AppendLine(firstLink.TargetPageNumber.ToString())
                report.AppendLine("NOTE: All page numbers are zero-based. If you use trial version of Docotic.Pdf then even pages of original document are not loaded.")

                ' Text found at the link target.
                report.AppendLine()
                report.AppendLine("Text from link:")
                report.AppendLine(getTextFromLink(firstLink.Action))

                ' Open the source document so the reported values can be compared with it.
                System.Diagnostics.Process.Start(samplePath)
                MessageBox.Show(report.ToString())
            End Using
        End Sub

        ''' <summary>
        ''' Scans the document widgets in order and returns information about the first
        ''' action area whose action is a PdfGoToAction with a target page.
        ''' </summary>
        ''' <param name="pdf">Document whose Widgets collection is scanned.</param>
        ''' <returns>A LinkInfo for the first matching widget, or Nothing when none matches.</returns>
        Private Shared Function getFirstLink(ByVal pdf As PdfDocument) As LinkInfo
            For position As Integer = 0 To pdf.Widgets.Count - 1
                Dim area As PdfActionArea = TryCast(pdf.Widgets(position), PdfActionArea)
                If area Is Nothing Then
                    Continue For
                End If

                Dim goTo_ As PdfGoToAction = TryCast(area.Action, PdfGoToAction)
                If goTo_ Is Nothing Then
                    Continue For
                End If

                ' links that point to an absent page are ignored
                If goTo_.View.Page Is Nothing Then
                    Continue For
                End If

                Return New LinkInfo(pdf, area, position)
            Next

            Return Nothing
        End Function

        ''' <summary>
        ''' Concatenates text chunks from the link's target page, skipping every chunk whose
        ''' Position.Y is less than the view's Top minus a small tolerance.
        ''' </summary>
        ''' <param name="linkAction">Go-to action whose view identifies the target page and top offset.</param>
        ''' <returns>
        ''' The kept chunks, each followed by a single space; an empty string when the
        ''' action has no target page.
        ''' </returns>
        Private Shared Function getTextFromLink(ByVal linkAction As PdfGoToAction) As String
            ' small reserve for text start vertical position
            Const tolerance As Single = 5.0F

            Dim destinationPage As PdfPage = linkAction.View.Page
            If destinationPage Is Nothing Then
                Return String.Empty
            End If

            Dim collected As New StringBuilder()
            Dim pageChunks As PdfCollection(Of PdfTextData) = destinationPage.Canvas.GetTextData()

            For Each chunk As PdfTextData In pageChunks
                ' A chunk is dropped only when its Y is strictly less than (Top - tolerance).
                If chunk.Position.Y < linkAction.View.Top - tolerance Then
                    Continue For
                End If

                collected.Append(chunk.Text).Append(" ")
            Next

            Return collected.ToString()
        End Function
    End Class
End Namespace