This sample shows how to retrieve information about a link and extract text starting from the link target.
Links in a PDF document are represented as PdfActionArea objects. A PdfActionArea object can be used to get the page that hosts the action area and the bounds of the area. PdfActionArea also has the Action property. For links, this property contains an instance of the PdfGoToAction class. A PdfGoToAction object gives access to the page the link points to.
In order to extract text from the link's target page, we get the top offset on the target page and then retrieve all text from the target page with coordinates below the top offset.
Please note that the trial version of Docotic.Pdf loads only odd pages when a document is opened, so you may find that some expected links and pages are missing. The licensed version doesn't have this limitation and loads all pages and links.
using System;
using System.Drawing;
using System.Text;
using System.Windows.Forms;

namespace BitMiracle.Docotic.Pdf.Samples
{
    /// <summary>
    /// Finds the first link (an action area with a go-to action) in a PDF document,
    /// shows information about it and the text found at the link target.
    /// </summary>
    public static class TextFromLink
    {
        /// <summary>
        /// Immutable snapshot of the data this sample reports about a link.
        /// </summary>
        private class LinkInfo
        {
            /// <summary>Go-to action taken from the action area.</summary>
            public readonly PdfGoToAction Action;

            /// <summary>Index value passed to the constructor by the caller.</summary>
            public readonly int Index = -1;

            /// <summary>Bounding box of the action area.</summary>
            public readonly PdfRectangle Bounds;

            /// <summary>Result of <c>pdf.IndexOf</c> for the owner of the action area.</summary>
            public readonly int OwnerPageNumber = -1;

            /// <summary>Result of <c>pdf.IndexOf</c> for the page referenced by the action's view.</summary>
            public readonly int TargetPageNumber = -1;

            /// <summary>
            /// Collects link information from the given action area.
            /// </summary>
            /// <param name="pdf">Document used to look up page indexes.</param>
            /// <param name="actionArea">Action area whose action must be a <c>PdfGoToAction</c>.</param>
            /// <param name="index">Index of the action area, as supplied by the caller.</param>
            /// <exception cref="ArgumentNullException">
            /// <paramref name="pdf"/> or <paramref name="actionArea"/> is null.
            /// </exception>
            /// <exception cref="ArgumentException">
            /// The action of <paramref name="actionArea"/> is not a <c>PdfGoToAction</c>.
            /// </exception>
            public LinkInfo(PdfDocument pdf, PdfActionArea actionArea, int index)
            {
                // The reported parameter name must match the actual parameter.
                if (pdf == null)
                    throw new ArgumentNullException("pdf");

                if (actionArea == null)
                    throw new ArgumentNullException("actionArea");

                Action = actionArea.Action as PdfGoToAction;
                if (Action == null)
                    throw new ArgumentException("Action area doesn't contain link", "actionArea");

                Index = index;
                Bounds = actionArea.BoundingBox;
                OwnerPageNumber = pdf.IndexOf(actionArea.Owner);
                TargetPageNumber = pdf.IndexOf(Action.View.Page);
            }
        }

        /// <summary>
        /// Opens the sample document, describes its first link in a message box
        /// and launches the document itself.
        /// </summary>
        public static void Main()
        {
            // NOTE:
            // When used in trial mode, the library imposes some restrictions.
            // Please visit http://bitmiracle.com/pdf-library/trial-restrictions.aspx
            // for more information.

            using (PdfDocument pdf = new PdfDocument(@"Sample Data\Link.pdf"))
            {
                LinkInfo linkInfo = getFirstLink(pdf);
                if (linkInfo == null)
                {
                    MessageBox.Show("Document doesn't contain links!");
                    return;
                }

                StringBuilder linkDescription = new StringBuilder();
                linkDescription.AppendLine("Link's index in document widgets collection: " + linkInfo.Index);
                linkDescription.AppendLine("Number of page with link: " + linkInfo.OwnerPageNumber);
                linkDescription.AppendLine("Link bounds: " + linkInfo.Bounds.ToString());
                linkDescription.AppendLine("Link points to page # " + linkInfo.TargetPageNumber);

                // Two AppendLine calls instead of one literal containing a raw line break:
                // a regular string literal cannot span source lines.
                linkDescription.AppendLine("NOTE: All page numbers are zero-based. ");
                linkDescription.AppendLine("If you use trial version of Docotic.Pdf then even pages of original document are not loaded.");
                linkDescription.AppendLine();
                linkDescription.AppendLine("Text from link:");
                linkDescription.AppendLine(getTextFromLink(linkInfo.Action));

                System.Diagnostics.Process.Start(@"Sample Data\Link.pdf");

                MessageBox.Show(linkDescription.ToString());
            }
        }

        /// <summary>
        /// Scans the document widgets in order and returns information about the first
        /// action area whose action is a <c>PdfGoToAction</c> with a non-null target page.
        /// </summary>
        /// <param name="pdf">Document whose widgets are scanned.</param>
        /// <returns>Information about the first matching link, or null when there is none.</returns>
        private static LinkInfo getFirstLink(PdfDocument pdf)
        {
            for (int i = 0; i < pdf.Widgets.Count; ++i)
            {
                PdfActionArea actionArea = pdf.Widgets[i] as PdfActionArea;
                if (actionArea == null)
                    continue;

                PdfGoToAction linkAction = actionArea.Action as PdfGoToAction;
                if (linkAction == null)
                    continue;

                // lets ignore links which point to an absent page
                if (linkAction.View.Page != null)
                    return new LinkInfo(pdf, actionArea, i);
            }

            return null;
        }

        /// <summary>
        /// Concatenates the text pieces of the link's target page, skipping every piece
        /// whose Y position is less than the view's top offset minus a small tolerance.
        /// </summary>
        /// <param name="linkAction">Go-to action that identifies the target page and top offset.</param>
        /// <returns>
        /// The collected text, each piece followed by a single space;
        /// an empty string when the action has no target page.
        /// </returns>
        private static string getTextFromLink(PdfGoToAction linkAction)
        {
            PdfPage targetPage = linkAction.View.Page;
            if (targetPage == null)
                return String.Empty;

            StringBuilder result = new StringBuilder();

            const float eps = 5.0f; // small reserve for text start vertical position
            PdfCollection<PdfTextData> textFromTargetPage = targetPage.Canvas.GetTextData();
            foreach (PdfTextData textData in textFromTargetPage)
            {
                if (textData.Position.Y < linkAction.View.Top - eps)
                    continue;

                // Chained appends avoid building a temporary concatenated string.
                result.Append(textData.Text).Append(" ");
            }

            return result.ToString();
        }
    }
}
Imports System
Imports System.Drawing
Imports System.Text
Imports System.Windows.Forms

Imports BitMiracle.Docotic.Pdf

Namespace BitMiracle.Docotic.Samples
    ''' <summary>
    ''' Finds the first link (an action area with a go-to action) in a PDF document,
    ''' shows information about it and the text found at the link target.
    ''' </summary>
    Public NotInheritable Class TextFromLink

        ''' <summary>
        ''' Immutable snapshot of the data this sample reports about a link.
        ''' </summary>
        Private Class LinkInfo
            ''' <summary>Go-to action taken from the action area.</summary>
            Public ReadOnly Action As PdfGoToAction

            ''' <summary>Index value passed to the constructor by the caller.</summary>
            Public ReadOnly Index As Integer = -1

            ''' <summary>Bounding box of the action area.</summary>
            Public ReadOnly Bounds As PdfRectangle

            ''' <summary>Result of pdf.IndexOf for the owner of the action area.</summary>
            Public ReadOnly OwnerPageNumber As Integer = -1

            ''' <summary>Result of pdf.IndexOf for the page referenced by the action's view.</summary>
            Public ReadOnly TargetPageNumber As Integer = -1

            ''' <summary>
            ''' Collects link information from the given action area.
            ''' </summary>
            ''' <param name="pdf">Document used to look up page indexes.</param>
            ''' <param name="actionArea">Action area whose action must be a PdfGoToAction.</param>
            ''' <param name="linkIndex">Index of the action area, as supplied by the caller.</param>
            ''' <exception cref="ArgumentNullException">pdf or actionArea is Nothing.</exception>
            ''' <exception cref="ArgumentException">The action of actionArea is not a PdfGoToAction.</exception>
            Public Sub New(ByVal pdf As PdfDocument, ByVal actionArea As PdfActionArea, ByVal linkIndex As Integer)
                ' The reported parameter name must match the actual parameter.
                If pdf Is Nothing Then
                    Throw New ArgumentNullException("pdf")
                End If

                If actionArea Is Nothing Then
                    Throw New ArgumentNullException("actionArea")
                End If

                Action = TryCast(actionArea.Action, PdfGoToAction)
                If Action Is Nothing Then
                    Throw New ArgumentException("Action area doesn't contain link", "actionArea")
                End If

                Index = linkIndex
                Bounds = actionArea.BoundingBox
                OwnerPageNumber = pdf.IndexOf(actionArea.Owner)
                TargetPageNumber = pdf.IndexOf(Action.View.Page)
            End Sub
        End Class

        ''' <summary>
        ''' Opens the sample document, describes its first link in a message box
        ''' and launches the document itself.
        ''' </summary>
        Public Shared Sub Main()
            ' NOTE: 
            ' When used in trial mode, the library imposes some restrictions.
            ' Please visit http://bitmiracle.com/pdf-library/trial-restrictions.aspx
            ' for more information.

            Using pdf As New PdfDocument("Sample Data\Link.pdf")
                Dim linkInfo As LinkInfo = getFirstLink(pdf)
                If linkInfo Is Nothing Then
                    MessageBox.Show("Document doesn't contain links!")
                    Return
                End If

                Dim linkDescription As New StringBuilder()
                linkDescription.AppendLine("Link's index in document widgets collection: " + linkInfo.Index.ToString())
                linkDescription.AppendLine("Number of page with link: " + linkInfo.OwnerPageNumber.ToString())
                linkDescription.AppendLine("Link bounds: " + linkInfo.Bounds.ToString())
                linkDescription.AppendLine("Link points to page # " + linkInfo.TargetPageNumber.ToString())

                ' Two AppendLine calls instead of one literal containing a raw line break,
                ' which older Visual Basic compilers do not accept.
                linkDescription.AppendLine("NOTE: All page numbers are zero-based. ")
                linkDescription.AppendLine("If you use trial version of Docotic.Pdf then even pages of original document are not loaded.")
                linkDescription.AppendLine()
                linkDescription.AppendLine("Text from link:")
                linkDescription.AppendLine(getTextFromLink(linkInfo.Action))

                System.Diagnostics.Process.Start("Sample Data\Link.pdf")

                MessageBox.Show(linkDescription.ToString())
            End Using
        End Sub

        ''' <summary>
        ''' Scans the document widgets in order and returns information about the first
        ''' action area whose action is a PdfGoToAction with a non-null target page.
        ''' </summary>
        ''' <param name="pdf">Document whose widgets are scanned.</param>
        ''' <returns>Information about the first matching link, or Nothing when there is none.</returns>
        Private Shared Function getFirstLink(ByVal pdf As PdfDocument) As LinkInfo
            For i As Integer = 0 To pdf.Widgets.Count - 1
                Dim actionArea As PdfActionArea = TryCast(pdf.Widgets(i), PdfActionArea)
                If actionArea Is Nothing Then
                    Continue For
                End If

                Dim linkAction As PdfGoToAction = TryCast(actionArea.Action, PdfGoToAction)
                If linkAction Is Nothing Then
                    Continue For
                End If

                ' lets ignore links which point to an absent page
                If linkAction.View.Page IsNot Nothing Then
                    Return New LinkInfo(pdf, actionArea, i)
                End If
            Next

            Return Nothing
        End Function

        ''' <summary>
        ''' Concatenates the text pieces of the link's target page, skipping every piece
        ''' whose Y position is less than the view's top offset minus a small tolerance.
        ''' </summary>
        ''' <param name="linkAction">Go-to action that identifies the target page and top offset.</param>
        ''' <returns>
        ''' The collected text, each piece followed by a single space;
        ''' an empty string when the action has no target page.
        ''' </returns>
        Private Shared Function getTextFromLink(ByVal linkAction As PdfGoToAction) As String
            Dim targetPage As PdfPage = linkAction.View.Page
            If targetPage Is Nothing Then
                Return [String].Empty
            End If

            Dim result As New StringBuilder()

            Const eps As Single = 5.0F ' small reserve for text start vertical position
            Dim textFromTargetPage As PdfCollection(Of PdfTextData) = targetPage.Canvas.GetTextData()
            For Each textData As PdfTextData In textFromTargetPage
                If textData.Position.Y < linkAction.View.Top - eps Then
                    Continue For
                End If

                ' Chained appends avoid building a temporary concatenated string.
                result.Append(textData.Text).Append(" ")
            Next

            Return result.ToString()
        End Function
    End Class
End Namespace