search PDF text, highlight found words by drawing rectangle after getting their coordinates save PDF with text highlighted

Question

Can anyone help with how to get the coordinates of text in a PDF? Is this possible? I want a Windows Forms app where the user types a word in a text box, and the app reads an existing PDF using iTextSharp, highlights the matched words if found, and saves the PDF with the highlighted text. So far I have almost everything done, including the drawing of a yellow rectangle, but what is lacking is how to get the text coordinates of the matched patterns in order to highlight them. Thanks in advance. (By the way: sb is the search text box, and tb is a rich text box where the PDF text is displayed.)

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using iTextSharp.text;
using System.Text.RegularExpressions;

namespace manipulatePDF
{
    /// <summary>
    /// Main form. Opens a PDF and shows its extracted text in the rich text box <c>tb</c>;
    /// on save, highlights occurrences of the search term from <c>sb</c> in <c>tb</c> and writes
    /// a copy of the PDF with a highlight marker on each page that contains the term.
    /// </summary>
    public partial class Form1 : Form
    {
        // Fill opacity of the highlight marker. PDF opacity is expressed in the range 0..1.
        private const float HighlightOpacity = 0.3f;

        // Path of the PDF chosen in open_Click; null until a file has been opened.
        string oldFile;

        // Text extracted from all pages of the opened PDF, in page order.
        StringBuilder text = new StringBuilder();

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Lets the user pick a PDF, extracts the text of every page and shows it in <c>tb</c>.
        /// </summary>
        private void open_Click(object sender, EventArgs e)
        {
            reset_Click(sender, e);

            openFileDialog1.Filter = "PDF Files (.pdf)|*.pdf";
            openFileDialog1.FilterIndex = 1;

            if (openFileDialog1.ShowDialog() != System.Windows.Forms.DialogResult.OK)
            {
                return;
            }

            label1.Text = "File Location: " + openFileDialog1.FileName;
            oldFile = openFileDialog1.FileName;

            PdfReader reader = new PdfReader(oldFile);
            try
            {
                for (int cPage = 1; cPage <= reader.NumberOfPages; cPage++)
                {
                    text.Append(ExtractPageText(reader, cPage));
                }
            }
            finally
            {
                // Close exactly once, after all pages were read (also when extraction throws).
                reader.Close();
            }

            tb.Text = text.ToString();
        }

        /// <summary>
        /// Highlights the search term in <c>tb</c>, then writes a copy of the opened PDF to the
        /// path chosen by the user, adding a highlight marker to every page containing the term.
        /// </summary>
        /// <remarks>
        /// "Contain" mode is a case-sensitive substring search. "Exact" mode is a whole-word,
        /// case-insensitive search. When neither option is checked, "contain" mode is used.
        /// </remarks>
        private void save_Click(object sender, EventArgs e)
        {
            if (string.IsNullOrEmpty(oldFile))
            {
                // Nothing to copy: PdfReader would be handed a null path.
                MessageBox.Show("Open a PDF file before saving.");
                return;
            }

            saveFileDialog1.InitialDirectory = "C: ";
            saveFileDialog1.Title = "Save the PDF File";
            saveFileDialog1.Filter = "PDF files (*.pdf)|*.pdf";

            if (saveFileDialog1.ShowDialog() != System.Windows.Forms.DialogResult.OK)
            {
                return;
            }

            string textToSearch = sb.Text;
            // "contain" wins when both are checked, as in the original if / else-if chain.
            bool wholeWord = exact.Checked && !contain.Checked;

            if (!string.IsNullOrEmpty(textToSearch))
            {
                HighlightInTextBox(
                    textToSearch,
                    wholeWord ? RichTextBoxFinds.WholeWord : RichTextBoxFinds.MatchCase);
            }

            int count = 0; // occurrences of the search term over all pages
            int pageCount;

            PdfReader reader = new PdfReader(oldFile);
            try
            {
                pageCount = reader.NumberOfPages;

                using (FileStream fs = new FileStream(saveFileDialog1.FileName, FileMode.Create, FileAccess.Write))
                {
                    // A new Document per save: a Document cannot be written to again once closed.
                    Document document = new Document(reader.GetPageSizeWithRotation(1));
                    PdfWriter writer = PdfWriter.GetInstance(document, fs);
                    document.Open();
                    try
                    {
                        PdfContentByte cb = writer.DirectContent;

                        for (int cPage = 1; cPage <= pageCount; cPage++)
                        {
                            if (cPage > 1)
                            {
                                // The size must be set before the page it applies to is started.
                                document.SetPageSize(reader.GetPageSizeWithRotation(cPage));
                                document.NewPage();
                            }

                            // Copy the original page content first so the marker is painted above it.
                            PdfImportedPage page = writer.GetImportedPage(reader, cPage);
                            cb.AddTemplate(page, 0, 0);

                            int pageMatches = CountMatches(ExtractPageText(reader, cPage), textToSearch, wholeWord);
                            if (pageMatches > 0)
                            {
                                count += pageMatches;
                                DrawHighlight(cb, document.PageSize);
                            }
                        }
                    }
                    finally
                    {
                        // Closing the document also finishes the writer; no separate writer.Close() is needed.
                        document.Close();
                    }
                }
            }
            finally
            {
                reader.Close();
            }

            if (count != 0)
            {
                label2.Text = "Number of pages: " + pageCount + " - " + textToSearch + " found " + count + " times. \n";
            }
        }

        /// <summary>
        /// Clears the displayed text and the extracted-text buffer.
        /// </summary>
        private void reset_Click(object sender, EventArgs e)
        {
            tb.Text = "";
            // Without this, opening a second file would append to the previous file's text.
            text.Length = 0;
        }

        /// <summary>
        /// Returns the text of one page. The extractor already returns a .NET string, so no
        /// code-page round trip is applied (that would replace non-ANSI characters with '?').
        /// </summary>
        /// <param name="reader">Open reader for the source PDF.</param>
        /// <param name="pageNumber">1-based page number.</param>
        private static string ExtractPageText(PdfReader reader, int pageNumber)
        {
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            return PdfTextExtractor.GetTextFromPage(reader, pageNumber, strategy);
        }

        /// <summary>
        /// Gives every occurrence of <paramref name="term"/> in <c>tb</c> a gold background.
        /// </summary>
        /// <param name="term">Non-empty text to look for.</param>
        /// <param name="options">Search options passed to <see cref="RichTextBox.Find(string, int, int, RichTextBoxFinds)"/>.</param>
        private void HighlightInTextBox(string term, RichTextBoxFinds options)
        {
            int start = 0;
            while (start < tb.TextLength)
            {
                // Find selects the match and returns its index, or a negative value when there is none.
                int found = tb.Find(term, start, tb.TextLength, options);
                if (found < 0)
                {
                    break;
                }

                tb.SelectionBackColor = Color.Gold;
                start = found + 1; // advancing from the found index guarantees termination
            }
        }

        /// <summary>
        /// Counts occurrences of <paramref name="term"/> in <paramref name="pageText"/>.
        /// </summary>
        /// <param name="pageText">Text to search; null or empty yields 0.</param>
        /// <param name="term">Text to look for; null or empty yields 0.</param>
        /// <param name="wholeWord">
        /// When true, counts case-insensitive matches that are not adjacent to a word character;
        /// when false, counts case-sensitive (current culture) substring matches, including
        /// overlapping ones.
        /// </param>
        /// <returns>The number of matches.</returns>
        private static int CountMatches(string pageText, string term, bool wholeWord)
        {
            if (string.IsNullOrEmpty(pageText) || string.IsNullOrEmpty(term))
            {
                return 0;
            }

            if (wholeWord)
            {
                // NOTE(review): RichTextBox's own notion of a word boundary may differ slightly from \w.
                string pattern = @"(?<!\w)" + Regex.Escape(term) + @"(?!\w)";
                return Regex.Matches(pageText, pattern, RegexOptions.IgnoreCase).Count;
            }

            int matches = 0;
            int index = pageText.IndexOf(term, 0, StringComparison.CurrentCulture);
            while (index != -1)
            {
                matches++;
                if (index + 1 > pageText.Length)
                {
                    break; // a start index past the end would make IndexOf throw
                }

                index = pageText.IndexOf(term, index + 1, StringComparison.CurrentCulture);
            }

            return matches;
        }

        /// <summary>
        /// Paints one semi-transparent yellow rectangle on the current page.
        /// </summary>
        /// <param name="cb">Content layer of the page being written.</param>
        /// <param name="pageSize">Size of the current page.</param>
        private static void DrawHighlight(PdfContentByte cb, iTextSharp.text.Rectangle pageSize)
        {
            PdfGState graphicsState = new PdfGState();
            graphicsState.FillOpacity = HighlightOpacity;

            // Save/restore keeps the opacity and fill colour from leaking into other page content.
            cb.SaveState();
            cb.SetGState(graphicsState);
            cb.SetColorFill(new CMYKColor(0f, 0f, 1f, 0f));
            // NOTE(review): fixed placeholder position - the coordinates of the matched text are not
            // computed yet; that needs a custom render listener that records chunk locations.
            cb.Rectangle(pageSize.Width - 500f, 600f, 100f, 100f);
            cb.Fill();
            cb.RestoreState();
        }
    }

See my answer here and follow the links in that post. I would recommend always searching word-by-word, as multiple words might span multiple lines, which could get ugly. stackoverflow.com/a/6527010/231316 — Chris Haas

Jcis Jcis · Accepted Answer · 2012-07-30T07:23:22

Well, I have added a downloadable example made using VB.NET 2010 that does exactly what you need; it's available in another post in the same thread Chris referenced. That code will work for every font type and font size, and will return all matches for the word/sentence you search for, returning each match as a rectangle with x/y locations to the UI, and finally highlighting them all and saving to a new PDF. You just need to give some initial parameters such as the search term, comparison type by culture, source PDF path and destination PDF path. The only thing not implemented is the particular case where the search word/sentence spans multiple lines, but it should be an easy change in the code since you can use the SameLine() method in the TextChunk class.

search PDF text, highlight found words by drawing rectangle after getting their coordinates save PDF with text highlighted

1 Answer