I am unable to extract content from the image attached below when it is in PDF form; however, extraction works fine when I convert it to JPG format. My problem is that I have a large number of scanned PDFs, each containing multiple scanned pages. I want to know whether there is a direct way to extract the content, without the overhead of converting the PDFs to JPGs and then extracting the text. I followed the solution provided at link
The PDF version of the document is pdfversion
My environment: Java version "1.8.0_112", tesseract 3.04.01, leptonica-1.74.1, libjpeg 8d : libpng 1.6.28 : libtiff 4.0.7 : zlib 1.2.8
pom.xml has
<dependencies>
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>3.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.14</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.14</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.github.jai-imageio/jai-imageio-core -->
<dependency>
<groupId>com.github.jai-imageio</groupId>
<artifactId>jai-imageio-core</artifactId>
<version>1.3.1</version>
</dependency>
<dependency>
<groupId>net.java.dev.jna</groupId>
<artifactId>jna</artifactId>
<version>4.2.2</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.11</version>
</dependency>
<dependency>
<groupId>com.levigo.jbig2</groupId>
<artifactId>levigo-jbig2-imageio</artifactId>
<version>1.6.5</version>
</dependency>
</dependencies>
java code
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
/**
 * Minimal Tika example that parses one PDF file with OCR-related configuration
 * registered in the parse context, then prints the resulting metadata and text.
 */
public class Sample {
    /**
     * Parses {@code path2pdf.pdf} with an {@link AutoDetectParser} and prints the
     * collected metadata followed by the extracted body text to standard output.
     *
     * @param args command-line arguments; not used
     * @throws IOException if the PDF file cannot be opened or read
     * @throws TikaException if raised by the parser while processing the document
     * @throws SAXException if raised while emitting content to the handler
     */
    public static void main(String[] args)
            throws IOException, TikaException, SAXException {
        Parser parser = new AutoDetectParser();
        // Integer.MAX_VALUE is passed as the handler's limit argument so large
        // documents are not cut short (see BodyContentHandler docs to confirm).
        BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);

        // Point Tika at the directory holding the tesseract executable.
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.setTesseractPath("/usr/local/bin/");

        // Request extraction of images embedded in the PDF, including repeated ones.
        PDFParserConfig pdfConfig = new PDFParserConfig();
        pdfConfig.setExtractInlineImages(true);
        pdfConfig.setExtractUniqueInlineImagesOnly(false);

        ParseContext parseContext = new ParseContext();
        parseContext.set(TesseractOCRConfig.class, config);
        parseContext.set(PDFParserConfig.class, pdfConfig);
        // Registers the parser itself in the context; presumably this is what lets
        // embedded resources (the page images) be parsed in turn - verify in Tika docs.
        parseContext.set(Parser.class, parser);

        Metadata metadata = new Metadata();
        // try-with-resources closes the stream even when parsing throws,
        // so the file handle is never leaked.
        try (FileInputStream stream = new FileInputStream(new File("path2pdf.pdf"))) {
            parser.parse(stream, handler, metadata, parseContext);
        }

        System.out.println(metadata);
        String content = handler.toString();
        System.out.println("===============");
        System.out.println(content);
        System.out.println("Done");
    }
}
But this does not work. Please advise if I am doing something wrong here.