My problem is that I am parsing PDFs to extract information with PDFMiner, and it works for most forms. However, there are other PDFs that won't open unless you use Adobe Acrobat. In any other viewer, the document only displays this message:
The document you are trying to load requires Adobe Reader 8 or higher. You may not have the Adobe Reader installed or your viewing environment may not be properly configured to use Adobe Reader.
For information on how to install Adobe Reader and configure your viewing environment please see http://www.adobe.com/go/pdf_forms_configure.
This is fine when you just want to look at the document: you download Adobe Acrobat and open the file with that. But when using PDFMiner, I can't figure out how to parse the text, because PDFMiner doesn't use Adobe Acrobat to open the files.
Sample PDF: http://www.forms.ssb.gov.on.ca/mbs/ssb/forms/ssbforms.nsf/GetFileAttach/012-8551E~1/$File/8551E.pdf
Sample Code:
def extract_text_by_page(pdf_path):
    """Yield the extracted text of a PDF one page at a time.

    Args:
        pdf_path: Path of the PDF file to open in binary mode.

    Yields:
        The contents of the in-memory buffer the text converter wrote for
        the current page (``bytes``, because the buffer is a ``BytesIO``).
    """
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            resource_manager = PDFResourceManager()
            fake_file_handle = BytesIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            try:
                page_interpreter = PDFPageInterpreter(resource_manager,
                                                      converter)
                page_interpreter.process_page(page)
                yield fake_file_handle.getvalue()
            finally:
                # Cleanup placed after a bare ``yield`` is skipped when the
                # consumer stops iterating early or page processing raises;
                # ``finally`` guarantees the handles are released either way.
                converter.close()
                fake_file_handle.close()
def extract_text(pdf_path):
    """Dump a PDF's text to ``PDFText.txt`` and count selected keywords.

    Every page produced by ``extract_text_by_page`` is printed and appended
    to ``PDFText.txt`` (pages separated by a blank line). The dump is then
    read back and searched for each keyword.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        dict mapping each keyword ("required", "shall", "must", "Name") to
        the number of case-insensitive, whole-word matches in the dump.
    """
    wordbank = {"required", "shall", "must", "Name"}

    with open("PDFText.txt", "wb") as text_file:
        for page in extract_text_by_page(pdf_path):
            print(page)
            print()
            text_file.write(page)
            text_file.write(bytes("\n\n", 'utf-8'))

    # Read the dump only after the writer has been closed, so all pages are
    # flushed to disk. The page separator is written as UTF-8; the page
    # bytes are assumed to be UTF-8 as well (pdfminer's default codec --
    # TODO confirm if a codec is ever passed to the converter). Undecodable
    # bytes are replaced rather than aborting the count.
    with open("PDFText.txt", encoding="utf-8", errors="replace") as dump:
        contents = dump.read()

    wordcount = {}
    for word in wordbank:
        # The flag is passed as an argument: an inline ``(?i)`` that is not
        # at the very start of the pattern is rejected by newer Python
        # versions. ``re.escape`` keeps keywords literal even if one ever
        # contains a regex metacharacter.
        pattern = r"\b(" + re.escape(word) + r")\b"
        wordcount[word] = len(re.findall(pattern, contents, re.IGNORECASE))
    return wordcount
def main():
    """Parse the command line and print the keyword counts for one PDF.

    The file name comes from the ``file`` attribute of the parsed
    arguments and is looked up under the ``Documents/`` directory.
    """
    cli_options = parse_cli()
    pdf_location = "Documents/" + cli_options.file
    # Compute the counts first, then report them.
    keyword_totals = extract_text(pdf_location)
    print(keyword_totals)
# Run the word count only when executed as a script, not when imported.
if __name__ == "__main__":
    main()