I'm using a tutorial to create a corpus of pdf files. I have the following code:
import nltk
import PyPDF2
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from PyPDF2 import PdfFileReader
def getTextPDF(pdfFileName):
pdf_file = open(pdfFileName, 'rb')
readpdf = PdfFileReader(pdf_file)
text = []
for i in range(0,readpdf.getNumPages()):
text.append(readpdf.getPage(i).extractText())
return '\n'.join(text)
corpusDir = 'reports/'
jun15 = getTextPDF('reports/June2015.pdf')
dec15 = getTextPDF('reports/December2015.pdf')
jun16 = getTextPDF('reports/June2016.pdf')
dec16 = getTextPDF('reports/December2016.pdf')
jun17 = getTextPDF('reports/June2017.pdf')
dec17 = getTextPDF('reports/December2017.pdf')
files = [jun15,dec15,jun16,dec16,jun17,dec17]
for idx, f in enumerate(files):
with open (corpusDir+str(idx)+'.txt','w') as output:
output.write(f)
corpus = PlaintextCorpusReader(corpusDir, '.*')
print (corpus.words())
UnicodeDecodeError Traceback (most recent call last) in () ----> 1 print (corpus.words())
/anaconda3/lib/python3.6/site-packages/nltk/collections.py in repr(self) 224 pieces = [] 225 length = 5 --> 226 for elt in self: 227 pieces.append(repr(elt)) 228 length += len(pieces[-1]) + 2
/anaconda3/lib/python3.6/site-packages/nltk/corpus/reader/util.py in iterate_from(self, start_tok) 400 401 # Get everything we can from this piece. --> 402 for tok in piece.iterate_from(max(0, start_tok-offset)): 403 yield tok 404
/anaconda3/lib/python3.6/site-packages/nltk/corpus/reader/util.py in iterate_from(self, start_tok) 294 self._current_toknum = toknum 295 self._current_blocknum = block_index --> 296 tokens = self.read_block(self._stream) 297 assert isinstance(tokens, (tuple, list, AbstractLazySequence)), ( 298 'block reader %s() should return list or tuple.' %
/anaconda3/lib/python3.6/site-packages/nltk/corpus/reader/plaintext.py in _read_word_block(self, stream) 120 words = [] 121 for i in range(20): # Read 20 lines at a time. --> 122 words.extend(self._word_tokenizer.tokenize(stream.readline())) 123 return words 124
/anaconda3/lib/python3.6/site-packages/nltk/data.py in readline(self, size) 1166 while True: 1167 startpos = self.stream.tell() - len(self.bytebuffer) -> 1168 new_chars = self._read(readsize) 1169 1170 # If we're at a '\r', then read one extra character, since
/anaconda3/lib/python3.6/site-packages/nltk/data.py in _read(self, size) 1398 1399 # Decode the bytes into unicode characters -> 1400 chars, bytes_decoded = self._incr_decode(bytes) 1401 1402 # If we got bytes but couldn't decode any, then read further.
/anaconda3/lib/python3.6/site-packages/nltk/data.py in _incr_decode(self, bytes) 1429 while True: 1430 try: -> 1431 return self.decode(bytes, 'strict') 1432 except UnicodeDecodeError as exc: 1433 # If the exception occurs at the end of the string,
/anaconda3/lib/python3.6/encodings/utf_8.py in decode(input, errors) 14 15 def decode(input, errors='strict'): ---> 16 return codecs.utf_8_decode(input, errors, True) 17 18 class IncrementalEncoder(codecs.IncrementalEncoder):
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 395: invalid start byte
I've been looking at different posts, but I still can't tell if the problem is that I'm using the wrong methods or that I have to encode or decode something. Ifd it's the latter I don't know where. Any ideas would be appreciated.