Python PYPDF2 : 'utf-8' codec can't decode byte 0x80 in position 395: invalid start byte

Question

I'm using a tutorial to create a corpus of pdf files. I have the following code:

import nltk
import PyPDF2
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from PyPDF2 import PdfFileReader

def getTextPDF(pdfFileName):
pdf_file = open(pdfFileName, 'rb')
readpdf = PdfFileReader(pdf_file)
text = []
for i in range(0,readpdf.getNumPages()):
    text.append(readpdf.getPage(i).extractText())
return '\n'.join(text)

corpusDir = 'reports/'

jun15 = getTextPDF('reports/June2015.pdf')
dec15 = getTextPDF('reports/December2015.pdf')
jun16 = getTextPDF('reports/June2016.pdf')
dec16 = getTextPDF('reports/December2016.pdf')
jun17 = getTextPDF('reports/June2017.pdf')
dec17 = getTextPDF('reports/December2017.pdf')

files = [jun15,dec15,jun16,dec16,jun17,dec17]
for idx, f in enumerate(files):
    with open (corpusDir+str(idx)+'.txt','w') as output:
        output.write(f)

corpus = PlaintextCorpusReader(corpusDir, '.*')

print (corpus.words())

UnicodeDecodeError Traceback (most recent call last) in () ----> 1 print (corpus.words())

/anaconda3/lib/python3.6/site-packages/nltk/collections.py in repr(self) 224 pieces = [] 225 length = 5 --> 226 for elt in self: 227 pieces.append(repr(elt)) 228 length += len(pieces[-1]) + 2

/anaconda3/lib/python3.6/site-packages/nltk/corpus/reader/util.py in iterate_from(self, start_tok) 400 401 # Get everything we can from this piece. --> 402 for tok in piece.iterate_from(max(0, start_tok-offset)): 403 yield tok 404

/anaconda3/lib/python3.6/site-packages/nltk/corpus/reader/util.py in iterate_from(self, start_tok) 294 self._current_toknum = toknum 295 self._current_blocknum = block_index --> 296 tokens = self.read_block(self._stream) 297 assert isinstance(tokens, (tuple, list, AbstractLazySequence)), ( 298 'block reader %s() should return list or tuple.' %

/anaconda3/lib/python3.6/site-packages/nltk/corpus/reader/plaintext.py in _read_word_block(self, stream) 120 words = [] 121 for i in range(20): # Read 20 lines at a time. --> 122 words.extend(self._word_tokenizer.tokenize(stream.readline())) 123 return words 124

/anaconda3/lib/python3.6/site-packages/nltk/data.py in readline(self, size) 1166 while True: 1167 startpos = self.stream.tell() - len(self.bytebuffer) -> 1168 new_chars = self._read(readsize) 1169 1170 # If we're at a '\r', then read one extra character, since

/anaconda3/lib/python3.6/site-packages/nltk/data.py in _read(self, size) 1398 1399 # Decode the bytes into unicode characters -> 1400 chars, bytes_decoded = self._incr_decode(bytes) 1401 1402 # If we got bytes but couldn't decode any, then read further.

/anaconda3/lib/python3.6/site-packages/nltk/data.py in _incr_decode(self, bytes) 1429 while True: 1430 try: -> 1431 return self.decode(bytes, 'strict') 1432 except UnicodeDecodeError as exc: 1433 # If the exception occurs at the end of the string,

/anaconda3/lib/python3.6/encodings/utf_8.py in decode(input, errors) 14 15 def decode(input, errors='strict'): ---> 16 return codecs.utf_8_decode(input, errors, True) 17 18 class IncrementalEncoder(codecs.IncrementalEncoder):

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 395: invalid start byte

I've been looking at different posts, but I still can't tell if the problem is that I'm using the wrong methods or that I have to encode or decode something. Ifd it's the latter I don't know where. Any ideas would be appreciated.

dahrs dahrs · Accepted Answer · 2018-05-31T16:53:56

It would be best to see the whole error message but I'm guessing you are using python 2 and your reports have some utf-8 in them. First of, try to specify the encoding at the beginning and when you open your files:

#!/usr/bin/python
#-*- coding:utf-8 -*- 
import nltk
import PyPDF2
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from PyPDF2 import PdfFileReader
import codecs
def getTextPDF(pdfFileName):
    pdf_file = codecs.open(pdfFileName, 'rb', encoding='utf8')
    readpdf = PdfFileReader(pdf_file)
    text = []
    for i in range(0,readpdf.getNumPages()):
        text.append(readpdf.getPage(i).extractText())
    return '\n'.join(text)

corpusDir = 'reports/'

jun15 = getTextPDF('reports/June2015.pdf')
dec15 = getTextPDF('reports/December2015.pdf')
jun16 = getTextPDF('reports/June2016.pdf')
dec16 = getTextPDF('reports/December2016.pdf')
jun17 = getTextPDF('reports/June2017.pdf')
dec17 = getTextPDF('reports/December2017.pdf')

files = [jun15,dec15,jun16,dec16,jun17,dec17]
for idx, f in enumerate(files):
    with codecs.open(corpusDir+str(idx)+'.txt','w', encoding='utf8') as output:
        output.write(f)

corpus = PlaintextCorpusReader(corpusDir, '.*')

print (corpus.words())

if that doen't work, you can try bodging your strings, but it's not ideal:

def toUtf8(stringOrUnicode):
    '''
    Returns the argument in utf-8 encoding
    '''
    typeArg = type(stringOrUnicode)
    if typeArg is unicode:
        return stringOrUnicode.encode('utf8').decode('utf8')
    elif typeArg is str:
        return stringOrUnicode.decode('utf8')

Otherwise, show us the message error to try and detect exactly where is the problem.

Python PYPDF2 : 'utf-8' codec can't decode byte 0x80 in position 395: invalid start byte

1 Answers