I dont quite understand why I cannot Lemmatize or do Stemming. I tried converting the array to string, but I have no luck.
This is my code.
import bs4, re, string, nltk, numpy as np, pandas as pd
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen
news_url="https://news.google.com/news/rss"
Client=urlopen(news_url)
xml_page=Client.read()
Client.close()
soup_pg=soup(xml_page,"xml")
news_lst=soup_page.findAll("item")
limit=19
corpus = []
# Print news title, url and publish date
for index, news in enumerate(news_list):
#print(news.title.text)
#print(index+1)
corpus.append(news.title.text)
if index ==limit:
break
#print(arrayList)
df = pd.DataFrame(corpus, columns=['News'])
wpt=nltk.WordPunctTokenizer()
stop_words=nltk.corpus.stopwords.words('english')
def normalize_document (doc):
#lowercase and remove special characters\whitespace
doc=re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A) #re.I ignore case sensitive, ASCII-only matching
doc=doc.lower()
doc=doc.strip()
#tokenize document
tokens=wpt.tokenize(doc)
#filter stopwords out of document
filtered_tokens=[token for token in tokens if token not in stop_words]
#re-create documenr from filtered tokens
doc=' '.join(filtered_tokens)
return doc
normalize_corpus=np.vectorize(normalize_document)
norm_corpus=normalize_corpus(corpus)
norm_corpus
The error I get starts with the next lines I add
stemmer = PorterStemmer()
sentences = nltk.sent_tokenize(norm_corpus)
# Stemming
for i in range(len(norm_corpus)):
words = nltk.word_tokenize(norm_corpus[i])
words = [stemmer.stem(word) for word in words]
norm_corpus[i] = ' '.join(words)
once I insert these lines then I get the following error: TypeError: cannot use a string pattern on a bytes-like object
I think if I solve the error with stemming it will be the same solution to my error with lemmatization.