I'm using the Naive Bayes classifier from NLTK to perform sentiment analysis on some tweets. I'm training it on the corpus file found here, and following the method described there: https://towardsdatascience.com/creating-the-twitter-sentiment-analysis-program-in-python-with-naive-bayes-classification-672e5589a7ed.
When building the training set I used all ~4000 tweets in the data set, but I also thought I'd test with a very small set of 30.
With the full set, the classifier only returns 'neutral' as the label when classifying a new set of tweets, but with 30 it only returns 'positive'. Does this mean my training data is incomplete, or too heavily 'weighted' with neutral entries, and that this is why my classifier only returns neutral when training on ~4000 tweets?
I've included my full code below.
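To show what I mean by 'weighted', this is a minimal sketch of how I would count the labels in my training data (assuming the TrainingData list that buildtrainingset below returns):

from collections import Counter

# Count how many training tweets carry each label; a strong skew towards
# 'neutral' would be consistent with the classifier predicting only neutral
label_counts = Counter(tweet['label'] for tweet in TrainingData)
print(label_counts)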
import csv
import time
import twitter

twitter_api = twitter.Api(consumer_key=consumer_key,
                          consumer_secret=consumer_secret,
                          access_token_key=access_token,
                          access_token_secret=access_token_secret)
# Test set builder: fetch up to 50 pages of recent tweets for a keyword
def buildtestset(keyword):
    try:
        min_id = None
        tweets = []
        ids = []
        for i in range(0, 50):
            tweetsdata = twitter_api.GetSearch(keyword, count=100, max_id=min_id)
            for t in tweetsdata:
                tweets.append(t)
                ids.append(t.id)
            min_id = min(ids)
        print(str(len(tweets)) + ' tweets found for keyword: ' + keyword)
        return [{"text": status.text, "label": None} for status in tweets]
    except Exception as e:
        print('this is so sad:', e)
        return None
# Quick test
keyword = 'bicycle'
testdataset = buildtestset(keyword)
# Training set builder
def buildtrainingset(corpusfile, tweetdata):
    # corpusfile = path to the corpus CSV (topic, label, tweet_id)
    # tweetdata  = path to the file we save the fetched tweets to
    corpus = []
    with open(corpusfile, 'r') as csvfile:
        linereader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in linereader:
            # Append every row of corpusfile to our corpus list
            corpus.append({'tweet_id': row[2], 'label': row[1], 'topic': row[0]})

    # Twitter allows 180 calls per 15-minute window, so space the requests out
    rate_limit = 180
    sleep_time = 900 / 180

    # Call the API for every tweet id to get the status object and its text,
    # then put it in our data set - trainingdata
    trainingdata = []
    count = 0
    for tweet in corpus:
        if count < 30:
            try:
                status = twitter_api.GetStatus(tweet['tweet_id'])
                print('Tweet fetched ' + status.text)
                tweet['text'] = status.text
                trainingdata.append(tweet)
                time.sleep(sleep_time)
                count += 1
            except Exception:
                count += 1
                continue

    # Write the fetched tweets to an empty CSV
    with open(tweetdata, 'w', encoding='utf-8') as csvfile:
        linewriter = csv.writer(csvfile, delimiter=',', quotechar='"')
        for tweet in trainingdata:
            try:
                linewriter.writerow([tweet['tweet_id'], tweet['text'], tweet['label'], tweet['topic']])
            except Exception as e:
                print(e)
    return trainingdata
corpusfile = (r'C:\Users\zacda\OneDrive\Desktop\DATA2901\Assignment\corpusmaster.csv')
tweetdata = (r'C:\Users\zacda\OneDrive\Desktop\DATA2901\Assignment\tweetdata.csv')
TrainingData = buildtrainingset(corpusfile,tweetdata)
import re # regular expression library
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
class preprocesstweets:
    def __init__(self):
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL'])

    def processtweets(self, list_of_tweets):
        processedtweets = []
        for tweet in list_of_tweets:
            processedtweets.append((self._processtweet(tweet["text"]), tweet["label"]))
        return processedtweets

    def _processtweet(self, tweet):
        tweet = tweet.lower()  # convert text to lower-case
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # replace URLs with the URL token
        tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # replace usernames with the AT_USER token
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
        tweet = word_tokenize(tweet)  # split the tweet into word tokens
        return [word for word in tweet if word not in self._stopwords]
tweetprocessor = preprocesstweets()
processedtrainingdata = tweetprocessor.processtweets(TrainingData)
processedtestdata = tweetprocessor.processtweets(testdataset)
# Collect all the words in the training set; word_features is the list of distinct words, taken from a frequency distribution
import nltk

def buildvocab(processedtrainingdata):
    all_words = []
    for (words, sentiment) in processedtrainingdata:
        all_words.extend(words)
    wordlist = nltk.FreqDist(all_words)
    word_features = wordlist.keys()
    return word_features
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
        # One boolean feature per vocabulary word:
        # True if the word appears in this tweet, False otherwise
        features['contains(%s)' % word] = (word in tweet_words)
    return features
# Building the feature vector
word_features = buildvocab(processedtrainingdata)
training_features = nltk.classify.apply_features(extract_features, processedtrainingdata)
# apply_features does the actual feature extraction
# Naive Bayes Classifier
Nbayes = nltk.NaiveBayesClassifier.train(training_features)
Nbayes_result_labels = [Nbayes.classify(extract_features(tweet[0])) for tweet in processedtestdata]
# Majority vote over the predicted labels
if Nbayes_result_labels.count('positive') > Nbayes_result_labels.count('negative'):
    print('Positive')
    print(str(100 * Nbayes_result_labels.count('positive') / len(Nbayes_result_labels)))
elif Nbayes_result_labels.count('negative') > Nbayes_result_labels.count('positive'):
    print('Negative')
    print(str(100 * Nbayes_result_labels.count('negative') / len(Nbayes_result_labels)))
else:
    print('Neutral')
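If it helps with the diagnosis, this is a short sketch of the checks I could run after training (show_most_informative_features is NLTK's own method on the trained classifier; Counter is from the standard library):

from collections import Counter

# Which word features does the trained classifier weigh most heavily?
Nbayes.show_most_informative_features(20)

# Distribution of the predicted labels over the test tweets
print(Counter(Nbayes_result_labels))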