I'm very new to programming, so please be patient and keep it simple: I only started learning Python last week. I'm happy to post any additional information you need, but remember that I'm a beginner.
My problem:
I'm using macOS Sierra with Visual Studio Code and Python 2.7, and I'm running into huge data-processing times (5+ minutes, often closer to 10+ minutes, and 30+ minutes for this particular code).
Any suggestions? I haven't been able to find much about a solution anywhere online.
My CPU usage in Activity Monitor sits at a steady 98% or so while these processes run, and I don't know whether this is normal or what to do to speed things up.
caveat:
With simple code my processing time isn't too bad, but once algorithms are introduced things bog way down, which is frustrating.
Below is the code I'm using. It seems to run fine apart from the insane processing times; the output is included at the end:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
class VoteClassifier(ClassifierI):
    """Ensemble classifier that labels a sample by majority vote.

    Every wrapped classifier must provide a ``classify(features)`` method;
    the label returned most often wins.
    """

    def __init__(self, *classifiers):
        """Store the classifiers whose votes will be combined."""
        self._classifiers = classifiers

    def _collect_votes(self, features):
        """Return one predicted label per wrapped classifier, in order."""
        return [c.classify(features) for c in self._classifiers]

    def classify(self, features):
        """Return the most common label among the classifiers' votes."""
        return mode(self._collect_votes(features))

    def confidence(self, features):
        """Return the fraction (0.0 to 1.0) of votes cast for the winning label."""
        votes = self._collect_votes(features)
        choice_votes = votes.count(mode(votes))
        # float() forces true division. Under Python 2, int / int truncates,
        # so every non-unanimous vote used to be reported as 0.
        return float(choice_votes) / len(votes)
# Build one (token_list, category) pair per review in the corpus.
documents = []
for label in movie_reviews.categories():
    for review_id in movie_reviews.fileids(label):
        tokens = list(movie_reviews.words(review_id))
        documents.append((tokens, label))

# The loop above groups reviews by category; shuffle so later slices of
# the list contain a mix of categories.
random.shuffle(documents)
# Count how often each lowercased token occurs across the whole corpus.
# Feeding a generator avoids building a throwaway list of every token first.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Use the 3000 most frequent words as the feature vocabulary.
# list(all_words.keys())[:3000] is not ordered by frequency, so it picked
# an arbitrary 3000 words instead of the most common ones.
word_features = [word for word, _count in all_words.most_common(3000)]
def find_features(document):
    """Return a dict mapping each vocabulary word to its presence in *document*.

    *document* is an iterable of tokens. The result has one key per entry
    of the module-level ``word_features`` list, with a boolean value that
    is True when that word occurs in the document.
    """
    # A set gives constant-time membership tests for the lookups below.
    present = set(document)
    return {term: (term in present) for term in word_features}
# print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
# Convert every (tokens, category) document into (feature_dict, category).
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Train on the first 1900 samples and evaluate on the rest.
# The earlier slice [:1900:] was identical to [:1900], so every classifier
# was being scored on the data it had been trained on.
training_set = featuresets[:1900]
testing_set = featuresets[1900:]
# Load the previously trained NLTK Naive Bayes classifier instead of
# retraining it on every run. To retrain from scratch, use instead:
#     classifier = nltk.NaiveBayesClassifier.train(training_set)
#
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles you created yourself.
# NOTE(review): the pickled model was presumably trained on a different
# shuffle of the documents, so its accuracy here may be optimistic -- confirm.
with open("naivebayes.pickle", "rb") as classifier_f:
    # The context manager closes the file even if unpickling raises.
    classifier = pickle.load(classifier_f)

print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

# To save a freshly trained classifier for later runs:
#     with open("naivebayes.pickle", "wb") as save_classifier:
#         pickle.dump(classifier, save_classifier)
def _train_and_report(label, estimator):
    """Wrap a scikit-learn estimator for NLTK, train it, and print its accuracy.

    *label* is the name shown in the printed accuracy line. The trained
    SklearnClassifier wrapper is returned.
    """
    wrapped = SklearnClassifier(estimator)
    wrapped.train(training_set)
    print(label + " accuracy percent:", (nltk.classify.accuracy(wrapped, testing_set))*100)
    return wrapped


# Each classifier is trained and scored in turn, in the original order.
MNB_classifier = _train_and_report("MNB_classifier", MultinomialNB())

# GaussianNB is left disabled, as in the original script:
# GaussianNB_classifier = _train_and_report("GaussianNB_classifier", GaussianNB())

BernoulliNB_classifier = _train_and_report("BernoulliNB_classifier", BernoulliNB())

LogisticRegression_classifier = _train_and_report("LogisticRegression_classifier", LogisticRegression())

SGDClassifier_classifier = _train_and_report("SGDClassifier_classifier", SGDClassifier())

# SVC is left disabled, as in the original script:
# SVC_classifier = _train_and_report("SVC_classifier", SVC())

LinearSVC_classifier = _train_and_report("LinearSVC_classifier", LinearSVC())

NuSVC_classifier = _train_and_report("NuSVC_classifier", NuSVC())
# Combine the seven trained classifiers into one majority-vote ensemble.
voted_classifier = VoteClassifier(
    classifier,
    MNB_classifier,
    BernoulliNB_classifier,
    LogisticRegression_classifier,
    SGDClassifier_classifier,
    LinearSVC_classifier,
    NuSVC_classifier,
)
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

# Show the ensemble's label and vote agreement for the first six test samples.
for sample_index in range(6):
    sample_features = testing_set[sample_index][0]
    print("Classication:", voted_classifier.classify(sample_features), "Confidence %:", voted_classifier.confidence(sample_features)*100)
('Original Naive Bayes Algo accuracy percent:', 87.31578947368422)
Most Informative Features
insulting = True neg : pos = 11.0 : 1.0
sans = True neg : pos = 9.0 : 1.0
refreshingly = True pos : neg = 8.4 : 1.0
wasting = True neg : pos = 8.3 : 1.0
mediocrity = True neg : pos = 7.7 : 1.0
dismissed = True pos : neg = 7.0 : 1.0
customs = True pos : neg = 6.3 : 1.0
fabric = True pos : neg = 6.3 : 1.0
overwhelmed = True pos : neg = 6.3 : 1.0
bruckheimer = True neg : pos = 6.3 : 1.0
wires = True neg : pos = 6.3 : 1.0
uplifting = True pos : neg = 6.2 : 1.0
ugh = True neg : pos = 5.8 : 1.0
stinks = True neg : pos = 5.8 : 1.0
lang = True pos : neg = 5.7 : 1.0
('MNB_classifier accuracy percent:', 89.21052631578948)
('BernoulliNB_classifier accuracy percent:', 86.42105263157895)
('LogisticRegression_classifier accuracy percent:', 94.47368421052632)
('SGDClassifier_classifier accuracy percent:', 85.73684210526315)
('LinearSVC_classifier accuracy percent:', 99.52631578947368)
('NuSVC_classifier accuracy percent:', 91.52631578947368)
('voted_classifier accuracy percent:', 93.36842105263158)
('Classication:', u'pos', 'Confidence %:', 100)
('Classication:', u'pos', 'Confidence %:', 0)
('Classication:', u'neg', 'Confidence %:', 0)
('Classication:', u'neg', 'Confidence %:', 100)
('Classication:', u'neg', 'Confidence %:', 100)
('Classication:', u'neg', 'Confidence %:', 100)