7
votes

Hey, I am trying to use a Naive Bayes classifier to classify some text. I am using NLTK. Whenever I test the classifier using the classify() method, it returns the correct classification for the first item, and then that same classification for every other line of text I classify. The following is my code:

from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
import nltk
import random
import nltk.data

# Pair each review's word list with its category label, for every category
# and every file id in the movie_reviews corpus.
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
# Shuffle in place; the list above was built one category at a time.
random.shuffle(documents)

# Count lower-cased word occurrences across the whole corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# NOTE(review): taking the first 2000 keys presumably relies on keys() being a
# frequency-ordered list -- confirm for the installed NLTK/Python version.
word_features = all_words.keys()[:2000] 

def bag_of_words(words):
    """Return a feature dict that maps every token in `words` to True."""
    return {token: True for token in words}

def document_features(document): 
    """Build the feature dict for one document.

    For every word in the module-level word_features, stores whether that
    word occurs in `document` under the key 'contains(<word>)'.
    """
    # Convert to a set once so each membership test below is a hash lookup.
    document_words = set(document) 
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

# Turn every (words, category) pair into (feature dict, category).
featuresets = [(document_features(d), c) for (d,c) in documents]
# Hold out the first 100 feature sets for testing; train on the rest.
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

text1="i love this city"
text2="i hate this city"


# These feature dicts come from bag_of_words(), not document_features().
feats1=bag_of_words(word_tokenize(text1))
feats2=bag_of_words(word_tokenize(text2))


print classifier.classify(feats1)
print classifier.classify(feats2)

This code prints pos twice, whereas if I swap the last two lines of the code it prints neg twice. Can anyone help?

1

1 Answer

4
votes

Change

features['contains(%s)' % word] = (word in document_words)

to

features[word] = (word in document)

Otherwise the classifier only knows about features ("words") of the form "contains(...)", and is therefore clueless about the bare words in "i love this city", which are the keys that bag_of_words() passes to classify().


import nltk.tokenize as tokenize
import nltk
import random
random.seed(3)

def bag_of_words(words):
    """Return a dict whose keys are the tokens in `words`, each mapped to True."""
    return {token: True for token in words}

def document_features(document): 
    """Build the feature dict for one document.

    Maps every word in the module-level word_features to whether it occurs
    in `document`. Keys are the bare words, the same key shape that
    bag_of_words() produces.
    """
    features = {}
    for word in word_features:
        features[word] = (word in document)
        # features['contains(%s)' % word] = (word in document_words)
    return features

# Corpus reader for the labelled movie reviews.
movie_reviews = nltk.corpus.movie_reviews

# Pair the set of words in each review with the review's category; the set is
# what document_features() runs its `word in document` tests against.
documents = [(set(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Count lower-cased word occurrences across the whole corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# Keep the 2000 most frequent words as the feature vocabulary.  Sort by count
# explicitly instead of slicing all_words.keys(): keys() is a frequency-ordered
# list only on old NLTK releases, and on Python 3 it is a view that cannot be
# sliced at all.
word_features = sorted(all_words, key=all_words.get, reverse=True)[:2000]

# Train on the first 200 shuffled documents only.
train_set = [(document_features(d), c) for (d, c) in documents[:200]]

classifier = nltk.NaiveBayesClassifier.train(train_set)

# Print the features the trained classifier finds most discriminating.
classifier.show_most_informative_features()

# Report how strongly each probe word, on its own, points to the 'pos' label.
for word in ('love', 'hate'):
    # The sentence tests below cannot pass unless the probe word is a feature.
    assert word in word_features
    pos_prob = classifier.prob_classify({word: True}).prob('pos')
    print('probability {w!r} is positive: {p:.2%}'.format(w=word, p=pos_prob))

tests = [
    "i love this city",
    "i hate this city",
]

# Classify each sentence from its bag-of-words feature dict.
for test in tests:
    words = tokenize.word_tokenize(test)
    feats = bag_of_words(words)
    label = classifier.classify(feats)
    print('{s} => {c}'.format(s=test, c=label))

yields

Most Informative Features
                   worst = True              neg : pos    =     15.5 : 1.0
              ridiculous = True              neg : pos    =     11.5 : 1.0
                  batman = True              neg : pos    =      7.6 : 1.0
                   drive = True              neg : pos    =      7.6 : 1.0
                   blame = True              neg : pos    =      7.6 : 1.0
                terrible = True              neg : pos    =      6.9 : 1.0
                  rarely = True              pos : neg    =      6.4 : 1.0
                 cliches = True              neg : pos    =      6.0 : 1.0
                       $ = True              pos : neg    =      5.9 : 1.0
               perfectly = True              pos : neg    =      5.5 : 1.0
probability 'love' is positive: 61.52%
probability 'hate' is positive: 36.71%
i love this city => pos
i hate this city => neg