I am following this example to create a multinomial naive Bayes classifier for text data in scikit-learn. However, the output of the confusion matrix and classifier F-1 score is incorrect. I think the errors have to do with the input data format I am using. I have one csv file per training example. The csv file contains one row with features like so 'blah, blahblah, andsoon'. Each file is classified as either positive or negative. How can I correctly read these files?
Here is my code:
import csv
import os

import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix, f1_score
# NOTE(review): NEWLINE appears unused in this file — confirm before removing.
NEWLINE = '\n'
# Class labels used throughout (pos_label for f1_score, folder names below).
NEGATIVE = 'negative'
POSITIVE = 'positive'
# (sub-directory, label) pairs; Windows-style trailing backslashes are
# concatenated directly onto the base path in policy_analyzer_main.
SOURCES = [
('negative\\', NEGATIVE),
('positive\\', POSITIVE)
]
# NOTE(review): SKIP_FILES is never consulted in the visible code — verify.
SKIP_FILES = {'cmds'}
def build_data_frame(policies, path, classification):
    """Read one CSV file per policy and collect its rows into a DataFrame.

    Parameters
    ----------
    policies : iterable of str
        Policy names; each maps to a file ``<path><policy>.csv``.
    path : str
        Directory prefix; must already end with a path separator because it
        is concatenated directly with the policy name.
    classification : str
        Label (POSITIVE / NEGATIVE) attached to every row read.

    Returns
    -------
    DataFrame
        Columns 'text' and 'class', indexed by the source file path
        (one index entry per CSV row, so the index may repeat).
    """
    rows = []
    index = []
    for policy in policies:
        current_csv = path + policy + '.csv'
        # Skip missing files silently: not every policy exists in both
        # the positive and the negative folder.
        if not os.path.isfile(current_csv):
            continue
        with open(current_csv, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            for row in reader:
                # Re-join the comma-split fields into one space-separated
                # text string so CountVectorizer sees plain tokens.
                clean_row = ' '.join(row)
                rows.append({'text': clean_row, 'class': classification})
                index.append(current_csv)
    return DataFrame(rows, index=index)
def policy_analyzer_main(policies, write_pol_path):
    """Assemble labelled policy text from every source folder, then classify.

    Builds one DataFrame per (sub-folder, label) pair in SOURCES, stacks
    them onto an initially empty frame, and hands the result to classify().
    """
    combined = DataFrame({'text': [], 'class': []})
    for sub_path, label in SOURCES:
        frame = build_data_frame(policies, write_pol_path + sub_path, label)
        combined = combined.append(frame)
    classify(combined)
# Module-level model shared by classify(): bag-of-words counts feeding a
# multinomial naive Bayes classifier (default hyper-parameters for both).
pipeline = Pipeline([
('count_vectorizer', CountVectorizer()),
('classifier', MultinomialNB())
])
def classify(data):
    """Run 10-fold cross-validation of the module-level `pipeline` on `data`.

    Parameters
    ----------
    data : DataFrame
        Must contain 'text' (raw document strings) and 'class' (labels).

    Prints the number of examples, the mean F1 score (positive class) and
    the confusion matrix summed over all folds.
    """
    # BUG FIX: shuffle before splitting. `data` is appended one class at a
    # time (all negatives, then all positives), so unshuffled contiguous
    # folds are class-homogeneous: training folds can miss a class entirely
    # and test folds can contain a single class — which is exactly what
    # produces "F-score is ill-defined ... no predicted samples" and the
    # near-zero score reported. random_state pins reproducibility.
    k_fold = KFold(n=len(data), n_folds=10, shuffle=True, random_state=0)
    scores = []
    confusion = numpy.array([[0, 0], [0, 0]])
    for train_indices, test_indices in k_fold:
        train_text = data.iloc[train_indices]['text'].values
        train_y = data.iloc[train_indices]['class'].values.astype(str)
        test_text = data.iloc[test_indices]['text'].values
        test_y = data.iloc[test_indices]['class'].values.astype(str)
        pipeline.fit(train_text, train_y)
        predictions = pipeline.predict(test_text)
        # Pin the label order so per-fold 2x2 matrices always align with
        # the accumulator, even when a fold is missing one class.
        confusion += confusion_matrix(test_y, predictions,
                                      labels=[NEGATIVE, POSITIVE])
        scores.append(f1_score(test_y, predictions, pos_label=POSITIVE))
    print('Total emails classified:', len(data))
    print('Score:', sum(scores) / len(scores))
    print('Confusion matrix:')
    print(confusion)
Here is an example of the warning message I am getting:
UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
'precision', 'predicted', average, warn_for)
('Total emails classified:', 75)
('Score:', 0.025000000000000001)
Confusion matrix:
[[39 35]
[46 24]]