1
votes

I am following this example to create a multinomial naive Bayes classifier for text data in scikit-learn. However, the output of the confusion matrix and the classifier's F-1 score is incorrect. I think the errors have to do with the input data format I am using. I have one CSV file per training example. Each CSV file contains one row with features like so: 'blah, blahblah, andsoon'. Each file is labeled either positive or negative. How can I correctly read these files?

Here is my code:

import csv
import os

import numpy
import pandas
from pandas import DataFrame
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

NEWLINE = '\n'

NEGATIVE = 'negative'
POSITIVE = 'positive'

SOURCES = [
    ('negative\\', NEGATIVE),
    ('positive\\', POSITIVE)
]

SKIP_FILES = {'cmds'}


def build_data_frame(policies, path, classification):
    """Read one CSV file per policy and collect its rows into a DataFrame.

    Parameters
    ----------
    policies : iterable of str
        Base file names (without the '.csv' extension) to look for.
    path : str
        Directory prefix (must already end with the path separator); it is
        concatenated directly with each policy name.
    classification : str
        Label (e.g. 'positive' or 'negative') attached to every row read.

    Returns
    -------
    DataFrame
        Columns 'text' and 'class', indexed by the source file path.
        Files that do not exist are silently skipped (best-effort read).
    """
    rows = []
    index = []

    for policy in policies:
        current_csv = path + policy + '.csv'

        # Skip missing files instead of raising; some policies may simply
        # not have a CSV on disk.
        if not os.path.isfile(current_csv):
            continue

        with open(current_csv, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')

            # Each CSV row is a list of comma-separated feature tokens; join
            # them with spaces so CountVectorizer later sees plain words.
            for row in reader:
                rows.append({'text': ' '.join(row), 'class': classification})
                index.append(current_csv)

    return DataFrame(rows, index=index)


def policy_analyzer_main(policies, write_pol_path):
    """Load every source directory's policies into one DataFrame and classify.

    Parameters
    ----------
    policies : iterable of str
        Policy base names forwarded to build_data_frame().
    write_pol_path : str
        Root directory that is prefixed to each SOURCES sub-path.
    """
    # Seed with an empty frame so the result has the expected columns even
    # when no source contributes any rows.
    frames = [DataFrame({'text': [], 'class': []})]
    for path, classification in SOURCES:
        frames.append(build_data_frame(policies, write_pol_path + path, classification))
    # pandas.concat replaces DataFrame.append, which was deprecated in
    # pandas 1.4 and removed in pandas 2.0; behavior is identical here.
    data = pandas.concat(frames)
    classify(data)

# Module-level pipeline shared by classify(): raw text -> bag-of-words
# counts (CountVectorizer) -> multinomial naive Bayes classifier.
pipeline = Pipeline([
    ('count_vectorizer',   CountVectorizer()),
    ('classifier',         MultinomialNB())
])

def classify(data):
    """Evaluate the module-level pipeline with 10-fold cross-validation.

    Prints the number of examples, the mean F-1 score (positive class),
    and the summed confusion matrix across folds.

    Parameters
    ----------
    data : DataFrame
        Must have 'text' (raw documents) and 'class' (labels) columns.
    """
    # shuffle=True is essential here: the frame is built by concatenating
    # all negative rows followed by all positive rows, so contiguous
    # (unshuffled) folds are heavily class-skewed. That skew is what makes
    # the classifier predict no positives on some folds and triggers the
    # "F-score is ill-defined" warning. random_state fixes the shuffle for
    # reproducible scores.
    k_fold = KFold(n=len(data), n_folds=10, shuffle=True, random_state=0)
    scores = []
    confusion = numpy.array([[0, 0], [0, 0]])
    for train_indices, test_indices in k_fold:
        train_text = data.iloc[train_indices]['text'].values
        train_y = data.iloc[train_indices]['class'].values.astype(str)

        test_text = data.iloc[test_indices]['text'].values
        test_y = data.iloc[test_indices]['class'].values.astype(str)

        pipeline.fit(train_text, train_y)
        predictions = pipeline.predict(test_text)

        # Pin the label order so the matrix is always
        # [[TN, FP], [FN, TP]] regardless of which labels a fold contains.
        confusion += confusion_matrix(test_y, predictions,
                                      labels=[NEGATIVE, POSITIVE])
        score = f1_score(test_y, predictions, pos_label=POSITIVE)
        scores.append(score)

    print('Total emails classified:', len(data))
    print('Score:', sum(scores)/len(scores))
    print('Confusion matrix:')
    print(confusion)

Here is an example of the warning message I am getting:

UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
 ('Total emails classified:', 75)
 ('Score:', 0.025000000000000001)
Confusion matrix:
[[39 35]
 [46 24]]
1

1 Answers

1
votes

Look at your predictions on each iteration over the train-test split. That warning means that your algorithm labeled all test samples as negative even though some samples in the test set are positive (even if only one sample is positive, the warning will still be raised).

Also look at your splits over the dataset, because it's possible that some of the test splits contain only one positive sample and your classifier misclassifies it.

For example, f1_score raises that warning in the following case (this makes clear what happens in your code):

from sklearn.metrics import f1_score

# here we have only 4 labels of 4 samples
f1_score([0,0,1,0],[0,0,0,0])
/usr/local/lib/python3.4/dist-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
'precision', 'predicted', average, warn_for)