0
votes

I am trying to implement multinomial naive Bayes from scratch, without using sklearn's MultinomialNB, but I could not get it to work. When I fit my MultinomialNB classifier to the training set, I run into a problem. Here is the problem:

enter image description here

I build the bag-of-words model and split the data into training and test sets.

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split #it was "sklearn.cross_validation" but now it changed
# Raw documents and their sentiment labels.
X = corpus
y = dataset.id_sentimen

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# TF-IDF weights over word unigrams and bigrams.
vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2))

# Fit the vocabulary on the training documents only, then reuse it for the
# test documents.  Both matrices are converted to dense arrays: previously
# only X_train was densified, so the test matrix stayed sparse and could not
# be processed row by row the same way as the training matrix.
X_train = vect.fit_transform(X_train).toarray()
X_test = vect.transform(X_test).toarray()

Here is my MultinomialNB implementation, written without the sklearn library:

class MultinomialNB():
    """Naive Bayes classifier that treats every feature column as categorical.

    Despite the name, ``fit`` tabulates how often each distinct value of each
    column occurs within each class and smooths those frequencies additively.
    Prediction picks the class with the largest log joint probability.

    Attributes set by ``fit``:
        ls: additive smoothing constant.
        y_classes: sorted array of distinct labels.
        x_classes: per feature, sorted array of the distinct values seen.
        phi_y: class priors, aligned with ``y_classes``.
        phi_x: ``phi_x[class_idx][j][i]`` is the smoothed probability that
            feature ``j`` equals ``x_classes[j][i]`` given the class.
        c_x: ``c_x[class_idx][j]`` is the number of training rows of the class.
    """

    @staticmethod
    def _as_dense(X):
        """Return ``X`` as an ndarray, densifying objects that offer ``toarray()``."""
        if hasattr(X, "toarray"):
            X = X.toarray()
        return np.asarray(X)

    def fit(self, X_train, y_train, ls=0.01):
        """Estimate priors and smoothed per-feature value probabilities.

        Args:
            X_train: 2-D feature matrix (dense, or anything with ``toarray()``).
            y_train: labels, one per row of ``X_train``.
            ls: additive (Laplace/Lidstone) smoothing constant.

        Returns:
            self, so calls can be chained.
        """
        X_train = self._as_dense(X_train)
        y_train = np.asarray(y_train)
        self.ls = ls
        self.y_classes, y_counts = np.unique(y_train, return_counts=True)
        # Was ``X.T``: an undefined name inside this method.
        self.x_classes = [np.unique(column) for column in X_train.T]
        self.phi_y = 1.0 * y_counts / y_counts.sum()
        self.phi_x = self.mean_X(X_train, y_train)
        self.c_x = self.count_x(X_train, y_train)
        return self

    def mean_X(self, X_train, y_train):
        """Return the nested list ``[class][feature] -> value probabilities``."""
        return [[self.ls_mean_x(X_train, y_train, k, j)
                 for j in range(len(self.x_classes))]
                for k in self.y_classes]

    def ls_mean_x(self, X_train, y_train, k, j):
        """Return smoothed probabilities of each known value of feature ``j`` in class ``k``.

        The denominator adds ``ls`` once per distinct value of feature ``j`` so
        the returned probabilities sum to one.
        """
        values = self.x_classes[j]
        in_class = np.asarray(y_train) == k
        # Boolean matrix: one row per class-k sample, one column per known value.
        x_data = (X_train[:, j][in_class].reshape(-1, 1) == values)
        return (x_data.sum(axis=0) + self.ls) / (len(x_data) + len(values) * self.ls)

    def get_mean_x(self, y_train, j):
        """Return the smoothed probability of a value of feature ``j`` never seen in training.

        Args:
            y_train: positional index of the class in ``y_classes`` (the
                parameter name is kept for backward compatibility).
            j: feature index.
        """
        # A zero count plus smoothing; the former ``1 + ls / ...`` exceeded 1.
        return self.ls / (self.c_x[y_train][j] + len(self.x_classes[j]) * self.ls)

    def count_x(self, X_train, y_train):
        """Return ``[class][feature] -> number of training rows in that class``."""
        labels = np.asarray(y_train)
        n_features = len(self.x_classes)
        # The count does not depend on the feature, so compute it once per class.
        return [[int(np.count_nonzero(labels == k))] * n_features
                for k in self.y_classes]

    def predict(self, X_train):
        """Return an array with the predicted label for every row of ``X_train``."""
        X_train = self._as_dense(X_train)
        # Collect class indices and map them through y_classes so the label
        # dtype is preserved (apply_along_axis sizes its output from the first
        # result, which truncates longer string labels) and empty input works.
        best = np.fromiter((self._best_class_index(row) for row in X_train),
                           dtype=np.intp, count=len(X_train))
        return self.y_classes[best]

    def compute_probs(self, x):
        """Return the most probable label for the single sample ``x``."""
        return self.y_classes[self._best_class_index(x)]

    def compute_prob(self, x, y):
        """Return the joint probability ``P(x, class y)``; ``y`` is a class index.

        May underflow to 0.0 for wide inputs; class selection uses the log form.
        """
        return np.exp(self._log_joint(x, y))

    def _best_class_index(self, x):
        """Return the index in ``y_classes`` with the largest log joint probability."""
        scores = [self._log_joint(x, y) for y in range(len(self.y_classes))]
        return int(np.argmax(scores))

    def _log_joint(self, x, y):
        """Return ``log P(class y) + sum_j log P(x_j | class y)``."""
        # log(0) can only occur with ls == 0; -inf is the intended result there.
        with np.errstate(divide="ignore"):
            total = np.log(self.phi_y[y])
            for j, value in enumerate(x):
                values = self.x_classes[j]
                # x_classes[j] comes from np.unique, hence sorted.
                i = np.searchsorted(values, value)
                if i < len(values) and values[i] == value:
                    p_x_j_y = self.phi_x[y][j][i]  # p(xj|y)
                else:
                    p_x_j_y = self.get_mean_x(y, j)
                total += np.log(p_x_j_y)
        return total

    def evaluate(self, X_train, y_train):
        """Return the fraction of rows whose predicted label equals ``y_train``."""
        return (self.predict(X_train) == np.asarray(y_train)).mean()

I want to fit the MultinomialNB to my training set

# Train the from-scratch classifier on the training split
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict labels for the held-out split and turn the true labels into an array
y_pred = classifier.predict(X_test)
ytest = np.array(y_test)

# f1_score(ytest, y_pred, average='weighted')
# Print the per-class report first, then the confusion matrix
for report in (classification_report, confusion_matrix):
    print(report(ytest, y_pred))

Could anyone help me fix the code and resolve the error?

1
Welcome to SO; where exactly in all this code does the error pop up? Please edit your post to include the full error trace. – desertnaut

1 Answer

0
votes

I built my own implementation from scratch; perhaps it can help you build yours:

import numpy as np

class multinomialNB:
    """Multinomial naive Bayes classifier with additive smoothing.

    Each class gets a prior (its share of the training rows) and a per-feature
    likelihood proportional to the summed feature values of that class plus
    ``alpha``.  Prediction maximises ``log prior + sum(x * log likelihood)``.
    """

    def __init__(self, alpha=1):
        """Store the additive smoothing constant ``alpha``."""
        self.alpha = alpha

    @staticmethod
    def _as_dense(X):
        """Return ``X`` as an ndarray, densifying objects that offer ``toarray()``."""
        if hasattr(X, "toarray"):
            X = X.toarray()
        return np.asarray(X)

    def fit(self, X_train, y_train):
        """Estimate class priors and smoothed feature likelihoods.

        Args:
            X_train: 2-D matrix of shape (samples, features).
            y_train: labels, one per row of ``X_train``.

        Returns:
            self, so calls can be chained.
        """
        X_train = self._as_dense(X_train)
        # An ndarray makes the ``== c`` mask element-wise even for list labels.
        y_train = np.asarray(y_train)
        m, n = X_train.shape
        self._classes = np.unique(y_train)
        n_classes = len(self._classes)

        # init: Prior & Likelihood
        self._priors = np.zeros(n_classes)
        self._likelihoods = np.zeros((n_classes, n))

        # Get Prior and Likelihood
        for idx, c in enumerate(self._classes):
            X_train_c = X_train[y_train == c]
            self._priors[idx] = X_train_c.shape[0] / m
            smoothed = X_train_c.sum(axis=0) + self.alpha
            # Normalise so each class's likelihoods sum to one.
            self._likelihoods[idx, :] = smoothed / smoothed.sum()
        return self

    def predict(self, X_test):
        """Return a list with the predicted label for every row of ``X_test``."""
        return [self._predict(x_test) for x_test in self._as_dense(X_test)]

    def _predict(self, x_test):
        """Return the label with the highest log posterior for one sample."""
        # Calculate posterior for each class
        posteriors = []
        for idx, c in enumerate(self._classes):
            prior_c = np.log(self._priors[idx])
            likelihoods_c = self.calc_likelihood(self._likelihoods[idx, :], x_test)
            posteriors_c = np.sum(likelihoods_c) + prior_c
            posteriors.append(posteriors_c)

        return self._classes[np.argmax(posteriors)]

    def calc_likelihood(self, cls_likeli, x_test):
        """Return the per-feature log-likelihood terms ``x * log(likelihood)``."""
        return np.log(cls_likeli) * x_test

    def score(self, X_test, y_test):
        """Return the fraction of rows of ``X_test`` predicted as ``y_test``."""
        # Compare as arrays: ``list == list`` would collapse to a single bool.
        y_pred = np.asarray(self.predict(X_test))
        y_true = np.asarray(y_test)
        return np.sum(y_pred == y_true) / len(y_true)