I couldn't find and solve multinomial naive Bayes from scratch without the sklearn MultinomialNB library. But, when I fit MultinomialNB Classifier to the training set. But there's some problem. Here's the problem
I make the bag of words model and splitting the train and test set.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split #it was "sklearn.cross_validation" but now it changed
X = corpus
y = dataset.id_sentimen
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.30, random_state=0)
vect = TfidfVectorizer(analyzer='word', ngram_range=(1,2))
X_train = vect.fit_transform(X_train).toarray()
X_test = vect.transform(X_test)
Here's the code MultinomialNB without library from sklearn
class MultinomialNB():
def fit(self, X_train, y_train, ls=0.01):
self.ls = ls
self.y_classes, y_counts = np.unique(y_train, return_counts=True)
self.x_classes = [np.unique(x) for x in X.T]
self.phi_y = 1.0 * y_counts/y_counts.sum()
self.phi_x = self.mean_X(X_train, y_train)
self.c_x = self.count_x(X_train, y_train)
return self
def mean_X(self, X_train, y_train):
return [[self.ls_mean_x(X_train, y_train, k, j) for j in range(len(self.x_classes))] for k in self.y_classes]
def ls_mean_x(self, X_train, y_train, k, j):
x_data = (X_train[:,j][y==k].reshape(-1,1) == self.x_classes[j])
return (x_data.sum(axis=0) + self.ls ) / (len(x_data) + (len(self.x_classes) * self.ls))
def get_mean_x(self, y_train, j):
return 1 + self.ls / (self.c_x[y][j] + (len(self.x_classes) * self.ls))
def count_x(self, X_train, y_train):
return [[len(X[:,j][y==k].reshape(-1,1) == self.x_classes[j])
for j in range(len(self.x_classes))]
for k in self.y_classes]
def predict(self, X_train):
return np.apply_along_axis(lambda x: self.compute_probs(x), 1, X_train)
def compute_probs(self, x):
probs = np.array([self.compute_prob(x, y) for y in range(len(self.y_classes))])
return self.y_classes[np.argmax(probs)]
def compute_prob(self, x, y):
Pxy = 1
for j in range(len(x)):
x_clas = self.x_classes[j]
if x[j] in x_clas:
i = list(x_clas).index(x[j])
p_x_j_y = self.phi_x[y][j][i] # p(xj|y)
Pxy *= p_x_j_y
else:
Pxy *= get_mean_x(y, j)
return Pxy * self.phi_y[y]
def evaluate(self, X_train, y_train):
return (self.predict(X_train) == y_train).mean()
I want to fit the MultinomialNB to my training set
# Fitting MultinomialNB Classifier to the training set
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
# Predicting test results
y_pred = classifier.predict(X_test)
ytest = np.array(y_test)
# f1_score(ytest, y_pred, average='weighted')
print(classification_report(ytest, y_pred))
print(confusion_matrix(ytest, y_pred))
Could anyone help solve the code and the error?