0
votes

My KFold cross-validation code for accuracy, AUC and recall used to work fine, but it now raises errors.

I have restarted the kernel a number of times and tried alternative approaches, such as `StratifiedKFold`, iterating with `enumerate`, and manual looping, all to no avail.

from sklearn import svm
from sklearn.model_selection import KFold, cross_val_score

# RBF-kernel support vector classifier; class_weight gives class 1 a weight
# of 5 while any other class keeps the default weight.
svc_clf = svm.SVC(
    C=50,
    kernel='rbf',
    gamma=0.1,
    probability=False,
    class_weight={1: 5},
)
# NOTE(review): the reported "bad input shape (513, 10)" is raised while
# validating y, so y_train is expected to be 1-D with shape (n_samples,).
# Confirm it has not been rebound to a 2-D array elsewhere in the session.
svc_clf.fit(X_train_std, y_train)

# K-fold cross-validator. The class is spelled `KFold`; `Kfold` raises
# NameError because no such name is imported.
kfold = KFold(n_splits=10, random_state=140311, shuffle=True)

# Split the same array that is indexed in the loop body so the generated
# indices are always valid for X_train_std / y_train.
# NOTE(review): positional indexing assumes these are NumPy arrays -- confirm
# y_train is not a pandas object with a non-default index.
for train_index, test_index in kfold.split(X_train_std):
    X_training, X_testing = X_train_std[train_index], X_train_std[test_index]
    y_training, y_testing = y_train[train_index], y_train[test_index]

# Each report is printed as one pre-formatted string so the statement is
# valid, and prints the same text, under both Python 2 and Python 3.
df_kfold_acc = cross_val_score(svc_clf, X_train_std, y_train, cv=kfold, scoring='accuracy')
print('10 fold validation accuracy scores: \n%s' % (df_kfold_acc,))
print('Kfold mean accuracy score: \n%s' % (df_kfold_acc.mean(),))

df_kfold_auc = cross_val_score(svc_clf, X_train_std, y_train, cv=kfold, scoring='roc_auc')
print('\n\n 10 fold validation AUC scores:\n  %s' % (df_kfold_auc,))
print('Kfold mean AUC score: \n%s' % (df_kfold_auc.mean(),))

df_kfold_recall = cross_val_score(svc_clf, X_train_std, y_train, cv=kfold, scoring='recall')
print('\n\n 10 fold validation recall scores:\n%s' % (df_kfold_recall,))
print('Kfold mean recall score: \n%s' % (df_kfold_recall.mean(),))

Expected (and previously got) something like the following:

10 fold validation accuracy scores: {0.7982993, 0.6793838, etc (for 10 folds total)} Kfold mean accuracy score: 0.78679979

Actual error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-56-61c6420c7a2f> in <module>()
      6                   class_weight={1: 5}
      7                  )
----> 8 svc_clf.fit(X_train_std, y_train)
      9 
     10 # K-fold cross-validator

/Users/db/anaconda2/lib/python2.7/site-packages/sklearn/svm/base.pyc in fit(self, X, y, sample_weight)
    147         self._sparse = sparse and not callable(self.kernel)
    148 
--> 149         X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
    150         y = self._validate_targets(y)
    151 

/Users/db/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    576                         dtype=None)
    577     else:
--> 578         y = column_or_1d(y, warn=True)
    579         _assert_all_finite(y)
    580     if y_numeric and y.dtype.kind == 'O':

/Users/db/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.pyc in column_or_1d(y, warn)
    612         return np.ravel(y)
    613 
--> 614     raise ValueError("bad input shape {0}".format(shape))
    615 
    616 

ValueError: bad input shape (513, 10)
1
The error alternates between the one shown in the description above and the following error: `NameError: name 'Kfold' is not defined` – db2020

1 Answer

0
votes

The error is telling you that the data you are passing for training does not have the correct shape. `fit` expects X, the training vectors, as an {array-like, sparse matrix} of shape (n_samples, n_features), and y, the target vector relative to X, as an array-like of shape (n_samples,).

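You gave no information about the data you are using. A shape of (513, 10) is fine for X, but the traceback shows the error is raised while validating y (in `column_or_1d(y)`), so you should check the shape of the target vector. It must have the shape mentioned above, (n_samples,), which would be (513,) here. The following minimal example works: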

import numpy as np
from sklearn import svm
from sklearn.model_selection import KFold, cross_val_score

# Toy data: X_train has shape (4, 4) -> (n_samples, n_features) and
# y_train has shape (4,) -> (n_samples,), one label per row of X_train.
X_train = np.array([[1,1,1,1],[1,1,1,1],[0,0,0,0],[0,0,0,0]])
y_train = np.array([1,1,0,0])

svc_clf = svm.SVC(C=50, 
                  kernel='rbf', 
                  gamma=0.1,
                  probability=False,
                  class_weight={1: 5}
                 )

# K-fold cross-validator. With four samples and n_splits=4, every fold
# holds out exactly one sample.
kfold = KFold(n_splits=4, random_state=140311, shuffle=True)

# cross_val_score is handed the unfitted estimator, so no prior call to
# svc_clf.fit() is needed for scoring.
df_kfold_acc = cross_val_score(svc_clf, X_train, y_train, cv=kfold, scoring='accuracy')
print('4 fold validation accuracy scores: \n', (df_kfold_acc))
print('Kfold mean accuracy score: \n', (df_kfold_acc).mean())

Output :

4 fold validation accuracy scores: 
 [1. 1. 1. 1.]
Kfold mean accuracy score: 
 1.0