I'm using this code to oversample the original data with SMOTE and then train a random forest model with cross-validation.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline  # imblearn's pipeline, so SMOTE can be a step
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

y = df.target
X = df.drop('target', axis=1)

# SMOTE is applied only to each fold's training split inside the pipeline
imba_pipeline = make_pipeline(SMOTE(random_state=27, sampling_strategy=1.0),
                              RandomForestClassifier(n_estimators=200, random_state=42))
# named f1_scores/roc_auc_scores so they don't shadow sklearn.metrics functions
f1_scores = cross_val_score(imba_pipeline, X, y, scoring='f1_weighted', cv=5)
roc_auc_scores = cross_val_score(imba_pipeline, X, y, scoring='roc_auc', cv=5)
print("F1: %0.4f" % f1_scores.mean())
print("ROC-AUC: %0.4f" % roc_auc_scores.mean())
The output is:
F1: 0.9336
ROC-AUC: 0.6589
Now, my question is: how can I plot the ROC curve in this situation?
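One idea I had is to get out-of-fold predicted probabilities with cross_val_predict and build a single ROC curve from them. Here is a minimal sketch of that idea (assuming the same imba_pipeline, X, and y as above; I'm not sure it is the right approach):

import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import cross_val_predict

# out-of-fold probability of the positive class for every sample
y_proba = cross_val_predict(imba_pipeline, X, y, cv=5, method='predict_proba')[:, 1]

fpr, tpr, thresholds = roc_curve(y, y_proba)
plt.plot(fpr, tpr, color='red', lw=1, label='SMOTE CV (auc=%0.5f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], lw=1, color='black', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()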
In the normal situation, where we split the data into training and test sets, I use this code:
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

y = df.target
X = df.drop('target', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=27)

# oversample the training set only, so the test set stays untouched
sm = SMOTE(random_state=27, sampling_strategy=1.0)
X_train, y_train = sm.fit_resample(X_train, y_train)  # fit_sample was removed in newer imblearn

smote_rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
smote_pred_rf = smote_rf.predict_proba(X_test)[:, 1]  # probability of the positive class

false_positive_rate1, true_positive_rate1, threshold1 = roc_curve(y_test, smote_pred_rf)
print('roc_auc_score for Random Forest: ', roc_auc_score(y_test, smote_pred_rf))

# plot ROC
plt.figure()
auc_smote = auc(false_positive_rate1, true_positive_rate1)
plt.plot(false_positive_rate1, true_positive_rate1, color='red', lw=1, label='SMOTE (auc=%0.5f)' % auc_smote)
plt.plot([0, 1], [0, 1], lw=1, color='black', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Abalone Data Set (RF)', fontweight='bold')
plt.legend(loc="lower right")
plt.show()
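Going back to the cross-validated case, the other idea I had was to loop over the folds myself with StratifiedKFold and draw one ROC curve per fold. A rough sketch (again assuming the imba_pipeline, X, and y from the first snippet, with X and y as pandas objects):

import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
plt.figure()
for i, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    # the pipeline applies SMOTE to this fold's training part only
    imba_pipeline.fit(X.iloc[train_idx], y.iloc[train_idx])
    proba = imba_pipeline.predict_proba(X.iloc[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y.iloc[test_idx], proba)
    plt.plot(fpr, tpr, lw=1, label='fold %d (auc=%0.3f)' % (i + 1, auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], lw=1, color='black', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

Is one of these the right way to do it, or is there a standard approach I'm missing?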