I ran sequential feature selection (mlxtend) to find the best features (by roc_auc scoring) to use in a KNN. However, when I take the selected features and run them back through sklearn's KNN with the same parameters, I get a very different roc_auc value (0.83 vs. 0.67).
Reading through the mlxtend documentation, it uses sklearn's roc_auc scoring, so I can't figure out why I am getting such different scores.
Mlxtend
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

knn = KNeighborsClassifier(n_neighbors=4)

sfs1 = SFS(knn,
           k_features=5,
           forward=True,
           floating=True,
           verbose=2,
           scoring='roc_auc',
           cv=5)
sfs1 = sfs1.fit(X, y)
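For reference, the chosen feature subset and the cross-validated score that mlxtend reports can be read off the fitted selector (attribute names per mlxtend's SequentialFeatureSelector; the last line assumes X is a NumPy array):

print(sfs1.k_feature_idx_)                    # indices of the 5 selected features
print(sfs1.k_score_)                          # mean cross-validated roc_auc for that subset
X_selected = X[:, list(sfs1.k_feature_idx_)]  # assumes X is a NumPy array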
Sklearn
I took this from an sklearn example: https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html
## KNN
import numpy as np
import matplotlib.pyplot as plt
from scipy import interp
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc

cv = StratifiedKFold(n_splits=5, shuffle=False)
classifier = KNeighborsClassifier(n_neighbors=4, weights='uniform')

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

i = 0
for train, test in cv.split(X, y):
    probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i += 1

plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('KNN ROC')
plt.legend(loc="lower right")
plt.show()
OS: 10.14.6
Python: 3.6.8.final.0
Sklearn: 0.21.3
mlxtend: 0.17.0
Related
While writing k-fold code for validation, I cannot call len() on the KFold split object, and I also tried list() but that failed as well. Additionally, processing has become very slow (over 4 hours) even though memory usage stays below 20%. Could anyone help me solve these problems?
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc
from scipy import interp

def plot_roc(X, y, clf_class, **kwargs):
    kf = KFold(n_splits=3, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    for i, (train_index, test_index) in enumerate(kf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X_test)
        fpr, tpr, thresholds = roc_curve(y[test_index], y_prob[test_index, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    mean_tpr /= len(kf.split(X, y))
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()

print("Support vector machines:")
print(plot_roc(X, y, SVC, probability=True))
print("Random forests:")
print(plot_roc(X, y, RF, n_estimators=18))
print("K-nearest-neighbors:")
print(plot_roc(X, y, KNN))
print("Gradient Boosting Classifier:")
print(plot_roc(X, y, GBC))
The error message is as follows, even though I have also tried enclosing kf.split(X, y) in list():
TypeError                                 Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_11212/2950209521.py in <module>
     39
     40 print ("Support vector machines:")
---> 41 print(plot_roc(X,y,SVC,probability=True))
     42
     43 print ("Random forests:")

~\AppData\Local\Temp/ipykernel_11212/2950209521.py in plot_roc(X, y, clf_class, **kwargs)
     23         roc_auc = auc(fpr, tpr)
     24         plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
---> 25     mean_tpr /= len(kf.split(X,y))
     26     mean_tpr[-1] = 1.0
     27     mean_auc = auc(mean_fpr, mean_tpr)

TypeError: object of type 'generator' has no len()
And if I change len(kf.split(X,y)) to len(list(kf.split(X,y))), the plot becomes very strange (see attached).
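For what it's worth, a minimal sketch of the generator behaviour the traceback points at (using sklearn's KFold on toy data):

import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20).reshape(10, 2)
y = np.arange(10)

kf = KFold(n_splits=3, shuffle=True, random_state=0)
print(kf.get_n_splits(X))           # 3 -- the number of folds, no len() needed
splits = kf.split(X, y)             # a generator: it has no len() and is exhausted after one pass
print(len(list(kf.split(X, y))))    # 3 -- materialising a fresh generator also works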
I'm using this code to oversample the original data with SMOTE and then train a random forest model with cross-validation.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

y = df.target
X = df.drop('target', axis=1)

imba_pipeline = make_pipeline(SMOTE(random_state=27, sampling_strategy=1.0),
                              RandomForestClassifier(n_estimators=200, random_state=42))

f1_score = cross_val_score(imba_pipeline, X, y, scoring='f1_weighted', cv=5)
roc_auc_score = cross_val_score(imba_pipeline, X, y, scoring='roc_auc', cv=5)
print("F1: %0.4f " % (f1_score.mean()))
print("ROC-AUC: %0.4f " % (roc_auc_score.mean()))
The output is:
F1: 0.9336
ROC-AUC: 0.6589
Now, my question is how to plot the ROC curve in this situation?
In the normal situation where we split the data into training and testing, I use this code:
y = df.target
X = df.drop('target', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=27)
sm = SMOTE(random_state=27, sampling_strategy=1.0)
X_train, y_train = sm.fit_sample(X_train, y_train)
smote_rf =RandomForestClassifier(n_estimators=200, random_state = 42).fit(X_train, y_train)
smote_pred_rf = smote_rf.predict_proba(X_test)[:,1]
false_positive_rate1, true_positive_rate1, threshold1 = roc_curve(y_test, smote_pred_rf)
print('roc_auc_score for Random Forest: ', roc_auc_score(y_test, smote_pred_rf))
# plot ROC
plt.figure()
auc_smote = auc(false_positive_rate1, true_positive_rate1)
plt.plot(false_positive_rate1, true_positive_rate1, color='red',lw = 1, label='SMOTE (auc= %0.5f)' % auc_smote)
plt.plot([0, 1], [0, 1], lw = 1, color='black', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Abalone Data Set (RF)', fontweight='bold')
plt.legend(loc="lower right")
plt.show()
First of all, I think you should run a single cross-validation instead of a new cross-validation for every metric you want. Running them separately wastes resources, and the metrics are then not measured on the same models.
For that, see the function cross_validate (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate)
Example:
>>> scores = cross_validate(lasso, X, y, cv=3,
...                         scoring=('r2', 'neg_mean_squared_error'),
...                         return_train_score=True)
>>> print(scores['test_neg_mean_squared_error'])
[-3635.5... -3573.3... -6114.7...]
>>> print(scores['train_r2'])
[0.28010158 0.39088426 0.22784852]
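Applied to the pipeline from the question, that might look roughly like this (a sketch, reusing the imba_pipeline, X and y defined in the question):

from sklearn.model_selection import cross_validate

scores = cross_validate(imba_pipeline, X, y, cv=5,
                        scoring=('f1_weighted', 'roc_auc'))
print("F1: %0.4f" % scores['test_f1_weighted'].mean())
print("ROC-AUC: %0.4f" % scores['test_roc_auc'].mean())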
Specifically for the ROC curve, you probably need to go into even more detail and grab the predictions from every round of your cross-validation.
There is an example on the sklearn website that shows one way to do that: https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html
Copy-pasted below:
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.metrics import auc
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import StratifiedKFold

# #############################################################################
# Data IO and generation

# Import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target
X, y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape

# Add noisy features
random_state = np.random.RandomState(0)
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# #############################################################################
# Classification and ROC analysis

# Run classifier with cross-validation and plot ROC curves
cv = StratifiedKFold(n_splits=6)
classifier = svm.SVC(kernel='linear', probability=True,
                     random_state=random_state)

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X[train], y[train])
    viz = plot_roc_curve(classifier, X[test], y[test],
                         name='ROC fold {}'.format(i),
                         alpha=0.3, lw=1, ax=ax)
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
        label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
        lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                label=r'$\pm$ 1 std. dev.')

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       title="Receiver operating characteristic example")
ax.legend(loc="lower right")
plt.show()
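To adapt that example to the SMOTE + random forest setup from the question, one way (a sketch, assuming X is a DataFrame, y a Series, and sklearn >= 0.22 for plot_roc_curve) is to fit the imbalanced-learn pipeline on each training fold and let plot_roc_curve draw the per-fold curves:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, plot_roc_curve
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

imba_pipeline = make_pipeline(SMOTE(random_state=27, sampling_strategy=1.0),
                              RandomForestClassifier(n_estimators=200, random_state=42))

cv = StratifiedKFold(n_splits=5)
tprs, aucs = [], []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X, y)):
    # SMOTE inside the pipeline is applied to the training fold only
    imba_pipeline.fit(X.iloc[train], y.iloc[train])
    viz = plot_roc_curve(imba_pipeline, X.iloc[test], y.iloc[test],
                         name='ROC fold {}'.format(i), alpha=0.3, lw=1, ax=ax)
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
ax.plot(mean_fpr, mean_tpr, color='b', lw=2,
        label='Mean ROC (AUC = %0.2f)' % auc(mean_fpr, mean_tpr))
ax.legend(loc="lower right")
plt.show()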
My code to plot the ROC curve:
true = y_true
pred = preds[:, 0]

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(2):
    fpr[i], tpr[i], _ = roc_curve(true, pred)
    roc_auc[i] = auc(fpr[i], tpr[i])

print(roc_auc_score(test, pred))

plt.figure()
plt.plot(fpr[1], tpr[1])
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.show()
However, I receive the error "ValueError: not enough values to unpack (expected 3, got 2)" in the for loop, at the fpr, tpr calculation. Any ideas what I am doing wrong? Thanks for your time.
Try calling roc_curve and auc through the sklearn.metrics namespace. The error suggests that the name roc_curve in your session no longer refers to sklearn.metrics.roc_curve (sklearn's version returns three values, so it has most likely been shadowed or overwritten), and the fully qualified calls avoid that:
from sklearn import metrics

true = y_true
pred = preds[:, 0]

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(2):
    fpr[i], tpr[i], _ = metrics.roc_curve(true, pred)
    roc_auc[i] = metrics.auc(fpr[i], tpr[i])

print(metrics.roc_auc_score(test, pred))

plt.figure()
plt.plot(fpr[1], tpr[1])
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.show()
I was trying to plot a ROC curve using the documentation provided by sklearn. My data is in a CSV file and looks like the screenshot below. It has two classes, 'Good' and 'Bad'.
screenshot of my CSV file
And my code looks like this:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
import sys

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

# Import some data to play with
df = pd.read_csv("E:\\autodesk\\TTI ROC curve.csv")
X = df[['TTI','Max TemperatureF','Mean TemperatureF','Min TemperatureF',' Min Humidity']].values
y = df['TTI_Category'].as_matrix()

# Binarize the output
y = label_binarize(y, classes=['Good','Bad'])
n_classes = y.shape[1]

# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                    random_state=0)

# Learn to predict each class against the other
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                         random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

plt.figure()
lw = 2
plt.plot(fpr[2], tpr[2], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
If I run this code, the system tells me random_state is not defined, so I changed it to random_state=true. Then the system told me:
plt.plot(fpr[2], tpr[2], color='darkorange', KeyError: 2 <matplotlib.figure.Figure at 0xd8bff60>
If I print out n_classes, the system tells me it's 1, while n_classes in the documentation example is 3. I'm not sure if that's where the problem is. Does anyone have an answer to this traceback?
It looks like you don't fully understand how your data is structured and how your code should work.
LabelBinarizer returns a one-vs-all encoding, meaning that for two classes you will get the following mapping: ['good', 'bad', 'good'] -> [[1], [0], [1]], so n_classes = 1.
Why would you expect it to be 3 if you have 2 classes?
Simply change plt.plot(fpr[2], tpr[2], color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2]) to plt.plot(fpr[0], tpr[0], color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[0]) and you should be good.
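To see this, a quick illustrative check of the shapes label_binarize produces:

from sklearn.preprocessing import label_binarize

y2 = label_binarize(['Good', 'Bad', 'Good', 'Bad'], classes=['Good', 'Bad'])
print(y2.shape)   # (4, 1): two classes collapse into a single column, so n_classes == 1

y3 = label_binarize(['a', 'b', 'c', 'b'], classes=['a', 'b', 'c'])
print(y3.shape)   # (4, 3): one column per class once there are more than two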
Just look in the tpr and fpr dictionaries and you will see that you don't have a tpr[2] or fpr[2]. n_classes = y.shape[1] shows how many columns the binarized labels have (here 1, since label_binarize collapses two classes into a single column), which means the only key in your tpr and fpr dictionaries is 0.
You are overcomplicating things by using a multi-class approach when you only have 2 classes (binary classification). I think you are using this tutorial.
I would advise replacing the following:
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

plt.figure()
lw = 2
plt.plot(fpr[2], tpr[2], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
With something like:
fpr, tpr, _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc = auc(fpr, tpr)
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
I've recently been struggling with sklearn for my project.
I wanted to build a classifier and classify my data into six groups. The total sample size was 88, which I split into train (66) and test (22).
I did exactly as the sklearn documentation showed; here is my code:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.metrics import roc_curve, auc

clf = OneVsRestClassifier(QDA())
QDA_score = clf.fit(train, label).decision_function(test)

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(3):
    fpr[i], tpr[i], _ = roc_curve(label_test[:, i], QDA_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

from itertools import cycle
import matplotlib.pyplot as plt

plt.figure()
lw = 2
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color, n in zip(range(3), colors, ['_000', '_15_30_45', '60']):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of {0} (area = {1:0.2f})'
                   ''.format(n, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC for multi-classes')
plt.legend(loc="lower right")
plt.show()
The linked image shows my result.
However, every time I run the code the result changes. I'm wondering if there is any way I can combine this with cross-validation and compute an average, stable ROC for each class.
Thanks!
You can use cross_val_predict to first get the cross-validated probabilities and then plot the ROC curve for each class.
Example using Iris data
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

iris = datasets.load_iris()
X = iris.data
y = iris.target

# Binarize the output
y_bin = label_binarize(y, classes=[0, 1, 2])
n_classes = y_bin.shape[1]

clf = OneVsRestClassifier(QDA())

# Cross-validated probabilities for every sample
y_score = cross_val_predict(clf, X, y, cv=10, method='predict_proba')

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

lw = 2
colors = cycle(['blue', 'red', 'green'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
                   ''.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for multi-class data')
plt.legend(loc="lower right")
plt.show()
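If you also want a single cross-validated summary number alongside the per-class curves, roc_auc_score accepts the binarized labels and the score matrix directly (macro average shown; this reuses y_bin and y_score from above):

from sklearn.metrics import roc_auc_score

macro_auc = roc_auc_score(y_bin, y_score, average='macro')
print("Macro-averaged AUC: %0.3f" % macro_auc)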
To get the ROC for each fold, do this:
import numpy as np
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

iris = datasets.load_iris()
X = iris.data
y = iris.target
X, y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape

# Add noisy features
random_state = np.random.RandomState(0)
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# Classification and ROC analysis
# Run classifier with cross-validation and plot ROC curves
cv = StratifiedKFold(n_splits=6)
classifier = svm.SVC(kernel='linear', probability=True,
                     random_state=random_state)

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

i = 0
for train, test in cv.split(X, y):
    probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i += 1

plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Luck', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
It is hard to tell without more details of the data and the complexity of the problem you are trying to solve, but irregular learning performance like yours can indicate that your dataset is too small relative to the irregularity and complexity of the data, so that every time you sample you get a noticeably different training set.
A common technique for stabilizing test-vs-train results that you could also look into is k-fold cross-validation.
UPDATE:
K-fold cross-validation basically slices the data into k parts, runs the learning process k times, and averages the results; each time, a different part of the data is the test set and the remaining k-1 parts form the training set.
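As a minimal sketch of what that looks like in code (assuming the full feature matrix X and labels y, before the train/test split, and the OneVsRestClassifier(QDA()) classifier from the question):

from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

clf = OneVsRestClassifier(QDA())
# 5-fold CV: 5 fits, each scored on the held-out fifth of the data
scores = cross_val_score(clf, X, y, cv=5)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()))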