Related
I need to determine how well different classification models predict values. In order to do this i need to plot an ROC curve but i am struggling to develop an approach.
I included my entire python code as well as the link to the dataset i used. It seems like a lot of code but is really simple actually. The main issue i find, is that i have a 3x3 confusion matrix and dont know how to translate that into a ROC plot.
Any help is greatly appreciated.
Dataset:
https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
import numpy as np
#data = pd.read_csv('wineQualityReds.csv', usecols=lambda x: 'Unnamed' not in x,)
data = pd.read_csv('wineQualityWhites.csv', usecols=lambda x: 'Unnamed' not in x,)
# roc curve and auc score
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
def plot_roc_curve(fpr, tpr):
plt.plot(fpr, tpr, color='orange', label='ROC')
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
bins = [1,4,6,10]
quality_labels = [0,1,2]
data['quality_categorial'] = pd.cut(data['quality'], bins = bins, labels = quality_labels, include_lowest = True)
display(data.head(n=2))
quality_raw = data['quality_categorial']
features_raw = data.drop(['quality', 'quality_categorial'], axis = 1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_raw, quality_raw, test_size = 0.2, random_state = 0)
from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score
def train_predict_evaluate(learner, sample_size, X_train, y_train, X_test, y_test):
results = {}
#start = time()
learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
#end = time()
#results['train_time'] = end - start
#start = time()
predictions_train = learner.predict(X_train[:300])
predictions_test = learner.predict(X_test)
#end = time()
#results['pred_time'] = end - start
results['acc_train'] = accuracy_score(y_train[:300], predictions_train)
results['acc_test'] = accuracy_score(y_test, predictions_test)
results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta = 0.5, average = 'micro')
results['f_test'] = fbeta_score(y_test, predictions_test, beta = 0.5, average = 'micro')
#####################
#array = print(confusion_matrix(y_test, predictions_test))
labels = ['Positives','Negatives']
cm = confusion_matrix(y_test, predictions_test)
print(cm)
df_cm = pd.DataFrame(cm, columns=np.unique(y_test), index = np.unique(y_test))
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
plt.figure(figsize = (10,7))
sns.set(font_scale=1.4)#for label size
sns.heatmap(df_cm, cmap="Blues", annot=True, fmt = 'g',annot_kws={"size": 16})# font size
#######################
print(predictions_test)
#auc = roc_auc_score(y_test, probs)
#print('AUC: %.2f' % auc)
#fpr, tpr, thresholds = roc_curve(y_test, probs)
#plot_roc_curve(fpr, tpr)
print("{} trained on {} samples." .format(learner.__class__.__name__, sample_size))
return results
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
clf_A = GaussianNB()
clf_B = DecisionTreeClassifier(max_depth=None, random_state=None)
clf_C = RandomForestClassifier(max_depth=None, random_state=None)
samples_100 = len(y_train)
samples_10 = int(len(y_train)*10/100)
samples_1 = int(len(y_train)*1/100)
results = {}
for clf in [clf_A,clf_B,clf_C]:
clf_name = clf.__class__.__name__
results[clf_name] = {}
for i, samples in enumerate([samples_1, samples_10, samples_100]):
results[clf_name][i] = \
train_predict_evaluate(clf, samples, X_train, y_train, X_test, y_test)
train_predict_evaluate(clf_C, samples_100, X_train, y_train, X_test, y_test)
You cannot directly calculate RoC curve from confusion matrix because AUC - ROC curve is a performance measurement for classification problem at various thresholds settings.
The following code works for me:
def plot_roc(model, X_test, y_test):
# calculate the fpr and tpr for all thresholds of the classification
probabilities = model.predict_proba(np.array(X_test))
predictions = probabilities[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, predictions)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
I am currently trying to implement an ROC Curve for my kNN classification algorithm. I am aware that an ROC Curve is a plot of True Positive Rate vs False Positive Rate, I am just struggling with finding those values from my dataset. I import 'autoimmune.csv' into my python script and run the kNN algorithm on it to output an accuracy value. Scikit-learn.org documentation shows that to generate the TPR and FPR I need to pass in values of y_test and y_scores as shown below:
fpr, tpr, threshold = roc_curve(y_test, y_scores)
I am just struggling with what I should be using as these values.
Thanks for your help in advance and apologies if there is something I have missed as it is my first post here.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv('./autoimmune.csv')
X = data.drop(columns=['autoimmune'])
y = data['autoimmune'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train,y_train)
knn.predict(X_test)[0:10]
knn.score(X_test,y_test)
print("Test set score: {:.4f}".format(knn.score(X_test, y_test)))
knn_cv = KNeighborsClassifier(n_neighbors=10)
cv_scores = cross_val_score(knn_cv, X, y, cv=10)
print(cv_scores)
print('cv_scores mean:{}' .format(np.mean(cv_scores)))
y_scores = cross_val_score(knn_cv, X, y, cv=76)
fpr, tpr, threshold = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve of kNN')
plt.show()
If you look at the documentation for roc_curve(), you will see the following regarding the y_score parameter:
y_score : array, shape = [n_samples] Target scores, can either be
probability estimates of the positive class, confidence values, or
non-thresholded measure of decisions (as returned by
“decision_function” on some classifiers).
You can get probability estimates using the predict_proba() method of the KNeighborsClassifier in sklearn. This returns a numpy array with two columns for a binary classification, one each for the negative and positive class. For the roc_curve() function you want to use probability estimates of the positive class, so you can replace your:
y_scores = cross_val_score(knn_cv, X, y, cv=76)
fpr, tpr, threshold = roc_curve(y_test, y_scores)
with:
y_scores = knn.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_scores[:, 1])
Notice how you need to take all the rows of the second column with [:, 1] to only select the probability estimates of the positive class. Here's a minimal reproducible example using the Wisconsin breast cancer dataset, since I don't have your autoimmune.csv:
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train,y_train)
y_scores = knn.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_scores[:, 1])
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve of kNN')
plt.show()
This produces the following ROC curve:
I'm recently struggling with using sklearn for my project.
I wanted to build a classifier and classify my data into six groups. the total sample size was 88 then I split the data into train(66) and test(22)
I did exactly as sklearn documentation showed, here is my code
from sklearn.multiclass import OneVsRestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
clf = OneVsRestClassifier(QDA())
QDA_score = clf.fit(train,label).decision_function(test)
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_curve
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(3):
fpr[i], tpr[i], _ = roc_curve(label_test[:, i], QDA_score[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
from itertools import cycle
import matplotlib.pyplot as plt
plt.figure()
lw = 2
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color,n in zip(range(3), colors,['_000','_15_30_45','60']):
plt.plot(fpr[i], tpr[i], color=color, lw=lw,
label='ROC curve of {0} (area = {1:0.2f})'
''.format(n , roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC for multi-classes')
plt.legend(loc="lower right")
plt.show()
the link is my result.
however every time I run the code the result changes. I'm wondering if there is anyway that I can combine this with Cross validation and compute average and stable ROC for each class
Thanks!
You can use cross_val_predict to first get the cross-validated probabilities and then plot the ROC curve for each class.
Example using Iris data
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Binarize the output
y_bin = label_binarize(y, classes=[0, 1, 2])
n_classes = y_bin.shape[1]
clf = OneVsRestClassifier(QDA())
y_score = cross_val_predict(clf, X, y, cv=10 ,method='predict_proba')
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
colors = cycle(['blue', 'red', 'green'])
for i, color in zip(range(n_classes), colors):
plt.plot(fpr[i], tpr[i], color=color, lw=lw,
label='ROC curve of class {0} (area = {1:0.2f})'
''.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for multi-class data')
plt.legend(loc="lower right")
plt.show()
To get the ROC for each Fold do this:
import numpy as np
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
iris = datasets.load_iris()
X = iris.data
y = iris.target
X, y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape
# Add noisy features
random_state = np.random.RandomState(0)
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
# Classification and ROC analysis
# Run classifier with cross-validation and plot ROC curves
cv = StratifiedKFold(n_splits=6)
classifier = svm.SVC(kernel='linear', probability=True,
random_state=random_state)
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
i = 0
for train, test in cv.split(X, y):
probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
# Compute ROC curve and area the curve
fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
tprs.append(interp(mean_fpr, fpr, tpr))
tprs[-1][0] = 0.0
roc_auc = auc(fpr, tpr)
aucs.append(roc_auc)
plt.plot(fpr, tpr, lw=1, alpha=0.3,
label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
label='Luck', alpha=.8)
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
lw=2, alpha=.8)
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
label=r'$\pm$ 1 std. dev.')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
It is hard to tell without more details of the data and the comlexity of the problem you are trying to solve, but irregular learning performance like yours could indicate that your dataset is too small for the irregularity and complexity of the data, so that every time you sample you get a train dataset which is different.
A common test vs train stabling technique you could also look into is k-fold cross validation.
UPDATE:
K-fold cross validation is basically slicing the data into k parts and then do the learning process k times and average their results, where each time a different part of the data is the test dataset and the rest k-1 parts are the train dataset.
I am building 2 models.
Model 1
modelgb = GradientBoostingClassifier()
modelgb.fit(x_train,y_train)
predsgb = modelgb.predict_proba(x_test)[:,1]
metrics.roc_auc_score(y_test,predsgb, average='macro', sample_weight=None)
Model 2
model = LogisticRegression()
model = model.fit(x_train,y_train)
predslog = model.predict_proba(x_test)[:,1]
metrics.roc_auc_score(y_test,predslog, average='macro', sample_weight=None)
How do i plot both the ROC curves in one plot , with a legend & text of AUC scores for each model ?
Try adapting this to your data:
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
plt.figure(0).clf()
pred = np.random.rand(1000)
label = np.random.randint(2, size=1000)
fpr, tpr, thresh = metrics.roc_curve(label, pred)
auc = metrics.roc_auc_score(label, pred)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
pred = np.random.rand(1000)
label = np.random.randint(2, size=1000)
fpr, tpr, thresh = metrics.roc_curve(label, pred)
auc = metrics.roc_auc_score(label, pred)
plt.plot(fpr,tpr,label="data 2, auc="+str(auc))
plt.legend(loc=0)
Just by adding the models to the list will plot multiple ROC curves in one plot. Hopefully this works for you!
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
plt.figure()
# Add the models to the list that you want to view on the ROC plot
models = [
{
'label': 'Logistic Regression',
'model': LogisticRegression(),
},
{
'label': 'Gradient Boosting',
'model': GradientBoostingClassifier(),
}
]
# Below for loop iterates through your models list
for m in models:
model = m['model'] # select the model
model.fit(x_train, y_train) # train the model
y_pred=model.predict(x_test) # predict the test data
# Compute False postive rate, and True positive rate
fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict_proba(x_test)[:,1])
# Calculate Area under the curve to display on the plot
auc = metrics.roc_auc_score(y_test,model.predict(x_test))
# Now, plot the computed values
plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (m['label'], auc))
# Custom settings for the plot
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1-Specificity(False Positive Rate)')
plt.ylabel('Sensitivity(True Positive Rate)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show() # Display
Something like this ...
#ROC Curve
from sklearn.metrics import roc_curve
y_pred_prob1 = classifier1.predict_proba(X_test)[:,1]
fpr1 , tpr1, thresholds1 = roc_curve(Y_test, y_pred_prob1)
y_pred_prob2 = classifier2.predict_proba(X_test)[:,1]
fpr2 , tpr2, thresholds2 = roc_curve(Y_test, y_pred_prob2)
y_pred_prob3 = classifier3.predict_proba(X_test)[:,1]
fpr3 , tpr3, thresholds3 = roc_curve(Y_test, y_pred_prob3)
y_pred_prob4 = classifier4.predict_proba(X_test)[:,1]
fpr4 , tpr4, thresholds4 = roc_curve(Y_test, y_pred_prob4)
plt.plot([0,1],[0,1], 'k--')
plt.plot(fpr1, tpr1, label= "Linear")
plt.plot(fpr2, tpr2, label= "Poly")
plt.plot(fpr3, tpr3, label= "RBF")
plt.plot(fpr4, tpr4, label= "Sigmoid")
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title('Receiver Operating Characteristic')
plt.show()
from sklearn.metrics import plot_roc_curve
fig = plot_roc_curve( clf, x_train_bow, y_train)
fig = plot_roc_curve( clf, x_test_bow, y_test, ax = fig.ax_)
fig.figure_.suptitle("ROC curve comparison")
plt.show()
Basically plot_roc_curve function plot the roc_curve for the classifier. So if we use plot_roc_curve two times without the specifying ax parameter it will plot two graphs. So here we store the first gragh in the figure variable and access its axis and provide to the next plot_roc_curve function, so that the plot appear of the axes of the first graph only.
from sklearn.metrics import plot_roc_curve
classifiers = [log_reg, decision_tree, decision_forest]
ax = plt.gca()
for i in classifiers:
plot_roc_curve(i, X_test, y_test, ax=ax)
I want to compare lasso with other classifiers in sklearn. I have a binary outcome vector y. I usually compute a vector probas that contains the predicted probability for each input point to have 1 as a phenotype and then generate a ROC curve fro these 2 vectors. But how to compute this probability for lasso classifier? There is no method predict_proba.
For other classifiers this code works:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn import datasets
from sklearn.cross_validation import LeaveOneOut
import pandas as pd
from sklearn import metrics
#loading a toy dataset
iris = datasets.load_iris()
X = iris.data
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
y = iris.target
X, y = X[y != 2], y[y != 2]
classifiers = [
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
LogisticRegression(),
]
classifierNames=[ "Random Forests", "Logistic Regression" ]
for clf in classifiers:
print (clf)
loo = LeaveOneOut(len(y))
probas=[]
for train, test in loo:
probas.append ( clf.fit(X[train], y[train]).predict_proba(X[test])[0][1])
#probas is a vector that contains the probability of getting phenotype 1
#Then we just need to use our auc roc function for plotting.
dfphenotypes=pd.DataFrame(y)
dfpredicted=pd.DataFrame(probas)
#probas contains the probability of getting phenotype 1
#then we just need to use our auc roc function.
roc_auc=metrics.roc_auc_score(dfphenotypes, dfpredicted)
fpr, tpr, thresholds=metrics.roc_curve(dfphenotypes, dfpredicted)
# Plot ROC curve
plt.plot(fpr, tpr, '--', label=classifierNames[i]+' (area = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--') # random predictions curve
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate or (1 - Specifity)')
plt.ylabel('True Positive Rate or (Sensitivity)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.figure(num=1, figsize=(30,40))
print("auc =", roc_auc)