I need to determine how well different classification models predict values. To do this I need to plot an ROC curve, but I am struggling to develop an approach.
I included my entire Python code as well as the link to the dataset I used. It looks like a lot of code but is actually quite simple. The main issue is that I have a 3x3 confusion matrix and don't know how to translate that into an ROC plot.
Any help is greatly appreciated.
Dataset:
https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
import numpy as np
#data = pd.read_csv('wineQualityReds.csv', usecols=lambda x: 'Unnamed' not in x,)
data = pd.read_csv('wineQualityWhites.csv', usecols=lambda x: 'Unnamed' not in x,)
# roc curve and auc score
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
def plot_roc_curve(fpr, tpr):
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()
bins = [1,4,6,10]
quality_labels = [0,1,2]
data['quality_categorial'] = pd.cut(data['quality'], bins = bins, labels = quality_labels, include_lowest = True)
display(data.head(n=2))
quality_raw = data['quality_categorial']
features_raw = data.drop(['quality', 'quality_categorial'], axis = 1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_raw, quality_raw, test_size = 0.2, random_state = 0)
from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score
def train_predict_evaluate(learner, sample_size, X_train, y_train, X_test, y_test):
    results = {}
    #start = time()
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    #end = time()
    #results['train_time'] = end - start
    #start = time()
    predictions_train = learner.predict(X_train[:300])
    predictions_test = learner.predict(X_test)
    #end = time()
    #results['pred_time'] = end - start
    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta = 0.5, average = 'micro')
    results['f_test'] = fbeta_score(y_test, predictions_test, beta = 0.5, average = 'micro')
    #####################
    #array = print(confusion_matrix(y_test, predictions_test))
    labels = ['Positives','Negatives']
    cm = confusion_matrix(y_test, predictions_test)
    print(cm)
    df_cm = pd.DataFrame(cm, columns=np.unique(y_test), index=np.unique(y_test))
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    plt.figure(figsize=(10, 7))
    sns.set(font_scale=1.4)  # for label size
    sns.heatmap(df_cm, cmap="Blues", annot=True, fmt='g', annot_kws={"size": 16})  # font size
    #######################
    print(predictions_test)
    #auc = roc_auc_score(y_test, probs)
    #print('AUC: %.2f' % auc)
    #fpr, tpr, thresholds = roc_curve(y_test, probs)
    #plot_roc_curve(fpr, tpr)
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
    return results
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
clf_A = GaussianNB()
clf_B = DecisionTreeClassifier(max_depth=None, random_state=None)
clf_C = RandomForestClassifier(max_depth=None, random_state=None)
samples_100 = len(y_train)
samples_10 = int(len(y_train)*10/100)
samples_1 = int(len(y_train)*1/100)
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = \
            train_predict_evaluate(clf, samples, X_train, y_train, X_test, y_test)
train_predict_evaluate(clf_C, samples_100, X_train, y_train, X_test, y_test)
You cannot directly calculate an ROC curve from a confusion matrix, because the ROC curve measures a classifier's performance across many threshold settings, whereas a confusion matrix corresponds to a single threshold.
The following code works for me:
from sklearn import metrics

def plot_roc(model, X_test, y_test):
    # calculate the fpr and tpr for all thresholds of the classification
    probabilities = model.predict_proba(np.array(X_test))
    predictions = probabilities[:, 1]
    fpr, tpr, threshold = metrics.roc_curve(y_test, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
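Note that the snippet above only covers the binary case (it takes column 1 of predict_proba). Since your target has three quality classes, one option is a one-vs-rest plot, one curve per class. The following is only a sketch under that assumption; the function name and the classes argument are illustrative, not from your code, and it assumes the class order matches model.classes_ (true for sorted 0/1/2 labels):

import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

def plot_multiclass_roc(model, X_test, y_test, classes=(0, 1, 2)):
    # One-vs-rest: binarize the true labels and score each class separately
    y_test_bin = label_binarize(y_test, classes=list(classes))
    probabilities = model.predict_proba(X_test)
    for i, cls in enumerate(classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], probabilities[:, i])
        plt.plot(fpr, tpr, label='class %s (AUC = %0.2f)' % (cls, auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('One-vs-rest ROC curves')
    plt.legend(loc='lower right')
    plt.show()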
Related
I am trying to plot the mean ROC curve of a support vector machine (SVM) model with a linear kernel over 10 runs. The code fits the SVM model to the training data, and generates the ROC curve and its corresponding area under the curve (AUC) for each run. The mean ROC curve is then computed using the mean false positive rate (mean_fpr) and mean true positive rate (mean_tpr) obtained from all 10 runs. However, the resulting plot does not start at the origin (0, 0), indicating that there is an issue with the computation of mean_fpr and mean_tpr.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
import seaborn as sns
# Load the breast cancer dataset bundled with scikit-learn
df = load_breast_cancer(as_frame=True)
# Split the data into features (X) and target (y)
X = df['data']
y = df['target']
from sklearn.metrics import roc_curve, auc
# Number of runs
#random.seed(321)
n_runs = 10
# Lists to store the results
aucs = []
tprs = []
fprs = []
for i in range(n_runs):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=None)
    svclassifier = SVC(kernel='linear', random_state=i)
    svclassifier.fit(X_train, y_train)
    y_pred = svclassifier.predict(X_test)
    y_score = svclassifier.decision_function(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    fprs.append(fpr)
    tprs.append(tpr)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
# Mean ROC curve
mean_fpr = np.unique(np.concatenate(fprs))
mean_tpr = np.unique(np.concatenate(tprs))
mean_tpr = np.zeros_like(mean_fpr)
for i in range(n_runs):
    mean_tpr += np.interp(mean_fpr, fprs[i], tprs[i])
mean_tpr /= n_runs
mean_auc = auc(mean_fpr, mean_tpr)
# Plot the mean ROC curve
sns.lineplot(x=mean_fpr, y=mean_tpr, ci=None, label='Mean ROC (AUC = %0.2f)' % mean_auc)
plt.xlim([-0.1, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
The problematic line of code is as follows:
for i in range(n_runs):
    mean_tpr += np.interp(mean_fpr, fprs[i], tprs[i])
Can anyone help me identify and fix the issue with the mean_fpr and mean_tpr values so that the resulting plot starts at (0, 0)?
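For reference, one common way to handle this (a sketch, not from the original post, reusing the fprs, tprs and auc already defined above) is to interpolate every run onto one fixed FPR grid and pin the curve's endpoints before averaging:

# Average the per-run ROC curves on a shared FPR grid
mean_fpr = np.linspace(0, 1, 100)
interp_tprs = []
for run_fpr, run_tpr in zip(fprs, tprs):
    interp_tpr = np.interp(mean_fpr, run_fpr, run_tpr)
    interp_tpr[0] = 0.0          # force each curve to start at (0, 0)
    interp_tprs.append(interp_tpr)
mean_tpr = np.mean(interp_tprs, axis=0)
mean_tpr[-1] = 1.0               # and to end at (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)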
I wish to append the results to the list data for each of the models utilised; however, the calc function only appends the results from the last model. I am sure it is something really simple I am missing here!
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import pandas as pd
classifiers = [LogisticRegression(solver='liblinear', penalty='l2', C=200),
               LogisticRegression(penalty='l2', C=1),
               DecisionTreeClassifier(),
               BernoulliNB()]
class_names = ['Logistic Regression', 'Logistic Regression Regularized',
               'CART', 'Naive Bayes (Bernoulli)']
# import some data to play with
iris = datasets.load_iris()
Xdata = iris.data[:, :2] # we only take the first two features
ydata = iris.target
def calc(classifier_names, classifier_models, Xdata, ydata):
    X_train, X_test, y_train, y_test = \
        train_test_split(Xdata, ydata, test_size=0.50, stratify=ydata,
                         random_state=42)
    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)
    data = []
    for name, clf in zip(classifier_names, classifier_models):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y_pred = clf.predict(X_test)
        ROC_AUC = plot_ROC_AUC(clf, X_test, y_test)
        Accuracy = metrics.accuracy_score(y_test, y_pred)
        Brier_Score = metrics.brier_score_loss(y_test, y_pred)
        data.append((ROC_AUC,
                     Accuracy,
                     Brier_Score))
        cols = ['ROC_AUC', 'Accuracy', 'Brier_Score']
        result = pd.DataFrame(data, columns=cols, index=classifier_names)
        return result
output = calc(class_names, classifiers, Xdata, ydata)
output
ROC_AUC Accuracy Brier_Score
Logistic Regression 0.925517 0.855072 0.144928
Logistic Regression Regularized 0.925517 0.855072 0.144928
CART 0.925517 0.855072 0.144928
Naive Bayes (Bernoulli) 0.925517 0.855072 0.144928
# I want the values above to differ for each model
# helper function used within the calc function
def plot_ROC_AUC(fit_model, X_test, y_test):
    probs = fit_model.predict_proba(X_test)
    preds = probs[:, 1]
    fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
    roc_auc = metrics.auc(fpr, tpr)
    # plot ROC
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    return roc_auc
I'm uncertain about the specifics of what you're attempting, but I see an issue here:
def calc(classifier_names, classifier_models, X, y):
    X_train, X_test, y_train, y_test = \
        train_test_split(Xdata, ydata, test_size=0.50, stratify=ydata,
                         random_state=42)
    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)
    data = []
    for name, clf in zip(classifier_names, classifier_models):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y_pred = clf.predict(X_test)
        ROC_AUC = plot_ROC_AUC(clf, X_test, y_test)
        Accuracy = metrics.accuracy_score(y_test, y_pred)
        Brier_Score = metrics.brier_score_loss(y_test, y_pred)
        data.append((ROC_AUC,
                     Accuracy,
                     Brier_Score))
        cols = ['ROC_AUC', 'Accuracy', 'Brier_Score']
        result = pd.DataFrame(data, columns=cols, index=classifier_names)
        return result
or simplified:
def func(something, darkside):
    for i in range(some_int):
        return some_other_func(i)

This loop will only run for one step, because the return statement breaks out of the function.
I think what you should do is aggregate the results of the for loop in some DataFrame and then return the aggregate. At this point I could call it an indentation issue, but looking higher up I see you also overwrite result on each iteration, so I would start there.
Maybe move the loop outside the function and do this instead:
def func(something, darkside):
    return some_expression_of(something, darkside)

for name, clf in zip(classifier_names, classifier_models):
    func(name, clf)
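For completeness, a minimal sketch of calc with the loop kept inside it (assuming the same imports, metric calls, and plot_ROC_AUC helper from the question) appends once per model and builds the DataFrame only after the loop has finished:

def calc(classifier_names, classifier_models, Xdata, ydata):
    X_train, X_test, y_train, y_test = train_test_split(
        Xdata, ydata, test_size=0.50, stratify=ydata, random_state=42)
    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)
    data = []
    for name, clf in zip(classifier_names, classifier_models):
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        data.append((plot_ROC_AUC(clf, X_test, y_test),        # per-model AUC
                     metrics.accuracy_score(y_test, y_pred),
                     metrics.brier_score_loss(y_test, y_pred)))
    # build the result once, outside the loop, so every model is kept
    cols = ['ROC_AUC', 'Accuracy', 'Brier_Score']
    return pd.DataFrame(data, columns=cols, index=classifier_names)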
I am currently trying to implement an ROC curve for my kNN classification algorithm. I am aware that an ROC curve is a plot of True Positive Rate vs False Positive Rate; I am just struggling to find those values from my dataset. I import 'autoimmune.csv' into my Python script and run the kNN algorithm on it to output an accuracy value. The scikit-learn documentation shows that to generate the TPR and FPR I need to pass in values of y_test and y_scores, as shown below:
fpr, tpr, threshold = roc_curve(y_test, y_scores)
I am just struggling with what I should be using as these values.
Thanks for your help in advance and apologies if there is something I have missed as it is my first post here.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv('./autoimmune.csv')
X = data.drop(columns=['autoimmune'])
y = data['autoimmune'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train,y_train)
knn.predict(X_test)[0:10]
knn.score(X_test,y_test)
print("Test set score: {:.4f}".format(knn.score(X_test, y_test)))
knn_cv = KNeighborsClassifier(n_neighbors=10)
cv_scores = cross_val_score(knn_cv, X, y, cv=10)
print(cv_scores)
print('cv_scores mean:{}' .format(np.mean(cv_scores)))
y_scores = cross_val_score(knn_cv, X, y, cv=76)
fpr, tpr, threshold = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)
print(roc_auc)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve of kNN')
plt.show()
If you look at the documentation for roc_curve(), you will see the following regarding the y_score parameter:
y_score : array, shape = [n_samples] Target scores, can either be
probability estimates of the positive class, confidence values, or
non-thresholded measure of decisions (as returned by
“decision_function” on some classifiers).
You can get probability estimates using the predict_proba() method of the KNeighborsClassifier in sklearn. This returns a numpy array with two columns for a binary classification, one each for the negative and positive class. For the roc_curve() function you want to use probability estimates of the positive class, so you can replace your:
y_scores = cross_val_score(knn_cv, X, y, cv=76)
fpr, tpr, threshold = roc_curve(y_test, y_scores)
with:
y_scores = knn.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_scores[:, 1])
Notice how you need to take all the rows of the second column with [:, 1] to only select the probability estimates of the positive class. Here's a minimal reproducible example using the Wisconsin breast cancer dataset, since I don't have your autoimmune.csv:
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train,y_train)
y_scores = knn.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_scores[:, 1])
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve of kNN')
plt.show()
This produces the following ROC curve:
I was trying to plot an ROC curve with classifiers other than svm.SVC, which is the one used in the documentation example. My code works fine for svm.SVC; however, after I switched to KNeighborsClassifier, MultinomialNB, and DecisionTreeClassifier, the system keeps telling me check_consistent_length(y_true, y_score) and Found input variables with inconsistent numbers of samples: [26632, 53264]. My CSV file looks like this
And here is my code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
import sys
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
# Import some data to play with
df = pd.read_csv("E:\\autodesk\\Hourly and weather categorized2.csv")
X =df[['TTI','Max TemperatureF','Mean TemperatureF','Min TemperatureF',' Min Humidity']].values
y = df['TTI_Category'].as_matrix()
y=y.reshape(-1,1)
# Binarize the output
y = label_binarize(y, classes=['Good','Bad'])
n_classes = y.shape[1]
# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
random_state=0)
# Learn to predict each class against the other
classifier = OneVsRestClassifier(DecisionTreeClassifier(random_state=0))
y_score = classifier.fit(X_train, y_train).predict_proba(X_test)
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
plt.figure()
lw = 1
plt.plot(fpr[0], tpr[0], color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[0])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
I suspect the error occurs at these lines: fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) and roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]), but I'm a beginner with ROC curves, so could someone kindly guide me through this traceback? Thanks a lot for your time and help. Here is another question regarding ROC curves from me.
By the way, here is the whole traceback. Hopefully my explanation is clear enough.
Traceback (most recent call last):
File "<ipython-input-1-16eb0db9d4d9>", line 1, in <module>
runfile('C:/Users/Think/Desktop/Python Practice/ROC with decision tree.py', wdir='C:/Users/Think/Desktop/Python Practice')
File "C:\Users\Think\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 880, in runfile
execfile(filename, namespace)
File "C:\Users\Think\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 87, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "C:/Users/Think/Desktop/Python Practice/ROC with decision tree.py", line 47, in <module>
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
File "C:\Users\Think\Anaconda2\lib\site-packages\sklearn\metrics\ranking.py", line 510, in roc_curve
y_true, y_score, pos_label=pos_label, sample_weight=sample_weight)
File "C:\Users\Think\Anaconda2\lib\site-packages\sklearn\metrics\ranking.py", line 302, in _binary_clf_curve
check_consistent_length(y_true, y_score)
File "C:\Users\Think\Anaconda2\lib\site-packages\sklearn\utils\validation.py", line 173, in check_consistent_length
" samples: %r" % [int(l) for l in lengths])
ValueError: Found input variables with inconsistent numbers of samples: [26632, 53264]
You need to use the predict_proba function of the DecisionTreeClassifier:
Example:
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Binarize the output
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)
classifier = OneVsRestClassifier(DecisionTreeClassifier(random_state=0))
y_score = classifier.fit(X_train, y_train).predict_proba(X_test)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
lw = 2
colors = cycle(['blue', 'red', 'green'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for multi-class data')
plt.legend(loc="lower right")
plt.show()
The problem was solved by adding this line to the original code: y_resampled = label_binarize(y_resampled, classes=['Good','Bad','Ok']). With only two classes listed, label_binarize returns a single column, so y_test.ravel() ends up with half as many entries as y_score.ravel() (26632 vs. 53264); binarizing over all three classes makes the shapes consistent.
There is a key difference in all these implementations that is being ignored. Tree-based algorithms in sklearn inherently interpret one-hot encoded (binarized) target labels as a multi-label problem. To get AUC and an ROC curve for a multi-class problem, you must binarize the outputs for the ROC calculation only. By default there is no need to wrap any of the algorithms listed as inherently multi-class in OneVsRestClassifier. For algorithms which aren't inherently multi-class, it makes sense to use an OVR classifier, or to avoid complex decision functions in the case of SVM. Please refer to the code snippets below: the first is the same code used in the example above; the second is the correct implementation, which trains a multi-class classifier and then computes the ROC for each individual class. Check the difference in the plots.
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Binarize the output
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)
classifier = OneVsRestClassifier(DecisionTreeClassifier(random_state=0))
y_score = classifier.fit(X_train, y_train).predict_proba(X_test)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
lw = 2
colors = cycle(['blue', 'red', 'green'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for multi-class data')
plt.legend(loc="lower right")
plt.show()
import matplotlib.pyplot as plt
from sklearn import datasets
from itertools import cycle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Split on the original labels; binarize only for the ROC computation below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)
classifier = DecisionTreeClassifier(random_state=0)
y_score = classifier.fit(X_train, y_train).predict_proba(X_test)
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
n_classes = y_test_bin.shape[1]
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
colors = cycle(['blue', 'red', 'green'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for multi-class data')
plt.legend(loc="lower right")
plt.show()
I'm using the DecisionTreeClassifier from scikit-learn to classify some data. I'm also using other algorithms and to compare them I use the area under the precision-recall metric. The problem is the shape of the AUPRC for the DecisionTreeClassifier is a square and not the usual shape you would expect for this metric.
Here is how I am calculating the AUPRC for the DecisionTreeClassifier. I had some trouble calculating this because DecisionTreeClassifier does not have a decision_function() as other classifiers like LogisticRegression do.
These are the results I got for the AUPRC of SVM, Logistic Regression, and DecisionTreeClassifier
Here is how I calculate the AUPRC for DecisionTreeClassifier
def execute(X_train, y_train, X_test, y_test):
    tree = DecisionTreeClassifier(class_weight='balanced')
    tree_y_score = tree.fit(X_train, y_train).predict(X_test)
    tree_ap_score = average_precision_score(y_test, tree_y_score)
    precision, recall, _ = precision_recall_curve(y_test, tree_y_score)
    values = {'ap_score': tree_ap_score, 'precision': precision, 'recall': recall}
    return values
Here is how I calculate the AUPRC for SVM:
def execute(X_train, y_train, X_test, y_test):
    svm = SVC(class_weight='balanced')
    svm.fit(X_train, y_train.values.ravel())
    svm_y_score = svm.decision_function(X_test)
    svm_ap_score = average_precision_score(y_test, svm_y_score)
    precision, recall, _ = precision_recall_curve(y_test, svm_y_score)
    values = {'ap_score': svm_ap_score, 'precision': precision, 'recall': recall}
    return values
Here is how I calculate the AUPRC for LogisticRegression:
def execute(X_train, y_train, X_test, y_test):
    lr = LogisticRegression(class_weight='balanced')
    lr.fit(X_train, y_train.values.ravel())
    lr_y_score = lr.decision_function(X_test)
    lr_ap_score = average_precision_score(y_test, lr_y_score)
    precision, recall, _ = precision_recall_curve(y_test, lr_y_score)
    values = {'ap_score': lr_ap_score, 'precision': precision, 'recall': recall}
    return values
I then call these methods and plot the results like this:
import LogReg_AP_Harness as lrApTest
import SVM_AP_Harness as svmApTest
import DecTree_AP_Harness as dtApTest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
def do_work(df):
    X = df.ix[:, df.columns != 'Class']
    y = df.ix[:, df.columns == 'Class']
    y_binarized = label_binarize(y, classes=[0, 1])
    n_classes = y_binarized.shape[1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)
    _, _, y_train_binarized, y_test_binarized = train_test_split(X, y_binarized, test_size=.3, random_state=0)
    print('Executing Logistic Regression')
    lr_values = lrApTest.execute(X_train, y_train, X_test, y_test)
    print('Executing Decision Tree')
    dt_values = dtApTest.execute(X_train, y_train_binarized, X_test, y_test_binarized)
    print('Executing SVM')
    svm_values = svmApTest.execute(X_train, y_train, X_test, y_test)
    plot_aupr_curves(lr_values, svm_values, dt_values)
def plot_aupr_curves(lr_values, svm_values, dt_values):
    lr_ap_score = lr_values['ap_score']
    lr_precision = lr_values['precision']
    lr_recall = lr_values['recall']
    svm_ap_score = svm_values['ap_score']
    svm_precision = svm_values['precision']
    svm_recall = svm_values['recall']
    dt_ap_score = dt_values['ap_score']
    dt_precision = dt_values['precision']
    dt_recall = dt_values['recall']
    plt.step(svm_recall, svm_precision, color='g', alpha=0.2, where='post')
    plt.fill_between(svm_recall, svm_precision, step='post', alpha=0.2, color='g')
    plt.step(lr_recall, lr_precision, color='b', alpha=0.2, where='post')
    plt.fill_between(lr_recall, lr_precision, step='post', alpha=0.2, color='b')
    plt.step(dt_recall, dt_precision, color='r', alpha=0.2, where='post')
    plt.fill_between(dt_recall, dt_precision, step='post', alpha=0.2, color='r')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('SVM (Green): Precision-Recall curve: AP={0:0.2f}'.format(svm_ap_score) + '\n' +
              'Logistic Regression (Blue): Precision-Recall curve: AP={0:0.2f}'.format(lr_ap_score) + '\n' +
              'Decision Tree (Red): Precision-Recall curve: AP={0:0.2f}'.format(dt_ap_score))
    plt.show()
In the do_work() method I had to binarize y because DecisionTreeClassifier does not have a decision_function(). I took that approach from here.
This is the plot:
I guess what it boils down to is that I'm calculating the AUPRC for DecisionTreeClassifier incorrectly.
For DecisionTreeClassifier, replace predict with predict_proba; the latter serves the same role as decision_function.
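Applied to the execute() function from the question, a minimal sketch of that change (taking the second column of predict_proba as the score for the positive class, and leaving the surrounding metric calls as they were) could look like this:

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import average_precision_score, precision_recall_curve

def execute(X_train, y_train, X_test, y_test):
    tree = DecisionTreeClassifier(class_weight='balanced')
    # use class probabilities instead of hard 0/1 predictions as the score
    tree_y_score = tree.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    tree_ap_score = average_precision_score(y_test, tree_y_score)
    precision, recall, _ = precision_recall_curve(y_test, tree_y_score)
    return {'ap_score': tree_ap_score, 'precision': precision, 'recall': recall}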