I'm using the DecisionTreeClassifier from scikit-learn to classify some data. I'm also using other algorithms, and to compare them I use the area under the precision-recall curve (AUPRC). The problem is that the precision-recall curve for the DecisionTreeClassifier comes out as a square, not the usual shape you would expect for this metric.
Here is how I am calculating the AUPRC for the DecisionTreeClassifier. I had some trouble calculating this because DecisionTreeClassifier does not have a decision_function() like other classifiers such as LogisticRegression do.
These are the results I got for the AUPRC of SVM, Logistic Regression, and DecisionTreeClassifier
Here is how I calculate the AUPRC for DecisionTreeClassifier
def execute(X_train, y_train, X_test, y_test):
    tree = DecisionTreeClassifier(class_weight='balanced')
    tree_y_score = tree.fit(X_train, y_train).predict(X_test)
    tree_ap_score = average_precision_score(y_test, tree_y_score)
    precision, recall, _ = precision_recall_curve(y_test, tree_y_score)
    values = {'ap_score': tree_ap_score, 'precision': precision, 'recall': recall}
    return values
Here is how I calculate the AUPRC for SVM:
def execute(X_train, y_train, X_test, y_test):
    svm = SVC(class_weight='balanced')
    svm.fit(X_train, y_train.values.ravel())
    svm_y_score = svm.decision_function(X_test)
    svm_ap_score = average_precision_score(y_test, svm_y_score)
    precision, recall, _ = precision_recall_curve(y_test, svm_y_score)
    values = {'ap_score': svm_ap_score, 'precision': precision, 'recall': recall}
    return values
Here is how I calculate the AUPRC for LogisticRegression:
def execute(X_train, y_train, X_test, y_test):
    lr = LogisticRegression(class_weight='balanced')
    lr.fit(X_train, y_train.values.ravel())
    lr_y_score = lr.decision_function(X_test)
    lr_ap_score = average_precision_score(y_test, lr_y_score)
    precision, recall, _ = precision_recall_curve(y_test, lr_y_score)
    values = {'ap_score': lr_ap_score, 'precision': precision, 'recall': recall}
    return values
I then call these methods and plot the results like this:
import LogReg_AP_Harness as lrApTest
import SVM_AP_Harness as svmApTest
import DecTree_AP_Harness as dtApTest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
def do_work(df):
    X = df.loc[:, df.columns != 'Class']
    y = df.loc[:, df.columns == 'Class']
    y_binarized = label_binarize(y, classes=[0, 1])
    n_classes = y_binarized.shape[1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)
    _, _, y_train_binarized, y_test_binarized = train_test_split(X, y_binarized, test_size=.3, random_state=0)

    print('Executing Logistic Regression')
    lr_values = lrApTest.execute(X_train, y_train, X_test, y_test)

    print('Executing Decision Tree')
    dt_values = dtApTest.execute(X_train, y_train_binarized, X_test, y_test_binarized)

    print('Executing SVM')
    svm_values = svmApTest.execute(X_train, y_train, X_test, y_test)

    plot_aupr_curves(lr_values, svm_values, dt_values)
def plot_aupr_curves(lr_values, svm_values, dt_values):
    lr_ap_score = lr_values['ap_score']
    lr_precision = lr_values['precision']
    lr_recall = lr_values['recall']

    svm_ap_score = svm_values['ap_score']
    svm_precision = svm_values['precision']
    svm_recall = svm_values['recall']

    dt_ap_score = dt_values['ap_score']
    dt_precision = dt_values['precision']
    dt_recall = dt_values['recall']

    plt.step(svm_recall, svm_precision, color='g', alpha=0.2, where='post')
    plt.fill_between(svm_recall, svm_precision, step='post', alpha=0.2, color='g')

    plt.step(lr_recall, lr_precision, color='b', alpha=0.2, where='post')
    plt.fill_between(lr_recall, lr_precision, step='post', alpha=0.2, color='b')

    plt.step(dt_recall, dt_precision, color='r', alpha=0.2, where='post')
    plt.fill_between(dt_recall, dt_precision, step='post', alpha=0.2, color='r')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('SVM (Green): Precision-Recall curve: AP={0:0.2f}'.format(svm_ap_score) + '\n' +
              'Logistic Regression (Blue): Precision-Recall curve: AP={0:0.2f}'.format(lr_ap_score) + '\n' +
              'Decision Tree (Red): Precision-Recall curve: AP={0:0.2f}'.format(dt_ap_score))
    plt.show()
In the do_work() method I had to binarize y because DecisionTreeClassifier does not have a decision_function(). I took the approach from here.
This is the plot:
I guess what it boils down to is that I'm calculating the AUPRC for DecisionTreeClassifier incorrectly.
For DecisionTreeClassifier, replace predict with predict_proba; the latter serves the same role as decision_function. Use the column for the positive class as the score.
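For example, the decision-tree harness could look roughly like this (a sketch, assuming a binary problem where the positive class is the second column returned by predict_proba):

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import average_precision_score, precision_recall_curve

def execute(X_train, y_train, X_test, y_test):
    tree = DecisionTreeClassifier(class_weight='balanced')
    tree.fit(X_train, y_train)
    # probability of the positive class plays the role of decision_function()
    tree_y_score = tree.predict_proba(X_test)[:, 1]
    tree_ap_score = average_precision_score(y_test, tree_y_score)
    precision, recall, _ = precision_recall_curve(y_test, tree_y_score)
    return {'ap_score': tree_ap_score, 'precision': precision, 'recall': recall}

With continuous scores, the precision-recall curve gets one point per distinct threshold instead of the single point that predict() yields, which is why the plot stops looking like a square.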
I generated several datasets, and using classifiers, I predicted the distribution of clusters. I need to draw boundaries between clusters on the chart. In the form of lines or in the form of filled areas - it does not matter. Please let me know if there is any way to do this.
My code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_moons, make_circles
from sklearn.model_selection import train_test_split
n_sample = 2000
def make_square(n_sample):
    X = np.random.sample((n_sample, 2))
    y = []
    for i in range(n_sample):
        if (X[i][0] > 0.5 and X[i][1] > 0.5) or (X[i][0] < 0.5 and X[i][1] < 0.5):
            y.append(1)
        else:
            y.append(0)
    return X, np.array(y)
datasets = [
make_circles(n_samples=n_sample, noise=0.09, factor=0.5),
make_square(n_sample),
make_moons(n_samples=n_sample, noise=0.12),
]
ks=[]
for data in datasets:
    X, y = data[0], data[1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)

    classifier = KNeighborsClassifier(n_neighbors=1)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    acc = classifier.score(X_test, y_test)

    accs = []
    for i in range(1, 8):
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(X_train, y_train)
        pred_i = knn.predict(X_test)
        acc0 = knn.score(X_test, y_test)
        accs.append(acc0)

    plt.figure(figsize=(12, 6))
    plt.plot(range(1, 8), accs, color='red', linestyle='dashed', marker='o',
             markerfacecolor='blue', markersize=10)
    plt.title('accs Score K Value')
    plt.xlabel('K Value')
    plt.ylabel('accs Score')
    print("Max Score:", max(accs), "k=", accs.index(max(accs)) + 1)
    ks.append(accs.index(max(accs)) + 1)
for i in range(3):
    data = datasets[i]
    k = ks[i]
    X, y = data[0], data[1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)

    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    plt.figure(figsize=(9, 9))
    plt.title("Test")
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test)

    plt.figure(figsize=(9, 9))
    plt.title("Predict")
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred)
Example output:
scikit-learn 1.1 introduced the DecisionBoundaryDisplay to assist with this sort of task.
Following the use of make_moons and the KNeighborsClassifier in the question, we can fit the classifier on the dataset, invoke the DecisionBoundaryDisplay.from_estimator() method, then scatter the X data on the returned axis:
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import DecisionBoundaryDisplay
X, y = make_moons(noise=0.2)
clf = KNeighborsClassifier().fit(X, y)
disp = DecisionBoundaryDisplay.from_estimator(clf, X, response_method="predict", alpha=0.3)
disp.ax_.scatter(X[:, 0], X[:, 1], c=y)
plt.show()
Resulting in something like this:
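If you want one boundary plot per dataset, the same call can go inside the existing per-dataset loop from the question; a sketch, reusing the classifier, X_test, and y_test names already defined there:

# after classifier.fit(X_train, y_train) inside the loop
disp = DecisionBoundaryDisplay.from_estimator(
    classifier, X_test, response_method="predict", alpha=0.3
)
disp.ax_.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
disp.ax_.set_title("Test set with decision boundary")
plt.show()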
Lasso, although it's a regression algorithm, can be used as a classifier. Therefore, there should be a way to make a ROC curve and find its AUC.
This is my code for making the model, scaling and standardizing it:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso

X = data.drop(['Response'], axis=1)
Y = data.Response

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

scaler = MinMaxScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', Lasso(normalize=True))
])

lasso_model = GridSearchCV(pipeline,
                           {'model__alpha': np.arange(0, 3, 0.05)},
                           cv=10,
                           scoring='roc_auc',
                           verbose=3,
                           n_jobs=-1,
                           error_score='raise')

lasso_model.fit(scaled_X_train, Y_train)
Now I'm trying to make the ROC curves, one for the training set and one for the test set:

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# Make a line for the random classification, AUC = 0.5:
r_probs = [0 for _ in range(len(Y_test))]
r_auc = roc_auc_score(Y_test, r_probs)
r_fpr, r_tpr, _ = roc_curve(Y_test, r_probs)

y_pred_proba = lasso_model.predict_proba(scaled_X_train)[::, 1]
fpr, tpr, _ = roc_curve(Y_train, y_pred_proba)
auc = roc_auc_score(Y_train, y_pred_proba)

# create ROC curve
plt.plot(r_fpr, r_tpr, linestyle='--', label='Random Prediction (AUROC = %0.3f)' % r_auc)
plt.plot(fpr, tpr, label="AUC=" + str(auc))
plt.title('Train set ROC')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.show()
I get the error that predict_proba does not exist for lasso regression, since it is not a classification algorithm. So how can I plot a ROC curve?
Lasso is basically a linear model with L1 regularization. You can enable the same kind of regularization for LogisticRegression() using penalty='l1', for the same effect (see this question, for example).
Alternatively, you could try to duct-tape a sigmoid function onto your Lasso() output, but that would be doing the same thing as above with much more effort.
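A minimal sketch of the first option, assuming the scaled_X_train/scaled_X_test and Y_train/Y_test splits from the question (the C grid is arbitrary):

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score

# L1-penalized logistic regression plays the role of a "classification Lasso";
# C is the inverse of the regularization strength.
logit_l1 = GridSearchCV(
    LogisticRegression(penalty='l1', solver='liblinear'),
    {'C': np.logspace(-2, 2, 20)},
    cv=10,
    scoring='roc_auc',
)
logit_l1.fit(scaled_X_train, Y_train)

# predict_proba exists here, so the ROC curve can be drawn as usual
y_score = logit_l1.predict_proba(scaled_X_test)[:, 1]
fpr, tpr, _ = roc_curve(Y_test, y_score)
plt.plot(fpr, tpr, label='AUC = %0.3f' % roc_auc_score(Y_test, y_score))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()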
I'd like to evaluate my machine learning model. I computed the area under the ROC curve with sklearn's roc_auc_score() function and plotted the ROC curve with its plot_roc_curve() function. The second function also computes the AUC and shows it in the plot. My problem is that I get two different AUC values.
Here's the reproducible code with sample dataset:
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
model = MLPClassifier(random_state=42)
model.fit(X_train, y_train)
yPred = model.predict(X_test)
print(roc_auc_score(y_test, yPred))
plot_roc_curve(model, X_test, y_test)
plt.show()
The roc_auc_score function gives me 0.979 and the plot shows 1.00.
Even though the second function takes the model as an argument and computes the predictions again, the outcome should not differ. It is not a round-off error: if I decrease the training iterations to get a worse predictor, the values still differ.
With my real dataset I "achieved" a difference of 0.1 between the two methods.
Where does this discrepancy come from?
You should pass the prediction probabilities to roc_auc_score, and not the predicted classes. Like this:
yPred_p = model.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, yPred_p))
# output: 0.9983354140657512
When you pass the predicted classes, this is actually the curve for which AUC is being calculated (which is wrong):
Code to regenerate:
from sklearn.metrics import roc_curve, auc
fpr, tpr, _ = roc_curve(y_test, yPred)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label='AUC = ' + str(round(roc_auc, 2)))
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
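As a side note, plot_roc_curve has been deprecated in recent scikit-learn releases in favour of RocCurveDisplay.from_estimator, which likewise pulls the probability scores from the model internally; a minimal sketch reusing model, X_test, and y_test from the question:

from sklearn.metrics import RocCurveDisplay

# draws the same curve as plot_roc_curve, using the model's probability scores
RocCurveDisplay.from_estimator(model, X_test, y_test)
plt.show()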
I wish to append the results to the list data for each of the models used; however, the function calc only ends up with the results from the last model. I am sure it is something really simple I am missing here!
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

classifiers = [LogisticRegression(solver='liblinear', penalty='l2', C=200),
               LogisticRegression(penalty='l2', C=1),
               DecisionTreeClassifier(),
               BernoulliNB()]

class_names = ['Logistic Regression', 'Logistic Regression Regularized',
               'CART', 'Naive Bayes (Bernoulli)']
# import some data to play with
iris = datasets.load_iris()
Xdata = iris.data[:, :2] # we only take the first two features
ydata = iris.target
def calc(classifier_names, classifier_models, Xdata, ydata):
    X_train, X_test, y_train, y_test = \
        train_test_split(Xdata, ydata, test_size=0.50, stratify=ydata,
                         random_state=42)
    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    data = []
    for name, clf in zip(classifier_names, classifier_models):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y_pred = clf.predict(X_test)
        ROC_AUC = plot_ROC_AUC(clf, X_test, y_test)
        Accuracy = metrics.accuracy_score(y_test, y_pred)
        Brier_Score = metrics.brier_score_loss(y_test, y_pred)
        data.append((ROC_AUC,
                     Accuracy,
                     Brier_Score))
        cols = ['ROC_AUC', 'Accuracy', 'Brier_Score']
        result = pd.DataFrame(data, columns=cols, index=classifier_names)
        return result
output = calc(class_names, classifiers, Xdata, ydata)
output
ROC_AUC Accuracy Brier_Score
Logistic Regression 0.925517 0.855072 0.144928
Logistic Regression Regularized 0.925517 0.855072 0.144928
CART 0.925517 0.855072 0.144928
Naive Bayes (Bernoulli) 0.925517 0.855072 0.144928
# I want these values to change for each model

# plot_ROC_AUC is the helper function used inside calc:
def plot_ROC_AUC(fit_model, X_test, y_test):
    probs = fit_model.predict_proba(X_test)
    preds = probs[:, 1]
    fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
    roc_auc = metrics.auc(fpr, tpr)

    # plot ROC
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

    return roc_auc
I'm uncertain about the specifics of what you're attempting, but I see an issue here:
def calc(classifier_names, classifier_models, X, y):
    X_train, X_test, y_train, y_test = \
        train_test_split(Xdata, ydata, test_size=0.50, stratify=ydata,
                         random_state=42)
    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    data = []
    for name, clf in zip(classifier_names, classifier_models):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y_pred = clf.predict(X_test)
        ROC_AUC = plot_ROC_AUC(clf, X_test, y_test)
        Accuracy = metrics.accuracy_score(y_test, y_pred)
        Brier_Score = metrics.brier_score_loss(y_test, y_pred)
        data.append((ROC_AUC,
                     Accuracy,
                     Brier_Score))
        cols = ['ROC_AUC', 'Accuracy', 'Brier_Score']
        result = pd.DataFrame(data, columns=cols, index=classifier_names)
        return result
or simplified:
def func(something, darkside):
    for i in range(some_int):
        return some_other_func(i)
This loop will only go through one step, because the return statement breaks out of the function.
I think what you should do is aggregate the results of the for loop in some DataFrame and then return the aggregate. At this point I could say it's an indentation issue, but looking higher I see you overwrite result on each loop too, so I would start there.
Maybe move the loop outside the function, and do this instead:
def func(something, darkside):
    return some_expression_of(something, darkside)

for name, clf in zip(classifier_names, classifier_models):
    func(name, clf)
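A sketch of the first suggestion, keeping the names from the question: accumulate one tuple per model inside the loop and build the DataFrame only once, after the loop has finished (plot_ROC_AUC is assumed to be the helper defined in the question):

import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def calc(classifier_names, classifier_models, Xdata, ydata):
    X_train, X_test, y_train, y_test = train_test_split(
        Xdata, ydata, test_size=0.50, stratify=ydata, random_state=42)
    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    data = []
    for name, clf in zip(classifier_names, classifier_models):
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        data.append((plot_ROC_AUC(clf, X_test, y_test),
                     metrics.accuracy_score(y_test, y_pred),
                     metrics.brier_score_loss(y_test, y_pred)))

    # build the result once, after every model has been evaluated
    cols = ['ROC_AUC', 'Accuracy', 'Brier_Score']
    return pd.DataFrame(data, columns=cols, index=classifier_names)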
I need to determine how well different classification models predict values. In order to do this I need to plot a ROC curve, but I am struggling to develop an approach.
I included my entire Python code as well as the link to the dataset I used. It seems like a lot of code, but it is really quite simple. The main issue I find is that I have a 3x3 confusion matrix and don't know how to translate that into a ROC plot.
Any help is greatly appreciated.
Dataset:
https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
import numpy as np
#data = pd.read_csv('wineQualityReds.csv', usecols=lambda x: 'Unnamed' not in x,)
data = pd.read_csv('wineQualityWhites.csv', usecols=lambda x: 'Unnamed' not in x,)
# roc curve and auc score
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
def plot_roc_curve(fpr, tpr):
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()
bins = [1,4,6,10]
quality_labels = [0,1,2]
data['quality_categorial'] = pd.cut(data['quality'], bins = bins, labels = quality_labels, include_lowest = True)
display(data.head(n=2))
quality_raw = data['quality_categorial']
features_raw = data.drop(['quality', 'quality_categorial'], axis = 1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_raw, quality_raw, test_size = 0.2, random_state = 0)
from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score
def train_predict_evaluate(learner, sample_size, X_train, y_train, X_test, y_test):
    results = {}

    #start = time()
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    #end = time()
    #results['train_time'] = end - start

    #start = time()
    predictions_train = learner.predict(X_train[:300])
    predictions_test = learner.predict(X_test)
    #end = time()
    #results['pred_time'] = end - start

    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta=0.5, average='micro')
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5, average='micro')

    #####################
    #array = print(confusion_matrix(y_test, predictions_test))
    labels = ['Positives', 'Negatives']
    cm = confusion_matrix(y_test, predictions_test)
    print(cm)
    df_cm = pd.DataFrame(cm, columns=np.unique(y_test), index=np.unique(y_test))
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    plt.figure(figsize=(10, 7))
    sns.set(font_scale=1.4)  # for label size
    sns.heatmap(df_cm, cmap="Blues", annot=True, fmt='g', annot_kws={"size": 16})  # font size
    #######################

    print(predictions_test)

    #auc = roc_auc_score(y_test, probs)
    #print('AUC: %.2f' % auc)
    #fpr, tpr, thresholds = roc_curve(y_test, probs)
    #plot_roc_curve(fpr, tpr)

    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    return results
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
clf_A = GaussianNB()
clf_B = DecisionTreeClassifier(max_depth=None, random_state=None)
clf_C = RandomForestClassifier(max_depth=None, random_state=None)
samples_100 = len(y_train)
samples_10 = int(len(y_train)*10/100)
samples_1 = int(len(y_train)*1/100)
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = \
            train_predict_evaluate(clf, samples, X_train, y_train, X_test, y_test)

train_predict_evaluate(clf_C, samples_100, X_train, y_train, X_test, y_test)
You cannot calculate a ROC curve directly from a confusion matrix, because the ROC curve measures a classifier's performance across all possible threshold settings, whereas a confusion matrix reflects only a single threshold.
The following code works for me:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics

def plot_roc(model, X_test, y_test):
    # calculate the fpr and tpr for all thresholds of the classification
    probabilities = model.predict_proba(np.array(X_test))
    predictions = probabilities[:, 1]
    fpr, tpr, threshold = metrics.roc_curve(y_test, predictions)
    roc_auc = metrics.auc(fpr, tpr)

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
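The question has three quality classes, so a single binary curve is not enough; a common approach is to binarize the labels and draw one one-vs-rest ROC curve per class. A sketch, assuming a fitted classifier clf with predict_proba whose classes_ are [0, 1, 2], plus the X_test/y_test split from the question:

import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

classes = [0, 1, 2]
y_test_bin = label_binarize(y_test, classes=classes)  # shape (n_samples, 3)
y_score = clf.predict_proba(X_test)                   # one probability column per class

for i, cls in enumerate(classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    plt.plot(fpr, tpr, label='class %s vs rest (AUC = %0.2f)' % (cls, auc(fpr, tpr)))

plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('One-vs-rest ROC curves')
plt.legend()
plt.show()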