What's the difference between confusion matrix accuracy and test accuracy? - python

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(img_array, img_labels,
shuffle=True, stratify=img_labels,
test_size=0.1, random_state=42)
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss',factor=0.2,patience=3,verbose=1,min_delta=0.0001)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
hist = model.fit(x_train, y_train, epochs =50, batch_size=64,callbacks = [checkpoint,reduce_lr], validation_data=(x_test, y_test))
plt.subplot(1, 2, 1)
plt.ylabel('Loss', fontsize=16)
plt.plot(hist.history['loss'], color='b', label='Training Loss')
plt.plot(hist.history['val_loss'], color='r', label='Validation Loss')
plt.legend(loc='upper right')
plt.subplot(1, 2, 2)
plt.ylabel('Accuracy', fontsize=16)
plt.plot(hist.history['accuracy'], color='b', label='Training Accuracy')
plt.plot(hist.history['val_accuracy'], color='r', label='Validation Accuracy')
plt.legend(loc='lower right')
plt.show()
Output
y_pred=model.predict(x_test)
y_pred=np.argmax(y_pred, axis=1)
y_test=np.argmax(y_test, axis=1)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm
# Find the FP, FN, TP, TN values per class
FP = cm.sum(axis=0) - np.diag(cm)
FN = cm.sum(axis=1) - np.diag(cm)
TP = np.diag(cm)
TN = cm.sum() - (FP + FN + TP)
FP = FP.astype(float)
FN = FN.astype(float)
TP = TP.astype(float)
TN = TN.astype(float)
# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP)
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)
# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
ACC*100
Output: array([87.21092226, 98.74616885, 85.81777654, 89.69072165, 80.46809696, 93.56366676, 83.75592087])
The accuracy reported by the deep learning model is around 60%, while the accuracy calculated from the confusion matrix averages around 80%-85% per class. What explains this difference? Do the two accuracies mean different things?
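For comparison, a minimal sketch (using the cm, TP, TN, FP, FN, y_test and y_pred computed above) that puts the single overall accuracy next to the per-class one-vs-rest values:
import numpy as np
from sklearn.metrics import accuracy_score
# Overall multiclass accuracy: correctly classified samples on the diagonal of cm
overall_acc = np.trace(cm) / cm.sum()
print(overall_acc, accuracy_score(y_test, y_pred))  # both give the same single number
# Per-class one-vs-rest accuracy, as in the array printed above
per_class_acc = (TP + TN) / (TP + FP + FN + TN)
print(per_class_acc, per_class_acc.mean())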

Related

How to plot the ROC curve for an ANN with 10-fold cross-validation in Keras using Python?

I was trying to plot the ROC curve for each of the 10 experiments in 10-fold cross-validation for an ANN in Keras. I have been stuck on it for a week and cannot find a solution. Could anyone help with this? I tried the code from the following link (https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html) from sklearn and wanted to use a wrapper to use the Keras model in sklearn, but it shows errors. My code in Python:
## Creating NN in Keras
# Load libraries
import numpy as np
from keras import models
from keras import layers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
# Set random seed
np.random.seed(7)
#Create Function That Constructs Neural Network
# Create function returning a compiled network
def create_network():
    # Start neural network
    network = models.Sequential()
    # Add fully connected layer with a ReLU activation function
    network.add(layers.Dense(units=25, activation='relu', input_shape=(X.shape[1],)))
    # Add fully connected layer with a ReLU activation function
    network.add(layers.Dense(units=X.shape[1], activation='relu'))
    # Add fully connected layer with a sigmoid activation function
    network.add(layers.Dense(units=1, activation='sigmoid'))
    # Compile neural network
    network.compile(loss='binary_crossentropy', # Cross-entropy
                    optimizer='adam',           # Adam optimizer
                    metrics=['accuracy'])       # Accuracy performance metric
    # Return compiled network
    return network
###
#Wrap Function In KerasClassifier
# Wrap Keras model so it can be used by scikit-learn
neural_network = KerasClassifier(build_fn=create_network,
epochs=150,
batch_size=10,
verbose=0)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.metrics import auc
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import StratifiedKFold
n_samples, n_features = X.shape
# Add noisy features
random_state = np.random.RandomState(0)
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
# #############################################################################
# Classification and ROC analysis
# Run classifier with cross-validation and plot ROC curves
cv = StratifiedKFold(n_splits=10)
classifier = neural_network
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X[train], y[train])
    viz = plot_roc_curve(classifier, X[test], y[test],
                         name='ROC fold {}'.format(i),
                         alpha=0.3, lw=1, ax=ax)
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
label='Chance', alpha=.8)
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
lw=2, alpha=.8)
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
label=r'$\pm$ 1 std. dev.')
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
title="Receiver operating characteristic example")
ax.legend(loc="lower right")
plt.show()
It shows the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-29-f10078491154> in <module>()
40 viz = plot_roc_curve(classifier, X[test], y[test],
41 name='ROC fold {}'.format(i),
---> 42 alpha=0.3, lw=1, ax=ax)
43 interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
44 interp_tpr[0] = 0.0
/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_plot/roc_curve.py in plot_roc_curve(estimator, X, y, sample_weight, drop_intermediate, response_method, name, ax, **kwargs)
170 )
171 if not is_classifier(estimator):
--> 172 raise ValueError(classification_error)
173
174 prediction_method = _check_classifer_response_method(estimator,
ValueError: KerasClassifier should be a binary classifier
I had the same question. I found this link very informative:
https://www.kaggle.com/kanncaa1/roc-curve-with-k-fold-cv. I have modified it for my case as below:
import numpy as np
import matplotlib.pyplot as plt
from numpy import interp
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.constraints import maxnorm
from keras.optimizers import SGD

seed = 7
np.random.seed(seed)
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
i = 1
fig, ax = plt.subplots()
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
# for i, (train, test) in enumerate(cv.split(X_13 , target)):
for train, test in kfold.split(X_train, y_train):
    # create model
    model = Sequential()
    model.add(Dense(100, input_dim=X_train.shape[1], activation='relu', kernel_constraint=maxnorm(3)))
    model.add(Dropout(0.2))
    model.add(Dense(80, activation='relu', kernel_constraint=maxnorm(3)))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    # compile model
    sgd = SGD(lr=0.1, momentum=0.8)
    model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
    model.fit(X_train[train], y_train[train], epochs=100, batch_size=15, verbose=0)
    # evaluate the model
    y_pred_keras = model.predict_proba(X_train[test]).ravel()
    fpr, tpr, thresholds = roc_curve(y_train[test], y_pred_keras)
    tprs.append(interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i = i + 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue',
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc), lw=2, alpha=1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.show()
Hope it could help!
I have just answered what seems to be a copy of this post (apart from variable names) here.
I am not sure whether it is an exact duplicate, since the question comes from a different account, but it looks like it. Here is a copy of my answer in case one of these is closed as a duplicate.
This is an implementational detail that is (probably) missing in this wrapper library.
Sklearn simply checks whether an attribute called _estimator_type is present on the estimator and is set to string value classifier. You can see that by looking into sklearn's source code on github.
def is_classifier(estimator):
    """Return True if the given estimator is (probably) a classifier.

    Parameters
    ----------
    estimator : object
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is a classifier and False otherwise.
    """
    return getattr(estimator, "_estimator_type", None) == "classifier"
All you need to do is to add this attribute to your classifier object manually.
classifier = KerasClassifier(build_fn=create_network,
epochs=10,
batch_size=100,
verbose=2)
classifier._estimator_type = "classifier"
I have tested it and it works.

How can I test real-time predictions in a neural network? [closed]

As I'm new here, let me ask a fairly common question. I wrote an MLP neural network model using deep learning. I'm using a standard dataset which I downloaded here. From a statistical point of view, my accuracy and F1-score look great. Now I need to test this program with real-time data, and I would be glad to hear your suggestions on how I can perform real-time predictions with a neural network.
import pandas as pd
from pandas import DataFrame
from numpy import *
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,auc
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
########################################################################################
db = pd.read_csv(r"C:\Users\cert 3\Desktop\Vasou\proposal\code\StackOverFlow\UDP-Flood-CSV.csv")
X = db.iloc[:, 0:4]
y = db.iloc[:, 4]
m, n = X.shape
MG = X
X = preprocessing.scale(X)
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)
y = to_categorical(encoded_y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
########################################################################################
mlp = MLPClassifier()
parameter_space = {'hidden_layer_sizes': [(8,12,4), (5,5,2), (4,4,4)],
'activation': ['tanh', 'relu'],
'solver': ['sgd', 'adam'],
'alpha': [0.001,0.01, 0.05, 0.1],
'learning_rate': ['constant','adaptive'],
'max_iter':[20,50,100]
}
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3,return_train_score=True)
clf.fit(X_train, y_train)
print('Best parameters found:\n', clf.best_params_, clf.best_score_)
#######################################################################################
cvr = clf.cv_results_
df = DataFrame(cvr)
scores = df['mean_test_score']
h = df['param_hidden_layer_sizes']
alpha = df['param_alpha']
optim = df['param_solver']
l_rate = df['param_learning_rate']
activ = df['param_activation']
itr = df['param_max_iter']
dh = DataFrame({'Scores': scores, 'Itraction':itr, 'Hidden_Layers': h, 'alpha': alpha ,
'Solver':optim, 'Learning_Rate':l_rate, 'Activation':activ})
########################################################################################
model = Sequential()
model.add(Dense(8, input_dim=n, kernel_initializer='uniform', activation='tanh'))
model.add(Dense(12, activation='tanh'))
model.add(Dense(4, activation='tanh'))
model.add(Dense(2, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
########################################################################################
hist = model.fit(X_train, y_train, batch_size = 10, epochs = 100, validation_split=0.5)
scoress = model.evaluate(X, y, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scoress[1]*100))
print(hist.history)
# save model and architecture to single file
model.save("model.h5")
model.save_weights("model_weight.h5")
print("saved model to disk")
# Plot training & validation accuracy values
plt.plot(hist.history['acc'])
plt.plot(hist.history['val_acc'])
plt.title('Training vs Test accuracy , DA')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Training acc', 'Validation acc'], loc='best')
#plt.show()
#plt.figure()
a = plt.savefig('Accuracy.png', dpi=300, bbox_inches='tight')
plt.close(a)
# Plot training & validation loss values
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.title('Training vs Test Loss , DA')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Training loss', 'Validation loss'], loc='best')
#plt.show()
#plt.figure()
b = plt.savefig('Loss.png', dpi=300, bbox_inches='tight')
plt.close(b)
##########################################################################################
y_score = model.predict(X_test)
org = zeros((y_test.shape[0]))
prd = zeros((y_score.shape[0]))
def decode(datum):
    return np.argmax(datum)
for i in range(y_score.shape[0]):
    prd[i] = decode(y_score[i])
for j in range(y_test.shape[0]):
    org[j] = decode(y_test[j])
confusion_matrix(org,prd)
print("Accuracy of MLP: ", "\n", confusion_matrix(org,prd))
f = open("output.txt", "a")
print('Accuracy Score : ' + str(accuracy_score(org,prd)), file=f)
f.close()
##########################################################################################
def generate_results(y_test, y_score):
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curve')
    #plt.show()
    plt.savefig('False and True comparison.png', dpi=300, bbox_inches='tight')
    print('AUC: %f' % roc_auc)
print('Generating results')
generate_results(y_test[:, 0], y_score[:, 0])
And this is my Python code.
If you want to use this code on demand, you can feed it through standard input from the shell, for example: python script.py < your_streamer.
Finally, you must choose a special character for the end of each packet so you can tell when a whole packet has been captured.
In Python, input() is a good choice, with \n as the separator.
script.py
while True:
    X = np.array(input().split(','), dtype=float)
    y = model.predict([X])
    print(X, y)
file.txt as streamer.
0.218,0.7451,0.7451,0.574
0.215,0.8854,0.7451,0.745
0.275,0.5744,0.7451,0.574
0.751,0.5744,0.2150,0.885
...
...
...
$ python script.py < file.txt

Python for loop only appending last result to list/DataFrame (example with iris dataset)

I wish to append the results to the list data for each of the models utilised; however, the function calc only appends the results from the last model. I am sure it is something really simple I am missing here!
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
classifiers =[LogisticRegression(solver='liblinear', penalty='l2', C=200),
LogisticRegression(penalty='l2', C=1),
DecisionTreeClassifier(),
BernoulliNB()]
class_names = ['Logistic Regression', 'Logistic Regression Regularized',
               'CART', 'Naive Bayes (Bernoulli)']
# import some data to play with
iris = datasets.load_iris()
Xdata = iris.data[:, :2] # we only take the first two features
ydata = iris.target
def calc(classifier_names, classifier_models, Xdata, ydata):
    X_train, X_test, y_train, y_test = \
        train_test_split(Xdata, ydata, test_size=0.50, stratify=ydata,
                         random_state=42)
    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)
    data = []
    for name, clf in zip(classifier_names, classifier_models):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y_pred = clf.predict(X_test)
        ROC_AUC = plot_ROC_AUC(clf, X_test, y_test)
        Accuracy = metrics.accuracy_score(y_test, y_pred)
        Brier_Score = metrics.brier_score_loss(y_test, y_pred)
        data.append((ROC_AUC,
                     Accuracy,
                     Brier_Score))
        cols = ['ROC_AUC', 'Accuracy', 'Brier_Score']
        result = pd.DataFrame(data, columns=cols, index=classifier_names)
        return result
output = calc(class_names, classifiers, Xdata, ydata)
output
ROC_AUC Accuracy Brier_Score
Logistic Regression 0.925517 0.855072 0.144928
Logistic Regression Regularized 0.925517 0.855072 0.144928
CART 0.925517 0.855072 0.144928
Naive Bayes (Bernoulli) 0.925517 0.855072 0.144928
#want this to change here
#function within the calc function
def plot_ROC_AUC(fit_model, X_test, y_test):
    probs = fit_model.predict_proba(X_test)
    preds = probs[:, 1]
    fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
    roc_auc = metrics.auc(fpr, tpr)
    # plot ROC
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    return roc_auc
I'm uncertain about the specifics of what you're attempting, but I see an issue here:
def calc(classifier_names, classifier_models, X, y):
    X_train, X_test, y_train, y_test = \
        train_test_split(Xdata, ydata, test_size=0.50, stratify=ydata,
                         random_state=42)
    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)
    data = []
    for name, clf in zip(classifier_names, classifier_models):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y_pred = clf.predict(X_test)
        ROC_AUC = plot_ROC_AUC(clf, X_test, y_test)
        Accuracy = metrics.accuracy_score(y_test, y_pred)
        Brier_Score = metrics.brier_score_loss(y_test, y_pred)
        data.append((ROC_AUC,
                     Accuracy,
                     Brier_Score))
        cols = ['ROC_AUC', 'Accuracy', 'Brier_Score']
        result = pd.DataFrame(data, columns=cols, index=classifier_names)
        return result
or simplified:
def func(something, darkside):
    for i in range(some_int):
        return some_other_func(i)
this loop will only go through one step, as the return statement will break out of the function.
I think what you should attempt to do is aggregate the results of the for loop in some DataFrame and then return the aggregate. At this point I could say it's an indentation issue, but looking higher I see you overwrite result on each loop too, so I would start there.
Maybe move the loop outside the function, and do this instead:
def func(something, darkside):
    return some_expression_of(something, darkside)

for name, clf in zip(classifier_names, classifier_models):
    func(name, clf)
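For completeness, here is a minimal, self-contained sketch of that loop-aggregation pattern on the iris data, simplified to accuracy only so it runs as-is (the ROC_AUC and Brier_Score columns from the question can be added back inside the same append): each model's row is appended inside the loop, and the DataFrame is built and returned only after the loop finishes.
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics

def calc(classifier_names, classifier_models, Xdata, ydata):
    X_train, X_test, y_train, y_test = train_test_split(
        Xdata, ydata, test_size=0.50, stratify=ydata, random_state=42)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    data = []
    for name, clf in zip(classifier_names, classifier_models):
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        data.append((metrics.accuracy_score(y_test, y_pred),))  # one row per model, inside the loop
    # build and return the aggregate only once, after the loop has seen every model
    return pd.DataFrame(data, columns=['Accuracy'], index=classifier_names)

iris = datasets.load_iris()
models = [LogisticRegression(solver='liblinear', C=200), LogisticRegression(C=1),
          DecisionTreeClassifier(), BernoulliNB()]
names = ['Logistic Regression', 'Logistic Regression Regularized', 'CART', 'Naive Bayes (Bernoulli)']
print(calc(names, models, iris.data[:, :2], iris.target))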

SVM and Random Forest with recall = 0

I am trying to predict one of two values which can appear in the column 'exit'. I have clean data (about 20 columns and 4k rows containing typical information about customers like 'sex', 'age' ...). In the training dataset about 20% of customers were qualified as '1'. I made two models, SVM and random forest, but both predict mostly '0' for the test dataset (almost every time). The recall of both models is 0.
I attached the code where I think I could have made some stupid mistake. Any ideas why recall is so low despite accuracies around 80%?
import sklearn
import pandas as pd
from scipy import stats
from scipy.stats import randint as sp_randint
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, roc_auc_score, confusion_matrix

def ml_model():
    print('sklearn: %s' % sklearn.__version__)
    df = pd.read_csv('clean_data.csv')
    df.head()
    feat = df.drop(columns=['target'], axis=1)
    label = df["target"]
    x_train, x_test, y_train, y_test = train_test_split(feat, label, test_size=0.3)
    sc_x = StandardScaler()
    x_train = sc_x.fit_transform(x_train)
    # SVC method
    support_vector_classifier = SVC(probability=True)
    # Grid search
    rand_list = {"C": stats.uniform(0.1, 10),
                 "gamma": stats.uniform(0.1, 1)}
    auc = make_scorer(roc_auc_score)
    rand_search_svc = RandomizedSearchCV(support_vector_classifier, param_distributions=rand_list,
                                         n_iter=100, n_jobs=4, cv=3, random_state=42, scoring=auc)
    rand_search_svc.fit(x_train, y_train)
    support_vector_classifier = rand_search_svc.best_estimator_
    cross_val_svc = cross_val_score(estimator=support_vector_classifier, X=x_train, y=y_train, cv=10, n_jobs=-1)
    print("Cross Validation Accuracy for SVM: ", round(cross_val_svc.mean() * 100, 2), "%")
    predicted_y = support_vector_classifier.predict(x_test)
    tn, fp, fn, tp = confusion_matrix(y_test, predicted_y).ravel()
    precision_score = tp / (tp + fp)
    recall_score = tp / (tp + fn)
    print("Recall score SVC: ", recall_score)
    # Random forest
    random_forest_classifier = RandomForestClassifier()
    # Grid search
    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 11),
                  "min_samples_split": sp_randint(2, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}
    rand_search_rf = RandomizedSearchCV(random_forest_classifier, param_distributions=param_dist,
                                        n_iter=100, cv=5, iid=False)
    rand_search_rf.fit(x_train, y_train)
    random_forest_classifier = rand_search_rf.best_estimator_
    cross_val_rfc = cross_val_score(estimator=random_forest_classifier, X=x_train, y=y_train, cv=10, n_jobs=-1)
    print("Cross Validation Accuracy for RF: ", round(cross_val_rfc.mean() * 100, 2), "%")
    predicted_y = random_forest_classifier.predict(x_test)
    tn, fp, fn, tp = confusion_matrix(y_test, predicted_y).ravel()
    precision_score = tp / (tp + fp)
    recall_score = tp / (tp + fn)
    print("Recall score RF: ", recall_score)
    new_data = pd.read_csv('new_data.csv')
    new_data = cleaning_data_to_predict(new_data)
    if round(cross_val_svc.mean() * 100, 2) > round(cross_val_rfc.mean() * 100, 2):
        predictions = support_vector_classifier.predict(new_data)
        predictions_proba = support_vector_classifier.predict_proba(new_data)
    else:
        predictions = random_forest_classifier.predict(new_data)
        predictions_proba = random_forest_classifier.predict_proba(new_data)
    f = open("output.txt", "w+")
    for i in range(len(predictions.tolist())):
        print("id: ", i, "probability: ", predictions_proba.tolist()[i][1], "exit: ", predictions.tolist()[i], file=open("output.txt", "a"))
If I have not missed it, you forgot to scale your test set.
So, you need to scale it as well. Note that you should just transform it, do not fit it again. See below.
x_test = sc_x.transform(x_test)
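In context, a minimal sketch of where that line belongs (assuming the same sc_x scaler and variables as in the question):
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)  # fit the scaler on the training data only
x_test = sc_x.transform(x_test)        # apply the already-fitted scaler to the test data
predicted_y = support_vector_classifier.predict(x_test)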
I agree with @e_kapti. Also check the formulas for recall and accuracy; you might consider using the F1 score instead (https://en.wikipedia.org/wiki/F1_score).
Recall = TP / (TP + FN), Accuracy = (TP + TN) / (TP + TN + FP + FN), with TP, FP, TN, FN being the number of true positives, false positives, true negatives and false negatives, respectively.
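For reference, a minimal sketch (assuming the same y_test and predicted_y arrays as in the question) that computes these metrics with scikit-learn instead of by hand:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("Recall:   ", recall_score(y_test, predicted_y))     # TP / (TP + FN)
print("Precision:", precision_score(y_test, predicted_y))  # TP / (TP + FP)
print("F1 score: ", f1_score(y_test, predicted_y))         # harmonic mean of precision and recall
print("Accuracy: ", accuracy_score(y_test, predicted_y))   # (TP + TN) / (TP + TN + FP + FN)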

Area under the precision-recall curve for DecisionTreeClassifier is a square

I'm using the DecisionTreeClassifier from scikit-learn to classify some data. I'm also using other algorithms, and to compare them I use the area under the precision-recall curve (AUPRC). The problem is that the shape of the AUPRC for the DecisionTreeClassifier is a square, and not the usual shape you would expect for this metric.
Here is how I am calculating the AUPRC for the DecisionTreeClassifier. I had some trouble calculating this because the DecisionTreeClassifier does not have a decision_function() as other classifiers like LogisticRegression do.
These are the results I got for the AUPRC of SVM, Logistic Regression, and DecisionTreeClassifier.
Here is how I calculate the AUPRC for DecisionTreeClassifier:
def execute(X_train, y_train, X_test, y_test):
    tree = DecisionTreeClassifier(class_weight='balanced')
    tree_y_score = tree.fit(X_train, y_train).predict(X_test)
    tree_ap_score = average_precision_score(y_test, tree_y_score)
    precision, recall, _ = precision_recall_curve(y_test, tree_y_score)
    values = {'ap_score': tree_ap_score, 'precision': precision, 'recall': recall}
    return values
Here is how I calculate the AUPRC for SVM:
def execute(X_train, y_train, X_test, y_test):
    svm = SVC(class_weight='balanced')
    svm.fit(X_train, y_train.values.ravel())
    svm_y_score = svm.decision_function(X_test)
    svm_ap_score = average_precision_score(y_test, svm_y_score)
    precision, recall, _ = precision_recall_curve(y_test, svm_y_score)
    values = {'ap_score': svm_ap_score, 'precision': precision, 'recall': recall}
    return values
Here is how I calculate the AUPRC for LogisticRegression:
def execute(X_train, y_train, X_test, y_test):
    lr = LogisticRegression(class_weight='balanced')
    lr.fit(X_train, y_train.values.ravel())
    lr_y_score = lr.decision_function(X_test)
    lr_ap_score = average_precision_score(y_test, lr_y_score)
    precision, recall, _ = precision_recall_curve(y_test, lr_y_score)
    values = {'ap_score': lr_ap_score, 'precision': precision, 'recall': recall}
    return values
I then call these methods and plot the results like this:
import LogReg_AP_Harness as lrApTest
import SVM_AP_Harness as svmApTest
import DecTree_AP_Harness as dtApTest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
def do_work(df):
    X = df.ix[:, df.columns != 'Class']
    y = df.ix[:, df.columns == 'Class']
    y_binarized = label_binarize(y, classes=[0, 1])
    n_classes = y_binarized.shape[1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)
    _, _, y_train_binarized, y_test_binarized = train_test_split(X, y_binarized, test_size=.3, random_state=0)
    print('Executing Logistic Regression')
    lr_values = lrApTest.execute(X_train, y_train, X_test, y_test)
    print('Executing Decision Tree')
    dt_values = dtApTest.execute(X_train, y_train_binarized, X_test, y_test_binarized)
    print('Executing SVM')
    svm_values = svmApTest.execute(X_train, y_train, X_test, y_test)
    plot_aupr_curves(lr_values, svm_values, dt_values)
def plot_aupr_curves(lr_values, svm_values, dt_values):
    lr_ap_score = lr_values['ap_score']
    lr_precision = lr_values['precision']
    lr_recall = lr_values['recall']
    svm_ap_score = svm_values['ap_score']
    svm_precision = svm_values['precision']
    svm_recall = svm_values['recall']
    dt_ap_score = dt_values['ap_score']
    dt_precision = dt_values['precision']
    dt_recall = dt_values['recall']
    plt.step(svm_recall, svm_precision, color='g', alpha=0.2, where='post')
    plt.fill_between(svm_recall, svm_precision, step='post', alpha=0.2, color='g')
    plt.step(lr_recall, lr_precision, color='b', alpha=0.2, where='post')
    plt.fill_between(lr_recall, lr_precision, step='post', alpha=0.2, color='b')
    plt.step(dt_recall, dt_precision, color='r', alpha=0.2, where='post')
    plt.fill_between(dt_recall, dt_precision, step='post', alpha=0.2, color='r')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('SVM (Green): Precision-Recall curve: AP={0:0.2f}'.format(svm_ap_score) + '\n' +
              'Logistic Regression (Blue): Precision-Recall curve: AP={0:0.2f}'.format(lr_ap_score) + '\n' +
              'Decision Tree (Red): Precision-Recall curve: AP={0:0.2f}'.format(dt_ap_score))
    plt.show()
In the do_work() method I had to binarize y because DecisionTreeClassifier does not have a decision_function(). I took the approach from here.
This is the plot:
I guess what it boils down to is that I'm calculating the AUPRC for DecisionTreeClassifier incorrectly.
For DecisionTreeClassifier, replace predict with predict_proba; the latter serves the same role as decision_function.
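A minimal sketch of what the decision-tree execute function might look like with that change (assuming the same binarized labels as in the question), using the predicted probability of the positive class as the score:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import average_precision_score, precision_recall_curve

def execute(X_train, y_train, X_test, y_test):
    tree = DecisionTreeClassifier(class_weight='balanced')
    tree.fit(X_train, y_train)
    # probability of the positive class plays the role of decision_function here
    tree_y_score = tree.predict_proba(X_test)[:, 1]
    tree_ap_score = average_precision_score(y_test, tree_y_score)
    precision, recall, _ = precision_recall_curve(y_test, tree_y_score)
    return {'ap_score': tree_ap_score, 'precision': precision, 'recall': recall}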
