I am trying to compute precision, recall, and f1 score on my test dataset. However, I am using ImageDataGenerator format, not using train_test_split (x_train, y_train, x_test and y_test). That's why I couldn't find any references online.
# Input geometry and training hyper-parameters.
IMAGE_SIZE = 224
BATCH_SIZE = 64
EPOCH = 30
CHANNEL = 3
CLASSES = 10

# One directory per split.
train_path = "/Users/ba/Documents/mycodes/datasets/DS/train"
valid_path = "/Users/ba/Documents/mycodes/datasets/DS/val"
test_path = "/Users/ba/Documents/mycodes/datasets/DS/test"

# Every split gets its own generator applying the MobileNetV3 preprocessing.
train_batches = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.mobilenet_v3.preprocess_input
).flow_from_directory(
    directory=train_path,
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
)
valid_batches = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.mobilenet_v3.preprocess_input
).flow_from_directory(
    directory=valid_path,
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
)
# Only the test iterator disables shuffling, so its sample order stays fixed
# between the predictions and `test_batches.classes`.
test_batches = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.mobilenet_v3.preprocess_input
).flow_from_directory(
    directory=test_path,
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
    shuffle=False,
)
Then I tried to calculate precision, recall, and f1 in the following way down below:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# Run the model over the whole (unshuffled) test generator.
y_pred_logits = model.predict(test_batches)
# NOTE(review): no `axis` is given, so argmax presumably reduces over the sample
# axis and yields one index per class column (CLASSES = 10) instead of one label
# per sample -- consistent with the "[1887, 10]" error reported for this snippet.
y_pred = tf.math.argmax(y_pred_logits)
# Ground-truth labels as exposed by the test generator.
test_classes = test_batches.classes
# NOTE(review): the precision/recall/F1 calls below pass no `average`; with 10
# classes the default binary averaging is expected to be rejected -- confirm
# against the scikit-learn documentation.
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(test_classes, y_pred)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(test_classes, y_pred)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(test_classes, y_pred)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(test_classes, y_pred)
print('F1 score: %f' % f1)
Unfortunately it throws this error message:
ValueError: Found input variables with inconsistent numbers of samples: [1887, 10]
Can you help me re-write the code, or any other references using ImageDataGenerator format I used?
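So the problem was with how y_true (test_classes) and y_pred were built: y_pred must contain one predicted class per sample (argmax over the class axis, axis=1), otherwise its length does not match y_true. With matching y_true and y_pred, one can also calculate the confusion matrix.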
# Making prediction
# Ground-truth labels exposed by the test generator (created with shuffle=False).
y_true = test_batches.classes
predictions = model.predict(x=test_batches, steps=len(test_batches), verbose=0)
# Reduce over the class axis to get one predicted class index per sample.
y_pred = predictions.argmax(axis=1)

# `pos_label` only has an effect with average='binary'; with average='micro'
# scikit-learn ignores it (and warns), so it is not passed.
# Note: for single-label multiclass data the micro-averaged precision, recall
# and F1 all equal the accuracy; use average='macro' or 'weighted' for a
# summary that is sensitive to per-class performance.
print("Precision Score: ", precision_score(y_true, y_pred, average='micro'))
print("Recall Score: ", recall_score(y_true, y_pred, average='micro'))
print("F1 Score: ", f1_score(y_true, y_pred, average='micro'))
print("Accuracy Score: ", accuracy_score(y_true, y_pred))
Related
I am really new in programming, especially, in machine learning. Currently, I am training my dataset and I am using KNN, random forest, and decision tree as my algorithms. However, my accuracy, precision, recall, and f1 scores in random forest and decision tree are all 1.0, which means something is wrong. On the other hand, my KNN scores are low (Accuracy: 0.892 Recall: 0.452 Precision: 0.824 F1-score: 0.584).
I already cleaned and split my dataset for training and testing, and imputed (median) my dataset, so I am really confused as to why the results are like this. What can I do to fix this?
P.S. I am not really sure how to ask questions here, so if I am lacking any information necessary, just tell me.
dataset image: https://i.stack.imgur.com/6FR1K.png
distribution of dataset: https://i.stack.imgur.com/1uZzN.png
# Convert 0's to NaN so that they are picked up as missing values by the imputer.
# NOTE(review): "Status" (the target used below) is part of this list and of the
# scaling step; confirm that zeros in the label are really meant to be replaced.
columns = ["Age", "Race", "Marital Status", "T Stage", "N Stage",
           "6th Stage", "Grade", "A Stage", "Tumor Size", "Estrogen Status",
           "Progesterone Status", "Regional Node Examined",
           "Reginol Node Positive", "Survival Months", "Status"]
data[columns] = data[columns].replace({'0': np.nan, 0: np.nan})

# Imputing using median: fit once, then transform.
imp_median.fit(data.values)
data_median = imp_median.transform(data.values)
data_median = pd.DataFrame(data_median)
# The transform returns a bare array, so the column labels are re-applied.
# "T Stage " keeps its trailing space here, unlike in `columns` above.
data_median.columns = ["Age", "Race", "Marital Status", "T Stage ",
                       "N Stage", "6th Stage", "Grade", "A Stage", "Tumor Size",
                       "Estrogen Status", "Progesterone Status",
                       "Regional Node Examined", "Reginol Node Positive",
                       "Survival Months", "Status"]

# Scaling data median
minmaxScale = MinMaxScaler()
X = minmaxScale.fit_transform(data_median.values)
# NOTE(review): X is already scaled, so this applies the transform a second
# time; the result is not used anywhere else in this snippet.
transformedDF = minmaxScale.transform(X)
data_transformedDF = pd.DataFrame(X)
# Same labels as the imputed frame.
data_transformedDF.columns = data_median.columns

# Splitting the dataset: "Status" is the target, everything else is a feature.
features = data_transformedDF.drop(["Status"], axis=1)
outcome_variable = data_transformedDF["Status"]
x_train, x_test, y_train, y_test = train_test_split(
    features, outcome_variable, test_size=0.20, random_state=7)
#cross validation
def cross_validation(model, _X, _y, _cv=10):
    '''
    Run k-fold cross-validation and summarise binary-classification scores.

    Parameters
    ----------
    model: estimator
        The machine learning algorithm to be evaluated.
    _X: array
        This is the matrix of features (age, race, etc).
    _y: array
        This is the target variable (1 - Dead, 0 - Alive).
    _cv: int, default=10
        Determines the number of folds for cross-validation.

    Returns
    -------
    dict
        Per-fold accuracy plus the mean accuracy (as a percentage), precision,
        recall and F1 score, for both the training folds and the held-out
        validation folds. The validation entries are the ones that estimate
        performance on unseen data.
    '''
    _scoring = ['accuracy', 'precision', 'recall', 'f1']
    results = cross_validate(estimator=model,
                             X=_X,
                             y=_y,
                             cv=_cv,
                             scoring=_scoring,
                             return_train_score=True)
    return {
        # Scores on the folds the model was fitted on.
        "Training Accuracy scores": results['train_accuracy'],
        "Mean Training Accuracy": results['train_accuracy'].mean() * 100,
        "Mean Training Precision": results['train_precision'].mean(),
        "Mean Training Recall": results['train_recall'].mean(),
        "Mean Training F1 Score": results['train_f1'].mean(),
        # Scores on the held-out fold of each split.
        "Validation Accuracy scores": results['test_accuracy'],
        "Mean Validation Accuracy": results['test_accuracy'].mean() * 100,
        "Mean Validation Precision": results['test_precision'].mean(),
        "Mean Validation Recall": results['test_recall'].mean(),
        "Mean Validation F1 Score": results['test_f1'].mean(),
    }
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# The three classifiers under comparison, all with default settings.
knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()

# 10-fold cross-validation of each model on the training split.
# KNN
cross_validation(knn, x_train, y_train, _cv=10)
# DecisionTree
cross_validation(dtc, x_train, y_train, _cv=10)
# RandomForest
cross_validation(rfc, x_train, y_train, _cv=10)
def report_test_metrics(fitted_model, features, labels):
    """Predict on ``features`` and print the confusion matrix and summary scores.

    Args:
        fitted_model: An already fitted classifier exposing ``predict``.
        features: Feature matrix to predict on.
        labels: True labels matching ``features``.

    Returns:
        The predictions made for ``features``.
    """
    predicted = fitted_model.predict(features)
    print(confusion_matrix(labels, predicted))
    print('Accuracy: %.3f' % accuracy_score(labels, predicted) +
          ' Recall: %.3f' % recall_score(labels, predicted) +
          ' Precision: %.3f' % precision_score(labels, predicted) +
          ' F1-score: %.3f' % f1_score(labels, predicted))
    return predicted


# Test predictions for dtc
dtc_fitted = dtc.fit(x_train, y_train)
y_pred = report_test_metrics(dtc_fitted, x_test, y_test)

# Test predictions for rfc
rfc_fitted = rfc.fit(x_train, y_train)
y_pred = report_test_metrics(rfc_fitted, x_test, y_test)

# Test predictions for knn
knn_fitted = knn.fit(x_train, y_train)
y_pred = report_test_metrics(knn_fitted, x_test, y_test)
**For KNN**
'Mean Training Accuracy': 90.2971947134574,
'Mean Training Precision': 0.8457275536528337,
'Mean Training Recall': 0.44194341372912804,
'Mean Training F1 Score': 0.5804614758695162
test predictions for knn
Accuracy: 0.872 Recall: 0.323 Precision: 0.707 F1-score: 0.443
**For Decision Tree**
'Mean Training Accuracy': 100.0,
'Mean Training Precision': 1.0,
'Mean Training Recall': 1.0,
'Mean Training F1 Score': 1.0
test predictions for dtc:
Accuracy: 0.850 Recall: 0.528 Precision: 0.523 F1-score: 0.525
**For Random Forest**
'Mean Training Accuracy': 99.99309630652398,
'Mean Training Precision': 1.0,
'Mean Training Recall': 0.9995454545454546,
test predictions for rfc:
Accuracy: 0.896 Recall: 0.449 Precision: 0.803 F1-score: 0.576
from imblearn.over_sampling import SMOTE

# Oversample the training data only; the test split is left as it is.
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)
I ran knn, rfc, and dtc again after running the code for smote
This might not be a technical issue with the code but rather with something known as target leakage.
That is one of the features in your model is recorded after your label has occurred. For example if you are predicting if the patient is going to die vs not to die, and there is a survival date field, then most models can perfectly predict the outcome.
KNN is a bit different because it is a memorization model - it doesn't learn the relationship between the variable and label. So if it hasn't seen an observation before, it won't give perfect prediction even in the presence of target leakage.
from sklearn.model_selection import train_test_split
# Stratified 90/10 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(img_array, img_labels,
shuffle=True, stratify=img_labels,
test_size=0.1, random_state=42)
# Keep only the weights with the lowest validation loss on disk.
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', mode='min',save_best_only=True,verbose=1)
# NOTE(review): the next line looks like the tail of a call whose beginning is
# missing from this snippet (possibly an early-stopping callback); restore the
# full statement before running.
restore_best_weights=True)
# Multiply the learning rate by 0.2 after 3 epochs without val_loss improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_loss',factor=0.2,patience=3,verbose=1,min_delta=0.0001)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): the test split doubles as validation data here, so the
# checkpoint and learning-rate schedule are both driven by the test set.
hist = model.fit(x_train, y_train, epochs =50, batch_size=64,callbacks = [checkpoint,reduce_lr], validation_data=(x_test, y_test))
# Loss curves.
plt.plot(hist.history['loss'], color='b', label='Training Loss')
plt.plot(hist.history['val_loss'], color='r', label='Validation Loss')
plt.legend(loc='upper right')
# NOTE(review): no matching figure / subplot(1, 2, 1) call is visible above, so
# the loss curves are presumably not drawn into the first panel -- confirm.
plt.subplot(1, 2, 2)
plt.ylabel('Accuracy', fontsize=16)
# Accuracy curves.
plt.plot(hist.history['accuracy'], color='b', label='Training Accuracy')
plt.plot(hist.history['val_accuracy'], color='r', label='Validation Accuracy')
plt.legend(loc='lower right')
plt.show()
Output
# Predicted class index per sample (argmax over the class axis).
y_pred = model.predict(x_test)
y_pred = np.argmax(y_pred, axis=1)
# Decode the true labels the same way.
y_test = np.argmax(y_test, axis=1)

from sklearn.metrics import confusion_matrix
# Store the matrix under its own name: rebinding `confusion_matrix` would shadow
# the imported function and break any later call to it.
cm = confusion_matrix(y_test, y_pred)
cm

# Compute per-class (one-vs-rest) FP, FN, TP and TN counts.
# Rows of `cm` are true classes and columns are predicted classes.
TP = np.diag(cm).astype(float)
FP = (cm.sum(axis=0) - np.diag(cm)).astype(float)
FN = (cm.sum(axis=1) - np.diag(cm)).astype(float)
TN = (cm.sum() - (FP + FN + TP)).astype(float)

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP)
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)
# Per-class (one-vs-rest) accuracy: one value per class. Every sample that
# neither belongs to nor is predicted as a class counts as a true negative for
# it, so these values are usually much higher than the overall accuracy.
ACC = (TP+TN)/(TP+FP+FN+TN)
# Overall accuracy: share of all samples on the diagonal of the matrix.
overall_accuracy = TP.sum() / cm.sum()
ACC*100
Output: array([87.21092226, 98.74616885, 85.81777654, 89.69072165, 80.46809696, 93.56366676, 83.75592087])
While the accuracy reported when training the deep learning model is around 60%, the accuracy calculated from the confusion matrix is around 80%-85% on average. Why do the two accuracies differ? Do these two values have different meanings?
I am currently in the process of displaying precision, recall and fscore. Now my question is how do I do this? What I tried is the following:
# The two dimensions of the training matrix are taken as user and item counts.
num_users, num_items = train_mat.shape
# get_train_samples is defined outside this snippet; it presumably returns
# parallel user / item / label sequences with num_negatives negatives per
# positive -- TODO confirm.
user_input, item_input, labels = get_train_samples(train_mat, num_negatives)
val_user_input, val_item_input, val_labels = get_train_samples(val_mat, num_negatives)
.
.
.
# Two-input model: user ids and item ids are fed as separate arrays, and the
# validation samples are passed in the same layout.
history = model.fit([np.array(user_input), np.array(item_input)], np.array(labels),
epochs=EPOCHS, verbose=VERBOSE, shuffle=True, batch_size = BATCH_SIZE,
validation_data=([np.array(val_user_input), np.array(val_item_input)], np.array(val_labels)),
callbacks=CALLBACKS)
.
.
.
# Precision, recall and fscore
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, roc_curve, auc

# First report: a single support-weighted average over all classes.
precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print('Precision, recall, and F1 score, averaged and weighted by number of instances in each class:')
for template, value in (('precision: {}', precision),
                        ('recall: {}', recall),
                        ('f1 score: {}\n', fscore)):
    print(template.format(value))

# Second report: one value per class (no averaging).
precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred)
print('Precision, recall, and F1 score, per class [0 1]:')
for template, value in (('precision: {}', precision),
                        ('recall: {}', recall),
                        ('f1 score: {}', fscore)):
    print(template.format(value))

# Confusion matrix rendered as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True)
Unfortunately I don't know how to get y_test and y_pred. How do I get these values?
You need a held-out test set (with its true labels, y_test) to evaluate your model. If you don't have one, you can use sklearn's train_test_split to get a training set and a test set. Here is the link for how to use it:
sklearn train test split
Once you have your test set, do this to get y_pred:
# Predict from the test *features* (x_test); y_test holds the true labels that
# the predictions are compared against.
y_pred = model.predict(x_test)
I am trying to predict one of two values which can appear in the column 'exit'. I have clean data (about 20 columns and 4k rows containing typical information about customers, like 'sex', 'age' ...). In the training dataset about 20% of customers were labelled '1'. I made two models - SVM and random forest - but both predict '0' for the test dataset almost every time. The recall of both models is 0.
I attached the code where I think I could have made some stupid mistake. Any ideas why recall is so low while accuracy is 80%?
def ml_model():
    """Tune an SVC and a random forest, report their scores and score new data.

    Reads ``clean_data.csv`` (label column ``target``), tunes both models with
    ``RandomizedSearchCV``, prints each model's 10-fold cross-validation
    accuracy and its recall on the held-out split, then predicts
    ``new_data.csv`` with whichever model has the higher cross-validation
    accuracy. One line per row (row index, second probability column,
    predicted label) is written to ``output.txt``, replacing earlier content.
    """
    print('sklearn: %s' % sklearn.__version__)
    df = pd.read_csv('clean_data.csv')
    feat = df.drop(columns=['target'])
    label = df["target"]
    x_train, x_test, y_train, y_test = train_test_split(feat, label, test_size=0.3)
    sc_x = StandardScaler()
    x_train = sc_x.fit_transform(x_train)
    # NOTE(review): x_test is never passed through sc_x, so both models are
    # evaluated on unscaled features while being trained on scaled ones.

    # SVC method
    support_vector_classifier = SVC(probability=True)
    # Randomised search over C and gamma.
    rand_list = {"C": stats.uniform(0.1, 10),
                 "gamma": stats.uniform(0.1, 1)}
    auc = make_scorer(roc_auc_score)
    rand_search_svc = RandomizedSearchCV(support_vector_classifier, param_distributions=rand_list,
                                         n_iter=100, n_jobs=4, cv=3, random_state=42,
                                         scoring=auc)
    rand_search_svc.fit(x_train, y_train)
    support_vector_classifier = rand_search_svc.best_estimator_
    cross_val_svc = cross_val_score(estimator=support_vector_classifier, X=x_train, y=y_train, cv=10, n_jobs=-1)
    print("Cross Validation Accuracy for SVM: ", round(cross_val_svc.mean() * 100, 2), "%")
    predicted_y = support_vector_classifier.predict(x_test)
    # Flattened binary confusion matrix is ordered tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_test, predicted_y).ravel()
    # Local names are kept distinct from sklearn's recall_score/precision_score.
    svc_recall = tp / (tp + fn)
    print("Recall score SVC: ", svc_recall)

    # Random forests
    random_forest_classifier = RandomForestClassifier()
    # Randomised search over tree-shape parameters.
    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 11),
                  "min_samples_split": sp_randint(2, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}
    # NOTE(review): `iid` was removed in scikit-learn 0.24; drop the argument
    # when running on newer versions.
    rand_search_rf = RandomizedSearchCV(random_forest_classifier, param_distributions=param_dist,
                                        n_iter=100, cv=5, iid=False)
    rand_search_rf.fit(x_train, y_train)
    random_forest_classifier = rand_search_rf.best_estimator_
    cross_val_rfc = cross_val_score(estimator=random_forest_classifier, X=x_train, y=y_train, cv=10, n_jobs=-1)
    print("Cross Validation Accuracy for RF: ", round(cross_val_rfc.mean() * 100, 2), "%")
    predicted_y = random_forest_classifier.predict(x_test)
    tn, fp, fn, tp = confusion_matrix(y_test, predicted_y).ravel()
    rf_recall = tp / (tp + fn)
    print("Recall score RF: ", rf_recall)

    new_data = pd.read_csv('new_data.csv')
    # NOTE(review): cleaning_data_to_predict is defined elsewhere; confirm it
    # also applies the training scaler, otherwise the models see unscaled input.
    new_data = cleaning_data_to_predict(new_data)
    # Pick the model with the higher (rounded) cross-validation accuracy.
    if round(cross_val_svc.mean() * 100, 2) > round(cross_val_rfc.mean() * 100, 2):
        predictions = support_vector_classifier.predict(new_data)
        predictions_proba = support_vector_classifier.predict_proba(new_data)
    else:
        predictions = random_forest_classifier.predict(new_data)
        predictions_proba = random_forest_classifier.predict_proba(new_data)

    # Open the output file once (truncating it) and close it deterministically,
    # instead of re-opening it in append mode for every row.
    with open("output.txt", "w") as out:
        rows = zip(predictions_proba.tolist(), predictions.tolist())
        for i, (proba, exit_label) in enumerate(rows):
            print("id: ", i, "probability: ", proba[1], "exit: ", exit_label, file=out)
If I have not missed it, you forgot to scale your test set.
So, you need to scale it as well. Note that you should just transform it, do not fit it again. See below.
# Reuse the scaler already fitted on the training data: transform only, do not
# fit it again on the test set.
x_test = sc_x.transform(x_test)
I agree with @e_kapti. Also check the formulas for recall and accuracy; you might consider using the F1 score instead (https://en.wikipedia.org/wiki/F1_score).
Recall = TP / (TP+FN) Accuracy = (TP + TN) / (TP + TN + FP + FN) With TP, FP, TN, FN being number of true positives, false positives, true negatives and false negatives, respectively.
Using the code below, I have the Accuracy . Now I am trying to
1) find the precision and recall for each fold (10 folds total)
2) get the mean for precision
3) get the mean for recall
This could be similar to print(scores) and print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) below.
Any thoughts?
import numpy as np
from sklearn import datasets
from sklearn import svm
# `sklearn.cross_validation` was removed in scikit-learn 0.20; cross_val_score
# lives in `sklearn.model_selection`, next to StratifiedKFold.
from sklearn.model_selection import StratifiedKFold, cross_val_score

iris = datasets.load_iris()
skf = StratifiedKFold(n_splits=10)
clf = svm.SVC(kernel='linear', C=1)
# Pass the splitter explicitly; for a classifier, cv=10 selects the same
# 10-fold stratified split.
scores = cross_val_score(clf, iris.data, iris.target, cv=skf)
print(scores) #[ 1. 0.93333333 1. 1. 0.86666667 1. 0.93333333 1. 1. 1.]
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) # Accuracy: 0.97 (+/- 0.09)
This is a bit different, because cross_val_score can't calculate precision/recall for non-binary classification, so you need to use precision_score and recall_score and perform the cross-validation manually. The parameter average='micro' calculates global precision/recall.
import numpy as np
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score

# The unused `from sklearn import cross_validation` import is gone: that module
# was removed in scikit-learn 0.20 and nothing in this snippet relies on it.
iris = datasets.load_iris()
skf = StratifiedKFold(n_splits=10)
clf = svm.SVC(kernel='linear', C=1)
X = iris.data
y = iris.target

# One precision / recall value per fold.
precision_scores = []
recall_scores = []
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Refit on this fold's training part and predict its held-out part.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    precision_scores.append(precision_score(y_test, y_pred, average='micro'))
    recall_scores.append(recall_score(y_test, y_pred, average='micro'))

print(precision_scores)
print("Precision: %0.2f (+/- %0.2f)" % (np.mean(precision_scores), np.std(precision_scores) * 2))
print(recall_scores)
print("Recall: %0.2f (+/- %0.2f)" % (np.mean(recall_scores), np.std(recall_scores) * 2))
import pandas as pd
import numpy as np
from sklearn.metrics import (confusion_matrix, recall_score, precision_score,
                             accuracy_score, f1_score, roc_auc_score)


def binary_classification_performance(y_test, y_pred):
    """Summarise binary-classification metrics in a one-row DataFrame.

    Args:
        y_test: True binary labels.
        y_pred: Predicted binary labels (hard predictions, not probabilities).

    Returns:
        A single-row ``pandas.DataFrame`` with accuracy, precision, recall,
        F1, ROC AUC, specificity and NPV (all rounded to two decimals),
        followed by the raw TP / TN / FP / FN counts.
    """
    # scikit-learn puts true classes on rows and predicted classes on columns,
    # so the flattened 2x2 matrix reads tn, fp, fn, tp (not tp, fp, fn, tn).
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    accuracy = round(accuracy_score(y_pred=y_pred, y_true=y_test), 2)
    precision = round(precision_score(y_pred=y_pred, y_true=y_test), 2)
    recall = round(recall_score(y_pred=y_pred, y_true=y_test), 2)
    # Use the library F1 on the raw labels: no compounding of rounding errors,
    # no 0/0 when precision and recall are both zero, and the imported
    # `f1_score` is no longer shadowed by a local of the same name.
    f1 = round(f1_score(y_pred=y_pred, y_true=y_test), 2)
    specificity = round(tn/(tn+fp), 2)
    npv = round(tn/(tn+fn), 2)
    # Computed from hard labels, as y_pred holds predicted classes.
    auc_roc = round(roc_auc_score(y_score=y_pred, y_true=y_test), 2)
    result = pd.DataFrame({'Accuracy' : [accuracy],
                           'Precision (or PPV)' : [precision],
                           'Recall (senitivity or TPR)' : [recall],
                           'f1 score' : [f1],
                           'AUC_ROC' : [auc_roc],
                           'Specificty (or TNR)': [specificity],
                           'NPV' : [npv],
                           'True Positive' : [tp],
                           'True Negative' : [tn],
                           'False Positive':[fp],
                           'False Negative':[fn]})
    return result


binary_classification_performance(y_test, y_pred)