Using the code below, I have the accuracy. Now I am trying to:
1) find the precision and recall for each fold (10 folds total)
2) get the mean for precision
3) get the mean for recall
This could be similar to print(scores) and print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) below.
Any thoughts?
import numpy as np
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Evaluate a linear SVM on the iris data with stratified 10-fold cross-validation.
iris = datasets.load_iris()
skf = StratifiedKFold(n_splits=10)
clf = svm.SVC(kernel='linear', C=1)

# cross_val_score lives in sklearn.model_selection (the old sklearn.cross_validation
# module no longer exists). Passing the splitter makes the fold strategy explicit.
scores = cross_val_score(clf, iris.data, iris.target, cv=skf)
print(scores) #[ 1. 0.93333333 1. 1. 0.86666667 1. 0.93333333 1. 1. 1.]
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) # Accuracy: 0.97 (+/- 0.09)
This is a bit different, because cross_val_score's plain 'precision' and 'recall' scorers can't be used for non-binary classification, so you need to use precision_score and recall_score and perform the cross-validation manually. The parameter average='micro' calculates global precision/recall.
import numpy as np
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score

# Manual stratified 10-fold cross-validation that records precision and recall per fold.
iris = datasets.load_iris()
skf = StratifiedKFold(n_splits=10)
clf = svm.SVC(kernel='linear', C=1)
X = iris.data
y = iris.target

precision_scores = []
recall_scores = []
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Refit on this fold's training rows, then score on the held-out rows.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    # average='micro' pools the counts of all classes before computing the ratio.
    precision_scores.append(precision_score(y_test, y_pred, average='micro'))
    recall_scores.append(recall_score(y_test, y_pred, average='micro'))

print(precision_scores)
print("Precision: %0.2f (+/- %0.2f)" % (np.mean(precision_scores), np.std(precision_scores) * 2))
print(recall_scores)
print("Recall: %0.2f (+/- %0.2f)" % (np.mean(recall_scores), np.std(recall_scores) * 2))
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)


def binary_classification_performance(y_test, y_pred):
    """Summarise binary-classification metrics in a one-row DataFrame.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted hard labels (not probabilities).

    Returns
    -------
    pandas.DataFrame
        One row with accuracy, precision, recall, F1, AUC-ROC, specificity,
        NPV and the four confusion-matrix counts. Ratios are rounded to two
        decimals.
    """
    # scikit-learn flattens a 2x2 confusion matrix as (tn, fp, fn, tp).
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    def _ratio(numerator, denominator):
        # An empty denominator would be 0/0; report 0.0 instead of nan.
        return round(numerator / denominator, 2) if denominator else 0.0

    accuracy = round(accuracy_score(y_pred=y_pred, y_true=y_test), 2)
    precision = round(precision_score(y_pred=y_pred, y_true=y_test), 2)
    recall = round(recall_score(y_pred=y_pred, y_true=y_test), 2)
    # Computed from the raw predictions so rounding errors of precision and
    # recall do not compound; the local name avoids shadowing f1_score.
    f1 = round(f1_score(y_pred=y_pred, y_true=y_test), 2)
    specificity = _ratio(tn, tn + fp)
    npv = _ratio(tn, tn + fn)
    # The AUC is computed from the hard labels passed in as y_pred.
    auc_roc = round(roc_auc_score(y_score=y_pred, y_true=y_test), 2)

    result = pd.DataFrame({'Accuracy' : [accuracy],
                           'Precision (or PPV)' : [precision],
                           'Recall (senitivity or TPR)' : [recall],
                           'f1 score' : [f1],
                           'AUC_ROC' : [auc_roc],
                           'Specificty (or TNR)': [specificity],
                           'NPV' : [npv],
                           'True Positive' : [tp],
                           'True Negative' : [tn],
                           'False Positive':[fp],
                           'False Negative':[fn]})
    return result


binary_classification_performance(y_test, y_pred)
Related
I have a time-series dataset. I have used cross-validation and an XGBRegressor model. Now I want to forecast my prediction for a particular day x.
As per my understanding, any fundamental ML prediction model can be expressed as y = a·f(x) + b, where x = input, b = bias, and y = prediction.
I have already trained the model, so in my case x will be the time vector. The predicted forecast output matrix, say Y, already contains the forecasts for all x days/months.
So the only thing left for me is to filter out of that data what I need.
So, I am trying to write a function that takes Y as its argument and filters it — for example, to find when the forecast falls below 20%, or what the forecast will be on day 36. Can someone explain how I can write this?
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
import xgboost as xgbr
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
# Time-ordered splitting: 3 splits, test_size=24*365 rows per validation window
# and a 24-row gap between the training and validation rows
# (presumably hourly data, i.e. one-year windows and a one-day gap -- TODO confirm).
tss = TimeSeriesSplit(n_splits=3, test_size=24*365*1, gap=24)
# df3 is defined elsewhere; sort it by its index before splitting.
df3 = df3.sort_index()
# NOTE(review): fold and preds are never used in the visible code.
fold = 0
preds = []
# NOTE(review): this list is rebound to the cross_val_score result further down.
scores = []
# NOTE(review): indentation was lost in this listing, so the extent of the loop
# body is unclear -- presumably it covers the train/test selection below; confirm.
for train_idx, val_idx in tss.split(df3):
# Rows of the current split, with rows containing missing values dropped.
train = df3.iloc[train_idx].dropna()
test = df3.iloc[val_idx].dropna()
# Feature columns and the target column used for modelling.
FEATURES = ['7day_rolling_avg','Lag_1']
TARGET = 'Liquid Lvl % C'
X_train = train[FEATURES]
y_train = train[TARGET]
X_test = test[FEATURES]
y_test = test[TARGET]
#################################################################################################################
# Rebinds xgbr, which is also the alias used by `import xgboost as xgbr`.
xgbr = xgb.XGBRegressor(verbosity=0)
print(xgbr)
# NOTE(review): the next statement looks like pasted output of print(xgbr); as
# written it constructs another regressor and discards it.
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0,
importance_type='gain', learning_rate=0.1, max_delta_step=0,
max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=None, subsample=1, verbosity=1)
# Fit on the training rows and report score() on those same rows
# (presumably R^2, as for scikit-learn-style regressors -- confirm).
xgbr.fit(X_train, y_train)
score = xgbr.score(X_train, y_train)
print("Training score: ", score)
# 3-fold cross-validation on the training rows; rebinds scores.
# NOTE(review): an integer cv presumably means plain, non-time-aware K-fold --
# confirm that is intended for time-series data.
scores = cross_val_score(xgbr, X_train, y_train,cv=3)
print("Mean cross-validation score: %.2f" % scores.mean())
# Predict the held-out rows and report MSE and its square root (RMSE).
ypred = xgbr.predict(X_test)
mse = mean_squared_error(y_test, ypred)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % (mse**(1/2.0)))
# Plot actual vs. predicted values against their position in the test set.
x_ax = range(len(y_test))
plt.plot(x_ax, y_test, label="original")
plt.plot(x_ax, ypred, label="predicted")
plt.title("Data Prediction")
plt.legend()
plt.show()
The below lines are the sample code where I am able to compute accuracy, precision, recall, and f1 score. How can I also compute a false positive rate (FPR) for Stratified K fold cross-validation?
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier

# Metrics reported for every fold; cross_validate returns them under 'test_<name>'.
scoring = {'accuracy' : make_scorer(accuracy_score),
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score),
           'f1_score' : make_scorer(f1_score)}

skfold = StratifiedKFold(n_splits=10)
dt_clf = DecisionTreeClassifier()

# data_train_X and target_train_Y are expected to be defined before this point.
results = cross_validate(estimator=dt_clf,
                         X=data_train_X,
                         y=target_train_Y,
                         cv=skfold,
                         scoring=scoring)
print("Results", results)
You could define a custom scorer as follows:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
def false_positive_rate(y_true, y_pred):
    """Return FP / (FP + TN) for labels encoded as 0 (negative) and 1 (positive).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        The false positive rate, or 0.0 when ``y_true`` contains no negatives
        (the ratio would otherwise be 0/0).
    """
    import numpy as np

    # Accept plain sequences as well as arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    negatives = y_true == 0
    # false positive: actual negative predicted as positive
    fp = int((negatives & (y_pred == 1)).sum())
    # true negative: actual negative predicted as negative
    tn = int((negatives & (y_pred == 0)).sum())

    if fp + tn == 0:
        return 0.0
    # false positive rate
    return fp / (fp + tn)
# One scorer per reported metric; the keyword names become the 'test_<name>'
# keys of the cross_validate result.
scoring = dict(
    accuracy=make_scorer(accuracy_score),
    precision=make_scorer(precision_score),
    recall=make_scorer(recall_score),
    f1_score=make_scorer(f1_score),
    false_positive_rate=make_scorer(false_positive_rate),
)

# Synthetic binary problem, a seeded decision tree and 3 stratified folds.
X, y = make_classification(random_state=42)
clf = DecisionTreeClassifier(random_state=42)
skf = StratifiedKFold(n_splits=3)

results = cross_validate(clf, X, y, scoring=scoring, cv=skf)

# Per-fold false positive rates.
print(results['test_false_positive_rate'])
# [0.11764706 0.11764706 0.0625]
I wrote this code with Logistic Regression. You can substitute it with any other binary classification algorithm you'd like.
# Importing required libraries
import numpy as np
from sklearn.model_selection import KFold
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

data = load_breast_cancer(as_frame=True)
df = data.frame
# Every column but the last is a feature; the last column is the label.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Implementing cross validation
# shuffle=True randomises fold membership while keeping each feature row
# paired with its own label (shuffling X and y separately would break that).
kf = KFold(n_splits=10, shuffle=True)
model = LogisticRegression(max_iter=1000000)  # (solver='liblinear')

acc_score = []
res_tpr = []
res_fpr = []
for train_index, test_index in kf.split(X):
    # Train and test on the rows KFold selected for this fold.
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # labels=[0, 1] keeps the matrix 2x2 even if a fold lacks one class.
    tn, fp, fn, tp = confusion_matrix(y_test, pred_values, labels=[0, 1]).ravel()
    print(f'True Positives: {tp}')
    print(f'False Positives: {fp}')
    print(f'True Negatives: {tn}')
    print(f'False Negatives: {fn}')

    # Guard the denominators so an absent class yields 0 instead of 0/0.
    tpr = tp / (tp + fn) if (tp + fn) else 0.0
    fpr = fp / (fp + tn) if (fp + tn) else 0.0
    print('tpr=%.4f fpr=%.3f' % (tpr, fpr))
    res_tpr.append(tpr)
    res_fpr.append(fpr)
    print('---------------------')

    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

# Average over however many folds were actually run.
avg_acc_score = np.mean(acc_score)
total_tpr = np.mean(res_tpr)
total_fpr = np.mean(res_fpr)
print('\n\n',' total_tpr=%.4f total_fpr=%.3f' % (total_tpr,total_fpr))
#print('\n\n','accuracy of each fold - {}'.format(acc_score))
print('\n\n','Avg accuracy : {}'.format(avg_acc_score))
I am using the machine learning algorithm kNN and instead of dividing the dataset into 66,6% for training and 33,4% for tests I need to use cross-validation with the following parameters: K=3, 1/euclidean.
K=3 has no mystery, I simply add to the code:
# 3-nearest-neighbour classifier using the Euclidean metric.
Classifier = KNeighborsClassifier(n_neighbors=3, p=2, metric='euclidean')
and it's solved. What I can't understand is the 1/euclidean, and how I could apply that to the code?
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
def openfile():
    """Load 'Testfile - kNN.csv' and return its contents as a DataFrame."""
    return pd.read_csv('Testfile - kNN.csv')
def main():
    """Train a 3-NN classifier on the CSV data and print evaluation metrics.

    Reports hold-out accuracy, the confusion matrix, per-label precision and
    the classification report for a single train/test split, followed by the
    10-fold cross-validation scores, their mean, and the elapsed time.
    """
    start_time = time.time()
    dataset = openfile()

    # Features are every column except 'Label'; 'Label' is the target.
    X = dataset.drop(columns=['Label'])
    y = dataset['Label'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    Classifier = KNeighborsClassifier(n_neighbors=3, p=2, metric='euclidean')
    Classifier.fit(X_train, y_train)
    y_pred_class = Classifier.predict(X_test)

    # Cross-validation runs on the full data set, independently of the split above.
    score = cross_val_score(Classifier, X, y, cv=10)

    print("accuracy_score:", metrics.accuracy_score(y_test, y_pred_class),'\n')
    print("confusion matrix")
    print(metrics.confusion_matrix(y_test, y_pred_class),'\n')
    # Restricting labels to one class turns the micro average into that class's precision.
    print("Background precision score:", metrics.precision_score(y_test, y_pred_class, labels=['background'], average='micro')*100,"%")
    print("Botnet precision score:", metrics.precision_score(y_test, y_pred_class, labels=['bot'], average='micro')*100,"%")
    print("Normal precision score:", metrics.precision_score(y_test, y_pred_class, labels=['normal'], average='micro')*100,"%",'\n')
    print(metrics.classification_report(y_test, y_pred_class, digits=2),'\n')
    print(score,'\n')
    print(score.mean(),'\n')
    print("--- %s seconds ---" % (time.time() - start_time))
You can create your own function and pass it as a callable to metric param.
Create your function something like below:
from scipy.spatial import distance
def inverse_euc(a, b):
    """Return the reciprocal of the Euclidean distance between *a* and *b*.

    Coincident points (distance 0) yield ``inf`` instead of dividing by zero.

    Note that the value shrinks as the points move apart, so when this is used
    as a nearest-neighbour ``metric`` the most distant points rank as the
    "nearest" ones.
    """
    dist = distance.euclidean(a, b)
    if dist == 0:
        return float("inf")
    return 1 / dist
Now use it as callable in your KNN function:
# 3-NN with a ball tree and the callable inverse_euc as the distance metric.
# NOTE(review): if "1/euclidean" is meant as inverse-distance weighting of the
# neighbours' votes, the built-in weights='distance' option is presumably the
# intended tool -- confirm before relying on a reciprocal distance as a metric.
Classifier = KNeighborsClassifier(
    n_neighbors=3,
    algorithm='ball_tree',
    metric=inverse_euc,
    p=2,
)
I am implementing the AdaBoost algorithm with a decision tree from the sklearn library, and once I predict the results I get an error. Any explanation would be appreciated, thank you:
The error is :
AdaBoostClassifier with algorithm='SAMME.R' requires that the weak learner supports the calculation of class probabilities with a predict_proba method.
Please change the base estimator or set algorithm='SAMME' instead.
The dataset is from UCI repository you can access it from this link :
https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/
The code is as follow:
from sklearn.model_selection import *
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn import metrics

np.random.seed(1)
selected_features = ['sex', 'cp', 'fbs', 'exang']
# datascaled is expected to be defined before this point.
X = datascaled[selected_features]
Y = datascaled['num']

# Hold out 30% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0)

# Tune the tree's split settings and the number of boosting rounds.
param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" : ["best", "random"],
              "n_estimators": [1, 2]
             }

# max_features="sqrt" is what the older "auto" spelling meant for classifiers.
DTC = DecisionTreeClassifier(random_state = 11, max_features = "sqrt", class_weight = "balanced",max_depth = None)
ABC = AdaBoostClassifier(base_estimator = DTC)

# run grid search
model1 = GridSearchCV(ABC, param_grid=param_grid, scoring = 'accuracy')
model1.fit(X_train, y_train)

#The best hyper parameters set
print("Best Hyper Parameters:\n",model1.best_params_)
prediction = model1.predict(X_test)

# The metrics take (y_true, y_pred); with the arguments swapped the confusion
# matrix comes out transposed.
#evaluation(Accuracy)
print("Accuracy:",metrics.accuracy_score(y_test, prediction))
#evaluation(Confusion Metrix)
print("Confusion Metrix:\n",metrics.confusion_matrix(y_test, prediction))
#after tuning
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from time import *
from sklearn import metrics
n_folds = 10
# base_estimator must be an estimator instance. Passing the criterion name
# ('entropy') as a string leaves AdaBoost with a "weak learner" that has no
# predict_proba, which triggers the SAMME.R error; the criterion belongs on
# the decision tree itself.
model = AdaBoostClassifier(
    random_state=11,
    base_estimator=DecisionTreeClassifier(
        criterion='entropy',
        random_state=11,
        max_features="sqrt",
        class_weight="balanced",
    ),
    n_estimators=2,
)
cv = n_folds

# Time the out-of-fold predictions over the whole data set.
t0 = time()
y_pred = cross_val_predict(model, X=X, y=Y, n_jobs=-1, cv=cv)
t = time() - t0

print("=" * 52)
print("time cost: {}".format(t))
print()
# The target is bound to the name Y (capital) above.
print("confusion matrix\n", metrics.confusion_matrix(Y, y_pred))
print()
print("\t\taccuracy: {}".format(metrics.accuracy_score(Y, y_pred)))
# NOTE(review): roc_auc_score on hard labels presumably assumes a binary
# target -- confirm that 'num' has only two classes.
print("\t\troc_auc_score: {}".format(metrics.roc_auc_score(Y, y_pred)))
print("\t\tcohen_kappa_score: {}".format(metrics.cohen_kappa_score(Y, y_pred)))
print()
print("\t\tclassification report")
print("-" * 52)
print(metrics.classification_report(Y, y_pred))
I am trying to calculate roc_auc for a hard-voting VotingClassifier that I built. I present the code as a reproducible example. I now want to calculate the roc_auc score and plot the ROC curve, but unfortunately I get the following error: predict_proba is not available when voting='hard'
# Voting Ensemble for Classification
import pandas
from sklearn import datasets
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer,confusion_matrix, f1_score, precision_score, recall_score, cohen_kappa_score,accuracy_score,roc_curve
import numpy as np
np.random.seed(42)
iris = datasets.load_iris()
X = iris.data[:, :4] # keep the first four feature columns
Y = iris.target
print(Y)
seed = 7
# random_state only takes effect together with shuffle=True.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
model2 = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
estimators.append(('RandomForest', model2))
model3 = MultinomialNB()
estimators.append(('NaiveBayes', model3))
model4 = SVC(probability=True)
estimators.append(('svm', model4))
model5 = DecisionTreeClassifier()
estimators.append(('Cart', model5))

# create the ensemble model
print('Majority Class Labels (Majority/Hard Voting)')
ensemble = VotingClassifier(estimators, voting='hard')

#accuracy
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold, scoring='accuracy')
y_pred = cross_val_predict(ensemble, X, Y, cv=10)
print("Accuracy ensemble model : %0.2f (+/- %0.2f) " % (results.mean(), results.std() ))
print(results.mean())

# Y has more than two classes, so the per-class scores have to be averaged;
# the binary default (pos_label=1) is rejected for multiclass targets.
#recall
recall_scorer = make_scorer(recall_score, average='macro')
recall = cross_val_score(ensemble, X, Y, cv=kfold, scoring=recall_scorer)
print('Recall', np.mean(recall), recall)

# Precision
precision_scorer = make_scorer(precision_score, average='macro')
precision = cross_val_score(ensemble, X, Y, cv=kfold, scoring=precision_scorer)
print('Precision', np.mean(precision), precision)

#f1_score
# The result gets its own name so the imported f1_score function stays usable.
f1_scorer = make_scorer(f1_score, average='macro')
f1 = cross_val_score(ensemble, X, Y, cv=kfold, scoring=f1_scorer)
print('f1_score ', np.mean(f1), f1)

#roc_auc_score
# NOTE: this call fails as written -- the 'roc_auc' scorer needs class
# probabilities, which a hard-voting ensemble does not expose
# ("predict_proba is not available when voting='hard'").
roc_auc = cross_val_score(ensemble, X, Y, cv=kfold, scoring='roc_auc')
print('roc_auc_score ', np.mean(roc_auc), roc_auc)
To calculate the roc_auc metric you first need to
Replace: ensemble = VotingClassifier(estimators,voting='hard')
with: ensemble = VotingClassifier(estimators,voting='soft').
Next, the last 2 lines of code will throw an error:
# With a three-class Y this scoring call raises the ValueError quoted below.
roc_auc_score = cross_val_score(ensemble, X, Y, cv=3, scoring='roc_auc')
print('roc_auc_score ', np.mean(roc_auc_score ),roc_auc_score )
ValueError: multiclass format is not supported
This is normal since in Y you have 3 classes (np.unique(Y) == array([0, 1, 2])).
You can't use roc_auc as a single summary metric for multiclass models. If you want, you could calculate **per-class roc_auc.**
How to solve this:
1) Use only two classes to calculate the roc_auc_score
2) Use label binarization in advance, before calling roc_auc_score