Problems setting up conditional search space in hyperopt - python

I'll fully admit that I may be setting up the conditional space wrong here, but for some reason I just can't get this to work at all. I am attempting to use hyperopt to tune a logistic regression model, and depending on the solver there are some other parameters that need to be explored. If you choose the liblinear solver you can choose penalties, and depending on the penalty you can also choose dual. When I try to run hyperopt on this search space, though, it keeps giving me an error because it's passing the entire dictionary, as shown below. Any ideas?
The error I'm getting is:
ValueError: Logistic Regression supports only liblinear, newton-cg, lbfgs and sag solvers, got {'solver': 'sag'}
This format worked when setting up a random forest search space, so I'm at a loss.
import numpy as np
import scipy as sp
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="white")
import pyodbc
import statsmodels as sm
from pandasql import sqldf
import math
from tqdm import tqdm
import pickle
from sklearn.preprocessing import RobustScaler, OneHotEncoder, MinMaxScaler
from sklearn.utils import shuffle
from sklearn.cross_validation import KFold, StratifiedKFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold as StratifiedKFoldIt
from sklearn.feature_selection import RFECV, VarianceThreshold, SelectFromModel, SelectKBest
from sklearn.decomposition import PCA, IncrementalPCA, FactorAnalysis
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, accuracy_score, classification_report, confusion_matrix, f1_score, log_loss
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss, NeighbourhoodCleaningRule, OneSidedSelection
from xgboost.sklearn import XGBClassifier
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
space4lr = {
    'C': hp.uniform('C', .0001, 100.0),
    'solver': hp.choice('solver', [
        {'solver': 'newton-cg'},
        {'solver': 'lbfgs'},
        {'solver': 'sag'},
        {'solver': 'liblinear', 'penalty': hp.choice('penalty', [
            {'penalty': 'l1'},
            {'penalty': 'l2', 'dual': hp.choice('dual', [True, False])}])},
    ]),
    'fit_intercept': hp.choice('fit_intercept', ['True', 'False']),
    'class_weight': hp.choice('class_weight', ['balanced', None]),
    'max_iter': 50000,
    'random_state': 84,
    'n_jobs': 8
}
lab = 0
results = pd.DataFrame()

for i in feature_elims:
    target = 'Binary_over_3'
    alt_targets = ['year2_PER', 'year2_GP', 'year2_Min', 'year2_EFF', 'year2_WS/40', 'year2_Pts/Poss', 'Round', 'GRZ_Pick',
                   'GRZ_Player_Rating', 'Binary_over_2', 'Binary_over_3', 'Binary_over_4', 'Binary_5', 'Draft_Strength']
    #alt_targets.remove(target)
    nondata_columns = ['display_name', 'player_global_id', 'season', 'season_', 'team_global_id', 'birth_date', 'Draft_Day']
    nondata_columns.extend(alt_targets)

    AGG_SET_CART_PERC = sqldf("""SELECT * FROM AGG_SET_PLAYED_ADJ_SOS_Jan1 t1
        LEFT JOIN RANKINGS t2 ON t1.[player_global_id] = t2.[player_global_id]
        LEFT JOIN Phys_Training t3 ON t1.[player_global_id] = t3.[player_global_id]""")
    AGG_SET_CART_PERC['HS_RSCI'] = AGG_SET_CART_PERC['HS_RSCI'].fillna(110)
    AGG_SET_CART_PERC['HS_Avg_Rank'] = AGG_SET_CART_PERC['HS_Avg_Rank'].fillna(1)
    AGG_SET_CART_PERC['HS_years_ranked'] = AGG_SET_CART_PERC['HS_years_ranked'].fillna(0)
    AGG_SET_CART_PERC = shuffle(AGG_SET_CART_PERC, random_state=8675309)

    rus = RandomUnderSampler(random_state=8675309)
    ros = RandomOverSampler(random_state=8675309)
    rs = RobustScaler()

    X = AGG_SET_CART_PERC
    y = X[target]
    X = pd.DataFrame(X.drop(nondata_columns, axis=1))
    position = pd.get_dummies(X['position'])
    for idx, row in position.iterrows():
        if row['F/C'] == 1:
            row['F'] = 1
            row['C'] = 1
        if row['G/F'] == 1:
            row['G'] = 1
            row['F'] = 1
    position = position.drop(['F/C', 'G/F'], axis=1)
    X = pd.concat([X, position], axis=1).drop(['position'], axis=1)
    X = rs.fit_transform(X, y=None)
    X = i.transform(X)

    def hyperopt_train_test(params):
        clf = LogisticRegression(**params)
        #cvs = cross_val_score(xgbc, X, y, scoring='recall', cv=skf).mean()
        skf = StratifiedKFold(y, n_folds=6, shuffle=False, random_state=1)
        metrics = []
        tuning_met = []
        accuracy = []
        precision = []
        recall = []
        f1 = []
        log = []
        for i, (train, test) in enumerate(skf):
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            X_train, y_train = ros.fit_sample(X_train, y_train)
            X_train, y_train = rus.fit_sample(X_train, y_train)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            tuning_met.append((((precision_score(y_test, y_pred)) * 4) + recall_score(y_test, y_pred)) / 5)
            accuracy.append(accuracy_score(y_test, y_pred))
            precision.append(precision_score(y_test, y_pred))
            recall.append(recall_score(y_test, y_pred))
            f1.append(f1_score(y_test, y_pred))
            log.append(log_loss(y_test, y_pred))
        metrics.append(sum(tuning_met) / len(tuning_met))
        metrics.append(sum(accuracy) / len(accuracy))
        metrics.append(sum(precision) / len(precision))
        metrics.append(sum(recall) / len(recall))
        metrics.append(sum(f1) / len(f1))
        metrics.append(sum(log) / len(log))
        return(metrics)

    best = 0
    count = 0

    def f(params):
        global best, count, results, lab, met
        met = hyperopt_train_test(params.copy())
        met.append(params)
        met.append(featureset_labels[lab])
        acc = met[0]
        results = results.append([met])
        if acc > best:
            print(featureset_labels[lab], 'new best:', acc, 'Accuracy:', met[1], 'Precision:', met[2], 'Recall:', met[3], 'using', params, """
            """)
            best = acc
        else:
            print(acc, featureset_labels[lab], count)
        count = count + 1
        return {'loss': -acc, 'status': STATUS_OK}

    trials = Trials()
    best = fmin(f, space4lr, algo=tpe.suggest, max_evals=1000, trials=trials)
    print(featureset_labels[lab], ' best:')
    print(best, """
    """)
    lab = lab + 1

I'm replying a bit late, but I ran into this issue just yesterday. I think the code snippet below will help you out.
space = hp.choice('classifier', [
    {
        'model': LogisticRegression,
        'param': {
            'hyper_param_groups': hp.choice('hyper_param_groups', [
                {
                    'penalty': hp.choice('penalty_block1', ['l2']),
                    'solver': hp.choice('solver_block1', ['newton-cg', 'sag', 'saga', 'lbfgs']),
                    'multi_class': hp.choice('multi_class', ['ovr', 'multinomial']),
                },
                {
                    'penalty': hp.choice('penalty_block2', ['l2']),
                    'solver': hp.choice('solver_block2', ['liblinear']),
                    'multi_class': hp.choice('multi_class_block2', ['ovr']),
                },
                {
                    'penalty': hp.choice('penalty_block3', ['l1']),
                    'solver': hp.choice('solver_block3', ['saga']),
                    'multi_class': hp.choice('multi_class_block3', ['ovr', 'multinomial']),
                },
            ]),
            'dual': hp.choice('dual', [False]),
            'class_weight': hp.choice('class_weight', ['balanced', None]),
            'random_state': hp.choice('random_state', [10, 267]),
            'max_iter': hp.choice('max_iter', [100, 500]),
            'verbose': hp.choice('verbose', [0])
        }
    }
])
And here is how to unpack it inside the objective function:
penalty = args['param']['hyper_param_groups']['penalty']
solver = args['param']['hyper_param_groups']['solver']
multi_class = args['param']['hyper_param_groups']['multi_class']
dual = args['param']['dual']
class_weight = args['param']['class_weight']
random_state = args['param']['random_state']
max_iter = args['param']['max_iter']
verbose = args['param']['verbose']
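The same idea applies to the space in the question itself: hyperopt hands the objective the nested dictionaries verbatim, so they have to be flattened before being passed to LogisticRegression. A rough sketch (mine, assuming the space4lr layout above):
def flatten_params(params):
    # params arrives like {'C': ..., 'solver': {'solver': 'liblinear', 'penalty': {...}}, ...}
    flat = dict(params)
    flat.update(flat.pop('solver'))        # lift e.g. {'solver': 'liblinear', 'penalty': {...}} to the top level
    if isinstance(flat.get('penalty'), dict):
        flat.update(flat.pop('penalty'))   # lift e.g. {'penalty': 'l2', 'dual': True}
    return flat

# clf = LogisticRegression(**flatten_params(params))
Note also that 'fit_intercept' in the question's space samples the strings 'True' and 'False' rather than booleans, which is probably not what LogisticRegression should receive.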

Related

X has 14 features, but RandomForestClassifier is expecting 20 features as input

Could I please ask, I have this code (a reproducible example):
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, RFECV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pickle
#import neptune.new as neptune
import pandas as pd
import shap
full_X_train,full_y_train = make_classification(n_samples =500,n_features = 20, random_state=1, n_informative=10,n_redundant=10)
def run_model_with_grid_search(param_grid={}, output_plt_file='plt.png', model_name=RandomForestClassifier(),
                               X_train=full_X_train, y_train=full_y_train,
                               model_id='random_forest_with_hpo_no_fs_geno_class', n_splits=5,
                               output_file='random_forest_with_hpo_no_fs_geno_class.txt'):
    cv_outer = KFold(n_splits=5, shuffle=True, random_state=1)
    acc_list = list()
    f1_list = list()
    precision_list = list()
    recall_list = list()
    auc_list = list()

    # for ROC curve
    tprs = []
    base_fpr = np.linspace(0, 1, 101)
    plt.figure(figsize=(5, 5))
    plt.axes().set_aspect('equal', 'datalim')

    count = 0
    list_shap_values = list()
    list_test_sets = list()

    for train_ix, test_ix in cv_outer.split(X_train):
        split_x_train, split_x_test = X_train[train_ix, :], X_train[test_ix, :]  # add in .iloc
        split_y_train, split_y_test = y_train[train_ix], y_train[test_ix]  # add in .iloc

        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        model = model_name
        rfecv = RFECV(estimator=model, step=1, scoring='accuracy', cv=cv_inner)  # change scoring to scoring='roc_auc'
        pipeline = Pipeline([('feature_sele', rfecv), ('clf_cv', model)])
        search = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=cv_inner, refit=True).fit(split_x_train, split_y_train)
        best_model = search.best_estimator_
        best_model_shap = search.best_estimator_['clf_cv'].fit(split_x_train, split_y_train)
        print(search.best_params_)

        yhat = best_model.predict(split_x_test)  # changed from best_model and split_x_test
        accuracy = accuracy_score(split_y_test, yhat)
        acc_list.append(accuracy)
        f1_sc = f1_score(split_y_test, yhat)
        f1_list.append(f1_sc)
        precision_sc = precision_score(split_y_test, yhat)
        precision_list.append(precision_sc)
        recall_sc = recall_score(split_y_test, yhat)
        recall_list.append(recall_sc)

        fpr, tpr, _ = roc_curve(split_y_test, yhat)
        auc = metrics.auc(fpr, tpr)
        auc_list.append(auc)
        plt.plot(fpr, tpr, 'b', alpha=0.15)
        tpr = np.interp(base_fpr, fpr, tpr)
        tpr[0] = 0.0
        tprs.append(tpr)

        count += 1
        print('>acc=%.3f,est=%.3f,cfg=%s' % (accuracy, search.best_score_, search.best_params_))

        explainer = shap.TreeExplainer(best_model_shap)
        shap_values = explainer.shap_values(split_x_test)
        list_shap_values.append(shap_values)
        list_test_sets.append(test_ix)

    test_set = list_test_sets[0]
    shap_values = np.array(list_shap_values[0])
    for i in range(1, len(list_test_sets)):
        test_set = np.concatenate((test_set, list_test_sets[i]), axis=0)
        shap_values = np.concatenate((shap_values, np.array(list_shap_values[i])), axis=1)

    X_test_df = pd.DataFrame(full_X_train[test_set])
    cols = X_test_df.columns
    shap_sum = np.abs(shap_values[1, :, :]).mean(0)
    importance_df = pd.DataFrame({
        'column_name': cols,
        'shap_values': shap_sum
    })
    importance_df.sort_values('shap_values', ascending=False)

    print('Accuracy: %.3f (%.3f)' % (mean(acc_list), std(acc_list)))
    print('F1: %.3f (%.3f)' % (mean(f1_list), std(f1_list)))
    print('Precision: %.3f (%.3f)' % (mean(precision_list), std(precision_list)))
    print('Recall: %.3f (%.3f)' % (mean(recall_list), std(recall_list)))
    print('AUC: %.3f (%.3f)' % (mean(auc_list), std(auc_list)))

    tprs = np.array(tprs)
    mean_tprs = tprs.mean(axis=0)
    tpr_std = tprs.std(axis=0)
    tprs_upper = np.minimum(mean_tprs + tpr_std, 1)
    tprs_lower = mean_tprs - tpr_std
    plt.plot(base_fpr, mean_tprs, 'b')
    plt.fill_between(base_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title('ROC for stratified 5-fold CV (blue line = mean)')
    plt.savefig(output_plt_file)

    print(importance_df)
    return
param_grid = [{
    'clf_cv__min_samples_leaf': [1, 3, 5],
    # the second parameter name was lost in the post; 'clf_cv__n_estimators' is assumed here
    'clf_cv__n_estimators': [200, 500, 700, 1000, 1500, 2000],
}]
run_model_with_grid_search(param_grid=param_grid)
I get the error:
X has 14 features, but RandomForestClassifier is expecting 20 features as input
I can see other people have had this issue, and I think I understand the problem: it's because I do feature selection before I build a model, but then I don't transform my X test data to the same dimensions.
I was trying to implement solutions based on other people's work, e.g.:
search = GridSearchCV(pipeline,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True).fit(split_x_train,split_y_train)
best_model = search.best_estimator_
best_model_shap = search.best_estimator_['clf_cv'].fit_transform(split_x_train,split_y_train)
print(search.best_params_)
transformed_x_test = best_model.transform(split_x_test)
yhat = best_model.predict(transformed_x_test) #changed from best_model and split_x_test
But that leads to other errors (in this case, 'GridSearchCV' object has no attribute 'fit_transform'), so I'm just not clear how to adapt others' solutions to my issue.
Could someone please show me how to get this piece of code working? It should all work except for this error. Also, if someone could show me how to print which features are being selected by the feature selection process, that would be great, but I think I can figure that out once this starts working.
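One way to make the dimensions line up (a sketch only, reusing the step names 'feature_sele' and 'clf_cv' from the pipeline above): don't refit the bare classifier on the untransformed training split; instead pull the fitted RFECV step out of the best pipeline, use it to transform the test split before SHAP, and read the selected features from its support mask.
best_model = search.best_estimator_                      # fitted Pipeline (RFECV + classifier)
rfecv_fitted = best_model.named_steps['feature_sele']
clf_fitted = best_model.named_steps['clf_cv']            # already fitted on the reduced feature set

yhat = best_model.predict(split_x_test)                  # the pipeline applies the selection itself

# For SHAP, reduce the test split to the same selected features first
split_x_test_reduced = rfecv_fitted.transform(split_x_test)
explainer = shap.TreeExplainer(clf_fitted)
shap_values = explainer.shap_values(split_x_test_reduced)

# Indices of the original columns kept by the feature selection
print(np.where(rfecv_fitted.get_support())[0])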

How to compute false positive rate of an imbalanced dataset for Stratified K fold cross validation?

The lines below are sample code where I am able to compute accuracy, precision, recall, and F1 score. How can I also compute the false positive rate (FPR) with stratified K-fold cross-validation?
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier

scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score),
           'recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score)}

skfold = StratifiedKFold(n_splits=10)
dt_clf = DecisionTreeClassifier()
results = cross_validate(estimator=dt_clf,
                         X=data_train_X,
                         y=target_train_Y,
                         cv=skfold,
                         scoring=scoring)
print("Results", results)
You could define a custom scorer as follows:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification

def false_positive_rate(y_true, y_pred):
    # false positives
    fp = ((y_pred == 1) & (y_true == 0)).sum()
    # true negatives
    tn = ((y_pred == 0) & (y_true == 0)).sum()
    # false positive rate
    return fp / (fp + tn)

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score),
    'false_positive_rate': make_scorer(false_positive_rate),
}

skf = StratifiedKFold(n_splits=3)
clf = DecisionTreeClassifier(random_state=42)
X, y = make_classification(random_state=42)

results = cross_validate(estimator=clf, X=X, y=y, cv=skf, scoring=scoring)
print(results['test_false_positive_rate'])
# [0.11764706 0.11764706 0.0625]
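Each entry of the returned dictionary is a NumPy array with one value per fold, so the mean FPR across folds is simply:
print(results['test_false_positive_rate'].mean())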
I wrote this code with Logistic Regression. You can substitute it with any other binary classification algorithm you'd like.
# Importing required libraries
import numpy as np
from sklearn.model_selection import KFold
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

data = load_breast_cancer(as_frame=True)
df = data.frame
# shuffling (shuffle the frame once so X and y stay aligned)
df = df.sample(frac=1, random_state=0)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Implementing cross validation
kf = KFold(n_splits=10)  # , random_state=None
model = LogisticRegression(max_iter=1000000)  # (solver='liblinear')

acc_score = list()
res_tpr = list()
res_fpr = list()

for train_index, test_index in kf.split(X):
    # use the fold indices so each iteration evaluates a different split
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    tn, fp, fn, tp = confusion_matrix(y_test, pred_values, labels=[0, 1]).ravel()
    print(f'True Positives: {tp}')
    print(f'False Positives: {fp}')
    print(f'True Negatives: {tn}')
    print(f'False Negatives: {fn}')

    tpr = np.divide(tp, (tp + fn)) if (tp + fn) > 0 else 0
    fpr = np.divide(fp, (fp + tn)) if (fp + tn) > 0 else 0
    print('tpr=%.4f fpr=%.3f' % (tpr, fpr))
    res_tpr.append(tpr)
    res_fpr.append(fpr)
    print('---------------------')

    acc = accuracy_score(pred_values, y_test)
    acc_score.append(acc)

avg_acc_score = np.sum(acc_score) / 10
total_tpr = np.sum(res_tpr) / 10
total_fpr = np.sum(res_fpr) / 10

print('\n\n', ' total_tpr=%.4f total_fpr=%.3f' % (total_tpr, total_fpr))
# print('\n\n', 'accuracy of each fold - {}'.format(acc_score))
print('\n\n', 'Avg accuracy : {}'.format(avg_acc_score))

AttributeError: 'numpy.ndarray' object has no attribute 'score' error

I have tried to look for the problem, but I'm not seeing anything wrong here. What could it be? This is for binary classification with an SVM on the Fashion-MNIST data set, classifying only 5s and 7s.
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
trainset = 'mnist_train.xlsx'
trs = pd.read_excel(trainset)
testset = 'mnist_test.xlsx'
tes = pd.read_excel(testset)
xtrain = trs.iloc[:, [1, 783]]
ytrain = trs.iloc[:, 0]
xtest = tes.iloc[:, [1, 783]]
ytest = tes.iloc[:, 0]
##Linear SVC
svclassifier = SVC(kernel='linear', C=1)
svclassifier.fit(xtest, ytest)
ypred = svclassifier.predict(xtest)
print(ypred.score(xtrain, ytrain))
print(ypred.score(xtest, ytest))
##Gaussian SVC
svclassifier = SVC(kernel='rbf', C=1)
svclassifier.fit(xtrain, ytrain)
ypred = svclassifier.predict(xtest)
print(ypred.score(xtrain, ytrain))
print(ypred.score(xtest, ytest))
ypred is an array of predicted class labels, so the exception makes sense.
What you should do is use the classifier’s score method:
svclassifier = SVC(kernel='rbf', C=1)
svclassifier.fit(xtrain, ytrain)
# ypred = svclassifier.predict(xtest) # We don’t actually use this.
print(svclassifier.score(xtrain, ytrain))
print(svclassifier.score(xtest, ytest))
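If you do want to keep the predictions around, computing accuracy from them gives the same number on the test set (a small sketch):
from sklearn.metrics import accuracy_score

ypred = svclassifier.predict(xtest)
print(accuracy_score(ytest, ypred))  # equivalent to svclassifier.score(xtest, ytest)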

How to properly use sklearn's cross_validate with One Hot Encoded classes?

I created a model to classify my 8-class dataset and get some scores from it using an MLP. To do so, I decided to use sklearn.model_selection.cross_validate with 10 folds.
The following code works fine:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, make_scorer, f1_score
import pandas as pd

def MLPClasify(sample):
    df = pd.read_csv('my_path\\my_file.csv', header=None)
    y = df[NumberOfFeatures]
    x = df.drop([NumberOfFeatures], axis=1)
    clf = MLPClassifier(hidden_layer_sizes=(27), activation='logistic', max_iter=500, alpha=0.0001,
                        solver='adam', verbose=10, random_state=21, tol=0.000000001)
    clf.out_activation_ = 'softmax'
    scoring = {'Accuracy': make_scorer(accuracy_score),
               'F1': make_scorer(f1_score, average='weighted')}
    scores = cross_validate(clf, x, y, cv=10, scoring=scoring)
    return scores
Everything went OK; I was getting accuracies around 60%. So I decided to use one-hot encoding to see if I could get better results, and wrote the following code:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, make_scorer, f1_score
import pandas as pd

def MLPClasify(sample):
    df = pd.read_csv('my_path\\my_file.csv', header=None)
    y = df[NumberOfFeatures]
    x = df.drop([NumberOfFeatures], axis=1)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(y)
    onehot_encoder = OneHotEncoder()
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    y = onehot_encoded
    clf = MLPClassifier(hidden_layer_sizes=(27), activation='logistic', max_iter=500, alpha=0.0001,
                        solver='adam', verbose=10, random_state=21, tol=0.000000001)
    clf.out_activation_ = 'softmax'
    scoring = {'Accuracy': make_scorer(accuracy_score),
               'F1': make_scorer(f1_score, average='weighted')}
    scores = cross_validate(clf, x, y, cv=10, scoring=scoring)
    return scores
Well, the code runs, but I get the following warning:
UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use the zero_division parameter to control this behavior.
Also, my accuracy drops to less than 2%.
Any ideas on what I may be doing wrong?
Thanks for the help
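For what it's worth, sklearn classifiers expect the target as a 1-D array of class labels; MLPClassifier handles the encoding internally, so one-hot encoding y turns the task into a multilabel problem, which is likely what triggers the warning and the accuracy drop. A minimal self-contained sketch (synthetic data standing in for the CSV) passing plain labels:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, make_scorer, f1_score
from sklearn.datasets import make_classification

# Synthetic 8-class problem in place of the CSV from the question
x, y = make_classification(n_samples=800, n_features=27, n_informative=10,
                           n_classes=8, random_state=21)
clf = MLPClassifier(hidden_layer_sizes=(27,), activation='logistic',
                    max_iter=500, random_state=21)
scoring = {'Accuracy': make_scorer(accuracy_score),
           'F1': make_scorer(f1_score, average='weighted')}
scores = cross_validate(clf, x, y, cv=10, scoring=scoring)
print(scores['test_Accuracy'].mean())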

Adaboosting Ensemble learning

I am implementing the AdaBoost algorithm with a decision tree from the sklearn library, and when I predict the results I get an error. Any explanation? Thank you.
The error is:
AdaBoostClassifier with algorithm='SAMME.R' requires that the weak learner supports the calculation of class probabilities with a predict_proba method.
Please change the base estimator or set algorithm='SAMME' instead.
The dataset is from the UCI repository; you can access it from this link:
https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/
The code is as follows:
import numpy as np
from sklearn.model_selection import *
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

np.random.seed(1)
selected_features = ['sex', 'cp', 'fbs', 'exang']
X = datascaled[selected_features]
Y = datascaled['num']

# Split the dataset into train and test parts
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

param_grid = {"base_estimator__criterion": ["gini", "entropy"],
              "base_estimator__splitter": ["best", "random"],
              "n_estimators": [1, 2]}

DTC = DecisionTreeClassifier(random_state=11, max_features="auto", class_weight="balanced", max_depth=None)
ABC = AdaBoostClassifier(base_estimator=DTC)

# run grid search
model1 = GridSearchCV(ABC, param_grid=param_grid, scoring='accuracy')
model1.fit(X_train, y_train)

# The best hyper parameters set
print("Best Hyper Parameters:\n", model1.best_params_)
prediction = model1.predict(X_test)

# importing the metrics module
from sklearn import metrics
# evaluation (Accuracy)
print("Accuracy:", metrics.accuracy_score(prediction, y_test))
# evaluation (Confusion Matrix)
print("Confusion Matrix:\n", metrics.confusion_matrix(prediction, y_test))
#after tuning
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from time import *
from sklearn import metrics
n_folds=10
model=AdaBoostClassifier(random_state = 11,base_estimator= 'entropy', n_estimators= 2)
cv = 10
t0 = time()
y_pred = cross_val_predict(model, X=X, y=Y, n_jobs=-1, cv=cv)
t = time() - t0
print("=" * 52)
print("time cost: {}".format(t))
print()
print("confusion matrix\n", metrics.confusion_matrix(y, y_pred))
print()
print("\t\taccuracy: {}".format(metrics.accuracy_score(y, y_pred)))
print("\t\troc_auc_score: {}".format(metrics.roc_auc_score(y, y_pred)))
print("\t\tcohen_kappa_score: {}".format(metrics.cohen_kappa_score(y, y_pred)))
print()
print("\t\tclassification report")
print("-" * 52)
print(metrics.classification_report(y, y_pred))
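The message points at the weak learner: in the cross-validated run above, base_estimator='entropy' passes a string, which has no predict_proba, so algorithm='SAMME.R' rejects it. A sketch of the likely intent, an entropy-criterion decision tree as the weak learner:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# base_estimator must be an estimator object, not the string 'entropy';
# DecisionTreeClassifier implements predict_proba, which SAMME.R requires.
weak_learner = DecisionTreeClassifier(criterion='entropy', random_state=11, class_weight='balanced')
model = AdaBoostClassifier(base_estimator=weak_learner, n_estimators=2, random_state=11)
Alternatively, as the error message says, setting algorithm='SAMME' removes the predict_proba requirement.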
