sklearn LinearRegression, Ridge and Lasso produce incompatible output shapes? - python

I am playing a bit with different regression models on the Boston housing dataset. I found that if I use a normal linear model or ridge regression, the predicted values have shape (102, 1), while if I use the identical code with Lasso the output has shape (102,). Why is that? This then makes the code crash with ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all() in the pearsonr line.
Any idea on how to make the code below run smoothly?
from sklearn.datasets import load_boston
import numpy as np
import pandas as pd
import sys
def evalOneModel(model, name, X, y, nRuns):
    allMse = []
    allR2 = []
    all_rho_P = []

    ################ OLS ################
    for i in range(nRuns):
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)
        model = model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        allMse.append(mse)
        allR2.append(r2)
        print(type(y_test))
        print(y_test.shape)
        print(type(predictions))
        print(predictions.shape)
        rhoP, pval = pearsonr(y_test, predictions)
        rhoP = rhoP[0]
        all_rho_P.append(rhoP)
        print("run{}={:0.3f}; ".format(i, rhoP), end="")
    print(model.coef_)
    myTitle = "{} mean={:0.3f}".format(name, np.mean(all_rho_P))
    print("")
    print(myTitle)
    print("")
    sys.stdout.flush()
####### MAIN #####
pd.set_option('expand_frame_repr', False)
bosten_data = load_boston()
df = pd.DataFrame(bosten_data.data, columns=bosten_data.feature_names)
df['MEDV'] = bosten_data.target # add the target to the data frame
target = pd.DataFrame(bosten_data.target, columns=["MEDV"])
norm_df = (df - df.mean()) / df.std()
norm_target = (target - target.mean()) / target.std()
X = norm_df[["RM", "AGE", "PTRATIO", "LSTAT"]]
y = norm_target
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr, pearsonr
print("\n\nstarting runs ...\n")
from sklearn import linear_model
model = linear_model.LinearRegression()
evalOneModel (model, "OLS", X, y, 1)
from sklearn.linear_model import Ridge # L2
model = linear_model.Ridge(alpha=1.0)
evalOneModel (model, "Ridge (alpha=1)", X, y, 1)
from sklearn.linear_model import Lasso # L1
model = linear_model.Lasso(alpha=1.0)
evalOneModel (model, "Lasso (alpha=1)", X, y, 1)

Related

f_importances function in sklearn

I found this question here, which seems to address my problem (Determining the most contributing features for SVM classifier in sklearn).
However, as my understanding of the Python language is limited, I need some help.
I have a dependent variable, 'Group', which has two levels: 'Group1' and 'Group2'.
This is the code I found, adapted to my data:
import pandas as pd
df = pd.read_csv('C:/Users/myPC/OneDrive/Desktop/analysis/dataframe6.csv')
X = df.drop('Group', axis=1)
y = df['Group']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
from matplotlib import pyplot as plt
from sklearn import svm
def f_importances(coef, names):
    imp = coef
    imp, names = zip(*sorted(zip(imp, names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()
features_names = ['input1', 'input2']
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
f_importances(svclassifier.coef_, features_names)
It produces just a blank plot.
I think there is something I should change in features_names = ['input1', 'input2'] but I am not sure what.
The code you used to plot expects a one-dimensional array. The attribute coef_, according to the documentation, will be:
coef_ : ndarray of shape (n_classes * (n_classes - 1) / 2, n_features)
Weights assigned to the features when kernel="linear".
Using an example:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
np.random.seed(123)
df = pd.DataFrame(np.random.uniform(0,1,(400,3)),columns=['input1','input2','input3'])
df['Group'] = np.random.choice(['Group1','Group2'],400)
X = df.drop('Group', axis=1)
y = df['Group']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
We check the shape of the array:
print(svclassifier.coef_.shape)
(1, 3)
Because you have only 2 classes, there is only 1 row. We can do:
from matplotlib import pyplot as plt
from sklearn import svm
def f_importances(coef, names):
    imp = coef
    imp, names = zip(*sorted(zip(imp, names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()
features_names = ['input1', 'input2','input3']
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)
f_importances(svclassifier.coef_[0], features_names)
This is the plot I got:

Improve precision of my predictive technique in Python

I am using the following Python code to make output predictions depending on some values, using decision trees based on entropy / the Gini index. My input data is contained in the file: https://drive.google.com/file/d/1C8GZ2wiqFUW3WuYxyc0G3axgkM1Uwsb6/view?usp=sharing
The first column "gold" in the file contains the output that I am trying to predict (either T or N). The remaining columns contain 0/1 data that I can use to predict the first column. I am using a test set of 30% and a training set of 70%, and I get the same precision/recall whether I use entropy or the Gini index: a precision of 0.80 for T and a recall of 0.54 for T.
I would like to increase the precision of T, and I am okay if the recall for T goes down as well; I am willing to accept this tradeoff. I do not care about the precision/recall of the N predictions; improving the precision of T is all I care about. I guess increasing the precision means that we should abstain from making predictions in situations we are not certain about. How do I do that?
# Run this program on your local python
# interpreter, provided you have installed
# the required libraries.
# Importing the required packages
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
from sklearn import tree
import collections
import pydotplus
# Function importing Dataset
column_count = 0
def importdata():
    balance_data = pd.read_csv('data1extended.txt', sep=',')
    row_count, column_count = balance_data.shape
    # Printing the dataset shape
    print("Dataset Length: ", len(balance_data))
    print("Dataset Shape: ", balance_data.shape)
    print("Number of columns ", column_count)
    # Printing the dataset observations
    print("Dataset: ", balance_data.head())
    return balance_data, column_count

def columns(balance_data):
    row_count, column_count = balance_data.shape
    return column_count

# Function to split the dataset
def splitdataset(balance_data, column_count):
    # Separating the target variable
    X = balance_data.values[:, 1:column_count]
    Y = balance_data.values[:, 0]
    # Splitting the dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=100)
    return X, Y, X_train, X_test, y_train, y_test
# Function to perform training with giniIndex.
def train_using_gini(X_train, X_test, y_train):
    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(criterion="gini", random_state=100,
                                      max_depth=3, min_samples_leaf=5)
    # Performing training
    clf_gini.fit(X_train, y_train)
    return clf_gini

# Function to perform training with entropy.
def tarin_using_entropy(X_train, X_test, y_train):
    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(
        criterion="entropy", random_state=100,
        max_depth=3, min_samples_leaf=5)
    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy

# Function to make predictions
def prediction(X_test, clf_object):
    # Prediction on test data
    y_pred = clf_object.predict(X_test)
    print("Predicted values:")
    print(y_pred)
    return y_pred

# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
    print("Confusion Matrix: ",
          confusion_matrix(y_test, y_pred))
    print("Accuracy : ",
          accuracy_score(y_test, y_pred) * 100)
    print("Report : ",
          classification_report(y_test, y_pred))
#Univariate selection
def selection(column_count, data):
    # data = pd.read_csv("data1extended.txt")
    X = data.iloc[:, 1:column_count]  # independent columns
    y = data.iloc[:, 0]  # target column
    # apply SelectKBest class to extract the top 5 features
    bestfeatures = SelectKBest(score_func=chi2, k=5)
    fit = bestfeatures.fit(X, y)
    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(X.columns)
    df = pd.DataFrame(data, columns=X)
    # concat two dataframes for better visualization
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
    print(featureScores.nlargest(5, 'Score'))  # print the 5 best features
    return X, y, data, df

# Feature importance
def feature(X, y):
    model = ExtraTreesClassifier()
    model.fit(X, y)
    print(model.feature_importances_)  # built-in feature_importances_ of tree-based classifiers
    # plot graph of feature importances for better visualization
    feat_importances = pd.Series(model.feature_importances_, index=X.columns)
    feat_importances.nlargest(5).plot(kind='barh')
    plt.show()

# Correlation Matrix
def correlation(data, column_count):
    corrmat = data.corr()
    top_corr_features = corrmat.index
    plt.figure(figsize=(column_count, column_count))
    # plot heat map
    g = sns.heatmap(data[top_corr_features].corr(), annot=True, cmap="RdYlGn")

def generate_decision_tree(X, y):
    clf = DecisionTreeClassifier(random_state=0)
    data_feature_names = ['callersAtLeast1T','CalleesAtLeast1T','callersAllT','calleesAllT','CallersAtLeast1N','CalleesAtLeast1N','CallersAllN','CalleesAllN','childrenAtLeast1T','parentsAtLeast1T','childrenAtLeast1N','parentsAtLeast1N','childrenAllT','parentsAllT','childrenAllN','ParentsAllN','ParametersatLeast1T','FieldMethodsAtLeast1T','ReturnTypeAtLeast1T','ParametersAtLeast1N','FieldMethodsAtLeast1N','ReturnTypeN','ParametersAllT','FieldMethodsAllT','ParametersAllN','FieldMethodsAllN']
    # generate model
    model = clf.fit(X, y)
    # Create DOT data
    dot_data = tree.export_graphviz(clf, out_file=None,
                                    feature_names=data_feature_names,
                                    class_names=y)
    # Draw graph
    graph = pydotplus.graph_from_dot_data(dot_data)
    # Show graph
    Image(graph.create_png())
    # Create PDF
    graph.write_pdf("tree.pdf")
    # Create PNG
    graph.write_png("tree.png")
# Driver code
def main():
    # Building Phase
    data, column_count = importdata()
    X, Y, X_train, X_test, y_train, y_test = splitdataset(data, column_count)
    clf_gini = train_using_gini(X_train, X_test, y_train)
    clf_entropy = tarin_using_entropy(X_train, X_test, y_train)
    # Operational Phase
    print("Results Using Gini Index:")
    # Prediction using gini
    y_pred_gini = prediction(X_test, clf_gini)
    cal_accuracy(y_test, y_pred_gini)
    print("Results Using Entropy:")
    # Prediction using entropy
    y_pred_entropy = prediction(X_test, clf_entropy)
    cal_accuracy(y_test, y_pred_entropy)
    # COMMENTED OUT THE 4 FOLLOWING LINES DUE TO MEMORY ERROR
    # X,y,dataheaders,df=selection(column_count,data)
    # generate_decision_tree(X,y)
    # feature(X,y)
    # correlation(dataheaders,column_count)

# Calling main function
if __name__ == "__main__":
    main()
I would suggest using a Pipeline to chain the preprocessing and the classifier, and GridSearchCV to find the best possible hyper-parameters for the pipe.
A basic example:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([('kbest', SelectKBest(chi2, k=5)),
                 ('clf', DecisionTreeClassifier())])

pipe_params = {'kbest__k': range(1, 10, 1),
               'kbest__score_func': [f_classif, chi2],
               'clf__max_depth': np.arange(1, 30),
               'clf__min_samples_leaf': [1, 2, 4, 5, 10, 20, 30, 40, 80, 100]}

grid_search = GridSearchCV(pipe, pipe_params, n_jobs=-1,
                           scoring='accuracy', cv=10)
grid_search.fit(X_train, y_train)
This will iterate over every combination of hyper-parameters in pipe_params and choose the best pipeline based on accuracy.
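As a follow-up (my own sketch, not part of the original answer; it assumes the X_test/y_test split returned by the question's splitdataset), the fitted grid search exposes the winning combination and a refit pipeline you can evaluate on the held-out data, including the per-class precision of T. Since the goal is higher precision for T at the cost of recall, one option is to require a higher predicted probability before labelling a sample as T:
# Hedged sketch: inspect what GridSearchCV found and check the precision of T.
from sklearn.metrics import classification_report

print(grid_search.best_params_)           # best k, score_func, max_depth, min_samples_leaf
print(grid_search.best_score_)            # mean cross-validated accuracy of that combination

best_pipe = grid_search.best_estimator_   # already refit on the whole training set
y_pred = best_pipe.predict(X_test)
print(classification_report(y_test, y_pred))   # precision/recall for T and N

# Optional: only predict T when the model is confident (0.8 is an arbitrary example threshold).
probas = best_pipe.predict_proba(X_test)
t_index = list(best_pipe.classes_).index('T')
confident_T = probas[:, t_index] > 0.8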

python - the evaluation index values differ greatly between cross_validate and train_test_split cases

I wrote a program that uses Support Vector Regression (SVR) to predict. First, I split the dataset into a train set and a test set, with a test ratio of 20% (case 1); second, I use cross-validation, splitting the dataset into 5 folds (case 2). However, using the same evaluation indices (R2, MAE, MSE) to evaluate the two methods, the results are quite different.
The program is as follows:
dataset = pd.read_csv('Dataset/allGlassStraightThroughTube.csv')
tube_par = dataset.iloc[:, 3:8].values
tube_eff = dataset.iloc[:, -1:].values
# # form train dataset , test dataset
tube_par_X_train, tube_par_X_test, tube_eff_Y_train, tube_eff_Y_test = train_test_split(tube_par, tube_eff, random_state=33, test_size=0.2)
# normalize the data
sc_X = StandardScaler()
sc_Y = StandardScaler()
sc_tube_par_X_train = sc_X.fit_transform(tube_par_X_train)
sc_tube_par_X_test = sc_X.transform(tube_par_X_test)
sc_tube_eff_Y_train = sc_Y.fit_transform(tube_eff_Y_train)
sc_tube_eff_Y_test = sc_Y.transform(tube_eff_Y_test)
# fit rbf SVR to the sc_tube_par_X dataset
support_vector_regressor = SVR(kernel='rbf')
support_vector_regressor.fit(sc_tube_par_X_train, sc_tube_eff_Y_train)
#
# # predict new result according to the sc_tube_par_X Dataset
pre_sc_tube_eff_Y_test = support_vector_regressor.predict(sc_tube_par_X_test)
pre_tube_eff_Y_test = sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)
# calculate the predict quality
print('R2-score value rbf SVR')
print(r2_score(sc_Y.inverse_transform(sc_tube_eff_Y_test), sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)))
print('The mean squared error of rbf SVR is')
print(mean_squared_error(sc_Y.inverse_transform(sc_tube_eff_Y_test), sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)))
print('The mean absolute error of rbf SVR is')
print(mean_absolute_error(sc_Y.inverse_transform(sc_tube_eff_Y_test), sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)))
# normalize
sc_tube_par_X = sc_X.fit_transform(tube_par)
sc_tube_eff_Y = sc_Y.fit_transform(tube_eff)
scoring = ['r2','neg_mean_squared_error', 'neg_mean_absolute_error']
rbf_svr_regressor = SVR(kernel='rbf')
scores = cross_validate(rbf_svr_regressor, sc_tube_par_X, sc_tube_eff_Y, cv=5, scoring=scoring, return_train_score=False)
In case 1, the evaluation index output is:
R2-score value rbf SVR
0.6486074476528559
The mean squared error of rbf SVR is
0.00013501023459497165
The mean absolute error of rbf SVR is
0.007196636233830076
In case 2, the evaluation index output is:
R2-score
0.2621779727614816
test_neg_mean_squared_error
-0.6497292887710239
test_neg_mean_absolute_error
-0.5629408849740231
The difference between case 1 and case 2 is big. Could you please tell me the reason and how to correct it?
Bin.
I have prepared a little example to see how the results change using cross-validation. I recommend you try to split the data without a seed and see how the results change.
You will see that the cross-validation results are almost constant, independently of the data split.
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_validate
#from sklearn.cross_validation import train_test_split
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
import matplotlib.pyplot as plt
def print_metrics(real_y, predicted_y):
    # calculate the predict quality
    print('R2-score value {:>8.4f}'.format(r2_score(real_y, predicted_y)))
    print('Mean squared error is {:>8.4f}'.format(mean_squared_error(real_y, predicted_y)))
    print('Mean absolute error is {:>8.4f}\n\n'.format(mean_absolute_error(real_y, predicted_y)))

def show_plot(real_y, predicted_y):
    fig, ax = plt.subplots()
    ax.scatter(real_y, predicted_y, edgecolors=(0, 0, 0))
    ax.plot([real_y.min(), real_y.max()], [real_y.min(), real_y.max()], "k--", lw=4)
    ax.set_xlabel("Measured")
    ax.set_ylabel("Predicted")
    plt.show()
# dataset load
boston = datasets.load_boston()
#dataset info
# print(boston.keys())
# print(boston.DESCR)
# print(boston.data.shape)
# print(boston.feature_names)
# numpy_arrays
X = boston.data
Y = boston.target
# # form train dataset , test dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=33, test_size=0.2)
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=5, test_size=0.2)
# fit scalers
sc_X = StandardScaler().fit(X_train)
# standardizes X (train and test)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)
######################################################################
############################### SVR ##################################
######################################################################
support_vector_regressor = SVR(kernel='rbf')
support_vector_regressor.fit(X_train, Y_train)
predicted_Y = support_vector_regressor.predict(X_test)
print_metrics(predicted_Y,Y_test)
show_plot(predicted_Y,Y_test)
######################################################################
########################### LINEAR REGRESSOR #########################
######################################################################
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)
predicted_Y = lin_model.predict(X_test)
print_metrics(predicted_Y,Y_test)
show_plot(predicted_Y,Y_test)
######################################################################
######################### SVR + CROSS VALIDATION #####################
######################################################################
sc = StandardScaler().fit(X)
standarized_X = sc.transform(X)
scoring = ['r2','neg_mean_squared_error', 'neg_mean_absolute_error']
rbf_svr_regressor = SVR(kernel='rbf')
scores = cross_validate(rbf_svr_regressor, standarized_X, Y, cv=10, scoring=scoring, return_train_score=False)
print(scores["test_r2"].mean())
print(-1*(scores["test_neg_mean_squared_error"].mean()))
print(-1*(scores["test_neg_mean_absolute_error"].mean()))

How to increase the model accuracy of multiple linear regression

This is the custom code
#Custom model for multiple linear regression
import numpy as np
import pandas as pd
dataset = pd.read_csv("50s.csv")
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,4:5].values
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
x[:,3] = lb.fit_transform(x[:,3])
from sklearn.preprocessing import OneHotEncoder
on = OneHotEncoder(categorical_features=[3])
x = on.fit_transform(x).toarray()
x = x[:,1:]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=1/5, random_state=0)
con = np.matrix(X_train)
z = np.matrix(y_train)
#training model
result1 = con.transpose()*con
result1 = np.linalg.inv(result1)
p = con.transpose()*z
f = result1*p
l = []
for i in range(len(X_test)):
    temp = f[0]*X_test[i][0] + f[1]*X_test[i][1] + f[2]*X_test[i][2] + f[3]*X_test[i][3] + f[4]*X_test[i][4]
    l.append(temp)
import matplotlib.pyplot as plt
plt.scatter(y_test,l)
plt.show()
Then I created a model with scikit-learn and compared its results with y_test and l (the predicted values from the code above).
The comparisons are as follows:
for i in range(len(prediction)):
    print(y_test[i], prediction[i], l[i], sep=' ')
103282.38 103015.20159795816 [[116862.44205399]]
144259.4 132582.27760816005 [[118661.40080974]]
146121.95 132447.73845175043 [[124952.97891882]]
77798.83 71976.09851258533 [[60680.01036438]]
These are the comparisons between y_test, the scikit-learn model predictions, and the custom-code predictions.
Please help me improve the accuracy of the model.
blue: custom model predictions
yellow: scikit-learn model predictions
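A likely cause (my own observation, not part of the original post) is that the normal-equation code fits no intercept, while scikit-learn's LinearRegression does. A minimal sketch of adding a bias column to the custom model, assuming the same X_train, X_test and y_train from the code above:
import numpy as np

# Hedged sketch: prepend a column of ones so the normal equation also learns an intercept.
X_train_b = np.hstack([np.ones((len(X_train), 1)), X_train])
X_test_b = np.hstack([np.ones((len(X_test), 1)), X_test])

# Normal equation: f = (X^T X)^(-1) X^T y
f = np.linalg.inv(X_train_b.T @ X_train_b) @ X_train_b.T @ y_train

# Vectorized predictions; f[0] is the intercept, the rest are the slopes.
l = X_test_b @ f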

roc_auc in VotingClassifier, RandomForestClassifier in scikit-learn (sklearn)

I am trying to calculate roc_auc for the hard VotingClassifier that I built. I present the code with a reproducible example. I want to calculate the roc_auc score and plot the ROC curve, but unfortunately I get the following error: predict_proba is not available when voting='hard'.
# Voting Ensemble for Classification
import pandas
from sklearn import datasets
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer,confusion_matrix, f1_score, precision_score, recall_score, cohen_kappa_score,accuracy_score,roc_curve
import numpy as np
np.random.seed(42)
iris = datasets.load_iris()
X = iris.data[:, :4] # we only take the first four features.
Y = iris.target
print(Y)
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
model2 = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
estimators.append(('RandomForest', model2))
model3 = MultinomialNB()
estimators.append(('NaiveBayes', model3))
model4=SVC(probability=True)
estimators.append(('svm', model4))
model5=DecisionTreeClassifier()
estimators.append(('Cart', model5))
# create the ensemble model
print('Majority Class Labels (Majority/Hard Voting)')
ensemble = VotingClassifier(estimators,voting='hard')
#accuracy
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold,scoring='accuracy')
y_pred = cross_val_predict(ensemble, X ,Y, cv=10)
print("Accuracy ensemble model : %0.2f (+/- %0.2f) " % (results.mean(), results.std() ))
print(results.mean())
#recall
recall_scorer = make_scorer(recall_score, pos_label=1)
recall = cross_val_score(ensemble, X, Y, cv=kfold, scoring=recall_scorer)
print('Recall', np.mean(recall), recall)
# Precision
precision_scorer = make_scorer(precision_score, pos_label=1)
precision = cross_val_score(ensemble, X, Y, cv=kfold, scoring=precision_scorer)
print('Precision', np.mean(precision), precision)
#f1_score
f1_scorer = make_scorer(f1_score, pos_label=1)
f1_score = cross_val_score(ensemble, X, Y, cv=kfold, scoring=f1_scorer)
print('f1_score ', np.mean(f1_score ),f1_score )
#roc_auc_score
roc_auc_score = cross_val_score(ensemble, X, Y, cv=kfold, scoring='roc_auc')
print('roc_auc_score ', np.mean(roc_auc_score ),roc_auc_score )
To calculate the roc_auc metric you first need to:
Replace: ensemble = VotingClassifier(estimators,voting='hard')
with: ensemble = VotingClassifier(estimators,voting='soft').
Next, the last 2 lines of code will throw an error:
roc_auc_score = cross_val_score(ensemble, X, Y, cv=3, scoring='roc_auc')
print('roc_auc_score ', np.mean(roc_auc_score ),roc_auc_score )
ValueError: multiclass format is not supported
This is normal since in Y you have 3 classes (np.unique(Y) == array([0, 1, 2])).
You can't use roc_auc as a single summary metric for multiclass models. If you want, you could calculate per-class roc_auc.
How to solve this:
1) Use only two classes to calculate the roc_auc_score
2) Use label binarization in advance before calling roc_auc_score
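For option 2, a minimal sketch (my own, assuming the estimators, X and Y defined in the question, and voting='soft' so probabilities are available) of a per-class roc_auc via label binarization:
# Hedged sketch: binarize the labels and compute a per-class roc_auc
# from cross-validated probabilities (requires voting='soft').
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict

ensemble_soft = VotingClassifier(estimators, voting='soft')
probas = cross_val_predict(ensemble_soft, X, Y, cv=10, method='predict_proba')

Y_bin = label_binarize(Y, classes=[0, 1, 2])     # shape (n_samples, 3)
for class_idx in range(Y_bin.shape[1]):
    auc = roc_auc_score(Y_bin[:, class_idx], probas[:, class_idx])
    print("class {}: roc_auc = {:.3f}".format(class_idx, auc))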
