I am trying to calculate roc_auc for a hard VotingClassifier that I built. I present the code with a reproducible example. I want to calculate the roc_auc score and plot the ROC curve, but unfortunately I got the following error: predict_proba is not available when voting='hard'.
# Voting Ensemble for Classification
import pandas
from sklearn import datasets
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer,confusion_matrix, f1_score, precision_score, recall_score, cohen_kappa_score,accuracy_score,roc_curve
import numpy as np
np.random.seed(42)
iris = datasets.load_iris()
X = iris.data[:, :4] # we take all four features.
Y = iris.target
print(Y)
seed = 7
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
model2 = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)
estimators.append(('RandomForest', model2))
model3 = MultinomialNB()
estimators.append(('NaiveBayes', model3))
model4 = SVC(probability=True)
estimators.append(('svm', model4))
model5 = DecisionTreeClassifier()
estimators.append(('Cart', model5))
# create the ensemble model
print('Majority Class Labels (Majority/Hard Voting)')
ensemble = VotingClassifier(estimators, voting='hard')
#accuracy
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold, scoring='accuracy')
y_pred = cross_val_predict(ensemble, X, Y, cv=10)
print("Accuracy ensemble model : %0.2f (+/- %0.2f)" % (results.mean(), results.std()))
print(results.mean())
#recall
recall_scorer = make_scorer(recall_score, labels=[1], average='macro')  # class-1 recall; pos_label only works for binary targets
recall = cross_val_score(ensemble, X, Y, cv=kfold, scoring=recall_scorer)
print('Recall', np.mean(recall), recall)
# Precision
precision_scorer = make_scorer(precision_score, labels=[1], average='macro')  # class-1 precision
precision = cross_val_score(ensemble, X, Y, cv=kfold, scoring=precision_scorer)
print('Precision', np.mean(precision), precision)
#f1_score
f1_scorer = make_scorer(f1_score, labels=[1], average='macro')  # class-1 F1
f1_scores = cross_val_score(ensemble, X, Y, cv=kfold, scoring=f1_scorer)  # avoid shadowing the imported f1_score
print('f1_score', np.mean(f1_scores), f1_scores)
#roc_auc_score
roc_auc_scores = cross_val_score(ensemble, X, Y, cv=kfold, scoring='roc_auc')
print('roc_auc_score', np.mean(roc_auc_scores), roc_auc_scores)
To calculate the roc_auc metric you first need to
Replace: ensemble = VotingClassifier(estimators, voting='hard')
with: ensemble = VotingClassifier(estimators, voting='soft').
Next, the last 2 lines of code will throw an error:
roc_auc_scores = cross_val_score(ensemble, X, Y, cv=kfold, scoring='roc_auc')
print('roc_auc_score', np.mean(roc_auc_scores), roc_auc_scores)
ValueError: multiclass format is not supported
This is normal since in Y you have 3 classes (np.unique(Y) == array([0, 1, 2])).
You can't use roc_auc as a single summary metric for multiclass models with the plain 'roc_auc' scorer. If you want, you can calculate **per-class roc_auc.**
How to solve this:
1) Use only two classes to calculate the roc_auc_score, or
2) use label binarization in advance, before calling roc_auc_score, and compute per-class scores, as sketched below.
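A minimal sketch of option 2, assuming you switch the ensemble to voting='soft' so that predict_proba is available (the choice of cv=10 is just illustrative):
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
# soft voting exposes predict_proba, which ROC AUC needs
ensemble_soft = VotingClassifier(estimators, voting='soft')
# out-of-fold class probabilities, shape (n_samples, n_classes)
y_proba = cross_val_predict(ensemble_soft, X, Y, cv=10, method='predict_proba')
# binarize the 3-class labels and compute one AUC per class
Y_bin = label_binarize(Y, classes=[0, 1, 2])
for i in range(Y_bin.shape[1]):
    print('class %d AUC: %.3f' % (i, roc_auc_score(Y_bin[:, i], y_proba[:, i])))
Recent scikit-learn versions also accept scoring='roc_auc_ovr' or 'roc_auc_ovo' in cross_val_score, which average one-vs-rest or one-vs-one AUCs for you.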
Related
I have to fit 40 time series in a Vector Autoregressive (VAR) model; the enormous number of variables suggests using a selection method. I would love to use the LASSO method, but I'm using statsmodels for the fitting, and the only way to implement LASSO with that library is for a linear regression model. Can someone help?
You can try using fit_regularized; it's like fitting an OLS, but you set L1_wt to 1 so that it is a lasso:
sm.OLS(..,..).fit_regularized(alpha=..,L1_wt=1)
We can check with an example; first load the Boston dataset:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import numpy as np
import statsmodels.api as sm
scaler = StandardScaler()
data = load_boston()
data_scaled = scaler.fit_transform(data.data)
X_train, X_test, y_train, y_test = train_test_split(data_scaled, data.target, test_size=0.33, random_state=42)
The code below shows that the two behave similarly; note that you need to tune the shrinkage parameter alpha in your model anyway:
alphas = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1]
mse_sklearn = []
mse_sm = []
for a in alphas:
    # sklearn lasso
    clf = linear_model.Lasso(alpha=a)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse_sklearn.append(mean_squared_error(y_test, y_pred))
    # statsmodels lasso: OLS with an L1-only elastic-net penalty
    mdl = sm.OLS(y_train, sm.add_constant(X_train)).fit_regularized(alpha=a, L1_wt=1)
    y_pred = mdl.predict(sm.add_constant(X_test))
    mse_sm.append(mean_squared_error(y_test, y_pred))
Visualize the results:
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot(alphas,mse_sm,label="sm")
ax.plot(alphas,mse_sklearn,label="sklearn")
ax.legend()
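As a further sanity check, you can compare the fitted coefficients directly at a single (hypothetical) alpha; with L1_wt=1 the statsmodels parameters should roughly track sklearn's coef_, apart from the intercept handling:
a = 0.1  # a hypothetical choice of alpha
clf = linear_model.Lasso(alpha=a).fit(X_train, y_train)
mdl = sm.OLS(y_train, sm.add_constant(X_train)).fit_regularized(alpha=a, L1_wt=1)
print(clf.coef_)       # sklearn lasso coefficients
print(mdl.params[1:])  # statsmodels coefficients, skipping the constant term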
I have the following code, where I predict a value from 4 input values:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
data = np.loadtxt('C:/Users/hedeg/Desktop/RulaSoftEdgePrediction.txt')
X_train = np.array(data[0:3500,0:4])
y_train = np.array(data[0:3500,4])
X_test = np.array(data[3500::,0:4])
y_test = np.array(data[3500::,4])
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_train)
I get this error message:
raise ValueError("Unknown label type: %s" % repr(ys))
ValueError: Unknown label type: (array([1. , 1.1, 1.2, ..., 3. , 3. , 3. ]),)
How can I solve this problem?
The error comes from passing continuous values (1.0, 1.1, 1.2, ...) as labels to MLPClassifier; classifiers need discrete class labels. Try a working classification setup like this one:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_blobs
# generate 2d classification dataset
X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1)
# fit final model
model = LogisticRegression()
model.fit(X, y)
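If your labels really are continuous, as the error message suggests, here is a minimal sketch of the two usual fixes, using made-up stand-in data since the original file isn't shared: encode the distinct values as discrete classes, or switch to a regressor:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier, MLPRegressor
# stand-in data shaped like the question's (4 inputs, float targets)
X_train = np.random.rand(100, 4)
y_train = np.random.choice([1.0, 1.1, 1.2, 3.0], size=100)
# option 1: treat each distinct value as a class label
y_classes = LabelEncoder().fit_transform(y_train)
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_classes)
# option 2: treat the target as continuous and use regression instead
reg = MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
reg.fit(X_train, y_train)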
I created a model to classify my 8-class dataset and get some scores from it using an MLP. To do so, I decided to use sklearn.model_selection.cross_validate with 10 folds.
The following code works fine:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, make_scorer, f1_score
import pandas as pd
def MLPClasify(sample):
    df = pd.read_csv('my_path\\my_file.csv', header=None)
    y = df[NumberOfFeatures]
    x = df.drop([NumberOfFeatures], axis=1)
    clf = MLPClassifier(hidden_layer_sizes=(27), activation='logistic', max_iter=500, alpha=0.0001,
                        solver='adam', verbose=10, random_state=21, tol=0.000000001)
    clf.out_activation_ = 'softmax'
    scoring = {'Accuracy': make_scorer(accuracy_score),
               'F1': make_scorer(f1_score, average='weighted')}
    scores = cross_validate(clf, x, y, cv=10, scoring=scoring)
    return scores
Everything went OK and I was getting accuracies around 60%, so I decided to try one-hot encoding to see if I could get better results. I wrote the following code:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, make_scorer, f1_score
import pandas as pd
def MLPClasify(sample):
    df = pd.read_csv('my_path\\my_file.csv', header=None)
    y = df[NumberOfFeatures]
    x = df.drop([NumberOfFeatures], axis=1)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(y)
    onehot_encoder = OneHotEncoder()
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    y = onehot_encoded
    clf = MLPClassifier(hidden_layer_sizes=(27), activation='logistic', max_iter=500, alpha=0.0001,
                        solver='adam', verbose=10, random_state=21, tol=0.000000001)
    clf.out_activation_ = 'softmax'
    scoring = {'Accuracy': make_scorer(accuracy_score),
               'F1': make_scorer(f1_score, average='weighted')}
    scores = cross_validate(clf, x, y, cv=10, scoring=scoring)
    return scores
Well, the code runs, but I get the following warning:
UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use zero_division parameter to control this behavior.
Also, my accuracy drops to less than 2%
Any ideas on what I may be doing wrong?
Thanks for the help
I have tried the following code and this error keeps occurring.
The link for the DataSet is below.
ValueError
---> line 18 ds1_model.fit(X, y)
ValueError: could not convert string to float: 'Iris-setosa'
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv'
ds1 = pd.read_csv(url)
ds1.columns = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'ClassLabel']
ds1_filtered = ds1.dropna(axis=0)
y = ds1_filtered.ClassLabel
ds1_features = ['SepalLength' , 'SepalWidth' , 'PetalLength' , 'PetalWidth']
X = ds1_filtered[ds1_features]
ds1_model = DecisionTreeRegressor()
ds1_model.fit(X, y)
PredictedClassLabel = ds1_model.predict(X)
mean_absolute_error(y, PredictedClassLabel)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
ds1_model = DecisionTreeRegressor()
ds1_model.fit(train_X, train_y)
predictions = ds1_model.predict(val_X)
print(mean_absolute_error(val_y, predictions))
Can you please suggest or explain how to fix this?
DataSet Link
As the name ClassLabel implies, the iris dataset is a classification one and not a regression one; hence, DecisionTreeRegressor is not the correct model to use, nor is mean_absolute_error the correct metric.
You should use a DecisionTreeClassifier and accuracy_score instead:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
iris = load_iris()
clf = DecisionTreeClassifier()
train_X, val_X, train_y, val_y = train_test_split(iris.data, iris.target, random_state=0)
clf.fit(train_X, train_y)
pred = clf.predict(val_X)
print(accuracy_score(val_y, pred))
The scikit-learn decision tree classification tutorial using the said dataset can give you more ideas.
I have 4 features and one target variable. I am using RandomForestRegressor instead of RandomForestClassifier because my target variable is a float. When I try to fit my model and then output the features in sorted order of importance, I get a NotFittedError. How do I fix it?
Code:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
# Split the data into 30% test and 70% training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
feat_labels = data.columns[:4]
regr = RandomForestRegressor(max_depth=2, random_state=0)
#clf = RandomForestClassifier(n_estimators=100, random_state=0)
# Train the classifier
#clf.fit(X_train, y_train)
regr.fit(X, y)
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
You are fitting regr but calling feature_importances_ on clf. Try this instead:
importances = regr.feature_importances_
I noticed that previously your classifier was being fit with the training data you set up, but the regressor is now being fit with X and y.
However, I don't see where you set X and y in the first place, or where you actually load in a dataset. Could it be you forgot this step, as well as what Harpal mentioned in the other answer?
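For completeness, a minimal runnable sketch putting both points together; the diabetes dataset is only a stand-in, since the question never shows where X, y, or data come from:
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# stand-in data: the original X, y, and data are never defined
data = load_diabetes()
X, y = data.data[:, :4], data.target
feat_labels = data.feature_names[:4]
# Split the data into 30% test and 70% training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X_train, y_train)
# ask the fitted regressor, not the commented-out clf, for its importances
importances = regr.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))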