I have a binary classification problem and have been using cross validation to optimize the ElasticNet parameters. However, ElasticNet only seems to work when I supply roc_auc as the scoring method for CV, and I also want to test a wide range of scoring methods, in particular accuracy. Specifically, when using accuracy, ElasticNet returns this error:
ValueError: Classification metrics can't handle a mix of binary and continuous targets
However, my y targets are indeed binary. Below is a reproduction of my problem using the dataset from here:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
data = pd.read_csv('data 2.csv')
# by default majority class (benign) will be negative
lb = LabelBinarizer()
data['diagnosis'] = lb.fit_transform(data['diagnosis'].values)
targets = data['diagnosis']
data.drop(['id', 'diagnosis', 'Unnamed: 32'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(data, targets, stratify=targets)
#elastic net logistic regression
lr = ElasticNet(max_iter=2000)
scorer = 'accuracy'
param_grid = {
'alpha': [1e-4, 1e-3, 1e-2, 0.01, 0.1, 1, 5, 10],
'l1_ratio': np.arange(0.2, 0.9, 0.1)
}
skf = StratifiedKFold(n_splits=10)
clf = GridSearchCV(lr, param_grid, scoring=scorer, cv=skf, return_train_score=True,
n_jobs=-1)
clf.fit(X_train.values, y_train.values)
I figured that ElasticNet might be trying to solve a linear regression problem so I tried lr = LogisticRegression(penalty='elasticnet', l1_ratios=[0.1, 0.5, 0.9], solver='saga') as the classifier but the same problem persists.
If I use as the scoring metric scorer = 'roc_auc' then the model is built as expected.
Also, as a sanity check to see if there is something wrong with the data, I tried the same thing with a random forest classifier, and here the problem disappears:
# random forest
clf = RandomForestClassifier(n_jobs=-1)
param_grid = {
'min_samples_split': [3, 5, 10],
'n_estimators' : [100, 300],
'max_depth': [3, 5, 15, 25],
'max_features': [3, 5, 10, 20]
}
skf = StratifiedKFold(n_splits=10)
scorer = 'accuracy'
grid_search = GridSearchCV(clf, param_grid, scoring=scorer,
cv=skf, return_train_score=True, n_jobs=-1)
grid_search.fit(X_train.values, y_train.values)
Has anyone got any ideas on what's happening here?
ElasticNet is a regression model: its predictions are continuous values, which is why classification metrics such as accuracy raise the error above, while roc_auc (which accepts continuous scores) appears to work.
If you want an ElasticNet penalty in classification, use LogisticRegression instead:
lr = LogisticRegression(solver="saga", penalty="elasticnet")
Minimal Reproducible Example:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
lr = LogisticRegression(solver="saga", penalty="elasticnet", max_iter=2000)
param_grid = {
'l1_ratio': np.arange(0.2, 0.9, 0.1)
}
clf = GridSearchCV(lr, param_grid, scoring='accuracy', cv=StratifiedKFold(n_splits=10), return_train_score=True, n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
Related
I am trying to use grid search cross validation to find the best value of the hyperparameter C. I split the data set into two subsets, each containing 50% of MNIST 784, and used only one of the two subsets, with 60% for training and 40% for testing respectively.
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
import numpy as np
mnist = fetch_openml('mnist_784')
X, y = mnist['data'], mnist['target']
X_1, X_2, y_1, y_2 = train_test_split(X, y, test_size=0.5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.4)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
svm = LinearSVC(dual=False, max_iter=10000)
param_grid = {'C': [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]}
grid = GridSearchCV(svm, param_grid, scoring='accuracy')
grid.fit(X_train_scaled, y_train)
print("Best value of C:", grid.best_params_['C'])
accuracy = grid.score(X_test_scaled, y_test)
print("Test accuracy:", accuracy)
I have tried everything without any progress.
I tried reducing the data set and then normalizing it, and also tried increasing max_iter to 10000 and 15000.
I have the following unbalanced data set with two features (keon, i.e. gender, and alder, i.e. age), which I balanced using an under-sampling method and then trained with different classifiers to predict call_ending_reason, where 0 is No and 1 is Yes:
In the balanced dataset, both the 1s and the 0s have the same kind of distribution, which can be visualized like this:
However, after applying the under-sampling method to the dataset shown above and training both versions of the dataset with various sklearn classifiers, the balanced dataset detects 1s with high precision but 0s with very low precision. The opposite happens when I use the original dataset.
Here is the code:
x = filtered_data_limited_features_with_yes_no
y = filtered_data_limited_features_with_yes_no['call_ending_reason']
del x['call_ending_reason']
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size = 0.80)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
# rfc = MLPClassifier(verbose=True,hidden_layer_sizes=(100,50,10),learning_rate='constant',learning_rate_init=0.0001, n_iter_no_change=50, max_iter=100)
# rfc = GaussianNB()
rfc=RandomForestClassifier()
param_grid = {
'n_estimators': [50,100,200,500],
'max_features': ['auto', 'sqrt', 'log2'],
'criterion' :['gini', 'entropy']
}
CV_rfc_all_data = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 10)
# rfc = LinearSVC()
CV_rfc_all_data.fit(X_train, y_train)
from sklearn.metrics import classification_report
print(classification_report(y_test, CV_rfc_all_data.predict(X_test)))
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler( random_state=1)
df_balanced, balanced_labels = ros.fit_resample(x, y)
####TRAINING AND PREDICTING CLASSIFIER BASED ON BALANCED DATASET
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(df_balanced, balanced_labels, train_size = 0.70)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
# rfc=RandomForestClassifier()
# param_grid = {
# 'n_estimators': [50,100,200,500],
# 'max_features': ['auto', 'sqrt', 'log2'],
# 'criterion' :['gini', 'entropy']
# }
# CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 10)
# CV_rfc = MLPClassifier(verbose=True,hidden_layer_sizes=(100,50,10),learning_rate='invscaling',learning_rate_init=0.0003, n_iter_no_change=50, max_iter=100)
CV_rfc = DecisionTreeClassifier()
CV_rfc.fit(X_train, y_train)
# CV_rfc.best_params_
Questions:
Given the visualization:
Which classifier should be used to achieve more than 65% precision for predicting both 1 and 0?
Do I need to scale the data given that there are only 2 features? If so, how do I properly scale both the training and testing data?
You can try setting the class_weight="balanced" argument on the models; it is supported by most of the classifiers in scikit-learn. It won't be magic, but in my experience it usually works better than under- or over-sampling.
For the metric used in your grid search, I would use the f1_score as suggested by @Erwan; it will heavily penalize poor precision and poor recall, and will reward hyperparameters that yield a more balanced model.
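For illustration, here is a minimal sketch combining both suggestions. The synthetic data and the parameter values are placeholders standing in for the question's dataset, not taken from it:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Synthetic imbalanced, two-feature data as a stand-in for the real dataset.
X, y = make_classification(n_samples=2000, n_features=2, n_informative=2,
                           n_redundant=0, weights=[0.8, 0.2], random_state=0)

# class_weight="balanced" reweights the classes inside the forest instead of
# resampling the data.
rfc = RandomForestClassifier(class_weight="balanced", random_state=0)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, None],
    'criterion': ['gini', 'entropy'],
}
# scoring='f1' rewards hyperparameters that balance precision and recall.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, scoring='f1', cv=10)
CV_rfc.fit(X, y)
print(CV_rfc.best_params_)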
The whole idea is to perform a grid search over all possible values of lambda, where each value of lambda gives a specific best subset of features. In other words, I'm trying to do hyperparameter tuning (lambda) and feature selection at the same time. Any advice is greatly appreciated, thank you so much!
ISSUE:
gs_cv.best_estimator_[0].estimator.alpha is 0.0, while gs_cv.best_estimator_[1].alpha is 1.0 (results of indexing into the pipeline steps).
The best parameter from the grid search doesn't seem to be applied to the model part of the pipeline, as seen in the image.
I got this when printing gs_cv.best_estimator_.named_steps; the Ridge() still uses the default alpha of 1:
{'sfs_ridge': SequentialFeatureSelector(estimator=Ridge(alpha=0.0), k_features=5,
scoring='r2'), 'ridge_regression': Ridge()}
------------Code------------------
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
#Model
ridge = Ridge()
#hyperparameter_alpha = np.logspace(-6,6, num=5)
#SFS model
sfs_ridge = SFS(estimator=ridge, k_features = 5, forward=True, floating=False, scoring='r2', cv = 5)
#Pipeline model
pipe = Pipeline([ ('sfs_ridge', sfs_ridge), ('ridge_regression', ridge) ])
#GridSearchCV
#The parameter names in the grid must start with the step name you give when defining the pipeline!
param_grid = [ {'sfs_ridge__k_features': [2,4,5] ,'sfs_ridge__estimator__alpha': np.arange(0,1,0.05) }]
gs_cv = GridSearchCV(estimator= pipe, param_grid= param_grid, scoring="neg_mean_absolute_error", n_jobs = -1, cv=5, refit=True)
gs_cv.fit(X_train, y_train)
print(gs_cv.best_estimator_[0].estimator.alpha) #print out 0.0
print(gs_cv.best_estimator_[1].alpha) #print out 1.0
print(gs_cv.best_estimator_[0].k_feature_idx_)
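One likely explanation (my reading of the behavior, not a posted answer): GridSearchCV clones the pipeline, so 'sfs_ridge__estimator__alpha' only reaches the Ridge wrapped inside the selector, while the final 'ridge_regression' step is a separate Ridge object that keeps its default alpha=1.0. A sketch, continuing from the code above, that also gives the final step its own grid entry:
# Sketch only: search the final step's alpha via its own pipeline parameter name.
param_grid = [{
    'sfs_ridge__k_features': [2, 4, 5],
    'sfs_ridge__estimator__alpha': np.arange(0, 1, 0.05),
    'ridge_regression__alpha': [0.0, 0.1, 0.5, 1.0],
}]
gs_cv = GridSearchCV(estimator=pipe, param_grid=param_grid,
                     scoring="neg_mean_absolute_error", n_jobs=-1, cv=5, refit=True)
gs_cv.fit(X_train, y_train)
print(gs_cv.best_estimator_[1].alpha)  # reflects the searched value, not the default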
Below is the code that I am trying to execute
# Train a logistic regression model, report the coefficients and model performance
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
clf = LogisticRegression().fit(X_train, y_train)
params = {'penalty':['l1','l2'],'dual':[True,False],'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000], 'fit_intercept':[True,False],
'solver':['saga']}
gridlog = GridSearchCV(clf, params, cv=5, n_jobs=2, scoring='roc_auc')
cv_scores = cross_val_score(gridlog, X_train, y_train)
#find best parameters
print('Logistic Regression parameters: ',gridlog.best_params_) # throws error
The last line of code above is where the error is thrown. I have used this exact same code to run other models. Any idea why I may be facing this issue?
You need to fit gridlog first. cross_val_score will not do this for you; it only returns the scores and nothing else.
Hence, since gridlog itself isn't trained, accessing best_params_ throws the error.
Below code works perfectly fine:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
cancer = datasets.load_breast_cancer()
x = cancer.data[:150]
y = cancer.target[:150]
clf = LogisticRegression().fit(x, y)
params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
gridlog = GridSearchCV(clf, params, cv=2, n_jobs=2,
scoring='roc_auc')
gridlog.fit(x,y) # <- missing in your code
cv_scores = cross_val_score(gridlog, x, y)
print(cv_scores)
#find best parameters
print('Logistic Regression parameters: ',gridlog.best_params_)
# result:
Logistic regression parameters: {'C': 1}
Your code should be updated such that the LogisticRegression classifier is passed to the GridSearch (not its fit):
from sklearn.datasets import load_breast_cancer # For example only
X_train, y_train = load_breast_cancer(return_X_y=True)
params = {'penalty':['l1', 'l2'],'dual':[True, False],'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000], 'fit_intercept':[True, False],
'solver':['saga']}
gridlog = GridSearchCV(LogisticRegression(), params, cv=5, n_jobs=2, scoring='roc_auc')
gridlog.fit(X_train, y_train)
#find best parameters
print('Logistic Regression parameters: ', gridlog.best_params_) # Now it displays all the parameters selected by the grid search
Results
Logistic Regression parameters: {'C': 0.1, 'dual': False, 'fit_intercept': True, 'penalty': 'l2', 'solver': 'saga'}
Note that, as @desertnaut pointed out, you don't use cross_val_score on a GridSearchCV.
See a complete example of how to use GridSearch here.
The example uses an SVC classifier instead of a LogisticRegression, but the approach is the same.
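For completeness, a small sketch of the nested pattern (my addition, not part of either answer): cross_val_score clones and fits the GridSearchCV inside each outer fold, which yields a cross-validated score for the whole tuning procedure, but it leaves the outer gridlog object unfitted, so best_params_ only exists after an explicit fit:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score

X, y = load_breast_cancer(return_X_y=True)
gridlog = GridSearchCV(LogisticRegression(max_iter=10000),
                       {'C': [0.01, 0.1, 1, 10]}, cv=5, scoring='roc_auc')

nested_scores = cross_val_score(gridlog, X, y, cv=5)  # fits clones fold by fold
gridlog.fit(X, y)                                     # required before best_params_
print(nested_scores.mean(), gridlog.best_params_)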
I want to perform recursive feature elimination with cross validation (rfecv) in 10-fold cross validation (i.e. cross_val_predict or cross_validate) in sklearn.
Since rfecv itself has cross validation in its name, I am not clear how to do this. My current code is as follows:
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
clf = RandomForestClassifier(random_state = 0, class_weight="balanced")
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
rfecv = RFECV(estimator=clf, step=1, cv=k_fold)
Please let me know how I can use the data X and y with rfecv in 10-fold cross validation.
I am happy to provide more details if needed.
To use recursive feature elimination in conjunction with a pre-defined k_fold, you should use RFE and not RFECV:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
clf = RandomForestClassifier(random_state = 0, class_weight="balanced")
selector = RFE(clf, n_features_to_select=5, step=1)
cv_acc = []
for train_index, val_index in k_fold.split(X, y):
selector.fit(X[train_index], y[train_index])
pred = selector.predict(X[val_index])
acc = accuracy_score(y[val_index], pred)
cv_acc.append(acc)
cv_acc
# result:
[1.0,
0.9333333333333333,
0.9333333333333333,
1.0,
0.9333333333333333,
0.9333333333333333,
0.8666666666666667,
1.0,
0.8666666666666667,
0.9333333333333333]
To perform feature selection with RFE and then fit a random forest with 10-fold cross validation, here's how you could do it:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE
rf = RandomForestClassifier(random_state = 0, class_weight="balanced")
rfe = RFE(estimator=rf, step=1)
Now transform the original X by fitting the RFE:
X_new = rfe.fit_transform(X, y)
Here are the ranked features (not much of a problem with only 4 of them):
rfe.ranking_
# array([2, 3, 1, 1])
Now split into train and test data and perform a cross validation in conjunction with a grid search using GridSearchCV (they usually go together):
X_train, X_test, y_train, y_test = train_test_split(X_new,y,train_size=0.7)
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
param_grid = {
'n_estimators': [5, 10, 15, 20],
'max_depth': [2, 5, 7, 9]
}
grid_clf = GridSearchCV(rf, param_grid, cv=k_fold.split(X_train, y_train))
grid_clf.fit(X_train, y_train)
y_pred = grid_clf.predict(X_test)
confusion_matrix(y_test, y_pred)
array([[17, 0, 0],
[ 0, 11, 0],
[ 0, 3, 14]], dtype=int64)
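As a further sketch (my addition, under the assumption that the goal is to redo the feature selection inside every fold and avoid leakage), RFE can also be placed in a Pipeline so that GridSearchCV re-runs the selection on each training split:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

X, y = load_iris(return_X_y=True)
pipe = Pipeline([
    ('rfe', RFE(RandomForestClassifier(random_state=0), step=1)),
    ('rf', RandomForestClassifier(random_state=0, class_weight="balanced")),
])
param_grid = {
    'rfe__n_features_to_select': [2, 3],   # selection is refit on every training split
    'rf__n_estimators': [10, 20],
    'rf__max_depth': [2, 5],
}
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
grid = GridSearchCV(pipe, param_grid, cv=k_fold)
grid.fit(X, y)
print(grid.best_params_)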