How to implement RFECV and GridSearchCV with an SVM in Python

I am trying to run GridSearchCV and then RFECV on an SVM:
# Grid-search SVC hyperparameters, then run RFECV on the training data and
# print a classification report for the test split.
# NOTE(review): data_train, label_train, data_test and label_test are not
# defined in this snippet; they must exist before it runs.
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
# Candidate hyperparameter values explored by the grid search.
param_grid = {'C': [0.1, 1, 10, 100, 1000],
'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
'kernel': ['linear','rbf','sigmoid']}
estimator = SVC(probability = True, coef0 = 1.0)
clf = GridSearchCV (estimator, param_grid, cv=10, verbose=True)
clf.fit(data_train, label_train)
# RFECV is given the original `estimator`, not the result of the grid search
# held by `clf`, so nothing found by the search above is used here.
# NOTE(review): no kernel is set on `estimator`; presumably the default SVC
# kernel exposes no coef_, which is what the reported ValueError complains
# about — confirm against the SVC documentation.
selector = RFECV(estimator, step = 1, cv= 10)
selector.fit(data_train, label_train)
label_predicted = selector.predict(data_test)
print(classification_report(label_test, label_predicted, digits=4))
It shows an error:
ValueError: when importance_getter=='auto', the underlying estimator SVC should have coef_ or feature_importances_ attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.
here is output image

Related

SKLearn & ElasticNet: Cross validation fails when using Accuracy as a metric

I have a binary classification problem. I've been using cross-validation to optimize the ElasticNet parameters. However, ElasticNet only seems to work when I supply roc_auc as the scoring method to be used during CV. I also want to test out a wide range of scoring methods, in particular accuracy. Specifically, when using accuracy, ElasticNet returns this error:
ValueError: Classification metrics can't handle a mix of binary and continuous targets
However my y targets are indeed binary. Below is a replication of my problem using the dataset from here:
# Reproduction: grid-search an ElasticNet model on a binarised 'diagnosis'
# target using accuracy as the scoring metric.
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
data = pd.read_csv('data 2.csv')
# by default majority class (benign) will be negative
lb = LabelBinarizer()
data['diagnosis'] = lb.fit_transform(data['diagnosis'].values)
targets = data['diagnosis']
# Remove the identifier, the target and the 'Unnamed: 32' column from the features.
data.drop(['id', 'diagnosis', 'Unnamed: 32'], axis=1, inplace=True)
# Stratified split; no random_state is set, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(data, targets, stratify=targets)
# ElasticNet estimator. NOTE(review): despite the intent, this is a regression
# model (as the answer below points out); its predictions are presumably the
# "continuous" targets named in the error message.
lr = ElasticNet(max_iter=2000)
scorer = 'accuracy'
param_grid = {
# NOTE(review): 1e-2 and 0.01 are the same value, so it appears twice in the grid.
'alpha': [1e-4, 1e-3, 1e-2, 0.01, 0.1, 1, 5, 10],
'l1_ratio': np.arange(0.2, 0.9, 0.1)
}
skf = StratifiedKFold(n_splits=10)
clf = GridSearchCV(lr, param_grid, scoring=scorer, cv=skf, return_train_score=True,
n_jobs=-1)
clf.fit(X_train.values, y_train.values)
I figured that ElasticNet might be trying to solve a linear regression problem so I tried lr = LogisticRegression(penalty='elasticnet', l1_ratios=[0.1, 0.5, 0.9], solver='saga') as the classifier but the same problem persists.
If I use as the scoring metric scorer = 'roc_auc' then the model is built as expected.
Also, as a sanity check to see whether there is something wrong with the data, I tried the same thing with a random forest classifier, and here the problem disappears:
# Sanity check: the same accuracy-scored grid search, but with a random forest
# classifier in place of ElasticNet.
# Relies on X_train / y_train and the imports from the ElasticNet snippet earlier in this question.
clf = RandomForestClassifier(n_jobs=-1)
param_grid = {
'min_samples_split': [3, 5, 10],
'n_estimators' : [100, 300],
'max_depth': [3, 5, 15, 25],
'max_features': [3, 5, 10, 20]
}
skf = StratifiedKFold(n_splits=10)
scorer = 'accuracy'
grid_search = GridSearchCV(clf, param_grid, scoring=scorer,
cv=skf, return_train_score=True, n_jobs=-1)
grid_search.fit(X_train.values, y_train.values)
Has anyone got any ideas on what's happening here?
ElasticNet is a regression model.
If you want an ElasticNet penalty in classification, use LogisticRegression:
lr = LogisticRegression(solver="saga", penalty="elasticnet")
Minimal Reproducible Example:
# Minimal reproducible example: accuracy-scored grid search over l1_ratio for
# an elastic-net-penalised LogisticRegression on synthetic data.
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
# Synthetic dataset; no random_state is passed, so the printed score varies between runs.
X, y = make_classification(n_samples=1000)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
lr = LogisticRegression(solver="saga", penalty="elasticnet", max_iter=2000)
# Only the L1/L2 mixing ratio is searched.
param_grid = {
'l1_ratio': np.arange(0.2, 0.9, 0.1)
}
clf = GridSearchCV(lr, param_grid, scoring='accuracy', cv=StratifiedKFold(n_splits=10), return_train_score=True, n_jobs=-1)
clf.fit(X_train, y_train)
# Score the fitted search on the held-out split.
print(clf.score(X_test, y_test))

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

I'm trying to tune my model using grid search (GridSearchCV) in a Kaggle notebook. To benefit from the GPU, I used the hummingbird-ml package. Thanks in advance.
However, I get the following issue:
AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'
Here is my code:
# Builds a GridSearchCV over SVR, converts it with hummingbird, then tries to
# fit the converted object.
# NOTE(review): X_train and y_train are not defined in this snippet.
from hummingbird.ml import convert
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import make_scorer, mean_squared_error
from pprint import pprint
# Hyperparameter tuning for the SVM regressor
import numpy as np
base_svr = SVR()
# greater_is_better=False is passed because mean_squared_error is an error measure.
scorer = make_scorer(mean_squared_error, greater_is_better=False)
param_grid_svr = {'C': [0.01, 0.1,1, 10, 100],
'gamma': [1,0.1, 0.01, 0.001, 0.0001],
'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
'epsilon': [0.01, 0.1, 0.2 , 0.3, 1]}
pprint(param_grid_svr)
# Create the GridSearchCV object (it is NOT fitted at this point)
svr_gs = GridSearchCV(base_svr,param_grid_svr, n_jobs = -1 , scoring=scorer, cv=3 ,refit=True,verbose=2)
# Converting scikit-learn model to PyTorch on CPU
# NOTE(review): convert() receives svr_gs before any fit() call on it, which is
# presumably why best_estimator_ is reported missing — verify.
svr_gs_pytorch = convert(svr_gs, 'torch')
# Switching PyTorch from CPU to GPU
# NOTE(review): the next line looks like a notebook cell magic rather than
# Python syntax, and it sits in the middle of the snippet — confirm how the
# cells were actually laid out.
%%capture
svr_gs_pytorch.to('cuda')
# Train the model in GPU
svr_gs_pytorch.fit(X_train,y_train)
# print best parameter after tuning
svr_gs_pytorch.best_params_

Optimizing learning rate and number of estimators for multioutput gradient boosting

I have a dataset with multiple outputs and am trying to use gradient boosting to predict all the values at once. I imported MultiOutputRegressor so multiple outputs can be predicted at once; I'm able to make it work for the default gradient boosting function. However, I'm running into an error when I try to optimize the gradient boosting function for each output.
# Fit a multi-output gradient boosting model, then try to grid-search its
# learning rate and number of estimators.
# NOTE(review): GridSearchCV, X_train, y_train and X_test are not imported or
# defined in this snippet.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn import ensemble
params = {'max_depth': 3, 'n_estimators': 100, 'learning_rate': 0.1}
# One GradientBoostingRegressor configuration wrapped by MultiOutputRegressor.
gradient_regressor = MultiOutputRegressor(ensemble.GradientBoostingRegressor(**params))
GradBoostModel = gradient_regressor.fit(X_train, y_train)
prediction_GradBoost = GradBoostModel.predict(X_test)
# These keys name the inner regressor's parameters without any prefix; the
# search is run on the MultiOutputRegressor wrapper, which is the estimator
# named in the "Invalid parameter learning_rate" error.
LR = {'learning_rate':[0.15, 0.125, 0.1, 0.75, 0.05], 'n_estimators':[50, 75, 100, 150, 200, 250, 300, 400]}
tuning = GridSearchCV(estimator = GradBoostModel, param_grid = LR, scoring = 'r2')
tuning.fit(X_train, y_train)
tuning.best_params_, tuning.best_score_
I'm trying to use GridSearchCV to cycle through the listed learning rates and number of estimators to find the optimal values. But, I get the following error:
Invalid parameter learning_rate for estimator MultiOutputRegressor.
Check the list of available parameters with `estimator.get_params().keys()`
I think I understand the reason for the error: when I try to optimize the gradient boosting parameters, they are passed through the MultiOutputRegressor, which doesn't recognize them. Is this the case? Also, how can I change my code, such that I can optimize these parameters for each output?
Indeed, the parameters need to be prefixed with estimator__. In general, to find out which parameter names to use for a nested estimator (or downstream in a pipeline), call the .get_params().keys() method on your model, e.g.:
print(GradBoostModel.get_params().keys())
dict_keys(['estimator__alpha', 'estimator__ccp_alpha', 'estimator__criterion', 'estimator__init', 'estimator__learning_rate',...
Full working example with the linnerud dataset:
# Working example on the linnerud dataset: grid-search the wrapped
# GradientBoostingRegressor's parameters through MultiOutputRegressor by
# using the estimator__ prefix.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.datasets import load_linnerud
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
# Data
rng = np.random.RandomState(0)
X, y = load_linnerud(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
# Model
params = {'max_depth': 3, 'n_estimators': 100, 'learning_rate': 0.1}
gradient_regressor = MultiOutputRegressor(GradientBoostingRegressor(**params))
GradBoostModel = gradient_regressor.fit(X_train, y_train)
prediction_GradBoost = GradBoostModel.predict(X_test)
# Grid keys carry the estimator__ prefix so they address the wrapped regressor.
LR = {'estimator__learning_rate': [0.15, 0.125, 0.1, 0.75, 0.05], 'estimator__n_estimators': [50, 75, 100, 150, 200, 250, 300, 400]}
# Print the parameter names the wrapper exposes (the valid param_grid keys).
print('Params from GradBoostModel', GradBoostModel.get_params().keys())
tuning = GridSearchCV(estimator=GradBoostModel, param_grid=LR, scoring='r2')
tuning.fit(X_train, y_train)

'GridSearchCV' object has no attribute 'best_params_' when using LogisticRegression

Below is the code that I am trying to execute
# Train a logistic regression model, report the coefficients and model performance
# NOTE(review): X_train and y_train are not defined in this snippet.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
clf = LogisticRegression().fit(X_train, y_train)
# Hyperparameter grid for the search.
params = {'penalty':['l1','l2'],'dual':[True,False],'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000], 'fit_intercept':[True,False],
'solver':['saga']}
gridlog = GridSearchCV(clf, params, cv=5, n_jobs=2, scoring='roc_auc')
# gridlog is passed to cross_val_score here, but gridlog.fit(...) is never called in this snippet.
cv_scores = cross_val_score(gridlog, X_train, y_train)
#find best parameters
print('Logistic Regression parameters: ',gridlog.best_params_) # throws error
The last code line above is where the error is being thrown from. I have used this exact same code to run other models. Any idea why I may be facing this issue?
You need to fit gridlog first. cross_val_score will not do this, it returns the scores & nothing else.
Hence, as gridlog isn't trained, it throws error.
Below code works perfectly fine:
# Working example: the GridSearchCV object is fitted explicitly before
# best_params_ is read.
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Use the first 150 samples of the breast-cancer dataset (the variable is named
# after the dataset actually loaded).
breast_cancer = datasets.load_breast_cancer()
x = breast_cancer.data[:150]
y = breast_cancer.target[:150]
clf = LogisticRegression().fit(x, y)
params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
gridlog = GridSearchCV(clf, params, cv=2, n_jobs=2,
                       scoring='roc_auc')
gridlog.fit(x,y) # <- missing in your code
# cross_val_score only returns the per-fold scores; it is the fit() call above
# that makes best_params_ available on gridlog.
cv_scores = cross_val_score(gridlog, x, y)
print(cv_scores)
#find best parameters
print('Logistic Regression parameters: ',gridlog.best_params_)
# result:
Logistic Regression parameters: {'C': 1}
Your code should be updated such that the LogisticRegression classifier is passed to the GridSearch (not its fit):
# Pass an unfitted LogisticRegression to GridSearchCV, fit the search, then
# read best_params_.
# NOTE(review): LogisticRegression and GridSearchCV are not imported in this
# snippet; it relies on the imports from the question's code.
from sklearn.datasets import load_breast_cancer # For example only
X_train, y_train = load_breast_cancer(return_X_y=True)
# NOTE(review): the scikit-learn docs describe the dual formulation as only
# implemented for the l2 penalty with the liblinear solver, so the dual=True
# candidates presumably fail under solver='saga' — verify.
params = {'penalty':['l1', 'l2'],'dual':[True, False],'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000], 'fit_intercept':[True, False],
'solver':['saga']}
gridlog = GridSearchCV(LogisticRegression(), params, cv=5, n_jobs=2, scoring='roc_auc')
gridlog.fit(X_train, y_train)
#find best parameters
print('Logistic Regression parameters: ', gridlog.best_params_) # Now it displays all the parameters selected by the grid search
Results
Logistic Regression parameters: {'C': 0.1, 'dual': False, 'fit_intercept': True, 'penalty': 'l2', 'solver': 'saga'}
Note: as @desertnaut pointed out, you don't need cross_val_score when using GridSearchCV.
See a complete example of how to use GridSearch here.
The example uses an SVC classifier instead of a LogisticRegression, but the approach is the same.

How to perform feature selection (rfecv) in cross validation in sklearn

I want to perform recursive feature elimination with cross validation (rfecv) in 10-fold cross validation (i.e. cross_val_predict or cross_validate) in sklearn.
Since rfecv itself has a cross validation part in its name, I am not clear how to do it. My current code is as follows.
# Set up RFECV around a random forest on iris with a stratified 10-fold splitter.
# NOTE(review): StratifiedKFold and RFECV are used below but not imported in this snippet.
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state = 0, class_weight="balanced")
# Shuffled, seeded folds so the split is reproducible.
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
# k_fold is handed to RFECV as its own internal cross-validation splitter.
rfecv = RFECV(estimator=clf, step=1, cv=k_fold)
Please let me know how I can use the data X and y with rfecv in 10-fold cross validation.
I am happy to provide more details if needed.
To use recursive feature elimination in conjunction with a pre-defined k_fold, you should use RFE and not RFECV:
# Manually cross-validate RFE + random forest over a fixed stratified 10-fold
# split and collect the validation accuracy of each fold.
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
clf = RandomForestClassifier(random_state = 0, class_weight="balanced")
# n_features_to_select is passed by keyword: recent scikit-learn releases no
# longer accept it as a positional argument.
# NOTE(review): the value 5 is kept from the original; iris appears to have
# only 4 features, so this presumably keeps all of them — lower it if actual
# elimination is wanted.
selector = RFE(clf, n_features_to_select=5, step=1)
cv_acc = []
for train_index, val_index in k_fold.split(X, y):
    # Fit the selector (and its wrapped forest) on the training fold only,
    # then score it on the held-out fold.
    selector.fit(X[train_index], y[train_index])
    pred = selector.predict(X[val_index])
    acc = accuracy_score(y[val_index], pred)
    cv_acc.append(acc)
# Bare expression: displays the per-fold accuracies in a notebook/REPL.
cv_acc
# result:
[1.0,
0.9333333333333333,
0.9333333333333333,
1.0,
0.9333333333333333,
0.9333333333333333,
0.8666666666666667,
1.0,
0.8666666666666667,
0.9333333333333333]
To perform feature selection with RFE and then fit a rf with 10 fold cross validation, here's how you could do it:
# Build the RFE selector around a random forest.
# NOTE(review): RandomForestClassifier is not imported in this snippet; it
# relies on the import from the previous one.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE
# The same forest instance is reused later as the grid-searched estimator.
rf = RandomForestClassifier(random_state = 0, class_weight="balanced")
# n_features_to_select is not given here, so RFE uses its default.
rfe = RFE(estimator=rf, step=1)
Now transform the original X by fitting the RFE selector:
X_new = rfe.fit_transform(X, y)
Here are the ranked features (not much of a problem with only 4 of them):
rfe.ranking_
# array([2, 3, 1, 1])
Now split into train and test data and perform a cross validation in conjunction with a grid search using GridSearchCV (they usually go together):
# Split the RFE-reduced features, grid-search the random forest with 10-fold
# CV on the training part, and evaluate on the held-out part.
# NOTE(review): X_new was produced by fitting RFE on all of X and y before this
# split, so the test rows took part in feature selection.
# No random_state is passed to train_test_split, so the confusion matrix is
# not reproducible run to run.
X_train, X_test, y_train, y_test = train_test_split(X_new,y,train_size=0.7)
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
param_grid = {
'n_estimators': [5, 10, 15, 20],
'max_depth': [2, 5, 7, 9]
}
# The splits are passed to GridSearchCV as the iterable returned by k_fold.split(...).
grid_clf = GridSearchCV(rf, param_grid, cv=k_fold.split(X_train, y_train))
grid_clf.fit(X_train, y_train)
y_pred = grid_clf.predict(X_test)
confusion_matrix(y_test, y_pred)
array([[17, 0, 0],
[ 0, 11, 0],
[ 0, 3, 14]], dtype=int64)

Categories