Let's consider a multivariate regression problem (two response variables: Latitude and Longitude). Some machine learning implementations, such as Support Vector Regression (sklearn.svm.SVR), do not provide native support for multivariate regression. For these, sklearn.multioutput.MultiOutputRegressor can be used as a wrapper.
Example:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR

svr_multi = MultiOutputRegressor(SVR(), n_jobs=-1)

# Fit the wrapped estimator on the data
svr_multi.fit(X_train, y_train)
y_pred = svr_multi.predict(X_test)
My goal is to tune the parameters of SVR with sklearn.model_selection.GridSearchCV. If the response were a single variable rather than two, I would perform the operation as follows:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe_svr = Pipeline([('scl', StandardScaler()),
                     ('reg', SVR())])

grid_param_svr = {
    'reg__C': [0.01, 0.1, 1, 10],
    'reg__epsilon': [0.1, 0.2, 0.3],
    'reg__degree': [2, 3, 4]
}

gs_svr = GridSearchCV(estimator=pipe_svr,
                      param_grid=grid_param_svr,
                      cv=10,
                      scoring='neg_mean_squared_error',
                      n_jobs=-1)

gs_svr = gs_svr.fit(X_train, y_train)
However, as my response y_train is 2-dimensional, I need to use the MultiOutputRegressor on top of SVR. How can I modify the above code to enable this GridSearchCV operation? If not possible, is there a better alternative?
For use without a pipeline, prefix the parameters with estimator__:

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor

param_grid = {'estimator__min_samples_split': [10, 50],
              'estimator__min_samples_leaf': [50, 150]}

gb = GradientBoostingRegressor()
gs = GridSearchCV(MultiOutputRegressor(gb), param_grid=param_grid)
gs.fit(X, y)
I just found a working solution. With nested estimators, the parameters of the inner estimator can be accessed via the estimator__ prefix.
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe_svr = Pipeline([('scl', StandardScaler()),
                     ('reg', MultiOutputRegressor(SVR()))])

grid_param_svr = {
    'reg__estimator__C': [0.1, 1, 10]
}

gs_svr = GridSearchCV(estimator=pipe_svr,
                      param_grid=grid_param_svr,
                      cv=2,
                      scoring='neg_mean_squared_error',
                      n_jobs=-1)

gs_svr = gs_svr.fit(X_train, y_train)
gs_svr.best_estimator_
Pipeline(steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)),
('reg', MultiOutputRegressor(estimator=SVR(C=10, cache_size=200,
coef0=0.0, degree=3, epsilon=0.1, gamma='auto', kernel='rbf', max_iter=-1,
shrinking=True, tol=0.001, verbose=False), n_jobs=1))])
Thank you, Marco.
Adding to your answer, here is a short illustrative example of a randomized search applied to a multi-output GradientBoostingRegressor.
from sklearn.datasets import load_linnerud
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import RandomizedSearchCV
x, y = load_linnerud(return_X_y=True)
model = MultiOutputRegressor(GradientBoostingRegressor(
    loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
    criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
    min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
    min_impurity_split=None, init=None, random_state=None, max_features=None,
    alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
    validation_fraction=0.1, n_iter_no_change=None, tol=0.0001,
    ccp_alpha=0.0))

hyperparameters = dict(
    estimator__learning_rate=[0.05, 0.1, 0.2, 0.5, 0.9],
    estimator__loss=['ls', 'lad', 'huber'],
    estimator__n_estimators=[20, 50, 100, 200, 300, 500, 700, 1000],
    estimator__criterion=['friedman_mse', 'mse'],
    estimator__min_samples_split=[2, 4, 7, 10],
    estimator__max_depth=[3, 5, 10, 15, 20, 30],
    estimator__min_samples_leaf=[1, 2, 3, 5, 8, 10],
    estimator__min_impurity_decrease=[0, 0.2, 0.4, 0.6, 0.8],
    estimator__max_leaf_nodes=[5, 10, 20, 30, 50, 100, 300])

randomized_search = RandomizedSearchCV(
    model, hyperparameters, random_state=0, n_iter=5, scoring=None,
    n_jobs=2, refit=True, cv=5, verbose=True, pre_dispatch='2*n_jobs',
    error_score='raise', return_train_score=True)
hyperparameters_tuning = randomized_search.fit(x, y)
print('Best Parameters = {}'.format(hyperparameters_tuning.best_params_))
tuned_model = hyperparameters_tuning.best_estimator_
print(tuned_model.predict(x))
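As a follow-up, MultiOutputRegressor fits one independent regressor per target and exposes the fitted copies through its estimators_ attribute, so the tuned per-target models can be inspected. A small sketch reusing tuned_model from above:

# One fitted GradientBoostingRegressor per target column of y.
for i, est in enumerate(tuned_model.estimators_):
    print('target {}: n_estimators={}, max_depth={}'.format(
        i, est.n_estimators, est.max_depth))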
I have a binary classification problem and have been using cross-validation to optimize the ElasticNet parameters. However, ElasticNet only seems to work when I supply roc_auc as the scoring method for the CV; I also want to test a wide range of other scoring methods, in particular accuracy. Specifically, when using accuracy, ElasticNet returns this error:
ValueError: Classification metrics can't handle a mix of binary and continuous targets
However, my y targets are indeed binary. Below is a replication of my problem, using the dataset from here:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
data = pd.read_csv('data 2.csv')
# by default majority class (benign) will be negative
lb = LabelBinarizer()
data['diagnosis'] = lb.fit_transform(data['diagnosis'].values)
targets = data['diagnosis']
data.drop(['id', 'diagnosis', 'Unnamed: 32'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(data, targets, stratify=targets)
#elastic net logistic regression
lr = ElasticNet(max_iter=2000)
scorer = 'accuracy'
param_grid = {
    'alpha': [1e-4, 1e-3, 1e-2, 0.01, 0.1, 1, 5, 10],
    'l1_ratio': np.arange(0.2, 0.9, 0.1)
}
skf = StratifiedKFold(n_splits=10)
clf = GridSearchCV(lr, param_grid, scoring=scorer, cv=skf,
                   return_train_score=True, n_jobs=-1)
clf.fit(X_train.values, y_train.values)
I figured that ElasticNet might be trying to solve a linear regression problem, so I tried lr = LogisticRegression(penalty='elasticnet', l1_ratios=[0.1, 0.5, 0.9], solver='saga') as the classifier, but the same problem persists.
If I use as the scoring metric scorer = 'roc_auc' then the model is built as expected.
Also, as a sanity check, to see if there is something wrong with the data, I tried the same with a random forest classifier, and there the problem disappears:
# random forest
clf = RandomForestClassifier(n_jobs=-1)
param_grid = {
'min_samples_split': [3, 5, 10],
'n_estimators' : [100, 300],
'max_depth': [3, 5, 15, 25],
'max_features': [3, 5, 10, 20]
}
skf = StratifiedKFold(n_splits=10)
scorer = 'accuracy'
grid_search = GridSearchCV(clf, param_grid, scoring=scorer,
cv=skf, return_train_score=True, n_jobs=-1)
grid_search.fit(X_train.values, y_train.values)
Has anyone got any ideas on what's happening here?
ElasticNet is a regression model: its predictions are continuous values, not class labels. That is why roc_auc, which accepts continuous scores, works, while accuracy, which requires discrete labels, raises the error above.
If you want an ElasticNet penalty in classification, use LogisticRegression:
lr = LogisticRegression(solver="saga", penalty="elasticnet")
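To see the mismatch directly, here is a minimal sketch (with synthetic data, assumed only for illustration) showing that ElasticNet emits continuous predictions, which roc_auc_score accepts as scores but accuracy_score rejects:

from sklearn.datasets import make_classification
from sklearn.linear_model import ElasticNet
from sklearn.metrics import accuracy_score, roc_auc_score

X, y = make_classification(n_samples=100, random_state=0)
preds = ElasticNet().fit(X, y).predict(X)

print(preds[:3])                # continuous values, not 0/1 labels
print(roc_auc_score(y, preds))  # works: AUC consumes continuous scores
# accuracy_score(y, preds) raises:
# ValueError: Classification metrics can't handle a mix of binary and continuous targets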
Minimal Reproducible Example:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
lr = LogisticRegression(solver="saga", penalty="elasticnet", max_iter=2000)
param_grid = {
'l1_ratio': np.arange(0.2, 0.9, 0.1)
}
clf = GridSearchCV(lr, param_grid, scoring='accuracy', cv=StratifiedKFold(n_splits=10), return_train_score=True, n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
Below is the code that I am trying to execute:
# Train a logistic regression model, report the coefficients and model performance
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
clf = LogisticRegression().fit(X_train, y_train)
params = {'penalty': ['l1', 'l2'], 'dual': [True, False],
          'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
          'fit_intercept': [True, False], 'solver': ['saga']}
gridlog = GridSearchCV(clf, params, cv=5, n_jobs=2, scoring='roc_auc')
cv_scores = cross_val_score(gridlog, X_train, y_train)
#find best parameters
print('Logistic Regression parameters: ',gridlog.best_params_) # throws error
The last code line above is where the error is being thrown from. I have used this exact same code to run other models. Any idea why I may be facing this issue?
You need to fit gridlog first. cross_val_score will not do this; it returns the scores and nothing else.
Hence, since gridlog isn't trained, it throws an error.
The code below works fine:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
cancer = datasets.load_breast_cancer()
x = cancer.data[:150]
y = cancer.target[:150]

clf = LogisticRegression()
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

gridlog = GridSearchCV(clf, params, cv=2, n_jobs=2,
                       scoring='roc_auc')
gridlog.fit(x, y)  # <- missing in your code

cv_scores = cross_val_score(gridlog, x, y)
print(cv_scores)

# find best parameters
print('Logistic Regression parameters: ', gridlog.best_params_)
# result:
Logistic Regression parameters:  {'C': 1}
Your code should be updated so that the LogisticRegression classifier itself is passed to GridSearchCV, not a pre-fitted instance (there is no need to call fit beforehand):
from sklearn.datasets import load_breast_cancer # For example only
X_train, y_train = load_breast_cancer(return_X_y=True)
params = {'penalty': ['l1', 'l2'], 'dual': [True, False],
          'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
          'fit_intercept': [True, False], 'solver': ['saga']}
gridlog = GridSearchCV(LogisticRegression(), params, cv=5, n_jobs=2, scoring='roc_auc')
gridlog.fit(X_train, y_train)
#find best parameters
print('Logistic Regression parameters: ', gridlog.best_params_) # Now it displays all the parameters selected by the grid search
Results
Logistic Regression parameters: {'C': 0.1, 'dual': False, 'fit_intercept': True, 'penalty': 'l2', 'solver': 'saga'}
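A side note on the grid itself: in scikit-learn, dual=True is only implemented for the l2 penalty with the liblinear solver, so with solver='saga' every dual=True candidate fails to fit. A leaner grid avoids those wasted fits:

# dual=True only applies to liblinear + l2, so omit it when using saga.
params = {'penalty': ['l1', 'l2'],
          'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
          'fit_intercept': [True, False],
          'solver': ['saga']}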
Note that, as #desertnaut pointed out, you don't use cross_val_score with GridSearchCV; the grid search already performs cross-validation itself.
See a complete example of how to use GridSearch here.
The example uses an SVC classifier instead of a LogisticRegression, but the approach is the same.
I would like to apply GridSearchCV to a scikit-learn pipeline [[feature selection] + [algorithm]], but it gives the following error. How can I correct the code?
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
pipeline1 = Pipeline([
    ('feature_selection', SelectFromModel(svm.SVC(kernel='linear'))),
    ('filter', SelectKBest(k=11)),
    ('classification', svm.SVC(kernel='linear'))
])

grid_parameters_tune = [{'estimator__C': [0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]}]

model = GridSearchCV(pipeline1, grid_parameters_tune, cv=5, n_jobs=-1,
                     verbose=1)
model.fit(X, y)
ValueError: Invalid parameter estimator for estimator Pipeline(memory=None,
steps=[('feature_union', FeatureUnion(n_jobs=None,
transformer_list=[('filter', SelectKBest(k=10, score_func=<function f_classif at 0x000001ECCBB3E840>)), ('feature_selection', SelectFromModel(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', ...r', max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False))]). Check the list of available parameters with `estimator.get_params().keys()`.
I think the error comes from the name in your grid_parameters_tune. You are trying to access estimator__C, but there is no step named estimator in your pipeline. Renaming it to classification__C should do the trick.
If you want to access the C parameter of the SVC inside SelectFromModel, you can do so with feature_selection__estimator__C.
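If you are unsure of the exact names, the error message's own suggestion is the quickest check: get_params().keys() lists every parameter the grid search can reach, including the nested ones. For example, with the pipeline1 from the question:

# Prints names such as 'classification__C' and 'feature_selection__estimator__C'.
for name in sorted(pipeline1.get_params().keys()):
    print(name)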
Below is a working example with random data. I changed some of the parameters from your original pipeline to save time, so don't copy it verbatim.
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
X = pd.DataFrame(data=np.arange(1000).reshape(-1, 25))
y = np.random.binomial(1, 0.5, 1000//25)
pipeline1 = Pipeline(
[
("feature_selection", SelectFromModel(svm.SVC(kernel="linear"))),
("filter", SelectKBest(k=11)),
("classification", svm.SVC(kernel="linear")),
]
)
grid_parameters_tune = [{"classification__C": [0.01, 0.1, 1.0, 10.0,]}]
model = GridSearchCV(pipeline1, grid_parameters_tune, cv=3, n_jobs=-1, verbose=1)
model.fit(X, y)
As for the second way:
pipeline1 = Pipeline(
[
("feature_selection", SelectFromModel(svm.SVC(kernel="linear"))),
("filter", SelectKBest(k=11)),
("classification", svm.SVC(kernel="linear")),
]
)
grid_parameters_tune = [{"feature_selection__estimator__C": [0.01, 0.1, 1.0, 10.0,]}]
model = GridSearchCV(pipeline1, grid_parameters_tune, cv=3, n_jobs=-1, verbose=1)
model.fit(X, y)
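Both parameters can also be tuned jointly in a single grid; the search then covers the cross product of the two lists. A sketch reusing the same pipeline1:

grid_parameters_tune = [{
    'classification__C': [0.01, 0.1, 1.0, 10.0],
    'feature_selection__estimator__C': [0.01, 0.1, 1.0, 10.0],
}]

model = GridSearchCV(pipeline1, grid_parameters_tune, cv=3, n_jobs=-1, verbose=1)
model.fit(X, y)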
I have used GridSearchCV for parameter optimization when predicting values with 10-fold cross-validation in sklearn, as shown below.
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, cross_val_predict

svr_params = {
    'C': [0.1, 1, 10],
    'epsilon': [0.01, 0.05, 0.1, 0.5, 1],
}

svr = SVR(kernel='linear', coef0=0.1, shrinking=True, tol=0.001,
          cache_size=200, verbose=False, max_iter=-1)

best_svr = GridSearchCV(
    svr, param_grid=svr_params, cv=10, verbose=0, n_jobs=-1)
predicted = cross_val_predict(best_svr, X, y, cv=10)
I want to print out the best parameters selected by the grid search for C and epsilon. I would really appreciate some help. Thanks in advance.
The best parameters are available as the best_params_ attribute of GridSearchCV, once the grid search has been fitted:

best_svr = GridSearchCV(svr, param_grid=svr_params, cv=10, verbose=0,
                        n_jobs=-1, refit=True)
best_svr.fit(X, y)
print(best_svr.best_params_)
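One caveat worth noting: cross_val_predict clones the estimator for every fold, so the best_svr instance in the question's cross_val_predict call is never fitted itself, and each fold's clone may even select different parameters. A sketch (reusing svr, svr_params, X and y from the question) that keeps the cross-validated predictions and still yields one inspectable set of best parameters:

from sklearn.model_selection import GridSearchCV, cross_val_predict

best_svr = GridSearchCV(svr, param_grid=svr_params, cv=10, n_jobs=-1)

# Nested CV: every outer fold tunes its own clone of the grid search.
predicted = cross_val_predict(best_svr, X, y, cv=10)

# A separate fit on the full data gives a single set of best parameters.
best_svr.fit(X, y)
print(best_svr.best_params_)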
I want to use GridSearchCV over a range of alphas (LaPlace smoothing parameters) to check which gives me the best accuracy with a Bernoulli Naive Bayes model.
import numpy as np
from sklearn.naive_bayes import BernoulliNB

def binarize_pixels(data, threshold=0.784):
    # Initialize a new feature array with the same shape as the original data.
    binarized_data = np.zeros(data.shape)
    # Apply a threshold to each feature.
    for feature in range(data.shape[1]):
        binarized_data[:, feature] = data[:, feature] > threshold
    return binarized_data

binarized_train_data = binarize_pixels(mini_train_data)

def BNB():
    clf = BernoulliNB()
    clf.fit(binarized_train_data, mini_train_labels)
    scoring = clf.score(mini_train_data, mini_train_labels)
    predsNB = clf.predict(dev_data)
    print("Bernoulli binarized model accuracy: {:.4}".format(np.mean(predsNB == dev_labels)))
The model runs fine, but my GridSearch cross-validation does not:

pipeline = Pipeline([('classifier', BNB())])

def P8(alphas):
    gs_clf = GridSearchCV(pipeline, param_grid=alphas, refit=True)
    y_predictions = gs_clf.best_estimator_.predict(dev_data)
    print(classification_report(dev_labels, y_predictions))

alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}
P8(alphas)
I get AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'
The problem is in the following two lines:
gs_clf = GridSearchCV(pipeline, param_grid = alphas, refit=True)
y_predictions = gs_clf.best_estimator_.predict(dev_data)
Note that before using predict, you first need to fit the model, i.e. call gs_clf.fit. See the following example from the documentation:
>>> from sklearn import svm, datasets
>>> from sklearn.model_selection import GridSearchCV
>>> iris = datasets.load_iris()
>>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
>>> svr = svm.SVC()
>>> clf = GridSearchCV(svr, parameters)
>>> clf.fit(iris.data, iris.target)
...
GridSearchCV(cv=None, error_score=...,
estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=...,
decision_function_shape=None, degree=..., gamma=...,
kernel='rbf', max_iter=-1, probability=False,
random_state=None, shrinking=True, tol=...,
verbose=False),
fit_params={}, iid=..., n_jobs=1,
param_grid=..., pre_dispatch=..., refit=..., return_train_score=...,
scoring=..., verbose=...)
>>> sorted(clf.cv_results_.keys())
...
['mean_fit_time', 'mean_score_time', 'mean_test_score',...
'mean_train_score', 'param_C', 'param_kernel', 'params',...
'rank_test_score', 'split0_test_score',...
'split0_train_score', 'split1_test_score', 'split1_train_score',...
'split2_test_score', 'split2_train_score',...
'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score'...]
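Applied to the code in the question, a corrected sketch could look as follows (reusing the question's binarized_train_data, mini_train_labels, dev_data and dev_labels). Note two further fixes beyond the missing fit: inside a pipeline the parameter needs the classifier__ prefix, and the pipeline step must be a BernoulliNB instance rather than the result of the BNB() training function; alpha=0.0 is also dropped, since BernoulliNB clips it with a warning:

from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# The pipeline step must be an (unfitted) estimator instance.
pipeline = Pipeline([('classifier', BernoulliNB())])

# Parameters of a pipeline step need the step-name prefix.
alphas = {'classifier__alpha': [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}

gs_clf = GridSearchCV(pipeline, param_grid=alphas, refit=True)
gs_clf.fit(binarized_train_data, mini_train_labels)  # fit before predicting
y_predictions = gs_clf.best_estimator_.predict(dev_data)
print(classification_report(dev_labels, y_predictions))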