XGBoost with GridSearchCV, Scaling, PCA, and Early-Stopping in sklearn Pipeline - python

I want to combine an XGBoost model with input scaling and feature-space reduction by PCA. In addition, the hyperparameters of the model as well as the number of components used in the PCA should be tuned using cross-validation. And to prevent the model from overfitting, early stopping should be added.
For combining the various steps, I decided to use sklearn's Pipeline functionalities.
At the beginning, I had some problems making sure that the PCA is also applied to the validation set, but I think using XGB__eval_set does the trick.
The code is actually running without any errors, but seems to run forever (at some point the CPU usage of all cores goes down to zero but the processes continue to run for hours; had to kill the session at some point).
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Train / Test split
X_train, X_test, y_train, y_test = train_test_split(X_with_features, y, test_size=0.2, random_state=123)

# Train / Validation split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

# Pipeline
pipe = Pipeline(steps=[("Scale", StandardScaler()),
                       ("PCA", PCA()),
                       ("XGB", XGBRegressor())])

# Hyper-parameter grid (Test only)
grid_param_pipe = {'PCA__n_components': [5],
                   'XGB__n_estimators': [1000],
                   'XGB__max_depth': [3],
                   'XGB__reg_alpha': [0.1],
                   'XGB__reg_lambda': [0.1]}

# Grid object
grid_search_pipe = GridSearchCV(estimator=pipe,
                                param_grid=grid_param_pipe,
                                scoring="neg_mean_squared_error",
                                cv=5,
                                n_jobs=5,
                                verbose=3)

# Run CV
grid_search_pipe.fit(X_train, y_train,
                     XGB__early_stopping_rounds=10,
                     XGB__eval_metric="rmse",
                     XGB__eval_set=[[X_val, y_val]])

The problem is that the fit method requires an evaluation set created externally, but we cannot create one before the transformation by the pipeline.
This is a bit hacky, but the idea is to create a thin wrapper around the xgboost regressor/classifier that prepares the evaluation set internally.
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, XGBClassifier

class XGBoostWithEarlyStop(BaseEstimator):
    def __init__(self, early_stopping_rounds=5, test_size=0.1,
                 eval_metric='mae', **estimator_params):
        self.early_stopping_rounds = early_stopping_rounds
        self.test_size = test_size
        self.eval_metric = eval_metric
        if self.estimator is not None:
            self.set_params(**estimator_params)

    def set_params(self, **params):
        return self.estimator.set_params(**params)

    def get_params(self, **params):
        return self.estimator.get_params()

    def fit(self, X, y):
        # hold out part of the (already transformed) training fold as the early-stopping eval set
        x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=self.test_size)
        self.estimator.fit(x_train, y_train,
                           early_stopping_rounds=self.early_stopping_rounds,
                           eval_metric=self.eval_metric, eval_set=[(x_val, y_val)])
        return self

    def predict(self, X):
        return self.estimator.predict(X)

class XGBoostRegressorWithEarlyStop(XGBoostWithEarlyStop):
    def __init__(self, *args, **kwargs):
        self.estimator = XGBRegressor()
        super(XGBoostRegressorWithEarlyStop, self).__init__(*args, **kwargs)

class XGBoostClassifierWithEarlyStop(XGBoostWithEarlyStop):
    def __init__(self, *args, **kwargs):
        self.estimator = XGBClassifier()
        super(XGBoostClassifierWithEarlyStop, self).__init__(*args, **kwargs)
Below is a test.
from sklearn.datasets import load_diabetes
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

x, y = load_diabetes(return_X_y=True)
print(x.shape, y.shape)
# (442, 10) (442,)

pipe = Pipeline([
    ('pca', PCA(5)),
    ('xgb', XGBoostRegressorWithEarlyStop())
])

param_grid = {
    'pca__n_components': [3, 5, 7],
    'xgb__n_estimators': [10, 20, 30, 50]
}

grid = GridSearchCV(pipe, param_grid, scoring='neg_mean_absolute_error')
grid.fit(x, y)
print(grid.best_params_)
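To sanity-check that early stopping actually kicked in during the refit, you can peek at the fitted booster inside the best pipeline. A minimal sketch, assuming a reasonably recent xgboost where the sklearn wrapper exposes best_iteration after an early-stopping fit:

# the refit pipeline -> our wrapper step -> the underlying XGBRegressor
xgb_step = grid.best_estimator_.named_steps['xgb']
print(xgb_step.estimator.best_iteration)  # boosting round at which training stopped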
If we were to file a feature request with the developers, the easiest extension would be for XGBRegressor to create the evaluation set internally when one is not provided. That way, no extension to scikit-learn would be necessary (I guess).
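As a side note (my addition, not from the original answer), newer xgboost releases (1.6+, if I remember correctly) accept early_stopping_rounds and eval_metric in the XGBRegressor constructor. The eval_set still has to be passed to fit, though, so inside a pipeline it would still receive untransformed data, and the internal-split wrapper above remains the simpler route. A rough sketch of that constructor-level API:

from xgboost import XGBRegressor

# assumes xgboost >= 1.6 and pre-split, pre-transformed X_train/X_val
model = XGBRegressor(n_estimators=1000,
                     early_stopping_rounds=10,
                     eval_metric="rmse")
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
print(model.best_iteration)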

Related

gridsearchcv best_estimator parameter doesn't have the same value as the fitted model when using pipeline indexing. Also uses sequential feature selection

The whole idea is to perform a grid search over all possible values of lambda, where each value of lambda gives a specific best subset of features. At the end of the day I'm trying to do hyperparameter tuning (lambda) and feature selection at the same time. Any advice is greatly appreciated, thank you so much!
ISSUE:
gs_cv.best_estimator_[0].estimator.alpha comes out as 0.0, while gs_cv.best_estimator_[1].alpha comes out as 1.0 (pipeline indexing results).
The best parameter from the grid search doesn't seem to be applied to the model part of the pipeline, as seen in the output below.
I got this when printing gs_cv.best_estimator_.named_steps; the final Ridge() still uses the default alpha of 1:
{'sfs_ridge': SequentialFeatureSelector(estimator=Ridge(alpha=0.0), k_features=5,
scoring='r2'), 'ridge_regression': Ridge()}
------------Code------------------
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
#Model
ridge = Ridge()
#hyperparameter_alpha = np.logspace(-6,6, num=5)
#SFS model
sfs_ridge = SFS(estimator=ridge, k_features = 5, forward=True, floating=False, scoring='r2', cv = 5)
#Pipeline model
pipe = Pipeline([ ('sfs_ridge', sfs_ridge), ('ridge_regression', ridge) ])
#GridSearchCV
#The parameter_grid for the model should start with the name you give when defining the pipeline!!
param_grid = [ {'sfs_ridge__k_features': [2,4,5] ,'sfs_ridge__estimator__alpha': np.arange(0,1,0.05) }]
gs_cv = GridSearchCV(estimator= pipe, param_grid= param_grid, scoring="neg_mean_absolute_error", n_jobs = -1, cv=5, refit=True)
gs_cv.fit(X_train, y_train)
print(gs_cv.best_estimator_[0].estimator.alpha) #print out 0.0
print(gs_cv.best_estimator_[1].alpha) #print out 1.0
print(gs_cv.best_estimator_[0].k_feature_idx_)
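No answer is reproduced here, but one workaround worth noting (my assumption, not part of the original post): GridSearchCV accepts a list of parameter dicts, so you can pin the SFS's inner Ridge alpha and the final Ridge alpha to the same value in every grid cell, reusing pipe, X_train and y_train from the snippet above:

import numpy as np
from sklearn.model_selection import GridSearchCV

# each dict ties both alphas together, so the two Ridge steps stay in sync
alphas = np.arange(0, 1, 0.05)
param_grid = [{'sfs_ridge__estimator__alpha': [a],
               'ridge_regression__alpha': [a]} for a in alphas]

gs_cv = GridSearchCV(estimator=pipe, param_grid=param_grid,
                     scoring="neg_mean_absolute_error", n_jobs=-1, cv=5, refit=True)
gs_cv.fit(X_train, y_train)
print(gs_cv.best_estimator_[1].alpha)  # now reflects the tuned value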

Perform cross-validation with GLM regression model in Python

How do I perform cross-validation with a GLM regression model?
I have created a GLM model with sm.GLM(endog, exog, family=sm.families.Gamma(link=sm.families.links.log())).fit() and I need to cross-validate the result; however, I cannot find a way to do this with an sm.GLM model. I found multiple examples where model = LogisticRegression() is used, but this is not applicable to my data.
Here is the code:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold
Test = pd.read_csv(r'D:\myfile.csv')
endog = Test['Y']
exog = Test[['log_X1', 'log_A', 'log_B']]
glm_model = sm.GLM(endog, exog, family=sm.families.Gaussian(link=sm.families.links.log())).fit()
y_pred = glm_model.predict()
scoring = "neg_root_mean_squared_error"
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
crossvalidation = KFold(n_splits=10)
scores = cross_val_score(glm_model, X_train, y_train, scoring="neg mean_squared_error", cv=crossvalidation)
With this particular line I get an error. Perhaps there are other ways to do this?
scores = cross_val_score(glm_model, X_train, y_train, scoring="neg mean_squared_error", cv=crossvalidation)
TypeError: estimator should be an estimator implementing 'fit' method, <statsmodels.genmod.generalized_linear_model.GLMResultsWrapper object at 0x000002972A2181F0> was passed
The answer is SMWrapper:
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin

class SMWrapper(BaseEstimator, RegressorMixin):
    """ A universal sklearn-style wrapper for statsmodels regressors """
    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        if self.fit_intercept:
            X = sm.add_constant(X)
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
        return self

    def predict(self, X):
        if self.fit_intercept:
            X = sm.add_constant(X)
        return self.results_.predict(X)
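To wire this up with the Gamma-log GLM from the question, one option (a sketch of mine, not part of the original answer) is to bind the family with functools.partial so the wrapper can still construct the model as model_class(y, X); it assumes a recent scikit-learn for the neg_root_mean_squared_error scorer:

from functools import partial
import statsmodels.api as sm
from sklearn.model_selection import KFold, cross_val_score

# bind the family/link so SMWrapper can call model_class(y, X)
gamma_log_glm = partial(sm.GLM, family=sm.families.Gamma(link=sm.families.links.log()))

scores = cross_val_score(SMWrapper(gamma_log_glm), exog, endog,
                         scoring="neg_root_mean_squared_error",
                         cv=KFold(n_splits=10))
print(scores.mean(), scores.std())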

I got an error while running lasso regression method

raise NotFittedError(msg % {'name': type(estimator).__name__})
sklearn.exceptions.NotFittedError: This Lasso instance is not fitted
yet. Call 'fit' with appropriate arguments before using this
estimator.
from sklearn import datasets
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
#
# Load the Boston Data Set
#
bh = datasets.load_boston()
X = bh.data
y = bh.target
#
# Create training and test split
#
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#
# Create an instance of Lasso Regression implementation
#
lasso = Lasso(alpha=1.0)
#
# Fit the Lasso model
#
lasso.fit(X_test, y_test)
#
# Create the model score
#
#lasso.score(X_test, y_test), lasso.score(X_train, y_train)
lasso_reg = Lasso(normalize=True)
y_pred_lass =lasso_reg.predict(X_test)
print(y_pred_lass)
You've actually created two lasso models: one called lasso, which you fit, but after that you create another one, lasso_reg = Lasso(normalize=True), and try to call predict on it even though that model hasn't been fitted yet. Try this:
from sklearn import datasets
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
bh = datasets.load_boston()
X = bh.data
y = bh.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
lasso = Lasso(alpha=1.0, normalize=True)
lasso.fit(X_test, y_test)
y_pred_lass =lasso.predict(X_test)
print(y_pred_lass)
As the error says, you have to call lasso_reg.fit(X_test, y_test) before calling lasso_reg.predict(X_test). This will fix the issue:
lasso_reg = Lasso(normalize=True)
lasso_reg.fit(X_test, y_test)
y_pred_lass =lasso_reg.predict(X_test)
print(y_pred_lass)

how to run the same linear model n times?

I built a linear model with sklearn based on the Cement and Concrete Composites dataset.
Initially, I used train_test_split(X, Y, test_size=0.3, shuffle=False) and computed the train and test errors.
Now I want to run the same model 10 times with shuffle=True and compute the mean and SD of the errors, so the new results can be compared to the first ones.
How could I loop the same model n times and save the errors in a list?
Try something like this:
from sklearn.metrics import mean_squared_error  # accuracy_score only applies to classification; use a regression metric
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

errors = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, shuffle=True)
    model = LinearRegression()  # the model you want to use here
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    error = mean_squared_error(y_test, y_pred)  # the error metric you want to use here
    errors.append(error)
What you need is cross-validation: repeated evaluation of the model on different splits of the same data. train_test_split in this case is a wrapper around ShuffleSplit cross-validation.
In your case it might look like this:
from sklearn.model_selection import ShuffleSplit, cross_val_score
import numpy as np
from sklearn.linear_model import LinearRegression
X, y = ... # read dataset
model = LinearRegression()
# n_splits=10 is for 10 random shuffled train-test splits
cv = ShuffleSplit(n_splits=10, test_size=.3, random_state=0)
scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error')
np.mean(scores), np.std(scores)
If you want to compute the error on your own or do anything else with models/results, you could do it like this:
for train_ids, test_ids in cv.split(X):
    model.fit(X[train_ids], y[train_ids])
    model.score(X[test_ids], y[test_ids])
    ...
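For example, to get the mean and standard deviation of the RMSE directly from that loop, a sketch along the lines of the snippet above (assuming X and y are NumPy arrays and cv/model are defined as before):

import numpy as np
from sklearn.metrics import mean_squared_error

rmse_per_split = []
for train_ids, test_ids in cv.split(X):
    model.fit(X[train_ids], y[train_ids])
    y_pred = model.predict(X[test_ids])
    rmse_per_split.append(np.sqrt(mean_squared_error(y[test_ids], y_pred)))

print(np.mean(rmse_per_split), np.std(rmse_per_split))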
More about this:
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html

How to pass two estimator objects to sklearn's GridSearchCV so that they have the same parameters in each step?

I'm trying to use GridSearchCV from SKlearn to tune hyperparameters for my estimator.
In the first step, the estimator is used for SequentialFeatureSelection, a custom library that performs wrapper-based feature selection. This means iteratively adding new features and identifying the ones with which the estimator performs best. Hence, the SequentialFeatureSelection method requires my estimator. This library is programmed so that it works perfectly fine with sklearn, so I integrate it as the first step of the GridSearchCV pipeline to transform the features down to the selected ones.
In the second step, I would like to use exactly the same classifier with exactly the same parameters to be fitted and to predict the outcome. However, with the parameter grid I can only set the parameters either on the classifier that I pass to SequentialFeatureSelector OR on the one in 'clf', and I cannot ensure that they are always the same.
Finally, with the selected features and selected parameters I want to predict on a previously held-out test set.
At the bottom of the SFS library's documentation page, they show how to use SFS with GridSearchCV, but there the KNN algorithm used to select features and the one used to predict also use different parameters. And when I check for myself after training SFS and GridSearchCV, the parameters are never the same, even when I use clf.clone() as proposed. Here is my code:
import sklearn.pipeline
import sklearn.tree
import sklearn.model_selection
import mlxtend.feature_selection

def sfs(x, y):
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=0.2, random_state=0)
    clf = sklearn.tree.DecisionTreeClassifier()
    param_grid = {
        "sfs__estimator__max_depth": [5]
    }
    sfs = mlxtend.feature_selection.SequentialFeatureSelector(clone_estimator=True,  # clone like in the tutorial
                                                              estimator=clf,
                                                              k_features=10,
                                                              forward=True,
                                                              floating=False,
                                                              scoring='accuracy',
                                                              cv=3,
                                                              n_jobs=1)
    pipe = sklearn.pipeline.Pipeline([('sfs', sfs), ("clf", clf)])
    gs = sklearn.model_selection.GridSearchCV(estimator=pipe,
                                              param_grid=param_grid,
                                              scoring='accuracy',
                                              n_jobs=1,
                                              cv=3,
                                              refit=True)
    gs = gs.fit(x_train, y_train)
    # Both estimators should have depth 5!
    print("SFS Final Estimator Depth: " + str(gs.best_estimator_.named_steps.sfs.estimator.max_depth))
    print("CLF Final Estimator Depth: " + str(gs.best_estimator_._final_estimator.max_depth))
    # Evaluate...
    y_test_pred = gs.predict(x_test)
    # Accuracy etc...
The question is: how do I ensure that they always have the same parameters set within the same pipeline?
Thanks!
I found a solution where I override some methods of the SequentialFeatureSelector (SFS) class so that its estimator is also used for predicting after the transformation. This is done by introducing a custom SFS class, 'CSequentialFeatureSelector', that overrides the following methods of SFS:
In the fit(self, X, y) method, not only is the normal fit performed, but self.estimator is also fitted on the transformed data, so that it is possible to implement predict and predict_proba methods for the SFS class.
I implemented predict and predict_proba methods for the SFS class that call the predict and predict_proba methods of the fitted self.estimator.
Hence, I only have one estimator left that is used for SFS and predicting.
Here is some of the code:
import sklearn.pipeline
import sklearn.tree
import sklearn.model_selection
import mlxtend.feature_selection

class CSequentialFeatureSelector(mlxtend.feature_selection.SequentialFeatureSelector):
    def predict(self, X):
        X = self.transform(X)
        return self.estimator.predict(X)

    def predict_proba(self, X):
        X = self.transform(X)
        return self.estimator.predict_proba(X)

    def fit(self, X, y):
        self.fit_helper(X, y)  # fit_helper is the 'old' fit method, which I copied and renamed to fit_helper
        self.estimator.fit(self.transform(X), y)
        return self

def sfs(x, y):
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=0.2, random_state=0)
    clf = sklearn.tree.DecisionTreeClassifier()
    param_grid = {
        "sfs__estimator__max_depth": [3, 4, 5]
    }
    sfs = CSequentialFeatureSelector(clone_estimator=True,
                                     estimator=clf,
                                     k_features=10,
                                     forward=True,
                                     floating=False,
                                     scoring='accuracy',
                                     cv=3,
                                     n_jobs=1)
    # Now only one estimator object in the pipeline (in fact the pipeline is not even needed anymore)
    pipe = sklearn.pipeline.Pipeline([('sfs', sfs)])
    gs = sklearn.model_selection.GridSearchCV(estimator=pipe,
                                              param_grid=param_grid,
                                              scoring='accuracy',
                                              n_jobs=1,
                                              cv=3,
                                              refit=True)
    gs = gs.fit(x_train, y_train)
    print("SFS Final Estimator Depth: " + str(gs.best_estimator_.named_steps.sfs.estimator.max_depth))
    y_test_pred = gs.predict(x_test)
    # Evaluate performance of y_test_pred
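To close the loop on that last "evaluate performance" comment, here is a small sketch of what the end of the sfs function might look like; accuracy_score and k_feature_idx_ are standard sklearn/mlxtend attributes, but the exact reporting is my own assumption:

from sklearn.metrics import accuracy_score

print("Test accuracy:", accuracy_score(y_test, y_test_pred))
# indices of the features the SFS step ended up selecting
print("Selected features:", gs.best_estimator_.named_steps.sfs.k_feature_idx_)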
