Perform cross-validation with a GLM regression model in Python

How do I perform cross-validation with a GLM regression model?
I have created a GLM model with sm.GLM(endog, exog, family=sm.families.Gamma(link=sm.families.links.log())).fit() and now need to cross-validate the result, but I cannot find a way to do this with an sm.GLM model. I found multiple examples where model = LogisticRegression() is used, but that is not applicable to my data.
Here is the code:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold
Test = pd.read_csv(r'D:\myfile.csv')
endog = Test['Y']
exog = Test[['log_X1', 'log_A', 'log_B']]
glm_model = sm.GLM(endog, exog, family=sm.families.Gaussian(link=sm.families.links.log())).fit()
y_pred = glm_model.predict()
scoring = "neg_root_mean_squared_error"
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
crossvalidation = KFold(n_splits=10)
scores = cross_val_score(glm_model, X_train, y_train, scoring="neg mean_squared_error", cv=crossvalidation)
With this particular line I get an error. Perhaps there are other ways to do this?
scores = cross_val_score(glm_model, X_train, y_train, scoring=scoring, cv=crossvalidation)
TypeError: estimator should be an estimator implementing 'fit' method, <statsmodels.genmod.generalized_linear_model.GLMResultsWrapper object at 0x000002972A2181F0> was passed

The answer is to wrap the statsmodels model in an sklearn-style estimator. cross_val_score expects an unfitted estimator exposing fit and predict, whereas sm.GLM(...).fit() returns an already-fitted GLMResultsWrapper, hence the TypeError. A wrapper class such as SMWrapper solves this:
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin

class SMWrapper(BaseEstimator, RegressorMixin):
    """A universal sklearn-style wrapper for statsmodels regressors."""
    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        if self.fit_intercept:
            X = sm.add_constant(X)
        # statsmodels takes (endog, exog) at construction time, then fit()
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
        return self

    def predict(self, X):
        if self.fit_intercept:
            X = sm.add_constant(X)
        return self.results_.predict(X)
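
With this wrapper, the Gamma GLM from the question can be cross-validated directly. Here is a hedged usage sketch: functools.partial binds the family argument so the wrapper can call model_class(y, X); endog and exog are the question's variables, and note that newer statsmodels releases spell the link class Log rather than log.

from functools import partial
import numpy as np
from sklearn.model_selection import KFold, cross_val_score

# bind the family so the wrapper only has to supply (endog, exog);
# use sm.families.links.Log() on recent statsmodels versions
glm_gamma = partial(sm.GLM, family=sm.families.Gamma(link=sm.families.links.log()))
scores = cross_val_score(SMWrapper(glm_gamma), exog, endog,
                         scoring="neg_root_mean_squared_error",
                         cv=KFold(n_splits=10))
print(np.mean(scores), np.std(scores))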


Find and use top 10 features in XGBoost regression pipeline

I get the top 10 features of my XGBRegressor with ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10), but how can I use this in my pipeline?
I have the class FeatureSelector_Only_Top_10 below; how can I make it keep only the top 10 features and print them later, e.g. print(grid.feature_selection_top_10.top10features)?
Imports:
import time
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
XGB:
xgb_reg_start = time.time()
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train_nor, y_train)
training_preds_xgb_reg = xgb_reg.predict(X_train_nor)
val_preds_xgb_reg = xgb_reg.predict(X_test_nor)
xgb_reg_end = time.time()
print(f"Time taken to run: {round((xgb_reg_end - xgb_reg_start)/60,1)} minutes")
print("\nTraining MSE:", round(metrics.mean_squared_error(y_train, training_preds_xgb_reg),4))
print("Validation MSE:", round(metrics.mean_squared_error(y_test, val_preds_xgb_reg),4))
print("\nTraining r2:", round(metrics.r2_score(y_train, training_preds_xgb_reg),4))
print("Validation r2:", round(metrics.r2_score(y_test, val_preds_xgb_reg),4))
ft_weights_xgb_reg = pd.DataFrame(xgb_reg.feature_importances_, columns=['weight'], index=X_train.columns)
ft_weights_xgb_reg.sort_values('weight', inplace=True)
ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10)
Pipeline:
class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=10):
        self.n_components = n_components
    def fit(self, X, y=None):
        # Don't know
        return self
    def transform(self, X, y=None):
        # Don't know
        return X

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
steps = [#('feature_selection_top_10', FeatureSelector_Only_Top_10()),
         #('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=100))),
         ('lasso', Lasso(alpha=0.03))]
pipeline = Pipeline(steps)
parameteres = { }
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print("score = %3.2f" % (grid.score(X_test, y_test)))
If you want to select the N best features of your dataset in your Pipeline, you should define a custom Transformer.
This transformer should train an xgboost model and record the N best features during fit(); then, during transform(), it should filter your dataset accordingly.
I would do it as follows:
from sklearn.datasets import make_regression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import pandas as pd
import xgboost as xgb

class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=10):
        self.n_components = n_components
        self.top_n_features = None
    def fit(self, X, y=None):
        # train an auxiliary xgboost model and keep the n_components
        # features with the highest importance weights
        X = pd.DataFrame(X)
        xgb_reg = xgb.XGBRegressor()
        xgb_reg.fit(X, y)
        self.top_n_features = (pd.DataFrame(
                xgb_reg.feature_importances_,
                columns=['weight'],
                index=X.columns)
            .sort_values(by='weight', ascending=False)
            .head(self.n_components)
        )
        return self
    def transform(self, X, y=None):
        # keep only the columns selected during fit
        return pd.DataFrame(X).filter(self.top_n_features.index)

X, y = make_regression(n_features=50)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
steps = [('feature_selection_top_10', FeatureSelector_Only_Top_10()),
         ('lasso', Lasso(alpha=0.03))]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
print("score = %3.2f" % (pipeline.score(X_test, y_test)))
# retrieve the top N features and their weights
pipeline['feature_selection_top_10'].top_n_features
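If you also want this inside a grid search, as in the question, the fitted selector can be pulled out of the refitted best estimator afterwards. A short sketch reusing the pipeline above (the step name matches the one defined there):

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipeline, param_grid={}, cv=5)
grid.fit(X_train, y_train)
# the fitted transformer lives inside the refit best estimator
print(grid.best_estimator_['feature_selection_top_10'].top_n_features)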
You can include SelectFromModel in the pipeline to extract the top 10 features based on their importance weights; there is no need to create a custom transformer. As explained in the documentation, if you want to select exactly 10 features you need to set max_features=10 and threshold=-np.inf.
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
X, y = make_regression(n_features=100, n_samples=1000, random_state=42)
X = pd.DataFrame(data=X, columns=['x' + str(i) for i in range(X.shape[1])])
y = pd.Series(y, name='y')
pipeline = Pipeline([
    ('selector', SelectFromModel(estimator=XGBRegressor(), max_features=10, threshold=-np.inf)),
    ('regressor', LinearRegression())
])
pipeline.fit(X, y)
selected_features = pipeline['selector'].get_support()
print(selected_features.sum())
# 10
selected_features_names = X.columns[selected_features].tolist()
print(selected_features_names)
# ['x0', 'x14', 'x17', 'x35', 'x42', 'x43', 'x57', 'x71', 'x84', 'x95']
selected_features_importances = pipeline['selector'].estimator_.feature_importances_[selected_features]
print(selected_features_importances)
# [0.09361505 0.18474296 0.14420615 0.01952794 0.10946904 0.02192107 0.03307951 0.02948984 0.02851948 0.1216883]
selected_features_coefficients = pipeline['regressor'].coef_
print(selected_features_coefficients)
# [49.43000693 83.91437854 78.25242596 -0.76411769 56.67970515 0.16829694 28.81967319 0.50277914 24.55006237 68.17120687]

I got an error while running the lasso regression method

raise NotFittedError(msg % {'name': type(estimator).__name__})
sklearn.exceptions.NotFittedError: This Lasso instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
from sklearn import datasets
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
#
# Load the Boston Data Set
#
bh = datasets.load_boston()
X = bh.data
y = bh.target
#
# Create training and test split
#
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#
# Create an instance of Lasso Regression implementation
#
lasso = Lasso(alpha=1.0)
#
# Fit the Lasso model
#
lasso.fit(X_test, y_test)
#
# Create the model score
#
#lasso.score(X_test, y_test), lasso.score(X_train, y_train)
lasso_reg = Lasso(normalize=True)
y_pred_lass = lasso_reg.predict(X_test)
print(y_pred_lass)
You've actually created two lasso models: one called lasso, which you fit, and another, lasso_reg = Lasso(normalize=True), on which you then call predict even though it has never been fitted. Try this:
from sklearn import datasets
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
bh = datasets.load_boston()
X = bh.data
y = bh.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
lasso = Lasso(alpha=1.0, normalize=True)
lasso.fit(X_test, y_test)
y_pred_lass = lasso.predict(X_test)
print(y_pred_lass)
As the error says, you have to call lasso_reg.fit(X_test, y_test) before calling lasso_reg.predict(X_test). This will fix the issue.
lasso_reg = Lasso(normalize=True)
lasso_reg.fit(X_test, y_test)
y_pred_lass = lasso_reg.predict(X_test)
print(y_pred_lass)
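As a side note, the usual pattern is to fit on the training split and evaluate on the held-out split; the snippets above fit on X_test only because the question did. A minimal sketch of the conventional flow, using the California housing data since load_boston and the normalize parameter were removed in scikit-learn 1.2:

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
lasso = Lasso(alpha=1.0)
lasso.fit(X_train, y_train)          # fit on the training data only
print(lasso.score(X_test, y_test))   # R^2 on the held-out test split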

how to run the same linear model n times?

I built a linear model with sklearn based on the Cement and Concrete Composites dataset.
Initially, I used train_test_split(X, Y, test_size=0.3, shuffle=False) and computed the train and test errors.
Now I am trying to run the same model 10 times with shuffle=True and compute the mean and standard deviation of the errors, to compare the new results with the first ones.
How can I loop the same model n times and save the errors in a list?
Try something like this:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

errors = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, shuffle=True)
    model = LinearRegression()  # the model you want to use here
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    error = mean_squared_error(y_test, y_pred)  # the error metric you want to use here
    errors.append(error)
print(np.mean(errors), np.std(errors))
What you need is cross-validation: repeated evaluation of the model on different splits of the same data. train_test_split in this case is a wrapper around ShuffleSplit cross-validation.
In your case it might look like this:
from sklearn.model_selection import ShuffleSplit, cross_val_score
import numpy as np
from sklearn.linear_model import LinearRegression
X, y = ... # read dataset
model = LinearRegression()
# n_splits=10 is for 10 random shuffled train-test splits
cv = ShuffleSplit(n_splits=10, test_size=.3, random_state=0)
scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error')
np.mean(scores), np.std(scores)
If you want to compute the error on your own or do anything else with models/results, you could do it like this:
for train_ids, test_ids in cv.split(X):
    model.fit(X[train_ids], y[train_ids])
    model.score(X[test_ids], y[test_ids])
    ...
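For instance, to get the asker's mean and standard deviation of test errors from this loop, a sketch on synthetic data (the real dataset is elided above):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit

X, y = make_regression(n_samples=500, n_features=8, noise=15, random_state=0)
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
model = LinearRegression()
errors = []
for train_ids, test_ids in cv.split(X):
    model.fit(X[train_ids], y[train_ids])
    errors.append(mean_squared_error(y[test_ids], model.predict(X[test_ids])))
print(np.mean(errors), np.std(errors))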
More about this:
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html

How to use custom scoring function in sklearn cross_val_score

I want to use adjusted R-squared in the cross_val_score function. I tried the make_scorer function, but it is not working.
from sklearn.cross_validation import train_test_split
X_tr, X_test, y_tr, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regression = LinearRegression(normalize=True)
from sklearn.metrics.scorer import make_scorer
from sklearn.metrics import r2_score
def adjusted_rsquare(y_true, y_pred):
    adjusted_r_squared = 1 - (1 - r2_score(y_true, y_pred)) * (len(y_pred) - 1) / (len(y_pred) - X_test.shape[1] - 1)
    return adjusted_r_squared
my_scorer = make_scorer(adjusted_rsquare, greater_is_better=True)
score = np.mean(cross_val_score(regression, X_tr, y_tr, scoring=my_scorer,cv=crossvalidation, n_jobs=1))
It is throwing an error:
IndexError: positional indexers are out-of-bounds
Is there any way to use my custom function i.e; adjusted_rsquare with cross_val_score?
adjusted_rsquare(X, Y) is a number, not a function; pass the function itself when creating the scorer:
my_scorer = make_scorer(adjusted_rsquare, greater_is_better=True)
You also need to change the score function's signature:
def adjusted_rsquare(y_true, y_pred, **kwargs):
That's the prototype you should use: the scorer compares the actual values (y_true) with the predicted ones (y_pred).
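A likely culprit for the IndexError is that adjusted_rsquare reads X_test from the enclosing scope, while cross_val_score indexes each fold internally. Here is a self-contained sketch that passes the number of predictors through make_scorer instead (the parameter p is an assumption introduced here; make_scorer forwards extra keyword arguments to the score function):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import cross_val_score

def adjusted_rsquare(y_true, y_pred, p):
    # adjusted R^2 with n samples and p predictors
    n = len(y_true)
    return 1 - (1 - r2_score(y_true, y_pred)) * (n - 1) / (n - p - 1)

X, y = make_regression(n_samples=200, n_features=5, noise=10, random_state=0)
my_scorer = make_scorer(adjusted_rsquare, greater_is_better=True, p=X.shape[1])
scores = cross_val_score(LinearRegression(), X, y, scoring=my_scorer, cv=5)
print(np.mean(scores))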

XGBoost with GridSearchCV, Scaling, PCA, and Early-Stopping in sklearn Pipeline

I want to combine an XGBoost model with input scaling and feature space reduction by PCA. In addition, the hyperparameters of the model as well as the number of components used in the PCA should be tuned using cross-validation. And to prevent the model from overfitting, early stopping should be added.
For combining the various steps, I decided to use sklearn's Pipeline functionality.
At the beginning, I had some problems making sure that the PCA is also applied to the validation set, but I think passing XGB__eval_set does the trick.
The code actually runs without errors, but it seems to run forever (at some point the CPU usage of all cores drops to zero, yet the processes keep running for hours; I had to kill the session eventually).
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
# Train / Test split
X_train, X_test, y_train, y_test = train_test_split(X_with_features, y, test_size=0.2, random_state=123)
# Train / Validation split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)
# Pipeline
pipe = Pipeline(steps=[("Scale", StandardScaler()),
                       ("PCA", PCA()),
                       ("XGB", XGBRegressor())])
# Hyper-parameter grid (test only)
grid_param_pipe = {'PCA__n_components': [5],
                   'XGB__n_estimators': [1000],
                   'XGB__max_depth': [3],
                   'XGB__reg_alpha': [0.1],
                   'XGB__reg_lambda': [0.1]}
# Grid object
grid_search_pipe = GridSearchCV(estimator=pipe,
                                param_grid=grid_param_pipe,
                                scoring="neg_mean_squared_error",
                                cv=5,
                                n_jobs=5,
                                verbose=3)
# Run CV
grid_search_pipe.fit(X_train, y_train,
                     XGB__early_stopping_rounds=10,
                     XGB__eval_metric="rmse",
                     XGB__eval_set=[[X_val, y_val]])
The problem is that the fit method requires an evaluation set created externally, but the pipeline's transformations cannot be applied to such a set beforehand.
This is a bit hacky, but the idea is to create a thin wrapper around the xgboost regressor/classifier that prepares the evaluation set internally.
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, XGBClassifier

class XGBoostWithEarlyStop(BaseEstimator):
    def __init__(self, early_stopping_rounds=5, test_size=0.1,
                 eval_metric='mae', **estimator_params):
        self.early_stopping_rounds = early_stopping_rounds
        self.test_size = test_size
        self.eval_metric = eval_metric
        if self.estimator is not None:
            self.set_params(**estimator_params)

    def set_params(self, **params):
        return self.estimator.set_params(**params)

    def get_params(self, **params):
        return self.estimator.get_params()

    def fit(self, X, y):
        # split off an evaluation set *after* the pipeline transformations
        x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=self.test_size)
        self.estimator.fit(x_train, y_train,
                           early_stopping_rounds=self.early_stopping_rounds,
                           eval_metric=self.eval_metric,
                           eval_set=[(x_val, y_val)])
        return self

    def predict(self, X):
        return self.estimator.predict(X)

class XGBoostRegressorWithEarlyStop(XGBoostWithEarlyStop):
    def __init__(self, *args, **kwargs):
        self.estimator = XGBRegressor()
        super(XGBoostRegressorWithEarlyStop, self).__init__(*args, **kwargs)

class XGBoostClassifierWithEarlyStop(XGBoostWithEarlyStop):
    def __init__(self, *args, **kwargs):
        self.estimator = XGBClassifier()
        super(XGBoostClassifierWithEarlyStop, self).__init__(*args, **kwargs)
Below is a test.
from sklearn.datasets import load_diabetes
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

x, y = load_diabetes(return_X_y=True)
print(x.shape, y.shape)
# (442, 10) (442,)
pipe = Pipeline([
    ('pca', PCA(5)),
    ('xgb', XGBoostRegressorWithEarlyStop())
])
param_grid = {
    'pca__n_components': [3, 5, 7],
    'xgb__n_estimators': [10, 20, 30, 50]
}
grid = GridSearchCV(pipe, param_grid, scoring='neg_mean_absolute_error')
grid.fit(x, y)
print(grid.best_params_)
As a feature request to the developers, the easiest extension would be to allow XGBRegressor to create an evaluation set internally when one is not provided. That way, no extension to scikit-learn would be necessary (I guess).
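As a side note, recent xgboost releases (1.6 and later, if I remember correctly) accept early_stopping_rounds and eval_metric in the constructor, but eval_set still has to be passed to fit(), so it still bypasses the pipeline's transformations and the wrapper above remains useful. A hedged sketch reusing the X_train/X_val splits from the question:

from xgboost import XGBRegressor

# constructor-level early stopping (assumes xgboost >= 1.6);
# the eval_set passed to fit() is still not run through Pipeline steps
model = XGBRegressor(n_estimators=1000, early_stopping_rounds=10, eval_metric='rmse')
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])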
