pipeline regressor ensemble with scikit-learn version 0.19.2 - python

I'm using scikit-learn version 0.19.2 (for onnx conversion compatibility),
and I'm having problems implementing ensemble methods with Pipeline.
The code below is trying to implement linear regression from two independent regressors:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn import linear_model
import sklearn
# Toy data: two samples with three features each, plus one target per sample
X = [[1, 2, 2], [4, 5, 6]]
y = [1, 2]
# Fit the two base models independently on the same data
forest_clf = sklearn.ensemble.RandomForestRegressor()
forest_clf.fit(X, y)
logistic_clf = linear_model.LogisticRegression()
logistic_clf.fit(X,y)
# Pipeline: a FeatureUnion of the two fitted models feeding a final linear regression.
# Running this snippet ends in the TypeError quoted below, because the union members
# are plain estimators rather than transformers.
estimators = [('random_forest', forest_clf), ('logistic', logistic_clf)]
model = Pipeline( steps=[ ('models', FeatureUnion(estimators) ), ('linear_regression', linear_model.base.LinearRegression() ) ] )
# Fit the whole ensemble end to end
model.fit(X,y)
resulting in error
TypeError: All estimators should implement fit and transform. 'RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=None, verbose=0, warm_start=False)' (type ) doesn't
Can someone help me figure out what went wrong?
p.s I'm looking for some Pipeline technique to imitate the sklearn.ensemble.StackingRegressor from version 0.23.

From the documentation of the Pipeline class:
Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods.
As the error message indicates, the RandomForestRegressor does not implement the transform function (neither does LogisticRegression). Hence, they cannot be used directly as transformers in the pipeline.
If you want to use a pipeline, the only workaround I see is to wrap them in custom classes that implement the transform function as needed:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
class RFTransformer(BaseEstimator, TransformerMixin):
    """Transformer facade over a RandomForestRegressor.

    ``fit`` trains the wrapped forest; ``transform`` emits its predictions
    as a single column so they can be stacked by a FeatureUnion.
    """

    def __init__(self):
        self.rf = RandomForestRegressor()

    def fit(self, X, y=None):
        """Train the wrapped forest on ``(X, y)`` and return ``self``."""
        self.rf.fit(X, y)
        return self

    def transform(self, X):
        """Return the forest's predictions for ``X`` with shape ``(n_samples, 1)``."""
        predictions = self.rf.predict(X)
        return predictions.reshape(-1, 1)
class LRTransformer(BaseEstimator, TransformerMixin):
    """Transformer facade over a LogisticRegression model.

    ``fit`` trains the wrapped model; ``transform`` emits its predictions
    as a single column so they can be stacked by a FeatureUnion.
    """

    def __init__(self):
        self.lr = LogisticRegression()

    def fit(self, X, y=None):
        """Train the wrapped model on ``(X, y)`` and return ``self``."""
        self.lr.fit(X, y)
        return self

    def transform(self, X):
        """Return the model's predictions for ``X`` with shape ``(n_samples, 1)``."""
        predictions = self.lr.predict(X)
        return predictions.reshape(-1, 1)
These custom transformers will simply fit their respective models and the transform function will return their prediction (and if I understand correctly, this is what you want to concatenate and pass to the final estimator).
Now, you can use these transformers like this:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
# Data: two samples with three features each, plus one target per sample
X = [[1, 2, 2], [4, 5, 6]]
y = [1, 2]
# Instantiate the wrapper transformers (unfitted; the pipeline fits them)
forest_clf = RFTransformer()
logistic_clf = LRTransformer()
# Pipeline: the union concatenates both prediction columns, which become
# the input features of the final LinearRegression
estimators = [('random_forest', forest_clf), ('logistic', logistic_clf)]
model = Pipeline(steps=[('models', FeatureUnion(estimators)), ('linear_regression', LinearRegression())])
# Fitting ensemble
model.fit(X, y)
Finally, just to confirm the desired output:
print(model.predict(X))
# Output
# [1. 2.]

Based on @afsharov's answer, I've made a general class for custom transformers:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
import sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
class customTransform(BaseEstimator, TransformerMixin):
    """Expose a regressor's predictions as a one-column transformer.

    Parameters
    ----------
    algo_type : str
        Which model to wrap. One of ``'GradientBoosting'``, ``'svm'``,
        ``'LogisticRegression'``, ``'sgd'``, ``'LinearRegression'``,
        ``'TheilSen'`` or ``'RandomForest'``.

    Raises
    ------
    ValueError
        If ``algo_type`` is not one of the supported names.
    """

    def __init__(self, algo_type):
        # Local import: the file imports sklearn.ensemble and linear_model,
        # but nothing brings sklearn.svm into scope.
        from sklearn.svm import SVR

        # Factories rather than instances, so only the requested model is built.
        # Public class paths are used instead of private sub-modules.
        factories = {
            'GradientBoosting': sklearn.ensemble.GradientBoostingRegressor,
            'svm': SVR,
            'LogisticRegression': lambda: linear_model.LogisticRegression(solver='newton-cg'),
            'sgd': linear_model.SGDRegressor,
            'LinearRegression': linear_model.LinearRegression,
            'TheilSen': linear_model.TheilSenRegressor,
            'RandomForest': lambda: sklearn.ensemble.RandomForestRegressor(criterion='mae'),
        }
        if algo_type not in factories:
            raise ValueError(
                'Unknown algo_type {!r}; expected one of {}'.format(
                    algo_type, sorted(factories)))
        model = factories[algo_type]()

        # Stored under the constructor parameter's own name so that
        # BaseEstimator.get_params() and sklearn.base.clone() can find it.
        self.algo_type = algo_type
        # set the name and the model (kept for existing callers)
        self.name = algo_type
        self.model = model
        setattr(self, algo_type, model)

    def fit(self, X, y=None):
        """Fit the wrapped model on ``(X, y)`` and return ``self``."""
        self.model.fit(X, y)
        return self

    def transform(self, X):
        """Return the wrapped model's predictions with shape ``(n_samples, 1)``."""
        return self.model.predict(X).reshape(-1, 1)

    def show(self):
        """Print the algorithm name followed by the wrapped model."""
        print('{}: {}'.format(self.name, self.model))
# Data: two samples with three features each, plus one target per sample
X = [[1, 2, 2], [4, 5, 6]]
y = [1, 2]
# Instantiate transformers by algorithm name
boosting_clf = customTransform('GradientBoosting')
forest_clf = customTransform('RandomForest')
# Prints the name and the wrapped model of the boosting transformer
boosting_clf.show()
# Pipeline: union of both prediction columns feeding a final LinearRegression
estimators = [('random_forest', forest_clf), ('gradient_boosting', boosting_clf)]
model = Pipeline(steps=[ ('models', FeatureUnion(estimators)), ('linear_regression', linear_model.LinearRegression())] )
# Fitting ensemble
model.fit(X, y)
# predict on the training data and print the result
print(model.predict(X) )

Related

Clip output from sklearn pipeline predict

Let's see the following pipeline:
# NOTE(review): ScalerFactory, scaler_type, lgb, hyperparameters, X and y are
# defined outside this snippet.
scaler = ScalerFactory.get_scaler(scaler_type)
model = MultiOutputRegressor(lgb.LGBMRegressor(metric='tweedie', **hyperparameters))
# Two-step pipeline: scale the features, then fit the multi-output regressor
steps = [('scaler', scaler), ('model', model)]
pipeline = Pipeline(steps)
# The model__ prefix routes feature_name to the 'model' step's fit()
pipeline.fit(X, y, model__feature_name=list(X.columns))
I am trying to add another step to the pipeline, so when it predicts it rounds all the values that are between -1 and 1 to 0.
I am trying to create a new class:
from numpy.random import randint
from sklearn.base import BaseEstimator, TransformerMixin
class OutputClipper(BaseEstimator, TransformerMixin):
    """Stateless transformer that zeroes every value strictly between -1 and 1."""

    def __init__(self) -> None:
        super().__init__()
        self.clipping = False

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a copy of the data with values in the open interval (-1, 1) set to 0.

        ``y`` is now optional so the method matches the ``transform(X)`` call
        that a Pipeline makes. When ``y`` is given it is the array that gets
        clipped, as before; otherwise ``X`` is clipped.

        The input must support ``.copy()`` and boolean-mask assignment
        (e.g. a NumPy array or pandas object). It is no longer modified in place.
        """
        values = X if y is None else y
        clipped = values.copy()
        clipped[(clipped > -1) & (clipped < 1)] = 0
        return clipped
and the new pipeline becomes:
# NOTE(review): the 'clipping' step is given the OutputClipper class itself,
# not an instance (no parentheses) -- confirm this is intended.
steps = [('scaler', scaler), ('model', model), ('clipping',OutputClipper) ]
pipeline = Pipeline(steps)
However, I feel like this doesn't quite work. I guess the transform happens when the pipeline is called with the .predict() method. I am not sure how to test it too.

Perform cross-validation with GLM regression model in Python

How do I perform cross-validation with GLM regression model?
I have created a glm model sm.GLM(endog, exog, family=sm.families.Gamma(link=sm.families.links.log())).fit() and I would need to cross-validate the result, however I cannot find a way to do this with sm.GLM model. Found multiple examples where model = LogisticRegression() is used, but this is not applicable to my data.
Here is the code:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold
# Load the data and split it into the response (endog) and the regressors (exog)
Test = pd.read_csv(r'D:\myfile.csv')
endog = Test['Y']
exog = Test[['log_X1', 'log_A', 'log_B']]
# .fit() returns a results object; that object is what glm_model refers to below
glm_model = sm.GLM(endog, exog, family=sm.families.Gaussian(link=sm.families.links.log())).fit()
y_pred = glm_model.predict()
# NOTE(review): this variable is not used by the cross_val_score call below
scoring = "neg_root_mean_squared_error"
# NOTE(review): X and Y are not defined in this snippet -- presumably exog and endog
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=1)
crossvalidation = KFold(n_splits=10)
# NOTE(review): the scoring string contains a space ("neg mean_squared_error") -- verify the scorer name
scores = cross_val_score(glm_model, X_train, y_train, scoring="neg mean_squared_error", cv=crossvalidation)
I get an error on this particular line. Perhaps there are other ways to do this?
scores = cross_val_score(glm_model, X_train, y_train, scoring="neg mean_squared_error", cv=crossvalidation)
TypeError: estimator should be an estimator implementing 'fit' method, <statsmodels.genmod.generalized_linear_model.GLMResultsWrapper object at 0x000002972A2181F0> was passed
The answer is SMWrapper:
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin
class SMWrapper(BaseEstimator, RegressorMixin):
    """A universal sklearn-style wrapper for statsmodels regressors.

    Parameters
    ----------
    model_class : callable
        statsmodels model class, invoked as ``model_class(y, X)``.
    fit_intercept : bool, default True
        When true, a constant column is added to ``X`` before both fitting
        and predicting.
    """

    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    def _design_matrix(self, X):
        """Return ``X`` with the intercept column added when ``fit_intercept`` is set."""
        if not self.fit_intercept:
            return X
        # has_constant='add' always adds the column. The default ('skip') omits it
        # whenever X already holds a constant column, which happens e.g. for a
        # single-row predict input and then breaks the column count.
        return sm.add_constant(X, has_constant='add')

    def fit(self, X, y):
        """Build the statsmodels model on ``(y, X)``, fit it, and return ``self``."""
        self.model_ = self.model_class(y, self._design_matrix(X))
        self.results_ = self.model_.fit()
        return self

    def predict(self, X):
        """Predict with the fitted results, using the same design matrix layout as ``fit``."""
        return self.results_.predict(self._design_matrix(X))

How do I change - using for loops to call multiple functions - into - using a pipeline to call a class?

So the basic requirement is that, I get a dictionary of models from user, and a dictionary of their hyper parameters and give a report. Currently goal is for binary classification, but this can be extended later.
This is what I am currently doing:
import numpy as np
import pandas as pd
# import pandas_profiling as pp
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, make_scorer
from sklearn import datasets
# import joblib
import warnings
warnings.filterwarnings('ignore')
# Load the breast-cancer dataset into a DataFrame with a 'target' column
cancer = datasets.load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
target = df['target']
# 60/40 train/test split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='target', axis=1), target, test_size=0.4, random_state=13, stratify=target)
def build_model(model_name, model_class, params=None):
    """Return an unfitted model instance, optionally wrapped in a randomized search.

    Parameters
    ----------
    model_name : str
        Display name. Names containing 'Ridge' or 'Lasso' get an l2 / l1
        penalty; names containing 'Ensemble' get an RF + GBM hard-voting setup.
    model_class : callable
        Estimator class (or any factory) used to build the model.
    params : dict or None
        Mapping of model name to hyper-parameter distributions. When it holds
        a non-empty entry for ``model_name`` the model is wrapped in
        ``RandomizedSearchCV``. Otherwise the plain model is returned, so a
        partially filled ``params`` no longer raises ``KeyError``.

    Returns
    -------
    The model instance, or a ``RandomizedSearchCV`` around it.
    """
    if 'Ridge' in model_name:
        model = model_class(penalty='l2')
    elif 'Lasso' in model_name:
        model = model_class(penalty='l1')
    elif 'Ensemble' in model_name:
        model = model_class(estimators=[('rf', RandomForestClassifier()), ('gbm', GradientBoostingClassifier())], voting='hard')
    else:
        model = model_class()

    search_space = params.get(model_name) if params is not None else None
    if not search_space:
        print('No model parameters provided. Using sklearn default values for {} model'.format(model_name))
        return model

    print('Custom Model Parameters provided. Implementing Randomized Search for {} model'.format(model_name))
    return RandomizedSearchCV(estimator=model, param_distributions=search_space,
                              random_state=22, n_iter=10, cv=5, verbose=1, n_jobs=-1,
                              scoring=make_scorer(f1_score), error_score=0.0)
def fit_model(model_name, model_instance, xTrain, yTrain):
    """Fit ``model_instance`` on the training data and return the fitted object.

    For the model named 'SVM' the estimator is wrapped in a pipeline with a
    ``StandardScaler``. The scaler fitted on the training data is therefore
    kept and reused at prediction time, rather than a new scaler being
    refitted on the test data. The returned pipeline exposes ``predict``
    like any other fitted model.
    """
    if model_name == 'SVM':
        # Local import: sklearn.pipeline is not among this script's imports.
        from sklearn.pipeline import make_pipeline
        return make_pipeline(StandardScaler(), model_instance).fit(xTrain, yTrain)
    return model_instance.fit(xTrain, yTrain)


def predict_vals(fitted_model, xTest):
    """Return the predictions of ``fitted_model`` for ``xTest``.

    Any scaling is carried inside the object returned by ``fit_model``, so
    this function no longer depends on a global ``model_name``.
    """
    return fitted_model.predict(xTest)
def get_metrics(yTest, y_prediction):
    """
    Score the predictions against the true labels.

    Returns a list ordered as [recall, precision, f1, roc_auc].
    """
    scorers = (recall_score, precision_score, f1_score, roc_auc_score)
    return [scorer(yTest, y_prediction) for scorer in scorers]
def model_report(list_of_metrics):
    """
    Turn rows of [model name, recall, precision, f1, roc_auc] into a
    DataFrame with values rounded to three decimals.
    """
    header = ['Model', 'Recall', 'Precision', 'f1', 'roc_auc']
    return pd.DataFrame(list_of_metrics, columns=header).round(3)
# Display name -> estimator class. build_model() keys its special cases
# ('Ridge', 'Lasso', 'Ensemble') off substrings of these names.
models = {
    'Logistic Regression Ridge': LogisticRegression,
    'Logistic Regression Lasso': LogisticRegression,
    'Random Forest': RandomForestClassifier,
    'SVM': SVC,
    'GBM': GradientBoostingClassifier,
    'EnsembleRFGBM': VotingClassifier
}
# Display name -> hyper-parameter search space. build_model() hands the entry
# for a model to RandomizedSearchCV as param_distributions.
model_parameters = {
    'SVM': {
        # 25 random draws; note the arguments are given as (50, 1), i.e. high before low
        'C': np.random.uniform(50, 1, [25]),#[1, 10, 100, 1000],
        'class_weight': ['balanced'],
        'gamma': [0.0001, 0.001],
        'kernel': ['linear']
    },
    'Random Forest': {
        'n_estimators': [5, 10, 50, 100, 200],
        'max_depth': [3, 5, 10, 20, 40],
        'criterion': ['gini', 'entropy'],
        'bootstrap': [True, False],
        # a single random value, drawn once when this dict is built
        'min_samples_leaf': [np.random.randint(1,10)]
    },
    'Logistic Regression Ridge': {
        'C': np.random.rand(25),
        'class_weight': ['balanced']
    },
    'Logistic Regression Lasso': {
        'C': np.random.rand(25),
        'class_weight': ['balanced']
    },
    'GBM': {
        'n_estimators': [10, 50, 100, 200, 500],
        'max_depth': [3, 5, 10, None],
        'min_samples_leaf': [np.random.randint(1,10)]
    },
    # rf__ / gbm__ prefixes address the two members of the voting ensemble
    # created in build_model()
    'EnsembleRFGBM': {
        'rf__n_estimators': [5, 10, 50, 100, 200],
        'rf__max_depth': [3, 5, 10, 20, 40],
        'rf__min_samples_leaf': [np.random.randint(1,10)],
        'gbm__n_estimators': [10, 50, 100, 200, 500],
        'gbm__max_depth': [3, 5, 10, None],
        'gbm__min_samples_leaf': [np.random.randint(1,10)]
    }
}
Without parameters I get the following report.
# without parameters: every model is built with its defaults
lst = []
for model_name, model_class in models.items():
    # build -> fit -> predict -> score, one report row per model
    model_instance = build_model(model_name, model_class)
    fitted_model = fit_model(model_name, model_instance, X_train, y_train)
    y_predicted = predict_vals(fitted_model, X_test)
    metrics = get_metrics(y_test, y_predicted)
    lst.append([model_name] + metrics)
# Assemble the collected rows into the report DataFrame
model_report(lst)
With parameters given as input
# with parameters: build_model receives the search spaces in model_parameters
lst = []
for model_name, model_class in models.items():
    # build -> fit -> predict -> score, one report row per model
    model_instance = build_model(model_name, model_class, model_parameters)
    fitted_model = fit_model(model_name, model_instance, X_train, y_train)
    y_predicted = predict_vals(fitted_model, X_test)
    metrics = get_metrics(y_test, y_predicted)
    lst.append([model_name] + metrics)
# Assemble the collected rows into the report DataFrame
model_report(lst)
The task given to me right now is as follows.
Take from user, a dictionary of models, and their parameters. If parameters are not provided, then use defaults of the models.
Give as output the report (as seen in images)
I was told that I should change the functions to classes. And avoid for loops if possible.
My challenges:
How do I change all the functions into a class and methods? Basically my senior wants something like
report.getReport # gives the dataFrame of the report
But the above sounds to me like it can be done in a function as follows (I don't understand why/how a class would be beneficial)
customReport(whatever inputs I'd like to give) # gives df of report
How do I avoid for loops to get through the user inputs for various models? What I thought was that I could use sklearn pipeline, since according to my understanding, pipeline is a series of steps, so from user take the params and models, and execute them as a series of steps. This avoids the for loops.
Something like this
customPipeline = Pipeline([ ('rf', RandomForestClassifier(with relevant params from params dict),
'SVC', SVC(with relevant params from params dict)) ] )
Similar solution I found is here but I would like to avoid for loops as such.
Another related solution here is using a class which can switch between different models. But here I would require that the user be able to give option whether he wants to do Gridsearch/RandomizedSearch/CV/None. My thinking is that I use this class, then inherit this to another class which the user can give input to choose Gridsearch/RandomizedSearch/CV/None etc. I'm not sure if I'm thinking in the right direction.
NOTE A full working solution is desirable (would love it) but not mandatory. It is ok if your answer has a skeleton which can give me a direction how to proceed. I am ok with exploring and learning from it.
I have implemented a working solution. I should have worded my question better. I initially misunderstood how GridsearchCV or RandomizedSearchCV works internally. cv_results_ gives all the results of the grid available. I thought only the best estimator was available to us.
Using this, for each type of model, I took the max rank_test_score, and got the parameters making up the model. In this example, it is 4 models. Now I ran each of those models, i.e. the best combination of parameters for each model, with my test data, and predicted the required scores. I think this solution can be extended to RandomizedSearchCV and a lot more other options.
NOTE: This is just a trivial solution. Lot of modifications necessary, like needing to scale data for specific models, etc. This solution will just serve as a starting point which can be modified according to the user's needs.
Credits to this answer for the ClfSwitcher() class.
Following is the implementation of the class (suggestions to improve are welcomed).
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
import warnings
warnings.filterwarnings('ignore')
# Load the breast-cancer dataset into a DataFrame with a 'target' column
cancer = datasets.load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
target = df['target']
# 60/40 train/test split, stratified on the target, with a fixed seed.
# These module-level splits are also read by report.calc_score below.
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='target', axis=1), target, test_size=0.4, random_state=13, stratify=target)
class ClfSwitcher(BaseEstimator):
    """A custom BaseEstimator that can switch between classifiers.

    Every call is delegated to the wrapped ``model``, so a grid search can
    swap the classifier itself via the ``model`` parameter.
    """

    def __init__(self, model=RandomForestClassifier()):
        """
        :param model: sklearn object - The classifier
        """
        self.model = model

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped model on ``(X, y)``; extra keyword arguments are ignored."""
        self.model.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Return the wrapped model's predictions for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's class probabilities for ``X``."""
        return self.model.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped model's score on ``(X, y)``."""
        # The classifier is stored as ``self.model``; ``self.estimator`` never existed.
        return self.model.score(X, y)
class report(ClfSwitcher):
    """Run a grid search over a ClfSwitcher pipeline and tabulate metrics per model.

    Attributes set by the methods below:
    ``grid`` (the GridSearchCV object), ``full_report`` (all of ``cv_results_``)
    and ``concise_report`` (one row per model class with test-set metrics).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.grid = None
        self.full_report = None
        self.concise_report = None
        # Column name -> metric function used for the concise report
        self.scoring_metrics = {
            'precision': precision_score,
            'recall': recall_score,
            'f1': f1_score,
            'roc_auc': roc_auc_score
        }

    def griddy(self, pipeLine, parameters, **kwargs):
        """Create the GridSearchCV object.

        ``scoring`` may be passed as a keyword argument and is now honoured;
        it defaults to 'accuracy', the value that was previously hard-coded.
        """
        scoring = kwargs.get('scoring', 'accuracy')
        self.grid = GridSearchCV(pipeLine, parameters, scoring=scoring, n_jobs=-1)

    def fit_grid(self, X_train, y_train=None, **kwargs):
        """Run the grid search on the given training data."""
        self.grid.fit(X_train, y_train)

    def make_grid_report(self):
        """Store the complete ``cv_results_`` table in ``full_report``."""
        self.full_report = pd.DataFrame(self.grid.cv_results_)

    @staticmethod
    def get_names(col):
        """Return the class name of an estimator object."""
        return col.__class__.__name__

    @staticmethod
    def calc_score(col, metric):
        """Refit estimator ``col`` on the module-level train split and score it on the test split."""
        # NOTE(review): ``col`` is the estimator object stored in the
        # 'param_cst__model' column; whether it carries the tuned nested
        # hyper-parameters of its row should be verified.
        return round(metric(y_test, col.fit(X_train, y_train).predict(X_test)), 4)

    def make_concise_report(self):
        """Build ``concise_report``: the best-ranked row per model class plus test metrics."""
        results = pd.DataFrame(self.grid.cv_results_)
        results['model_names'] = results['param_cst__model'].apply(self.get_names)
        # rank_test_score == 1 is the best candidate, so sort ascending and keep
        # the first row of each model group (descending kept the worst one).
        best = (
            results.sort_values(['model_names', 'rank_test_score'], ascending=[True, True])
            .groupby(['model_names']).head(1)[['param_cst__model', 'model_names']]
            .reset_index(drop=True)
        )
        for metric_name, metric_func in self.scoring_metrics.items():
            best[metric_name] = best['param_cst__model'].apply(self.calc_score, metric=metric_func)
        self.concise_report = best[['model_names', 'precision', 'recall', 'f1', 'roc_auc', 'param_cst__model']]
# Single-step pipeline; the step name 'cst' is the prefix used in the grid keys below
pipeline = Pipeline([
    ('cst', ClfSwitcher()),
])
# One grid per classifier: 'cst__model' swaps the wrapped classifier and
# 'cst__model__<name>' addresses that classifier's own hyper-parameters
parameters = [
    {
        'cst__model': [RandomForestClassifier()],
        'cst__model__n_estimators': [10, 20],
        'cst__model__max_depth': [5, 10],
        'cst__model__criterion': ['gini', 'entropy']
    },
    {
        'cst__model': [SVC()],
        'cst__model__C': [10, 20],
        'cst__model__kernel': ['linear'],
        'cst__model__gamma': [0.0001, 0.001]
    },
    {
        'cst__model': [LogisticRegression()],
        'cst__model__C': [13, 17],
        'cst__model__penalty': ['l1', 'l2']
    },
    {
        'cst__model': [GradientBoostingClassifier()],
        'cst__model__n_estimators': [10, 50],
        'cst__model__max_depth': [3, 5],
        'cst__model__min_samples_leaf': [1, 2]
    }
]
# Build the search, run it on the training split, then derive the per-model summary
my_report = report()
my_report.griddy(pipeline, parameters, scoring='f1')
my_report.fit_grid(X_train, y_train)
my_report.make_concise_report()
# Bare expression: displays the DataFrame when run in a notebook cell
my_report.concise_report
Output Report as desired.
You can consider using map(), details here: https://www.geeksforgeeks.org/python-map-function/
Some programmers have the habit of avoiding raw loops - "A raw loop is any loop inside a function where the function serves purpose larger than the algorithm
implemented by the loop". More details here: https://sean-parent.stlab.cc/presentations/2013-09-11-cpp-seasoning/cpp-seasoning.pdf
I think that's the reason you are asked to remove for loop.

Sklearn GridSearch with pre-training

I'm making a Sklearn Pipeline with parameter optimization done by GridSearchCV. The pipeline has to find the best model for several different entities using a pre-train and then fine-tune approach: pre-train on all the entities together, then fine-tune on every single entity and return a model for each entity. These are the constraints of the pipeline:
Pre-training and fine-tuning have to be in the same pipeline because both models have to see the same data in each GridSearchCV fold.
The pre-train model has to pass its weights to the fine-tuning model.
I have implemented:
A Sklearn Transformer that takes a data-frame with all the entities in input and fit itself.
A Sklearn Regressor that splits the data-frame in one data-frame for each entity and fit a Keras model for each entity.
What I'm missing is how to pass from the Pre-train transformer to the Fine-tuning transformer the weights obtained by the Pre-train transformer (considering that each GridSearchCV fold has different weights)
Here is the code:
import pandas as pd
import numpy as np
import random
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.metrics import mean_squared_error
from keras.models import Model
from keras.layers import Dense, Input
import copy
class MyRegressor(BaseEstimator, TransformerMixin):
    """Single-hidden-layer Keras regressor that reads inputs and targets from one DataFrame.

    Parameters
    ----------
    neurons : int
        Width of the hidden Dense layer.
    featInput : list of str
        Columns of ``X`` used as network inputs.
    featOutput : list of str
        Columns of ``X`` used as regression targets.
    preTrain : str, list of arrays or None, default None
        Optional initial weights: a string is loaded with ``load_weights``,
        anything else is applied with ``set_weights``. Being a constructor
        parameter, it can be set via ``set_params`` and is kept by ``clone``.
    """

    def __init__(self, neurons, featInput, featOutput, preTrain=None):
        self.neurons = neurons
        self.preTrain = preTrain
        self.featInput = featInput
        self.featOutput = featOutput

    def fit(self, X, y=None):
        """Build, optionally initialise, and train the network on ``X``; ``y`` is unused."""
        X_train = X[self.featInput]
        y_train = X[self.featOutput]
        inputLayer = Input(shape=(len(self.featInput), ), name='INPUT')
        hidden = Dense(self.neurons, name='HIDDEN')(inputLayer)
        outputLayer = Dense(len(self.featOutput), name='OUTPUT')(hidden)
        self.model = Model(inputLayer, outputLayer)
        self.model.compile(loss='mse', optimizer='rmsprop')
        if self.preTrain is not None:
            # Keras has no ``loadWeights``: files go through load_weights,
            # in-memory weight lists through set_weights.
            if isinstance(self.preTrain, str):
                self.model.load_weights(self.preTrain)
            else:
                self.model.set_weights(self.preTrain)
        self.model.fit(X_train, y_train)
        return self

    def predict(self, X):
        """Return the network's predictions for the input columns of ``X``."""
        return self.model.predict(X[self.featInput])

    def transform(self, X):
        """Pass ``X`` through unchanged so the next pipeline step sees the full frame."""
        return X

    def score(self, X, y=None, sample_weight=None):
        """Return the negative mean squared error on ``X`` (higher is better).

        GridSearchCV maximises ``score``; returning the raw MSE would make it
        select the worst configuration.
        """
        y_true = X[self.featOutput]
        y_pred = self.predict(X)
        return -mean_squared_error(y_true, y_pred)
class LoopTransformer(BaseEstimator, TransformerMixin):
    """Fit one deep copy of ``component`` per distinct entity found in ``columns``.

    Parameters
    ----------
    columns : list of str
        Columns whose distinct value combinations identify an entity.
    component : estimator
        Template estimator; a deep copy is fitted for every entity.
    """

    def __init__(self, columns, component):
        self.columns = columns
        self.component = component
        self.components = []

    def _entity_rows(self, X, entity_id):
        """Return a copy of the rows of ``X`` whose id columns all equal ``entity_id``."""
        return X[(X[self.columns] == entity_id).all(axis=1)].copy()

    def fit(self, X, y=None):
        """Fit a fresh component on each entity's rows and return ``self``."""
        # Start from scratch so that refitting does not pile up stale components.
        self.components = []
        for index, idx in X[self.columns].drop_duplicates().iterrows():
            entityDf = self._entity_rows(X, idx)
            fitted = copy.deepcopy(self.component)
            fitted.fit(entityDf, y)
            self.components.append({'id': idx, 'component': fitted})
        return self

    def predict(self, X):
        """Concatenate the per-entity predictions, in the order the entities were fitted.

        Array-like predictions are wrapped in a DataFrame indexed like the
        entity's rows, because ``pd.concat`` only accepts pandas objects.
        Entities with no rows in ``X`` are skipped.
        """
        results = []
        for comp in self.components:
            entityDf = self._entity_rows(X, comp['id'])
            if len(entityDf) == 0:
                continue
            res = comp['component'].predict(entityDf)
            if not isinstance(res, (pd.DataFrame, pd.Series)):
                res = pd.DataFrame(res, index=entityDf.index)
            results.append(res)
        if not results:
            return pd.DataFrame()
        return pd.concat(results)

    def score(self, X, y=None, sample_weight=None):
        """Return the average of the component scores over the entities present in ``X``."""
        results = []
        for comp in self.components:
            entityDf = self._entity_rows(X, comp['id'])
            if len(entityDf) > 0:
                results.append(comp['component'].score(entityDf))
        return np.average(results)
# Create the input dataframe: 3 entities, each with 10-20 rows where
# output = input * (entity + 1)
dataFrame = pd.DataFrame([], columns=['entityId', 'input', 'output'])
for entity in range(3):
    x = np.arange(random.randint(10, 20))
    y = x * (entity + 1)
    tempDf = pd.DataFrame(np.array([x, y]).T, columns=['input', 'output'])
    tempDf['entityId'] = entity
    dataFrame = pd.concat([dataFrame, tempDf], sort=False)
dataFrame = dataFrame.reset_index(drop=True)
# Create the pipeline: a shared pre-training step followed by the per-entity
# fine-tuning step
neurons = [5, 10]
myPipe = Pipeline([('preTrain',
                    MyRegressor(neurons=neurons[0], featInput=['input'], featOutput=['output'])),
                   ('fineTuning',
                    LoopTransformer(['entityId'],
                                    MyRegressor(
                                        neurons=neurons[0],
                                        featInput=['input'],
                                        featOutput=['output'])))])
# Pre-train and fine-tuning always have to use the same number of neurons,
# hence one single-valued grid per neuron count instead of a cross product
params = [{
    'preTrain__neurons': [neurons[0]],
    'fineTuning__component__neurons': [neurons[0]]
}, {
    'preTrain__neurons': [neurons[1]],
    'fineTuning__component__neurons': [neurons[1]]
}]
gs = GridSearchCV(myPipe, params, verbose=1, cv=3)
# The frame is passed as both X and y; the estimators read their targets from X
gs.fit(dataFrame, dataFrame)
score = gs.score(dataFrame, dataFrame)
print(score)
I'm pretty sure that sklearn.Pipeline as is doesn't support this. But as long as you don't clone your pipeline (which happens, for instance, if you use GridSearchCV), you can hack your way through with code like the following, which gives the instance of one pipeline step to the next step. You can apply the same principle in your pipeline:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.base import BaseEstimator, TransformerMixin
class MyTransformer(BaseEstimator, TransformerMixin):
    """Pass-through step that holds a reference to another step's scaler."""

    def __init__(self, scaler):
        self.scaler = scaler

    def fit(self, X, y=None):
        """Print the means held by the referenced scaler and return ``self``."""
        means = self.scaler.mean_
        print("got the means: %s" % means)
        return self

    def transform(self, X):
        """Return ``X`` unchanged."""
        return X
X, y = load_iris(return_X_y=True)
scaler = StandardScaler()
# The same scaler instance is both the first pipeline step and the object
# handed to MyTransformer, so MyTransformer.fit sees the freshly fitted means
pipeline = make_pipeline(scaler,
                         MyTransformer(scaler),
                         LogisticRegression(solver='lbfgs',
                                            multi_class='auto'))
pipeline = pipeline.fit(X, y)
# Shift the data and refit, to show that the printed means follow the new fit
X = X - 1
pipeline = pipeline.fit(X, y)
Which would give you this output, as expected:
got the means: [5.84333333 3.05733333 3.758 1.19933333]
got the means: [4.84333333 2.05733333 2.758 0.19933333]

Using statsmodel estimations with scikit-learn cross validation, is it possible?

I posted this question to the Cross Validated forum and later realized that it might find a more appropriate audience on Stack Overflow instead.
I am looking for a way to feed the fit object (result) obtained from Python statsmodels into cross_val_score of scikit-learn's cross_validation module.
The attached link suggests that it may be possible but I have not succeeded.
I am getting the following error
estimator should a be an estimator implementing 'fit' method
statsmodels.discrete.discrete_model.BinaryResultsWrapper object at
0x7fa6e801c590 was passed
Refer this link
Indeed, you cannot use cross_val_score directly on statsmodels objects, because of different interface: in statsmodels
training data is passed directly into the constructor
a separate object contains the result of model estimation
However, you can write a simple wrapper to make statsmodels objects look like sklearn estimators:
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin
class SMWrapper(BaseEstimator, RegressorMixin):
    """A universal sklearn-style wrapper for statsmodels regressors.

    Parameters
    ----------
    model_class : callable
        statsmodels model class, invoked as ``model_class(y, X)`` (e.g. ``sm.OLS``).
    fit_intercept : bool, default True
        When true, a constant column is added to ``X`` before both fitting
        and predicting.
    """

    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    def _with_constant(self, X):
        """Return ``X`` with the intercept column added when ``fit_intercept`` is set."""
        if not self.fit_intercept:
            return X
        # Force the column with has_constant='add'. The default ('skip') leaves
        # it out when X already contains a constant column (for instance a
        # one-row predict input), so fit and predict would disagree on the
        # number of columns.
        return sm.add_constant(X, has_constant='add')

    def fit(self, X, y):
        """Build the statsmodels model on ``(y, X)``, fit it, and return ``self``."""
        self.model_ = self.model_class(y, self._with_constant(X))
        self.results_ = self.model_.fit()
        return self

    def predict(self, X):
        """Predict with the fitted results, using the same column layout as ``fit``."""
        return self.results_.predict(self._with_constant(X))
This class contains correct fit and predict methods, and can be used with sklearn, e.g. cross-validated or included into a pipeline. Like here:
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# Synthetic regression problem with a fixed seed so both runs see the same data
X, y = make_regression(random_state=1, n_samples=300, noise=100)
# Cross-validated R^2 of the wrapped statsmodels OLS, then of sklearn's own
# LinearRegression for comparison
print(cross_val_score(SMWrapper(sm.OLS), X, y, scoring='r2'))
print(cross_val_score(LinearRegression(), X, y, scoring='r2'))
You can see that the output of two models is identical, because they are both OLS models, cross-validated in the same way.
[0.28592315 0.37367557 0.47972639]
[0.28592315 0.37367557 0.47972639]
Following the suggestion of David (which gave me an error, complaining about missing function get_parameters) and the scikit learn documentation, I created the following wrapper for a linear regression.
It has the same interface of sklearn.linear_model.LinearRegression but in addition has also the function summary(), which gives the info about p-values, R2 and other statistics, as in statsmodels.OLS.
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin
import pandas as pd
import numpy as np
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.estimator_checks import check_estimator
class MyLinearRegression(BaseEstimator, RegressorMixin):
    """Linear regression backed by ``statsmodels.OLS`` with an sklearn-style interface.

    Parameters
    ----------
    fit_intercept : bool, default True
        When true, a constant column is prepended to ``X`` before fitting
        and predicting.
    """

    def __init__(self, fit_intercept=True):
        self.fit_intercept = fit_intercept

    def fit(self, X, y, column_names=()):
        """Fit the OLS model and return ``self``.

        Parameters
        ----------
        X, y : array-like
            Training features and targets.
        column_names : sequence of str, optional
            Names of the feature columns of ``X``, so that ``summary()`` can
            show a feature name for each coefficient. Any sequence (list or
            tuple) is accepted.
        """
        if self.fit_intercept:
            # Always add the column; the default ('skip') omits it when X
            # already contains a constant column.
            X = sm.add_constant(X, has_constant='add')
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        self.X_ = X
        self.y_ = y
        if len(column_names) != 0:
            cols = list(column_names)
            # Name the intercept column only when one was actually added.
            if self.fit_intercept:
                cols.insert(0, 'intercept')
            X = pd.DataFrame(X, columns=cols)
        self.model_ = sm.OLS(y, X)
        self.results_ = self.model_.fit()
        return self

    def predict(self, X):
        """Return predictions for ``X`` from the fitted OLS results."""
        # Check that fit has been called
        check_is_fitted(self, 'model_')
        # Input validation
        X = check_array(X)
        if self.fit_intercept:
            # Same layout as in fit, even for a single-row X.
            X = sm.add_constant(X, has_constant='add')
        return self.results_.predict(X)

    def get_params(self, deep=False):
        """Return the constructor parameters as a dict."""
        return {'fit_intercept': self.fit_intercept}

    def summary(self):
        """Print the statsmodels summary of the fitted results."""
        print(self.results_.summary())
Example of use:
# NOTE(review): df_train and df_test are defined outside this snippet
cols = ['feature1','feature2']
X_train = df_train[cols].values
X_test = df_test[cols].values
y_train = df_train['label']
y_test = df_test['label']
# Fit, print the statsmodels summary, then predict on the held-out features
model = MyLinearRegression()
model.fit(X_train, y_train)
model.summary()
model.predict(X_test)
If you want to show the names of the columns, you can call
model.fit(X_train, y_train, column_names=cols)
To use it in cross_validation:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(MyLinearRegression(), X_train, y_train, cv=10, scoring='neg_mean_squared_error')
scores
For reference purposes, if you use the statsmodels formula API and/or the fit_regularized method, you can modify @David Dale's wrapper class in this way.
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from statsmodels.formula.api import glm as glm_sm
# This is an example wrapper for statsmodels GLM
class SMWrapper(BaseEstimator, RegressorMixin):
    """sklearn-style wrapper around a regularized statsmodels formula GLM.

    Parameters
    ----------
    family : statsmodels family
        Passed to ``glm`` as ``family``.
    formula : str
        Model formula; the response column is named ``y``.
    alpha, L1_wt
        Passed through to ``fit_regularized``.
    """

    def __init__(self, family, formula, alpha, L1_wt):
        self.family = family
        self.formula = formula
        self.alpha = alpha
        self.L1_wt = L1_wt
        self.model = None
        self.result = None

    def fit(self, X, y):
        """Fit the regularized GLM and return ``self``.

        The statsmodels results object stays available as ``self.result``.
        """
        # Copy so the caller's frame is not modified, and attach y by position:
        # concatenating on the index would misalign rows whenever X and y
        # carry different indices.
        data = pd.DataFrame(X).copy()
        data['y'] = pd.Series(y).values
        self.model = glm_sm(self.formula, data, family=self.family)
        self.result = self.model.fit_regularized(alpha=self.alpha, L1_wt=self.L1_wt, refit=True)
        # sklearn utilities expect fit() to return the estimator itself.
        return self

    def predict(self, X):
        """Return predictions for ``X`` from the fitted results."""
        return self.result.predict(X)
Though I think this is not technically scikit-learn, there is the package pmdarima (link to pmdarima package on PyPi) that wraps statsmodel and provides a scikit-learn like interface.

Categories