Using statsmodel estimations with scikit-learn cross validation, is it possible? - python

I posted this question to Cross Validated forum and later realized may be this would find appropriate audience in stackoverlfow instead.
I am looking for a way I can use the fit object (result) ontained from python statsmodel to feed into cross_val_score of scikit-learn cross_validation method?
The attached link suggests that it may be possible but I have not succeeded.
I am getting the following error
estimator should a be an estimator implementing 'fit' method
statsmodels.discrete.discrete_model.BinaryResultsWrapper object at
0x7fa6e801c590 was passed
Refer this link

Indeed, you cannot use cross_val_score directly on statsmodels objects, because of different interface: in statsmodels
training data is passed directly into the constructor
a separate object contains the result of model estimation
However, you can write a simple wrapper to make statsmodels objects look like sklearn estimators:
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin
class SMWrapper(BaseEstimator, RegressorMixin):
""" A universal sklearn-style wrapper for statsmodels regressors """
def __init__(self, model_class, fit_intercept=True):
self.model_class = model_class
self.fit_intercept = fit_intercept
def fit(self, X, y):
if self.fit_intercept:
X = sm.add_constant(X)
self.model_ = self.model_class(y, X)
self.results_ = self.model_.fit()
return self
def predict(self, X):
if self.fit_intercept:
X = sm.add_constant(X)
return self.results_.predict(X)
This class contains correct fit and predict methods, and can be used with sklearn, e.g. cross-validated or included into a pipeline. Like here:
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
X, y = make_regression(random_state=1, n_samples=300, noise=100)
print(cross_val_score(SMWrapper(sm.OLS), X, y, scoring='r2'))
print(cross_val_score(LinearRegression(), X, y, scoring='r2'))
You can see that the output of two models is identical, because they are both OLS models, cross-validated in the same way.
[0.28592315 0.37367557 0.47972639]
[0.28592315 0.37367557 0.47972639]

Following the suggestion of David (which gave me an error, complaining about missing function get_parameters) and the scikit learn documentation, I created the following wrapper for a linear regression.
It has the same interface of sklearn.linear_model.LinearRegression but in addition has also the function summary(), which gives the info about p-values, R2 and other statistics, as in statsmodels.OLS.
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin
import pandas as pd
import numpy as np
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.estimator_checks import check_estimator
class MyLinearRegression(BaseEstimator, RegressorMixin):
def __init__(self, fit_intercept=True):
self.fit_intercept = fit_intercept
"""
Parameters
------------
column_names: list
It is an optional value, such that this class knows
what is the name of the feature to associate to
each column of X. This is useful if you use the method
summary(), so that it can show the feature name for each
coefficient
"""
def fit(self, X, y, column_names=() ):
if self.fit_intercept:
X = sm.add_constant(X)
# Check that X and y have correct shape
X, y = check_X_y(X, y)
self.X_ = X
self.y_ = y
if len(column_names) != 0:
cols = column_names.copy()
cols = list(cols)
X = pd.DataFrame(X)
cols = column_names.copy()
cols.insert(0,'intercept')
print('X ', X)
X.columns = cols
self.model_ = sm.OLS(y, X)
self.results_ = self.model_.fit()
return self
def predict(self, X):
# Check is fit had been called
check_is_fitted(self, 'model_')
# Input validation
X = check_array(X)
if self.fit_intercept:
X = sm.add_constant(X)
return self.results_.predict(X)
def get_params(self, deep = False):
return {'fit_intercept':self.fit_intercept}
def summary(self):
print(self.results_.summary() )
Example of use:
cols = ['feature1','feature2']
X_train = df_train[cols].values
X_test = df_test[cols].values
y_train = df_train['label']
y_test = df_test['label']
model = MyLinearRegression()
model.fit(X_train, y_train)
model.summary()
model.predict(X_test)
If you want to show the names of the columns, you can call
model.fit(X_train, y_train, column_names=cols)
To use it in cross_validation:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(MyLinearRegression(), X_train, y_train, cv=10, scoring='neg_mean_squared_error')
scores

For reference purpose, if you use the statsmodels formula API and/or use the fit_regularized method, you can modify #David Dale's wrapper class in this way.
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from statsmodels.formula.api import glm as glm_sm
# This is an example wrapper for statsmodels GLM
class SMWrapper(BaseEstimator, RegressorMixin):
def __init__(self, family, formula, alpha, L1_wt):
self.family = family
self.formula = formula
self.alpha = alpha
self.L1_wt = L1_wt
self.model = None
self.result = None
def fit(self, X, y):
data = pd.concat([pd.DataFrame(X), pd.Series(y)], axis=1)
data.columns = X.columns.tolist() + ['y']
self.model = glm_sm(self.formula, data, family=self.family)
self.result = self.model.fit_regularized(alpha=self.alpha, L1_wt=self.L1_wt, refit=True)
return self.result
def predict(self, X):
return self.result.predict(X)

Though I think this is not technically scikit-learn, there is the package pmdarima (link to pmdarima package on PyPi) that wraps statsmodel and provides a scikit-learn like interface.

Related

Stacking classifier: Using custom classifier returns error

I'm using a StackingClassifier in sklearn, where I want the component models to be custom classifiers. In order to do this, I wanted to test it out with some dummy code where the custom classifier is the exact same as an already existing model (KNN, in this example). However this throws an error, and I'm not sure I understand why, and looking for help with this. It's probably something fairly obvious (I'm new to trying to write custom classifiers and using ClassiferMixIn), but I can't seem to figure out what I'm missing:
Code -- the baseline example without my custom class (works):
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
model = StackingClassifier(estimators=[
('tree', Pipeline([('tree', DecisionTreeClassifier(random_state=42))])),
('knn', Pipeline([('knn', KNeighborsClassifier())])),
])
model.fit(X, y)
Code -- the with my custom class (doesn't work):
class MyOwnClassifier(ClassifierMixin):
def __init__(self,classifier):
self.classifier = classifier
def fit(self, X, y):
self.classifier.fit(X,y)
return self
def predict(self, X):
return self.classifier.predict(X)
def predict_proba(self, X):
return self.classifier.predict_proba(X)
model = StackingClassifier(estimators=[
('tree', Pipeline([('tree', DecisionTreeClassifier(random_state=42))])),
('knn', Pipeline([('knn', MyOwnClassifier(KNeighborsClassifier()))])),
])
model.fit(X, y)
returns the error
AttributeError: 'MyOwnClassifier' object has no attribute 'classes_'
What really puzzles me about this is that in this answer, an identity transform could be used as part of the pipeline, and I can't imagine that object had 'classes_' either.
You've got 3 problems with your code:
StackingClassifier expects an attribute classes_ to be available on a fitted classifier, which is clearly stated in the error message. The linked example does have it, whereas yours doesn't. It can be checked if you run like dir(MyOwnClassifier(KNeighborsClassifier()).fit(X,y)).
BaseEstimator is missing from your class definition (you can do without it, but its presence makes life easier)
Pipelines in you code are extraneous clutter that are not necessary to debug your code and only complicating debugging.
Once you correct these problems you have a working code:
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.base import ClassifierMixin, BaseEstimator
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
class MyOwnClassifier(ClassifierMixin, BaseEstimator):
def __init__(self,classifier):
self.classifier = classifier
def fit(self, X, y):
self.classifier.fit(X,y)
self.classes_ = self.classifier.classes_
return self
def predict(self, X):
return self.classifier.predict(X)
def predict_proba(self, X):
return self.classifier.predict_proba(X)
model = StackingClassifier(estimators=[
('tree', DecisionTreeClassifier(random_state=42)),
('knn', MyOwnClassifier(KNeighborsClassifier()))])
model.fit(X,y)
StackingClassifier(estimators=[('tree',
DecisionTreeClassifier(random_state=42)),
('knn',
MyOwnClassifier(classifier=KNeighborsClassifier()))])

Why am I getting 'last step of pipeline' error when using make_pipeline in scikit-learn?

So I am trying to use make_pipeline in scikit-learn to clean my data (replace missing values and then clean for outliers, apply an encoding function to the categorical variables and then finally add a Random Forest Regressor through RandomForestRegressor. The input is a DataFrame. Eventually I'd like to put this through GridSearchCV to search over optimal hyperparameters for the regressor.
In order to do this I built some custom classes which inherit the TransformerMixin class as advised here. Here is what I have so far
from sklearn.pipeline import make_pipeline
from sklearn.base import TransformerMixin
import pandas as pd
class Cleaning(TransformerMixin):
def __init__(self, column_labels):
self.column_labels = column_labels
def fit(self, X, y=None):
return self
def transform(self, X):
"""Given a dataframe X with predictors, clean it."""
X_imputed, medians_X = median_imputer(X) # impute all missing numeric data with median
quantiles_X = get_quantiles(X_imputed, self.column_labels)
X_nooutliers, _ = replace_outliers(X_imputed, self.column_labels, medians_X, quantiles_X)
return X_nooutliers
class Encoding(TransformerMixin):
def __init__(self, encoder_list):
self.encoder_list = encoder_list
def fit(self, X, y=None):
return self
def transform(self, X):
"""Takes in dataframe X and applies encoding transformation to them"""
return encode_data(self.encoder_list, X)
However, when I run the following line of code I am getting an error:
import category_encoders as ce
pipeline_cleaning = Cleaning(column_labels = train_labels)
OneHot_binary = ce.OneHotEncoder(cols = ['new_store'])
OneHot = ce.OneHotEncoder(cols= ['transport_availability'])
Count = ce.CountEncoder(cols = ['county'])
pipeline_encoding = Encoding([OneHot_binary, OneHot, Count])
baseline = RandomForestRegressor(n_estimators=500, random_state=12)
make_pipeline([pipeline_cleaning, pipeline_encoding,baseline])
The error is saying Last step of Pipeline should implement fit or be the string 'passthrough'. I don't understand why?
EDIT: slight typo in the last line, correct. The Third element in the list passed to make_pipeline is the regressor
Change the line:
make_pipeline([pipeline_cleaning, pipeline_encoding,baseline])
to (without list):
make_pipeline(pipeline_cleaning, pipeline_encoding,baseline)
Pipeline(steps=[('cleaning', <__main__.Cleaning object at 0x7f617260c1d0>),
('encoding', <__main__.Encoding object at 0x7f617260c278>),
('randomforestregressor',
RandomForestRegressor(n_estimators=500, random_state=12))])
and you're fine to go

How to pass pipeline model to a class without instantiating it

I would like to create a class that takes a scikit-learn pipeline and loop over it (as in the code example below).
In the example below I can however only pass the instance of my pipeline to the class and not create a new one in order to start with a fresh model.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
class my_class:
def __init__(self,model):
self.model = model
def evaluate(self, X, y):
results = []
for i in range(10):
self.model.fit(X,y) #I always use the same instance here.
y_pred = self.model.predict(X)
results.append(accuracy_score(y_pred=y_pred, y_true=y))
return results
iris = load_iris()
X = iris.data
y = iris.target
pipeline = Pipeline([
('classifier', AdaBoostClassifier())
])
test = my_class(pipeline)
scores = test.evaluate(X,y)
Your code might be initializing different models but the result will always be the same because you are training and testing on the same data X every time without any change in the hyperparameters and random_state as None. That is why the results list will always contain same value.

Sklearn GridSearch with pre-training

I'm making a Sklearn Pipeline with a parameter optimization made by the GridSearchCV. The pipeline has to get the best model for several different entities implementing a pre-train and then fine-tune approach: pre-train all the entities together and the fine-tune every single element and returns a model for each entity. These are the constraint of the pipeline:
Pre-train and fine-tuning have to be in the same pipeline because both the model has to have the same data in each GridSearchCV's fold.
The pre-train model has to pass its weights to the fine-tuning model.
I have implemented:
A Sklearn Transformer that takes a data-frame with all the entities in input and fit itself.
A Sklearn Regressor that splits the data-frame in one data-frame for each entity and fit a Keras model for each entity.
What I'm missing is how to pass from the Pre-train transformer to the Fine-tuning transformer the weights obtained by the Pre-train transformer (considering that each GridSearchCV fold has different weights)
Here is the code:
import pandas as pd
import numpy as np
import random
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.metrics import mean_squared_error
from keras.models import Model
from keras.layers import Dense, Input
import copy
class MyRegressor(BaseEstimator, TransformerMixin):
def __init__(self, neurons, featInput, featOutput):
self.neurons = neurons
self.preTrain = None
self.featInput = featInput
self.featOutput = featOutput
def fit(self, X, y=None):
X_train = X[self.featInput]
y_train = X[self.featOutput]
inputLayer = Input(shape=(len(self.featInput), ), name='INPUT')
hidden = Dense(self.neurons, name='HIDDEN')(inputLayer)
outputLayer = Dense(len(self.featOutput), name='OUTPUT')(hidden)
self.model = Model(inputLayer, outputLayer)
self.model.compile(loss='mse', optimizer='rmsprop')
if self.preTrain is not None:
self.model.loadWeights(self.preTrain)
self.model.fit(X_train, y_train)
return self
def predict(self, X):
return self.model.predict(X[self.featInput])
def transform(self, X):
return X
def score(self, X, y=None, sample_weight=None):
y_true = X[self.featOutput]
y_pred = self.predict(X)
return mean_squared_error(y_true, y_pred)
class LoopTransformer(BaseEstimator, TransformerMixin):
def __init__(self, columns, component):
self.columns = columns
self.component = component
self.components = []
def fit(self, X, y=None):
for index, idx in X[self.columns].drop_duplicates().iterrows():
entityDf = X[(X[self.columns] == idx).sum(axis=1) == len(self.columns)].copy()
self.components.append({'id': idx, 'component': copy.deepcopy(self.component)})
self.components[-1]['component'].fit(entityDf, y)
return self
def predict(self, X):
results = []
for comp in self.components:
entityDf = X[(X[self.columns] == comp['id']).sum(axis=1) == len(self.columns)].copy()
res = comp['component'].predict(entityDf)
results.append(res)
dfRes = pd.concat(results)
return dfRes
def score(self, X, y=None, sample_weight=None):
results = []
for comp in self.components:
entityDf = X[(X[self.columns] == comp['id']).sum(axis=1) == len(self.columns)].copy()
if len(entityDf) > 0:
results.append(comp['component'].score(entityDf))
return np.average(results)
#create the input dataframe: 3 entities
dataFrame = pd.DataFrame([], columns=['entityId', 'input', 'output'])
for entity in range(3):
x = np.arange(random.randint(10, 20))
y = x * (entity + 1)
tempDf = pd.DataFrame(np.array([x, y]).T, columns=['input', 'output'])
tempDf['entityId'] = entity
dataFrame = pd.concat([dataFrame, tempDf], sort=False)
dataFrame = dataFrame.reset_index(drop=True)
#create the pipeline
neurons = [5, 10]
myPipe = Pipeline([('preTrain',
MyRegressor(neurons=neurons[0], featInput=['input'], featOutput=['output'])),
('fineTuning',
LoopTransformer(['entityId'],
MyRegressor(
neurons=neurons[0],
featInput=['input'],
featOutput=['output'])))])
#pre-train and fine-tuning has to have always the same number of neurons
params = [{
'preTrain__neurons': [neurons[0]],
'fineTuning__component__neurons': [neurons[0]]
}, {
'preTrain__neurons': [neurons[1]],
'fineTuning__component__neurons': [neurons[1]]
}]
gs = GridSearchCV(myPipe, params, verbose=1, cv=3)
gs.fit(dataFrame, dataFrame)
score = gs.score(dataFrame, dataFrame)
print(score)
I'm pretty the sklearn.Pipeline as is doesn't support this. But as long as you don't clone your pipeline (which happens for instance if you use a GridSearchCV), you can hack your way through with a code like the following code, which gives the instance of a step in the pipeline to the next step. You can apply the same principle in your pipeline:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.base import BaseEstimator, TransformerMixin
class MyTransformer(BaseEstimator, TransformerMixin):
def __init__(self, scaler):
self.scaler = scaler
def fit(self, X, y=None):
print("got the means: %s" % self.scaler.mean_)
return self
def transform(self, X):
return X
X, y = load_iris(return_X_y=True)
scaler = StandardScaler()
pipeline = make_pipeline(scaler,
MyTransformer(scaler),
LogisticRegression(solver='lbfgs',
multi_class='auto'))
pipeline = pipeline.fit(X, y)
X = X - 1
pipeline = pipeline.fit(X, y)
Which would give you this output, as expected:
got the means: [5.84333333 3.05733333 3.758 1.19933333]
got the means: [4.84333333 2.05733333 2.758 0.19933333]

Scikit learn Custom Transformer dimension mismatch

I'm coming from R, so scikit API still very confusing to me. I was following this tutorial http://michelleful.github.io/code-blog/2015/06/20/pipelines/ to learn about Pipelines. So let's create a fake dataset just for reference:
x1,x2,y
foo,zoo,1
bar,moo,2
goo,too,3
roo,zoo,4
too,moo,5
My goal is very simple: train a linear regression on y, using separate tfidf matrices from x1 and x2, plus some custom features from both x1 and x2 (ie, word length, etc).
Let's start with the simpler task of using only tfidf from x1. Here's the full code:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import time
import re
import math
def clip_RMSLE(y, y_pred, **kwargs):
y_pred[y_pred < 0] = 0.0
to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
return (sum(to_sum) * (1.0/len(y))) ** 0.5
class ColumnNgram(BaseEstimator, TransformerMixin):
def __init__(self, colname, tokenizer, ngram_rg):
self.colname = colname
self.tokenizer = tokenizer
self.ngram_rg = ngram_rg
self.tfidf = None
def transform(self, df, y=None):
tfidf = TfidfVectorizer(tokenizer=self.tokenizer, ngram_range=self.ngram_rg)
return tfidf.fit_transform(df[self.colname].values)
def fit(self, df, y=None):
return self
start = time.time()
seed = 1991
ngram_rg = (1,2)
RMSLE = make_scorer(clip_RMSLE, greater_is_better=False)
def tokenizer(text):
if text:
result = re.findall('[a-z]{2,}', text.lower())
else:
result = []
return result
df = pd.read_csv('fake.csv', sep=',')
y = df['y'].values
pipeline = Pipeline([('tfidf', ColumnNgram('x1', tokenizer, ngram_rg)),
('linear_reg', LinearRegression(n_jobs=1))
])
kfold = KFold(n_splits=2, random_state=seed)
results = cross_val_score(pipeline, df, y, cv=kfold, scoring=RMSLE)
print(results)
print(results.mean())
end = time.time()
print('Timeto finish this thing: %0.2fs' % (end - start))
I'm getting the error ValueError: dimension mismatch, probably because some terms will not appear in both train/validation folds. What's the proper way of doing this? Thank you!
Change your ColumnNgram to this:
class ColumnNgram(BaseEstimator, TransformerMixin):
def __init__(self, colname, tokenizer, ngram_rg):
self.colname = colname
self.tokenizer = tokenizer
self.ngram_rg = ngram_rg
self.tfidf = None
def transform(self, df, y=None):
return self.tfidf.transform(df[self.colname].values)
def fit(self, df, y=None):
self.tfidf = TfidfVectorizer(tokenizer=self.tokenizer, ngram_range=self.ngram_rg)
self.tfidf.fit(df[self.colname].values)
return self
You should declare and learn about the training data in fit(). Currently you are re-fitting the data in each call to transform(), which obviously will return in different features in train and validation sets as you have suggested.
The proper way is to keep a TfidfVectorizer which learns data during the fit() and then only transform the new data in transform() instead of re-fitting the new data.

Categories