Sklearn GridSearch with pre-training - python

I'm building a Sklearn Pipeline whose parameters are optimized by GridSearchCV. The pipeline has to find the best model for several different entities using a pre-train and then fine-tune approach: pre-train on all the entities together, then fine-tune on every single entity and return a model for each entity. These are the constraints of the pipeline:
Pre-training and fine-tuning have to be in the same pipeline because both models have to see the same data in each GridSearchCV fold.
The pre-train model has to pass its weights to the fine-tuning model.
I have implemented:
A Sklearn Transformer that takes a data-frame with all the entities in input and fit itself.
A Sklearn Regressor that splits the data-frame in one data-frame for each entity and fit a Keras model for each entity.
What I'm missing is how to pass from the Pre-train transformer to the Fine-tuning transformer the weights obtained by the Pre-train transformer (considering that each GridSearchCV fold has different weights)
Here is the code:
import pandas as pd
import numpy as np
import random
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.metrics import mean_squared_error
from keras.models import Model
from keras.layers import Dense, Input
import copy
class MyRegressor(BaseEstimator, TransformerMixin):
    """Single-hidden-layer Keras regressor that reads inputs and targets from one data-frame.

    Both the feature columns and the target columns are taken from ``X``;
    the ``y`` argument of ``fit`` and ``score`` is accepted but never read.

    Parameters
    ----------
    neurons : int
        Number of units in the hidden ``Dense`` layer.
    featInput : list of str
        Names of the input columns in ``X``.
    featOutput : list of str
        Names of the target columns in ``X``.
    """

    def __init__(self, neurons, featInput, featOutput):
        self.neurons = neurons
        # Optional weights to load before training; None means train from scratch.
        # NOTE(review): this is not a constructor argument, so it is presumably
        # lost when the estimator is cloned (e.g. by GridSearchCV) - confirm.
        self.preTrain = None
        self.featInput = featInput
        self.featOutput = featOutput

    def fit(self, X, y=None):
        """Build, compile and train the network on the columns of ``X``; return ``self``."""
        X_train = X[self.featInput]
        y_train = X[self.featOutput]
        inputLayer = Input(shape=(len(self.featInput), ), name='INPUT')
        hidden = Dense(self.neurons, name='HIDDEN')(inputLayer)
        outputLayer = Dense(len(self.featOutput), name='OUTPUT')(hidden)
        self.model = Model(inputLayer, outputLayer)
        self.model.compile(loss='mse', optimizer='rmsprop')
        if self.preTrain is not None:
            # Keras models expose load_weights(); there is no loadWeights().
            # NOTE(review): load_weights expects a file path - if preTrain is
            # meant to hold in-memory weights, set_weights is the call to use.
            self.model.load_weights(self.preTrain)
        self.model.fit(X_train, y_train)
        return self

    def predict(self, X):
        """Return the network's predictions for the input columns of ``X``."""
        return self.model.predict(X[self.featInput])

    def transform(self, X):
        """Return ``X`` unchanged so the step can sit in the middle of a Pipeline."""
        return X

    def score(self, X, y=None, sample_weight=None):
        """Return the mean squared error between the target columns of ``X`` and the predictions.

        NOTE(review): this is a raw error (lower is better) while GridSearchCV
        selects the highest score - presumably the sign should be flipped; confirm.
        """
        y_true = X[self.featOutput]
        y_pred = self.predict(X)
        return mean_squared_error(y_true, y_pred)
class LoopTransformer(BaseEstimator, TransformerMixin):
    """Fit one deep copy of ``component`` per distinct entity found in ``columns``.

    Parameters
    ----------
    columns : list of str
        Columns whose distinct value combinations identify an entity.
    component : estimator
        Template estimator; it is deep-copied once per entity in ``fit``.
    """

    def __init__(self, columns, component):
        self.columns = columns
        self.component = component
        self.components = []

    def _entity_rows(self, X, entity_id):
        """Return a copy of the rows of ``X`` whose ``columns`` all equal ``entity_id``."""
        mask = (X[self.columns] == entity_id).all(axis=1)
        return X[mask].copy()

    def fit(self, X, y=None):
        """Fit a fresh copy of the component on each entity's rows; return ``self``."""
        # Start from scratch so that calling fit twice on the same instance
        # does not keep the components fitted by the previous call.
        self.components = []
        for _, idx in X[self.columns].drop_duplicates().iterrows():
            entityDf = self._entity_rows(X, idx)
            fitted = copy.deepcopy(self.component)
            fitted.fit(entityDf, y)
            self.components.append({'id': idx, 'component': fitted})
        return self

    def predict(self, X):
        """Concatenate the per-entity predictions, indexed like the matching rows of ``X``."""
        results = []
        for comp in self.components:
            entityDf = self._entity_rows(X, comp['id'])
            if len(entityDf) == 0:
                # Same guard as score(): skip entities absent from X.
                continue
            res = comp['component'].predict(entityDf)
            if not isinstance(res, (pd.DataFrame, pd.Series)):
                # pd.concat only accepts pandas objects; wrap raw arrays and
                # keep the row labels so predictions can be matched back to X.
                res = pd.DataFrame(np.asarray(res), index=entityDf.index)
            results.append(res)
        dfRes = pd.concat(results)
        return dfRes

    def score(self, X, y=None, sample_weight=None):
        """Return the unweighted average of the component scores over entities present in ``X``."""
        results = []
        for comp in self.components:
            entityDf = self._entity_rows(X, comp['id'])
            if len(entityDf) > 0:
                results.append(comp['component'].score(entityDf))
        return np.average(results)
# create the input dataframe: 3 entities
# The per-entity frames are collected first and concatenated once, instead of
# repeatedly concatenating onto an empty object-dtype frame; this keeps the
# columns numeric and avoids re-copying the accumulated rows on every iteration.
entityFrames = []
for entity in range(3):
    # Each entity gets a random number of rows with output = input * (entity + 1).
    x = np.arange(random.randint(10, 20))
    y = x * (entity + 1)
    tempDf = pd.DataFrame(np.array([x, y]).T, columns=['input', 'output'])
    tempDf['entityId'] = entity
    entityFrames.append(tempDf)
# Restore the original column order and give the rows a fresh 0..n-1 index.
dataFrame = pd.concat(entityFrames, sort=False)[['entityId', 'input', 'output']]
dataFrame = dataFrame.reset_index(drop=True)
# create the pipeline: a shared pre-training step followed by per-entity fine-tuning
neurons = [5, 10]
preTrainStep = MyRegressor(neurons=neurons[0],
                           featInput=['input'],
                           featOutput=['output'])
fineTuningStep = LoopTransformer(
    ['entityId'],
    MyRegressor(neurons=neurons[0], featInput=['input'], featOutput=['output']))
myPipe = Pipeline([('preTrain', preTrainStep), ('fineTuning', fineTuningStep)])
# pre-train and fine-tuning always have to use the same number of neurons, so
# every grid entry pins both parameters to the same single value
params = [
    {'preTrain__neurons': [n], 'fineTuning__component__neurons': [n]}
    for n in neurons
]
gs = GridSearchCV(myPipe, params, verbose=1, cv=3)
# the data-frame is passed as both X and y; the estimators read their targets from X
gs.fit(dataFrame, dataFrame)
score = gs.score(dataFrame, dataFrame)
print(score)

I'm pretty sure sklearn.Pipeline as is doesn't support this. But as long as you don't clone your pipeline (which happens, for instance, if you use GridSearchCV), you can hack your way through with code like the following, which hands the instance of one pipeline step to the next step. You can apply the same principle in your pipeline:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.base import BaseEstimator, TransformerMixin
class MyTransformer(BaseEstimator, TransformerMixin):
    """Pass-through step that keeps a reference to another estimator.

    ``fit`` prints the ``mean_`` attribute of the referenced scaler and
    ``transform`` returns its input untouched.
    """

    def __init__(self, scaler):
        self.scaler = scaler

    def transform(self, X):
        # Nothing is modified; the step exists only to read the scaler's state.
        return X

    def fit(self, X, y=None):
        means = self.scaler.mean_
        print("got the means: %s" % means)
        return self
# Load the iris features and labels.
X, y = load_iris(return_X_y=True)
scaler = StandardScaler()
# The very same scaler object is used as the first pipeline step and handed to
# MyTransformer, so MyTransformer.fit can read what the scaler has learned.
pipeline = make_pipeline(scaler,
                         MyTransformer(scaler),
                         LogisticRegression(solver='lbfgs',
                                            multi_class='auto'))
pipeline = pipeline.fit(X, y)
# Shift every feature by one and refit to show that the printed means follow the data.
X = X - 1
pipeline = pipeline.fit(X, y)
Which would give you this output, as expected:
got the means: [5.84333333 3.05733333 3.758 1.19933333]
got the means: [4.84333333 2.05733333 2.758 0.19933333]

Related

Clip output from sklearn pipeline predict

Let's see the following pipeline:
# ScalerFactory, scaler_type, lgb, hyperparameters, X and y are defined outside this snippet.
scaler = ScalerFactory.get_scaler(scaler_type)
# One LGBMRegressor per output column, wrapped in MultiOutputRegressor.
model = MultiOutputRegressor(lgb.LGBMRegressor(metric='tweedie', **hyperparameters))
steps = [('scaler', scaler), ('model', model)]
pipeline = Pipeline(steps)
# The model__ prefix routes feature_name to the fit call of the 'model' step.
pipeline.fit(X, y, model__feature_name=list(X.columns))
I am trying to add another step to the pipeline, so when it predicts it rounds all the values that are between -1 and 1 to 0.
I am trying to create a new class:
from numpy.random import randint
from sklearn.base import BaseEstimator, TransformerMixin
class OutputClipper(BaseEstimator, TransformerMixin):
    """Set every value strictly between -1 and 1 to 0."""

    def __init__(self) -> None:
        super().__init__()
        # Not read anywhere in this class; kept so existing attribute access still works.
        self.clipping = False

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a clipped copy of ``y`` when it is given, otherwise of ``X``.

        ``y`` is optional because a Pipeline calls ``transform`` with ``X``
        only. The input is copied first, so the caller's data is not modified.
        """
        values = X if y is None else y
        clipped = values.copy()
        clipped[(clipped > -1) & (clipped < 1)] = 0
        return clipped
and the new pipeline becomes:
# Pipeline steps must be estimator instances, so OutputClipper is instantiated
# here rather than passed as a class.
# NOTE(review): a Pipeline requires every step except the last to implement
# transform, so placing 'model' before 'clipping' presumably still fails - confirm.
steps = [('scaler', scaler), ('model', model), ('clipping', OutputClipper())]
pipeline = Pipeline(steps)
However, I feel like this doesn't quite work. I guess the transform happens when the pipeline is called with the .predict() method. I am not sure how to test it too.

Multi-output regression using skorch & sklearn pipeline gives runtime error due to dtype

I want to use skorch to do multi-output regression. I've created a small toy example as can be seen below. In the example, the NN should predict 5 outputs. I also want to use a preprocessing step that is incorporated using sklearn pipelines (in this example PCA is used, but it could be any other preprocessor). When executing this example I get the following error in the Variable._execution_engine.run_backward step of torch:
RuntimeError: Found dtype Double but expected Float
Am I forgetting something? I suspect, somewhere something has to be cast, but as skorch handles a lot of the pytorch stuff, I don't see what and where.
Example:
import torch
import skorch
from sklearn.datasets import make_classification, make_regression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
# Synthetic regression problem: 1000 samples, 40 features, 5 targets.
X, y = make_regression(n_samples=1000, n_features=40, n_targets=5)
# Only X is cast to float32; y keeps whatever dtype make_regression returned.
X = X.astype('float32')
class RegressionModule(torch.nn.Module):
    """Two stacked linear layers mapping ``input_dim`` features to 5 outputs."""

    def __init__(self, input_dim=80):
        super().__init__()
        hidden_units, n_outputs = 10, 5
        self.l0 = torch.nn.Linear(input_dim, hidden_units)
        self.l1 = torch.nn.Linear(hidden_units, n_outputs)

    def forward(self, X):
        # No activation between the layers: the second layer is applied
        # directly to the output of the first.
        return self.l1(self.l0(X))
class InputShapeSetter(skorch.callbacks.Callback):
    """Callback that sets the module's ``input_dim`` from the training data."""

    def on_train_begin(self, net, X, y):
        # The size of the last axis of X becomes the module's input_dim.
        n_features = X.shape[-1]
        net.set_params(module__input_dim=n_features)
# Wrap the module in a skorch regressor; InputShapeSetter sets input_dim when training begins.
net = skorch.NeuralNetRegressor(
    RegressionModule,
    callbacks=[InputShapeSetter()],
)
# PCA reduces the features to 10 components before they reach the network.
pipe = make_pipeline(PCA(n_components=10), net)
pipe.fit(X, y)
print(pipe.predict(X))
Edit 1:
Casting X to float32 at the start won't work for every preprocessor as can be seen from this example:
import torch
import skorch
from sklearn.datasets import make_classification, make_regression
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from category_encoders import OneHotEncoder
# This snippet's import list does not bring pandas in, yet pd is used below.
import pandas as pd

# Synthetic regression problem: 1000 samples, 40 features, 5 targets.
X, y = make_regression(n_samples=1000, n_features=40, n_targets=5)
X = pd.DataFrame(X,columns=[f'feature_{i}' for i in range(X.shape[1])])
# Turn one numeric feature into a three-level categorical so it needs encoding.
X['feature_1'] = pd.qcut(X['feature_1'], 3, labels=["good", "medium", "bad"])
# Here the target, not X, is cast to float32.
y = y.astype('float32')
class RegressionModule(torch.nn.Module):
    """Linear(input_dim, 10) followed by Linear(10, 5), with no activation in between."""

    def __init__(self, input_dim=80):
        super().__init__()
        self.l0 = torch.nn.Linear(in_features=input_dim, out_features=10)
        self.l1 = torch.nn.Linear(in_features=10, out_features=5)

    def forward(self, X):
        hidden = self.l0(X)
        return self.l1(hidden)
class InputShapeSetter(skorch.callbacks.Callback):
    """Set ``module__input_dim`` on the net from the last axis of ``X`` when training begins."""

    def on_train_begin(self, net, X, y):
        last_axis_size = X.shape[-1]
        net.set_params(module__input_dim=last_axis_size)
# This snippet imports only Pipeline from sklearn.pipeline, but make_pipeline is used below.
from sklearn.pipeline import make_pipeline

# Wrap the module in a skorch regressor; InputShapeSetter sets input_dim when training begins.
net = skorch.NeuralNetRegressor(
    RegressionModule,
    callbacks=[InputShapeSetter()],
)
# return_df=False makes the encoder hand an array, not a DataFrame, to the network.
pipe = make_pipeline(OneHotEncoder(cols=['feature_1'], return_df=False), net)
pipe.fit(X, y)
print(pipe.predict(X))
By default OneHotEncoder returns numpy array of dtype=float64. So one could simply cast the input-data X when being fed into forward() of the model:
class RegressionModule(torch.nn.Module):
    """Two linear layers (input_dim -> 10 -> 5) that cast their input to float32 first."""

    def __init__(self, input_dim=80):
        super().__init__()
        self.l0 = torch.nn.Linear(input_dim, 10)
        self.l1 = torch.nn.Linear(10, 5)

    def forward(self, X):
        # Cast here so that whatever dtype the preprocessing step emits is accepted.
        as_float = X.to(torch.float32)
        return self.l1(self.l0(as_float))

pipeline regressor ensemble with scikit-learn version 0.19.2

I'm using scikit-learn version 0.19.2 (for onnx conversion compatibility),
and I'm having problems implementing ensemble methods with Pipeline.
The code below is trying to implement linear regression from two independent regressors:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn import linear_model
import sklearn
# Toy data: two samples with three features each.
X = [[1, 2, 2], [4, 5, 6]]
y = [1, 2]
# Fit the two base models on their own first.
forest_clf = sklearn.ensemble.RandomForestRegressor()
forest_clf.fit(X, y)
logistic_clf = linear_model.LogisticRegression()
logistic_clf.fit(X,y)
# Pipeline: the two base models are placed in a FeatureUnion that feeds a linear regression.
estimators = [('random_forest', forest_clf), ('logistic', logistic_clf)]
model = Pipeline( steps=[ ('models', FeatureUnion(estimators) ), ('linear_regression', linear_model.base.LinearRegression() ) ] )
# Fitting the ensemble - this is the call that raises the TypeError quoted below.
model.fit(X,y)
resulting in error
TypeError: All estimators should implement fit and transform. 'RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=None, verbose=0, warm_start=False)' (type ) doesn't
Can someone help me figuring what went wrong?
p.s I'm looking for some Pipeline technique to imitate the sklearn.ensemble.StackingRegressor from version 0.23.
From the documentation of the Pipeline class:
Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods.
As the error message indicates, the RandomForestRegressor does not implement the transform function (neither does LogisticRegression). Hence, they cannot be used directly as transformers in the pipeline.
If you want to use a pipeline, the only workaround I see is to wrap them in custom classes that implement the transform function as needed:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
class RFTransformer(BaseEstimator, TransformerMixin):
    """Expose a RandomForestRegressor's predictions as a single feature column."""

    def __init__(self):
        self.rf = RandomForestRegressor()

    def fit(self, X, y=None):
        """Fit the wrapped forest and return ``self``."""
        self.rf.fit(X, y)
        return self

    def transform(self, X):
        """Return the forest's predictions reshaped to one column."""
        predictions = self.rf.predict(X)
        return predictions.reshape(-1, 1)
class LRTransformer(BaseEstimator, TransformerMixin):
    """Expose a LogisticRegression's predictions as a single feature column."""

    def __init__(self):
        self.lr = LogisticRegression()

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``."""
        self.lr.fit(X, y)
        return self

    def transform(self, X):
        """Return the model's predictions reshaped to one column."""
        predictions = self.lr.predict(X)
        return predictions.reshape(-1, 1)
These custom transformers will simply fit their respective models and the transform function will return their prediction (and if I understand correctly, this is what you want to concatenate and pass to the final estimator).
Now, you can use these transformers like this:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
# Toy data: two samples with three features each.
X = [[1, 2, 2], [4, 5, 6]]
y = [1, 2]
# Instantiate the wrapper transformers (they are fitted by the pipeline, not beforehand).
forest_clf = RFTransformer()
logistic_clf = LRTransformer()
# Pipeline: FeatureUnion places the two prediction columns side by side,
# and the final LinearRegression is trained on them.
estimators = [('random_forest', forest_clf), ('logistic', logistic_clf)]
model = Pipeline(steps=[('models', FeatureUnion(estimators)), ('linear_regression', LinearRegression())])
# Fitting the ensemble.
model.fit(X, y)
Finally, just to confirm the desired output:
print(model.predict(X))
# Output (predictions for the two rows of X)
# [1. 2.]
Based on @afsharov's answer, I've made a general class for custom transformers:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
import sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
class customTransform(BaseEstimator, TransformerMixin):
    """Wrap a regressor chosen by name so its predictions can be used as a feature.

    Parameters
    ----------
    algo_type : str
        One of 'GradientBoosting', 'svm', 'LogisticRegression', 'sgd',
        'LinearRegression', 'TheilSen' or 'RandomForest'.

    Raises
    ------
    ValueError
        If ``algo_type`` is not one of the supported names.
    """

    def __init__(self, algo_type):
        # Imported here because `import sklearn` alone does not guarantee that
        # these submodules are loaded, and SVR lives in sklearn.svm (there is
        # no linear_model.svm).
        from sklearn import ensemble, svm

        # Name -> zero-argument factory. The estimators are reached through
        # their public locations instead of the private linear_model.base /
        # stochastic_gradient / theil_sen modules.
        factories = {
            'GradientBoosting': ensemble.GradientBoostingRegressor,
            'svm': svm.SVR,
            'LogisticRegression': lambda: linear_model.LogisticRegression(solver='newton-cg'),
            'sgd': linear_model.SGDRegressor,
            'LinearRegression': linear_model.LinearRegression,
            'TheilSen': linear_model.TheilSenRegressor,
            # NOTE(review): 'mae' is the criterion name of the pinned sklearn
            # version; newer releases presumably use a different name - confirm.
            'RandomForest': lambda: ensemble.RandomForestRegressor(criterion='mae'),
        }
        if algo_type not in factories:
            raise ValueError(
                'Unknown algo_type {!r}; expected one of {}'.format(
                    algo_type, sorted(factories)))
        model = factories[algo_type]()
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params() can read it back.
        self.algo_type = algo_type
        # set the name and the model
        self.name = algo_type
        self.model = model
        # Also reachable as an attribute named after the algorithm.
        setattr(self, algo_type, model)

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``."""
        self.model.fit(X, y)
        return self

    def transform(self, X):
        """Return the wrapped model's predictions as a single column."""
        return self.model.predict(X).reshape(-1, 1)

    def show(self):
        """Print the algorithm name together with the wrapped model."""
        print( '{}: {}'.format( self.name, self.model ) )
# Toy data: two samples with three features each.
X = [[1, 2, 2], [4, 5, 6]]
y = [1, 2]
# Instantiate one wrapper per base algorithm.
boosting_clf = customTransform('GradientBoosting')
forest_clf = customTransform('RandomForest')
boosting_clf.show()
# Pipeline: the base predictions are combined by FeatureUnion and fed to a linear regression.
estimators = [('random_forest', forest_clf), ('gradient_boosting', boosting_clf)]
model = Pipeline(steps=[ ('models', FeatureUnion(estimators)), ('linear_regression', linear_model.LinearRegression())] )
# Fitting the ensemble.
model.fit(X, y)
# Predict on the training rows.
print(model.predict(X) )

Scikit learn Custom Transformer dimension mismatch

I'm coming from R, so scikit API still very confusing to me. I was following this tutorial http://michelleful.github.io/code-blog/2015/06/20/pipelines/ to learn about Pipelines. So let's create a fake dataset just for reference:
x1,x2,y
foo,zoo,1
bar,moo,2
goo,too,3
roo,zoo,4
too,moo,5
My goal is very simple: train a linear regression on y, using separate tfidf matrices from x1 and x2, plus some custom features from both x1 and x2 (ie, word length, etc).
Let's start with the simpler task of using only tfidf from x1. Here's the full code:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import time
import re
import math
def clip_RMSLE(y, y_pred, **kwargs):
    """Return the root mean squared logarithmic error after clipping predictions at 0.

    Negative predictions are treated as 0 before the logarithm is taken. The
    caller's ``y_pred`` is left untouched (the clipping works on a copy).

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Predicted values, same length as ``y``.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    float
        ``sqrt(sum((log(pred + 1) - log(y + 1)) ** 2) / len(y))``.

    Raises
    ------
    ZeroDivisionError
        If ``y`` is empty.
    """
    y_true = np.asarray(y, dtype=float)
    clipped = np.clip(np.asarray(y_pred, dtype=float), 0.0, None)
    squared_log_err = (np.log1p(clipped) - np.log1p(y_true)) ** 2.0
    # Divide by len(y) with plain Python arithmetic so an empty input still
    # raises ZeroDivisionError instead of silently returning nan.
    return float((squared_log_err.sum() * (1.0 / len(y))) ** 0.5)
class ColumnNgram(BaseEstimator, TransformerMixin):
    """Produce a tf-idf n-gram matrix from one text column of a data-frame.

    NOTE: ``fit`` learns nothing; a new TfidfVectorizer is created and fitted
    on every ``transform`` call, so each call builds its vocabulary from
    whatever data it is handed.
    """

    def __init__(self, colname, tokenizer, ngram_rg):
        self.colname = colname
        self.tokenizer = tokenizer
        self.ngram_rg = ngram_rg
        self.tfidf = None  # never assigned again in this version

    def fit(self, df, y=None):
        return self

    def transform(self, df, y=None):
        vectorizer = TfidfVectorizer(tokenizer=self.tokenizer,
                                     ngram_range=self.ngram_rg)
        texts = df[self.colname].values
        return vectorizer.fit_transform(texts)
# Wall-clock start, used for the timing message printed at the end of the script.
start = time.time()
seed = 1991
# Unigrams and bigrams.
ngram_rg = (1,2)
# greater_is_better=False marks clip_RMSLE as a loss rather than a score.
RMSLE = make_scorer(clip_RMSLE, greater_is_better=False)
def tokenizer(text):
    """Return the lowercase runs of two or more letters a-z found in ``text``.

    Falsy input (``None`` or an empty string) yields an empty list.
    """
    if not text:
        return []
    return re.findall('[a-z]{2,}', text.lower())
# Load the toy dataset; the target is the 'y' column.
df = pd.read_csv('fake.csv', sep=',')
y = df['y'].values
pipeline = Pipeline([('tfidf', ColumnNgram('x1', tokenizer, ngram_rg)),
                     ('linear_reg', LinearRegression(n_jobs=1))
                     ])
# random_state only has an effect when the folds are shuffled, and recent
# scikit-learn releases reject a seed given without shuffle=True.
kfold = KFold(n_splits=2, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, df, y, cv=kfold, scoring=RMSLE)
print(results)
print(results.mean())
end = time.time()
print('Timeto finish this thing: %0.2fs' % (end - start))
I'm getting the error ValueError: dimension mismatch, probably because some terms will not appear in both train/validation folds. What's the proper way of doing this? Thank you!
Change your ColumnNgram to this:
class ColumnNgram(BaseEstimator, TransformerMixin):
    """Tf-idf n-gram features for one text column, learned in ``fit`` and reused in ``transform``."""

    def __init__(self, colname, tokenizer, ngram_rg):
        self.colname = colname
        self.tokenizer = tokenizer
        self.ngram_rg = ngram_rg
        self.tfidf = None  # set by fit()

    def fit(self, df, y=None):
        """Create the vectorizer and fit it on the column's values."""
        self.tfidf = TfidfVectorizer(tokenizer=self.tokenizer,
                                     ngram_range=self.ngram_rg)
        training_texts = df[self.colname].values
        self.tfidf.fit(training_texts)
        return self

    def transform(self, df, y=None):
        """Transform the column with the vectorizer fitted in ``fit``."""
        texts = df[self.colname].values
        return self.tfidf.transform(texts)
You should create the vectorizer and learn from the training data in fit(). Currently you are re-fitting on the data in each call to transform(), which will obviously result in different features for the train and validation sets, as you suspected.
The proper way is to keep a TfidfVectorizer which learns data during the fit() and then only transform the new data in transform() instead of re-fitting the new data.

Using statsmodel estimations with scikit-learn cross validation, is it possible?

I posted this question on the Cross Validated forum and later realized that it might find a more appropriate audience on Stack Overflow instead.
I am looking for a way to use the fit object (result) obtained from Python statsmodels as input to cross_val_score from scikit-learn's cross-validation module.
The attached link suggests that it may be possible but I have not succeeded.
I am getting the following error
estimator should a be an estimator implementing 'fit' method
statsmodels.discrete.discrete_model.BinaryResultsWrapper object at
0x7fa6e801c590 was passed
Refer this link
Indeed, you cannot use cross_val_score directly on statsmodels objects, because of different interface: in statsmodels
training data is passed directly into the constructor
a separate object contains the result of model estimation
However, you can write a simple wrapper to make statsmodels objects look like sklearn estimators:
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin
class SMWrapper(BaseEstimator, RegressorMixin):
    """ A universal sklearn-style wrapper for statsmodels regressors

    Parameters
    ----------
    model_class : callable
        statsmodels model class, called as ``model_class(y, X)``.
    fit_intercept : bool, default True
        Whether a constant column is added to ``X`` before fitting and predicting.
    """

    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """Build the statsmodels model on ``(y, X)``, fit it and return ``self``."""
        if self.fit_intercept:
            # has_constant='add' always adds the column; with the default
            # setting it is skipped when X already contains a constant column,
            # which would leave fit and predict with different column counts.
            X = sm.add_constant(X, has_constant='add')
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
        return self

    def predict(self, X):
        """Return the fitted results' predictions for ``X``."""
        if self.fit_intercept:
            # Same setting as in fit, so e.g. a single-row X (where every
            # column looks constant) still gets its intercept column.
            X = sm.add_constant(X, has_constant='add')
        return self.results_.predict(X)
This class contains correct fit and predict methods, and can be used with sklearn, e.g. cross-validated or included into a pipeline. Like here:
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# Noisy synthetic regression data with a fixed seed.
X, y = make_regression(random_state=1, n_samples=300, noise=100)
# Cross-validate the statsmodels OLS wrapper and sklearn's own LinearRegression side by side.
print(cross_val_score(SMWrapper(sm.OLS), X, y, scoring='r2'))
print(cross_val_score(LinearRegression(), X, y, scoring='r2'))
You can see that the output of two models is identical, because they are both OLS models, cross-validated in the same way.
[0.28592315 0.37367557 0.47972639]
[0.28592315 0.37367557 0.47972639]
Following the suggestion of David (which gave me an error, complaining about missing function get_parameters) and the scikit learn documentation, I created the following wrapper for a linear regression.
It has the same interface of sklearn.linear_model.LinearRegression but in addition has also the function summary(), which gives the info about p-values, R2 and other statistics, as in statsmodels.OLS.
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin
import pandas as pd
import numpy as np
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.estimator_checks import check_estimator
class MyLinearRegression(BaseEstimator, RegressorMixin):
    """Linear regression backed by statsmodels OLS with a scikit-learn interface.

    Parameters
    ----------
    fit_intercept : bool, default True
        Whether a constant column is added to ``X`` before fitting and predicting.
    """

    def __init__(self, fit_intercept=True):
        self.fit_intercept = fit_intercept

    def fit(self, X, y, column_names=()):
        """Fit an OLS model and return ``self``.

        Parameters
        ------------
        X, y
            Training data and targets; validated with ``check_X_y``.
        column_names: list
            It is an optional value, such that this class knows
            what is the name of the feature to associate to
            each column of X. This is useful if you use the method
            summary(), so that it can show the feature name for each
            coefficient
        """
        if self.fit_intercept:
            # Always add the column, even when X already holds a constant
            # one, so the column count matches what predict() builds.
            X = sm.add_constant(X, has_constant='add')
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        self.X_ = X
        self.y_ = y
        if len(column_names) != 0:
            # list() accepts any sequence (tuples have no .copy()) and leaves
            # the caller's object untouched.
            cols = list(column_names)
            if self.fit_intercept:
                # The constant was prepended, so its name goes first; without
                # an intercept there is no extra column to name.
                cols.insert(0, 'intercept')
            X = pd.DataFrame(X, columns=cols)
        self.model_ = sm.OLS(y, X)
        self.results_ = self.model_.fit()
        return self

    def predict(self, X):
        """Return the fitted model's predictions for ``X``."""
        # Check is fit had been called
        check_is_fitted(self, 'model_')
        # Input validation
        X = check_array(X)
        if self.fit_intercept:
            X = sm.add_constant(X, has_constant='add')
        return self.results_.predict(X)

    def get_params(self, deep = False):
        """Return the constructor parameters as a dict."""
        return {'fit_intercept':self.fit_intercept}

    def summary(self):
        """Print the statsmodels summary of the fitted results."""
        print(self.results_.summary() )
Example of use:
# df_train and df_test are data-frames defined outside this snippet.
cols = ['feature1','feature2']
X_train = df_train[cols].values
X_test = df_test[cols].values
y_train = df_train['label']
y_test = df_test['label']
model = MyLinearRegression()
model.fit(X_train, y_train)
# Print the statsmodels summary of the fit.
model.summary()
model.predict(X_test)
If you want to show the names of the columns, you can call
model.fit(X_train, y_train, column_names=cols)
To use it in cross_validation:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation, scored with negative mean squared error.
scores = cross_val_score(MyLinearRegression(), X_train, y_train, cv=10, scoring='neg_mean_squared_error')
scores
For reference purposes, if you use the statsmodels formula API and/or the fit_regularized method, you can modify @David Dale's wrapper class in this way.
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from statsmodels.formula.api import glm as glm_sm
# This is an example wrapper for statsmodels GLM
class SMWrapper(BaseEstimator, RegressorMixin):
    """sklearn-style wrapper around a regularized statsmodels formula GLM.

    Parameters
    ----------
    family
        Passed to ``glm`` as ``family``.
    formula : str
        Formula handed to ``glm``; the target column is named ``'y'``.
    alpha, L1_wt
        Passed to ``fit_regularized``.
    """

    def __init__(self, family, formula, alpha, L1_wt):
        self.family = family
        self.formula = formula
        self.alpha = alpha
        self.L1_wt = L1_wt
        self.model = None
        self.result = None

    def fit(self, X, y):
        """Fit the regularized GLM on ``X`` with ``y`` attached as column ``'y'``; return ``self``."""
        data = pd.DataFrame(X).copy()
        # Attach the target by position. Concatenating a separately indexed
        # Series would align on index labels and misplace rows whenever X
        # carries a non-default index (e.g. a cross-validation subset).
        data['y'] = pd.Series(y).values
        self.model = glm_sm(self.formula, data, family=self.family)
        self.result = self.model.fit_regularized(alpha=self.alpha, L1_wt=self.L1_wt, refit=True)
        # Return the estimator itself, as in the other wrappers on this page;
        # the fitted results stay available as self.result.
        return self

    def predict(self, X):
        """Return the fitted result's predictions for ``X``."""
        return self.result.predict(X)
Though I think this is not technically scikit-learn, there is the package pmdarima (link to pmdarima package on PyPi) that wraps statsmodel and provides a scikit-learn like interface.

Categories