Related
I'm using scikit-learn version 0.19.2 (for onnx conversion compatibility),
and I'm having problems implementing ensemble methods with Pipeline.
The code below is trying to implement linear regression from two independent regressors:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn import linear_model
import sklearn
# Toy data: two samples with three features each
X = [[1, 2, 2], [4, 5, 6]]
y = [1, 2]
# Fit the two base models independently
forest_clf = sklearn.ensemble.RandomForestRegressor()
forest_clf.fit(X, y)
logistic_clf = linear_model.LogisticRegression()
logistic_clf.fit(X,y)
# Pipeline: a FeatureUnion of the two base models feeding a final LinearRegression.
# NOTE: this snippet ends in the TypeError quoted below, because the base
# models do not implement transform.
estimators = [('random_forest', forest_clf), ('logistic', logistic_clf)]
model = Pipeline( steps=[ ('models', FeatureUnion(estimators) ), ('linear_regression', linear_model.base.LinearRegression() ) ] )
# Fit the whole ensemble
model.fit(X,y)
resulting in error
TypeError: All estimators should implement fit and transform. 'RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=None, verbose=0, warm_start=False)' (type ) doesn't
Can someone help me figure out what went wrong?
p.s I'm looking for some Pipeline technique to imitate the sklearn.ensemble.StackingRegressor from version 0.23.
From the documentation of the Pipeline class:
Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods.
As the error message indicates, the RandomForestRegressor does not implement the transform function (neither does LogisticRegression). Hence, they cannot be used directly as transformers in the pipeline.
If you want to use a pipeline, the only workaround I see is to wrap them in custom classes that implement the transform function as needed:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
class RFTransformer(BaseEstimator, TransformerMixin):
    """Transformer whose single output feature is a random forest's prediction."""

    def __init__(self):
        # The wrapped regressor is created with its default settings.
        self.rf = RandomForestRegressor()

    def fit(self, X, y=None):
        """Fit the wrapped random forest on (X, y) and return this transformer."""
        forest = self.rf
        forest.fit(X, y)
        return self

    def transform(self, X):
        """Return the forest's predictions for X as a one-column 2-D array."""
        predictions = self.rf.predict(X)
        # reshape(-1, 1) turns the flat prediction vector into a column.
        return predictions.reshape(-1, 1)
class LRTransformer(BaseEstimator, TransformerMixin):
    """Transformer whose single output feature is a logistic regression's prediction."""

    def __init__(self):
        # The wrapped classifier is created with its default settings.
        self.lr = LogisticRegression()

    def fit(self, X, y=None):
        """Fit the wrapped logistic regression on (X, y) and return this transformer."""
        classifier = self.lr
        classifier.fit(X, y)
        return self

    def transform(self, X):
        """Return the model's predictions for X as a one-column 2-D array."""
        predictions = self.lr.predict(X)
        # reshape(-1, 1) turns the flat prediction vector into a column.
        return predictions.reshape(-1, 1)
These custom transformers will simply fit their respective models and the transform function will return their prediction (and if I understand correctly, this is what you want to concatenate and pass to the final estimator).
Now, you can use these transformers like this:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
# Toy data: two samples with three features each
X = [[1, 2, 2], [4, 5, 6]]
y = [1, 2]
# Instantiate the wrapper transformers (each builds its own model internally)
forest_clf = RFTransformer()
logistic_clf = LRTransformer()
# Pipeline: the FeatureUnion concatenates both prediction columns, which
# become the inputs of the final LinearRegression
estimators = [('random_forest', forest_clf), ('logistic', logistic_clf)]
model = Pipeline(steps=[('models', FeatureUnion(estimators)), ('linear_regression', LinearRegression())])
# Fit the wrapped base models and the final regressor in one call
model.fit(X, y)
Finally, just to confirm the desired output:
# Predict on the training data to check the fitted ensemble
print(model.predict(X))
# Output
# [1. 2.]
Based on @afsharov's answer, I've made a general class for a custom transformer:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
import sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
class customTransform(BaseEstimator, TransformerMixin):
    """Expose a regressor's predictions as a single transformer feature.

    The model is selected by name, so several differently configured
    instances can be combined in a ``FeatureUnion``.

    Parameters
    ----------
    algo_type : str
        One of ``'GradientBoosting'``, ``'svm'``, ``'LogisticRegression'``,
        ``'sgd'``, ``'LinearRegression'``, ``'TheilSen'`` or ``'RandomForest'``.

    Raises
    ------
    ValueError
        If ``algo_type`` is not one of the supported names.
    """

    def __init__(self, algo_type):
        # Import from the public module locations; the private sub-module
        # paths (linear_model.base, linear_model.stochastic_gradient,
        # linear_model.theil_sen, linear_model.svm) are not part of the
        # public API.
        from sklearn.ensemble import (GradientBoostingRegressor,
                                      RandomForestRegressor)
        from sklearn.linear_model import (LinearRegression, LogisticRegression,
                                          SGDRegressor, TheilSenRegressor)
        from sklearn.svm import SVR

        # Zero-argument factories, so only the requested model is built.
        factories = {
            'GradientBoosting': GradientBoostingRegressor,
            'svm': SVR,
            'LogisticRegression': lambda: LogisticRegression(solver='newton-cg'),
            'sgd': SGDRegressor,
            'LinearRegression': LinearRegression,
            'TheilSen': TheilSenRegressor,
            'RandomForest': lambda: RandomForestRegressor(criterion='mae'),
        }
        if algo_type not in factories:
            # Previously an unknown name surfaced as an UnboundLocalError.
            raise ValueError(
                'Unknown algo_type {!r}; expected one of {}'.format(
                    algo_type, sorted(factories)))
        model = factories[algo_type]()

        # BaseEstimator.get_params() (used by clone() and repr()) reads the
        # attribute named after the constructor argument, so it must exist.
        self.algo_type = algo_type
        # set the name and the model
        self.name = algo_type
        self.model = model
        # Kept for backward compatibility: the model is also reachable under
        # an attribute named after the algorithm.
        setattr(self, algo_type, model)

    def fit(self, X, y=None):
        """Fit the wrapped model on (X, y) and return this transformer."""
        self.model.fit(X, y)
        return self

    def transform(self, X):
        """Return the wrapped model's predictions for X as a one-column array."""
        return self.model.predict(X).reshape(-1, 1)

    def show(self):
        """Print the algorithm name together with the wrapped model."""
        print('{}: {}'.format(self.name, self.model))
# Toy data: two samples with three features each
X = [[1, 2, 2], [4, 5, 6]]
y = [1, 2]
# Instantiate the transformers by algorithm name
boosting_clf = customTransform('GradientBoosting')
forest_clf = customTransform('RandomForest')
# Print which model the boosting transformer wraps
boosting_clf.show()
# Pipeline: the FeatureUnion concatenates both prediction columns, which
# become the inputs of the final LinearRegression
estimators = [('random_forest', forest_clf), ('gradient_boosting', boosting_clf)]
model = Pipeline(steps=[ ('models', FeatureUnion(estimators)), ('linear_regression', linear_model.LinearRegression())] )
# Fit the wrapped base models and the final regressor in one call
model.fit(X, y)
# Predict on the training data
print(model.predict(X) )
I would like to create a class that takes a scikit-learn pipeline and loop over it (as in the code example below).
In the example below I can however only pass the instance of my pipeline to the class and not create a new one in order to start with a fresh model.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
class my_class:
    """Repeatedly fit one model and record its accuracy on the fitting data."""

    def __init__(self,model):
        # Any object exposing fit(X, y) and predict(X).
        self.model = model

    def evaluate(self, X, y):
        """Fit ``self.model`` ten times on (X, y) and return the ten accuracy scores.

        The same model instance is refitted on every round and is scored on
        the data it was fitted on.
        """
        scores = []
        for _ in range(10):
            self.model.fit(X,y) #I always use the same instance here.
            predicted = self.model.predict(X)
            round_score = accuracy_score(y_pred=predicted, y_true=y)
            scores.append(round_score)
        return scores
# Load the iris data set as feature matrix X and label vector y
iris = load_iris()
X = iris.data
y = iris.target
# Single-step pipeline holding an AdaBoost classifier
pipeline = Pipeline([
('classifier', AdaBoostClassifier())
])
# Hand the pipeline instance to the evaluator and collect the accuracy scores
test = my_class(pipeline)
scores = test.evaluate(X,y)
Your code might be initializing different models, but the result will always be the same, because you are training and testing on the same data X every time, without any change in the hyperparameters and with random_state left as None. That is why the results list will always contain the same value.
I'm building a sklearn Pipeline whose parameters are optimized by GridSearchCV. The pipeline has to find the best model for several different entities using a pre-train-then-fine-tune approach: pre-train on all the entities together, then fine-tune on every single entity and return one model per entity. These are the constraints of the pipeline:
Pre-training and fine-tuning have to be in the same pipeline, because both models have to see the same data in each GridSearchCV fold.
The pre-train model has to pass its weights to the fine-tuning model.
I have implemented:
A Sklearn Transformer that takes a data-frame with all the entities in input and fit itself.
A Sklearn Regressor that splits the data-frame in one data-frame for each entity and fit a Keras model for each entity.
What I'm missing is how to pass from the Pre-train transformer to the Fine-tuning transformer the weights obtained by the Pre-train transformer (considering that each GridSearchCV fold has different weights)
Here is the code:
import pandas as pd
import numpy as np
import random
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.metrics import mean_squared_error
from keras.models import Model
from keras.layers import Dense, Input
import copy
class MyRegressor(BaseEstimator, TransformerMixin):
    """Keras regressor with one hidden layer that is also a pass-through transformer.

    Inputs and targets are both read from columns of the frame passed as
    ``X``; the ``y`` argument of ``fit`` and ``score`` is ignored.

    Parameters
    ----------
    neurons : int
        Number of units in the hidden Dense layer.
    featInput : list of str
        Columns of ``X`` used as network inputs.
    featOutput : list of str
        Columns of ``X`` used as regression targets.
    """

    def __init__(self, neurons, featInput, featOutput):
        self.neurons = neurons
        # Optional pre-trained weights; nothing is loaded while this is None.
        self.preTrain = None
        self.featInput = featInput
        self.featOutput = featOutput

    def fit(self, X, y=None):
        """Build, compile and train the network on the columns of X; return self."""
        X_train = X[self.featInput]
        y_train = X[self.featOutput]
        inputLayer = Input(shape=(len(self.featInput), ), name='INPUT')
        hidden = Dense(self.neurons, name='HIDDEN')(inputLayer)
        outputLayer = Dense(len(self.featOutput), name='OUTPUT')(hidden)
        self.model = Model(inputLayer, outputLayer)
        self.model.compile(loss='mse', optimizer='rmsprop')
        if self.preTrain is not None:
            # Keras models expose ``load_weights`` (snake_case); ``loadWeights``
            # does not exist and raised AttributeError.
            # NOTE(review): assumes preTrain is a path to a saved weights file;
            # use ``set_weights`` instead if it holds in-memory weight arrays.
            self.model.load_weights(self.preTrain)
        self.model.fit(X_train, y_train)
        return self

    def predict(self, X):
        """Return the network's predictions for the input columns of X."""
        return self.model.predict(X[self.featInput])

    def transform(self, X):
        """Return X unchanged so later pipeline steps receive the full frame."""
        return X

    def score(self, X, y=None, sample_weight=None):
        """Return the negated mean squared error on X (greater is better).

        scikit-learn model selection maximises ``score``; returning the raw
        MSE would make GridSearchCV prefer the worst model, so the error is
        negated. ``y`` and ``sample_weight`` are accepted but not used.
        """
        y_true = X[self.featOutput]
        y_pred = self.predict(X)
        return -mean_squared_error(y_true, y_pred)
class LoopTransformer(BaseEstimator, TransformerMixin):
    """Fit one independent copy of ``component`` per distinct entity in the data.

    Parameters
    ----------
    columns : list of str
        Columns whose combined values identify an entity.
    component : estimator
        Template estimator; a deep copy of it is fitted for every entity.
    """

    def __init__(self, columns, component):
        self.columns = columns
        self.component = component
        # One {'id': key row, 'component': fitted copy} dict per entity.
        self.components = []

    def _entity_rows(self, X, entity_id):
        """Return a copy of the rows of X whose key columns all equal entity_id."""
        matches = (X[self.columns] == entity_id).all(axis=1)
        return X[matches].copy()

    def fit(self, X, y=None):
        """Fit a fresh copy of the template on each entity's rows; return self.

        ``y`` is forwarded unchanged (not split per entity) to every component.
        """
        # Start from scratch so that refitting does not keep stale components
        # from a previous call.
        self.components = []
        for _, entity_id in X[self.columns].drop_duplicates().iterrows():
            fitted = copy.deepcopy(self.component)
            fitted.fit(self._entity_rows(X, entity_id), y)
            self.components.append({'id': entity_id, 'component': fitted})
        return self

    def predict(self, X):
        """Predict every entity with its own component and concatenate the results.

        Array-like predictions are wrapped in a DataFrame indexed like the
        entity's rows, because ``pd.concat`` only accepts pandas objects.
        Entities that have no rows in X are skipped.
        """
        results = []
        for comp in self.components:
            entityDf = self._entity_rows(X, comp['id'])
            if len(entityDf) == 0:
                continue
            res = comp['component'].predict(entityDf)
            if not isinstance(res, (pd.DataFrame, pd.Series)):
                res = pd.DataFrame(np.asarray(res), index=entityDf.index)
            results.append(res)
        return pd.concat(results)

    def score(self, X, y=None, sample_weight=None):
        """Return the unweighted mean of the component scores.

        Only entities that have rows in X contribute. ``y`` and
        ``sample_weight`` are accepted but not used.
        """
        results = []
        for comp in self.components:
            entityDf = self._entity_rows(X, comp['id'])
            if len(entityDf) > 0:
                results.append(comp['component'].score(entityDf))
        return np.average(results)
# Create the input dataframe: 3 entities, each with a random number (10 to 20)
# of rows where output = input * (entity + 1)
dataFrame = pd.DataFrame([], columns=['entityId', 'input', 'output'])
for entity in range(3):
    x = np.arange(random.randint(10, 20))
    y = x * (entity + 1)
    tempDf = pd.DataFrame(np.array([x, y]).T, columns=['input', 'output'])
    tempDf['entityId'] = entity
    dataFrame = pd.concat([dataFrame, tempDf], sort=False)
dataFrame = dataFrame.reset_index(drop=True)
# Create the pipeline: a pre-training step on the whole frame, followed by a
# per-entity fine-tuning step that wraps a second regressor
neurons = [5, 10]
myPipe = Pipeline([('preTrain',
MyRegressor(neurons=neurons[0], featInput=['input'], featOutput=['output'])),
('fineTuning',
LoopTransformer(['entityId'],
MyRegressor(
neurons=neurons[0],
featInput=['input'],
featOutput=['output'])))])
# Pre-train and fine-tuning must always have the same number of neurons, so the
# grid is a list of two single-point grids rather than a full cross product
params = [{
'preTrain__neurons': [neurons[0]],
'fineTuning__component__neurons': [neurons[0]]
}, {
'preTrain__neurons': [neurons[1]],
'fineTuning__component__neurons': [neurons[1]]
}]
gs = GridSearchCV(myPipe, params, verbose=1, cv=3)
# The same frame is passed as X and y: the regressors read their targets from
# columns of X
gs.fit(dataFrame, dataFrame)
score = gs.score(dataFrame, dataFrame)
print(score)
I'm pretty sure sklearn.Pipeline as is doesn't support this. But as long as you don't clone your pipeline (which happens, for instance, if you use a GridSearchCV), you can hack your way through with code like the following, which gives the instance of one step in the pipeline to the next step. You can apply the same principle in your pipeline:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.base import BaseEstimator, TransformerMixin
class MyTransformer(BaseEstimator, TransformerMixin):
    """Pass-through step that reports the means learned by a shared scaler."""

    def __init__(self, scaler):
        # Reference to a scaler instance that is fitted elsewhere.
        self.scaler = scaler

    def fit(self, X, y=None):
        """Print the shared scaler's ``mean_`` attribute and return self."""
        message = "got the means: %s" % self.scaler.mean_
        print(message)
        return self

    def transform(self, X):
        """Return X unchanged."""
        return X
X, y = load_iris(return_X_y=True)
# The same scaler object is used as the first pipeline step and handed to
# MyTransformer, so the second step can read what the first one learned
scaler = StandardScaler()
pipeline = make_pipeline(scaler,
MyTransformer(scaler),
LogisticRegression(solver='lbfgs',
multi_class='auto'))
pipeline = pipeline.fit(X, y)
# Shift the data and refit: the printed means follow the new data
X = X - 1
pipeline = pipeline.fit(X, y)
Which would give you this output, as expected:
got the means: [5.84333333 3.05733333 3.758 1.19933333]
got the means: [4.84333333 2.05733333 2.758 0.19933333]
I'm coming from R, so scikit API still very confusing to me. I was following this tutorial http://michelleful.github.io/code-blog/2015/06/20/pipelines/ to learn about Pipelines. So let's create a fake dataset just for reference:
x1,x2,y
foo,zoo,1
bar,moo,2
goo,too,3
roo,zoo,4
too,moo,5
My goal is very simple: train a linear regression on y, using separate tfidf matrices from x1 and x2, plus some custom features from both x1 and x2 (ie, word length, etc).
Let's start with the simpler task of using only tfidf from x1. Here's the full code:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import time
import re
import math
def clip_RMSLE(y, y_pred, **kwargs):
    """Return the root mean squared logarithmic error with predictions clipped at 0.

    Negative predictions are treated as 0 before the logarithm is taken. The
    inputs are not modified, and both are converted to flat float arrays
    first, so lists, arrays and pandas Series with any index are accepted.

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Predicted values, with the same number of elements as ``y``.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    float
        sqrt(mean((log(y_pred + 1) - log(y + 1)) ** 2)).

    Raises
    ------
    ValueError
        If the inputs are empty or differ in size.
    """
    actual = np.ravel(np.asarray(y, dtype=float))
    # np.clip returns a new array, so the caller's predictions stay untouched.
    predicted = np.clip(np.ravel(np.asarray(y_pred, dtype=float)), 0.0, None)
    if actual.size != predicted.size:
        raise ValueError(
            'y and y_pred must have the same number of elements '
            '({} != {})'.format(actual.size, predicted.size))
    if actual.size == 0:
        raise ValueError('y and y_pred must not be empty')
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return float(np.sqrt(np.mean(log_diff ** 2)))
class ColumnNgram(BaseEstimator, TransformerMixin):
    """Turn one text column of a data frame into a tf-idf n-gram matrix.

    NOTE: ``fit`` learns nothing; a new vectorizer is fitted on whatever data
    is passed to ``transform``, on every call.
    """

    def __init__(self, colname, tokenizer, ngram_rg):
        self.colname, self.tokenizer, self.ngram_rg = colname, tokenizer, ngram_rg
        # Never assigned again by this class.
        self.tfidf = None

    def fit(self, df, y=None):
        """Do nothing and return self."""
        return self

    def transform(self, df, y=None):
        """Fit a fresh TfidfVectorizer on the column of ``df`` and return its matrix."""
        vectorizer = TfidfVectorizer(tokenizer=self.tokenizer, ngram_range=self.ngram_rg)
        column_values = df[self.colname].values
        return vectorizer.fit_transform(column_values)
# Wall-clock start, used for the timing report at the end of the script
start = time.time()
seed = 1991
# Use unigrams and bigrams
ngram_rg = (1,2)
# Wrap the metric as a scorer; greater_is_better=False marks it as a loss
RMSLE = make_scorer(clip_RMSLE, greater_is_better=False)
def tokenizer(text):
    """Return the lowercase runs of two or more letters a-z found in ``text``.

    Falsy input (``None`` or an empty string) yields an empty list.
    """
    if not text:
        return []
    return re.findall('[a-z]{2,}', text.lower())
# Load the data set; the target is the 'y' column
df = pd.read_csv('fake.csv', sep=',')
y = df['y'].values
# Tf-idf features from column x1 feeding a linear regression
pipeline = Pipeline([('tfidf', ColumnNgram('x1', tokenizer, ngram_rg)),
('linear_reg', LinearRegression(n_jobs=1))
])
# Two-fold cross-validation scored with the clipped RMSLE scorer
kfold = KFold(n_splits=2, random_state=seed)
results = cross_val_score(pipeline, df, y, cv=kfold, scoring=RMSLE)
print(results)
print(results.mean())
# Report the elapsed wall-clock time
end = time.time()
print('Timeto finish this thing: %0.2fs' % (end - start))
I'm getting the error ValueError: dimension mismatch, probably because some terms will not appear in both train/validation folds. What's the proper way of doing this? Thank you!
Change your ColumnNgram to this:
class ColumnNgram(BaseEstimator, TransformerMixin):
    """Turn one text column of a data frame into a tf-idf n-gram matrix.

    The vectorizer is fitted once, in ``fit``, and only applied in
    ``transform``.
    """

    def __init__(self, colname, tokenizer, ngram_rg):
        self.colname, self.tokenizer, self.ngram_rg = colname, tokenizer, ngram_rg
        # Holds the fitted TfidfVectorizer once fit() has been called.
        self.tfidf = None

    def fit(self, df, y=None):
        """Fit a new TfidfVectorizer on the configured column of ``df``; return self."""
        vectorizer = TfidfVectorizer(tokenizer=self.tokenizer, ngram_range=self.ngram_rg)
        self.tfidf = vectorizer
        vectorizer.fit(df[self.colname].values)
        return self

    def transform(self, df, y=None):
        """Apply the fitted vectorizer to the configured column of ``df``."""
        column_values = df[self.colname].values
        return self.tfidf.transform(column_values)
You should declare the vectorizer and learn from the training data in fit(). Currently you are re-fitting on the data in each call to transform(), which obviously will result in different features in the train and validation sets, as you have suggested.
The proper way is to keep a TfidfVectorizer which learns data during the fit() and then only transform the new data in transform() instead of re-fitting the new data.
I posted this question to the Cross Validated forum and later realized that it might find a more appropriate audience on Stack Overflow instead.
I am looking for a way to use the fit object (result) obtained from Python statsmodels to feed into the cross_val_score method of scikit-learn's cross_validation module.
The attached link suggests that it may be possible but I have not succeeded.
I am getting the following error
estimator should a be an estimator implementing 'fit' method
statsmodels.discrete.discrete_model.BinaryResultsWrapper object at
0x7fa6e801c590 was passed
Refer this link
Indeed, you cannot use cross_val_score directly on statsmodels objects, because of different interface: in statsmodels
training data is passed directly into the constructor
a separate object contains the result of model estimation
However, you can write a simple wrapper to make statsmodels objects look like sklearn estimators:
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin
class SMWrapper(BaseEstimator, RegressorMixin):
    """A universal sklearn-style wrapper for statsmodels regressors.

    Parameters
    ----------
    model_class : callable
        statsmodels model class, called as ``model_class(y, X)``.
    fit_intercept : bool, default True
        Whether to prepend a constant column to X before fitting and
        predicting.
    """

    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """Fit the statsmodels model on (X, y) and return self."""
        if self.fit_intercept:
            # has_constant='add' always adds the intercept column. The default
            # ('skip') silently leaves it out when X already contains a
            # constant column.
            X = sm.add_constant(X, has_constant='add')
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
        return self

    def predict(self, X):
        """Return the fitted model's predictions for X."""
        if self.fit_intercept:
            # Must mirror fit(): with the default 'skip', a batch in which
            # some feature happens to be constant (e.g. a single row) would
            # get no intercept column and no longer match the fitted design.
            X = sm.add_constant(X, has_constant='add')
        return self.results_.predict(X)
This class contains correct fit and predict methods, and can be used with sklearn, e.g. cross-validated or included into a pipeline. Like here:
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# Synthetic regression problem: 300 samples, noise level 100, fixed seed
X, y = make_regression(random_state=1, n_samples=300, noise=100)
# Cross-validated R^2 of the statsmodels OLS wrapper, then of sklearn's
# LinearRegression for comparison
print(cross_val_score(SMWrapper(sm.OLS), X, y, scoring='r2'))
print(cross_val_score(LinearRegression(), X, y, scoring='r2'))
You can see that the output of two models is identical, because they are both OLS models, cross-validated in the same way.
[0.28592315 0.37367557 0.47972639]
[0.28592315 0.37367557 0.47972639]
Following the suggestion of David (which gave me an error, complaining about missing function get_parameters) and the scikit learn documentation, I created the following wrapper for a linear regression.
It has the same interface of sklearn.linear_model.LinearRegression but in addition has also the function summary(), which gives the info about p-values, R2 and other statistics, as in statsmodels.OLS.
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin
import pandas as pd
import numpy as np
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.estimator_checks import check_estimator
class MyLinearRegression(BaseEstimator, RegressorMixin):
    """Linear regression with the sklearn interface, backed by statsmodels OLS.

    In addition to ``fit`` and ``predict`` it offers ``summary()``, which
    prints the statsmodels results summary.

    Parameters
    ----------
    fit_intercept : bool, default True
        Whether to prepend a constant (intercept) column to X.
    """

    def __init__(self, fit_intercept=True):
        self.fit_intercept = fit_intercept

    def fit(self, X, y, column_names=()):
        """Fit an OLS model on (X, y) and return self.

        Parameters
        ----------
        X : array-like
            Feature matrix.
        y : array-like
            Target values.
        column_names : sequence of str, optional
            Name of the feature held in each column of X. When given,
            ``summary()`` shows these names next to the coefficients.
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        if self.fit_intercept:
            # has_constant='add' always adds the intercept column, even when X
            # already contains a constant column.
            X = sm.add_constant(X, has_constant='add')
        self.X_ = X
        self.y_ = y
        if len(column_names) != 0:
            # list() copies the names and also accepts tuples.
            cols = list(column_names)
            if self.fit_intercept:
                # The constant is the first column of X, so name it first;
                # without an intercept there is no such column to name.
                cols.insert(0, 'intercept')
            X = pd.DataFrame(X, columns=cols)
        self.model_ = sm.OLS(y, X)
        self.results_ = self.model_.fit()
        return self

    def predict(self, X):
        """Return the fitted model's predictions for X."""
        # Check if fit has been called
        check_is_fitted(self, 'model_')
        # Input validation
        X = check_array(X)
        if self.fit_intercept:
            # Mirror fit(): always add the intercept column, even if a feature
            # happens to be constant in this batch (e.g. a single row).
            X = sm.add_constant(X, has_constant='add')
        return self.results_.predict(X)

    def get_params(self, deep=False):
        """Return the constructor parameters as a dict."""
        return {'fit_intercept': self.fit_intercept}

    def summary(self):
        """Print the statsmodels summary of the fitted results."""
        print(self.results_.summary())
Example of use:
# Select the feature columns and the label from the train and test frames
cols = ['feature1','feature2']
X_train = df_train[cols].values
X_test = df_test[cols].values
y_train = df_train['label']
y_test = df_test['label']
# Fit, print the regression summary, then predict on the test features
model = MyLinearRegression()
model.fit(X_train, y_train)
model.summary()
model.predict(X_test)
If you want to show the names of the columns, you can call
# Pass the feature names so that summary() labels each coefficient
model.fit(X_train, y_train, column_names=cols)
To use it in cross_validation:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation scored with negative mean squared error
scores = cross_val_score(MyLinearRegression(), X_train, y_train, cv=10, scoring='neg_mean_squared_error')
scores
For reference purposes, if you use the statsmodels formula API and/or the fit_regularized method, you can modify @David Dale's wrapper class in this way.
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from statsmodels.formula.api import glm as glm_sm
# This is an example wrapper for statsmodels GLM
class SMWrapper(BaseEstimator, RegressorMixin):
    """sklearn-style wrapper around a regularized statsmodels formula GLM.

    Parameters
    ----------
    family : statsmodels family object
        Passed as ``family`` to the GLM.
    formula : str
        Model formula; the response column is named ``'y'``.
    alpha : float or array-like
        Passed as ``alpha`` to ``fit_regularized``.
    L1_wt : float
        Passed as ``L1_wt`` to ``fit_regularized``.
    """

    def __init__(self, family, formula, alpha, L1_wt):
        self.family = family
        self.formula = formula
        self.alpha = alpha
        self.L1_wt = L1_wt
        # Set by fit(): the statsmodels model and its regularized fit result.
        self.model = None
        self.result = None

    def fit(self, X, y):
        """Fit the regularized GLM on (X, y) and return self.

        The statsmodels results object is available as ``self.result``.
        """
        data = pd.DataFrame(X).copy()
        # Attach y by position. Concatenating along axis=1 aligns on the
        # index and mis-pairs (or NaN-pads) rows whenever X and y carry
        # different indexes, e.g. inside cross-validation folds.
        data['y'] = pd.Series(y).values
        self.model = glm_sm(self.formula, data, family=self.family)
        self.result = self.model.fit_regularized(alpha=self.alpha, L1_wt=self.L1_wt, refit=True)
        # The scikit-learn estimator contract is that fit() returns the
        # estimator itself, not the results object.
        return self

    def predict(self, X):
        """Return the fitted model's predictions for X."""
        return self.result.predict(X)
Though I think this is not technically scikit-learn, there is the package pmdarima (link to pmdarima package on PyPi) that wraps statsmodel and provides a scikit-learn like interface.