Scikit learn Custom Transformer dimension mismatch - python

I'm coming from R, so the scikit-learn API is still very confusing to me. I was following this tutorial http://michelleful.github.io/code-blog/2015/06/20/pipelines/ to learn about Pipelines. Let's create a fake dataset just for reference:
x1,x2,y
foo,zoo,1
bar,moo,2
goo,too,3
roo,zoo,4
too,moo,5
My goal is very simple: train a linear regression on y, using separate tfidf matrices from x1 and x2, plus some custom features from both x1 and x2 (e.g. word length).
Let's start with the simpler task of using only tfidf from x1. Here's the full code:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import time
import re
import math
def clip_RMSLE(y, y_pred, **kwargs):
    y_pred[y_pred < 0] = 0.0
    to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i, pred in enumerate(y_pred)]
    return (sum(to_sum) * (1.0 / len(y))) ** 0.5
class ColumnNgram(BaseEstimator, TransformerMixin):
    def __init__(self, colname, tokenizer, ngram_rg):
        self.colname = colname
        self.tokenizer = tokenizer
        self.ngram_rg = ngram_rg
        self.tfidf = None

    def transform(self, df, y=None):
        tfidf = TfidfVectorizer(tokenizer=self.tokenizer, ngram_range=self.ngram_rg)
        return tfidf.fit_transform(df[self.colname].values)

    def fit(self, df, y=None):
        return self
start = time.time()
seed = 1991
ngram_rg = (1,2)
RMSLE = make_scorer(clip_RMSLE, greater_is_better=False)
def tokenizer(text):
    if text:
        result = re.findall('[a-z]{2,}', text.lower())
    else:
        result = []
    return result
df = pd.read_csv('fake.csv', sep=',')
y = df['y'].values
pipeline = Pipeline([('tfidf', ColumnNgram('x1', tokenizer, ngram_rg)),
                     ('linear_reg', LinearRegression(n_jobs=1))
                     ])
kfold = KFold(n_splits=2, random_state=seed)
results = cross_val_score(pipeline, df, y, cv=kfold, scoring=RMSLE)
print(results)
print(results.mean())
end = time.time()
print('Time to finish this thing: %0.2fs' % (end - start))
I'm getting the error ValueError: dimension mismatch, probably because some terms will not appear in both train/validation folds. What's the proper way of doing this? Thank you!

Change your ColumnNgram to this:
class ColumnNgram(BaseEstimator, TransformerMixin):
    def __init__(self, colname, tokenizer, ngram_rg):
        self.colname = colname
        self.tokenizer = tokenizer
        self.ngram_rg = ngram_rg
        self.tfidf = None

    def transform(self, df, y=None):
        return self.tfidf.transform(df[self.colname].values)

    def fit(self, df, y=None):
        self.tfidf = TfidfVectorizer(tokenizer=self.tokenizer, ngram_range=self.ngram_rg)
        self.tfidf.fit(df[self.colname].values)
        return self
You should declare and learn the vocabulary from the training data in fit(). Currently you are re-fitting the vectorizer in each call to transform(), which will obviously result in different features for the train and validation sets, as you suggested.
The proper way is to keep a single TfidfVectorizer which learns the vocabulary during fit() and then only transforms the new data in transform(), instead of re-fitting on the new data.
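To see why this matters, here is a minimal sketch (with toy documents, not your data): a vectorizer fitted once in fit() gives train and validation matrices with the same number of columns, while re-fitting on the validation fold yields a different feature space.
from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = ['foo bar', 'bar goo', 'goo too']
valid_docs = ['roo zoo', 'too']

tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(train_docs)   # learns the vocabulary: bar, foo, goo, too
X_valid = tfidf.transform(valid_docs)       # reuses that vocabulary; unseen terms are ignored

print(X_train.shape, X_valid.shape)                        # (3, 4) (2, 4): same number of columns
print(TfidfVectorizer().fit_transform(valid_docs).shape)   # (2, 3): a different feature space if re-fitted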

Related

Find and use top 10 features in XGBoost regression pipeline

I want to get the top 10 features of an XGBRegressor. With ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10) I get the top 10 features, but how could I use this in my pipeline?
I have this class FeatureSelector_Only_Top_10; how could I use only the top 10 features and later print them out, for example with print(grid.feature_selection_top_10.top10features)?
Imports:
import time
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
XGB:
xgb_reg_start = time.time()
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train_nor, y_train)
training_preds_xgb_reg = xgb_reg.predict(X_train_nor)
val_preds_xgb_reg = xgb_reg.predict(X_test_nor)
xgb_reg_end = time.time()
print(f"Time taken to run: {round((xgb_reg_end - xgb_reg_start)/60,1)} minutes")
print("\nTraining MSE:", round(metrics.mean_squared_error(y_train, training_preds_xgb_reg),4))
print("Validation MSE:", round(metrics.mean_squared_error(y_test, val_preds_xgb_reg),4))
print("\nTraining r2:", round(metrics.r2_score(y_train, training_preds_xgb_reg),4))
print("Validation r2:", round(metrics.r2_score(y_test, val_preds_xgb_reg),4))
ft_weights_xgb_reg = pd.DataFrame(xgb_reg.feature_importances_, columns=['weight'], index=X_train.columns)
ft_weights_xgb_reg.sort_values('weight', inplace=True)
ft_weights_xgb_reg.sort_values(by='weight', ascending=False).head(10)
Pipeline:
class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=10):
        self.n_components = n_components

    def fit(self, X, y=None):
        # Don't know
        return self

    def transform(self, X, y=None):
        # Don't know
        return X
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

steps = [#('feature_selection_top_10', FeatureSelector_Only_Top_10()),
         #('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=100))),
         ('lasso', Lasso(alpha=0.03))]

pipeline = Pipeline(steps)
parameters = {}
grid = GridSearchCV(pipeline, param_grid=parameters, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print("score = %3.2f" % (grid.score(X_test, y_test)))
If you want to select the N best features of your dataset in your Pipeline, you should define a custom Transformer.
This object should train XGBoost and select the N best features during the fit() method. Then, during the transform() method, it should filter your dataset accordingly.
I would do it as follows:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Lasso
import pandas as pd
import xgboost as xgb
class FeatureSelector_Only_Top_10(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=10):
        self.n_components = n_components
        self.top_n_features = None

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        xgb_reg = xgb.XGBRegressor()
        xgb_reg.fit(X, y)
        self.top_n_features = (pd.DataFrame(
                                   xgb_reg.feature_importances_,
                                   columns=['weight'],
                                   index=X.columns)
                               .sort_values(by='weight', ascending=False)
                               .head(self.n_components)
                               )
        return self

    def transform(self, X, y=None):
        return pd.DataFrame(X).filter(self.top_n_features.index)
X, y = make_regression(n_features=50)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

steps = [('feature_selection_top_10', FeatureSelector_Only_Top_10()),
         ('lasso', Lasso(alpha=0.03))]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
print("score = %3.2f" % (pipeline.score(X_test, y_test)))

# retrieve the top N features and their weights
pipeline['feature_selection_top_10'].top_n_features
You can include SelectFromModel in the pipeline in order to extract the top 10 features based on their importance weights; there is no need to create a custom transformer. As explained in the documentation, if you want to select exactly 10 features you need to set max_features=10 and threshold=-np.inf.
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
X, y = make_regression(n_features=100, n_samples=1000, random_state=42)
X = pd.DataFrame(data=X, columns=['x' + str(i) for i in range(X.shape[1])])
y = pd.Series(y, name='y')
pipeline = Pipeline([
    ('selector', SelectFromModel(estimator=XGBRegressor(), max_features=10, threshold=-np.inf)),
    ('regressor', LinearRegression())
])
pipeline.fit(X, y)
selected_features = pipeline['selector'].get_support()
print(selected_features.sum())
# 10
selected_features_names = X.columns[selected_features].tolist()
print(selected_features_names)
# ['x0', 'x14', 'x17', 'x35', 'x42', 'x43', 'x57', 'x71', 'x84', 'x95']
selected_features_importances = pipeline['selector'].estimator_.feature_importances_[selected_features]
print(selected_features_importances)
# [0.09361505 0.18474296 0.14420615 0.01952794 0.10946904 0.02192107 0.03307951 0.02948984 0.02851948 0.1216883]
selected_features_coefficients = pipeline['regressor'].coef_
print(selected_features_coefficients)
# [49.43000693 83.91437854 78.25242596 -0.76411769 56.67970515 0.16829694 28.81967319 0.50277914 24.55006237 68.17120687]

How to use BERT and Elmo embedding with sklearn

I created a text classifier with sklearn that uses Tf-Idf, and I want to use BERT and Elmo embeddings instead of Tf-Idf.
How would one do that?
I'm getting the BERT embedding with the code below:
from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings
# init embedding
embedding = TransformerWordEmbeddings('bert-base-uncased')
# create a sentence
sentence = Sentence('The grass is green .')
# embed words in sentence
embedding.embed(sentence)
Here is my current Tf-Idf based classifier:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
column_trans = ColumnTransformer([
    ('tfidf', TfidfVectorizer(), 'text'),
    ('number_scaler', MinMaxScaler(), ['number'])
])
# Initialize data
data = [
    ['This process, however, afforded me no means of.', 20, 1],
    ['another long description', 21, 1],
    ['It never once occurred to me that the fumbling', 19, 0],
    ['How lovely is spring As we looked from Windsor', 18, 0]
]
# Create DataFrame
df = pd.DataFrame(data, columns=['text', 'number', 'target'])
X = column_trans.fit_transform(df)
X = X.toarray()
y = df.loc[:, "target"].values
# Perform classification
classifier = LogisticRegression(random_state=0)
classifier.fit(X, y)
Sklearn offers the possibility to make custom data transformers (not to be confused with the "transformers" machine learning models).
I implemented a custom sklearn data transformer that uses the flair library that you use. Please note that I used TransformerDocumentEmbeddings instead of TransformerWordEmbeddings. I also added one that works with the transformers library directly.
I'm adding a SO question that discusses which transformer layer is interesting to use here.
I'm not familiar with Elmo, though I found this that uses tensorflow. You may be able to modify the code I shared to make Elmo work.
import torch
import numpy as np
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
from sklearn.base import BaseEstimator, TransformerMixin
class FlairTransformerEmbedding(TransformerMixin, BaseEstimator):

    def __init__(self, model_name='bert-base-uncased', batch_size=None, layers=None):
        # From https://lvngd.com/blog/spacy-word-vectors-as-features-in-scikit-learn/
        # For pickling reason you should not load models in __init__
        self.model_name = model_name
        self.model_kw_args = {'batch_size': batch_size, 'layers': layers}
        self.model_kw_args = {k: v for k, v in self.model_kw_args.items()
                              if v is not None}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        model = TransformerDocumentEmbeddings(
            self.model_name, fine_tune=False,
            **self.model_kw_args)
        sentences = [Sentence(text) for text in X]
        embedded = model.embed(sentences)
        embedded = [e.get_embedding().reshape(1, -1) for e in embedded]
        return np.array(torch.cat(embedded).cpu())
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from transformers import AutoTokenizer, AutoModel
from more_itertools import chunked
class TransformerEmbedding(TransformerMixin, BaseEstimator):

    def __init__(self, model_name='bert-base-uncased', batch_size=1, layer=-1):
        # From https://lvngd.com/blog/spacy-word-vectors-as-features-in-scikit-learn/
        # For pickling reason you should not load models in __init__
        self.model_name = model_name
        self.layer = layer
        self.batch_size = batch_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModel.from_pretrained(self.model_name)

        res = []
        for batch in chunked(X, self.batch_size):
            encoded_input = tokenizer.batch_encode_plus(
                batch, return_tensors='pt', padding=True, truncation=True)
            output = model(**encoded_input)
            embed = output.last_hidden_state[:, self.layer].detach().numpy()
            res.append(embed)
        return np.concatenate(res)
In your case replace your column transformer by this:
column_trans = ColumnTransformer([
    ('embedding', FlairTransformerEmbedding(), 'text'),
    ('number_scaler', MinMaxScaler(), ['number'])
])
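If you prefer the plain transformers version, the TransformerEmbedding class above can be dropped in the same way (a sketch, assuming the same DataFrame columns as in your example):
column_trans = ColumnTransformer([
    ('embedding', TransformerEmbedding(batch_size=2), 'text'),
    ('number_scaler', MinMaxScaler(), ['number'])
])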

Why am I getting 'last step of pipeline' error when using make_pipeline in scikit-learn?

So I am trying to use make_pipeline in scikit-learn to clean my data (replace missing values, then clean for outliers, and apply an encoding function to the categorical variables), and then finally add a Random Forest Regressor through RandomForestRegressor. The input is a DataFrame. Eventually I'd like to put this through GridSearchCV to search over optimal hyperparameters for the regressor.
In order to do this I built some custom classes which inherit from the TransformerMixin class, as advised here. Here is what I have so far:
from sklearn.pipeline import make_pipeline
from sklearn.base import TransformerMixin
import pandas as pd
class Cleaning(TransformerMixin):
    def __init__(self, column_labels):
        self.column_labels = column_labels

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """Given a dataframe X with predictors, clean it."""
        X_imputed, medians_X = median_imputer(X)  # impute all missing numeric data with median
        quantiles_X = get_quantiles(X_imputed, self.column_labels)
        X_nooutliers, _ = replace_outliers(X_imputed, self.column_labels, medians_X, quantiles_X)
        return X_nooutliers


class Encoding(TransformerMixin):
    def __init__(self, encoder_list):
        self.encoder_list = encoder_list

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """Takes in dataframe X and applies encoding transformation to them"""
        return encode_data(self.encoder_list, X)
However, when I run the following line of code I am getting an error:
import category_encoders as ce
from sklearn.ensemble import RandomForestRegressor
pipeline_cleaning = Cleaning(column_labels = train_labels)
OneHot_binary = ce.OneHotEncoder(cols = ['new_store'])
OneHot = ce.OneHotEncoder(cols= ['transport_availability'])
Count = ce.CountEncoder(cols = ['county'])
pipeline_encoding = Encoding([OneHot_binary, OneHot, Count])
baseline = RandomForestRegressor(n_estimators=500, random_state=12)
make_pipeline([pipeline_cleaning, pipeline_encoding,baseline])
The error says Last step of Pipeline should implement fit or be the string 'passthrough'. I don't understand why.
EDIT: slight typo in the last line, now corrected. The third element in the list passed to make_pipeline is the regressor.
Change the line:
make_pipeline([pipeline_cleaning, pipeline_encoding,baseline])
to (without list):
make_pipeline(pipeline_cleaning, pipeline_encoding,baseline)
Pipeline(steps=[('cleaning', <__main__.Cleaning object at 0x7f617260c1d0>),
('encoding', <__main__.Encoding object at 0x7f617260c278>),
('randomforestregressor',
RandomForestRegressor(n_estimators=500, random_state=12))])
and you're good to go.
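For context, passing a list makes make_pipeline treat the whole list as a single step, and a Python list has no fit method, which is exactly what the error complains about. Here is a minimal sketch of the two equivalent ways of building the same pipeline, reusing the objects from the question (make_pipeline generates the step names for you):
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestRegressor

# make_pipeline: pass the steps as separate positional arguments, names are auto-generated
pipe_a = make_pipeline(Cleaning(column_labels=train_labels),
                       Encoding([OneHot_binary, OneHot, Count]),
                       RandomForestRegressor(n_estimators=500, random_state=12))

# Pipeline: pass a single list of (name, estimator) tuples
pipe_b = Pipeline([('cleaning', Cleaning(column_labels=train_labels)),
                   ('encoding', Encoding([OneHot_binary, OneHot, Count])),
                   ('randomforestregressor', RandomForestRegressor(n_estimators=500, random_state=12))])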

Sklearn GridSearch with pre-training

I'm making a Sklearn Pipeline with parameter optimization done by GridSearchCV. The pipeline has to get the best model for several different entities, implementing a pre-train and then fine-tune approach: pre-train all the entities together, then fine-tune every single entity and return a model for each one. These are the constraints of the pipeline:
Pre-training and fine-tuning have to be in the same pipeline because both models have to see the same data in each GridSearchCV fold.
The pre-train model has to pass its weights to the fine-tuning model.
I have implemented:
A Sklearn Transformer that takes a data-frame with all the entities as input and fits itself.
A Sklearn Regressor that splits the data-frame into one data-frame per entity and fits a Keras model for each entity.
What I'm missing is how to pass the weights obtained by the pre-train transformer to the fine-tuning transformer (considering that each GridSearchCV fold has different weights).
Here is the code:
import pandas as pd
import numpy as np
import random
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.metrics import mean_squared_error
from keras.models import Model
from keras.layers import Dense, Input
import copy
class MyRegressor(BaseEstimator, TransformerMixin):
    def __init__(self, neurons, featInput, featOutput):
        self.neurons = neurons
        self.preTrain = None
        self.featInput = featInput
        self.featOutput = featOutput

    def fit(self, X, y=None):
        X_train = X[self.featInput]
        y_train = X[self.featOutput]
        inputLayer = Input(shape=(len(self.featInput), ), name='INPUT')
        hidden = Dense(self.neurons, name='HIDDEN')(inputLayer)
        outputLayer = Dense(len(self.featOutput), name='OUTPUT')(hidden)
        self.model = Model(inputLayer, outputLayer)
        self.model.compile(loss='mse', optimizer='rmsprop')
        if self.preTrain is not None:
            self.model.load_weights(self.preTrain)
        self.model.fit(X_train, y_train)
        return self

    def predict(self, X):
        return self.model.predict(X[self.featInput])

    def transform(self, X):
        return X

    def score(self, X, y=None, sample_weight=None):
        y_true = X[self.featOutput]
        y_pred = self.predict(X)
        return mean_squared_error(y_true, y_pred)
class LoopTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, component):
        self.columns = columns
        self.component = component
        self.components = []

    def fit(self, X, y=None):
        for index, idx in X[self.columns].drop_duplicates().iterrows():
            entityDf = X[(X[self.columns] == idx).sum(axis=1) == len(self.columns)].copy()
            self.components.append({'id': idx, 'component': copy.deepcopy(self.component)})
            self.components[-1]['component'].fit(entityDf, y)
        return self

    def predict(self, X):
        results = []
        for comp in self.components:
            entityDf = X[(X[self.columns] == comp['id']).sum(axis=1) == len(self.columns)].copy()
            res = comp['component'].predict(entityDf)
            results.append(res)
        dfRes = pd.concat(results)
        return dfRes

    def score(self, X, y=None, sample_weight=None):
        results = []
        for comp in self.components:
            entityDf = X[(X[self.columns] == comp['id']).sum(axis=1) == len(self.columns)].copy()
            if len(entityDf) > 0:
                results.append(comp['component'].score(entityDf))
        return np.average(results)
#create the input dataframe: 3 entities
dataFrame = pd.DataFrame([], columns=['entityId', 'input', 'output'])
for entity in range(3):
    x = np.arange(random.randint(10, 20))
    y = x * (entity + 1)
    tempDf = pd.DataFrame(np.array([x, y]).T, columns=['input', 'output'])
    tempDf['entityId'] = entity
    dataFrame = pd.concat([dataFrame, tempDf], sort=False)
dataFrame = dataFrame.reset_index(drop=True)
#create the pipeline
neurons = [5, 10]
myPipe = Pipeline([('preTrain',
                    MyRegressor(neurons=neurons[0], featInput=['input'], featOutput=['output'])),
                   ('fineTuning',
                    LoopTransformer(['entityId'],
                                    MyRegressor(
                                        neurons=neurons[0],
                                        featInput=['input'],
                                        featOutput=['output'])))])

#pre-train and fine-tuning has to have always the same number of neurons
params = [{
    'preTrain__neurons': [neurons[0]],
    'fineTuning__component__neurons': [neurons[0]]
}, {
    'preTrain__neurons': [neurons[1]],
    'fineTuning__component__neurons': [neurons[1]]
}]
gs = GridSearchCV(myPipe, params, verbose=1, cv=3)
gs.fit(dataFrame, dataFrame)
score = gs.score(dataFrame, dataFrame)
print(score)
I'm pretty sure the sklearn Pipeline as-is doesn't support this. But as long as you don't clone your pipeline (which happens, for instance, if you use a GridSearchCV), you can hack your way through with code like the following, which gives the instance of one step in the pipeline to the next step. You can apply the same principle in your pipeline:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.base import BaseEstimator, TransformerMixin
class MyTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, scaler):
        self.scaler = scaler

    def fit(self, X, y=None):
        print("got the means: %s" % self.scaler.mean_)
        return self

    def transform(self, X):
        return X
X, y = load_iris(return_X_y=True)
scaler = StandardScaler()
pipeline = make_pipeline(scaler,
                         MyTransformer(scaler),
                         LogisticRegression(solver='lbfgs',
                                            multi_class='auto'))
pipeline = pipeline.fit(X, y)
X = X - 1
pipeline = pipeline.fit(X, y)
Which would give you this output, as expected:
got the means: [5.84333333 3.05733333 3.758 1.19933333]
got the means: [4.84333333 2.05733333 2.758 0.19933333]
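As a word of caution, here is a minimal sketch of why this hack breaks under cloning: clone() rebuilds every step from its constructor parameters, so after cloning the scaler held by MyTransformer is no longer the same object as the pipeline's first step (and is never fitted by the pipeline). This is exactly what happens inside GridSearchCV.
from sklearn.base import clone

# in the original pipeline both references point to the very same StandardScaler
print(pipeline.named_steps['standardscaler'] is pipeline.named_steps['mytransformer'].scaler)  # True

# after cloning, the shared reference is broken
cloned = clone(pipeline)
print(cloned.named_steps['standardscaler'] is cloned.named_steps['mytransformer'].scaler)      # False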

Using statsmodel estimations with scikit-learn cross validation, is it possible?

I posted this question to the Cross Validated forum and later realized that it might find a more appropriate audience on Stack Overflow instead.
I am looking for a way to use the fit object (result) obtained from Python statsmodels to feed into cross_val_score of scikit-learn's cross-validation method.
The attached link suggests that it may be possible, but I have not succeeded.
I am getting the following error:
estimator should a be an estimator implementing 'fit' method
statsmodels.discrete.discrete_model.BinaryResultsWrapper object at
0x7fa6e801c590 was passed
Refer to this link.
Indeed, you cannot use cross_val_score directly on statsmodels objects, because of a different interface: in statsmodels
training data is passed directly into the constructor
a separate object contains the result of model estimation
However, you can write a simple wrapper to make statsmodels objects look like sklearn estimators:
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin
class SMWrapper(BaseEstimator, RegressorMixin):
    """ A universal sklearn-style wrapper for statsmodels regressors """
    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        if self.fit_intercept:
            X = sm.add_constant(X)
        self.model_ = self.model_class(y, X)
        self.results_ = self.model_.fit()
        return self

    def predict(self, X):
        if self.fit_intercept:
            X = sm.add_constant(X)
        return self.results_.predict(X)
This class contains correct fit and predict methods, and can be used with sklearn, e.g. cross-validated or included into a pipeline. Like here:
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
X, y = make_regression(random_state=1, n_samples=300, noise=100)
print(cross_val_score(SMWrapper(sm.OLS), X, y, scoring='r2'))
print(cross_val_score(LinearRegression(), X, y, scoring='r2'))
You can see that the output of two models is identical, because they are both OLS models, cross-validated in the same way.
[0.28592315 0.37367557 0.47972639]
[0.28592315 0.37367557 0.47972639]
Following the suggestion of David (which gave me an error, complaining about a missing function get_parameters) and the scikit-learn documentation, I created the following wrapper for a linear regression.
It has the same interface as sklearn.linear_model.LinearRegression, but in addition it also has a summary() method, which gives information about p-values, R2 and other statistics, as in statsmodels.OLS.
import statsmodels.api as sm
from sklearn.base import BaseEstimator, RegressorMixin
import pandas as pd
import numpy as np
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.estimator_checks import check_estimator
class MyLinearRegression(BaseEstimator, RegressorMixin):
    def __init__(self, fit_intercept=True):
        self.fit_intercept = fit_intercept

    def fit(self, X, y, column_names=()):
        """
        Parameters
        ------------
        column_names: list
            Optional, so that this class knows what feature name to
            associate to each column of X. This is useful if you use
            the method summary(), so that it can show the feature name
            for each coefficient.
        """
        if self.fit_intercept:
            X = sm.add_constant(X)

        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        self.X_ = X
        self.y_ = y

        if len(column_names) != 0:
            cols = list(column_names)
            X = pd.DataFrame(X)
            cols = column_names.copy()
            cols.insert(0, 'intercept')
            print('X ', X)
            X.columns = cols

        self.model_ = sm.OLS(y, X)
        self.results_ = self.model_.fit()
        return self

    def predict(self, X):
        # Check if fit has been called
        check_is_fitted(self, 'model_')
        # Input validation
        X = check_array(X)
        if self.fit_intercept:
            X = sm.add_constant(X)
        return self.results_.predict(X)

    def get_params(self, deep=False):
        return {'fit_intercept': self.fit_intercept}

    def summary(self):
        print(self.results_.summary())
Example of use:
cols = ['feature1','feature2']
X_train = df_train[cols].values
X_test = df_test[cols].values
y_train = df_train['label']
y_test = df_test['label']
model = MyLinearRegression()
model.fit(X_train, y_train)
model.summary()
model.predict(X_test)
If you want to show the names of the columns, you can call
model.fit(X_train, y_train, column_names=cols)
To use it in cross-validation:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(MyLinearRegression(), X_train, y_train, cv=10, scoring='neg_mean_squared_error')
scores
For reference purposes, if you use the statsmodels formula API and/or the fit_regularized method, you can modify @David Dale's wrapper class in this way.
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from statsmodels.formula.api import glm as glm_sm
# This is an example wrapper for statsmodels GLM
class SMWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, family, formula, alpha, L1_wt):
        self.family = family
        self.formula = formula
        self.alpha = alpha
        self.L1_wt = L1_wt
        self.model = None
        self.result = None

    def fit(self, X, y):
        data = pd.concat([pd.DataFrame(X), pd.Series(y)], axis=1)
        data.columns = X.columns.tolist() + ['y']
        self.model = glm_sm(self.formula, data, family=self.family)
        self.result = self.model.fit_regularized(alpha=self.alpha, L1_wt=self.L1_wt, refit=True)
        return self.result

    def predict(self, X):
        return self.result.predict(X)
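A minimal usage sketch might look like the following (the data and the column names x1/x2 are made up for illustration; the formula refers to the renamed columns that fit() builds internally):
import numpy as np
import statsmodels.api as sm

# hypothetical toy data, only to show how the wrapper is called
rng = np.random.default_rng(0)
X = pd.DataFrame({'x1': rng.normal(size=100), 'x2': rng.normal(size=100)})
y = 2 * X['x1'] - X['x2'] + rng.normal(size=100)

wrapper = SMWrapper(family=sm.families.Gaussian(),
                    formula='y ~ x1 + x2',
                    alpha=0.1, L1_wt=0.5)
wrapper.fit(X, y)
print(wrapper.predict(X)[:5])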
Though this is not technically scikit-learn, there is the package pmdarima (link to the pmdarima package on PyPI) that wraps statsmodels and provides a scikit-learn-like interface.
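A rough sketch of that interface (the series here is made up, purely for illustration):
import numpy as np
import pmdarima as pm

# toy univariate series
y = np.sin(np.arange(100) / 5.0) + np.random.normal(scale=0.1, size=100)

model = pm.auto_arima(y, seasonal=False, suppress_warnings=True)
model.fit(y)                             # sklearn-like fit (auto_arima has already fitted it)
forecast = model.predict(n_periods=10)   # sklearn-like predict
print(forecast)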
