Get test set feature vectors from Pipeline in scikit-learn - python

I want to debug my ML model and I want to see which features are used/activated in each observation in my test set. So I need to get the transformed matrix with the features after the vectorization and feature selection steps in the pipeline.
First of all here is the pipeline:
pipeline = Pipeline([
    ('fu', FeatureUnion(
        transformer_list=[
            ('val', Pipeline([
                ('ext', FeatureExtractor(feat_type="valence")),
                ('vect', DictVectorizer()),
            ])),
            ('bot', Pipeline([
                ('ext', FeatureExtractor(feat_type="bot", term="word", pos=False, negation=True)),
                ('vec', CountVectorizer(min_df=3, max_df=0.9, lowercase=False)),
                ("fs", SelectKBest(chi2, k=8000)),
                ('bin', Binarizer()),
                ('trans', TfidfTransformer(sublinear_tf=True, smooth_idf=True, use_idf=True)),
            ]))
        ],
    )),
    ('stats', FeatureStats()),
    ("fs", MaxAbsScaler()),
    ('classifier', svm.LinearSVC(C=.5)),
])
So I read the docs and it seems that I can use transform to get the transformed matrix (which is now deprecated, but anyway...). And I tried doing this:
pipeline.fit(X_train, y_train)
transformed = pipeline.transform(X_test)
y_predicted = pipeline.predict(X_test)
But I have a problem.
As you can see, I select the 8000 most discriminating features, so the matrix that goes in and out of the classifier must be N x 8000. But the problem is that transformed is M x 3012 (N = training examples, M = test examples).
In order to verify that the pipeline is working correctly I put FeatureStats, a transformer that simply prints the length of the feature vectors, right before the classifier, and it prints 8000.
class FeatureStats(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def transform(self, X, y=None):
        print("size = ", X.shape[1])
        return X

    def fit(self, X, y=None):
        return self
Why does pipeline.transform(X_test) return a smaller matrix? Am I missing something?
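As a side note, here is a sketch (not part of the original question) of one way to inspect the matrix that actually reaches the classifier: apply only the fitted transformer steps and leave the final estimator out. It assumes scikit-learn >= 0.21, where a Pipeline can be sliced; the step names match the pipeline above.
# Hedged sketch: get the features the classifier actually sees.
pipeline.fit(X_train, y_train)
transformers_only = pipeline[:-1]  # every step except the final LinearSVC, reusing the fitted steps
X_test_transformed = transformers_only.transform(X_test)
print(X_test_transformed.shape)  # expected to be (n_test_samples, 8000)
# On older scikit-learn, Pipeline(pipeline.steps[:-1]).transform(X_test) should behave the same,
# since the slice reuses the already-fitted step objects.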

Related

How to add a feature using a pipeline and FeatureUnion

In the code below I use a Twitter dataset to perform sentiment analysis. I use a pipeline which performs the following steps:
1) performs some basic text preprocessing
2) vectorizes the tweet text
3) adds an extra feature (text length)
4) performs classification
I would like to add one more feature: the scaled number of followers. I wrote a function that takes the whole dataframe (df) as input and returns a new dataframe with the scaled number of followers. However, I am finding it challenging to add this step to the pipeline, i.e. to combine this feature with the other features using the sklearn pipeline.
Any help or advice on this problem will be much appreciated.
The question and code below are inspired by Ryan's post: pipelines
import nltk
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

def import_data(filename, sep, eng, header=None, skiprows=1):
    # read csv
    dataset = pd.read_csv(filename, sep=sep, engine=eng, header=header, skiprows=skiprows)
    # rename columns
    dataset.columns = ['text', 'followers', 'sentiment']
    return dataset

df = import_data('apple_v3.txt', '\t', 'python')
X, y = df.text, df.sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y)
tokenizer = nltk.casual.TweetTokenizer(preserve_case=False, reduce_len=True)
count_vect = CountVectorizer(tokenizer=tokenizer.tokenize)
classifier = LogisticRegression()
def get_scalled_followers(df):
    scaler = MinMaxScaler()
    df[['followers']] = df[['followers']].astype(float)
    df[['followers']] = scaler.fit_transform(df[['followers']])
    followers = df['followers'].values
    followers_reshaped = followers.reshape((len(followers), 1))
    return df

def get_tweet_length(text):
    return len(text)
import numpy as np
from sklearn.preprocessing import FunctionTransformer

def genericize_mentions(text):
    return re.sub(r'@[\w_-]+', 'thisisanatmention', text)

def reshape_a_feature_column(series):
    return np.reshape(np.asarray(series), (len(series), 1))

def pipelinize_feature(function, active=True):
    def list_comprehend_a_function(list_or_series, active=True):
        if active:
            processed = [function(i) for i in list_or_series]
            processed = reshape_a_feature_column(processed)
            return processed
        else:
            return reshape_a_feature_column(np.zeros(len(list_or_series)))
    # wrap the helper so it can be used as a pipeline step
    return FunctionTransformer(list_comprehend_a_function, validate=False)
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn_helpers import pipelinize, genericize_mentions, train_test_and_evaluate

sentiment_pipeline = Pipeline([
    ('genericize_mentions', pipelinize(genericize_mentions, active=True)),
    ('features', FeatureUnion([
        ('vectorizer', count_vect),
        ('post_length', pipelinize_feature(get_tweet_length, active=True))
    ])),
    ('classifier', classifier)
])

sentiment_pipeline, confusion_matrix = train_test_and_evaluate(sentiment_pipeline, X_train, y_train, X_test, y_test)
The best explanation I have found so far is at the following post: pipelines
My data includes heterogeneous features and the following step-by-step approach works well and is easy to understand:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

# step 1 - select data from the dataframe and split the dataset into train and test sets
features = [c for c in df.columns.values if c not in ['sentiment']]
numeric_features = [c for c in df.columns.values if c not in ['text', 'sentiment']]
target = 'sentiment'
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.33, random_state=42)
# step 2 - create a number selector class and a text selector class.
# These classes allow selecting specific columns from the dataframe
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
# step 3 - create one pipeline for the text data and one for the numerical data
text = Pipeline([
    ('selector', TextSelector(key='text')),
    ('tfidf', TfidfVectorizer(stop_words='english'))
])
text.fit_transform(X_train)
followers = Pipeline([
    ('selector', NumberSelector(key='followers')),
    ('standard', MinMaxScaler())
])
followers.fit_transform(X_train)

# step 4 - feature union
feats = FeatureUnion([('text', text),
                      ('length', followers)])
feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)
# step 5 - add the classifier and predict
pipeline = Pipeline([
    ('features', feats),
    ('classifier', SVC(kernel='linear', probability=True, C=1, class_weight='balanced'))
])
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
np.mean(preds == y_test)
# step 6 - use the model to predict new data not included in the test set
# in my example the pipeline expects a dataframe as input with a column called 'text' and a column called 'followers'
array = [["#apple is amazing", 25000]]
dfObj = pd.DataFrame(array, columns=['text', 'followers'])

# prints the expected class, e.g. positive or negative sentiment
print(pipeline.predict(dfObj))

# print the probability for each class
print(pipeline.predict_proba(dfObj))
You can use FeatureUnion to combine the features extracted from the different columns of your dataframe. You should feed the dataframe to the pipeline and use FunctionTransformer to extract specific columns. It might look like this (I haven't run it, so some errors are possible):
sentiment_pipeline = Pipeline([
    # a step name is required by Pipeline, so one is added here
    ('all_features', FeatureUnion([
        # your added feature (maybe you'll need to reshape it so ndim == 2)
        ('scaled_followers', FunctionTransformer(lambda df: get_scalled_followers(df).values,
                                                 validate=False)),
        # previous features
        ('text_features', Pipeline([
            ('extractor', FunctionTransformer(lambda df: df.text.values, validate=False)),
            ('genericize_mentions', pipelinize(genericize_mentions, active=True)),
            ('features', FeatureUnion([
                ('vectorizer', count_vect),
                ('post_length', pipelinize_feature(get_tweet_length, active=True))
            ])),
        ]))
    ])),
    ('classifier', classifier)
])

sentiment_pipeline, confusion_matrix = train_test_and_evaluate(sentiment_pipeline, df_train, y_train, df_test, y_test)
Another solution could be not to use a Pipeline at all and just stack the features together with np.hstack.
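A minimal sketch of that stacking alternative (not from the original answer; it reuses df, y, classifier and the get_scalled_followers helper from the question): vectorize the text separately, then stack the extra numeric column onto the feature matrix.
import numpy as np
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer

# Hedged sketch: build the text features and the scaled-followers column separately,
# then stack them into one matrix instead of using a Pipeline/FeatureUnion.
count_vect = CountVectorizer()
X_text = count_vect.fit_transform(df['text'])               # sparse (n_samples, n_terms)
followers = get_scalled_followers(df)['followers'].values   # helper from the question
followers_col = followers.reshape(-1, 1)

# For sparse text features, scipy.sparse.hstack keeps things sparse;
# np.hstack (as mentioned above) works if X_text is densified first.
X_all = sparse.hstack([X_text, sparse.csr_matrix(followers_col)]).tocsr()

classifier.fit(X_all, y)  # classifier and y as defined in the question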

How to implement a custom type selector in Scikit learn Pipeline

I have this Pipeline:
transformer = Pipeline([
    ('features', FeatureUnion(transformer_list=[
        ('numericals', Pipeline([
            ('selector', TypeSelector(np.number)),
            ('scaler', StandardScaler()),
        ])),
        ('categoricals', Pipeline([
            ('selector', TypeSelector('category')),
            ('labeler', StringIndexer()),
            ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ]))
    ])),
    ('feature_selection', SelectFromModel(LinearSVC())),
    ('classifier', SVC(decision_function_shape='ovo'))
])
Here is the implementation of TypeSelector (and of StringIndexer):
class TypeSelector(BaseEstimator, TransformerMixin):
    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X.select_dtypes(include=[self.dtype])

class StringIndexer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X.apply(lambda s: s.cat.codes.replace(
            {-1: len(s.cat.categories)}
        ))
Now training and prediction work fine. But I want feature names. I read a lot on GitHub but nothing works. I tried something like this:
transformer.named_steps['features'].get_feature_names()
But I still get this AttributeError: Transformer numericals (type Pipeline) does not provide get_feature_names. How can I implement that custom type selector?
This is complicated, as sklearn doesn't provide get_feature_names for Pipeline. Scalers and your custom transformers don't provide feature names either. There are several tickets open for fixing this (see e.g. https://github.com/scikit-learn/scikit-learn/issues/6424, https://github.com/scikit-learn/scikit-learn/issues/6425).
There are two possible workarounds:
1) Build the feature names manually. You need to consider the SelectFromModel indices and the feature names from the Pipeline before it (see the sketch after the eli5 example below).
2) Use a library. We created https://github.com/TeamHG-Memex/eli5 for such purposes; it supports getting feature names from Pipeline, FeatureUnion and many built-in transformers, and you can extend it to support your own or missing sklearn transformers. See eli5.transform_feature_names and https://eli5.readthedocs.io/en/latest/libraries/sklearn.html#transformation-pipelines.
It won't work out of the box: you'd have to at least register transformation functions for your custom transformers - or implement .get_feature_names methods for your transformers. Usage should be something like this (sorry, it is more like pseudo-code, I haven't really checked it):
from eli5 import transform_feature_names

@transform_feature_names.register(StringIndexer)
def indexer_feature_names(transformer, in_names=None):
    assert in_names is not None  # don't handle it for now
    return ["StringIndexer(%s)" % name for name in in_names]
    # or just pass input feature names as-is:
    # return in_names

# .. something similar for TypeSelector, and for OneHotEncoder as well

feature_names = transform_feature_names(transformer, list(df.columns))
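For the first workaround (building the names manually), a rough sketch might look like the following. This is not from the original answer: it assumes the transformer pipeline above has been fitted on df, that the numeric and categorical columns can be read from the DataFrame dtypes the same way TypeSelector does, and that OneHotEncoder still exposes get_feature_names (older sklearn); adjust for your version.
import numpy as np

# Hedged sketch: reconstruct feature names by hand for the fitted `transformer` above.
fu = transformer.named_steps['features']

# numeric branch: names are simply the numeric columns TypeSelector keeps
numeric_names = list(df.select_dtypes(include=[np.number]).columns)

# categorical branch: expand each category column into its one-hot names
cat_cols = list(df.select_dtypes(include=['category']).columns)
ohe = fu.transformer_list[1][1].named_steps['encoder']
categorical_names = list(ohe.get_feature_names(cat_cols))  # get_feature_names_out(cat_cols) on newer sklearn

all_names = np.array(numeric_names + categorical_names)

# finally, keep only the features that survived SelectFromModel
mask = transformer.named_steps['feature_selection'].get_support()
selected_names = all_names[mask]
print(selected_names)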

Multiple classification models in a scikit pipeline python

I am solving a binary classification problem over some text documents using Python and the scikit-learn library, and I wish to try different models to compare and contrast results - mainly a Naive Bayes classifier and an SVM with k-fold CV (CV=5). I am finding it difficult to combine all of the methods into one pipeline, given that the latter models use GridSearchCV(). I cannot have multiple pipelines running during a single implementation due to concurrency issues, hence I need to implement all the different models using one pipeline.
This is what I have so far:
# pipeline for Naive Bayes
naive_bayes_pipeline = Pipeline([
    ('bow_transformer', CountVectorizer(analyzer=split_into_lemmas, stop_words='english')),
    ('tf_idf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

# accessing and using the pipelines
naive_bayes = naive_bayes_pipeline.fit(train_data['data'], train_data['gender'])

# pipeline for SVM
svm_pipeline = Pipeline([
    ('bow_transformer', CountVectorizer(analyzer=split_into_lemmas, stop_words='english')),
    ('tf_idf', TfidfTransformer()),
    ('classifier', SVC())
])

param_svm = [
    {'classifier__C': [1, 10], 'classifier__kernel': ['linear']},
    {'classifier__C': [1, 10], 'classifier__gamma': [0.001, 0.0001], 'classifier__kernel': ['rbf']},
]

grid_svm_skf = GridSearchCV(
    svm_pipeline,             # pipeline from above
    param_grid=param_svm,     # parameters to tune via cross-validation
    refit=True,               # fit using all data, on the best detected classifier
    n_jobs=-1,                # number of cores to use for parallelization; -1 uses all cores
    scoring='accuracy',
    cv=StratifiedKFold(train_data['gender'], n_folds=5),  # StratifiedKFold CV with 5 folds
)

svm_skf = grid_svm_skf.fit(train_data['data'], train_data['gender'])
predictions_svm_skf = svm_skf.predict(test_data['data'])
EDIT 1:
The second pipeline is the only one using GridSearchCV(), and it never seems to be executed.
EDIT 2:
Added more code to show the GridSearchCV() use.
Consider checking out similar questions here:
Compare multiple algorithms with sklearn pipeline
Pipeline: Multiple classifiers?
To summarize, here is an easy way to optimize over any classifier and, for each classifier, over any settings of its parameters.
Create a switcher class that works for any estimator
from sklearn.base import BaseEstimator

class ClfSwitcher(BaseEstimator):
    def __init__(self, estimator=SGDClassifier()):
        """
        A custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - the classifier
        """
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)
Now you can pass in anything for the estimator parameter. And you can optimize any parameter for any estimator you pass in as follows:
Perform hyper-parameter optimization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', ClfSwitcher()),
])

parameters = [
    {
        'clf__estimator': [SGDClassifier()],  # SVM if hinge loss / logreg if log loss
        'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        'tfidf__stop_words': ['english', None],
        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
        'clf__estimator__max_iter': [50, 80],
        'clf__estimator__tol': [1e-4],
        'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
    },
    {
        'clf__estimator': [MultinomialNB()],
        'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
        'tfidf__stop_words': [None],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    },
]

gscv = GridSearchCV(pipeline, parameters, cv=5, n_jobs=12, return_train_score=False, verbose=3)
gscv.fit(train_data, train_labels)
How to interpret clf__estimator__loss
clf__estimator__loss is interpreted as the loss parameter of whatever estimator is set as clf__estimator (estimator = SGDClassifier() in the topmost example). estimator is itself a parameter of clf, which is a ClfSwitcher object.
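As a small illustration of that double-underscore routing (a sketch, not part of the original answer): the same nested name can be set by hand, and the winning combination can be read back from the grid search afterwards.
# Hedged sketch of the clf__estimator__... routing described above.
# Pipeline routes 'clf__*' to the ClfSwitcher step, and ClfSwitcher
# routes 'estimator__*' on to the wrapped SGDClassifier.
pipeline.set_params(clf__estimator=SGDClassifier(), clf__estimator__loss='hinge')

# After gscv.fit(...), the best combination (including the chosen estimator) is here:
print(gscv.best_params_)
print(gscv.best_estimator_.named_steps['clf'].estimator)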

Do you need to scale Vectorizers in sklearn?

I have a set of custom features and a set of features created with Vectorizers, in this case TfidfVectorizer.
All of my custom features are simple np.arrays (e.g. [0, 5, 4, 22, 1]). I am using StandardScaler to scale all of my features, as you can see in my Pipeline by calling StandardScaler after my "custom pipeline". The question is whether there is a way, or a need, to scale the vectorizers I use in my "vectorized_pipeline". Applying StandardScaler on the vectorizers doesn't seem to work (I get the following error: "ValueError: Cannot center sparse matrices").
Another question: is it sensible to scale all of my features after I have joined them in the FeatureUnion, or should I scale each of them separately (in my example, by calling the scaler in "pos_cluster" and "stylistic_features" separately instead of calling it after both of them have been joined)? What is the better practice?
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn import feature_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

X = ['I am a sentence', 'an example']
Y = [1, 2]
X_dev = ['another sentence']

inner_scaler = StandardScaler()

# classifier
LinearSVC1 = LinearSVC(tol=1e-4, C=0.10000000000000001)

# vectorizers
countVecWord = TfidfVectorizer(ngram_range=(1, 3), max_features=2000, analyzer=u'word', sublinear_tf=True, use_idf=True, min_df=2, max_df=0.85, lowercase=True)
countVecWord_tags = TfidfVectorizer(ngram_range=(1, 4), max_features=1000, analyzer=u'word', min_df=2, max_df=0.85, sublinear_tf=True, use_idf=True, lowercase=False)

pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('vectorized_pipeline', Pipeline([
                ('union_vectorizer', FeatureUnion([
                    ('stem_text', Pipeline([
                        ('selector', ItemSelector(key='stem_text')),
                        ('stem_tfidf', countVecWord)
                    ])),
                    ('pos_text', Pipeline([
                        ('selector', ItemSelector(key='pos_text')),
                        ('pos_tfidf', countVecWord_tags)
                    ])),
                ])),
            ])),
            ('custom_pipeline', Pipeline([
                ('custom_features', FeatureUnion([
                    ('pos_cluster', Pipeline([
                        ('selector', ItemSelector(key='pos_text')),
                        ('pos_cluster_inner', pos_cluster)
                    ])),
                    ('stylistic_features', Pipeline([
                        ('selector', ItemSelector(key='raw_text')),
                        ('stylistic_features_inner', stylistic_features)
                    ]))
                ])),
                ('inner_scale', inner_scaler)
            ])),
        ],
        # weight components in FeatureUnion
        # n_jobs=6,
        transformer_weights={
            'vectorized_pipeline': 0.8,  # 0.8,
            'custom_pipeline': 1.0       # 1.0
        },
    )),
    ('clf', classifier),
])

pipeline.fit(X, Y)
y_pred = pipeline.predict(X_dev)
First things first:
Error "Cannot center sparse matrices"
The reason is quite simple - StandardScaler effectively applies a feature-wise transformation:
f_i = (f_i - mean(f_i)) / std(f_i)
which, for sparse matrices, will result in dense ones, as mean(f_i) will usually be non-zero. In practice only features equal to their means will end up being zero. scikit-learn does not want to do this, as it is a huge modification of your data that might cause failures in other parts of the code, huge memory usage, etc. How to deal with it? If you really want to do this, there are two options:
densify your matrix through .toarray(), which will require lots of memory, but will give you exactly what you expect
create StandardScaler without mean, i.e. StandardScaler(with_mean=False), which will instead apply f_i = f_i / std(f_i) but keep the sparse format of your data.
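A small sketch of both options on a toy tf-idf matrix (not from the original answer; it just illustrates the two calls described above):
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

X_sparse = TfidfVectorizer().fit_transform(['I am a sentence', 'an example'])

# option 1: densify, then center and scale (can be very memory-hungry)
X_dense_scaled = StandardScaler().fit_transform(X_sparse.toarray())

# option 2: skip centering so the data stays sparse
X_sparse_scaled = StandardScaler(with_mean=False).fit_transform(X_sparse)
print(type(X_sparse_scaled))  # still a sparse matrix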
Is scaling needed?
This is a whole other problem - usually, scaling (of any form) is just a heuristic. It is not something that you have to apply; there is no guarantee that it will help; it is just a reasonable thing to do when you have no idea what your data looks like. "Smart" vectorizers such as tf-idf are actually already doing that: the idf transformation is supposed to provide a kind of reasonable data scaling. There is no guarantee which one will be better, but in general tf-idf should be enough, especially given that it still supports sparse computations, while StandardScaler does not.

Using partial_fit with Scikit Pipeline

How do you call partial_fit() on a scikit-learn classifier wrapped inside a Pipeline()?
I'm trying to build an incrementally trainable text classifier using SGDClassifier like:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

classifier = Pipeline([
    ('vectorizer', HashingVectorizer(ngram_range=(1, 4), non_negative=True)),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(SGDClassifier())),
])
but I get an AttributeError trying to call classifier.partial_fit(x,y).
It supports fit(), so I don't see why partial_fit() isn't available. Would it be possible to introspect the pipeline, call the data transformers, and then directly call partial_fit() on my classifier?
Here is what I'm doing, where 'mapper' and 'clf' are the two steps in my Pipeline object.
def partial_pipe_fit(pipeline_obj, df):
    X = pipeline_obj.named_steps['mapper'].fit_transform(df)
    Y = df['class']
    pipeline_obj.named_steps['clf'].partial_fit(X, Y)
You probably want to keep track of performance as you keep adjusting/updating your classifier - but that is a secondary point
So, more specifically, the original pipeline(s) were constructed as follows:
to_vect = Pipeline([('vect', CountVectorizer(min_df=2, max_df=.9, ngram_range=(1, 1), max_features=100)),
                    ('tfidf', TfidfTransformer())])

full_mapper = DataFrameMapper([
    ('norm_text', to_vect),
    ('norm_fname', to_vect),
])

full_pipe = Pipeline([('mapper', full_mapper),
                      ('clf', SGDClassifier(n_iter=15, warm_start=True,
                                            n_jobs=-1, random_state=self.random_state))])
Google DataFrameMapper (from the sklearn-pandas package) to learn more about it - here it just enables a transformation step that plays nicely with pandas.
Pipeline does not use partial_fit, hence does not expose it. We would probably need a dedicated pipelining scheme for out-of-core computation but that also depends on the capabilities of the previous models.
In particular in this case you would probably want to do several passes over your data, one to fit each stage of the pipeline and then to transform the dataset to fit the next one, except for the first stage which is stateless, hence does not fit parameters from the data.
In the meantime it's probably easier to roll your own wrapper code tailored to your needs.
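A rough sketch of that roll-your-own approach for the pipeline in the question (not from the original answer): since HashingVectorizer is stateless, each batch can be transformed directly and only the classifier needs partial_fit. The batches() generator below is hypothetical, standing in for however you stream data from disk.
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# stateless vectorizer: no fitting needed, so it works batch by batch
vectorizer = HashingVectorizer(ngram_range=(1, 4))
clf = SGDClassifier()

all_classes = np.array([0, 1])  # must be known up front for partial_fit

def batches():
    # hypothetical generator yielding (texts, labels) chunks, e.g. read from disk
    yield ["good post", "bad post"], [1, 0]
    yield ["great comment", "awful comment"], [1, 0]

for texts, labels in batches():
    X_batch = vectorizer.transform(texts)
    clf.partial_fit(X_batch, labels, classes=all_classes)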
Even though this question is 8 years old, it is still very relevant and has not been updated for quite some time.
As a matter of fact, there is now a nice package created by Vincent Warmerdam called tokenwiser.
It is mostly used for NLP tasks and is designed to fit within the sklearn infrastructure. However, its main building blocks can be used even for non-NLP tasks.
The package has a PartialPipeline boilerplate and documentation.
Example here:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep
from tokenwiser.pipeline import PartialPipeline, PartialFeatureUnion

pipe = PartialPipeline([
    ("clean", Cleaner()),
    ("union", PartialFeatureUnion([
        ("full_text_pipe", PartialPipeline([
            ("identity", Identity()),
            ("hash1", HashingVectorizer()),
        ])),
        ("hyphen_pipe", PartialPipeline([
            ("hyphen", HyphenTextPrep()),
            ("hash2", HashingVectorizer()),
        ]))
    ])),
    ("clf", SGDClassifier())
])

X = [
    "i really like this post",
    "thanks for that comment",
    "i enjoy this friendly forum",
    "this is a bad post",
    "i dislike this article",
    "this is not well written"
]
y = np.array([1, 1, 1, 0, 0, 0])

for loop in range(3):
    pipe.partial_fit(X, y, classes=[0, 1])
I can imagine this template working even for non-NLP-related stuff. I hope someone will find this super useful.
I also propose my basic implementation of utilizing partial_fit within a sklearn pipeline.
We just need to use a model that allows for partial fitting (e.g. SGDRegressor, xgboost, etc.) and create our own sklearn-compatible classes.
(Huge kudos to Vincent Warmerdam, who started this in his tokenwiser project.)
import xgboost as xgb
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklego.preprocessing import PatsyTransformer

class xgboost_partial_trainer(BaseEstimator, TransformerMixin):
    """
    Allows for incremental training of an xgboost model within a sklearn pipeline.
    """

    def __init__(self, training_params: dict = None):
        self.training_params = training_params
        self.trained_model = None
        self._first_call = True
        self.evals_result = {}
        self.iter_number = 1
        self._X_train, self._X_test, self._y_train, self._y_test = (
            None,
            None,
            None,
            None,
        )

    def partial_fit(self, X, y=None, classes=None, **fit_params):
        print(f"first run: {self._first_call}, n_iter = {self.iter_number}")
        self.iter_number += 1

        if self._first_call:
            # Select a random subset of the data and store it within the model (for error loss over time)
            self._X_train, self._X_test, self._y_train, self._y_test = train_test_split(
                X, y, test_size=0.6, random_state=1
            )
            self._xg_train = xgb.DMatrix(self._X_train, label=self._y_train)
            self._xg_test = xgb.DMatrix(self._X_test, label=self._y_test)

            # validation set to watch performance - same testing data, changeable training data
            self.watchlist = [
                (self._xg_train, "train_batch"),
                (self._xg_test, "eval_fixed"),
            ]

            # the training part itself
            self.trained_model = xgb.train(
                params=self.training_params,
                dtrain=xgb.DMatrix(X, y),
                xgb_model=self.trained_model,
                evals=self.watchlist,
            )

            # switch off after the first batch
            self._first_call = False
        else:
            self._xg_train = xgb.DMatrix(X, y)
            self.watchlist = [
                (self._xg_train, "train_batch"),
                (self._xg_test, "eval_fixed"),
            ]
            self.trained_model = xgb.train(
                params=self.training_params,
                dtrain=self._xg_train,
                xgb_model=self.trained_model,
                evals=self.watchlist,
            )

        # self._predicted_y = self.trained_model.predict(xgb.DMatrix(self._X_test))
        # print(f"mean_squared_error = {mean_squared_error(self._y_test, self._predicted_y, squared = False)}")
        return self

    def predict(self, X, y=None, **fit_params):
        return self.trained_model.predict(xgb.DMatrix(X))

    def transform(self, X, y=None, **fit_params):
        return self.trained_model.predict(xgb.DMatrix(X))

    def fit(self, X, y=None, **fit_params):
        return self

class PartialPipeline(Pipeline):
    """
    Utility class to generate a `PartialPipeline`.

    Arguments:
        steps: a collection of text transformers
    """

    def partial_fit(self, X, y=None, classes=None, **kwargs):
        """
        Fits the components, but allows for batches.
        """
        # print(f"there are partial steps {self.steps_partial}")
        for _, step in self.steps:
            if hasattr(step, "partial_fit"):
                step.partial_fit(X, y, **kwargs)
            elif hasattr(step, "fit_transform"):
                X = step.fit_transform(X)
            elif hasattr(step, "transform"):
                X = step.transform(X)
            elif hasattr(step, "fit"):
                X = step.fit(X)
        return self
Once we have these sklearn classes we may utilize the Pipeline:
my_pipeline = PartialPipeline([
    ("patsy", PatsyTransformer(FORMULA2)),
    ("xgboost_model", xgboost_partial_trainer(training_params=params)),
])

df_chunked = pd.read_csv(your_data, chunksize=5_000)
for df in df_chunked:
    my_pipeline.partial_fit(df, y=df["speed"])
Please provide me with feedback and code-cleaning suggestions. I am fully aware that this is not perfect; however, as a prototype it is not too bad!
