I have a trained, customised fastText model (fastText is a word embedding algorithm developed by Facebook). I managed to get the expected result with a plain function, but now I want to rewrite it as a custom transformer so I can add it to my sklearn pipeline, which only accepts transformers.
The function takes a name (possibly containing multiple words) and returns its vector:
def name2vector(name=None):
    vec = [np.array(model.get_word_vector(w)) for w in name.lower().split(' ')]
    name_vec = np.sum(vec, axis=0)  # If "name" is multiple words, sum the vectors
    return name_vec
returned value:
array([-0.01087821, 0.01030535, -0.01402427, 0.0310982 , 0.08786983,
-0.00404521, -0.03286128, -0.00842709, 0.03934859, -0.02717219,
0.01151722, -0.03253938, -0.02435859, 0.03330994, -0.03696496], dtype=float32)
I want the transformer to do the same thing as the function.
I know from reading the tutorial that I can use BaseEstimator and TransformerMixin to rewrite it as a transformer, but I'm still stuck on this. Any suggestions would be great, thanks.
Assuming you're working with a pandas DataFrame, you could do something like this:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression


class FastTextTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, model):
        self.model = model

    def get_params(self, deep=True):
        return {'dimension': self.model.get_dimension()}

    def fit(self, X, y=None):
        # We assume the fastText model was already fit
        return self

    def transform(self, X):
        # X is a pandas Series of strings; return one row of vector components per entry
        X_copy = X.copy()
        X_copy = X_copy.apply(self.name2vector)
        return pd.DataFrame(X_copy.tolist())

    def name2vector(self, name):
        vec = [np.array(self.model.get_word_vector(w)) for w in name.lower().split(' ')]
        name_vec = np.sum(vec, axis=0)  # If "name" is multiple words, sum the vectors
        return name_vec
To demonstrate the usage, let's load a fastText model and a sample dataset of Amazon reviews:
import fasttext as ft
ft_model = ft.load_model('amazon_review_polarity.ftz')
amz_df = pd.read_html('https://huggingface.co/datasets/amazon_polarity/viewer/amazon_polarity/test')[0]
amz_df.rename(columns={'content (string)': 'content', 'label (class label)': 'label'}, inplace=True)
amz_df
And then use it in a bona fide scikit-learn Pipeline:
pipe = Pipeline([
    ('ft', FastTextTransformer(ft_model)),
    ('clf', LogisticRegression()),
])
And now we can fit and predict
pipe.fit(amz_df['content'], amz_df.label)
pipe.predict(pd.Series(['great', 'very cool', 'very disappointed']))
Which returns
array(['positive', 'positive', 'negative'], dtype=object)
N.B. In case you want to compute an average of the words in the sentence, instead of a sum, you can replace name2vector with the built-in method get_sentence_vector. For a supervised model, it'll return the average. For unsupervised ones (CBOW and skipgram), it first divides each vector by its L2 norm, and then averages.
See the discussion here.
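For instance, the transform method above could then be reduced to something like this minimal sketch (assuming the same FastTextTransformer class and that the wrapped model exposes fastText's get_sentence_vector):

    def transform(self, X):
        # use fastText's built-in sentence embedding (average instead of sum)
        return pd.DataFrame([self.model.get_sentence_vector(text) for text in X])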
Credit: Stefano Fiorucci - anakin87
The library compress-fasttext (a wrapper around Gensim that makes fastText models more lightweight) already includes such a transformer:
import compress_fasttext
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from compress_fasttext.feature_extraction import FastTextTransformer

small_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
    'https://github.com/avidale/compress-fasttext/releases/download/v0.0.4/cc.en.300.compressed.bin'
)
classifier = make_pipeline(
    FastTextTransformer(model=small_model),
    LogisticRegression()
).fit(
    ['banana', 'soup', 'burger', 'car', 'tree', 'city'],
    [1, 1, 1, 0, 0, 0]
)
classifier.predict(['jet', 'train', 'cake', 'apple'])
# array([0, 0, 1, 1])
Under the hood, it finds all "words" (alphanumeric sequences) in the text and averages their fastText embeddings.
Here is the source code.
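Roughly speaking, the transformation amounts to something like the sketch below (an illustrative approximation, not the library's actual code; it assumes the loaded small_model supports gensim-style model[word] lookup and exposes a vector_size attribute):

import re
import numpy as np

def embed_text(text, model):
    # find alphanumeric "words" and average their fastText embeddings
    words = re.findall(r'\w+', text.lower())
    vectors = [model[w] for w in words]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)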
I am learning about sklearn custom transformers and read about the two core ways to create custom transformers:
by setting up a custom class that inherits from BaseEstimator and TransformerMixin, or
by creating a transformation method and passing it to FunctionTransformer.
I wanted to compare these two approaches by implementing a "meta-vectorizer" functionality: a vectorizer that supports either CountVectorizer or TfidfVectorizer and transforms the input data according to the specified vectorizer type.
However, I can't seem to get either of them to work when passing them to a sklearn.pipeline.Pipeline. I am getting the following error message in the fit_transform() step:
ValueError: all the input array dimensions for the concatenation axis must match
exactly, but along dimension 0, the array at index 0 has size 6 and the array
at index 1 has size 1
My code for option 1 (using a custom class):
class Vectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vectorizer: Callable = CountVectorizer(), ngram_range: tuple = (1, 1)) -> None:
        super().__init__()
        self.vectorizer = vectorizer
        self.ngram_range = ngram_range

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_vect_ = self.vectorizer.fit_transform(X.copy())
        return X_vect_.toarray()


pipe = Pipeline([
    ('column_transformer', ColumnTransformer([
        ('lesson_type_category', OneHotEncoder(), ['Type']),
        ('comment_text_vectorizer', Vectorizer(), ['Text'])],
        remainder='drop')),
    ('model', LogisticRegression())])

param_dict = {'column_transformer__comment_text_vectorizer__vectorizer':
              [CountVectorizer(), TfidfVectorizer()]}

randsearch = GridSearchCV(pipe, param_dict, cv=2, scoring='f1').fit(X_train, y_train)
And my code for option 2 (creating a custom transformer from a function using FunctionTransformer):
def vectorize_text(X, vectorizer: Callable):
    X_vect_ = vectorizer.fit_transform(X)
    return X_vect_.toarray()


vectorizer_transformer = FunctionTransformer(vectorize_text, kw_args={'vectorizer': TfidfVectorizer()})

pipe = Pipeline([
    ('column_transformer', ColumnTransformer([
        ('lesson_type_category', OneHotEncoder(), ['Type']),
        ('comment_text_vectorizer', vectorizer_transformer, ['Text'])],
        remainder='drop')),
    ('model', LogisticRegression())])

param_dict = {'column_transformer__comment_text_vectorizer__kw_args':
              [{'vectorizer': CountVectorizer()}, {'vectorizer': TfidfVectorizer()}]}

randsearch = GridSearchCV(pipe, param_dict, cv=2, scoring='f1').fit(X_train, y_train)
Imports and sample data:
import pandas as pd
from typing import Callable

import sklearn
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

df = pd.DataFrame([
    ['A99', 'hi i love python very much', 'c', 1],
    ['B07', 'which programming language should i learn', 'b', 0],
    ['A12', 'what is the difference between python django flask', 'b', 1],
    ['A21', 'i want to be a programmer one day', 'c', 0],
    ['B11', 'should i learn java or python', 'b', 1],
    ['C01', 'how much can i earn as a programmer with python', 'a', 0]
], columns=['Src', 'Text', 'Type', 'Target'])
Notes:
As recommended in this question, I transformed all sparse matrices to dense arrays after the vectorization, as you can see in both cases: X_vect_.toarray().
The issue is that both CountVectorizer and TfidfVectorizer require their input to be 1D (and not 2D). In such cases, the ColumnTransformer documentation states that the columns parameter of the transformers tuple should be passed as a string rather than as a list.
columns: str, array-like of str, int, array-like of int, array-like of bool, slice or callable
Indexes the data on its second axis. Integers are interpreted as positional columns, while strings can reference DataFrame columns by name. A scalar string or int should be used where transformer expects X to be a 1d array-like (vector), otherwise a 2d array will be passed to the transformer. A callable is passed the input data X and can return any of the above. To select multiple columns by name or dtype, you can use make_column_selector.
Therefore, the following will work in your case (i.e. changing ['Text'] into 'Text').
class Vectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vectorizer: Callable = CountVectorizer(), ngram_range: tuple = (1, 1)) -> None:
        super().__init__()
        self.vectorizer = vectorizer
        self.ngram_range = ngram_range

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_vect_ = self.vectorizer.fit_transform(X.copy())
        return X_vect_.toarray()


pipe = Pipeline([
    ('column_transformer', ColumnTransformer([
        ('lesson_type_category', OneHotEncoder(handle_unknown='ignore'), ['Type']),
        ('comment_text_vectorizer', Vectorizer(), 'Text')], remainder='drop')),
    ('model', LogisticRegression())])

param_dict = {'column_transformer__comment_text_vectorizer__vectorizer':
              [CountVectorizer(), TfidfVectorizer()]}

randsearch = GridSearchCV(pipe, param_dict, cv=2, scoring='f1').fit(X_train, y_train)
You can adjust the FunctionTransformer example accordingly (see the sketch below). As a final remark, observe that I had to pass handle_unknown='ignore' to OneHotEncoder to avoid an error arising from categories seen during the test phase of your cross-validation but not during the training phase.
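For completeness, here is a minimal sketch of the adjusted FunctionTransformer variant from the question (the only substantive changes are passing 'Text' as a string and handle_unknown='ignore', as above):

def vectorize_text(X, vectorizer: Callable):
    X_vect_ = vectorizer.fit_transform(X)
    return X_vect_.toarray()


vectorizer_transformer = FunctionTransformer(vectorize_text, kw_args={'vectorizer': TfidfVectorizer()})

pipe = Pipeline([
    ('column_transformer', ColumnTransformer([
        ('lesson_type_category', OneHotEncoder(handle_unknown='ignore'), ['Type']),
        ('comment_text_vectorizer', vectorizer_transformer, 'Text')], remainder='drop')),
    ('model', LogisticRegression())])

param_dict = {'column_transformer__comment_text_vectorizer__kw_args':
              [{'vectorizer': CountVectorizer()}, {'vectorizer': TfidfVectorizer()}]}

randsearch = GridSearchCV(pipe, param_dict, cv=2, scoring='f1').fit(X_train, y_train)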
I created a text classifier with sklearn that uses TF-IDF, and I want to use BERT and ELMo embeddings instead of TF-IDF.
How would one do that?
I'm getting BERT embeddings using the code below:
from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings
# init embedding
embedding = TransformerWordEmbeddings('bert-base-uncased')
# create a sentence
sentence = Sentence('The grass is green .')
# embed words in sentence
embedding.embed(sentence)
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

column_trans = ColumnTransformer([
    ('tfidf', TfidfVectorizer(), 'text'),
    ('number_scaler', MinMaxScaler(), ['number'])
])

# Initialize data
data = [
    ['This process, however, afforded me no means of.', 20, 1],
    ['another long description', 21, 1],
    ['It never once occurred to me that the fumbling', 19, 0],
    ['How lovely is spring As we looked from Windsor', 18, 0]
]

# Create DataFrame
df = pd.DataFrame(data, columns=['text', 'number', 'target'])

X = column_trans.fit_transform(df)
X = X.toarray()
y = df.loc[:, "target"].values

# Perform classification
classifier = LogisticRegression(random_state=0)
classifier.fit(X, y)
Sklearn offers the possibility to make a custom data transformer (not related to the machine learning "transformers" models).
I implemented a custom sklearn data transformer that uses the flair library you are using (note that I used TransformerDocumentEmbeddings instead of TransformerWordEmbeddings), and another one that works with the transformers library directly.
I'm also adding a SO question that discusses which transformer layer is interesting to use here.
I'm not familiar with ELMo, though I found this one that uses TensorFlow. You may be able to modify the code I shared to make ELMo work.
import torch
import numpy as np
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
from sklearn.base import BaseEstimator, TransformerMixin


class FlairTransformerEmbedding(TransformerMixin, BaseEstimator):

    def __init__(self, model_name='bert-base-uncased', batch_size=None, layers=None):
        # From https://lvngd.com/blog/spacy-word-vectors-as-features-in-scikit-learn/
        # For pickling reasons you should not load models in __init__
        self.model_name = model_name
        self.model_kw_args = {'batch_size': batch_size, 'layers': layers}
        self.model_kw_args = {k: v for k, v in self.model_kw_args.items()
                              if v is not None}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        model = TransformerDocumentEmbeddings(
            self.model_name, fine_tune=False,
            **self.model_kw_args)
        sentences = [Sentence(text) for text in X]
        embedded = model.embed(sentences)
        embedded = [e.get_embedding().reshape(1, -1) for e in embedded]
        return np.array(torch.cat(embedded).cpu())
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from transformers import AutoTokenizer, AutoModel
from more_itertools import chunked


class TransformerEmbedding(TransformerMixin, BaseEstimator):

    def __init__(self, model_name='bert-base-uncased', batch_size=1, layer=-1):
        # From https://lvngd.com/blog/spacy-word-vectors-as-features-in-scikit-learn/
        # For pickling reasons you should not load models in __init__
        self.model_name = model_name
        self.layer = layer
        self.batch_size = batch_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModel.from_pretrained(self.model_name)

        res = []
        for batch in chunked(X, self.batch_size):
            encoded_input = tokenizer.batch_encode_plus(
                batch, return_tensors='pt', padding=True, truncation=True)
            output = model(**encoded_input)
            # select the hidden state at index self.layer along the sequence dimension (default -1: last token)
            embed = output.last_hidden_state[:, self.layer].detach().numpy()
            res.append(embed)
        return np.concatenate(res)
In your case, replace your column transformer with this:
column_trans = ColumnTransformer([
    ('embedding', FlairTransformerEmbedding(), 'text'),
    ('number_scaler', MinMaxScaler(), ['number'])
])
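With that in place, the rest of the snippet from the question works essentially unchanged; a minimal sketch (note that X.toarray() is no longer needed, since the embedding transformer already returns a dense array):

X = column_trans.fit_transform(df)   # BERT document embeddings + scaled number, as a dense array
y = df.loc[:, "target"].values

classifier = LogisticRegression(random_state=0)
classifier.fit(X, y)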
While using this as a model for spam classification, I'd like to use the Subject as an additional feature alongside the body.
I have all of my features in a pandas dataframe. For example, the subject is df['Subject'], the body is df['body_text'] and the spam/ham label is df['ham/spam']
I receive the following error:
TypeError: 'FeatureUnion' object is not iterable
How can I use both df['Subject'] and df['body_text'] as features all while running them through the pipeline function?
from sklearn.pipeline import FeatureUnion

features = df[['Subject', 'body_text']].values
combined_2 = FeatureUnion(list(features))

pipeline = Pipeline([
    ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf_transformer', TfidfTransformer()),
    ('classifier', MultinomialNB())])

pipeline.fit(combined_2, df['ham/spam'])

k_fold = KFold(n=len(df), n_folds=6)
scores = []
confusion = numpy.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold:
    train_text = combined_2.iloc[train_indices]
    train_y = df.iloc[train_indices]['ham/spam'].values

    test_text = combined_2.iloc[test_indices]
    test_y = df.iloc[test_indices]['ham/spam'].values

    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)
    prediction_prob = pipeline.predict_proba(test_text)

    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, pos_label='spam')
    scores.append(score)
FeatureUnion was not meant to be used that way. It takes a list of feature extractors / vectorizers and applies them to the input; it does not take data in its constructor the way it is shown here.
CountVectorizer expects a sequence of strings. The easiest way to provide that is to concatenate the strings together, which passes the text from both columns to the same CountVectorizer:
combined_2 = df['Subject'] + ' ' + df['body_text']
An alternative method would be to run CountVectorizer and optionally TfidfTransformer individually on each column, and then stack the results.
import scipy.sparse as sp
subject_vectorizer = CountVectorizer(...)
subject_vectors = subject_vectorizer.fit_transform(df['Subject'])
body_vectorizer = CountVectorizer(...)
body_vectors = body_vectorizer.fit_transform(df['body_text'])
combined_2 = sp.hstack([subject_vectors, body_vectors], format='csr')
A third option is to implement your own transformer that would extract a dataframe column.
class DataFrameColumnExtracter(TransformerMixin):

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
In that case you could use FeatureUnion on two pipelines, each containing your custom transformer, then CountVectorizer.
from sklearn.pipeline import make_pipeline, make_union

subj_pipe = make_pipeline(
    DataFrameColumnExtracter('Subject'),
    CountVectorizer()
)

body_pipe = make_pipeline(
    DataFrameColumnExtracter('body_text'),
    CountVectorizer()
)

feature_union = make_union(subj_pipe, body_pipe)
This feature union of pipelines will take the dataframe, and each pipeline will process its own column. It will produce the concatenation of the term count matrices from the two columns given.
sparse_matrix_of_counts = feature_union.fit_transform(df)
This feature union can also be added as the first step in a larger pipeline.
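A minimal sketch of such a larger pipeline, reusing the TfidfTransformer and MultinomialNB steps from your question (step names here are illustrative):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('features', feature_union),               # vectorizes 'Subject' and 'body_text' and stacks the counts
    ('tfidf_transformer', TfidfTransformer()),
    ('classifier', MultinomialNB())])

pipeline.fit(df, df['ham/spam'])
predictions = pipeline.predict(df)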
Hi, as I am new to machine learning methods using the sklearn library, I am trying to incorporate a decision tree into a pipeline and then both make predictions and output the tree, but when I run the following code, I get the error:
'Pipeline' object has no attribute 'tree_'
So I wonder whether the pipeline does not support tree output, and how I could fix this problem. I have also tried using the decision tree class directly, but I got another error:
setting an array element with a sequence.
I know that this appears because I have vectors of different dimensions, but I still have no clue how to deal with this situation.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree.export import export_text
from sklearn import tree


# a function that reads the corpus, tokenizes it and returns the documents
# and their labels
def read_corpus(corpus_file, use_sentiment):
    documents = []
    labels = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split()
            documents.append(tokens[3:])
            if use_sentiment:
                # 2-class problem: positive vs negative
                labels.append(tokens[1])
            else:
                # 6-class problem: books, camera, dvd, health, music, software
                labels.append(tokens[0])
    return documents, labels


# a dummy function that just returns its input
def identity(x):
    return x


# read the data and split it into train and test
X, Y = read_corpus('/Users/dengchenglong/Downloads/trainset', use_sentiment=False)
split_point = int(0.75 * len(X))
Xtrain = X[:split_point]
Ytrain = Y[:split_point]
Xtest = X[split_point:]
Ytest = Y[split_point:]

# let's use the TF-IDF vectorizer
tfidf = False

# we use a dummy function as tokenizer and preprocessor,
# since the texts are already preprocessed and tokenized.
if tfidf:
    vec = TfidfVectorizer(preprocessor=identity,
                          tokenizer=identity)
else:
    vec = CountVectorizer(preprocessor=identity,
                          tokenizer=identity)

# combine the vectorizer with a decision tree classifier
classifier = Pipeline([('vec', vec),
                       ('cls', tree.DecisionTreeClassifier())])

# train the classifier on the train dataset
decision_tree = classifier.fit(Xtrain, Ytrain)

# predict the labels of the test data
Yguess = classifier.predict(Xtest)

tree.plot_tree(classifier.fit(Xtest, Ytest))

# report performance of the classifier
print(accuracy_score(Ytest, Yguess))
print(classification_report(Ytest, Yguess))
What if you try this:
from sklearn.pipeline import make_pipeline

# combine the vectorizer with a decision tree classifier
clf = DecisionTreeClassifier()
classifier = make_pipeline(vec, clf)
As it seems, before building the pipeline you must instantiate the model you are trying to apply. Let me know if this works and, if not, which errors it returns.
From: Scikit-learn documentation
Example out of: Make pipeline example with trees
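As a further note: the fitted tree lives inside the pipeline, so plot_tree and export_text must be called on the DecisionTreeClassifier step rather than on the Pipeline object, which is what triggers the 'Pipeline' object has no attribute 'tree_' error. A minimal sketch of how one could inspect the fitted tree (assuming the make_pipeline construction above, whose step name defaults to the lowercased class name):

classifier.fit(Xtrain, Ytrain)

# retrieve the fitted DecisionTreeClassifier from the pipeline, then plot/export it
fitted_tree = classifier.named_steps['decisiontreeclassifier']
tree.plot_tree(fitted_tree)
print(export_text(fitted_tree))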
How do you call partial_fit() on a scikit-learn classifier wrapped inside a Pipeline()?
I'm trying to build an incrementally trainable text classifier using SGDClassifier like:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

classifier = Pipeline([
    ('vectorizer', HashingVectorizer(ngram_range=(1, 4), non_negative=True)),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(SGDClassifier())),
])
but I get an AttributeError trying to call classifier.partial_fit(x,y).
It supports fit(), so I don't see why partial_fit() isn't available. Would it be possible to introspect the pipeline, call the data transformers, and then directly call partial_fit() on my classifier?
Here is what I'm doing, where 'mapper' and 'clf' are the two steps in my Pipeline object:
def partial_pipe_fit(pipeline_obj, df):
    X = pipeline_obj.named_steps['mapper'].fit_transform(df)
    Y = df['class']
    pipeline_obj.named_steps['clf'].partial_fit(X, Y)
You probably want to keep track of performance as you keep adjusting/updating your classifier, but that is a secondary point.
More specifically, the original pipeline(s) were constructed as follows:
to_vect = Pipeline([('vect', CountVectorizer(min_df=2, max_df=.9, ngram_range=(1, 1), max_features=100)),
                    ('tfidf', TfidfTransformer())])

full_mapper = DataFrameMapper([
    ('norm_text', to_vect),
    ('norm_fname', to_vect), ])

full_pipe = Pipeline([('mapper', full_mapper),
                      ('clf', SGDClassifier(n_iter=15, warm_start=True,
                                            n_jobs=-1, random_state=self.random_state))])
Google DataFrameMapper to learn more about it; here it just enables a transformation step that plays nicely with pandas.
Pipeline does not use partial_fit, hence does not expose it. We would probably need a dedicated pipelining scheme for out-of-core computation but that also depends on the capabilities of the previous models.
In particular, in this case you would probably want to do several passes over your data: one to fit each stage of the pipeline and then transform the dataset to fit the next one, except for the first stage, which is stateless and hence does not fit any parameters from the data.
In the meantime, it's probably easier to roll your own wrapper code tailored to your needs, as sketched below.
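For the pipeline from the question, such wrapper code could look roughly like this (an illustrative sketch: it drops the TfidfTransformer, the only stateful transformer in that chain, relies on HashingVectorizer being stateless, and partial_fit_batch is a hypothetical helper name):

from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.multiclass import OneVsRestClassifier

# HashingVectorizer is stateless, so it can be applied batch by batch without fitting
vectorizer = HashingVectorizer(ngram_range=(1, 4))
clf = OneVsRestClassifier(SGDClassifier())

def partial_fit_batch(texts, labels, classes):
    X = vectorizer.transform(texts)
    clf.partial_fit(X, labels, classes=classes)  # classes must be given on the first call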
Even though this question is 8 years old, it is still very relevant and has not been updated for quite some time.
As a matter of fact, there is now a nice package created by Vincent Warmerdam called tokenwiser.
It is meant mostly for NLP stuff that fits within the sklearn infrastructure. However, its main building block can be used even for non-NLP tasks.
The package has a PartialPipeline boilerplate and documentation.
Example here:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer

from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep
from tokenwiser.pipeline import PartialPipeline, PartialFeatureUnion

pipe = PartialPipeline([
    ("clean", Cleaner()),
    ("union", PartialFeatureUnion([
        ("full_text_pipe", PartialPipeline([
            ("identity", Identity()),
            ("hash1", HashingVectorizer()),
        ])),
        ("hyphen_pipe", PartialPipeline([
            ("hyphen", HyphenTextPrep()),
            ("hash2", HashingVectorizer()),
        ]))
    ])),
    ("clf", SGDClassifier())
])

X = [
    "i really like this post",
    "thanks for that comment",
    "i enjoy this friendly forum",
    "this is a bad post",
    "i dislike this article",
    "this is not well written"
]
y = np.array([1, 1, 1, 0, 0, 0])

for loop in range(3):
    pipe.partial_fit(X, y, classes=[0, 1])
I can imagine this template working even for non-NLP-related stuff. I hope someone will find this super useful.
I also propose my basic implementation of utilizing partial_fit within a sklearn pipeline.
We just need to use a model that allows for partial fitting (e.g. SGDRegressor, xgboost, etc.) and create our own sklearn-compatible classes.
(Huge kudos to Vincent Warmerdam, who started this in his TOKENWISER project.)
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklego.preprocessing import PatsyTransformer


class xgboost_partial_trainer(BaseEstimator, TransformerMixin):
    """
    Allows for incremental training of an xgboost model within a sklearn pipeline.
    """

    def __init__(self, training_params: dict = None):
        self.training_params = training_params
        self.trained_model = None
        self._first_call = True
        self.evals_result = {}
        self.iter_number = 1
        self._X_train, self._X_test, self._y_train, self._y_test = (
            None,
            None,
            None,
            None,
        )

    def partial_fit(self, X, y=None, classes=None, **fit_params):
        print(f"first run: {self._first_call}, n_iter = {self.iter_number}")
        self.iter_number += 1

        if self._first_call:
            # Select a random subset of the data and store it within the model
            # (to track the error loss over time)
            self._X_train, self._X_test, self._y_train, self._y_test = train_test_split(
                X, y, test_size=0.6, random_state=1
            )
            self._xg_train = xgb.DMatrix(self._X_train, label=self._y_train)
            self._xg_test = xgb.DMatrix(self._X_test, label=self._y_test)

            # validation sets to watch performance: fixed testing data, changeable training data
            self.watchlist = [
                (self._xg_train, "train_batch"),
                (self._xg_test, "eval_fixed"),
            ]

            # the training part itself
            self.trained_model = xgb.train(
                params=self.training_params,
                dtrain=xgb.DMatrix(X, y),
                xgb_model=self.trained_model,
                evals=self.watchlist,
            )

            # switch off after the first batch
            self._first_call = False

        else:
            self._xg_train = xgb.DMatrix(X, y)
            self.watchlist = [
                (self._xg_train, "train_batch"),
                (self._xg_test, "eval_fixed"),
            ]
            self.trained_model = xgb.train(
                params=self.training_params,
                dtrain=self._xg_train,
                xgb_model=self.trained_model,
                evals=self.watchlist,
            )

        # self._predicted_y = self.trained_model.predict(xgb.DMatrix(self._X_test))
        # print(f"mean_squared_error = {mean_squared_error(self._y_test, self._predicted_y, squared=False)}")

        return self

    def predict(self, X, y=None, **fit_params):
        return self.trained_model.predict(xgb.DMatrix(X))

    def transform(self, X, y=None, **fit_params):
        return self.trained_model.predict(xgb.DMatrix(X))

    def fit(self, X, y=None, **fit_params):
        return self


class PartialPipeline(Pipeline):
    """
    Utility class implementing a `PartialPipeline`.

    Arguments:
        steps: a collection of transformers
    """

    def partial_fit(self, X, y=None, classes=None, **kwargs):
        """
        Fits the components, but allows for batches.
        """
        for _, step in self.steps:
            if hasattr(step, "partial_fit"):
                step.partial_fit(X, y, **kwargs)
            elif hasattr(step, "fit_transform"):
                X = step.fit_transform(X)
            elif hasattr(step, "transform"):
                X = step.transform(X)
            elif hasattr(step, "fit"):
                X = step.fit(X)
        return self
Once we have these sklearn classes, we may utilize the Pipeline:
import pandas as pd

my_pipeline = PartialPipeline([
    ("patsy", PatsyTransformer(FORMULA2)),
    ("xgboost_model", xgboost_partial_trainer(training_params=params)),
])

df_chunked = pd.read_csv(your_data, chunksize=5_000)

for df in df_chunked:
    my_pipeline.partial_fit(df, y=df["speed"])
Please provide me with feedback and code cleanup suggestions. I am fully aware that this is not perfect; however, as a prototype it's not too bad!