How to use BERT and Elmo embedding with sklearn

How to use BERT and Elmo embedding with sklearn - python

I created a text classifier that uses Tf-Idf features with sklearn, and I want to use BERT and ELMo embeddings instead of Tf-Idf.
How would one do that?
I'm getting the BERT embedding using the code below (my current sklearn classifier code follows it):
from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings
# Build the flair word-embedding object for the 'bert-base-uncased' model
embedding = TransformerWordEmbeddings('bert-base-uncased')
# Wrap the raw text in a flair Sentence
sentence = Sentence('The grass is green .')
# Embed the words of the sentence (the return value is not used here)
embedding.embed(sentence)
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
# Per-column preprocessing: Tf-Idf on the 'text' column, min-max scaling on
# the 'number' column (passed as a list of column names to MinMaxScaler).
column_trans = ColumnTransformer([
('tfidf', TfidfVectorizer(), 'text'),
('number_scaler', MinMaxScaler(), ['number'])
])
# Initialize data: each row is [text, number, target]
data = [
['This process, however, afforded me no means of.', 20, 1],
['another long description', 21, 1],
['It never once occurred to me that the fumbling', 19, 0],
['How lovely is spring As we looked from Windsor', 18, 0]
]
# Create DataFrame
df = pd.DataFrame(data, columns=['text', 'number', 'target'])
# Fit the column transformer on the whole frame and build the feature matrix
X = column_trans.fit_transform(df)
# Convert the transformer output to a dense array
X = X.toarray()
y = df.loc[:, "target"].values
# Perform classification
classifier = LogisticRegression(random_state=0)
classifier.fit(X, y)

Sklearn offers the possibility to create custom data transformers (unrelated to the machine learning "transformer" models).
I implemented a custom sklearn data transformer that uses the flair library that you use. Please note that I used TransformerDocumentEmbeddings instead of TransformerWordEmbeddings. I also implemented one that works with the transformers library.
I'm adding an SO question that discusses which transformer layer is interesting to use here.
I'm not familiar with ELMo, though I found this, which uses tensorflow. You may be able to modify the code I shared to make ELMo work.
import torch
import numpy as np
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
from sklearn.base import BaseEstimator, TransformerMixin
class FlairTransformerEmbedding(TransformerMixin, BaseEstimator):
    """Scikit-learn transformer that embeds raw texts with a flair document model.

    Each input text is wrapped in a flair ``Sentence`` and embedded with
    ``TransformerDocumentEmbeddings``; the per-text vectors are concatenated
    into a single NumPy array with one row per text.

    Parameters
    ----------
    model_name : default='bert-base-uncased'
        First positional argument given to ``TransformerDocumentEmbeddings``.
    batch_size : default=None
        Forwarded as ``batch_size=...`` to the flair model only when not None.
    layers : default=None
        Forwarded as ``layers=...`` to the flair model only when not None.
    """

    def __init__(self, model_name='bert-base-uncased', batch_size=None, layers=None):
        # From https://lvngd.com/blog/spacy-word-vectors-as-features-in-scikit-learn/
        # For pickling reason you should not load models in __init__
        # Every constructor argument is stored unmodified under its own name:
        # BaseEstimator.get_params()/set_params() and sklearn.base.clone()
        # read and write the parameters through these attributes.
        self.model_name = model_name
        self.batch_size = batch_size
        self.layers = layers

    @property
    def model_kw_args(self):
        """Keyword arguments forwarded to the flair model (None values dropped).

        Computed on access so it always reflects the current parameter
        values, including ones changed through ``set_params``.
        """
        candidates = {'batch_size': self.batch_size, 'layers': self.layers}
        return {k: v for k, v in candidates.items() if v is not None}

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Embed every text in ``X`` and return the stacked embeddings.

        The flair model is instantiated on every call (see the pickling
        note in ``__init__``).
        """
        model = TransformerDocumentEmbeddings(
            self.model_name, fine_tune=False,
            **self.model_kw_args)
        sentences = [Sentence(text) for text in X]
        embedded = model.embed(sentences)
        # One (1, dim) row per sentence so torch.cat stacks them row-wise.
        embedded = [e.get_embedding().reshape(1, -1) for e in embedded]
        # detach() is a no-op for tensors without grad and keeps the NumPy
        # conversion valid should a tensor still be attached to a graph.
        return np.array(torch.cat(embedded).detach().cpu())
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from transformers import AutoTokenizer, AutoModel
from more_itertools import chunked
class TransformerEmbedding(TransformerMixin, BaseEstimator):
    """Scikit-learn transformer that embeds texts with a Hugging Face model.

    Texts are tokenized and run through the model in chunks of
    ``batch_size``; one vector per text is sliced out of
    ``last_hidden_state`` and the chunks are concatenated into one array.

    Parameters
    ----------
    model_name : default='bert-base-uncased'
        Name given to ``AutoTokenizer.from_pretrained`` and
        ``AutoModel.from_pretrained``.
    batch_size : default=1
        Number of texts tokenized and forwarded together.
    layer : default=-1
        Index applied to the second axis of ``last_hidden_state``.
        NOTE(review): despite its name this does not pick a hidden layer.
        Hugging Face documents ``last_hidden_state`` as
        (batch, sequence, hidden), so this presumably selects a token
        position; with ``batch_size > 1`` and padding, ``-1`` may land on a
        padding token for the shorter texts — confirm the intent.
    """

    def __init__(self, model_name='bert-base-uncased', batch_size=1, layer=-1):
        # From https://lvngd.com/blog/spacy-word-vectors-as-features-in-scikit-learn/
        # For pickling reason you should not load models in __init__
        self.model_name = model_name
        self.layer = layer
        self.batch_size = batch_size

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Return one embedding row per text in ``X`` as a NumPy array."""
        # Local import: this snippet's import block does not import torch,
        # which return_tensors='pt' already requires at runtime.
        import torch

        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModel.from_pretrained(self.model_name)
        res = []
        # Inference only: without no_grad() every forward pass would record
        # an autograd graph that is thrown away immediately afterwards.
        with torch.no_grad():
            for batch in chunked(X, self.batch_size):
                # Calling the tokenizer directly is the current API for
                # batch encoding (batch_encode_plus is the older spelling).
                encoded_input = tokenizer(
                    batch, return_tensors='pt', padding=True, truncation=True)
                output = model(**encoded_input)
                embed = output.last_hidden_state[:, self.layer].numpy()
                res.append(embed)
        return np.concatenate(res)
In your case replace your column transformer by this:
# Same layout as the question's ColumnTransformer, with the 'text' column now
# handled by FlairTransformerEmbedding instead of TfidfVectorizer.
column_trans = ColumnTransformer([
('embedding', FlairTransformerEmbedding(), 'text'),
('number_scaler', MinMaxScaler(), ['number'])
])

Related

Write a fasttext customised transformer

I have a trained customised fasttext model (fasttext is a word embedding algorithm developed by Facebook). I managed to get the expected result in a function but now I want to rewrite it into a customised transformer so I can add it into my sklearn pipeline as it only accepts transformer.
The function takes a word and returns vectors of the word:
def name2vector(name=None):
    """Return the summed fastText word vectors of a (possibly multi-word) name.

    The name is lower-cased and split on single spaces; every piece is looked
    up with the module-level ``model.get_word_vector`` and the resulting
    vectors are added element-wise.
    """
    word_vectors = []
    for word in name.lower().split(' '):
        word_vectors.append(np.array(model.get_word_vector(word)))
    # If "name" is multiple words, sum the vectors
    return np.sum(word_vectors, axis=0)
returned value:
array([-0.01087821, 0.01030535, -0.01402427, 0.0310982 , 0.08786983,
-0.00404521, -0.03286128, -0.00842709, 0.03934859, -0.02717219,
0.01151722, -0.03253938, -0.02435859, 0.03330994, -0.03696496], dtype=float32))
I want the transformer to do the same thing as the function.
I know from reading the tutorial that I can use BaseEstimator and TransformerMixin to rewrite it as a transformer, but I'm still stuck on this. Any suggestions would be great, thanks.

Assuming you're working with a pandas DataFrame, you could do something like this:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
class FastTextTransformer(TransformerMixin, BaseEstimator):
    """Turn texts into summed fastText word vectors using a pre-trained model.

    Parameters
    ----------
    model
        An already trained fastText model; the code calls its
        ``get_word_vector`` and ``get_dimension`` methods.
    """

    def __init__(self, model):
        self.model = model

    def get_params(self, deep=True):
        """Return the embedding dimension as this estimator's only parameter.

        ``deep`` defaults to True, matching ``BaseEstimator.get_params``, so
        a plain ``get_params()`` call works; its value is not used.
        """
        # NOTE(review): 'dimension' is not an __init__ argument, so utilities
        # that rebuild the estimator from get_params() (sklearn.base.clone,
        # hence cross-validation / GridSearchCV) will presumably fail —
        # confirm before using this class with them.
        return {'dimension': self.model.get_dimension()}

    def fit(self, X, y=None):
        """Return ``self``; ``y`` is optional so ``fit_transform(X)`` works."""
        # We assume the FT model was already fit
        return self

    def transform(self, X):
        """Map each entry of ``X`` to its vector; one DataFrame row per entry.

        ``X`` must provide ``.apply`` (the usage example passes a pandas
        Series of strings).
        """
        # apply() builds a new object, so X itself is left untouched.
        vectors = X.apply(self.name2vector)
        return pd.DataFrame(vectors.tolist())

    def name2vector(self, name):
        """Lower-case ``name``, split it on spaces and sum the word vectors."""
        vec = [np.array(self.model.get_word_vector(w)) for w in name.lower().split(' ')]
        name_vec = np.sum(vec, axis=0)  # If "name" is multiple words, sum the vectors
        return name_vec
To demonstrate the usage let's load a fasttext model and a sample data-set of amazon reviews:
import fasttext as ft
# Load the fastText model from a local file
ft_model = ft.load_model('amazon_review_polarity.ftz')
# read_html returns a list of tables found at the URL; keep the first one
amz_df = pd.read_html('https://huggingface.co/datasets/amazon_polarity/viewer/amazon_polarity/test')[0]
# Shorten the two column names used later ('content' and 'label')
amz_df.rename(columns={'content (string)': 'content', 'label (class label)': 'label'}, inplace=True)
amz_df
And then use it as a bona fide scikit-learn Pipeline.
# Two-step pipeline: fastText vectors first, then logistic regression
pipe = Pipeline([
('ft', FastTextTransformer(ft_model)),
('clf', LogisticRegression()),
])
And now we can fit and predict
# Train on the review texts and their labels
pipe.fit(amz_df['content'], amz_df.label)
# Predict for new texts, passed as a pandas Series like the training input
pipe.predict(pd.Series(['great', 'very cool', 'very disappointed']))
Which returns
array(['positive', 'positive', 'negative'], dtype=object)
N.B. In case you want to compute an average of the words in the sentence, instead of a sum, you can replace name2vector with the built-in method get_sentence_vector. For a supervised model, it'll return the average. For unsupervised ones (CBOW and skipgram), it first divides each vector by its L2 norm, and then averages.
See the discussion here.
Credit: Stefano Fiorucci - anakin87

The library compress-fasttext (it's a wrapper around Gensim that makes fastText models more lightweight) already has such a transformer:
import compress_fasttext
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from compress_fasttext.feature_extraction import FastTextTransformer
# Load a compressed fastText model from the release URL
small_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
'https://github.com/avidale/compress-fasttext/releases/download/v0.0.4/cc.en.300.compressed.bin'
)
# Build the pipeline and fit it in one expression on six single-word texts
# with binary labels
classifier = make_pipeline(
FastTextTransformer(model=small_model),
LogisticRegression()
).fit(
['banana', 'soup', 'burger', 'car', 'tree', 'city'],
[1, 1, 1, 0, 0, 0]
)
classifier.predict(['jet', 'train', 'cake', 'apple'])
# array([0, 0, 1, 1])
Under the hood, it finds all "words" (alphanumeric sequences) in the text and averages their fastText embeddings.
Here is the source code.

Sklearn Pipeline persistance with custom class not working

I am using an sklearn pipeline in my code and saving the pipeline object to deploy it in another environment. I have one custom class to drop features. I am saving the model successfully, but when I use the pipeline object in another environment that has the same version of sklearn, it throws an error. The pipeline works fine when I do not include my custom class DropFeatures. Below is the code
from sklearn import svm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import joblib
# Load the Iris dataset
df = pd.read_csv('Iris.csv')
# Name of the target column
label = 'Species'
# Keep the target values before the column is removed from the features
labels = df[label]
# Remove the target column from the feature frame, in place
df.drop(['Species'],axis=1,inplace=True)
# Set up a pipeline with a feature selection preprocessor that
# selects the top 2 features to use.
# The pipeline then uses a RandomForestClassifier to train the model.
class DropFeatures(BaseEstimator, TransformerMixin):
    """Pipeline step that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    features_to_drop : default=None
        Column labels handed to ``DataFrame.drop(..., axis=1)``. ``None`` or
        an empty collection means "drop nothing".
    """

    def __init__(self, features_to_drop=None):
        # Stored under the same name as the constructor argument:
        # BaseEstimator.get_params() (used by clone() and repr()) reads
        # parameters back with getattr(self, '<argument name>').
        self.features_to_drop = features_to_drop

    @property
    def features(self):
        """Alias of ``features_to_drop`` (the attribute name used previously)."""
        return self.features_to_drop

    @features.setter
    def features(self, value):
        self.features_to_drop = value

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns.

        ``X`` is returned unchanged when there is nothing to drop; otherwise
        ``DataFrame.drop`` builds a new frame and ``X`` is not modified.
        """
        to_drop = self.features_to_drop
        # len() rather than truthiness so array-like values also work.
        if to_drop is None or len(to_drop) == 0:
            return X
        return X.drop(to_drop, axis=1)
# Steps: drop the 'Id' column, keep the single best feature by chi2 score,
# then classify with a random forest
pipeline = Pipeline([
('drop_features', DropFeatures(['Id'])),
('feature_selection', SelectKBest(chi2, k=1)),
('classification', RandomForestClassifier())
])
pipeline.fit(df, labels)
# NOTE(review): `query` is not defined anywhere in this snippet
print(pipeline.predict(query))
# Export the classifier to a file
joblib.dump(pipeline, 'model.joblib')
When I am using the model.joblib in another environment, I am getting an error. Below is the code to load the model and error in the image
from sklearn.externals import joblib
# NOTE(review): the training snippet saves 'model.joblib', while this loads
# 'model1.joblib' — confirm the file name.
model = joblib.load('model1.joblib')
print(model)
Error stack trace:

Sklearn GridSearch with pre-training

I'm building a Sklearn Pipeline whose parameters are optimized by GridSearchCV. The pipeline has to find the best model for several different entities using a pre-train and then fine-tune approach: pre-train on all the entities together, then fine-tune on every single entity and return one model per entity. These are the constraints of the pipeline:
Pre-training and fine-tuning have to be in the same pipeline, because both models must see the same data in each GridSearchCV fold.
The pre-trained model has to pass its weights to the fine-tuning model.
I have implemented:
A Sklearn Transformer that takes a data-frame with all the entities as input and fits itself.
A Sklearn Regressor that splits the data-frame into one data-frame per entity and fits a Keras model for each entity.
What I'm missing is how to pass the weights obtained by the pre-train transformer on to the fine-tuning transformer (considering that each GridSearchCV fold produces different weights).
Here is the code:
import pandas as pd
import numpy as np
import random
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.metrics import mean_squared_error
from keras.models import Model
from keras.layers import Dense, Input
import copy
class MyRegressor(BaseEstimator, TransformerMixin):
    """One-hidden-layer Keras regressor that reads inputs and targets from X.

    Both the feature columns (``featInput``) and the target columns
    (``featOutput``) are taken from the DataFrame passed as ``X``; the ``y``
    argument of ``fit`` and ``score`` is ignored.

    Parameters
    ----------
    neurons
        Number of units of the hidden Dense layer.
    featInput
        List of input column names in ``X``.
    featOutput
        List of target column names in ``X``.
    """

    def __init__(self, neurons, featInput, featOutput):
        self.neurons = neurons
        # Optional pre-trained weights, loaded in fit() when not None. Not a
        # constructor argument, so it is not part of get_params().
        self.preTrain = None
        self.featInput = featInput
        self.featOutput = featOutput

    def fit(self, X, y=None):
        """Build, compile and train a fresh model on the columns of ``X``."""
        X_train = X[self.featInput]
        y_train = X[self.featOutput]
        inputLayer = Input(shape=(len(self.featInput), ), name='INPUT')
        hidden = Dense(self.neurons, name='HIDDEN')(inputLayer)
        outputLayer = Dense(len(self.featOutput), name='OUTPUT')(hidden)
        self.model = Model(inputLayer, outputLayer)
        self.model.compile(loss='mse', optimizer='rmsprop')
        if self.preTrain is not None:
            # Keras spells this method load_weights; `loadWeights` does not
            # exist and raised AttributeError.
            # NOTE(review): load_weights expects a weights file path — if
            # preTrain is meant to hold in-memory arrays, set_weights is the
            # matching call; confirm.
            self.model.load_weights(self.preTrain)
        self.model.fit(X_train, y_train)
        return self

    def predict(self, X):
        """Return the model's predictions for the input columns of ``X``."""
        return self.model.predict(X[self.featInput])

    def transform(self, X):
        """Return ``X`` unchanged so the next pipeline step sees the same data."""
        return X

    def score(self, X, y=None, sample_weight=None):
        """Return the negative mean squared error on ``X``.

        The sign is negative because scikit-learn model selection
        (GridSearchCV and friends) treats a higher score as better; returning
        the raw error would make the search keep the worst candidate.
        """
        y_true = X[self.featOutput]
        y_pred = self.predict(X)
        return -mean_squared_error(y_true, y_pred)
class LoopTransformer(BaseEstimator, TransformerMixin):
    """Fit one independent copy of ``component`` per entity found in the data.

    An entity is a distinct combination of values in ``columns``. ``fit``
    deep-copies ``component`` once per entity and fits the copy on that
    entity's rows only.

    Parameters
    ----------
    columns
        List of column names whose combined values identify an entity.
    component
        Estimator used as a template; it is deep-copied, never fitted itself.
    """

    def __init__(self, columns, component):
        self.columns = columns
        self.component = component
        # Filled by fit(): one {'id': ..., 'component': ...} dict per entity.
        self.components = []

    def _entity_rows(self, X, entity_id):
        """Return a copy of the rows of X whose key columns all equal entity_id."""
        matches = (X[self.columns] == entity_id).sum(axis=1) == len(self.columns)
        return X[matches].copy()

    def fit(self, X, y=None):
        """Fit a fresh component copy for every entity present in ``X``."""
        # Start from scratch so fitting the same instance twice does not keep
        # (and later predict with) the components of the previous fit.
        self.components = []
        for _, idx in X[self.columns].drop_duplicates().iterrows():
            entityDf = self._entity_rows(X, idx)
            component = copy.deepcopy(self.component)
            # y is handed over as received (not restricted to the entity's
            # rows), exactly as before.
            component.fit(entityDf, y)
            self.components.append({'id': idx, 'component': component})
        return self

    def predict(self, X):
        """Predict every entity with its own component and concatenate.

        Each component's output is wrapped in a DataFrame indexed like the
        entity's rows, so plain-array predictions can be concatenated and
        every prediction stays tied to its input row. Rows are returned
        grouped by entity in fit order, not in the order of ``X``.
        """
        results = []
        for comp in self.components:
            entityDf = self._entity_rows(X, comp['id'])
            if len(entityDf) == 0:
                # Entity absent from X: nothing to predict (same guard as score()).
                continue
            res = comp['component'].predict(entityDf)
            results.append(pd.DataFrame(res, index=entityDf.index))
        dfRes = pd.concat(results)
        return dfRes

    def score(self, X, y=None, sample_weight=None):
        """Return the unweighted average of the per-entity component scores."""
        results = []
        for comp in self.components:
            entityDf = self._entity_rows(X, comp['id'])
            if len(entityDf) > 0:
                results.append(comp['component'].score(entityDf))
        return np.average(results)
#create the input dataframe: 3 entities
# Each entity gets 10-19 rows with output = input * (entity + 1). The
# per-entity frames are collected and concatenated once, rather than being
# appended one by one to an initially empty frame.
entityFrames = []
for entity in range(3):
    x = np.arange(random.randint(10, 20))
    y = x * (entity + 1)
    tempDf = pd.DataFrame(np.array([x, y]).T, columns=['input', 'output'])
    tempDf['entityId'] = entity
    entityFrames.append(tempDf)
dataFrame = pd.concat(entityFrames, sort=False, ignore_index=True)
# Keep the original column order
dataFrame = dataFrame[['entityId', 'input', 'output']]
#create the pipeline
neurons = [5, 10]
myPipe = Pipeline([('preTrain',
                    MyRegressor(neurons=neurons[0], featInput=['input'], featOutput=['output'])),
                   ('fineTuning',
                    LoopTransformer(['entityId'],
                                    MyRegressor(
                                        neurons=neurons[0],
                                        featInput=['input'],
                                        featOutput=['output'])))])
#pre-train and fine-tuning has to have always the same number of neurons
# Two separate grids, one per neuron count, so the two steps are never
# combined with different values.
params = [{
    'preTrain__neurons': [neurons[0]],
    'fineTuning__component__neurons': [neurons[0]]
}, {
    'preTrain__neurons': [neurons[1]],
    'fineTuning__component__neurons': [neurons[1]]
}]
gs = GridSearchCV(myPipe, params, verbose=1, cv=3)
# The same frame is passed as X and y: the estimators read their targets
# from columns of X.
gs.fit(dataFrame, dataFrame)
score = gs.score(dataFrame, dataFrame)
print(score)

I'm pretty sure sklearn.Pipeline, as is, doesn't support this. But as long as you don't clone your pipeline (which happens, for instance, if you use GridSearchCV), you can hack your way through with code like the following, which gives the instance of one step in the pipeline to the next step. You can apply the same principle in your pipeline:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.base import BaseEstimator, TransformerMixin
class MyTransformer(BaseEstimator, TransformerMixin):
    """Pass-through step holding a reference to another estimator instance.

    ``fit`` prints the ``mean_`` attribute of the referenced scaler, showing
    that state fitted by an earlier pipeline step is visible here.
    """

    def __init__(self, scaler):
        self.scaler = scaler

    def fit(self, X, y=None):
        """Print the scaler's current ``mean_`` and return ``self``."""
        fitted_means = self.scaler.mean_
        print("got the means: %s" % fitted_means)
        return self

    def transform(self, X):
        """Return ``X`` unchanged."""
        return X
X, y = load_iris(return_X_y=True)
# The very same scaler instance is used as the first pipeline step and is
# handed to MyTransformer, which reads its fitted mean_ during fit().
scaler = StandardScaler()
pipeline = make_pipeline(scaler,
MyTransformer(scaler),
LogisticRegression(solver='lbfgs',
multi_class='auto'))
pipeline = pipeline.fit(X, y)
# Shift the data by one and refit: the printed means change accordingly
X = X - 1
pipeline = pipeline.fit(X, y)
Which would give you this output, as expected:
got the means: [5.84333333 3.05733333 3.758 1.19933333]
got the means: [4.84333333 2.05733333 2.758 0.19933333]

How to add a feature to a vectorized data set?

I want to write a Naive Bayes text classifier.
Because sklearn does not accept features in text form, I am transforming them using TfidfVectorizer.
I was able to create such a classifier using only the transformed data as features. The code looks like this:
### text vectorization--go from strings to lists of numbers
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english')
# Fit the vocabulary on the training urls only, then reuse it for the test urls
X_train_transformed = vectorizer.fit_transform(X_train_raw['url'])
X_test_transformed = vectorizer.transform(X_test_raw['url'])
### feature selection, because text is super high dimensional and
### can be really computationally chewy as a result
selector = SelectPercentile(f_classif, percentile=1)
selector.fit(X_train_transformed, y_train_raw)
# Convert the selected features to dense arrays before fitting GaussianNB
X_train = selector.transform(X_train_transformed).toarray()
X_test = selector.transform(X_test_transformed).toarray()
clf = GaussianNB()
clf.fit(X_train, y_train_raw)
.....
Everything works as intended, but I am having problems when I want to add another feature, e.g. a flag indicating whether the given text contains a certain keyword.
I tried multiple things to properly transform the 'url' feature and then combine the transformed feature with another boolean feature, but I was unsuccessful.
Any tips on how it should be done, assuming that I have a pandas frame containing two features: 'url' (which I want to transform) and a 'contains_keyword' flag?
The solution which failed looks like this:
vectorizer = CountVectorizer(min_df=1)
X_train_transformed = vectorizer.fit_transform(X_train_raw['url'])
X_test_transformed = vectorizer.transform(X_test_raw['url'])
selector = SelectPercentile(f_classif, percentile=1)
selector.fit(X_train_transformed, y_train_raw)
X_train_selected = selector.transform(X_train_transformed)
X_test_selected = selector.transform(X_test_transformed)
# Stores each row's selected features as one Python list inside a single
# DataFrame column ...
X_train_raw['transformed_url'] = X_train_selected.toarray().tolist()
X_train_without = X_train_raw.drop(['url'], axis=1)
# ... so every row of .values holds that list object next to the remaining
# columns instead of one number per feature.
X_train = X_train_without.values
This produces rows containing a boolean flag and a list, which is the wrong input for an sklearn model. I have no idea how I should properly transform this. Grateful for any help.
Here are test data:
url,target,ads_keyword
googleadapis l google com,1,True
googleadapis l google com,1,True
clients1 google com,1,False
c go-mpulse net,1,False
translate google pl,1,False
url - split domain taken from a DNS query
target - target class for classification
ads_keyword - flag indicating whether the 'url' contains the word 'ads'.
I want to transform the 'url' using the TfidfVectorizer and use the transformed data together with 'ads_keyword' (and possibly more features in the future) as the features used to train the Naive Bayes model.

Here is a demo, showing how to union features and how to tune up hyperparameters using GridSearchCV.
Unfortunately your sample data set is too tiny to train a real model...
try:
from pathlib import Path
except ImportError: # Python 2
from pathlib2 import Path
import os
import re
from pprint import pprint
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.externals import joblib
from scipy.sparse import csr_matrix, hstack
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Select a single column of a DataFrame, by name or by position.

    Parameters
    ----------
    name : default=None
        Column label; takes precedence over ``position`` when not None.
    position : default=None
        Column position, used only when ``name`` is None.
    as_cat_codes : default=False
        When True and the column's dtype is 'category', return the
        category codes instead of the values.
    sparse : default=False
        When True, return the column as a one-column ``csr_matrix``.
    """

    def __init__(self, name=None, position=None,
                 as_cat_codes=False, sparse=False):
        self.name = name
        self.position = position
        self.as_cat_codes = as_cat_codes
        self.sparse = sparse

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X, **kwargs):
        """Return the selected column of ``X``.

        Extra keyword arguments are accepted and ignored.

        Raises
        ------
        ValueError
            If both ``name`` and ``position`` are None.
        """
        if self.name is not None:
            col_pos = X.columns.get_loc(self.name)
        elif self.position is not None:
            col_pos = self.position
        else:
            # ValueError (a subclass of Exception) states the actual problem:
            # an invalid parameter combination.
            raise ValueError('either [name] or [position] parameter must be not-None')
        ret = X.iloc[:, col_pos]
        if self.as_cat_codes and X.dtypes.iloc[col_pos] == 'category':
            ret = ret.cat.codes
        if self.sparse:
            # Reshape into a single column so it can be stacked horizontally
            # with other feature blocks.
            ret = csr_matrix(ret.values.reshape(-1, 1))
        return ret
# Two feature branches combined side by side:
#   'text' - the 'url' column vectorized with Tf-Idf
#   'ads'  - the 'ads_keyword' column as a one-column sparse matrix
union = FeatureUnion([
('text',
Pipeline([
('select', ColumnSelector('url')),
#('pct', SelectPercentile(percentile=1)),
('vect', TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english')),
]) ),
('ads',
Pipeline([
('select', ColumnSelector('ads_keyword', sparse=True,
as_cat_codes=True)),
#('scale', StandardScaler(with_mean=False)),
]) )
])
# Full model: the combined features followed by a classifier (the 'clf' step
# is swapped by the parameter grids below)
pipe = Pipeline([
('union', union),
('clf', MultinomialNB())
])
# One grid per classifier; keys use <step>__<substep>__<param> names to reach
# into the nested pipeline
param_grid = [
{
'union__text__vect': [TfidfVectorizer(sublinear_tf=True,
max_df=0.5,
stop_words='english')],
'clf': [SGDClassifier(max_iter=500)],
'union__text__vect__ngram_range': [(1,1), (2,5)],
'union__text__vect__analyzer': ['word','char_wb'],
'clf__alpha': np.logspace(-5, 0, 6),
#'clf__max_iter': [500],
},
{
'union__text__vect': [TfidfVectorizer(sublinear_tf=True,
max_df=0.5,
stop_words='english')],
'clf': [MultinomialNB()],
'union__text__vect__ngram_range': [(1,1), (2,5)],
'union__text__vect__analyzer': ['word','char_wb'],
'clf__alpha': np.logspace(-4, 2, 7),
},
#{ # NOTE: does NOT support sparse matrices!
# 'union__text__vect': [TfidfVectorizer(sublinear_tf=True,
# max_df=0.5,
# stop_words='english')],
# 'clf': [GaussianNB()],
# 'union__text__vect__ngram_range': [(1,1), (2,5)],
# 'union__text__vect__analyzer': ['word','char_wb'],
#},
]
# Settings shared by the grid search
gs_kwargs = dict(scoring='roc_auc', cv=3, n_jobs=1, verbose=2)
# NOTE(review): `df` is not created in this snippet; it is presumably the
# question's data already loaded into a DataFrame — confirm.
X_train, X_test, y_train, y_test = \
train_test_split(df[['url','ads_keyword']], df['target'], test_size=0.33)
grid = GridSearchCV(pipe, param_grid=param_grid, **gs_kwargs)
grid.fit(X_train, y_train)
# prediction
predicted = grid.predict(X_test)

Scikit learn Custom Transformer dimension mismatch

I'm coming from R, so scikit API still very confusing to me. I was following this tutorial http://michelleful.github.io/code-blog/2015/06/20/pipelines/ to learn about Pipelines. So let's create a fake dataset just for reference:
x1,x2,y
foo,zoo,1
bar,moo,2
goo,too,3
roo,zoo,4
too,moo,5
My goal is very simple: train a linear regression on y, using separate tfidf matrices from x1 and x2, plus some custom features from both x1 and x2 (ie, word length, etc).
Let's start with the simpler task of using only tfidf from x1. Here's the full code:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import time
import re
import math
def clip_RMSLE(y, y_pred, **kwargs):
    """Return the root mean squared logarithmic error, clipping predictions at 0.

    Negative predictions are treated as 0.0 before ``log(value + 1)`` is
    taken of both predictions and targets.

    Parameters
    ----------
    y
        True target values (any array-like; positions are used, not labels).
    y_pred
        Predicted values, same length as ``y``. Not modified.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    float
        ``sqrt(sum((log1p(pred) - log1p(y)) ** 2) / len(y))``.
    """
    actual = np.asarray(y, dtype=float)
    # np.clip returns a new array, so the caller's predictions stay untouched.
    predicted = np.clip(np.asarray(y_pred, dtype=float), 0.0, None)
    squared_log_errors = (np.log1p(predicted) - np.log1p(actual)) ** 2
    return float(np.sqrt(squared_log_errors.sum() / len(actual)))
class ColumnNgram(BaseEstimator, TransformerMixin):
    """Tf-Idf n-gram features for one text column of a DataFrame.

    NOTE(review): this is the version that triggers the "dimension mismatch"
    error discussed below. ``fit`` learns nothing, and ``transform`` fits a
    brand-new ``TfidfVectorizer`` on whatever data it is given, so the
    vocabulary is rebuilt from each data set it transforms.
    """

    def __init__(self, colname, tokenizer, ngram_rg):
        self.colname = colname
        self.tokenizer = tokenizer
        self.ngram_rg = ngram_rg
        # Never assigned again in this version of the class.
        self.tfidf = None

    def transform(self, df, y=None):
        # A new vectorizer is created and fitted on every call.
        tfidf = TfidfVectorizer(tokenizer=self.tokenizer, ngram_range=self.ngram_rg)
        return tfidf.fit_transform(df[self.colname].values)

    def fit(self, df, y=None):
        # Nothing is learned from the training data here.
        return self
# Wall-clock start, used for the timing message at the end of the script
start = time.time()
seed = 1991
# n-gram range handed to ColumnNgram
ngram_rg = (1,2)
# Wrap the metric as a scorer; greater_is_better=False marks it as a loss
RMSLE = make_scorer(clip_RMSLE, greater_is_better=False)
def tokenizer(text):
    """Return the lower-cased runs of two or more ASCII letters found in ``text``.

    A falsy ``text`` (empty string, ``None``) yields an empty list.
    """
    if not text:
        return []
    return re.findall('[a-z]{2,}', text.lower())
df = pd.read_csv('fake.csv', sep=',')
y = df['y'].values
# Tf-Idf n-grams of column 'x1' feeding a linear regression
pipeline = Pipeline([('tfidf', ColumnNgram('x1', tokenizer, ngram_rg)),
('linear_reg', LinearRegression(n_jobs=1))
])
# NOTE(review): recent scikit-learn versions reportedly reject `random_state`
# when `shuffle` is left at False — confirm against the installed version.
kfold = KFold(n_splits=2, random_state=seed)
# The whole DataFrame is passed as X; ColumnNgram picks its column itself
results = cross_val_score(pipeline, df, y, cv=kfold, scoring=RMSLE)
print(results)
print(results.mean())
end = time.time()
print('Timeto finish this thing: %0.2fs' % (end - start))
I'm getting the error ValueError: dimension mismatch, probably because some terms will not appear in both train/validation folds. What's the proper way of doing this? Thank you!

Change your ColumnNgram to this:
class ColumnNgram(BaseEstimator, TransformerMixin):
    """Tf-Idf n-gram features for one text column of a DataFrame.

    The vectorizer is created and fitted in ``fit``; ``transform`` only
    applies that fitted vectorizer, so every data set is mapped onto the
    vocabulary learned from the training data.
    """

    def __init__(self, colname, tokenizer, ngram_rg):
        self.colname = colname
        self.tokenizer = tokenizer
        self.ngram_rg = ngram_rg
        # Replaced by a fitted TfidfVectorizer in fit().
        self.tfidf = None

    def fit(self, df, y=None):
        """Fit a new vectorizer on the values of the configured column."""
        self.tfidf = TfidfVectorizer(ngram_range=self.ngram_rg, tokenizer=self.tokenizer)
        column_values = df[self.colname].values
        self.tfidf.fit(column_values)
        return self

    def transform(self, df, y=None):
        """Vectorize the configured column with the already fitted vectorizer."""
        column_values = df[self.colname].values
        return self.tfidf.transform(column_values)
You should declare the vectorizer and learn from the training data in fit(). Currently you are re-fitting on the data in each call to transform(), which will obviously result in different features in the train and validation sets, as you suspected.
The proper way is to keep a TfidfVectorizer that learns from the data during fit() and then only transforms the new data in transform(), instead of re-fitting on the new data.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to use BERT and Elmo embedding with sklearn - python

Related

Write a fasttext customised transformer

Sklearn Pipeline persistance with custom class not working

Sklearn GridSearch with pre-training

How to add a feature to a vectorized data set?

Scikit learn Custom Transformer dimension mismatch

Categories

Resources