How to add a feature using a pipeline and FeatureUnion - python

In the code below I use a Twitter dataset to perform sentiment analysis. I use a pipeline which performs the following steps:
1) performs some basic text preprocessing
2) vectorizes the tweet text
3) adds an extra feature (text length)
4) performs classification
I would like to add one more feature: the scaled number of followers. I wrote a function that takes the whole dataframe (df) as input and returns a new dataframe with a scaled number of followers. However, I am finding it challenging to add this step to the pipeline, i.e. to add this feature to the other features using the sklearn pipeline.
Any help or advice on this problem will be much appreciated.
The question and code below are inspired by Ryan's post on pipelines.
import re
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler

def import_data(filename, sep, eng, header=None, skiprows=1):
    # read csv
    dataset = pd.read_csv(filename, sep=sep, engine=eng, header=header, skiprows=skiprows)
    # rename columns
    dataset.columns = ['text', 'followers', 'sentiment']
    return dataset

df = import_data('apple_v3.txt', '\t', 'python')
X, y = df.text, df.sentiment
X_train, X_test, y_train, y_test = train_test_split(X, y)
tokenizer = nltk.casual.TweetTokenizer(preserve_case=False, reduce_len=True)
count_vect = CountVectorizer(tokenizer=tokenizer.tokenize)
classifier = LogisticRegression()

def get_scalled_followers(df):
    # scale the followers column to [0, 1]
    scaler = MinMaxScaler()
    df[['followers']] = df[['followers']].astype(float)
    df[['followers']] = scaler.fit_transform(df[['followers']])
    return df

def get_tweet_length(text):
    return len(text)

def genericize_mentions(text):
    return re.sub(r'@[\w_-]+', 'thisisanatmention', text)

def reshape_a_feature_column(series):
    return np.reshape(np.asarray(series), (len(series), 1))

def pipelinize_feature(function, active=True):
    def list_comprehend_a_function(list_or_series, active=True):
        if active:
            processed = [function(i) for i in list_or_series]
            return reshape_a_feature_column(processed)
        else:
            return reshape_a_feature_column(np.zeros(len(list_or_series)))
    return FunctionTransformer(list_comprehend_a_function, validate=False, kw_args={'active': active})

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn_helpers import pipelinize, genericize_mentions, train_test_and_evaluate

sentiment_pipeline = Pipeline([
    ('genericize_mentions', pipelinize(genericize_mentions, active=True)),
    ('features', FeatureUnion([
        ('vectorizer', count_vect),
        ('post_length', pipelinize_feature(get_tweet_length, active=True))
    ])),
    ('classifier', classifier)
])
sentiment_pipeline, confusion_matrix = train_test_and_evaluate(sentiment_pipeline, X_train, y_train, X_test, y_test)

The best explanation I have found so far is in the following post: pipelines.
My data includes heterogeneous features, and the following step-by-step approach works well and is easy to understand:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# step 1 - select data from the dataframe and split the dataset into train and test sets
features = [c for c in df.columns.values if c not in ['sentiment']]
numeric_features = [c for c in df.columns.values if c not in ['text', 'sentiment']]
target = 'sentiment'
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.33, random_state=42)

# step 2 - create a number selector class and a text selector class.
# These classes allow specific columns to be selected from the dataframe.
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.key]]

class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.key]

# step 3 - create one pipeline for the text data and one for the numerical data
text = Pipeline([
    ('selector', TextSelector(key='text')),
    ('tfidf', TfidfVectorizer(stop_words='english'))
])
text.fit_transform(X_train)

followers = Pipeline([
    ('selector', NumberSelector(key='followers')),
    ('standard', MinMaxScaler())
])
followers.fit_transform(X_train)

# step 4 - features union
feats = FeatureUnion([('text', text),
                      ('followers', followers)])
feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

# step 5 - add the classifier and predict
pipeline = Pipeline([
    ('features', feats),
    ('classifier', SVC(kernel='linear', probability=True, C=1, class_weight='balanced'))
])
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
np.mean(preds == y_test)

# step 6 - use the model to predict new data not included in the test set
# in my example the pipeline expects a dataframe as input with a column called 'text' and a column called 'followers'
array = [["#apple is amazing", 25000]]
dfObj = pd.DataFrame(array, columns=['text', 'followers'])
# prints the expected class, e.g. positive or negative sentiment
print(pipeline.predict(dfObj))
# print the probability for each class
print(pipeline.predict_proba(dfObj))

You can use FeatureUnion to combine the features extracted from the different columns of your dataframe. You should feed the dataframe to the pipeline and use FunctionTransformer to extract specific columns. It might look like this (I haven't run it, so some errors are possible):
from sklearn.preprocessing import FunctionTransformer

sentiment_pipeline = Pipeline([
    ('all_features', FeatureUnion([
        # your added feature (maybe you'll need to reshape it so ndim == 2)
        ('scaled_followers', FunctionTransformer(lambda df: get_scalled_followers(df).values,
                                                 validate=False)),
        # previous features
        ('text_features', Pipeline([
            ('extractor', FunctionTransformer(lambda df: df.text.values, validate=False)),
            ('genericize_mentions', pipelinize(genericize_mentions, active=True)),
            ('features', FeatureUnion([
                ('vectorizer', count_vect),
                ('post_length', pipelinize_feature(get_tweet_length, active=True))
            ])),
        ]))
    ])),
    ('classifier', classifier)
])
sentiment_pipeline, confusion_matrix = train_test_and_evaluate(sentiment_pipeline, df_train, y_train, df_test, y_test)
Another solution could be to not use Pipeline at all and just stack the features together with np.hstack, for example:
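A minimal sketch (assuming df, count_vect, classifier, get_scalled_followers and get_tweet_length as defined in the question):
import numpy as np

text_vectors = count_vect.fit_transform(df['text']).toarray()
tweet_lengths = np.array([get_tweet_length(t) for t in df['text']]).reshape(-1, 1)
followers = get_scalled_followers(df)['followers'].values.reshape(-1, 1)

# stack everything column-wise into one dense feature matrix
X = np.hstack([text_vectors, tweet_lengths, followers])
classifier.fit(X, df['sentiment'])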

Related

How to assign cv_results_['params'] from GridSearchCV

I want to build a dataframe from cv_results_, but GridSearchCV gives a list back. I don't know how to line up mat with ind to make a dataframe.
Here is an example:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)
# Split the data into training/testing sets
X_train = diabetes_X[:-20]
X_test = diabetes_X[-20:]
# Split the targets into training/testing sets
y_train = diabetes_y[:-20]
y_test = diabetes_y[-20:]

pipeline = Pipeline([
    ('kbest', SelectKBest(f_classif)),
    ('regressor', KNeighborsRegressor())
])
parameters = {'kbest__k': list(range(1, X_train.shape[1] + 1)),
              'regressor__n_neighbors': list(range(1, 21))}
grid = GridSearchCV(pipeline, parameters)
grid.fit(X_train, y_train)
mat = grid.cv_results_['mean_test_score']
ind = grid.cv_results_['params']
The output of params looks like this:
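params is a list of dicts, one per candidate, roughly of this shape (illustrative values):
[{'kbest__k': 1, 'regressor__n_neighbors': 1},
 {'kbest__k': 1, 'regressor__n_neighbors': 2},
 ...]
Since it is a list of dicts, one way to combine it with the scores into a dataframe is a sketch like this (pd.DataFrame accepts a list of dicts directly):
import pandas as pd
results = pd.DataFrame(ind)               # one column per hyperparameter
results['mean_test_score'] = mat          # scores align with their candidates
Alternatively, pd.DataFrame(grid.cv_results_) builds the full results table in one call.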

Performing CountVectorizer on multiple columns [duplicate]

While using this as a model for spam classification, I'd like to add an additional feature of the Subject plus the body.
I have all of my features in a pandas dataframe. For example, the subject is df['Subject'], the body is df['body_text'] and the spam/ham label is df['ham/spam']
I receive the following error:
TypeError: 'FeatureUnion' object is not iterable
How can I use both df['Subject'] and df['body_text'] as features all while running them through the pipeline function?
import numpy
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix, f1_score

features = df[['Subject', 'body_text']].values
combined_2 = FeatureUnion(list(features))

pipeline = Pipeline([
    ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf_transformer', TfidfTransformer()),
    ('classifier', MultinomialNB())])
pipeline.fit(combined_2, df['ham/spam'])

k_fold = KFold(n=len(df), n_folds=6)
scores = []
confusion = numpy.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold:
    train_text = combined_2.iloc[train_indices]
    train_y = df.iloc[test_indices]['ham/spam'].values
    test_text = combined_2.iloc[test_indices]
    test_y = df.iloc[test_indices]['ham/spam'].values
    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)
    prediction_prob = pipeline.predict_proba(test_text)
    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, pos_label='spam')
    scores.append(score)
FeatureUnion was not meant to be used that way. It takes two or more feature extractors / vectorizers and applies them to the input; it does not take data in the constructor the way it is shown.
CountVectorizer expects a sequence of strings. The easiest way to provide that is to concatenate the strings together. This passes the text from both columns to the same CountVectorizer.
combined_2 = df['Subject'] + ' ' + df['body_text']
An alternative method would be to run CountVectorizer and optionally TfidfTransformer individually on each column, and then stack the results.
import scipy.sparse as sp
subject_vectorizer = CountVectorizer(...)
subject_vectors = subject_vectorizer.fit_transform(df['Subject'])
body_vectorizer = CountVectorizer(...)
body_vectors = body_vectorizer.fit_transform(df['body_text'])
combined_2 = sp.hstack([subject_vectors, body_vectors], format='csr')
A third option is to implement your own transformer that would extract a dataframe column.
class DataFrameColumnExtracter(TransformerMixin):
    def __init__(self, column):
        self.column = column
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.column]
In that case you could use FeatureUnion on two pipelines, each containing your custom transformer, then CountVectorizer.
from sklearn.pipeline import make_pipeline, make_union

subj_pipe = make_pipeline(
    DataFrameColumnExtracter('Subject'),
    CountVectorizer()
)
body_pipe = make_pipeline(
    DataFrameColumnExtracter('body_text'),
    CountVectorizer()
)
feature_union = make_union(subj_pipe, body_pipe)
This feature union of pipelines will take the dataframe, and each pipeline will process its own column. It will produce the concatenation of the term-count matrices from the two columns.
sparse_matrix_of_counts = feature_union.fit_transform(df)
This feature union can also be added as the first step in a larger pipeline.
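For instance, a sketch that chains it with a classifier (assuming MultinomialNB as the final estimator):
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

full_pipeline = make_pipeline(feature_union, MultinomialNB())
full_pipeline.fit(df, df['ham/spam'])   # the union extracts and vectorizes both columns
predictions = full_pipeline.predict(df)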

How to add a feature to a vectorized data set?

I want to write a naive Bayes text classifier.
Because sklearn does not accept 'text form' features, I am transforming them using TfidfVectorizer.
I was able to successfully create such a classifier using only the transformed data as features. The code looks like this:
### text vectorization--go from strings to lists of numbers
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train_transformed = vectorizer.fit_transform(X_train_raw['url'])
X_test_transformed = vectorizer.transform(X_test_raw['url'])
### feature selection, because text is super high dimensional and
### can be really computationally chewy as a result
selector = SelectPercentile(f_classif, percentile=1)
selector.fit(X_train_transformed, y_train_raw)
X_train = selector.transform(X_train_transformed).toarray()
X_test = selector.transform(X_test_transformed).toarray()
clf = GaussianNB()
clf.fit(X_train, y_train_raw)
.....
Everything works as intended, but I am having problems when I want to add another feature, e.g. a flag indicating whether the given text contains a certain keyword.
I tried multiple things to properly transform the 'url' feature and then combine the transformed feature with another boolean feature, but I was unsuccessful.
Any tips on how it should be done, assuming that I have a pandas frame containing two features: 'url' (which I want to transform) and a 'contains_keyword' flag?
The solution which failed looks like this:
vectorizer = CountVectorizer(min_df=1)
X_train_transformed = vectorizer.fit_transform(X_train_raw['url'])
X_test_transformed = vectorizer.transform(X_test_raw['url'])
selector = SelectPercentile(f_classif, percentile=1)
selector.fit(X_train_transformed, y_train_raw)
X_train_selected = selector.transform(X_train_transformed)
X_test_selected = selector.transform(X_test_transformed)
X_train_raw['transformed_url'] = X_train_selected.toarray().tolist()
X_train_without = X_train_raw.drop(['url'], axis=1)
X_train = X_train_without.values
This produces rows containing a boolean flag and a list, which is the wrong input for an sklearn model. I have no idea how I should properly transform this. Grateful for any help.
Here are test data:
url,target,ads_keyword
googleadapis l google com,1,True
googleadapis l google com,1,True
clients1 google com,1,False
c go-mpulse net,1,False
translate google pl,1,False
url - the split domain taken from the DNS query
target - the target class for classification
ads_keyword - a flag indicating whether the 'url' contains the word 'ads'
I want to transform the 'url' using TfidfVectorizer and use the transformed data together with 'ads_keyword' (and possibly more features in the future) as the features used to train the naive Bayes model.
Here is a demo showing how to union features and how to tune hyperparameters using GridSearchCV.
Unfortunately, your sample data set is too tiny to train a real model...
try:
    from pathlib import Path
except ImportError:  # Python 2
    from pathlib2 import Path
import os
import re
from pprint import pprint
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.externals import joblib
from scipy.sparse import csr_matrix, hstack

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, name=None, position=None,
                 as_cat_codes=False, sparse=False):
        self.name = name
        self.position = position
        self.as_cat_codes = as_cat_codes
        self.sparse = sparse

    def fit(self, X, y=None):
        return self

    def transform(self, X, **kwargs):
        if self.name is not None:
            col_pos = X.columns.get_loc(self.name)
        elif self.position is not None:
            col_pos = self.position
        else:
            raise Exception('either [name] or [position] parameter must be not-None')
        if self.as_cat_codes and X.dtypes.iloc[col_pos] == 'category':
            ret = X.iloc[:, col_pos].cat.codes
        else:
            ret = X.iloc[:, col_pos]
        if self.sparse:
            ret = csr_matrix(ret.values.reshape(-1, 1))
        return ret

union = FeatureUnion([
    ('text',
     Pipeline([
         ('select', ColumnSelector('url')),
         #('pct', SelectPercentile(percentile=1)),
         ('vect', TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                  stop_words='english')),
     ])),
    ('ads',
     Pipeline([
         ('select', ColumnSelector('ads_keyword', sparse=True,
                                   as_cat_codes=True)),
         #('scale', StandardScaler(with_mean=False)),
     ])),
])

pipe = Pipeline([
    ('union', union),
    ('clf', MultinomialNB())
])

param_grid = [
    {
        'union__text__vect': [TfidfVectorizer(sublinear_tf=True,
                                              max_df=0.5,
                                              stop_words='english')],
        'clf': [SGDClassifier(max_iter=500)],
        'union__text__vect__ngram_range': [(1, 1), (2, 5)],
        'union__text__vect__analyzer': ['word', 'char_wb'],
        'clf__alpha': np.logspace(-5, 0, 6),
        #'clf__max_iter': [500],
    },
    {
        'union__text__vect': [TfidfVectorizer(sublinear_tf=True,
                                              max_df=0.5,
                                              stop_words='english')],
        'clf': [MultinomialNB()],
        'union__text__vect__ngram_range': [(1, 1), (2, 5)],
        'union__text__vect__analyzer': ['word', 'char_wb'],
        'clf__alpha': np.logspace(-4, 2, 7),
    },
    #{ # NOTE: does NOT support sparse matrices!
    #    'union__text__vect': [TfidfVectorizer(sublinear_tf=True,
    #                                          max_df=0.5,
    #                                          stop_words='english')],
    #    'clf': [GaussianNB()],
    #    'union__text__vect__ngram_range': [(1, 1), (2, 5)],
    #    'union__text__vect__analyzer': ['word', 'char_wb'],
    #},
]

gs_kwargs = dict(scoring='roc_auc', cv=3, n_jobs=1, verbose=2)
X_train, X_test, y_train, y_test = \
    train_test_split(df[['url', 'ads_keyword']], df['target'], test_size=0.33)

grid = GridSearchCV(pipe, param_grid=param_grid, **gs_kwargs)
grid.fit(X_train, y_train)

# prediction
predicted = grid.predict(X_test)
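After the search finishes, the winning configuration can be inspected (a usage sketch):
print(grid.best_score_)    # best mean cross-validated ROC AUC
print(grid.best_params_)   # the estimator and hyperparameters that won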

How to combine LabelBinarizer and OneHotEncoder in pipeline in python for categorical variables?

I have looked for the right tutorials and Q&A on Stack Overflow for the last few days without finding the right guide, primarily because examples showing a use case for LabelBinarizer or OneHotEncoder don't show how they are incorporated into a pipeline, and vice versa.
I have a dataset with 4 variables:
num1 num2 cate1 cate2
3 4 Cat 1
9 23 Dog 0
10 5 Dog 1
num1 and num2 are numeric variables; cate1 and cate2 are categorical variables. I understand that I need to encode the categorical variables somehow before fitting an ML algorithm, but after multiple tries I am not quite sure how to do that in a pipeline.
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np

# Class that selects columns by name
class Columns(BaseEstimator, TransformerMixin):
    def __init__(self, names=None):
        self.names = names
    def fit(self, X, y=None, **fit_params):
        return self
    def transform(self, X):
        return X[self.names]

# Separate target from training features
y = df['MED']
X = df.drop('MED', axis=1)
X_selected = X.filter(['num1', 'num2', 'cate1', 'cate2'])
# from the selected X, further choose categorical only
X_selected_cat = X_selected.filter(['cate1', 'cate2'])  # hand-selected since some cat vars have values 0, 1
# Find the numerical columns, exclude categorical columns
X_num_cols = X_selected.columns[X_selected.dtypes.apply(lambda c: np.issubdtype(c, np.number))]  # numeric column names, automated here
X_cat_cols = X_selected_cat.columns  # categorical column names, hand-selected above

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y,
                                                    test_size=0.5,
                                                    random_state=567,
                                                    stratify=y)

# Pipeline
pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=X_num_cols), StandardScaler())),
        ('categorical', make_pipeline(Columns(names=X_cat_cols)))
    ])),
    ('LR_model', LogisticRegression()),
])
This gives me the error ValueError: could not convert string to float: 'Cat'.
Replacing the 'categorical' line with
('categorical', make_pipeline(Columns(names=X_cat_cols), OneHotEncoder()))
gives me the same ValueError: could not convert string to float: 'Cat'.
Replacing the 'categorical' line with
('categorical', make_pipeline(Columns(names=X_cat_cols), LabelBinarizer(), OneHotEncoder()))
gives me a different error: TypeError: fit_transform() takes 2 positional arguments but 3 were given.
Replacing the 'numeric' line with
('numeric', make_pipeline(Columns(names=X_num_cols), LabelBinarizer())),
gives me the same TypeError: fit_transform() takes 2 positional arguments but 3 were given.
Taking up Marcus' suggestion, I tried but was unable to install the scikit-learn dev version; however, I found something similar called category_encoders.
Changing the code to this works:
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
import category_encoders as CateEncoder

# Class that selects columns by name
class Columns(BaseEstimator, TransformerMixin):
    def __init__(self, names=None):
        self.names = names
    def fit(self, X, y=None, **fit_params):
        return self
    def transform(self, X):
        return X[self.names]

# Separate target from training features
y = df['MED']
X = df.drop('MED', axis=1)
X_selected = X.filter(['num1', 'num2', 'cate1', 'cate2'])
# from the selected X, further choose categorical only
X_selected_cat = X_selected.filter(['cate1', 'cate2'])  # hand-selected since some cat vars have values 0, 1
# Find the numerical columns, exclude categorical columns
X_num_cols = X_selected.columns[X_selected.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
X_cat_cols = X_selected_cat.columns

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y,
                                                    test_size=0.5,
                                                    random_state=567,
                                                    stratify=y)

# Pipeline: the categorical columns go through a BinaryEncoder
pipe = Pipeline([
    ("features", FeatureUnion([
        ('numeric', make_pipeline(Columns(names=X_num_cols), StandardScaler())),
        ('categorical', make_pipeline(Columns(names=X_cat_cols), CateEncoder.BinaryEncoder()))
    ])),
    ('LR_model', LogisticRegression()),
])
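From here the pipeline can be fitted and evaluated like any other estimator (a usage sketch):
pipe.fit(X_train, y_train)
preds = pipe.predict(X_test)
print(np.mean(preds == y_test))   # simple accuracy check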
Personally, I prefer to use LabelEncoder.
Here is a toy example.
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn import linear_model

df = pd.DataFrame({'y': [10, 2, 3, 4, 5, 6, 7, 8],
                   'a': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'b'],
                   'b': ['a', 'b', 'a', 'b', 'a', 'b', 'b', 'b'],
                   'c': ['a', 'b', 'a', 'a', 'a', 'b', 'b', 'b']})
df
I define a class to select columns:
class MultiColumn():
    def __init__(self, columns=None):
        self.columns = columns  # array of column names to encode
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.columns]
Now I define a class to preprocess with LabelEncoder:
lb = df[['a', 'c']]

class MyLEncoder():
    def transform(self, X, **fit_params):
        enc = preprocessing.LabelEncoder()
        enc_data = []
        for i in list(lb.columns):
            encc = enc.fit(lb[i])
            enc_data.append(encc.transform(X[i]))
        return np.asarray(enc_data).T
    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)
    def fit(self, X, y, **fit_params):
        return self
I use a for-loop because we can apply LabelEncoder only to a single vector at a time.
Pipeline
X = df[['a', 'b', 'c']]
y = df['y']
regressor = linear_model.SGDRegressor()

pipeline = Pipeline([
    # Use FeatureUnion to combine the features
    ('union', FeatureUnion(
        transformer_list=[
            # categorical
            ('categorical', Pipeline([
                ('selector', MultiColumn(columns=['a', 'c'])),
                ('one_hot', MyLEncoder())
            ])),
        ])),
    # Use a regression
    ('model_fitting', linear_model.SGDRegressor()),
])
pipeline.fit(X, y)
pipeline.predict(X)
And check it on new data:
new = pd.DataFrame({'y': [3, 8], 'a': ['a', 'b'], 'c': ['b', 'a'], 'b': [3, 6]})
pipeline.predict(new)
Similarly, we can do this for any method of preprocessing categorical data.
The fit and transform signatures of LabelBinarizer and LabelEncoder are not compatible with Pipeline, so create your own custom transformer with the needed signature:
class LabelBinarizerPipelineFriendly(LabelBinarizer):
    def fit(self, X, y=None):
        """this would allow us to fit the model based on the X input."""
        super(LabelBinarizerPipelineFriendly, self).fit(X)
        return self
    def transform(self, X, y=None):
        return super(LabelBinarizerPipelineFriendly, self).transform(X)
    def fit_transform(self, X, y=None):
        return super(LabelBinarizerPipelineFriendly, self).fit(X).transform(X)
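The subclass can then be used like any other transformer; a minimal sketch on the cate1 column from the example data:
lb = LabelBinarizerPipelineFriendly()
encoded = lb.fit_transform(df['cate1'])   # e.g. 'Cat'/'Dog' becomes a 0/1 indicator column
Inside a pipeline, place it after a selector that returns a single column, since LabelBinarizer encodes one label vector at a time.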

Pipeline with meta classifier

I am trying to train a meta classifier on different features from a pandas dataframe.
The features are either text or categorical in nature.
I am having issues fitting the model, with the following error: 'Found input variables with inconsistent numbers of samples: [1, 48678]'. I understand what the error means, but not how to fix it. Help much appreciated!
The code I am using is as follows:
import pandas as pd
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# set target label
target_label = ['target']
features = ['cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5',
            'text_1']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cleansed_data[features],
                                                    cleansed_data[target_label], test_size=0.2, random_state=0)

text_features = ['text_1']
categorical_features = ['cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5']
# encoder
le = preprocessing.LabelEncoder()
# vectoriser
vectoriser = TfidfVectorizer()
# classifiers
mlp_clf = MLPClassifier()
rf_clf = RandomForestClassifier()

from sklearn.base import TransformerMixin, BaseEstimator
class SelectColumnsTransfomer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=[]):
        self.columns = columns
    def transform(self, X, **transform_params):
        trans = X[self.columns].copy()
        return trans
    def fit(self, X, y=None, **fit_params):
        return self

# text pipeline
text_steps = [('feature extractor', SelectColumnsTransfomer(text_features)),
              ('tf-idf', vectoriser),
              ('classifier', mlp_clf)]
# categorical pipeline
categorical_steps = [('feature extractor',
                      SelectColumnsTransfomer(categorical_features)),
                     ('label encode', le),
                     ('classifier', rf_clf)]

pl_text = Pipeline(text_steps)
pl_categorical = Pipeline(categorical_steps)
pl_text.fit(X_train, y_train)

from mlxtend.classifier import StackingCVClassifier
sclf = StackingCVClassifier(classifiers=[pl_text, pl_categorical],
                            use_probas=True,
                            meta_classifier=LogisticRegression())
EDIT: Here is some code that recreates the issue: 'ValueError: Found input variables with inconsistent numbers of samples: [1, 3]'
d = {'cat_1': ['A', 'A', 'B'], 'cat_2': ['G', 'H', 'I'],
     'cat_3': ['AA', 'DD', 'PP'], 'cat_4': ['X', 'B', 'V'],
     'cat_5': [1, 2, 3],
     'text_1': ['the cat sat on the mat', 'the mat sat on the cat', 'sat on the cat mat']}
features = pd.DataFrame(data=d)
t = [0, 1, 0]
target = pd.DataFrame(data=t)

text_features = ['text_1']
categorical_features = ['cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5']

# text pipeline
text_steps = [('feature extractor', SelectColumnsTransfomer(text_features)),
              ('tf-idf', vectoriser),
              ('classifier', mlp_clf)]
# categorical pipeline
categorical_steps = [('feature extractor',
                      SelectColumnsTransfomer(categorical_features)),
                     ('label encode', le),
                     ('classifier', rf_clf)]

pl_text = Pipeline(text_steps)
pl_categorical = Pipeline(categorical_steps)
pl_text.fit(features, target)

from mlxtend.classifier import StackingCVClassifier
sclf = StackingCVClassifier(classifiers=[pl_text, pl_categorical],
                            use_probas=True,
                            meta_classifier=LogisticRegression())
sclf.fit(features, target)
OK, I managed to get it to work by replacing text_features = ['text_1']
with text_features = 'text_1'.
Basically, when you pass ['text_1'] to the SelectColumnsTransfomer class, it returns a DataFrame object, which the tf-idf vectoriser sees as one single input. The vectoriser applies fit_transform in your pipeline and returns a single value, which cannot be used to predict three target values.
If you pass in 'text_1', this will get you a Series, and the vectoriser will correctly identify that you have three strings as features. Your text pipeline will now work.
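The difference is easy to see in isolation (a sketch using the features dataframe from the question):
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer()
# iterating over a DataFrame yields its column names, so the only 'document' is the string 'text_1'
print(v.fit_transform(features[['text_1']]).shape)   # (1, 1)
# iterating over a Series yields its values, so each tweet is a separate document
print(v.fit_transform(features['text_1']).shape)     # (3, 5)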
