Create sklearn pipeline with column operations step - python

I want to know how to insert into a sklearn pipeline a step that multiplies the values of two columns and deletes the original ones.
I'm doing something like that.
After loading the Dataframe, I multiply the target columns and delete them.
Prepare X, Y, training set and test set.
Configure pipeline with StandardScaler and some ML method (for example Linear Regression)
Fit and predict.
import pandas as pd, numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
# df is a pandas dataframe with columns A, B, C, Y
from sklearn.model_selection import train_test_split

# Build the product feature first: B and C can only be dropped once BC exists,
# and BC is selected as a feature below.
df['BC'] = df['B'] * df['C']
df.drop(columns=['B', 'C'], inplace=True)

# Features and target.
X = df.loc[:, ['A', 'BC']]
Y = df['Y']
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8)

# Scale the features, then fit a linear regression on the scaled values.
# NOTE(review): step names are placeholders; the original literal was cut off.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
With this approach, when I want to make some prediction of new data, I must pass the multiplication, for example A=1, B=3, C=4
And I want an approach like
What I want, is modify pipeline for something like
pipe = Pipeline([
('product', CustomFunction(columns_to_multiply, result_name_column)),
Is it possible with scikit-learn or custom functions? How?

I am unable to fully test your code because the data is missing. However, you may be able to use FunctionTransformer as follows:
def CustomMultiplier(arrs):
    """Keep the first column and replace the remaining ones with their product.

    Parameters
    ----------
    arrs : 2-D numpy array
        Column 0 is passed through unchanged; columns 1.. are multiplied
        together row by row.

    Returns
    -------
    2-D numpy array with two columns: the original first column and the
    row-wise product of all other columns.
    """
    a = arrs[:, 0]
    # Row-wise product over every column except the first.
    b = np.prod(arrs[:, 1:], axis=1)
    return np.column_stack((a, b))
if __name__ == '__main__':
    from sklearn.preprocessing import FunctionTransformer

    # Wrap the plain function so it can be used like any other transformer
    # (and therefore as a pipeline step).
    transformer = FunctionTransformer(CustomMultiplier)
    X = np.array([[1, 3, 4], [2, 4, 5]])
    result = transformer.transform(X)
    print(result)
[[ 1 12]
[ 2 20]]


implement custom one-hot-encoding function for sklearn pipeline

In relation to the question posted in One Hot Encoding preserve the NAs for imputation, I am trying to create a custom function that handles NAs when one-hot encoding categorical variables. The setup should be suitable for a train/test split and for modelling with a sklearn pipeline.
A simple reproducible example of my problem:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
# Make some categorical data X and a response y and split it.
X = pd.DataFrame(columns=["1","2"],data = [["A",np.nan],["B","A"],[np.nan,"A"],[np.nan,"B"],["B","A"],["A","B"],["C","B"],["D","E"]])
y = pd.DataFrame(data = np.array([1,5,4,6,2,3,9,9]))
X_train, X_test, Y_train, Y_test = train_test_split(X,y,test_size=0.2,random_state=42)
I've then created a custom function that does OHE with nan (using the procedure described in Cyclical Loop Between OneHotEncoder and KNNImpute in Scikit-learn)
class OHE_with_nan(BaseEstimator,TransformerMixin):
    """ OHE with NAN. Not super pretty but works.. """

    def __init__(self, copy=True):
        self.copy = copy

    def fit(self, X, y = None):
        """ This transformer does not use a fit procedure """
        return self

    def transform(self, X, y = None):
        """ Return the new object here"""
        # Replace nans with "Missing" such that OneHotEncoder can work.
        enc_missing = SimpleImputer(strategy="constant",fill_value="missing")
        data1 = pd.DataFrame(columns=X.columns,data = enc_missing.fit_transform(X))
        #Perform standard OHE
        # NOTE: the encoder is created and fitted here, on whatever data is
        # passed to transform, so the output columns depend on that data.
        OHE = OneHotEncoder(sparse=False,handle_unknown="ignore")
        OHE_fit = OHE.fit_transform(data1)
        #save feature names of the OHE dataframe
        data_OHE = pd.DataFrame(columns=OHE.get_feature_names(data1.columns),data = OHE_fit)
        # Initialize
        Column_names = data1.columns
        Final_OHE = pd.DataFrame()
        # Loop over columns to replace 0s with nan the correct places.
        for i in range(len(data1.columns)):
            # All one-hot columns that belong to the i-th original column.
            tmp_data = data_OHE[data_OHE.columns[pd.Series(data_OHE.columns).str.startswith(Column_names[i])]]
            # The last column of the group is taken to be the "missing" indicator.
            missing_name = tmp_data.iloc[:,-1:].columns
            missing_index = np.where(tmp_data[missing_name]==1)[0]
            tmp_data.loc[missing_index,:] = np.nan
            tmp_data1 = tmp_data.drop(missing_name,axis=1)
            Final_OHE = pd.concat([Final_OHE, tmp_data1], axis=1)
        return Final_OHE
This is then combined into a pipeline that predicts y using ridge regression (random choice of model, just for the example..)
# NOTE(review): the leading steps were lost from this snippet. They are
# reconstructed from the surrounding text (the custom encoder feeding a
# KNNImputer, as named in the error message) -- confirm step names and arguments.
Estimator = Pipeline([
    ('OHE_with_nan', OHE_with_nan()),
    ('Imputer', KNNImputer()),
    ('Model',Ridge(alpha = 0.01))
])
The procedure can be fitted:
# Fit the whole pipeline on the training split.
pipe_fit = Estimator.fit(X_train, Y_train)
But testing on unseen data fails:
pipe_fit.score(X_test, Y_test)
ValueError: X has 2 features, but KNNImputer is expecting 7 features as input.
This is because handle_unknown = "ignore" in the OneHotEncoder within OHE_with_nan is no longer "active", as it has been wrapped into my custom function.
If one simply uses OneHotEncoder(handle_unknown = "ignore") directly in the pipeline, everything works fine (but that's not my intention as this "removes" the nans from the data I try to impute.)
My question
How do I enable handle_unknown = "ignore" in my custom function such that it can perform in a pipeline setup on unseen data as well?
Hope you understand my situation - any help is highly appreciated!
I think the main problem is that you need to save more information (especially, the internal OneHotEncoder) at fit time. I also made the missing-column identification a little more robust (I think maybe you were relying on the ordering putting that last, but that only held for your sample data because of alphabetical order?). I didn't spend much time cleaning things up or looking for efficiencies.
class OHE_with_nan(BaseEstimator, TransformerMixin):
    """One-hot encode, propagating NaNs.

    Requires a dataframe as input!

    Missing values are filled with the placeholder "MISSING" before encoding.
    After encoding, every row whose placeholder indicator is set gets NaN in
    all one-hot columns of that original column, and the placeholder column
    itself is dropped.
    """

    def fit(self, X, y=None):
        """Fit the imputer and the encoder on X and store the output column names.

        Returns self.
        """
        self.orig_cols_ = X.columns
        self.imputer_ = SimpleImputer(strategy="constant", fill_value="MISSING")
        X_filled = self.imputer_.fit_transform(X)
        self.ohe_ = OneHotEncoder(sparse=False, handle_unknown="ignore")
        # The encoder has to be fitted before its feature names can be read,
        # and so that transform() reuses the categories seen here.
        self.ohe_.fit(X_filled)
        self.ohe_colnames_ = self.ohe_.get_feature_names(X.columns)
        self.missing_value_columns = np.array(["MISSING" in col for col in self.ohe_colnames_])
        return self

    def transform(self, X, y=None):
        """Encode X with the fitted encoder and put NaN back where values were missing.

        Returns a dataframe with the one-hot columns of every original column,
        excluding the "MISSING" indicator columns.
        """
        raw_ohe = pd.DataFrame(self.ohe_.transform(self.imputer_.transform(X)), columns=self.ohe_colnames_)
        out_list = []
        # Loop over columns to replace 0s with nan the correct places.
        for orig_col in self.orig_cols_:
            # Work on a copy so the assignment below never targets a view of raw_ohe.
            tmp_data = raw_ohe[self.ohe_colnames_[pd.Series(self.ohe_colnames_).str.startswith(orig_col)]].copy()
            missing_name = tmp_data.columns[["MISSING" in col for col in tmp_data.columns]]
            # Row positions where the placeholder indicator is set; raw_ohe has a
            # default integer index, so positions and labels coincide.
            missing_indices = np.where(tmp_data[missing_name]==1)[0]
            tmp_data.loc[missing_indices, :] = np.nan
            tmp_data1 = tmp_data.drop(missing_name, axis=1)
            out_list.append(tmp_data1)
        out = pd.concat(out_list, axis=1)
        return out

How to use OneHotEncoder and Pipeline to make new predictions?

I'm working through a tutorial focusing on OneHotEncoder. I get the idea behind encoding features, but I'm having a little problem with using the encoder with pipeline to make a new prediction. Two of the features--"Sex" and "Embarked"--are categorical rather than numerical. When creating a new numpy array to make a prediction, do you use the initial values, say "male" and "C", or, say, "1" and "2" to make a new prediction? I get the following error: " ValueError: Specifying the columns using strings is only supported for pandas DataFrames," which is weird given that the values I'm using are numerical. Regardless, would I have to fit the pipeline to X_new to make a new prediction? If so, how can I do that?
X_new = [[3, 1, 0]] OR X_new = [['3','male', 'C']]
Complete code:
import pandas as pd
import numpy as np
df = pd.read_csv("")
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='lbfgs')
from sklearn.model_selection import cross_val_score
cross_val_score(logreg, X, y, cv=5, scoring='accuracy').mean()
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)
X = df.drop('Survived', axis='columns')
from sklearn.compose import make_column_transformer
column_trans = make_column_transformer(
(OneHotEncoder(), ['Sex', 'Embarked']),
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(column_trans, logreg)
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()
X_new = [[3, 1, 0]]
When you apply OneHotEncoder, the categorical column that you specify will be transformed into multiple integer columns based on number of unique value in the categorical column.
For example, if the gender column contains 'male' and 'female', it will convert the original column into 2 columns, 'male' and 'female'. This is different from LabelEncoder.
If you want to apply a pipeline with logistic regression and OneHotEncoder, you can fit the pipeline on the training data with pipe.fit(X, y),
and then you can apply the prediction. This is an example when I apply 3 features as Sex, Age, and embarked and apply OHE to Sex and embarked.
X_new = [['female', 20, 'C']]
X_new_df = pd.DataFrame (X_new,columns=['Sex','Age','Embarked'])
However, the features that you use in your code are all features except the label ('Survived'), which is 11 features. The number of input columns must be equal to or greater than what the model was fitted on, while you supply only 3 columns, which may cause an error.

How to leave numerical columns out when using sklearn OneHotEncoder?

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
Sample data:
X_train = pd.DataFrame({'A': ['a1', 'a3', 'a2'],
'B': ['b2', 'b1', 'b3'],
'C': [1, 2, 3]})
y_train = pd.DataFrame({'Y': [1,0,1]})
Desired outcome:
I would like to include sklearn OneHotEncoder in my pipeline in this format:
encoder = ### SOME CODE ###
scaler = StandardScaler()
model = RandomForestClassifier(random_state=0)
# This is my ideal pipeline
pipe = Pipeline([('OneHotEncoder', encoder),
                 ('Scaler', scaler),
                 ('Classifier', model)])
pipe.fit(X_train, y_train)
OneHotEncoder is encoding everything including the numerical columns. I want to keep numerical columns as it is and encode only categorical features in an efficient way that's compatible with Pipeline().
encoder = OneHotEncoder(drop='first', sparse=False)
encoder.transform(X_train) # Columns C is encoded - this is what I want to avoid
Work around (not ideal): I can get around the problem using pd.get_dummies(). However, this means I can't include it in my pipeline. Or is there a way?
X_train = pd.get_dummies(X_train, drop_first=True)
My preferred solution for this would be to use sklearn's ColumnTransformer (see here).
It enables you to split the data in as many groups as you want (in your case, categorical vs numerical data) and apply different preprocessing operations to these groups. This transformer can then be used in a pipeline as any other sklearn preprocessing tool. Here is a short example:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
# Toy data: one numerical column ("a") and one categorical column ("b").
X = pd.DataFrame({"a": [1, 2, 3], "b": ["A", "A", "B"]})
y = np.array([0, 1, 1])

OHE = OneHotEncoder()
scaler = StandardScaler()
RFC = RandomForestClassifier()

cat_cols = ["b"]
num_cols = ["a"]

# Route each group of columns to its own preprocessing step.
transformer = ColumnTransformer([('cat_cols', OHE, cat_cols),
                                 ('num_cols', scaler, num_cols)])

pipe = Pipeline([("preprocessing", transformer),
                 ("classifier", RFC)])
pipe.fit(X, y)
NB: I have taken some license with your request because this only applies the scaler to the numerical data, which I believe makes more sense? If you do want to apply the scaler to all columns, you can do this as well by modifying this example.
What I would do is to create my own custom transformer and put it into pipeline. With this way, you will have a lot of power over the data in your hand. So, the steps are like below:
1) Create a custom transformer class inheriting BaseEstimator and TransformerMixin. In its transform() function try to detect the values of that column is either numerical or categorical. If you do not want to deal with the logic right now, you can always give column name for categorical columns to your transform() function to select on the fly.
2) (Optional) Create your custom transformer to handle columns with only categorical values.
3) (Optional) Create your custom transformer to handle columns with only numerical values.
4) Build two pipelines (one for categorical, the other for numerical) using transformers you created and you can also use the existing ones from sklearn.
5) Merge two pipelines with FeatureUnion.
6) Merge your big pipeline with your ML model.
7) Call fit_transform()
The sample code (no optionals implemented): GitHub Jupyter Notebook

Names for Feature Selection

I want to know the names of the features within my RF model. I read here that the output from gs.best_estimator_.named_steps["stepname"].feature_importances_ would mirror my columns from my data. However, the length of gs.best_estimator_.... is 10 and I have 13 columns. Some columns were not important. From other answers around (answer1, answer2), I would have to declare something within my pipeline. But I am confused as to what to declare because both answers deal with PCA, not RF.
Here is what I have so far.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import datasets
# use iris as example
iris = datasets.load_iris()
X = iris.drop(['sepal_length'],axis=1)
y = iris.sepal_length
cats_feats = ['species']
X_train, X_test, y_train, y_test = \
train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=13)
# Pipeline
categorical_transformer = Pipeline(steps=[
('onehot', OneHotEncoder(handle_unknown='ignore',sparse=False))
# Bundle any preprocessing
preprocessor = ColumnTransformer(
('cat', categorical_transformer, cat_feats)
rf = RandomForestRegressor(random_state = 13)
mymodel = Pipeline(steps = [('preprocessor', preprocessor),
('model', rf)
# For this example, I used default values. In reality I do use a dictionary of parameters
gs = GridSearchCV(mymodel
,n_jobs = -1
,cv = 5
Why the length of the feature list does not match
The length of your features does not match because all non-categorical columns are being discarded when you are using your ColumnTransformer. By default, it only keeps columns for which a transformation was specified. As a result, if you do not want this to happen, you need to do this
# remainder='passthrough' keeps the columns that have no transformer assigned,
# instead of discarding them.
preprocessor = ColumnTransformer(transformers=[('cat', OneHotEncoder(), cat_feats)],
                                 remainder='passthrough')
(I removed your categorical pipeline, which is not necessary here)
Also keep in mind that applying the OHE will add features and so the total number of features is going to be larger than what you had in the beginning.
How to get the feature names
Once you have fitted everything, you need to retrieve the feature names for the result of the OHE and the remaining numerical columns.
For the OHE columns:
cat_features = gs.best_estimator_["preprocessor"].named_transformers_["cat"].get_feature_names()
For the numerical columns, you need to declare num_feats where all numerical features are in the same order as in your original dataframe.
Then just do:
feature_names = np.concatenate((cat_features, num_feats))
PS: this is a bit cumbersome, and this might be improved in later sklearn versions, but as of now, this is the procedure

Dummify categorical variables for logistic regression with pandas and scikit (OneHotEncoder)

I read this blog about new things in scikit. The OneHotEncoder taking strings seems like a useful feature. Below my attempt to use this
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
cols = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_df = pd.read_csv('../../data/train.csv', usecols=cols)
test_df = pd.read_csv('../../data/test.csv', usecols=[e for e in cols if e != 'Survived'])
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test = test_df.copy()
ct = ColumnTransformer([("onehot", OneHotEncoder(sparse=False), ['Sex', 'Embarked'])], remainder='passthrough')
X_train_t = ct.fit_transform(train_df)
X_test_t = ct.fit_transform(test_df)
# [ 0. 1. 0. 0. 1. 0. 3. 22. 1. 0. 7.25]
# [ 0. 1. 0. 1. 0. 3. 34.5 0. 0. 7.8292]
logreg = LogisticRegression(max_iter=5000)
logreg.fit(X_train_t, Y_train)
Y_pred = logreg.predict(X_test_t) # ValueError: X has 10 features per sample; expecting 11
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
I encounter the below python error with this code and also I have some additional concerns.
ValueError: X has 10 features per sample; expecting 11
To start from the beginning .. this script is written for the "titanic" dataset from kaggle. We have five numerical columns Pclass, Age, SibSp, Parch and Fare. The columns Sex and Embarked are categories male/female and Q/S/C (which is an abbreviation for a city name).
What I understood from the OneHotEncoder is that it creates dummy variables by placing additional columns. Well actually the output of ct.fit_transform() is no longer a pandas dataframe but a numpy array now. But as seen in the print debug statement there are more than the original 7 columns now.
There are three problems I encounter:
For some reason the test.csv has one less column. That would indicate to me that there is one less option in one of the categories. To fix that I would have to find all the available options in the categories over both train + test data, and then use these options (such as male/female) to transform the train and the test data separately. I have no idea how to do this with the tools I'm working with (pandas, scikit, etc). On second thought .. after inspecting the data I cannot find the missing option in the test.csv ..
I want to avoid the "dummy variable trap". Right now it seems that there are too many columns created. I was expecting 1 column for Sex (total options 2 - 1 to avoid trap) and 2 for embarked. With the additional 5 numerical columns that would come to 8 total.
I don't recognize the output of the transform anymore. I would rather prefer a new dataframe where the new dummy columns have given their own name, such as Sex_male (1/0) Embarked_Q (1/0) and Embarked_S(1/0)
I'm only used to using gretl, there dummifying a variable and leaving out one option is very natural. I don't know in python if i'm doing it wrong or if this scenario is not part of the standard scikit toolkit. Any advice? Maybe I should write a custom encoder for this?
I will try and answer all your questions individually.
Answer for Question 1
In your code you have used fit_transform method both on your train and test data which is not the correct way of doing it. Generally, fit_transform is applied only on your train data set, and it returns a transformer which is then just used to transform your test data set. When you apply fit_transform on your test data, you just transform your test data with just the options/levels of the categorical variables available only in your test data set and it is very much possible that your test data may not contain all options/levels of all categorical variables, due to which the dimension of your train and test data set will differ resulting in the error which you have got.
So the correct way of doing it would be:
X_train_t = ct.fit_transform(X_train)
X_test_t = ct.transform(X_test)
Answer for Question 2
If you want to avoid the "dummy variable trap" you can make use of the parameter drop (by setting it to first) while creating the OneHotEncoder object in the ColumnTransformer, this will result in creating just one column for sex and two columns for Embarked since they have two and three options/levels respectively.
So the correct way of doing it would be:
ct = ColumnTransformer([("onehot", OneHotEncoder(sparse=False, drop="first"), ['Sex','Embarked'])], remainder='passthrough')
Answer for Question 3
As of now, a get_feature_names method that can be used to reconstruct your data frame with the new dummy columns is not fully implemented in sklearn yet. One workaround is to change remainder to drop in the ColumnTransformer construction and construct your data frame separately, as shown below:
ct = ColumnTransformer([("onehot", OneHotEncoder(sparse=False, drop="first"), ['Sex', 'Embarked'])], remainder='drop')
A = pd.concat([X_train.drop(["Sex", "Embarked"], axis=1), pd.DataFrame(X_train_t, columns=ct.get_feature_names())], axis=1)
which will result in something like this:
Your final code will look like this:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
cols = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_df = pd.read_csv('train.csv', usecols=cols)
test_df = pd.read_csv('test.csv', usecols=[e for e in cols if e != 'Survived'])

# Drop incomplete rows, then renumber the index: the encoded frames built below
# get a fresh 0..n-1 index, and the column-wise concat aligns rows on the index.
train_df = train_df.dropna().reset_index(drop=True)
test_df = test_df.dropna().reset_index(drop=True)

X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test = test_df.copy()

categorical_values = ['Sex', 'Embarked']
X_train_cont = X_train.drop(categorical_values, axis=1)
X_test_cont = X_test.drop(categorical_values, axis=1)

# Encode only the categorical columns; the numerical ones are re-attached by hand.
ct = ColumnTransformer([("onehot", OneHotEncoder(sparse=False, drop="first"), categorical_values)], remainder='drop')

# Fit on the training data only, then reuse the fitted encoder for the test data.
X_train_categorical = ct.fit_transform(X_train)
X_test_categorical = ct.transform(X_test)

X_train_t = pd.concat([X_train_cont, pd.DataFrame(X_train_categorical, columns=ct.get_feature_names())], axis=1)
X_test_t = pd.concat([X_test_cont, pd.DataFrame(X_test_categorical, columns=ct.get_feature_names())], axis=1)

logreg = LogisticRegression(max_iter=5000)
logreg.fit(X_train_t, Y_train)
Y_pred = logreg.predict(X_test_t)

acc_log = round(logreg.score(X_train_t, Y_train) * 100, 2)
And when you do X_train_t.head() you get
Recommended practice is what is suggested in @Parthasarathy Subburaj's answer, but I have seen in Kaggle and other competitions that people fit on the complete data (train+test). If you want to try the same, use the following format:
# Fit the encoder on train and test together, then transform each split on its own.
ct.fit(pd.concat([X_train, X_test]))
X_train_t, X_test_t = ct.transform(X_train), ct.transform(X_test)
ya, use drop='first' to get over this issue. At the same time, remember this multicollinearity problem is not a big deal for non-linear models such as neural networks or even decision trees. I believe that is the reason why it is not kept as the default arg param value.
get_feature_names is not implemented exhaustively for pipelines and other components in sklearn. Hence, it is not completely supported in ColumnTransformer either.
Based on my experience, I built this wrapper for ColumnTransformer, which works even when it contains pipelines or remainder='passthrough'.
This also picks up the feature names for get_feature_names instead of calling it as x0, x1 because we know the actual column names inside ColumnTransformer using _feature_names_in.
from sklearn.compose import ColumnTransformer
from sklearn.utils.validation import check_is_fitted
def _get_features_out(name, trans, features_in):
if hasattr(trans, 'get_feature_names'):
return [name + "__" + f for f in
return features_in
class NamedColumnTransformer(ColumnTransformer):
    """ColumnTransformer whose get_feature_names also handles pipelines and passthrough."""

    def get_feature_names(self):
        """Return the output feature names of all fitted transformers, in order.

        Raises AttributeError for a transformer that is neither a pipeline,
        'drop', 'passthrough', nor provides get_feature_names.
        """
        check_is_fitted(self)
        feature_names = []
        for name, trans, features, _ in self._iter(fitted=True):
            if trans == 'drop':
                # Dropped columns contribute nothing to the output.
                continue
            if trans == 'passthrough':
                # NOTE(review): relies on the private _feature_names_in attribute
                # to map the passthrough column selection back to input names --
                # confirm it exists in the sklearn version in use.
                feature_names.extend(self._feature_names_in[features])
            elif hasattr(trans, '_iter'):
                # Pipeline-like transformer: thread the names through each step.
                for _, op_name, t in trans._iter():
                    features = _get_features_out(op_name, t, features)
                feature_names.extend(features)
            elif not hasattr(trans, 'get_feature_names'):
                raise AttributeError("Transformer %s (type %s) does not "
                                     "provide get_feature_names."
                                     % (str(name), type(trans).__name__))
            else:
                feature_names.extend(_get_features_out(name, trans, features))
        return feature_names
Now, for your example,
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
# you can fetch the titanic dataset using this
X, y = fetch_openml("titanic", version=1,
as_frame=True, return_X_y=True)
# removing the columns which you are not using
X.drop(['name', 'ticket', 'cabin', 'boat', 'body', 'home.dest'],
axis=1, inplace=True)
X.reset_index(drop=True, inplace=True)
y = y[X.index]
categorical_values = ['sex', 'embarked']
ct = NamedColumnTransformer([("onehot", OneHotEncoder(
sparse=False, drop="first"), categorical_values)], remainder='passthrough')
clf = Pipeline(steps=[('preprocessor', ct),
('classifier', LogisticRegression(max_iter=5000))])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
# Feature names produced by the fitted preprocessing step.
clf[0].get_feature_names()
# ['onehot__sex_male',
# 'onehot__embarked_Q',
# 'onehot__embarked_S',
# 'pclass',
# 'age',
# 'sibsp',
# 'parch',
# 'fare']
pd.DataFrame(clf[0].transform(X_train), columns=clf[0].get_feature_names())
You can also try the NamedColumnTransformer for a more interesting example of ColumnTransformer here.
