How to access ColumnTransformer elements in GridSearchCV - python

I wanted to find out the correct naming convention for referring to an individual preprocessor inside a ColumnTransformer (which is itself part of a pipeline) in the param_grid for a grid search.
Environment & sample data:
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
df = sns.load_dataset('titanic')[['survived', 'age', 'embarked']]
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='survived'), df['survived'],
                                                    test_size=0.2, random_state=123)
Pipeline:
num = ['age']
cat = ['embarked']
num_transformer = Pipeline(steps=[('imputer', SimpleImputer()),
                                  ('discritiser', KBinsDiscretizer(encode='ordinal', strategy='uniform')),
                                  ('scaler', MinMaxScaler())])
cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                  ('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num),
                                               ('cat', cat_transformer, cat)])
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classiffier', LogisticRegression(random_state=1, max_iter=10000))])
param_grid = dict([SOMETHING]imputer__strategy=['mean', 'median'],
                  [SOMETHING]discritiser__nbins=range(5, 10),
                  classiffier__C=[0.1, 10, 100],
                  classiffier__solver=['liblinear', 'saga'])
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)
Basically, what should I write instead of [SOMETHING] in my code?
I have looked at this answer, which addressed the question for make_pipeline - so, following the same idea, I tried 'preprocessor__num__', 'preprocessor__num_', 'pipeline__num__' and 'pipeline__num_' - no luck so far.
Thank you

You were close; the correct way to declare it is like this (note also that the KBinsDiscretizer parameter is n_bins, not nbins):
param_grid = {'preprocessor__num__imputer__strategy': ['mean', 'median'],
              'preprocessor__num__discritiser__n_bins': range(5, 10),
              'classiffier__C': [0.1, 10, 100],
              'classiffier__solver': ['liblinear', 'saga']}
Here is the full code:
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
df = sns.load_dataset('titanic')[['survived', 'age', 'embarked']]
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='survived'), df['survived'],
                                                    test_size=0.2, random_state=123)
num = ['age']
cat = ['embarked']
num_transformer = Pipeline(steps=[('imputer', SimpleImputer()),
                                  ('discritiser', KBinsDiscretizer(encode='ordinal', strategy='uniform')),
                                  ('scaler', MinMaxScaler())])
cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                  ('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num),
                                               ('cat', cat_transformer, cat)])
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classiffier', LogisticRegression(random_state=1, max_iter=10000))])
param_grid = {'preprocessor__num__imputer__strategy': ['mean', 'median'],
              'preprocessor__num__discritiser__n_bins': range(5, 10),
              'classiffier__C': [0.1, 10, 100],
              'classiffier__solver': ['liblinear', 'saga']}
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)
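After fitting, you can inspect the winning combination and its cross-validated score, for example:
print(grid_search.best_params_)
print(grid_search.best_score_)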
One simple way to check the available parameter names is this:
print(pipe.get_params().keys())
This will print the list of all available parameters, which you can copy directly into your params dictionary.
I have also written a utility function which you can use to check whether a parameter exists in a pipeline/classifier by simply passing in a keyword:
def check_params_exist(estimator, params_keyword):
    all_params = estimator.get_params().keys()
    available_params = [x for x in all_params if params_keyword in x]
    if len(available_params) == 0:
        return "No matching params found!"
    else:
        return available_params
Now, if you are unsure of the exact name, just pass 'imputer' as the keyword:
print(check_params_exist(pipe, 'imputer'))
This will print the following list:
['preprocessor__num__imputer',
'preprocessor__num__imputer__add_indicator',
'preprocessor__num__imputer__copy',
'preprocessor__num__imputer__fill_value',
'preprocessor__num__imputer__missing_values',
'preprocessor__num__imputer__strategy',
'preprocessor__num__imputer__verbose',
'preprocessor__cat__imputer',
'preprocessor__cat__imputer__add_indicator',
'preprocessor__cat__imputer__copy',
'preprocessor__cat__imputer__fill_value',
'preprocessor__cat__imputer__missing_values',
'preprocessor__cat__imputer__strategy',
'preprocessor__cat__imputer__verbose']
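The same trick works for any step; for example, passing 'n_bins' as the keyword surfaces the discretiser key used in the grid above:
print(check_params_exist(pipe, 'n_bins'))
# ['preprocessor__num__discritiser__n_bins']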

Related

How to build a pipeline finding the best preprocessing per column in a fine-grained fashion?

In sklearn we can use the column transformer within a pipeline to apply a preprocessing choice to specific columns like this:
import pandas as pd
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler, ...
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
# this is my x_data
x_data = pd.DataFrame(..., columns=['Variable1', 'Variable2', 'Variable3'])
pipeline = Pipeline(steps=[('preprocessing1', make_column_transformer((StandardScaler(), ['Variable1']),
                                                                      remainder='passthrough')),
                           ('preprocessing2', make_column_transformer((MaxAbsScaler(), ['Variable2']),
                                                                      remainder='passthrough')),
                           ('preprocessing3', make_column_transformer((MinMaxScaler(), ['Variable3']),
                                                                      remainder='passthrough')),
                           ('clf', MLPClassifier(...))
                           ])
Then we would run the GridSearchCV along the lines of the following:
params = [{'preprocessing1': [MinMaxScaler(), MaxAbsScaler(), StandardScaler()],  # <<<<<<<<<<<<< How???
           'preprocessing2': [MinMaxScaler(), MaxAbsScaler(), StandardScaler()],  # <<<<<<<<<<<<< How???
           'preprocessing3': [MinMaxScaler(), MaxAbsScaler(), StandardScaler()],  # <<<<<<<<<<<<< How???
           'clf__hidden_layer_sizes': [(100,), (200,)],
           'clf__solver': ['adam', 'lbfgs', 'sgd'],
           ...
           }]
cv = GridSearchCV(pipeline, params, cv=10, verbose=1, n_jobs=-1, refit=True)
What I would like to do, is to find the best preprocessing per predictor because usually one preprocessing for all predictors doesn't work best.
The naming convention in a pipeline uses a double underscore (__) to separate steps and their parameters.
You can see the different parameters of your pipeline and their values using pipeline.get_params().
In your case, the parameter preprocessing1__standardscaler references the scaler defined in the first step of your pipeline, and this is the argument that should be set during the GridSearchCV.
The example below illustrates how to perform this operation:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
X, y = make_classification(n_features=3, n_informative=3, n_redundant=0, random_state=42)
pipeline = Pipeline(
    steps=[
        ("preprocessing1", make_column_transformer((StandardScaler(), [0]), remainder="passthrough")),
        ("preprocessing2", make_column_transformer((StandardScaler(), [1]), remainder="passthrough")),
        ("preprocessing3", make_column_transformer((StandardScaler(), [2]), remainder="passthrough")),
        ("clf", MLPClassifier()),
    ]
)
param_grid = {
    "preprocessing1__standardscaler": [StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
    "preprocessing2__standardscaler": [StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
    "preprocessing3__standardscaler": [StandardScaler(), MinMaxScaler(), MaxAbsScaler()],
}
grid_search = GridSearchCV(pipeline, param_grid, cv=10, verbose=1, n_jobs=-1)
grid_search.fit(X, y)
grid_search.best_params_
This will return the following output:
{'preprocessing1__standardscaler': MinMaxScaler(),
 'preprocessing2__standardscaler': StandardScaler(),
 'preprocessing3__standardscaler': MaxAbsScaler()}
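The 'standardscaler' part of each key comes from make_column_transformer, which auto-names every transformer after its lowercased class name. If you are unsure of the exact keys, you can list them before building the grid:
print([k for k in pipeline.get_params() if k.endswith('standardscaler')])
# ['preprocessing1__standardscaler', 'preprocessing2__standardscaler', 'preprocessing3__standardscaler']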

OneHotEncoder ValueError: Input contains NaN

I have downloaded this data, and this is my code:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels
import plotly.figure_factory as ff
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
random_state = 27912
df_train = pd.read_csv("...")
df_test = pd.read_csv("...")
X_train, X_test, y_train, y_test = train_test_split(
    df_train.drop(["Survived", "Ticket", "Cabin", "Name", "PassengerId"], axis=1),
    df_train["Survived"], test_size=0.2, random_state=42)
numeric_col_names = ["Age", "SibSp", "Parch", "Fare"]
ordinal_col_names = ["Pclass"]
one_hot_col_names = ["Embarked", "Sex"]
ct = make_column_transformer(
    (SimpleImputer(strategy="median"), numeric_col_names),
    (SimpleImputer(strategy="most_frequent"), ordinal_col_names + one_hot_col_names),
    (OrdinalEncoder(), ordinal_col_names),
    (OneHotEncoder(), one_hot_col_names),
    (StandardScaler(), ordinal_col_names + one_hot_col_names + numeric_col_names))
preprocessing_pipeline = Pipeline([("transformers", ct)])
preprocessing_pipeline.fit_transform(X_train)
I'm trying to build a column transformer for the preprocessing step; however, the OneHotEncoder step gives me an error: ValueError: Input contains NaN. I don't really know why this is happening, because I'm imputing the values beforehand. Any clues on why this is happening?
Trying something like this doesn't help either:
preprocessing_pipeline = Pipeline([("transformers", ct_first)])
ct_second = make_column_transformer((OneHotEncoder(), one_hot_col_names),
                                    (StandardScaler(), ordinal_col_names + one_hot_col_names + numeric_col_names))
pipeline = Pipeline([("transformer1", preprocessing_pipeline), ("transformer2", ct_second)])
pipeline.fit_transform(X_train)
I would like to know why is this happening and why the above code, first and second tries, are not correct.
Thanks
You need to create a pipeline for each column type to make sure that the different steps are applied sequentially, i.e. that the missing values are imputed prior to encoding and scaling. A ColumnTransformer applies its transformers in parallel to the original input, so in your code the OneHotEncoder still receives the raw (un-imputed) columns, NaNs included. See also this example in the scikit-learn documentation.
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
# Load the data (from https://www.kaggle.com/c/titanic/data)
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
# Extract the features
X_train = df_train.drop(labels=['Survived', 'Ticket', 'Cabin', 'Name', 'PassengerId'], axis=1)
X_test = df_test.drop(labels=['Ticket', 'Cabin', 'Name', 'PassengerId'], axis=1)
# Map the feature names to the corresponding
# types (numerical, ordinal or categorical)
numeric_col_names = ['Age', 'SibSp', 'Parch', 'Fare']
ordinal_col_names = ['Pclass']
one_hot_col_names = ['Embarked', 'Sex']
# Define the numerical features pipeline
numeric_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Define the ordinal features pipeline
ordinal_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder()),
    ('scaler', StandardScaler())
])
# Define the categorical features pipeline
one_hot_col_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse=False)),
    ('scaler', StandardScaler())
])
# Create the overall preprocessing pipeline
preprocessing_pipeline = make_column_transformer(
    (numeric_col_transformer, numeric_col_names),
    (ordinal_col_transformer, ordinal_col_names),
    (one_hot_col_transformer, one_hot_col_names),
)
# Fit the pipeline to the training data
preprocessing_pipeline.fit(X_train)
# Apply the pipeline to the training and test data
X_train_ = preprocessing_pipeline.transform(X_train)
X_test_ = preprocessing_pipeline.transform(X_test)
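As a quick sanity check (a sketch, assuming the dense output produced by sparse=False above), you can verify that no NaNs survive the preprocessing:
import numpy as np
# Both transformed matrices should now be fully imputed
assert not np.isnan(X_train_).any()
assert not np.isnan(X_test_).any()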

How to solve 'Input contains NaN, infinity or a value too large for dtype('float64')' after already preprocessing using Pipeline?

There are many posts about this error, but I couldn't find a solution to my problem. I'm using this dataset. This is what I've done: preprocessing with SimpleImputer for the categorical and numerical features:
import pandas as pd
import numpy as np
%load_ext nb_black
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.model_selection import train_test_split
housing = pd.read_csv("housing.csv")
housing.head()
X = housing.drop(["longitude", "latitude", "median_house_value"], axis=1)
y = housing["median_house_value"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant")),
        ("encoder", CatBoostEncoder()),
    ]
)
numeric_features = [
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]
categorical_features = ["ocean_proximity"]
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, numeric_features),
        ("categorical", categorical_transformer, categorical_features),
    ]
)
from sklearn.linear_model import LinearRegression
pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", LinearRegression())]
)
lr_model = pipeline.fit(X_train, y_train)
But I got this error:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
Any idea of what's happening in here?
It seems that the CatBoostEncoder returns several NaN values when fitted to the training set, which is why the LinearRegression throws an error. You can see this by running the preprocessor on its own:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from category_encoders import CatBoostEncoder
housing = pd.read_csv("housing.csv")
X = housing.drop(["longitude", "latitude", "median_house_value"], axis=1)
y = housing["median_house_value"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant")),
    ("encoder", CatBoostEncoder())
])
numeric_features = ["housing_median_age", "total_rooms", "total_bedrooms",
                    "population", "households", "median_income"]
categorical_features = ["ocean_proximity"]
preprocessor = ColumnTransformer(transformers=[
    ("numeric", numeric_transformer, numeric_features),
    ("categorical", categorical_transformer, categorical_features),
])
X_new = preprocessor.fit_transform(X_train, y_train)
print(np.isnan(X_new).sum(axis=0))
# array([   0,    0,    0,    0,    0,    0, 4315])
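To confirm that the NaNs really come out of the encoder branch (rather than the imputer or the scaler), you can run that branch in isolation; a minimal sketch reusing the objects defined above:
# Fit only the categorical branch (SimpleImputer -> CatBoostEncoder) on the same data
encoded = categorical_transformer.fit_transform(X_train[categorical_features], y_train)
print(np.isnan(np.asarray(encoded, dtype=float)).sum())  # should match the 4315 NaNs seen above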

OneHotEncoder for categorical feature "day of week" results in ValueError

I want to define a Pipeline with a OneHotEncoder for the day_of_week column. I don't understand why I get a ValueError:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
if __name__ == '__main__':
    data_dict = {
        'age': [1, 2, 3],
        'day_of_week': ['monday', 'tuesday', 'wednesday'],
        'y': [5, 6, 7]
    }
    data = pd.DataFrame(data_dict, columns=data_dict)
    numeric_features = ['age']
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])
    categorical_features = ['day_of_week']
    print(categorical_features)
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto'))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('numerical', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features)])
    classifier = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestRegressor(n_estimators=60))])
    X = data.drop(labels=['y'], axis=1)
    y = data['y']
    X_train, y_train, X_test, y_test = train_test_split(X, y, train_size=0.8, random_state=30)
    trained_model = classifier.fit(X_train, y_train)
There is an error on this line:
X_train, y_train, X_test, y_test = train_test_split(X, y, train_size=0.8, random_state=30)
train_test_split returns X_train, X_test, y_train, y_test, in that order; since you assigned them in the wrong order, your classifier throws all kinds of errors.
Try changing it to:
X_train,X_test, y_train,y_test = train_test_split(X, y, train_size=0.8, random_state=30)
With that change, your code runs without error for me.

Invalid parameter n_neighbors for estimator Pipeline

I'm pretty new to Python. I've run into an issue below that I really need help with:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

df = pd.read_csv('train.csv')  # titanic dataset from Kaggle
df = df.loc[df.Embarked.notna(), ['Survived', 'Pclass', 'Sex', 'SibSp', 'Embarked']]
X = df.drop('Survived', axis='columns')
y = df.Survived
column_trans = make_column_transformer(
    (OneHotEncoder(), ['Sex', 'Embarked']),
    remainder='passthrough')
column_trans.fit_transform(X)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
k_range = range(1, 31)  # example range of k values
param_grid = dict(n_neighbors=k_range)
knn = KNeighborsClassifier()
pipe = make_pipeline(column_trans, knn)
grid = GridSearchCV(pipe, param_grid, cv=10, scoring='accuracy')
grid.fit(train_X, train_y)  # this line gives me an error
The last line gives me an error of:
ValueError: Invalid parameter n_neighbors for estimator Pipeline(memory=None,
steps=[('columntransformer',
ColumnTransformer(n_jobs=None, remainder='passthrough',
sparse_threshold=0.3,
transformer_weights=None,
transformers=[('onehotencoder',
OneHotEncoder(categories='auto',
drop=None,
dtype=<class 'numpy.float64'>,
handle_unknown='error',
sparse=True),
['Sex', 'Embarked'])],
verbose=False)),
('kneighborsclassifier',
KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski', metric_params=None,
n_jobs=None, n_neighbors=5, p=2,
weights='uniform'))],
verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.
What am I doing wrong here? Is it just not possible to do one-hot encoding, KNN, and a pipeline simultaneously?
Parameters of pipelines can be set using __-separated parameter names. Also, the way in which you have defined your pipeline needs a revision. Please refer to the modified code below:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
df = pd.read_csv("titanic.csv")
df = df.drop(["Name"], axis=1)
X = df.drop('Survived', axis='columns')
y = df.Survived
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
column_trans = make_column_transformer(
(OneHotEncoder(), ['Sex']),
remainder='passthrough')
knn = KNeighborsClassifier()
pipe = Pipeline(steps=[('column_trans', column_trans), ('knn', knn)])
param_grid = {
'knn__n_neighbors': [2,5,15, 30, 45, 64]
}
grid = GridSearchCV(pipe, param_grid, cv=10, scoring='accuracy')
grid.fit(train_X,train_y)
grid.best_params_
#{'knn__n_neighbors': 5}
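If you would rather keep make_pipeline as in your original code, note that it auto-names each step after its lowercased class name, so the equivalent grid key is 'kneighborsclassifier__n_neighbors'; a sketch reusing the objects defined above:
from sklearn.pipeline import make_pipeline
# make_pipeline names the KNeighborsClassifier step 'kneighborsclassifier'
pipe = make_pipeline(column_trans, knn)
param_grid = {'kneighborsclassifier__n_neighbors': [2, 5, 15, 30, 45, 64]}
grid = GridSearchCV(pipe, param_grid, cv=10, scoring='accuracy')
grid.fit(train_X, train_y)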
