How do you output preprocessed data from a pipeline as objects?

In many sklearn pipeline examples, I see people using another pipeline to pipe the preprocessing pipeline to some linear regression model. Is it possible to just output the preprocessed data from a pipeline so I can feed it into my flaml baseline code:
automl.fit(X_train=pp_training_data, y_train=pp_training_labels, **automl_settings)
Here is what I want my preprocessing pipeline code to look and act like (I know this doesn't work):
def diamond_preprocess(data_dir):
    data = pd.read_csv(data_dir)
    cleaned_data = data.drop(['id', 'depth_percent'], axis=1)  # Features I don't want
    x = cleaned_data.drop(['price'], axis=1)  # Train data
    y = cleaned_data['price']  # Label data
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    numerical_features = x_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = x_train.select_dtypes(include=['object']).columns
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Fill in missing data with median
        ('scaler', StandardScaler())  # Scale data
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Fill in missing data with 'missing'
        ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One hot encode categorical data
    ])
    preprocessor_pipeline = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    pp_training_data, pp_training_label = preprocessor_pipeline
    return pp_training_data, pp_training_label

You can apply the pipeline to the feature matrix only, without an estimator in the final step. See the code below for an example.
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
# generate the data
data = pd.DataFrame({
    'y': [1, 2, 3, 4, 5],
    'x1': [6, 7, 8, np.nan, np.nan],
    'x2': [9, 10, 11, np.nan, np.nan],
    'x3': ['a', 'b', 'c', np.nan, np.nan],
    'x4': [np.nan, np.nan, 'd', 'e', 'f']
})
# extract the features and target
x = data.drop(labels=['y'], axis=1)
y = data['y']
# split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
# map the features to the corresponding types (numerical or categorical)
numerical_features = x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = x_train.select_dtypes(include=['object']).columns.tolist()
# define the numerical features pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# define the categorical features pipeline
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# define the overall pipeline
preprocessor_pipeline = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])
# fit the pipeline to the training data
preprocessor_pipeline.fit(x_train)
# apply the pipeline to the training and test data
x_train_ = preprocessor_pipeline.transform(x_train)
x_test_ = preprocessor_pipeline.transform(x_test)
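The transformed arrays can then be handed to FLAML directly. A minimal sketch, assuming the flaml package is installed; the settings shown are illustrative, not required:
from flaml import AutoML

automl = AutoML()
automl_settings = {"time_budget": 60, "task": "regression"}  # illustrative settings
# feed the preprocessed training features and the raw labels from the split
automl.fit(X_train=x_train_, y_train=y_train, **automl_settings)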

Related

Specifying columns in scikit-learn pipeline after ColumnTransformer

I want to construct a scikit-learn pipeline in which some columns have values imputed and scaling is subsequently applied to some of them. If I put both operations in the same ColumnTransformer, this does not work, as they proceed in parallel (and so missing values cause the scaler to fail). If I make two ColumnTransformers and run them in series, however, I run into the issue that I cannot specify column names (since the output of the first transformer is a NumPy array). What is the correct way to go about this?
numeric_columns = list(X.select_dtypes('float64').columns)
cat_columns = list(X.select_dtypes('object').columns)+list(X.select_dtypes('int64').columns)
# Imputation
imp_mean = SimpleImputer(strategy='mean')
imp_freq = SimpleImputer(strategy='most_frequent')
imputer = ColumnTransformer(
    [('Imput_mean', imp_mean, numeric_columns),
     ('Imput_freq', imp_freq, cat_columns),
     ], remainder='passthrough'
)
# Scaling
feature_transformer = ColumnTransformer(
    [('num', StandardScaler(), numeric_columns),
     ], remainder='passthrough'
)
# Hyperparameters
parameters = {'model__n_components': [1, 2, 3, 4, 5]}
# Pipeline
pipeline = Pipeline([('imputer', imputer),
                     ('feature_transformer', feature_transformer),
                     ('model', PLSRegression())])
# Cross validation strategy
cv = KFold(n_splits=10, shuffle=True)
# Cross validate and evaluate
clf = GridSearchCV(pipeline, parameters, scoring="r2", cv=10)
cross_val_score(clf, X, y, cv=cv, scoring="r2")
You might nest a Pipeline which takes care of the preprocessing of the numerical columns (performed serially) within the ColumnTransformer instance.
Here's an example:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
X = pd.DataFrame({'city': ['London', 'London', '', 'Sallisaw'],
                  'title': ['His Last Bow', 'How Watson Learned the Trick', 'A Moveable Feast', 'The Grapes of Wrath'],
                  'expert_rating': [5, 3, np.nan, 5],
                  'user_rating': [4, np.nan, 4, 3]})
numeric_columns = list(X.select_dtypes('float64').columns)
cat_columns = list(X.select_dtypes('object').columns) + list(X.select_dtypes('int64').columns)
imp_mean = SimpleImputer(strategy='mean')
imp_freq = SimpleImputer(missing_values='', strategy='most_frequent')
ct = ColumnTransformer([
    ('Imput_freq', imp_freq, cat_columns),
    ('pipe_num', Pipeline([('Imput_mean', imp_mean), ('num', StandardScaler())]), numeric_columns)
], remainder='passthrough')
pd.DataFrame(ct.fit_transform(X))
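If you are on a newer scikit-learn, there is also a direct answer to the "output of the first transformer is a np array" complaint: a minimal sketch, assuming scikit-learn >= 1.2 (where set_output is available) and reusing numeric_columns/cat_columns from the question. verbose_feature_names_out=False keeps the original column names instead of prefixing them with the transformer name, so the second ColumnTransformer can still select by name:
from sklearn.compose import ColumnTransformer
from sklearn.cross_decomposition import PLSRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

imputer = ColumnTransformer(
    [('Imput_mean', SimpleImputer(strategy='mean'), numeric_columns),
     ('Imput_freq', SimpleImputer(strategy='most_frequent'), cat_columns)],
    remainder='passthrough', verbose_feature_names_out=False
)
feature_transformer = ColumnTransformer(
    [('num', StandardScaler(), numeric_columns)],
    remainder='passthrough', verbose_feature_names_out=False
)
pipeline = Pipeline([('imputer', imputer),
                     ('feature_transformer', feature_transformer),
                     ('model', PLSRegression())])
# every transformer step now emits a DataFrame, so column names
# survive between the two ColumnTransformers
pipeline.set_output(transform="pandas")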
Here's a similar post: How to execute both parallel and serial transformations with sklearn pipeline?.

How do you preprocess labels in a pipeline with sklearn?

I have a preprocessing script that takes data from a diamonds dataset and preprocesses the data. I obviously need it to preprocess labels as well.
Here is my code:
# Data Preprocessing
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from icecream import ic
def diamond_preprocess(data_dir):
    data = pd.read_csv(data_dir)
    cleaned_data = data.drop(['id', 'depth_percent'], axis=1)  # Features I don't want
    x = cleaned_data.drop(['price'], axis=1)  # Train data
    y = cleaned_data['price']  # Label data
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=99)
    numerical_features = x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = x_train.select_dtypes(include=['object']).columns.tolist()
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Fill in missing data with median
        ('scaler', StandardScaler())  # Scale data
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Fill in missing data with 'missing'
        ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One hot encode categorical data
    ])
    preprocessor_pipeline = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    # Fit to the training data
    preprocessor_pipeline.fit(x_train)
    preprocessor_pipeline.fit(y_train)
    # Apply the pipeline to the training and test data
    x_train_pipe = preprocessor_pipeline.transform(x_train)
    x_test_pipe = preprocessor_pipeline.transform(x_test)
    y_train_pipe = preprocessor_pipeline.transform(y_train)
    y_test_pipe = preprocessor_pipeline.transform(y_test)
    x_train = pd.DataFrame(data=x_train_pipe)
    x_test = pd.DataFrame(data=x_test_pipe)
    y_train = pd.DataFrame(data=y_train_pipe)
    y_test = pd.DataFrame(data=y_test_pipe)
    return x_train, x_test, y_train, y_test
I am not very confident that my code is correct or that I have a good understanding of how pipelines and preprocessing work in sklearn. Apparently, the interpreter agrees, as I get this error:
File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\compose\_column_transformer.py", line 470, in fit
self.fit_transform(X, y=y)
File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\compose\_column_transformer.py", line 502, in fit_transform
self._check_n_features(X, reset=True)
File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\base.py", line 352, in _check_n_features
n_features = X.shape[1]
IndexError: tuple index out of range
How do I properly preprocess my labels like I did with my training data? An explanation would be great as well!
The error comes from calling the ColumnTransformer on the target: it expects a 2-D X, so fitting it on the 1-D y_train Series fails when it tries to read X.shape[1]. You can create an additional pipeline for your target column if you want to apply the transformations separately; see the example below.
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# generate the data
data = pd.DataFrame({
    'y': [1, 2, np.nan, 4, 5],
    'x1': [6, 7, 8, np.nan, np.nan],
    'x2': [9, 10, 11, np.nan, np.nan],
    'x3': ['a', 'b', 'c', np.nan, np.nan],
    'x4': [np.nan, np.nan, 'd', 'e', 'f']
})
# extract the features and target
x = data.drop(labels=['y'], axis=1)
y = data[['y']] # note that this is a data frame, not a series
# split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=99)
# map the features to the corresponding types (numerical or categorical)
numerical_features = x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = x_train.select_dtypes(include=['object']).columns.tolist()
# define the features pipeline
numerical_features_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_features_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
features_pipeline = ColumnTransformer(transformers=[
    ('num_features', numerical_features_transformer, numerical_features),
    ('cat_features', categorical_features_transformer, categorical_features)
])
# define the target pipeline
target_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
# fit the pipelines to the training data
features_pipeline.fit(x_train)
target_pipeline.fit(y_train)
# apply the pipelines to the training and test data
x_train_pipe = features_pipeline.transform(x_train)
x_test_pipe = features_pipeline.transform(x_test)
y_train_pipe = target_pipeline.transform(y_train)
y_test_pipe = target_pipeline.transform(y_test)
x_train = pd.DataFrame(data=x_train_pipe)
x_test = pd.DataFrame(data=x_test_pipe)
y_train = pd.DataFrame(data=y_train_pipe)
y_test = pd.DataFrame(data=y_test_pipe)
You can use TransformedTargetRegressor to apply a function to the labels before fitting the regressor and invert it after prediction:
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    TransformedTargetRegressor(
        regressor=LinearRegression(fit_intercept=True, n_jobs=-1),
        func=np.log1p,
        inverse_func=np.expm1
    )
)
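A brief usage sketch, assuming the x_train/x_test/y_train split from the example above: TransformedTargetRegressor applies func to y before fitting and inverse_func to the output of predict, so no manual back-transformation is needed.
pipeline.fit(x_train, y_train)    # y is log1p-transformed internally before fitting
preds = pipeline.predict(x_test)  # predictions are expm1-mapped back to the original scale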

How to solve 'Input contains NaN, infinity or a value too large for dtype('float64')' after already preprocessing using Pipeline?

There are many posts about this error, but I couldn't find a solution to this problem. I'm using this dataset. Here is what I've done: preprocessing with SimpleImputer for categorical and numerical features:
import pandas as pd
import numpy as np
%load_ext nb_black
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.model_selection import train_test_split
housing = pd.read_csv("housing.csv")
housing.head()
X = housing.drop(["longitude", "latitude", "median_house_value"], axis=1)
y = housing["median_house_value"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant")),
        ("encoder", CatBoostEncoder()),
    ]
)
numeric_features = [
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]
categorical_features = ["ocean_proximity"]
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, numeric_features),
        ("categorical", categorical_transformer, categorical_features),
    ]
)
from sklearn.linear_model import LinearRegression
pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", LinearRegression())]
)
lr_model = pipeline.fit(X_train, y_train)
But I got this error:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
Any idea of what's happening in here?
It seems that the CatBoostEncoder is returning several NaN values when fitted to the training set, which is why the LinearRegression throws an error.
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from category_encoders import CatBoostEncoder
housing = pd.read_csv("housing.csv")
X = housing.drop(["longitude", "latitude", "median_house_value"], axis=1)
y = housing["median_house_value"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant")),
    ("encoder", CatBoostEncoder())
])
numeric_features = ["housing_median_age", "total_rooms", "total_bedrooms", "population", "households", "median_income"]
categorical_features = ["ocean_proximity"]
preprocessor = ColumnTransformer(transformers=[
    ("numeric", numeric_transformer, numeric_features),
    ("categorical", categorical_transformer, categorical_features),
])
X_new = preprocessor.fit_transform(X_train, y_train)
print(np.isnan(X_new).sum(axis=0))
# array([ 0, 0, 0, 0, 0, 0, 4315])
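One plausible cause (an assumption about category_encoders internals, not something its docs confirm): after SimpleImputer runs, the encoder receives a plain NumPy array and rebuilds it with a fresh 0..n-1 index, while y_train still carries the shuffled index from train_test_split; if the encoder aligns the target by index, most rows find no matching target and the encodings come out NaN. A minimal sketch of that workaround, resetting both indices before fitting:
# hypothesis: the NaNs come from index misalignment between the imputed
# features and the shuffled target, so reset both indices before fitting
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_new = preprocessor.fit_transform(X_train, y_train)
print(np.isnan(X_new).sum(axis=0))  # the encoded column should now show 0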

OneHotEncoder for categorical feature "day of week" results in ValueError

I want to define a Pipeline with a OneHotEncoder for the day_of_week column. I don't understand why I get a ValueError:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
if __name__ == '__main__':
    data_dict = {
        'age': [1, 2, 3],
        'day_of_week': ['monday', 'tuesday', 'wednesday'],
        'y': [5, 6, 7]
    }
    data = pd.DataFrame(data_dict, columns=data_dict)
    numeric_features = ['age']
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])
    categorical_features = ['day_of_week']
    print(categorical_features)
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto'))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('numerical', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features)])
    classifier = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestRegressor(n_estimators=60))])
    X = data.drop(labels=['y'], axis=1)
    y = data['y']
    X_train, y_train, X_test, y_test = train_test_split(X, y, train_size=0.8, random_state=30)
    trained_model = classifier.fit(X_train, y_train)
There is an error on this line:
X_train, y_train, X_test, y_test = train_test_split(X, y, train_size=0.8, random_state=30)
train_test_split returns X_train, X_test, y_train, y_test, in that order; since you assigned them in the wrong order, your classifier throws all kinds of errors.
Try changing it to:
X_train,X_test, y_train,y_test = train_test_split(X, y, train_size=0.8, random_state=30)
Your code runs without error for me.

How to output Pandas object from sklearn pipeline

I have constructed a pipeline that takes a pandas dataframe that has been split into categorical and numerical columns. I am trying to run GridSearchCV on my results and ultimately look at the ranked features of importance for the best performing model that GridSearchCV selects. The problem I am encountering is that sklearn pipelines output numpy array objects and lose any column information along the way. Thus when I go to examine the most important coefficients of the model I am left with an unlabeled numpy array.
I have read that building a custom transformer might be a possible solution to this, but I do not have any experience doing so myself. I have also looked into leveraging the sklearn-pandas package, but I am hesitant to try and implement something that might not be updated in parallel with sklearn. Can anyone suggest what they believe is the best path to go about getting around this issue? I am also open to any literature that has hands on application of pandas and sklearn pipelines.
My Pipeline:
# impute and standardize numeric data
numeric_transformer = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy="mean")),
    ('scale', StandardScaler())
])
# impute and encode dummy variables for categorical data
categorical_transformer = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
    ('one_hot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])
clf = Pipeline([
    ('transform', preprocessor),
    ('ridge', Ridge())
])
Cross Validation:
kf = KFold(n_splits=4, shuffle=True, random_state=44)
cross_val_score(clf, X_train, y_train, cv=kf).mean()
Grid Search:
param_grid = {
    'ridge__alpha': [.001, .1, 1.0, 5, 10, 100]
}
gs = GridSearchCV(clf, param_grid, cv = kf)
gs.fit(X_train, y_train)
Examining Coefficients:
model = gs.best_estimator_
predictions = model.fit(X_train, y_train).predict(X_test)
model.named_steps['ridge'].coef_
Here is the output of the model coefficients as it currently stands when performed on the seaborn "mpg" dataset:
array([-4.64782052e-01,  1.47805207e+00, -3.28948689e-01, -5.37033173e+00,
        2.80000700e-01,  2.71523808e+00,  6.29170887e-01,  9.51627968e-01,
       ...
       -1.50574860e+00,  1.88477450e+00,  4.57285471e+00, -6.90459868e-01,
        5.49416409e+00])
Ideally I would like to preserve the pandas dataframe information and retrieve the derived column names after OneHotEncoder and the other methods are called.
I would actually go for creating column names from the input. If your input is already divided into numerical and categorical, you can use pd.get_dummies to get the different categories for each categorical feature.
Then you can just create proper names for the columns, as shown in the last part of this working example based on the question, with some artificial data.
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
# create aritificial data
numeric_features_vals = pd.DataFrame({'x1': [1, 2, 3, 4], 'x2': [0.15, 0.25, 0.5, 0.45]})
numeric_features = ['x1', 'x2']
categorical_features_vals = pd.DataFrame({'cat1': [0, 1, 1, 2], 'cat2': [2, 1, 5, 0] })
categorical_features = ['cat1', 'cat2']
X_train = pd.concat([numeric_features_vals, categorical_features_vals], axis=1)
X_test = pd.DataFrame({'x1':[2,3], 'x2':[0.2, 0.3], 'cat1':[0, 1], 'cat2':[2, 1]})
y_train = pd.DataFrame({'labels': [10, 20, 30, 40]})
# impute and standardize numeric data
numeric_transformer = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy="mean")),
    ('scale', StandardScaler())
])
# impute and encode dummy variables for categorical data
categorical_transformer = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
    ('one_hot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])
clf = Pipeline([
    ('transform', preprocessor),
    ('ridge', Ridge())
])
kf = KFold(n_splits=2, shuffle=True, random_state=44)
cross_val_score(clf, X_train, y_train, cv=kf).mean()
param_grid = {
    'ridge__alpha': [.001, .1, 1.0, 5, 10, 100]
}
gs = GridSearchCV(clf, param_grid, cv = kf)
gs.fit(X_train, y_train)
model = gs.best_estimator_
predictions = model.fit(X_train, y_train).predict(X_test)
print('coefficients : ', model.named_steps['ridge'].coef_, '\n')
# create column names for categorical hot encoded data
columns_names_to_map = list(np.copy(numeric_features))
columns_names_to_map.extend('cat1_' + str(col) for col in pd.get_dummies(X_train['cat1']).columns)
columns_names_to_map.extend('cat2_' + str(col) for col in pd.get_dummies(X_train['cat2']).columns)
print('columns after preprocessing :', columns_names_to_map, '\n')
print('#' * 80)
print('\n', 'dataframe of rescaled features with custom column names: \n\n',
      pd.DataFrame({col: vals for vals, col in zip(preprocessor.fit_transform(X_train).T, columns_names_to_map)}))
print('#' * 80)
print('\n', 'dataframe of ridge coefficients with custom column names: \n\n',
      pd.DataFrame({col: vals for vals, col in zip(model.named_steps['ridge'].coef_.T, columns_names_to_map)}))
The code above, in its final lines, prints out a dataframe that maps each derived column name to its corresponding value.
I would use model.named_steps['transform'].get_feature_names_out().
It will return the feature names like this:
array(['num__cylinders', 'num__displacement', 'num__horsepower',
       'num__weight', 'num__acceleration', 'num__model_year',
       'cat__origin_europe', 'cat__origin_japan', 'cat__origin_usa', ...])
Then you can use the feature names to transform the output to a dataframe:
weights_df = pd.DataFrame(model.named_steps['ridge'].coef_,index=model.named_steps['transform'].get_feature_names_out()).T
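The same derived names can also label the transformed feature matrix itself; a minimal sketch reusing the fitted pipeline from above (dense output is safe here because the OneHotEncoder was built with sparse=False):
# rebuild the transformed test set as a DataFrame with the derived column names
feature_names = model.named_steps['transform'].get_feature_names_out()
X_test_df = pd.DataFrame(model.named_steps['transform'].transform(X_test),
                         columns=feature_names)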
