I have the following splitting function:
import pandas as pd
import numpy as np
from typing import Tuple
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

def split_dataframe(
    df: pd.DataFrame,
    target_feature: str,
    split_ratio: float = 0.2
) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray]:
    df_ = df.copy()
    X = df_.drop(target_feature, axis=1)
    y = df_[target_feature]
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio)
    return X_train, X_test, y_train, y_test
I split the dataframe by using the following:
X_train, X_test, y_train, y_test = split_dataframe(df, 'Банк')
I use a pipeline to transform X_train and y_train:
from sklearn.pipeline import Pipeline, FeatureUnion
from mlxtend.feature_selection import ColumnSelector
import category_encoders as ce

cat_pipe = Pipeline(
    [
        ('selector', ColumnSelector(categorical_features)),
        ('encoder', ce.one_hot.OneHotEncoder())
    ]
)

num_pipe = Pipeline(
    [
        ('selector', ColumnSelector(numeric_features)),
        ('scaler', StandardScaler())
    ]
)

preprocessor = FeatureUnion(
    transformer_list=[
        ('cat', cat_pipe),
        ('num', num_pipe)
    ]
)

new_df = pipe.fit_transform(X_train, y_train)
And after that I got ValueError: A given column is not a column of the dataframe, specifically KeyError: 'Банк'. I checked that the columns exist before passing the dataframe to the train/test split. If I change X = df_.drop(target_feature, axis=1) to X = df_, everything works, but then the target feature is still in X.
I made a mistake in pipe.fit_transform(X_train, y_train); I changed it to preprocessor.fit_transform(X_train, y_train) and it worked.
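For completeness, a minimal sketch of what a full pipe could look like if you do want a Pipeline on top of the FeatureUnion; the LogisticRegression here is only a placeholder estimator of my choosing, not part of the original code:

from sklearn.linear_model import LogisticRegression

# hypothetical wrapper so that the name `pipe` actually exists
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(max_iter=1000))
])
pipe.fit(X_train, y_train)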
I have a problem. I want to use StandardScaler(), but my dataset contains one-hot encoded values and other values that should not be scaled. If I run StandardScaler() on the whole dataset, all the values are scaled. So is there an option to run this method only on certain columns inside a pipeline?
I found this question: One-Hot-Encode categorical variables and scale continuous ones simultaneously, with the code below (data is the dataframe from that question):
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

columns = ['rank']
columns_to_scale = ['gre', 'gpa']

scaler = StandardScaler()
ohe = OneHotEncoder(sparse=False)

# Fit and transform each column subset separately
scaled_columns = scaler.fit_transform(data[columns_to_scale])
encoded_columns = ohe.fit_transform(data[columns])

# Concatenate (Column-Bind) Processed Columns Back Together
processed_data = np.concatenate([scaled_columns, encoded_columns], axis=1)
So is there a way to run StandardScaler() inside a pipeline on only certain columns, with the remaining columns merged back alongside the scaled ones? The pipeline should apply StandardScaler only to the columns 'xy' and 'xyz'.
StandardScaler Class
from sklearn.base import BaseEstimator, TransformerMixin

class StandardScaler_with_certain_features(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_scale):
        self.columns_to_scale = columns_to_scale
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        # fit the scaler on the training data, restricted to the selected columns
        self.scaler.fit(X[self.columns_to_scale])
        return self

    def transform(self, X, y=None):
        # scale only the selected columns; leave the rest unchanged
        X = X.copy()
        X[self.columns_to_scale] = self.scaler.transform(X[self.columns_to_scale])
        return X
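A quick toy check of the class above (my own example, assuming pandas and the StandardScaler import from earlier):

import pandas as pd

toy = pd.DataFrame({'xy': [1.0, 2.0, 3.0], 'xyz': [10.0, 20.0, 30.0], 'other': [7, 8, 9]})
scaler = StandardScaler_with_certain_features(['xy', 'xyz'])
print(scaler.fit_transform(toy))  # 'xy' and 'xyz' are standardized, 'other' is left untouched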
Pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

columns_to_scale = ['xy', 'xyz']
steps = [('standard_scaler', StandardScaler_with_certain_features(columns_to_scale)),
         ('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=100))),
         ('lasso', Lasso(alpha=0.03))]
pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

parameters = {}
grid = GridSearchCV(pipeline, param_grid=parameters, cv=5)
grid.fit(X_train, y_train)
print("score = %3.2f" % (grid.score(X_test, y_test)))
print('Training set score: ' + str(grid.score(X_train, y_train)))
print('Test set score: ' + str(grid.score(X_test, y_test)))

# Prediction
y_pred = grid.predict(X_test)
print("RMSE Val:", metrics.mean_squared_error(y_test, y_pred, squared=False))
You can include a ColumnTransformer in the Pipeline in order to apply the StandardScaler only to certain columns. You need to set remainder='passthrough' to make sure that the columns that are not scaled are concatenated with the ones that are scaled.
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
df = pd.DataFrame({
    'y': np.random.normal(0, 1, 100),
    'x': np.random.normal(0, 1, 100),
    'z': np.random.normal(0, 1, 100),
    'xy': np.random.normal(2, 3, 100),
    'xyz': np.random.normal(4, 5, 100),
})

X = df.drop(labels=['y'], axis=1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

preprocessor = ColumnTransformer(
    transformers=[('scaler', StandardScaler(), ['xy', 'xyz'])],
    remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('lasso', Lasso(alpha=0.03))
])

pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)
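If you need to know which output column is which, a fitted ColumnTransformer can report the output order (the listed transformers first, the passthrough remainder after). A small sketch, assuming a scikit-learn version recent enough (>= 1.0) to have get_feature_names_out:

# inspect the output column order of the fitted ColumnTransformer
print(pipeline.named_steps['preprocessor'].get_feature_names_out())
# expected along the lines of: ['scaler__xy' 'scaler__xyz' 'remainder__x' 'remainder__z']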
I have a preprocessing script that takes data from a diamonds dataset and preprocesses the data. I obviously need it to preprocess labels as well.
Here is my code:
# Data Preprocessing
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from icecream import ic

def diamond_preprocess(data_dir):
    data = pd.read_csv(data_dir)
    cleaned_data = data.drop(['id', 'depth_percent'], axis=1)  # Features I don't want

    x = cleaned_data.drop(['price'], axis=1)  # Train data
    y = cleaned_data['price']  # Label data
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=99)

    numerical_features = x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = x_train.select_dtypes(include=['object']).columns.tolist()

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Fill in missing data with median
        ('scaler', StandardScaler())  # Scale data
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Fill in missing data with 'missing'
        ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One hot encode categorical data
    ])

    preprocessor_pipeline = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Fit to the training data
    preprocessor_pipeline.fit(x_train)
    preprocessor_pipeline.fit(y_train)

    # Apply the pipeline to the training and test data
    x_train_pipe = preprocessor_pipeline.transform(x_train)
    x_test_pipe = preprocessor_pipeline.transform(x_test)
    y_train_pipe = preprocessor_pipeline.transform(y_train)
    y_test_pipe = preprocessor_pipeline.transform(y_test)

    x_train = pd.DataFrame(data=x_train_pipe)
    x_test = pd.DataFrame(data=x_test_pipe)
    y_train = pd.DataFrame(data=y_train_pipe)
    y_test = pd.DataFrame(data=y_test_pipe)

    return x_train, x_test, y_train, y_test
I am not very confident that my code is correct or that I have a good understanding of how pipelines and preprocessing works in sklearn. Apparently, the interpreter agrees as I get this error:
File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\compose\_column_transformer.py", line 470, in fit
self.fit_transform(X, y=y)
File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\compose\_column_transformer.py", line 502, in fit_transform
self._check_n_features(X, reset=True)
File "C:\Users\17574\Anaconda3\envs\kraken-gpu\lib\site-packages\sklearn\base.py", line 352, in _check_n_features
n_features = X.shape[1]
IndexError: tuple index out of range
How do I properly preprocess my labels like I did with my training data? An explanation would be great as well!
You can create an additional pipeline for your target column if you want to apply the transformations separately, see the example below.
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# generate the data
data = pd.DataFrame({
    'y': [1, 2, np.nan, 4, 5],
    'x1': [6, 7, 8, np.nan, np.nan],
    'x2': [9, 10, 11, np.nan, np.nan],
    'x3': ['a', 'b', 'c', np.nan, np.nan],
    'x4': [np.nan, np.nan, 'd', 'e', 'f']
})

# extract the features and target
x = data.drop(labels=['y'], axis=1)
y = data[['y']]  # note that this is a data frame, not a series

# split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=99)

# map the features to the corresponding types (numerical or categorical)
numerical_features = x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = x_train.select_dtypes(include=['object']).columns.tolist()

# define the features pipeline
numerical_features_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

features_pipeline = ColumnTransformer(transformers=[
    ('num_features', numerical_features_transformer, numerical_features),
    ('cat_features', categorical_features_transformer, categorical_features)
])

# define the target pipeline
target_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# fit the pipelines to the training data
features_pipeline.fit(x_train)
target_pipeline.fit(y_train)

# apply the pipelines to the training and test data
x_train_pipe = features_pipeline.transform(x_train)
x_test_pipe = features_pipeline.transform(x_test)
y_train_pipe = target_pipeline.transform(y_train)
y_test_pipe = target_pipeline.transform(y_test)

x_train = pd.DataFrame(data=x_train_pipe)
x_test = pd.DataFrame(data=x_test_pipe)
y_train = pd.DataFrame(data=y_train_pipe)
y_test = pd.DataFrame(data=y_test_pipe)
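One caveat I should flag (my own note, not from the answer above): if you later need predictions back on the original target scale, only the scaler step of target_pipeline is invertible, since SimpleImputer has no inverse_transform. A hedged sketch, where preds is a hypothetical 2-D array of model predictions:

# hypothetical: undo the target scaling on predictions (preds must be 2-D)
preds_original = target_pipeline.named_steps['scaler'].inverse_transform(preds)

The TransformedTargetRegressor shown next handles this round trip automatically.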
You can use TransformedTargetRegressor to apply a function to the labels before fitting the regressor and the inverse function to its predictions:
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    TransformedTargetRegressor(
        regressor=LinearRegression(fit_intercept=True, n_jobs=-1),
        func=np.log1p,
        inverse_func=np.expm1
    )
)
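A quick usage sketch with toy data of my own (not from the original answer): the regressor is fit on np.log1p(y) and predictions are mapped back through np.expm1 automatically.

rng = np.random.default_rng(0)
X_toy = rng.random((100, 3))
y_toy = rng.random(100) * 100

pipeline.fit(X_toy, y_toy)       # internally fits LinearRegression on np.log1p(y_toy)
preds = pipeline.predict(X_toy)  # predictions are returned on the original scale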
I was working with the California Housing Prices dataset, and this is what I've done:
import pandas as pd
from sklearn.model_selection import train_test_split

housing = pd.read_csv("housing.csv")
X = housing.drop(["longitude", "latitude", "median_house_value"], axis=1)
y = housing["median_house_value"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

import category_encoders as ce

encoder_list = [ce.WOEEncoder(), ce.OneHotEncoder()]
for encoder in encoder_list:
    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="constant")),
            ("encoder", encoder),
        ]
    )
    pipe = Pipeline(
        steps=[("preprocessor", preprocessor), ("regressor", LinearRegression())]
    )
    pipe.fit(X_train, y_train)
    pipe.predict(X_test)
    print(encoder)
    print(pipe.score(X_test, y_test))
Why is this generating two similar results? Shouldn't they be different? The same is happening when I try different scalers.
There are many posts about this error, but I couldn't find a solution to this problem. I'm using this dataset. This is what I've done: preprocessing with SimpleImputer for categorical and numerical features:
import pandas as pd
import numpy as np

%load_ext nb_black

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.model_selection import train_test_split

housing = pd.read_csv("housing.csv")
housing.head()

X = housing.drop(["longitude", "latitude", "median_house_value"], axis=1)
y = housing["median_house_value"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant")),
        ("encoder", CatBoostEncoder()),
    ]
)

numeric_features = [
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]
categorical_features = ["ocean_proximity"]

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, numeric_features),
        ("categorical", categorical_transformer, categorical_features),
    ]
)

from sklearn.linear_model import LinearRegression

pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", LinearRegression())]
)
lr_model = pipeline.fit(X_train, y_train)
But I got this error:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
Any idea of what's happening in here?
It seems that the CatBoostEncoder is returning several nan values when fitted to the training set, which is why the LinearRegression throws an error.
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from category_encoders import CatBoostEncoder

housing = pd.read_csv("housing.csv")
X = housing.drop(["longitude", "latitude", "median_house_value"], axis=1)
y = housing["median_house_value"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant")),
    ("encoder", CatBoostEncoder())
])

numeric_features = ["housing_median_age", "total_rooms", "total_bedrooms", "population", "households", "median_income"]
categorical_features = ["ocean_proximity"]

preprocessor = ColumnTransformer(transformers=[
    ("numeric", numeric_transformer, numeric_features),
    ("categorical", categorical_transformer, categorical_features),
])

X_new = preprocessor.fit_transform(X_train, y_train)
print(np.isnan(X_new).sum(axis=0))
# array([ 0, 0, 0, 0, 0, 0, 4315])
I want to define a Pipeline with a OneHotEncoder for the day_of_week column. I don't understand why I get a ValueError:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

if __name__ == '__main__':
    data_dict = {
        'age': [1, 2, 3],
        'day_of_week': ['monday', 'tuesday', 'wednesday'],
        'y': [5, 6, 7]
    }
    data = pd.DataFrame(data_dict, columns=data_dict)

    numeric_features = ['age']
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])

    categorical_features = ['day_of_week']
    print(categorical_features)
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('numerical', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features)])

    classifier = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestRegressor(n_estimators=60))])

    X = data.drop(labels=['y'], axis=1)
    y = data['y']
    X_train, y_train, X_test, y_test = train_test_split(X, y, train_size=0.8, random_state=30)
    trained_model = classifier.fit(X_train, y_train)
There is an error on this line:
X_train, y_train, X_test, y_test = train_test_split(X, y, train_size=0.8, random_state=30)
train_test_split returns X_train, X_test, y_train, y_test, in that order; since you assigned them in the wrong order, your classifier throws all kinds of errors.
Try changing it to:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=30)
With that change, your code runs without error for me.
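For reference, a minimal sketch (toy arrays of my own) showing the return order of train_test_split:

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(10).reshape(5, 2)
y = np.arange(5)
# returns the train/test splits of X first, then the train/test splits of y
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=30)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)  # (4, 2) (1, 2) (4,) (1,)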