How to use StandardScaler inside a pipeline only on certain values? - python

I have a problem. I want to use StandardScaler(), but my dataset contains one-hot-encoded values and other values that should not be scaled. However, when I run StandardScaler(), all the values are scaled. Is there an option to run this method only on certain values inside a pipeline?
I found this question: One-Hot-Encode categorical variables and scale continuous ones simultaneously, with the code below:
columns = ['rank']
columns_to_scale = ['gre', 'gpa']
scaler = StandardScaler()
ohe = OneHotEncoder(sparse=False)
# Concatenate (Column-Bind) Processed Columns Back Together
processed_data = np.concatenate([scaled_columns, encoded_columns], axis=1)
So is there an option to only run the StandardScaler() inside a pipeline on only certain values and the other values should be merged to the scaled values?
So the pipeline should only use StandardScaler on the values 'xy', 'xyz'.
StandardScaler Class
from sklearn.base import BaseEstimator, TransformerMixin
class StandardScaler_with_certain_features(BaseEstimator, TransformerMixin):
    """Standardize only the selected columns and pass all others through.

    Intended for use as a pipeline step on a pandas DataFrame: the columns
    named in ``columns_to_scale`` are standardized with ``StandardScaler``,
    every other column is returned unchanged.

    Args:
        columns_to_scale: List of column names to standardize.
    """

    def __init__(self, columns_to_scale):
        # Only store the hyper-parameter here; the scaler itself is created
        # in fit() so that get_params()/clone() (used by GridSearchCV) work.
        self.columns_to_scale = columns_to_scale

    def fit(self, X, y=None):
        """Learn mean/std of the selected columns from ``X`` only.

        Fitting on the ``X`` passed in (not on a global training frame)
        ensures that, under cross-validation, statistics come from the
        current training fold alone.

        Returns:
            The fitted transformer itself.
        """
        self.scaler_ = StandardScaler()
        self.scaler_.fit(X[self.columns_to_scale])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the selected columns standardized."""
        # Work on a copy so the caller's DataFrame is never modified.
        X_scaled = X.copy()
        X_scaled[self.columns_to_scale] = self.scaler_.transform(
            X[self.columns_to_scale]
        )
        return X_scaled
Pipeline
columns_to_scale = ['xy', 'xyz']
steps = [('standard_scaler', StandardScaler_with_certain_features(columns_to_scale)),
('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=100))),
('lasso', Lasso(alpha=0.03))]
pipeline = Pipeline(steps)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
parameteres = { }
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5)
grid.fit(X_train, y_train)
print("score = %3.2f" %(grid.score(X_test,y_test)))
print('Training set score: ' + str(grid.score(X_train,y_train)))
print('Test set score: ' + str(grid.score(X_test,y_test)))
# Prediction
y_pred = grid.predict(X_test)
print("RMSE Val:", metrics.mean_squared_error(y_test, y_pred, squared=False))

You can include a ColumnTransformer in the Pipeline in order to apply the StandardScaler only to certain columns. You need to set remainder='passthrough' to make sure that the columns that are not scaled are concatenated with the ones that are scaled.
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
# Synthetic example data: a target 'y' and four features, each holding
# 100 normally distributed samples.
df = pd.DataFrame({
'y': np.random.normal(0, 1, 100),
'x': np.random.normal(0, 1, 100),
'z': np.random.normal(0, 1, 100),
'xy': np.random.normal(2, 3, 100),
'xyz': np.random.normal(4, 5, 100),
})
# Separate the features from the target column.
X = df.drop(labels=['y'], axis=1)
y = df['y']
# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
# Scale only 'xy' and 'xyz'; remainder='passthrough' keeps the columns that
# are not listed ('x' and 'z') unscaled instead of dropping them.
preprocessor = ColumnTransformer(
transformers=[('scaler', StandardScaler(), ['xy', 'xyz'])],
remainder='passthrough'
)
# The preprocessing runs as the first pipeline step, followed by the model.
pipeline = Pipeline([
('preprocessor', preprocessor),
('lasso', Lasso(alpha=0.03))
])
pipeline.fit(X_train, y_train)
# Score the fitted pipeline on the held-out split (the value is only shown
# in an interactive session; wrap it in print() when running as a script).
pipeline.score(X_test, y_test)

Related

How to improve the knn model?

I built a knn model for classification. Unfortunately, my model has accuracy > 80%, and I would like to get a better result. Can I ask for some tips? Maybe I used too many predictors?
My data = https://www.openml.org/search?type=data&sort=runs&id=53&status=active
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
heart_disease = pd.read_csv('heart_disease.csv', sep=';', decimal=',')
y = heart_disease['heart_disease']
X = heart_disease.drop(["heart_disease"], axis=1)
correlation_matrix = heart_disease.corr()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
scaler = MinMaxScaler(feature_range=(-1,1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
knn_3 = KNeighborsClassifier(3, n_jobs = -1)
knn_3.fit(X_train, y_train)
y_train_pred = knn_3.predict(X_train)
labels = ['0', '1']
print('Training set')
print(pd.DataFrame(confusion_matrix(y_train, y_train_pred), index = labels, columns = labels))
print(accuracy_score(y_train, y_train_pred))
print(f1_score(y_train, y_train_pred))
y_test_pred = knn_3.predict(X_test)
print('Test set')
print(pd.DataFrame(confusion_matrix(y_test, y_test_pred), index = labels, columns = labels))
print(accuracy_score(y_test, y_test_pred))
print(f1_score(y_test, y_test_pred))
hyperparameters = {'n_neighbors' : range(1, 15), 'weights': ['uniform','distance']}
knn_best = GridSearchCV(KNeighborsClassifier(), hyperparameters, n_jobs = -1, error_score = 'raise')
knn_best.fit(X_train,y_train)
knn_best.best_params_
y_train_pred_best = knn_best.predict(X_train)
y_test_pred_best = knn_best.predict(X_test)
print('Training set')
print(pd.DataFrame(confusion_matrix(y_train, y_train_pred_best), index = labels, columns = labels))
print(accuracy_score(y_train, y_train_pred_best))
print(f1_score(y_train, y_train_pred_best))
print('Test set')
print(pd.DataFrame(confusion_matrix(y_test, y_test_pred_best), index = labels, columns = labels))
print(accuracy_score(y_test, y_test_pred_best))
print(f1_score(y_test, y_test_pred_best))
```.
This is only a partial answer: how to find the best number of neighbors.
import numpy as np

errlist = []  # test-set misclassification rate for each candidate k
for i in range(1, 40):  # candidate neighbor counts 1..39
    # The keyword argument is n_neighbors; k_neighbors is not accepted.
    knn_i = KNeighborsClassifier(n_neighbors=i)
    knn_i.fit(X_train, y_train)
    # Fraction of test samples whose predicted label differs from the true one.
    errlist.append(np.mean(knn_i.predict(X_test) != y_test))
Plot a line to see the best number of neighbors:
plt.plot(range(1,40),errlist)
Feel free to change the numbers passed to range.

A given column is not a column of the dataframe Pandas

I have the following splitting function:
from typing import Tuple
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
def split_dataframe(
    df: pd.DataFrame,
    target_feature: str,
    split_ratio: float = 0.2
) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray]:
    """Split *df* into train/test features and label-encoded targets.

    Args:
        df: Frame holding the feature columns and the target column.
            It is not modified.
        target_feature: Name of the target column; it is removed from the
            returned feature frames.
        split_ratio: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` parts are
        DataFrames without ``target_feature`` and the ``y`` parts are the
        target values encoded by ``LabelEncoder``.

    Raises:
        KeyError: If ``target_feature`` is not a column of ``df``.
    """
    # drop() returns a new frame, so the caller's df stays untouched
    # without an explicit defensive copy.
    X = df.drop(target_feature, axis=1)
    y = LabelEncoder().fit_transform(df[target_feature])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio)
    return X_train, X_test, y_train, y_test
I split the dataframe by using the following:
X_train, X_test, y_train, y_test = split_dataframe(df, 'Банк')
I use pipeline to transform X_train and y_train
from sklearn.pipeline import Pipeline, FeatureUnion
from mlxtend.feature_selection import ColumnSelector
import category_encoders as ce
cat_pipe = Pipeline(
[
('selector', ColumnSelector(categorical_features)),
('encoder', ce.one_hot.OneHotEncoder())
]
)
num_pipe = Pipeline(
[
('selector', ColumnSelector(numeric_features)),
('scaler', StandardScaler())
]
)
preprocessor = FeatureUnion(
transformer_list=[
('cat', cat_pipe),
('num', num_pipe)
]
)
new_df = pipe.fit_transform(X_train, y_train)
After that I got ValueError: A given column is not a column of the dataframe, and specifically KeyError: 'Банк'. I checked that the columns exist before passing the dataframe to be split into train and test sets. If I change X = df_.drop(target_feature, axis=1) to X = df_, everything works correctly, but the target feature is still in X.
I made a mistake in pipe.fit_transform(X_train, y_train); I changed it to preprocessor.fit_transform(X_train, y_train) and it worked.

OneHotEncoder for categorical feature "day of week" results in ValueError

I want to define a Pipeline with a OneHotEncoder for the day_of_week column. I don't understand why I get a ValueError:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
if __name__ == '__main__':
data_dict = {
'age': [1, 2, 3],
'day_of_week': ['monday', 'tuesday', 'wednesday'],
'y': [5, 6, 7]
}
data = pd.DataFrame(data_dict, columns=data_dict)
numeric_features = ['age']
numeric_transformer = Pipeline(steps=[
('scaler', StandardScaler())])
categorical_features = ['day_of_week']
print(categorical_features)
categorical_transformer = Pipeline(steps=[
('onehot', OneHotEncoder(handle_unknown='ignore', categories='auto'))])
preprocessor = ColumnTransformer(
transformers=[
('numerical', numeric_transformer, numeric_features),
('categorical', categorical_transformer, categorical_features)])
classifier = Pipeline(
steps=[
('preprocessor', preprocessor),
('classifier', RandomForestRegressor(n_estimators=60))])
X = data.drop(labels=['y'], axis=1)
y = data['y']
X_train, y_train, X_test, y_test = train_test_split(X, y, train_size=0.8, random_state=30)
trained_model = classifier.fit(X_train, y_train)
There is an error on this line:
X_train, y_train, X_test, y_test = train_test_split(X, y, train_size=0.8, random_state=30)
train_test_split returns the X splits first (train, test), then the y splits (train, test). Since you assigned them in the wrong order, your classifier throws all kinds of errors.
Try changing it to:
X_train,X_test, y_train,y_test = train_test_split(X, y, train_size=0.8, random_state=30)
Your code runs without error for me

Invalid parameter n_neighbors for estimator Pipeline

I'm pretty new to Python. I've run into an issue below that I really need help with:
df = pd.read_csv('train.csv') #titanic dataset from Kaggle
df = df.loc[df.Embarked.notna(), ['Survived', 'Pclass', 'Sex', 'SibSp', 'Embarked']]
X = df.drop('Survived', axis='columns')
y = df.Survived
column_trans = make_column_transformer(
(OneHotEncoder(), ['Sex', 'Embarked']),
remainder='passthrough')
column_trans.fit_transform(X)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
param_grid = dict(n_neighbors=k_range)
knn = KNeighborsClassifier()
pipe = make_pipeline(column_trans, knn)
grid = GridSearchCV(pipe, param_grid, cv=10, scoring='accuracy')
grid.fit(train_X, train_y) #this line gives me an error
The last line gives me an error of:
ValueError: Invalid parameter n_neighbors for estimator Pipeline(memory=None,
steps=[('columntransformer',
ColumnTransformer(n_jobs=None, remainder='passthrough',
sparse_threshold=0.3,
transformer_weights=None,
transformers=[('onehotencoder',
OneHotEncoder(categories='auto',
drop=None,
dtype=<class 'numpy.float64'>,
handle_unknown='error',
sparse=True),
['Sex', 'Embarked'])],
verbose=False)),
('kneighborsclassifier',
KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski', metric_params=None,
n_jobs=None, n_neighbors=5, p=2,
weights='uniform'))],
verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.
What am I doing wrong here? Is it just not possible to do oneHot encoding, knn and pipeline simultaneously?
Parameters of pipeline steps can be set using __-separated parameter names (step name, then parameter name). The way in which you have defined your pipeline also needs a revision. Please refer to the modified code below:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# Load the data and drop the 'Name' column.
df = pd.read_csv("titanic.csv")
df = df.drop(["Name"], axis=1)
# Separate the features from the 'Survived' target.
X = df.drop('Survived', axis='columns')
y = df.Survived
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
# One-hot encode 'Sex'; remainder='passthrough' keeps the other columns.
column_trans = make_column_transformer(
(OneHotEncoder(), ['Sex']),
remainder='passthrough')
knn = KNeighborsClassifier()
# Name the steps explicitly: these names are the prefixes used in param_grid.
pipe = Pipeline(steps=[('column_trans', column_trans), ('knn', knn)])
# Pipeline parameters are addressed as <step name>__<parameter name>,
# so n_neighbors of the 'knn' step becomes 'knn__n_neighbors'.
param_grid = {
'knn__n_neighbors': [2,5,15, 30, 45, 64]
}
# Grid search over the whole pipeline with cv=10, scored by accuracy.
grid = GridSearchCV(pipe, param_grid, cv=10, scoring='accuracy')
grid.fit(train_X,train_y)
grid.best_params_
# Result reported by the answer's author: {'knn__n_neighbors': 5}

What does the error mean and how to fix it - "ValueError: query data dimension must match training data dimension"

I am trying to write the code for K-NN
Below is my code. I know that the issue is in `predict()`, but I am not able to figure out how to fix it.
# Importing the libraries
import numpy as np
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('UniversalBank.csv')
X = dataset.iloc[:,[ 1,2,3,5,6,7,8,10,11,12,13]].values #,
y = dataset.iloc[:,9].values
#Splitting the dataset to training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state= 0)
#Feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
#Fitting the classifier to training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train,y_train)
#Predicting the test results
y_pred = classifier.predict(X_test)

Categories