Is there a pythonic way to chain together sklearn's StandardScaler instances to independently scale data within groups? I.e., if I wanted to independently scale the features of the iris dataset by class, I could use the following code:
import pandas as pd
from sklearn.datasets import load_iris

data = load_iris()
df = pd.DataFrame(data['data'], columns=data['feature_names'])
df['class'] = data['target']

means = df.groupby('class').mean()
stds = df.groupby('class').std()

df_rescaled = (
    (df.drop(['class'], axis=1) - means.reindex(df['class']).values) /
    stds.reindex(df['class']).values)
Here, I'm subtracting the mean and dividing by the standard deviation of each group independently. But it's somewhat awkward to carry these means and standard deviations around, and, essentially, to replicate the behavior of StandardScaler when I have a categorical variable I'd like to control for.
Is there a more pythonic / sklearn-friendly way to implement this type of scaling?
Sure, you can use any sklearn operation and apply it to a groupby object.
First, a little convenience wrapper:
import typing

import pandas as pd


class SklearnWrapper:
    def __init__(self, transform: typing.Callable):
        self.transform = transform

    def __call__(self, df):
        transformed = self.transform.fit_transform(df.values)
        return pd.DataFrame(transformed, columns=df.columns, index=df.index)
This one will apply any sklearn transform you pass into it to a group.
And finally, simple usage:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

data = load_iris()
df = pd.DataFrame(data["data"], columns=data["feature_names"])
df["class"] = data["target"]

df_rescaled = (
    df.groupby("class")
    .apply(SklearnWrapper(StandardScaler()))
    .drop("class", axis="columns")
)
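As a quick sanity check (my addition, not part of the original answer; it assumes apply preserves the group row order, which holds for iris since the targets are already sorted), the rescaled features should now have per-group mean ~0 and population standard deviation ~1:

# Sanity check: StandardScaler standardizes with ddof=0, so compare with std(ddof=0)
print(df_rescaled.groupby(df["class"].values).mean().round(6))       # ~0 everywhere
print(df_rescaled.groupby(df["class"].values).std(ddof=0).round(6))  # ~1 everywhere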
EDIT: You can pretty much do anything with SklearnWrapper.
Here is an example of transforming and then reversing this operation for each group (i.e. without overwriting the transformation object): just fit a new object each time a group is seen and add it to a list.
I have replicated a bit of sklearn's functionality for easier usage (you can extend it with any function you want by passing the appropriate string to the internal _call_with_function method):
class SklearnWrapper:
    def __init__(self, transformation: typing.Callable):
        self.transformation = transformation
        self._group_transforms = []
        # Start with -1 and for each group up the pointer by one
        self._pointer = -1

    def _call_with_function(self, df: pd.DataFrame, function: str):
        # If pointer >= len we are making a new apply, reset _pointer
        if self._pointer >= len(self._group_transforms):
            self._pointer = -1
        self._pointer += 1
        return pd.DataFrame(
            getattr(self._group_transforms[self._pointer], function)(df.values),
            columns=df.columns,
            index=df.index,
        )

    def fit(self, df):
        self._group_transforms.append(self.transformation.fit(df.values))
        return self

    def transform(self, df):
        return self._call_with_function(df, "transform")

    def fit_transform(self, df):
        self.fit(df)
        return self.transform(df)

    def inverse_transform(self, df):
        return self._call_with_function(df, "inverse_transform")
Usage (group transform, inverse operation and apply it again):
data = load_iris()
df = pd.DataFrame(data["data"], columns=data["feature_names"])
df["class"] = data["target"]

# Create scaler outside the class
scaler = SklearnWrapper(StandardScaler())

# Fit and transform data (holding state)
df_rescaled = df.groupby("class").apply(scaler.fit_transform)

# Inverse the operation
df_inverted = df_rescaled.groupby("class").apply(scaler.inverse_transform)

# Apply transformation once again
df_transformed = (
    df_inverted.groupby("class")
    .apply(scaler.transform)
    .drop("class", axis="columns")
)
I updated @Szymon Maszke's code:
from copy import copy


class SklearnWrapper:
    def __init__(self, transformation: typing.Callable):
        self.transformation = transformation
        self._group_transforms = []
        # Start with -1 and for each group up the pointer by one
        self._pointer = -1

    def _call_with_function(self, df: pd.DataFrame, function: str):
        # If pointer is at the last transform and we are inverting,
        # a new apply is starting, so reset _pointer
        if self._pointer == len(self._group_transforms) - 1 and function == "inverse_transform":
            self._pointer = -1
        self._pointer += 1
        print(self._pointer)
        return pd.DataFrame(
            getattr(self._group_transforms[self._pointer], function)(df.values),
            columns=df.columns,
            index=df.index,
        )

    def fit(self, df):
        scaler = copy(self.transformation)
        self._group_transforms.append(scaler.fit(df.values))
        return self

    def transform(self, df):
        return self._call_with_function(df, "transform")

    def fit_transform(self, df):
        self.fit(df)
        return self.transform(df)

    def inverse_transform(self, df):
        return self._call_with_function(df, "inverse_transform")
StandardScaler() was not stored properly in _group_transforms: every list entry referenced the same object, so each new fit overwrote the previous group's statistics. I therefore create a copy (using the copy module) and store that instead (maybe there's a better way to do it using OOP).
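As an aside (my addition, not from the original answers): sklearn's clone helper is the idiomatic way to get an independent, unfitted copy of an estimator with the same parameters, so fit could equally be written as:

from sklearn.base import clone

class SklearnWrapper:
    ...
    def fit(self, df):
        scaler = clone(self.transformation)  # fresh, unfitted estimator per group
        self._group_transforms.append(scaler.fit(df.values))
        return self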
Related
Here I have created the output x in the first function (null_checking) and want to use the same output (x) as the input for the second function (variance) within the class. I tried a lot but could not get it to work.
# import dependencies
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import pandas as pd

# import datasets
datasets = pd.read_csv('Car_sales.csv')

features = ['Fuel_efficiency', 'Power_perf_factor', 'Engine_size', 'Horsepower',
            'Fuel_capacity', 'Curb_weight']
class extraction():
    def __init__(self, datasets, features):
        self.features = features
        self.datasets = datasets

    # checking and removing null rows present in the dataframe
    def null_checking(self):
        datasets1 = self.datasets[self.features]
        print(datasets1)
        for items, x in enumerate(datasets1):
            if items == 'True':
                continue
            x = datasets1.dropna(axis=0)
            print(x)

    # calculating variance inflation factors
    def variance(self):
        x.inner_display()
        # we need intercept for calculating variance inflation factor
        x['intercept'] = 1
        print(x)
        # Making the new dataframe
        df = pd.DataFrame()
        df['variables'] = x.columns
        df['VIF'] = [variance_inflation_factor(x, i) for i in range(x.shape[1])]
        print(df)
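The usual pattern (a hedged sketch of a fix, not from the original post) is to store the intermediate result as an instance attribute in the first method, so the second method can read it back via self:

# Sketch: stash the cleaned frame on self.x so variance() can reuse it.
class extraction():
    def __init__(self, datasets, features):
        self.features = features
        self.datasets = datasets
        self.x = None

    def null_checking(self):
        datasets1 = self.datasets[self.features]
        self.x = datasets1.dropna(axis=0)  # keep the result on the instance
        return self.x

    def variance(self):
        x = self.x.copy()
        x['intercept'] = 1  # VIF needs an intercept column
        df = pd.DataFrame()
        df['variables'] = x.columns
        df['VIF'] = [variance_inflation_factor(x.values, i)
                     for i in range(x.shape[1])]
        return df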
I have the following problem. I am defining a dataset class for my pytorch dataloader. In the class, I would like to use an externally defined function in the __getitem__ method.
Here is the example of what I mean
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd


def readOneElement(df, idx):
    # some fancy code which returns one element with index idx
    # based on the full data set df (pandas)
    ...


# And here the Dataset class
class ModelDataset(Dataset):
    def __init__(self, dataPath, transform=None):
        def listUniqueExamples(self, df):
            listExamples = df[['criterionId', 'adGroupId', 'campaignId',
                               'accountId']].drop_duplicates().reset_index()
            return listExamples

        self.transform = transform
        # Load data
        df = pd.read_csv(dataPath + '/trainData.csv')
        # Transform constants to self
        self.df = df
        self.listExamples = listUniqueExamples(self, self.df)
        self.length = len(self.listExamples)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        # Create one example data
        sample = readOneElement(df=self.df, idx=idx)  # !!!
        return sample
The line marked with # !!! will not work, because the function was defined outside the Dataset class. I get the following error:
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
in
----> 1 data[20]

in __getitem__(self, idx)
     31     def __getitem__(self, idx):
     32         # Create one example data
---> 33         sample = readOneElement(df = self.df, idx = idx)

NameError: name 'readOneElement' is not defined
Had I defined this function inside the Dataset class (as I did for the listUniqueExamples function), it would have worked. However, in this exact case I want the function to be external.
Is there any way to import external functions into a PyTorch Dataset class?
Thank you in advance!
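For what it's worth, a minimal sketch of the standard approach (the module name utils.py is hypothetical): Python resolves readOneElement by ordinary name lookup at call time, so importing it into the module (or notebook session) that defines the class is enough.

# utils.py (hypothetical module holding the external function)
def readOneElement(df, idx):
    return df.iloc[idx]  # placeholder logic

# the file (or cell) defining the Dataset class
from utils import readOneElement  # the name now resolves inside __getitem__

If the NameError persists in a notebook, it usually means the cell defining (or importing) the function was never executed in the current session.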
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import LinearRegression

df = pd.DataFrame({'brand': ['aaaa', 'asdfasdf', 'sadfds', 'NaN'],
                   'category': ['asdf', 'asfa', 'asdfas', 'as'],
                   'num1': [1, 1, 0, 0],
                   'target': [0.2, 0.11, 1.34, 1.123]})

train_continuous_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
train_categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()

preprocess = make_column_transformer(
    (StandardScaler(), train_continuous_cols),
    (OneHotEncoder(), train_categorical_cols)
)
df = preprocess.fit_transform(df)
Just trying to get all the feature names:
preprocess.get_feature_names()
Getting this error:
Transformer standardscaler (type StandardScaler) does not provide get_feature_names
How can I solve it? The examples online use pipeline and I'm trying to avoid that.
The following re-implementation of the ColumnTransformer returns a pandas DataFrame. Note that it should only be used if you input a pandas DataFrame to your pipeline.
All kudos go to Johannes Haupt, who provided the get_feature_names() function that is resilient to transformers that don't have this function (see the blogpost Extracting Column Names from the ColumnTransformer). I commented out the warnings, because I did not want them, and also the prepending of the transformation step to the column name; but it is easy to uncomment as you like.
# import warnings
import numpy as np
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer


class ColumnTransformerWithNames(ColumnTransformer):

    def get_feature_names(column_transformer):
        """Get feature names from all transformers.

        Returns
        -------
        feature_names : list of strings
            Names of the features produced by transform.
        """
        # Remove the internal helper function
        # check_is_fitted(column_transformer)

        # Turn lookup into function for better handling with pipeline later
        def get_names(trans):
            # >> Original get_feature_names() method
            if trans == 'drop' or (
                    hasattr(column, '__len__') and not len(column)):
                return []
            if trans == 'passthrough':
                if hasattr(column_transformer, '_df_columns'):
                    if ((not isinstance(column, slice))
                            and all(isinstance(col, str) for col in column)):
                        return column
                    else:
                        return column_transformer._df_columns[column]
                else:
                    indices = np.arange(column_transformer._n_features)
                    return ['x%d' % i for i in indices[column]]
            if not hasattr(trans, 'get_feature_names'):
                # >>> Change: Return input column names if no method available
                # Turn error into a warning
                # warnings.warn("Transformer %s (type %s) does not "
                #               "provide get_feature_names. "
                #               "Will return input column names if available"
                #               % (str(name), type(trans).__name__))
                # For transformers without a get_feature_names method, use the
                # input names to the column transformer
                if column is None:
                    return []
                else:
                    return [  # name + "__" +
                        f for f in column]

            return [  # name + "__" +
                f for f in trans.get_feature_names()]

        ### Start of processing
        feature_names = []

        # Allow transformers to be pipelines. Pipeline steps are named
        # differently, so preprocessing is needed
        if type(column_transformer) == sklearn.pipeline.Pipeline:
            l_transformers = [(name, trans, None, None)
                              for step, name, trans in column_transformer._iter()]
        else:
            # For column transformers, follow the original method
            l_transformers = list(column_transformer._iter(fitted=True))

        for name, trans, column, _ in l_transformers:
            if type(trans) == sklearn.pipeline.Pipeline:
                # Recursive call on pipeline
                _names = column_transformer.get_feature_names(trans)
                # if pipeline has no transformer that returns names
                if len(_names) == 0:
                    _names = [  # name + "__" +
                        f for f in column]
                feature_names.extend(_names)
            else:
                feature_names.extend(get_names(trans))

        return feature_names

    def transform(self, X):
        indices = X.index.values.tolist()
        original_columns = X.columns.values.tolist()
        X_mat = super().transform(X)
        new_cols = self.get_feature_names()
        new_X = pd.DataFrame(X_mat.toarray(), index=indices, columns=new_cols)
        return new_X

    def fit_transform(self, X, y=None):
        super().fit_transform(X, y)
        return self.transform(X)
Then you can replace the calls to ColumnTransformer with ColumnTransformerWithNames. The output is a DataFrame and this step now has a working get_feature_names().
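A minimal usage sketch (my addition, reusing the question's df and column lists; the step names are arbitrary):

# Drop-in replacement for the question's ColumnTransformer
preprocess = ColumnTransformerWithNames(
    [('scale', StandardScaler(), train_continuous_cols),
     ('onehot', OneHotEncoder(), train_categorical_cols)])
df_transformed = preprocess.fit_transform(df)
print(df_transformed.columns.tolist())  # scaled + one-hot feature names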
I am assuming you are looking for ways to access the result of the transformer, which yields a numpy array.
ColumnTransformer has an attribute called transformers_.
From the documentation:
transformers_ : list
The collection of fitted transformers as tuples of
(name, fitted_transformer, column). `fitted_transformer` can be an
estimator, 'drop', or 'passthrough'. In case there were no columns
selected, this will be the unfitted transformer.
If there are remaining columns, the final element is a tuple of the
form:
('remainder', transformer, remaining_columns) corresponding to the
``remainder`` parameter. If there are remaining columns, then
``len(transformers_)==len(transformers)+1``, otherwise
``len(transformers_)==len(transformers)``.
Unfortunately, that only provides information on the transformer itself and the columns it has been applied to, not on the location of the resulting data, except for the following note:
The order of the columns in the transformed feature matrix follows the order of how the columns are specified in the transformers list.
So we know that the order of the output columns is the same as the order in which the columns are specified in the transformers list. Plus, we also know how many columns each transformer step yields: a StandardScaler() yields the same number of columns as the original data, and OneHotEncoder() yields a number of columns equal to the number of categories.
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer

df = pd.DataFrame({'brand': ['aaaa', 'asdfasdf', 'sadfds', 'NaN'],
                   'category': ['asdf', 'asfa', 'asdfas', 'asd'],
                   'num1': [1, 1, 0, 0],
                   'target': [0.2, 0.11, 1.34, 1.123]})

train_continuous_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
train_categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()

# get n_categories for categorical features
n_categories = [df[x].nunique() for x in train_categorical_cols]

preprocess = make_column_transformer(
    (StandardScaler(), train_continuous_cols),
    (OneHotEncoder(), train_categorical_cols)
)
preprocessed_df = preprocess.fit_transform(df)

# the scaler yields 1 column per input column
indexes_scaler = list(range(0, len(train_continuous_cols)))
# the encoder yields a number of columns equal to the number of categories in the data
cum_index_encoder = [0] + list(np.cumsum(n_categories))
# the encoder indexes come after the scaler indexes
start_index_encoder = indexes_scaler[-1] + 1
indexes_encoder = [x + start_index_encoder for x in cum_index_encoder]
# get both lower and upper bound of each index range
index_pairs = zip(indexes_encoder[:-1], indexes_encoder[1:])
This results in the following output:
print('Transformed {} continuous cols resulting in a df with shape:'.format(len(train_continuous_cols)))
print(preprocessed_df[:, indexes_scaler].shape)

Transformed 2 continuous cols resulting in a df with shape:
(4, 2)

for column, (start_id, end_id) in zip(train_categorical_cols, index_pairs):
    print('Transformed column {} resulted in a df with shape:'.format(column))
    print(preprocessed_df[:, start_id:end_id].shape)

Transformed column brand resulted in a df with shape:
(4, 4)
Transformed column category resulted in a df with shape:
(4, 4)
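Building on this, here is a small sketch (my addition, not part of the original answer) that turns those index ranges into actual feature names, using the fitted encoder's categories_ attribute:

# Assemble output column names: scaled columns keep their input names,
# encoded columns get "<column>_<category>" labels from the fitted encoder.
onehot = preprocess.named_transformers_['onehotencoder']
encoded_names = [f'{col}_{cat}'
                 for col, cats in zip(train_categorical_cols, onehot.categories_)
                 for cat in cats]
feature_names = train_continuous_cols + encoded_names
print(feature_names)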
Data.csv:
param1,param2,param3,result
1,2,cat1,12
2,3,cat2,13
1,6,cat1,6
1,1,cat2,12
Suppose I read the data from the file and convert the categorical variable into dummy variables like this:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

data = pd.read_csv('data.csv')
type_dummies = pd.get_dummies(data.house_type)
data = pd.concat([data, type_dummies], axis=1)
I received a dataframe:
1,2,1,0,..
1,6,0,1,..
I fitted a simple linear regression on that dataset and obtained its coefficients. How can I convert a new record (new_data = np.array([12, 19, 'cat1'])) into new_data = np.array([12, 19, 1, 0]) using pandas, so that its categorical variables are converted into dummy variables and it can be used with my linear model?
Typically you'll want to set up a pipeline to record the correct category:code mapping.
from itertools import chain

import pandas as pd
from sklearn.base import TransformerMixin


class CategoricalTransformer(TransformerMixin):
    def fit(self, X, y=None, *args, **kwargs):
        self.columns_ = X.columns
        self.cat_columns_ = X.select_dtypes(include=['category']).columns
        self.non_cat_columns_ = X.columns.drop(self.cat_columns_)

        self.cat_map_ = {col: X[col].cat.categories
                         for col in self.cat_columns_}
        self.ordered_ = {col: X[col].cat.ordered
                         for col in self.cat_columns_}

        self.dummy_columns_ = {col: ["_".join([col, v])
                                     for v in self.cat_map_[col]]
                               for col in self.cat_columns_}
        self.transformed_columns_ = pd.Index(
            self.non_cat_columns_.tolist() +
            list(chain.from_iterable(self.dummy_columns_[k]
                                     for k in self.cat_columns_))
        )
        return self

    def transform(self, X, y=None, *args, **kwargs):
        return (pd.get_dummies(X)
                .reindex(columns=self.transformed_columns_)
                .fillna(0))
More here.
With the pipeline sklearn.pipeline.make_pipeline(CategoricalTransformer(), LinearRegression()), your predict method should correctly translate the categorical house_type into dummy variables.
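A hedged end-to-end sketch (my illustration; it assumes the question's Data.csv, where the categorical column is param3, and casts it to pandas' category dtype first, since the transformer relies on the .cat accessor):

import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

data = pd.read_csv('data.csv')
data['param3'] = data['param3'].astype('category')

X, y = data.drop(columns='result'), data['result']
pipe = make_pipeline(CategoricalTransformer(), LinearRegression())
pipe.fit(X, y)

# A new record with the same columns and the same category mapping
new_data = pd.DataFrame({
    'param1': [12],
    'param2': [19],
    'param3': pd.Categorical(['cat1'], categories=X['param3'].cat.categories),
})
pipe.predict(new_data)  # dummies are created with the training-time mapping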
I want to create my own transformer for use with the sklearn Pipeline.
I am creating a class that implements both fit and transform methods. The purpose of the transformer will be to remove rows from the matrix that have more than a specified number of NaNs.
The issue I am facing is how can I change both the X and y matrices that are passed to the transformer?
I believe this has to be done in the fit method, since it has access to both X and y. Since Python passes arguments by assignment, once I reassign X to a new matrix with fewer rows, the reference to the original X is lost (and of course the same is true for y). Is it possible to maintain this reference?
I'm using a pandas DataFrame to easily drop the rows that have too many NaNs; this may not be the right way to do it for my use case. The current code looks like this:
class Dropna():
    # thresh is max number of NaNs allowed in a row
    def __init__(self, thresh=0):
        self.thresh = thresh

    def fit(self, X, y):
        total = X.shape[1]
        # +1 to account for 'y' being added to the dframe
        new_thresh = total + 1 - self.thresh
        df = pd.DataFrame(X)
        df['y'] = y
        df.dropna(thresh=new_thresh, inplace=True)
        X = df.drop('y', axis=1).values
        y = df['y'].values
        return self

    def transform(self, X):
        return X
Modifying the sample axis, e.g. removing samples, does not (yet?) comply with the scikit-learn transformer API. So if you need to do this, you should do it outside any calls to scikit-learn, as preprocessing.
As it is now, the transformer API is used to transform the features of a given sample into something new. This can implicitly contain information from other samples, but samples are never deleted.
Another option is to attempt to impute the missing values. But again, if you need to delete samples, treat it as preprocessing before using scikit-learn.
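A minimal sketch of that preprocessing approach (my illustration, mirroring the question's thresh logic; X and y stand for your feature matrix and target):

import numpy as np
import pandas as pd

def drop_noisy_rows(X, y, thresh=0):
    # Keep rows with at most `thresh` NaNs across the features and target,
    # then hand the cleaned arrays to scikit-learn as usual.
    df = pd.DataFrame(X)
    df['y'] = y
    df = df.dropna(thresh=df.shape[1] - thresh)
    return df.drop('y', axis=1).values, df['y'].values

X_clean, y_clean = drop_noisy_rows(X, y, thresh=1)
# model.fit(X_clean, y_clean)  # any estimator or pipeline afterwards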
You have to modify the internal code of sklearn Pipeline.
We define a transformer that, during fitting (fit_transform), removes samples where at least one feature value or the target is NaN, while during inference (transform) it removes samples where at least one feature value is NaN. It is important to note that our transformer returns X and y from fit_transform, so we need to handle this behaviour in the sklearn Pipeline.
import numpy as np


class Dropna():
    def fit(self, X, y):
        return self

    def fit_transform(self, X, y):
        mask = (np.isnan(X).any(-1) | np.isnan(y))
        if hasattr(X, 'loc'):
            X = X.loc[~mask]
        else:
            X = X[~mask]
        if hasattr(y, 'loc'):
            y = y.loc[~mask]
        else:
            y = y[~mask]
        return X, y  ###### make fit_transform return X and y

    def transform(self, X):
        mask = np.isnan(X).any(-1)
        if hasattr(X, 'loc'):
            X = X.loc[~mask]
        else:
            X = X[~mask]
        return X
We only have to modify the original sklearn Pipeline at two specific points, in the fit and _fit methods. The rest remains unchanged.
from sklearn import pipeline
from sklearn.base import clone
from sklearn.utils import _print_elapsed_time
from sklearn.utils.validation import check_memory


class Pipeline(pipeline.Pipeline):

    def _fit(self, X, y=None, **fit_params_steps):
        self.steps = list(self.steps)
        self._validate_steps()
        memory = check_memory(self.memory)
        fit_transform_one_cached = memory.cache(pipeline._fit_transform_one)

        for (step_idx, name, transformer) in self._iter(
                with_final=False, filter_passthrough=False
        ):
            if transformer is None or transformer == "passthrough":
                with _print_elapsed_time("Pipeline", self._log_message(step_idx)):
                    continue

            try:
                # joblib >= 0.12
                mem = memory.location
            except AttributeError:
                mem = memory.cachedir
            finally:
                cloned_transformer = clone(transformer) if mem else transformer

            X, fitted_transformer = fit_transform_one_cached(
                cloned_transformer,
                X,
                y,
                None,
                message_clsname="Pipeline",
                message=self._log_message(step_idx),
                **fit_params_steps[name],
            )

            if isinstance(X, tuple):  ###### unpack X if it is the tuple X = (X, y)
                X, y = X

            self.steps[step_idx] = (name, fitted_transformer)

        return X, y

    def fit(self, X, y=None, **fit_params):
        fit_params_steps = self._check_fit_params(**fit_params)
        Xt = self._fit(X, y, **fit_params_steps)

        if isinstance(Xt, tuple):  ###### unpack X if it is the tuple X = (X, y)
            Xt, y = Xt

        with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
            if self._final_estimator != "passthrough":
                fit_params_last_step = fit_params_steps[self.steps[-1][0]]
                self._final_estimator.fit(Xt, y, **fit_params_last_step)

        return self
This is required in order to unpack the values generated by Dropna().fit_transform(X, y) into the new X and y.
Here is the full pipeline at work:
from sklearn.linear_model import Ridge

X = np.random.uniform(0, 1, (100, 3))
y = np.random.uniform(0, 1, (100,))
X[np.random.uniform(0, 1, (100,)) < 0.1] = np.nan
y[np.random.uniform(0, 1, (100,)) < 0.1] = np.nan

pipe = Pipeline([('dropna', Dropna()), ('model', Ridge())])
pipe.fit(X, y)
pipe.predict(X).shape
Another trial with a further intermediate preprocessing step:
from sklearn.preprocessing import StandardScaler
pipe = Pipeline([('dropna', Dropna()), ('scaler', StandardScaler()), ('model', Ridge())])
pipe.fit(X, y)
pipe.predict(X).shape
More complex behaviors can be achieved with other simple modifications according to your needs. If you are also interested in Pipeline().fit_transform or Pipeline().fit_predict, you need to apply the same changes there.
The package imblearn, which is built on top of sklearn, contains an estimator FunctionSampler that allows manipulating both the features array, X, and target array, y, in a pipeline step.
Note that using it in a pipeline step requires using the Pipeline class in imblearn that inherits from the one in sklearn. Furthermore, by default, in the context of Pipeline, the method resample does nothing when it is not called immediately after fit (as in fit_resample). So, read the documentation ahead of time.
Adding to @João Matias' response:
Here's an example of using imblearn to define a pipeline step that drops rows with missing values:
import numpy as np
import xgboost
from imblearn import FunctionSampler, pipeline
from imblearn.over_sampling import SMOTE


def drop_rows_with_any_nan(X, y):
    return X[~np.isnan(X).any(axis=1), :], y[~np.isnan(X).any(axis=1)]


drop_rows_with_any_nan_sampler = FunctionSampler(func=drop_rows_with_any_nan, validate=False)

model_clf2 = pipeline.Pipeline(
    [
        ('preprocess', column_transformer),  # a ColumnTransformer defined elsewhere
        ('drop_na', drop_rows_with_any_nan_sampler),
        ('smote', SMOTE()),
        ('xgb', xgboost.XGBClassifier()),
    ]
)
Note, you have to use the imblearn pipeline.
You can solve this easily by using sklearn.preprocessing.FunctionTransformer (http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html).
You just need to put your alterations to X in a function:
def drop_nans(X, y=None, thresh=0):
    total = X.shape[1]
    new_thresh = total - thresh
    df = pd.DataFrame(X)
    df.dropna(thresh=new_thresh, inplace=True)
    return df.values
then you get your transformer by calling
transformer = FunctionTransformer(drop_nans, validate=False)
which you can use in the pipeline. The threshold can be set from outside the drop_nans function.
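For instance (a hedged sketch using FunctionTransformer's kw_args parameter to pass the threshold in):

from sklearn.preprocessing import FunctionTransformer

# kw_args forwards extra keyword arguments to drop_nans at transform time
transformer = FunctionTransformer(drop_nans, validate=False, kw_args={'thresh': 2})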
@eickenberg's is the proper and clean answer. Nevertheless, I like to keep everything in one Pipeline, so if you are interested, I created a library (not yet deployed on PyPI) that allows applying transformations to y:
https://gitlab.com/thibaultB/transformers/
Usage is the following:
df = pd.DataFrame([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
df.columns = ["a", "b", "target"]

spliter = SplitXY("target")  # Create a new step and give it the name of the target column

pipe = Pipeline([
    ("imputer", SklearnPandasWrapper(KNNImputer())),
    ("spliter", spliter),
    ("scaler", StandardScaler()),
    # EstimatorWithoutYWrapper wraps RandomForestRegressor to get y from
    # spliter just before calling fit or transform
    ("rf", EstimatorWithoutYWrapper(RandomForestRegressor(random_state=45),
                                    spliter)),
])
pipe.fit(df)

res = pipe.predict(df)
Using this code, you can alter the number of rows as long as you put all the transformers that modify the number of rows before the SplitXY transformer. Transformers before the SplitXY transformer should keep the column names, which is why I also added a SklearnPandasWrapper that wraps sklearn transformers (which usually return numpy arrays) to preserve the column names.
You can use FunctionTransformer:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [np.nan, np.nan, 9], [7, np.nan, 9]])


def remove_na(df_, thresh=2):
    return df_.dropna(thresh=thresh)


pipe = make_pipeline(FunctionTransformer(func=remove_na,
                                         validate=False, kw_args={"thresh": 2}))
pipe.fit_transform(df)
Use "deep-copies" further on, down the pipeline and X, y remain protected
.fit() can first assign on each call deep-copy to new class-variables
self.X_without_NaNs = X.copy()
self.y_without_NaNs = y.copy()
and then reduce / transform these not to have more NaN-s than ordered by self.treshold
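A hedged sketch of what that could look like (my illustration; the attribute names follow the answer's suggestion, and the class name is hypothetical):

import numpy as np
import pandas as pd

class DropnaKeepingCopies:
    # Sketch: store deep copies of the cleaned data as instance attributes,
    # so the X and y passed in are never mutated.
    def __init__(self, thresh=0):
        self.thresh = thresh  # max NaNs allowed per row

    def fit(self, X, y):
        df = pd.DataFrame(np.array(X, copy=True))  # deep copy protects the caller's X
        df['y'] = np.array(y, copy=True)           # same for y
        df = df.dropna(thresh=df.shape[1] - self.thresh)
        self.X_without_NaNs = df.drop('y', axis=1).values
        self.y_without_NaNs = df['y'].values
        return self

    def transform(self, X):
        return X  # sample removal itself still cannot flow through the API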