import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import LinearRegression
df = pd.DataFrame({'brand' : ['aaaa', 'asdfasdf', 'sadfds', 'NaN'],
'category' : ['asdf','asfa','asdfas','as'],
'num1' : [1, 1, 0, 0] ,
'target' : [0.2,0.11,1.34,1.123]})
train_continuous_cols = df.select_dtypes(include=["int64","float64"]).columns.tolist()
train_categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
preprocess = make_column_transformer(
(StandardScaler(),train_continuous_cols),
(OneHotEncoder(), train_categorical_cols)
)
df= preprocess.fit_transform(df)
Just trying to get all the feature names:
preprocess.get_feature_names()
Getting this error:
Transformer standardscaler (type StandardScaler) does not provide get_feature_names
How can I solve it? The examples online use pipeline and I'm trying to avoid that.
The following re-implementation of the ColumnTransformer returns a pandas DataFrame. Note that it should only be used if you input a pandas DataFrame to your pipeline.
All kudos go to Johannes Haupt who provided the get_feature_names() function that is resilient to transformers that don't have this function (see blogpost Extracting Column Names from the ColumnTransformer). I commented off the warnings because I did not want them and also pre-prending the transformation step to the column name; but it is easy to un-comment as you like.
#import warnings
import sklearn
import pandas as pd
class ColumnTransformerWithNames(ColumnTransformer):
def get_feature_names(column_transformer):
"""Get feature names from all transformers.
Returns
-------
feature_names : list of strings
Names of the features produced by transform.
"""
# Remove the internal helper function
#check_is_fitted(column_transformer)
# Turn loopkup into function for better handling with pipeline later
def get_names(trans):
# >> Original get_feature_names() method
if trans == 'drop' or (
hasattr(column, '__len__') and not len(column)):
return []
if trans == 'passthrough':
if hasattr(column_transformer, '_df_columns'):
if ((not isinstance(column, slice))
and all(isinstance(col, str) for col in column)):
return column
else:
return column_transformer._df_columns[column]
else:
indices = np.arange(column_transformer._n_features)
return ['x%d' % i for i in indices[column]]
if not hasattr(trans, 'get_feature_names'):
# >>> Change: Return input column names if no method avaiable
# Turn error into a warning
# warnings.warn("Transformer %s (type %s) does not "
# "provide get_feature_names. "
# "Will return input column names if available"
# % (str(name), type(trans).__name__))
# For transformers without a get_features_names method, use the input
# names to the column transformer
if column is None:
return []
else:
return [#name + "__" +
f for f in column]
return [#name + "__" +
f for f in trans.get_feature_names()]
### Start of processing
feature_names = []
# Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
if type(column_transformer) == sklearn.pipeline.Pipeline:
l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
else:
# For column transformers, follow the original method
l_transformers = list(column_transformer._iter(fitted=True))
for name, trans, column, _ in l_transformers:
if type(trans) == sklearn.pipeline.Pipeline:
# Recursive call on pipeline
_names = column_transformer.get_feature_names(trans)
# if pipeline has no transformer that returns names
if len(_names)==0:
_names = [#name + "__" +
f for f in column]
feature_names.extend(_names)
else:
feature_names.extend(get_names(trans))
return feature_names
def transform(self, X):
indices = X.index.values.tolist()
original_columns = X.columns.values.tolist()
X_mat = super().transform(X)
new_cols = self.get_feature_names()
new_X = pd.DataFrame(X_mat.toarray(), index=indices, columns=new_cols)
return new_X
def fit_transform(self, X, y=None):
super().fit_transform(X, y)
return self.transform(X)
Then you can replace the calls to ColumnTransformer to ColumnTransformerWithNames. The output is a DataFrame and this step now has a working get_feature_names().
I am assuming you are looking for ways to access the result of the transformer, which yields a numpy array.
ColumnTransfomer has an attribute called transformers_ :`
From the documentation:
transformers_ : list
The collection of fitted transformers as tuples of
(name, fitted_transformer, column). `fitted_transformer` can be an
estimator, 'drop', or 'passthrough'. In case there were no columns
selected, this will be the unfitted transformer.
If there are remaining columns, the final element is a tuple of the
form:
('remainder', transformer, remaining_columns) corresponding to the
``remainder`` parameter. If there are remaining columns, then
``len(transformers_)==len(transformers)+1``, otherwise
``len(transformers_)==len(transformers)``.
So that provides unfortunately only information on the transformer itself and the column it has been applied to, however not on the location of the resulting data except for the following :
notes: The order of the columns in the transformed feature matrix follows the
order of how the columns are specified in the transformers list.
So we know that the order of the output columns is the same as the order in which the columns are specified in the transformers list. Plus, we also know for our transformer steps how much columns they yield, as a StandardScaler() yields the same number of columns as the original data and OneHotEncoder() yields number of columns equal to the number of categories.
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
df = pd.DataFrame({'brand' : ['aaaa', 'asdfasdf', 'sadfds', 'NaN'],
'category' : ['asdf','asfa','asdfas','asd'],
'num1' : [1, 1, 0, 0] ,
'target' : [0.2,0.11,1.34,1.123]})
train_continuous_cols = df.select_dtypes(include=["int64","float64"]).columns.tolist()
train_categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
# get n_categories for categorical features
n_categories = [df[x].nunique() for x in train_categorical_cols]
preprocess = make_column_transformer(
(StandardScaler(),train_continuous_cols),
(OneHotEncoder(), train_categorical_cols)
)
preprocessed_df = preprocess.fit_transform(df)
# the scaler yield 1 column each
indexes_scaler = list(range(0,len(train_continuous_cols)))
# the encoder yields a number of columns equal to the number of categories in the data
cum_index_encoder = [0] + list(np.cumsum(n_categories))
# the encoder indexes come after the scaler indexes
start_index_encoder = indexes_scaler[-1]+1
indexes_encoder = [x + start_index_encoder for x in cum_index_encoder]
# get both lower and uper bound of index
index_pairs= zip (indexes_encoder[:-1],indexes_encoder[1:])
This results in the following output:
print ('Transformed {} continious cols resulting in a df with shape:'.format(len(train_continuous_cols)))
print (preprocessed_df[: , indexes_scaler].shape)
Transformed 2 continious cols resulting in a df with shape:
(4, 2)
for column, (start_id, end_id) in zip (train_categorical_cols,index_pairs):
print('Transformed column {} resulted in a df with shape:'.format(column))
print(preprocessed_df[:, start_id:end_id].shape)
Transformed column brand resulted in a df with shape:
(4, 4)
Transformed column category resulted in a df with shape:
(4, 4)
Related
Trying to create a simple jackknife of some regression coefficients. For simplicity, I am including sample data and code I am using. My issue is that the regression coefficients produced by the jackknife are the same, without any variation. Not sure what I am missing out.
# the libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
# generate dataset
df = np.random.randint(5, size=(20,4))
df[:10:]
# the regression function
def smREG(data):
X = df[:, 1:]
y = df[:, 0]
mod = sm.OLS(y, sm.add_constant(X)).fit()
return mod.params
# call reg function
smREG(df)
# the jackknife function
def simple_jackknife(data, fn):
jack_sample = {}
jack_reps = {}
for i in range(len(data)):
# delete row 'i' from df
jack_sample[i] = np.delete(arr=data, obj=i, axis=0)
# run function with row 'i' deleted
jack_reps[i] = fn(jack_sample[i])
return jack_reps
# call jackknife function
out = simple_jackknife(data=df, fn=smREG)
out
# just showing the first three of sample output
{0: array([ 2.33483583, -0.06541933, -0.00764364, -0.16846914]),
1: array([ 2.33483583, -0.06541933, -0.00764364, -0.16846914]),
2: array([ 2.33483583, -0.06541933, -0.00764364, -0.16846914])}
My issue is that, I am not sure why the regression coefficients are not changing, even if infinitesimally, after a row is deleted.
I have label encoded a part of my data. Now I want to identify what label was given to which category.
below mentioned is the Label encoder code and its fit and transform on dataframe df creating df1.
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
class MultiColumnLabelEncoder:
def __init__(self,columns = None):
self.columns = columns # array of column names to encode
def fit(self,X,y=None):
return self # not relevant here
def transform(self,X):
'''
Transforms columns of X specified in self.columns using
LabelEncoder(). If no columns specified, transforms all
columns in X.
'''
output = X.copy()
if self.columns is not None:
for col in self.columns:
output[col] = LabelEncoder().fit_transform(output[col])
else:
for colname,col in output.iteritems():
output[colname] = LabelEncoder().fit_transform(col)
return output
def fit_transform(self,X,y=None):
return self.fit(X,y).transform(X)
df1 = MultiColumnLabelEncoder(columns = ['EntryTerm','DEPENDENCYCODE']).fit_transform(df)
Here EntryTerm has two categories and DEPENDENCYCODE has multiple categories.
I want to identify if EntryTerm = 082021 was assigned 0 or 1 as label. And if DEPENCENCYCODE = 'B' was assigned 0, 1, 2 ,3 or 4 label.
Thanks.
In the case a dataframe has two or more columns with numerical and text values, and one Label/Target column, if I want to apply a model like svm, how can I use only the columns I am more interested in?
Ex.
Data Num Label/Target No_Sense
What happens here? group1 1 Migrate
Customer Management group2 0 Change Stage
Life Cycle Stages group1 1 Restructure
Drop-down allows to select status type group3 1 Restructure Status
and so.
The approach I have taken is
1.encode "Num" column:
one_hot = pd.get_dummies(df['Num'])
df = df.drop('Num',axis = 1)
df = df.join(one_hot)
2.encode "Data" column:
def bag_words(df):
df = basic_preprocessing(df)
count_vectorizer = CountVectorizer()
count_vectorizer.fit(df['Data'])
list_corpus = df["Data"].tolist()
list_labels = df["Label/Target"].tolist()
X = count_vectorizer.transform(list_corpus)
return X, list_labels
Then apply bag_words to the dataset
X, y = bag_words(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
Is there anything that I missed in these steps? How can I select only "Data" and "Num" features in my training dataset? (as I think "No_Sense" is not so relevant for my purposes)
EDIT: I have tried with
def bag_words(df):
df = basic_preprocessing(df)
count_vectorizer = CountVectorizer()
count_vectorizer.fit(df['Data'])
list_corpus = df["Data"].tolist()+ df["group1"].tolist()+df["group2"].tolist()+df["group3"].tolist() #<----
list_labels = df["Label/Target"].tolist()
X = count_vectorizer.transform(list_corpus)
return X, list_labels
but I have found the error:
TypeError: 'int' object is not iterable
I hope this helps you:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
#this part so I can recreate you df from the string you posted
#remove this part !!!!
data="""
Data Num Label/Target No_Sense
What happens here? group1 1 Migrate
Customer Management group2 0 Change Stage
Life Cycle Stages group1 1 Restructure
Drop-down allows to select status type group3 1 Restructure Status
"""
df = pd.DataFrame(np.array( [ re.split(r'\s{2,}', line) for line in lines[1:] ] ),
columns = lines[0].split())
#what you want starts from here!!!!:
one_hot = pd.get_dummies(df['Num'])
df = df.drop('Num',axis = 1)
df = df.join(one_hot)
#at this point you have 3 new fetures for 'Num' variable
def bag_words(df):
count_vectorizer = CountVectorizer()
count_vectorizer.fit(df['Data'])
matrix = count_vectorizer.transform(df['Data'])
#this dataframe: `encoded_df`has 15 new features, these are the result of fitting
#the CountVectorizer to the 'Data' variable
encoded_df = pd.DataFrame(data=matrix.toarray(), columns=["Data"+str(i) for i in range(matrix.shape[1])])
#adding them to the dataframe
df.join(encoded_df)
#getting the numpy arrays that you can use in training
X = df.loc[:, ["Data"+str(i) for i in range(matrix.shape[1])] + ["group1", "group2", "group3"]].to_numpy()
y = df.loc[:, ["Label/Target"]].to_numpy()
return X, y
X, y = bag_words(df)
I get a ValueError: Found input variables with inconsistent numbers of samples: [20000, 1] when I run the following even though the row values of x and y are correct. I load in the RCV1 dataset, get indices of the categories with the top x documents, create list of tuples with equal number of randomly-selected positives and negatives for each category, and then finally attempt to run a logistic regression on one of the categories.
import sklearn.datasets
from sklearn import model_selection, preprocessing
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from scipy import sparse
rcv1 = sklearn.datasets.fetch_rcv1()
def get_top_cat_indices(target_matrix, num_cats):
cat_counts = target_matrix.sum(axis=0)
#cat_counts = cat_counts.reshape((1,103)).tolist()[0]
cat_counts = cat_counts.reshape((103,))
#b = sorted(cat_counts, reverse=True)
ind_temp = np.argsort(cat_counts)[::-1].tolist()[0]
ind = [ind_temp[i] for i in range(5)]
return ind
def prepare_data(x, y, top_cat_indices, sample_size):
res_lst = []
for i in top_cat_indices:
# get column of indices with relevant cat
temp = y.tocsc()[:, i]
# all docs with labeled category
cat_present = x.tocsr()[np.where(temp.sum(axis=1)>0)[0],:]
# all docs other than labelled category
cat_notpresent = x.tocsr()[np.where(temp.sum(axis=1)==0)[0],:]
# get indices equal to 1/2 of sample size
idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size/2))
idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size/2))
# concatenate the ids
sampled_x_pos = cat_present.tocsr()[idx_cat,:]
sampled_x_neg = cat_notpresent.tocsr()[idx_nocat,:]
sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
sampled_y_pos = temp.tocsr()[idx_cat,:]
sampled_y_neg = temp.tocsr()[idx_nocat,:]
sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
res_lst.append((sampled_x, sampled_y))
return res_lst
ind = get_top_cat_indices(rcv1.target, 5)
test_res = prepare_data(train_x, train_y, ind, 20000)
x, y = test_res[0]
print(x.shape)
print(y.shape)
LogisticRegression().fit(x, y)
Could it be an issue with the sparse matrices, or problem with dimensionality (there are 20K samples and 47K features)
When I run your code, I get following error:
AttributeError: 'bool' object has no attribute 'any'
That's because y for LogisticRegression needs to numpy array. So, I changed last line to:
LogisticRegression().fit(x, y.A.flatten())
Then I get following error:
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0
This is because your sampling code has a bug. You need to subset y array with rows having that category before using sampling indices. See code below:
def prepare_data(x, y, top_cat_indices, sample_size):
res_lst = []
for i in top_cat_indices:
# get column of indices with relevant cat
temp = y.tocsc()[:, i]
# all docs with labeled category
c1 = np.where(temp.sum(axis=1)>0)[0]
c2 = np.where(temp.sum(axis=1)==0)[0]
cat_present = x.tocsr()[c1,:]
# all docs other than labelled category
cat_notpresent = x.tocsr()[c2,:]
# get indices equal to 1/2 of sample size
idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size/2))
idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size/2))
# concatenate the ids
sampled_x_pos = cat_present.tocsr()[idx_cat,:]
sampled_x_neg = cat_notpresent.tocsr()[idx_nocat,:]
sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
sampled_y_pos = temp.tocsr()[c1][idx_cat,:]
print(sampled_y_pos.nnz)
sampled_y_neg = temp.tocsr()[c2][idx_nocat,:]
print(sampled_y_neg.nnz)
sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
res_lst.append((sampled_x, sampled_y))
return res_lst
Now, Everything works like a charm
Data.csv: param1,param2,param3,result
1,2,cat1,12
2,3,cat2,13
1,6,cat1,6
1,1,cat2,12
Suppose i read the data from the file and convert categorical variables into dummy variables like this:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
data = pd.read_csv('data.csv')
type_dummies = pd.get_dummies(data.house_type)
data = pd.concat([data, type_dummies], axis=1)
I received dataframe:
1,2,1,0,..
1,6,0,1,..
I made simple linear regression for that dataset and received coeffs. How can i convert a new record (new_data = np.array([12,19,cat1])) for new_data = np.array([12,19,1,0)) using pandas for using it in my linear model? (such that new data categorical variables will be converted into dummy variables)
Typically you'll want to setup a pipeline to record the correct category:code mapping.
class CategoricalTransformer(TransformerMixin):
def fit(self, X, y=None, *args, **kwargs):
self.columns_ = X.columns
self.cat_columns_ = X.select_dtypes(include=['category']).columns
self.non_cat_columns_ = X.columns.drop(self.cat_columns_)
self.cat_map_ = {col: X[col].cat.categories
for col in self.cat_columns_}
self.ordered_ = {col: X[col].cat.ordered
for col in self.cat_columns_}
self.dummy_columns_ = {col: ["_".join([col, v])
for v in self.cat_map_[col]]
for col in self.cat_columns_}
self.transformed_columns_ = pd.Index(
self.non_cat_columns_.tolist() +
list(chain.from_iterable(self.dummy_columns_[k]
for k in self.cat_columns_))
)
def transform(self, X, y=None, *args, **kwargs):
return (pd.get_dummies(X)
.reindex(columns=self.transformed_columns_)
.fillna(0))
More here.
with the pipeline sklearn.pipeline.make_pipeline(CategoricalTransformer(), LinearRegression()), your predict method should correctly translate from the categorical house_type to variables.