label-encoder encoding a dataframe without encoding NaN missing values - python

I have a dataframe that contains Numerical, categorical and NaN values.
customer_class B C
0 OM1 1 2.0
1 NaN 6 1.0
2 OM1 9 NaN
....
I need a LabelEncoder that keeps my missing values as 'NaN' to use an Imputer afterwards.
So I would like to use this code to encode my dataframe while keeping the NaN values.
here is the code :
class LabelEncoderByCol(BaseEstimator, TransformerMixin):
def __init__(self,col):
#List of column names in the DataFrame that should be encoded
self.col = col
#Dictionary storing a LabelEncoder for each column
self.le_dic = {}
for el in self.col:
self.le_dic[el] = LabelEncoder()
def fit(self,x,y=None):
#Fill missing values with the string 'NaN'
x[self.col] = x[self.col].fillna('NaN')
for el in self.col:
#Only use the values that are not 'NaN' to fit the Encoder
a = x[el][x[el]!='NaN']
self.le_dic[el].fit(a)
return self
def transform(self,x,y=None):
#Fill missing values with the string 'NaN'
x[self.col] = x[self.col].fillna('NaN')
for el in self.col:
#Only use the values that are not 'NaN' to fit the Encoder
a = x[el][x[el]!='NaN']
#Store an ndarray of the current column
b = x[el].get_values()
#Replace the elements in the ndarray that are not 'NaN'
#using the transformer
b[b!='NaN'] = self.le_dic[el].transform(a)
#Overwrite the column in the DataFrame
x[el]=b
#return the transformed D
col = data1['customer_class']
LabelEncoderByCol(col)
LabelEncoderByCol.fit(x=col,y=None)
But I got this error :
846 if mask.any():
--> 847 raise ValueError('%s not contained in the index' % str(key[mask]))
848 self._set_values(indexer, value)
849
ValueError: ['OM1' 'OM1' 'OM1' ... 'other' 'EU' 'EUB'] not contained in the index
Any idea please to resolve this error?
thanks

Two things jumped out to me when I tried to reproduce:
Your code seems to expect a dataframe will be passed to your class. But in your example you passed a series. I fixed this by wrapping the series as a dataframe before passing it to your class: col = pd.DataFrame(data1['customer_class']).
In your class' __init__ method it seemed like you had intended to iterate through a list of column names, but instead were actually iterating through all of your columns, series by series. I fixed this by changing the appropriate line to: self.col = col.columns.values.
Below, I've pasted in my modifications to your class' __init__ and fit methods (my only modification to the transform method was to have it return the modified dataframe):
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
data1 = pd.DataFrame({'customer_class': ['OM1', np.nan, 'OM1'],
'B': [1,6,9],
'C': [2.0, 1.0, np.nan]})
class LabelEncoderByCol(BaseEstimator, TransformerMixin):
    """Label-encode DataFrame columns while leaving missing values unencoded.

    One ``LabelEncoder`` is kept per column. Missing cells are replaced by the
    placeholder string ``'NaN'``, excluded from fitting/encoding, and are still
    that placeholder string in the frame returned by ``transform``.

    Parameters
    ----------
    col : DataFrame
        Frame whose column names select the columns to encode.
    """

    def __init__(self, col):
        # Names of the columns in the DataFrame that should be encoded
        self.col = col.columns.values
        # Dictionary storing a LabelEncoder for each column
        self.le_dic = {}
        for el in self.col:
            self.le_dic[el] = LabelEncoder()

    def fit(self, x, y=None):
        """Fit each column's encoder on its non-missing values; return self."""
        # fillna returns a new frame, so the caller's data is left untouched
        x = x.fillna('NaN')
        for el in self.col:
            # Only use the values that are not 'NaN' to fit the encoder
            a = x[el][x[el] != 'NaN']
            self.le_dic[el].fit(a)
        return self

    def transform(self, x, y=None):
        """Return a copy of ``x`` with the selected columns label-encoded."""
        # Work on a copy: the original assigned into the caller's frame
        x = x.copy()
        # Fill missing values with the string 'NaN'
        x[self.col] = x[self.col].fillna('NaN')
        for el in self.col:
            # Values that must be encoded (everything except the placeholder)
            a = x[el][x[el] != 'NaN']
            # Series.get_values() was removed in pandas 1.0; request an explicit
            # writable object copy so the in-place edit below is always allowed
            b = x[el].to_numpy(dtype=object, copy=True)
            # Replace the elements that are not 'NaN' with their codes
            b[b != 'NaN'] = self.le_dic[el].transform(a)
            # Overwrite the column in the (copied) DataFrame
            x[el] = b
        return x
I am able to run the following lines (also slightly modified from your initial implementation) with no error:
col = pd.DataFrame(data1['customer_class'])
lenc = LabelEncoderByCol(col)
lenc.fit(x=col,y=None)
I can then access the classes for the customer_class column from your example:
lenc.fit(x=col,y=None).le_dic['customer_class'].classes_
Which outputs:
array(['OM1'], dtype=object)
Finally, I can transform the column using your class' transform method:
lenc.transform(x=col,y=None)
Which outputs the following:
customer_class
0 0
1 NaN
2 0

Related

How to substitute NaN for a text in a DataFrame?

I have a DataFrame and I need to change the content of the cells of a specific column to a text content (for example "not registered").
I am trying different options, these are some of them:
dftotal.fillna({"Computer_OS":"not registered", "Computer_OS_version":"not registered"}, inplace=True)
dftotal.loc[(dftotal["Computer_OS"]=="NaN"),"Computer_OS"] = "not registered"
Assumed that all values in Computer_OS column are string datatype else you would need to change datatype first.
import numpy as np
import pandas as pd
import re
def txt2nan(x):
    """
    Return NaN if the string x contains any ASCII letter, else x unchanged.

    Parameters
    ----------
    x : str

    Returns
    -------
    float or str
        ``np.nan`` when ``x`` contains at least one of a-z / A-Z, otherwise
        the original ``x``.
    """
    # re.search scans the whole string. re.match is anchored at position 0,
    # so an input such as "1a" was returned unchanged despite containing a letter.
    if re.search('[a-zA-Z]', x):
        return np.nan
    return x
df = pd.DataFrame({"os":["tsd", "ssad d", "sd", "1","2","3"]})
df["os"] = df["os"].apply(txt2nan)
A better solution is to vectorize the above operation:
# str.contains looks for a letter anywhere in the value; str.match only tests
# the start of the string. na=False keeps the mask strictly boolean.
df["os"] = np.where(df["os"].str.contains('[a-zA-Z]', na=False), np.nan, df["os"])

How to save values in pandas dataframe after editing some values

I have a dataframe which looks like this (It contains dummy data) -
I want to remove the text which occurs after "_________" identifier in each of the cells. I have written the code as follows (Logic: Adding a new column containing NaN and saving the edited values in that column) -
import pandas as pd
import numpy as np
df = pd.read_excel(r'Desktop\Trial.xlsx')
NaN = np.nan
df["Body2"] = NaN
substring = "____________"
for index, row in df.iterrows():
if substring in row["Body"]:
split_string = row["Body"].split(substring,1)
row["Body2"] = split_string[0]
print(df)
But the Body2 column still displays NaN and not the edited values.
Any help would be much appreciated!
`for index, row in df.iterrows():
if substring in row["Body"]:
split_string = row["Body"].split(substring,1)
#row["Body2"] = split_string[0] # instead use below line
df.at[index,'Body2'] = split_string[0]`
Make use of at to modify the value
Instead of iterating through the rows, do the operation on all rows at once. You can use expand to split the values into multiple columns, which I think is what you want.
substring = "____________"
df = pd.DataFrame({'Body': ['a____________b', 'c____________d', 'e____________f', 'gh']})
# n=1 splits at the first separator only, so the result always has at most two
# columns even when a cell contains the separator more than once.
df[['Body1', 'Body2']] = df['Body'].str.split(substring, n=1, expand=True)
print(df)
# Body Body1 Body2
# 0 a____________b a b
# 1 c____________d c d
# 2 e____________f e f
# 3 gh gh None

how to resolve 'argument must be a string or number' in label encoder?

My dataset(i.e. dataset_train) has 43 categorical features, and I want to apply LabelEncoder in each of the columns.
Here i got all the categorical features:categorical_features = [features for features in dataset_train.columns if dataset_train[features].dtype == 'O']
Now a class is created to perform multiple label encoding:
class MultiColumnLabelEncoder:
    """Apply a fresh ``LabelEncoder`` to several DataFrame columns at once.

    Parameters
    ----------
    columns : list of column names, optional
        Columns to encode. When ``None``, every column of the input is encoded.
    """

    def __init__(self, columns=None):
        self.columns = columns  # array of column names to encode

    def fit(self, X, y=None):
        """Do nothing and return self; encoders are fitted inside transform."""
        return self  # not relevant here

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded."""
        output = X.copy()
        # Iterate column names directly: DataFrame.iteritems() was removed in
        # pandas 2.0, and both original branches did the same per-column work.
        if self.columns is not None:
            columns = self.columns
        else:
            columns = list(output.columns)
        for col in columns:
            output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X, y).transform(X)
Now after executing:
from sklearn.preprocessing import LabelEncoder
dataset_train = MultiColumnLabelEncoder(columns = categorical_features).fit_transform(dataset_train)
It's showing me :
TypeError: argument must be a string or number
Note that - NaN values are also handled previously.
how to handle this situation??
Ok, it is solved.
One of my categorical_features had just one NaN value in it.
Previously I wrote this below code to get my categorical_features containing NaN values:
categorical_features = [ feature for feature in dataset_train.columns if dataset_train[feature].isnull().sum() > 1 and dataset_train[feature].dtype == 'O' ]
That's why I got those features which had more than 1 missing values in its column.
That's why I faced this error.
Now, after I modified the above code to:
categorical_features = [ feature for feature in dataset_train.columns if dataset_train[feature].isnull().sum() >= 1 and dataset_train[feature].dtype == 'O' ]
Now I got every column even with single NaN value.
It can execute the above code for MultiColumnLabelEncoder which I mentioned in my question.
Thanks for cooperation ♥️.

Python pandas data frame clean with dictionary of regular expressions

I want to clean a pandas data frame using a dictionary of regular expressions representing allowed data entry formats.
I'm trying to iterate over the input data frame so to check every row against the allowed data entry format for a given column.
If an entry doesn't meet the format allowed for the column, I want to replace it with NaN (see desired output below).
My current code gives me an error message: 'DataFrame' object has no attribute 'col'.
My MWE features two representative regular expressions, but for my actual data set I've got ~40.
Thanks for any help!
# Packages
import pandas as pd
import re
import numpy as np
# Input data frame
data = {'score': [71,72,55,'a'],
'bet': [0.260,0.380,'0.8dd',0.260]
}
df1 = pd.DataFrame(data, columns = ['score', 'bet'])
# Input dictionary
dict1 = {'score':'^\d+$',
'bet': '^\d[\.]\d+$'}
# Cleaning function
def cleaner(df, dict):
for col in df.columns:
if col in dict:
for row in df.col:
if re.match(dict[col], str(row)):
row = row
else:
row = np.nan
return(df)
cleaned_df = cleaner(df1, dict1)
# ERROR MESSAGE
# 'DataFrame' object has no attribute 'col'
# Desired output
goal_data = {'score': [71,72,55, np.nan],
'bet': [0.260,0.380, np.nan, 0.260]
}
goal_df = pd.DataFrame(goal_data, columns = ['score', 'bet'])
there is a problem with your cleaning function in the if statement.
try running the following cleaner function in place of yours.
# Cleaning function
def cleaner(df, dict):
    """Replace entries that do not match their column's regex with NaN.

    Parameters
    ----------
    df : DataFrame
        Frame to clean. It is modified in place and also returned.
    dict : mapping of column name -> regular expression
        Allowed format per column; columns without an entry are left alone.
        (The parameter keeps its original name, which shadows the builtin.)

    Returns
    -------
    DataFrame
        The same ``df`` object, with non-matching cells set to ``np.nan``.
    """
    for col in df.columns:
        if col not in dict:
            continue
        compiled = re.compile(dict[col])
        # Each cell is compared through its str() form, anchored at the start
        # of the string exactly like re.match in the cell-by-cell version.
        matches = df[col].map(
            lambda value: compiled.match(str(value)) is not None
        ).astype(bool)
        mismatched = ~matches
        if mismatched.any():
            # One .loc write per column replaces the chained df[col][row] = ...
            # assignment, which may silently operate on a temporary copy.
            df.loc[mismatched, col] = np.nan
    return df
print(cleaner(df1, dict1))
cleaned_df = cleaner(df1, dict1)
Try np.where(if condition, yes,else alternative)
import pandas as pd
import numpy as np
# Compare the string form of every cell: .str.match returns NaN for non-string
# cells, which np.where would treat as a match. Raw strings keep the regex
# escapes (\d, \.) from being read as invalid Python string escapes.
df1['score'] = np.where(df1['score'].astype(str).str.match(r'^\d+$'), df1['score'], np.nan)
df1['bet'] = np.where(df1['bet'].astype(str).str.match(r'^\d[\.]\d+$'), df1['bet'], np.nan)
score bet
0 71 0.26
1 72 0.38
2 55 NaN
3 NaN 0.26

label-encoder encoding missing values

I am using the label encoder to convert categorical data into numeric values.
How does LabelEncoder handle missing values?
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
a = pd.DataFrame(['A','B','C',np.nan,'D','A'])
le = LabelEncoder()
le.fit_transform(a)
Output:
array([1, 2, 3, 0, 4, 1])
For the above example, label encoder changed NaN values to a category. How would I know which category represents missing values?
Don't use LabelEncoder with missing values. I don't know which version of scikit-learn you're using, but in 0.17.1 your code raises TypeError: unorderable types: str() > float().
As you can see in the source it uses numpy.unique against the data to encode, which raises TypeError if missing values are found. If you want to encode missing values, first change its type to a string:
a[pd.isnull(a)] = 'NaN'
You can also use a mask to restore the missing values from the original data frame after labelling:
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
df = pd.DataFrame({'A': ['x', np.nan, 'z'], 'B': [1, 6, 9], 'C': [2, 1, np.nan]})
A B C
0 x 1 2.0
1 NaN 6 1.0
2 z 9 NaN
# Keep a reference to the un-encoded frame; df is rebound to the encoded
# result further down, so this object itself is never modified.
original = df
# The mask must be built from df itself (df_1 was an undefined name).
mask = df.isnull()
A B C
0 False False False
1 True False False
2 False False True
df = df.astype(str).apply(LabelEncoder().fit_transform)
df.where(~mask, original)
A B C
0 1.0 0 1.0
1 NaN 1 0.0
2 2.0 2 NaN
Hello a little computational hack I did for my own work:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
a = pd.DataFrame(['A','B','C',np.nan,'D','A'])
le = LabelEncoder()
### fit with the desired col, col in position 0 for this example
# Only string entries are used for fitting, so NaN never becomes a class.
fit_by = pd.Series([i for i in a.iloc[:,0].unique() if isinstance(i, str)])
le.fit(fit_by)
### Set transformed col leaving np.nan as they are
# Apply over the data column itself. Applying over fit_by (the unique labels)
# produced one row per label and misaligned the result with the frame.
a["transformed"] = a.iloc[:,0].apply(lambda x: le.transform([x])[0] if isinstance(x, str) else x)
This is my solution, because I was not pleased with the solutions posted here. I needed a LabelEncoder that keeps my missing values as NaN to use an Imputer afterwards. So I have written my own LabelEncoder class. It works with DataFrames.
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelEncoder
class LabelEncoderByCol(BaseEstimator, TransformerMixin):
    """Label-encode selected DataFrame columns while leaving missing values unencoded.

    One ``LabelEncoder`` is kept per column. Missing cells are replaced by the
    placeholder string ``'NaN'``, excluded from fitting/encoding, and are still
    that placeholder string in the frame returned by ``transform``.

    Parameters
    ----------
    col : list of column names
        Columns in the DataFrame that should be encoded.
    """

    def __init__(self, col):
        # List of column names in the DataFrame that should be encoded
        self.col = col
        # Dictionary storing a LabelEncoder for each column
        self.le_dic = {}
        for el in self.col:
            self.le_dic[el] = LabelEncoder()

    def fit(self, x, y=None):
        """Fit each column's encoder on its non-missing values; return self."""
        # Fill into a separate frame: the original wrote the 'NaN' placeholder
        # back into the caller's data as a side effect of fitting.
        filled = x[self.col].fillna('NaN')
        for el in self.col:
            # Only use the values that are not 'NaN' to fit the encoder
            a = filled[el][filled[el] != 'NaN']
            self.le_dic[el].fit(a)
        return self

    def transform(self, x, y=None):
        """Return a copy of ``x`` with the selected columns label-encoded."""
        # Work on a copy so the caller's frame is not overwritten
        x = x.copy()
        # Fill missing values with the string 'NaN'
        x[self.col] = x[self.col].fillna('NaN')
        for el in self.col:
            # Values that must be encoded (everything except the placeholder)
            a = x[el][x[el] != 'NaN']
            # Explicit writable object copy: a plain to_numpy() may hand back a
            # view, which is read-only when pandas copy-on-write is active.
            b = x[el].to_numpy(dtype=object, copy=True)
            # Replace the elements that are not 'NaN' with their codes
            b[b != 'NaN'] = self.le_dic[el].transform(a)
            # Overwrite the column in the (copied) DataFrame
            x[el] = b
        # return the transformed DataFrame
        return x
You can enter a DataFrame, not only a 1-dim Series. with col you can chose the columns that should be encoded.
I would like to hear some feedback.
I want to share with you my solution.
I created a module that takes a mixed dataset and converts it from categorical to numerical,
and back again.
This module is also available on my GitHub, well organized and with an example.
Please upvote if you like my solution.
Tks,
Idan
class label_encoder_contain_missing_values:
    """Encode text columns as numbers (and back) while keeping missing values missing."""

    def __init__(self):
        pass

    def categorical_to_numeric(self, dataset):
        """Replace every text column of ``dataset`` with numeric codes.

        Each distinct value gets its position in ``Series.unique()`` order as
        its code; missing values stay NaN (and still occupy a position, so the
        codes can have a gap there).

        ``dataset`` is modified in place and is also returned.

        Returns
        -------
        tuple
            ``(table_encoder, dataset)`` where ``table_encoder`` maps each
            encoded column name to a DataFrame with the columns ``'column'``
            (original value) and ``'Encode'`` (numeric code).
        """
        import numpy as np
        import pandas as pd

        self.dataset = dataset
        self.summary = None
        self.table_encoder = {}
        for index in self.dataset.columns:
            column = self.dataset[index]
            # Object columns as before, plus pandas' dedicated string dtype.
            is_text = column.dtype == 'object' or isinstance(column.dtype, pd.StringDtype)
            if not is_text:
                continue
            unique_values = list(column.unique())
            # A dict lookup replaces the row-by-row DataFrame.append calls
            # (removed in pandas 2.0) and the merge that followed them.
            codes = {
                value: float(position)
                for position, value in enumerate(unique_values)
                if not pd.isna(value)
            }
            self.summary = pd.DataFrame({
                'column': pd.Series(unique_values, dtype=object),
                'Encode': [
                    np.nan if pd.isna(value) else codes[value]
                    for value in unique_values
                ],
            })
            # map() keeps the column's own index, so frames without a default
            # 0..n-1 index are written back to the correct rows.
            self.dataset[index] = column.map(codes)
            self.table_encoder.update({index: self.summary})
        # ---- Show Encode Table ----- #
        print('''\nLabel Encoding completed in Successfully.\n
Next steps: \n
1. To view table_encoder, Execute the follow: \n
for index in table_encoder :
print(f'\\n{index} \\n',table_encoder[index])
2. For inverse, execute the follow : \n
df = label_encoder_contain_missing_values().
inverse_numeric_to_categorical(table_encoder, df) ''')
        return self.table_encoder, self.dataset

    def inverse_numeric_to_categorical(self, table_encoder, df):
        """Map the codes in ``df`` back to their original values.

        Parameters
        ----------
        table_encoder : dict
            Mapping of column name to a code table with ``'column'`` and
            ``'Encode'`` columns, as returned by ``categorical_to_numeric``.
        df : DataFrame
            Frame holding the encoded columns. It is not modified.

        Returns
        -------
        DataFrame
            A copy of ``df`` with the listed columns decoded. Codes without an
            entry in the table, and missing codes, become NaN.
        """
        # The copy is what gets decoded and returned; previously it was created
        # but unused and the caller's frame was overwritten instead.
        dataset = df.copy()
        for column in table_encoder.keys():
            table = table_encoder[column].dropna(subset=['Encode'])
            decode = dict(zip(table['Encode'], table['column']))
            dataset[column] = df[column].map(decode)
        # The message used to state the opposite conversion direction.
        print('\nInverse Label Encoding, from numerical to categorical completed in Successfully.\n')
        return dataset
**execute command from categorical to numerical** <br>
table_encoder, df = label_encoder_contain_missing_values().categorical_to_numeric(df)
**execute command from numerical to categorical** <br>
df = label_encoder_contain_missing_values().inverse_numeric_to_categorical(table_encoder, df)
An easy way is this
It is an example of Titanic
LABEL_COL = ["Sex", "Embarked"]
def label(df):
    """Return a copy of ``df`` whose LABEL_COL columns are label-encoded.

    Rows where a column is NaN are skipped, so they stay NaN in the result.
    """
    encoded = df.copy()
    encoder = LabelEncoder()
    for name in LABEL_COL:
        # Rows holding an actual value in this column
        present = ~encoded[name].isna()
        values = encoded.loc[present, name]
        # The shared encoder is re-fitted for every column
        encoded.loc[present, name] = encoder.fit(values).transform(values)
    return encoded
The most-voted answer, by @Kerem, has typos, so I am posting a corrected and improved version here:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
a = pd.DataFrame(['A','B','C',np.nan,'D','A'])
for j in a.columns.values:
    le = LabelEncoder()
    # Fit only on the string entries of this column, so NaN never becomes a class
    known_labels = [value for value in a[j].unique() if type(value) == str]
    le.fit(pd.Series(known_labels))

    def encode(value, encoder=le):
        """Return the code for a string value; pass anything else through."""
        if type(value) == str:
            return encoder.transform([value])[0]
        return value

    # Non-string cells (np.nan) are left as they are
    a["transformed"] = a[j].apply(encode)
You can handle missing values by replacing them with the string 'NaN'. The category can then be obtained with le.transform().
le.fit_transform(a.fillna('NaN'))
category = le.transform(['NaN'])
Another solution is for label encoder to ignore missing values.
a = le.fit_transform(a.astype(str))
You can fill the na's by some value and later change the dataframe column type to string to make things work.
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
a = pd.DataFrame(['A','B','C',np.nan,'D','A'])
# fillna returns a new frame, so the result has to be assigned; otherwise the
# NaN is still there and is only turned into the text 'nan' by astype(str).
# The object cast lets a non-string fill value be stored in a text column.
a = a.astype(object).fillna(99)
le = LabelEncoder()
le.fit_transform(a.astype(str))
Following encoder addresses None values in each category.
class MultiColumnLabelEncoder:
    """Label-encode every column of a DataFrame, treating None as the category "None"."""

    def __init__(self):
        self.columns = None
        # One LabelEncoder per column name, created on first access
        self.led = defaultdict(preprocessing.LabelEncoder)

    def fit(self, X):
        """Fit one encoder per column of ``X`` and return self."""
        self.columns = X.columns
        for col in self.columns:
            categories = ["None" if value is None else value for value in X[col].unique()]
            self.led[col].fit(categories)
        return self

    def fit_transform(self, X):
        """Fit only if no columns have been recorded yet, then transform ``X``."""
        if self.columns is None:
            self.fit(X)
        return self.transform(X)

    def transform(self, X):
        """Encode each column of ``X`` with the encoder stored under its name."""
        def encode_column(column):
            cleaned = column.apply(lambda e: "None" if e is None else e)
            return self.led[column.name].transform(cleaned)

        return X.apply(encode_column)

    def inverse_transform(self, X):
        """Decode each column of ``X`` with the encoder stored under its name."""
        def decode_column(column):
            return self.led[column.name].inverse_transform(column)

        return X.apply(decode_column)
Uses Example
df = pd.DataFrame({
'pets': ['cat', 'dog', 'cat', 'monkey', 'dog', 'dog'],
'owner': ['Champ', 'Ron', 'Brick', None, 'Veronica', 'Ron'],
'location': ['San_Diego', 'New_York', 'New_York', 'San_Diego', 'San_Diego',
None]
})
print(df)
location owner pets
0 San_Diego Champ cat
1 New_York Ron dog
2 New_York Brick cat
3 San_Diego None monkey
4 San_Diego Veronica dog
5 None Ron dog
le = MultiColumnLabelEncoder()
le.fit(df)
transformed = le.transform(df)
print(transformed)
location owner pets
0 2 1 0
1 0 3 1
2 0 0 0
3 2 2 2
4 2 4 1
5 1 3 1
inverted = le.inverse_transform(transformed)
print(inverted)
location owner pets
0 San_Diego Champ cat
1 New_York Ron dog
2 New_York Brick cat
3 San_Diego None monkey
4 San_Diego Veronica dog
5 None Ron dog
This function takes a column from a dataframe and return the column where only non-NaNs are label encoded, the rest remains untouched
import pandas as pd
from sklearn.preprocessing import LabelEncoder
def label_encode_column(col):
    """Label-encode the non-missing entries of a column; missing ones stay NaN.

    Parameters
    ----------
    col : Series
        Column to encode. Values are compared by their ``str`` form.

    Returns
    -------
    DataFrame
        Single column named ``0`` holding float codes, with NaN wherever
        ``col`` was missing. Rows keep the order and index of ``col``.
    """
    # A positional boolean mask replaces the col[idx] lookups, which were
    # label-based and therefore only valid for a default 0..n-1 index.
    present = col.notnull().to_numpy()
    encoded = pd.Series(float("nan"), index=col.index, dtype="float64")
    if present.any():
        labels = col[present].astype(str)
        encoded[present] = LabelEncoder().fit_transform(labels)
    return encoded.to_frame(0)
This is how I did it:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
UNKNOWN_TOKEN = '<unknown>'
a = pd.Series(['A','B','C', 'D','A'], dtype=str).unique().tolist()
a.append(UNKNOWN_TOKEN)
le = LabelEncoder()
le.fit_transform(a)
embedding_map = dict(zip(le.classes_, le.transform(le.classes_)))
and when applying to new test data:
test_df = test_df.apply(lambda x: x if x in embedding_map else UNKNOWN_TOKEN)
le.transform(test_df)
I also wanted to contribute my workaround, as I found the others a bit more tedious when working with categorical data which contains missing values
# Create a random dataframe
foo = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))
# Randomly intersperse column 'A' with missing data (NaN)
foo['A'][np.random.randint(0,len(foo), size=20)] = np.nan
# Convert this series to string, to simulate our problem
series = foo['A'].astype(str)
# np.nan are converted to the string "nan", mask these out
mask = (series == "nan")
# Apply the LabelEncoder to the unmasked series, replace the masked series with np.nan
series[~mask] = LabelEncoder().fit_transform(series[~mask])
series[mask] = np.nan
foo['A'] = series
This is my attempt!
import numpy as np
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
#Now lets encode the incomplete Cabin feature
titanic_train_le['Cabin'] = le.fit_transform(titanic_train_le['Cabin'].astype(str))
#get nan code for the cabin categorical feature
cabin_nan_code=le.transform(['nan'])[0]
#Now, retrieve the nan values in the encoded data
# Assign the result back: an inplace replace on the selected column acts on an
# intermediate object and is not guaranteed to update the DataFrame itself.
titanic_train_le['Cabin'] = titanic_train_le['Cabin'].replace(cabin_nan_code, np.nan)
I just created my own encoder which can encode a dataframe at once.
Using this class, None is encoded to 0. It can be handy when trying to make sparse matrix.
Note that the input dataframe must include categorical columns only.
class DF_encoder():
    """Encode all values of a categorical DataFrame with one shared integer mapping.

    Missing values (None or NaN) are always encoded as 0. Every other value
    gets the next free positive integer in the order it is first seen, so the
    codes are reproducible between runs.
    """

    def __init__(self):
        self.mapping = {None: 0}
        self.inverse_mapping = {0: None}
        # Known non-missing values, in the order their codes were assigned
        self.all_keys = []

    def fit(self, df: pd.DataFrame):
        """Register every value found in ``df``; existing codes are kept.

        Returns self so calls can be chained.
        """
        for col in df.columns:
            for key in df[col].unique():
                if pd.isna(key) or key in self.mapping:
                    continue
                # len(mapping) is always the next unused code, because codes are
                # handed out consecutively starting after the missing code 0.
                # Iterating a set made the codes depend on hash order.
                code = len(self.mapping)
                self.mapping[key] = code
                self.inverse_mapping[code] = key
                self.all_keys.append(key)
        return self

    def _encode(self, value):
        """Return the code for one cell; NaN for values never seen in fit."""
        if pd.isna(value):
            return self.mapping[None]
        return self.mapping.get(value, float("nan"))

    def _decode(self, code):
        """Return the value for one code; NaN for codes that were never assigned."""
        return self.inverse_mapping.get(code, float("nan"))

    def transform(self, df):
        """Return a new DataFrame with every cell replaced by its code."""
        temp_df = pd.DataFrame(index=df.index)
        for col in df.columns:
            temp_df[col] = df[col].map(self._encode)
        return temp_df

    def inverse_transform(self, df):
        """Return a new DataFrame with every code replaced by its value."""
        temp_df = pd.DataFrame(index=df.index)
        for col in df.columns:
            temp_df[col] = df[col].map(self._decode)
        return temp_df
I faced the same problem but none of the above worked for me. So I added a new row to the training data consisting only "nan"

Categories