How can I modify a Pandas DataFrame by reference? - python

I'm trying to write a Python function that does One-Hot encoding in-place but I'm having trouble finding a way to do a concat operation in-place at the end. It appears to make a copy of my DataFrame for the concat output and I am unable to assign this to my DataFrame that I passed by reference.
How can this be done?
def one_hot_encode(df, col: str):
    """One-Hot encode inplace. Includes NAN.

    Keyword arguments:
    df (DataFrame) -- the DataFrame object to modify
    col (str) -- the column name to encode
    """
    # Remember where `col` sits so the dummy columns can go in the same spot.
    insert_loc = df.columns.get_loc(col)
    insert_data = pd.get_dummies(df[col], prefix=col + '_', dummy_na=True)
    df.drop(col, axis=1, inplace=True)
    # This is the question's problem line: pd.concat builds a NEW frame, and
    # as the question states, assigning it via df[:] does not make the added
    # columns visible to the caller.
    df[:] = pd.concat([df.iloc[:, :insert_loc], insert_data, df.iloc[:, insert_loc:]], axis=1) # Doesn't take effect outside function

I don't think you can pass function arguments by reference in python (see: How do I pass a variable by reference? )
Instead what you can do is just return the modified df from your function, and assign result to the original df:
def one_hot_encode(df, col: str):
    # ... encoding logic as in the question ...
    ...
    return df

# Caller rebinds the name to the frame returned by the function:
...
df=one_hot_encode(df, col)

To make the change take effect outside the function, we have to change the object that was passed in rather than replace its name (inside the function) with a new object.
To assign the new columns, you can use
df[insert_data.columns] = insert_data
instead of the concat.
That doesn't take advantage of your careful insert order though.
To retain your order, we can reindex the data frame.
df = df.reindex(columns=cols)  # reindex returns a NEW frame; capture the result -- it does not modify df in place
where cols is the combined list of columns in order:
cols = cols[:insert_loc] + list(insert_data.columns) + cols[insert_loc:]  # one flat list of labels (the original's extra outer brackets made a nested list)
Putting it all together,
import pandas as pd
def one_hot_encode(df, col: str):
    """One-Hot encode `col` in place (the caller's frame is mutated). Includes NAN.

    Keyword arguments:
    df (DataFrame) -- the DataFrame object to modify
    col (str) -- the column name to encode
    """
    insert_loc = df.columns.get_loc(col)
    insert_data = pd.get_dummies(df[col], prefix=col + '_', dummy_na=True)
    # DataFrame.insert is the column-placement operation that works in place,
    # so add each dummy column directly at its final position.  (The original
    # answer built a `cols` list -- accidentally wrapped in an extra pair of
    # brackets -- and called df.reindex(columns=cols), but reindex returns a
    # new frame and its result was discarded, so the order was never applied.)
    for offset, dummy_col in enumerate(insert_data.columns):
        df.insert(loc=insert_loc + offset, column=dummy_col, value=insert_data[dummy_col])
    # Drop the encoded source column last so insert_loc stays valid above.
    df.drop(col, axis=1, inplace=True)
# Demo: encode the "color" column of the seaborn diamonds dataset in place.
import seaborn
diamonds=seaborn.load_dataset("diamonds")
col="color"
one_hot_encode(diamonds, "color")
# The frame passed in was modified -- no reassignment needed.
assert( "color" not in diamonds.columns )
# 8 new columns: the color categories plus the dummy_na column.
assert( len([c for c in diamonds.columns if c.startswith("color")]) == 8 )

df.insert is inplace--but can only insert one column at a time. It might not be worth the reorder.
def one_hot_encode2(df, col: str):
    """One-Hot encode `col` in place. Includes NAN.

    Keyword arguments:
    df (DataFrame) -- the DataFrame object to modify
    col (str) -- the column name to encode
    """
    insert_loc = df.columns.get_loc(col)
    insert_data = pd.get_dummies(df[col], prefix=col + '_', dummy_na=True)
    # df.insert mutates df directly, placing each dummy at its final position.
    # Pass a Series (insert_data[newcol]); the original passed a one-column
    # DataFrame (insert_data[[newcol]]), which is not an accepted `value`
    # type for DataFrame.insert.  (The original also built an unused `cols`
    # list, removed here.)
    for offset, newcol in enumerate(insert_data.columns):
        df.insert(loc=insert_loc + offset, column=newcol, value=insert_data[newcol])
    # Drop the encoded source column last so insert_loc stays correct above.
    df.drop(col, axis=1, inplace=True)
# Demo for one_hot_encode2 on the seaborn diamonds dataset.
import seaborn
diamonds=seaborn.load_dataset("diamonds")
col="color"
one_hot_encode2(diamonds, "color")
assert( "color" not in diamonds.columns )
assert(len([c for c in diamonds.columns if c.startswith("color")]) == 8)
# Order is retained: the dummies sit where "color" used to be (index 2).
assert([(i) for i,c in enumerate(diamonds.columns) if c.startswith("color")][0] == 2)

The scope of the variables of a function are only inside that function. Simply include a return statement in the end of the function to get your modified dataframe as output. Calling this function will now return your modified dataframe. Also while assigning new (dummy) columns, instead of df[:] use df, as you are changing the dimension of original dataframe.
def one_hot_encode(df, col: str):
    """Return a copy of `df` with `col` replaced by its one-hot columns (incl. NaN).

    Note: `col` is also dropped, in place, from the frame the caller passed in.
    """
    position = df.columns.get_loc(col)
    dummies = pd.get_dummies(df[col], prefix=col + '_', dummy_na=True)
    df.drop(col, axis=1, inplace=True)
    # Stitch the dummies back in at the column's original position.
    before = df.iloc[:, :position]
    after = df.iloc[:, position:]
    return pd.concat([before, dummies, after], axis=1)
Now to see the modified dataframe, call the function and assign it to a new/existing dataframe as below
df=one_hot_encode(df,'<any column name>')

Related

About python vaex merging columns to a new column while changing int to float

I am able to write a function to merge columns to a new column, but fail to change int column into float before changing to string for merging.
I hope that in the new merged column, those integers would have a trailing ".00000" appended.
At the end I was trying to make merged column as key for joining two vaex on multiple key/column. As it seems vaex only take one column/key for joining two vaex, I need to make combined column as key.
The changing of int to float is in case that column in one vaex is int and in another vaex is float.
code is as below.
Function new_column_by_column_merging is working, but function new_column_by_column_merging2 is not. Wondering if there is any way to make it work.
import vaex
import pandas as pd
import numpy as np
def new_column_by_column_merging(df, columns=None):
    """Add 'merged_column_key': the given columns joined as '_'-separated strings.

    Operates on a vaex DataFrame and returns the (mutated) frame.
    """
    if columns is None:
        columns = df.get_column_names()
    # A single column name: the key is just that column's values.
    if type(columns) is str:
        df['merged_column_key'] = df[columns]
        return df
    # Start from empty strings, then append '_<value>' for each column in turn.
    df['merged_column_key'] = np.array(['']*len(df))
    for col in columns:
        df['merged_column_key'] = df['merged_column_key'] + '_' + df[col].astype('string')
    return df
def new_column_by_column_merging2(df, columns=None):
    """Like new_column_by_column_merging, but first tries to cast each column to float.

    This is the variant the question reports as NOT working with vaex.
    """
    if columns is None:
        columns = df.get_column_names()
    if type(columns) is str:
        df['merged_column_key'] = df[columns]
        return df
    df['merged_column_key'] = np.array(['']*len(df))
    for col in columns:
        # NOTE(review): the bare except hides the real error; the answer
        # below replaces this try/except with an explicit is_string() check.
        try:
            df[col] = df[col].astype('float')
        except:
            print('fail to convert to float')
        df['merged_column_key'] = df['merged_column_key'] + '_' + df[col].astype('string')
    return df
# Demo: build a small pandas frame, convert it to vaex, and try both helpers.
pandas_df = pd.DataFrame({'Name': ['Tom', 'Joseph', 'Krish', 'John'], 'Last Name': ['Johnson', 'Cameron', 'Biden', 'Washington'], 'Age': [20, 21, 19, 18], 'Weight': [60.0, 61.0, 62.0, 63.0]})
print('pandas_df is')
print(pandas_df)
df = vaex.from_pandas(df=pandas_df, copy_index=False)
df1 = new_column_by_column_merging(df, ['Name', 'Age', 'Weight'])
print('new_column_by_column_merging returns')
print(df1)
df2 = new_column_by_column_merging2(df, ['Name', 'Age', 'Weight'])
print('new_column_by_column_merging2 returns')
print(df2)
It looks like the vaex expression system does not always play nicely with the try / except checks. So you need to be careful with the dtypes. One way of handling this:
import vaex
df = vaex.datasets.titanic() # dataframe for testing

def new_column_by_column_merging2(df, columns=None):
    """Merge `columns` into a '_'-joined string key, casting non-string columns to float.

    NOTE(review): uses `np`, so numpy must be imported in the calling module.
    """
    if columns is None:
        columns = df.get_column_names()
    if type(columns) is str:
        df['merged_column_key'] = df[columns]
        return df
    df['merged_column_key'] = np.array(['']*len(df))
    for col in columns:
        # Explicit dtype check instead of try/except: per the answer, the
        # vaex expression system does not play nicely with try/except here.
        if df[col].is_string():
            pass
        else:
            df[col] = df[col].astype('float')
        df['merged_column_key'] = df['merged_column_key'] + '_' + df[col].astype('string')
    return df

new_column_by_column_merging2(df) # should work
Basically, I modified the try/except statement to explicitly check for strings (since they can't be converted to floats). You might have to extend that check to cover other things like datetime, etc., if needed. Hope this helps

How to generalize a function written for a specific column in a dataframe to be usable on any similar column?

How can I adjust this code so that it is useable for any column in the dataframe? Currently it only works on the column called "Gaps", but I have 10 other columns to which I need to apply this same function.
def get_averages(df: pd.DataFrame, column: str) -> pd.DataFrame:
    '''
    Add a `{column}_avgs` column with the averages of each `Num` cyclical
    item for each row.  Works for any list-valued column (the original
    hard-coded 'Gaps' in two places, so it only worked for that column).
    '''
    # work with a new dataframe
    df2 = (
        df[['FileName', 'Num', column]]
        .explode(column, ignore_index=True)  # was hard-coded: .explode('Gaps', ...)
    )
    df2[column] = df2[column].astype(float)  # was hard-coded: df2.Gaps = df2.Gaps...
    df2['tag'] = (  # add cyclic tags to each row, within each FileName
        df2.groupby('FileName')[column]
        .transform('cumcount')  # similar to range(len(group))
        % df2.Num  # get the modulo of the row number within the group
    )
    # get averages and collect into lists
    df2 = df2.groupby(['FileName', 'tag'])[column].mean()  # get average
    df2.rename(f'{column}_avgs', inplace=True)
    # collect in a list by Filename and merge with original df
    df2 = df2.groupby('FileName').agg(list)
    df = df.merge(df2, on='FileName')
    return df
df = get_averages(df, 'Gaps')
Use the parameter variable instead of hard-coding the column name:
# Use the `column` parameter everywhere the original hard-coded 'Gaps':
df2 = (
    df[['FileName', 'Num', column]]
    .explode(column, ignore_index=True)
)
df2[column] = df2[column].astype(float)

Rename Columns in Pandas Using Lambda Function Rather Than a Function

I'm trying to rename column headings in my dataframe in pandas using .rename().
Basically, the headings are :
column 1: "Country name[9]"
column 2: "Official state name[5]"
#etc.
I need to remove [number].
I can do that with a function:
def column(string):
    """Return the heading truncated just before the first '['.

    NOTE(review): when the heading contains no '[', the loop falls through
    and the function returns None -- the answer below fixes exactly that.
    """
    for x, v in enumerate(string):
        if v == '[':
            return string[:x]
But I wanted to know how to convert this to a lambda function so that I can use
df.rename(columns = lambda x: do same as function)
I've never used lambda functions before so I'm not sure of the syntax to get it to work correctly.
First you would have to create a function which returns the new or the old value - never None.
def column(name):
    """Return `name` cut off at the first '[', or unchanged if there is none."""
    bracket = name.find('[')
    if bracket == -1:
        return name  # old - without change
    return name[:bracket]  # new - with change
and then you can use it as
df.rename(columns=lambda name: column(name))  # the function defined above is named `column`, not `columns`
or even simpler
df.rename(columns=column)  # pass the function itself (it is named `column`, not `columns`)
Or you can convert your function to real lambda
df.rename(columns=(lambda name: name[:name.index('[')] if '[' in name else name) )
but sometimes it is more readable to keep def column(name) and use columns=column. And not all constructions can be used in a lambda - i.e. you can't assign a value to a variable (I don't know if you can use the new "walrus" operator := from Python 3.8).
Minimal working code
import pandas as pd

# Minimal working example: two headings carry a '[number]' suffix, one does not.
data = {
    'Country name[9]': [1,2,3],
    'Official state name[5]': [4,5,6],
    'Other': [7,8,9],
}
df = pd.DataFrame(data)

def column(name):
    # Strip the '[...]' suffix; names without '[' are returned unchanged
    # (the function never returns None).
    if '[' in name:
        return name[:name.index('[')]
    else:
        return name

print(df.columns)
df = df.rename(columns=column)
# or
df = df.rename(columns=(lambda name: name[:name.index('[')] if '[' in name else name) )
print(df.columns)

Is there a better way to manipulate column names in a pandas dataframe?

I'm working with a large dataframe and need a way to dynamically rename column names.
Here's a slow method I'm working with:
# Create a sample dataframe
df = pd.DataFrame.from_records([
    {'Name':'Jay','Favorite Color (BLAH)':'Green'},
    {'Name':'Shay','Favorite Color (BLAH)':'Blue'},
    {'Name':'Ray','Favorite Color (BLAH)':'Yellow'},
])
# Current columns are: ['Name', 'Favorite Color (BLAH)']
# ------
# build two lambdas to clean the column names
f_clean = lambda x: x.split('(')[0] if ' (' in x else x
f_join = lambda x: '_'.join(x.split())
# NOTE(review): Index.map takes a single mapper -- the extra f_join argument
# presumably lands in `na_action`, TODO confirm; the chained .map(f_join)
# afterwards is what actually applies the join.
df.columns = df.columns.map(f_clean, f_join).map(f_join).str.lower()
# Columns are now: ['name', 'favorite_color']
Is there a better method for solving this?
You could define a clean function and just apply to all the columns using list comprehension.
def clean(name):
    """Drop a trailing ' (...)' qualifier, then join the remaining words with '_'."""
    if ' (' in name:
        name = name.split('(')[0]
    words = name.split()
    return '_'.join(words)
df.columns = [clean(col) for col in df.columns]  # apply the cleaner to every heading in one pass
It's clear what's happening and not overly verbose.

How to extract entire part of string after certain character in dataframe column?

I am working on using the below code to extract the last number of pandas dataframe column name.
# Question code: walk the column names and collect keys, keyed by the
# number trailing '.value.' in each matching column name.
names = df.columns.values
new_df = pd.DataFrame()
for name in names:
    if ('.value.' in name) and df[name][0]:
        # NOTE(review): name[-1] grabs only the LAST CHARACTER, not the whole
        # number after '.value.' -- this is exactly what the question asks to fix.
        last_number = int(name[-1])
        print(last_number)
        key, value = my_dict[last_number]
        try:
            # Append to an existing list cell...
            new_df[value][0] = list(new_df[value][0]) + [key]
        except:
            # ...or create the column on first sight (bare except hides other errors).
            new_df[value] = [key]
name is a string that looks like this:
'data.answers.1234567890.value.0987654321'
I want to take the entire number after .value. as in the IF statement. How would do this in the IF statement above?
Use str.split, and extract the last slice with -1 (also gracefully handles false cases):
df = pd.DataFrame(columns=[
    'data.answers.1234567890.value.0987654321', 'blahblah.value.12345', 'foo'])
# Split each heading on 'value.' and keep the last piece; names without
# 'value.' survive unchanged (split yields a single-element list).
df.columns = df.columns.str.split('value.').str[-1]
df.columns
# Index(['0987654321', '12345', 'foo'], dtype='object')
Another alternative is splitting inside a listcomp:
# Same idea with a plain list comprehension over the column names:
df.columns = [x.split('value.')[-1] for x in df.columns]
df.columns
# Index(['0987654321', '12345', 'foo'], dtype='object')

Categories