Method Chaining in Pandas: str.replace not working - python

I would like to read in an Excel file and, using method chaining, convert the column names to lower case and replace any whitespace with _. The following code runs fine:
def supp_read(number):
    """Load supplemental-table{number}.xlsx and lower-case its column names.

    The first 5 rows of the sheet are skipped when reading.
    """
    return (
        pd.read_excel(f"supplemental-table{number}.xlsx", skiprows=5)
        .rename(columns=str.lower)
    )
But the code below does not
# Question's failing variant: chains a second rename after the lower-casing one.
def supp_read(number):
filename = f"supplemental-table{number}.xlsx"
df = (pd.read_excel(filename,skiprows=5)
.rename(columns = str.lower)
# NOTE(review): str.replace is *called* here on the str class itself, with no
# string instance, rather than being passed as a callable the way str.lower is
# on the previous line; this is the call the reported "No value for argument
# 'self'" error refers to.
.rename(columns = str.replace(old=" ",new="_")))
return df
After adding the str.replace line I get the following error: No value for argument 'self' in unbound method call. Can someone shed some light on what I can do to fix this error and why the above does not work?
In addition, when I use str.lower() I get the same error. Why does str.lower work but not str.lower()?

Here's a different syntax which I frequently use:
def supp_read(number):
    """Read a supplemental table and normalise its column names.

    Loads ``supplemental-table{number}.xlsx`` (skipping the first 5 rows),
    lower-cases every column name and replaces each space with an underscore.

    Args:
        number: value interpolated into the file name.

    Returns:
        The loaded DataFrame with the rewritten column names.
    """
    filename = f"supplemental-table{number}.xlsx"
    df = pd.read_excel(filename, skiprows=5)
    # Both steps must go through the .str accessor: .str.lower() returns an
    # Index, and Index has no replace() method of its own.
    df.columns = df.columns.str.lower().str.replace(" ", "_", regex=False)
    return df

Related

Creating a python function to change sequence of columns

I am able to change the sequence of columns using the code below, which I found on Stack Overflow. Now I am trying to convert it into a function for regular use, but it doesn't seem to do anything. PyCharm says the value of the local variable df_name is not used in the last line of my function.
Working Code
# Swap the positions of columns 'F' and 'H' by reordering the list of labels
# and re-selecting the frame with that order.
columnsPosition = df.columns.tolist()
F = columnsPosition.index('F')
H = columnsPosition.index('H')
columnsPosition[F], columnsPosition[H] = columnsPosition[H], columnsPosition[F]
df = df[columnsPosition]
My function - doesn't work; I need to make this work
# Question's attempt at wrapping the column swap in a reusable function.
def change_col_seq(df_name, old_col_position, new_col_position):
columnsPosition = list(df_name.columns)
F, H = columnsPosition.index(old_col_position), columnsPosition.index(new_col_position)
columnsPosition[F], columnsPosition[H] = columnsPosition[H], columnsPosition[F]
# NOTE(review): this only rebinds the local name df_name and the function has
# no return statement, so the reordered frame never reaches the caller.
df_name = df_name[columnsPosition] # pycharm has issue on this line
I have tried adding return on last statement of function but I am unable to make it work.
To re-order the Columns
To change the position of 2 columns:
def change_col_seq(df_name:pd.DataFrame, old_col_position:str, new_col_position:str):
    """Return a new frame in which two columns have traded positions.

    The input frame is left untouched: neither its column order nor its data
    is modified.

    Args:
        df_name: frame whose columns should be reordered.
        old_col_position: label of the first column to swap.
        new_col_position: label of the second column to swap.

    Returns:
        A DataFrame with the same columns as ``df_name`` except that the two
        named columns have exchanged places.

    Raises:
        KeyError: if either label is not a column of ``df_name``.
    """
    labels = list(df_name.columns)
    for label in (old_col_position, new_col_position):
        if label not in labels:
            raise KeyError(label)

    order = list(range(len(labels)))
    first = labels.index(old_col_position)
    second = labels.index(new_col_position)
    order[first], order[second] = order[second], order[first]

    # Select by position instead of assigning into df_name, so the caller's
    # frame is not mutated as a side effect.
    return df_name.iloc[:, order]
To Rename the Columns
You can use the rename method (Documentation)
If you want to change the name of just one column:
def change_col_name(df_name, old_col_name:str, new_col_name:str):
    """Return the result of renaming one column of ``df_name``."""
    mapping = {old_col_name: new_col_name}
    return df_name.rename(columns=mapping)
If you want to change the names of multiple columns:
def change_col_name(df_name, old_col_name:list, new_col_name:list):
    """Return the result of renaming several columns of ``df_name``.

    Old and new names are paired up positionally.
    """
    mapping = {old: new for old, new in zip(old_col_name, new_col_name)}
    return df_name.rename(columns=mapping)

Pandas Attribute error: Nonetype has no attribute rename

I'm learning this from a tutorial, but the code isn't working on my machine. The error is in the line with df.rename.
# Question's code: reads one CSV per ticker symbol, keeps the 'Adj Close'
# column renamed to the ticker, joins the frames and writes the result to CSV.
def compile_data():
colist = pd.read_csv("nse500symbolistnov2020.csv")
tickers = colist['Symbol']
maindf = pd.DataFrame()
for count,ticker in enumerate(tickers):
df = pd.read_csv('stock_dfs/{}.csv'.format(ticker))
# NOTE(review): inplace=True is combined with assigning the call's result back
# to df, so df is rebound to whatever the in-place call returns (None).
df = df.set_index('Date',inplace=True)
# NOTE(review): same inplace=True-plus-assignment pattern as the line above.
df = df.rename(columns={'Adj Close': ticker},inplace=True)
# NOTE(review): 'CLose' differs in capitalisation from the other labels -
# presumably 'Close' was intended; verify against the CSV header.
df.drop(['Open','High','Low','CLose','Volume'],1,inplace=True)
if maindf.empty:
maindf = df
else:
maindf = maindf.join(df, how='outer')
if count % 10 == 0:
print(count)
print(maindf.head())
maindf.to_csv('NSE60joined.csv')
The problem is in the line
df = df.set_index('Date',inplace=True)
Either remove inplace=True, or remove the assignment df =, leaving just
df.set_index('Date',inplace=True)
The same goes for the next line. Either use inplace=True, or assign the new dataframe to df, not both.
When you specify inplace=True, the method returns None, because it merely mutates the DataFrame instead of creating a new copy of it. Basically, you're assigning None to df, and hence Python complains with an AttributeError: df is no longer a pd.DataFrame object, so its .rename() method cannot be accessed.
You can do it now in two ways:
No assigning with inplace parameter
df.rename(columns={'Adj Close': ticker},inplace=True)
assign without inplace parameter
# Without inplace, rename returns the renamed frame, so assign it back to df.
df = df.rename(columns={'Adj Close': ticker})

Function to split & expand returning NameError

# Question's original function: builds a list of keys from df_master rows whose
# type is 'unit', splits df_units.units into one row per value, and keeps the
# rows whose split value is in that list.
# NOTE(review): df_master and df_units are not parameters; they are read from
# an enclosing scope that is not shown here.
def unique_unit_split(df):
df_unit_list = df_master.loc[df_master['type'] == 'unit']
df_unit_list = df_unit_list.key.tolist()
for i in range(len(df_unit_list)):
# Converts each key to int in place; the isin() filter further down compares
# against these ints - confirm that matches the dtype of 'unit_split'.
df_unit_list[i] = int(df_unit_list[i])
split_1 = df_units.units.str.split('[","]',expand=True).stack()
df_units_update = df_units.join(pd.Series(index=split_1.index.droplevel(1), data=split_1.values, name='unit_split'))
df_units_final = df_units_update[df_units_update['unit_split'].isin(df_unit_list)]
# NOTE(review): df_units_final is assigned but not returned; the argument df is
# returned as received.
return(df)
Updated script: still not working
# Module-level placeholders added in the question's second attempt.
# NOTE(review): the function below contains no `global` statement, so its
# assignments to these names presumably create locals and leave the
# placeholders unchanged.
df_unit_list = []
split_1 = pd.DataFrame()
df_units_update = pd.DataFrame()
df_units_final = pd.DataFrame()
# Same processing as the first version: list the 'unit' keys from df_master,
# split df_units.units into one row per value, and filter on the key list.
def unique_unit_split(df):
df_unit_list = df_master.loc[df_master['type'] == 'unit']
df_unit_list = df_unit_list.key.tolist()
for i in range(len(df_unit_list)):
# Converts each key to int in place; the isin() filter further down compares
# against these ints - confirm that matches the dtype of 'unit_split'.
df_unit_list[i] = int(df_unit_list[i])
split_1 = df_units.units.str.split('[","]',expand=True).stack()
df_units_update = df_units.join(pd.Series(index=split_1.index.droplevel(1), data=split_1.values, name='unit_split'))
df_units_final = df_units_update[df_units_update['unit_split'].isin(df_unit_list)]
# NOTE(review): df_units_final is assigned but not returned; the argument df is
# returned as received.
return(df)
Above function originally worked when I split up the two actions (code inclusive of the for loop and above was in a function then everything below split_1 was in another function). Now that I tried to condense them, I am getting a NameError (image attached). Anyone know how I can resolve this issue and ensure my final df (df_units_final) is defined?
For more insight on this function: I have a df with comma separated values in one column and I needed to split that column, drop the [] and only keep rows with the #s I need which were defined in the list created "df_unit_list".
NameError Details
The issue was stated above (not defining df_units_final) AND my for_loop was forcing the list to be int when the values in the other df were actually strings.
Working Code

Function to print dataframe, that uses df name as an argument

In function, I can't use argument to define the name of the df in df.to_csv().
I have a long script to pull apart and understand. To do so I want to save the different dataframes it uses and store them in order. I created a function to do this and add the order number 01 (number_of_interim_exports) to the name (from argument).
My problem is that I need to use this for multiple dataframe names, but the df.to_csv part won't accept an argument in place of df...
# Question's helper: when print_interim_outputs equals 1, builds a zero-padded,
# numbered CSV file name from `name` and writes a frame to it; it also
# increments number_of_interim_exports.
def print_interim_results_any(name, num_exports, df_name):
global number_of_interim_exports
global print_interim_outputs
if print_interim_outputs == 1:
csvName = str(number_of_interim_exports).zfill(2) + "_" +name
interimFileName = "interim_export_"+csvName+".csv"
# NOTE(review): to_csv is called on the name df, not on the df_name parameter,
# and sep=; is not a quoted string - presumably sep=";" was intended.
df.to_csv(interimFileName, sep=;, encoding='utf-8', index=False)
number_of_interim_exports += 1
I think I just screwed something else up; this works fine:
import pandas as pd
df = pd.DataFrame(data={1: [1, 2, 3]})


def f(frame):
    """Write the frame passed as an argument to interimFileName.csv."""
    frame.to_csv(path_or_buf="interimFileName.csv")


f(df)

passing a variable into apply() in pandas

I am having trouble getting the syntax right for applying a function to a dataframe. I am trying to create a new column in a dataframe by joining the strings in two other columns, passing in a separator. I get the error
TypeError: ("apply_join() missing 1 required positional argument: 'sep'", 'occurred at index cases')
If I add sep to the apply_join() function call, that also fails:
File "unite.py", line 37, in unite
tibble_extra = df[cols].apply(apply_join, sep)
NameError: name 'sep' is not defined
import pandas as pd
from io import StringIO
# Inline CSV sample: cases and population per country and year.
tibble3_csv = """country,year,cases,population
Afghanistan,1999,745,19987071
Afghanistan,2000,2666,20595360
Brazil,1999,37737,172006362
Brazil,2000,80488,174504898
China,1999,212258,1272915272
China,2000,213766,1280428583"""

# Parse the sample from an in-memory text buffer and show it.
tibble3 = pd.read_csv(StringIO(tibble3_csv))
print(tibble3)
def str_join_elements(x, sep=""):
    """Join the str() form of every element of ``x``, separated by ``sep``.

    ``sep`` must be exactly of type str; the assert below enforces that.
    """
    assert type(sep) is str
    return sep.join(map(str, x))
# Question's unite(): meant to replace the columns in `cols` with a column
# built by joining their values, keeping the remaining columns as they are.
def unite(df, cols, new_var, combine=str_join_elements):
def apply_join(x, sep):
# NOTE(review): str_join is not defined anywhere in the code shown, and the
# combine parameter is never used - presumably combine was meant here.
joinstr = str_join(x, sep)
return pd.Series({new_var[i]:s for i, s in enumerate(joinstr)})
fixed_vars = df.columns.difference(cols)
tibble = df[fixed_vars].copy()
# apply_join requires a second argument, sep, but none is supplied here; this
# is the call behind the reported "missing 1 required positional argument".
tibble_extra = df[cols].apply(apply_join)
return pd.concat([tibble, tibble_extra], axis=1)
table3_again = unite(tibble3, ['cases', 'population'], 'rate', combine=lambda x: str_join_elements(x, "/"))
print(table3_again)
Use a lambda when you have multiple parameters, i.e.:
df[cols].apply(lambda x: apply_join(x,sep),axis=1)
Or pass the extra parameters through the args parameter, i.e.:
df[cols].apply(apply_join,args=[sep],axis=1)
You just add it into the apply statement:
tibble_extra = df[cols].apply(apply_join, sep=...)
Also, you should specify the axis. It may work without it, but it's a good habit to prevent errors:
# axis=1 passes each row to apply_join; axis=0 (the default) passes each column.
tibble_extra = df[cols].apply(apply_join, sep=..., axis=1)

Categories