I have a codebase where this pattern is very common:
df  # some pandas DataFrame with columns userId, sessionId

def add_session_statistics(df):
    df_statistics = get_session_statistics(df.sessionId.unique())
    return df.merge(df_statistics, on='sessionId', how='left')

def add_user_statistics(df):
    df_statistics = get_user_statistics(df.userId.unique())
    return df.merge(df_statistics, on='userId', how='left')

# etc.

df_enriched = (df
    .pipe(add_session_statistics)
    .pipe(add_user_statistics)
)
However, in another part of the codebase I have 'userId', 'sessionId' as the index of the dataframe. Something like:
X = df.set_index(['userId', 'sessionId'])
This means I can't use the add_{something}_statistics() functions on X without resetting the index each time.
Is there a decorator I can add to the add_{something}_statistics() functions to make them reset the index if they hit a KeyError when attempting to merge on a column that isn't there?
This seems to work:
import functools

def index_suspension_on_add(add_function):
    @functools.wraps(add_function)
    def _helper(df):
        try:
            return df.pipe(add_function)
        except (KeyError, AttributeError):
            # When the merge keys live in the index, df['userId'] raises
            # KeyError and df.userId raises AttributeError, so catch both
            # rather than a bare Exception.
            index_names = df.index.names
            return (df
                .reset_index()
                .pipe(add_function)
                .set_index(index_names)
            )
    return _helper

@index_suspension_on_add
def add_user_statistics(df):
    ...
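As a quick sanity check, a minimal self-contained demo; get_user_statistics here is a hypothetical stand-in for the real lookup:

import pandas as pd

# Hypothetical stand-in for the real statistics lookup.
def get_user_statistics(user_ids):
    return pd.DataFrame({'userId': user_ids, 'n_purchases': [5] * len(user_ids)})

@index_suspension_on_add
def add_user_statistics(df):
    df_statistics = get_user_statistics(df.userId.unique())
    return df.merge(df_statistics, on='userId', how='left')

df = pd.DataFrame({'userId': [1, 1, 2], 'sessionId': [10, 11, 12]})
X = df.set_index(['userId', 'sessionId'])

add_user_statistics(df)  # plain columns: the merge succeeds directly
add_user_statistics(X)   # indexed: the decorator resets, merges, and restores the index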
For the life of me I cannot figure out why this function is not returning anything. Any insight will be greatly appreciated!
Basically, I create a list of strings that I want to store in a pandas DataFrame. I use the DataFrame to pull the variables to plug into the function via .apply(), but the values that land in my DataFrame are all None.
def add_combinations_to_directory(comb_tuples, person_id):
    meta_list = []
    for comb in comb_tuples:
        concat_name = generate_normalized_name(comb)
        metaphone_tuple = doublemetaphone(concat_name)
        meta_list.append(metaphone_tuple[0])
        if metaphone_tuple[1] != '':
            meta_list.append(metaphone_tuple[1])
        if metaphone_tuple[0] in __lookup_dict[0]:
            __lookup_dict[0][metaphone_tuple[0]].append(person_id)
        else:
            __lookup_dict[0][metaphone_tuple[0]] = [person_id]
        if metaphone_tuple[1] in __lookup_dict[1]:
            __lookup_dict[1][metaphone_tuple[1]].append(person_id)
        else:
            __lookup_dict[1][metaphone_tuple[1]] = [person_id]
    print(meta_list)
    return meta_list

def add_person_to_lookup_directory(person_id, name_tuple):
    add_combinations_to_directory(name_tuple, person_id)

def create_meta_names(x, id):
    add_person_to_lookup_directory(id, x)

other['Meta_names'] = other.apply(lambda x: create_meta_names(x['Owners'], x['place_id']), axis=1)
Figured it out! It was a problem of nested functions: the return value from add_combinations_to_directory was being returned to add_person_to_lookup_directory and not passed through to the dataframe.
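In other words, each wrapper has to forward the value; a minimal fix using the same functions:

def add_person_to_lookup_directory(person_id, name_tuple):
    # Forward the result instead of discarding it.
    return add_combinations_to_directory(name_tuple, person_id)

def create_meta_names(x, id):
    return add_person_to_lookup_directory(id, x)

other['Meta_names'] = other.apply(lambda x: create_meta_names(x['Owners'], x['place_id']), axis=1)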
I am new to pandas and have a doubt about returning a DataFrame from a function. I have a function that creates three new DataFrames based on the parameters given to it, and it has to return only the DataFrames that are non-empty. How do I do that?
my code:
def df_r(df, colname, t1):
    t1_df = pd.DataFrame()
    t2_df = pd.DataFrame()
    t3_df = pd.DataFrame()
    if t1:
        for colname in df:
            # some code
            t1_df = some data
    if t2:
        for colname in df:
            # some code
            t2_df = some data
    if t3:
        for colname in df:
            # some code
            t3_df = some data
    dfs = [t1_df, t2_df, t3_df]
Now it should return only t1_df, since the parameter t1 was given. So I have inserted all three into a list:

dfs = [t1_df, t2_df, t3_df]

How do I check which DataFrame is non-empty and return it?
Just check the empty attribute of each DataFrame, e.g.

df = pd.DataFrame()
if df.empty:
    print("DataFrame is empty")

output:

DataFrame is empty

DataFrame.empty returns True if the DataFrame is empty, else False. This works even when column names are present but the data itself is missing.
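For instance, a frame that has columns but no rows still counts as empty:

pd.DataFrame(columns=['a', 'b']).empty  # True: columns exist, but there are no rows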
So, to answer specific to your case:

dfs = [t1_df, t2_df, t3_df]
for df in dfs:
    if not df.empty:
        return df

assuming only one of the DataFrames is non-empty. Or, checking each explicitly:
if not t1_df.empty:
    return t1_df
elif not t2_df.empty:
    return t2_df
else:
    return t3_df
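If more than one DataFrame may be non-empty, you can collect them all instead of returning the first match:

non_empty = [df for df in dfs if not df.empty]
return non_empty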
What would be the best way to rewrite code like the one below so that it can accept as many dataframes as we would like?

def q_grab(df, df2, df3, q):  # accepts three dataframes and a column name; looks up the column in all dataframes and combines them into one
    data = df[q], df2[q], df3[q]
    headers = [q + "_1", q + "_2", q + "_3"]
    data2 = pd.concat(data, axis=1, keys=headers)
    return data2

q = 'covid_condition'
data2 = q_grab(df, df2, df3, q)  # if I run the function pid_set first, it will create a new df based on pID, it looks like
One approach is to use the * operator to gather the dataframes into a tuple (but make your final argument keyword-only, so it isn't swallowed by that tuple). Something like this:

def q_grab(*dfs, q=None):  # q is keyword-only, marking the end of the positional dataframes
    data = [df[q] for df in dfs]
    headers = [q + "_" + str(i) for i in range(1, len(dfs) + 1)]  # count from 1 to match q_1, q_2, ... above
    data2 = pd.concat(data, axis=1, keys=headers)
    return data2

q = 'covid_condition'
data2 = q_grab(df, df2, df3, q=q)
A probably cleaner alternative is to pass a list of dataframes as the first argument, using the same function body as above:

def q_grab(dfs, q):
    data = [df[q] for df in dfs]
    headers = [q + "_" + str(i) for i in range(1, len(dfs) + 1)]
    return pd.concat(data, axis=1, keys=headers)

called with:

data2 = q_grab([df, df2, df3], q)
I am trying to read a csv file using pandas, parse it, and then upload the results to my Django database. For now I am converting each dataframe column to a list and then iterating over the lists to save each row in the DB. But this is inefficient when the lists are really big. How can I make it better?
fileinfo = pd.read_csv(csv_file, sep=',',
                       names=['Series_reference', 'Period', 'Data_value', 'STATUS',
                              'UNITS', 'Subject', 'Group', 'Series_title_1', 'Series_title_2',
                              'Series_title_3', 'Series_tile_4', 'Series_tile_5'],
                       skiprows=1)

# serie = fileinfo[fileinfo['Series_reference']]
s = fileinfo['Series_reference'].values.tolist()
p = fileinfo['Period'].values.tolist()
d = fileinfo['Data_value'].values.tolist()
st = fileinfo['STATUS'].values.tolist()
u = fileinfo['UNITS'].values.tolist()
sub = fileinfo['Subject'].values.tolist()
gr = fileinfo['Group'].values.tolist()
stt = fileinfo['Series_title_1'].values.tolist()

count = 0
while count < len(s):
    b = Testdata(
        Series_reference=s[count],
        Period=p[count],
        Data_value=d[count],
        STATUS=st[count],
        UNITS=u[count],
        Subject=sub[count],
        Group=gr[count],
        Series_title_1=stt[count]
    )
    b.save()
    count = count + 1
You can use pandas' apply function, passing axis=1 to apply a given function to every row:

df.apply(
    creational_function,  # method that creates your structure
    axis=1,               # apply to every row
    args=(arg1, arg2)     # additional args passed on to creational_function
)

In creational_function, the first argument received is the row, from which you can access specific columns just as in the original dataframe:

def creational_function(row, arg1, arg2):
    s = row['Series_reference']
    # for brevity I skip the other columns...
    # create the Testdata object
    # save it

Note that arg1 and arg2 are the same for every row.
If you want to do something more with the created Testdata objects, you can change creational_function to return a value; df.apply will then return a Series containing all the values returned by the passed function.
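For example, a sketch of that idea (build_testdata is a hypothetical helper; Testdata and fileinfo are from the question) combined with Django's bulk_create, so that all rows are inserted in a single query instead of one per row:

def build_testdata(row):
    # Build the model instance without saving it yet.
    return Testdata(
        Series_reference=row['Series_reference'],
        Period=row['Period'],
        # ... the remaining fields, as in the question
    )

instances = fileinfo.apply(build_testdata, axis=1)
Testdata.objects.bulk_create(list(instances))  # one bulk INSERT instead of one query per row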
I wrote the following function:
def unique_values(df, column):
    unique = df[column].unique()
    clean = pd.DataFrame.from_dict(unique)
    clean.columns = [column]
    return clean
I would like to apply this function to several columns in a df, something like this:
unique1, unique2, unique3 = unique_values(df, "Column1", "Column2", "Column3")
If I add a *args parameter in the following way:

def unique_values(df, *column):
    unique = df[column].unique()
    clean = pd.DataFrame.from_dict(unique)
    clean.columns = [column]
    return clean
and apply the function like this:
unique1, unique2, unique3 = unique_values(df, "Column1", "Column2", "Column3")
I get the following error:
KeyError: ('Column1', 'Column2', 'Column3')
Any help would be appreciated
With *column the names arrive packed in a tuple, and df[column] then looks up that whole tuple as a single key, hence the KeyError. You can make it work by iterating through column:

def unique_values(df, *column):
    to_return = []
    for col in column:
        unique = df[col].unique()
        clean = pd.DataFrame.from_dict(unique)
        clean.columns = [col]
        to_return.append(clean)
    return to_return

# this way this works:
unique1, unique2, unique3 = unique_values(df, "Column1", "Column2", "Column3")
You can write a small wrapper function that calls your unique_values() function with the list of columns that you pass as arguments, like so:
def df_unique_values(df, *columns):
    return [unique_values(df, x) for x in columns]
This function returns a list containing the result of each call to unique_values() for a different column. Use this function like so:
unique1, unique2, unique3 = df_unique_values(df, "Column1", "Column2", "Column3")