I would like to return several dataframes from a function, with unique names based on variables. My code is as follows:
import matplotlib.pyplot as plt
import seaborn as sns

def plots_without_outliers(parameter):
    """
    The function removes outliers from a dataframe variable and plots a boxplot and histograms.
    """
    Q1 = df[parameter].quantile(0.25)
    Q3 = df[parameter].quantile(0.75)
    IQR = Q3 - Q1
    df_without_outliers = df[(df[parameter] > (Q1 - 1.5 * IQR)) & (df[parameter] < (Q3 + 1.5 * IQR))]
    g = sns.FacetGrid(df_without_outliers, col='tariff', height=5)
    g.map(sns.boxplot, parameter, order=['ultra', 'smart'], color='#fec44f', showmeans=True)
    g = sns.FacetGrid(df_without_outliers, col='tariff', height=5)
    g.map(plt.hist, parameter, bins=12, color='#41ab5d')
    return df_without_outliers
Then I pass a number of variables:
plots_without_outliers('total_minutes_spent_per_month')
plots_without_outliers('number_sms_spent_per_month')
In addition to the graphs, I want dataframes returned with unique names so I can use them later on. For example:
df_without_outliers_total_minutes_spent_per_month
and
df_without_outliers_number_sms_spent_per_month
What would be the best way to deal with this issue? Thank you very much for your help.
A common way to deal with this is to use a dictionary. You can create it as a global variable outside of the function and then update it with each returned dataframe, using the corresponding name as the dictionary key.
dict_of_dfs = dict()
def plots_without_outliers(parameter):
    # your function statements
    return df_without_outliers

for col in ['total_minutes_spent_per_month', 'number_sms_spent_per_month']:
    dict_of_dfs['df_without_outliers_' + col] = plots_without_outliers(col)
You can then get each dataframe from the dictionary with, e.g., dict_of_dfs['df_without_outliers_total_minutes_spent_per_month'].
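For later use, the dictionary also lets you loop over all the cleaned frames at once; a small sketch (the print is just a placeholder for whatever analysis you need):

# retrieve a single cleaned dataframe by its key
minutes_df = dict_of_dfs['df_without_outliers_total_minutes_spent_per_month']

# or iterate over every cleaned dataframe
for name, cleaned_df in dict_of_dfs.items():
    print(name, cleaned_df.shape)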
Hi guys, I am quite new to creating functions in Python. How can I simplify this process of filtering data based on year using a function?
df_2013 = df_train1[df_train1['year']== 2013][['month','sales']]
df_2013 = df_2013.groupby('month').agg({'sales':'mean'}).reset_index().rename(columns={'sales':'s13'})
df_2014 = df_train1[df_train1['year']== 2014][['month','sales']]
df_2014 = df_2014.groupby('month').agg({'sales':'mean'}).reset_index().rename(columns={'sales':'s14'})
df_2015 = df_train1[df_train1['year']== 2015][['month','sales']]
df_2015 = df_2015.groupby('month').agg({'sales':'mean'}).reset_index().rename(columns={'sales':'s15'})
df_2016 = df_train1[df_train1['year']== 2016][['month','sales']]
df_2016 = df_2016.groupby('month').agg({'sales':'mean'}).reset_index().rename(columns={'sales':'s16'})
df_2017 = df_train1[df_train1['year']== 2017][['month','sales']]
df_2017 = df_2017.groupby('month').agg({'sales':'mean'}).reset_index().rename(columns={'sales':'s17'})
Consider using lists or dicts and avoid separate, similarly structured objects in the global environment. Doing so keeps your code DRY (Don't Repeat Yourself) by applying the same methods to the elements of a dictionary or list.
# DEFINE A METHOD TO ENCAPSULATE ALL OPERATIONS
def build_aggregate_frame(year_param):
    agg_df = (
        df_train1.query("year == @year_param")
        .groupby("month", as_index=False)
        .agg({"sales": "mean"})
        .rename(columns={"sales": f"s{str(year_param)[-2:]}"})
    )
    return agg_df

# DICTIONARY OF AGGREGATED DATA FRAMES
train_year_df_dict = {
    str(year): build_aggregate_frame(year) for year in range(2013, 2018)
}
# ACCESS INDIVIDUAL YEAR DATA FRAMES
train_year_df_dict["2013"]
train_year_df_dict["2014"]
...
train_year_df_dict["2017"]
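If the point of the s13 through s17 column names is to line the years up month by month, the dictionary makes that one step as well; a sketch assuming every yearly frame contains the same months:

from functools import reduce

# merge all yearly frames on 'month' into one wide comparison table
wide_df = reduce(
    lambda left, right: left.merge(right, on="month"),
    train_year_df_dict.values()
)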
I have the following function. It calculates the Euclidean distance between companies from some financial figures and gives me the closest company. Unfortunately, sometimes the closest company is the same company. Does anyone know how I can adjust the function so that it does not return the same company?
# Calculating the closest distances
import math
import pandas as pd

records = df_ipos.to_dict('records')  # converting dataframe to a list of dictionaries

def return_closest(df, inp_record):
    """returns the closest Euclidean-distanced record"""
    filtered_records = df.to_dict('records')  # converting dataframe to a list of dictionaries
    for record in filtered_records:  # iterating through dictionaries
        params = ['z_SA', 'z_LEV', 'z_AT', 'z_PM', 'z_RG']  # parameters for the Euclidean distance
        distance = []
        for param in params:
            d1, d2 = record.get(param, 0), inp_record.get(param, 0)  # fetching these parameters; default is 0 if not found
            if d1 != d1:  # NaN check (NaN != NaN)
                d1 = 0
            if d2 != d2:
                d2 = 0
            distance.append((d1 - d2) ** 2)
        euclidean = math.sqrt(sum(distance))
        record['Euclidean distance'] = round(euclidean, 6)  # assigning to a new key
    distance_records = sorted(filtered_records, key=lambda x: x['Euclidean distance'])  # sorting in increasing order
    return next(filter(lambda x: x['Euclidean distance'], distance_records), None)  # returning the lowest non-zero value; default None

for record in records:
    ipo_year = record.get('IPO Year')
    sic_code = record.get('SIC-Code')
    df = df_fundamentals[df_fundamentals['Year'] == ipo_year]
    df = df[df['SIC-Code'] == sic_code]  # filtering dataframe
    closest_record = return_closest(df, record)
    if closest_record:
        record['Closest Company'] = closest_record.get('Name')  # adding new columns
        record['Actual Distance'] = closest_record.get('Euclidean distance')

df_dist = pd.DataFrame(records)  # changing the list of dictionaries back to a dataframe
Thanks in advance!
Based on your question, it is not exactly clear to me what your inputs are.
But as a simple fix, I would suggest checking, before your function's for loop, whether the record you are comparing is identical to the one you are checking against, i.e., add:
...
filtered_records = [rec for rec in filtered_records if rec['Name'] != inp_record['Name']]
for record in filtered_records: #iterating through dictionaries
...
This only applies if 'Name' really contains the company name. Also, since your function already skips records with a distance of exactly zero, returning the same company implies a distance greater than zero between the two records' parameters. I am not sure whether that is intended; maybe you are looking at data from different years? I cannot really tell, due to the limited amount of information.
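Put together, a minimal sketch of the adjusted function, assuming 'Name' really does identify the company in both dataframes (the distance logic is unchanged from your version):

import math

def return_closest(df, inp_record):
    """returns the closest record by Euclidean distance, excluding the input company itself"""
    params = ['z_SA', 'z_LEV', 'z_AT', 'z_PM', 'z_RG']
    filtered_records = df.to_dict('records')
    # drop any record belonging to the same company as the input
    filtered_records = [rec for rec in filtered_records
                        if rec.get('Name') != inp_record.get('Name')]
    for record in filtered_records:
        distance = []
        for param in params:
            d1, d2 = record.get(param, 0), inp_record.get(param, 0)
            d1 = 0 if d1 != d1 else d1  # NaN check (NaN != NaN)
            d2 = 0 if d2 != d2 else d2
            distance.append((d1 - d2) ** 2)
        record['Euclidean distance'] = round(math.sqrt(sum(distance)), 6)
    distance_records = sorted(filtered_records, key=lambda x: x['Euclidean distance'])
    # with the company itself excluded, a zero distance no longer needs to be skipped
    return next(iter(distance_records), None)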
I have a data frame in pandas; one of its columns contains time intervals presented as strings, like 'P1Y4M1D'.
The example of the whole CSV:
oci,citing,cited,creation,timespan,journal_sc,author_sc
0200100000236252421370109080537010700020300040001-020010000073609070863016304060103630305070563074902,"10.1002/pol.1985.170230401","10.1007/978-1-4613-3575-7_2",1985-04,P2Y,no,no
...
I created a parsing function that takes a string like 'P1Y4M1D' and returns an integer.
I am wondering how I can change all the column values to parsed values using that function?
import re

import pandas as pd

def do_process_citation_data(f_path):
    global my_ocan
    my_ocan = pd.read_csv("citations.csv",
                          names=['oci', 'citing', 'cited', 'creation', 'timespan', 'journal_sc', 'author_sc'],
                          parse_dates=['creation', 'timespan'])
    my_ocan = my_ocan.iloc[1:]  # remove the first row; iloc selects data by row number
    my_ocan['creation'] = pd.to_datetime(my_ocan['creation'], format="%Y-%m-%d", yearfirst=True)
    return my_ocan

def parse():
    mydict = dict()
    mydict2 = dict()
    i = 1
    r = 1
    for x in my_ocan['oci']:
        mydict[x] = str(my_ocan['timespan'][i])
        i += 1
    print(mydict)
    for key, value in mydict.items():
        is_negative = value.startswith('-')
        if is_negative:
            date_info = re.findall(r"P(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)D)?$", value[1:])
        else:
            date_info = re.findall(r"P(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)D)?$", value)
        year, month, day = [int(num) if num else 0 for num in date_info[0]] if date_info else [0, 0, 0]
        daystotal = (year * 365) + (month * 30) + day
        if not is_negative:
            #mydict2[key] = daystotal
            return daystotal
        else:
            #mydict2[key] = -daystotal
            return -daystotal
    #print(mydict2)
    #return mydict2
Probably I do not even need to change the whole column to the parsed values; the final goal is to write a new function that returns the average ['timespan'] of docs created in a particular year. Since I need the parsed values, I thought it would be easier to change the whole column and manipulate the new data frame.
Also, I am curious what would be a way to apply the parsing function to each ['timespan'] row without modifying the data frame. I can only assume it could be something like this, but I don't have a full understanding of how to do that:
for x in my_ocan['timespan']:
    x = parse(str(x))
How can I get a column with new values? Thank you! Peace :)
A df['timespan'].apply(parse) (as mentioned by @Dan) should work. You would need to modify only the parse function so that it receives the string as an argument and returns the parsed value at the end. Something like this:
import pandas as pd
def parse_postal_code(postal_code):
    # Split the postal code and keep the leading letters
    letters = postal_code.split('_')[0]
    return letters
# Example dataframe with three columns and three rows
df = pd.DataFrame({'Age': [20, 21, 22], 'Name': ['John', 'Joe', 'Carla'], 'Postal Code': ['FF_222', 'AA_555', 'BB_111']})
# This returns a new pd.Series
print(df['Postal Code'].apply(parse_postal_code))
# Can also be assigned to another column
df['Postal Code Letter'] = df['Postal Code'].apply(parse_postal_code)
print(df['Postal Code Letter'])
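Applied to your timespan column, the same pattern could look like the sketch below; the regex and the 365/30-day arithmetic are taken from your parse function, while treating a missing or unparseable value as 0 days is an assumption:

import re

def parse_timespan(value):
    """Parses an ISO 8601-style duration such as 'P1Y4M1D' into a day count."""
    value = str(value)
    is_negative = value.startswith('-')
    if is_negative:
        value = value[1:]
    date_info = re.findall(r"P(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)D)?$", value)
    year, month, day = [int(num) if num else 0 for num in date_info[0]] if date_info else [0, 0, 0]
    days_total = (year * 365) + (month * 30) + day
    return -days_total if is_negative else days_total

my_ocan['timespan_days'] = my_ocan['timespan'].apply(parse_timespan)

# the stated end goal, the average timespan per creation year, then becomes
# (assuming 'creation' was parsed to datetime as in do_process_citation_data):
my_ocan.groupby(my_ocan['creation'].dt.year)['timespan_days'].mean()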
I'm trying to find the average and standard deviation of multiple columns of my dataset and then save them as columns in a new dataframe, i.e., for every 'GROUP' in the dataset, I want columns in the new dataframe with its average and SD. I came up with the following script, but I'm not able to name the columns dynamically.
Average_F1_S_list, Average_F1_M_list, SD_F1_S_list, SD_F1_M_list = ([] for i in range(4))
Groups= DF['GROUP'].unique().tolist()
for key in Groups:
    Average_F1_S = DF_DICT[key]['F1_S'].mean()
    Average_F1_S_list.append(Average_F1_S)
    SD_F1_S = DF_DICT[key]['F1_S'].std()
    SD_F1_S_list.append(SD_F1_S)
    Average_F1_M = DF_DICT[key]['F1_M'].mean()
    Average_F1_M_list.append(Average_F1_M)
    SD_F1_M = DF_DICT[key]['F1_M'].std()
    SD_F1_M_list.append(SD_F1_M)

df = pd.DataFrame({'Group': Groups,
                   'Average_F1_S': Average_F1_S_list, 'Standard_Dev_F1_S': SD_F1_S_list,
                   'Average_F1_M': Average_F1_M_list, 'Standard_Dev_F1_M': SD_F1_M_list},
                  columns=['Group', 'Average_F1_S', 'Standard_Dev_F1_S', 'Average_F1_M', 'Standard_Dev_F1_M'])
This will not be a good solution, as there are too many features. Is there any way I can create the lists dynamically?
This should do the trick! Hope this helps
# These are all the keys you want
key_names = ['F1_S', 'F1_M']

# Holds the data you want to pass to the dataframe
df_info = {'Groups': Groups}

for group_name in Groups:
    # For each group, iterate over all the keys you want
    for key in key_names:
        # Generate the column name you want in your dataframe
        avg_key_name = key + '_Average'
        std_key_name = key + '_Standard_Dev'
        if avg_key_name not in df_info:
            df_info[avg_key_name] = []
            df_info[std_key_name] = []
        df_info[avg_key_name].append(DF_DICT[group_name][key].mean())
        df_info[std_key_name].append(DF_DICT[group_name][key].std())

df = pd.DataFrame(df_info)
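As an aside, if DF_DICT was only created to split DF up by group, pandas can build the same summary table straight from the original frame; a sketch assuming DF holds the 'GROUP' column alongside the feature columns:

# mean and std for every feature column, per group, in one call
key_names = ['F1_S', 'F1_M']
summary = DF.groupby('GROUP')[key_names].agg(['mean', 'std'])

# flatten the MultiIndex columns to names like 'F1_S_mean'
summary.columns = ['_'.join(col) for col in summary.columns]
summary = summary.reset_index()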
This question pertains to one posted here:
Sort dataframe rows independently by values in another dataframe
In the linked question, I utilize a Pandas Dataframe to sort each row independently using values in another Pandas Dataframe. The function presented there works perfectly every single time it is directly called. For example:
import pandas as pd
import numpy as np
import os
##Generate example dataset
d1 = {}
d2 = {}
d3 = {}
d4 = {}
## generate data:
np.random.seed(5)
for col in list("ABCDEF"):
    d1[col] = np.random.randn(12)
    d2[col + '2'] = np.random.randint(0, 101, 12)
    d3[col + '3'] = np.random.randint(0, 101, 12)
    d4[col + '4'] = np.random.randint(0, 101, 12)
t_index = pd.date_range(start = '2015-01-31', periods = 12, freq = "M")
#place data into dataframes
dat1 = pd.DataFrame(d1, index = t_index)
dat2 = pd.DataFrame(d2, index = t_index)
dat3 = pd.DataFrame(d3, index = t_index)
dat4 = pd.DataFrame(d4, index = t_index)
## Functions
def sortByAnthr(X, Y, Xindex, Reverse=False):
    # order the subset of X.index by the values in Y
    ordrX = [x for (x, y) in sorted(zip(Xindex, Y), key=lambda pair: pair[1], reverse=Reverse)]
    return ordrX

def OrderRow(row, df):
    # look up the row's ordered column labels in df
    ordrd_row = df.loc[row.dropna().name, row.dropna().values].tolist()
    return ordrd_row

def r_selectr(dat2, dat1, n, Reverse=False):
    ordr_cols = dat1.apply(lambda x: sortByAnthr(x, dat2.loc[x.name, :], x.index, Reverse),
                           axis=1, result_type='expand').iloc[:, -n:]
    ordr_cols.columns = list(range(0, n))  # assign interpretable column names
    ordr_r = ordr_cols.apply(lambda x: OrderRow(x, dat1), axis=1)
    return [ordr_cols, ordr_r]
## Call functions
ordr_cols2,ordr_r = r_selectr(dat2,dat1,5)
##print output:
print("Ordering set:\n",dat2.iloc[-2:,:])
print("Original set:\n", dat1.iloc[-2:,:])
print("Column ordr:\n",ordr_cols2.iloc[-2:,:])
As can be checked, the columns of dat1 are correctly ordered according to the values in dat2.
However, when called from a loop over dataframes, it does not rank/index correctly and produces completely dubious results. Although I am not quite able to recreate the problem using the reduced version presented here, the idea should be the same.
## Loop test:
out_list = []
data_dicts = {'dat2': dat2, 'dat3': dat3, 'dat4': dat4}
for i in range(3):
    # this outer for loop supplies different parameter values to a wrapper
    # function that calls r_selectr.
    for key in data_dicts.keys():
        ordr_cols, _ = r_selectr(data_dicts[key], dat1, 5)
        out_list.append(ordr_cols)
        # do stuff here
# print output:
print("Ordering set:\n", dat3.iloc[-2:, :])
print("Column ordr:\n", ordr_cols.iloc[-2:, :])
In my code (almost completely analogous to the example given here), the ordr_cols are no longer ordered correctly for any of the sorting data frames.
I currently solve the issue by separating the ordering and indexing operations of r_selectr into two separate functions. That, for some reason, resolves the issue, though I have no idea why.
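For reference, a sketch of the kind of split described, with hypothetical function names; the bodies simply pull apart the two apply steps that r_selectr chains:

def order_columns(sort_df, value_df, n, reverse=False):
    # step 1: ordering only; which column labels, per row, in sorted position
    ordr_cols = value_df.apply(
        lambda x: sortByAnthr(x, sort_df.loc[x.name, :], x.index, reverse),
        axis=1, result_type='expand'
    ).iloc[:, -n:]
    ordr_cols.columns = list(range(n))
    return ordr_cols

def index_rows(ordr_cols, value_df):
    # step 2: indexing only; look the ordered labels up in the value frame
    return ordr_cols.apply(lambda x: OrderRow(x, value_df), axis=1)

# equivalent of r_selectr(dat2, dat1, 5):
cols = order_columns(dat2, dat1, 5)
rows = index_rows(cols, dat1)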