Hi guys, I am quite new to creating functions in Python. How can I simplify this process of filtering data by year using a function?
df_2013 = df_train1[df_train1['year'] == 2013][['month', 'sales']]
df_2013 = df_2013.groupby('month').agg({'sales': 'mean'}).reset_index().rename(columns={'sales': 's13'})
df_2014 = df_train1[df_train1['year'] == 2014][['month', 'sales']]
df_2014 = df_2014.groupby('month').agg({'sales': 'mean'}).reset_index().rename(columns={'sales': 's14'})
df_2015 = df_train1[df_train1['year'] == 2015][['month', 'sales']]
df_2015 = df_2015.groupby('month').agg({'sales': 'mean'}).reset_index().rename(columns={'sales': 's15'})
df_2016 = df_train1[df_train1['year'] == 2016][['month', 'sales']]
df_2016 = df_2016.groupby('month').agg({'sales': 'mean'}).reset_index().rename(columns={'sales': 's16'})
df_2017 = df_train1[df_train1['year'] == 2017][['month', 'sales']]
df_2017 = df_2017.groupby('month').agg({'sales': 'mean'}).reset_index().rename(columns={'sales': 's17'})
Consider using lists or dicts and avoid separate, similarly structured objects in the global environment. Doing so keeps your code DRY (Don't Repeat Yourself), since you can apply the same methods to the elements of a dictionary or list.
# DEFINE A METHOD TO ENCAPSULATE ALL OPERATIONS
def build_aggregate_frame(year_param):
    agg_df = (
        df_train1.query("year == @year_param")   # @ references the Python variable
        .groupby("month", as_index=False)
        .agg({"sales": "mean"})
        .rename(columns={"sales": f"s{str(year_param)[-2:]}"})  # year_param is an int
    )
    return agg_df
# DICTIONARY OF AGGREGATED DATA FRAMES
train_year_df_dict = {
    str(year): build_aggregate_frame(year) for year in range(2013, 2018)
}
# ACCESS INDIVIDUAL YEAR DATA FRAMES
train_year_df_dict["2013"]
train_year_df_dict["2014"]
...
train_year_df_dict["2017"]
Hi, I have code which looks like this:
with open("file123.json") as json_file:
data = json.load(json_file)
df_1 = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data["spt"][1].items()]))
df_1_made = pd.json_normalize(json.loads(df_1.to_json(orient="records"))).T.drop(["content.id", "shortname", "name"])
df_2 = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data["spt"][2].items()]))
df_2_made = pd.json_normalize(json.loads(df_2.to_json(orient="records"))).T.drop(["content.id", "shortname", "name"])
df_3 = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data["spt"][3].items()]))
df_3_made = pd.json_normalize(json.loads(df_3.to_json(orient="records"))).T.drop(["content.id", "shortname", "name"])
where the dataframes are built from a JSON file.
The problem is that I am dealing with different JSON files, and each one can lead to a different number of dataframes; the code above creates 3, but another file may require 7. Is there any way to write a for loop that takes the length of the data:
length = len(data["spt"])
and creates the correct number of dataframes from it, so I do not need to do it manually?
The simplest option here is to put all your dataframes into a dictionary or a list. First define a function that creates the dataframe, then use a list comprehension.
def create_df(data):
    df = pd.DataFrame(
        dict(
            [(k, pd.Series(v)) for k, v in data]
        )
    )
    df = pd.json_normalize(
        json.loads(
            df.to_json(orient="records")
        )
    ).T.drop(["content.id", "shortname", "name"])
    return df
my_list_of_dfs = [create_df(x.items()) for x in data["spt"]]
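If you prefer key-based access to a specific dataframe later, a dict comprehension works just as well; for example, keyed by position:

# Assumes data["spt"] is a list of dicts, as in the original code.
my_dict_of_dfs = {i: create_df(x.items()) for i, x in enumerate(data["spt"])}
my_dict_of_dfs[0]  # the first dataframe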
I have a data frame that I generated inside a for loop. I am trying to save this data frame so that I can access it outside of the loop. A snippet of my code is below.
my_excel_sample = pd.read_excel(r"mypath\mydata.xlsx",sheet_name=None)
for tabs in my_excel_sample.keys():
    actualData = pd.DataFrame(removeEmptyColumns(my_excel_sample[tabs], 0))
    data = replaceNanValues(actualData, 0)
    data = renameColumns(data, 0)
    data = removeFooters(data, 0)
    data.reset_index(drop=True, inplace=True)
    data = pd.DataFrame(RowMerger(data, 0))
Now I want to use data outside of the loop. Can anyone help me solve this?
You are creating several dataframes iteratively inside the for loop and storing each one in the variable data.
You can simply append each dataframe (data) to a list and then access them anytime you want.
Try this:
my_excel_sample = pd.read_excel(r"mypath\mydata.xlsx",sheet_name=None)
final_df_list = []
for tabs in my_excel_sample.keys():
    actualData = pd.DataFrame(removeEmptyColumns(my_excel_sample[tabs], 0))
    data = replaceNanValues(actualData, 0)
    data = renameColumns(data, 0)
    data = removeFooters(data, 0)
    data.reset_index(drop=True, inplace=True)
    data = pd.DataFrame(RowMerger(data, 0))
    final_df_list.append(data)

print(final_df_list)
If you have any kind of identifier that you can use to recognize the dataframes later, I would suggest using a dictionary: make the identifiers the keys and the data variable the values.
Here is an example where I use a serial number as the key:
my_excel_sample = pd.read_excel(r"mypath\mydata.xlsx",sheet_name=None)
final_df_dict = dict()
ind = 0
for tabs in my_excel_sample.keys():
    actualData = pd.DataFrame(removeEmptyColumns(my_excel_sample[tabs], 0))
    data = replaceNanValues(actualData, 0)
    data = renameColumns(data, 0)
    data = removeFooters(data, 0)
    data.reset_index(drop=True, inplace=True)
    data = pd.DataFrame(RowMerger(data, 0))
    final_df_dict[ind] = data
    ind += 1

print(final_df_dict)
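Since the keys of my_excel_sample are the sheet names, those also make natural identifiers. A variant of the same loop, sketched with a hypothetical helper process_tab that wraps the cleaning steps shown above:

def process_tab(raw_df):
    # Same cleaning pipeline as in the loop above.
    data = pd.DataFrame(removeEmptyColumns(raw_df, 0))
    data = replaceNanValues(data, 0)
    data = renameColumns(data, 0)
    data = removeFooters(data, 0)
    data.reset_index(drop=True, inplace=True)
    return pd.DataFrame(RowMerger(data, 0))

final_df_dict = {tab: process_tab(df) for tab, df in my_excel_sample.items()}
# Access a dataframe by its sheet name, e.g. final_df_dict['Sheet1']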
I'm trying to find the averages and standard deviations of multiple columns of my dataset and then save them as new columns in a new dataframe, i.e. for every 'GROUP' in the dataset, I want one column in the new dataframe with its average and one with its SD. I came up with the following script, but I'm not able to name the columns dynamically.
Average_F1_S_list, Average_F1_M_list, SD_F1_S_list, SD_F1_M_list = ([] for i in range(4))
Groups = DF['GROUP'].unique().tolist()
for key in Groups:
    Average_F1_S = DF_DICT[key]['F1_S'].mean()
    Average_F1_S_list.append(Average_F1_S)
    SD_F1_S = DF_DICT[key]['F1_S'].std()
    SD_F1_S_list.append(SD_F1_S)
    Average_F1_M = DF_DICT[key]['F1_M'].mean()
    Average_F1_M_list.append(Average_F1_M)
    SD_F1_M = DF_DICT[key]['F1_M'].std()
    SD_F1_M_list.append(SD_F1_M)

df = pd.DataFrame({'Group': Groups,
                   'Average_F1_S': Average_F1_S_list, 'Standard_Dev_F1_S': SD_F1_S_list,
                   'Average_F1_M': Average_F1_M_list, 'Standard_Dev_F1_M': SD_F1_M_list},
                  columns=['Group', 'Average_F1_S', 'Standard_Dev_F1_S', 'Average_F1_M', 'Standard_Dev_F1_M'])
This will not be a good solution as there are too many features. Is there any way I can create the lists dynamically?
This should do the trick! Hope this helps.
# These are all the keys you want
key_names = ['F1_S', 'F1_M']

# Holds the data you want to pass to the dataframe.
df_info = {'Groups': Groups}

for group_name in Groups:
    # For each group in the groups, we iterate over all the keys we want.
    for key in key_names:
        # Generate a key name that you want for your dataframe.
        avg_key_name = key + '_Average'
        std_key_name = key + '_Standard_Dev'
        if avg_key_name not in df_info:
            df_info[avg_key_name] = []
            df_info[std_key_name] = []
        df_info[avg_key_name].append(DF_DICT[group_name][key].mean())
        df_info[std_key_name].append(DF_DICT[group_name][key].std())

df = pd.DataFrame(df_info)
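As a side note, if the per-group rows also live in one flat dataframe, pandas can compute all of these statistics in a single groupby call. A sketch, assuming DF has a 'GROUP' column alongside the feature columns:

# One call computes mean and std for every feature at once.
stats = DF.groupby('GROUP')[key_names].agg(['mean', 'std'])

# Flatten the MultiIndex columns into names like 'F1_S_Average'.
stats.columns = [f"{col}_{'Average' if fn == 'mean' else 'Standard_Dev'}"
                 for col, fn in stats.columns]
stats = stats.reset_index().rename(columns={'GROUP': 'Group'})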
I am trying to read a CSV file using pandas, parse it, and then upload the results into my Django database. For now I am converting each dataframe column to a list and then iterating over the lists to save each row in the DB. But my solution is inefficient when the lists are really big. How can I make it better?
fileinfo = pd.read_csv(csv_file, sep=',',
                       names=['Series_reference', 'Period', 'Data_value', 'STATUS',
                              'UNITS', 'Subject', 'Group', 'Series_title_1', 'Series_title_2',
                              'Series_title_3', 'Series_tile_4', 'Series_tile_5'],
                       skiprows=1)
# serie = fileinfo[fileinfo['Series_reference']]
s = fileinfo['Series_reference'].values.tolist()
p = fileinfo['Period'].values.tolist()
d = fileinfo['Data_value'].values.tolist()
st = fileinfo['STATUS'].values.tolist()
u = fileinfo['UNITS'].values.tolist()
sub = fileinfo['Subject'].values.tolist()
gr = fileinfo['Group'].values.tolist()
stt = fileinfo['Series_title_1'].values.tolist()

count = 0
while count < len(s):
    b = Testdata(
        Series_reference=s[count],
        Period=p[count],
        Data_value=d[count],
        STATUS=st[count],
        UNITS=u[count],
        Subject=sub[count],
        Group=gr[count],
        Series_title_1=stt[count]
    )
    b.save()
    count = count + 1
You can use pandas' apply function. Pass axis=1 to apply a given function to every row:
df.apply(
    creational_function,  # Method that creates your structure
    axis=1,               # Apply to every row
    args=(arg1, arg2)     # Additional args to creational_function
)
In creational_function, the first argument received is the row, from which you can access specific columns just as you would with the original dataframe:
def creational_function(row, arg1, arg2):
    s = row['Series_reference']
    # For brevity I skip the other arguments...
    # Create TestData
    # Save
Note that arg1 and arg2 are the same for every row.
If you want to do something more with your created TestData objects, you can change creational_function to return a value; df.apply will then return a Series containing all the elements returned by the passed function.
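If the row-by-row save() calls are the real bottleneck, one common alternative (a sketch, assuming the Testdata Django model from the question) is to build the objects in memory and insert them with a single bulk_create query:

objects = [
    Testdata(
        Series_reference=row['Series_reference'],
        Period=row['Period'],
        Data_value=row['Data_value'],
        STATUS=row['STATUS'],
        UNITS=row['UNITS'],
        Subject=row['Subject'],
        Group=row['Group'],
        Series_title_1=row['Series_title_1'],
    )
    for _, row in fileinfo.iterrows()
]
Testdata.objects.bulk_create(objects)  # one INSERT instead of len(fileinfo) queries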
This question pertains to one posted here:
Sort dataframe rows independently by values in another dataframe
In the linked question, I use a Pandas DataFrame to sort each row independently using values in another Pandas DataFrame. The function presented there works perfectly every single time it is called directly. For example:
import pandas as pd
import numpy as np
import os
##Generate example dataset
d1 = {}
d2 = {}
d3 = {}
d4 = {}
## generate data:
np.random.seed(5)
for col in list("ABCDEF"):
    d1[col] = np.random.randn(12)
    # randint(0, 101) draws the same inclusive 0-100 range as the removed random_integers
    d2[col+'2'] = np.random.randint(0, 101, 12)
    d3[col+'3'] = np.random.randint(0, 101, 12)
    d4[col+'4'] = np.random.randint(0, 101, 12)
t_index = pd.date_range(start = '2015-01-31', periods = 12, freq = "M")
#place data into dataframes
dat1 = pd.DataFrame(d1, index = t_index)
dat2 = pd.DataFrame(d2, index = t_index)
dat3 = pd.DataFrame(d3, index = t_index)
dat4 = pd.DataFrame(d4, index = t_index)
## Functions
def sortByAnthr(X, Y, Xindex, Reverse=False):
    # order the subset of X.index by Y
    ordrX = [x for (x, y) in sorted(zip(Xindex, Y), key=lambda pair: pair[1], reverse=Reverse)]
    return ordrX

def OrderRow(row, df):
    # .loc replaces the long-removed .ix indexer
    ordrd_row = df.loc[row.dropna().name, row.dropna().values].tolist()
    return ordrd_row

def r_selectr(dat2, dat1, n, Reverse=False):
    ordr_cols = dat1.apply(lambda x: sortByAnthr(x, dat2.loc[x.name, :], x.index, Reverse), axis=1).iloc[:, -n:]
    ordr_cols.columns = list(range(0, n))  # assign interpretable column names
    ordr_r = ordr_cols.apply(lambda x: OrderRow(x, dat1), axis=1)
    return [ordr_cols, ordr_r]
## Call functions
ordr_cols2,ordr_r = r_selectr(dat2,dat1,5)
##print output:
print("Ordering set:\n",dat2.iloc[-2:,:])
print("Original set:\n", dat1.iloc[-2:,:])
print("Column ordr:\n",ordr_cols2.iloc[-2:,:])
As can be checked, the columns of dat1 are correctly ordered according to the values in dat2.
However, when called from a loop over dataframes, it does not rank/index correctly and produces completely dubious results. Although I am not quite able to recreate the problem using the reduced version presented here, the idea should be the same.
## Loop test:
out_list = []
data_dicts = {'dat2': dat2, 'dat3': dat3, 'dat4': dat4}
for i in range(3):
    # this outer for loop supplies different parameter values to a wrapper
    # function that calls r_selectr.
    for key in data_dicts.keys():
        ordr_cols, _ = r_selectr(data_dicts[key], dat1, 5)
        out_list.append(ordr_cols)
        # do stuff here

# print output:
print("Ordering set:\n", dat3.iloc[-2:, :])
print("Column ordr:\n", ordr_cols.iloc[-2:, :])
In my code (almost completely analogous to the example given here), the ordr_cols are no longer ordered correctly for any of the sorting dataframes.
I currently solve the issue by separating the ordering and indexing operations within r_selectr into two separate functions. That, for some reason, resolves the issue, though I have no idea why.
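For reference, the workaround looks roughly like this; a minimal sketch of the split described above, with hypothetical function names:

def r_orderr(dat2, dat1, n, Reverse=False):
    # Step 1: compute only the column ordering.
    ordr_cols = dat1.apply(lambda x: sortByAnthr(x, dat2.loc[x.name, :], x.index, Reverse), axis=1).iloc[:, -n:]
    ordr_cols.columns = list(range(0, n))
    return ordr_cols

def r_indexr(ordr_cols, dat1):
    # Step 2: separately index dat1 by that ordering.
    return ordr_cols.apply(lambda x: OrderRow(x, dat1), axis=1)

ordr_cols = r_orderr(dat2, dat1, 5)
ordr_r = r_indexr(ordr_cols, dat1)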