How would I find the quarterly averages of these monthly figures?

My dataset is similar to the below:
import pandas as pd

data = [['Jane', 10,10.5,11,45,66,21,88,99,77,41,22], ['John',11,22,55,34,44,22,44,55,88,56,47],['Tom',23,32,43,12,11,44,77,85,99,45,63]]
df = pd.DataFrame(data, columns = ['Name', '09-Aug-21', 'Aug-21', '02-Sep-21', 'Sep-21', '18-Oct-21', 'Oct-21', '02-Nov-21','Nov-21','14-Dec-21', 'Dec-21', '15-Jan-22'])
df
How can I add columns to this which show the quarterly figure, i.e. an average of the preceding three months? E.g., suppose we start by adding a column after 'Dec-21' called 'Q4 2021' which takes the average of the columns 'Oct-21', 'Nov-21' and 'Dec-21'.
Will I need to create a function which takes the preceding three values and returns an average, and then concatenate this to my dataframe? The new columns do not have to sit directly after each period; e.g., I am also happy to add all of the quarterly averages right at the end.
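For the single quarter described above, a minimal sketch (assuming the df defined in this question) would be:
df['Q4 2021'] = df[['Oct-21', 'Nov-21', 'Dec-21']].mean(axis=1)
The answers below generalise this to every quarter.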

from datetime import datetime

import pandas as pd

def get_quarter_name(timestamp):
    """Convert a timestamp such as '2021-12-01' to 'Q4-2021'."""
    return f"Q{timestamp.quarter}-{timestamp.year}"
# your data
data = [['Jane', 10,10.5,11,45,66,21,88,99,77,41,22], ['John',11,22,55,34,44,22,44,55,88,56,47],['Tom',23,32,43,12,11,44,77,85,99,45,63]]
df = pd.DataFrame(data, columns = ['Name', '09-Aug-21', 'Aug-21', '02-Sep-21', 'Sep-21', '18-Oct-21', 'Oct-21', '02-Nov-21','Nov-21','14-Dec-21', 'Dec-21', '15-Jan-22'])
# filter only relevant columns, which start with an alphabetical character
cols = [col for col in df.columns if not col[0].isdigit()]
# extract only relevant columns and transpose
df_T = df[cols].set_index("Name").T
# convert index values to dates
df_T.index = pd.Index([pd.Timestamp(datetime.strptime(d,'%b-%y').strftime('%Y-%m-%d')) for d in df_T.index])
# resample by Quarters and transpose again to original format
df_quarter = df_T.resample("Q").mean().T
# rename columns to quarter-like descriptions
df_quarter.columns = [get_quarter_name(col) for col in df_quarter.columns]
df_quarter is your final answer, which you can merge back into the original df.
Output:
      Q3-2021    Q4-2021
Name
Jane    27.75  53.666667
John    28.00  44.333333
Tom     22.00  58.000000
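To attach the quarterly columns to the original frame, a sketch of the merge mentioned above (df_quarter is indexed by Name, so we join on the index):
# merge the quarterly averages back onto the original rows by Name
df_merged = df.merge(df_quarter, left_on='Name', right_index=True)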

Here is one way to do it:
# define the months belonging to each quarter
q1 = ['Aug', 'Sep']
q2 = ['Oct', 'Nov']
q3 = ['Dec', 'Jan']
df['q1'] = df[df.columns[df.columns.str.contains('|'.join(q1))]].mean(axis=1)
df['q2'] = df[df.columns[df.columns.str.contains('|'.join(q2))]].mean(axis=1)
df['q3'] = df[df.columns[df.columns.str.contains('|'.join(q3))]].mean(axis=1)
df
Name 09-Aug-21 Aug-21 02-Sep-21 Sep-21 18-Oct-21 Oct-21 02-Nov-21 Nov-21 14-Dec-21 Dec-21 15-Jan-22 q1 q2 q3
0 Jane 10 10.5 11 45 66 21 88 99 77 41 22 19.125 68.50 46.666667
1 John 11 22.0 55 34 44 22 44 55 88 56 47 30.500 41.25 63.666667
2 Tom 23 32.0 43 12 11 44 77 85 99 45 63 27.500 54.25 69.000000
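Note that str.contains also matches the daily columns such as '09-Aug-21', so the daily figures are included in these averages. If only the monthly columns should count, one option is an anchored regex via str.match (a sketch, assuming the column naming from the question):
# match only columns that start with the month name, e.g. 'Aug-21' but not '09-Aug-21'
df['q1'] = df.loc[:, df.columns.str.match('|'.join(f'{m}-21' for m in q1))].mean(axis=1)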

This is kinda messy, but it SHOULD allow you to dynamically generate a column for each quarter (it does not include the quarter's year; you could add that logic if you want).
import pandas as pd

data = [['Jane', 10,10.5,11,45,66,21,88,99,77,41,22], ['John',11,22,55,34,44,22,44,55,88,56,47],['Tom',23,32,43,12,11,44,77,85,99,45,63]]
df = pd.DataFrame(data, columns = ['Name', '09-Aug-21', 'Aug-21', '02-Sep-21', 'Sep-21', '18-Oct-21', 'Oct-21', '02-Nov-21','Nov-21','14-Dec-21', 'Dec-21', '15-Jan-22'])
columns_to_use = [column for column in df.columns if column[0].isalpha()]
df = df[columns_to_use]
df = df.melt(id_vars = 'Name')
df['variable'] = '01-' + df['variable']
df['variable'] = pd.to_datetime(df['variable'], format='%d-%b-%y')
df['Quarter'] = df['variable'].dt.quarter
df['Quarter_Avg'] = df.groupby(['Name', 'Quarter'])['value'].transform('mean')
df1 = df.groupby(['Name', 'Quarter'])['Quarter_Avg'].agg('mean').reset_index()
df1['Quarter'] = 'Quarter ' + df1['Quarter'].astype(str)
df1 = df1.pivot_table(index = 'Name', columns = 'Quarter', values = 'Quarter_Avg').reset_index()
df['variable'] = df['variable'].astype(str)
df['variable'] = df['variable'].apply(lambda x : '-'.join(x.split('-')[0:2]))
df = df.pivot_table(index = 'Name', columns = 'variable', values = 'value').reset_index()
df_final = df.merge(df1, on = 'Name')
df_final
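To add the quarter's year mentioned above, a sketch (assuming it is placed where the Quarter column is first created, while df['variable'] still holds datetimes):
# label quarters with their year, e.g. 'Q4 2021'
df['Quarter'] = 'Q' + df['variable'].dt.quarter.astype(str) + ' ' + df['variable'].dt.year.astype(str)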

It takes a fair number of steps, but it gives you the expected result:
import pandas as pd
data = [['Jane', 10,10.5,11,45,66,21,88,99,77,41,22,22], ['John',11,22,55,34,44,22,44,55,88,56,47,47],['Tom',23,32,43,12,11,44,77,85,99,45,63,63]]
df = pd.DataFrame(data, columns = ['Name', '09-Aug-21', 'Aug-21', '02-Sep-21', 'Sep-21', '18-Oct-21', 'Oct-21', '02-Nov-21','Nov-21','14-Dec-21', 'Dec-21', '15-Jan-22', 'Jan-22'])
# Melt the data frame by date
meltedDF = df.melt(id_vars=["Name"], var_name=["Date"])
# Remove the dates that don't match the "Month-year" format
meltedDF = meltedDF[pd.to_datetime(meltedDF.Date, format='%b-%y', errors='coerce').notna()].reset_index(drop=True)
# Convert those dates to datetime objects
meltedDF["Date"] = pd.to_datetime(meltedDF.Date, format='%b-%y')
# Find the quarter that those dates fall into and add the year string to the that quarter
meltedDF["Quarter"] = "Q" + meltedDF.Date.dt.quarter.astype(str) + " " + meltedDF.Date.dt.year.astype(str)
# Group by the quarter and the person's name then get the mean of their values
meltedDF = meltedDF.groupby(["Quarter", "Name"], as_index=False)["value"].mean().round(1)
# Pivot the table's Quarter values to be column names
meltedDF = pd.pivot_table(meltedDF, index=['Name'], values=['value'], columns="Quarter")
# Combine the names and the Quarter total values
meltedDF = pd.concat([meltedDF.reset_index()["Name"], meltedDF.reset_index()["value"]], axis=1)
# Merge these values back into the original Dataframe
df = df.merge(meltedDF, left_on='Name', right_on='Name')
Output:
Name 09-Aug-21 Aug-21 02-Sep-21 Sep-21 18-Oct-21 Oct-21 02-Nov-21 Nov-21 14-Dec-21 Dec-21 15-Jan-22 Jan-22 Q1 2022 Q3 2021 Q4 2021
0 Jane 10 10.5 11 45 66 21 88 99 77 41 22 22 22.0 27.8 53.7
1 John 11 22.0 55 34 44 22 44 55 88 56 47 47 47.0 28.0 44.3
2 Tom 23 32.0 43 12 11 44 77 85 99 45 63 63 63.0 22.0 58.0
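Note that 'Q1 2022' above is the average of a single month (Jan-22), and 'Q3 2021' covers only two. If partial quarters should be excluded, a sketch (applied to meltedDF just before the groupby/mean step, while the Date column still exists):
# keep only (Quarter, Name) groups that cover all three months of the quarter
full_quarters = meltedDF.groupby(["Quarter", "Name"]).filter(lambda g: g["Date"].nunique() == 3)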


Using usecols when specifying a multi-index header in Python Pandas

I have a huge dataset to read with two header rows, but when I use the multi-index approach I am unable to use 'usecols' in the pandas dataframe.
When I use
df = pd.read_csv(files, delimiter=' ', header=[0,1])
it takes too much time and memory.
Another approach I am trying is
df = pd.read_csv(files, delimiter=' ', usecols = ["80.375"])
but it takes only one column, whereas it should take all four columns with the header '80.375'.
Desired output: all four '80.375' columns.
Please suggest any alternative approach.
Thanks in advance.
You can use two passes: one to extract the headers and one for the data.
# read_csv common options
opts = {'sep': ' ', 'header': None}
# Extract headers, create MultiIndex
headers = pd.read_csv('data.csv', **opts, nrows=2)
mi = pd.MultiIndex.from_frame(headers.T)
# Keep desired columns
dti = [0, 1, 2] # Year, Month, Day
cols = mi.get_locs([80.375]).tolist()
# Build dataframe
df = pd.read_csv('data.csv', **opts, skiprows=2, index_col=dti, usecols=dti+cols)
df.columns = mi[cols]
df = df.rename_axis(index=['Year', 'Month', 'Day'], columns=['Lvl1', 'Lvl2'])
df.index = pd.to_datetime(df.index.to_frame()).rename('DateTime')
Output:
>>> df
Lvl1 80.375
Lvl2 28.625 28.875 29.125 29.375
DateTime
2015-01-01 21 22 23 24
2015-01-02 31 32 33 34
2015-01-03 41 42 43 44
2015-01-04 51 52 53 54
Input csv file:
80.125 80.375 80.375 80.375 80.375 80.625
28.875 28.625 28.875 29.125 29.375 28.875
2015 1 1 20 21 22 23 24 25
2015 1 2 30 31 32 33 34 35
2015 1 3 40 41 42 43 44 45
2015 1 4 50 51 52 53 54 55
Update
I need to convert the output to a single header row.
# Extract headers, create MultiIndex
headers = pd.read_csv('data.csv', sep=' ', header=None, nrows=2)
mi = pd.MultiIndex.from_frame(headers.T)
# Keep desired columns
dti_cols = [0, 1, 2] # Year, Month, Day
dti_names = ['Year', 'Month', 'Day']
dat_cols = mi.get_locs([80.375]).tolist()
dat_names = mi[dat_cols].to_flat_index().map(lambda x: f"{x[0]}_{x[1]}").tolist()
# Build dataframe
df = pd.read_csv('data.csv', sep=' ', header=None, skiprows=2,
                 usecols=dti_cols+dat_cols, names=dti_names+dat_names,
                 parse_dates={'Date': ['Year', 'Month', 'Day']})
Output:
>>> df
Date 80.375_28.625 80.375_28.875 80.375_29.125 80.375_29.375
0 2015-01-01 21 22 23 24
1 2015-01-02 31 32 33 34
2 2015-01-03 41 42 43 44
3 2015-01-04 51 52 53 54

How can I calculate pct changes between groups of columns efficiently?

I have a set of columns like so:
q1_cash_total, q2_cash_total, q3_cash_total,
q1_shop_us, q2_shop_us, q3_shop_us,
etc. I have about 40 similarly named columns like this. I wish to calculate the pct changes between each of these groups of 3. E.g., I know that individually I can do:
df[['q1_cash_total', 'q2_cash_total', 'q3_cash_total']].pct_change().add_suffix('_PCT_CHG')
To do this for every group of 3 I do:
q1 = [col for col in df.columns if 'q1' in col]
q2 = [col for col in df.columns if 'q2' in col]
q3 = [col for col in df.columns if 'q3' in col]
q_cols = q1 + q2 + q3
dflist = []
for col in df[q_cols].columns:
    # col[3:] to get just the column name without the q1_/q2_ etc. prefix
    print(col[3:])
    cols = [c for c in df.columns if col[3:] in c]
    pct = df[cols].pct_change().add_suffix('_PCT_CHG')
    dflist.append(pct)
pcts_df = pd.concat(dflist)
I cannot think of a cleaner way to do this. Does anybody have any ideas? And how can I also get the pct change between q1 and q3, instead of only successive quarters?
You could create a dataframe containing only the desired columns. For that, filter column names starting with q immediately followed by one or more digits and an underscore (r'^q\d+?_'). Remove the prefix and keep only unique column names using pd.unique. For each unique column name, filter the columns with that specific name and apply the percentage change along the columns axis (.pct_change(axis='columns')) to obtain the changes between q1, q2 and q3.
To get the percentage change between q1 and q3, select those columns by name from the previously created dataframe (df_q) and apply the same pct_change as before.
df used as input
q1_cash_total q1_shop_us q2_cash_total q2_shop_us q3_cash_total q3_shop_us another_col numCols dataCols
0 52 93 15 72 61 21 83 87 75
1 75 88 24 3 22 53 2 88 30
2 38 2 64 60 21 33 76 58 22
3 89 49 91 59 42 92 60 80 15
4 62 62 47 62 51 55 64 3 51
df_q = df.filter(regex=r'^q\d+?_')
unique_cols = pd.unique([c[3:] for c in df_q.columns])
dflist = []
for col in unique_cols:
    q_name = df_q.filter(like=col)
    df_s = q_name.pct_change(axis='columns').add_suffix('_PCT_CHG')
    dflist.append(df_s)
    df_s = df_q[[f'q1_{col}', f'q3_{col}']].pct_change(axis='columns').add_suffix('_Q1-Q3')
    dflist.append(df_s)
pcts_df = pd.concat(dflist, axis=1)
Output from pcts_df
q1_cash_total_PCT_CHG q2_cash_total_PCT_CHG q3_cash_total_PCT_CHG ... q3_shop_us_PCT_CHG q1_shop_us_Q1-Q3 q3_shop_us_Q1-Q3
0 NaN -0.711538 3.066667 ... -0.708333 NaN -0.774194
1 NaN -0.680000 -0.083333 ... 16.666667 NaN -0.397727
2 NaN 0.684211 -0.671875 ... -0.450000 NaN 15.500000
3 NaN 0.022472 -0.538462 ... 0.559322 NaN 0.877551
4 NaN -0.241935 0.085106 ... -0.112903 NaN -0.112903
[5 rows x 10 columns]
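For what it's worth, a sketch of one tidier variant (assuming the q<N>_<metric> naming convention above): split the column names into a (metric, quarter) MultiIndex and let a single grouped pct_change cover every metric at once.
import pandas as pd
df_q = df.filter(regex=r'^q\d+?_')
# turn 'q1_cash_total' into the tuple ('cash_total', 'q1')
df_q.columns = pd.MultiIndex.from_tuples(
    [tuple(c.split('_', 1)[::-1]) for c in df_q.columns],
    names=['metric', 'quarter'])
# sort quarters, transpose so quarters run down the rows, then pct_change per metric
pct = df_q.sort_index(axis=1).T.groupby(level='metric').pct_change().T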

Combine multiple rows based on one column value and add extra columns based on another column value in Pandas

I have the following dataframe:
Name rollNumber external_roll_number testDate marks
0 John 34 234 2021-04-28 15
1 John 34 234 2021-03-28 25
I would like to convert it like this:
Name rollNumber external_roll_number testMonth marks testMonth marks
0 John 34 234 April 15 March 25
If the above is not possible then I would at least want it to be like this:
Name rollNumber external_roll_number testDate marks testDate marks
0 John 34 234 2021-04-28 15 2021-03-28 25
How can I convert my dataframe to the desired output? This change will be based on the Name column of the rows.
EDIT 1
I tried using pivot_table like this but I did not get the desired result.
merged_df_pivot = pd.pivot_table(merged_df, index=["name", "testDate"], aggfunc="first", dropna=False).fillna("")
When I try to iterate through merged_df_pivot like this:
for index, details in merged_df_pivot.iterrows():
I again get two rows, and I was also not able to add the new testMonth column using the above method.
The core is to unstack() the month so that it becomes columns. The detail then is to re-structure the month-by-month marks columns into the required structure. It is generally considered bad practice to have duplicate column names, hence I have suffixed them.
import io

import pandas as pd

df = pd.read_csv(io.StringIO(""" Name rollNumber external_roll_number testDate marks
0 John 34 234 2021-04-28 15
1 John 34 234 2021-03-28 25
"""), sep=r"\s+")
df["testDate"] = pd.to_datetime(df["testDate"])
df = df.assign(testMonth=df["testDate"].dt.strftime("%B")).drop(columns="testDate")
dft = (df.set_index([c for c in df.columns if c != "marks"])
         .unstack("testMonth")  # make month a column
         .droplevel(0, axis=1)  # remove unneeded level in columns
         # create columns for the months from the column names and rename the marks columns
         .pipe(lambda d: d.assign(**{f"testMonth_{i+1}": c for i, c in enumerate(d.columns)})
                          .rename(columns={c: f"marks_{i+1}" for i, c in enumerate(d.columns)}))
         .reset_index()
       )
Output:
   Name  rollNumber  external_roll_number  marks_1  marks_2 testMonth_1 testMonth_2
0  John          34                   234       15       25       April       March

Python pandas data frame remove row where index name DOES NOT occur in other data frame

I have two data frames. I want to remove rows where the indexes do not occur in both data frames.
Here is an example of the data frames:
import pandas as pd
data = {'Correlation': [1.000000, 0.607340, 0.348844]}
df = pd.DataFrame(data, columns=['Correlation'])
df = df.rename(index={0: 'GINI'})
df = df.rename(index={1: 'Central government debt, total (% of GDP)'})
df = df.rename(index={2: 'Grants and other revenue (% of revenue)'})
data_2 = {'Correlation': [1.000000, 0.607340, 0.348844, 0.309390, -0.661046]}
df_2 = pd.DataFrame(data_2, columns=['Correlation'])
df_2 = df_2.rename(index={0: 'GINI'})
df_2 = df_2.rename(index={1: 'Central government debt, total (% of GDP)'})
df_2 = df_2.rename(index={2: 'Grants and other revenue (% of revenue)'})
df_2 = df_2.rename(index={3: 'Compensation of employees (% of expense)'})
df_2 = df_2.rename(index={4: 'Central government debt, total (current LCU)'})
I have found this question: How to remove rows in a Pandas dataframe if the same row exists in another dataframe? but was unable to use it, as I am trying to remove rows based on the index name.
I also saw this question: pandas get rows which are NOT in other dataframe, which removes rows that are equal in both data frames, but I did not find it useful either.
What I thought of doing is to transpose, then concat the data frames and remove duplicate columns:
df = df.T
df_2 = df_2.T
df3 = pd.concat([df,df_2],axis = 1)
df3.iloc[: , ~df3.columns.duplicated()]
The problem with this is that it only removes one of the duplicated columns, but I want it to remove both.
Any help doing this would be much appreciated, cheers.
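Incidentally, the duplicated-columns trick from the question can be made to drop both copies: keep=False flags every occurrence (a sketch against the df3 built above):
# keep=False marks all duplicated columns, not only the later occurrences
df3.loc[:, ~df3.columns.duplicated(keep=False)]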
You can just compare the indexes and use .loc to pull the relevant rows:
In [19]: df1 = pd.DataFrame(list(range(50)), index=range(0, 100, 2))
In [20]: df2 = pd.DataFrame(list(range(34)), index=range(0, 100, 3))
In [21]: df2.loc[df2.index.difference(df1.index)]
Out[21]:
0
3 1
9 3
15 5
21 7
27 9
33 11
39 13
45 15
51 17
57 19
63 21
69 23
75 25
81 27
87 29
93 31
99 33
You can simply do this to get the indices that are in df_2 but not in df:
df_2[~df_2.index.isin(df.index)]
Correlation
Compensation of employees (% of expense) 0.309390
Central government debt, total (current LCU) -0.661046
I have managed to work this out by adapting the answers already submitted:
df_2[df_2.index.isin(df.index)]
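Since the goal is to drop rows missing from either frame, the same idea can be applied to both sides (a sketch using index.intersection on the frames from the question):
# keep only the index labels present in both frames
common = df.index.intersection(df_2.index)
df, df_2 = df.loc[common], df_2.loc[common]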

Grouping data by column name

I have a dataframe that looks like:
[date1] [date1] [date2] [date2]
[Min:] [Max:] [Min:] [Max:]
A B C D
and my desired output would look like:
['Date'] ['Min'] ['Max']
[date 1] A B
[date 2] C D
How would I do this in pandas?
I'm simply importing a csv I have saved locally.
import pandas as pd
import csv
import datetime
SampleWeatherDate = pd.read_csv('weatherdata.csv')
This is what my data looks like in Excel:
You can use T and pivot if the first and second rows are the columns:
print(df)
date1 date2
Min Max Min Max
0 A B C D
print(df.columns)
MultiIndex(levels=[[u'date1', u'date2'], [u'Max', u'Min']],
labels=[[0, 0, 1, 1], [1, 0, 1, 0]])
# transpose and reset_index
df = df.T.reset_index()
# set column names
df.columns = ['a', 'b', 'c']
print(df)
a b c
0 date1 Min A
1 date1 Max B
2 date2 Min C
3 date2 Max D
# pivot
print(df.pivot(index='a', columns='b', values='c'))
b Max Min
a
date1 B A
date2 D C
Solution with data:
import pandas as pd
import io
temp=u"""Date;2/4/17;2/4/17;2/5/17;2/5/17;2/6/17;2/6/17
City:;Min:;Max:;Min:;Max:;Min:;Max:
New York;28;34;29;35;30;36
Los Angeles;80;86;81;87;82;88"""
# after testing, replace io.StringIO(temp) with the filename
df = pd.read_csv(io.StringIO(temp), sep=";", index_col=0, header=[0,1])
print(df)
Date 2/4/17 2/5/17 2/6/17
City: Min: Max: Min: Max: Min: Max:
New York 28 34 29 35 30 36
Los Angeles 80 86 81 87 82 88
# transpose and reset_index
df = df.T.reset_index()
# convert column Date to datetime
df['Date'] = pd.to_datetime(df['Date'])
# strip : from the values in column City:
df['City:'] = df['City:'].str.strip(':')
# remove : from the column name City:
df.rename(columns={'City:':'City'}, inplace=True)
print(df)
Date City New York Los Angeles
0 2017-02-04 Min 28 80
1 2017-02-04 Max 34 86
2 2017-02-05 Min 29 81
3 2017-02-05 Max 35 87
4 2017-02-06 Min 30 82
5 2017-02-06 Max 36 88
print(df.pivot(index='Date', columns='City'))
New York Los Angeles
City Max Min Max Min
Date
2017-02-04 34 28 86 80
2017-02-05 35 29 87 81
2017-02-06 36 30 88 82
You don't need the csv module, as you can read the file directly with pandas.
df = sample_weather_data = pd.read_csv('weatherdata.csv')
Your source data is formatted poorly, so there is quite a bit of munging to do.
>>> df
Date 2/4/17 2/4/17.1 2/5/17 2/5/17.1 2/6/17 2/6/17.1
0 City: Min: Max: Min: Max: Min: Max:
1 New York 28 34 29 35 30 36
2 Los Angeles 80 86 81 87 82 88
First, note how the dates repeat, with .1 appended to the second occurrence. Also note that the first column is Date:
>>> df.columns
Index([u'Date', u'2/4/17', u'2/4/17.1', u'2/5/17', u'2/5/17.1', u'2/6/17', u'2/6/17.1'], dtype='object')
Let's extract every other date, starting with the first (note that Python uses zero-based indexing).
dates = df.columns[1::2]
>>> dates
Index([u'2/4/17', u'2/5/17', u'2/6/17'], dtype='object')
While we're at it, we can convert them to timestamps.
dates = pd.to_datetime(dates)
>>> dates
DatetimeIndex(['2017-02-04', '2017-02-05', '2017-02-06'], dtype='datetime64[ns]', freq=None)
We can use the same technique to extract the City, Min and Max values. iloc is for integer-location selection; it takes (row, column) indices. We want to skip the first row (index zero), so we use [1:] to select all rows except the first one.
cities = df.iloc[1:, 0] # Column 0
min_max_vals = df.iloc[1:, 1:] # Every column starting at 1, ignoring first row.
We can index min_max_vals with cities:
min_max_vals.index = cities
We now need to create a MultiIndex with the dates and Min/Max and assign it to the dataframe.
min_max_vals.columns = pd.MultiIndex.from_product([dates, ['Min', 'Max']])
Your desired output above is missing the city, so I assume you really want something like this:
['City 1'] ['City 2]
['Date'] ['Min'] ['Max'] ['Min'] ['Max']
[date 1] A B E F
[date 2] C D G H
Transposing the results and unstacking:
>>> min_max_vals.T.unstack()
Date New York Los Angeles
Max Min Max Min
2017-02-04 34 28 86 80
2017-02-05 35 29 87 81
2017-02-06 36 30 88 82
Summary
df = sample_weather_data = pd.read_csv('weatherdata.csv')
dates = pd.to_datetime(df.columns[1::2])
min_max_vals = df.iloc[1:, 1:]
min_max_vals.index = df.iloc[1:, 0]
min_max_vals.columns = pd.MultiIndex.from_product([dates, ['Min', 'Max']])
df = min_max_vals.T.unstack()
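If the exact shape from the question (Date rows with one Min and one Max column) is wanted for a single city, a sketch selecting it from the result above:
# pick one city's Min/Max columns from the unstacked result
ny = df['New York']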
