How to group and calculate monthly average in pandas dataframe - python

I am trying to group a dataset by name and find the monthly average, i.e. sum all the values for each name and divide by the number of distinct months for that name.
For example,
name        time  values
   A  2011-01-17      10
   B  2011-02-17      20
   A  2011-01-11      10
   A  2011-03-17      30
   B  2011-02-17      10
The expected result is

name  monthly_avg
   A           25
   B           30

(A: (10 + 10 + 30) / 2 distinct months = 25; B: (20 + 10) / 1 month = 30.)
I have tried

data.groupby(['name'])['values'].mean().reset_index(name='Monthly Average')

but it gives the output below (a plain mean over all rows per name) instead of my desired output above:

name  Monthly Average
   A        16.666667
   B        15.000000

Convert the time values to datetimes first, then aggregate the sum per name and month with pd.Grouper, and finally get the mean per the first level, name:
data['time'] = pd.to_datetime(data['time'])
df = (data.groupby(['name', pd.Grouper(freq='M', key='time')])['values'].sum()
          .groupby(level=0)
          .mean()
          .reset_index(name='Monthly Average'))
print (df)
  name  Monthly Average
0    A               25
1    B               30
For month periods instead, change the Grouper to Series.dt.to_period:
data['time'] = pd.to_datetime(data['time'])
df = (data.groupby(['name', data['time'].dt.to_period('M')])['values']
          .sum()
          .groupby(level=0)
          .mean()
          .reset_index(name='Monthly Average'))
print (df)
  name  Monthly Average
0    A               25
1    B               30
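To see why this yields 25 and 30, it may help to print the intermediate per-month sums before the second groupby (an illustration using the to_period variant; the Grouper variant differs only in showing month-end timestamps):

print (data.groupby(['name', data['time'].dt.to_period('M')])['values'].sum())
name  time
A     2011-01    20
      2011-03    30
B     2011-02    30
Name: values, dtype: int64

The outer mean then averages 20 and 30 for A (25) and leaves 30 for B.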

Use:

import pandas as pd

df = pd.DataFrame({'name': ['A', 'B', 'A', 'A', 'B'],
                   'time': ['2011-01-17', '2011-02-1', '2011-01-11', '2011-03-17', '2011-02-17'],
                   'vals': range(5)})
df['month'] = pd.to_datetime(df['time']).dt.month

# number of distinct months per name
a1 = df.groupby('name')['month'].apply(lambda x: len(set(x)))
# sum of the values per name
a2 = df.groupby('name')['vals'].sum()
a2 / a1
output:

name
A    2.5
B    5.0
dtype: float64
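(Note this answer uses its own sample data with vals = range(5). With the question's actual values it returns 25.0 for A and 30.0 for B, and (a2 / a1).reset_index(name='Monthly Average') gives the same two-column layout as the answers above.)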
Related

Group by Category and find Percent Change for given frequency

I have a dataset, df, that I wish to group by category and find the percent change at a given frequency.

Cat  Value      Date
  A      1  7/1/2020
  A      2  7/2/2020
  B     20  7/1/2020
  B     40  7/3/2020

Desired output:

Cat  Diff  pct_change      Date
  A     1         100  7/2/2020
  B    20         100  7/3/2020
This is what I am doing
df1 = df.groupby(pd.Grouper(key='Cat', freq='1D')).sum()    # group by the Cat
df1['PercentageDiff'] = df1['Value'].pct_change().mul(100)  # find pct_change
df1['ValueDiff'] = df1['Value'].diff()                      # find value diff
Any help is appreciated.
I believe you want to work per group with DataFrame.groupby, and finally remove the first row of each group, whose diff and pct_change are missing values, with DataFrame.dropna:
df['Date'] = pd.to_datetime(df['Date'])
df['Diff'] = df.groupby('Cat')['Value'].diff()
df['pct_change'] = df.groupby('Cat')['Value'].pct_change().mul(100)
df = df.dropna(subset=['pct_change'])[['Cat', 'Diff', 'pct_change', 'Date']]
print (df)
  Cat  Diff  pct_change       Date
1   A   1.0       100.0 2020-07-02
3   B  20.0       100.0 2020-07-03
This should help:

def f(x):
    d = {}
    d['Diff'] = x['Value'].iloc[1] - x['Value'].iloc[0]
    d['Perc_change'] = 100 * (x['Value'].iloc[1] - x['Value'].iloc[0]) / x['Value'].iloc[0]
    d['Date'] = x['Date'].max()
    return pd.Series(d, index=['Diff', 'Perc_change', 'Date'])

df['Date'] = pd.to_datetime(df.Date)
df = df.sort_values('Date')
df.groupby(['Cat']).apply(f)
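For the sample data in the question, this should produce output along these lines (computed by hand from the rows shown, not a verbatim run):

     Diff  Perc_change       Date
Cat
A     1.0        100.0 2020-07-02
B    20.0        100.0 2020-07-03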

divide multiple rows by single group of values in pandas dataframe

I have a dataframe similar to the following (but with hundreds of stocks rather than just A and B, and I do not know in advance how many stocks it will contain). I am trying to divide each stock's Price by the INDEX price on the matching Date (stock A on 5/15/2020 divided by INDEX on 5/15/2020, then stock A on 5/16/2020 divided by INDEX on 5/16/2020, etc., then the same for stock B). I added the answer I want in the DESIRED column but do not know how to compute it.
d = {'Stock' : pd.Series(['A', 'A', 'A', 'B', 'B', 'B', 'INDEX', 'INDEX', 'INDEX']),
     'Date' : pd.Series(['5/15/2020', '5/16/2020', '5/17/2020', '5/15/2020',
                         '5/16/2020', '5/17/2020', '5/15/2020', '5/16/2020', '5/17/2020']),
     'Price' : pd.Series([10, 20, 30, 20, 40, 60, 2, 5, 10]),
     'DESIRED' : pd.Series([5, 4, 3, 10, 8, 6, 1, 1, 1])}
df = pd.DataFrame(d)
df
import pandas as pd

d = {'Stock' : pd.Series(['A', 'A', 'A', 'B', 'B', 'B', 'INDEX', 'INDEX', 'INDEX']),
     'Date' : pd.Series(['5/15/2020', '5/16/2020', '5/17/2020', '5/15/2020',
                         '5/16/2020', '5/17/2020', '5/15/2020', '5/16/2020', '5/17/2020']),
     'Price' : pd.Series([10, 20, 30, 20, 40, 60, 2, 5, 10]),
     'DESIRED' : pd.Series([5, 4, 3, 10, 8, 6, 1, 1, 1])}
df = pd.DataFrame(d)
Here's a possible solution:

# first we build a dataframe containing only the index rows
df_index = df[df.Stock == 'INDEX']
# and we get rid of those rows from the original dataframe
df = df[df.Stock != 'INDEX']
# now we merge them
df = df.merge(df_index[['Date', 'Price']], on='Date', suffixes=['', '_index'])
# and we simply create the new column
df['hooray!'] = df.Price / df.Price_index
# if you want, you can delete the helper column
# del df['Price_index']
Output:

  Stock       Date  Price  DESIRED  Price_index  hooray!
0     A  5/15/2020     10        5            2      5.0
1     B  5/15/2020     20       10            2     10.0
2     A  5/16/2020     20        4            5      4.0
3     B  5/16/2020     40        8            5      8.0
4     A  5/17/2020     30        3           10      3.0
5     B  5/17/2020     60        6           10      6.0
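If you prefer to keep the INDEX rows in the frame, here is a vectorized sketch of an alternative (the RATIO column name is mine, not from the answer): build a Date-to-index-price lookup with set_index and divide via Series.map. The INDEX rows divide by themselves and come out as 1.0:

df = pd.DataFrame(d)  # fresh copy of the sample data
index_price = df.loc[df.Stock == 'INDEX'].set_index('Date')['Price']  # Date -> INDEX price
df['RATIO'] = df['Price'] / df['Date'].map(index_price)  # divide each row by its date's index price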
This should do the trick:

import pandas as pd

# data (NOTE: I've removed the DESIRED column)
d = {'Stock' : pd.Series(['A', 'A', 'A', 'B', 'B', 'B', 'INDEX', 'INDEX', 'INDEX']),
     'Date' : pd.Series(['5/15/2020', '5/16/2020', '5/17/2020', '5/15/2020',
                         '5/16/2020', '5/17/2020', '5/15/2020', '5/16/2020', '5/17/2020']),
     'Price' : pd.Series([10, 20, 30, 20, 40, 60, 2, 5, 10])}
# create dataframe
df = pd.DataFrame(d)
# create empty DESIRED column
df['DESIRED'] = ''
# create sub-dataframes for stocks and indices
stocksDf = df.loc[df['Stock'] != 'INDEX'].reset_index(drop=True)
indexDf = df.loc[df['Stock'] == 'INDEX'].reset_index(drop=True)
# loop over the stocks dataframe
for i, row in stocksDf.iterrows():
    # define the needed values
    stocks = stocksDf.at[i, 'Stock']
    price = stocksDf.at[i, 'Price']
    date = stocksDf.at[i, 'Date']
    # get the index row matching the date of the stock
    matchingIndex = indexDf.loc[indexDf['Date'] == date].reset_index(drop=True)
    mask = (df['Stock'] == stocks) & (df['Price'] == price) & (df['Date'] == date)
    if len(matchingIndex) == 0:
        # if it doesn't exist, just flag that there is no matching index
        df.loc[mask, 'DESIRED'] = 'No Matching Index'
    else:
        # if it exists, calculate DESIRED as price of stock / price of index
        indexPrice = matchingIndex.at[0, 'Price']
        df.loc[mask, 'DESIRED'] = price / indexPrice
# for indices just set DESIRED to 1
df.loc[df['Stock'] == 'INDEX', 'DESIRED'] = 1
print(df)
  Stock       Date  Price  DESIRED
0     A  5/15/2020     10        5
1     A  5/16/2020     20        4
2     A  5/17/2020     30        3
3     B  5/15/2020     20       10
4     B  5/16/2020     40        8
5     B  5/17/2020     60        6
6 INDEX  5/15/2020      2        1
7 INDEX  5/16/2020      5        1
8 INDEX  5/17/2020     10        1

Groupby sum, index vs. column results

For the following dataframe:
df = pd.DataFrame({'group': ['a', 'a', 'b', 'b'], 'data': [5, 10, 100, 30]}, columns=['group', 'data'])
print(df)
  group  data
0     a     5
1     a    10
2     b   100
3     b    30
When grouping by column, adding and creating a new column, the result is:
df['new'] = df.groupby('group')['data'].sum()
print(df)
  group  data  new
0     a     5  NaN
1     a    10  NaN
2     b   100  NaN
3     b    30  NaN
However if we reset the df to the original data and move the group column to the index,
df.set_index('group', inplace=True)
print(df)
       data
group
a         5
a        10
b       100
b        30
And then group and sum, then we get:
df['new'] = df.groupby('group')['data'].sum()
print(df)
       data  new
group
a         5   15
a        10   15
b       100  130
b        30  130
Why does grouping by the column not set the values in the new column, while grouping on the index does?
Better here is to use GroupBy.transform, which returns a Series the same size as the original DataFrame, so the subsequent assignment works correctly:
df['new'] = df.groupby('group')['data'].transform('sum')
When you assign a new Series, the values are aligned by index. If the indexes differ, you get NaNs:
print (df.groupby('group')['data'].sum())
group
a     15
b    130
Name: data, dtype: int64
The index values are different, so you get NaNs:
print (df.groupby('group')['data'].sum().index)
Index(['a', 'b'], dtype='object', name='group')
print (df.index)
RangeIndex(start=0, stop=4, step=1)
df.set_index('group', inplace=True)
print (df.groupby('group')['data'].sum())
group
a     15
b    130
Name: data, dtype: int64
Now the index can align, because the values match:
print (df.groupby('group')['data'].sum().index)
Index(['a', 'b'], dtype='object', name='group')
print (df.index)
Index(['a', 'a', 'b', 'b'], dtype='object', name='group')
You're not getting what you want because df.groupby('group')['data'].sum() returns an aggregated result with group as the index:
group
a     15
b    130
Name: data, dtype: int64
Clearly this index does not align with the RangeIndex of df.
If you want this to work you'll have to use transform, which returns a Series with the transformed values and the same axis length as the original:
df['new'] = df.groupby('group')['data'].transform('sum')
  group  data  new
0     a     5   15
1     a    10   15
2     b   100  130
3     b    30  130
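For completeness, a sketch of an equivalent alternative (not from the answers above): map the per-group sums back onto the group labels, which side-steps index alignment entirely:

sums = df.groupby('group')['data'].sum()  # Series indexed by group label: a -> 15, b -> 130
df['new'] = df['group'].map(sums)         # look up each row's group total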

Pandas dataframe resample and count events per day

I have a dataframe with a time index. I can resample the data to get, e.g., the mean per day; however, I would also like to get the counts per day. Here is a sample:
import datetime
import pandas as pd
import numpy as np

dates = pd.date_range(datetime.datetime(2012, 4, 5, 11, 0),
                      datetime.datetime(2012, 4, 7, 7, 0), freq='5H')
var1 = np.random.sample(dates.size) * 10.0
var2 = np.random.sample(dates.size) * 10.0
df = pd.DataFrame(data={'var1': var1, 'var2': var2}, index=dates)
df1 = df.resample('D').mean()
I'd also like to get a third column 'count' with the number of events per day:

count
    3
    5
    7
Thank you very much!
Use Resampler.agg and then flatten the MultiIndex columns:
df1 = df.resample('D').agg({'var1': 'mean', 'var2': ['mean', 'size']})
df1.columns = df1.columns.map('_'.join)
df1 = df1.rename(columns={'var2_size': 'count'})
print (df1)
            var1_mean  var2_mean  count
2012-04-05   3.992166   4.968410      3
2012-04-06   6.843105   6.193568      5
2012-04-07   4.568436   3.135089      1
Alternative solution with Grouper:

df1 = df.groupby(pd.Grouper(freq='D')).agg({'var1': 'mean', 'var2': ['mean', 'size']})
df1.columns = df1.columns.map('_'.join)
df1 = df1.rename(columns={'var2_size': 'count'})
print (df1)
            var1_mean  var2_mean  count
2012-04-05   3.992166   4.968410      3
2012-04-06   6.843105   6.193568      5
2012-04-07   4.568436   3.135089      1
EDIT:

r = df.resample('D')
df1 = r.mean().add_suffix('_mean').join(r.size().rename('count'))
print (df1)
            var1_mean  var2_mean  count
2012-04-05   7.840487   6.885030      3
2012-04-06   4.762477   5.091455      5
2012-04-07   2.702414   6.046200      1
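Another minimal variant along the same lines (a sketch, assuming the same df as above): compute the daily means first, then assign the per-day counts; the assignment aligns because both results share the same daily index:

df1 = df.resample('D').mean()
df1['count'] = df.resample('D').size()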

pandas - How to get 7 day sum for groups when some groups do not have entries on all days

I have some data like this:
date, group_name, value
-------------------
2017-07-01, A, 10
2017-07-05, A, 4
2017-07-05, B, 21
I want to compute the rolling 7-day sum for each group, but the data for each group only has records on days when its value is > 0.
I want the output to look like:
date, group_name, value, 7d_sum_of_value
----------------------------------------
2017-07-01, A, 10, 10
2017-07-05, A, 4, 14
2017-07-05, B, 21, 21
Use timedeltas:
import pandas as pd
from datetime import datetime, timedelta

testdata = pd.DataFrame({'date': ['2017-07-01', '2017-07-05', '2017-07-05'],
                         'group_name': ['A', 'A', 'B'],
                         'value': [10, 4, 21]})
testdata['7d_sum_of_value'] = 0
for index1, row1 in testdata.iterrows():  # iterate over all rows in the data
    rolling_sum = row1['value']  # initialize rolling_sum
    # choose the subset of data that pertains to the appropriate group
    group_data = testdata[testdata['group_name'] == row1['group_name']]
    for index2, row2 in group_data.iterrows():
        # get the time difference between the two dates
        date_diff = datetime.strptime(row1['date'], '%Y-%m-%d') - datetime.strptime(row2['date'], '%Y-%m-%d')
        if (date_diff >= timedelta(0)) and (date_diff <= timedelta(7)) and (index1 != index2):
            rolling_sum += row2['value']  # update the rolling sum
    testdata.at[index1, '7d_sum_of_value'] = rolling_sum  # insert the rolling sum into the data
You can try this, but given your data size (7 million rows), there must be some solution more efficient than mine.
df = pd.DataFrame({'date': ['2017-07-01', '2017-07-05', '2017-07-05'],
                   'group_name': ['A', 'A', 'B'],
                   'value': [10, 4, 21]})
df.date = pd.to_datetime(df.date)
df = df.set_index('date')
df['date'] = df.index
# resample to daily (missing days become 0), then take a 7-day rolling sum per group
A = (df.groupby('group_name')['value']
       .apply(lambda x: x.resample('1d').sum().rolling(7, min_periods=1).sum())
       .reset_index())
df.merge(A, left_on=['date', 'group_name'], right_on=['date', 'group_name'], how='left')
Out[201]:
  group_name  value_x       date  value_y
0          A       10 2017-07-01     10.0
1          A        4 2017-07-05     14.0
2          B       21 2017-07-05     21.0
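On recent pandas versions, a time-based rolling window can do this without resampling. A minimal sketch, assuming the dates within each group sort ascending:

df = pd.DataFrame({'date': ['2017-07-01', '2017-07-05', '2017-07-05'],
                   'group_name': ['A', 'A', 'B'],
                   'value': [10, 4, 21]})
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['group_name', 'date'])
# '7D' makes the window span calendar time, so gaps between records are handled natively
df['7d_sum_of_value'] = (df.groupby('group_name')
                           .rolling('7D', on='date')['value']
                           .sum()
                           .to_numpy())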
