How to resample until a specific date criteria is met - python

With some help from the community (see my previous question on building the function) I have managed to get to the function below. I am trying to work out how to get the resampled data to run to the latest date that appears anywhere in either of the input data sets for any code. Below I have included the current output I am getting and my desired output.
Input data:
Input 1 df1 - In
date code qty
0 2019-01-10 A 20
1 2019-01-10 B 12
2 2019-01-10 C 10
3 2019-01-11 A 2
4 2019-01-11 B 30
5 2019-01-11 C 2
7 2019-01-12 A 4
8 2019-01-12 B 6
11 2019-01-13 A 10
12 2019-01-13 B 12
13 2019-01-13 C 1
Input 2 df2 - Outbound
date code qty
0 2019-01-11 A 5
1 2019-01-11 B 1
2 2019-01-11 C 3
3 2019-01-12 A 100
6 2019-01-13 B 1
7 2019-01-13 C 1
8 2019-01-15 A 1
9 2019-01-16 B 1
Existing Code:
import numpy as np
from numba import njit

@njit
def poscumsum(x):
    # running sum that is clamped at zero (stock cannot go negative)
    total = 0
    result = np.empty(x.shape)
    for i, y in enumerate(x):
        total += y
        if total < 0:
            total = 0
        result[i] = total
    return result

a = df1.set_index(['code', 'date'])
b = df2.set_index(['code', 'date'])
idx = a.index.union(b.index).sort_values()
df3 = (a.reindex(idx, fill_value=0) - b.reindex(idx, fill_value=0))
df3 = df3.groupby('code').resample('D', level='date').sum()
df3['qty'] = df3.groupby('code')['qty'].transform(
    lambda g: poscumsum(g.values))
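For reference, poscumsum on its own just clamps a running sum at zero; a minimal standalone sketch (assuming numpy is imported as np and the function above is defined):
print(poscumsum(np.array([5.0, -10.0, 3.0])))
# running total 5 -> -5 (clamped to 0) -> 3, so this prints [5. 0. 3.]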
Current Output
Each code is only represented for dates on which it appears in the In or Out dfs.
code date qty
0 A 2019-01-10 20
1 A 2019-01-11 17
2 A 2019-01-12 0
3 A 2019-01-13 10
4 A 2019-01-14 10
5 A 2019-01-15 9
6 B 2019-01-10 12
7 B 2019-01-11 41
8 B 2019-01-12 47
9 B 2019-01-13 58
10 B 2019-01-14 58
11 B 2019-01-15 58
12 B 2019-01-16 57
13 C 2019-01-10 10
14 C 2019-01-11 9
15 C 2019-01-12 9
16 C 2019-01-13 9
Desired Output:
Each code is represented for each date between 2019-01-10 and 2019-01-16.
code date qty
0 A 2019-01-10 20
1 A 2019-01-11 17
2 A 2019-01-12 0
3 A 2019-01-13 10
4 A 2019-01-14 10
5 A 2019-01-15 9
6 A 2019-01-16 9
7 B 2019-01-10 12
8 B 2019-01-11 41
9 B 2019-01-12 47
10 B 2019-01-13 58
11 B 2019-01-14 58
12 B 2019-01-15 58
13 B 2019-01-16 57
14 C 2019-01-10 10
15 C 2019-01-11 9
16 C 2019-01-12 9
17 C 2019-01-13 9
18 C 2019-01-14 9
19 C 2019-01-15 9
20 C 2019-01-16 9

Ok, here is a 2D version of poscumsum (and generalized to cap the running sum at min and/or max):
@njit
def cumsum_capped_2d(x, xmin=None, xmax=None):
    # column-wise running sum, optionally capped below at xmin and above at xmax
    n, m = x.shape
    result = np.empty_like(x)
    if n == 0:
        return result
    total = np.zeros_like(x[0])
    for i in range(n):
        total += x[i]
        if xmin is not None:
            total[total < xmin] = xmin
        if xmax is not None:
            total[total > xmax] = xmax
        result[i] = total
    return result
And here is how to use it (now that you want all dates spanning the same period); the good news is that there is no more groupby (so it is faster than ever):
a = df1.pivot('date', 'code', 'qty')
b = df2.pivot('date', 'code', 'qty')
idx = a.index.union(b.index).sort_values()
df3 = (a.reindex(idx, fill_value=0) - b.reindex(idx, fill_value=0)).resample('D').sum()
df3.values[:, :] = cumsum_capped_2d(df3.values, xmin=0)
Or, in two (convoluted) lines:
df3 = (df1.set_index(['date', 'code']).subtract(df2.set_index(['date', 'code']), fill_value=0)
.unstack('code', fill_value=0).resample('D').sum())
df3.values[:, :] = cumsum_capped_2d(df3.values, xmin=0)
On your data:
>>> df3
code A B C
date
2019-01-10 20.0 12.0 10.0
2019-01-11 17.0 41.0 9.0
2019-01-12 0.0 41.0 9.0
2019-01-13 0.0 52.0 9.0
2019-01-14 0.0 52.0 9.0
2019-01-15 0.0 52.0 9.0
2019-01-16 0.0 51.0 9.0
Of course, you are free to stack back into a skinny df, re-order, drop the index, etc. For example, to match your desired output:
>>> df3.stack().swaplevel(0,1).sort_index().reset_index()
code date qty
0 A 2019-01-10 20.0
1 A 2019-01-11 17.0
2 A 2019-01-12 0.0
3 A 2019-01-13 10.0
4 A 2019-01-14 10.0
5 A 2019-01-15 9.0
6 A 2019-01-16 9.0
7 B 2019-01-10 12.0
8 B 2019-01-11 41.0
9 B 2019-01-12 47.0
10 B 2019-01-13 58.0
11 B 2019-01-14 58.0
12 B 2019-01-15 58.0
13 B 2019-01-16 57.0
14 C 2019-01-10 10.0
15 C 2019-01-11 9.0
16 C 2019-01-12 9.0
17 C 2019-01-13 9.0
18 C 2019-01-14 9.0
19 C 2019-01-15 9.0
20 C 2019-01-16 9.0
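Side note: in recent pandas versions (2.x) the arguments of DataFrame.pivot are keyword-only, so the pivot-based variant above would presumably need to be spelled like this:
a = df1.pivot(index='date', columns='code', values='qty')
b = df2.pivot(index='date', columns='code', values='qty')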

Here is another approach using reindex. You can generate a date_range of daily values spanning all groups, called dates. Then, get the unique codes and create a multi-index to reindex by with pd.MultiIndex.from_product(). Finally, reindex and forward fill with ffill():
d = pd.to_datetime(df3.index.get_level_values(1))
dates = pd.date_range(d.min(), d.max(), freq='1d')
codes = df3.index.get_level_values(0).unique()
idx = pd.MultiIndex.from_product([codes, dates], names=['code', 'date'])
df3 = df3.reindex(idx).reset_index().ffill()
Full code and output:
# original code
import numpy as np
from numba import njit

@njit
def poscumsum(x):
    total = 0
    result = np.empty(x.shape)
    for i, y in enumerate(x):
        total += y
        if total < 0:
            total = 0
        result[i] = total
    return result

df1['date'] = pd.to_datetime(df1['date'])
df2['date'] = pd.to_datetime(df2['date'])
a = df1.set_index(['code', 'date'])
b = df2.set_index(['code', 'date'])
idx = a.index.union(b.index).sort_values()
df3 = (a.reindex(idx, fill_value=0) - b.reindex(idx, fill_value=0))
df3 = df3.groupby('code').resample('D', level='date').sum()
df3['qty'] = df3.groupby('code')['qty'].transform(
    lambda g: poscumsum(g.values))
#code I added
d = pd.to_datetime(df3.index.get_level_values(1))
dates = pd.date_range(d.min(), d.max(), freq='1d')
codes = df3.index.get_level_values(0).unique()
idx = pd.MultiIndex.from_product([codes, dates], names=['code', 'date'])
df3 = df3.reindex(idx).reset_index().ffill()
df3
Out[1]:
code date qty
0 A 2019-01-10 20.0
1 A 2019-01-11 17.0
2 A 2019-01-12 0.0
3 A 2019-01-13 10.0
4 A 2019-01-14 10.0
5 A 2019-01-15 9.0
6 A 2019-01-16 9.0
7 B 2019-01-10 12.0
8 B 2019-01-11 41.0
9 B 2019-01-12 47.0
10 B 2019-01-13 58.0
11 B 2019-01-14 58.0
12 B 2019-01-15 58.0
13 B 2019-01-16 57.0
14 C 2019-01-10 10.0
15 C 2019-01-11 9.0
16 C 2019-01-12 9.0
17 C 2019-01-13 9.0
18 C 2019-01-14 9.0
19 C 2019-01-15 9.0
20 C 2019-01-16 9.0

Related

How to transform weekly data to daily in pandas?

I have the following dataframe
import pandas as pd
foo = pd.DataFrame({'date':['2019-09-30', '2019-10-07', '2019-09-30', '2019-10-07'], 'sales': [7, 14, 28, 35], 'country': ['a', 'a', 'b', 'b']})
The date value changes weekly by country.
I would like to expand this dataframe so that the date column changes daily by country and the daily sales value is the weekly sales value divided by 7.
Use DataFrameGroupBy.resample with Resampler.ffill and divide the values by 7. It is also necessary to append the last row per country with 6 days added, to avoid omitting the last days of the last week in each group:
foo['date'] = pd.to_datetime(foo['date'])
mask = foo['country'].duplicated(keep='last')
foo1 = foo[~mask].assign(date = lambda x: x['date'] + pd.Timedelta(6, unit='d'))
foo = foo.append(foo1, ignore_index=True)
print (foo)
date sales country
0 2019-09-30 7 a
1 2019-10-07 14 a
2 2019-09-30 28 b
3 2019-10-07 35 b
4 2019-10-13 14 a
5 2019-10-13 35 b
If the datetimes are not ordered per group, you can use this alternative:
foo1 = (foo.loc[foo.groupby('country')['date'].idxmax()]
.assign(date = lambda x: x['date'] + pd.Timedelta(6, unit='d')))
foo = foo.append(foo1, ignore_index=True)
print (foo)
date sales country
0 2019-09-30 7 a
1 2019-10-07 14 a
2 2019-09-30 28 b
3 2019-10-07 35 b
4 2019-10-13 14 a
5 2019-10-13 35 b
df = (foo.set_index('date')
.groupby('country')['sales']
.resample('d')
.ffill()
.div(7)
.reset_index()
)
print (df)
country date sales
0 a 2019-09-30 1.0
1 a 2019-10-01 1.0
2 a 2019-10-02 1.0
3 a 2019-10-03 1.0
4 a 2019-10-04 1.0
5 a 2019-10-05 1.0
6 a 2019-10-06 1.0
7 a 2019-10-07 2.0
8 a 2019-10-08 2.0
9 a 2019-10-09 2.0
10 a 2019-10-10 2.0
11 a 2019-10-11 2.0
12 a 2019-10-12 2.0
13 a 2019-10-13 2.0
14 b 2019-09-30 4.0
15 b 2019-10-01 4.0
16 b 2019-10-02 4.0
17 b 2019-10-03 4.0
18 b 2019-10-04 4.0
19 b 2019-10-05 4.0
20 b 2019-10-06 4.0
21 b 2019-10-07 5.0
22 b 2019-10-08 5.0
23 b 2019-10-09 5.0
24 b 2019-10-10 5.0
25 b 2019-10-11 5.0
26 b 2019-10-12 5.0
27 b 2019-10-13 5.0
If it is not necessary to add the last rows:
foo['date'] = pd.to_datetime(foo['date'])
df1 = (foo.set_index('date')
.groupby('country')['sales']
.resample('d')
.ffill()
.div(7)
.reset_index()
)
print (df1)
country date sales
0 a 2019-09-30 1.0
1 a 2019-10-01 1.0
2 a 2019-10-02 1.0
3 a 2019-10-03 1.0
4 a 2019-10-04 1.0
5 a 2019-10-05 1.0
6 a 2019-10-06 1.0
7 a 2019-10-07 2.0
8 b 2019-09-30 4.0
9 b 2019-10-01 4.0
10 b 2019-10-02 4.0
11 b 2019-10-03 4.0
12 b 2019-10-04 4.0
13 b 2019-10-05 4.0
14 b 2019-10-06 4.0
15 b 2019-10-07 5.0
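Note that DataFrame.append, used in the step above, was removed in pandas 2.0; on current versions the same step can presumably be written with pd.concat instead:
foo = pd.concat([foo, foo1], ignore_index=True)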

How to add missing dates within date interval?

I have a dataframe as shown below
df = pd.DataFrame({
    'subject_id': [1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
    'time_1': ['2173-04-03 12:35:00', '2173-04-03 12:50:00', '2173-04-05 12:59:00',
               '2173-05-04 13:14:00', '2173-05-05 13:37:00', '2173-07-06 13:39:00',
               '2173-07-08 11:30:00', '2173-04-08 16:00:00', '2173-04-09 22:00:00',
               '2173-04-11 04:00:00', '2173-04-13 04:30:00', '2173-04-14 08:00:00'],
    'val': [5, 5, 5, 5, 1, 6, 5, 5, 8, 3, 4, 6]})
df['time_1'] = pd.to_datetime(df['time_1'])
df['day'] = df['time_1'].dt.day
df['month'] = df['time_1'].dt.month
As you can see from the dataframe above, there are a few missing dates in between. I would like to create new records for those dates and fill in the values from the immediate previous row.
def dt(df):
    r = pd.date_range(start=df.date.min(), end=df.date.max())
    return df.set_index('date').reindex(r)

new_df = df.groupby(['subject_id','month']).apply(dt)
This generates all the dates. I only want to find the missing dates within the input date interval for each subject, for each month.
I did try the code from this related post. Though it helped me, it doesn't get me the expected output for this updated/new requirement. As we do a left join, it copies all records. I can't do an inner join either, because it will drop the non-matching columns. I want a mix of a left join and an inner join.
Currently it creates new records for all 365 days in a year, which I don't want. This is not what I expect.
I only wish to add the missing dates within the input date interval, as shown below. For example, subject = 1 in the 4th month has records on the 3rd and the 5th, but the 4th is missing, so we add a record for the 4th day alone. We don't need the 6th, 7th, etc., unlike the current output. Similarly, in the 7th month the record for the 7th day is missing, so we just add a new record for that.
I expect my output to be as shown below.
The problem here is that you need resample to append the new days, so it is necessary:
df['time_1'] = pd.to_datetime(df['time_1'])
df['day'] = df['time_1'].dt.day
df['date'] = df['time_1'].dt.floor('d')
df1 = (df.set_index('date')
.groupby('subject_id')
.resample('d')
.last()
.index
.to_frame(index=False))
print (df1)
subject_id date
0 1 2173-04-03
1 1 2173-04-04
2 1 2173-04-05
3 1 2173-04-06
4 1 2173-04-07
.. ... ...
99 2 2173-04-10
100 2 2173-04-11
101 2 2173-04-12
102 2 2173-04-13
103 2 2173-04-14
[104 rows x 2 columns]
The idea is to remove unnecessary missing rows - you can create a threshold for the minimum number of consecutive missing values (here 5) and remove those rows (a new column is created for easy testing):
df2 = df1.merge(df, how='left')
thresh = 5
mask = df2['day'].notna()
s = mask.cumsum().mask(mask)
df2['count'] = s.map(s.value_counts())
df2 = df2[(df2['count'] < thresh) | (df2['count'].isna())]
print (df2)
subject_id date time_1 val day count
0 1 2173-04-03 2173-04-03 12:35:00 5.0 3.0 NaN
1 1 2173-04-03 2173-04-03 12:50:00 5.0 3.0 NaN
2 1 2173-04-04 NaT NaN NaN 1.0
3 1 2173-04-05 2173-04-05 12:59:00 5.0 5.0 NaN
32 1 2173-05-04 2173-05-04 13:14:00 5.0 4.0 NaN
33 1 2173-05-05 2173-05-05 13:37:00 1.0 5.0 NaN
95 1 2173-07-06 2173-07-06 13:39:00 6.0 6.0 NaN
96 1 2173-07-07 NaT NaN NaN 1.0
97 1 2173-07-08 2173-07-08 11:30:00 5.0 8.0 NaN
98 2 2173-04-08 2173-04-08 16:00:00 5.0 8.0 NaN
99 2 2173-04-09 2173-04-09 22:00:00 8.0 9.0 NaN
100 2 2173-04-10 NaT NaN NaN 1.0
101 2 2173-04-11 2173-04-11 04:00:00 3.0 11.0 NaN
102 2 2173-04-12 NaT NaN NaN 1.0
103 2 2173-04-13 2173-04-13 04:30:00 4.0 13.0 NaN
104 2 2173-04-14 2173-04-14 08:00:00 6.0 14.0 NaN
Finally, use the previous solution:
df2 = df2.groupby(df['subject_id']).ffill()
dates = df2['time_1'].dt.normalize()
df2['time_1'] += np.where(dates == df2['date'], 0, df2['date'] - dates)
df2['day'] = df2['time_1'].dt.day
df2['val'] = df2['val'].astype(int)
print (df2)
subject_id date time_1 val day count
0 1 2173-04-03 2173-04-03 12:35:00 5 3 NaN
1 1 2173-04-03 2173-04-03 12:50:00 5 3 NaN
2 1 2173-04-04 2173-04-04 12:50:00 5 4 1.0
3 1 2173-04-05 2173-04-05 12:59:00 5 5 1.0
32 1 2173-05-04 2173-05-04 13:14:00 5 4 NaN
33 1 2173-05-05 2173-05-05 13:37:00 1 5 NaN
95 1 2173-07-06 2173-07-06 13:39:00 6 6 NaN
96 1 2173-07-07 2173-07-07 13:39:00 6 7 1.0
97 1 2173-07-08 2173-07-08 11:30:00 5 8 1.0
98 2 2173-04-08 2173-04-08 16:00:00 5 8 1.0
99 2 2173-04-09 2173-04-09 22:00:00 8 9 1.0
100 2 2173-04-10 2173-04-10 22:00:00 8 10 1.0
101 2 2173-04-11 2173-04-11 04:00:00 3 11 1.0
102 2 2173-04-12 2173-04-12 04:00:00 3 12 1.0
103 2 2173-04-13 2173-04-13 04:30:00 4 13 1.0
104 2 2173-04-14 2173-04-14 08:00:00 6 14 1.0
EDIT: Solution with reindex for each month:
df['time_1'] = pd.to_datetime(df['time_1'])
df['day'] = df['time_1'].dt.day
df['date'] = df['time_1'].dt.floor('d')
df['month'] = df['time_1'].dt.month
df1 = (df.drop_duplicates(['date','subject_id'])
.set_index('date')
.groupby(['subject_id', 'month'])
.apply(lambda x: x.reindex(pd.date_range(x.index.min(), x.index.max())))
.rename_axis(('subject_id','month','date'))
.index
.to_frame(index=False)
)
print (df1)
subject_id month date
0 1 4 2173-04-03
1 1 4 2173-04-04
2 1 4 2173-04-05
3 1 5 2173-05-04
4 1 5 2173-05-05
5 1 7 2173-07-06
6 1 7 2173-07-07
7 1 7 2173-07-08
8 2 4 2173-04-08
9 2 4 2173-04-09
10 2 4 2173-04-10
11 2 4 2173-04-11
12 2 4 2173-04-12
13 2 4 2173-04-13
14 2 4 2173-04-14
df2 = df1.merge(df, how='left')
df2 = df2.groupby(df2['subject_id']).ffill()
dates = df2['time_1'].dt.normalize()
df2['time_1'] += np.where(dates == df2['date'], 0, df2['date'] - dates)
df2['day'] = df2['time_1'].dt.day
df2['val'] = df2['val'].astype(int)
print (df2)
subject_id month date time_1 val day
0 1 4 2173-04-03 2173-04-03 12:35:00 5 3
1 1 4 2173-04-03 2173-04-03 12:50:00 5 3
2 1 4 2173-04-04 2173-04-04 12:50:00 5 4
3 1 4 2173-04-05 2173-04-05 12:59:00 5 5
4 1 5 2173-05-04 2173-05-04 13:14:00 5 4
5 1 5 2173-05-05 2173-05-05 13:37:00 1 5
6 1 7 2173-07-06 2173-07-06 13:39:00 6 6
7 1 7 2173-07-07 2173-07-07 13:39:00 6 7
8 1 7 2173-07-08 2173-07-08 11:30:00 5 8
9 2 4 2173-04-08 2173-04-08 16:00:00 5 8
10 2 4 2173-04-09 2173-04-09 22:00:00 8 9
11 2 4 2173-04-10 2173-04-10 22:00:00 8 10
12 2 4 2173-04-11 2173-04-11 04:00:00 3 11
13 2 4 2173-04-12 2173-04-12 04:00:00 3 12
14 2 4 2173-04-13 2173-04-13 04:30:00 4 13
15 2 4 2173-04-14 2173-04-14 08:00:00 6 14
Does this help?
from datetime import timedelta

def fill_dates(df):
    result = pd.DataFrame()
    for i, row in df.iterrows():
        if i == 0:
            result = result.append(row)
        else:
            start_date = result.iloc[-1]['time_1']
            end_date = row['time_1']
            # print(start_date, end_date)
            delta = (end_date - start_date).days
            # print(delta)
            if delta > 0 and start_date.month == end_date.month:
                # insert one row per missing day, copied from the previous row
                for j in range(delta):
                    day = start_date + timedelta(days=j + 1)
                    new_row = result.iloc[-1].copy()
                    new_row['time_1'] = day
                    new_row['remarks'] = 'added'
                    if new_row['time_1'].date() != row['time_1'].date():
                        result = result.append(new_row)
                result = result.append(row)
            else:
                result = result.append(row)
    result.reset_index(inplace=True)
    return result
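A hedged usage sketch (the names below are illustrative, not from the original answer): fill_dates checks i == 0 to detect the first row, so presumably it should be applied per subject with a freshly reset index, for example:
filled = (df.sort_values('time_1')
            .groupby('subject_id', group_keys=False)
            .apply(lambda g: fill_dates(g.reset_index(drop=True))))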

Improve Performance of Apply Method

I would like to group by the variable "cod_id" of my df and then apply this function:
[df.loc[df['dt_op'].between(d, d + pd.Timedelta(days = 7)), 'quantity'].sum() \
for d in df['dt_op']]
Moving from this df:
print(df)
dt_op quantity cod_id
20/01/18 1 613
21/01/18 8 611
21/01/18 1 613
...
To this one:
print(final_df)
n = 7
dt_op quantity product_code Final_Quantity
20/01/18 1 613 2
21/01/18 8 611 8
25/01/18 1 613 1
...
I tried with:
def lookforward(x):
    L = [x.loc[x['dt_op'].between(row.dt_op, row.dt_op + pd.Timedelta(days=7)),
               'quantity'].sum() for row in x.itertuples(index=False)]
    return pd.Series(L, index=x.index)

s = df.groupby('cod_id').apply(lookforward)
s.index = s.index.droplevel(0)
df['Final_Quantity'] = s
print(df)
dt_op quantity cod_id Final_Quantity
0 2018-01-20 1 613 2
1 2018-01-21 8 611 8
2 2018-01-21 1 613 1
But it is not an efficient solution, since it is computationally slow.
How can I improve its performance?
I would be happy even with new code / a new function that leads to the same result.
EDIT:
Here is a subset of the original dataset, with just one product (cod_id == 2), on which I ran the code provided by "w-m":
print(df)
cod_id dt_op quantita final_sum
0 2 2017-01-03 1 54.0
1 2 2017-01-04 1 53.0
2 2 2017-01-13 1 52.0
3 2 2017-01-23 2 51.0
4 2 2017-01-26 1 49.0
5 2 2017-02-03 1 48.0
6 2 2017-02-27 1 47.0
7 2 2017-03-05 1 46.0
8 2 2017-03-15 1 45.0
9 2 2017-03-23 1 44.0
10 2 2017-03-27 2 43.0
11 2 2017-03-31 3 41.0
12 2 2017-04-04 1 38.0
13 2 2017-04-05 1 37.0
14 2 2017-04-15 2 36.0
15 2 2017-04-27 2 34.0
16 2 2017-04-30 1 32.0
17 2 2017-05-16 1 31.0
18 2 2017-05-18 1 30.0
19 2 2017-05-19 1 29.0
20 2 2017-06-03 1 28.0
21 2 2017-06-04 1 27.0
22 2 2017-06-07 1 26.0
23 2 2017-06-13 2 25.0
24 2 2017-06-14 1 23.0
25 2 2017-06-20 1 22.0
26 2 2017-06-22 2 21.0
27 2 2017-06-28 1 19.0
28 2 2017-06-30 1 18.0
29 2 2017-07-03 1 17.0
30 2 2017-07-06 2 16.0
31 2 2017-07-07 1 14.0
32 2 2017-07-13 1 13.0
33 2 2017-07-20 1 12.0
34 2 2017-07-28 1 11.0
35 2 2017-08-06 1 10.0
36 2 2017-08-07 1 9.0
37 2 2017-08-24 1 8.0
38 2 2017-09-06 1 7.0
39 2 2017-09-16 2 6.0
40 2 2017-09-20 1 4.0
41 2 2017-10-07 1 3.0
42 2 2017-11-04 1 2.0
43 2 2017-12-07 1 1.0
Edit 181017: this approach doesn't work, because forward-rolling functions on sparse time series are not currently supported by pandas; see the comments.
Using for loops can be a performance killer when doing pandas operations.
The for loop around the rows plus their timedelta of 7 days can be replaced with a .rolling("7D"). To get a forward-rolling time delta (current date + 7 days), we reverse the df by date, as shown here.
Then no custom function is required anymore, and you can just take .quantity.sum() from the groupby.
quant_sum = df.sort_values("dt_op", ascending=False).groupby("cod_id") \
.rolling("7D", on="dt_op").quantity.sum()
cod_id dt_op
611 2018-01-21 8.0
613 2018-01-21 1.0
2018-01-20 2.0
Name: quantity, dtype: float64
result = df.set_index(["cod_id", "dt_op"])
result["final_sum"] = quant_sum
result.reset_index()
cod_id dt_op quantity final_sum
0 613 2018-01-20 1 2.0
1 611 2018-01-21 8 8.0
2 613 2018-01-21 1 1.0
Implementing the exact behavior from the question is difficult due to two shortcomings in pandas: neither groupby/rolling/transform nor forward-looking rolling over sparse dates is implemented (see the other answer for more details).
This answer attempts to work around both by resampling the data, filling in all days, and then joining the quant_sums back with the original data.
# Create a temporary df with all in between days filled in with zeros
filled = df.set_index("dt_op").groupby("cod_id") \
.resample("D").asfreq().fillna(0) \
.quantity.to_frame()
# Reverse and sum
filled["quant_sum"] = filled.reset_index().set_index("dt_op") \
.iloc[::-1] \
.groupby("cod_id") \
.rolling(7, min_periods=1) \
.quantity.sum().astype(int)
# Join with original `df`, dropping the filled days
result = df.set_index(["cod_id", "dt_op"]).join(filled.quant_sum).reset_index()

pandas cumulative sum of stock in warehouse

Consider the warehouse stocks on different days
day action quantity symbol
0 1 40 a
1 1 53 b
2 -1 21 a
3 1 21 b
4 -1 2 a
5 1 42 b
Here, day represents the time series, and action represents a buy (1) or sell (-1) of the given quantity of a specific product (symbol).
For this dataframe, how do I calculate the daily cumulative sum for each product?
Basically, a resultant dataframe as below:
days a b
0 40 0
1 40 53
2 19 53
3 19 64
4 17 64
5 17 106
I have tried cumsum() with groupby and was unsuccessful with it
Using pivot_table
In [920]: dff = df.pivot_table(
index=['day', 'action'], columns='symbol',
values='quantity').reset_index()
In [921]: dff
Out[921]:
symbol day action a b
0 0 1 40.0 NaN
1 1 1 NaN 53.0
2 2 -1 21.0 NaN
3 3 1 NaN 21.0
4 4 -1 2.0 NaN
5 5 1 NaN 42.0
Then, mul the action, take cumsum, forward fill missing values, and finally replace NaNs with 0
In [922]: dff[['a', 'b']].mul(df.action, 0).cumsum().ffill().fillna(0)
Out[922]:
symbol a b
0 40.0 0.0
1 40.0 53.0
2 19.0 53.0
3 19.0 74.0
4 17.0 74.0
5 17.0 116.0
Final result
In [926]: dff[['a', 'b']].mul(df.action, 0).cumsum().ffill().fillna(0).join(df.day)
Out[926]:
a b day
0 40.0 0.0 0
1 40.0 53.0 1
2 19.0 53.0 2
3 19.0 74.0 3
4 17.0 74.0 4
5 17.0 116.0 5
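For completeness, a hedged alternative sketch using the groupby/cumsum route the question mentions (the names signed and out are illustrative, not from the original answer); it should give the same numbers as the pivot_table approach above:
out = (df.assign(signed=df['quantity'] * df['action'])
         .groupby(['day', 'symbol'])['signed'].sum()
         .unstack(fill_value=0)   # one column per symbol
         .cumsum()                # running stock per symbol
         .reset_index())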
Nevermind, didn't see pandas tag. This is just plain Python.
Try this:
sums = []
currentsums = {'a': 0, 'b': 0}
for i in data:
    currentsums[i['symbol']] += i['action'] * i['quantity']
    sums.append({'a': currentsums['a'], 'b': currentsums['b']})
Try it online!
Note that it gives a different result than you posted because you calculated wrong: for example, for symbol b on day 3 the running total should be 53 + 21 = 74, not 64.

How to sum python pandas dataframe in certain time range

I have a dataframe like this
df
order_date amount
0 2015-10-02 1
1 2015-12-21 15
2 2015-12-24 3
3 2015-12-26 4
4 2015-12-27 5
5 2015-12-28 10
I would like to sum on df["amount"] based on range from df["order_date"] to df["order_date"] + 6 days
order_date amount sum
0 2015-10-02 1 1
1 2015-12-21 15 27 //comes from 15 + 3 + 4 + 5
2 2015-12-24 3 22 //comes from 3 + 4 + 5 + 10
3 2015-12-26 4 19
4 2015-12-27 5 15
5 2015-12-28 10 10
the data type of order_date is datetime
I have tried to use iloc but it did not work well...
If anyone has any idea/example on how to work on this, please kindly let me know.
If pandas rolling allowed a left-aligned window (the default is right-aligned), the answer would be a simple one-liner: df.set_index('order_date').amount.rolling('7d',min_periods=1,align='left').sum(). However, forward-looking windows have not been implemented yet (i.e. rolling does not accept an align parameter). So the trick I came up with is to "reverse" the dates temporarily. Solution:
df.index = pd.to_datetime(pd.datetime.now() - df.order_date)
df['sum'] = df.sort_index().amount.rolling('7d',min_periods=1).sum()
df.reset_index(drop=True)
Output:
order_date amount sum
0 2015-10-02 1 1.0
1 2015-12-21 15 27.0
2 2015-12-24 3 22.0
3 2015-12-26 4 19.0
4 2015-12-27 5 15.0
5 2015-12-28 10 10.0
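A hedged aside for newer pandas (not part of the original answer): since pandas 1.1 there is pandas.api.indexers.FixedForwardWindowIndexer for forward-looking windows, but it counts rows rather than a '7d' offset, so one sketch is to resample to daily frequency first so that 7 rows equal 7 days:
from pandas.api.indexers import FixedForwardWindowIndexer

# assumes df['order_date'] is already datetime, as in the question
daily = df.set_index('order_date')['amount'].resample('D').sum()
indexer = FixedForwardWindowIndexer(window_size=7)   # current day + next 6 days
fwd = daily.rolling(indexer, min_periods=1).sum()
df['sum'] = fwd.loc[df['order_date']].values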
Expanding on my comment:
from datetime import timedelta
df['sum'] = 0
for i in range(len(df)):
    dt1 = df['order_date'][i]
    dt2 = dt1 + timedelta(days=6)
    df['sum'][i] = sum(df['amount'][(df['order_date'] >= dt1) & (df['order_date'] <= dt2)])
There's probably a much better way to do this but it works...
Here is my way for this problem. It works... (I believe there should be a much better way to do this.)
import pandas as pd
df['order_date'] = pd.to_datetime(pd.Series(df.order_date))
Temp = pd.DataFrame(pd.date_range(start='2015-10-02', end='2017-01-01'), columns=['STDate'])
Temp = Temp.merge(df, left_on='STDate', right_on='order_date', how='left')
Temp['amount'] = Temp['amount'].fillna(0)
Temp.sort_values(['STDate'], ascending=False, inplace=True)
Temp['rolls'] = Temp['amount'].rolling(window=7, min_periods=0).sum()
Temp.loc[Temp.STDate.isin(df.order_date), :].sort_values(['STDate'], ascending=True)
STDate Unnamed: 0 order_date amount rolls
0 2015-10-02 0.0 2015-10-02 1.0 1.0
80 2015-12-21 1.0 2015-12-21 15.0 27.0
83 2015-12-24 2.0 2015-12-24 3.0 22.0
85 2015-12-26 3.0 2015-12-26 4.0 19.0
86 2015-12-27 4.0 2015-12-27 5.0 15.0
87 2015-12-28 5.0 2015-12-28 10.0 10.0
Set order_date to be DatetimeIndex, so that you can use df.ix[time1:time2] to get the time range rows, then filter amount column and sum them.
You can try with :
from datetime import timedelta
df = pd.read_fwf('test2.csv')
df.order_date = pd.to_datetime(df.order_date)
df =df.set_index(pd.DatetimeIndex(df['order_date']))
sum_list = list()
for i in range(len(df)):
    sum_list.append(df.ix[df.ix[i]['order_date']:(df.ix[i]['order_date'] + timedelta(days=6))]['amount'].sum())
df['sum'] = sum_list
df
Output:
order_date amount sum
2015-10-02 2015-10-02 1 1
2015-12-21 2015-12-21 15 27
2015-12-24 2015-12-24 3 22
2015-12-26 2015-12-26 4 19
2015-12-27 2015-12-27 5 15
2015-12-28 2015-12-28 10 10
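Since .ix has been removed from recent pandas versions, a hedged equivalent of the loop above using label slicing with .loc on the same DatetimeIndex setup might look like this sketch:
sum_list = []
for d in df['order_date']:
    # label-based slice over the 7-day window (assumes the sorted DatetimeIndex set above)
    sum_list.append(df.loc[d:d + timedelta(days=6), 'amount'].sum())
df['sum'] = sum_list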
import datetime
df['order_date'] = pd.to_datetime(df['order_date'], format='%Y-%m-%d')
df.set_index(['order_date'], inplace=True)
# Sum rows within the range of six days in the future
d = {t: df[(df.index >= t) & (df.index <= t + datetime.timedelta(days=6))]['amount'].sum()
for t in df.index}
# Assign the summed values back to the dataframe
df['amount_sum'] = [d[t] for t in df.index]
df is now:
amount amount_sum
order_date
2015-10-02 1.0 1.0
2015-12-21 15.0 27.0
2015-12-24 3.0 22.0
2015-12-26 4.0 19.0
2015-12-27 5.0 15.0
2015-12-28 10.0 10.0
