Pandas calculate aggregate value with respect to current row - python

Let's say we have this data:
import pandas as pd

df = pd.DataFrame({
    'group_id': [100, 100, 100, 101, 101, 101, 101],
    'amount': [30, 40, 10, 20, 25, 80, 40]
})
df.index.name = 'id'
df.set_index(['group_id', df.index], inplace=True)
It looks like this:
amount
group_id id
100 0 30
1 40
2 10
101 3 20
4 25
5 80
6 40
The goal is to compute a new column that is the sum of all amounts less than the current one within the same group, i.e. we want this result:
amount sum_of_smaller_amounts
group_id id
100 0 30 10
1 40 40 # 30 + 10
2 10 0 # smallest amount
101 3 20 0 # smallest
4 25 20
5 80 85 # 20 + 25 + 40
6 40 45 # 20 + 25
Ideally this should be (very) efficient as the real dataframe could be millions of rows.
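For reference, a minimal brute-force sketch of the target semantics (quadratic per group, so a correctness baseline rather than the efficient solution):
# for each row, sum the amounts in its group that are strictly smaller
df['sum_of_smaller_amounts'] = df.groupby('group_id')['amount'].transform(
    lambda s: s.map(lambda v: s[s < v].sum())
)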

Better solution (I think):
df_sort = df.sort_values('amount')
df['sum_smaller_amount'] = (df_sort.groupby('group_id')['amount']
                            .transform(lambda x: x.mask(x.duplicated(), 0).cumsum()) -
                            df['amount'])
Output:
amount sum_smaller_amount
group_id id
100 0 30 10.0
1 40 40.0
2 10 0.0
101 3 20 0.0
4 25 20.0
5 80 85.0
6 40 45.0
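Note that masking duplicated amounts to zero means a repeated amount is counted only once in the running sum, so with ties in a group every larger amount sees the tied value once rather than per occurrence; the duplicate-handling update in the next answer treats ties differently.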
Another way to do this is to use a cartesian product and filter:
df.merge(df.reset_index(), on='group_id', suffixes=('_sum_smaller', ''))\
  .query('amount_sum_smaller < amount')\
  .groupby(['group_id', 'id'])[['amount_sum_smaller']].sum()\
  .join(df, how='right').fillna(0)
Output:
amount_sum_smaller amount
group_id id
100 0 10.0 30
1 40.0 40
2 0.0 10
101 3 0.0 20
4 20.0 25
5 85.0 80
6 45.0 40
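Be aware that the within-group cartesian product grows quadratically with group size, so on millions of rows this approach can get expensive in both time and memory.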

You want sort_values and cumsum:
df['new_amount'] = (df.sort_values('amount')
                      .groupby(level='group_id')
                      ['amount'].cumsum() - df['amount'])
Output:
amount new_amount
group_id id
100 0 30 10
1 40 40
2 10 0
101 3 20 0
4 25 20
5 80 85
6 40 45
Update: fix for repeated values:
# the data
df = pd.DataFrame({
    'group_id': [100, 100, 100, 100, 101, 101, 101, 101],
    'amount': [30, 40, 10, 30, 20, 25, 80, 40]
})
df.index.name = 'id'
df.set_index(['group_id', df.index], inplace=True)

# sort values
df_sorted = df.sort_values('amount')

# cumulative sum within each group
s1 = df_sorted.groupby('group_id')['amount'].cumsum()

# running count of each (group_id, amount) pair, current row included
s2 = df_sorted.groupby(['group_id', 'amount']).cumcount() + 1

# instead of just subtracting df['amount'], we subtract amount * counts
df['new_amount'] = s1 - df['amount'].mul(s2)
Output (note the two values 30 in group 100). Subtracting amount * s2 removes the current row and any earlier equal values from the running sum, leaving only the strictly smaller amounts:
amount new_amount
group_id id
100 0 30 10
1 40 70
2 10 0
3 30 10
101 4 20 0
5 25 20
6 80 85
7 40 45

I'm intermediate with pandas and not sure about efficiency, but here's a solution:
# sort so that, within each group, smaller amounts come first
temp_df = df.sort_values(['group_id', 'amount'])
# zero out repeated amounts before the cumulative sum so equal values are not re-counted
temp_df = temp_df.mask(temp_df['amount'] == temp_df['amount'].shift(), other=0).groupby(level='group_id').cumsum()
df['sum'] = temp_df.sort_index(level='id')['amount'] - df['amount']
Result:
amount sum
group_id id
100 0 30 10
1 40 40
2 10 0
101 3 20 0
4 25 20
5 80 85
6 40 45
7 40 45
You can substitute the last line with these if they help efficiency somehow:
df['sum'] = df.subtract(temp_df).multiply(-1)
# or, using integer bitwise NOT (~x == -x - 1), which also evaluates to temp_df - df
df['sum'] = (~df).add(temp_df + 1)

Related

Dropping duplicates on one specific column and adding a new column as a count of repeat records - pandas

I have a pandas df like this:
student_id   A   B
         1   3  13
         2   4  23
         1   5  12
         4  28  32
         1  38  12
         2  21  14
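For reproducibility, a sketch constructing this frame:
import pandas as pd

df = pd.DataFrame({
    'student_id': [1, 2, 1, 4, 1, 2],
    'A': [3, 4, 5, 28, 38, 21],
    'B': [13, 23, 12, 32, 12, 14],
})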
I want to drop the duplicates and count how many duplicate rows there are per student_id, keeping the last record/row and appending the count as a new column, and also add the rounded average of the duplicated rows' A and B entries as new columns. My desired output:
student_id   A   B  count  average A rounded  average B rounded
         1  38  12      3                 15                 12
         2  21  14      2                 13                 19
         4  28  32      1                 28                 32
You can use named aggregation:
import numpy as np

df.groupby('student_id', as_index=False).agg(
    A=('A', 'last'),
    B=('B', 'last'),
    count=('student_id', 'count'),
    average_A_rounded=('A', lambda x: np.mean(x).round()),
    average_B_rounded=('B', lambda x: np.mean(x).round()),
)
# student_id A B count average_A_rounded average_B_rounded
# 0 1 38 12 3 15 12
# 1 2 21 14 2 12 18
# 2 4 28 32 1 28 32
I see that you want to round the values half-up. NumPy's round rounds half to even, which is why the answer above gives 12 and 18 for student 2 instead of 13 and 19. So, to extend the @tdy answer:
def round_half_up(x):
    # round .5 away from zero, instead of NumPy's round-half-to-even
    mask = x >= 0
    out = np.empty_like(x)
    out[mask] = np.floor(x[mask] + 0.5)
    out[~mask] = np.ceil(x[~mask] - 0.5)
    return out

df = df.groupby("student_id", as_index=False).agg(
    A=("A", "last"),
    B=("B", "last"),
    count=("A", "count"),
    average_A_rounded=("A", "mean"),
    average_B_rounded=("B", "mean"),
)
print(df.apply(round_half_up).astype(int))
Prints:
student_id A B count average_A_rounded average_B_rounded
0 1 38 12 3 15 12
1 2 21 14 2 13 19
2 4 28 32 1 28 32

Applying Pandas iterrows logic across many groups in a dataframe

I am having trouble applying some logic across my entire dataset. I am able to apply the logic to a small "group", but not to all of the groups (note: the groups are defined by primaryFilter and secondaryFilter). Would you all mind pointing me in the right direction?
Entire Data
import pandas as pd
import numpy as np
myInput = {
    'primaryFilter': [100,100,100,100,100,100,100,100,100,100,200,200,200,200,200,200,200,200,200,200],
    'secondaryFilter': [1,1,1,1,2,2,2,3,3,3,1,1,2,2,2,2,3,3,3,3],
    'constantValuePerGroup': [15,15,15,15,20,20,20,17,17,17,10,10,30,30,30,30,22,22,22,22],
    'someValue': [3,1,4,7,9,9,2,7,3,7,6,4,7,10,10,3,4,6,7,5]
}
df_input = pd.DataFrame(data=myInput)
df_input
Test Data (First Group)
df_test = df_input[df_input.primaryFilter.isin([100])]
df_test = df_test[df_test.secondaryFilter == 1.0]
df_test['newColumn'] = np.nan
for index, row in df_test.iterrows():
    if index == 0:
        print("start")
        df_test.loc[0, 'newColumn'] = 0
    elif index == df_test.shape[0] - 1:
        df_test.loc[index, 'newColumn'] = df_test.loc[index-1, 'newColumn'] + df_test.loc[index-1, 'someValue']
        print("end")
    else:
        print("inter")
        df_test.loc[index, 'newColumn'] = df_test.loc[index-1, 'newColumn'] + df_test.loc[index-1, 'someValue']

df_test["delta"] = df_test["constantValuePerGroup"] - df_test['newColumn']
df_test.head()
Here is the output of the test. I would now like to apply the above logic to the remaining groups: (100, 2), (100, 3), (200, 1), and so forth.
No need to use iterrows here. You can group the dataframe on the primaryFilter and secondaryFilter columns, then for each group take the cumulative sum of someValue and shift it one position downwards (filling with 0) to obtain newColumn. Finally, subtract newColumn from constantValuePerGroup to get the delta.
df_input['newColumn'] = df_input.groupby(['primaryFilter', 'secondaryFilter'])['someValue'].apply(lambda s: s.cumsum().shift(fill_value=0))
df_input['delta'] = df_input['constantValuePerGroup'] - df_input['newColumn']
>>> df_input
primaryFilter secondaryFilter constantValuePerGroup someValue newColumn delta
0 100 1 15 3 0 15
1 100 1 15 1 3 12
2 100 1 15 4 4 11
3 100 1 15 7 8 7
4 100 2 20 9 0 20
5 100 2 20 9 9 11
6 100 2 20 2 18 2
7 100 3 17 7 0 17
8 100 3 17 3 7 10
9 100 3 17 7 10 7
10 200 1 10 6 0 10
11 200 1 10 4 6 4
12 200 2 30 7 0 30
13 200 2 30 10 7 23
14 200 2 30 10 17 13
15 200 2 30 3 27 3
16 200 3 22 4 0 22
17 200 3 22 6 4 18
18 200 3 22 7 10 12
19 200 3 22 5 17 5
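As a further simplification (a sketch, relying on the identity that a cumulative sum shifted down one row equals the cumulative sum minus the current value), the apply can be replaced with fully vectorized operations:
# exclusive prefix sum per group: inclusive cumsum minus the row's own value
grouped = df_input.groupby(['primaryFilter', 'secondaryFilter'])['someValue']
df_input['newColumn'] = grouped.cumsum() - df_input['someValue']
df_input['delta'] = df_input['constantValuePerGroup'] - df_input['newColumn']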

How to add a differentiated series result to another column from index 0 in a pandas dataframe?

Here is DataFrame df1, from which I take column A as a series.
df1
A B
0 10 SLC
1 20 MNS
2 60 LLK
3 40 GNT
4 22 VJZ
5 06 NLR
I have differentiated the series with the below code.
df1['difference'] = df1['A'].diff().fillna(0)
df1
A B difference
0 10 SLC 0 <<---- place 10-20 = -10 value here
1 20 MNS -10 <<---- place 20-60 = -40 value here
2 60 LLK -40 <<---- place 60-40 = 20 value here
3 40 GNT 20 ..............
4 22 VJZ 18 ..............
5 06 NLR 16 ..............
How do I place the difference between 10 and 20 at index 0 of the 'difference' column, and so on?
Change the default periods=1 to -1 to take the difference with the following row:
df1['difference'] = df1['A'].diff(-1).fillna(df1['A'])
print (df1)
A B difference
0 10 SLC -10.0
1 20 MNS -40.0
2 60 LLK 20.0
3 40 GNT 18.0
4 22 VJZ 16.0
5 6 NLR 6.0
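Equivalently (a sketch), diff(-1) is just the column minus its upward-shifted copy:
# A.diff(-1) computes A[i] - A[i+1]; the last row is NaN and gets filled with A itself
df1['difference'] = (df1['A'] - df1['A'].shift(-1)).fillna(df1['A'])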

Fill in missing values based on series and populate second row based on previous or next row

I have a CSV with 4 columns. The file is missing some rows of the sequence in No.
Input:-
No A B C
1 10 50 12
3 40 50 12
4 20 60 15
6 80 80 18
Output:-
No A B C
1 10 50 12
2 10 50 12
3 40 50 12
4 20 60 15
5 20 60 15
6 80 80 18
I need python and pandas code to generate the above output.
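A sketch constructing the sample input (in practice df would come from pd.read_csv):
import pandas as pd

df = pd.DataFrame({
    'No': [1, 3, 4, 6],
    'A': [10, 40, 20, 80],
    'B': [50, 50, 60, 80],
    'C': [12, 12, 15, 18],
})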
If No is a column, create the index from No and use DataFrame.reindex with a range of all possible values:
v = range(df['No'].min(), df['No'].max() + 1)
df1 = df.set_index('No').reindex(v, method='ffill').reset_index()
print (df1)
No A B C
0 1 10 50 12
1 2 10 50 12
2 3 40 50 12
3 4 20 60 15
4 5 20 60 15
5 6 80 80 18
If No is the index, the solution changes a bit (reindex with method='ffill' requires a monotonically increasing index, as here):
v = range(df.index.min(), df.index.max() + 1)
df1 = df.reindex(v, method='ffill')
print (df1)
A B C
No
1 10 50 12
2 10 50 12
3 40 50 12
4 20 60 15
5 20 60 15
6 80 80 18
Create a dataframe of your missing rows:
import numpy as np

missing_list = [[i] + [np.nan] * (df.shape[1] - 1)
                for i in range(df.No.min(), df.No.max())
                if i not in df.No.values]
missing_df = pd.DataFrame(missing_list, columns=df.columns)
Concat to the original dataframe, sort, and forward-fill:
pd.concat([df, missing_df]).sort_values('No').ffill()
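The result has float columns, since the inserted rows start as NaN; a sketch restoring integer dtypes and a clean index:
out = pd.concat([df, missing_df]).sort_values('No').ffill().reset_index(drop=True).astype(int)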

Calculate the amount spent in every month, which depends on another column value ID

I am trying to get the amount spent per TYPE_ID based on the month column.
Dataset:
ID TYPE_ID Month_year Amount
100 1 jun_2019 20
100 1 jul_2019 30
100 2 jun_2019 10
200 1 jun_2019 50
200 1 jun_2019 30
100 2 jul_2019 20
200 2 jun_2019 40
200 2 jul_2019 10
200 2 jun_2019 20
200 1 jul_2019 30
100 1 jul_2019 10
Output:
For every TYPE_ID, I want to calculate the spend depending on the month. The column TYPEID_1_jun2019 gives the number of transactions done in that particular month, and Amount_type1_jun2019 gives the total amount spent in that month for the given TYPE_ID.
ID TYPEID_1_jun2019 Amount_type1_jun2019 TYPEID_1_jul2019 Amount_type1_jul2019 TYPEID_2_jun2019 Amount_type2_jun2019 TYPEID_2_jul2019 Amount_type2_jul2019
100 1 20 2 40 1 10 1 20
200 1 80 1 30 2 60 1 10
EDIT: I also want to calculate the average monthly spend for every ID.
Output: also include these columns,
ID Average_type1_jul2019 Average_type1_jun2019
100 20 10
The formula I used to calculate the average is the amount spent in July with TYPE_ID 1 divided by the total number of months.
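For reproducibility, a sketch constructing the question's data:
import pandas as pd

df = pd.DataFrame({
    'ID': [100, 100, 100, 200, 200, 100, 200, 200, 200, 200, 100],
    'TYPE_ID': [1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1],
    'Month_year': ['jun_2019', 'jul_2019', 'jun_2019', 'jun_2019', 'jun_2019',
                   'jul_2019', 'jun_2019', 'jul_2019', 'jun_2019', 'jul_2019', 'jul_2019'],
    'Amount': [20, 30, 10, 50, 30, 20, 40, 10, 20, 30, 10],
})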
First convert Month_year to datetimes for correct ordering, then create a helper column type and aggregate sum together with size, reshape with DataFrame.unstack, sort with DataFrame.sort_index, and finally flatten the MultiIndex, converting the datetimes back to the original format:
df['Month_year'] = pd.to_datetime(df['Month_year'], format='%b_%Y')

df1 = (df.assign(type=df['TYPE_ID'])
         .groupby(['ID', 'Month_year', 'TYPE_ID'])
         .agg({'Amount': 'sum', 'type': 'size'})
         .unstack([1, 2])
         .sort_index(axis=1, level=[1, 2]))
df1.columns = df1.columns.map(lambda x: f'{x[0]}_{x[2]}_{x[1].strftime("%b_%Y")}')
df1 = df1.reset_index()
print (df1)
ID Amount_1_Jun_2019 type_1_Jun_2019 Amount_2_Jun_2019 \
0 100 20 1 10
1 200 80 2 60
type_2_Jun_2019 Amount_1_Jul_2019 type_1_Jul_2019 Amount_2_Jul_2019 \
0 1 40 2 20
1 2 30 1 10
type_2_Jul_2019
0 1
1 1
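A possibly shorter equivalent of the sum/size aggregation is pivot_table (a sketch, assuming Month_year has already been converted as above; the column flattening would still be needed):
# one pivot_table call yields both the per-month sums and the transaction counts
out = df.pivot_table(index='ID', columns=['TYPE_ID', 'Month_year'],
                     values='Amount', aggfunc=['sum', 'count'])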
EDIT:
# removed the sorting and the flattening of the MultiIndex
df['Month_year'] = pd.to_datetime(df['Month_year'], format='%b_%Y')

df1 = (df.assign(type=df['TYPE_ID'])
         .groupby(['ID', 'Month_year', 'TYPE_ID'])
         .agg({'Amount': 'sum', 'type': 'size'})
         .unstack([1, 2]))
print (df1)
Amount type
Month_year 2019-06-01 2019-07-01 2019-06-01 2019-07-01
TYPE_ID 1 2 1 2 1 2 1 2
ID
100 20 10 40 20 1 1 2 1
200 80 60 30 10 2 2 1 1
# get the number of unique Month_year values per ID and TYPE_ID, then divide Amount by it
df2 = df.groupby(['ID', 'TYPE_ID'])['Month_year'].nunique().unstack()
df3 = df1.xs('Amount', axis=1, level=0).div(df2, level=1)

# add top level 'Average'
df3.columns = pd.MultiIndex.from_tuples([('Average', a, b) for a, b in df3.columns])
print (df3)
Average
2019-06-01 2019-07-01
1 2 1 2
ID
100 10.0 5.0 20.0 10.0
200 40.0 30.0 15.0 5.0
# join together, sort, and flatten the MultiIndex
df5 = pd.concat([df1, df3], axis=1).sort_index(axis=1, level=[1, 2])
df5.columns = df5.columns.map(lambda x: f'{x[0]}_{x[2]}_{x[1].strftime("%b_%Y")}')
df5 = df5.reset_index()
print (df5)
print (df5)
ID Amount_1_Jun_2019 Average_1_Jun_2019 type_1_Jun_2019 \
0 100 20 10.0 1
1 200 80 40.0 2
Amount_2_Jun_2019 Average_2_Jun_2019 type_2_Jun_2019 Amount_1_Jul_2019 \
0 10 5.0 1 40
1 60 30.0 2 30
Average_1_Jul_2019 type_1_Jul_2019 Amount_2_Jul_2019 Average_2_Jul_2019 \
0 20.0 2 20 10.0
1 15.0 1 10 5.0
type_2_Jul_2019
0 1
1 1
