Python Pandas groupby limited cumulative sum

Python Pandas groupby limited cumulative sum - python

This is my dataframe
import pandas as pd
import numpy as np
data = {'c1':[-1,-1,1,1,np.nan,1,1,1,1,1,np.nan,-1],\
'c2':[1,1,1,-1,1,1,-1,-1,1,-1,1,np.nan]}
index = pd.date_range('2000-01-01','2000-03-20', freq='W')
df = pd.DataFrame(index=index, data=data)
>>> df
c1 c2
2000-01-02 -1.0 1.0
2000-01-09 -1.0 1.0
2000-01-16 1.0 1.0
2000-01-23 1.0 -1.0
2000-01-30 NaN 1.0
2000-02-06 1.0 1.0
2000-02-13 1.0 -1.0
2000-02-20 1.0 -1.0
2000-02-27 1.0 1.0
2000-03-05 1.0 -1.0
2000-03-12 NaN 1.0
2000-03-19 -1.0 NaN
and this is a cumulative sum by month
df2 = df.groupby(df.index.to_period('m')).cumsum()
>>> df2
c1 c2
2000-01-02 -1.0 1.0
2000-01-09 -2.0 2.0
2000-01-16 -1.0 3.0
2000-01-23 0.0 2.0
2000-01-30 NaN 3.0
2000-02-06 1.0 1.0
2000-02-13 2.0 0.0
2000-02-20 3.0 -1.0
2000-02-27 4.0 0.0
2000-03-05 1.0 -1.0
2000-03-12 NaN 0.0
2000-03-19 0.0 NaN
What I also need is to cap the running sum so that it never goes above 3 or below 0, something like this function:
def cumsum2(arr, low=-float('Inf'), high=float('Inf')):
    """Return a running sum of arr that is clamped to [low, high] at every step.

    NaN elements add nothing to the total; their output slot receives the
    current running total.

    Parameters:
        arr: array-like of numbers.
        low: lower bound applied to the running total after each addition.
        high: upper bound applied to the running total after each addition.

    Returns:
        A copy of arr (made with np.copy) whose elements are the clamped totals.
    """
    arr2 = np.copy(arr)
    sm = 0
    # NOTE: np.ndenumerate visits the elements of a multi-dimensional input one
    # after another in a single pass, so one running total is shared by the
    # whole array rather than restarted for each column.
    for index, elem in np.ndenumerate(arr):
        if not np.isnan(elem):
            sm += elem
            if sm > high:
                sm = high
            if sm < low:
                sm = low
        arr2[index] = sm
    return arr2
the desired result is
c1 c2
2000-01-02 0.0 1.0
2000-01-09 0.0 2.0
2000-01-16 1.0 3.0
2000-01-23 2.0 2.0
2000-01-30 2.0 3.0
2000-02-06 1.0 1.0
2000-02-13 2.0 0.0
2000-02-20 3.0 0.0
2000-02-27 3.0 1.0
2000-03-05 1.0 0.0
2000-03-12 1.0 1.0
2000-03-19 0.0 1.0
I tried to use apply with a lambda, but it doesn't work and it's slow for a large dataframe.
df.groupby(df.index.to_period('m')).apply(lambda x: cumsum2(x, 0, 3))
What's wrong? Is there a faster way?

You can try accumulate from itertools and use a custom function to clip values between 0 and 3:
from itertools import accumulate
lb = 0  # lower bound
ub = 3  # upper bound


def cumsum2(dfm, low=None, high=None):
    """Return the cumulative sum of dfm, clipped to [low, high] after each step.

    Parameters:
        dfm: object exposing to_numpy() (e.g. the Series that
            groupby(...).transform passes in).
        low: lower bound; defaults to the module-level lb when None.
        high: upper bound; defaults to the module-level ub when None.

    Returns:
        A list with one clipped running total per element of dfm.
    """
    low = lb if low is None else low
    high = ub if high is None else high

    def clip(bal, val):
        return np.clip(bal + val, low, high)

    # accumulate() is seeded with a balance of 0; that seed is the first item
    # it yields, so it is dropped from the result.
    return list(accumulate(dfm.to_numpy(), clip, initial=0))[1:]
out = df.fillna(0).groupby(df.index.to_period('m')).transform(cumsum2)
Output:
>>> out
c1 c2
2000-01-02 0.0 1.0
2000-01-09 0.0 2.0
2000-01-16 1.0 3.0
2000-01-23 2.0 2.0
2000-01-30 2.0 3.0
2000-02-06 1.0 1.0
2000-02-13 2.0 0.0
2000-02-20 3.0 0.0
2000-02-27 3.0 1.0
2000-03-05 1.0 0.0
2000-03-12 1.0 1.0
2000-03-19 0.0 1.0

For a case like this we can use pandas.Series.rolling with a window of size 2, passing each window to a custom function that keeps every intermediate sum within the given bounds:
# Clamped cumulative sum built on a 2-row rolling window: each window adds its
# first value to its second and clamps the result to [low, high].
# NOTE(review): f writes the clamped value back into the window (w[-1] = ...)
# and the following window reads it as w[0]; this appears to rely on the window
# sharing memory with the data being rolled over -- confirm on the pandas
# version in use.
# Presumably x is a DataFrame (it is called on groupby groups below the
# definition); verify against the caller.
def cumsum_tsh(x, low=-float('Inf'), high=float('Inf')):
def f(w):
# A window of size 1 (allowed by min_periods=1) is clamped on its own;
# otherwise the two window values are summed and then clamped.
w[-1] = min(high, max(low, w[0] if w.size == 1 else w[0] + w[1]))
return w[-1]
# The rolling computation is run through x.apply, once per item it yields.
return x.apply(lambda s: s.rolling(2, min_periods=1).apply(f))
res = df.fillna(0).groupby(df.index.to_period('m'), group_keys=False)\
.apply(lambda x: cumsum_tsh(x, 0, 3))
c1 c2
2000-01-02 0.0 1.0
2000-01-09 0.0 2.0
2000-01-16 1.0 3.0
2000-01-23 2.0 2.0
2000-01-30 2.0 3.0
2000-02-06 1.0 1.0
2000-02-13 2.0 0.0
2000-02-20 3.0 0.0
2000-02-27 3.0 1.0
2000-03-05 1.0 0.0
2000-03-12 1.0 1.0
2000-03-19 0.0 1.0

I've tried various solutions, for some reason the fastest is manipulating single columns of frames created by groupby.
This is the code if it can be useful to anyone
def cumsum2(frame, low=-float('Inf'), high=float('Inf')):
    """Return a copy of frame holding a clamped cumulative sum of each column.

    Every column is accumulated independently, starting from 0; after each
    addition the running total is clamped to [low, high].

    Parameters:
        frame: DataFrame whose columns are accumulated.
        low: lower bound for the running total.
        high: upper bound for the running total.

    Returns:
        A new DataFrame with the same index and columns as frame.
    """
    # Work on a copy: the frame handed in by groupby(...).apply should not be
    # modified by the applied function.
    result = frame.copy()
    for col in frame.columns:
        sm = 0
        xs = []
        for e in frame[col]:
            sm += e
            if sm > high:
                sm = high
            if sm < low:
                sm = low
            xs.append(sm)
        result[col] = xs
    return result
res = df.fillna(0).groupby(df.index.to_period('m'), group_keys=False)\
.apply(cumsum2,0,3)

Related

Pandas perform identical transformation on multiple dataframes where inline is not an option

I have a situation where I want to shift multiple dataframes by one row. A for loop is the first thing that comes to mind, but that does not actually store the shifted dataframe. Other SO posts suggest using the inplace=True option, which works for certain transformations but not for shift, as shift has no inplace option. Obviously I could just do it manually by writing several lines of df = df.shift(1), but in the spirit of learning the most Pythonic method... Here's an MRE, showing both a standard for loop and a function-based method:
import pandas as pd
import numpy as np
def shifter(df):
    """Return a new object with the rows of df shifted down by one position.

    df itself is left untouched; the caller has to keep the returned value.
    """
    return df.shift(1)
data1 = {'a':np.random.randint(0, 5, 20),
'b':np.random.randint(0, 5, 20),
'c':np.random.randint(0, 5, 20)}
data2 = {'foo':np.random.randint(0, 5, 20),
'bar':np.random.randint(0, 5, 20),
'again':np.random.randint(0, 5, 20)}
df1 = pd.DataFrame(data=data1)
df2 = pd.DataFrame(data=data2)
print(df1)
for x in [df1, df2]:
x = x.shift(1)
print(df1)
for x in [df1, df2]:
x = shifter(x)
print(df1)

You need to assign back the content, not create a new variable:
def shifter(df):
    """Return df shifted down by one row, as a new object."""
    return df.shift(1)
for x in [df1, df2]:
x[:] = shifter(x)
Or, in place within the function:
def shifter(df):
    """Shift the rows of df down by one and write the result back into df.

    Returns None; the change is visible through the object the caller passed.
    """
    df[:] = df.shift(1)
for x in [df1, df2]:
shifter(x)

Another approach would be:
import pandas as pd
import numpy as np
def shifter(df):
    """Return a shifted-by-one-row version of df without modifying df."""
    return df.shift(1)
data1 = {'a':np.random.randint(0, 5, 20),
'b':np.random.randint(0, 5, 20),
'c':np.random.randint(0, 5, 20)}
data2 = {'foo':np.random.randint(0, 5, 20),
'bar':np.random.randint(0, 5, 20),
'again':np.random.randint(0, 5, 20)}
df1 = pd.DataFrame(data=data1)
df2 = pd.DataFrame(data=data2)
Y = []
for x in [df1, df2]:
x = shifter(x)
Y.append(x)
Ydf = pd.concat(Y, axis = 1)
print(Ydf)
which returns
a b c foo bar again
0 NaN NaN NaN NaN NaN NaN
1 2.0 3.0 3.0 3.0 2.0 1.0
2 3.0 1.0 1.0 4.0 3.0 0.0
3 0.0 1.0 4.0 2.0 2.0 4.0
4 0.0 3.0 0.0 3.0 2.0 1.0
5 4.0 4.0 4.0 2.0 2.0 2.0
6 0.0 1.0 3.0 4.0 2.0 3.0
7 1.0 0.0 3.0 4.0 3.0 3.0
8 4.0 2.0 3.0 0.0 0.0 2.0
9 1.0 1.0 4.0 4.0 3.0 2.0
10 4.0 3.0 4.0 0.0 1.0 4.0
11 0.0 3.0 1.0 1.0 2.0 4.0
12 2.0 0.0 4.0 4.0 3.0 0.0
13 3.0 3.0 4.0 4.0 1.0 2.0
14 4.0 1.0 4.0 0.0 1.0 0.0
15 3.0 4.0 0.0 3.0 0.0 3.0
16 1.0 4.0 2.0 0.0 3.0 1.0
17 3.0 2.0 0.0 0.0 2.0 0.0
18 1.0 1.0 1.0 3.0 3.0 4.0
19 1.0 3.0 2.0 1.0 1.0 0.0

Set only certain column values to zero when value exceeds threshold

I have a dataframe that looks like this:
iso3 prod_level alloc_key cell5m x y rec_type tech_type unit whea_a ... acof_pct_prod rcof_pct_prod coco_pct_prod teas_pct_prod toba_pct_prod bana_pct_prod trof_pct_prod temf_pct_prod vege_pct_prod rest_pct_prod
35110 IND IN16011 9243059 3990418 74.875000 13.041667 P A mt 0.0 ... 1.0 1.0 1.0 1.0 1.0 0.958586 0.449218 1.0 1.0 0.004520
35109 IND IN16011 9243058 3990417 74.791667 13.041667 P A mt 0.0 ... 1.0 1.0 1.0 1.0 1.0 0.970957 0.459725 1.0 1.0 0.009037
35406 IND IN16003 9283093 4007732 77.708333 12.708333 P A mt 0.0 ... 1.0 1.0 1.0 1.0 1.0 0.883868 1.000000 1.0 1.0 0.012084
35311 IND IN16011 9273062 4003381 75.125000 12.791667 P A mt 0.0 ... 1.0 1.0 1.0 1.0 1.0 0.942550 0.381430 1.0 1.0 0.015024
35308 IND IN16011 9273059 4003378 74.875000 12.791667 P A mt 0.0 ... 1.0 1.0 1.0 1.0 1.0 0.991871 0.887494 1.0 1.0 0.017878
I want to set all values that are greater than 0.9 in columns that end in 'prod' to zero. I can select only those columns like this:
cols2=[col for col in df.columns if col.endswith('_prod')]
df[cols2]
whea_pct_prod rice_pct_prod maiz_pct_prod barl_pct_prod pmil_pct_prod smil_pct_prod sorg_pct_prod pota_pct_prod swpo_pct_prod cass_pct_prod ... acof_pct_prod rcof_pct_prod coco_pct_prod teas_pct_prod toba_pct_prod bana_pct_prod trof_pct_prod temf_pct_prod vege_pct_prod rest_pct_prod
35110 1.0 0.958721 0.359063 1.0 1.0 1.000000 1.0 1.0 1.00000 0.992816 ... 1.0 1.0 1.0 1.0 1.0 0.958586 0.449218 1.0 1.0 0.004520
35109 1.0 0.878148 0.200283 1.0 1.0 1.000000 1.0 1.0 1.00000 0.993140 ... 1.0 1.0 1.0 1.0 1.0 0.970957 0.459725 1.0 1.0 0.009037
35406 1.0 0.996354 0.980844 1.0 1.0 0.274348 1.0 1.0 0.99945 1.000000 ... 1.0 1.0 1.0 1.0 1.0 0.883318 1.000000 1.0 1.0 0.012084
35311 1.0 0.570999 0.341217 1.0 1.0 1.000000 1.0 1.0 1.00000 0.997081 ... 1.0 1.0 1.0 1.0 1.0 0.942550 0.381430 1.0 1.0 0.015024
35308 1.0 0.657520 0.161771 1.0 1.0 1.000000 1.0 1.0 1.00000 0.991491 ... 1.0 1.0 1.0 1.0 1.0 0.991871 0.887494 1.0 1.0 0.017878
Now, when I try and set the values greater than 0.9 to be zero, it does not work.
df[cols2][df[cols2]>0.9]=0
What should I be doing instead?

You can use df.where(cond, other) to replace the values with other where cond == False.
# Keep values <= 0.9 in the selected columns; every other value becomes 0.
df[cols2] = df[cols2].where(df[cols2] <= 0.9, other=0)

How to drop float feature where % NANs is higher than a certain number?

I'm trying to drop a feature if it is a float and its fraction of missing values is higher than a certain threshold.
I've tried:
# Define threshold to 1/6
threshold = 0.1667
# Drop float > threshold
for f in data:
if data[f].dtype==float & data[f].isnull().sum() / data.shape[0] > threshold: del data[f]
..which raises an error:
TypeError: unsupported operand type(s) for &: 'type' and
'numpy.float64'
Help would be appreciated.

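Use DataFrame.select_dtypes to select only the float columns, then take the mean of isnull() to get the fraction of missing values per column (sum / count). Add the non-float columns back with Series.reindex, and finally filter with boolean indexing, using the inverted condition (<= instead of >) to keep the columns you want: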
np.random.seed(2019)
df = pd.DataFrame(np.random.choice([np.nan,1], p=(0.2,0.8),size=(10,10))).assign(A='a')
print (df)
0 1 2 3 4 5 6 7 8 9 A
0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 a
1 1.0 1.0 NaN 1.0 NaN 1.0 NaN 1.0 1.0 1.0 a
2 1.0 1.0 1.0 1.0 1.0 NaN 1.0 NaN 1.0 1.0 a
3 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 NaN 1.0 a
4 1.0 NaN 1.0 1.0 1.0 1.0 1.0 NaN 1.0 1.0 a
5 1.0 1.0 1.0 1.0 1.0 1.0 NaN 1.0 1.0 1.0 a
6 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 a
7 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 a
8 1.0 NaN 1.0 1.0 1.0 1.0 NaN 1.0 1.0 1.0 a
9 NaN 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 NaN a
threshold = 0.1667
df1 = df.select_dtypes(float).isnull().mean().reindex(df.columns, fill_value=False)
df = df.loc[:, df1 <= threshold]
print (df)
0 2 3 4 5 8 9 A
0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 a
1 1.0 NaN 1.0 NaN 1.0 1.0 1.0 a
2 1.0 1.0 1.0 1.0 NaN 1.0 1.0 a
3 1.0 1.0 1.0 1.0 1.0 NaN 1.0 a
4 1.0 1.0 1.0 1.0 1.0 1.0 1.0 a
5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 a
6 1.0 1.0 1.0 1.0 1.0 1.0 1.0 a
7 1.0 1.0 1.0 1.0 1.0 1.0 1.0 a
8 1.0 1.0 1.0 1.0 1.0 1.0 1.0 a
9 NaN 1.0 1.0 1.0 1.0 1.0 NaN a

Python - DataFrame: Multiply multiple columns by another column and save in new columns

I couldn't find an efficient way of doing that.
I have below DataFrame in Python with columns from A to Z
A B C ... Z
0 2.0 8.0 1.0 ... 5.0
1 3.0 9.0 0.0 ... 4.0
2 4.0 9.0 0.0 ... 3.0
3 5.0 8.0 1.0 ... 2.0
4 6.0 8.0 0.0 ... 1.0
5 7.0 9.0 1.0 ... 0.0
I need to multiply each of the columns from B to Z by A, (B x A, C x A, ..., Z x A), and save the results on new columns (R1, R2 ..., R25).
I would have something like this:
A B C ... Z R1 R2 ... R25
0 2.0 8.0 1.0 ... 5.0 16.0 2.0 ... 10.0
1 3.0 9.0 0.0 ... 4.0 27.0 0.0 ... 12.0
2 4.0 9.0 0.0 ... 3.0 36.0 0.0 ... 12.0
3 5.0 8.0 1.0 ... 2.0 40.0 5.0 ... 10.0
4 6.0 8.0 0.0 ... 1.0 48.0 0.0 ... 6.0
5 7.0 9.0 1.0 ... 0.0 63.0 7.0 ... 0.0
I was able to calculate the results using below code, but from here I would need to merge with original df. Doesn't sound efficient. There must be a simple/clean way of doing that.
df.loc[:,'B':'D'].multiply(df['A'], axis="index")
That's an example, my real DataFrame has 160 columns x 16k rows.

Create new columns names by list comprehension and then join to original:
df1 = df.loc[:,'B':'D'].multiply(df['A'], axis="index")
df1.columns = ['R{}'.format(x) for x in range(1, len(df1.columns) + 1)]
df = df.join(df1)
print (df)
A B C Z R1 R2
0 2.0 8.0 1.0 5.0 16.0 2.0
1 3.0 9.0 0.0 4.0 27.0 0.0
2 4.0 9.0 0.0 3.0 36.0 0.0
3 5.0 8.0 1.0 2.0 40.0 5.0
4 6.0 8.0 0.0 1.0 48.0 0.0
5 7.0 9.0 1.0 0.0 63.0 7.0

Iterating and modifying dataframe using Pandas groupby

I am working with a large array of 1's and need to systematically set sections of the array to 0. The large array is made up of many smaller arrays, and for each smaller array I need to replace its upper and lower triangles with 0's. For example, we have an array with 5 sub-arrays indicated by the index value (all sub-arrays have the same number of columns):
0 1 2
0 1.0 1.0 1.0
1 1.0 1.0 1.0
1 1.0 1.0 1.0
2 1.0 1.0 1.0
2 1.0 1.0 1.0
2 1.0 1.0 1.0
3 1.0 1.0 1.0
3 1.0 1.0 1.0
3 1.0 1.0 1.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
4 1.0 1.0 1.0
4 1.0 1.0 1.0
4 1.0 1.0 1.0
4 1.0 1.0 1.0
I want each group of rows to be modified in its upper and lower triangle such that the resulting matrix is:
0 1 2
0 1.0 1.0 1.0
1 1.0 1.0 0.0
1 0.0 1.0 1.0
2 1.0 0.0 0.0
2 0.0 1.0 0.0
2 0.0 0.0 1.0
3 1.0 0.0 0.0
3 1.0 1.0 0.0
3 0.0 1.0 1.0
3 0.0 0.0 1.0
4 1.0 0.0 0.0
4 1.0 1.0 0.0
4 1.0 1.0 1.0
4 0.0 1.0 1.0
4 0.0 0.0 1.0
At the moment I am using only numpy to achieve this resulting array, but I think I can speed it up using Pandas grouping. In reality my dataset is very large almost 500,000 rows long. The numpy code is below:
import numpy as np
candidateLengths = np.array([1,2,3,4,5])
centroidLength = 3
smallPaths = [min(l, centroidLength) for l in candidateLengths]
# Diagonal offset handed to np.tri for each candidate: how far the candidate
# falls short of centroidLength (0 once it is at least that long).
k_vals = [centroidLength - smallPath for smallPath in smallPaths]
maskArray = np.ones((np.sum(candidateLengths), centroidLength))
startPos = 0
endPos = 0
for canNo, canLen in enumerate(candidateLengths):
    a = np.ones((canLen, centroidLength))
    # Use the builtin bool: the np.bool alias was deprecated in NumPy 1.20 and
    # removed in 1.24, so dtype=np.bool raises AttributeError there.
    a *= np.tri(*a.shape, dtype=bool, k=k_vals[canNo])
    # b is a flipped both vertically and horizontally; multiplying the two
    # keeps only the entries that are 1 in both.
    b = np.fliplr(np.flipud(a))
    c = a * b
    # Write this candidate's mask into its own row range of the big array.
    endPos = startPos + canLen
    maskArray[startPos:endPos, :] = c
    startPos = endPos
print(maskArray)
When I run this on my real dataset it takes nearly 5-7secs to execute. I think this is down to this massive for loop. How can I use pandas groupings to achieve a higher speed? Thanks

New Answer
def tris(n, m):
    """Return an (n, m) integer 0/1 mask with both triangular corners zeroed.

    The mask is the element-wise product of a triangular matrix of ones and
    the same matrix reversed along both axes.

    Parameters:
        n: number of rows of the mask.
        m: number of columns of the mask.
    """
    if n < m:
        # Fewer rows than columns: build the triangle transposed, then flip it
        # back so the result still has n rows and m columns.
        a = np.tri(m, n, dtype=int).T
    else:
        a = np.tri(n, m, dtype=int)
    return a * a[::-1, ::-1]
idx = np.append(df.index.values, -1)
w = np.append(-1, np.flatnonzero(idx[:-1] != idx[1:]))
c = np.diff(w)
df * np.vstack([tris(n, 3) for n in c])
0 1 2
0 1.0 1.0 1.0
1 1.0 1.0 0.0
1 0.0 1.0 1.0
2 1.0 0.0 0.0
2 0.0 1.0 0.0
2 0.0 0.0 1.0
3 1.0 0.0 0.0
3 1.0 1.0 0.0
3 0.0 1.0 1.0
3 0.0 0.0 1.0
4 1.0 0.0 0.0
4 1.0 1.0 0.0
4 1.0 1.0 1.0
4 0.0 1.0 1.0
4 0.0 0.0 1.0
Old Answer
I define some helper triangle functions
def tris(n, m):
    """Build the (n, m) integer 0/1 mask whose triangular corners are zero."""
    if n < m:
        # Transposed construction keeps the output at n rows by m columns.
        a = np.tri(m, n, dtype=int).T
    else:
        a = np.tri(n, m, dtype=int)
    # Multiply the triangle by itself reversed along both axes.
    return a * a[::-1, ::-1]


def tris_df(df):
    """Return the tris mask for df's shape, labelled with df's index and columns."""
    n, m = df.shape
    return pd.DataFrame(tris(n, m), df.index, df.columns)
Then
df * df.groupby(level=0, group_keys=False).apply(tris_df)
0 1 2
0 1.0 1.0 1.0
1 1.0 1.0 0.0
1 0.0 1.0 1.0
2 1.0 0.0 0.0
2 0.0 1.0 0.0
2 0.0 0.0 1.0
3 1.0 0.0 0.0
3 1.0 1.0 0.0
3 0.0 1.0 1.0
3 0.0 0.0 1.0
4 1.0 0.0 0.0
4 1.0 1.0 0.0
4 1.0 1.0 1.0
4 0.0 1.0 1.0
4 0.0 0.0 1.0

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python Pandas groupby limited cumulative sum - python

Related

Pandas perform identical transformation on multiple dataframes where inline is not an option

Set only certain column values to zero when value exceeds threshold

How to drop float feature where % NANs is higher than a certain number?

Python - DataFrame: Multiply multiple columns by another column and save in new columns

Iterating and modifying dataframe using Pandas groupby

Categories

Resources