Iterate over pandas MultiIndex levels and groupby to get totals - python

I have a pandas dataframe where the first four columns form a MultiIndex:
import pandas as pd
data = [[1, 'A', 1, 0, 10],
[1, 'A', 0, 1, 10],
[1, 'A', 1, 1, 10],
[1, 'A', 0, 0, 10],
[1, 'B', 1, 0, 10],
[1, 'B', 0, 1, 10],
[1, 'B', 1, 1, 10],
[1, 'B', 0, 0, 10]]
cols = ['user_id','type','flag1','flag2','cnt']
df = pd.DataFrame(data,columns = cols)
df = df.set_index(['user_id','type','flag1','flag2'])
print(df)
user_id type flag1 flag2 cnt
________________________________________
1 A 1 0 10
1 A 0 1 10
1 A 1 1 10
1 A 0 0 10
1 B 1 0 10
1 B 0 1 10
1 B 1 1 10
1 B 0 0 10
I'd like to iterate over the index values to get the grouped total count for each unique combination of index values, like so:
user_id type flag1 flag2 cnt
________________________________________
1 ALL ALL ALL 80
1 ALL ALL 0 40
1 ALL ALL 1 40
1 ALL 1 ALL 40
1 ALL 0 ALL 40
1 A ALL ALL 40
1 B ALL ALL 40
1 A ALL 0 20
1 A ALL 1 20
1 B ALL 0 20
1 B ALL 1 20
1 A 1 ALL 20
1 A 0 ALL 20
1 B 1 ALL 20
1 B 0 ALL 20
1 A 1 0 10
1 A 0 1 10
1 A 1 1 10
1 A 0 0 10
1 B 1 0 10
1 B 0 1 10
1 B 1 1 10
1 B 0 0 10
I'm able to generate each group easily using query and groupby, but ideally I'd like to be able to iterate over any number of index columns to get the sum of the cnt column.
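For a single combination of levels, the kind of groupby the question refers to might look roughly like this (a sketch against the frame defined above):
sub = df.groupby(level=['user_id', 'type'])['cnt'].sum().reset_index()
print(sub)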

Similar to previous answers, here's a slightly more streamlined approach using itertools and groupby:
from itertools import chain, combinations
indices = ['user_id','type','flag1','flag2']
powerset = list(chain.from_iterable(combinations(indices, r) for r in range(1,len(indices)+1)))
master = (pd.concat([df.reset_index().groupby(p, as_index=False).sum()
                     for p in powerset if p[0] == "user_id"])[cols]
          .replace([None, 4, 2], "ALL")
          .sort_values("cnt", ascending=False))
Output:
user_id type flag1 flag2 cnt
0 1 ALL ALL ALL 80
0 1 A ALL ALL 40
1 1 B ALL ALL 40
0 1 ALL 0 ALL 40
1 1 ALL 1 ALL 40
0 1 ALL ALL 0 40
1 1 ALL ALL 1 40
3 1 ALL 1 1 20
2 1 ALL 1 0 20
1 1 ALL 0 1 20
0 1 ALL 0 0 20
3 1 B 1 1 20
2 1 B 1 0 20
1 1 A 1 1 20
0 1 A 1 0 20
3 1 B 1 1 20
2 1 B 0 1 20
1 1 A 1 1 20
0 1 A 0 1 20
0 1 A 0 0 10
1 1 A 0 1 10
2 1 A 1 0 10
3 1 A 1 1 10
4 1 B 0 0 10
5 1 B 0 1 10
6 1 B 1 0 10
7 1 B 1 1 10
The powerset computation is taken directly from the itertools docs.
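For reference, that recipe is essentially the following (the version above starts the range at 1 to skip the empty subset):
from itertools import chain, combinations

def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))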

#build all groupby key combinations
import itertools
keys = ['user_id', 'type', 'flag1', 'flag2']
key_combos = [c for i in range(len(keys)) for c in itertools.combinations(keys, i+1)]
#make sure to only select the combos that contain 'user_id'
key_combos = [list(e) for e in key_combos if 'user_id' in e]
#groupby using all key combinations and concatenate the results into a DataFrame
df2 = pd.concat([df.groupby(by=key).cnt.sum().to_frame().reset_index() for key in sorted(key_combos)])
#Fill na with ALL and re-order columns
df2.fillna('ALL')[['user_id','type','flag1','flag2','cnt']]
Out[521]:
user_id type flag1 flag2 cnt
0 1 ALL ALL ALL 80
0 1 ALL 0 ALL 40
1 1 ALL 1 ALL 40
0 1 ALL 0 0 20
1 1 ALL 0 1 20
2 1 ALL 1 0 20
3 1 ALL 1 1 20
0 1 ALL ALL 0 40
1 1 ALL ALL 1 40
0 1 A ALL ALL 40
1 1 B ALL ALL 40
0 1 A 0 ALL 20
1 1 A 1 ALL 20
2 1 B 0 ALL 20
3 1 B 1 ALL 20
0 1 A 0 0 10
1 1 A 0 1 10
2 1 A 1 0 10
3 1 A 1 1 10
4 1 B 0 0 10
5 1 B 0 1 10
6 1 B 1 0 10
7 1 B 1 1 10
0 1 A ALL 0 20
1 1 A ALL 1 20
2 1 B ALL 0 20
3 1 B ALL 1 20
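If the question's ordering (largest totals first) is wanted, a final sort can be tacked on, for example:
df2 = df2.fillna('ALL')[['user_id', 'type', 'flag1', 'flag2', 'cnt']]
df2 = df2.sort_values('cnt', ascending=False).reset_index(drop=True)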

I mainly used combinations and product from itertools.
combinations is for all combinations of values within each column.
product is for all combinations of values across all columns.
import pandas as pd
from itertools import combinations, product
import numpy as np
def iterativeSum(df, cols, target_col):
    # All possible combinations within each column
    comb_each_col = []
    for col in cols:
        # Take 1 to n elements from the unique set of values in each column
        each_col = [list(combinations(set(df[col]), i))
                    for i in range(1, len(set(df[col])) + 1)]
        # Flatten the list
        each_col = [list(x) for sublist in each_col for x in sublist]
        # Record the combinations
        comb_each_col.append(each_col)
    # All possible combinations across all columns
    comb_all_col = list(product(*comb_each_col))
    result = pd.DataFrame()
    # Iterate over all combinations
    for value in comb_all_col:
        # Get the condition which matches the value in each column
        condition = np.array(
            [df[col].isin(v).values for col, v in zip(cols, value)]).all(axis=0)
        # Get the sum of the rows which meet the condition
        condition_sum = df.loc[condition][target_col].sum()
        # Format values for output
        value2 = []
        for x in value:
            try:
                # Strings can be joined together directly
                value2.append(','.join(x))
            except TypeError:
                # Numbers can be joined after being converted to strings
                x = [str(y) for y in x]
                value2.append(','.join(x))
        # Put the result into the table
        result = pd.concat([result, pd.DataFrame([value2 + [condition_sum]])])
    result.columns = cols + [target_col]
    return result
data = [[1, 'A', 1, 0, 10],
[1, 'A', 0, 1, 10],
[1, 'A', 1, 1, 10],
[1, 'A', 0, 0, 10],
[1, 'B', 1, 0, 10],
[1, 'B', 0, 1, 10],
[1, 'B', 1, 1, 10],
[1, 'B', 0, 0, 10]]
cols = ['user_id', 'type', 'flag1', 'flag2', 'cnt']
df = pd.DataFrame(data, columns=cols)
# Columns for grouping
grouped_cols = ['type', 'flag1', 'flag2']
# Columns for summing
target_col = 'cnt'
print(iterativeSum(df, grouped_cols, target_col))
Result:
type flag1 flag2 cnt
0 A 0 0 10
0 A 0 1 10
0 A 0 0,1 20
0 A 1 0 10
0 A 1 1 10
0 A 1 0,1 20
0 A 0,1 0 20
0 A 0,1 1 20
0 A 0,1 0,1 40
0 B 0 0 10
0 B 0 1 10
0 B 0 0,1 20
0 B 1 0 10
0 B 1 1 10
0 B 1 0,1 20
0 B 0,1 0 20
0 B 0,1 1 20
0 B 0,1 0,1 40
0 A,B 0 0 20
0 A,B 0 1 20
0 A,B 0 0,1 40
0 A,B 1 0 20
0 A,B 1 1 20
0 A,B 1 0,1 40
0 A,B 0,1 0 40
0 A,B 0,1 1 40
0 A,B 0,1 0,1 80
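To get labels closer to the question's desired output, the joined full sets in this result ('A,B' and '0,1') could be mapped to ALL afterwards, for example:
result = iterativeSum(df, grouped_cols, target_col)
result = result.replace({'A,B': 'ALL', '0,1': 'ALL'})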

Related

Is there a way to iterate through list of string in a dataframe?

I wrote the following code. I want to replace the number "1" with "0" whenever it appears twice or more for a particular universal_id, and the "1" that is left should be in the row where days is lowest. The code below does the work, but I want to iterate over more than one universal_id. Column "e" is handled for 'erfa'; I want to do this for other IDs and other columns.
pdf1 = pd.DataFrame(
    [[1, 0, 1, 0, 1, 60, 'fdaf'],
     [1, 1, 0, 0, 1, 350, 'fdaf'],
     [1, 1, 0, 0, 1, 420, 'erfa'],
     [0, 1, 0, 0, 1, 410, 'erfa']],
    columns=['A', 'B', 'c', 'd', 'e', 'days', 'universal_id'])
pdf1['A'] = np.where(pdf1['days']==pdf1['days'].min(),1,0)
zet = pdf1.loc[pdf1['e'].isin([1]) &
pdf1['universal_id'].str.contains('erfa')]
zet['e'] = np.where(zet['days']==zet['days'].min(),1,0)
pdf1.loc[zet.index, :] = zet[:]
pdf1
Output:
A B c d e days universal_id
0 1 0 1 0 1 60 fdaf
1 0 1 0 0 1 350 fdaf
2 0 1 0 0 0 420 erfa
3 0 1 0 0 1 410 erfa
You can use:
df2 = pdf1.sort_values(by='days')
m1 = df2['A'].eq(1)
m2 = df2[['A', 'universal_id']].duplicated()
pdf1.loc[m1&m2, 'A'] = 0
output:
A B c d e days universal_id
0 1 0 1 0 1 60 fdaf
1 0 1 0 0 1 350 fdaf
2 1 1 0 0 1 420 erfa
3 0 1 0 0 1 410 erfa
For e (and f) you can follow the same logic:
m1 = df2['A'].eq(1)
m3 = df2[['e', 'universal_id']].duplicated()
pdf1.loc[m1&m3, 'e'] = 0
output:
A B c d e days universal_id
0 1 0 1 0 1 60 fdaf
1 0 1 0 0 0 350 fdaf
2 1 1 0 0 0 420 erfa
3 0 1 0 0 1 410 erfa
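If several columns need the same treatment, one way to wrap the pattern in a loop is the sketch below (column names assumed; each column's own 1s are checked, which differs slightly from the snippets above but reproduces the same outputs here):
df2 = pdf1.sort_values(by='days')
for col in ['A', 'e']:
    m1 = df2[col].eq(1)
    m2 = df2[[col, 'universal_id']].duplicated()
    pdf1.loc[m1 & m2, col] = 0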

Pandas resetting cumsum() based on a condition of another column

I have a column called 'on' with a series of 0 and 1:
d1 = {'on': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0]}
df = pd.DataFrame(d1)
I want to create a new column called 'value' that does a cumulative count cumsum() only while the 'on' column is 1, and restarts from zero once the 'on' column shows zero.
I tried using a combination of cumsum() and np.where, but I don't get what I want, as shown below:
df['value_try'] = df['on'].cumsum()
df['value_try'] = np.where(df['on'] == 0, 0, df['value_try'])
Attempt:
on value_try
0 0 0
1 0 0
2 0 0
3 1 1
4 1 2
5 1 3
6 0 0
7 0 0
8 1 4
9 1 5
10 0 0
What my desired output would be:
on value
0 0 0
1 0 0
2 0 0
3 1 1
4 1 2
5 1 3
6 0 0
7 0 0
8 1 1
9 1 2
10 0 0
You can form groups of consecutive 0s or 1s by checking whether the value of on equals that of the previous row with .shift(), then numbering the groups with Series.cumsum(). Then use GroupBy.cumsum() within each group to get the value.
g = df['on'].ne(df['on'].shift()).cumsum()
df['value'] = df.groupby(g)['on'].cumsum()
Result:
print(df)
on value
0 0 0
1 0 0
2 0 0
3 1 1
4 1 2
5 1 3
6 0 0
7 0 0
8 1 1
9 1 2
10 0 0
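To see what the grouper does here, g is just a run id that increases every time on changes value (computed from the example above):
print(g.tolist())
# [1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5]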
Let us try cumcount + cumsum
df['out'] = df.groupby(df['on'].eq(0).cumsum()).cumcount()
Out[18]:
0 0
1 0
2 0
3 1
4 2
5 3
6 0
7 0
8 1
9 2
10 0
dtype: int64

How can I make my loop much faster/better for this particular problem?

Python beginner here.
Here is my problem:
I have a csv file with roughly 3200 rows and 660 columns.
The rows are filled with either 0s, 1s, or 50s.
I need to update the newly created column 'answer' by these requirements:
It should be the sum of the 1s in that row that occur before the first '50'.
If there is no '50' in that row, just update the last column to a zero.
So, for example, the row [1, 0, 0, 0, 1, 1, 50, 0, 0, 0, 1] should get the new value '3' at the end, because we found three 1s before finding a 50.
Here's my code:
df_numRows = len(df.values)
df_numCols = len(df.columns)
for row in range(df_numRows):
df_sum = 0
for col in range(df_numCols):
if '50' not in df.values[row]:
df.at[row, 'answer'] = '0'
elif df.values[row][col] == '0':
continue
elif df.values[row][col] == '1':
df_sum += 1
df.at[row, 'answer'] = df_sum
elif df.values[row][col] == '50':
break
I wrote this nested for loop to iterate through my Pandas dataframe but it seems to take a VERY long time to run.
I ran this piece of code on the same dataset but with only 100 rows x 660 columns and it took about 1.5 minutes; however, when I tried to run it on the entire thing, it ran for about 2.5 hours and I just shut it down because I thought it had taken too long.
How can I make my code more efficient/faster/better? I would love any help at all from you guys, and I apologize in advance if this is an easy question but I am just getting started in Python!
Thanks guys!
Just take the cumprod of ne(50): once we find the 50, all values after it become 0. Then use this Boolean mask to filter the original df and do the sum.
df=pd.DataFrame({'A':[1, 0, 0, 0, 1, 1, 50, 0, 0, 0, 1] })
df.mul(df.ne(50).cumprod()).sum()
Out[35]:
A 3
dtype: int64
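Applied row-wise to the full frame, and zeroing rows that contain no 50 as the question asks, the same idea might look like this (a sketch, assuming the values are numeric rather than strings):
mask = df.ne(50).cumprod(axis=1)      # 1 until the first 50 in each row, 0 afterwards
has50 = df.eq(50).any(axis=1)         # rows that actually contain a 50
df['answer'] = df.mul(mask).sum(axis=1).where(has50, 0)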
Setup
df = pd.DataFrame([
[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1], # No 50s
[1, 0, 0, 0, 1, 1, 50, 0, 0, 0, 1], # One 50
[1, 50, 0, 0, 1, 50, 50, 0, 0, 0, 1], # Three 50s but 2 are consecutive
[1, 50, 0, 0, 1, 1, 50, 0, 0, 0, 1], # Two 50s
])
df
0 1 2 3 4 5 6 7 8 9 10
0 1 0 0 0 1 1 0 0 0 0 1
1 1 0 0 0 1 1 50 0 0 0 1
2 1 50 0 0 1 50 50 0 0 0 1
3 1 50 0 0 1 1 50 0 0 0 1
Use logical_and with its accumulate method
np.logical_and will take the and operator and apply it to a group of booleans. The accumulate part says to keep applying it, keeping track of the running and of all prior booleans as we go. By specifying axis=1 I'm saying to do this for each row. This returns an array of booleans where each row is True until we hit the value 50. I then check whether a row contains a 50 at all with all(1). The proper multiplication gives the sum of all values (not 50) prior to the first 50... for each row.
d = np.logical_and.accumulate(df.ne(50), axis=1)
df.mul(d).mul(~d.all(1), 0).sum(1)
0 0
1 3
2 1
3 1
dtype: int64
Combine to get new column
d = np.logical_and.accumulate(df.ne(50), axis=1)
df.assign(answer=df.mul(d).mul(~d.all(1), 0).sum(1))
0 1 2 3 4 5 6 7 8 9 10 answer
0 1 0 0 0 1 1 0 0 0 0 1 0
1 1 0 0 0 1 1 50 0 0 0 1 3
2 1 50 0 0 1 50 50 0 0 0 1 1
3 1 50 0 0 1 1 50 0 0 0 1 1
If you want to go full blown Numpy
v = df.values
a = np.logical_and.accumulate(v != 50, axis=1)
df.assign(answer=(v * (a & ~a.all(1, keepdims=True))).sum(1))
0 1 2 3 4 5 6 7 8 9 10 answer
0 1 0 0 0 1 1 0 0 0 0 1 0
1 1 0 0 0 1 1 50 0 0 0 1 3
2 1 50 0 0 1 50 50 0 0 0 1 1
3 1 50 0 0 1 1 50 0 0 0 1 1
This solves it, though it's a bit less robust:
import pandas as pd
import numpy as np
np.random.seed(1)
df = pd.DataFrame(np.random.choice([0, 1, 50], (3200,660)))
data = df.values
idxs = [np.where(d == 50) for d in data]
sums = [sum(d[:i[0][0]]) if i[0].size else 0 for d, i in zip(data, idxs)]
data = np.column_stack((data, sums))
df = df.assign(answer=sums)
df.head()
# 0 1 2 3 4 5 6 7 8 9 ... 651 652 653 654 655 \
#0 1 0 0 1 1 0 0 1 0 1 ... 50 50 1 1 0
#1 1 0 50 1 50 50 0 1 1 50 ... 1 0 1 0 0
#2 50 0 1 0 1 50 1 50 0 50 ... 0 50 1 50 50
#3 0 1 0 50 1 0 0 50 1 0 ... 1 1 0 1 1
#4 1 50 1 1 1 1 0 50 50 1 ... 0 1 0 1 0
#
# 656 657 658 659 answer
#0 0 0 1 0 5
#1 1 50 0 50 1
#2 50 1 1 50 0
#3 0 50 1 50 1
#4 0 50 0 50 1
Please try this logic and let me know if this helps.
df_numRows = len(df.values)
df_numCols = len(df.columns)
for row in range(df_numRows):
    df_sum = 0
    try:
        indexOf50 = np.argwhere(df.loc[row]==50)[0][0]
        colArrayTill50 = df.loc[row][:indexOf50].values
        numberOfOne = colArrayTill50.sum()
    except:
        numberOfOne = 0
    print(numberOfOne)
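A small variant of the same idea that writes the counts back into the 'answer' column instead of printing them (a sketch; assumes numeric values):
vals = df.values
answers = []
for row in vals:
    hits = np.flatnonzero(row == 50)
    answers.append(int(row[:hits[0]].sum()) if hits.size else 0)
df['answer'] = answers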

pandas dataframe: how to count the number of 1 rows in a binary column?

I have the following pandas DataFrame:
import pandas as pd
import numpy as np
df = pd.DataFrame({"first_column": [0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]})
>>> df
first_column
0 0
1 0
2 0
3 1
4 1
5 1
6 0
7 0
8 1
9 1
10 0
11 0
12 0
13 0
14 1
15 1
16 1
17 1
18 1
19 0
20 0
first_column is a binary column of 0s and 1s. There are "clusters" of consecutive ones, which always occur in runs of at least two.
My goal is to create a column which "counts" the number of rows of ones per group:
>>> df
first_column counts
0 0 0
1 0 0
2 0 0
3 1 3
4 1 3
5 1 3
6 0 0
7 0 0
8 1 2
9 1 2
10 0 0
11 0 0
12 0 0
13 0 0
14 1 5
15 1 5
16 1 5
17 1 5
18 1 5
19 0 0
20 0 0
This sounds like a job for df.loc(), e.g. df.loc[df.first_column == 1]...something
I'm just not sure how to take into account each individual "cluster" of ones, and how to label each of the unique clusters with the "row count".
How would one do this?
Here's one approach with NumPy's cumsum and bincount -
def cumsum_bincount(a):
    # Append 0 & look for a [0,1] pattern. Form a binned array based off the groups of 1s
    ids = a*(np.diff(np.r_[0,a])==1).cumsum()
    # Get the bincount, index into the count with ids and finally mask out the 0s
    return a*np.bincount(ids)[ids]
Sample run -
In [88]: df['counts'] = cumsum_bincount(df.first_column.values)
In [89]: df
Out[89]:
first_column counts
0 0 0
1 0 0
2 0 0
3 1 3
4 1 3
5 1 3
6 0 0
7 0 0
8 1 2
9 1 2
10 0 0
11 0 0
12 0 0
13 0 0
14 1 5
15 1 5
16 1 5
17 1 5
18 1 5
19 0 0
20 0 0
Set the first five elements to 1 (which gives a leading run of six 1s) and then test out -
In [101]: df.first_column.values[:5] = 1
In [102]: df['counts'] = cumsum_bincount(df.first_column.values)
In [103]: df
Out[103]:
first_column counts
0 1 6
1 1 6
2 1 6
3 1 6
4 1 6
5 1 6
6 0 0
7 0 0
8 1 2
9 1 2
10 0 0
11 0 0
12 0 0
13 0 0
14 1 5
15 1 5
16 1 5
17 1 5
18 1 5
19 0 0
20 0 0
Since first_column is binary, I can use astype(bool) to get True/False
If I take the opposite of those and cumsum I get a handy way of lumping together the Trues or 1s
I then groupby and count with transform
transform broadcasts the count aggregation across the original index
I first use where to group all 0s together.
I use where again to set their counts to 0
I use assign to generate a copy of df with a new column. This is because I don't want to clobber the df we already have. If you want to write directly to df use df['counts'] = c
t = df.first_column.astype(bool)
c = df.groupby((~t).cumsum().where(t, -1)).transform('count').where(t, 0)
df.assign(counts=c)
first_column counts
0 0 0
1 0 0
2 0 0
3 1 3
4 1 3
5 1 3
6 0 0
7 0 0
8 1 2
9 1 2
10 0 0
11 0 0
12 0 0
13 0 0
14 1 5
15 1 5
16 1 5
17 1 5
18 1 5
19 0 0
20 0 0
Here is another approach with pandas groupby, which I think is quite readable. A (possible) advantage is that it does not rely on the assumption that only 1 and 0 are present in the column.
The main insight is to create groups of consecutive values and then simply compute their length. We also carry the information of the value in the group, so we can filter for zeros.
# Relevant column -> grouper needs to be 1-Dimensional
col_vals = df['first_column']
# Group by sequence of consecutive values and value in the sequence.
grouped = df.groupby(((col_vals!=col_vals.shift(1)).cumsum(), col_vals))
# Get the length of consecutive values if they are different from zero, else zero
df['counts'] = grouped['first_column'].transform(lambda group: len(group))\
                                      .where(col_vals != 0, 0)
This is what the groups and keys look like:
for key, group in grouped:
    print(key, group)
(1, 0) first_column
0 0
1 0
2 0
(2, 1) first_column
3 1
4 1
5 1
(3, 0) first_column
6 0
7 0
(4, 1) first_column
8 1
9 1
(5, 0) first_column
10 0
11 0
12 0
13 0
(6, 1) first_column
14 1
15 1
16 1
17 1
18 1
(7, 0) first_column
19 0
20 0
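The same grouping idea can also be written a bit more compactly with transform('size'); a sketch equivalent in spirit to the answer above:
runs = df['first_column'].ne(df['first_column'].shift()).cumsum()
df['counts'] = (df.groupby(runs)['first_column']
                  .transform('size')
                  .where(df['first_column'].eq(1), 0))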

Create new columns in pandas from python nested lists

I have a pandas data frame. One of the columns has a nested list. I would like to create new columns from the nested list
Example:
L = [[1,2,4],
[5,6,7,8],
[9,3,5]]
I want all the elements in the nested lists as columns. The value should be one if the list has the element and zero if it does not.
1 2 4 5 6 7 8 9 3
1 1 1 0 0 0 0 0 0
0 0 0 1 1 1 1 0 0
0 0 0 1 0 0 0 1 1
You can try the following:
df = pd.DataFrame({"A": L})
df
# A
#0 [1, 2, 4]
#1 [5, 6, 7, 8]
#2 [9, 3, 5]
# for each cell, use `pd.Series(1, x)` to create a Series object with the elements in the
# list as the index which will become the column headers in the result
df.A.apply(lambda x: pd.Series(1, x)).fillna(0).astype(int)
# 1 2 3 4 5 6 7 8 9
#0 1 1 0 1 0 0 0 0 0
#1 0 0 0 0 1 1 1 1 0
#2 0 0 1 0 1 0 0 0 1
pandas
Very similar to #Psidom's answer. However, I use pd.value_counts, and it will handle repeats.
Use #Psidom's df
df = pd.DataFrame({'A': L})
df.A.apply(pd.value_counts).fillna(0).astype(int)
numpy
More involved, but speedy
lst = df.A.values.tolist()
n = len(lst)
lengths = [len(sub) for sub in lst]
flat = np.concatenate(lst)
u, inv = np.unique(flat, return_inverse=True)
rng = np.arange(n)
slc = np.hstack([
    rng.repeat(lengths)[:, None],
    inv[:, None]
])
data = np.zeros((n, u.shape[0]), dtype=np.uint8)
data[slc[:, 0], slc[:, 1]] = 1
pd.DataFrame(data, df.index, u)
Results
1 2 3 4 5 6 7 8 9
0 1 1 0 1 0 0 0 0 0
1 0 0 0 0 1 1 1 1 0
2 0 0 1 0 1 0 0 0 1
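On newer pandas (0.25+), another possibility is to explode the lists and pivot with get_dummies; a hedged sketch:
out = pd.get_dummies(df['A'].explode()).groupby(level=0).max().astype(int)
print(out)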
