I'm trying to do a somewhat complicated pandas groupby operation. Here's some functional but slow pandas code.
# Construct a toy dataframe
import numpy as np
import pandas as pd

idx1 = ["bar", "baz", "foo"]
idx2 = list(range(100, 104))
idx3 = list(range(3))
num_data = len(idx1) * len(idx2) * len(idx3)
index = pd.MultiIndex.from_product((idx1, idx2, idx3), names=["first", "third", "fourth"])
np.random.seed(0)
x = np.random.randint(low=0, high=2, size=num_data, dtype=bool)
input_df = pd.DataFrame(index=index, data={"x": x}).reset_index()
input_df["second"] = "positive"
input_df.loc[input_df["third"] != 100, "second"] = "negative"
input_df.loc[input_df["third"] == 101, "third"] = 100
# Complication: Not all groups, when grouped by "fourth", will have the same indices. Most indices
# will be shared by most "fourth" groups, but the intersection is not complete.
mask = np.ones(num_data, dtype=bool)
mask[[17, 18]] = False
input_df = input_df[mask]
input_df = input_df.set_index(["first", "second", "third", "fourth"])
input_df looks like this:
                                 x
first second   third fourth
bar   positive 100   0       False
                     1       False
                     2        True
      negative 100   0        True
                     1       False
                     2        True
               102   0       False
                     1        True
                     2       False
               103   0        True
                     1       False
                     2        True
baz   positive 100   0       False
                     1       False
                     2       False
      negative 100   0       False  # Notice some missing rows here
                     1        True
               102   1        True
                     2        True
               103   0        True
                     1        True
                     2       False
foo   positive 100   0       False
                     1       False
                     2        True
      negative 100   0        True
                     1       False
                     2       False
               102   0       False
                     1        True
                     2        True
               103   0        True
                     1        True
                     2        True
Dataframe guarantees/properties:
There will always be exactly one positive "third" group in each "first" group.
There are N (variable) negative "third" groups in each "first" group.
What I want to do efficiently:
For each "first" group:
Compare all negative "third" groups to the single positive "third" group (see code for what "compare" means).
dfs = []
# For each "first" group:
for first, first_df in input_df.groupby("first"):
    # Separate the positive group and the negative groups
    positive_mask = first_df.index.get_level_values("second") == "positive"
    first_df = first_df.droplevel(["first"])
    positive_df = first_df[positive_mask]
    negative_dfs = first_df[~positive_mask]
    positive_df = positive_df.droplevel(["second", "third"])
    # Do some computations w.r.t. each negative "third" group and its corresponding positive group.
    for third, negative_df in negative_dfs.groupby("third"):
        negative_df = negative_df.droplevel(["second", "third"])
        # Compare the positive/negative group based on their "fourth" indices alone.
        # Note that for indices not in their intersection "False" is assigned.
        true_true = negative_df["x"] & positive_df["x"]
        true_false = negative_df["x"] & ~positive_df["x"]
        false_false = ~negative_df["x"] & ~positive_df["x"]
        false_true = ~negative_df["x"] & positive_df["x"]
        df = pd.DataFrame({
            "true_true": true_true,
            "true_false": true_false,
            "false_false": false_false,
            "false_true": false_true,
        }).reset_index()
        df["first"] = first
        df["second"] = "negative"
        df["third"] = third
        dfs.append(df)
# Output: A big dataframe of the computed values of all the negative "third" groups.
output_df = pd.concat(dfs)
output_df = output_df.set_index(["first", "second", "third", "fourth"], verify_integrity=True).sort_index()
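As an aside, the False-filling the comments above rely on is standard pandas behavior: logical operators align the two Series on the union of their indices and treat missing positions as False. A tiny standalone demonstration:

import pandas as pd

a = pd.Series([True, True], index=[0, 1])
b = pd.Series([True, True], index=[1, 2])

# The indices only overlap at 1; the non-overlapping positions come out False.
print(a & b)
# 0    False
# 1     True
# 2    False
# dtype: bool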
Running the loop, output_df looks like this. Note the rows that are all False where there were missing "fourth" indices in the original dataframe.
                             true_true  true_false  false_false  false_true
first second   third fourth
bar   negative 100   0           False        True        False       False
                     1           False       False         True       False
                     2            True       False        False       False
               102   0           False       False         True       False
                     1           False        True        False       False
                     2           False       False        False        True
               103   0           False        True        False       False
                     1           False       False         True       False
                     2            True       False        False       False
baz   negative 100   0           False       False         True       False
                     1           False        True        False       False
                     2           False       False        False       False  # All false from missing data
               102   0           False       False        False       False  # All false from missing data
                     1           False        True        False       False
                     2           False        True        False       False
               103   0           False        True        False       False
                     1           False        True        False       False
                     2           False       False         True       False
foo   negative 100   0           False        True        False       False
                     1           False       False         True       False
                     2           False       False        False        True
               102   0           False       False         True       False
                     1           False        True        False       False
                     2            True       False        False       False
               103   0           False        True        False       False
                     1           False        True        False       False
                     2            True       False        False       False
Doing this in a loop is extraordinarily slow :( It's not only the looping itself: profiling shows a lot of time being spent in the inner-loop comparison operations, since the indices have to be aligned.
Is there a more efficient way to perform this computation, maybe with not so much looping?
EDIT: Added random seed for deterministic example data & updated input/output data.
You can try this; I'm not sure whether it is more efficient or not:
dfi = input_df['x'].unstack(level=['second','fourth'])
dfi.update(dfi.groupby('first').ffill()[['positive']])
dfi = dfi.stack()
neg_nulls = dfi['negative'].isna()
pos_nulls = dfi['positive'].isna()
dfi = dfi.fillna(False)
dfi['true_true'] = dfi["negative"] & dfi["positive"]
dfi['true_false'] = dfi["negative"] & ~dfi["positive"]
dfi['false_false'] = ~dfi["negative"] & ~dfi["positive"]
dfi['false_true'] = ~dfi["negative"] & dfi["positive"]
dfi[neg_nulls] = False
dfi[pos_nulls] = False
df_out = dfi.rename_axis([None], axis=1)\
            .assign(second='negative')\
            .set_index('second', append=True)\
            .reorder_levels([0, 3, 1, 2])\
            .drop(['positive', 'negative'], axis=1)
Output with timings (Update with np.random.seed(0)):
                             true_true  true_false  false_false  false_true
first second   third fourth
bar   negative 100   0           False        True        False       False
                     1           False       False         True       False
                     2            True       False        False       False
               102   0           False       False         True       False
                     1           False        True        False       False
                     2           False       False        False        True
               103   0           False        True        False       False
                     1           False       False         True       False
                     2            True       False        False       False
baz   negative 100   0           False       False         True       False
                     1           False        True        False       False
                     2           False       False        False       False
               102   0           False       False        False       False
                     1           False        True        False       False
                     2           False        True        False       False
               103   0           False        True        False       False
                     1           False        True        False       False
                     2           False       False         True       False
foo   negative 100   0           False        True        False       False
                     1           False       False         True       False
                     2           False       False        False        True
               102   0           False       False         True       False
                     1           False        True        False       False
                     2            True       False        False       False
               103   0           False        True        False       False
                     1           False        True        False       False
                     2            True       False        False       False
Timings
21.8 ms ± 549 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
50.2 ms ± 2.91 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
df_out.equals(output_df)
True
Details
Reshape the input dataframe to have positives and negatives side-by-side with 'fourth'
Fill forward the positives for each 'third' and update the dataframe
Reshape, stacking 'fourth' to have a column of positives next to negatives
Apply the true-true ... false-false logic
Set all False for missing negatives and missing positives
Reshape to get the desired output dataframe
(No looping, but considerable reshaping of dataframes)
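To illustrate the unstack/stack round-trip these steps rely on, here is a minimal standalone sketch (toy data; only the 'second' level is unstacked here, unlike the full solution, which unstacks ['second', 'fourth']):

import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [("positive", 0), ("positive", 1), ("negative", 0), ("negative", 1)],
    names=["second", "fourth"],
)
s = pd.Series([True, False, True, True], index=idx, name="x")

# unstack moves an index level into the columns, putting positive and
# negative values side by side...
wide = s.unstack(level="second")
print(wide)
# second  negative  positive
# fourth
# 0           True      True
# 1           True     False

# ...and stack moves it back into the row index.
print(wide.stack())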
Related
Say I have a dataframe (the original dataframe has 91 columns and 1000 rows):
0 1 2 3
0 False False False True
1 True False False False
2 True False False False
3 False False True False
4 False True True False
5 False False False False
6 True True True True
I need to get the AND/OR values for all the columns in my dataframe. So the resultant OR and AND values would be:
OR AND
0 True False
1 True False
2 True False
3 True False
4 True False
5 False False
6 True True
I can do this by looping over all my columns and calculating the boolean for each column, but I was looking for a more dataframe-level approach that doesn't go through the columns one by one.
You can use any and all.
df = df.assign(OR=df.any(axis=1), AND=df.all(axis=1))
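For example, a self-contained run on the sample data above:

import pandas as pd

df = pd.DataFrame({
    0: [False, True, True, False, False, False, True],
    1: [False, False, False, False, True, False, True],
    2: [False, False, False, True, True, False, True],
    3: [True, False, False, False, False, False, True],
})

# any(axis=1) is a row-wise OR; all(axis=1) is a row-wise AND.
out = df.assign(OR=df.any(axis=1), AND=df.all(axis=1))
print(out[["OR", "AND"]])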
You can sum along the columns and then the OR is indicated by sum > 0, and AND is indicated by sum == len(df.columns):
total = df.sum(axis=1)
res = pd.DataFrame({"OR": total > 0, "AND": total == len(df.columns)})
If you have many columns this is more efficient, as it only iterates over the entire matrix once (in the worst case; depending on the input distribution and the implementation of any/all, iterating twice can be faster).
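Continuing the example above, a quick check that the two approaches agree:

total = df.sum(axis=1)  # count of True values per row
res = pd.DataFrame({"OR": total > 0, "AND": total == len(df.columns)})

# Same result as the any/all version.
assert res["OR"].equals(df.any(axis=1))
assert res["AND"].equals(df.all(axis=1))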
I have a dataset, which has two columns:
index Value
0 True
1 True
2 False
3 True
Is it possible to obtain a matrix that looks like
index 0 1 2 3
0 True True False True
1 True True False True
2 False False False False
3 True True False True
I tried pd.crosstab but am still not able to get the matrix. Can anyone please help?
A possible way:
m = np.tile(df['Value'], len(df)).reshape(-1, len(df)) * df[['Value']].values
out = pd.DataFrame(m)
print(out)
# Output
0 1 2 3
0 True True False True
1 True True False True
2 False False False False
3 True True False True
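Note that for boolean arrays * acts as an elementwise AND, which is what makes the multiplication above work; a quick check:

import numpy as np

a = np.array([True, False, True])
b = np.array([True, True, False])

# For boolean arrays, * and & are equivalent elementwise.
print(a * b)  # [ True False False]
print(a & b)  # [ True False False]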
First, convert the values of the Value column to a NumPy array using to_numpy. Then take advantage of NumPy broadcasting by creating an extra axis with [:,None] and computing the bitwise AND operation:
vals = df['Value'].to_numpy()
res = pd.DataFrame(vals[:,None] & vals, index=df.index)
Output:
>>> res
0 1 2 3
index
0 True True False True
1 True True False True
2 False False False False
3 True True False True
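The key step is the outer product via broadcasting; a minimal standalone illustration:

import numpy as np

v = np.array([True, True, False, True])

# v[:, None] has shape (4, 1) and v has shape (4,); broadcasting the
# elementwise AND produces the full (4, 4) outer product.
print(v[:, None] & v)
# [[ True  True False  True]
#  [ True  True False  True]
#  [False False False False]
#  [ True  True False  True]]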
I have a df like this:
a b c
0 True False True
1 False False False
2 True True True
I want this:
a b c Result
0 True False True True
1 False False False False
2 True True True True
If any one value is True then Result should be True, else False.
You can use any():
df['result'] = df.any(axis=1)
# or with pd.assign
df = df.assign(result=df.any(axis=1))
both will print:
a b c result
0 True False True True
1 False False False False
2 True True True True
Note that axis=1 means the operation is performed row-wise.
It's quite easy...
if a or b or c:
#do stuff
or you could also use
if a | b | c:
#do stuff
Use any() with axis=1 to check for the existence of any True in each row.
df['result'] = df.any(axis=1)
If the values are strings rather than booleans, then:
df['result'] = df.eq('True').any(axis=1)
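For instance, with literal 'True'/'False' strings (column names here are illustrative):

import pandas as pd

df = pd.DataFrame({"a": ["True", "False"], "b": ["False", "False"]})

# eq('True') converts the strings to booleans, then any(axis=1) ORs each row.
df["result"] = df.eq("True").any(axis=1)
print(df)
#        a      b  result
# 0   True  False    True
# 1  False  False   False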
I'm trying to change the first instance of True to False in each row of my DataFrame:

            A      B     C
Number
1        True   True  True
2       False   True  True
3       False  False  True

The desired result is:

            A      B      C
Number
1       False   True   True
2       False  False   True
3       False  False  False
Every time I try using the for index, row in target_df.iterrows(): line it ends up never finding any 'True' when I look through the row.
Thanks in advance!
You can use the cumulative sum of the Boolean values (False corresponds to 0; True to 1) for each row, along with DataFrame.mask():
>>> condition = df.cumsum(axis=1) == 1
>>> df.mask(condition, False)
a b c
0 False True True
1 False False True
2 False False False
df.mask(self, cond, other=nan)
Return an object of same shape as self and whose corresponding entries
are from self where cond is False and otherwise are from other.
In this case, condition is False everywhere except the points at which you want to switch True -> False:
>>> condition
a b c
0 True False False
1 False True False
2 False False True
One other option would be to use NumPy:
>>> row, col = np.where(df.cumsum(axis=1) == 1)
>>> df.values[row, col] = False
>>> df
a b c
0 False True True
1 False False True
2 False False False
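One caveat on the NumPy option: writing through df.values relies on getting a writable view of the underlying array, which newer pandas (with copy-on-write enabled) does not guarantee. A hedged sketch that goes through an explicit array copy and rebuilds the frame instead:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [True, False, False],
                   "b": [True, True, False],
                   "c": [True, True, True]})

# Work on an explicit copy of the underlying array, then rebuild the frame,
# so correctness does not depend on .values returning a view.
arr = df.to_numpy().copy()
row, col = np.where(df.cumsum(axis=1) == 1)
arr[row, col] = False
df = pd.DataFrame(arr, index=df.index, columns=df.columns)
print(df)
#        a      b      c
# 0  False   True   True
# 1  False  False   True
# 2  False  False  False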
I have a dataframe consisting of boolean values. I'd like to match certain multi-column patterns in the dataframe. The pattern would look like:
bar foo
0 False True
1 True False
And the expected output would look like:
foo bar pattern
0 True False False
1 True False False
2 True False True
3 False True False
4 False True False
5 False True False
6 False False False
7 False False False
8 False False False
9 False True False
10 False True False
11 False True False
12 False True False
13 False True False
14 False True False
15 False True False
16 True False False
17 True False False
18 True False True
19 False True False
20 False True False
21 False True False
22 True False True
23 False True False
24 False True False
25 False True False
I came up with my own implementation, but I guess there should be a better one.
from functools import partial

def matcher(df, pattern):
    def aggregator(pattern):
        """Returns a dict of columns with their aggregator function,
        which is the partially applied inner in this case"""
        def inner(col, window):
            return (window == pattern[col]).all()
        return {col: partial(inner, col) for col in pattern.columns}

    aggregated = (df
                  # Feed the chunks to aggregator in `len(pattern)` sized windows
                  .rolling(len(pattern))
                  .aggregate(aggregator(pattern))
                  # I'd like it to return True at the beginning of the match
                  .shift(-len(pattern) + 1)
                  # rows consisting of NaN return True to `.all()`
                  .fillna(False))
    ret = [row.all() for _, row in aggregated.iterrows()]
    return pd.Series(ret)
My biggest concerns are the handling of NaN values and the lack of wildcard support (in order to support patterns that are not necessarily box-shaped).
Any suggestions?
If pd.concat() is not too expensive for you, the code below will work quite efficiently, because there is no loop and no nested function.
print(df) # Original data without 'pattern' column.
df_wide = pd.concat([df, df.shift(-1)], axis=1)
df_wide.columns = ['foo0', 'bar0', 'foo-1', 'bar-1']
pat = ((df_wide['foo0'] == True) & (df_wide['bar-1'] == True)) & \
((df_wide['bar0'] == False) & (df_wide['foo-1'] == False))
df['pattern'] = False
df.loc[df_wide[pat].index, 'pattern'] = True
print(df) # Result data with 'pattern' column.
# Original data without 'pattern' column.
foo bar
0 True False
1 True False
2 True False
3 False True
4 False True
5 False True
...
# Result data with 'pattern' column.
foo bar pattern
0 True False False
1 True False False
2 True False True
3 False True False
4 False True False
5 False True False
6 False False False
7 False False False
8 False False False
9 False True False
10 False True False
11 False True False
12 False True False
13 False True False
14 False True False
15 False True False
16 True False False
17 True False False
18 True False True
19 False True False
20 False True False
21 False True False
22 True False True
23 False True False
24 False True False
25 False True False
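The same shift-and-compare idea generalizes to an arbitrary pattern length; here is a hedged sketch (the match_pattern helper is illustrative, not part of the answer above), which ANDs together one shifted comparison per pattern cell:

import pandas as pd

def match_pattern(df, pattern):
    # Start from all-True and AND in one condition per (pattern row, column).
    hit = pd.Series(True, index=df.index)
    for offset in range(len(pattern)):
        shifted = df.shift(-offset)  # row i of `shifted` is row i + offset of df
        for col in pattern.columns:
            # Shifted-in NaN rows at the tail compare unequal, so they drop out.
            hit &= shifted[col] == pattern[col].iloc[offset]
    return hit

# Usage, with pattern_df being the two-row pattern frame from the question:
# df['pattern'] = match_pattern(df[['foo', 'bar']], pattern_df[['foo', 'bar']])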
Suppose df1 is your pattern df and df2 is your value df. You can use apply to check the pattern: for every row, take the current row and the next row, compare the 2x2 array with df1 element-wise, and check whether all elements are the same.
df2.apply(lambda x: (df2[['foo', 'bar']].iloc[x.name:x.name + 2].values
                     == df1[['foo', 'bar']].values).all(), axis=1)
Out[213]:
0 False
1 False
2 True
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
15 False
16 False
17 False
18 True
19 False
20 False
21 False
22 True
23 False
24 False
25 False
dtype: bool
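One boundary note: at the last row the iloc slice returns a single row, which NumPy broadcasts against both pattern rows. With this particular pattern that can never be all-equal, but for a pattern with identical rows you may want an explicit guard (a hedged variant of the line above):

df2.apply(
    lambda x: x.name + 2 <= len(df2)  # guard: require a full-size window
    and (df2[['foo', 'bar']].iloc[x.name:x.name + 2].values
         == df1[['foo', 'bar']].values).all(),
    axis=1,
)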