I have a pandas DataFrame with values in columns A, B, C, and D and want to determine for every row the first and last non-zero column. BUT the order of the elements is not the same for all rows. It is determined by columns item_0, item_1 and item_2.
While I can easily do this by applying a function to every row, this becomes very slow for my DataFrame. Is there an elegant, more pythonic / pandasy way to do this?
Input:
A B C D item_0 item_1 item_2
0 1 2 0 0 A B C
1 0 1 1 0 A B C
2 1 0 1 0 A B C
3 0 2 0 0 D A B
4 1 1 0 1 D A B
5 0 0 0 1 D A B
Expected Output:
A B C D item_0 item_1 item_2 first last
0 1 2 0 0 A B C A B
1 0 1 1 0 A B C B C
2 1 0 1 0 A B C A C
3 0 2 0 0 D A B B B
4 1 1 0 1 D A B D B
5 0 0 0 1 D A B D D
Update: Here's the current code with apply:
import pandas as pd
def first_and_last_for_row(row):
    reference_list = row[["item_0", "item_1", "item_2"]].tolist()
    list_to_sort = (
        row[["A", "B", "C", "D"]].index[row[["A", "B", "C", "D"]] > 0].tolist()
    )
    ordered_list = [l for l in reference_list if l in list_to_sort]
    if len(ordered_list) == 0:
        return None, None
    else:
        return ordered_list[0], ordered_list[-1]
df = pd.DataFrame(
    {
        "A": [1, 0, 1, 0, 1, 0],
        "B": [2, 1, 0, 2, 1, 0],
        "C": [0, 1, 1, 0, 0, 0],
        "D": [0, 0, 0, 0, 1, 1],
        "item_0": ["A", "A", "A", "D", "D", "D"],
        "item_1": ["B", "B", "B", "A", "A", "A"],
        "item_2": ["C", "C", "C", "B", "B", "B"],
    }
)
df[["first", "last"]] = df.apply(first_and_last_for_row, axis=1, result_type="expand")
Here is a fully vectorized numpy approach. It's not very complex, but it has quite a few steps, so I have also provided a commented version of the code below:
import numpy as np

cols = ['A', 'B', 'C', 'D']
a = df[cols].to_numpy()
idx = df.filter(like='item_').replace({k:v for v,k in enumerate(cols)}).to_numpy()
b = a[np.arange(len(a))[:,None], idx] != 0
first = b.argmax(1)
last = b.shape[1]-np.fliplr(b).argmax(1)-1
c = df.filter(like='item_').to_numpy()
df[['first', 'last']] = c[np.arange(len(c))[:,None],
np.vstack((first, last)).T]
mask = b[np.arange(len(b)), first]
df[['first', 'last']] = df[['first', 'last']].where(pd.Series(mask, index=df.index))
Commented code:
cols = ['A', 'B', 'C', 'D']
# convert to numpy array
a = df[cols].to_numpy()
# array([[1, 2, 0, 0],
# [0, 1, 1, 0],
# [1, 0, 1, 0],
# [0, 2, 0, 0],
# [1, 1, 0, 1],
# [0, 0, 0, 1]])
# get indexer as numpy array
idx = df.filter(like='item_').replace({k:v for v,k in enumerate(cols)}).to_numpy()
# array([[0, 1, 2],
# [0, 1, 2],
# [0, 1, 2],
# [3, 0, 1],
# [3, 0, 1],
# [3, 0, 1]])
# reorder columns and get non-zero
b = a[np.arange(len(a))[:,None], idx] != 0
# array([[ True, True, False],
# [False, True, True],
# [ True, False, True],
# [False, False, True],
# [ True, True, True],
# [ True, False, False]])
# first non-zero
first = b.argmax(1)
# array([0, 1, 0, 2, 0, 0])
# last non-zero
last = b.shape[1]-np.fliplr(b).argmax(1)-1
# array([1, 2, 2, 2, 2, 0])
# get back column names from position
c = df.filter(like='item_').to_numpy()
df[['first', 'last']] = c[np.arange(len(c))[:,None],
np.vstack((first, last)).T]
# optional
# define a mask in case a zero was selected
mask = b[np.arange(len(b)), first]
# array([ True, True, True, True, True, True])
# mask where argmax was 0
df[['first', 'last']] = df[['first', 'last']].where(pd.Series(mask, index=df.index))
Output:
A B C D item_0 item_1 item_2 first last
0 1 2 0 0 A B C A B
1 0 1 1 0 A B C B C
2 1 0 1 0 A B C A C
3 0 2 0 0 D A B B B
4 1 1 0 1 D A B D B
5 0 0 0 1 D A B D D
Let me start with a first attempt to "optimize", simply by avoiding the inner looping.
The solution here is about 1.7x faster on 60k rows (I didn't have the patience to wait for 600k)
import numpy as np

def first_and_last(row):
    # select the order given by the item columns
    i0, i1, i2 = items = np.array(row[["item_0", "item_1", "item_2"]])
    # select the values in that order
    values = np.array(row[[i0, i1, i2]])
    pos_values = values > 0
    n_positives = np.sum(pos_values)  # count of non-zero entries
    if n_positives == 0:
        return np.nan, np.nan
    else:
        return items[pos_values][[0, -1]]
Then:
df_ = pd.concat([df]*10_000)
# Original function
%time df_.apply(first_and_last_for_row, axis=1, result_type="expand")
CPU times: user 53.3 s, sys: 22.5 ms, total: 53.4 s
Wall time: 53.4 s
# New function
%time df_.apply(first_and_last, axis=1, result_type="expand")
CPU times: user 32.9 s, sys: 0 ns, total: 32.9 s
Wall time: 32.9 s
However, the apply method is not optimal; there are other ways to iterate over a DataFrame. In particular, you can use the itertuples method:
def first_and_last_iter(row):
    # select the order given by the item columns
    i0, i1, i2 = items = np.array([row.item_0, row.item_1, row.item_2])
    # select the values in that order (getattr handles the dynamic names)
    values = np.array([getattr(row, i0), getattr(row, i1), getattr(row, i2)])
    pos_values = values > 0
    n_positives = np.sum(pos_values)  # count of non-zero entries
    if n_positives == 0:
        return np.nan, np.nan
    else:
        return items[pos_values][[0, -1]]
%time df_[["first", "last"]] = [first_and_last_iter(row) for row in df_.itertuples()]
CPU times: user 1.05 s, sys: 0 ns, total: 1.05 s
Wall time: 1.05 s
And that's a 50x improvement.
Assuming your DataFrame is named df, here is something that works using filtering and no loops. It will work with all-zero rows too (the value will be NaN in this case).
On my machine, it runs 10,000,000 rows in about 13 seconds.
# create boolean masks stating whether each column <item_n> is non-zero
i0 = df.lookup(df.index, df.item_0).astype(bool)  # [True, False, True, False, True, True]
i1 = df.lookup(df.index, df.item_1).astype(bool)
i2 = df.lookup(df.index, df.item_2).astype(bool)
# for the "first" column, fill with the value of item_0 where that column is non-zero
df['first'] = df.item_0[i0]  # ['A', NaN, 'A', NaN, 'D', 'D']
# fill the NaNs with values of item_1 where that column is non-zero
df.loc[~i0 & i1, 'first'] = df.item_1[~i0 & i1]
# fill the remaining NaNs with values of item_2 where that column is non-zero
df.loc[~i0 & ~i1 & i2, 'first'] = df.item_2[~i0 & ~i1 & i2]
# apply the same logic in reverse order for "last"
df['last'] = df.item_2[i2]
df.loc[~i2 & i1, 'last'] = df.item_1[~i2 & i1]
df.loc[~i2 & ~i1 & i0, 'last'] = df.item_0[~i2 & ~i1 & i0]
Output:
A B C D item_0 item_1 item_2 first last
0 1 2 0 0 A B C A B
1 0 1 1 0 A B C B C
2 1 0 1 0 A B C A C
3 0 2 0 0 D A B B B
4 1 1 0 1 D A B D B
5 0 0 0 1 D A B D D
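Note that DataFrame.lookup was deprecated in pandas 1.2 and removed in 2.0. On newer versions, the same row-wise lookup can be reproduced with numpy fancy indexing; a minimal sketch, where the lookup helper below is a hypothetical stand-in for the removed method:
import numpy as np

def lookup(frame, row_labels, col_labels):
    # hypothetical replacement for the removed DataFrame.lookup
    rows = frame.index.get_indexer(row_labels)
    cols = frame.columns.get_indexer(col_labels)
    return frame.to_numpy()[rows, cols]

i0 = lookup(df, df.index, df.item_0).astype(bool)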
df = pd.DataFrame(
    {
        "A": [1, 0, 1, 0, 1, 0],
        "B": [2, 1, 0, 2, 1, 0],
        "C": [0, 1, 1, 0, 0, 0],
        "D": [0, 0, 0, 0, 1, 1],
        "item_0": ["A", "A", "A", "D", "D", "D"],
        "item_1": ["B", "B", "B", "A", "A", "A"],
        "item_2": ["C", "C", "C", "B", "B", "B"],
    }
)
first = []
last = []
for i in range(df.shape[0]):
    check1 = []
    for j in df.columns:
        t1 = df.loc[i, j]
        try:
            if t1 > 0:
                check1.append(j)
        except TypeError:  # skip the string-valued item_n columns
            continue
    # note: check1 holds the positive columns in A, B, C, D order, so the
    # positions below are hardcoded for this dataset (rows with three
    # positive values follow the D, A, B ordering)
    if len(check1) == 2:
        first.append(check1[0])
        last.append(check1[1])
    elif len(check1) == 3:
        first.append(check1[2])
        last.append(check1[1])
    elif len(check1) == 1:
        first.append(check1[0])
        last.append(check1[0])
Output:
first = ['A', 'B', 'A', 'B', 'D', 'D']
last = ['B', 'C', 'C', 'B', 'B', 'D']
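To attach these lists to the frame as in the expected output, one extra assignment is needed (a small addition, not part of the original snippet):
df['first'] = first
df['last'] = last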
Good question.
def function1(ss: pd.Series):
    # reorder the A-D values according to the item_0..item_2 columns
    ss1 = ss.loc[ss.iloc[4:].tolist()]
    # index label of the first strictly positive entry
    ld1 = lambda ss2: ss2.loc[lambda ss3: (ss3 > 0).cumsum() == 1].head(1).index.values[0]
    return pd.Series([ld1(ss1), ld1(ss1[::-1])], index=['first', 'last'])

df1.join(df1.apply(function1, axis=1))
A B C D item_0 item_1 item_2 first last
0 1 2 0 0 A B C A B
1 0 1 1 0 A B C B C
2 1 0 1 0 A B C A C
3 0 2 0 0 D A B B B
4 1 1 0 1 D A B D B
5 0 0 0 1 D A B D D
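One caveat: ld1 raises an IndexError on an all-zero row, where the question's own function returns None, None. A guarded variant could look like this (a sketch; function1_safe and first_positive are hypothetical names):
import numpy as np
import pandas as pd

def function1_safe(ss: pd.Series):
    # reorder the A-D values according to the item_0..item_2 columns
    ss1 = ss.loc[ss.iloc[4:].tolist()]
    def first_positive(ss2):
        hits = ss2.loc[lambda ss3: (ss3 > 0).cumsum() == 1].head(1).index
        return hits[0] if len(hits) else np.nan
    return pd.Series([first_positive(ss1), first_positive(ss1[::-1])],
                     index=['first', 'last'])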
I have a dataframe where I want to group rows based on a column. Some of the columns I want to sum up, and the others I want to aggregate into a list.
# creating sample data
df = pd.DataFrame(np.random.rand(4,4), columns=list('abcd'))
df['id'] = [1,2,1,4]
df['group'] = [[0, 1, 2, 3], [0, 2, 3, 4], [1, 1, 1, 1], 1]
df
Out[5]:
a b c d id group
0 0.850058 0.160497 0.742296 0.354296 1 [0, 1, 2, 3]
1 0.598759 0.399200 0.799157 0.908174 2 [0, 2, 3, 4]
2 0.160764 0.671702 0.414800 0.429992 1 [1, 1, 1, 1]
3 0.011089 0.581518 0.718829 0.610140 4 1
Here I want to combine row 0 and row 2 as they have the same id. When doing this, I want to sum up the values in columns a, b, c and d but for column group, I want the lists to be appended. How can I do this?
My expected output is:
a b c d id group
0 1.010822 0.832199 1.157096 0.784288 1 [0, 1, 2, 3, 1, 1, 1, 1]
1 0.598759 0.399200 0.799157 0.908174 2 [0, 2, 3, 4]
2 0.011089 0.581518 0.718829 0.610140 4 1
(When I use only sum or df.groupby(['id'])['group'].apply(list), the other columns are dropped.)
Use groupby.aggregate
df.groupby('id').agg({k: sum for k in ['a', 'b', 'c', 'd', 'group']})
A one-liner alternative would be to use the numeric_only flag, but be careful with the columns you are feeding in (see the caveat after the output below).
df.groupby('id').sum(numeric_only=False)
Output
a b c d group
id
1 1.488778 0.802794 0.949768 0.952676 [0, 1, 2, 3, 1, 1, 1, 1]
2 0.488390 0.512301 0.064922 0.233875 [0, 2, 3, 4]
4 0.649945 0.267125 0.229313 0.156696 1
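A small illustration of why the columns matter: with numeric_only=False every object column is summed as well, so a string column would be silently concatenated (demo and its name column are hypothetical here):
demo = pd.DataFrame({'id': [1, 1], 'name': ['foo', 'bar']})
demo.groupby('id').sum(numeric_only=False)
#       name
# id
# 1   foobar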
First Solution:
We can arrive at the result in two steps: the first step uses GroupBy.sum to get the grouped sums of the first 4 columns; the second step acts on the column group only, concatenating the lists, again with GroupBy.sum.
df.groupby('id').sum().join(df.groupby('id')['group'].sum()).reset_index()
Input (different values owing to the different random numbers generated):
a b c d id group
0 0.758148 0.781987 0.310849 0.600912 1 [0, 1, 2, 3]
1 0.694848 0.755622 0.947359 0.708422 2 [0, 2, 3, 4]
2 0.515446 0.454484 0.169883 0.697287 1 [1, 1, 1, 1]
3 0.361939 0.325718 0.143510 0.077142 4 1
Output:
id a b c d group
0 1 1.273594 1.236471 0.480732 1.298199 [0, 1, 2, 3, 1, 1, 1, 1]
1 2 0.694848 0.755622 0.947359 0.708422 [0, 2, 3, 4]
2 4 0.361939 0.325718 0.143510 0.077142 1
Second Solution:
We can also use GroupBy.agg with named aggregation, as follows:
df.groupby('id', as_index=False).agg(a=('a', 'sum'), b=('b', 'sum'), c=('c', 'sum'), d=('d', 'sum'), group=('group', 'sum'))
Result:
id a b c d group
0 1 1.273594 1.236471 0.480732 1.298199 [0, 1, 2, 3, 1, 1, 1, 1]
1 2 0.694848 0.755622 0.947359 0.708422 [0, 2, 3, 4]
2 4 0.361939 0.325718 0.143510 0.077142 1
Does this work:
pd.merge(df.groupby('id', as_index = False).sum(), df.groupby('id')['group'].apply(sum).reset_index(), on = 'id')
id a b c d group
0 1 1.241602 0.839409 0.779673 0.639509 [0, 1, 2, 3, 1, 1, 1, 1]
1 2 0.967984 0.838906 0.313017 0.498611 [0, 2, 3, 4]
2 4 0.042871 0.367209 0.676656 0.178939 1
I have a dataframe of N columns. Each element in the dataframe is in the range [0, N-1].
For example, my dataframe can be something like this (N=3):
A B C
0 0 2 0
1 1 0 1
2 2 2 0
3 2 0 0
4 0 0 0
I want to create a co-occurrence matrix (please correct me if there is a different standard name for that) of size N x N in which each element (i, j) contains the number of times that columns i and j assume the same value.
A B C
A x 2 3
B 2 x 2
C 3 2 x
Where, for example, matrix[0, 1] means that A and B assume the same value 2 times.
I don't care about the value on the diagonal.
What is the smartest way to do that?
DataFrame.corr
We can define a custom callable for calculating the correlation between the columns of the dataframe. This callable takes two 1D numpy arrays as its input arguments and returns the count of the number of times the elements of the two arrays equal each other:
df.corr(method=lambda x, y: (x==y).sum())
A B C
A 1.0 2.0 3.0
B 2.0 1.0 2.0
C 3.0 2.0 1.0
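Note that DataFrame.corr pins the diagonal at 1.0 and returns floats; the question ignores the diagonal, but if the true diagonal (the number of rows) ever matters, it can be patched afterwards (a small sketch):
import numpy as np

res = df.corr(method=lambda x, y: (x == y).sum())
np.fill_diagonal(res.values, len(df))  # overwrite the forced 1.0 diagonal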
Let's try broadcasting across the transposition and summing axis 2:
import pandas as pd

df = pd.DataFrame({
    'A': {0: 0, 1: 1, 2: 2, 3: 2, 4: 0},
    'B': {0: 2, 1: 0, 2: 2, 3: 0, 4: 0},
    'C': {0: 0, 1: 1, 2: 0, 3: 0, 4: 0}
})
# compare every column against every other column and count matching rows
vals = df.T.values
e = (vals[:, None] == vals).sum(axis=2)
e:
[[5 2 3]
[2 5 2]
[3 2 5]]
Turn back into a dataframe:
new_df = pd.DataFrame(e, columns=df.columns, index=df.columns)
new_df:
A B C
A 5 2 3
B 2 5 2
C 3 2 5
I don't know about the smartest way but I think this works:
import numpy as np

m = np.array([[0, 2, 0], [1, 0, 1], [2, 2, 0], [2, 0, 0], [0, 0, 0]])
n = 3
ans = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        # rows where columns i and j agree have a zero difference
        ans[i, j] = len(m) - np.count_nonzero(m[:, i] - m[:, j])
print(ans + ans.T)  # symmetric result; the diagonal stays zero
I have a pandas.DataFrame such as:
1 2 3
1 1 0 0
2 0 1 0
3 0 0 1
Which has been created from a set containing relations such as:
{(1,1),(2,2),(3,3)}
I am trying to make the equivalence classes for this. Something like this:
[1] = {1}
[2] = {2}
[3] = {3}
I have done the following so far:
testGenerator = generatorTest(matrix)
indexCount = 1
while True:
    classRelation, loopCount = [], 1
    iterable = next(testGenerator)
    for i in iterable[1:]:
        if i == 1:
            classRelation.append(loopCount)
        loopCount += 1
    print("[", indexCount, "] = ", set(classRelation))
    indexCount += 1
Which, as you can see, is very messy. But I do get more or less the desired output:
[ 1 ] = {1}
[ 2 ] = {2}
[ 3 ] = {3}
How can I accomplish the same output, in a tidier and more pythonic fashion?
In this case you can use pandas.DataFrame.idxmax() like:
Code:
df.idxmax(axis=1)
Test code:
df = pd.DataFrame([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]],
                  columns=[1, 2, 3], index=[1, 2, 3, 4])
print(df.idxmax(axis=1))
Results:
1 1
2 2
3 3
4 2
dtype: int64
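If you then want the bracketed class listing from the question, the idxmax result can be grouped by its own values (a sketch building on the test code above):
classes = df.idxmax(axis=1)
for rep, members in classes.groupby(classes).groups.items():
    print("[", rep, "] =", set(members))
# [ 1 ] = {1}
# [ 2 ] = {2, 4}
# [ 3 ] = {3}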
Consider the dataframe df
df = pd.DataFrame(np.eye(3, dtype=int), [1, 2, 3], [1, 2, 3])
numpy.where
i, j = np.where(df.values == 1)
list(zip(df.index[i], df.columns[j]))
[(1, 1), (2, 2), (3, 3)]
stack and compress
s = df.stack()
s.compress(s.astype(bool)).index.tolist()
[(1, 1), (2, 2), (3, 3)]
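Note that Series.compress was removed in pandas 1.0; on current versions, plain boolean indexing gives the same result (a small adaptation of the snippet above):
s = df.stack()
s[s.astype(bool)].index.tolist()
# [(1, 1), (2, 2), (3, 3)]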
I have the following pandas series (represented as a list):
[7,2,0,3,4,2,5,0,3,4]
I would like to define a new series that returns distance to the last zero. It means that I would like to have the following output:
[1,2,0,1,2,3,4,0,1,2]
How to do it in pandas in the most efficient way?
The complexity is O(n). What will slow it down is doing a for loop in Python. If there are k zeros in the series, and log k is negligible compared to the length of the series, an O(n log k) solution would be (here ts is your Series):
>>> izero = np.r_[-1, (ts == 0).nonzero()[0]] # indices of zeros
>>> idx = np.arange(len(ts))
>>> idx - izero[np.searchsorted(izero - 1, idx) - 1]
array([1, 2, 0, 1, 2, 3, 4, 0, 1, 2])
A solution in Pandas is a little bit tricky, but could look like this (s is your Series):
>>> x = (s != 0).cumsum()
>>> y = x != x.shift()
>>> y.groupby((y != y.shift()).cumsum()).cumsum()
0 1
1 2
2 0
3 1
4 2
5 3
6 4
7 0
8 1
9 2
dtype: int64
For the last step, this uses the "itertools.groupby" recipe from the Pandas cookbook.
A solution that may not be as performant (haven't really checked), but easier to understand in terms of the steps (at least for me), would be:
import numpy as np
import pandas as pd

df = pd.DataFrame({'X': [7, 2, 0, 3, 4, 2, 5, 0, 3, 4]})
df
df['flag'] = np.where(df['X'] == 0, 0, 1)
df['cumsum'] = df['flag'].cumsum()
df['offset'] = df['cumsum']
df.loc[df.flag==1, 'offset'] = np.nan
df['offset'] = df['offset'].ffill().fillna(0).astype(int)
df['final'] = df['cumsum'] - df['offset']
df
It's sometimes surprising to see how simple it is to get C-like speeds for this stuff using Cython. Assuming your column's .values gives arr (a 1D int array), then:
cdef int[:] arr_view = arr
ret = np.zeros_like(arr)
cdef int[:] ret_view = ret
cdef int i, zero_count = 0
for i in range(len(ret)):
    # reset the counter at every zero, otherwise keep incrementing
    zero_count = 0 if arr_view[i] == 0 else zero_count + 1
    ret_view[i] = zero_count
Note the use of typed memoryviews, which are extremely fast. You can speed it up further by decorating the enclosing function with @cython.boundscheck(False).
Another option
import numpy as np
import pandas as pd

df = pd.DataFrame({'X': [7, 2, 0, 3, 4, 2, 5, 0, 3, 4]})
zeros = np.r_[-1, np.where(df.X == 0)[0]]  # zero positions, with a -1 sentinel

def d0(a):
    # smallest non-negative distance back to a zero (or the sentinel)
    return np.min(a[a >= 0])

df.index.to_series().apply(lambda i: d0(i - zeros))
Or using pure numpy
df = pd.DataFrame({'X': [7, 2, 0, 3, 4, 2, 5, 0, 3, 4]})
# distance from every index to every zero position (sentinel -1 included)
a = np.arange(len(df))[:, None] - np.r_[-1, np.where(df.X == 0)[0]][None]
# smallest non-negative distance per row
np.min(a, where=a >= 0, axis=1, initial=len(df))
Yet another way to do this is with Numpy's accumulate. The only catch is that, to initialize the counter at zero, you need to insert a zero in front of the series values.
import numpy as np
# Define Python function
f = lambda a, b: 0 if b == 0 else a + 1
# Convert to Numpy ufunc
npf = np.frompyfunc(f, 2, 1)
# Apply recursively over series values
x = npf.accumulate(np.r_[0, s.values])[1:]
print(x)
array([1, 2, 0, 1, 2, 3, 4, 0, 1, 2], dtype=object)
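Since frompyfunc produces an object-dtype array, a final astype keeps the result numeric (a small addition to the snippet above):
x = npf.accumulate(np.r_[0, s.values])[1:].astype(int)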
Here is a way without using groupby:
((v:=pd.Series([7,2,0,3,4,2,5,0,3,4]).ne(0))
.cumsum()
.where(v.eq(0)).ffill().fillna(0)
.rsub(v.cumsum())
.astype(int)
.tolist())
Output:
[1, 2, 0, 1, 2, 3, 4, 0, 1, 2]
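For readability, here is the same chain unrolled step by step (an equivalent sketch with intermediate names):
import pandas as pd

s = pd.Series([7, 2, 0, 3, 4, 2, 5, 0, 3, 4])
v = s.ne(0)                            # True where non-zero
running = v.cumsum()                   # running count of non-zeros
baseline = running.where(v.eq(0))      # keep the count only at the zeros
baseline = baseline.ffill().fillna(0)  # carry the last zero's count forward
out = (running - baseline).astype(int)
print(out.tolist())                    # [1, 2, 0, 1, 2, 3, 4, 0, 1, 2]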
Maybe pandas is not the best tool for this, as in the answer by @behzad.nouri; however, here is another variation:
df = pd.DataFrame({'X': [7, 2, 0, 3, 4, 2, 5, 0, 3, 4]})
z = df.ne(0).X
z.groupby((z != z.shift()).cumsum()).cumsum()
0 1
1 2
2 0
3 1
4 2
5 3
6 4
7 0
8 1
9 2
Name: X, dtype: int64
Solution 2:
If you write the following code you will get almost everything you need, except that the counting before the first zero starts from 0 and not 1:
df = pd.DataFrame({'X': [7, 2, 0, 3, 4, 2, 5, 0, 3, 4]})
df.eq(0).cumsum().groupby('X').cumcount()
0 0
1 1
2 0
3 1
4 2
5 3
6 4
7 0
8 1
9 2
dtype: int64
This happens because the cumulative count starts from 0. To get the desired result, I prepended a 0 to the first row, calculated everything, and then dropped the extra row at the end:
x = pd.Series([0], index=[0])
df = pd.concat([x, df])
df.eq(0).cumsum().groupby('X').cumcount().reset_index(drop=True).drop(0).reset_index(drop=True)
0 1
1 2
2 0
3 1
4 2
5 3
6 4
7 0
8 1
9 2
dtype: int64
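As a variation on the prepend trick (a sketch): concatenating a one-row DataFrame instead of a bare Series avoids the stray extra column and keeps X intact:
df = pd.DataFrame({'X': [7, 2, 0, 3, 4, 2, 5, 0, 3, 4]})
padded = pd.concat([pd.DataFrame({'X': [0]}), df], ignore_index=True)
padded.eq(0).cumsum().groupby('X').cumcount().iloc[1:].reset_index(drop=True)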