I have a pandas DataFrame with values in columns A, B, C, and D and want to determine, for every row, the first and last non-zero column. But the order of the elements is not the same for all rows; it is determined by the columns item_0, item_1 and item_2.
While I can easily do this by applying a function to every row, this becomes very slow for my DataFrame. Is there an elegant, more pythonic / pandasy way to do this?
Input:
A B C D item_0 item_1 item_2
0 1 2 0 0 A B C
1 0 1 1 0 A B C
2 1 0 1 0 A B C
3 0 2 0 0 D A B
4 1 1 0 1 D A B
5 0 0 0 1 D A B
Expected Output:
A B C D item_0 item_1 item_2 first last
0 1 2 0 0 A B C A B
1 0 1 1 0 A B C B C
2 1 0 1 0 A B C A C
3 0 2 0 0 D A B B B
4 1 1 0 1 D A B D B
5 0 0 0 1 D A B D D
Update: Here's the current code with apply
import pandas as pd

def first_and_last_for_row(row):
    reference_list = row[["item_0", "item_1", "item_2"]].tolist()
    list_to_sort = (
        row[["A", "B", "C", "D"]].index[row[["A", "B", "C", "D"]] > 0].tolist()
    )
    ordered_list = [l for l in reference_list if l in list_to_sort]
    if len(ordered_list) == 0:
        return None, None
    else:
        return ordered_list[0], ordered_list[-1]

df = pd.DataFrame(
    {
        "A": [1, 0, 1, 0, 1, 0],
        "B": [2, 1, 0, 2, 1, 0],
        "C": [0, 1, 1, 0, 0, 0],
        "D": [0, 0, 0, 0, 1, 1],
        "item_0": ["A", "A", "A", "D", "D", "D"],
        "item_1": ["B", "B", "B", "A", "A", "A"],
        "item_2": ["C", "C", "C", "B", "B", "B"],
    }
)

df[["first", "last"]] = df.apply(first_and_last_for_row, axis=1, result_type="expand")
Here is a fully vectorized numpy approach. It's not very complex, but it has quite a few steps, so I've also provided a commented version of the code:
import numpy as np

cols = ['A', 'B', 'C', 'D']
a = df[cols].to_numpy()
idx = df.filter(like='item_').replace({k: v for v, k in enumerate(cols)}).to_numpy()
b = a[np.arange(len(a))[:, None], idx] != 0
first = b.argmax(1)
last = b.shape[1] - np.fliplr(b).argmax(1) - 1
c = df.filter(like='item_').to_numpy()
df[['first', 'last']] = c[np.arange(len(c))[:, None],
                          np.vstack((first, last)).T]
mask = b[np.arange(len(b)), first]
df[['first', 'last']] = df[['first', 'last']].where(pd.Series(mask, index=df.index))
commented code:
cols = ['A', 'B', 'C', 'D']
# convert to numpy array
a = df[cols].to_numpy()
# array([[1, 2, 0, 0],
# [0, 1, 1, 0],
# [1, 0, 1, 0],
# [0, 2, 0, 0],
# [1, 1, 0, 1],
# [0, 0, 0, 1]])
# get indexer as numpy array
idx = df.filter(like='item_').replace({k:v for v,k in enumerate(cols)}).to_numpy()
# array([[0, 1, 2],
# [0, 1, 2],
# [0, 1, 2],
# [3, 0, 1],
# [3, 0, 1],
# [3, 0, 1]])
# reorder columns and get non-zero
b = a[np.arange(len(a))[:,None], idx] != 0
# array([[ True, True, False],
# [False, True, True],
# [ True, False, True],
# [False, False, True],
# [ True, True, True],
# [ True, False, False]])
# first non-zero
first = b.argmax(1)
# array([0, 1, 0, 2, 0, 0])
# last non-zero
last = b.shape[1]-np.fliplr(b).argmax(1)-1
# array([1, 2, 2, 2, 2, 0])
# get back column names from position
c = df.filter(like='item_').to_numpy()
df[['first', 'last']] = c[np.arange(len(c))[:,None],
np.vstack((first, last)).T]
# optional
# define a mask in case a zero was selected
mask = b[np.arange(len(b)), first]
# array([ True, True, True, True, True, True])
# mask where argmax was 0
df[['first', 'last']] = df[['first', 'last']].where(pd.Series(mask, index=df.index))
output:
A B C D item_0 item_1 item_2 first last
0 1 2 0 0 A B C A B
1 0 1 1 0 A B C B C
2 1 0 1 0 A B C A C
3 0 2 0 0 D A B B B
4 1 1 0 1 D A B D B
5 0 0 0 1 D A B D D
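The final mask step matters because argmax on an all-False row returns 0, which is indistinguishable from a genuine non-zero in position 0. A quick illustration (toy array, not from the question's data):
import numpy as np
row = np.array([False, False, False])
print(row.argmax())  # 0, same as if the first element were True, hence the explicit mask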
Let me start with a first attempt to "optimize", just by avoiding the inner looping.
The solution here is about 1.7x faster on 60k rows (I didn't have the patience to wait for 600k).
import numpy as np

def first_and_last(row):
    # select order given by items
    i0, i1, i2 = items = np.array(row[["item_0", "item_1", "item_2"]])
    # select values in the right order
    values = np.array(row[[i0, i1, i2]])
    pos_values = values > 0
    # count of positive entries
    n_positives = np.sum(pos_values)
    if n_positives == 0:
        return np.nan, np.nan
    else:
        return items[pos_values][[0, -1]]
Then:
df_ = pd.concat([df]*10_000)
# Original function
%time df_.apply(first_and_last_for_row, axis=1, result_type="expand")
CPU times: user 53.3 s, sys: 22.5 ms, total: 53.4 s
Wall time: 53.4 s
# New function
%time df_.apply(first_and_last, axis=1, result_type="expand")
CPU times: user 32.9 s, sys: 0 ns, total: 32.9 s
Wall time: 32.9 s
However, the apply method is not optimal; there are other ways to iterate over a DataFrame. In particular, you can use the itertuples method:
def first_and_last_iter(row):
    # select order given by items
    i0, i1, i2 = items = np.array([row.item_0, row.item_1, row.item_2])
    # select values in the right order
    values = np.array([getattr(row, i0), getattr(row, i1), getattr(row, i2)])
    pos_values = values > 0
    # count of positive entries
    n_positives = np.sum(pos_values)
    if n_positives == 0:
        return np.nan, np.nan
    else:
        return items[pos_values][[0, -1]]
%time df_[["first", "last"]] = [first_and_last_iter(row) for row in df_.itertuples()]
CPU times: user 1.05 s, sys: 0 ns, total: 1.05 s
Wall time: 1.05 s
And that's a 50x improvement.
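Most of the gain comes from itertuples yielding lightweight namedtuples instead of building a pandas Series for every row, as apply does. A minimal illustration of the row objects it produces (using the question's df):
for row in df.head(2).itertuples():
    print(row.Index, row.item_0, row.A)
# 0 A 1
# 1 A 0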
Assuming your DataFrame is named df, here is something that works using filtering and no loops. It will work with all-zero lines too (the values will be NaN in that case).
On my machine, it runs 10,000,000 rows in about 13 seconds.
# create filters stating whether each column <item_n> is non-zero
i0 = df.lookup(df.index, df.item_0).astype(bool)  # [True, False, True, False, True, True]
i1 = df.lookup(df.index, df.item_1).astype(bool)
i2 = df.lookup(df.index, df.item_2).astype(bool)
# for the "first" column, fill with the value of item_0 if that column is non-zero
df['first'] = df.item_0[i0]  # ['A', NaN, 'A', NaN, 'D', 'D']
# fill the NaNs with values of item_1 if that column is non-zero
df.loc[~i0 & i1, 'first'] = df.item_1[~i0 & i1]
# fill the remaining NaNs with values of item_2 if that column is non-zero
df.loc[~i0 & ~i1 & i2, 'first'] = df.item_2[~i0 & ~i1 & i2]
# apply the same logic in reverse order for "last"
df['last'] = df.item_2[i2]
df.loc[~i2 & i1, 'last'] = df.item_1[~i2 & i1]
df.loc[~i2 & ~i1 & i0, 'last'] = df.item_0[~i2 & ~i1 & i0]
Output:
A B C D item_0 item_1 item_2 first last
0 1 2 0 0 A B C A B
1 0 1 1 0 A B C B C
2 1 0 1 0 A B C A C
3 0 2 0 0 D A B B B
4 1 1 0 1 D A B D B
5 0 0 0 1 D A B D D
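Note that DataFrame.lookup was deprecated in pandas 1.2 and removed in 2.0, so on newer pandas the snippet above needs a stand-in. A minimal numpy-based sketch (my addition, not part of the original answer):
import numpy as np

def lookup(df, row_labels, col_labels):
    # translate labels to positions, then use fancy indexing
    rows = df.index.get_indexer(row_labels)
    cols = df.columns.get_indexer(col_labels)
    return df.to_numpy()[rows, cols]

i0 = lookup(df, df.index, df.item_0).astype(bool)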
df = pd.DataFrame(
    {
        "A": [1, 0, 1, 0, 1, 0],
        "B": [2, 1, 0, 2, 1, 0],
        "C": [0, 1, 1, 0, 0, 0],
        "D": [0, 0, 0, 0, 1, 1],
        "item_0": ["A", "A", "A", "D", "D", "D"],
        "item_1": ["B", "B", "B", "B", "B", "B"],
        "item_2": ["C", "C", "C", "A", "A", "A"],
    }
)
first = []
last = []
for i in range(df.shape[0]):
    # collect the names of all positive numeric columns, in A..D order
    check1 = []
    for j in df.columns:
        t1 = df.at[i, j]
        try:
            if t1 > 0:
                check1.append(j)
        except TypeError:
            # the item_* columns hold strings; skip them
            continue
    if len(check1) == 2:
        first.append(check1[0])
        last.append(check1[1])
    elif len(check1) == 3:
        # hardcoded for rows whose item order is D, A, B in this sample
        first.append(check1[2])
        last.append(check1[1])
    elif len(check1) == 1:
        first.append(check1[0])
        last.append(check1[0])
output:
first = ['A', 'B', 'A', 'B', 'D', 'D']
last = ['B', 'C', 'C', 'B', 'B', 'D']
Good question.
def function1(ss: pd.Series):
    # reorder the numeric values according to item_0..item_2
    ss1 = ss.loc[ss.iloc[4:].tolist()]
    # label of the first positive entry (note: raises IndexError on an all-zero row)
    ld1 = lambda ss2: ss2.loc[lambda ss3: (ss3 > 0).cumsum() == 1].head(1).index.values[0]
    return pd.Series([ld1(ss1), ld1(ss1[::-1])], index=['first', 'last'])

df.join(df.apply(function1, axis=1))
A B C D item_0 item_1 item_2 first last
0 1 2 0 0 A B C A B
1 0 1 1 0 A B C B C
2 1 0 1 0 A B C A C
3 0 2 0 0 D A B B B
4 1 1 0 1 D A B D B
5 0 0 0 1 D A B D D
I have a DataFrame like the one below:
A B
1 {1:3,2:0,3:5}
2 {3:2}
3 {1:2,2:3,3:9}
Column B is missing keys in a few rows; for example, the second row only has key 3, while keys 1 and 2 are missing. For key 1 I want to set the value to 1, and for key 2 I want to set the value to 2. The final DataFrame I would like is:
A B
1 {1:3,2:0,3:5}
2 {1:1,2:2,3:2}
3 {1:2,2:3,3:9}
One idea is to merge the dicts, but it is necessary to pass missing first, to avoid overwriting existing keys:
missing = {1:1, 2:2}
df['B'] = df['B'].apply(lambda x: {**missing, **x})
print (df)
A B
0 1 {1: 3, 2: 0, 3: 5}
1 2 {1: 1, 2: 2, 3: 2}
2 3 {1: 2, 2: 3, 3: 9}
If you change the order, the existing values are overwritten:
df['B1'] = df['B'].apply(lambda x: {**x, **missing})
print (df)
A B B1
0 1 {1: 3, 2: 0, 3: 5} {1: 1, 2: 2, 3: 5}
1 2 {1: 1, 2: 2, 3: 2} {1: 1, 2: 2, 3: 2}
2 3 {1: 2, 2: 3, 3: 9} {1: 1, 2: 2, 3: 9}
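The precedence rule in a nutshell: in {**left, **right}, keys from right win. A quick plain-Python illustration (toy dicts, not from the question):
print({**{1: 1, 2: 2}, **{1: 3}})  # {1: 3, 2: 2} -- existing key 1 survives
print({**{1: 3}, **{1: 1, 2: 2}})  # {1: 1, 2: 2} -- key 1 is overwritten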
If you want a more dynamic solution that adds all missing keys with the same value, e.g. 1:
missing = dict.fromkeys(set().union(*df['B'].tolist()), 1)
df['B'] = df['B'].apply(lambda x: {**missing, **x})
print (df)
A B
0 1 {1: 3, 2: 0, 3: 5}
1 2 {1: 1, 2: 1, 3: 2}
2 3 {1: 2, 2: 3, 3: 9}
EDIT:
To replace the missing values with the mean of each row's existing values:
print (df)
A B
0 1 {1:3,2:5}
1 2 {3:2}
2 3 {1:2,2:3,3:9}
import numpy as np

df['B'] = df['B'].apply(lambda x: {**dict.fromkeys([1, 2, 3], np.mean(list(x.values()))), **x})
print (df)
A B
0 1 {1: 3, 2: 5, 3: 4.0}
1 2 {1: 2.0, 2: 2.0, 3: 2}
2 3 {1: 2, 2: 3, 3: 9}
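For row 0, for example, the existing values are [3, 5], so the fill value is np.mean([3, 5]) == 4.0, which lands on the missing key 3.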
I have one DataFrame, df:
invoice_id|customer_id|items|batch
110|425|{'a': 50, 'b': 46}|no518
994528|a863|{'a': 21, 'c': 25}|as22
24|t311|{'scissor': 6, 'rock': 6}|af10
and another DataFrame, df1:
invoice_id|defect
110|a
994528|c
I want to look up the value from df1['defect'] in the df['items'] column.
This is my expected output:
invoice_id|customer_id|items|batch|defects_in_items
110|425|{'a': 50, 'b': 46}|no518|50
994528|a863|{'a': 21, 'c': 25}|as22|25
24|t311|{'scissor': 6, 'rock': 6}|af10|0
Can anybody help? Thanks in advance.
Merge the two DataFrames, then use apply.
import ast

df2 = df.merge(df1, on=["invoice_id"], how="left")
# parse each items string into a dict, then look up the defect key (0 if absent)
df2["defects_in_items"] = df2.apply(lambda x: ast.literal_eval(x["items"]).get(x["defect"], 0), axis=1)
df2.iloc[:, [0, 1, 2, 3, 5]]  # drop the intermediate "defect" column
result:
invoice_id customer_id items batch defects_in_items
0 110 425 {'a': 50, 'b': 46} no518 50
1 994528 a863 {'a': 21, 'c': 25} as22 25
2 24 t311 {'scissor': 6, 'rock': 6} af10 0
P.S. I read the two DataFrames from a txt file, so my 'items' column's type is str; ast.literal_eval is there to convert each string to a dict.
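For reference, ast.literal_eval safely parses a string containing a Python literal (a quick illustration):
import ast
d = ast.literal_eval("{'a': 50, 'b': 46}")
print(d, type(d))  # {'a': 50, 'b': 46} <class 'dict'>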
First, create a mapping using map:
mapping = df.invoice_id.map(df1.set_index('invoice_id').defect)
mapping
0 a
1 c
2 NaN
Name: invoice_id, dtype: object
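Here df1.set_index('invoice_id').defect is a Series keyed by invoice_id, so map performs a label-based lookup and yields NaN for invoices without a defect row (like 24).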
Next, iterate over df['items'] and index with the appropriate mapping value for that row:
df['defects_in_items'] = [i.get(j, 0) for i, j in zip(df['items'], mapping)]
Or, equivalently, define a function to perform the mapping and vectorize it:
import numpy as np

def mapper(i, j):
    return i.get(j, 0)

v = np.vectorize(mapper)
df['defects_in_items'] = v(df['items'], mapping)
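Note that np.vectorize is a convenience wrapper rather than true vectorization; under the hood it is essentially a Python loop, so expect performance comparable to the list comprehension above.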
Output:
df
invoice_id customer_id items batch defects_in_items
0 110 425 {'a': 50, 'b': 46} no518 50
1 994528 a863 {'a': 21, 'c': 25} as22 25
2 24 t311 {'scissor': 6, 'rock': 6} af10 0
Another way:
# create sample data
df = pd.DataFrame({'invoice_id': [110, 994528, 24],
                   'customer_id': ['425', 'a863', 't311'],
                   'citems': [{'a': 50, 'b': 46}, {'a': 21, 'c': 25}, {'scissor': 6, 'rock': 6}],
                   'batch': ['no518', 'as22', 'af10']})
df2 = pd.DataFrame({'invoice_id': [110, 994528], 'defect': ['a', 'c']})
## merge data
df = df.merge(df2, on='invoice_id', how='left').fillna(0)
## iterate over rows and create the new column
for index, row in df.iterrows():
    if row['defect'] in row['citems']:
        df.loc[index, 'defect_in_items'] = row['citems'].get(row['defect'], 0)
    else:
        df.loc[index, 'defect_in_items'] = 0
## answer
batch citems customer_id invoice_id defect defect_in_items
0 no518 {'a': 50, 'b': 46} 425 110 a 50.0
1 as22 {'a': 21, 'c': 25} a863 994528 c 25.0
2 af10 {'scissor': 6, 'rock': 6} t311 24 0 0.0
I'm having a problem with counting letters in a string and then updating that value in a dictionary.
I am iterating through a dictionary of alphabet letters and using the .count() function on each key to compare against a sentence.
The count works and returns the number you would expect (in this case xval), however this does not appear to update the dictionary when I use dic[k] = xval.
Infuriatingly simple. Please offer some advice if you can.
import string

type = ("cat sat on the mat").lower()
dic = dict.fromkeys(string.ascii_lowercase, 0)
print(dic)

for k, v in dic.items():
    xval = type.count(k)
    print(xval)
    dic[k] = xval
    print("%s: %s" % (k, v))
And here is the output, for completeness. Many thanks.
{'m': 0, 'l': 0, 'h': 0, 'f': 0, 'd': 0, 'x': 0, 'i': 0, 's': 0, 'r': 0, 'u': 0, 'z': 0, 't': 0, 'c': 0, 'a': 0, 'q': 0, 'p': 0, 'j': 0, 'n': 0, 'g': 0, 'w': 0, 'o': 0, 'e': 0, 'k': 0, 'b': 0, 'v': 0, 'y': 0}
0
f: 0
0
q: 0
0
r: 0
1
m: 0
1
c: 0
3
a: 0
0
u: 0
0
g: 0
1
e: 0
0
d: 0
0
v: 0
1
h: 0
0
y: 0
0
k: 0
4
t: 0
0
i: 0
1
o: 0
0
w: 0
0
b: 0
1
s: 0
0
l: 0
0
j: 0
0
x: 0
0
z: 0
1
n: 0
0
p: 0
Process finished with exit code 0
You are printing k and v, but v was unpacked from dic.items() at the start of the iteration, before your assignment, so it still shows the original 0; the dictionary itself is being updated. Print dic[k] instead:
import string

type = ("cat sat on the mat").lower()
dic = dict.fromkeys(string.ascii_lowercase, 0)
print(dic)

for k in dic:
    xval = type.count(k)
    print(xval)
    dic[k] = xval
    print(k, ":", dic[k])
outputs
{'a': 0, 'c': 0, 'b': 0, 'e': 0, 'd': 0, 'g': 0, 'f': 0, 'i': 0, 'h': 0, 'k': 0, 'j': 0, 'm': 0, 'l': 0, 'o': 0, 'n': 0, 'q': 0, 'p': 0, 's': 0, 'r': 0, 'u': 0, 't': 0, 'w': 0, 'v': 0, 'y': 0, 'x': 0, 'z': 0}
3
a : 3
1
c : 1
0
b : 0
1
e : 1
0
d : 0
0
g : 0
0
f : 0
0
i : 0
1
h : 1
0
k : 0
0
j : 0
1
m : 1
0
l : 0
1
o : 1
1
n : 1
0
q : 0
0
p : 0
1
s : 1
0
r : 0
0
u : 0
4
t : 4
0
w : 0
0
v : 0
0
y : 0
0
x : 0
0
z : 0
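As a side note, the standard library's collections.Counter can do the counting in one pass; a minimal alternative sketch (not part of the original answer):
from collections import Counter
import string

text = "cat sat on the mat"
# start every letter at zero, then overlay the actual counts
dic = dict.fromkeys(string.ascii_lowercase, 0)
dic.update(Counter(ch for ch in text if ch in dic))
print(dic['t'])  # 4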