I have a df with multiple columns like this (there are many more cols & rows):
df = pd.DataFrame([
{'ID': 1,'date': '2022-01-01', 'fruit_code':'[100,99,300]', 'vegetable_code':'[1000,2000,3000]','supermarket':'xy',},
{'ID': 2,'date': '2022-01-01', 'fruit_code':'[67,200,87]', 'vegetable_code':'[5000]','supermarket':'z, m'},
{'ID': 3,'date': '2021-01-01', 'fruit_code':'[100,5,300,78]', 'vegetable_code':'[7000,2000,3000]','supermarket':'wf, z'},
{'ID': 4,'date': '2020-01-01', 'fruit_code':'[77]', 'vegetable_code':'[1000]','supermarkt':'wf'},
{'ID': 5,'date': '2022-15-01', 'fruit_code':'[100,200,546,33]', 'vegetable_code':'[4000,2000,3000]','supermarket':'t, wf'},
{'ID': 6,'date': '2002-12-01', 'fruit_code':'[64,2]', 'vegetable_code':'[6000,8000,1000]','supermarket':'k' },
{'ID': 7,'date': '2018-12-01', 'fruit_code':'[5]', 'vegetable_code':'[6000,8000,1000]','supermarket':'p' }
])
my expected df should look like this in the end:
df = pd.DataFrame([
{'ID': 1,'date': '2022-01-01', 'fruit_code':'[100,99,300]', 'vegetable_code':'[1000,2000,3000]','supermarket':'xy','new_col_1':'all'},
{'ID': 2,'date': '2022-01-01', 'fruit_code':'[67,200,87]', 'vegetable_code':'[5000]','supermarket':'z, m','new_col_1':'[5000]'},
{'ID': 3,'date': '2021-01-01', 'fruit_code':'[100,5,300,78]', 'vegetable_code':'[7000,2000,3000]','supermarket':'wf, z','new_col_1':'all'},
{'ID': 4,'date': '2020-01-01', 'fruit_code':'[77]', 'vegetable_code':'[1000]','supermarket':'wf','new_col_1':'[77]'},
{'ID': 5,'date': '2022-15-01', 'fruit_code':'[100,200,546,33]', 'vegetable_code':'[4000,2000,3000]','supermarket':'t, wf','new_col_1':'all'},
{'ID': 6,'date': '2002-12-01', 'fruit_code':'[64,2]', 'vegetable_code':'[6000,8000,1000]','supermarket':'k', 'new_col_1':'[64]', 'new_col_2':'[2]'},
{'ID': 7,'date': '2018-12-01', 'fruit_code':'[5]', 'vegetable_code':'[6000,8000,1000]','supermarket':'p' ,'new_col_1':'all'}
])
and here are multiple conditions I want to apply on cols fruit_code & vegetable_code to get two new columns:
UPDATE
def fruits_vegetable(row):
if len(str(row['fruit_code'])) == 1: # fruit_code in new_col_1
row['new_col_1'] = row['fruit_code']
elif len(str(row['fruit_code'])) == 1 and len(str(row['vegetable_code'])) > 1: # write "all" in new_col_1
row['new_col_1'] = 'all'
elif len(str(row['fruit_code'])) > 2 and len(str(row['vegetable_code'])) == 1: # vegetable_code in new_col_1
row['new_col_1'] = row['vegetable_code']
elif len(str(row['fruit_code'])) > 3 and len(str(row['vegetable_code'])) > 1: # write "all" in new_col_1
row['new_col_1'] = 'all'
elif len(str(row['fruit_code'])) == 2 and len(str(row['vegetable_code'])) >= 0: # fruit 1 new_col_1 & fruit 2 new_col_2
row['new_col_1'] = row['fruit_code'][0]
row['new_col_2'] = row['fruit_code'][1]
return row
df = df.apply(fruits_vegetable, axis=1)
I'm still stuck, now I get "all" in some of the rows for the first column, but the second does not change.
If someone has some insights, that would be great.
Thanks, much appreciated
First, it is necessary to convert the string representations of lists into actual lists with ast.literal_eval, and then drop the casts to str so the length checks count list elements instead of characters. If you need one-element lists instead of scalars, wrap fruit[0] and fruit[1] in []. Finally, move the len(fruit) == 1 condition last so the more specific branches are tested first, and change len(fruit) > 3 to len(fruit) > 2 so the first row matches:
import ast

def fruits_vegetable(row):
    fruit = ast.literal_eval(row['fruit_code'])
    vege = ast.literal_eval(row['vegetable_code'])
    if len(fruit) == 1 and len(vege) > 1:     # write "all" in new_col_1
        row['new_col_1'] = 'all'
    elif len(fruit) > 2 and len(vege) == 1:   # vegetable_code in new_col_1
        row['new_col_1'] = vege
    elif len(fruit) > 2 and len(vege) > 1:    # write "all" in new_col_1
        row['new_col_1'] = 'all'
    elif len(fruit) == 2 and len(vege) >= 0:  # fruit 1 in new_col_1, fruit 2 in new_col_2
        row['new_col_1'] = [fruit[0]]
        row['new_col_2'] = [fruit[1]]
    elif len(fruit) == 1:                     # fruit_code in new_col_1
        row['new_col_1'] = fruit
    return row

df = df.apply(fruits_vegetable, axis=1)
print (df)
ID date fruit_code new_col_1 new_col_2 supermarket \
0 1 2022-01-01 [100,99,300] all NaN xy
1 2 2022-01-01 [67,200,87] [5000] NaN z, m
2 3 2021-01-01 [100,5,300,78] all NaN wf, z
3 4 2020-01-01 [77] [77] NaN NaN
4 5 2022-15-01 [100,200,546,33] all NaN t, wf
5 6 2002-12-01 [64,2] [64] [2] k
6 7 2018-12-01 [5] all NaN p
supermarkt vegetable_code
0 NaN [1000,2000,3000]
1 NaN [5000]
2 NaN [7000,2000,3000]
3 wf [1000]
4 NaN [4000,2000,3000]
5 NaN [6000,8000,1000]
6 NaN [6000,8000,1000]
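As a side note, if the list columns are needed more than once it can be cheaper to parse the string representations a single time up front, instead of inside the row function. A minimal sketch, assuming every value really is a valid list literal:
import ast

# parse the string representations once; afterwards the row function
# can drop its own ast.literal_eval calls and work on real lists
for col in ['fruit_code', 'vegetable_code']:
    df[col] = df[col].map(ast.literal_eval)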
How to speed up my custom function?
I have three lists of numbers:
list1
list2
list3
And a Pandas DataFrame like this:
id    inum   DESC_1  recs
id1   inum1  1       recs1
id2   inum2  2       recs2
id3   inum3  3       recs3
And my custom function:
def keep_inum(row):
if len(row) != 0:
if int(row['inum']) in list1:
if row['DESC_1'] == 1:
return row['recs']
else:
return ''
elif int(row['inum']) in list2:
if row['DESC_1'] == 2:
return row['recs']
else:
return ''
elif int(row['inum']) in list3:
if row['DESC_1'] == 3:
return row['recs']
else:
return ''
else:
return row['recs']
else:
pass
Apply func to DF:
df['recs'] = df.apply(keep_inum, axis = 1)
By not using a custom function at all:
import pandas as pd
df = pd.DataFrame(
{
"id": ["id1", "id2", "id3", "id4"],
"inum": ["111", "222", "333", "331"],
"DESC_1": [1, 4, 3, 3],
"recs": ["recs1", "recs2", "recs3", "yes"],
}
)
print(df)
print("---")
list1 = [111]
list2 = [222]
list3 = [333, 331]
# Cast inum to int in one go
df["inum_int"] = df["inum"].astype(int)
# Empty the recs where inum doesn't match desc
df.loc[df["inum_int"].isin(list1) & ~(df["DESC_1"] == 1), "recs"] = ""
df.loc[df["inum_int"].isin(list2) & ~(df["DESC_1"] == 2), "recs"] = ""
df.loc[df["inum_int"].isin(list3) & ~(df["DESC_1"] == 3), "recs"] = ""
df.drop(columns=["inum_int"], inplace=True)
print(df)
This outputs
id inum DESC_1 recs
0 id1 111 1 recs1
1 id2 222 4 recs2
2 id3 333 3 recs3
3 id4 331 3 yes
---
id inum DESC_1 recs
0 id1 111 1 recs1
1 id2 222 4
2 id3 333 3 recs3
3 id4 331 3 yes
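If more (list, DESC_1) pairs are expected, the three masked assignments can also be generated from a mapping instead of being written out by hand. A minimal sketch under the same assumptions as above:
# each rule pairs the allowed inum values with the DESC_1 they must match
rules = [(list1, 1), (list2, 2), (list3, 3)]
inum_int = df["inum"].astype(int)
for lst, desc in rules:
    # empty the recs where inum is in the list but DESC_1 doesn't match
    df.loc[inum_int.isin(lst) & df["DESC_1"].ne(desc), "recs"] = ""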
I have a data frame that consists of a time-series of integers. I'm trying to group the data frame by year and then for each year count the number of times that the sum of the absolute value of consecutive entries with the same sign is greater than or equal to 5.
>>> import numpy as np
>>> import pandas as pd
>>> l = [1, -1, -4, 2, 2, 4, 5, 1, -3, -4]
>>> idx1 = pd.date_range('2019-01-01',periods=5)
>>> idx2 = pd.date_range('2020-01-01',periods=5)
>>> idx = idx1.union(idx2)
>>> df = pd.DataFrame(l, index=idx, columns=['a'])
>>> df
a
2019-01-01 1
2019-01-02 -1
2019-01-03 -4 \\ 2019 count = 1: abs(-1) + abs(-4) >= 5
2019-01-04 2
2019-01-05 2
2020-01-01 4
2020-01-02 5 \\ 2020 count = 1: abs(4) + abs(5) + abs(1) = 10 >=5
2020-01-03 1
2020-01-04 -3
2020-01-05 -4 \\ 2020 count = 2: abs(-3) + abs(-4) = 7 >= 5
The desired output is:
2019 1
2020 2
My approach to solve this problem is to chain groupby and apply. Below are the implementations of the functions I created to pass to groupby and apply respectively.
>>> def get_year(x):
return x.year
>>> def count(group, t=5):
    c = 0  # counter
    s = 0  # sum of consec vals w same sign
    for i in range(1, len(group)):
        if np.sign(group['a'].iloc[i-1]) == np.sign(group['a'].iloc[i]):
            if s == 0:
                s = group['a'].iloc[i-1] + group['a'].iloc[i]
            else:
                s += group['a'].iloc[i]
            if i == (len(group) - 1) and abs(s) >= t:
                # streak runs to the end of the group and clears the threshold
                return c + 1
        elif (np.sign(group['a'].iloc[i-1]) != np.sign(group['a'].iloc[i])) and (abs(s) >= t):
            # if consec streak of vals w same sign is broken and abs(s) >= t then inc c and reset s
            c += 1
            s = 0
        elif (np.sign(group['a'].iloc[i-1]) != np.sign(group['a'].iloc[i])) and (abs(s) < t):
            # if consec streak of vals w same sign is broken and abs(s) < t then reset s
            s = 0
    return c
>>> by_year = df.groupby(get_year)
>>> by_year.apply(count)
2019 1
2020 2
My question is:
Is there a more "pythonic" implementation of the above count function that produces the desired result but doesn't rely on for loops?
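One loop-free formulation is to label each maximal run of consecutive same-signed values with a cumulative sum over sign changes, and then count the runs whose absolute values sum to at least the threshold. A sketch that mirrors the loop's semantics (single-element runs never count, because the loop only accumulates when two neighbours share a sign):
import numpy as np

def count_streaks(s, t=5):
    sign = np.sign(s)
    run_id = sign.ne(sign.shift()).cumsum()   # new id at every sign change
    run_abs_sum = s.abs().groupby(run_id).sum()
    run_len = s.groupby(run_id).size()
    return ((run_abs_sum >= t) & (run_len > 1)).sum()

df.groupby(df.index.year)['a'].apply(count_streaks)
On the sample frame this returns 1 for 2019 and 2 for 2020.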
Looking for a cleaner alternative to my for loop. I'm looking to create a column (df['result']) based on the following logic:
Data:
d = {'a': [1,5], 'b': [2,4], 'c': [3,3], 'd':[4,2], 'e': [5,1]}
df = pd.DataFrame(d)
df['result'] = np.NaN
for i in range(len(df)):
if df['a'][i] == 1:
df['result'][i] = 1
elif df['b'][i] == 2:
df['result'][i] = 2
elif df['c'][i] == 3:
df['result'][i] = 3
elif df['d'][i] == 4:
df['result'][i] = 4
elif df['e'][i] == 5:
df['result'][i] = 5
else:
df['result'][i] = 0
Is there a cleaner way of creating this hierarchical logic without looping through like this?
Use numpy.select:
import numpy as np
df["result"] = np.select([df["a"].eq(1), df["b"].eq(2), df["c"].eq(3), df["d"].eq(4), df["e"].eq(5)],
[1,2,3,4,5],
0)
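For the sample data this picks 1 in the first row (a == 1 matches first) and 3 in the second (the first condition to hold is c == 3):
print(df)
   a  b  c  d  e  result
0  1  2  3  4  5       1
1  5  4  3  2  1       3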
IIUC, try this (in case you have many columns and don't want to hard-code the index):
m = df.eq(range(1, len(df.columns) + 1))
df['result'] = df.where(m).bfill(axis=1).iloc[:, 0]
print(df)
a b c d e result
0 1 2 3 4 5 1.0
1 5 4 3 2 1 3.0
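One difference from numpy.select above: rows where nothing matches stay NaN here rather than becoming 0, so a fillna may be needed if the two outputs should agree:
df['result'] = df['result'].fillna(0)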
I have a pandas dataframe with a set of values (prices). Within each group of initiator_id I need to sort prices descending if type == sell, and ascending if type == buy.
Then I add an id within each group. Right now I do:
df['bidnum'] = df.groupby(['initiator_id', 'type']).cumcount()
What is an efficient way to sort descending within each ('initiator_id', type == 'sell') group and ascending within each ('initiator_id', type == 'buy') group?
Here is what the original dataset looks like now:
initiator_id price type bidnum
1 170.81 sell 0
2 170.81 sell 0
2 169.19 buy 0
3 170.81 sell 0
3 169.19 buy 0
3 70.81 sell 1
4 170.81 sell 0
4 169.19 buy 0
4 70.81 sell 1
4 69.19 buy 1
and I need something like:
initiator_id, price, type
1, 100,sell
1, 99, sell
1, 98, sell
1, 110, buy
1, 120, buy
1, 125, buy
So that sell subgroup within each initiator_id group is sorted descending, and buy subgroup is sorted ascending.
If you can assume that your "price" column will always contain non-negative values, we can "cheat": negate the prices of the sell operations, sort, and then take the absolute value to recover the original prices.
If type is "buy", the multiplier is 1 (2 * 1 - 1 = 1) and the price stays positive; if type is "sell", the multiplier is -1 (2 * 0 - 1 = -1) and the price becomes negative:
df["price"] = df["price"] * (2 * (df["type"] == "buy").astype(int) - 1)
Now sort values normally. I've included both "initiator_id" and "type" columns to match your expected output:
df = df.sort_values(["initiator_id", "type", "price"])
Finally, calculate the absolute value of the "price" column to retrieve your original values:
df["price"] = df["price"].abs()
Expected output of this operation on your sample input:
initiator_id price type bidnum
0 1 170.81 sell 0
2 2 169.19 buy 0
1 2 170.81 sell 0
4 3 169.19 buy 0
3 3 170.81 sell 0
5 3 70.81 sell 1
9 4 69.19 buy 1
7 4 169.19 buy 0
6 4 170.81 sell 0
8 4 70.81 sell 1
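The same idea can also be expressed without mutating the price column, via the key argument of sort_values (available since pandas 1.1). A sketch:
import numpy as np
import pandas as pd

# negate sell prices only inside the sort key, so the column itself is untouched
def flip_sell(col):
    if col.name == "price":
        sell = df.loc[col.index, "type"].eq("sell")
        return pd.Series(np.where(sell, -col, col), index=col.index)
    return col

df_sorted = df.sort_values(["initiator_id", "type", "price"], key=flip_sell)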
One solution:
grouped_df = df.groupby(['initiator_id', 'type'])
parts = []
for key, item in grouped_df:
    # item is one of the 'subgroups' created by groupby; reset_index gives a
    # plain DataFrame object that we can sort and collect
    dfg = item.reset_index(drop=True)
    # each subgroup is sorted according to the value of the first cell in the
    # 'type' column (by construction there is a unique value in this column)
    orderg = (dfg.loc[0, 'type'] == 'buy')  # ascending order if type == buy
    parts.append(dfg.sort_values('price', ascending=orderg))
# some cosmetics to arrive at the final df
# (pd.concat replaces the deprecated DataFrame.append)
final_df = pd.concat(parts, ignore_index=True)
Output:
initiator_id price type
0 1 170.81 sell
1 2 169.19 buy
2 2 170.81 sell
3 3 169.19 buy
4 3 170.81 sell
5 3 70.81 sell
6 4 69.19 buy
7 4 169.19 buy
8 4 170.81 sell
9 4 70.81 sell
Everyone else gave a solution with pandas. Here I present a solution without pandas.
The input CSV:
initiator_id,price,type,bidnum
1,170.81,sell,0
2,170.81,sell,0
2,169.19,buy,0
3,170.81,sell,0
3,169.19,buy,0
3,70.81,sell,1
4,170.81,sell,0
4,169.19,buy,0
4,70.81,sell,1
4,69.19,buy,1
The output CSV:
initiator_id,price,type,bidnum
1,170.81,sell,0
2,170.81,sell,0
2,169.19,buy,0
3,170.81,sell,0
3,70.81,sell,1
3,169.19,buy,0
4,170.81,sell,0
4,70.81,sell,1
4,69.19,buy,1
4,169.19,buy,0
The code:
from collections import OrderedDict
import numpy
"""
the reason why this code uses exec is so that the ordering of columns can be arbitrary
"""
def remove_duplicates(seq):
seen = set()
seen_add = seen.add
return [x for x in seq if not (x in seen or seen_add(x))]
def returnLastIndex(temp2):
global mydict
temp3 = mydict['initiator_id'][temp2]
while True:
temp2 = temp2 + 1
try:
if mydict['initiator_id'][temp2] != temp3:
return temp2-1
except:
return temp2-1
def returnFirstIndex(temp2):
global mydict
temp3 = mydict['initiator_id'][temp2]
while temp2 >= 1:
temp2 = temp2 - 1
if mydict['initiator_id'][temp2] != temp3:
return temp2+1
return 0
with open("input.csv") as file:
lines = file.readlines()
new_lines = []
new_headers = []
for x in range(len(lines)): #loop to reamove headers and newlines
if x == 0:
for y in lines[x].strip().split(","):
new_headers.append(y)
else:
new_lines.append(lines[x].strip())
mydict = OrderedDict()
for x in new_headers:
exec("mydict['"+x+"'] = []")
for x in range(len(new_headers)):
for y in new_lines:
if new_headers[x] == "initiator_id":
exec("mydict['"+new_headers[x]+"'].append(int('"+y.split(",")[x]+"'))")
elif new_headers[x] == "price":
exec("mydict['"+new_headers[x]+"'].append(float('"+y.split(",")[x]+"'))")
else:
exec("mydict['"+new_headers[x]+"'].append('"+y.split(",")[x]+"')")
for x in new_headers:
exec("mydict['"+x+"'] = numpy.array(mydict['"+x+"'])")
temp1 = mydict['initiator_id'].argsort()
for x in (new_headers):
exec("mydict['"+x+"'] = mydict['"+x+"'][temp1]")
splice_list_first = []
for x in range(len(mydict['initiator_id'])):
splice_list_first.append(returnFirstIndex(x))
splice_list_last = []
for x in range(len(mydict['initiator_id'])):
splice_list_last.append(returnLastIndex(x))
splice_list_first = remove_duplicates(splice_list_first)
splice_list_last = remove_duplicates(splice_list_last)
master_string = ",".join(new_headers)+"\n"
for x in range(len(splice_list_first)):
temp4 = OrderedDict()
for y in new_headers:
exec("temp4['"+y+"'] = mydict['"+y+"']["+str(splice_list_first[x])+":"+str(splice_list_last[x]+1)+"]")
sell_index = []
buy_index = []
for z in range(len(temp4['type'])):
if temp4['type'][z] == "sell":
sell_index.append(z)
if temp4['type'][z] == "buy":
buy_index.append(z)
temp5 = OrderedDict()
for a in range(len(sell_index)):
for b in new_headers:
try:
exec("temp5['"+b+"']")
except:
exec("temp5['"+b+"'] = []")
exec("temp5['"+b+"'].append(temp4['"+b+"']["+str(sell_index[a])+":"+str(sell_index[a]+1)+"][0])")
try:
for c in new_headers:
exec("temp5['"+c+"'] = numpy.array(temp5['"+c+"'])")
temp7 = temp5['price'].argsort()[::-1]
for d in (new_headers):
exec("temp5['"+d+"'] = temp5['"+d+"'][temp7]")
for e in range(len(temp5['initiator_id'])):
for f in new_headers:
master_string = master_string + str(temp5[f][e])+","
master_string = master_string[:-1]+"\n"
except Exception as g:
pass
temp6 = OrderedDict()
for a in range(len(buy_index)):
for b in new_headers:
try:
exec("temp6['"+b+"']")
except:
exec("temp6['"+b+"'] = []")
exec("temp6['"+b+"'].append(temp4['"+b+"']["+str(buy_index[a])+":"+str(buy_index[a]+1)+"][0])")
try:
for c in new_headers:
exec("temp6['"+c+"'] = numpy.array(temp6['"+c+"'])")
temp7 = temp6['price'].argsort()
for d in (new_headers):
exec("temp6['"+d+"'] = temp6['"+d+"'][temp7]")
for e in range(len(temp6['initiator_id'])):
for f in new_headers:
master_string = master_string + str(temp6[f][e])+","
master_string = master_string[:-1]+"\n"
except Exception as g:
pass
print(master_string)
f = open("output.csv", "w")
f.write(master_string)
f.close()
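For comparison, a much shorter non-pandas sketch using only the standard library (same input and output files as above; it assumes the header of input.csv names the columns as shown):
import csv
from itertools import groupby

with open("input.csv", newline="") as f:
    rows = list(csv.DictReader(f))

rows.sort(key=lambda r: int(r["initiator_id"]))
out = []
for _, grp in groupby(rows, key=lambda r: r["initiator_id"]):
    grp = list(grp)
    # sells first, descending by price; then buys, ascending by price
    sells = sorted((r for r in grp if r["type"] == "sell"),
                   key=lambda r: float(r["price"]), reverse=True)
    buys = sorted((r for r in grp if r["type"] == "buy"),
                  key=lambda r: float(r["price"]))
    out.extend(sells + buys)

with open("output.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=rows[0].keys())
    writer.writeheader()
    writer.writerows(out)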
I am trying to iterate over a pandas dataframe and update a value if a condition is met, but I am getting an error.
for line, row in enumerate(df.itertuples(), 1):
if row.Qty:
if row.Qty == 1 and row.Price == 10:
row.Buy = 1
AttributeError: can't set attribute
First, iterating in pandas is possible, but it is very slow, which is why vectorized solutions are preferred.
I think you can use iterrows if you need to iterate:
for idx, row in df.iterrows():
if df.loc[idx,'Qty'] == 1 and df.loc[idx,'Price'] == 10:
df.loc[idx,'Buy'] = 1
But it is better to use a vectorized solution – set the value by boolean mask with loc:
mask = (df['Qty'] == 1) & (df['Price'] == 10)
df.loc[mask, 'Buy'] = 1
Or a solution with Series.mask:
df['Buy'] = df['Buy'].mask(mask, 1)
Or, if you need if...else logic, use numpy.where:
df['Buy'] = np.where(mask, 1, 0)
Samples.
Set values by conditions:
df = pd.DataFrame({'Buy': [100, 200, 50],
'Qty': [5, 1, 1],
'Name': ['apple', 'pear', 'banana'],
'Price': [1, 10, 10]})
print (df)
Buy Name Price Qty
0 100 apple 1 5
1 200 pear 10 1
2 50 banana 10 1
mask = (df['Qty'] == 1) & (df['Price'] == 10)
df['Buy'] = df['Buy'].mask(mask, 1)
print (df)
Buy Name Price Qty
0 100 apple 1 5
1 1 pear 10 1
2 1 banana 10 1
df['Buy'] = np.where(mask, 1, 0)
print (df)
Buy Name Price Qty
0 0 apple 1 5
1 1 pear 10 1
2 1 banana 10 1
Ok, if you intend to set values in df then you need to track the index values.
option 1
using itertuples
# keep in mind `row` is a named tuple and cannot be edited
for line, row in enumerate(df.itertuples(), 1): # you don't need enumerate here, but doesn't hurt.
if row.Qty:
if row.Qty == 1 and row.Price == 10:
df.set_value(row.Index, 'Buy', 1)
option 2
using iterrows
# keep in mind that `row` is a `pd.Series` and can be edited...
# ... but it is just a copy and won't reflect in `df`
for idx, row in df.iterrows():
if row.Qty:
if row.Qty == 1 and row.Price == 10:
df.set_value(idx, 'Buy', 1)
option 3
using straight up loop with get_value
for idx in df.index:
q = df.get_value(idx, 'Qty')
if q:
p = df.get_value(idx, 'Price')
if q == 1 and p == 10:
df.set_value(idx, 'Buy', 1)
The pandas.DataFrame.set_value method is deprecated as of 0.21.0 (and has since been removed).
Use pandas.DataFrame.at instead:
for index, row in df.iterrows():
if row.Qty and row.Qty == 1 and row.Price == 10:
df.at[index,'Buy'] = 1