for-loop pandas df alternative - python

I'm looking for a cleaner alternative to my for loop. I want to create a column (df['result']) based on the following logic:
Data:
import numpy as np
import pandas as pd

d = {'a': [1,5], 'b': [2,4], 'c': [3,3], 'd': [4,2], 'e': [5,1]}
df = pd.DataFrame(d)
df['result'] = np.nan
for i in range(len(df)):
    if df['a'][i] == 1:
        df['result'][i] = 1
    elif df['b'][i] == 2:
        df['result'][i] = 2
    elif df['c'][i] == 3:
        df['result'][i] = 3
    elif df['d'][i] == 4:
        df['result'][i] = 4
    elif df['e'][i] == 5:
        df['result'][i] = 5
    else:
        df['result'][i] = 0
Is there a cleaner way of creating this hierarchical logic without looping through like this?

Use numpy.select:
import numpy as np
df["result"] = np.select([df["a"].eq(1), df["b"].eq(2), df["c"].eq(3), df["d"].eq(4), df["e"].eq(5)],
[1,2,3,4,5],
0)
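For the sample frame above, this should produce:
print(df)
   a  b  c  d  e  result
0  1  2  3  4  5       1
1  5  4  3  2  1       3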

IIUC, try this (in case you have many columns and don't want to hard-code each condition):
m = df.eq(range(1, len(df.columns) + 1))
df['result'] = df.where(m).bfill(axis=1).iloc[:, 0]
print(df)
a b c d e result
0 1 2 3 4 5 1.0
1 5 4 3 2 1 3.0
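Note: if no column matches for a row, this leaves NaN rather than the 0 from the loop's else branch; chaining .fillna(0) should cover that:
df['result'] = df.where(m).bfill(axis=1).iloc[:, 0].fillna(0)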

Related

pandas apply function to multiple columns with condition and create new columns

I have a df with multiple columns like this (there are many more cols & rows):
df = pd.DataFrame([
    {'ID': 1, 'date': '2022-01-01', 'fruit_code': '[100,99,300]', 'vegetable_code': '[1000,2000,3000]', 'supermarket': 'xy'},
    {'ID': 2, 'date': '2022-01-01', 'fruit_code': '[67,200,87]', 'vegetable_code': '[5000]', 'supermarket': 'z, m'},
    {'ID': 3, 'date': '2021-01-01', 'fruit_code': '[100,5,300,78]', 'vegetable_code': '[7000,2000,3000]', 'supermarket': 'wf, z'},
    {'ID': 4, 'date': '2020-01-01', 'fruit_code': '[77]', 'vegetable_code': '[1000]', 'supermarkt': 'wf'},
    {'ID': 5, 'date': '2022-15-01', 'fruit_code': '[100,200,546,33]', 'vegetable_code': '[4000,2000,3000]', 'supermarket': 't, wf'},
    {'ID': 6, 'date': '2002-12-01', 'fruit_code': '[64,2]', 'vegetable_code': '[6000,8000,1000]', 'supermarket': 'k'},
    {'ID': 7, 'date': '2018-12-01', 'fruit_code': '[5]', 'vegetable_code': '[6000,8000,1000]', 'supermarket': 'p'}
])
my expected df should look like this in the end:
df = pd.DataFrame([
    {'ID': 1, 'date': '2022-01-01', 'fruit_code': '[100,99,300]', 'vegetable_code': '[1000,2000,3000]', 'supermarket': 'xy', 'new_col_1': 'all'},
    {'ID': 2, 'date': '2022-01-01', 'fruit_code': '[67,200,87]', 'vegetable_code': '[5000]', 'supermarket': 'z, m', 'new_col_1': '[5000]'},
    {'ID': 3, 'date': '2021-01-01', 'fruit_code': '[100,5,300,78]', 'vegetable_code': '[7000,2000,3000]', 'supermarket': 'wf, z', 'new_col_1': 'all'},
    {'ID': 4, 'date': '2020-01-01', 'fruit_code': '[77]', 'vegetable_code': '[1000]', 'supermarket': 'wf', 'new_col_1': '[77]'},
    {'ID': 5, 'date': '2022-15-01', 'fruit_code': '[100,200,546,33]', 'vegetable_code': '[4000,2000,3000]', 'supermarket': 't, wf', 'new_col_1': 'all'},
    {'ID': 6, 'date': '2002-12-01', 'fruit_code': '[64,2]', 'vegetable_code': '[6000,8000,1000]', 'supermarket': 'k', 'new_col_1': '[64]', 'new_col_2': '[2]'},
    {'ID': 7, 'date': '2018-12-01', 'fruit_code': '[5]', 'vegetable_code': '[6000,8000,1000]', 'supermarket': 'p', 'new_col_1': 'all'}
])
and here are the conditions I want to apply to the fruit_code and vegetable_code columns to get two new columns:
UPDATE
def fruits_vegetable(row):
    if len(str(row['fruit_code'])) == 1:    # fruit_code in new_col_1
        row['new_col_1'] = row['fruit_code']
    elif len(str(row['fruit_code'])) == 1 and len(str(row['vegetable_code'])) > 1:   # write "all" in new_col_1
        row['new_col_1'] = 'all'
    elif len(str(row['fruit_code'])) > 2 and len(str(row['vegetable_code'])) == 1:   # vegetable_code in new_col_1
        row['new_col_1'] = row['vegetable_code']
    elif len(str(row['fruit_code'])) > 3 and len(str(row['vegetable_code'])) > 1:    # write "all" in new_col_1
        row['new_col_1'] = 'all'
    elif len(str(row['fruit_code'])) == 2 and len(str(row['vegetable_code'])) >= 0:  # fruit 1 new_col_1 & fruit 2 new_col_2
        row['new_col_1'] = row['fruit_code'][0]
        row['new_col_2'] = row['fruit_code'][1]
    return row

df = df.apply(fruits_vegetable, axis=1)
I'm still stuck: now I get "all" in some of the rows for the first column, but the second column never changes.
If someone has some insights, that would be great.
Thanks, much appreciated
First it is necessary to convert the string representations of lists into actual lists with ast.literal_eval; then, to check lengths, remove the casting to strings. If you need one-element lists instead of scalars, use [] around fruit[0] and fruit[1]. Finally, move the len(fruit) == 1 condition last and change len(fruit) > 3 to len(fruit) > 2 so the first row matches:
import ast

def fruits_vegetable(row):
    fruit = ast.literal_eval(row['fruit_code'])
    vege = ast.literal_eval(row['vegetable_code'])
    if len(fruit) == 1 and len(vege) > 1:     # write "all" in new_col_1
        row['new_col_1'] = 'all'
    elif len(fruit) > 2 and len(vege) == 1:   # vegetable_code in new_col_1
        row['new_col_1'] = vege
    elif len(fruit) > 2 and len(vege) > 1:    # write "all" in new_col_1
        row['new_col_1'] = 'all'
    elif len(fruit) == 2 and len(vege) >= 0:  # fruit 1 new_col_1 & fruit 2 new_col_2
        row['new_col_1'] = [fruit[0]]
        row['new_col_2'] = [fruit[1]]
    elif len(fruit) == 1:                     # fruit_code in new_col_1
        row['new_col_1'] = fruit
    return row

df = df.apply(fruits_vegetable, axis=1)
print (df)
ID date fruit_code new_col_1 new_col_2 supermarket \
0 1 2022-01-01 [100,99,300] all NaN xy
1 2 2022-01-01 [67,200,87] [5000] NaN z, m
2 3 2021-01-01 [100,5,300,78] all NaN wf, z
3 4 2020-01-01 [77] [77] NaN NaN
4 5 2022-15-01 [100,200,546,33] all NaN t, wf
5 6 2002-12-01 [64,2] [64] [2] k
6 7 2018-12-01 [5] all NaN p
supermarkt vegetable_code
0 NaN [1000,2000,3000]
1 NaN [5000]
2 NaN [7000,2000,3000]
3 wf [1000]
4 NaN [4000,2000,3000]
5 NaN [6000,8000,1000]
6 NaN [6000,8000,1000]

How can I replace values in a CSV column from a range?

I am attempting to change the values of two columns in my dataset from specific numeric values (2, 10, 25 etc.) to single values (1, 2, 3 or 4) based on the percentile of the specific value within the dataset.
Using the pandas quantile() function I have got the ranges I wish to replace between, but I haven't figured out a working method to do so.
age1 = datasetNB.Age.quantile(0.25)
age2 = datasetNB.Age.quantile(0.5)
age3 = datasetNB.Age.quantile(0.75)
fare1 = datasetNB.Fare.quantile(0.25)
fare2 = datasetNB.Fare.quantile(0.5)
fare3 = datasetNB.Fare.quantile(0.75)
My current solution attempt for this problem is as follows:
for elem in datasetNB['Age']:
    if elem <= age1:
        datasetNB[elem].replace(to_replace=elem, value=1)
        print("set to 1")
    elif (elem > age1) & (elem <= age2):
        datasetNB[elem].replace(to_replace=elem, value=2)
        print("set to 2")
    elif (elem > age2) & (elem <= age3):
        datasetNB[elem].replace(to_replace=elem, value=3)
        print("set to 3")
    elif elem > age3:
        datasetNB[elem].replace(to_replace=elem, value=4)
        print("set to 4")
    else:
        pass

for elem in datasetNB['Fare']:
    if elem <= fare1:
        datasetNB[elem] = 1
    elif (elem > fare1) & (elem <= fare2):
        datasetNB[elem] = 2
    elif (elem > fare2) & (elem <= fare3):
        datasetNB[elem] = 3
    elif elem > fare3:
        datasetNB[elem] = 4
    else:
        pass
What should I do to get this working?
pandas already has a function to do exactly that: pandas.qcut.
You can simply do
q_list = [0, 0.25, 0.5, 0.75, 1]
labels = range(1, 5)
df['Age'] = pd.qcut(df['Age'], q_list, labels=labels)
df['Fare'] = pd.qcut(df['Fare'], q_list, labels=labels)
Input
import numpy as np
import pandas as pd

# Generate fake data for the sake of example
df = pd.DataFrame({
    'Age': np.random.randint(10, size=6),
    'Fare': np.random.randint(10, size=6)
})
>>> df
Age Fare
0 1 6
1 8 2
2 0 0
3 1 9
4 9 6
5 2 2
Output
DataFrame after running the above code
>>> df
Age Fare
0 1 3
1 4 1
2 1 1
3 1 4
4 4 3
5 3 1
Note that in your specific case, since you want quartiles, you can simply pass q=4 instead of the explicit list of quantiles.
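A minimal sketch of that shorthand, reusing the labels from above:
df['Age'] = pd.qcut(df['Age'], q=4, labels=range(1, 5))
df['Fare'] = pd.qcut(df['Fare'], q=4, labels=range(1, 5))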

How can I count the values in the same column

I want to get statistics for a long column, but the column contains different labels (A, B, C, D, ...) interleaved with a repeated value (2) that I want to count per label.
Example:
A
2
2
2
2
B
2
2
C
D
E
2
2
The output should look like:
A 4
B 2
C
D
E 2
Check where the Series, s, equals your magic number. Form groups after masking by that same check, but forward filling.
u = s.eq('2') # `2` if it's not a string
u.groupby(s.mask(u).ffill()).sum()
A 4.0
B 2.0
C 0.0
D 0.0
E 2.0
dtype: float64
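If you prefer integer counts, casting the result should do it:
u.groupby(s.mask(u).ffill()).sum().astype(int)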
Input data:
import pandas as pd
s = pd.Series(list('A2222B22CDE22'))
I am assuming that we are working with a text file ('test_input.txt').
import pandas as pd

data = pd.read_csv('test_input.txt', header=None)
data = list(data[0])

final_out = dict()
last_item = None
for item in data:
    try:
        item = int(item)
    except ValueError:
        item = str(item)
    if isinstance(item, str):
        last_item = item
        final_out[last_item] = 0
    if isinstance(item, int):
        final_out[last_item] += 1

print(final_out)
## {'A': 4, 'B': 2, 'C': 0, 'D': 0, 'E': 2}
print(pd.DataFrame.from_dict(final_out, orient='index'))
##    0
## A  4
## B  2
## C  0
## D  0
## E  2

# To sort the counts, first build a DataFrame from the dict.
dataframe = pd.DataFrame.from_dict(final_out, orient='index')
dataframe = dataframe.rename(columns={0: 'unique'})
print(dataframe)
# Ordering
dataframe = dataframe.sort_values(by=['unique'])
print(dataframe)

iterate over pandas dataframe and update the value - AttributeError: can't set attribute

I am trying to iterate over a pandas DataFrame and update a value if a condition is met, but I am getting an error:
for line, row in enumerate(df.itertuples(), 1):
    if row.Qty:
        if row.Qty == 1 and row.Price == 10:
            row.Buy = 1
AttributeError: can't set attribute
First, iterating in pandas is possible but very slow, so vectorized solutions are used instead.
I think you can use iterrows if you need iterating:
for idx, row in df.iterrows():
    if df.loc[idx, 'Qty'] == 1 and df.loc[idx, 'Price'] == 10:
        df.loc[idx, 'Buy'] = 1
But it is better to use a vectorized solution: set the value by boolean mask with loc:
mask = (df['Qty'] == 1) & (df['Price'] == 10)
df.loc[mask, 'Buy'] = 1
Or solution with mask:
df['Buy'] = df['Buy'].mask(mask, 1)
Or if you need if...else use numpy.where:
df['Buy'] = np.where(mask, 1, 0)
Samples.
Set values by conditions:
df = pd.DataFrame({'Buy': [100, 200, 50],
                   'Qty': [5, 1, 1],
                   'Name': ['apple', 'pear', 'banana'],
                   'Price': [1, 10, 10]})
print (df)
Buy Name Price Qty
0 100 apple 1 5
1 200 pear 10 1
2 50 banana 10 1
mask = (df['Qty'] == 1) & (df['Price'] == 10)
df['Buy'] = df['Buy'].mask(mask, 1)
print (df)
Buy Name Price Qty
0 100 apple 1 5
1 1 pear 10 1
2 1 banana 10 1
df['Buy'] = np.where(mask, 1, 0)
print (df)
Buy Name Price Qty
0 0 apple 1 5
1 1 pear 10 1
2 1 banana 10 1
OK, if you intend to set values in df then you need to track the index values.
option 1
using itertuples
# keep in mind `row` is a named tuple and cannot be edited
for line, row in enumerate(df.itertuples(), 1):  # you don't need enumerate here, but it doesn't hurt
    if row.Qty:
        if row.Qty == 1 and row.Price == 10:
            df.set_value(row.Index, 'Buy', 1)
option 2
using iterrows
# keep in mind that `row` is a `pd.Series` and can be edited...
# ... but it is just a copy and won't reflect in `df`
for idx, row in df.iterrows():
    if row.Qty:
        if row.Qty == 1 and row.Price == 10:
            df.set_value(idx, 'Buy', 1)
option 3
using straight up loop with get_value
for idx in df.index:
    q = df.get_value(idx, 'Qty')
    if q:
        p = df.get_value(idx, 'Price')
        if q == 1 and p == 10:
            df.set_value(idx, 'Buy', 1)
The pandas.DataFrame.set_value method is deprecated as of 0.21.0 and has since been removed.
Use pandas.DataFrame.at instead:
for index, row in df.iterrows():
    if row.Qty and row.Qty == 1 and row.Price == 10:
        df.at[index, 'Buy'] = 1

How to efficiently get indices of rows of DataFrame, where these rows meet certain cumulative criteria?

For example, I would like to get the letters indicating rows where a period of at least two consecutive drops in the other column begins.
Exemplary data:
a b
0 3 a
1 2 b
2 3 c
3 2 d
4 1 e
5 0 f
6 -1 g
7 3 h
8 1 i
9 0 j
Exemplary solution with simple loop:
import pandas as pd

df = pd.DataFrame({'a': [3,2,3,2,1,0,-1,3,1,0], 'b': list('abcdefghij')})

less = 0
l = []
prev_prev_row = df.iloc[0]
prev_row = df.iloc[1]
if prev_row['a'] < prev_prev_row['a']:
    less = 1
for i, row in df.iloc[2:len(df)].iterrows():
    if row['a'] < prev_row['a']:
        less = less + 1
    else:
        less = 0
    if less == 2:
        l.append(prev_prev_row['b'])
    prev_prev_row = prev_row
    prev_row = row
This gives list l:
['c', 'h']
Here's one approach with some help from NumPy and Scipy -
import numpy as np
from scipy.ndimage import binary_closing  # scipy.ndimage.morphology is deprecated

arr = df.a.values
mask1 = np.hstack((False, arr[1:] < arr[:-1], False))
mask2 = mask1 & (~binary_closing(~mask1, [1,1]))
final_mask = mask2[1:] > mask2[:-1]
out = list(df.b[final_mask])
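On the sample frame this should again give:
print(out)  # ['c', 'h']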
Use rolling(2) in reverse: reversing the series makes diff() compare each row with the one after it, so gt(0).rolling(2).sum().eq(2) flags rows where at least two consecutive drops begin, and s != s.shift(-1) keeps only the first row of each such run.
s = df.a[::-1].diff().gt(0).rolling(2).sum().eq(2)
df.b.loc[s & (s != s.shift(-1))]
2 c
7 h
Name: b, dtype: object
If you actually wanted a list:
df.b.loc[s & (s != s.shift(-1))].tolist()
['c', 'h']
