I have a df with multiple columns like this (there are many more cols & rows):
import pandas as pd

df = pd.DataFrame([
    {'ID': 1, 'date': '2022-01-01', 'fruit_code': '[100,99,300]', 'vegetable_code': '[1000,2000,3000]', 'supermarket': 'xy'},
    {'ID': 2, 'date': '2022-01-01', 'fruit_code': '[67,200,87]', 'vegetable_code': '[5000]', 'supermarket': 'z, m'},
    {'ID': 3, 'date': '2021-01-01', 'fruit_code': '[100,5,300,78]', 'vegetable_code': '[7000,2000,3000]', 'supermarket': 'wf, z'},
    {'ID': 4, 'date': '2020-01-01', 'fruit_code': '[77]', 'vegetable_code': '[1000]', 'supermarkt': 'wf'},
    {'ID': 5, 'date': '2022-15-01', 'fruit_code': '[100,200,546,33]', 'vegetable_code': '[4000,2000,3000]', 'supermarket': 't, wf'},
    {'ID': 6, 'date': '2002-12-01', 'fruit_code': '[64,2]', 'vegetable_code': '[6000,8000,1000]', 'supermarket': 'k'},
    {'ID': 7, 'date': '2018-12-01', 'fruit_code': '[5]', 'vegetable_code': '[6000,8000,1000]', 'supermarket': 'p'}
])
My expected df should look like this in the end:
df = pd.DataFrame([
    {'ID': 1, 'date': '2022-01-01', 'fruit_code': '[100,99,300]', 'vegetable_code': '[1000,2000,3000]', 'supermarket': 'xy', 'new_col_1': 'all'},
    {'ID': 2, 'date': '2022-01-01', 'fruit_code': '[67,200,87]', 'vegetable_code': '[5000]', 'supermarket': 'z, m', 'new_col_1': '[5000]'},
    {'ID': 3, 'date': '2021-01-01', 'fruit_code': '[100,5,300,78]', 'vegetable_code': '[7000,2000,3000]', 'supermarket': 'wf, z', 'new_col_1': 'all'},
    {'ID': 4, 'date': '2020-01-01', 'fruit_code': '[77]', 'vegetable_code': '[1000]', 'supermarket': 'wf', 'new_col_1': '[77]'},
    {'ID': 5, 'date': '2022-15-01', 'fruit_code': '[100,200,546,33]', 'vegetable_code': '[4000,2000,3000]', 'supermarket': 't, wf', 'new_col_1': 'all'},
    {'ID': 6, 'date': '2002-12-01', 'fruit_code': '[64,2]', 'vegetable_code': '[6000,8000,1000]', 'supermarket': 'k', 'new_col_1': '[64]', 'new_col_2': '[2]'},
    {'ID': 7, 'date': '2018-12-01', 'fruit_code': '[5]', 'vegetable_code': '[6000,8000,1000]', 'supermarket': 'p', 'new_col_1': 'all'}
])
And here are the multiple conditions I want to apply to the fruit_code and vegetable_code columns to get two new columns:
UPDATE
def fruits_vegetable(row):
    if len(str(row['fruit_code'])) == 1:  # fruit_code in new_col_1
        row['new_col_1'] = row['fruit_code']
    elif len(str(row['fruit_code'])) == 1 and len(str(row['vegetable_code'])) > 1:  # write "all" in new_col_1
        row['new_col_1'] = 'all'
    elif len(str(row['fruit_code'])) > 2 and len(str(row['vegetable_code'])) == 1:  # vegetable_code in new_col_1
        row['new_col_1'] = row['vegetable_code']
    elif len(str(row['fruit_code'])) > 3 and len(str(row['vegetable_code'])) > 1:  # write "all" in new_col_1
        row['new_col_1'] = 'all'
    elif len(str(row['fruit_code'])) == 2 and len(str(row['vegetable_code'])) >= 0:  # fruit 1 new_col_1 & fruit 2 new_col_2
        row['new_col_1'] = row['fruit_code'][0]
        row['new_col_2'] = row['fruit_code'][1]
    return row

df = df.apply(fruits_vegetable, axis=1)
I'm still stuck: now I get "all" in some rows for the first column, but the second column does not change.
If someone has some insights, that would be great.
Thanks, much appreciated.
First it is necessary to convert the string representations of lists to actual lists with ast.literal_eval, and then check lengths without casting to strings. If you need one-element lists instead of scalars, wrap fruit[0] and fruit[1] in []. Finally, move the len(fruit) == 1 condition to the end, and change len(fruit) > 3 to len(fruit) > 2 so the first row matches:
import ast

def fruits_vegetable(row):
    fruit = ast.literal_eval(row['fruit_code'])
    vege = ast.literal_eval(row['vegetable_code'])
    if len(fruit) == 1 and len(vege) > 1:     # write "all" in new_col_1
        row['new_col_1'] = 'all'
    elif len(fruit) > 2 and len(vege) == 1:   # vegetable_code in new_col_1
        row['new_col_1'] = vege
    elif len(fruit) > 2 and len(vege) > 1:    # write "all" in new_col_1
        row['new_col_1'] = 'all'
    elif len(fruit) == 2 and len(vege) >= 0:  # fruit 1 new_col_1 & fruit 2 new_col_2
        row['new_col_1'] = [fruit[0]]
        row['new_col_2'] = [fruit[1]]
    elif len(fruit) == 1:                     # fruit_code in new_col_1
        row['new_col_1'] = fruit
    return row

df = df.apply(fruits_vegetable, axis=1)
print(df)
   ID        date        fruit_code new_col_1 new_col_2 supermarket  \
0   1  2022-01-01      [100,99,300]       all       NaN          xy
1   2  2022-01-01       [67,200,87]    [5000]       NaN        z, m
2   3  2021-01-01    [100,5,300,78]       all       NaN       wf, z
3   4  2020-01-01              [77]      [77]       NaN         NaN
4   5  2022-15-01  [100,200,546,33]       all       NaN       t, wf
5   6  2002-12-01            [64,2]      [64]       [2]           k
6   7  2018-12-01               [5]       all       NaN           p

  supermarkt    vegetable_code
0        NaN  [1000,2000,3000]
1        NaN            [5000]
2        NaN  [7000,2000,3000]
3         wf            [1000]
4        NaN  [4000,2000,3000]
5        NaN  [6000,8000,1000]
6        NaN  [6000,8000,1000]
I am attempting to change the values of two columns in my dataset from specific numeric values (2, 10, 25, etc.) to single values (1, 2, 3 or 4) based on the percentile of the specific value within the dataset.
Using the pandas quantile() function I have the ranges I wish to replace between, but I haven't figured out a working method to do the replacement.
age1 = datasetNB.Age.quantile(0.25)
age2 = datasetNB.Age.quantile(0.5)
age3 = datasetNB.Age.quantile(0.75)
fare1 = datasetNB.Fare.quantile(0.25)
fare2 = datasetNB.Fare.quantile(0.5)
fare3 = datasetNB.Fare.quantile(0.75)
My current solution attempt for this problem is as follows:
for elem in datasetNB['Age']:
    if elem <= age1:
        datasetNB[elem].replace(to_replace = elem, value = 1)
        print("set to 1")
    elif (elem > age1) & (elem <= age2):
        datasetNB[elem].replace(to_replace = elem, value = 2)
        print("set to 2")
    elif (elem > age2) & (elem <= age3):
        datasetNB[elem].replace(to_replace = elem, value = 3)
        print("set to 3")
    elif elem > age3:
        datasetNB[elem].replace(to_replace = elem, value = 4)
        print("set to 4")
    else:
        pass
for elem in datasetNB['Fare']:
    if elem <= fare1:
        datasetNB[elem] = 1
    elif (elem > fare1) & (elem <= fare2):
        datasetNB[elem] = 2
    elif (elem > fare2) & (elem <= fare3):
        datasetNB[elem] = 3
    elif elem > fare3:
        datasetNB[elem] = 4
    else:
        pass
What should I do to get this working?
pandas already has a function for that: pandas.qcut.
You can simply do:
q_list = [0, 0.25, 0.5, 0.75, 1]
labels = range(1, 5)
df['Age'] = pd.qcut(df['Age'], q_list, labels=labels)
df['Fare'] = pd.qcut(df['Fare'], q_list, labels=labels)
Input
import numpy as np
import pandas as pd
# Generate fake data for the sake of example
df = pd.DataFrame({
    'Age': np.random.randint(10, size=6),
    'Fare': np.random.randint(10, size=6)
})
>>> df
   Age  Fare
0    1     6
1    8     2
2    0     0
3    1     9
4    9     6
5    2     2
Output
DataFrame after running the above code
>>> df
  Age Fare
0   1    3
1   4    1
2   1    1
3   1    4
4   4    3
5   3    1
Note that in your specific case, since you want quartiles, you can simply pass q_list = 4.
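For example, assuming the same labels as above:

# An integer q asks qcut for that many equal-sized bins (quartiles here).
df['Age'] = pd.qcut(df['Age'], 4, labels=range(1, 5))
df['Fare'] = pd.qcut(df['Fare'], 4, labels=range(1, 5))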
So I have this dataframe:
import pandas as pd

d = {'id': [1,1,1,1,2,2,3,3,3,4,4,4,4],
     'name': ['ada','aad','ada','ada','dddd','fdd','ccc','cccd','ood','aaa','aaa','aar','rrp'],
     'amount': [2,-12,12,-12,5,-5,2,3,-5,3,-10,10,-10]}
df1 = pd.DataFrame(d)
df1
    id  name  amount
0    1   ada       2
1    1   aad     -12
2    1   ada      12
3    1   ada     -12
4    2  dddd       5
5    2   fdd      -5
6    3   ccc       2
7    3  cccd       3
8    3   ood      -5
9    4   aaa       3
10   4   aaa     -10
11   4   aar      10
12   4   rrp     -10
First I want to find the matching positive and negative amounts per id, which I do through this:
def match_pos_neg(df):
    return df[df["amount"].isin(-df["amount"])]

df1 = df1.groupby("id").apply(match_pos_neg).reset_index(0, drop=True)
df1
    id  name  amount
1    1   aad     -12
2    1   ada      12
3    1   ada     -12
4    2  dddd       5
5    2   fdd      -5
10   4   aaa     -10
11   4   aar      10
12   4   rrp     -10
The next thing I want to do is keep only the pairs of matching positive and negative numbers that also have the highest similarity in the string column 'name'. So if an id has two negative numbers that match the positive one, I want to isolate the pair with the highest similarity per id. My desired output looks like this:
    id  name  amount
2    1   ada      12
3    1   ada     -12
4    2  dddd       5
5    2   fdd      -5
10   4   aaa     -10
11   4   aar      10
I guess I have to use some type of string similarity index like SequenceMatcher or Jaccard, but I am not sure how to approach this. Any help on how to get my desired output would be very much appreciated.
You can try something like this.
Note that you can change the information you print as you wish; you just need to edit the return values of the function create_sim.
import pandas as pd
from operator import itemgetter

d = {'id': [1,1,1,1,2,2,3,3,3,4,4,4,4],
     'name': ['ada','aad','ada','ada','dddd','fdd','ccc','cccd','ood','aaa','aaa','aar','rrp'],
     'amount': [2,-12,12,-12,5,-5,2,3,-5,3,-10,10,-10]}
df1 = pd.DataFrame(d)

def match_pos_neg(df):
    return df[df["amount"].isin(-df["amount"])]

df1 = df1.groupby("id").apply(match_pos_neg).reset_index(0, drop=True)
print(df1)
def split(word):
    return [char for char in word]

def DistJaccard(str1, str2):
    l1 = set(split(str1))
    l2 = set(split(str2))
    return float(len(l1 & l2)) / len(l1 | l2)
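# A quick illustrative check (my addition): DistJaccard compares character
# sets, so DistJaccard('ada', 'aad') == 1.0 and DistJaccard('aaa', 'aar') == 0.5.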
def create_sim(df, idx):
    idx_id = df['id'].values[idx]
    idx_amount = df['amount'].values[idx]
    idx_name = df['name'].values[idx]
    df_t = df.loc[df['id'] == idx_id]
    pos = [i for i in list(df_t['amount']) if i > 0] or None
    neg = [i for i in list(df_t['amount']) if i < 0] or None
    if pos and neg:
        l = [x for x in list(df_t['amount']) if x == idx_amount * -1]
        if len(l) > 0:
            # restrict to the same id so candidates stay within the group
            df_t = df_t.loc[df_t['amount'] == idx_amount * -1]
            compare_list = list(df_t['name'])
            list_results = []
            for item in compare_list:
                sim = DistJaccard(idx_name, item)
                # carry the id along so the Edit below can use res[2]
                list_results.append((item, sim, idx_id))
            return max(list_results, key=itemgetter(1))
    return None
count = 0
for index, row in df1.iterrows():
    res = create_sim(df1, count)
    if res:
        print(f"The most similar word of {row['name']} is {res[0]} with similarity of {res[1]}")
    else:
        print(f"No similar words of {row['name']}")
    count += 1
Edit:
In order to make a DF with the results you can change it to this:
count = 0
item1_id = []
item1_row = []
item1_name = []
item2_id = []
item2_row = []
item2_name = []
for index, row in df1.iterrows():
    res = create_sim(df1, count)
    item1_id.append(row['id'])
    item1_row.append(count)
    item1_name.append(row['name'])
    if res:
        row_idx = df1.loc[(df1['id'] == res[2]) & (df1['name'] == res[0]) & (df1['amount'] != row['amount']), "name"].index.tolist()
        item2_id.append(row['id'])
        item2_row.append(row_idx[0])
        item2_name.append(res[0])
    else:
        item2_id.append(None)
        item2_row.append(None)
        item2_name.append(None)
    count += 1

final = pd.DataFrame(item1_id, columns=['item 1 id'])
final['item 1 row'] = item1_row
final['item 1 name'] = item1_name
final['item 2 id'] = item2_id
final['item 2 row'] = item2_row
final['item 2 name'] = item2_name
print(final)
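If the goal is the filtered frame from the question rather than a printed report, here is a sketch of an alternative (my addition, not part of the answer above): it uses difflib.SequenceMatcher, one of the measures the asker mentions, and best_pairs is my own helper name.

from difflib import SequenceMatcher

def best_pairs(df):
    # For each id, pair every positive amount with the negative rows of equal
    # magnitude in the same group, and keep the pair whose names are most similar.
    keep = []
    for _, g in df.groupby('id'):
        pos = g[g['amount'] > 0]
        neg = g[g['amount'] < 0]
        for pi, prow in pos.iterrows():
            cands = neg[neg['amount'] == -prow['amount']]
            if cands.empty:
                continue
            sims = cands['name'].map(
                lambda n: SequenceMatcher(None, prow['name'], n).ratio())
            keep.extend([pi, sims.idxmax()])
    return df.loc[sorted(set(keep))]

print(best_pairs(df1))

On the sample df1 this keeps exactly the six rows shown in the desired output above.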
I have a dataframe:
SIN1 SIN2 SIN3
4778 5633 4343
I have tried,
count_col = len(df1.columns)
check = 1
while check <= count_col:
    check_str = str(check)
    check_for_column = "SIN" + check_str in df1
    name = "SIN" + check_str
    if check_for_column == True:
        df1[name] = df1[name].astype(str)
        df1['SIN1'] = df1['SIN1'] + ',' + df1[name]
    if check == count_col:
        break
    check += 1
df1[['SIN1']]
This shows 4778,4343,4778,4343, and so on.
When I tried,
check = 1
while check <= count_col:
    check_str = str(check)
    check_for_column = "SIN" + check_str in df1
    name = "SIN" + check_str
    if check == count_col:
        break
    check += 1
    if check_for_column == True:
        df1[name] = df1[name].astype(str)
        df1['SIN1'] = df1['SIN1'] + ',' + df1[name]
df1[['SIN1']]
This shows 4778,4343.
I want the result to be 4778,5633,4343.
Please don't suggest directly concatenating with ','.
I used a while loop because there can be any number of SIN columns.
How do I properly use a while loop in this case?
Use apply to join column values:
>>> df['SIN'] = df.astype(str).apply(lambda x: ','.join(x), axis=1)
>>> df
   SIN1  SIN2  SIN3             SIN
0  4778  5633  4343  4778,5633,4343
To select a subset of columns like SINxx, use filter:
df.filter(like='SIN')  # or df.filter(regex=r'SIN\d+')
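Combining the two steps (assuming the SIN columns are numeric):

df['SIN'] = df.filter(regex=r'SIN\d+').astype(str).apply(','.join, axis=1)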
This code takes all columns with SIN in their name, concatenates their values as strings, and assigns the result to a new column SIN.
sin_cols = ['SIN' in col for col in df.columns]
sdf = df.loc[:, sin_cols]
df['SIN'] = sdf.apply(lambda x: ', '.join(x.values.astype(str)), axis=1)
df before

id  T  SIN1  SIN2  Q  SIN3
 0  8     3     6  9     8
 1  1     6     1  7     1
 2  5     2     4  8     6

df after

id  T  SIN1  SIN2  Q  SIN3  SIN
 0  8     3     6  9     8   3, 6, 8
 1  1     6     1  7     1   6, 1, 1
 2  5     2     4  8     6   2, 4, 6
I'm working on an array called numbers with four columns: the first three hold the x, y and z values, and the fourth is used elsewhere in the program.
I want that if the x and y values of two rows coincide, one of them is deleted from the main array based on their z values (a z value of "0" removes "1", a "1" removes "2", and a "2" removes "0").
The original array looks like:
[[12 15 2 0]
[65 23 0 0]
[24 66 2 0]
[65 23 1 0]
[24 66 0 0]]
The problem is that when I try to run the following program I do not get the required array at the end. The expected output array would look like:
[[12 15 2 0]
[65 23 0 0]
[24 66 2 0]]
I have given an extract from the program below:
import numpy as np

# Array
numbers = np.array([[12,15,2,0],[65,23,0,0],[24,66,2,0],[65,23,1,0],[24,66,0,0]])
# Original array
print(numbers)

# Lists to store x, y and z values
xs = []
ys = []
zs = []
# Any removed row is added into this list
removed = []

# Code to delete a row
for line1 in numbers:
    for line2 in numbers:
        if line1[0] == line2[0]:
            if line2[1] == line2[1]:
                if line1[2] == 1 and line2[2] == 0:
                    removed.append(line1)
                if line1[2] == 0 and line2[2] == 2:
                    removed.append(line1)
                if line1[2] == 2 and line2[2] == 1:
                    removed.append(line1)
for i in removed:
    numbers = np.delete(numbers, i, axis=0)
for line in numbers:
    xs.append(line[0])
    ys.append(line[1])
    zs.append(line[2])

# Update the original array
for i in removed:
    print(removed)
print()
print("x\n", xs)
print("y\n", ys)
print("z\n", zs)
print()
# Updated array
print(numbers)
Test array
a = lifeforms = np.array([[12,15,2,0],
                          [13,13,0,0],
                          [13,13,1,0],
                          [13,13,2,0],
                          [65,23,1,0],
                          [24,66,2,0],
                          [14,14,1,0],
                          [14,14,1,1],
                          [14,14,1,2],
                          [14,14,2,0],
                          [15,15,3,2],
                          [15,15,2,0],
                          [65,23,0,0],
                          [24,66,0,0]])
Function that implements color selection.
test_one = np.array([[0,1],[1,0],[1,2],[2,1]])
test_two = np.array([[0,2],[2,0]])

def f(g):
    a = g.loc[:,2].unique()
    if np.any(np.all(a == test_one, axis=1)):
        idx = (g[2] == g[2].min()).idxmax()
    elif np.any(np.all(a == test_two, axis=1)):
        idx = (g[2] == g[2].max()).idxmax()
    else:
        raise ValueError('group colors outside bounds')
    return idx
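For example, the (65, 23) group in the test array has z values [1, 0]; that pair appears in test_one, so f returns the index of the row with the minimum z, keeping [65, 23, 0, 0].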
Groupby first two columns; iterate over groups; save indices of desired rows; use those indices to select rows from the DataFrame.
import pandas as pd

sep = '-' * 40  # separator for printed output (not defined in the original post)

df = pd.DataFrame(a)
gb = df.groupby([0,1])
indices = []
for k, g in gb:
    if g.loc[:,2].unique().shape[0] > 2:
        # print(f'(0,1,2) - dropping indices {g.index}')
        continue
    if g.shape[0] == 1:
        indices.extend(g.index.to_list())
        # print(f'unique - keeping index {g.index.values}')
        continue
    # print(g.loc[:,2])
    try:
        idx = f(g)
    except ValueError as e:
        print(sep)
        print(e)
        print(g)
        print(sep)
        continue
    # print(f'keeping index {idx}')
    indices.append(idx)
    # print(sep)
print(df.loc[indices,:])
If you can use pandas, you can do the following:
import pandas as pd

x = np.array([[12,15,2,0],[65,23,0,1],[24,66,2,0],[65,23,1,0],[24,66,0,0]])
df = pd.DataFrame(x)
new_df = df.iloc[df.loc[:,(0,1)].drop_duplicates().index]
print(new_df)

    0   1  2  3
0  12  15  2  0
1  65  23  0  1
2  24  66  2  0
What it does is the following:
- transform the array to a pandas DataFrame
- df.loc[:,(0,1)].drop_duplicates().index returns the indices of the rows you wish to keep (based on the first and second columns)
- df.iloc returns the sliced DataFrame
Edit based on OP questions in the comments and @wwii's remarks:
You can convert back to a numpy array using .to_numpy(), so just do arr = new_df.to_numpy().
You can try the following:
xx = np.array([[12,15,2,0],[65,23,1,0],[24,66,2,0],[65,23,0,0],[24,66,0,0]])
df = pd.DataFrame(xx)
df_new = df.groupby([0,1], group_keys=False).apply(lambda x: x.loc[x[2].idxmin()])
df_new.reset_index(drop=True, inplace=True)

    0   1  2  3
0  12  15  2  0
1  24  66  0  0
2  65  23  0  0
When there is a special heuristic to consider one can do the following:
import pandas as pd
import numpy as np

def f_(x):
    vals = x[2].tolist()
    if len(vals) == 2:
        # print(vals)
        if vals[0] == 0 and vals[1] == 1:
            return vals[0]
        elif vals[0] == 1 and vals[1] == 0:
            return vals[1]
        elif vals[0] == 1 and vals[1] == 2:
            return vals[0]
        elif vals[0] == 2 and vals[1] == 1:  # 1 removes 2 (added for completeness)
            return vals[1]
        elif vals[0] == 2 and vals[1] == 0:
            return vals[0]
        elif vals[0] == 0 and vals[1] == 2:  # 2 removes 0 (added for completeness)
            return vals[1]
    elif len(vals) > 2:
        return -1
    else:
        return x[2]

xx = np.array([[12,15,2,0],[65,23,1,0],[24,66,2,0],[65,23,0,0],[24,66,0,0]])
df = pd.DataFrame(xx)
df_new = df.groupby([0,1], group_keys=False).apply(lambda x: x.loc[x[2] == f_(x)])
df_new.reset_index(drop=True, inplace=True)
print(df_new)
    0   1  2  3
0  12  15  2  0
1  24  66  2  0
2  65  23  0  0
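Since the rule is cyclic (0 beats 1, 1 beats 2, 2 beats 0), the chain of comparisons in f_ can also be condensed into a lookup table. A sketch, with my own names, that behaves the same way on two-row groups:

# WINNER[(a, b)] is the z value kept when z values a and b meet in a pair.
WINNER = {(0, 1): 0, (1, 0): 0, (1, 2): 1, (2, 1): 1, (2, 0): 2, (0, 2): 2}

def f_cyclic(x):
    vals = x[2].tolist()
    if len(vals) == 2:
        return WINNER.get(tuple(vals), -1)  # -1 matches nothing, dropping the group
    if len(vals) > 2:
        return -1
    return x[2]

df_new = df.groupby([0, 1], group_keys=False).apply(lambda x: x.loc[x[2] == f_cyclic(x)])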