Pandas Dataframe Merging Issue - python

I need to merge the following 2 dataframes:
df1:
A B C D F
0 1 a zz 10 11
1 1 a zz 15 11
2 2 b yy 20 12
3 3 c xx 30 13
4 4 d ww 40 14
5 5 e vv 50 15
6 6 f uu 60 16
7 7 g NaN 70 17
8 8 h ss 80 18
9 9 NaN rr 90 19
10 13 m nn 130 113
11 15 o ll 150 115
df2:
A B C D G
0 1 NaN zz 15 100
1 6 f uu 60 600
2 7 g tt 70 700
3 10 j qq 100 1000
4 12 l NaN 120 1200
5 14 n NaN 140 1400
The merged dataframe should be:
A B C D F G
0 1 a zz 10 11 None
1 1 a zz 15 11 100
2 2 b yy 20 12 None
3 3 c xx 30 13 None
4 4 d ww 40 14 None
5 5 e vv 50 15 None
6 6 f uu 60 16 600
7 7 g tt 70 17 700
8 8 h ss 80 18 None
9 9 NaN rr 90 19 None
10 13 m nn 130 113 None
11 15 o ll 150 115 None
12 10 j qq 100 None 1000
13 12 l NaN 120 None 1200
14 14 n NaN 140 None 1400
Following is the code to generate df1 and df2:
import numpy as np
import pandas as pd

df1 = pd.DataFrame({'A': [1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 15],
                    'B': ['a', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', np.nan, 'm', 'o'],
                    'C': ['zz', 'zz', 'yy', 'xx', 'ww', 'vv', 'uu', np.nan, 'ss', 'rr', 'nn', 'll'],
                    'D': [10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 130, 150],
                    'F': [11, 11, 12, 13, 14, 15, 16, 17, 18, 19, 113, 115]})
df2 = pd.DataFrame({'A': [1, 6, 7, 10, 12, 14],
                    'B': [np.nan, 'f', 'g', 'j', 'l', 'n'],
                    'C': ['zz', 'uu', 'tt', 'qq', np.nan, np.nan],
                    'D': [15, 60, 70, 100, 120, 140],
                    'G': [100, 600, 700, 1000, 1200, 1400]})
I tried the following methods:
md1 = df1.merge(df2, how='outer')
md2 = df1.merge(df2, how='outer', on=['A', 'D'])
md3 = df1.merge(df2, how='outer', left_on=['A', 'D'], right_on=['A', 'D'])
md4 = df1.merge(df2, how='outer', left_on=['A', 'B', 'C', 'D'], right_on=['A', 'B', 'C', 'D'])
Following are the results of md1 and md4 (same result):
print(md1.to_string())
A B C D F G
0 1 a zz 10 11.0 NaN
1 1 a zz 15 11.0 NaN
2 2 b yy 20 12.0 NaN
3 3 c xx 30 13.0 NaN
4 4 d ww 40 14.0 NaN
5 5 e vv 50 15.0 NaN
6 6 f uu 60 16.0 600.0
7 7 g NaN 70 17.0 NaN
8 8 h ss 80 18.0 NaN
9 9 NaN rr 90 19.0 NaN
10 13 m nn 130 113.0 NaN
11 15 o ll 150 115.0 NaN
12 1 NaN zz 15 NaN 100.0
13 7 g tt 70 NaN 700.0
14 10 j qq 100 NaN 1000.0
15 12 l NaN 120 NaN 1200.0
16 14 n NaN 140 NaN 1400.0
Following are the results of md2 and md3 (same result):
print(md2.to_string())
A B_x C_x D F B_y C_y G
0 1 a zz 10 11.0 NaN NaN NaN
1 1 a zz 15 11.0 NaN zz 100.0
2 2 b yy 20 12.0 NaN NaN NaN
3 3 c xx 30 13.0 NaN NaN NaN
4 4 d ww 40 14.0 NaN NaN NaN
5 5 e vv 50 15.0 NaN NaN NaN
6 6 f uu 60 16.0 f uu 600.0
7 7 g NaN 70 17.0 g tt 700.0
8 8 h ss 80 18.0 NaN NaN NaN
9 9 NaN rr 90 19.0 NaN NaN NaN
10 13 m nn 130 113.0 NaN NaN NaN
11 15 o ll 150 115.0 NaN NaN NaN
12 10 NaN NaN 100 NaN j qq 1000.0
13 12 NaN NaN 120 NaN l NaN 1200.0
14 14 NaN NaN 140 NaN n NaN 1400.0
But none of the above results is what I need from the merge operation!
So, I wrote a function to get what I want:
def merge_df(d1, d2, on_columns):
    d1_row_count = d1.shape[0]
    d2_row_count = d2.shape[0]
    d1_columns = list(d1.columns)
    d2_columns = list(d2.columns)
    extra_columns_in_d1 = []
    extra_columns_in_d2 = []
    common_columns = []
    for c in d1_columns:
        if c not in d2_columns:
            extra_columns_in_d1.append(c)
        else:
            common_columns.append(c)
    for c in d2_columns:
        if c not in d1_columns:
            extra_columns_in_d2.append(c)
    print(common_columns)
    # start with the merged dataframe equal to d1
    md = d1.copy(deep=True)
    # Append the extra columns to md (with None values in the newly appended columns)
    for c in extra_columns_in_d2:
        md[c] = [None] * d1_row_count
    d1_new_row_number = d1_row_count
    # iterate through each row in d2
    for i in range(d2_row_count):
        # create the match query string
        d1_match_condition = ''
        for p, c in enumerate(on_columns):
            d1_match_condition += c + ' == ' + str(d2.loc[i, c])
            if p < (len(on_columns) - 1):
                d1_match_condition += ' and '
        match_in_d1 = d1.query(d1_match_condition)
        # if no match is found, then append the row
        if match_in_d1.shape[0] == 0:
            # build a list representing the row to append
            row_list = []
            for c in common_columns:
                row_list.append(d2.loc[i, c])
            for c in extra_columns_in_d1:
                row_list.append(None)
            for c in extra_columns_in_d2:
                row_list.append(d2.loc[i, c])
            md.loc[d1_new_row_number] = row_list
            d1_new_row_number += 1
        # if a match is found, then modify the found row
        else:
            match_in_d1_index = list(match_in_d1.index)[0]
            for c in common_columns:
                if (md.loc[match_in_d1_index, c]) is None or (md.loc[match_in_d1_index, c]) is np.nan:
                    md.loc[match_in_d1_index, c] = d2.loc[i, c]
            for c in extra_columns_in_d2:
                md.loc[match_in_d1_index, c] = d2.loc[i, c]
    return md
When I use this function, I get the desired merged dataframe:
md5 = merge_df(df1, df2, ['A', 'D'])
Am I missing something basic with the inbuilt dataframe merge method to get the desired result?

You could merge first and then use .assign and .combine_first. The resulting columns of the merge need to be put together correctly: take the value from the right dataframe and overwrite it with the left dataframe's value wherever the left has an entry at that position. This is what .combine_first does.
m = pd.merge(df1, df2, on=['A', 'D'], how='outer')
m.assign(B=m['B_x'].combine_first(m['B_y']), C=m['C_x'].combine_first(m['C_y']))\
 .drop(['B_x', 'C_x', 'B_y', 'C_y'], axis=1)[['A', 'B', 'C', 'D', 'F', 'G']]
result
A B C D F G
0 1 a zz 10 11.0 NaN
1 1 a zz 15 11.0 100.0
2 2 b yy 20 12.0 NaN
3 3 c xx 30 13.0 NaN
4 4 d ww 40 14.0 NaN
5 5 e vv 50 15.0 NaN
6 6 f uu 60 16.0 600.0
7 7 g tt 70 17.0 700.0
8 8 h ss 80 18.0 NaN
9 9 NaN rr 90 19.0 NaN
10 13 m nn 130 113.0 NaN
11 15 o ll 150 115.0 NaN
12 10 j qq 100 NaN 1000.0
13 12 l NaN 120 NaN 1200.0
14 14 n NaN 140 NaN 1400.0

The format of your merge operation is wrong. Try the following code:
result = df1.merge(df2, on=['A', 'D'], how='outer')
Try this:
df1 = df1.merge(df2, on=['A', 'D'], how='outer')
df1['C'] = df1[['C_x', 'C_y']].apply(lambda x: x['C_y'] if pd.isna(x['C_x']) else x['C_x'], axis=1)
df1['B'] = df1[['B_x', 'B_y']].apply(lambda x: x['B_y'] if pd.isna(x['B_x']) else x['B_x'], axis=1)
df1 = df1.drop(labels=['B_x', 'B_y', 'C_x', 'C_y'], axis=1)
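As a shorter variant (my sketch, not from the original answers, and assuming the (A, D) pairs uniquely identify rows in each frame), you can align both frames on the join keys and let combine_first fill the gaps; note the rows come back sorted by (A, D) rather than in the original order:
merged = (df1.set_index(['A', 'D'])
             .combine_first(df2.set_index(['A', 'D']))   # df1 values win, df2 fills gaps and adds new rows
             .reset_index()[['A', 'B', 'C', 'D', 'F', 'G']])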

Related

Pandas fill in group if condition is met

I have a DataFrame where I am looking to fill in values in a column based on their grouping. I only want to fill in the values (by propagating non-NaN values using ffill and bfill) if there is only one unique value in the column to be filled; otherwise, it should be left as is. My code below has a sample dataset where I try to do this, but I get an error.
Code:
df = pd.DataFrame({"A": [1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6, 6],
"B": ['a', 'a', np.nan, 'b', 'b', 'c', np.nan, 'd', np.nan, 'e', 'e', np.nan, 'h', 'h'],
"C": [5.0, np.nan, 4.0, 4.0, np.nan, 9.0, np.nan, np.nan, 9.0, 8.0, np.nan, 2.0, np.nan, np.nan]})
col_to_groupby = "A"
col_to_modify = "B"
group = df.groupby(col_to_groupby)
modified = group[group[col_to_modify].nunique() == 1].transform(lambda x: x.ffill().bfill())
df.update(modified)
Error:
KeyError: 'Columns not found: False, True'
Original dataset:
A B C
0 1 a 5.0
1 1 a NaN
2 2 NaN 4.0
3 2 b 4.0
4 2 b NaN
5 3 c 9.0
6 3 NaN NaN
7 3 d NaN
8 3 NaN 9.0
9 4 e 8.0
10 4 e NaN
11 5 NaN 2.0
12 6 h NaN
13 6 NaN NaN
Desired result:
A B C
0 1 a 5.0
1 1 a NaN
2 2 b 4.0
3 2 b 4.0
4 2 b NaN
5 3 c 9.0
6 3 NaN NaN
7 3 d NaN
8 3 NaN 9.0
9 4 e 8.0
10 4 e NaN
11 5 NaN 2.0
12 6 h NaN
13 6 h NaN
The above is the desired result because
row index 2 is in group 2, which only has 1 unique value in column B ("b"), so it is changed.
row indices 6 and 8 are in group 3, but there are 2 unique values in column B ("c" and "d"), so they are unaltered.
row index 11 is in group 5, but has no data in column B to propagate.
row index 13 is in group 6, which only has 1 unique value in column B ("h"), so it is changed.
The error occurs because indexing a GroupBy object like group[...] selects columns, so the boolean Series is interpreted as a list of column labels (hence "Columns not found: False, True"). One option is to add the condition inside groupby.apply:
df[col_to_modify] = df.groupby(col_to_groupby)[col_to_modify].apply(lambda x: x.ffill().bfill() if x.nunique()==1 else x)
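Note that x.nunique() ignores NaN, so an all-NaN group (group 5 here) has nunique() == 0 and is left untouched, which matches the desired output for row index 11.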
Another option is to use groupby + transform('nunique') + eq to create a boolean filter for the groups with exactly one unique value, then update those rows with groupby + transform('first') (first skips NaN) using where:
g = df.groupby(col_to_groupby)[col_to_modify]
df[col_to_modify] = g.transform('first').where(g.transform('nunique').eq(1), df[col_to_modify])
Output:
A B C
0 1 a 5.0
1 1 a NaN
2 2 b 4.0
3 2 b 4.0
4 2 b NaN
5 3 c 9.0
6 3 NaN NaN
7 3 d NaN
8 3 NaN 9.0
9 4 e 8.0
10 4 e NaN
11 5 NaN 2.0
12 6 h NaN
13 6 h NaN
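For readability, here is the same second option with the intermediate mask named explicitly (a restatement of the one-liner above, nothing new):
g = df.groupby(col_to_groupby)[col_to_modify]
single_valued = g.transform('nunique').eq(1)   # True on rows whose group has exactly one unique value
fill_value = g.transform('first')              # the group's first non-NaN value, broadcast to every row
df[col_to_modify] = fill_value.where(single_valued, df[col_to_modify])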

How can I fill missing data in my dataframe faster for a big dataset, and without a SettingWithCopyWarning?

I have a dataframe with the count of people per day and per location. Without any missing data, I expect to have 4 lines per day: 2 locations and 2 genders. Some data is missing and should be replaced by the mean count, but only if that location has data for that gender on the day before.
If data is missing for more than 1 day, I assume that there is supposed to be no data. So, for example, in my example dataframe: Day 2, Location X, Gender F should be filled, because Day 1, Location X, Gender F exists. But Day 4, Location Y, Gender F must stay empty, because Day 3, Location Y, Gender F does not exist.
The code below works for this small dataframe, but it's really slow for my large dataset. Is there a way to do this faster?
Can I avoid the SettingWithCopyWarnings in this case?
import pandas as pd
import numpy as np
import random
data = pd.DataFrame({'day': [1, 1, 2, 3, 3, 4, 5, 1, 2],
                     'location': ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'Y', 'Y'],
                     'gender': ['F', 'M', 'M', 'F', 'M', 'F', 'F', 'F', 'F'],
                     'count': random.sample(range(10, 30), 9)})
print(data.sort_values('day').reset_index(drop=True))
day location gender count
0 1 X F 17
1 1 X M 10
2 1 Y F 12
3 2 X M 20
4 2 Y F 15
5 3 X F 24
6 3 X M 29
7 4 X F 11
8 5 X F 14
data2 = pd.DataFrame()
for e, today in enumerate(list(set(data['day'].sort_values()))[1:]):
    yesterday = (list(set(data['day'].sort_values()))[e])
    today_df = data[(data['day']==today)].set_index(['location','gender'])
    yesterday_df = data[(data['day']==yesterday)].set_index(['location','gender'])
    today_missing = [[i[0],i[1]] for i in yesterday_df.index if i not in today_df.index]
    for i in today_missing:
        new_row = data[(data['day']==yesterday) & (data['location']==i[0]) & (data['gender']==i[1])]
        new_row['day'] = today
        new_row['count'] = int(np.mean(data['count'][(data['location']==i[0]) & (data['gender']==i[1])]))
        data2 = data2.append(new_row, ignore_index=True)
data = data.append(data2).sort_values('day').reset_index(drop=True)
print(data)
day location gender count
0 1 X F 17
1 1 X M 10
2 1 Y F 12
3 2 X M 20
4 2 Y F 15
5 2 X F 16
6 3 X F 24
7 3 X M 29
8 3 Y F 13
9 4 X F 11
10 4 X M 19
11 5 X F 14
One solution can be to re-generate the possible combinations of location, gender and day:
df = (data.set_index(['location', 'gender', 'day'])
          .reindex(pd.MultiIndex.from_product(
              [['X', 'Y'], ['F', 'M'], range(1, 8)],
              names=['location', 'gender', 'day'])))
count
location gender day
X F 1 17.0
2 NaN
3 24.0
4 11.0
5 14.0
6 NaN
7 NaN
M 1 10.0
2 20.0
3 29.0
4 NaN
5 NaN
6 NaN
7 NaN
Y F 1 12.0
2 15.0
3 NaN
4 NaN
5 NaN
6 NaN
7 NaN
M 1 NaN
2 NaN
3 NaN
4 NaN
5 NaN
6 NaN
7 NaN
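As a small aside (my sketch, not part of the original answer), the same full index can be built from the data itself instead of hard-coding the locations, genders and day range; note this covers days 1-5, which are present in the data, whereas the answer above extends to day 7:
full_index = pd.MultiIndex.from_product(
    [data['location'].unique(), data['gender'].unique(),
     range(data['day'].min(), data['day'].max() + 1)],
    names=['location', 'gender', 'day'])
df = data.set_index(['location', 'gender', 'day']).reindex(full_index)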
1: Solution filling with mean per location, gender group
df.groupby(['location', 'gender']).transform(lambda x: x.fillna(x.mean(), limit=1)).dropna()
count
location gender day
X F 1 17.000000
2 16.500000
3 24.000000
4 11.000000
5 14.000000
M 1 10.000000
2 20.000000
3 29.000000
4 19.666667
Y F 1 12.000000
2 15.000000
3 13.500000
2: Solution interpolating linearly between days
Another solution can be to interpolate between days within the [location, gender] groups, with a limit of 1 day filling:
df.interpolate(level=['location', 'gender'], limit=1).dropna()
count
location gender day
X F 1 17.000000
2 20.500000
3 24.000000
4 11.000000
5 14.000000
6 12.666667
M 1 10.000000
2 20.000000
3 29.000000
4 25.600000
Y F 1 12.000000
2 15.000000
3 15.000000
You can remove the MultiIndex with df.reset_index(). Hope it helps.
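On the SettingWithCopyWarning part of the question (a side note based on standard pandas behaviour, not on the original answer): the warning in the loop comes from assigning to new_row, which is a slice of data. Taking an explicit copy avoids it, for example:
# inside the inner loop of the original code
new_row = data[(data['day'] == yesterday)
               & (data['location'] == i[0])
               & (data['gender'] == i[1])].copy()  # .copy() so later assignments modify an independent frame
new_row['day'] = today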

Using pandas cut function with groupby and group-specific bins

I have the following sample DataFrame
import pandas as pd
import numpy as np
df = pd.DataFrame({'Tag': ['A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'C', 'C'],
                   'ID': [11, 12, 16, 19, 14, 9, 4, 13, 6, 18, 21, 1, 2],
                   'Value': [1, 13, 11, 12, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
to which I add the percentage of the Value using
df['Percent_value'] = df['Value'].rank(method='dense', pct=True)
and add the Order using pd.cut() with pre-defined percentage bins
percentage = np.array([10, 20, 50, 70, 100])/100
df['Order'] = pd.cut(df['Percent_value'], bins=np.insert(percentage, 0, 0), labels = [1,2,3,4,5])
which gives
Tag ID Value Percent_value Order
0 A 11 1 0.076923 1
1 A 12 13 1.000000 5
2 A 16 11 0.846154 5
3 B 19 12 0.923077 5
4 B 14 2 0.153846 2
5 B 9 3 0.230769 3
6 B 4 4 0.307692 3
7 C 13 5 0.384615 3
8 C 6 6 0.461538 3
9 C 18 7 0.538462 4
10 C 21 8 0.615385 4
11 C 1 9 0.692308 4
12 C 2 10 0.769231 5
My Question
Now, instead of having a single percentage array (bins) for all Tags (groups), I have a separate percentage array for each Tag group, i.e., A, B and C. How can I apply df.groupby('Tag') and then apply pd.cut() using different percentage bins for each group from the following dictionary? Is there some direct way, avoiding for loops as I do below?
percentages = {'A': np.array([10, 20, 50, 70, 100]) / 100,
               'B': np.array([20, 40, 60, 90, 100]) / 100,
               'C': np.array([30, 50, 60, 80, 100]) / 100}
Desired outcome (Note: Order is now computed for each Tag using different bins):
Tag ID Value Percent_value Order
0 A 11 1 0.076923 1
1 A 12 13 1.000000 5
2 A 16 11 0.846154 5
3 B 19 12 0.923077 5
4 B 14 2 0.153846 1
5 B 9 3 0.230769 2
6 B 4 4 0.307692 2
7 C 13 5 0.384615 2
8 C 6 6 0.461538 2
9 C 18 7 0.538462 3
10 C 21 8 0.615385 4
11 C 1 9 0.692308 4
12 C 2 10 0.769231 4
My Attempt
orders = []
for k, g in df.groupby(['Tag']):
    percentage = percentages[k]
    g['Order'] = pd.cut(g['Percent_value'], bins=np.insert(percentage, 0, 0), labels=[1, 2, 3, 4, 5])
    orders.append(g)
df_final = pd.concat(orders, axis=0, join='outer')
You can apply pd.cut within groupby:
df['Order'] = df.groupby('Tag').apply(lambda x: pd.cut(x['Percent_value'], bins=np.insert(percentages[x.name], 0, 0), labels=[1, 2, 3, 4, 5])).reset_index(drop=True)
Tag ID Value Percent_value Order
0 A 11 1 0.076923 1
1 A 12 13 1.000000 5
2 A 16 11 0.846154 5
3 B 19 12 0.923077 5
4 B 14 2 0.153846 1
5 B 9 3 0.230769 2
6 B 4 4 0.307692 2
7 C 13 5 0.384615 2
8 C 6 6 0.461538 2
9 C 18 7 0.538462 3
10 C 21 8 0.615385 4
11 C 1 9 0.692308 4
12 C 2 10 0.769231 4
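A variant of the same idea (my sketch, not from the original answer) aligns on the original row index instead of relying on row order, which is safer if the frame is not sorted by Tag; group_keys=False keeps the original index, so the assignment aligns by label:
order = df.groupby('Tag', group_keys=False).apply(
    lambda g: pd.cut(g['Percent_value'],
                     bins=np.insert(percentages[g.name], 0, 0),
                     labels=[1, 2, 3, 4, 5]))
df['Order'] = order  # aligns on the original index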

Pandas: moving data from two dataframes to another with tuple index

I have three dataframes like the following:
final_df
other ref
(2014-12-24 13:20:00-05:00, a) NaN NaN
(2014-12-24 13:40:00-05:00, b) NaN NaN
(2018-07-03 14:00:00-04:00, d) NaN NaN
ref_df
a b c d
2014-12-24 13:20:00-05:00 1 2 3 4
2014-12-24 13:40:00-05:00 2 3 4 5
2017-11-24 13:10:00-05:00 ..............
2018-07-03 13:25:00-04:00 ..............
2018-07-03 14:00:00-04:00 9 10 11 12
2019-07-03 13:10:00-04:00 ..............
other_df
a b c d
2014-12-24 13:20:00-05:00 10 20 30 40
2014-12-24 13:40:00-05:00 20 30 40 50
2017-11-24 13:10:00-05:00 ..............
2018-07-03 13:20:00-04:00 ..............
2018-07-03 13:25:00-04:00 ..............
2018-07-03 14:00:00-04:00 90 100 110 120
2019-07-03 13:10:00-04:00 ..............
And I need to replace the NaN values in my final_df with the values from the related dataframes, like this:
other ref
(2014-12-24 13:20:00-05:00, a) 10 1
(2014-12-24 13:40:00-05:00, b) 30 3
(2018-07-03 14:00:00-04:00, d) 120 12
How can I get it?
pandas.DataFrame.lookup
final_df['ref'] = ref_df.lookup(*zip(*final_df.index))
final_df['other'] = other_df.lookup(*zip(*final_df.index))
map and get
For when you have missing bits
final_df['ref'] = list(map(ref_df.stack().get, final_df.index))
final_df['other'] = list(map(other_df.stack().get, final_df.index))
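A few words on why this handles missing bits (my explanation, not in the original answer): stack() turns each frame into a Series indexed by (row label, column label) tuples with NaNs dropped, and Series.get returns None when a key is absent, so index tuples missing from ref_df or other_df simply produce None instead of raising.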
Demo
Setup
idx = pd.MultiIndex.from_tuples([(1, 'a'), (2, 'b'), (3, 'd')])
final_df = pd.DataFrame(index=idx, columns=['other', 'ref'])
ref_df = pd.DataFrame([
    [1,  2,  3,  4],
    [2,  3,  4,  5],
    [9, 10, 11, 12]
], [1, 2, 3], ['a', 'b', 'c', 'd'])
other_df = pd.DataFrame([
    [10,  20,  30,  40],
    [20,  30,  40,  50],
    [90, 100, 110, 120]
], [1, 2, 3], ['a', 'b', 'c', 'd'])
print(final_df, ref_df, other_df, sep='\n\n')
other ref
1 a NaN NaN
2 b NaN NaN
3 d NaN NaN
a b c d
1 1 2 3 4
2 2 3 4 5
3 9 10 11 12
a b c d
1 10 20 30 40
2 20 30 40 50
3 90 100 110 120
Result
final_df['ref'] = ref_df.lookup(*zip(*final_df.index))
final_df['other'] = other_df.lookup(*zip(*final_df.index))
final_df
other ref
1 a 10 1
2 b 30 3
3 d 120 12
Another solution that can work with missing dates in ref_df and other_df:
index = pd.MultiIndex.from_tuples(final_df.index)
ref = ref_df.stack().rename('ref')
other = other_df.stack().rename('other')
result = pd.DataFrame(index=index).join(ref).join(other)
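One caveat from my side (not part of the original answers): DataFrame.lookup was deprecated in pandas 1.2 and removed in 2.0. On current pandas an equivalent is to index the underlying array directly; a sketch assuming the demo frames above:
rows, cols = zip(*final_df.index)
ridx = ref_df.index.get_indexer(rows)      # positional row indices
cidx = ref_df.columns.get_indexer(cols)    # positional column indices
final_df['ref'] = ref_df.to_numpy()[ridx, cidx]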

Why is pandas unstack throwing an error?

I am trying to unstack two columns:
cols = res.columns[:31]
res[cols] = res[cols].ffill()
res = res.set_index(cols + [31])[32].unstack().reset_index().rename_axis(None, 1)
But I am getting an error:
TypeError: can only perform ops with scalar values
What should I do to avoid it?
My original problem: LINK
I think you need to convert the columns to a list:
cols = res.columns[:31].tolist()
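With that change the original snippet becomes (a sketch, assuming res has the same columns as in the linked problem):
cols = res.columns[:31].tolist()   # a plain list, so cols + [31] concatenates instead of raising
res[cols] = res[cols].ffill()
res = res.set_index(cols + [31])[32].unstack().reset_index().rename_axis(None, axis=1)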
EDIT:
The error Index contains duplicate entries, cannot reshape means there are duplicates, here in the first 6 columns. It is impossible to create the new DataFrame, because the first 6 columns create the new index, the 7th column creates the new columns, and for the 8th column there are then 2 values:
0 1 2 3 4 5 6 7
0 xx s 1 d f df f 54
1 xx s 1 d f df f g4
New DataFrame:
index = xx s 1 d f df
column = f
value = 54
index = xx s 1 d f df
column = f
value = g4
So the solution is to aggregate; here we are working with strings, so we need .apply(', '.join):
index = xx s 1 d f df
column = f
value = 54, g4
Or remove duplicates, keeping the first or last value of the duplicated rows, with drop_duplicates:
index = xx s 1 d f df
column = f
value = 54
index = xx s 1 d f df
column = f
value = g4
res = pd.DataFrame({0: ['xx', np.nan, np.nan, np.nan, 'ds', np.nan, np.nan, np.nan, np.nan, 'as'],
                    1: ['s', np.nan, np.nan, np.nan, 'a', np.nan, np.nan, np.nan, np.nan, 't'],
                    2: ['1', np.nan, np.nan, np.nan, 's', np.nan, np.nan, np.nan, np.nan, 'r'],
                    3: ['d', np.nan, np.nan, np.nan, 'd', np.nan, np.nan, np.nan, np.nan, 'a'],
                    4: ['f', np.nan, np.nan, np.nan, 'f', np.nan, np.nan, np.nan, np.nan, '2'],
                    5: ['df', np.nan, np.nan, np.nan, 'ds', np.nan, np.nan, np.nan, np.nan, 'ds'],
                    6: ['f', 'f', 'x', 'r', 'f', 'd', 's', '1', '3', 'k'],
                    7: ['54', 'g4', 'r4', '43', '64', '43', 'se', 'gf', 's3', 's4']})
cols = res.columns[:6].tolist()
res[cols] = res[cols].ffill()
print (res)
0 1 2 3 4 5 6 7
0 xx s 1 d f df f 54
1 xx s 1 d f df f g4
2 xx s 1 d f df x r4
3 xx s 1 d f df r 43
4 ds a s d f ds f 64
5 ds a s d f ds d 43
6 ds a s d f ds s se
7 ds a s d f ds 1 gf
8 ds a s d f ds 3 s3
9 as t r a 2 ds k s4
res = res.groupby(cols + [6])[7].apply(', '.join).unstack().reset_index().rename_axis(None, axis=1)
print (res)
0 1 2 3 4 5 1 3 d f k r s x
0 as t r a 2 ds NaN NaN NaN NaN s4 NaN NaN NaN
1 ds a s d f ds gf s3 43 64 NaN NaN se NaN
2 xx s 1 d f df NaN NaN NaN 54, g4 NaN 43 NaN r4 <-54, g4
Another solution is to remove duplicates:
res = res.drop_duplicates(cols + [6])
res = res.set_index(cols + [6])[7].unstack().reset_index().rename_axis(None, axis=1)
print (res)
0 1 2 3 4 5 1 3 d f k r s x
0 as t r a 2 ds NaN NaN NaN NaN s4 NaN NaN NaN
1 ds a s d f ds gf s3 43 64 NaN NaN se NaN
2 xx s 1 d f df NaN NaN NaN 54 NaN 43 NaN r4 <- 54
res = res.drop_duplicates(cols + [6], keep='last')
res = res.set_index(cols + [6])[7].unstack().reset_index().rename_axis(None, axis=1)
print (res)
0 1 2 3 4 5 1 3 d f k r s x
0 as t r a 2 ds NaN NaN NaN NaN s4 NaN NaN NaN
1 ds a s d f ds gf s3 43 64 NaN NaN se NaN
2 xx s 1 d f df NaN NaN NaN g4 NaN 43 NaN r4 <- g4
