Combine Dataframe rows to fill in missing data - python

Suppose I have a dataframe with rows containing missing data, but a set of columns acting as a key:
import pandas as pd
import numpy as np
data = {"id": [1, 1, 2, 2, 3, 3, 4 ,4], "name": ["John", "John", "Paul", "Paul", "Ringo", "Ringo", "George", "George"], "height": [178, np.nan, 182, np.nan, 175, np.nan, 188, np.nan], "weight": [np.nan, np.NaN, np.nan, 72, np.nan, 68, np.nan, 70]}
df = pd.DataFrame.from_dict(data)
print(df)
id name height weight
0 1 John 178.0 NaN
1 1 John NaN NaN
2 2 Paul 182.0 NaN
3 2 Paul NaN 72.0
4 3 Ringo 175.0 NaN
5 3 Ringo NaN 68.0
6 4 George 188.0 NaN
7 4 George NaN 70.0
How would I go about "squashing" these rows with duplicate keys down to pick the non-NaN value (if it exists)?
desired output:
id name height weight
0 1 John 178.0 NaN
2 2 Paul 182.0 72.0
4 3 Ringo 175.0 68.0
6 4 George 188.0 70.0
The index doesn't matter, and there is always at most one row with Non-NaN data. I think I need to use groupby(['id', 'name']), but I'm not sure where to go from there.

If there is always at most one non-NaN value per group, you can aggregate in several ways:
df = df.groupby(['id', 'name'], as_index=False).first()
Or:
df = df.groupby(['id', 'name'], as_index=False).last()
Or:
df = df.groupby(['id', 'name'], as_index=False).mean()
Or:
df = df.groupby(['id', 'name'], as_index=False).sum(min_count=1)
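One hedged note on the last option: min_count=1 matters because GroupBy.sum skips NaN by default, so a group whose values are all NaN would otherwise collapse to 0.0 instead of staying NaN. A minimal sketch with the sample frame above:
# John has no recorded weight at all; compare the two sums
df.groupby(['id', 'name'], as_index=False).sum()             # John's weight becomes 0.0
df.groupby(['id', 'name'], as_index=False).sum(min_count=1)  # John's weight stays NaN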

Related

Minimizing rows with a merge/squish in Pandas DataFrame with Multiple indexes

With a DataFrame like,
import pandas as pd
import numpy as np
df = pd.DataFrame({
'id_1': [33,33,33,33,22,22,88,100],
'id_2': [64,64,64,64,12,12,77,100],
'col_1': [np.nan, 'dog', np.nan, 'kangaroo', np.nan, np.nan, np.nan, np.nan],
'col_2': ['bike', 'car', np.nan, np.nan, 'train', np.nan, 'horse', np.nan],
'col_3': [np.nan, np.nan, 'star', 'meteor', np.nan, 'rock', np.nan, np.nan]
})
"""
id_1 id_2 col_1 col_2 col_3
0 33 64 NaN bike NaN
1 33 64 dog car NaN
2 33 64 NaN NaN star
3 33 64 kangaroo NaN meteor
4 22 12 NaN train NaN
5 22 12 NaN NaN rock
6 88 77 NaN horse NaN
7 100 100 NaN NaN NaN
"""
How can it be transformed into the minimum number of rows, without aggregating or losing data, like the following?
id_1 id_2 col_1 col_2 col_3
0 33 64 dog bike star
1 33 64 kangaroo car meteor
3 22 12 NaN train rock
4 88 77 NaN horse NaN
5 100 100 NaN NaN NaN
Basically, for each group of id_X columns, the col_X columns' NaN values are replaced with other group values if applicable.
# melt (wide to long) on id_1, id_2 and sort the values
# sorting pushes the NaN values to the end of each group
df2 = df.melt(id_vars=['id_1', 'id_2'], var_name='col').sort_values(['id_1', 'id_2', 'col', 'value'])
# create a seq to make the keys unique, then pivot
df3 = (df2.assign(seq=df2.groupby(['id_1', 'id_2', 'col']).cumcount())
          .pivot(index=['id_1', 'id_2', 'seq'], columns=['col'], values='value')
          .reset_index())
# for id_1 = 100 you have all NaN but still want to keep the row,
# so remove rows that are all NaN except when seq == 0
df3 = df3.loc[~((df3['seq'] > 0) &
                (df3[['col_1', 'col_2', 'col_3']].isna().all(axis=1)))]
# drop the seq (temp) column
df3.drop(columns='seq', inplace=True)
df3
col id_1 id_2 col_1 col_2 col_3
0 22 12 NaN train rock
2 33 64 dog bike meteor
3 33 64 kangaroo car star
6 88 77 NaN horse NaN
7 100 100 NaN NaN NaN
Another possible solution:
# this pushes all non-NaN values to the top of each column within each group
df.loc[:, 'col_1':'col_3'] = df.groupby(
    ['id_1', 'id_2'], sort=False).transform(lambda x: sorted(x, key=pd.isnull))
# this removes the rows left with nothing but NaN
df.loc[~(df.duplicated(['id_1', 'id_2']) &
         df.loc[:, 'col_1':'col_3'].isna().all(axis=1))]
Output:
id_1 id_2 col_1 col_2 col_3
0 33 64 dog bike star
1 33 64 kangaroo car meteor
4 22 12 NaN train rock
6 88 77 NaN horse NaN
7 100 100 NaN NaN NaN
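The key=pd.isnull trick works because False sorts before True and Python's sort is stable, so within each group the non-NaN values float to the top in their original order. A tiny standalone illustration (pd and np as imported above; not part of the original answer):
sorted(['a', np.nan, 'b'], key=pd.isnull)   # ['a', 'b', nan]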
To avoid illegible Pandas voodoo, after your imports and df instantiation, you can do
def get_max_vals_from_row_sets(row, cols):
    # the largest set in the row decides how many output rows this group needs
    mn = 1
    for col in cols:
        mn = max(mn, len(row[col]))
    return mn

def add_id_row(d, row, ids, cols):
    max_vals = get_max_vals_from_row_sets(row, cols)
    for _ in range(max_vals):
        for id_ in ids:
            d[id_].append(row[id_])
        for col in cols:
            if len(row[col]) != 0:
                d[col].append(row[col].pop())
            else:
                d[col].append(np.nan)

def drop_set_nans(row, cols):
    for col in cols:
        if np.nan in row[col]:
            row[col].remove(np.nan)
    return row

def squash_out_redundant_nans(df, ids, cols):
    df = df.groupby(ids).agg(set).reset_index()
    d = {k: [] for k in df.columns}
    for _, row in df.iterrows():  # iterate over the aggregated frame, not the original
        drop_set_nans(row, cols)
        add_id_row(d, row, ids, cols)
    df = pd.DataFrame(d)
    return df

ids = ['id_1', 'id_2']
cols = ['col_1', 'col_2', 'col_3']
df = squash_out_redundant_nans(df, ids, cols)
print(df)

Drop rows if some columns have some value

I have this df:
import pandas as pd
import numpy as np
d = {'name': ['bob', 'jake', 'Sem'],
     'F1': [3, 4, np.nan],
     'F2': [14, 40, 7],
     'F3': [np.nan, 1, np.nan]}
df = pd.DataFrame(data=d)
print (df)
out>>>
name F1 F2 F3
0 bob 3.0 14 NaN
1 jake 4.0 40 1.0
2 Sem NaN 7 NaN
I would like to delete all rows where at least 2 of the columns F1, F2 and F3 are NaN.
Like:
name F1 F2 F3
0 bob 3.0 14 NaN
1 jake 4.0 40 1.0
This is just an example, but I may have many more columns (up to F100), and I may want to use a different threshold instead of 2 out of 3 columns.
What is the best way to do this?
You can use the subset and thresh parameters of dropna:
df.dropna(subset=['F1', 'F2', 'F3'], thresh=2)
Example:
import pandas as pd
import numpy as np
d = {'name': ['bob', 'jake', 'Sem', 'Mario'],
     'F1': [3, 4, np.nan, np.nan],
     'F2': [14, 40, 7, 42],
     'F3': [np.nan, 1, 55, np.nan]}
df = pd.DataFrame(data=d)
print(df)
name F1 F2 F3
0 bob 3.0 14 NaN
1 jake 4.0 40 1.0
2 Sem NaN 7 55.0
3 Mario NaN 42 NaN
df2 = df.dropna(subset=['F1', 'F2', 'F3'], thresh=2)
print(df2)
name F1 F2 F3
0 bob 3.0 14 NaN
1 jake 4.0 40 1.0
2 Sem NaN 7 55.0
Selecting the columns automatically:
cols = list(df.filter(regex=r'F\d+'))
df.dropna(subset=cols, thresh=2)
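If the column count grows and you want a different limit, thresh can be derived from the number of NaN you are willing to tolerate. A small sketch under that assumption, reusing cols from above (k is a hypothetical cut-off, not from the original question):
k = 2  # hypothetical: drop a row when at least k of the F columns are NaN
df2 = df.dropna(subset=cols, thresh=len(cols) - k + 1)  # thresh = minimum non-NaN values required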
alternative without dropna
Using boolean indexing:
m = df.filter(regex=r'F\d+').isna().sum(1)
df2 = df[m.lt(2)]
values above threshold
drop if at least 2 values ≥ 4:
m = df.filter(regex=r'F\d+').gt(4).sum(1)
df2 = df[m.lt(2)]

Combining two dataframes and filling blanks in one with values of another (using email as index)

I have two dataframes that have the following columns : Phone, Email and Name
Dataframe1 is about 20k rows long, whereas dataframe2 is about 1k rows. I would like to fill the blanks in the Phone column of dataframe1 with the phone numbers in dataframe2, using the email as the match key between the two dataframes.
What is the best way to do this? I have tried combine_first() and merge(), but combine_first() returns the value in the same row rather than the value that matches the email address. merge() resulted in the same thing.
Am I wrong to think I need to set email as an index and then map phones to that index? I feel like this is correct but I simply do not know how to do this. Any help is appreciated! Thank you :)
Example :
In [1]:
import pandas as pd
import numpy as np
df1 = pd.DataFrame({'Phone': [1, np.nan, 3, 4, 5, np.nan, 7],
                    'Name': ['Bob', 'Jon', 'Iris', 'Jacob', 'Donald', 'Beatrice', 'Jane'],
                    'Email': ['bob#gmail.com', 'jon#gmail.com', 'iris#gmail.com', 'jacob#gmail.com',
                              'donald#gmail.com', 'beatrice#gmail.com', 'jane#gmail.cm']})
df2 = pd.DataFrame({'Phone': [2, 1, 6, 5],
                    'Name': ['Jon', 'Bob', 'Beatrice', 'Donald'],
                    'Email': ['jon#gmail.com', 'bob#gmail.com', 'beatrice#gmail.com', 'donald#gmail.com']})
In [2]: df1
Out [2]:
Phone Name Email
1 Bob bob#gmail.com
NaN Jon jon#gmail.com
3 Iris iris#gmail.com
4 Jac jacob#gmail.com
5 Don donald#gmail.com
NaN Bea beatrice#gmail.com
7 Jane jane#gmail.com
x 20000 len
In [3]: df2
Out [3]:
Phone Name Email
2 Jon jon#gmail.com
1 Bob bob#gmail.com
6 Bea beatrice#gmail.com
5 Don donald#gmail.com
x 1100 len
What I've tried
In [4]: df3 = pd.merge(df1,df2, on="Email", how="left")
Out [4]:
Phone Name Email
1 Bob bob#gmail.com
1 Jon jon#gmail.com
3 Iris iris#gmail.com
4 Jac jacob#gmail.com
5 Don donald#gmail.com
NaN Bea beatrice#gmail.com
7 Jane jane#gmail.com
In [5]: df3 = df1.combine_first(df2)
Out [5]:
Phone Name Email
1 Bob bob#gmail.com
1 Jon jon#gmail.com
3 Iris iris#gmail.com
4 Jac jacob#gmail.com
5 Don donald#gmail.com
NaN Bea beatrice#gmail.com
7 Jane jane#gmail.com
What I would like it to look like:
In [6]: df3
Out [6]
1 Bob bob#gmail.com
2 Jon jon#gmail.com
3 Iris iris#gmail.com
4 Jac jacob#gmail.com
5 Don donald#gmail.com
6 Bea beatrice#gmail.com
7 Jane jane#gmail.com
Constructing the data frame like so:
import pandas as pd
import numpy as np

df1 = pd.DataFrame({'Phone': [1, np.nan, 3, 4, 5, np.nan, 7],
                    'Name': ['Bob', 'Jon', 'Iris', 'Jacob', 'Donald', 'Beatrice', 'Jane'],
                    'Email': ['bob#gmail.com', 'jon#gmail.com', 'iris#gmail.com', 'jacob#gmail.com',
                              'donald#gmail.com', 'beatrice#gmail.com', 'jane#gmail.cm']})
df2 = pd.DataFrame({'Phone': [2, 1, 5, 6],
                    'Name': ['Jon', 'Bob', 'Donald', 'Beatrice'],
                    'Email': ['jon#gmail.com', 'bob#gmail.com', 'donald#gmail.com', 'beatrice#gmail.com']})
The merge gives:
>>> df1.merge(df2, on='Email', how='left')
Phone_x Name_x Email Phone_y Name_y
0 1.0 Bob bob#gmail.com 1.0 Bob
1 NaN Jon jon#gmail.com 2.0 Jon
2 3.0 Iris iris#gmail.com NaN NaN
3 4.0 Jacob jacob#gmail.com NaN NaN
4 5.0 Donald donald#gmail.com 5.0 Donald
5 NaN Beatrice beatrice#gmail.com 6.0 Beatrice
6 7.0 Jane jane#gmail.cm NaN NaN
Then reduce Phone over columns.
>>> df1.merge(df2, on='Email', how='left')[['Phone_x', 'Phone_y']].ffill(axis=1)
Phone_x Phone_y
0 1.0 1.0
1 NaN 2.0
2 3.0 3.0
3 4.0 4.0
4 5.0 5.0
5 NaN 6.0
6 7.0 7.0
Then assign the right-most column of that result back to df1 as its Phone column; if the output above is assigned to result, the column is result.iloc[:, -1]. A minimal end-to-end sketch of that last step follows.
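This is only a sketch, assuming the emails in df2 are unique so the left merge keeps df1's row order and length; .to_numpy() simply sidesteps index alignment when writing the column back:
merged = df1.merge(df2, on='Email', how='left')
df1['Phone'] = merged[['Phone_x', 'Phone_y']].ffill(axis=1).iloc[:, -1].to_numpy()
print(df1)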

Create a new dataframe from a for loop with value_counts()

I have a dataframe (df3) with 51 columns and managed to show the most common values in each feature with a for loop.
for col in df3.columns:
    print('-' * 40 + col + '-' * 40, end=' - ')
    display(df3[col].value_counts().head(10))
Now I'd like to create a new dataframe called df4 with the results from the loop. That is the 10 most frequent values from all columns of df3. How can I do that?
You can get the values using
df4 = df3.apply(lambda col: col.value_counts().head(10).index)
Instead of a for loop I use apply.
Because .value_counts() creates a Series that uses the original values as its index, I take .index.
Minimal working example - because I have fewer values here, I use head(2):
import pandas as pd
data = {
'A': [1,2,3,3,4,5,6,6,6],
'B': [4,5,6,4,2,3,4,8,8],
'C': [7,8,9,7,1,1,1,2,2]
} # columns
df = pd.DataFrame(data)
df2 = df.apply(lambda col: col.value_counts().head(2).index)
print(df2)
Result
A B C
0 6 4 1
1 3 8 7
EDIT:
If a column has fewer than 10 distinct values, you can convert the index to a list, pad it with a list of 10 NaN, and crop the result to [:10]:
df.apply(lambda col: (col.value_counts().head(10).index.tolist() + [np.nan] * 10)[:10])
Minimal working example
import pandas as pd
import numpy as np
data = {
'A': [1,2,3,3,4,5,6,6,6],
'B': [4,5,6,4,2,3,4,8,8],
'C': [7,8,9,7,1,1,1,2,2]
} # columns
df = pd.DataFrame(data)
NAN10 = [np.nan] * 10  # ten NaN values used for padding
df2 = df.apply(lambda col: (col.value_counts().head(10).index.tolist() + NAN10)[:10])
print(df2)
Result
A B C
0 6.0 4.0 1.0
1 3.0 8.0 7.0
2 5.0 6.0 2.0
3 4.0 5.0 9.0
4 2.0 3.0 8.0
5 1.0 2.0 NaN
6 NaN NaN NaN
7 NaN NaN NaN
8 NaN NaN NaN
9 NaN NaN NaN
You can also convert to a Series - it adds NaN in the missing places, but rows that would contain only NaN are skipped (the result stops at the longest column):
import pandas as pd
import numpy as np
data = {
'A': [1,2,3,3,4,5,6,6,6],
'B': [4,5,6,4,2,3,4,8,8],
'C': [7,8,9,7,1,1,1,2,2]
} # columns
df = pd.DataFrame(data)
df3 = df.apply(lambda col: pd.Series(col.value_counts().head(10).index))
print(df3)
Result
A B C
0 6 4 1.0
1 3 8 7.0
2 5 6 2.0
3 4 5 9.0
4 2 3 8.0
5 1 2 NaN
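If you want the Series route but still always get exactly 10 rows, one possible variant (a sketch, not part of the original answer) is to reindex each Series to a fixed range:
df4 = df.apply(lambda col: pd.Series(col.value_counts().head(10).index).reindex(range(10)))
print(df4)  # always 10 rows; the missing places become NaN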

Merge three or more data frames with Pandas

In Pandas merge function you can merge two data frames, but I need to merge N, similar to an SQL statement where you combine N tables in a full outer join. For example, I need to merge the three data frames below by 'type_1', 'subject_id_1', 'type_2', 'subject_id_2' and 'type_3', 'subject_id_3'. Is this possible?
import pandas as pd
raw_data = {
'type_1': [1, 1, 0, 0, 1],
'subject_id_1': ['1', '2', '3', '4', '5'],
'first_name_1': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung']}
df_a = pd.DataFrame(raw_data, columns = ['type_1', 'subject_id_1', 'first_name_1'])
raw_datab = {
'type_2': [1, 1, 0, 0, 0],
'subject_id_2': ['4', '5', '6', '7', '8'],
'first_name_2': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty']}
df_b = pd.DataFrame(raw_datab, columns = ['type_2', 'subject_id_2', 'first_name_2'])
raw_datac = {
'type_3': [1, 1],
'subject_id_3': ['4', '5'],
'first_name_3': ['Joe', 'Paul']}
df_c = pd.DataFrame(raw_datac, columns = ['type_3', 'subject_id_3', 'first_name_3'])
### need to include here the third data frame
merged = pd.merge(df_a, df_b, left_on=['type_1','subject_id_1'],
right_on = ['type_2','subject_id_2'], how='outer')
print(merged)
Note: The names of the fields to join are different in each data frame.
I believe you need to join on indices created by set_index, combined with concat:
dfs = [df_a.set_index(['type_1','subject_id_1']),
df_b.set_index(['type_2','subject_id_2']),
df_c.set_index(['type_3','subject_id_3'])]
df = pd.concat(dfs, axis=1)
print (df)
first_name_1 first_name_2 first_name_3
0 3 Allen NaN NaN
4 Alice NaN NaN
6 NaN Bran NaN
7 NaN Bryce NaN
8 NaN Betty NaN
1 1 Alex NaN NaN
2 Amy NaN NaN
4 NaN Billy Joe
5 Ayoung Brian Paul
df = pd.concat(dfs, axis=1).rename_axis(('type','subject_id')).reset_index()
print (df)
type subject_id first_name_1 first_name_2 first_name_3
0 0 3 Allen NaN NaN
1 0 4 Alice NaN NaN
2 0 6 NaN Bran NaN
3 0 7 NaN Bryce NaN
4 0 8 NaN Betty NaN
5 1 1 Alex NaN NaN
6 1 2 Amy NaN NaN
7 1 4 NaN Billy Joe
8 1 5 Ayoung Brian Paul
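The same idea extends to any number of frames; a hedged sketch (the frame/key pairs below just restate the three frames from the question, but the list can hold N of them):
frames = [(df_a, ['type_1', 'subject_id_1']),
          (df_b, ['type_2', 'subject_id_2']),
          (df_c, ['type_3', 'subject_id_3'])]
df = pd.concat([f.set_index(keys) for f, keys in frames], axis=1)
df = df.rename_axis(('type', 'subject_id')).reset_index()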
