Minimizing rows with a merge/squish in a Pandas DataFrame with multiple indexes - python

With a DataFrame like,
import pandas as pd
import numpy as np
df = pd.DataFrame({
'id_1': [33,33,33,33,22,22,88,100],
'id_2': [64,64,64,64,12,12,77,100],
'col_1': [np.nan, 'dog', np.nan, 'kangaroo', np.nan, np.nan, np.nan, np.nan],
'col_2': ['bike', 'car', np.nan, np.nan, 'train', np.nan, 'horse', np.nan],
'col_3': [np.nan, np.nan, 'star', 'meteor', np.nan, 'rock', np.nan, np.nan]
})
"""
id_1 id_2 col_1 col_2 col_3
0 33 64 NaN bike NaN
1 33 64 dog car NaN
2 33 64 NaN NaN star
3 33 64 kangaroo NaN meteor
4 22 12 NaN train NaN
5 22 12 NaN NaN rock
6 88 77 NaN horse NaN
7 100 100 NaN NaN NaN
"""
How can it be transformed into a minimum number of rows, without aggregating or losing data, like the following?
id_1 id_2 col_1 col_2 col_3
0 33 64 dog bike star
1 33 64 kangaroo car meteor
3 22 12 NaN train rock
4 88 77 NaN horse NaN
5 100 100 NaN NaN NaN
Basically, within each group of id_X values, the NaN values in the col_X columns are replaced with other values from the same group where available.
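For instance, restricted to the (33, 64) group, the rule just collapses each column's non-NaN values upward. A small illustration of that (a sketch using the df defined above, not a full solution, since an all-NaN group like (100, 100) would need special handling):
g = df[(df['id_1'] == 33) & (df['id_2'] == 64)]
print(g[['col_1', 'col_2', 'col_3']].apply(lambda s: s.dropna().reset_index(drop=True)))
#       col_1 col_2   col_3
# 0       dog  bike    star
# 1  kangaroo   car  meteor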

# melt (wide to long) on id_1, id_2 and sort the values;
# within each group this pushes the NaN values to the bottom
df2 = df.melt(id_vars=['id_1', 'id_2'], var_name='col').sort_values(['id_1', 'id_2', 'col', 'value'])

# create a seq counter to make the keys unique, then pivot
df3 = (df2.assign(seq=df2.groupby(['id_1', 'id_2', 'col']).cumcount())
          .pivot(index=['id_1', 'id_2', 'seq'], columns='col', values='value')
          .reset_index())

# for id_1 = 100 everything is NaN but the row should still be kept,
# so only drop all-NaN rows when seq > 0
df3 = df3.loc[~((df3['seq'] > 0) &
                (df3[['col_1', 'col_2', 'col_3']].isna().all(axis=1)))]

# drop the seq (temp) column
df3.drop(columns='seq', inplace=True)
df3
col id_1 id_2 col_1 col_2 col_3
0 22 12 NaN train rock
2 33 64 dog bike meteor
3 33 64 kangaroo car star
6 88 77 NaN horse NaN
7 100 100 NaN NaN NaN

Another possible solution:
# push the non-NaN values to the top of each column within each group
df.loc[:, 'col_1':'col_3'] = df.groupby(
    ['id_1', 'id_2'], sort=False).transform(lambda x: sorted(x, key=pd.isnull))

# remove the rows that are now entirely NaN (keeping the first row of each group)
df.loc[~(df.duplicated(['id_1', 'id_2']) &
         df.loc[:, 'col_1':'col_3'].isna().all(axis=1))]
Output:
id_1 id_2 col_1 col_2 col_3
0 33 64 dog bike star
1 33 64 kangaroo car meteor
4 22 12 NaN train rock
6 88 77 NaN horse NaN
7 100 100 NaN NaN NaN

To avoid illegible Pandas voodoo, after your imports and df instantiation you can do:
def get_max_vals_from_row_sets(row, cols):
    # number of output rows needed for this group: at least 1,
    # otherwise the size of the largest value set
    mn = 1
    for col in cols:
        mn = max(mn, len(row[col]))
    return mn

def add_id_row(d, row, ids, cols):
    max_vals = get_max_vals_from_row_sets(row, cols)
    for _ in range(max_vals):
        for id_ in ids:
            d[id_].append(row[id_])
        for col in cols:
            if len(row[col]) != 0:
                d[col].append(row[col].pop())
            else:
                d[col].append(np.nan)

def drop_set_nans(row, cols):
    for col in cols:
        if np.nan in row[col]:
            row[col].remove(np.nan)
    return row

def squash_out_redundant_nans(df, ids, cols):
    df = df.groupby(ids).agg(set).reset_index()
    d = {k: [] for k in df.columns}
    for _, row in df.iterrows():
        drop_set_nans(row, cols)
        add_id_row(d, row, ids, cols)
    df = pd.DataFrame(d)
    return df

ids = ['id_1', 'id_2']
cols = ['col_1', 'col_2', 'col_3']
df = squash_out_redundant_nans(df, ids, cols)
print(df)

Related

Check if values in one dataframe match values from another, updating dataframe

Let's say I have two dataframes; both have different lengths but the same number of columns:
df1 = pd.DataFrame({'country': ['Russia','Mexico','USA','Argentina','Denmark','Syngapore'],
'population': [41,12,26,64,123,24]})
df2 = pd.DataFrame({'country': ['Russia','Argentina','Australia','USA'],
'population': [44,12,23,64]})
Let's assume that some of the data in df1 is outdated and I've received a new dataframe that contains some data which may or may not already exist in the outdated dataframe.
I want to find out which values of df1.country are also present in df2.country.
By doing the following I'm able to get a boolean Series:
df = df1.country.isin(df2.country)
print(df)
Unfortunately I'm just creating a new Series containing the answer to my question:
0 True
1 False
2 True
3 True
4 False
5 False
Name: country, dtype: bool
My goal here is to delete the rows of df1 whose values match df2, and then add the new data, kind of like an update.
I've managed to come up with something like this:
df = df1.country.isin(df2.country)
i = 0
for x in df:
    if x:
        df1.drop(i, inplace=True)
    i += 1
frames = [df1, df2]
df1 = pd.concat(frames)
df1.reset_index(drop=True, inplace=True)
print(df1)
which in fact works and updates the dataframe
country population
0 Mexico 12
1 Denmark 123
2 Syngapore 24
3 Russia 44
4 Argentina 12
5 Australia 23
6 USA 64
But I really believe there's a better way of doing the same thing that is quicker and more practical, considering that the real dataframe is much bigger and updates every few seconds.
I'd love to hear some suggestions. Thanks!
Assuming col1 remains unique in the original dataframe, you can join the two tables together. Once you have them in the same dataframe, you can apply your logic, i.e. update a value from the new dataframe if it is not null. You don't actually need to check whether col2 has changed for every entry in col1; you can simply replace the old col2 value with the new one as long as the new one is not NaN (based on your sample output).
df1 = pd.DataFrame({'col1': ['a','f','r','g','d','s'], 'col2': [41,12,26,64,123,24]})
df2 = pd.DataFrame({'col1': ['a','g','o','r'], 'col2': [44,12,23,64]})
# do the join
x = pd.merge(df1, df2, how='outer', left_on='col1', right_on='col1')
col1 col2_x col2_y
0 a 41.0 44.0
1 f 12.0 NaN
2 r 26.0 64.0
3 g 64.0 12.0
4 d 123.0 NaN
5 s 24.0 NaN
6 o NaN 23.0
# apply your update rules
x['col2_x'] = np.where(~x['col2_y'].isnull(), x['col2_y'], x['col2_x'])
col1 col2_x col2_y
0 a 44.0 44.0
1 f 12.0 NaN
2 r 64.0 64.0
3 g 12.0 12.0
4 d 123.0 NaN
5 s 24.0 NaN
6 o 23.0 23.0
#clean up
x.drop("col2_y", axis=1, inplace = True)
x.columns = ["col1", "col2"]
col1 col2
0 a 44.0
1 f 12.0
2 r 64.0
3 g 12.0
4 d 123.0
5 s 24.0
6 o 23.0
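If the rule is simply "take the new value whenever it is not null", DataFrame.update expresses the same overwrite more directly, with the caveat that it only touches rows already present in df1 and would not add the new 'o' row (a sketch using the same sample frames):
a = df1.set_index('col1')
a.update(df2.set_index('col1'))   # non-NaN values from df2 overwrite df1, aligned on col1
print(a.reset_index())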
The isin approach is so close! Simply use the results from isin as a mask, then concat the rows from df1 that are not in (~) df2 with the rest of df2:
m = df1['country'].isin(df2['country'])
df3 = pd.concat((df1[~m], df2), ignore_index=True)
df3:
country population
0 Mexico 12
1 Denmark 123
2 Syngapore 24
3 Russia 44
4 Argentina 12
5 Australia 23
6 USA 64

How do I extract column values from one dataframe to another?

I have two pandas data frames df1 and df2
df1:
cat  id  frequency
A    23  2
A    43  8
B    23  56
C    30  4

df2:
id  (other cols)  A    B    C
23  .............  nan  nan  nan
43  .............  nan  nan  nan
30  .............  nan  nan  nan
I am looking for a way to extract information from df1 into df2, resulting in the format below, where the values of columns A, B and C are the frequency values from df1:
df2:
id  (other cols)  A  B   C
30  ..........    0  0   4
23  ..........    2  56  0
43  ..........    8  0   0
Use DataFrame.pivot with DataFrame.combine_first:
df11 = df1.pivot(index='id', columns='cat', values='frequency')

# if id is a column in df2
df = df2.set_index('id').combine_first(df11)

# if id is already the index of df2
df = df2.combine_first(df11)
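A runnable sketch of this answer against the question's sample data (the elided "(other cols)" block is omitted here for brevity):
import numpy as np
import pandas as pd

df1 = pd.DataFrame({'cat': ['A', 'A', 'B', 'C'],
                    'id': [23, 43, 23, 30],
                    'frequency': [2, 8, 56, 4]})
df2 = pd.DataFrame({'id': [23, 43, 30],
                    'A': np.nan, 'B': np.nan, 'C': np.nan})

df11 = df1.pivot(index='id', columns='cat', values='frequency')
out = df2.set_index('id').combine_first(df11)

# fillna(0) reproduces the zeros shown in the desired output
print(out.fillna(0).reset_index())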

Concatenating dataframes creates too many columns

I am reading a number of csv files in using a loop, all have 38 columns. I add them all to a list and then concatenate/create a dataframe. My issue is that despite all these csv files having 38 columns, my resultant dataframe somehow ends up with 105 columns.
How can I make the resultant dataframe have the correct 38 columns and stack all of the rows on top of each other?
import boto3
import pandas as pd
import io

s3 = boto3.resource('s3')
client = boto3.client('s3')
bucket = s3.Bucket('alpha-enforcement-data-engineering')

appended_data = []
for obj in bucket.objects.filter(Prefix='closed/closed_processed/year_201'):
    print(obj.key)
    df = pd.read_csv(f's3://alpha-enforcement-data-engineering/{obj.key}', low_memory=False)
    print(df.shape)
    appended_data.append(df)

df_closed = pd.concat(appended_data, axis=0, sort=False)
print(df_closed.shape)
TL;DR: check your column headers.
c = appended_data[0].columns
df_closed = pd.concat(
    [df.set_axis(c, axis=1) for df in appended_data], sort=False)
This happens because your column headers are different. Pandas will align your DataFrames on the headers when concatenating vertically, and will insert empty columns for DataFrames where that header is not present. Here's an illustrative example:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df2 = pd.DataFrame({'C': [7, 8, 9], 'D': [10, 11, 12]})
df
A B
0 1 4
1 2 5
2 3 6
df2
C D
0 7 10
1 8 11
2 9 12
pd.concat([df, df2], axis=0, sort=False)
A B C D
0 1.0 4.0 NaN NaN
1 2.0 5.0 NaN NaN
2 3.0 6.0 NaN NaN
0 NaN NaN 7.0 10.0
1 NaN NaN 8.0 11.0
2 NaN NaN 9.0 12.0
This creates 4 columns, whereas you wanted only two. Try:
df2.columns = df.columns
pd.concat([df, df2], axis=0, sort=False)
A B
0 1 4
1 2 5
2 3 6
0 7 10
1 8 11
2 9 12
Which works as expected.
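To see which of the input files actually disagree before forcing the headers, a small diagnostic over the appended_data list built in the question's loop can help (a sketch):
# compare every frame's headers against the first one and report mismatches
base = set(appended_data[0].columns)
for i, frame in enumerate(appended_data[1:], start=1):
    diff = base.symmetric_difference(frame.columns)
    if diff:
        print(f'frame {i}: {len(frame.columns)} columns, differs on {sorted(diff)}')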

Pandas partial transpose

I want to reformat a dataframe by transposing some columns while keeping other columns fixed.
Original data:
ID subID values_A
-- ----- --------
A aaa 10
B baa 20
A abb 30
A acc 40
C caa 50
B bbb 60
Pivot once:
pd.pivot_table(df, index=['ID', 'subID'])
Output:
ID subID values_A
-- ----- --------
A aaa 10
abb 30
acc 40
B baa 20
bbb 60
C caa 50
What I want to do (fix the ID column and partially transpose):
ID subID_1 value_1 subID_2 value_2 subID_3 value_3
-- ------- ------- -------- ------- ------- -------
A aaa 10 abb 30 acc 40
B baa 20 bbb 60 NaN NaN
C caa 50 NaN NaN NaN NaN
I already know the maximum number of subIDs under each ID, and I don't need any values to be calculated when pivoting and transposing the dataframe.
Please help.
Use cumcount for a counter, create a MultiIndex with set_index, reshape with unstack, and sort the first level of the column MultiIndex with sort_index. Finally, flatten the columns with a list comprehension and call reset_index:
g = df.groupby('ID').cumcount()
df = df.set_index(['ID', g]).unstack().sort_index(level=1, axis=1)
# python 3.6+
df.columns = [f'{a}_{b+1}' for a, b in df.columns]
# python below 3.6
# df.columns = ['{}_{}'.format(a, b+1) for a, b in df.columns]
df = df.reset_index()
print (df)
ID subID_1 values_A_1 subID_2 values_A_2 subID_3 values_A_3
0 A aaa 10.0 abb 30.0 acc 40.0
1 B baa 20.0 bbb 60.0 NaN NaN
2 C caa 50.0 NaN NaN NaN NaN

summarizing data frame in pandas - python

df = pd.DataFrame({'a':['y',NaN,'y',NaN,NaN,'x','x','y',NaN],'b':[NaN,'x',NaN,'y','x',NaN,NaN,NaN,'y'],'d':[1,0,0,1,1,1,0,1,0]})
I'm trying to summarize this dataframe using sum. I thought df.groupby(['a','b']).aggregate(sum) would work but it returns an empty Series.
How can I achieve this result?
a b
x 1 1
y 2 1
import numpy as np
import pandas as pd
NaN = np.nan
df = pd.DataFrame(
{'a':['y',NaN,'y',NaN,NaN,'x','x','y',NaN],
'b':[NaN,'x',NaN,'y','x',NaN,NaN,NaN,'y'],
'd':[32,12,55,98,23,11,9,91,3]})
melted = pd.melt(df, id_vars=['d'], value_vars=['a', 'b'])
result = pd.pivot_table(melted, values='d', index=['value'], columns=['variable'],
aggfunc=np.median)
print(result)
yields
variable a b
value
x 10.0 17.5
y 55.0 50.5
Explanation:
df.groupby(['a', 'b']) drops every row that has NaN in either grouping key, and since a and b are never both non-null here, nothing is left to aggregate, which is why the original attempt came back empty. Melting the DataFrame with melted = pd.melt(df, id_vars=['d'], value_vars=['a', 'b']) sidesteps this and produces
d variable value
0 32 a y
1 12 a NaN
2 55 a y
3 98 a NaN
4 23 a NaN
5 11 a x
6 9 a x
7 91 a y
8 3 a NaN
9 32 b NaN
10 12 b x
11 55 b NaN
12 98 b y
13 23 b x
14 11 b NaN
15 9 b NaN
16 91 b NaN
17 3 b y
and now we can use pd.pivot_table to pivot and aggregate the d values:
result = pd.pivot_table(melted, values='d', index=['value'], columns=['variable'],
aggfunc=np.median)
Note that the aggfunc can take a list of functions, such as [np.sum, np.median, np.min, np.max, np.std] if you wish to summarize the data in more than one way.
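For the sum the question originally asked for, the same melt-then-pivot pattern applies to the question's own df (a sketch; NaN is assumed to be np.nan, and dropna removes the unmatched rows before pivoting):
import numpy as np
import pandas as pd

NaN = np.nan
df = pd.DataFrame({'a': ['y', NaN, 'y', NaN, NaN, 'x', 'x', 'y', NaN],
                   'b': [NaN, 'x', NaN, 'y', 'x', NaN, NaN, NaN, 'y'],
                   'd': [1, 0, 0, 1, 1, 1, 0, 1, 0]})

melted = df.melt(id_vars='d', value_vars=['a', 'b']).dropna(subset=['value'])
print(melted.pivot_table(index='value', columns='variable', values='d',
                         aggfunc='sum', fill_value=0))
# variable  a  b
# value
# x         1  1
# y         2  1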
