PySpark join on multiple columns

PySpark join on multiple columns - python

I have 2 dataframes, and I would like to know whether it is possible to join across multiple columns in a more generic and compact way. For example, this is a very explicit way and hard to generalize in a function:
df = df1.join(df2,
on=[
(df1.event_date < df2.risk_date)
&
(df1.client_id == df2.client_id_risk)
&
(df1.col_thr_param_1 == df2.col_thr_param_1)
&
(df1.col_thr_param_2 == df2.col_thr_param_2)
&
(df1.col_thr_param_3 == df2.col_thr_param_3)
&
(df1.col_thr_param_4 == df2.col_thr_param_4)
], how="left"
)
If I have a list with the name of threshold columns that I want to join in:
thr = ["col_thr_param_1", "col_thr_param_2", "col_thr_param_3", "col_thr_param_4"]
Is it possible to pass it in a function and generalize the join? Or I always need to resort to call df1 and df2? Something like this.
def join_dfs(df1: DataFrame, df2: DataFrame, thr_cols: List[str]):
df = df1.join(df2,
on=[
(df1.event_date < df2.risk_date)
&
(df1.client_id == df2.client_id_risk)
&
**df1.thr_cols == **df2.thr_cols
], how="left"
)

Ideally you can use alias with a list using col() to join. You can try something like below:
from pyspark.sql import functions as F
def join_dfs(df1, df2, thr_cols):
df = df1.alias("df1").join(df2.alias("df2"),
on=[
[(F.col("df1.event_date") < F.col("df2.risk_date"))
,
(F.col("df1.client_id") == F.col("df2.client_id_risk")) ]+
[F.col(f"df1.{col}")==F.col(f"df2.{col}") for col in thr_cols]
], how="left"
)
return df

Related

Ambiguous columns error in pyspark while iteratively joining dataframes

I am currently writing a code to join(left) two dataframes multiple times iteratively based on a set of columns corresponding to the two dataframes on each iteration. For one iteration it is working fine but on second iteration I am getting ambiguous columns error.
This is the sample dataframe on which I am working
sample_data = [("Amit","","Gupta","36678","M",4000),
("Anita","Mathews","","40299","F",5000),
("Ram","","Aggarwal","42124","M",5000),
("Pooja","Anne","Goel","39298","F",5000),
("Geeta","Banuwala","Brown","12345","F",-2)
]
sample_schema = StructType([
StructField("firstname",StringType(),True),
StructField("middlename",StringType(),True),
StructField("lastname",StringType(),True),
StructField("id", StringType(), True),
StructField("gender", StringType(), True),
StructField("salary", IntegerType(), True)
])
df1 = spark.createDataFrame(data = sample_data, schema = sample_schema)
sample_data = [("Amit", "ABC","MTS","36678",10),
("Ani", "DEF","CS","40299",200),
("Ram", "ABC","MTS","421",40),
("Pooja", "DEF","CS","39298",50),
("Geeta", "ABC","MTS","12345",-20)
]
sample_schema = StructType([
StructField("firstname",StringType(),True),
StructField("Company",StringType(),True),
StructField("position",StringType(),True),
StructField("id", StringType(), True),
StructField("points", IntegerType(), True)
])
df2 = spark.createDataFrame(data = sample_data, schema = sample_schema)
The code I used for this is
def joint_left_custom(df1, df2, cols_to_join, cols_df1_to_keep, cols_df2_to_keep):
resultant_df = None
df1_cols = df1.columns
df2 = df2.withColumn("flag", lit(True))
for i in range(len(cols_to_join)):
joined_df = df1.join(df2, [(df1[col_1] == df2[col_2]) for col_1, col_2 in cols_to_join[i].items()], 'left')
joined_df = joined_df.select(*[df1[column] if column in cols_df1_to_keep else df2[column] for column in cols_df1_to_keep + cols_df2_to_keep])
df1 = (joined_df
.filter("flag is NULL")
.select(df1_cols)
)
resultant_df = (joined_df.filter(col("flag") == True) if i == 0
else resultant_df.filter(col("flag") == True).union(resultant_df)
)
return resultant_df
cols_to_join = [{"id": "id"}, {"firstname":"firstname"}]
cols_df1_to_keep = ["firstname", "middlename", "lastname", "id", "gender", "salary"]
cols_df2_to_keep = ["company", "position", "points"]
x = joint_left_custom(df1, df2, cols_to_join, cols_df1_to_keep, cols_df2_to_keep)
it works fine if I execute this code for single run but on second iteration for again joining the rest of the rows on column "firstname" which are not joined on basis of column "id" in first iteration it is throwing following error
Column position#29518, company#29517, points#29520 are ambiguous. It's probably because you joined several Datasets together, and some of these Datasets are the same. This column points to one of the Datasets but Spark is unable to figure out which one. Please alias the Datasets with different names via Dataset.as before joining them, and specify the column using qualified name, e.g. df.as("a").join(df.as("b"), $"a.id" > $"b.id"). You can also set spark.sql.analyzer.failAmbiguousSelfJoin to false to disable this check.

This is the example how you can do or conditional join.
df1.join(df2, on=(df1.id == df2.id) | (df1.firstname == df2.firstname), how='left')
To make the condition dynamic, you can use reduce to chain the conditions.
from functools import reduce
def chain_join_cond(prev, value):
(lcol, rcol) = list(value.items())[0]
return prev | (df1[lcol] == df2[rcol])
# If your condition is OR, use False for initial condition.
# If your condition is AND, use True for initial condition(and use & to concatenate the conditions.)
cond = reduce(chain_join_cond, cols_to_join, F.lit(False))
# Use the cond for `on` option in join.
# df1.join(df2, on=cond, how='left')
Then to get a specific column set from df1 or df2 use list comprehensions to generate the select statement.
df = (df1.join(df2, on=cond, how='left')
.select(*[df1[c] for c in cols_df1_to_keep], *[df2[c] for c in cols_df2_to_keep]))
If you have the cols_to_join as tuple instead of dict, you can slightly simplify the code.
cols_to_join = [("id", "id"), ("firstname", "firstname")]
cond = reduce(lambda p, v: p | (df1[v[0]] == df2[v[1]]) , cols_to_join, F.lit(False))

Pandas Reindex Multiindex Dataframe Replicating Index

Thank you for taking a look! I am having issues with a 4 level multiindex & attempting to make sure every possible value of the 4th index is represented.
Here is my dataframe:
np.random.seed(5)
size = 25
dict = {'Customer':np.random.choice( ['Bob'], size),
'Grouping': np.random.choice( ['Corn','Wheat','Soy'], size),
'Date':np.random.choice( pd.date_range('1/1/2018','12/12/2022', freq='D'), size),
'Data': np.random.randint(20,100, size=(size))
}
df = pd.DataFrame(dict)
# create the Sub-Group column
df['Sub-Group'] = np.nan
df.loc[df['Grouping'] == 'Corn', 'Sub-Group'] = np.random.choice(['White', 'Dry'], size=len(df[df['Grouping'] == 'Corn']))
df.loc[df['Grouping'] == 'Wheat', 'Sub-Group'] = np.random.choice(['SRW', 'HRW', 'SWW'], size=len(df[df['Grouping'] == 'Wheat']))
df.loc[df['Grouping'] == 'Soy', 'Sub-Group'] = np.random.choice(['Beans', 'Meal'], size=len(df[df['Grouping'] == 'Soy']))
df['Year'] = df.Date.dt.year
With that, I'm looking to create a groupby like the following:
(df.groupby(['Customer','Grouping','Sub-Group',df['Date'].dt.month,'Year'])
.agg(Units = ('Data','sum'))
.unstack()
)
This works as expected. I want to reindex this dataframe so that every single month (index 3) is represented & filled with 0s. The reason I want this is later on I'll be doing a cumulative sum of a groupby.
I have tried both the following reindex & nothing happens - many months are still missing.
rere = pd.date_range('2018-01-01','2018-12-31', freq='M').month
(df.groupby(['Customer','Grouping','Sub-Group',df['Date'].dt.month,'Year'])
.agg(Units = ('Data','sum'))
.unstack()
.fillna(0)
.pipe(lambda x: x.reindex(rere, level=3, fill_value=0))
)
I've also tried the following:
(df.groupby(['Customer','Grouping','Sub-Group',df['Date'].dt.month,'Year'])
.agg(Units = ('Data','sum'))
.unstack()
.fillna(0)
.pipe(lambda x: x.reindex(pd.MultiIndex.from_product(x.index.levels)))
)
The issue with the last one is that the index is much too long - it's doing the cartesian product of Grouping & Sub-Group when really there are no combinations of 'Wheat' as a Grouping & 'Dry' as 'Sub-Group'.
I'm looking for a flexible way to reindex this dataframe to make sure a specific index level (3rd in this case) has every option.
Thanks so much for any help!

try this:
def reindex_sub(g: pd.DataFrame):
g = g.droplevel([0, 1, 2])
result = g.reindex(range(1, 13))
return result
tmp = (df.groupby(['Customer','Grouping','Sub-Group',df['Date'].dt.month,'Year'])
.agg(Units = ('Data','sum'))
.unstack()
)
grouped = tmp.groupby(level=[0,1,2], group_keys=True)
out = grouped.apply(reindex_sub)
print(out)

How to merge DataFrames, using multiple columns and additionally using conditions from DataFrames columns use operator AND?

I have the part of code that works well in MS SQL:
select * from table_1
full outer join table_2 on
table_1.Name = table_2.Name_2 and
table_1.Qty = table_2.Qty_2 and
table_1.[Month] = table_2.Month_2 and
table_1.class = 'X' and
table_2.class_2 = 'Y'
But i need to create the same code in python and i am trying to do this:
df_merge = pd.merge(table_1, table_2, how='outer', left_on=['Name', 'Qty', 'Month'], right_on=['Name_2', 'Qty_2', 'Month_2'])
But how to add this condition and table_1.class = 'X' and table_2.class_2 = 'Y'for the merge without using where, it will give a different result:
& (table_1['class'] == 'X') & (table_2['class_2'] == 'Y')

How to create a new column with multiple values in python dataframe using for loop?

I want to create a function with a for loop that iterates through a small dataframe and adds a new column with different values depending on the conditions set.
I have tried the below, but it returns the output for my first if statement for all the rows (it prints 'Top Buyers' for every row):
def CustomerSegmentClassifier(df):
for i, row in df.iterrows():
if (df['Recency'] <= 200).any() or (df['Frequency'] >= 20).any():
df.at[i,'Cluster Name'] = 'Top Buyers'
elif (df['Recency'].between(201, 750)).any() or (df['Frequency'].between(5,19)).any():
df.at[i,'Cluster Name'] = 'Casual Buyers'
else:
df.at[i,'Cluster Name'] = 'Churned Buyers'
return df
Image of Output from CustomerSegmentClassifer Function
Any help would be greatly appreciated.

Avoid for-loops across rows for vectorized methods. Unlike general purpose Python using lists and dicts, pandas and numpy have several solutions for conditional logic on arrays and series.
Specifically, for your needs consider numpy.select and even use Series operators for inequality logic:
def CustomerSegmentClassifier(df):
conditions = [
((df['Recency'].le(200)) | (df['Frequency'].ge(20))),
(
(df['Recency'].between(201, 750)) |
(df['Frequency'].between(5, 19))
)
]
values = ['Top Buyers', 'Casual Buyers']
df['Cluster Name'] = np.select(
conditions, values, default='Churned Buyers'
)
return df

This is my try to solve your question :
import pandas as pd
#df = pd.read_csv('test_data.txt', sep=',', header=None)
#df.columns = ['Customer ID','Recency','Frequency','Monetary Value']
def CustomerSegmentClassifier(df):
for i, row in df.iterrows():
if (df['Recency'][i] <= 200) or (df['Frequency'][i] >= 20):
df.at[i,'Cluster Name'] = 'Top Buyers'
elif (200 > df['Recency'][i] <= 750) or (5 > df['Frequency'][i] < 20):
df.at[i,'Cluster Name'] = 'Casual Buyers'
else:
df.at[i,'Cluster Name'] = 'Churned Buyers'
return df
CustomerSegmentClassifier(df)
Output:

Is there a way of dinamically find partial matching numbers between columns in pandas dataframes?

Im looking for a way of comparing partial numeric values between columns from different dataframes, this columns are filled with something like social security numbers (they can´t and won´t repeat), so something like a dynamic isin() with be ideal.
This are representations of very large dataframes that I import from csv files.
{import numpy as np
import pandas as pd
df1 = pd.DataFrame({"S_number": ["271600", "860078", "342964", "763261", "215446", "205303", "973637", "814452", "399304", "404205"]})
df2 = pd.DataFrame({"Id_number": ["14452", "9930", "1544", "5303", "973637", "4205", "0271600", "342964", "763", "60078"]})
print(df1)
print(df2)
df2['Id_number_length']= df2['Id_number'].str.len()
df2.groupby('Id_number_length').count()
count_list = df2.groupby('Id_number_length')[['Id_number_length']].count()
print('count_list:\n', count_list)
df1 ['S_number'] = pd.to_numeric(df1['S_number'], downcast = 'integer')
df2['Id_number'] = pd.to_numeric(df2['Id_number'], downcast = 'integer')
inner_join = pd.merge(df1, df2, left_on =['S_number'], right_on = ['Id_number'] , how ='inner')
print('MATCH!:\n', inner_join)
outer_join = pd.merge(df1, df2, left_on =['S_number'], right_on = ['Id_number'] , how ='outer', indicator = True)
anti_join = outer_join[~(outer_join._merge == 'both')].drop('_merge', axis = 1)
print('UNMATCHED:\n', anti_join)
}
What I need to get is something as the following as a result of the inner join or whatever method:
{
df3 = pd.DataFrame({"S_number": ["271600", "860078", "342964", "763261", "215446", "205303", "973637", "814452", "399304", "404205"],
"Id_number": [ "027160", "60078","342964","763", "1544", "5303", "973637", "14452", "9930", "4205",]})
print('MATCH!:\n', df3)
}
I thought that something like this (very crude) pseudocode would work. Using count_list to strip parts of the numbers of df1 to fully match df2 instead of partially matching (notice that in df2 the missing or added numbers are always at the begining or the end)
{
for i in count_list:
if i ==6:
try inner join
except empty output
elif i ==5:
try
df1.loc[:,'S_number'] = df_ib_c.loc[:,'S_number'].str[1:]
inner join with df2
except empty output
try
df1.loc[:,'S_number'] = df_ib_c.loc[:,'S_number'].str[:-1]
inner join with df2
elif i == 4:
same as above...
}
But the lengths in count_list are variable so this for is an inefficient way.
Any help with this will be very appreciated, I´ve been stuck with this for days. Thanks in advance.

You can 'explode' each line of df1 into up to 45 lines. For example, SSN 123456789 can be map to [1,2,3...9,12,23,34,45..89,...12345678,23456789,123456789]. While this look bad, from algorithm standpoint it is O(1) for each row and therefore O(N) in total.
Using this new column as key, a simple 'merge on' can combine the 2 DFs easily - which is usually O(NlogN).

Here is an example of what I should do. I hope I've understood. Feel free to ask if it's not clear.
import pandas as pd
import joblib
from joblib import Parallel,delayed
# Building the base
df1 = pd.DataFrame({"S_number": ["271600", "860078", "342964", "763261", "215446", "205303", "973637", "814452", "399304", "404205"]})
df2 = pd.DataFrame({"Id_number": ["14452", "9930", "1544", "5303", "973637", "4205", "0271600", "342964", "763", "60078"]})
# Initiate empty list for indexes
IDX = []
# Using un function to paralleliza it if database is big
def func(x,y):
if all(c in df2.Id_number[y] for c in df1.S_number[x]):
return(x,y)
# using the max of processors
number_of_cpu = joblib.cpu_count()
# Prpeparing a delayed function to be parallelized
delayed_funcs = (delayed(func)(x,y) for x in range(len(df1)) for y in range(len(df2)))
# fiting it with processes and not threads
parallel_pool = Parallel(n_jobs=number_of_cpu,prefer="processes")
# Fillig the IDX List
IDX.append(parallel_pool(delayed_funcs))
# Droping the None
IDX = list(filter(None, IDX[0]))
# Making df3 with the tuples of indexes
df3 = pd.DataFrame(IDX)
# Making it readable
df3['df1'] = df1.S_number[df3[0]].to_list()
df3['df2'] = df2.Id_number[df3[1]].to_list()
df3
OUTPUT :

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

PySpark join on multiple columns - python

Related

Ambiguous columns error in pyspark while iteratively joining dataframes

Pandas Reindex Multiindex Dataframe Replicating Index

How to merge DataFrames, using multiple columns and additionally using conditions from DataFrames columns use operator AND?

How to create a new column with multiple values in python dataframe using for loop?

Is there a way of dinamically find partial matching numbers between columns in pandas dataframes?

Categories

Resources