Joining PySpark dataframes with conditional result column - python

I have these tables:
df1                        df2
+---+------------+         +---+---------+
| id|   many_cols|         | id|criterion|
+---+------------+         +---+---------+
|  1|lots_of_data|         |  1|    false|
|  2|lots_of_data|         |  1|     true|
|  3|lots_of_data|         |  1|     true|
+---+------------+         |  3|    false|
                           +---+---------+
I intend to create an additional column in df1:
+---+------------+------+
| id| many_cols|result|
+---+------------+------+
| 1|lots_of_data| 1|
| 2|lots_of_data| null|
| 3|lots_of_data| 0|
+---+------------+------+
result should be 1 if there is a corresponding true in df2
result should be 0 if there's no corresponding true in df2
result should be null if there is no corresponding id in df2
I cannot think of an efficient way to do it. I am stuck with only the 3rd condition working after a join:
df = df1.join(df2, 'id', 'full')
df.show()
# +---+------------+---------+
# | id| many_cols|criterion|
# +---+------------+---------+
# | 1|lots_of_data| false|
# | 1|lots_of_data| true|
# | 1|lots_of_data| true|
# | 3|lots_of_data| false|
# | 2|lots_of_data| null|
# +---+------------+---------+
PySpark dataframes are created like this:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
spark = SparkSession.builder.getOrCreate()
df1cols = ['id', 'many_cols']
df1data = [(1, 'lots_of_data'),
           (2, 'lots_of_data'),
           (3, 'lots_of_data')]
df2cols = ['id', 'criterion']
df2data = [(1, False),
           (1, True),
           (1, True),
           (3, False)]
df1 = spark.createDataFrame(df1data, df1cols)
df2 = spark.createDataFrame(df2data, df2cols)

A simple way would be to group df2 by id to get the max criterion per id, then join with df1; this way you reduce the number of rows to join. The max of a boolean column is true if there is at least one true value:
from pyspark.sql import functions as F

df2_group = df2.groupBy("id").agg(F.max("criterion").alias("criterion"))

result = df1.join(df2_group, ["id"], "left").withColumn(
    "result",
    F.col("criterion").cast("int")
).drop("criterion")
result.show()
#+---+------------+------+
#| id| many_cols|result|
#+---+------------+------+
#| 1|lots_of_data| 1|
#| 3|lots_of_data| 0|
#| 2|lots_of_data| null|
#+---+------------+------+
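If you prefer to make the 1/0/null mapping explicit instead of relying on the boolean-to-int cast, a rough sketch of the same grouped join (same df1/df2 as above) is:
from pyspark.sql import functions as F

df2_group = df2.groupBy("id").agg(F.max("criterion").alias("criterion"))

result = df1.join(df2_group, ["id"], "left").withColumn(
    "result",
    F.when(F.col("criterion") == True, 1)     # at least one true in df2
     .when(F.col("criterion") == False, 0)    # only false values in df2
     .otherwise(F.lit(None))                  # id not present in df2
).drop("criterion")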

You can try a correlated subquery to get the maximum Boolean from df2, and cast that to an integer.
df1.createOrReplaceTempView('df1')
df2.createOrReplaceTempView('df2')
df = spark.sql("""
    select
        df1.*,
        (select int(max(criterion)) from df2 where df1.id = df2.id) as result
    from df1
""")
df.show()
+---+------------+------+
| id| many_cols|result|
+---+------------+------+
| 1|lots_of_data| 1|
| 3|lots_of_data| 0|
| 2|lots_of_data| null|
+---+------------+------+
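If correlated subqueries are not available in your environment, roughly the same logic can be written as an aggregate-then-join in plain SQL (a sketch, using the same temp views as above):
df = spark.sql("""
    select df1.*, agg.result
    from df1
    left join (
        select id, int(max(criterion)) as result
        from df2
        group by id
    ) agg
    on df1.id = agg.id
""")
df.show()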

Check out this solution. After joining, you can apply multiple condition checks based on your requirements and assign values accordingly with a when clause, then take the max of result grouped by id and the other columns. You can also use a window function to calculate the max of result if you are partitioning by id only.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

df1cols = ['id', 'many_cols']
df1data = [(1, 'lots_of_data'),
           (2, 'lots_of_data'),
           (3, 'lots_of_data')]
df2cols = ['id', 'criterion']
df2data = [(1, False),
           (1, True),
           (1, True),
           (3, False)]
df1 = spark.createDataFrame(df1data, df1cols)
df2 = spark.createDataFrame(df2data, df2cols)

df2_mod = df2.withColumnRenamed("id", "id_2")
df3 = df1.join(df2_mod, on=df1.id == df2_mod.id_2, how='left')

cond1 = (F.col("id") == F.col("id_2")) & (F.col("criterion") == 1)
cond2 = (F.col("id") == F.col("id_2")) & (F.col("criterion") == 0)
cond3 = F.col("id_2").isNull()

df3.select("id", "many_cols", F.when(cond1, 1).when(cond2, 0).when(cond3, F.lit(None)).alias("result")) \
   .groupBy("id", "many_cols").agg(F.max(F.col("result")).alias("result")).orderBy("id").show()
Result:
------
+---+------------+------+
| id| many_cols|result|
+---+------------+------+
| 1|lots_of_data| 1|
| 2|lots_of_data| null|
| 3|lots_of_data| 0|
+---+------------+------+
Using a window function:
w = Window.partitionBy("id")

df3.select("id", "many_cols", F.when(cond1, 1).when(cond2, 0).when(cond3, F.lit(None)).alias("result")) \
   .select("id", "many_cols", F.max("result").over(w).alias("result")).drop_duplicates().show()

I had to merge the ideas from the proposed answers to get the solution that suited me best.
# The `cond` variable is very useful, here it represents several complex conditions
cond = F.col('criterion') == True
df2_grp = df2.select(
    'id',
    F.when(cond, 1).otherwise(0).alias('c')
).groupBy('id').agg(F.max(F.col('c')).alias('result'))
df = df1.join(df2_grp, 'id', 'left')
df.show()
#+---+------------+------+
#| id| many_cols|result|
#+---+------------+------+
#| 1|lots_of_data| 1|
#| 3|lots_of_data| 0|
#| 2|lots_of_data| null|
#+---+------------+------+
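For illustration, cond can be any boolean Column expression, so several checks can be combined with & and | before the grouping. A hypothetical example (the amount column below is not in the sample df2, it only shows the pattern):
from pyspark.sql import functions as F

# Hypothetical compound condition; `amount` is not part of the sample data.
cond = (F.col('criterion') == True) & (F.col('amount') > 100)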

Related

How to generate the max values for new columns in PySpark dataframe?

Suppose I have a pyspark dataframe df.
+---+---+
|  a|  b|
+---+---+
|  1|200|
|  2|300|
|  4| 50|
+---+---+
I'd like to add a new column c.
column c = max(0, column b - 100)
+---+---+---+
| a| b| c|
+---+---+---+
| 1|200|100|
| 2|300|200|
| 4| 50| 0|
+---+---+---+
How should I generate the new column c in pyspark dataframe? Thanks in advance!
Hope you are looking for something like this:
from pyspark.sql.functions import col, lit, greatest
df = spark.createDataFrame(
    [
        (1, 200),
        (2, 300),
        (4, 50),
    ],
    ["a", "b"]
)

df_new = df.withColumn("c", greatest(lit(0), col("b") - lit(100)))
df_new.show()
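If you prefer an expression string, the same computation can be sketched with F.expr and the SQL greatest function (assuming the same df as above):
from pyspark.sql import functions as F

df_new = df.withColumn("c", F.expr("greatest(0, b - 100)"))
df_new.show()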

How to create a dataframe based on a lookup dataframe, create multiple columns dynamically, and map values into the specific columns

I have two dataframes: one is the main dataframe and the other is a lookup dataframe. I need to produce a third one in a customized form using pyspark. I need to check the values in the list_IDs column, look up the matches in the lookup dataframe, and put the corresponding count in the final dataframe. I have tried array intersect and array lookup but it is not working.
Main dataframe:
df = spark.createDataFrame([(123, [75319, 75317]), (212, [136438, 25274]), (215, [136438, 75317])], ("ID", "list_IDs"))
df.show()
+---+---------------+
| ID| list_IDs|
+---+---------------+
|123| [75319, 75317]|
|212|[136438, 25274]|
|215|[136438, 75317]|
+---+---------------+
Lookup Dataframe:
df_2 = spark.createDataFrame([(75319, "Wheat", 20), (75317, "Rice", 10), (136438, "Jowar", 30), (25274, "Rajma", 40)], ("ID", "Material", "Count"))
df_2.show()
+------+--------+-----+
|    ID|Material|Count|
+------+--------+-----+
| 75319|   Wheat|   20|
| 75317|    Rice|   10|
|136438|   Jowar|   30|
| 25274|   Rajma|   40|
+------+--------+-----+
I need the resultant dataframe to be:
+---+---------------+-----+----+-----+-----+
| ID|       list_IDs|Wheat|Rice|Jowar|Rajma|
+---+---------------+-----+----+-----+-----+
|123| [75319, 75317]|   20|  10|    0|    0|
|212|[136438, 25274]|    0|   0|   30|   40|
|215|[136438, 75317]|    0|  10|   30|    0|
+---+---------------+-----+----+-----+-----+
You can join the two dataframes and then pivot:
import pyspark.sql.functions as F

result = df.join(
    df_2,
    F.array_contains(df.list_IDs, df_2.ID)
).groupBy(df.ID, 'list_IDs').pivot('Material').agg(F.first('Count')).fillna(0)

result.show()
+---+---------------+-----+-----+----+-----+
| ID| list_IDs|Jowar|Rajma|Rice|Wheat|
+---+---------------+-----+-----+----+-----+
|212|[136438, 25274]| 30| 40| 0| 0|
|215|[136438, 75317]| 30| 0| 10| 0|
|123| [75319, 75317]| 0| 0| 10| 20|
+---+---------------+-----+-----+----+-----+
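If the set of materials is known up front, you can also pass the values to pivot explicitly, which spares Spark an extra pass over the data to discover them (a sketch; the column list here is an assumption):
import pyspark.sql.functions as F

result = df.join(
    df_2,
    F.array_contains(df.list_IDs, df_2.ID)
).groupBy(df.ID, 'list_IDs') \
 .pivot('Material', ['Wheat', 'Rice', 'Jowar', 'Rajma']) \
 .agg(F.first('Count')).fillna(0)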

Check if values of column pyspark df exist in other column pyspark df

I have 2 pyspark dataframes and I want to check if the values of one column exist in a column in the other dataframe.
I have only seen solutions for how to filter the values that exist (like this); what I need is to return a column of true or false.
In pandas it will be something like this:
df_A["column1"].isin(df_B["column1"])
thanks in advance!
EDIT: a cleaner way to do this:
import pyspark.sql.functions as F
result = df1.join(df2.withColumn('flag', F.lit(True)), 'col1', 'left').fillna(False)
result.show()
+----+-----+
|col1| flag|
+----+-----+
| 0| true|
| 1| true|
| 2|false|
+----+-----+
Old answer:
df1 = spark.createDataFrame(([0],[1],[2])).toDF('col1')
df2 = spark.createDataFrame(([0],[1],[3])).toDF('col1')
df1.join(df2, 'col1', 'semi') \
   .distinct() \
   .select('col1', F.lit(True).alias('flag')) \
   .join(df1, 'col1', 'right') \
   .fillna(False, 'flag') \
   .show()
+----+-----+
|col1| flag|
+----+-----+
| 0| true|
| 1| true|
| 2|false|
+----+-----+
You can collect all the values of column1, make a broadcast variable from them, and then use a UDF that checks membership:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import BooleanType

df_B_col_1_values = df_B.rdd.map(lambda x: x.column1).distinct().collect()
df_B_col_1_values = sc.broadcast(set(df_B_col_1_values))

my_udf = udf(lambda x: x in df_B_col_1_values.value, BooleanType())
df_A = df_A.withColumn('column1_present', my_udf(col('column1')))
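A UDF-free alternative with the same effect, assuming the distinct values of df_B.column1 are few enough to collect to the driver, is a sketch like:
from pyspark.sql.functions import col

df_B_values = [row.column1 for row in df_B.select('column1').distinct().collect()]
df_A = df_A.withColumn('column1_present', col('column1').isin(df_B_values))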
To tweak Mck's answer a little bit (drop duplicate df_A entries and select the relevant columns):
SHORT
df_A = df_A.withColumn("uid_A", F.monotonically_increasing_id()) # Add an index column
df_B = df_B.withColumn("uid_B", F.monotonically_increasing_id()) # Add an index column
df_A = df_A.alias('df_A')
(
df_A
.join(df_B.withColumn('flag',F.lit(True)),'col1','left')
.fillna(False)
.dropDuplicates(['uid_A'])
.select('df_A.col1','flag')
).show()
'''
+----+-----+
|col1| flag|
+----+-----+
| A|false|
| B|false|
| C| true|
| B|false|
| C| true|
| D| true|
+----+-----+
'''
FULL
### Initialise dataframes ###
import pandas as pd

df_A = pd.DataFrame({'col1': ['A', 'B', 'C', 'B', 'C', 'D'],
                     'col2': [1, 2, 3, 4, 5, 6]})
df_A = spark.createDataFrame(df_A)

df_B = pd.DataFrame({'col1': ['C', 'E', 'D', 'C', 'F', 'G', 'H'],
                     'col2': [10, 20, 30, 40, 50, 60, 70]})
df_B = spark.createDataFrame(df_B)
### PREVIOUS ###
df_A.join(df_B.withColumn('flag',F.lit(True)),'col1','left').fillna(False).show()
'''
+----+----+----+-----+
|col1|col2|col2| flag|
+----+----+----+-----+
| A| 1|null|false|
| B| 2|null|false|
| B| 4|null|false|
| C| 3| 10| true|
| C| 3| 40| true|
| C| 5| 10| true|
| C| 5| 40| true|
| D| 6| 30| true|
+----+----+----+-----+
'''
### BETTER ###
df_A = df_A.withColumn("uid_A", F.monotonically_increasing_id()) # Add an index column
df_B = df_B.withColumn("uid_B", F.monotonically_increasing_id()) # Add an index column
df_A = df_A.alias('df_A')
(
df_A
.join(df_B.withColumn('flag',F.lit(True)),'col1','left')
.fillna(False)
.dropDuplicates(['uid_A'])
.select('df_A.col1','flag')
).show()
'''
+----+-----+
|col1| flag|
+----+-----+
| A|false|
| B|false|
| C| true|
| B|false|
| C| true|
| D| true|
+----+-----+
'''
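An alternative to the index-plus-dropDuplicates step is to deduplicate the lookup side before joining, which avoids the row blow-up in the first place (a sketch, using the same assumed df_A/df_B):
from pyspark.sql import functions as F

flags = df_B.select('col1').distinct().withColumn('flag', F.lit(True))
result = df_A.join(flags, 'col1', 'left').fillna(False, 'flag')
result.show()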
Extra nuggets: if you only need the rows whose values match (rather than a True/False column like .isin returns), it may be more straightforward to use PySpark's leftsemi join, which keeps only the left table's columns for rows that match on the specified columns of the right table (also shown in this stackoverflow post).
# Pandas (isin)
df_A["col1"].isin(df_B["col1"])
# Pyspark (isin equivalent, applied to rows)
df_A.join(df_B, ["col1"], "leftsemi").show()
'''
+----+----+
|col1|col2|
+----+----+
|   C|   3|
|   C|   5|
|   D|   6|
+----+----+
'''
# Pandas (~isin)
~df_A["col1"].isin(df_B["col1"])
# Pyspark (~isin equivalent, applied to rows)
df_A.join(df_B, ["col1"], "leftanti").show()
Further join examples/info - PySpark Join Types | Join Two DataFrames

Spark.sql: remove the table's name from column names

When I query my database as:
spark.sql("SELECT * FROM MyBase")
I get "MyBase." in front of all my column names.
Can we simply avoid that? If not, what is the best way to remove it?
Example:
df = spark.sql("SELECT name, price FROM MyBase")
df.show(2)
+-------------+--------------+
| mybase.name | mybase.price |
+-------------+--------------+
| Jean | 15.0 |
| Phil | 10.0 |
+-------------+--------------+
I just want name and price as the column names.
Thanks
It seems that the option
hive.resultset.use.unique.column.names=true
is set somewhere in your hive-site.xml or on the Spark session. It should be false for your purpose.
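How to change it depends on your setup; one thing you can try (whether it takes effect at the session level depends on your Hive/Spark integration, otherwise set it in hive-site.xml) is:
spark.sql("SET hive.resultset.use.unique.column.names=false")
df = spark.sql("SELECT name, price FROM MyBase")
df.show(2)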
Try providing a new set of column names to your dataframe by removing the prefix up to the dot ('.'). Here is a sample solution that will work for all columns.
data = [(1,2,3), (11,22,33)]
df1 = spark.createDataFrame(data,['a.a','a.b','a.c'])
df1.show()
"""
+---+---+---+
|a.a|a.b|a.c|
+---+---+---+
| 1| 2| 3|
| 11| 22| 33|
+---+---+---+
"""
new_cols = [i[i.find('.')+1:] for i in df1.columns]
df2 = df1.toDF(*new_cols)
df2.show()
"""
+---+---+---+
| a| b| c|
+---+---+---+
| 1| 2| 3|
| 11| 22| 33|
+---+---+---+
"""
data = [(1,2,3), (11,22,33)]
df1 = spark.createDataFrame(data,['a.a','a.b','a.c'])
cols = [x.split('.')[1] for x in df1.columns]
display(df1.toDF(*cols))
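Another option, if you prefer not to rename all columns at once, is a select with aliases; note the backticks, which stop Spark from reading the dot as struct-field access (a sketch on the same df1):
from pyspark.sql import functions as F

df3 = df1.select([F.col('`{}`'.format(c)).alias(c.split('.', 1)[1]) for c in df1.columns])
df3.show()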

Comparing columns in Pyspark

I am working on a PySpark DataFrame with n columns. I have a set of m columns (m < n), and my task is to compute, for each row, the max value across those columns.
For example:
Input: PySpark DataFrame containing:
col_1 = [1,2,3], col_2 = [2,1,4], col_3 = [3,2,5]
Output:
col_4 = max(col_1, col_2, col_3) = [3,2,5]
There is something similar in pandas as explained in this question.
Is there any way of doing this in PySpark, or should I convert my PySpark df to a Pandas df and then perform the operations?
You can reduce using SQL expressions over a list of columns:
from pyspark.sql.functions import max as max_, col, when
from functools import reduce

def row_max(*cols):
    return reduce(
        lambda x, y: when(x > y, x).otherwise(y),
        [col(c) if isinstance(c, str) else c for c in cols]
    )

df = (sc.parallelize([(1, 2, 3), (2, 1, 2), (3, 4, 5)])
      .toDF(["a", "b", "c"]))

df.select(row_max("a", "b", "c").alias("max"))
Spark 1.5+ also provides least and greatest:
from pyspark.sql.functions import greatest
df.select(greatest("a", "b", "c"))
If you want to keep the name of the max column you can use structs:
from pyspark.sql.functions import struct, lit

def row_max_with_name(*cols):
    cols_ = [struct(col(c).alias("value"), lit(c).alias("col")) for c in cols]
    return greatest(*cols_).alias("greatest({0})".format(",".join(cols)))

maxs = df.select(row_max_with_name("a", "b", "c").alias("maxs"))
And finally you can use the above to find and select the "top" column:
from pyspark.sql.functions import max

((_, c), ) = (maxs
    .groupBy(col("maxs")["col"].alias("col"))
    .count()
    .agg(max(struct(col("count"), col("col"))))
    .first())

df.select(c)
We can use greatest
Creating DataFrame
df = spark.createDataFrame(
    [[1, 2, 3], [2, 1, 2], [3, 4, 5]],
    ['col_1', 'col_2', 'col_3']
)
df.show()
+-----+-----+-----+
|col_1|col_2|col_3|
+-----+-----+-----+
| 1| 2| 3|
| 2| 1| 2|
| 3| 4| 5|
+-----+-----+-----+
Solution
from pyspark.sql.functions import greatest
df2 = df.withColumn('max_by_rows', greatest('col_1', 'col_2', 'col_3'))
#Only if you need col
#from pyspark.sql.functions import col
#df2 = df.withColumn('max', greatest(col('col_1'), col('col_2'), col('col_3')))
df2.show()
+-----+-----+-----+-----------+
|col_1|col_2|col_3|max_by_rows|
+-----+-----+-----+-----------+
| 1| 2| 3| 3|
| 2| 1| 2| 2|
| 3| 4| 5| 5|
+-----+-----+-----+-----------+
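If the m columns are only known at runtime as a list, the same greatest call can be built dynamically by unpacking (a sketch on the same df):
from pyspark.sql.functions import greatest, col

cols_to_compare = ['col_1', 'col_2', 'col_3']  # any subset of the m columns
df2 = df.withColumn('max_by_rows', greatest(*[col(c) for c in cols_to_compare]))
df2.show()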
You can also use the pyspark built-in least:
from pyspark.sql.functions import least, col
df = df.withColumn('min', least(col('c1'), col('c2'), col('c3')))
Another simple way of doing it. Let us say that the below df is your dataframe
df = sc.parallelize([(10, 10, 1 ), (200, 2, 20), (3, 30, 300), (400, 40, 4)]).toDF(["c1", "c2", "c3"])
df.show()
+---+---+---+
| c1| c2| c3|
+---+---+---+
| 10| 10| 1|
|200| 2| 20|
| 3| 30|300|
|400| 40| 4|
+---+---+---+
You can process the above df as below to get the desired results:
from pyspark.sql.functions import lit, min

df.select(lit('c1').alias('cn1'), min(df.c1).alias('c1'),
          lit('c2').alias('cn2'), min(df.c2).alias('c2'),
          lit('c3').alias('cn3'), min(df.c3).alias('c3')) \
  .rdd.flatMap(lambda r: [(r.cn1, r.c1), (r.cn2, r.c2), (r.cn3, r.c3)]) \
  .toDF(['Columnn', 'Min']).show()
+-------+---+
|Columnn|Min|
+-------+---+
| c1| 3|
| c2| 2|
| c3| 1|
+-------+---+
Scala solution:
val df = sc.parallelize(Seq((10, 10, 1), (200, 2, 20), (3, 30, 300), (400, 40, 4))).toDF("c1", "c2", "c3")

df.rdd.map(row => List[String](row(0).toString, row(1).toString, row(2).toString))
  .map(x => (x(0), x(1), x(2), x.min))
  .toDF("c1", "c2", "c3", "min")
  .show
+---+---+---+---+
| c1| c2| c3|min|
+---+---+---+---+
| 10| 10| 1| 1|
|200| 2| 20| 2|
| 3| 30|300| 3|
|400| 40| 4| 4|
+---+---+---+---+
