Spark pivot one column but keep others intact - python

Given the following dataframe, how do I pivot the max scores but aggregate the sum of plays?
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql import Window
# Sample rows for the question.
# NOTE(review): `sqlContext` is not created in this snippet - presumably
# supplied by the PySpark shell; confirm before running as a script.
rows = [
    ("u1", "g1", 10, 0, 1),
    ("u1", "g3", 2, 2, 1),
    ("u1", "g3", 5, 3, 1),
    ("u1", "g4", 5, 4, 1),
    ("u2", "g2", 1, 1, 1),
]
columns = ["UserID", "GameID", "Score", "Time", "Plays"]
df = sqlContext.createDataFrame(rows, columns)
Desired Output
+------+-------------+-------------+-----+
|UserID|MaxScoreGame1|MaxScoreGame2|Plays|
+------+-------------+-------------+-----+
| u1| 10| 5| 4|
| u2| 1| null| 1|
+------+-------------+-------------+-----+
I posted a solution below but I'm hoping to avoid using join.

I don't think it is a real improvement, but you can add the total number of plays as a window aggregate:
...
.select(
F.col("*"),
F.row_number().over(rowNumberWindow).alias("GameNumber"),
F.sum("Plays").over(rowNumberWindow.orderBy()).alias("total_plays")
)
...
and use it later as a secondary grouping column for pivot:
...
.groupBy("UserID", "total_plays")
.pivot("GameCol", ["MaxScoreGame1", "MaxScoreGame2"])
.agg(F.max("Score"))
...

Here's a solution using join, which I'm hoping to avoid:
Sum dataframe
# Total plays per user; the frame is aliased so its columns can be
# referenced as "df_sum.<column>".
total_plays = F.sum("Plays").alias("Plays")
df_sum = df.groupBy("UserID").agg(total_plays).alias("df_sum")
df_sum.show()
+------+-----+
|UserID|Plays|
+------+-----+
| u1| 4|
| u2| 1|
+------+-----+
rowNumberWindow = Window.partitionBy("UserID").orderBy(F.col("Time"))
Pivot dataframe
# Number each user's rows in ascending Time order.
rowNumberWindow = Window.partitionBy("UserID").orderBy(F.col("Time"))

# One row per (user, game): summed plays, best score, earliest time.
per_game = df.groupBy("UserID", "GameID").agg(
    F.sum("Plays").alias("Plays"),
    F.max("Score").alias("Score"),
    F.min("Time").alias("Time"),
)

# Keep each user's first two games and derive the pivot column name
# ("MaxScoreGame1" / "MaxScoreGame2") from the row number.
first_two_games = (
    per_game
    .withColumn("GameNumber", F.row_number().over(rowNumberWindow))
    .filter(F.col("GameNumber") <= F.lit(2))
    .withColumn("GameCol", F.concat(F.lit("MaxScoreGame"), F.col("GameNumber")))
)

df_piv = (
    first_two_games
    .groupBy("UserID")
    .pivot("GameCol", ["MaxScoreGame1", "MaxScoreGame2"])
    .agg(F.max("Score"))
    .alias("df_piv")
)
df_piv.show()
+------+-------------+-------------+
|UserID|MaxScoreGame1|MaxScoreGame2|
+------+-------------+-------------+
| u1| 10| 5|
| u2| 1| null|
+------+-------------+-------------+
Joined dataframe
# Match rows of the two aliased frames on their UserID columns; both
# UserID columns are kept in the result.
same_user = F.col("df_sum.UserID") == F.col("df_piv.UserID")
df_joined = df_sum.join(df_piv, on=same_user)
df_joined.show()
+------+-----+------+-------------+-------------+
|UserID|Plays|UserID|MaxScoreGame1|MaxScoreGame2|
+------+-----+------+-------------+-------------+
| u1| 4| u1| 10| 5|
| u2| 1| u2| 1| null|
+------+-----+------+-------------+-------------+

Related

How to generate the max values for new columns in PySpark dataframe?

Suppose I have a pyspark dataframe df.
+---+---+
| a| b|
+---+---+
| 1|200|
| 2|300|
| 4| 50|
+---+---+
I'd like to add new column c.
column c = max(0, column b - 100)
+---+---+---+
| a| b| c|
+---+---+---+
| 1|200|100|
| 2|300|200|
| 4| 50| 0|
+---+---+---+
How should I generate the new column c in pyspark dataframe? Thanks in advance!
Hopefully you are looking for something like this:
from pyspark.sql.functions import col, lit, greatest
df = spark.createDataFrame(
    [
        (1, 200),
        (2, 300),
        (4, 50),
    ],
    ["a", "b"],
)
# greatest() takes the row-wise maximum of its arguments, so
# c = max(0, b - 100).
df_new = df.withColumn("c", greatest(lit(0), col("b") - lit(100)))
# show() is called on its own statement: a leading ".show()" on a new
# line after a complete assignment is a syntax error.
df_new.show()

Aggregate GroupBy columns with "all"-like function pyspark

I have a dataframe with a primary key, date, variable, and value. I want to group by the primary key and determine if all values are equal to a provided value. Example data:
import pandas as pd
from datetime import date
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
# Example data: primary key, date, variable name and value.
# datetime.date takes (year, month, day) integers, not an ISO string, and
# the variable labels must be string literals.
df = pd.DataFrame({
    "pk": [1, 1, 1, 1, 2, 2, 2, 2, 3, 4],
    "date": [
        date(2022, 5, 6),
        date(2022, 5, 13),
        date(2022, 5, 6),
        date(2022, 5, 6),
        date(2022, 5, 14),
        date(2022, 5, 15),
        date(2022, 5, 5),
        date(2022, 5, 5),
        date(2022, 5, 11),
        date(2022, 5, 12),
    ],
    "variable": ["A", "B", "C", "D", "A", "A", "E", "F", "A", "G"],
    "value": [2, 3, 2, 2, 1, 1, 1, 1, 5, 4],
})
# Convert the pandas frame to a Spark DataFrame and display it.
# (A stray df1.show() was removed: df1 is not defined at this point.)
df = spark.createDataFrame(df)
df.show()
#+-----+-----------+--------+-----+
#|pk | date|variable|value|
#+-----+-----------+--------+-----+
#| 1| 2022-05-06| A| 2|
#| 1| 2022-05-13| B| 3|
#| 1| 2022-05-06| C| 2|
#| 1| 2022-05-06| D| 2|
#| 2| 2022-05-14| A| 1|
#| 2| 2022-05-15| A| 1|
#| 2| 2022-05-05| E| 1|
#| 2| 2022-05-05| F| 1|
#| 3| 2022-05-11| A| 5|
#| 4| 2022-05-12| G| 4|
#+-----+-----------+--------+-----+
So if I want to know whether, given a primary key, pk, all the values are equal to 1 (or satisfy any arbitrary Boolean test), how should I do this? I've tried performing an applyInPandas, but that is not super efficient, and it seems like there is probably a pretty simple method to do this.
For Spark 3.0+, you can use the forall function to check whether all values collected by collect_list satisfy the Boolean test.
import pyspark.sql.functions as F
# Per pk: collect the values into a list and test that every element equals 1.
all_equal_one = F.expr("forall(collect_list(value), v -> v == 1)")
df1 = df.groupby("pk").agg(all_equal_one.alias("value"))
df1.show()
# +---+-----+
# | pk|value|
# +---+-----+
# | 1|false|
# | 3|false|
# | 2| true|
# | 4|false|
# +---+-----+
# or create a column using window function
# Same test, evaluated over a window partitioned by pk so every original
# row is kept and annotated.
all_equal_one_per_pk = F.expr(
    "forall(collect_list(value) over (partition by pk), v -> v == 1)"
)
df2 = df.withColumn("test", all_equal_one_per_pk)
df2.show()
# +---+----------+--------+-----+-----+
# | pk| date|variable|value| test|
# +---+----------+--------+-----+-----+
# | 1|2022-05-06| A| 2|false|
# | 1|2022-05-13| B| 3|false|
# | 1|2022-05-06| C| 2|false|
# | 1|2022-05-06| D| 2|false|
# | 3|2022-05-11| A| 5|false|
# | 2|2022-05-14| A| 1| true|
# | 2|2022-05-15| A| 1| true|
# | 2|2022-05-05| E| 1| true|
# | 2|2022-05-05| F| 1| true|
# | 4|2022-05-12| G| 4|false|
# +---+----------+--------+-----+-----+
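You might want to wrap it in a CASE expression to handle NULL values: collect_list skips NULLs, so a group whose values are all NULL produces an empty list, for which forall returns true.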

Joining PySpark dataframes with conditional result column

I have these tables:
df1 df2
+---+------------+ +---+---------+
| id| many_cols| | id|criterion|
+---+------------+ +---+---------+
| 1|lots_of_data| | 1| false|
| 2|lots_of_data| | 1| true|
| 3|lots_of_data| | 1| true|
+---+------------+ | 3| false|
+---+---------+
I intend to create additional column in df1:
+---+------------+------+
| id| many_cols|result|
+---+------------+------+
| 1|lots_of_data| 1|
| 2|lots_of_data| null|
| 3|lots_of_data| 0|
+---+------------+------+
result should be 1 if there is a corresponding true in df2
result should be 0 if there's no corresponding true in df2
result should be null if there is no corresponding id in df2
I cannot think of an efficient way to do it. I am stuck with only the 3rd condition working after a join:
# Full outer join of df1 and df2 on the shared id column.
df = df1.join(df2, on='id', how='full')
df.show()
# +---+------------+---------+
# | id| many_cols|criterion|
# +---+------------+---------+
# | 1|lots_of_data| false|
# | 1|lots_of_data| true|
# | 1|lots_of_data| true|
# | 3|lots_of_data| false|
# | 2|lots_of_data| null|
# +---+------------+---------+
PySpark dataframes are created like this:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
spark = SparkSession.builder.getOrCreate()
df1cols = ['id', 'many_cols']
df1data = [(1, 'lots_of_data'),
           (2, 'lots_of_data'),
           (3, 'lots_of_data')]
df2cols = ['id', 'criterion']
# id 3 carries an explicit False (not None) so the data matches the df2
# table shown in the question and the expected result of 0 for id 3.
df2data = [(1, False),
           (1, True),
           (1, True),
           (3, False)]
df1 = spark.createDataFrame(df1data, df1cols)
df2 = spark.createDataFrame(df2data, df2cols)
A simple way would be to group df2 by id to get the max criterion, then join the result with df1; this way you reduce the number of rows to join. The max of a boolean column is true if there is at least one corresponding true value:
from pyspark.sql import functions as F
# Collapse df2 to one row per id, keeping the max of the boolean criterion.
max_criterion = F.max("criterion").alias("criterion")
df2_group = df2.groupBy("id").agg(max_criterion)

# Left join from df1, then expose the criterion as an integer "result"
# column and drop the boolean helper column.
joined = df1.join(df2_group, ["id"], "left")
result = joined.withColumn("result", F.col("criterion").cast("int")).drop("criterion")
result.show()
#+---+------------+------+
#| id| many_cols|result|
#+---+------------+------+
#| 1|lots_of_data| 1|
#| 3|lots_of_data| 0|
#| 2|lots_of_data| null|
#+---+------------+------+
You can try a correlated subquery to get the maximum Boolean from df2, and cast that to an integer.
# Register both frames as temp views so the SQL below can reference them.
df1.createOrReplaceTempView('df1')
df2.createOrReplaceTempView('df2')

# Scalar subquery correlated on id: max criterion among the matching df2
# rows, converted with int().
query = """
select
df1.*,
(select int(max(criterion)) from df2 where df1.id = df2.id) as result
from df1
"""
df = spark.sql(query)
df.show()
+---+------------+------+
| id| many_cols|result|
+---+------------+------+
| 1|lots_of_data| 1|
| 3|lots_of_data| 0|
| 2|lots_of_data| null|
+---+------------+------+
Check out this solution. After joining, you can apply multiple condition checks based on your requirements and assign the value accordingly using a when clause, then take the max of result grouped by id and the other columns. You can also use a window function to calculate the max of result if you are partitioning by id only.
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# Sample data for the example.
df1cols = ['id', 'many_cols']
df1data = [
    (1, 'lots_of_data'),
    (2, 'lots_of_data'),
    (3, 'lots_of_data'),
]
df2cols = ['id', 'criterion']
df2data = [
    (1, False),
    (1, True),
    (1, True),
    (3, False),
]
df1 = spark.createDataFrame(df1data, df1cols)
df2 = spark.createDataFrame(df2data, df2cols)

# Rename df2's key so both key columns stay distinguishable after the join.
df2_mod = df2.withColumnRenamed("id", "id_2")
df3 = df1.join(df2_mod, on=df1.id == df2_mod.id_2, how='left')

# cond1: matched row with criterion true; cond2: matched row with
# criterion false; cond3: no matching row in df2.
cond1 = (F.col("id") == F.col("id_2")) & (F.col("criterion") == 1)
cond2 = (F.col("id") == F.col("id_2")) & (F.col("criterion") == 0)
cond3 = F.col("id_2").isNull()

flagged = df3.select(
    "id",
    "many_cols",
    F.when(cond1, 1).when(cond2, 0).when(cond3, F.lit(None)).alias("result"),
)
# One row per (id, many_cols), keeping the largest flag.
(flagged
    .groupBy("id", "many_cols")
    .agg(F.max(F.col("result")).alias("result"))
    .orderBy("id")
    .show())
Result:
------
+---+------------+------+
| id| many_cols|result|
+---+------------+------+
| 1|lots_of_data| 1|
| 2|lots_of_data| null|
| 3|lots_of_data| 0|
+---+------------+------+
Using window function
# Window over all rows sharing the same id.
w = Window().partitionBy("id")
result_flag = F.when(cond1, 1).when(cond2, 0).when(cond3, F.lit(None)).alias("result")
# Take the max flag per id via the window, then collapse the repeated rows.
(df3
    .select("id", "many_cols", result_flag)
    .select("id", "many_cols", F.max("result").over(w).alias("result"))
    .drop_duplicates()
    .show())
I had to merge the ideas of proposed answers to get the solution which suited me most.
# The `cond` variable is very useful, here it represents several complex conditions
cond = F.col('criterion') == True
# Flag each df2 row as 1/0 according to cond, then keep the per-id maximum.
flag = F.when(cond, 1).otherwise(0).alias('c')
df2_grp = (
    df2.select('id', flag)
    .groupBy('id')
    .agg(F.max(F.col('c')).alias('result'))
)
# Left join from df1 so every df1 row is kept.
df = df1.join(df2_grp, 'id', 'left')
df.show()
#+---+------------+------+
#| id| many_cols|result|
#+---+------------+------+
#| 1|lots_of_data| 1|
#| 3|lots_of_data| 0|
#| 2|lots_of_data| null|
#+---+------------+------+

How to stack two columns into a single one in PySpark?

I have the following PySpark DataFrame:
id col1 col2
A 2 3
A 2 4
A 4 6
B 1 2
I want to stack col1 and col2 in order to get a single column as follows:
id col3
A 2
A 3
A 4
A 6
B 1
B 2
How can I do so?
# The ids are string literals; bare A / B would raise NameError.
df = (
    sc.parallelize([
        ("A", 2, 3), ("A", 2, 4), ("A", 4, 6),
        ("B", 1, 2),
    ]).toDF(["id", "col1", "col2"])
)
The simplest way is to merge col1 and col2 into an array column and then explode it:
df.show()
+---+----+----+
| id|col1|col2|
+---+----+----+
| A| 2| 3|
| A| 2| 4|
| A| 4| 6|
| B| 1| 2|
+---+----+----+
# array() packs col1 and col2 into one array per row; explode() then emits
# one output row per array element.
stacked = df.selectExpr('id', 'explode(array(col1, col2))')
stacked.show()
+---+---+
| id|col|
+---+---+
| A| 2|
| A| 3|
| A| 2|
| A| 4|
| A| 4|
| A| 6|
| B| 1|
| B| 2|
+---+---+
You can drop duplicates if you don't need them.
To do this, group by "id", collect the lists from both "col1" and "col2" in an aggregation, and then explode the combined list back into a single column.
To get the unique numbers, just drop the duplicates afterwards.
I see that the numbers are also sorted in your expected result; this is done by sorting the concatenated lists in the aggregation.
The following code:
from pyspark.sql.functions import concat, collect_list, explode, col, sort_array
df = (
    sc.parallelize([
        ('A', 2, 3), ('A', 2, 4), ('A', 4, 6),
        ('B', 1, 2),
    ]).toDF(["id", "col1", "col2"])
)
# Per id: concatenate the collected col1 and col2 lists (sorted), explode
# them back into rows and de-duplicate.
# The final orderBy is applied last because dropDuplicates does not promise
# to preserve an earlier ordering.
result = (
    df.groupBy("id")
    .agg(sort_array(concat(collect_list("col1"), collect_list("col2"))).alias("all_numbers"))
    .withColumn('number', explode(col('all_numbers')))
    .select("id", "number")
    .dropDuplicates()
    .orderBy("id", "number")
)
# show() returns None, so it is called separately and `result` keeps the
# DataFrame.
result.show()
will yield:
+---+------+
| id|number|
+---+------+
| A| 2|
| A| 3|
| A| 4|
| A| 6|
| B| 1|
| B| 2|
+---+------+
A rather simple solution if only a few columns are involved:
df = (
sc.parallelize([
('A', 2, 3), ('A', 2, 4), ('A', 4, 6),
('B', 1, 2),
]).toDF(["id", "col1", "col2"])
)
df.show()
+---+----+----+
| id|col1|col2|
+---+----+----+
| A| 2| 3|
| A| 2| 4|
| A| 4| 6|
| B| 1| 2|
+---+----+----+
# Give both projections the same (id, col1) schema, stack them with
# union(), then remove repeated rows.
df1 = df.select('id', 'col1')
df2 = df.select('id', df['col2'].alias('col1'))
df_new = df1.union(df2).drop_duplicates()
df_new.show()
+---+----+
| id|col1|
+---+----+
| A| 3|
| A| 4|
| B| 1|
| A| 6|
| A| 2|
| B| 2|
+---+----+

Comparing columns in Pyspark

I am working on a PySpark DataFrame with n columns. I have a set of m columns (m < n) and my task is to choose the column with the max values in it.
For example:
Input: PySpark DataFrame containing :
col_1 = [1,2,3], col_2 = [2,1,4], col_3 = [3,2,5]
Output :
col_4 = max(col_1, col_2, col_3) = [3,2,5]
There is something similar in pandas as explained in this question.
Is there any way of doing this in PySpark, or should I convert my PySpark df to a Pandas df and then perform the operations?
You can reduce using SQL expressions over a list of columns:
from pyspark.sql.functions import max as max_, col, when
from functools import reduce
def row_max(*cols):
    """Build an expression for the row-wise maximum of ``cols``.

    Each argument may be a column name (``str``), which is wrapped with
    ``col``, or any other object, which is used as given. The arguments
    are folded left to right with ``when(x > y, x).otherwise(y)``.

    A single argument is returned unchanged; calling with no arguments
    raises ``TypeError`` (from ``reduce`` on an empty sequence).
    """
    columns = [col(c) if isinstance(c, str) else c for c in cols]

    def larger(x, y):
        # Keep x only when it compares greater; otherwise fall back to y.
        return when(x > y, x).otherwise(y)

    return reduce(larger, columns)
df = (sc.parallelize([(1, 2, 3), (2, 1, 2), (3, 4, 5)])
      .toDF(["a", "b", "c"]))
# Project the row-wise maximum as a single column named "max"
# (the original line had one closing parenthesis too many).
df.select(row_max("a", "b", "c").alias("max"))
Spark 1.5+ also provides least, greatest
from pyspark.sql.functions import greatest
df.select(greatest("a", "b", "c"))
If you want to keep the name of the max column, you can use structs:
from pyspark.sql.functions import struct, lit
def row_max_with_name(*cols):
    """Build a ``greatest`` expression over (value, column-name) structs.

    Each entry of ``cols`` must be a column name (``str``): it is turned
    into a struct with fields ``value`` (the column) and ``col`` (the name
    as a literal). The resulting expression is aliased
    ``"greatest(<names joined by commas>)"``.
    """
    cols_ = [struct(col(c).alias("value"), lit(c).alias("col")) for c in cols]
    alias_name = "greatest({0})".format(",".join(cols))
    return greatest(*cols_).alias(alias_name)
maxs = df.select(row_max_with_name("a", "b", "c").alias("maxs"))
And finally you can use the above to select the "top" column:
# Imported under an alias so the builtin max() is not shadowed.
from pyspark.sql.functions import max as max_

# Count how many rows each column name appears in as maxs.col, then take
# the largest (count, col) struct; c is unpacked from its second field.
((_, c), ) = (maxs
    .groupBy(col("maxs")["col"].alias("col"))
    .count()
    .agg(max_(struct(col("count"), col("col"))))
    .first())
df.select(c)
We can use greatest
Creating DataFrame
df = spark.createDataFrame(
[[1,2,3], [2,1,2], [3,4,5]],
['col_1','col_2','col_3']
)
df.show()
+-----+-----+-----+
|col_1|col_2|col_3|
+-----+-----+-----+
| 1| 2| 3|
| 2| 1| 2|
| 3| 4| 5|
+-----+-----+-----+
Solution
from pyspark.sql.functions import greatest
df2 = df.withColumn('max_by_rows', greatest('col_1', 'col_2', 'col_3'))
#Only if you need col
#from pyspark.sql.functions import col
#df2 = df.withColumn('max', greatest(col('col_1'), col('col_2'), col('col_3')))
df2.show()
+-----+-----+-----+-----------+
|col_1|col_2|col_3|max_by_rows|
+-----+-----+-----+-----------+
| 1| 2| 3| 3|
| 2| 1| 2| 2|
| 3| 4| 5| 5|
+-----+-----+-----+-----------+
You can also use the pyspark built-in least:
from pyspark.sql.functions import least, col
df = df.withColumn('min', least(col('c1'), col('c2'), col('c3')))
Another simple way of doing it. Let us say that the below df is your dataframe
df = sc.parallelize([(10, 10, 1 ), (200, 2, 20), (3, 30, 300), (400, 40, 4)]).toDF(["c1", "c2", "c3"])
df.show()
+---+---+---+
| c1| c2| c3|
+---+---+---+
| 10| 10| 1|
|200| 2| 20|
| 3| 30|300|
|400| 40| 4|
+---+---+---+
You can process the above df as below to get the desired results:
# min is imported under an alias so the builtin min() is not shadowed.
from pyspark.sql.functions import lit, min as min_

# Single aggregated row: each column's name as a literal next to its minimum.
column_minima = df.select(
    lit('c1').alias('cn1'), min_(df.c1).alias('c1'),
    lit('c2').alias('cn2'), min_(df.c2).alias('c2'),
    lit('c3').alias('cn3'), min_(df.c3).alias('c3'),
)
# Turn that single wide row into one (name, minimum) row per column.
(column_minima
    .rdd.flatMap(lambda r: [(r.cn1, r.c1), (r.cn2, r.c2), (r.cn3, r.c3)])
    .toDF(['Columnn', 'Min'])
    .show())
+-------+---+
|Columnn|Min|
+-------+---+
| c1| 3|
| c2| 2|
| c3| 1|
+-------+---+
Scala solution:
// `val` added and the unbalanced closing parenthesis removed.
val df = sc.parallelize(Seq((10, 10, 1), (200, 2, 20), (3, 30, 300), (400, 40, 4))).toDF("c1", "c2", "c3")
// Compare the values as Ints: taking the min of their string forms would be
// lexicographic (e.g. "10" < "9").
df.rdd
  .map(row => (row.getInt(0), row.getInt(1), row.getInt(2)))
  .map { case (c1, c2, c3) => (c1, c2, c3, Seq(c1, c2, c3).min) }
  .toDF("c1", "c2", "c3", "min")
  .show
+---+---+---+---+
| c1| c2| c3|min|
+---+---+---+---+
| 10| 10| 1| 1|
|200| 2| 20| 2|
| 3| 30|300| 3|
|400| 40| 4| 4|
+---+---+---+---+

Categories