I have a Spark DataFrame (json_df) and I need to create another DataFrame based on the nested JSON:
This is my current DataFrame:
I know I could do that manually like: final_df = json_df.select( col("Body.EquipmentId"),..... ) but I want to do that in a generic way.
Note: for this specific DF, the JSON records have the same structure.
Any idea?
Thanks!
Programmatically, you can do it like this:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
conf = SparkConf()
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
df = sc.parallelize([({"A":1, "B":2},), ({"A":3,"B":4},), ({"A":5,"B":6},)]).toDF(['Body'])
keys_df = df.select(F.explode(F.map_keys(F.col('Body')))).distinct()
keys = list(map(lambda row: row[0], keys_df.collect()))
key_cols = list(map(lambda f: F.col("Body").getItem(f).alias(str(f)), keys))
final_cols = df.select(key_cols)
final_cols.show()
Which produces
+---+---+
| B| A|
+---+---+
| 2| 1|
| 4| 3|
| 6| 5|
+---+---+
If you have the entire list of keys already, you can skip the part where it gets the keys and just set the keys manually:
keys = ['A', 'B']
Source: https://mungingdata.com/pyspark/dict-map-to-multiple-columns/
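If Body is a StructType column (as the Body.EquipmentId syntax in the question suggests) rather than a map, a sketch along these lines should flatten it generically; json_df and Body are the names from the question, everything else is an assumption:
from pyspark.sql import functions as F
# Option 1: let Spark expand every field of the Body struct into top-level columns
final_df = json_df.select("Body.*")
# Option 2: build the column list from the schema, which also lets you rename or filter fields
body_fields = json_df.schema["Body"].dataType.fieldNames()
final_df = json_df.select([F.col("Body." + f).alias(f) for f in body_fields])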
I have three PySpark DataFrames. I want to put the dataset references in a dictionary, write a loop, perform some operations on these three DataFrames, and then save them for further analysis. But I struggle with the last step. I have two questions:
In my code below, how do I access the results in TRANSFORMS? When I type: print(TRANSFORMS[0])
I only get this cryptic result:
<function __main__.multi_output(Input_table, table_name='ONE')>
Is there a mistake in my code so that the transformations are never made?
How do I modify the function so it saves three datasets like df_1_result, df_2_result, df_3_result which I can then later use in further analysis?
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.appName('Sparky').getOrCreate()
# Create the initial dataframe
data = [("James","M",60000),("Michael","M",70000),
("Robert",None,400000),("Maria","F",500000),
("Jen","",None)]
columns = ["name","gender","salary"]
df_when = spark.createDataFrame(data = data, schema = columns)
# Create three identical datasets
df_1 = df_when
df_2 = df_when
df_3 = df_when
TRANSFORMS = []
DATASETS = {
"ONE" : df_1,
"TWO" : df_2,
"THREE" : df_3,
}
for table_name, table_location in list(DATASETS.items()):
    def multi_output(Input_table, table_name=table_name):
        if table_name=="ONE":
            output_table = Input_table.drop("name")
        elif table_name=="TWO":
            output_table== Input_table.drop("gender")
        elif table_name=="THREE":
            output_table = Input_table.drop("salary")
        return output_table
    TRANSFORMS.append(multi_output)
There are a couple of issues here:
Issue 1: TRANSFORMS.append(multi_output) simply adds the function definition to the TRANSFORMS list. The function is never invoked. Also, we should define it outside the for-loop.
Issue 2: The statement under the second condition has a typo: it uses == (comparison) instead of = (assignment), so output_table is never assigned there.
The code below should work as expected.
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.appName('Sparky').getOrCreate()
# Create the initial dataframe
data = [("James","M",60000),("Michael","M",70000),
("Robert",None,400000),("Maria","F",500000),
("Jen","",None)]
columns = ["name","gender","salary"]
df_when = spark.createDataFrame(data = data, schema = columns)
# Create three identical datasets
df_1 = df_when
df_2 = df_when
df_3 = df_when
TRANSFORMS = []
DATASETS = {
"ONE" : df_1,
"TWO" : df_2,
"THREE" : df_3,
}
def multi_output(Input_table, table_name):
    output_table = Input_table
    if table_name=="ONE":
        output_table = Input_table.drop("name")
    elif table_name=="TWO":
        output_table = Input_table.drop("gender")
    elif table_name=="THREE":
        output_table = Input_table.drop("salary")
    return output_table

for table_name, table_location in list(DATASETS.items()):
    TRANSFORMS.append(multi_output(table_location, table_name))
len(TRANSFORMS)
TRANSFORMS[0].show()
TRANSFORMS[1].show()
TRANSFORMS[2].show()
+------+------+
|gender|salary|
+------+------+
| M| 60000|
| M| 70000|
| null|400000|
| F|500000|
| | null|
+------+------+
+-------+------+
| name|salary|
+-------+------+
| James| 60000|
|Michael| 70000|
| Robert|400000|
| Maria|500000|
| Jen| null|
+-------+------+
+-------+------+
| name|gender|
+-------+------+
| James| M|
|Michael| M|
| Robert| null|
| Maria| F|
| Jen| |
+-------+------+
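To get named results like df_1_result, df_2_result and df_3_result (the second part of the question), one option is to keep the outputs in a dictionary keyed by table name instead of a list; a small sketch reusing DATASETS and multi_output from above:
RESULTS = {name: multi_output(df, name) for name, df in DATASETS.items()}
df_1_result = RESULTS["ONE"]
df_2_result = RESULTS["TWO"]
df_3_result = RESULTS["THREE"]
df_1_result.show()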
I want to convert my list of dictionaries into a DataFrame. This is the list:
mylist =
[
{"type_activity_id":1,"type_activity_name":"xxx"},
{"type_activity_id":2,"type_activity_name":"yyy"},
{"type_activity_id":3,"type_activity_name":"zzz"}
]
This is my code:
from pyspark.sql.types import StringType
df = spark.createDataFrame(mylist, StringType())
df.show(2,False)
+-----------------------------------------+
| value|
+-----------------------------------------+
|{type_activity_id=1,type_activity_id=xxx}|
|{type_activity_id=2,type_activity_id=yyy}|
|{type_activity_id=3,type_activity_id=zzz}|
+-----------------------------------------+
I assume that I should provide some mapping and types for each column, but I don't know how to do it.
Update:
I also tried this:
schema = ArrayType(
StructType([StructField("type_activity_id", IntegerType()),
StructField("type_activity_name", StringType())
]))
df = spark.createDataFrame(mylist, StringType())
df = df.withColumn("value", from_json(df.value, schema))
But then I get null values:
+-----+
|value|
+-----+
| null|
| null|
+-----+
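As an aside on the update above: from_json returns nulls here because the value column holds the dict's string representation (key=value pairs, as shown in the first output), not valid JSON, and the schema is an ArrayType while each row holds a single object. A sketch of how that route could be made to work, assuming you serialize each dict to proper JSON first:
import json
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([StructField("type_activity_id", IntegerType()),
                     StructField("type_activity_name", StringType())])
df = spark.createDataFrame([(json.dumps(x),) for x in mylist], ["value"])
df.select(from_json(col("value"), schema).alias("value")).select("value.*").show()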
In the past, you were able to simply pass a list of dictionaries to spark.createDataFrame(), but this is now deprecated:
mylist = [
{"type_activity_id":1,"type_activity_name":"xxx"},
{"type_activity_id":2,"type_activity_name":"yyy"},
{"type_activity_id":3,"type_activity_name":"zzz"}
]
df = spark.createDataFrame(mylist)
#UserWarning: inferring schema from dict is deprecated,please use pyspark.sql.Row instead
# warnings.warn("inferring schema from dict is deprecated,"
As this warning message says, you should use pyspark.sql.Row instead.
from pyspark.sql import Row
spark.createDataFrame(Row(**x) for x in mylist).show(truncate=False)
#+----------------+------------------+
#|type_activity_id|type_activity_name|
#+----------------+------------------+
#|1 |xxx |
#|2 |yyy |
#|3 |zzz |
#+----------------+------------------+
Here I used ** (keyword argument unpacking) to pass the dictionaries to the Row constructor.
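If you want to pin down the column types instead of relying on inference, passing a schema should also work; a small sketch (the int and string types are assumptions based on the sample data):
from pyspark.sql import Row
spark.createDataFrame([Row(**x) for x in mylist],
                      "type_activity_id: int, type_activity_name: string").show(truncate=False)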
You can do it like this; you will get a DataFrame with 2 columns.
mylist = [
{"type_activity_id":1,"type_activity_name":"xxx"},
{"type_activity_id":2,"type_activity_name":"yyy"},
{"type_activity_id":3,"type_activity_name":"zzz"}
]
myJson = sc.parallelize(mylist)
myDf = sqlContext.read.json(myJson)
Output :
+----------------+------------------+
|type_activity_id|type_activity_name|
+----------------+------------------+
| 1| xxx|
| 2| yyy|
| 3| zzz|
+----------------+------------------+
In Spark 2.4 you can do it directly with:
df = spark.createDataFrame(mylist)
>>> mylist = [
... {"type_activity_id":1,"type_activity_name":"xxx"},
... {"type_activity_id":2,"type_activity_name":"yyy"},
... {"type_activity_id":3,"type_activity_name":"zzz"}
... ]
>>> df1=spark.createDataFrame(mylist)
>>> df1.show()
+----------------+------------------+
|type_activity_id|type_activity_name|
+----------------+------------------+
| 1| xxx|
| 2| yyy|
| 3| zzz|
+----------------+------------------+
I was also facing the same issue when creating a DataFrame from a list of dictionaries.
I resolved it using namedtuple.
Below is my code using the data provided.
from collections import namedtuple
final_list = []
mylist = [{"type_activity_id":1,"type_activity_name":"xxx"},
{"type_activity_id":2,"type_activity_name":"yyy"},
{"type_activity_id":3,"type_activity_name":"zzz"}
]
ExampleTuple = namedtuple('ExampleTuple', ['type_activity_id', 'type_activity_name'])
for my_dict in mylist:
    namedtupleobj = ExampleTuple(**my_dict)
    final_list.append(namedtupleobj)
sqlContext.createDataFrame(final_list).show(truncate=False)
output
+----------------+------------------+
|type_activity_id|type_activity_name|
+----------------+------------------+
|1 |xxx |
|2 |yyy |
|3 |zzz |
+----------------+------------------+
My version information is as follows:
spark: 2.4.0
python: 3.6
It is not necessary to have the mylist variable; since it was available, I used it to create the namedtuple objects, but you can also create the namedtuple objects directly.
I have 2 DataFrames to compare. Both have the same number of columns, and the comparison result should contain the field that is mismatching and the values, along with the ID.
Dataframe one
+-----+---+--------+
| name| id| City|
+-----+---+--------+
| Sam| 3| Toronto|
| BALU| 11| YYY|
|CLAIR| 7|Montreal|
|HELEN| 10| London|
|HELEN| 16| Ottawa|
+-----+---+--------+
Dataframe two
+-------------+-----------+-------------+
|Expected_name|Expected_id|Expected_City|
+-------------+-----------+-------------+
| SAM| 3| Toronto|
| BALU| 11| YYY|
| CLARE| 7| Montreal|
| HELEN| 10| Londn|
| HELEN| 15| Ottawa|
+-------------+-----------+-------------+
Expected Output
+---+------------+--------------+-----+
| ID|Actual_value|Expected_value|Field|
+---+------------+--------------+-----+
| 7| CLAIR| CLARE| name|
| 3| Sam| SAM| name|
| 10| London| Londn| City|
+---+------------+--------------+-----+
Code
Create example data
from pyspark.sql import SQLContext
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import SparkSession
sc = SparkContext()
sql_context = SQLContext(sc)
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR") # log only on fails
df_Actual = sql_context.createDataFrame(
[("Sam", 3,'Toronto'), ("BALU", 11,'YYY'), ("CLAIR", 7,'Montreal'),
("HELEN", 10,'London'), ("HELEN", 16,'Ottawa')],
["name", "id","City"]
)
df_Expected = sql_context.createDataFrame(
[("SAM", 3,'Toronto'), ("BALU", 11,'YYY'), ("CLARE", 7,'Montreal'),
("HELEN", 10,'Londn'), ("HELEN", 15,'Ottawa')],
["Expected_name", "Expected_id","Expected_City"]
)
Create empty dataframe for Result
field = [
StructField("ID",StringType(), True),
StructField("Actual_value", StringType(), True),
StructField("Expected_value", StringType(), True),
StructField("Field", StringType(), True)
]
schema = StructType(field)
Df_Result = sql_context.createDataFrame(sc.emptyRDD(), schema)
Join expected and actual on id's
df_cobined = df_Actual.join(df_Expected, (df_Actual.id == df_Expected.Expected_id))
col_names=df_Actual.schema.names
Loop through each column to find mismatches
for col_name in col_names:
    # Filter for column values not matching
    df_comp = df_cobined.filter(col(col_name) != col("Expected_" + col_name))\
        .select(col('id'), col(col_name), col("Expected_" + col_name))
    # Add not matching column name
    df_comp = df_comp.withColumn("Field", lit(col_name))
    # Add to final result
    Df_Result = Df_Result.union(df_comp)
Df_Result.show()
This code works as expected. However, in the real case I have more columns and millions of rows to compare, and with this code the comparison takes a long time to finish. Is there a better way to improve the performance and get the same result?
One way to avoid doing the union is the following:
Create a list of columns to compare: to_compare
Next select the id column and use pyspark.sql.functions.when to compare the columns. For those with a mismatch, build an array of structs with 3 fields: (Actual_value, Expected_value, Field) for each column in to_compare
Explode the temp array column and drop the nulls
Finally select the id and use col.* to expand the values from the struct into columns.
Code (an array of structs is used as a temporary column to hold the mismatched fields):
import pyspark.sql.functions as f
# these are the fields you want to compare
to_compare = [c for c in df_Actual.columns if c != "id"]
df_new = df_cobined.select(
"id",
f.array([
f.when(
f.col(c) != f.col("Expected_"+c),
f.struct(
f.col(c).alias("Actual_value"),
f.col("Expected_"+c).alias("Expected_value"),
f.lit(c).alias("Field")
)
).alias(c)
for c in to_compare
]).alias("temp")
)\
.select("id", f.explode("temp"))\
.dropna()\
.select("id", "col.*")
df_new.show()
#+---+------------+--------------+-----+
#| id|Actual_value|Expected_value|Field|
#+---+------------+--------------+-----+
#| 7| CLAIR| CLARE| name|
#| 10| London| Londn| City|
#| 3| Sam| SAM| name|
#+---+------------+--------------+-----+
Join only those records where the expected id equals the actual id and there is a mismatch in any other column:
df1.join(df2, (df1.id == df2.id) & ((df1.name != df2.name) | (df1.age != df2.age) | ...))
This way the subsequent processing only runs over the mismatched rows instead of the whole dataset.
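A sketch of how that condition could be built programmatically for the columns in this question (the reduce construction and the df_mismatched name are assumptions, not part of the original answer):
from functools import reduce
from pyspark.sql.functions import col

# columns to compare, i.e. everything except the join key
compare_cols = [c for c in df_Actual.columns if c != "id"]
# "any column mismatches" condition: (name != Expected_name) | (City != Expected_City) | ...
# note: != treats nulls as "no mismatch"; use ~col(c).eqNullSafe(col("Expected_" + c)) if nulls must be flagged
mismatch = reduce(lambda a, b: a | b,
                  [col(c) != col("Expected_" + c) for c in compare_cols])
df_mismatched = df_Actual.join(
    df_Expected,
    (df_Actual.id == df_Expected.Expected_id) & mismatch
)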
For those who are looking for an answer: I transposed the DataFrames and then did the comparison.
from pyspark.sql.functions import array, col, explode, struct, lit
def Transposedf(df, by, colheader):
    # Filter dtypes and split into column names and type description
    cols, dtypes = zip(*((c, t) for (c, t) in df.dtypes if c not in by))
    # Spark SQL supports only homogeneous columns
    assert len(set(dtypes)) == 1, "All columns have to be of the same type"
    # Create and explode an array of (column_name, column_value) structs
    kvs = explode(array([struct(lit(c).alias("Field"), col(c).alias(colheader)) for c in cols])).alias("kvs")
    return df.select(by + [kvs]).select(by + ["kvs.Field", "kvs." + colheader])
Then the comparison looks like this
def Compare_df(df_Expected, df_Actual):
    df_combined = (df_Actual
                   .join(df_Expected, ((df_Actual.id == df_Expected.id)
                                       & (df_Actual.Field == df_Expected.Field)
                                       & (df_Actual.Actual_value != df_Expected.Expected_value)))
                   .select([df_Actual.id, df_Actual.Field,
                            df_Actual.Actual_value, df_Expected.Expected_value])
                   )
    return df_combined
I called these 2 functions as follows:
df_Actual=Transposedf(df_Actual, ["id"],'Actual_value')
df_Expected=Transposedf(df_Expected, ["id"],'Expected_value')
#Compare the expected and actual
df_result=Compare_df(df_Expected,df_Actual)
I have a DataFrame with two columns: BrandWatchErwaehnungID and word_counts.
The word_counts column is the output of CountVectorizer (a sparse vector). After dropping the empty rows, I created two new columns: one with the indices of the sparse vector and one with their values.
help0 = countedwords_text['BrandWatchErwaehnungID','word_counts'].rdd\
.filter(lambda x : x[1].indices.size!=0)\
.map(lambda x : (x[0],x[1],DenseVector(x[1].indices) , DenseVector(x[1].values))).toDF()\
.withColumnRenamed("_1", "BrandWatchErwaenungID").withColumnRenamed("_2", "word_counts")\
.withColumnRenamed("_3", "word_indices").withColumnRenamed("_4", "single_word_counts")
I needed to convert them to dense vectors before adding them to my DataFrame because Spark did not accept numpy.ndarray. My problem is that I now want to explode that DataFrame on the word_indices column, but the explode method from pyspark.sql.functions only supports array or map types as input.
I have tried:
help1 = help0.withColumn('b' , explode(help0.word_indices))
and get the following error:
cannot resolve 'explode(`word_indices')' due to data type mismatch: input to function explode should be array or map type
Afterwards I tried:
help1 = help0.withColumn('b' , explode(help0.word_indices.toArray()))
This also did not work.
Any suggestions?
You have to use a udf:
from pyspark.sql.functions import udf, explode
from pyspark.sql.types import *
from pyspark.ml.linalg import *
@udf("array<integer>")
def indices(v):
    if isinstance(v, DenseVector):
        return list(range(len(v)))
    if isinstance(v, SparseVector):
        return v.indices.tolist()
df = spark.createDataFrame([
(1, DenseVector([1, 2, 3])), (2, SparseVector(5, {4: 42}))],
("id", "v"))
df.select("id", explode(indices("v"))).show()
# +---+---+
# | id|col|
# +---+---+
# | 1| 0|
# | 1| 1|
# | 1| 2|
# | 2| 4|
# +---+---+
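Applied to the DataFrame from the question, that would presumably look something like this (help0 and word_counts are the names from the question; the word_index column name is made up):
# explode the indices of the original sparse word_counts vector directly,
# instead of going through the DenseVector word_indices column
help1 = help0.withColumn('word_index', explode(indices('word_counts')))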
I want to get C4 by using a formula; for example, when c1='104001', how do I calculate C4?
You can use something like this below to add another column:
from pyspark.sql import Row
from pyspark import SparkContext, SQLContext
from pyspark.sql.functions import udf
sc = SparkContext()
sqlContext = SQLContext(sc)
l = [(25,24),[23,45],[24,56]]
rdd = sc.parallelize(l)
dummy = rdd.map(lambda x: Row(var1=int(x[0]),var2=int(x[1])))
dummyframe = sqlContext.createDataFrame(dummy)
def getValDivideSum(dataFrame):
    max = dataFrame.agg({"var2": 'sum'}).collect()[0][0]
    dataFrame = dataFrame.withColumn("var3", dataFrame.var2/max).select("var1", "var2", "var3")
    return dataFrame
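To produce the output shown below, the function presumably still has to be invoked, for example:
getValDivideSum(dummyframe).show()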
Output will be something like this:
+----+----+-----+
|var1|var2| var3|
+----+----+-----+
| 25| 24|0.192|
| 23| 45| 0.36|
| 24| 56|0.448|
+----+----+-----+
Hope this helps.