Flatten XML dataframe in spark - python

from pyspark.sql.functions import col

def flatten_df(nested_df):
    # Keep flattening one level of struct columns per pass until none remain.
    exist = True
    flat_df = nested_df
    while exist:
        flat_cols = [c[0] for c in nested_df.dtypes if c[1][:6] != 'struct']
        nested_cols = [c[0] for c in nested_df.dtypes if c[1][:6] == 'struct']
        if len(nested_cols) > 0:
            print(nested_cols)
            flat_df = nested_df.select(
                flat_cols +
                [col("`" + nc + "`.`" + c + "`").alias((nc + '_' + c).replace(".", "_"))
                 for nc in nested_cols
                 for c in nested_df.select("`" + nc + "`.*").columns])
            nested_df = flat_df
        else:
            exist = False
    return flat_df

df = sqlContext.read.format("com.databricks.spark.xml") \
    .option("rowTag", "GetDocument") \
    .load("/FileStore/tables/test.xml")
df1 = flatten_df(df)
Here is the code I am using to flatten an XML document. Basically, I want to take an XML file with nested XML and flatten all of it into a single row without any struct datatypes, so each value becomes a column. The above code works for the test cases I have done, but when I try it on a very large XML file, after a couple of rounds of flattening (in the while loop) it breaks with the following error:
'Ambiguous reference to fields StructField(_Id,StringType,true), StructField(_id,StringType,true);'
I assume this is because it is trying to create two separate columns with the same name. How can I avoid this while keeping my code generic for any XML?
One thing to note: it is okay to have arrays as a datatype for a column, as I will be exploding those arrays into separate rows in a later step.
Update example
Original DF -
|-- Order: long (nullable = true)
|-- attval: string (nullable = true)
|-- children: struct (nullable = true)
| |-- id: string (nullable = true)
| |-- att: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- Order: long (nullable = true)
| | | |-- attval: string (nullable = true)
DF after function -
|-- Order: long (nullable = true)
|-- attval: string (nullable = true)
|-- children_id: string (nullable = true)
|-- children_att: array (nullable = true)
| |-- children_att_element_Order: long (nullable = true)
| |-- children_att_element_attval: string (nullable = true)
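One way to avoid the collision (a hedged sketch based on the assumption above that the failure comes from fields differing only in case, not something from the original post) is to make Spark's analyzer case-sensitive before reading and flattening, so that _Id and _id resolve as distinct fields:
# Assumption: the ambiguity comes from struct fields that differ only by case,
# so case-sensitive resolution keeps _Id and _id apart.
spark.conf.set("spark.sql.caseSensitive", "true")

df = sqlContext.read.format("com.databricks.spark.xml") \
    .option("rowTag", "GetDocument") \
    .load("/FileStore/tables/test.xml")
df1 = flatten_df(df)
Alternatively, the alias built inside flatten_df could be deduplicated case-insensitively (for example by appending a running suffix when a lower-cased name has already been generated), which keeps the output usable even with case-insensitive resolution.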

I was facing a similar issue and was able to parse my XML file as follows:
Install the following Maven library on Databricks: "com.databricks:spark-xml_2.10:0.4.1"
Upload your file to DBFS using the following path: FileStore > tables > xml > sample_data
Run the following code:
data = spark.read.format("com.databricks.spark.xml") \
    .option("rootTag", "col1") \
    .option("rowTag", "col2") \
    .option("rowTag", "col3") \
    .load("dbfs:/FileStore/tables/sample_data.xml")
display(data)

Related

How to apply a schema from an existing DataFrame to another DataFrame with missing columns in PySpark

I have a JSON file with various levels of nested struct/array columns in one DataFrame, df_1. I have a smaller DataFrame, df_2, with fewer columns, but its column names match some of the column names in df_1, and it has none of the nested structure.
I want to apply the schema from df_1 to df_2 so that the two share the same schema, taking the existing columns in df_2 where possible, and creating the columns/nested structure that exist in df_1 but not in df_2.
df_1
root
|-- association_info: struct (nullable = true)
| |-- ancestry: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- doi: string (nullable = true)
| |-- gwas_catalog_id: string (nullable = true)
| |-- neg_log_pval: double (nullable = true)
| |-- study_id: string (nullable = true)
| |-- pubmed_id: string (nullable = true)
| |-- url: string (nullable = true)
|-- gold_standard_info: struct (nullable = true)
| |-- evidence: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- class: string (nullable = true)
| | | |-- confidence: string (nullable = true)
| | | |-- curated_by: string (nullable = true)
| | | |-- description: string (nullable = true)
| | | |-- pubmed_id: string (nullable = true)
| | | |-- source: string (nullable = true)
| |-- gene_id: string (nullable = true)
| |-- highest_confidence: string (nullable = true)
df_2
root
|-- study_id: string (nullable = true)
|-- description: string (nullable = true)
|-- gene_id: string (nullable = true)
The expected output would be to have the same schema as df_1, and for any columns that don't exist in df_2 to just be filled with null.
I have tried completely flattening the structure of df_1 to join the two DataFrames, but then I'm unsure how to change it back into the original schema. All solutions I've attempted so far have been in PySpark. It would be preferable to use PySpark for performance reasons, but if a solution requires converting to a Pandas DataFrame, that's also feasible.
df1.select('association_info.study_id',
           'gold_standard_info.evidence.element.description',
           'gold_standard_info.gene_id')
The above code reaches into df_1 and gives you the requisite fields for df_2; the schema of those fields remains the same.
Could you try the same?
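The select above only pulls matching leaf fields out of df_1; it does not rebuild df_1's nested structure on top of df_2. A rough sketch of one way to conform df_2 to df_1's schema (an illustration under assumptions, not taken from the answer: it assumes matching leaf names have compatible types, and it leaves fields nested inside arrays, such as evidence.description, as null):
from pyspark.sql import functions as F
from pyspark.sql.types import StructType

def build_field(dtype, available_cols):
    # Rebuild a column of the given type: structs are rebuilt field by field,
    # leaf fields whose name matches a flat df_2 column are taken from df_2,
    # and everything else becomes a NULL cast to the right type.
    if isinstance(dtype, StructType):
        return F.struct(*[
            (F.col(f.name).cast(f.dataType)
             if f.name in available_cols and not isinstance(f.dataType, StructType)
             else build_field(f.dataType, available_cols)).alias(f.name)
            for f in dtype.fields
        ])
    return F.lit(None).cast(dtype)

def conform_to_schema(df_small, df_full):
    # Produce a DataFrame with df_full's schema, sourced from df_small where possible.
    available = set(df_small.columns)
    return df_small.select([
        (F.col(f.name).cast(f.dataType)
         if f.name in available and not isinstance(f.dataType, StructType)
         else build_field(f.dataType, available)).alias(f.name)
        for f in df_full.schema.fields
    ])

df_2_conformed = conform_to_schema(df_2, df_1)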

Spark 3.2.2 Joining same dataframe multiple times does not drop column

We have some PySpark code that joins a table, table_a, twice to another table, table_b, using the following code. After joining the table twice, we drop the key_hash column from the output DataFrame.
This code was working fine in Spark version 3.0.1. Since upgrading to Spark version 3.2.2 the behaviour has changed: during the first transform operation the key_hash field gets dropped from the output DataFrame, but when the second transform operation is executed the key_hash field still stays in output_df.
Can someone please explain what has changed in Spark's behaviour to cause this issue?
def tr_join_sac_user(df_a):
    def inner(df_b):
        return (
            df_b.join(df_a, on=df_b["sac_key_hash"] == df_a["key_hash"], how="left")
            .drop(df_a.key_hash)
            .drop(df_b.sac_key_hash)
        )
    return inner

def tr_join_sec_user(df_a):
    def inner(df_b):
        return (
            df_b.join(df_a, on=df_b["sec_key_hash"] == df_a["key_hash"], how="left")
            .drop(df_a.key_hash)
            .drop(df_b.sec_key_hash)
        )
    return inner

table_a_df = spark.read.format("delta").load("/path/to/table_a")
table_b_df = spark.read.format("delta").load("/path/to/table_b")

output_df = table_b_df.transform(tr_join_sac_user(table_a_df))
output_df = output_df.transform(tr_join_sec_user(table_a_df))
If we use .drop('key_hash') instead of .drop(df_a.key_hash), that seems to work and the column gets dropped in the second transform as well. I would like to understand what has changed in Spark's behaviour between these versions (or whether it's a bug), as this might have an impact in other places in our codebase as well.
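For reference, a sketch of the string-based variant described above (same names as in the question, shown only to make the change concrete):
def tr_join_sec_user(df_a):
    def inner(df_b):
        return (
            df_b.join(df_a, on=df_b["sec_key_hash"] == df_a["key_hash"], how="left")
            # Dropping by column name instead of via the df_a.key_hash reference
            # is what the question reports as working on Spark 3.2.2.
            .drop("key_hash")
            .drop("sec_key_hash")
        )
    return inner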
Hi, I also have an issue with this one. I don't know if it's a bug or not, but it doesn't seem to happen all the time.
utilization_raw = time_lab.crossJoin(approved_listing)

utilization_raw = (
    utilization_raw
    .join(availability_series,
          (utilization_raw.date_series == availability_series.availability_date) &
          (utilization_raw.listing_id == availability_series.listing_id), "left")
    .drop(availability_series.listing_id).dropDuplicates()  # --> WORKING
    .join(request_series,
          (utilization_raw.date_series == request_series.request_date_series) &
          (utilization_raw.listing_id == request_series.listing_id), "left")
    .drop(request_series.listing_id)  # --> WORKING
    .join(listing_pricing,
          (utilization_raw.date_series == listing_pricing.price_created_date) &
          (utilization_raw.listing_id == listing_pricing.listing_id), "left")
    .drop(listing_pricing.listing_id)  # --> NOT WORKING
)
Here's the result of printSchema()
root
|-- date_series: date (nullable = false)
|-- week_series: date (nullable = true)
|-- month_series: date (nullable = true)
|-- woy_num: integer (nullable = false)
|-- doy_num: integer (nullable = false)
|-- dow_num: integer (nullable = false)
|-- listing_id: integer (nullable = true)
|-- is_driverless: integer (nullable = false)
|-- listing_deleted_at: date (nullable = true)
|-- daily_gmv: decimal(38,23) (nullable = true)
|-- daily_nmv: decimal(38,23) (nullable = true)
|-- daily_calendar_gmv: decimal(31,13) (nullable = true)
|-- daily_calendar_nmv: decimal(31,13) (nullable = true)
|-- active_booking: long (nullable = true)
|-- is_available: integer (nullable = false)
|-- is_requested: integer (nullable = false)
|-- listing_id: integer (nullable = true) --> duplicated
|-- base_price: decimal(10,2) (nullable = true)
Update: what we did was upgrade the Databricks runtime version from 9.1 to 11.3.
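If upgrading is not an option, one way to sidestep the duplicated listing_id entirely (a sketch reusing the names above, not part of the original comment) is to rename the join key on the right-hand side before joining, so the result never contains two columns with the same name:
# Workaround sketch: rename the right-hand join key up front so no duplicate
# listing_id column can survive the join, then drop it by name.
lp = listing_pricing.withColumnRenamed("listing_id", "lp_listing_id")

utilization_raw = (
    utilization_raw
    .join(lp,
          (utilization_raw.date_series == lp.price_created_date) &
          (utilization_raw.listing_id == lp.lp_listing_id), "left")
    .drop("lp_listing_id")
)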

Flatten a dynamic nested struct (struct inside struct) in PySpark

I'm struggling to flatten a JSON schema that has structs inside structs. The problem is that the inner structs' names are dynamic, so I can't access them easily using "." notation.
The schema is similar to:
root
|-- A: string (nullable = true)
|-- Plugins: struct (nullable = true)
| |-- RfS: struct (nullable = true)
| | |-- A
| | |-- B
| |-- RtW: struct (nullable = true)
| | |-- A
| | |-- B
So the As and Bs are fixed, but every JSON file has a different number of structs with different names (RfS, RtW); there could be 2, there could be 5, with dynamic names that I don't know in advance.
How can I flatten this structure easily, in a dynamic way?
The following solution uses a single select, with the chain function to flatten the final list of columns:
from pyspark.sql.functions import col
from itertools import chain
jsonData = """{
"A" : "some A",
"Plugins": {
"RfS": {
"A" : "RfSA",
"B" : "RfSB"
},
"RtW" : {
"A" : "RtWA",
"B" : "RtWA"
}
}
}"""
df = spark.read.json(sc.parallelize([jsonData]))
no_plug_cols = ["A"] # cols not in Plugins i.e A
plug_df = df.select("A", "Plugins.*")
# plug_df.printSchema()
# root
# |-- A: string (nullable = true)
# |-- RfS: struct (nullable = true)
# | |-- A: string (nullable = true)
# | |-- B: string (nullable = true)
# |-- RtW: struct (nullable = true)
# | |-- A: string (nullable = true)
# | |-- B: string (nullable = true)
# note that we use sets i.e set(plug_df.columns) - set(no_plug_cols) to retrieve cols in Plugins only
icols = [(col(f"{c}.A").alias(f"{c}.A"), col(f"{c}.B").alias(f"{c}.B"))
         for c in (set(plug_df.columns) - set(no_plug_cols))]
# we use chain to flatten icols which is a list of tuples
plug_df.select(no_plug_cols + list(chain(*icols))).show()
# +------+-----+-----+-----+-----+
# | A|RfS.A|RfS.B|RtW.A|RtW.B|
# +------+-----+-----+-----+-----+
# |some A| RfSA| RfSB| RtWA| RtWA|
# +------+-----+-----+-----+-----+
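Since the struct names under Plugins are not known in advance, the same select list can also be built from plug_df.dtypes instead of hard-coding .A and .B. A sketch of that variant (assuming a single level of nesting under Plugins, as in the sample JSON):
from pyspark.sql.functions import col

plug_df = df.select("A", "Plugins.*")
# Collect the dynamically named struct columns (RfS, RtW, ...) at this level.
struct_cols = [c for c, t in plug_df.dtypes if t.startswith("struct")]
flat_cols = [col("A")] + [
    col(f"`{sc}`.`{f}`").alias(f"{sc}_{f}")
    for sc in struct_cols
    for f in plug_df.select(f"`{sc}`.*").columns
]
plug_df.select(flat_cols).show()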

Rename nested struct columns to all in lower case in a Spark DataFrame using PySpark

A similar solution is already available using Scala, but I need a solution in PySpark. I am new to Python, so I need your help with this.
Below is the link to the Scala solution, for a better understanding of the requirement:
Rename nested struct columns in a Spark DataFrame
I am trying to change the names of a DataFrame's columns in Python. I am easily able to change the column names for direct fields, but I'm facing difficulty converting array/struct columns.
Below is my DataFrame schema.
|-- VkjLmnVop: string (nullable = true)
|-- KaTasLop: string (nullable = true)
|-- AbcDef: struct (nullable = true)
| |-- UvwXyz: struct (nullable = true)
| | |-- MnoPqrstUv: string (nullable = true)
| | |-- ManDevyIxyz: string (nullable = true)
But I need the schema like below
|-- vkjlmnvop: string (nullable = true)
|-- kataslop: string (nullable = true)
|-- abcdef: struct (nullable = true)
| |-- uvwxyz: struct (nullable = true)
| | |-- mnopqrstuv: string (nullable = true)
| | |-- mandevyixyz: string (nullable = true)
How can I change struct column names dynamically?
I have also found a different solution with similar logic but fewer lines of code.
import pyspark.sql.functions as spf
ds = {'AbcDef': {'UvwXyz': {'VkjLmnVop': 'abcd'}}, 'HijKS': 'fgds'}
df = spark.read.json(sc.parallelize([ds]))
df.printSchema()
"""
root
|-- AbcDef: struct (nullable = true)
| |-- UvwXyz: struct (nullable = true)
| | |-- VkjLmnVop: string (nullable = true)
|-- HijKS: string (nullable = true)
"""
for i in df.columns:
    df = df.withColumnRenamed(i, i.lower())

# Parse the DataFrame's string representation into "column: type" pairs.
schemaDef = [y.replace("]", "") for y in
             [x.replace("DataFrame[", "") for x in df.__str__().split(", ")]]

for j in schemaDef:
    columnName = j.split(": ")[0]
    dataType = j.split(": ")[1]
    # Casting to the lower-cased type string also renames the nested struct fields.
    df = df.withColumn(columnName, spf.col(columnName).cast(dataType.lower()))
df.printSchema()
"""
root
|-- abcdef: struct (nullable = true)
| |-- uvwxyz: struct (nullable = true)
| | |-- vkjlmnvop: string (nullable = true)
|-- hijks: string (nullable = true)
"""
I guess this is what you wanted. Hope it helps!
def get_column_wise_schema(df_string_schema, df_columns):
    # Returns a dictionary mapping each column name to its schema as a string.
    column_schema_dict = {}
    i = 0
    while i < len(df_columns):
        current_col = df_columns[i]
        next_col = df_columns[i + 1] if i < len(df_columns) - 1 else None
        current_col_split_key = '[' + current_col + ': ' if i == 0 else ' ' + current_col + ': '
        next_col_split_key = ']' if i == len(df_columns) - 1 else ', ' + next_col + ': '
        column_schema_dict[current_col] = df_string_schema.split(current_col_split_key)[1]. \
            split(next_col_split_key)[0]
        i += 1
    return column_schema_dict

def convert_colnames_to_lower(spark_df):
    columns = spark_df.columns
    column_wise_schema_dict = get_column_wise_schema(spark_df.__str__(), columns)
    col_exprs = []
    for column_name in columns:
        column_schema_lowercase = column_wise_schema_dict[column_name].lower()
        # Casting to the lower-cased schema string renames the nested fields,
        # and the alias lower-cases the top-level column name.
        col_exprs.append(spf.col(column_name).cast(column_schema_lowercase).
                         alias(column_name.lower()))
    return spark_df.select(*col_exprs)
ds = {'AbcDef': {'UvwXyz': {'VkjLmnVop': 'abcd'}}, 'HijKS': 'fgds'}
df = spark.read.json(sc.parallelize([ds]))
df.printSchema()
"""
root
|-- AbcDef: struct (nullable = true)
| |-- UvwXyz: struct (nullable = true)
| | |-- VkjLmnVop: string (nullable = true)
|-- HijKS: string (nullable = true)
"""
converted_df = convert_colnames_to_lower(df)
converted_df.printSchema()
"""
root
|-- abcdef: struct (nullable = true)
| |-- uvwxyz: struct (nullable = true)
| | |-- vkjlmnvop: string (nullable = true)
|-- hijks: string (nullable = true)
"""

PySpark - Json explode nested with Struct and array of struct

I am trying to parse nested JSON, using some sample JSON. Below is the printed schema:
|-- batters: struct (nullable = true)
| |-- batter: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- id: string (nullable = true)
| | | |-- type: string (nullable = true)
|-- id: string (nullable = true)
|-- name: string (nullable = true)
|-- ppu: double (nullable = true)
|-- topping: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: string (nullable = true)
| | |-- type: string (nullable = true)
|-- type: string (nullable = true)
I am trying to explode batters and topping separately and combine them.
df_batter = df_json.select("batters.*")
df_explode1 = df_batter.withColumn("batter", explode("batter")).select("batter.*")

df_explode2 = df_json.withColumn("topping", explode("topping")) \
    .select("id", "type", "name", "ppu", "topping.*")
I am unable to combine the two DataFrames. I also tried using a single query:
exploded1 = df_json.withColumn("batter", df_batter.withColumn("batter", explode("batter"))) \
    .withColumn("topping", explode("topping")) \
    .select("id", "type", "name", "ppu", "topping.*", "batter.*")
But I am getting an error. Kindly help me solve it. Thanks.
You basically have to explode the arrays together using arrays_zip, which returns a merged array of structs. Try this; I haven't tested it, but it should work.
from pyspark.sql import functions as F

df_json.select("id", "type", "name", "ppu", "topping", "batters.*") \
    .withColumn("zipped", F.explode(F.arrays_zip("batter", "topping"))) \
    .select("id", "type", "name", "ppu", "zipped.*").show()
You could also do it one by one:
from pyspark.sql import functions as F

df1 = df_json.select("id", "type", "name", "ppu", "topping", "batters.*") \
    .withColumn("batter", F.explode("batter")) \
    .select("id", "type", "name", "ppu", "topping", "batter")
df1.withColumn("topping", F.explode("topping")) \
    .select("id", "type", "name", "ppu", "topping.*", "batter.*")
