I want to partition a DataFrame and iterate over each partition, with some initialization in each iteration.
data = [("2022-12-22",'d1',2,{'u1':{'sn':['s1','s2'],'fs':200}}),
("2022-12-22",'d2',1,{'u2':{'sn':['s2'],'fs':150},
'u3':{'sn':['s1'],'fs':50}}),
("2022-12-23",'d1',20,{'u1':{'sn':['s1','s2'],'fs':2000}}),
("2022-12-23",'d2',1,{'u2':{'sn':['s2'],'fs':1500},
'u3':{'sn':['s1'],'fs':500}})
]
usernode_schema = StructType([
StructField("sn",ArrayType(StringType())),
StructField("fs",LongType()),
])
userNodeType_schema = MapType(StringType(),usernode_schema)
schema = StructType([
StructField("date",StringType()),
StructField("id",StringType()),
StructField("rank",IntegerType()),
StructField("umap",userNodeType_schema)
])
df = spark.createDataFrame(data=data, schema = schema)
df.show(truncate=False)
def f_fm_row(x,sn_d,i):
sn_d['s1'] = sn_d.get('s1',0)+i
return {"date":x["date"], "id":x["id"],"rank":x["rank"],"s1":sn_d['s1']}
fm_row_schema = StructType([
StructField("date",StringType()),
StructField("id",StringType()),
StructField("rank",IntegerType()),
StructField("s1",IntegerType())
])
def f_map_par(par_df):
for i in range(0,2):
sn_dict ={}
par_df = map(lambda row: f_fm_row(row,sn_dict,i),par_df)
return par_df
#map partitions
(df
.repartition(col("date"))
.sortWithinPartitions(col("rank"))
.rdd
.mapPartitions(f_map_par)
).toDF(fm_row_schema).show()
But sn_dict = {} is not happening in each iteration; it happens only once. How can I make sure that this initialization happens in each iteration?
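As a note on the behavior: map() is lazy, so the lambdas only run once the iterator is finally consumed, by which point i and sn_dict hold the values from the last pass. A minimal sketch of one way around that, reusing the same f_fm_row as above, is to materialize each pass eagerly so the fresh sn_dict takes effect per iteration:

def f_map_par(par_df):
    rows = list(par_df)              # materialize the partition once
    for i in range(0, 2):
        sn_dict = {}                 # fresh state for this pass
        # evaluate eagerly so this pass uses its own sn_dict and i
        rows = [f_fm_row(row, sn_dict, i) for row in rows]
    return iter(rows)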
I have the following code as shown below. I need to check if the column y.lc.eoouh.ci is present in the input source and populate the column only if it is present; otherwise it should be NULL. (The key lc is also optional.)
The code below doesn't seem to work the way it is supposed to: even though y.lc.eoouch.ci is present in the input, it evaluates to NULL.
The has_column implementation is from here.
df = df_s_a \
    .withColumn("ceci", \
                udf(
                    lambda y: y.lc[-1].eoouh.ci \
                    if has_column(y, 'lc.eoouh.ci') \
                    else None, \
                    StringType()
                )(col('eh') \
                  ) \
                ) \
    .select(
        col('ceci')
    )
df.show()
Sample input:
{
  eh: {
    lc: [
      eoouch: {
        ci: "1234ABC"
      }
    ]
  }
}
The df[something.path.somewhere] doesn't work. I'll have to investigate that option a bit.
I've managed to make it work like this:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType


def has_column(df):
    try:
        df["lc"][0]["eoouch"]["ci"]
        return True
    except KeyError:
        return False


if __name__ == "__main__":
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext

    data = [
        {"eh": {"lc": [{"eoouch": {"ci": "test"}}]}},
        {"eh": {"lc": [{"eoouch": {"as": "test"}}]}},
    ]
    df = spark.createDataFrame(data)

    add_column_udf = F.udf(
        lambda y: y if has_column(y) else None,
        StringType(),
    )
    df = df.withColumn("ceci", add_column_udf(F.col("eh")))
Result:
+----------------------------------+-------------------------+
|eh |ceci |
+----------------------------------+-------------------------+
|{lc -> [{eoouch -> {ci -> test}}]}|{lc=[{eoouch={ci=test}}]}|
|{lc -> [{eoouch -> {as -> test}}]}|null |
+----------------------------------+-------------------------+
It's not perfect, since the column path is hard-coded rather than general, but it could easily be generalized because it works on a dict object; a sketch of one possible generalization follows.
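For instance, a possible sketch of such a generalization (has_path is a hypothetical helper, not from the original post; it walks a dotted path over the dict and steps into the first element of any intermediate list, mirroring the lc[0] access above):

def has_path(obj, path):
    try:
        for key in path.split("."):
            if isinstance(obj, list):
                obj = obj[0]
            obj = obj[key]
        return True
    except (KeyError, IndexError, TypeError):
        return False

# e.g. has_path(y, "lc.eoouch.ci") inside the UDF instead of has_column(y)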
With this dataset:
start,end,rms,state,maxTemp,minTemp
2019-02-20T16:16:31.752Z,2019-02-20T17:33:34.750Z,4.588481,charge,35.0,32.0
2019-02-20T17:33:34.935Z,2019-02-20T18:34:49.737Z,5.770562,discharge,35.0,33.0
And this:
[{"EventDate":"2019-02-02T16:17:00.579Z","Value":"23"},
{"EventDate":"2019-02-02T16:18:01.579Z","Value":"23"},
{"EventDate":"2019-02-02T16:19:02.581Z","Value":"23"},
{"EventDate":"2019-02-02T16:20:03.679Z","Value":"23"},
{"EventDate":"2019-02-02T16:21:04.684Z","Value":"23"},
{"EventDate":"2019-02-02T17:40:05.693Z","Value":"23"},
{"EventDate":"2019-02-02T17:40:06.694Z","Value":"23"},
{"EventDate":"2019-02-02T17:40:07.698Z","Value":"23"},
{"EventDate":"2019-02-02T17:40:08.835Z","Value":"23"}]
schema = StructType([
    StructField('EventDate', TimestampType(), True),
    StructField('Value', FloatType(), True)
])
I want to add the max and min values of the JSON dataset as columns to the CSV dataset.
I have tried:
cyclesWithValues = csvDf\
    .withColumn("max", jsondata.filter((col("EventDate") >= csvDf.start) & (col("EventDate") <= csvDf.end)).agg({"value": "max"}).head()["max(Value)"])\
    .withColumn("min", jsondata.filter((col("EventDate") >= csvDf.start) & (col("EventDate") <= csvDf.end)).agg({"value": "min"}).head()["min(Value)"])
But I get this error:
AnalysisException: 'Resolved attribute(s) start#38271,end#38272 missing from EventDate#38283,Value#38286 in operator !Filter ((EventDate#38283 >= start#38271) && (EventDate#38283 <= end#38272)).;;\n!Filter ((EventDate#38283 >= start#38271) && (EventDate#38283 <= end#38272))\n+- Project [EventDate#38283, cast(Value#38280 as float) AS Value#38286]\n +- Project [to_timestamp(EventDate#38279, None) AS EventDate#38283, Value#38280]\n +- Relation[EventDate#38279,Value#38280] json\n'
I have a solution based on arrays, but it seems very slow, so I was hoping something like this would speed things up a bit.
Right now I am using this solution:
from multiprocessing.pool import ThreadPool  # import assumed; not shown in the original
from pyspark.sql.functions import col, to_timestamp
from pyspark.sql.types import StructType, StructField, TimestampType, FloatType

dfTemperature = spark.read.option("multiline", "true").json("path")
dfTemperatureCast = dfTemperature.withColumn("EventDate", to_timestamp(dfTemperature.EventDate)).withColumn("Value", dfTemperature.Value.cast('float'))

def AddVAluesToDf(row):
    temperatures = dfTemperatureCast.filter((col("EventDate") >= row["start"]) & (col("EventDate") <= row["end"]))
    maxTemp = temperatures.agg({"value": "max"}).head()["max(value)"]
    minTemp = temperatures.agg({"value": "min"}).head()["min(value)"]
    return (row.start, row.end, row.rms, row.state, maxTemp, minTemp)

pool = ThreadPool(10)
withValues = pool.map(AddVAluesToDf, rmsDf)

schema = StructType([
    StructField('start', TimestampType(), True),
    StructField('end', TimestampType(), True),
    StructField('maxTemp', FloatType(), True),
    StructField('minTemp', FloatType(), True)
])

cyclesDF = spark.createDataFrame(withValues, schema)
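For what it's worth, the same per-interval min/max can be sketched as a single range join plus an aggregation instead of a per-row driver loop, which avoids collecting results on the driver entirely. This is only a sketch, assuming csvDf and dfTemperatureCast as above and that start/end are already parsed as timestamps:

from pyspark.sql.functions import min as min_, max as max_

# left range join: each temperature reading is matched to the cycle it falls into
joined = csvDf.join(
    dfTemperatureCast,
    (dfTemperatureCast.EventDate >= csvDf.start) & (dfTemperatureCast.EventDate <= csvDf.end),
    "left",
)

# one aggregation per cycle instead of one Spark job per row
cyclesWithValues = (
    joined.groupBy("start", "end", "rms", "state")
          .agg(max_("Value").alias("maxTemp"), min_("Value").alias("minTemp"))
)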
I have written some code in PySpark to generate quantiles on a set of columns, and I am calling this function using concurrent.futures because I want this to be done on two sets of columns in parallel.
But instead of just the function being executed from the ThreadPoolExecutor, the whole code is getting executed three times.
I am calling the function generate_advisor_quartiles() from the main() method of another Python program.
from src.utils import sql_service, apa_constant as constant

from pyspark.sql.functions import monotonicallyIncreasingId
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext
from pyspark.sql.functions import lit
from pyspark.sql.types import *
from pyspark.ml.feature import *
from concurrent.futures import *
from pyspark.ml.feature import Bucketizer

import numpy as np
import pandas as pd
import os
def generate_Quantiles(df, attr_list, spark_context):
    jdf = df._jdf
    quantileList = []
    sqlContext = SQLContext(spark_context)
    fields = [StructField('attribute', StringType(), True),
              StructField('col1', DoubleType(), True),
              StructField('col2', DoubleType(), True),
              StructField('col3', DoubleType(), True),
              StructField('col4', DoubleType(), True),
              StructField('col5', DoubleType(), True)]
    schema = StructType(fields)

    for var in attr_list:
        bindt = spark_context._jvm.com.dstsystems.apa.util.DFQuantileFunction.approxQuantile(jdf, [var.col_name], [0.0, 0.25, 0.5, 0.75, 1.0], 0.0)
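        # Aside (assumption: Spark >= 2.2, where approxQuantile accepts a list of
        # columns): the public DataFrame API computes the same quantiles without
        # going through df._jdf, e.g.
        #   bindt = df.approxQuantile([var.col_name], [0.0, 0.25, 0.5, 0.75, 1.0], 0.0)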
        q0 = bindt[0][0]
        q1 = bindt[0][1]
        q2 = bindt[0][2]
        q3 = bindt[0][3]
        q4 = bindt[0][4]
        colQuantileList = [q0, q1, q2, q3, q4]
        quantileList.append(colQuantileList)

        bindt = sorted(list(set(list(bindt[0]))))
        bindt = [-float("inf")] + bindt
        bindt.insert(len(bindt), float("inf"))
        bindt.insert(len(bindt), float("NaN"))

        bucketizer = Bucketizer().setInputCol(var.col_name).setOutputCol("{}_quantile".format(var.col_name)).setSplits(bindt)
        df = bucketizer.transform(df)
        df = df.withColumn("{}_quantile".format(var.col_name), (lit(4.0) - df["{}_quantile".format(var.col_name)]))
        df.drop(var.col_name)

    quantileRDD = spark_context.parallelize(quantileList)
    quantileDF = sqlContext.createDataFrame(quantileRDD, schema)
    df.count()
    quantileDF.count()
    return df, quantileDF
def generate_advisor_quartiles(spark_context, hive_context, log, **kwargs):
    log.info("Started - Generate adviser quartile reports ")
    sql = """describe dbName.tablename"""  # .format(kwargs['sem_db'])
    op = hive_context.sql(sql)
    res = op.withColumn("ordinal_position", monotonicallyIncreasingId())
    res.registerTempTable('attribs')

    id_lst = hive_context.sql(
        "select col_name from attribs where ordinal_position <= 24 order by ordinal_position").collect()
    sql = "select %s from %s.tablename " % ((", ".join(str(v.col_name) for v in id_lst)), kwargs['sem_db'])
    id_tbl = hive_context.sql(sql)

    attr_lst = hive_context.sql(
        """select col_name from attribs where ordinal_position > 24 AND col_name not like '%vehicle%'
           AND col_name not like '%cluster_num%'
           AND col_name not like '%value_seg%' order by ordinal_position limit 2""").collect()
    vhcl_lst = hive_context.sql(
        """select col_name from attribs where ordinal_position > 24 AND col_name not like '%vehicle%'
           AND ( col_name like '%vehicle%'
                 OR col_name IN ('cluster_num', 'value_seg')
               ) order by ordinal_position""").collect()

    sqltemp = "select %s from %s.Tablename" % ((", ".join(['entity_id'] + [str(vhcl.col_name) for vhcl in vhcl_lst])), kwargs['sem_db'])
    id_tbl = hive_context.sql(sqltemp)

    attr_lst1 = attr_lst[:len(attr_lst) // 2]
    attr_lst2 = attr_lst[len(attr_lst) // 2:]

    # sqltemp = "select cast(entity_id as decimal(38,20)) , %s from %s.tablename where ud_rep = 1" % (", ".join("cast(" + str(attr.col_name) + " as decimal(38,20))" for attr in attr_lst), kwargs['sem_db'])
    sqltemp1 = "select cast(entity_id as double) , %s from %s.tablename where ud_rep = 1" % (", ".join("cast(" + str(attr.col_name) + " as double)" for attr in attr_lst1), kwargs['sem_db'])
    sqltemp2 = "select cast(entity_id as double) , %s from %s.tablename where ud_rep = 1" % (", ".join("cast(" + str(attr.col_name) + " as double)" for attr in attr_lst2), kwargs['sem_db'])

    df1 = hive_context.sql(sqltemp1)
    df1 = df1.replace(0, np.nan)
    df2 = hive_context.sql(sqltemp2)
    df2 = df2.replace(0, np.nan)

    with ThreadPoolExecutor(max_workers=2) as executor2:
        result1 = executor2.submit(generate_Quantiles, df1, attr_lst1, spark_context)
        result2 = executor2.submit(generate_Quantiles, df2, attr_lst2, spark_context)
        future_list = [result1, result2]
        for future in as_completed(future_list):
            print("completed")
        df1, df2 = result1.result()
        df3, df4 = result2.result()
        finalQuantiles = df1.join(df3, "entity_id", "inner")
        quantilValuesDF = df2.union(df4)
        finalQuantiles.show()
        quantilValuesDF.show()
I have a data frame that looks like the image above. What I want to do is loop through the SQL statements under SQL_SCRIPT, execute them, and store the results in the next column over, which would be called 'RESULTS'. When I just try to execute it (without storing it anywhere) it runs fine, but when I try to store the results in a new dataframe column it errors out with:
ValueError: cannot set a row with mismatched columns
Here is the code:
def run_tests(self):
    s = self.connection()
    df = self.retrieve_sql()
    df_type = df.loc[df['STEP_TYPE'] == 'T']
    df_to_list = df_type[['TABLE_NM', 'TEST_TABLE_NM', 'SQL_SCRIPT']]
    print(df_to_list)
    for sql_script in df_to_list['SQL_SCRIPT']:
        df_to_list.loc['RESULTS'] = pd.read_sql(sql_script, s)
    print(df_to_list)
Instead of read_sql I have also tried just using the session's execute, which also works, but I'm not sure how to store the results in the dataframe going that route:
def run_tests(self):
    s = self.connection()
    df = self.retrieve_sql()
    df_type = df.loc[df['STEP_TYPE'] == 'T']
    df_to_list = df_type[['TABLE_NM', 'TEST_TABLE_NM', 'SQL_SCRIPT']]
    print(df_to_list)
    for sql_script in df_to_list['SQL_SCRIPT']:
        s.execute(sql_script)
Here is the connection function, if needed:
def connection(self):
    con = self.load_json_file()
    cfg_dsn = con['config']['dsn']
    cfg_usr = con['config']['username']
    cfg_pwd = con['config']['password']
    udaExec = teradata.UdaExec(appName="DataAnalysis", version="1.0", logConsole=False)
    session = udaExec.connect(method="odbc", dsn=cfg_dsn, username=cfg_usr, password=cfg_pwd)
    return session
Consider running Series.apply on the column of SQL strings.
def run_tests(self):
    s = self.connection()
    c = s.cursor()  # OPEN CURSOR
    df = self.retrieve_sql()
    df_type = df.loc[df['STEP_TYPE'] == 'T']
    df_to_list = df_type[['TABLE_NM', 'TEST_TABLE_NM', 'SQL_SCRIPT']]
    print(df_to_list)

    # NEW METHOD TO RUN QUERY
    def sql_run(x):
        c.execute(x)
        if c.rowcount > 0:
            res = c.fetchone()[0]
        else:
            res = np.nan
        return res

    df_to_list['RESULTS'] = df_to_list['SQL_SCRIPT'].apply(sql_run)
    print(df_to_list)
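If pd.read_sql (as used in the question) is preferred over a raw cursor, the same idea can be sketched like this, assuming each SQL_SCRIPT returns a single scalar and reusing the s session and df_to_list from above (sql_run_read_sql is a hypothetical helper name):

def sql_run_read_sql(x):
    # read_sql returns a DataFrame; take the first cell if any rows came back
    res_df = pd.read_sql(x, s)
    return res_df.iat[0, 0] if not res_df.empty else np.nan

df_to_list['RESULTS'] = df_to_list['SQL_SCRIPT'].apply(sql_run_read_sql)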
I have the following code:
import pyspark.sql.functions as func
from pyspark.sql.types import StructType, StructField, StringType

def get_alert(bid):
    # for simplicity I only provide the "return" part
    return "1", "2"

get_alert_udf = func.udf(lambda bid:
                         get_alert(bid),
                         StructType(
                             [
                                 StructField('prob', StringType()),
                                 StructField('level', StringType())
                             ]
                         )
                         )

df = df \
    .withColumn("val", func.when(func.col("is_inside") == 1,
                                 get_alert_udf(
                                     func.col("building_id")
                                 ))
                .otherwise(func.struct(func.lit("0"), func.lit("0"))))
When I execute this code, I get the following error:
pyspark.sql.utils.AnalysisException:
u"cannot resolve
'CASE WHEN (`is_inside` = 1)
THEN <lambda>(building_id) ELSE named_struct('col1', '0', 'col2', '0') END' due to data type mismatch:
THEN and ELSE expressions should all be same type or coercible to a common type
In my case the outputs seem to have the same type in the THEN and ELSE branches. I don't understand what the difference is between:
StructType(
    [
        StructField('prob', StringType()),
        StructField('level', StringType())
    ]
)
and
func.struct(func.lit("0"),func.lit("0"))
The function you use returns a named struct. This means that both names and types have to match:
func.when(
    func.col("is_inside") == 1,
    get_alert_udf(func.col("building_id"))
).otherwise(
    func.struct(func.lit("0").alias("prob"), func.lit("0").alias("level"))
)
or
schema = StructType([
    StructField('prob', StringType()), StructField('level', StringType())
])

get_alert_udf = func.udf(get_alert, schema)
and then
func.when(
    func.col("is_inside") == 1,
    get_alert_udf(func.col("building_id"))
).otherwise(func.struct(func.lit("0"), func.lit("0")).cast(schema))
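For completeness, a minimal self-contained sketch of the first variant, with made-up example rows (column names is_inside and building_id as in the question, and a stand-in for get_alert):

from pyspark.sql import SparkSession
import pyspark.sql.functions as func
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.getOrCreate()

# made-up rows just to exercise both branches of the when/otherwise
df = spark.createDataFrame([(1, "b1"), (0, "b2")], ["is_inside", "building_id"])

get_alert_udf = func.udf(
    lambda bid: ("1", "2"),  # stand-in for get_alert
    StructType([StructField("prob", StringType()), StructField("level", StringType())]),
)

df = df.withColumn(
    "val",
    func.when(func.col("is_inside") == 1, get_alert_udf(func.col("building_id")))
        .otherwise(func.struct(func.lit("0").alias("prob"), func.lit("0").alias("level"))),
)
df.show(truncate=False)

Both branches now produce a struct with fields prob and level, so the type-mismatch error from the question no longer appears.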