Add aggregation from a different dataframe as a column - Python

With this dataset:
start,end,rms,state,maxTemp,minTemp
2019-02-20T16:16:31.752Z,2019-02-20T17:33:34.750Z,4.588481,charge,35.0,32.0
2019-02-20T17:33:34.935Z,2019-02-20T18:34:49.737Z,5.770562,discharge,35.0,33.0
And this:
[{"EventDate":"2019-02-02T16:17:00.579Z","Value":"23"},
{"EventDate":"2019-02-02T16:18:01.579Z","Value":"23"},
{"EventDate":"2019-02-02T16:19:02.581Z","Value":"23"},
{"EventDate":"2019-02-02T16:20:03.679Z","Value":"23"},
{"EventDate":"2019-02-02T16:21:04.684Z","Value":"23"},
{"EventDate":"2019-02-02T17:40:05.693Z","Value":"23"},
{"EventDate":"2019-02-02T17:40:06.694Z","Value":"23"},
{"EventDate":"2019-02-02T17:40:07.698Z","Value":"23"},
{"EventDate":"2019-02-02T17:40:08.835Z","Value":"23"}]
schema = StructType([
    StructField('EventDate', TimestampType(), True),
    StructField('Value', FloatType(), True)
])
I want to add the max and min values of the JSON dataset as columns to the CSV dataset.
I have tried:
cyclesWithValues = csvDf \
    .withColumn("max", jsondata.filter((col("EventDate") >= csvDf.start) & (col("EventDate") <= csvDf.end)).agg({"value": "max"}).head()["max(Value)"]) \
    .withColumn("min", jsondata.filter((col("EventDate") >= csvDf.start) & (col("EventDate") <= csvDf.end)).agg({"value": "min"}).head()["min(Value)"])
But I get this error:
AnalysisException: 'Resolved attribute(s) start#38271,end#38272 missing from EventDate#38283,Value#38286 in operator !Filter ((EventDate#38283 >= start#38271) && (EventDate#38283 <= end#38272)).;;\n!Filter ((EventDate#38283 >= start#38271) && (EventDate#38283 <= end#38272))\n+- Project [EventDate#38283, cast(Value#38280 as float) AS Value#38286]\n +- Project [to_timestamp(EventDate#38279, None) AS EventDate#38283, Value#38280]\n +- Relation[EventDate#38279,Value#38280] json\n'
I have a solution based on arrays, but it seems very slow, so I was hoping something like this would speed things up a bit.
Right now I am using this solution:
from multiprocessing.pool import ThreadPool

from pyspark.sql.functions import col, to_timestamp
from pyspark.sql.types import StructType, StructField, TimestampType, FloatType, StringType

dfTemperature = spark.read.option("multiline", "true").json("path")
dfTemperatureCast = dfTemperature \
    .withColumn("EventDate", to_timestamp(dfTemperature.EventDate)) \
    .withColumn("Value", dfTemperature.Value.cast('float'))

def addValuesToDf(row):
    temperatures = dfTemperatureCast.filter((col("EventDate") >= row["start"]) & (col("EventDate") <= row["end"]))
    maxTemp = temperatures.agg({"value": "max"}).head()["max(value)"]
    minTemp = temperatures.agg({"value": "min"}).head()["min(value)"]
    return (row.start, row.end, row.rms, row.state, maxTemp, minTemp)

# rmsDf is the CSV DataFrame shown above; its rows are collected so the thread pool can iterate over them
pool = ThreadPool(10)
withValues = pool.map(addValuesToDf, rmsDf.collect())

# the schema lists all six fields returned by addValuesToDf
schema = StructType([
    StructField('start', TimestampType(), True),
    StructField('end', TimestampType(), True),
    StructField('rms', FloatType(), True),
    StructField('state', StringType(), True),
    StructField('maxTemp', FloatType(), True),
    StructField('minTemp', FloatType(), True)
])
cyclesDF = spark.createDataFrame(withValues, schema)
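A join-based approach avoids the per-row round trips to the driver that make the thread-pool version slow. The following is a minimal sketch, assuming csvDf and dfTemperatureCast as defined above: a range join matches every temperature reading to the cycle whose interval contains it, and a groupBy then takes the min and max per cycle.

from pyspark.sql import functions as F

# Range join: keep each temperature reading together with the cycle
# whose [start, end] interval contains its EventDate.
joined = csvDf.join(
    dfTemperatureCast,
    (dfTemperatureCast.EventDate >= csvDf.start) & (dfTemperatureCast.EventDate <= csvDf.end),
    "left"
)

# One row per cycle, with the aggregated temperatures as new columns.
cyclesWithValues = joined.groupBy("start", "end", "rms", "state") \
    .agg(F.max("Value").alias("maxTemp"), F.min("Value").alias("minTemp"))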

Related

pyspark: heavy initialization with mapPartition

I want to partition a dataframe and iterate over each partition, with some initialization in each iteration:
data = [("2022-12-22",'d1',2,{'u1':{'sn':['s1','s2'],'fs':200}}),
("2022-12-22",'d2',1,{'u2':{'sn':['s2'],'fs':150},
'u3':{'sn':['s1'],'fs':50}}),
("2022-12-23",'d1',20,{'u1':{'sn':['s1','s2'],'fs':2000}}),
("2022-12-23",'d2',1,{'u2':{'sn':['s2'],'fs':1500},
'u3':{'sn':['s1'],'fs':500}})
]
usernode_schema = StructType([
StructField("sn",ArrayType(StringType())),
StructField("fs",LongType()),
])
userNodeType_schema = MapType(StringType(),usernode_schema)
schema = StructType([
StructField("date",StringType()),
StructField("id",StringType()),
StructField("rank",IntegerType()),
StructField("umap",userNodeType_schema)
])
df = spark.createDataFrame(data=data, schema = schema)
df.show(truncate=False)
def f_fm_row(x, sn_d, i):
    sn_d['s1'] = sn_d.get('s1', 0) + i
    return {"date": x["date"], "id": x["id"], "rank": x["rank"], "s1": sn_d['s1']}

fm_row_schema = StructType([
    StructField("date", StringType()),
    StructField("id", StringType()),
    StructField("rank", IntegerType()),
    StructField("s1", IntegerType())
])

def f_map_par(par_df):
    for i in range(0, 2):
        sn_dict = {}
        par_df = map(lambda row: f_fm_row(row, sn_dict, i), par_df)
    return par_df

# map partitions
(df
    .repartition(col("date"))
    .sortWithinPartitions(col("rank"))
    .rdd
    .mapPartitions(f_map_par)
).toDF(fm_row_schema).show()
But sn_dict = {} is not re-created in each iteration; it happens only once. How can I make sure that this initialization happens in each iteration?
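This behaviour follows from Python's lazy map combined with late-binding closures: the lambdas are only evaluated when Spark consumes the returned iterator, and by then sn_dict and i hold the values from the last pass of the loop. A minimal sketch of one possible fix, assuming f_fm_row as defined above, is to materialize each pass so the current sn_dict and i are actually used:

def f_map_par(par_df):
    rows = list(par_df)                # pull the partition into memory once
    for i in range(0, 2):
        sn_dict = {}                   # fresh dict for every pass
        rows = [f_fm_row(row, sn_dict, i) for row in rows]   # evaluated now, not lazily
    return rows

Alternatively, the loop variables can be bound as default arguments of the lambda so that each pass captures its own values.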

Extract Schema from nested Json-String column in Pyspark

Assuming I have the following table:
body
{"Day":1,"vals":[{"id":"1", "val":"3"}, {"id":"2", "val":"4"}]}
My goal is to write down the schema in PySpark for this nested JSON column. I've tried the following two things:
schema = StructType([
    StructField("Day", StringType()),
    StructField(
        "vals",
        StructType([
            StructType([
                StructField("id", StringType(), True),
                StructField("val", DoubleType(), True)
            ])
            StructType([
                StructField("id", StringType(), True),
                StructField("val", DoubleType(), True)
            ])
        ])
    )
])
Here I get the error:
'StructType' object has no attribute 'name'
Another approach was to declare the nested Arrays as ArrayType:
schema = StructType([
    StructField("Day", StringType()),
    StructField(
        "vals",
        ArrayType(
            ArrayType(
                StructField("id", StringType(), True),
                StructField("val", DoubleType(), True)
            , True)
            ArrayType(
                StructField("id", StringType(), True),
                StructField("val", DoubleType(), True)
            , True)
        , True)
    )
])
Here I get the following error:
takes from 2 to 3 positional arguments but 5 were given
which probably comes from ArrayType only taking a single SQL type as an argument.
Can anybody tell me what their approach would be to create the schema? I'm a super newbie to the whole topic.
This is the structure you are looking for:
from pyspark.sql.types import StructType, StructField, ArrayType, IntegerType, StringType

Data = [
    (1, [("1", "3"), ("2", "4")])
]
schema = StructType([
    StructField('Day', IntegerType(), True),
    StructField('vals', ArrayType(StructType([
        StructField('id', StringType(), True),
        StructField('val', StringType(), True)
    ]), True))
])
df = spark.createDataFrame(data=Data, schema=schema)
df.printSchema()
df.show(truncate=False)
This will give you the following output:
root
 |-- Day: integer (nullable = true)
 |-- vals: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- val: string (nullable = true)

+---+----------------+
|Day|vals            |
+---+----------------+
|1  |[{1, 3}, {2, 4}]|
+---+----------------+
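If the JSON actually sits in a string column (body in the question), a minimal sketch of applying this schema could use from_json; the DataFrame name raw_df is an assumption for illustration:

from pyspark.sql import functions as F

# Parse the JSON string column with the schema above, then flatten the array of structs.
parsed = raw_df.withColumn("parsed", F.from_json(F.col("body"), schema))
parsed.select("parsed.Day", F.explode("parsed.vals").alias("v")) \
      .select("Day", "v.id", "v.val") \
      .show()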

pyspark.sql.utils.AnalysisException: THEN and ELSE expressions should all be same type or coercible to a common type

I have the following code:
import pyspark.sql.functions as func
from pyspark.sql.types import StructType, StructField, StringType

def get_alert(bid):
    # for simplicity I only provide the "return" part
    return "1", "2"

get_alert_udf = func.udf(
    lambda bid: get_alert(bid),
    StructType([
        StructField('prob', StringType()),
        StructField('level', StringType())
    ])
)
df = df \
    .withColumn("val", func.when(func.col("is_inside") == 1,
                                 get_alert_udf(func.col("building_id")))
                           .otherwise(func.struct(func.lit("0"), func.lit("0"))))
When I execute this code, I get the following error:
pyspark.sql.utils.AnalysisException:
u"cannot resolve
'CASE WHEN (`is_inside` = 1)
THEN <lambda>(building_id) ELSE named_struct('col1', '0', 'col2', '0') END' due to data type mismatch:
THEN and ELSE expressions should all be same type or coercible to a common type
In my case the THEN and ELSE outputs seem to have the same type. I don't understand what the difference is between:
StructType([
    StructField('prob', StringType()),
    StructField('level', StringType())
])
and
func.struct(func.lit("0"),func.lit("0"))
The function you use returns a named struct. This means that both names and types have to match:
func.when(
    func.col("is_inside") == 1,
    get_alert_udf(func.col("building_id"))
).otherwise(
    func.struct(func.lit("0").alias("prob"), func.lit("0").alias("level"))
)
or
schema = StructType([
    StructField('prob', StringType()),
    StructField('level', StringType())
])
get_alert_udf = func.udf(get_alert, schema)
and then
func.when(
    func.col("is_inside") == 1,
    get_alert_udf(func.col("building_id"))
).otherwise(func.struct(func.lit("0"), func.lit("0")).cast(schema))

Windowing and aggregating pyspark DataFrame [duplicate]

This question already has an answer here:
Spark Structured Streaming using sockets, set SCHEMA, Display DATAFRAME in console
(1 answer)
Closed 5 years ago.
I'm trying to process incoming events from a socket, then windowing and aggregating the event data. I've hit a snag with the windowing. It appears that even though I specify a schema for the DataFrame, it does not translate into columns.
import sys
from pyspark.sql.types import StructType, StringType, TimestampType, FloatType, IntegerType, StructField
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

if __name__ == "__main__":
    # our data currently looks like this (tab separated).
    # -SYMBOL DATE PRICE TICKVOL BID ASK
    # NQU7 2017-05-28T15:00:00 5800.50 12 5800.50 5800.50
    # NQU7 2017-05-28T15:00:00 5800.50 1 5800.50 5800.50
    # NQU7 2017-05-28T15:00:00 5800.50 5 5800.50 5800.50
    # NQU7 2017-05-28T15:00:00 5800.50 1 5800.50 5800.50

    if len(sys.argv) != 3:
        # print("Usage: network_wordcount.py <hostname> <port>", file=sys.stderr)
        exit(-1)

    spark = SparkSession \
        .builder \
        .appName("StructuredTickStream") \
        .getOrCreate()
    sc = spark.sparkContext
    sc.setLogLevel('WARN')

    # Read all the csv files written atomically in a directory
    tickSchema = StructType([
        StructField("symbol", StringType(), True),
        StructField("dt", TimestampType(), True),
        StructField("price", FloatType(), True),
        StructField("tickvol", IntegerType(), True),
        StructField("bid", FloatType(), True),
        StructField("ask", FloatType(), True)
    ])

    events_df = spark \
        .readStream \
        .option("sep", "\t") \
        .option("host", sys.argv[1]) \
        .option("port", sys.argv[2]) \
        .format("socket") \
        .schema(tickSchema) \
        .load()

    events_df.printSchema()
    print("columns = ", events_df.columns)

    ohlc_df = events_df \
        .groupby(F.window("dt", "5 minutes", "1 minutes")) \
        .agg(
            F.first('price').alias('open'),
            F.max('price').alias('high'),
            F.min('price').alias('low'),
            F.last('price').alias('close')
        ) \
        .collect()

    query = ohlc_df \
        .writeStream \
        .outputMode("complete") \
        .format("console") \
        .start()

    query.awaitTermination()
The output of the print("columns = ", events_df.columns) is ['value'], and the process fails with the following trace:
pyspark.sql.utils.AnalysisException: "cannot resolve '`dt`' given input columns: [value];;\n'Aggregate [timewindow('dt, 300000000, 60000000, 0)], [timewindow('dt, 300000000, 60000000, 0) AS window#3, first('price, false) AS open#7, max('price) AS high#9, min('price) AS low#11, last('price, false) AS close#13]\n+- StreamingRelation DataSource(org.apache.spark.sql.SparkSession#3a32b1a2,socket,List(),Some(StructType(StructField(symbol,StringType,true), StructField(dt,TimestampType,true), StructField(price,FloatType,true), StructField(tickvol,IntegerType,true), StructField(bid,FloatType,true), StructField(ask,FloatType,true))),List(),None,Map(sep -> \t, host -> localhost, port -> 9999),None), textSocket, [value#0]\n"
Any idea what I'm doing wrong?
Your data frame has only one column, value, and you are trying to access the column dt from events_df. That is the main reason for the problem.
The statement below clearly shows that it has the single column value:
print("columns = ", events_df.columns)
You need to inspect this:
events_df = spark \
    .readStream \
    .option("sep", "\t") \
    .option("host", sys.argv[1]) \
    .option("port", sys.argv[2]) \
    .format("socket") \
    .schema(tickSchema) \
    .load()
and understand why it creates a DataFrame with only one column.
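The underlying cause is that the socket source only ever produces a single string column named value, as the printed column list shows; the schema passed to readStream is not applied to it. A minimal sketch of parsing value into the intended columns, assuming the tab-separated layout from the question, so the windowed aggregation can run on ticks_df instead of events_df:

from pyspark.sql import functions as F

# Split each raw line on tabs and cast the pieces to the intended types.
parts = F.split(events_df["value"], "\t")
ticks_df = events_df.select(
    parts.getItem(0).alias("symbol"),
    parts.getItem(1).cast("timestamp").alias("dt"),
    parts.getItem(2).cast("float").alias("price"),
    parts.getItem(3).cast("int").alias("tickvol"),
    parts.getItem(4).cast("float").alias("bid"),
    parts.getItem(5).cast("float").alias("ask"),
)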

Graph True/False values from multiple sources over time

I have multiple data sources that report a true/false value at specific timestamps, like this:
{1338: [
(1377259958, False),
(1378703557, True)],
1343: [
(1377259911, True),
(1377812511, False),
(1377814321, True)],
1354: [
(1377260040, False),
(1377296033, True),
(1377382446, False),
(1377566041, True),
(1377582236, False),
(1377638031, True),
(1377641637, False),
(1377652434, True),
(1377814443, False),
(1377987234, True),
(1378073645, False),
(1378160039, True),
(1378246440, False),
(1378257238, True),
(1378341839, False),
(1378421045, True),
(1378514636, False),
(1378613637, True)],
1431: [
(1377260039, False),
(1377729842, True),
(1377731646, False),
(1378703641, True)]
}
Now I would like to plot this data in one graph, so that every data source is on the y-axis and the time is on the x-axis. Each data source should colour the time periods between the data changing from True to False.
At the moment I have this:
In reality there are more timestamps and a lot more data sources, but it's always the same procedure.
I'm using matplotlib with the following script:
from operator import itemgetter
from time import time

import matplotlib.pyplot as plt
import numpy as np

def timelines(y, xstart, xstop, ok):
    color = 'r' if ok else 'g'
    print 'Graph', y, xstart, xstop, color
    plt.hlines(y, xstart, xstop, color, lw=3)
    plt.vlines(xstart, y+0.45, y-0.45, color, lw=0.5)
    plt.vlines(xstop, y+0.45, y-0.45, color, lw=0.5)

# timeline is the dict of {probe_id: [(timestamp, state), ...]} shown above
maxtime = 0
mintime = time()
for probe, tl in timeline.iteritems():
    newmax = max(tl, key=itemgetter(0))[0]
    newmin = min(tl, key=itemgetter(0))[0]
    if newmax > maxtime:
        print "New maximum time: %s (Old: %s)" % (newmax, maxtime)
        maxtime = newmax
    if newmin < mintime:
        print "New minimum time: %s (Old: %s)" % (newmin, mintime)
        mintime = newmin

maxprobe = 0
probelist = []
for probe, tl in timeline.iteritems():
    print probe, tl
    maxprobe += 1
    probelist.append(probe)
    first = True
    startpoint = mintime
    for ts in tl:
        if ts[0] <= mintime:
            first = False
            continue
        print maxprobe, startpoint, ts[0], ts[1]
        if first:
            first = False
            timelines(maxprobe, int(startpoint), int(ts[0]), not ts[1])
        else:
            timelines(maxprobe, int(startpoint), int(ts[0]), ts[1])
        startpoint = ts[0]
    if startpoint < maxtime:
        print maxprobe, 'End', startpoint, maxtime, ts[1]
        timelines(maxprobe, int(startpoint), maxtime, not ts[1])

label = ['']
label += probelist
plt.ylim((0, maxprobe+1))
plt.yticks(np.arange(0, maxprobe+1), label)
fig = plt.gcf()
fig.set_size_inches(18.5, 0.15*maxprobe + 1)
plt.savefig('timeline.png')
What I didn't figure out is how to display a formatted date instead of the raw timestamp on the x-axis. Also, is there another way to scale the image than set_size_inches?
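A minimal sketch of the date-formatting part, assuming the epoch-second timestamps shown above: matplotlib.dates can convert them to date numbers and format the ticks, and passing figsize to plt.subplots is an alternative to set_size_inches. The sketch only draws a single bar to keep the example short.

import datetime

import matplotlib.dates as mdates
import matplotlib.pyplot as plt

# Convert epoch seconds to matplotlib date numbers before plotting,
# then let a DateFormatter render readable labels on the x-axis.
xstart = mdates.date2num(datetime.datetime.fromtimestamp(1377259958))
xstop = mdates.date2num(datetime.datetime.fromtimestamp(1378703557))

fig, ax = plt.subplots(figsize=(18.5, 3))   # figsize instead of set_size_inches
ax.hlines(1, xstart, xstop, 'g', lw=3)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d %H:%M'))
fig.autofmt_xdate()                         # rotate the labels so they do not overlap
plt.savefig('timeline.png')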
