Windowing and aggregating pyspark DataFrame [duplicate] - python

This question already has an answer here:
Spark Structured Streaming using sockets, set SCHEMA, Display DATAFRAME in console
(1 answer)
Closed 5 years ago.
I'm trying to process incoming events from a socket, then windowing and aggregating the event data. I've hit a snag with the windowing. It appears that even though I specify a schema for the DataFrame, it does not translate into columns.
import sys
from pyspark.sql.types import StructType, StringType, TimestampType, FloatType, IntegerType, StructField
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
# Script entry point: read tick events from a socket and aggregate their
# prices into windowed open/high/low/close values.
if __name__ == "__main__":
# our data currently looks like this (tab separated).
# -SYMBOL DATE PRICE TICKVOL BID ASK
# NQU7 2017-05-28T15:00:00 5800.50 12 5800.50 5800.50
# NQU7 2017-05-28T15:00:00 5800.50 1 5800.50 5800.50
# NQU7 2017-05-28T15:00:00 5800.50 5 5800.50 5800.50
# NQU7 2017-05-28T15:00:00 5800.50 1 5800.50 5800.50
# Exactly two command-line arguments are required; they are used below as
# the socket host (sys.argv[1]) and port (sys.argv[2]).
if len(sys.argv) != 3:
# print("Usage: network_wordcount.py <hostname> <port>", file=sys.stderr)
exit(-1)
# Create (or reuse) the Spark session and lower the log level to WARN.
spark = SparkSession \
.builder \
.appName("StructuredTickStream") \
.getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('WARN')
# Schema describing the six fields of each tick record shown in the sample above.
tickSchema = StructType([
StructField("symbol", StringType(), True),
StructField("dt", TimestampType(), True),
StructField("price", FloatType(), True),
StructField("tickvol", IntegerType(), True),
StructField("bid", FloatType(), True),
StructField("ask", FloatType(), True)
])
# Open a streaming read on the "socket" source for the given host and port.
# NOTE(review): the output reported for the print below is ['value'], i.e. the
# schema passed here is not reflected in the columns of the loaded frame.
events_df = spark \
.readStream \
.option("sep", "\t") \
.option("host", sys.argv[1]) \
.option("port", sys.argv[2]) \
.format("socket") \
.schema(tickSchema) \
.load()
# Show the schema and column names the stream actually exposes.
events_df.printSchema()
print("columns = ", events_df.columns)
# Group by 5-minute windows over dt, sliding every 1 minute, and derive
# open/high/low/close from the price column.
# NOTE(review): collect() returns a list of rows rather than a DataFrame, so
# the .writeStream call further down looks like it cannot work on this
# result - confirm and drop collect() if so.
ohlc_df = events_df \
.groupby(F.window("dt", "5 minutes", "1 minutes")) \
.agg(
F.first('price').alias('open'),
F.max('price').alias('high'),
F.min('price').alias('low'),
F.last('price').alias('close')
) \
.collect()
# Start a console sink in "complete" output mode and block until the query
# terminates.
query = ohlc_df \
.writeStream \
.outputMode("complete") \
.format("console") \
.start()
query.awaitTermination()
The output of the print("columns = ", events_df.columns) is ['value'], and the process fails with the following trace:
pyspark.sql.utils.AnalysisException: "cannot resolve '`dt`' given input columns: [value];;\n'Aggregate [timewindow('dt, 300000000, 60000000, 0)], [timewindow('dt, 300000000, 60000000, 0) AS window#3, first('price, false) AS open#7, max('price) AS high#9, min('price) AS low#11, last('price, false) AS close#13]\n+- StreamingRelation DataSource(org.apache.spark.sql.SparkSession#3a32b1a2,socket,List(),Some(StructType(StructField(symbol,StringType,true), StructField(dt,TimestampType,true), StructField(price,FloatType,true), StructField(tickvol,IntegerType,true), StructField(bid,FloatType,true), StructField(ask,FloatType,true))),List(),None,Map(sep -> \t, host -> localhost, port -> 9999),None), textSocket, [value#0]\n"
Any idea what I'm doing wrong?

Your DataFrame has only one column, value, yet the code tries to access the column dt from events_df. That is the main cause of the problem.
The statement below clearly shows that it has a single column, value:
print("columns = ", events_df.columns)
You need to inspect this
events_df = spark \
.readStream \
.option("sep", "\t") \
.option("host", sys.argv[1]) \
.option("port", sys.argv[2]) \
.format("socket") \
.schema(tickSchema) \
.load()
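and find out why it is creating a DataFrame with only one column. The socket source always produces a single string column named value and does not apply a user-supplied schema, so each incoming line has to be split into the individual fields (and cast to the desired types) after loading.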

Related

Why is PySpark not reading data from kafka using streaming, but works fine with reading it normally?

I'm trying to read in data from kafka using structured streaming, but the program doesn't seem to be getting any of it.
This code doesn't print any records to the console:
KAFKA_TOPIC = "stations-topic"
KAFKA_SERVER = "kafka:9092"
sc = SparkContext("local[4]")
ssc = StreamingContext(sc, 1)
spark = SparkSession.builder.appName("stations").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
df = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", KAFKA_SERVER) \
.option("subscribe", KAFKA_TOPIC) \
.load()
df1 = df.selectExpr("CAST(value AS STRING)")
df2 = df1.select(from_csv(col("value"), df_schema_string).alias("stations"))
df3 = df2.select("stations.*")
out = df3 \
.writeStream \
.format("console") \
.outputMode("append") \
.start()
out.awaitTermination()
But if I modify it to not use streaming like this:
KAFKA_TOPIC = "stations-topic"
KAFKA_SERVER = "kafka:9092"
sc = SparkContext("local[4]")
ssc = StreamingContext(sc, 1)
spark = SparkSession.builder.appName("stations").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
df = spark \
.read \
.format("kafka") \
.option("kafka.bootstrap.servers", KAFKA_SERVER) \
.option("subscribe", KAFKA_TOPIC) \
.load()
df1 = df.selectExpr("CAST(value AS STRING)")
df2 = df1.select(from_csv(col("value"), df_schema_string).alias("stations"))
df3 = df2.select("stations.*")
df3.show(10)
it prints the top 10 rows of data without issues. Any clue on what could be causing this?
Found the problem, I had to add .option("startingOffsets", "earliest") to the readStream

Databricks - separating columns + using schema

I wrote some code to create a chart, and I am trying to show this chart in 2 ways: with and without the schema function. When I run my code without a schema, my problem is that all the values are shown in one column, and I don't know how to separate them:
# File location and type
file_location = "/databricks-datasets/cs110x/ml-20m/data-001/movies.csv.gz"
file_type = "csv"
# CSV options
infer_schema = "true"
first_row_is_header = "false"
delimiter = "::"
df_movies = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.option("header", first_row_is_header) \
.option("sep", delimiter) \
.load(file_location)
print(df_movies.count())
display(df_movies)
Result:
And when I add schema, columns get separated but I get null values:
from pyspark.sql.types import *
movies_schema = StructType([
StructField('movieId', IntegerType()),
StructField('title', StringType()),
StructField('genres', StringType())
])
# File location and type
file_location = "/databricks-datasets/cs110x/ml-20m/data-001/movies.csv.gz"
file_type = "csv"
# CSV options
infer_schema = "true"
first_row_is_header = "false"
delimiter = "::"
df_movies = spark.read.format(file_type) \
.schema(movies_schema) \
.option("header", first_row_is_header) \
.option("sep", delimiter) \
.load(file_location)
print(df_movies.count())
display(df_movies)
Result:
I appreciate it if you could tell me how I can separate columns in the first section and how I can solve the problem with the null values in the second part.

Check if Input dataset contains a key or not in PySpark

I have the code shown below. I need to check whether the column y.lc.eoouh.ci is present in the input source and populate the column only if it is present; otherwise it should be NULL. (The key lc is also optional.)
The code below doesn't seem to work the way it is supposed to as even though y.lc.eoouch.ci is present in the input, it evaluates to NULL.
The has_column implementation is from here.
df = df_s_a \
.withColumn("ceci", \
udf(
lambda y : y.lc[-1].eoouh.ci \
if has_column(y, 'lc.eoouh.ci') \
else None, \
StringType()
)(col('eh') \
) \
) \
.select(
col('ceci')
)
df.show()
Sample input:
{
eh: {
lc: [
eoouch: {
ci: "1234ABC"
}
]
}
}
The df[something.path.somewhere] doesn't work. I'll have to investigate that option a bit.
I've managed to make it work like this:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
def has_column(df):
    """Return True when ``df["lc"][0]["eoouch"]["ci"]`` can be looked up.

    Args:
        df: The nested value to probe, e.g. the dict passed to the UDF for
            the ``eh`` column. May be ``None`` or lack any part of the path.

    Returns:
        bool: ``True`` if every step of the lookup succeeds, ``False`` if
        any key is missing, the ``lc`` list is empty, or an intermediate
        value is ``None`` / not subscriptable.
    """
    try:
        df["lc"][0]["eoouch"]["ci"]
    except (KeyError, IndexError, TypeError):
        # KeyError: a key is absent; IndexError: "lc" is an empty list;
        # TypeError: the value (or an intermediate one) is None or a scalar.
        return False
    return True
if __name__ == "__main__":
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
data = [
{"eh": {"lc": [{"eoouch": {"ci": "test"}}]}},
{"eh": {"lc": [{"eoouch": {"as": "test"}}]}},
]
df = spark.createDataFrame(data)
add_column_udf = F.udf(
lambda y: y if has_column(y) else None,
StringType(),
)
df = df.withColumn("ceci", add_column_udf(F.col("eh")))
Result:
+----------------------------------+-------------------------+
|eh |ceci |
+----------------------------------+-------------------------+
|{lc -> [{eoouch -> {ci -> test}}]}|{lc=[{eoouch={ci=test}}]}|
|{lc -> [{eoouch -> {as -> test}}]}|null |
+----------------------------------+-------------------------+
It's not perfect since it's not a general solution for column name but it could be easily generalized since it works on a dict object.

Chain methods only if needed

I have the following code:
spark = SparkSession.builder \
.appName("sss")\
.master("spark://" + os.environ["MASTER_HOST"] + ":7077")\
.config("spark.submit.deployMode", os.environ["DEPLOY_MODE"])\
.getOrCreate()
How can I configure the code, so that it only chains specific parts of the code if conditions are true: Here's an imaginary code:
spark = SparkSession.builder \
.appName("sss")\
.master("spark://" + os.environ["MASTER_HOST"] + ":7077")\ if os.environ["MASTER_HOST"]
.config("spark.submit.deployMode", os.environ["DEPLOY_MODE"])\ if os.environ["DEPLOY_MODE"]
.getOrCreate()
I think this will work, but this code seems too long to me:
spark = SparkSession.builder \
.appName("sss")\
if os.environ["MASTER_HOST"]:
spark = spark.master("spark://" + os.environ["MASTER_HOST"] + ":7077")
....
What's the smallest code snippet you can think of to produce the desired results?
This is the best you can do.
# Start from the base builder and attach each optional setting only when its
# environment variable is set to a non-empty value. os.environ.get() is used
# for the test so that an unset variable skips the step instead of raising
# KeyError.
builder = SparkSession.builder.appName("sss")
if os.environ.get("MASTER_HOST"):
    builder = builder.master("spark://" + os.environ["MASTER_HOST"] + ":7077")
if os.environ.get("DEPLOY_MODE"):
    builder = builder.config("spark.submit.deployMode", os.environ["DEPLOY_MODE"])
spark = builder.getOrCreate()
Or
# Read the optional settings once; .get() yields None for an unset variable
# instead of raising KeyError, so the corresponding step is simply skipped.
master_host = os.environ.get("MASTER_HOST")
deploy_mode = os.environ.get("DEPLOY_MODE")
builder = SparkSession.builder.appName("sss")
if master_host:
    builder = builder.master("spark://" + master_host + ":7077")
if deploy_mode:
    builder = builder.config("spark.submit.deployMode", deploy_mode)
spark = builder.getOrCreate()

Unable to finish the spark job on spark standalone cluster

I'm very new to Spark; I've been using it for just a week. This is my PySpark code, running on a standalone Spark cluster with a single master and two slaves. I'm trying to run a Spark job that reads a 01. million record dataset, performs some manipulation on the data, and then dumps the DataFrame into an Oracle table. I'm having trouble completing the job. It seems that this program has created 404 partitions to complete the tasks. On the console or terminal I can see that 403/404 are completed, but the last and final task, on partition 404, is taking forever to complete. I'm unable to complete the job. Can anyone tell me what the issue with my code is? Can anyone help with optimizing performance on Spark, or point me to a guide or something? Any tutorial or guide would help. Thanks in advance.
# creating a spark session
spark = SparkSession \
.builder \
.appName("pyspark_testing_29012020") \
.config("spark.some.config.option", "some-value") \
.getOrCreate()
# target table schema and column order
df_target = spark.read.csv("mycsv path", header = True)
df_local_schema = df_target.schema
df_column_order = df_target.columns
# dataframe with input file/dataset values and schema
df_source = spark.read\
.format("csv")\
.option("header", "false")\
.option("inferschema", "true")\
.option("delimiter", ",")\
.schema(df_local_schema)\
.load("csv path")
# dataframe with the target file/dataset values
df_target = spark.read\
.format("jdbc") \
.option("url", "jdbc:oracle:thin:system/oracle123#127.0.0.1:0101:orcl") \
.option("dbtable", "mydata") \
.option("user", "system") \
.option("password", "oracle123") \
.option("driver", "oracle.jdbc.driver.OracleDriver")\
.load()
# splitting the target table in to upper and lower sections
df_target_upper = df_target.where(df_target['Key'] < 5) # set A
df_source_upper = df_source.where(df_source['Key'] < 5) # set B
df_source_lower = df_source.where(df_source['Key'] > 4) # set D
df_target_lower = df_target.where(df_target['key'] > 4) # set C
''' now programming for the upper segment of the data '''
# set operation A-B
A_minus_B = df_target_upper.join(df_source_upper,
on=['key1', 'key2', 'key3', 'key4'],
how='left_anti')
A_minus_B = A_minus_B.select(sorted(df_column_order))
# set operation B-A
B_minus_A = df_source_upper.join(df_target_upper,
on=['key1', 'key2','key3','key4'],how = 'left_anti')
B_minus_A = B_minus_A.select(sorted(df_column_order))
# union of A-B and B-A
AmB_union_BmA = A_minus_B.union(B_minus_A)
AmB_union_BmA = AmB_union_BmA.select(sorted(df_column_order))
# A-B left anti B-A to get the uncommon record in both the dataframes
new_df = A_minus_B.join(B_minus_A, on=['key'], how = 'left_anti')
new_df = new_df.select(sorted(df_column_order))
AmB_union_BmA = AmB_union_BmA.select(sorted(df_column_order))
AnB = df_target_upper.join(df_source_upper,
on=['key1', 'key2', 'key3', 'key4'],
how='inner')
df_AnB_without_dupes = dropDupeDfCols(AnB)
new_AnB = df_AnB_without_dupes.select(sorted(df_column_order))
final_df = AmB_union_BmA.union(new_AnB)
final_df.show()
result_df = B_minus_A.union(new_df)
df_result_upper_seg = result_df.union(new_AnB)
''' now programming for the lower segment of the data '''
# set operation C-D
C_minus_D = df_target_lower.join(df_source_lower, on=['key'], how='left_anti')
C_minus_D = C_minus_D.select(sorted(df_column_order))
# set operation D-C
D_minus_C = df_source_lower.join(df_target_lower, on=['key'], how = 'left_anti')
D_minus_C = D_minus_C.select(sorted(df_column_order))
# union of C-D union D-C
CmD_union_DmC = C_minus_D.union(D_minus_C)
CmD_union_DmC = CmD_union_DmC.select(sorted(df_column_order))
# C-D left anti D-C to get the uncommon record in both the dataframes
lower_new_df = C_minus_D.join(D_minus_C, on=['key'], how = 'left_anti')
lower_new_df = lower_new_df.select(sorted(df_column_order))
CmD_union_DmC = CmD_union_DmC.select(sorted(df_column_order))
CnD = df_target_lower.join(df_source_lower,
on=['key'], how='inner')
new_CnD = dropDupeDfCols(CnD)
new_CnD = new_CnD.select(sorted(df_column_order))
lower_final_df = CmD_union_DmC.union(new_CnD)
result_df_lower = D_minus_C.union(lower_new_df)
df_result_lower_seg = result_df_lower.union(new_CnD)
df_final_result .write \
.format("jdbc") \
.mode("overwrite")\
.option("url", "jdbc:oracle:thin:system/oracle123#127.0.0.1:1010:orcl") \
.option("dbtable", "mydata") \
.option("user", "system") \
.option("password", "oracle123") \
.option("driver", "oracle.jdbc.driver.OracleDriver") \
.save()
Take a look at Spark UI and monitoring guide
Try to split your job into steps. Then find the step that is failing.

Categories