Unable to finish the spark job on spark standalone cluster - python

I'm very new to Spark; I've been using it for just a week. This is my PySpark code, running on a standalone Spark cluster with a single master and two slaves. I'm trying to run a Spark job that reads a 0.1 million record dataset, performs some manipulation on the data, and then dumps the DataFrame into an Oracle table. I'm having trouble completing the job. It seems that this program has created 404 partitions to complete the tasks. On the console I can see that 403/404 tasks are completed, but the last task, on partition 404, takes forever, so the job never finishes. Can anyone tell me what the issue with my code is? Can anyone help with optimizing performance on Spark, or point me to a guide? Any tutorial or guide would help. Thanks in advance.
# Build the Spark session for this job (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("pyspark_testing_29012020")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
# Read the header-bearing CSV once; it supplies the schema and the column
# order that the rest of the job reuses.
df_target = spark.read.csv("mycsv path", header=True)
df_local_schema, df_column_order = df_target.schema, df_target.columns

# Load the input dataset (no header row) with the schema captured above.
df_source = (
    spark.read
    .format("csv")
    .option("header", "false")
    .option("inferschema", "true")
    .option("delimiter", ",")
    .schema(df_local_schema)
    .load("csv path")
)
# Load the current contents of the Oracle table "mydata" over JDBC.
# The Oracle thin-driver URL separates the credentials from the host with "@"
# (jdbc:oracle:thin:user/password@host:port:SID); "#" is not a valid separator.
# NOTE(review): the port here (0101) differs from the one in the JDBC write at
# the end of the script (1010) -- confirm which one the listener really uses.
df_target = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:oracle:thin:system/oracle123@127.0.0.1:0101:orcl")
    .option("dbtable", "mydata")
    .option("user", "system")
    .option("password", "oracle123")
    .option("driver", "oracle.jdbc.driver.OracleDriver")
    .load()
)
# Split both the target table and the source dataset into an upper section
# (key < 5) and a lower section (key > 4).
# The column is spelled "key" throughout, matching the join keys used later,
# so the filters keep working if spark.sql.caseSensitive is enabled.
# NOTE(review): the two predicates only partition the data cleanly when key is
# an integer; a value such as 4.5 would land in both sections -- confirm.
df_target_upper = df_target.where(df_target['key'] < 5)  # set A
df_source_upper = df_source.where(df_source['key'] < 5)  # set B
df_source_lower = df_source.where(df_source['key'] > 4)  # set D
df_target_lower = df_target.where(df_target['key'] > 4)  # set C
# --- upper segment of the data (sets A and B) ---
# union() matches columns by position, so every intermediate frame is projected
# onto the same sorted column order before it is combined with another one.
sorted_columns = sorted(df_column_order)
upper_join_keys = ['key1', 'key2', 'key3', 'key4']

# set operation A-B: target rows with no match in the source
A_minus_B = df_target_upper.join(df_source_upper,
                                 on=upper_join_keys,
                                 how='left_anti').select(sorted_columns)

# set operation B-A: source rows with no match in the target
B_minus_A = df_source_upper.join(df_target_upper,
                                 on=upper_join_keys,
                                 how='left_anti').select(sorted_columns)

# union of A-B and B-A (projected once; repeating the same select is a no-op)
AmB_union_BmA = A_minus_B.union(B_minus_A).select(sorted_columns)

# A-B left anti B-A to get the uncommon records in both dataframes
# NOTE(review): this join uses 'key' while the others use key1..key4 -- confirm.
new_df = A_minus_B.join(B_minus_A, on=['key'], how='left_anti')
new_df = new_df.select(sorted_columns)

# intersection of A and B, with the duplicated columns removed by the helper
AnB = df_target_upper.join(df_source_upper,
                           on=upper_join_keys,
                           how='inner')
df_AnB_without_dupes = dropDupeDfCols(AnB)
new_AnB = df_AnB_without_dupes.select(sorted_columns)

final_df = AmB_union_BmA.union(new_AnB)
# show() is an action: it runs a Spark job over the whole plan built so far.
final_df.show()

result_df = B_minus_A.union(new_df)
df_result_upper_seg = result_df.union(new_AnB)
# --- lower segment of the data (sets C and D) ---
# union() matches columns by position, so every intermediate frame is projected
# onto the same sorted column order before it is combined with another one.
lower_sorted_columns = sorted(df_column_order)
lower_join_keys = ['key']

# set operation C-D: target rows with no match in the source
C_minus_D = df_target_lower.join(df_source_lower,
                                 on=lower_join_keys,
                                 how='left_anti').select(lower_sorted_columns)

# set operation D-C: source rows with no match in the target
D_minus_C = df_source_lower.join(df_target_lower,
                                 on=lower_join_keys,
                                 how='left_anti').select(lower_sorted_columns)

# union of C-D and D-C (projected once; repeating the same select is a no-op)
CmD_union_DmC = C_minus_D.union(D_minus_C).select(lower_sorted_columns)

# C-D left anti D-C to get the uncommon records in both dataframes
lower_new_df = C_minus_D.join(D_minus_C, on=lower_join_keys, how='left_anti')
lower_new_df = lower_new_df.select(lower_sorted_columns)

# intersection of C and D, with the duplicated columns removed by the helper
CnD = df_target_lower.join(df_source_lower,
                           on=lower_join_keys, how='inner')
new_CnD = dropDupeDfCols(CnD)
new_CnD = new_CnD.select(lower_sorted_columns)

lower_final_df = CmD_union_DmC.union(new_CnD)
result_df_lower = D_minus_C.union(lower_new_df)
df_result_lower_seg = result_df_lower.union(new_CnD)
# The script never defined df_final_result; build it from the two segment
# results computed above.
# NOTE(review): assumes the final result is upper segment + lower segment -- confirm.
df_final_result = df_result_upper_seg.union(df_result_lower_seg)

# Write the combined result back to Oracle.
# The thin-driver URL needs "@" between the credentials and the host.
# NOTE(review): this overwrites "mydata", the same table df_target reads lazily
# over JDBC; overwriting a table that is still being read can lose data or
# stall the job. Consider writing to a staging table -- confirm.
(
    df_final_result.write
    .format("jdbc")
    .mode("overwrite")
    .option("url", "jdbc:oracle:thin:system/oracle123@127.0.0.1:1010:orcl")
    .option("dbtable", "mydata")
    .option("user", "system")
    .option("password", "oracle123")
    .option("driver", "oracle.jdbc.driver.OracleDriver")
    .save()
)

Take a look at Spark UI and monitoring guide
Try to split your job into steps. Then find the step that is failing.

Related

Why is PySpark not reading data from kafka using streaming, but works fine with reading it normally?

I'm trying to read in data from kafka using structured streaming, but the program doesn't seem to be getting any of it.
This code doesn't print any records to the console:
# Kafka connection settings
KAFKA_TOPIC = "stations-topic"
KAFKA_SERVER = "kafka:9092"

# Spark entry points
sc = SparkContext("local[4]")
ssc = StreamingContext(sc, 1)
spark = SparkSession.builder.appName("stations").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

# Subscribe to the topic as a streaming source.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option("subscribe", KAFKA_TOPIC)
    .load()
)

# Decode the message payload, parse it as CSV and flatten the parsed struct.
df1 = df.selectExpr("CAST(value AS STRING)")
df2 = df1.select(from_csv(col("value"), df_schema_string).alias("stations"))
df3 = df2.select("stations.*")

# Print each new batch of rows to the console until the query is stopped.
out = (
    df3.writeStream
    .format("console")
    .outputMode("append")
    .start()
)
out.awaitTermination()
But if I modify it to not use streaming like this:
# Kafka connection settings
KAFKA_TOPIC = "stations-topic"
KAFKA_SERVER = "kafka:9092"

# Spark entry points
sc = SparkContext("local[4]")
ssc = StreamingContext(sc, 1)
spark = SparkSession.builder.appName("stations").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

# Read the topic as a one-off batch instead of a stream.
df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVER)
    .option("subscribe", KAFKA_TOPIC)
    .load()
)

# Decode the message payload, parse it as CSV and flatten the parsed struct.
df1 = df.selectExpr("CAST(value AS STRING)")
df2 = df1.select(from_csv(col("value"), df_schema_string).alias("stations"))
df3 = df2.select("stations.*")
df3.show(10)
it prints the top 10 rows of data without issues. Any clue on what could be causing this?
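Found the problem: I had to add .option("startingOffsets", "earliest") to the readStream. By default a streaming Kafka query starts from the latest offsets, so it only sees records produced after the query starts, whereas a batch read starts from the earliest offsets.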

Databricks - separating columns + using schema

I wrote a set of code to create a chart, I am trying to show this chart in 2 ways; with and without the schema function. When I have my code without schema, my problem is that the columns are all shown in one column, and I don't know how to separate them:
# Where the data lives and how it is encoded
file_location = "/databricks-datasets/cs110x/ml-20m/data-001/movies.csv.gz"
file_type = "csv"

# Reader settings for the CSV source
infer_schema = "true"
first_row_is_header = "false"
delimiter = "::"

# Load the file, letting Spark infer the column types.
df_movies = (
    spark.read
    .format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

print(df_movies.count())
display(df_movies)
Result:
And when I add schema, columns get separated but I get null values:
from pyspark.sql.types import *
# Explicit schema: one integer id followed by two string columns.
movies_schema = StructType(
    [
        StructField('movieId', IntegerType()),
        StructField('title', StringType()),
        StructField('genres', StringType()),
    ]
)

# Where the data lives and how it is encoded
file_location = "/databricks-datasets/cs110x/ml-20m/data-001/movies.csv.gz"
file_type = "csv"

# Reader settings for the CSV source
infer_schema = "true"
first_row_is_header = "false"
delimiter = "::"

# Load the file with the explicit schema instead of type inference.
df_movies = (
    spark.read
    .format(file_type)
    .schema(movies_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

print(df_movies.count())
display(df_movies)
Result:
I would appreciate it if you could tell me how to separate the columns in the first case and how to solve the problem with the null values in the second.

Chain methods only if needed

I have the following code:
# Build the session, taking the master host and deploy mode from the environment.
spark = (
    SparkSession.builder
    .appName("sss")
    .master("spark://" + os.environ["MASTER_HOST"] + ":7077")
    .config("spark.submit.deployMode", os.environ["DEPLOY_MODE"])
    .getOrCreate()
)
How can I configure the code, so that it only chains specific parts of the code if conditions are true: Here's an imaginary code:
# Pseudo-code (not valid Python): each trailing `if` marks a step that should
# only be chained when the named environment variable is set.
spark = SparkSession.builder \
.appName("sss")\
.master("spark://" + os.environ["MASTER_HOST"] + ":7077")\ if os.environ["MASTER_HOST"]
.config("spark.submit.deployMode", os.environ["DEPLOY_MODE"])\ if os.environ["DEPLOY_MODE"]
.getOrCreate()
I think this will work, but this code seems too long to me:
# Incomplete sketch of the step-by-step alternative; "...." stands for the
# remaining steps, so this is not runnable as written.
spark = SparkSession.builder \
.appName("sss")\
if os.environ["MASTER_HOST"]:
spark = spark.master("spark://" + os.environ["MASTER_HOST"] + ":7077")
....
What's the smallest code snippet you can think of to produce the desired results?
This is the best you can do.
# Start with the unconditional part of the builder chain.
builder = SparkSession.builder.appName("sss")

# Chain each optional step only when its environment variable is set and
# non-empty. .get() returns None for an unset variable instead of raising
# KeyError, so a missing variable simply skips the step.
if os.environ.get("MASTER_HOST"):
    builder = builder.master("spark://" + os.environ["MASTER_HOST"] + ":7077")
if os.environ.get("DEPLOY_MODE"):
    builder = builder.config("spark.submit.deployMode", os.environ["DEPLOY_MODE"])

spark = builder.getOrCreate()
Or
# Read the optional settings once. .get() returns None for an unset variable
# instead of raising KeyError, so a missing variable simply skips its step.
master_host = os.environ.get("MASTER_HOST")
deploy_mode = os.environ.get("DEPLOY_MODE")

# Start with the unconditional part of the chain, then add the optional steps.
builder = SparkSession.builder.appName("sss")
if master_host:
    builder = builder.master("spark://" + master_host + ":7077")
if deploy_mode:
    builder = builder.config("spark.submit.deployMode", deploy_mode)

spark = builder.getOrCreate()

Problems with group_by SqlAlchemy

I'm having a problem with SqlAlchemy and a group_by clause. See the SqlAlchemy query below.
I've got a SqlAlchemy query that includes a group_by clause, and it's raising an exception: '(cx_Oracle.DatabaseError) ORA-00979: not a GROUP BY expression'. However, when I take the SQL generated by the SqlAlchemy query and run it manually, the query works fine.
I'm not sure how to figure out what's wrong with the group_by clause. How can I debug this problem and figure out what I can do to fix it?
# create shorthand aliases
# One alias per mapped model, so the queries below can refer to each table by
# a short name.
# NOTE(review): `bin` shadows the Python builtin of the same name for the rest
# of this scope.
b = db.aliased(Batch)
bs = db.aliased(BatchingStatus)
bp = db.aliased(BatchPress)
bst = db.aliased(BatchState)
bit = db.aliased(BatchItem)
bin = db.aliased(BatchInput)
bpri = db.aliased(BatchPriority)
lcu = db.aliased(LCUser)
s = db.aliased(SubBatch)
w = db.aliased(WorkType)
ptw = db.aliased(LCProductToWorkType)
ctp = db.aliased(LCCategoryToProduct)
c = db.aliased(LCCategory)
# Subquery used by the main query's IN filter: names of the active products
# whose category path (lower-cased) equals `category`.
subq = db.session.query(ctp.product_name)
subq = subq.join(c, c.category_id == ctp.category_id)
subq = subq.filter(func.lower(c.category_path) == category)
subq = subq.filter(ctp.active == 1)
# start of problem query
# Builds one row per batch/press/status/priority/product/user combination,
# aggregating the batch inputs (earliest promise date, list of item ids).
# NOTE(review): ORA-00979 appearing only when run through SQLAlchemy may mean
# an expression is rendered differently in SELECT and GROUP BY (for example
# with separate bind parameters) -- confirm by inspecting the compiled SQL
# together with its parameters.
q = db.session.query(
b.batch_signature.label('batch_signature'),
b.batch_num,
b.created_date.label('created_date'),
bst.code.label('batch_state'),
# aggregate: earliest promise date among the joined batch inputs
func.min(bin.promise_date).label('due_out'),
bs.job_status,
bp.press_id.label('press_id'),
bp.description.label('press_description'),
bp.code.label('press_code'),
bp.active.label('press_active'),
# aggregate: comma-separated item ids, ordered by item id
func.listagg(bin.item_id, ',').within_group(bin.item_id).label('subbatches'),
bs.item_count.label('item_count'),
bs.product.label('product'),
bpri.code.label('priority'),
ptw.display_format.label('product_display_format'),
c.display_name.label('category_display_name'),
# the question says these coalesce_* attributes are hybrid properties
lcu.coalesce_first_name,
lcu.coalesce_last_name,
lcu.coalesce_email,
) \
.join(bs, (bs.batch_signature == b.batch_signature) & (bs.press_id == b.press_id)) \
.join(bp, bp.press_id == b.press_id) \
.join(bst, bst.state_id == b.state_id) \
.join(bit, bit.batch_id == b.batch_id) \
.join(bin, bin.batch_input_id == bit.batch_input_id) \
.join(bpri, bpri.priority_id == bin.priority_id) \
.join(lcu, lcu.username == bs.actor) \
.join(s, s.subbatchno == func.to_char(bin.item_id)) \
.join(w, w.worktypeenum == s.worktypeenum) \
.join(ptw, ptw.worktypeenum == w.worktypeenum) \
.join(ctp, ctp.category_to_product_id == ptw.category_to_product_id) \
.join(c, c.category_id == ctp.category_id) \
.filter(bs.product.in_(subq)) \
.filter(b.state_id <= 200) \
.group_by(
# every non-aggregated expression from the select list is repeated here
b.batch_signature,
b.batch_num,
b.created_date,
bst.code,
bs.job_status,
bp.press_id,
bp.description,
bp.code,
bp.active,
bs.item_count,
bs.product,
bpri.code,
ptw.display_format,
c.display_name,
lcu.coalesce_first_name,
lcu.coalesce_last_name,
lcu.coalesce_email,
) \
.order_by('batch_signature', 'batch_num', 'created_date')
# Run the query and print the database error instead of letting it propagate.
try:
    retval = q.all()
except Exception as e:
    # print(e) is valid in both Python 2 and Python 3, unlike `print e`.
    print(e)
The above doesn't show the models, some of which have @hybrid_property/@.expression methods, like the lcu.coalesce_first_name columns, which are an attempt to hide the func.coalesce code that I thought was causing the group_by problems.

Windowing and aggregating pyspark DataFrame [duplicate]

This question already has an answer here:
Spark Structured Streaming using sockets, set SCHEMA, Display DATAFRAME in console
(1 answer)
Closed 5 years ago.
I'm trying to process incoming events from a socket, then windowing and aggregating the event data. I've hit a snag with the windowing. It appears that even though I specify a schema for the DataFrame, it does not translate into columns.
import sys
from pyspark.sql.types import StructType, StringType, TimestampType, FloatType, IntegerType, StructField
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
if __name__ == "__main__":
    # our data currently looks like this (tab separated).
    # -SYMBOL DATE PRICE TICKVOL BID ASK
    # NQU7 2017-05-28T15:00:00 5800.50 12 5800.50 5800.50
    # NQU7 2017-05-28T15:00:00 5800.50 1 5800.50 5800.50
    # NQU7 2017-05-28T15:00:00 5800.50 5 5800.50 5800.50
    # NQU7 2017-05-28T15:00:00 5800.50 1 5800.50 5800.50

    # Expect exactly two arguments: <hostname> <port>.
    if len(sys.argv) != 3:
        # print("Usage: network_wordcount.py <hostname> <port>", file=sys.stderr)
        sys.exit(-1)

    spark = SparkSession \
        .builder \
        .appName("StructuredTickStream") \
        .getOrCreate()
    sc = spark.sparkContext
    sc.setLogLevel('WARN')

    # Intended layout of one tick record.
    tickSchema = StructType([
        StructField("symbol", StringType(), True),
        StructField("dt", TimestampType(), True),
        StructField("price", FloatType(), True),
        StructField("tickvol", IntegerType(), True),
        StructField("bid", FloatType(), True),
        StructField("ask", FloatType(), True)
    ])

    # Read lines from the socket given on the command line.
    # NOTE(review): per the reported output, the resulting frame has a single
    # `value` column even though a schema is passed, so each line presumably
    # still has to be split into the tickSchema columns -- confirm.
    events_df = spark \
        .readStream \
        .option("sep", "\t") \
        .option("host", sys.argv[1]) \
        .option("port", sys.argv[2]) \
        .format("socket") \
        .schema(tickSchema) \
        .load()
    events_df.printSchema()
    print("columns = ", events_df.columns)

    # Open/high/low/close per 5-minute window, sliding every minute.
    # No .collect() here: it would turn ohlc_df into a plain list, and a
    # streaming DataFrame has to be started with writeStream.start() instead.
    ohlc_df = events_df \
        .groupby(F.window("dt", "5 minutes", "1 minutes")) \
        .agg(
            F.first('price').alias('open'),
            F.max('price').alias('high'),
            F.min('price').alias('low'),
            F.last('price').alias('close')
        )

    # Print the full aggregate to the console after every trigger.
    query = ohlc_df \
        .writeStream \
        .outputMode("complete") \
        .format("console") \
        .start()
    query.awaitTermination()
The output of the print("columns = ", events_df.columns) is ['value'], and the process fails with the following trace:
pyspark.sql.utils.AnalysisException: "cannot resolve '`dt`' given input columns: [value];;\n'Aggregate [timewindow('dt, 300000000, 60000000, 0)], [timewindow('dt, 300000000, 60000000, 0) AS window#3, first('price, false) AS open#7, max('price) AS high#9, min('price) AS low#11, last('price, false) AS close#13]\n+- StreamingRelation DataSource(org.apache.spark.sql.SparkSession#3a32b1a2,socket,List(),Some(StructType(StructField(symbol,StringType,true), StructField(dt,TimestampType,true), StructField(price,FloatType,true), StructField(tickvol,IntegerType,true), StructField(bid,FloatType,true), StructField(ask,FloatType,true))),List(),None,Map(sep -> \t, host -> localhost, port -> 9999),None), textSocket, [value#0]\n"
Any idea what I'm doing wrong?
Your DataFrame has only one column, value (the schema passed to .schema() was not applied), yet you are trying to access the column dt from events_df. This is the main cause of the problem.
The statement below clearly shows that it has a single column, value:
# According to the question, the column list printed here is ['value'].
print("columns = ", events_df.columns)
You need to inspect this
# Socket reader from the question: host and port come from the command line.
events_df = (
    spark.readStream
    .format("socket")
    .option("sep", "\t")
    .option("host", sys.argv[1])
    .option("port", sys.argv[2])
    .schema(tickSchema)
    .load()
)
to find out why it creates a DataFrame with only one column.

Categories