Chain methods only if needed - python

I have the following code:
spark = SparkSession.builder \
    .appName("sss") \
    .master("spark://" + os.environ["MASTER_HOST"] + ":7077") \
    .config("spark.submit.deployMode", os.environ["DEPLOY_MODE"]) \
    .getOrCreate()
How can I change the code so that it only chains specific parts when certain conditions are true? Here's some imaginary code:
spark = SparkSession.builder \
    .appName("sss") \
    .master("spark://" + os.environ["MASTER_HOST"] + ":7077") \ if os.environ["MASTER_HOST"]
    .config("spark.submit.deployMode", os.environ["DEPLOY_MODE"]) \ if os.environ["DEPLOY_MODE"]
    .getOrCreate()
I think this will work, but this code seems too long to me:
spark = SparkSession.builder \
    .appName("sss")
if os.environ["MASTER_HOST"]:
    spark = spark.master("spark://" + os.environ["MASTER_HOST"] + ":7077")
....
What's the smallest code snippet you can think of to produce the desired results?

This is the best you can do.
builder = SparkSession.builder.appName("sss")
if os.environ["MASTER_HOST"]:
    builder = builder.master("spark://" + os.environ["MASTER_HOST"] + ":7077")
if os.environ["DEPLOY_MODE"]:
    builder = builder.config("spark.submit.deployMode", os.environ["DEPLOY_MODE"])
spark = builder.getOrCreate()
Or
master_host = os.environ["MASTER_HOST"]
deploy_mode = os.environ["DEPLOY_MODE"]
builder = SparkSession.builder.appName("sss")
if master_host:
    builder = builder.master("spark://" + master_host + ":7077")
if deploy_mode:
    builder = builder.config("spark.submit.deployMode", deploy_mode)
spark = builder.getOrCreate()
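If the environment variables might be missing entirely (not just set to an empty string), the same idea works with os.environ.get, which returns None instead of raising a KeyError. A minimal sketch along those lines, not part of the answer above:

import os
from pyspark.sql import SparkSession

builder = SparkSession.builder.appName("sss")

# os.environ.get returns None when the variable is not set,
# so the same truthiness checks cover both missing and empty values.
master_host = os.environ.get("MASTER_HOST")
deploy_mode = os.environ.get("DEPLOY_MODE")

if master_host:
    builder = builder.master("spark://" + master_host + ":7077")
if deploy_mode:
    builder = builder.config("spark.submit.deployMode", deploy_mode)

spark = builder.getOrCreate()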

Related

Why is PySpark not reading data from Kafka using streaming, when it works fine reading it normally?

I'm trying to read in data from kafka using structured streaming, but the program doesn't seem to be getting any of it.
This code doesn't print any records to the console:
KAFKA_TOPIC = "stations-topic"
KAFKA_SERVER = "kafka:9092"
sc = SparkContext("local[4]")
ssc = StreamingContext(sc, 1)
spark = SparkSession.builder.appName("stations").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", KAFKA_SERVER) \
    .option("subscribe", KAFKA_TOPIC) \
    .load()
df1 = df.selectExpr("CAST(value AS STRING)")
df2 = df1.select(from_csv(col("value"), df_schema_string).alias("stations"))
df3 = df2.select("stations.*")
out = df3 \
    .writeStream \
    .format("console") \
    .outputMode("append") \
    .start()
out.awaitTermination()
But if I modify it to not use streaming like this:
KAFKA_TOPIC = "stations-topic"
KAFKA_SERVER = "kafka:9092"
sc = SparkContext("local[4]")
ssc = StreamingContext(sc, 1)
spark = SparkSession.builder.appName("stations").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
df = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", KAFKA_SERVER) \
    .option("subscribe", KAFKA_TOPIC) \
    .load()
df1 = df.selectExpr("CAST(value AS STRING)")
df2 = df1.select(from_csv(col("value"), df_schema_string).alias("stations"))
df3 = df2.select("stations.*")
df3.show(10)
it prints the top 10 rows of data without issues. Any clue on what could be causing this?
Found the problem: I had to add .option("startingOffsets", "earliest") to the readStream.
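Based on that fix, the corrected readStream block looks like this (unchanged apart from the added option):

df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", KAFKA_SERVER) \
    .option("subscribe", KAFKA_TOPIC) \
    .option("startingOffsets", "earliest") \
    .load()
# Without startingOffsets the streaming query defaults to "latest", so it only
# sees records produced after the query starts, which looks like "no data".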

Python Page Rank Streaming Application using Hadoop, py4j.protocol.Py4JJavaError: An error occurred while calling o27.partitions

I am trying to convert a page rank algorithm from plain Python code into a Spark Streaming application in Python. It takes input on a fixed interval (10 seconds in this case) from a directory that another Python script fills with files; my script is supposed to pick those files up on that schedule and analyze them. When I run the following code I get an error, and I don't know what causes it. I tried it with .csv files.
import sys
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

def main(input_folder_location):
    sc = SparkContext.getOrCreate()
    ssc = StreamingContext(sc, 10)  # Streaming will execute every 10 seconds
    ssc.checkpoint(input_folder_location)  # directory to be checked
    links = spark.sparkContext.textFile(input_folder_location). \
        map(lambda line: line.split(',')). \
        map(lambda pages: (pages[0], pages[1])). \
        distinct(). \
        groupByKey(). \
        map(lambda x: (x[0], list(x[1])))
    ranks = links.map(lambda element: (element[0], 1.0))
    # iterations = int(sys.argv[3])
    iterations = 4
    for x in range(iterations + 1):
        contribs = links.join(ranks).flatMap(lambda row: computeContribs(row[1][0], row[1][1]))
        print("\n")
        print("------- Iter: " + str(x) + " --------")
        ranks = contribs.reduceByKey(lambda v1, v2: v1 + v2).map(lambda x: (x[0], x[1] * 0.85 + 0.15))
        for rank in ranks.collect():
            print(rank)
    print("\n")
    print("------- Final Results --------")
    for rank in ranks.collect():
        print(rank)
    ssc.start()
    ssc.awaitTermination()

def computeContribs(neighbors, rank):
    for neighbor in neighbors:
        yield (neighbor, rank / len(neighbors))

if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.stderr.write(
            "Error: Usage: StreamingApp.py <input-file-directory>")
        sys.exit()
    spark = SparkSession.builder.getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    main(sys.argv[1])
import sys
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

def computeContribs(neighbors, rank):
    for neighbor in neighbors:
        yield (neighbor, rank / len(neighbors))

def main(input_folder_location):
    sc = SparkContext.getOrCreate()
    ssc = StreamingContext(sc, 3)  # Streaming will execute every 3 seconds
    lines = ssc.textFileStream(input_folder_location)  # 'log/' mean directory name
    print(" hello from app")
    counts = lines.map(lambda line: line.split(",")) \
        .map(lambda pages: (pages[0], pages[1])) \
        .transform(lambda rdd: rdd.distinct()) \
        .groupByKey() \
        .map(lambda x: (x[0], list(x[1])))
    ranks = counts.map(lambda element: (element[0], 1.0))
    iterations = 5
    for x in range(1):
        contribs = counts.join(ranks).flatMap(lambda row: computeContribs(row[1][0], row[1][1]))
        print("\n")
        print(" iter --------------", x)
        ranks = contribs.reduceByKey(lambda v1, v2: v1 + v2)
    print("\n")
    # counts.pprint()
    ranks.pprint()
    print(" finishing the task")
    ssc.start()
    ssc.awaitTermination()

if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.stderr.write(
            "Error: Usage: StreamingApp.py <input-file-directory>")
        sys.exit()
    spark = SparkSession.builder.getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    main(sys.argv[1])
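A general note on the streaming side: textFile, join, and collect are batch RDD operations that run immediately, while a StreamingContext only drives DStream operations after ssc.start(). One common pattern for running per-batch RDD logic such as the page rank loop is DStream.foreachRDD. Below is a minimal sketch of that pattern; the input path is an assumption, and none of it is taken from the question:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext.getOrCreate()
ssc = StreamingContext(sc, 10)

lines = ssc.textFileStream("hdfs:///tmp/pagerank-input")  # hypothetical path

def process_batch(time, rdd):
    # Runs once per micro-batch; rdd holds the lines of that batch.
    if rdd.isEmpty():
        return
    links = (rdd.map(lambda line: line.split(","))
                .map(lambda pages: (pages[0], pages[1]))
                .distinct()
                .groupByKey()
                .mapValues(list))
    ranks = links.mapValues(lambda _: 1.0)
    for _ in range(4):
        contribs = links.join(ranks).flatMap(
            lambda row: [(n, row[1][1] / len(row[1][0])) for n in row[1][0]])
        ranks = contribs.reduceByKey(lambda a, b: a + b).mapValues(lambda r: 0.85 * r + 0.15)
    print(time, ranks.collect())

lines.foreachRDD(process_batch)
ssc.start()
ssc.awaitTermination()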

Unable to finish the spark job on spark standalone cluster

I'm very new to Spark; I've been using it for just a week. This is my PySpark code, running on a standalone Spark cluster with a single master and two slaves. I'm trying to run a job that reads a 01. million record dataset, performs some manipulation on the data, and then dumps the dataframe into an Oracle table. I'm having trouble completing the job. The program seems to have created 404 partitions for the tasks; on the console I can see 403/404 completed, but the last task, on partition 404, takes forever and the job never finishes. Can anyone tell me what the issue with my code is, help with optimizing performance on Spark, or point me to a guide or tutorial? Thanks in advance.
# creating a spark session
spark = SparkSession \
    .builder \
    .appName("pyspark_testing_29012020") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# target table schema and column order
df_target = spark.read.csv("mycsv path", header=True)
df_local_schema = df_target.schema
df_column_order = df_target.columns

# dataframe with input file/dataset values and schema
df_source = spark.read \
    .format("csv") \
    .option("header", "false") \
    .option("inferschema", "true") \
    .option("delimiter", ",") \
    .schema(df_local_schema) \
    .load("csv path")

# dataframe with the target file/dataset values
df_target = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:oracle:thin:system/oracle123#127.0.0.1:0101:orcl") \
    .option("dbtable", "mydata") \
    .option("user", "system") \
    .option("password", "oracle123") \
    .option("driver", "oracle.jdbc.driver.OracleDriver") \
    .load()

# splitting the target table in to upper and lower sections
df_target_upper = df_target.where(df_target['Key'] < 5)  # set A
df_source_upper = df_source.where(df_source['Key'] < 5)  # set B
df_source_lower = df_source.where(df_source['Key'] > 4)  # set D
df_target_lower = df_target.where(df_target['key'] > 4)  # set C

''' now programming for the upper segment of the data '''
# set operation A-B
A_minus_B = df_target_upper.join(df_source_upper,
                                 on=['key1', 'key2', 'key3', 'key4'],
                                 how='left_anti')
A_minus_B = A_minus_B.select(sorted(df_column_order))
# set operation B-A
B_minus_A = df_source_upper.join(df_target_upper,
                                 on=['key1', 'key2', 'key3', 'key4'], how='left_anti')
B_minus_A = B_minus_A.select(sorted(df_column_order))
# union of A-B and B-A
AmB_union_BmA = A_minus_B.union(B_minus_A)
AmB_union_BmA = AmB_union_BmA.select(sorted(df_column_order))
# A-B left anti B-A to get the uncommon record in both the dataframes
new_df = A_minus_B.join(B_minus_A, on=['key'], how='left_anti')
new_df = new_df.select(sorted(df_column_order))
AmB_union_BmA = AmB_union_BmA.select(sorted(df_column_order))
AnB = df_target_upper.join(df_source_upper,
                           on=['key1', 'key2', 'key3', 'key4'],
                           how='inner')
df_AnB_without_dupes = dropDupeDfCols(AnB)
new_AnB = df_AnB_without_dupes.select(sorted(df_column_order))
final_df = AmB_union_BmA.union(new_AnB)
final_df.show()
result_df = B_minus_A.union(new_df)
df_result_upper_seg = result_df.union(new_AnB)

''' now programming for the lower segment of the data '''
# set operation C-D
C_minus_D = df_target_lower.join(df_source_lower, on=['key'], how='left_anti')
C_minus_D = C_minus_D.select(sorted(df_column_order))
# set operation D-C
D_minus_C = df_source_lower.join(df_target_lower, on=['key'], how='left_anti')
D_minus_C = D_minus_C.select(sorted(df_column_order))
# union of C-D union D-C
CmD_union_DmC = C_minus_D.union(D_minus_C)
CmD_union_DmC = CmD_union_DmC.select(sorted(df_column_order))
# C-D left anti D-C to get the uncommon record in both the dataframes
lower_new_df = C_minus_D.join(D_minus_C, on=['key'], how='left_anti')
lower_new_df = lower_new_df.select(sorted(df_column_order))
CmD_union_DmC = CmD_union_DmC.select(sorted(df_column_order))
CnD = df_target_lower.join(df_source_lower,
                           on=['key'], how='inner')
new_CnD = dropDupeDfCols(CnD)
new_CnD = new_CnD.select(sorted(df_column_order))
lower_final_df = CmD_union_DmC.union(new_CnD)
result_df_lower = D_minus_C.union(lower_new_df)
df_result_lower_seg = result_df_lower.union(new_CnD)

df_final_result.write \
    .format("jdbc") \
    .mode("overwrite") \
    .option("url", "jdbc:oracle:thin:system/oracle123#127.0.0.1:1010:orcl") \
    .option("dbtable", "mydata") \
    .option("user", "system") \
    .option("password", "oracle123") \
    .option("driver", "oracle.jdbc.driver.OracleDriver") \
    .save()
Take a look at the Spark UI and the monitoring guide.
Try to split your job into steps, then find the step that is failing.
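For example, a sketch only: it assumes df_final_result is meant to be the union of the upper and lower segment results, and the partition count and batch size are guesses to tune, not values from the answer. Materializing each stage makes it show up separately in the Spark UI, and repartitioning before the JDBC write spreads the rows over evenly sized tasks instead of one huge final partition:

# Materialize intermediate results so each stage can be checked in the Spark UI.
df_result_upper_seg = df_result_upper_seg.cache()
print("upper segment rows:", df_result_upper_seg.count())

df_result_lower_seg = df_result_lower_seg.cache()
print("lower segment rows:", df_result_lower_seg.count())

# Assumption: the final result is the union of the two segments.
df_final_result = df_result_upper_seg.union(df_result_lower_seg)

# Repartition before the JDBC write and batch the inserts;
# 20 partitions and 10000 rows per batch are placeholders to tune.
df_final_result.repartition(20).write \
    .format("jdbc") \
    .mode("overwrite") \
    .option("url", "jdbc:oracle:thin:system/oracle123#127.0.0.1:1010:orcl") \
    .option("dbtable", "mydata") \
    .option("user", "system") \
    .option("password", "oracle123") \
    .option("driver", "oracle.jdbc.driver.OracleDriver") \
    .option("batchsize", 10000) \
    .save()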

Python: concurrent.futures is executing the whole code multiple times instead of the function passed to Executor.submit()

I have written some PySpark code to generate quantiles on a set of columns, and I am calling this function via concurrent.futures because I want it to run on two sets of columns in parallel.
But instead of only the function submitted to the ThreadPoolExecutor being executed, the whole code gets executed three times.
I am calling the function generate_advisor_quartiles() from the main() method of another Python program.
from src.utils import sql_service, apa_constant as constant
from pyspark.sql.functions import monotonicallyIncreasingId
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext
from pyspark.sql.functions import lit
from pyspark.sql.types import *
from pyspark.ml.feature import *
from concurrent.futures import *
from pyspark.ml.feature import Bucketizer
import numpy as np
import pandas as pd
import os

def generate_Quantiles(df, attr_list, spark_context):
    jdf = df._jdf
    quantileList = []
    sqlContext = SQLContext(spark_context)
    fields = [StructField('attribute', StringType(), True),
              StructField('col1', DoubleType(), True),
              StructField('col2', DoubleType(), True),
              StructField('col3', DoubleType(), True),
              StructField('col4', DoubleType(), True),
              StructField('col5', DoubleType(), True)]
    schema = StructType(fields)
    for var in attr_list:
        bindt = spark_context._jvm.com.dstsystems.apa.util.DFQuantileFunction.approxQuantile(
            jdf, [var.col_name], [0.0, 0.25, 0.5, 0.75, 1.0], 0.0)
        q0 = bindt[0][0]
        q1 = bindt[0][1]
        q2 = bindt[0][2]
        q3 = bindt[0][3]
        q4 = bindt[0][4]
        colQuantileList = [q0, q1, q2, q3, q4]
        quantileList.append(colQuantileList)
        bindt = sorted(list(set(list(bindt[0]))))
        bindt = [-float("inf")] + bindt
        bindt.insert(len(bindt), float("inf"))
        bindt.insert(len(bindt), float("NaN"))
        bucketizer = Bucketizer().setInputCol(var.col_name).setOutputCol("{}_quantile".format(var.col_name)).setSplits(bindt)
        df = bucketizer.transform(df)
        df = df.withColumn("{}_quantile".format(var.col_name), (lit(4.0) - df["{}_quantile".format(var.col_name)]))
        df.drop(var.col_name)
    quantileRDD = spark_context.parallelize(quantileList)
    quantileDF = sqlContext.createDataFrame(quantileRDD, schema)
    df.count()
    quantileDF.count()
    return df, quantileDF
def generate_advisor_quartiles(spark_context, hive_context, log, **kwargs):
    log.info("Started - Generate adviser quartile reports ")
    sql = """describe dbName.tablename"""  # .format(kwargs['sem_db'])
    op = hive_context.sql(sql)
    res = op.withColumn("ordinal_position", monotonicallyIncreasingId())
    res.registerTempTable('attribs')
    id_lst = hive_context.sql(
        "select col_name from attribs where ordinal_position <= 24 order by ordinal_position").collect()
    sql = "select %s from %s.tablename " % ((", ".join(str(v.col_name) for v in id_lst)), kwargs['sem_db'])
    id_tbl = hive_context.sql(sql)
    attr_lst = hive_context.sql(
        """select col_name from attribs where ordinal_position > 24 AND col_name not like '%vehicle%'
           AND col_name not like '%cluster_num%'
           AND col_name not like '%value_seg%' order by ordinal_position limit 2""").collect()
    vhcl_lst = hive_context.sql(
        """select col_name from attribs where ordinal_position > 24 AND col_name not like '%vehicle%'
           AND ( col_name like '%vehicle%'
                 OR col_name IN ('cluster_num', 'value_seg')
               ) order by ordinal_position""").collect()
    sqltemp = "select %s from %s.Tablename" % ((", ".join(['entity_id'] + [str(vhcl.col_name) for vhcl in vhcl_lst])), kwargs['sem_db'])
    id_tbl = hive_context.sql(sqltemp)
    attr_lst1 = attr_lst[:len(attr_lst) // 2]
    attr_lst2 = attr_lst[len(attr_lst) // 2:]
    # sqltemp = "select cast(entity_id as decimal(38,20)) , %s from %s.tablename where ud_rep = 1" % (", ".join("cast(" + str(attr.col_name) + " as decimal(38,20))" for attr in attr_lst), kwargs['sem_db'])
    sqltemp1 = "select cast(entity_id as double) , %s from %s.tablename where ud_rep = 1" % (", ".join("cast(" + str(attr.col_name) + " as double)" for attr in attr_lst1), kwargs['sem_db'])
    sqltemp2 = "select cast(entity_id as double) , %s from %s.tablename where ud_rep = 1" % (", ".join("cast(" + str(attr.col_name) + " as double)" for attr in attr_lst2), kwargs['sem_db'])
    df1 = hive_context.sql(sqltemp1)
    df1 = df1.replace(0, np.nan)
    df2 = hive_context.sql(sqltemp2)
    df2 = df2.replace(0, np.nan)
    with ThreadPoolExecutor(max_workers=2) as executor2:
        result1 = executor2.submit(generate_Quantiles, df1, attr_lst1, spark_context)
        result2 = executor2.submit(generate_Quantiles, df2, attr_lst2, spark_context)
        future_list = [result1, result2]
        for future in as_completed(future_list):
            print("completed")
        df1, df2 = result1.result()
        df3, df4 = result2.result()
        finalQuantiles = df1.join(df3, "entity_id", "inner")
        quantilValuesDF = df2.union(df4)
        finalQuantiles.show()
        quantilValuesDF.show()

Windowing and aggregating pyspark DataFrame [duplicate]

This question already has an answer here: Spark Structured Streaming using sockets, set SCHEMA, Display DATAFRAME in console (1 answer). Closed 5 years ago.
I'm trying to process incoming events from a socket, then window and aggregate the event data. I've hit a snag with the windowing: it appears that even though I specify a schema for the DataFrame, it does not translate into columns.
import sys
from pyspark.sql.types import StructType, StringType, TimestampType, FloatType, IntegerType, StructField
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

if __name__ == "__main__":
    # our data currently looks like this (tab separated).
    # -SYMBOL DATE PRICE TICKVOL BID ASK
    # NQU7 2017-05-28T15:00:00 5800.50 12 5800.50 5800.50
    # NQU7 2017-05-28T15:00:00 5800.50 1 5800.50 5800.50
    # NQU7 2017-05-28T15:00:00 5800.50 5 5800.50 5800.50
    # NQU7 2017-05-28T15:00:00 5800.50 1 5800.50 5800.50
    if len(sys.argv) != 3:
        # print("Usage: network_wordcount.py <hostname> <port>", file=sys.stderr)
        exit(-1)
    spark = SparkSession \
        .builder \
        .appName("StructuredTickStream") \
        .getOrCreate()
    sc = spark.sparkContext
    sc.setLogLevel('WARN')
    # Read all the csv files written atomically in a directory
    tickSchema = StructType([
        StructField("symbol", StringType(), True),
        StructField("dt", TimestampType(), True),
        StructField("price", FloatType(), True),
        StructField("tickvol", IntegerType(), True),
        StructField("bid", FloatType(), True),
        StructField("ask", FloatType(), True)
    ])
    events_df = spark \
        .readStream \
        .option("sep", "\t") \
        .option("host", sys.argv[1]) \
        .option("port", sys.argv[2]) \
        .format("socket") \
        .schema(tickSchema) \
        .load()
    events_df.printSchema()
    print("columns = ", events_df.columns)
    ohlc_df = events_df \
        .groupby(F.window("dt", "5 minutes", "1 minutes")) \
        .agg(
            F.first('price').alias('open'),
            F.max('price').alias('high'),
            F.min('price').alias('low'),
            F.last('price').alias('close')
        ) \
        .collect()
    query = ohlc_df \
        .writeStream \
        .outputMode("complete") \
        .format("console") \
        .start()
    query.awaitTermination()
The output of the print("columns = ", events_df.columns) is ['value'], and the process fails with the following trace:
pyspark.sql.utils.AnalysisException: "cannot resolve '`dt`' given input columns: [value];;\n'Aggregate [timewindow('dt, 300000000, 60000000, 0)], [timewindow('dt, 300000000, 60000000, 0) AS window#3, first('price, false) AS open#7, max('price) AS high#9, min('price) AS low#11, last('price, false) AS close#13]\n+- StreamingRelation DataSource(org.apache.spark.sql.SparkSession#3a32b1a2,socket,List(),Some(StructType(StructField(symbol,StringType,true), StructField(dt,TimestampType,true), StructField(price,FloatType,true), StructField(tickvol,IntegerType,true), StructField(bid,FloatType,true), StructField(ask,FloatType,true))),List(),None,Map(sep -> \t, host -> localhost, port -> 9999),None), textSocket, [value#0]\n"
Any idea what I'm doing wrong?
Your data frame has only one column, value, and here you are trying to access the column dt from events_df. This is the main cause of the problem.
The statement below clearly shows it has a single column, value:
print("columns = ", events_df.columns)
You need to inspect this:
events_df = spark \
    .readStream \
    .option("sep", "\t") \
    .option("host", sys.argv[1]) \
    .option("port", sys.argv[2]) \
    .format("socket") \
    .schema(tickSchema) \
    .load()
to see why it is creating a DataFrame with only one column.
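One addition, not spelled out in the answer above but consistent with the linked duplicate: the socket source always delivers a single string column named value, so tickSchema has to be applied by parsing that column yourself. A minimal sketch of that idea, reusing events_df and tickSchema from the question:

from pyspark.sql import functions as F

# Split the tab-separated line and cast each field to the type declared in tickSchema.
parts = F.split(events_df["value"], "\t")
parsed_df = events_df.select(
    [parts.getItem(i).cast(field.dataType).alias(field.name)
     for i, field in enumerate(tickSchema.fields)]
)
# parsed_df now has the columns symbol, dt, price, tickvol, bid and ask,
# so the window("dt", ...) aggregation can be applied to it.

Note also that .collect() cannot be used on a streaming DataFrame; the aggregation result should be passed straight to writeStream.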
