Understanding Spark broadcasting - python

I find it confusing to use a broadcast variable inside a UDF that lives in an imported module. Say I create the broadcast variable inside a function that the main file imports. It works if the UDF is defined inside that function (second_func) but not if it is defined outside it (third_func).
Why is this happening?
Is it advisable to define UDFs inside the function that calls them?
# test_utils.py
from pyspark.sql import types as T
from pyspark.sql import functions as F

@F.udf(T.StringType())
def do_smth_out():
    return broadcasted.value["a"]

def second_func(spark, df):
    @F.udf(T.StringType())
    def do_smth_in():
        return broadcasted.value["a"]

    data = {"a": "c"}
    sc = spark.sparkContext
    broadcasted = sc.broadcast(data)
    return df.withColumn("a", do_smth_in())

def third_func(spark, df):
    data = {"a": "c"}
    sc = spark.sparkContext
    broadcasted = sc.broadcast(data)
    return df.withColumn("a", do_smth_out())
# main.py
from pyspark.sql import types as T
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from test_utils import second_func, third_func

@F.udf(T.StringType())
def do_smth():
    return broadcasted.value["a"]

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .getOrCreate()
    sc = spark.sparkContext

    columns = ["language", "users_count"]
    data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")]
    df = sc.parallelize(data).toDF(columns)
    broadcasted = sc.broadcast({"a": "c"})

    print("First trial")
    df.withColumn("a", do_smth()).show()
    # Works

    print("Second trial")
    second_func(spark, df).show()
    # Works

    print("Third trial")
    third_func(spark, df).show()
    # Doesn't work
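For reference, one pattern that sidesteps the question of where the UDF is defined is to pass the broadcast variable to the UDF explicitly through a small factory function. This is only a sketch of that idea (the helper name make_do_smth is mine, not from the question): the UDF then closes over the argument it receives rather than over a name it has to find in some module's globals.

# hypothetical sketch: make_do_smth is an illustrative helper, not part of the question
from pyspark.sql import functions as F
from pyspark.sql import types as T

def make_do_smth(broadcasted):
    # the inner UDF closes over the broadcast handle passed in as an argument,
    # so it no longer matters which module defines it
    @F.udf(T.StringType())
    def do_smth():
        return broadcasted.value["a"]
    return do_smth

def third_func(spark, df):
    broadcasted = spark.sparkContext.broadcast({"a": "c"})
    return df.withColumn("a", make_do_smth(broadcasted)())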

Related

PicklingError: Could not serialize object: TypeError: can't pickle SSLContext objects

I'm trying to use the AWS Encryption SDK to encrypt PySpark columns but I'm running into this error. Here's the code:
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col, lit
from pyspark.context import SparkContext
import aws_encryption_sdk
from aws_encryption_sdk import CommitmentPolicy

client = aws_encryption_sdk.EncryptionSDKClient(
    commitment_policy=CommitmentPolicy.REQUIRE_ENCRYPT_REQUIRE_DECRYPT
)
kms_kwargs = dict(key_ids=[key_arn])

global master_key_provider
master_key_provider = aws_encryption_sdk.StrictAwsKmsMasterKeyProvider(**kms_kwargs)

df = spark.read.csv('test.csv', inferSchema=True, header=True)

def encrypt_string(text):
    encrypted_text, encryptor_header = client.encrypt(
        source=text, key_provider=master_key_provider
    )
    return encrypted_text

udf_encrypt = udf(lambda text: encrypt_string(text))

def spark_encrypt(df, colmn):
    return df.withColumn("segment_encrypt", udf_encrypt(col(colmn)))

df_out = spark_encrypt(df, "segment")
Is there a way to resolve this?
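The error comes from Spark trying to pickle the UDF's closure, which includes the SDK client and its SSLContext. A common workaround, sketched below under the assumption that key_arn is a plain string available on the driver, is to build the client and key provider inside the function that runs on the executors, so they never have to be pickled. The base64 encoding is my addition so the ciphertext fits a string column.

# hedged sketch: construct the non-picklable objects inside the UDF so only
# plain strings (the key ARN and the text) travel through the closure
import base64
from pyspark.sql.functions import udf, col
import aws_encryption_sdk
from aws_encryption_sdk import CommitmentPolicy

def make_encrypt_udf(key_arn):
    def encrypt_string(text):
        # created on the executor for each task, never serialized on the driver
        client = aws_encryption_sdk.EncryptionSDKClient(
            commitment_policy=CommitmentPolicy.REQUIRE_ENCRYPT_REQUIRE_DECRYPT
        )
        provider = aws_encryption_sdk.StrictAwsKmsMasterKeyProvider(key_ids=[key_arn])
        encrypted_text, _header = client.encrypt(source=text, key_provider=provider)
        # base64 so the ciphertext fits a string column (my addition, not in the question)
        return base64.b64encode(encrypted_text).decode("utf-8")
    return udf(encrypt_string)

# usage (hypothetical): df.withColumn("segment_encrypt", make_encrypt_udf(key_arn)(col("segment")))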

caching and reusing pyspark dataframe in loop

I have 2 scripts and a use case where I need to create a dataframe in one script and use it in another, inside a loop. Something like below:
Script 1:
import sys
from pyspark.sql import SparkSession

def generate_data(spark, logger, conf):
    processed_data_final = None
    path_1 = conf["raw_data_path_1"]
    path_2 = conf["raw_data_path_2"]

    df_path1 = spark.read.parquet(path_1)
    df_path1.cache()
    df_path1.take(1)  # calling an action, as Spark does lazy evaluation
    df_path2 = spark.read.parquet(path_2)
    df_path2.cache()
    df_path2.take(1)

    for dt in date_list:
        processed_data = process_data(spark, logger, conf, dt, df_path1, df_path2)
        if processed_data_final is None:
            processed_data_final = processed_data
        else:
            processed_data_final = processed_data_final.union(processed_data)
    return processed_data_final

if __name__ == "__main__":
    # generate global variables: spark, logger
    if 5 == len(sys.argv):
        env = sys.argv[1]
        job_id = sys.argv[2]
    else:
        print("parameter {env} or {job_id}")
        exit(1)

    app_name = "past_viewership_" + job_id
    spark = SparkSession \
        .builder \
        .appName(app_name) \
        .config("spark.storage.memoryFraction", 0) \
        .config("spark.driver.maxResultSize", "-1") \
        .getOrCreate()
    sc = spark.sparkContext
    generate_data(spark, logger, conf)
In script 2, I reuse the dataframes from script 1 like this:
def process_data(spark, conf, df_path1, df_path2):
    path3 = conf['path3']
    df3 = spark.read.parquet(path3)
    res_df = df3.join(df_path1, ["id"], "inner").join(df_path2, ["id"], "inner")
    return res_df
This is rough code explaining the flow. In this flow I see in the logs that it loads df_path1 and df_path2 again inside the loop; I was expecting it to use the cached dataframes. How can I avoid reading df_path1 and df_path2 again in the loop?
Calling dataframe.take(1) does not materialize the entire dataframe. Spark's Catalyst optimizer will modify the physical plan to only read the first partition of the dataframe since only the first record is needed. Hence, only the first partition is cached until the rest of the records are read.
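A minimal sketch of the usual fix, assuming the datasets fit in the configured cache storage: trigger an action that scans every partition, such as count(), right after cache(), so the whole dataframe is materialized before the loop starts.

# force the whole dataframe into the cache, not just the first partition
df_path1 = spark.read.parquet(path_1).cache()
df_path1.count()  # full scan, so every partition gets cached

df_path2 = spark.read.parquet(path_2).cache()
df_path2.count()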

How to Access variable from different file

My first file, "sk_read_write.py", is as follows:
from spark_conn import *
from Table_structure import *

class read_write1:
    def sk_read_write1(self, schema, spark):
        df3 = spark.read.option("multiline", "true").option("mode", "PERMISSIVE").schema(schema).json(
            "C:\\Users\\komu0\\Desktop\\Read\\*.json")
        print(Spark_connect_1.connection())
        df3.write.format('jdbc').options(url=Spark_connect_1.connection + str(connection._port),
                                         driver='com.mysql.cj.jdbc.Driver',
                                         dbtable='sparktable',
                                         user=connection._username,
                                         password=Spark_connect_1.connection._pass). \
            mode('append').save()
My other file is spark_conn.py:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, DoubleType
from pyspark.sql import SparkSession
from aws_config import *
from Table_structure import *

class Spark_connect_1:
    dict1 = get_secret()

    def connection(self):
        dict1 = get_secret()
        _username = dict1['username']
        _pass = dict1['password']
        _host = dict1['host']
        _port = dict1['port']
        _dbname = dict1['dbname']
        spark = SparkSession.builder \
            .master("local[1]") \
            .appName("JSON_MYSQL") \
            .getOrCreate()
        return spark
I want to use the _port variable in the sk_read_write file.
I have tried importing spark_conn into the sk_read_write file and using "Spark_connect_1.connection._port" (to get the port), but it is not working. Please suggest how to proceed.
You can access the port by making _port a class variable.
Example:
base_file.py
class ABC:
_port = "Some value"
def sample(self):
ABC._port = "another value"
print("Value of port is {}".format(ABC._port))
test_file.py
from base_file import ABC
#before changing value:
before = ABC()
before.sample()
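A minimal sketch of how the same idea could be applied to Spark_connect_1, assuming get_secret() from aws_config returns the dict shown in the question: store the secrets on the class inside connection(), then read them from the other module after connection() has been called.

# spark_conn.py (sketch)
from pyspark.sql import SparkSession
from aws_config import get_secret

class Spark_connect_1:
    _username = None
    _pass = None
    _port = None

    def connection(self):
        dict1 = get_secret()
        # store on the class so other modules can read them after connection()
        Spark_connect_1._username = dict1['username']
        Spark_connect_1._pass = dict1['password']
        Spark_connect_1._port = dict1['port']
        return (SparkSession.builder
                .master("local[1]")
                .appName("JSON_MYSQL")
                .getOrCreate())

# sk_read_write.py could then do:
#   from spark_conn import Spark_connect_1
#   spark = Spark_connect_1().connection()
#   port = Spark_connect_1._port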

Data loss in Spark Streaming writing in cassandra

I'm writing data to Cassandra and to a text file.
After some minutes I stop the process. Then, for example, I have 82035 rows in Cassandra and 96749 rows in the file.
I found apparently valid data in the txt file that is not in the database. For example:
promerar|082220.80|4158.5985417|00506.7613786
MOLIIUO|082220|4107.4749|00444.2117
josehrin|082220|4159.1124|00455.1298
This is the code:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--conf spark.ui.port=4040 --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.0,com.datastax.spark:spark-cassandra-connector_2.11:2.0.0-M3 pyspark-shell'

import time
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row

conf = SparkConf() \
    .setAppName("Streaming test") \
    .setMaster("local[2]") \
    .set("spark.cassandra.connection.host", "127.0.0.1")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

count = 0

def saveToCassandra(rows):
    if not rows.isEmpty():
        sqlContext.createDataFrame(rows).write \
            .format("org.apache.spark.sql.cassandra") \
            .mode('append') \
            .options(table="puntos", keyspace="test_rtk") \
            .save()

def saveCoord(rdd):
    rdd.foreach(lambda x: open("/tmp/p1", "a").write(x[0] + "|" + x[2] + "|" + x[3] + "|" + x[5] + "\n"))

ssc = StreamingContext(sc, 1)
kvs = KafkaUtils.createStream(ssc, "127.0.0.1:2181", "spark-streaming-consumer", {'test': 2})
data = kvs.map(lambda x: x[1].split(","))
rows = data.map(lambda x: Row(long=x[5], lat=x[3], date=time.strftime("%Y-%m-%d"), time=x[2], user=x[0]))
rows.foreachRDD(saveToCassandra)
data.foreachRDD(saveCoord)
ssc.start()
This is how the table is created in Cassandra:
CREATE KEYSPACE test_rtk WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};

CREATE TABLE test_rtk.puntos (
    long text,
    lat text,
    time text,
    date text,
    user text,
    PRIMARY KEY (time, long, lat)
);
Can you help me?
Check here for better solutions: the Spark Streaming Programming Guide.
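As a rough illustration of the kind of fix the guide describes for fault tolerance (not a verified solution for this exact pipeline): enabling checkpointing and the receiver write-ahead log persists records buffered by the receiver-based Kafka stream before they are processed, so stopping the job is less likely to drop them. The checkpoint directory below is a placeholder.

# sketch only: enable the receiver write-ahead log and checkpointing
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

conf = (SparkConf()
        .setAppName("Streaming test")
        .setMaster("local[2]")
        .set("spark.cassandra.connection.host", "127.0.0.1")
        # persist received blocks to fault-tolerant storage before processing
        .set("spark.streaming.receiver.writeAheadLog.enable", "true"))
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, 1)
ssc.checkpoint("/tmp/spark_checkpoints")  # placeholder checkpoint directory

# ... build the same Kafka DStream and foreachRDD calls as in the question ...

ssc.start()
ssc.awaitTermination()  # keep the driver running instead of exiting right after start()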

How to broadcast a complex class object in pyspark across the clusters

Following is the code, where py_cpp_bind refers to a piece of code written in C++11 and bound to Python using boost-python (with pickling enabled). To initialize the object, it requires three arguments (filename, int, int). I want to broadcast this object across the cluster, as it is needed to perform a computation for each element.
However, on execution Apache Spark complains with:
Caused by: java.io.EOFException
at java.io.DataInputStream.readInt(DataInputStream.java:392)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
... 15 more
Code:
from pyspark.serializers import BatchedSerializer, PickleSerializer
from pyspark import SparkContext, SparkConf
import py_cpp_bind
def populate_NL(n, tk2):
tk = [list(tk2[0]), tk2[1]]
res = mscore.score(tk[1], tk[0])
return res
def main(n, sc):
mscore = py_cpp_bind.score()
# following line constructs the object from the given arguments
print mscore.init("data/earthquake.csv", n, 4000)
broadcastVar = sc.broadcast(mdl)
C = [((0,), [1])]
C = sc.parallelize(C).flatMap(lambda X : populate(n, X))
print(C.collect())
if __name__ == "__main__":
conf = SparkConf().setMaster("local[*]")
conf = conf.setAppName("TEST")
sc = SparkContext(conf = conf, serializer=PickleSerializer())
n = 5
main(n, sc)
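One workaround that often comes up for extension objects that do not pickle cleanly, sketched here with illustrative names (get_scorer, init_args_bc) rather than anything from the question: broadcast only the constructor arguments and build the C++ object lazily, once per executor process. This assumes data/earthquake.csv is readable from every executor (true in local mode) and that score() returns a list, as the original flatMap implies.

from pyspark import SparkContext, SparkConf
import py_cpp_bind

_scorer = None  # one lazily built instance per executor process

def get_scorer(init_args):
    global _scorer
    if _scorer is None:
        _scorer = py_cpp_bind.score()
        _scorer.init(*init_args)  # (filename, int, int)
    return _scorer

def populate_NL(n, tk2, init_args_bc):
    tk = [list(tk2[0]), tk2[1]]
    # only the small tuple of constructor arguments was broadcast
    return get_scorer(init_args_bc.value).score(tk[1], tk[0])

if __name__ == "__main__":
    conf = SparkConf().setMaster("local[*]").setAppName("TEST")
    sc = SparkContext(conf=conf)
    n = 5
    init_args_bc = sc.broadcast(("data/earthquake.csv", n, 4000))
    C = sc.parallelize([((0,), [1])]).flatMap(lambda X: populate_NL(n, X, init_args_bc))
    print(C.collect())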
