Why does my simple Spark application run so slowly? - python

I am trying to count the frequent itemsets generated by MLlib's FP-growth using the Spark API. My Spark version is 1.5.1. The following is my code:
#!/usr/bin/python
from pyspark.mllib.fpm import FPGrowth
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
import os

os.environ['PYSPARK_PYTHON'] = '/usr/bin/python'

appName = "FP_growth"
sc = SparkContext()
sql_context = HiveContext(sc)

def read_spu(prod):
    # Aggregate item quantities per order for a single prod_code.
    sql = """
        select
            t.orderno_nosplit,
            t.prod_code,
            t.item_code,
            sum(t.item_qty) as item_qty
        from ioc_fdm.fdm_dwr_ioc_fcs_pk_spu_item_f_chain t
        where t.prod_code = '%s'
        group by t.prod_code, t.orderno_nosplit, t.item_code """ % prod
    spu_result = sql_context.sql(sql)
    return spu_result.cache()

if __name__ == '__main__':
    spu = read_spu('6727780')
    conf = 0.7
    # Build one basket (list of item codes) per order number.
    trans = spu.rdd.repartition(100) \
                   .map(lambda x: (x[0], x[2])) \
                   .groupByKey().mapValues(list).values().cache()
    model = FPGrowth.train(trans, 0.01, 100)
    freq_count = model.freqItemsets().count()
    print 'freq_count:', freq_count
    sc.stop()
The input data are read from Hadoop and are not very large, only about 20,000 rows. However, the script runs very slowly in the .count() stage and I don't know why. From the performance figures it looks like data skew, but the output of each task is small (only about 100 KB per task).
The cluster has 8 nodes with 320 cores and 1.56 TB of memory in total (shared with other users). My spark-submit command is:
spark-submit --master yarn-cluster --executor-memory 30g --num-executors 20 --executor-cores 5 FP_growth.py
Attached are screenshots of the Spark UI while the job is running: resources used, active stages, and tasks.
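To check the skew suspicion, this is the kind of quick diagnostic I am thinking of running (a sketch, assuming the trans RDD built in the script above; it just prints how many baskets end up in each partition after the groupByKey):

sizes = trans.glom().map(len).collect()          # number of baskets per partition
print 'partitions:', len(sizes)
print 'min/max baskets per partition:', min(sizes), max(sizes)
print 'empty partitions:', sum(1 for s in sizes if s == 0)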

Related

Unable to run job on spark cluster

I am running a Spark cluster on a local Kubernetes cluster and trying to execute a job from Python.
When I run the code below, sc gets created, but I get the warning TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources.
When I uncomment either of the commented lines, I get an UnresolvedAddressException when creating sc.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import os, sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
MASTER_IP = "127.0.0.1:61253"
conf = SparkConf().setAppName("Demo")
conf = conf.setMaster("spark://" + MASTER_IP)
# conf = conf.set("spark.driver.bindAddress", MASTER_IP)
# conf = conf.set("spark.driver.host", MASTER_IP)
sc = SparkContext.getOrCreate(conf=conf)
words = 'the company is a \
major Spanish IT provider '
seq = words.split()
data = sc.parallelize(seq)
counts = data.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b).collect()
dict(counts)
sc.stop()
Note that I am able to run jobs when exec-ing into the master node using
kubectl exec $MASTER_NODE -it -- \
pyspark --conf spark.driver.bindAddress=$MASTER_IP --conf spark.driver.host=$MASTER_IP
Any idea?
By the way, the master is made available using minikube service spark-master
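The direction I am considering is pointing spark.driver.host at an address that the executors inside the cluster can actually reach, rather than at the master's address. A sketch of that idea (HOST_IP below is a hypothetical placeholder for my machine's address on the minikube network, and the master URL is the one printed by minikube service spark-master):

from pyspark import SparkConf, SparkContext

MASTER_URL = "spark://127.0.0.1:61253"  # from minikube service spark-master
HOST_IP = "192.168.49.1"                # hypothetical: my machine as seen from inside the cluster

conf = (SparkConf()
        .setAppName("Demo")
        .setMaster(MASTER_URL)
        .set("spark.driver.host", HOST_IP)            # address executors use to call back to the driver
        .set("spark.driver.bindAddress", "0.0.0.0"))  # listen on all local interfaces
sc = SparkContext.getOrCreate(conf=conf)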

Exception: Java gateway process exited before sending its port number with pyspark

I am working with Python and PySpark in a Jupyter notebook. I am trying to read several Parquet files from an AWS S3 bucket and convert them into a single JSON file.
This is what I have:
from functools import reduce
from pyspark.sql import DataFrame
import boto3  # assumed: the s3 object below comes from boto3's resource API

s3 = boto3.resource('s3')
bucket = s3.Bucket(name='mybucket')
keys = []
for key in bucket.objects.all():
    keys.append(key.key)
print(keys[0])

from pyspark.sql import SparkSession

# initialise the SparkSession (and through it the sparkContext)
spark = SparkSession.builder \
    .master('local') \
    .appName('myAppName') \
    .config('spark.executor.memory', '5gb') \
    .config("spark.cores.max", "6") \
    .getOrCreate()
sc = spark.sparkContext
But I am getting:
Exception: Java gateway process exited before sending its port number with pyspark
I am not sure how to fix this, thank you!
You're getting this error because your PySpark is not able to communicate with your cluster. You need to set a few environment variables, like this:
import os
import findspark

findspark.init()

# Use either --master yarn or --master local, depending on where you run.
os.environ['PYSPARK_SUBMIT_ARGS'] = """--name job_name --master local[*]
    --conf spark.dynamicAllocation.enabled=true
    pyspark-shell"""
os.environ['PYSPARK_PYTHON'] = "python3.6"         # whatever version of Python you are using
os.environ['PYSPARK_DRIVER_PYTHON'] = "python3.6"
The findspark package is optional, but it is worth using with PySpark: it locates your Spark installation and puts pyspark on the Python path.
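With those variables set (before the session is created, since that is when the Java gateway is launched), the SparkSession from your question should come up. A minimal check, assuming the environment variables above have already been exported:

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .master('local[*]')
         .appName('myAppName')
         .getOrCreate())
print(spark.version)  # if this prints, the Java gateway started correctly

If it still fails, it is also worth verifying that Java is installed and JAVA_HOME points at it, since a missing or misconfigured Java is another common cause of this exception.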

Insert data into mysql using dataflow

The code below builds the pipeline and the DAG is generated, but it fails with:
RuntimeError: NotImplementedError [while running 'generatedPtransform-438']
Please let me know if there is a direct connector for MySQL in Python for Beam.
from apache_beam.options.pipeline_options import PipelineOptions
from google.cloud import pubsub_v1
from google.cloud import bigquery
import mysql.connector
import apache_beam as beam
import logging
import argparse
import sys
import re

PROJECT = "12344"
TOPIC = "projects/12344/topics/mytopic"

class insertfn(beam.DoFn):
    def insertdata(self, data):
        db_conn = mysql.connector.connect(host="localhost", user="abc", passwd="root", database="new")
        db_cursor = db_conn.cursor()
        emp_sql = "INSERT INTO emp(ename,eid,dept) VALUES (%s,%s,%s)"
        db_cursor.executemany(emp_sql, (data[0], data[1], data[2]))
        db_conn.commit()
        print(db_cursor.rowcount, "record inserted")

class Split(beam.DoFn):
    def process(self, data):
        data = data.split(",")
        return [{
            'ename': data[0],
            'eid': data[1],
            'dept': data[2]
        }]

def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_topic")
    parser.add_argument("--output")
    known_args = parser.parse_known_args(argv)

    p = beam.Pipeline(options=PipelineOptions())
    (p
     | 'ReadData' >> beam.io.ReadFromPubSub(topic=TOPIC).with_output_types(bytes)
     | "Decode" >> beam.Map(lambda x: x.decode('utf-8'))
     | 'ParseCSV' >> beam.ParDo(Split())
     | 'WriteToMySQL' >> beam.ParDo(insertfn())
    )
    result = p.run()
    result.wait_until_finish()
After our discussion in the comment section, I noticed that you are not using the proper commands to execute the DataFlow pipeline.
According to the documentation, there are mandatory flags which must be defined in order to run the pipeline in Dataflow Managed Service. These flags are described below:
job_name - The name of the Dataflow job being executed.
project - The ID of your Google Cloud project.
runner - The pipeline runner that will parse your program and construct your pipeline. For cloud execution, this must be DataflowRunner.
staging_location - A Cloud Storage path for Dataflow to stage code packages needed by workers executing the job.
temp_location - A Cloud Storage path for Dataflow to stage temporary job files created during the execution of the pipeline.
In addition to these flags, there are others you can use; in your case, since you read from a Pub/Sub topic:
--input_topic: sets the input Pub/Sub topic to read messages from.
Therefore, an example to run a Dataflow pipeline would be as follows:
python RunPipelineDataflow.py \
    --job_name=jobName \
    --project=$PROJECT_NAME \
    --runner=DataflowRunner \
    --staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY \
    --temp_location=gs://$BUCKET_NAME/temp \
    --input_topic=projects/$PROJECT_NAME/topics/$TOPIC_NAME
I would like to point out the importance of using DataflowRunner: it runs your pipeline on the Cloud Dataflow managed service, giving you a fully managed environment with autoscaling and dynamic work rebalancing. It is also possible to use DirectRunner, which executes your pipeline on your own machine and is designed for validating the pipeline during development.
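For completeness, the same options can also be supplied programmatically when the pipeline is built in Python, which matches the structure of your script. A sketch with placeholder project, bucket and job names (PipelineOptions accepts these as keyword arguments; reading from Pub/Sub also requires streaming=True):

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Placeholder values; substitute your own project, bucket and job name.
options = PipelineOptions(
    runner='DataflowRunner',
    project='my-project-id',
    job_name='mysql-insert-job',
    staging_location='gs://my-bucket/staging',
    temp_location='gs://my-bucket/temp',
    streaming=True,  # required when reading from Pub/Sub
)
p = beam.Pipeline(options=options)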

how to load --jars with pyspark with spark standalone on client mode

I am using Python 2.7 with a Spark standalone cluster in client mode.
I want to use JDBC for MySQL and found that I need to load the connector jar using the --jars argument. I have the jar locally and manage to load it from the pyspark console as described here.
When I write a Python script inside my IDE, using pyspark, I don't manage to load the additional jar mysql-connector-java-5.1.26.jar and keep getting a
no suitable driver
error.
How can I load additional jar files when running a Python script in client mode, using a standalone cluster and referring to a remote master?
Edit: added some code
This is the basic code that I am using. I use pyspark with a SparkContext in Python, i.e. I do not call spark-submit directly, and I don't understand how to use spark-submit parameters in this case...
def createSparkContext(masterAdress=algoMaster):
    """
    :return: return a spark context that is suitable for my configs
    note the ip for the master
    app name is not that important, just to show off
    """
    from pyspark.mllib.util import MLUtils
    from pyspark import SparkConf
    from pyspark import SparkContext
    import os

    SUBMIT_ARGS = "--driver-class-path /var/nfs/general/mysql-connector-java-5.1.43 pyspark-shell"
    # SUBMIT_ARGS = "--packages com.databricks:spark-csv_2.11:1.2.0 pyspark-shell"
    os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS

    conf = SparkConf()
    # conf.set("spark.driver.extraClassPath", "var/nfs/general/mysql-connector-java-5.1.43")
    conf.setMaster(masterAdress)
    conf.setAppName('spark-basic')
    conf.set("spark.executor.memory", "2G")
    # conf.set("spark.executor.cores", "4")
    conf.set("spark.driver.memory", "3G")
    conf.set("spark.driver.cores", "3")
    # conf.set("spark.driver.extraClassPath", "/var/nfs/general/mysql-connector-java-5.1.43")
    sc = SparkContext(conf=conf)
    print sc._conf.get("spark.executor.extraClassPath")
    return sc

sql = SQLContext(sc)
df = sql.read.format('jdbc').options(url='jdbc:mysql://ip:port?user=user&password=pass',
                                     dbtable='(select * from tablename limit 100) as tablename').load()
print df.head()
Thanks
Your SUBMIT_ARGS is going to be passed to spark-submit when creating a SparkContext from Python. You should use --jars instead of --driver-class-path.
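For example, something along these lines (a sketch that reuses the path from your code; make sure it points at the actual .jar file, and set it before the SparkContext is created):

import os

# Ship the MySQL connector to the driver and the executors via --jars;
# "pyspark-shell" must stay as the last token.
SUBMIT_ARGS = "--jars /var/nfs/general/mysql-connector-java-5.1.43.jar pyspark-shell"
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS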
EDIT
Your problem is actually a lot simpler than it seems: you're missing the parameter driver in the options:
sql = SQLContext(sc)
df = sql.read.format('jdbc').options(
    url='jdbc:mysql://ip:port',
    user='user',
    password='pass',
    driver="com.mysql.jdbc.Driver",
    dbtable='(select * from tablename limit 100) as tablename'
).load()
You can also put user and password in separate arguments.

spark python script not writing to hbase

I am trying to run the script from this blog
import sys
import json
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

def SaveRecord(rdd):
    host = 'sparkmaster.example.com'
    table = 'cats'
    keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
    conf = {"hbase.zookeeper.quorum": host,
            "hbase.mapred.outputtable": table,
            "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
            "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
            "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}
    datamap = rdd.map(lambda x: (str(json.loads(x)["id"]),
                                 [str(json.loads(x)["id"]), "cfamily", "cats_json", x]))
    datamap.saveAsNewAPIHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv)

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: StreamCatsToHBase.py <hostname> <port>")
        exit(-1)
    sc = SparkContext(appName="StreamCatsToHBase")
    ssc = StreamingContext(sc, 1)
    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    lines.foreachRDD(SaveRecord)
    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
I am unable to run it. I have tried three different command-line invocations, but none produces any output or writes the data to the HBase table.
Here are the command lines I tried:
spark-submit --jars /usr/local/spark/lib/spark-examples-1.5.2-hadoop2.4.0.jar --jars /usr/local/hbase/lib/hbase-examples-1.1.2.jar sp_json.py localhost 2389 > sp_json.log
spark-submit --driver-class-path /usr/local/spark/lib/spark-examples-1.5.2-hadoop2.4.0.jar sp_json.py localhost 2389 > sp_json.log
spark-submit --driver-class-path /usr/local/spark/lib/spark-examples-1.5.2-hadoop2.4.0.jar --jars /usr/local/hbase/lib/hbase-examples-1.1.2.jar sp_json.py localhost 2389 > sp_json.log
Here is the logfile. It is very verbose, which is one of the reasons debugging is difficult in Apache Spark: it spits out too much information.
Finally got it working by passing both jars as a single comma-separated --jars list:
spark-submit --jars /usr/local/spark/lib/spark-examples-1.5.2-hadoop2.4.0.jar,/usr/local/hbase/lib/hbase-examples-1.1.2.jar sp_json.py localhost 2399 > sp_json.log
