Connection Timeout Error: AWS Glue ETL Job with Comprehend (Python)

I am attempting to use a Python template provided by AWS, which I modified to load the Yelp review dataset (a very large JSON file) into an S3 bucket using Glue, where I apply the Comprehend API embedded within the Python script.
I continue to receive an error message:
ConnectTimeout:
HTTPSConnectionPool(host='comprehend.us-east-1.amazonaws.com',
port=443): Max retries exceeded with url: / (Caused by
ConnectTimeoutError(, 'Connection to comprehend.us-east-1.amazonaws.com
timed out. (connect timeout=60)'))
I have updated the original template to use JSON rather than Parquet. I have also increased the number of batches in the original file from 10 to 1000 (NUMBER_OF_BATCHES = 1000 in the script below). Is there anything else I can do to optimize my code so I stop receiving this error? Here is the existing code, with the existing S3 file paths to the very large JSON file I am trying to run the Comprehend API against:
import os
import sys
import boto3
from awsglue.job import Job
from awsglue.transforms import *
from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions
import pyspark.sql.functions as F
from pyspark.sql import Row, Window, SparkSession
from pyspark.sql.types import *
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = SparkSession.builder.config("spark.sql.broadcastTimeout", "6000").getOrCreate()
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
spark._jsc.hadoopConfiguration().set("json.enable.summary-metadata", "false")
AWS_REGION = 'us-east-1'
MIN_SENTENCE_LENGTH_IN_CHARS = 10
MAX_SENTENCE_LENGTH_IN_CHARS = 4500
COMPREHEND_BATCH_SIZE = 25 ## This is the max batch size for comprehend
NUMBER_OF_BATCHES = 1000
SentimentRow = Row("review_id", "sentiment")
def getBatchSentiment(input_list):
    arr = []
    bodies = [i[1] for i in input_list]
    client = boto3.client('comprehend', region_name=AWS_REGION)

    def callApi(text_list):
        response = client.batch_detect_sentiment(TextList=text_list, LanguageCode='en')
        return response

    for i in range(NUMBER_OF_BATCHES - 1):
        text_list = bodies[COMPREHEND_BATCH_SIZE * i : COMPREHEND_BATCH_SIZE * (i + 1)]
        response = callApi(text_list)
        for r in response['ResultList']:
            idx = COMPREHEND_BATCH_SIZE * i + r['Index']
            arr.append(SentimentRow(input_list[idx][0], r['Sentiment']))
    return arr
### Main function to process data through comprehend
## Read yelp academic dataset reviews
reviews = spark.read.json("s3://schmldemobucket/yelp_academic_dataset_review.json").distinct()
df = reviews \
.withColumn('body_len', F.length('text')) \
.filter(F.col('body_len') > MIN_SENTENCE_LENGTH_IN_CHARS) \
.filter(F.col('body_len') < MAX_SENTENCE_LENGTH_IN_CHARS)
record_count = df.count()
df2 = df \
    .repartition(int(record_count / (NUMBER_OF_BATCHES * COMPREHEND_BATCH_SIZE))) \
    .sortWithinPartitions(['review_id'], ascending=True)
group_rdd = df2.rdd.map(lambda l: (l.review_id, l.text)).glom()
sentiment = group_rdd.coalesce(10) \
    .map(lambda l: getBatchSentiment(l)) \
    .flatMap(lambda x: x) \
    .toDF() \
    .repartition('review_id') \
    .cache()
## Join sentiment results with the yelp review dataset
joined = reviews \
.drop('text') \
.join(sentiment, sentiment.review_id == reviews.review_id) \
.drop(sentiment.review_id)
## Write out result set to S3 in JSON format
joined.write.partitionBy('business_id').mode('overwrite').json('s3://schmldemobucket/')
job.commit()
I expect the Glue ETL job to finish with a succeeded status, but I am not sure how to optimize the code further so the job stops timing out. Please help!
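For reference, COMPREHEND_BATCH_SIZE stays at 25 because that is the maximum batch_detect_sentiment accepts per call. One thing I am considering trying, which is not part of the original AWS template, is creating the Comprehend client with an explicit botocore Config so the connect timeout and retry count are not left at their defaults; a rough sketch of that idea:
import boto3
from botocore.config import Config

# Sketch only: longer connect/read timeouts and more retry attempts for the
# Comprehend client (the values here are guesses, not from the AWS template).
comprehend_config = Config(
    connect_timeout=120,
    read_timeout=120,
    retries={'max_attempts': 10},
)
client = boto3.client('comprehend', region_name=AWS_REGION, config=comprehend_config)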

Related

ERROR MicroBatchExecution: When integrating Kafka with pySpark

I am trying to send data to pySpark but it is giving me this error:
ERROR MicroBatchExecution: Query [id = d3a1ed30-d223-4da4-9052-189b103afca8, runId = 70bfaa84-15c9-4c8b-9058-0f9a04ee4dd0] terminated with error
java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
Producer
import tweepy
import json
import time
from kafka import KafkaProducer
import twitterauth as auth
import utils
producer = KafkaProducer(bootstrap_servers=["localhost:9092"], value_serializer=utils.json_serializer)
class twitterStream(tweepy.StreamingClient):
    def on_connect(self):
        print("Twitter Client Connected")

    def on_tweet(self, raw_data):
        if raw_data.referenced_tweets == None:
            producer.send(topic="registered_user", value=raw_data.text)
            print("Producer Running")

    def on_error(self):
        self.disconnect()

    def adding_rules(self, keywords):
        for terms in keywords:
            self.add_rules(tweepy.StreamRule(terms))

if __name__ == "__main__":
    stream = twitterStream(bearer_token=auth.bearer_token)
    stream_terms = ['bitcoin', 'luna', 'etherum']
    stream.adding_rules(stream_terms)
    stream.filter(tweet_fields=['referenced_tweets'])
pySpark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import findspark
import json
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = f'--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0 pyspark-shell'
if __name__ == "__main__":
    findspark.init()
    # spark = SparkSession.builder.appName("Kafka Pyspark Streaming Learning").master("local[*]").getOrCreate()
    sc = SparkSession.builder.master("local[*]") \
        .appName('SparkByExamples.com') \
        .getOrCreate()
    df = sc \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("startingoffsets", "latest") \
        .option("subscribe", "registered_user") \
        .load()
    query = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    query = df.writeStream.format("console").start()
    import time
    time.sleep(10)  # sleep 10 seconds
    query.stop()
Hadoop Version: 3.3.0
Spark and PySpark version: 3.3.0
Scala Version: 2.12.15
I have tried everything for the last 8 hours, still no luck. Could anyone help?
StackTrace:
When I run df.printSchema() there is no error and this is the output:
You need to use a different name to hold the converted fields because DataFrames are immutable. Your initial DataFrame is named df and has a certain schema; when you cast the fields to string, you need to assign the result to a separate name.
castDf = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
query = castDf.writeStream...
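For completeness, a rough sketch of how the corrected query could look, reusing the console sink from the question (the names are only illustrative; you could also keep the time.sleep/stop pattern from the question instead of awaitTermination):
castDf = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
query = castDf.writeStream \
    .format("console") \
    .start()
query.awaitTermination()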

PicklingError: Could not serialize object: TypeError: can't pickle SSLContext objects

I'm trying to use the AWS Encryption SDK to encrypt PySpark columns but am running into this error. Here's the code:
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col, lit
from pyspark.context import SparkContext
import aws_encryption_sdk
from aws_encryption_sdk import CommitmentPolicy
client = aws_encryption_sdk.EncryptionSDKClient(
commitment_policy=CommitmentPolicy.REQUIRE_ENCRYPT_REQUIRE_DECRYPT
)
kms_kwargs = dict(key_ids=[key_arn])
global master_key_provider
master_key_provider = aws_encryption_sdk.StrictAwsKmsMasterKeyProvider(**kms_kwargs)
df = spark.read.csv('test.csv', inferSchema = True, header = True)
def encrypt_string(text):
    encrypted_text, encryptor_header = client.encrypt(
        source=text, key_provider=master_key_provider
    )
    return encrypted_text

udf_encrypt = udf(lambda text: encrypt_string(text))

def spark_encrypt(df, colmn):
    return df.withColumn("segment_encrypt", udf_encrypt(col(colmn)))

df_out = spark_encrypt(df, "segment")
Is there a way to resolve this?
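A common workaround for this kind of PicklingError is to avoid capturing the unpicklable SDK client in the UDF's closure and instead construct it inside the UDF, so it is created on the executors; a rough sketch under that assumption (key_arn is assumed to be defined as in the question, and building the client per call is kept simple rather than efficient):
import aws_encryption_sdk
from aws_encryption_sdk import CommitmentPolicy
from pyspark.sql.functions import udf, col

def encrypt_string(text):
    # Build the client and key provider here so nothing unpicklable
    # has to be serialized from the driver to the executors.
    client = aws_encryption_sdk.EncryptionSDKClient(
        commitment_policy=CommitmentPolicy.REQUIRE_ENCRYPT_REQUIRE_DECRYPT
    )
    provider = aws_encryption_sdk.StrictAwsKmsMasterKeyProvider(key_ids=[key_arn])
    encrypted_text, _header = client.encrypt(source=text, key_provider=provider)
    return encrypted_text

udf_encrypt = udf(encrypt_string)
df_out = df.withColumn("segment_encrypt", udf_encrypt(col("segment")))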

Issue in Kafka to Spark Streaming Data pipeline with python pyspark

I am using the program below, running it in Anaconda (Spyder), to create a data pipeline from Kafka to Spark Streaming in Python.
import sys
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from uuid import uuid1
import os
##Step 1: Initialize sparkcontext
spark_context = SparkContext(appName="Transformation Application")
###Step 2: Initialize streaming context
ssc = StreamingContext(spark_context, 5)
def utf8_decoder(s):
    """ Decode the unicode as UTF-8 """
    if s is None:
        return None
    return s.decode('utf-8')

message = KafkaUtils.createDirectStream(
    ssc, topics=['testtopic'],
    kafkaParams={"metadata.broker.list": "localhost:9092",
                 "key.deserializer": "org.springframework.kafka.support.serializer.JsonDeserializer",
                 "value.deserializer": "org.springframework.kafka.support.serializer.JsonDeserializer"},
    fromOffsets=None, messageHandler=None,
    keyDecoder=utf8_decoder, valueDecoder=utf8_decoder)
message
words = message.map(lambda x: x[1]).flatMap(lambda x: x.split(" "))
wordcount = words.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
wordcount.pprint()
When I print message, words, and wordcount I am not getting proper results; I am getting hexadecimal values.
message
Out[16]: <pyspark.streaming.kafka.KafkaDStream at 0x23f8b1f8248>
wordcount
Out[18]: <pyspark.streaming.dstream.TransformedDStream at 0x23f8b2324c8>
In my topic (testtopic) I produced the string "Hi Hi Hi how are you doing", so wordcount should give a count for each word, but it is giving some encoded hexadecimal values.
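For reference, DStream operations are lazy: evaluating message or wordcount in the console only prints the object's repr (the hexadecimal values above), and pprint() produces per-batch output only after the streaming context is started. A minimal sketch, reusing the names from the question:
wordcount.pprint()
ssc.start()             # nothing is computed or printed until the context starts
ssc.awaitTermination()  # batch output then appears every 5 seconds (the batch interval above)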

Data loss in Spark Streaming writing in cassandra

I'm writing data to Cassandra and to a text file.
After some minutes I stop the process. Then, for example, I have 82035 rows in Cassandra and 96749 rows in the file.
I found apparently valid data in the txt file that is not in the database. For example:
promerar|082220.80|4158.5985417|00506.7613786
MOLIIUO|082220|4107.4749|00444.2117
josehrin|082220|4159.1124|00455.1298
This is the code:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--conf spark.ui.port=4040 --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.0,com.datastax.spark:spark-cassandra-connector_2.11:2.0.0-M3 pyspark-shell'
import time
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
conf = SparkConf() \
.setAppName("Streaming test") \
.setMaster("local[2]") \
.set("spark.cassandra.connection.host", "127.0.0.1")
sc = SparkContext(conf=conf)
sqlContext=SQLContext(sc)
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
count =0
def saveToCassandra(rows):
    if not rows.isEmpty():
        sqlContext.createDataFrame(rows).write\
            .format("org.apache.spark.sql.cassandra")\
            .mode('append')\
            .options(table="puntos", keyspace="test_rtk")\
            .save()

def saveCoord(rdd):
    rdd.foreach(lambda x: open("/tmp/p1", "a").write(x[0]+"|"+x[2]+"|"+x[3]+"|"+x[5]+"\n"))
ssc = StreamingContext(sc, 1)
kvs = KafkaUtils.createStream(ssc, "127.0.0.1:2181", "spark-streaming-consumer", {'test': 2})
data = kvs.map(lambda x: x[1].split(","))
rows= data.map(lambda x:Row(long=x[5],lat=x[3],date=time.strftime("%Y-%m-%d"),time=x[2],user=x[0]))
rows.foreachRDD(saveToCassandra)
data.foreachRDD(saveCoord)
ssc.start()
This is the creation of the table in cassandra:
CREATE KEYSPACE test_rtk WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1};
CREATE TABLE test_rtk.puntos(
long text,
lat text,
time text,
date text,
user TEXT,
PRIMARY KEY (time,long,lat)
);
Can you help me?
Check here for better solutions: the Spark Streaming Programming Guide.
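As one concrete example of what the linked guide recommends for fault tolerance, checkpointing can be enabled on the streaming context before it is started; a minimal sketch (the checkpoint directory is an arbitrary placeholder):
ssc = StreamingContext(sc, 1)
ssc.checkpoint("file:///tmp/spark-checkpoint")  # placeholder path; use reliable storage (HDFS/S3) in practice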

Manually commit offset in kafka Direct Stream in python

I am porting a streaming application written in Scala to Python. I want to manually commit offsets for a DStream. This is done in Scala like below:
stream = KafkaUtils.createDirectStream(soomeConfigs)
stream.foreachRDD { rdd =>
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // some time later, after outputs have completed
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}
But I am unable to find similar APIs in Python. Can you please guide me on how I can manually commit offsets using the Python client?
I resolved this by going back to the pyspark 2.2 library, as it has an API to get offsetRanges, and by storing the offsets in Redis. I had to go back to Python 2.7 because there is no long type in Python 3.6.
import redis
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition, KafkaRDD
def get_offset_ranges(topic):
    ranges = None
    rk = '{topic}:offsets'.format(topic=topic)
    cache = redis.Redis()
    if cache.exists(rk):
        mapping = cache.hgetall(rk)
        ranges = dict()
        for k, v in mapping.items():
            tp = TopicAndPartition(topic, int(k))
            ranges[tp] = long(v)
    return ranges

def update_offset_ranges(offset_ranges):
    cache = redis.Redis()
    for rng in offset_ranges:
        rk = '{rng.topic}:offsets'.format(rng=rng)
        print("updating redis_key: {}, partition: {}, lastOffset: {}".format(rk, rng.partition, rng.untilOffset))
        cache.hset(rk, rng.partition, rng.untilOffset)

def do_some_work(rdd):
    pass

def process_dstream(rdd):
    rdd.foreachPartition(lambda iter: do_some_work(iter))
    krdd = KafkaRDD(rdd._jrdd, sc, rdd._jrdd_deserializer)
    off_ranges = krdd.offsetRanges()
    for o in off_ranges:
        print(str(o))
    update_offset_ranges(off_ranges)
sc = SparkContext(appName="mytstApp")
ssc = StreamingContext(sc, 1)
kafka_params = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "myUserGroup",
    "enable.auto.commit": "false",
    "auto.offset.reset": "smallest"
}
topic = "mytopic"
offset_ranges = get_offset_ranges(topic)
dstream = KafkaUtils.createDirectStream(ssc, [topic], kafka_params, fromOffsets=offset_ranges)
dstream.foreachRDD(process_dstream)
# Start our streaming context and wait for it to 'finish'
ssc.start()
# Wait for the job to finish
try:
    ssc.awaitTermination()
except Exception as e:
    ssc.stop()
    raise e  # to exit with error condition
