ERROR MicroBatchExecution: When integrating Kafka with pySpark - python

I am trying to stream data from Kafka into PySpark, but it gives me this error:
ERROR MicroBatchExecution: Query [id = d3a1ed30-d223-4da4-9052-189b103afca8, runId = 70bfaa84-15c9-4c8b-9058-0f9a04ee4dd0] terminated with error
java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
Producer
import tweepy
import json
import time
from kafka import KafkaProducer
import twitterauth as auth
import utils

producer = KafkaProducer(bootstrap_servers=["localhost:9092"], value_serializer=utils.json_serializer)

class twitterStream(tweepy.StreamingClient):
    def on_connect(self):
        print("Twitter Client Connected")

    def on_tweet(self, raw_data):
        if raw_data.referenced_tweets == None:
            producer.send(topic="registered_user", value=raw_data.text)
            print("Producer Running")

    def on_error(self):
        self.disconnect()

    def adding_rules(self, keywords):
        for terms in keywords:
            self.add_rules(tweepy.StreamRule(terms))

if __name__ == "__main__":
    stream = twitterStream(bearer_token=auth.bearer_token)
    stream_terms = ['bitcoin', 'luna', 'etherum']
    stream.adding_rules(stream_terms)
    stream.filter(tweet_fields=['referenced_tweets'])
pySpark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import findspark
import json
import os

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0 pyspark-shell'

if __name__ == "__main__":
    findspark.init()
    # spark = SparkSession.builder.appName("Kafka Pyspark Streaming Learning").master("local[*]").getOrCreate()
    sc = SparkSession.builder.master("local[*]") \
        .appName('SparkByExamples.com') \
        .getOrCreate()
    df = sc \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("startingOffsets", "latest") \
        .option("subscribe", "registered_user") \
        .load()
    query = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    query = df.writeStream.format("console").start()

    import time
    time.sleep(10)  # sleep 10 seconds
    query.stop()
Hadoop Version: 3.3.0
Spark and PySpark version: 3.3.0
Scala Version: 2.12.15
I have tried everything for the last 8 hours and still have no luck. Could anyone help?
When I run df.printSchema() on its own, there is no error and the schema is printed as expected.

You need to use a different name to hold the converted fields, because DataFrames are immutable: your initial DataFrame is named df and has a certain schema, so when you cast the fields to string you need to assign the result to a separate variable and write that one out:
castDf = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
query = castDf.writeStream...
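For completeness, a minimal sketch of the corrected streaming block, reusing df from the question's code and keeping its console sink and 10-second run:
castDf = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

# Write the casted DataFrame, not the raw Kafka DataFrame.
query = castDf.writeStream \
    .format("console") \
    .outputMode("append") \
    .start()

import time
time.sleep(10)  # keep the stream running for 10 seconds, as in the original script
query.stop()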

Related

Issue in Kafka to Spark Streaming Data pipeline with python pyspark

I am using the program below, running it in Anaconda (Spyder), to create a data pipeline from Kafka to Spark Streaming in Python.
import sys
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from uuid import uuid1
import os
##Step 1: Initialize sparkcontext
spark_context = SparkContext(appName="Transformation Application")
###Step 2: Initialize streaming context
ssc = StreamingContext(spark_context, 5)
def utf8_decoder(s):
    """ Decode the unicode as UTF-8 """
    if s is None:
        return None
    return s.decode('utf-8')

message = KafkaUtils.createDirectStream(
    ssc,
    topics=['testtopic'],
    kafkaParams={"metadata.broker.list": "localhost:9092",
                 "key.deserializer": "org.springframework.kafka.support.serializer.JsonDeserializer",
                 "value.deserializer": "org.springframework.kafka.support.serializer.JsonDeserializer"},
    fromOffsets=None,
    messageHandler=None,
    keyDecoder=utf8_decoder,
    valueDecoder=utf8_decoder)
message
words = message.map(lambda x: x[1]).flatMap(lambda x: x.split(" "))
wordcount=words.map(lambda x: (x,1)).reduceByKey(lambda a,b:a+b)
wordcount.pprint()
When I print message, words, and wordcount I am not getting proper results; I am getting hexadecimal values.
message
Out[16]: <pyspark.streaming.kafka.KafkaDStream at 0x23f8b1f8248>
wordcount
Out[18]: <pyspark.streaming.dstream.TransformedDStream at 0x23f8b2324c8>
In my topic (testtopic) I produced the string "Hi Hi Hi how are you doing", so wordcount should give a count for each word, but instead it is giving some encoded hexadecimal values.
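For what it's worth, the values shown above (e.g. <pyspark.streaming.kafka.KafkaDStream at 0x23f8b1f8248>) are just the Python repr of the DStream objects, not the stream contents; a DStream only produces output through an output operation such as pprint() once the StreamingContext is started. A minimal sketch, reusing message and ssc from the code above:
# pprint() registers an output operation on the DStream; nothing is printed
# until the streaming context is started.
words = message.map(lambda x: x[1]).flatMap(lambda x: x.split(" "))
wordcount = words.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
wordcount.pprint()

ssc.start()
ssc.awaitTermination()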

ERROR Utils: Aborting task java.lang.UnsupportedOperationException: PrimitiveType coder: unsupported data type null

I am new to HBase. I have a dataframe that I want to save to HBase in Ambari. The error message is:
ERROR Utils: Aborting task
java.lang.UnsupportedOperationException: PrimitiveType coder: unsupported data type null
I tried to fix the problem, and now there is no null value in the dataframe any more, but I still get the same error. Can anyone help me?
+--------------------+------+------+-----+-----+
|                time|  col1|  col2| col3| col4|
+--------------------+------+------+-----+-----+
|2020-04-12T01:30:...|+30003|532879|+1830|20577|
|2020-04-11T18:15:...|+18838|521714|+1317|20064|
+--------------------+------+------+-----+-----+
Below is my code for reference:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import json
def main():
    def g(x):
        for i in x.collect():
            schema = StructType({CODE FOR STRUCTTYPE})
            df1 = spark.createDataFrame(i, schema=schema)
            df2 = df1.select(col("time"), col("col1"), col("col2"), col("col3"), col("col4"))
            df3 = df2.fillna({'col3': '0'})
            data_source_format = "org.apache.spark.sql.execution.datasources.hbase"
            catalog = ''.join("""{"table":\
{"namespace":"default","name":"data"},"rowkey":"key","columns":{"time":\
{"cf":"rowkey","col":"key","type":"string"},{three other cols}}""".split())
            df3.write \
                .options(catalog=catalog, newtable=5) \
                .format(data_source_format) \
                .save()
            spark.read.options(catalog=catalog).format(data_source_format).load()

    conf = SparkConf().setAppName("PySparkKafka").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    ssc = StreamingContext(sc, 10)
    topic = ['api-spark1']
    kafkaStream = KafkaUtils.createDirectStream(ssc, topic, {"metadata.broker.list": "sandbox-hdp.hortonworks.com:6667"})
    parsed = kafkaStream.map(lambda kv: json.loads(kv[1])['response'])
    parsed.foreachRDD(g)
    ssc.start()
    ssc.awaitTermination()

if __name__ == '__main__':
    main()
This is the error msg:
20/04/12 01:59:57 ERROR Utils: Aborting task
java.lang.UnsupportedOperationException: PrimitiveType coder: unsupported data type null
at org.apache.spark.sql.execution.datasources.hbase.types.PrimitiveType.toBytes(PrimitiveType.scala:61)
"unsupported data type null" means one of the columns you are writing contains null values whose type cannot be resolved. To work around it, cast the null to a concrete type, e.g. cast(null as int).
Spark can identify data types such as string, int, etc., but not the null type.
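As an illustration of that workaround in PySpark (column names reused from the question; whether col3 is the offending column is an assumption), give any all-null column an explicit type before the write:
from pyspark.sql.functions import col, expr

# Option 1: cast the possibly-null column to a concrete type.
df3 = df2.withColumn("col3", col("col3").cast("int"))

# Option 2: the literal cast(null as int) form mentioned above.
# df3 = df2.withColumn("col3", expr("cast(null as int)"))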

Pyspark: Error Connecting to Snowflake using Private Key

I am looking to create an ETL process that reads queries from Snowflake. Most of the examples online show how to set up a connection using a regular string password, but my company has set up authentication via a private key. Unfortunately, when I try to pass in the private key as a parameter, it returns the following error:
Traceback (most recent call last):
File "/Users/rihun/PycharmProjects/snowflake_gcp_etl/loader.py", line 61, in <module>
.option("query", query) \
File "/usr/local/opt/apache-spark/libexec/python/pyspark/sql/readwriter.py", line 172, in load
return self._df(self._jreader.load())
File "/usr/local/opt/apache-spark/libexec/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/usr/local/opt/apache-spark/libexec/python/pyspark/sql/utils.py", line 79, in deco
raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.IllegalArgumentException: 'Input PEM private key is invalid'
Code Example:
import findspark
findspark.init()
import pyspark
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages net.snowflake:snowflake-jdbc:3.6.24,net.snowflake:spark-snowflake_2.11:2.4.12-spark_2.3 pyspark-shell'
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
from snowflake_connector import get_keeper_token, get_snowflake_credentials
spark = SparkSession.builder.master('local').appName('Snowflake Loader').config('spark.driver.memory', '5G').getOrCreate()
spark.builder.config('spark.executor.memory', '16G')
spark.builder.config('spark.executor.cores', '4')
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"
sf_creds = get_snowflake_credentials(keeper_token=get_keeper_token())
sfOptions = {
    "sfURL": sf_creds['sfURL'],
    "sfAccount": sf_creds['sfAccount'],
    "sfUser": sf_creds['sfUser'],
    "pem_private_key": sf_creds['sfPrivateKey'],
    "sfDatabase": sf_creds['sfDatabase'],
    "sfSchema": sf_creds['sfSchema'],
    "sfWarehouse": sf_creds['sfWarehouse'],
}

df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
    .options(**sfOptions) \
    .option("query", query) \
    .load()
df.count()
How I am getting the credentials
def get_snowflake_credentials(keeper_token: str,
                              keeper_url='<keeper_url>',
                              keeper_namespace='cloudDB',
                              keeper_secret_path='<path_to_key>',
                              sf_account='<sf_account>',
                              sf_svc_user='<user>',
                              sf_wh='<warehouse>',
                              sf_role='<role>',
                              sf_db='<db>',
                              sf_schema='<schema>'):
    # Connect to Keeper to collect secrets
    client = hvac.Client(
        url=keeper_url,
        namespace=keeper_namespace,
        token=keeper_token
    )
    # Secrets are stored within the key entitled 'data'
    keeper_secrets = client.read(keeper_secret_path)['data']
    passphrase = keeper_secrets['SNOWSQL_PRIVATE_KEY_PASSPHRASE']
    private_key = keeper_secrets['private_key']
    # PEM key must be byte encoded
    key = bytes(private_key, 'utf-8')
    p_key = serialization.load_pem_private_key(
        key,
        password=passphrase.encode(),
        backend=default_backend()
    )
    pkb = p_key.private_bytes(
        encoding=serialization.Encoding.DER,
        format=serialization.PrivateFormat.PKCS8,
        encryption_algorithm=serialization.NoEncryption())
    sf_client = snowflake.connector.connect(
        user=sf_svc_user,
        account=sf_account,
        warehouse=sf_wh,
        role=sf_role,
        database=sf_db,
        schema=sf_schema,
        private_key=pkb)
    return {
        "sfURL": "<url>",
        "sfAccount": sf_account,
        "sfUser": sf_svc_user,
        "sfPrivateKey": pkb,
        "sfDatabase": sf_db,
        "sfSchema": sf_schema,
        "sfWarehouse": sf_wh
    }
Can you try with this code?
---------------------------------------------------------------------------------
#!/usr/bin/env python
# coding=utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
import subprocess
import os
import logging
from logging import getLogger
import re
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.serialization import load_pem_private_key
v_log = '<path>/spark.log'
spark = SparkSession \
.builder \
.config("spark.jars", "<path>/snowflake-jdbc-3.8.0.jar,<path>/spark-snowflake_2.11-2.4.13-spark_2.4.jar") \
.config("spark.repl.local.jars",
"<path>/snowflake-jdbc-3.8.0.jar,<path>/spark-snowflake_2.11-2.4.13-spark_2.4.jar") \
.config("spark.sql.catalogImplementation", "in-memory") \
.getOrCreate()
spark._jvm.net.snowflake.spark.snowflake.SnowflakeConnectorUtils.enablePushdownSession(
spark._jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate())
logging.basicConfig(
filename=v_log,
level=logging.DEBUG)
logger = getLogger(__name__)
with open("<path-to>/rsa_key.p8", "rb") as key_file:
    p_key = serialization.load_pem_private_key(
        key_file.read(),
        password=os.environ['PRIVATE_KEY_PASSPHRASE'].encode(),
        backend=default_backend()
    )

pkb = p_key.private_bytes(
    encoding=serialization.Encoding.PEM,
    format=serialization.PrivateFormat.PKCS8,
    encryption_algorithm=serialization.NoEncryption()
)
pkb = pkb.decode("UTF-8")
pkb = re.sub("-*(BEGIN|END) PRIVATE KEY-*\n", "", pkb).replace("\n", "")
sfOptions = {
    "sfURL": "<URL>",
    "sfAccount": "sfcsupport",
    "sfUser": "",
    "sfDatabase": "",
    "sfSchema": "PUBLIC",
    "sfWarehouse": "",
    "sfRole": "",
    "pem_private_key": pkb
}

SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"

df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
    .options(**sfOptions) \
    .option("query", "Select * from <TableName>") \
    .load()
df.show()
----------------------------------------------------------------------------
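The likely relevant difference from the asker's code is how the key is handed to the connector: the pem_private_key option in this answer receives the base64 key body as a string with the BEGIN/END PRIVATE KEY lines stripped, whereas the asker's get_snowflake_credentials returns raw DER bytes. A minimal sketch of adapting the asker's function under that assumption:
import re
from cryptography.hazmat.primitives import serialization

# Sketch only: inside get_snowflake_credentials, build a header-less PEM string
# instead of DER bytes (p_key is the private key object already loaded there).
pem_bytes = p_key.private_bytes(
    encoding=serialization.Encoding.PEM,
    format=serialization.PrivateFormat.PKCS8,
    encryption_algorithm=serialization.NoEncryption()
)
pem_str = pem_bytes.decode("utf-8")
pem_str = re.sub("-*(BEGIN|END) PRIVATE KEY-*\n", "", pem_str).replace("\n", "")

# ... and in the returned dict:
# "sfPrivateKey": pem_str,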

Data loss in Spark Streaming writing in cassandra

I'm writing data to Cassandra and to a text file.
After some minutes I stop the process. Then, for example, I have 82035 rows in Cassandra and 96749 rows in the file.
I found apparently valid data in the txt file that is not in the database. For example:
promerar|082220.80|4158.5985417|00506.7613786
MOLIIUO|082220|4107.4749|00444.2117
josehrin|082220|4159.1124|00455.1298
This is the code:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--conf spark.ui.port=4040 --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.0,com.datastax.spark:spark-cassandra-connector_2.11:2.0.0-M3 pyspark-shell'
import time
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
conf = SparkConf() \
.setAppName("Streaming test") \
.setMaster("local[2]") \
.set("spark.cassandra.connection.host", "127.0.0.1")
sc = SparkContext(conf=conf)
sqlContext=SQLContext(sc)
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
count = 0

def saveToCassandra(rows):
    if not rows.isEmpty():
        sqlContext.createDataFrame(rows).write\
            .format("org.apache.spark.sql.cassandra")\
            .mode('append')\
            .options(table="puntos", keyspace="test_rtk")\
            .save()

def saveCoord(rdd):
    rdd.foreach(lambda x: open("/tmp/p1", "a").write(x[0]+"|"+x[2]+"|"+x[3]+"|"+x[5]+"\n"))
ssc = StreamingContext(sc, 1)
kvs = KafkaUtils.createStream(ssc, "127.0.0.1:2181", "spark-streaming-consumer", {'test': 2})
data = kvs.map(lambda x: x[1].split(","))
rows= data.map(lambda x:Row(long=x[5],lat=x[3],date=time.strftime("%Y-%m-%d"),time=x[2],user=x[0]))
rows.foreachRDD(saveToCassandra)
data.foreachRDD(saveCoord)
ssc.start()
This is the creation of the table in cassandra:
CREATE KEYSPACE test_rtk WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1};
CREATE TABLE test_rtk.puntos (
    long text,
    lat text,
    time text,
    date text,
    user text,
    PRIMARY KEY (time, long, lat)
);
Can you help me?
Check here for better solutions: the Spark Streaming Programming Guide.
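One of the fault-tolerance mechanisms that guide covers is checkpointing, which lets Spark recover DStream metadata and received data if the driver stops. A minimal sketch of enabling it for the code above (the checkpoint directory path is an assumption):
# Enable checkpointing so received data and DStream metadata can be recovered
# if the driver stops; the directory path here is only an example.
ssc = StreamingContext(sc, 1)
ssc.checkpoint("file:///tmp/spark_checkpoint")

kvs = KafkaUtils.createStream(ssc, "127.0.0.1:2181", "spark-streaming-consumer", {'test': 2})
# ... same transformations and outputs as in the question ...
ssc.start()
ssc.awaitTermination()  # block instead of stopping the process by hand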

Connection Timeout Error: AWS-Glue ETL Job with Comprehend

I am attempting to use a Python template provided by AWS, which I modified to load the Yelp review dataset (a large JSON file) into an S3 bucket using Glue, where I apply the Comprehend API embedded within the Python script.
I continue to receive an error message:
ConnectTimeout:
HTTPSConnectionPool(host='comprehend.us-east-1.amazonaws.com',
port=443): Max retries exceeded with url: / (Caused by
ConnectTimeoutError(, 'Connection to comprehend.us-east-1.amazonaws.com
timed out. (connect timeout=60)'))
I have updated the original template to use JSON rather than Parquet. I have also increased the number of batches in the original file from 10 to 1000 (NUMBER_OF_BATCHES = 1000 in the script below). Is there anything else I can do to optimize my code so I stop receiving this error? Here is the existing code, with the existing S3 file paths to the very large JSON file I am trying to run the Comprehend API over:
import os
import sys
import boto3
from awsglue.job import Job
from awsglue.transforms import *
from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions
import pyspark.sql.functions as F
from pyspark.sql import Row, Window, SparkSession
from pyspark.sql.types import *
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = SparkSession.builder.config("spark.sql.broadcastTimeout", "6000").getOrCreate()
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
spark._jsc.hadoopConfiguration().set("json.enable.summary-metadata", "false")
AWS_REGION = 'us-east-1'
MIN_SENTENCE_LENGTH_IN_CHARS = 10
MAX_SENTENCE_LENGTH_IN_CHARS = 4500
COMPREHEND_BATCH_SIZE = 25 ## This is the max batch size for comprehend
NUMBER_OF_BATCHES = 1000
SentimentRow = Row("review_id", "sentiment")
def getBatchSentiment(input_list):
    arr = []
    bodies = [i[1] for i in input_list]
    client = boto3.client('comprehend', region_name=AWS_REGION)

    def callApi(text_list):
        response = client.batch_detect_sentiment(TextList=text_list, LanguageCode='en')
        return response

    for i in range(NUMBER_OF_BATCHES - 1):
        text_list = bodies[COMPREHEND_BATCH_SIZE * i : COMPREHEND_BATCH_SIZE * (i + 1)]
        response = callApi(text_list)
        for r in response['ResultList']:
            idx = COMPREHEND_BATCH_SIZE * i + r['Index']
            arr.append(SentimentRow(input_list[idx][0], r['Sentiment']))
    return arr
### Main function to process data through comprehend
## Read yelp academic dataset reviews
reviews = spark.read.json("s3://schmldemobucket/yelp_academic_dataset_review.json").distinct()
df = reviews \
    .withColumn('body_len', F.length('text')) \
    .filter(F.col('body_len') > MIN_SENTENCE_LENGTH_IN_CHARS) \
    .filter(F.col('body_len') < MAX_SENTENCE_LENGTH_IN_CHARS)
record_count = df.count()
df2 = df \
    .repartition(record_count / (NUMBER_OF_BATCHES * COMPREHEND_BATCH_SIZE)) \
    .sortWithinPartitions(['review_id'], ascending=True)
group_rdd = df2.rdd.map(lambda l: (l.review_id, l.text)).glom()
sentiment = group_rdd.coalesce(10).map(lambda l: getBatchSentiment(l)).flatMap(lambda x: x).toDF().repartition('review_id').cache()
## Join sentiment results with the yelp review dataset
joined = reviews \
    .drop('text') \
    .join(sentiment, sentiment.review_id == reviews.review_id) \
    .drop(sentiment.review_id)
## Write out result set to S3 in JSON format
joined.write.partitionBy('business_id').mode('overwrite').json('s3://schmldemobucket/')
job.commit()
I expect the Glue ETL job to finish with a succeeded status, but I'm not sure how to optimize further to get there. Please help!
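One knob that is easy to check is the boto3 client's timeout and retry configuration used inside getBatchSentiment (this will not help if the Glue job has no network route to the Comprehend endpoint at all). A minimal sketch, with values that are assumptions rather than recommendations:
import boto3
from botocore.config import Config

# Hypothetical settings: raise the connect/read timeouts and cap the retries so
# a blocked endpoint fails fast instead of exhausting the default retry budget.
comprehend_config = Config(
    connect_timeout=120,
    read_timeout=120,
    retries={'max_attempts': 3}
)
client = boto3.client('comprehend', region_name=AWS_REGION, config=comprehend_config)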
