I'm writing data to Cassandra and to a text file.
After a few minutes I stop the process. At that point I have, for example, 82035 rows in Cassandra and 96749 rows in the file.
I found apparently valid data in the text file that is not in the database. For example:
promerar|082220.80|4158.5985417|00506.7613786
MOLIIUO|082220|4107.4749|00444.2117
josehrin|082220|4159.1124|00455.1298
This is the code:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--conf spark.ui.port=4040 --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.0,com.datastax.spark:spark-cassandra-connector_2.11:2.0.0-M3 pyspark-shell'

import time
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row

conf = SparkConf() \
    .setAppName("Streaming test") \
    .setMaster("local[2]") \
    .set("spark.cassandra.connection.host", "127.0.0.1")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

count = 0

def saveToCassandra(rows):
    if not rows.isEmpty():
        sqlContext.createDataFrame(rows).write \
            .format("org.apache.spark.sql.cassandra") \
            .mode('append') \
            .options(table="puntos", keyspace="test_rtk") \
            .save()

def saveCoord(rdd):
    rdd.foreach(lambda x: open("/tmp/p1", "a").write(x[0] + "|" + x[2] + "|" + x[3] + "|" + x[5] + "\n"))

ssc = StreamingContext(sc, 1)
kvs = KafkaUtils.createStream(ssc, "127.0.0.1:2181", "spark-streaming-consumer", {'test': 2})
data = kvs.map(lambda x: x[1].split(","))
rows = data.map(lambda x: Row(long=x[5], lat=x[3], date=time.strftime("%Y-%m-%d"), time=x[2], user=x[0]))
rows.foreachRDD(saveToCassandra)
data.foreachRDD(saveCoord)
ssc.start()
This is the creation of the table in cassandra:
CREATE KEYSPACE test_rtk WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1};
CREATE TABLE test_rtk.puntos (
    long text,
    lat text,
    time text,
    date text,
    user text,
    PRIMARY KEY (time, long, lat)
);
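A quick way to re-check both counts after stopping the stream (a sketch that reuses sc and sqlContext from the code above and reads the table back through the same Cassandra connector options):

# Count what actually landed in Cassandra...
cassandra_count = sqlContext.read \
    .format("org.apache.spark.sql.cassandra") \
    .options(table="puntos", keyspace="test_rtk") \
    .load() \
    .count()

# ...and compare it with the number of lines appended to the text file.
file_count = sc.textFile("/tmp/p1").count()

print(cassandra_count, file_count)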
Can you help me?
Check here for better solutions: the Spark Streaming Programming Guide.
Related
I am trying to send data to pySpark but it is giving me this error:
ERROR MicroBatchExecution: Query [id = d3a1ed30-d223-4da4-9052-189b103afca8, runId = 70bfaa84-15c9-4c8b-9058-0f9a04ee4dd0] terminated with error
java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
Producer
import tweepy
import json
import time
from kafka import KafkaProducer
import twitterauth as auth
import utils

producer = KafkaProducer(bootstrap_servers=["localhost:9092"], value_serializer=utils.json_serializer)

class twitterStream(tweepy.StreamingClient):
    def on_connect(self):
        print("Twitter Client Connected")

    def on_tweet(self, raw_data):
        if raw_data.referenced_tweets == None:
            producer.send(topic="registered_user", value=raw_data.text)
            print("Producer Running")

    def on_error(self):
        self.disconnect()

    def adding_rules(self, keywords):
        for terms in keywords:
            self.add_rules(tweepy.StreamRule(terms))

if __name__ == "__main__":
    stream = twitterStream(bearer_token=auth.bearer_token)
    stream_terms = ['bitcoin', 'luna', 'etherum']
    stream.adding_rules(stream_terms)
    stream.filter(tweet_fields=['referenced_tweets'])
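utils.json_serializer is imported above but not shown; a minimal stand-in, purely an assumption inferred from its name and from how KafkaProducer uses value_serializer, might look like this:

# Hypothetical stand-in for utils.json_serializer (not shown in the question).
# KafkaProducer calls the value_serializer to turn each message value into bytes.
import json

def json_serializer(value):
    return json.dumps(value).encode("utf-8")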
pySpark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import findspark
import json
import os

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0 pyspark-shell'

if __name__ == "__main__":
    findspark.init()

    # spark = SparkSession.builder.appName("Kafka Pyspark Streaming Learning").master("local[*]").getOrCreate()
    sc = SparkSession.builder.master("local[*]") \
        .appName('SparkByExamples.com') \
        .getOrCreate()

    df = sc \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("startingoffsets", "latest") \
        .option("subscribe", "registered_user") \
        .load()

    query = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    query = df.writeStream.format("console").start()

    import time
    time.sleep(10)  # sleep 10 seconds
    query.stop()
Hadoop Version: 3.3.0
Spark and PySpark version: 3.3.0
Scala Version: 2.12.15
I have tried everything for the last 8 hours and still have no luck. Could anyone help?
StackTrace:
When I run df.printSchema() there is no error and this is the output:
You need to use a different name to hold the converted fields, because DataFrames are immutable. Your initial DataFrame is named df and has a certain schema; when you cast the fields to string, you need to assign the result to a separate name.
castDf = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
query = castDf.writeStream...
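Putting that suggestion together with the consumer above, a minimal sketch of the corrected write path (same broker and topic as in the question; note this alone does not address the Hadoop native-library error on Windows, which typically needs a matching winutils/hadoop.dll setup) would be:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("kafka-console").getOrCreate()

df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "registered_user") \
    .option("startingOffsets", "latest") \
    .load()

# Cast into a new DataFrame instead of reusing df.
castDf = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

# Write the casted stream, not the original df.
query = castDf.writeStream.format("console").start()
query.awaitTermination()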
I am using the program below, running it in Anaconda (Spyder), to create a data pipeline from Kafka to Spark Streaming in Python.
import sys
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from uuid import uuid1
import os

## Step 1: Initialize sparkcontext
spark_context = SparkContext(appName="Transformation Application")

### Step 2: Initialize streaming context
ssc = StreamingContext(spark_context, 5)

def utf8_decoder(s):
    """ Decode the unicode as UTF-8 """
    if s is None:
        return None
    return s.decode('utf-8')

message = KafkaUtils.createDirectStream(
    ssc,
    topics=['testtopic'],
    kafkaParams={"metadata.broker.list": "localhost:9092",
                 "key.deserializer": "org.springframework.kafka.support.serializer.JsonDeserializer",
                 "value.deserializer": "org.springframework.kafka.support.serializer.JsonDeserializer"},
    fromOffsets=None,
    messageHandler=None,
    keyDecoder=utf8_decoder,
    valueDecoder=utf8_decoder)
message

words = message.map(lambda x: x[1]).flatMap(lambda x: x.split(" "))
wordcount = words.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
wordcount.pprint()
When I print message, words, and wordcount, I am not getting proper results; I am getting hexadecimal values.
message
Out[16]: <pyspark.streaming.kafka.KafkaDStream at 0x23f8b1f8248>
wordcount
Out[18]: <pyspark.streaming.dstream.TransformedDStream at 0x23f8b2324c8>
In my topic (testtopic) I produced the string "Hi Hi Hi how are you doing", so wordcount should give a count for each word, but instead it gives some encoded hexadecimal values.
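As a side note, the hexadecimal values in the Out[...] lines are just the memory addresses in the DStream objects' default repr; a DStream only produces real output once the streaming context is started. A minimal sketch of driving the pipeline above (reusing the ssc and wordcount from the code) would be:

# pprint() only schedules output; nothing is printed until the context starts.
wordcount.pprint()

ssc.start()             # begin consuming from Kafka and running the 5-second batches
ssc.awaitTermination()  # keep the driver alive so each micro-batch gets printed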
I'm trying to store the tweets from my Kafka cluster into Elasticsearch. Initially, I set the output format to 'org.elasticsearch.spark.sql', but it created no index.
I tried changing the format to 'console' to check that the streaming works, but it doesn't print anything to the console either.
I am guessing this is a problem with my streaming DataFrames, but I can't seem to find out what exactly the issue is.
This is my full code for the Consumer(Spark Streaming):
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.3.0,org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0,org.elasticsearch:elasticsearch-hadoop:7.6.2 pyspark-shell'

from pyspark import SparkContext, SparkConf
# Spark Streaming
from pyspark.streaming import StreamingContext
from pyspark.sql.session import SparkSession
# Kafka
from pyspark.streaming.kafka import KafkaUtils
# json parsing
import json
import nltk
import logging
from datetime import datetime
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

def evaluate_sentiment(avg):
    try:
        if avg < 0:
            return 'Negative'
        elif avg > 0:
            return 'Positive'
        else:
            return 'Neutral'
    except TypeError:
        return 'Neutral'

eval_udf = udf(evaluate_sentiment, StringType())

def start_stream(df):
    df.writeStream.format('console').start()

conf = SparkConf().setAppName('twitter_analysis')
spark = SparkSession.builder.appName('twitter_analysis').getOrCreate()
conf.set("es.index.auto.create", "true")

schema = StructType([StructField("date", TimestampType(), True),
                     StructField("user", StringType(), True),
                     StructField("text", StringType(), True),
                     StructField("reply_count", IntegerType(), True),
                     StructField("retweet_count", IntegerType(), True),
                     StructField("favorite_count", IntegerType(), True),
                     StructField("sentiment_score", DecimalType(), True)])

kafkaStream = spark.readStream.format("kafka") \
    .option("kafka.bootstrap.servers", "192.168.0.10:9092") \
    .option("subscribe", "twitter_analysis") \
    .option('failOnDataLoss', False).load()

parsed_df = kafkaStream.select(from_json(col('value').cast('string'), schema).alias('parsed_value')) \
    .withColumn('timestamp', lit(current_timestamp()))

mdf = parsed_df.select('parsed_value.*', 'timestamp')

evaluated_df = mdf.withColumn('status', eval_udf('sentiment_score')) \
    .withColumn('date', to_date(col('timestamp')))

start_stream(evaluated_df)
What could be causing this problem? Does it have anything to do with the schema I have defined?
An example of the JSON data that is sent from the Kafka cluster to Spark Streaming:
{"date": "2020-11-07 21:02:39", "user": "TalhianeM", "text": "RT #amin_goat: Non, des probl\u00e8mes de vote dans une d\u00e9mocratie occidentale ?\n\nOn m\u2019avait assur\u00e9 que cela n\u2019arrivait qu\u2019en Afrique pourtant.", "reply_count": 0, "retweet_count": 0, "favorite_count": 0, "sentiment_score": 0.0}
Could someone please help me resolve this problem? I tried multiple methods, but nothing seems to get the data streams sent to Elasticsearch.
UPDATE: I resolved it. There seemed to be a problem with the host.
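For anyone hitting the same thing, a minimal sketch of a streaming write to Elasticsearch with the host set explicitly (the node address, index name, and checkpoint path here are placeholders, not from the question) would look roughly like:

# Sketch only: es.nodes, the index name, and the checkpoint location are placeholders.
query = evaluated_df.writeStream \
    .format('org.elasticsearch.spark.sql') \
    .option('es.nodes', '192.168.0.10') \
    .option('es.port', '9200') \
    .option('checkpointLocation', '/tmp/es_checkpoint') \
    .start('twitter_analysis')

query.awaitTermination()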
I am new to HBase. I have a DataFrame that I want to save to HBase in Ambari. The error message is:
ERROR Utils: Aborting task
java.lang.UnsupportedOperationException: PrimitiveType coder: unsupported data type null
I tried to fix the problem, and now there are no null values in the DataFrame anymore, but I am still getting the same error. Can anyone help me?
+--------------------+-------+-------+------+------+
|                time|   col1|   col2|  col3|  col4|
+--------------------+-------+-------+------+------+
|2020-04-12T01:30:...| +30003| 532879| +1830| 20577|
|2020-04-11T18:15:...| +18838| 521714| +1317| 20064|
+--------------------+-------+-------+------+------+
Below is my code for reference:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import json

def main():
    def g(x):
        for i in x.collect():
            schema = StructType({CODE FOR STRUCTTYPE})
            df1 = spark.createDataFrame(i, schema=schema)
            df2 = df1.select(col("time"), col("col1"), col("col2"), col("col3"), col("col4"))
            df3 = df2.fillna({'col3': '0'})
            data_source_format = "org.apache.spark.sql.execution.datasources.hbase"
            catalog = ''.join("""{"table":\
            {"namespace":"default","name":"data"},"rowkey":"key","columns":{"time":\
            {"cf":"rowkey","col":"key","type":"string"},{three other cols}}""".split())
            df3.write \
                .options(catalog=catalog, newtable=5) \
                .format(data_source_format) \
                .save()
            spark.read.options(catalog=catalog).format(data_source_format).load()

    conf = SparkConf().setAppName("PySparkKafka").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    ssc = StreamingContext(sc, 10)
    topic = ['api-spark1']
    kafkaStream = KafkaUtils.createDirectStream(ssc, topic, {"metadata.broker.list": "sandbox-hdp.hortonworks.com:6667"})
    parsed = kafkaStream.map(lambda kv: json.loads(kv[1])['response'])
    parsed.foreachRDD(g)
    ssc.start()
    ssc.awaitTermination()

if __name__ == '__main__':
    main()
This is the error message:
20/04/12 01:59:57 ERROR Utils: Aborting task
java.lang.UnsupportedOperationException: PrimitiveType coder: unsupported data type null
at org.apache.spark.sql.execution.datasources.hbase.types.PrimitiveType.toBytes(PrimitiveType.scala:61)
"unsupported data type null" means one of the columns you are using has null values. To work around it, just cast the null explicitly, e.g. cast(null as int).
Spark identifies data types such as string, int, etc., but not the null type.
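In PySpark terms, a minimal sketch of that workaround (the column name col3 just mirrors the DataFrame shown above; adjust it to whichever column actually carries the nulls) might be:

from pyspark.sql.functions import col, lit, coalesce

# Either replace the nulls with a typed literal...
df3 = df2.withColumn("col3", coalesce(col("col3"), lit(0)))

# ...or give the column an explicit type so HBase never sees a null-typed value.
df3 = df2.selectExpr("time", "col1", "col2", "cast(col3 as int) as col3", "col4")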
I am attempting to use a Python template provided by AWS, which I modified to load the Yelp review dataset (a large JSON file) into an S3 bucket using Glue, applying the Comprehend API embedded within the Python script.
I continue to receive an error message:
ConnectTimeout:
HTTPSConnectionPool(host='comprehend.us-east-1.amazonaws.com',
port=443): Max retries exceeded with url: / (Caused by
ConnectTimeoutError(, 'Connection to comprehend.us-east-1.amazonaws.com
timed out. (connect timeout=60)'))
I have updated the original template to use JSON rather than Parquet. Also, I have increased the number of batches in the original file from 10 to 1000 (NUMBER_OF_BATCHES = 1000 in the script below). Is there anything else I can do to optimize my code so I stop receiving this error? Here is the existing code, with the existing S3 file paths to the very large JSON file I am trying to run through the Comprehend API:
import os
import sys
import boto3
from awsglue.job import Job
from awsglue.transforms import *
from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions
import pyspark.sql.functions as F
from pyspark.sql import Row, Window, SparkSession
from pyspark.sql.types import *
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = SparkSession.builder.config("spark.sql.broadcastTimeout", "6000").getOrCreate()
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
spark._jsc.hadoopConfiguration().set("json.enable.summary-metadata", "false")

AWS_REGION = 'us-east-1'
MIN_SENTENCE_LENGTH_IN_CHARS = 10
MAX_SENTENCE_LENGTH_IN_CHARS = 4500
COMPREHEND_BATCH_SIZE = 25  ## This is the max batch size for comprehend
NUMBER_OF_BATCHES = 1000

SentimentRow = Row("review_id", "sentiment")

def getBatchSentiment(input_list):
    arr = []
    bodies = [i[1] for i in input_list]
    client = boto3.client('comprehend', region_name=AWS_REGION)

    def callApi(text_list):
        response = client.batch_detect_sentiment(TextList=text_list, LanguageCode='en')
        return response

    for i in range(NUMBER_OF_BATCHES - 1):
        text_list = bodies[COMPREHEND_BATCH_SIZE * i : COMPREHEND_BATCH_SIZE * (i + 1)]
        response = callApi(text_list)
        for r in response['ResultList']:
            idx = COMPREHEND_BATCH_SIZE * i + r['Index']
            arr.append(SentimentRow(input_list[idx][0], r['Sentiment']))
    return arr

### Main function to process data through comprehend
## Read yelp academic dataset reviews
reviews = spark.read.json("s3://schmldemobucket/yelp_academic_dataset_review.json").distinct()

df = reviews \
    .withColumn('body_len', F.length('text')) \
    .filter(F.col('body_len') > MIN_SENTENCE_LENGTH_IN_CHARS) \
    .filter(F.col('body_len') < MAX_SENTENCE_LENGTH_IN_CHARS)

record_count = df.count()

df2 = df \
    .repartition(record_count / (NUMBER_OF_BATCHES * COMPREHEND_BATCH_SIZE)) \
    .sortWithinPartitions(['review_id'], ascending=True)

group_rdd = df2.rdd.map(lambda l: (l.review_id, l.text)).glom()

sentiment = group_rdd.coalesce(10).map(lambda l: getBatchSentiment(l)).flatMap(lambda x: x).toDF().repartition('review_id').cache()

## Join sentiment results with the yelp review dataset
joined = reviews \
    .drop('text') \
    .join(sentiment, sentiment.review_id == reviews.review_id) \
    .drop(sentiment.review_id)

## Write out result set to S3 in JSON format
joined.write.partitionBy('business_id').mode('overwrite').json('s3://schmldemobucket/')

job.commit()
I expect the Glue ETL job to succeed, but I'm not sure how to optimize the code further to get a succeeded job status. Please help!
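Separately from tuning the batch count, the client-side timeout and retry behaviour can be loosened; a minimal sketch (the timeout and retry values below are illustrative assumptions, and this does not remove the underlying requirement that the Glue job can actually reach the Comprehend endpoint, e.g. through a NAT gateway or VPC endpoint) would be:

import boto3
from botocore.config import Config

# Illustrative values only: longer connect/read timeouts and more retry attempts.
comprehend_config = Config(
    region_name='us-east-1',
    connect_timeout=120,
    read_timeout=120,
    retries={'max_attempts': 10}
)

client = boto3.client('comprehend', config=comprehend_config)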