I am trying to read data from an Azure Event Hub and store the resulting dataframe into a MySQL table in Spark streaming mode.
Below is my PySpark code:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
from datetime import datetime as dt
from pyspark.sql import DataFrameWriter
try:
    session = SparkSession.builder.master("local").appName("dataingestion").config("")
    spark = session.getOrCreate()
    print("Successfully build spark session : ")
except:
    print("Fail to build spark session : ")
    raise Exception
startoffset = "@latest"
positionKey1 = {
    "ehName": eventhubName,
    "partitionId": 0
}
endTime = dt.now().strftime("%Y-%m-%dT%H:%M:%S.%fZ")
eventPosition1 = {
    "offset": startoffset,
    "seqNo": 0,
    "enqueuedTime": endTime,
    "isInclusive": True
}
positionMap = {
    json.dumps(positionKey1): eventPosition1
}
# Place the map into the main Event Hub config dictionary
ehreadConf = {}
ehreadConf["eventhubs.startingPositions"] = json.dumps(positionMap)
ehreadConf['eventhubs.connectionString'] = event_hub_sas_key
ehreadConf['eventhubs.consumerGroup'] = "$Default"
ehreadConf['eventhubs.maxEventsPerTrigger'] = 5000
try:
    inputStream = spark.readStream.format("eventhubs").options(**ehreadConf).load()
    print("Successfully connected the event hub : ")
    print("Check streaming is started or not : ", inputStream.isStreaming)
    print("Schema of inputStream : ", inputStream.printSchema())
except Exception:
    print("Fail to connect with Azure event hub : ")
    raise Exception
inputStream = inputStream.withColumn("body", inputStream["body"].cast("string"))
db_target_properties = {"user": username, "password": password, "driver": driver}
def foreach_batch_function(sparkDf, epoch_id):
    sparkDf.write.option("batchsize", "5000").jdbc(url=url, table="demo", properties=db_target_properties, mode="append")
    pass

query = sparkDf.writeStream.outputMode("append").format("com.microsoft.sqlserver.jdbc.spark").option("truncate", 'false').option(
    "checkpointLocation", "dbfs:/FileStore/lambda-StreamJob/AzureSql/checkpoint").trigger(processingTime='8 seconds').foreachBatch(foreach_batch_function).start()
query.awaitTermination()
spark.stop()
But I am unable to store this Spark dataframe into the MySQL table.
I am getting an error saying that the jdbc data source does not support Spark streaming:
py4j.protocol.Py4JJavaError: An error occurred while calling o68.start.
: java.lang.UnsupportedOperationException: Data source jdbc does not support streamed writing
As the error notes, writing a stream directly to a JDBC sink is not supported in Spark yet (as far as I know).
Instead of writing the stream directly into MySQL, you can do a foreachBatch on the streaming DataFrame and perform the write.jdbc operation inside it.
server_name = "jdbc:mysql://localhost:3306"
database_name = "eventhub"
jdbcurl = server_name + "/" + database_name
table_name = "stream_cdr_data"
db_properties = {"user": "user", "password": "data#123"}
def write_to_mysql(df, epoch_id):
    dfwriter = df.write.mode("append")
    dfwriter.jdbc(url=jdbcurl, table=table_name, properties=db_properties)  # if this is not working, use the line below
    # df.write.jdbc(url=jdbcurl, table=table_name, properties=db_properties, mode="append")
    pass
query = sparkDf.writeStream.outputMode("append").foreachBatch(write_to_mysql).start()
query.awaitTermination()
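For completeness, here is a sketch of how the same foreachBatch writer could be attached to the `inputStream` from the question; the checkpoint path below is only an example, and the 8-second trigger is taken from the original code:

```
query = (inputStream
         .writeStream
         .outputMode("append")
         .option("checkpointLocation", "dbfs:/FileStore/lambda-StreamJob/mysql/checkpoint")  # example path
         .trigger(processingTime="8 seconds")
         .foreachBatch(write_to_mysql)
         .start())
query.awaitTermination()
```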
For the issue below (which the OP asked about in the comments):
request/expected seqNo xxxx is less than the received seqNo xxxx. The earliest seqNo is yyyy and the last seqNo is yyyy
Take a look at the FAQ.
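If that error shows up after the job has been running for a while, a hedged sketch of a common remediation is to give the query its own consumer group and restart it from a fresh, empty checkpoint directory so stale sequence numbers are not requested again (both values below are placeholders, not something mandated by the connector):

```
# Hedged sketch: dedicated consumer group plus a new, empty checkpoint path.
# Both values are placeholders; create the consumer group on the Event Hub first.
ehreadConf['eventhubs.consumerGroup'] = "spark-mysql-ingest"
new_checkpoint = "dbfs:/FileStore/lambda-StreamJob/AzureSql/checkpoint_v2"

inputStream = spark.readStream.format("eventhubs").options(**ehreadConf).load()
query = (inputStream.writeStream
         .outputMode("append")
         .option("checkpointLocation", new_checkpoint)
         .foreachBatch(write_to_mysql)
         .start())
```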
I'm working on a PoC of Amundsen (Lyft) to introduce a data catalog.
I used the file 'sample_mysql_loader.py' and filled in my database information.
OS: Ubuntu 18.04
Python: 3.6.9
pip: 21.3.1
elasticsearch: 8.0.0
neo4j: 3.5.26
I installed Amundsen Lyft using the quickstart on Docker.
import logging
import sys
import textwrap
import uuid
from elasticsearch import Elasticsearch
from pyhocon import ConfigFactory
from sqlalchemy.ext.declarative import declarative_base
from databuilder.extractor.mysql_metadata_extractor import MysqlMetadataExtractor
from databuilder.extractor.neo4j_extractor import Neo4jExtractor
from databuilder.extractor.neo4j_search_data_extractor import Neo4jSearchDataExtractor
from databuilder.extractor.sql_alchemy_extractor import SQLAlchemyExtractor
from databuilder.job.job import DefaultJob
from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader
from databuilder.loader.file_system_neo4j_csv_loader import FsNeo4jCSVLoader
from databuilder.publisher import neo4j_csv_publisher
from databuilder.publisher.elasticsearch_publisher import ElasticsearchPublisher
from databuilder.publisher.neo4j_csv_publisher import Neo4jCsvPublisher
from databuilder.task.task import DefaultTask
from databuilder.transformer.base_transformer import NoopTransformer
import pymysql
pymysql.install_as_MySQLdb()
es_host = None
neo_host = None
if len(sys.argv) > 1:
    es_host = sys.argv[1]
if len(sys.argv) > 2:
    neo_host = sys.argv[2]
es = Elasticsearch([
{'host': es_host or 'localhost'},
])
DB_FILE = '/tmp/test.db'
SQLITE_CONN_STRING = 'sqlite:////tmp/test.db'
Base = declarative_base()
NEO4J_ENDPOINT = f'bolt://{neo_host or "localhost"}:7687'
neo4j_endpoint = NEO4J_ENDPOINT
neo4j_user = 'neo4j'
neo4j_password = 'test'
LOGGER = logging.getLogger(__name__)
# todo: connection string needs to change
def connection_string():
    user = 'user_name'
    password = 'password'
    host = 'host'
    port = 'port'
    db = 'db_name'
    return "mysql://%s:%s@%s:%s/%s" % (user, password, host, port, db)
def run_mysql_job():
    where_clause_suffix = textwrap.dedent("""
        where c.table_schema = 'db_name'
    """)
    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = f'{tmp_folder}/nodes/'
    relationship_files_folder = f'{tmp_folder}/relationships/'
    job_config = ConfigFactory.from_dict({
        f'extractor.mysql_metadata.{MysqlMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
        f'extractor.mysql_metadata.{MysqlMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True,
        f'extractor.mysql_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=MysqlMetadataExtractor(), loader=FsNeo4jCSVLoader()),
                     publisher=Neo4jCsvPublisher())
    return job
def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index',
                                   elasticsearch_doc_type_key='table',
                                   model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
                                   cypher_query=None,
                                   elasticsearch_mapping=None):
    """
    :param elasticsearch_index_alias: alias for Elasticsearch used in
           amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the ElasticSearch index is prepended with. Defaults to `table` resulting in
           `table_search_index`
    :param model_name: the Databuilder model class used in transporting between Extractor and Loader
    :param cypher_query: Query handed to the `Neo4jSearchDataExtractor` class, if None is given (default)
           it uses the `Table` query baked into the Extractor
    :param elasticsearch_mapping: Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class,
           if None is given (default) it uses the `Table` query baked into the Publisher
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'
    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())
    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())
    job_config = ConfigFactory.from_dict({
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': model_name,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w',
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r',
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_doc_type_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias,
    })
    # only optionally add these keys, so need to dynamically `put` them
    if cypher_query:
        job_config.put(f'extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}',
                       cypher_query)
    if elasticsearch_mapping:
        job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}',
                       elasticsearch_mapping)
    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job
if __name__ == "__main__":
    # Uncomment next line to get INFO level logging
    # logging.basicConfig(level=logging.INFO)
    loading_job = run_mysql_job()
    loading_job.launch()
    job_es_table = create_es_publisher_sample_job(
        elasticsearch_index_alias='table_search_index',
        elasticsearch_doc_type_key='table',
        model_name='databuilder.models.table_elasticsearch_document.TableESDocument')
    job_es_table.launch()
But I got the error message below. How can I connect my database to Amundsen Lyft?
elasticsearch.exceptions.RequestError: RequestError(400, 'mapper_parsing_exception', 'Root mapping definition has unsupported parameters: [table : {properties={schema={analyzer=simple, type=text, fields={raw={type=keyword}}}, cluster={type=text}, description={analyzer=simple, type=text}, display_name={type=keyword}, column_descriptions={analyzer=simple, type=text}, programmatic_descriptions={analyzer=simple, type=text}, tags={type=keyword}, badges={type=keyword}, database={analyzer=simple, type=text, fields={raw={type=keyword}}}, total_usage={type=long}, name={analyzer=simple, type=text, fields={raw={type=keyword}}}, last_updated_timestamp={format=epoch_second, type=date}, unique_usage={type=long}, column_names={analyzer=simple, type=text, fields={raw={type=keyword}}}, key={type=keyword}}}]')
I'm having an issue with Scheduled Queries using a Python script. I get the error
"google.api_core.exceptions.PermissionDenied: 403 The caller does not have permission" when running the script below.
#!/usr/bin/python
import sys
import json
import time
from google.cloud import bigquery_datatransfer
from google.oauth2 import service_account
prj_id = "project-id"
ds_id = "dataset_id"
gcp_info = json.load(open("key-file.json"))
creds = service_account.Credentials.from_service_account_info(gcp_info)
s_creds = creds.with_scopes(
[
'https://www.googleapis.com/auth/cloud-platform',
'https://www.googleapis.com/auth/bigquery',
]
)
s_acc = "service-account@project_id.iam.gserviceaccount.com"
bq_tc = bigquery_datatransfer.DataTransferServiceClient(credentials=s_creds)
dataset = prj_id + '.' + ds_id + '.'
def main():
    argc = len(sys.argv) - 1
    if argc != 1:
        print("Usage: python3 /root/gcp_new-query.py <Temp-Table>")
        sys.exit()
    t_id = dataset + sys.argv[1] + '-temp'
    t2_id = dataset + sys.argv[1] + '-data'
    q2 = """
    DELETE FROM `{}` WHERE AddressID > 0 AND MsgTS < TIMESTAMP_SUB(CURRENT_TIMESTAMP(),
    INTERVAL 60 MINUTE)
    """.format(t_id)
    p = bq_tc.common_project_path(prj_id)
    tc_cfg2 = bigquery_datatransfer.TransferConfig(
        destination_dataset_id=ds_id,
        display_name=sys.argv[1] + "-RM-Old-Data",
        data_source_id="scheduled_query",
        params={
            "query": q2,
        },
        schedule="every hour",
    )
    tc_cfg2 = bq_tc.create_transfer_config(
        bigquery_datatransfer.CreateTransferConfigRequest(
            parent=p,
            transfer_config=tc_cfg2,
            service_account_name=s_acc,
        )
    )
    print("Created scheduled query '{}'".format(tc_cfg2.name))

main()
As soon as it gets into create_transfer_config(), I get the error.
I have gone through the documentation and made sure all of the right permissions have been granted to "service-account@project_id", those being:
BigQuery Data Transfer Service Agent
BigQuery Admin
Am I missing permissions or is there something in my script that's not quite right?
Apologies if I haven't explained everything all that well.
EDIT: I have also made sure that the service account has an associated key json file.
-A.
I found a solution in the end. I simply used:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/json.json"
in place of:
gcp_info = json.load(open("key-file.json"))
creds = service_account.Credentials.from_service_account_info(gcp_info)
s_creds = creds.with_scopes(
[
'https://www.googleapis.com/auth/cloud-platform',
'https://www.googleapis.com/auth/bigquery',
]
)
s_acc = "service-account@project_id.iam.gserviceaccount.com"
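With the environment variable in place, the Data Transfer client can be constructed without passing credentials explicitly; it picks them up through Application Default Credentials. A minimal sketch (the key path is a placeholder):

```
import os
from google.cloud import bigquery_datatransfer

# Point Application Default Credentials at the service account key file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/json.json"

# No credentials argument needed: the client resolves them from the environment.
bq_tc = bigquery_datatransfer.DataTransferServiceClient()
```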
I'm trying to connect to an MQTT broker to get data from my IoT device.
For that, I'm basing myself on an existing Python library that does exactly that with the exact broker I need (user_id and key are obtained beforehand via an HTTP API authentication):
def __init__(self, ...):
    mqtt_pass = generate_mqtt_password(user_id=self._cloud_creds.user_id, key=self._cloud_creds.key)
    self._mqtt_client = mqtt.Client(client_id=self._client_id, protocol=mqtt.MQTTv311)
    ....
    self._mqtt_client.username_pw_set(username=self._cloud_creds.user_id, password=mqtt_pass)
    self._mqtt_client.tls_set(ca_certs=self._ca_cert, certfile=None,
                              keyfile=None, cert_reqs=ssl.CERT_REQUIRED,
                              tls_version=ssl.PROTOCOL_TLS,
                              ciphers=None)
    ....

def generate_mqtt_password(user_id: str, key: str):
    """
    Generates the MQTT password that the APP uses to connect to the mqtt server.
    :param user_id:
    :param key:
    :return:
    """
    md5_hash = md5()
    clearpwd = f"{user_id}{key}"
    md5_hash.update(clearpwd.encode("utf8"))
    return md5_hash.hexdigest()
ref : https://github.com/albertogeniola/MerossIot/blob/07375049fc0cbc6adee2815b48fa639e74d0d946/meross_iot/manager.py
So I translated it to this in C#, using MQTTnet
var factory = new MqttFactory();
var mqttClient = factory.CreateMqttClient();
byte[] tmpSource = Encoding.UTF8.GetBytes(userId + key);
byte[] tmpHash = new MD5CryptoServiceProvider().ComputeHash(tmpSource);
string passMd5 = BitConverter.ToString(tmpHash).Replace("-", string.Empty).ToLower();
// Create TCP based options using the builder.
var options = new MqttClientOptionsBuilder()
.WithClientId(RandomString(32))
.WithTcpServer("iot.meross.com", 2001)
.WithCredentials(userId, passMd5)
.WithProtocolVersion(MqttProtocolVersion.V311)
.WithTls(new MqttClientOptionsBuilderTlsParameters()
{
UseTls = true,
SslProtocol = SslProtocols.Tls12
})
.Build();
var result = await mqttClient.ConnectAsync(options, CancellationToken.None);
However, with my code it raises an exception: "Connecting with MQTT server failed (NotAuthorized)".
I've checked, and the "password" I compute is exactly the same as the one the Python code produces.
Is there a difference I'm missing?
I am porting a streaming application written in Scala to Python. I want to manually commit offsets for a DStream. This is done in Scala like below:
stream = KafkaUtils.createDirectStream(soomeConfigs)
stream.foreachRDD { rdd =>
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
// some time later, after outputs have completed
stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}
But I am unable to find similar APIs in Python. Can you please guide me on how I can manually commit offsets using the Python client?
I resolved this by going back to the pyspark 2.2 library, as it has an API to get offsetRanges, and by storing the offsets in Redis. I had to go back to Python 2.7 as there is no "long" support in Python 3.6.
import redis
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition, KafkaRDD
def get_offset_ranges(topic):
    ranges = None
    rk = '{topic}:offsets'.format(topic=topic)
    cache = redis.Redis()
    if cache.exists(rk):
        mapping = cache.hgetall(rk)
        ranges = dict()
        for k, v in mapping.items():
            tp = TopicAndPartition(topic, int(k))
            ranges[tp] = long(v)
    return ranges

def update_offset_ranges(offset_ranges):
    cache = redis.Redis()
    for rng in offset_ranges:
        rk = '{rng.topic}:offsets'.format(rng=rng)
        print("updating redis_key: {}, partition: {}, lastOffset: {} ".format(rk, rng.partition, rng.untilOffset))
        cache.hset(rk, rng.partition, rng.untilOffset)

def do_some_work(rdd):
    pass

def process_dstream(rdd):
    rdd.foreachPartition(lambda iter: do_some_work(iter))
    krdd = KafkaRDD(rdd._jrdd, sc, rdd._jrdd_deserializer)
    off_ranges = krdd.offsetRanges()
    for o in off_ranges:
        print(str(o))
    update_offset_ranges(off_ranges)
sc = SparkContext(appName="mytstApp")
ssc = StreamingContext(sc, 1)
kafka_params = {
"bootstrap.servers": "localhost:9092",
"group.id": "myUserGroup",
"enable.auto.commit": "false",
"auto.offset.reset": "smallest"
}
topic = "mytopic"
offset_ranges = get_offset_ranges(topic)
dstream = KafkaUtils.createDirectStream(ssc, [topic], kafka_params, fromOffsets=offset_ranges)
dstream.foreachRDD(process_dstream)
# Start our streaming context and wait for it to 'finish'
ssc.start()
# Wait for the job to finish
try:
    ssc.awaitTermination()
except Exception as e:
    ssc.stop()
    raise e  # to exit with error condition
I am able to push data from Kafka to MemSQL.
Now I am trying to push it using a transform. I have created a Kafka consumer in Python which consumes data from a Kafka topic and converts it to JSON format.
I don't know how to use this as a transform in MemSQL.
from confluent_kafka import KafkaError
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError
import sys
c = AvroConsumer({
'bootstrap.servers': 'X.Y.Z.W:9092',
'group.id': 'groupid1112',
'schema.registry.url': 'http://X.Y.Z.W:8081',
'default.topic.config': {
'auto.offset.reset': 'smallest'
}
})
c.subscribe(['test_topic'])
count =0
while True:
    try:
        msg = c.poll(10)
    except SerializerError as e:
        print("Message deserialization failed for {}: {}".format(msg, e))
        break
    if msg is None:
        continue
    if msg.error():
        if msg.error().code() == KafkaError._PARTITION_EOF:
            continue
        else:
            print(msg.error())
            break
    valueList = list(msg.value().values())
    print(valueList)
c.close()
It's printing:
[1518776144187, 1, 2, 103,'asas',asas'eer',None]
Check these docs:
https://docs.memsql.com/memsql-pipelines/v6.0/transforms/
Also, stay tuned for native Avro support in an upcoming MemSQL release.
You'll want to do something like the following, but with the Avro-specific details sketched over, since I don't know the Avro library off the top of my head.
```
import struct
import sys

def input_stream():
    """
    Consume STDIN and yield each record that is received from MemSQL
    """
    while True:
        byte_len = sys.stdin.read(8)
        if len(byte_len) == 8:
            byte_len = struct.unpack("L", byte_len)[0]
            result = sys.stdin.read(byte_len)
            yield result
        else:
            assert len(byte_len) == 0, byte_len
            return

avro_context = WhateverYouNeed()  # maybe connect to schema registry here if you need to

for msg in input_stream():
    object = DeserializeAvro(avro_context, msg)  # this is your code
    sys.stdout.write(SerializeToTSV(object))  # also your code
```
Using the schema registry should be fine, but you shouldn't need to worry about the details of reading from Kafka in your transform script. I can try to get you a more detailed script on Monday, but this is how to structure the code.
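For reference, here is one way the sketch above could be fleshed out using the deserializer that ships with confluent-kafka's Avro support; treat the registry URL, the `decode_message` usage, and the output column order as assumptions to verify against your own setup:

```
import struct
import sys

from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient
from confluent_kafka.avro.serializer.message_serializer import MessageSerializer

# Schema registry URL is an assumption -- use the same one your producer uses.
serializer = MessageSerializer(CachedSchemaRegistryClient({'url': 'http://X.Y.Z.W:8081'}))

stdin = sys.stdin.buffer  # binary stdin under Python 3; use sys.stdin on Python 2

def input_stream():
    """Consume STDIN and yield each length-prefixed record sent by MemSQL."""
    while True:
        byte_len = stdin.read(8)
        if len(byte_len) == 8:
            byte_len = struct.unpack("L", byte_len)[0]
            yield stdin.read(byte_len)
        else:
            assert len(byte_len) == 0, byte_len
            return

for raw in input_stream():
    record = serializer.decode_message(raw)  # dict keyed by Avro field name
    # Emit tab-separated values; the field order must match the target table's columns.
    sys.stdout.write("\t".join("" if v is None else str(v) for v in record.values()) + "\n")
```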