Processing event hub data using Python

I am using the Azure Event Hub Python SDK to send messages to and receive messages from an event hub, following this link: https://github.com/Azure/azure-event-hubs-python/tree/develop. I can successfully send and receive messages. But how do I parse the messages and retrieve the data from the event data object? Please find the code below.
import os
import sys
#import logging
from azure.eventhub import EventHubClient, Receiver, Offset

ADDRESS = 'sb://####.servicebus.windows.net/#####'
USER = '##########'
KEY = '##################################'
CONSUMER_GROUP = "$default"
OFFSET = Offset("-1")
PARTITION = "1"

total = 0
last_sn = -1
last_offset = "-1"

try:
    if not ADDRESS:
        raise ValueError("No EventHubs URL supplied.")

    client = EventHubClient(ADDRESS, debug=False, username=USER, password=KEY)
    receiver = client.add_receiver(CONSUMER_GROUP, PARTITION, prefetch=5000,
                                   offset=OFFSET)
    client.run()
    try:
        batched_events = receiver.receive(timeout=20)
    except:
        raise
    finally:
        client.stop()

    for event_data in batched_events:
        last_offset = event_data.offset.value
        last_sn = event_data.sequence_number
        total += 1
        print("Partition {}, Received {}, sn={} offset={}".format(
            PARTITION,
            total,
            last_sn,
            last_offset))
except KeyboardInterrupt:
    pass
If I try to inspect the received event_data, I see the following:
event_data
<azure.eventhub.common.EventData at 0xd4f1358>
event_data.message
<uamqp.message.Message at 0xd4f1240>
Any help on how to parse this message and extract the data would be appreciated.

As of 1.1.0, there are new utility methods to extract the actual data of the message:
body_as_str
body_as_json
So, what used to be
import json
event_obj = json.loads(next(event_data.body).decode('UTF-8'))
Is now:
event_obj = event_data.body_as_json()
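A minimal sketch of how that fits into the receive loop from the question (this assumes the same 1.x EventHubClient/receiver setup shown above and that the event bodies contain JSON):
```
# Sketch only: reuses the 1.x client/receiver from the question and assumes
# each event body is a JSON document; body_as_str() returns the decoded text instead.
client.run()
try:
    for event_data in receiver.receive(timeout=20):
        payload = event_data.body_as_json()  # parsed into a dict
        print(payload)
finally:
    client.stop()
```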

For people using version 5.2.0 of the Event Hubs SDK -- the latest as of today (GitHub, Reference Docs) -- it's the same as in 1.1.0, i.e. use body_as_str() or body_as_json(). But the client has changed: there's an EventHubProducerClient and an EventHubConsumerClient in the new version. To print the body of each received event:
from azure.eventhub import EventHubConsumerClient

connection_str = '<< CONNECTION STRING FOR THE EVENT HUBS NAMESPACE >>'
consumer_group = '<< CONSUMER GROUP >>'
eventhub_name = '<< NAME OF THE EVENT HUB >>'

client = EventHubConsumerClient.from_connection_string(
    connection_str, consumer_group, eventhub_name=eventhub_name
)

def on_event_batch(partition_context, events):
    partition_context.update_checkpoint()
    for e in events:
        print(e.body_as_str())

with client:
    client.receive_batch(
        on_event_batch=on_event_batch,
        starting_position="-1",  # "-1" is from the beginning of the partition.
    )

Related

How to implement exactly-once on Apache Kafka using Python?

I am trying to implement exactly-once semantics using the confluent-kafka library, but I couldn't get it to work. Here is my producer code:
from confluent_kafka import Producer, SerializingProducer
import socket, json, random, time

conf = {'bootstrap.servers': "localhost:9092,localhost:9093,localhost:9094",
        'client.id': socket.gethostname(),
        'message.send.max.retries': 2,
        'enable.idempotence': True,
        'isolation.level': 'read_committed'}

producer = Producer(conf)
producer.produce("test_topic2", key="pc", value="ok")
producer.flush()
Here is my consumer code:
from confluent_kafka import Consumer, TopicPartition

conf = {'bootstrap.servers': 'localhost:9092,localhost:9093,localhost:9094',
        'group.id': "foo2",
        'auto.offset.reset': 'earliest',
        'enable.idempotence': True,
        'isolation.level': 'read_committed'}

consumer = Consumer(conf)
topic = "test_topic2"
#consumer.subscribe([topic],on_assign=my_on_assign)
consumer.subscribe([topic])

# Poll for new messages from Kafka and print them.
try:
    while True:
        msg = consumer.poll()
        if msg is None:
            print("Waiting...")
        elif msg.error():
            print("ERROR: {}".format(msg.error()))
        else:
            # Extract the (optional) key and value, and print.
            print(f"{msg.topic()} key = {msg.key()} value = {msg.value().decode('utf-8'):12} "
                  f"{msg.partition()} {msg.offset()}")
except KeyboardInterrupt:
    pass
finally:
    # Leave group and commit final offsets
    consumer.close()
And this is my output (topic name, key, value, partition and offset, respectively):
test_topic2 key = b'pc' value = ok 2 3
test_topic2 key = b'pc' value = ok 2 4
As I understand it, when I send the same key and value the messages should be deduplicated so that only one of them shows up. Here, for example, I would expect to get only the message at offset 3. So my question is: am I implementing the wrong architecture in my code, or do I misunderstand the concept? Thanks in advance.
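For reference, Kafka's exactly-once guarantees cover duplicates caused by internal producer retries (idempotence) and atomic produce/consume cycles (transactions); they do not deduplicate two separate produce() calls that happen to carry the same key and value, which is why both offsets 3 and 4 show up. Below is a minimal transactional-producer sketch with confluent-kafka; the transactional.id value is an illustrative placeholder, and note that enable.idempotence is a producer-side setting while isolation.level is consumer-side:
```
from confluent_kafka import Producer

# Sketch: transactional producer; 'demo-transactional-id' is a placeholder.
producer = Producer({
    'bootstrap.servers': 'localhost:9092,localhost:9093,localhost:9094',
    'enable.idempotence': True,                # producer-side setting
    'transactional.id': 'demo-transactional-id',
})

producer.init_transactions()
producer.begin_transaction()
producer.produce('test_topic2', key='pc', value='ok')
producer.commit_transaction()                  # a consumer with isolation.level=read_committed
                                               # only sees messages from committed transactions
```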

How to add configuration setting for sasl.mechanism PLAIN (API) and GSSAPI (Kerberos) authentication in python script

Need some help setting the configuration for sasl.mechanism PLAIN (API) and GSSAPI (Kerberos) authentication.
We are using Confluent Kafka here. There are two scripts: a Python script and a bash script which calls the Python one. You can find the script below.
Thanks for the help in advance!
import json
import os
import string
import random
import socket
import uuid
import re
from datetime import datetime
import time
import hashlib
import math
import sys
from functools import cache
from confluent_kafka import Producer, KafkaError, KafkaException
topic_name = os.environ['TOPIC_NAME']
partition_count = int(os.environ['PARTITION_COUNT'])
message_key_template = json.loads(os.environ['KEY_TEMPLATE'])
message_value_template = json.loads(os.environ['VALUE_TEMPLATE'])
message_header_template = json.loads(os.environ['HEADER_TEMPLATE'])
bootstrap_servers = os.environ['BOOTSTRAP_SERVERS']
perf_counter_batch_size = int(os.environ.get('PERF_COUNTER_BATCH_SIZE', 100))
messages_per_aggregate = int(os.environ.get('MESSAGES_PER_AGGREGATE', 1))
max_message_count = int(os.environ.get('MAX_MESSAGE_COUNT', sys.maxsize))
def error_cb(err):
    """ The error callback is used for generic client errors. These
    errors are generally to be considered informational as the client will
    automatically try to recover from all errors, and no extra action
    is typically required by the application.
    For this example however, we terminate the application if the client
    is unable to connect to any broker (_ALL_BROKERS_DOWN) and on
    authentication errors (_AUTHENTICATION). """
    print("Client error: {}".format(err))
    if err.code() == KafkaError._ALL_BROKERS_DOWN or \
            err.code() == KafkaError._AUTHENTICATION:
        # Any exception raised from this callback will be re-raised from the
        # triggering flush() or poll() call.
        raise KafkaException(err)

def acked(err, msg):
    if err is not None:
        print("Failed to send message: %s: %s" % (str(msg), str(err)))

producer_configs = {
    'bootstrap.servers': bootstrap_servers,
    'client.id': socket.gethostname(),
    'error_cb': error_cb
}
# TODO: Need to support sasl.mechanism PLAIN (API) and GSSAPI (Kerberos) authentication.
# TODO: Need to support truststores for connecting to private DCs.
producer = Producer(producer_configs)
# generates a random value if it is not cached in the template_values dictionary
def get_templated_value(term, template_values):
    if term not in template_values:
        template_values[term] = str(uuid.uuid4())
    return template_values[term]

def fill_template_value(value, template_values):
    str_value = str(value)
    template_regex = '{{(.+?)}}'
    templated_terms = re.findall(template_regex, str_value)
    for term in templated_terms:
        str_value = str_value.replace(f"{{{{{term}}}}}", get_templated_value(term, template_values))
    return str_value

def fill_template(template, templated_terms):
    # TODO: Need to address metadata field, as it's treated as a string instead of a nested object.
    return {field: fill_template_value(value, templated_terms) for field, value in template.items()}

@cache
def get_partition(lock_id):
    bits = 128
    bucket_size = 2**bits / partition_count
    partition = (int(hashlib.md5(lock_id.encode('utf-8')).hexdigest(), 16) / bucket_size)
    return math.floor(partition)
sequence_number = int(time.time() * 1000)
sequence_number = 0
message_count = 0
producing = True
start_time = time.perf_counter()
aggregate_message_counter = 0
# cache for templated term values so that they match across the different templates
templated_values = {}
try:
    while producing:
        sequence_number += 1
        aggregate_message_counter += 1
        message_count += 1
        if aggregate_message_counter % messages_per_aggregate == 0:
            # reset templated values
            templated_values = {}
        else:
            for term in list(templated_values):
                if term not in ['aggregateId', 'tenantId']:
                    del(templated_values[term])
        # Fill in templated field values
        message_key = fill_template(message_key_template, templated_values)
        message_value = fill_template(message_value_template, templated_values)
        message_header = fill_template(message_header_template, templated_values)
        ts = datetime.utcnow().isoformat()[:-3]+'Z'
        message_header['timestamp'] = ts
        message_header['sequence_number'] = str(sequence_number)
        message_value['timestamp'] = ts
        message_value['sequenceNumber'] = sequence_number
        lock_id = message_header['lock_id']
        # partition by lock_id, since key could be random, but a given aggregate_id
        # should ALWAYS resolve to the same partition, regardless of key.
        partition = get_partition(lock_id)
        # Send message
        producer.produce(topic_name, partition=partition, key=json.dumps(message_key),
                         value=json.dumps(message_value), headers=message_header, callback=acked)
        if sequence_number % perf_counter_batch_size == 0:
            producer.flush()
            end_time = time.perf_counter()
            total_duration = end_time - start_time
            messages_per_second = (perf_counter_batch_size/total_duration)
            print(f'{messages_per_second} messages/second')
            # reset start time
            start_time = time.perf_counter()
        if message_count >= max_message_count:
            break
except Exception as e:
    print(f'ERROR: %s' % e)
    sys.exit(1)
finally:
    producer.flush()
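Regarding the TODOs about SASL and truststores: with confluent-kafka (librdkafka) the authentication settings go directly into the producer configuration dictionary. A hedged sketch of both variants follows; the broker address, credentials, keytab path and CA bundle path are placeholders, not values from the question:
```
# SASL/PLAIN (e.g. username/password or API key/secret style authentication)
sasl_plain_configs = {
    'bootstrap.servers': 'broker1:9093',        # placeholder
    'security.protocol': 'SASL_SSL',
    'sasl.mechanisms': 'PLAIN',
    'sasl.username': '<API_KEY>',
    'sasl.password': '<API_SECRET>',
}

# SASL/GSSAPI (Kerberos)
sasl_gssapi_configs = {
    'bootstrap.servers': 'broker1:9093',        # placeholder
    'security.protocol': 'SASL_SSL',
    'sasl.mechanisms': 'GSSAPI',
    'sasl.kerberos.service.name': 'kafka',
    'sasl.kerberos.principal': 'myclient@EXAMPLE.COM',  # placeholder
    'sasl.kerberos.keytab': '/path/to/client.keytab',   # placeholder
}

# For the truststore TODO, librdkafka takes a CA bundle path:
# 'ssl.ca.location': '/path/to/ca-bundle.pem'
```
Either dictionary can be merged into producer_configs before constructing the Producer.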

IBM Stats subscription topic always returns Reason 2033: FAILED: MQRC_NO_MSG_AVAILABLE

I am attempting to port some old Java code to Python.
I am using pymqi to connect to a queue manager and query all message flow statistics topics using the topic string: $SYS/Broker/+/StatisticsAccounting/Archive/#
When using the existing Java program, messages are read from the topic without issue.
The new Python code is able to connect and subscribe to the topic without issue, but it always gives the message
Reason 2033: FAILED: MQRC_NO_MSG_AVAILABLE
Stats messages are published by the broker for each message flow every 10 minutes, and I have left the new code running for over 30 minutes without ever receiving a message.
I've also tried setting
get_opts['WaitInterval'] = pymqi.CMQC.MQWI_UNLIMITED
and sitting around for 20 minutes rather than using a loop, but no luck.
Is there any IIB server config that might be impacting the messages that I am able to see, or are there other options I should be using within the client?
import pymqi
queue_manager = 'MYQM'
channel = 'MYAPP.SVRCONN'
host = 'MYHOST'
port = 'MYPORT'
topic_string = '$SYS/Broker/+/StatisticsAccounting/Archive/#'
conn_info = '%s(%s)' % (host, port)
user = ""
password = ""
qmgr = pymqi.QueueManager(None)
qmgr.connect_tcp_client(queue_manager, pymqi.CD(), channel, conn_info, user, password)
sub_desc = pymqi.SD()
sub_desc['Options'] = pymqi.CMQC.MQSO_CREATE + pymqi.CMQC.MQSO_RESUME + pymqi.CMQC.MQSO_MANAGED
sub_desc.set_vs('SubName', 'apptest')
sub_desc.set_vs('ObjectString', topic_string)
sub = pymqi.Subscription(qmgr)
sub.sub(sub_desc=sub_desc)
get_opts = pymqi.GMO(Options=pymqi.CMQC.MQGMO_WAIT)
get_opts['WaitInterval'] = 10000
md = pymqi.md()
keep_running = True
while keep_running:
    try:
        # Reset the MsgId, CorrelId & GroupId so that we can reuse
        # the same 'md' object again.
        md.MsgId = pymqi.CMQC.MQMI_NONE
        md.CorrelId = pymqi.CMQC.MQCI_NONE
        md.GroupId = pymqi.CMQC.MQGI_NONE
        message = sub.get(None, md, get_opts)
        print('Have message from Queue')
        print(message)
    except pymqi.MQMIError as e:
        if e.comp == pymqi.CMQC.MQCC_FAILED and e.reason == pymqi.CMQC.MQRC_NO_MSG_AVAILABLE:
            print("no message?")
            print(e)
        else:
            # Some other error condition.
            raise
    except (UnicodeDecodeError, ValueError) as e:
        print('Message is not valid json')
        print(e)
        print(message)
        continue
    except KeyboardInterrupt:
        print('Have received a keyboard interrupt')
        keep_running = False
sub.close(sub_close_options=0,close_sub_queue=True)
qmgr.disconnect()

Getting no msgs when reading Azure Event Hub with Python using EventHubClient

I have an Azure Event Hub that has messages in it. I wrote the messages with a Python app and can see the correct message count in the Event Hub GUI. But I cannot seem to read the messages with Python. My code is below. It runs with no errors but gets zero results.
Oddly, after I run this code, the Event Hub GUI shows that all the messages (a couple of thousand) were outgoing, indicating that my program actually got them. But the code never displays them.
Any help appreciated!
The result is always...
Msg offset: <azure.eventhub.common.Offset object at 0x102fc4e10>
Msg seq: 0
Msg body: 0
Received 1 messages in 0.11292386054992676 seconds
++++++++++++
# pip install azure-eventhub
import logging
import time
from azure.eventhub import EventHubClient, Receiver, Offset
logger = logging.getLogger("azure")
# URL of the event hub, amqps://<mynamespace>.servicebus.windows.net/myeventhub
ADDRESS = "amqps://chc-eh-ns.servicebus.windows.net/chc-eh"
# Access tokens for event hub namespace, from Azure portal for namespace
USER = "RootManageSharedAccessKey"
KEY = "XXXXXXXXXXXXXXXXXXXXXXXXXX"
# Additional setup to receive events
CONSUMER_GROUP = "$default" # our view of the event hub, useful when there is more than one consumer at same time
PARTITION = "0" # which stream within event hub
OFFSET = Offset("-1") # get all msgs in event hub. msgs are never removed, they just expire per event hub settings
PREFETCH = 100 # not sure exactly what this does ??
# Initialize variables
total = 0
last_sn = -1
last_offset = -1
client = EventHubClient(ADDRESS, debug=False, username=USER, password=KEY)
try:
    receiver = client.add_receiver(CONSUMER_GROUP, PARTITION, prefetch=PREFETCH, offset=OFFSET)
    client.run()
    start_time = time.time()
    for event_data in receiver.receive(timeout=100):
        last_offset = event_data.offset
        last_sn = event_data.sequence_number
        print("Msg offset: " + str(last_offset))
        print("Msg seq: " + str(last_sn))
        print("Msg body: " + event_data.body_as_str())
        total += 1
    end_time = time.time()
    client.stop()
    run_time = end_time - start_time
    print("\nReceived {} messages in {} seconds".format(total, run_time))
except KeyboardInterrupt:
    pass
finally:
    client.stop()
Got it! The code sample I copied was wrong. You have to get batches of messages then iterate over each batch. Here is the correct code...
# pip install azure-eventhub
import time
from azure.eventhub import EventHubClient, Offset
# URL of the event hub, amqps://<mynamespace>.servicebus.windows.net/myeventhub
ADDRESS = "amqps://chc-eh-ns.servicebus.windows.net/chc-eh"
# Access tokens for event hub namespace, from Azure portal for namespace
USER = "RootManageSharedAccessKey"
KEY = "XXXXXXXXXXXX"
# Additional setup to receive events
CONSUMER_GROUP = "$default" # our view of the event hub, useful when there is more than one consumer at same time
PARTITION = "0" # which stream within event hub
OFFSET = Offset("-1") # get all msgs in event hub. msgs are never removed, they just expire per event hub settings
PREFETCH = 100 # batch size ??
# Initialize variables
total = 0
last_sn = -1
last_offset = -1
client = EventHubClient(ADDRESS, debug=False, username=USER, password=KEY)
try:
    receiver = client.add_receiver(CONSUMER_GROUP, PARTITION, prefetch=PREFETCH, offset=OFFSET)
    client.run()
    start_time = time.time()
    batch = receiver.receive(timeout=5000)
    while batch:
        for event_data in batch:
            last_offset = event_data.offset
            last_sn = event_data.sequence_number
            print("Msg offset: " + str(last_offset))
            print("Msg seq: " + str(last_sn))
            print("Msg body: " + event_data.body_as_str())
            total += 1
        batch = receiver.receive(timeout=5000)
    end_time = time.time()
    client.stop()
    run_time = end_time - start_time
    print("\nReceived {} messages in {} seconds".format(total, run_time))
except KeyboardInterrupt:
    pass
finally:
    client.stop()

How to create Transform in Memsql when source is Kafka Avro Format

I am able to push data from Kafka to MemSQL.
I am trying to push it using a transform. I have created a Kafka consumer in Python which consumes data from a Kafka topic and converts it to JSON format.
I don't know how to use this as a transform in MemSQL.
from confluent_kafka import KafkaError
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError
import sys
c = AvroConsumer({
    'bootstrap.servers': 'X.Y.Z.W:9092',
    'group.id': 'groupid1112',
    'schema.registry.url': 'http://X.Y.Z.W:8081',
    'default.topic.config': {
        'auto.offset.reset': 'smallest'
    }
})

c.subscribe(['test_topic'])

count = 0
while True:
    try:
        msg = c.poll(10)
    except SerializerError as e:
        print("Message deserialization failed for {}: {}".format(msg, e))
        break

    if msg is None:
        continue

    if msg.error():
        if msg.error().code() == KafkaError._PARTITION_EOF:
            continue
        else:
            print(msg.error())
            break

    valueList = list(msg.value().values())
    print(valueList)

c.close()
It's printing
[1518776144187, 1, 2, 103,'asas',asas'eer',None]
Check these docs:
https://docs.memsql.com/memsql-pipelines/v6.0/transforms/
Stay tuned for native Avro support in an upcoming MemSQL release.
You'll want to do something like the following, but with me sketching over the Avro-specific details since I don't know the Avro library off the top of my head.
```
import struct
import sys

def input_stream():
    """
    Consume STDIN and yield each record that is received from MemSQL
    """
    while True:
        byte_len = sys.stdin.read(8)
        if len(byte_len) == 8:
            byte_len = struct.unpack("L", byte_len)[0]
            result = sys.stdin.read(byte_len)
            yield result
        else:
            assert len(byte_len) == 0, byte_len
            return

avro_context = WhateverYouNeed()  # maybe connect to schema registry here if you need to

for msg in input_stream():
    object = DeserializeAvro(avro_context, msg)  # this is your code
    sys.stdout.write(SerializeToTSV(object))  # also your code
```
Using the schema registry should be fine, but you shouldn't need to worry about the details of reading from Kafka in your transform script. I can try to get you a more detailed script on Monday, but this is how to structure the code.
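For comparison, here is a hedged, self-contained version of that skeleton that skips Avro entirely and just turns JSON records (like the ones the question's consumer prints) into TSV; the field ordering and the assumption that every record is valid JSON are illustrative, not from the question:
```
#!/usr/bin/env python3
# MemSQL transform sketch: read length-prefixed records from stdin,
# parse each one as JSON, write tab-separated values to stdout.
import json
import struct
import sys

def input_stream():
    """Yield each length-prefixed record MemSQL writes to stdin."""
    while True:
        header = sys.stdin.buffer.read(8)
        if len(header) == 0:
            return
        assert len(header) == 8, header
        (byte_len,) = struct.unpack("L", header)
        yield sys.stdin.buffer.read(byte_len)

for raw in input_stream():
    record = json.loads(raw)
    sys.stdout.write("\t".join(str(v) for v in record.values()) + "\n")
```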
