How to implement exactly-once on Apache Kafka using Python? - python

I am trying to implement exactly-once semantics using the confluent-kafka library, but I couldn't get it to work. Here is my producer code:
from confluent_kafka import Producer, SerializingProducer
import socket, json, random, time

conf = {'bootstrap.servers': "localhost:9092,localhost:9093,localhost:9094",
        'client.id': socket.gethostname(),
        'message.send.max.retries': 2,
        'enable.idempotence': True,
        'isolation.level': 'read_committed'}

producer = Producer(conf)
producer.produce("test_topic2", key="pc", value="ok")
producer.flush()
Here is my consumer code:
from confluent_kafka import Consumer, TopicPartition

conf = {'bootstrap.servers': 'localhost:9092,localhost:9093,localhost:9094',
        'group.id': "foo2",
        'auto.offset.reset': 'earliest',
        'enable.idempotence': True,
        'isolation.level': 'read_committed'}

consumer = Consumer(conf)
topic = "test_topic2"
# consumer.subscribe([topic], on_assign=my_on_assign)
consumer.subscribe([topic])
# Poll for new messages from Kafka and print them.
try:
    while True:
        msg = consumer.poll()
        if msg is None:
            print("Waiting...")
        elif msg.error():
            print("ERROR: {}".format(msg.error()))
        else:
            # Extract the (optional) key and value, and print.
            print(f"{msg.topic()} key = {msg.key()} value = {msg.value().decode('utf-8'):12} "
                  f"{msg.partition()} {msg.offset()}")
except KeyboardInterrupt:
    pass
finally:
    # Leave group and commit final offsets
    consumer.close()
And this is my output (topic name, key, value, partition, and offset, respectively):
test_topic2 key = b'pc' value = ok 2 3
test_topic2 key = b'pc' value = ok 2 4
As I understand it, when I send the same key and value twice, only one of them should be kept; here, for example, I would expect to get only the message at offset 3. So my question is: am I implementing the wrong architecture in my code, or am I misunderstanding the concept? Thanks in advance.
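For reference, idempotence only de-duplicates broker-side retries of the same produce request; two separate produce() calls with the same key and value are two distinct records, so seeing offsets 3 and 4 is expected. (Also note that enable.idempotence is a producer setting and isolation.level a consumer setting.) Exactly-once pipelines in Kafka are usually built on the transactional API instead: the producer gets a transactional.id and the consumed offsets are committed inside the producer's transaction. A minimal, untested sketch of such a consume-transform-produce loop with confluent-kafka, with illustrative topic and group names:
```
from confluent_kafka import Producer, Consumer

# Producer side: idempotence is implied by transactional.id
producer = Producer({'bootstrap.servers': 'localhost:9092',
                     'transactional.id': 'my-transactional-app'})
producer.init_transactions()

# Consumer side: only read committed records and let the producer
# commit the offsets instead of the consumer.
consumer = Consumer({'bootstrap.servers': 'localhost:9092',
                     'group.id': 'foo2',
                     'isolation.level': 'read_committed',
                     'enable.auto.commit': False})
consumer.subscribe(['test_topic2'])

while True:
    msg = consumer.poll(1.0)
    if msg is None or msg.error():
        continue
    producer.begin_transaction()
    producer.produce('output_topic', key=msg.key(), value=msg.value())
    # Commit the consumed offsets as part of the same transaction
    producer.send_offsets_to_transaction(
        consumer.position(consumer.assignment()),
        consumer.consumer_group_metadata())
    producer.commit_transaction()
```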

Related

Concurrency issue data corruption asyncio python-can locking/queues - nested dictionaries

I am at the end of a very long journey...
Here is the long story if you are interested.
https://github.com/hardbyte/python-can/issues/1336
Sorry for the incredibly long code snippet but I am not sure where I am going wrong so I thought more is more.
The code works as follows: request_inst() requests instrumentation data from an MCU using the dict request_info, the MCU responds, and the response is picked up by the listener. message_obtain() awaits msg = await reader.get_message() and passes each message to store_data(), which stores the response data in a dict called all_data. I attempt to structure this process with a lock. The purpose of the code is to make all_data available outside of the event loop, but even with this implementation all_data appears with zero values when printed outside of the listener, as shown below.
import asyncio
import can
from can.notifier import MessageRecipient
from typing import List

freq = 0.0003

# These are the response ids and the parameter ids of the data.
# Stored data is supposed to fill up the None with a value.
all_data = {268439810: {16512: [None], 16513: [None], 16514: [None], 16515: [None]},
            268444162: {16512: [None], 16513: [None], 16514: [None], 16515: [None]}}
request_info = {286326784: {16512, 16513, 16514, 16515},
                287440896: {16512, 16513, 16514, 16515}}
# All the request ids that have been configured
cm4_read_ids = [286326784, 287440896]
# All the response ids that have been configured
mcu_respond_ids = [268439810, 268444162]


# Finds the arbitration id and pid and logs the data from the message
# async def store_data(arb_id, msg_data, msg_dlc):
async def store_data(msg: can.Message, lock):
    pid = int.from_bytes(msg.data[0:2], 'little')
    arb_id = msg.arbitration_id
    if arb_id in mcu_respond_ids:
        async with lock:
            if msg.dlc == 5:
                all_data[arb_id][pid][0] = int.from_bytes(msg.data[2:4], 'little', signed=False)
            elif msg.dlc == 7:
                all_data[arb_id][pid][0] = int.from_bytes(msg.data[2:6], 'little', signed=False)
    return all_data


async def request_inst(bus: can.Bus):
    print('Request inst active')
    while True:
        for key in request_info:
            for val in request_info[key]:
                pid = int(val)
                pidbytes = pid.to_bytes(2, 'little')
                msg = can.Message(arbitration_id=key, data=pidbytes)
                bus.send(msg)
                await asyncio.sleep(freq)
        # await store_data(reader)


async def message_obtain(reader: can.AsyncBufferedReader, lock):
    print('Started the get message process')
    while True:
        await asyncio.sleep(0.01)
        msg = await reader.get_message()
        future = await store_data(msg, lock)
        async with lock:
            print('This is the future')
            print(future)


async def main() -> None:
    with can.Bus(
        interface="socketcan", channel="can0", receive_own_messages=True
    ) as bus:
        # Create Notifier with an explicit loop to use for scheduling of callbacks
        loop = asyncio.get_running_loop()
        reader = can.AsyncBufferedReader()
        lock = asyncio.Lock()
        listeners: List[MessageRecipient] = [reader]
        notifier = can.Notifier(bus, listeners, loop=loop)
        try:
            task1 = asyncio.create_task(request_inst(bus))
            task2 = asyncio.create_task(message_obtain(reader, lock))
            await asyncio.gather(task1, task2)
        except KeyboardInterrupt:
            notifier.stop()
            bus.shutdown()


if __name__ == "__main__":
    asyncio.run(main())
The issue is that even in this cut-down, one-page example I am seeing what I believe to be concurrency issues.
As you can see I have tried locking, but I am still seeing these zero values appear in all_data. See below, where pid 16514's value becomes 0 even though the messages being returned by the MCU are not zero.
N.B. The output below where the incorrect data is shown is the output from print(future).
The real value should never be 0 as it is a measured value.
b6 = (1016872961).to_bytes(4, 'little')
struct.unpack('<f', b6)
(0.01907348819077015,)
Am I doing anything very stupid? It feels like I am not accessing the data from the listener correctly, despite using the lock whenever all_data is modified.
If I print from the listener, the data is always correct even when all_data is showing 0 values.
If anyone is able to help me it would be much appreciated.
It appears the problem was not software related and the zeros were real. Every day is a learning day!
This is a PCAN image of the highlighted send and a response shown at the top.
EDIT: Confirmed MCU response issue - looks like the firmware on the MCU. My Rigol wouldn't trigger on the ID, so I had to record video at 1/8 speed and then screenshot that to catch it in the act. You can see the response is all 7 bytes of nothing.
Ok ok ok, it turns out I was doing something really silly. I was using nested dictionaries to store data about different ids, but the keys were the same. After some investigation using id(id1[some_pid1]) and id(id2[some_pid1]) I discovered the two ids' entries had the same memory address.
data_dict = {id1: {some_pid1: value, some_pid2: value},
             id2: {some_pid1: value, some_pid2: value}}
To all appearances this was a race condition, but actually I was just writing zeros (which turned out to be forced by the MCU) to the wrong id, because it shared an entry with the other id.
Whoops
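The aliasing described above is a classic Python pitfall: if both inner dictionaries are built from a single shared object (for example via dict.fromkeys or by reusing one template dict), a write through one outer id also shows up under the other. A small sketch of the difference, with made-up ids and pids rather than the ones from the question:
```
# Hypothetical ids/pids for illustration only
ids = [0x100, 0x200]
pids = [1, 2]

# Pitfall: dict.fromkeys reuses ONE inner dict for every outer key,
# so writing shared[0x100][1] also changes shared[0x200][1].
shared = dict.fromkeys(ids, {pid: [None] for pid in pids})
shared[0x100][1][0] = 42
print(shared[0x200][1][0])       # 42 -- aliased

# Fix: build a fresh inner dict (and fresh inner lists) per outer key.
independent = {i: {pid: [None] for pid in pids} for i in ids}
independent[0x100][1][0] = 42
print(independent[0x200][1][0])  # None -- independent
```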

CanĀ“t consume messages from topic

I have the following code:
from confluent_kafka.admin import AdminClient, NewTopic

a = AdminClient({'bootstrap.servers': 'localhost:9092'})

new_topics = [NewTopic(topic, num_partitions=3, replication_factor=1) for topic in ["topic1", "topic2"]]
fs = a.create_topics(new_topics)

for topic, f in fs.items():
    try:
        f.result()
        print("Topic {} created".format(topic))
    except Exception as e:
        print("Failed to create topic {}: {}".format(topic, e))
Creating the topics worked fine.
This is my producer:
from confluent_kafka import Producer

p = Producer({'bootstrap.servers': 'localhost:9092'})

some_data_source = ["hello", "wuff"]

def delivery_report(err, msg):
    if err is not None:
        print('Message delivery failed: {}'.format(err))
    else:
        print('Message delivered to {} [{}]'.format(msg.topic(), msg.partition()))

for data in some_data_source:
    p.poll(0)
    p.produce('mytopic', data.encode('utf-8'), callback=delivery_report)

p.flush()
Message delivered to mytopic [0]
Message delivered to mytopic [0]
Consumer:
from confluent_kafka import Consumer

c = Consumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'mygroup',
    'auto.offset.reset': 'earliest'
})

c.subscribe(['topic1'])

while True:
    msg = c.poll(1.0)
    print(msg)

    if msg is None:
        continue
    if msg.error():
        print("Consumer error: {}".format(msg.error()))
        continue

    print('Received message: {}'.format(msg.value().decode('utf-8')))

c.close()
When I subscribe to the topic (which works), I only get None every second. Am I doing something wrong here? Does it have something to do with 'group.id': 'mygroup'? Can anyone help me?
Your producer code is writing to the mytopic topic, which doesn't match your create script or the topic your consumer has subscribed to.
Also, if you don't want it to print None, move the print statement below the if msg is None check, since the poll function can return None.
As commented, you may also want to try further debugging with CLI tools.
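Putting those two suggestions together, a minimal sketch of the adjusted consumer could look like this (assuming you keep producing to mytopic; alternatively, change the producer to write to topic1):
```
from confluent_kafka import Consumer

c = Consumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'mygroup',
    'auto.offset.reset': 'earliest'
})

# Subscribe to the same topic the producer writes to
c.subscribe(['mytopic'])

try:
    while True:
        msg = c.poll(1.0)
        if msg is None:          # nothing arrived within the timeout
            continue
        if msg.error():
            print("Consumer error: {}".format(msg.error()))
            continue
        print('Received message: {}'.format(msg.value().decode('utf-8')))
finally:
    c.close()
```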

IBM Stats subscription topic always returns Reason 2033: FAILED: MQRC_NO_MSG_AVAILABLE

I am attempting to port some old Java code to Python.
I am using pymqi to connect to a queue manager and query for all message flow statistics topics using the topic string: $SYS/Broker/+/StatisticsAccounting/Archive/#
When using the existing Java program, messages are read from the topic without issue.
When using the new python code it is able to connect and query the topic without issue but always gives the message
Reason 2033: FAILED: MQRC_NO_MSG_AVAILABLE
Stats messages are published by the broker for each message flow every 10 minutes, and I have left the new code running for over 30 minutes without ever receiving a message.
I've also tried setting
get_opts['WaitInterval'] = pymqi.CMQC.MQWI_UNLIMITED
and sitting around for 20 minutes rather than using a loop, but no luck.
Is there any IIB server config that might be impacting the messages that I am able to see, or are there other options I should be using within the client?
import pymqi

queue_manager = 'MYQM'
channel = 'MYAPP.SVRCONN'
host = 'MYHOST'
port = 'MYPORT'
topic_string = '$SYS/Broker/+/StatisticsAccounting/Archive/#'
conn_info = '%s(%s)' % (host, port)
user = ""
password = ""

qmgr = pymqi.QueueManager(None)
qmgr.connect_tcp_client(queue_manager, pymqi.CD(), channel, conn_info, user, password)

sub_desc = pymqi.SD()
sub_desc['Options'] = pymqi.CMQC.MQSO_CREATE + pymqi.CMQC.MQSO_RESUME + pymqi.CMQC.MQSO_MANAGED
sub_desc.set_vs('SubName', 'apptest')
sub_desc.set_vs('ObjectString', topic_string)

sub = pymqi.Subscription(qmgr)
sub.sub(sub_desc=sub_desc)

get_opts = pymqi.GMO(Options=pymqi.CMQC.MQGMO_WAIT)
get_opts['WaitInterval'] = 10000

md = pymqi.md()

keep_running = True
while keep_running:
    try:
        # Reset the MsgId, CorrelId & GroupId so that we can reuse
        # the same 'md' object again.
        md.MsgId = pymqi.CMQC.MQMI_NONE
        md.CorrelId = pymqi.CMQC.MQCI_NONE
        md.GroupId = pymqi.CMQC.MQGI_NONE

        message = sub.get(None, md, get_opts)
        print('Have message from Queue')
        print(message)
    except pymqi.MQMIError as e:
        if e.comp == pymqi.CMQC.MQCC_FAILED and e.reason == pymqi.CMQC.MQRC_NO_MSG_AVAILABLE:
            print("no message?")
            print(e)
        else:
            # Some other error condition.
            raise
    except (UnicodeDecodeError, ValueError) as e:
        print('Message is not valid json')
        print(e)
        print(message)
        continue
    except KeyboardInterrupt:
        print('Have received a keyboard interrupt')
        keep_running = False

sub.close(sub_close_options=0, close_sub_queue=True)
qmgr.disconnect()

Processing event hub data using python

I am using the Azure Event Hub Python SDK to send messages to and receive messages from an event hub, following this link: https://github.com/Azure/azure-event-hubs-python/tree/develop. I can successfully send and receive messages, but how do I parse the messages and retrieve the data from the event data object? Please find the code below.
import os
import sys
#import logging
from azure.eventhub import EventHubClient, Receiver, Offset

ADDRESS = 'sb://####.servicebus.windows.net/#####'
USER = '##########'
KEY = '##################################'
CONSUMER_GROUP = "$default"
OFFSET = Offset("-1")
PARTITION = "1"

total = 0
last_sn = -1
last_offset = "-1"

try:
    if not ADDRESS:
        raise ValueError("No EventHubs URL supplied.")

    client = EventHubClient(ADDRESS, debug=False, username=USER, password=KEY)
    receiver = client.add_receiver(CONSUMER_GROUP, PARTITION, prefetch=5000,
                                   offset=OFFSET)
    client.run()
    try:
        batched_events = receiver.receive(timeout=20)
    except:
        raise
    finally:
        client.stop()

    for event_data in batched_events:
        last_offset = event_data.offset.value
        last_sn = event_data.sequence_number
        total += 1
        print("Partition {}, Received {}, sn={} offset={}".format(
            PARTITION,
            total,
            last_sn,
            last_offset))
except KeyboardInterrupt:
    pass
If I try to view the event_data received, I can see the message below.
event_data
<azure.eventhub.common.EventData at 0xd4f1358>
event_data.message
<uamqp.message.Message at 0xd4f1240>
Any help on how to parse this message to extract the data would be appreciated.
As of 1.1.0, there are new utility methods to extract the actual data of the message:
body_as_str
body_as_json
So, what used to be
import json
event_obj = json.loads(next(event_data.body).decode('UTF-8'))
Is now:
event_obj = event_data.body_as_json()
For people using the Event Hub version 5.2.0 -- latest as of today (GitHub, Reference Docs), it's the same as the 1.1.0 version, i.e. use body_as_str() or body_as_json(). But the client has changed -- there's an EventHubProducerClient and an EventHubConsumerClient in the new version. To print the body of an event received:
from azure.eventhub import EventHubConsumerClient

connection_str = '<< CONNECTION STRING FOR THE EVENT HUBS NAMESPACE >>'
consumer_group = '<< CONSUMER GROUP >>'
eventhub_name = '<< NAME OF THE EVENT HUB >>'

client = EventHubConsumerClient.from_connection_string(
    connection_str, consumer_group, eventhub_name=eventhub_name
)

def on_event_batch(partition_context, events):
    partition_context.update_checkpoint()
    for e in events:
        print(e.body_as_str())

with client:
    client.receive_batch(
        on_event_batch=on_event_batch,
        starting_position="-1",  # "-1" is from the beginning of the partition.
    )

How to create Transform in Memsql when source is Kafka Avro Format

I am able to push data from Kafka to MemSQL.
I am trying to push it using a transform. I have created a Kafka consumer in Python which consumes data from a Kafka topic and converts it to JSON format.
I don't know how to use this as a transform in MemSQL.
from confluent_kafka import KafkaError
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError
import sys

c = AvroConsumer({
    'bootstrap.servers': 'X.Y.Z.W:9092',
    'group.id': 'groupid1112',
    'schema.registry.url': 'http://X.Y.Z.W:8081',
    'default.topic.config': {
        'auto.offset.reset': 'smallest'
    }
})

c.subscribe(['test_topic'])

count = 0
while True:
    try:
        msg = c.poll(10)
    except SerializerError as e:
        print("Message deserialization failed for {}: {}".format(msg, e))
        break

    if msg is None:
        continue

    if msg.error():
        if msg.error().code() == KafkaError._PARTITION_EOF:
            continue
        else:
            print(msg.error())
            break

    valueList = list(msg.value().values())
    print(valueList)

c.close()
It prints:
[1518776144187, 1, 2, 103,'asas',asas'eer',None]
Check these docs:
https://docs.memsql.com/memsql-pipelines/v6.0/transforms/
Stay tuned for native Avro support in an upcoming MemSQL release.
You'll want to do something like the following, though I'm glossing over the Avro-specific details since I don't know the Avro library off the top of my head.
```
import struct
import sys

def input_stream():
    """
    Consume STDIN and yield each record that is received from MemSQL
    """
    while True:
        byte_len = sys.stdin.read(8)
        if len(byte_len) == 8:
            byte_len = struct.unpack("L", byte_len)[0]
            result = sys.stdin.read(byte_len)
            yield result
        else:
            assert len(byte_len) == 0, byte_len
            return

avro_context = WhateverYouNeed()  # maybe connect to schema registry here if you need to

for msg in input_stream():
    obj = DeserializeAvro(avro_context, msg)  # this is your code
    sys.stdout.write(SerializeToTSV(obj))     # also your code
```
Using the schema registry should be fine, but you shouldn't need to worry about the details of reading from Kafka in your transform script. I can try to get you a more detailed script Monday, but this is how to structure the code.
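To fill in the Avro-specific pieces left as "your code" above, one possible sketch (my own assumption, not MemSQL's or the answer's code) uses fastavro and the Confluent wire format, in which each value carries a magic byte and a 4-byte schema id before the Avro body; the schema here is a made-up example:
```
import io
import struct

import fastavro

# Hypothetical schema standing in for whatever the registry actually holds
SCHEMA = fastavro.parse_schema({
    "type": "record",
    "name": "Example",
    "fields": [
        {"name": "id", "type": "long"},
        {"name": "name", "type": ["null", "string"]},
    ],
})

def deserialize_avro(payload: bytes) -> dict:
    # Confluent wire format: 1 magic byte + 4-byte big-endian schema id, then the Avro body.
    # In practice you would look the schema up in the registry by schema_id.
    magic, schema_id = struct.unpack(">bI", payload[:5])
    return fastavro.schemaless_reader(io.BytesIO(payload[5:]), SCHEMA)

def serialize_to_tsv(record: dict) -> str:
    # Emit one tab-separated row per record for MemSQL to load
    return "\t".join("" if v is None else str(v) for v in record.values()) + "\n"
```
Since a schemaless reader needs the writer's schema, in a real transform you would fetch it from the schema registry (keyed by the schema id in each message) rather than hard-coding it.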
