Kafka producer and consumer not working properly in Python with Docker

I'm working on a project that uses a Kafka producer and consumer to acquire articles (on specific topics) from a news API every two hours and then save them to MongoDB via the consumer.
So I made three classes: one for KafkaAdminClient, one for KafkaProducer and one for KafkaConsumer.
My Kafka broker runs in a Docker container. The main application is a Flask app, and that's where I start all the threads, including the Kafka ones.
I've been changing a lot of little things, but the whole setup just seems very unstable and I don't know why. First, the data reaches the consumer, and eventually MongoDB, at a seemingly random time. Also, the old topics in the consumer don't get deleted, and the database keeps getting populated with both new and old values.
Now that I've put a group on the consumer and added the KafkaAdminClient class, I don't get messages in the consumer at all. All I get is this:
articleretrieval-flask_api-1 | WARNING:kafka.cluster:Topic health is not available during auto-create initialization
articleretrieval-flask_api-1 | WARNING:kafka.cluster:Topic business is not available during auto-create initialization
articleretrieval-flask_api-1 | WARNING:kafka.cluster:Topic war is not available during auto-create initialization
articleretrieval-flask_api-1 | WARNING:kafka.cluster:Topic motorsport is not available during auto-create initialization
articleretrieval-flask_api-1 | WARNING:kafka.cluster:Topic sources is not available during auto-create initialization
articleretrieval-flask_api-1 | WARNING:kafka.cluster:Topic science is not available during auto-create initialization
articleretrieval-flask_api-1 | WARNING:kafka.cluster:Topic technology is not available during auto-create initialization
articleretrieval-flask_api-1 | WARNING:kafka.cluster:Topic education is not available during auto-create initialization
articleretrieval-flask_api-1 | WARNING:kafka.cluster:Topic space is not available during auto-create initialization
articleretrieval-flask_api-1 | INFO:kafka.consumer.subscription_state:Updated partition assignment: []
articleretrieval-flask_api-1 | INFO:kafka.conn:<BrokerConnection node_id=bootstrap-0 host=kafka:29092 <connected> [IPv4 ('172.19.0.4', 29092)]>: Closing connection.
kafkaConsumerThread.py:
class KafkaConsumerThread:
    def __init__(self, topics, db, logger):
        self.topics = topics
        self.db = db
        self.logger = logger

    def start(self):
        self.logger.debug("Getting the kafka consumer")
        try:
            consumer = KafkaConsumer(bootstrap_servers=['kafka:29092'],
                                     auto_offset_reset='earliest',
                                     # group_id='my_group',
                                     enable_auto_commit=False,
                                     value_deserializer=lambda x: json.loads(x.decode('utf-8')))
        except NoBrokersAvailable as err:
            self.logger.error("Unable to find a broker: {0}".format(err))
            time.sleep(1)

        consumer.subscribe(self.topics + ["sources"])
        for message in consumer:
            self.logger(message)
            if message.topic == "sources":
                self.db.insert_source_info(message.value["source_name"], message.value["source_info"])
            else:
                self.db.insert_article(message.topic, [message.value])


def on_send_success(record_metadata):
    return
    # print(record_metadata.topic)
    # print(record_metadata.partition)


def on_send_error(excp):
    print(excp)


def call_apis(self, topics, news_api, media_api):
    try:
        producer = KafkaProducer(bootstrap_servers=['kafka:29092'],
                                 max_block_ms=100000,
                                 value_serializer=lambda x: json.dumps(x).encode('utf-8'))
    except NoBrokersAvailable as err:
        # self.logger.error("Unable to find a broker: {0}".format(err))
        time.sleep(1)
    domains = []
    try:
        if producer:
            for topic in topics:
                articles = news_api.get_articles(topic)
                for article in articles:
                    if article['source'] != '':
                        if article['source'] not in domains:
                            domains.append(article['source'])
                        producer.send(topic, value=article).add_callback(on_send_success).add_errback(on_send_error)
            producer.flush()
            for domain in domains:
                source_info = media_api.get_source_domain_info(domain)
                if source_info:
                    producer.send("sources", value={"source_name": domain, "source_info": source_info}).add_callback(on_send_success).add_errback(on_send_error)
            # Flush the producer to ensure all messages are sent
            producer.flush()
    except AttributeError:
        self.logger.error("Unable to send message. The producer does not exist.")


class KafkaProducerThread:
    def __init__(self, topics, logger):
        self.topics = topics
        self.news_api = NewsApi()
        self.media_api = MediaWikiApi()
        self.logger = logger

    def start(self):
        # Call the APIs immediately when the thread starts
        call_apis(self, self.topics, self.news_api, self.media_api)
        # Use a timer to schedule the next API call
        timer = Timer(7200, self.start)
        timer.start()
kafkaAdminClient.py:
class KafkaAdminThread:
    def __init__(self, topics):
        self.topics = topics

    def start(self):
        admin_client = KafkaAdminClient(
            bootstrap_servers=['kafka:29092'],
            client_id='my_client'
        )
        topic_list = []
        for topic in self.topics:
            topic_list.append(NewTopic(name=topic, num_partitions=1, replication_factor=1))
        admin_client.create_topics(new_topics=topic_list, validate_only=False)
app.py:
if __name__ == "__main__":
    # Creating a new connection with mongo
    # threading.Thread(target=lambda: app.run(port=8080, host="0.0.0.0", debug=True, use_reloader=False)).start()
    executor = ThreadPoolExecutor(max_workers=4)
    producerThread = KafkaProducerThread(TOPICS, logging)
    adminThread = KafkaAdminThread(TOPICS)
    executor.submit(adminThread.start)
    flaskThread = threading.Thread(target=lambda: app.run(port=8080, host="0.0.0.0", debug=True, use_reloader=False))
    executor.submit(flaskThread.start())
    time.sleep(15)
    executor.submit(producerThread.start)
    consumerThread = KafkaConsumerThread(TOPICS, db, logging)
    executor.submit(consumerThread.start)
docker-compose.yml:
zookeeper:
  image: wurstmeister/zookeeper
  ports:
    - "2181:2181"

kafka:
  container_name: kafka_broker_1
  image: wurstmeister/kafka
  links:
    - zookeeper
  ports:
    - "9092:9092"
    - "29092:29092"
  depends_on:
    - zookeeper
  environment:
    KAFKA_ADVERTISED_HOSTNAME: kafka
    KAFKA_ADVERTISED_LISTENERS: INSIDE://kafka:29092,OUTSIDE://localhost:9092
    KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INSIDE:PLAINTEXT,OUTSIDE:PLAINTEXT
    KAFKA_LISTENERS: INSIDE://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092
    KAFKA_INTER_BROKER_LISTENER_NAME: INSIDE
    KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
  volumes:
    - /var/run/docker.sock:/var/run/docker.sock

flask_api:
  build:
    context: .  # very important: this is where the root of the build context will be
    dockerfile: Dockerfile
  links:
    - kafka
  environment:
    - FLASK-KAFKA_BOOTSTRAP-SERVERS=kafka:29092
    - SERVER_PORT=8080
  ports:
    - "8080:8080"
  depends_on:
    - kafka

the old topics in the consumer don't get deleted
Nothing in the code you've shown deletes topics. The only way they would be deleted is if the Kafka container restarted, since you haven't mounted a volume for Kafka or Zookeeper to persist them.
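If you want topics and offsets to survive container restarts, you could mount volumes for both services. A rough sketch, assuming the default data directories of the wurstmeister images (check the image docs for the actual paths):

zookeeper:
  volumes:
    - ./zookeeper-data:/opt/zookeeper-3.4.13/data   # assumed dataDir of the image
kafka:
  volumes:
    - ./kafka-data:/kafka                           # wurstmeister keeps its log dirs under /kafka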
and the database keeps getting populated with new and old values.
I assume your producer isn't keeping track of what sources it has read so far? If so, you'll end up with duplicates in the topic. I suggest using kafka-console-consumer to debug whether the producer is actually working the way you want.
Likewise, you've disabled consumer auto commits, and I see no code committing manually, so when the consumer restarts, it'll re-process any existing data in the topics. Group / AdminClient settings shouldn't affect that, but setting a group will allow you to actually maintain offset tracking.
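For example, a minimal sketch of the consumer loop with a group and a manual commit (the group name 'article_consumers' is made up; everything else mirrors your code):

consumer = KafkaConsumer(bootstrap_servers=['kafka:29092'],
                         group_id='article_consumers',  # hypothetical group name
                         auto_offset_reset='earliest',
                         enable_auto_commit=False,
                         value_deserializer=lambda x: json.loads(x.decode('utf-8')))
consumer.subscribe(self.topics + ["sources"])
for message in consumer:
    # ... insert into MongoDB as before ...
    consumer.commit()  # commit offsets only after the write succeeded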
In terms of stability, I've used Flask and Kafka without threads before, and it has worked fine. At least as a producer... My suggestion would be to make a completely separate container for the consumer that's responsible for writing to the database; you don't need the overhead of the Flask framework for that. Or, better, use a Kafka Connect Mongo sink instead.
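For reference, a sink connector configuration is only a handful of properties; a sketch along these lines, assuming the official MongoDB Kafka connector (the connection URI, database and collection names are placeholders):

name=articles-mongo-sink
connector.class=com.mongodb.kafka.connect.MongoSinkConnector
topics=health,business,war,motorsport,science,technology,education,space,sources
connection.uri=mongodb://mongo:27017
database=articles
collection=articles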
The wurstmeister container supports creating topics on its own, via environment variables, by the way.
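For example, something along these lines in the compose file (the format is topic:partitions:replication-factor; the topic names are taken from your logs):

kafka:
  environment:
    KAFKA_CREATE_TOPICS: "health:1:1,business:1:1,war:1:1,motorsport:1:1,science:1:1,technology:1:1,education:1:1,space:1:1,sources:1:1"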

Related

uwsgi master graceful shutdown

I'm running a uwsgi+flask application.
The app is running as a k8s pod.
When I deploy a new pod (a new version), the existing pod gets SIGTERM.
This causes the master to stop accepting new connections at that moment, which causes issues because the LB still passes requests to the pod (for a few more seconds).
I would like the master to wait 30 seconds BEFORE it stops accepting new connections (when getting SIGTERM), but I couldn't find a way. Is it possible?
My uwsgi.ini file:
[uwsgi]
;https://uwsgi-docs.readthedocs.io/en/latest/HTTP.html
http = :8080
wsgi-file = main.py
callable = wsgi_application
processes = 2
enable-threads = true
master = true
reload-mercy = 30
worker-reload-mercy = 30
log-5xx = true
log-4xx = true
disable-logging = true
stats = 127.0.0.1:1717
stats-http = true
single-interpreter= true
;https://github.com/containous/traefik/issues/615
http-keepalive=true
add-header = Connection: Keep-Alive
It seems this is not possible to achieve using uwsgi:
https://github.com/unbit/uwsgi/issues/1974
The solution, as mentioned in this Kubernetes issue:
https://github.com/kubernetes/contrib/issues/1140
is to use the preStop hook. It's quite ugly, but it helps achieve zero downtime:
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: nginx
spec:
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
      - name: nginx
        image: nginx
        ports:
        - containerPort: 80
        lifecycle:
          preStop:
            exec:
              command: ["/bin/sleep","5"]
The template is taken from this answer: https://stackoverflow.com/a/39493421/3659858
Another option is to use the CLI option:
--hook-master-start "unix_signal:15 gracefully_kill_them_all"
or in the .ini file (remove the double quotes):
hook-master-start = unix_signal:15 gracefully_kill_them_all
which will gracefully terminate workers after receiving a SIGTERM (signal 15).
See the following for reference.
When I tried the above though, it didn't work as expected from within a docker container. Instead, you can also use uWSGI's Master FIFO file. The Master FIFO file can be specified like:
--master-fifo <filename>
or
master-fifo = /tmp/master-fifo
Then you can simply write a q character to the file and it will gracefully shut down your workers before exiting.
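For example, a preStop hook (or any shell) can trigger the graceful shutdown by writing to that FIFO:

echo q > /tmp/master-fifo   # 'q' asks the uWSGI master to gracefully shut down its workers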

elasticache redis - python - connection times out

I have a rather simple test app:
import redis
import os
import logging

log = logging.getLogger()
log.setLevel(logging.DEBUG)

def test_redis(event, context):
    redis_endpoint = None
    if "REDIS" in os.environ:
        redis_endpoint = os.environ["REDIS"]
        log.debug("redis: " + redis_endpoint)
    else:
        log.debug("cannot read REDIS config environment variable")
        return {
            'statusCode': 500
        }
    redis_conn = None
    try:
        redis_conn = redis.StrictRedis(host=redis_endpoint, port=6379, db=0)
        redis_conn.set("foo", "boo")
        redis_conn.get("foo")
    except:
        log.debug("failed to connect to redis")
        return {
            'statusCode': 500
        }
    finally:
        del redis_conn
    return {
        'statusCode': 200
    }
which I have deployed as an HTTP endpoint with Serverless:
#
# For full config options, check the docs:
#    docs.serverless.com
#
service: XXX

plugins:
  - serverless-aws-documentation
  - serverless-python-requirements

custom:
  pythonRequirements:
    dockerizePip: true

provider:
  name: aws
  stage: dev
  region: eu-central-1
  runtime: python3.6
  environment:
    # our cache
    REDIS: xx-xx-redis-001.xxx.euc1.cache.amazonaws.com

functions:
  hello:
    handler: hello/hello_world.say_hello
    events:
      - http:
          path: hello
          method: get
          # private: true # <-- Requires clients to add API keys values in the `x-api-key` header of their request
          # authorizer: # <-- An AWS API Gateway custom authorizer function
  testRedis:
    handler: test_redis/test_redis.test_redis
    events:
      - http:
          path: test-redis
          method: get
When I trigger the endpoint via API Gateway, the lambda just times out after about 7 seconds.
The environment variable is read properly and no error message is displayed.
I suppose there's a problem connecting to Redis, but the tutorials are quite explicit, so I'm not sure what the problem could be.
The problem might be that I need to set up a NAT; I'm not sure how to accomplish that with Serverless.
I ran into this issue as well. For me, there were a few problems that had to be ironed out:
The lambda needs VPC permissions.
The ElastiCache security group needs an inbound rule from the Lambda security group that allows communication on the Redis port. I thought they could just be in the same security group.
And the real kicker: I had turned on encryption in-transit. This meant that I needed to pass ssl=True to the Redis client. The redis-py page mentions that ssl_cert_reqs needs to be set to None for use with ElastiCache, but that didn't seem to be true in my case. I did, however, need to pass ssl=True.
It makes sense that ssl=True needed to be set but the connection was just timing out so I went round and round trying to figure out what the problem with the permissions/VPC/SG setup was.
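In code, that change is just an extra keyword argument on the client from the question (a sketch; everything else stays the same):

redis_conn = redis.StrictRedis(host=redis_endpoint, port=6379, db=0, ssl=True)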
Try putting the lambda in the same VPC and security group as your ElastiCache cluster.
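With the Serverless framework that usually means adding a vpc block under provider (or per function); the IDs below are placeholders:

provider:
  vpc:
    securityGroupIds:
      - sg-0123456789abcdef0        # security group allowed to reach the Redis port
    subnetIds:
      - subnet-0123456789abcdef0    # subnets with a route to the ElastiCache cluster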

kafka consumer on single node multi broker configuration

Has anybody worked with kafka-python in a single-node, multi-broker setup?
I was able to produce and consume data with the single-node, single-broker settings, but when I changed that to single-node multi-broker, data was produced and stored in the topics; however, when I ran the consumer code, the data was not consumed.
Any suggestions on the above would be appreciated. Thanks in advance!
Note: all the configurations, such as the producer, consumer and server properties, were verified and are fine.
Producer code:
from kafka.producer import KafkaProducer

def producer():
    data = {'desc': 'testing', 'data': 'testing single node multi broker'}
    topic = 'INTERNAL'
    producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8'),
                             bootstrap_servers=["localhost:9092", "localhost:9093", "localhost:9094"])
    producer.send(topic, data)
    producer.flush()
Consumer code:
from kafka.consumer import KafkaConsumer

def consumer():
    topic = 'INTERNAL'
    consumer = KafkaConsumer(topic, bootstrap_servers=["localhost:9092", "localhost:9093", "localhost:9094"])
    for data in consumer:
        print data
Server 1 config (I have added two more server files like this, with the same parameters, for the other brokers; they differ only in the broker.id and log.dirs values):
broker.id=1
port=9092
num.network.threads=3
log.dirs=/tmp/kafka-logs-1
num.partitions=3
num.recovery.threads.per.data.dir=1
log.retention.hours=168
log.segment.bytes=1073741824
log.retention.check.interval.ms=300000
log.cleaner.enable=false
zookeeper.connect=localhost:2181
delete.topic.enable=true
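For reference, a second broker file would then look roughly like this (a sketch based on the description above; note that the port must also differ when all brokers share one host, which matches the 9092/9093/9094 bootstrap list used in the code):

broker.id=2
port=9093
log.dirs=/tmp/kafka-logs-2
zookeeper.connect=localhost:2181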
Producer config:
metadata.broker.list=localhost:9092,localhost:9093,localhost:9094
Consumer config:
zookeeper.connect=127.0.0.1:2181
zookeeper.connection.timeout.ms=6000
Do you receive the messages with a simple Kafka consumer?
bin/kafka-console-consumer.sh --bootstrap-server localhost:9092,localhost:9093,localhost:9094 --topic INTERNAL --from-beginning
Or with this one:
bin/kafka-console-consumer.sh --zookeeper localhost:2181 --from-beginning --topic INTERNAL
If you get the messages with the second command, try deleting the log.dirs directories of your brokers and the log files in /tmp/zookeeper/version-2/. Then restart ZooKeeper and your brokers and create your topic again.
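Concretely, with the directories from the configuration above, that clean-up would look something like this (the exact paths depend on your log.dirs and ZooKeeper dataDir settings):

rm -rf /tmp/kafka-logs-1 /tmp/kafka-logs-2 /tmp/kafka-logs-3
rm -rf /tmp/zookeeper/version-2/*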

why does rabbitmq generate lots of idx files in the queue directory

I set up a RabbitMQ server with Docker as below, then configured Celery to use it as the broker.
rabbit:
  hostname: rabbit
  image: rabbitmq:latest
  environment:
    - RABBITMQ_DEFAULT_USER=admin
    - RABBITMQ_DEFAULT_PASS=mypass
  ports:
    - "5673:5672"

worker:
  build:
    context: .
    dockerfile: dockerfile
  volumes:
    - .:/app
  links:
    - rabbit
  depends_on:
    - rabbit
And the celery configuration is:
from __future__ import absolute_import
from celery import Celery

app = Celery('test_celery',
             broker='amqp://admin:mypass@host:5673',
             backend='rpc://',
             include=['test_celery.tasks'])
Run_tasks code:
from .tasks import longtime_add
import time

if __name__ == '__main__':
    url = ['http://example1.com', 'http://example2.com', 'http://example3.com',
           'http://example4.com', 'http://example5.com', 'http://example6.com',
           'http://example7.com', 'http://example8.com']  # change them to your url list.
    for i in url:
        result = longtime_add.delay(i)
        print 'Task result:', result.result
Tasks code:
from __future__ import absolute_import
from test_celery.celery import app
import time, requests
from pymongo import MongoClient

client = MongoClient('10.1.1.234', 27018)  # change the ip and port to your mongo database's
db = client.mongodb_test
collection = db.celery_test
post = db.test

@app.task(bind=True, default_retry_delay=10)  # set a retry delay, 10 equal to 10s
def longtime_add(self, i):
    try:
        r = requests.get(i)
        if some conditions happened:
            longtime_add.delay(i)
        elif some other conditions happened:
            post.insert({'status': r.status_code, "creat_time": time.time()})
    except Exception as exc:
        raise self.retry(exc=exc)
The run_tasks code generates a list of URLs and sends them to RabbitMQ; the tasks then consume them and check whether certain conditions hold. If they do, the result is sent to RabbitMQ again; otherwise the data is stored in the database.
The problem: when the tasks run for a long time, 24 hours or even longer, lots of idx files are generated by RabbitMQ in the directory "mnesia/rabbit@rabbit/queues". The total size reaches 40G.
So the question is: how can I stop these large files from being generated automatically, or keep them small?
.idx files are queue index segments. Make your apps consume enqueued messages and acknowledge them, then queue indices will be updated and their segment files deleted when they are no longer necessary.
You can also purge or delete queues, of course.
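If you do want to drop the backlog by hand, purging works from either side; the queue name "celery" is just Celery's default and may differ in your setup:

rabbitmqctl purge_queue celery        # purge a single queue on the broker
celery -A test_celery purge           # or let Celery purge its own queues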

Django Celery get task count

I am currently using django with celery and everything works fine.
However, I want to be able to give users the opportunity to cancel a task if the server is overloaded, by checking how many tasks are currently scheduled.
How can I achieve this?
I am using redis as the broker.
I just found this:
Retrieve list of tasks in a queue in Celery
It is somewhat related to my issue, but I don't need to list the tasks, just count them :)
Here is how you can get the number of messages in a queue using Celery; it is broker-agnostic.
By using connection_or_acquire, you can minimize the number of open connections to your broker by utilizing Celery's internal connection pooling.
celery = Celery(app)

with celery.connection_or_acquire() as conn:
    conn.default_channel.queue_declare(
        queue='my-queue', passive=True).message_count
You can also extend Celery to provide this functionality:
from celery import Celery as _Celery

class Celery(_Celery):
    def get_message_count(self, queue):
        '''
        Raises: amqp.exceptions.NotFound: if queue does not exist
        '''
        with self.connection_or_acquire() as conn:
            return conn.default_channel.queue_declare(
                queue=queue, passive=True).message_count

celery = Celery(app)
num_messages = celery.get_message_count('my-queue')
If your broker is configured as redis://localhost:6379/1, and your tasks are submitted to the general celery queue, then you can get the length by the following means:
import redis
queue_name = "celery"
client = redis.Redis(host="localhost", port=6379, db=1)
length = client.llen(queue_name)
Or, from a shell script (good for monitors and such):
$ redis-cli -n 1 -h localhost -p 6379 llen celery
If you have already configured redis in your app, you can try this:
from celery import Celery
QUEUE_NAME = 'celery'
celery = Celery(app)
client = celery.connection().channel().client
length = client.llen(QUEUE_NAME)
Get a redis client instance used by Celery, then check the queue length. Don't forget to release the connection every time you use it (use .acquire):
# Get a configured instance of celery:
from project.celery import app as celery_app

def get_celery_queue_len(queue_name):
    with celery_app.pool.acquire(block=True) as conn:
        return conn.default_channel.client.llen(queue_name)
Always acquire a connection from the pool, don't create it manually. Otherwise, your redis server will run out of connection slots and this will kill your other clients.
I'll expand on the answer of @StephenFuhry around the not-found error, because a more or less broker-agnostic way of retrieving queue length is useful even if Celery suggests messing with brokers directly. In Celery 4 (with the Redis broker) this error looks like:
ChannelError: Channel.queue_declare: (404) NOT_FOUND - no queue 'NAME' in vhost '/'
Observations:
ChannelError is a kombu exception (in fact, it's amqp's, and kombu "re-exports" it).
On Redis broker Celery/Kombu represent queues as Redis lists
Redis collection type keys are removed whenever the collection becomes empty
If we look at what queue_declare does, it has these lines:
if passive and not self._has_queue(queue, **kwargs):
    raise ChannelError(...)
Kombu Redis virtual transport's _has_queue is this:
def _has_queue(self, queue, **kwargs):
    with self.conn_or_acquire() as client:
        with client.pipeline() as pipe:
            for pri in self.priority_steps:
                pipe = pipe.exists(self._q_for_pri(queue, pri))
            return any(pipe.execute())
The conclusion is that on a Redis broker ChannelError raised from queue_declare is okay (for an existing queue of course), and just means that the queue is empty.
Here's an example of how to output all active Celery queues' lengths (normally should be 0, unless your worker can't cope with the tasks).
from kombu.exceptions import ChannelError

def get_queue_length(name):
    with celery_app.connection_or_acquire() as conn:
        try:
            ok_nt = conn.default_channel.queue_declare(queue=name, passive=True)
        except ChannelError:
            return 0
        else:
            return ok_nt.message_count

for queue_info in celery_app.control.inspect().active_queues().values():
    print(queue_info[0]['name'], get_queue_length(queue_info[0]['name']))
