Celery/Django worker detect when there are no jobs left - python

I have a Celery/Django worker connected via RabbitMQ to the server. When the worker finishes a job I want it to terminate if there are no jobs left - how can I check there are no jobs left in the queue?

Kill the worker process (via its PID) when the task is finished, using psutil.
For example:
import os

import pika
import psutil


@celery.task
def my_task():
    pid = os.getpid()  # get the worker pid
    # your code
    return pid  # or store it somewhere


def task_caller():
    # apply() runs the task eagerly; get() unwraps the pid it returned
    pid = my_task.apply().get()
    if no_more_jobs('my_queue'):
        kill_worker(pid)


def kill_worker(pid):
    try:
        proc = psutil.Process(pid=pid)
        for child in proc.children(recursive=True):
            child.kill()
        proc.kill()
        return True
    except Exception:
        # manage exception
        return False


def no_more_jobs(queue):
    # edit below params
    connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
    channel = connection.channel()
    # passive=True only checks the existing queue instead of (re)declaring it
    q = channel.queue_declare(queue, passive=True)
    return q.method.message_count == 0
Note: this is a basic example which needs to be adapted to your producer/consumer logic.
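If you would rather not kill the PID directly, a softer variant is to ask the worker to shut itself down through Celery's remote control API once nothing is queued or running. A rough sketch reusing no_more_jobs() from above; the broker URL and worker name are placeholders:

from celery import Celery

app = Celery('tasks', broker='amqp://guest:guest@localhost:5672//')  # placeholder broker URL


def shutdown_if_idle(worker_name='celery@myhost'):  # placeholder worker name
    insp = app.control.inspect([worker_name])
    # active()/reserved() cover tasks already delivered to the worker,
    # no_more_jobs() covers messages still sitting in the broker queue
    busy = any((insp.active() or {}).values()) or any((insp.reserved() or {}).values())
    if not busy and no_more_jobs('my_queue'):
        # warm shutdown of just this worker via the remote control API
        app.control.shutdown(destination=[worker_name])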

Related

RabbitMQ Python PIKA - using add_callback_threadsafe lost acknowledgement when stopping program

I'm using pika to process RabbitMQ messages in small batches, using a thread for each batch.
At the end of the function in the thread, I send the acknowledgement of the messages to the channel through add_callback_threadsafe.
In parallel, I'm catching SIGINT signals to stop the program properly, by waiting with thread.join() for all threads to finish before stopping the channel consume and closing the connection.
But once Ctrl-C is sent to generate the SIGINT, even if the program waits for all threads to finish, the acknowledgements will not be processed.
==> Is there a way to force the channel/connection to process the pending add_callback_threadsafe callbacks before closing the connection?
# import packages
# connect to Rabbit MQ
import pika
# intercept stop signal
import signal
# print exception
import traceback
# threading
import functools
import threading
from queue import Queue
# logs time
import datetime
import time

# Function Message Acknowledgement
def ack_message(ch, delivery_tag):
    """Note that `ch` must be the same pika channel instance via which
    the message being ACKed was retrieved (AMQP protocol constraint).
    """
    print(f'DEBUG ack_message : beginning of ack_message function')
    if ch.is_open:
        ch.basic_ack(delivery_tag)
        print(f'DEBUG ack_message : Acknowledgement delivered')
    else:
        # Channel is already closed, so we can't ACK this message;
        # log and/or do something that makes sense for your app in this case.
        print(datetime.datetime.now(), str(datetime.timedelta(seconds=time.time() - init_time)), f'ERROR Channel Closed when trying to Acknowledge')
        pass

# Function Process multiple messages in separate thread
def block_process():
    # list global variables to be changed
    global channel
    # init local variables
    body_list = list()
    tag_list = list()
    print(f'DEBUG block_process : start of block_process function')
    # cancel the timer if it exists, as we will process all elements in the queue here
    if event and event.is_alive():
        event.cancel()
    # extract all queued messages from the internal python queue and rebuild individual body and tag lists from the tuples
    for i in range(list_Boby_Tag.qsize()):
        myTuppleBodyTag = list_Boby_Tag.get()
        body_list += [myTuppleBodyTag[0]]
        tag_list += [myTuppleBodyTag[1]]
    # that also empties the queue
    # do something that takes time with the block of messages in body_list
    time.sleep(10)
    for body in body_list:
        body_str = body.decode()
        print(f'DEBUG block_process : message processed is {body_str}')
    # acknowledge all tags in tag_list by using the channel thread-safe function .connection.add_callback_threadsafe
    for tag in tag_list:
        print(f'DEBUG prepare delivering Acknowledgement from thread')
        cb = functools.partial(ack_message, channel, tag)
        channel.connection.add_callback_threadsafe(cb)
    print(f'DEBUG block_process : end of block_process function')
    return

# Function Process message by message and call
def process_message(ch, method, properties, body):
    # list global variables to be changed
    global list_Boby_Tag
    global event
    global threads
    # do nothing if this flag is on, as the program is about to close
    if PauseConsume == 1:
        return
    # cancel the timer if it exists, as we are going to process a block or restart a new timer
    if event and event.is_alive():
        event.cancel()
    # put the data from the body and tag in the queue as a tuple
    list_Boby_Tag.put((body, method.delivery_tag))
    # if a max queue size is reached (here 500), immediately launch a new thread to process the queue
    if list_Boby_Tag.qsize() == 500:
        #print(f'DEBUG thread count before {len(threads)}')
        # keep in the threads list only the threads still running
        threads = [x for x in threads if x.is_alive()]
        #print(f'DEBUG thread count after {len(threads)}')
        # start the inference in a separate thread
        t = threading.Thread(target=block_process)
        t.start()
        # keep track of the thread so it can be waited on at the end if still running
        threads.append(t)
        #print(f'DEBUG thread count after add {len(threads)}')
    elif list_Boby_Tag.qsize() > 0:
        # if the queue is not full, create a thread with a timer to do the processing after some time, here 10 seconds for test purposes
        event = threading.Timer(interval=10, function=block_process)
        event.start()
        # also add this thread to the list of threads
        threads.append(event)

# PARAMETERS
RabbitMQ_host = '192.168.1.190'
RabbitMQ_port = 5672
RabbitMQ_queue = 'test_ctrlC'
RabbitMQ_cred_un = 'xxxx'
RabbitMQ_cred_pd = 'xxxx'

# init variables for batch process
list_Boby_Tag = Queue()
threads = list()
event = None
PauseConsume = 0
init_time = time.time()

# connect to RabbitMQ via Pika
cred = pika.credentials.PlainCredentials(RabbitMQ_cred_un, RabbitMQ_cred_pd)
connection = pika.BlockingConnection(pika.ConnectionParameters(host=RabbitMQ_host, port=RabbitMQ_port, credentials=cred))
channel = connection.channel()
channel.queue_declare(queue=RabbitMQ_queue, durable=True)
# tell RabbitMQ not to dispatch a new message to a worker until it has processed and acknowledged the previous one:
channel.basic_qos(prefetch_count=1)
# define the consumer
channel.basic_consume(queue=RabbitMQ_queue,
                      auto_ack=False,  # False = need message acknowledgement: basic_ack in the callback
                      on_message_callback=process_message)
# empty the queue and generate test data
channel.queue_purge(queue=RabbitMQ_queue)
# wait a few seconds so the purge can be checked in the RabbitMQ UI
print(f'DEBUG main : queue {RabbitMQ_queue} purged')
connection.sleep(10)
# generate 10 test messages
for msgId in range(10):
    channel.basic_publish(exchange='',
                          routing_key=RabbitMQ_queue,
                          body=f'message{msgId}',
                          properties=pika.BasicProperties(
                              delivery_mode=pika.spec.PERSISTENT_DELIVERY_MODE
                          ))
print(f'DEBUG main : test messages created in {RabbitMQ_queue}')

# Function clean stop of pika connection in case of interruption or exception
def cleanClose():
    # the global declaration makes the flag change visible to process_message
    global PauseConsume
    # tell the on_message_callback to do nothing
    PauseConsume = 1
    # Wait for all threads to complete
    for thread in threads:
        thread.join()
    # stop pika connection after a short pause
    connection.sleep(3)
    channel.stop_consuming()
    connection.close()
    return

# Function handle exit signals
def exit_handler(signum, frame):
    print(datetime.datetime.now(), str(datetime.timedelta(seconds=time.time() - init_time)), f'Exit signal received ({signum})')
    cleanClose()
    exit(0)

signal.signal(signal.SIGINT, exit_handler)  # sent by a CTRL+C or modified Docker Stop
#signal.signal(signal.SIGTSTP, exit_handler)  # sent by a CTRL+Z Docker Stop

print(' [*] Waiting for messages. To exit press CTRL+C')
try:
    channel.start_consuming()
except Exception:
    print(datetime.datetime.now(), str(datetime.timedelta(seconds=time.time() - init_time)), f'Exception received within start_consuming')
    traceback.print_exc()
    cleanClose()
A workaround was found by Luke here:
https://github.com/lukebakken/pika-1402/blob/lukebakken/pika-1402/test_pika_blockthread.py
He changed the sample code as follows:
Simplified it a bit by removing the "batch processing via Queue" code since it wasn't related to the current issue
Moved the Pika connection to its own thread
Instead of using a consume callback, moved to a generator-style for loop which allows checking whether exiting is requested. This could also be accomplished via SelectConnection and a timer.
Sample code with the batch processing via Queue added back, matching the original code, is here:
# from https://github.com/lukebakken/pika-1402/blob/lukebakken/pika-1402/test_pika_blockthread.py
# import packages
# connect to Rabbit MQ
import pika
import pika.credentials
import pika.spec
# intercept stop signal
import signal
# print exception
# import traceback
# threading
import functools
import threading
from queue import Queue
# logs time
import datetime
import time

# PARAMETERS
RabbitMQ_host = "192.168.1.190"
RabbitMQ_port = 5672
RabbitMQ_queue = "test_ctrlC"
RabbitMQ_cred_un = "xxxx"
RabbitMQ_cred_pd = "xxxx"
nbTest = 100000
nbBatch = 1000
nbPrefetch = 10000
# note: prefetch always >= nbBatch
timerSec = 60  # timer wait
workSec = 5  # number of seconds for simulating batch work

# init variables for batch process
init_time = time.time()
exiting = False
work_threads = list()
event = None
list_Boby_Tag = Queue()

# Function Message Acknowledgement
def ack_message(ch, delivery_tag):
    """Note that `ch` must be the same pika channel instance via which
    the message being ACKed was retrieved (AMQP protocol constraint).
    """
    if ch.is_open:
        ch.basic_ack(delivery_tag)
        #print(datetime.datetime.now(), str(datetime.timedelta(seconds=time.time() - init_time)), f"DEBUG ack_message : beginning of ack_message function, tag: {delivery_tag}")
    else:
        # Channel is already closed, so we can't ACK this message;
        # log and/or do something that makes sense for your app in this case.
        print(datetime.datetime.now(), str(datetime.timedelta(seconds=time.time() - init_time)), "Channel Closed when trying to Acknowledge")
        pass
    return

# Function Process multiple messages in separate thread
def do_work(channel, list_Boby_Tag):
    # init local variables
    body_list = list()
    tag_list = list()
    # cancel the timer if it exists, as we will process all elements in the queue here
    if event and event.is_alive():
        event.cancel()
    # extract all queued messages from the internal python queue and rebuild individual body and tag lists from the tuples
    for i in range(list_Boby_Tag.qsize()):
        myTuppleBodyTag = list_Boby_Tag.get()
        body_list += [myTuppleBodyTag[0]]
        tag_list += [myTuppleBodyTag[1]]
    # that also empties the queue
    # do something that takes time with the block of messages in body_list
    time.sleep(workSec)
    for body in body_list:
        body_str = body.decode()
        #print(datetime.datetime.now(), str(datetime.timedelta(seconds=time.time() - init_time)), f'DEBUG block_process : message processed is {body_str}')
    # acknowledge all tags in tag_list by using the channel thread-safe function .connection.add_callback_threadsafe
    for tag in tag_list:
        cb = functools.partial(ack_message, channel, tag)
        channel.connection.add_callback_threadsafe(cb)
    return

# Function Process message by message and call thread by block or timer
def process_message(ch, method, body):
    global work_threads
    global list_Boby_Tag
    global event
    # cancel the timer if it exists, as we are going to process a block or restart a new timer
    if event and event.is_alive():
        event.cancel()
    # put the data from the body and tag in the queue as a tuple
    list_Boby_Tag.put((body, method.delivery_tag))
    # if a max queue size is reached, immediately launch a new thread to process the queue
    if list_Boby_Tag.qsize() == nbBatch:
        #print(f'DEBUG thread count before {len(threads)}')
        # keep in the threads list only the threads still running
        work_threads = [x for x in work_threads if x.is_alive()]
        #print(f'DEBUG thread count after {len(threads)}')
        # start the inference in a separate thread
        t = threading.Thread(target=do_work, args=(ch, list_Boby_Tag))
        t.start()
        # keep track of the thread so it can be waited on at the end if still running
        work_threads.append(t)
        #print(f'DEBUG thread count after add {len(threads)}')
    elif list_Boby_Tag.qsize() > 0:
        # if the queue is not full, create a thread with a timer to do the processing after some time
        event = threading.Timer(interval=timerSec, function=do_work, args=(ch, list_Boby_Tag))
        event.start()
        # also add this thread to the list of threads
        work_threads.append(event)
    return

# Function to start the pika channel and stop it
def pika_runner():
    # connect to RabbitMQ via Pika
    cred = pika.credentials.PlainCredentials(RabbitMQ_cred_un, RabbitMQ_cred_pd)
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(
            host=RabbitMQ_host, port=RabbitMQ_port, credentials=cred
        )
    )
    channel = connection.channel()
    channel.queue_declare(queue=RabbitMQ_queue, durable=True)
    # tell RabbitMQ not to dispatch a new message to a worker until it has processed and acknowledged the previous one:
    channel.basic_qos(prefetch_count=nbPrefetch)
    # empty the queue and generate test data
    channel.queue_purge(queue=RabbitMQ_queue)
    # wait a few seconds so the purge can be checked in the RabbitMQ UI
    print(datetime.datetime.now(), str(datetime.timedelta(seconds=time.time() - init_time)), f"DEBUG main : queue {RabbitMQ_queue} purged, sleeping 5 seconds")
    connection.sleep(5)
    # generate test messages
    for msgId in range(nbTest):
        channel.basic_publish(
            exchange="",
            routing_key=RabbitMQ_queue,
            body=f"message-{msgId+1}",
            properties=pika.BasicProperties(
                delivery_mode=pika.spec.PERSISTENT_DELIVERY_MODE
            ),
        )
    print(datetime.datetime.now(), str(datetime.timedelta(seconds=time.time() - init_time)), f"DEBUG main : test messages created in {RabbitMQ_queue}")
    # loop forever to retrieve messages
    for method_frame, properties, body in channel.consume(
        queue=RabbitMQ_queue, inactivity_timeout=1, auto_ack=False
    ):
        if exiting:
            print(datetime.datetime.now(), str(datetime.timedelta(seconds=time.time() - init_time)), f"DEBUG : stopping consuming")
            #channel.stop_consuming()
            channel.cancel()
            print(datetime.datetime.now(), str(datetime.timedelta(seconds=time.time() - init_time)), f"DEBUG : joining work threads")
            for thread in work_threads:
                thread.join()
            print(datetime.datetime.now(), str(datetime.timedelta(seconds=time.time() - init_time)), f"DEBUG : all work threads done, sleeping 5 seconds to let acks be delivered")
            connection.sleep(5)
            print(datetime.datetime.now(), str(datetime.timedelta(seconds=time.time() - init_time)), f"DEBUG : closing connections and channels")
            channel.close()
            connection.close()
        else:
            if method_frame is not None:
                process_message(channel, method_frame, body)
    return

# Function handle exit signals
def exit_handler(signum, _):
    global exiting
    if exiting:
        return
    exiting = True
    print(datetime.datetime.now(), str(datetime.timedelta(seconds=time.time() - init_time)), "Exit signal received")
    pika_thread.join()
    exit(0)

# launch the thread that will connect and listen to Pika
pika_thread = threading.Thread(target=pika_runner)
pika_thread.start()

# catch interruption signal to exit gracefully
signal.signal(signal.SIGINT, exit_handler)  # sent by a CTRL+C or modified Docker Stop

print(" [*] Waiting for messages. To exit press CTRL+C")

# wait for all threads to finish
for thread in work_threads:
    thread.join()
pika_thread.join()

celery - message queues for long-running processes

I'm building a web server with Django 1.11.5 that uses celery-3.1.23 and RabbitMQ as the message queue manager, to send async tasks to a number of different daemon processes (long-running processes with an infinite loop).
How can I dynamically create a queue for each process separately, receive messages from the process's queue inside the daemon process, do something asynchronously, and then forward the result to another "aggregator queue" to collect and validate the results and send a response to the user? (Please see the attached illustration.)
So far, I have connected the processes via multiprocessing.connection Client and Listener objects, and started the processes with the Process object.
code - consumer:
from multiprocessing.connection import Listener
from multiprocessing import Process

def main_process_loop(path, port_id, auth_key):
    # Initialize the action1 instance to handle the work stream:
    action_handler = ActionHandler(path)
    # Initialize the infinite loop that will run the process:
    pid, auth_key_bytes = int(port_id), bytes(auth_key)
    address = ('localhost', pid)  # family is deduced to be 'AF_INET'
    while True:
        try:
            listener = Listener(address, authkey=auth_key_bytes)
            conn = listener.accept()
            input_values = conn.recv()
            listener.close()
            if input_values is None:
                raise Exception(ERR_MSG_INVALID_ARGV)
            else:
                # do something with input_values and ActionHandler
                # need to return success message to user
                pass
        except Exception as err:
            # need to return fail message to user
            pass

if __name__ == '__main__':
    # worker_processes = []
    for auth_key, port_id in PID_DICT.items():
        path = TEMPLATE_FORMAT.format(auth_key)
        p = Process(target=main_process_loop, args=(path, port_id, auth_key))
        # worker_processes.append(p)
        p.start()
    # for p in worker_processes:
    #     p.join()
    # print "all processes have been initiated"
code - celery task:
import os

from multiprocessing.connection import Client
from celery import Celery

app = Celery('tasks', broker='amqp://localhost:5672//')

@app.task
def run_backend_processes(a_lst, b_lst, in_type, out_path, in_file_name):
    ARGV_FORMAT = r"IN_TYPE={0} IN_PATH={1} B_LEVEL=" + str(b_lst) + " OUT_PATH={2}"
    ##################################################
    for process in a_lst:
        pid = {
            'A': 6001,
            'B': 6002,
            'C': 6003,
            'D': 6004,
        }[process]
        file_path = os.path.join(out_path, process + "_" + in_file_name)
        argv_string = ARGV_FORMAT.format(in_type, file_path, out_path)
        address = ('localhost', int(pid))
        conn = Client(address, authkey=bytes(mxd_process))
        conn.send(str(argv_string))
        conn.close()
    return 'process succeed'
and the Django view is nothing unusual - it just calls run_backend_processes.delay.
Thank you,
Yoav.
Q&As tried:
Celery parallel distributed task with multiprocessing
Can a celery worker/server accept tasks from a non celery producer?
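For what it's worth, the queue-per-process idea described above can also be sketched purely with Celery routing instead of multiprocessing sockets. The following is only an illustration of that shape; the queue names, do_work, and the broker URL are hypothetical and not taken from the question:

from celery import Celery

app = Celery('tasks', broker='amqp://localhost:5672//')

# one queue per daemon process plus one aggregator queue (names are hypothetical)
PROCESS_QUEUES = {'A': 'proc_a', 'B': 'proc_b', 'C': 'proc_c', 'D': 'proc_d'}


def do_work(argv_string):
    # stand-in for the real long-running work done by the daemon process
    return 'done: ' + argv_string


@app.task
def run_on_process(process, argv_string):
    # runs on the worker that consumes this process's dedicated queue
    result = do_work(argv_string)
    # forward the result to the aggregator queue for collection/validation
    aggregate.apply_async(args=(process, result), queue='aggregator')
    return result


@app.task
def aggregate(process, result):
    # runs on the worker that consumes the 'aggregator' queue:
    # collect and validate results, then report back to the user
    print('aggregated', process, result)


# caller side (e.g. inside the Django view or a dispatching task):
#   run_on_process.apply_async(args=('A', argv_string), queue=PROCESS_QUEUES['A'])
# and each daemon-like worker is started against its own queue:
#   celery -A tasks worker -Q proc_a
#   celery -A tasks worker -Q aggregator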

Daemon Thread in Daemon service

The Summary
I'm writing a daemon to run as a Linux service. It communicates with an API (which I'm also writing, but which isn't part of the problem), gets data, does stuff with that data, then feeds the newly munged data back to the API.
When I issue the script.py start command, it works just fine. The daemon process starts the script and the daemon Threads kick off and run.
What doesn't happen is the stop: when I issue the script.py stop command, the daemon Threads keep running. Stopping the main thread (the one kicked off by script.py start) doesn't stop the daemon Threads.
I can still see them running with ps ux. And they keep running until manually killed.
The Question
How do I get my script.py stop to kill the daemon Threads as well as the main thread launched by the daemon module?
The Details
More in depth: It's a network device polling engine with a server/agent model. This is the agent side.
There are two daemon threads:
GetterThread
PutterThread
There are up to 15 worker threads of class WorkerThread that can be launched to either ping or SNMP poll the inventory of a given IP address. They merely launch a sub-process that does the actual pinging or polling.
There are three data Queues:
ping_request_queue
poll_request_queue
result_queue
The whole thing is wrapped up in a custom class called App that is controlled by the daemon module
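The WorkerThread class itself is not shown below. Purely for context, a hypothetical minimal sketch consistent with the description above could look like this; the class body, the request.ip_address attribute, and the commands are assumptions, not the asker's code:

import subprocess
import threading
import Queue


class WorkerThread(threading.Thread):
    """ Hypothetical sketch only: pull one request off the input queue, run the
    external ping/poll command in a sub-process, and put the result on the
    output queue. """

    def __init__(self, thread_type, input_queue, output_queue):
        threading.Thread.__init__(self)
        self.thread_type = thread_type
        self.input_queue = input_queue
        self.output_queue = output_queue

    def run(self):
        try:
            request = self.input_queue.get(block=False)
        except Queue.Empty:
            return
        # launch the sub-process that does the actual pinging or polling
        if self.thread_type == 'ping':
            cmd = ['ping', '-c', '1', request.ip_address]
        else:
            cmd = ['snmpget', '-v2c', '-c', 'public', request.ip_address, 'sysUpTime.0']
        request.returncode = subprocess.call(cmd)
        self.output_queue.put(request)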
GetterThread
class GetterThread(threading.Thread):
    """ This thread is responsible for fetching the ping and poll lists from the server and dropping them into the
    appropriate queue """

    server = None  # type: Server
    ping_request_queue = None  # type: Queue.Queue
    poll_request_queue = None  # type: Queue.Queue

    def __init__(self, server, ping_request_queue, poll_request_queue):
        """
        Create the Thread
        :param Server server: The server to use
        :param Queue.Queue ping_request_queue:
        :param Queue.Queue poll_request_queue:
        """
        threading.Thread.__init__(self)
        self.ctl = ThreadController()
        self.server = server  # type: Server
        self.ping_request_queue = ping_request_queue  # type: Queue.Queue
        self.poll_request_queue = poll_request_queue  # type: Queue.Queue

    def run(self):
        while self.ctl.run:
            if not self.server.online:
                sleep(30)
                self.server.check_in()
                continue
            sleep(1)
            ping_list, poll_list = self.server.get_lists()
            for r in ping_list:
                req = PingRequest.decode(r)
                self.ping_request_queue.put(req)
            for r in poll_list:
                req = PollRequest.decode(r)
                self.poll_request_queue.put(req)
        self.ctl.remove()
PutterThread
class PutterThread(threading.Thread):
    """ This thread is responsible for picking up results from the results_queue and sending them to the server """

    server = None  # type: Server
    q = None  # type: Queue.Queue

    def __init__(self, server, result_queue):
        """
        Create a thread to put the results on the server
        :param Queue.Queue result_queue:
        """
        threading.Thread.__init__(self)
        self.ctl = ThreadController()
        self.server = server  # type: Server
        self.q = result_queue

    def run(self):
        while self.ctl.run:
            if not self.server.online:
                sleep(30)
                self.server.check_in()
                continue
            sleep(1)
            if self.q.not_empty:
                result = self.q.get()
                if isinstance(result, Request):
                    if result.stage == Request.PINGED:
                        """ Send the ping results """
                        f = self.server.send_ping_results
                        lmsg = 'Sent ping result for request {}'.format(result.uuid)
                    elif result.stage == Request.POLLED:
                        f = self.server.send_poll_results
                        lmsg = 'Sent poll result for request {}'.format(result.uuid)
                    else:
                        continue
                    f(result)
                    logging.debug(lmsg)
                else:
                    logging.info('Bad request in queue: {!r}'.format(result))
        self.ctl.remove()
Both the getter and putter thread instances are set as daemons.
I'm running the whole script as a daemon:
class App:
    def __init__(self):
        self.pidfile_path = "/var/run/project/poller.agent.pid"
        self.logfile_path = "/var/log/project/poller.agent.log"
        self.handler = logging.FileHandler(self.logfile_path)

    def run(self):
        result_queue = Queue.Queue()
        ping_request_queue = Queue.Queue()
        poll_request_queue = Queue.Queue()

        getter_thread = GetterThread(self.server, ping_request_queue, poll_request_queue)
        getter_thread.setName('GetterThread')
        getter_thread.setDaemon(True)
        getter_thread.start()

        putter_thread = PutterThread(self.server, result_queue)
        putter_thread.setName('PutterThread')
        putter_thread.setDaemon(True)
        putter_thread.start()

        worker_threads = []
        max_threads = {
            'ping': 5,
            'poll': 10,
        }
        thread_defs = [
            ('ping', ping_request_queue, result_queue),
            ('poll', poll_request_queue, result_queue)
        ]
        while True:
            if ping_request_queue.not_empty or poll_request_queue.not_empty:
                for thread_def in thread_defs:
                    thread_type, input_queue, output_queue = thread_def
                    thread_count = min(input_queue.qsize(), max_threads.get(thread_type))
                    for x in range(thread_count):
                        t = WorkerThread(*thread_def)
                        t.setName('WorkerThread-{}-{:02n}'.format(thread_type, x))
                        worker_threads.append(t)
                        t.start()
            sleep(1)

if __name__ == "__main__":
    app = App()
    daemon_runner = runner.DaemonRunner(app)
    daemon_runner.daemon_context.files_preserve = [app.handler.stream]
    daemon_runner.do_action()

How to monitor events from workers in a Celery-Django application?

According to the celery tutorial regarding real-time monitoring of celery workers, one can also programmatically capture the events produced by the workers and take action accordingly.
My question is how can I integrate a monitor as the one in this example, in a Celery-Django application?
EDIT:
The code example in the tutorial looks like:
from celery import Celery

def my_monitor(app):
    state = app.events.State()

    def announce_failed_tasks(event):
        state.event(event)
        task_id = event['uuid']
        print('TASK FAILED: %s[%s] %s' % (
            event['name'], task_id, state[task_id].info(), ))

    def announce_dead_workers(event):
        # minimal heartbeat handler so the Receiver below has something to call
        state.event(event)

    with app.connection() as connection:
        recv = app.events.Receiver(connection, handlers={
            'task-failed': announce_failed_tasks,
            'worker-heartbeat': announce_dead_workers,
        })
        recv.capture(limit=None, timeout=None, wakeup=True)

if __name__ == '__main__':
    celery = Celery(broker='amqp://guest@localhost//')
    my_monitor(celery)
So I want to capture the task-failed event sent by the worker and get its task_id as the tutorial shows, fetch the result for this task from the result backend that was configured for my application, and process it further. My problem is that it is not obvious to me how to get the application, since in a django-celery project the instantiation of the Celery library is not transparent to me.
I am also open to any other idea as to how to process the results when a worker has finished executing a task.
OK, I found a way of doing this; I am not sure it is the solution, but it works for me. The monitor function basically connects directly to the broker and listens for different types of events. My code looks like this:
import sys

from celery.events import EventReceiver
from kombu import Connection as BrokerConnection

def my_monitor():
    connection = BrokerConnection('amqp://guest:guest@localhost:5672//')

    def on_event(event):
        print "EVENT HAPPENED: ", event

    def on_task_failed(event):
        exception = event['exception']
        print "TASK FAILED!", event, " EXCEPTION: ", exception

    while True:
        try:
            with connection as conn:
                recv = EventReceiver(conn,
                                     handlers={'task-failed': on_task_failed,
                                               'task-succeeded': on_event,
                                               'task-sent': on_event,
                                               'task-received': on_event,
                                               'task-revoked': on_event,
                                               'task-started': on_event,
                                               # OR: '*' : on_event
                                               })
                recv.capture(limit=None, timeout=None)
        except (KeyboardInterrupt, SystemExit):
            print "EXCEPTION KEYBOARD INTERRUPT"
            sys.exit()
This is all. And I run this in a different process than the normal application, meaning that I create a child process of my celery application which only runs this function.
HTH
Beware of a couple of gotchas
You need to set the CELERY_SEND_EVENTS flag to True in your celery config.
You can also set the event monitor in a new thread from your worker.
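For the first gotcha, a minimal settings sketch (old-style uppercase names as used with Celery 3.x; equivalent to starting the worker with the -E flag):

# settings.py / celeryconfig.py
CELERY_SEND_EVENTS = True            # make workers emit task events
CELERY_SEND_TASK_SENT_EVENT = True   # optional: also emit task-sent events from the publisher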
Here is my implementation:
import threading
import time

class MonitorThread(object):
    def __init__(self, celery_app, interval=1):
        self.celery_app = celery_app
        self.interval = interval
        self.state = self.celery_app.events.State()
        self.thread = threading.Thread(target=self.run, args=())
        self.thread.daemon = True
        self.thread.start()

    def catchall(self, event):
        if event['type'] != 'worker-heartbeat':
            self.state.event(event)
        # logic here

    def run(self):
        while True:
            try:
                with self.celery_app.connection() as connection:
                    recv = self.celery_app.events.Receiver(connection, handlers={
                        '*': self.catchall
                    })
                    recv.capture(limit=None, timeout=None, wakeup=True)
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception:
                # unable to capture
                pass
            time.sleep(self.interval)

if __name__ == '__main__':
    app = get_celery_app()  # returns app
    MonitorThread(app)
    app.start()

Making sure a worker process always terminate in zeroMQ

I am implementing a pipeline pattern with zeroMQ using the python bindings.
tasks are fanned out to workers which listen for new tasks with an infinite loop like this:
while True:
    socks = dict(self.poller.poll())
    if self.receiver in socks and socks[self.receiver] == zmq.POLLIN:
        msg = self.receiver.recv_unicode(encoding='utf-8')
        self.process(msg)
    if self.hear in socks and socks[self.hear] == zmq.POLLIN:
        msg = self.hear.recv()
        print self.pid, ":", msg
        sys.exit(0)
They exit when they get a message from the sink node confirming that all the expected results have been received.
However, a worker may miss such a message and never finish. What is the best way to have workers always finish, when they have no way of knowing (other than through the already mentioned message) that there are no further tasks to process?
Here is the testing code I wrote for checking the workers' status:
#-*- coding:utf-8 -*-
"""
Test module containing tests for all modules of pypln
"""
import unittest
from servers.ventilator import Ventilator
from subprocess import Popen, PIPE
import time

class testWorkerModules(unittest.TestCase):
    def setUp(self):
        self.nw = 4
        # spawn 4 workers
        self.ws = [Popen(['python', 'workers/dummy_worker.py'], stdout=None) for i in range(self.nw)]
        # spawn a sink
        self.sink = Popen(['python', 'sinks/dummy_sink.py'], stdout=None)
        # start a ventilator
        self.V = Ventilator()
        # wait for workers and sinks to connect
        time.sleep(1)

    def test_send_unicode(self):
        '''
        Pushing unicode strings through workers to sinks.
        '''
        self.V.push_load([u'são joão' for i in xrange(80)])
        time.sleep(1)
        # [p.wait() for p in self.ws]  # wait for the workers to terminate
        wsr = [p.poll() for p in self.ws]
        while None in wsr:
            print wsr, [p.pid for p in self.ws if p.poll() == None]  # these are the unfinished workers
            time.sleep(0.5)
            wsr = [p.poll() for p in self.ws]
        self.sink.wait()
        self.sink = self.sink.returncode
        self.assertEqual([0] * self.nw, wsr)
        self.assertEqual(0, self.sink)

if __name__ == '__main__':
    unittest.main()
All the messaging stuff eventually ends up with heartbeats. If you (as a worker, a sink, or whatever) discover that a component you need to work with is dead, you can basically either try to connect somewhere else or kill yourself. So if you, as a worker, discover that the sink is no longer there, just exit. This also means that you may exit even though the sink is still there but the connection is broken. But I am not sure you can do much more; perhaps set all the timeouts more reasonably...
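A minimal sketch of that idea, assuming the sink publishes periodic heartbeats on a socket the worker subscribes to; the socket addresses, the "FINISHED" message, and the timeout value are placeholders, not part of the original code:

import sys
import time
import zmq

HEARTBEAT_TIMEOUT = 30  # seconds without hearing from the sink before giving up (arbitrary)

context = zmq.Context()
receiver = context.socket(zmq.PULL)        # tasks from the ventilator
receiver.connect("tcp://localhost:5557")   # placeholder address
hear = context.socket(zmq.SUB)             # control/heartbeat channel from the sink
hear.connect("tcp://localhost:5559")       # placeholder address
hear.setsockopt(zmq.SUBSCRIBE, b"")

poller = zmq.Poller()
poller.register(receiver, zmq.POLLIN)
poller.register(hear, zmq.POLLIN)

last_heard = time.time()
while True:
    socks = dict(poller.poll(timeout=1000))  # wake up at least once per second
    if receiver in socks:
        msg = receiver.recv()
        # process(msg)
    if hear in socks:
        last_heard = time.time()
        if hear.recv() == b"FINISHED":   # explicit "all done" message, as in the question
            sys.exit(0)
    if time.time() - last_heard > HEARTBEAT_TIMEOUT:
        # haven't heard from the sink in a while: assume it is gone and exit
        sys.exit(1)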
