Django celery - asyncio - daemonic processes are not allowed to have children - python

I can see similar questions have been asked before, but those use multiprocessing directly rather than executors, so I am unsure how to fix this.
The GitHub issue also says it's resolved in 4.1: https://github.com/celery/celery/issues/1709
I am using
celery==4.1.1
django-celery==3.2.1
django-celery-beat==1.0.1
django-celery-results==1.0.1
My script is as follows; I've tried to cut it down to show only the relevant code.
@asyncio.coroutine
def snmp_get(ip, oid, snmp_user, snmp_auth, snmp_priv):
    results = []
    snmpEngine = SnmpEngine()
    errorIndication, errorStatus, errorIndex, varBinds = yield from getCmd(
        ...
    )
    ...
    for varBind in varBinds:
        results.append(' = '.join([x.prettyPrint() for x in varBind]))
    snmpEngine.transportDispatcher.closeDispatcher()
    return results

def create_link_data_record(link_data):
    obj = LinkData.objects.create(
        ...
    )
    return 'data polled for {} record {} created'.format(link_data.hostname, obj.id)
async def retrieve_data(link, loop):
    from concurrent.futures import ProcessPoolExecutor
    executor = ProcessPoolExecutor(2)
    poll_interval = 60
    results = []
    # credentials:
    ...
    print('polling data for {} on {}'.format(hostname, link_mgmt_ip))
    # create link data obj
    link_data = LinkDataObj()
    ...
    # first poll for speeds
    download_speed_data_poll1 = await snmp_get(link_mgmt_ip, down_speed_oid % link_index, snmp_user, snmp_auth, snmp_priv)
    # check we were able to poll
    if 'timeout' in str(get_snmp_value(download_speed_data_poll1)).lower():
        return 'timeout trying to poll {} - {}'.format(hostname, link_mgmt_ip)
    upload_speed_data_poll1 = await snmp_get(link_mgmt_ip, up_speed_oid % link_index, snmp_user, snmp_auth, snmp_priv)
    # wait for poll interval
    await asyncio.sleep(poll_interval)
    # second poll for speeds
    download_speed_data_poll2 = await snmp_get(link_mgmt_ip, down_speed_oid % link_index, snmp_user, snmp_auth, snmp_priv)
    upload_speed_data_poll2 = await snmp_get(link_mgmt_ip, up_speed_oid % link_index, snmp_user, snmp_auth, snmp_priv)
    # create deltas for speed
    down_delta = int(get_snmp_value(download_speed_data_poll2)) - int(get_snmp_value(download_speed_data_poll1))
    up_delta = int(get_snmp_value(upload_speed_data_poll2)) - int(get_snmp_value(upload_speed_data_poll1))
    ...
    results.append(await loop.run_in_executor(executor, create_link_data_record, link_data))
    return results
def get_link_data():
    link_data = LinkTargets.objects.all()
    # create loop
    loop = asyncio.get_event_loop()
    if asyncio.get_event_loop().is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)  # register the new loop instead of creating a second one
    # create tasks
    tasks = [asyncio.ensure_future(retrieve_data(link, loop)) for link in link_data]
    if tasks:
        start = time.time()
        done, pending = loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
The error below references the run_in_executor code:
[2018-05-24 14:13:00,840: ERROR/ForkPoolWorker-3] Task exception was never retrieved
future: <Task finished coro=<retrieve_data() done, defined at /itapp/itapp/monitoring/jobs/link_monitoring.py:130> exception=AssertionError('daemonic processes are not allowed to have children',)>
Traceback (most recent call last):
File "/itapp/itapp/monitoring/jobs/link_monitoring.py", line 209, in retrieve_data
link_data.last_change = await loop.run_in_executor(executor, timestamp, (link_data.link_target_id, link_data.service_status))
File "/usr/local/lib/python3.6/asyncio/base_events.py", line 639, in run_in_executor
return futures.wrap_future(executor.submit(func, *args), loop=self)
File "/usr/local/lib/python3.6/concurrent/futures/process.py", line 466, in submit
self._start_queue_management_thread()
File "/usr/local/lib/python3.6/concurrent/futures/process.py", line 427, in _start_queue_management_thread
self._adjust_process_count()
File "/usr/local/lib/python3.6/concurrent/futures/process.py", line 446, in _adjust_process_count
p.start()
File "/usr/local/lib/python3.6/multiprocessing/process.py", line 103, in start
'daemonic processes are not allowed to have children'
AssertionError: daemonic processes are not allowed to have children

Try with Celery 5-devel
pip install git+https://github.com/celery/celery@5.0-devel
As per the issue below:
https://github.com/celery/celery/issues/3884
Celery 5.0 will support asyncio. We currently do not support it.
And there is also the SO thread below on the same topic:
How to combine Celery with asyncio?
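Until Celery 5 lands, a common workaround for the daemonic-process restriction is to avoid spawning child processes inside the prefork worker at all. Below is a minimal sketch (not the original poster's code; the names are illustrative) that swaps the ProcessPoolExecutor for a ThreadPoolExecutor, which a daemonic worker process is allowed to create:

import asyncio
from concurrent.futures import ThreadPoolExecutor

def blocking_db_write(payload):
    # placeholder for the blocking Django ORM call (LinkData.objects.create above)
    return 'record created for {}'.format(payload)

async def poll_and_store(payload, loop, executor):
    await asyncio.sleep(0.1)  # stands in for the async SNMP polling
    # run the blocking ORM call in a thread instead of a child process
    return await loop.run_in_executor(executor, blocking_db_write, payload)

def get_link_data():
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    executor = ThreadPoolExecutor(max_workers=2)  # threads are fine inside a daemonic worker
    tasks = [poll_and_store(p, loop, executor) for p in ('link-a', 'link-b')]
    results = loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
    return results

Alternatively, starting the worker with a non-forking pool (for example --pool=solo, or the gevent/eventlet pools) sidesteps the restriction, since those pools do not run tasks in daemonic child processes.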

Related

Gathering results from an asyncio task list with exceptions

The code...
import asyncio
import random
from time import perf_counter
from typing import Iterable
from pprint import pprint

async def coro(n, i, threshold=0.4):
    await asyncio.sleep(i)
    if i > threshold:
        # For illustration's sake - some coroutines may raise,
        # and we want to accommodate that and just test for exception
        # instances in the results of asyncio.gather(return_exceptions=True)
        raise Exception(f"{i} of Task-{n} is too high")
    return i

async def main(it: Iterable, timeout: float) -> tuple:
    tasks = [asyncio.create_task(coro(i+1, d), name=f"Task-{i+1}") for i, d in enumerate(it)]
    await asyncio.wait(tasks, timeout=timeout)
    return tasks  # *not* (done, pending)

timeout = 0.5
random.seed(444)
n = 10
it = [random.random() for _ in range(n)]

start = perf_counter()
tasks = asyncio.run(main(it=it, timeout=timeout))
elapsed = perf_counter() - start
print(f"Done main({n}) in {elapsed:0.2f} seconds\n")
pprint(tasks)
print('----')

# does not work from here on....
res = []
for t in tasks:
    try:
        r = t.result()  # gives an error!!!
    except Exception as e:
        res.append(e)
    else:
        res.append(r)
pprint(res)
...does not work for collecting the task results. It fails with:
Traceback (most recent call last):
File "c:\Users\user\Documents\user\projects\learn\asyncio\wrap_gather_in_timeout.py", line 8, in coro
await asyncio.sleep(i)
File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\asyncio\tasks.py", line 654, in sleep
return await future
asyncio.exceptions.CancelledError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "c:\Users\user\Documents\user\projects\learn\asyncio\wrap_gather_in_timeout.py", line 35, in <module>
r = t.result()
asyncio.exceptions.CancelledError
Task exception was never retrieved
future: <Task finished name='Task-7' coro=<coro() done, defined at c:\Users\user\Documents\user\projects\learn\asyncio\wrap_gather_in_timeout.py:7> exception=Exception('i too high')>
Traceback (most recent call last):
File "c:\Users\user\Documents\user\projects\learn\asyncio\wrap_gather_in_timeout.py", line 13, in coro
raise Exception("i too high")
Exception: i too high
The code was run in Python 3.9.
Any idea where I am going wrong and why?
Is it because the tasks need to be cancelled after one throws an exception? I could not successfully implement that.
Inspired by: Solution to wrapping asyncio.gather SO
Your code mostly works; the reason you cannot build res successfully is that not every failed task raises a plain Exception. Tasks that are cancelled raise asyncio.exceptions.CancelledError, which, if we take a look at the documentation, inherits from BaseException rather than Exception. That change is new as of Python 3.8, and since you are using Python 3.9 it applies here. Changing your code slightly to the following yields:
res = []
for t in tasks:
    try:
        r = t.result()  # gives an error!!!
    except BaseException as e:
        res.append(e)
        continue
    res.append(r)
print(res)
[0.3088946587429545,
0.01323751590501987,
Exception('0.4844375347808497 of Task-3 is too high'),
asyncio.exceptions.CancelledError(),
asyncio.exceptions.CancelledError(),
asyncio.exceptions.CancelledError(),
Exception('0.4419557492849159 of Task-7 is too high'),
0.3113884366691503,
0.07422124156714727,
asyncio.exceptions.CancelledError()]
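If you prefer not to catch BaseException broadly, a slightly narrower variant (a sketch, not part of the answer above) handles the cancellation case explicitly:

res = []
for t in tasks:
    try:
        res.append(t.result())
    except asyncio.CancelledError as e:  # task was cancelled when the timeout expired
        res.append(e)
    except Exception as e:               # the coroutine raised its own exception
        res.append(e)
print(res)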

Consuming and replying to separate queues | Pika implementation

I am struggling to solve my issue; I hope someone from the community can help me here.
Our requirement is locked and can't be changed, as the producer publishing to the queues is controlled by a different team.
The producer, which is written in Java, declares three queues (TASK, RESPONSE, TASK_RESPONSE) and listens on them with the help of the Spring framework.
A hashmap is sent to the TASK and TASK_RESPONSE queues from the Java AMQP client (producer).
We need to consume these hashmaps and send the responses as follows.
If the queue TASK is processed, the response needs to be sent on RESPONSE queue incrementally.
If the queue TASK_RESPONSE is processed, the response needs to be sent on TASK_RESPONSE queue incrementally (RPC mode).
Now, we need to consume and publish this in Python, since we need to do some background processing on the tasks.
I tried to work with celery and dramatiq, but was not able to figure out how it can be done with them, so I tried writing it myself (with the help of tutorials available online).
The problem is that I am able to consume the messages but am not able to publish the reply to the RESPONSE queue. Here is my code.
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor
import pika
import datetime
import logging
import json
from logging import StreamHandler
from time import sleep
from random import randint
from pika import SelectConnection
from settings import *

logging.basicConfig(handlers=[StreamHandler()], level=logging.INFO, format=logging.BASIC_FORMAT)
_logger = logging.getLogger(__name__)


class QueueConsumer(object):
    """The consumer class to manage connections to the AMQP server/queue"""

    def __init__(self, queue, logger, parameters, thread_id=0):
        self.channel = None
        self.connection = None
        self.queue_name_task = queue['task']
        self.queue_name_response = queue['response']
        self.logger = logger
        self.consumer_id = 'Consumer Thread: %d' % (thread_id,)
        self.parameters = pika.ConnectionParameters(**parameters)

    def consume(self):
        try:
            self.connection = SelectConnection(parameters=self.parameters, on_open_callback=self._on_connected)
            self.connection.ioloop.start()
        except Exception as e:
            self.logger.error('{} {}'.format(self.consumer_id, str(e)))
            self.connection.close()
            self.connection.ioloop.start()

    def _on_connected(self, connection):
        connection.channel(on_open_callback=self._on_channel_open)

    def _on_channel_open(self, channel):
        self.channel = channel
        try:
            # Declare Task Queue
            self.channel.queue_declare(queue=self.queue_name_task,
                                       exclusive=False,
                                       durable=True,
                                       auto_delete=False,
                                       callback=self._on_queue_declared)
            self.logger.info("{} Opened Channel....".format(self.consumer_id))
            # Declare Task Response Queue
            self.channel.queue_declare(queue=self.queue_name_response,
                                       exclusive=False,
                                       durable=True,
                                       auto_delete=False)
            self.logger.info("{} Opened Channel....".format(self.consumer_id))
        except Exception as e:
            self.logger.error('{} {}'.format(self.consumer_id, str(e)))

    def _on_queue_declared(self, frame):
        self.logger.debug('{} ... declaring queue'.format(self.consumer_id))
        self.channel.basic_qos(prefetch_count=1)
        try:
            self.channel.basic_consume(queue=self.queue_name_task,
                                       on_message_callback=self.handle_delivery,
                                       auto_ack=True)
            self.logger.info("{} Declared queue...".format(self.consumer_id))
        except Exception as e:
            self.logger.error('{} crashing:--> {}'.format(self.consumer_id, str(e)))

    def handle_delivery(self, channel, method, header, body):
        try:
            start_time = datetime.datetime.now()
            _logger.info("Received...")
            _logger.info("Content: %s" % body)
            req = json.loads(self.decode(body))
            # Do something
            sleep(randint(10, 20))
            time_taken = datetime.datetime.now() - start_time
            log_msg = "[{}] Time Taken: {}.{}".format(req['bar']['baz'], time_taken.seconds, time_taken.microseconds)
            _logger.info(log_msg)
            # Publish the result to another queue.
            try:
                self.channel.basic_publish(exchange='',
                                           routing_key=self.queue_name_response,
                                           properties=pika.BasicProperties(),
                                           body=log_msg)
                _logger.info("Message Published...\t(%s)" % self.queue_name_response)
            except Exception as e:
                self.logger.error('{} Message publishing failed:--> {}'.format(self.consumer_id, str(e)))
        except Exception as err:
            _logger.exception(err)

    def decode(self, body):
        try:
            _body = body.decode('utf-8')
        except AttributeError:
            _body = body
        return _body


if __name__ == "__main__":
    pika_parameters = OrderedDict([
        ('host', TF_BROKER_HOST),
        ('port', TF_BROKER_PORT),
        ('virtual_host', TF_BROKER_VHOST)
    ])
    queue = {'task': TF_IAAS_TASK_QUEUE, 'response': TF_IAAS_REPLY_QUEUE}
    try:
        with ThreadPoolExecutor(max_workers=TF_IAAS_THREAD_SIZE, thread_name_prefix=TF_IAAS_THREAD_PREFIX) as executor:
            start = 1
            for thread_id in range(start, (TF_IAAS_THREAD_SIZE + start)):
                executor.submit(QueueConsumer(queue, _logger, pika_parameters, thread_id).consume)
    except Exception as err:
        _logger.exception(err)
Publish Messages On RabbitMQ
import pika
import json
import random
import datetime
from faker import Faker
from random import randint

fake = Faker('en_US')

if __name__ == '__main__':
    try:
        connection = pika.BlockingConnection(pika.ConnectionParameters(
            host='localhost'))
        channel = connection.channel()
        channel.queue_declare(queue='tf_task', durable=True)
        started_at = datetime.datetime.now()
        properties = pika.BasicProperties(delivery_mode=2)
        for i in range(0, 10000):
            body = {
                'foo': randint(i, i + 100),
                'bar': {
                    'baz': fake.name(),
                    'poo': float(random.randrange(155 + i, 389 + i)) / 100
                }
            }
            channel.basic_publish(exchange='',
                                  routing_key='tf_task',
                                  body=json.dumps(body),
                                  properties=properties)
            if i % 10000 == 0:
                duration = datetime.datetime.now() - started_at
                print(i, duration.total_seconds())
        print(" [x] Sent 'Hello World!'")
        connection.close()
        now = datetime.datetime.now()
        duration = now - started_at
        print(duration.total_seconds())
    except Exception as e:
        print(e)
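For the reply part the question is stuck on, here is a minimal sketch of the usual pika reply pattern. It rests on an assumption not stated in the post: for the RPC-style case the Java producer would have to set reply_to and correlation_id on each message; otherwise the fixed RESPONSE queue name is used.

import json
import pika

def handle_delivery(channel, method, properties, body):
    result = json.dumps({'status': 'processed'})      # placeholder result
    reply_queue = properties.reply_to or 'RESPONSE'   # RPC reply queue, else the fixed queue
    channel.basic_publish(
        exchange='',                                  # default exchange routes by queue name
        routing_key=reply_queue,
        properties=pika.BasicProperties(correlation_id=properties.correlation_id),
        body=result)
    # assumes basic_consume(..., auto_ack=False); drop this if auto_ack=True as above
    channel.basic_ack(delivery_tag=method.delivery_tag)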

asyncio + asyncpg + pandas: obtain pandas.df with async selects from db - ERROR

Edited my code - NOW it WORKS
I'm trying to obtain some data from my Postgres db through an asyncpg connection pool, asynchronously.
Basically my db contains about 100 different tables (one per city), and I'm trying to gather all the data into one frame as fast as possible.
import pandas as pd
import asyncpg
import asyncio
from time import time

def make_t():
    lst = []
    # iterator for sql tuple
    for i in ['a',
              'b',
              'c']:
        i1 = i
        sql = """
        SELECT
            '%s' AS city,
            MAX(starttime) AS max_ts
        FROM
            "table_%s"
        """
        lst.append(sql % (i, i1))
    return tuple(lst)

async def get_data(pool, sql):
    start = time()
    async with pool.acquire() as conn:
        stmt = await conn.prepare(sql)
        columns = [a.name for a in stmt.get_attributes()]
        data = await stmt.fetch()
        print(f'Exec time: {time() - start}')
        return pd.DataFrame(data, columns=columns)

async def main():
    dsn = 'postgres://user:pass@127.0.0.1:5432/my_base'
    cT = ['city', 'max_ts']
    sqls = make_t()
    pool = await asyncpg.create_pool(dsn=dsn, max_size=50)
    start = time()
    tasks = []
    for sql in sqls:
        tasks.append(loop.create_task(get_data(pool, sql)))
    tasks = await asyncio.gather(*tasks)
    df = pd.DataFrame(columns=cT)
    for task in tasks:
        # form df from coroutine results (gather returns the DataFrames directly)
        df = df.append(task)
    print(f'total exec time: {time() - start} secs')
    print('exiting main')
    return df

loop = asyncio.get_event_loop()
df = loop.run_until_complete(main())
loop.close()
print('exiting program')
Python 3.6.5 :: Anaconda, Inc.
Gets me this error:
Traceback (most recent call last):
File "", line 319, in
File "/Users/fixx/anaconda3/lib/python3.6/asyncio/base_events.py", line
468, in run_until_complete
return future.result()
File "", line 308, in main
File "/Users/fixx/anaconda3/lib/python3.6/asyncio/tasks.py", line 594, in gather
for arg in set(coros_or_futures):
TypeError: unhashable type: 'list'
I can't figure out why; my SQL statements are in a tuple!
asyncio.gather accepts coroutines as individual arguments, and you are sending it a list of tasks. You have to use the * operator to call gather correctly:
tasks = await asyncio.gather(*tasks)
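As a follow-up note (not part of the answer above): gather returns the coroutine results in order, so the DataFrames can be collected without touching Task objects at all, and pd.concat is a more efficient way to combine them than repeated append:

dfs = await asyncio.gather(*tasks)      # list of DataFrames, in submission order
df = pd.concat(dfs, ignore_index=True)  # hypothetical replacement for the append loop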

Multiprocessing for a RabbitMQ Queue Pika

I am working with RabbitMQ queues. I want to run multiple consumer object instances from a single program. Below is my Operator class, which creates one producer and one consumer.
class Operator(object):
    def __init__(self, delegate: callable, identifier):
        """
        Create a new instance of the Operator and initialize the connections
        """
        self._queue_details = self._get_queue_details()
        self._host_ip = self._queue_details['IP']
        self._port = self._queue_details['Port']
        self._username = self._queue_details['Username']
        self._password = self._queue_details['Password']
        self._input_queue_name = self._queue_details['ReadQueueName']
        self._output_queue_name = self._queue_details['WriteQueueName']
        self._error_queue_name = self._queue_details['ErrorQueueName']
        self._delegate = delegate
        self._identifier = identifier
        self._queue_connection = None
        self._input_channel = None
        self._output_channel = None
        self._error_channel = None
        self.is_busy = False
        self.mark_to_terminate = False

    def __del__(self):
        # close connections
        self._queue_connection.close()

    @staticmethod
    def _initialize_channel(connection, queue_name, durable):
        channel = connection.channel()
        channel.queue_declare(queue=queue_name, durable=durable)
        return channel

    @staticmethod
    def _get_queue_details() -> dict:
        return ConfigurationManager().get_value('queueDetails')

    @staticmethod
    def _get_connection(username, password, host_ip, port):
        connection = pika.BlockingConnection(pika.ConnectionParameters(
            credentials=pika.PlainCredentials(username, password), host=host_ip, port=port))
        return connection

    def initialize_operator(self):
        connection = self._get_connection(self._username, self._password, self._host_ip, self._port)
        self._queue_connection = connection
        self._input_channel = self._initialize_channel(connection, self._input_queue_name, durable=False)
        self._output_channel = self._initialize_channel(connection, self._output_queue_name, durable=True)
        self._error_channel = self._initialize_channel(connection, self._error_queue_name, durable=True)

    def consume(self):
        self._input_channel.basic_qos(prefetch_count=1)
        self._input_channel.basic_consume(self._process_incoming_message, queue=self._input_queue_name)
        self._input_channel.start_consuming()

    def _push_to_queue(self, channel, response):
        channel.basic_publish(exchange='', routing_key=self._output_queue_name, body=response,
                              properties=pika.BasicProperties(delivery_mode=2))  # make message persistent

    def _process_incoming_message(self, channel, method, properties, message):
        self.is_busy = True
        processed_result, is_error = self._delegate(message)
        if is_error:
            self._error_channel.basic_publish(exchange='', routing_key=self._output_queue_name, body=processed_result,
                                              properties=pika.BasicProperties(delivery_mode=2))
        else:
            self._output_channel.basic_publish(exchange='', routing_key=self._output_queue_name, body=processed_result,
                                               properties=pika.BasicProperties(delivery_mode=2))
        # send in the final ack of the process.
        channel.basic_ack(delivery_tag=method.delivery_tag)
        # close the connection to avoid receiving further messages
        if self.mark_to_terminate:
            self._queue_connection.close()
        self.is_busy = False
And from my main script I spin up the agents like below:
# spins up the agents
for count in range(spin_up_count):
    instance = Operator(self._translate_and_parse, f'Operator: {time.time()}')
    instance.initialize_operator()
    process = Process(target=instance.consume)
    process.start()
    self._online_agents.append((instance, process))
The problem is that when I call process.start() it throws a TypeError:
TypeError: can't pickle _thread.lock objects
Complete Stack trace
File "C:/Users/adity/Documents/PythonProjects/Caligo/Caligo/QueueService.py", line 201, in _scale_up
process.start()
File "C:\Users\adity\AppData\Local\Programs\Python\Python36-32\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "C:\Users\adity\AppData\Local\Programs\Python\Python36-32\lib\multiprocessing\context.py", line 223, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "C:\Users\adity\AppData\Local\Programs\Python\Python36-32\lib\multiprocessing\context.py", line 322, in _Popen
return Popen(process_obj)
File "C:\Users\adity\AppData\Local\Programs\Python\Python36-32\lib\multiprocessing\popen_spawn_win32.py", line 65, in __init__
reduction.dump(process_obj, to_child)
File "C:\Users\adity\AppData\Local\Programs\Python\Python36-32\lib\multiprocessing\reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
TypeError: can't pickle _thread.lock objects
The RabbitMQ team monitors the rabbitmq-users mailing list and only sometimes answers questions on StackOverflow.
Don't instantiate your Operator objects prior to starting the forked processes. You also can't make instance.consume the target of the forked process.
The target method of the Process instance is what should create the Operator instance and then call the consume method.
If you need to manage the forked processes you should keep track of the process IDs and use signals to communicate with them.
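A minimal sketch of what that looks like (reusing names from the question; this is not the answerer's code): the child process builds its own Operator, so no live pika connection or channel ever has to be pickled.

from multiprocessing import Process
import time

def run_operator(delegate, identifier):
    # everything that holds sockets/thread locks is created inside the child process
    operator = Operator(delegate, identifier)
    operator.initialize_operator()
    operator.consume()

# spins up the agents
for count in range(spin_up_count):
    process = Process(target=run_operator,
                      args=(self._translate_and_parse, f'Operator: {time.time()}'))
    process.start()
    self._online_agents.append(process)

On Windows (the spawn start method, as in the traceback) the arguments themselves must still be picklable, so the delegate should ideally be a module-level function rather than a bound method of an object holding unpicklable state.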

How would I go about using concurrent.futures and queues for a real-time scenario?

It is fairly easy to do parallel work with Python 3's concurrent.futures module as shown below.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    future_to = {executor.submit(do_work, input, 60): input for input in dictionary}
    for future in concurrent.futures.as_completed(future_to):
        data = future.result()
It is also very handy to insert items into and retrieve them from a Queue.
q = queue.Queue()

for task in tasks:
    q.put(task)

while not q.empty():
    q.get()
I have a script running in the background listening for updates. In theory, as those updates arrive, I would queue them and work on them concurrently using the ThreadPoolExecutor.
Individually, all of these components work in isolation and make sense, but how do I go about using them together? Is it even possible to feed the ThreadPoolExecutor work from a queue in real time, rather than from a predetermined data set?
In a nutshell, all I want to do is receive updates of, say, 4 messages a second, shove them into a queue, and have concurrent.futures work on them. Otherwise I am stuck with a slow, sequential approach.
Let's take the canonical example in the Python documentation below:
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
The list of URLS is fixed. Is it possible to feed this list in real time and have the workers process items as they come in, perhaps from a queue for management purposes? I am a bit confused about whether my approach is actually possible.
Here is the example from the Python docs, expanded to take its work from a queue. A change to note is that this code uses concurrent.futures.wait instead of concurrent.futures.as_completed, to allow new work to be started while waiting for other work to complete.
import concurrent.futures
import urllib.request
import time
import queue

q = queue.Queue()

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']

def feed_the_workers(spacing):
    """ Simulate outside actors sending in work to do, request each url twice """
    for url in URLS + URLS:
        time.sleep(spacing)
        q.put(url)
    return "DONE FEEDING"

def load_url(url, timeout):
    """ Retrieve a single page and report the URL and contents """
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()

# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:

    # start a future for a thread which sends work in through the queue
    future_to_url = {
        executor.submit(feed_the_workers, 0.25): 'FEEDER DONE'}

    while future_to_url:
        # check for status of the futures which are currently working
        done, not_done = concurrent.futures.wait(
            future_to_url, timeout=0.25,
            return_when=concurrent.futures.FIRST_COMPLETED)

        # if there is incoming work, start a new future
        while not q.empty():
            # fetch a url from the queue
            url = q.get()
            # Start the load operation and mark the future with its URL
            future_to_url[executor.submit(load_url, url, 60)] = url

        # process any completed futures
        for future in done:
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (url, exc))
            else:
                if url == 'FEEDER DONE':
                    print(data)
                else:
                    print('%r page is %d bytes' % (url, len(data)))
            # remove the now completed future
            del future_to_url[future]
Output from fetching each url twice:
'http://www.foxnews.com/' page is 67574 bytes
'http://www.cnn.com/' page is 136975 bytes
'http://www.bbc.co.uk/' page is 193780 bytes
'http://some-made-up-domain.com/' page is 896 bytes
'http://www.foxnews.com/' page is 67574 bytes
'http://www.cnn.com/' page is 136975 bytes
DONE FEEDING
'http://www.bbc.co.uk/' page is 193605 bytes
'http://some-made-up-domain.com/' page is 896 bytes
'http://europe.wsj.com/' page is 874649 bytes
'http://europe.wsj.com/' page is 874649 bytes
At work I found a situation where I wanted to do parallel work on an unbounded stream of data. I created a small library inspired by the excellent answer already provided by Stephen Rauch.
I originally approached this problem by thinking about two separate threads, one that submits work to a queue and one that monitors the queue for any completed tasks and makes more room for new work to come in. This is similar to what Stephen Rauch proposed, where he consumes the stream using a feed_the_workers function that runs in a separate thread.
One of my colleagues helped me realize that you can get away with doing everything in a single thread if you define a buffered iterator that lets you control how many elements are let out of the input stream each time you are ready to submit more work to the thread pool.
So we introduce the BufferedIter class
class BufferedIter(object):
    def __init__(self, iterator):
        self.iter = iterator

    def nextN(self, n):
        vals = []
        for _ in range(n):
            vals.append(next(self.iter))
        return vals
which allows us to define the stream processor in the following way
import logging
import queue
import signal
import sys
import time
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

level = logging.DEBUG
log = logging.getLogger(__name__)
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
handler.setLevel(level)
log.addHandler(handler)
log.setLevel(level)

WAIT_SLEEP = 1  # second, adjust this based on the timescale of your tasks

def stream_processor(input_stream, task, num_workers):

    # Use a queue to signal shutdown.
    shutting_down = queue.Queue()

    def shutdown(signum, frame):
        log.warning('Caught signal %d, shutting down gracefully ...' % signum)
        # Put an item in the shutting down queue to signal shutdown.
        shutting_down.put(None)

    # Register the signal handler
    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    def is_shutting_down():
        return not shutting_down.empty()

    futures = dict()
    buffer = BufferedIter(input_stream)
    with ThreadPoolExecutor(num_workers) as executor:
        num_success = 0
        num_failure = 0
        while True:
            idle_workers = num_workers - len(futures)

            if not is_shutting_down():
                items = buffer.nextN(idle_workers)
                for data in items:
                    futures[executor.submit(task, data)] = data

            done, _ = wait(futures, timeout=WAIT_SLEEP, return_when=ALL_COMPLETED)
            for f in done:
                data = futures[f]
                try:
                    f.result(timeout=0)
                except Exception as exc:
                    log.error('future encountered an exception: %r, %s' % (data, exc))
                    num_failure += 1
                else:
                    log.info('future finished successfully: %r' % data)
                    num_success += 1
                del futures[f]

            if is_shutting_down() and len(futures) == 0:
                break

    log.info("num_success=%d, num_failure=%d" % (num_success, num_failure))
Below we show an example for how to use the stream processor
import itertools

def integers():
    """Simulate an infinite stream of work."""
    for i in itertools.count():
        yield i

def task(x):
    """The task we would like to perform in parallel.
    With some delay to simulate a time consuming job.
    With a baked in exception to simulate errors.
    """
    time.sleep(3)
    if x == 4:
        raise ValueError('bad luck')
    return x * x

stream_processor(integers(), task, num_workers=3)
The output for this example is shown below
2019-01-15 22:34:40,193 future finished successfully: 1
2019-01-15 22:34:40,193 future finished successfully: 0
2019-01-15 22:34:40,193 future finished successfully: 2
2019-01-15 22:34:43,201 future finished successfully: 5
2019-01-15 22:34:43,201 future encountered an exception: 4, bad luck
2019-01-15 22:34:43,202 future finished successfully: 3
2019-01-15 22:34:46,208 future finished successfully: 6
2019-01-15 22:34:46,209 future finished successfully: 7
2019-01-15 22:34:46,209 future finished successfully: 8
2019-01-15 22:34:49,215 future finished successfully: 11
2019-01-15 22:34:49,215 future finished successfully: 10
2019-01-15 22:34:49,215 future finished successfully: 9
^C <=== THIS IS WHEN I HIT Ctrl-C
2019-01-15 22:34:50,648 Caught signal 2, shutting down gracefully ...
2019-01-15 22:34:52,221 future finished successfully: 13
2019-01-15 22:34:52,222 future finished successfully: 14
2019-01-15 22:34:52,222 future finished successfully: 12
2019-01-15 22:34:52,222 num_success=14, num_failure=1
I really liked the interesting approach by @pedro above. However, when processing thousands of files, I noticed that at the end a StopIteration would be thrown and some files would always be skipped. I had to make a little modification, as follows. A very useful answer again.
class BufferedIter(object):
    def __init__(self, iterator):
        self.iter = iterator

    def nextN(self, n):
        vals = []
        try:
            for _ in range(n):
                vals.append(next(self.iter))
            return vals, False
        except StopIteration:
            return vals, True

-- Call as follows

...
if not is_shutting_down():
    items, is_finished = buffer.nextN(idle_workers)
    if is_finished:
        stop()
...

-- Where stop is a function that simply tells the processor to shut down

def stop():
    shutting_down.put(None)
It is possible to gain the benefits of the executor without strictly having to use a Queue. New tasks are submitted from the main thread. The undone futures are tracked and waited on until all futures are done.
import concurrent.futures
import sys
import time

sys.setrecursionlimit(64)  # This is only for demonstration purposes to trigger a RecursionError. Do not set in practice.

def slow_factorial(n: int) -> int:
    time.sleep(0.01)
    if n == 0:
        return 1
    else:
        return n * slow_factorial(n - 1)

initial_inputs = [0, 1, 5, 20, 200, 100, 50, 51, 55, 40, 44, 21, 222, 333, 202, 1000, 10, 9000, 9009, 99, 9999]

for executor_class in (concurrent.futures.ThreadPoolExecutor, concurrent.futures.ProcessPoolExecutor):
    for max_workers in (4, 8, 16, 32):
        start_time = time.monotonic()
        with executor_class(max_workers=max_workers) as executor:
            futures_to_n = {executor.submit(slow_factorial, n): n for n in initial_inputs}
            while futures_to_n:
                futures_done, futures_not_done = concurrent.futures.wait(futures_to_n, return_when=concurrent.futures.FIRST_COMPLETED)
                # Note: Length of futures_done is often > 1.
                for future in futures_done:
                    n = futures_to_n.pop(future)
                    try:
                        factorial_n = future.result()
                    except RecursionError:
                        n_smaller = int(n ** 0.9)
                        future = executor.submit(slow_factorial, n_smaller)
                        futures_to_n[future] = n_smaller
                        # print(f'Failed to compute factorial of {n}. Trying to compute factorial of a smaller number {n_smaller} instead.')
                    else:
                        # print(f'Factorial of {n} is {factorial_n}.')
                        pass
        used_time = time.monotonic() - start_time
        executor_type = executor_class.__name__.removesuffix('PoolExecutor').lower()
        print(f'Workflow took {used_time:.1f}s with {max_workers} {executor_type} workers.')
    print()
Output:
Workflow took 9.4s with 4 thread workers.
Workflow took 6.3s with 8 thread workers.
Workflow took 5.4s with 16 thread workers.
Workflow took 5.2s with 32 thread workers.
Workflow took 9.0s with 4 process workers.
Workflow took 5.9s with 8 process workers.
Workflow took 5.1s with 16 process workers.
Workflow took 4.9s with 32 process workers.
For more clarity, uncomment the two print statements. As per the output above, there is an asymptotic speed benefit with more workers.
