Python parallelising "async for" - python

I have the following method in my Tornado handler:
async def get(self):
    """Stream every batch produced by ``downloader.fetch`` to the client.

    Each batch is written and flushed as soon as it arrives.  Any exception
    raised while fetching or writing is logged as a warning and swallowed.
    """
    source_url = 'url here'
    try:
        stream = downloader.fetch(source_url)
        async for chunk in stream:
            self.write(chunk)
            await self.flush()
    except Exception as err:
        logger.warning(err)
This is the code for downloader.fetch():
async def fetch(url, **kwargs):
    """Asynchronously yield the batches extracted from the response at ``url``.

    Optional keyword arguments:
        timeout: passed to ``aiohttp.ClientSession``; defaults to
            ``aiohttp.ClientTimeout(total=12)``.
        response_validator: called with the response before extraction;
            defaults to ``json_response_validator``.
        extractor: called with the response and iterated asynchronously for
            batches; defaults to ``json_extractor``.

    Connection errors and timeouts are logged as warnings and re-raised.
    """
    client_timeout = kwargs.get('timeout', aiohttp.ClientTimeout(total=12))
    validate = kwargs.get('response_validator', json_response_validator)
    extract = kwargs.get('extractor', json_extractor)
    try:
        async with aiohttp.ClientSession(timeout=client_timeout) as session, \
                session.get(url) as resp:
            validate(resp)
            async for batch in extract(resp):
                yield batch
    except aiohttp.client_exceptions.ClientConnectorError:
        logger.warning("bad request")
        raise
    except asyncio.TimeoutError:
        logger.warning("server timeout")
        raise
I would like to yield the "batch" objects from multiple downloaders in parallel.
I want the first available batch from the first downloader and so on until all downloaders finished. Something like this (this is not working code):
async for batch in [downloader.fetch(url1), downloader.fetch(url2)]:
....
Is this possible? How can I modify what I am doing in order to be able to yield from multiple coroutines in parallel?

How can I modify what I am doing in order to be able to yield from multiple coroutines in parallel?
You need a function that merges two async sequences into one, iterating over both in parallel and yielding elements from one or the other, as they become available. While such a function is not included in the current standard library, you can find one in the aiostream package.
You can also write your own merge function, as shown in this answer:
async def merge(*iterables):
    """Merge several async iterables, yielding items as they become available.

    One ``__anext__`` call per source is kept in flight.  Whenever one
    completes, its item is yielded and the next item of that source is
    requested.  Iteration ends once every source is exhausted.

    An exception raised by a source propagates to the consumer.  Whenever the
    generator finishes — normally, through an error, or because the consumer
    stopped early — the still-pending ``__anext__`` tasks are cancelled so
    they do not linger on the event loop.
    """
    # Maps each in-flight ``__anext__`` task to the iterator it advances.
    pending = {}
    for iterable in iterables:
        it = iterable.__aiter__()
        pending[asyncio.ensure_future(it.__anext__())] = it
    try:
        while pending:
            done, _ = await asyncio.wait(
                pending.keys(), return_when=asyncio.FIRST_COMPLETED)
            for fut in done:
                it = pending.pop(fut)
                try:
                    item = fut.result()
                except StopAsyncIteration:
                    # This source is exhausted; simply stop tracking it.
                    continue
                yield item
                # Request the next item only after the consumer came back.
                pending[asyncio.ensure_future(it.__anext__())] = it
    finally:
        for fut in pending:
            fut.cancel()
Using that function, the loop would look like this:
async for batch in merge(downloader.fetch(url1), downloader.fetch(url2)):
....

Edit:
As mentioned in the comment, below method does not execute given routines in parallel.
Checkout aitertools library.
import asyncio
import aitertools
async def f1():
    """Yield 1 after a five-second pause."""
    await asyncio.sleep(5)
    yield 1


async def f2():
    """Yield 2 after a six-second pause."""
    await asyncio.sleep(6)
    yield 2


async def iter_funcs():
    """Print the items of ``f2()`` and ``f1()`` via ``aitertools.chain``.

    As the accompanying text notes, this does not run the two generators in
    parallel.
    """
    async for x in aitertools.chain(f2(), f1()):
        print(x)


if __name__ == '__main__':
    # asyncio.run creates and closes the event loop for us.
    asyncio.run(iter_funcs())
It seems that the functions being iterated must be coroutines.

Related

Python: await the generator end

Current versions of Python (Dec 2022) still allow using the @coroutine decorator, so a generator can be awaited like this:
import asyncio
# Legacy pattern: wrap a plain generator function so that calling it gives something awaitable.
# NOTE(review): asyncio.coroutine is deprecated since Python 3.8 and appears to be gone in 3.11+ — confirm the target version.
asyncify = asyncio.coroutine
data_ready = False # Status of a pipe, just to test
# Plain generator: prints "not ready" and yields until data_ready is set, then returns "done".
def gen():
global data_ready
while not data_ready:
print("not ready")
data_ready = True # Just to test
yield
return "done"
# Awaits the wrapped generator and prints its return value.
async def main():
result = await asyncify(gen)()
print(result)
# No loop.stop() is called anywhere in this snippet, so run_forever() keeps running after main() finishes.
loop = asyncio.new_event_loop()
loop.create_task(main())
loop.run_forever()
However, Python 3.8+ deprecates the @coroutine decorator (aliased as asyncify above). How can I wait for (await) a generator to finish, as above?
I tried to use async def as expected by the warning but not working:
import asyncio
# Left over from the previous snippet; not used below.
asyncify = asyncio.coroutine
data_ready = False # Just to test
# With "async def" plus "yield" this is an async generator function, not a coroutine function.
async def gen():
global data_ready
while not data_ready:
print("not ready")
data_ready = True # Just to test
yield
yield "done"
return
# Demonstrates the failure: the object returned by gen() cannot be awaited.
async def main():
# this has error: TypeError: object async_generator can't be used in 'await' expression
result = await gen()
print(result)
loop = asyncio.new_event_loop()
loop.create_task(main())
loop.run_forever()
Asynchronous generators implement the asynchronous iterator protocol and are meant for asynchronous iteration. You cannot await them directly like regular coroutines.
With that in mind, returning to your experimental case and your question "how to wait for (await) generator to end?": to get the final yielded value - perform asynchronous iterations:
import asyncio
data_ready = False  # Just to test


async def gen():
    """Yield "processing" until ``data_ready`` is set, then yield "done"."""
    global data_ready
    while not data_ready:
        print("not ready")
        data_ready = True  # Just to test
        yield "processing"
    yield "done"


async def main():
    """Exhaust ``gen()`` and print the last value it yielded."""
    # Pre-bind the name so an empty generator prints None instead of
    # raising NameError on the print below.
    result = None
    async for result in gen():  # rebinds result on each async iteration
        pass
    print('result:', result)
asyncio.run(main())
Prints:
not ready
result: done
Naturally, you can also advance the async generator in steps with anext:
a_gen = gen()
val_1 = await anext(a_gen)
Summing up, follow the guidelines in PEP 525 – Asynchronous Generators, and try not to mix old, deprecated constructs with current ones.

Join multiple async generators in Python [duplicate]

This question already has answers here:
asynchronous python itertools chain multiple generators
(2 answers)
Closed 3 years ago.
I would like to listen for events from multiple instances of the same object and then merge this event streams to one stream. For example, if I use async generators:
class PeriodicYielder:
    """Endless async source that keeps emitting its own period."""

    def __init__(self, period: int) -> None:
        self.period = period

    async def updates(self):
        """Forever: sleep ``self.period`` seconds, then yield ``self.period``."""
        while True:
            await asyncio.sleep(self.period)
            yield self.period
I can successfully listen for events from one instance:
async def get_updates_from_one():
    """Print every update from a single ``PeriodicYielder`` with period 1."""
    source = PeriodicYielder(1)
    async for value in source.updates():
        print(value)
# 1
# 1
# 1
# ...
But how can I get events from multiple async generators? In other words: how can I iterate through multiple async generators in the order they are ready to produce next value?
async def get_updates_from_multiple():
    """Print updates from two ``PeriodicYielder`` sources joined into one stream.

    ``magic_async_join_function`` stands for the merge helper being asked for.
    """
    fast, slow = PeriodicYielder(1), PeriodicYielder(2)
    async for value in magic_async_join_function(fast.updates(), slow.updates()):
        print(value)
# 1
# 1
# 2
# 1
# 1
# 2
# ...
Is there such magic_async_join_function in stdlib or in 3rd party module?
You can use the wonderful aiostream library. It'll look like this:
import asyncio
from aiostream import stream
async def test1():
    """Yield 1 five times, sleeping 0.1 s before each value."""
    for _ in range(5):
        await asyncio.sleep(0.1)
        yield 1


async def test2():
    """Yield 2 five times, sleeping 0.2 s before each value."""
    for _ in range(5):
        await asyncio.sleep(0.2)
        yield 2


async def main():
    """Merge both generators with aiostream and print each item received."""
    merged = stream.merge(test1(), test2())
    async with merged.stream() as items:
        async for value in items:
            print(value)


asyncio.run(main())
Result:
1
1
2
1
1
2
1
2
2
2
If you wanted to avoid the dependency on an external library (or as a learning exercise), you could merge the async iterators using a queue:
def merge_async_iters(*aiters):
    """Merge async iterators into one async generator (proof of concept).

    Must be called while an event loop is running, because one drain task is
    started per iterator.  Every drain task queues an end marker when its
    iterator finishes, and the merged generator stops after it has seen one
    marker per task.  This avoids polling ``task.done()``, which could hang
    on a source that ends without producing anything, or drop the last
    queued item when the consumer is slow.

    Still only a proof of concept: an exception raised by a source is not
    propagated to the consumer, and the drain tasks are not cancelled if the
    merged generator is abandoned.
    """
    queue = asyncio.Queue(1)
    finished = object()  # end marker; can never collide with a real item

    async def drain(aiter):
        try:
            async for item in aiter:
                await queue.put(item)
        except asyncio.CancelledError:
            # Being cancelled: do not block on the queue on the way out.
            raise
        except Exception:
            await queue.put(finished)
            raise
        else:
            await queue.put(finished)

    async def merged():
        remaining = len(tasks)
        while remaining:
            item = await queue.get()
            if item is finished:
                remaining -= 1
            else:
                yield item

    tasks = [asyncio.create_task(drain(aiter)) for aiter in aiters]
    return merged()
This passes the test from Mikhail's answer, but it's not perfect: it doesn't propagate the exception in case one of the async iterators raises. Also, if the task that exhausts the merged generator returned by merge_async_iters() gets cancelled, or if the same generator is not exhausted to the end, the individual drain tasks are left hanging.
A more complete version could handle the first issue by detecting an exception and transmitting it through the queue. The second issue can be resolved by merged generator cancelling the drain tasks as soon as the iteration is abandoned. With those changes, the resulting code looks like this:
def merge_async_iters(*aiters):
    """Merge async iterators into a single async generator.

    Must be called while an event loop is running, because one drain task is
    started per iterator.  Items are yielded in the order the sources
    produce them.

    * An exception raised by any source is re-raised to the consumer.
    * When the merged generator finishes, fails or is closed early, every
      drain task is cancelled.
    * Each drain task reports its own completion through the queue, so a
      source that ends without producing anything (for example an empty
      iterator) cannot leave the consumer waiting forever.
    """
    queue = asyncio.Queue(1)
    cancelling = False
    # Tags for the (kind, payload) pairs sent through the queue.
    ITEM, ERROR, DONE = range(3)

    async def drain(aiter):
        try:
            async for item in aiter:
                await queue.put((ITEM, item))
        except Exception as e:
            if cancelling:
                raise
            await queue.put((ERROR, e))
        else:
            await queue.put((DONE, None))

    async def merged():
        running = len(tasks)
        try:
            while running:
                kind, payload = await queue.get()
                if kind == DONE:
                    running -= 1
                elif kind == ERROR:
                    raise payload
                else:
                    yield payload
        finally:
            # Runs on normal exit, on error, and on early close.
            cancel_tasks()

    def cancel_tasks():
        nonlocal cancelling
        cancelling = True
        for t in tasks:
            t.cancel()

    tasks = [asyncio.create_task(drain(aiter)) for aiter in aiters]
    return merged()
Different approaches to merging async iterators can be found in this answer, and also this one, where the latter allows for adding new streams mid-stride. The complexity and subtlety of these implementations shows that, while it is useful to know how to write one, actually doing so is best left to well-tested external libraries such as aiostream that cover all the edge cases.

How to make a asyncio pool cancelable?

I have a pool_map function that can be used to limit the number of simultaneously executing functions.
The idea is to have a coroutine function accepting a single parameter that is mapped to a list of possible parameters, but to also wrap all function calls into a semaphore acquisition, whereupon only a limited number is running at once:
from asyncio import Semaphore
from typing import Awaitable, Callable, Iterable, Iterator, TypeVar

A = TypeVar('A')
V = TypeVar('V')


def pool_map(
    func: Callable[[A], Awaitable[V]],
    arg_it: Iterable[A],
    size: int = 10,
) -> Iterator[Awaitable[V]]:
    """
    Maps an async function to iterables
    ensuring that only some are executed at once.

    Returns a lazy iterator of coroutines, one per argument.  Each coroutine
    acquires a shared semaphore before calling ``func``, so at most ``size``
    calls run at the same time.  This is a plain function, not a coroutine,
    so the result can be handed directly to e.g. ``asyncio.as_completed``.
    """
    semaphore = Semaphore(size)

    async def sub(arg):
        async with semaphore:
            return await func(arg)

    return map(sub, arg_it)
I modified and didn’t test above code for the sake of an example, but my variant works well. E.g. you can use it like this:
from asyncio import get_event_loop, coroutine, as_completed
from contextlib import closing
URLS = [...]


async def run_all(awaitables):
    """Await the given awaitables and print each result as it completes."""
    for a in as_completed(awaitables):
        result = await a
        print('got result', result)


async def download(url): ...


# Run the demo only when executed as a script.  The guard was inverted, so
# the pool started on import and never when the file was run directly.
if __name__ == '__main__':
    pool = pool_map(download, URLS)
    with closing(get_event_loop()) as loop:
        loop.run_until_complete(run_all(pool))
But a problem arises if there is an exception thrown while awaiting a future. I can’t see how to cancel all scheduled or still-running tasks, neither the ones still waiting for the semaphore to be acquired.
Is there a library or an elegant building block for this that I don’t know, or do I have to build all parts myself? (i.e. a Semaphore with access to its waiters, a as_finished that provides access to its running task queue, …)
Use ensure_future to get a Task instead of a coroutine:
import asyncio
from contextlib import closing
def pool_map(func, args, size=10):
    """
    Schedule ``func(arg)`` for every arg, at most ``size`` running at once.

    Each call is wrapped so that it first acquires a shared semaphore, and is
    scheduled right away with ``asyncio.ensure_future``.  Returns the list of
    resulting tasks, in argument order.
    """
    limiter = asyncio.Semaphore(size)

    async def limited(arg):
        async with limiter:
            return await func(arg)

    return [asyncio.ensure_future(limited(arg)) for arg in args]
async def f(n):
    """Demo job: announce start, sleep ``n / 10`` seconds, announce end, return ``n``.

    Raises ``Exception("boom!")`` right after the start message when ``n`` is 7.
    """
    print(">>> start", n)
    if n == 7:
        raise Exception("boom!")
    pause = n / 10
    await asyncio.sleep(pause)
    print("<<< end", n)
    return n
async def run_all(tasks):
    """Await ``tasks`` in completion order, cancelling all of them on failure.

    Results and cancellations are printed as they are observed.  When a task
    raises, every task is cancelled and the exception is remembered.  After
    all tasks have been awaited, the last remembered exception is re-raised.
    """
    failure = None
    for finished in asyncio.as_completed(tasks):
        try:
            outcome = await finished
            print('=== result', outcome)
        except asyncio.CancelledError as cancelled:
            print("!!! cancel", cancelled)
        except Exception as err:
            print("Exception in task, cancelling!")
            for task in tasks:
                task.cancel()
            failure = err
    if failure:
        raise failure
async def main():
    """Build the pool inside the running loop and drive it to completion."""
    # pool_map schedules tasks immediately, so it must run inside the loop.
    pool = pool_map(f, range(1, 20), 3)
    await run_all(pool)


asyncio.run(main())
Here's a naive solution, based on the fact that cancel is a no-op if the task is already finished:
async def run_all(awaitables):
    """Await all ``awaitables``, printing results in completion order.

    If any of them raises, or this coroutine is itself cancelled, every
    remaining future is cancelled and the original exception is re-raised
    instead of being silently swallowed.  In all cases the function returns
    only after every future has settled.  An empty input is accepted.
    """
    futures = [asyncio.ensure_future(a) for a in awaitables]
    try:
        for fut in asyncio.as_completed(futures):
            result = await fut
            print('got result', result)
    except BaseException:
        # cancel() is a no-op on futures that are already finished.
        for future in futures:
            future.cancel()
        raise
    finally:
        # gather() accepts an empty list, unlike asyncio.wait(), and with
        # return_exceptions=True it will not raise for cancelled or failed
        # futures.
        await asyncio.gather(*futures, return_exceptions=True)

Python asyncio task list generation without executing the function

While working in asyncio, I'm trying to use a list comprehension to build my task list. The basic form of the function is as follows:
import asyncio
import urllib.request as req
#asyncio.coroutine
def coro(term):
print(term)
google = "https://www.google.com/search?q=" + term.replace(" ", "+") + "&num=100&start=0"
request = req.Request(google, None, headers)
(some beautiful soup stuff)
My goal is to use a list of terms to create my task list:
terms = ["pie", "chicken" ,"things" ,"stuff"]
tasks=[
coro("pie"),
coro("chicken"),
coro("things"),
coro("stuff")]
My initial thought was:
loop = asyncio.get_event_loop()
tasks = [my_coroutine(term) for term in terms]
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
This doesn't create the task list; it runs the function during the list comprehension. Is there a shortcut for creating the task list without writing out every task?
Your HTTP client does not support asyncio, and you will not get the expected results. Try this to see .wait() does work as you expected:
import asyncio
import random
async def my_coroutine(term):
    """Print a start line, sleep between 1 and 3 seconds, print an end line."""
    print("start", term)
    await asyncio.sleep(random.uniform(1, 3))
    print("end", term)


async def main():
    """Start one task per term and wait until all of them have finished."""
    # asyncio.wait() needs Task objects; newer Python versions reject bare
    # coroutine objects.
    tasks = [asyncio.create_task(my_coroutine(term)) for term in terms]
    print("Here we go!")
    await asyncio.wait(tasks)


terms = ["pie", "chicken", "things", "stuff"]
asyncio.run(main())
If you use asyncio.gather() you get one future encapsulating all your tasks, which can easily be cancelled with .cancel(). It is demonstrated here with Python 3.5+ async def/await syntax (but works the same with @coroutine and yield from):
import asyncio
import random
# Prints a start line, sleeps a random 0.2-1.5 s, prints an end line and returns a summary string.
async def my_coroutine(term):
print("start", term)
n = random.uniform(0.2, 1.5)
await asyncio.sleep(n)
print("end", term)
return "Term {} slept for {:.2f} seconds".format(term, n)
# Cancels the module-level gather future `fut`, which also contains this coroutine itself.
async def stop_all():
"""Cancels all still running tasks after one second"""
await asyncio.sleep(1)
print("stopping")
fut.cancel()
return ":-)"
loop = asyncio.get_event_loop()
terms = ["pie", "chicken", "things", "stuff"]
# Generator expression: the coroutine objects are only created when unpacked into gather() below.
tasks = (my_coroutine(term) for term in terms)
# NOTE(review): gather() is called here before the loop is running — confirm this is still accepted on the target Python version.
fut = asyncio.gather(stop_all(), *tasks, return_exceptions=True)
print("Here we go!")
# NOTE(review): on recent Python versions cancelling the gather future may make this call raise CancelledError even with return_exceptions=True — confirm.
loop.run_until_complete(fut)
for task_result in fut.result():
# NOTE(review): CancelledError derives from BaseException on Python 3.8+, so cancelled entries would not match this check — confirm.
if not isinstance(task_result, Exception):
print("OK", task_result)
else:
print("Failed", task_result)
loop.close()
And finally, if you want to use an async HTTP client, try aiohttp. First install it with:
pip install aiohttp
then try this example, which uses asyncio.as_completed:
import asyncio
import aiohttp
async def fetch(session, url):
    """Download ``url`` with ``session`` and return a one-line size summary."""
    print("Getting {}...".format(url))
    async with session.get(url) as resp:
        text = await resp.text()
        return "{}: Got {} bytes".format(url, len(text))


async def fetch_all():
    """Fetch the demo URLs concurrently, printing each summary as it arrives."""
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, "http://httpbin.org/delay/{}".format(delay))
                 for delay in (1, 1, 2, 3, 3)]
        for task in asyncio.as_completed(tasks):
            print(await task)
    return "Done."


# asyncio.run creates and closes the event loop for us.
resp = asyncio.run(fetch_all())
print(resp)
this works in python 3.5 (added the new async-await syntax):
import asyncio
async def coro(term):
    """Three times over: sleep ``len(term)`` seconds, then print the round and term."""
    for round_no in range(3):
        await asyncio.sleep(len(term))  # just sleep
        print("cor1", round_no, term)
terms = ["pie", "chicken", "things", "stuff"]


async def main():
    """Run one ``coro`` per term concurrently and wait for all of them."""
    # gather() wraps the coroutines in tasks itself.  asyncio.wait() no
    # longer accepts bare coroutine objects on newer Python versions.
    await asyncio.gather(*(coro(term) for term in terms))


asyncio.run(main())
Shouldn't your version yield from req.Request(google, None, headers)? And what library is that — is it even made for use with asyncio?
(here is the same code with the python <= 3.4 syntax; the missing parts are the same as above):
# NOTE(review): the next line looks like a garbled `@asyncio.coroutine` decorator — confirm against the original answer.
#asyncio.coroutine
# Generator-based variant: three times over, sleep len(term) seconds, then print the round and the term.
def coro(term):
for i in range(3):
yield from asyncio.sleep(int(len(term))) # just sleep
print("cor1", i, term)
Create queue and run event loop
async def main():
    """Start one task per remaining term and wait for all of them.

    ``terms`` is emptied from the end, as in the original loop.  Failures are
    returned in the result list rather than raised.
    """
    tasks = []
    while terms:
        # NOTE(review): assumes each term has to be wrapped in coro(), since
        # create_task() needs a coroutine — confirm.
        tasks.append(asyncio.create_task(coro(terms.pop())))
    return await asyncio.gather(*tasks, return_exceptions=True)


responses = asyncio.run(main())

Persist and fetch data in with block

I have a situation - I'm using the asyncio package with Python 3.x, and persisting data in a with block, something like this:
test_repo = TestRepository()
with (yield from test_repo):
res = yield from test_repo.get_by_lim_off(
page_size=int(length),
offset=start,
customer_name=customer_name,
customer_phone=customer_phone,
return_type=return_type
)
I need to get res data in the with block, but persistence and fetching data should happen when I exit from the with block. How can I achieve this?
This behavior is only supported in Python 3.5+, via asynchronous context managers (__aenter__/__aexit__), and async with, both of which were added in PEP 492:
class TestRepository:
    """Repository usable as an asynchronous context manager.

    Entering awaits ``some_init()``.  Leaving awaits ``do_persistence()`` and
    then ``fetch_data()``, whether or not the body raised.  Exceptions from
    the body are not suppressed.
    """
    # All your normal methods go here

    async def __aenter__(self):
        # You can call coroutines here
        await self.some_init()
        # Return the instance so ``async with TestRepository() as repo`` works.
        return self

    async def __aexit__(self, exc_type, exc, tb):
        # You can call coroutines here
        await self.do_persistence()
        await self.fetch_data()
async def do_work():
    """Query the repository inside its async context and return the result.

    The value is returned after the ``async with`` block has exited, i.e.
    after the repository's exit hook has run.
    """
    test_repo = TestRepository()
    async with test_repo:
        res = await test_repo.get_by_lim_off(
            page_size=int(length),
            offset=start,
            customer_name=customer_name,
            customer_phone=customer_phone,
            return_type=return_type
        )
    return res


# asyncio.run creates and closes the event loop for us.
asyncio.run(do_work())
Prior to 3.5, you have to use a try/finally block with explicit calls to the init/cleanup coroutines, unfortunately:
# NOTE(review): the next line looks like a garbled `@asyncio.coroutine` decorator — confirm against the original answer.
#asyncio.coroutine
# Pre-3.5 variant: initialise explicitly, then always run persistence and fetch in the finally clause.
def do_work():
test_repo = TestRepository()
yield from test_repo.some_init()
try:
res = yield from test_repo.get_by_lim_off(
page_size=int(length),
offset=start,
customer_name=customer_name,
customer_phone=customer_phone,
return_type=return_type
)
finally:
# Runs whether or not the query above raised.
yield from test_repo.do_persistence()
yield from test_repo.fetch_data()

Categories