Why does my asyncio function stop after the first task?

This is my first attempt at asynchronous programming in Python, but I am running into a problem where my results stop after the first task is finished, as opposed to returning all of the results after every task has finished executing.
In api.py, I have a search_async function that ultimately makes the request using the aiohttp.ClientSession object being passed around, and a search_value_async function that wraps it and is called from app.py.
# api.py
async def search_async(self, session, offset=0):
    endpoint = 'https://example.com'
    query_string = urlencode({ 'offset': offset })
    lookup_url = f'{endpoint}?{query_string}'

    async with session.get(lookup_url, headers=self.get_resource_headers()) as response:
        if response.status not in range(200, 299):
            return {
                'Status': response.status
            }
        return await response.json()

async def search_value_async(self, session, offset=0):
    return await self.search_async(session, offset)
# app.py
async def get_recommendations(queries):
    async with aiohttp.ClientSession() as session:
        data = await get_all_queries(session, queries)
    return data

async def get_all_queries(session, queries):
    tasks = []
    for query in queries:
        for offset in range(0, 1000, 50):
            tasks.append(asyncio.create_task(api.search_value_async(session, query, offset)))
    results = await asyncio.gather(*tasks)
    return results

def main():
    # queries = ...
    data = []
    results = asyncio.run(get_recommendations(queries))
    data.extend(results)
    recommendations = normalize_data(data)
    return data
So far, I've confirmed that the correct number of coroutines is being created, and I was able to determine that the number of results I get back when running asynchronously is equivalent to only the first task being run.
I'm new to this, so my understanding could be wrong, but if all my tasks are being created, I would expect await asyncio.gather(*tasks) to give me the results from all of my completed tasks, not just the first one.
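For reference, here is a minimal, self-contained sketch (separate from the question's code) showing the behaviour expected from asyncio.gather: one result per task, in the order the tasks were created.

import asyncio

async def fake_search(offset):
    await asyncio.sleep(0.01)
    return {'offset': offset}

async def main():
    tasks = [asyncio.create_task(fake_search(offset)) for offset in range(0, 200, 50)]
    results = await asyncio.gather(*tasks)
    print(len(results))  # 4 -- one result per task, in creation order

asyncio.run(main())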

Related

python asyncio result not set in task error

I am trying to create an asyncio task, perform a db query and a for loop over the rows, and get the result back from the task. However, in the code sample below, it seems like my result is not being put into total_result.result() but rather just into total_result.
Not sure if there's a misunderstanding in my implementation of asyncio below?
class DatabaseHandler:
    def __init__(self):
        self.loop = get_event_loop()
        self.engine = create_engine("postgres stuffs here")
        self.conn = self.engine.connect()

    async def _fetch_sql_data(self, query):
        return self.conn.execute(query)

    async def get_all(self, item):
        total_result = []
        if item == "all":
            data = create_task(self._fetch_sql_data("select col1 from table1;"))
        else:
            data = create_task(self._fetch_sql_data(f"select col1 from table1 where quote = '{item}';"))
        await data
        for i in data.result().fetchall():
            total_result.append(i[0])
        return total_result

    async def update(self):
        total_result = create_task(self.get_all("all"))
        print(await total_result)  # prints out the result immediately and not the task object.
        # this means that `total_result.result()` produces an error

loop = get_event_loop()
a = DatabaseHandler()
loop.run_until_complete(a.update())
I have a feeling that it is because of total_result being a list object. But not sure how to resolve this.
task.result() returns the result of your task (the return value of the wrapped coro) and not another Task. This means this
task = asyncio.create_task(coro())
await task
result = task.result()
is actually equivalent to
result = await coro()
Using tasks is especially useful if you want to execute multiple coroutines concurrently. But since you are not doing that here, your code is a bit overcomplicated. You can just do
async def get_all(self, item):
    total_result = []
    if item == "all":
        result = await self._fetch_sql_data("select col1 from table1;")
    else:
        result = await self._fetch_sql_data(f"select col1 from table1 where quote = '{item}';")
    for i in result.fetchall():
        total_result.append(i[0])
    return total_result  # holds the results of your db query just as called from sync code
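For contrast, here is a minimal sketch (not tied to the DB code above) of the case where create_task and gather do pay off: several independent awaitables running concurrently.

import asyncio

async def fetch_one(item):
    # stand-in for an awaitable db/network call
    await asyncio.sleep(0.1)
    return f"result for {item}"

async def fetch_many(items):
    # create_task/gather shine when several awaitables can overlap in time
    tasks = [asyncio.create_task(fetch_one(item)) for item in items]
    return await asyncio.gather(*tasks)

print(asyncio.run(fetch_many(["a", "b", "c"])))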

Asyncio running same task second time with different input shuts down first task too

I have a script with multiple async functions, and I am running them in a loop. Everything runs okay, except for one task that I need to run twice with different input parameters.
def run(self):
    checks_to_run = self.returnChecksBasedOnInputs()
    loop = asyncio.new_event_loop().run_until_complete(self.run_all_checks_async(checks_to_run))
    asyncio.set_event_loop(loop)
    return self.output

async def run_all_checks_async(self, checks_to_run):
    async with aiohttp.ClientSession() as session:
        check_results = []
        for single_check in checks_to_run:
            if single_check == "cvim_check_storage":  # can run parallel in separate thread for each az
                total_number_of_azs = len(Constants.cvim_azs) + 1
                self.log.info(total_number_of_azs)
                for x in range(1, total_number_of_azs):
                    task = asyncio.ensure_future(getattr(self, single_check)(session, x))
            else:
                task = asyncio.ensure_future(getattr(self, single_check)(session))
            check_results.append(task)
        await asyncio.gather(*check_results, return_exceptions=False)

class apiCaller:
    def __init__(self):
        pass

    async def callAndReturnJson(self, method, url, headers, session, payload, log):
        sslcontext = ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH)
        try:
            async with session.request(method, url, data=payload, headers=headers, ssl=sslcontext) as response:
                response = await response.json()
                print(str(response))
                return response
        except Exception as e:
            print("here exception")
            raise Exception(str(e))
The problem is in this function. I run it twice, but I noticed that when the second instance of the task reaches its return statement, the first task is shut down immediately as well. How can I avoid that and wait until the other task finishes too?
async def cvim_check_storage(self, session, aznumber):
    response = await apiCaller().callAndReturnJson("POST", f"https://{single_cvim_az}/v1/diskmgmt/check_disks", getattr(Constants, cvim_az_headers), session=session, payload=payload, log=self.log)
    self.log.info(str(response))
    self.log.info(str(response.keys()))
    if "diskmgmt_request" not in response.keys():
        self.output.cvim_checks.cvim_raid_checks.results[az_plus_number].overall_status = "FAILED"
        self.output.cvim_checks.cvim_raid_checks.results[az_plus_number].details = str(response)
        return
    # ...rest of the code if the above if statement is false
The problem is how you track your tasks. You use the variable task to add new tasks to check_results, but in one of your branches you assign task inside a for loop. You don't append to check_results until after that loop completes, so only the last task gets added, and gather never waits on the tasks created in the earlier iterations.
The solution is to add task during each iteration of the inner for loop. There are a few different ways to spell that.
One option is to just call check_results.append anywhere you currently assign to task.
if single_check == "cvim_check_storage":  # can run parallel in separate thread for each az
    total_number_of_azs = len(Constants.cvim_azs) + 1
    self.log.info(total_number_of_azs)
    for x in range(1, total_number_of_azs):
        check_results.append(
            asyncio.ensure_future(getattr(self, single_check)(session, x))
        )
else:
    check_results.append(
        asyncio.ensure_future(getattr(self, single_check)(session))
    )
I'd take it one step further and use a list comprehension when creating multiple tasks, though.
if single_check == "cvim_check_storage":  # can run parallel in separate thread for each az
    total_number_of_azs = len(Constants.cvim_azs) + 1
    self.log.info(total_number_of_azs)
    check_results.extend(
        [
            asyncio.ensure_future(getattr(self, single_check)(session, x))
            for x in range(1, total_number_of_azs)
        ]
    )
else:
    task = asyncio.ensure_future(getattr(self, single_check)(session))
    check_results.append(task)

Why does using pytest-asyncio and @parametrize cause tests to run longer than without?

I have a test. It sends a GET request to a list of URLs and checks that the response status is not a 500-level error.
@pytest.mark.asyncio
@pytest.mark.parametrize('url_test_list', get_all_url_list(HOST_FOR_TEST))
async def test_check_status_urls(self, url_test_list):
    returned_status = await get(url_test_list)
    assert returned_status < 500
and this is my "get" function
async def get(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            response_status = response.status
            return response_status
It works, but it is slow. It takes about 3 minutes to complete.
But when I use this test without @parametrize and my "get" function takes the whole url_list, it runs in about 1 minute. My code in the second case:
@pytest.mark.asyncio
async def test_check_status_urls(self):
    url_list = make_url_list()
    returned_status = await get(url_list)
    assert all(returned_status) > 500

async def get(urls):
    good_list = []
    async with aiohttp.ClientSession() as session:
        for url in urls:
            async with session.get(url) as response:
                response_status = response.status
                good_list.append(response_status)
        return good_list
I would like to have the best of both worlds here. Is there a way I can have the tests run quickly, but also run as individual units?
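One possible approach (a sketch, not from the thread): fetch all the statuses concurrently once, at collection time, then parametrize only the assertion over the pre-fetched results. get_all_url_list and HOST_FOR_TEST are the helpers from the question and are assumed to be importable.

import asyncio
import aiohttp
import pytest

URLS = get_all_url_list(HOST_FOR_TEST)  # helpers from the question, assumed to exist

async def fetch_status(session, url):
    async with session.get(url) as response:
        return response.status

async def fetch_all(urls):
    # one shared session, all requests in flight concurrently
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_status(session, url) for url in urls))

# run the concurrent fetch once, when the module is collected
STATUSES = asyncio.run(fetch_all(URLS))

@pytest.mark.parametrize('url, status', zip(URLS, STATUSES))
def test_check_status_urls(url, status):
    # each URL still reports as its own test case
    assert status < 500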

Python parallelising "async for"

I have the following method in my Tornado handler:
async def get(self):
    url = 'url here'
    try:
        async for batch in downloader.fetch(url):
            self.write(batch)
            await self.flush()
    except Exception as e:
        logger.warning(e)
This is the code for downloader.fetch():
async def fetch(url, **kwargs):
    timeout = kwargs.get('timeout', aiohttp.ClientTimeout(total=12))
    response_validator = kwargs.get('response_validator', json_response_validator)
    extractor = kwargs.get('extractor', json_extractor)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as resp:
                response_validator(resp)
                async for batch in extractor(resp):
                    yield batch
    except aiohttp.client_exceptions.ClientConnectorError:
        logger.warning("bad request")
        raise
    except asyncio.TimeoutError:
        logger.warning("server timeout")
        raise
I would like to yield the "batch" object from multiple downloaders in parallel.
I want the first available batch from whichever downloader produces it, and so on until all downloaders have finished. Something like this (this is not working code):
async for batch in [downloader.fetch(url1), downloader.fetch(url2)]:
....
Is this possible? How can I modify what I am doing in order to be able to yield from multiple coroutines in parallel?
How can I modify what I am doing in order to be able to yield from multiple coroutines in parallel?
You need a function that merges two async sequences into one, iterating over both in parallel and yielding elements from one or the other, as they become available. While such a function is not included in the current standard library, you can find one in the aiostream package.
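For illustration, here is a rough sketch of how an aiostream-based merge is typically used (the generators are placeholders, not your downloaders):

import asyncio
from aiostream import stream

async def gen(name, delay):
    for i in range(3):
        await asyncio.sleep(delay)
        yield f"{name}-{i}"

async def main():
    # stream.merge interleaves items from all sources as they become available
    merged = stream.merge(gen("a", 0.10), gen("b", 0.15))
    async with merged.stream() as streamer:
        async for item in streamer:
            print(item)

asyncio.run(main())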
You can also write your own merge function, as shown in this answer:
async def merge(*iterables):
    iter_next = {it.__aiter__(): None for it in iterables}
    while iter_next:
        for it, it_next in iter_next.items():
            if it_next is None:
                fut = asyncio.ensure_future(it.__anext__())
                fut._orig_iter = it
                iter_next[it] = fut
        done, _ = await asyncio.wait(iter_next.values(),
                                     return_when=asyncio.FIRST_COMPLETED)
        for fut in done:
            iter_next[fut._orig_iter] = None
            try:
                ret = fut.result()
            except StopAsyncIteration:
                del iter_next[fut._orig_iter]
                continue
            yield ret
Using that function, the loop would look like this:
async for batch in merge(downloader.fetch(url1), downloader.fetch(url2)):
....
Edit:
As mentioned in the comments, the method below does not execute the given coroutines in parallel.
Check out the aitertools library.
import asyncio
import aitertools

async def f1():
    await asyncio.sleep(5)
    yield 1

async def f2():
    await asyncio.sleep(6)
    yield 2

async def iter_funcs():
    async for x in aitertools.chain(f2(), f1()):
        print(x)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(iter_funcs())
It seems that the functions being iterated must be coroutines.

aiohttp: rate limiting parallel requests

APIs often have rate limits that users have to follow. As an example let's take 50 requests/second. Sequential requests take 0.5-1 second and thus are too slow to come close to that limit. Parallel requests with aiohttp, however, exceed the rate limit.
To poll the API as fast as allowed, one needs to rate limit parallel calls.
Examples that I found so far decorate session.get, approximately like so:
session.get = rate_limited(max_calls_per_second)(session.get)
This works well for sequential calls. Trying to implement this in parallel calls does not work as intended.
Here's some code as example:
async with aiohttp.ClientSession() as session:
    session.get = rate_limited(max_calls_per_second)(session.get)
    tasks = (asyncio.ensure_future(download_coroutine(
        timeout, session, url)) for url in urls)
    process_responses_function(await asyncio.gather(*tasks))
The problem with this is that it will rate-limit the queueing of the tasks. The execution with gather will still happen more or less at the same time. Worst of both worlds ;-).
Yes, I found a similar question right here, aiohttp: set maximum number of requests per second, but neither reply answers the actual question of limiting the rate of requests. Also, the blog post from Quentin Pradet works only on rate-limiting the queueing.
To wrap it up: How can one limit the number of requests per second for parallel aiohttp requests?
If I understand you correctly, you want to limit the number of simultaneous requests?
There is an object in asyncio named Semaphore; it works like an asynchronous RLock.
semaphore = asyncio.Semaphore(50)
# ...

async def limit_wrap(url):
    async with semaphore:
        # do what you want
        ...

# ...
results = await asyncio.gather(*[limit_wrap(url) for url in urls])
updated
Suppose I make 50 concurrent requests, and they all finish in 2 seconds. That doesn't hit the limitation (only 25 requests per second).
That means I should make 100 concurrent requests, and they all finish in 2 seconds too (50 requests per second). But before you actually make those requests, how could you determine how long they will take to finish?
Or, if you don't mind finished requests per second but rather requests made per second, you can:
async def loop_wrap(urls):
    for url in urls:
        asyncio.ensure_future(download(url))
        await asyncio.sleep(1/50)

asyncio.ensure_future(loop_wrap(urls))
loop.run_forever()
The code above will create a Future instance every 1/50 second.
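A self-contained variant of that idea (just a sketch, with a dummy download and create_task/gather plus asyncio.run in place of ensure_future and run_forever):

import asyncio

async def download(url):
    # stand-in for the real request coroutine
    await asyncio.sleep(0.2)
    return url

async def loop_wrap(urls, per_second=50):
    tasks = []
    for url in urls:
        tasks.append(asyncio.create_task(download(url)))
        await asyncio.sleep(1 / per_second)  # start at most `per_second` new requests each second
    return await asyncio.gather(*tasks)  # wait for the stragglers to finish

print(asyncio.run(loop_wrap(range(10))))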
I approached the problem by creating a subclass of aiohttp.ClientSession() with a rate limiter based on the leaky-bucket algorithm. I use asyncio.Queue() for rate limiting instead of Semaphores. I've only overridden the _request() method. I find this approach cleaner, since you only replace session = aiohttp.ClientSession() with session = ThrottledClientSession(rate_limit=15).
import asyncio
import time
from typing import Optional

import aiohttp

class ThrottledClientSession(aiohttp.ClientSession):
    """
    Rate-throttled client session class inherited from aiohttp.ClientSession

    USAGE:
        replace `session = aiohttp.ClientSession()`
        with `session = ThrottledClientSession(rate_limit=15)`

    see https://stackoverflow.com/a/60357775/107049
    """

    MIN_SLEEP = 0.1

    def __init__(self, rate_limit: float = None, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.rate_limit = rate_limit
        self._fillerTask = None
        self._queue = None
        self._start_time = time.time()
        if rate_limit is not None:
            if rate_limit <= 0:
                raise ValueError('rate_limit must be positive')
            self._queue = asyncio.Queue(min(2, int(rate_limit) + 1))
            self._fillerTask = asyncio.create_task(self._filler(rate_limit))

    def _get_sleep(self) -> Optional[float]:
        if self.rate_limit is not None:
            return max(1 / self.rate_limit, self.MIN_SLEEP)
        return None

    async def close(self) -> None:
        """Close rate-limiter's "bucket filler" task"""
        if self._fillerTask is not None:
            self._fillerTask.cancel()
            try:
                await asyncio.wait_for(self._fillerTask, timeout=0.5)
            except asyncio.TimeoutError as err:
                print(str(err))
        await super().close()

    async def _filler(self, rate_limit: float = 1):
        """Filler task to fill the leaky bucket algo"""
        try:
            if self._queue is None:
                return
            self.rate_limit = rate_limit
            sleep = self._get_sleep()
            updated_at = time.monotonic()
            fraction = 0
            extra_increment = 0
            for i in range(0, self._queue.maxsize):
                self._queue.put_nowait(i)
            while True:
                if not self._queue.full():
                    now = time.monotonic()
                    increment = rate_limit * (now - updated_at)
                    fraction += increment % 1
                    extra_increment = fraction // 1
                    items_2_add = int(min(self._queue.maxsize - self._queue.qsize(), int(increment) + extra_increment))
                    fraction = fraction % 1
                    for i in range(0, items_2_add):
                        self._queue.put_nowait(i)
                    updated_at = now
                await asyncio.sleep(sleep)
        except asyncio.CancelledError:
            print('Cancelled')
        except Exception as err:
            print(str(err))

    async def _allow(self) -> None:
        if self._queue is not None:
            # debug
            # if self._start_time == None:
            #     self._start_time = time.time()
            await self._queue.get()
            self._queue.task_done()
        return None

    async def _request(self, *args, **kwargs) -> aiohttp.ClientResponse:
        """Throttled _request()"""
        await self._allow()
        return await super()._request(*args, **kwargs)
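A short usage sketch for the class above (the URL is just a placeholder, and asyncio/aiohttp are imported as in the class definition):

async def main():
    # rate_limit=15 aims for roughly 15 requests per second across the whole session
    async with ThrottledClientSession(rate_limit=15) as session:
        async with session.get('https://example.com/api') as resp:  # placeholder URL
            print(resp.status)

asyncio.run(main())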
I liked how @sraw approached this with asyncio, but their answer didn't quite cut it for me. Since I don't know whether my calls to download will each be faster or slower than the rate limit, I want the option to run many in parallel when requests are slow and one at a time when requests are very fast, so that I'm always right at the rate limit.
I do this with a queue and a producer that produces new jobs at the rate limit, plus many consumers that either all wait for the next job if they're fast, or work through the backlog in the queue if they're slow, running as fast as the processor/network allow:
import asyncio
from datetime import datetime

async def download(url):
    # download or whatever
    task_time = 1/10
    await asyncio.sleep(task_time)
    result = datetime.now()
    return result, url

async def producer_fn(queue, urls, max_per_second):
    for url in urls:
        await queue.put(url)
        await asyncio.sleep(1/max_per_second)

async def consumer(work_queue, result_queue):
    while True:
        url = await work_queue.get()
        result = await download(url)
        work_queue.task_done()
        await result_queue.put(result)

urls = range(20)

async def main():
    work_queue = asyncio.Queue()
    result_queue = asyncio.Queue()

    num_consumer_tasks = 10
    max_per_second = 5

    consumers = [asyncio.create_task(consumer(work_queue, result_queue))
                 for _ in range(num_consumer_tasks)]
    producer = asyncio.create_task(producer_fn(work_queue, urls, max_per_second))
    await producer

    # wait for the remaining tasks to be processed
    await work_queue.join()

    # cancel the consumers, which are now idle
    for c in consumers:
        c.cancel()

    while not result_queue.empty():
        result, url = await result_queue.get()
        print(f'{url} finished at {result}')

asyncio.run(main())
I developed a library named octopus-api (https://pypi.org/project/octopus-api/) that enables you to rate-limit and set the number of (parallel) connections to the endpoint, using aiohttp under the hood. Its goal is to simplify all the aiohttp setup needed.
Here is an example of how to use it, where get_ethereum is the user-defined request function:
from octopus_api import TentacleSession, OctopusApi
from typing import Dict, List

if __name__ == '__main__':
    async def get_ethereum(session: TentacleSession, request: Dict):
        async with session.get(url=request["url"], params=request["params"]) as response:
            body = await response.json()
            return body

    client = OctopusApi(rate=50, resolution="sec", connections=6)
    result: List = client.execute(requests_list=[{
        "url": "https://api.pro.coinbase.com/products/ETH-EUR/candles?granularity=900&start=2021-12-04T00:00:00Z&end=2021-12-04T00:00:00Z",
        "params": {}}] * 1000, func=get_ethereum)
    print(result)
The TentacleSession works the same as how you write POST, GET, PUT and PATCH for aiohttp.ClientSession.
Let me know if it helps your issue related to rate limits and parallel calls.
As for the question about n requests being sent at the same time when gather() is called, the key is using create_task() with an await asyncio.sleep(1.1) before every call. Any task created with create_task starts running immediately:
for i in range(THREADS):
    await asyncio.sleep(1.1)
    tasks.append(
        asyncio.create_task(getData(session, q, ''.join(random.choice(string.ascii_lowercase) for i in range(10))))
    )
await asyncio.gather(*tasks)
The other issue, limiting the number of simultaneous connections, is also solved in the example below by using the ClientSession() context in async_payload_wrapper and setting the connector with a limit.
With this setup I can run 25 coroutines (THREADS=25) that each loop over a queue of URLs without violating a 25-concurrent-connection rule:
async def send_request(session, url, routine):
    start_time = time.time()
    print(f"{routine}, sending request: {datetime.now()}")
    params = {
        'api_key': 'nunya',
        'url': '%s' % url,
        'render_js': 'false',
        'premium_proxy': 'false',
        'country_code': 'us'
    }
    try:
        async with session.get(url='http://yourAPI.com', params=params) as response:
            data = await response.content.read()
            print(f"{routine}, done request: {time.time() - start_time} seconds")
            return data
    except asyncio.TimeoutError as e:
        print('timeout---------------------')
        errors.append(url)
    except aiohttp.ClientResponseError as e:
        print('request failed - Server Error')
        errors.append(url)
    except Exception as e:
        errors.append(url)

async def getData(session, q, test):
    while True:
        if not q.empty():
            url = q.get_nowait()
            resp = await send_request(session, url, test)
            if resp is not None:
                processData(resp, test, url)
        else:
            print(f'{test} queue empty')
            break

async def async_payload_wrapper():
    tasks = []
    q = asyncio.Queue()
    for url in urls:
        await q.put(url)

    async with ClientSession(connector=aiohttp.TCPConnector(limit=THREADS), timeout=ClientTimeout(total=61), raise_for_status=True) as session:
        for i in range(THREADS):
            await asyncio.sleep(1.1)
            tasks.append(
                asyncio.create_task(getData(session, q, ''.join(random.choice(string.ascii_lowercase) for i in range(10))))
            )
        await asyncio.gather(*tasks)

if __name__ == '__main__':
    start_time = time.time()
    asyncio.run(async_payload_wrapper())
