aiohttp ClientSession.get() method failing silently - Python 3.7

I'm making a small application that attempts to find company website URLs by searching for their names via Bing. It takes in a big list of company names, uses the Bing Search API to obtain the first URL, and saves those URLs back into the list.
I'm having a problem with aiohttp's ClientSession.get() method: it fails silently and I can't figure out why.
Here's how I'm initializing the script. Keep an eye out for worker.perform_mission():
async def _execute(workers, *, loop=None):
    if not loop:
        loop = asyncio.get_event_loop()
    [asyncio.ensure_future(i.perform_mission(verbose=True), loop=loop) for i in workers]

def main():
    filepth = 'c:\\SOME\\FILE\\PATH.xlsx'
    cache = pd.read_excel(filepth)

    # CHANGE THE NUMBER IN range(<here>) TO ADD MORE WORKERS.
    workers = (Worker(cache) for i in range(1))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(_execute(workers, loop=loop))
    ...<MORE STUFF>...
The worker.perform_mission() method does the following (scroll to the bottom and look at _split_up_request_like_they_do_in_the_docs()):
class Worker(object):
    def __init__(self, shared_cache):
        ...<MORE STUFF>...

    async def perform_mission(self, verbose=False):
        while not self.mission_complete:
            if not self.company_name:
                await self.find_company_name()
                if verbose:
                    print('Obtained Company Name')
            if self.company_name and not self.website:
                print('Company Name populated but no website found yet.')
                data = await self.call_bing()  #<<<<< THIS IS SILENTLY FAILING.
                if self.website and ok_to_set_website(self.shared_cache, self):
                    await self.try_set_results(data)
                    self.mission_complete = True
                else:
                    print('{} worker failed at setting website.'.format(self.company_name))
                    pass
            else:
                print('{} worker failed at obtaining data from Bing.'.format(self.company_name))
                pass

    async def call_bing(self):
        async with aiohttp.ClientSession() as sesh:
            sesh.headers = self.headers
            sesh.params = self.params
            return await self._split_up_request_like_they_do_in_the_docs(sesh)

    async def _split_up_request_like_they_do_in_the_docs(self, session):
        print('_bing_request() successfully called.')  #<<<THIS CATCHES
        async with session.get(self.search_url) as resp:
            print('Session.get() successfully called.')  #<<<THIS DOES NOT.
            return await resp.json()
And finally my output is:
Obtained Company Name
Company Name populated but no website found yet.
_bing_request() successfully called.
Process finished with exit code 0
Can anyone help me figure out why print('Session.get() successfully called.') isn't triggering? Or maybe help me ask this question better?

Take a look at this part:
async def _execute(workers, *, loop=None):
    # ...
    [asyncio.ensure_future(i.perform_mission(verbose=True), loop=loop) for i in workers]
You create a bunch of tasks, but you never await them. That means _execute itself finishes right after the tasks are created, long before those tasks are done. And since you run the event loop only until _execute is done, it stops shortly after starting.
To fix this, use asyncio.gather to wait until multiple awaitables are finished:
async def _execute(workers, *, loop=None):
    # ...
    tasks = [asyncio.ensure_future(i.perform_mission(verbose=True), loop=loop) for i in workers]
    await asyncio.gather(*tasks)
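To see the difference in isolation, here is a minimal, self-contained sketch (the names are hypothetical, not from the question) contrasting fire-and-forget ensure_future with awaiting asyncio.gather:

import asyncio

async def worker(n):
    await asyncio.sleep(0.1)  # stands in for an aiohttp request
    print(f'worker {n} finished')

async def fire_and_forget():
    # tasks are created but never awaited, so this coroutine returns immediately
    [asyncio.ensure_future(worker(n)) for n in range(3)]

async def gathered():
    tasks = [asyncio.ensure_future(worker(n)) for n in range(3)]
    await asyncio.gather(*tasks)  # returns only after every worker has finished

# Prints nothing: the loop stops as soon as fire_and_forget() returns,
# and the still-pending workers are cancelled.
asyncio.run(fire_and_forget())

# Prints 'worker 0/1/2 finished': gather keeps the loop alive until all are done.
asyncio.run(gathered())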

Related

Alternative to asyncio.gather which I can keep adding coroutines to at runtime?

I need to be able to keep adding coroutines to the asyncio loop at runtime. I tried using create_task() thinking that this would do what I want, but it still needs to be awaited.
This is the code I had, not sure if there is a simple edit to make it work?
async def get_value_from_api():
    global ASYNC_CLIENT
    return ASYNC_CLIENT.get(api_address)

async def print_subs():
    count = await get_value_from_api()
    print(count)

async def save_subs_loop():
    while True:
        asyncio.create_task(print_subs())
        time.sleep(0.1)

async def start():
    global ASYNC_CLIENT
    async with httpx.AsyncClient() as ASYNC_CLIENT:
        await save_subs_loop()

asyncio.run(start())
I once created a similar pattern when I was mixing trio and kivy, as a demonstration of running multiple coroutines asynchronously.
It used a trio.MemoryChannel, which is roughly equivalent to asyncio.Queue; I'll just refer to it as a queue here.
The main idea is:
Wrap each task in a class that has a run method.
Have the object's own async method put the object itself back into the queue when its execution is done.
Create a global task-spawning loop that waits for objects in the queue and schedules execution (creates a task) for each object.
import asyncio
import traceback

import httpx


async def task_1(client: httpx.AsyncClient):
    resp = await client.get("http://127.0.0.1:5000/")
    print(resp.read())
    await asyncio.sleep(0.1)  # without this would be IP ban

async def task_2(client: httpx.AsyncClient):
    resp = await client.get("http://127.0.0.1:5000/meow/")
    print(resp.read())
    await asyncio.sleep(0.5)


class CoroutineWrapper:
    def __init__(self, queue: asyncio.Queue, coro_func, *param):
        self.func = coro_func
        self.param = param
        self.queue = queue

    async def run(self):
        try:
            await self.func(*self.param)
        except Exception:
            traceback.print_exc()
            return

        # put itself back into queue
        await self.queue.put(self)


class KeepRunning:
    def __init__(self):
        # queue for gathering CoroutineWrapper
        self.queue = asyncio.Queue()

    def add_task(self, coro, *param):
        wrapped = CoroutineWrapper(self.queue, coro, *param)

        # add tasks to be executed in queue
        self.queue.put_nowait(wrapped)

    async def task_processor(self):
        task: CoroutineWrapper
        while task := await self.queue.get():
            # wait for a new CoroutineWrapper object, then schedule its async method execution
            asyncio.create_task(task.run())


async def main():
    keep_running = KeepRunning()

    async with httpx.AsyncClient() as client:
        keep_running.add_task(task_1, client)
        keep_running.add_task(task_2, client)

        await keep_running.task_processor()


asyncio.run(main())
Server:
import time
from flask import Flask

app = Flask(__name__)

@app.route("/")
def hello():
    return str(time.time())

@app.route("/meow/")
def meow():
    return "meow"

app.run()
Output:
b'meow'
b'1639920445.965701'
b'1639920446.0767004'
b'1639920446.1887035'
b'1639920446.2986999'
b'1639920446.4067013'
b'meow'
b'1639920446.516704'
b'1639920446.6267014'
...
You can see the tasks running repeatedly, each at its own pace.
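Since the whole point is adding coroutines at runtime, note that you can call keep_running.add_task() from any other coroutine while task_processor() is running. A small hypothetical addition to the main() above:

async def add_more_later(keep_running: KeepRunning, client: httpx.AsyncClient):
    # made-up example: queue one extra task_1 run five seconds in
    await asyncio.sleep(5)
    keep_running.add_task(task_1, client)

async def main():
    keep_running = KeepRunning()

    async with httpx.AsyncClient() as client:
        keep_running.add_task(task_1, client)
        keep_running.add_task(task_2, client)
        asyncio.create_task(add_more_later(keep_running, client))

        await keep_running.task_processor()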
Old answer
It seems like you only want to cycle through a fixed set of tasks.
In that case, just iterate over a list of coroutines with itertools.cycle.
But this is no different from synchronous code, so let me know if what you need is asynchronous.
import asyncio
import itertools

import httpx


async def main_task(client: httpx.AsyncClient):
    resp = await client.get("http://127.0.0.1:5000/")
    print(resp.read())
    await asyncio.sleep(0.1)  # without this would be IP ban


async def main():
    async with httpx.AsyncClient() as client:
        for coroutine in itertools.cycle([main_task]):
            await coroutine(client)


asyncio.run(main())
Server:
import time
from flask import Flask

app = Flask(__name__)

@app.route("/")
def hello():
    return str(time.time())

app.run()
Output:
b'1639918937.7694323'
b'1639918937.8804302'
b'1639918937.9914327'
b'1639918938.1014295'
b'1639918938.2124324'
b'1639918938.3204308'
...
asyncio.create_task() works as you describe. The problem is that you create an infinite loop here:
async def save_subs_loop():
    while True:
        asyncio.create_task(print_subs())
        time.sleep(0.1)  # do not use time.sleep() in async code EVER
save_subs_loop() keeps creating tasks, but control is never yielded back to the event loop, because there is no await in there. Try:
async def save_subs_loop():
    while True:
        asyncio.create_task(print_subs())
        await asyncio.sleep(0.1)  # yield control back to the loop to give tasks a chance to actually run
This problem is so common I'm thinking python should raise a RuntimeError if it detects time.sleep() within a coroutine :-)
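For what it's worth, asyncio's debug mode can help catch this kind of mistake: when enabled, it logs a warning whenever a callback or task step blocks the event loop for longer than 100 ms (the threshold is loop.slow_callback_duration). A rough sketch:

import asyncio
import time

async def blocks_the_loop():
    time.sleep(0.5)  # stands in for any accidental blocking call

# With debug=True, asyncio logs something like
# "Executing <Task ...> took 0.500 seconds" to flag the blocking step.
asyncio.run(blocks_the_loop(), debug=True)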
You might want to try the TaskThread framework
It allows you to add tasks at runtime
Tasks are re-scheduled periodically (like in your while loop up there)
There is a consumer / producer framework built in (parent/child relationships) which you seem to need
disclaimer: I wrote TaskThread out of necessity & it's been a life saver.

Cancelling asyncio task run in executor

I'm scraping some websites, parallelizing the requests library using asyncio:
def run():
    asyncio.run(scrape());

def check_link(link):
    #.... code code code ...
    response = requests.get(link)
    #.... code code code ...
    write_some_stats_into_db()

async def scrape():
    #.... code code code ...
    task = asyncio.get_event_loop().run_in_executor(check_link(link));
    #.... code code code ...
    if done:
        for task in all_tasks:
            task.cancel();
I only need to find one 'correct' link, and after that I can stop the program. However, because check_link is run in an executor, its threads are automatically daemonized, so even after calling task.cancel() I have to wait for all of the other still-running check_link calls to complete.
Do you have any ideas how to 'force-kill' the other running checks in the thread executor?
You can do it the following way. Actually, from my point of view, if you do not have to use asyncio for this task, use plain threads without any async loop, since asyncio makes your code more complicated.
import asyncio
from random import randint
import time
from functools import partial

# imagine that this is the links array
LINKS = list(range(1000))

# how many thread-workers you want to have simultaneously
WORKERS_NUM = 10

# stops the app
STOP_EVENT = asyncio.Event()
STOP_EVENT.clear()


def check_link(link: str) -> int:
    """checks link in another thread and returns result"""
    time.sleep(3)
    r = randint(1, 11)
    print(f"{link}____{r}\n")
    return r


async def check_link_wrapper(q: asyncio.Queue):
    """Async wrapper around sync function"""
    loop = asyncio.get_event_loop()
    while not STOP_EVENT.is_set():
        link = await q.get()
        if not link:
            break
        value = await loop.run_in_executor(None, func=partial(check_link, link))
        if value == 10:
            STOP_EVENT.set()
            print("Hurray! We got TEN !")


async def feeder(q: asyncio.Queue):
    """Send tasks and "poison pill" to all workers"""
    # send tasks to workers
    for link in LINKS:
        await q.put(link)

    # ask workers to stop
    for _ in range(WORKERS_NUM):
        await q.put(None)


async def amain():
    """Main async function of the app"""
    # maxsize is one since we want the app
    # to stop as fast as possible if the stop condition is met
    q = asyncio.Queue(maxsize=1)

    # we create a separate task, since we do not want to await feeder;
    # we are interested only in workers
    asyncio.create_task(feeder(q))

    await asyncio.gather(
        *[check_link_wrapper(q) for _ in range(WORKERS_NUM)],
    )


if __name__ == '__main__':
    asyncio.run(amain())
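If you do drop asyncio entirely, as suggested above, a rough threads-only sketch of the same idea could look like this (check_link and LINKS are the same placeholders as in the code above; like the asyncio version, it cannot interrupt a check_link call that is already in flight, it only prevents new ones from starting):

import time
from concurrent.futures import ThreadPoolExecutor
from random import randint
from threading import Event

LINKS = list(range(1000))
WORKERS_NUM = 10
STOP_EVENT = Event()

def check_link(link):
    time.sleep(3)  # stands in for requests.get(link)
    return randint(1, 11)

def worker(link):
    # skip the remaining links once a hit has been found
    if STOP_EVENT.is_set():
        return
    if check_link(link) == 10:
        STOP_EVENT.set()
        print(f"Hurray! We got TEN from {link}!")

with ThreadPoolExecutor(max_workers=WORKERS_NUM) as pool:
    pool.map(worker, LINKS)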

Why doesn't this asyncio semaphore implementation work with aiohttp in python

I first make a simple request to get a JSON containing all the names, then I iterate over all the names and make asynchronous awaitable calls corresponding to each name, and store them in a list called "tasks", and then I gather all of them.
The problem is, the response server has a limit to the api responses per minute, and no matter how low I keep the semaphore value, this code takes the same time (small enough to not meet the server's expectations) to make the API calls, as if the semaphore doesn't exist at all. How do I control the API call rate?
<some code>

url = "http://example.com/"
response = requests.request("GET", url, headers=headers)

async def get_api(session, url_dev):
    async with session.get(url_dev, headers=headers) as resp:
        result = await resp.json()
        return result

async def main():
    async with aiohttp.ClientSession() as session:
        sem = asyncio.Semaphore(1)
        tasks = []
        for i in response.json()["Names"]:
            url_dev = "https://example.com/example/" + str(i["Id"])
            await sem.acquire()
            async with sem:
                tasks.append(asyncio.create_task(get_api(session, url_dev)))
        full_list = list()
        async with sem:
            full_list = await asyncio.gather(*tasks)

asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(main())
A semaphore here really isn't the right tool to manage rate limiting, unless you are going to increment the semaphore in a separate loop or add a sleep inside the critical section. You could also schedule a follow-up task to sleep and then release the semaphore.
Further, you've queued all of the tasks inside the critical section, but the execution happens asynchronously to the critical section because you queued each call as a task. You need to have the semaphore inside the get_api method.
Also, you're acquiring the semaphore twice; either use the acquire method with try/finally, or use async with, but not both. See the docs.
Here is a simple script to illustrate how you can have a task loop that does not exceed starting more than 5 tasks per 5 second interval:
async def dequeue(sem, sleep):
    """Wait for a duration and then increment the semaphore"""
    try:
        await asyncio.sleep(sleep)
    finally:
        sem.release()


async def task(sem, sleep, data):
    """Decrement the semaphore, schedule an increment, and then work"""
    await sem.acquire()
    asyncio.create_task(dequeue(sem, sleep))
    # logic here
    print(data)


async def main():
    max_concurrent = 5
    sleep = 5
    sem = asyncio.Semaphore(max_concurrent)
    tasks = [asyncio.create_task(task(sem, sleep, i)) for i in range(15)]
    await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())
You could also wrap this logic in a decorator if you want to get really fancy:
def rate_limited(max_concurrent, duration):
    def decorator(func):
        semaphore = asyncio.Semaphore(max_concurrent)

        async def dequeue():
            try:
                await asyncio.sleep(duration)
            finally:
                semaphore.release()

        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            await semaphore.acquire()
            asyncio.create_task(dequeue())
            return await func(*args, **kwargs)

        return wrapper

    return decorator
Then the code becomes the following (note the semaphore was created outside of asyncio.run, so you need to query the default loop for it to work properly):
@rate_limited(max_concurrent=5, duration=5)
async def task(i):
    print(i)


async def main():
    tasks = [asyncio.create_task(task(i)) for i in range(7)]
    await asyncio.gather(*tasks)


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
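If you would rather keep asyncio.run, one workaround (just a sketch, not part of the snippet above) is to create the semaphore lazily inside the wrapper, so it is bound to whichever loop is running when the decorated function is first called:

import asyncio
import functools

def rate_limited(max_concurrent, duration):
    def decorator(func):
        semaphore = None  # created lazily, inside the running loop

        async def dequeue(sem):
            try:
                await asyncio.sleep(duration)
            finally:
                sem.release()

        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            nonlocal semaphore
            if semaphore is None:
                semaphore = asyncio.Semaphore(max_concurrent)
            await semaphore.acquire()
            asyncio.create_task(dequeue(semaphore))
            return await func(*args, **kwargs)

        return wrapper
    return decorator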
You should acquire and release the semaphore object when you run the request to the API endpoint in get_api, instead of when you create the tasks and gather the results. Also, based on your sample use case, there should be no need to manually call sem.acquire and sem.release when you use its context manager instead:
async def get_api(session, sem: asyncio.Semaphore, url_dev):
    # below, using both the semaphore and session.get in a context manager
    # now, the semaphore will properly block requests when the limit has been reached, until others have finished
    async with sem, session.get(url_dev, headers=headers) as resp:
        result = await resp.json()
        return result


async def main():
    sem = asyncio.Semaphore(1)
    async with aiohttp.ClientSession() as session:
        tasks = []
        for i in response.json()["Names"]:
            url_dev = "https://example.com/example/" + str(i["Id"])
            # passing the semaphore instance to get_api
            tasks.append(asyncio.create_task(get_api(session, sem, url_dev)))
        full_list = await asyncio.gather(*tasks)

Retrieving data from python's coroutine object

I am trying to learn async, and now I am trying to get whois information for a batch of domains. I found the lib aiowhois, but there are only a few scraps of information, not enough for such a newbie as I am.
This code works without errors, but I don't know how to print the data from the parsed_whois variable, which is a coroutine object.
resolv = aiowhois.Whois(timeout=10)

async def coro(url, sem):
    parsed_whois = await resolv.query(url)

async def main():
    tasks = []
    sem = asyncio.Semaphore(4)
    for url in domains:
        task = asyncio.Task(coro(url, sem))
        tasks.append(task)
    await asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
You can avoid using tasks. Just apply gather to the coroutine directly.
In case you are confused about the difference, this SO QA might help you (especially the second answer).
You can have each coroutine return its result, without resorting to global variables:
async def coro(url):
    return await resolv.query(url)

async def main():
    domains = ...
    ops = [coro(url) for url in domains]
    rets = await asyncio.gather(*ops)
    print(rets)
Please see the official docs to learn more about how to use gather or wait, or even more options.
Note: if you are using the latest python versions, you can also simplify the loop running with just
asyncio.run(main())
Note 2: I have removed the semaphore from my code, as it's unclear why you need it and where.
all_parsed_whois = []  # make a global

async def coro(url, sem):
    all_parsed_whois.append(await resolv.query(url))
If you want the data as soon as it is available, you could use task.add_done_callback():
python asyncio add_done_callback with async def
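A rough sketch of that callback approach (reusing resolv and domains from the question; the callback name here is made up):

all_parsed_whois = []

def on_done(task):
    # called by the event loop as soon as this particular query finishes
    all_parsed_whois.append(task.result())

async def main():
    tasks = []
    for url in domains:
        task = asyncio.ensure_future(resolv.query(url))
        task.add_done_callback(on_done)
        tasks.append(task)
    await asyncio.gather(*tasks)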

How to initiate next request before yielding in asynchronous generator in python

I'm attempting to get some data from a paginated API (specifically github's, but the API doesn't matter for this question). I'm using a python asynchronous generator to yield each individual row from each page. The code looks something like this:
async def get_data():
    cursor = None
    async with aiohttp.ClientSession() as session:
        while True:
            async with session.get(build_url(cursor)) as response:
                data = await response.json()
            for row in get_rows(data):
                yield row
            if not has_next_page(data):
                return
            cursor = get_next_cursor(data)
So, this basically works. However, one of the minor flaws is that it doesn't initiate the next request until after all the rows have been yielded from the current page. Is there a good way to initiate that processing inside of this loop, before starting to yield? In particular, I want to make sure that the async with is still evaluated correctly when doing asyncio.ensure_future, which is the API for initiating background work.
You'll need at least one extra coroutine to achieve that, and bridge the two with an asyncio.Queue:
async def get_data():
    queue = asyncio.Queue()

    async def fetch_all_pages():
        cursor = None
        async with aiohttp.ClientSession() as session:
            while True:
                async with session.get(build_url(cursor)) as response:
                    data = await response.json()
                await queue.put(data)
                if not has_next_page(data):
                    # signal the peer to exit
                    await queue.put(None)
                    break
                cursor = get_next_cursor(data)

    asyncio.ensure_future(fetch_all_pages())

    while True:
        data = await queue.get()
        if not data:
            break
        for row in get_rows(data):
            yield row
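For completeness, a hypothetical caller consumes this generator with async for; rows from the current page are yielded while the next page is already being fetched in the background:

async def consume():
    async for row in get_data():
        handle_row(row)  # handle_row() is a made-up placeholder

asyncio.run(consume())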
