python coroutine, perform tasks periodically and cancel

Every 10 minutes, I want to do the following tasks:
- generate a list of image URLs to download
- (if the previous downloads are not finished, cancel them)
- download the images concurrently
I'm relatively new to coroutines. Can I structure the above with coroutines? I think a coroutine is essentially a sequential flow, so I'm having trouble thinking about it. Actually, come to think of it, would the following work?
async def generate_urls():
    await asyncio.sleep(10)
    result = _generate_urls()
    return result

async def download_image(url):
    # download images
    image = await _download_image()
    return image

async def main():
    while True:
        urls = await generate_urls()
        for url in urls:
            download_task = asyncio.create_task(download_image(url))
            await download_task

asyncio.run(main())

Your current code is quite close. Below are some modifications to make it align more closely with your original spec:
import asyncio

def generate_urls():
    return _generate_urls()  # no need to sleep in the URL generation function

async def download_image(url):
    image = await _download_image()
    return image

async def main():
    while True:
        # start this batch of downloads; create_task runs them concurrently
        tasks = [asyncio.create_task(download_image(url)) for url in generate_urls()]
        await asyncio.sleep(600)  # sleep 10 minutes after creating the tasks
        # after 10 minutes, check whether any of the downloads are still running
        for task in tasks:
            if not task.done():
                task.cancel()  # cancel the task if it is not complete

asyncio.run(main())
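One detail worth noting: task.cancel() only requests cancellation. If you later await a cancelled task, it raises asyncio.CancelledError, which you can catch. Here is a minimal, self-contained sketch of that (slow_download is a hypothetical stand-in, not part of the code above):
import asyncio

async def slow_download(url):
    await asyncio.sleep(60)  # stands in for a long-running download
    return url

async def main():
    task = asyncio.create_task(slow_download("http://example.com/a.jpg"))
    await asyncio.sleep(1)
    task.cancel()
    try:
        await task  # let the cancellation propagate
    except asyncio.CancelledError:
        print("download was cancelled")

asyncio.run(main())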

Cancelling asyncio task run in executor

I'm scraping some websites, parallelizing the requests library using asyncio:
def run():
    asyncio.run(scrape())

def check_link(link):
    # .... code code code ...
    response = requests.get(link)
    # .... code code code ...
    write_some_stats_into_db()

async def scrape():
    # .... code code code ...
    task = asyncio.get_event_loop().run_in_executor(check_link(link))
    # .... code code code ...
    if done:
        for task in all_tasks:
            task.cancel()
I only need to find one 'correct' link; after that, I can stop the program. However, because check_link is run in an executor, its threads are automatically daemonized, so even after calling task.cancel(), I have to wait for all of the other still-running check_link calls to complete.
Do you have any ideas on how to 'force-kill' the other checks still running in the thread executor?
You can do it the following way. Actually, from my point of view, if you do not have to use asyncio for this task, use plain threads without any async loop, since asyncio makes your code more complicated.
import asyncio
from random import randint
import time
from functools import partial

# imagine that this is the links array
LINKS = list(range(1000))
# how many thread workers you want to have running simultaneously
WORKERS_NUM = 10
# stops the app
STOP_EVENT = asyncio.Event()
STOP_EVENT.clear()

def check_link(link: int) -> int:
    """Checks a link in another thread and returns the result."""
    time.sleep(3)
    r = randint(1, 11)
    print(f"{link}____{r}\n")
    return r

async def check_link_wrapper(q: asyncio.Queue):
    """Async wrapper around the sync function."""
    loop = asyncio.get_event_loop()
    while not STOP_EVENT.is_set():
        link = await q.get()
        if link is None:  # "poison pill" received, stop this worker
            break
        value = await loop.run_in_executor(None, func=partial(check_link, link))
        if value == 10:
            STOP_EVENT.set()
            print("Hurray! We got TEN!")

async def feeder(q: asyncio.Queue):
    """Send tasks and a "poison pill" to all workers."""
    # send tasks to workers
    for link in LINKS:
        await q.put(link)
    # ask workers to stop
    for _ in range(WORKERS_NUM):
        await q.put(None)

async def amain():
    """Main async function of the app."""
    # maxsize is one since we want the app
    # to stop as fast as possible if the stop condition is met
    q = asyncio.Queue(maxsize=1)
    # we create a separate task, since we do not want to await the feeder;
    # we are interested only in the workers
    asyncio.create_task(feeder(q))
    await asyncio.gather(
        *[check_link_wrapper(q) for _ in range(WORKERS_NUM)],
    )

if __name__ == '__main__':
    asyncio.run(amain())
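For completeness, here is a minimal sketch of the threads-only alternative mentioned at the top of this answer (the names are illustrative, not taken from your code): a shared threading.Event lets the remaining workers return immediately once one of them finds the 'correct' result, so the pool drains quickly even though calls that are already running still finish.
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from random import randint

STOP = threading.Event()

def check_link(link):
    if STOP.is_set():      # once the answer is found, remaining calls return immediately
        return None
    time.sleep(1)          # stands in for requests.get(link)
    r = randint(1, 11)
    if r == 10:
        STOP.set()         # tell every other worker to stop doing real work
    return r

with ThreadPoolExecutor(max_workers=10) as pool:
    results = list(pool.map(check_link, range(100)))

print("found ten:", STOP.is_set())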

python3.6 async/await still works synchronously with fastAPI

I have a FastAPI app that posts two requests, one of them longer than the other (if it helps, they're Elasticsearch queries and I'm using the AsyncElasticsearch module, which already returns coroutines). This is my attempt:
class my_module:
    search_object = AsyncElasticsearch(url, port)

    async def do_things(self):
        resp1 = await search_object.search()  # the longer one
        print(check_resp1)
        resp2 = await search_object.search()  # the shorter one
        print(check_resp2)
        process(resp2)
        process(resp1)
        do_synchronous_things()
        return thing

app = FastAPI()

@app.post("/")
async def service(user_input):
    result = await my_module.do_things()
    return result
What I observed is that, instead of awaiting resp1, by the time it got to check_resp1 it was already a full response, as if I hadn't used async at all.
I'm new to Python async. I knew my code wouldn't work, but I don't know how to fix it. As far as I understand, when the interpreter sees await it starts the function and then just moves on, which in this case should immediately post the next request. How do I make it do that?
Yes, that's the expected behavior: the coroutine won't proceed until the results are ready. You can use asyncio.gather to run tasks concurrently:
import asyncio

async def task(msg):
    print(f"START {msg}")
    await asyncio.sleep(1)
    print(f"END {msg}")
    return msg

async def main():
    await task("1")
    await task("2")
    results = await asyncio.gather(task("3"), task("4"))
    print(results)

if __name__ == "__main__":
    asyncio.run(main())
Test:
$ python test.py
START 1
END 1
START 2
END 2
START 3
START 4
END 3
END 4
['3', '4']
Alternatively you can use asyncio.as_completed to get the earliest next result:
for coro in asyncio.as_completed((task("5"), task("6"))):
    earliest_result = await coro
    print(earliest_result)
Update Fri 2 Apr 09:25:33 UTC 2021:
asyncio.run is available since Python 3.7; in previous versions you have to create and run the loop manually:
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    loop.close()
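Applied to your do_things, the same idea would look roughly like the sketch below, with search_object.search() mocked by a sleep (assumption: the real call returns an awaitable, as you said AsyncElasticsearch does):
import asyncio

async def mock_search(label, delay):
    await asyncio.sleep(delay)  # stands in for search_object.search()
    return f"{label} response"

async def do_things():
    # both queries are in flight at the same time; gather preserves result order
    resp1, resp2 = await asyncio.gather(
        mock_search("long query", 2),   # the longer one
        mock_search("short query", 1),  # the shorter one
    )
    print(resp1, resp2)

asyncio.run(do_things())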
Explanation
The reason your code runs synchronously is that in the do_things function the code is executed as follows:
1. Schedule search_object.search() to execute
2. Wait till search_object.search() is finished and get the result
3. Schedule search_object.search() to execute
4. Wait till search_object.search() is finished and get the result
5. Execute (synchronously) process(resp2)
6. Execute (synchronously) process(resp1)
7. Execute (synchronously) do_synchronous_things()
What you intended is to make steps 1 and 3 execute before 2 and 4. You can do that easily with the unsync library - here is the documentation.
How you can fix this
from unsync import unsync

class my_module:
    search_object = AsyncElasticsearch(url, port)

    @unsync
    async def search1():
        return await search_object.search()

    @unsync
    async def search2():  # not sure if this is any different to search1
        return await search_object.search()

    async def do_things(self):
        task1, task2 = self.search1(), self.search2()  # schedule tasks
        resp1, resp2 = task1.result(), task2.result()  # wait till tasks are executed
        # you might also do a similar trick with the process function to run process(resp2) and process(resp1) concurrently
        process(resp2)
        process(resp1)
        do_synchronous_things()  # if this does not rely on resp1 and resp2, it might also be put into a separate task to make the computation quicker; to do this, use the @unsync(cpu_bound=True) decorator
        return thing

app = FastAPI()

@app.post("/")
async def service(user_input):
    result = await my_module.do_things()
    return result
More information
If you want to learn more about asyncio and asynchronous programming, I recommend this tutorial. There is also a similar case to the one you presented, with a few possible solutions to make the coroutines run concurrently.
PS. Obviously I could not run this code, so you must debug it on your own.

aiohttp download large list of pdf files

I am trying to download a large number of PDF files asynchronously; python requests does not work well with async functionality.
But I am finding aiohttp hard to implement for PDF downloads, and I can't find a thread on this specific task that is easy to understand for someone new to the Python async world.
Yeah, it can be done with ThreadPoolExecutor, but in this case it is better to keep it in one thread.
This code works, but I need to do it with 100 or so URLs asynchronously:
import aiohttp
import aiofiles

async with aiohttp.ClientSession() as session:
    url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
    async with session.get(url) as resp:
        if resp.status == 200:
            f = await aiofiles.open('download_pdf.pdf', mode='wb')
            await f.write(await resp.read())
            await f.close()
Thanks in advance.
You could try something like this. For the sake of simplicity, the same dummy PDF will be downloaded multiple times to disk with different file names:
from asyncio import Semaphore, gather, run, wait_for
from random import randint

import aiofiles
from aiohttp.client import ClientSession

# Mock a list of different pdfs to download
pdf_list = [
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
]

MAX_TASKS = 5
MAX_TIME = 5

async def download(pdf_list):
    tasks = []
    sem = Semaphore(MAX_TASKS)

    async with ClientSession() as sess:
        for pdf_url in pdf_list:
            # Mock a different file name each iteration
            dest_file = str(randint(1, 100000)) + ".pdf"
            tasks.append(
                # Wait max 5 seconds for each download
                wait_for(
                    download_one(pdf_url, sess, sem, dest_file),
                    timeout=MAX_TIME,
                )
            )
        return await gather(*tasks)

async def download_one(url, sess, sem, dest_file):
    async with sem:
        print(f"Downloading {url}")
        async with sess.get(url) as res:
            content = await res.read()

        # Check everything went well
        if res.status != 200:
            print(f"Download failed: {res.status}")
            return

        async with aiofiles.open(dest_file, "+wb") as f:
            await f.write(content)
            # No need to use close(f) when using with statement

if __name__ == "__main__":
    run(download(pdf_list))
Keep in mind that firing multiple concurrent requests at a server might get your IP banned for a period of time. In that case, consider adding a sleep call (which somewhat defeats the purpose of using aiohttp) or switching to a classic sequential script. In order to keep things concurrent but kinder to the server, the script fires at most 5 requests at any given time (MAX_TASKS).
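If you do decide to add a sleep call, one light way is to pause inside the semaphore before each request; the sketch below uses a hypothetical polite_fetch in place of the real download_one:
import asyncio

async def polite_fetch(url, sem, delay=0.5):
    async with sem:
        await asyncio.sleep(delay)  # spread the requests out in time
        print(f"fetching {url}")    # replace with the real aiohttp GET and file write

async def main():
    sem = asyncio.Semaphore(5)
    urls = [f"https://example.com/{i}.pdf" for i in range(10)]
    await asyncio.gather(*(polite_fetch(url, sem) for url in urls))

asyncio.run(main())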

Use python's asyncio to make an API request and process the result asynchronously

I need to make an API request for several pieces of data, and then process each result. The request is paginated, so I'm currently doing
def get_results():
    while True:
        response = api(num_results=5)
        if response is None:  # No more results
            break
        yield response

def process_data():
    for page in get_results():
        for result in page:
            do_stuff(result)

process_data()
I'm hoping to use asyncio to retrieve the next page of results from the API while I'm processing the current one, instead of waiting for results, processing them, then waiting again. I've modified the code to
import asyncio

async def get_results():
    while True:
        response = api(num_results=5)
        if response is None:  # No more results
            break
        yield response

async def process_data():
    async for page in get_results():
        for result in page:
            do_stuff(result)

asyncio.run(process_data())
I'm not sure if this is doing what I intend it to. Is this the right way to make processing the current page of API results and getting the next page of results asynchronous?
Maybe you can use asyncio.Queue to refactor your code into the producer/consumer pattern:
import asyncio
import random

q = asyncio.Queue()

async def api(num_results):
    # you could use aiohttp to fetch the api
    # fake content
    await asyncio.sleep(1)
    fake_response = random.random()
    if fake_response < 0.1:
        return None
    return fake_response

async def get_results(q):
    while True:
        response = await api(num_results=5)
        if response is None:
            # indicate producer done
            print('Producer Done')
            await q.put(None)
            break
        print('Producer: ', response)
        await q.put(response)

async def process_data():
    while True:
        data = await q.get()
        if not data:
            print('Consumer Done')
            break
        # process data however you want; if it is cpu intensive, you can call loop.run_in_executor
        # fake the processing taking a little time
        await asyncio.sleep(3)
        print('Consume', data)

loop = asyncio.get_event_loop()
loop.create_task(get_results(q))
loop.run_until_complete(process_data())
Coming back to the question:
Is this the right way to make processing the current page of API results and getting the next page of results asynchronous?
It's not the right way, because get_results() only advances to the next page after each do_stuff(result) is done.
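As the comment in the consumer hints, if api() itself is a blocking (synchronous) call, you can push it into the default thread pool with loop.run_in_executor so the event loop stays free; here is a minimal sketch with a hypothetical blocking api:
import asyncio
import time

def api(num_results):  # hypothetical blocking call (e.g. plain requests)
    time.sleep(1)
    return list(range(num_results))

async def get_page(num_results=5):
    loop = asyncio.get_running_loop()
    # run the blocking call in the default ThreadPoolExecutor
    return await loop.run_in_executor(None, api, num_results)

async def main():
    page = await get_page()
    print(page)

asyncio.run(main())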

aiohttp ClientSession.get() method failing silently - Python3.7

I'm making a small application that attempts to find company website URLs by searching for their names via Bing. It takes in a big list of company names, uses the Bing Search API to obtain the 1st URL, & saves those URLs back in the list.
I'm having a problem with aiohttp's ClientSession.get() method, specifically, it fails silently & I can't figure out why.
Here's how I'm initializing the script. Keep an eye out for worker.perform_mission():
async def _execute(workers, *, loop=None):
    if not loop:
        loop = asyncio.get_event_loop()
    [asyncio.ensure_future(i.perform_mission(verbose=True), loop=loop) for i in workers]

def main():
    filepth = 'c:\\SOME\\FILE\\PATH.xlsx'
    cache = pd.read_excel(filepth)

    # CHANGE THE NUMBER IN range(<here>) TO ADD MORE WORKERS.
    workers = (Worker(cache) for i in range(1))

    loop = asyncio.get_event_loop()
    loop.run_until_complete(_execute(workers, loop=loop))
    ...<MORE STUFF>...
The worker.perform_mission() method does the following (scroll to the bottom and look at _split_up_request_like_they_do_in_the_docs()):
class Worker(object):
    def __init__(self, shared_cache):
        ...<MORE STUFF>...

    async def perform_mission(self, verbose=False):
        while not self.mission_complete:
            if not self.company_name:
                await self.find_company_name()
                if verbose:
                    print('Obtained Company Name')

            if self.company_name and not self.website:
                print('Company Name populated but no website found yet.')
                data = await self.call_bing()  # <<<<< THIS IS SILENTLY FAILING.
                if self.website and ok_to_set_website(self.shared_cache, self):
                    await self.try_set_results(data)
                    self.mission_complete = True
                else:
                    print('{} worker failed at setting website.'.format(self.company_name))
                    pass
            else:
                print('{} worker failed at obtaining data from Bing.'.format(self.company_name))
                pass

    async def call_bing(self):
        async with aiohttp.ClientSession() as sesh:
            sesh.headers = self.headers
            sesh.params = self.params
            return await self._split_up_request_like_they_do_in_the_docs(sesh)

    async def _split_up_request_like_they_do_in_the_docs(self, session):
        print('_bing_request() successfully called.')  # <<< THIS CATCHES
        async with session.get(self.search_url) as resp:
            print('Session.get() successfully called.')  # <<< THIS DOES NOT.
            return await resp.json()
And finally my output is:
Obtained Company Name
Company Name populated but no website found yet.
_bing_request() successfully called.
Process finished with exit code 0
Can anyone help me figure out why print('Session.get() successfully called.') isn't triggering? ...or maybe help me ask this question better?
Take a look at this part:
async def _execute(workers, *, loop=None):
    # ...
    [asyncio.ensure_future(i.perform_mission(verbose=True), loop=loop) for i in workers]
You create a bunch of tasks, but you never await them. That means _execute itself will be done right after the tasks are created, long before those tasks are finished. And since you run the event loop only until _execute is done, it will stop shortly after it starts.
To fix this, use asyncio.gather to wait for multiple awaitables to finish:
async def _execute(workers, *, loop=None):
    # ...
    tasks = [asyncio.ensure_future(i.perform_mission(verbose=True), loop=loop) for i in workers]
    await asyncio.gather(*tasks)
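Here is a self-contained illustration of the difference (fake_mission stands in for perform_mission, it is not your code): without the gather the program prints nothing, because the loop shuts down as soon as _execute returns; with it, every worker finishes.
import asyncio

async def fake_mission(i):
    await asyncio.sleep(0.1)      # stands in for the aiohttp request
    print(f"worker {i} finished")

async def _execute_fixed(n):
    tasks = [asyncio.ensure_future(fake_mission(i)) for i in range(n)]
    await asyncio.gather(*tasks)  # keep _execute alive until all workers finish

asyncio.run(_execute_fixed(3))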
