Asynchronous Download of Files - python

This downloads updated FASTA files (protein sequences) from a database. I've gotten it to work faster using asyncio compared to requests, but I'm not convinced the downloads are actually happening asynchronously.
import os
import aiohttp
import aiofiles
import asyncio

folder = '~/base/fastas/proteomes/'

upos = {'UP000005640': 'Human_Homo_sapien',
        'UP000002254': 'Dog_Boxer_Canis_Lupus_familiaris',
        'UP000002311': 'Yeast_Saccharomyces_cerevisiae',
        'UP000000589': 'Mouse_Mus_musculus',
        'UP000006718': 'Monkey_Rhesus_macaque_Macaca_mulatta',
        'UP000009130': 'Monkey_Cynomolgus_Macaca_fascicularis',
        'UP000002494': 'Rat_Rattus_norvegicus',
        'UP000000625': 'Escherichia_coli',
        }
#https://www.uniprot.org/uniprot/?query=proteome:UP000005640&format=fasta Example link
startline = r'https://www.uniprot.org/uniprot/?query=proteome:'
endline = r'&format=fasta&include=False' #include is true to include isoforms, make false for only canonical sequences
async def fetch(session, link, folderlocation, name):
    async with session.get(link, timeout=0) as response:
        try:
            file = await aiofiles.open(folderlocation, mode='w')
            await file.write(await response.text())
            await file.close()
            print(name, 'ended')
        except FileNotFoundError:
            loc = ''.join((r'/'.join((folderlocation.split('/')[:-1])), '/'))
            command = ' '.join(('mkdir -p', loc))
            os.system(command)
            file = await aiofiles.open(folderlocation, mode='w')
            await file.write(await response.text())
            await file.close()
            print(name, 'ended')
async def rfunc():
    async with aiohttp.ClientSession() as session:
        for upo, name in upos.items():
            print(name, 'started')
            link = ''.join((startline, upo, endline))
            folderlocation = ''.join((folder, name, '.fasta'))
            await fetch(session, link, folderlocation, name)

loop = asyncio.get_event_loop()
loop.run_until_complete(rfunc())
My output from running this:
In [5]: runfile('~/base/Fasta Proteome Updater.py')
Human_Homo_sapien started
Human_Homo_sapien ended
Dog_Boxer_Canis_Lupus_familiaris started
Dog_Boxer_Canis_Lupus_familiaris ended
Yeast_Saccharomyces_cerevisiae started
Yeast_Saccharomyces_cerevisiae ended
Mouse_Mus_musculus started
Mouse_Mus_musculus ended
Monkey_Rhesus_macaque_Macaca_mulatta started
Monkey_Rhesus_macaque_Macaca_mulatta ended
Monkey_Cynomolgus_Macaca_fascicularis started
Monkey_Cynomolgus_Macaca_fascicularis ended
Rat_Rattus_norvegicus started
Rat_Rattus_norvegicus ended
Escherichia_coli started
Escherichia_coli ended
The printed output seems to indicate the downloads are happening one at a time. Is there something wrong here?

You are looping over the items to download and waiting (await) for each one to finish before starting the next. To make them all happen at the same time, you need to schedule all downloads for execution at once, e.g. using asyncio.gather.
Then your code could look like this:
async def rfunc():
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(
            *[
                fetch(
                    session,
                    ''.join((startline, upo, endline)),
                    ''.join((folder, name, '.fasta')),
                    name,
                ) for upo, name in upos.items()
            ]
        )

loop = asyncio.get_event_loop()
loop.run_until_complete(rfunc())
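As a follow-up, if the dictionary ever grows to many more proteomes, you may want to cap how many downloads run at once. Below is a minimal sketch, not part of the original answer, that wraps the existing fetch in an asyncio.Semaphore; the limit of 4 is an arbitrary assumption:
    sem = asyncio.Semaphore(4)  # arbitrary cap on simultaneous downloads

    async def bounded_fetch(session, link, folderlocation, name):
        # at most 4 fetch calls run at the same time; the rest wait here
        async with sem:
            await fetch(session, link, folderlocation, name)

    async def rfunc():
        async with aiohttp.ClientSession() as session:
            await asyncio.gather(
                *[
                    bounded_fetch(
                        session,
                        ''.join((startline, upo, endline)),
                        ''.join((folder, name, '.fasta')),
                        name,
                    ) for upo, name in upos.items()
                ]
            )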


asyncio run_until_complete does not wait that all coroutines finish

I am taking my first steps in Python and I'm struggling to understand why I don't get the expected result with this one. Here is what I am trying to achieve:
I have a function that consumes an API. While waiting for the API to answer, and given that I am going through a proxy that adds lag, I thought that sending concurrent requests would speed up the process (I run 100 concurrent requests). It does. But asyncio's run_until_complete always returns some unfinished coroutines.
Here is the code (simplified):
import aiohttp
import asyncio

async def consume_api(parameter):
    url = "someurl"  # it is actually based on the parameter
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, proxy="someproxy") as asyncresponse:
                r = await asyncresponse.read()
    except:
        global error_count
        error_count += 1
        if error_count > 50:
            return "Exceeded 50 try on same request"
        else:
            return consume_api(parameter)
    return r.decode("utf-8")

def loop_on_api(list_of_parameter):
    loop = asyncio.get_event_loop()
    coroutines = [consume_api(list_of_parameter[i]) for i in range(len(list_of_parameter))]
    results = loop.run_until_complete(asyncio.gather(*coroutines))
    return results
def loop_on_api(list_of_parameter):
loop = asyncio.get_event_loop()
coroutines = [consume_api(list_of_parameter[i]) for i in range(len(list_of_parameter))]
results = loop.run_until_complete(asyncio.gather(*coroutines))
return results
When I run the debugger, the results returned by the loop_on_api function include a list of strings corresponding to the results of consume_api, plus some occurrences of <coroutine object consume_api at 0x00...>. Those objects have cr_running set to False and a cr_frame.
Though if I check the coroutines variable, I can find all 100 coroutines, but none seems to have a cr_frame.
Any idea what I am doing wrong?
I'm also thinking my way of counting the 50 errors will be shared by all coroutines.
Any idea how I can make it specific to each request?
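On the shared counter: one option, just a sketch rather than the asker's exact code, is to keep the retry count local to each call by passing it as a parameter instead of using a global; the someurl and someproxy placeholders are from the question, and awaiting the recursive retry also avoids the bare coroutine objects described above:
    async def consume_api(parameter, attempts=0):
        url = "someurl"  # actually built from the parameter
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, proxy="someproxy") as asyncresponse:
                    r = await asyncresponse.read()
        except aiohttp.ClientError:
            if attempts >= 50:
                return "Exceeded 50 tries on same request"
            # retry only this request; the counter travels with the call, not a global
            return await consume_api(parameter, attempts + 1)
        return r.decode("utf-8")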
This should work; you can add/change/refactor whatever you want:
import aiohttp
import asyncio

async def consume_api(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.read()

def loop_on_api(list_of_urls):
    loop = asyncio.get_event_loop()
    coroutines = [consume_api(url) for url in list_of_urls]
    results = loop.run_until_complete(asyncio.gather(*coroutines))
    return results

if __name__ == '__main__':
    print(loop_on_api(['https://google.com', 'https://twitter.com']))
It seems the issue is coming from the proxy I am using, which sometimes does not carry the request or response, so forcing a rerun seems to be the best answer. I now check whether the returned results still contain coroutines and re-run loop_on_api() on them:
def loop_on_api(list_of_parameter):
    loop = asyncio.get_event_loop()
    coroutines = [consume_api(list_of_parameter[i]) for i in range(len(list_of_parameter))]
    results = loop.run_until_complete(asyncio.gather(*coroutines))
    undone = []
    rerun_list_of_parameter = []
    for i in range(len(results)):
        if str(type(results[i])) == "<class 'coroutine'>":  # not very elegant >> is there a better way?
            undone.append(i)
            rerun_list_of_parameter.append(list_of_parameter[i])
    if len(undone) > 0:
        undone_results = loop_on_api(rerun_list_of_parameter)
        for i in range(len(undone_results)):
            results[undone[i]] = undone_results[i]
    return results
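On the "is there a better way?" comment in the snippet above: comparing the string form of the type is fragile. asyncio.iscoroutine does the same check more directly; a minimal sketch (the find_undone helper name is illustrative, not from the question):
    import asyncio

    def find_undone(results):
        # indices of entries that are still un-awaited coroutine objects
        return [i for i, r in enumerate(results) if asyncio.iscoroutine(r)]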

What is the disadvantage of running asyncio.run multiple times in Python code?

I'd like to embed some async code in my Python project to make the HTTP request part asynchronous. For example, I read params from Kafka, use these params to generate some URLs and put the URLs into a list. If the length of the list is greater than 1000, I send the list to aiohttp to batch-get the responses.
I cannot change the whole project from sync to async, so I can only change the HTTP request part.
The code example is:
async def async_request(url):
    async with aiohttp.ClientSession() as client:
        resp = await client.get(url)
        result = await resp.json()
        return result

async def do_batch_request(url_list, result):
    task_list = []
    for url in url_list:
        task = asyncio.create_task(async_request(url))
        task_list.append(task)
    batch_response = await asyncio.gather(*task_list)
    result.extend(batch_response)

def batch_request(url_list):
    batch_response = []
    asyncio.run(do_batch_request(url_list, batch_response))
    return batch_response
url_list = []
for msg in kafka_consumer:
    url = msg['url']
    url_list.append(url)
    if len(url_list) >= 1000:
        batch_response = batch_request(url_list)
        parse(batch_response)
        ....
As we know, asyncio.run creates an event loop to run the async function and then closes the event loop. My question is: will this approach hurt the performance of the async code? And do you have a better way for my situation?
There's no serious problem with your approach, and you'll get a speed benefit from asyncio. The only possible problem is that if you later want to do something async elsewhere in the code, you won't be able to run it concurrently with batch_request.
There's not much else to do if you don't want to convert the whole project from sync to async, but if in the future you want to run batch_request in parallel with something else, keep in mind that you can run it in a thread and wait for the result asynchronously, as sketched below.
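A minimal sketch of that idea, reusing the batch_request function from the question; the surrounding coroutine and the executor setup are illustrative only:
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    async def fetch_batch_alongside_other_work(url_list):
        loop = asyncio.get_running_loop()
        with ThreadPoolExecutor(max_workers=1) as pool:
            # batch_request calls asyncio.run internally, which is fine here because
            # it runs in a separate thread with its own event loop
            batch_response = await loop.run_in_executor(pool, batch_request, url_list)
        return batch_response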

Too many open files error when using asyncio/pyppeteer

I'm trying to make requests with headless Chrome using pyppeteer, but I keep getting "OSError: [Errno 24] Too many open files" after a certain number of requests. I checked the open resources of the Python process with lsof and found that with every new request there's a new line like the following:
python3 14840 root 11r FIFO 0,8 0t0 64208510 pipe
Can someone tell me what resources aren't being closed? The code that's producing this error is below
def search(self, search_path):
    async def main(url):
        browser = await launch(args=['--no-sandbox'], handleSIGINT=False, handleSIGTERM=False, handleSIGHUP=False)
        page = await browser.newPage()
        await page.setJavaScriptEnabled(False)
        try:
            response = await page.goto(url, options={"timeout": 50000})
        except pyppeteer.errors.TimeoutError:
            pass
        src = await page.content()
        await browser.close()
        return src

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    url = "https://www.example.com" + search_path
    val = asyncio.get_event_loop().run_until_complete(main(url))
    loop.close()
EDIT
I managed to close the open pipes by calling
browser.process.communicate()
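For reference, a trimmed sketch of where that call could sit in the main coroutine above; placing it right after browser.close() is an assumption, not something stated in the edit:
    async def main(url):
        browser = await launch(args=['--no-sandbox'], handleSIGINT=False,
                               handleSIGTERM=False, handleSIGHUP=False)
        page = await browser.newPage()
        src = await page.content()
        await browser.close()
        # reap the Chrome child process so its pipes are released
        browser.process.communicate()
        return src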

How to Upload to Google Drive Within an Asynchronous Method in Python

I'm currently building a Discord bot that uploads a file to Google Drive when a command is used. However, the command methods are asynchronous and the files().create() method is synchronous, and calling it simply causes the bot to get stuck.
@bot.command(pass_context = True)
@commands.has_role(name = 'Archivist')
async def archivechannel(ctx, channel : discord.Channel, filename):
    await bot.say("Archiving....")
    try:
        with open("{}.txt".format(filename), "w") as openfile:
            lines = []
            async for message in bot.logs_from(channel, limit=500, reverse=True):
                if not (message.author.bot or message.content.startswith("]")):
                    print("<{}> {}#{}: {}".format(message.timestamp, message.author.name, message.author.discriminator, message.content))
                    lines.append("<{}> {}#{}: {}\n".format(message.timestamp, message.author.name, message.author.discriminator, message.content))
            openfile.writelines(lines)
        await bot.say("Archive Complete!")
    except IOError:
        await bot.say("Error: IOException")
    await bot.say("Uploading....")
    metadata = {'name' : "{}.txt".format(filename), 'mimetype' : 'application/vnd.google.apps.document', 'parents' : folderID}
    media = MediaFileUpload('{}.txt'.format(filename), mimetype='text/plain')
    res = service.files().create(body=metadata, media_body=media).execute()
    print(res)
The line causing the problem is:
res = service.files().create(body=metadata, media_body=media).execute()
The bot just gets stuck after saying "Uploading...." and doesn't upload anything.
Does anyone know how I can fix this?
Edit: Neither using a ThreadPoolExecutor nor the default executor has worked, nor has setting up a synchronous function that runs the create and execute methods, taking in the metadata and media parameters.
Edit 2: After doing some more screwing around, it turns out the problem is now in the following line:
media = MediaFileUpload('{}.txt'.format(filename), mimetype='text/plain')
However, from my testing, for the question I asked, Patrick is correct and I have marked the question as answered.
You can run your blocking operation in another thread, while your asynchronous code waits for it to complete without blocking the event loop.
We'll create a new ThreadPoolExecutor, then use run_in_executor to run the upload task in it.
from concurrent.futures import ThreadPoolExecutor

def upload_file(metadata, media):
    return service.files().create(body=metadata, media_body=media).execute()

@bot.command(pass_context = True)
@commands.has_role(name = 'Archivist')
async def archivechannel(ctx, channel : discord.Channel, filename):
    await bot.say("Archiving....")
    try:
        with open("{}.txt".format(filename), "w") as openfile:
            lines = []
            async for message in bot.logs_from(channel, limit=500, reverse=True):
                if not (message.author.bot or message.content.startswith("]")):
                    print("<{}> {}#{}: {}".format(message.timestamp, message.author.name, message.author.discriminator, message.content))
                    lines.append("<{}> {}#{}: {}\n".format(message.timestamp, message.author.name, message.author.discriminator, message.content))
            openfile.writelines(lines)
        await bot.say("Archive Complete!")
    except IOError:
        await bot.say("Error: IOException")
    await bot.say("Uploading....")
    metadata = {'name' : "{}.txt".format(filename), 'mimetype' : 'application/vnd.google.apps.document', 'parents' : folderID}
    media = MediaFileUpload('{}.txt'.format(filename), mimetype='text/plain')
    with ThreadPoolExecutor() as pool:
        res = await bot.loop.run_in_executor(
            pool, upload_file, metadata, media
        )
    print(res)
You may also be able to use the default executor by removing the context manager and passing None instead of pool. I'm having trouble finding information about the default executor, though, so you may want to experiment.
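For completeness, a minimal sketch of that default-executor variant, assuming the same upload_file helper as above; passing None simply asks the loop for its default ThreadPoolExecutor:
    # None selects the event loop's default executor instead of an explicit pool
    res = await bot.loop.run_in_executor(None, upload_file, metadata, media)
    print(res)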

Multithreading / multiprocessing with a python loop

I have a script that loops through a range of URLs to pull item locations based on returned JSON data. However, the script takes 60 minutes to run, and 55 minutes of that (per cProfile) is spent waiting for the JSON data to load.
I would like to run multiple POST requests at a time to speed this up, and have initially split the URL range into two halves to do this. Where I am getting stuck is how to implement multithreading or asyncio.
Slimmed-down code:
import asyncio
import aiohttp

# using globals is not recommended
results = dict()

url = "https://www.website.com/store/ajax/search"
query = "store={}&size=18&query=17360031"

# this is the default URL opener from the aiohttp documentation
async def open_url(store, loop=None):
    async with aiohttp.ClientSession(loop=loop) as session:
        async with session.post(url, data={'searchQuery': query.format(store)}) as resp:
            return await resp.json(), store

async def processing(loop=None):
    # the 'global' keyword is needed to write to global variables
    global results
    # schedule all requests at once and save each result to the global dict as it becomes ready
    tasks = [open_url(store, loop=loop) for store in range(0, 5)]
    for coro in asyncio.as_completed(tasks, loop=loop):
        try:
            data, store = await coro
            results[store] = data['searchResults']['results'][0]['location']['aisle']
        except (IndexError, KeyError):
            continue

if __name__ == '__main__':
    event_loop = asyncio.new_event_loop()
    event_loop.run_until_complete(processing(loop=event_loop))

    # Print Results
    for store, data in results.items():
        print(store, data)
JSON:
{u'count': 1,
 u'results': [{u'department': {u'name': u'Home', u'storeDeptId': -1},
               u'location': {u'aisle': [A], u'detailed': [A.536]},
               u'score': u'0.507073'}],
 u'totalCount': 1}
Even if you use multithreading or multiprocessing, each thread/process will still block until the JSON data is retrieved. This could speed things up a little, but it's still not your best choice.
Since you're using requests, try grequests, which combines requests with gevent. It lets you define a series of HTTP requests that run asynchronously, so you'll get a huge speed boost. The usage is very simple: just create a list of requests (using grequests.get or grequests.post) and pass it to grequests.map, as sketched below.
Hope this helps!
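A minimal sketch of that grequests approach, reusing the url and query placeholders from the question; the size argument, an assumption here, caps how many requests run at once:
    import grequests

    url = "https://www.website.com/store/ajax/search"
    query = "store={}&size=18&query=17360031"

    # build unsent request objects, one POST per store
    reqs = [grequests.post(url, data={'searchQuery': query.format(store)}) for store in range(0, 5)]
    # send them concurrently; responses come back in the same order as reqs
    responses = grequests.map(reqs, size=5)
    for resp in responses:
        if resp is not None:
            print(resp.json())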
If you want to parallelize requests (I hope that's what you're asking for), this code snippet will help.
There is a request opener, and 2000 POST requests are sent via aiohttp and asyncio.
Python 3.5 is used.
import asyncio
import aiohttp

# using globals is not recommended
results = dict()

MAX_RETRIES = 5
MATCH_SLEEP_TIME = 3  # consider moving these constants to a separate file such as constants.py

url = "https://www.website.com/store/ajax/search"
query = "store={}&size=18&query=44159"

# this is the default URL opener from the aiohttp documentation
async def open_url(store, semaphore, loop=None):
    for _ in range(MAX_RETRIES):
        with await semaphore:
            try:
                async with aiohttp.ClientSession(loop=loop) as session:
                    async with session.post(url, data={'searchQuery': query.format(store)}) as resp:
                        return await resp.json(), store
            except ConnectionResetError:
                # more exceptions can be handled here; sleep before retrying
                await asyncio.sleep(MATCH_SLEEP_TIME, loop=loop)
                continue
    return None

async def processing(semaphore, loop=None):
    # the 'global' keyword is needed to write to global variables
    global results
    # schedule all requests at once and save each result to the global dict as it becomes ready
    tasks = [open_url(store, semaphore, loop=loop) for store in range(0, 2000)]
    for coro in asyncio.as_completed(tasks, loop=loop):
        try:
            response = await coro
            if response is None:
                continue
            data, store = response
            results[store] = data['searchResults']['results'][0]['location']['aisle']
        except (IndexError, KeyError):
            continue

if __name__ == '__main__':
    event_loop = asyncio.new_event_loop()
    semaphore = asyncio.Semaphore(50, loop=event_loop)  # count of concurrent requests
    event_loop.run_until_complete(processing(semaphore, loop=event_loop))
