Python aiohttp module: ambiguous .content attribute

Here is a little code snippet:
import aiohttp
import aiofiles

async def fetch(url):
    # starting a session
    async with aiohttp.ClientSession() as session:
        # starting a get request
        async with session.get(url) as response:
            # getting response content
            content = await response.content
            return content

async def save_file(file_name, content):
    async with aiofiles.open(f'./binary/{file_name}', 'wb') as f:
        while True:
            chunk = content.read(1024)
            if not chunk:
                break
            f.write(chunk)
I am trying to download some binary files using the aiohttp library and then pass them to a coroutine using the aiofiles library to write the files to disk.
I have read the documentation, but I still couldn't figure out whether I can pass content = await response.content out of the fetch coroutine, or whether it is closed when the async with... handle is closed. On a secondary blog, I found:
According to aiohttp’s documentation, because the response object was created in a context manager, it technically calls release() implicitly.
Which confuses me: should I embed the logic of the second function inside the response handle, or is my logic correct?

The async context manager will close the resources related to the request, so if you return from the function, you have to make sure you've read everything of interest. So you have two options:
1. read the entire response into memory, e.g. with content = await response.read(), or
2. if the file doesn't fit into memory (and also if you want to speed things up by reading and writing in parallel), use a queue or an async iterator to parallelize reading and writing.
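For option #1, a minimal sketch (untested, assuming the same aiohttp/aiofiles imports as in the question) that reads the whole body before the context managers close:

async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # read() consumes the whole body while the response is still open
            return await response.read()

async def save_file(file_name, content):
    # content is a plain bytes object here, so a single awaited write is enough
    async with aiofiles.open(f'./binary/{file_name}', 'wb') as f:
        await f.write(content)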
Here is an untested implementation of #2:
async def fetch(url):
    # return an async generator over contents of URL
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # getting response content in chunks no larger than 4K
            async for chunk in response.content.iter_chunked(4096):
                yield chunk

async def save_file(file_name, content_iter):
    async with aiofiles.open(f'./binary/{file_name}', 'wb') as f:
        async for chunk in content_iter:
            await f.write(chunk)  # aiofiles writes must be awaited

async def main():
    await save_file(file_name, fetch(url))

Thanks to user4815162342's code I could find a solution by parallelizing the fetch and write coroutines. I would have marked his code as the accepted solution, but since I had to add some code to make it work, here it is:
# fetch binary from server
async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            async for chunk in response.content.iter_chunked(4096):
                yield chunk

# write binary function
async def save_file(file_name, chunk_iter):
    list(map(create_dir_tree, list_binary_sub_dirs))
    async with aiofiles.open(f'./binary/bin_ts/{file_name}', 'wb') as f:
        async for chunk in chunk_iter:
            await f.write(chunk)

async def main(urls):
    tasks = []
    for url in urls:
        print('running on sublist')
        file_name = url.rpartition('/')[-1]
        request_ts = fetch(url)
        tasks.append(save_file(file_name, request_ts))
    await asyncio.gather(*tasks)

asyncio.run(main(some_list_of_urls))
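A possible refinement (a sketch only, not part of the accepted code): open a single ClientSession and pass it into fetch(), as the later examples on this page do, instead of creating one session per URL:

async def fetch(session, url):
    async with session.get(url) as response:
        async for chunk in response.content.iter_chunked(4096):
            yield chunk

async def main(urls):
    # one session shared by every download
    async with aiohttp.ClientSession() as session:
        tasks = [save_file(url.rpartition('/')[-1], fetch(session, url))
                 for url in urls]
        await asyncio.gather(*tasks)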

Related

Separating async requests and saving using aiohttp

I am currently calling an external API many times and downloading the response's content from each call. I am using aiohttp and asyncio to speed up this process, but am having trouble figuring out how to separate the fetch functionality from the save functionality.
Setup
import asyncio
import os
from aiohttp import ClientSession
Currently, I am using the following function:
async def fetch_and_save(link, path, client):
    async with await client.get(link) as response:
        contents = await response.read()
    if not os.path.exists(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))
    with open(path, "wb") as f:
        f.write(contents)
My main call looks like this:
async def fetch_and_save_all(inputs):
    async with ClientSession() as client:
        tasks = [asyncio.ensure_future(fetch_and_save(link, path, client))
                 for link, path in inputs]
        for f in asyncio.as_completed(tasks):
            await f

def main(inputs):
    loop = asyncio.get_event_loop()
    loop.run_until_complete(fetch_and_save_all(inputs))

if __name__ == "__main__":
    inputs = [
        (f"https://httpbin.org/range/{i}", f"./tmp/{i}.txt") for i in range(1, 10)]
    main(inputs)
Given this basic example, is it possible to separate the fetch and save functionality in fetch_and_save?
Just create independent functions for the fetch portion and the save portion.
async def fetch(link, client):
    async with await client.get(link) as response:
        contents = await response.read()
    return contents

def save(contents, path):
    if not os.path.exists(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))
    with open(path, 'wb') as f:
        bytes_written = f.write(contents)
    return bytes_written

async def fetch_and_save(link, path, client):
    contents = await fetch(link, client)
    save(contents, path)
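If the blocking directory creation and file write ever become a concern inside the event loop, an async variant of save is possible with aiofiles (used in the question above this one; it is an assumption here, not part of this question's code). A sketch:

import aiofiles  # assumption: not used in the original question

async def save_async(contents, path):
    # exist_ok avoids the separate os.path.exists check
    os.makedirs(os.path.dirname(path), exist_ok=True)
    async with aiofiles.open(path, "wb") as f:
        return await f.write(contents)

async def fetch_and_save(link, path, client):
    contents = await fetch(link, client)
    await save_async(contents, path)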

aiohttp download large list of pdf files

I am trying to download a large number of PDF files asynchronously; python requests does not work well with async functionality.
I am finding aiohttp hard to implement for PDF downloads, and I can't find a thread for this specific task that someone new to Python's async world can understand easily.
Yes, it could be done with a ThreadPoolExecutor, but in this case it is better to keep it in one thread.
This code works, but I need to do it with 100 or so urls, asynchronously:
import aiohttp
import aiofiles

async with aiohttp.ClientSession() as session:
    url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
    async with session.get(url) as resp:
        if resp.status == 200:
            f = await aiofiles.open('download_pdf.pdf', mode='wb')
            await f.write(await resp.read())
            await f.close()
Thanks in advance.
You could try something like this. For the sake of simplicity, the same dummy pdf will be downloaded multiple times to disk with different file names:
from asyncio import Semaphore, gather, run, wait_for
from random import randint

import aiofiles
from aiohttp.client import ClientSession

# Mock a list of different pdfs to download
pdf_list = [
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
]

MAX_TASKS = 5
MAX_TIME = 5

async def download(pdf_list):
    tasks = []
    sem = Semaphore(MAX_TASKS)
    async with ClientSession() as sess:
        for pdf_url in pdf_list:
            # Mock a different file name each iteration
            dest_file = str(randint(1, 100000)) + ".pdf"
            tasks.append(
                # Wait max 5 seconds for each download
                wait_for(
                    download_one(pdf_url, sess, sem, dest_file),
                    timeout=MAX_TIME,
                )
            )
        return await gather(*tasks)

async def download_one(url, sess, sem, dest_file):
    async with sem:
        print(f"Downloading {url}")
        async with sess.get(url) as res:
            content = await res.read()
        # Check everything went well
        if res.status != 200:
            print(f"Download failed: {res.status}")
            return
        async with aiofiles.open(dest_file, "+wb") as f:
            await f.write(content)
            # No need to use close(f) when using with statement

if __name__ == "__main__":
    run(download(pdf_list))
Keep in mind that firing multiple concurrent requests at a server might get your IP banned for a period of time. In that case, consider adding a sleep call (which kind of defeats the purpose of using aiohttp) or switching to a classic sequential script. In order to keep things concurrent but kinder to the server, the script will fire at most 5 requests at any given time (MAX_TASKS).
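If you do decide to throttle, one option (a sketch with an arbitrary one-second delay, not tested against a real server) is to sleep while still holding the semaphore, so at most MAX_TASKS requests start per delay window; note that the sleep counts toward the wait_for timeout above:

from asyncio import sleep

REQUEST_DELAY = 1  # assumption: tune to whatever the server tolerates

async def download_one(url, sess, sem, dest_file):
    async with sem:
        # Holding the semaphore during the sleep spaces the requests out
        await sleep(REQUEST_DELAY)
        print(f"Downloading {url}")
        async with sess.get(url) as res:
            content = await res.read()
        if res.status != 200:
            print(f"Download failed: {res.status}")
            return
        async with aiofiles.open(dest_file, "wb") as f:
            await f.write(content)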

Fetch file from localhost without web server

I want to fetch a file from localhost without a web server, asynchronously. It seems that it should be possible using the file:// scheme. The following code sample is taken from the documentation, but obviously it doesn't work:
import aiohttp
import asyncio

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def main():
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, 'file://localhost/Users/user/test.txt')
        print(html)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
How can I make it work?
The only way I see is to run "curl file://path" in a separate thread pool using run_in_executor, but I think there should be a way to fix the code.
If you need to obtain the contents of a local file, you can do it with ordinary Python built-ins, such as:
with open('/Users/user/test.txt') as rd:
    html = rd.read()
If the file is not very large, and is stored on a local filesystem, you don't even need to make it async, as reading it will be fast enough not to disturb the event loop. If the file is large or reading it might be slow for other reasons, you should read it through run_in_executor to prevent it from blocking other asyncio code. For example (untested):
def read_file_sync(file_name):
    with open(file_name) as rd:
        return rd.read()

async def read_file(file_name):
    loop = asyncio.get_event_loop()
    html = await loop.run_in_executor(None, read_file_sync, file_name)
    return html
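Alternatively, aiofiles (used elsewhere on this page) wraps the same thread-pool delegation behind an async file API, so an equivalent sketch would be:

import aiofiles

async def read_file(file_name):
    # aiofiles runs the blocking open/read in a thread pool internally
    async with aiofiles.open(file_name) as f:
        return await f.read()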

Fetch HEAD request's status asynchronously in aiohttp

The question is regarding aiohttp library usage.
My goal here is to check a list of urls by sending a bunch of HEAD requests, potentially asynchronously, and eventually create a dict of url: status pairs.
I am new to asyncio, and I found a lot of examples where people use GET requests to fetch html, for example, and they use await resp.read() or await resp.text(), and it works fine, but with a HEAD request I don't have a body, I just have headers, that's it. If I try to await resp.status or resp itself as an object, it does not work, as they are not awaitable.
The code below works only synchronously, step by step, and I can't figure out how to make it run asynchronously. It seems like whatever I do with the status turns the code into sync mode somehow...
I would be glad to see your ideas.
Thanks.
import asyncio
import aiohttp

urls_list = [url1, url2, url3, etc, etc, etc, ]

status_dict = {}

async def main():
    async with aiohttp.ClientSession() as session:
        for individual_url in urls_list:
            async with session.head(individual_url) as resp:
                status_dict.update({individual_url: resp.status})

asyncio.run(main())
You can use asyncio.gather:
import asyncio
import aiohttp

urls_list = ["https://google.com", "https://yahoo.com", "http://hello123456789.com"]

status_dict = {}

async def head_status(session, url) -> dict:
    async with session.head(url) as resp:
        return {url: resp.status}

async def main():
    async with aiohttp.ClientSession() as session:
        statuses = await asyncio.gather(*[head_status(session, url) for url in urls_list],
                                        return_exceptions=True)
        for a in statuses:
            if not isinstance(a, Exception):
                status_dict.update(a)

asyncio.run(main())
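Since return_exceptions=True makes gather() hand back exceptions as ordinary results instead of cancelling the other requests, URLs that fail are silently dropped from status_dict above. If you would rather record them, one possible variant of head_status (a sketch, catching only aiohttp client errors):

async def head_status(session, url) -> dict:
    try:
        async with session.head(url) as resp:
            return {url: resp.status}
    except aiohttp.ClientError as e:
        # keep the URL in the result dict, with the error in place of a status
        return {url: repr(e)}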

aiohttp with asyncio and Semaphores returning a list filled with Nones

I have a script that checks the status code for a couple hundred thousand supplied websites, and I was trying to integrate a Semaphore into the flow to speed up processing. The problem is that whenever I integrate a Semaphore, I just get a list populated with None objects, and I'm not entirely sure why.
I have been mostly copying code from other sources, as I don't fully grok asynchronous programming yet, but it seems like when I debug I should be getting results out of the function; something is going wrong when I gather the results. I've tried juggling around my looping, my gathering, ensuring futures, etc., but nothing seems to return a list of things that work.
async def fetch(session, url):
    try:
        async with session.head(url, allow_redirects=True) as resp:
            return url, resp.real_url, resp.status, resp.reason
    except Exception as e:
        return url, None, e, 'Error'

async def bound_fetch(sem, session, url):
    async with sem:
        await fetch(session, url)

async def run(urls):
    timeout = 15
    tasks = []
    sem = asyncio.Semaphore(100)
    conn = aiohttp.TCPConnector(limit=64, ssl=False)
    async with aiohttp.ClientSession(connector=conn) as session:
        for url in urls:
            task = asyncio.wait_for(bound_fetch(sem, session, url), timeout)
            tasks.append(task)
        responses = await asyncio.gather(*tasks)
        # responses = [await f for f in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks))]
        return responses

urls = ['https://google.com', 'https://yahoo.com']
loop = asyncio.ProactorEventLoop()
data = loop.run_until_complete(run(urls))
I've commented out the progress bar component, but that implementation returns the desired results when there is no semaphore.
Any help would be greatly appreciated. I am furiously reading up on asynchronous programming, but I can't wrap my mind around it yet.
You should explicitly return the result of awaiting the coroutine. As written, bound_fetch awaits fetch but never returns its value, so it implicitly returns None, which is exactly what gather collects.
Replace this code...
async def bound_fetch(sem, session, url):
    async with sem:
        await fetch(session, url)
... with this:
async def bound_fetch(sem, session, url):
    async with sem:
        return await fetch(session, url)
