Separating async requests and saving using aiohttp - python

I am currently calling an external API many times and downloading the response's content from each call. I am using aiohttp and asyncio to speed up this process, but am having trouble figuring out how to separate the fetch functionality from the save functionality.
Setup
import asyncio
import os
from aiohttp import ClientSession
Currently, I am using the following function:
async def fetch_and_save(link, path, client):
    """Download ``link`` with ``client`` and write the response body to ``path``.

    Args:
        link: URL to request.
        path: Destination file path; missing parent directories are created.
        client: Open HTTP session whose ``get`` returns an async context
            manager (an ``aiohttp.ClientSession`` in this script).
    """
    # client.get() is already an async context manager; no extra await needed.
    async with client.get(link) as response:
        contents = await response.read()
    directory = os.path.dirname(path)
    # A bare file name has an empty directory part and os.makedirs("") raises.
    if directory:
        # exist_ok makes this a no-op when the directory is already there.
        os.makedirs(directory, exist_ok=True)
    with open(path, "wb") as f:
        f.write(contents)
My main call looks like this:
async def fetch_and_save_all(inputs):
    """Download every ``(link, path)`` pair in ``inputs`` over one shared session.

    Each pair is scheduled as its own task; the tasks are then awaited in
    the order in which they finish.
    """
    async with ClientSession() as client:
        pending = []
        for link, path in inputs:
            job = fetch_and_save(link, path, client)
            pending.append(asyncio.ensure_future(job))
        for finished in asyncio.as_completed(pending):
            await finished
def main(inputs):
    """Run ``fetch_and_save_all(inputs)`` to completion.

    ``asyncio.run`` creates a fresh event loop, runs the coroutine and
    closes the loop afterwards, replacing the manual
    ``get_event_loop()`` / ``run_until_complete()`` pair.
    """
    asyncio.run(fetch_and_save_all(inputs))
if __name__ == "__main__":
    # One (url, destination path) pair for each i in 1..9.
    inputs = [
        (
            f"https://httpbin.org/range/{i}",
            f"./tmp/{i}.txt",
        )
        for i in range(1, 10)
    ]
    main(inputs)
Given this basic example, is it possible to separate the fetch and save functionality in fetch_and_save?

Just create independent functions for fetch portion and save portion.
async def fetch(link, client):
    """Return the raw body of a GET request to ``link``.

    Args:
        link: URL to request.
        client: Open HTTP session whose ``get`` returns an async context
            manager (an ``aiohttp.ClientSession`` in this script).

    Returns:
        The complete response body, as returned by ``response.read()``.
    """
    # client.get() is already an async context manager; awaiting it first
    # is redundant.
    async with client.get(link) as response:
        return await response.read()
def save(contents, path):
    """Write ``contents`` to ``path`` in binary mode.

    Missing parent directories are created first. A ``path`` without a
    directory component is written relative to the current directory.

    Args:
        contents: Bytes to write.
        path: Destination file path.

    Returns:
        The number of bytes written.
    """
    directory = os.path.dirname(path)
    # os.makedirs("") raises, so only create a directory when one is named.
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, 'wb') as f:
        return f.write(contents)
async def fetch_and_save(link, path, client):
    """Fetch ``link`` through ``client`` and hand the body to ``save``."""
    save(await fetch(link, client), path)

Related

gunzip aiohttp response on the fly

How can I async the following code, without first downloading the entire file:
import gzip
import urllib.request
def gunzip_url(url: str):
    """Fetch ``url`` and return its gzip-decompressed content as text.

    The HTTP response is passed to ``gzip.open`` as a file object, so the
    data is decompressed while it is being read.

    Args:
        url: Location of the gzip-compressed resource.

    Returns:
        The decompressed content as ``str`` (text mode ``'rt'``).
    """
    # Closing the gzip wrapper does not close the file object it was given,
    # so the response gets its own context manager.
    with urllib.request.urlopen(url) as response:
        with gzip.open(response, 'rt') as f:
            return f.read()
The following works, but it is not on the fly (downloads entire gz file before decompressing):
import aiohttp
import asyncio
import gzip
async def gunzip_url(client: aiohttp.ClientSession, url: str):
    """Download ``url`` completely, then return the gunzipped bytes."""
    async with client.get(url) as response:
        compressed = await response.read()
    # The whole archive is in memory at this point; decompress in one go.
    return gzip.decompress(compressed)
async def main():
    """Gunzip both sample URLs concurrently and return the results as a list."""
    targets = ('http://some.file/1.gz', 'http://some.file/2.gz')
    async with aiohttp.ClientSession() as client:
        jobs = [gunzip_url(client, target) for target in targets]
        return await asyncio.gather(*jobs)


data = asyncio.run(main())
The following works, but is also not on the fly:
from aioify import aioify
agzip = aioify(obj=gzip, name='agzip')
async def gunzip_url(client: aiohttp.ClientSession, url):
    """Download ``url`` completely and decompress it through ``agzip``."""
    async with client.get(url) as resp:
        payload = await resp.read()
        return await agzip.decompress(payload)

Python bulk httpresponse check

I am new to programming and am trying to achieve the following.
1.Read txt file
2.check http response
3.write a .txt file for each different http result.
I made one with requests but it's not as fast as I expected,
and I am stuck at #2.
import aiohttp
import asyncio
_INPUT_FILE = "test.txt"

with open(_INPUT_FILE) as f:
    # Keep plain URL strings here; the coroutines are created in main().
    # Blank lines are skipped so they do not turn into "http://".
    urls = ["http://" + line.strip() for line in f if line.strip()]


async def check(url, session=None):
    """Request ``url`` and print its HTTP status.

    Args:
        url: Full URL to request.
        session: Optional shared ``aiohttp.ClientSession``. When omitted, a
            private session is opened for this single request.
    """
    if session is None:
        async with aiohttp.ClientSession() as own_session:
            await check(url, own_session)
        return
    try:
        async with session.get(url) as res:
            print("Status:", res.status)
    except aiohttp.ClientError as exc:
        # One unreachable host must not abort the remaining checks.
        print("Error:", url, exc)


async def main():
    """Check every entry of ``urls`` concurrently over one shared session."""
    async with aiohttp.ClientSession() as session:
        # gather() takes awaitables as separate arguments, hence the unpacking.
        await asyncio.gather(*(check(url, session) for url in urls))


asyncio.run(main())
test.txt looks like
tes.com
aaaa.com
aodff.io
I've been stuck for a while; I would appreciate it if someone could advise or help me with my code.
thanks in advance!

aiohttp download large list of pdf files

i am trying to download large number of pdf files asynchronously, python requests does not work well with async functionalities
but i am finding aiohttp hard to implement with pdf downloads, and can't find a thread for this specific task, for someone new into python async world to understand easily.
yeah it can be done with threadpoolexecutor but in this case better to keep in one thread.
this code works but need to do with 100 or so urls
asynchronously
import aiohttp
import aiofiles
async with aiohttp.ClientSession() as session:
url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
async with session.get(url) as resp:
if resp.status == 200:
f = await aiofiles.open('download_pdf.pdf', mode='wb')
await f.write(await resp.read())
await f.close()
Thanks in advance.
You could try something like this. For the sake of simplicity, the same dummy pdf will be downloaded multiple times to disk with different file names:
from asyncio import Semaphore, gather, run, wait_for
from random import randint
import aiofiles
from aiohttp.client import ClientSession
# Mock a list of different pdfs to download: the same dummy file three times
_DUMMY_PDF_URL = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
pdf_list = [_DUMMY_PDF_URL] * 3
# Size of the semaphore that caps downloads in flight
MAX_TASKS = 5
# Timeout, in seconds, applied to each download
MAX_TIME = 5
async def download(pdf_list):
    """Download every URL in ``pdf_list`` and return the gathered results.

    Each download shares one session and one semaphore and is wrapped in a
    ``wait_for`` with a ``MAX_TIME`` timeout.
    """
    sem = Semaphore(MAX_TASKS)
    async with ClientSession() as sess:
        tasks = [
            # Wait max 5 seconds for each download; a different mock file
            # name is generated for every URL
            wait_for(
                download_one(pdf_url, sess, sem, str(randint(1, 100000)) + ".pdf"),
                timeout=MAX_TIME,
            )
            for pdf_url in pdf_list
        ]
        return await gather(*tasks)
async def download_one(url, sess, sem, dest_file):
    """Download ``url`` and write the body to ``dest_file``.

    Args:
        url: Address of the file to download.
        sess: Shared HTTP session used for the request.
        sem: Semaphore held for the duration of the download.
        dest_file: Path of the file to create.

    Returns:
        ``None``. A non-200 response is reported with ``print`` and nothing
        is written.
    """
    async with sem:
        print(f"Downloading {url}")
        async with sess.get(url) as res:
            # Check everything went well before pulling the body into memory
            if res.status != 200:
                print(f"Download failed: {res.status}")
                return
            content = await res.read()
        # Plain write mode is enough; the file is never read back.
        async with aiofiles.open(dest_file, "wb") as f:
            await f.write(content)
        # No need to use close(f) when using with statement
if __name__ == "__main__":
    # Build the top-level coroutine, then hand it to asyncio's run().
    all_downloads = download(pdf_list)
    run(all_downloads)
Keep in mind that firing multiple concurrent requests at a server might get your IP banned for a period of time. In that case, consider adding a sleep call (which kind of defeats the purpose of using aiohttp) or switching to a classic sequential script. In order to keep things concurrent but kinder to the server, the script will fire at most 5 requests at any given time (MAX_TASKS).

Python aiohttp module: ambiguous .content attribute

Here is a little code snippet:
import aiohttp
import aiofiles
async def fetch(url):
    """Open a session, GET ``url`` and return the awaited ``response.content``."""
    # starting a session
    async with aiohttp.ClientSession() as session:
        # starting a get request
        async with session.get(url) as response:
            # getting response content
            # NOTE(review): in aiohttp ``response.content`` appears to be a
            # stream attribute rather than an awaitable - TODO confirm. The
            # value is also returned out of both ``async with`` blocks, which
            # is exactly what the question below asks about.
            content = await response.content
            return content
async def save_file(file_name, content):
    """Copy ``content`` into ``./binary/<file_name>`` in 1024-byte reads.

    Args:
        file_name: Name of the file to create inside ``./binary``.
        content: Stream exposing an awaitable ``read(n)`` (the response
            content stream in this script).
    """
    async with aiofiles.open(f'./binary/{file_name}', 'wb') as f:
        while True:
            # Both the stream read and the aiofiles write are coroutines;
            # without ``await`` no data would be transferred.
            chunk = await content.read(1024)
            if not chunk:
                break
            await f.write(chunk)
I am trying to download some binary files using the aiohttp library and then passing them to a coroutine using aiofiles library to write the file in the disk.
I have read the documentation but still couldn't figure out if I can pass content = await response.content or is it closed when the handle async with.. is closed? Because on a secondary blog, I found:
According to aiohttp’s documentation, because the response object was created in a context manager, it technically calls release() implicitly.
Which confuses me, should I embed the logic of the second function inside the response handle or is my logic correct?
The async context manager will close the resources related to the request, so if you return from the function, you have to make sure you've read everything of interest. So you have two options:
read the entire response into memory, e.g. with content = await response.read() or, if the file doesn't fit into memory (and also if you want to speed things up by reading and writing in parallel)
use a queue or an async iterator to parallelize reading and writing.
Here is an untested implementation of #2:
async def fetch(url):
    """Yield the body of ``url`` in chunks while the response is still open."""
    # return an async generator over contents of URL
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # getting response content in chunks no larger than 4K;
            # iter_chunked() is an async iterator, so it needs ``async for``
            async for chunk in response.content.iter_chunked(4096):
                yield chunk
async def save_file(file_name, content_iter):
    """Write every chunk produced by ``content_iter`` to ``./binary/<file_name>``.

    Args:
        file_name: Name of the file to create inside ``./binary``.
        content_iter: Async iterator yielding ``bytes`` chunks.
    """
    async with aiofiles.open(f'./binary/{file_name}', 'wb') as f:
        # The chunks come from an async generator, hence ``async for``.
        async for chunk in content_iter:
            # aiofiles' write() is a coroutine and must be awaited.
            await f.write(chunk)
async def main():
    """Stream ``url`` into ``file_name``; both names come from the enclosing scope."""
    # Calling save_file() only creates the coroutine; it runs once awaited.
    await save_file(file_name, fetch(url))
Thanks to user4815162342's code I could find a solution by parallelizing the fetch and write coroutines. I would have marked his code as the accepted solution, but since I had to add some code to make it work, here it is:
# fetch binary from server
async def fetch(url):
    """Async generator yielding the body of ``url`` in chunks of up to 4096 bytes."""
    async with aiohttp.ClientSession() as http:
        async with http.get(url) as reply:
            body_stream = reply.content
            async for piece in body_stream.iter_chunked(4096):
                yield piece
# write binary function
async def save_file(file_name, chunk_iter):
    """Create the directory tree, then stream ``chunk_iter`` into the target file."""
    # Make sure every binary sub-directory exists before opening the file.
    for sub_dir in list_binary_sub_dirs:
        create_dir_tree(sub_dir)
    target = f'./binary/bin_ts/{file_name}'
    async with aiofiles.open(target, 'wb') as out:
        async for piece in chunk_iter:
            await out.write(piece)
async def main(urls):
    """Pair one fetch generator with one writer per URL and run them together."""
    writers = []
    for url in urls:
        print('running on sublist')
        # Everything after the last '/' becomes the file name.
        _, _, file_name = url.rpartition('/')
        writers.append(save_file(file_name, fetch(url)))
    await asyncio.gather(*writers)


asyncio.run(main(some_list_of_urls))

Python package - aiohttp has a warning message "Unclosed client session"

My code is as follows:
import asyncio
import aiohttp
# Sample URLs; the script below creates one request_url() call per entry.
urls = [
'http://www.163.com/',
'http://www.sina.com.cn/',
'https://www.hupu.com/',
'http://www.csdn.net/'
]
async def get_url_data(u):
    """
    Request ``u``, print the response headers and return them.

    :param u: URL to request.
    :return: the ``headers`` attribute of the response.
    """
    print('running ', u)
    # NOTE(review): the ClientSession created here is never bound to a name
    # and never closed, which matches the "Unclosed client session" warning
    # this question reports.
    resp = await aiohttp.ClientSession().get(url=u)
    headers = resp.headers
    print(u, headers)
    return headers
async def request_url(u):
    """
    Entry-point coroutine: delegate to ``get_url_data`` and pass its result on.

    :param u: URL to request.
    :return: whatever ``get_url_data(u)`` returns.
    """
    return await get_url_data(u)
async def _request_all():
    """Run ``request_url`` for every entry of ``urls`` concurrently."""
    # asyncio.wait() no longer accepts bare coroutines; gather() wraps each
    # one in a task itself.
    return await asyncio.gather(*(request_url(u) for u in urls))


# asyncio.run() creates the event loop, runs the coroutine and closes the loop.
asyncio.run(_request_all())
When I run my code, it displays a warning message:
Unclosed client session
Anybody can give me some solutions about that?
Thanks a lot
You should close the connection in the end.
You have 2 options:
You can close the connection manually:
import aiohttp
session = aiohttp.ClientSession()
# use the session here
session.close()
Or you can use it with a context manager:
import aiohttp
import asyncio
async def fetch(client):
    """GET http://python.org through ``client`` and return the page text."""
    async with client.get('http://python.org') as resp:
        assert resp.status == 200
        page = await resp.text()
        return page
async def main(loop=None):
    """Fetch the page with a self-closing session and print it.

    :param loop: unused; kept so existing ``main(loop)`` calls keep working.
        The session binds to the loop that is running this coroutine, so
        passing the loop explicitly (deprecated in aiohttp) is unnecessary.
    """
    async with aiohttp.ClientSession() as client:
        html = await fetch(client)
        print(html)
# Create the loop explicitly instead of relying on get_event_loop() to make
# one implicitly, and close it once the coroutine has finished.
loop = asyncio.new_event_loop()
try:
    loop.run_until_complete(main(loop))
finally:
    loop.close()
The client session supports the context manager protocol for self closing.
If you are not using context manager, the proper way to close it would also need an await. Many answers on the internet miss that part, and few people actually notice it, presumably because most people use the more convenient context manager. But the manual await session.close() is essential when/if you are closing a class-wide session inside the tearDownClass() when doing unittesting.
import aiohttp
session = aiohttp.ClientSession()
# use the session here
# The await means this line has to run inside a coroutine.
await session.close()
You should use ClientSession using async context manager for proper blocking/freeing resources:
async def get_url_data(u):
    """
    Request ``u``, print the response headers and return them.

    :param u: URL to request.
    :return: the ``headers`` attribute of the response.
    """
    print('running ', u)
    async with aiohttp.ClientSession() as session:
        # Entering the request as a context manager releases the response
        # when the block exits instead of leaving that to session teardown.
        async with session.get(url=u) as resp:
            headers = resp.headers
            print(u, headers)
            return headers

Categories