gunzip aiohttp response on the fly - python

How can I make the following code asynchronous, without first downloading the entire file:
import gzip
import urllib.request

def gunzip_url(url: str):
    with gzip.open(urllib.request.urlopen(url), 'rt') as f:
        return f.read()
The following works, but it is not on the fly (it downloads the entire gz file before decompressing):
import aiohttp
import asyncio
import gzip

async def gunzip_url(client: aiohttp.ClientSession, url: str):
    async with client.get(url) as resp:
        gz = await resp.read()
    return gzip.decompress(gz)
async def main():
    async with aiohttp.ClientSession() as client:
        coros = [gunzip_url(client, 'http://some.file/1.gz'),
                 gunzip_url(client, 'http://some.file/2.gz')]
        return await asyncio.gather(*coros)

data = asyncio.run(main())
The following works, but is also not on the fly:
from aioify import aioify

agzip = aioify(obj=gzip, name='agzip')

async def gunzip_url(client: aiohttp.ClientSession, url):
    async with client.get(url) as resp:
        return await agzip.decompress(await resp.read())
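One way to get true on-the-fly behaviour is to feed each chunk of the response into zlib's incremental decompressor as it arrives. A minimal, untested sketch (the 64 KiB chunk size is arbitrary; wbits=zlib.MAX_WBITS | 16 tells zlib to expect gzip framing):

import aiohttp
import asyncio
import zlib

async def gunzip_url(client: aiohttp.ClientSession, url: str) -> bytes:
    # incremental decompressor; MAX_WBITS | 16 selects the gzip format
    decomp = zlib.decompressobj(wbits=zlib.MAX_WBITS | 16)
    parts = []
    async with client.get(url) as resp:
        # decompress each chunk as soon as it has been downloaded
        async for chunk in resp.content.iter_chunked(64 * 1024):
            parts.append(decomp.decompress(chunk))
    parts.append(decomp.flush())
    return b''.join(parts)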

aiohttp session closed without exiting the context manager

I have a pretty complicated API with custom parameters and headers, so I created a class to wrap around it. Here's a contrived example:
import asyncio
import aiohttp

# The wrapper class around my API
class MyAPI:
    def __init__(self, base_url: str):
        self.base_url = base_url

    async def send(self, session, method, url) -> aiohttp.ClientResponse:
        request_method = getattr(session, method.lower())
        full_url = f"{self.base_url}/{url}"
        async with request_method(full_url) as response:
            return response

async def main():
    api = MyAPI("https://httpbin.org")
    async with aiohttp.ClientSession() as session:
        response = await api.send(session, "GET", "/uuid")
        print(response.status)        # 200 OK
        print(await response.text())  # Exception: Connection closed

asyncio.run(main())
Why is my session closed? I didn't exit the context manager of session.
If I ignore the wrapper class, everything works as expected:
async def main():
    async with aiohttp.ClientSession() as session:
        async with session.get("https://httpbin.org/uuid") as response:
            print(await response.text())
You can't call response.text() once you have left the request_method(full_url) context: exiting that async with releases the underlying connection, so the body can no longer be read.
If you write:
async with request_method(full_url) as response:
    text = await response.text()
    return response.status, text
then the send() method returns without error.
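Folded back into the wrapper class from the question, the result might look like the following sketch (same names as above; the caller now receives the status and body rather than a ClientResponse):

import asyncio
import aiohttp

class MyAPI:
    def __init__(self, base_url: str):
        self.base_url = base_url

    async def send(self, session: aiohttp.ClientSession, method: str, url: str):
        request_method = getattr(session, method.lower())
        full_url = f"{self.base_url}/{url}"
        # read the body while the response context is still open
        async with request_method(full_url) as response:
            return response.status, await response.text()

async def main():
    api = MyAPI("https://httpbin.org")
    async with aiohttp.ClientSession() as session:
        status, body = await api.send(session, "GET", "uuid")
        print(status, body)

asyncio.run(main())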

Separating async requests and saving using aiohttp

I am currently calling an external API many times and downloading the response's content from each call. I am using aiohttp and asyncio to speed up this process, but am having trouble figuring out how to separate the fetch functionality from the save functionality.
Setup
import asyncio
import os
from aiohttp import ClientSession
Currently, I am using the following function:
async def fetch_and_save(link, path, client):
    async with await client.get(link) as response:
        contents = await response.read()
    if not os.path.exists(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))
    with open(path, "wb") as f:
        f.write(contents)
My main call looks like this:
async def fetch_and_save_all(inputs):
    async with ClientSession() as client:
        tasks = [asyncio.ensure_future(fetch_and_save(link, path, client))
                 for link, path in inputs]
        for f in asyncio.as_completed(tasks):
            await f

def main(inputs):
    loop = asyncio.get_event_loop()
    loop.run_until_complete(fetch_and_save_all(inputs))

if __name__ == "__main__":
    inputs = [
        (f"https://httpbin.org/range/{i}", f"./tmp/{i}.txt") for i in range(1, 10)]
    main(inputs)
Given this basic example, is it possible to separate the fetch and save functionality in fetch_and_save?
Just create independent functions for the fetch portion and the save portion.
async def fetch(link, client):
    async with await client.get(link) as response:
        contents = await response.read()
    return contents

def save(contents, path):
    if not os.path.exists(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))
    with open(path, 'wb') as f:
        bytes_written = f.write(contents)
    return bytes_written

async def fetch_and_save(link, path, client):
    contents = await fetch(link, client)
    save(contents, path)
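Note that save() does blocking file I/O, which stalls the event loop while it runs. If that ever becomes a bottleneck, one option (Python 3.9+, a sketch rather than part of the original answer) is to push the write onto a worker thread:

async def fetch_and_save(link, path, client):
    contents = await fetch(link, client)
    # run the blocking write in a thread so other downloads keep progressing
    await asyncio.to_thread(save, contents, path)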

Python aiohttp module: ambiguous .content attribute

Here is a little code snippet:
import aiohttp
import aiofiles

async def fetch(url):
    # starting a session
    async with aiohttp.ClientSession() as session:
        # starting a get request
        async with session.get(url) as response:
            # getting response content
            content = await response.content
            return content

async def save_file(file_name, content):
    async with aiofiles.open(f'./binary/{file_name}', 'wb') as f:
        while True:
            chunk = content.read(1024)
            if not chunk:
                break
            f.write(chunk)
I am trying to download some binary files using the aiohttp library and then pass them to a coroutine that uses the aiofiles library to write the files to disk.
I have read the documentation but still couldn't figure out whether I can pass content = await response.content around, or whether it is closed when the async with handle is closed. On a secondary blog, I found:
According to aiohttp’s documentation, because the response object was created in a context manager, it technically calls release() implicitly.
This confuses me: should I embed the logic of the second function inside the response handle, or is my logic correct?
The async context manager will close the resources related to the request, so if you return from the function, you have to make sure you've read everything of interest. You have two options:
1. read the entire response into memory, e.g. with content = await response.read(), or
2. if the file doesn't fit into memory (or you want to speed things up by reading and writing in parallel), use a queue or an async iterator to parallelize reading and writing.
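For #1, a minimal sketch (assuming the ./binary directory already exists) could look like this:

async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # read the complete body before the connection is released
            return await response.read()

async def save_file(file_name, content):
    async with aiofiles.open(f'./binary/{file_name}', 'wb') as f:
        await f.write(content)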
Here is an untested implementation of #2:
async def fetch(url):
    # return an async generator over contents of URL
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # getting response content in chunks no larger than 4K
            for chunk in response.content.iter_chunked(4096):
                yield chunk

async def save_file(file_name, content_iter):
    async with aiofiles.open(f'./binary/{file_name}', 'wb') as f:
        for chunk in content_iter:
            f.write(chunk)  # maybe you need to await this?

async def main():
    save_file(file_name, fetch(url))
Thanks to user4815162342's code I could find a solution by parallelizing the fetch and write coroutines. I would've marked his code as the accepted solution, but since I had to add some code to make it work, here it is:
# fetch binary from server
async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            async for chunk in response.content.iter_chunked(4096):
                yield chunk

# write binary function
async def save_file(file_name, chunk_iter):
    list(map(create_dir_tree, list_binary_sub_dirs))
    async with aiofiles.open(f'./binary/bin_ts/{file_name}', 'wb') as f:
        async for chunk in chunk_iter:
            await f.write(chunk)

async def main(urls):
    tasks = []
    for url in urls:
        print('running on sublist')
        file_name = url.rpartition('/')[-1]
        request_ts = fetch(url)
        tasks.append(save_file(file_name, request_ts))
    await asyncio.gather(*tasks)

asyncio.run(main(some_list_of_urls))

python aiohttp performance: connect performed on the main thread

I have the following code
import asyncio
import aiohttp

urls = [
    'http://54.224.27.241',
    'http://54.224.27.241',
    'http://54.224.27.241',
    'http://54.224.27.241',
    'http://54.224.27.241',
]

async def query(urls):
    out = []
    with aiohttp.ClientSession() as session:
        for url in urls:
            try:
                async with session.get(url, timeout=5) as resp:
                    text = await resp.text()
                    out.append(resp.status)
            except:
                print('timeout')
    return out

loop = asyncio.get_event_loop()
out = loop.run_until_complete(query(urls))
loop.close()
print(str(out))
The code is much slower than a version that uses a thread pool, and the runtime keeps increasing as the number of URLs grows (say 20, 50, etc.).
I have a feeling that the initial connection establishment is not being done in an async way.
(Note that I am connecting here to a non-existent server to deliberately produce a connection timeout.)
Can someone point out what is wrong here?
Warning: I don't promise this code works, as I can't install aiohttp at the moment, but looking at the example in the docs:
async def fetch(session, url):
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()

async def main():
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, 'http://python.org')
        print(html)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
Notice how they open aiohttp.ClientSession() with the async keyword (async with, not a bare with). Additionally, I was getting an error from your line data = await async with session.get(url) as resp:
async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def main():
    out = []
    async with aiohttp.ClientSession() as session:
        for url in urls:
            data = await fetch(session, url)
            out.append(data)
    return out

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
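Note that the loop above still awaits each URL one at a time, so the requests run sequentially. To actually issue them concurrently (usually the point of using aiohttp), the fetch coroutines can be gathered; a small sketch reusing the fetch() above:

async def main():
    async with aiohttp.ClientSession() as session:
        # schedule all requests at once and wait for them together
        return await asyncio.gather(*(fetch(session, url) for url in urls))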

Python 3.5 async keyword

I am currently looking into pulsar for an asynchronous HTTP client.
The following example is in the docs:
from pulsar.apps import http

async with http.HttpClient() as session:
    response1 = await session.get('https://github.com/timeline.json')
    response2 = await session.get('https://api.github.com/emojis.json')
but when I try to execute it I get
async with http.HttpClient() as session:
^ SyntaxError: invalid syntax
It looks like the async keyword is not recognized. I am using Python 3.5.
Working example:
import asyncio
from pulsar.apps.http import HttpClient

async def my_fun():
    async with HttpClient() as session:
        response1 = await session.get('https://github.com/timeline.json')
        response2 = await session.get('https://api.github.com/emojis.json')
        print(response1)
        print(response2)

loop = asyncio.get_event_loop()
loop.run_until_complete(my_fun())
You can only use async with inside a coroutine, so you have to do this:
from pulsar.apps.http import HttpClient
import pulsar

async def my_fun():
    async with HttpClient() as session:
        response1 = await session.get('https://github.com/timeline.json')
        response2 = await session.get('https://api.github.com/emojis.json')
        return response1, response2

loop = pulsar.get_event_loop()
res1, res2 = loop.run_until_complete(my_fun())
print(res1)
print(res2)
Internally, pulsar uses asyncio, so you don't have to import it explicitly; use it through pulsar.
As a side note, if you upgrade to Python 3.6 you can use async list/set/dict comprehensions:
from pulsar.apps.http import HttpClient
import pulsar

async def my_fun():
    async with HttpClient() as session:
        urls = ['https://github.com/timeline.json', 'https://api.github.com/emojis.json']
        return [await session.get(url) for url in urls]

loop = pulsar.get_event_loop()
res1, res2 = loop.run_until_complete(my_fun())
print(res1)
print(res2)
