Why doesn't this python aiohttp requests code run asynchronously? - python

I'm trying to access an API with aiohttp but something is causing this code to block each iteration.
def main():
async with aiohttp.ClientSession() as session:
for i, (image, target) in enumerate(dataset_val):
image_bytes = pil_to_bytes(image)
async with session.post('http://localhost:8080/predictions/resnet50', data=image_bytes) as resp:
print(await resp.text())
print(i, flush=True, end='\r')
asyncio.run(main())

As explained by #deceze, await will wait for your result inside your loop. If you want to call everything at the same time, you need to call everything from an external loop and gather the results.
Here's a way of doing it
import asyncio
import aiohttp
async def call(session: aiohttp.ClientSession, url: str, image):
image_bytes = pil_to_bytes(image)
async with session.post(url, data=image_bytes) as response:
return await response.text()
async def call_all(url:str, tasks: list):
async with aiohttp.ClientSession() as session:
results = await asyncio.gather(
*[call(session, url, img) for img, target in tasks],
return_exceptions=True
)
return results
loop = asyncio.get_event_loop()
res = loop.run_until_complete(
call_all('http://localhost:8080/predictions/resnet50', dataset_val)
)

Related

Python, Concurrency and asyncio: Problem adding a rotating proxy

I'm creating an optimized multi-threading app using asyncio and want to add a rotating proxy into the mix.
Starting with a sample taken from this outstanding article:
Speed Up Your Python Program With Concurrency
I added a rotating proxy and it stopped working. The code simply exits the function after touching the line for the proxy.
This little snippet of code works, but not when added to the main script as shown in the screenshot above.
import asyncio
import random as rnd
async def download_site():
proxy_list = [
('38.39.205.220:80'),
('38.39.204.100:80'),
('38.39.204.101:80'),
('38.39.204.94:80')
]
await asyncio.sleep(1)
proxy = rnd.choice(proxy_list)
print(proxy)
asyncio.run(download_site())
And here's the full sample:
import asyncio
import time
import aiohttp
# Sample code taken from here:
# https://realpython.com/python-concurrency/#asyncio-version
# Info for adding headers for the proxy (Scroll toward the bottom)
# https://docs.aiohttp.org/en/stable/client_advanced.html
# Good read to possible improve performance on large lists of URLs
# https://asyncio.readthedocs.io/en/latest/webscraper.html
# RUN THIS METHOD TO SEE HOW IT WORKS.
# # Original Code (working...)
# async def download_site(session, url):
# async with session.get(url, proxy="http://proxy.com") as response:
# print("Read {0} from {1}".format(response.content_length, url))
def get_proxy(self):
proxy_list = [
(754, '38.39.205.220:80'),
(681, '38.39.204.100:80'),
(682, '38.39.204.101:80'),
(678, '38.39.204.94:80')
]
proxy = random.choice(proxy_list)
print(proxy[1])
return proxy
async def download_site(session, url):
proxy_list = [
('38.39.205.220:80'),
('38.39.204.100:80'),
('38.39.204.101:80'),
('38.39.204.94:80')
]
await asyncio.sleep(1)
proxy = rnd.choice(proxy_list)
print(proxy)
async with session.get(url, proxy="http://" + proxy) as response:
print("Read {0} from {1}".format(response.content_length, url))
async def download_all_sites(sites):
async with aiohttp.ClientSession() as session:
tasks = []
for url in sites:
task = asyncio.ensure_future(download_site(session, url))
tasks.append(task)
await asyncio.gather(*tasks, return_exceptions=True)
# Modified to loop thru only 1 URL to make debugging simple
if __name__ == "__main__":
sites = [
"https://www.jython.org",
# "http://olympus.realpython.org/dice",
] #* 80
start_time = time.time()
asyncio.get_event_loop().run_until_complete(download_all_sites(sites))
duration = time.time() - start_time
print(f"Downloaded {len(sites)} sites in {duration} seconds")
Thank you for any help you can offer.
You use return_exceptions=True but you don't actually check the returned results for errors. You can use asyncio.as_completed to handle exceptions and get the earliest next result:
import asyncio
import random
import traceback
import aiohttp
URLS = ("https://stackoverflow.com",)
TIMEOUT = 5
PROXIES = (
"http://38.39.205.220:80",
"http://38.39.204.100:80",
"http://38.39.204.101:80",
"http://38.39.204.94:80",
)
def get_proxy():
return random.choice(PROXIES)
async def download_site(session, url):
proxy = get_proxy()
print(f"Got proxy: {proxy}")
async with session.get(url, proxy=f"{proxy}", timeout=TIMEOUT) as resp:
print(f"{url}: {resp.status}")
return await resp.text()
async def main():
tasks = []
async with aiohttp.ClientSession() as session:
for url in URLS:
tasks.append(asyncio.create_task(download_site(session, url)))
for coro in asyncio.as_completed(tasks):
try:
html = await coro
except Exception:
traceback.print_exc()
else:
print(len(html))
if __name__ == "__main__":
asyncio.run(main())

aiohttp - Splitting task while getting large number of HTML pages - RuntimeError: cannot reuse already awaited coroutine

I have list of URL links which I get and save to HTML files with following code:
tasksURL = []
async with aiohttp.ClientSession() as session:
for url in listOfURLs:
tasksURL.append(self.fetch(session, url))
allHTMLs = await asyncio.gather(*tasksURL)
i = 0
for html in allHTMLs:
i += 1
with open("myPath.html", mode='w', encoding='UTF-8', errors='strict', buffering=1) as f:
f.write(html)
Since URL list can be quite large (up to 60 000) I need to chunk this tasks.
I tried following solution. I've defined function that will chop list in smaller chunks with this function:
def chunkList(self, listOfURLs, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i:i + n]
And than use this function to run each chunked piece of listOfURLs like this:
tasksURL = []
chunkedListOfURLs = self.chunkList(listOfURLs, 5)
for URLList in chunkedListOfURLs:
async with aiohttp.ClientSession() as session:
for url in URLList:
tasksURL.append(self.fetch(session, url))
allHTMLs = await asyncio.gather(*tasksURL)
for html in allHTMLs:
with open("myPath.html", mode='w', encoding='UTF-8', errors='strict', buffering=1) as f:
f.write(html)
I'm getting error:
RuntimeError: cannot reuse already awaited coroutine
I understand problem but haven't found way around it.
I would suggest to use the asyncio.Queue in this case. You don't want to create 60k tasks for each URL. When you use queue, you can spawn a set number of workers and limit the queue size:
If maxsize is less than or equal to zero, the queue size is infinite.
If it is an integer greater than 0, then await put() blocks when the
queue reaches maxsize until an item is removed by get().
import asyncio
import random
WORKERS = 10
async def worker(q):
while True:
url = await q.get()
t = random.uniform(1, 5)
print(f"START: {url} ({t:.2f}s)")
await asyncio.sleep(t)
print(f"END: {url}")
q.task_done()
async def main():
q = asyncio.Queue(maxsize=100)
tasks = []
for _ in range(WORKERS):
tasks.append(asyncio.create_task(worker(q)))
for i in range(10):
await q.put(f"http://example.com/{i}")
await q.join()
for task in tasks:
task.cancel()
await asyncio.gather(*tasks, return_exceptions=True)
if __name__ == "__main__":
main = asyncio.run(main())
Test:
$ python test.py
START: http://example.com/0 (1.14s)
START: http://example.com/1 (4.40s)
START: http://example.com/2 (2.48s)
START: http://example.com/3 (4.34s)
START: http://example.com/4 (1.94s)
END: http://example.com/0
START: http://example.com/5 (1.52s)
END: http://example.com/4
START: http://example.com/6 (4.84s)
END: http://example.com/2
START: http://example.com/7 (4.35s)
END: http://example.com/5
START: http://example.com/8 (2.33s)
END: http://example.com/3
START: http://example.com/9 (1.80s)
END: http://example.com/1
END: http://example.com/8
END: http://example.com/9
END: http://example.com/6
END: http://example.com/7
Btw writing to files will block your main event loop, either call it in run_in_executor or use aiofiles.
Update Sat 3 Apr 13:49:55 UTC 2021:
Example:
import asyncio
import traceback
import aiohttp
WORKERS = 5
URLS = [
"http://airbnb.com",
"http://amazon.co.uk",
"http://amazon.com",
"http://baidu.com",
"http://basecamp.com",
"http://bing.com",
"http://djangoproject.com",
"http://envato.com",
"http://facebook.com",
"http://github.com",
"http://gmail.com",
"http://google.co.uk",
"http://google.com",
"http://google.es",
"http://google.fr",
"http://heroku.com",
"http://instagram.com",
"http://linkedin.com",
"http://live.com",
"http://netflix.com",
"http://rubyonrails.org",
"http://shopify.com",
"http://stackoverflow.com",
"http://trello.com",
"http://wordpress.com",
"http://yahoo.com",
"http://yandex.ru",
"http://yiiframework.com",
"http://youtube.com",
]
class Bot:
async def fetch(self, client, url):
async with client.get(url) as r:
return await r.text()
async def worker(self, q, client):
loop = asyncio.get_running_loop()
while True:
url = await q.get()
try:
html = await self.fetch(client, url)
except Exception:
traceback.print_exc()
else:
await loop.run_in_executor(None, self.save_to_disk, url, html)
finally:
q.task_done()
def save_to_disk(self, url, html):
print(f"{url} ({len(html)})")
async def main():
q = asyncio.Queue(maxsize=100)
tasks = []
async with aiohttp.ClientSession() as client:
bot = Bot()
for _ in range(WORKERS):
tasks.append(asyncio.create_task(bot.worker(q, client)))
for url in URLS:
await q.put(url)
await q.join()
for task in tasks:
task.cancel()
await asyncio.gather(*tasks, return_exceptions=True)
if __name__ == "__main__":
main = asyncio.run(main())
In your example your tasksURL array will have a set of awaited coroutines after every chunk that you successfully process. You then append new coroutines to that list on subsequent iterations and when you go to gather you're trying to await complete coroutines as well as new, unawaited ones. Simply creating a new tasksURL list for each chunk will solve your problem:
for URLList in chunkedListOfURLs:
tasksURL = []
async with aiohttp.ClientSession() as session:
for url in URLList:
tasksURL.append(fetch(session, url))
allHTMLs = await asyncio.gather(*tasksURL)
Note that by default, aiohttp's client session allows 100 concurrent connections. See https://docs.aiohttp.org/en/stable/client_advanced.html#limiting-connection-pool-size for more details, so you get some concurrency limits out of the box without chunking. Semaphores and queues are also other options to limit concurrency depending on your requirements as mentioned in other answers.

AttributeError: module 'select' has no attribute 'select' error ASYNCIO

I am executing the below code on a windows pc. I read that, by default, Windows can use only 64 sockets in asyncio loop. I don't know if this is the reason for the error.
import aiohttp
import asyncio
import time
async def download_file(url):
print(f'started downloading{url}')
connector = aiohttp.TCPConnector(limit=60)
async with aiohttp.clientSession(connector) as session:
async with session.get(url) as resp:
content = await resp.read()
print (f'Finished download{url}')
return content
async def write_file(n, content):
filename = f'async_{n}.html'
with open(filename,'wb') as f:
print(f'started writing{filename}')
f.write(content)
print(f'Finished writing{filename}')
async def scrape_task(n,url):
content = await download_file(url)
await write_file(n,content)
async def main():
tasks = []
for n,url in enumerate(open('urls.txt').readlines()):
tasks.append((scrape_task(n, url)))
await asyncio.wait(tasks)
if __name__ == '__main__':
t=time.perf_counter()
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
t2 = time.perf_counter() - t
print(f'Total time taken: {t2:0.2f} seconds')
I made the below changes to limit the connections to 60
connector = aiohttp.TCPConnector(limit=60)
async with aiohttp.clientSession(connector) as session:
I can't figure out where I am going wrong.

Asynchronous requests script crashes with too many URLs

I'm trying to check a set of URLs for their status code and return all that are of code 4xx or 5xx.
In total I need to check about 12500 URLs and my script works fine for up to about 7000 URLs. Above that the script crashes with ResourceWarning unclosed transport error.
I'm using python-3.6 and aiohttp 3.5.4
Any idea what's causing this?
async def fetch(url, session):
async with session.get(url) as response:
data = response.status
return url, data
async def bound_fetch(sem, url, session):
async with sem:
return await fetch(url, session)
async def check_urls(url_list):
''' get status code for all urls and write into dictionary '''
base_url = <base_url>
tasks = []
sem = asyncio.Semaphore(10)
async with ClientSession() as session:
for url in url_list:
full_url = base_url + url
task = asyncio.ensure_future(bound_fetch(sem, full_url.format(), session))
tasks.append(task)
results = await asyncio.gather(*tasks)
results_dict = defaultdict(list)
for res in results:
if res[1] != 200 and res[1] != 301 and res[1] != 302:
print(f'ERROR {str(res[1])} {res[0]}')
results_dict[res[1]].append(res[0])
print(f'URLs checked, found {str(len(results_dict))} errors')
''' main function'''
loop = asyncio.get_event_loop()
loop.set_debug(True)
warnings.simplefilter('always', ResourceWarning)
future = asyncio.ensure_future(check_urls(list_of_urls))
loop.run_until_complete(future)

Aiohttp not performing any requests

First of all heres the code:
import random
import asyncio
from aiohttp import ClientSession
import csv
headers =[]
def extractsites(file):
sites = []
readfile = open(file, "r")
reader = csv.reader(readfile, delimiter=",")
raw = list(reader)
for a in raw:
sites.append((a[1]))
return sites
async def bound_fetch(sem, url):
async with sem:
print("doing request for "+ url)
async with ClientSession() as session:
async with session.get(url) as response:
responseheader = await response.headers
print(headers)
async def run():
urls = extractsites("cisco-umbrella.csv")
tasks = []
sem = asyncio.Semaphore(100)
for i in urls:
task = asyncio.ensure_future(bound_fetch(sem, "http://"+i))
tasks.append(task)
headers = await asyncio.wait(*tasks)
print(headers)
def main():
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run())
loop.run_until_complete(future)
if __name__ == '__main__':
main()
As per my last question I'm following this blog post:
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
I tried to adapt my code as close as possible to the example implementation but this code is still not making any requests and printing the headers in bound_headers as I wish.
Can somebody spot whats wrong with this code ?
response.headers is a regular property, no need to put await before the call
asyncio.wait on other hand accepts a list of futures and returns (done, pending) pair.
Looks like you should replace await wait() call with await asyncio.gather(*tasks) (gather doc)

Categories