First of all heres the code:
import random
import asyncio
from aiohttp import ClientSession
import csv
headers =[]
def extractsites(file):
sites = []
readfile = open(file, "r")
reader = csv.reader(readfile, delimiter=",")
raw = list(reader)
for a in raw:
sites.append((a[1]))
return sites
async def bound_fetch(sem, url):
async with sem:
print("doing request for "+ url)
async with ClientSession() as session:
async with session.get(url) as response:
responseheader = await response.headers
print(headers)
async def run():
urls = extractsites("cisco-umbrella.csv")
tasks = []
sem = asyncio.Semaphore(100)
for i in urls:
task = asyncio.ensure_future(bound_fetch(sem, "http://"+i))
tasks.append(task)
headers = await asyncio.wait(*tasks)
print(headers)
def main():
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run())
loop.run_until_complete(future)
if __name__ == '__main__':
main()
As per my last question I'm following this blog post:
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
I tried to adapt my code as close as possible to the example implementation but this code is still not making any requests and printing the headers in bound_headers as I wish.
Can somebody spot whats wrong with this code ?
response.headers is a regular property, no need to put await before the call
asyncio.wait on other hand accepts a list of futures and returns (done, pending) pair.
Looks like you should replace await wait() call with await asyncio.gather(*tasks) (gather doc)
Related
I'm trying to access an API with aiohttp but something is causing this code to block each iteration.
def main():
async with aiohttp.ClientSession() as session:
for i, (image, target) in enumerate(dataset_val):
image_bytes = pil_to_bytes(image)
async with session.post('http://localhost:8080/predictions/resnet50', data=image_bytes) as resp:
print(await resp.text())
print(i, flush=True, end='\r')
asyncio.run(main())
As explained by #deceze, await will wait for your result inside your loop. If you want to call everything at the same time, you need to call everything from an external loop and gather the results.
Here's a way of doing it
import asyncio
import aiohttp
async def call(session: aiohttp.ClientSession, url: str, image):
image_bytes = pil_to_bytes(image)
async with session.post(url, data=image_bytes) as response:
return await response.text()
async def call_all(url:str, tasks: list):
async with aiohttp.ClientSession() as session:
results = await asyncio.gather(
*[call(session, url, img) for img, target in tasks],
return_exceptions=True
)
return results
loop = asyncio.get_event_loop()
res = loop.run_until_complete(
call_all('http://localhost:8080/predictions/resnet50', dataset_val)
)
I'm creating an optimized multi-threading app using asyncio and want to add a rotating proxy into the mix.
Starting with a sample taken from this outstanding article:
Speed Up Your Python Program With Concurrency
I added a rotating proxy and it stopped working. The code simply exits the function after touching the line for the proxy.
This little snippet of code works, but not when added to the main script as shown in the screenshot above.
import asyncio
import random as rnd
async def download_site():
proxy_list = [
('38.39.205.220:80'),
('38.39.204.100:80'),
('38.39.204.101:80'),
('38.39.204.94:80')
]
await asyncio.sleep(1)
proxy = rnd.choice(proxy_list)
print(proxy)
asyncio.run(download_site())
And here's the full sample:
import asyncio
import time
import aiohttp
# Sample code taken from here:
# https://realpython.com/python-concurrency/#asyncio-version
# Info for adding headers for the proxy (Scroll toward the bottom)
# https://docs.aiohttp.org/en/stable/client_advanced.html
# Good read to possible improve performance on large lists of URLs
# https://asyncio.readthedocs.io/en/latest/webscraper.html
# RUN THIS METHOD TO SEE HOW IT WORKS.
# # Original Code (working...)
# async def download_site(session, url):
# async with session.get(url, proxy="http://proxy.com") as response:
# print("Read {0} from {1}".format(response.content_length, url))
def get_proxy(self):
proxy_list = [
(754, '38.39.205.220:80'),
(681, '38.39.204.100:80'),
(682, '38.39.204.101:80'),
(678, '38.39.204.94:80')
]
proxy = random.choice(proxy_list)
print(proxy[1])
return proxy
async def download_site(session, url):
proxy_list = [
('38.39.205.220:80'),
('38.39.204.100:80'),
('38.39.204.101:80'),
('38.39.204.94:80')
]
await asyncio.sleep(1)
proxy = rnd.choice(proxy_list)
print(proxy)
async with session.get(url, proxy="http://" + proxy) as response:
print("Read {0} from {1}".format(response.content_length, url))
async def download_all_sites(sites):
async with aiohttp.ClientSession() as session:
tasks = []
for url in sites:
task = asyncio.ensure_future(download_site(session, url))
tasks.append(task)
await asyncio.gather(*tasks, return_exceptions=True)
# Modified to loop thru only 1 URL to make debugging simple
if __name__ == "__main__":
sites = [
"https://www.jython.org",
# "http://olympus.realpython.org/dice",
] #* 80
start_time = time.time()
asyncio.get_event_loop().run_until_complete(download_all_sites(sites))
duration = time.time() - start_time
print(f"Downloaded {len(sites)} sites in {duration} seconds")
Thank you for any help you can offer.
You use return_exceptions=True but you don't actually check the returned results for errors. You can use asyncio.as_completed to handle exceptions and get the earliest next result:
import asyncio
import random
import traceback
import aiohttp
URLS = ("https://stackoverflow.com",)
TIMEOUT = 5
PROXIES = (
"http://38.39.205.220:80",
"http://38.39.204.100:80",
"http://38.39.204.101:80",
"http://38.39.204.94:80",
)
def get_proxy():
return random.choice(PROXIES)
async def download_site(session, url):
proxy = get_proxy()
print(f"Got proxy: {proxy}")
async with session.get(url, proxy=f"{proxy}", timeout=TIMEOUT) as resp:
print(f"{url}: {resp.status}")
return await resp.text()
async def main():
tasks = []
async with aiohttp.ClientSession() as session:
for url in URLS:
tasks.append(asyncio.create_task(download_site(session, url)))
for coro in asyncio.as_completed(tasks):
try:
html = await coro
except Exception:
traceback.print_exc()
else:
print(len(html))
if __name__ == "__main__":
asyncio.run(main())
I have list of URL links which I get and save to HTML files with following code:
tasksURL = []
async with aiohttp.ClientSession() as session:
for url in listOfURLs:
tasksURL.append(self.fetch(session, url))
allHTMLs = await asyncio.gather(*tasksURL)
i = 0
for html in allHTMLs:
i += 1
with open("myPath.html", mode='w', encoding='UTF-8', errors='strict', buffering=1) as f:
f.write(html)
Since URL list can be quite large (up to 60 000) I need to chunk this tasks.
I tried following solution. I've defined function that will chop list in smaller chunks with this function:
def chunkList(self, listOfURLs, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i:i + n]
And than use this function to run each chunked piece of listOfURLs like this:
tasksURL = []
chunkedListOfURLs = self.chunkList(listOfURLs, 5)
for URLList in chunkedListOfURLs:
async with aiohttp.ClientSession() as session:
for url in URLList:
tasksURL.append(self.fetch(session, url))
allHTMLs = await asyncio.gather(*tasksURL)
for html in allHTMLs:
with open("myPath.html", mode='w', encoding='UTF-8', errors='strict', buffering=1) as f:
f.write(html)
I'm getting error:
RuntimeError: cannot reuse already awaited coroutine
I understand problem but haven't found way around it.
I would suggest to use the asyncio.Queue in this case. You don't want to create 60k tasks for each URL. When you use queue, you can spawn a set number of workers and limit the queue size:
If maxsize is less than or equal to zero, the queue size is infinite.
If it is an integer greater than 0, then await put() blocks when the
queue reaches maxsize until an item is removed by get().
import asyncio
import random
WORKERS = 10
async def worker(q):
while True:
url = await q.get()
t = random.uniform(1, 5)
print(f"START: {url} ({t:.2f}s)")
await asyncio.sleep(t)
print(f"END: {url}")
q.task_done()
async def main():
q = asyncio.Queue(maxsize=100)
tasks = []
for _ in range(WORKERS):
tasks.append(asyncio.create_task(worker(q)))
for i in range(10):
await q.put(f"http://example.com/{i}")
await q.join()
for task in tasks:
task.cancel()
await asyncio.gather(*tasks, return_exceptions=True)
if __name__ == "__main__":
main = asyncio.run(main())
Test:
$ python test.py
START: http://example.com/0 (1.14s)
START: http://example.com/1 (4.40s)
START: http://example.com/2 (2.48s)
START: http://example.com/3 (4.34s)
START: http://example.com/4 (1.94s)
END: http://example.com/0
START: http://example.com/5 (1.52s)
END: http://example.com/4
START: http://example.com/6 (4.84s)
END: http://example.com/2
START: http://example.com/7 (4.35s)
END: http://example.com/5
START: http://example.com/8 (2.33s)
END: http://example.com/3
START: http://example.com/9 (1.80s)
END: http://example.com/1
END: http://example.com/8
END: http://example.com/9
END: http://example.com/6
END: http://example.com/7
Btw writing to files will block your main event loop, either call it in run_in_executor or use aiofiles.
Update Sat 3 Apr 13:49:55 UTC 2021:
Example:
import asyncio
import traceback
import aiohttp
WORKERS = 5
URLS = [
"http://airbnb.com",
"http://amazon.co.uk",
"http://amazon.com",
"http://baidu.com",
"http://basecamp.com",
"http://bing.com",
"http://djangoproject.com",
"http://envato.com",
"http://facebook.com",
"http://github.com",
"http://gmail.com",
"http://google.co.uk",
"http://google.com",
"http://google.es",
"http://google.fr",
"http://heroku.com",
"http://instagram.com",
"http://linkedin.com",
"http://live.com",
"http://netflix.com",
"http://rubyonrails.org",
"http://shopify.com",
"http://stackoverflow.com",
"http://trello.com",
"http://wordpress.com",
"http://yahoo.com",
"http://yandex.ru",
"http://yiiframework.com",
"http://youtube.com",
]
class Bot:
async def fetch(self, client, url):
async with client.get(url) as r:
return await r.text()
async def worker(self, q, client):
loop = asyncio.get_running_loop()
while True:
url = await q.get()
try:
html = await self.fetch(client, url)
except Exception:
traceback.print_exc()
else:
await loop.run_in_executor(None, self.save_to_disk, url, html)
finally:
q.task_done()
def save_to_disk(self, url, html):
print(f"{url} ({len(html)})")
async def main():
q = asyncio.Queue(maxsize=100)
tasks = []
async with aiohttp.ClientSession() as client:
bot = Bot()
for _ in range(WORKERS):
tasks.append(asyncio.create_task(bot.worker(q, client)))
for url in URLS:
await q.put(url)
await q.join()
for task in tasks:
task.cancel()
await asyncio.gather(*tasks, return_exceptions=True)
if __name__ == "__main__":
main = asyncio.run(main())
In your example your tasksURL array will have a set of awaited coroutines after every chunk that you successfully process. You then append new coroutines to that list on subsequent iterations and when you go to gather you're trying to await complete coroutines as well as new, unawaited ones. Simply creating a new tasksURL list for each chunk will solve your problem:
for URLList in chunkedListOfURLs:
tasksURL = []
async with aiohttp.ClientSession() as session:
for url in URLList:
tasksURL.append(fetch(session, url))
allHTMLs = await asyncio.gather(*tasksURL)
Note that by default, aiohttp's client session allows 100 concurrent connections. See https://docs.aiohttp.org/en/stable/client_advanced.html#limiting-connection-pool-size for more details, so you get some concurrency limits out of the box without chunking. Semaphores and queues are also other options to limit concurrency depending on your requirements as mentioned in other answers.
I am executing the below code on a windows pc. I read that, by default, Windows can use only 64 sockets in asyncio loop. I don't know if this is the reason for the error.
import aiohttp
import asyncio
import time
async def download_file(url):
print(f'started downloading{url}')
connector = aiohttp.TCPConnector(limit=60)
async with aiohttp.clientSession(connector) as session:
async with session.get(url) as resp:
content = await resp.read()
print (f'Finished download{url}')
return content
async def write_file(n, content):
filename = f'async_{n}.html'
with open(filename,'wb') as f:
print(f'started writing{filename}')
f.write(content)
print(f'Finished writing{filename}')
async def scrape_task(n,url):
content = await download_file(url)
await write_file(n,content)
async def main():
tasks = []
for n,url in enumerate(open('urls.txt').readlines()):
tasks.append((scrape_task(n, url)))
await asyncio.wait(tasks)
if __name__ == '__main__':
t=time.perf_counter()
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
t2 = time.perf_counter() - t
print(f'Total time taken: {t2:0.2f} seconds')
I made the below changes to limit the connections to 60
connector = aiohttp.TCPConnector(limit=60)
async with aiohttp.clientSession(connector) as session:
I can't figure out where I am going wrong.
I am scraping blog urls from main page, and later I iterate over all urls to retrive text on it.
Will generator be faster if I move loop to blogscraper and make yield some_text ? I guess app will still be one threaded and It wont request next pages while computing text from html.
Should I use asyncio? or there are some better modules to make it parrel? Create generator that yields coroutine results as the coroutines finish
I also want to make later small rest app for displaying results
def readmainpage(self):
blogurls = []
while(nextPage):
r = requests.get(url)
...
blogurls += [new_url]
return blogurls
def blogscraper(self, url):
r = request.get(url)
...
return sometext
def run(self):
blog_list = self.readmainpage()
for blog in blog_list:
data = self.blogscraper(blog['url'])
Using threading package, you can run your top function (object initialitization) asynchronously. It will create sub parallel sub-process for your requests. For example, single page fetching is 2 mins and you have 10 pages. In threading, all will take 2 mins. Threading in Python 3.x
With asyncio you can try to use aiohttp module:
pip install aiohttp
As example code it's can looks something like this, also can be done some improvements but it depends on your code...
import sys
import aiohttp
import asyncio
import socket
from urllib.parse import urlparse
class YourClass:
def __init__(self):
self.url = "..."
url_parsed = urlparse( self.url )
self.session = aiohttp.ClientSession(
headers = { "Referer": f"{ url_parsed.scheme }://{ url_parsed.netloc }" },
auto_decompress = True,
connector = aiohttp.TCPConnector(family=socket.AF_INET, verify_ssl=False) )
async def featch(self, url):
async with self.session.get( url ) as resp:
assert resp.status == 200
return await resp.text()
async def readmainpage(self):
blogurls = []
while nextPage:
r = await self.featch(self.url)
# ...
blogurls += [new_url]
return blogurls
async def blogscraper(self, url):
r = await self.featch(url)
return r
# ...
return sometext
async def __call__(self):
url_parsed = urlparse( self.url )
blog_list = await self.readmainpage()
coros = [ asyncio.Task( self.blogscraper( blog['url']) ) for blog in blog_list ]
for data in await asyncio.gather( *coros ):
print(data)
# do not forget to close session if not using with statement
await self.session.close()
def main():
featcher = YourClass()
loop = asyncio.get_event_loop()
loop.run_until_complete( featcher() )
sys.exit(0)
if __name__ == "__main__":
main()