Asynchronous requests script crashes with too many URLs - python

I'm trying to check a set of URLs for their status codes and return all that come back with a 4xx or 5xx code.
In total I need to check about 12500 URLs, and my script works fine for up to about 7000 URLs. Above that, the script crashes with a "ResourceWarning: unclosed transport" error.
I'm using Python 3.6 and aiohttp 3.5.4.
Any idea what's causing this?
import asyncio
import warnings
from collections import defaultdict

from aiohttp import ClientSession

async def fetch(url, session):
    async with session.get(url) as response:
        data = response.status
        return url, data

async def bound_fetch(sem, url, session):
    async with sem:
        return await fetch(url, session)

async def check_urls(url_list):
    ''' get status code for all urls and write into dictionary '''
    base_url = <base_url>
    tasks = []
    sem = asyncio.Semaphore(10)
    async with ClientSession() as session:
        for url in url_list:
            full_url = base_url + url
            task = asyncio.ensure_future(bound_fetch(sem, full_url.format(), session))
            tasks.append(task)
        results = await asyncio.gather(*tasks)
    results_dict = defaultdict(list)
    for res in results:
        if res[1] != 200 and res[1] != 301 and res[1] != 302:
            print(f'ERROR {str(res[1])} {res[0]}')
            results_dict[res[1]].append(res[0])
    print(f'URLs checked, found {str(len(results_dict))} errors')

''' main function'''
loop = asyncio.get_event_loop()
loop.set_debug(True)
warnings.simplefilter('always', ResourceWarning)
future = asyncio.ensure_future(check_urls(list_of_urls))
loop.run_until_complete(future)
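
One commonly suggested mitigation, offered here only as a sketch rather than a confirmed fix for this exact warning: cap the number of open connections at the connector level and create the tasks in chunks instead of all ~12500 at once. The chunk_size below is an arbitrary illustrative value, and fetch is the function from the snippet above.

import asyncio
from aiohttp import ClientSession, TCPConnector

async def check_urls_chunked(url_list, chunk_size=1000):
    # Bound open connections at the connector level instead of (or in
    # addition to) a semaphore.
    connector = TCPConnector(limit=10)
    results = []
    async with ClientSession(connector=connector) as session:
        for i in range(0, len(url_list), chunk_size):
            chunk = url_list[i:i + chunk_size]
            # Only chunk_size requests are pending at any one time, so
            # transports are released before the next chunk starts.
            results.extend(await asyncio.gather(*(fetch(url, session) for url in chunk)))
    return results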

Related

Why doesn't this python aiohttp requests code run asynchronously?

I'm trying to access an API with aiohttp but something is causing this code to block each iteration.
import asyncio
import aiohttp

async def main():
    async with aiohttp.ClientSession() as session:
        for i, (image, target) in enumerate(dataset_val):
            image_bytes = pil_to_bytes(image)
            async with session.post('http://localhost:8080/predictions/resnet50', data=image_bytes) as resp:
                print(await resp.text())
            print(i, flush=True, end='\r')

asyncio.run(main())
As explained by @deceze, await waits for the result inside your loop. If you want to issue all of the requests at the same time, you need to create them up front and gather the results.
Here's a way of doing it:
import asyncio
import aiohttp

async def call(session: aiohttp.ClientSession, url: str, image):
    image_bytes = pil_to_bytes(image)
    async with session.post(url, data=image_bytes) as response:
        return await response.text()

async def call_all(url: str, tasks: list):
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(
            *[call(session, url, img) for img, target in tasks],
            return_exceptions=True
        )
        return results

loop = asyncio.get_event_loop()
res = loop.run_until_complete(
    call_all('http://localhost:8080/predictions/resnet50', dataset_val)
)
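
If dataset_val is large, firing every request at once may overwhelm the local model server. One way to bound concurrency is to wrap each call in an asyncio.Semaphore; this is only a sketch, the limit of 10 is an arbitrary illustrative value, and pil_to_bytes/dataset_val come from the question.

import asyncio
import aiohttp

async def call_limited(sem, session, url, image):
    # The semaphore keeps at most `limit` requests in flight at a time.
    async with sem:
        image_bytes = pil_to_bytes(image)
        async with session.post(url, data=image_bytes) as response:
            return await response.text()

async def call_all_limited(url, items, limit=10):
    sem = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *[call_limited(sem, session, url, img) for img, target in items],
            return_exceptions=True,
        )

# Usage, mirroring the snippet above:
# res = asyncio.run(call_all_limited('http://localhost:8080/predictions/resnet50', dataset_val))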

aiohttp check request headers after receiving response

Here's the problem:
I have a list of authorization headers to check.
For that, I use aiohttp:
def make_tasks(session, proxies, unchecked_headers):
    tasks = list()
    for unchecked_header in unchecked_headers:
        current_proxy = proxies.pop()
        headers['authorization'] = unchecked_header
        t = session.get(url, proxy=current_proxy, headers=headers)
        tasks.append(t)
    return tasks

async def check_headers(proxies, unchecked_headers):
    async with aiohttp.ClientSession() as s:
        tasks = make_tasks(s, proxies, unchecked_headers)
        results = await asyncio.gather(*tasks)
        for result in results:
            ...
Now, depending on the response code, I need to log some information about that authorization header. The issue is that nowhere in the response is the actual header mentioned. So I get all of the responses back, yet I don't know which response corresponds to which header, since they complete asynchronously.
I looked around and didn't find a way to check which headers were sent initially on the ClientResponse object. What can I do here?
You can use asyncio.as_completed plus a wrapper around session.get that returns the response along with any additional data you want. For example:
import aiohttp
import asyncio

url = "https://httpbin.org/get"
unchecked_headers = [
    "A",
    "B",
    "C",
]
proxies = ["Proxy1", "Proxy2", "Proxy3"]
headers = {}

def make_tasks(session, proxies, unchecked_headers):
    async def _wrapper(t, *args):
        response = await t
        return response, *args

    tasks = list()
    for unchecked_header in unchecked_headers:
        current_proxy = proxies.pop()
        headers["authorization"] = unchecked_header
        # I commented this out because I don't have access to a proxy:
        # t = session.get(url, proxy=current_proxy, headers=headers)
        t = session.get(url, headers=headers)
        tasks.append(_wrapper(t, current_proxy, unchecked_header))
    return tasks

async def check_headers(proxies, unchecked_headers):
    async with aiohttp.ClientSession() as s:
        for task in asyncio.as_completed(
            make_tasks(s, proxies, unchecked_headers)
        ):
            response, proxy, header = await task
            print(response.url, proxy, header)

async def main():
    await check_headers(proxies, unchecked_headers)

if __name__ == "__main__":
    asyncio.run(main())
Prints:
https://httpbin.org/get Proxy3 A
https://httpbin.org/get Proxy1 C
https://httpbin.org/get Proxy2 B
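
An alternative, if you would rather keep asyncio.gather: it returns results in the same order as the awaitables you pass in, so you can pair each response with the header and proxy you used by position. A minimal sketch reusing the module-level url, proxies, and unchecked_headers from above:

async def check_headers_ordered(proxies, unchecked_headers):
    # Pair proxies and headers by position; gather() preserves input order,
    # so responses[i] corresponds to sent[i].
    sent = list(zip(proxies, unchecked_headers))
    async with aiohttp.ClientSession() as s:
        tasks = [
            # proxy=... omitted here for the same reason as above
            s.get(url, headers={"authorization": header})
            for proxy, header in sent
        ]
        responses = await asyncio.gather(*tasks)
        for (proxy, header), response in zip(sent, responses):
            print(response.status, proxy, header)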

Python, Concurrency and asyncio: Problem adding a rotating proxy

I'm creating an optimized multi-threading app using asyncio and want to add a rotating proxy into the mix.
Starting with a sample taken from this outstanding article:
Speed Up Your Python Program With Concurrency
I added a rotating proxy and it stopped working. The code simply exits the function after touching the line for the proxy.
This little snippet of code works on its own, but not when added to the main script, shown in full further below.
import asyncio
import random as rnd

async def download_site():
    proxy_list = [
        ('38.39.205.220:80'),
        ('38.39.204.100:80'),
        ('38.39.204.101:80'),
        ('38.39.204.94:80')
    ]
    await asyncio.sleep(1)
    proxy = rnd.choice(proxy_list)
    print(proxy)

asyncio.run(download_site())
And here's the full sample:
import asyncio
import time
import aiohttp

# Sample code taken from here:
# https://realpython.com/python-concurrency/#asyncio-version
# Info for adding headers for the proxy (scroll toward the bottom)
# https://docs.aiohttp.org/en/stable/client_advanced.html
# Good read to possibly improve performance on large lists of URLs
# https://asyncio.readthedocs.io/en/latest/webscraper.html
# RUN THIS METHOD TO SEE HOW IT WORKS.

# # Original Code (working...)
# async def download_site(session, url):
#     async with session.get(url, proxy="http://proxy.com") as response:
#         print("Read {0} from {1}".format(response.content_length, url))

def get_proxy(self):
    proxy_list = [
        (754, '38.39.205.220:80'),
        (681, '38.39.204.100:80'),
        (682, '38.39.204.101:80'),
        (678, '38.39.204.94:80')
    ]
    proxy = random.choice(proxy_list)
    print(proxy[1])
    return proxy

async def download_site(session, url):
    proxy_list = [
        ('38.39.205.220:80'),
        ('38.39.204.100:80'),
        ('38.39.204.101:80'),
        ('38.39.204.94:80')
    ]
    await asyncio.sleep(1)
    proxy = rnd.choice(proxy_list)
    print(proxy)
    async with session.get(url, proxy="http://" + proxy) as response:
        print("Read {0} from {1}".format(response.content_length, url))

async def download_all_sites(sites):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url in sites:
            task = asyncio.ensure_future(download_site(session, url))
            tasks.append(task)
        await asyncio.gather(*tasks, return_exceptions=True)

# Modified to loop thru only 1 URL to make debugging simple
if __name__ == "__main__":
    sites = [
        "https://www.jython.org",
        # "http://olympus.realpython.org/dice",
    ]  # * 80
    start_time = time.time()
    asyncio.get_event_loop().run_until_complete(download_all_sites(sites))
    duration = time.time() - start_time
    print(f"Downloaded {len(sites)} sites in {duration} seconds")
Thank you for any help you can offer.
You use return_exceptions=True but you don't actually check the returned results for errors. You can use asyncio.as_completed to handle exceptions and get the earliest next result:
import asyncio
import random
import traceback

import aiohttp

URLS = ("https://stackoverflow.com",)
TIMEOUT = 5
PROXIES = (
    "http://38.39.205.220:80",
    "http://38.39.204.100:80",
    "http://38.39.204.101:80",
    "http://38.39.204.94:80",
)

def get_proxy():
    return random.choice(PROXIES)

async def download_site(session, url):
    proxy = get_proxy()
    print(f"Got proxy: {proxy}")
    async with session.get(url, proxy=f"{proxy}", timeout=TIMEOUT) as resp:
        print(f"{url}: {resp.status}")
        return await resp.text()

async def main():
    tasks = []
    async with aiohttp.ClientSession() as session:
        for url in URLS:
            tasks.append(asyncio.create_task(download_site(session, url)))
        for coro in asyncio.as_completed(tasks):
            try:
                html = await coro
            except Exception:
                traceback.print_exc()
            else:
                print(len(html))

if __name__ == "__main__":
    asyncio.run(main())
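
For completeness, if you prefer to keep gather(..., return_exceptions=True) rather than switching to as_completed, the returned list then contains exception objects in place of results and you have to inspect it yourself. A minimal sketch of that check, reusing download_site and the imports from the snippet above:

async def download_all_sites(sites):
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(download_site(session, url)) for url in sites]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        # With return_exceptions=True, failures are returned instead of raised,
        # so each entry has to be checked explicitly.
        for url, result in zip(sites, results):
            if isinstance(result, Exception):
                print(f"{url} failed: {result!r}")
            else:
                print(f"{url} returned {len(result)} characters of HTML")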

How can I asynchronously request URLs in a growing queue with asyncio?

I have X initial URLs that are paginated; in order to get the next set of data, I have to grab the next URL from the response headers until there is no next URL. I am having trouble getting this working. I'm trying a queue approach that I found here.
import asyncio
from aiohttp import ClientSession, TCPConnector

async def get(session, url):
    headers = {
        'Authorization': 'Bearer KEY',
    }
    async with session.get(url, headers=headers) as response:
        json = await response.json()
        return json, response

async def process(session, url, q):
    try:
        try:
            views, response = await get(session, url)
            scode = response.status
            if scode == 404:
                return
        except Exception as e:
            print(e)
            return
        try:
            await q.put(str(response.links["next"]["url"]))
        except:
            pass
        <do something with views>
    except Exception as e:
        print(e)

async def fetch_worker(session, q):
    while True:
        url = await q.get()
        try:
            await process(session, url, q)
        except Exception as e:
            print(e)
        finally:
            q.task_done()

async def d():
    <code to query and put data into stdrows>
    tasks = []                   # assumed: referenced below but not defined in the excerpt
    url_queue = asyncio.Queue()  # assumed: referenced below but not defined in the excerpt
    connector = TCPConnector(limit=500)
    async with ClientSession(connector=connector) as session:
        url = '<some base url>'
        for i in range(500):
            tasks.append(asyncio.create_task(fetch_worker(session, url_queue)))
        for row in stdrows:
            await url_queue.put(url.format(row[1]))
        await asyncio.gather(*tasks)
        await url_queue.join()

asyncio.run(d())
This appears not to be going at 500 tasks/sec. Is it even possible to get to this rate without knowing all the URLs ahead of time? I am hoping to fetch the next URL from whatever initial URL (or from its paginated URL) while I work with views.
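
For what it's worth, the usual shape of a growing-queue worker pool is to start the workers, seed the queue, wait on queue.join(), and only then cancel the workers; as written above, await asyncio.gather(*tasks) never returns, because the workers loop forever, so the join() after it is never reached. The following is only a sketch built around the question's fetch_worker, with names and the worker count chosen for illustration:

import asyncio
from aiohttp import ClientSession, TCPConnector

async def run(seed_urls, num_workers=500):
    url_queue = asyncio.Queue()
    connector = TCPConnector(limit=500)
    async with ClientSession(connector=connector) as session:
        # Fixed pool of workers that keep pulling URLs, including the
        # pagination URLs that process() pushes back onto the queue.
        workers = [
            asyncio.create_task(fetch_worker(session, url_queue))
            for _ in range(num_workers)
        ]
        for url in seed_urls:
            await url_queue.put(url)
        # join() returns once task_done() has been called for every queued URL.
        await url_queue.join()
        # The workers loop forever, so cancel them instead of gathering them.
        for w in workers:
            w.cancel()
        await asyncio.gather(*workers, return_exceptions=True)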

Receiving "RuntimeError: Session is closed" in aiohttp.ClientSession.get(), even after creating new context manager

I am writing a web crawler using aiohttp, and my program is crashing with "RuntimeError: Session is closed" errors.
The main loop makes it through the first iteration, fetching and processing all pages in the URL queue without any issue. But then, as it enters fetch_pages() in the second iteration of the main loop and makes its first call to aiohttp.ClientSession.get(), it throws "RuntimeError: Session is closed".
I don't understand why I would be getting this error, because it appears to me that the code below should be creating a new aiohttp.ClientSession() context manager each time the get_batch() function below is called, and closing the session at the end of the function call. But this is not happening. Can someone explain to me why I am getting this error?
I have posted the relevant portions of my code below (I tried to trim as much as possible, but have included links to full source below).
Here is the main loop:
class Crawler():
    ((...))

    def __init__(self):
        self.loop = asyncio.get_event_loop()
        self.url_queue = URLQueue(maxsize=10000)   # urls are popped from URL queue
        self.page_queue = asyncio.PriorityQueue()  # when fetched, they are placed on page queue for html processing
        ((...))

    async def fetch_pages(self):
        print("Entering fetch_page()")
        pages, errors = [], []
        if self.url_queue.empty():
            await asyncio.sleep(1)
        else:
            await self.fetcher.get_batch(self.BATCH_SIZE, self.url_queue, self.page_queue, self.error_queue)
        ((...))

    async def process_html(self): ...
    async def analyze_content(self): ...
    async def extract_links(self): ...
    async def index_content(self): ...
    async def handle_errors(self): ...
    ((...))

    async def main(self):
        try:
            while True:
                tasks = [t.loop.create_task(t.fetch_pages()),
                         t.loop.create_task(t.process_html()),
                         t.loop.create_task(t.analyze_content()),
                         t.loop.create_task(t.index_content()),
                         t.loop.create_task(t.handle_errors())]
                await asyncio.gather(*tasks)
        except KeyboardInterrupt:
            print("shutting down")
        finally:
            print("Pretending to save the URL queue, etc ... ")

t = Crawler()

if __name__ == "__main__":
    #asyncio.run(crawler.crawl(index), debug=True)
    t.loop.run_until_complete(t.main())
(full code here) ...
and here is the code for the fetch loop:
class Fetcher():
    ((...))

    def __init__(self, domain_manager=None, http_headers=None, dns_cache_lifetime=300, request_timeout=30,
                 connection_timeout=5, max_connections=20, max_connections_per_host=5, obey_robots=False,
                 verify_ssl_certs=False):
        self.loop = asyncio.get_event_loop()
        self.domain_manager = domain_manager  # rate limit requests / robots.txt on per-domain basis
        self.timeout = aiohttp.ClientTimeout(total=request_timeout,
                                             connect=connection_timeout)
        self.connector = aiohttp.TCPConnector(ttl_dns_cache=dns_cache_lifetime,
                                              limit=max_connections,
                                              limit_per_host=max_connections_per_host,
                                              ssl=verify_ssl_certs)

    async def fetch(self, url, session):
        try:
            async with session.get(url) as resp:
                status = int(resp.status)
                headers = dict(resp.headers)
                if self.check_response_headers(url, status, headers):
                    html = await resp.text()
                    return {'url': url,
                            'headers': headers,
                            'html': html,
                            'last_visit': datetime.now()}
                else:
                    raise FetchError(f"Fetch failed for url {url}: Header check failed (but why did we make it here?)",
                                     url=url, exception=e, fetch_stage="GET")
        except UnicodeDecodeError as e:
            ((...))

    def check_response_headers(self, url, status, headers):
        """Given a response from fetch(), return a (Page object, error object) pair"""
        ((...))

    async def fetch_with_dm(self, url, session, i):
        """fetches next url from queue until successfully fetches a page"""
        domain = self.domain_manager.domain_from_url(url)
        ((...))
        async with self.domain_manager.locks[domain]:
            ((...))
            fetch_result = await self.fetch(url, session)
        return fetch_result

    async def get_batch(self, batch_size, url_queue, page_queue, error_queue):
        start_time = datetime.now()
        async with aiohttp.ClientSession(timeout=self.timeout, connector=self.connector) as session:
            tasks = []
            for i in range(batch_size):
                url = None
                score = None
                if url_queue.empty():
                    break
                else:
                    score, url = url_queue.get_nowait()  # should we be blocking here / await / sleeping if no urls in queue?
                if url == None:
                    raise ValueError("Received empty URL")
                if score == None:
                    raise ValueError("Received empty URL score")
                tasks.append(self.loop.create_task(self.fetch_with_dm(url, session, i)))
            for p in asyncio.as_completed(tasks):
                try:
                    page = await p
                    page['url_score'] = score
                    await page_queue.put((score, id(page), page))
                except FetchError as fe:
                    await error_queue.put(fe)
(full code here)
... Again, the "Session is closed" error occurs when session.get(url) is called in fetch(), but only in the second iteration of the main loop ...
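
A likely culprit, hedged since the full source isn't reproduced here: aiohttp.ClientSession owns the connector passed to it by default and closes that connector when the session closes, so the single TCPConnector created in Fetcher.__init__ is already closed by the second call to get_batch(). Passing connector_owner=False (or building a fresh connector per batch) keeps the shared connector alive across sessions. A self-contained sketch of the reuse pattern:

import asyncio
import aiohttp

async def get_batch(connector, urls):
    # connector_owner=False: closing this session will not close the shared
    # connector, so a later batch can reuse it.
    async with aiohttp.ClientSession(connector=connector, connector_owner=False) as session:
        results = []
        for url in urls:
            async with session.get(url) as resp:
                results.append((url, resp.status))
        return results

async def main():
    connector = aiohttp.TCPConnector(limit=20)
    try:
        # Two successive "batches" reuse the same connector; without
        # connector_owner=False, the second call would fail because the
        # connector was closed along with the first session.
        print(await get_batch(connector, ["https://example.com"]))
        print(await get_batch(connector, ["https://example.com"]))
    finally:
        await connector.close()

asyncio.run(main())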
