I am executing the code below on a Windows PC. I read that, by default, Windows can only use 64 sockets in an asyncio event loop. I don't know if this is the reason for the error.
import aiohttp
import asyncio
import time

async def download_file(url):
    print(f'started downloading{url}')
    connector = aiohttp.TCPConnector(limit=60)
    async with aiohttp.clientSession(connector) as session:
        async with session.get(url) as resp:
            content = await resp.read()
            print(f'Finished download{url}')
            return content

async def write_file(n, content):
    filename = f'async_{n}.html'
    with open(filename, 'wb') as f:
        print(f'started writing{filename}')
        f.write(content)
        print(f'Finished writing{filename}')

async def scrape_task(n, url):
    content = await download_file(url)
    await write_file(n, content)

async def main():
    tasks = []
    for n, url in enumerate(open('urls.txt').readlines()):
        tasks.append(scrape_task(n, url))
    await asyncio.wait(tasks)

if __name__ == '__main__':
    t = time.perf_counter()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    t2 = time.perf_counter() - t
    print(f'Total time taken: {t2:0.2f} seconds')
I made the changes below to limit the connections to 60:
connector = aiohttp.TCPConnector(limit=60)
async with aiohttp.clientSession(connector) as session:
I can't figure out where I am going wrong.
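For comparison, here is a minimal sketch of the structure most aiohttp examples use: one shared session created once in main, with the class spelled aiohttp.ClientSession (capital C) and the connector passed as the connector keyword argument. It is only an illustration of that shape, not a confirmed fix for the error above.

import asyncio
import aiohttp

async def download_file(session, url):
    async with session.get(url) as resp:
        return await resp.read()

async def scrape_task(session, n, url):
    content = await download_file(session, url)
    with open(f'async_{n}.html', 'wb') as f:
        f.write(content)

async def main():
    # one connector and one session shared by every request
    connector = aiohttp.TCPConnector(limit=60)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [scrape_task(session, n, url.strip())
                 for n, url in enumerate(open('urls.txt'))]
        await asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
loop.run_until_complete(main())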
I'm trying to check a set of URLs for their status code and return all that have a 4xx or 5xx status code.
In total I need to check about 12,500 URLs, and my script works fine for up to about 7,000 URLs. Above that, the script crashes with a ResourceWarning: unclosed transport error.
I'm using Python 3.6 and aiohttp 3.5.4.
Any idea what's causing this?
import asyncio
import warnings
from collections import defaultdict
from aiohttp import ClientSession

async def fetch(url, session):
    async with session.get(url) as response:
        data = response.status
        return url, data

async def bound_fetch(sem, url, session):
    async with sem:
        return await fetch(url, session)

async def check_urls(url_list):
    ''' get status code for all urls and write into dictionary '''
    base_url = <base_url>
    tasks = []
    sem = asyncio.Semaphore(10)
    async with ClientSession() as session:
        for url in url_list:
            full_url = base_url + url
            task = asyncio.ensure_future(bound_fetch(sem, full_url.format(), session))
            tasks.append(task)
        results = await asyncio.gather(*tasks)
    results_dict = defaultdict(list)
    for res in results:
        if res[1] != 200 and res[1] != 301 and res[1] != 302:
            print(f'ERROR {str(res[1])} {res[0]}')
            results_dict[res[1]].append(res[0])
    print(f'URLs checked, found {str(len(results_dict))} errors')

''' main function'''
loop = asyncio.get_event_loop()
loop.set_debug(True)
warnings.simplefilter('always', ResourceWarning)
future = asyncio.ensure_future(check_urls(list_of_urls))
loop.run_until_complete(future)
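One way to keep a single failing connection from affecting the rest of a large batch is to catch per-request errors inside fetch and record them as ordinary results, and to cap the connector as well as the semaphore. The sketch below is only an illustration of that idea; the error handling and the connector limit are additions, not part of the original script, and they are not a confirmed explanation of the crash.

import asyncio
import aiohttp

async def fetch(url, session):
    try:
        async with session.get(url) as response:
            return url, response.status
    except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
        # a failed request becomes a normal result instead of an unhandled error
        return url, repr(exc)

async def check_urls(url_list):
    sem = asyncio.Semaphore(10)

    async def bound_fetch(url, session):
        async with sem:
            return await fetch(url, session)

    # cap the number of simultaneous connections held by the session as well
    connector = aiohttp.TCPConnector(limit=10)
    async with aiohttp.ClientSession(connector=connector) as session:
        return await asyncio.gather(*(bound_fetch(u, session) for u in url_list))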
I'm using aiohttp with the limited_as_completed method to speed up scraping (around 100 million static website pages). However, the code stops after several minutes and raises a TimeoutError. I have tried several things, but still could not prevent asyncio.TimeoutError from being raised. How can I ignore the error and continue?
The code I'm running is:
N = 123
import html
from lxml import etree
import requests
import asyncio
import aiohttp
from aiohttp import ClientSession, TCPConnector
import pandas as pd
import re
import csv
import time
from itertools import islice
import sys
from contextlib import suppress

start = time.time()
data = {}
data['name'] = []
filename = "C:\\Users\\xxxx" + str(N) + ".csv"

def limited_as_completed(coros, limit):
    futures = [
        asyncio.ensure_future(c)
        for c in islice(coros, 0, limit)
    ]

    async def first_to_finish():
        while True:
            await asyncio.sleep(0)
            for f in futures:
                if f.done():
                    futures.remove(f)
                    try:
                        newf = next(coros)
                        futures.append(
                            asyncio.ensure_future(newf))
                    except StopIteration as e:
                        pass
                    return f.result()

    while len(futures) > 0:
        yield first_to_finish()

async def get_info_byid(i, url, session):
    async with session.get(url, timeout=20) as resp:
        print(url)
        with suppress(asyncio.TimeoutError):
            r = await resp.text()
            name = etree.HTML(r).xpath('//h2[starts-with(text(),"Customer Name")]/text()')
            data['name'].append(name)
            dataframe = pd.DataFrame(data)
            dataframe.to_csv(filename, index=False, sep='|')

limit = 1000

async def print_when_done(tasks):
    for res in limited_as_completed(tasks, limit):
        await res

url = "http://xxx.{}.html"
loop = asyncio.get_event_loop()

async def main():
    connector = TCPConnector(limit=10)
    async with ClientSession(connector=connector, headers=headers, raise_for_status=False) as session:
        coros = (get_info_byid(i, url.format(i), session) for i in range(N, N + 1000000))
        await print_when_done(coros)

loop.run_until_complete(main())
loop.close()
print("took", time.time() - start, "seconds.")
The error log is:
Traceback (most recent call last):
File "C:\Users\xxx.py", line 111, in <module>
loop.run_until_complete(main())
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\asyncio\base_events.py", line 573, in run_until_complete
return future.result()
File "C:\Users\xxx.py", line 109, in main
await print_when_done(coros)
File "C:\Users\xxx.py", line 98, in print_when_done
await res
File "C:\Users\xxx.py", line 60, in first_to_finish
return f.result()
File "C:\Users\xxx.py", line 65, in get_info_byid
async with session.get(url,timeout=20) as resp:
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\client.py", line 855, in __aenter__
self._resp = await self._coro
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\client.py", line 391, in _request
await resp.start(conn)
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\client_reqrep.py", line 770, in start
self._continue = None
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\helpers.py", line 673, in __exit__
raise asyncio.TimeoutError from None
concurrent.futures._base.TimeoutError
I have tried:
1) adding except asyncio.TimeoutError: pass. Not working.
async def get_info_byid(i, url, session):
    async with session.get(url, timeout=20) as resp:
        print(url)
        try:
            r = await resp.text()
            name = etree.HTML(r).xpath('//h2[starts-with(text(),"Customer Name")]/text()')
            data['name'].append(name)
            dataframe = pd.DataFrame(data)
            dataframe.to_csv(filename, index=False, sep='|')
        except asyncio.TimeoutError:
            pass
2) suppress(asyncio.TimeoutError), as shown above. Not working.
I just learned aiohttp yesterday, so maybe there is something else wrong in my code that causes the timeout error only after a few minutes of running? Thank you very much if anyone knows how to deal with it!
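For what it's worth, the traceback shows the TimeoutError being raised by session.get(...) itself, while the response is still being set up, so a try/except placed inside the body of the async with can never catch it. Below is a minimal sketch of wrapping the whole request instead; only the placement of the try block is the point here, the rest of the function follows the original.

async def get_info_byid(i, url, session):
    try:
        # the timeout can fire while the connection/response is still being
        # established, so the whole async with must sit inside the try block
        async with session.get(url, timeout=20) as resp:
            r = await resp.text()
    except asyncio.TimeoutError:
        print(f'timed out: {url}')
        return  # skip this URL and let the caller move on to the next one
    name = etree.HTML(r).xpath('//h2[starts-with(text(),"Customer Name")]/text()')
    data['name'].append(name)
    dataframe = pd.DataFrame(data)
    dataframe.to_csv(filename, index=False, sep='|')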
What @Yurii Kramarenko has done will raise an Unclosed client session exception for sure, since the session has never been properly closed. What I recommend is something like this:
import asyncio
import aiohttp

async def main(urls):
    async with aiohttp.ClientSession(timeout=self.timeout) as session:
        tasks = [self.do_something(session, url) for url in urls]
        await asyncio.gather(*tasks)
I like @jbxiaoyu's answer, but the timeout kwarg seems to take a special object, so I thought I'd add that you need to create a ClientTimeout object and then pass it to the session, like this:
from aiohttp import ClientSession, ClientTimeout

timeout = ClientTimeout(total=600)
async with ClientSession(timeout=timeout) as session:
    tasks = [self.do_something(session, url) for url in urls]
    await asyncio.gather(*tasks)
Simple example (not very good, but works fine):
import asyncio
from aiohttp.client import ClientSession

class Wrapper:
    def __init__(self, session):
        self._session = session

    async def get(self, url):
        try:
            async with self._session.get(url, timeout=20) as resp:
                return await resp.text()
        except Exception as e:
            print(e)

loop = asyncio.get_event_loop()
wrapper = Wrapper(ClientSession())
responses = loop.run_until_complete(
    asyncio.gather(
        wrapper.get('http://google.com'),
        wrapper.get('http://google.com'),
        wrapper.get('http://google.com'),
        wrapper.get('http://google.com'),
        wrapper.get('http://google.com')
    )
)
print(responses)
I am writing a simple producer/consumer app to call multiple URLs asynchronously.
In the following code, if I set conn_count=1 and add 2 items to the queue, it works fine since only one consumer is created. But if I set conn_count=2 and add 4 items to the queue, only 3 requests are made. The other request fails with ClientConnectorError.
Can you please help me debug the reason for the failure with multiple consumers? Thank you.
I am using an echo server I created.
Server:
import os
import logging.config
import yaml
from aiohttp import web
import json

def start():
    setup_logging()
    app = web.Application()
    app.router.add_get('/', do_get)
    app.router.add_post('/', do_post)
    web.run_app(app)

async def do_get(request):
    return web.Response(text='hello')

async def do_post(request):
    data = await request.json()
    return web.Response(text=json.dumps(data))

def setup_logging(
    default_path='logging.yaml',
    default_level=logging.INFO,
    env_key='LOG_CFG'
):
    path = default_path
    value = os.getenv(env_key, None)
    if value:
        path = value
    if os.path.exists(path):
        with open(path, 'rt') as f:
            config = yaml.safe_load(f.read())
        logging.config.dictConfig(config)
    else:
        logging.basicConfig(level=default_level)

if __name__ == '__main__':
    start()
Client:
import asyncio
import collections
import json
import sys

import async_timeout
from aiohttp import ClientSession, TCPConnector

MAX_CONNECTIONS = 100
URL = 'http://localhost:8080'

InventoryAccount = collections.namedtuple("InventoryAccount", "op_co customer_id")

async def produce(queue, num_consumers):
    for i in range(num_consumers * 2):
        await queue.put(InventoryAccount(op_co=i, customer_id=i * 100))
    for j in range(num_consumers):
        await queue.put(None)

async def consumer(n, queue, session, responses):
    print('consumer {}: starting'.format(n))
    while True:
        try:
            account = await queue.get()
            if account is None:
                queue.task_done()
                break
            else:
                print(f"Consumer {n}, Updating cloud prices for account: opCo = {account.op_co!s}, customerId = {account.customer_id!s}")
                params = {'opCo': account.op_co, 'customerId': account.customer_id}
                headers = {'content-type': 'application/json'}
                with async_timeout.timeout(10):
                    print(f"Consumer {n}, session state " + str(session.closed))
                    async with session.post(URL,
                                            headers=headers,
                                            data=json.dumps(params)) as response:
                        assert response.status == 200
                        responses.append(await response.text())
                        queue.task_done()
        except:
            e = sys.exc_info()[0]
            print(f"Consumer {n}, Error updating cloud prices for account: opCo = {account.op_co!s}, customerId = {account.customer_id!s}. {e}")
            queue.task_done()
    print('consumer {}: ending'.format(n))

async def start(loop, session, num_consumers):
    queue = asyncio.Queue(maxsize=num_consumers)
    responses = []
    consumers = [asyncio.ensure_future(loop=loop, coro_or_future=consumer(i, queue, session, responses)) for i in range(num_consumers)]
    await produce(queue, num_consumers)
    await queue.join()
    for consumer_future in consumers:
        consumer_future.cancel()
    return responses

async def run(loop, conn_count):
    async with ClientSession(loop=loop, connector=TCPConnector(verify_ssl=False, limit=conn_count)) as session:
        result = await start(loop, session, conn_count)
        print("Result: " + str(result))

if __name__ == '__main__':
    conn_count = 2
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(run(loop, conn_count))
    finally:
        loop.close()
Reference:
https://pymotw.com/3/asyncio/synchronization.html
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
https://hackernoon.com/asyncio-for-the-working-python-developer-5c468e6e2e8e
First of all, here's the code:
import random
import asyncio
from aiohttp import ClientSession
import csv

headers = []

def extractsites(file):
    sites = []
    readfile = open(file, "r")
    reader = csv.reader(readfile, delimiter=",")
    raw = list(reader)
    for a in raw:
        sites.append((a[1]))
    return sites

async def bound_fetch(sem, url):
    async with sem:
        print("doing request for " + url)
        async with ClientSession() as session:
            async with session.get(url) as response:
                responseheader = await response.headers
                print(headers)

async def run():
    urls = extractsites("cisco-umbrella.csv")
    tasks = []
    sem = asyncio.Semaphore(100)
    for i in urls:
        task = asyncio.ensure_future(bound_fetch(sem, "http://" + i))
        tasks.append(task)
    headers = await asyncio.wait(*tasks)
    print(headers)

def main():
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(run())
    loop.run_until_complete(future)

if __name__ == '__main__':
    main()
As per my last question I'm following this blog post:
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
I tried to adapt my code as closely as possible to the example implementation, but this code is still not making any requests or printing the headers in bound_fetch as I want.
Can somebody spot what's wrong with this code?
response.headers is a regular property; there is no need to put await before the call.
asyncio.wait, on the other hand, accepts a list of futures and returns a (done, pending) pair.
It looks like you should replace the await asyncio.wait(*tasks) call with await asyncio.gather(*tasks) (see the gather docs).
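Putting those two changes together, the fetch and run functions could look roughly like this (a sketch only; the rest of the original script is assumed unchanged):

async def bound_fetch(sem, url):
    async with sem:
        print("doing request for " + url)
        async with ClientSession() as session:
            async with session.get(url) as response:
                # headers is a plain property, so no await here
                return response.headers

async def run():
    urls = extractsites("cisco-umbrella.csv")
    sem = asyncio.Semaphore(100)
    tasks = [asyncio.ensure_future(bound_fetch(sem, "http://" + u)) for u in urls]
    # gather returns the results in task order, unlike wait's (done, pending) pair
    headers = await asyncio.gather(*tasks)
    print(headers)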