I am executing the code below on a Windows PC. I read that, by default, Windows can only use 64 sockets in an asyncio event loop. I don't know if this is the reason for the error.
import aiohttp
import asyncio
import time

async def download_file(url):
    print(f'started downloading{url}')
    connector = aiohttp.TCPConnector(limit=60)
    async with aiohttp.clientSession(connector) as session:
        async with session.get(url) as resp:
            content = await resp.read()
            print(f'Finished download{url}')
            return content

async def write_file(n, content):
    filename = f'async_{n}.html'
    with open(filename, 'wb') as f:
        print(f'started writing{filename}')
        f.write(content)
        print(f'Finished writing{filename}')

async def scrape_task(n, url):
    content = await download_file(url)
    await write_file(n, content)

async def main():
    tasks = []
    for n, url in enumerate(open('urls.txt').readlines()):
        tasks.append(scrape_task(n, url))
    await asyncio.wait(tasks)

if __name__ == '__main__':
    t = time.perf_counter()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    t2 = time.perf_counter() - t
    print(f'Total time taken: {t2:0.2f} seconds')
I made the changes below to limit the connections to 60:
connector = aiohttp.TCPConnector(limit=60)
async with aiohttp.clientSession(connector) as session:
I can't figure out where I am going wrong.
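For comparison, here is a minimal sketch of the structure most aiohttp examples use: one shared session created once in main, with the class spelled aiohttp.ClientSession (capital C) and the connector passed as the connector keyword argument. It is only an illustration of that shape, not a confirmed fix for the error above.

import asyncio
import aiohttp

async def download_file(session, url):
    async with session.get(url) as resp:
        return await resp.read()

async def scrape_task(session, n, url):
    content = await download_file(session, url)
    with open(f'async_{n}.html', 'wb') as f:
        f.write(content)

async def main():
    # one connector and one session shared by every request
    connector = aiohttp.TCPConnector(limit=60)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [scrape_task(session, n, url.strip())
                 for n, url in enumerate(open('urls.txt'))]
        await asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
loop.run_until_complete(main())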
I'm trying to check a set of URLs for their status code and return all that have a 4xx or 5xx status code.
In total I need to check about 12,500 URLs, and my script works fine for up to about 7,000 URLs. Above that, the script crashes with a ResourceWarning: unclosed transport error.
I'm using Python 3.6 and aiohttp 3.5.4.
Any idea what's causing this?
import asyncio
import warnings
from collections import defaultdict
from aiohttp import ClientSession

async def fetch(url, session):
    async with session.get(url) as response:
        data = response.status
        return url, data

async def bound_fetch(sem, url, session):
    async with sem:
        return await fetch(url, session)

async def check_urls(url_list):
    ''' get status code for all urls and write into dictionary '''
    base_url = <base_url>
    tasks = []
    sem = asyncio.Semaphore(10)
    async with ClientSession() as session:
        for url in url_list:
            full_url = base_url + url
            task = asyncio.ensure_future(bound_fetch(sem, full_url.format(), session))
            tasks.append(task)
        results = await asyncio.gather(*tasks)
    results_dict = defaultdict(list)
    for res in results:
        if res[1] != 200 and res[1] != 301 and res[1] != 302:
            print(f'ERROR {str(res[1])} {res[0]}')
            results_dict[res[1]].append(res[0])
    print(f'URLs checked, found {str(len(results_dict))} errors')

''' main function'''
loop = asyncio.get_event_loop()
loop.set_debug(True)
warnings.simplefilter('always', ResourceWarning)
future = asyncio.ensure_future(check_urls(list_of_urls))
loop.run_until_complete(future)
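One way to keep a single failing connection from affecting the rest of a large batch is to catch per-request errors inside fetch and record them as ordinary results, and to cap the connector as well as the semaphore. The sketch below is only an illustration of that idea; the error handling and the connector limit are additions, not part of the original script, and they are not a confirmed explanation of the crash.

import asyncio
import aiohttp

async def fetch(url, session):
    try:
        async with session.get(url) as response:
            return url, response.status
    except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
        # a failed request becomes a normal result instead of an unhandled error
        return url, repr(exc)

async def check_urls(url_list):
    sem = asyncio.Semaphore(10)

    async def bound_fetch(url, session):
        async with sem:
            return await fetch(url, session)

    # cap the number of simultaneous connections held by the session as well
    connector = aiohttp.TCPConnector(limit=10)
    async with aiohttp.ClientSession(connector=connector) as session:
        return await asyncio.gather(*(bound_fetch(u, session) for u in url_list))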
I'm using aiohttp with the limited_as_completed method to speed up scraping (around 100 million static website pages). However, the code stops after several minutes and raises a TimeoutError. I have tried several things, but still could not prevent asyncio.TimeoutError from being raised. How can I ignore the error and continue?
The code I'm running is:
N = 123
import html
from lxml import etree
import requests
import asyncio
import aiohttp
from aiohttp import ClientSession, TCPConnector
import pandas as pd
import re
import csv
import time
from itertools import islice
import sys
from contextlib import suppress

start = time.time()
data = {}
data['name'] = []
filename = "C:\\Users\\xxxx" + str(N) + ".csv"

def limited_as_completed(coros, limit):
    futures = [
        asyncio.ensure_future(c)
        for c in islice(coros, 0, limit)
    ]

    async def first_to_finish():
        while True:
            await asyncio.sleep(0)
            for f in futures:
                if f.done():
                    futures.remove(f)
                    try:
                        newf = next(coros)
                        futures.append(
                            asyncio.ensure_future(newf))
                    except StopIteration as e:
                        pass
                    return f.result()

    while len(futures) > 0:
        yield first_to_finish()

async def get_info_byid(i, url, session):
    async with session.get(url, timeout=20) as resp:
        print(url)
        with suppress(asyncio.TimeoutError):
            r = await resp.text()
            name = etree.HTML(r).xpath('//h2[starts-with(text(),"Customer Name")]/text()')
            data['name'].append(name)
            dataframe = pd.DataFrame(data)
            dataframe.to_csv(filename, index=False, sep='|')

limit = 1000

async def print_when_done(tasks):
    for res in limited_as_completed(tasks, limit):
        await res

url = "http://xxx.{}.html"
loop = asyncio.get_event_loop()

async def main():
    connector = TCPConnector(limit=10)
    async with ClientSession(connector=connector, headers=headers, raise_for_status=False) as session:
        coros = (get_info_byid(i, url.format(i), session) for i in range(N, N + 1000000))
        await print_when_done(coros)

loop.run_until_complete(main())
loop.close()
print("took", time.time() - start, "seconds.")
The error log is:
Traceback (most recent call last):
File "C:\Users\xxx.py", line 111, in <module>
loop.run_until_complete(main())
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\asyncio\base_events.py", line 573, in run_until_complete
return future.result()
File "C:\Users\xxx.py", line 109, in main
await print_when_done(coros)
File "C:\Users\xxx.py", line 98, in print_when_done
await res
File "C:\Users\xxx.py", line 60, in first_to_finish
return f.result()
File "C:\Users\xxx.py", line 65, in get_info_byid
async with session.get(url,timeout=20) as resp:
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\client.py", line 855, in __aenter__
self._resp = await self._coro
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\client.py", line 391, in _request
await resp.start(conn)
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\client_reqrep.py", line 770, in start
self._continue = None
File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\helpers.py", line 673, in __exit__
raise asyncio.TimeoutError from None
concurrent.futures._base.TimeoutError
I have tried:
1) adding except asyncio.TimeoutError: pass. Not working.
async def get_info_byid(i, url, session):
    async with session.get(url, timeout=20) as resp:
        print(url)
        try:
            r = await resp.text()
            name = etree.HTML(r).xpath('//h2[starts-with(text(),"Customer Name")]/text()')
            data['name'].append(name)
            dataframe = pd.DataFrame(data)
            dataframe.to_csv(filename, index=False, sep='|')
        except asyncio.TimeoutError:
            pass
2) suppress(asyncio.TimeoutError), as shown above. Not working.
I just learned aiohttp yesterday, so maybe there is something else wrong in my code that causes the timeout error only after a few minutes of running? Thank you very much if anyone knows how to deal with it!
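For what it's worth, the traceback shows the TimeoutError being raised by session.get(...) itself, while the response is still being set up, so a try/except placed inside the body of the async with can never catch it. Below is a minimal sketch of wrapping the whole request instead; only the placement of the try block is the point here, the rest of the function follows the original.

async def get_info_byid(i, url, session):
    try:
        # the timeout can fire while the connection/response is still being
        # established, so the whole async with must sit inside the try block
        async with session.get(url, timeout=20) as resp:
            r = await resp.text()
    except asyncio.TimeoutError:
        print(f'timed out: {url}')
        return  # skip this URL and let the caller move on to the next one
    name = etree.HTML(r).xpath('//h2[starts-with(text(),"Customer Name")]/text()')
    data['name'].append(name)
    dataframe = pd.DataFrame(data)
    dataframe.to_csv(filename, index=False, sep='|')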
What @Yurii Kramarenko has done will raise an Unclosed client session exception for sure, since the session has never been properly closed. What I recommend is something like this:
import asyncio
import aiohttp

async def main(urls):
    async with aiohttp.ClientSession(timeout=self.timeout) as session:
        tasks = [self.do_something(session, url) for url in urls]
        await asyncio.gather(*tasks)
I like @jbxiaoyu's answer, but the timeout kwarg seems to take a special object, so I thought I'd add that you need to create a ClientTimeout object and then pass it to the session, like this:
from aiohttp import ClientSession, ClientTimeout

timeout = ClientTimeout(total=600)
async with ClientSession(timeout=timeout) as session:
    tasks = [self.do_something(session, url) for url in urls]
    await asyncio.gather(*tasks)
Simple example (not very good, but works fine):
import asyncio
from aiohttp.client import ClientSession

class Wrapper:
    def __init__(self, session):
        self._session = session

    async def get(self, url):
        try:
            async with self._session.get(url, timeout=20) as resp:
                return await resp.text()
        except Exception as e:
            print(e)

loop = asyncio.get_event_loop()
wrapper = Wrapper(ClientSession())
responses = loop.run_until_complete(
    asyncio.gather(
        wrapper.get('http://google.com'),
        wrapper.get('http://google.com'),
        wrapper.get('http://google.com'),
        wrapper.get('http://google.com'),
        wrapper.get('http://google.com')
    )
)
print(responses)
I am writing a simple producer/consumer app to call multiple URLs asynchronously.
In the following code, if I set conn_count=1 and add 2 items to the queue, it works fine since only one consumer is created. But if I set conn_count=2 and add 4 items to the queue, only 3 requests are made. The other request fails with ClientConnectorError.
Can you please help me debug the reason for the failure with multiple consumers? Thank you.
I am using an echo server I created.
Server:
import os
import logging.config
import yaml
from aiohttp import web
import json

def start():
    setup_logging()
    app = web.Application()
    app.router.add_get('/', do_get)
    app.router.add_post('/', do_post)
    web.run_app(app)

async def do_get(request):
    return web.Response(text='hello')

async def do_post(request):
    data = await request.json()
    return web.Response(text=json.dumps(data))

def setup_logging(
    default_path='logging.yaml',
    default_level=logging.INFO,
    env_key='LOG_CFG'
):
    path = default_path
    value = os.getenv(env_key, None)
    if value:
        path = value
    if os.path.exists(path):
        with open(path, 'rt') as f:
            config = yaml.safe_load(f.read())
        logging.config.dictConfig(config)
    else:
        logging.basicConfig(level=default_level)

if __name__ == '__main__':
    start()
Client:
import asyncio
import collections
import json
import sys

import async_timeout
from aiohttp import ClientSession, TCPConnector

MAX_CONNECTIONS = 100
URL = 'http://localhost:8080'

InventoryAccount = collections.namedtuple("InventoryAccount", "op_co customer_id")

async def produce(queue, num_consumers):
    for i in range(num_consumers * 2):
        await queue.put(InventoryAccount(op_co=i, customer_id=i * 100))
    for j in range(num_consumers):
        await queue.put(None)

async def consumer(n, queue, session, responses):
    print('consumer {}: starting'.format(n))
    while True:
        try:
            account = await queue.get()
            if account is None:
                queue.task_done()
                break
            else:
                print(f"Consumer {n}, Updating cloud prices for account: opCo = {account.op_co!s}, customerId = {account.customer_id!s}")
                params = {'opCo': account.op_co, 'customerId': account.customer_id}
                headers = {'content-type': 'application/json'}
                with async_timeout.timeout(10):
                    print(f"Consumer {n}, session state " + str(session.closed))
                    async with session.post(URL,
                                            headers=headers,
                                            data=json.dumps(params)) as response:
                        assert response.status == 200
                        responses.append(await response.text())
                        queue.task_done()
        except:
            e = sys.exc_info()[0]
            print(f"Consumer {n}, Error updating cloud prices for account: opCo = {account.op_co!s}, customerId = {account.customer_id!s}. {e}")
            queue.task_done()
    print('consumer {}: ending'.format(n))

async def start(loop, session, num_consumers):
    queue = asyncio.Queue(maxsize=num_consumers)
    responses = []
    consumers = [asyncio.ensure_future(loop=loop, coro_or_future=consumer(i, queue, session, responses)) for i in range(num_consumers)]
    await produce(queue, num_consumers)
    await queue.join()
    for consumer_future in consumers:
        consumer_future.cancel()
    return responses

async def run(loop, conn_count):
    async with ClientSession(loop=loop, connector=TCPConnector(verify_ssl=False, limit=conn_count)) as session:
        result = await start(loop, session, conn_count)
        print("Result: " + str(result))

if __name__ == '__main__':
    conn_count = 2
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(run(loop, conn_count))
    finally:
        loop.close()
Reference:
https://pymotw.com/3/asyncio/synchronization.html
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
https://hackernoon.com/asyncio-for-the-working-python-developer-5c468e6e2e8e
First of all, here's the code:
import random
import asyncio
from aiohttp import ClientSession
import csv

headers = []

def extractsites(file):
    sites = []
    readfile = open(file, "r")
    reader = csv.reader(readfile, delimiter=",")
    raw = list(reader)
    for a in raw:
        sites.append((a[1]))
    return sites

async def bound_fetch(sem, url):
    async with sem:
        print("doing request for " + url)
        async with ClientSession() as session:
            async with session.get(url) as response:
                responseheader = await response.headers
                print(headers)

async def run():
    urls = extractsites("cisco-umbrella.csv")
    tasks = []
    sem = asyncio.Semaphore(100)
    for i in urls:
        task = asyncio.ensure_future(bound_fetch(sem, "http://" + i))
        tasks.append(task)
    headers = await asyncio.wait(*tasks)
    print(headers)

def main():
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(run())
    loop.run_until_complete(future)

if __name__ == '__main__':
    main()
As per my last question I'm following this blog post:
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
I tried to adapt my code as closely as possible to the example implementation, but this code is still not making any requests or printing the headers in bound_fetch as I want.
Can somebody spot what's wrong with this code?
response.headers is a regular property; there is no need to put await before the call.
asyncio.wait, on the other hand, accepts a list of futures and returns a (done, pending) pair.
It looks like you should replace the await asyncio.wait(*tasks) call with await asyncio.gather(*tasks) (see the gather docs).
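Putting those two changes together, the fetch and run functions could look roughly like this (a sketch only; the rest of the original script is assumed unchanged):

async def bound_fetch(sem, url):
    async with sem:
        print("doing request for " + url)
        async with ClientSession() as session:
            async with session.get(url) as response:
                # headers is a plain property, so no await here
                return response.headers

async def run():
    urls = extractsites("cisco-umbrella.csv")
    sem = asyncio.Semaphore(100)
    tasks = [asyncio.ensure_future(bound_fetch(sem, "http://" + u)) for u in urls]
    # gather returns the results in task order, unlike wait's (done, pending) pair
    headers = await asyncio.gather(*tasks)
    print(headers)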