How to prevent asyncio.TimeoutError from being raised and continue the loop - python

I'm using aiohttp with the limited_as_completed method to speed up scraping (around 100 million static website pages). However, the code stops after several minutes and returns a TimeoutError. I tried several things, but still could not prevent the asyncio.TimeoutError from being raised. How can I ignore the error and continue?
The code I'm running is:
N = 123
import html
from lxml import etree
import requests
import asyncio
import aiohttp
from aiohttp import ClientSession, TCPConnector
import pandas as pd
import re
import csv
import time
from itertools import islice
import sys
from contextlib import suppress

start = time.time()
data = {}
data['name'] = []
filename = "C:\\Users\\xxxx" + str(N) + ".csv"

def limited_as_completed(coros, limit):
    futures = [
        asyncio.ensure_future(c)
        for c in islice(coros, 0, limit)
    ]
    async def first_to_finish():
        while True:
            await asyncio.sleep(0)
            for f in futures:
                if f.done():
                    futures.remove(f)
                    try:
                        newf = next(coros)
                        futures.append(
                            asyncio.ensure_future(newf))
                    except StopIteration as e:
                        pass
                    return f.result()
    while len(futures) > 0:
        yield first_to_finish()

async def get_info_byid(i, url, session):
    async with session.get(url, timeout=20) as resp:
        print(url)
        with suppress(asyncio.TimeoutError):
            r = await resp.text()
            name = etree.HTML(r).xpath('//h2[starts-with(text(),"Customer Name")]/text()')
            data['name'].append(name)
            dataframe = pd.DataFrame(data)
            dataframe.to_csv(filename, index=False, sep='|')

limit = 1000

async def print_when_done(tasks):
    for res in limited_as_completed(tasks, limit):
        await res

url = "http://xxx.{}.html"
loop = asyncio.get_event_loop()

async def main():
    connector = TCPConnector(limit=10)
    async with ClientSession(connector=connector, headers=headers, raise_for_status=False) as session:
        coros = (get_info_byid(i, url.format(i), session) for i in range(N, N + 1000000))
        await print_when_done(coros)

loop.run_until_complete(main())
loop.close()
print("took", time.time() - start, "seconds.")
The error log is:
Traceback (most recent call last):
  File "C:\Users\xxx.py", line 111, in <module>
    loop.run_until_complete(main())
  File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\asyncio\base_events.py", line 573, in run_until_complete
    return future.result()
  File "C:\Users\xxx.py", line 109, in main
    await print_when_done(coros)
  File "C:\Users\xxx.py", line 98, in print_when_done
    await res
  File "C:\Users\xxx.py", line 60, in first_to_finish
    return f.result()
  File "C:\Users\xxx.py", line 65, in get_info_byid
    async with session.get(url,timeout=20) as resp:
  File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\client.py", line 855, in __aenter__
    self._resp = await self._coro
  File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\client.py", line 391, in _request
    await resp.start(conn)
  File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\client_reqrep.py", line 770, in start
    self._continue = None
  File "C:\Users\xx\AppData\Local\Programs\Python\Python37-32\lib\site-packages\aiohttp\helpers.py", line 673, in __exit__
    raise asyncio.TimeoutError from None
concurrent.futures._base.TimeoutError
I have tried:
1) adding except asyncio.TimeoutError: pass. Not working:
async def get_info_byid(i, url, session):
    async with session.get(url, timeout=20) as resp:
        print(url)
        try:
            r = await resp.text()
            name = etree.HTML(r).xpath('//h2[starts-with(text(),"Customer Name")]/text()')
            data['name'].append(name)
            dataframe = pd.DataFrame(data)
            dataframe.to_csv(filename, index=False, sep='|')
        except asyncio.TimeoutError:
            pass
2) suppress(asyncio.TimeoutError) as shown above. Not working.
I just learned aiohttp yesterday, so maybe there are other things wrong in my code that cause the timeout error only after a few minutes of running? Thank you very much if anyone knows how to deal with it!

What @Yurii Kramarenko has done will raise an "Unclosed client session" exception for sure, since the session is never properly closed. What I recommend is something like this:
import asyncio
import aiohttp

async def main(urls):
    # do_something(session, url) and timeout are placeholders for your own code
    async with aiohttp.ClientSession(timeout=timeout) as session:
        tasks = [do_something(session, url) for url in urls]
        await asyncio.gather(*tasks)
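To address the original question of ignoring timeouts and continuing: the asker's try/except didn't fire because the timeout is raised by session.get() itself, which sat outside the try block (see the traceback above). A minimal sketch, assuming aiohttp 3.x and a hypothetical scrape_one() helper:

import asyncio
import aiohttp

async def scrape_one(session, url):
    try:
        # the timeout is raised inside session.get(), so it must be inside the try
        async with session.get(url) as resp:
            return await resp.text()
    except asyncio.TimeoutError:
        return None  # skip this URL; the remaining tasks keep running

async def main(urls):
    timeout = aiohttp.ClientTimeout(total=20)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        return await asyncio.gather(*(scrape_one(session, u) for u in urls))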

I like @jbxiaoyu's answer, but the timeout kwarg seems to take a special object, so I thought I'd add that you need to create a ClientTimeout object and pass it to the Session, like this:
from aiohttp import ClientSession, ClientTimeout

timeout = ClientTimeout(total=600)
async with ClientSession(timeout=timeout) as session:
    tasks = [do_something(session, url) for url in urls]
    await asyncio.gather(*tasks)
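For what it's worth, a ClientTimeout can also be passed per request in aiohttp 3.x; the request-level value overrides the session-wide default. A sketch with a hypothetical fetch() helper:

from aiohttp import ClientSession, ClientTimeout

async def fetch(session: ClientSession, url: str) -> str:
    # the request-level timeout overrides the session default
    async with session.get(url, timeout=ClientTimeout(total=20)) as resp:
        return await resp.text()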

Simple example (not very good, but works fine):
import asyncio
from aiohttp.client import ClientSession

class Wrapper:
    def __init__(self, session):
        self._session = session

    async def get(self, url):
        try:
            async with self._session.get(url, timeout=20) as resp:
                return await resp.text()
        except Exception as e:
            print(e)

loop = asyncio.get_event_loop()
wrapper = Wrapper(ClientSession())
responses = loop.run_until_complete(
    asyncio.gather(
        wrapper.get('http://google.com'),
        wrapper.get('http://google.com'),
        wrapper.get('http://google.com'),
        wrapper.get('http://google.com'),
        wrapper.get('http://google.com')
    )
)
print(responses)
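One caveat, echoing the "Unclosed client session" warning mentioned in an earlier answer: the ClientSession handed to Wrapper here is never closed. A sketch of the same example with an explicit close:

async def run():
    session = ClientSession()
    try:
        wrapper = Wrapper(session)
        return await asyncio.gather(
            *(wrapper.get('http://google.com') for _ in range(5))
        )
    finally:
        await session.close()  # releases connections and avoids the warning

print(asyncio.get_event_loop().run_until_complete(run()))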

Related

Why doesn't this python aiohttp requests code run asynchronously?

I'm trying to access an API with aiohttp but something is causing this code to block each iteration.
async def main():
    async with aiohttp.ClientSession() as session:
        for i, (image, target) in enumerate(dataset_val):
            image_bytes = pil_to_bytes(image)
            async with session.post('http://localhost:8080/predictions/resnet50', data=image_bytes) as resp:
                print(await resp.text())
                print(i, flush=True, end='\r')

asyncio.run(main())
As explained by @deceze, await will wait for your result inside your loop. If you want to call everything at the same time, you need to call everything from an external loop and gather the results.
Here's a way of doing it:
import asyncio
import aiohttp

async def call(session: aiohttp.ClientSession, url: str, image):
    image_bytes = pil_to_bytes(image)
    async with session.post(url, data=image_bytes) as response:
        return await response.text()

async def call_all(url: str, tasks: list):
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(
            *[call(session, url, img) for img, target in tasks],
            return_exceptions=True
        )
        return results

loop = asyncio.get_event_loop()
res = loop.run_until_complete(
    call_all('http://localhost:8080/predictions/resnet50', dataset_val)
)
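Because return_exceptions=True is passed to gather, a failed request comes back as an exception object in res instead of aborting the whole batch. A small follow-up sketch for separating successes from failures, assuming the res from above:

ok = [r for r in res if not isinstance(r, BaseException)]
failed = [r for r in res if isinstance(r, BaseException)]
print(f"{len(ok)} succeeded, {len(failed)} failed")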

AttributeError: module 'select' has no attribute 'select' error ASYNCIO

I am executing the below code on a Windows PC. I read that, by default, Windows can use only 64 sockets in an asyncio loop. I don't know if this is the reason for the error.
import aiohttp
import asyncio
import time

async def download_file(url):
    print(f'started downloading{url}')
    connector = aiohttp.TCPConnector(limit=60)
    async with aiohttp.clientSession(connector) as session:
        async with session.get(url) as resp:
            content = await resp.read()
            print(f'Finished download{url}')
            return content

async def write_file(n, content):
    filename = f'async_{n}.html'
    with open(filename, 'wb') as f:
        print(f'started writing{filename}')
        f.write(content)
        print(f'Finished writing{filename}')

async def scrape_task(n, url):
    content = await download_file(url)
    await write_file(n, content)

async def main():
    tasks = []
    for n, url in enumerate(open('urls.txt').readlines()):
        tasks.append((scrape_task(n, url)))
    await asyncio.wait(tasks)

if __name__ == '__main__':
    t = time.perf_counter()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    t2 = time.perf_counter() - t
    print(f'Total time taken: {t2:0.2f} seconds')
I made the below changes to limit the connections to 60:
connector = aiohttp.TCPConnector(limit=60)
async with aiohttp.clientSession(connector) as session:
I can't figure out where I am going wrong.

Using aiohttp to get the status of a number of websites

I have this code I am using to get the status of a list of websites.
import aiohttp
import asyncio
import json
import sys
import time

async def get_statuses(websites):
    statuses = {}
    tasks = [get_website_status(website) for website in websites]
    for status in await asyncio.gather(*tasks):
        if not statuses.get(status):
            statuses[status] = 0
        statuses[status] += 1
    print(json.dumps(statuses))

async def get_website_status(url):
    response = await aiohttp.get(url)
    status = response.status
    response.close()
    return status

if __name__ == '__main__':
    with open(sys.argv[1], 'r') as f:
        websites = f.read().splitlines()
    t0 = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(get_statuses(websites))
    t1 = time.time()
    print("getting website statuses took {0:.1f} seconds".format(t1-t0))
and since get is deprecated (await aiohttp.get(url)), I edited the code as such:
import aiohttp
import asyncio
import json
import sys
import time

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def get_statuses(websites):
    statuses = {}
    tasks = [get_website_status(website) for website in websites]
    for status in await asyncio.gather(*tasks):
        if not statuses.get(status):
            statuses[status] = 0
        statuses[status] += 1
    print(json.dumps(statuses))

async def get_website_status(url):
    async with aiohttp.ClientSession() as session:
        response = await fetch(session, url)
        #response = await aiohttp.get(url)
        status = response.status
        response.close()
        return status

if __name__ == '__main__':
    with open(sys.argv[1], 'r') as f:
        websites = f.read().splitlines()
    t0 = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(get_statuses(websites))
    t1 = time.time()
    print("getting website statuses took {0:.1f} seconds".format(t1-t0))
I copied the session code from the docs: https://aiohttp.readthedocs.io/en/stable/
However, when I run my code I get this error:
c:\asyncio>a.py list.txt
Traceback (most recent call last):
  File "C:\asyncio\a.py", line 35, in <module>
    loop.run_until_complete(get_statuses(websites))
  File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\asyncio\base_events.py", line 579, in run_until_complete
    return future.result()
  File "C:\asyncio\a.py", line 14, in get_statuses
    for status in await asyncio.gather(*tasks):
  File "C:\asyncio\a.py", line 25, in get_website_status
    status = response.status
AttributeError: 'str' object has no attribute 'status'

c:\asyncio>
Here is a sample list.txt:
https://facebook.com/
https://twitter.com/
https://google.com/
https://youtube.com/
https://linkedin.com/
https://instagram.com/
https://pinterest.com/
The get_website_status routine delegates the call to the fetch function, which returns the text content of the response (response.text()), not the response itself.
That's why response.status further down throws an obvious error.
If the response content is not needed, to fix the error, change the fetch function to return the response object:
async def fetch(session, url):
    response = await session.get(url)
    return response
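Alternatively, since only the status code is used here, a small sketch that reads it inside the context manager, so the connection is released without a manual response.close():

async def fetch_status(session, url):
    # the async with block releases the connection on exit
    async with session.get(url) as response:
        return response.status

async def get_website_status(url):
    async with aiohttp.ClientSession() as session:
        return await fetch_status(session, url)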

python 3.6 asyncio error not iterable while not iterating through async object

I have a class that creates a url and some json to execute in a post method that looks like this, and I was following this guide:
import vk_api
from vk_api.execute import VkFunction
import time
from datetime import datetime
import numpy as np
import asyncio
from ratelimit import limits
import requests
import aiohttp

class Execute:
    def __init__(self, access_token):
        self.access_token = access_token

    def posts_to_push(self, posts, limit):
        arr = []
        data = list(self.posts_chunks_limit(posts, limit))
        for i in range(len(data)):
            code = f"data.push(API.wall.getById( {{'posts': {data[i]} }} )); "
            arr.append(code)
        return arr

    def posts_execute_command(self, posts):  # TODO make async
        limit = 100
        code = self.posts_to_push(posts, limit)
        execute_limit = 25
        for i in range(len(code)):
            data = ''.join(code[i * execute_limit: (i * execute_limit) + execute_limit])
            var = f'var data = []; {data} return data ;'
            yield var

    async def fetch(url, json_data, session):
        async with session.post(url, json=json_data) as response:
            return await response.read()

    async def result_posts(self, posts):
        result = []
        command = self.posts_execute_command(posts)
        async with aiohttp.ClientSession() as session:
            for i in command:
                execute = asyncio.ensure_future(self.fetch(url="https://api.vk.com/method/execute",
                                                           json_data={
                                                               "code": i,
                                                               "access_token": self.access_token,
                                                               "v": 5.101,
                                                           }), session)
                result.append(execute)
            responses = await asyncio.gather(*result)
            print(responses)

    async def posts_chunks_limit(self, data, limit):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(data), limit):
            await asyncio.sleep(0.1)
            yield data[i:i + limit]

    def run_async(self, posts):
        loop = asyncio.get_event_loop()
        loop.run_until_complete(self.result_posts(posts))
and then I run it like this:
df = pd.read_csv('/some_path')
arr = []
for i in df['ids']:
    arr.append(i)
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(vk.result_posts(arr))
loop.run_until_complete(future)
The error message looks like this:
Traceback (most recent call last):
  File "../test_python.py", line 83, in <module>
    loop.run_until_complete(future)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/base_events.py", line 484, in run_until_complete
    return future.result()
  File "../test_python.py", line 45, in result_posts
    for i in command:
  File "../test_python.py", line 29, in posts_execute_command
    code = self.posts_to_push(posts, limit)
  File "../test_python.py", line 21, in posts_to_push
    data = list(self.posts_chunks_limit(posts, limit))
TypeError: 'async_generator' object is not iterable
This is my first time using aiohttp/asyncio; I find it quite complicated and easy to get lost in. Maybe I can get some directions or solutions for my case?
In this line:
    data = list(self.posts_chunks_limit(posts, limit))
As posts_chunks_limit is an async generator, list doesn't know what to do with it. You need to iterate over it with async for or with an async list comprehension:
    data = [x async for x in self.posts_chunks_limit(posts, limit)]
This requires posts_to_push and posts_execute_command to be defined with async def. Also, posts_execute_command must await the call to posts_to_push, and result_posts needs to await the call to posts_execute_command.
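For illustration, a minimal standalone example of the difference (runnable on Python 3.6+, matching the question's version): list() rejects an async generator, while an async comprehension drives it properly.

import asyncio

async def countdown():
    for i in range(3, 0, -1):
        await asyncio.sleep(0)  # async generators may await between yields
        yield i

async def main():
    # list(countdown()) would raise TypeError: 'async_generator' object is not iterable
    values = [x async for x in countdown()]
    print(values)  # [3, 2, 1]

loop = asyncio.get_event_loop()
loop.run_until_complete(main())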
With the help of @user4815162342 and a bunch of SO posts, I was able to fix my issue, and my code now looks like this.
The issue was that I was calling/awaiting a generator, which would not be iterable, in my result_posts method.
import vk_api
from vk_api.execute import VkFunction
import time
from datetime import datetime
import numpy as np
import asyncio
from ratelimit import limits
import requests
import aiohttp
import socket
from concurrent.futures import ThreadPoolExecutor

class Execute:  # TODO auth, parsers, limits, timeouts
    def __init__(self, access_token):
        self.access_token = access_token

    async def posts_to_push(self, posts, limit):
        arr = []
        data = [x async for x in self.posts_chunks_limit(posts, limit)]
        for i in range(len(data)):
            code = f"data.push(API.wall.getById( {{'posts': {data[i]} }} )); "
            arr.append(code)
        return arr  # < len() = 1000, 1k lists with 100 post IDs inside for 100k total ids

    async def posts_execute_command(self, posts):  # TODO make async
        limit = 100
        code = await self.posts_to_push(posts, limit)
        execute_limit = 25
        for i in range(len(code)):
            data = ''.join(code[i * execute_limit: (i * execute_limit) + execute_limit])
            var = f'var data = []; {data} return data ;'
            print(var, '---var---')
            yield var

    async def fetch(self, url, json_data, session):
        async with session.post(url, data=json_data) as response:
            return await response.read()

    @limits(calls=1, period=1)
    async def result_posts(self, posts):
        result = []
        command = [i async for i in self.posts_execute_command(posts)]  # <- note this iteration
        conn = aiohttp.TCPConnector(
            family=socket.AF_INET,
            verify_ssl=False,)
        async with aiohttp.ClientSession(connector=conn) as session:
            for i in command:
                print('---code---', len(command))  # TODO fix command range, that's the bug
                execute = asyncio.ensure_future(self.fetch(url="https://api.vk.com/method/execute",
                                                           json_data={
                                                               "code": i,
                                                               "access_token": self.access_token,
                                                               "v": 5.101,
                                                           }, session=session))
                await asyncio.sleep(1)
                result.append(execute)
            responses = await asyncio.gather(*result)
            print(responses, 'responses')
            return 'Done'

    async def posts_chunks_limit(self, data, limit):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(data), limit):
            yield data[i:i + limit]

Aiohttp not performing any requests

First of all, here's the code:
import random
import asyncio
from aiohttp import ClientSession
import csv

headers = []

def extractsites(file):
    sites = []
    readfile = open(file, "r")
    reader = csv.reader(readfile, delimiter=",")
    raw = list(reader)
    for a in raw:
        sites.append((a[1]))
    return sites

async def bound_fetch(sem, url):
    async with sem:
        print("doing request for " + url)
        async with ClientSession() as session:
            async with session.get(url) as response:
                responseheader = await response.headers
                print(headers)

async def run():
    urls = extractsites("cisco-umbrella.csv")
    tasks = []
    sem = asyncio.Semaphore(100)
    for i in urls:
        task = asyncio.ensure_future(bound_fetch(sem, "http://" + i))
        tasks.append(task)
    headers = await asyncio.wait(*tasks)
    print(headers)

def main():
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(run())
    loop.run_until_complete(future)

if __name__ == '__main__':
    main()
As per my last question, I'm following this blog post:
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
I tried to adapt my code as closely as possible to the example implementation, but this code is still not making any requests or printing the headers in bound_fetch as I wish.
Can somebody spot what's wrong with this code?
response.headers is a regular property; there is no need to put await before the call.
asyncio.wait, on the other hand, accepts a list of futures and returns a (done, pending) pair.
It looks like you should replace the await asyncio.wait(*tasks) call with await asyncio.gather(*tasks) (gather doc).
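Put together, a sketch of the two functions with both fixes applied (no await on the headers property, gather instead of wait):

async def bound_fetch(sem, url):
    async with sem:
        print("doing request for " + url)
        async with ClientSession() as session:
            async with session.get(url) as response:
                return response.headers  # regular property: no await

async def run():
    urls = extractsites("cisco-umbrella.csv")
    sem = asyncio.Semaphore(100)
    tasks = [asyncio.ensure_future(bound_fetch(sem, "http://" + i)) for i in urls]
    headers = await asyncio.gather(*tasks)  # results in the same order as tasks
    print(headers)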
