I am writing a simple producer/consumer app to call multiple URL's asynchronously.
In the following code if I set the conn_count=1, and add 2 items to the Queue it works fine as only one consumer is created. But if I make conn_count=2 and add 4 items to the Queue only 3 request are being made. The other request fails with ClientConnectorError.
Can you please help be debug the reason for failure with multiple consumers? Thank You.
I am using a echo server I created.
Server:
import os
import logging.config
import yaml
from aiohttp import web
import json
def start():
setup_logging()
app = web.Application()
app.router.add_get('/', do_get)
app.router.add_post('/', do_post)
web.run_app(app)
async def do_get(request):
return web.Response(text='hello')
async def do_post(request):
data = await request.json()
return web.Response(text=json.dumps(data))
def setup_logging(
default_path='logging.yaml',
default_level=logging.INFO,
env_key='LOG_CFG'
):
path = default_path
value = os.getenv(env_key, None)
if value:
path = value
if os.path.exists(path):
with open(path, 'rt') as f:
config = yaml.safe_load(f.read())
logging.config.dictConfig(config)
else:
logging.basicConfig(level=default_level)
if __name__ == '__main__':
start()
Client:
import asyncio
import collections
import json
import sys
import async_timeout
from aiohttp import ClientSession, TCPConnector
MAX_CONNECTIONS = 100
URL = 'http://localhost:8080'
InventoryAccount = collections.namedtuple("InventoryAccount", "op_co customer_id")
async def produce(queue, num_consumers):
for i in range(num_consumers * 2):
await queue.put(InventoryAccount(op_co=i, customer_id=i * 100))
for j in range(num_consumers):
await queue.put(None)
async def consumer(n, queue, session, responses):
print('consumer {}: starting'.format(n))
while True:
try:
account = await queue.get()
if account is None:
queue.task_done()
break
else:
print(f"Consumer {n}, Updating cloud prices for account: opCo = {account.op_co!s}, customerId = {account.customer_id!s}")
params = {'opCo': account.op_co, 'customerId': account.customer_id}
headers = {'content-type': 'application/json'}
with async_timeout.timeout(10):
print(f"Consumer {n}, session state " + str(session.closed))
async with session.post(URL,
headers=headers,
data=json.dumps(params)) as response:
assert response.status == 200
responses.append(await response.text())
queue.task_done()
except:
e = sys.exc_info()[0]
print(f"Consumer {n}, Error updating cloud prices for account: opCo = {account.op_co!s}, customerId = {account.customer_id!s}. {e}")
queue.task_done()
print('consumer {}: ending'.format(n))
async def start(loop, session, num_consumers):
queue = asyncio.Queue(maxsize=num_consumers)
responses = []
consumers = [asyncio.ensure_future(loop=loop, coro_or_future=consumer(i, queue, session, responses)) for i in range(num_consumers)]
await produce(queue, num_consumers)
await queue.join()
for consumer_future in consumers:
consumer_future.cancel()
return responses
async def run(loop, conn_count):
async with ClientSession(loop=loop, connector=TCPConnector(verify_ssl=False, limit=conn_count)) as session:
result = await start(loop, session, conn_count)
print("Result: " + str(result))
if __name__ == '__main__':
conn_count = 2
loop = asyncio.get_event_loop()
try:
loop.run_until_complete(run(loop, conn_count))
finally:
loop.close()
Reference:
https://pymotw.com/3/asyncio/synchronization.html
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
https://hackernoon.com/asyncio-for-the-working-python-developer-5c468e6e2e8e
Related
I'm creating an optimized multi-threading app using asyncio and want to add a rotating proxy into the mix.
Starting with a sample taken from this outstanding article:
Speed Up Your Python Program With Concurrency
I added a rotating proxy and it stopped working. The code simply exits the function after touching the line for the proxy.
This little snippet of code works, but not when added to the main script as shown in the screenshot above.
import asyncio
import random as rnd
async def download_site():
proxy_list = [
('38.39.205.220:80'),
('38.39.204.100:80'),
('38.39.204.101:80'),
('38.39.204.94:80')
]
await asyncio.sleep(1)
proxy = rnd.choice(proxy_list)
print(proxy)
asyncio.run(download_site())
And here's the full sample:
import asyncio
import time
import aiohttp
# Sample code taken from here:
# https://realpython.com/python-concurrency/#asyncio-version
# Info for adding headers for the proxy (Scroll toward the bottom)
# https://docs.aiohttp.org/en/stable/client_advanced.html
# Good read to possible improve performance on large lists of URLs
# https://asyncio.readthedocs.io/en/latest/webscraper.html
# RUN THIS METHOD TO SEE HOW IT WORKS.
# # Original Code (working...)
# async def download_site(session, url):
# async with session.get(url, proxy="http://proxy.com") as response:
# print("Read {0} from {1}".format(response.content_length, url))
def get_proxy(self):
proxy_list = [
(754, '38.39.205.220:80'),
(681, '38.39.204.100:80'),
(682, '38.39.204.101:80'),
(678, '38.39.204.94:80')
]
proxy = random.choice(proxy_list)
print(proxy[1])
return proxy
async def download_site(session, url):
proxy_list = [
('38.39.205.220:80'),
('38.39.204.100:80'),
('38.39.204.101:80'),
('38.39.204.94:80')
]
await asyncio.sleep(1)
proxy = rnd.choice(proxy_list)
print(proxy)
async with session.get(url, proxy="http://" + proxy) as response:
print("Read {0} from {1}".format(response.content_length, url))
async def download_all_sites(sites):
async with aiohttp.ClientSession() as session:
tasks = []
for url in sites:
task = asyncio.ensure_future(download_site(session, url))
tasks.append(task)
await asyncio.gather(*tasks, return_exceptions=True)
# Modified to loop thru only 1 URL to make debugging simple
if __name__ == "__main__":
sites = [
"https://www.jython.org",
# "http://olympus.realpython.org/dice",
] #* 80
start_time = time.time()
asyncio.get_event_loop().run_until_complete(download_all_sites(sites))
duration = time.time() - start_time
print(f"Downloaded {len(sites)} sites in {duration} seconds")
Thank you for any help you can offer.
You use return_exceptions=True but you don't actually check the returned results for errors. You can use asyncio.as_completed to handle exceptions and get the earliest next result:
import asyncio
import random
import traceback
import aiohttp
URLS = ("https://stackoverflow.com",)
TIMEOUT = 5
PROXIES = (
"http://38.39.205.220:80",
"http://38.39.204.100:80",
"http://38.39.204.101:80",
"http://38.39.204.94:80",
)
def get_proxy():
return random.choice(PROXIES)
async def download_site(session, url):
proxy = get_proxy()
print(f"Got proxy: {proxy}")
async with session.get(url, proxy=f"{proxy}", timeout=TIMEOUT) as resp:
print(f"{url}: {resp.status}")
return await resp.text()
async def main():
tasks = []
async with aiohttp.ClientSession() as session:
for url in URLS:
tasks.append(asyncio.create_task(download_site(session, url)))
for coro in asyncio.as_completed(tasks):
try:
html = await coro
except Exception:
traceback.print_exc()
else:
print(len(html))
if __name__ == "__main__":
asyncio.run(main())
I am executing the below code on a windows pc. I read that, by default, Windows can use only 64 sockets in asyncio loop. I don't know if this is the reason for the error.
import aiohttp
import asyncio
import time
async def download_file(url):
print(f'started downloading{url}')
connector = aiohttp.TCPConnector(limit=60)
async with aiohttp.clientSession(connector) as session:
async with session.get(url) as resp:
content = await resp.read()
print (f'Finished download{url}')
return content
async def write_file(n, content):
filename = f'async_{n}.html'
with open(filename,'wb') as f:
print(f'started writing{filename}')
f.write(content)
print(f'Finished writing{filename}')
async def scrape_task(n,url):
content = await download_file(url)
await write_file(n,content)
async def main():
tasks = []
for n,url in enumerate(open('urls.txt').readlines()):
tasks.append((scrape_task(n, url)))
await asyncio.wait(tasks)
if __name__ == '__main__':
t=time.perf_counter()
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
t2 = time.perf_counter() - t
print(f'Total time taken: {t2:0.2f} seconds')
I made the below changes to limit the connections to 60
connector = aiohttp.TCPConnector(limit=60)
async with aiohttp.clientSession(connector) as session:
I can't figure out where I am going wrong.
I am struggling to have my both my websocket script and my I/O serial script running together in one together.
Just some basic info before I continue:
I am using Windows PC(Have no access to linux PC)
This is the reason why I am using the AIOserial library instead of pyserial-asyncio
I have no "super" experience with asyncio, so be kind please :)
Here is my "old" websocket script:
from aiohttp import web
import socketio
import aiohttp_cors
import asyncio
import random
# creates a new Async Socket IO Server
sio = socketio.AsyncServer()
# Creates
app = web.Application()
sio.attach(app)
server_is_responding = "Message from the server:"
the_response = "Hello there!"
async def index(request):
with open('index.html') as f:
print("Somebody entered the server from the browser!")
return web.Response(text=f.read(), content_type='text/html')
#sio.on("android-device")
async def message(sid, data):
print("message: ", data)
#return send_message_to_client()
#sio.on('sendTextToServer')
async def message(sid, data):
print("message: " , data)
if data == "hei":
await sio.emit("ServerMessage", {"hehe"})
if data == "lol":
await sio.emit("ServerMessage", {"Message from server:": "hehe, funny right?.."})
else:
await sio.emit("ServerMessage", {"Message from server:": "Hello There!"})
# We bind our aiohttp endpoint to our app
# router
cors = aiohttp_cors.setup(app)
app.router.add_get('/', index)
# We kick off our server
if __name__ == '__main__':
web.run_app(app)
And here is my I/O serial script(which works and read the data), that I am trying to use with some of the websocket functions above:
import asyncio
import websockets
import socketio
import aiohttp_cors
import logging
from AIOExtensions.AIOSerial import (AIOSerial, AIOSerialClosedException,
AIOSerialErrorException, AIOSerialNotOpenException)
logging.basicConfig(level=logging.DEBUG)
sio = socketio.AsyncServer()
async def hello(websocket, path):
name = await websocket.recv()
print(f"< {name}")
greeting = f"Hello {name}!"
await websocket.send(greeting)
print(f"> {greeting}")
#sio.on("android-device")
async def message(sid, data):
print("message: ", data)
async def read_IO_serial():
try:
async with AIOSerial('COM8', baudrate=115200, line_mode=True) as aios:
await asyncio.sleep(100)
try:
while True:
# read with timeout
rcvd = await asyncio.wait_for(aios.read(), timeout=1.0)
# print the data received
print(f"data received: {rcvd}")
if rcvd == b'RF initialized\n':
print("CATCHED THIS LINE!")
except asyncio.TimeoutError:
print("reception timed out ;-(")
except AIOSerialNotOpenException:
print("Unable to open the port!")
print()
print("Have you specified the right port number? COM7? COM8?")
# port fatal error
except AIOSerialErrorException:
print("Port error!")
# port already closed
except AIOSerialClosedException:
print("Serial port is closed!")
start_server = websockets.serve(hello, "http://192.168.1.6", 8080)
#sio.attach(start_server) # HOW CAN I ATTACH THIS SO IT CAN BE USED WITH THE SIO FUNCTIONS BELOW?
if start_server:
print("Server started!")
asyncio.run(read_IO_serial())
asyncio.get_event_loop().run_until_complete(start_server)
asyncio.get_event_loop().run_forever()
As you can see in my first simple websocket script, I could use "sio.attach(app)" which made it possible to listed to events from client, so I need a way of replacing this "app" on my current script..
Someone who can please help me with this?
I solved it using asyncio.gather(), this is how I dit it:
from aiohttp import web
import socketio
import aiohttp_cors
import asyncio
import random
import asyncio as aio
import logging
import sys
# creates a new Async Socket IO Server
sio = socketio.AsyncServer()
# Creates
app = web.Application()
sio.attach(app)
server_is_responding = "Message from the server:"
the_response = "Hello there!"
async def index(request):
with open('index.html') as f:
print("Somebody entered the server from the browser!")
return web.Response(text=f.read(), content_type='text/html')
#sio.event
async def join(sid, message):
sio.enter_room(sid, message['room'])
await sio.emit('my_response', {'data': 'Entered room: ' + message['room']}, room=sid)
#sio.on("android-device")
async def message(sid, data):
print("message: ", data)
#sio.on("receiveMessageFromServer")
async def message(sid, data):
print("message: ", data)
# await asyncio.sleep(1 * random.random())
return "OKKKK", 123
from AIOExtensions.AIOSerial import (AIOSerial, AIOSerialClosedException,
AIOSerialErrorException, AIOSerialNotOpenException)
logging.basicConfig(level=logging.DEBUG)
async def read_IO_serial():
try:
async with AIOSerial('COM8', baudrate=115200, line_mode=True) as aios:
# aios.sp.baudrate = 230400
# aios.sp.baudrate = 115200
# await aios.write(b"AT\r\n")
# await aios.read()
# await aios.close()
await aio.sleep(100)
try:
while True:
# read with timeout
rcvd = await aio.wait_for(aios.read(), timeout=1.0)
# print the data received
print(f"data received: {rcvd}")
if rcvd == b'RF initialized\n':
print("CATCHED THIS LINE!")
except aio.TimeoutError:
print("reception timed out ;-(")
except AIOSerialNotOpenException:
print("Unable to open the port!")
print()
print("Have you specified the right port number? COM7? COM8?")
# port fatal error
except AIOSerialErrorException:
print("Port error!")
# port already closed
except AIOSerialClosedException:
print("Serial port is closed!")
async def on_startup(app):
pass
cors = aiohttp_cors.setup(app)
app.router.add_get('/', index)
# We kick off our server
if __name__ == '__main__':
loop = asyncio.get_event_loop()
group2 = asyncio.gather(read_IO_serial())
group1 = asyncio.gather(web.run_app(app))
all_groups = asyncio.gather(group1, group2)
results = loop.run_until_complete(all_groups)
# loop.close()
#print(results)
I am writing a web crawler using aiohttp, and my program is crashing with "RuntimeError: Session is closed" errors in my web crawler.
The main loop makes it through the first iteration, fetching and processing all pages in the URL queue without any issue. But then as it enters fetch_pages() in the 2nd iteration of the main loop, and makes first call to aiohttp.ClientSession.session.get(), it throws "RuntimeError: Session is closed".
I don't understand why I would be getting this error, because it appears to me that the code below should be creating a new aiohttp.ClientSession() context manager each time the get_batch() function below is called, and closing the session at the end of the function call. But this is not happening. Can someone explain to me why I am getting this error?
I have posted the relevant portions of my code below (I tried to trim as much as possible, but have included links to full source below).
Here is the main loop:
class Crawler():
((...))
def __init__(self):
self.loop = asyncio.get_event_loop()
self.url_queue = URLQueue(maxsize=10000) # urls are popped from URL queue
self.page_queue = asyncio.PriorityQueue() # when fetched, they are placed on page queue for html processing
((...))
async def fetch_pages(self):
print("Entering fetch_page()")
pages, errors = [], []
if self.url_queue.empty():
await asyncio.sleep(1)
else:
await self.fetcher.get_batch(self.BATCH_SIZE, self.url_queue, self.page_queue, self.error_queue)
((...))
async def process_html(self): ...
async def analyze_content(self): ...
async def extract_links(self): ...
async def index_content(self): ...
async def handle_errors(self): ...
((...))
async def main(self):
try:
while True:
tasks = [t.loop.create_task(t.fetch_pages()),
t.loop.create_task(t.process_html()),
t.loop.create_task(t.analyze_content()),
t.loop.create_task(t.index_content()),
t.loop.create_task(t.handle_errors())]
await asyncio.gather(*tasks)
except KeyboardInterrupt:
print("shutting down")
finally:
print("Pretending to save the URL queue, etc ... ")
t = Crawler()
if __name__ == "__main__":
#asyncio.run(crawler.crawl(index), debug=True)
t.loop.run_until_complete(t.main())
(full code here) ...
and here is the code for the fetch loop:
class Fetcher():
((...))
def __init__(self, domain_manager=None, http_headers = None, dns_cache_lifetime = 300, request_timeout = 30,
connection_timeout = 5, max_connections = 20, max_connections_per_host = 5, obey_robots = False,
verify_ssl_certs = False):
self.loop = asyncio.get_event_loop()
self.domain_manager = domain_manager # rate limit requests / robots.txt on per-domain basis
self.timeout = aiohttp.ClientTimeout(total=request_timeout,
connect=connection_timeout)
self.connector = aiohttp.TCPConnector(ttl_dns_cache=dns_cache_lifetime,
limit=max_connections,
limit_per_host=max_connections_per_host,
ssl=verify_ssl_certs)
async def fetch(self, url, session):
try:
async with session.get(url) as resp:
status = int(resp.status)
headers = dict(resp.headers)
if self.check_response_headers(url, status, headers):
html = await resp.text()
return {'url': url,
'headers': headers,
'html': html,
'last_visit': datetime.now()}
else:
raise FetchError(f"Fetch failed for url {url}: Header check failed (but why did we make it here?)",
url=url, exception=e, fetch_stage="GET")
except UnicodeDecodeError as e:
((...))
def check_response_headers(self, url, status, headers):
"""Given a response from fetch(), return a (Page object, error object) pair"""
((...))
async def fetch_with_dm(self, url, session, i):
"""fetches next url from queue until successfully fetches a page"""
domain = self.domain_manager.domain_from_url(url)
((...))
async with self.domain_manager.locks[domain]:
((...))
fetch_result = await self.fetch(url, session)
return fetch_result
async def get_batch(self, batch_size, url_queue, page_queue, error_queue):
start_time = datetime.now()
async with aiohttp.ClientSession(timeout=self.timeout, connector=self.connector) as session:
tasks = []
for i in range(batch_size):
url = None
score = None
if url_queue.empty():
break
else:
score, url = url_queue.get_nowait() # should we be blocking here / await / sleeping if no urls in queue?
if url == None:
raise ValueError("Received empty URL")
if score == None:
raise ValueError("Received empty URL score")
tasks.append(self.loop.create_task(self.fetch_with_dm(url, session, i)))
for p in asyncio.as_completed(tasks):
try:
page = await p
page['url_score'] = score
await page_queue.put((score, id(page), page))
except FetchError as fe:
await error_queue.put(fe)
(full code here)
... Again the "session closed" error is occuring when session.get(url) is called in fetch, but only in the second iteration of main loop ...
I have the following script to download (and later on process) Wikipedia's pageviews dumps. I am getting 503 errors on all the pages (whose urls are correct).
import argparse
import aiohttp
import asyncio
import async_timeout
import re
base_url = "http://dumps.wikimedia.org/other/pagecounts-raw/{year}/{year}-{month:02d}/pagecounts-{year}{month:02d}{day:02d}-{hour:02d}0000.gz"
async def downloadFile(semaphore, session, url):
try:
async with semaphore:
with async_timeout.timeout(10):
async with session.get(url) as remotefile:
if remotefile.status == 200:
data = await remotefile.read()
outfile = re.sub("/", "_", url[7:])
with open(outfile, 'wb') as fp:
print('Saving')
fp.write(data)
else:
print(remotefile.status)
return
except Exception as e:
print(e)
return
async def aux(urls):
sem = asyncio.Semaphore(10)
tasks = []
async with aiohttp.ClientSession() as session:
for url in urls:
print(url)
task = asyncio.ensure_future(downloadFile(sem, session, url))
tasks.append(task)
await asyncio.gather(*tasks)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--year", type=int, default=2016)
parser.add_argument("--month", type=int, default=4)
parser.add_argument("--temp_folder", type=str)
args = parser.parse_args()
urls = []
for day in range(1, 32)[:3]:
for hour in range(24)[:2]:
urls.append(base_url.format(
year=args.year, month=args.month, day=day, hour=hour))
loop = asyncio.get_event_loop()
asyncio.ensure_future(aux(urls))
loop.run_until_complete(aux(urls))
if __name__ == "__main__":
main()
The error I'm getting is:
<ClientResponse(https://dumps.wikimedia.org/other/pagecounts-raw/2016/2016-04/pagecounts-20160402-000000.gz) [503 Service Temporarily Unavailable]>
<CIMultiDictProxy('Server': 'nginx/1.13.6', 'Date': 'Wed, 24 Oct 2018 21:27:58 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Content-Length': '213', 'Connection': 'keep-alive', 'Strict-Transport-Security': 'max-age=106384710; includeSubDomains; preload')>
But this is really weird as copy-pasting the same urls on my chrome browser does the job!
I played with code and I can say following:
Wikipedia doesn't allow multiple requests per IP
Timeout 10 for this url is too low
To make your code work:
Change asyncio.Semaphore(10) to asyncio.Semaphore(1)
Change async_timeout.timeout(10) to async_timeout.timeout(120)
Completely remove line asyncio.ensure_future(aux(urls)), you don't need it since you pass you pass aux(urls) to run_until_complete
Final version that successfully downloads single archive:
import argparse
import aiohttp
import asyncio
import async_timeout
import re
base_url = "http://dumps.wikimedia.org/other/pagecounts-raw/{year}/{year}-{month:02d}/pagecounts-{year}{month:02d}{day:02d}-{hour:02d}0000.gz"
async def downloadFile(semaphore, session, url):
try:
async with semaphore:
with async_timeout.timeout(120):
async with session.get(url, ssl=False) as remotefile:
if remotefile.status == 200:
data = await remotefile.read()
outfile = re.sub("/", "_", url[7:])
with open(outfile, 'wb') as fp:
print('Saving')
fp.write(data)
else:
print('status:', remotefile.status)
return
except Exception as e:
print('exception:', type(e), str(e))
return
async def aux(urls):
sem = asyncio.Semaphore(1)
tasks = []
async with aiohttp.ClientSession() as session:
for url in urls:
print('url:', url)
task = asyncio.ensure_future(downloadFile(sem, session, url))
tasks.append(task)
await asyncio.gather(*tasks)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--year", type=int, default=2016)
parser.add_argument("--month", type=int, default=4)
parser.add_argument("--temp_folder", type=str)
args = parser.parse_args()
urls = []
for day in range(1, 32)[:1]:
for hour in range(24)[:1]:
urls.append(base_url.format(
year=args.year, month=args.month, day=day, hour=hour))
loop = asyncio.get_event_loop()
loop.run_until_complete(aux(urls))
if __name__ == "__main__":
main()