Requests/aiohttp: closing response objects

I'm a bit confused about the need to .close() a response object in both requests and aiohttp. (Note that this is a separate instance method from session.close() -- I'm talking about the response object itself.)
Does Response (requests) or ClientResponse (aiohttp) ever need to have .close() called explicitly?
If not, what is the purpose of using the response itself as a context manager? (async with session.request('GET', 'https://www.pastebin.com') below.) Why define the two dunder methods for this if it gets closed implicitly as shown below?
Some simple tests (below) seem to imply that responses are closed automatically when they are defined inside of a Session context manager. (Which itself calls self.close() in __exit__ or __aexit__. But this is the closing of the Session, not the Response object.)
Example - requests
>>> import requests
>>>
>>> with requests.Session() as s:
...     resp = s.request('GET', 'https://www.pastebin.com')
...     resp.raise_for_status()
...     print(resp.raw.closed)  # `raw` is urllib3.response.HTTPResponse object
...     print(resp.raw._pool)
...     print(resp.raw._connection)
...     c = resp.text
...
True
HTTPSConnectionPool(host='pastebin.com', port=443)
None
>>>
>>> while 1:
...     print(resp.raw.closed)
...     print(resp.raw._pool)
...     print(resp.raw._connection)
...     break
...
True
HTTPSConnectionPool(host='pastebin.com', port=443)
None
Example - aiohttp
>>> import asyncio
>>> import aiohttp
>>>
>>> async def get():
...     async with aiohttp.ClientSession() as s:
...         # The response is already closed after this `with` block.
...         # Why would it need to be used as a context manager?
...         resp = await s.request('GET', 'https://www.pastebin.com')
...         print(resp._closed)
...         print(resp._connection)
...         print(resp._released)
...         c = await resp.text()
...         print()
...         print(resp._closed)
...         print(resp._connection)
...         print(resp._released)
...         return c
...
>>> c = asyncio.run(get()) # Python 3.7 +
False
Connection<ConnectionKey(host='pastebin.com', port=443, is_ssl=True, ssl=None, proxy=None, proxy_auth=None, proxy_headers_hash=None)>
False
True
None
False
Here's the source for requests.models.Response.close(). What does "Should not normally need to be called explicitly" mean? What are the exceptions?
def close(self):
    """Releases the connection back to the pool. Once this method has been
    called the underlying ``raw`` object must not be accessed again.

    *Note: Should not normally need to be called explicitly.*
    """
    if not self._content_consumed:
        self.raw.close()

    release_conn = getattr(self.raw, 'release_conn', None)
    if release_conn is not None:
        release_conn()

Requests: You do not need to call close() explicitly. The response closes itself automatically once the request has finished, because it is built on urlopen (that is why resp.raw.closed is True). This is simplified code I put together after reading session.py and adapters.py:
from urllib3 import PoolManager
import time
manager = PoolManager(10)
conn = manager.connection_from_host('host1.example.com')
conn2 = manager.connection_from_host('host2.example.com')
res = conn.urlopen(url="http://host1.example.com/",method="get")
print(len(manager.pools))
manager.clear()
print(len(manager.pools))
print(res.closed)
#2
#0
#True
So what does __exit__ actually do? It clears the PoolManager (self.poolmanager = PoolManager(...)) and the proxy managers:
# session.py
def __exit__(self, *args):  # line 423
    self.close()

def close(self):  # line 733
    for v in self.adapters.values():
        v.close()

# adapters.py
# v.close()
def close(self):  # line 307
    self.poolmanager.clear()
    for proxy in self.proxy_manager.values():
        proxy.clear()
So when would you need to call close()? As the docstring says, it releases the connection back to the pool, and DEFAULT_POOLSIZE = 10 (HTTP and HTTPS are tracked independently). That means that if you want to access more than 10 hosts with one session, you can choose to close the connections you no longer need; otherwise the manager evicts connections from the oldest to the newest once the pool is full. In practice you rarely need to care about this: you can specify a larger pool size, and rebuilding a connection does not cost much time anyway.
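For example, a larger pool can be configured by mounting an HTTPAdapter on the session; a minimal sketch with arbitrary pool sizes:

import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
# Allow up to 50 cached host pools and 50 connections per pool
# instead of the default 10.
adapter = HTTPAdapter(pool_connections=50, pool_maxsize=50)
session.mount('https://', adapter)
session.mount('http://', adapter)

resp = session.get('https://www.pastebin.com')
print(resp.raw.closed)  # True once the body has been consumed
session.close()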
aiohttp: aiohttp.ClientSession() uses one TCPConnector for all requests. When __aexit__ is triggered, self._connector is closed.
Edit: s.request() sets up a connection to the host, but it does not fetch the response body. await resp.text() can only complete after the body has been received, so if you skip that step (waiting for the response) you exit the session without ever having the response.
if connector is None:  # line 132
    connector = TCPConnector(loop=loop)
...
self._connector = connector  # line 151

# connection timeout
try:
    with CeilTimeout(real_timeout.connect, loop=self._loop):
        assert self._connector is not None
        conn = await self._connector.connect(
            req,
            traces=traces,
            timeout=real_timeout
        )
...

async def close(self) -> None:
    if not self.closed:
        if self._connector is not None and self._connector_owner:
            self._connector.close()
        self._connector = None
...

async def __aexit__(self,
                    ...) -> None:
    await self.close()
Here is code that demonstrates this:
import asyncio
import aiohttp
import time

async def get():
    async with aiohttp.ClientSession() as s:
        # The response is already closed after this `with` block.
        # Why would it need to be used as a context manager?
        resp = await s.request('GET', 'https://www.stackoverflow.com')
        resp2 = await s.request('GET', 'https://www.github.com')
        print("resp:", resp._closed)
        print("resp:", resp._connection)
        print("resp2:", resp2._closed)
        print("resp2:", resp2._connection)
        await s.close()
        print(s.closed)
        c = await resp.text()
        d = await resp2.text()
        print()
        print(s._connector)
        print("resp:", resp._closed)
        print("resp:", resp._connection)
        print("resp2:", resp2._closed)
        print("resp2:", resp2._connection)

loop = asyncio.get_event_loop()
loop.run_until_complete(get())  # Python 3.5+
# dead loop
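This is also why the response supports being used as a context manager: exiting the async with block releases the connection even if the body was never read. A minimal sketch:

import asyncio
import aiohttp

async def get():
    async with aiohttp.ClientSession() as s:
        # Using the response as a context manager guarantees the connection
        # is released on exit, even if the body is never read or an
        # exception is raised before resp.text() runs.
        async with s.request('GET', 'https://www.pastebin.com') as resp:
            print(resp.status)
            return await resp.text()

c = asyncio.run(get())  # Python 3.7+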

Related

TypeError: Use async with instead

When I run the below snippet, I'm getting TypeError: Use async with instead.
headers = {'Content-type':'application/json'}
res = requests.post(end_point_recommend, data=json.dumps(dict_query), headers=headers)
print(res.ok)
print(json.dumps(res.json(), indent=2))
NUM = 10
CONCURRENT = 2
VERBOSE = True
payload = {13:5.0, 191:5.0, 209:5.0}
payload_list = [payload]*NUM
%%time
# Run:
with aiohttp.ClientSession() as session: # We create a persistent connection
    loop = asyncio.get_event_loop()
    calc_routes = loop.run_until_complete(run_load_test(end_point_recommend, payload_list, session, CONCURRENT, VERBOSE))
TypeError                                 Traceback (most recent call last)
<timed exec> in <module>

D:\Anaconda\envs\practise\lib\site-packages\aiohttp\client.py in __enter__(self)
   1068
   1069     def __enter__(self) -> None:
-> 1070         raise TypeError("Use async with instead")
   1071
   1072     def __exit__(
TypeError: Use async with instead
I'm using aiohttp '3.7.3'. How to resolve this error?
I came across a similar issue in this GitHub issue, but that solution didn't work for me (i.e., on replacing
with aiohttp.ClientSession() as session:
with
async with aiohttp.ClientSession() as session:,
I get the below error
SyntaxError: 'async with' outside async function
Function Definition of run_load_test
def run_load_test(url, payloads, _session, concurrent, verbose):
    http_client = chunked_http_client(num_chunks=concurrent, s=_session)
    # http_client returns futures, save all the futures to a list
    tasks = [http_client(url, payload, verbose) for payload in payloads]
    dfs_route = []
    # wait for futures to be ready then iterate over them
    for future in asyncio.as_completed(tasks):
        data = yield from future
        try:
            dfs_route.append(data)
        except Exception as err:
            print("Error {0}".format(err))
    return dfs_route
As the error says, async with must be in an async function. You can modify run_load_test to create the session, or you can define a new function that does that, and awaits run_load_test:
async def run(end_point_recommend, payload_list, concurrency, verbosity):
    async with aiohttp.ClientSession() as session:
        return await run_load_test(end_point_recommend, payload_list, session, concurrency, verbosity)
At top-level you can call the new function without the session argument:
%%time
# Run:
loop = asyncio.get_event_loop()
calc_routes = loop.run_until_complete(run(end_point_recommend, payload_list, CONCURRENT, VERBOSE))
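Note that run_load_test as posted uses yield from inside a plain def, which cannot be awaited directly, so it also needs to become an async def. A rough sketch of that conversion, keeping the original chunked_http_client helper from the question:

import asyncio

async def run_load_test(url, payloads, _session, concurrent, verbose):
    http_client = chunked_http_client(num_chunks=concurrent, s=_session)
    # http_client returns awaitables; save them all in a list
    tasks = [http_client(url, payload, verbose) for payload in payloads]
    dfs_route = []
    # wait for the tasks to finish, then iterate over them
    for future in asyncio.as_completed(tasks):
        try:
            data = await future   # `await` replaces `yield from`
            dfs_route.append(data)
        except Exception as err:
            print("Error {0}".format(err))
    return dfs_route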
Replace
with aiohttp.ClientSession(loop=loop) as session:
with
async with aiohttp.ClientSession(loop=loop) as session:

A blocked Python async function invocation also block another async function

I use FastAPI to develop data-layer APIs that access SQL Server. No matter whether I use pytds or pyodbc, if a database transaction causes any request to hang, all the other requests are blocked as well (even requests that involve no database operation).
Reproduce:
1. Intentionally open a serializable SQL Server session, begin a transaction, and do not roll back or commit:
INSERT INTO [dbo].[KVStore] VALUES ('1', '1', 0)
begin tran
SET TRANSACTION ISOLATION LEVEL Serializable
SELECT * FROM [dbo].[KVStore]
2. Send a request to the API with an async handler function like this:
def kv_delete_by_key_2_sql():
    conn = pytds.connect(dsn='192.168.0.1', database=cfg.kvStore_db, user=cfg.kvStore_uid,
                         password=cfg.kvStore_upwd, port=1435, autocommit=True)
    engine = conn.cursor()
    try:
        sql = "delete KVStore; commit"
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future = executor.submit(engine.execute, sql)
            rs = future.result()
        j = {
            'success': True,
            'rowcount': rs.rowcount
        }
        return jsonable_encoder(j)
    except Exception as exn:
        j = {
            'success': False,
            'reason': exn_handle(exn)
        }
        return jsonable_encoder(j)
@app.post("/kvStore/delete")
async def kv_delete(request: Request, type_: Optional[str] = Query(None, max_length=50)):
    request_data = await request.json()
    return kv_delete_by_key_2_sql()
3. Then send a request to another endpoint of the same app, with an async handler function like this:
async def hangit0(request: Request, t: int = Query(0)):
    print(t, datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3])
    await asyncio.sleep(t)
    print(t, datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3])
    j = {
        'success': True
    }
    return jsonable_encoder(j)

@app.get("/kvStore/hangit/")
async def hangit(request: Request, t: int = Query(0)):
    return await hangit0(request, t)
I expected step 2 to hang, and step 3 to return directly after 2 seconds.
However, step 3 never returns as long as the transaction is not committed or rolled back...
How do I make these handler functions work concurrently?
The reason is that rs = future.result() is actually a blocking call - see the Python docs. Unfortunately, executor.submit() doesn't return an awaitable object (concurrent.futures.Future is different from asyncio.Future).
You can use asyncio.wrap_future, which takes a concurrent.futures.Future and returns an asyncio.Future (see the Python docs). The new Future object is awaitable, so you can convert your blocking function into an async function.
An example:
import asyncio
import concurrent.futures

async def my_async():
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future = executor.submit(lambda x: x + 1, 1)
        return await asyncio.wrap_future(future)

print(asyncio.run(my_async()))
In your code, simply change the rs = future.result() to rs = await asyncio.wrap_future(future) and make the whole function async. That should do the magic, good luck! :)
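Applied to the handler from the question, the change would look roughly like this (a sketch reusing the question's names, error handling omitted):

async def kv_delete_by_key_2_sql():
    conn = pytds.connect(dsn='192.168.0.1', database=cfg.kvStore_db, user=cfg.kvStore_uid,
                         password=cfg.kvStore_upwd, port=1435, autocommit=True)
    engine = conn.cursor()
    sql = "delete KVStore; commit"
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future = executor.submit(engine.execute, sql)
        # Awaiting instead of calling future.result() keeps the event loop free.
        rs = await asyncio.wrap_future(future)
    return jsonable_encoder({'success': True, 'rowcount': rs.rowcount})

@app.post("/kvStore/delete")
async def kv_delete(request: Request, type_: Optional[str] = Query(None, max_length=50)):
    request_data = await request.json()
    return await kv_delete_by_key_2_sql()  # note the added await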

How do I download a large list of URLs in parallel in pyspark?

I have an RDD containing 10000 urls to be fetched.
list = ['http://SDFKHSKHGKLHSKLJHGSDFKSJH.com',
'http://google.com',
'http://twitter.com']
urls = sc.parallelize(list)
I need to check which urls are broken and preferably fetch the results to a corresponding RDD in Python. I tried this:
import asyncio
import concurrent.futures
import requests

async def get(url):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(
                executor,
                requests.get,
                i
            )
            for i in url
        ]
        return futures

async def get_response(futures):
    response = await asyncio.gather(futures, return_exceptions=True)
    return(response)

tasks = urls.map(lambda query: get(query))  # Method returns http call response as a Future[String]
results = tasks.map(lambda task: get_response(task))
results = results.map(lambda response: 'ERR' if isinstance(response, Exception) else 'OK')
results.collect()
I get the following output which obviously is not right:
['OK', 'OK', 'OK']
I also tried this:
import asyncio
import concurrent.futures
import requests

async def get():
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(
                executor,
                requests.get,
                i
            )
            for i in urls.toLocalIterator()
        ]
        for response in await asyncio.gather(*futures, return_exceptions=True):
            print('{}: {}'.format(response, 'ERR' if isinstance(response, Exception) else 'OK'))
            pass

loop = asyncio.get_event_loop()
loop.run_until_complete(get())
I get the following output:
HTTPConnectionPool(host='SDFKHSKHGKLHSKLJHGSDFKSJH.com', port=80): Max retries exceeded with url: / (Caused by
NewConnectionError('<urllib3.connection.HTTPConnection object at 0x12c834210>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known')): ERR
<Response [200]>: OK
<Response [200]>: OK
Desired output would be something like this:
http://SDFKHSKHGKLHSKLJHGSDFKSJH.com : ERR
http://google.com : OK
http://twitter.com : OK
But the problem with the second approach is the use of lists to store the future objects. I believe that using an RDD is better, since the number of urls can be in the millions or billions and no single machine can handle that. Also, it is not clear to me how to retrieve the urls from the responses.
If you're using concurrent.futures, you don't need asyncio at all (it will bring you no benefits since you are running in multiple threads anyway). You can use concurrent.futures.wait() to wait for multiple futures in parallel.
I can't test your data, but it should work with code like this:
import concurrent.futures, requests

def get_one(url):
    resp = requests.get(url)
    resp.raise_for_status()
    return resp.text

def get_all():
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        futures = [executor.submit(get_one, url)
                   for url in urls.toLocalIterator()]
        # the end of the "with" block will automatically wait
        # for all of the executor's tasks to complete
    for fut in futures:
        if fut.exception() is not None:
            print('{}: {}'.format(fut.exception(), 'ERR'))
        else:
            print('{}: {}'.format(fut.result(), 'OK'))
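For reference, concurrent.futures.wait() mentioned above can also be used to block explicitly instead of relying on the end of the with block; a small self-contained sketch:

import concurrent.futures

def square(x):
    return x * x

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(square, n) for n in range(10)]
    # Block until every submitted future has completed (or raised).
    done, not_done = concurrent.futures.wait(
        futures, return_when=concurrent.futures.ALL_COMPLETED)

print(sorted(f.result() for f in done))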
To do the same thing with asyncio, you should use aiohttp instead.
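If you do switch to asyncio with aiohttp, the per-URL check could look roughly like this (a sketch of the plain-Python part, not a Spark job; the URL list mirrors the question):

import asyncio
import aiohttp

urls = ['http://SDFKHSKHGKLHSKLJHGSDFKSJH.com',
        'http://google.com',
        'http://twitter.com']

async def check_one(session, url):
    try:
        async with session.get(url) as resp:
            resp.raise_for_status()
            return url, 'OK'
    except Exception:
        return url, 'ERR'

async def check_all(urls):
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(check_one(session, u) for u in urls))

for url, status in asyncio.run(check_all(urls)):
    print(url, ':', status)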
You can try pyspark-asyncactions
The naming convention for the patched methods is methodNameAsync, for example:
RDD.count ⇒ RDD.countAsync
DataFrame.take ⇒ DataFrame.takeAsync
DataFrameWriter.save ⇒ DataFrameWriter.saveAsync
Usage
To patch existing classes just import the package:
>>> import asyncactions
>>> from pyspark.sql import SparkSession
>>>
>>> spark = SparkSession.builder.getOrCreate()
All *Async methods return concurrent.futures.Future:
>>> rdd = spark.sparkContext.range(100)
>>> f = rdd.countAsync()
>>> f
<Future at ... state=running>
>>> type(f)
concurrent.futures._base.Future
>>> f.add_done_callback(lambda f: print(f.result()))
100

Asyncio and aiohttp returning task instead of results

I have a script to run parallel requests against an API within a class. However, what I'm getting back is basically a task instead of the actual results. Any reason why?
I mimicked the modified Client code on https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html.
import asyncio
from aiohttp import ClientSession

class Requestor:
    async def _async_request(self, url, session, sema_sz=10):
        sema = asyncio.Semaphore(sema_sz)
        async with sema:
            async with session.get(url) as response:
                req = await response.json()
                return req

    async def _async_chunk_request(self, url, chunks, headers=None, sema_sz=10):
        async with ClientSession(headers=headers) as session:
            futures = [asyncio.ensure_future(self._async_request(url.format(chunk), session, sema_sz)) for chunk in chunks]
            responses = asyncio.gather(*futures)
            await responses

    def get_request(self, url, chunks):
        loop = asyncio.get_event_loop()
        bulk_req = asyncio.ensure_future(self._async_chunk_request(url, chunks))
        loop.run_until_complete(bulk_req)
        return bulk_req
bulk_req is actually a Task object, not the results, and PyCharm shows this: Task finished coro=<Requestor._async_chunk_request() done, defined at ...
When I debug, I see that req has a full and proper response value, so there's no issue with that. I feel like it's something to do with the actual gathering of the futures?
Your _chunk_request does not return anything.
async def _chunk_request(...):
    ...
    ...
    await responses
I made a toy example trying to mimic your process. If I ended _chunk_request the way you did, I got the same result - a finished Task with no results. Changing _chunk_request to return something fixed it:
async def _chunk_request(...):
    ...
    ...
    return await responses
If you only need the return values from the tasks, get_request should return the result of the loop.run_until_complete() call.
My toy example
import asyncio
import random
from pprint import pprint

async def response(n):
    await asyncio.sleep(random.choice([1, 3, 5]))
    return f'i am {n}'

async def _request(n):
    req = await response(n)
    #print(req)
    return req

async def _chunk_request(chunks):
    futures = [asyncio.ensure_future(_request(chunk)) for chunk in chunks]
    #pprint(futures)
    responses = asyncio.gather(*futures, return_exceptions=True)
    #pprint(responses)
    return await responses

def get_request(chunks):
    loop = asyncio.get_event_loop()
    bulk_req = asyncio.ensure_future(_chunk_request(chunks))
    return loop.run_until_complete(bulk_req)
In [7]: result = get_request(range(1,6))
In [8]: print(result)
['i am 1', 'i am 2', 'i am 3', 'i am 4', 'i am 5']
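Applied to the Requestor class from the question, the minimal change would be (a sketch of just the two affected methods, other code unchanged):

import asyncio
from aiohttp import ClientSession

class Requestor:
    async def _async_request(self, url, session, sema_sz=10):
        ...  # unchanged from the question

    async def _async_chunk_request(self, url, chunks, headers=None, sema_sz=10):
        async with ClientSession(headers=headers) as session:
            futures = [asyncio.ensure_future(self._async_request(url.format(chunk), session, sema_sz))
                       for chunk in chunks]
            # Return the gathered results instead of merely awaiting them.
            return await asyncio.gather(*futures)

    def get_request(self, url, chunks):
        loop = asyncio.get_event_loop()
        # Return the value produced by the loop, not the Task object.
        return loop.run_until_complete(self._async_chunk_request(url, chunks))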

Tornado testing async requests

I need some advice regarding testing a Tornado app. For now I'm just playing with the demo chat application, but it looks like a real-life problem.
In the handler I have:
class MessageUpdatesHandler(BaseHandler):
    @tornado.web.authenticated
    @tornado.web.asynchronous
    def post(self):
        cursor = self.get_argument("cursor", None)
        global_message_buffer.wait_for_messages(self.on_new_messages,
                                                cursor=cursor)

    def on_new_messages(self, messages):
        # Closed client connection
        if self.request.connection.stream.closed():
            return
        self.finish(dict(messages=messages))


class MessageBuffer(object):
    def __init__(self):
        ....

    def wait_for_messages(self, callback, cursor=None):
        if cursor:
            new_count = 0
            for msg in reversed(self.cache):
                if msg["id"] == cursor:
                    break
                new_count += 1
            if new_count:
                callback(self.cache[-new_count:])
                return
        self.waiters.add(callback)

    def cancel_wait(self, callback):
        .....

    def new_messages(self, messages):
        logging.info("Sending new message to %r listeners", len(self.waiters))
        for callback in self.waiters:
            try:
                callback(messages)
            except:
                logging.error("Error in waiter callback", exc_info=True)
        self.waiters = set()
        self.cache.extend(messages)
        if len(self.cache) > self.cache_size:
            self.cache = self.cache[-self.cache_size:]
As I mentioned, the full source code is in the Tornado demos.
In my test I have:
@wsgi_safe
class MessageUpdatesHandlerTest(LoginedUserHanldersTest):
    Handler = MessageUpdatesHandler

    def test_add_message(self):
        from chatdemo import global_message_buffer
        kwargs = dict(
            method="POST",
            body='',
        )
        future = self.http_client.fetch(self.get_url('/'), callback=self.stop, **kwargs)
        message = {
            "id": '123',
            "from": "first_name",
            "body": "hello",
            "html": "html"
        }
        global_message_buffer.new_messages([message])
        response = self.wait()
        self.assertEqual(response.code, 200)
        self.mox.VerifyAll()
What happens:
1. It creates a future object.
2. It sends the hello message; at this moment no waiter is registered in MessageBuffer, so the callback is not called.
3. self.wait() starts the IOLoop and makes the POST fetch, and the waiter becomes registered in MessageBuffer.
4. The callback is never called and my response remains empty, so everything fails with AssertionError: Async operation timed out after 5 seconds.
What I want it to do:
1. On the POST, register itself as a waiter.
2. Receive some messages.
3. Return a 200 response to me.
Thank you for your help
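One way that might line these steps up is to publish the message through the IOLoop after the fetch has started, so the waiter gets registered before new_messages runs; a sketch assuming AsyncHTTPTestCase's io_loop helpers, not verified against the chat demo:

def test_add_message(self):
    from chatdemo import global_message_buffer
    message = {"id": '123', "from": "first_name", "body": "hello", "html": "html"}

    # Start the fetch; the handler registers its waiter once the IOLoop runs.
    self.http_client.fetch(self.get_url('/'), callback=self.stop,
                           method="POST", body='')

    # Publish the message slightly later, after the waiter should be registered.
    self.io_loop.add_timeout(self.io_loop.time() + 0.5,
                             lambda: global_message_buffer.new_messages([message]))

    response = self.wait()
    self.assertEqual(response.code, 200)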
