Abandoning Futures in Tornado - python

I'm considering a fan-out proxy in Tornado that queries multiple backend servers, with the possible use case of not waiting for all responses before returning.
Is there a problem with the remaining futures if you use a WaitIterator but stop waiting after receiving a useful response?
Perhaps the results of the other futures will never be cleaned up? Or perhaps callbacks could be added to any remaining futures to discard their results?
#!./venv/bin/python
from tornado import gen
from tornado import httpclient
from tornado import ioloop
from tornado import web
import json


class MainHandler(web.RequestHandler):
    @gen.coroutine
    def get(self):
        r1 = httpclient.HTTPRequest(
            url="http://apihost1.localdomain/api/object/thing",
            connect_timeout=4.0,
            request_timeout=4.0,
        )
        r2 = httpclient.HTTPRequest(
            url="http://apihost2.localdomain/api/object/thing",
            connect_timeout=4.0,
            request_timeout=4.0,
        )
        http = httpclient.AsyncHTTPClient()
        wait = gen.WaitIterator(
            r1=http.fetch(r1),
            r2=http.fetch(r2)
        )
        while not wait.done():
            try:
                reply = yield wait.next()
            except Exception as e:
                print("Error {} from {}".format(e, wait.current_future))
            else:
                print("Result {} received from {} at {}".format(
                    reply, wait.current_future, wait.current_index))
                if reply.code == 200:
                    result = json.loads(reply.body)
                    self.write(json.dumps(dict(result, backend=wait.current_index)))
                    return


def make_app():
    return web.Application([
        (r'/', MainHandler)
    ])


if __name__ == '__main__':
    app = make_app()
    app.listen(8888)
    ioloop.IOLoop.current().start()

So I've checked through the source for WaitIterator.
It tracks the futures by adding a callback to each one; when a callback fires, the iterator either queues the result or, if you have already called next(), fulfils the future it handed you.
As the future you wait on is only created by calling .next(), it appears you can bail out of the while not wait.done() loop without leaving any futures lacking observers.
Reference counting ought to keep the WaitIterator instance alive until all the futures have fired their callbacks, after which it can be reclaimed.
Update 2017/08/02
After testing further with a WaitIterator subclass that adds extra logging: yes, the iterator is cleaned up once all the futures return, but if any of those futures returns an exception, it will be logged that the exception was never observed.
ERROR:tornado.application:Future exception was never retrieved: HTTPError: HTTP 599: Timeout while connecting
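The extra logging needed for that test can be as small as a __del__ hook on a WaitIterator subclass. A minimal sketch (illustrative, not the exact code used):
import logging
from tornado import gen

log = logging.getLogger(__name__)

class LoggingWaitIterator(gen.WaitIterator):
    """Drop-in for gen.WaitIterator that reports when it is reclaimed."""

    def __del__(self):
        # Runs once nothing references the iterator any more, i.e. after the
        # callbacks it registered on the wrapped futures have fired and
        # released their references to it.
        log.debug("WaitIterator reclaimed")

# used exactly like gen.WaitIterator:
# wait = LoggingWaitIterator(r1=http.fetch(r1), r2=http.fetch(r2))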
In summary and answering my question: completing the WaitIterator isn't necessary from a clean-up point of view, but it is probably desirable to do so from a logging point of view.
If you wanted to be sure, passing the wait iterator to a new coroutine that finishes consuming it, and adding an observer to that coroutine's future, may suffice. For example:
import logging

log = logging.getLogger(__name__)


@gen.coroutine
def complete_wait_iterator(wait):
    rounds = 0
    while not wait.done():
        rounds += 1
        try:
            reply = yield wait.next()
        except Exception as e:
            print("Not needed Error {} from {}".format(e, wait.current_future))
        else:
            print("Not needed result {} received from {} at {}".format(
                reply, wait.current_future, wait.current_index))
    log.info('completer finished after {n} rounds'.format(n=rounds))


class MainHandler(web.RequestHandler):
    @gen.coroutine
    def get(self):
        r1 = httpclient.HTTPRequest(
            url="http://apihost1.localdomain/api/object/thing",
            connect_timeout=4.0,
            request_timeout=4.0,
        )
        r2 = httpclient.HTTPRequest(
            url="http://apihost2.localdomain/api/object/thing",
            connect_timeout=4.0,
            request_timeout=4.0,
        )
        http = httpclient.AsyncHTTPClient()
        wait = gen.WaitIterator(
            r1=http.fetch(r1),
            r2=http.fetch(r2)
        )
        while not wait.done():
            try:
                reply = yield wait.next()
            except Exception as e:
                print("Error {} from {}".format(e, wait.current_future))
            else:
                print("Result {} received from {} at {}".format(
                    reply, wait.current_future, wait.current_index))
                if reply.code == 200:
                    result = json.loads(reply.body)
                    self.write(json.dumps(dict(result, backend=wait.current_index)))
                    # Hand the rest of the iterator to a consumer coroutine and
                    # observe its future so late errors are retrieved.
                    consumer = complete_wait_iterator(wait)
                    consumer.add_done_callback(lambda f: f.exception())
                    return
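A lighter-weight alternative inside get(), sketching the callback idea from the question rather than anything tested under load: keep references to the fetch futures and mark any exception as observed on them directly, instead of handing the iterator to a consumer coroutine.
http = httpclient.AsyncHTTPClient()
f1 = http.fetch(r1)  # keep the raw futures around
f2 = http.fetch(r2)
wait = gen.WaitIterator(r1=f1, r2=f2)

# Calling exception() on a finished future marks its error (if any) as
# retrieved, so Tornado won't log "Future exception was never retrieved".
for f in (f1, f2):
    f.add_done_callback(lambda f: f.exception())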

Related

REQUESTS Maximum number of attempts with a waiting time and in case of failure, give a message in Python

The situation is that sometimes a request does not load or gets stuck in Python. If that happens, or any other error occurs, I would like to retry it "n" times, waiting up to a maximum of 3 seconds for each attempt, and if the attempts run out, print the message f"Could not process {type_1} and {type_2}". Everything runs in parallel with concurrent.futures. Could you help me with that?
import requests
import concurrent.futures
import json

data = [['PEN','USD'],['USD','EUR']]

def currency(element):
    type_1 = element[0]
    type_2 = element[1]
    s = requests.Session()
    url = f'https://usa.visa.com/cmsapi/fx/rates?amount=1&fee=0&utcConvertedDate=07%2F26%2F2022&exchangedate=07%2F26%2F2022&fromCurr={type_1}&toCurr={type_2}'
    a = s.get(url)
    response = json.loads(a)
    value = response["convertedAmount"]
    return value

with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(currency, data)
    for value in results:
        print(value)
Your code is almost there. Here, I modified a few things:
from concurrent.futures import ThreadPoolExecutor
import time
import requests

def convert_currency(tup):
    from_currency, to_currency = tup
    url = (
        "https://usa.visa.com/cmsapi/fx/rates?amount=1&fee=0"
        "&utcConvertedDate=07%2F26%2F2022&exchangedate=07%2F26%2F2022&"
        f"fromCurr={from_currency}&toCurr={to_currency}"
    )
    session = requests.Session()
    for _ in range(3):
        try:
            response = session.get(url, timeout=3)
            if response.ok:
                return response.json()["convertedAmount"]
        except requests.exceptions.ConnectTimeout:
            time.sleep(3)
    return f"Could not process {from_currency} and {to_currency}"

data = [["VND", "XYZ"], ['PEN','USD'], ["ABC", "XYZ"], ['USD','EUR'], ["USD", "XXX"]]

with ThreadPoolExecutor() as executor:
    results = executor.map(convert_currency, data)
    for value in results:
        print(value)
Notes
The request is retried 3 times (see the for loop)
Use timeout= to specify the timeout (in seconds)
The .ok attribute will tell if the call was successful
No need to import json as the response object can JSON decode with the .json() method
You might experiment between ThreadPoolExecutor and ProcessPoolExecutor to see which one performs better
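One caveat, since the question says any error should trigger a retry: except requests.exceptions.ConnectTimeout only catches connection timeouts. A variant that retries on any requests error (read timeouts, connection resets, DNS failures) might look like this, assuming the same URL format as above:
import time
import requests

def convert_currency_any_error(tup):
    """Same as convert_currency above, but retries on any requests error."""
    from_currency, to_currency = tup
    url = (
        "https://usa.visa.com/cmsapi/fx/rates?amount=1&fee=0"
        "&utcConvertedDate=07%2F26%2F2022&exchangedate=07%2F26%2F2022&"
        f"fromCurr={from_currency}&toCurr={to_currency}"
    )
    session = requests.Session()
    for _ in range(3):
        try:
            response = session.get(url, timeout=3)
            if response.ok:
                return response.json()["convertedAmount"]
        except requests.exceptions.RequestException:
            # Base class for all requests errors, not just ConnectTimeout
            time.sleep(3)
    return f"Could not process {from_currency} and {to_currency}"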

Asynchronous requests inside the for loop in python

I have this snippet
config = {10: 'https://www.youtube.com/', 5: 'https://www.youtube.com/', 7: 'https://www.youtube.com/',
          3: 'https://sportal.com/', 11: 'https://sportal.com/'}

def test(arg):
    for key in arg.keys():
        requests.get(arg[key], timeout=key)

test(config)
That way, things happen synchronously. I want to do it asynchronously: iterate through the loop without waiting for the response from each address and go on to the next one, until I have iterated through all the addresses in the dictionary. Then I want to wait until I get the responses for all the addresses, and only after that return from the test function. I know I can do it with threading, but I read that it can be done better with the asyncio library; however, I couldn't implement it. If anyone has even better suggestions, I am open to them. Here is my try:
async def test(arg):
    loop = asyncio.get_event_loop()
    tasks = [loop.run_in_executor(requests.get(arg[key], timeout=key) for key in arg.keys())]
    await asyncio.gather(*tasks)

asyncio.run(test(config))
Here is the solution:
def addresses(adr, to):
    requests.get(adr, timeout=to)

async def test(arg):
    loop = asyncio.get_event_loop()
    tasks = [loop.run_in_executor(None, addresses, arg[key], key) for key in arg.keys()]
    await asyncio.gather(*tasks)

asyncio.run(test(config))
Now it works asynchronously using the asyncio library instead of threading.
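Worth noting: run_in_executor still hands each request to a thread pool behind the scenes. If you want the HTTP calls themselves to be non-blocking, an async client such as aiohttp can do it natively. A rough sketch, assuming aiohttp is installed and reusing the timeout-keyed config dict from the question:
import asyncio
import aiohttp

config = {10: 'https://www.youtube.com/', 5: 'https://www.youtube.com/', 7: 'https://www.youtube.com/',
          3: 'https://sportal.com/', 11: 'https://sportal.com/'}

async def fetch(session, url, timeout):
    # total= bounds the whole request, mirroring the timeout= argument of requests
    async with session.get(url, timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
        return await resp.text()

async def test(arg):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url, timeout) for timeout, url in arg.items()]
        # return_exceptions=True keeps one failing URL from cancelling the rest
        return await asyncio.gather(*tasks, return_exceptions=True)

asyncio.run(test(config))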
Some good answers here. I had trouble with this myself (I do a lot of web scraping), so I created a package to help me async-scrape (https://pypi.org/project/async-scrape/).
It supports GET and POST. I tried to make it as easy to use as possible. You just need to specify a handler function for the response when you instantiate, and then use the scrape_all method to do the work.
It uses the term scrape because I've built in some handlers for common errors that come up when scraping websites.
You can do some other things with it as well, like limiting the call rate if you find you're getting blocked.
An example of its use is:
# Create an instance
from async_scrape import AsyncScrape

def post_process(html, resp, **kwargs):
    """Function to process the gathered response from the request"""
    if resp.status == 200:
        return "Request worked"
    else:
        return "Request failed"

async_Scrape = AsyncScrape(
    post_process_func=post_process,
    post_process_kwargs={},
    fetch_error_handler=None,
    use_proxy=False,
    proxy=None,
    pac_url=None,
    acceptable_error_limit=100,
    attempt_limit=5,
    rest_between_attempts=True,
    rest_wait=60,
    call_rate_limit=None,
    randomise_headers=True
)

urls = [
    "https://www.google.com",
    "https://www.bing.com",
]

resps = async_Scrape.scrape_all(urls)
To do this inside a loop, I collect the results, add them to a set, and pop off the old ones. E.g.:
from async_scrape import AsyncScrape
from bs4 import BeautifulSoup as bs

def post_process(html, resp, **kwargs):
    """Function to process the gathered response from the request"""
    soup = bs(html, "html.parser")
    # Collect the href of every link flagged as new on the page
    new_urls = [a["href"] for a in soup.find_all("a", {"class": "new_link_on_website"})]
    return [new_urls, resp]

async_scrape = AsyncScrape(
    post_process_func=post_process,
    post_process_kwargs={}
)

# Run the loop
urls = set(["https://initial_webpage.com/"])
processed = set()
all_resps = []

while len(urls):
    resps = async_scrape.scrape_all(urls)
    # Split responses into successes and failures
    success_resps = [r for r in resps if not r["error"]]
    errored_resps = [r for r in resps if r["error"]]
    # Get what you want from the responses
    for r in success_resps:
        # Add found urls to urls
        urls |= set(r["func_resp"][0])  # "func_resp" is the key to the return from your handler function
        # Collect the response
        all_resps.append(r["func_resp"][1])
        # Add to processed urls
        processed.add(r["url"])  # "url" is the key to the url from the response
    # Remove processed urls
    urls = urls - processed

iot edge direct method handler in python

I have created a module for a BACnet scan and it responds with a list of devices and their addresses as a result. But I'm having trouble implementing a direct method handler in Python. When I first tried implementing it myself I got this error, which could mean I didn't successfully register the direct method callback. I have some references, but they were in C#, and the Azure docs aren't helping me figure out the right method to register the callback: for IoTHubModuleClient there's both an on_method_request_received and a receive_method_request. I'd appreciate any help!
def iothub_client_scan_run():
    try:
        iot_client = iothub_client_init()
        bacnet_scan_listener_thread = threading.Thread(target=device_method_listener, args=(iot_client,))
        bacnet_scan_listener_thread.daemon = True
        bacnet_scan_listener_thread.start()
        while True:
            time.sleep(1000)
    # (exception handling omitted in the question's snippet)

def device_method_listener(iot_client):
    while True:
        # Receive the direct method request
        method_request = iot_client.receive_method_request()
        print(
            "\nMethod callback called with:\nmethodName = {method_name}\npayload = {payload}".format(
                method_name=method_request.name,
                payload=method_request.payload
            )
        )
        if method_request.name == "runBacnetScan":
            response = bacnet_scan_device(method_request)
        else:
            response_payload = {"Response": "Direct method {} not defined".format(method_request.name)}
            response_status = 404
        # Send a method response indicating the method request was resolved
        print('Sending method response')
        iot_client.send_method_response(response)
        print('Message sent!')
Edit:
Here is my route config
I was able to resolve my issue, or at least find the root cause: it was my network configuration under the createOptions. There seems to be an issue when I try to use NetworkMode: host and connect via IotModuleClient.connect_from_edge_environment or via connect with a connection string. I'm still trying to tweak the connection configuration, but at least I know it's not the code.
async def method_request_handler(module_client):
    while True:
        method_request = await module_client.receive_method_request()
        print(
            "\nMethod callback called with:\nmethodName = {method_name}\npayload = {payload}".format(
                method_name=method_request.name,
                payload=method_request.payload
            )
        )
        if method_request.name == "method1":
            payload = {"result": True, "data": "some data"}  # set response payload
            status = 200  # set return status code
            print("executed method1")
        elif method_request.name == "method2":
            payload = {"result": True, "data": 1234}  # set response payload
            status = 200  # set return status code
            print("executed method2")
        else:
            payload = {"result": False, "data": "unknown method"}  # set response payload
            status = 400  # set return status code
            print("executed unknown method: " + method_request.name)
        # Send the response
        method_response = MethodResponse.create_from_method_request(method_request, status, payload)
        await module_client.send_method_response(method_response)
        print('Message sent!')

def stdin_listener():
    while True:
        try:
            selection = input("Press Q to quit\n")
            if selection == "Q" or selection == "q":
                print("Quitting...")
                break
        except:
            time.sleep(10)

# Schedule task for C2D Listener
listeners = asyncio.gather(input1_listener(module_client), twin_patch_listener(module_client), method_request_handler(module_client))
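On the on_method_request_received vs. receive_method_request question: newer releases of the azure-iot-device SDK also let you assign a handler function instead of polling in a loop. A rough sketch of that style (SDK-version dependent; the runBacnetScan branch and payloads are illustrative):
from azure.iot.device import IoTHubModuleClient, MethodResponse

module_client = IoTHubModuleClient.create_from_edge_environment()

def handle_method_request(method_request):
    if method_request.name == "runBacnetScan":
        payload = {"result": True, "data": "scan results would go here"}
        status = 200
    else:
        payload = {"result": False, "data": "unknown method"}
        status = 404
    response = MethodResponse.create_from_method_request(method_request, status, payload)
    module_client.send_method_response(response)

# The SDK invokes the handler for each incoming direct method call
module_client.on_method_request_received = handle_method_request
module_client.connect()
# ... keep the process alive, e.g. with the stdin_listener loop above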

Python Hanging Threads

I have the following code:
final = []
with futures.ThreadPoolExecutor(max_workers=self.number_threads) as executor:
    _futures = [executor.submit(self.get_attribute, listing,
                                self.proxies[listings.index(listing) % len(self.proxies)])
                for listing in listings]
    for result in futures.as_completed(_futures):
        try:
            listing = result.result()
            final.append(listing)
        except Exception as e:
            print traceback.format_exc()
return final
The self.get_attribute function that's submitted to the executor takes a dictionary and proxy as input and makes either one or two http requests to get some data and return with an edited dictionary. The problem is that the workers/threads hang towards the end of completing all the submitted tasks. If I submit 400 dictionaries, it will complete ~380 tasks, and then hang. If I submit 600, it will complete ~570-580. However if I submit 25, it will complete all of them. I'm not sure what the threshold is at which it will go from finishing to not finishing.
I have also tried using a queue and threading system like this:
def _get_attribute_thread(self):
    while self.q.not_empty:
        job = self.q.get()
        listing = job['listing']
        proxy = job['proxy']
        self.threaded_results.put(self.get_attribute(listing, proxy))
        self.q.task_done()

def _get_attributes_threaded_with_proxies(self, listings):
    for listing in listings:
        self.q.put({'listing': listing, 'proxy': self.proxies[listings.index(listing) % len(self.proxies)]})
    for _ in xrange(self.number_threads):
        thread = threading.Thread(target=self._get_attribute_thread)
        thread.daemon = True
        thread.start()
    self.q.join()
    final = []
    while self.threaded_results.not_empty:
        final.append(self.threaded_results.get())
    return final
However the result is the same. What can I do to fix/debug the problem? Thanks in advance.
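No answer was posted for this one, but one assumption worth testing: an HTTP call made without a timeout will hang its worker forever if the server never responds, which matches "most tasks finish, a few never do". A standalone sketch (hypothetical URLs) of how to bound the wait and make any hang visible:
from concurrent import futures
import requests

def get_attribute(url):
    # timeout= makes a dead connection raise instead of hanging the worker
    return requests.get(url, timeout=10).status_code

urls = ["https://example.com"] * 5
results = []
executor = futures.ThreadPoolExecutor(max_workers=4)
future_to_url = {executor.submit(get_attribute, u): u for u in urls}
try:
    # give as_completed a deadline so a hang surfaces as TimeoutError
    for fut in futures.as_completed(future_to_url, timeout=60):
        try:
            results.append(fut.result())
        except Exception as exc:
            print("failed:", future_to_url[fut], exc)
except futures.TimeoutError:
    # whatever is still pending after the deadline is where the hang lives
    print("stuck:", [u for f, u in future_to_url.items() if not f.done()])
executor.shutdown(wait=False)
print(results)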

How to get the URLs that timed out or got an error?

I have a Python class built on Tornado that works like a crawler. I have a lot of links on the same site and I need to get the responses from all of them into my database.
The difficulty is this: I can't understand how to catch the URLs that got an error (a timeout, or a runtime exception).
I know how to fix this with newbie code (I've only been coding Python for a week) by comparing the list of input links with the output, but I want to do it the right way.
Can you tell me how I can do this?
import sys
import time
import requests
import json
from tornado import gen, ioloop
from tornado.httpclient import AsyncHTTPClient, HTTPRequest
from tornado.queues import Queue


class Scraper():
    def __init__(self, source='', destinations=None, transform=None, headers={ }, max_clients=20, maxsize=20, connect_timeout=600, request_timeout=600):
        """Instantiate a tornado async http client to do many URL requests"""
        if None in destinations:
            sys.stderr.write('You must pass both collection of URLS and a transform function')
            raise SystemExit
        self.max_clients = max_clients
        self.maxsize = maxsize
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout
        # AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient", max_clients=50)
        AsyncHTTPClient.configure("tornado.simple_httpclient.SimpleAsyncHTTPClient", max_clients=self.max_clients)
        self.headers = headers
        self.http_client = AsyncHTTPClient()
        self.queue = Queue(maxsize=20)
        self.source = source
        self.destinations = destinations
        self.transform = transform
        self.read(self.destinations)
        self.get(self.transform, self.headers, self.connect_timeout, self.request_timeout)
        self.loop = ioloop.IOLoop.current()
        self.join_future = self.queue.join()

        def done(future):
            self.loop.stop()

        self.join_future.add_done_callback(done)
        self.loop.start()

    @gen.coroutine
    def read(self, destinations):
        for url in destinations:
            yield self.queue.put(url)

    @gen.coroutine
    def get(self, transform, headers, connect_timeout, request_timeout):
        while True:
            url = yield self.queue.get()
            request = HTTPRequest(url,
                                  connect_timeout=connect_timeout,
                                  request_timeout=request_timeout,
                                  method="GET",
                                  headers=headers)
            future = self.http_client.fetch(request)

            def done_callback(future):
                self.queue.task_done()
                body = future.result().body
                transform(body)

            future.add_done_callback(done_callback)


def transform_data(body, url=''):
    # SOMECODE
    pass


a = ['link1', 'link2']
scraper = Scraper(destinations=a, transform=transform_data)
In a coroutine you can "yield" a future. The coroutine pauses until the future is resolved into a result or an exception:
try:
    result = yield self.http_client.fetch(request)
except Exception as exc:
    print("Failure!: %s" % exc)
else:
    self.queue.task_done()
    body = result.body
    transform(body)
For more examples, see the Tornado documentation for HTTP clients.
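Applied to the Scraper class from the question, the get coroutine can yield the fetch and record the failures as it goes. A sketch (self.failed_urls is a hypothetical list you would initialise in __init__, e.g. self.failed_urls = []):
@gen.coroutine
def get(self, transform, headers, connect_timeout, request_timeout):
    while True:
        url = yield self.queue.get()
        request = HTTPRequest(url,
                              connect_timeout=connect_timeout,
                              request_timeout=request_timeout,
                              method="GET",
                              headers=headers)
        try:
            result = yield self.http_client.fetch(request)
        except Exception as exc:
            # timeouts and other failures land here together with their URL
            self.failed_urls.append((url, exc))
        else:
            transform(result.body, url=url)
        finally:
            self.queue.task_done()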
