I have a Python class built on Tornado that works like a crawler. I have a lot of links on the same site and I need to get responses from all of them into my database.
The difficulty is this: I can't understand how to catch the URLs that got an error (a timeout, or a runtime exception).
I know how to solve this with newbie code (I've only been coding in Python for a week) by comparing the list of input links with the list of outputs, but I want to do it the right way.
Can you tell me how I can do this?
import sys
import time
import requests
import json
from tornado import gen, ioloop
from tornado.httpclient import AsyncHTTPClient, HTTPRequest
from tornado.queues import Queue

class Scraper():
    def __init__(self, source='', destinations=None, transform=None, headers={}, max_clients=20, maxsize=20, connect_timeout=600, request_timeout=600):
        """Instantiate a tornado async http client to do many URL requests"""
        if destinations is None or transform is None:
            sys.stderr.write('You must pass both a collection of URLs and a transform function')
            raise SystemExit
        self.max_clients = max_clients
        self.maxsize = maxsize
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout
        # AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient", max_clients=50)
        AsyncHTTPClient.configure("tornado.simple_httpclient.SimpleAsyncHTTPClient", max_clients=self.max_clients)
        self.headers = headers
        self.http_client = AsyncHTTPClient()
        self.queue = Queue(maxsize=self.maxsize)
        self.source = source
        self.destinations = destinations
        self.transform = transform
        self.read(self.destinations)
        self.get(self.transform, self.headers, self.connect_timeout, self.request_timeout)
        self.loop = ioloop.IOLoop.current()
        self.join_future = self.queue.join()

        def done(future):
            self.loop.stop()
        self.join_future.add_done_callback(done)
        self.loop.start()

    @gen.coroutine
    def read(self, destinations):
        for url in destinations:
            yield self.queue.put(url)

    @gen.coroutine
    def get(self, transform, headers, connect_timeout, request_timeout):
        while True:
            url = yield self.queue.get()
            request = HTTPRequest(url,
                                  connect_timeout=connect_timeout,
                                  request_timeout=request_timeout,
                                  method="GET",
                                  headers=headers)
            future = self.http_client.fetch(request)

            def done_callback(future):
                self.queue.task_done()
                body = future.result().body
                transform(body)

            future.add_done_callback(done_callback)

def transform_data(body, url=''):
    # SOMECODE
    pass

a = ['link1', 'link2']
scraper = Scraper(destinations=a, transform=transform_data)
In a coroutine you can "yield" a future. The coroutine pauses until the future is resolved into a result or an exception:
try:
    result = yield self.http_client.fetch(request)
except Exception as exc:
    print("Failure!: %s" % exc)
else:
    self.queue.task_done()
    body = result.body
    transform(body)
For more examples, see the Tornado documentation for HTTP clients.
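Folding that pattern into the get coroutine from the question, a minimal sketch that also collects the failing URLs might look like this (the failed list is an illustrative addition, not part of the original code):

@gen.coroutine
def get(self, transform, headers, connect_timeout, request_timeout):
    self.failed = []  # illustrative: (url, exception) pairs for fetches that raised
    while True:
        url = yield self.queue.get()
        request = HTTPRequest(url,
                              connect_timeout=connect_timeout,
                              request_timeout=request_timeout,
                              method="GET",
                              headers=headers)
        try:
            result = yield self.http_client.fetch(request)
        except Exception as exc:
            # timeouts and runtime errors from the fetch land here
            self.failed.append((url, exc))
        else:
            transform(result.body)
        finally:
            # mark the task done whether it succeeded or failed,
            # so queue.join() can eventually resolve
            self.queue.task_done()

Because task_done() now runs in the finally block, queue.join() resolves even when some URLs fail, and self.failed ends up holding exactly the list of error URLs the question asks for.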
I use package requests together with urllib3.util.retry.Retry() to send tens of thousands of queries. I seek to count the number of queries and the number of necessary attempts until I successfully retrieve the desired data. My goal is to construct a measure for the reliability of the API.
To fix ideas, let's assume that the Response object of requests contains this data:
from requests import Session
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

def create_session():
    session = Session()
    retries = Retry(
        total=15,
        backoff_factor=0.5,
        status_forcelist=[401, 408, 429, 500, 502, 504],
        allowed_methods=frozenset(["GET"])
    )
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    return session

urls = ['https://httpbin.org/status/500']
count_queries = len(urls)
count_attempts = 0
with create_session() as s:
    for url in urls:
        response = s.get(url)
        count_attempts += response.total_retries
Since there is no such variable, I am looking for alternatives to count the total number of retries.
While I am unable to identify an approach to this problem, I made the following observations during my search, which are potentially helpful:
urllib3 stores the retry-history in the Retry object. The urllib3.HTTPResponse stores the last Retry object (docs). The urllib3.HTTPResponse (to be precise, its undecoded body) is stored in requests.Response.raw, however only when stream=True (docs). In my understanding, I can't access this data.
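If that observation is right, the retry history may in fact be reachable through response.raw; this is an untested sketch based on the docs cited above, assuming the adapter leaves the final Retry object on the raw urllib3 response:

with create_session() as s:
    # stream=True keeps the raw urllib3.HTTPResponse accessible
    response = s.get('https://httpbin.org/status/200', stream=True)
    retry_state = response.raw.retries  # last Retry object, per the urllib3 docs
    if retry_state is not None:
        # Retry.history is a tuple of RequestHistory entries,
        # one per attempt that triggered a retry
        print(len(retry_state.history))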
One user provides a solution to a similar question that subclasses the Retry class. Essentially, a callback function is called which prints a string to a logger. This could be adapted to increment a counter instead of printing to logs. However, if possible, I prefer to track the retries specific to a particular get, as shown above, as opposed to all gets using the same session.
A very similar question was asked here, however no (working) solution was provided.
I'm using Python 3.9, urllib3 1.26.8, requests 2.26.0.
This is a rather verbose solution along the lines of this answer. It counts requests and retries at the session level (which, however, was not my preferred approach).
import requests
from urllib3.util.retry import Retry

class RequestTracker:
    """ track queries and retries """
    def __init__(self):
        self._retries = 0
        self._queries = 0

    def register_retry(self):
        self._retries += 1

    def register_query(self):
        self._queries += 1

    @property
    def retries(self):
        return self._retries

    @property
    def queries(self):
        return self._queries

class RetryTracker(Retry):
    """ subclass Retry to track count of retries """
    def __init__(self, *args, **kwargs):
        self._request_tracker = kwargs.pop('request_tracker', None)
        super(RetryTracker, self).__init__(*args, **kwargs)

    def new(self, **kw):
        """ pass additional information when creating new Retry instance """
        kw['request_tracker'] = self._request_tracker
        return super(RetryTracker, self).new(**kw)

    def increment(self, method, url, *args, **kwargs):
        """ register retry attempt when new Retry object with incremented counter is returned """
        if self._request_tracker:
            self._request_tracker.register_retry()
        return super(RetryTracker, self).increment(method, url, *args, **kwargs)

class RetrySession(requests.Session):
    """ subclass Session to track count of queries """
    def __init__(self, request_tracker):
        super().__init__()
        self._requests_count = request_tracker

    def prepare_request(self, request):
        """ increment query counter """
        self._requests_count.register_query()
        return super().prepare_request(request)

class RequestManager:
    """ manage requests """
    def __init__(self, request_tracker=None):
        # session settings
        self.__session = None
        self.__request_tracker = request_tracker
        # retry logic specification
        args = dict(
            total=11,
            backoff_factor=1,
            status_forcelist=[401, 408, 429, 500, 502, 504],
            allowed_methods=frozenset(["GET"])
        )
        if self.__request_tracker is not None:
            args['request_tracker'] = self.__request_tracker
            self.__retries = RetryTracker(**args)
        else:
            self.__retries = Retry(**args)

    @property
    def session(self):
        if self.__session is None:
            # create new session
            if self.__request_tracker is not None:
                self.__session = RetrySession(self.__request_tracker)
            else:
                self.__session = requests.Session()
            # mount https adapter with retry logic
            https = requests.adapters.HTTPAdapter(max_retries=self.__retries)
            self.__session.mount('https://', https)
        return self.__session

    @session.setter
    def session(self, value):
        raise AttributeError('Setting session attribute is prohibited.')

request_tracker = RequestTracker()
request_manager = RequestManager(request_tracker=request_tracker)
session = request_manager.session

urls = ['https://httpbin.org/status/500']
with session as s:
    for url in urls:
        response = s.get(url)

print(request_tracker.queries)
print(request_tracker.retries)
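One thing worth noting about this layering: prepare_request runs once per s.get call, while the Retry.increment hook fires inside urllib3 for every retried attempt, so queries counts logical requests and retries counts low-level retry attempts. Also, since the URL above always returns a status in status_forcelist, s.get will raise requests.exceptions.RetryError once the retry budget is exhausted, so in practice you would wrap the call in a try/except before reading the counters.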
I am trying to pass my session object from one class to another, but I am not sure what's happening.
import requests

class CreateSession:
    def __init__(self, user, pwd, url="http://url_to_hit"):
        self.url = url
        self.user = user
        self.pwd = pwd

    def get_session(self):
        sess = requests.Session()
        r = sess.get(self.url + "/", auth=(self.user, self.pwd))
        print(r.content)
        return sess

class TestGet(CreateSession):
    def get_response(self):
        s = self.get_session()
        print(s)
        data = s.get(self.url + '/some-get')
        print(data.status_code)
        print(data)

if __name__ == "__main__":
    TestGet(user='user', pwd='pwd').get_response()
I am getting a 401 for get_response() and am not able to understand why.
What's a 401?
The response you're getting means that you're unauthorised to access the resource.
A session is used to persist headers and other prerequisites across requests; why are you creating the session on every call rather than storing it in a variable?
As written, the session should work; the only issue is that you're trying to call a resource you don't have access to. You're also not passing the url parameter in the initialisation.
Example of how you can effectively use Session:
from requests import Session
from requests.exceptions import HTTPError

class TestGet:
    __session = None
    __username = None
    __password = None

    def __init__(self, username, password):
        self.__username = username
        self.__password = password

    @property
    def session(self):
        if self.__session is None:
            self.__session = Session()
            self.__session.auth = (self.__username, self.__password)
        return self.__session

    @session.setter
    def session(self, value):
        raise AttributeError('Setting \'session\' attribute is prohibited.')

    def get_response(self, url):
        try:
            response = self.session.get(url)
            # raises if the status code is an error - 4xx, 5xx
            response.raise_for_status()
            return response
        except HTTPError as e:
            # you received an http error .. handle it here (e contains the request and response)
            pass

test_get = TestGet('my_user', 'my_pass')
first_response = test_get.get_response('http://your-website-with-basic-auth.com')
second_response = test_get.get_response('http://another-url.com')

my_session = test_get.session
my_session.get('http://url.com')
I am building a web browser and I want to enable ad blocking in it.
I have read multiple answers, but I haven't been able to implement it successfully.
I have successfully loaded the ad filter, and ad matching works fine.
I think this has something to do with the networkAccessManager, but I am unable to figure out how.
This is my class that inherits the QNetworkAccessManager class:
class NetworkManager(QNetworkAccessManager):
    def __init__(self):
        super().__init__()
        self.adblocker = Filter(open('easylist.txt', encoding="utf8"))
        self.finished.connect(self._finished)

    def createRequest(self, op, request, device=None):
        url = request.url().toString()
        if self.adblocker.match(url):
            print('blocking url, ', url)
            # block ads here
        else:
            print('good to go', url)
        return QNetworkAccessManager.createRequest(self, op, request, device)

    def examine(self, url):
        self.get(QNetworkRequest(QUrl(url)))

    def _finished(self, reply):
        headers = reply.rawHeaderPairs()
        headers = {str(k): str(v) for k, v in headers}
        content_type = headers.get("Content-Type")
        url = reply.url().toString()
        status = reply.attribute(QNetworkRequest.HttpStatusCodeAttribute)
        cookies = headers.get("Set-Cookie")
        logger.log('{} --- {} --- {}'.format(str(status), url, content_type), 2)
I tried overriding the createRequest method. The ads are getting detected, but those ad requests are not actually getting blocked.
How do I achieve this?
This is how I finally implemented the ad blocker: you just need to override the acceptNavigationRequest method of the QWebEnginePage class.
class WebPage(QWebEnginePage):
    adblocker = Filter(open('easylist.txt', encoding="utf8"))

    def __init__(self, parent=None):
        super().__init__(parent)

    def acceptNavigationRequest(self, url, _type, isMainFrame):
        urlString = url.toString()
        if WebPage.adblocker.match(urlString):
            print("Blocking url --- " + urlString)
            return False
        print("TYPE", _type)
        return QWebEnginePage.acceptNavigationRequest(self, url, _type, isMainFrame)
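To wire this page into the browser, a minimal sketch might look like the following (it assumes a PyQt5 QtWebEngine setup; the view and URL are illustrative):

import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView

app = QApplication(sys.argv)
view = QWebEngineView()
# install the filtering page so every navigation request
# passes through acceptNavigationRequest
view.setPage(WebPage(view))
view.load(QUrl('https://example.com'))
view.show()
sys.exit(app.exec_())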
I have the following code:
class StackOverflowHandler(tornado.web.RequestHandler):
    def get(self, look_up_pattern):
        url = "https://api.stackexchange.com/2.2/search?order=desc&sort=votes&intitle=%s&site=stackoverflow"
        response = self.async_get(url)
        print(response)
        self.write(response)

    @gen.coroutine
    def async_get(self, url):
        link = httpclient.AsyncHTTPClient()
        request = httpclient.HTTPRequest(url)
        response = yield link.fetch(request)
        data = response.body.decode('utf-8')
        data = json.loads(data)
        return data

application = tornado.web.Application([
    (r"/search/(.*)", StackOverflowHandler),
])
The type returned from async_get is tornado.concurrent.Future.
The exception is:
TypeError: write() only accepts bytes, unicode, and dict objects
I am new to asynchronous programming; please point out my mistake.
Since async_get is a coroutine, it returns a Future object. To get the "real" result, the Future must be resolved, i.e. it needs to be yielded. Moreover, the get handler must be decorated as a coroutine as well:
class StackOverflowHandler(tornado.web.RequestHandler):
    @gen.coroutine
    def get(self, look_up_pattern):
        url = "https://api.stackexchange.com/2.2/search?order=desc&sort=votes&intitle=%s&site=stackoverflow"
        response = yield self.async_get(url)
        print(response)
        self.write(response)

    @gen.coroutine
    def async_get(self, url):
        link = httpclient.AsyncHTTPClient()
        request = httpclient.HTTPRequest(url)
        response = yield link.fetch(request)
        data = response.body.decode('utf-8')
        data = json.loads(data)
        return data
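As a side note, on Python 3.5+ with a recent Tornado (4.3 or newer), the same handler can be written with native coroutines instead of the decorator; a sketch under that assumption:

import json

import tornado.web
from tornado import httpclient

class StackOverflowHandler(tornado.web.RequestHandler):
    async def get(self, look_up_pattern):
        url = "https://api.stackexchange.com/2.2/search?order=desc&sort=votes&intitle=%s&site=stackoverflow"
        # await suspends the handler until the fetch completes
        response = await self.async_get(url)
        self.write(response)

    async def async_get(self, url):
        client = httpclient.AsyncHTTPClient()
        response = await client.fetch(httpclient.HTTPRequest(url))
        return json.loads(response.body.decode('utf-8'))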
I need advice regarding testing a tornado app. For now I am just playing with the demo chat application, but it looks like a real-life problem.
In the handler I have:
class MessageUpdatesHandler(BaseHandler):
    @tornado.web.authenticated
    @tornado.web.asynchronous
    def post(self):
        cursor = self.get_argument("cursor", None)
        global_message_buffer.wait_for_messages(self.on_new_messages,
                                                cursor=cursor)

    def on_new_messages(self, messages):
        # Closed client connection
        if self.request.connection.stream.closed():
            return
        self.finish(dict(messages=messages))
class MessageBuffer(object):
    def __init__(self):
        ....

    def wait_for_messages(self, callback, cursor=None):
        if cursor:
            new_count = 0
            for msg in reversed(self.cache):
                if msg["id"] == cursor:
                    break
                new_count += 1
            if new_count:
                callback(self.cache[-new_count:])
                return
        self.waiters.add(callback)

    def cancel_wait(self, callback):
        .....

    def new_messages(self, messages):
        logging.info("Sending new message to %r listeners", len(self.waiters))
        for callback in self.waiters:
            try:
                callback(messages)
            except:
                logging.error("Error in waiter callback", exc_info=True)
        self.waiters = set()
        self.cache.extend(messages)
        if len(self.cache) > self.cache_size:
            self.cache = self.cache[-self.cache_size:]
As I mentioned, the full source code is in the tornado demos.
In my test I have:
@wsgi_safe
class MessageUpdatesHandlerTest(LoginedUserHanldersTest):
    Handler = MessageUpdatesHandler

    def test_add_message(self):
        from chatdemo import global_message_buffer
        kwargs = dict(
            method="POST",
            body='',
        )
        future = self.http_client.fetch(self.get_url('/'), callback=self.stop, **kwargs)
        message = {
            "id": '123',
            "from": "first_name",
            "body": "hello",
            "html": "html"
        }
        global_message_buffer.new_messages([message])
        response = self.wait()
        self.assertEqual(response.code, 200)
        self.mox.VerifyAll()
What happens:
1. It creates a future object.
2. It sends the hello message; at this moment no waiter is registered in MessageBuffer, so the callback is not called.
3. self.wait() starts the IOLoop and makes the POST fetch, and the waiter becomes registered in MessageBuffer.
4. The callback is never called and my response remains empty, so everything fails with AssertionError: Async operation timed out after 5 seconds.
What I want it to do:
1. On POST, register itself as a waiter.
2. Receive some messages.
3. Return a 200 response to me.
Thank you for your help.
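For what it's worth, one way the ordering problem might be fixed is to schedule new_messages on the IOLoop so it only fires after self.wait() has started the loop and the POST has registered its waiter. This is an untested sketch; it assumes the io_loop attribute that AsyncTestCase provides and reuses the message from the question:

def test_add_message(self):
    from chatdemo import global_message_buffer
    message = {
        "id": '123',
        "from": "first_name",
        "body": "hello",
        "html": "html"
    }
    self.http_client.fetch(self.get_url('/'), callback=self.stop,
                           method="POST", body='')
    # Deliver the message slightly later, once the IOLoop is running
    # and the POST handler has had a chance to register its waiter.
    self.io_loop.add_timeout(
        self.io_loop.time() + 0.1,
        lambda: global_message_buffer.new_messages([message]))
    response = self.wait()
    self.assertEqual(response.code, 200)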