I use the requests package together with urllib3.util.retry.Retry() to send tens of thousands of queries. I want to count the number of queries and the number of attempts needed until I successfully retrieve the desired data. My goal is to construct a measure of the reliability of the API.
To make this concrete, let's assume the requests Response object contained this data:
from requests import Session
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
def create_session():
    session = Session()
    retries = Retry(
        total=15,
        backoff_factor=0.5,
        status_forcelist=[401, 408, 429, 500, 502, 504],
        allowed_methods=frozenset(["GET"])
    )
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    return session
urls = ['https://httpbin.org/status/500']
count_queries = len(urls)
count_attempts = 0
with create_session() as s:
    for url in urls:
        response = s.get(url)
        count_attempts += response.total_retries
Since there is no such variable, I am looking for alternatives to count the total number of retries.
While I have been unable to identify an approach to this problem, I made the following observations during my search which are potentially helpful:
urllib3 stores the retry history in the Retry object. The urllib3.HTTPResponse stores the last Retry object (docs). The urllib3.HTTPResponse (to be precise, its undecoded body) is stored in requests.Response.raw, however only when stream=True (docs). In my understanding, I can't access this data (see the sketch after these observations).
One user provides a solution to a similar question that subclasses the Retry class. Essentially, a callback function is called which prints a string to a logger. This could be adapted to increment a counter instead of printing to logs. However, if possible, I prefer to track the retries specific to a particular get, as shown above, as opposed to all gets using the same session.
A very similar question was asked here, however no (working) solution was provided.
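For illustration, this is the kind of access I was hoping for. It is a sketch only, assuming requests.Response.raw still carries the final Retry object together with its history; in practice I was not able to get at this data through requests:
with create_session() as s:
    response = s.get(url, stream=True)
    # urllib3 keeps the last Retry object on its HTTPResponse,
    # which requests exposes as response.raw
    retry_state = response.raw.retries      # urllib3.util.retry.Retry instance
    print(len(retry_state.history))         # one RequestHistory entry per retry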
I'm using Python 3.9, urllib3 1.26.8, requests 2.26.0.
This is a rather verbose solution along the lines of this answer. It counts requests and retries at the session level (which, however, was not my preferred approach).
import requests
from urllib3.util.retry import Retry
class RequestTracker:
    """ track queries and retries """
    def __init__(self):
        self._retries = 0
        self._queries = 0

    def register_retry(self):
        self._retries += 1

    def register_query(self):
        self._queries += 1

    @property
    def retries(self):
        return self._retries

    @property
    def queries(self):
        return self._queries
class RetryTracker(Retry):
    """ subclass Retry to track count of retries """
    def __init__(self, *args, **kwargs):
        self._request_tracker = kwargs.pop('request_tracker', None)
        super(RetryTracker, self).__init__(*args, **kwargs)

    def new(self, **kw):
        """ pass additional information when creating new Retry instance """
        kw['request_tracker'] = self._request_tracker
        return super(RetryTracker, self).new(**kw)

    def increment(self, method, url, *args, **kwargs):
        """ register retry attempt when new Retry object with incremented counter is returned """
        if self._request_tracker:
            self._request_tracker.register_retry()
        return super(RetryTracker, self).increment(method, url, *args, **kwargs)
class RetrySession(requests.Session):
    """ subclass Session to track count of queries """
    def __init__(self, request_tracker):
        super().__init__()
        self._request_tracker = request_tracker

    def prepare_request(self, request):
        """ increment query counter """
        self._request_tracker.register_query()
        return super().prepare_request(request)
class RequestManager:
    """ manage requests """
    def __init__(self, request_tracker=None):
        # session settings
        self.__session = None
        self.__request_tracker = request_tracker
        # retry logic specification
        args = dict(
            total=11,
            backoff_factor=1,
            status_forcelist=[401, 408, 429, 500, 502, 504],
            allowed_methods=frozenset(["GET"])
        )
        if self.__request_tracker is not None:
            args['request_tracker'] = self.__request_tracker
            self.__retries = RetryTracker(**args)
        else:
            self.__retries = Retry(**args)

    @property
    def session(self):
        if self.__session is None:
            # create new session
            if self.__request_tracker is not None:
                self.__session = RetrySession(self.__request_tracker)
            else:
                self.__session = requests.Session()
            # mount https adapter with retry logic
            https = requests.adapters.HTTPAdapter(max_retries=self.__retries)
            self.__session.mount('https://', https)
        return self.__session

    @session.setter
    def session(self, value):
        raise AttributeError('Setting session attribute is prohibited.')
request_tracker = RequestTracker()
request_manager = RequestManager(request_tracker=request_tracker)
session = request_manager.session
urls = ['https://httpbin.org/status/500']
with session as s:
    for url in urls:
        response = s.get(url)

print(request_tracker.queries)
print(request_tracker.retries)
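One caveat worth adding (my own observation, not part of the solution above): when the endpoint keeps failing and the retry budget is exhausted, s.get() raises requests.exceptions.RetryError instead of returning a response, so the counters are best read inside a try/except:
from requests.exceptions import RetryError

with session as s:
    for url in urls:
        try:
            response = s.get(url)
        except RetryError:
            # all retries were used up; the trackers still hold the counts
            pass

print(request_tracker.queries)
print(request_tracker.retries)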
Related
I am trying to pass my session object from one class to another, but I am not sure what's happening.
import requests

class CreateSession:
    def __init__(self, user, pwd, url="http://url_to_hit"):
        self.url = url
        self.user = user
        self.pwd = pwd

    def get_session(self):
        sess = requests.Session()
        r = sess.get(self.url + "/", auth=(self.user, self.pwd))
        print(r.content)
        return sess

class TestGet(CreateSession):
    def get_response(self):
        s = self.get_session()
        print(s)
        data = s.get(self.url + '/some-get')
        print(data.status_code)
        print(data)

if __name__ == "__main__":
    TestGet(user='user', pwd='pwd').get_response()
I am getting a 401 for get_response() and am not able to understand why.
What's a 401?
The response you're getting means that you're unauthorised to access the resource.
A session is used in order to persist headers and other prerequisites throughout requests. Why are you creating the session every time rather than storing it in a variable?
As is, the session should work; the only issue is that you're trying to call a resource that you don't have access to. You're also not passing the url parameter in the initialisation.
Example of how you can effectively use Session:
from requests import Session
from requests.exceptions import HTTPError
class TestGet:
    __session = None
    __username = None
    __password = None

    def __init__(self, username, password):
        self.__username = username
        self.__password = password

    @property
    def session(self):
        if self.__session is None:
            self.__session = Session()
            self.__session.auth = (self.__username, self.__password)
        return self.__session

    @session.setter
    def session(self, value):
        raise AttributeError('Setting \'session\' attribute is prohibited.')

    def get_response(self, url):
        try:
            response = self.session.get(url)
            # raises if the status code is an error - 4xx, 5xx
            response.raise_for_status()
            return response
        except HTTPError as e:
            # you received an http error .. handle it here (e contains the request and response)
            pass
test_get = TestGet('my_user', 'my_pass')
first_response = test_get.get_response('http://your-website-with-basic-auth.com')
second_response = test_get.get_response('http://another-url.com')
my_session = test_get.session
my_session.get('http://url.com')
I am building a web browser and I want to enable ad blocking in it.
I have read multiple answers, but I haven't been able to implement it successfully.
I have successfully loaded the ad filter and ad matching works fine.
I think this has something to do with the QNetworkAccessManager but I am unable to figure out how.
This is my class that inherits from QNetworkAccessManager:
class NetworkManager(QNetworkAccessManager):
    def __init__(self):
        super().__init__()
        self.adblocker = Filter(open('easylist.txt', encoding="utf8"))
        self.finished.connect(self._finished)

    def createRequest(self, op, request, device=None):
        url = request.url().toString()
        if self.adblocker.match(url):
            print('blocking url, ', url)
            # block ads here
        else:
            print('good to go', url)
        return QNetworkAccessManager.createRequest(self, op, request, device)

    def examine(self, url):
        self.get(QNetworkRequest(QUrl(url)))

    def _finished(self, reply):
        headers = reply.rawHeaderPairs()
        headers = {str(k): str(v) for k, v in headers}
        content_type = headers.get("Content-Type")
        url = reply.url().toString()
        status = reply.attribute(QNetworkRequest.HttpStatusCodeAttribute)
        cookies = headers.get("Set-Cookie")
        logger.log('{} --- {} --- {}'.format(str(status), url, content_type), 2)
I tried overriding the createRequest method. The ads are getting detected, but those ad requests are not actually getting blocked.
How do I achieve this?
This is how I finally implemented the ad blocker: you just need to override the acceptNavigationRequest method of the QWebEnginePage class.
class WebPage(QWebEnginePage):
    adblocker = Filter(open('easylist.txt', encoding="utf8"))

    def __init__(self, parent=None):
        super().__init__(parent)

    def acceptNavigationRequest(self, url, _type, isMainFrame):
        urlString = url.toString()
        if WebPage.adblocker.match(urlString):
            print("Blocking url --- " + urlString)
            return False
        print("TYPE", _type)
        return QWebEnginePage.acceptNavigationRequest(self, url, _type, isMainFrame)
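For completeness, the page still has to be installed on the view. A minimal sketch, assuming PyQt5 and the WebPage class defined above:
import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView

app = QApplication(sys.argv)
view = QWebEngineView()
view.setPage(WebPage(view))             # use the ad-blocking page defined above
view.load(QUrl("https://example.com"))
view.show()
sys.exit(app.exec_())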
When using a Session, it seems you need to provide the full URL each time, e.g.
session = requests.Session()
session.get('http://myserver/getstuff')
session.get('http://myserver/getstuff2')
This gets a little tedious. Is there a way to do something like:
session = requests.Session(url_base='http://myserver')
session.get('/getstuff')
session.get('/getstuff2')
This feature has been asked for on the forums a few times. The preferred approach, as documented, is subclassing, as follows:
from requests import Session
from urllib.parse import urljoin
class LiveServerSession(Session):
    def __init__(self, base_url=None):
        super().__init__()
        self.base_url = base_url

    def request(self, method, url, *args, **kwargs):
        joined_url = urljoin(self.base_url, url)
        return super().request(method, joined_url, *args, **kwargs)
You would use this simply as follows:
baseUrl = 'http://api.twitter.com'
with LiveServerSession(baseUrl) as s:
    resp = s.get('/1/statuses/home_timeline.json')
requests_toolbelt.sessions.BaseUrlSession
https://github.com/requests/toolbelt/blob/f5c86c51e0a01fbc8b3b4e1c286fd5c7cb3aacfa/requests_toolbelt/sessions.py#L6
NOTE: This uses urljoin from the standard library. Beware of urljoin's behavior: the base URL should end with a trailing slash and the relative path should not start with one, otherwise leading path components are dropped.
In [14]: from urllib.parse import urljoin
In [15]: urljoin('https://localhost/api', '/resource')
Out[15]: 'https://localhost/resource'
In [16]: urljoin('https://localhost/api', 'resource')
Out[16]: 'https://localhost/resource'
In [17]: urljoin('https://localhost/api/', '/resource')
Out[17]: 'https://localhost/resource'
In [18]: urljoin('https://localhost/api/', 'resource')
Out[18]: 'https://localhost/api/resource'
OR
import requests
from functools import partial
def PrefixUrlSession(prefix=None):
    if prefix is None:
        prefix = ""
    else:
        prefix = prefix.rstrip('/') + '/'

    def new_request(prefix, f, method, url, *args, **kwargs):
        return f(method, prefix + url, *args, **kwargs)

    s = requests.Session()
    s.request = partial(new_request, prefix, s.request)
    return s
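Usage then looks like this (a short sketch; example.com is a placeholder base URL):
s = PrefixUrlSession('https://example.com')
r = s.get('getstuff')   # requests https://example.com/getstuff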
You could just subclass requests.Session and overload its __init__ and request methods like this:
# my_requests.py
import requests
class SessionWithUrlBase(requests.Session):
    # In Python 3 you could place `url_base` after `*args`, but not in Python 2.
    def __init__(self, url_base=None, *args, **kwargs):
        super(SessionWithUrlBase, self).__init__(*args, **kwargs)
        self.url_base = url_base

    def request(self, method, url, **kwargs):
        # Next line of code is here for example purposes only.
        # You really shouldn't just use string concatenation here,
        # take a look at urllib.parse.urljoin instead.
        modified_url = self.url_base + url
        return super(SessionWithUrlBase, self).request(method, modified_url, **kwargs)
And then you could use your subclass instead of requests.Session in your code:
from my_requests import SessionWithUrlBase
session = SessionWithUrlBase(url_base='https://stackoverflow.com/')
session.get('documentation') # https://stackoverflow.com/documentation
Also, you could monkey-patch requests.Session to avoid modifying the existing codebase (this implementation should be 100% compatible), but be sure to do the actual patching before any code calls requests.Session():
# monkey_patch.py
import requests
class SessionWithUrlBase(requests.Session):
    ...
requests.Session = SessionWithUrlBase
And then:
# main.py
import requests
import monkey_patch
session = requests.Session()
repr(session) # <monkey_patch.SessionWithUrlBase object at ...>
I don't see a built-in way to do this, but you can use wrapper functions to add the functionality you want:
from functools import wraps
import inspect
import requests
from requests.compat import urljoin
def _base_url(func, base):
    '''Decorator for adding a base URL to func's url parameter'''
    @wraps(func)
    def wrapper(*args, **kwargs):
        argname = 'url'
        argspec = inspect.getargspec(func)
        if argname in kwargs:
            kwargs[argname] = urljoin(base, kwargs[argname])
        else:
            # Find and replace url parameter in positional args. The argspec
            # includes self while args doesn't, so indexes have to be shifted
            # over one
            for i, name in enumerate(argspec[0]):
                if name == argname:
                    args = list(args)
                    args[i-1] = urljoin(base, args[i-1])
                    break
        return func(*args, **kwargs)
    return wrapper

def inject_base_url(func):
    '''Decorator for adding a base URL to all methods that take a url param'''
    @wraps(func)
    def wrapper(*args, **kwargs):
        argname = 'base_url'
        if argname in kwargs:
            obj = args[0]
            # Add base_url decorator to all methods that have a url parameter
            for name, method in inspect.getmembers(obj, inspect.ismethod):
                argspec = inspect.getargspec(method.__func__)
                if 'url' in argspec[0]:
                    setattr(obj, name, _base_url(method, kwargs[argname]))
            del kwargs[argname]
        return func(*args, **kwargs)
    return wrapper

# Wrap requests.Session.__init__ so it takes a base_url parameter
setattr(
    requests.Session,
    '__init__',
    inject_base_url(getattr(requests.Session, '__init__'))
)
Now you can specify a base URL when you construct a new requests.Session object:
s = requests.Session(base_url='http://stackoverflow.com')
s.get('questions') # http://stackoverflow.com/questions
s.post('documentation') # http://stackoverflow.com/documentation
# With no base_url, you get the default behavior
s = requests.Session()
s.get('http://google.com')
Keep it simple and use built-in methods for joining (no '/' suffix hassle):
import urllib.parse
import requests

session = requests.Session()
session.my_base_url_join = lambda path: urllib.parse.urljoin(str_BASE_URL, path)  # str_BASE_URL is your base URL
# use: session.get(session.my_base_url_join(path='/message'))
This is possible with the requests-toolbelt package, which enables setting a base_url at the session level:
from requests_toolbelt import sessions
s = sessions.BaseUrlSession(
    base_url='https://example.com/resource/')
r = s.get('sub-resource/', params={'foo': 'bar'})
>>> print(r.request.url)
https://example.com/resource/sub-resource/?foo=bar
See the documentation for BaseUrlSession.
Based on @qrtLs's answer, here is a 3-4 line version that does what you want (assuming you don't need to create multiple sessions and only need the get method defined).
import requests
import urllib.parse
session = requests.Session()
session.base_url = "https://google.com/"
session.get = lambda *args, **kwargs: requests.Session.get(session, urllib.parse.urljoin(session.base_url, args[0]), *args[1:], **kwargs)
r = session.get("/search?q=asdf", verify=False)
I have a Python class built on python+tornado that works like a crawler. I have lots of links on the same site and I need to get responses from all of them into my database.
The difficulty is this: I can't understand how I can catch the URLs that produced an error (a timeout, or a runtime exception).
I know how to fix this with newbie code (I've only been coding in Python for a week) by comparing the list of input links with the outputs, but I want to do it the right way.
Can you tell me how I can do this?
import sys
import time
import requests
import json
from tornado import gen, ioloop
from tornado.httpclient import AsyncHTTPClient, HTTPRequest
from tornado.queues import Queue
class Scraper():
    def __init__(self, source='', destinations=None, transform=None, headers={ }, max_clients=20, maxsize=20, connect_timeout=600, request_timeout=600 ):
        """Instantiate a tornado async http client to do many URL requests"""
        if None in destinations:
            sys.stderr.write('You must pass both collection of URLS and a transform function')
            raise SystemExit
        self.max_clients = max_clients
        self.maxsize = maxsize
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout
        # AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient", max_clients=50)
        AsyncHTTPClient.configure("tornado.simple_httpclient.SimpleAsyncHTTPClient", max_clients=self.max_clients)
        self.headers = headers
        self.http_client = AsyncHTTPClient()
        self.queue = Queue(maxsize=20)
        self.source = source
        self.destinations = destinations
        self.transform = transform
        self.read(self.destinations)
        self.get(self.transform, self.headers, self.connect_timeout, self.request_timeout)
        self.loop = ioloop.IOLoop.current()
        self.join_future = self.queue.join()

        def done(future):
            self.loop.stop()

        self.join_future.add_done_callback(done)
        self.loop.start()

    @gen.coroutine
    def read(self, destinations):
        for url in destinations:
            yield self.queue.put(url)

    @gen.coroutine
    def get(self, transform, headers, connect_timeout, request_timeout):
        while True:
            url = yield self.queue.get()
            request = HTTPRequest(url,
                                  connect_timeout=connect_timeout,
                                  request_timeout=request_timeout,
                                  method="GET",
                                  headers=headers
                                  )
            future = self.http_client.fetch(request)

            def done_callback(future):
                self.queue.task_done()
                body = future.result().body
                transform(body)

            future.add_done_callback(done_callback)

def transform_data(body, url=''):
    #SOMECODE

a = ['link1', 'link2']
scraper = Scraper(destinations=a, transform=transform_data)
In a coroutine you can "yield" a future. The coroutine pauses until the future is resolved into a result or an exception:
try:
    result = yield self.http_client.fetch(request)
except Exception as exc:
    print("Failure!: %s" % exc)
else:
    self.queue.task_done()
    body = result.body
    transform(body)
For more examples, see the Tornado documentation for HTTP clients.
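To get back to the original question, collecting the URLs that failed, the exception branch can simply record them. A sketch of how Scraper.get could be rewritten along those lines, assuming a self.failed list is initialised in __init__:
@gen.coroutine
def get(self, transform, headers, connect_timeout, request_timeout):
    while True:
        url = yield self.queue.get()
        request = HTTPRequest(url,
                              connect_timeout=connect_timeout,
                              request_timeout=request_timeout,
                              method="GET",
                              headers=headers)
        try:
            result = yield self.http_client.fetch(request)
        except Exception as exc:
            # remember which URL failed and why
            self.failed.append((url, exc))
        else:
            transform(result.body)
        finally:
            self.queue.task_done()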
I have the following code:
class StackOverflowHandler(tornado.web.RequestHandler):
    def get(self, look_up_pattern):
        url = "https://api.stackexchange.com/2.2/search?order=desc&sort=votes&intitle=%s&site=stackoverflow"
        response = self.async_get(url)
        print(response)
        self.write(response)

    @gen.coroutine
    def async_get(self, url):
        link = httpclient.AsyncHTTPClient()
        request = httpclient.HTTPRequest(url)
        response = yield link.fetch(request)
        data = response.body.decode('utf-8')
        data = json.loads(data)
        return data

application = tornado.web.Application([
    (r"/search/(.*)", StackOverflowHandler),
])
The type returned from async_get is tornado.concurrent.Future.
The exception is:
TypeError: write() only accepts bytes, unicode, and dict objects
I am new to asynchronous programming; please point out my mistake.
Since async_get is a coroutine, it returns a Future object. To get the "real" result, the Future must be resolved, i.e. it needs to be yielded. Moreover, the get handler must be decorated as a coroutine as well:
class StackOverflowHandler(tornado.web.RequestHandler):
    @gen.coroutine
    def get(self, look_up_pattern):
        url = "https://api.stackexchange.com/2.2/search?order=desc&sort=votes&intitle=%s&site=stackoverflow"
        response = yield self.async_get(url)
        print(response)
        self.write(response)

    @gen.coroutine
    def async_get(self, url):
        link = httpclient.AsyncHTTPClient()
        request = httpclient.HTTPRequest(url)
        response = yield link.fetch(request)
        data = response.body.decode('utf-8')
        data = json.loads(data)
        return data