How Do I block ads in PyQt - python

I am building a web browser and I want to enable ad blocking in it.
I have read multiple answers, but I haven't been able to implement it successfully.
I have successfully loaded the adFilter and ad matching works fine.
I think this has something to do with the networkAccessManager but I am unable to figure out how.
This is my class that inherits the QNetworkAccessManager class
class NetworkManager(QNetworkAccessManager):
    """QNetworkAccessManager that consults an ad filter before creating requests."""

    def __init__(self):
        super().__init__()
        # easylist.txt holds the filter rules (EasyList format).
        self.adblocker = Filter(open('easylist.txt', encoding="utf8"))
        self.finished.connect(self._finished)

    def createRequest(self, op, request, device=None):
        url = request.url().toString()
        if self.adblocker.match(url):
            print('blocking url, ', url)
            # Bug fix: the original fell through and still issued the real
            # request. Substitute an empty URL so the network layer fails
            # fast and the ad resource is never fetched.
            return QNetworkAccessManager.createRequest(
                self, op, QNetworkRequest(QUrl()), device)
        print('good to go', url)
        return QNetworkAccessManager.createRequest(self, op, request, device)

    def examine(self, url):
        # Fire a GET so _finished can log the response metadata.
        self.get(QNetworkRequest(QUrl(url)))

    def _finished(self, reply):
        # Log status, URL and content type for every completed reply.
        headers = reply.rawHeaderPairs()
        headers = {str(k): str(v) for k, v in headers}
        content_type = headers.get("Content-Type")
        url = reply.url().toString()
        status = reply.attribute(QNetworkRequest.HttpStatusCodeAttribute)
        cookies = headers.get("Set-Cookie")
        logger.log('{} --- {} --- {}'.format(str(status), url, content_type), 2)
I tried overriding the createRequest method. The ads are getting detected but those ad requests are not actually getting blocked.
How do I achieve this?

This is how I finally implemented the ad blocker. You just need to override the acceptNavigationRequest method in the QWebEnginePage class:
class WebPage(QWebEnginePage):
    """QWebEnginePage that vetoes navigation to URLs matched by the ad filter."""

    # Shared filter instance; easylist.txt holds EasyList-format rules.
    adblocker = Filter(open('easylist.txt', encoding="utf8"))

    def __init__(self, parent=None):
        super().__init__(parent)

    def acceptNavigationRequest(self, url, _type, isMainFrame):
        urlString = url.toString()
        if WebPage.adblocker.match(urlString):
            print("Blocking url --- " + urlString)
            # Returning False vetoes the navigation: the ad never loads.
            return False
        print("TYPE", _type)
        # Bug fix: the original returned True here, making the trailing
        # delegation to the base class unreachable dead code. Delegate to
        # the default implementation for allowed requests instead.
        return QWebEnginePage.acceptNavigationRequest(self, url, _type, isMainFrame)

Related

Count number of retries for each request

I use package requests together with urllib3.util.retry.Retry() to send tens of thousands of queries. I seek to count the number of queries and the number of necessary attempts until I successfully retrieve the desired data. My goal is to construct a measure for the reliability of the API.
To fix ideas, let's assume that the Response object of requests contains this data:
from requests import Session
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
def create_session():
    """Return a requests Session that retries GETs on transient HTTP errors."""
    retry_policy = Retry(
        total=15,
        backoff_factor=0.5,
        status_forcelist=[401, 408, 429, 500, 502, 504],
        allowed_methods=frozenset(["GET"]),
    )
    session = Session()
    # Apply the same retry policy to both plain and TLS endpoints.
    for scheme in ('http://', 'https://'):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session
# Illustrative driver from the question: count queries and retry attempts.
urls = ['https://httpbin.org/status/500']
count_queries = len(urls)
count_attempts = 0
with create_session() as s:
    for url in urls:
        response = s.get(url)
        # NOTE(review): requests.Response has no `total_retries` attribute —
        # this line shows the hypothetical API the question is asking for.
        count_attempts += response.total_retries
Since there is no such variable, I am looking for alternatives to count the total number of retries.
While I am unable to identify an approach to this problem, I made the following observations during my search which is potentially helpful:
urllib3 stores the retry-history in the Retry object. The urllib3.HTTPResponse stores the last Retry object (docs). The urllib3.HTTPResponse (to be precise, its undecoded body) is stored in requests.Response.raw, however only when stream=True (docs). In my understanding, I can't access this data.
One user provides a solution to a similar question that subclasses the Retry class. Essentially, a callback function is called which prints a string to a logger. This could be adapted to increment a counter instead of printing to logs. However, if possible, I prefer to track the retries specific to a particular get, as shown above, as opposed to all gets using the same session.
A very similar question was asked here, however no (working) solution was provided.
I'm using Python 3.9, urllib3 1.26.8, requests 2.26.0.
This is a rather verbose solution along the lines of this answer. It counts requests and retries on the session level (which, however, was not my preferred approach).
import requests
from urllib3.util.retry import Retry
class RequestTracker:
    """Track the number of queries issued and retries performed."""

    def __init__(self):
        self._retries = 0  # count of retry attempts
        self._queries = 0  # count of issued queries

    def register_retry(self):
        """Record one retry attempt."""
        self._retries += 1

    def register_query(self):
        """Record one issued query."""
        self._queries += 1

    # Bug fix: the scraped source had `#property`, which is a comment —
    # the accessors were plain methods, not properties, so the driver's
    # `request_tracker.queries` would print a bound method.
    @property
    def retries(self):
        """Total retries recorded so far."""
        return self._retries

    @property
    def queries(self):
        """Total queries recorded so far."""
        return self._queries
class RetryTracker(Retry):
    """Retry subclass that reports every retry to an optional RequestTracker."""

    def __init__(self, *args, **kwargs):
        # The tracker rides along in kwargs so urllib3 can clone us freely.
        self._request_tracker = kwargs.pop('request_tracker', None)
        super().__init__(*args, **kwargs)

    def new(self, **kw):
        """Propagate the tracker into each cloned Retry instance."""
        kw['request_tracker'] = self._request_tracker
        return super().new(**kw)

    def increment(self, method, url, *args, **kwargs):
        """Count the retry, then defer to urllib3's own bookkeeping."""
        tracker = self._request_tracker
        if tracker:
            tracker.register_retry()
        return super().increment(method, url, *args, **kwargs)
class RetrySession(requests.Session):
    """Session subclass that counts every request it prepares."""

    def __init__(self, retry):
        super().__init__()
        # Despite the parameter name, this is the shared RequestTracker.
        self._requests_count = retry

    def prepare_request(self, request):
        """Bump the query counter, then prepare the request as usual."""
        self._requests_count.register_query()
        return super().prepare_request(request)
class RequestManager:
    """Build and cache a requests Session wired with retry tracking."""

    def __init__(self, request_tracker=None):
        # Session is created lazily on first access of `session`.
        self.__session = None
        self.__request_tracker = request_tracker
        # Retry policy shared by every adapter this manager mounts.
        args = dict(
            total=11,
            backoff_factor=1,
            status_forcelist=[401, 408, 429, 500, 502, 504],
            allowed_methods=frozenset(["GET"]),
        )
        if self.__request_tracker is not None:
            args['request_tracker'] = self.__request_tracker
            self.__retries = RetryTracker(**args)
        else:
            self.__retries = Retry(**args)

    # Bug fix: the scraped source had `#property` / `#session.setter`
    # comments where the decorators belong, so `session` was a plain
    # method and the driver's `request_manager.session` returned it unbound.
    @property
    def session(self):
        """Lazily create the session and mount the retrying HTTPS adapter."""
        if self.__session is None:
            if self.__request_tracker is not None:
                self.__session = RetrySession(self.__request_tracker)
            else:
                self.__session = requests.Session()
            https = requests.adapters.HTTPAdapter(max_retries=self.__retries)
            self.__session.mount('https://', https)
        return self.__session

    @session.setter
    def session(self, value):
        raise AttributeError('Setting session attribute is prohibited.')
# Wire the tracker into a manager and exercise one always-failing URL.
request_tracker = RequestTracker()
request_manager = RequestManager(request_tracker=request_tracker)
session = request_manager.session
urls = ['https://httpbin.org/status/500']
with session as s:
    for url in urls:
        response = s.get(url)
# One query, plus however many retries the 500 responses triggered.
print(request_tracker.queries)
print(request_tracker.retries)

requests.session() object not recognized in another class

I am trying to pass my session object from one class to another. But I am not sure what's happening.
class CreateSession:
    # Question code: builds a requests.Session authenticated against `url`.
    def __init__(self, user, pwd, url="http://url_to_hit"):
        self.url = url
        self.user = user
        self.pwd = pwd

    def get_session(self):
        # NOTE(review): a fresh Session is built on every call, and the
        # credentials are sent only with this one GET (per-request auth is
        # not stored on the session) — likely why later requests get 401.
        sess = requests.Session()
        r = sess.get(self.url + "/", auth=(self.user, self.pwd))
        print(r.content)
        return sess
class TestGet(CreateSession):
    # Question code: issues a GET using the session from the parent class.
    def get_response(self):
        s = self.get_session()
        print(s)
        # NOTE(review): this request carries no auth of its own, so the
        # server answers 401 — the symptom described in the question.
        data = s.get(self.url + '/some-get')
        print(data.status_code)
        print(data)


if __name__ == "__main__":
    TestGet(user='user', pwd='pwd').get_response()
I am getting 401 for get_response(). Not able to understand this.
What's a 401?
The response you're getting means that you're unauthorised to access the resource.
A session is used in order to persist headers and other prerequisites throughout requests, why are you creating the session every time rather than storing it in a variable?
As is, the session should work the only issue is that you're trying to call a resource that you don't have access to. - You're not passing the url parameter either in the initialisation.
Example of how you can effectively use Session:
from requests import Session
from requests.exceptions import HTTPError
class TestGet:
    """Hold one lazily-created, authenticated Session reused across calls."""

    __session = None
    __username = None
    __password = None

    def __init__(self, username, password):
        self.__username = username
        self.__password = password

    @property
    def session(self):
        """Create the Session on first use and attach basic-auth credentials."""
        if self.__session is None:
            self.__session = Session()
            # Bug fix: the original read self.__user / self.__pwd, which do
            # not exist on this class (the attributes are __username and
            # __password) and would raise AttributeError under name mangling.
            self.__session.auth = (self.__username, self.__password)
        return self.__session

    @session.setter
    def session(self, value):
        raise AttributeError('Setting \'session\' attribute is prohibited.')

    def get_response(self, url):
        """GET *url* with the shared session; HTTP errors are swallowed here."""
        try:
            response = self.session.get(url)
            # raises if the status code is an error - 4xx, 5xx
            response.raise_for_status()
            return response
        except HTTPError as e:
            # you received an http error .. handle it here (e contains the request and response)
            pass
# Reuse one authenticated session across independent requests.
test_get = TestGet('my_user', 'my_pass')
first_response = test_get.get_response('http://your-website-with-basic-auth.com')
second_response = test_get.get_response('http://another-url.com')
my_session = test_get.session
my_session.get('http://url.com')

How to get url that were timeouted or got error?

I have a Python class built with Tornado that works like a crawler. I have a lot of links on the same site and I need to get the responses from all of them into my database.
The difficulty is this: I can't understand how to catch the URLs that got an error (a timeout, or a runtime exception).
I know how to solve this with newbie code (I've only been coding in Python for a week) — compare the list of input links with the output — but I want to do it the right way.
Can you tell me how I can do this?
import json
import sys
import time

import requests
from tornado import gen, ioloop
from tornado.httpclient import AsyncHTTPClient, HTTPRequest
from tornado.queues import Queue
class Scraper():
    """Drain a queue of URLs through Tornado's AsyncHTTPClient."""

    def __init__(self, source='', destinations=None, transform=None,
                 headers=None, max_clients=20, maxsize=20,
                 connect_timeout=600, request_timeout=600):
        """Instantiate a tornado async http client to do many URL requests"""
        # Bug fix: `if None in destinations` raises TypeError when
        # destinations itself is None; check both required arguments
        # explicitly, as the error message describes.
        if destinations is None or transform is None:
            sys.stderr.write('You must pass both collection of URLS and a transform function')
            raise SystemExit
        self.max_clients = max_clients
        self.maxsize = maxsize
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout
        # AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient", max_clients=50)
        AsyncHTTPClient.configure("tornado.simple_httpclient.SimpleAsyncHTTPClient",
                                  max_clients=self.max_clients)
        # Avoid the shared mutable-default pitfall for headers.
        self.headers = {} if headers is None else headers
        self.http_client = AsyncHTTPClient()
        self.queue = Queue(maxsize=20)
        self.source = source
        self.destinations = destinations
        self.transform = transform
        self.read(self.destinations)
        self.get(self.transform, self.headers, self.connect_timeout, self.request_timeout)
        self.loop = ioloop.IOLoop.current()
        self.join_future = self.queue.join()

        def done(future):
            # Stop the loop once every queued URL has been processed.
            self.loop.stop()
        self.join_future.add_done_callback(done)
        self.loop.start()

    @gen.coroutine
    def read(self, destinations):
        """Feed every destination URL into the work queue."""
        for url in destinations:
            yield self.queue.put(url)

    @gen.coroutine
    def get(self, transform, headers, connect_timeout, request_timeout):
        """Consume the queue forever, fetching each URL and transforming the body."""
        while True:
            url = yield self.queue.get()
            request = HTTPRequest(url,
                                  connect_timeout=connect_timeout,
                                  request_timeout=request_timeout,
                                  method="GET",
                                  headers=headers)
            future = self.http_client.fetch(request)

            def done_callback(future):
                # NOTE(review): future.result() re-raises fetch errors here,
                # before task_done-after-failure is handled — see the answer
                # below for the try/except form that catches them.
                self.queue.task_done()
                body = future.result().body
                transform(body)
            future.add_done_callback(done_callback)
def transform_data(body, url=''):
    """Process one fetched response body (placeholder from the question)."""
    # SOMECODE
    # Bug fix: a function body consisting only of a comment is a
    # SyntaxError; `pass` keeps the placeholder valid.
    pass


a = ['link1', 'link2']
scraper = Scraper(destinations=a, transform=transform_data)
In a coroutine you can "yield" a future. The coroutine pauses until the future is resolved into a result or an exception:
try:
result = yield self.http_client.fetch(request)
except Exception as exc:
print("Failure!: %s" % exc)
else:
self.queue.task_done()
body = result.body
transform(body)
For more examples, see the Tornado documentation for HTTP clients.

Tornado: asynchronous endpoints

I have following code:
class StackOverflowHandler(tornado.web.RequestHandler):
    """Question code: proxies a Stack Exchange search (bug under discussion)."""

    def get(self, look_up_pattern):
        url = "https://api.stackexchange.com/2.2/search?order=desc&sort=votes&intitle=%s&site=stackoverflow"
        # NOTE(review): async_get is a coroutine, so this binds a Future,
        # not the parsed data — the cause of the write() TypeError below.
        response = self.async_get(url)
        print(response)
        self.write(response)

    # Restored from the scraped `#gen.coroutine` artifact; the question text
    # itself says async_get returns a tornado Future, i.e. it is a coroutine.
    @gen.coroutine
    def async_get(self, url):
        """Fetch *url* asynchronously and decode the JSON body."""
        link = httpclient.AsyncHTTPClient()
        request = httpclient.HTTPRequest(url)
        response = yield link.fetch(request)
        data = response.body.decode('utf-8')
        data = json.loads(data)
        return data


application = tornado.web.Application([
    (r"/search/(.*)", StackOverflowHandler),
])
The type that returns from async_get is tornado.concurrent.Future.
The exception is:
TypeError: write() only accepts bytes, unicode, and dict objects
I am new to asynchronous programming, please point me out my mistake.
Since async_get is a coroutine, it returns a Future object. To get the "real" result, the Future must be resolved — it needs to be yielded. Moreover, the get handler must be decorated as a coroutine as well:
class StackOverflowHandler(tornado.web.RequestHandler):
    """Answer code: both get() and async_get() run as coroutines."""

    @gen.coroutine
    def get(self, look_up_pattern):
        url = "https://api.stackexchange.com/2.2/search?order=desc&sort=votes&intitle=%s&site=stackoverflow"
        # Yielding resolves the Future into the parsed JSON payload.
        response = yield self.async_get(url)
        print(response)
        self.write(response)

    @gen.coroutine
    def async_get(self, url):
        """Fetch *url* asynchronously and decode the JSON body."""
        link = httpclient.AsyncHTTPClient()
        request = httpclient.HTTPRequest(url)
        response = yield link.fetch(request)
        data = response.body.decode('utf-8')
        data = json.loads(data)
        return data

flask http-auth and unittesting

Hi!
I have a route that I have protected using HTTP Basic authentication, which is implemented by Flask-HTTPAuth. Everything works fine (I can access the route) if I use curl, but when unit testing, the route can't be accessed, even though I provide it with the right username and password.
Here are the relevant code snippets in my testing module:
class TestClient(object):
    """Thin wrapper around Flask's test client that JSON-encodes payloads."""

    def __init__(self, app):
        self.client = app.test_client()

    def send(self, url, method, data=None, headers={}):
        """Issue *method* on *url*; return (response, decoded JSON body)."""
        if data:
            data = json.dumps(data)
        rv = method(url, data=data, headers=headers)
        return rv, json.loads(rv.data.decode('utf-8'))

    def delete(self, url, headers={}):
        # Bug fix: headers was passed as the third POSITIONAL argument, so
        # it landed in send()'s `data` parameter and the Authorization
        # header never reached the request — producing the 401.
        return self.send(url, self.client.delete, headers=headers)
class TestCase(unittest.TestCase):
    # Question code: integration test that creates then deletes a user.
    def setUp(self):
        # Push an app context and a fresh schema before each test.
        app.config.from_object('test_config')
        self.app = app
        self.app_context = self.app.app_context()
        self.app_context.push()
        db.create_all()
        self.client = TestClient(self.app)

    def test_delete_user(self):
        # create new user
        data = {'username': 'john', 'password': 'doe'}
        self.client.post('/users', data=data)
        # delete previously created user
        headers = {}
        # HTTP Basic auth: base64 of "username:password".
        headers['Authorization'] = 'Basic ' + b64encode((data['username'] + ':' + data['password'])
                                                        .encode('utf-8')).decode('utf-8')
        headers['Content-Type'] = 'application/json'
        headers['Accept'] = 'application/json'
        rv, json = self.client.delete('/users', headers=headers)
        self.assertTrue(rv.status_code == 200)  # Returns 401 instead
Here are the callback methods required by Flask-HTTPAuth:
auth = HTTPBasicAuth()


# Restored from the scraped `#auth.verify_password` artifact — without the
# decorator the callback is never registered with Flask-HTTPAuth.
@auth.verify_password
def verify_password(username, password):
    """Check credentials against the User table; stash the user on g."""
    # THIS METHOD NEVER GETS CALLED
    user = User.query.filter_by(username=username).first()
    if not user or not user.verify_password(password):
        return False
    g.user = user
    return True


@auth.error_handler
def unauthorized():
    """Return a JSON 401 body instead of Flask-HTTPAuth's default."""
    response = jsonify({'status': 401, 'error': 'unauthorized', 'message': 'Please authenticate to access this API.'})
    response.status_code = 401
    return response
Any my route:
# Decorators restored from the scraped `#app.route` / `#auth.login_required`
# artifacts — as comments they registered neither the route nor the guard.
@app.route('/users', methods=['DELETE'])
@auth.login_required
def delete_user():
    """Delete the authenticated user (set on g by verify_password)."""
    db.session.delete(g.user)
    db.session.commit()
    return jsonify({})
The unit test throws the following exception:
Traceback (most recent call last):
File "test_api.py", line 89, in test_delete_user
self.assertTrue(rv.status_code == 200) # Returns 401 instead
AssertionError: False is not true
I want to emphasize once more that everything works fine when I run curl with exactly the same arguments I provide for my test client, but when I run the test, the verify_password method doesn't even get called.
Thank you very much for your help!
Here is an example how this could be done with pytest and the inbuilt monkeypatch fixture.
If I have this API function in some_flask_app:
from flask_httpauth import HTTPBasicAuth

app = Flask(__name__)
auth = HTTPBasicAuth()


# Decorators restored from the scraped `#app.route` / `#auth.login_required`
# artifacts.
@app.route('/api/v1/version')
@auth.login_required
def api_get_version():
    """Report the running application version as JSON."""
    return jsonify({'version': get_version()})
I can create a fixture that returns a flask test client and patches the authenticate function in HTTPBasicAuth to always return True:
import pytest
from some_flask_app import app, auth


# Decorator restored from the scraped `#pytest.fixture` artifact.
@pytest.fixture(name='client')
def initialize_authorized_test_client(monkeypatch):
    """Yield a test client whose basic-auth check always succeeds."""
    app.testing = True
    client = app.test_client()
    # Bypass real credential checking for the duration of the test.
    monkeypatch.setattr(auth, 'authenticate', lambda x, y: True)
    yield client
    app.testing = False


def test_settings_tracking(client):
    r = client.get("/api/v1/version")
    assert r.status_code == 200
You are going to love this.
Your send method:
# Excerpts quoted by the answer to highlight the bug:
def send(self, url, method, data=None, headers={}):
    pass


def delete(self, url, headers={}):
    # Here `headers` is the third POSITIONAL argument, so inside send()
    # it arrives as `data` and the real headers default to {} — the bug.
    return self.send(url, self.client.delete, headers)
Note you are passing headers as third positional argument, so it's going as data into send().

Categories