I have a few conditions to implement for rotating proxies in a Scrapy middleware:
If the response status is not 200, retry that request with another random proxy from a list.
I have two lists of proxies. Let's say I'd like to start crawling with the first list, retry about 10 times with that list, and after that, as a last resort, try the second proxy list.
I have tried creating the middleware, but it is not working as expected: it is neither rotating proxies nor falling back to the second proxy list as a last resort. Here is the code:
import random


class SFAProxyMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.packetstream_proxies = [
            settings.get("PS_PROXY_USA"),
            settings.get("PS_PROXY_CA"),
            settings.get("PS_PROXY_IT"),
            settings.get("PS_PROXY_GLOBAL"),
        ]
        self.unlimited_proxies = [
            settings.get("UNLIMITED_PROXY_1"),
            settings.get("UNLIMITED_PROXY_2"),
            settings.get("UNLIMITED_PROXY_3"),
            settings.get("UNLIMITED_PROXY_4"),
            settings.get("UNLIMITED_PROXY_5"),
            settings.get("UNLIMITED_PROXY_6"),
        ]

    def add_proxy(self, request, host):
        request.meta["proxy"] = host

    def process_request(self, request, spider):
        retries = request.meta.get("retry_times", 0)
        if "proxy" in request.meta:
            return None
        if retries <= 10:
            self.add_proxy(request, random.choice(self.unlimited_proxies))
        else:
            self.add_proxy(request, random.choice(self.packetstream_proxies))
Am I doing something wrong implementing the middleware? Thanks
Based on the conditions at the beginning of your question, I think you also need to process the response: check its status code, and if it isn't 200, increase the retry count and send the request back to the scheduler.
You might need to set the request's dont_filter attribute to True, and you should also probably set a maximum for the number of retries.
For example:
import random

from scrapy.exceptions import IgnoreRequest

MAX_RETRY = 20


class SFAProxyMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.packetstream_proxies = [
            settings.get("PS_PROXY_USA"),
            settings.get("PS_PROXY_CA"),
            settings.get("PS_PROXY_IT"),
            settings.get("PS_PROXY_GLOBAL"),
        ]
        self.unlimited_proxies = [
            settings.get("UNLIMITED_PROXY_1"),
            settings.get("UNLIMITED_PROXY_2"),
            settings.get("UNLIMITED_PROXY_3"),
            settings.get("UNLIMITED_PROXY_4"),
            settings.get("UNLIMITED_PROXY_5"),
            settings.get("UNLIMITED_PROXY_6"),
        ]

    def add_proxy(self, request, host):
        request.meta["proxy"] = host

    def process_request(self, request, spider):
        retries = request.meta.get("retry_times", 0)
        if "proxy" in request.meta:
            return None
        if retries <= 10:
            self.add_proxy(request, random.choice(self.unlimited_proxies))
        else:
            self.add_proxy(request, random.choice(self.packetstream_proxies))

    def process_response(self, request, response, spider):
        # Scrapy calls a downloader middleware's process_response with
        # (request, response, spider); a Response exposes .status, not .status_code.
        if response.status != 200:
            request.meta["retry_times"] = request.meta.get("retry_times", 0) + 1
            if request.meta["retry_times"] > MAX_RETRY:
                raise IgnoreRequest
            # Drop the old proxy so process_request assigns a fresh one on the retry.
            request.meta.pop("proxy", None)
            request.dont_filter = True
            return request
        return response
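Also note that the middleware only takes effect if it is enabled in the project settings; a minimal sketch (the module path and the priority value are placeholders for your project):
# settings.py -- module path and priority are illustrative placeholders
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.SFAProxyMiddleware": 350,
}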
I'm using Flask-HTTPAuth for authentication and Flask-Login for login, and I want to add unit tests for an endpoint decorated with @auth.login_required(scheme="Bearer"):
endpoints.py
class UserData(Resource):
    @auth.login_required
    def get(self, user_id):
        user = db_models.User.query.get(user_id)
        return result
authent.py
@auth.verify_token
def verify_token(token):
    data = token_serializer.loads(token)
    return data.get("username", False)
But I got stuck with mocking (it doesn't work):
test.py
#mock.patch("myapp.auth")
#mock.patch("myapp.db_models.User")
def test_get_user(self, mock_user, mock_login):
with app.test_client() as client:
mock_login.verify_token.return_value = TEST_USER["username"]
mock_login.login_required.return_value = True
mock_user.query.get.return_value = TEST_USER
response = client.get("/user/100000")
self.assertIsNotNone(response)
self.assertIsInstance(response.get_json(), dict)
And the second approach:
test.py
#mock.patch("myapp.auth.login_required", return_value = True)
#mock.patch("myapp.db_models.User")
def test_get_user(self, mock_user, mock_login):
with app.test_client() as client:
mock_user.query.get.return_value = TEST_USER
response = client.get("/user/100000")
self.assertIsNotNone(response)
self.assertIsInstance(response.get_json(), dict)
Could you please help me to figure out how to do it in the right way?
Thank you for your help!
I have a list of URL handlers and I want to make asynchronous HTTP requests using Tornado. When all of the responses have arrived, I need to use the resulting data structure for other purposes.
Here is a simple example of my code:
(...)
self.number = 0
self.counter = 0
self.data = {}
(...)

@tornado.web.asynchronous
def post(self):
    list_url = [url_service1, url_service2]
    self.number = len(list_url)
    http_client = AsyncHTTPClient()
    for service in list_url:
        request = tornado.httpclient.HTTPRequest(url=service, method='POST', headers={'content-type': 'application/json'}, body=json.dumps({..params..}))
        http_client.fetch(request, callback=self.handle_response)
    # The for loop is finished. Use self.data, for example, in other functions...
    # but if I print(self.data) here I have an empty dict...
    # do_something(self.data)

def handle_response(self, response):
    if response.error:
        print("Error")
    else:
        self.counter = self.counter + 1
        print("Response {} / {} from {}".format(self.counter, self.number, response.effective_url))
        self.data[response.effective_url] = json_decode(response.body)
    # number is 2
    if self.counter == self.number:
        print("Finish response")

def do_something(data):
    # code with data parameter
I hope my problem is well explained.
As you know, AsyncHTTPClient is asynchronous, which means the requests run in the background.
So when the for loop finishes, that does not mean all the requests have finished; they keep running in the background even after the loop is done.
That is why self.data is empty: the requests haven't completed yet.
How to fix this
The handle_response callback is called after each request completes. You can call the do_something function from this callback once all the requests have completed, like this:
def handle_response(...):
    ...
    if self.counter == self.number:
        self.do_something(self.data)
        print("Finish response")
I'm currently building a web app meant to display the data collected by a Scrapy spider. The user makes a request, the spider crawls a website, and the data is returned to the app so it can be displayed. I'd like to retrieve the data directly from the scraper, without relying on an intermediate .csv or .json file. Something like:
from scrapy.crawler import CrawlerProcess
from scraper.spiders import MySpider
url = 'www.example.com'
spider = MySpider()
crawler = CrawlerProcess()
crawler.crawl(spider, start_urls=[url])
crawler.start()
data = crawler.data # this bit
This is not so easy, because Scrapy is non-blocking and works in an event loop: it uses the Twisted event loop, and the Twisted event loop is not restartable, so you can't write crawler.start(); data = crawler.data. After crawler.start(), the process runs forever, calling registered callbacks until it is killed or ends.
These answers may be relevant:
How to integrate Flask & Scrapy?
Building a RESTful Flask API for Scrapy
If you use an event loop in your app (e.g. you have a Twisted or Tornado web server), then it is possible to get the data from a crawl without storing it to disk. The idea is to listen to the item_scraped signal. I'm using the following helper to make it nicer:
import collections

from twisted.internet.defer import Deferred
from scrapy.crawler import Crawler
from scrapy import signals


def scrape_items(crawler_runner, crawler_or_spidercls, *args, **kwargs):
    """
    Start a crawl and return an object (ItemCursor instance)
    which allows to retrieve scraped items and wait for items
    to become available.

    Example:

    .. code-block:: python

        @inlineCallbacks
        def f():
            runner = CrawlerRunner()
            async_items = scrape_items(runner, my_spider)
            while (yield async_items.fetch_next):
                item = async_items.next_item()
                # ...
            # ...

    This convoluted way to write a loop should become unnecessary
    in Python 3.5 because of ``async for``.
    """
    crawler = crawler_runner.create_crawler(crawler_or_spidercls)
    d = crawler_runner.crawl(crawler, *args, **kwargs)
    return ItemCursor(d, crawler)


class ItemCursor(object):
    def __init__(self, crawl_d, crawler):
        self.crawl_d = crawl_d
        self.crawler = crawler

        crawler.signals.connect(self._on_item_scraped, signals.item_scraped)

        crawl_d.addCallback(self._on_finished)
        crawl_d.addErrback(self._on_error)

        self.closed = False
        self._items_available = Deferred()
        self._items = collections.deque()

    def _on_item_scraped(self, item):
        self._items.append(item)
        self._items_available.callback(True)
        self._items_available = Deferred()

    def _on_finished(self, result):
        self.closed = True
        self._items_available.callback(False)

    def _on_error(self, failure):
        self.closed = True
        self._items_available.errback(failure)

    @property
    def fetch_next(self):
        """
        A Deferred used with ``inlineCallbacks`` or ``gen.coroutine`` to
        asynchronously retrieve the next item, waiting for an item to be
        crawled if necessary. Resolves to ``False`` if the crawl is finished,
        otherwise :meth:`next_item` is guaranteed to return an item
        (a dict or a scrapy.Item instance).
        """
        if self.closed:
            # crawl is finished
            d = Deferred()
            d.callback(False)
            return d

        if self._items:
            # result is ready
            d = Deferred()
            d.callback(True)
            return d

        # We're active, but item is not ready yet. Return a Deferred which
        # resolves to True if item is scraped or to False if crawl is stopped.
        return self._items_available

    def next_item(self):
        """Get a document from the most recently fetched batch, or ``None``.

        See :attr:`fetch_next`.
        """
        if not self._items:
            return None
        return self._items.popleft()
The API is inspired by motor, a MongoDB driver for async frameworks. Using scrape_items you can get items from Twisted or Tornado callbacks as soon as they are scraped, in a way similar to how you fetch items from a MongoDB query.
This is probably too late, but it may help others: you can pass a callback function to the spider and call that function to return your data, like so:
The dummy spider that we are going to use:
from scrapy import Spider


class Trial(Spider):
    name = 'trial'
    start_urls = ['']

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.output_callback = kwargs.get('args').get('callback')

    def parse(self, response):
        pass

    def close(self, spider, reason):
        self.output_callback(['Hi, This is the output.'])
A custom class with the callback:
from scrapy.crawler import CrawlerProcess
from scrapyapp.spiders.trial_spider import Trial


class CustomCrawler:

    def __init__(self):
        self.output = None
        self.process = CrawlerProcess(settings={'LOG_ENABLED': False})

    def yield_output(self, data):
        self.output = data

    def crawl(self, cls):
        self.process.crawl(cls, args={'callback': self.yield_output})
        self.process.start()


def crawl_static(cls):
    crawler = CustomCrawler()
    crawler.crawl(cls)
    return crawler.output
Then you can do:
out = crawl_static(Trial)
print(out)
You can pass a variable to the spider as an attribute and store the data in it.
Of course, you need to accept the attribute in the __init__ method of your spider class (a sketch of the spider side follows the snippet below).
from scrapy.crawler import CrawlerProcess
from scraper.spiders import MySpider

url = 'www.example.com'
spider = MySpider()
crawler = CrawlerProcess()
data = []
crawler.crawl(spider, start_urls=[url], data=data)
crawler.start()
print(data)
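For completeness, a rough sketch of the spider side under this approach (the field names are illustrative, not from the original answer): accept the list in __init__ and append parsed items to it, so they are visible in the calling code after crawler.start() returns.
from scrapy import Spider


class MySpider(Spider):
    name = 'myspider'

    def __init__(self, data=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # keep a reference to the caller's list and append to it while parsing
        self.data = data if data is not None else []

    def parse(self, response):
        item = {'url': response.url}  # illustrative field
        self.data.append(item)
        yield item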
My answer is inspired by Siddhant's answer above:
from scrapy import Spider


class MySpider(Spider):
    name = 'myspider'

    def parse(self, response):
        item = {
            'url': response.url,
            'status': response.status
        }
        yield self.output_callback(item)  # instead of yield item
from scrapy.crawler import CrawlerProcess


class Crawler:

    def __init__(self):
        self.process = CrawlerProcess()
        self.scraped_items = []

    def process_item(self, item):  # similar to process_item in a pipeline
        item['scraped'] = 'yes'
        self.scraped_items.append(item)
        return item

    def spawn(self, **kwargs):
        self.process.crawl(MySpider,
                           output_callback=self.process_item,
                           **kwargs)

    def run(self):
        self.process.start()


if __name__ == '__main__':
    crawler = Crawler()
    crawler.spawn(start_urls=['https://www.example.com', 'https://www.google.com'])
    crawler.run()
    print(crawler.scraped_items)
Output
[{'url': 'https://www.google.com', 'status': 200, 'scraped': 'yes'},
{'url': 'https://www.example.com', 'status': 200, 'scraped': 'yes'}]
process_item is very useful for processing an item as well as storing it.
I'm using an interceptor to check the validity of a token passed by the user in my Tornado application.
def token_authenticate():
    def wrapper(self, transforms, *args, **kwargs):
        def _throw_error(self):
            print 'writing basic auth'
            if self._headers_written:
                raise Exception('headers have already been written')
            self.write(json.dumps({'auth': 'false'}))
            self.finish()
            return False

        request = self.request
        try:
            token = request.arguments.get('token')[0]
            if not token:
                return _throw_error(self)

            session = Instance().get_session()
            user_token = session.query(UserToken)\
                .filter(UserToken.token == token)\
                .filter(UserToken.expires > datetime.utcnow())\
                .one()
            if user_token:
                self.token = user_token
                self.user = user_token.user
            else:
                print 'no user token'
                return _throw_error(self)
        except Exception, e:
            print 'exception', e
            return _throw_error(self)
        return True
    return wrapper
def interceptor(func):
    def classwrapper(cls):
        def wrapper(old):
            def inner(self, transforms, *args, **kwargs):
                log.debug('Invoking wrapper %s', func)
                ret = func(self, transforms, *args, **kwargs)
                if ret:
                    return old(self, transforms, *args, **kwargs)
                else:
                    return ret
            return inner

        cls._execute = wrapper(cls._execute)
        return cls
    return classwrapper
## HANDLER
@interceptor(token_authenticate())
class SampleAuthenticatedRequestHandler(BaseHandler):
    def get(self):
        self.write({'response': self.user.as_dict()})
The request receives an empty response when the token is missing/invalid.
> curl localhost:8888/test -I -v
* Adding handle: conn: 0x7fb18b004000
* Adding handle: send: 0
* Adding handle: recv: 0
* Curl_addHandleToPipeline: length: 1
* - Conn 0 (0x7fb18b004000) send_pipe: 1, recv_pipe: 0
* About to connect() to localhost port 8888 (#0)
* Trying ::1...
* Connected to localhost (::1) port 8888 (#0)
> HEAD /test HTTP/1.1
> User-Agent: curl/7.30.0
> Host: localhost:8888
> Accept: */*
>
* Empty reply from server
* Connection #0 to host localhost left intact
curl: (52) Empty reply from server
Is there something missing? Is this the best way to abstract authentication away from my handler? I was hoping to replicate this model for different kinds of authentication (token based, session based etc).
RequestHandler._execute is an internal method, and overriding it will break in Tornado 4.0. prepare() and the HTTP verb methods get()/post()/etc. are the supported methods to override in subclasses. I don't see any reason why this code would return an empty response, although the server logs (which you did not post) might have something useful.
This is a working version of the above function. Just for anyone else who's having the same trouble -
def token_authenticate():
    """
    This is a basic authentication interceptor which
    protects the desired URIs and requires
    authentication as per configuration
    """
    def wrapper(self, transforms, *args, **kwargs):
        request = self.request
        try:
            token = request.arguments.get('token')[0]
            if not token:
                return False
            user_token = get_user(token)
            if user_token:
                self.token = user_token
                self.user = user_token.user
                return True
        except:
            pass
        return False
    return wrapper
def interceptor(func):
    """
    This is a class decorator which is helpful in configuring
    one or more interceptors which are able to intercept, inspect,
    process and approve or reject further processing of the request
    """
    def classwrapper(cls):
        def wrapper(old):
            def inner(self, transforms, *args, **kwargs):
                ret = func(self, transforms, *args, **kwargs)
                if ret:
                    return old(self, transforms, *args, **kwargs)
                else:
                    self._transforms = transforms
                    return self._unauthorized()
            return inner

        cls._execute = wrapper(cls._execute)
        return cls
    return classwrapper
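Following the earlier advice to use prepare() instead of overriding _execute, a rough sketch of the supported approach might look like this (not from the original answers; get_user is the same helper used above):
import tornado.web


class TokenAuthHandler(tornado.web.RequestHandler):
    def prepare(self):
        # runs before get()/post()/etc. on every request
        token = self.get_argument('token', None)
        user_token = get_user(token) if token else None
        if not user_token:
            raise tornado.web.HTTPError(401)
        self.token = user_token
        self.user = user_token.user


class SampleAuthenticatedRequestHandler(TokenAuthHandler):
    def get(self):
        self.write({'response': self.user.as_dict()})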
From what I understand from the tornado.gen module docs, tornado.gen.Task combines tornado.gen.Callback and tornado.gen.Wait, with each Callback/Wait pair associated with a unique key...
@tornado.web.asynchronous
@tornado.gen.engine
def get(self):
    http_client = AsyncHTTPClient()
    http_client.fetch("http://google.com",
                      callback=(yield tornado.gen.Callback("google")))
    http_client.fetch("http://python.org",
                      callback=(yield tornado.gen.Callback("python")))
    http_client.fetch("http://tornadoweb.org",
                      callback=(yield tornado.gen.Callback("tornado")))
    response = yield [tornado.gen.Wait("google"), tornado.gen.Wait("tornado"), tornado.gen.Wait("python")]
    do_something_with_response(response)
    self.render("template.html")
So the above code will get all responses from the different URLs.
Now what I actually need to accomplish is to return the response as soon as one http_client returns its data. So if 'tornadoweb.org' returns its data first, the handler should do a self.write(response), and the loop in get() should keep waiting for the other clients to complete.
Any ideas on how to write this using the tornado.gen interface?
A very vague (and syntactically incorrect) sketch of what I am trying to do would be like this:
class GenAsyncHandler2(tornado.web.RequestHandler):
    @tornado.web.asynchronous
    @tornado.gen.engine
    def get(self):
        http_client = AsyncHTTPClient()
        http_client.fetch("http://google.com",
                          callback=(yield tornado.gen.Callback("google")))
        http_client.fetch("http://python.org",
                          callback=(yield tornado.gen.Callback("python")))
        http_client.fetch("http://tornadoweb.org",
                          callback=(yield tornado.gen.Callback("tornado")))
        while True:
            response = self.get_response()
            if response:
                self.write(response)
                self.flush()
            else:
                break
        self.finish()

    def get_response(self):
        for key in tornado.gen.availableKeys():
            if key.is_ready:
                value = tornado.gen.pop(key)
                return value
        return None
This is a case where you shouldn't use inline callbacks, i.e. gen.
Also, self.render would only be called after all callbacks have finished. If you want to return the response from the server in parts, render it in parts.
Think of it this way (it's only an idea, with a lot of room for improvement):
response = []

@tornado.web.asynchronous
def get(self):
    self.render('head.html')
    http_client = AsyncHTTPClient()
    http_client.fetch("http://google.com",
                      callback=self.mywrite)
    http_client.fetch("http://python.org",
                      callback=self.mywrite)
    http_client.fetch("http://tornadoweb.org",
                      callback=self.mywrite)
    self.render('footer.html')
    self.finish()

def mywrite(self, result):
    self.render('body_part.html')
    self.response.append(result)
    if len(self.response) == 3:
        do_something_with_response(self.response)
In addition to this, there is actually a method WaitAll, which waits for all results and returns only when all HTTP clients have finished giving responses.
I have submitted a diff in my tornado branch (https://github.com/pranjal5215/tornado). I have added a class WaitAny, which is an asynchronous WaitAll and returns a result as soon as one HTTPClient has returned a result.
The diff is at (https://github.com/pranjal5215/tornado/commit/dd6902147ab2c5cbf2b9c7ee9a35b4f89b40790e), (https://github.com/pranjal5215/tornado/wiki/Add-WaitAny-to-make-WaitAll-return-results-incrementally).
Sample usage:
class GenAsyncHandler2(tornado.web.RequestHandler):
    @tornado.web.asynchronous
    @tornado.gen.engine
    def get(self):
        http_client = AsyncHTTPClient()
        http_client.fetch("http://google.com",
                          callback=(yield tornado.gen.Callback("google")))
        http_client.fetch("http://python.org",
                          callback=(yield tornado.gen.Callback("python")))
        http_client.fetch("http://tornadoweb.org",
                          callback=(yield tornado.gen.Callback("tornado")))
        keys = set(["google", "tornado", "python"])
        while keys:
            key, response = yield tornado.gen.WaitAny(keys)
            keys.remove(key)
            # do something with response
            self.write(str(key) + " ")
            self.flush()
        self.finish()
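For what it's worth, later Tornado releases (4.1+) ship tornado.gen.WaitIterator, which yields results in whatever order they complete without needing a patch; a rough sketch of the same handler using it (not from the original answers):
from tornado import gen, web
from tornado.httpclient import AsyncHTTPClient


class GenAsyncHandler3(web.RequestHandler):
    @gen.coroutine
    def get(self):
        http_client = AsyncHTTPClient()
        # fetch() without a callback returns a Future
        wait_iterator = gen.WaitIterator(
            google=http_client.fetch("http://google.com"),
            python=http_client.fetch("http://python.org"),
            tornado=http_client.fetch("http://tornadoweb.org"),
        )
        while not wait_iterator.done():
            response = yield wait_iterator.next()
            # current_index is the keyword of the future that just completed
            self.write(str(wait_iterator.current_index) + " ")
            self.flush()
        self.finish()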