Scrapy not parsing response in make_requests_from_url loop - python

I'm trying to get Scrapy to grab a URL from a message queue and then scrape that URL. I have the loop going just fine and grabbing the URL from the queue, but it never enters the parse() method once it has a URL; it just continues to loop (and sometimes the URL comes back around even though I've deleted it from the queue...).
While it's running in the terminal, if I press CTRL+C and force it to end, it enters the parse() method and crawls the page, then ends. I'm not sure what's wrong here.
class my_Spider(Spider):
    name = "my_spider"
    allowed_domains = ['domain.com']

    def __init__(self):
        super(my_Spider, self).__init__()
        self.url = None

    def start_requests(self):
        while True:
            # Crawl the url from queue
            yield self.make_requests_from_url(self._pop_queue())

    def _pop_queue(self):
        # Grab the url from queue
        return self.queue()

    def queue(self):
        url = None
        while url is None:
            conf = {
                "sqs-access-key": "",
                "sqs-secret-key": "",
                "sqs-queue-name": "crawler",
                "sqs-region": "us-east-1",
                "sqs-path": "sqssend"
            }
            # Connect to AWS
            conn = boto.sqs.connect_to_region(
                conf.get('sqs-region'),
                aws_access_key_id=conf.get('sqs-access-key'),
                aws_secret_access_key=conf.get('sqs-secret-key')
            )
            q = conn.get_queue(conf.get('sqs-queue-name'))
            message = conn.receive_message(q)
            # Didn't get a message back, wait.
            if not message:
                time.sleep(10)
                url = None
            else:
                url = message
        if url is not None:
            message = url[0]
            message_body = str(message.get_body())
            message.delete()
            self.url = message_body
            return self.url

    def parse(self, response):
        ...
        yield item
Updated from comments:
def start_requests(self):
    while True:
        # Crawl the url from queue
        queue = self._pop_queue()
        self.logger.error(queue)
        if queue is None:
            time.sleep(10)
            continue
        url = queue
        if url:
            yield self.make_requests_from_url(url)
I removed the while url is None: loop, but still get the same problem.

Would I be right to assume that if this works:
import scrapy
import random

class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]

    def __init__(self):
        super(ExampleSpider, self).__init__()
        self.url = None

    def start_requests(self):
        while True:
            # Crawl the url from queue
            yield self.make_requests_from_url(self._pop_queue())

    def _pop_queue(self):
        # Grab the url from queue
        return self.queue()

    def queue(self):
        return 'http://www.example.com/?{}'.format(random.randint(0, 100000))

    def parse(self, response):
        print "Successfully parsed!"
Then your code should work as well, unless:
There's a problem with allowed_domains and your queue actually returns URLs outside it.
There's a problem with your queue() function and/or the data it produces, e.g. it returns arrays, or it blocks indefinitely, or something like that.
Note also that the boto library is blocking, not Twisted/asynchronous. To avoid blocking Scrapy while using it, you would have to use a Twisted-compatible library like txsqs. Alternatively, you might want to run the boto calls in a separate thread with deferToThread.
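To illustrate that last option, here is a minimal sketch of the deferToThread route; it is not from the question, and names such as SqsPollingSpider, poll_queue_blocking and _enqueue are made up for the example:
from scrapy import Spider
from twisted.internet.threads import deferToThread


class SqsPollingSpider(Spider):
    name = "sqs_polling"

    def poll_queue_blocking(self):
        # The blocking boto/SQS polling from the question's queue() method
        # would go here; a fixed URL stands in for it in this sketch.
        return "http://www.example.com/"

    def schedule_from_queue(self):
        # deferToThread runs the blocking call in Twisted's thread pool,
        # so the reactor keeps processing downloads in the meantime.
        d = deferToThread(self.poll_queue_blocking)
        d.addCallback(self._enqueue)
        return d

    def _enqueue(self, url):
        if url:
            self.crawler.engine.crawl(self.make_requests_from_url(url), self)
You would still need to decide when to trigger schedule_from_queue(), for example from the spider_idle signal as in the fuller solution further down.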
After your follow-up question on the Scrapy list, I believe you have to understand that your code is quite far from functional, which makes this as much a generic Boto/SQS question as a Scrapy question. Anyway, here's a roughly working solution.
I've created an AWS SQS queue and gave it some overly broad permissions.
Now I'm able to submit messages to the queue with the AWS CLI like this:
$ aws --region eu-west-1 sqs send-message --queue-url "https://sqs.eu-west-1.amazonaws.com/123412341234/my_queue" --message-body 'url:https://stackoverflow.com'
For some weird reason, I think that when I was setting --message-body to a URL it was actually downloading the page and sending the result as the message body(!). I'm not sure and don't have time to confirm this, but it's interesting. Anyway.
Here's a proper-ish spider. As I said before, boto is a blocking API, which is bad. In this implementation I call it just once from start_requests() and after that only when the spider is idle, in the spider_idle() callback. At that point, because the spider is idle, the fact that boto blocks doesn't pose much of a problem. While I pull URLs from SQS, I pull as many as possible in the while loop (you could put a limit there if you don't want to consume, say, more than 500 at a time) in order to call the blocking API as rarely as possible. Notice also the call to conn.delete_message_batch(), which actually removes messages from the queue (otherwise they stay there forever), and queue.set_message_class(boto.sqs.message.RawMessage), which keeps boto from trying to base64-decode message bodies that were sent as plain text.
Overall this might be an ok solution for your level of requirements.
from scrapy import Spider, Request
from scrapy import signals
from scrapy.exceptions import DontCloseSpider
import boto.sqs


class CPU_Z(Spider):
    name = "cpuz"
    allowed_domains = ['valid.x86.fr']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(CPU_Z, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def __init__(self, *args, **kwargs):
        super(CPU_Z, self).__init__(*args, **kwargs)
        conf = {
            "sqs-access-key": "AK????????????????",
            "sqs-secret-key": "AB????????????????????????????????",
            "sqs-queue-name": "my_queue",
            "sqs-region": "eu-west-1",
        }
        self.conn = boto.sqs.connect_to_region(
            conf.get('sqs-region'),
            aws_access_key_id=conf.get('sqs-access-key'),
            aws_secret_access_key=conf.get('sqs-secret-key')
        )
        self.queue = self.conn.get_queue(conf.get('sqs-queue-name'))
        assert self.queue
        # RawMessage: don't try to base64-decode bodies sent as plain text
        self.queue.set_message_class(boto.sqs.message.RawMessage)

    def _get_some_urls_from_sqs(self):
        while True:
            messages = self.conn.receive_message(self.queue, number_messages=10)
            if not messages:
                break
            for message in messages:
                body = message.get_body()
                if body[:4] == 'url:':
                    url = body[4:]
                    yield self.make_requests_from_url(url)
            self.conn.delete_message_batch(self.queue, messages)

    def spider_idle(self, spider):
        for request in self._get_some_urls_from_sqs():
            self.crawler.engine.crawl(request, self)
        raise DontCloseSpider()

    def start_requests(self):
        for request in self._get_some_urls_from_sqs():
            yield request

    def parse(self, response):
        yield {
            "freq_clock": response.url
        }
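As a small usage sketch (not part of the original answer), the same 'url:' messages can also be pushed from Python with boto instead of the AWS CLI; the region, credentials and queue name below are placeholders matching the ones above:
import boto.sqs
from boto.sqs.message import RawMessage

conn = boto.sqs.connect_to_region(
    'eu-west-1',
    aws_access_key_id='AK????????????????',                  # placeholder
    aws_secret_access_key='AB????????????????????????????????',  # placeholder
)
queue = conn.get_queue('my_queue')
queue.set_message_class(RawMessage)  # keep the body exactly as written

msg = RawMessage()
msg.set_body('url:https://stackoverflow.com')
queue.write(msg)  # the idle spider picks this up on its next SQS poll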

Related

How to pause spider in Scrapy

I'm new to Scrapy and I need to pause a spider after receiving an error response (like 407 or 429).
Also, I should do this without using time.sleep(), and instead use middlewares or extensions.
Here is my middleware:
from scrapy import signals
from pydispatch import dispatcher

class Handle429:
    def __init__(self):
        dispatcher.connect(self.item_scraped, signal=signals.item_scraped)

    def item_scraped(self, item, spider, response):
        if response.status == 429:
            print("THIS IS 429 RESPONSE")
            #
            # here stop spider for 10 minutes and then continue
            #
I read about self.crawler.engine.pause(), but how can I implement it in my middleware and set a custom pause time?
Or is there another way to do this? Thanks.
I have solved my problem. First of all, a middleware can implement standard hook methods like process_response or process_request.
In settings.py
HTTPERROR_ALLOWED_CODES = [404]
Then, I have changed my middleware class:
from twisted.internet import reactor
from twisted.internet.defer import Deferred

# replaces class Handle429
class HandleErrorResponse:
    def __init__(self):
        self.time_pause = 1800

    def process_response(self, request, response, spider):
        # this method is called for every response before it reaches the spider
        pass
Then I found code that helps me pause the spider without time.sleep():
# in HandleErrorResponse
def process_response(self, request, response, spider):
    print(response.status)
    if response.status == 404:
        d = Deferred()
        reactor.callLater(self.time_pause, d.callback, response)
        # return the Deferred so the response is held back until it fires
        return d
    return response
And it works.
I can't fully explain how reactor.callLater() works, but as I understand it, it doesn't stop Scrapy's event loop; it schedules the Deferred's callback to fire after the delay, and because the middleware returns that Deferred, the response isn't handed to the spider until it fires.
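To make that concrete, here is a tiny standalone Twisted sketch (my illustration, not from the original answer) showing that reactor.callLater() just schedules the Deferred's callback for later while the reactor keeps running:
from twisted.internet import reactor
from twisted.internet.defer import Deferred

def delayed_value(value, seconds):
    d = Deferred()
    # Schedule d.callback(value) to run later; this call returns immediately.
    reactor.callLater(seconds, d.callback, value)
    return d

def show(result):
    print("fired after the delay:", result)
    reactor.stop()

delayed_value("hello", 3).addCallback(show)
reactor.run()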

Async query database for keys to use in multiple requests

I want to asynchronously query a database for keys, then make requests to several urls for each key.
I have a function that returns a Deferred from the database whose value is the key for several requests. Ideally, I would call this function and return a generator of Deferreds from start_requests.
@inlineCallbacks
def get_request_deferred(self):
    d = yield engine.execute(select([table]))  # async
    d.addCallback(make_url)
    d.addCallback(Request)
    return d

def start_requests(self):
    ????
But attempting this in several ways raises
builtins.AttributeError: 'Deferred' object has no attribute 'dont_filter'
which I take to mean that start_requests must return Request objects, not Deferreds whose values are Request objects. The same seems to be true of spider middleware's process_start_requests().
Alternatively, I can make initial requests to, say, http://localhost/ and change them to the real url once the key is available from the database through downloader middleware's process_request(). However, process_request only returns a Request object; it cannot yield Requests to multiple pages using the key: attempting yield Request(url) raises
AssertionError: Middleware myDownloaderMiddleware.process_request
must return None, Response or Request, got generator
What is the cleanest solution to:
get keys asynchronously from the database
for each key, generate several requests?
You've provided no use case where async database queries are a necessity. I'm assuming you cannot begin to scrape your URLs unless you query the database first? If that's the case, then you're better off just doing the query synchronously, iterating over the query results, extracting what you need, and then yielding Request objects. It makes little sense to query a DB asynchronously and then just sit around waiting for the query to finish.
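As a minimal sketch of that synchronous approach, assuming a SQLAlchemy engine, a hypothetical keys table and a made-up endpoint scheme (none of these names come from the question):
import scrapy
from sqlalchemy import create_engine, select, table, column

engine = create_engine('sqlite:///keys.db')   # assumed database
keys = table('keys', column('value'))         # assumed table

class DbKeysSpider(scrapy.Spider):
    name = 'db_keys'

    def start_requests(self):
        # Run the query synchronously once at startup; the crawl has not
        # begun yet, so blocking here costs nothing.
        for row in engine.execute(select([keys.c.value])):
            key = row[0]
            # Generate several requests per key.
            for endpoint in ('profile', 'posts'):
                yield scrapy.Request(
                    'https://example.com/%s/%s' % (endpoint, key),
                    callback=self.parse)

    def parse(self, response):
        self.logger.info('Fetched %s', response.url)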
You can let the callback for the Deferred object pass the urls to a generator of some sort. The generator will then convert any received urls into scrapy Request objects and yield them. Below is an example using the code you linked (not tested):
import scrapy
from Queue import Queue
from pdb import set_trace as st
from twisted.internet.defer import Deferred, inlineCallbacks


class ExampleSpider(scrapy.Spider):
    name = 'example'

    def __init__(self):
        self.urls = Queue()
        self.stop = False
        self.requests = self.request_generator()
        self.deferred = self.deferred_generator()

    def deferred_generator(self):
        d = Deferred()
        d.addCallback(self.deferred_callback)
        yield d

    def request_generator(self):
        while not self.stop:
            url = self.urls.get()
            yield scrapy.Request(url=url, callback=self.parse)

    def start_requests(self):
        return self.requests.next()

    def parse(self, response):
        st()
        # when you need to parse the next url from the callback
        yield self.requests.next()

    def deferred_callback(self, url):
        self.urls.put(url)
        if no_more_urls():
            self.stop = True
Don't forget to stop the request generator when you're done.

Making a Non-Blocking HTTP Request from Scrapy Pipeline

As I understand it, Scrapy is single-threaded but asynchronous on the network side. I am working on something which requires an API call to an external resource from within the item pipeline. Is there any way to make the HTTP request without blocking the pipeline and slowing down Scrapy's crawling?
Thanks
You can do it by scheduling a request directly with the crawler engine via crawler.engine.crawl(request, spider). To do that, however, you first need to expose the crawler in your pipeline:
import scrapy
from scrapy.exceptions import DropItem


class MyPipeline(object):
    def __init__(self, crawler):
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_item(self, item, spider):
        if item.get('some_extra_field'):  # check if we already did below
            return item
        url = 'some_url'
        req = scrapy.Request(url, self.parse_item, meta={'item': item})
        self.crawler.engine.crawl(req, spider)
        raise DropItem()  # we will get this item next time

    def parse_item(self, response):
        item = response.meta['item']
        item['some_extra_field'] = '...'
        return item

Fetching data with Python's asyncio in a sequential order

I have a Python 2.7 program which pulls data from websites and dumps the results to a database. It follows the consumer producer model and is written using the threading module.
Just for fun I would like to rewrite this program using the new asyncio module (from 3.4) but I cannot figure out how to do this properly.
The most crucial requirement is that the program must fetch data from the same website in a sequential order. For example for an url 'http://a-restaurant.com' it should first get 'http://a-restaurant.com/menu/0', then 'http://a-restaurant.com/menu/1', then 'http://a-restaurant.com/menu/2', ...
If they are not fetched in order the website stops delivering pages altogether and you have to start from 0.
However, another fetch for another website ('http://another-restaurant.com') can (and should) run at the same time (the other sites also have the sequential restriction).
The threading module suits this well, as I can create a separate thread for each website, and each thread can wait until one page has finished loading before fetching the next one.
Here's a grossly simplified code snippet from the threading version (Python 2.7):
class FetchThread(threading.Thread):
    def __init__(self, queue, url):
        self.queue = queue
        self.baseurl = url
        ...

    def run(self):
        # Get 10 menu pages in a sequential order
        for food in range(10):
            url = self.baseurl + '/' + str(food)
            text = urllib2.urlopen(url).read()
            self.queue.put(text)
        ...

def main():
    queue = Queue.Queue()
    urls = ('http://a-restaurant.com/menu', 'http://another-restaurant.com/menu')
    for url in urls:
        fetcher = FetchThread(queue, url)
        fetcher.start()
    ...
And here's how I tried to do it with asyncio (in 3.4.1):
import asyncio
import aiohttp

@asyncio.coroutine
def fetch(url):
    response = yield from aiohttp.request('GET', url)
    response = yield from response.read_and_close()
    return response.decode('utf-8')

@asyncio.coroutine
def print_page(url):
    page = yield from fetch(url)
    print(page)

loop = asyncio.get_event_loop()
l = []
urls = ('http://a-restaurant.com/menu', 'http://another-restaurant.com/menu')
for url in urls:
    for food in range(10):
        menu_url = url + '/' + str(food)
        l.append(print_page(menu_url))
loop.run_until_complete(asyncio.wait(l))
And it fetches and prints everything in a non-sequential order. Well, I guess that's the whole idea of those coroutines. Should I not use aiohttp and just fetch with urllib? But would the fetches for the first restaurant then block the fetches for the other restaurants? Am I just thinking about this completely wrong?
(This is just a test to try fetching things in a sequential order; I haven't got to the queue part yet.)
Your current code will work fine for the restaurant that doesn't care about sequential ordering of requests. All ten requests for the menu will run concurrently, and will print to stdout as soon as they're complete.
Obviously, this won't work for the restaurant that requires sequential requests. You need to refactor a bit for that to work:
@asyncio.coroutine
def fetch(url):
    response = yield from aiohttp.request('GET', url)
    response = yield from response.read_and_close()
    return response.decode('utf-8')

@asyncio.coroutine
def print_page(url):
    page = yield from fetch(url)
    print(page)

@asyncio.coroutine
def print_pages_sequential(url, num_pages):
    for food in range(num_pages):
        menu_url = url + '/' + str(food)
        yield from print_page(menu_url)

l = [print_pages_sequential('http://a-restaurant.com/menu', 10)]

conc_url = 'http://another-restaurant.com/menu'
for food in range(10):
    menu_url = conc_url + '/' + str(food)
    l.append(print_page(menu_url))

loop.run_until_complete(asyncio.wait(l))
Instead of adding all ten requests for the sequential restaurant to the list, we add one coroutine to the list which will iterate over all ten pages sequentially. The way this works is that yield from print_page will stop the execution of print_pages_sequential until the print_page request is complete, but it will do so without blocking any other coroutines that are running concurrently (like all the print_page calls you append to l).
By doing it this way, all of your "another-restaurant" requests can run completely concurrently, just like you want, and your "a-restaurant" requests will run sequentially, but without blocking any of the "another-restaurant" requests.
Edit:
If all the sites have the same sequential fetching requirement, the logic can be simplified more:
l = []
urls = ["http://a-restaurant.com/menu", "http://another-restaurant.com/menu"]
for url in urls:
    l.append(print_pages_sequential(url, 10))
loop.run_until_complete(asyncio.wait(l))
asyncio.Task is the replacement for threading.Thread in the asyncio world.
asyncio.async also creates a new task.
asyncio.gather is a very convenient way to wait for several coroutines; I prefer it to asyncio.wait.
import asyncio
import aiohttp

@asyncio.coroutine
def fetch(url):
    response = yield from aiohttp.request('GET', url)
    response = yield from response.read_and_close()
    return response.decode('utf-8')

@asyncio.coroutine
def print_page(url):
    page = yield from fetch(url)
    print(page)

@asyncio.coroutine
def process_restaurant(url):
    for food in range(10):
        menu_url = url + '/' + str(food)
        yield from print_page(menu_url)

urls = ('http://a-restaurant.com/menu', 'http://another-restaurant.com/menu')
coros = []
for url in urls:
    coros.append(asyncio.Task(process_restaurant(url)))

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*coros))

Add a delay to a specific scrapy Request

Is it possible to delay the retry of a particular Scrapy Request? I have a middleware which needs to defer the request of a page until a later time. I know how to do the basic deferral (end of queue), and also how to delay all requests (global settings), but I want to delay just this one individual request. This is most important near the end of the queue, where if I do the simple deferral it immediately becomes the next request again.
Method 1
One way would be to add a middleware to your Spider (source, linked):
# File: middlewares.py
from twisted.internet import reactor
from twisted.internet.defer import Deferred

class DelayedRequestsMiddleware(object):
    def process_request(self, request, spider):
        delay_s = request.meta.get('delay_request_by', None)
        if not delay_s:
            return
        deferred = Deferred()
        reactor.callLater(delay_s, deferred.callback, None)
        return deferred
Which you could later use in your Spider like this:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {'middlewares.DelayedRequestsMiddleware': 123},
    }

    def start_requests(self):
        # This request will have itself delayed by 5 seconds
        yield scrapy.Request(url='http://quotes.toscrape.com/page/1/',
                             meta={'delay_request_by': 5})
        # This request will not be delayed
        yield scrapy.Request(url='http://quotes.toscrape.com/page/2/')

    def parse(self, response):
        ...  # Process results here
Method 2
You could do this with a Custom Retry Middleware (source), you just need to override the process_response method of the current Retry Middleware:
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message

class CustomRetryMiddleware(RetryMiddleware):
    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            # Your delay code here, for example sleep(10) or polling the server until it is alive
            return self._retry(request, reason, spider) or response
        return response
Then enable it instead of the default RetryMiddleware in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
    'myproject.middlewarefilepath.CustomRetryMiddleware': 550,
}
A solution that uses twisted.reactor.callLater() is here:
https://github.com/ArturGaspar/scrapy-delayed-requests
The sleep() function suspends execution for the given number of seconds. The argument may be a floating point number to indicate a more precise sleep time.
To use it, import the time module in your spider:
import time
Then call sleep where you need the delay:
time.sleep(5)
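For completeness, a short usage sketch (my example, not from the answer); be aware that time.sleep() blocks Scrapy's single thread, so every in-flight request is held up, not just this one:
import time
import scrapy

class SleepySpider(scrapy.Spider):
    name = "sleepy"
    start_urls = ["http://quotes.toscrape.com/page/1/"]

    def parse(self, response):
        time.sleep(5)  # pauses the whole reactor for 5 seconds
        yield {"url": response.url}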
