I'm currently building a web app meant to display the data collected by a Scrapy spider. The user makes a request, the spider crawls a website, then returns the data to the app so it can be displayed. I'd like to retrieve the data directly from the scraper, without relying on an intermediary .csv or .json file. Something like:
from scrapy.crawler import CrawlerProcess
from scraper.spiders import MySpider
url = 'www.example.com'
spider = MySpider()
crawler = CrawlerProcess()
crawler.crawl(spider, start_urls=[url])
crawler.start()
data = crawler.data # this bit
This is not so easy because Scrapy is non-blocking and works in an event loop; it uses the Twisted event loop, and the Twisted event loop is not restartable, so you can't write crawler.start(); data = crawler.data - after crawler.start() the process runs forever, calling registered callbacks until it is killed or finishes. A minimal signal-based sketch follows the links below.
These answers may be relevant:
How to integrate Flask & Scrapy?
Building a RESTful Flask API for Scrapy
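If blocking until the crawl finishes is acceptable, a minimal sketch (my own illustration rather than code from the linked answers; the spider import and URL are placeholders taken from the question) is to connect a handler to the item_scraped signal before calling the blocking start():
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scraper.spiders import MySpider  # placeholder import from the question

collected = []

def on_item_scraped(item, response, spider):
    # called by Scrapy for every item the spider yields
    collected.append(item)

process = CrawlerProcess()
crawler = process.create_crawler(MySpider)
crawler.signals.connect(on_item_scraped, signal=signals.item_scraped)
process.crawl(crawler, start_urls=['http://www.example.com'])
process.start()    # blocks until the crawl is finished
data = collected   # only populated once start() returns
The list is only filled once process.start() returns, so this does not help if your app has to keep serving other requests while the crawl runs; for that, see the event-loop based approach below.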
If you use an event loop in your app (e.g. you have a Twisted or Tornado web server) then it is possible to get the data from a crawl without storing it to disk. The idea is to listen to the item_scraped signal. I'm using the following helper to make it nicer:
import collections
from twisted.internet.defer import Deferred
from scrapy.crawler import Crawler
from scrapy import signals
def scrape_items(crawler_runner, crawler_or_spidercls, *args, **kwargs):
"""
Start a crawl and return an object (ItemCursor instance)
which allows you to retrieve scraped items and wait for items
to become available.
Example:
.. code-block:: python
@inlineCallbacks
def f():
runner = CrawlerRunner()
async_items = scrape_items(runner, my_spider)
while (yield async_items.fetch_next):
item = async_items.next_item()
# ...
# ...
This convoluted way to write a loop should become unnecessary
in Python 3.5 because of ``async for``.
"""
crawler = crawler_runner.create_crawler(crawler_or_spidercls)
d = crawler_runner.crawl(crawler, *args, **kwargs)
return ItemCursor(d, crawler)
class ItemCursor(object):
def __init__(self, crawl_d, crawler):
self.crawl_d = crawl_d
self.crawler = crawler
crawler.signals.connect(self._on_item_scraped, signals.item_scraped)
crawl_d.addCallback(self._on_finished)
crawl_d.addErrback(self._on_error)
self.closed = False
self._items_available = Deferred()
self._items = collections.deque()
def _on_item_scraped(self, item):
self._items.append(item)
self._items_available.callback(True)
self._items_available = Deferred()
def _on_finished(self, result):
self.closed = True
self._items_available.callback(False)
def _on_error(self, failure):
self.closed = True
self._items_available.errback(failure)
@property
def fetch_next(self):
"""
A Deferred used with ``inlineCallbacks`` or ``gen.coroutine`` to
asynchronously retrieve the next item, waiting for an item to be
crawled if necessary. Resolves to ``False`` if the crawl is finished,
otherwise :meth:`next_item` is guaranteed to return an item
(a dict or a scrapy.Item instance).
"""
if self.closed:
# crawl is finished
d = Deferred()
d.callback(False)
return d
if self._items:
# result is ready
d = Deferred()
d.callback(True)
return d
# We're active, but item is not ready yet. Return a Deferred which
# resolves to True if item is scraped or to False if crawl is stopped.
return self._items_available
def next_item(self):
"""Get a document from the most recently fetched batch, or ``None``.
See :attr:`fetch_next`.
"""
if not self._items:
return None
return self._items.popleft()
The API is inspired by motor, a MongoDB driver for async frameworks. Using scrape_items you can get items from Twisted or Tornado callbacks as soon as they are scraped, in a way similar to how you fetch items from a MongoDB query.
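For example, in a Twisted-based app you could drain the cursor into a list with inlineCallbacks. This is a hedged sketch (collect_items is a hypothetical helper name and MySpider stands for whatever spider class you run; it assumes the reactor is already running):
from twisted.internet.defer import inlineCallbacks, returnValue
from scrapy.crawler import CrawlerRunner

@inlineCallbacks
def collect_items(spider_cls, **kwargs):
    runner = CrawlerRunner()
    cursor = scrape_items(runner, spider_cls, **kwargs)
    items = []
    while (yield cursor.fetch_next):
        items.append(cursor.next_item())
    returnValue(items)   # fires once the crawl has finished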
This is probably too late, but it may help others: you can pass a callback function to the Spider and call that function to return your data, like so:
The dummy spider that we are going to use:
class Trial(Spider):
name = 'trial'
start_urls = ['']
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.output_callback = kwargs.get('args').get('callback')
def parse(self, response):
pass
def close(self, spider, reason):
self.output_callback(['Hi, This is the output.'])
A custom class with the callback:
from scrapy.crawler import CrawlerProcess
from scrapyapp.spiders.trial_spider import Trial
class CustomCrawler:
def __init__(self):
self.output = None
self.process = CrawlerProcess(settings={'LOG_ENABLED': False})
def yield_output(self, data):
self.output = data
def crawl(self, cls):
self.process.crawl(cls, args={'callback': self.yield_output})
self.process.start()
def crawl_static(cls):
crawler = CustomCrawler()
crawler.crawl(cls)
return crawler.output
Then you can do:
out = crawl_static(Trial)
print(out)
You can pass a variable as an attribute of the spider class and store the data in it.
Of course, you need to accept that attribute in the __init__ method of your spider class (a sketch of such a spider follows the snippet below).
from scrapy.crawler import CrawlerProcess
from scraper.spiders import MySpider
url = 'www.example.com'
spider = MySpider()
crawler = CrawlerProcess()
data = []
crawler.crawl(spider, start_urls=[url], data=data)
crawler.start()
print(data)
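A hedged sketch of the spider side (illustrative only, not the asker's actual spider): accept the shared list in __init__ and append to it from parse.
from scrapy import Spider

class MySpider(Spider):
    name = 'myspider'

    def __init__(self, start_urls=None, data=None, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = start_urls or []
        self.data = data if data is not None else []

    def parse(self, response):
        item = {'url': response.url, 'status': response.status}
        self.data.append(item)   # mutate the shared list in place
        yield item
Because crawler.start() blocks, the list is only fully populated once it returns.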
My answer is inspired by Siddhant's answer above:
from scrapy import Spider
class MySpider(Spider):
name = 'myspider'
def parse(self, response):
item = {
'url': response.url,
'status': response.status
}
yield self.output_callback(item) # instead of yield item
from scrapy.crawler import CrawlerProcess
class Crawler:
def __init__(self):
self.process = CrawlerProcess()
self.scraped_items = []
def process_item(self, item): # similar to process_item in pipeline
item['scraped'] = 'yes'
self.scraped_items.append(item)
return item
def spawn(self, **kwargs):
self.process.crawl(MySpider,
output_callback=self.process_item,
**kwargs)
def run(self):
self.process.start()
if __name__ == '__main__':
crawler = Crawler()
crawler.spawn(start_urls=['https://www.example.com', 'https://www.google.com'])
crawler.run()
print(crawler.scraped_items)
Output
[{'url': 'https://www.google.com', 'status': 200, 'scraped': 'yes'},
{'url': 'https://www.example.com', 'status': 200, 'scraped': 'yes'}]
process_item is very useful for processing an item as well as storing it.
For the purposes of an application I'm working on, I need Scrapy to break out of the crawl and start crawling again from a particular, arbitrary URL.
The intended behaviour is for Scrapy to jump back to a particular URL, which can be supplied as an argument, if a particular condition is satisfied.
I'm using CrawlSpider but can't figure out how to achieve this:
class MyCrawlSpider(CrawlSpider):
name = 'mycrawlspider'
initial_url = ""
def __init__(self, initial_url, *args, **kwargs):
self.initial_url = initial_url
domain = "mydomain.com"
self.start_urls = [initial_url]
self.allowed_domains = [domain]
self.rules = (
Rule(LinkExtractor(allow=[r"^http[s]?://(www.)?" + domain + "/.*"]), callback='parse_item', follow=True),
)
super(MyCrawlSpider, self)._compile_rules()
def parse_item(self, response):
if(some_condition is True):
# force scrapy to go back to home page and recrawl
print("Should break out")
else:
print("Just carry on")
I tried to place
return scrapy.Request(self.initial_url, callback=self.parse_item)
in the branch where some_condition is True, but without success. I would hugely appreciate some help; I've been working on trying to figure this out for hours.
You could make a custom exception that you handle appropriately, like so...
Please feel free to edit with the appropriate syntax for CrawlSpider:
class RestartException(Exception):
pass
class MyCrawlSpider(CrawlSpider):
name = 'mycrawlspider'
initial_url = ""
def __init__(self, initial_url, *args, **kwargs):
self.initial_url = initial_url
domain = "mydomain.com"
self.start_urls = [initial_url]
self.allowed_domains = [domain]
self.rules = (
Rule(LinkExtractor(allow=[r"^http[s]?://(www.)?" + domain + "/.*"]), callback='parse_item', follow=True),
)
super(MyCrawlSpider, self)._compile_rules()
def parse_item(self, response):
if(some_condition is True):
print("Should break out")
raise RestartException("We're restarting now")
else:
print("Just carry on")
siteName = "http://whatever.com"
crawler = MyCrawlSpider(siteName)
while True:
try:
#idk how you start this thing, but do that
crawler.run()
break
except RestartException as err:
print(err.args)
crawler.something = err.args
continue
print("I'm done!")
I'm using Qt's QWebPage to render a page that uses javascript to update its content dynamically - so a library that just downloads a static version of the page (such as urllib2) won't work.
My problem is, when I render a second page, about 99% of the time the program just crashes. At other times, it will work three times before crashing. I've also gotten a few segfaults, but it is all very random.
My guess is that the object I'm using to render isn't getting deleted properly, so trying to reuse it is causing problems. I've looked all over and no one really seems to be having this same issue.
Here's the code I'm using. The program downloads web pages from steam's community market so I can create a database of all the items. I need to call the getItemsFromPage function multiple times to get all of the items, as they are broken up into pages (showing results 1-10 out of X amount).
import csv
import re
import sys
from string import replace
from bs4 import BeautifulSoup
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Item:
__slots__ = ("name", "count", "price", "game")
def __repr__(self):
return self.name + "(" + str(self.count) + ")"
def __str__(self):
return self.name + ", " + str(self.count) + ", $" + str(self.price)
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
self.deleteLater()
def getItemsFromPage(appid, page=1):
r = Render("http://steamcommunity.com/market/search?q=appid:" + str(appid) + "#p" + str(page))
soup = BeautifulSoup(str(r.frame.toHtml().toUtf8()))
itemLst = soup.find_all("div", "market_listing_row market_recent_listing_row")
items = []
for k in itemLst:
i = Item()
i.name = k.find("span", "market_listing_item_name").string
i.count = int(replace(k.find("span", "market_listing_num_listings_qty").string, ",", ""))
i.price = float(re.search(r'\$([0-9]+\.[0-9]+)', str(k)).group(1))
i.game = appid
items.append(i)
return items
if __name__ == "__main__":
print "Updating market items to dota2.csv ..."
i = 1
with open("dota2.csv", "w") as f:
writer = csv.writer(f)
r = None
while True:
print "Page " + str(i)
items = getItemsFromPage(570)
if len(items) == 0:
print "No items found, stopping..."
break
for k in items:
writer.writerow((k.name, k.count, k.price, k.game))
i += 1
print "Done."
Calling getItemsFromPage once works fine. Subsequent calls give me my problem. The output of the program is typically
Updating market items to dota2.csv ...
Page 1
Page 2
and then it crashes. It should go on for over 700 pages.
The problem with your program is that you are attempting to create a new QApplication with every url you fetch.
Instead, only one QApplication and one WebPage should be created. The WebPage can use its loadFinished signal to create an internal loop by fetching a new url after each one has been processed. Custom html processing can be added by connecting a user-defined slot to a signal which emits the html text and the url when they become available. The scripts below (for PyQt5 and PyQt4) show how to implement this.
Here are some examples which show how to use the WebPage class:
Usage:
def my_html_processor(html, url):
print('loaded: [%d chars] %s' % (len(html), url))
import sys
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
webpage.htmlReady.connect(my_html_processor)
# example 1: process list of urls
urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)
# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url = 'https://en.wikipedia.org/wiki/Special:Random'
# webpage.process(itertools.repeat(url))
sys.exit(app.exec_())
PyQt5 WebPage:
from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class WebPage(QWebEnginePage):
htmlReady = pyqtSignal(str, str)
def __init__(self, verbose=False):
super().__init__()
self._verbose = verbose
self.loadFinished.connect(self.handleLoadFinished)
def process(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.load(QUrl(url))
return True
def processCurrentPage(self, html):
self.htmlReady.emit(html, self.url().toString())
if not self.fetchNext():
QApplication.instance().quit()
def handleLoadFinished(self):
self.toHtml(self.processCurrentPage)
def javaScriptConsoleMessage(self, *args, **kwargs):
if self._verbose:
super().javaScriptConsoleMessage(*args, **kwargs)
PyQt4 WebPage:
from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
class WebPage(QWebPage):
htmlReady = pyqtSignal(str, str)
def __init__(self, verbose=False):
super(WebPage, self).__init__()
self._verbose = verbose
self.mainFrame().loadFinished.connect(self.handleLoadFinished)
def process(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.mainFrame().load(QUrl(url))
return True
def processCurrentPage(self):
self.htmlReady.emit(
self.mainFrame().toHtml(), self.mainFrame().url().toString())
print('loaded: [%d bytes] %s' % (self.bytesReceived(), self.mainFrame().url().toString()))
def handleLoadFinished(self):
self.processCurrentPage()
if not self.fetchNext():
QApplication.instance().quit()
def javaScriptConsoleMessage(self, *args, **kwargs):
if self._verbose:
super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)
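To tie this back to the original Steam market scraper, here is a hedged sketch (my own adaptation, assuming the PyQt5 WebPage above and BeautifulSoup 4; the URL pattern and CSS classes are copied from the question): parse each rendered page in the htmlReady slot and quit once a page contains no listings.
import itertools
import sys
from bs4 import BeautifulSoup
from PyQt5.QtWidgets import QApplication

def handle_html(html, url):
    soup = BeautifulSoup(html, 'html.parser')
    rows = soup.find_all('div', 'market_listing_row market_recent_listing_row')
    if not rows:
        # no items on this page: stop the event loop (the url generator below
        # is infinite, so this is the only exit)
        QApplication.instance().quit()
        return
    for row in rows:
        print(row.find('span', 'market_listing_item_name').string)

app = QApplication(sys.argv)
webpage = WebPage()
webpage.htmlReady.connect(handle_html)
urls = ('http://steamcommunity.com/market/search?q=appid:570#p%d' % n
        for n in itertools.count(1))
webpage.process(urls)
sys.exit(app.exec_())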
I want to convert my current Tornado app from using @web.asynchronous to @gen.coroutine. My asynchronous callback is called when a particular variable change happens on an IOLoop iteration. The current example in the Tornado docs solves an I/O problem, but in my case it's the variable that I am interested in. I want the coroutine to wake up on the variable change. My app looks like the code shown below.
Note: I can only use Python2.
# A transaction is a DB change that can happen
# from another process
class Transaction:
def __init__(self):
self.status = 'INCOMPLETE'
self.callback = None
# In this, I am checking the status of the DB
# before responding to the GET request
class MainHandler(web.RequestHandler):
def initialize(self, app_reference):
self.app_reference = app_reference
@web.asynchronous
def get(self):
txn = Transaction()
callback = functools.partial(self.do_something)
txn.callback = callback
self.app_reference.monitor_transaction(txn)
def do_something(self):
self.write("Finished GET request")
self.finish()
# MyApp monitors a list of transactions and adds the callback
# 'transaction.callback' when transactions status changes to
# COMPLETE state.
class MyApp(Application):
def __init__(self, settings):
self.settings = settings
self._url_patterns = self._get_url_patterns()
self.txn_list = [] # list of all transactions being monitored
Application.__init__(self, self._url_patterns, **self.settings)
IOLoop.current().add_callback(self.check_status)
def monitor_transaction(self, txn):
self.txn_list.append(txn)
def check_status(self):
count = 0
for transaction in self.txn_list:
transaction.status = is_transaction_complete()
if transaction.status == 'COMPLETE':
IOLoop.current().add_callback(transaction.callback)
self.txn_list.pop(count)
count += 1
if len(self.txn_list):
IOLoop.current().add_callback(self.check_status)
# adds 'self' to url_patterns
def _get_url_patterns(self):
from urls import url_patterns
modified_url_patterns = []
for url in url_patterns:
modified_url_patterns.append( url + ({ 'app_reference': self },))
return modified_url_patterns
If I understand right, to write this using gen.coroutine, the get method should be modified to something like:
@gen.coroutine
def get(self):
txn = Transaction()
response = yield wake_up_when_transaction_completes()
# respond to GET here
My issue is that I am not sure how to wake the coroutine only when the status changes, and I cannot use a loop, as it would block the Tornado thread. Basically, I want to notify it from the IOLoop iteration.
def check_status():
for transaction in txn_list:
if transaction.status == 'COMPLETE':
NOTIFY_COROUTINE
Sounds like a job for the new tornado.locks! Released last week with Tornado 4.2:
http://tornado.readthedocs.org/en/latest/releases/v4.2.0.html#new-modules-tornado-locks-and-tornado-queues
Use an Event for this:
from tornado import locks, gen
event = locks.Event()
@gen.coroutine
def waiter():
print("Waiting for event")
yield event.wait()
print("Done")
@gen.coroutine
def setter():
print("About to set the event")
event.set()
More info on the Event interface:
http://tornado.readthedocs.org/en/latest/locks.html#tornado.locks.Event
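Applied to the Transaction/MainHandler code from the question, a hedged sketch might look like this (the names come from the question; exactly how the pieces fit together is my assumption, not part of the answer above):
from tornado import gen, locks, web

class Transaction(object):
    def __init__(self):
        self.status = 'INCOMPLETE'
        self.event = locks.Event()      # replaces the callback attribute

class MainHandler(web.RequestHandler):
    def initialize(self, app_reference):
        self.app_reference = app_reference

    @gen.coroutine
    def get(self):
        txn = Transaction()
        self.app_reference.monitor_transaction(txn)
        yield txn.event.wait()          # parked here until the event is set
        self.write("Finished GET request")

# and inside MyApp.check_status, instead of add_callback(transaction.callback):
#     transaction.event.set()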
From what I understand from the tornado.gen module docs, tornado.gen.Task is composed of tornado.gen.Callback and tornado.gen.Wait, with each Callback/Wait pair associated with a unique key...
@tornado.web.asynchronous
@tornado.gen.engine
def get(self):
http_client = AsyncHTTPClient()
http_client.fetch("http://google.com",
callback=(yield tornado.gen.Callback("google")))
http_client.fetch("http://python.org",
callback=(yield tornado.gen.Callback("python")))
http_client.fetch("http://tornadoweb.org",
callback=(yield tornado.gen.Callback("tornado")))
response = yield [tornado.gen.Wait("google"), tornado.gen.Wait("tornado"), tornado.gen.Wait("python")]
do_something_with_response(response)
self.render("template.html")
So the above code will get all responses from the different URLs.
Now what I actually need to accomplish is to return the response as soon as one http_client returns the data. So if 'tornadoweb.org' returns the data first, it should do a self.write(response), and a loop in def get() should keep waiting for the other http_clients to complete.
Any ideas on how to write this using tornado.gen interface.
A very vague (and syntactically incorrect) implementation of what I am trying to do would be something like this:
class GenAsyncHandler2(tornado.web.RequestHandler):
@tornado.web.asynchronous
@tornado.gen.engine
def get(self):
http_client = AsyncHTTPClient()
http_client.fetch("http://google.com",
callback=(yield tornado.gen.Callback("google")))
http_client.fetch("http://python.org",
callback=(yield tornado.gen.Callback("python")))
http_client.fetch("http://tornadoweb.org",
callback=(yield tornado.gen.Callback("tornado")))
while True:
response = self.get_response()
if response:
self.write(response)
self.flush()
else:
break
self.finish()
def get_response(self):
for key in tornado.gen.availableKeys():
if key.is_ready:
value = tornado.gen.pop(key)
return value
return None
This is a case where you shouldn't use inline callbacks, i.e. gen.
Also, self.render will be called after all the callbacks have finished. If you want to return the response from the server partially, render it partially.
Think of it this way (it's only an idea, with plenty of room for improvement):
response = []
@tornado.web.asynchronous
def get(self):
self.render('head.html')
http_client = AsyncHTTPClient()
http_client.fetch("http://google.com",
callback=self.mywrite)
http_client.fetch("http://python.org",
callback=self.mywrite)
http_client.fetch("http://tornadoweb.org",
callback=self.mywrite)
self.render('footer.html')
self.finish()
def mywrite(self, result):
self.render('body_part.html')
self.response.append(result)
if len(self.response) == 3:
do_something_with_response(self.response)
In addition to this, there is actually a method WaitAll, which waits for all results and returns only when all HTTPClients have completed giving responses.
I have submitted the diff in my tornado branch (https://github.com/pranjal5215/tornado). I have added a class WaitAny, which is an async WaitAll and returns a result as soon as one HTTPClient has returned a result.
The diff is at (https://github.com/pranjal5215/tornado/commit/dd6902147ab2c5cbf2b9c7ee9a35b4f89b40790e), (https://github.com/pranjal5215/tornado/wiki/Add-WaitAny-to-make-WaitAll-return-results-incrementally).
Sample usage:
class GenAsyncHandler2(tornado.web.RequestHandler):
@tornado.web.asynchronous
@tornado.gen.engine
def get(self):
http_client = AsyncHTTPClient()
http_client.fetch("http://google.com",
callback=(yield tornado.gen.Callback("google")))
http_client.fetch("http://python.org",
callback=(yield tornado.gen.Callback("python")))
http_client.fetch("http://tornadoweb.org",
callback=(yield tornado.gen.Callback("tornado")))
keys = set(["google", "tornado", "python"])
while keys:
key, response = yield tornado.gen.WaitAny(keys)
keys.remove(key)
# do something with response
self.write(str(key)+" ")
self.flush()
self.finish()
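For reference, stock Tornado 4.1+ ships tornado.gen.WaitIterator, which covers the same "handle whichever response finishes first" pattern without a patched branch. A hedged sketch (my own, not part of the answer above):
from tornado import gen, web
from tornado.httpclient import AsyncHTTPClient

class GenAsyncHandler3(web.RequestHandler):
    @gen.coroutine
    def get(self):
        client = AsyncHTTPClient()
        futures = dict(
            google=client.fetch("http://google.com"),
            python=client.fetch("http://python.org"),
            tornado=client.fetch("http://tornadoweb.org"),
        )
        wait_iterator = gen.WaitIterator(**futures)
        while not wait_iterator.done():
            response = yield wait_iterator.next()  # whichever finishes first
            self.write(str(wait_iterator.current_index) + " ")
            self.flush()
        self.finish()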
I wrote a simple web site crawler with a thread pool. The problem is: when the crawler has crawled the whole site it should finish, but in reality it waits for something at the end and the script never finishes. Why does this happen?
from Queue import Queue
from threading import Thread
import sys
from urllib import urlopen
from BeautifulSoup import BeautifulSoup, SoupStrainer
import re
from Queue import Queue, Empty
from threading import Thread
visited = set()
queue = Queue()
class Worker(Thread):
"""Thread executing tasks from a given tasks queue"""
def __init__(self, tasks):
Thread.__init__(self)
self.tasks = tasks
self.daemon = True
self.start()
def run(self):
while True:
func, args, kargs = self.tasks.get()
print "startcall in thread",self
print args
try: func(*args, **kargs)
except Exception, e: print e
print "stopcall in thread",self
self.tasks.task_done()
class ThreadPool:
"""Pool of threads consuming tasks from a queue"""
def __init__(self, num_threads):
self.tasks = Queue(num_threads)
for _ in range(num_threads): Worker(self.tasks)
def add_task(self, func, *args, **kargs):
"""Add a task to the queue"""
self.tasks.put((func, args, kargs))
def wait_completion(self):
"""Wait for completion of all the tasks in the queue"""
self.tasks.join()
def process(pool,host,url):
try:
print "get url",url
#content = urlopen(url).read().decode(charset)
content = urlopen(url).read()
except UnicodeDecodeError:
return
for link in BeautifulSoup(content, parseOnlyThese=SoupStrainer('a')):
#print "link",link
try:
href = link['href']
except KeyError:
continue
if not href.startswith('http://'):
href = 'http://%s%s' % (host, href)
if not href.startswith('http://%s%s' % (host, '/')):
continue
if href not in visited:
visited.add(href)
pool.add_task(process,pool,host,href)
print href
def start(host,charset):
pool = ThreadPool(7)
pool.add_task(process,pool,host,'http://%s/' % (host))
pool.wait_completion()
start('simplesite.com','utf8')
The problem I see is that you never quit the while loop in run, so it will block forever. You need to break out of that loop when the jobs are done.
You could try to:
1) insert
if not func: break
after task.get(...) in run.
2) append
pool.add_task(None, None, None)
at the end of process.
This is a way for process to notify the pool that it has no more tasks to process. A sketch of the modified run method follows.
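A hedged sketch of change 1) applied to the Worker class from the question (Python 2, matching the original; note that a single sentinel only stops the one worker that happens to receive it, so with several workers you would need to add one sentinel per thread):
def run(self):
    while True:
        func, args, kargs = self.tasks.get()
        if not func:
            # sentinel task: no more work, leave the loop
            self.tasks.task_done()
            break
        print "startcall in thread", self
        print args
        try: func(*args, **kargs)
        except Exception, e: print e
        print "stopcall in thread", self
        self.tasks.task_done()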