open_spider method runs two times when using CrawlerProcess - python

I want to run multiple spiders, so I tried to use CrawlerProcess. But I find that the open_spider method runs two times, at the beginning and at the end, together with the process_item method.
This causes a problem: when the spider opens I remove my collection and then save the data into MongoDB; once the save is complete, my collection gets removed again at the end.
How do I fix the issue, and why does the open_spider method run two times?
I type scrapy crawl movies to run the project.
Here is my movies.py:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import time
# scrapy api imports
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from Tainan.FirstSpider import FirstSpider


class MoviesSpider(scrapy.Spider):
    name = 'movies'
    allowed_domains = ['tw.movies.yahoo.com', 'movies.yahoo.com.tw']
    start_urls = ['http://tw.movies.yahoo.com/movie_thisweek.html/']

process = CrawlerProcess(get_project_settings())
process.crawl(FirstSpider)
process.start()
It's my FirstSpider.py:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request


class FirstSpider(scrapy.Spider):
    name = 'first'
    allowed_domains = ['tw.movies.yahoo.com', 'movies.yahoo.com.tw']
    start_urls = ['http://tw.movies.yahoo.com/movie_thisweek.html/']

    def parse(self, response):
        movieHrefs = response.xpath('//*[@class="release_movie_name"]/a/@href').extract()
        for movieHref in movieHrefs:
            yield Request(movieHref, callback=self.parse_page)

    def parse_page(self, response):
        print 'FirstSpider => parse_page'
        movieImage = response.xpath('//*[@class="foto"]/img/@src').extract()
        cnName = response.xpath('//*[@class="movie_intro_info_r"]/h1/text()').extract()
        enName = response.xpath('//*[@class="movie_intro_info_r"]/h3/text()').extract()
        movieDate = response.xpath('//*[@class="movie_intro_info_r"]/span/text()')[0].extract()
        movieTime = response.xpath('//*[@class="movie_intro_info_r"]/span/text()')[1].extract()
        imdbScore = response.xpath('//*[@class="movie_intro_info_r"]/span/text()')[3].extract()
        movieContent = response.xpath('//*[@class="gray_infobox_inner"]/span/text()').extract_first().strip()
        yield {'image': movieImage, 'cnName': cnName, 'enName': enName, 'movieDate': movieDate, 'movieTime': movieTime, 'imdbScore': imdbScore, 'movieContent': movieContent}
It's my pipelines.py:
from pymongo import MongoClient
from scrapy.conf import settings


class MongoDBPipeline(object):

    global open_count
    open_count = 1
    global process_count
    process_count = 1

    def __init__(self):
        connection = MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    # My issue is here: it will print open_spider count = 2 at the end.
    def open_spider(self, spider):
        global open_count
        print 'Pipelines => open_spider count =>'
        print open_count
        open_count += 1
        self.collection.remove({})

    # open_spider is called a first time and process_item saves the data to my mongodb.
    # But when process_item has completed, open_spider runs again... which causes the data I just saved to be removed.
    def process_item(self, item, spider):
        global process_count
        print 'Pipelines => process_item count =>'
        print process_count
        process_count += 1
        self.collection.insert(dict(item))
        return item
I can't figure it out; if someone can help me out, that would be appreciated. Thanks in advance.

How do I fix the issue, and why does the open_spider method run two times?
The open_spider method runs once per spider, and you're running two spiders.
I type scrapy crawl movies to run the project
The crawl command will run the spider named movies (MoviesSpider).
To do this, it has to import the movies module, which will cause it to run your FirstSpider as well.
Now, how to fix this depends on what you want to do.
Maybe you should only run a single spider, or have separate settings per spider, or maybe something entirely different.
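If the goal is simply to run both spiders from one script, a minimal sketch of that approach (the module path for MoviesSpider is an assumption on my part) is to move the CrawlerProcess code out of movies.py into its own file, so that importing the spider module no longer starts a crawl:
# run_spiders.py -- hypothetical separate script; nothing in the spider modules
# should call CrawlerProcess at import time.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from Tainan.FirstSpider import FirstSpider
from Tainan.spiders.movies import MoviesSpider  # assumed module path


process = CrawlerProcess(get_project_settings())
process.crawl(FirstSpider)
process.crawl(MoviesSpider)
process.start()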

Related

Scrapy ModuleNotFoundError: No module named 'MySQLdb'

Just started out with Scrapy and I am trying to write to a MySQL database rather than outputting to a csv.
I found the code here: https://gist.github.com/tzermias/6982723, which I am using to try to make this work, but unfortunately I am getting an error that I can't get my head around.
This is my pipelines.py:
class WebsitePipeline(object):
    def process_item(self, item, spider):
        return item


import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy import log

SETTINGS = get_project_settings()


class MySQLPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.stats)

    def __init__(self, stats):
        # Instantiate DB
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            host=SETTINGS['DB_HOST'],
            user=SETTINGS['DB_USER'],
            passwd=SETTINGS['DB_PASSWD'],
            port=SETTINGS['DB_PORT'],
            db=SETTINGS['DB_DB'],
            charset='utf8',
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor
        )
        self.stats = stats
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """ Cleanup function, called after crawling has finished to close open
        objects.
        Close ConnectionPool. """
        self.dbpool.close()

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._insert_record, item)
        query.addErrback(self._handle_error)
        return item

    def _insert_record(self, tx, item):
        result = tx.execute(
            """ INSERT INTO table VALUES (1,2,3)"""
        )
        if result > 0:
            self.stats.inc_value('database/items_added')

    def _handle_error(self, e):
        log.err(e)
This is what is in my settings.py:
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Website.pipelines.MySQLPipeline': 300,
}

# Database settings
DB_HOST = 'localhost'
DB_PORT = 3306
DB_USER = 'username'
DB_PASSWD = 'password'
DB_DB = 'scrape'
This is the spider.py:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import SitemapSpider


class WebsitesitemapSpider(SitemapSpider):
    name = 'Websitesitemap'
    allowed_domains = ['Website.com']
    sitemap_urls = ['https://www.Website.com/robots.txt']

    def parse(self, response):
        yield {response.url}
I have been unable to find a working example of what I am trying to do, so I can't work out where I am going wrong. Thank you to anyone who looks at this or might be able to help.
You will need MySQL-python installed in your Python environment, along with libmysql installed on the operating system.
On Ubuntu this would be achieved in the following manner:
pip install MySQL-python
sudo apt-get install libmysql-dev
Do you have these packages installed: MySQLdb, scrapy, twisted?
If not, try installing them using pip and then try running the script again.
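As a quick sanity check (my own suggestion, not part of the original answers), you can try importing the binding directly with the same Python 2 interpreter that runs Scrapy:
# if this raises ImportError, MySQL-python is not installed in this environment
import MySQLdb
print 'MySQLdb imported from', MySQLdb.__file__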

Django Celery Scrapy ERROR: twisted.internet.error.ReactorNotRestartable

I have the following flow:
Command 'collect' (collect_positions.py) -> Celery task (tasks.py) -> Scrapy spider (MySpider) ...
collect_positions.py:
from django.core.management.base import BaseCommand

from tracker.models import Keyword
from tracker.tasks import positions


class Command(BaseCommand):
    help = 'collect_positions'

    def handle(self, *args, **options):
        def chunks(l, n):
            """Yield successive n-sized chunks from l."""
            for i in range(0, len(l), n):
                yield l[i:i + n]

        chunk_size = 1
        keywords = Keyword.objects.filter(product=product).values_list('id', flat=True)
        chunks_list = list(chunks(keywords, chunk_size))
        positions.chunks(chunks_list, 1).apply_async(queue='collect_positions')
        return 0
tasks.py:
from app_name.celery import app
from scrapy.settings import Settings
from scrapy_app import settings as scrapy_settings
from scrapy_app.spiders.my_spider import MySpider
from tracker.models import Keyword
from scrapy.crawler import CrawlerProcess


@app.task
def positions(*args):
    s = Settings()
    s.setmodule(scrapy_settings)
    keywords = Keyword.objects.filter(id__in=list(args))
    process = CrawlerProcess(s)
    process.crawl(MySpider, keywords_chunk=keywords)
    process.start()
    return 1
I run the command through the command line, which creates tasks for parsing. The first task in the queue completes successfully, but the others return an error:
twisted.internet.error.ReactorNotRestartable
Please tell me how I can fix this error.
I can provide any data if there is a need...
UPDATE 1
Thanks for the answer, @Chiefir! I managed to run all the queues, but only the start_requests() function is started, and parse() does not run.
The main functions of the Scrapy spider:
def start_requests(self):
    print('STEP1')
    yield scrapy.Request(
        url='exmaple.com',
        callback=self.parse,
        errback=self.error_callback,
        dont_filter=True
    )

def error_callback(self, failure):
    print(failure)
    # log all errback failures,
    # in case you want to do something special for some errors,
    # you may need the failure's type
    print(repr(failure))

    # if isinstance(failure.value, HttpError):
    if failure.check(HttpError):
        # you can get the response
        response = failure.value.response
        print('HttpError on %s', response.url)

    # elif isinstance(failure.value, DNSLookupError):
    elif failure.check(DNSLookupError):
        # this is the original request
        request = failure.request
        print('DNSLookupError on %s', request.url)

    # elif isinstance(failure.value, TimeoutError):
    elif failure.check(TimeoutError):
        request = failure.request
        print('TimeoutError on %s', request.url)

def parse(self, response):
    print('STEP2', response)
In the console I get:
STEP1
What could be the reason?
This question is as old as the world:
This is what helped me win the battle against the ReactorNotRestartable error: the last answer from the author of the question.
0) pip install crochet
1) import from crochet import setup
2) setup() - at the top of the file
3) remove 2 lines:
a) d.addBoth(lambda _: reactor.stop())
b) reactor.run()
I had the same problem with this error, spent 4+ hours solving it, and read all the questions here about it. Finally I found that one, and I am sharing it. That is how I solved this. The only meaningful lines left from the Scrapy docs are the last 2 lines in this code of mine:
# some more imports
from crochet import setup
setup()

def run_spider(spiderName):
    module_name = "first_scrapy.spiders.{}".format(spiderName)
    scrapy_var = import_module(module_name)             # do some dynamic import of selected spider
    spiderObj = scrapy_var.mySpider()                   # get mySpider-object from spider module
    crawler = CrawlerRunner(get_project_settings())     # from Scrapy docs
    crawler.crawl(spiderObj)                            # from Scrapy docs
This code allows me to select which spider to run just by passing its name to the run_spider function, and, after scraping finishes, to select another spider and run it again.
In your case you need to create, in a separate file, a separate function which runs your spiders, and run it from your task. That is usually how I do it :)
P.S. And there really is no way to restart the Twisted reactor.
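A hedged sketch of that "separate function" idea, combining crochet with CrawlerRunner so a Celery task can call it and block until the crawl finishes (the function name and the timeout value are assumptions, not from the original answer):
from crochet import setup, wait_for
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

setup()  # install crochet's managed reactor once, at import time


@wait_for(timeout=3600)
def run_crawl(spider_cls, **spider_kwargs):
    # CrawlerRunner.crawl() returns a Deferred; crochet runs it on its reactor
    # thread and makes this call block until the crawl has finished.
    runner = CrawlerRunner(get_project_settings())
    return runner.crawl(spider_cls, **spider_kwargs)
The Celery task would then just call run_crawl(MySpider, keywords_chunk=keywords) instead of creating a CrawlerProcess and calling process.start().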
UPDATE 1
I don't know if you need to call a start_requests() method. For me it usually works just with this code:
class mySpider(scrapy.Spider):
    name = "somname"
    allowed_domains = ["somesite.com"]
    start_urls = ["https://somesite.com"]

    def parse(self, response):
        pass

    def parse_dir_contents(self, response):  # for crawling additional links
        pass
You can fix this by setting the parameter stop_after_crawl to False on the start method of CrawlerProcess:
stop_after_crawl (bool) – stop or not the reactor when all crawlers have finished
@shared_task
def crawl(m_id, *args, **kwargs):
    process = CrawlerProcess(get_project_settings(), install_root_handler=False)
    process.crawl(SpiderClass, m_id=m_id)
    process.start(stop_after_crawl=False)

Running multiple instances of a CrawlSpider

I'm just getting started using Scrapy and I'd like to do the following:
Have a list of n domains
i=0
loop for i to n
Use a (mostly) generic CrawlSpider to get all links (a href) of domain[i]
Save results as json lines
To do this, the spider needs to receive the domain it has to crawl as an argument.
I already successfully created the CrawlSpider:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Item, Field
from scrapy.crawler import CrawlerProcess


class MyItem(Item):
    # MyItem Fields


class SubsiteSpider(CrawlSpider):
    name = "subsites"
    start_urls = []
    allowed_domains = []
    rules = (Rule(LinkExtractor(), callback='parse_obj', follow=True),)

    def __init__(self, starturl, allowed, *args, **kwargs):
        print(args)
        self.start_urls.append(starturl)
        self.allowed_domains.append(allowed)
        super().__init__(**kwargs)

    def parse_obj(self, response):
        item = MyItem()
        # fill Item Fields
        return item


process = CrawlerProcess({'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
process.crawl(SubsiteSpider)
process.start()
If I call it with scrapy crawl subsites -a starturl=http://example.com -a allowed=example.com -o output.jl,
the result is exactly as I want it, so this part is fine already.
What I fail to do is create multiple instances of SubsiteSpider, each with a different domain as an argument.
I tried (in SpiderRunner.py)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
process = CrawlerProcess(get_project_settings())
process.crawl('subsites', ['https://example.com', 'example.com'])
process.start()
Variant:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
process = CrawlerProcess(get_project_settings())
allowed = ["example.com"]
start = ["https://example.com"]
process.crawl('subsites', start, allowed)
process.start()
But I get an error that occurs, I presume, because the arguments are not properly passed to __init__, for example TypeError: __init__() missing 1 required positional argument: 'allowed' or TypeError: __init__() missing 2 required positional arguments: 'starturl' and 'allowed'.
(The loop is yet to be implemented.)
So, here are my questions:
1) What is the proper way to pass arguments to __init__ if I do not start crawling via the scrapy shell, but from within Python code?
2) How can I also pass the -o output.jl argument? (Or maybe use the allowed argument as the filename?)
3) I am fine with this running each spider one after another - would it still be considered best / good practice to do it that way? Could you point to a more extensive tutorial about "running the same spider again and again, with different arguments (= target domains), optionally in parallel", if there is one?
Thank you all very much in advance!
If there are any spelling mistakes (I am not a native English speaker), or if the question / details are not precise enough, please tell me how to correct them.
There are a few problems with your code:
start_urls and allowed_domains are class attributes which you modify in __init__(), making them shared across all instances of your class.
What you should do instead is make them instance attributes:
class SubsiteSpider(CrawlSpider):
    name = "subsites"
    rules = (Rule(LinkExtractor(), callback='parse_obj', follow=True),)

    def __init__(self, starturl, allowed, *args, **kwargs):
        self.start_urls = [starturl]
        self.allowed_domains = [allowed]
        super().__init__(*args, **kwargs)
Those last 3 lines should not be in the file with your spider class, since you probably don't want to run that code each time your spider is imported.
Your calling of CrawlerProcess.crawl() is slightly wrong. You can use it like this, passing the arguments in the same manner you'd pass them to the spider class's __init__():
process = CrawlerProcess(get_project_settings())
process.crawl('subsites', 'https://example.com', 'example.com')
process.start()
How can I also pass the -o output.jl argument? (or maybe, use allowed argument as filename?)
You can achieve the same effect using custom_settings, giving each instance a different FEED_URI setting.
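A hedged sketch of that idea (the attribute and file names are just examples, not from the original answer): Scrapy's FEED_URI supports placeholders such as %(name)s, and any other %(...)s placeholder is filled from the spider attribute of the same name, so a single class-level setting can still give every instance its own output file:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class SubsiteSpider(CrawlSpider):
    name = "subsites"
    rules = (Rule(LinkExtractor(), callback='parse_obj', follow=True),)
    custom_settings = {
        'FEED_FORMAT': 'jsonlines',
        'FEED_URI': 'output-%(allowed)s.jl',  # filled from self.allowed at runtime
    }

    def __init__(self, starturl, allowed, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [starturl]
        self.allowed_domains = [allowed]
        self.allowed = allowed  # referenced by the FEED_URI placeholder

    def parse_obj(self, response):
        yield {'url': response.url}
Note that custom_settings is still a class attribute; the per-instance behaviour comes from the placeholder being resolved against each running spider.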

PyQt4 Scrapy Implementation

Using Scrapy I faced the problem of JavaScript-rendered pages. For the site Forum Franchise, for example the link http://www.idee-franchise.com/forum/viewtopic.php?f=3&t=69, when trying to scrape the source HTML I couldn't retrieve any posts, because they seem to be "appended" after the page is rendered (probably through JavaScript).
So I was looking on the net for a solution to this problem, and I came across https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/ .
I am completely new to PyQt, but was hoping to take a shortcut and copy-paste some code.
This worked perfectly when I tried to scrape a single page. But when I implemented this in Scrapy I get the following error:
QObject::connect: Cannot connect (null)::configurationAdded(QNetworkConfiguration) to QNetworkConfigurationManager::configurationAdded(QNetworkConfiguration)
QObject::connect: Cannot connect (null)::configurationRemoved(QNetworkConfiguration) to QNetworkConfigurationManager::configurationRemoved(QNetworkConfiguration)
QObject::connect: Cannot connect (null)::configurationChanged(QNetworkConfiguration) to QNetworkConfigurationManager::configurationChanged(QNetworkConfiguration)
QObject::connect: Cannot connect (null)::onlineStateChanged(bool) to QNetworkConfigurationManager::onlineStateChanged(bool)
QObject::connect: Cannot connect (null)::configurationUpdateComplete() to QNetworkConfigurationManager::updateCompleted()
If I scrape a single page, no error occurs, but when I set the crawler to recursive mode, then right at the second link I get an error that python.exe stopped working, along with the above error.
I was searching for what this could be, and somewhere I read that a QApplication object should only be instantiated once.
Could someone please tell me what should be the proper implementation?
The Spider
# -*- coding: utf-8 -*-
import scrapy
import sys, traceback
from bs4 import BeautifulSoup as bs
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from crawler.items import ThreadItem, PostItem
from crawler.utils import utils


class IdeefranchiseSpider(CrawlSpider):
    name = "ideefranchise"
    allowed_domains = ["idee-franchise.com"]
    start_urls = (
        'http://www.idee-franchise.com/forum/',
        # 'http://www.idee-franchise.com/forum/viewtopic.php?f=3&t=69',
    )
    rules = [
        Rule(LinkExtractor(allow='/forum/'), callback='parse_thread', follow=True)
    ]

    def parse_thread(self, response):
        print "Parsing Thread", response.url
        thread = ThreadItem()
        thread['url'] = response.url
        thread['domain'] = self.allowed_domains[0]
        thread['title'] = self.get_thread_title(response)
        thread['forumname'] = self.get_thread_forum_name(response)
        thread['posts'] = self.get_thread_posts(response)
        yield thread

        # paginate if possible
        next_page = response.css('fieldset.display-options > a::attr("href")')
        if next_page:
            url = response.urljoin(next_page[0].extract())
            yield scrapy.Request(url, self.parse_thread)

    def get_thread_posts(self, response):
        # using PYQTRenderor to reload page. I think this is where the problem
        # occurs, when i initiate the PYQTPageRenderor object.
        soup = bs(unicode(utils.PYQTPageRenderor(response.url).get_html()))
        # sleep so that PYQT can render page
        # time.sleep(5)
        # comments
        posts = []
        for item in soup.select("div.post.bg2") + soup.select("div.post.bg1"):
            try:
                post = PostItem()
                post['profile'] = item.select("p.author > strong > a")[0].get_text()
                details = item.select('dl.postprofile > dd')
                post['date'] = details[2].get_text()
                post['content'] = item.select('div.content')[0].get_text()
                # appending the comment
                posts.append(post)
            except:
                e = sys.exc_info()[0]
                self.logger.critical("ERROR GET_THREAD_POSTS %s", e)
                traceback.print_exc(file=sys.stdout)
        return posts
The PyQt implementation
import sys
from PyQt4.QtCore import QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage


class Render(QWebPage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()


class PYQTPageRenderor(object):
    def __init__(self, url):
        self.url = url

    def get_html(self):
        r = Render(self.url)
        return unicode(r.frame.toHtml())
The proper implementation, if you want to do it yourself, would be to create a downloader middleware that uses PyQt to process a request. It will be instantiated once by Scrapy.
It should not be that complicated; just follow these steps (a sketch follows the list):
Create QTDownloader class in the middleware.py file of your project
The constructor should create the QApplication object.
The process_request method should do the url loading, and HTML fetching. Note that you return a Response object with the HTML string.
You might do appropriate clean-up in a _cleanup method of your class.
Finally, activate your middleware by adding it to the DOWNLOADER_MIDDLEWARES variable of the settings.py file of your project.
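A minimal sketch of such a middleware (class, module, and priority are hypothetical, and it assumes a single QApplication per process with a local QEventLoop per request is enough for your pages; it is not a tested drop-in implementation):
# middlewares.py (hypothetical) -- enable with:
# DOWNLOADER_MIDDLEWARES = {'crawler.middlewares.QTDownloaderMiddleware': 543}
import sys

from PyQt4.QtCore import QUrl, QEventLoop
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
from scrapy.http import HtmlResponse


class QTDownloaderMiddleware(object):
    def __init__(self):
        # One QApplication for the whole process; reuse it if it already exists.
        self.app = QApplication.instance() or QApplication(sys.argv)

    def process_request(self, request, spider):
        page = QWebPage()
        loop = QEventLoop()
        # Block until the page (including its JavaScript) has finished loading.
        page.loadFinished.connect(loop.quit)
        page.mainFrame().load(QUrl(request.url))
        loop.exec_()
        html = unicode(page.mainFrame().toHtml())
        # Returning a response here means Scrapy skips its own download of the URL.
        return HtmlResponse(url=request.url, body=html.encode('utf-8'),
                            encoding='utf-8', request=request)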
If you don't want to write your own solution, you could use an existing middleware that uses Selenium to do the downloading, like scrapy-webdriver. If you don't want to have a visible browser, you can instruct it to use PhantomJS.
EDIT1:
So the proper way to do it, as pointed out by Rejected, is to use a download handler. The idea is similar, but the downloading should happen in a download_request method, and it should be enabled by adding it to DOWNLOAD_HANDLERS. Take a look at the WebdriverDownloadHandler for an example.

Scrapy: Pass arguments to spider that is defined in /spiders directory

I'm writing a script that uses CrawlerProcess to run a spider of class MySpider defined in mybot/spiders/myspider.py.
Here's the relevant part of my code:
# in main.py
from scrapy.crawler import CrawlerProcess
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy.xlib.pydispatch import dispatcher

items = []
settings = get_project_settings()
process = CrawlerProcess(settings)
process.crawl("joe", domain='dictionary.com')

def add_item(item):
    items.append(item)

dispatcher.connect(add_item, signals.item_passed)  # adds result from spider to items
process.start()
.
# in /spiders/myspider.py
from scrapy.spiders import Spider
from mybot.items import MyItem


class MySpider(Spider):
    name = "joe"
    allowed_domains = ["dictionary.com"]
    start_urls = ["http://dictionary.reference.com/"]

    def parse(self, response):
        for sel in response.xpath('//tr[@class="alt"]'):
            new_item = MyItem()
            new_item['name'] = sel.xpath('td/a/text()')[0].extract()
            yield(new_item)
Now, I want to change the program so that I can pass some other start_url to the spider from main.py. It looks like I can pass the allowed_domains argument to the spider via
process.crawl("joe", domain='dictionary.com')
but I don't know how to generalize that.
I think I have to redefine MySpider's constructor to accept an optional argument, but it doesn't look like the spider is created in main.py
(and in fact, the statement new_spider = MySpider() returns the error global name 'MySpider' is not defined).
So my question is twofold:
How do I change the spider's constructor?
How do I pass the start_urls to the spider from main.py?
Or is there perhaps a different solution altogether?
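For what it's worth, a hedged sketch of one common pattern (my sketch, not an answer from the thread): CrawlerProcess.crawl() forwards extra keyword arguments to the spider's constructor, so the constructor can turn a hypothetical start_url argument into start_urls:
# in /spiders/myspider.py -- constructor accepts an optional start_url
from scrapy.spiders import Spider


class MySpider(Spider):
    name = "joe"
    allowed_domains = ["dictionary.com"]

    def __init__(self, start_url=None, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        if start_url:
            self.start_urls = [start_url]

# in main.py -- keyword arguments after the spider name reach __init__:
# process.crawl("joe", start_url="http://dictionary.reference.com/")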
