I'm having problems with Scrapy pipelines.
EnricherPipeline never starts. I put a debugger on the first line of its process_item and it never gets control.
JsonPipeline does start, but the first argument it receives is of type generator object process_item rather than the MatchItem instance it should receive (when I disable EnricherPipeline, JsonPipeline works as expected).
class MatchSpider(CrawlSpider):

    def parse(self, response):
        browser = Browser(browser='Chrome')
        browser.get(response.url)
        browser.find_element_by_xpath('//a[contains(text(), "{l}") and @title="{c}"]'.format(l=self.league, c=self.country)).click()
        browser.find_element_by_xpath('//select[@id="seasons"]/option[text()="{s}"]'.format(s=self.season.replace('-', '/'))).click()
        browser.find_element_by_xpath('//a[contains(text(), "Fixture")]').click()
        page_matches = browser.find_elements_by_xpath('//*[contains(@class, "result-1 rc")]')
        matches = []  # match URLs collected from the fixtures page
        matches.extend([m.get_attribute('href') for m in page_matches])
        for m in matches[:1]:
            yield Request(m, callback=self.process_match, dont_filter=True)

    def process_match(self, response):
        match_item = MatchItem()
        match_item['url'] = response.url
        match_item['project'] = self.settings.get('BOT_NAME')
        match_item['spider'] = self.name
        match_item['server'] = socket.gethostname()
        match_item['date'] = datetime.datetime.now()
        return match_item
class EnricherPipeline:

    def process_item(self, item, spider):
        self.match = defaultdict(dict)
        self.match['date'] = item['match']['startTime']
        self.match['referee'] = item['match']['refereeName']
        self.match['stadium'] = item['match']['venueName']
        self.match['exp_mins'] = item['match']['expandedMinutes']
        yield self.match
class JsonPipeline:

    def process_item(self, item, scraper):
        output_dir = 'data/matches/{league}/{season}'.format(league=scraper.league, season=scraper.season)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        file_name = "-".join([str(datetime.strptime(item['date'], '%Y-%m-%dT%H:%M:%S').date()),
                              item['home']['name'], item['away']['name']]) + '.json'
        item_path = os.sep.join((output_dir, file_name))
        with open(item_path, 'w') as f:
            f.write(json.dumps(item))
ITEM_PIPELINES = {
    'scrapers.whoscored.whoscored.pipelines.EnricherPipeline': 300,
    'scrapers.whoscored.whoscored.pipelines.JsonPipeline': 800,
}
OK, so the problem was that EnricherPipeline was yielding instead of returning a result. After changing it to return, everything worked as expected, although I still don't understand why the debugger never fired in that first pipeline.
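For clarity, a minimal sketch of the corrected pipeline, reusing the field names from the code above (this is an illustration of the fix, not the poster's exact final code). It also explains the debugger mystery: the body of a generator function does not run until the generator is iterated, so a breakpoint on its first line never fires.

from collections import defaultdict


class EnricherPipeline:

    def process_item(self, item, spider):
        # Build the enriched record and *return* it: Scrapy hands the return
        # value of process_item to the next pipeline. Yielding instead turns
        # the method into a generator, so the next pipeline receives a
        # generator object rather than an item.
        match = defaultdict(dict)
        match['date'] = item['match']['startTime']
        match['referee'] = item['match']['refereeName']
        match['stadium'] = item['match']['venueName']
        match['exp_mins'] = item['match']['expandedMinutes']
        return match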
I write Scrapy code like @FranGoitia does, returning the item as a dict, and it reaches the pipeline fine.
The real reason is: if you yield anything that is not based on dict, the Scrapy engine will not call the pipeline with an actual item.
Oddly enough, I spent three days finding this...
I'm scraping reviews from MOOCs like this one.
From there I'm getting all the course details (5 fields) and another 6 fields from each review itself.
This is the code I have for the course details:
def parse_reviews(self, response):
    l = ItemLoader(item=MoocsItem(), response=response)
    l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
    l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
    l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
    l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
    l.add_value('course_link', response.url)
    return l.load_item()
Now I want to include the review details, another 5 fields for each review.
Since the course data is common to all the reviews, I want to store it in a different file and use the course name/id to relate the data afterwards.
This is the code I have for the review fields:
for review in response.xpath('//*[@class="review-body"]'):
    review_body = review.xpath('.//div[@class="review-body__content"]//text()').extract()
    course_stage = review.xpath('.//*[@class="review-body-info__course-stage--completed"]//text()').extract()
    user_name = review.xpath('.//*[@class="review-body__username"]//text()').extract()
    review_date = review.xpath('.//*[@itemprop="datePublished"]/@datetime').extract()
    score = review.xpath('.//*[@class="sr-only"]//text()').extract()
I tried a temporary workaround, returning all the fields in a single item for each case, but it is not working either:
def parse_reviews(self, response):
    # print response.body
    l = ItemLoader(item=MoocsItem(), response=response)
    # l = MyItemLoader(selector=response)
    l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
    l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
    l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
    l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
    l.add_value('course_link', response.url)
    for review in response.xpath('//*[@class="review-body"]'):
        l.add_xpath('review_body', './/div[@class="review-body__content"]//text()')
        l.add_xpath('course_stage', './/*[@class="review-body-info__course-stage--completed"]//text()')
        l.add_xpath('user_name', './/*[@class="review-body__username"]//text()')
        l.add_xpath('review_date', './/*[@itemprop="datePublished"]/@datetime')
        l.add_xpath('score', './/*[@class="sr-only"]//text()')
        yield l.load_item()
The output file from that script is corrupted: cells are displaced and the field sizes are not correct.
EDIT:
I want to have two files at the output:
The first one containing:
course_title,course_description,course_instructors,course_key_concepts,course_link
And the second one with:
course_title,review_body,course_stage,user_name,review_date,score
The issue is that you are mixing everything up into a single item, which is not the right way to do it. You should create two items, MoocsItem and MoocsReviewItem, and then update the code as below:
def parse_reviews(self, response):
    # print response.body
    l = ItemLoader(item=MoocsItem(), response=response)
    l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
    l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
    l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
    l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
    l.add_value('course_link', response.url)
    item = l.load_item()

    for review in response.xpath('//*[@class="review-body"]'):
        r = ItemLoader(item=MoocsReviewItem(), response=response, selector=review)
        r.add_value('course_title', item['course_title'])
        r.add_xpath('review_body', './/div[@class="review-body__content"]//text()')
        r.add_xpath('course_stage', './/*[@class="review-body-info__course-stage--completed"]//text()')
        r.add_xpath('user_name', './/*[@class="review-body__username"]//text()')
        r.add_xpath('review_date', './/*[@itemprop="datePublished"]/@datetime')
        r.add_xpath('score', './/*[@class="sr-only"]//text()')
        yield r.load_item()

    yield item
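For reference, a minimal sketch of what the two item classes could look like in items.py; the field names are taken from the loaders above, so treat this as an assumption about your project rather than its actual code:

import scrapy


class MoocsItem(scrapy.Item):
    course_title = scrapy.Field()
    course_description = scrapy.Field()
    course_instructors = scrapy.Field()
    course_key_concepts = scrapy.Field()
    course_link = scrapy.Field()


class MoocsReviewItem(scrapy.Item):
    # course_title is repeated here so each review row can be joined back
    # to its course afterwards.
    course_title = scrapy.Field()
    review_body = scrapy.Field()
    course_stage = scrapy.Field()
    user_name = scrapy.Field()
    review_date = scrapy.Field()
    score = scrapy.Field()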
Now what you want is for the different item types to go into different CSV files, which is what the below SO thread answers:
How can scrapy export items to separate csv files per item
I have not tested the below, but the code will look something like this:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher


def item_type(item):
    return type(item).__name__.replace('Item', '').lower()  # TeamItem => team


class MultiCSVItemPipeline(object):
    SaveTypes = ['moocs', 'moocsreview']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        # CSVDir must be defined elsewhere (e.g. a settings constant) and
        # point to the directory where the per-item-type CSV files go.
        self.files = dict([(name, open(CSVDir + name + '.csv', 'w+b')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item
You need to make sure ITEM_PIPELINES is updated to use this MultiCSVItemPipeline class:
ITEM_PIPELINES = {
    'mybot.pipelines.MultiCSVItemPipeline': 300,
}
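One caveat: scrapy.xlib.pydispatch has been removed from newer Scrapy releases, so the dispatcher import above only works on old versions. On recent Scrapy the same wiring is usually done through the from_crawler hook instead; a minimal sketch of that variant, keeping the rest of MultiCSVItemPipeline unchanged:

from scrapy import signals


class MultiCSVItemPipeline(object):
    SaveTypes = ['moocs', 'moocsreview']

    @classmethod
    def from_crawler(cls, crawler):
        # Connect the spider_opened/spider_closed handlers through the
        # crawler's signal manager instead of the removed pydispatch dispatcher.
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signal=signals.spider_closed)
        return pipeline

    # spider_opened, spider_closed and process_item stay exactly as in the
    # snippet above.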
The code is below. Every time, it returns only the first loop iteration; the last 9 iterations disappear. What should I do to get all of them?
I have tried adding an "m = []" and "m.append(l)", but got the error "ERROR: Spider must return Request, BaseItem, dict or None, got 'ItemLoader'".
The link is http://ajax.lianjia.com/ajax/housesell/area/district?ids=23008619&limit_offset=0&limit_count=100&sort=&&city_id=110000
def parse(self, response):
    jsonresponse = json.loads(response.body_as_unicode())
    for i in range(0, len(jsonresponse['data']['list'])):
        l = ItemLoader(item=ItjuziItem(), response=response)
        house_code = jsonresponse['data']['list'][i]['house_code']
        price_total = jsonresponse['data']['list'][i]['price_total']
        ctime = jsonresponse['data']['list'][i]['ctime']
        title = jsonresponse['data']['list'][i]['title']
        frame_hall_num = jsonresponse['data']['list'][i]['frame_hall_num']
        tags = jsonresponse['data']['list'][i]['tags']
        house_area = jsonresponse['data']['list'][i]['house_area']
        community_id = jsonresponse['data']['list'][i]['community_id']
        community_name = jsonresponse['data']['list'][i]['community_name']
        is_two_five = jsonresponse['data']['list'][i]['is_two_five']
        frame_bedroom_num = jsonresponse['data']['list'][i]['frame_bedroom_num']
        l.add_value('house_code', house_code)
        l.add_value('price_total', price_total)
        l.add_value('ctime', ctime)
        l.add_value('title', title)
        l.add_value('frame_hall_num', frame_hall_num)
        l.add_value('tags', tags)
        l.add_value('house_area', house_area)
        l.add_value('community_id', community_id)
        l.add_value('community_name', community_name)
        l.add_value('is_two_five', is_two_five)
        l.add_value('frame_bedroom_num', frame_bedroom_num)
        print l
        return l.load_item()
The error:
ERROR: Spider must return Request, BaseItem, dict or None, got
'ItemLoader'
is slightly misleading, since you can also return a generator! What is happening here is that return exits the loop and the whole function on the first iteration. You can turn the function into a generator to avoid this.
Simply replace return with yield on the last line, from:
return l.load_item()
to:
yield l.load_item()
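The underlying behaviour is plain Python rather than anything Scrapy-specific; a tiny standalone illustration of return vs. yield inside a loop:

def first_only(values):
    for v in values:
        return v  # return exits the function on the first iteration


def all_of_them(values):
    for v in values:
        yield v  # yield hands back one value and keeps looping


print(first_only([1, 2, 3]))          # 1
print(list(all_of_them([1, 2, 3])))   # [1, 2, 3]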
I'm facing some issues with the parse_node method in Scrapy:
class s1(scrapy.spiders.XMLFeedSpider):
    name = "s1"
    handle_httpstatus_list = [400, 401, 403, 404, 408, 410, 500, 502, 503, 504]
    allowed_domains = ["xxx"]
    start_urls = ["xxx"]
    main_url = start_urls[0]
    jobs_list = []
    tracker = SummaryTracker()
    itertag = "miojob"
    counter = 0

    def parse_node(self, response, node):
        if response.status in [400, 401, 403, 404, 408, 410, 500, 502, 503, 504]:
            time.sleep(60)
            yield scrapy.Request(self.main_url, callback=self.parse_node, errback=self.err1, dont_filter=True)
        else:
            # Some code #
            yield scrapy.Request(self.main_url, callback=self.parse_node, errback=self.err1, dont_filter=True)
This is part of a Scrapy bot that recursively scrapes the same page to extract the last ten items. Everything works except for the last scrapy.Request, because it gives me this error:
"parse_node() takes exactly 3 arguments (2 given)"
If I use a simple Request(self.main_url) instead, it works, but then I can't use the errback, because that requires a callback too. I tried to pass additional arguments to parse_node like this:
yield scrapy.Request(self.main_url, callback=self.parse_node(arg1, arg2), errback=self.err1, dont_filter=True)
but it gives me an AssertionError, probably because the arguments are wrong?
Do you have any idea how to solve this, i.e. how to pass the correct arguments to parse_node so that I can also use the errback callable?
Try:
def parse_node(self, response):
    <your code>
I've resolved the issue by reading the source code here:
https://github.com/scrapy/scrapy/blob/master/scrapy/spiders/feed.py
The old Request now becomes:
yield scrapy.Request(self.main_url, callback=self.parse, errback=self.err1, dont_filter=True)
The trick is to use the parse method as the callback instead of parse_node, because parse will pass the Selector(node) on to parse_node.
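To make the mechanism concrete, here is a rough, simplified paraphrase of XMLFeedSpider's control flow from feed.py (not the literal Scrapy source; node_selectors is a stand-in name, not a real API):

class XMLFeedSpiderSketch:

    def parse(self, response):
        # One Selector per <itertag> node, handed off to parse_nodes.
        nodes = self.node_selectors(response)  # stand-in for the real iteration helper
        return self.parse_nodes(response, nodes)

    def parse_nodes(self, response, nodes):
        for selector in nodes:
            # Your parse_node override is called with BOTH the response and the
            # node selector here, which is why callback=self.parse works while
            # callback=self.parse_node (called with the response only) raises
            # "parse_node() takes exactly 3 arguments (2 given)".
            for result in self.parse_node(response, selector):
                yield result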
I've had a bit of help on here and my code pretty much works. The only issue is that, in the process of generating an XML file, it wraps the content in "value" tags when I don't want it to. According to the docs, this is because:
Unless overridden in the serialize_field() method, multi-valued fields are exported by serializing each value inside a <value> element. This is for convenience, as multi-valued fields are very common.
This is my output:
<?xml version="1.0" encoding="UTF-8"?>
<items>
  <item>
    <body>
      <value>Don't forget me this weekend!</value>
    </body>
    <to>
      <value>Tove</value>
    </to>
    <who>
      <value>Jani</value>
    </who>
    <heading>
      <value>Reminder</value>
    </heading>
  </item>
</items>
What I send to the XML exporter seems to be this, so I don't know why it thinks the fields are multi-valued:
{'body': [u"Don't forget me this weekend!"],
'heading': [u'Reminder'],
'to': [u'Tove'],
'who': [u'Jani']}
pipeline.py
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter


class XmlExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
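For completeness, this pipeline only runs if it is enabled in settings.py. A hedged example of that setting follows; the dotted path 'crawler.pipelines.XmlExportPipeline' is an assumption based on the "from crawler.items import CrawlerItem" import in the spider below, so adjust it to your actual project layout:

# settings.py -- module path assumed, adjust to your project
ITEM_PIPELINES = {
    'crawler.pipelines.XmlExportPipeline': 300,
}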
spider.py
from scrapy.contrib.spiders import XMLFeedSpider
from crawler.items import CrawlerItem


class SiteSpider(XMLFeedSpider):
    name = 'site'
    allowed_domains = ['www.w3schools.com']
    start_urls = ['http://www.w3schools.com/xml/note.xml']
    itertag = 'note'

    def parse_node(self, response, selector):
        item = CrawlerItem()
        item['to'] = selector.xpath('//to/text()').extract()
        item['who'] = selector.xpath('//from/text()').extract()
        item['heading'] = selector.xpath('//heading/text()').extract()
        item['body'] = selector.xpath('//body/text()').extract()
        return item
Any help would be really appreciated. I just want the same output without the redundant tags.
The extract() method always returns a list of values, even if there is only a single value as a result, for example [4] or [3, 4, 5]; when nothing matches, you get an empty list.
To avoid this, if you know there is only one value, you can select it like:
item['to'] = selector.xpath('//to/text()').extract()[0]
Note:
Be aware that this can result in an exception if extract() returns an empty list and you try to index it. In such uncertain cases, this is a good trick to use:
item['to'] = (selector.xpath('...').extract() or [''])[0]
Or you could write your custom function to get the first element:
def extract_first(selector, default=None):
    val = selector.extract()
    return val[0] if val else default
This way you can have a default value in case your desired value is not found:
item['to'] = extract_first(selector.xpath(...))               # first or None
item['to'] = extract_first(selector.xpath(...), 'not-found')  # first or 'not-found'
The above answer is correct about why this happens, but I'd like to add that there is now out-of-the-box support for this, so there is no need to write a helper method:
item['to'] = selector.xpath('//to/text()').extract_first()
and
item['to'] = selector.xpath('//to/text()').extract_first(default='spam')
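A further note (my addition; check against the Scrapy/parsel version you are running): newer releases also expose .get() and .getall() on selectors as the recommended spellings of extract_first() and extract():

# Equivalent to extract_first(), extract_first(default=...) and extract()
item['to'] = selector.xpath('//to/text()').get()
item['to'] = selector.xpath('//to/text()').get(default='spam')
item['all_to'] = selector.xpath('//to/text()').getall()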
I'm trying to deploy a crawler with four spiders. One of the spiders uses XMLFeedSpider and runs fine from the shell and from scrapyd, but the others use BaseSpider and all give this error when run in scrapyd, although they run fine from the shell:
TypeError: __init__() got an unexpected keyword argument '_job'
From what I've read this points to a problem with the __init__ function in my spiders, but I cannot seem to solve the problem. I don't need an __init__ function, and if I remove it completely I still get the error!
My spider looks like this:
from scrapy import log
from scrapy.spider import BaseSpider
from scrapy.selector import XmlXPathSelector
from betfeeds_master.items import Odds

# Parameters
MYGLOBAL = 39


class homeSpider(BaseSpider):
    name = "home"
    #con = None

    allowed_domains = ["www.myhome.com"]
    start_urls = [
        "http://www.myhome.com/oddxml.aspx?lang=en&subscriber=mysubscriber",
    ]

    def parse(self, response):
        items = []
        traceCompetition = ""

        xxs = XmlXPathSelector(response)
        oddsobjects = xxs.select("//OO[OddsType='3W' and Sport='Football']")
        for oddsobject in oddsobjects:
            item = Odds()
            item['competition'] = ''.join(oddsobject.select('Tournament/text()').extract())
            if traceCompetition != item['competition']:
                log.msg('Processing %s' % (item['competition']))  # print item['competition']
                traceCompetition = item['competition']
            item['matchDate'] = ''.join(oddsobject.select('Date/text()').extract())
            item['homeTeam'] = ''.join(oddsobject.select('OddsData/HomeTeam/text()').extract())
            item['awayTeam'] = ''.join(oddsobject.select('OddsData/AwayTeam/text()').extract())
            item['lastUpdated'] = ''
            item['bookie'] = MYGLOBAL
            item['home'] = ''.join(oddsobject.select('OddsData/HomeOdds/text()').extract())
            item['draw'] = ''.join(oddsobject.select('OddsData/DrawOdds/text()').extract())
            item['away'] = ''.join(oddsobject.select('OddsData/AwayOdds/text()').extract())
            items.append(item)

        return items
I can put an __init__ function into the spider, but I get exactly the same error:
def __init__(self, *args, **kwargs):
    super(homeSpider, self).__init__(*args, **kwargs)
    pass
Why is this happening and how do I solve it?
The good answer was given by alecx:
My __init__ function was:
def __init__(self, domain_name):
In order to work inside an egg for scrapyd, it should be:
def __init__(self, domain_name, **kwargs):
given that you pass domain_name as a mandatory argument.
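Put together, a minimal sketch of a spider __init__ that survives scrapyd's extra "_job" keyword; domain_name here stands for whatever required argument your own spider takes, so adjust as needed:

from scrapy.spider import BaseSpider


class homeSpider(BaseSpider):
    name = "home"

    def __init__(self, domain_name, *args, **kwargs):
        # scrapyd passes extra keyword arguments such as _job; accepting
        # **kwargs and forwarding them to the parent keeps the spider working
        # both from the shell and under scrapyd.
        super(homeSpider, self).__init__(*args, **kwargs)
        self.domain_name = domain_name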