I'm scraping reviews from MOOC sites like this one.
From there I'm getting all the course details (5 items) and another 6 items from each review itself.
This is the code I have for the course details:
def parse_reviews(self, response):
    l = ItemLoader(item=MoocsItem(), response=response)
    l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
    l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
    l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
    l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
    l.add_value('course_link', response.url)
    return l.load_item()
Now I want to include the review details, another 5 items for each review.
Since the course data is common to all the reviews, I want to store it in a different file and use the course name/id to relate the data afterward.
This is the code I have for the review items:
for review in response.xpath('//*[@class="review-body"]'):
    review_body = review.xpath('.//div[@class="review-body__content"]//text()').extract()
    course_stage = review.xpath('.//*[@class="review-body-info__course-stage--completed"]//text()').extract()
    user_name = review.xpath('.//*[@class="review-body__username"]//text()').extract()
    review_date = review.xpath('.//*[@itemprop="datePublished"]/@datetime').extract()
    score = review.xpath('.//*[@class="sr-only"]//text()').extract()
I tried a temporary workaround, returning all the items for each case, but it isn't working either:
def parse_reviews(self, response):
    #print response.body
    l = ItemLoader(item=MoocsItem(), response=response)
    #l = MyItemLoader(selector=response)
    l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
    l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
    l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
    l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
    l.add_value('course_link', response.url)
    for review in response.xpath('//*[@class="review-body"]'):
        l.add_xpath('review_body', './/div[@class="review-body__content"]//text()')
        l.add_xpath('course_stage', './/*[@class="review-body-info__course-stage--completed"]//text()')
        l.add_xpath('user_name', './/*[@class="review-body__username"]//text()')
        l.add_xpath('review_date', './/*[@itemprop="datePublished"]/@datetime')
        l.add_xpath('score', './/*[@class="sr-only"]//text()')
        yield l.load_item()
The output file from that script is corrupted: cells are displaced and the field sizes are wrong.
EDIT:
I want two output files:
The first one containing:
course_title,course_description,course_instructors,course_key_concepts,course_link
And the second one with:
course_title,review_body,course_stage,user_name,review_date,score
The issue is that you are mixing everything into a single item, which is not the right way to do it. You should create two item classes: MoocsItem and MoocsReviewItem.
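For reference, a minimal sketch of what those two item classes might look like in items.py (field names are taken from the question; this is an assumption about your project layout, not tested code):
# items.py -- a minimal sketch; adjust field names/processors to your project
import scrapy

class MoocsItem(scrapy.Item):
    course_title = scrapy.Field()
    course_description = scrapy.Field()
    course_instructors = scrapy.Field()
    course_key_concepts = scrapy.Field()
    course_link = scrapy.Field()

class MoocsReviewItem(scrapy.Item):
    course_title = scrapy.Field()   # used later to relate reviews back to courses
    review_body = scrapy.Field()
    course_stage = scrapy.Field()
    user_name = scrapy.Field()
    review_date = scrapy.Field()
    score = scrapy.Field()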
Then update the code like below:
def parse_reviews(self, response):
    #print response.body
    l = ItemLoader(item=MoocsItem(), response=response)
    l.add_xpath('course_title', '//*[@class="course-header-ng__main-info__name__title"]//text()')
    l.add_xpath('course_description', '//*[@class="course-info__description"]//p/text()')
    l.add_xpath('course_instructors', '//*[@class="course-info__instructors__names"]//text()')
    l.add_xpath('course_key_concepts', '//*[@class="key-concepts__labels"]//text()')
    l.add_value('course_link', response.url)
    item = l.load_item()

    for review in response.xpath('//*[@class="review-body"]'):
        r = ItemLoader(item=MoocsReviewItem(), response=response, selector=review)
        r.add_value('course_title', item['course_title'])
        r.add_xpath('review_body', './/div[@class="review-body__content"]//text()')
        r.add_xpath('course_stage', './/*[@class="review-body-info__course-stage--completed"]//text()')
        r.add_xpath('user_name', './/*[@class="review-body__username"]//text()')
        r.add_xpath('review_date', './/*[@itemprop="datePublished"]/@datetime')
        r.add_xpath('score', './/*[@class="sr-only"]//text()')
        yield r.load_item()

    yield item
Now you want each item type to go into a different CSV file, which is what the SO thread below answers:
How can scrapy export items to separate csv files per item
I have not tested the below, but the code will look something like this:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher  # in newer Scrapy versions: from pydispatch import dispatcher

def item_type(item):
    return type(item).__name__.replace('Item', '').lower()  # TeamItem => team

class MultiCSVItemPipeline(object):
    SaveTypes = ['moocs', 'moocsreview']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        # CSVDir is assumed to be a directory path defined elsewhere (e.g. 'output/')
        self.files = dict([(name, open(CSVDir + name + '.csv', 'w+b')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item
You need to make sure ITEM_PIPELINES is updated to use this MultiCSVItemPipeline class:
ITEM_PIPELINES = {
    'mybot.pipelines.MultiCSVItemPipeline': 300,
}
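Once the two CSV files exist, relating them afterwards (as mentioned in the question) is just a join on course_title. A minimal sketch with pandas, assuming the pipeline above wrote moocs.csv and moocsreview.csv (the file names and paths are assumptions):
import pandas as pd

# hypothetical file names produced by MultiCSVItemPipeline above
courses = pd.read_csv('moocs.csv')
reviews = pd.read_csv('moocsreview.csv')

# one row per review, with the course columns repeated alongside it
merged = reviews.merge(courses, on='course_title', how='left')
print(merged.head())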
Related
I have a Scrapy spider that scrapes details from a website. It works well for fixed Item fields. It also extracts dynamic fields from the website, but it does not add all of the extracted dynamic fields to the output CSV file.
To export records to CSV, I'm using CsvItemExporter.
Below is the Item class for the dynamic fields:
class MortgageInfoItem(scrapy.Item):
    # serialize_date / serialize_text are helper functions defined elsewhere in the project
    def __setitem__(self, key, value):
        if key not in self.fields:
            if "date" in key:
                self.fields[key] = scrapy.Field(serializer=serialize_date)
            else:
                self.fields[key] = scrapy.Field(serializer=serialize_text)
        self._values[key] = value
The fields may vary for each record. For example, the first record has owner1 and owner2, the next record has owner1, owner2 and owner3, and so on.
So in the end I need a CSV file that has columns for all the owner fields (e.g. owner1, owner2, owner3, ...).
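To illustrate how that dynamic item behaves, here is a rough usage sketch (not part of the original code; the field names are illustrative, and serialize_date/serialize_text are assumed to be helpers defined in the project):
# a rough usage sketch -- field names here are made up
item = MortgageInfoItem()
item['owner1'] = 'Alice'          # 'owner1' is added to item.fields on the fly
item['owner3'] = 'Carol'          # a later record may introduce new field names
item['sale_date'] = '2019-05-01'  # gets the date serializer because the key contains "date"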
Below is the pipeline class with the CSV exporter:
class MultiCSVItemPipeline(object):
    CSVDir = 'output' + settings.DIRECTORY
    file_name = "MortGage_info"
    max_columns = 0

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.file = open(self.CSVDir + self.file_name + '.csv', 'w+b')
        self.exporters = CsvItemExporter(self.file)
        self.exporters.start_exporting()

    def spider_closed(self, spider):
        self.exporters.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporters.export_item(item)
        return item
Please help me export all the fields to the CSV file using CsvItemExporter. Thanks in advance.
I'm trying to build a function (clean_keyboard) to use in an extended ItemLoader class.
It should filter and clean data only when the extended Item class field 'category' is 'Notebook'.
I have tested it without the filter for 'Notebooks' (if ProductItem['category'] == 'Notebook':) and the processor works fine without it. But after inserting that piece of code for the filtering I get a TypeError. See the code below.
### processor method for cleaning data with the ItemLoader; Item and ItemLoader classes are extended
def clean_keyboard(pattern):
    keyboard_dict = {'deutsch': 'DE', 'US-QWERTY': 'US', '': 'DE'}
    if ProductItem['category'] == 'Notebook':  # <-- TypeError when adding the category filter; without it, it works fine
        if pattern in keyboard_dict:
            return keyboard_dict[pattern]
        else:
            return pattern

class ProductItem(scrapy.Item):
    category = scrapy.Field()
    keyboard = scrapy.Field()

class SpiderItemLoader(ItemLoader):
    default_item_class = ProductItem
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()
    keyboard_out = MapCompose(clean_keyboard)

### Parse method in the spider to get the data, using the extended SpiderItemLoader class
def parse_item(self, response):
    l = SpiderItemLoader(response=response)
    l.add_xpath('keyboard', '//*[@class="short-description"]/p/strong[text()="keyboard"]/following-sibling::text()')
    l.add_xpath('category', '//*[@class="short-description"]/p/strong[text()="category"]/following-sibling::text()')
    return l.load_item()
As Daniel commented, the failing line makes no sense. You want to inspect the 'category' property of the item being processed, but your clean_keyboard function has no access to it: ProductItem is a class shared by all items, not a specific item instance.
Item loader processors have no access to items, only to the values of a specific field.
I recommend that you use an item pipeline instead of an item loader processor to implement the logic of your clean_keyboard function.
Indeed, thank you both for helping me understand the class-access issue (ItemLoader vs. item pipeline).
Since I do have access to the item in an item pipeline, I was able to solve the filtering and access the other item field by using an item pipeline. See my tested code with the solution below.
# Configure item pipelines in settings.py
ITEM_PIPELINES = {
    'tutorial.pipelines.DataCleaningPipeline': 300,
}

# Pipeline in pipelines.py
class DataCleaningPipeline(object):
    def process_item(self, item, spider):
        keyboard_dict = {'deutsch': 'DE', 'US-QWERTY': 'US', '': 'DE', 'QWERTZ': 'DE'}
        dict_key = item.get('keyboard')
        category = item.get('category')
        if 'Notebook' in category and dict_key in keyboard_dict:
            item['keyboard'] = keyboard_dict[dict_key]
            return item
        else:
            return item
I have a spider which exports data to different CSV files (named after the item class definitions used by the spider). However, I also wanted to keep the fields in a specific order as they were processed and exported into their different CSV files.
For example, this is my items.py:
import scrapy

class first_class_def_Item(scrapy.Item):
    f1 = scrapy.Field()  # f1: an arbitrary id used by both item class definitions
    f2 = scrapy.Field()
    f3 = scrapy.Field()

class second_class_def_Item(scrapy.Item):
    f1 = scrapy.Field()
    f4 = scrapy.Field()
    f5 = scrapy.Field()
    f6 = scrapy.Field()
This is my pipelines.py:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from pydispatch import dispatcher

def item_type(item):
    # The CSV file names are used (imported) from the scrapy spider.
    # For this example, I just want to keep "first_class_def.csv" without
    # the "_Item", as in "first_class_def_Item" defined in the main scrapy spider.
    return type(item).__name__.replace('_Item', '')

class SomeSitePipeline(object):
    # For simplicity, I'm using the same class def names as found in the
    # main scrapy spider and as defined in items.py
    SaveTypes = ['first_class_def', 'second_class_def']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open("/somefolder/" + name + '.csv', 'wb')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        typesItem = item_type(item)
        if typesItem in set(self.SaveTypes):
            self.exporters[typesItem].export_item(item)
        return item
And this is my spider.py:
import os
import scrapy
from itertools import zip_longest
from somesite.items import first_class_def_Item, second_class_def_Item
from csv import DictReader

path = os.path.join(os.path.expanduser('~'), 'user', 'somefolder', 'IDs.csv')

class SomeSiteSpider(scrapy.Spider):
    name = 'somesite'
    allowed_domains = ['somesite.com']
    start_urls = ['https://somesite.com/login.aspx']

    def parse(self, response):
        return scrapy.FormRequest.from_response(response,
                                                formdata={'txtLogin$txtInput': 'User',
                                                          'txtPassword$txtInput': 'pass',
                                                          'btnLogin.x': '53',
                                                          'btnLogin.y': '33'},
                                                callback=self.Tables)

    def Tables(self, response):
        with open(path) as rows:
            for row in DictReader(rows):
                id = row["id"]
                # 'some_form_control' is a placeholder value kept from the original (anonymized) code
                yield scrapy.Request("https://somesite.com/page1.aspx",
                                     meta={'id': id,
                                           'some_form_control': some_form_control},
                                     dont_filter=True,
                                     callback=self.first_class_def)
                yield scrapy.Request("https://somesite.com/page2.aspx",
                                     meta={'id': id,
                                           'some_form_control': some_form_control},
                                     dont_filter=True,
                                     callback=self.second_class_def)

    def first_class_def(self, response):
        return scrapy.FormRequest.from_response(response,
                                                formdata={'id': response.meta['id'],
                                                          'form_control': response.meta['some_form_control'],
                                                          'SearchControl$btnCreateReport': 'Create Report'},
                                                meta={'id': response.meta['id']},
                                                callback=self.scrap_page_1)

    def scrap_page_1(self, response):
        items = first_class_def_Item()
        field_1 = response.xpath('//*[@class="formatText"][1]/text()').extract()
        field_2 = response.xpath('//*[@class="formatCurrency"][1]/text()').extract()
        for a, b in zip(field_1, field_2):
            items['f1'] = response.meta['id']
            items['f2'] = a
            items['f3'] = b
            yield items

    def second_class_def(self, response):
        return scrapy.FormRequest.from_response(response,
                                                formdata={'id': response.meta['id'],
                                                          'form_control': response.meta['some_form_control'],
                                                          'form_control_two': 'some_form_control_two',
                                                          'SearchControl$btnCreateReport': 'Create Report'},
                                                meta={'id': response.meta['id']},
                                                callback=self.scrap_page_2)

    def scrap_page_2(self, response):
        items = second_class_def_Item()
        field_1 = response.xpath('//*[@class="formatText"][1]/text()').extract()
        field_2 = response.xpath('//*[@class="formatCurrency"][1]/text()').extract()
        field_3 = response.xpath('//*[@class="formatText"][3]/text()').extract()
        for a, b, c in zip(field_1, field_2, field_3):
            items['f1'] = response.meta['id']
            items['f4'] = a
            items['f5'] = b
            items['f6'] = c
            yield items
As the spider was processing and exporting data, I was looking for a way to keep the fields in the generated CSV files "first_class_def.csv" and "second_class_def.csv" exported in the same order as in items.py:
f1,f2,f3
and
f1,f4,f5,f6
However, whenever I ran the spider, the fields in the CSV files were exported in a random order:
f2,f1,f3 and f5,f1,f4,f6
The solution is posted below!
Unfortunately, due to the way scrapy's Item is implemented, the information about the order of field definitions is not preserved.
If the order matters, the best you can do is define the order you want as a separate class variable, and use that in your pipeline. Passing the fields_to_export argument to CsvItemExporter would probably be simplest.
Here's a basic idea you can play around with:
# items.py
class Item1(scrapy.Item):
    fields_to_export = ['f1', 'f2']
    f1 = scrapy.Field()
    f2 = scrapy.Field()

# pipelines.py
from project.items import Item1

class SomeSitePipeline(object):
    save_types = {'item1': Item1}

    def spider_opened(self, spider):
        # (...)
        self.exporters = dict(
            (name, CsvItemExporter(self.files[name], fields_to_export=item_type.fields_to_export))
            for name, item_type in self.save_types.items()
        )
        # (...)
Also, I just noticed you're using list comprehensions for their side effects, which is a bad idea; you should use a normal loop instead.
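For instance (a trivial, self-contained illustration of that point; the file names are made up):
# instead of running a comprehension only for its side effects:
# [f.close() for f in files.values()]
# use a plain for loop:
files = {'a': open('a.csv', 'wb'), 'b': open('b.csv', 'wb')}
for f in files.values():
    f.close()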
This is the solution to my specific problem: exporting fields organized per the item class definitions in items.py of a Scrapy spider project.
So after tinkering with this problem and implementing @stranac's suggestion of getting rid of the list comprehension, I came up with the following solution, which exports all fields in order into their respective CSV files:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from pydispatch import dispatcher

def item_type(item):
    # just want "first_class_def.csv", not "first_class_def_Item.csv"
    return type(item).__name__.replace('_Item', '')

class SomeSitePipeline(object):
    fileNamesCsv = ['first_class_def', 'second_class_def']

    def __init__(self):
        self.files = {}
        self.exporters = {}
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open("/somefolder/" + name + '.csv', 'wb')) for name in self.fileNamesCsv])
        for name in self.fileNamesCsv:
            self.exporters[name] = CsvItemExporter(self.files[name])
            if name == 'first_class_def':
                self.exporters[name].fields_to_export = ['f1', 'f2', 'f3']
                self.exporters[name].start_exporting()
            if name == 'second_class_def':
                self.exporters[name].fields_to_export = ['f1', 'f4', 'f5', 'f6']
                self.exporters[name].start_exporting()

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        typesItem = item_type(item)
        if typesItem in set(self.fileNamesCsv):
            self.exporters[typesItem].export_item(item)
        return item
Now, everything works as I originally intended to.
I'm having problems with Scrapy pipelines.
EnricherPipeline never starts: I put a debugger on the first line of process_item and it never gets control.
JsonPipeline does start, but the first argument it receives is a generator object (process_item) and not the MatchItem instance it should receive. (When I disable EnricherPipeline, JsonPipeline works as expected.)
class MatchSpider(CrawlSpider):

    def parse(self, response):
        browser = Browser(browser='Chrome')
        browser.get(response.url)
        browser.find_element_by_xpath('//a[contains(text(), "{l}") and @title="{c}"]'.format(l=self.league, c=self.country)).click()
        browser.find_element_by_xpath('//select[@id="seasons"]/option[text()="{s}"]'.format(s=self.season.replace('-', '/'))).click()
        browser.find_element_by_xpath('//a[contains(text(), "Fixture")]').click()
        page_matches = browser.find_elements_by_xpath('//*[contains(@class, "result-1 rc")]')
        matches = []
        matches.extend([m.get_attribute('href') for m in page_matches])
        for m in matches[:1]:
            yield Request(m, callback=self.process_match, dont_filter=True)

    def process_match(self, response):
        match_item = MatchItem()
        match_item['url'] = response.url
        match_item['project'] = self.settings.get('BOT_NAME')
        match_item['spider'] = self.name
        match_item['server'] = socket.gethostname()
        match_item['date'] = datetime.datetime.now()
        return match_item


class EnricherPipeline:
    def process_item(self, item, spider):
        self.match = defaultdict(dict)
        self.match['date'] = item['match']['startTime']
        self.match['referee'] = item['match']['refereeName']
        self.match['stadium'] = item['match']['venueName']
        self.match['exp_mins'] = item['match']['expandedMinutes']
        yield self.match


class JsonPipeline:
    def process_item(self, item, scraper):
        output_dir = 'data/matches/{league}/{season}'.format(league=scraper.league, season=scraper.season)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        file_name = "-".join([str(datetime.strptime(item['date'], '%Y-%m-%dT%H:%M:%S').date()),
                              item['home']['name'], item['away']['name']]) + '.json'
        item_path = os.sep.join((output_dir, file_name))
        with open(item_path, 'w') as f:
            f.write(json.dumps(item))


ITEM_PIPELINES = {
    'scrapers.whoscored.whoscored.pipelines.EnricherPipeline': 300,
    'scrapers.whoscored.whoscored.pipelines.JsonPipeline': 800,
}
OK, so the problem was that EnricherPipeline was yielding instead of returning a result. After that it worked as expected, although I still don't understand why the debugger wasn't working in that first pipeline.
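In other words, process_item in a pipeline must return its result rather than yield it. A minimal sketch of the corrected pipeline, based on the code above (the field names are taken from the question):
from collections import defaultdict

class EnricherPipeline:
    def process_item(self, item, spider):
        # build the enriched record and *return* it -- yielding here turns
        # process_item into a generator, which downstream pipelines receive
        # instead of the item itself
        match = defaultdict(dict)
        match['date'] = item['match']['startTime']
        match['referee'] = item['match']['refereeName']
        match['stadium'] = item['match']['venueName']
        match['exp_mins'] = item['match']['expandedMinutes']
        return match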
I wrote Scrapy code like @FranGoitia's, returning the item as a dict, and it reached the pipeline fine.
The real reason is:
If you yield anything that is not based on a dict, the Scrapy engine will not call the pipeline.
Oddly, it took me three days to find this...
I've had a bit of help on here and my code pretty much works. The only issue is that in the process of generating the XML, it wraps the content in "value" tags when I don't want it to. According to the docs, this is due to the following:
Unless overridden in the :meth:serialize_field method, multi-valued
fields are exported by serializing each value inside a <value>
element. This is for convenience, as multi-valued fields are very
common.
This is my output:
<?xml version="1.0" encoding="UTF-8"?>
<items>
  <item>
    <body>
      <value>Don't forget me this weekend!</value>
    </body>
    <to>
      <value>Tove</value>
    </to>
    <who>
      <value>Jani</value>
    </who>
    <heading>
      <value>Reminder</value>
    </heading>
  </item>
</items>
What I send to the XML exporter seems to be this, so I don't know why it thinks it's multi-valued:
{'body': [u"Don't forget me this weekend!"],
'heading': [u'Reminder'],
'to': [u'Tove'],
'who': [u'Jani']}
pipeline.py
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter  # in newer Scrapy: from scrapy.exporters import XmlItemExporter

class XmlExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
spider.py
from scrapy.contrib.spiders import XMLFeedSpider
from crawler.items import CrawlerItem

class SiteSpider(XMLFeedSpider):
    name = 'site'
    allowed_domains = ['www.w3schools.com']
    start_urls = ['http://www.w3schools.com/xml/note.xml']
    itertag = 'note'

    def parse_node(self, response, selector):
        item = CrawlerItem()
        item['to'] = selector.xpath('//to/text()').extract()
        item['who'] = selector.xpath('//from/text()').extract()
        item['heading'] = selector.xpath('//heading/text()').extract()
        item['body'] = selector.xpath('//body/text()').extract()
        return item
Any help would be really appreciated. I just want the same output without the redundant tags.
The extract() method will always return a list of values, even if there is only a single result, for example: [4] or [3, 4, 5]; when nothing matches, it returns an empty list.
To avoid this, if you know there is only one value, you can select it like:
item['to'] = selector.xpath('//to/text()').extract()[0]
Note:
Be aware that this can raise an IndexError when extract() returns an empty list and you try to index it. In such uncertain cases, this is a good trick to use:
item['to'] = (selector.xpath('...').extract() or [''])[0]
Or you could write your custom function to get the first element:
def extract_first(selector, default=None):
    val = selector.extract()
    return val[0] if val else default
This way you can have a default value in case your desired value is not found:
item['to'] = extract_first(selector.xpath(...)) # First or none
item['to'] = extract_first(selector.xpath(...), 'not-found') # First or 'not-found'
The above answer is correct about why this is happening, but I'd like to add that there is now out-of-the-box support for this, so there is no need to write a helper method.
item['to'] = selector.xpath('//to/text()').extract_first()
and
item['to'] = selector.xpath('//to/text()').extract_first(default='spam')
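As a side note (not part of the original answers): recent Scrapy versions also expose the same idea through the selector methods .get() and .getall(), which behave like extract_first()/extract():
item['to'] = selector.xpath('//to/text()').get()                # first match or None
item['to'] = selector.xpath('//to/text()').get(default='spam')  # first match or 'spam'
item['all_to'] = selector.xpath('//to/text()').getall()         # list of all matches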