I am trying to use Scrapy to export scraped items into a CSV file with each field enclosed in double quotes. Currently, the CSV exports correctly, but when I try to modify the item fields and add double quotes manually, the CSV ends up with each field enclosed in triple double quotes. Here is an example of what I'm trying to do:
Scrapy code
import scrapy
from tutorial.items import StoreItem

class SecilSpider(scrapy.Spider):
    name = "secil"
    allowed_domains = ["secilstore.com"]

    def start_requests(self):
        start_urls = reversed(["http://www.secilstore.com/yeni_liste/Sayfa/{0}".format(page) for page in xrange(1, 2)] + \
                              ["http://www.secilstore.com/yeni_liste/Magaza/Aksesuar_32/Sayfa/{0}".format(page) for page in xrange(1, 2)] + \
                              ["http://www.secilstore.com/yeni_liste/Magaza/%C3%87anta_33/Sayfa/{0}".format(page) for page in xrange(1, 2)])
        return [scrapy.Request(url=start_url) for start_url in start_urls]

    def parse(self, response):
        item = StoreItem()
        for url in response.xpath('//div[@class="image"]/a/@href').extract():
            yield scrapy.Request("http://www.secilstore.com" + url, callback=self.parse)
        baseUrl = response.request.headers.get('Referer', None)
        if baseUrl is not None:
            baseUrl = baseUrl.split('Sayfa')[0]
        color = response.xpath('//a[@class="renk"]/text()').extract()
        for c in color:
            item['url'] = baseUrl
            item['productUrl'] = response.url
            item['imageUrl'] = "http://www.secilstore.com" + response.xpath('//img[@id="productMainImage"]/@src').extract()[0]
            item['color'] = c
            item['price'] = response.xpath('//span[@class="price cufonHover"]/text()').extract()[0] + "TL"
            item['title'] = response.xpath('//h2[@class="cufon"]/text()').extract()
            item['brand'] = response.xpath('//h3[@class="slogan cufonSemi"]/text()').extract()[0]
            size = '|'.join(s.strip() for s in response.xpath('//a[@class="inStock"]/text()').extract())
            item['size'] = size if size else -1
            oldPrice = response.xpath('//div[@class="indirimFiyat"]/text()').extract()
            item['oldPrice'] = oldPrice[0] + "TL" if oldPrice else -1
            yield item
My CSV Item Pipeline
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter

class CSVPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('/home/ali/%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file, False, '"')
        self.exporter.fields_to_export = ['url', 'productUrl', 'title', 'brand', 'imageUrl', 'price', 'oldPrice', 'color', 'size']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
So when I try to modify a field in the spider and add double quotes manually like this (for example, for item['url']):
item['url'] = '"%s"' % baseUrl
the resulting CSV prints out the following:
"""http://www.secilstore.com/yeni_liste/Magaza/%C3%87anta_33""",http://www.secilstore.com/urun/5905b5c6b858458df3f4851d477eec1b/Secil-Kilit-Aksesuarli-Kisa-Sapli-Canta,Kilit Aksesuarlı Kısa Saplı Çanta,Seçil,http://www.secilstore.com/_docs/i400x500/a/a1894cadeb_Kilit-Aksesuarli-Kisa-Sapli-canta.jpg,"69,90TL","159,90TL",Ekru,-1
As you can see, the first field is surrounded by triple double quotes instead of a single pair. Interestingly, the prices are printed in double quotes as well. How can I surround each field with only one pair of double quotes?
Thanks!
I found the solution myself, by modifying the CSV item pipeline:
self.exporter = CsvItemExporter(open(spider.name+".csv", "w"), False,
fields_to_export=self.fields_to_export, quoting=csv.QUOTE_ALL)
This allowed me to generate a CSV file with the fields in double quotes.
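For completeness, here is a minimal sketch of how that quoting option can sit inside the pipeline above; the class name, file path and field list are just placeholders carried over from the earlier code, not a canonical implementation:

import csv

from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter  # scrapy.exporters in newer Scrapy versions


class QuotedCsvPipeline(object):
    # assumed field order, copied from the pipeline above
    fields_to_export = ['url', 'productUrl', 'title', 'brand', 'imageUrl',
                        'price', 'oldPrice', 'color', 'size']

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('%s_items.csv' % spider.name, 'w+b')
        # quoting=csv.QUOTE_ALL is passed straight through to csv.writer,
        # so every field ends up in exactly one pair of double quotes
        self.exporter = CsvItemExporter(self.file, False,
                                        fields_to_export=self.fields_to_export,
                                        quoting=csv.QUOTE_ALL)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item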
Related
I noticed in the Scrapy log that some URLs returned a 200 status but contained no items. This seems to be a stability issue with the site, as re-crawling these URLs one or two more times yields items. I would like to save these URLs in a separate file for re-crawling.
I tried creating a dictionary in the spider class to store these URLs, but there is no easy way to save that dictionary to a file.
Another approach I tried was to create a second item class for the URLs and use an item pipeline, but it still outputs an empty file. I am not experienced enough to write my own pipeline. Here is my code.
import scrapy

class MyItem(scrapy.Item):
    productCode = scrapy.Field()
    productName = scrapy.Field()
    ...

class UrlItem(scrapy.Item):
    eurl = scrapy.Field()
parse
class MySpider(scrapy.Spider):
    custom_settings = {
        'FEEDS': {
            '%(filename)s.csv': {'format': 'csv', 'encoding': 'utf-8'},
        },
        'FEED_EXPORTERS': {'csv': 'scrapy.exporters.CsvItemExporter'},
    }

    def parsePage(self, response):
        products = response.xpath(...)
        if len(products) == 0:
            url = UrlItem()
            url['eurl'] = response.url
            yield url
        else:
            item = MyItem()
            item['...'] = ...
            ...
            yield item
pipeline
import csv

from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

from .items import MyItem, UrlItem


class UrlPipeline:
    def open_spider(self, spider):
        self.file = open('%s.csv' % "noProductsUrls", 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, url, spider):
        if isinstance(url, UrlItem):
            csvWriter = csv.writer(self.file)
            # write the field values of the UrlItem as one CSV row
            csvWriter.writerow(ItemAdapter(url).asdict().values())
        return url


class MyPipeline:
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if isinstance(item, MyItem):
            adapter = ItemAdapter(item)
            if adapter['productCode'] in self.ids_seen:
                raise DropItem(f"Duplicate item found: {item!r}")
            else:
                self.ids_seen.add(adapter['productCode'])
        return item
settings file
ITEM_PIPELINES = {
    'project.pipelines.MyPipeline': 300,
    'project.pipelines.UrlPipeline': 300,
}
The thing is, I already have the exporter setting in the spider class that saves a CSV. In the pipeline I just want to add one more CSV. Do the two conflict? Or is it better to produce both CSV files in the pipeline?
Update: I opted for @marcos's solution below, which is superior.
There is also a way to save the CSV in the spider class, based on this post.
def __init__(self):
    self.outfile = open("urls.csv", "w", newline="")
    self.writer = csv.writer(self.outfile)

def closed(self, reason):
    self.outfile.close()
Then add the following inside def parse:
if len(products) == 0:
    self.writer.writerow([response.url])
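As an aside, on Scrapy 2.6 or newer both CSVs can also be produced without a custom pipeline by giving each feed an item_classes filter. A minimal sketch, assuming the item classes live in project.items:

# inside the spider class (Scrapy 2.6+): one feed per item class
custom_settings = {
    'FEEDS': {
        'products.csv': {
            'format': 'csv',
            'encoding': 'utf-8',
            'item_classes': ['project.items.MyItem'],   # assumed import path
        },
        'noProductsUrls.csv': {
            'format': 'csv',
            'encoding': 'utf-8',
            'item_classes': ['project.items.UrlItem'],  # assumed import path
        },
    },
}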
I would suggest you just yield a retry request whenever no product is found on the page, unless you have a very specific reason to store those URLs.
The code would look like:
class MySpider(scrapy.Spider):
    custom_settings = {
        'FEEDS': {
            '%(filename)s.csv': {'format': 'csv', 'encoding': 'utf-8'},
        },
        'FEED_EXPORTERS': {'csv': 'scrapy.exporters.CsvItemExporter'},
    }

    def parsePage(self, response):
        products = response.xpath(...)
        if not len(products):
            yield self._retry_request(response)
            return
        item = MyItem()
        item['...'] = ...
        ...
        yield item

    def _retry_request(self, response, max_retries=5):
        retries = response.meta.get('retry_time', 0)
        if retries < max_retries:
            return response.request.replace(
                meta={**response.meta, 'retry_time': retries + 1},
                dont_filter=True,
            )
        else:
            self.logger.warning(f'Max retries reached for {response.url}')
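If you are on Scrapy 2.5 or newer, the hand-rolled retry counter can also be replaced with the built-in get_retry_request helper, which honours the RETRY_TIMES setting; a rough sketch (the 'empty page' reason string is just an example):

from scrapy.downloadermiddlewares.retry import get_retry_request

def parsePage(self, response):
    products = response.xpath(...)
    if not products:
        # returns a retried copy of the request, or None once the retry budget is exhausted
        retry_request = get_retry_request(response.request, spider=self, reason='empty page')
        if retry_request:
            yield retry_request
        return
    ...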
I made a scraper for Yellow Pages. There is a categories.txt file that is read by the script, which then generates links according to those categories:
settings = get_project_settings()
categories = settings.get('CATEGORIES')
links = []
for category in categories:
    link = 'https://www.yellowpages.com/search?search_terms=' + category + '&geo_location_terms=NY&page=1'
    links.append(link)
Then this links list is passed to start_urls:
class YpSpider(CrawlSpider):
    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    name = 'yp'
    allowed_domains = ['yellowpages.com']
    start_urls = links
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//a[@class="business-name"]', allow=''), callback='parse_item',
             follow=True),
        Rule(LinkExtractor(restrict_xpaths='//a[@class="next ajax-page"]', allow=''),
             follow=True),
    )
It will save all the data from all the links in a CSV file named parent.csv. This parent.csv file will have a column named keyword, which will help in separating data from different categories and making separate CSV files for each of them. This is implemented in the spider_closed function:
def spider_closed(self, spider):
    with open('parent.csv', 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            with open('{}.csv'.format(row[0]), 'a') as f:
                writer = csv.writer(f)
                writer.writerow(row)
The problem I am facing is getting the category name corresponding to every link in my parse method, so that it can be saved in parent.csv and used to separate the different categories afterwards:
def parse_item(self, response):
    item = YellowItem()
    item['keyword'] =  # here i need the corresponding category for every link
I think you should change the way you generate links. You can, for example, override the start_requests method and pass the category to the request through either its cb_kwargs or meta attribute.
I would also suggest that you change the implementation to get the settings from the crawler running the spider by overriding from_crawler.
Here's how I would do it:
class YpSpider(CrawlSpider):
    def __init__(self, crawler, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.crawler = crawler
        self.categories = crawler.settings.get('CATEGORIES')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def start_requests(self):
        for category in self.categories:
            yield Request(
                f'https://www.yellowpages.com/search?search_terms={category}&geo_location_terms=NY&page=1',
                self.parse,
                cb_kwargs={'category': category}
            )

    def parse(self, response, category):
        # do sth
        ...
        for url in response.xpath('//a[@class="business-name"]/@href').extract():
            yield Request(
                url,
                self.parse_item,
                cb_kwargs={'category': category}
            )
        yield Request(
            response.xpath('//a[@class="next ajax-page"]/@href').extract_first(),
            self.parse,
            cb_kwargs={'category': category}
        )

    def parse_item(self, response, category):
        item = YellowItem()
        item['keyword'] = category
        # do sth else
        ...
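Note that cb_kwargs is only available from Scrapy 1.7 onwards; on older versions the same category can be carried through request.meta instead, e.g. in fragments like:

# pre-1.7 alternative: carry the category in request.meta
yield Request(url, self.parse_item, meta={'category': category})

# and read it back in the callback
def parse_item(self, response):
    category = response.meta['category']
    ...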
I have multiple spiders with different items, and I want to export each item into a different CSV file. I used the code example from How can scrapy export items to separate csv files per item, but there is a problem.
Right now my spider will only write the "page" item. All items are filled in the shell, but the files remain empty. I debugged the pipeline, but I haven't found an error so far.
Here is my spider:
import csv

import scrapy
from BeautifulSoup import BeautifulSoup
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.spiders import Rule

from DataSpiders import CSV_PATH
from ScrapingItems import TrierDeItem
from SuperSpider import SuperSpider

HTML_PATH = 'pages/trier.de/'


class TrierDeSpider(scrapy.Spider, SuperSpider):
    name = 'trierDeSpider'
    allowed_domains = ['trier.de']
    denied_domains = []
    start_urls = [
        'https://www.trier.de/rathaus-buerger-in/trier-in-zahlen/',
        'https://trier.de/startseite/',
        'https://www.trier.de/leben-in-trier/',
        'https://www.trier.de/kultur-freizeit/',
        'https://www.trier.de/wirtschaft-arbeit/',
        'https://www.trier.de/bildung-wissenschaft/',
        'https://www.trier.de/bauen-wohnen/',
        'https://www.trier.de/umwelt-verkehr/',
    ]
    # Set starting point for the spider and start crawling from start_urls
    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse', follow=True),)

    def parse(self, response):
        """
        Parse the page body for links. Follow allowed domains by adding them to the request.
        Parse the current page with the callback parse_page.
        :param response:
        :return:
        """
        for link in LxmlLinkExtractor(allow=self.allowed_domains, deny=self.denied_domains).extract_links(response):
            yield scrapy.Request(response.urljoin(link.url), callback=self.parse_page)

    def parse_page(self, response):
        """
        Parse the current page for information.
        :param response:
        :return:
        """
        trier_de_item = TrierDeItem()
        yield self.parse_general_page_info(response, HTML_PATH)
        # extract the page url
        trier_de_item["url"] = response.url
        # extract the crawling datetime
        trier_de_item["crawling_date_time"] = response.headers['Date']
        # extract page title
        trier_de_item["title"] = response.css('title::text').extract()
        # extract description tags
        trier_de_item["description"] = response.xpath('//meta[@name="description"]/@content').extract()
        trier_de_item["og_description"] = response.xpath('//meta[@name="og:description"]/@content').extract()
        # extract all page headers
        trier_de_item["news_title"] = response.xpath('//div[@class="dachzeile"]/text()').extract()
        # extract topic
        trier_de_item["topic"] = response.xpath('//div[@class="topic"]/text()').extract()
        # extract headlines
        trier_de_item['headlines'] = response.xpath('//h1/text()').extract()
        # check if page contains a table
        table = response.xpath('//table[@class="datentabelle"]').extract()
        if len(table) > 0:
            self.parse_table(response.body, trier_de_item['headlines'][0])
        yield trier_de_item

    @staticmethod
    def parse_table(body_html, title):
        '''
        Parse an HTML page with a table and save it to a csv file
        :param body_html:
        :param title:
        :return:
        '''
        title = title.replace('/', '')
        try:
            # Create filename from title
            filename = title + '.csv'
            soup = BeautifulSoup(body_html)
            soup.prettify('utf-8')
            content = []
            # find all tables in html
            tables = soup.findAll('table')
            for table in tables:
                # find each table row
                for row in table.findAll('tr'):
                    # extract each table header and cell and add its text to the line for this row
                    line = []
                    for header in row.findAll('th'):
                        if ' ' in header.text:
                            line.append('')
                        else:
                            line.append(header.text)
                    for cell in row.findAll('td'):
                        if ' ' in cell.text:
                            line.append('')
                        else:
                            line.append(cell.text)
                    content.append(line)
            # Open a new csv file and write each line to the file
            with open(CSV_PATH + filename, 'wb') as csv_file:
                wr = csv.writer(csv_file)
                for line in content:
                    wr.writerow(line)
        except Exception as e:
            print(e)
            pass
SuperSpider:
import urlparse

from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

from DataSpiders import write_html
from DataSpiders.ScrapingItems import PageItem, BaseItem

ALLOWED_FILE_TYPES = ('.pdf', '.csv', '.xls', '.xlsx')


class SuperSpider:
    def __init__(self):
        pass

    def url_join(self, urls, response):
        '''
        Join URLs with the response URL
        :param urls:
        :param response:
        :return:
        '''
        joined_urls = []
        for url in urls:
            joined_urls.append(response.urljoin(url))
        return joined_urls

    def parse_general_page_info(self, response, HTML_PATH):
        page_item = PageItem()
        page_item["url"] = response.url
        # extract response body
        if 'jsp' in response.url:
            url = response.url.split('.jsp')
            write_html(url[0], response.body, HTML_PATH)
        elif '?' in response.url:
            url = response.url.split('?')
            write_html(url[0], response.body, HTML_PATH)
        else:
            write_html(response.url, response.body, HTML_PATH)
        # Search for files that have any allowed file type
        found_files = []
        domain = response.url.split('/')[2]
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith(ALLOWED_FILE_TYPES):
                link = urlparse.urljoin(domain, link)
                found_files.append(link)
        # extract all referring links
        extractor = LxmlLinkExtractor()
        linklist = []
        for link in extractor.extract_links(response):
            # extract links which contain a file in the url and add those to 'found_files' for downloading
            if '?imgUid' in link.url:
                fullpath = link.url
                path = fullpath.split('.de')[1]
                found_files.append(urlparse.urljoin(domain, path))
            else:
                linklist.append(link.url)
        page_item["links"] = linklist
        # add all files to the page item
        page_item["file_urls"] = self.url_join(found_files, response)
        # extract page title
        page_item["title"] = response.css('title::text').extract()
        # extract all image urls
        relative_img_urls = response.css("img::attr(src)").extract()
        page_item["image_urls"] = self.url_join(relative_img_urls, response)
        return page_item

    def parse_base_page_information(self, response):
        baseItem = BaseItem()
        baseItem["url"] = response.url
        # extract page title
        baseItem["title"] = response.css('title::text').extract()
        baseItem["crawling_date_time"] = response.headers['Date']
        # extract description tags
        baseItem["description"] = response.xpath('//meta[@name="description"]/@content').extract()
        baseItem["og_description"] = response.xpath('//meta[@name="og:description"]/@content').extract()
        baseItem['headlines'] = response.xpath('//h1/text()').extract()
        return baseItem
ScrapingItems:
from scrapy import Item, Field


class PageItem(Item):
    url = Field()
    title = Field()
    image_urls = Field()
    file_urls = Field()
    links = Field()


class BaseItem(Item):
    url = Field()
    title = Field()
    crawling_date_time = Field()
    description = Field()
    og_description = Field()
    headlines = Field()


class TrierDeItem(BaseItem):
    news_title = Field()
    tag = Field()
    topic = Field()
And the Multi CSV Pipeline:
class MultiCSVItemPipeline(object):
    CSVPath = "csv_data/"
    SaveTypes = ['page', 'base', 'trierde', 'triermitgestalten', 'teleport', 'lokalocomment', 'lokalo', 'lokalonews']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open(self.CSVPath + name + '.csv', 'ab')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item


def item_type(item):
    '''
    Returns the scraping item name
    :param item:
    :return:
    '''
    return type(item).__name__.replace('Item', '').lower()
I haven't found a solution yet, but I have tried several things that failed:
yielding a list of items, which doesn't work with Scrapy
yielding only one item and creating two parse methods for page_item and trier_item
deleting all SaveTypes except 'trierde'; the spider still didn't write anything
Based on these attempts, I believe there is some error in the pipeline itself...
I appreciate any help anybody can offer.
Additional Info:
Before changing my pipeline to MultiCSV I was able to save each item to csv.
After I was unable to fix the problem with the Scrapy exporter, I decided to write my own.
Here's the code for everyone who wants to export multiple different items to different CSV files from one or more spiders. It has worked for me so far, but I'm still checking the code for errors. Feel free to reply if you have ideas for improvement.
import csv
import logging
import os
import types

from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher  # dispatcher import path for this (old) Scrapy version


class MultiCSVItemPipeline(object):
    # Subfolder path where the csv files are stored
    CSVPath = "csv_data/"
    # All allowed items
    SaveTypes = ['page', 'base', 'trierde', 'triermitgestalten', 'teleport', 'lokalocomment', 'lokalo', 'lokalonews']
    # List of already checked csv headers
    CheckedHeaders = []

    def __init__(self):
        import sys
        reload(sys)
        sys.setdefaultencoding('utf8')
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        # Check if the csv files exist and create new ones if not
        for file in set(self.SaveTypes):
            f = open(self.CSVPath + file + '.csv', 'a+')
            f.close()

    def spider_closed(self, spider):
        # not needed anymore
        # [e.finish_exporting() for e in self.exporters.values()]
        # [f.close() for f in self.files.values()]
        pass

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            try:
                # Check if the csv file contains a header, but only for types that haven't been checked yet
                if what not in self.CheckedHeaders:
                    self.check_header(what, item)
                self.write_item_to_row(item, what)
            except Exception as e:
                logging.error("########################################################")
                logging.error("Error writing to " + what + ".csv file ")
                logging.error("Error Message: " + e.message)
                logging.error("Error Reason: " + e.reason)
                logging.error("Error Object: " + e.object)
                logging.error("########################################################")
        return item

    def write_item_to_row(self, item, what):
        """
        Write a single item to a row in the csv file
        :param item:
        :param what:
        :return:
        """
        ofile = open(self.CSVPath + what + '.csv', "ab")
        writer = csv.writer(ofile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        item_dict = item.__dict__['_values']
        row = []
        for k in item_dict:
            d = item_dict[k]
            # If the item value is not a list, join the element to a string, remove all delimiters and encode as utf-8
            if not isinstance(d, types.ListType):
                value = ''.join(item_dict[k]).replace('\t', '').replace('\n', '').encode('utf8')
            else:
                value = ','.join(item_dict[k]).replace('\t', '').replace('\n', '').encode('utf8')
            row.append(value)
        writer.writerow(row)
        ofile.close()

    def check_header(self, what, item):
        """
        Check if the file contains header elements and create them if missing
        :param what:
        :param item:
        :return:
        """
        try:
            with open(self.CSVPath + what + '.csv', 'ab+') as csvfile:
                writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
                item_dict = item.__dict__['_values']
                # If the file is empty, create a new csv header
                if os.stat(self.CSVPath + what + '.csv').st_size == 0:
                    self.write_csv_header(item_dict, writer)
                else:
                    # Read the first row and check the header elements
                    read_csv = csv.reader(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
                    first_row = read_csv.next()
                    # if not all headers are set in the csv file, print a warning
                    if not self.check_key_in_csv_header(item_dict, first_row):
                        # TODO: Add missing header to the csv file
                        logging.warning("Wrong headers for file " + what + ".csv")
                self.CheckedHeaders.append(what)
                csvfile.close()
            return True
        except Exception as e:
            logging.error(e.message)
            return False

    @staticmethod
    def write_csv_header(item_dict, writer):
        """
        Write the header of a csv file.
        The header is written from the keys of the scrapy item.
        :param item_dict:
        :param writer:
        :return:
        """
        first_row = []
        for k in item_dict:
            # Join each key to a string, delete delimiters and encode as utf-8
            value = ''.join(k).replace('\t', '').replace('\n', '').encode('utf8')
            first_row.append(value)
        writer.writerow(first_row)

    @staticmethod
    def check_key_in_csv_header(item_dict, row):
        """
        Check, for each item key, whether it is contained in the first line of the csv.
        k (key) stands for each dictionary key of the scrapy item.
        :param item_dict:
        :param row:
        :return:
        """
        for k in item_dict:
            if k not in row:
                return False
        return True
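For reference, a pipeline like this also has to be enabled in settings.py; a minimal sketch, where the DataSpiders.pipelines module path is an assumption about the project layout:

# settings.py (module path is an assumption)
ITEM_PIPELINES = {
    'DataSpiders.pipelines.MultiCSVItemPipeline': 300,
}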
I made the improvement according to the suggestion from alexce below. What I need is like the picture below; however, each row/line should be one review, with date, rating, review text and link.
I need the item processor to process each review of every page.
Currently TakeFirst() only takes the first review of each page, so for 10 pages I only have 10 lines/rows, as in the picture below.
Spider code is below:
import scrapy
from amazon.items import AmazonItem

class AmazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = [
        'http://www.amazon.co.uk/product-reviews/B0042EU3A2/'.format(page) for page in xrange(1, 114)
    ]

    def parse(self, response):
        for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
            item = AmazonItem()
            item['rating'] = sel.xpath('div/div[2]/span[1]/span/@title').extract()
            item['date'] = sel.xpath('div/div[2]/span[2]/nobr/text()').extract()
            item['review'] = sel.xpath('div/div[6]/text()').extract()
            item['link'] = sel.xpath('div/div[7]/div[2]/div/div[1]/span[3]/a/@href').extract()
            yield item
I started from scratch and the following spider should be run with
scrapy crawl amazon -t csv -o Amazon.csv --loglevel=INFO
so that opening the CSV file with a spreadsheet shows, for me, the expected result.
Hope this helps :-)
import scrapy

class AmazonItem(scrapy.Item):
    rating = scrapy.Field()
    date = scrapy.Field()
    review = scrapy.Field()
    link = scrapy.Field()

class AmazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = ['http://www.amazon.co.uk/product-reviews/B0042EU3A2/']

    def parse(self, response):
        for sel in response.xpath('//table[@id="productReviews"]//tr/td/div'):
            item = AmazonItem()
            item['rating'] = sel.xpath('./div/span/span/span/text()').extract()
            item['date'] = sel.xpath('./div/span/nobr/text()').extract()
            item['review'] = sel.xpath('./div[@class="reviewText"]/text()').extract()
            item['link'] = sel.xpath('.//a[contains(.,"Permalink")]/@href').extract()
            yield item

        xpath_Next_Page = './/table[@id="productReviews"]/following::*//span[@class="paging"]/a[contains(.,"Next")]/@href'
        if response.xpath(xpath_Next_Page):
            url_Next_Page = response.xpath(xpath_Next_Page).extract()[0]
            request = scrapy.Request(url_Next_Page, callback=self.parse)
            yield request
If using -t csv (as proposed by Frank in the comments) does not work for you for some reason, you can always use the built-in CsvItemExporter directly in a custom pipeline, e.g.:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter

class AmazonPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('output.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
which you need to add to ITEM_PIPELINES:
ITEM_PIPELINES = {
    'amazon.pipelines.AmazonPipeline': 300
}
Also, I would use an Item Loader with input and output processors to join the review text and replace new lines with spaces. Create an ItemLoader class:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, Join, MapCompose

class AmazonItemLoader(ItemLoader):
    default_output_processor = TakeFirst()

    review_in = MapCompose(lambda x: x.replace("\n", " "))
    review_out = Join()
Then, use it to construct an Item:
def parse(self, response):
    for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
        loader = AmazonItemLoader(item=AmazonItem(), selector=sel)
        loader.add_xpath('rating', './/div/div[2]/span[1]/span/@title')
        loader.add_xpath('date', './/div/div[2]/span[2]/nobr/text()')
        loader.add_xpath('review', './/div/div[6]/text()')
        loader.add_xpath('link', './/div/div[7]/div[2]/div/div[1]/span[3]/a/@href')
        yield loader.load_item()
class AljazeeraSpider(XMLFeedSpider):
    name = "aljazeera"
    allowed_domains = ["aljazeera.com"]
    start_urls = [
        'http://www.aljazeera.com/',
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)  # The xPath selector
        titles = hxs.select('//div[contains(@class,"SkyScrapperBoxes")]/div[contains(@class,"skyscLines")]')
        if not titles:
            MailNotify().send_mail("Aljazeera", "Scraper Report")

        items = []
        for titles in titles:
            item = NewsItem()
            item['title'] = escape(''.join(titles.select('a/text()').extract()))
            item['link'] = "http://www.aljazeera.com" + escape(''.join(titles.select('a/@href').extract()))
            item['description'] = ''
            item = Request(item['link'], meta={'item': item}, callback=self.parse_detail)
            items.append(item)
        return items

    def parse_detail(self, response):
        item = response.meta['item']
        sel = HtmlXPathSelector(response)
        detail = sel.select('//td[@class = "DetailedSummary"]')
        item['details'] = remove_html_tags(escape(''.join(detail.select('p').extract())))
        item['location'] = ''
        published_date = sel.select('//span[@id = "ctl00_cphBody_lblDate"]')
        item['published_date'] = escape(''.join(published_date.select('text()').extract()))
        return item
I am currently working with Scrapy to crawl the website. I have some knowledge of unittest in Python. But how can I write a unit test to check that the link is working and that item['location'] and item['details'] return a value? I have read about Scrapy contracts but cannot understand anything from them. So, how can I write unit tests in this case?
If we are talking specifically about how to test the spiders (not pipelines or loaders), then what we did was provide a "fake response" from a local HTML file. Sample code:
import os

from scrapy.http import Request, TextResponse


def fake_response(file_name=None, url=None):
    """Create a Scrapy fake HTTP response from a HTML file"""
    if not url:
        url = 'http://www.example.com'
    request = Request(url=url)
    if file_name:
        if not file_name[0] == '/':
            responses_dir = os.path.dirname(os.path.realpath(__file__))
            file_path = os.path.join(responses_dir, file_name)
        else:
            file_path = file_name
        file_content = open(file_path, 'r').read()
    else:
        file_content = ''
    response = TextResponse(url=url, request=request, body=file_content,
                            encoding='utf-8')
    return response
Then, in your TestCase class, call the fake_response() function and feed the response to the parse() callback:
from unittest.case import TestCase


class MyTestCase(TestCase):
    def setUp(self):
        self.spider = MySpider()

    def test_parse(self):
        response = fake_response('input.html')
        item = self.spider.parse(response)
        self.assertEqual(item['title'], 'My Title')
        # ...
Aside from that, you should definitely start using Item Loaders with input and output processors - this would help to achieve better modularity and, hence, isolation - the spider would just yield item instances, while data preparation and modification would be encapsulated inside the loader, which you could test separately.
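To illustrate testing a loader in isolation, here is a minimal sketch; AmazonItemLoader and AmazonItem are borrowed from the loader example earlier in this thread (and assumed to be importable), and the HTML fragment is made up:

from unittest import TestCase

from scrapy.selector import Selector


class AmazonItemLoaderTestCase(TestCase):
    def test_review_is_cleaned_and_joined(self):
        # made-up HTML fragment shaped like the review markup the loader expects
        html = '<div><div class="reviewText">Great\nproduct</div></div>'
        loader = AmazonItemLoader(item=AmazonItem(), selector=Selector(text=html))
        loader.add_xpath('review', '//div[@class="reviewText"]/text()')
        item = loader.load_item()
        # the input processor replaced the newline, the output processor joined the parts
        self.assertEqual(item['review'], 'Great product')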