Scrapy pipeline to export CSV file in the right format - Python

I made the improvements suggested by alexce below. What I need is shown in the picture below, except that each row/line should be one review: date, rating, review text and link.
I need the item processors to process every review on every page.
Currently TakeFirst() only takes the first review of each page, so with 10 pages I only get 10 lines/rows, as in the picture below.
Spider code is below:
import scrapy
from amazon.items import AmazonItem

class AmazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = [
        'http://www.amazon.co.uk/product-reviews/B0042EU3A2/'.format(page) for page in xrange(1, 114)
    ]

    def parse(self, response):
        for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
            item = AmazonItem()
            item['rating'] = sel.xpath('div/div[2]/span[1]/span/@title').extract()
            item['date'] = sel.xpath('div/div[2]/span[2]/nobr/text()').extract()
            item['review'] = sel.xpath('div/div[6]/text()').extract()
            item['link'] = sel.xpath('div/div[7]/div[2]/div/div[1]/span[3]/a/@href').extract()
            yield item

I started from scratch. The following spider should be run with

scrapy crawl amazon -t csv -o Amazon.csv --loglevel=INFO

so that opening the CSV file with a spreadsheet shows one review per row, with date, rating, review text and link. Hope this helps :-)
import scrapy

class AmazonItem(scrapy.Item):
    rating = scrapy.Field()
    date = scrapy.Field()
    review = scrapy.Field()
    link = scrapy.Field()

class AmazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = ['http://www.amazon.co.uk/product-reviews/B0042EU3A2/']

    def parse(self, response):
        for sel in response.xpath('//table[@id="productReviews"]//tr/td/div'):
            item = AmazonItem()
            item['rating'] = sel.xpath('./div/span/span/span/text()').extract()
            item['date'] = sel.xpath('./div/span/nobr/text()').extract()
            item['review'] = sel.xpath('./div[@class="reviewText"]/text()').extract()
            item['link'] = sel.xpath('.//a[contains(.,"Permalink")]/@href').extract()
            yield item

        xpath_Next_Page = './/table[@id="productReviews"]/following::*//span[@class="paging"]/a[contains(.,"Next")]/@href'
        if response.xpath(xpath_Next_Page):
            url_Next_Page = response.xpath(xpath_Next_Page).extract()[0]
            request = scrapy.Request(url_Next_Page, callback=self.parse)
            yield request
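One caveat, which is an assumption on my part since Amazon's markup changes often: if the extracted "Next" href turns out to be relative rather than absolute, it should be joined against the current response URL before being requested, for example:

        if response.xpath(xpath_Next_Page):
            # urljoin() copes with both relative and absolute hrefs
            url_Next_Page = response.urljoin(response.xpath(xpath_Next_Page).extract()[0])
            yield scrapy.Request(url_Next_Page, callback=self.parse)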

If using -t csv (as proposed by Frank in the comments) does not work for you for some reason, you can always use the built-in CsvItemExporter directly in a custom pipeline, e.g.:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter  # in newer Scrapy: from scrapy.exporters import CsvItemExporter

class AmazonPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('output.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
which you need to add to ITEM_PIPELINES:
ITEM_PIPELINES = {
    'amazon.pipelines.AmazonPipeline': 300
}
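As a side note, not part of the original answers: newer Scrapy versions can produce the same single CSV per run declaratively through feed exports, with no custom pipeline at all. A minimal sketch, assuming Scrapy 2.1+ for the FEEDS dictionary (older versions use FEED_URI and FEED_FORMAT instead):

# settings.py -- declarative CSV export via feed exports (Scrapy 2.1+)
FEEDS = {
    'output.csv': {'format': 'csv'},
}

# optional: pin the column order of the exported file
FEED_EXPORT_FIELDS = ['rating', 'date', 'review', 'link']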
Also, I would use an Item Loader with input and output processors to join the review text and replace new lines with spaces. Create an ItemLoader class:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, Join, MapCompose

class AmazonItemLoader(ItemLoader):
    default_output_processor = TakeFirst()

    review_in = MapCompose(lambda x: x.replace("\n", " "))
    review_out = Join()
Then, use it to construct an Item:
def parse(self, response):
    for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
        loader = AmazonItemLoader(item=AmazonItem(), selector=sel)

        loader.add_xpath('rating', './/div/div[2]/span[1]/span/@title')
        loader.add_xpath('date', './/div/div[2]/span[2]/nobr/text()')
        loader.add_xpath('review', './/div/div[6]/text()')
        loader.add_xpath('link', './/div/div[7]/div[2]/div/div[1]/span[3]/a/@href')

        yield loader.load_item()
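To make the processor behaviour concrete, here is a small standalone sketch of my own, run outside a crawl with made-up review strings; depending on the Scrapy version, the import path is scrapy.contrib.loader.processor, scrapy.loader.processors, or the separate itemloaders package:

from scrapy.loader.processors import TakeFirst, Join, MapCompose  # path varies by Scrapy version

# hypothetical values, as .extract() might return them: several text nodes containing newlines
raw_review = ["Great battery life.\n", "Arrived quickly.\n"]

# input processor: applied to every extracted value individually
cleaned = MapCompose(lambda x: x.replace("\n", " "))(raw_review)

# output processor: collapses the cleaned values into one string for the CSV cell
joined = Join()(cleaned)

# fields without their own output processor fall back to TakeFirst()
rating = TakeFirst()(["5.0 out of 5 stars", "ignored"])

print(joined)   # "Great battery life.  Arrived quickly. "
print(rating)   # "5.0 out of 5 stars"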

Related

Why does my Scrapy spider only scrape some of my data?

I'm trying to use Scrapy to scrape IMDb data (episode information and cast list) for each episode of Law & Order: SVU. After I run the code below, I export it to CSV via the command line with "scrapy crawl svu -o svu.csv".
The code below successfully pulls episode information, but the CSV does not contain the cast list. How do I fix the code to extract and export both the episode information and the cast list?
My thoughts & attempts:
I believe that the cast list is extracted, because it is visible in the terminal when the spider runs, so it may be an export issue.
If I comment out my first yield statement (episode information), the cast list is successfully exported. This makes me think it isn't just an export issue.
Thanks for the help!
import scrapy

class SvuSpider(scrapy.Spider):
    name = "svu"
    start_urls = [
        'https://www.imdb.com/title/tt0629700/?ref_=ttep_ep1'
    ]

    def parse(self, response):
        # Gather episode information
        yield {
            'season': response.xpath("//div[@class='bp_heading']/text()")[0].extract(),
            'episode': response.xpath("//div[@class='bp_heading']/text()")[1].extract(),
            'episode_name': response.xpath("//h1[@itemprop='name']/text()").extract_first().strip(),
            'date_published': response.xpath("//div[@class='subtext']/a/meta[@itemprop='datePublished']/@content").extract(),
            'rating_value': response.xpath("//span[@itemprop='ratingValue']/text()").extract(),
            'rating_count': response.xpath("//span[@itemprop='ratingCount']/text()").extract()
        }

        # Follow link to full cast list
        for a in response.xpath("//div[@class='see-more']/a"):
            yield response.follow(a, callback=self.parse_cast)

        # Follow link to next episode
        for a in response.xpath("//a[@class='bp_item np_next']"):
            yield response.follow(a, callback=self.parse)

    def parse_cast(self, response):
        # Gather cast list data
        for actor in response.xpath("//table[@class='cast_list']"):
            yield {
                'actor': response.xpath("//td[@itemprop='actor']/a/span[@itemprop='name']/text()").extract(),
                'character': response.xpath("//td[@class='character']/a/text()").extract()
            }
I added changes to your code. In addition, I show you how to use Items and Pipelines.
spiders/svu.py
# -*- coding: utf-8 -*-
import scrapy

from ..items import EpisodeItem, CastItem

class SvuSpider(scrapy.Spider):
    name = "svu"
    start_urls = [
        'https://www.imdb.com/title/tt0629700/?ref_=ttep_ep1'
    ]

    def parse(self, response):
        # Gather episode information
        item = EpisodeItem(
            season=response.xpath("//div[@class='bp_heading']/text()")[0].extract(),
            episode=response.xpath("//div[@class='bp_heading']/text()")[1].extract(),
            episode_name=response.xpath("//h1[@itemprop='name']/text()").extract_first().strip(),
            date_published=response.xpath("//div[@class='subtext']/a/meta[@itemprop='datePublished']/@content").extract(),
            rating_value=response.xpath("//span[@itemprop='ratingValue']/text()").extract(),
            rating_count=response.xpath("//span[@itemprop='ratingCount']/text()").extract()
        )
        yield item

        # Follow link to full cast list
        for a in response.xpath("//div[@class='see-more']/a"):
            yield response.follow(a, callback=self.parse_cast)

        # Follow link to next episode
        for a in response.xpath("//a[@class='bp_item np_next']"):
            yield response.follow(a, callback=self.parse)

    def parse_cast(self, response):
        # Gather cast list data
        for actor in response.xpath("//table[@class='cast_list']"):
            character = response.xpath("//td[@class='character']/a/text()").extract()
            character.extend(response.xpath("//td[@class='character']/text()").extract())
            character = [c.strip().replace('\n ', '') for c in character if c.strip()]
            item = CastItem(
                actor=response.xpath("//td[@itemprop='actor']/a/span[@itemprop='name']/text()").extract(),
                character=character
            )
            yield item
items.py
from scrapy import Item, Field

class EpisodeItem(Item):
    season = Field()
    episode = Field()
    episode_name = Field()
    date_published = Field()
    rating_value = Field()
    rating_count = Field()

class CastItem(Item):
    actor = Field()
    character = Field()
pipelines.py
from scrapy import signals
from scrapy.exporters import CsvItemExporter

from .items import CastItem, EpisodeItem

class IMDBPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        item_names = ['episode', 'cast']
        self.files = {n: open('%s.csv' % n, 'w+b') for n in item_names}
        self.exporters = {n: CsvItemExporter(f) for n, f in self.files.items()}
        for exporter in self.exporters.values():
            exporter.start_exporting()

    def spider_closed(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for file in self.files.values():
            file.close()

    def process_item(self, item, spider):
        if isinstance(item, EpisodeItem):
            self.exporters['episode'].export_item(item)
        if isinstance(item, CastItem):
            self.exporters['cast'].export_item(item)
        return item
Add to settings file:
ITEM_PIPELINES = {
    'PROJECT_NAME.pipelines.IMDBPipeline': 300,
}
Be careful: you need to replace PROJECT_NAME with your own project's name.
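With this pipeline enabled, a plain run such as the one below should be enough; the pipeline itself writes episode.csv and cast.csv, so no -o option is needed (a usage note on my part, not from the original answer):

scrapy crawl svu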

Scrapy: Checking the value in a csv file before adding

I want to check whether an item's title already exists in the CSV file, and add it to the CSV only if it does not. I have searched through almost every answer related to duplicate values; most of them are about DuplicatesPipeline, and the others did not work for me.
This is my custom pipeline (pipelines.py):
import csv

from scrapy.exceptions import DropItem

class CheckCsvPipeline(object):
    def __init__(self):
        csv_path = r"C:\Users\HP\PycharmProjects\ToScrape\book\items.csv"
        self.csvfile = open(csv_path, 'r')
        self.readCsv = csv.reader(self.csvfile, delimiter=',')

    def process_item(self, item, spider):
        for row in self.readCsv:
            if item['title'] in row:
                raise DropItem("This title exists: %s" % item)
            else:
                return item
Here is my spider:
import scrapy

class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        books = response.xpath('//h3/a/@href').extract()
        for book in books:
            absolute_url = response.urljoin(book)
            yield scrapy.Request(absolute_url, callback=self.parse_book)

        # process next page
        next_page_url = response.xpath('//a[text()="next"]/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)

    def parse_book(self, response):
        title = response.css('h1::text').extract_first()
        price = response.xpath('//*[@class="price_color"]/text()').extract_first()
        yield {'title': title, 'price': price}
I run the spider with the following command, but it still adds the existing values.
scrapy crawl books -o items.csv
I suggest you maintain a list of titles in your spider, and then, inside the pipeline, check whether the title already exists in that list; if it does, drop the item.
from scrapy.exceptions import DropItem

class CheckCsvPipeline(object):
    def __init__(self):
        pass

    def process_item(self, item, spider):
        if item['title'] in spider.allTitles:
            raise DropItem("This title exists: %s" % item)
        else:
            return item
In your spider, do this:
class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']
    allTitles = []

    def parse(self, response):
        books = response.xpath('//h3/a/@href').extract()
        for book in books:
            absolute_url = response.urljoin(book)
            yield scrapy.Request(absolute_url, callback=self.parse_book)

        # process next page
        next_page_url = response.xpath('//a[text()="next"]/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)

    def parse_book(self, response):
        title = response.css('h1::text').extract_first()
        self.allTitles.extend([title])
        price = response.xpath('//*[@class="price_color"]/text()').extract_first()
        yield {'title': title, 'price': price}
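One thing worth noting (my own observation, not part of the answer above): parse_book() appends the title to allTitles before yielding the item, so by the time process_item() runs, the list already contains the current item's own title and the check will match it. A variation that sidesteps this, and uses a set for O(1) membership tests, is to do the check in the spider itself; a minimal sketch, with seen_titles as an illustrative name:

import scrapy

class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']
    seen_titles = set()   # illustrative replacement for allTitles

    def parse(self, response):
        for href in response.xpath('//h3/a/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_book)
        next_page_url = response.xpath('//a[text()="next"]/@href').extract_first()
        if next_page_url:
            yield scrapy.Request(response.urljoin(next_page_url))

    def parse_book(self, response):
        title = response.css('h1::text').extract_first()
        if title in self.seen_titles:
            return   # duplicate: skip before any pipeline sees it
        self.seen_titles.add(title)
        price = response.xpath('//*[@class="price_color"]/text()').extract_first()
        yield {'title': title, 'price': price}

This only deduplicates within a single run; to also skip titles already present in an existing items.csv, the set could be pre-filled from that file in __init__ or start_requests.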

Item not reaching pipeline

I am new to Python and Scrapy. I am not getting item data in the pipeline, and nothing is being written to the CSV. The error is:

'DmozSpider' object has no attribute '__getitem__'

Any help will be appreciated.
spider file
import scrapy
import sys
import os

from tutorial.items import TutorialItem
from pprint import pprint

class DmozSpider(scrapy.Spider):
    name = "myspider"
    allowed_domains = ["www.xyz.co.id"]
    start_urls = ["http://www.xyz.co.id/search?q=abc"]

    def parse(self, response):
        var = response.xpath("//a[@class='img']/@href").extract()[0]
        item = TutorialItem()
        item['title'] = var
        yield item
pipeline file
import csv

class TutorialPipeline(object):
    def __init__(self):
        self.csvwriter = csv.writer(open('items.csv', 'wb'))

    def process_item(self, domain, item):
        print item['title']
        self.csvwriter.writerow([item['title']])
        return item
items file
import scrapy

class TutorialItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()
    price = scrapy.Field()
Settings file
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
}
The definition of your pipeline method process_item() is incorrect. The bug is in the stated parameters self, domain, item. The official signature in the documentation is:

process_item(self, item, spider)

Because your arguments are swapped, the parameter named item actually receives the spider object, and indexing it with item['title'] is what raises the "'DmozSpider' object has no attribute '__getitem__'" error. Change the method in your class TutorialPipeline accordingly to:

    def process_item(self, item, spider):
        print item['title']
        self.csvwriter.writerow([item['title']])
        return item
Try item.get('title') instead of item['title']

Limit how much elements scrapy can collect

I am using Scrapy to collect some data. My Scrapy program collects 100 elements in one session. I need to limit it to 50 or any arbitrary number. How can I do that? Any solution is welcome. Thanks in advance.
# -*- coding: utf-8 -*-
import re
import scrapy

class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["raleigh.craigslist.org"]
    start_urls = [
        "http://raleigh.craigslist.org/search/bab"
    ]

    BASE_URL = 'http://raleigh.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.BASE_URL + "reply/ral/bab/" + item_id

            item = DmozItem()
            item["link"] = response.url
            item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
            item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0])
            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
This is what the CloseSpider extension and the CLOSESPIDER_ITEMCOUNT setting were made for:
An integer which specifies a number of items. If the spider scrapes more than that amount of items and those items are passed by the item pipeline, the spider will be closed with the reason closespider_itemcount. If zero (or not set), spiders won’t be closed by number of passed items.
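For example, the value 50 from the question can be set project-wide in settings.py, per spider via custom_settings, or ad hoc on the command line. Note that requests already in flight may still produce a few extra items before the spider finishes closing:

# settings.py
CLOSESPIDER_ITEMCOUNT = 50

# or, without touching any file:
# scrapy crawl dmoz -s CLOSESPIDER_ITEMCOUNT=50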
I tried alecxe's answer, but I had to combine all three limits to make it work, so I'm leaving it here in case someone else runs into the same issue:
class GenericWebsiteSpider(scrapy.Spider):
    """This generic website spider extracts text from websites"""
    name = "generic_website"

    custom_settings = {
        'CLOSESPIDER_PAGECOUNT': 15,
        'CONCURRENT_REQUESTS': 15,
        'CLOSESPIDER_ITEMCOUNT': 15
    }
    ...

Scrapy isn't extracting data

This is my Scrapy code. I want to scrape data from mouthshut.com, where the text I need sits inside strong tags. I am able to run it and the title field comes through, but it is blank. Why isn't it extracting any data?
import scrapy
from scrapy.selector import Selector
from shut.items import ShutItem

class criticspider(scrapy.Spider):
    name = "shut"
    allowed_domains = ["mouthshut.com"]
    start_urls = ["http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930"]

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.select('//li[@class="profile"]')
        items = []
        for site in sites:
            item = ShutItem()
            item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]//a/text()').extract()
            #item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
            #item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
            items.append(item)
        return items
You should use a pipeline to export the data your spider produces! Here is a sample that exports data to JSON files:
pipelines.py
# -*- coding: utf-8 -*-

# python import
from scrapy import signals, log
from scrapy.contrib.exporter import JsonItemExporter
from datetime import datetime
import os

# project import
from items import tgju
from pymongo import MongoClient

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

def get_items(module):
    md = module.__dict__
    return (str(md[c].__name__) for c in md if (isinstance(md[c], type) and md[c].__module__ == module.__name__))

class JsonPipeline(object):
    def __init__(self):
        self.files = dict()
        self.exporter = dict()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        for key in get_items(tgju):
            path = os.path.join('temp', key)
            if not os.path.exists(path):
                os.makedirs(path)
            self.files[key] = open(os.path.join(path,
                                                '%s_%s_%s.json' % (spider.name,
                                                                   key.lower(),
                                                                   datetime.now().strftime('%Y%m%dT%H%M%S'))),
                                   'w+b')
            self.exporter[key] = JsonItemExporter(self.files[key])
            self.exporter[key].start_exporting()

    def spider_closed(self, spider):
        for key in get_items(tgju):
            self.exporter[key].finish_exporting()
            self.files.pop(key).close()

    def process_item(self, item, spider):
        try:
            log.msg('-----------------%s------------------' % item.__class__.__name__)
            self.exporter[item.__class__.__name__].export_item(item)
        except KeyError:
            pass
        return item
Add this to your settings file:

ITEM_PIPELINES = {
    'pipelines.JsonPipeline': 800,
}
And try yielding each item instead of returning a list.
Update:
Also change your spider to this one...
import scrapy
from scrapy.selector import Selector
from shut.items import ShutItem

class criticspider(scrapy.Spider):
    name = "shut"
    allowed_domains = ["mouthshut.com"]
    start_urls = ["http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930"]

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.select('//li[@class="profile"]')
        for site in sites:
            item = ShutItem()
            item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]//a/text()').extract()
            #item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
            #item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
            yield item
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//div[@class="reviewtitle fl"]')
    for site in sites:
        item = ShutItem()
        item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]/a/text()').extract()
        #item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
        #item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
        yield item
This works well:
2015-01-21 19:06:33+0800 [shut] DEBUG: Scraped from <200 http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930>
{'title': [u'Vodafone 3G - Useless in Bangalore',
u'Worst Mobile Operator Ever',
u'Worst 3g connectivity of vodafone in bangalore',
u'Pathetic Network 3G',
u'HOW DO THEY STILL DO BUSINESS WITH SUCH SERVICES!!',
u'Bad customer service',
u'Vodafone Kolkata \u2013 My worst ever experience.',
u'Network connectivity - permanent nemesis',
u'VODAFONE MOBILE OPERATOR',
u'Beware of Vodafone billing plans',
u'Vodafone changed my billing plan without my notice',
u'Pathetic service. They deduct balance unnecessari',
u'Worst service from Vodafone',
u'Forget Vodafone',
u'Vodafone Data Services sucks',
u'Outgoing calls has been barred',
u'Vodafone Sucks',
u'Worst Customer satisfaction I have ever Faced',
u'Untrained Customer Care... Seems like headline de',
u'3rd Party downloads - shameless way to make money!']}
Here is what you should know:
1. yield is much better than building up a list in Scrapy.
2. The li node is not the parent of the strong element, so the original selector found nothing inside each profile.
3. The value of the strong element's style attribute contains extra blanks, so an exact match has to reproduce them character for character.
A less brittle selector is sketched below.
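As a final, untested suggestion of mine (mouthshut.com's markup may well have changed since): matching the exact inline style string is brittle, and the absolute //strong path is also why each scraped item in the output above carries every title on the page at once. A selector kept relative to each review block avoids both problems:

for site in sites:
    item = ShutItem()
    # the leading './/' keeps the match inside the current review block instead of the whole page
    item['title'] = site.xpath('.//strong[contains(@style, "font-weight: 700")]/a/text()').extract()
    yield item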
