I want to check the title of an item in the csv file, and then add to the csv file if it does not exists. I searched almost any responses related to duplicate values. Mostly, they are about DuplicatesPipeline and the others did not work for me.
This is my custom pipeline which is the pipelines.py
class CheckCsvPipeline(object):
def __init__(self):
csv_path = r"C:\Users\HP\PycharmProjects\ToScrape\book\items.csv"
self.csvfile = open(csv_path, 'r')
self.readCsv = csv.reader(self.csvfile, delimiter=',')
def process_item(self, item, spider):
for row in self.readCsv:
if item['title'] in row:
raise DropItem("This title exists: %s" %item)
else:
return item
Here is my spider:
import scrapy
class BooksSpider(scrapy.Spider):
name = 'books'
allowed_domains = ['books.toscrape.com']
start_urls = ['http://books.toscrape.com/']
def parse(self, response):
books = response.xpath('//h3/a/#href').extract()
for book in books:
absolute_url = response.urljoin(book)
yield scrapy.Request(absolute_url, callback=self.parse_book)
# process next page
next_page_url = response.xpath('//a[text()="next"]/#href').extract_first()
absolute_next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(absolute_next_page_url)
def parse_book(self, response):
title = response.css('h1::text').extract_first()
price = response.xpath('//*[#class="price_color"]/text()').extract_first()
yield {'title':title, 'price': price}
I run the spider with the following code, but it still adds the existing values.
scrapy crawl books -o items.csv
I suggest you to maintain a list of titles in your spider, and then inside pipeline, check if title already exists in that lists, then do not yield it.
class CheckCsvPipeline(object):
def __init__(self):
pass
def process_item(self, item, spider):
if item['title'] in spider.allTitles:
raise DropItem("This title exists: %s" % item)
else:
return item
in your spider, do this
class BooksSpider(scrapy.Spider):
name = 'books'
allowed_domains = ['books.toscrape.com']
start_urls = ['http://books.toscrape.com/']
allTitles = []
def parse(self, response):
books = response.xpath('//h3/a/#href').extract()
for book in books:
absolute_url = response.urljoin(book)
yield scrapy.Request(absolute_url, callback=self.parse_book)
# process next page
next_page_url = response.xpath('//a[text()="next"]/#href').extract_first()
absolute_next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(absolute_next_page_url)
def parse_book(self, response):
title = response.css('h1::text').extract_first()
self.allTitles.extend([ title ])
price = response.xpath('//*[#class="price_color"]/text()').extract_first()
yield {'title':title, 'price': price}
Related
I am trying to scrape data of # pages. I have already done a scraper which can scrape data from a single # page. But it suddenly finished the work after scraping of the first page
The whole file with parse function and scrapd function - Scraper.py
# -*- coding: utf-8 -*-
import scrapy
import csv
import os
from scrapy.selector import Selector
from scrapy import Request
class Proddduct(scrapy.Item):
price = scrapy.Field()
description = scrapy.Field()
link = scrapy.Field()
content = scrapy.Field()
class LapadaScraperSpider(scrapy.Spider):
name = 'lapada_scraper2'
allowed_domains = ['http://www.lapada.org']
start_urls = ['https://lapada.org/art-and-antiques/?search=antique']
def parse(self, response):
next_page_url = response.xpath("//ul/li[#class='next']//a/#href").get()
for item in self.scrape(response):
yield item
if next_page_url:
print("Found url: {}".format(next_page_url))
yield scrapy.Request(url=next_page_url, callback=self.parse)
def scrape(self, response):
parser = scrapy.Selector(response)
products = parser.xpath("//div[#class='content']")
for product in products:
item = Proddduct()
XPATH_PRODUCT_DESCRIPTION = ".//strong/text()"
XPATH_PRODUCT_PRICE = ".//div[#class='price']/text()"
XPATH_PRODUCT_LINK = ".//a/#href"
raw_product_description = product.xpath(XPATH_PRODUCT_DESCRIPTION).extract()
raw_product_price = product.xpath(XPATH_PRODUCT_PRICE).extract()
raw_product_link = product.xpath(XPATH_PRODUCT_LINK).extract_first()
item['description'] = raw_product_description
item['price'] = raw_product_price
item['link'] = raw_product_link
yield item
def get_information(self, response):
item = response.meta['item']
item['phonenumber'] = "12345"
yield item
How can I scrape all items in all pages?
Thanks
Change allowed_domains = ['http://www.lapada.org'] to allowed_domains = ['lapada.org']
In the below code parse function executes approximately 32 times (foor loop 32 href's found) in the sameway each sublink should go and scrape the data(32 individual urls parse_next function)
.But parse_next function executes only once(one way)/not called( and output csv file is empty.can anyone help me where i did mistake.
import scrapy
import logging
logger = logging.getLogger('mycustomlogger')
from ScrapyTestProject.items import ScrapytestprojectItem
class QuotesSpider(scrapy.Spider):
name = "nestedurl"
allowed_domains = ['www.grohe.in']
start_urls = [
'https://www.grohe.com/in/7780/bathroom/bathroom-faucets/essence/',
def parse(self, response):
logger.info("Parse function called on %s", response.url)
for divs in response.css('div.viewport div.workspace div.float-box'):
item = {'producturl': divs.css('a::attr(href)').extract_first(),
'imageurl': divs.css('a img::attr(src)').extract_first(),
'description' : divs.css('a div.text::text').extract() + divs.css('a span.nowrap::text').extract()}
next_page = response.urljoin(item['producturl'])
#logger.info("This is an information %s", next_page)
yield scrapy.Request(next_page, callback=self.parse_next, meta={'item': item})
#yield item
def parse_next(self, response):
item = response.meta['item']
logger.info("Parse function called on2 %s", response.url)
item['headline'] = response.css('div#content a.headline::text').extract()
return item
#response.css('div#product-variants a::attr(href)').extract()
Ok so a few things go wrong:
Indentation
start_urls list is not closed with a [
allowed_domains uses the domain extension .in while you want to scrape .com
Working code below:
import scrapy
import logging
class QuotesSpider(scrapy.Spider):
name = "nestedurl"
allowed_domains = ['www.grohe.com']
start_urls = [
'https://www.grohe.com/in/7780/bathroom/bathroom-faucets/essence/'
]
def parse(self, response):
# logger.info("Parse function called on %s", response.url)
for divs in response.css('div.viewport div.workspace div.float-box'):
item = {'producturl': divs.css('a::attr(href)').extract_first(),
'imageurl': divs.css('a img::attr(src)').extract_first(),
'description' : divs.css('a div.text::text').extract() + divs.css('a span.nowrap::text').extract()}
next_page = response.urljoin(item['producturl'])
#logger.info("This is an information %s", next_page)
yield scrapy.Request(next_page, callback=self.parse_next, meta={'item': item})
#yield item
def parse_next(self, response):
item = response.meta['item']
# logger.info("Parse function called on2 %s", response.url)
item['headline'] = response.css('div#content a.headline::text').extract()
return item
#response.css('div#product-variants a::attr(href)').extract()
Note: deleted some logging / item pipelines as these are not defined on my machine.
I try to get this spider work and if request the components to be scraped separately it works, however when try to use Srapy callback function to receive the arguments later i get crashed. The goal is to craw over multiple pages and scrape data while write in output json file in format:
author | album | title | lyrics
the data for each is located on separate web pages, so that is why I'm tying to use Scrapy callback function to get that accomplished.
Also each of the above items are defined under Scrapy items.py as:
import scrapy
class TutorialItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
author = scrapy.Field()
album = scrapy.Field()
title = scrapy.Field()
lyrics = scrapy.Field()
Spider Code start here:
import scrapy
import re
import json
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from tutorial.items import TutorialItem
# urls class
class DomainSpider(scrapy.Spider):
name = "domainspider"
allowed_domains = ['www.domain.com']
start_urls = [
'http://www.domain.com',
]
rules = (
Rule(LinkExtractor(allow='www\.domain\.com/[A-Z][a-zA-Z_/]+$'),
'parse', follow=True,
),
)
# Parsing start here
# crawling and scraping the links from menu list
def parse(self, response):
links = response.xpath('//html/body/nav[1]/div/ul/li/div/a/#href')
for link in links:
next_page_link = link.extract()
if next_page_link:
next_page = response.urljoin(next_page_link)
yield scrapy.Request(next_page, callback=self.parse_artist_page)
# crawling and scraping artist names and links
def parse_artist_page(self, response):
artist_links = response.xpath('//*/div[contains(#class, "artist-col")]/a/#href')
author = response.xpath('//*/div[contains(#class, "artist-col")]/a/text()').extract()
item = TutorialItem(author=author)
for link in artist_links:
next_page_link = link.extract()
if next_page_link:
next_page = response.urljoin(next_page_link)
yield scrapy.Request(next_page, callback=self.parse_album_page)
request.meta['author'] = item
yield item
return
# crawling and scraping album names and links
def parse_album_page(self, response):
album_links = response.xpath('//*/div[contains(#id, "listAlbum")]/a/#href')
album = response.xpath('//*/div[contains(#class, "album")]/b/text()').extract()
item = TutorialItem(album=album)
for link in album_links:
next_page_link = link.extract()
if next_page_link:
next_page = response.urljoin(next_page_link)
yield scrapy.Request(next_page, callback=self.parse_lyrics_page)
request.meta['album'] = item
yield item
return
# crawling and scraping titles and lyrics
def parse_lyrics_page(self, response):
title = response.xpath('//html/body/div[3]/div/div[2]/b/text()').extract()
lyrics = map(unicode.strip, response.xpath('//html/body/div[3]/div/div[2]/div[6]/text()').extract())
item = response.meta['author', 'album']
item = TutorialItem(author=author, album=album, title=title, lyrics=lyrics)
yield item
The code crash when get to call back function:
request.meta['author'] = item
yield item
return
Can anyone help?
I did found where was the problem, the way callback function was set by me, now works:
# crawling and scraping artist names and links
def parse_artist_page(self, response):
artist_links = response.xpath('//*/div[contains(#class, "artist-col")]/a/#href')
author = response.xpath('//*/div[contains(#class, "artist-col")]/a/text()').extract()
for link in artist_links:
next_page_link = link.extract()
if next_page_link:
next_page = response.urljoin(next_page_link)
request = scrapy.Request(next_page, callback=self.parse_album_page)
request.meta['author'] = author
return request
# crawling and scraping album names and links
def parse_album_page(self, response):
author = response.meta.get('author')
album_links = response.xpath('//*/div[contains(#id, "listAlbum")]/a/#href')
album = response.xpath('//*/div[contains(#class, "album")]/b/text()').extract()
for link in album_links:
next_page_link = link.extract()
if next_page_link:
next_page = response.urljoin(next_page_link)
request = scrapy.Request(next_page, callback=self.parse_lyrics_page)
request.meta['author'] = author
request.meta['album'] = album
return request
# crawling and scraping song titles and lyrics
def parse_lyrics_page(self, response):
author = response.meta.get('author')
album = response.meta.get('album')
title = response.xpath('//html/body/div[3]/div/div[2]/b/text()').extract()
lyrics = map(unicode.strip, response.xpath('//html/body/div[3]/div/div[2]/div[6]/text()').extract())
item = TutorialItem(author=author, album=album, title=title, lyrics=lyrics)
yield item
I made the improvement according to the suggestion from alexce below. What I need is like the picture below. However each row/line should be one review: with date, rating, review text and link.
I need to let item processor process each review of every page.
Currently TakeFirst() only takes the first review of the page. So 10 pages, I only have 10 lines/rows as in the picture below.
Spider code is below:
import scrapy
from amazon.items import AmazonItem
class AmazonSpider(scrapy.Spider):
name = "amazon"
allowed_domains = ['amazon.co.uk']
start_urls = [
'http://www.amazon.co.uk/product-reviews/B0042EU3A2/'.format(page) for page in xrange(1,114)
]
def parse(self, response):
for sel in response.xpath('//*[#id="productReviews"]//tr/td[1]'):
item = AmazonItem()
item['rating'] = sel.xpath('div/div[2]/span[1]/span/#title').extract()
item['date'] = sel.xpath('div/div[2]/span[2]/nobr/text()').extract()
item['review'] = sel.xpath('div/div[6]/text()').extract()
item['link'] = sel.xpath('div/div[7]/div[2]/div/div[1]/span[3]/a/#href').extract()
yield item
I started from scratch and the following spider should be run with
scrapy crawl amazon -t csv -o Amazon.csv --loglevel=INFO
so that opening the CSV-File with a spreadsheet shows for me
Hope this helps :-)
import scrapy
class AmazonItem(scrapy.Item):
rating = scrapy.Field()
date = scrapy.Field()
review = scrapy.Field()
link = scrapy.Field()
class AmazonSpider(scrapy.Spider):
name = "amazon"
allowed_domains = ['amazon.co.uk']
start_urls = ['http://www.amazon.co.uk/product-reviews/B0042EU3A2/' ]
def parse(self, response):
for sel in response.xpath('//table[#id="productReviews"]//tr/td/div'):
item = AmazonItem()
item['rating'] = sel.xpath('./div/span/span/span/text()').extract()
item['date'] = sel.xpath('./div/span/nobr/text()').extract()
item['review'] = sel.xpath('./div[#class="reviewText"]/text()').extract()
item['link'] = sel.xpath('.//a[contains(.,"Permalink")]/#href').extract()
yield item
xpath_Next_Page = './/table[#id="productReviews"]/following::*//span[#class="paging"]/a[contains(.,"Next")]/#href'
if response.xpath(xpath_Next_Page):
url_Next_Page = response.xpath(xpath_Next_Page).extract()[0]
request = scrapy.Request(url_Next_Page, callback=self.parse)
yield request
If using -t csv (as proposed by Frank in comments) does not work for you for some reason, you can always use built-in CsvItemExporter directly in the custom pipeline, e.g.:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
class AmazonPipeline(object):
#classmethod
def from_crawler(cls, crawler):
pipeline = cls()
crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
return pipeline
def spider_opened(self, spider):
self.file = open('output.csv', 'w+b')
self.exporter = CsvItemExporter(self.file)
self.exporter.start_exporting()
def spider_closed(self, spider):
self.exporter.finish_exporting()
self.file.close()
def process_item(self, item, spider):
self.exporter.export_item(item)
return item
which you need to add to ITEM_PIPELINES:
ITEM_PIPELINES = {
'amazon.pipelines.AmazonPipeline': 300
}
Also, I would use an Item Loader with input and output processors to join the review text and replace new lines with spaces. Create an ItemLoader class:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, Join, MapCompose
class AmazonItemLoader(ItemLoader):
default_output_processor = TakeFirst()
review_in = MapCompose(lambda x: x.replace("\n", " "))
review_out = Join()
Then, use it to construct an Item:
def parse(self, response):
for sel in response.xpath('//*[#id="productReviews"]//tr/td[1]'):
loader = AmazonItemLoader(item=AmazonItem(), selector=sel)
loader.add_xpath('rating', './/div/div[2]/span[1]/span/#title')
loader.add_xpath('date', './/div/div[2]/span[2]/nobr/text()')
loader.add_xpath('review', './/div/div[6]/text()')
loader.add_xpath('link', './/div/div[7]/div[2]/div/div[1]/span[3]/a/#href')
yield loader.load_item()
Suppose I have a Bookitem, I need to add information to it in both the parse phase and detail phase
def parse(self, response)
data = json.loads(response)
for book in data['result']:
item = BookItem();
item['id'] = book['id']
url = book['url']
yield Request(url, callback=self.detail)
def detail(self,response):
hxs = HtmlXPathSelector(response)
item['price'] = ......
#I want to continue the same book item as from the for loop above
Using the code as is would led to undefined item in the detail phase. How can I pass the item to the detail? detail(self,response,item) doesn't seem to work.
There is an argument named meta for Request:
yield Request(url, callback=self.detail, meta={'item': item})
then in function detail, access it this way:
item = response.meta['item']
See more details here about jobs topic.
iMom0's approach still works, but as of scrapy 1.7, the recommended approach is to pass user-defined information through cb_kwargs and leave meta for middlewares, extensions, etc:
def parse(self, response):
....
yield Request(url, callback=self.detail, cb_kwargs={'item': item})
def detail(self,response, item):
item['price'] = ......
You could also pass the individual key-values into the cb_kwargs argument and then only instantiate the BookItem instance in the final callback (detail in this case):
def parse(self, response)
data = json.loads(response)
for book in data['result']:
yield Request(url,
callback=self.detail,
cb_kwargs=dict(id_=book['id'],
url=book['url']))
def detail(self,response, id_, url):
hxs = HtmlXPathSelector(response)
item = BookItem()
item['id'] = id_
item['url'] = url
item['price'] = ......
You can define variable in init method:
class MySpider(BaseSpider):
...
def __init__(self):
self.item = None
def parse(self, response)
data = json.loads(response)
for book in data['result']:
self.item = BookItem();
self.item['id'] = book['id']
url = book['url']
yield Request(url, callback=self.detail)
def detail(self, response):
hxs = HtmlXPathSelector(response)
self.item['price'] = ....