How to give custom name to images when downloading through scrapy - python

This is my program to download images through the image pipeline. It works well and downloads the images, but the problem is that it renames them with a SHA1 hash, after which I am unable to identify them. Is there a solution so that I can use the model_name as the name of the downloaded images?
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from selenium import webdriver
from urlparse import urljoin
import time

class CompItem(scrapy.Item):
    model_name = scrapy.Field()
    images = scrapy.Field()
    image_urls = scrapy.Field()
    image_name = scrapy.Field()

class criticspider(CrawlSpider):
    name = "buysmaart_images"
    allowed_domains = ["http://buysmaart.com/"]
    start_urls = ["http://buysmaart.com/productdetails/550/Samsung-Galaxy-Note-4", "http://buysmaart.com/productdetails/115/HTC-One-M8-Eye", "http://buysmaart.com/productdetails/506/OPPO-N1", "http://buysmaart.com/productdetails/342/LG-G2-D802T"]

    def __init__(self, *args, **kwargs):
        super(criticspider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Firefox()
        self.browser.implicitly_wait(2)

    def parse_start_url(self, response):
        self.browser.get(response.url)
        time.sleep(8)
        sel = Selector(text=self.browser.page_source)
        item = CompItem()

        photos = sel.xpath('//ul[contains(@id,"productImageUl")]/li')
        print len(photos)
        all_photo_urls = []
        for photo in photos:
            item['image_name'] = sel.xpath('.//h3[contains(@class,"ng-binding")]/text()').extract()[0].encode('ascii', 'ignore')
            #tmp_url = photo.xpath('.//img/@src').extract()[0].encode('ascii','ignore')
            image_url = photo.xpath('.//img/@src').extract()[0]
            all_photo_urls.append(image_url)
        item['image_urls'] = all_photo_urls
        yield item
pipeline
from scrapy.contrib.pipeline.images import DownloadImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class DownloadImagesPipeline(object):
    def process_item(self, item, spider):
        def get_media_requests(self, item, info):
            return [Request(x, meta={'image_names': item["image_name"]})
                    for x in item.get('image_urls', [])]

        def get_images(self, response, request, info):
            for key, image, buf, in super(DownloadImagesPipeline, self).get_images(response, request, info):
                if re.compile('^[0-9,a-f]+.jpg$').match(key):
                    key = self.change_filename(key, response)
                yield key, image, buf

        def change_filename(self, key, response):
            return "%s.jpg" % response.meta['image_name'][0]

        def item_completed(self, results, item, info):
            image_paths = [x['path'] for ok, x in results if ok]
            if not image_paths:
                raise DropItem("Item contains no images")
            item['image_paths'] = image_paths
            return item
settings
BOT_NAME = 'download_images'
SPIDER_MODULES = ['download_images.spiders']
NEWSPIDER_MODULE = 'download_images.spiders'
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
IMAGES_STORE= '/home/john/Desktop/download_images/31_jul'

Scrapy 1.3.3 solution (override the image_downloaded method):

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url, meta={'image_names': item["image_names"]})

    def image_downloaded(self, response, request, info):
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            path = 'full/%s' % response.meta['image_names'][0]  # <-- changed here
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
        return checksum
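To actually use this pipeline it still has to be enabled in the project settings. A minimal sketch, assuming the project package is called myproject (adjust the dotted path to your own project):

# settings.py (sketch) -- 'myproject' is a placeholder for your project package
ITEM_PIPELINES = {'myproject.pipelines.MyImagesPipeline': 1}
IMAGES_STORE = 'images'  # folder for downloaded images, relative to where you run the crawl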

The solution is to override the image_key method of your DownloadImagesPipeline class.
def image_key(self, url):
    return 'image_name.here'

For example, if you want to use the last segment of the URL as the name of the image, you can use

url.split('/')[-1]

Note that this method is deprecated and may be removed in a future release.
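Putting those two pieces together, a minimal sketch of the (deprecated) override for old Scrapy versions that still ship image_key, assuming the pipeline subclasses the stock ImagesPipeline:

from scrapy.contrib.pipeline.images import ImagesPipeline

class DownloadImagesPipeline(ImagesPipeline):
    def image_key(self, url):
        # e.g. http://example.com/imgs/phone-front.jpg -> full/phone-front.jpg
        return 'full/%s' % url.split('/')[-1]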
Alternatively you can set the image_name for your image in your Spider:
item['image_name'] = ['whatever_you_want']
In this case you have to extend your pipeline a bit more to utilize the name of the image you provided:
def get_media_requests(self, item, info):
    return [Request(x, meta={'image_names': item["image_name"]})
            for x in item.get('image_urls', [])]

def get_images(self, response, request, info):
    for key, image, buf, in super(DownloadImagesPipeline, self).get_images(response, request, info):
        if re.compile('^[0-9,a-f]+.jpg$').match(key):
            key = self.change_filename(key, response)
        yield key, image, buf

def change_filename(self, key, response):
    return "%s.jpg" % response.meta['image_name'][0]
And of course your pipeline should extend ImagesPipeline.
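Remember to point ITEM_PIPELINES at that subclass instead of the stock ImagesPipeline. A sketch, assuming the pipeline lives in the question's download_images project (the dotted path is an assumption):

# settings.py (sketch) -- replaces the stock ImagesPipeline entry shown in the question
ITEM_PIPELINES = ['download_images.pipelines.DownloadImagesPipeline']
IMAGES_STORE = '/home/john/Desktop/download_images/31_jul'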

This answer covers custom image names as well as saving the images into custom-named folders.
#spider.py
import scrapy
from ..items import DusharaItem

class DusharaSpider(scrapy.Spider):
    name = 'dushara'
    start_urls = ['https://www.indiaglitz.com/dushara-photos-tamil-actress-3129970-8771']

    def parse(self, response):
        selector = response.xpath('//div[@class="gallmain gallerycontainer-8771"]/div[@class="gallery_detail gal-8771"]')
        for sel in selector:
            item = DusharaItem()
            item['image_urls'] = sel.xpath('./img/@src').extract_first()
            #item['image_urls'] = [sel.xpath('./img/@src').extract_first()]  # for the default scraping process
            item['folder_names_1'] = 'Actress'
            item['folder_names_2'] = 'Tamil'
            item['image_names'] = sel.xpath('./img/@src').extract_first().split('/')[-1]  # it should contain the image extension, like .jpg
            yield item
#items.py
import scrapy

class DusharaItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
    folder_names_1 = scrapy.Field()
    folder_names_2 = scrapy.Field()
    image_names = scrapy.Field()
#pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline

class DusharaPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        url = item['image_urls']
        folder_names_1 = item['folder_names_1']
        folder_names_2 = item['folder_names_2']
        image_names = item['image_names']
        yield scrapy.Request(url=url, meta={'folder_names_1': folder_names_1, 'folder_names_2': folder_names_2, 'image_names': image_names})

    def file_path(self, request, response=None, info=None, *, item=None):
        folder_names_1 = request.meta['folder_names_1']
        folder_names_2 = request.meta['folder_names_2']
        image_names = request.meta['image_names']
        return '/' + folder_names_1 + '/' + folder_names_2 + '/' + image_names
#settings.py
ITEM_PIPELINES = {'dushara.pipelines.DusharaPipeline': 300}
#ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1} # for default scraping process
IMAGES_STORE = r'D:\Scraped'
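For reference, a sketch of where one image would end up with these settings, assuming a hypothetical source file name dushara1.jpg:

# file_path() returns:  /Actress/Tamil/dushara1.jpg           (file name is hypothetical)
# stored on disk at:    D:\Scraped\Actress\Tamil\dushara1.jpg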

Related

Same file downloads

I have a problem with my script: the same file name and PDF keep being downloaded. I have checked the output of my results without the download pipeline and I get unique data; it's only when I use the pipeline that it somehow produces duplicate downloads.
Here's my script:
import scrapy
from environment.items import fcpItem

class fscSpider(scrapy.Spider):
    name = 'fsc'
    start_urls = ['https://fsc.org/en/members']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse
            )

    def parse(self, response):
        content = response.xpath("(//div[@class='content__wrapper field field--name-field-content field--type-entity-reference-revisions field--label-hidden field__items']/div[@class='content__item even field__item'])[position() > 1]")
        loader = fcpItem()
        names_add = response.xpath(".//div[@class = 'field__item resource-item']/article//span[@class='media-caption file-caption']/text()").getall()
        url = response.xpath(".//div[@class = 'field__item resource-item']/article/div[@class='actions']/a//@href").getall()
        pdf = [response.urljoin(x) for x in url if '#' is not x]
        names = [x.split(' ')[0] for x in names_add]
        for nm, pd in zip(names, pdf):
            loader['names'] = nm
            loader['pdfs'] = [pd]
            yield loader
items.py
class fcpItem(scrapy.Item):
    names = Field()
    pdfs = Field()
    results = Field()
pipelines.py
class DownfilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, item=None):
        items = item['names'] + '.pdf'
        return items
settings.py
from pathlib import Path
import os
BASE_DIR = Path(__file__).resolve().parent.parent
FILES_STORE = os.path.join(BASE_DIR, 'fsc')
ROBOTSTXT_OBEY = False
FILES_URLS_FIELD = 'pdfs'
FILES_RESULT_FIELD = 'results'
ITEM_PIPELINES = {
    'environment.pipelines.pipelines.DownfilesPipeline': 150
}
I am using CSS selectors instead of XPath here.
From the Chrome debug panel, a div tag is the root of each PDF list item; under that div are the PDF title and the tag carrying the file download URL.
Because there are a couple of child and sibling levels between the root and those elements, XPath is not a clean approach here. CSS selectors can pick elements directly from the root without spelling out the whole relationship path, and they avoid having to keep a URL array and a title array in sync by index.
The other key points are that the URL path needs decoding, and that file_urls must be set to a list even for a single item.
fsc_spider.py
import scrapy
import urllib.parse
from quotes.items import fcpItem

class fscSpider(scrapy.Spider):
    name = 'fsc'
    start_urls = [
        'https://fsc.org/en/members',
    ]

    def parse(self, response):
        for book in response.css('div.field__item.resource-item'):
            url = urllib.parse.unquote(book.css('div.actions a::attr(href)').get(), encoding='utf-8', errors='replace')
            url_left = url[0:url.rfind('/')] + '/'
            title = book.css('span.media-caption.file-caption::text').get()
            item = fcpItem()
            item['original_file_name'] = title.replace(' ', '_')
            item['file_urls'] = ['https://fsc.org' + url_left + title.replace(' ', '%20')]
            yield item
items.py
import scrapy

class fcpItem(scrapy.Item):
    file_urls = scrapy.Field()
    files = scrapy.Field()
    original_file_name = scrapy.Field()
pipelines.py
import scrapy
from scrapy.pipelines.files import FilesPipeline

class fscPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        file_name: str = request.url.split("/")[-1].replace('%20', '_')
        return file_name
settings.py
BOT_NAME = 'quotes'
FILES_STORE = 'downloads'
SPIDER_MODULES = ['quotes.spiders']
NEWSPIDER_MODULE = 'quotes.spiders'
FEED_EXPORT_ENCODING = 'utf-8'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = { 'quotes.pipelines.fscPipeline': 1}
execution
quotes> scrapy crawl fsc
The problem is that you are overwriting the same scrapy item on every iteration.
What you need to do is create a new item each time your parse method yields. I have tested this and confirmed that it produces the results you desire.
I added an inline comment in my example below on the line that needs to be changed.
For example:
import scrapy
from environment.items import fcpItem

class fscSpider(scrapy.Spider):
    name = 'fsc'
    start_urls = ['https://fsc.org/en/members']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse
            )

    def parse(self, response):
        content = response.xpath("(//div[@class='content__wrapper field field--name-field-content field--type-entity-reference-revisions field--label-hidden field__items']/div[@class='content__item even field__item'])[position() > 1]")
        names_add = response.xpath(".//div[@class = 'field__item resource-item']/article//span[@class='media-caption file-caption']/text()").getall()
        url = response.xpath(".//div[@class = 'field__item resource-item']/article/div[@class='actions']/a//@href").getall()
        pdf = [response.urljoin(x) for x in url if '#' is not x]
        names = [x.split(' ')[0] for x in names_add]
        for nm, pd in zip(names, pdf):
            loader = fcpItem()  # Here you create a new item each iteration
            loader['names'] = nm
            loader['pdfs'] = [pd]
            yield loader

Scrapy Images Downloading

My spider runs without displaying any errors, but the images are not stored in the folder. Here are my Scrapy files:
Spider.py:
import scrapy
import re
import os
import urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from scrapy.pipelines.images import ImagesPipeline
from production.items import ProductionItem, ListResidentialItem

class productionSpider(scrapy.Spider):
    name = "production"
    allowed_domains = ["someurl.com"]
    start_urls = [
        "someurl.com"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            img_url = sel.xpath('//a[@data-tealium-id="detail_nav_showphotos"]/@href').extract()[0]
            yield scrapy.Request(urlparse.urljoin(response.url, img_url), callback=self.parseBasicListingInfo, meta={'item': item})

    def parseBasicListingInfo(item, response):
        item = response.request.meta['item']
        item = ListResidentialItem()
        try:
            image_urls = map(unicode.strip, response.xpath('//a[@itemprop="contentUrl"]/@data-href').extract())
            item['image_urls'] = [x for x in image_urls]
        except IndexError:
            item['image_urls'] = ''
        return item
settings.py:
from scrapy.settings.default_settings import ITEM_PIPELINES
from scrapy.pipelines.images import ImagesPipeline
BOT_NAME = 'production'
SPIDER_MODULES = ['production.spiders']
NEWSPIDER_MODULE = 'production.spiders'
DEFAULT_ITEM_CLASS = 'production.items'
ROBOTSTXT_OBEY = True
DEPTH_PRIORITY = 1
IMAGE_STORE = '/images'
CONCURRENT_REQUESTS = 250
DOWNLOAD_DELAY = 2
ITEM_PIPELINES = {
    'scrapy.contrib.pipeline.images.ImagesPipeline': 300,
}
items.py
# -*- coding: utf-8 -*-
import scrapy

class ProductionItem(scrapy.Item):
    img_url = scrapy.Field()

# ScrapingList Residential & Yield Estate for sale
class ListResidentialItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
    pass
My pipelines file is empty; I'm not sure what I am supposed to add to the pipelines.py file.
Any help is greatly appreciated.
My Working end result:
spider.py:
import scrapy
import re
import urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from scrapy.pipelines.images import ImagesPipeline
from production.items import ProductionItem
from production.items import ImageItem

class productionSpider(scrapy.Spider):
    name = "production"
    allowed_domains = ["url"]
    start_urls = [
        "startingurl.com"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            img_url = sel.xpath('//a[@idd="followclaslink"]/@href').extract()[0]
            yield scrapy.Request(urlparse.urljoin(response.url, img_url), callback=self.parseImages, meta={'item': item})

    def parseImages(self, response):
        for elem in response.xpath("//img"):
            img_url = elem.xpath("@src").extract_first()
            yield ImageItem(image_urls=[img_url])
Settings.py
BOT_NAME = 'production'
SPIDER_MODULES = ['production.spiders']
NEWSPIDER_MODULE = 'production.spiders'
DEFAULT_ITEM_CLASS = 'production.items'
ROBOTSTXT_OBEY = True
IMAGES_STORE = '/Users/home/images'
DOWNLOAD_DELAY = 2
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
# Disable cookies (enabled by default)
items.py
# -*- coding: utf-8 -*-
import scrapy

class ProductionItem(scrapy.Item):
    img_url = scrapy.Field()

# ScrapingList Residential & Yield Estate for sale
class ListResidentialItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()

class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
Since you don't know what to put in the pipelines, I assume you can use the default images pipeline provided by Scrapy, so in the settings.py file you can just declare it like this:
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1
}
Also, your images path is wrong: the leading / means you are pointing at the absolute root of your machine, so either give the absolute path to where you want to save the images or use a path relative to where you run your crawler:
IMAGES_STORE = '/home/user/Documents/scrapy_project/images'
or
IMAGES_STORE = 'images'
Now, in the spider you extract the URL but you don't save it into the item:
item['image_urls'] = sel.xpath('//a[@data-tealium-id="detail_nav_showphotos"]/@href').extract_first()
The field literally has to be image_urls if you're using the default pipeline.
Now, in the items.py file you need to add the following 2 fields (both are required, with these literal names):
image_urls = Field()
images = Field()
That should work
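For reference, a complete items.py along those lines might look like this (the class name is just illustrative):

# items.py (sketch): the two fields the default ImagesPipeline expects, with their literal names
import scrapy

class MyImageItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()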
In my case it was the IMAGES_STORE path that was causing the problem
I did IMAGES_STORE = 'images' and it worked like a charm!
Here is complete code:
Settings:
ITEM_PIPELINES = {
    'mutualartproject.pipelines.MyImagesPipeline': 1,
}
IMAGES_STORE = 'images'
Pipeline:
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item
Just adding my mistake here, which threw me off for several hours. Perhaps it can help someone.
From the Scrapy docs (https://doc.scrapy.org/en/latest/topics/media-pipeline.html#using-the-images-pipeline):
Then, configure the target storage setting to a valid value that will be used for storing the downloaded images. Otherwise the pipeline will remain disabled, even if you include it in the ITEM_PIPELINES setting.
For some reason I used a colon ":" instead of an equals sign "=".
# My mistake:
IMAGES_STORE : '/Users/my_user/images'
# Working code
IMAGES_STORE = '/Users/my_user/images'
This doesn't raise an error; it just causes the pipeline not to load at all, which for me was pretty hard to troubleshoot.
You have to enable SPIDER_MIDDLEWARES and DOWNLOADER_MIDDLEWARES in the settings.py file

Scrapy File Downloads with Custom Names - Subclass Inheritance Issue

I'm building a simple spider to crawl a structured site and download *.txt files. I've managed to get everything working except for a custom FilesPipeline class.
My goal is to download *.txt files into directories according to their url location. I can achieve my goal if I edit the Scrapy class directly (shown below)
files.py -> FilesPipeline::file_path()
...
# return 'full/%s%s' % (media_guid, media_ext)
return url.split('example.com/')[1]
I want to overload the class properly but haven't been successful. I'm not sure what I should be doing differently. The spider runs with no warnings or errors but won't download files.
settings.py
ITEM_PIPELINES = {
    'myspider.pipelines.MySpiderFilesPipeline': 1,
    'myspider.pipelines.MySpiderPipeline': 300,
}
spider.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
from myspider.items import MySpiderItem

class SpideySpider(CrawlSpider):
    name = 'spidey'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']
    rules = (
        Rule(LinkExtractor(allow='', restrict_xpaths='//tr/td/a', deny_extensions='html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        links = response.xpath('//tr/td/a')
        for link in links:
            i = MySpiderItem()
            i['title'] = response.xpath('//title/text()').extract()
            i['href'] = link.xpath('@href').extract()
            i['text'] = link.xpath('text()').extract()
            i["current_url"] = response.url
            referring_url = response.request.headers.get('Referer', None)
            i['referring_url'] = referring_url
            i['depth'] = response.meta['depth']
            if i['text'][0]:
                if re.match('^#.*\.txt$', i['text'][0]) is not None:
                    i['file_urls'] = [response.urljoin(i['href'][0])]
            yield i
pipelines.py
import scrapy
from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.contrib.pipeline.files import FilesPipeline, FSFilesStore
import json
import re

class MySpiderPipeline(object):
    def __init__(self):
        self.file = open('items.json', 'wb')

    def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if re.match('^#.*\.html$', item['text'][0]) is not None:
            valid = False
            raise DropItem("HTML File")
        if re.match('^#.*\.txt$', item['text'][0]) is not None:
            pass
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

class MySpiderFilesPipeline(FilesPipeline):
    _url_breakstring = "example.com/"

    def get_media_requests(self, item, info):
        return [Request(x) for x in item.get(self.files_urls_field, [])]

    def file_path(self, request, response=None, info=None):
        return url.split(_url_breakstring)[1]
        # media_guid = hashlib.sha1(to_bytes(url)).hexdigest()  # change to request.url after deprecation
        # media_ext = os.path.splitext(url)[1]  # change to request.url after deprecation
        # return 'full/%s%s' % (media_guid, media_ext)
For the pipeline class, add an __init__ method, for example:
class GCSFilePipeline(ImagesPipeline):
    def __init__(self, store_uri, download_func=None, settings=None):
        super().__init__(store_uri, settings=settings, download_func=download_func)
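For completeness, a minimal sketch of the file_path() override the asker seems to be after (not their exact code): inside the subclass the URL has to come from request.url and the class attribute needs the self. prefix, both of which the question's snippet omits.

from scrapy.pipelines.files import FilesPipeline

class MySpiderFilesPipeline(FilesPipeline):
    # mirror the site's directory layout after "example.com/"
    _url_breakstring = "example.com/"

    def file_path(self, request, response=None, info=None):
        # e.g. http://example.com/docs/notes/readme.txt -> docs/notes/readme.txt
        return request.url.split(self._url_breakstring)[1]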

Item not reaching pipeline

I am new to Python and Scrapy. I am not getting item data in the pipeline, and nothing is being written to the CSV. The error is:
'DmozSpider' object has no attribute '__getitem__'
Any help will be appreciated:
spider file
import scrapy
import sys
import os
from tutorial.items import TutorialItem
from pprint import pprint

class DmozSpider(scrapy.Spider):
    name = "myspider"
    allowed_domains = ["www.xyz.co.id"]
    start_urls = ["http://www.xyz.co.id/search?q=abc"]

    def parse(self, response):
        var = response.xpath("//a[@class='img']/@href").extract()[0]
        item = TutorialItem()
        item['title'] = var
        yield item
pipeline file
import csv

class TutorialPipeline(object):
    def __init__(self):
        self.csvwriter = csv.writer(open('items.csv', 'wb'))

    def process_item(self, domain, item):
        print item['title']
        self.csvwriter.writerow([item['title']])
        return item
items file
import scrapy

class TutorialItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()
    price = scrapy.Field()
Settings file
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
}
The definition of your pipeline method process_item() is incorrect. The bug is in the stated parameters self, domain, item. The official signature in the documentation is:
process_item(self, item, spider)
Change the method in your class TutorialPipeline accordingly to:
def process_item(self, item, spider):
    print item['title']
    self.csvwriter.writerow([item['title']])
    return item
Try item.get('title') instead of item['title']

Scrapy pipeline to export csv file in the right format

I made the improvement according to the suggestion from alexce below. What I need is like the picture below; however, each row/line should be one review, with date, rating, review text and link.
I need to let the item processor process each review on every page.
Currently TakeFirst() only takes the first review of the page, so with 10 pages I only have 10 lines/rows, as in the picture below.
Spider code is below:
import scrapy
from amazon.items import AmazonItem

class AmazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = [
        'http://www.amazon.co.uk/product-reviews/B0042EU3A2/'.format(page) for page in xrange(1,114)
    ]

    def parse(self, response):
        for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
            item = AmazonItem()
            item['rating'] = sel.xpath('div/div[2]/span[1]/span/@title').extract()
            item['date'] = sel.xpath('div/div[2]/span[2]/nobr/text()').extract()
            item['review'] = sel.xpath('div/div[6]/text()').extract()
            item['link'] = sel.xpath('div/div[7]/div[2]/div/div[1]/span[3]/a/@href').extract()
            yield item
I started from scratch and the following spider should be run with
scrapy crawl amazon -t csv -o Amazon.csv --loglevel=INFO
so that opening the CSV file in a spreadsheet shows one review per row.
Hope this helps :-)
import scrapy

class AmazonItem(scrapy.Item):
    rating = scrapy.Field()
    date = scrapy.Field()
    review = scrapy.Field()
    link = scrapy.Field()

class AmazonSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = ['http://www.amazon.co.uk/product-reviews/B0042EU3A2/']

    def parse(self, response):
        for sel in response.xpath('//table[@id="productReviews"]//tr/td/div'):
            item = AmazonItem()
            item['rating'] = sel.xpath('./div/span/span/span/text()').extract()
            item['date'] = sel.xpath('./div/span/nobr/text()').extract()
            item['review'] = sel.xpath('./div[@class="reviewText"]/text()').extract()
            item['link'] = sel.xpath('.//a[contains(.,"Permalink")]/@href').extract()
            yield item

        xpath_Next_Page = './/table[@id="productReviews"]/following::*//span[@class="paging"]/a[contains(.,"Next")]/@href'
        if response.xpath(xpath_Next_Page):
            url_Next_Page = response.xpath(xpath_Next_Page).extract()[0]
            request = scrapy.Request(url_Next_Page, callback=self.parse)
            yield request
If using -t csv (as proposed by Frank in comments) does not work for you for some reason, you can always use built-in CsvItemExporter directly in the custom pipeline, e.g.:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter

class AmazonPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('output.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
which you need to add to ITEM_PIPELINES:
ITEM_PIPELINES = {
    'amazon.pipelines.AmazonPipeline': 300
}
Also, I would use an Item Loader with input and output processors to join the review text and replace new lines with spaces. Create an ItemLoader class:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, Join, MapCompose

class AmazonItemLoader(ItemLoader):
    default_output_processor = TakeFirst()

    review_in = MapCompose(lambda x: x.replace("\n", " "))
    review_out = Join()
Then, use it to construct an Item:
def parse(self, response):
    for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
        loader = AmazonItemLoader(item=AmazonItem(), selector=sel)
        loader.add_xpath('rating', './/div/div[2]/span[1]/span/@title')
        loader.add_xpath('date', './/div/div[2]/span[2]/nobr/text()')
        loader.add_xpath('review', './/div/div[6]/text()')
        loader.add_xpath('link', './/div/div[7]/div[2]/div/div[1]/span[3]/a/@href')
        yield loader.load_item()
