I want to extract information from a website, such as prices, and store the values in a dictionary. However, I'm trying to learn Scrapy, so I'd like to know how to achieve this with it.
Here's how it looks with requests and BeautifulSoup:
import numpy as np
import requests
import pandas as pd
from collections import defaultdict
from bs4 import BeautifulSoup
html = ['https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16']
data = defaultdict(list)
for i in range(len(html)):
    r = requests.get(html[i])
    soup = BeautifulSoup(r.content, 'lxml')
    name = soup.select(".s-item__title")
    value = soup.select(".ITALIC")
    for n, v in zip(name, value):
        data["card"].append(n.text.strip())
        data["price"].append(v.text.strip())
Here's what I have tried with Scrapy, but looking at the JSON output I do not get any values, just the links. How do I get output like the code above?
import scrapy
import numpy as np
import pandas as pd
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
html = np.array(['https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16'],
dtype=object)
url = pd.DataFrame(html, columns=['data'])
class StatisticsItem(scrapy.Item):
    statistics_div = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())

class StatisticsSpider(scrapy.Spider):
    name = 'statistics'
    start_urls = url.data.values

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url
            )

    def parse(self, response):
        table = response.xpath("//div[@class='s-item__price']").get()
        loader = ItemLoader(StatisticsItem())
        loader.add_value('values', table)
        loader.add_value('url', response.url)
        yield loader.load_item()

process = CrawlerProcess(
    settings={
        'FEED_URI': 'ebay_data.json',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(StatisticsSpider)
process.start()
I set custom_settings to write to 'cards_info.json' in JSON format.
Inside parse I go through each card on the page (see the XPath), get the card's title and price, and yield them. Scrapy writes them into 'cards_info.json'.
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst

class StatisticsItem(scrapy.Item):
    statistics_div = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())

class StatisticsSpider(scrapy.Spider):
    name = 'statistics'
    start_urls = ['https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16',
'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=2&_sop=16',
'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=3&_sop=16',
'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=4&_sop=16',
'https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=5&_sop=16']
    custom_settings = {
        'FEED_FORMAT': 'json',
        'FEED_URI': 'cards_info.json'
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url
            )

    def parse(self, response):
        all_cards = response.xpath('//div[@class="s-item__wrapper clearfix"]')
        for card in all_cards:
            name = card.xpath('.//h3/text()').get()
            price = card.xpath('.//span[@class="s-item__price"]//text()').get()
            # now do whatever you want: append to a dictionary, yield as an item.
            # example with yield:
            yield {
                'card': name,
                'price': price
            }
Output:
[scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16>
{'card': 'Pokemon 1st Edition Shadowless Base Set 11 Blister Booster Pack Lot - DM To Buy!', 'price': '£93,805.84'}
[scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16>
{'card': 'Pokemon Team Rocket Complete Complete 83/82, German, 1. Edition', 'price': '£102,026.04'}
[scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ebay.co.uk/b/Collectable-Card-Games-Accessories/2536/bn_2316999?LH_PrefLoc=2&mag=1&rt=nc&_pgn=1&_sop=16>
{'card': 'Yugioh E Hero Pit Boss 2013 World Championship Prize Card BGS 9.5 Gem Mint', 'price': '£100,000.00'}
...
...
cards_info.json:
[
{"card": "1999 Pokemon Base Set Booster Box GREEN WING", "price": "\u00a340,000.00"},
{"card": "1996 MEDIA FACTORY POKEMON NO RARITY BASE SET CHARIZARD 006 BECKETT BGS MINT 9 ", "price": "\u00a339,999.99"},
{"card": "Yugioh - BGS8.5 Jump Festa Blue Eyes White Dragon -1999 - Limited - PSA", "price": "\u00a340,000.00"},
{"card": "PSA 8 CHARIZARD 1999 POKEMON 1ST EDITION THICK STAMP SHADOWLESS #4 HOLO NM-MINT", "price": "\u00a337,224.53"},
{"card": "PSA 9 MINT Pok\u00e9mon Play Promo 50000 PTS Gold Star Japanese Pokemon", "price": "\u00a338,261.06"},
...
...
]
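If it helps, the spider above can be run the same way as in the question, via CrawlerProcess; a minimal sketch (no extra settings are needed here, since the feed options live on the spider's custom_settings):

from scrapy.crawler import CrawlerProcess

# run StatisticsSpider in-process; the custom_settings on the spider
# already write the scraped items to cards_info.json
process = CrawlerProcess()
process.crawl(StatisticsSpider)
process.start()

Alternatively, save the spider to a file and run it with scrapy runspider your_spider.py.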
Related
I'm trying to scrape the names of the hotels shown on the first page of a Booking search, using the Scrapy library in Python, but I get an empty CSV file that contains only the column names. Any suggestions? Thank you.
This is the Python code:
import scrapy
import logging
from scrapy.crawler import CrawlerProcess
from scrapy.exporters import CsvItemExporter

class CsvPipeline(object):
    def __init__(self):
        self.file = open('duproprio.tmp', 'wb')
        self.exporter = CsvItemExporter(self.file, str)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_items(self, item, spider):
        self.exporter.export_item(item)
        return item

class DuProprioSpider(scrapy.Spider):
    name = "booking"
    start_urls = [
"https://www.booking.com/searchresults.fr.html?label=gen173nr-1DCAEoggI46AdIM1gEaIwBiAEBmAENuAEXyAEP2AED6AEBiAIBqAIDuALsycKNBsACAdICJGE1YmJmNDE1LWU2ZTEtNGEzMy05MTcyLThkYmQ2OGI5NWE5OdgCBOACAQ&sid=2e5b4623e13363b5ec7de2d7957c8c22&sb=1&sb_lp=1&src=theme_landing_index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fhotel%2Findex.fr.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaIwBiAEBmAENuAEXyAEP2AED6AEBiAIBqAIDuALsycKNBsACAdICJGE1YmJmNDE1LWU2ZTEtNGEzMy05MTcyLThkYmQ2OGI5NWE5OdgCBOACAQ%3Bsid%3D2e5b4623e13363b5ec7de2d7957c8c22%3B&ss=Maroc&is_ski_area=&checkin_year=&checkin_month=&checkout_year=&checkout_month=&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1&ss_raw=ma&ac_position=1&ac_langcode=fr&ac_click_type=b&dest_id=143&dest_type=country&place_id_lat=32.4281&place_id_lon=-6.92197&search_pageview_id=7ca057bb44b9012d&search_selected=true&search_pageview_id=7ca057bb44b9012d&ac_suggestion_list_length=5&ac_suggestion_theme_list_length=0"]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.CsvPipeline': 1},
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'bookingresult.csv'
    }
    #count = 0
    #total = 25

    def parse(self, response):
        #self.count =+25
        nexturl = "https://www.booking.com/searchresults.fr.html?label=gog235jc-1DCAIojAFCAm1hSA1YA2iMAYgBAZgBDbgBF8gBD9gBA-gBAfgBAogCAagCA7gCj9q5jQbAAgHSAiQ1MDlhN2M0Ny0yMmYwLTRiNDUtYjNhMC0xY2Y1MTg3NWM5ODfYAgTgAgE&sid=2e5b4623e13363b5ec7de2d7957c8c22&aid=356980&dest_id=-38833&dest_type=city&srpvid=00bd4bf5ca01008f&track_hp_back_button=1&nflt=ht_id%3D204&offset=0"
        for i in response.css('div._814193827>div>div>div>div>a'):
            yield scrapy.Request(url=i.xpath('@href').extract_first(), callback=self.parse_detail)
        #if self.count < self.total+25:
        yield scrapy.Request(nexturl, self.parse)

    def parse_detail(self, response):
        nom_hotel = response.css('h2#hp_hotel_name.hp__hotel-name::text').get()
        yield {
            'nom_hotel': nom_hotel.strip()
        }

process = CrawlerProcess(
    {
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
process.crawl(DuProprioSpider)
process.start()
1. The first result is '\n'. Example of a solution with getall():
def parse_detail(self, response):
    nom_hotel = response.css('h2#hp_hotel_name.hp__hotel-name::text').getall()
    nom_hotel = ''.join(nom_hotel)
    yield {
        'nom_hotel': nom_hotel.strip()
    }
Output:
nom_hotel
Camp Sahara Holidays
Lovely House at La perle de Cabo Negro
Riad Dar Salam
Hôtel Auberge du Littoral
Kasbah Sirocco
...
...
...
2. Your pipeline is wrong, so you'll get the results at the end of the file after many blank lines. Or just use the default exporter instead:
custom_settings = {
    'LOG_LEVEL': logging.WARNING,
    'FEED_EXPORTERS': {'csv': 'scrapy.exporters.CsvItemExporter'},
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'bookingresult.csv'
}
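If you do want to keep a custom pipeline, one likely issue with the original is the method name: Scrapy calls process_item on item pipelines, while the code above defines process_items, so it never gets invoked. A minimal corrected sketch (file name kept from the feed settings above):

from scrapy.exporters import CsvItemExporter

class CsvPipeline(object):
    def open_spider(self, spider):
        self.file = open('bookingresult.csv', 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    # note: process_item (singular), not process_items
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item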
3. You don't have to enter each hotel's page just to get the name; you can scrape it straight from the search results page. Example:
def parse(self, response):
    nexturl = "https://www.booking.com/searchresults.fr.html?label=gog235jc-1DCAIojAFCAm1hSA1YA2iMAYgBAZgBDbgBF8gBD9gBA-gBAfgBAogCAagCA7gCj9q5jQbAAgHSAiQ1MDlhN2M0Ny0yMmYwLTRiNDUtYjNhMC0xY2Y1MTg3NWM5ODfYAgTgAgE&sid=2e5b4623e13363b5ec7de2d7957c8c22&aid=356980&dest_id=-38833&dest_type=city&srpvid=00bd4bf5ca01008f&track_hp_back_button=1&nflt=ht_id%3D204&offset=0"
    all_names = response.xpath('//div[@data-testid="title"]/text()').getall()
    for name in all_names:
        yield {'nom_hotel': name}
It's faster since, instead of 26 requests (the search page plus 25 hotel detail pages), you create just 1 request.
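If you later want more than the first page of results, one possible extension (a sketch only; it assumes the start URL contains an offset=0 query parameter and that the site pages by 25, as in the nexturl above) is to yield the next search page from parse:

def parse(self, response):
    names = response.xpath('//div[@data-testid="title"]/text()').getall()
    for name in names:
        yield {'nom_hotel': name}

    # assumption: the URL ends with an offset=N parameter that pages by 25
    offset = int(response.url.rsplit('offset=', 1)[-1])
    if names and offset < 100:  # arbitrary stop condition for this sketch
        next_url = response.url.replace('offset=%d' % offset,
                                        'offset=%d' % (offset + 25))
        yield scrapy.Request(next_url, callback=self.parse)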
I am learning Scrapy because I learned it works asynchronously and is therefore faster than Selenium. But it actually takes about 3 minutes to scrape just 100 items, and I don't know why. Please, I need help.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from batt_data.items import BattDataItem
import urllib.parse
from selenium import webdriver

class BatterySpider(CrawlSpider):
    name = 'battery'
    # allowed_domains = ['web']
    start_urls = ['https://www.made-in-china.com/multi-search/24v%2Bbattery/F1/1.html']
    base_url = ['https://www.made-in-china.com/multi-search/24v%2Bbattery/F1/1.html']

    # driver = webdriver.Chrome()
    # driver.find_element_by_xpath('//a[contains(@class,"list-switch-btn list-switch-btn-right selected")]').click()

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[contains(@class, "nextpage")]'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        price = response.css('.price::text').extract()
        description = response.xpath('//img[@class="J-firstLazyload"]/@alt').extract()
        chemistry = response.xpath('//li[@class="J-faketitle ellipsis"][1]/span/text()').extract()
        applications = response.xpath('//li[@class="J-faketitle ellipsis"][2]/span/text()').extract()
        discharge_rate = response.xpath('//li[@class="J-faketitle ellipsis"][4]/span/text()').extract()
        shape = response.xpath('//li[@class="J-faketitle ellipsis"][5]/span/text()').extract()
        data = zip(description, price, chemistry, applications, discharge_rate, shape)
        for item in data:
            scraped = {
                'description': item[0],
                'price': item[1],
                'chemistry': item[2],
                'applications': item[3],
                'discharge_rate': item[4],
                'shape': item[5],
            }
            yield scraped
I was actually sending too many requests. I handled it by looping over a container element that holds all the items I needed. The updated spider did the job in less than a minute.
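The updated spider isn't shown in the original post, so here is a hedged reconstruction of the idea: instead of building one list per field and zipping them, loop over a per-product container node and read every field relative to it. The container XPath below is a placeholder (the real class name on the site may differ); the field selectors are the ones from the question, made relative. The pagination Rule from the original could still be kept, but for brevity this sketch parses the start URL only:

import scrapy

class BatterySpider(scrapy.Spider):
    name = 'battery'
    start_urls = ['https://www.made-in-china.com/multi-search/24v%2Bbattery/F1/1.html']

    def parse(self, response):
        # placeholder selector: one node per product listing on the search page
        for product in response.xpath('//div[contains(@class, "prod-info")]'):
            yield {
                'description': product.xpath('.//img[@class="J-firstLazyload"]/@alt').get(),
                'price': product.css('.price::text').get(),
                'chemistry': product.xpath('.//li[@class="J-faketitle ellipsis"][1]/span/text()').get(),
                'applications': product.xpath('.//li[@class="J-faketitle ellipsis"][2]/span/text()').get(),
                'discharge_rate': product.xpath('.//li[@class="J-faketitle ellipsis"][4]/span/text()').get(),
                'shape': product.xpath('.//li[@class="J-faketitle ellipsis"][5]/span/text()').get(),
            }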
I am trying to scrape the Swedish real estate website www.booli.se. However, I can't figure out how to follow the links for each house and extract, for example, price, rooms, age, etc. I only know how to scrape one page and I can't seem to wrap my head around this. I am looking to do something like:
for link in website:
    follow link
    attribute1 = item.css('cssobject::text').extract()[1]
    attribute2 = item.css('cssobject::text').extract()[2]
    yield {'Attribute 1': attribute1, 'Attribute 2': attribute2}
so that I can scrape the data and output it to an Excel file. My code for scraping a single page, without following links, is as follows:
import scrapy

class BooliSpider(scrapy.Spider):
    name = "boolidata"
    start_urls = [
        'https://www.booli.se/slutpriser/lund/116978/'
    ]

    '''def parse(self, response):
        for link in response.css('.nav-list a::attr(href)').extract():
            yield scrapy.Request(url=response.urljoin(link),
                                 callback=self.collect_data)'''

    def parse(self, response):
        for item in response.css('li.search-list__item'):
            size = item.css('span.search-list__row::text').extract()[1]
            price = item.css('span.search-list__row::text').extract()[3]
            m2price = item.css('span.search-list__row::text').extract()[4]
            yield {'Size': size, 'Price': price, 'M2price': m2price}
Thankful for any help. I'm really having trouble getting it all together and outputting the contents of specific links to a cohesive output file (Excel).
You could use Scrapy's CrawlSpider for following and scraping links.
Your code should look like this:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class BooliItem(scrapy.Item):
    size = scrapy.Field()
    price = scrapy.Field()
    m2price = scrapy.Field()

class BooliSpider(CrawlSpider):
    name = "boolidata"
    start_urls = [
        'https://www.booli.se/slutpriser/lund/116978/',
    ]
    rules = [
        Rule(
            LinkExtractor(
                allow=(r'listing url pattern here to follow'),
                deny=(r'other url patterns to deny'),
            ),
            callback='parse_item',
            follow=True,
        ),
    ]

    def parse_item(self, response):
        item = BooliItem()
        item['size'] = response.css('size selector').extract()
        item['price'] = response.css('price selector').extract()
        item['m2price'] = response.css('m2price selector').extract()
        return item
And you can run your code via:
scrapy crawl boolidata -o booli.csv
and import your CSV into Excel.
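If you want an actual .xlsx file rather than importing the CSV by hand, a small post-processing step with pandas works (this is just one option, assuming pandas and openpyxl are installed):

import pandas as pd

# convert the exported CSV into an Excel workbook
df = pd.read_csv('booli.csv')
df.to_excel('booli.xlsx', index=False)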
I am trying to scrape lynda.com courses and store their info in a CSV file. This is my code:
# -*- coding: utf-8 -*-
import scrapy
import itertools

class LyndadevSpider(scrapy.Spider):
    name = 'lyndadev'
    allowed_domains = ['lynda.com']
    start_urls = ['https://www.lynda.com/Developer-training-tutorials']

    def parse(self, response):
        #print(response.url)
        titles = response.xpath('//li[@role="presentation"]//h3/text()').extract()
        descs = response.xpath('//li[@role="presentation"]//div[@class="meta-description hidden-xs dot-ellipsis dot-resize-update"]/text()').extract()
        links = response.xpath('//li[@role="presentation"]/div/div/div[@class="col-xs-8 col-sm-9 card-meta-data"]/a/@href').extract()
        for title, desc, link in itertools.izip(titles, descs, links):
            #print link
            categ = scrapy.Request(link, callback=self.parse2)
            yield {'desc': link, 'category': categ}

    def parse2(self, response):
        #getting categories by storing the navigation info
        item = response.xpath('//ol[@role="navigation"]').extract()
        return item
What I am trying to do here is grab the titles and descriptions of the listed tutorials, then navigate to each URL and grab the categories in parse2.
However, I get results like this:
category,desc
<GET https://www.lynda.com/SVN-Subversion-tutorials/SVN-Java-Developers/552873-2.html>,https://www.lynda.com/SVN-Subversion-tutorials/SVN-Java-Developers/552873-2.html
<GET https://www.lynda.com/Java-tutorials/WebSocket-Programming-Java-EE/574694-2.html>,https://www.lynda.com/Java-tutorials/WebSocket-Programming-Java-EE/574694-2.html
<GET https://www.lynda.com/GameMaker-tutorials/Building-Physics-Based-Platformer-GameMaker-Studio-Using-GML/598780-2.html>,https://www.lynda.com/GameMaker-tutorials/Building-Physics-Based-Platformer-GameMaker-Studio-Using-GML/598780-2.html
How do I access the information that I want?
You need to yield a scrapy.Request in the parse method that parses the responses of start_urls (instead of yielding a dict). Also, I would rather loop over course items and extract the information for each course item separately.
I'm not sure what you mean exactly by categories. I suppose those are the tags you can see on the course details page at the bottom under Skills covered in this course. But I might be wrong.
Try this code:
# -*- coding: utf-8 -*-
import scrapy

class LyndaSpider(scrapy.Spider):
    name = "lynda"
    allowed_domains = ["lynda.com"]
    start_urls = ['https://www.lynda.com/Developer-training-tutorials']

    def parse(self, response):
        courses = response.css('ul#category-courses div.card-meta-data')
        for course in courses:
            item = {
                'title': course.css('h3::text').extract_first(),
                'desc': course.css('div.meta-description::text').extract_first(),
                'link': course.css('a::attr(href)').extract_first(),
            }
            request = scrapy.Request(item['link'], callback=self.parse_course)
            request.meta['item'] = item
            yield request

    def parse_course(self, response):
        item = response.meta['item']
        #item['categories'] = response.css('div.tags a em::text').extract()
        item['category'] = response.css('ol.breadcrumb li:last-child a span::text').extract_first()
        return item
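As a side note, stashing the partially filled item in request.meta works fine, but on Scrapy 1.7+ cb_kwargs is a more explicit way to pass data to a callback. A hedged variant of the two methods above:

        # pass the item explicitly instead of putting it in meta
        yield scrapy.Request(item['link'], callback=self.parse_course,
                             cb_kwargs={'item': item})

    def parse_course(self, response, item):
        item['category'] = response.css('ol.breadcrumb li:last-child a span::text').extract_first()
        return item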
As a part of learning to use Scrapy, I have tried to crawl Amazon, and there is a problem while scraping data.
The output of my code is as follows:
2013-02-25 12:47:21+0530 [scanon] DEBUG: Scraped from <200 http://www.amazon.com/s/ref=sr_pg_2?ie=UTF8&page=2&qid=1361774681&rh=n%3A283155>
{'link': [u'http://www.amazon.com/ObamaCare-Survival-Guide-Nick-Tate/dp/0893348627/ref=sr_1_13?s=books&ie=UTF8&qid=1361774694&sr=1-13',
u'http://www.amazon.com/MELT-Method-Breakthrough-Self-Treatment-Eliminate/dp/0062065351/ref=sr_1_14?s=books&ie=UTF8&qid=1361774694&sr=1-14',
u'http://www.amazon.com/Official-SAT-Study-Guide-2nd/dp/0874478529/ref=sr_1_15?s=books&ie=UTF8&qid=1361774694&sr=1-15',
u'http://www.amazon.com/Inferno-Robert-Langdon-Dan-Brown/dp/0385537859/ref=sr_1_16?s=books&ie=UTF8&qid=1361774694&sr=1-16',
u'http://www.amazon.com/Memory-Light-Wheel-Time/dp/0765325950/ref=sr_1_17?s=books&ie=UTF8&qid=1361774694&sr=1-17',
u'http://www.amazon.com/Jesus-Calling-Enjoying-Peace-Presence/dp/1591451884/ref=sr_1_18?s=books&ie=UTF8&qid=1361774694&sr=1-18',
u'http://www.amazon.com/Fifty-Shades-Grey-Book-Trilogy/dp/0345803485/ref=sr_1_19?s=books&ie=UTF8&qid=1361774694&sr=1-19',
u'http://www.amazon.com/Fifty-Shades-Trilogy-Darker-3-/dp/034580404X/ref=sr_1_20?s=books&ie=UTF8&qid=1361774694&sr=1-20',
u'http://www.amazon.com/Wheat-Belly-Lose-Weight-Health/dp/1609611543/ref=sr_1_21?s=books&ie=UTF8&qid=1361774694&sr=1-21',
u'http://www.amazon.com/Publication-Manual-American-Psychological-Association/dp/1433805618/ref=sr_1_22?s=books&ie=UTF8&qid=1361774694&sr=1-22',
u'http://www.amazon.com/One-Only-Ivan-Katherine-Applegate/dp/0061992259/ref=sr_1_23?s=books&ie=UTF8&qid=1361774694&sr=1-23',
u'http://www.amazon.com/Inquebrantable-Spanish-Jenni-Rivera/dp/1476745420/ref=sr_1_24?s=books&ie=UTF8&qid=1361774694&sr=1-24'],
'title': [u'ObamaCare Survival Guide',
u'The Official SAT Study Guide, 2nd edition',
u'Inferno: A Novel (Robert Langdon)',
u'A Memory of Light (Wheel of Time)',
u'Jesus Calling: Enjoying Peace in His Presence',
u'Fifty Shades of Grey: Book One of the Fifty Shades Trilogy',
u'Fifty Shades Trilogy: Fifty Shades of Grey, Fifty Shades Darker, Fifty Shades Freed 3-volume Boxed Set',
u'Wheat Belly: Lose the Wheat, Lose the Weight, and Find Your Path Back to Health',
u'Publication Manual of the American Psychological Association, 6th Edition',
u'The One and Only Ivan',
u'Inquebrantable (Spanish Edition)'],
'visit_id': '2f4d045a9d6013ef4a7cbc6ed62dc111f6111633',
'visit_status': 'new'}
But I wanted the output to be captured like this:
2013-02-25 12:47:21+0530 [scanon] DEBUG: Scraped from <200 http://www.amazon.com/s/ref=sr_pg_2?ie=UTF8&page=2&qid=1361774681&rh=n%3A283155>
{'link': [u'http://www.amazon.com/ObamaCare-Survival-Guide-Nick-Tate/dp/0893348627/ref=sr_1_13?s=books&ie=UTF8&qid=1361774694&sr=1-13'],
'title': [u'ObamaCare Survival Guide']}
2013-02-25 12:47:21+0530 [scanon] DEBUG: Scraped from <200 http://www.amazon.com/s/ref=sr_pg_2?ie=UTF8&page=2&qid=1361774681&rh=n%3A283155>
{'link': [u'http://www.amazon.com/Official-SAT-Study-Guide-2nd/dp/0874478529/ref=sr_1_15?s=books&ie=UTF8&qid=1361774694&sr=1-15'],
'title': [u'The Official SAT Study Guide, 2nd edition']}
I think it's not a problem with Scrapy or the crawler, but with the for loop I have written.
Following is the code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from Amaze.items import AmazeItem

class AmazeSpider2(CrawlSpider):
    name = "scanon"
    allowed_domains = ["www.amazon.com"]
    start_urls = ["http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=books"]

    rules = (
        Rule(SgmlLinkExtractor(allow=("ref=sr_pg_*")), callback="parse_items_1", follow=True),
    )

    def parse_items_1(self, response):
        items = []
        print ('*** response:', response.url)
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//h3')
        for title in titles:
            item = AmazeItem()
            item["title"] = title.select('//a[@class="title"]/text()').extract()
            item["link"] = title.select('//a[@class="title"]/@href').extract()
            print ('**parse-items_1:', item["title"], item["link"])
            items.append(item)
        return items
Any assistance is appreciated!
The problem is in your XPath:
def parse_items_1(self, response):
    items = []
    print ('*** response:', response.url)
    hxs = HtmlXPathSelector(response)
    titles = hxs.select('//h3')
    for title in titles:
        item = AmazeItem()
        item["title"] = title.select('.//a[@class="title"]/text()').extract()
        item["link"] = title.select('.//a[@class="title"]/@href').extract()
        print ('**parse-items_1:', item["title"], item["link"])
        items.append(item)
    return items
In the above XPaths you need the leading . so the XPath looks inside the current title only; otherwise your XPath searches the whole page, so it gets a lot of matches and returns all of them for every title.
By the way, you can test out your XPath expressions in the Scrapy shell: http://doc.scrapy.org/en/latest/topics/shell.html
Done right, it will save you hours of work and a headache. :)
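For example, a quick session (the URL and selectors are the ones from the question; the output is illustrative) might look like:

scrapy shell "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=books"
>>> titles = response.xpath('//h3')
>>> titles[0].xpath('.//a[@class="title"]/text()').extract()   # relative: this h3 only
>>> titles[0].xpath('//a[@class="title"]/text()').extract()    # absolute: every title on the page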
Use yield to make a generator and fix your xpath selectors:
def parse_items_1(self, response):
    hxs = HtmlXPathSelector(response)
    titles = hxs.select('//h3')
    for title in titles:
        item = AmazeItem()
        item["title"] = title.select('.//a[@class="title"]/text()').extract()
        item["link"] = title.select('.//a[@class="title"]/@href').extract()
        yield item