Scrapy returning unordered values when crawling - python

I am new to Scrapy and I am trying to crawl this page and get the prices of the items. The problem is that Scrapy returns the values out of order and I don't know why.
This is my simple code:
import scrapy
from ..items import AmazonItem
from scrapy.http import Request
import time

class QuotesSpider(scrapy.Spider):
    name = "main"

    def start_requests(self):
        urls = [
            'https://www.amazon.com/best-sellers-movies-TV-DVD-Blu-ray/zgbs/movies-tv/ref=zg_bs_nav_0',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # amazon = AmazonItem()
        ol_response = response.xpath('//ol[@id="zg-ordered-list"]/li')
        for number_ra in range(0, 50):
            response_div = ol_response[number_ra]
            price = response_div.css(".p13n-sc-price::text").extract()
            item_name = response_div.xpath("span/div/span/a/div/text()").get().strip()
            link = response_div.xpath("span/div/span/a").attrib['href'].split('/')[3].split('?')[0]
            print("({}) {} , PRICE: {}".format(number_ra + 1, item_name, price))
            print(link + "\n")
The name and the id are in the correct order but not the prices.
Thanks, guys

You are doing it the wrong way. You should iterate over each item one by one:
def parse(self, response):
    for item in response.xpath('//ol[@id="zg-ordered-list"]/li'):
        price = item.css(".p13n-sc-price::text").get()
        item_name = item.css(".p13n-sc-truncate.p13n-sc-line-clamp-1::text").get()
        link = response.urljoin(item.css(".a-link-normal::attr(href)").get())
        print("{} , PRICE: {}".format(item_name, price))

Related

Scrapy How to scrape items from multiple pages?

I am trying to scrape data from # pages. I have already written a scraper that can scrape data from a single # page, but it suddenly finishes after scraping the first page.
The whole file with the parse and scrape functions - Scraper.py:
# -*- coding: utf-8 -*-
import scrapy
import csv
import os
from scrapy.selector import Selector
from scrapy import Request

class Proddduct(scrapy.Item):
    price = scrapy.Field()
    description = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()

class LapadaScraperSpider(scrapy.Spider):
    name = 'lapada_scraper2'
    allowed_domains = ['http://www.lapada.org']
    start_urls = ['https://lapada.org/art-and-antiques/?search=antique']

    def parse(self, response):
        next_page_url = response.xpath("//ul/li[@class='next']//a/@href").get()
        for item in self.scrape(response):
            yield item
        if next_page_url:
            print("Found url: {}".format(next_page_url))
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def scrape(self, response):
        parser = scrapy.Selector(response)
        products = parser.xpath("//div[@class='content']")
        for product in products:
            item = Proddduct()
            XPATH_PRODUCT_DESCRIPTION = ".//strong/text()"
            XPATH_PRODUCT_PRICE = ".//div[@class='price']/text()"
            XPATH_PRODUCT_LINK = ".//a/@href"
            raw_product_description = product.xpath(XPATH_PRODUCT_DESCRIPTION).extract()
            raw_product_price = product.xpath(XPATH_PRODUCT_PRICE).extract()
            raw_product_link = product.xpath(XPATH_PRODUCT_LINK).extract_first()
            item['description'] = raw_product_description
            item['price'] = raw_product_price
            item['link'] = raw_product_link
            yield item

    def get_information(self, response):
        item = response.meta['item']
        item['phonenumber'] = "12345"
        yield item
How can I scrape all items on all pages?
Thanks
Change allowed_domains = ['http://www.lapada.org'] to allowed_domains = ['lapada.org']. allowed_domains should contain bare domain names, not URLs; with the scheme included in the list, the offsite middleware does not treat the follow-up requests to lapada.org as allowed and drops them, which is why the crawl stops after the first page.
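For reference, only that one line of the spider changes:

class LapadaScraperSpider(scrapy.Spider):
    name = 'lapada_scraper2'
    # domains only - no scheme - so the next-page requests are not filtered as off-site
    allowed_domains = ['lapada.org']
    start_urls = ['https://lapada.org/art-and-antiques/?search=antique']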

Scrapy - how to join data together from different parts of a website

I am in the process of building a crawler. I want it to navigate all available pages on the site, and [i] fill a number of data fields for each product, and [ii], for each product, drill into the corresponding product url and populate a number of other data fields. I want all of the data in the same {} for each product, but instead the crawler carries out [i] and then [ii], so that part [ii] is populated in a separate {}.
I want to somehow add the data from [i] into [ii]. request.meta['item'] = item looks like something that could work, but I have not yet succeeded in getting it to work.
I have the following code:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from maxine.items import CrawlerItem

class Crawler1Spider(CrawlSpider):
    name = "crawler1"
    allowed_domains = ["website.com"]
    start_urls = (
        'starturl.com',
    )

    rules = [
        # visit each page
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="listnavpagenum"]')), callback='parse_item', follow=True),
        # click on each product link
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="exhib_status exhib_status_interiors"]')), callback='parse_detail', follow=True),
    ]

    def parse_item(self, response):
        sel = Selector(response)
        elements = sel.xpath('//div[@class="ez_listitem_wrapper"]')
        items = []
        results = []
        n = 0
        for element in elements:
            item = CrawlerItem()
            n = n + 1
            # work out how to put images into image folder
            item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
            item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
            item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            item['count'] = n
            yield item
            # items.append(item)
        # return items

    def parse_detail(self, response):
        item = CrawlerItem()
        item['telephone'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('[0-9]{4,}\s*[0-9]{4,}')
        item['website'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('(?:http://)?www.[a-z0-9\/?_\- ]+.[0-9a-z]+')
        yield item
Suggestions as to how I can get all the data into one {} for each product would be much appreciated.
UPDATE: 20/11/15
I have amended the code as follows:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from maxine.items import CrawlItem

class Crawler1Spider(CrawlSpider):
    name = "test"
    allowed_domains = ["website.com"]
    start_urls = (
        'starturl.com',
    )

    rules = [
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="listnavpagenum"]')), callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        item = CrawlItem()
        sel = Selector(response)
        elements = sel.xpath('//div[@class="ez_listitem_wrapper"]')
        items = []
        n = 0
        for element in elements:
            n = n + 1
            # work out how to put images into image folder
            # item['image_urls'] = selector.xpath('//a[@class="exhib_status exhib_status_interiors"]/img/@src').extract()
            item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
            item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
            item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            item['count'] = n
            item_detail_url = item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            # crawl the item and pass the item to the following request with *meta*
            yield Request(url=item_detail_url, callback=self.parse_detail, meta=dict(item=item))

    def parse_detail(self, response):
        # get the item from the previously passed meta
        item = response.meta['item']
        # keep populating the item
        item['telephone'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('[0-9]{4,}\s*[0-9]{4,}')
        item['website'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('(?:http://)?www.[a-z0-9\/?_\- ]+.[0-9a-z]+')
        yield item
I'm getting the data in the same {}'s; however, the robot is only extracting data from the last item on each page. Any further suggestions?
I am afraid you can't use rules for this case, as every request is independent by the time it reaches the site you want to crawl.
You'll need to define your own behaviour starting from start_requests:
def start_requests(self):
    yield Request(url=myinitialurl, callback=self.parse)

def parse(self, response):
    # crawl the initial page and then do something with that info
    yield Request(url=producturl, callback=self.parse_item)

def parse_item(self, response):
    item = CrawlerItem()
    # crawl the item and pass the item to the following request with *meta*
    yield Request(url=item_detail_url, callback=self.parse_detail, meta=dict(item=item))

def parse_detail(self, response):
    # get the item from the previously passed meta
    item = response.meta['item']
    # keep populating the item
    yield item
Try instantiating item = CrawlItem() within the for loop in parse_item. As written, the single item created before the loop is shared by every request passed through meta, so each iteration overwrites the same object and only the last product per page survives.
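Applied to the amended parse_item above, a minimal sketch of that fix would be (it also uses a relative './/' path for the link, so each element contributes its own URL rather than the first one on the page):

def parse_item(self, response):
    elements = Selector(response).xpath('//div[@class="ez_listitem_wrapper"]')
    for n, element in enumerate(elements, start=1):
        item = CrawlItem()  # a fresh item per product, so meta does not reuse one shared object
        item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
        item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
        item['item_url'] = element.xpath('.//div[@class="ez_merge4"]/a/@href').extract_first()
        item['count'] = n
        yield Request(url=item['item_url'], callback=self.parse_detail, meta=dict(item=item))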

Scrapy parse list of urls, open one by one and parse additional data

I am trying to parse a site, an e-store. I parse a page of products, which are loaded with AJAX, get the urls of these products, and then parse additional info for each product by following those parsed urls.
My script gets the list of the first 4 items on the page and their urls, makes the requests and parses the additional info, but then it does not return to the loop, so the spider closes.
Could somebody help me solve this? I'm pretty new to this kind of stuff and only ask here when totally stuck.
Here is my code:
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http.request import Request
from scrapy_sokos.items import SokosItem

class SokosSpider(Spider):
    name = "sokos"
    allowed_domains = ["sokos.fi"]
    base_url = "http://www.sokos.fi/fi/SearchDisplay?searchTermScope=&searchType=&filterTerm=&orderBy=8&maxPrice=&showResultsPage=true&beginIndex=%s&langId=-11&sType=SimpleSearch&metaData=&pageSize=4&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&urlLangId=-11&categoryId=295401&storeId=10151"
    start_urls = [
        "http://www.sokos.fi/fi/SearchDisplay?searchTermScope=&searchType=&filterTerm=&orderBy=8&maxPrice=&showResultsPage=true&beginIndex=0&langId=-11&sType=SimpleSearch&metaData=&pageSize=4&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&urlLangId=-11&categoryId=295401&storeId=10151",
    ]
    for i in range(0, 8, 4):
        start_urls.append((base_url) % str(i))

    def parse(self, response):
        products = Selector(response).xpath('//div[@class="product-listing product-grid"]/article[@class="product product-thumbnail"]')
        for product in products:
            item = SokosItem()
            item['url'] = product.xpath('//div[@class="content"]/a[@class="image"]/@href').extract()[0]
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_additional_info)

    def parse_additional_info(self, response):
        item = response.meta['item']
        item['name'] = Selector(response).xpath('//h1[@class="productTitle"]/text()').extract()[0].strip()
        item['description'] = Selector(response).xpath('//div[@id="kuvaus"]/p/text()').extract()[0]
        euro = Selector(response).xpath('//strong[@class="special-price"]/span[@class="euros"]/text()').extract()[0]
        cent = Selector(response).xpath('//strong[@class="special-price"]/span[@class="cents"]/text()').extract()[0]
        item['price'] = '.'.join(euro + cent)
        item['number'] = Selector(response).xpath('//@data-productid').extract()[0]
        yield item
The AJAX requests you are simulating are caught by the Scrapy "duplicate url filter".
Set dont_filter to True when yielding a Request:
yield Request(url=item['url'],
              meta={'item': item},
              callback=self.parse_additional_info,
              dont_filter=True)

Scrapy: how to crawl the URL I got from spider? exceptions.NameError: global name 'parse_detail' is not defined

I am practicing Scrapy and have a question:
I want to crawl the link I got from the spider again, and I don't know how to do it.
Here is my code:
As you can see, the link I crawl is saved in the variable movie_descriptionTW_URL,
and I wrote yield Request(movie_descriptionTW, parse_detail) to send the result to:
def parse_detail(self, response):
    print(response.url)
But there is an error: exceptions.NameError: global name 'parse_detail' is not defined
How do I solve this?
Please teach me! Thank you
from scrapy.spider import Spider
from scrapy.selector import Selector
from yahoo.items import YahooItem
from scrapy.http.request import Request

class MySpider(Spider):
    name = "yahoogo"
    start_urls = ["https://tw.movies.yahoo.com/chart.html"]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath("//tr")
        items = []
        for site in sites:
            item = YahooItem()
            ranking_list = site.xpath("td[@class='c1']/span/text()").extract()
            movie_descriptionTW = site.xpath("(td[@class='c3']/*//a)[position() < last()-1]/text() | td[@class='c3']/a[1]/text() ").extract()
            movie_descriptionTW_URL = site.xpath("(td[@class='c3']/*//a[2]/@href) | td[@class='c3']/a[1]/@href ").extract()
            # crawl again!
            yield Request(movie_descriptionTW, parse_detail)
            if ranking_list:
                items.append(item)
        yield items

    def parse_detail(self, response):
        print(response.url)
Use self.parse_detail to refer to class methods, like the following:
for url in movie_descriptionTW_URL:
    yield Request(url=url, callback=self.parse_detail)

Scrapy Python Craigslist Scraper

I am trying to scrape Craigslist classifieds using Scrapy to extract items that are for sale.
I am able to extract date, post title, and post url but am having trouble extracting price.
For some reason the current code extracts all of the prices, but when I remove the // before the price span lookup, the price field comes back empty.
Can someone please review the code below and help me out?
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem

class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://longisland.craigslist.org/search/sss?sort=date&query=raptor%20660&srchType=T"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//p")
        items = []
        for titles in titles:
            item = CraigslistSampleItem()
            item['date'] = titles.select('span[@class="itemdate"]/text()').extract()
            item["title"] = titles.select("a/text()").extract()
            item["link"] = titles.select("a/@href").extract()
            item['price'] = titles.select('//span[@class="itempp"]/text()').extract()
            items.append(item)
        return items
itempp appears to be inside another element, itempnr. Perhaps it would work if you changed //span[@class="itempp"]/text() to span[@class="itempnr"]/span[@class="itempp"]/text().
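In context, that would make the price line inside the loop look like this (a sketch against the selectors from the question; the live Craigslist markup may differ):

item['price'] = titles.select('span[@class="itempnr"]/span[@class="itempp"]/text()').extract()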
