I am trying to scrape data from a page and then continue scraping by following the pagination link.
The page I am trying to scrape is the one listed in start_urls in the code below.
# -*- coding: utf-8 -*-
import scrapy


class AlibabaSpider(scrapy.Spider):
    name = 'alibaba'
    allowed_domains = ['alibaba.com']
    start_urls = ['https://www.alibaba.com/catalog/agricultural-growing-media_cid144?page=1']

    def parse(self, response):
        for products in response.xpath('//div[contains(@class, "m-gallery-product-item-wrap")]'):
            item = {
                'product_name': products.xpath('.//h2/a/@title').extract_first(),
                'price': products.xpath('.//div[@class="price"]/b/text()').extract_first('').strip(),
                'min_order': products.xpath('.//div[@class="min-order"]/b/text()').extract_first(),
                'company_name': products.xpath('.//div[@class="stitle util-ellipsis"]/a/@title').extract_first(),
                'prod_detail_link': products.xpath('.//div[@class="item-img-inner"]/a/@href').extract_first(),
                'response_rate': products.xpath('.//i[@class="ui2-icon ui2-icon-skip"]/text()').extract_first('').strip(),
                #'image_url': products.xpath('.//div[@class=""]/').extract_first(),
            }
            yield item

        # Follow the pagination link
        next_page_url = response.xpath('//link[@rel="next"]/@href').extract_first()
        if next_page_url:
            yield scrapy.Request(url=next_page_url, callback=self.parse)
Problem
The code is not able to follow the pagination link.
How you can help
Modify the code so it follows the pagination link.
To get your code working, you need to fix the broken link by using response.follow() or something similar. Try the approach below.
import scrapy


class AlibabaSpider(scrapy.Spider):
    name = 'alibaba'
    allowed_domains = ['alibaba.com']
    start_urls = ['https://www.alibaba.com/catalog/agricultural-growing-media_cid144?page=1']

    def parse(self, response):
        for products in response.xpath('//div[contains(@class, "m-gallery-product-item-wrap")]'):
            item = {
                'product_name': products.xpath('.//h2/a/@title').extract_first(),
                'price': products.xpath('.//div[@class="price"]/b/text()').extract_first('').strip(),
                'min_order': products.xpath('.//div[@class="min-order"]/b/text()').extract_first(),
                'company_name': products.xpath('.//div[@class="stitle util-ellipsis"]/a/@title').extract_first(),
                'prod_detail_link': products.xpath('.//div[@class="item-img-inner"]/a/@href').extract_first(),
                'response_rate': products.xpath('.//i[@class="ui2-icon ui2-icon-skip"]/text()').extract_first('').strip(),
                #'image_url': products.xpath('.//div[@class=""]/').extract_first(),
            }
            yield item

        # Follow the pagination link
        next_page_url = response.xpath('//link[@rel="next"]/@href').extract_first()
        if next_page_url:
            yield response.follow(url=next_page_url, callback=self.parse)
Your pasted code was badly indented; I've fixed that as well.
It doesn't work because the URL isn't valid (it is relative, not absolute). If you want to keep using scrapy.Request, you could use:
next_page_url = response.xpath('//link[@rel="next"]/@href').extract_first()
if next_page_url:
    next_page_url = response.urljoin(next_page_url)
    yield scrapy.Request(url=next_page_url, callback=self.parse)
A shorter solution:
next_page_url = response.xpath('//link[@rel="next"]/@href').extract_first()
if next_page_url:
    yield response.follow(next_page_url)
Related
I am trying to get some data from the website, but my spider is not crawling to the next page even though there is a proper pagination link.
import scrapy


class NspiderSpider(scrapy.Spider):
    name = "nspider"
    allowed_domains = ["elimelechlab.yale.edu/"]
    start_urls = ["https://elimelechlab.yale.edu/pub"]

    def parse(self, response):
        title = response.xpath(
            '//*[@class="views-field views-field-title"]/span/text()'
        ).extract()
        doi_link = response.xpath(
            '//*[@class="views-field views-field-field-doi-link"]//a[1]/@href'
        ).extract()

        yield {"paper_title": title, "doi_link": doi_link}

        next_page = response.xpath(
            '//*[@title="Go to next page"]/@href'
        ).extract_first()  # extracting next page link
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
PS: I don't want to use LinkExtractor.
Any help would be appreciated.
There is nothing wrong with your next_page logic; the code just never reaches it because the yield for the item is at the same indentation level. Try the following approach:
import scrapy


class NspiderSpider(scrapy.Spider):
    name = "nspider"
    allowed_domains = ["elimelechlab.yale.edu"]
    start_urls = ["https://elimelechlab.yale.edu/pub"]

    def parse(self, response):
        for view in response.css('div.views-row'):
            yield {
                'paper_title': view.css('div.views-field-title span.field-content::text').get(),
                'doi_link': view.css('div.views-field-field-doi-link div.field-content a::attr(href)').get()
            }

        next_page = response.xpath(
            '//*[@title="Go to next page"]/@href'
        ).extract_first()  # extracting next page link
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
I have this code:
import scrapy
import requests


class cvbankas(scrapy.Spider):
    name = 'bankas'
    allowed_domains = ['cvbankas.lt']
    start_urls = ['https://www.cvbankas.lt/']

    def parse(self, response):
        job_position_tag = response.css("h3.list_h3::text").extract()
        city_tag = response.css("span.list_city::text").extract()
        company_tag = response.css("span.dib.mt5::text").extract()
        salary_tag = response.css("span.salary_amount::text").extract()

        for item in zip(job_position_tag, city_tag, company_tag, salary_tag):
            scraped_info = {
                'company': company_tag,
                'city': city_tag,
                'position': job_position_tag,
                'salary': salary_tag,
            }
            yield scraped_info

        next_page = response.css('li > a::attr(href)').extract_first()
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(url=next_page, callback=self.parse)
I don't know why it scrapes only 3 pages.
The output (marked in red) covers only 3 of the 88 pages.
Where is the problem in the pagination?
Your selector was finding the first <a> tag it could find, which was the language <a> tag, so you were switching languages, not pages.
import scrapy
import requests


class cvbankas(scrapy.Spider):
    name = 'bankas'
    allowed_domains = ['cvbankas.lt']
    start_urls = ['https://www.cvbankas.lt/']

    def parse(self, response):
        job_position_tag = response.css("h3.list_h3::text").extract()
        city_tag = response.css("span.list_city::text").extract()
        company_tag = response.css("span.dib.mt5::text").extract()
        salary_tag = response.css("span.salary_amount::text").extract()

        # yield one item per listing instead of the full lists every time
        for position, city, company, salary in zip(job_position_tag, city_tag, company_tag, salary_tag):
            scraped_info = {
                'company': company,
                'city': city,
                'position': position,
                'salary': salary,
            }
            yield scraped_info

        # follow the real "next page" arrow instead of the first <a> on the page
        next_pages = response.xpath('//a[@class="prev_next"]/@href').extract()
        if next_pages:
            next_page = response.urljoin(next_pages[-1])
            yield scrapy.Request(url=next_page, callback=self.parse)
It looks like the website you are scraping uses the URL format uri?page=x,
so a simple loop that substitutes values for x could also solve your problem.
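For example, a minimal sketch of that idea, treating the ?page=x pattern and the 88-page figure from the question as assumptions, would generate all the page URLs up front and let parse() handle each one:
# Sketch only: the ?page=x pattern and the total of 88 pages are taken
# from the question above, not verified against the live site.
start_urls = ['https://www.cvbankas.lt/?page={}'.format(page) for page in range(1, 89)]
The rest of the spider can stay as it is, since Scrapy calls parse() once for every URL in start_urls.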
I have to get all review texts and scores from a product's page, and I have managed the following:
When I manually add the link to a single product's review page, I get all the reviews and scores from it (including the other pages of reviews).
To speed this up, I want to go from the category page to a product page, get all of its reviews and scores, and once that is done move on to the next product.
import scrapy


class ReviewAutoSpider(scrapy.Spider):
    name = 'automatic'
    start_urls = ['https://www.ceneo.pl/Gry_bez_pradu']

    def parse(self, response):
        # follow links to the page with reviews
        for href in response.css('a.product-rewiews-link + a::attr(href)'):
            yield response.follow(href, self.parse_link)

        # follow pagination links
        #for href in response.css('li.arrow-next a::attr(href)'):
        #    yield response.follow(href, self.parse)

    def parse_link(self, response):
        # get all reviews + scores on the page
        for review in response.css('li.review-box'):
            yield {
                'score': review.css('span.review-score-count::text').get(),
                'text': review.css('p.product-review-body::text').getall(),
            }

        # follow pagination links
        for href in response.css('li.arrow-next a::attr(href)'):
            yield response.follow(href, callback=self.parse)
OK, the following solution should work. The links you were getting contained only the second part of the URL, e.g. '/19838632'; you need to use response.urljoin('/19838632') to get the full link.
Also, the way the spider is currently set up, you are going to be making a large number of concurrent requests to the site, so I would highly recommend using a proxy service.
import scrapy


class ReviewAutoSpider(scrapy.Spider):
    name = 'automatic'
    start_urls = ['https://www.ceneo.pl/Gry_bez_pradu']

    def parse(self, response):
        # follow links to the pages with reviews
        for href in response.css('a.product-rewiews-link + a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_link)

        # follow links to the product pages listed in the category
        for href in response.css('.cat-prod-row-name a::attr(href)').extract():
            link = response.urljoin(href)
            yield scrapy.Request(link, callback=self.parse)

        # follow the category pagination link
        next_page_link = response.css('li[class ="page-arrow arrow-next"] a::attr(href)').extract_first()
        if next_page_link:
            next_page_link = response.urljoin(next_page_link)
            yield scrapy.Request(next_page_link, callback=self.parse)

    def parse_link(self, response):
        # get all reviews + scores on the page
        for review in response.css('li.review-box'):
            yield {
                'score': review.css('span.review-score-count::text').get(),
                'text': review.css('p.product-review-body::text').getall(),
            }

        # follow the review pagination links
        for href in response.css('li.arrow-next a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_link)
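As a side note on the concurrency warning above: instead of (or alongside) a proxy, the request rate can be limited with Scrapy's standard throttling settings. A minimal sketch, with placeholder values rather than recommendations:
import scrapy


class ReviewAutoSpider(scrapy.Spider):
    name = 'automatic'
    start_urls = ['https://www.ceneo.pl/Gry_bez_pradu']

    # Sketch only: slow the crawl down; the values here are arbitrary examples.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
        'DOWNLOAD_DELAY': 1.0,
        'AUTOTHROTTLE_ENABLED': True,
    }

    # parse() and parse_link() as in the solution above
If a proxy is still needed, it can be set per request via the meta['proxy'] key handled by Scrapy's built-in HttpProxyMiddleware, or through a custom downloader middleware.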
I'm building a scraper with Scrapy for the Swedish e-commerce site Blocket.se.
It scrapes the first page as it should, but it won't jump to the next.
The expression for the next URL,
response.xpath(u'//a[contains(text(), "Nästa")]/@href').extract()
outputs an "incomplete" link when I try it in Scrapy shell:
?q=cykel&cg=0&w=1&st=s&c=&ca=11&l=0&md=th&o=2
Does it have to be a "full" link to work, like this?
https://www.blocket.se/stockholm?q=cykel&cg=0&w=1&st=s&c=&ca=11&l=0&md=th&o=2
Starting URL: https://www.blocket.se/stockholm?q=cykel&cg=0&w=1&st=s&c=&ca=11&is=1&l=0&md=th
Full code:
import scrapy


class BlocketSpider(scrapy.Spider):
    name = "blocket"
    start_urls = ["https://www.blocket.se/stockholm?q=cykel&cg=0&w=1&st=s&c=&ca=11&is=1&l=0&md=th"]

    def parse(self, response):
        urls = response.css("h1.media-heading > a::attr(href)").extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url=url, callback=self.parse_details)

        # follow pagination links
        next_page_url = response.xpath(u'//a[contains(text(), "Nästa")]/@href').extract()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        yield {
            "Objekt": response.css("h1.h3::text").extract(),
            "Säljare": response.css("li.mrl > strong > a::text").extract(),
            "Uppladdad": response.css("li.mrl > time::text").extract(),
            "Pris": response.css("div.h3::text").extract(),
            "Område": response.css("span.area_label::text").extract(),
            "Bild-URL": response.css("div.item > img::attr(src)").extract(),
        }
Yes, scrapy.Request usually needs the full URL. But you can keep using urljoin(), or use the response.follow() method:
next_page_url = response.xpath(u'//a[contains(text(), "Nästa")]/@href').extract_first()
if next_page_url:
    yield response.follow(url=next_page_url, callback=self.parse)
More about this in the Scrapy tutorial.
I need a Scrapy spider to scrape the following page (https://www.phidgets.com/?tier=1&catid=64&pcid=57) for each product URL (30 products, so 30 URLs) and then go into each product via that URL and scrape the data inside.
I have the second part working exactly as I want:
import scrapy


class ProductsSpider(scrapy.Spider):
    name = "products"
    start_urls = [
        'https://www.phidgets.com/?tier=1&catid=64&pcid=57',
    ]

    def parse(self, response):
        for info in response.css('div.ph-product-container'):
            yield {
                'product_name': info.css('h2.ph-product-name::text').extract_first(),
                'product_image': info.css('div.ph-product-img-ctn a').xpath('@href').extract(),
                'sku': info.css('span.ph-pid').xpath('@prod-sku').extract_first(),
                'short_description': info.css('div.ph-product-summary::text').extract_first(),
                'price': info.css('h2.ph-product-price > span.price::text').extract_first(),
                'long_description': info.css('div#product_tab_1').extract_first(),
                'specs': info.css('div#product_tab_2').extract_first(),
            }

        # next_page = response.css('div.ph-summary-entry-ctn a::attr("href")').extract_first()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)
But I don't know how to do the first part. As you can see, I have the main page (https://www.phidgets.com/?tier=1&catid=64&pcid=57) set as the start URL. But how do I get it to populate the start_urls list with all 30 URLs I need crawled?
I am not able to test at this moment, so please let me know if this works for you, so I can edit it should there be any bugs.
The idea here is that we find every product link on the first page and yield new scrapy Requests, passing your product-parsing method as the callback:
import scrapy
from urllib.parse import urljoin


class ProductsSpider(scrapy.Spider):
    name = "products"
    start_urls = [
        'https://www.phidgets.com/?tier=1&catid=64&pcid=57',
    ]

    def parse(self, response):
        products = response.xpath("//*[contains(@class, 'ph-summary-entry-ctn')]/a/@href").extract()
        for p in products:
            url = urljoin(response.url, p)
            yield scrapy.Request(url, callback=self.parse_product)

    def parse_product(self, response):
        for info in response.css('div.ph-product-container'):
            yield {
                'product_name': info.css('h2.ph-product-name::text').extract_first(),
                'product_image': info.css('div.ph-product-img-ctn a').xpath('@href').extract(),
                'sku': info.css('span.ph-pid').xpath('@prod-sku').extract_first(),
                'short_description': info.css('div.ph-product-summary::text').extract_first(),
                'price': info.css('h2.ph-product-price > span.price::text').extract_first(),
                'long_description': info.css('div#product_tab_1').extract_first(),
                'specs': info.css('div#product_tab_2').extract_first(),
            }