I'm building a scaper with Scrapy for swedish ecommerce site Blocket.se.
It's scraping the first page as it should, but it won't jump the next.
The command for next url
response.xpath(u'//a[contains(text(), "Nästa")]/#href').extract()
outputs an "incomplete" link when I try it in Scrapy shell:
?q=cykel&cg=0&w=1&st=s&c=&ca=11&l=0&md=th&o=2
Does it have to be a "full" link to work?:
https://www.blocket.se/stockholm?q=cykel&cg=0&w=1&st=s&c=&ca=11&l=0&md=th&o=2
Starting-url: https://www.blocket.se/stockholm?q=cykel&cg=0&w=1&st=s&c=&ca=11&is=1&l=0&md=th
Full code:
import scrapy
class BlocketSpider(scrapy.Spider):
name = "blocket"
start_urls = ["https://www.blocket.se/stockholm?q=cykel&cg=0&w=1&st=s&c=&ca=11&is=1&l=0&md=th"]
def parse(self, response):
urls = response.css("h1.media-heading > a::attr(href)").extract()
for url in urls:
url = response.urljoin(url)
yield scrapy.Request(url=url, callback=self.parse_details)
#follow pagination links
next_page_url = response.xpath(u'//a[contains(text(), "Nästa")]/#href').extract()
if next_page_url:
next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(url=next_page_url, callback=self.parse)
def parse_details(self, response):
yield {
"Objekt": response.css("h1.h3::text").extract(),
"Säljare":response.css("li.mrl > strong > a::text").extract(),
"Uppladdad": response.css("li.mrl > time::text").extract(),
"Pris": response.css("div.h3::text").extract(),
"Område": response.css("span.area_label::text").extract(),
"Bild-URL": response.css("div.item > img::attr(src)").extract(),
}
Yes, scrapy needs the full URL, usually. But you can keep using urljoin() or using the response.follow() method:
next_page_url = response.xpath(u'//a[contains(text(), "Nästa")]/#href').extract()
if next_page_url:
yield response.follow(url=next_page_url, callback=self.parse)
More about this in Scrapy Tutorial.
Related
the next_page variable gives the correct link when used on shell and even when printed on Console but Scrapy still keeps scraping the same(first) page
code below:
class QuotesSpider(scrapy.Spider):
name = "Bider"
def start_requests(self):
urls = [
"https://www.flipkart.com/clothing-and-accessories/bottomwear/pr?sid=clo,vua&p[]=facets.ideal_for%255B%255D%3DMen&p[]=facets.ideal_for%255B%255D%3Dmen&otracker=categorytree&fm=neo%2Fmerchandising&iid=M_1064313a-7a8d-48f3-8199-daaf60d62ef6_2_372UD5BXDFYS_MC.8HARX8UX7IX5&otracker=hp_rich_navigation_2_2.navigationCard.RICH_NAVIGATION_Fashion~Men%2527s%2BBottom%2BWear_8HARX8UX7IX5&otracker1=hp_rich_navigation_PINNED_neo%2Fmerchandising_NA_NAV_EXPANDABLE_navigationCard_cc_2_L1_view-all&cid=8HARX8UX7IX5"
]
for url in urls:
yield scrapy.Request(url=url,callback=self.parse)
def parse(self, response):
browser=webdriver.Chrome()
browser.get(response.request.url)
next_page=response.css("a._1LKTO3::attr(href)").getall()
try:
next_page=next_page[-1]
except:
time.sleep(1)
next_page=response.css("a._1LKTO3::attr(href)").getall()
next_page=next_page[-1]
print("\n\n\n NEXT PAGE\n\n\n")
print("\n"+next_page+"\n")
print(response.urljoin(next_page))
if next_page is not None:
next_page=response.urljoin(next_page)
# yield scrapy.Request(url=next_page,callback=self.parse)
yield scrapy.Request(next_page, callback=self.parse)
Your code works for me so I'm not sure why it doesn't work for you. Anyway this pagination also works but it's cleaner.
import scrapy
from selenium import webdriver
class QuotesSpider(scrapy.Spider):
name = "Bider"
def start_requests(self):
urls = [
"https://www.flipkart.com/clothing-and-accessories/bottomwear/pr?sid=clo,vua&p[]=facets.ideal_for%255B%255D%3DMen&p[]=facets.ideal_for%255B%255D%3Dmen&otracker=categorytree&fm=neo%2Fmerchandising&iid=M_1064313a-7a8d-48f3-8199-daaf60d62ef6_2_372UD5BXDFYS_MC.8HARX8UX7IX5&otracker=hp_rich_navigation_2_2.navigationCard.RICH_NAVIGATION_Fashion~Men%2527s%2BBottom%2BWear_8HARX8UX7IX5&otracker1=hp_rich_navigation_PINNED_neo%2Fmerchandising_NA_NAV_EXPANDABLE_navigationCard_cc_2_L1_view-all&cid=8HARX8UX7IX5"
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
browser = webdriver.Chrome()
browser.get(response.request.url)
next_page = response.xpath('//a[span[text()="Next"]]/#href').get()
if next_page:
print("\n\n\n NEXT PAGE\n\n\n")
print("\n"+next_page+"\n")
next_page = response.urljoin(next_page)
print(next_page)
yield scrapy.Request(next_page, callback=self.parse)
I am trying to get some data from the website but my spider is not crawling to the next page even after a proper pagination link.
import scrapy
class NspiderSpider(scrapy.Spider):
name = "nspider"
allowed_domains = ["elimelechlab.yale.edu/"]
start_urls = ["https://elimelechlab.yale.edu/pub"]
def parse(self, response):
title = response.xpath(
'//*[#class="views-field views-field-title"]/span/text()'
).extract()
doi_link = response.xpath(
'//*[#class="views-field views-field-field-doi-link"]//a[1]/#href'
).extract()
yield {"paper_title": title, "doi_link": doi_link}
next_page = response.xpath(
'//*[#title="Go to next page"]/#href'
).extract_first() # extracting next page link
if next_page:
yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
PS: I don't want to use LinkExtractor.
Any help would be appreciated.
Nothing wrong with your next_page logic, code is just not reaching this because the yield for the item is in the same identation level. Try the following approach:
import scrapy
class NspiderSpider(scrapy.Spider):
name = "nspider"
allowed_domains = ["elimelechlab.yale.edu"]
start_urls = ["https://elimelechlab.yale.edu/pub"]
def parse(self, response):
for view in response.css('div.views-row'):
yield {
'paper_title': view.css('div.views-field-title span.field-content::text').get(),
'doi_link': view.css('div.views-field-field-doi-link div.field-content a::attr(href)').get()
}
next_page = response.xpath(
'//*[#title="Go to next page"]/#href'
).extract_first() # extracting next page link
if next_page:
yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
I have to get all reviews text and scores from page of a product and i managed to:
With adding manual link to page with single product review I get all reviews and scores from page(including other pages of review)
And to speed up this process I wanted to from categories page go to product page and get all reviews and scores after this is done proceed to another product.
import scrapy
class ReviewAutoSpider(scrapy.Spider):
name = 'automatic'
start_urls = ['https://www.ceneo.pl/Gry_bez_pradu']
def parse(self, response):
# follow links to website with review
for href in response.css('a.product-rewiews-link + a::attr(href)'):
yield response.follow(href, self.parse_link)
# follow pagination links
#for href in response.css('li.arrow-next a::attr(href)'):
# yield response.follow(href, self.parse)
def parse_link(self, response):
#get all reviews+score on page
for review in response.css('li.review-box'):
yield {
'score': review.css('span.review-score-count::text').get(),
'text': review.css('p.product-review-body::text').getall(),
}
# follow pagination links
for href in response.css('li.arrow-next a::attr(href)'):
yield response.follow(href, callback=self.parse)
Ok the following solution should work. The links you were getting only had the second part of the link , '/19838632', you need to use response.urljoin('/19838632') to get the full link.
Also the way the spider is currently setup you are going to be making a large number of requests to the site concurrently, therefore I would highly recommend using a proxy service.
`python
import scrapy
class ReviewAutoSpider(scrapy.Spider):
name = 'automatic'
start_urls = ['https://www.ceneo.pl/Gry_bez_pradu']
def parse(self, response):
# follow links to website with review
for href in response.css('a.product-rewiews-link + a::attr(href)'):
yield scrapy.Request(href, callback = self.parse)
for href in response.css('.cat-prod-row-name a::attr(href)').extract():
link = response.urljoin(href)
yield scrapy.Request(link, callback = self.parse)
next_page_link = response.css('li[class ="page-arrow arrow-next"] a::attr(href)').extract_first()
next_page_link = response.urljoin(next_page_link)
yield scrapy.Request(next_page_link, callback = self.parse)
def parse_link(self, response):
#get all reviews+score on page
for review in response.css('li.review-box'):
yield {
'score': review.css('span.review-score-count::text').get(),
'text': review.css('p.product-review-body::text').getall(),
}
# follow pagination links
for href in response.css('li.arrow-next a::attr(href)'):
yield scrapy.Request(href, callback = self.parse)
`
This question already has answers here:
Scrapy: scraping data from Pagination
(2 answers)
Closed 4 years ago.
I am trying to scrape data from a page and continue scraping following the pagination link.
The page I am trying to scrape is --> here
# -*- coding: utf-8 -*-
import scrapy
class AlibabaSpider(scrapy.Spider):
name = 'alibaba'
allowed_domains = ['alibaba.com']
start_urls = ['https://www.alibaba.com/catalog/agricultural-growing-media_cid144?page=1']
def parse(self, response):
for products in response.xpath('//div[contains(#class, "m-gallery-product-item-wrap")]'):
item = {
'product_name': products.xpath('.//h2/a/#title').extract_first(),
'price': products.xpath('.//div[#class="price"]/b/text()').extract_first('').strip(),
'min_order': products.xpath('.//div[#class="min-order"]/b/text()').extract_first(),
'company_name': products.xpath('.//div[#class="stitle util-ellipsis"]/a/#title').extract_first(),
'prod_detail_link': products.xpath('.//div[#class="item-img-inner"]/a/#href').extract_first(),
'response_rate': products.xpath('.//i[#class="ui2-icon ui2-icon-skip"]/text()').extract_first('').strip(),
#'image_url': products.xpath('.//div[#class=""]/').extract_first(),
}
yield item
#Follow the paginatin link
next_page_url = response.xpath('//link[#rel="next"]/#href').extract_first()
if next_page_url:
yield scrapy.Request(url=next_page_url, callback=self.parse)
Problem
The code is not able to follow the pagination link.
How can you help
Modify the code to follow the pagination link.
To get your code working, you need to fix the broken link by using response.follow() or something similar. Try the below approach.
import scrapy
class AlibabaSpider(scrapy.Spider):
name = 'alibaba'
allowed_domains = ['alibaba.com']
start_urls = ['https://www.alibaba.com/catalog/agricultural-growing-media_cid144?page=1']
def parse(self, response):
for products in response.xpath('//div[contains(#class, "m-gallery-product-item-wrap")]'):
item = {
'product_name': products.xpath('.//h2/a/#title').extract_first(),
'price': products.xpath('.//div[#class="price"]/b/text()').extract_first('').strip(),
'min_order': products.xpath('.//div[#class="min-order"]/b/text()').extract_first(),
'company_name': products.xpath('.//div[#class="stitle util-ellipsis"]/a/#title').extract_first(),
'prod_detail_link': products.xpath('.//div[#class="item-img-inner"]/a/#href').extract_first(),
'response_rate': products.xpath('.//i[#class="ui2-icon ui2-icon-skip"]/text()').extract_first('').strip(),
#'image_url': products.xpath('.//div[#class=""]/').extract_first(),
}
yield item
#Follow the paginatin link
next_page_url = response.xpath('//link[#rel="next"]/#href').extract_first()
if next_page_url:
yield response.follow(url=next_page_url, callback=self.parse)
Your pasted code was badly indented. I've fixed that as well.
It doesn't work because url isn't valid. If you want to keep using scrapy.Request, you could use:
next_page_url = response.xpath('//link[#rel="next"]/#href').extract_first()
if next_page_url:
next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(url=next_page_url, callback=self.parse)
A shorter solution:
next_page_url = response.xpath('//link[#rel="next"]/#href').extract_first()
if next_page_url:
yield response.follow(next_page_url)
i am using Splash 2.0.2 + Scrapy 1.0.5 + Scrapyjs 0.1.1 and im still not able to render javascript with a click. Here is an example url https://olx.pt/anuncio/loja-nova-com-250m2-garagem-em-box-fechada-para-arrumos-IDyTzAT.html#c49d3d94cf
I am still getting the page without the phone number rendered:
class OlxSpider(scrapy.Spider):
name = "olx"
rotate_user_agent = True
allowed_domains = ["olx.pt"]
start_urls = [
"https://olx.pt/imoveis/"
]
def parse(self, response):
script = """
function main(splash)
splash:go(splash.args.url)
splash:runjs('document.getElementById("contact_methods").getElementsByTagName("span")[1].click();')
splash:wait(0.5)
return splash:html()
end
"""
for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_house_contents, meta={
'splash': {
'args': {'lua_source': script},
'endpoint': 'execute',
}
})
for next_page in response.css('.pager .br3.brc8::attr(href)'):
url = response.urljoin(next_page.extract())
yield scrapy.Request(url, self.parse)
def parse_house_contents(self, response):
import ipdb;ipdb.set_trace()
how can i get this to work?
Add
splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")
to Lua script and it will work.
function main(splash)
splash:go(splash.args.url)
splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")
splash:runjs('document.getElementById("contact_methods").getElementsByTagName("span")[1].click();')
splash:wait(0.5)
return splash:html()
end
.click() is JQuery function https://api.jquery.com/click/
You can avoid having to use Splash in the first place and make the appropriate GET request to get the phone number yourself. Working spider:
import json
import re
import scrapy
class OlxSpider(scrapy.Spider):
name = "olx"
rotate_user_agent = True
allowed_domains = ["olx.pt"]
start_urls = [
"https://olx.pt/imoveis/"
]
def parse(self, response):
for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_house_contents)
for next_page in response.css('.pager .br3.brc8::attr(href)'):
url = response.urljoin(next_page.extract())
yield scrapy.Request(url, self.parse)
def parse_house_contents(self, response):
property_id = re.search(r"ID(\w+)\.", response.url).group(1)
phone_url = "https://olx.pt/ajax/misc/contact/phone/%s/" % property_id
yield scrapy.Request(phone_url, callback=self.parse_phone)
def parse_phone(self, response):
phone_number = json.loads(response.body)["value"]
print(phone_number)
If there are more things to extract from this "dynamic" website, see if Splash is really enough and, if not, look into browser automation and selenium.