next page crawl in Scrapy - python

I am trying to get some data from a website, but my spider is not crawling to the next page even though there is a proper pagination link.
import scrapy


class NspiderSpider(scrapy.Spider):
    name = "nspider"
    allowed_domains = ["elimelechlab.yale.edu/"]
    start_urls = ["https://elimelechlab.yale.edu/pub"]

    def parse(self, response):
        title = response.xpath(
            '//*[@class="views-field views-field-title"]/span/text()'
        ).extract()
        doi_link = response.xpath(
            '//*[@class="views-field views-field-field-doi-link"]//a[1]/@href'
        ).extract()

        yield {"paper_title": title, "doi_link": doi_link}

        next_page = response.xpath(
            '//*[@title="Go to next page"]/@href'
        ).extract_first()  # extracting next page link

        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
PS: I don't want to use LinkExtractor.
Any help would be appreciated.

There is nothing wrong with your next_page logic; the code just isn't reaching it because the yield for the item is at the same indentation level. Try the following approach:
import scrapy


class NspiderSpider(scrapy.Spider):
    name = "nspider"
    allowed_domains = ["elimelechlab.yale.edu"]
    start_urls = ["https://elimelechlab.yale.edu/pub"]

    def parse(self, response):
        for view in response.css('div.views-row'):
            yield {
                'paper_title': view.css('div.views-field-title span.field-content::text').get(),
                'doi_link': view.css('div.views-field-field-doi-link div.field-content a::attr(href)').get()
            }

        next_page = response.xpath(
            '//*[@title="Go to next page"]/@href'
        ).extract_first()  # extracting next page link

        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
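Besides moving the item yield into a per-row loop, note that this answer also drops the trailing slash from allowed_domains: that setting expects bare domain names rather than URLs, and a stray "/" can make the domain check fail so that the offsite middleware filters out the pagination requests.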

Related

Scrapy doesn't go to the next url

The next_page variable gives the correct link when used in the shell, and even when printed to the console, but Scrapy still keeps scraping the same (first) page.
Code below:
import time

import scrapy
from selenium import webdriver


class QuotesSpider(scrapy.Spider):
    name = "Bider"

    def start_requests(self):
        urls = [
            "https://www.flipkart.com/clothing-and-accessories/bottomwear/pr?sid=clo,vua&p[]=facets.ideal_for%255B%255D%3DMen&p[]=facets.ideal_for%255B%255D%3Dmen&otracker=categorytree&fm=neo%2Fmerchandising&iid=M_1064313a-7a8d-48f3-8199-daaf60d62ef6_2_372UD5BXDFYS_MC.8HARX8UX7IX5&otracker=hp_rich_navigation_2_2.navigationCard.RICH_NAVIGATION_Fashion~Men%2527s%2BBottom%2BWear_8HARX8UX7IX5&otracker1=hp_rich_navigation_PINNED_neo%2Fmerchandising_NA_NAV_EXPANDABLE_navigationCard_cc_2_L1_view-all&cid=8HARX8UX7IX5"
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        browser = webdriver.Chrome()
        browser.get(response.request.url)

        next_page = response.css("a._1LKTO3::attr(href)").getall()
        try:
            next_page = next_page[-1]
        except IndexError:
            time.sleep(1)
            next_page = response.css("a._1LKTO3::attr(href)").getall()
            next_page = next_page[-1]

        print("\n\n\n NEXT PAGE\n\n\n")
        print("\n" + next_page + "\n")
        print(response.urljoin(next_page))

        if next_page is not None:
            next_page = response.urljoin(next_page)
            # yield scrapy.Request(url=next_page, callback=self.parse)
            yield scrapy.Request(next_page, callback=self.parse)
Your code works for me, so I'm not sure why it doesn't work for you. In any case, this pagination also works and is cleaner:
import scrapy
from selenium import webdriver


class QuotesSpider(scrapy.Spider):
    name = "Bider"

    def start_requests(self):
        urls = [
            "https://www.flipkart.com/clothing-and-accessories/bottomwear/pr?sid=clo,vua&p[]=facets.ideal_for%255B%255D%3DMen&p[]=facets.ideal_for%255B%255D%3Dmen&otracker=categorytree&fm=neo%2Fmerchandising&iid=M_1064313a-7a8d-48f3-8199-daaf60d62ef6_2_372UD5BXDFYS_MC.8HARX8UX7IX5&otracker=hp_rich_navigation_2_2.navigationCard.RICH_NAVIGATION_Fashion~Men%2527s%2BBottom%2BWear_8HARX8UX7IX5&otracker1=hp_rich_navigation_PINNED_neo%2Fmerchandising_NA_NAV_EXPANDABLE_navigationCard_cc_2_L1_view-all&cid=8HARX8UX7IX5"
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        browser = webdriver.Chrome()
        browser.get(response.request.url)

        next_page = response.xpath('//a[span[text()="Next"]]/@href').get()
        if next_page:
            print("\n\n\n NEXT PAGE\n\n\n")
            print("\n" + next_page + "\n")
            next_page = response.urljoin(next_page)
            print(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
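One side note on both versions: every call to parse opens a new Chrome window via webdriver.Chrome() whose page source is never actually used for parsing, and which is never closed. If you keep the Selenium call, add browser.quit() after browser.get(...) so the browser processes don't accumulate.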

Failed to retrieve product listings pages from a few categories

From this webpage I am trying to get the kind of links under which the different products are located. There are 6 categories with a More info button; when I traverse them recursively, I usually reach the target pages. This is one such product listings page I wish to get.
Please note that some of these pages have both product listings and More info buttons, which is why I fail to capture the product listings pages accurately.
The current spider looks like the following (it fails to grab lots of product listings pages):
import scrapy


class NorgrenSpider(scrapy.Spider):
    name = 'norgren'
    start_urls = ['https://www.norgren.com/de/en/list']

    def start_requests(self):
        for start_url in self.start_urls:
            yield scrapy.Request(start_url, callback=self.parse)

    def parse(self, response):
        link_list = []

        for item in response.css(".match-height a.more-info::attr(href)").getall():
            if not "/detail/" in item:
                inner_page_link = response.urljoin(item)
                link_list.append(inner_page_link)
                yield {"target_url": inner_page_link}

        for new_link in link_list:
            yield scrapy.Request(new_link, callback=self.parse)
Expected output (randomly taken):
https://www.norgren.com/de/en/list/directional-control-valves/in-line-and-manifold-valves
https://www.norgren.com/de/en/list/pressure-switches/electro-mechanical-pressure-switches
https://www.norgren.com/de/en/list/pressure-switches/electronic-pressure-switches
https://www.norgren.com/de/en/list/directional-control-valves/sub-base-valves
https://www.norgren.com/de/en/list/directional-control-valves/non-return-valves
https://www.norgren.com/de/en/list/directional-control-valves/valve-islands
https://www.norgren.com/de/en/list/air-preparation/combination-units-frl
How to get all the product listings pages from the six categories?
import scrapy


class NorgrenSpider(scrapy.Spider):
    name = 'norgren'
    start_urls = ['https://www.norgren.com/de/en/list']

    def start_requests(self):
        for start_url in self.start_urls:
            yield scrapy.Request(start_url)

    def parse(self, response):
        # check if there are items in the page
        if response.xpath('//div[contains(@class, "item-list")]//div[@class="buttons"]/div[@class="more-information"]/a/@href'):
            yield scrapy.Request(url=response.url, callback=self.get_links, dont_filter=True)

        # follow "more info" buttons
        for url in response.xpath('//a[text()="More info"]/@href').getall():
            yield response.follow(url)

    def get_links(self, response):
        yield {"target_url": response.url}
        next_page = response.xpath('//a[@class="next-button"]/@href').get()
        if next_page:
            yield response.follow(url=next_page, callback=self.get_links)
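Note the dont_filter=True here: response.url has already been visited once, so without that flag Scrapy's duplicate request filter would silently drop the second request that routes the same page to get_links.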
Maybe filter only pages that have at least one link to details? Here is an example of how to identify if a page meets the criteria you are searching for:
import scrapy


class NorgrenSpider(scrapy.Spider):
    name = 'norgren'
    start_urls = ['https://www.norgren.com/de/en/list']

    def start_requests(self):
        for start_url in self.start_urls:
            yield scrapy.Request(start_url, callback=self.parse)

    def parse(self, response):
        link_list = []

        more_info_items = response.css(
            ".match-height a.more-info::attr(href)").getall()
        detail_items = [item for item in more_info_items if '/detail/' in item]
        if len(detail_items) > 0:
            print(f'This is a link you are searching for: {response.url}')

        for item in more_info_items:
            if not "/detail/" in item:
                inner_page_link = response.urljoin(item)
                link_list.append(inner_page_link)
                yield {"target_url": inner_page_link}

        for new_link in link_list:
            yield scrapy.Request(new_link, callback=self.parse)
I only printed the link to the console, but you can figure out how to log it wherever you need.
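As a minimal sketch of the logging variant (same selectors as above, only the print call swapped for the spider's built-in logger):

    def parse(self, response):
        more_info_items = response.css(".match-height a.more-info::attr(href)").getall()
        detail_items = [item for item in more_info_items if '/detail/' in item]
        if detail_items:
            # self.logger is available on every Scrapy spider and goes through
            # the normal logging configuration instead of stdout
            self.logger.info("Product listings page found: %s", response.url)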

Scraping websites with Scrapy

I have this code:
import scrapy
import requests


class cvbankas(scrapy.Spider):
    name = 'bankas'
    allowed_domains = ['cvbankas.lt']
    start_urls = ['https://www.cvbankas.lt/']

    def parse(self, response):
        job_position_tag = response.css("h3.list_h3::text").extract()
        city_tag = response.css("span.list_city::text").extract()
        company_tag = response.css("span.dib.mt5::text").extract()
        salary_tag = response.css("span.salary_amount::text").extract()

        for item in zip(job_position_tag, city_tag, company_tag, salary_tag):
            scraped_info = {
                'company': company_tag,
                'city': city_tag,
                'position': job_position_tag,
                'salary': salary_tag,
            }
            yield scraped_info

        next_page = response.css('li > a::attr(href)').extract_first()
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(url=next_page, callback=self.parse)
I don't know why it scrapes only 3 pages.
The output (marked in red) covers only 3 of the 88 pages.
Where is the problem in the pagination?
Your selector was finding the first <a> tag it could find, which was the language <a> tag. You were changing languages, not pages.
import scrapy
import requests


class cvbankas(scrapy.Spider):
    name = 'bankas'
    allowed_domains = ['cvbankas.lt']
    start_urls = ['https://www.cvbankas.lt/']

    def parse(self, response):
        job_position_tag = response.css("h3.list_h3::text").extract()
        city_tag = response.css("span.list_city::text").extract()
        company_tag = response.css("span.dib.mt5::text").extract()
        salary_tag = response.css("span.salary_amount::text").extract()

        for item in zip(job_position_tag, city_tag, company_tag, salary_tag):
            scraped_info = {
                'company': company_tag,
                'city': city_tag,
                'position': job_position_tag,
                'salary': salary_tag,
            }
            yield scraped_info

        next_page = response.xpath('//a[@class="prev_next"]/@href').extract()[-1]
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(url=next_page, callback=self.parse)
It looks like the website that you are scraping uses the URL format uri?page=x.
A simple loop replacing x can solve your problem.
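For example, a minimal sketch of that idea, assuming the listing pages are reachable as https://www.cvbankas.lt/?page=N and that there are 88 of them (both assumptions taken from the question and the answer above, not verified):

import scrapy


class cvbankas(scrapy.Spider):
    name = 'bankas'
    allowed_domains = ['cvbankas.lt']
    # build every listing-page URL up front; the ?page=N format and the
    # total of 88 pages are assumptions from the question and answer above
    start_urls = [f'https://www.cvbankas.lt/?page={n}' for n in range(1, 89)]

    def parse(self, response):
        job_position_tag = response.css("h3.list_h3::text").extract()
        city_tag = response.css("span.list_city::text").extract()
        company_tag = response.css("span.dib.mt5::text").extract()
        salary_tag = response.css("span.salary_amount::text").extract()
        for position, city, company, salary in zip(job_position_tag, city_tag, company_tag, salary_tag):
            yield {
                'company': company,
                'city': city,
                'position': position,
                'salary': salary,
            }

This also yields one item per row instead of repeating the full lists for every row.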

How to crawl through a link to the information I need

I have to get all the review texts and scores from a product's page, and so far I have managed the following:
When I add the link to a single product's review page manually, I get all the reviews and scores from that page (including the further pages of reviews).
To speed this process up, I want to start from the category page, go to a product page, collect all of its reviews and scores, and once that is done proceed to the next product.
import scrapy


class ReviewAutoSpider(scrapy.Spider):
    name = 'automatic'
    start_urls = ['https://www.ceneo.pl/Gry_bez_pradu']

    def parse(self, response):
        # follow links to website with review
        for href in response.css('a.product-rewiews-link + a::attr(href)'):
            yield response.follow(href, self.parse_link)

        # follow pagination links
        #for href in response.css('li.arrow-next a::attr(href)'):
        #    yield response.follow(href, self.parse)

    def parse_link(self, response):
        # get all reviews + scores on the page
        for review in response.css('li.review-box'):
            yield {
                'score': review.css('span.review-score-count::text').get(),
                'text': review.css('p.product-review-body::text').getall(),
            }

        # follow pagination links
        for href in response.css('li.arrow-next a::attr(href)'):
            yield response.follow(href, callback=self.parse)
OK, the following solution should work. The links you were getting only contained the second part of the URL, '/19838632'; you need to use response.urljoin('/19838632') to get the full link.
Also, the way the spider is currently set up you are going to be making a large number of requests to the site concurrently, so I would highly recommend using a proxy service (a sketch of one way to set a proxy per request follows the code below).
import scrapy


class ReviewAutoSpider(scrapy.Spider):
    name = 'automatic'
    start_urls = ['https://www.ceneo.pl/Gry_bez_pradu']

    def parse(self, response):
        # follow links to the review pages (urljoin turns '/19838632' into a full URL)
        for href in response.css('a.product-rewiews-link + a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_link)

        # follow links to the product pages in the category
        for href in response.css('.cat-prod-row-name a::attr(href)').extract():
            link = response.urljoin(href)
            yield scrapy.Request(link, callback=self.parse)

        # follow pagination of the category listing
        next_page_link = response.css('li[class ="page-arrow arrow-next"] a::attr(href)').extract_first()
        if next_page_link:
            next_page_link = response.urljoin(next_page_link)
            yield scrapy.Request(next_page_link, callback=self.parse)

    def parse_link(self, response):
        # get all reviews + scores on the page
        for review in response.css('li.review-box'):
            yield {
                'score': review.css('span.review-score-count::text').get(),
                'text': review.css('p.product-review-body::text').getall(),
            }

        # follow pagination of the reviews
        for href in response.css('li.arrow-next a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_link)
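Regarding the proxy recommendation, a minimal sketch of attaching a per-request proxy in Scrapy (the proxy URL below is a placeholder, not a real service):

    def start_requests(self):
        for url in self.start_urls:
            # HttpProxyMiddleware, enabled by default, reads request.meta['proxy']
            yield scrapy.Request(
                url,
                callback=self.parse,
                meta={'proxy': 'http://user:password@proxy.example.com:8000'},
            )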

Scrapy - scrape all of the items instead of 1 item

I need to scrape all of the items, but only 1 item is scraped.
My code was working fine before, but when I transferred it to another project (the same code), this happens and I don't know why.
I need to get all of the items according to the page size in start_url.
Here's my code:
import scrapy


class HmSalesitemSpider(scrapy.Spider):
    name = 'HM_salesitem'
    allowed_domains = ['www2.hm.com']
    start_urls = ['https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=3002']

    def parse(self, response):
        for product_item in response.css('li.product-item'):
            url = "https://www2.hm.com/" + product_item.css('a::attr(href)').extract_first()
        yield scrapy.Request(url=url, callback=self.parse_subpage)

    def parse_subpage(self, response):
        item = {
            'title': response.xpath("normalize-space(.//h1[contains(@class, 'primary') and contains(@class, 'product-item-headline')]/text())").extract_first(),
            'sale-price': response.xpath("normalize-space(.//span[@class='price-value']/text())").extract_first(),
            'regular-price': response.xpath('//script[contains(text(), "whitePrice")]/text()').re_first(r"'whitePrice'\s?:\s?'([^']+)'"),
            'photo-url': response.css('div.product-detail-main-image-container img::attr(src)').extract_first(),
            'description': response.css('p.pdp-description-text::text').extract_first()
        }
        yield item
Please help. Thank you.
It seems you have a problem with indentation. Move the request yield into the for loop:
def parse(self, response):
    for product_item in response.css('li.product-item'):
        url = "https://www2.hm.com/" + product_item.css('a::attr(href)').get()
        yield scrapy.Request(url=url, callback=self.parse_subpage)
Or here is a slightly cleaner version:
def parse(self, response):
    for link in response.css('li.product-item a::attr(href)').extract():
        yield response.follow(link, self.parse_subpage)
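response.follow resolves relative URLs against the current response (and also accepts selectors), so the manual "https://www2.hm.com/" + ... concatenation from the original code is not needed here.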
