How to crawl through links to the information I need - Python

I have to get the text and score of every review for a product, and so far I have managed this much:
When I manually add the link to a single product's review page, I get all the reviews and scores from it (including the other pages of reviews).
To speed this up, I want to start from the category page, go to a product page, collect all its reviews and scores, and once that is done move on to the next product.
import scrapy


class ReviewAutoSpider(scrapy.Spider):
    name = 'automatic'
    start_urls = ['https://www.ceneo.pl/Gry_bez_pradu']

    def parse(self, response):
        # follow links to website with review
        for href in response.css('a.product-rewiews-link + a::attr(href)'):
            yield response.follow(href, self.parse_link)

        # follow pagination links
        #for href in response.css('li.arrow-next a::attr(href)'):
        #    yield response.follow(href, self.parse)

    def parse_link(self, response):
        # get all reviews + scores on the page
        for review in response.css('li.review-box'):
            yield {
                'score': review.css('span.review-score-count::text').get(),
                'text': review.css('p.product-review-body::text').getall(),
            }
        # follow pagination links
        for href in response.css('li.arrow-next a::attr(href)'):
            yield response.follow(href, callback=self.parse)

OK, the following solution should work. The links you were getting only contained the last part of the URL, e.g. '/19838632'; you need to use response.urljoin('/19838632') to get the full link.
Also, the way the spider is currently set up you are going to be making a large number of concurrent requests to the site, so I would highly recommend using a proxy service.
import scrapy


class ReviewAutoSpider(scrapy.Spider):
    name = 'automatic'
    start_urls = ['https://www.ceneo.pl/Gry_bez_pradu']

    def parse(self, response):
        # follow links to the review pages
        for href in response.css('a.product-rewiews-link + a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_link)

        # follow links to the product pages
        for href in response.css('.cat-prod-row-name a::attr(href)').extract():
            link = response.urljoin(href)
            yield scrapy.Request(link, callback=self.parse)

        # follow category pagination links
        next_page_link = response.css('li[class="page-arrow arrow-next"] a::attr(href)').extract_first()
        if next_page_link:
            next_page_link = response.urljoin(next_page_link)
            yield scrapy.Request(next_page_link, callback=self.parse)

    def parse_link(self, response):
        # get all reviews + scores on the page
        for review in response.css('li.review-box'):
            yield {
                'score': review.css('span.review-score-count::text').get(),
                'text': review.css('p.product-review-body::text').getall(),
            }
        # follow review pagination links
        for href in response.css('li.arrow-next a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_link)
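If a proxy is not an option, you can also slow the spider down with Scrapy's built-in throttling settings. A minimal sketch, with illustrative values only (tune them for the target site), added as a custom_settings attribute on the spider:

import scrapy


class ReviewAutoSpider(scrapy.Spider):
    # ... name, start_urls and callbacks as above ...

    # illustrative values only - adjust for the target site
    custom_settings = {
        'CONCURRENT_REQUESTS': 4,      # Scrapy's default is 16
        'DOWNLOAD_DELAY': 1.0,         # seconds between requests to the same domain
        'AUTOTHROTTLE_ENABLED': True,  # let Scrapy adapt the delay to server response times
    }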

Related

Failed to retrieve product listings pages from a few categories

From this webpage I am trying to get the kind of links where different products are located. There are 6 categories with a More info button; when I traverse them recursively, I usually reach the target pages. This is one such product listings page I wish to get.
Please note that some of these pages have both product listings and More info buttons, which is why I fail to capture the product listings pages accurately.
The current spider looks like the following (it fails to grab lots of product listings pages):
import scrapy


class NorgrenSpider(scrapy.Spider):
    name = 'norgren'
    start_urls = ['https://www.norgren.com/de/en/list']

    def start_requests(self):
        for start_url in self.start_urls:
            yield scrapy.Request(start_url, callback=self.parse)

    def parse(self, response):
        link_list = []

        for item in response.css(".match-height a.more-info::attr(href)").getall():
            if not "/detail/" in item:
                inner_page_link = response.urljoin(item)
                link_list.append(inner_page_link)
                yield {"target_url": inner_page_link}

        for new_link in link_list:
            yield scrapy.Request(new_link, callback=self.parse)
Expected output (a few random examples):
https://www.norgren.com/de/en/list/directional-control-valves/in-line-and-manifold-valves
https://www.norgren.com/de/en/list/pressure-switches/electro-mechanical-pressure-switches
https://www.norgren.com/de/en/list/pressure-switches/electronic-pressure-switches
https://www.norgren.com/de/en/list/directional-control-valves/sub-base-valves
https://www.norgren.com/de/en/list/directional-control-valves/non-return-valves
https://www.norgren.com/de/en/list/directional-control-valves/valve-islands
https://www.norgren.com/de/en/list/air-preparation/combination-units-frl
How to get all the product listings pages from the six categories?
import scrapy


class NorgrenSpider(scrapy.Spider):
    name = 'norgren'
    start_urls = ['https://www.norgren.com/de/en/list']

    def start_requests(self):
        for start_url in self.start_urls:
            yield scrapy.Request(start_url)

    def parse(self, response):
        # check if there are items in the page
        if response.xpath('//div[contains(@class, "item-list")]//div[@class="buttons"]/div[@class="more-information"]/a/@href'):
            yield scrapy.Request(url=response.url, callback=self.get_links, dont_filter=True)

        # follow "more info" buttons
        for url in response.xpath('//a[text()="More info"]/@href').getall():
            yield response.follow(url)

    def get_links(self, response):
        yield {"target_url": response.url}
        next_page = response.xpath('//a[@class="next-button"]/@href').get()
        if next_page:
            yield response.follow(url=next_page, callback=self.get_links)
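Note that the request back to response.url in parse() only goes through because of dont_filter=True; Scrapy's duplicate filter would otherwise drop a second request to a URL that has already been crawled.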
Maybe filter only pages that have at least one link to details? Here is an example of how to identify if a page meets the criteria you are searching for:
import scrapy


class NorgrenSpider(scrapy.Spider):
    name = 'norgren'
    start_urls = ['https://www.norgren.com/de/en/list']

    def start_requests(self):
        for start_url in self.start_urls:
            yield scrapy.Request(start_url, callback=self.parse)

    def parse(self, response):
        link_list = []

        more_info_items = response.css(
            ".match-height a.more-info::attr(href)").getall()

        detail_items = [item for item in more_info_items if '/detail/' in item]
        if len(detail_items) > 0:
            print(f'This is a link you are searching for: {response.url}')

        for item in more_info_items:
            if not "/detail/" in item:
                inner_page_link = response.urljoin(item)
                link_list.append(inner_page_link)
                yield {"target_url": inner_page_link}

        for new_link in link_list:
            yield scrapy.Request(new_link, callback=self.parse)
I only printed the link to the console, but you can log it wherever you need.
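For example, using the spider's built-in logger instead of print (the message text is just an illustration):

        detail_items = [item for item in more_info_items if '/detail/' in item]
        if detail_items:
            # log the listings page URL instead of printing it
            self.logger.info('Product listings page found: %s', response.url)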

next page crawl in Scrapy

I am trying to get some data from the website, but my spider is not crawling to the next page even though I have what looks like a proper pagination link.
import scrapy


class NspiderSpider(scrapy.Spider):
    name = "nspider"
    allowed_domains = ["elimelechlab.yale.edu/"]
    start_urls = ["https://elimelechlab.yale.edu/pub"]

    def parse(self, response):
        title = response.xpath(
            '//*[@class="views-field views-field-title"]/span/text()'
        ).extract()
        doi_link = response.xpath(
            '//*[@class="views-field views-field-field-doi-link"]//a[1]/@href'
        ).extract()
        yield {"paper_title": title, "doi_link": doi_link}

        next_page = response.xpath(
            '//*[@title="Go to next page"]/@href'
        ).extract_first()  # extracting next page link
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
PS: I don't want to use LinkExtractor.
Any help would be appreciated.
Nothing is wrong with your next_page logic; the code is just not reaching it because the yield for the item is at the same indentation level. Try the following approach:
import scrapy


class NspiderSpider(scrapy.Spider):
    name = "nspider"
    allowed_domains = ["elimelechlab.yale.edu"]
    start_urls = ["https://elimelechlab.yale.edu/pub"]

    def parse(self, response):
        for view in response.css('div.views-row'):
            yield {
                'paper_title': view.css('div.views-field-title span.field-content::text').get(),
                'doi_link': view.css('div.views-field-field-doi-link div.field-content a::attr(href)').get()
            }

        next_page = response.xpath(
            '//*[@title="Go to next page"]/@href'
        ).extract_first()  # extracting next page link
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
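As a side note, .extract_first() still works but is the older spelling; current Scrapy code usually writes the same call with .get() (and .extract() as .getall()):

        next_page = response.xpath('//*[@title="Go to next page"]/@href').get()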

Scrapy - Can not do multiple callbacks

I am having problems going through multiple pages. Here is my Scrapy spider class, called quotes.
class quotes(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://books.toscrape.com/?']

    def parse(self, response):
        all_links = response.css('.nav-list ul li')
        for links in all_links:
            link = links.css('a::attr(href)').get()
            yield response.follow(link, callback=self.books_detail)

    def books_detail(self, response):
        yas = {
            'title': [],
            'price': [],
            'availability': [],
            'category': []
        }
        yas['category'].append(response.css('h1::text').extract())

        all_divs = response.css('.col-lg-3')
        for div in all_divs:
            link = div.css('.product_pod a::attr(href)').get()
            title = response.follow(link, callback=self.get_title)
            yas['price'].append(div.css('.price_color::text').extract())
            yas['availability'].append(div.css('.availability::text')[1].extract())
        yield yas

    def get_title(self, response):
        print('testing')
        title = response.css('h1::text').extract()
        yield {"title": title}
So I use response.follow to go to the books_detail function, and in that function I again call response.follow to reach get_title. I want the 'title' from get_title and the rest of the details from the main page.
I can scrape the information just fine from the books_detail function, and I can also get the link to the title page just fine from this line:
link = div.css('.product_pod a::attr(href)').get()
But using response.follow I cannot get to the get_title function.
Any help would be appreciated. Thanks.
You should yield the request, not run it directly, and use meta= to send data to the next parser:
yield response.follow(link, callback=self.get_title, meta={'item': yas})
and in the next parser you can get it back:
yas = response.meta['item']
and then you can add new values and yield all the data:
yas["title"] = response.css('h1::text').extract()
yield yas
See another example in Scrapy yield items from multiple requests.
Doc: Request and Response, Request.meta special keys
Below is minimal working code which you can put in one file and run as a normal script (python script.py) without creating a project.
There are some other changes:
You shouldn't put all books into one list but yield every book separately. Scrapy keeps all results, and when you use the option to save to CSV it will save all of them.
For every book you should create a new dictionary. If you use the same dictionary many times then it will overwrite data and you may get many results with the same data.
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        all_links = response.css('.nav-list ul li')
        for links in all_links:
            link = links.css('a::attr(href)').get()
            yield response.follow(link, callback=self.books_detail)

    def books_detail(self, response):
        all_divs = response.css('.col-lg-3')
        for div in all_divs:
            # every book in a separate dictionary - and it has to be a new dictionary,
            # because reusing the same one could overwrite old data
            book = {
                'category': response.css('h1::text').extract(),
                'price': div.css('.price_color::text').extract()[0].strip(),
                'availability': div.css('.availability::text')[1].extract().strip(),
            }
            link = div.css('.product_pod a::attr(href)').get()
            yield response.follow(link, callback=self.get_title, meta={'item': book})

    def get_title(self, response):
        book = response.meta['item']
        print('testing:', response.url)
        book["title"] = response.css('h1::text').extract()[0].strip()
        yield book


# --- run without a project and save results in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv',
})
c.crawl(QuotesSpider)
c.start()
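If you are on a newer Scrapy release (2.1+), FEED_FORMAT/FEED_URI have been replaced by the FEEDS setting; the equivalent configuration would look roughly like this:

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # newer Scrapy versions configure exports through FEEDS
    'FEEDS': {'output.csv': {'format': 'csv'}},
})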

Scrapy spider won't jump to next page

I'm building a scraper with Scrapy for the Swedish e-commerce site Blocket.se.
It scrapes the first page as it should, but it won't jump to the next one.
The expression for the next URL,
response.xpath(u'//a[contains(text(), "Nästa")]/@href').extract()
outputs an "incomplete" link when I try it in Scrapy shell:
?q=cykel&cg=0&w=1&st=s&c=&ca=11&l=0&md=th&o=2
Does it have to be a "full" link to work, like this?
https://www.blocket.se/stockholm?q=cykel&cg=0&w=1&st=s&c=&ca=11&l=0&md=th&o=2
Starting-url: https://www.blocket.se/stockholm?q=cykel&cg=0&w=1&st=s&c=&ca=11&is=1&l=0&md=th
Full code:
import scrapy


class BlocketSpider(scrapy.Spider):
    name = "blocket"
    start_urls = ["https://www.blocket.se/stockholm?q=cykel&cg=0&w=1&st=s&c=&ca=11&is=1&l=0&md=th"]

    def parse(self, response):
        urls = response.css("h1.media-heading > a::attr(href)").extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url=url, callback=self.parse_details)

        # follow pagination links
        next_page_url = response.xpath(u'//a[contains(text(), "Nästa")]/@href').extract()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        yield {
            "Objekt": response.css("h1.h3::text").extract(),
            "Säljare": response.css("li.mrl > strong > a::text").extract(),
            "Uppladdad": response.css("li.mrl > time::text").extract(),
            "Pris": response.css("div.h3::text").extract(),
            "Område": response.css("span.area_label::text").extract(),
            "Bild-URL": response.css("div.item > img::attr(src)").extract(),
        }
Yes, Scrapy usually needs the full URL. But you can keep using urljoin(), or use the response.follow() method, which accepts relative URLs (note that you want a single URL here, so use extract_first() rather than extract()):

next_page_url = response.xpath(u'//a[contains(text(), "Nästa")]/@href').extract_first()
if next_page_url:
    yield response.follow(url=next_page_url, callback=self.parse)

More about this in the Scrapy Tutorial.
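In recent Scrapy versions (1.4+), response.follow can also take the <a> selector directly, so you don't have to extract the href at all:

next_page = response.xpath(u'//a[contains(text(), "Nästa")]')
if next_page:
    yield response.follow(next_page[0], callback=self.parse)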

Scrape nested URLs using Scrapy

I am trying to scrape this web page:
https://www.grohe.com/in/7780/bathroom/bathroom-faucets/essence/
I have tried different ways, but every time it gives me a syntax error. I don't know much Python or Scrapy. Can anyone help me?
My requirements are:
In the header section of the page, there is a background image, some description and 2 product-related images.
In the Product Range section there are a number of images. I would like to go through all of them and scrape the individual product details.
The structure is like this:
Here is my code so far:
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "plumber"
    start_urls = [
        'https://www.grohe.com/in/7780/bathroom/bathroom-faucets/essence/',
    ]

    def parse(self, response):
        for divs in response.css('div#product-variants div.viewport div.workspace div.float-box'):
            yield {
                #response.css('div#product-variants a::attr(href)').extract()
                'producturl': divs.css('a::attr(href)').extract(),
                'imageurl': divs.css('a img::attr(src)').extract(),
                'description': divs.css('a div.text::text').extract() + divs.css('a span.nowrap::text').extract(),
                next_page = producturl
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse)
            }
You should take the next_page yield out of your item dictionary.
In general you can iterate through the products, build up an item, and carry it over in your request's meta parameter, like so:
def parse(self, response):
    for divs in response.css('div#product-variants div.viewport div.workspace div.float-box'):
        item = {'producturl': divs.css('a::attr(href)').extract_first(),
                'imageurl': divs.css('a img::attr(src)').extract(),
                'description': divs.css('a div.text::text').extract() + divs.css('a span.nowrap::text').extract()}
        next_page = response.urljoin(item['producturl'])
        yield scrapy.Request(next_page, callback=self.parse_page, meta={'item': item})

def parse_page(self, response):
    """This is the individual product page"""
    item = response.meta['item']
    item['something_new'] = 'some_value'
    return item
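On Scrapy 1.7 or newer, cb_kwargs is the preferred way to hand data to a callback; the same carry-over would look roughly like this, keeping the item-building code from above:

def parse(self, response):
    for divs in response.css('div#product-variants div.viewport div.workspace div.float-box'):
        # ... build item as above ...
        next_page = response.urljoin(item['producturl'])
        yield scrapy.Request(next_page, callback=self.parse_page, cb_kwargs={'item': item})

def parse_page(self, response, item):
    """This is the individual product page"""
    item['something_new'] = 'some_value'
    return item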
