Scrapy - scrape of all of the item instead of 1 item - python

I need to scrape all of the items but only 1 item is scrape.
My code is working fine before but when I transfer it to other project which is same code this happens I don't know why
I need to get all of the items according to the page size in start_url
here's my working code
class HmSalesitemSpider(scrapy.Spider):
name = 'HM_salesitem'
allowed_domains = ['www2.hm.com']
start_urls = ['https://www2.hm.com/en_us/sale/shopbyproductladies/view-
all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-
size=3002']
def parse(self, response):
for product_item in response.css('li.product-item'):
url = "https://www2.hm.com/" + product_item.css('a::attr(href)').extract_first()
yield scrapy.Request(url=url, callback=self.parse_subpage)
def parse_subpage(self, response):
item = {
'title': response.xpath("normalize-space(.//h1[contains(#class, 'primary') and contains(#class, 'product-item-headline')]/text())").extract_first(),
'sale-price': response.xpath("normalize-space(.//span[#class='price-value']/text())").extract_first(),
'regular-price': response.xpath('//script[contains(text(), "whitePrice")]/text()').re_first("'whitePrice'\s?:\s?'([^']+)'"),
'photo-url': response.css('div.product-detail-main-image-container img::attr(src)').extract_first(),
'description': response.css('p.pdp-description-text::text').extract_first()
}
yield item
Please Help. Thank you

It seems you have problem with indents. Move yielding request to for loop:
def parse(self, response):
for product_item in response.css('li.product-item'):
url = "https://www2.hm.com/" + product_item.css('a::attr(href)').get()
yield scrapy.Request(url=url, callback=self.parse_subpage)
Or this is a bit cleared version:
def parse(self, response):
for link in response.css('li.product-item a::attr(href)').extract():
yield response.follow(link, self.parse_subpage)

Related

scrapy-splash CrawlSpider

I tried to scraping the driver names through scrapy-splash CrawlSpider on this site, but I constantly come across errors. After searching for ways to solve the problem, I came across github and just copied the latest code.
start_urls = ['http://www.huananzhi.com/html/1/184/185/index.html']
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url,callback=self.parse_item, args={'wait': 0.5}, meta={'real_url': url})
def _requests_to_follow(self, response):
if not isinstance(
response,
(HtmlResponse, SplashJsonResponse, SplashTextResponse)):
return
seen = set()
for n, rule in enumerate(self._rules):
links = [lnk for lnk in rule.link_extractor.extract_links(response)
if lnk not in seen]
if links and rule.process_links:
links = rule.process_links(links)
for link in links:
seen.add(link)
r = self._build_request(n, link)
yield rule.process_request(r, response)
def use_splash(self, request, response):
request.meta.update(splash={
'args': {
'wait': 15,
},
'endpoint': 'render.html',
})
return request
linkRule = LinkExtractor(restrict_xpaths='//article /div[1]/div[1]/div[2]/a[1]')
itemRule = Rule(linkRule, callback='parse_item', follow=True, process_request='use_splash')
rules = (
itemRule,
)
def parse_item(self, response):
item = HuananzhiItem()
item['name'] = response.xpath("//div[#class='tab-content']//div[1]/h2/text()").get()
yield item
It didn't work so I tried using scrapy.Spider
def start_requests(self):
url = 'http://www.huananzhi.com/html/1/184/185/index.html'
yield SplashRequest(url=url, callback=self.parse)
def parse(self, response):
links = response.xpath('//article/div[1]/div[1]/div[2]/a[1]/#href')
for link in links:
yield SplashRequest(url=link, callback=self.parse_item)
next_page = response.xpath('//section//li[4]//a[1]')
yield from response.follow(next_page, self.parse)
def parse_item(self, response):
item = HuananzhiItem()
item['name'] = response.xpath("//div[#class='tab-content']//div[1]/h2/text()").get()
yield item
i also use scrapy-user-agent
can anyone tell me how to get the item ? Sorry for such a stupid question, I'm a beginner
Thanks

Failed to retrieve product listings pages from few categories

From this webpage I am trying to get that kind of link where different products are located. There are 6 categories having More info button which when I traverse recursively, I usually reach the target pages. This is one such product listings page I wish to get.
Please note that some of these pages have both product listing and more info buttons, which is why I failed to capture the product listing pages accurately.
Current spider looks like the following (fails to grab lots of product listings pages):
import scrapy
class NorgrenSpider(scrapy.Spider):
name = 'norgren'
start_urls = ['https://www.norgren.com/de/en/list']
def start_requests(self):
for start_url in self.start_urls:
yield scrapy.Request(start_url, callback=self.parse)
def parse(self, response):
link_list = []
for item in response.css(".match-height a.more-info::attr(href)").getall():
if not "/detail/" in item:
inner_page_link = response.urljoin(item)
link_list.append(inner_page_link)
yield {"target_url":inner_page_link}
for new_link in link_list:
yield scrapy.Request(new_link, callback=self.parse)
Expected output (randomly taken):
https://www.norgren.com/de/en/list/directional-control-valves/in-line-and-manifold-valves
https://www.norgren.com/de/en/list/pressure-switches/electro-mechanical-pressure-switches
https://www.norgren.com/de/en/list/pressure-switches/electronic-pressure-switches
https://www.norgren.com/de/en/list/directional-control-valves/sub-base-valves
https://www.norgren.com/de/en/list/directional-control-valves/non-return-valves
https://www.norgren.com/de/en/list/directional-control-valves/valve-islands
https://www.norgren.com/de/en/list/air-preparation/combination-units-frl
How to get all the product listings pages from the six categories?
import scrapy
class NorgrenSpider(scrapy.Spider):
name = 'norgren'
start_urls = ['https://www.norgren.com/de/en/list']
def start_requests(self):
for start_url in self.start_urls:
yield scrapy.Request(start_url)
def parse(self, response):
# check if there are items in the page
if response.xpath('//div[contains(#class, "item-list")]//div[#class="buttons"]/div[#class="more-information"]/a/#href'):
yield scrapy.Request(url=response.url, callback=self.get_links, dont_filter=True)
# follow "more info" buttons
for url in response.xpath('//a[text()="More info"]/#href').getall():
yield response.follow(url)
def get_links(self, response):
yield {"target_url": response.url}
next_page = response.xpath('//a[#class="next-button"]/#href').get()
if next_page:
yield response.follow(url=next_page, callback=self.get_links)
Maybe filter only pages that have at least one link to details? Here is an example of how to identify if a page meets the criteria you are searching for:
import scrapy
class NorgrenSpider(scrapy.Spider):
name = 'norgren'
start_urls = ['https://www.norgren.com/de/en/list']
def start_requests(self):
for start_url in self.start_urls:
yield scrapy.Request(start_url, callback=self.parse)
def parse(self, response):
link_list = []
more_info_items = response.css(
".match-height a.more-info::attr(href)").getall()
detail_items = [item for item in more_info_items if '/detail/' in item]
if len(detail_items) > 0:
print(f'This is a link you are searching for: {response.url}')
for item in more_info_items:
if not "/detail/" in item:
inner_page_link = response.urljoin(item)
link_list.append(inner_page_link)
yield {"target_url": inner_page_link}
for new_link in link_list:
yield scrapy.Request(new_link, callback=self.parse)
I only printed the link to the console, but you can figure out how to log it to where you need.

next page crawl in Scrapy

I am trying to get some data from the website but my spider is not crawling to the next page even after a proper pagination link.
import scrapy
class NspiderSpider(scrapy.Spider):
name = "nspider"
allowed_domains = ["elimelechlab.yale.edu/"]
start_urls = ["https://elimelechlab.yale.edu/pub"]
def parse(self, response):
title = response.xpath(
'//*[#class="views-field views-field-title"]/span/text()'
).extract()
doi_link = response.xpath(
'//*[#class="views-field views-field-field-doi-link"]//a[1]/#href'
).extract()
yield {"paper_title": title, "doi_link": doi_link}
next_page = response.xpath(
'//*[#title="Go to next page"]/#href'
).extract_first() # extracting next page link
if next_page:
yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
PS: I don't want to use LinkExtractor.
Any help would be appreciated.
Nothing wrong with your next_page logic, code is just not reaching this because the yield for the item is in the same identation level. Try the following approach:
import scrapy
class NspiderSpider(scrapy.Spider):
name = "nspider"
allowed_domains = ["elimelechlab.yale.edu"]
start_urls = ["https://elimelechlab.yale.edu/pub"]
def parse(self, response):
for view in response.css('div.views-row'):
yield {
'paper_title': view.css('div.views-field-title span.field-content::text').get(),
'doi_link': view.css('div.views-field-field-doi-link div.field-content a::attr(href)').get()
}
next_page = response.xpath(
'//*[#title="Go to next page"]/#href'
).extract_first() # extracting next page link
if next_page:
yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)

How do I recurse in scrapy?

Here's my code
import scrapy
class PvSpider(scrapy.Spider):
name = 'pv'
allowed_domains = ['www.piaov.com']
start_urls = ['http://www.piaov.com/']
def start_requests(self):
yield scrapy.Request(url='http://www.piaov.com/list/7.html')
def parse(self, response):
names = response.xpath("//ul[#class='mlist']//li/a/#title").extract()
on = response.meta.get("names", [])
cmp_names = on + names
for p in range(2, 7):
yield scrapy.Request(url='http://www.piaov.com/list/7_{}.html'.format(p),
meta={"names": cmp_names},
callback=self.parse)
yield scrapy.Request("http://www.piaov.com", meta={"names": cmp_names}, callback=self.parse_item)
def parse_item(self, response):
pass
When i debug my code in 'parse_item' function,the 'response.meta["names"]' only include the first page datas(12 titles in this case), how could i get the 6 pages datas list.
Its because you have URL http://www.piaov.com and scrapy ignores the duplicate URLs unless dont_filter=True is specified in Request like Request(url_here, dont_filter=True)
Also I don't like your logic of scraper, why are you calling parse_item at all? it is not necessary. Please see the code below and do it like that.
import scrapy
class PvSpider(scrapy.Spider):
name = 'pv'
allowed_domains = ['www.piaov.com']
start_urls = ['http://www.piaov.com/']
def start_requests(self):
yield scrapy.Request(url='http://www.piaov.com/list/7.html')
def parse(self, response):
for name in response.xpath("//ul[#class='mlist']//li/a/#title").extract():
yield {"name": name}
for p in range(2, 7):
yield scrapy.Request(url='http://www.piaov.com/list/7_{}.html'.format(p),
callback=self.parse)

scraping : scraping second level of url

In the below code parse function executes approximately 32 times (foor loop 32 href's found) in the sameway each sublink should go and scrape the data(32 individual urls parse_next function)
.But parse_next function executes only once(one way)/not called( and output csv file is empty.can anyone help me where i did mistake.
import scrapy
import logging
logger = logging.getLogger('mycustomlogger')
from ScrapyTestProject.items import ScrapytestprojectItem
class QuotesSpider(scrapy.Spider):
name = "nestedurl"
allowed_domains = ['www.grohe.in']
start_urls = [
'https://www.grohe.com/in/7780/bathroom/bathroom-faucets/essence/',
def parse(self, response):
logger.info("Parse function called on %s", response.url)
for divs in response.css('div.viewport div.workspace div.float-box'):
item = {'producturl': divs.css('a::attr(href)').extract_first(),
'imageurl': divs.css('a img::attr(src)').extract_first(),
'description' : divs.css('a div.text::text').extract() + divs.css('a span.nowrap::text').extract()}
next_page = response.urljoin(item['producturl'])
#logger.info("This is an information %s", next_page)
yield scrapy.Request(next_page, callback=self.parse_next, meta={'item': item})
#yield item
def parse_next(self, response):
item = response.meta['item']
logger.info("Parse function called on2 %s", response.url)
item['headline'] = response.css('div#content a.headline::text').extract()
return item
#response.css('div#product-variants a::attr(href)').extract()
Ok so a few things go wrong:
Indentation
start_urls list is not closed with a [
allowed_domains uses the domain extension .in while you want to scrape .com
Working code below:
import scrapy
import logging
class QuotesSpider(scrapy.Spider):
name = "nestedurl"
allowed_domains = ['www.grohe.com']
start_urls = [
'https://www.grohe.com/in/7780/bathroom/bathroom-faucets/essence/'
]
def parse(self, response):
# logger.info("Parse function called on %s", response.url)
for divs in response.css('div.viewport div.workspace div.float-box'):
item = {'producturl': divs.css('a::attr(href)').extract_first(),
'imageurl': divs.css('a img::attr(src)').extract_first(),
'description' : divs.css('a div.text::text').extract() + divs.css('a span.nowrap::text').extract()}
next_page = response.urljoin(item['producturl'])
#logger.info("This is an information %s", next_page)
yield scrapy.Request(next_page, callback=self.parse_next, meta={'item': item})
#yield item
def parse_next(self, response):
item = response.meta['item']
# logger.info("Parse function called on2 %s", response.url)
item['headline'] = response.css('div#content a.headline::text').extract()
return item
#response.css('div#product-variants a::attr(href)').extract()
Note: deleted some logging / item pipelines as these are not defined on my machine.

Categories