Scraping an e-commerce website using Scrapy - Python

I'm new to Scrapy. I have written a script for an e-commerce website and need to scrape the details mentioned below from that site. I am facing an issue with this script; please can anyone help me get past it?
Website: https://savedbythedress.com/collections/maternity-tops
import scrapy

class DressSpider(scrapy.Spider):
    name = 'dress'
    allowed_domains = ['savedbythedress.com']
    start_urls = ['https://savedbythedress.com/collections/maternity-tops']

    def parse(self, response):
        # scrape all product links
        domain = "https://savedbythedress.com"
        link_products = response.css('div[class="product-info-inner"] ::attr(href)').get()
        for link in link_products:
            product_link = domain + link
            yield {
                'product_link': product_link.css('div[class="product-info-inner"] ::attr(href)').get(),
            }
            yield scrapy.Request(url=product_link, callback=self.parse_contents)

    def parse_contents(self, response):
        # scrape needed information
        productlink = response.url
        yield {
            'product_title': response.css('.sbtd-product-title ::text').get(),
            'product_price': response.css('.product-price ::text').get(),
            'product_review': response.css('.Natsob ::text').getall()
        }

Use yield response.follow(page_url, self.parse_contents) and it will work for you:
import scrapy

class DressSpider(scrapy.Spider):
    name = 'dress'
    allowed_domains = ['savedbythedress.com']
    start_urls = ['https://savedbythedress.com/collections/maternity-tops']

    def parse(self, response):
        # collect all product links and follow each one
        for link in response.css('div.product-info'):
            page_url = link.css('div[class="product-info-inner"] ::attr(href)').get()
            print('PAGE URL IS ', page_url)
            yield response.follow(page_url, self.parse_contents)

    def parse_contents(self, response):
        # scrape the needed information from the product page
        print(response.url)
        yield {
            'product_title': response.css('.sbtd-product-title ::text').get(),
            'product_price': response.css('.product-price ::text').get(),
            'product_review': response.css('.Natsob ::text').getall()
        }
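If you want to test this outside of a full Scrapy project, a minimal sketch along these lines (assuming the DressSpider class above is defined in the same file; the output filename is just an example) runs the spider with CrawlerProcess and exports the yielded items to JSON:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # export every yielded item to a JSON feed (filename is an example)
    'FEEDS': {'dress_products.json': {'format': 'json', 'indent': 2}},
})
process.crawl(DressSpider)
process.start()  # blocks until the crawl finishes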

Related

Scrapy not scraping links gathered from pagination

I am trying to scrape an e-commerce website for its products, and I am currently facing an issue where not all of the pages I generate through pagination are visited. The links themselves are valid and reachable; they do exist.
My spider code:
import scrapy
import json
from pbl.items import ShopCard

class SpidermaximaSpider(scrapy.Spider):
    name = 'spiderMaxima'
    allowed_domains = ['www.trobos.lt']
    start_urls = ['https://trobos.lt/prekes?vendor=MAXIMA']
    item = []
    list = [{
        'sid': 10,
        'name': 'Maxima',
        'domain': 'hhttps://www.maxima.lt/',
        'imageurl': 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg',
        'product': item
    }]

    def __init__(self):
        self.declare_xpath()

    def declare_xpath(self):
        self.getAllItemsXpath = '//*[@id="category"]/div/div[1]/div/div[3]/div[4]/div/div/div/div/div/a/@href'
        self.TitleXpath = '//*[@id="product"]/section[1]/div[3]/section/div[2]/h1/text()'
        self.PriceXpath = '//*[@id="product"]/section[1]/div[3]/section/div[2]/div[1]/div/div[1]/div/div[1]/span/text()'

    def parse(self, response):
        for href in response.xpath(self.getAllItemsXpath):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url=url, callback=self.parse_main_item, dont_filter=True)
        next_page = [response.url + '&page=' + str(x) for x in range(1, 193)]
        for page in next_page:
            print('-' * 100)
            print(page)
            print('-' * 100)
            url = page
            yield scrapy.Request(url, callback=self.parse)

    def parse_main_item(self, response):
        shop = ShopCard()
        Title = response.xpath(self.TitleXpath).extract_first()
        Link = response.url
        Image = 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg'
        Price = response.xpath(self.PriceXpath).extract_first()
        Price = Price.replace(',', '.')
        Price = float(Price.split(' ')[0])
        shop['item'] = {
            'title': Title,
            'link': Link,
            'image': Image,
            'price': Price
        }
        self.item.append(shop['item'])

    def closed(self, reason):
        with open("spiderMaxima.json", "w") as final:
            json.dump(self.list, final, indent=2, ensure_ascii=False)
I am using a list with the range() function because, in the response (from scrapy shell's view(response)), the pagination buttons are wired to a script.
I have also tried scrapy shell on several of the links; the XPaths produce the expected output, but the pages still are not getting scraped. What may be the issue? Are there other ways to deal with the pagination?
There are many things wrong with your code, and other things that can be improved. Please read the documentation carefully.
- There is really no need to create xpath attributes.
- You can write the xpaths much shorter.
- You can build the start_urls list from the beginning.
- You can let the item exporter handle the JSON.
Here's an example; change it to your needs.
import scrapy

class ShopCard(scrapy.Item):
    item = scrapy.Field()

class SpidermaximaSpider(scrapy.Spider):
    name = 'spiderMaxima'
    allowed_domains = ['trobos.lt']
    start_urls = [f'https://trobos.lt/prekes?vendor=MAXIMA&page={i}' for i in range(1, 190)]
    items = []
    custom_settings = {
        'DOWNLOAD_DELAY': 0.4,
        'FEEDS': {
            'spiderMaxima.json': {
                'format': 'json',
                'indent': 2,
            }
        }
    }

    def parse(self, response):
        for url in response.xpath('//div[@class="card small"]//a[contains(@class, "shrink")]/@href').getall():
            yield response.follow(url=url, callback=self.parse_main_item)

    def parse_main_item(self, response):
        shop = ShopCard()
        Title = response.xpath('//h1/text()').get()
        Link = response.url
        Image = 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg'
        Price = response.xpath('//div[@class="price"]//span/text()').get()
        Price = Price.replace(',', '.')
        Price = float(Price.split(' ')[0])
        shop['item'] = {
            'title': Title,
            'link': Link,
            'image': Image,
            'price': Price
        }
        yield shop

How to scrape description of every link

After using Scrapy in Python I have all the URL links of my products from one site. How can I access every link and scrape the description of every product?
This is my code:
import scrapy

class aldi2(scrapy.Spider):
    name = 'aldi2'
    start_urls = ['https://www.aldi.nl/producten/brood-bakkerij/dagvers-brood.html']

    def parse(self, response):
        for products in response.css('div.mod.mod-article-tile.mod-article-tile--default.ct-backwaren'):
            yield {
                'name': products.css('span.mod-article-tile__title::text').get().replace('\n', '').replace('\t', ''),
                'price': products.css('span.price__wrapper::text').get().replace('\n', '').replace('\t', ''),
                'unit': products.css('span.price__unit::text').get().replace('\n', ''),
                'link': products.css('a.mod-article-tile__action').attrib['href']
            }
You can use response.follow or scrapy.Request with the desired URL (with scrapy.Request you will need to join the relative href to the base URL yourself; a sketch of that variant follows the code below).
(I also changed the replace calls to strip.)
import scrapy

class aldi2(scrapy.Spider):
    name = 'aldi2'
    start_urls = ['https://www.aldi.nl/producten/brood-bakkerij/dagvers-brood.html']

    def parse(self, response):
        for products in response.css('div.mod.mod-article-tile.mod-article-tile--default.ct-backwaren'):
            yield {
                'name': products.css('span.mod-article-tile__title::text').get().strip(),
                'price': products.css('span.price__wrapper::text').get().strip(),
                'unit': products.css('span.price__unit::text').get().strip(),
                'link': products.css('a.mod-article-tile__action').attrib['href']
            }
            url = products.css('a.mod-article-tile__action').attrib['href']
            yield response.follow(url=url, callback=self.parse_page)

    def parse_page(self, response):
        # scrape the product description here
        pass
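For the scrapy.Request variant mentioned above, a minimal sketch of the same loop body (assuming the hrefs on this page are relative) would be:

url = products.css('a.mod-article-tile__action').attrib['href']
# join the relative href to the current page URL before building the Request
yield scrapy.Request(url=response.urljoin(url), callback=self.parse_page)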

next page crawl in Scrapy

I am trying to get some data from the website, but my spider is not crawling to the next page even though there is a proper pagination link.
import scrapy

class NspiderSpider(scrapy.Spider):
    name = "nspider"
    allowed_domains = ["elimelechlab.yale.edu/"]
    start_urls = ["https://elimelechlab.yale.edu/pub"]

    def parse(self, response):
        title = response.xpath(
            '//*[@class="views-field views-field-title"]/span/text()'
        ).extract()
        doi_link = response.xpath(
            '//*[@class="views-field views-field-field-doi-link"]//a[1]/@href'
        ).extract()
        yield {"paper_title": title, "doi_link": doi_link}

        next_page = response.xpath(
            '//*[@title="Go to next page"]/@href'
        ).extract_first()  # extracting next page link
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
PS: I don't want to use LinkExtractor.
Any help would be appreciated.
There is nothing wrong with your next_page logic; the code just never reaches it because the yield for the item is at the same indentation level. Try the following approach:
import scrapy

class NspiderSpider(scrapy.Spider):
    name = "nspider"
    allowed_domains = ["elimelechlab.yale.edu"]
    start_urls = ["https://elimelechlab.yale.edu/pub"]

    def parse(self, response):
        for view in response.css('div.views-row'):
            yield {
                'paper_title': view.css('div.views-field-title span.field-content::text').get(),
                'doi_link': view.css('div.views-field-field-doi-link div.field-content a::attr(href)').get()
            }

        next_page = response.xpath(
            '//*[@title="Go to next page"]/@href'
        ).extract_first()  # extracting next page link
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
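As a side note, the next-page request could also be issued with response.follow, which accepts relative URLs directly; a minimal equivalent sketch:

if next_page:
    # response.follow joins the relative href to the current page URL for you
    yield response.follow(next_page, callback=self.parse)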

Scrapy problems with crawling a specific tag

I am having a problem with my Scrapy program. I want to crawl information from the following website:
https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=
I want to get the "Part No." information inside the span with id="resPartNum". I have already tried:
- NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
- NAME_SELECTOR = './/span[@class="resPartNum"]/text()'
- NAME_SELECTOR = './/tr/td/span[@class="resPartNum"]/a/text()'
Here is my full code:
import scrapy

class PartSpider(scrapy.Spider):
    name = 'part_spider'
    start_urls = ['https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=']

    def parse(self, response):
        SET_SELECTOR = '.set'
        for part in response.css(SET_SELECTOR):
            NAME_SELECTOR = './/*[@id="resPartNum"]/text()'
            yield {
                'name': part.css(NAME_SELECTOR).extract_first(),
            }
I am not very advanced in Scrapy and would appreciate any help!
Use the CSS selector table.partlookup_table to collect each table item, then loop over it to get the part number and part name. Here extract() returns a list.
import scrapy
from scrapy.crawler import CrawlerProcess

class PartSpider(scrapy.Spider):
    name = 'part_spider'
    start_urls = ['https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=']

    def parse(self, response):
        SET_SELECTOR = 'table.partlookup_table'
        for part in response.css(SET_SELECTOR):
            yield {
                'name': part.css('span.resPartName a::text').extract(),
                'partnumber': part.css('span.resPartNum a::text').extract()
            }

process = CrawlerProcess()
process.crawl(PartSpider)
process.start()
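If a single string per field is preferred over a list, .get() (the newer spelling of extract_first()) can be used instead of extract(); a sketch of the same parse method under that assumption:

def parse(self, response):
    for part in response.css('table.partlookup_table'):
        # .get() returns only the first matching text node (or None)
        yield {
            'name': part.css('span.resPartName a::text').get(),
            'partnumber': part.css('span.resPartNum a::text').get()
        }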

How can I scrape next pages with Scrapy

Here is my Scrapy code. I don't know my mistake, but it only scrapes the first page. How can I scrape and traverse through the pages? Is there any other way of scraping the next pages?
import scrapy

class HurriyetEmlakPage(scrapy.Spider):
    name = 'hurriyetspider'
    allowed_domain = 'hurriyetemlak.com'
    start_urls = ['https://www.hurriyetemlak.com/satilik']

    def parse(self, response):
        fiyat = response.xpath('//div[@class="list-view-price"]//text()').extract()
        durum = response.xpath('//div[@class="middle sibling"]//div[@class="left"]//text()').extract()
        oda_sayisi = response.xpath('//span[@class="celly houseRoomCount"]//text()').extract()
        metrekare = response.xpath('//span[@class="celly squareMeter list-view-size"]//text()').extract()
        bina_yasi = response.xpath('//span[@class="celly buildingAge"]//text()').extract()
        bulundugu_kat = response.xpath('//span[@class="celly floortype"]//text()').extract()
        konum = response.xpath('//div[@class="list-view-location"]//text()').extract()
        scraped_info = {
            'fiyat': fiyat,
            'durum': durum,
            'oda_sayisi': oda_sayisi,
            'metrekare': metrekare,
            'bina_yasi': bina_yasi,
            'bulundugu_kat': bulundugu_kat,
            'konum': konum
        }
        yield scraped_info

        next_page_url = response.xpath('//li[@class="next-li pagi-nav"]//a').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
Actually, you could simply generate your URL list like this (a sketch of how to plug it into the spider follows the output below):
url_list = [f"https://www.hurriyetemlak.com/satilik?page={page}" for page in range(1,7326)]
Output
['https://www.hurriyetemlak.com/satilik?page=1',
'https://www.hurriyetemlak.com/satilik?page=2',
'https://www.hurriyetemlak.com/satilik?page=3',
'https://www.hurriyetemlak.com/satilik?page=4',
'https://www.hurriyetemlak.com/satilik?page=5',
...]
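A minimal sketch of how that list could be used as start_urls (the field extraction itself is assumed to stay as in the question's parse method):

import scrapy

class HurriyetEmlakPage(scrapy.Spider):
    name = 'hurriyetspider'
    allowed_domains = ['hurriyetemlak.com']
    # seed every listing page up front instead of following a "next" link
    start_urls = [f"https://www.hurriyetemlak.com/satilik?page={page}" for page in range(1, 7326)]

    def parse(self, response):
        # same field extraction as in the question's parse() method
        ...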
