Scrapy not scraping links gathered from pagination - python

I am trying to scrape an e-commerce website for its products, and I am running into an issue where not all of the pages I generate through pagination are visited. The links themselves are valid and reachable, not non-existent.
My spider code:
import scrapy
import json
from pbl.items import ShopCard


class SpidermaximaSpider(scrapy.Spider):
    name = 'spiderMaxima'
    allowed_domains = ['www.trobos.lt']
    start_urls = ['https://trobos.lt/prekes?vendor=MAXIMA']
    item = []
    list = [{
        'sid': 10,
        'name': 'Maxima',
        'domain': 'https://www.maxima.lt/',
        'imageurl': 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg',
        'product': item
    }]

    def __init__(self):
        self.declare_xpath()

    def declare_xpath(self):
        self.getAllItemsXpath = '//*[@id="category"]/div/div[1]/div/div[3]/div[4]/div/div/div/div/div/a/@href'
        self.TitleXpath = '//*[@id="product"]/section[1]/div[3]/section/div[2]/h1/text()'
        self.PriceXpath = '//*[@id="product"]/section[1]/div[3]/section/div[2]/div[1]/div/div[1]/div/div[1]/span/text()'

    def parse(self, response):
        for href in response.xpath(self.getAllItemsXpath):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url=url, callback=self.parse_main_item, dont_filter=True)

        next_page = [response.url + '&page=' + str(x) for x in range(1, 193)]
        for page in next_page:
            print('-' * 100)
            print(page)
            print('-' * 100)
            url = page
            yield scrapy.Request(url, callback=self.parse)

    def parse_main_item(self, response):
        shop = ShopCard()
        Title = response.xpath(self.TitleXpath).extract_first()
        Link = response.url
        Image = 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg'
        Price = response.xpath(self.PriceXpath).extract_first()
        Price = Price.replace(',', '.')
        Price = float(Price.split(' ')[0])

        shop['item'] = {
            'title': Title,
            'link': Link,
            'image': Image,
            'price': Price
        }
        self.item.append(shop['item'])

    def closed(self, reason):
        with open("spiderMaxima.json", "w") as final:
            json.dump(self.list, final, indent=2, ensure_ascii=False)
I am building the page URLs with a list and range() because, in the response (viewed with scrapy shell's view(response)), the pagination buttons are wired to a script.
I have also tried several of the links in scrapy shell; the XPath outputs work, but the pages are still not getting scraped. What might be the issue? Are there other ways to deal with the pagination?

There are many things wrong with your code, and other things that can be improved. Please read the documentation carefully.
There's really no need to create xpath attributes.
You can write the XPaths much more concisely.
You can build the full start_urls list up front.
You can let the feed exporter handle the JSON output.
Here's an example; adjust it to your needs.
import scrapy


class ShopCard(scrapy.Item):
    item = scrapy.Field()


class SpidermaximaSpider(scrapy.Spider):
    name = 'spiderMaxima'
    allowed_domains = ['trobos.lt']
    start_urls = [f'https://trobos.lt/prekes?vendor=MAXIMA&page={i}' for i in range(1, 190)]
    items = []
    custom_settings = {
        'DOWNLOAD_DELAY': 0.4,
        'FEEDS': {
            'spiderMaxima.json': {
                'format': 'json',
                'indent': 2,
            },
        },
    }

    def parse(self, response):
        for url in response.xpath('//div[@class="card small"]//a[contains(@class, "shrink")]/@href').getall():
            yield response.follow(url=url, callback=self.parse_main_item)

    def parse_main_item(self, response):
        shop = ShopCard()
        Title = response.xpath('//h1/text()').get()
        Link = response.url
        Image = 'https://upload.wikimedia.org/wikipedia/commons/c/c1/Maxima_logo.svg'
        Price = response.xpath('//div[@class="price"]//span/text()').get()
        Price = Price.replace(',', '.')
        Price = float(Price.split(' ')[0])

        shop['item'] = {
            'title': Title,
            'link': Link,
            'image': Image,
            'price': Price
        }
        yield shop
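If you would rather not hard-code range(1, 190), one alternative sketch (not from the original answer; it reuses the same product-link XPath and assumes the site keeps serving ?vendor=MAXIMA&page=N URLs) is to follow the next page only while the current page still contains product links:

import scrapy


class SpidermaximaSpider(scrapy.Spider):
    name = 'spiderMaxima'
    allowed_domains = ['trobos.lt']
    start_urls = ['https://trobos.lt/prekes?vendor=MAXIMA&page=1']

    def parse(self, response):
        links = response.xpath('//div[@class="card small"]//a[contains(@class, "shrink")]/@href').getall()
        if not links:
            return  # an empty listing page: assume we have run past the last page
        for url in links:
            yield response.follow(url, callback=self.parse_main_item)
        # build the next page URL from the current one (page=N is the last query parameter here)
        page = int(response.url.rsplit('page=', 1)[-1])
        yield response.follow(f'https://trobos.lt/prekes?vendor=MAXIMA&page={page + 1}',
                              callback=self.parse)

    def parse_main_item(self, response):
        # same extraction logic as in the example above; stubbed here for brevity
        yield {'link': response.url}

Stopping on the first empty page avoids hard-coding a page count that can drift as the catalogue grows or shrinks.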

Related

Scrapy.Request returns <GET url> without scraping anything

I wanted to scrape the feed of sitepoint.com. This is my code:
import scrapy
from urllib.parse import urljoin


class SitepointSpider(scrapy.Spider):
    # TODO: Add url tags (like /javascript) to the spider based on class parameters
    name = "sitepoint"
    allowed_domains = ["sitepoint.com"]
    start_urls = ["http://sitepoint.com/javascript/"]

    def parse(self, response):
        data = []
        for article in response.css("article"):
            title = article.css("a.t12xxw3g::text").get()
            href = article.css("a.t12xxw3g::attr(href)").get()
            img = article.css("img.f13hvvvv::attr(src)").get()
            time = article.css("time::text").get()
            url = urljoin("https://sitepoint.com", href)
            text = scrapy.Request(url, callback=self.parse_article)
            data.append(
                {"title": title, "href": href, "img": img, "time": time, "text": text}
            )
        yield data

    def parse_article(self, response):
        text = response.xpath(
            '//*[@id="main-content"]/article/div/div/div[1]/section/text()'
        ).extract()
        yield text
And this is the response I get:
[{'title': 'How to Build an MVP with React and Firebase',
  'href': '/react-firebase-build-mvp/',
  'img': 'https://uploads.sitepoint.com/wp-content/uploads/2021/09/1632802723react-firebase-mvp-app.jpg',
  'time': 'September 28, 2021',
  'text': <GET https://sitepoint.com/react-firebase-build-mvp/>}]
It just does not scrape the urls. I followed everything said in this question but still could not make it work.
You have to visit the detail page from the listing to scrape the article.
In that case you have to yield the request for the detail URL first and then yield the data from the last callback.
Also, //*[@id="main-content"]/article/div/div/div[1]/section/text() won't return any text, since there are lots of HTML elements nested under the section tag.
One solution is to scrape all the HTML inside the section tag and clean it afterwards to get the article text.
Here is the full working code:
import re

import scrapy
from urllib.parse import urljoin


class SitepointSpider(scrapy.Spider):
    # TODO: Add url tags (like /javascript) to the spider based on class parameters
    name = "sitepoint"
    allowed_domains = ["sitepoint.com"]
    start_urls = ["http://sitepoint.com/javascript/"]

    def clean_text(self, raw_html):
        """
        :param raw_html: this will take raw html code
        :return: text without html tags
        """
        cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        return re.sub(cleaner, '', raw_html)

    def parse(self, response):
        for article in response.css("article"):
            title = article.css("a.t12xxw3g::text").get()
            href = article.css("a.t12xxw3g::attr(href)").get()
            img = article.css("img.f13hvvvv::attr(src)").get()
            time = article.css("time::text").get()
            url = urljoin("https://sitepoint.com", href)
            yield scrapy.Request(url, callback=self.parse_article, meta={"title": title,
                                                                         "href": href,
                                                                         "img": img,
                                                                         "time": time})

    def parse_article(self, response):
        title = response.request.meta["title"]
        href = response.request.meta["href"]
        img = response.request.meta["img"]
        time = response.request.meta["time"]
        all_data = {}

        article_html = response.xpath('//*[@id="main-content"]/article/div/div/div[1]/section').get()

        all_data["title"] = title
        all_data["href"] = href
        all_data["img"] = img
        all_data["time"] = time
        all_data["text"] = self.clean_text(article_html)

        yield all_data
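As a side note (not part of the original answer), w3lib, which is installed as a Scrapy dependency, already provides helpers for this kind of cleanup; a rough equivalent of clean_text would be:

from w3lib.html import remove_tags, replace_entities


def clean_text(self, raw_html):
    # strip the HTML tags first, then decode entities such as &amp; or &#x27;
    return replace_entities(remove_tags(raw_html or ''))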

Scraping an e-commerce website using Scrapy

I'm new to Scrapy. I have written a script for an e-commerce website and need to scrape the details mentioned below from that site. I'm facing an issue with this script; could anyone help me get past it?
Website: https://savedbythedress.com/collections/maternity-tops
import scrapy


class DressSpider(scrapy.Spider):
    name = 'dress'
    allowed_domains = ['savedbythedress.com']
    start_urls = ['https://savedbythedress.com/collections/maternity-tops']

    def parse(self, response):
        # scraped all product links
        domain = "https://savedbythedress.com"
        link_products = response.css('div[class="product-info-inner"] ::attr(href)').get()
        for link in link_products:
            product_link = domain + link
            yield {
                'product_link': product_link.css('div[class="product-info-inner"] ::attr(href)').get(),
            }
            yield scrapy.Request(url=product_link, callback=self.parse_contents)

    def parse_contents(self, response):
        # scrape needed information
        productlink = response.url
        yield {
            'product_title': response.css('.sbtd-product-title ::text').get(),
            'product_price': response.css('.product-price ::text').get(),
            'product_review': response.css('.Natsob ::text').getall()
        }
Use yield response.follow(page_url, self.parse_contents); it will work for you.
import scrapy


class DressSpider(scrapy.Spider):
    name = 'dress'
    allowed_domains = ['savedbythedress.com']
    start_urls = ['https://savedbythedress.com/collections/maternity-tops']

    def parse(self, response):
        # scraped all product links
        domain = "https://savedbythedress.com"
        # link_products = response.css('div[class="product-info-inner"] ::attr(href)').get()
        for link in response.css('div.product-info'):
            page_url = link.css('div[class="product-info-inner"] ::attr(href)').get()
            print('PAGE URL IS ', page_url)
            yield response.follow(page_url, self.parse_contents)
            # product_link = domain + link
            # yield{
            #     'product_link': link.css('div[class="product-info-inner"] ::attr(href)').get(),
            # }
            print(page_url)
            # yield scrapy.Request(response.follow(page_url), callback=self.parse_contents)

    def parse_contents(self, response):
        print()
        # scrape needed information
        print(response.url)
        productlink = response.url
        yield {
            'product_title': response.css('.sbtd-product-title ::text').get(),
            'product_price': response.css('.product-price ::text').get(),
            'product_review': response.css('.Natsob ::text').getall()
        }
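As a further note (not part of the original answer), if you are on Scrapy 2.0 or newer, response.follow_all can shorten the loop; a minimal sketch of the same parse method, assuming the same markup:

def parse(self, response):
    # follow every product link found inside the product-info-inner blocks
    yield from response.follow_all(
        css='div.product-info-inner a',
        callback=self.parse_contents,
    )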

Scrapy's JSON output forms an array of JSON objects

I'm trying to scrape a games info website using Scrapy. The scraping process goes like this: scraping the categories -> scraping the list of games (multiple pages for each category) -> scraping game info.
The scraped info is supposed to go into a JSON file. I'm getting the following result:
[
    {"category": "cat1", "games": [...]},
    {"category": "cat2", "games": [...]},
    ...
]
but I want to get this result:
{
    "categories": [
        {"category": "cat1", "games": [...]},
        {"category": "cat2", "games": [...]},
        ...
    ]
}
I tried to follow the steps from this post and this post, with no success, and couldn't find more related questions.
I would appreciate any help.
My spider:
import scrapy
from ..items import Category, Game


class GamesSpider(scrapy.Spider):
    name = 'games'
    start_urls = ['https://www.example.com/categories']
    base_url = 'https://www.example.com'

    def parse(self, response):
        categories = response.xpath("...")
        for category in categories:
            cat_name = category.xpath(".//text()").get()
            url = self.base_url + category.xpath(".//@href").get()
            cat = Category()
            cat['category'] = cat_name

            yield response.follow(url=url,
                                  callback=self.parse_category,
                                  meta={'category': cat})

    def parse_category(self, response):
        games_url_list = response.xpath('//.../a/@href').getall()

        cat = response.meta['category']
        url = self.base_url + games_url_list.pop()

        next_page = response.xpath('//a[...]/@href').get()
        if next_page:
            next_page = self.base_url + response.xpath('//a[...]/@href').get()

        yield response.follow(url=url,
                              callback=self.parse_game,
                              meta={'category': cat,
                                    'games_url_list': games_url_list,
                                    'next_page': next_page})

    def parse_game(self, response):
        cat = response.meta['category']
        game = Game()

        try:
            cat['games_list']
        except:
            cat['games_list'] = []

        game['title_en'] = response.xpath('...')
        game['os'] = response.xpath('...')
        game['users_rating'] = response.xpath('...')

        cat['games_list'].append(game)

        games_url_list = response.meta['games_url_list']
        next_page = response.meta['next_page']

        if games_url_list:
            url = self.base_url + games_url_list.pop()
            yield response.follow(url=url,
                                  callback=self.parse_game,
                                  meta={'category': cat,
                                        'games_url_list': games_url_list,
                                        'next_page': next_page})
        else:
            if next_page:
                yield response.follow(url=next_page,
                                      callback=self.parse_category,
                                      meta={'category': cat})
            else:
                yield cat
My items.py file:
import scrapy


class Category(scrapy.Item):
    category = scrapy.Field()
    games_list = scrapy.Field()


class Game(scrapy.Item):
    title_en = scrapy.Field()
    os = scrapy.Field()
    users_rating = scrapy.Field()
You need to write a custom item exporter, or handle post-processing of the file generated by Scrapy separately, e.g. with a standalone Python script that converts from the output format to the desired format.
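For the post-processing route, a minimal standalone sketch, assuming the spider output was written to games.json as a top-level JSON array (both file names here are placeholders):

import json

# read the array Scrapy produced and wrap it in the desired top-level object
with open('games.json', encoding='utf-8') as src:
    categories = json.load(src)

with open('games_wrapped.json', 'w', encoding='utf-8') as dst:
    json.dump({'categories': categories}, dst, indent=2, ensure_ascii=False)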

How to extract the website URL from the redirect link with Scrapy Python

I wrote a script to get the data from a website. I have an issue with collecting the website URL, since the @href is a redirect link. How can I convert the redirect URL into the actual website it redirects to?
import scrapy
import logging


class AppSpider(scrapy.Spider):
    name = 'app'
    allowed_domains = ['www.houzz.in']
    start_urls = ['https://www.houzz.in/professionals/searchDirectory?topicId=26721&query=Design-Build+Firms&location=Mumbai+City+District%2C+India&distance=100&sort=4']

    def parse(self, response):
        lists = response.xpath('//li[@class="hz-pro-search-results__item"]/div/div[@class="hz-pro-search-result__info"]/div/div/div/a')
        for data in lists:
            link = data.xpath('.//@href').get()
            yield scrapy.Request(url=link, callback=self.parse_houses, meta={'Links': link})

        next_page = response.xpath('(//a[@class="hz-pagination-link hz-pagination-link--next"])[1]/@href').extract_first()
        if next_page:
            yield response.follow(response.urljoin(next_page), callback=self.parse)

    def parse_houses(self, response):
        link = response.request.meta['Links']
        firm_name = response.xpath('//div[@class="hz-profile-header__title"]/h1/text()').get()
        name = response.xpath('//div[@class="profile-meta__val"]/text()').get()
        phone = response.xpath('//div[@class="hz-profile-header__contact-info text-right mrm"]/a/span/text()').get()
        website = response.xpath('(//div[@class="hz-profile-header__contact-info text-right mrm"]/a)[2]/@href').get()

        yield {
            'Links': link,
            'Firm_name': firm_name,
            'Name': name,
            'Phone': phone,
            'Website': website
        }
You have to make a request to that target URL to see where it leads.
In your case, you can simply make a HEAD request, which will not load the body of the target URL, so it saves bandwidth and speeds up your script as well.
# note: this snippet assumes `from scrapy import Request` at the top of the spider file
def parse_houses(self, response):
    link = response.request.meta['Links']
    firm_name = response.xpath('//div[@class="hz-profile-header__title"]/h1/text()').get()
    name = response.xpath('//div[@class="profile-meta__val"]/text()').get()
    phone = response.xpath('//div[@class="hz-profile-header__contact-info text-right mrm"]/a/span/text()').get()
    website = response.xpath('(//div[@class="hz-profile-header__contact-info text-right mrm"]/a)[2]/@href').get()

    yield Request(url=website,
                  method="HEAD",
                  callback=self.get_final_link,
                  meta={'data': {
                      'Links': link,
                      'Firm_name': firm_name,
                      'Name': name,
                      'Phone': phone,
                      'Website': website
                  }})


def get_final_link(self, response):
    data = response.meta['data']
    data['website'] = response.headers['Location']
    yield data
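One detail worth flagging (an observation about Scrapy in general, not part of the original answer): header values in response.headers are bytes, and if the redirect middleware has already followed the redirect, the Location header will not be present on the response you receive. A slightly more defensive version of the last assignment in get_final_link could be:

location = response.headers.get('Location')
# decode the bytes value if the header is present, otherwise fall back to the final URL
data['website'] = location.decode('utf-8') if location else response.url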
If your goal is just to get the website, the actual website link is also available in the source code of each listing; you can grab it with a regex, so there is no need to visit the encrypted redirect URL at all.
import re  # re is needed for the findall call below


def parse_houses(self, response):
    link = response.request.meta['Links']
    firm_name = response.xpath('//div[@class="hz-profile-header__title"]/h1/text()').get()
    name = response.xpath('//div[@class="profile-meta__val"]/text()').get()
    phone = response.xpath('//div[@class="hz-profile-header__contact-info text-right mrm"]/a/span/text()').get()
    website = re.findall(r"\"url\"\: \"(.*?)\"", response.text)[0]
You can do something like this:
class AppSpider(scrapy.Spider):
    base_url = 'www.houzz.in{}'

    ...

    def foo(self):
        actual_url = self.base_url.format(redirect_url)

Use scrapy to get list of urls, and then scrape content inside those urls

I need a Scrapy spider to scrape the following page (https://www.phidgets.com/?tier=1&catid=64&pcid=57) for each URL (30 products, so 30 URLs), and then go into each product via that URL and scrape the data inside.
I have the second part working exactly as I want:
import scrapy


class ProductsSpider(scrapy.Spider):
    name = "products"
    start_urls = [
        'https://www.phidgets.com/?tier=1&catid=64&pcid=57',
    ]

    def parse(self, response):
        for info in response.css('div.ph-product-container'):
            yield {
                'product_name': info.css('h2.ph-product-name::text').extract_first(),
                'product_image': info.css('div.ph-product-img-ctn a').xpath('@href').extract(),
                'sku': info.css('span.ph-pid').xpath('@prod-sku').extract_first(),
                'short_description': info.css('div.ph-product-summary::text').extract_first(),
                'price': info.css('h2.ph-product-price > span.price::text').extract_first(),
                'long_description': info.css('div#product_tab_1').extract_first(),
                'specs': info.css('div#product_tab_2').extract_first(),
            }

        # next_page = response.css('div.ph-summary-entry-ctn a::attr("href")').extract_first()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)
But I don't know how to do the first part. As you can see, I have the main page (https://www.phidgets.com/?tier=1&catid=64&pcid=57) set as the start URL. But how do I get it to populate the start_urls list with all 30 URLs I need crawled?
I am not able to test at this moment, so please let me know if this works for you so I can edit it should there be any bugs.
The idea here is that we find every product link on the first page and yield new Scrapy requests, passing your product-parsing method as the callback.
import scrapy
from urllib.parse import urljoin


class ProductsSpider(scrapy.Spider):
    name = "products"
    start_urls = [
        'https://www.phidgets.com/?tier=1&catid=64&pcid=57',
    ]

    def parse(self, response):
        products = response.xpath("//*[contains(@class, 'ph-summary-entry-ctn')]/a/@href").extract()
        for p in products:
            url = urljoin(response.url, p)
            yield scrapy.Request(url, callback=self.parse_product)

    def parse_product(self, response):
        for info in response.css('div.ph-product-container'):
            yield {
                'product_name': info.css('h2.ph-product-name::text').extract_first(),
                'product_image': info.css('div.ph-product-img-ctn a').xpath('@href').extract(),
                'sku': info.css('span.ph-pid').xpath('@prod-sku').extract_first(),
                'short_description': info.css('div.ph-product-summary::text').extract_first(),
                'price': info.css('h2.ph-product-price > span.price::text').extract_first(),
                'long_description': info.css('div#product_tab_1').extract_first(),
                'specs': info.css('div#product_tab_2').extract_first(),
            }
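As a side note, response.follow (available since Scrapy 1.4) accepts relative hrefs directly, so the urljoin step is optional; a minimal sketch of the same parse method under that assumption:

def parse(self, response):
    # follow every product tile on the listing page
    for href in response.xpath("//*[contains(@class, 'ph-summary-entry-ctn')]/a/@href").extract():
        yield response.follow(href, callback=self.parse_product)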
