Scrapy Pagination Fails on Multiple Listings - python

I'm trying to scrape a website using scrapy.
When I scrape a specific page, pagination works, but when I try to scrape all the pages in one go, pagination does not work.
I tried creating an extra function for the pagination, but this does not fix the problem. Any help would be appreciated. What am I doing wrong? Here's my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.loader.processors import MapCompose, Join
from scrapy.loader import ItemLoader
from scrapy.http import Request
from avtogumi.items import AvtogumiItem


class BasicSpider(scrapy.Spider):
    name = 'gumi'
    allowed_domains = ['avtogumi.bg']
    start_urls = ['https://bg.avtogumi.bg/oscommerce/index.php']

    def parse(self, response):
        urls = response.xpath('//div[@class="brands"]//a/@href').extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url=url, callback=self.parse_params)

    def parse_params(self, response):
        l = ItemLoader(item=AvtogumiItem(), response=response)
        l.add_xpath('title', '//h4/a/text()')
        l.add_xpath('subtitle', '//p[@class="ft-darkgray"]/text()')
        l.add_xpath('price', '//span[@class="promo-price"]/text()',
                    MapCompose(str.strip, str.title))
        l.add_xpath('stock', '//div[@class="product-box-stock"]//span/text()')
        l.add_xpath('category', '//div[@class="labels hidden-md hidden-lg"][0]//text()')
        l.add_xpath('brand', '//h4[@class="brand-header"][0]//text()',
                    MapCompose(str.strip, str.title))
        l.add_xpath('img_path', '//div/img[@class="prod-imglist"]/@src')
        yield l.load_item()

        next_page_url = response.xpath('//li/a[@class="next"]/@href').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse_params)

The issue here is this:
l = ItemLoader(item=AvtogumiItem(), response=response)
l.add_xpath('title', '//h4/a/text()')
l.add_xpath('subtitle', '//p[@class="ft-darkgray"]/text()')
l.add_xpath('price', '//span[@class="promo-price"]/text()',
            MapCompose(str.strip, str.title))
l.add_xpath('stock', '//div[@class="product-box-stock"]//span/text()')
l.add_xpath('category', '//div[@class="labels hidden-md hidden-lg"][0]//text()')
l.add_xpath('brand', '//h4[@class="brand-header"][0]//text()',
            MapCompose(str.strip, str.title))
l.add_xpath('img_path', '//div/img[@class="prod-imglist"]/@src')
yield l.load_item()
This snippet of code will parse and load exactly one result. If a page has multiple results, you have to put this code inside a for loop, build the ItemLoader from each result's selector instead of the whole response, and use relative XPaths:
objects = response.xpath('my_selector_here')
for obj in objects:
    # build the loader from the per-result selector, not the whole response
    l = ItemLoader(item=AvtogumiItem(), selector=obj)
    l.add_xpath('title', './/h4/a/text()')
    l.add_xpath('subtitle', './/p[@class="ft-darkgray"]/text()')
    l.add_xpath('price', './/span[@class="promo-price"]/text()',
                MapCompose(str.strip, str.title))
    l.add_xpath('stock', './/div[@class="product-box-stock"]//span/text()')
    l.add_xpath('category', './/div[@class="labels hidden-md hidden-lg"][0]//text()')
    l.add_xpath('brand', './/h4[@class="brand-header"][0]//text()',
                MapCompose(str.strip, str.title))
    l.add_xpath('img_path', './/div/img[@class="prod-imglist"]/@src')
    yield l.load_item()
Hope this helps
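For completeness: the loader assumes an AvtogumiItem defined in avtogumi/items.py, which isn't shown in the question. A minimal sketch of what it presumably looks like, with the field names inferred from the add_xpath calls above:

import scrapy

class AvtogumiItem(scrapy.Item):
    # field names inferred from the loader calls; adjust if your items.py differs
    title = scrapy.Field()
    subtitle = scrapy.Field()
    price = scrapy.Field()
    stock = scrapy.Field()
    category = scrapy.Field()
    brand = scrapy.Field()
    img_path = scrapy.Field()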

Or use/rewrite this code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request


class BasicSpider(scrapy.Spider):
    name = 'gumi'
    allowed_domains = ['avtogumi.bg']
    start_urls = ['https://bg.avtogumi.bg/oscommerce/']

    def parse(self, response):
        urls = response.xpath('//div[@class="brands"]//a/@href').extract()
        for url in urls:
            yield Request(url=response.urljoin(url), callback=self.parse_params)

    def parse_params(self, response):
        subjects = response.xpath('//div[@class="full-product-box search-box"]')
        for subject in subjects:
            yield {
                'title': subject.xpath('.//h4/a/text()').extract_first(),
                'subtitle': subject.xpath('.//p[@class="ft-darkgray"]/text()').extract_first(),
                'price': subject.xpath('.//span[@class="promo-price"]/text()').extract_first(),
                'stock': subject.xpath('.//div[@class="product-box-stock"]//span/text()').extract_first(),
                'category': subject.xpath('.//div[@class="labels hidden-md hidden-lg"][0]//text()').extract_first(),
                'brand': subject.xpath('.//h4[@class="brand-header"][0]//text()').extract_first(),
                'img_path': subject.xpath('.//div/img[@class="prod-imglist"]/@src').extract_first(),
            }

        next_page_url = response.xpath('//li/a[@class="next"]/@href').extract_first()
        if next_page_url:
            yield Request(url=next_page_url, callback=self.parse_params)
Result: 13,407 items scraped.
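To export the scraped items to a file, you can run the spider with Scrapy's feed export from the project directory (the output filenames here are just examples):

scrapy crawl gumi -o avtogumi.csv
# or JSON lines:
scrapy crawl gumi -o avtogumi.jl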

Related

How to scrape a link from a previous function with Scrapy

I have this code to scrape a website. The parse function yields the full link of each news article, and parse_item returns 3 items: the date, the title, and the full text from that URL.
How can I also scrape and save the link from parse, so the code returns 4 items: date, title, text, and the link?
Here is the code:
import scrapy
from scrapy.crawler import CrawlerProcess


class weeklymining(scrapy.Spider):
    name = 'weeklymining'
    start_urls = ['https://www.miningweekly.com/page/coal/page:' + str(x) for x in range(0, 351)]

    def parse(self, response):
        for link in response.xpath('//*[@class="en-serif"]/a/@href'):
            yield response.follow(
                url=link.get(),
                callback=self.parse_item
            )

    def parse_item(self, response):
        yield {
            'date': response.xpath('//*[@class="article_title"]//p/span[1]/text()').extract(),
            'title': response.xpath('//*[@id="article_headline"]/text()').get(),
            'text': ''.join([x.get().strip() for x in response.xpath('//*[@id="article_content_container"]//p//text()')])
        }


if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(weeklymining)
    process.start()
Any help would be appreciated, thanks in advance.
Just add the response.url to the yielded item.
For example:
import scrapy
from scrapy.crawler import CrawlerProcess


class weeklymining(scrapy.Spider):
    name = 'weeklymining'
    start_urls = ['https://www.miningweekly.com/page/coal/page:' + str(x) for x in range(0, 351)]

    def parse(self, response):
        for link in response.xpath('//*[@class="en-serif"]/a/@href'):
            yield response.follow(
                url=link.get(),
                callback=self.parse_item
            )

    def parse_item(self, response):
        yield {
            'date': response.xpath('//*[@class="article_title"]//p/span[1]/text()').extract(),
            'title': response.xpath('//*[@id="article_headline"]/text()').get(),
            'text': ''.join([x.get().strip() for x in response.xpath('//*[@id="article_content_container"]//p//text()')]),
            'link': response.url  # <-- added this
        }


if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(weeklymining)
    process.start()
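Since the spider is started via CrawlerProcess rather than the scrapy CLI, you can also persist the items by passing feed settings to the process (the FEEDS setting needs Scrapy 2.1 or newer; the filename is just an example). Replace the run block at the bottom with:

if __name__ == '__main__':
    # FEEDS tells Scrapy to write every yielded item to the given file
    process = CrawlerProcess(settings={
        "FEEDS": {
            "articles.json": {"format": "json"},
        },
    })
    process.crawl(weeklymining)
    process.start()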

Scrapy: Following pagination link to scrape data [duplicate]

This question already has answers here: Scrapy: scraping data from Pagination (2 answers)
Closed 4 years ago.
I am trying to scrape data from a page and continue scraping following the pagination link.
The page I am trying to scrape is the start URL in the code below.
# -*- coding: utf-8 -*-
import scrapy


class AlibabaSpider(scrapy.Spider):
    name = 'alibaba'
    allowed_domains = ['alibaba.com']
    start_urls = ['https://www.alibaba.com/catalog/agricultural-growing-media_cid144?page=1']

    def parse(self, response):
        for products in response.xpath('//div[contains(@class, "m-gallery-product-item-wrap")]'):
            item = {
                'product_name': products.xpath('.//h2/a/@title').extract_first(),
                'price': products.xpath('.//div[@class="price"]/b/text()').extract_first('').strip(),
                'min_order': products.xpath('.//div[@class="min-order"]/b/text()').extract_first(),
                'company_name': products.xpath('.//div[@class="stitle util-ellipsis"]/a/@title').extract_first(),
                'prod_detail_link': products.xpath('.//div[@class="item-img-inner"]/a/@href').extract_first(),
                'response_rate': products.xpath('.//i[@class="ui2-icon ui2-icon-skip"]/text()').extract_first('').strip(),
                # 'image_url': products.xpath('.//div[@class=""]/').extract_first(),
            }
            yield item

        # Follow the pagination link
        next_page_url = response.xpath('//link[@rel="next"]/@href').extract_first()
        if next_page_url:
            yield scrapy.Request(url=next_page_url, callback=self.parse)
Problem
The code is not able to follow the pagination link.
How can you help?
Modify the code so that it follows the pagination link.
To get your code working, you need to fix the broken next-page link, e.g. by using response.follow() or something similar. Try the approach below.
import scrapy


class AlibabaSpider(scrapy.Spider):
    name = 'alibaba'
    allowed_domains = ['alibaba.com']
    start_urls = ['https://www.alibaba.com/catalog/agricultural-growing-media_cid144?page=1']

    def parse(self, response):
        for products in response.xpath('//div[contains(@class, "m-gallery-product-item-wrap")]'):
            item = {
                'product_name': products.xpath('.//h2/a/@title').extract_first(),
                'price': products.xpath('.//div[@class="price"]/b/text()').extract_first('').strip(),
                'min_order': products.xpath('.//div[@class="min-order"]/b/text()').extract_first(),
                'company_name': products.xpath('.//div[@class="stitle util-ellipsis"]/a/@title').extract_first(),
                'prod_detail_link': products.xpath('.//div[@class="item-img-inner"]/a/@href').extract_first(),
                'response_rate': products.xpath('.//i[@class="ui2-icon ui2-icon-skip"]/text()').extract_first('').strip(),
                # 'image_url': products.xpath('.//div[@class=""]/').extract_first(),
            }
            yield item

        # Follow the pagination link
        next_page_url = response.xpath('//link[@rel="next"]/@href').extract_first()
        if next_page_url:
            yield response.follow(url=next_page_url, callback=self.parse)
Your pasted code was badly indented. I've fixed that as well.
It doesn't work because the extracted URL isn't valid; it's relative and needs to be joined with the base URL. If you want to keep using scrapy.Request, you could use:
next_page_url = response.xpath('//link[@rel="next"]/@href').extract_first()
if next_page_url:
    next_page_url = response.urljoin(next_page_url)
    yield scrapy.Request(url=next_page_url, callback=self.parse)
A shorter solution:
next_page_url = response.xpath('//link[@rel="next"]/@href').extract_first()
if next_page_url:
    yield response.follow(next_page_url)
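As a further variation, response.follow() also accepts a Selector pointing at an <a> or <link> element directly, so you can skip extracting the href yourself (a small sketch, same behaviour as above):

next_page = response.xpath('//link[@rel="next"]')
if next_page:
    # response.follow pulls the href out of the element and joins it with the page URL
    yield response.follow(next_page[0], callback=self.parse)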

Use Scrapy to get a list of URLs, and then scrape content inside those URLs

I need a Scrapy spider to scrape the following page (https://www.phidgets.com/?tier=1&catid=64&pcid=57) for each product URL (30 products, so 30 URLs) and then go into each product via that URL and scrape the data inside.
I have the second part working exactly as I want:
import scrapy


class ProductsSpider(scrapy.Spider):
    name = "products"
    start_urls = [
        'https://www.phidgets.com/?tier=1&catid=64&pcid=57',
    ]

    def parse(self, response):
        for info in response.css('div.ph-product-container'):
            yield {
                'product_name': info.css('h2.ph-product-name::text').extract_first(),
                'product_image': info.css('div.ph-product-img-ctn a').xpath('@href').extract(),
                'sku': info.css('span.ph-pid').xpath('@prod-sku').extract_first(),
                'short_description': info.css('div.ph-product-summary::text').extract_first(),
                'price': info.css('h2.ph-product-price > span.price::text').extract_first(),
                'long_description': info.css('div#product_tab_1').extract_first(),
                'specs': info.css('div#product_tab_2').extract_first(),
            }

        # next_page = response.css('div.ph-summary-entry-ctn a::attr("href")').extract_first()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)
But I don't know how to do the first part. As you will see I have the main page (https://www.phidgets.com/?tier=1&catid=64&pcid=57) set as the start_url. But how do I get it to populate the start_urls list with all 30 urls I need crawled?
I am not able to test at this moment, so please let me know if this works for you so I can edit it should there be any bugs.
The idea here is that we find every product link on the first page and yield new Scrapy requests, passing your product parsing method as a callback.
import scrapy
from urllib.parse import urljoin


class ProductsSpider(scrapy.Spider):
    name = "products"
    start_urls = [
        'https://www.phidgets.com/?tier=1&catid=64&pcid=57',
    ]

    def parse(self, response):
        products = response.xpath("//*[contains(@class, 'ph-summary-entry-ctn')]/a/@href").extract()
        for p in products:
            url = urljoin(response.url, p)
            yield scrapy.Request(url, callback=self.parse_product)

    def parse_product(self, response):
        for info in response.css('div.ph-product-container'):
            yield {
                'product_name': info.css('h2.ph-product-name::text').extract_first(),
                'product_image': info.css('div.ph-product-img-ctn a').xpath('@href').extract(),
                'sku': info.css('span.ph-pid').xpath('@prod-sku').extract_first(),
                'short_description': info.css('div.ph-product-summary::text').extract_first(),
                'price': info.css('h2.ph-product-price > span.price::text').extract_first(),
                'long_description': info.css('div#product_tab_1').extract_first(),
                'specs': info.css('div#product_tab_2').extract_first(),
            }
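If you're on Scrapy 1.4 or newer, the same idea can be written without urljoin by letting response.follow() resolve the relative URLs. A small alternative sketch of the parse method only:

def parse(self, response):
    # response.follow joins relative hrefs against response.url automatically
    for href in response.xpath("//*[contains(@class, 'ph-summary-entry-ctn')]/a/@href").extract():
        yield response.follow(href, callback=self.parse_product)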

Scraping/crawling multiple pages

Up to now I have found how to scrape one page, or multiple pages with the same URL but a changing page number. However, I could not find how to scrape pages with subcategories, and their subcategories, and finally get the content I need.
I am trying to scrape this website: http://www.askislam.org/index.html
I am using Scrapy, but I do not know where to start.
Or you can suggest a better option; I just use Python and will take it from there.
Thanks
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider
from scrapy import Selector
from ask_islam.items import AskIslamItem
from scrapy.http import Request
import re


class AskislamSpider(Spider):
    name = "askislam"
    allowed_domains = ["askislam.org"]
    start_urls = ['http://www.askislam.org/']
    rules = [Rule(LinkExtractor(allow=()), callback='parse', follow=True)]

    def parse(self, response):
        hxs = Selector(response)
        links = hxs.css('div[id="categories"] li a::attr(href)').extract()
        for link in links:
            url = 'http://www.askislam.org' + link.replace('index.html', '')
            yield Request(url, callback=self.parse_page)

    def parse_page(self, response):
        hxs = Selector(response)
        categories = hxs.css('div[id="categories"] li').extract()
        questions = hxs.xpath('a').extract()
        if categories:
            for categoryLink in categories:
                url = 'http://www.askislam.org' + categoryLink.replace('index.html', '')
                yield Request(url, callback=self.parse_page)
                # print(question)
EDIT
def start_requests(self):
    yield Request("http://www.askislam.org", callback=self.parse_page)

def parse_page(self, response):
    hxs = Selector(response)
    categories = hxs.css('#categories li')
    for cat in categories:
        item = AskIslamItem()
        link = cat.css('a::attr(href)').extract()[0]
        link = "http://www.askislam.org" + link
        item['catLink'] = link
        logging.info("Scraping Link: %s" % (link))
        yield Request(link, callback=self.parse_page)
        yield Request(link, callback=self.parse_categories)

def parse_categories(self, response):
    logging.info("The Cat Url")
Read the links of those sub-categories from the http://www.askislam.org/index.html page using XPath or CSS selectors, and then issue another Request().
EDIT:
import logging
from scrapy import Request, Spider


class AskislamSpider(Spider):
    name = "askislam"

    def start_requests(self):
        yield Request("http://www.askislam.org/", callback=self.parse_page)

    def parse_page(self, response):
        categories = response.css('#categories li')
        for cat in categories:
            link = cat.css("a::attr(href)").extract()[0]
            link = "http://www.askislam.org/" + link
            logging.info("Scraping Link: %s" % (link))
            yield Request(link, callback=self.parse_page)
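Since the question's original code already imports CrawlSpider, Rule and LinkExtractor, another option is to let a crawl rule do the recursive following and keep a callback only for extraction. This is a rough sketch, not a tested spider for this site; the placeholder extraction (URL and page title) is an assumption to be replaced with the real selectors:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class AskislamCrawlSpider(CrawlSpider):
    name = "askislam_crawl"
    allowed_domains = ["askislam.org"]
    start_urls = ["http://www.askislam.org/"]

    # follow every internal link; each fetched page is handed to parse_page
    rules = [
        Rule(LinkExtractor(allow_domains=["askislam.org"]), callback="parse_page", follow=True),
    ]

    def parse_page(self, response):
        # placeholder extraction: record the page URL and its title
        yield {
            "url": response.url,
            "title": response.css("title::text").extract_first(),
        }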

Scrapy and Splash don't crawl

I made a crawler; Splash is working (I tested it in my browser), but Scrapy can't crawl and extract items.
My actual code is:
# -*- coding: utf-8 -*-
import scrapy
import json
from scrapy.http.headers import Headers
from scrapy.spiders import CrawlSpider, Rule
from oddsportal.items import OddsportalItem


class OddbotSpider(CrawlSpider):
    name = "oddbot"
    allowed_domains = ["oddsportal.com"]
    start_urls = (
        'http://www.oddsportal.com/matches/tennis/',
    )

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse, meta={
                'splash': {
                    'endpoint': 'render.html',
                    'args': {'wait': 5.5}
                }
            })

    def parse(self, response):
        item = OddsportalItem()
        print(response.body)
Try importing scrapy_splash and issuing the request through SplashRequest:
from scrapy_splash import SplashRequest
yield SplashRequest(url, endpoint='render.html', args={'any':any})
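Applied to the spider above, start_requests would then look roughly like this (same endpoint and wait value as in the original meta dict):

from scrapy_splash import SplashRequest

def start_requests(self):
    for url in self.start_urls:
        # SplashRequest replaces the manual 'splash' meta dict
        yield SplashRequest(url, callback=self.parse, endpoint='render.html', args={'wait': 5.5})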
You should also modify CrawlSpider's _requests_to_follow method so it accepts Splash responses as well:
from scrapy.http import HtmlResponse
from scrapy_splash import SplashJsonResponse, SplashTextResponse

def _requests_to_follow(self, response):
    if not isinstance(response, (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
        return
    seen = set()
    for n, rule in enumerate(self._rules):
        links = [lnk for lnk in rule.link_extractor.extract_links(response)
                 if lnk not in seen]
        if links and rule.process_links:
            links = rule.process_links(links)
        for link in links:
            seen.add(link)
            r = self._build_request(n, link)
            yield rule.process_request(r)
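Note that scrapy-splash also has to be wired into the project settings; the values below are the ones documented in the scrapy-splash README (SPLASH_URL should point at your own Splash instance):

# settings.py
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'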
