Scraping websites with Scrapy - python

I have this code:
import scrapy
import requests

class cvbankas(scrapy.Spider):
    name = 'bankas'
    allowed_domains = ['cvbankas.lt']
    start_urls = ['https://www.cvbankas.lt/']

    def parse(self, response):
        job_position_tag = response.css("h3.list_h3::text").extract()
        city_tag = response.css("span.list_city::text").extract()
        company_tag = response.css("span.dib.mt5::text").extract()
        salary_tag = response.css("span.salary_amount::text").extract()
        for item in zip(job_position_tag, city_tag, company_tag, salary_tag):
            scraped_info = {
                'company': company_tag,
                'city': city_tag,
                'position': job_position_tag,
                'salary': salary_tag,
            }
            yield scraped_info
        next_page = response.css('li > a::attr(href)').extract_first()
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(url=next_page, callback=self.parse)
And I don't know why it scrapes only 3 pages.
The output (marked in red) covers only 3 of the 88 pages.
Where is the problem in the pagination?

Your selector was finding the first <a> tag it could find, which was the language <a> tag. You were changing languages, not pages.
import scrapy

class cvbankas(scrapy.Spider):
    name = 'bankas'
    allowed_domains = ['cvbankas.lt']
    start_urls = ['https://www.cvbankas.lt/']

    def parse(self, response):
        job_position_tag = response.css("h3.list_h3::text").extract()
        city_tag = response.css("span.list_city::text").extract()
        company_tag = response.css("span.dib.mt5::text").extract()
        salary_tag = response.css("span.salary_amount::text").extract()
        # yield one record per listing, using the values from the zipped tuple
        for position, city, company, salary in zip(job_position_tag, city_tag, company_tag, salary_tag):
            scraped_info = {
                'company': company,
                'city': city,
                'position': position,
                'salary': salary,
            }
            yield scraped_info
        # follow the pagination link instead of the first <a> on the page
        next_page = response.xpath('//a[@class="prev_next"]/@href').extract()[-1]
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(url=next_page, callback=self.parse)

It looks like the website you are scraping uses the URL format uri?page=x, so a simple loop to replace x can solve your problem.
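For example (an untested sketch: the exact query parameter and the page count are assumptions based on the question's "88 pages", not verified against the site), the pages could be generated up front instead of followed one by one:

import scrapy

class CvbankasPagesSpider(scrapy.Spider):
    name = 'bankas_pages'
    allowed_domains = ['cvbankas.lt']
    # assumption: 88 result pages, reachable via a ?page=N query parameter
    start_urls = [f'https://www.cvbankas.lt/?page={n}' for n in range(1, 89)]

    def parse(self, response):
        # same selectors as in the question, one item per listing row
        for position, city, company, salary in zip(
                response.css("h3.list_h3::text").extract(),
                response.css("span.list_city::text").extract(),
                response.css("span.dib.mt5::text").extract(),
                response.css("span.salary_amount::text").extract()):
            yield {'position': position, 'city': city, 'company': company, 'salary': salary}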

Related

next page crawl in Scrapy

I am trying to get some data from the website, but my spider is not crawling to the next page even though there is a proper pagination link.
import scrapy

class NspiderSpider(scrapy.Spider):
    name = "nspider"
    allowed_domains = ["elimelechlab.yale.edu/"]
    start_urls = ["https://elimelechlab.yale.edu/pub"]

    def parse(self, response):
        title = response.xpath(
            '//*[@class="views-field views-field-title"]/span/text()'
        ).extract()
        doi_link = response.xpath(
            '//*[@class="views-field views-field-field-doi-link"]//a[1]/@href'
        ).extract()
        yield {"paper_title": title, "doi_link": doi_link}

        next_page = response.xpath(
            '//*[@title="Go to next page"]/@href'
        ).extract_first()  # extracting next page link
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
PS: I don't want to use LinkExtractor.
Any help would be appreciated.
There is nothing wrong with your next_page logic; the code is just not reaching it because the yield for the item sits at the same indentation level. Try the following approach:
import scrapy

class NspiderSpider(scrapy.Spider):
    name = "nspider"
    allowed_domains = ["elimelechlab.yale.edu"]
    start_urls = ["https://elimelechlab.yale.edu/pub"]

    def parse(self, response):
        for view in response.css('div.views-row'):
            yield {
                'paper_title': view.css('div.views-field-title span.field-content::text').get(),
                'doi_link': view.css('div.views-field-field-doi-link div.field-content a::attr(href)').get()
            }

        next_page = response.xpath(
            '//*[@title="Go to next page"]/@href'
        ).extract_first()  # extracting next page link
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)

How can I Scrape next pages with Scrapy

Here is my Scrapy code. I don't know my mistake, but it only scrapes the first page. How can I scrape and traverse through the pages? Is there any other way of scraping the next pages?
import scrapy

class HurriyetEmlakPage(scrapy.Spider):
    name = 'hurriyetspider'
    allowed_domain = 'hurriyetemlak.com'
    start_urls = ['https://www.hurriyetemlak.com/satilik']

    def parse(self, response):
        fiyat = response.xpath('//div[@class="list-view-price"]//text()').extract()
        durum = response.xpath('//div[@class="middle sibling"]//div[@class="left"]//text()').extract()
        oda_sayisi = response.xpath('//span[@class="celly houseRoomCount"]//text()').extract()
        metrekare = response.xpath('//span[@class="celly squareMeter list-view-size"]//text()').extract()
        bina_yasi = response.xpath('//span[@class="celly buildingAge"]//text()').extract()
        bulundugu_kat = response.xpath('//span[@class="celly floortype"]//text()').extract()
        konum = response.xpath('//div[@class="list-view-location"]//text()').extract()

        scraped_info = {
            'fiyat': fiyat,
            'durum': durum,
            'oda_sayisi': oda_sayisi,
            'metrekare': metrekare,
            'bina_yasi': bina_yasi,
            'bulundugu_kat': bulundugu_kat,
            'konum': konum
        }
        yield scraped_info

        next_page_url = response.xpath('//li[@class="next-li pagi-nav"]//a').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
Actually, you could simply generate your URL list like this:
url_list = [f"https://www.hurriyetemlak.com/satilik?page={page}" for page in range(1,7326)]
Output
['https://www.hurriyetemlak.com/satilik?page=1',
'https://www.hurriyetemlak.com/satilik?page=2',
'https://www.hurriyetemlak.com/satilik?page=3',
'https://www.hurriyetemlak.com/satilik?page=4',
'https://www.hurriyetemlak.com/satilik?page=5',
...]
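A minimal sketch of wiring that list into the spider (assuming the ?page=N URLs all render the same listing markup; not tested against the live site):

import scrapy

class HurriyetEmlakPage(scrapy.Spider):
    name = 'hurriyetspider'
    allowed_domains = ['hurriyetemlak.com']
    # hand the pre-built page URLs straight to Scrapy; no next-page logic needed
    start_urls = [f"https://www.hurriyetemlak.com/satilik?page={page}" for page in range(1, 7326)]

    def parse(self, response):
        # same field extraction as in the question's parse(); one field shown for brevity
        yield {'fiyat': response.xpath('//div[@class="list-view-price"]//text()').extract()}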

How to kill single instance of function in python

I'm working on a scraper using Scrapy. Here is the code:
import scrapy
from scrapy.exceptions import CloseSpider

class IrnaSpider(scrapy.Spider):
    name = 'irna'
    base_url = 'http://www.irna.ir/en/services/161'
    next_page = 162

    def start_requests(self):
        yield scrapy.Request(self.base_url, meta={'page_number': 1})

    def parse(self, response):
        for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)

        page_number = response.meta['page_number'] + 1
        if response.css('#MoreButton'):
            yield scrapy.Request('{}/page{}'.format(self.base_url, page_number),
                                 callback=self.parse, meta={'page_number': page_number})

        for next_article in ('/en/services/162/', '/en/services/163/', '/en/services/164/'):
            yield response.follow(next_article, callback=self.parse)

    def parse_article(self, response):
        with open("irnadate.txt", "rt") as in_file:
            irnadate = in_file.read()

        articleday = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/.*/.*'))
        articlemonth = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/(.*)/.*'))
        articleyear = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/.*/(.*)'))
        articletime = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)'))

        articlestamp = articleyear + articlemonth + articleday + articletime
        articlestampint = int(articlestamp)
        irnadateint = int(irnadate)
        if articlestampint <= irnadateint:
            raise CloseSpider('duplicate article')

        yield {
            'date': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/(.*)/(.*)')),
            'time': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)')),
            'title': ''.join(response.xpath('//*[@id="col-3"]/div/div[1]/div/h1/text()').extract_first()),
            'text': ''.join(response.xpath('//p[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]/text()').extract()),
            'tags': [tag.strip() for tag in response.xpath('//div[@class="Tags"]/p/a/text()').extract() if tag.strip()]
        }
I want it to only scrape links put up since the last time it was run, so every time it reads an article it compares its published date to the last time the program ran, and, if the article is older, it does not scrape it and kills the program.
The problem here is that multiple categories are all being scraped at the same time with this code, and it's possible that I reach an older article in one category before I have gone through all the new articles in another category.
Is it possible to raise something in order to kill just one instance of a function so that the scraper will be able to continue looking through other categories?
edit:
import scrapy
from scrapy.exceptions import CloseSpider

class IrnaSpider(scrapy.Spider):
    name = 'irna'
    base_urls = [
        'http://www.irna.ir/en/services/161',
        'http://www.irna.ir/en/services/162',
        'http://www.irna.ir/en/services/163',
        'http://www.irna.ir/en/services/164',
    ]

    def start_requests(self):
        for base_url in self.base_urls:
            yield scrapy.Request(base_url, meta={'page_number': 1, 'base_url': base_url})

    def parse(self, response):
        with open("irnadate.txt", "rt") as in_file:
            irnadate = in_file.read()

        for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
            articleday = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/.*/.*'))
            articlemonth = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/(.*)/.*'))
            articleyear = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/.*/(.*)'))
            articletime = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)'))

            articlestamp = articleyear + articlemonth + articleday + articletime
            articlestampint = int(articlestamp)
            irnadateint = int(irnadate)
            if articlestampint <= irnadateint:
                break

            yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)

        page_number = response.meta['page_number'] + 1
        base_url = response.meta['base_url']
        if response.css('#MoreButton'):
            yield scrapy.Request('{}/page{}'.format(base_url, page_number),
                                 callback=self.parse, meta={'page_number': page_number})

    def parse_article(self, response):
        yield {
            'date': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/(.*)/(.*)')),
            'time': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)')),
            'title': ''.join(response.xpath('//*[@id="col-3"]/div/div[1]/div/h1/text()').extract_first()),
            'text': ''.join(response.xpath('//p[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]/text()').extract()),
            'tags': [tag.strip() for tag in response.xpath('//div[@class="Tags"]/p/a/text()').extract() if tag.strip()]
        }
The issue with this is that it looks like I am not able to load an article before scraping it to determine its date.
Your spider needs some restructuring. First, you should not use
for next_article in ('/en/services/162/', '/en/services/163/', '/en/services/164/'):
    yield response.follow(next_article, callback=self.parse)
because every result page would queue the same category URLs again and again; they would simply be filtered out as duplicates after the first request. Instead, put them in base_urls:
base_urls = [
    'http://www.irna.ir/en/services/161',
    'http://www.irna.ir/en/services/162',
    'http://www.irna.ir/en/services/163',
    'http://www.irna.ir/en/services/164',
]

def start_requests(self):
    for base_url in self.base_urls:
        yield scrapy.Request(base_url, meta={'page_number': 1, 'base_url': base_url})
Next, in your parse method you should read the date directly from the result page:
def parse(self, response):
    for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
        # get the date for this article
        # if the date is already extracted
        date_already_processed = <-Get the date from result page->
        if date_already_processed:
            break
        yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)

    page_number = response.meta['page_number'] + 1
    base_url = response.meta['base_url']
    if response.css('#MoreButton'):
        yield scrapy.Request('{}/page{}'.format(base_url, page_number),
                             callback=self.parse, meta={'page_number': page_number})
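For example, a sketch of that check (the row container and date selectors below are hypothetical, since the listing markup is not shown in the question, and it assumes irnadate.txt holds a sortable YYYYMMDD-style stamp):

def parse(self, response):
    with open("irnadate.txt", "rt") as in_file:
        last_run_stamp = in_file.read().strip()

    # iterate per listing row so each article's date stays paired with its link
    for row in response.css('.DataListContainer'):                # hypothetical row selector
        article_url = row.css('h3 a::attr(href)').extract_first()
        parts = row.css('.date::text').re(r'(\d+)/(\d+)/(\d+)')   # hypothetical date node, d/m/y
        if parts:
            day, month, year = parts
            if year + month + day <= last_run_stamp:
                break  # older than the last run: stop this category, the others keep going
        yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)
    # pagination continues exactly as in the snippet above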

Use scrapy to get list of urls, and then scrape content inside those urls

I need a Scrapy spider to scrape the following page (https://www.phidgets.com/?tier=1&catid=64&pcid=57) for each product URL (30 products, so 30 URLs) and then follow each of those URLs and scrape the data inside.
I have the second part working exactly as I want:
import scrapy

class ProductsSpider(scrapy.Spider):
    name = "products"
    start_urls = [
        'https://www.phidgets.com/?tier=1&catid=64&pcid=57',
    ]

    def parse(self, response):
        for info in response.css('div.ph-product-container'):
            yield {
                'product_name': info.css('h2.ph-product-name::text').extract_first(),
                'product_image': info.css('div.ph-product-img-ctn a').xpath('@href').extract(),
                'sku': info.css('span.ph-pid').xpath('@prod-sku').extract_first(),
                'short_description': info.css('div.ph-product-summary::text').extract_first(),
                'price': info.css('h2.ph-product-price > span.price::text').extract_first(),
                'long_description': info.css('div#product_tab_1').extract_first(),
                'specs': info.css('div#product_tab_2').extract_first(),
            }

            # next_page = response.css('div.ph-summary-entry-ctn a::attr("href")').extract_first()
            # if next_page is not None:
            #     yield response.follow(next_page, self.parse)
But I don't know how to do the first part. As you will see I have the main page (https://www.phidgets.com/?tier=1&catid=64&pcid=57) set as the start_url. But how do I get it to populate the start_urls list with all 30 urls I need crawled?
I am not able to test at this moment, so please let me know if this works for you, and I will edit it should there be any bugs.
The idea here is that we find every product link on the first page and yield new Scrapy requests, passing your product-parsing method as the callback:
import scrapy
from urllib.parse import urljoin

class ProductsSpider(scrapy.Spider):
    name = "products"
    start_urls = [
        'https://www.phidgets.com/?tier=1&catid=64&pcid=57',
    ]

    def parse(self, response):
        products = response.xpath("//*[contains(@class, 'ph-summary-entry-ctn')]/a/@href").extract()
        for p in products:
            url = urljoin(response.url, p)
            yield scrapy.Request(url, callback=self.parse_product)

    def parse_product(self, response):
        for info in response.css('div.ph-product-container'):
            yield {
                'product_name': info.css('h2.ph-product-name::text').extract_first(),
                'product_image': info.css('div.ph-product-img-ctn a').xpath('@href').extract(),
                'sku': info.css('span.ph-pid').xpath('@prod-sku').extract_first(),
                'short_description': info.css('div.ph-product-summary::text').extract_first(),
                'price': info.css('h2.ph-product-price > span.price::text').extract_first(),
                'long_description': info.css('div#product_tab_1').extract_first(),
                'specs': info.css('div#product_tab_2').extract_first(),
            }
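As a small variation, response.follow (available since Scrapy 1.4) resolves relative URLs itself, so the urljoin import can be dropped:

def parse(self, response):
    # response.follow joins each relative href against response.url for us
    for href in response.xpath("//*[contains(@class, 'ph-summary-entry-ctn')]/a/@href").extract():
        yield response.follow(href, callback=self.parse_product)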

Scrape nested URLs using Scrapy

I am trying to scrape this web page:
https://www.grohe.com/in/7780/bathroom/bathroom-faucets/essence/
I tried different ways, but every time it gives me a syntax error. I don't know much Python or Scrapy. Can anyone help me?
My requirements are:
In the header section of the page, there is a background image, a description, and 2 product-related images.
In the Product Range section there are a number of images. I would like to go through all of them and scrape the individual product details.
Here is my code so far:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "plumber"
    start_urls = [
        'https://www.grohe.com/in/7780/bathroom/bathroom-faucets/essence/',
    ]

    def parse(self, response):
        for divs in response.css('div#product-variants div.viewport div.workspace div.float-box'):
            yield {
                #response.css('div#product-variants a::attr(href)').extract()
                'producturl': divs.css('a::attr(href)').extract(),
                'imageurl': divs.css('a img::attr(src)').extract(),
                'description': divs.css('a div.text::text').extract() + divs.css('a span.nowrap::text').extract(),
                next_page = producturl
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse)
            }
You should take the next_page yield out of your item.
In general, you can iterate through the products, build a partial item, and carry it over in your request's meta parameter, like so:
def parse(self, response):
    for divs in response.css('div#product-variants div.viewport div.workspace div.float-box'):
        item = {'producturl': divs.css('a::attr(href)').extract_first(),  # a single URL, not a list
                'imageurl': divs.css('a img::attr(src)').extract(),
                'description': divs.css('a div.text::text').extract() + divs.css('a span.nowrap::text').extract()}
        next_page = response.urljoin(item['producturl'])
        yield scrapy.Request(next_page, callback=self.parse_page, meta={'item': item})

def parse_page(self, response):
    """This is the individual product page"""
    item = response.meta['item']
    item['something_new'] = 'some_value'
    return item
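On Scrapy 1.7 and later, the same hand-off can also be done with cb_kwargs instead of meta, which keeps the callback signature explicit; a minimal sketch of the same flow:

def parse(self, response):
    for divs in response.css('div#product-variants div.viewport div.workspace div.float-box'):
        item = {'producturl': divs.css('a::attr(href)').extract_first()}
        yield scrapy.Request(response.urljoin(item['producturl']),
                             callback=self.parse_page, cb_kwargs={'item': item})

def parse_page(self, response, item):
    """Individual product page; the partial item arrives as a keyword argument."""
    item['something_new'] = 'some_value'
    return item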
