I'm working on a scraper using Scrapy. Here is the code:
import scrapy
from scrapy.exceptions import CloseSpider
class IrnaSpider(scrapy.Spider):
name = 'irna'
base_url = 'http://www.irna.ir/en/services/161'
next_page = 162
def start_requests(self):
yield scrapy.Request(self.base_url, meta={'page_number': 1})
def parse(self, response):
for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)
page_number = response.meta['page_number'] + 1
if response.css('#MoreButton'):
yield scrapy.Request('{}/page{}'.format(self.base_url, page_number),
callback=self.parse, meta={'page_number': page_number})
for next_article in ('/en/services/162/', '/en/services/163/', '/en/services/164/'):
yield response.follow(next_article, callback=self.parse)
def parse_article(self, response):
with open("irnadate.txt", "rt") as in_file:
irnadate = in_file.read()
articleday = ''.join(response.xpath('//*[#id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/.*/.*'))
articlemonth = ''.join(response.xpath('//*[#id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/(.*)/.*'))
articleyear = ''.join(response.xpath('//*[#id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/.*/(.*)'))
articletime = ''.join(response.xpath('//*[#id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)'))
articlestamp = articleyear + articlemonth + articleday + articletime
articlestampint = int(articlestamp)
irnadateint = int(irnadate)
if articlestampint <= irnadateint:
raise CloseSpider('duplicate article')
yield {
'date': ''.join(response.xpath('//*[#id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/(.*)/(.*)')),
'time': ''.join(response.xpath('//*[#id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)')),
'title': ''.join(response.xpath('//*[#id="col-3"]/div/div[1]/div/h1/text()').extract_first()),
'text': ''.join(response.xpath('//p[#id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]/text()').extract()),
'tags': [tag.strip() for tag in response.xpath('//div[#class="Tags"]/p/a/text()').extract() if tag.strip()]
}
I want it to only scrape links put up since the last time it was run, so every time it reads an article it compares its published date to the last time the program ran, and, if the article is older, it does not scrape it and kills the program.
The problem here is, there are multiple categories that are all being scraped at the same time with this code, and it's possible that I get to the an older article in one category before I go through all the new articles in another category.
Is it possible to raise something in order to kill just one instance of a function so that the scraper will be able to continue looking through other categories?
edit:
import scrapy
from scrapy.exceptions import CloseSpider
class IrnaSpider(scrapy.Spider):
name = 'irna'
base_urls = [
'http://www.irna.ir/en/services/161',
'http://www.irna.ir/en/services/162',
'http://www.irna.ir/en/services/163',
'http://www.irna.ir/en/services/164',
]
def start_requests(self):
for base_url in self.base_urls:
yield scrapy.Request(base_url, meta={'page_number': 1, 'base_url': base_url})
def parse(self, response):
with open("irnadate.txt", "rt") as in_file:
irnadate = in_file.read()
for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
articleday = ''.join(response.xpath('//*[#id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/.*/.*'))
articlemonth = ''.join(response.xpath('//*[#id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/(.*)/.*'))
articleyear = ''.join(response.xpath('//*[#id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/.*/(.*)'))
articletime = ''.join(response.xpath('//*[#id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)'))
articlestamp = articleyear + articlemonth + articleday + articletime
articlestampint = int(articlestamp)
irnadateint = int(irnadate)
if articlestampint <= irnadateint:
break
yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)
page_number = response.meta['page_number'] + 1
base_url = response.meta['base_url']
if response.css('#MoreButton'):
yield scrapy.Request('{}/page{}'.format(base_url, page_number),
callback=self.parse, meta={'page_number': page_number})
def parse_article(self, response):
yield {
'date': ''.join(response.xpath('//*[#id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/(.*)/(.*)')),
'time': ''.join(response.xpath('//*[#id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)')),
'title': ''.join(response.xpath('//*[#id="col-3"]/div/div[1]/div/h1/text()').extract_first()),
'text': ''.join(response.xpath('//p[#id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]/text()').extract()),
'tags': [tag.strip() for tag in response.xpath('//div[#class="Tags"]/p/a/text()').extract() if tag.strip()]
}
The issue with this is that it looks like I am not able to load an article before scraping it to determine its date.
You need some restructuring to your spider. One is that you should not use
for next_article in ('/en/services/162/', '/en/services/163/', '/en/services/164/'):
yield response.follow(next_article, callback=self.parse)
Because every time you are getting a result page you are running the same urls again and again. So they will be filtered anyways after next request. So you should use this in base_urls
base_urls = [
'http://www.irna.ir/en/services/161',
'http://www.irna.ir/en/services/162',
'http://www.irna.ir/en/services/163',
'http://www.irna.ir/en/services/164',
]
def start_requests(self):
for base_url in self.base_urls:
yield scrapy.Request(base_url, meta={'page_number': 1, 'base_url': base_url})
Next in your article parse you should get the date from results
def parse(self, response):
for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
# get the date for this article
# if the date is already extracted
date_already_processed = <-Get the date from result page->
if date_already_processed:
break
yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)
page_number = response.meta['page_number'] + 1
base_url = response.meta['base_url']
if response.css('#MoreButton'):
yield scrapy.Request('{}/page{}'.format(base_url, page_number),
callback=self.parse, meta={'page_number': page_number})
Related
I'm in reach of a personal milestone with scrapy. The aim is to properly understand the callback and cb_kwargs, I've read the documentation countless times but I learn best with visual code, practice and an explanation.
I have an example scraper, the aim is to grab the book name, price and go into each book page and extract a single piece of information. I'm trying to understand how to properly get information on the next few pages also, which I know is dependent on understanding the operation of callbacks.
When I run my script It returns results only for the first page, how do I get the additional pages?
Here's my scraper:
class BooksItem(scrapy.Item):
items = Field(output_processor = TakeFirst())
price = Field(output_processor = TakeFirst())
availability = Field(output_processor = TakeFirst())
class BookSpider(scrapy.Spider):
name = "books"
start_urls = ['https://books.toscrape.com']
def start_request(self):
for url in self.start_url:
yield scrapy.Request(
url,
callback = self.parse)
def parse(self, response):
data = response.xpath('//div[#class = "col-sm-8 col-md-9"]')
for books in data:
loader = ItemLoader(BooksItem(), selector = books)
loader.add_xpath('items','.//article[#class="product_pod"]/h3/a//text()')
loader.add_xpath('price','.//p[#class="price_color"]//text()')
for url in [books.xpath('.//a//#href').get()]:
yield scrapy.Request(
response.urljoin(url),
callback = self.parse_book,
cb_kwargs = {'loader':loader})
for next_page in [response.xpath('.//div/ul[#class="pager"]/li[#class="next"]/a//#href').get()]:
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
def parse_book(self, response, loader):
book_quote = response.xpath('//p[#class="instock availability"]//text()').get()
loader.add_value('availability', book_quote)
yield loader.load_item()
I believe the issue is with the part where I try to grab the next few pages. I have tried an alternative approach using the following:
def start_request(self):
for url in self.start_url:
yield scrapy.Request(
url,
callback = self.parse,
cb_kwargs = {'page_count':0}
)
def parse(self, response, next_page):
if page_count > 3:
return
...
...
page_count += 1
for next_page in [response.xpath('.//div/ul[#class="pager"]/li[#class="next"]/a//#href').get()]:
yield response.follow(next_page, callback=self.parse, cb_kwargs = {'page_count': page_count})
However, I get the following error with this approach:
TypeError: parse() missing 1 required positional argument: 'page_cntr'
It should be start_requests, and self.start_urls (inside the function).
get() will return the first result, what you want is getall() in order to return a list.
There is no need for a for loop for the "next_page" part, it's not a mistake just unnecessary.
In the line for url in books.xpath you're getting every url twice, again not a mistake but still...
Here data = response.xpath('//div[#class = "col-sm-8 col-md-9"]') you don't select the books one by one, you select the whole books container, you can check that len(data.getall()) == 1.
book_quote = response.xpath('//p[#class="instock availability"]//text()').get() will return \n, look at the source at try to find out why (hint: 'i' tag).
Compare your code to this and see what I changed:
import scrapy
from scrapy import Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
class BooksItem(scrapy.Item):
items = Field(output_processor=TakeFirst())
price = Field(output_processor=TakeFirst())
availability = Field(output_processor=TakeFirst())
class BookSpider(scrapy.Spider):
name = "books"
start_urls = ['https://books.toscrape.com']
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url,
callback=self.parse)
def parse(self, response):
data = response.xpath('//div[#class = "col-sm-8 col-md-9"]//li')
for books in data:
loader = ItemLoader(BooksItem(), selector=books)
loader.add_xpath('items', './/article[#class="product_pod"]/h3/a//text()')
loader.add_xpath('price', './/p[#class="price_color"]//text()')
for url in books.xpath('.//h3/a//#href').getall():
yield scrapy.Request(
response.urljoin(url),
callback=self.parse_book,
cb_kwargs={'loader': loader})
next_page = response.xpath('.//div/ul[#class="pager"]/li[#class="next"]/a//#href').get()
if next_page:
yield response.follow(next_page, callback=self.parse)
def parse_book(self, response, loader):
# option 1:
book_quote = response.xpath('//p[#class="instock availability"]/i/following-sibling::text()').get().strip()
# option 2:
# book_quote = ''.join(response.xpath('//div[contains(#class, "product_main")]//p[#class="instock availability"]//text()').getall()).strip()
loader.add_value('availability', book_quote)
yield loader.load_item()
I am trying to use python's scrappy to extract course catalog information from a website. The thing is, each course has a link to its full page and I need to iterate through those pages one by one to extract their information, which later, are fed to an SQL database. Anyhow, I don't know how to change the url's in the spider successively. here attached below is my code so far.
import scrapy
def find_between(s, first, last):
try:
start = s.index(first) + len(first)
end = s.index(last, start)
return s[start:end]
except ValueError:
return ""
class QuoteSpider(scrapy.Spider):
name = 'courses'
start_urls = [
'http://catalog.aucegypt.edu/content.php?catoid=36&navoid=1738',
]
def parse(self, response):
# pages in span+ a
all_courses = response.css('.width a')
for course in all_courses:
courseURL = course.xpath('#href').extract()
cleanCourseURL = find_between(str(courseURL), "['", "']")
fullURL = "http://catalog.aucegypt.edu/" + cleanCourseURL
#iterate through urls
QuoteSpider.start_urls += fullURL
courseName = response.css('.block_content')
yield {
'courseNum': fullURL,
'test': courseName
}
Usually you need to yield this new URL and process it with corresponding callback:
def parse(self, response):
# pages in span+ a
all_courses = response.css('.width a')
for course in all_courses:
courseURL = course.xpath('#href').extract()
cleanCourseURL = find_between(str(courseURL), "['", "']")
fullURL = "http://catalog.aucegypt.edu/" + cleanCourseURL
courseName = response.css('.block_content')
yield scrapy.Request(
url=fullURL,
callback=self.parse_course,
cb_kwargs={
'course_name': courseName,
},
)
def parse_course(self, response, course_name):
# parse you course here...
Here is my scrapy code.I dont know my mistake but in only scrapes first page.How can i scrape and traverse through pages ? Is there any other way for scraping next pages ?
import scrapy
class HurriyetEmlakPage(scrapy.Spider):
name = 'hurriyetspider'
allowed_domain = 'hurriyetemlak.com'
start_urls = ['https://www.hurriyetemlak.com/satilik']
def parse(self, response):
fiyat = response.xpath('//div[#class="list-view-price"]//text()').extract()
durum = response.xpath('//div[#class="middle sibling"]//div[#class="left"]//text()').extract()
oda_sayisi = response.xpath('//span[#class="celly houseRoomCount"]//text()').extract()
metrekare = response.xpath('//span[#class="celly squareMeter list-view-size"]//text()').extract()
bina_yasi = response.xpath('//span[#class="celly buildingAge"]//text()').extract()
bulundugu_kat = response.xpath('//span[#class="celly floortype"]//text()').extract()
konum = response.xpath('//div[#class="list-view-location"]//text()').extract()
scraped_info = {
'fiyat':fiyat,
'durum': durum,
'oda_sayisi' : oda_sayisi,
'metrekare' : metrekare,
'bina_yasi' : bina_yasi,
'bulundugu_kat': bulundugu_kat,
'konum' : konum
}
yield scraped_info
next_page_url = response.xpath('//li[#class="next-li pagi-nav"]//a').extract_first()
if next_page_url:
next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(url = next_page_url,callback = self.parse)
Actually, you could simply generate your url list like this :
url_list = [f"https://www.hurriyetemlak.com/satilik?page={page}" for page in range(1,7326)]
Output
['https://www.hurriyetemlak.com/satilik?page=1',
'https://www.hurriyetemlak.com/satilik?page=2',
'https://www.hurriyetemlak.com/satilik?page=3',
'https://www.hurriyetemlak.com/satilik?page=4',
'https://www.hurriyetemlak.com/satilik?page=5',
...]
I have a code.
import scrapy
import requests
class cvbankas(scrapy.Spider):
name ='bankas'
allowed_domains =['cvbankas.lt']
start_urls = ['https://www.cvbankas.lt/']
def parse(self,response):
job_position_tag = response.css("h3.list_h3::text").extract()
city_tag = response.css("span.list_city::text").extract()
company_tag = response.css("span.dib.mt5::text").extract()
salary_tag = response.css("span.salary_amount::text").extract()
for item in zip(job_position_tag,city_tag,company_tag,salary_tag):
scraped_info={
'company':company_tag,
'city': city_tag,
'position': job_position_tag,
'salary': salary_tag,
}
yield scraped_info
next_page = response.css('li > a::attr(href)').extract_first()
if next_page:
next_page = response.urljoin(next_page)
yield scrapy.Request(url = next_page, callback = self.parse)
And I don't know why it scrapes only 3 pages
Output marked in red is only 3 pages of 88
where's the problem in pagination?
Your selector was finding the first <a> tag he could find, which was the language <a> tag. You were changing languages not pages.
import scrapy
import requests
class cvbankas(scrapy.Spider):
name ='bankas'
allowed_domains =['cvbankas.lt']
start_urls = ['https://www.cvbankas.lt/']
def parse(self,response):
job_position_tag = response.css("h3.list_h3::text").extract()
city_tag = response.css("span.list_city::text").extract()
company_tag = response.css("span.dib.mt5::text").extract()
salary_tag = response.css("span.salary_amount::text").extract()
for item in zip(job_position_tag,city_tag,company_tag,salary_tag):
scraped_info={
'company':company_tag,
'city': city_tag,
'position': job_position_tag,
'salary': salary_tag,
}
yield scraped_info
next_page = response.xpath('//a[#class="prev_next"]/#href').extract()[-1]
if next_page:
next_page = response.urljoin(next_page)
yield scrapy.Request(url = next_page, callback = self.parse)
I looks like the website that you are scraping uses the url format uri?page=x
a simple loop to replace x can solve your problems.
i am trying to scrape amazon.com for the link of products that has more than 800 reviews but i keep getting the same page link from the next page button it keeps returning page 2 over and over again where i should get page 3,4 and so on
I HAVE SET A IF CONDITION TO SPILT AND CONVERT REVIEW STRING LIKE 1,020 TO INTEGER AND COMPARE IF GREATER THAN 800 OR NOT THEN BASED ON THAT VISIT THE PAGE
here is the code
# -*- coding: utf-8 -*-
import scrapy
from amazon.items import AmazonItem
from urlparse import urljoin
class AmazonspiderSpider(scrapy.Spider):
name = "amazonspider"
DOWNLOAD_DELAY = 1
start_urls = ['https://www.amazon.com/s/ref=lp_165993011_nr_n_0?fst=as%3Aoff&rh=n%3A165793011%2Cn%3A%21165795011%2Cn%3A165993011%2Cn%3A2514571011&bbn=165993011&ie=UTF8&qid=1493778423&rnid=165993011']
def parse(self, response):
SET_SELECTOR = '.a-carousel-card.acswidget-carousel__card'
for attr in response.css(SET_SELECTOR):
#print '\n\n', attr
item = AmazonItem()
review_selector = './/*[#class="acs_product-rating__review-count"]/text()'
link_selector = './/*[#class="a-link-normal"]/#href'
if attr.xpath(review_selector).extract_first():
if int(''.join(attr.xpath(review_selector).extract_first().split(','))) >= 800:
url = urljoin(response.url, attr.xpath(link_selector).extract_first())
item['LINKS'] = url
if url:
yield scrapy.Request(url, callback=self.parse_link, meta={'item': item})
next_page = './/span[#class="pagnRA"]/a[#id="pagnNextLink"]/#href'
next_page = response.xpath(next_page).extract_first()
print '\n\n', urljoin(response.url, next_page)
if next_page:
yield scrapy.Request(
urljoin(response.url, next_page),
callback=self.parse
)
def parse_link(self, response):
item = AmazonItem(response.meta['item'])
catselector = '.cat-link ::text'
defaultcatselector = '.nav-search-label ::text'
cat = response.css(catselector).extract_first()
if cat:
item['CATAGORY'] = cat
else:
item['CATAGORY'] = response.css(defaultcatselector).extract_first()
return item
here is the output when i was printing the next page link before calling the parse function recursively
and
and here is the screenshot from the next page selector of the page
where am i going wrong ?
Move the next page code block outside the loop.
class AmazonspiderSpider(scrapy.Spider):
name = "amazonspider"
DOWNLOAD_DELAY = 1
start_urls = ['https://www.amazon.com/s/ref=lp_165993011_nr_n_0?fst=as%3Aoff&rh=n%3A165793011%2Cn%3A%21165795011%2Cn%3A165993011%2Cn%3A2514571011&bbn=165993011&ie=UTF8&qid=1493778423&rnid=165993011']
def parse(self, response):
SET_SELECTOR = '.a-carousel-card.acswidget-carousel__card'
for attr in response.css(SET_SELECTOR):
#print '\n\n', attr
review_selector = './/*[#class="acs_product-rating__review-count"]/text()'
link_selector = './/*[#class="a-link-normal"]/#href'
if attr.xpath(review_selector).extract_first():
if int(''.join(attr.xpath(review_selector).extract_first().split(','))) >= 800:
url = urljoin(response.url, attr.xpath(link_selector).extract_first())
next_page = './/span[#class="pagnRA"]/a[#id="pagnNextLink"]/#href'
next_page = response.xpath(next_page).extract_first()
print '\n\n', urljoin(response.url, next_page)
if next_page:
yield scrapy.Request(
urljoin(response.url, next_page),
callback=self.parse
)