I am creating a Scrapy spider for the ASPX site https://www.realestate.moj.gov.kw/live/Moj_Rs_11.aspx. The spider needs to collect historical real estate data at scale. After entering the search parameters and submitting, the page redirects to another page where the data is displayed in a single table. The problem is that this is only a small portion of the data and the rest is on subsequent pages, so I need to paginate through the pages and collect the table from each one. I have been able to replicate the __doPostBack call with a Scrapy FormRequest, which lets me paginate through the pages consecutively, one after the other. My problem is when I need to start on a later page. For example, say I need to start the scraper on page 30: if I add the parameters to load page 30 directly, I get a 404 Not Found. How do I get around this error and load whichever page I need to start at?
Here is my code. Any help is appreciated, thanks!
from pprint import pprint as pp
import scrapy
from scrapy.utils.response import open_in_browser
from scrapy import FormRequest
import pandas as pd
from datetime import date
from os.path import exists
import json
import sys
import os
from scrapy import signals
class AspxspiderSpider(scrapy.Spider):
name = 'aspxSpider'
allowed_domains = ['www.realestate.moj.gov.kw']
# start_urls = ['https://www.realestate.moj.gov.kw/live/Moj_Rs_11.aspx']
url = 'https://www.realestate.moj.gov.kw/live/Moj_Rs_11.aspx'
page = 1
to_date = f'{date.today().day}/{date.today().month}/{date.today().year}'
path = os.getcwd().replace('\\', '/')
filename = f'{path}/results/{date.today().day}-{date.today().month}-{date.today().year}.parquet'
# def __init__(self, name=None, **kwargs):
# dispatcher.connect(self.spider_closed, signals.spider_closed)
# super().__init__(name, **kwargs)
# def spider_closed(self, spider):
# print('closing time closing time ')
# print(spider.filename)
# print('closing time closing time ')
def start_requests(self):
yield scrapy.Request(self.url, callback=self.step1)
def step1(self, response):
state = response.css('input#__VIEWSTATE::attr(value)').extract_first()
token = response.css(
'input#bodyContent_txtToken::attr(value)').extract_first()
validation = response.css(
'input#__EVENTVALIDATION::attr(value)').extract_first()
validation_gen = response.css(
'input#__VIEWSTATEGENERATOR::attr(value)').extract_first()
data = {
'ctl00$bodyContent$fromDate': '15/06/1993',
'ctl00$bodyContent$tillDate': self.to_date,
'ctl00$bodyContent$ddlGovernment': '-1',
'ctl00$bodyContent$cbGov': 'on',
'ctl00$bodyContent$hdnGoverment': '',
'ctl00$bodyContent$hdnZone': '',
'ctl00$bodyContent$hdncategory': '',
}
yield FormRequest.from_response(response, formdata=data, callback=self.step2)
def step2(self, response):
data = {
'ctl00$bodyContent$ddlZone': '-1',
'ctl00$bodyContent$cbZone': 'on',
}
yield FormRequest.from_response(response, formdata=data, callback=self.step3)
...
def step3(self, response):
data = {
'__EVENTTARGET': 'ctl00$bodyContent$gridList',
'__EVENTARGUMENT': 'Page$1',
'ctl00$bodyContent$ddlCategory': '-1',
'ctl00$bodyContent$cbCat': 'on',
'ctl00$bodyContent$btnSubmit': 'استعلام',
}
yield FormRequest.from_response(response, formdata=data, callback=self.parse_table)
...
def nxt_page(self, response):
print(f'Going to page {self.page}')
state = response.css('input#__VIEWSTATE::attr(value)').extract_first()
token = response.css(
'input#bodyContent_txtToken::attr(value)').extract_first()
validation = response.css(
'input#__EVENTVALIDATION::attr(value)').extract_first()
validation_gen = response.css(
'input#__VIEWSTATEGENERATOR::attr(value)').extract_first()
data = {
'__EVENTTARGET': 'ctl00$bodyContent$gridList',
'__EVENTARGUMENT': f'Page${self.page}',
'__VIEWSTATE': state,
'__VIEWSTATEGENERATOR': validation_gen,
'__EVENTVALIDATION': validation,
'ctl00$bodyContent$hdnGoverment': '',
'ctl00$bodyContent$hdnZone': '',
'ctl00$bodyContent$hdncategory': '',
}
yield FormRequest(
# url=self.url,
url='https://www.realestate.moj.gov.kw/live/Moj_Rs_11.aspx',
method='POST',
callback=self.parse_table,
formdata=data,
meta={'page': self.page}
)
def parse_table(self, response):
self.page += 1
# open_in_browser(response)
# pp(response.body.decode("utf-8"))
table = pd.read_html(response.body)
df = table[0]
if exists(self.filename):
df.to_parquet(self.filename, append=True, index=False)
...
else:
df.to_parquet(self.filename, index=False)
# if self.page == 6:
# sys.exit('Done')
return self.nxt_page(response)
...
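A possible direction, sketched under the assumption that the 404 comes from sending the Page$30 postback without the __VIEWSTATE/__EVENTVALIDATION of a freshly rendered results page: keep walking the pager forward exactly as nxt_page() already does, but only start persisting tables once the requested start page is reached. start_page below is a hypothetical spider argument (e.g. scrapy crawl aspxSpider -a start_page=30), and parse_table() is reworked accordingly:
# Sketch only: it reuses the spider's existing paging and simply skips saving early pages.
def parse_table(self, response):
    current_page = self.page                            # page whose table this response holds
    self.page += 1
    start_page = int(getattr(self, 'start_page', 1))    # hypothetical -a start_page argument
    if current_page >= start_page:                      # skip everything before the start page
        df = pd.read_html(response.text)[0]             # response.text is the decoded HTML
        if exists(self.filename):
            # append=True needs the fastparquet engine (pyarrow does not support appending)
            df.to_parquet(self.filename, engine='fastparquet', append=True, index=False)
        else:
            df.to_parquet(self.filename, index=False)
    return self.nxt_page(response)                      # keep paging; each response carries fresh state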
I wanted to scrape the feed of sitepoint.com; this is my code:
import scrapy
from urllib.parse import urljoin
class SitepointSpider(scrapy.Spider):
# TODO: Add url tags (like /javascript) to the spider based on class parameters
name = "sitepoint"
allowed_domains = ["sitepoint.com"]
start_urls = ["http://sitepoint.com/javascript/"]
def parse(self, response):
data = []
for article in response.css("article"):
title = article.css("a.t12xxw3g::text").get()
href = article.css("a.t12xxw3g::attr(href)").get()
img = article.css("img.f13hvvvv::attr(src)").get()
time = article.css("time::text").get()
url = urljoin("https://sitepoint.com", href)
text = scrapy.Request(url, callback=self.parse_article)
data.append(
{"title": title, "href": href, "img": img, "time": time, "text": text}
)
yield data
def parse_article(self, response):
text = response.xpath(
'//*[@id="main-content"]/article/div/div/div[1]/section/text()'
).extract()
yield text
And this is the response I get:
[{'title': 'How to Build an MVP with React and Firebase',
'href': '/react-firebase-build-mvp/',
'img': 'https://uploads.sitepoint.com/wp-content/uploads/2021/09/1632802723react-firebase-mvp-app.jpg',
'time': 'September 28, 2021',
'text': <GET https://sitepoint.com/react-firebase-build-mvp/>}]
It just does not scrape the article URLs. I followed everything said in this question but still could not make it work.
You have to visit the detail page from the listing page to scrape the article. In that case you have to yield the request for the detail URL first and then yield the data from the final callback.
Also, //*[@id="main-content"]/article/div/div/div[1]/section/text() won't return you any text, since there are lots of HTML elements under the section tag.
One solution is to scrape all the HTML inside the section tag and clean it later to get your article text.
Here is the full working code:
import re
import scrapy
from urllib.parse import urljoin
class SitepointSpider(scrapy.Spider):
# TODO: Add url tags (like /javascript) to the spider based on class parameters
name = "sitepoint"
allowed_domains = ["sitepoint.com"]
start_urls = ["http://sitepoint.com/javascript/"]
def clean_text(self, raw_html):
"""
:param raw_html: this will take raw html code
:return: text without html tags
"""
cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
return re.sub(cleaner, '', raw_html)
def parse(self, response):
for article in response.css("article"):
title = article.css("a.t12xxw3g::text").get()
href = article.css("a.t12xxw3g::attr(href)").get()
img = article.css("img.f13hvvvv::attr(src)").get()
time = article.css("time::text").get()
url = urljoin("https://sitepoint.com", href)
yield scrapy.Request(url, callback=self.parse_article, meta={"title": title,
"href": href,
"img": img,
"time": time})
def parse_article(self, response):
title = response.request.meta["title"]
href = response.request.meta["href"]
img = response.request.meta["img"]
time = response.request.meta["time"]
all_data = {}
article_html = response.xpath('//*[@id="main-content"]/article/div/div/div[1]/section').get()
all_data["title"] = title
all_data["href"] = href
all_data["img"] = img
all_data["time"] = time
all_data["text"] = self.clean_text(article_html)
yield all_data
My problem is the following: my spider has just successfully clicked a button inside the function parse_search_page(). In the function parse_identity() I am on the next page, where I can start scraping some information. But the variable response is of type SplashJsonResponse, which is not supported by xpath(), and response.body is of type bytes, which is not supported either.
The solutions to my problem that I think could work are:
Convert SplashJsonResponse to SplashTextResponse (which is an HTML response)
Use xpath on bytes
Convert scrapy_splash.response.SplashJsonResponse to scrapy.http.response.html.HtmlResponse
Code:
import scrapy
from scrapy.utils.response import open_in_browser
from scrapy_splash import SplashRequest
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = ['https://app.nominations.hospimedia.fr']
def parse(self, response):
# the function "callback" is called after you have logged in
return scrapy.FormRequest.from_response(
response,
formdata={'user[email]': 'XXX', 'user[password]': 'XXX'},
callback=self.parse_landing_page
)
def parse_landing_page(self, response):
# open webpage after logging in
#open_in_browser(response)
start_urls = 'https://app.nominations.hospimedia.fr'
# we extract the title
# title = response.xpath('//title/text()').extract()
print("hello1")
# check what extract() is for if we leave it out
next_page_partial_url = response.xpath('//div[#class="l-action l-action--small"]/a/#href').extract()
#print(next_page_partial_url)
next_page_url = start_urls + next_page_partial_url[0]
yield scrapy.Request(next_page_url, callback=self.parse_search_page)
def parse_search_page(self, response):
# if you click on the page below you know if your scrapy-splash is working
# http://localhost:8050/
script = '''
function main(splash, args)
splash:go(splash.args.url)
splash:runjs('document.getElementsByClassName("button tertiary")[0].click()')
return {
html = splash:html(),
}
end
'''
open_in_browser(response)
print("----------")
# scrapy.http.response.html.HtmlResponse
print(type(response))
print("------------")
#yield SplashRequest(response.request.url, callback = self.parse_identity, endpoint='execute', args={'lua_source': script})
yield SplashRequest(callback = self.parse_identity,
endpoint='execute',
args={'url':response.request.url,
'lua_source': script}
)
def parse_identity(self, response):
print("----------------------------------------")
# scrapy_splash.response.SplashJsonResponse
print(type(response))
# <class 'bytes'>
print(type(response.body))
print(response.body)
print(("----------------------------------------"))
next_page_partial_url = response.xpath('//div[#class="medium-6 small-12 columns"]/text()').extract()
#next_page_partial_url = response.xpath('//a[#rel="noopener noreferrer"]/text()').extract()
print(next_page_partial_url)
print(("----------------------------------------"))
#inspect_response(response, self)
#open_in_browser(response)
Actually, the solution is to use HtmlResponse:
from scrapy.http import HtmlResponse
html_response = HtmlResponse(url=response.url, body=response.text, encoding='utf-8')
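For example, a minimal sketch of how that conversion might be dropped into parse_identity() above, using the selector from the question; depending on what the Lua script returns, the HTML may live in response.data['html'] rather than response.text:
from scrapy.http import HtmlResponse

def parse_identity(self, response):
    # SplashJsonResponse exposes the decoded JSON as response.data;
    # fall back to response.text if the payload is already plain HTML
    raw_html = response.data.get('html') if hasattr(response, 'data') else response.text
    html_response = HtmlResponse(url=response.url, body=raw_html, encoding='utf-8')
    # the usual selectors work again on the rebuilt response
    columns = html_response.xpath('//div[@class="medium-6 small-12 columns"]/text()').extract()
    yield {'columns': columns}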
I want to submit the form with the five fields shown below. By submitting the form, I can get the redirection URL. I don't know where the issue is. Can anyone help me submit the form with the required info to get the next-page URL?
Code for your reference:
import requests
import scrapy
class QuotesSpider(scrapy.Spider):
name = "uksite"
login_url = 'https://myeplanning.oxfordshire.gov.uk/Disclaimer/Accept?returnUrl=%2FSearch%2FAdvanced'
start_urls = [login_url]
def parse(self, response):
token = response.css('input[name="__RequestVerificationToken"]::attr(value)').extract_first()
data = {'__RequestVerificationToken': token,
'DateReceivedFrom': '2021-04-07',
'DateReceivedTo': '2021-04-08',
'AdvancedSearch': 'True',
'SearchPlanning': 'True',
}
yield scrapy.FormRequest.from_response(response,
url=self.login_url,
formdata= data,
clickdata={'class': 'occlss-button occlss-button--primary decompress'},
callback = self.value,
)
def value(self, response):
print(response._url)
INPUT URL ==> https://myeplanning.oxfordshire.gov.uk/Disclaimer/Accept?returnUrl=%2FSearch%2FAdvanced
Output URL for the given input ==> https://myeplanning.oxfordshire.gov.uk/Planning/Display/MW.0047/21
Okay, this should do it. The key change is to post the search form directly to the /Search/Results endpoint with a plain FormRequest instead of re-submitting the disclaimer page.
class MyePlanningSpider(scrapy.Spider):
name = "myeplanning"
start_urls = ['https://myeplanning.oxfordshire.gov.uk/Disclaimer/Accept?returnUrl=%2FSearch%2FAdvanced']
login_url = 'https://myeplanning.oxfordshire.gov.uk/Search/Results'
def parse(self, response):
data = {
'__RequestVerificationToken': response.css('input[name="__RequestVerificationToken"]::attr(value)').get(),
'DateReceivedFrom': '2021-04-07',
'DateReceivedTo': '2021-04-08',
'AdvancedSearch': 'True',
'SearchPlanning': 'True',
}
yield scrapy.FormRequest(
url=self.login_url,
formdata= data,
callback=self.parse_value,
)
def parse_value(self, response):
print(response.url)
So, that was the question. I have a Scrapy bot that follows the internal links of a given site and writes the links, status codes, and anchor text into a database. But I'm struggling to grab each link's follow status. Is there any way to grab that rel=nofollow/dofollow information? Here is my code if anybody wonders:
class MySpider(CrawlSpider):
name = 'spydiiiii'
start_urls = [urlToScrape]
rules = (
Rule (
LxmlLinkExtractor(
allow=(urlToScrape),
deny=(
"google.com",
"facebook.com",
"pinterest.com",
"facebook.com",
"digg.com",
"twitter.com",
"stumbleupon.com",
"linkedin.com"
),
unique=True
),
callback="parse_items",
follow= True,
),
)
def parse_items(self, response):
sel = Selector(response)
items = []
item = InternallinkItem()
referring_url = response.request.headers.get('Referer').decode('utf-8')
item["referring_url"] = referring_url
anchor = response.meta.get('link_text')
item["anchor_text"] = " ".join(anchor.split())
item["current_url"] = response.url
item['status'] = response.status
items.append(item)
return items
Thanks in advance
I use LxmlLinkExtractor manually to get Link objects, which carry the nofollow information.
In parse() I get the links from the first page and create an item with the 'nofollow' (and other) information, then use a Request with this URL (and with the item in meta) to get the status and referer.
The new Request uses parse_item() to get the item from meta and add the status.
parse_item() also uses the extractor to get the new links on this page, create new items, and execute Requests with parse_item() again.
import scrapy
from scrapy.http import Request
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
class MySpider(scrapy.Spider):
name = 'myspider'
#allowed_domains = ['http://quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com']
#start_urls = ['http://127.0.0.1:5000/'] # for Flask example
extractor = LxmlLinkExtractor(
allow=('http://quotes.toscrape.com'),
#allow=('http://127.0.0.1:5000'), # for Flask example
deny=(
'google.com',
'facebook.com',
'pinterest.com',
'facebook.com',
'digg.com',
'twitter.com',
'stumbleupon.com',
'linkedin.com'
),
unique=True,
)
def parse(self, response):
print('parse url:', response.url)
# use LxmlLinkExtractor manually
for link in self.extractor.extract_links(response):
#print('link:', link)
item = {}
item['nofollow'] = link.nofollow
item['anchor_text'] = link.text
item['current_url'] = link.url
#item['referring_url'] = response.url
yield Request(link.url, meta={'item': item}, callback=self.parse_item)
def parse_item(self, response):
print('parse_item url:', response.url)
item = response.meta['item']
item['referring_url'] = response.request.headers.get('Referer')
#item['referring_url'] = response.request.url
item['status'] = response.status
yield item
# use LxmlLinkExtractor manually with new links
for link in self.extractor.extract_links(response):
#print('link:', link)
item = {}
item['nofollow'] = link.nofollow
item['anchor_text'] = link.text
item['current_url'] = link.url
#item['referring_url'] = response.url
yield Request(link.url, meta={'item': item}, callback=self.parse_item)
# --- run spider without project ---
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
'FEED_FORMAT': 'csv',
'FEED_URI': 'output.csv',
})
c.crawl(MySpider)
c.start()
EDIT:
Because I don't know of any page with rel="nofollow", I created a simple Flask app to test the code.
from flask import Flask
app = Flask(__name__)
@app.route('/')
def index():
# assumed markup: one plain link and one rel="nofollow" link, so the spider sees both cases
return '<a href="/test1">Test 1</a> | <a href="/test2" rel="nofollow">Test 2</a>'
@app.route('/test1')
def test1():
return 'Main Page'
@app.route('/test2')
def test2():
return 'Main Page'
if __name__ == '__main__':
app.run(debug=True)
I'm working on a scraper using Scrapy. Here is the code:
import scrapy
from scrapy.exceptions import CloseSpider
class IrnaSpider(scrapy.Spider):
name = 'irna'
base_url = 'http://www.irna.ir/en/services/161'
next_page = 162
def start_requests(self):
yield scrapy.Request(self.base_url, meta={'page_number': 1})
def parse(self, response):
for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)
page_number = response.meta['page_number'] + 1
if response.css('#MoreButton'):
yield scrapy.Request('{}/page{}'.format(self.base_url, page_number),
callback=self.parse, meta={'page_number': page_number})
for next_article in ('/en/services/162/', '/en/services/163/', '/en/services/164/'):
yield response.follow(next_article, callback=self.parse)
def parse_article(self, response):
with open("irnadate.txt", "rt") as in_file:
irnadate = in_file.read()
articleday = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/.*/.*'))
articlemonth = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/(.*)/.*'))
articleyear = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/.*/(.*)'))
articletime = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)'))
articlestamp = articleyear + articlemonth + articleday + articletime
articlestampint = int(articlestamp)
irnadateint = int(irnadate)
if articlestampint <= irnadateint:
raise CloseSpider('duplicate article')
yield {
'date': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/(.*)/(.*)')),
'time': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)')),
'title': ''.join(response.xpath('//*[@id="col-3"]/div/div[1]/div/h1/text()').extract_first()),
'text': ''.join(response.xpath('//p[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]/text()').extract()),
'tags': [tag.strip() for tag in response.xpath('//div[@class="Tags"]/p/a/text()').extract() if tag.strip()]
}
I want it to only scrape links put up since the last time it was run, so every time it reads an article it compares the published date to the last time the program ran and, if the article is older, it does not scrape it and kills the program.
The problem here is that there are multiple categories all being scraped at the same time with this code, and it's possible that I get to an older article in one category before I go through all the new articles in another category.
Is it possible to raise something that kills just one instance of a function, so that the scraper can continue looking through the other categories?
edit:
import scrapy
from scrapy.exceptions import CloseSpider
class IrnaSpider(scrapy.Spider):
name = 'irna'
base_urls = [
'http://www.irna.ir/en/services/161',
'http://www.irna.ir/en/services/162',
'http://www.irna.ir/en/services/163',
'http://www.irna.ir/en/services/164',
]
def start_requests(self):
for base_url in self.base_urls:
yield scrapy.Request(base_url, meta={'page_number': 1, 'base_url': base_url})
def parse(self, response):
with open("irnadate.txt", "rt") as in_file:
irnadate = in_file.read()
for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
articleday = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/.*/.*'))
articlemonth = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/(.*)/.*'))
articleyear = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/.*/(.*)'))
articletime = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)'))
articlestamp = articleyear + articlemonth + articleday + articletime
articlestampint = int(articlestamp)
irnadateint = int(irnadate)
if articlestampint <= irnadateint:
break
yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)
page_number = response.meta['page_number'] + 1
base_url = response.meta['base_url']
if response.css('#MoreButton'):
yield scrapy.Request('{}/page{}'.format(base_url, page_number),
callback=self.parse, meta={'page_number': page_number})
def parse_article(self, response):
yield {
'date': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/(.*)/(.*)')),
'time': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)')),
'title': ''.join(response.xpath('//*[@id="col-3"]/div/div[1]/div/h1/text()').extract_first()),
'text': ''.join(response.xpath('//p[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]/text()').extract()),
'tags': [tag.strip() for tag in response.xpath('//div[@class="Tags"]/p/a/text()').extract() if tag.strip()]
}
The issue with this is that it seems I cannot determine an article's date without loading the article page first.
Your spider needs some restructuring. First, you should not use
for next_article in ('/en/services/162/', '/en/services/163/', '/en/services/164/'):
yield response.follow(next_article, callback=self.parse)
because every time you get a result page you request the same URLs again and again, so they will be filtered out anyway after the next request. Instead, you should put them in base_urls:
base_urls = [
'http://www.irna.ir/en/services/161',
'http://www.irna.ir/en/services/162',
'http://www.irna.ir/en/services/163',
'http://www.irna.ir/en/services/164',
]
def start_requests(self):
for base_url in self.base_urls:
yield scrapy.Request(base_url, meta={'page_number': 1, 'base_url': base_url})
Next, in your parse() you should get the date from the results page:
def parse(self, response):
for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
# get the date for this article
# if the date is already extracted
date_already_processed = <-Get the date from result page->
if date_already_processed:
break
yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)
page_number = response.meta['page_number'] + 1
base_url = response.meta['base_url']
if response.css('#MoreButton'):
yield scrapy.Request('{}/page{}'.format(base_url, page_number),
callback=self.parse, meta={'page_number': page_number})
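For completeness, a hedged sketch of what the <-Get the date from result page-> placeholder could look like. The per-row selector and the date formats below are pure assumptions (the listing-page markup is not shown in the question); the comparison mirrors the irnadate.txt check the asker already performs on the article page.
import scrapy
from datetime import datetime

def parse(self, response):
    # timestamp of the last run, stored by the previous crawl
    # (the "%Y%m%d%H%M" format is an assumption here)
    with open("irnadate.txt", "rt") as in_file:
        last_run = datetime.strptime(in_file.read().strip(), "%Y%m%d%H%M")

    for row in response.css('.DataListContainer'):                      # hypothetical: one node per listing row
        article_url = row.css('h3 a::attr(href)').get()
        raw_date = row.css('.date::text').get(default='').strip()       # hypothetical date element on the listing
        try:
            article_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M")  # assumed display format
        except ValueError:
            article_date = None
        if article_date is not None and article_date <= last_run:
            break                                                        # older item: stop this category only
        yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)

    page_number = response.meta['page_number'] + 1
    base_url = response.meta['base_url']
    if response.css('#MoreButton'):
        yield scrapy.Request('{}/page{}'.format(base_url, page_number),
                             callback=self.parse, meta={'page_number': page_number})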