Scrapy form not submitting properly - python

I want to submit the form with the five fields shown below. Submitting the form should give me the redirection URL, but I don't know where the issue is. Can anyone help me submit the form with the required info so I can get the next-page URL?
Code for reference:
import requests
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "uksite"
    login_url = 'https://myeplanning.oxfordshire.gov.uk/Disclaimer/Accept?returnUrl=%2FSearch%2FAdvanced'
    start_urls = [login_url]

    def parse(self, response):
        token = response.css('input[name="__RequestVerificationToken"]::attr(value)').extract_first()
        data = {
            '__RequestVerificationToken': token,
            'DateReceivedFrom': '2021-04-07',
            'DateReceivedTo': '2021-04-08',
            'AdvancedSearch': 'True',
            'SearchPlanning': 'True',
        }
        yield scrapy.FormRequest.from_response(
            response,
            url=self.login_url,
            formdata=data,
            clickdata={'class': 'occlss-button occlss-button--primary decompress'},
            callback=self.value,
        )

    def value(self, response):
        print(response._url)
INPUT URL ==> https://myeplanning.oxfordshire.gov.uk/Disclaimer/Accept?returnUrl=%2FSearch%2FAdvanced
Output URL for the given input ==> https://myeplanning.oxfordshire.gov.uk/Planning/Display/MW.0047/21

Okay, this should do it.
class MyePlanningSpider(scrapy.Spider):
    name = "myeplanning"
    start_urls = ['https://myeplanning.oxfordshire.gov.uk/Disclaimer/Accept?returnUrl=%2FSearch%2FAdvanced']
    login_url = 'https://myeplanning.oxfordshire.gov.uk/Search/Results'

    def parse(self, response):
        data = {
            '__RequestVerificationToken': response.css('input[name="__RequestVerificationToken"]::attr(value)').get(),
            'DateReceivedFrom': '2021-04-07',
            'DateReceivedTo': '2021-04-08',
            'AdvancedSearch': 'True',
            'SearchPlanning': 'True',
        }
        yield scrapy.FormRequest(
            url=self.login_url,
            formdata=data,
            callback=self.parse_value,
        )

    def parse_value(self, response):
        print(response.url)
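The key difference from the original spider is that the anti-forgery token is read from the disclaimer page and then POSTed as a plain scrapy.FormRequest straight to the Search/Results endpoint, so no clickdata matching is needed. If you then want to pull the individual application links out of the results page, a minimal sketch (the selector is an assumption about the results markup, not something confirmed against the site):

    def parse_value(self, response):
        # response is the /Search/Results page; the selector below is a guess
        # at the results markup and may need adjusting against the real HTML.
        for href in response.css('a::attr(href)').getall():
            if '/Planning/Display/' in href:
                yield {'application_url': response.urljoin(href)}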

Related

Paginating through ASPX site using Scrapy

I am creating a Scrapy spider for the ASPX site https://www.realestate.moj.gov.kw/live/Moj_Rs_11.aspx. The code needs to collect historical real estate data at scale. After entering the search parameters and submitting, the page redirects to another page where the data is displayed in one table. The problem is that this is only a small portion of the data; the rest is on subsequent pages. I need to paginate through the pages and collect the table from each one. I have been able to replicate the postback (__doPostBack) call with a Scrapy FormRequest, which lets me paginate through the pages consecutively, one page after the other. My problem is when I need to start on a later page. For example, say I need to start the scraper on page 30: if I add the parameters to load page 30 directly, I get a 404 Not Found. How do I get around this error and load whichever page I need to start at?
Here is my code. Any help is appreciated, thanks!
from pprint import pprint as pp
import scrapy
from scrapy.utils.response import open_in_browser
from scrapy import FormRequest
import pandas as pd
from datetime import date
from os.path import exists
import json
import sys
import os
from scrapy import signals


class AspxspiderSpider(scrapy.Spider):
    name = 'aspxSpider'
    allowed_domains = ['www.realestate.moj.gov.kw']
    # start_urls = ['https://www.realestate.moj.gov.kw/live/Moj_Rs_11.aspx']
    url = 'https://www.realestate.moj.gov.kw/live/Moj_Rs_11.aspx'
    page = 1
    to_date = f'{date.today().day}/{date.today().month}/{date.today().year}'
    path = os.getcwd().replace('\\', '/')
    filename = f'{path}/results/{date.today().day}-{date.today().month}-{date.today().year}.parquet'

    # def __init__(self, name=None, **kwargs):
    #     dispatcher.connect(self.spider_closed, signals.spider_closed)
    #     super().__init__(name, **kwargs)

    # def spider_closed(self, spider):
    #     print('closing time closing time ')
    #     print(spider.filename)
    #     print('closing time closing time ')

    def start_requests(self):
        yield scrapy.Request(self.url, callback=self.step1)

    def step1(self, response):
        state = response.css('input#__VIEWSTATE::attr(value)').extract_first()
        token = response.css(
            'input#bodyContent_txtToken::attr(value)').extract_first()
        validation = response.css(
            'input#__EVENTVALIDATION::attr(value)').extract_first()
        validation_gen = response.css(
            'input#__VIEWSTATEGENERATOR::attr(value)').extract_first()
        data = {
            'ctl00$bodyContent$fromDate': '15/06/1993',
            'ctl00$bodyContent$tillDate': self.to_date,
            'ctl00$bodyContent$ddlGovernment': '-1',
            'ctl00$bodyContent$cbGov': 'on',
            'ctl00$bodyContent$hdnGoverment': '',
            'ctl00$bodyContent$hdnZone': '',
            'ctl00$bodyContent$hdncategory': '',
        }
        yield FormRequest.from_response(response, formdata=data, callback=self.step2)

    def step2(self, response):
        data = {
            'ctl00$bodyContent$ddlZone': '-1',
            'ctl00$bodyContent$cbZone': 'on',
        }
        yield FormRequest.from_response(response, formdata=data, callback=self.step3)

    def step3(self, response):
        data = {
            '__EVENTTARGET': 'ctl00$bodyContent$gridList',
            '__EVENTARGUMENT': 'Page$1',
            'ctl00$bodyContent$ddlCategory': '-1',
            'ctl00$bodyContent$cbCat': 'on',
            'ctl00$bodyContent$btnSubmit': 'استعلام',
        }
        yield FormRequest.from_response(response, formdata=data, callback=self.parse_table)

    def nxt_page(self, response):
        print(f'Going to page {self.page}')
        state = response.css('input#__VIEWSTATE::attr(value)').extract_first()
        token = response.css(
            'input#bodyContent_txtToken::attr(value)').extract_first()
        validation = response.css(
            'input#__EVENTVALIDATION::attr(value)').extract_first()
        validation_gen = response.css(
            'input#__VIEWSTATEGENERATOR::attr(value)').extract_first()
        data = {
            '__EVENTTARGET': 'ctl00$bodyContent$gridList',
            '__EVENTARGUMENT': f'Page${self.page}',
            '__VIEWSTATE': state,
            '__VIEWSTATEGENERATOR': validation_gen,
            '__EVENTVALIDATION': validation,
            'ctl00$bodyContent$hdnGoverment': '',
            'ctl00$bodyContent$hdnZone': '',
            'ctl00$bodyContent$hdncategory': '',
        }
        yield FormRequest(
            # url=self.url,
            url='https://www.realestate.moj.gov.kw/live/Moj_Rs_11.aspx',
            method='POST',
            callback=self.parse_table,
            formdata=data,
            meta={'page': self.page}
        )

    def parse_table(self, response):
        self.page += 1
        # open_in_browser(response)
        # pp(response.body.decode("utf-8"))
        table = pd.read_html(response.body)
        df = table[0]
        if exists(self.filename):
            df.to_parquet(self.filename, append=True, index=False)
        else:
            df.to_parquet(self.filename, index=False)
        # if self.page == 6:
        #     sys.exit('Done')
        return self.nxt_page(response)
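One likely reason for the 404, stated here as an assumption rather than a verified fact: ASP.NET event validation is tied to the state of the page that rendered the pager, so a postback for Page$30 issued from the first results page may simply be rejected. Since paging consecutively already works, one workaround is to fast-forward through the pager without saving the tables until the desired start page is reached. A rough, untested sketch of how parse_table in the spider above could be adapted:

    # Hypothetical adaptation: step the pager forward without reading the
    # tables until the desired start page is reached.
    start_page = 30  # page to begin collecting from (hypothetical value)

    def parse_table(self, response):
        if self.page < self.start_page:
            # Not there yet: advance the pager and skip the table on this page.
            self.page += 1
            return self.nxt_page(response)
        self.page += 1
        df = pd.read_html(response.body)[0]
        if exists(self.filename):
            df.to_parquet(self.filename, append=True, index=False)
        else:
            df.to_parquet(self.filename, index=False)
        return self.nxt_page(response)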

Incompatibility between SplashJsonResponse and Xpath()

My problem is the following: my spider has just successfully clicked on a button within the function parse_search_page(). In the function parse_identity I am on the next page, where I can start scraping some information. But the variable response is of type SplashJsonResponse, which is not supported by xpath(), and response.body is of type bytes, which is not supported either.
The solutions I think could work are:
- Convert SplashJsonResponse to SplashTextResponse (which is an HTML response)
- Use xpath on bytes
- Convert scrapy_splash.response.SplashJsonResponse to scrapy.http.response.html.HtmlResponse
Code:
import scrapy
from scrapy.utils.response import open_in_browser
from scrapy_splash import SplashRequest


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ['https://app.nominations.hospimedia.fr']

    def parse(self, response):
        # the callback is called after you have logged in
        return scrapy.FormRequest.from_response(
            response,
            formdata={'user[email]': 'XXX', 'user[password]': 'XXX'},
            callback=self.parse_landing_page
        )

    def parse_landing_page(self, response):
        # open webpage after logging in
        # open_in_browser(response)
        start_urls = 'https://app.nominations.hospimedia.fr'

        # we extract the title
        # title = response.xpath('//title/text()').extract()
        print("hello1")
        # check what purpose extract() serves if you leave it out
        next_page_partial_url = response.xpath('//div[@class="l-action l-action--small"]/a/@href').extract()
        # print(next_page_partial_url)
        next_page_url = start_urls + next_page_partial_url[0]
        yield scrapy.Request(next_page_url, callback=self.parse_search_page)

    def parse_search_page(self, response):
        # if you can open the page below, you know your scrapy-splash is working
        # http://localhost:8050/
        script = '''
        function main(splash, args)
            splash:go(splash.args.url)
            splash:runjs('document.getElementsByClassName("button tertiary")[0].click()')
            return {
                html = splash:html(),
            }
        end
        '''
        open_in_browser(response)
        print("----------")
        # scrapy.http.response.html.HtmlResponse
        print(type(response))
        print("------------")
        # yield SplashRequest(response.request.url, callback=self.parse_identity, endpoint='execute', args={'lua_source': script})
        yield SplashRequest(callback=self.parse_identity,
                            endpoint='execute',
                            args={'url': response.request.url,
                                  'lua_source': script}
                            )

    def parse_identity(self, response):
        print("----------------------------------------")
        # scrapy_splash.response.SplashJsonResponse
        print(type(response))
        # <class 'bytes'>
        print(type(response.body))
        print(response.body)
        print("----------------------------------------")
        next_page_partial_url = response.xpath('//div[@class="medium-6 small-12 columns"]/text()').extract()
        # next_page_partial_url = response.xpath('//a[@rel="noopener noreferrer"]/text()').extract()
        print(next_page_partial_url)
        print("----------------------------------------")
        # inspect_response(response, self)
        # open_in_browser(response)
Actually, the solution is to use HtmlResponse:
from scrapy.http import HtmlResponse
html_response = HtmlResponse(url=response.url, body=response.text, encoding='utf-8')
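For completeness, a sketch of how the conversion might be used inside parse_identity. One assumption to check: with the execute endpoint returning {html = splash:html()}, the rendered markup may live in response.data['html'] rather than response.text, so verify which one actually holds the HTML.

def parse_identity(self, response):
    from scrapy.http import HtmlResponse

    # SplashJsonResponse exposes the decoded JSON as response.data; fall back
    # to response.text if the body is already plain HTML.
    raw_html = response.data.get('html', response.text) if hasattr(response, 'data') else response.text
    html_response = HtmlResponse(url=response.url, body=raw_html, encoding='utf-8')
    next_page_partial_url = html_response.xpath(
        '//div[@class="medium-6 small-12 columns"]/text()').extract()
    print(next_page_partial_url)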

Python Scrapy - how to tick checkboxes and search before scraping specific data

I have this site: https://www.ingenieurs-ensea.fr/annuaire/recherche
I need to tick the 2023, 2022, and 2021 boxes and then submit the search button, which is an input of type submit.
On the page that follows I get a list of 700+ names and descriptions spread over 30+ pages, which I need as three separate tables in Excel (e.g. name, plus rank and year taken from the description, in separate columns).
I tried various stitched-together attempts, but they didn't work.
This was my last attempt, but I'm somewhat lost overall.
class Names(scrapy.Spider):
    name = 'enseafr'
    settings = get_project_settings()
    start_urls = ['https://www.ingenieurs-ensea.fr/annuaire/recherche']

    def parse(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'DiplomePromo[]': '2023'},
            formname='DiplomePromo1',
            callback=self.parse_2,
            method="POST",
        )

    def parse_2(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'DiplomePromo[]': '2022'},
            formname='DiplomePromo2',
            callback=self.parse_3,
            method="POST",
        )

    def parse_3(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'DiplomePromo[]': '2021'},
            formname='DiplomePromo3',
            callback=self.after_parse,
            method="POST",
        )

    def after_parse(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'': 'RECHERCHER'},
            callback=self.data,
            method="POST",
        )

    def data(self, response):
        items = IngItems()
        for item in response.xpath('//div[@class="annuaire_result_list"]'):
            items['name'] = item.xpath('//*[@id="zoneAnnuaire_layout"]/div[3]/div[2]/div[3]/div[1]/div[2]/div[1]/a/text()').get()
            items['description'] = item.xpath('//*[@id="zoneAnnuaire_layout"]/div[3]/div[2]/div[3]/div[1]/div[2]/div[2]/div[1]').get()
            yield items
It just produces a bunch of errors, so I guess it's quite broken.
Use this link instead. I got it from the Network tab:
https://www.ingenieurs-ensea.fr/annuaire/recherche?result=1&annuaire_mode=standard&annuaire_as_no=&keyword=&PersonneNom=&PersonnePrenom=&DiplomePromo%5B%5D=2023&DiplomePromo%5B%5D=2022&DiplomePromo%5B%5D=2021
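A rough sketch of a spider built around that URL; the result and pagination selectors below are assumptions about the site's markup, not taken from the question, so check them in the browser first.

import scrapy


class EnseaDirectorySpider(scrapy.Spider):
    name = 'enseafr_direct'
    # Search URL copied from the Network tab, with the three promotion years
    # already applied as query parameters.
    start_urls = [
        'https://www.ingenieurs-ensea.fr/annuaire/recherche?result=1'
        '&annuaire_mode=standard&annuaire_as_no=&keyword='
        '&PersonneNom=&PersonnePrenom='
        '&DiplomePromo%5B%5D=2023&DiplomePromo%5B%5D=2022&DiplomePromo%5B%5D=2021'
    ]

    def parse(self, response):
        # Hypothetical selectors: adjust them against the real result markup.
        for result in response.xpath('//div[@class="annuaire_result_list"]//a'):
            yield {
                'name': result.xpath('normalize-space(text())').get(),
                'link': response.urljoin(result.xpath('@href').get() or ''),
            }
        # Follow a "next page" link if the directory exposes one.
        next_page = response.xpath('//a[contains(@class, "next")]/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)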

How to extract the website URL from the redirect link with Scrapy Python

I wrote a script to get the data from a website. I have an issue with collecting the website URL, since the @href is a redirect link. How can I convert the redirect URL to the actual website it is redirecting to?
import scrapy
import logging


class AppSpider(scrapy.Spider):
    name = 'app'
    allowed_domains = ['www.houzz.in']
    start_urls = ['https://www.houzz.in/professionals/searchDirectory?topicId=26721&query=Design-Build+Firms&location=Mumbai+City+District%2C+India&distance=100&sort=4']

    def parse(self, response):
        lists = response.xpath('//li[@class="hz-pro-search-results__item"]/div/div[@class="hz-pro-search-result__info"]/div/div/div/a')
        for data in lists:
            link = data.xpath('.//@href').get()
            yield scrapy.Request(url=link, callback=self.parse_houses, meta={'Links': link})

        next_page = response.xpath('(//a[@class="hz-pagination-link hz-pagination-link--next"])[1]/@href').extract_first()
        if next_page:
            yield response.follow(response.urljoin(next_page), callback=self.parse)

    def parse_houses(self, response):
        link = response.request.meta['Links']
        firm_name = response.xpath('//div[@class="hz-profile-header__title"]/h1/text()').get()
        name = response.xpath('//div[@class="profile-meta__val"]/text()').get()
        phone = response.xpath('//div[@class="hz-profile-header__contact-info text-right mrm"]/a/span/text()').get()
        website = response.xpath('(//div[@class="hz-profile-header__contact-info text-right mrm"]/a)[2]/@href').get()

        yield {
            'Links': link,
            'Firm_name': firm_name,
            'Name': name,
            'Phone': phone,
            'Website': website
        }
You have to make a request to that target URL to see where it leads.
In your case you can simply make a HEAD request, which will not download the body of the target URL, so it saves bandwidth and speeds up your script as well.
def parse_houses(self, response):
    link = response.request.meta['Links']
    firm_name = response.xpath('//div[@class="hz-profile-header__title"]/h1/text()').get()
    name = response.xpath('//div[@class="profile-meta__val"]/text()').get()
    phone = response.xpath('//div[@class="hz-profile-header__contact-info text-right mrm"]/a/span/text()').get()
    website = response.xpath('(//div[@class="hz-profile-header__contact-info text-right mrm"]/a)[2]/@href').get()

    yield Request(
        url=website,
        method="HEAD",
        callback=self.get_final_link,
        meta={'data': {
            'Links': link,
            'Firm_name': firm_name,
            'Name': name,
            'Phone': phone,
            'Website': website
        }}
    )

def get_final_link(self, response):
    data = response.meta['data']
    data['website'] = response.headers['Location']
    yield data
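Two caveats worth adding, both general Scrapy behaviour rather than something from this answer: RedirectMiddleware follows 3xx responses by default, so the Location header may never reach your callback unless redirects are disabled for that request, and header values come back as bytes. A trimmed sketch:

def parse_houses(self, response):
    link = response.request.meta['Links']
    website = response.xpath(
        '(//div[@class="hz-profile-header__contact-info text-right mrm"]/a)[2]/@href').get()
    yield Request(
        url=website,
        method="HEAD",
        callback=self.get_final_link,
        meta={
            # keep the 3xx response instead of letting RedirectMiddleware follow it,
            # and let it reach the callback instead of being filtered as an error
            'dont_redirect': True,
            'handle_httpstatus_list': [301, 302, 303, 307, 308],
            'data': {'Links': link, 'Website': website},
        },
    )

def get_final_link(self, response):
    data = response.meta['data']
    # header values are bytes; decode before yielding the item
    location = response.headers.get('Location')
    data['Website'] = location.decode() if location else response.url
    yield data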
If your goal is to get the website, the actual website link is available in the source code of each listing as well, so you can grab it with a regex; there is no need to visit the encrypted URL.

# requires `import re` at the top of the module
def parse_houses(self, response):
    link = response.request.meta['Links']
    firm_name = response.xpath('//div[@class="hz-profile-header__title"]/h1/text()').get()
    name = response.xpath('//div[@class="profile-meta__val"]/text()').get()
    phone = response.xpath('//div[@class="hz-profile-header__contact-info text-right mrm"]/a/span/text()').get()
    website = re.findall(r"\"url\"\: \"(.*?)\"", response.text)[0]
You can do something like this:

class AppSpider(scrapy.Spider):
    base_url = 'www.houzz.in{}'

    ...

    def foo(self):
        actual_url = self.base_url.format(redirect_url)
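If the goal is just to turn a relative href into an absolute URL, response.urljoin is usually simpler than string formatting, for example (the selector here is hypothetical):

def parse(self, response):
    redirect_url = response.xpath('//a/@href').get()  # hypothetical selector
    # urljoin resolves relative hrefs against response.url
    actual_url = response.urljoin(redirect_url)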

Scrapy + Splash + ScrapyJS

I am using Splash 2.0.2 + Scrapy 1.0.5 + ScrapyJS 0.1.1 and I am still not able to render JavaScript triggered by a click. Here is an example URL: https://olx.pt/anuncio/loja-nova-com-250m2-garagem-em-box-fechada-para-arrumos-IDyTzAT.html#c49d3d94cf
I am still getting the page without the phone number rendered:
class OlxSpider(scrapy.Spider):
    name = "olx"
    rotate_user_agent = True
    allowed_domains = ["olx.pt"]
    start_urls = [
        "https://olx.pt/imoveis/"
    ]

    def parse(self, response):
        script = """
        function main(splash)
            splash:go(splash.args.url)
            splash:runjs('document.getElementById("contact_methods").getElementsByTagName("span")[1].click();')
            splash:wait(0.5)
            return splash:html()
        end
        """
        for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_house_contents, meta={
                'splash': {
                    'args': {'lua_source': script},
                    'endpoint': 'execute',
                }
            })

        for next_page in response.css('.pager .br3.brc8::attr(href)'):
            url = response.urljoin(next_page.extract())
            yield scrapy.Request(url, self.parse)

    def parse_house_contents(self, response):
        import ipdb; ipdb.set_trace()
How can I get this to work?
Add
splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")
to the Lua script and it will work:
function main(splash)
    splash:go(splash.args.url)
    splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")
    splash:runjs('document.getElementById("contact_methods").getElementsByTagName("span")[1].click();')
    splash:wait(0.5)
    return splash:html()
end
.click() is a jQuery function: https://api.jquery.com/click/
You can avoid having to use Splash in the first place and make the appropriate GET request to get the phone number yourself. Working spider:
import json
import re

import scrapy


class OlxSpider(scrapy.Spider):
    name = "olx"
    rotate_user_agent = True
    allowed_domains = ["olx.pt"]
    start_urls = [
        "https://olx.pt/imoveis/"
    ]

    def parse(self, response):
        for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_house_contents)

        for next_page in response.css('.pager .br3.brc8::attr(href)'):
            url = response.urljoin(next_page.extract())
            yield scrapy.Request(url, self.parse)

    def parse_house_contents(self, response):
        property_id = re.search(r"ID(\w+)\.", response.url).group(1)
        phone_url = "https://olx.pt/ajax/misc/contact/phone/%s/" % property_id

        yield scrapy.Request(phone_url, callback=self.parse_phone)

    def parse_phone(self, response):
        phone_number = json.loads(response.body)["value"]
        print(phone_number)
If there are more things to extract from this "dynamic" website, see if Splash is really enough and, if not, look into browser automation and selenium.
