Using Python to scrape an ASP.NET site with an ID in the URL

I'm trying to scrape the search results of this ASP.NET website by using Python requests to send a POST request. Even though I use a GET request to obtain the __RequestVerificationToken and include it in my header, I just get this reply:
{"Token":"Y2VgsmEAAwA","Link":"/search/Y2VgsmEAAwA/"}
which is not the valid link. It's the total search results, with no arrival date or area defined as in my POST request. What am I missing? How do I scrape a site like this that generates a (session?) ID for the URL?
Thank you so much in advance to all of you!
My Python script:
import json
import requests
from bs4 import BeautifulSoup
r = requests.Session()
# GET request
gr = r.get("http://www.feline.dk")
bsObj = BeautifulSoup(gr.text,"html.parser")
auth_string = bsObj.find("input", {"name": "__RequestVerificationToken"})['value']
#print(auth_string)
#print(gr.url)
# POST request
search_request = {
    "Geography.Geography": "Danmark",
    "Geography.GeographyLong=": "Danmark (Ferieområde)",
    "Geography.Id": "da509992-0830-44bd-869d-0270ba74ff62",
    "Geography.SuggestionId": "",
    "Period.Arrival": "16-1-2016",
    "Period.Duration": 7,
    "Period.ArrivalCorrection": "false",
    "Price.MinPrice": None,
    "Price.MaxPrice": None,
    "Price.MinDiscountPercentage": None,
    "Accommodation.MinPersonNumber": None,
    "Accommodation.MinBedrooms": None,
    "Accommodation.NumberOfPets": None,
    "Accommodation.MaxDistanceWater": None,
    "Accommodation.MaxDistanceShopping": None,
    "Facilities.SwimmingPool": "false",
    "Facilities.Whirlpool": "false",
    "Facilities.Sauna": "false",
    "Facilities.InternetAccess": "false",
    "Facilities.SatelliteCableTV": "false",
    "Facilities.FireplaceStove": "false",
    "Facilities.Dishwasher": "false",
    "Facilities.WashingMachine": "false",
    "Facilities.TumblerDryer": "false",
    "update": "true"
}
payload = {
    "searchRequestJson": json.dumps(search_request),
}
header = {
    "Accept": "application/json, text/html, */*; q=0.01",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "da-DK,da;q=0.8,en-US;q=0.6,en;q=0.4",
    "Connection": "keep-alive",
    "Content-Length": "720",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie": "ASP.NET_SessionId=ebkmy3bzorzm2145iwj3bxnq; __RequestVerificationToken=" + auth_string + "; aid=382a95aab250435192664e80f4d44e0f; cid=google-dk; popout=hidden; __utmt=1; __utma=1.637664197.1451565630.1451638089.1451643956.3; __utmb=1.7.10.1451643956; __utmc=1; __utmz=1.1451565630.1.1.utmgclid=CMWOra2PhsoCFQkMcwod4KALDQ|utmccn=(not%20set)|utmcmd=(not%20set)|utmctr=(not%20provided); BNI_Feline.Web.FelineHolidays=0000000000000000000000009b84f30a00000000",
    "Host": "www.feline.dk",
    "Origin": "http://www.feline.dk",
    #"Referer": "http://www.feline.dk/search/Y2WZNDPglgHHXpe2uUwFu0r-JzExMYi6yif5KNswMDBwMDAAAA/",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}
gr = r.post(
    url = 'http://www.feline.dk/search',
    data = payload,
    headers = header
)
#print(gr.url)
bsObj = BeautifulSoup(gr.text,"html.parser")
print(bsObj)

After multiple tries, I found that your search request is malformed (it needs to be URL-encoded, not JSON), and the cookie information is overwritten by your hand-built headers (just let the session do that work).
I simplified the code as follows and got the desired result:
import requests
from bs4 import BeautifulSoup

r = requests.Session()
# GET request
gr = r.get("http://www.feline.dk")
bsObj = BeautifulSoup(gr.text, "html.parser")
auth_string = bsObj.find("input", {"name": "__RequestVerificationToken"})['value']
# POST request
search_request = "Geography.Geography=Hou&Geography.GeographyLong=Hou%2C+Danmark+(Ferieomr%C3%A5de)&Geography.Id=847fcbc5-0795-4396-9318-01e638f3b0f6&Geography.SuggestionId=&Period.Arrival=&Period.Duration=7&Period.ArrivalCorrection=False&Price.MinPrice=&Price.MaxPrice=&Price.MinDiscountPercentage=&Accommodation.MinPersonNumber=&Accommodation.MinBedrooms=&Accommodation.NumberOfPets=&Accommodation.MaxDistanceWater=&Accommodation.MaxDistanceShopping=&Facilities.SwimmingPool=false&Facilities.Whirlpool=false&Facilities.Sauna=false&Facilities.InternetAccess=false&Facilities.SatelliteCableTV=false&Facilities.FireplaceStove=false&Facilities.Dishwasher=false&Facilities.WashingMachine=false&Facilities.TumblerDryer=false"
gr = r.post(
    url = 'http://www.feline.dk/search/',
    data = search_request,
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
)
print(gr.url)
Result:
http://www.feline.dk/search/Y2U5erq-ZSr7NOfJEozPLD5v-MZkw8DAwMHAAAA/
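The same form body can also be built from a plain dict and passed via data=, letting requests do the URL encoding. A minimal sketch, assuming the field names from the answer above (the empty fields are omitted here; whether the server requires them is untested):
import requests

s = requests.Session()
s.get("http://www.feline.dk")  # sets the session cookies, including the verification token

form = {
    "Geography.Geography": "Hou",
    "Geography.GeographyLong": "Hou, Danmark (Ferieområde)",
    "Geography.Id": "847fcbc5-0795-4396-9318-01e638f3b0f6",
    "Period.Duration": "7",
    "Period.ArrivalCorrection": "False",
    "Facilities.SwimmingPool": "false",
}
resp = s.post("http://www.feline.dk/search/", data=form)
print(resp.url)  # should end with the generated search id, as in the result above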

Thank you Kantium for your answer. In my case, I found that the RequestVerificationToken was actually generated by a JS script inside the page.
1 - Call the first page that generates the code. In my case it returned something like this inside the HTML:
<script>
    Sys.Net.WebRequestManager.add_invokingRequest(function (sender, networkRequestEventArgs) {
        var request = networkRequestEventArgs.get_webRequest();
        var headers = request.get_headers();
        headers['RequestVerificationToken'] = '546bd932b91b4cdba97335574a263e47';
    });
    $.ajaxSetup({
        beforeSend: function (xhr) {
            xhr.setRequestHeader("RequestVerificationToken", '546bd932b91b4cdba97335574a263e47');
        },
        complete: function (result) {
            console.log(result);
        },
    });
</script>
2 - Grab the RequestVerificationToken code and then add it to your request along with the cookie from set-cookie.
let resp_setcookie = response.headers["set-cookie"];
let rege = new RegExp(/(?:RequestVerificationToken", ')(\S*)'/);
let token = rege.exec(response.body)[1];
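If you prefer Python, the same extraction might look like this (a sketch; the URL is a placeholder and the regex simply mirrors the Node snippet above):
import re
import requests

s = requests.Session()
resp = s.get("https://example.com/page-that-embeds-the-token")  # placeholder URL
match = re.search(r"""RequestVerificationToken",\s*'(\S*)'""", resp.text)
token = match.group(1) if match else None
# the session keeps the Set-Cookie values; add the token as a header on later requests
headers = {"RequestVerificationToken": token}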
I actually store them in a global variable, and later, in my Node.js request, I add this to the request object:
headers.Cookie = gCookies.cookie;
headers.RequestVerificationToken = gCookies.token;
So the final request ends up carrying both the cookie and the RequestVerificationToken header.
Remember that you can monitor requests sent using:
require("request-debug")(requestpromise);
Good luck!

Related

getting the search page result, login with jwt authentication (python)

I am trying to get an HTML page to parse. The site itself has a login form. I am using the following code to get through the login form:
headers = {
    "Content-Type": "application/json",
    "referer": "https://somesite/"
}
payload = {
    "email": us,
    "password": ps,
    "web": "true"
}
session_requests = requests.session()
response = session_requests.post(
    site,
    data = json.dumps(payload),
    headers = headers
)
result = response
resultContent = response.content
resultCookies = response.cookies
resultContentJson = json.loads(resultContent)
resultJwtToken = resultContentJson['jwtToken']
That works just fine; I am able to get a 200 OK status and the jwtToken.
Now, when I actually try to get the page (the search result), the site returns '401 - not authorized'. So the question is: what am I doing wrong? Any suggestion/hint/idea is appreciated!
Here is the request that gets the 401 response:
siteSearch = "somesite/filters/search"
headersSearch = {
    "content-type": "application/json",
    "referer": "https://somesite",
    "origin": "https://somesite",
    "authorization": "Bearer {}".format(resultJwtToken),
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36"
}
payloadSearch = {
    "userId": 50432,
    "filters": [],
    "savedSearchIds": [],
    "size": 24
}
responseSearch = session_requests.post(
    siteSearch,
    data = json.dumps(payloadSearch),
    headers = headers
)
searchResult = response
Looking at Postman and the Chrome developer tools, it seems to me I am sending an identical request to the one the actual browser sends (it works via the browser), but no: a 401 response.
Maybe it has something to do with the cookies? The first login response returns a bunch of cookies as well, but I thought session_requests takes care of that?
In any case, any help is appreciated. Thanks!
Typo: in responseSearch I used the headers defined for the initial login; it should be headers = headersSearch. All the rest works as expected. Thanks!
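For completeness, the corrected search call from the code above, with that header fix applied:
responseSearch = session_requests.post(
    siteSearch,
    data = json.dumps(payloadSearch),
    headers = headersSearch
)
print(responseSearch.status_code)  # 200 once the Bearer-token headers are actually sent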

How to go to next page on google form using requests.post

I've looked at multiple tutorials on how to fill out a Google Form and have semi-successfully accomplished it. My problem is that the Google Form has two pages before you submit it.
I've created my form data:
form_data = {
    'entry.1019016807': 'My name',
    'draftResponse': [],
    'pageHistory': 0
}
and here is the POST request:
user_agent = {
    'Referer': 'https://docs.google.com/forms/d/e/1FAIpQLSfktx3zRs4rqaZMNBc17oFuHQOJ1ckHz1lyYaN1kzaNCq9uyQ/formResponse',
    'User-Agent': "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36"}
requests.post(url, data=form_data, headers=user_agent)
Changing pageHistory to 1 fills out data on the second page and setting it to 0 fills out data on the first page. I've tried using two requests.post calls with different pageHistory values, but that just creates two separate Google Form responses. There is also more form data, but I didn't include it. All entry IDs are correct.
Try this code. I've commented it so you can see how it works.
import requests
from bs4 import BeautifulSoup as bs4
# First, download the form and parse it with BeautifulSoup:
url = 'https://forms.gle/8nt88S9jc5zNDmqM8'
response = requests.get(url)
html = bs4(response.text, 'html.parser')
# the <form action="url_to_post" id="mG61Hd"> tag contains the POST URL
post_url = html.find('form', attrs={'id': 'mG61Hd'})
print(post_url['action'])
# Use the post method of the requests module to POST your data:
r = requests.post(post_url['action'], data = {'key':'value'})
print(r)
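As for the original two-page problem, one thing worth trying (an assumption on my part, not something confirmed above) is submitting the entries from both pages in a single POST to the formResponse URL, with pageHistory covering both pages:
import requests

form_data = {
    'entry.1019016807': 'My name',       # field on page 1
    'entry.2222222222': 'Other answer',  # hypothetical field on page 2; replace with your real entry ID
    'pageHistory': '0,1',                # assumption: lists every page visited before submit
}
post_url = 'https://docs.google.com/forms/d/e/1FAIpQLSfktx3zRs4rqaZMNBc17oFuHQOJ1ckHz1lyYaN1kzaNCq9uyQ/formResponse'
r = requests.post(post_url, data=form_data)
print(r.status_code)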

Can't scrape names from next pages using requests

I'm trying to parse names across multiple pages of a website using a Python script. With my current attempt I can get the names from its landing page. However, I can't figure out how to fetch the names from the next pages as well using requests and BeautifulSoup.
Website link: https://proximity.niceic.com/mainform.aspx?PostCode=YO95
My attempt so far:
import requests
from bs4 import BeautifulSoup
url = "https://proximity.niceic.com/mainform.aspx?PostCode=YO95"
with requests.Session() as s:
    r = s.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    for elem in soup.select("table#gvContractors tr:has([id*='_lblName'])"):
        name = elem.select_one("span[id*='_lblName']").get_text(strip=True)
        print(name)
I've tried to modify my script to get only the content from the second page, to make sure it works when a next-page button is involved, but unfortunately it still fetches data from the first page:
import requests
from bs4 import BeautifulSoup
url = "https://proximity.niceic.com/mainform.aspx?PostCode=YO95"
with requests.Session() as s:
    r = s.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
    payload['__EVENTARGUMENT'] = 'Page$Next'
    payload.pop('btnClose')
    payload.pop('btnMapClose')
    res = s.post(url, data=payload, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://proximity.niceic.com/mainform.aspx?PostCode=YO95',
    })
    sauce = BeautifulSoup(res.text, "lxml")
    for elem in sauce.select("table#gvContractors tr:has([id*='_lblName'])"):
        name = elem.select_one("span[id*='_lblName']").get_text(strip=True)
        print(name)
Navigating to the next page is performed via a POST request with a __VIEWSTATE cursor.
How you can do it with requests:
Make a GET request to the first page;
Parse the required data and the __VIEWSTATE cursor;
Prepare a POST request for the next page with the received cursor;
Run it, then parse all data and the new cursor for the next page.
I won't provide any code, because it would require writing down almost all of the crawler's code.
==== Added ====
You've almost done it, but there are two important things you missed.
It is necessary to send headers with the first GET request. If no headers are sent, we get broken tokens (easy to detect visually: they don't have == at the end).
We need to add __ASYNCPOST to the payload we send. (Interestingly, it is not a boolean True, it is the string 'true'.)
Here's the code. I removed bs4 and added lxml (I don't like bs4, it is very slow). We know exactly which data we need to send, so let's parse only a few inputs.
import re
import requests
from lxml import etree

def get_nextpage_tokens(response_body):
    """ Parse tokens from XMLHttpRequest response for making next request to next page and create payload """
    try:
        payload = dict()
        payload['ToolkitScriptManager1'] = 'UpdatePanel1|gvContractors'
        payload['__EVENTTARGET'] = 'gvContractors'
        payload['__EVENTARGUMENT'] = 'Page$Next'
        payload['__VIEWSTATEENCRYPTED'] = ''
        payload['__VIEWSTATE'] = re.search(r'__VIEWSTATE\|([^\|]+)', response_body).group(1)
        payload['__VIEWSTATEGENERATOR'] = re.search(r'__VIEWSTATEGENERATOR\|([^\|]+)', response_body).group(1)
        payload['__EVENTVALIDATION'] = re.search(r'__EVENTVALIDATION\|([^\|]+)', response_body).group(1)
        payload['__ASYNCPOST'] = 'true'
        return payload
    except:
        return None

if __name__ == '__main__':
    url = "https://proximity.niceic.com/mainform.aspx?PostCode=YO95"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://proximity.niceic.com/mainform.aspx?PostCode=YO95',
    }
    with requests.Session() as s:
        page_num = 1
        r = s.get(url, headers=headers)
        parser = etree.HTMLParser()
        tree = etree.fromstring(r.text, parser)
        # Creating payload
        payload = dict()
        payload['ToolkitScriptManager1'] = 'UpdatePanel1|gvContractors'
        payload['__EVENTTARGET'] = 'gvContractors'
        payload['__EVENTARGUMENT'] = 'Page$Next'
        payload['__VIEWSTATE'] = tree.xpath("//input[@name='__VIEWSTATE']/@value")[0]
        payload['__VIEWSTATEENCRYPTED'] = ''
        payload['__VIEWSTATEGENERATOR'] = tree.xpath("//input[@name='__VIEWSTATEGENERATOR']/@value")[0]
        payload['__EVENTVALIDATION'] = tree.xpath("//input[@name='__EVENTVALIDATION']/@value")[0]
        payload['__ASYNCPOST'] = 'true'
        headers['X-Requested-With'] = 'XMLHttpRequest'
        while True:
            page_num += 1
            res = s.post(url, data=payload, headers=headers)
            print(f'page {page_num} data: {res.text}')  # FIXME: Parse data
            payload = get_nextpage_tokens(res.text)  # Creating payload for next page
            if not payload:
                # Break if we got no tokens - maybe it was the last page (it must be checked)
                break
Important
The response is not well-formed HTML, so you have to deal with that: cut out the table or something similar. Good luck!
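As a sketch of how that parsing might look (an assumption about the pipe-delimited shape of ASP.NET async responses; adjust the pattern to the actual body you get back), this could slot in where the FIXME is in the loop above:
import re
from lxml import etree

def parse_names(async_body):
    # Pull the contractors table fragment out of the partial-postback body.
    match = re.search(r'<table[^>]*gvContractors.*?</table>', async_body, re.S)
    if not match:
        return []
    tree = etree.fromstring(match.group(0), etree.HTMLParser())
    return [span.text.strip()
            for span in tree.xpath("//span[contains(@id, '_lblName')]")
            if span.text]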

Python: 'GetContextWebInformation' to update SharePoint list item

I am trying to read/write SharePoint list items through Python.
I've written the code below, which successfully reads SharePoint details in the response:
import requests
from requests_ntlm import HttpNtlmAuth
requests.packages.urllib3.disable_warnings() # suppress all SSL warnings
url = "https://sharepoint.company.com/_api/web/lists/getbytitle('listname')/items?$top=3&$select=ID,Title,Notes" # just reading 3 columns
headers = {'accept': 'application/xml;q=0.9, */*;q=0.8'}
response = requests.get(url, headers=headers, auth=HttpNtlmAuth('domain\\username','Password'), verify=False, stream=True)
Now, when I try to update one of the items, I receive a 403 error response:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
json_data = [{ '__metadata': { 'type': 'SP.List' }, 'Notes': 'Test Note' }]
response = requests.post(url, { '__metadata': { 'type': 'SP.List' }, 'Notes': 'Test Note' }, headers=headers, auth=HttpNtlmAuth('domain\\username','Password'), verify=False)
Microsoft SharePoint says an X-RequestDigest header (the form digest value) has to be sent with the request.
After reading through articles, I found the code below to get the form digest value:
site_url = "https://sharepoint.company.com"
login_user = 'domain\\username'
auth = HttpNtlmAuth(login_user, 'PASSWORD')
sharepoint_contextinfo_url = site_url + '/_api/contextinfo'
headers = {
    'accept': 'application/json;odata=verbose',
    'content-type': 'application/json;odata=verbose',
    'odata': 'verbose',
    'X-RequestForceAuthentication': 'true'
}
r = requests.post(sharepoint_contextinfo_url, auth=auth, headers=headers, verify=False)
form_digest_value = r.json()['d']['GetContextWebInformation']['FormDigestValue']
But I do not receive the form_digest_value.
I tried to access the context info through the browser at https://sharepoint.company.com/_api/contextinfo and received the error below:
<?xml version="1.0" encoding="UTF-8"?>
<m:error xmlns:m="http://schemas.microsoft.com/ado/2007/08/dataservices/metadata">
  <m:code>-1, Microsoft.SharePoint.Client.ClientServiceException</m:code>
  <m:message xml:lang="en-US">The HTTP method 'GET' cannot be used to access the resource 'GetContextWebInformation'. The operation type of the resource is specified as 'Default'. Please use correct HTTP method to invoke the resource.</m:message>
</m:error>
Can someone please help with how to get the form digest value? Or is there any other way to update a SharePoint list item?
Thanks in advance!
Updated
After going through this article, I understand that we can get the __REQUESTDIGEST value from the page source. On refreshing the page every minute, I can see the value changes. How can I get the request digest value through Python and keep it alive for at least 5 minutes?
Posting the answer; maybe it could help someone.
The data passed for the update was not formatted properly here.
So I passed it like below:
json_data = {
    "__metadata": { "type": "SP.Data.TasksListItem" },
    "Title": "updated title from Python"
}
and passed json_data to requests like below:
r = requests.post(api_page, json.dumps(json_data), auth=auth, headers=update_headers, verify=False).text
After the above changes, the code updated the Title on SharePoint.
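Pulling the pieces together, a hedged end-to-end sketch (the list name, item ID, and field are placeholders; the MERGE/IF-MATCH headers are the usual SharePoint REST convention for updates, not something shown above): POST to /_api/contextinfo for a fresh digest, then send it as X-RequestDigest on the update. The contextinfo response also carries FormDigestTimeoutSeconds, so rather than keeping one digest alive for 5 minutes you can simply request a new one before the old one expires.
import json
import requests
from requests_ntlm import HttpNtlmAuth

site_url = "https://sharepoint.company.com"
auth = HttpNtlmAuth('domain\\username', 'PASSWORD')

# 1. Fresh form digest (must be a POST, as the error above explains)
context = requests.post(
    site_url + '/_api/contextinfo',
    auth=auth,
    headers={'accept': 'application/json;odata=verbose'},
    verify=False
).json()['d']['GetContextWebInformation']
digest = context['FormDigestValue']  # context['FormDigestTimeoutSeconds'] gives its lifetime

# 2. Update the item, sending the digest
update_headers = {
    'accept': 'application/json;odata=verbose',
    'content-type': 'application/json;odata=verbose',
    'X-RequestDigest': digest,
    'IF-MATCH': '*',           # overwrite regardless of item version
    'X-HTTP-Method': 'MERGE',  # update the existing item instead of creating one
}
item_url = site_url + "/_api/web/lists/getbytitle('Tasks')/items(1)"  # placeholder list/item
json_data = {'__metadata': {'type': 'SP.Data.TasksListItem'}, 'Title': 'updated title from Python'}
r = requests.post(item_url, json.dumps(json_data), auth=auth, headers=update_headers, verify=False)
print(r.status_code)  # a successful MERGE is expected to return 204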

POST request always returns "Disallowed Key Characters"

I want to retrieve atmospheric particulate matter values from a table (sadly the site is not in English, so feel free to ask about anything). I failed with the combination of BeautifulSoup and a GET request sent with requests, since the table is filled dynamically with Bootstrap and a parser like BeautifulSoup can't find values that still have to be inserted.
With Firebug I checked every angle of the page, and I found out that by selecting a different day in the table, a POST request is sent (the site, as you can see in the Referer, is http://www.arpat.toscana.it/temi-ambientali/aria/qualita-aria/bollettini/index/regionale/, where the table is):
POST /temi-ambientali/aria/qualita-aria/bollettini/aj_dati_bollettini HTTP/1.1
Host: www.arpat.toscana.it
User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0
Accept: */*
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate
Content-Type: application/x-www-form-urlencoded; charset=UTF-8
X-Requested-With: XMLHttpRequest
Referer: http://www.arpat.toscana.it/temi-ambientali/aria/qualita-aria/bollettini/index/regionale/26-12-2016
Content-Length: 114
Cookie: [...]
DNT: 1
Connection: keep-alive
With the following params:
v_data_osservazione=26-12-2016&v_tipo_bollettino=regionale&v_zona=&csrf_test_name=b88d2517c59809a529b6f8141256e6ca
The data in the response is in JSON format.
So I started to craft my own POST request, in order to directly get the JSON data which will fill the table.
The params require, in addition to the date, a csrf_test_name: here I discovered this site is protected against CSRF, so to perform a correct query I need a CSRF token. That's why I perform a GET request to the site (see the Referer in the POST request for the URL) and get the CSRF token from the cookie like this:
r = get(url)
csrf_token = r.cookies["csrf_cookie_name"]
At the end of the day, with my CSRF token and POST request ready, I send it... and with status code 200, I always get Disallowed Key Characters.
Looking up this error, I only find posts about CodeIgniter, which (I think) is not what I need. I tried every combination of headers and parameters, yet nothing changed. Before giving up on BeautifulSoup and requests and starting to learn Selenium, I'd like to figure out what the problem is: Selenium is too high level, while low-level libraries like BeautifulSoup and requests let me learn a lot of useful things, so I'd prefer to continue learning with these two.
Here's the code:
from requests import get, post
from bs4 import BeautifulSoup
import datetime
import json
url = "http://www.arpat.toscana.it/temi-ambientali/aria/qualita-aria/bollettini/index/regionale/" # + %d-%m-%Y
yesterday = datetime.date.today() - datetime.timedelta(1)
date_object = datetime.datetime.strptime(str(yesterday), '%Y-%m-%d')
yesterday_string = str(date_object.strftime('%d-%m-%Y'))
full_url = url + yesterday_string
print("REFERER " + full_url)
r = get(url)
csrf_token = r.cookies["csrf_cookie_name"]
print(csrf_token)
# preparing headers for POST request
headers = {
    "Host": "www.arpat.toscana.it",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "X-Requested-With": "XMLHttpRequest",  # XHR
    "Referer": full_url,
    "DNT": "1",
    "Connection": "keep-alive"
}
# preparing POST parameters (to be inserted in request's body)
payload_string = "v_data_osservazione=" + yesterday_string + "&v_tipo_bollettino=regionale&v_zona=&csrf_test_name=" + csrf_token
print(payload_string)
# data -- (optional) Dictionary, bytes, or file-like object to send in the body of the Request.
# json -- (optional) json data to send in the body of the Request.
req = post("http://www.arpat.toscana.it/temi-ambientali/aria/qualita-aria/bollettini/aj_dati_bollettini",
           headers=headers, json=payload_string
           )
print("URL " + req.url)
print("RESPONSE:")
print('\t'+str(req.status_code))
print("\tContent-Encoding: " + req.headers["Content-Encoding"])
print("\tContent-type: " + req.headers["Content-type"])
print("\tContent-Length: " + req.headers["Content-Length"])
print('\t'+req.text)
This code works for me:
I use requests.Session() and it keeps all cookies.
I use data= instead of json=.
Finally, I don't need any of the commented-out elements.
To compare browser requests and code requests I used the Charles web debugging proxy application.
code:
import requests
import datetime
#proxies = {
# 'http': 'http://localhost:8888',
# 'https': 'http://localhost:8888',
#}
s = requests.Session()
#s.proxies = proxies # for test only
date = datetime.datetime.today() - datetime.timedelta(days=1)
date = date.strftime('%d-%m-%Y')
# --- main page ---
url = "http://www.arpat.toscana.it/temi-ambientali/aria/qualita-aria/bollettini/index/regionale/"
print("REFERER:", url+date)
r = s.get(url)
# --- data ---
csrf_token = s.cookies["csrf_cookie_name"]
#headers = {
#'User-Agent': 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0',
#"Host": "www.arpat.toscana.it",
#"Accept" : "*/*",
#"Accept-Language" : "en-US,en;q=0.5",
#"Accept-Encoding" : "gzip, deflate",
#"Content-Type" : "application/x-www-form-urlencoded; charset=UTF-8",
#"X-Requested-With" : "XMLHttpRequest", # XHR
#"Referer" : url,
#"DNT" : "1",
#"Connection" : "keep-alive"
#}
payload = {
    'csrf_test_name': csrf_token,
    'v_data_osservazione': date,
    'v_tipo_bollettino': 'regionale',
    'v_zona': None,
}
url = "http://www.arpat.toscana.it/temi-ambientali/aria/qualita-aria/bollettini/aj_dati_bollettini"
r = s.post(url, data=payload) #, headers=headers)
print('Status:', r.status_code)
print(r.json())
