Python requests POST gives different response content than browser

I'm writing a Python script to scrape a table from this site (this is public information about ocean tide levels).
One of the stations I'd like to scrape is Punta del Este, code 83.0, on any given day. But my script returns a different table than the browser, even though the POST request seems to send the same input.
When I fill in the form in my browser, the headers and data sent to the server are as captured in the developer tools (screenshot not reproduced here).
So I wrote my script to make the POST request as follows:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

url = 'https://www.ambiente.gub.uy/SIH-JSF/paginas/sdh/consultaHDMCApublic.xhtml'

s = requests.Session()
r = s.get(url, verify=False)
soupGet = BeautifulSoup(r.content, 'lxml')
# JSESSIONID = s.cookies['JSESSIONID']
# The JSF ViewState token must be echoed back in the POST
javax_faces_ViewState = soupGet.find("input", {"type": "hidden", "name": "javax.faces.ViewState"})['value']

headersSih = {
    'Accept': 'application/xml, text/xml, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'es-ES,es;q=0.6',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    # 'Cookie': 'JSESSIONID=FBE5ZdMQVFrgQ-P6K_yTc1bw.dinaguasihappproduccion',
    'Faces-Request': 'partial/ajax',
    'Origin': 'https://www.ambiente.gub.uy',
    'Referer': url,
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-GPC': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

fecha0 = datetime(2023, 1, 1)  # example date; any day of interest works
ini_date = datetime.strftime(fecha0, '%d/%m/%Y %H:%M')
end_date = datetime.strftime(fecha0 + timedelta(days=1), '%d/%m/%Y %H:%M')
codigo = 830  # station 83.0, Punta del Este

dataSih = {
    'javax.faces.partial.ajax': 'true',
    'javax.faces.source': 'formConsultaHorario:j_idt64',
    'javax.faces.partial.execute': '#all',
    'javax.faces.partial.render': 'formConsultaHorario:pnlhorarioConsulta',
    'formConsultaHorario:j_idt64': 'formConsultaHorario:j_idt64',
    'formConsultaHorario': 'formConsultaHorario',
    'formConsultaHorario:estacion_focus': '',
    'formConsultaHorario:estacion_input': codigo,
    'formConsultaHorario:fechaDesde_input': ini_date,
    'formConsultaHorario:fechaHasta_input': end_date,
    'formConsultaHorario:variables_focus': '',
    'formConsultaHorario:variables_input': '26',  # Variable: H,Nivel
    'formConsultaHorario:fcal_focus': '',
    'formConsultaHorario:fcal_input': '7',  # Tipo calculo: Ingresado
    'formConsultaHorario:ptiempo_focus': '',
    'formConsultaHorario:ptiempo_input': '2',  # Paso de tiempo: Escala horaria
    'javax.faces.ViewState': javax_faces_ViewState,
}

page = s.post(url, headers=headersSih, data=dataSih)
However, when I submit the form in the browser I get a table full of data, while the Python request returns (in page.text) a table saying "No data was found".
Is there something I'm missing? I've tried changing a lot of things, but nothing seems to do the trick.

Maybe the data on this website is loaded by JavaScript, which requests does not execute. If you want to get data from there, use Selenium.
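For example, a minimal Selenium sketch (only the page-load pattern is the point here; the waits and the follow-up form interaction would still need to be adapted to the page, and the form id comes from the payload above):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Render the page in a real browser so its JavaScript runs
# (Selenium 4.6+ can download a matching chromedriver automatically).
driver = webdriver.Chrome()
try:
    driver.get('https://www.ambiente.gub.uy/SIH-JSF/paginas/sdh/consultaHDMCApublic.xhtml')
    # Wait until the consultation form is present before interacting with it.
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.ID, 'formConsultaHorario'))
    )
    # Fill in the form and click the search button here, then read the table:
    html = driver.page_source  # feed this to BeautifulSoup or pandas.read_html
finally:
    driver.quit()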

Related

Python "read_html" is outputting NaN for data values in a table from Stathead sports data website

When I try to use "read_html" to get the values from a table on Stathead.com, the first 10 rows are output as NaN, and the rest of the data is output normally. I've tried a bunch of different things but can't get it to work. (I do have a paid Stathead subscription, so maybe it has something to do with that?)
import pandas as pd
import requests
url = 'https://stathead.com/basketball/player-game-finder.cgi?request=1&player_game_min=1&team_game_min=1&comp_type=reg&order_by=pts&match=player_game&season_start=1&player_game_max=9999&year_max=2023&team_game_max=84&season_end=-1&order_by_asc=0&positions%5B%5D=G&positions%5B%5D=GF&comp_id=NBA&year_min=2023&cstat%5B1%5D=mp&ccomp%5B1%5D=gt&cval%5B1%5D=1&offset=0'
page = requests.get(url)
dfs = pd.read_html(page.text)
print(dfs)
[Screenshots: output data table, parts 1 and 2, with NaN in the first rows]
I was expecting actual values in all cells, because the real table has values if you go to the Stathead URL.
EDIT
Okay, I tried the URL in a different browser where I wasn't logged into Stathead, and here's what I see: [screenshot of the logged-out table]
So the issue is almost definitely not being logged in. Is there a way to show that I'm logged in when using read_html, or something else I could do?
EDIT 2
Thank you for the suggestions, @Driftr95! I tried both for the last two hours and couldn't figure it out... Here was my first try using the requests module. (I put "MyUsername" and "MyPassword" in the code below, but in my real code I used my actual username and password.)
url_to_open = 'https://stathead.com/basketball/player-game-finder.cgi?request=1&player_game_min=1&team_game_min=1&comp_type=reg&order_by=pts&match=player_game&season_start=1&player_game_max=9999&year_max=2023&team_game_max=84&season_end=-1&order_by_asc=0&positions[]=G&positions[]=GF&comp_id=NBA&year_min=2023&cstat[1]=mp&ccomp[1]=gt&cval[1]=1&offset=0'

# Fill in your details here to be posted to the login form.
payload = {
    'username': 'MyUsername',
    'password': 'MyPassword'
}

# Use 'with' to ensure the session context is closed after use.
with requests.Session() as s:
    p = s.post('https://stathead.com/users/login.cgi', data=payload)
    page = requests.get(url_to_open)
    dfs = pd.read_html(page.text)
    print(dfs)
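A note on this attempt (an observation, assuming the login POST itself succeeds): page = requests.get(url_to_open) is issued with the bare requests module rather than the session, so the cookies set by the login are never sent with the page request. A minimal sketch keeping both calls on one session:

import pandas as pd
import requests

with requests.Session() as s:
    # Log in once; the session stores any cookies the server sets.
    s.post('https://stathead.com/users/login.cgi',
           data={'username': 'MyUsername', 'password': 'MyPassword'})
    # Reuse the SAME session so the login cookies accompany this request.
    page = s.get(url_to_open)
    dfs = pd.read_html(page.text)
    print(dfs)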
And here was my second try using convert curl:
import requests

cookies = {
    'srcssfull': 'yes',
    '_gid': 'GA1.2.550164619.1676068467',
    '_gcl_au': '1.1.935969025.1676068468',
    'ln_or': 'eyIzNTM4NTk2IjoiZCJ9',
    'hubspotutk': 'e35045f7f0a6c7771ecfd1aa0e8d4276',
    '_fbp': 'fb.1.1676068478646.152421943',
    '__hssrc': '1',
    '__hstc': '205977932.e35045f7f0a6c7771ecfd1aa0e8d4276.1676068478416.1676086198834.1676088324184.4',
    '_gat_gtag_UA_1890630_24': '1',
    '_gat_gtag_UA_1890630_9': '1',
    'csrf_token': 'cff1c9cb5b8fdc60fad64d9fae3494fd',
    '_ga': 'GA1.2.1796201960.1676068467',
    '__hssc': '205977932.15.1676088324184',
    '_ga_2M1H4N076C': 'GS1.1.1676088445.4.1.1676091530.0.0.0',
}

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    # 'Cookie': 'srcssfull=yes; _gid=GA1.2.550164619.1676068467; _gcl_au=1.1.935969025.1676068468; ln_or=eyIzNTM4NTk2IjoiZCJ9; hubspotutk=e35045f7f0a6c7771ecfd1aa0e8d4276; _fbp=fb.1.1676068478646.152421943; __hssrc=1; __hstc=205977932.e35045f7f0a6c7771ecfd1aa0e8d4276.1676068478416.1676086198834.1676088324184.4; _gat_gtag_UA_1890630_24=1; _gat_gtag_UA_1890630_9=1; csrf_token=cff1c9cb5b8fdc60fad64d9fae3494fd; _ga=GA1.2.1796201960.1676068467; __hssc=205977932.15.1676088324184; _ga_2M1H4N076C=GS1.1.1676088445.4.1.1676091530.0.0.0',
    'Origin': 'https://stathead.com',
    'Referer': 'https://stathead.com/users/login.cgi',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

data = {
    'username': 'MyUsername',
    'password': 'MyPassword',
    'remember': '1',
    'referrer': 'https%3A%2F%2Fstathead.com%2Fprofile%2F',
    'token': '0',
    'csrf_token': '612ccbc4f75b46114af2f23a618c9668',
}

response = requests.post('https://stathead.com/users/login.cgi', cookies=cookies, headers=headers, data=data)
page = requests.get(url_to_open)
dfs = pd.read_html(page.text)
print(dfs)
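The same observation applies to this second attempt: both the login POST and the follow-up GET use the bare requests module, so the authenticated cookies are dropped between the two requests; a shared requests.Session, as in the sketch above, keeps them together. The hard-coded csrf_token and analytics cookies copied from the curl conversion will also expire, so scraping a fresh token from the login page on each run is more robust.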

python requests not returning json data

I would like to get the JSON data from, for instance, https://app.weathercloud.net/d0838117883#current using the Python requests module.
I tried:
import re
import requests
device = '0838117883'
URL = 'https://app.weathercloud.net'
URL1 = URL + '/d' + device
URL2 = URL + '/device/stats'

headers = {'Content-Type': 'text/plain; charset=UTF-8',
           'Referer': URL1,
           'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/48.0.2564.82 Chrome/48.0.2564.82 Safari/537.36',
           'Accept': 'application/json, text/javascript,*/*'}

with requests.Session() as s:
    # get html from URL1 in order to get the CSRF token
    page = s.get(URL1)
    CSRF = re.findall('WEATHERCLOUD_CSRF_TOKEN:"(.*)"},', page.text)[0]
    # create parameters for URL2, in order to get the json file
    params = {'code': device, 'WEATHERCLOUD_CSRF_TOKEN': CSRF}
    page_stats = requests.get(URL2, params=params, headers=headers)
    print(page_stats.url)
    print(page_stats)         # <Response [200]>
    print(page_stats.text)    # empty
    print(page_stats.json())  # error
But page_stats is empty.
How can I get the stats data from Weathercloud?
Inspecting the page with DevTools, you'll find a useful endpoint:
https://app.weathercloud.net/device/stats
You can "replicate" the original web request made by your browser with requests library:
import json
import requests

cookies = {
    'PHPSESSID': '************************',
    'WEATHERCLOUD_CSRF_TOKEN': '***********************',
    '_ga': '**********',
    '_gid': '**********',
    '__gads': 'ID=**********',
    'WeathercloudCookieAgreed': 'true',
    '_gat': '1',
    'WEATHERCLOUD_RECENT_ED3C8': '*****************',
}

headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '^\\^Google',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua-mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'sec-ch-ua-platform': '^\\^Windows^\\^',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://app.weathercloud.net/d0838117883',
    'Accept-Language': 'it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7,es;q=0.6',
}

params = (
    ('code', '0838117883'),
    ('WEATHERCLOUD_CSRF_TOKEN', '****************'),
)

response = requests.get('https://app.weathercloud.net/device/stats', headers=headers, params=params, cookies=cookies)

# Deserialize the JSON body
json_object = json.loads(response.text)
JSON output:
{'last_update': 1632842172,
'bar_current': [1632842172, 1006.2],
'bar_day_max': [1632794772, 1013.4],
'bar_day_min': [1632845772, 1006.2],
'bar_month_max': [1632220572, 1028],
'bar_month_min': [1632715572, 997.3],
'bar_year_max': [1614418512, 1038.1],
'bar_year_min': [1615434432, 988.1],
'wdir_current': [1632842172, 180],
..............}
That's it.
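As a quick usage note (a sketch based only on the keys shown above): each value pairs a Unix timestamp with a reading, so the entries can be unpacked directly:

from datetime import datetime

# 'bar_current' is [timestamp, pressure]; the other keys have the same shape.
ts, pressure = json_object['bar_current']
print(datetime.fromtimestamp(ts), pressure)  # e.g. 2021-09-28 ... 1006.2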

How do I Webscrape a website that uses iframes?

I am trying to scrape the website https://swimming.org.nz/results.html. In the form that comes up, I am filling in only the Age column, as 8 to 8. I am using the following code to scrape the table, as suggested elsewhere on Stack Overflow, but I am unable to get the table. How do I get all the tables for the age group 8 to 8?
import requests
from bs4 import BeautifulSoup

s = requests.Session()
r = s.get("https://swimming.org.nz/results.html")
soup = BeautifulSoup(r.content, "html.parser")
iframe_src = soup.select_one("x-MS_FIELD_AGE.FROM.L").attrs["src"]
r = s.get(f"https:{iframe_src}")
soup = BeautifulSoup(r.content, "html.parser")
for row in soup.select("x-form-text x-form-field"):
    print("\t".join([e.text for e in row.select("th, td")]))
If you look at the developer tools in your browser, you will see that BeautifulSoup isn't necessary. The request to send is below, and the response type is XML, so you don't need any scraping tool. You can get all of the data by changing StartRowIndex and MaximumRowCount.
import requests
url = "https://connect.swimming.org.nz/snz-wrap-public/pages/pageRequestHandler?tunnelTarget=tableData%2F%3F&data_file=MS.COMP.RESULTS&dict_file=MS.COMP.RESULTS&doGet=true"
payload="StartRowIndex=0&MaximumRowCount=100&sort=BY-DSND%20COMP.DATE%20BY-DSND%20STAGE&dir=ASC&tid=extTable1620108707767_4352538&selectCriteria=GET-LIST%20CMS_TABLE_19483_65507_076_184811_&extraColumns=%3CColumns%20DynamicLinkRoot%3D%22https%3A%2F%2Fconnect.swimming.org.nz%3A443%2Fsnz-wrap-public%2Fworkflows%2F%22%3E%3CColumn%3E%3CColumnName%3EExpander%3C%2FColumnName%3E%3CField%3EFRAGMENT_DISPLAY.SPLITS%3C%2FField%3E%3CShowInExpander%3Etrue%3C%2FShowInExpander%3E%3C%2FColumn%3E%3CColumn%3E%3CFieldExpression%3E%7BMEMBER.FORE1%7D%20%7BMEMBER.SURNAME%7D%3C%2FFieldExpression%3E%3CField%3EEXPRESSION_FIELD_1%3C%2FField%3E%3CColumnName%3EName%2520%3C%2FColumnName%3E%3CWidth%3E130%3C%2FWidth%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3EXGENDER%3C%2FField%3E%3CColumnName%3EGender%3C%2FColumnName%3E%3CWidth%3E50%3C%2FWidth%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3EENTRANT.AGE%3C%2FField%3E%3CColumnName%3EAge%3C%2FColumnName%3E%3CWidth%3E35%3C%2FWidth%3E%3C%2FColumn%3E%3CColumn%3E%3CFieldExpression%3E%7BXCATEGORY2%7D%3C%2FFieldExpression%3E%3CField%3ECATEGORY2.NUM%24%24SNZ%3C%2FField%3E%3CColumnName%3EDistance%3C%2FColumnName%3E%3CWidth%3E70%3C%2FWidth%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3EXCATEGORY1%3C%2FField%3E%3CColumnName%3EStroke%3C%2FColumnName%3E%3CWidth%3E70%3C%2FWidth%3E%3C%2FColumn%3E%3CColumn%3E%3CFieldExpression%3E%7BTIME%24%24SNZ%7D%3C%2FFieldExpression%3E%3CField%3ERESULT.TIME.MILLISECONDS%3C%2FField%3E%3CColumnName%3ETime%2520%3C%2FColumnName%3E%3CWidth%3E70%3C%2FWidth%3E%3CAlign%3Eright%3C%2FAlign%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3EFINA.POINTS%24%24SNZ%3C%2FField%3E%3CColumnName%3EFINA%2520Points%3C%2FColumnName%3E%3CWidth%3E85%3C%2FWidth%3E%3CAlign%3Eright%3C%2FAlign%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3EFINA.YEAR%24%24SNZ%3C%2FField%3E%3CColumnName%3EPoints%2520Year%3C%2FColumnName%3E%3CWidth%3E80%3C%2FWidth%3E%3CAlign%3Eright%3C%2FAlign%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3E%24DATE%24COMP.DATE%3C%2FField%3E%3CColumnName%3EDate%3C%2FColumnName%3E%3CWidth%3E70%3C%2FWidth%3E%3CAlign%3Eright%3C%2FAlign%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3EXEVENT.CODE%3C%2FField%3E%3CColumnName%3EMeet%3C%2FColumnName%3E%3CWidth%3E190%3C%2FWidth%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3EPARAMETER1%3C%2FField%3E%3CColumnName%3ECourse%3C%2FColumnName%3E%3CWidth%3E50%3C%2FWidth%3E%3C%2FColumn%3E%3C%2FColumns%3E&extraColumnsDownload=%3CDownloadColumns%20DynamicLinkRoot%3D%22https%3A%2F%2Fconnect.swimming.org.nz%3A443%2Fsnz-wrap-public%2Fworkflows%2F%22%3E%3CColumn%3E%3CField%3EXGENDER%3C%2FField%3E%3CColumnName%3EGender%3C%2FColumnName%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3EENTRANT.AGE%3C%2FField%3E%3CColumnName%3EAge%3C%2FColumnName%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3EXCATEGORY2%3C%2FField%3E%3CColumnName%3EDistance%3C%2FColumnName%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3EXCATEGORY1%3C%2FField%3E%3CColumnName%3EStroke%3C%2FColumnName%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3ETIME%24%24SNZ%3C%2FField%3E%3CColumnName%3ETime%2520%3C%2FColumnName%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3EFINA.POINTS%24%24SNZ%3C%2FField%3E%3CColumnName%3EFINA%2520Points%3C%2FColumnName%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3EFINA.YEAR%24%24SNZ%3C%2FField%3E%3CColumnName%3EPoints%2520Year%3C%2FColumnName%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3E%24DATE%24COMP.DATE%3C%2FField%3E%3CColumnName%3EDate%3C%2FColumnName%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3EXEVENT.CODE%3C%2FField%3E%3CColumnName%3EMeet%3C%2FColumnName%3E%3C%2FColumn%3E%3CColumn%3E%3CField%3EPARAMETER1%3C%2FField%3E%3CColumnName%3ECourse%3C%2FColumnName%3E%3C%2FC
olumn%3E%3C%2FDownloadColumns%3E"
headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
    'accept': '*/*',
    'x-requested-with': 'XMLHttpRequest',
    'accept-language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7,ru;q=0.6',
    'sec-ch-ua-mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Origin': 'https://connect.swimming.org.nz',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://connect.swimming.org.nz/snz-wrap-public/workflows/COMP.RESULTS.FIND',
    'Cookie': 'JSESSIONID=93F2FEA63BA41ECB2505E2D1CD76374D; _ga=GA1.3.1735786808.1620106921; _gid=GA1.3.1806138988.1620106921'
}

response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)
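To turn that XML response into rows, a minimal parsing sketch (the element names are not shown in this answer, so this just walks whatever tree comes back; inspect the output to learn the structure):

import xml.etree.ElementTree as ET

# Parse the XML body and print every element that carries text,
# which is enough to discover the row/column structure by eye.
root = ET.fromstring(response.text)
for elem in root.iter():
    if elem.text and elem.text.strip():
        print(elem.tag, elem.text.strip())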

How to maintain connection after login with requests to navigate web page

I am looking to navigate a web page after login via requests. My code successfully logs into the web page, but I am unable to access the page following login. Here is the procedure I am looking to execute:
Login to the website
Click the "Search" tab
Enter text into the search bar that appears following the click
As I said, I have successfully finished step 1 but am having a hard time proceeding with steps 2 and 3. Below is my code:
import requests

class ptab:
    def __init__(self):
        self.data = '{"userName":"username","password":"pword"}'
        self.params = (
            ('cacheFix', '1615866146331'),
        )
        self.cookies = {
            '_gid': 'GA1.2.505039707.1615866090',
            '_gat_SiteSpecificT': '1',
            '_gat_RollupT': '1',
            '_gat_GSA_ENOR0': '1',
            '_ga': 'GA1.2.767718179.1615866090',
            '_gat_UA-21265023-12': '1',
            '_4c_': 'lZJNj5swEIb%2FysrnOMHYGJNb%2BqEqUluttNuVekLGOMEKwZbxQrOr%2FPeOE0j22HJh5hnP6xl439HY6A6tCSeZ4JwkCSV0gQ761KP1O%2FKmjq8BrRGFQkbzAhPNCGYqY1iKiuGMc7qrqBQqFWiB%2FkStlHKWJkXCk%2BS8QIOZNThNeUYkxTTTArOKU1wQpXAqE7qTRUVVwWaNOE8uioIzskAhtMBEEh9QVG5SfEfK1hqUSbEkZEmgObxBilOWQKy7eGsf9hB%2F25S%2Ftl8gzXmeE0Fy6LguDYNC%2FdXDFagJwfXr1Wocx%2BVr74Jd7u2wcjLoLvQrF2S1Ct7Itp8YvmRYdjWWzmkIKyt9jXVXL5twbEHYeVu%2FqlCGk4uTjrp66OsDFGo9GKXL0dShiSukIrnTRpt9EwDD0pE6D3EK0Wi62o4fui71id66OM2BPl7nhvw3ZM9e1voo%2FSGCnwC2j%2BV3OZaPtjXqVG67nZ0KLzC%2B9R%2FJZpCmLb%2B2WgVvO6PKT%2BatfDrdlLrQlhsVzGCC0TN91n0wR9udyientWpuhcrbsddxo8%2BNt0f9IAqgNpZ%2FSAWh1zvt%2FeUEZL0J8cvd%2FseEwKQzxVfqLkaFoLVKtrEHvH2%2BGarIUgaGymZDCc5mR7l2chS5208kDCyc0dl%2B7H56mE7Tf9G%2BbguW%2BJ%2B28%2Fkv',
            '_ga_CD30TTEK1F': 'GS1.1.1615866089.1.1.1615866100.0',
            '_gat': '1',
        }
        self.headers = {
            'Connection': 'keep-alive',
            'Accept': 'application/json, text/plain, */*',
            'ADRUM': 'isAjax:true',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
            'Content-Type': 'application/json;charset=UTF-8',
            'Origin': 'https://ptab.uspto.gov',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Dest': 'empty',
            'Referer': 'https://ptab.uspto.gov/',
            'Accept-Language': 'en-US,en;q=0.9',
        }

    def login(self, url):
        with requests.Session() as r:
            response = r.post(url, headers=self.headers, params=self.params, cookies=self.cookies, data=self.data).url
            print(response)
            # After this line the code should perform steps 2 and 3 described above

ptab = ptab()
ptab.login('https://ptab.uspto.gov/ptabe2e/rest/login')
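For steps 2 and 3, the key points are that the same Session must issue every follow-up request, and that in a single-page app like this, "clicking the Search tab" usually corresponds to another XHR rather than a new HTML page. A sketch of a method that could be added to the class; the search endpoint and the 'q' parameter below are hypothetical placeholders, to be replaced with whatever the browser's Network tab shows when you click Search:

    def search(self, query):
        # Hypothetical endpoint and parameter: capture the real XHR from the
        # browser's Network tab after clicking the "Search" tab.
        search_url = 'https://ptab.uspto.gov/ptabe2e/rest/search'  # placeholder
        with requests.Session() as s:
            # Log in on the session so its cookies persist for the next call.
            s.post('https://ptab.uspto.gov/ptabe2e/rest/login', headers=self.headers,
                   params=self.params, cookies=self.cookies, data=self.data)
            # Reuse the SAME session for the follow-up request (steps 2 and 3).
            resp = s.get(search_url, params={'q': query})  # 'q' is a placeholder
            return resp.json()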

Python POST to check a box on a webpage

I am trying to scrape a web page that posts prices for the Mexico power market. The page has checkboxes that need to be checked for the file with prices to show up. Once I get the relevant box checked, I want to pull the links on the page and check whether the particular file I am looking for is posted. I am getting stuck in the first part, where I select the checkbox using requests.post. I used Fiddler to track the changes when I post, and passed those arguments in through requests.post.
I was expecting to be able to parse out all the 'href' links in the response but I didn't get any. Any help in redirecting me toward a solution would be greatly appreciated.
Below is the relevant portion of the code I am using:
data = {
    "ctl00$ContentPlaceHolder1$toolkit": "ctl00$ContentPlaceHolder1$UpdatePanel1|ctl00$ContentPlaceHolder1$treePrincipal",
    "__EVENTTARGET": "ctl00$ContentPlaceHolder1$treePrincipal",
    "__EVENTARGUMENT": {"commandName": "Check", "index": "0:0:0:0"},
    "__VIEWSTATE": "/verylongstringhere",
    "__VIEWSTATEGENERATOR": "6B88769A",
    "__EVENTVALIDATION": "/wEdAAPhpIpHlL5kdIfX6MRCtKcRwfFVx5pEsE3np13JV2opXVEvSNmVO1vU+umjph0Dtwe41EcPKcg0qvxOp6m6pWTIV4q0ZOXSBrDwJTrxjo3dZg==",
    "ctl00_ContentPlaceHolder1_treePrincipal_ClientState": {"expandedNodes": [], "collapsedNodes": [], "logEntries": [], "selectedNodes": [], "checkedNodes": ["0", "0:0", "0:0:0", "0:0:0:0"], "scrollPosition": 0},
    "ctl00_ContentPlaceHolder1_ListViewNodos_ClientState": "",
    "ctl00_ContentPlaceHolder1_NotifAvisos_ClientState": "",
    "ctl00$ContentPlaceHolder1$NotifAvisos$hiddenState": "",
    "ctl00_ContentPlaceHolder1_NotifAvisos_XmlPanel_ClientState": "",
    "ctl00_ContentPlaceHolder1_NotifAvisos_TitleMenu_ClientState": "",
    "__ASYNCPOST": "true"
}
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Length': '26255',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': '_ga=GA1.3.1966843891.1571403663; _gid=GA1.3.1095695800.1571665852',
    'Host': 'www.cenace.gob.mx',
    'Origin': 'https://www.cenace.gob.mx',
    'Referer': 'https://www.cenace.gob.mx/SIM/VISTA/REPORTES/PreEnergiaSisMEM.aspx',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
    'X-MicrosoftAjax': 'Delta=true',
    'X-Requested-With': 'XMLHttpRequest'
}

url = "https://www.cenace.gob.mx/SIM/VISTA/REPORTES/PreEnergiaSisMEM.aspx"
r = requests.post(url, data=data, headers=headers, verify=False)
This is what Fiddler showed on the POST: [screenshot of the captured request]
Maybe you have incorrect __EVENTVALIDATION or __VIEWSTATE fields. You can fetch the initial page and scrape all the inputs with their initial values.
The following code grabs the inputs from the first request, edits them as you did, then sends the POST request and scrapes all the href values:
import requests
import json
from bs4 import BeautifulSoup

base_url = "https://www.cenace.gob.mx"
url = "{}/SIM/VISTA/REPORTES/PreEnergiaSisMEM.aspx".format(base_url)

r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

# Collect every named <input> with its current value
payload = dict([
    (t['name'], t.get('value', ''))
    for t in soup.select("input")
    if t.has_attr('name')
])

payload['ctl00$ContentPlaceHolder1$toolkit'] = 'ctl00$ContentPlaceHolder1$UpdatePanel1|ctl00$ContentPlaceHolder1$treePrincipal'
payload['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$treePrincipal'
payload['__ASYNCPOST'] = 'true'
payload['__EVENTARGUMENT'] = json.dumps({
    "commandName": "Check",
    "index": "0:1:1:0"
})
payload['ctl00_ContentPlaceHolder1_treePrincipal_ClientState'] = json.dumps({
    "expandedNodes": [], "collapsedNodes": [],
    "logEntries": [], "selectedNodes": [],
    "checkedNodes": ["0", "0:1", "0:1:1", "0:1:1:0"],
    "scrollPosition": 0
})

r = requests.post(url, data=payload, headers={
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"
})
soup = BeautifulSoup(r.text, "html.parser")

print([
    "{}/{}".format(base_url, t["href"])
    for t in soup.find_all('a', href=True)
    if not t["href"].startswith('javascript')
])
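The reason this approach works: ASP.NET WebForms ties every postback to per-page __VIEWSTATE and __EVENTVALIDATION tokens, so values copied out of Fiddler go stale quickly. Scraping them from a fresh GET of the same page ensures the POST carries tokens the server will accept.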
