Authenticate with Python-requests on aspx based website - python

I want to scrape data from the shipping page of our company's logistics portal, which is based on ASP. I have watched a lot of tutorials on the Internet about the BeautifulSoup and Requests libraries, but it isn't working as expected for me.
The login url is:
https://portal-vesta.sequoialog.com.br/tms/LoginPortal.aspx
I wrote the code as a bash script and my login attempt worked, returning this message:
69|dataItem||<script type="text/javascript">window.location="about:blank"</script>|32|pageRedirect||/tms/HomePortal.aspx?Usu=1070rpr|
But in python, it is returning me this message:
b'69|dataItem||<script type="text/javascript">window.location="about:blank"</script>|21|pageRedirect||/TMS/LoginPortal.aspx|'
My code is:
# Log in to the ASP.NET portal, then fetch the post-login page.
with Session() as s:
    # Initial GET: the login form embeds per-session __VIEWSTATE /
    # __EVENTVALIDATION tokens; get_ev_vs() stores them in the globals
    # `vs` and `ev`.
    page = s.get(urls[0])
    get_ev_vs(page)
    # BUG FIX: the original interpolated the raw token values into an
    # already URL-encoded form string, so '+', '/' and '=' characters
    # inside the ViewState corrupted the body and the server bounced the
    # request back to /TMS/LoginPortal.aspx. Passing a dict lets requests
    # URL-encode every field correctly (matching the working bash attempt).
    payload = {
        "Ajax": "UpdatePanel1|Button1",
        "Button1": "ENTRAR",
        "__ASYNCPOST": "true",
        "__EVENTARGUMENT": "",
        "__EVENTTARGET": "",
        "__EVENTVALIDATION": ev,
        "__LASTFOCUS": "",
        "__VIEWSTATE": vs,
        "txtSenha": password,
        "txtUsuario": user,
        "txtUsuarioSolicita": "",
    }
    head = {
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'Cache-Control': 'no-cache',
        'X-MicrosoftAjax': 'Delta=true',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': '*/*'
    }
    response = s.post(urls[0], data=payload, headers=head)
    headers = {
        'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
        'sec-ch-ua-mobile': '?0',
        'Upgrade-Insecure-Requests': '1',
        'DNT': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    }
    # The Session already carries ASP.NET_SessionId, so no manual Cookie
    # header is needed; a GET request also takes no body.
    open_page = s.get(urls[1], headers=headers)
    print(open_page.text)
def get_ev_vs(page):
    """Extract the ASP.NET hidden-form tokens from a login page.

    Stores the __VIEWSTATE / __EVENTVALIDATION values into the module
    globals ``vs`` and ``ev``, which are read later when building the
    login payload. ``page`` is a requests Response for the login form.
    """
    soup = BeautifulSoup(page.text, 'html.parser')
    global vs, ev
    vs = soup.select_one('#__VIEWSTATE')['value']
    ev = soup.select_one('#__EVENTVALIDATION')['value']

import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
# Hidden ASP.NET form fields that must be harvested from the login page
# and echoed back in the login POST.
target = ["__VIEWSTATE", "__EVENTVALIDATION"]

# Browser-like defaults applied to every request on the session.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0",
}
def main(url):
    """Log in to the Sequoia TMS portal and print the post-login page.

    url: the LoginPortal.aspx address. Relies on the module-level
    ``headers`` and ``target`` constants.
    """
    with requests.Session() as req:
        req.headers.update(headers)
        # GET the login form to harvest the per-session hidden fields.
        r = req.get(url)
        soup = BeautifulSoup(r.text, 'lxml')
        values = [soup.select_one('#{}'.format(x))['value'] for x in target]
        data = {
            "Ajax": "UpdatePanel1|Button1",
            "__LASTFOCUS": "",
            "__EVENTTARGET": "",
            "__EVENTARGUMENT": "",
            "__VIEWSTATE": values[0],
            "__EVENTVALIDATION": values[1],
            "txtUsuario": "aaa",  # username should be here
            "txtSenha": "aaa",  # password too!
            "txtUsuarioSolicita": "",
            "__ASYNCPOST": "true",
            "Button1": "ENTRAR"
        }
        r = req.post(url, data=data)
        # The AJAX response embeds a "ScriptPath|<path>|" fragment that
        # points at the authenticated landing page (raises AttributeError
        # if the login failed and the fragment is absent).
        match = re.search(r'ScriptPath\|([^|]+db)\|', r.text).group(1)
        final = urljoin(url, match)
        r = req.get(final)
        print(r.text)


# Guarded so importing this module does not fire network requests.
if __name__ == "__main__":
    main('https://portal-vesta.sequoialog.com.br/tms/LoginPortal.aspx')

Related

Scraping data through Api from json

I would like to limit the data they receive to the first 8 links on the website. As shown in the picture, there is no data available beyond the 8th link, as seen in the CSV file. How can I apply this limit so that they only receive data from the first 8 links? The website link is https://www.linkedin.com/learning/search?keywords=data%20science,
JSON API
CSV File
Code part
import requests
import pandas as pd
# LinkedIn Learning internal search API, replayed with browser-captured
# headers (the cookie / csrf-token pair must come from a live session).
url = "https://www.linkedin.com/learning-api/searchV2?keywords=data%20science&q=keywords&searchRequestId=RW4AuZRJT22%2BUeXnsZJGQA%3D%3D"
payload = {}
headers = {
    'authority': 'www.linkedin.com',
    'accept': 'application/vnd.linkedin.normalized+json+2.1',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,pt;q=0.7',
    # BUG FIX: in the paste the cookie value was wrapped onto a second
    # line, splitting the string literal (a SyntaxError). It is rejoined
    # here via implicit string concatenation.
    'cookie': 'bscookie="v=1&202108281231498ed9b977-a15a-4647-83ff-d0ef12adfbfbAQFdf9p_GSaBPrFkmyztJ8zyOnqVND-D"; li_theme=light; li_theme_set=app; li_sugr=4752e3dd-9232-4bb9-9dbb-b29c1a127f77; bcookie="v=2&9fb3a4d0-1139-4e2b-89ba-e5374eeb9735"; aam_uuid=08800810176251362264578372297522883472; _gcl_au=1.1.240501668.1664707206; li_rm=AQELLfU3ZqmMhAAAAYQ_tPjGK8ONpN3EEUxH1P4M6Czq5fk6EXaEXSzKwoNSXoSZ7KgO5uSTE9iZ30fuhs6ju1rLH1VgXYyRM3nNuiTQEx1k2ca6SR0Hk1d5-NBafeE0zv65QetFY5Yrx2ufzRlfEXUkJJSoO9Z2o7MeuX-3Go7P4dI-m5HQM7VOKLiK_TD-ZWzj_OkdkR75K31QKGq8bxPLa0JpkGUzhDIVGWzl6vqkcl6BJEK2s-keIZjsiH5MZ9sbLXEVOxLg4vD21TTJBNshE6zaiWrSnxx_PEm44eDPqjvXRMVWFeX7VZfIe2KFshWXLRc4SY8hAQINymU; visit=v=1&M; G_ENABLED_IDPS=google; JSESSIONID="ajax:7673827752327651374"; timezone=Asia/Karachi; _guid=0f0d3402-80be-4bef-9baf-18d281f68921; mbox=session^#965dfb20b29e4f2688eedcf643d2e5ab^#1671620169|PC^#965dfb20b29e4f2688eedcf643d2e5ab.38_0^#1687170309; __ssid=db28305b-28da-4f8b-ad3a-54dea10b9eb9; dfpfpt=da2e5dde482a41b09cf7178ba1bcec7e; g_state={"i_l":0}; liap=true; li_at=AQEDATKxuC8DTVh9AAABhaytidQAAAGGZN5q6E0AdHv14xrDnsngkfFuMyIIbGYccHR15UrPQ8rb3qpS0_-mpCFm9pXQkoNYGdk87LiGVIqiw4oXuJ9tqflCEOev71_L83JoJ-fkbOfZwdG0RICtuIHn; AnalyticsSyncHistory=AQKUIualgILMBgAAAYZHP2t3mvejt25dMqUMRmrpyhaQMe1cucNiAMliFNRUf4cu4aKnZ1z1kQ_FGeqFr2m04Q; lms_ads=AQEr9ksNAL4kugAAAYZHP2z8QK26stPkoXe2TgJZW3Fnrl4dCzbC2DtithS1-zp5Ve85QwxzRhPvP9okaC0kbu40FYX7EqIk; lms_analytics=AQEr9ksNAL4kugAAAYZHP2z8QK26stPkoXe2TgJZW3Fnrl4dCzbC2DtithS1-zp5Ve85QwxzRhPvP9okaC0kbu40FYX7EqIk; fid=AQGWcXnO5AffyAAAAYZRr6tph6cekZ9ZD66e1xdHhumlVvJ3cKYzZLwfK-I3nJyeRyLQs3LRnowKjQ; lil-lang=en_US; lang=v=2&lang=en-us; _dd_l=1; _dd=ff90da3c-aa07-4491-9106-b226eba1c09c; AMCVS_14215E3D5995C57C0A495C55%40AdobeOrg=1; '
              'AMCV_14215E3D5995C57C0A495C55%40AdobeOrg=-637568504%7CMCIDTS%7C19403%7CMCMID%7C09349215808923073694559483836331055195%7CMCAAMLH-1677084815%7C3%7CMCAAMB-1677084815%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1676487215s%7CNONE%7CMCCIDH%7C1076847823%7CvVersion%7C5.1.1; s_cc=true; UserMatchHistory=AQJJ3j-efkcQeQAAAYZWAETxBE44VVBGzo_i-gr5nEGPOK85mS3kDScLdGC24_GeNx-GEeCNDrPOjkQde_MGT4iPc7vJV4sT_nPL8Tv4WMTLarIEliLYPkCvou8zFlb3dFNkbXZjVV_KTVeDvUSJ5WJTeStLNXmzV3_EV5mI9dbSRpoTFlJ94vi_zxcCmnLTaGAYGQAdymMv4SbaMgtnt3QcY8Zj9-hnwxdsIEmJloq47_QTP7sfl-SG-vw8xvhl9KYb0ZPKCnQ6ioJhu3G4cFpKJiSUbULkYMADSo0; lidc="b=VB23:s=V:r=V:a=V:p=V:g=4060:u=105:x=1:i=1676480108:t=1676566269:v=2:sig=AQEz2UktgVcQuJwMoVRgKgnUuKtCEm9C"; s_sq=%5B%5BB%5D%5D; gpv_pn=www.linkedin.com%2Flearning%2Fsearch; s_ips=615; s_plt=7.03; s_pltp=www.linkedin.com%2Flearning%2Fsearch; s_tp=6116; s_ppv=www.linkedin.com%2Flearning%2Fsearch%2C47%2C10%2C2859%2C7%2C18; s_tslv=1676480356388',
    'csrf-token': 'ajax:7673827752327651374',
    'referer': 'https://www.linkedin.com/learning/search?keywords=data%20science',
    'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
    'x-li-lang': 'en_US',
    'x-li-page-instance': 'urn:li:page:d_learning_search;gNOg2MJoSqWv2XNAh4ukiQ==',
    'x-li-pem-metadata': 'Learning Exp - Search=search',
    'x-li-track': '{"clientVersion":"1.1.2236","mpVersion":"1.1.2236","osName":"web","timezoneOffset":5,"timezone":"Asia/Karachi","mpName":"learning-web","displayDensity":1,"displayWidth":1366,"displayHeight":768}',
    'x-lil-intl-library': 'en_US',
    'x-restli-protocol-version': '2.0.0'
}
# `data=payload` is an empty body and has no effect on a GET.
res = requests.request("GET", url, headers=headers, data=payload).json()
# Flatten the API response into rows and write them out as CSV.
product = []
items = res['included']
for item in items:
    # Any of these nested fields may be missing on non-course entries;
    # fall back to '' instead of aborting.
    try:
        title = item['headline']['title']['text']
    except (KeyError, TypeError):
        title = ''
    try:
        url = 'https://www.linkedin.com/learning/' + item['slug']
    except (KeyError, TypeError):
        url = ''
    try:
        rating = item['rating']['ratingCount']
    except (KeyError, TypeError):
        rating = ''
    # BUG FIX: `name` was never assigned in the original loop, which
    # raised NameError on the first iteration. The instructor name lives
    # in the entry's description text (as in the accepted answer).
    name = (item.get('description') or {}).get('text', '')
    wev = {
        'title': title,
        'instructor': name,
        'review': rating,
        'url': url
    }
    product.append(wev)
df = pd.DataFrame(product)
df.to_csv('learning.csv')
To filter the rows that contain empty columns, specifically those with an empty title column, you can simply add the following code:
# Rebuild the DataFrame from the scraped rows, then keep only rows whose
# title is non-empty (entries that failed extraction have title == '').
df=pd.DataFrame(product)
# NOTE: `filter` shadows the builtin of the same name.
filter = df["title"] != ""
dfNew = df[filter]
dfNew.to_csv('learning.csv')
The entire code will be:
import requests
import pandas as pd
# Same replayed LinkedIn Learning API request as in the question; the
# cookie / csrf-token pair must be captured from a live browser session.
url = "https://www.linkedin.com/learning-api/searchV2?keywords=data%20science&q=keywords&searchRequestId=RW4AuZRJT22%2BUeXnsZJGQA%3D%3D"
payload = {}
headers = {
    'authority': 'www.linkedin.com',
    'accept': 'application/vnd.linkedin.normalized+json+2.1',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,pt;q=0.7',
    # BUG FIX: the paste wrapped this cookie value onto a second line,
    # splitting the string literal (a SyntaxError); rejoined here via
    # implicit string concatenation.
    'cookie': 'bscookie="v=1&202108281231498ed9b977-a15a-4647-83ff-d0ef12adfbfbAQFdf9p_GSaBPrFkmyztJ8zyOnqVND-D"; li_theme=light; li_theme_set=app; li_sugr=4752e3dd-9232-4bb9-9dbb-b29c1a127f77; bcookie="v=2&9fb3a4d0-1139-4e2b-89ba-e5374eeb9735"; aam_uuid=08800810176251362264578372297522883472; _gcl_au=1.1.240501668.1664707206; li_rm=AQELLfU3ZqmMhAAAAYQ_tPjGK8ONpN3EEUxH1P4M6Czq5fk6EXaEXSzKwoNSXoSZ7KgO5uSTE9iZ30fuhs6ju1rLH1VgXYyRM3nNuiTQEx1k2ca6SR0Hk1d5-NBafeE0zv65QetFY5Yrx2ufzRlfEXUkJJSoO9Z2o7MeuX-3Go7P4dI-m5HQM7VOKLiK_TD-ZWzj_OkdkR75K31QKGq8bxPLa0JpkGUzhDIVGWzl6vqkcl6BJEK2s-keIZjsiH5MZ9sbLXEVOxLg4vD21TTJBNshE6zaiWrSnxx_PEm44eDPqjvXRMVWFeX7VZfIe2KFshWXLRc4SY8hAQINymU; visit=v=1&M; G_ENABLED_IDPS=google; JSESSIONID="ajax:7673827752327651374"; timezone=Asia/Karachi; _guid=0f0d3402-80be-4bef-9baf-18d281f68921; mbox=session^#965dfb20b29e4f2688eedcf643d2e5ab^#1671620169|PC^#965dfb20b29e4f2688eedcf643d2e5ab.38_0^#1687170309; __ssid=db28305b-28da-4f8b-ad3a-54dea10b9eb9; dfpfpt=da2e5dde482a41b09cf7178ba1bcec7e; g_state={"i_l":0}; liap=true; li_at=AQEDATKxuC8DTVh9AAABhaytidQAAAGGZN5q6E0AdHv14xrDnsngkfFuMyIIbGYccHR15UrPQ8rb3qpS0_-mpCFm9pXQkoNYGdk87LiGVIqiw4oXuJ9tqflCEOev71_L83JoJ-fkbOfZwdG0RICtuIHn; AnalyticsSyncHistory=AQKUIualgILMBgAAAYZHP2t3mvejt25dMqUMRmrpyhaQMe1cucNiAMliFNRUf4cu4aKnZ1z1kQ_FGeqFr2m04Q; lms_ads=AQEr9ksNAL4kugAAAYZHP2z8QK26stPkoXe2TgJZW3Fnrl4dCzbC2DtithS1-zp5Ve85QwxzRhPvP9okaC0kbu40FYX7EqIk; lms_analytics=AQEr9ksNAL4kugAAAYZHP2z8QK26stPkoXe2TgJZW3Fnrl4dCzbC2DtithS1-zp5Ve85QwxzRhPvP9okaC0kbu40FYX7EqIk; fid=AQGWcXnO5AffyAAAAYZRr6tph6cekZ9ZD66e1xdHhumlVvJ3cKYzZLwfK-I3nJyeRyLQs3LRnowKjQ; lil-lang=en_US; lang=v=2&lang=en-us; _dd_l=1; _dd=ff90da3c-aa07-4491-9106-b226eba1c09c; AMCVS_14215E3D5995C57C0A495C55%40AdobeOrg=1; '
              'AMCV_14215E3D5995C57C0A495C55%40AdobeOrg=-637568504%7CMCIDTS%7C19403%7CMCMID%7C09349215808923073694559483836331055195%7CMCAAMLH-1677084815%7C3%7CMCAAMB-1677084815%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1676487215s%7CNONE%7CMCCIDH%7C1076847823%7CvVersion%7C5.1.1; s_cc=true; UserMatchHistory=AQJJ3j-efkcQeQAAAYZWAETxBE44VVBGzo_i-gr5nEGPOK85mS3kDScLdGC24_GeNx-GEeCNDrPOjkQde_MGT4iPc7vJV4sT_nPL8Tv4WMTLarIEliLYPkCvou8zFlb3dFNkbXZjVV_KTVeDvUSJ5WJTeStLNXmzV3_EV5mI9dbSRpoTFlJ94vi_zxcCmnLTaGAYGQAdymMv4SbaMgtnt3QcY8Zj9-hnwxdsIEmJloq47_QTP7sfl-SG-vw8xvhl9KYb0ZPKCnQ6ioJhu3G4cFpKJiSUbULkYMADSo0; lidc="b=VB23:s=V:r=V:a=V:p=V:g=4060:u=105:x=1:i=1676480108:t=1676566269:v=2:sig=AQEz2UktgVcQuJwMoVRgKgnUuKtCEm9C"; s_sq=%5B%5BB%5D%5D; gpv_pn=www.linkedin.com%2Flearning%2Fsearch; s_ips=615; s_plt=7.03; s_pltp=www.linkedin.com%2Flearning%2Fsearch; s_tp=6116; s_ppv=www.linkedin.com%2Flearning%2Fsearch%2C47%2C10%2C2859%2C7%2C18; s_tslv=1676480356388',
    'csrf-token': 'ajax:7673827752327651374',
    'referer': 'https://www.linkedin.com/learning/search?keywords=data%20science',
    'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
    'x-li-lang': 'en_US',
    'x-li-page-instance': 'urn:li:page:d_learning_search;gNOg2MJoSqWv2XNAh4ukiQ==',
    'x-li-pem-metadata': 'Learning Exp - Search=search',
    'x-li-track': '{"clientVersion":"1.1.2236","mpVersion":"1.1.2236","osName":"web","timezoneOffset":5,"timezone":"Asia/Karachi","mpName":"learning-web","displayDensity":1,"displayWidth":1366,"displayHeight":768}',
    'x-lil-intl-library': 'en_US',
    'x-restli-protocol-version': '2.0.0'
}
# `data=payload` is an empty body and has no effect on a GET.
res = requests.request("GET", url, headers=headers, data=payload).json()
# Flatten the API response into rows, drop rows without a title, write CSV.
product = []
items = res['included']
for item in items:
    # Any of these nested fields may be missing on non-course entries;
    # fall back to '' instead of aborting.
    try:
        title = item['headline']['title']['text']
    except (KeyError, TypeError):
        title = ''
    try:
        url = 'https://www.linkedin.com/learning/' + item['slug']
    except (KeyError, TypeError):
        url = ''
    try:
        rating = item['rating']['ratingCount']
    except (KeyError, TypeError):
        rating = ''
    # Instructor name lives in the entry's description text.
    name = item.get("description", {}).get("text", "")
    wev = {
        'title': title,
        'instructor': name,
        'review': rating,
        'url': url
    }
    product.append(wev)
df = pd.DataFrame(product)
# Keep only rows whose title is non-empty (NOTE: `filter` shadows the builtin).
filter = df["title"] != ""
dfNew = df[filter]
dfNew.to_csv('learning.csv')
However, this solution works because the website is well structured. For complex/irregular websites I prefer to use Scrapy, as we do at my job.

python requests not returning json data

I would like to get the json data from for instance https://app.weathercloud.net/d0838117883#current using python requests module.
I tried:
import re
import requests
# Fetch the stats JSON for a Weathercloud device.
device = '0838117883'
URL = 'https://app.weathercloud.net'
URL1 = URL + '/d' + device        # device page (HTML, embeds the CSRF token)
URL2 = URL + '/device/stats'      # JSON stats endpoint

headers = {'Content-Type': 'text/plain; charset=UTF-8',
           'Referer': URL1,
           'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/48.0.2564.82 Chrome/48.0.2564.82 Safari/537.36',
           'Accept': 'application/json, text/javascript,*/*'}

with requests.Session() as s:
    # Get HTML from URL1 in order to extract the CSRF token.
    page = s.get(URL1)
    CSRF = re.findall('WEATHERCLOUD_CSRF_TOKEN:"(.*)"},', page.text)[0]
    # Build the parameters for URL2, in order to get the JSON stats.
    params = {'code': device, 'WEATHERCLOUD_CSRF_TOKEN': CSRF}
    # BUG FIX: the original called requests.get() here, starting a fresh
    # cookie-less request — the server then answered 200 with an empty
    # body. Reusing the Session sends the session cookies set by the
    # first GET along with the token.
    page_stats = s.get(URL2, params=params, headers=headers)
    print(page_stats.url)
    print(page_stats)
    print(page_stats.text)
    print(page_stats.json())
But the page_stats is empty.
How can I get the stats data from weathercloud?
Inspecting the page with DevTools, you'll find a useful endpoint:
https://app.weathercloud.net/device/stats
You can "replicate" the original web request made by your browser with requests library:
import requests
# Replay of the browser's /device/stats request; the starred values are
# placeholders for session-specific cookies/tokens captured via DevTools.
cookies = {
    'PHPSESSID': '************************',
    'WEATHERCLOUD_CSRF_TOKEN': '***********************',
    '_ga': '**********',
    '_gid': '**********',
    '__gads': 'ID=**********',
    'WeathercloudCookieAgreed': 'true',
    '_gat': '1',
    'WEATHERCLOUD_RECENT_ED3C8': '*****************',
}
headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '^\\^Google',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua-mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'sec-ch-ua-platform': '^\\^Windows^\\^',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://app.weathercloud.net/d0838117883',
    'Accept-Language': 'it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7,es;q=0.6',
}
params = (
    ('code', '0838117883'),
    ('WEATHERCLOUD_CSRF_TOKEN', '****************'),
)
response = requests.get('https://app.weathercloud.net/device/stats', headers=headers, params=params, cookies=cookies)
# BUG FIX: the original called json.loads(response.text) without ever
# importing json; Response.json() parses the body with no extra import.
json_object = response.json()
json Output:
{'last_update': 1632842172,
'bar_current': [1632842172, 1006.2],
'bar_day_max': [1632794772, 1013.4],
'bar_day_min': [1632845772, 1006.2],
'bar_month_max': [1632220572, 1028],
'bar_month_min': [1632715572, 997.3],
'bar_year_max': [1614418512, 1038.1],
'bar_year_min': [1615434432, 988.1],
'wdir_current': [1632842172, 180],
..............}
That's it.

How can I use Python to get a download link with a password for onedrive

OneDrive is a great tool, but the downside is that for links shared by others there is no way to download them without a graphical interface.
I've recently been working on how to download OneDrive files that people have shared with me from the command line, and I've now worked out how to bulk push download links to Aria2, a download tool, when the shared links don't have passwords.
# Browser-like headers shared by the OneDrive download helpers below.
# Several fields (referer, cookie, authority) are filled in at runtime.
header = {
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'dnt': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'service-worker-navigation-preload': 'true',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-dest': 'iframe',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
}
def downloadFiles(originalPath, aria2URL, token):
    """Push download links for a password-less OneDrive share to aria2.

    originalPath: the shared-folder URL.
    aria2URL: the aria2 JSON-RPC endpoint.
    token: the aria2 RPC secret.
    """
    req = requests.session()
    req = req.get(originalPath, headers=header)
    # The file listing is embedded in the page as a g_listData JS object;
    # grab the "Row" array. (Raw string added: the original pattern relied
    # on '\s'/'\S' surviving as literal backslashes in a plain string.)
    p = re.search(
        r'g_listData = {"wpq":"","Templates":{},"ListData":{ "Row" : ([\s\S]*?),"FirstRow"', req.text)
    jsonData = json.loads(p.group(1))
    redURL = req.url
    redsURL = redURL.split("/")
    downloadURL = "/".join(redsURL[:-1]) + "/download.aspx?UniqueId="
    s2 = parse.urlparse(redURL)
    # NOTE(review): this mutates the module-level `header` dict in place,
    # so state leaks between calls — confirm that is intended.
    header["referer"] = redURL
    header["cookie"] = req.headers["Set-Cookie"]
    header["authority"] = s2.netloc
    # aria2 expects the per-download headers as one "key:value\n" blob.
    headerStr = ""
    for key, value in header.items():
        headerStr += key + ':' + str(value) + "\n"
    for i in jsonData:
        # UniqueId is stored brace-wrapped, e.g. "{ABC...}" -> abc...
        cc = downloadURL + (i["UniqueId"][1:-1].lower())
        dd = dict(out=i["FileLeafRef"], header=headerStr)
        jsonreq = json.dumps({'jsonrpc': '2.0', 'id': 'qwer',
                              'method': 'aria2.addUri',
                              "params": ["token:" + token, [cc], dd]})
        c = requests.post(aria2URL, data=jsonreq)
        pprint(json.loads(c.text))
However, if the link itself has a password, although it is possible to post a link that verifies the password, it gets stuck in endless redirects, so how do I solve this problem?
The process of post a link to verify a password:
def getFilesHavePwd(originalPath, password):
    """Submit the password form for a password-protected OneDrive share.

    Scrapes the ASP.NET hidden fields from the landing page, then POSTs
    them together with the password to guestaccess.aspx.

    originalPath: the shared link URL.
    password: the share password.
    """
    req = requests.session()
    # NOTE(review): updating the *cookie jar* with the `header` dict of
    # HTTP headers looks wrong — confirm whether req.headers.update()
    # was intended here.
    req.cookies.update(header)
    r = req.get(originalPath)
    # Harvest the hidden ASP.NET form fields from the password page.
    p = re.search(
        'SideBySideToken" value="(.*?)" />', r.text)
    SideBySideToken = p.group(1)
    p = re.search(
        'id="__VIEWSTATE" value="(.*?)" />', r.text)
    __VIEWSTATE = p.group(1)
    p = re.search(
        'id="__VIEWSTATEGENERATOR" value="(.*?)" />', r.text)
    __VIEWSTATEGENERATOR = p.group(1)
    p = re.search(
        '__EVENTVALIDATION" value="(.*?)" />', r.text)
    __EVENTVALIDATION = p.group(1)
    # Rebuild the URL so the last path segment becomes
    # guestaccess.aspx?<query>&share=<share id>.
    s2 = parse.urlparse(originalPath)
    redURL = originalPath
    redsURL = redURL.split("/")
    shareQuery = s2.path.split("/")[-1]
    redsURL[-1] = "guestaccess.aspx?" + s2.query + "&share=" + shareQuery
    pwdURL = "/".join(redsURL)
    print(r.headers)
    hewHeader = {
        'sec-ch-ua-mobile': '?0',
        'upgrade-insecure-requests': '1',
        'dnt': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'content-type': 'application/x-www-form-urlencoded',
        "connection": "keep-alive",
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        "host": s2.netloc,
        "origin": s2.scheme + "://" + s2.netloc,
        "Referer": originalPath,
        # 'sec-ch-ua-mobile' appeared twice in the original literal with
        # the same value; the duplicate (which Python silently discards)
        # has been removed.
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0',
    }
    req.cookies.update(header)
    r = req.post(pwdURL, data={
        "__EVENTTARGET": "btnSubmitPassword",
        "__EVENTARGUMENT": None,
        "SideBySideToken": SideBySideToken,
        "__VIEWSTATE": __VIEWSTATE,
        "__VIEWSTATEGENERATOR": __VIEWSTATEGENERATOR,
        "__VIEWSTATEENCRYPTED": None,
        "__EVENTVALIDATION": __EVENTVALIDATION,
        "txtPassword": password
    }, headers=hewHeader)
    print(r.headers, r.text)
A link for test:
https://acgmi-my.sharepoint.com/:f:/g/personal/support_acgmi_club/Elkm2koDXKFEq1onWqnefdwBTqR7wgm0MiNvmWWrurKskQ?e=qotSdz
Password:123456

Set API Key when doing reqests.get in Python

I'm trying to get data from an API, and I have a key for it, but I'm always getting a 401 error; when I print the response it shows {"error":"API key required."}.
What am I doing wrong?
I have tried these 5 things, and error is always the same:
import json
import requests
# ClaimBuster scoring endpoint; the text to score is embedded in the URL path.
api_url = "https://idir.uta.edu/claimbuster/api/v2/score/text/Denver%20lost%20to%20LA%20Lakers"
# Every variant below returned 401 {"error":"API key required."}.
# NOTE(review): per the docs snippet further down, the key is expected in
# the `x-api-key` request header — confirm the exact value format against
# the ClaimBuster API documentation.
response = requests.get(api_url, headers = {'Authorization': 'Token xxx'})
#response = requests.get(api_url, headers = {'Authorization': 'xxx'})
#response = requests.get(api_url, headers = {'Auth': 'Token xxx'})
#response = requests.get(api_url, headers = {'Auth': 'xxx'})
#response = requests.get(api_url,key ='xxx')
#response = requests.get(api_url, headers = {'x-api-key': 'Token xxx'}
The documentation says it should be
import requests
# Headers as shown by the API documentation: the key goes in the
# `x-api-key` header.
headers = {
'Connection': 'keep-alive',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
'x-api-key': 'Token xxx',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': '',
'Accept-Language': '*',
}
response = requests.get(api_url, headers=headers)

BeautifulSoup unsuccessful request

I'm using urlopen to extract data from a list of websites right now but keep running into problems with unsuccessful requests. Can anyone help me with it?
I save the website as HTML file
# Parse a locally saved copy of the Proposition 65 list and collect the
# chemical URL slugs from the page's <option> elements.
path = "/Users/runyao/Downloads/The Proposition 65 List | OEHHA.html"
with open(path) as fh:  # context manager: the original leaked the file handle
    soup = BeautifulSoup(fh, "html.parser")
list = []  # NOTE(review): shadows the builtin `list`; later snippets index it
pattern = "(chemicals)+([a-z0-9-])"
for counter in range(1, 852):
    # Each <option> renders as <option value="slug">name</option>; turn
    # the delimiters into '/' and split so the slug lands 4th from the end.
    temp = str(soup.find_all('option')[counter])
    temptext = temp.replace("\n", "/")
    temptext = temptext.replace('"', "/")
    temptext = temptext.replace(">", "")
    templist = temptext.split("/")
    list.append(templist[-4])
url = 'https://oehha.ca.gov/chemicals/' + list[1] + '/'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
values = {'name': 'Michael Foord',
          'location': 'Northampton',
          'language': 'Python'}
headers = {'User-Agent': user_agent}
data = urllib.parse.urlencode(values)
data = data.encode('ascii')
# Sending a body makes this a POST; the response is read inside the
# context manager so the connection is always released.
req = urllib.request.Request(url, data, headers)
with urllib.request.urlopen(req) as response:
    the_page = response.read()
import urllib.parse
import urllib.request

# (A stray zero-width character followed these imports in the original
# paste — it would raise a SyntaxError and has been removed.)

# Plain GET of the first collected chemical page with only a User-Agent;
# the Incapsula anti-bot layer answers this with the empty stub page below.
url = "https://oehha.ca.gov/chemicals/" + list[1]
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
req = urllib.request.Request(url, headers=headers)
resp = urllib.request.urlopen(req)
respData = BeautifulSoup(resp, "html.parser")
print(respData)
<html>
<head>
<meta content="noindex,nofollow" name="robots"/>
<script src="/_Incapsula_Resource?SWJIYLWA=5074a744e2e3d891814e9a2dace20bd4,719d34d31c8e3a6e6fffd425f7e032f3">
</script>
<body>
</body></head></html>
And if you try adding more headers , like this....this work for me :
import requests
# Browser-captured headers — including the Incapsula session cookies —
# that let the request through where a bare User-Agent did not.
# NOTE(review): the cookie values are session-specific and will expire.
headers = {
'authority': 'oehha.ca.gov',
'cache-control': 'max-age=0',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
'sec-fetch-user': '?1',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'navigate',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'es-ES,es;q=0.9',
'cookie': 'visid_incap_1712983=Dfo+CVLdTCSzMHcIXEgPSb1y7l0AAAAAQUIPAAAAAAAetg/EHDkFJUigbIo4eaK4; incap_ses_532_1712983=dFrgDpkdqkYCF1u+mQxiB71y7l0AAAAAksMc42V5CJx6OdUZdeHflA==; has_js=1; _ga=GA1.2.626207180.1575908117; _gid=GA1.2.512657448.1575908117; __utma=158387685.626207180.1575908117.1575908117.1575908117.1; __utmc=158387685; __utmz=158387685.1575908117.1.1.utmcsr=(direct)^|utmccn=(direct)^|utmcmd=(none); __utmt=1; __utmt_b=1; _aeaid=5e12b9d6-0171-4fde-8ccf-1bba809a1bb2; aeatstartmessage=true; __utmb=158387685.4.10.1575908117',
'if-none-match': '^\\^1575853039-1^\\^',
}
response = requests.get('https://oehha.ca.gov/chemicals/abiraterone-acetate', headers=headers)
print(response.content)
UPDATE:I have updated the code using urllib , probably have problems because they have blocked the IP from where you make the requests.
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request
# Same request via urllib: the browser cookie is what gets past the
# Incapsula check. NOTE(review): if it still fails, the source IP may
# simply be blocked — the cookie values here are also session-specific.
url = "https://oehha.ca.gov/chemicals/abiraterone-acetate"
# headers = {}
# headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
'cookie': 'visid_incap_1712983=Dfo+CVLdTCSzMHcIXEgPSb1y7l0AAAAAQUIPAAAAAAAetg/EHDkFJUigbIo4eaK4; incap_ses_532_1712983=dFrgDpkdqkYCF1u+mQxiB71y7l0AAAAAksMc42V5CJx6OdUZdeHflA==; has_js=1; _ga=GA1.2.626207180.1575908117; _gid=GA1.2.512657448.1575908117; __utma=158387685.626207180.1575908117.1575908117.1575908117.1; __utmc=158387685; __utmz=158387685.1575908117.1.1.utmcsr=(direct)^|utmccn=(direct)^|utmcmd=(none); __utmt=1; __utmt_b=1; _aeaid=5e12b9d6-0171-4fde-8ccf-1bba809a1bb2; aeatstartmessage=true; __utmb=158387685.4.10.1575908117',
}
req = urllib.request.Request(url, data=None, headers = headers)
resp = urllib.request.urlopen(req)
respData = BeautifulSoup(resp,"html.parser",from_encoding="iso-8859-1")
print(respData)
Result :

Categories