API call's POST method throwing a certificate verification error - Python

Since the table data is generated dynamically by JavaScript, I am calling the API URL directly,
but the request throws the warning mentioned above. Any help is appreciated.
My code:
import requests
import pandas as pd
import json

body = {'tipoEmpresa': '0'}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'x-dtpc': '33$511511524_409h2vHHVRBIAIGILPJNCRGRCECUBIACWCBUEE-0e37',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/json'
}

def main(url):
    with requests.Session() as req:
        req.headers.update(headers)
        # Headers are already set on the session, so they are not passed again here.
        r = req.post(url, data=json.dumps(body), verify=False)
        resp = r.json()['d']
        # df = pd.DataFrame(resp)
        # print(df)

main('https://www.rad.cvm.gov.br/ENET/frmConsultaExternaCVM.aspx/PopulaComboEmpresas')

Try adding these two lines at the beginning of your script to suppress the warning messages:
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
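Suppressing the warning hides it but leaves TLS verification off. A safer alternative, sketched below under the assumption that you can export the site's certificate chain to a local PEM file (cvm_ca_bundle.pem is a hypothetical filename), is to keep verification on and point requests at that bundle:

import requests

# Keep TLS verification enabled by pointing requests at a CA bundle.
# 'cvm_ca_bundle.pem' is a hypothetical file containing the server's
# certificate chain, e.g. exported from the browser.
r = requests.post(
    'https://www.rad.cvm.gov.br/ENET/frmConsultaExternaCVM.aspx/PopulaComboEmpresas',
    json={'tipoEmpresa': '0'},  # requests sets the application/json Content-Type itself
    verify='cvm_ca_bundle.pem',
)
print(r.json()['d'])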

Related

Can't fetch tabular content from a webpage using requests

I would like to scrape tabular content from the landing page of this website. There are 100 rows on its first page. When I observe the network activity in dev tools, I can see that a GET request is issued to this url https://io6.dexscreener.io/u/ws3/screener3/ with appropriate parameters, which ends up producing JSON content.
However, when I try to mimic those requests with the following effort:

import requests

url = 'https://io6.dexscreener.io/u/ws3/screener3/'

params = {
    'EIO': '4',
    'transport': 'polling',
    't': 'NwYSrFK',
    'sid': 'ztAOHWOb-1ulTq-0AQwi',
}

headers = {
    'accept': '*/*',
    'referer': 'https://dexscreener.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
}

with requests.Session() as s:
    s.headers.update(headers)
    res = s.get(url, params=params)
    print(res.content)
I get this response:
`{"code":3,"message":"Bad request"}`
How can I get response having tabular content from that webpage?
Here is a very quick and dirty piece of Python code that does the initial handshake, sets up the websocket connection, and downloads the data in JSON format indefinitely. I haven't tested this code extensively, and I am not sure exactly which steps of the handshake are necessary, but I have mimicked the browser behaviour and it seems to work fine:
import requests
from websocket import create_connection
import json

s = requests.Session()
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

url = 'https://dexscreener.com/ethereum'
resp = s.get(url, headers=headers)
print(resp)

# Engine.IO handshake: the first polling requests open the session and
# return the session id ('sid') needed for every later request.
step1 = s.get('https://io3.dexscreener.io/u/ws3/screener3/?EIO=4&transport=polling&t=Nwof-Os')
step2 = s.get('https://io4.dexscreener.io/u/ws3/screener3/?EIO=4&transport=polling&t=Nwof-S5')
obj = json.loads(step2.text[1:])
code = obj['sid']

# Join the Socket.IO namespace, then poll once for the initial payload.
payload = '40/u/ws/screener/consolidated/platform/ethereum/h1/top/1,'
step3 = s.post(f'https://io4.dexscreener.io/u/ws3/screener3/?EIO=4&transport=polling&t=Nwof-Xt&sid={code}', data=payload)
step4 = s.get(f'https://io4.dexscreener.io/u/ws3/screener3/?EIO=4&transport=polling&t=Nwof-Xu&sid={code}')

# Strip the Engine.IO record separator (ASCII 0x1e) and the packet prefixes.
d = step4.text.replace('\x1e', '').replace('42/u/ws/screener/consolidated/platform/ethereum/h1/top/1,', '').replace(payload, '')
start = '["screener",'
end = ']["latestBlock",'
dirty = d[d.find(start) + len(start):d.rfind(end)].strip()
clean = json.loads(dirty)
print(clean)

# Initialize the headers needed for the websocket connection
headers = {
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-ZA,en;q=0.9,en-GB;q=0.8,en-US;q=0.7,de;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'Upgrade',
    'Host': 'io3.dexscreener.io',
    'Origin': 'https://dexscreener.com',
    'Pragma': 'no-cache',
    'Sec-WebSocket-Extensions': 'permessage-deflate; client_max_window_bits',
    'Sec-WebSocket-Key': 'ssklBDKxAOUt3D47SoEttQ==',
    'Sec-WebSocket-Version': '13',
    'Upgrade': 'websocket',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
}

# Then create a connection to the tunnel; websocket-client expects the
# extra headers under the 'header' keyword as a dict, not a JSON string.
ws = create_connection(f"wss://io4.dexscreener.io/u/ws3/screener3/?EIO=4&transport=websocket&sid={code}", header=headers)

# Then send the initial messages through the tunnel
ws.send('2probe')
ws.send('5')

# Here you will view the messages returned through the tunnel
while True:
    try:
        json_data = json.loads(ws.recv().replace('42/u/ws/screener/consolidated/platform/ethereum/h1/top/1,', ''))
        print(json_data)
    except Exception:
        # Non-JSON keepalive frames are skipped.
        pass
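If hand-rolling the handshake proves brittle, the python-socketio client can drive the same polling-to-websocket upgrade automatically. The sketch below assumes the endpoint is a standard Socket.IO v4 server; the namespace and the 'screener' event name are inferred from the raw packets above and may need adjusting:

import socketio

# Namespace taken from the '40/u/ws/...' connect packet in the manual handshake.
NAMESPACE = '/u/ws/screener/consolidated/platform/ethereum/h1/top/1'

sio = socketio.Client()

@sio.on('screener', namespace=NAMESPACE)
def on_screener(data):
    # Fires for each '42.../["screener", ...]' event frame.
    print(data)

sio.connect(
    'https://io3.dexscreener.io',
    socketio_path='/u/ws3/screener3/',
    namespaces=[NAMESPACE],
    transports=['polling', 'websocket'],
    headers={'Origin': 'https://dexscreener.com'},
)
sio.wait()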

Parsing ASPX site with Python POST request

I am trying to perform parsing, but when I send the POST method to get the search results, I get a page with this error:
The requested URL was rejected. Please consult with your administrator.
Website: https://prod.ceidg.gov.pl/CEIDG/CEIDG.Public.UI/Search.aspx
I've collected data like the viewstate, viewstategenerator, etc. to pass through the form, but it doesn't work.
What am I missing?
# import requests
from bs4 import BeautifulSoup
import lxml
import urllib
from requests_html import HTMLSession
from requests_html import AsyncHTMLSession
import time

# s = HTMLSession(browser_args=["--no-sandbox", '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'])
s = HTMLSession()

header_simple = {
    'User_Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'HTTP_ACCEPT': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Content-Type': 'application/x-www-form-urlencoded',
}

r = s.request('get', 'http://prod.ceidg.gov.pl/CEIDG/CEIDG.Public.UI/Search.aspx')
soup_dummy = BeautifulSoup(r.content, "lxml")

# parse and retrieve the vital form values
viewstate = soup_dummy.select("#__VIEWSTATE")[0]['value']
viewstategen = soup_dummy.select("#__VIEWSTATEGENERATOR")[0]['value']
eventvalidation = soup_dummy.select("#__EVENTVALIDATION")[0]['value']
english = soup_dummy.select("#hfEnglishWebsiteUrl")[0]['value']

data = {
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewstategen,
    '__EVENTVALIDATION': eventvalidation,
    'ctl00$MainContent$txtName': 'bank',
    'ctl00$MainContent$cbIncludeCeased': 'on',
    'ctl00$MainContent$btnSearch': 'Find',
    'ctl00$hfAuthRequired': 'False',
    'ctl00$hfEnglishWebsiteUrl': english,
    'ctl00$stWarningLength': '30',
    'ctl00$stIdleAfter': '1200',
    'ctl00$stPollingInterval': '60',
    'ctl00$stMultiTabTimeoutSyncInterval': '20'
}

time.sleep(3)
p = s.request('post', 'https://prod.ceidg.gov.pl/CEIDG/CEIDG.Public.UI/Search.aspx', params=data, headers=header_simple)
print(p.content)
This is one of the ways you can populate results from that page using the requests module. Be sure to include all the keys and values within the data parameter when sending the POST request in order to access the desired content.
Working script:
import lxml
import requests
from pprint import pprint
from bs4 import BeautifulSoup

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    r = s.get('http://prod.ceidg.gov.pl/CEIDG/CEIDG.Public.UI/Search.aspx')
    soup = BeautifulSoup(r.text, "lxml")

    # Collect every named input on the form, then override the search fields.
    data = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
    data['ctl00$MainContent$txtName'] = 'bank'
    data['ctl00$MainContent$cbIncludeCeased'] = 'on'
    data['ctl00$MainContent$btnSearch'] = 'Find'
    data.pop('ctl00$MainContent$btnClear')
    data.pop('ctl00$versionDetails$btnClose')
    # pprint(data)  # print it to see the keys and values that have been included within data

    p = s.post('https://prod.ceidg.gov.pl/CEIDG/CEIDG.Public.UI/Search.aspx', data=data)
    soup = BeautifulSoup(p.text, "lxml")
    print(soup.select_one("table#MainContent_DataListEntities"))
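If you want the matched rows as structured data rather than raw HTML, pandas can parse the selected table directly. A small follow-up sketch, assuming pandas is installed and the table id stays MainContent_DataListEntities:

import pandas as pd
from io import StringIO

# Convert the results table from the soup above into a DataFrame.
# read_html returns a list of tables; the id selector matches one, so take the first.
table = soup.select_one("table#MainContent_DataListEntities")
df = pd.read_html(StringIO(str(table)))[0]
print(df.head())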

WEB SCRAPING - python requests session not able to gather data

I've seen some similar threads, but none gave me the answer. I simply need to get the HTML content from one website. I'm sending a POST request with the data for a particular case, and then with a GET request I want to scrape the text from the HTML. The problem is that I always receive the first page's content. Not sure what I am doing wrong.
import requests

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'pl-PL,pl;q=0.9,en-US;q=0.8,en;q=0.7',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://przegladarka-ekw.ms.gov.pl',
    'Referer': 'https://przegladarka-ekw.ms.gov.pl/eukw_prz/KsiegiWieczyste/wyszukiwanieKW',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}

data = {
    'kodWydzialu': 'PT1R',
    'nrKw': '00037314',
    'cyfraK': '9',
}

url = 'https://przegladarka-ekw.ms.gov.pl/eukw_prz/KsiegiWieczyste/wyszukiwanieKW'

r = requests.session()
r.post(url, data=data, headers=headers)
final_content = r.get(url, headers=headers)
print(final_content.text)
The GET requests come from https://przegladarka-ekw.ms.gov.pl/eukw_prz/eukw201906070952/js/jquery-1.11.0_min.js, but that returns a wall of JavaScript. My goal is to scrape the page which appears after submitting the data above through the search menu.
Try this:
import urllib.parse
import urllib.request

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    # Accept-Encoding is left out: urllib does not transparently decompress gzip responses.
    'Accept-Language': 'pl-PL,pl;q=0.9,en-US;q=0.8,en;q=0.7',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://przegladarka-ekw.ms.gov.pl',
    'Referer': 'https://przegladarka-ekw.ms.gov.pl/eukw_prz/KsiegiWieczyste/wyszukiwanieKW',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}

data = {
    'kodWydzialu': 'PT1R',
    'nrKw': '00037314',
    'cyfraK': '9',
}

url = 'https://przegladarka-ekw.ms.gov.pl/eukw_prz/KsiegiWieczyste/wyszukiwanieKW'

# Form-encode the fields to match the declared Content-Type, and wrap the
# call in a Request object so the headers are actually sent.
body = urllib.parse.urlencode(data).encode('utf-8')
req = urllib.request.Request(url, data=body, headers=headers)
with urllib.request.urlopen(req) as r:
    for line in r:
        print(line)
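Whether the result page is in the POST response body or behind a follow-up request depends on the site, but keeping both calls on one requests.Session at least preserves the cookies the server sets. A minimal sketch along those lines:

import requests

url = 'https://przegladarka-ekw.ms.gov.pl/eukw_prz/KsiegiWieczyste/wyszukiwanieKW'
data = {'kodWydzialu': 'PT1R', 'nrKw': '00037314', 'cyfraK': '9'}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    # Inspect the POST response first: the search result is often in this
    # body already, so a second GET would only fetch the blank form again.
    resp = s.post(url, data=data)
    print(resp.text)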

I want to send a file as an attachment in an API POST request

I am trying to send a file for a candidate in a POST request to the naturalHR API.
I have tried the same request using Postman and it worked fine, but when I try to integrate the API's POST request in Python to attach the file, I get an error from the API saying that the cv parameter should be a file.
Source Code:
from pprint import pprint
import json
import requests
import urllib.request

headers = {
    'accept': 'application/json',
    'Authorization': api_key,
    'Host': 'api02.naturalhr.net',
    'Referer': 'https://api02.naturalhr.net/api/documentation',
    'Content-type': 'multipart/form-data',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}

payLoad = dict()
payLoad["firstname"] = json_of_vals['firstname']
payLoad["surname"] = json_of_vals['surname']
payLoad["email"] = json_of_vals['email']
payLoad["cv"] = "Path/To/PDF_File"

files = {'file': "outfilename.pdf"}
api_url = "https://api02.naturalhr.net/api/v1/candidate"
res = requests.post(api_url, files=files, headers=headers, data=request_data)
print(res.content)
Please don't mark this as a duplicate of a question that has already been answered here, because I have already tested passing files as the request's argument, like:
res = requests.post(api_url, files=files, headers=headers, data=request_data)
Edited:
The answer I tried:
Using Python Requests to send file and JSON in single request
I was adding a header,
'accept': 'application/json'
which should not be there (likewise the manually set 'Content-type': 'multipart/form-data'; setting that header yourself stops requests from appending the multipart boundary it generates). I tried it using only the user agent and the API key, and it worked perfectly fine as per the requirements.
Corrected Code:
from pprint import pprint
import json
import requests
import urllib.request

headers = {
    'Authorization': api_key,
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}

payLoad = dict()
payLoad["firstname"] = json_of_vals['firstname']
payLoad["surname"] = json_of_vals['surname']
payLoad["email"] = json_of_vals['email']

files = {'file': "PATH/TO/FILE/FROM/LOCAL/DRIVE"}
api_url = "https://api02.naturalhr.net/api/v1/candidate"

res = requests.post(api_url, headers=headers, data=payLoad, files=files)
print("Status Code is: ", res.status_code)
print("Returned JSON Response is:\n")
pprint(res.text)
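One detail worth checking in the corrected code: requests sends the dict value itself as the file body, so a bare path string uploads the literal text of the path, not the PDF. To stream the actual bytes, pass an open file handle; in the sketch below the cv field name echoes the API's error message but should be verified against the API docs:

import requests

api_url = "https://api02.naturalhr.net/api/v1/candidate"
headers = {'Authorization': 'YOUR_API_KEY'}  # placeholder key
payload = {'firstname': 'Jane', 'surname': 'Doe', 'email': 'jane@example.com'}

# Open the PDF in binary mode so the real bytes are uploaded; the tuple
# sets the filename and content type for the multipart part.
with open('cv.pdf', 'rb') as fh:
    files = {'cv': ('cv.pdf', fh, 'application/pdf')}
    res = requests.post(api_url, headers=headers, data=payload, files=files)

print(res.status_code)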

python requests does not POST after redirect

For some reason, python requests does not re-POST after encountering a redirect header:
import requests

proxies = {'http': 'http://127.0.0.1:8888'}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
    'content-type': 'application/x-www-form-urlencoded'
}

r = requests.post(url, data, headers=headers, timeout=timeout, proxies=proxies, allow_redirects=True)
html = r.text
So it means I can't log in to any form that sits behind a redirect. How can I solve this issue? Thank you!
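Note that this is documented behaviour rather than a bug: following the browser convention, requests turns a POST into a GET when it follows a 301 or 302 redirect; only 307 and 308 preserve the method and body. One way around it is to disable automatic redirects and re-POST to the Location target yourself, sketched here with placeholder url and form fields:

import requests
from urllib.parse import urljoin

url = 'https://example.com/login'          # placeholder target
data = {'user': 'name', 'pass': 'secret'}  # placeholder form fields

with requests.Session() as s:
    # Handle redirects manually so the POST body is not downgraded to a GET.
    r = s.post(url, data=data, allow_redirects=False)
    while r.is_redirect:
        # Location may be relative, so resolve it against the current URL.
        next_url = urljoin(r.url, r.headers['Location'])
        r = s.post(next_url, data=data, allow_redirects=False)

print(r.status_code)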
