I am trying to scrape https://www.vitals.com/locations/primary-care-doctors/ny. I have been able to scrape other sites by editing my headers, but I keep getting a 403 error with this one.
from bs4 import BeautifulSoup
import requests

# Fetch a list of sites through one shared Session so cookies and
# connection pooling carry across requests.
with requests.Session() as se:
    # Merge browser-like headers into the session's defaults rather than
    # replacing them outright: plain assignment (se.headers = {...}) discards
    # requests' own default headers, which some servers reject with 403.
    se.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Language": "en-US,en;q=0.9",
    })

    test_sites = [
        'http://fashiontoast.com/',
        'https://www.vitals.com/locations/primary-care-doctors/ny',
        'http://www.seaofshoes.com/',
    ]

    for site in test_sites:
        print(site)
        # Get the page source; printing the response shows its status code.
        response = se.get(site)
        print(response)
        # print(response.text)
Try indenting the code so that it sits inside the `with` statement, as follows:
from bs4 import BeautifulSoup
import requests

# Same scraper, with the request loop indented inside the `with` block so
# every request actually goes through the configured Session.
with requests.Session() as se:
    # Use update() instead of assignment so requests' default session
    # headers are preserved alongside the browser-like ones.
    se.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Language": "en-US,en;q=0.9",
    })

    test_sites = [
        'http://fashiontoast.com/',
        'https://www.vitals.com/locations/primary-care-doctors/ny',
        'http://www.seaofshoes.com/',
    ]

    for site in test_sites:
        print(site)
        # Get the page source; printing the response shows its status code.
        response = se.get(site)
        print(response)
        # print(response.text)
Related
The goal is to get access to the subscription-based content of the website.
I am trying to post my user/pass to https://www.ukwhoswho.com/ and get access to the (un)locked content.
Please note that the log-in box that I should use is the one within the website (to the left) NOT the "Personal Profile" on top right of the screen.
I have used the code below but I cannot log-in, can anyone please tell me what I am doing wrong?
import requests
# NOTE(review): `username` and `password` must be defined before this runs;
# as written these names are undefined and the script raises NameError.
params = {'user':username, 'pass':password}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'} # This is chrome, you can set whatever browser you like
# POST the credentials as URL-encoded form data to the login endpoint.
r = requests.post('https://www.ukwhoswho.com/LOGIN', data=params, headers=headers)
# A bare expression only echoes the response in an interactive shell;
# in a script it has no visible effect (use print(r) / print(r.status_code)).
r
I have also tried it using the one below but I still get the same error.
import requests

post_url = 'https://www.ukwhoswho.com/LOGIN'

# Use a session so cookies set by the first GET (e.g. a session id) are
# sent back with the login POST.
client = requests.session()
r = client.get('https://www.ukwhoswho.com/')

# Browser-like headers captured from dev tools.
# NOTE: the original included a hard-coded "Content-Length": "39".
# requests computes Content-Length from the actual body; a stale
# hand-written value can corrupt the request, so it is omitted here.
header_info = {
    "Host": "www.ukwhoswho.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Content-Type": "application/x-www-form-urlencoded",
    "Origin": "https://www.ukwhoswho.com",
    "DNT": "1",
    "Connection": "keep-alive",
    "Referer": "https://www.ukwhoswho.com/"
}

# NOTE(review): `username` and `password` must be defined elsewhere before
# this line runs.
payload = {'user':username, 'pass':password}
r = client.post(post_url, data=payload, headers=header_info)
print(r.text)
print(r.status_code)
Any help is much appreciated.
I'm trying to send a POST request using VBA and am struggling to convert my Python code to VBA.
In python I can do this
import json

import requests

# Headers captured from the browser's network tab.
headers = {
    "authority": "platform.example.com",
    "accept": "application/json, text/plain, */*",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36",
    "content-type": "application/json",
    "sec-fetch-site": "same-site",
    "sec-fetch-mode": "cors",
    "sec-fetch-dest": "empty",
    "accept-language": "en-US,en;q=0.9",
}

# The original line wrapped raw double quotes inside a double-quoted
# string, which is a SyntaxError. Building the payload as a dict and
# serializing it guarantees valid JSON in the request body.
payload = {"messages": [{"to": "12345", "channel": "sms", "content": "Test message"}]}
r = requests.post("https://example.com/v1/message", headers=headers, data=json.dumps(payload))
I have tried this in VBA but with no luck; the MsgBox comes up blank.
Sub test()
    ' POST a JSON SMS payload, mirroring the working Python requests call.
    Set objHTTP = CreateObject("MSXML2.ServerXMLHTTP")
    Url = "https://example.com/v1/message"
    objHTTP.Open "POST", Url, False
    ' The original line ended with a stray trailing comma - a VBA syntax error.
    objHTTP.setRequestHeader "authority", "platform.example.com"
    objHTTP.setRequestHeader "accept", "application/json, text/plain, */*"
    objHTTP.setRequestHeader "user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
    objHTTP.setRequestHeader "content-type", "application/json"
    objHTTP.setRequestHeader "sec-fetch-site", "same-site"
    objHTTP.setRequestHeader "sec-fetch-mode", "cors"
    objHTTP.setRequestHeader "sec-fetch-dest", "empty"
    objHTTP.setRequestHeader "accept-language", "en-US,en;q=0.9"
    ' Doubled quotes inside a VBA string literal emit a literal quote, so
    ' this sends the same JSON body the Python code sends. The original
    ' Data line was not valid JSON, which is why the response was blank.
    Data = "{""messages"":[{""to"":""1234"",""channel"":""sms"",""content"":""Test message""}]}"
    objHTTP.send Data
    MsgBox objHTTP.ResponseText
End Sub
How can I get the vba working?
Try with double-quotes to replicate the Data that works:
Data = "{""messages"":[{""to"":""1234"",""channel"":""sms"",""content"":""Test message""}]}"
I want to scrape, using Beautiful Soup and Python requests, a website that requires a login first. I'm able to log in by sending my username and password via a POST request; however, making a GET request within the same session after login yields error 403 (FORBIDDEN). Is there a solution to this? The last line in my code is producing a 'forbidden' message — is there a workaround?
import requests
from bs4 import BeautifulSoup

# Headers identifying us as a desktop Chrome browser.
browser_headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
}

# Login form fields.
login_payload = {
    'login' : '#my_username' , 'password': '#my_password', 'remember_me': 'false', 'fallback': 'false'
}

with requests.Session() as s:
    login_url = 'https://www.hackerrank.com/auth/login'
    # Load the login page first so the session collects its cookies.
    login_page = s.get(login_url, headers=browser_headers)
    parsed_page = BeautifulSoup(login_page.content, 'html5lib')
    # Submit the credentials as form data.
    login_response = s.post(login_url, data=login_payload, headers=browser_headers)
    print(login_response.content)
    # Request a page that is only reachable after logging in.
    s.get('Webpage_that_can_be_accessed_only_after_login', headers=browser_headers)
I did almost the same thing; the only difference was that I passed the exact headers I saw being passed in Chrome, and passed the csrf_token.
import requests
import json
import sys
from bs4 import BeautifulSoup

# Header string captured from Chrome's dev tools.
# NOTE(review): many values below contain "':" where "=" (or ":") was
# presumably intended (e.g. "q':0.9", "token':..."); the capture looks
# mangled -- verify against a fresh copy from the browser before relying
# on these values.
headerString='''
{
"accept": "text/html,application/xhtml+xml,application/xml;q':0.9,image/avif,image/webp,image/apng,*/*;q':0.8,application/signed-exchange;v':b3;q':0.9',text/html,application/xhtml+xml,application/xml;q':0.9,image/avif,image/webp,image/apng,*/*;q':0.8,application/signed-exchange;v':b3;q':0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "en-US,en;q':0.9",
"cache-control": "max-age=0",
"cookie": "hackerrank_mixpanel_token':7283187c-1f24-4134-a377-af6c994db2a0; hrc_l_i':F; _hrank_session':653fb605c88c81624c6d8f577c9094e4f8657136ca3487f07a3068c25080706db7178cc4deda978006ce9d0937c138b52271e3cd199fda638e8a0b8650e24bb7; _ga':GA1.2.397113208.1599678708; _gid':GA1.2.933726361.1599678708; user_type':hacker; session_id':h3xb3ljp-1599678763378; __utma':74197771.397113208.1599678708.1599678764.1599678764.1; __utmc':74197771; __utmz':74197771.1599678764.1.1.utmcsr':(direct)|utmccn':(direct)|utmcmd':(none); __utmt':1; __utmb':74197771.3.10.1599678764; _biz_uid':5969ac22487d4b0ff8d000621de4a30c; _biz_sid:79bd07; _biz_nA':1; _biz_pendingA':%5B%5D; _biz_flagsA':%7B%22Version%22%3A1%2C%22ViewThrough%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _gat_UA-45092266-28':1; _gat_UA-45092266-26':1; session_referrer':https%3A%2F%2Fwww.google.com%2F; session_referring_domain':www.google.com; session_landing_url':https%3A%2F%2Fwww.hackerrank.com%2Fprefetch_data%3Fcontest_slug%3Dmaster%26get_feature_feedback_list%3Dtrue",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}
'''
d = json.loads(headerString)

# Create a session so cookies persist across the requests below.
s = requests.Session()

url = 'https://www.hackerrank.com/auth/login'
r = s.get(url, headers=d)

# Extract the CSRF token embedded in the login page markup.
soup = BeautifulSoup(r.text, 'html.parser')
csrf_token = soup.find('meta', id='csrf-token')['content']

# Use the token in the login POST call.
request_header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
    "x-csrf-token": csrf_token
}
payload = {"login": "<user-name>", "password": "<password>", "remember_me": False, "fallback": True}
r = s.post(url, headers=request_header, data=payload)

# Then test whether login succeeded by loading the dashboard with the
# refreshed token returned in the login response.
d = json.loads(r.text)
csrf_token = d['csrf_token']
url = 'https://www.hackerrank.com/dashboard'
request_header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
    "x-csrf-token": csrf_token
}
# The original ended this line with stray backticks (a SyntaxError) and
# passed data=payload to a GET; a GET should not carry the login form body.
r = s.get(url, headers=request_header)
print(r.text)
I am trying to get the HTTP endpoint data, which is in JSON format, using the code below, but the output I receive is not what is visible at the link posted below. Can you share your inputs on how to do this?
Code :-
import requests

# Fetch the endpoint and print the decoded JSON body it returns.
response = requests.get('http://headers.jsontest.com/')
print( response.json() )
Output what I am receiving :-
{u'Host': u'headers.jsontest.com', u'User-Agent': u'python-requests/2.18.4', u'Accept': u'*/*', u'X-Cloud-Trace-Context': u'9034981eba16654b42daa1d10f503cab/2431501768736408823'}
Output I would like to see (please open the link):
http://headers.jsontest.com/
{
"X-Cloud-Trace-Context": "37e49d37206663367d675c2ff01db8a2/15699104367477007142",
"Upgrade-Insecure-Requests": "1",
"Accept-Language": "en-US,en;q=0.9",
"X-IMForwards": "20",
"Host": "headers.jsontest.com",
"Referer": "http://www.jsontest.com/",
"Via": "1.1 ironport-s680.air-worldwide.com:80 (Cisco-WSA/9.0.1-162)",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
}
In fact you do get the correct HTTP response. It just depends on how you send it (through browser or an empty request from Python)
When you open it through your browser, additional headers are added by the browser itself. When you send your request through requests in Python, those headers are not sent.
The HTTP response just mirrors the headers we sent to it. So if you want to see the same response, you'll have to send the headers manually using requests.
You can add them like that:
import requests

# Headers copied from a browser request; headers.jsontest.com simply
# mirrors back whatever request headers it receives.
mirror_headers = {
    "X-Cloud-Trace-Context": "37e49d37206663367d675c2ff01db8a2/15699104367477007142",
    "Upgrade-Insecure-Requests": "1",
    "Accept-Language": "en-US,en;q=0.9",
    "X-IMForwards": "20",
    "Host": "headers.jsontest.com",
    "Referer": "http://www.jsontest.com/",
    "Via": "1.1 ironport-s680.air-worldwide.com:80 (Cisco-WSA/9.0.1-162)",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
}

# Sending the same headers reproduces the browser's view of the response.
response = requests.get('http://headers.jsontest.com/', headers=mirror_headers)
print( response.json() )
This returns:
{u'Via': u'1.1 ironport-s680.air-worldwide.com:80 (Cisco-WSA/9.0.1-162)', u'Acce
pt-Language': u'en-US,en;q=0.9', u'X-IMForwards': u'20', u'Accept': u'text/html,
application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', u'
Upgrade-Insecure-Requests': u'1', u'Host': u'headers.jsontest.com', u'Referer':
u'http://www.jsontest.com/', u'X-Cloud-Trace-Context': u'37e49d37206663367d675c2
ff01db8a2/14205897462093082399', u'User-Agent': u'Mozilla/5.0 (Windows NT 6.1; W
in64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/53
7.36'}
I am trying to access this URL by using requests.get() but it gives 405. No such issue when accessing via browser.
I have added the proper headers required, try this
import requests

# Browser-like headers so the server treats this as a normal page view.
browser_headers = {
    "Host": "www.propertyshark.com",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
}

target_url = "https://www.propertyshark.com/mason/Property/53047525/1302-Richland-Ave-Santa-Ana-CA-92703/"
# Fetch the listing page and report the HTTP status code.
response = requests.get(target_url, headers=browser_headers)
print(response.status_code)
Just a guess: it's probably because you are trying to access the page directly.
Can you try adding a referral url
Where my_referer is the url of the website home page
requests.get(url, headers={'referer': my_referer})