I am trying to access this URL using requests.get(), but it returns a 405 error. There is no such issue when accessing it via a browser.
I have added the required headers; try this:
import requests
headers = {
"Host": "www.propertyshark.com",
"Connection": "keep-alive",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
"Upgrade-Insecure-Requests": "1",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9",
}
url = "https://www.propertyshark.com/mason/Property/53047525/1302-Richland-Ave-Santa-Ana-CA-92703/"
req = requests.get(url, headers=headers)
print(req.status_code)
Just a guess: it's probably because you are trying to access the page directly.
Can you try adding a referrer URL?
Here my_referer is the URL of the website's home page:
requests.get(url, headers={'referer': my_referer})
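For example, a minimal sketch that reuses the home page as the referer (my_referer is only an assumption here; set it to whatever page would normally link to the target):
import requests

my_referer = "https://www.propertyshark.com/"  # assumed home page acting as the referring page
url = "https://www.propertyshark.com/mason/Property/53047525/1302-Richland-Ave-Santa-Ana-CA-92703/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    "Referer": my_referer,  # tell the server which page we supposedly came from
}
r = requests.get(url, headers=headers)
print(r.status_code)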
The goal is to get access to the subscription-based content of the website.
I am trying to post my user/pass to https://www.ukwhoswho.com/ and get access to the (un)locked content.
Please note that the log-in box that I should use is the one within the website (to the left) NOT the "Personal Profile" on top right of the screen.
I have used the code below, but I cannot log in. Can anyone please tell me what I am doing wrong?
import requests
params = {'user':username, 'pass':password}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'} # This is chrome, you can set whatever browser you like
r = requests.post('https://www.ukwhoswho.com/LOGIN', data=params, headers=headers)
r
I have also tried the code below, but I still get the same error.
import requests
post_url = 'https://www.ukwhoswho.com/LOGIN'
client = requests.session()
r = client.get('https://www.ukwhoswho.com/')
header_info = {
"Host": "www.ukwhoswho.com",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
"Content-Type": "application/x-www-form-urlencoded",
"Content-Length": "39",
"Origin": "https://www.ukwhoswho.com",
"DNT": "1",
"Connection": "keep-alive",
"Referer": "https://www.ukwhoswho.com/"
}
payload = {'user':username, 'pass':password}
r = client.post(post_url, data=payload, headers = header_info)
print(r.text)
print(r.status_code)
Any help is much appreciated.
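Not an authoritative fix, but one thing worth trying is to read the login form straight off the page instead of guessing the endpoint and field names, then post exactly what the browser would post. A minimal sketch under those assumptions (the form selector and the 'user'/'pass' field names are guesses; inspect the left-hand login box in your browser's dev tools to find the real ones):
import requests
from bs4 import BeautifulSoup

username, password = "my_user", "my_pass"  # placeholders

with requests.Session() as s:
    s.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0"
    # Load the home page first so the session picks up whatever cookies it needs.
    home = s.get("https://www.ukwhoswho.com/")
    soup = BeautifulSoup(home.text, "html.parser")

    # Assumption: the left-hand login box is a <form>; adjust the selector if the page has several forms.
    form = soup.find("form")
    post_url = requests.compat.urljoin(home.url, form.get("action"))  # the endpoint the browser really posts to

    # Start from the form's own hidden fields (CSRF tokens etc.), then add the credentials.
    payload = {i.get("name"): i.get("value", "") for i in form.find_all("input") if i.get("name")}
    payload.update({"user": username, "pass": password})  # assumed field names -- check the form's input names
    r = s.post(post_url, data=payload)
    print(r.status_code)

If the status is still not what you expect, print r.url and r.text to see whether the site simply redirected you back to the login page.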
I go to this webpage
https://iso19139echnap.geocat.live/geonetwork/doc/api/index.html#/records/getRecord
and try this API call under Records/get/get a metadata record.
It worked.
However, if I try to make the same call in Python, it responds with a 403:
import requests
url_metadata = "https://iso19139echnap.geocat.live/geonetwork/srv/api/0.1/records/d1ec996c-d21c-4bc4-9888-6f1722b44a57"
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9",
"Connection": "keep-alive",
"Cookie": "XSRF-TOKEN=97bb29dd-9165-4fd4-bbd1-e2c72bffa509; JSESSIONID=78C1024AF960D630A4EA49DA02DFC89A; serverTime=1615580729954; sessionExpiry=1615582829954",
"Host": "iso19139echnap.geocat.live",
"Referer": "https://iso19139echnap.geocat.live/geonetwork/doc/api/index.html",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Safari/537.36 Edg/89.0.774.45",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
}
payload = {}
r_metadata = requests.request("GET", url_metadata, headers=headers, data=payload)
print("single metadata api status: "+ str(r_metadata))
It's an authentication problem: you need to include an X-XSRF-TOKEN header. Please refer to this answer on how to send a request to GeoNetwork from an API client.
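A minimal sketch of that pattern, assuming the record is public and the first call exists only to make GeoNetwork issue the XSRF-TOKEN cookie (attach HTTP basic auth to the session if the catalogue requires a login):
import requests

url_metadata = "https://iso19139echnap.geocat.live/geonetwork/srv/api/0.1/records/d1ec996c-d21c-4bc4-9888-6f1722b44a57"

with requests.Session() as s:
    # First request: its only purpose is to receive the XSRF-TOKEN cookie (this response itself may be a 403).
    s.post("https://iso19139echnap.geocat.live/geonetwork/srv/eng/info?type=me")
    token = s.cookies.get("XSRF-TOKEN")

    # Echo the cookie value back as the X-XSRF-TOKEN header on the real call.
    r = s.get(url_metadata, headers={"Accept": "application/xml", "X-XSRF-TOKEN": token})
    print("single metadata api status:", r.status_code)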
I want to scrape a website that requires a login first, using Beautiful Soup and Python requests. I am able to log in by sending my username and password via a POST request; however, a GET request within the same session after logging in yields a 403 (Forbidden) error. The last line in my code is what produces the 'forbidden' message. Is there a workaround?
import requests
from bs4 import BeautifulSoup
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'
}
payload = {
    'login': '#my_username', 'password': '#my_password', 'remember_me': 'false', 'fallback': 'false'
}
with requests.Session() as s:
    url = 'https://www.hackerrank.com/auth/login'
    r = s.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'html5lib')
    r = s.post(url, data=payload, headers=headers)
    print(r.content)
    s.get('Webpage_that_can_be_accessed_only_after_login', headers=headers)
I did almost the same thing; the only difference was that I passed the exact headers I saw being sent in Chrome, along with the csrf_token.
import requests
import json
import sys
from bs4 import BeautifulSoup
#header string picked from chrome
headerString='''
{
"accept": "text/html,application/xhtml+xml,application/xml;q':0.9,image/avif,image/webp,image/apng,*/*;q':0.8,application/signed-exchange;v':b3;q':0.9',text/html,application/xhtml+xml,application/xml;q':0.9,image/avif,image/webp,image/apng,*/*;q':0.8,application/signed-exchange;v':b3;q':0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "en-US,en;q':0.9",
"cache-control": "max-age=0",
"cookie": "hackerrank_mixpanel_token':7283187c-1f24-4134-a377-af6c994db2a0; hrc_l_i':F; _hrank_session':653fb605c88c81624c6d8f577c9094e4f8657136ca3487f07a3068c25080706db7178cc4deda978006ce9d0937c138b52271e3cd199fda638e8a0b8650e24bb7; _ga':GA1.2.397113208.1599678708; _gid':GA1.2.933726361.1599678708; user_type':hacker; session_id':h3xb3ljp-1599678763378; __utma':74197771.397113208.1599678708.1599678764.1599678764.1; __utmc':74197771; __utmz':74197771.1599678764.1.1.utmcsr':(direct)|utmccn':(direct)|utmcmd':(none); __utmt':1; __utmb':74197771.3.10.1599678764; _biz_uid':5969ac22487d4b0ff8d000621de4a30c; _biz_sid:79bd07; _biz_nA':1; _biz_pendingA':%5B%5D; _biz_flagsA':%7B%22Version%22%3A1%2C%22ViewThrough%22%3A%221%22%2C%22XDomain%22%3A%221%22%7D; _gat_UA-45092266-28':1; _gat_UA-45092266-26':1; session_referrer':https%3A%2F%2Fwww.google.com%2F; session_referring_domain':www.google.com; session_landing_url':https%3A%2F%2Fwww.hackerrank.com%2Fprefetch_data%3Fcontest_slug%3Dmaster%26get_feature_feedback_list%3Dtrue",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}
'''
d=json.loads(headerString)
#creating session
s = requests.Session()
url='https://www.hackerrank.com/auth/login'
r=s.get(url, headers=d)
#getting the csrf_token
soup = BeautifulSoup(r.text, 'html.parser')
csrf_token=soup.find('meta', id='csrf-token')['content']
#using it in login post call
request_header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
    "x-csrf-token": csrf_token
}
payload={"login":"<user-name>","password":"<password>","remember_me":False,"fallback":True}
r=s.post(url, headers=request_header, data=payload)
#then I tested if login is successful by going into dashboard page
d=json.loads(r.text)
csrf_token=d['csrf_token']
url='https://www.hackerrank.com/dashboard'
request_header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
    "x-csrf-token": csrf_token
}
r = s.get(url, headers=request_header)
print(r.text)
I am trying to scrape https://www.vitals.com/locations/primary-care-doctors/ny. I have been able to scrape other sites by editing my headers, but I keep getting a 403 error with this one.
from bs4 import BeautifulSoup
import requests

with requests.Session() as se:
    se.headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Language": "en-US,en;q=0.9",
    }

test_sites = [
    'http://fashiontoast.com/',
    'https://www.vitals.com/locations/primary-care-doctors/ny',
    'http://www.seaofshoes.com/',
]

for site in test_sites:
    print(site)
    # get page source
    response = se.get(site)
    print(response)
    # print(response.text)
Try adding the code to the with statement, as follows:
from bs4 import BeautifulSoup
import requests

with requests.Session() as se:
    se.headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Language": "en-US,en;q=0.9",
    }

    test_sites = [
        'http://fashiontoast.com/',
        'https://www.vitals.com/locations/primary-care-doctors/ny',
        'http://www.seaofshoes.com/',
    ]

    for site in test_sites:
        print(site)
        # get page source
        response = se.get(site)
        print(response)
        # print(response.text)
I'm trying to scrape an AJAX-loaded part of a webpage without executing the JavaScript. Using the Chrome dev tools, I found that the AJAX container pulls its content from a URL through a POST request, so I want to reproduce that request with the Python requests package. Strangely, using the header information taken from Chrome I always get a 400 error, and the same happens with the curl command copied from Chrome. I'm wondering whether someone could kindly share some insight.
The website I'm interested in is here. Using Chrome: Ctrl+Shift+I, Network, XHR; the part I want is 'content'. The script I'm using is:
headers = {"authority": "cafe.bithumb.com",
"path": "/boards/43/contents",
"method": "POST",
"origin":"https://cafe.bithumb.com",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
"accept-encoding":"gzip, deflate, br",
"content-type": "application/x-www-form-urlencoded; charset=UTF-8",
"accept":"application/json, text/javascript, */*; q=0.01",
"referer":"https://cafe.bithumb.com/view/boards/43",
"x-requested-with":"XMLHttpRequest",
"scheme": "https",
"content-length":"1107"}
s=requests.Session()
s.headers.update(headers)
r = s.post('https://cafe.bithumb.com/boards/43/contents')
You just need to compare the two sets of POST data; you will find they are almost the same except for a few parameters (draw=page... start=xx). That means you can scrape the Ajax data by modifying draw and start.
Edit: the data was transformed to a dictionary, so we do not need urlencode; we also don't need the cookie (I tested).
import requests
import json
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Origin": "https://cafe.bithumb.com",
"X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
"DNT": "1",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"Referer": "https://cafe.bithumb.com/view/boards/43",
"Accept-Encoding": "gzip, deflate, br"
}
string = """columns[0][data]=0&columns[0][name]=&columns[0][searchable]=true&columns[0][orderable]=false&columns[0][search][value]=&columns[0][search][regex]=false&columns[1][data]=1&columns[1][name]=&columns[1][searchable]=true&columns[1][orderable]=false&columns[1][search][value]=&columns[1][search][regex]=false&columns[2][data]=2&columns[2][name]=&columns[2][searchable]=true&columns[2][orderable]=false&columns[2][search][value]=&columns[2][search][regex]=false&columns[3][data]=3&columns[3][name]=&columns[3][searchable]=true&columns[3][orderable]=false&columns[3][search][value]=&columns[3][search][regex]=false&columns[4][data]=4&columns[4][name]=&columns[4][searchable]=true&columns[4][orderable]=false&columns[4][search][value]=&columns[4][search][regex]=false&start=30&length=30&search[value]=&search[regex]=false"""
article_root = "https://cafe.bithumb.com/view/board-contents/{}"
for page in range(1, 4):
    with requests.Session() as s:
        s.headers.update(headers)
        data = {"draw": page}
        data.update({ele[:ele.find("=")]: ele[ele.find("=") + 1:] for ele in string.split("&")})
        data["start"] = 30 * (page - 1)
        r = s.post('https://cafe.bithumb.com/boards/43/contents', data=data, verify=False)  # set verify=False while you are using Fiddler
        json_data = json.loads(r.text).get("data")  # parse the JSON response so the rows are easier to extract
        for each in json_data:
            url = article_root.format(each[0])
            print(url)