I'm using urlopen to extract data from a list of websites, but I keep running into problems with unsuccessful requests. Can anyone help me with it?
I saved the website as an HTML file:
from bs4 import BeautifulSoup

path = "/Users/runyao/Downloads/The Proposition 65 List | OEHHA.html"
soup = BeautifulSoup(open(path), "html.parser")
list = []
pattern = "(chemicals)+([a-z0-9-])"
# pull the chemical slug out of every <option> on the saved page
for counter in range(1, 852):
    temp = str(soup.find_all('option')[counter])
    temptext = temp.replace("\n", "/")
    temptext = temptext.replace('"', "/")
    temptext = temptext.replace(">", "")
    templist = temptext.split("/")
    list.append(templist[-4])
import urllib.parse
import urllib.request

url = 'https://oehha.ca.gov/chemicals/' + list[1] + '/'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
# form values copied from the urllib docs example; they are not required by this page
values = {'name': 'Michael Foord',
          'location': 'Northampton',
          'language': 'Python'}
headers = {'User-Agent': user_agent}
data = urllib.parse.urlencode(values)
data = data.encode('ascii')
req = urllib.request.Request(url, data, headers)
with urllib.request.urlopen(req) as response:
    the_page = response.read()
I also tried a plain GET with just a User-Agent header:
import urllib.parse
import urllib.request

url = "https://oehha.ca.gov/chemicals/" + list[1]
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
req = urllib.request.Request(url, headers=headers)
resp = urllib.request.urlopen(req)
respData = BeautifulSoup(resp, "html.parser")
print(respData)
But this only returns the Incapsula bot-check page instead of the actual chemical page:
<html>
<head>
<meta content="noindex,nofollow" name="robots"/>
<script src="/_Incapsula_Resource?SWJIYLWA=5074a744e2e3d891814e9a2dace20bd4,719d34d31c8e3a6e6fffd425f7e032f3">
</script>
<body>
</body></head></html>
Try adding more headers, like this; it works for me:
import requests
headers = {
    'authority': 'oehha.ca.gov',
    'cache-control': 'max-age=0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'sec-fetch-user': '?1',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'navigate',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'es-ES,es;q=0.9',
    'cookie': 'visid_incap_1712983=Dfo+CVLdTCSzMHcIXEgPSb1y7l0AAAAAQUIPAAAAAAAetg/EHDkFJUigbIo4eaK4; incap_ses_532_1712983=dFrgDpkdqkYCF1u+mQxiB71y7l0AAAAAksMc42V5CJx6OdUZdeHflA==; has_js=1; _ga=GA1.2.626207180.1575908117; _gid=GA1.2.512657448.1575908117; __utma=158387685.626207180.1575908117.1575908117.1575908117.1; __utmc=158387685; __utmz=158387685.1575908117.1.1.utmcsr=(direct)^|utmccn=(direct)^|utmcmd=(none); __utmt=1; __utmt_b=1; _aeaid=5e12b9d6-0171-4fde-8ccf-1bba809a1bb2; aeatstartmessage=true; __utmb=158387685.4.10.1575908117',
    'if-none-match': '^\\^1575853039-1^\\^',
}
response = requests.get('https://oehha.ca.gov/chemicals/abiraterone-acetate', headers=headers)
print(response.content)
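If you need all of the chemicals rather than a single page, the same headers can be reused in a loop over the slugs collected in the question. A minimal sketch, where slugs stands for the list built from the saved HTML (hypothetical name) and the delay is just a guess at a polite rate:

import time
import requests
from bs4 import BeautifulSoup

# slugs is hypothetical here: the chemical slugs collected from the <option>
# tags in the question, e.g. ['abiraterone-acetate', ...]
for slug in slugs[:5]:  # keep the range small while testing
    url = 'https://oehha.ca.gov/chemicals/' + slug
    resp = requests.get(url, headers=headers)  # the headers dict from the answer above
    if resp.status_code != 200:
        print(url, 'failed with status', resp.status_code)
        continue
    soup = BeautifulSoup(resp.content, 'html.parser')
    print(url, soup.title.string if soup.title else 'no <title>')
    time.sleep(2)  # slow down; hammering the site makes an IP block more likely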
UPDATE: I have updated the code to use urllib. If you still have problems, it is probably because they have blocked the IP you are making the requests from.
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request
url = "https://oehha.ca.gov/chemicals/abiraterone-acetate"
# headers = {}
# headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'cookie': 'visid_incap_1712983=Dfo+CVLdTCSzMHcIXEgPSb1y7l0AAAAAQUIPAAAAAAAetg/EHDkFJUigbIo4eaK4; incap_ses_532_1712983=dFrgDpkdqkYCF1u+mQxiB71y7l0AAAAAksMc42V5CJx6OdUZdeHflA==; has_js=1; _ga=GA1.2.626207180.1575908117; _gid=GA1.2.512657448.1575908117; __utma=158387685.626207180.1575908117.1575908117.1575908117.1; __utmc=158387685; __utmz=158387685.1575908117.1.1.utmcsr=(direct)^|utmccn=(direct)^|utmcmd=(none); __utmt=1; __utmt_b=1; _aeaid=5e12b9d6-0171-4fde-8ccf-1bba809a1bb2; aeatstartmessage=true; __utmb=158387685.4.10.1575908117',
}
req = urllib.request.Request(url, data=None, headers=headers)
resp = urllib.request.urlopen(req)
respData = BeautifulSoup(resp, "html.parser", from_encoding="iso-8859-1")
print(respData)
Result:
I am trying to retrieve the HTML response of a website that redirects with a 302. It seems that Python 3 is not following the temporary redirect and returns the original page instead.
Is there a way to configure the request so that it follows the 302?
import random
import requests
from pprint import pprint
from bs4 import BeautifulSoup
url = 'https://www.zurrose.de/catalogsearch/result?q=11343654'
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'Referer': 'https://www.zurrose.de',
    'Connection': 'keep-alive'
}
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
]
headers['User-Agent'] = random.choice(user_agents)
r = requests.post(url, headers=headers)
print('start')
print(r.history[0].status_code)
if r.history:
    print("Request was redirected")
    print(r.history)
    for resp in r.history:
        print(resp.status_code, resp.url)
    print("Final destination:")
    print(r.status_code, r.url)
if not r.history:
    print(f'No redirect on {url}. Status {r.status_code}. PP URL not found for {sku}')
elif r.history[0].status_code < 300:
    print(f'No PP URL retrieved for {sku} on {url}. Status {r.history[0].status_code}')

soup = BeautifulSoup(r.content, 'html.parser')
for i in soup.select('link[rel*=canonical]'):
    # print(i['href'])
    url_pp = i['href']
    print(url_pp)
pprint(r.content)
The request as you are making it will not redirect you to the correct page. The issue is that the redirect relies on cookies, so you need to send the request within a session. I'll show two ways to get the page you need:
import requests
from bs4 import BeautifulSoup
def show_product_ajax(sku):
    url = f"https://www.zurrose.de/search/ajax/suggest/?q={sku}"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
    }
    for product in requests.request("GET", url, headers=headers).json():
        print(product['title'], product['url'], BeautifulSoup(product['price'], features='lxml').find('span', class_='price').getText())


def show_product_session(sku):
    url = f'https://www.zurrose.de/catalogsearch/result?q={sku}'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
    }
    s = requests.Session()
    r = s.get(url, headers=headers)
    soup = BeautifulSoup(r.text, features='lxml')
    product_url = soup.find('meta', attrs={'itemprop': 'url'}).get('content')
    product_title = soup.find('span', attrs={'itemprop': 'name', 'data-ui-id': 'page-title-wrapper'}).getText()
    product_price = soup.find('span', class_='price').getText()
    print(product_title, product_url, product_price)


show_product_ajax(11343654)
show_product_session(11343654)
Result:
SYNGUT Synbiotikum m.Probiotika u.Prebiot.Beutel https://www.zurrose.de/produkte/syngut-synbiotikum-m-probiotika-u-prebiot-beutel-344764 23,50 €
SYNGUT Synbiotikum m.Probiotika u.Prebiot.Beutel-15 St https://www.zurrose.de/produkte/syngut-synbiotikum-m-probiotika-u-prebiot-beutel-344764?variation=344764_6502482 23,50 €
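Tying this back to the redirect question: once the search request goes through a Session, you can inspect r.history to confirm whether the 302 was followed. A small sketch, reusing the user-agent header from above:

import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
}
s = requests.Session()
r = s.get('https://www.zurrose.de/catalogsearch/result?q=11343654', headers=headers)
for resp in r.history:
    print(resp.status_code, resp.url)  # intermediate redirect(s), if any
print(r.status_code, r.url)            # final destination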
I'm trying to scrape all the follower names from a profile page using the requests module. The problem is that when I run the script below, I get the first 20 names over and over again.
The payload used in the POST request has only two keys, size:20 and continuation:timestamp. I tried to use the parameters the same way the site does, but I still get the same results repeatedly.
import time
import requests
link = 'https://api-mainnet.rarible.com/marketplace/api/v4/followers'
params = {'user': '0xe744d23107c9c98df5311ff8c1c8637ec3ecf9f3'}
payload = {"size": 20}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'origin': 'https://rarible.com',
    'referer': 'https://rarible.com/'
}

with requests.Session() as s:
    s.headers.update(headers)
    while True:
        res = s.post(link, params=params, json=payload)
        print(s.headers)
        for item in res.json():
            print(item['owner'].get('name', ''))
        payload['continuation'] = f"{int(time.time() * 1000)}"
        time.sleep(2)
How can I parse all the follower names from that page using requests?
Your next continuation value is in the X-CONTINUATION response header, so this will work even when increasing the size in the payload doesn't:
import requests
link = 'https://api-mainnet.rarible.com/marketplace/api/v4/followers'
params = {'user': '0xe744d23107c9c98df5311ff8c1c8637ec3ecf9f3'}
payload = {"size": 20}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'origin': 'https://rarible.com',
    'referer': 'https://rarible.com/'
}

res = requests.post(link, headers=headers, params=params, json=payload)
print(res.headers["X-CONTINUATION"])

while True:
    for item in res.json():
        print(item['owner'].get('name', ''))
    if not res.headers["X-CONTINUATION"]:
        break
    payload['continuation'] = res.headers["X-CONTINUATION"]
    res = requests.post(link, headers=headers, params=params, json=payload)
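Wrapped into a function that collects the names instead of printing them, the same pagination loop might look like this (a sketch built on the answer's code; fetch_all_followers is just a name made up for illustration):

import requests

def fetch_all_followers(link, params, headers, size=20):
    """Follow the X-CONTINUATION header until it comes back empty."""
    payload = {"size": size}
    names = []
    while True:
        res = requests.post(link, headers=headers, params=params, json=payload)
        names.extend(item['owner'].get('name', '') for item in res.json())
        continuation = res.headers.get("X-CONTINUATION", "")
        if not continuation:
            break
        payload['continuation'] = continuation
    return names

followers = fetch_all_followers(link, params, headers)
print(len(followers))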
Some APIs may block you from extracting more values than a certain limit, and may also return the results in pages with limits.
For me, just increasing the size in the payload worked with your code.
import time
import requests
link = 'https://api-mainnet.rarible.com/marketplace/api/v4/followers'
params = {'user': '0xe744d23107c9c98df5311ff8c1c8637ec3ecf9f3'}
payload = {"size": 10000}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'origin': 'https://rarible.com',
    'referer': 'https://rarible.com/'
}

with requests.Session() as s:
    s.headers.update(headers)
    res = s.post(link, params=params, json=payload)
    print(len(res.json()))
    for item in res.json():
        print(item['owner'].get('name', ''))
I would like to get the JSON data from, for instance, https://app.weathercloud.net/d0838117883#current using the Python requests module.
I tried:
import re
import requests

device = '0838117883'
URL = 'https://app.weathercloud.net'
URL1 = URL + '/d' + device
URL2 = URL + '/device/stats'

headers = {'Content-Type': 'text/plain; charset=UTF-8',
           'Referer': URL1,
           'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/48.0.2564.82 Chrome/48.0.2564.82 Safari/537.36',
           'Accept': 'application/json, text/javascript,*/*'}

with requests.Session() as s:
    # get html from URL1 in order to get the CSRF token
    page = s.get(URL1)
    CSRF = re.findall('WEATHERCLOUD_CSRF_TOKEN:"(.*)"},', page.text)[0]
    # create parameters for URL2, in order to get the json file
    params = {'code': device, 'WEATHERCLOUD_CSRF_TOKEN': CSRF}
    page_stats = requests.get(URL2, params=params, headers=headers)
    print(page_stats.url)
    print(page_stats)         # <Response [200]>
    print(page_stats.text)    # empty
    print(page_stats.json())  # error
But page_stats is empty.
How can I get the stats data from weathercloud?
Inspecting the page with DevTools, you'll find a useful endpoint:
https://app.weathercloud.net/device/stats
You can "replicate" the original web request made by your browser with the requests library:
import json
import requests

cookies = {
    'PHPSESSID': '************************',
    'WEATHERCLOUD_CSRF_TOKEN': '***********************',
    '_ga': '**********',
    '_gid': '**********',
    '__gads': 'ID=**********',
    'WeathercloudCookieAgreed': 'true',
    '_gat': '1',
    'WEATHERCLOUD_RECENT_ED3C8': '*****************',
}
headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '^\\^Google',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua-mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'sec-ch-ua-platform': '^\\^Windows^\\^',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://app.weathercloud.net/d0838117883',
    'Accept-Language': 'it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7,es;q=0.6',
}
params = (
    ('code', '0838117883'),
    ('WEATHERCLOUD_CSRF_TOKEN', '****************'),
)

response = requests.get('https://app.weathercloud.net/device/stats', headers=headers, params=params, cookies=cookies)

# Deserialize the JSON response
json_object = json.loads(response.text)
JSON output:
{'last_update': 1632842172,
'bar_current': [1632842172, 1006.2],
'bar_day_max': [1632794772, 1013.4],
'bar_day_min': [1632845772, 1006.2],
'bar_month_max': [1632220572, 1028],
'bar_month_min': [1632715572, 997.3],
'bar_year_max': [1614418512, 1038.1],
'bar_year_min': [1615434432, 988.1],
'wdir_current': [1632842172, 180],
..............}
That's it.
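Alternatively, rather than copying the cookies by hand from DevTools, you can let a requests.Session collect them. This is essentially the question's code with the stats call made through the same session (a sketch worth trying, not a verified fix):

import re
import requests

device = '0838117883'
base = 'https://app.weathercloud.net'

with requests.Session() as s:
    # the first request sets PHPSESSID (and friends) on the session, and the
    # page source contains the CSRF token, extracted as in the question
    page = s.get(base + '/d' + device)
    csrf = re.findall('WEATHERCLOUD_CSRF_TOKEN:"(.*)"},', page.text)[0]
    stats = s.get(base + '/device/stats',
                  params={'code': device, 'WEATHERCLOUD_CSRF_TOKEN': csrf},
                  headers={'X-Requested-With': 'XMLHttpRequest',
                           'Referer': base + '/d' + device})
    print(stats.json())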
I want to scrape data from the shipping page of our company's logistics portal, which is based on ASP.NET. I have watched a lot of tutorials on the Internet about the BeautifulSoup and Requests libraries, but it isn't working as expected for me.
The login url is:
https://portal-vesta.sequoialog.com.br/tms/LoginPortal.aspx
I first wrote the request as a bash script, and my login attempt worked, returning this message:
69|dataItem||<script type="text/javascript">window.location="about:blank"</script>|32|pageRedirect||/tms/HomePortal.aspx?Usu=1070rpr|
But in Python, it returns this message:
b'69|dataItem||<script type="text/javascript">window.location="about:blank"</script>|21|pageRedirect||/TMS/LoginPortal.aspx|'
My code is:
from requests import Session
from bs4 import BeautifulSoup

# urls, user and password come from elsewhere in the script (not shown)

def get_ev_vs(page):
    soup = BeautifulSoup(page.text, 'html.parser')
    global vs, ev
    vs = soup.select_one('#__VIEWSTATE')['value']
    ev = soup.select_one('#__EVENTVALIDATION')['value']

with Session() as s:
    page = s.get(urls[0])
    get_ev_vs(page)
    payload = "Ajax=UpdatePanel1%7CButton1&Button1=ENTRAR&__ASYNCPOST=true&__EVENTARGUMENT=&__EVENTTARGET=&__EVENTVALIDATION={}&__LASTFOCUS=&__VIEWSTATE={}&txtSenha={}&txtUsuario={}&txtUsuarioSolicita=".format(ev, vs, password, user)
    head = {
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'Cache-Control': 'no-cache',
        'X-MicrosoftAjax': 'Delta=true',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': '*/*'
    }
    response = s.post(urls[0], data=payload, headers=head)
    payload = {}
    headers = {
        'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
        'sec-ch-ua-mobile': '?0',
        'Upgrade-Insecure-Requests': '1',
        'DNT': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Cookie': 'ASP.NET_SessionId=' + s.cookies["ASP.NET_SessionId"]
    }
    open_page = s.get(urls[1], data=payload, headers=headers)
    print(open_page.text)
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
target = ['__VIEWSTATE', '__EVENTVALIDATION']
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0",
}


def main(url):
    with requests.Session() as req:
        req.headers.update(headers)
        r = req.get(url)
        soup = BeautifulSoup(r.text, 'lxml')
        values = [soup.select_one('#{}'.format(x))['value'] for x in target]
        data = {
            "Ajax": "UpdatePanel1|Button1",
            "__LASTFOCUS": "",
            "__EVENTTARGET": "",
            "__EVENTARGUMENT": "",
            "__VIEWSTATE": values[0],
            "__EVENTVALIDATION": values[1],
            "txtUsuario": "aaa",  # username should be here
            "txtSenha": "aaa",    # password too!
            "txtUsuarioSolicita": "",
            "__ASYNCPOST": "true",
            "Button1": "ENTRAR"
        }
        r = req.post(url, data=data)
        match = re.search(r'ScriptPath\|([^|]+db)\|', r.text).group(1)
        final = urljoin(url, match)
        r = req.get(final)
        print(r.text)


main('https://portal-vesta.sequoialog.com.br/tms/LoginPortal.aspx')
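If the login response looks like the pageRedirect fragment shown in the question, the target page can also be pulled straight out of that token instead of the ScriptPath entry. A hedged sketch that would slot in after the req.post call above (r, req and url are the variables from main):

import re
from urllib.parse import urljoin

# r.text is the async postback response, e.g.
# 69|dataItem||<script ...>|32|pageRedirect||/tms/HomePortal.aspx?Usu=1070rpr|
m = re.search(r'pageRedirect\|\|([^|]+)\|', r.text)
if m:
    home = urljoin(url, m.group(1))  # e.g. the HomePortal.aspx URL with the Usu parameter
    r = req.get(home)
    print(r.text)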
I am trying to scrape text from this website. It returns text like this:
डा. à¤à¥à¤·à¤¬à¤¹à¤¾à¤¦à¥à¤° थापालाठपà¥à¤¤à¥à¤°à¥à¤¶à¥à¤, à¤à¤®à¥à¤°à¤¿à¤à¤¾à¤®à¤¾ तà¥à¤à¤¶à¥à¤°à¥à¤à¥ निधन
instead of:
भारतीय विदेश सचिव गोखले आज नेपाल आउँदै.
Current Code:
import requests
from bs4 import BeautifulSoup

headers = {
    'Connection': 'close',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}

def get_url_soup(url):
    url_request = requests.get(url, headers=headers, allow_redirects=True)
    soup = BeautifulSoup(url_request.text, 'lxml')
    return soup

soup = get_url_soup('https://www.onlinekhabar.com/2019/03/753522')
title_card = soup.find('div', {'class': 'nws__title--card'})
Use bs4's EncodingDetector and feed BeautifulSoup the raw bytes (url_request.content) so the declared encoding is honoured:
import requests
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector

headers = {
    'Connection': 'close',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}

def get_url_soup(url):
    url_request = requests.get(url, headers=headers, allow_redirects=True)
    # prefer the encoding declared in the HTML, fall back to the HTTP Content-Type charset
    http_encoding = url_request.encoding if 'charset' in url_request.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(url_request.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(url_request.content, 'lxml', from_encoding=encoding)
    return soup

soup = get_url_soup('https://www.onlinekhabar.com/2019/03/753522')
title_card = soup.find('div', {'class': 'nws__title--card'})
print(title_card.text)
OUTPUT:
होमपेज /
समाचार /
राष्ट्रिय समाचार
भारतीय विदेश सचिव गोखले आज नेपाल आउँदै
प्रधानमन्त्रीलगायत शीर्ष नेतासँग भेट्ने
.
.
.
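A shorter variant that often fixes this kind of mojibake is to let requests re-guess the encoding from the response body before reading .text; a sketch, not tested against this particular page:

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}

r = requests.get('https://www.onlinekhabar.com/2019/03/753522', headers=headers)
r.encoding = r.apparent_encoding  # re-detect from the body instead of trusting the Content-Type header
soup = BeautifulSoup(r.text, 'lxml')
print(soup.find('div', {'class': 'nws__title--card'}).text)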