Encoding Error while scraping a website using Beautiful Soup - python

I am trying to scrape text from this website. It returns text like this:
डा. भà¥à¤·à¤¬à¤¹à¤¾à¤¦à¥à¤° थापालाठपà¥à¤¤à¥à¤°à¥à¤¶à¥à¤, à¤à¤®à¥à¤°à¤¿à¤à¤¾à¤®à¤¾ तà¥à¤à¤¶à¥à¤°à¥à¤à¥ निधन
instead of:
भारतीय विदेश सचिव गोखले आज नेपाल आउँदै.
Current Code:
import requests
from bs4 import BeautifulSoup

headers = {
    'Connection': 'close',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}

def get_url_soup(url):
    url_request = requests.get(url, headers=headers, allow_redirects=True)
    soup = BeautifulSoup(url_request.text, 'lxml')
    return soup

soup = get_url_soup('https://www.onlinekhabar.com/2019/03/753522')
title_card = soup.find('div', {'class': 'nws__title--card'})

Using EncodingDetector:
import requests
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector

headers = {
    'Connection': 'close',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}

def get_url_soup(url):
    url_request = requests.get(url, headers=headers, allow_redirects=True)
    http_encoding = url_request.encoding if 'charset' in url_request.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(url_request.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(url_request.content, 'lxml', from_encoding=encoding)
    return soup

soup = get_url_soup('https://www.onlinekhabar.com/2019/03/753522')
title_card = soup.find('div', {'class': 'nws__title--card'})
print(title_card.text)
OUTPUT:
होमपेज /
समाचार /
राष्ट्रिय समाचार
भारतीय विदेश सचिव गोखले आज नेपाल आउँदै
प्रधानमन्त्रीलगायत शीर्ष नेतासँग भेट्ने
.
.
.
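For completeness, a lighter-weight alternative is this minimal sketch, assuming the page body is really UTF-8 while the server's Content-Type simply omits the charset: let requests re-guess the encoding from the body before reading .text, instead of letting it fall back to ISO-8859-1 (which produces the mojibake above).

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}

def get_url_soup(url):
    r = requests.get(url, headers=headers, allow_redirects=True)
    # If the server does not declare a charset, re-guess it from the body
    # (assumption: the page is actually UTF-8, as Devanagari pages usually are).
    if 'charset' not in r.headers.get('content-type', '').lower():
        r.encoding = r.apparent_encoding
    return BeautifulSoup(r.text, 'lxml')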

Related

How to make python requests follow a temporary redirect (302)

I am trying to retrieve the HTML response code of a website that redirects with a 302. It seems that python3 is not following the temporary redirect and returns the original page instead.
Is there a way to configure the request so that it follows the 302?
import random
import requests
from pprint import pprint
from bs4 import BeautifulSoup

url = 'https://www.zurrose.de/catalogsearch/result?q=11343654'
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'Referer': 'https://www.zurrose.de',
    'Connection': 'keep-alive'
}
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36,',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
]
headers['User-Agent'] = random.choice(user_agents)

r = requests.post(url, headers=headers)
print('start')
print(r.history[0].status_code)
if r.history:
    print("Request was redirected")
    print(r.history)
    for resp in r.history:
        print(resp.status_code, resp.url)
    print("Final destination:")
    print(r.status_code, r.url)
if not r.history:
    print(f'No redirect on {url}. Status {r.status_code}. PP URL not found for {sku}')
elif r.history[0].status_code < 300:
    print(f'No PP URL retrieved for {sku} on {url}. Status {r.history[0].status_code}')

soup = BeautifulSoup(r.content, 'html.parser')
for i in soup.select('link[rel*=canonical]'):
    # print(i['href'])
    url_pp = i['href']
    print(url_pp)
pprint(r.content)
The method you are using will not redirect you to the correct page. The problem is that the redirect happens via cookies, so you need to send the request within a session. I'll show two ways to get the page you need:
import requests
from bs4 import BeautifulSoup

def show_product_ajax(sku):
    url = f"https://www.zurrose.de/search/ajax/suggest/?q={sku}"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
    }
    for product in requests.request("GET", url, headers=headers).json():
        print(product['title'], product['url'], BeautifulSoup(product['price'], features='lxml').find('span', class_='price').getText())

def show_product_session(sku):
    url = f'https://www.zurrose.de/catalogsearch/result?q={sku}'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
    }
    s = requests.Session()
    r = s.get(url, headers=headers)
    soup = BeautifulSoup(r.text, features='lxml')
    product_url = soup.find('meta', attrs={'itemprop': 'url'}).get('content')
    product_title = soup.find('span', attrs={'itemprop': 'name', 'data-ui-id': 'page-title-wrapper'}).getText()
    product_price = soup.find('span', class_='price').getText()
    print(product_title, product_url, product_price)

show_product_ajax(11343654)
show_product_session(11343654)
Result:
SYNGUT Synbiotikum m.Probiotika u.Prebiot.Beutel https://www.zurrose.de/produkte/syngut-synbiotikum-m-probiotika-u-prebiot-beutel-344764 23,50 €
SYNGUT Synbiotikum m.Probiotika u.Prebiot.Beutel-15 St https://www.zurrose.de/produkte/syngut-synbiotikum-m-probiotika-u-prebiot-beutel-344764?variation=344764_6502482 23,50 €

How to get past the consent page of google shopping using requests and python

For a small project I tried to get past the Google consent page so I could scrape the prices it finds. However, I could not get to the good stuff. So far I have tried the following code, as also proposed in "Using python requests with google search".
First try:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

#%% get access
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    "cookie": "CONSENT=YES"}
cookie = {
    'set-cookie': 'CONSENT=YES+cb.20211212-16-p1.nl+FX+671'
}
url = 'https://www.google.com/search?q=kitesurf+bar&rlz=1C1CAFA_enNL603NL674&sxsrf=AOaemvJF3BPrjeczkCI1e1YotCSJYcKjug:1641909060258&source=lnms&tbm=shop&sa=X&ved=2ahUKEwiBwqjy66n1AhXO-KQKHUKVBSoQ_AUoAXoECAEQAw'
s = requests.Session()
s.get(url, headers=headers)

purl = 'https://consent.google.com/s'
payload = {
    'gl': 'NL',
    'm': 'false',
    'pc': 'srp',
    'continue': 'https://www.google.com/search?q=kitesurf+bar&rlz=1C1CAFA_enNL603NL674&sxsrf=AOaemvJF3BPrjeczkCI1e1YotCSJYcKjug:1641909060258&source=lnms&tbm=shop&sa=X&ved=2ahUKEwiBwqjy66n1AhXO-KQKHUKVBSoQ_AUoAXoECAEQAw',
    'ca': 'r',
    'x': '6',
    'v': 'cb.20211212-16-p1.nl+FX+092',
    't': 'ADw3F8jHa3HqOqq133-wOkXCYf4K_r-AIA:1641909071268',
    'hl': 'nl',
    'src': '1'
}
s.post(purl, params=payload, headers=headers)
page = s.get(url, headers=headers, cookies=cookie)
Second try:
import requests
from bs4 import BeautifulSoup

with requests.Session() as s:
    url = f"https://www.google.com/search?q=fitness+wear"
    headers = {
        "referer": "https://www.google.com/",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
    }
    s.post(url, headers=headers)
    response = s.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    print(soup)
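Another approach that sometimes suffices (a sketch, under the assumption that Google still honours a pre-set CONSENT cookie; the consent flow changes regularly) is to set the cookie on the .google.com domain yourself before requesting the search page, instead of replaying the consent POST:

import requests
from bs4 import BeautifulSoup

url = 'https://www.google.com/search?q=kitesurf+bar&tbm=shop'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
}
with requests.Session() as s:
    # Assumption: a CONSENT cookie set up front skips the consent interstitial.
    # The exact value Google expects may change over time.
    s.cookies.set('CONSENT', 'YES+cb.20211212-16-p1.nl+FX+671', domain='.google.com')
    response = s.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    print(soup.title)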

Website scraping javascript element with python

I am scraping the page with bs4 and requests in Python. I want to get all the values given in the spans below.
My code returns empty output.
This is my code:
import requests
from bs4 import BeautifulSoup
url = 'https://finance.yahoo.com/?guccounter=1&guce_referrer=aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS8&guce_referrer_sig=AQAAAL27GM7owB-wouEznTgEc042sYEQEVDVrvFu5gPk62z1oKnTUhzN297s6vD5rzOVWHpoex7Zc8frVJe0saldAedZOe49BauM9YtLDhHtx6PMlH4ENmihvT2fgmlnqsAPFFqfC9aW1dF_NgBYi6lfREpk6uUwP7DnDhikzgEkYIUd'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Mobile Safari/537.36', "Upgrade-Insecure-Requests": "1","DNT": "1","Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8","Accept-Language": "en-US,en;q=0.5","Accept-Encoding": "gzip, deflate"}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
for i in soup.find_all('span', class_='Trsdu(0.3s) Fz(s) Mt(4px) Mb(0px) Fw(b) D(ib)'):
    print(i.text)
Change the User-Agent to obtain the correct values:
import requests
from bs4 import BeautifulSoup
url = "https://finance.yahoo.com/?guccounter=1&guce_referrer=aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS8&guce_referrer_sig=AQAAAL27GM7owB-wouEznTgEc042sYEQEVDVrvFu5gPk62z1oKnTUhzN297s6vD5rzOVWHpoex7Zc8frVJe0saldAedZOe49BauM9YtLDhHtx6PMlH4ENmihvT2fgmlnqsAPFFqfC9aW1dF_NgBYi6lfREpk6uUwP7DnDhikzgEkYIUd"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0",
}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")
for i in soup.find_all(
    "span", class_="Trsdu(0.3s) Fz(s) Mt(4px) Mb(0px) Fw(b) D(ib)"
):
    print(i.text)
Prints:
4,181.17
33,874.85
13,962.68
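Note that class strings like Trsdu(0.3s) Fz(s) ... are generated by Yahoo's tooling and change when the page is rebuilt. A slightly more tolerant sketch, assuming only that one distinctive fragment of the class name survives, matches on that fragment instead of the full string:

# Sketch: match any span carrying a class that starts with 'Trsdu',
# rather than requiring the full generated class string to be identical.
for i in soup.find_all("span", class_=lambda c: c and c.startswith("Trsdu")):
    print(i.text)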

Scraping data after filling in a form on a website in Python

I have tried to scrape data from http://www.educationboardresults.gov.bd/ with python and BeautifulSoup.
First, the website requires filling in a form. After the form is submitted, the website provides the results. I have attached two images here.
Before Submitting Form: https://prnt.sc/w4lo7i
After Submission: https://prnt.sc/w4lqd0
I have tried the following code:
import requests
from bs4 import BeautifulSoup as bs

resultdata = {
    'sr': '3',
    'et': '2',
    'exam': 'ssc',
    'year': 2012,
    'board': 'chittagong',
    'roll': 102275,
    'reg': 626948,
    'button2': 'Submit',
}
headers = {
    'user-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
    'cookie': 'PHPSESSID=24vp2g7ll9utu1p2ob5bniq263; tcount_unique_eb_log=1',
    'Origin': 'http://www.educationboardresults.gov.bd',
    'Referer': 'http://www.educationboardresults.gov.bd/',
    'Request URL': 'http://www.educationboardresults.gov.bd/result.php'
}
with requests.Session() as s:
    url = 'http://www.educationboardresults.gov.bd'
    r = s.get(url, headers=headers)
    soup = bs(r.content, 'html5lib')
    # Scraping and bypassing the captcha
    alltable = soup.findAll('td')
    captcha = alltable[56].text.split('+')
    for digit in captcha:
        value_one, value_two = int(captcha[0]), int(captcha[1])
        resultdata['value_s'] = value_one + value_two
    r = s.post(url, data=resultdata, headers=headers)
When I print r.content, it shows the first page's code. I want to scrape the second page.
Thanks in Advance
You are making the POST request to the wrong URL. Moreover, you are supposed to add the two numbers and put the result next to value_s. If you are using bs4 version 4.7.0 or later, the following selector will work for you, as I've used a CSS pseudo-selector. The bottom line is that your issue is solved. Try the following:
import requests
from bs4 import BeautifulSoup

link = 'http://www.educationboardresults.gov.bd/'
result_url = 'http://www.educationboardresults.gov.bd/result.php'

resultdata = {
    'sr': '3',
    'et': '2',
    'exam': 'ssc',
    'year': 2012,
    'board': 'chittagong',
    'roll': 102275,
    'reg': 626948,
    'button2': 'Submit',
}

def get_number(s, link):
    r = s.get(link)
    soup = BeautifulSoup(r.text, "html5lib")
    num = 0
    captcha_numbers = soup.select_one("tr:has(> td > #value_s) > td + td").text.split("+")
    for i in captcha_numbers:
        num += int(i)
    return num

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    resultdata['value_s'] = get_number(s, link)
    r = s.post(result_url, data=resultdata)
    print(r.text)
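If the tr:has(...) selector looks opaque, here is a tiny self-contained sketch of what it selects, using a hypothetical, simplified stand-in for the captcha row (the real markup on the site may differ): inside the <tr> that has a direct <td> child containing the #value_s element, it picks the <td> that immediately follows another <td>, which is where the "X + Y" text is expected to live.

from bs4 import BeautifulSoup  # :has() needs bs4 4.7.0+ (SoupSieve)

# Hypothetical, simplified captcha row for illustration only.
html = """
<table><tr>
  <td><input id="value_s" name="value_s"></td>
  <td>8 + 4</td>
</tr></table>
"""
soup = BeautifulSoup(html, "html.parser")
cell = soup.select_one("tr:has(> td > #value_s) > td + td")
print(sum(int(n) for n in cell.text.split("+")))  # -> 12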
I am also trying:
import requests
from bs4 import BeautifulSoup as bs

resultdata = {
    'sr': '3',
    'et': '2',
    'exam': 'ssc',
    'year': "2012",
    'board': 'chittagong',
    'roll': "102275",
    'reg': "626948",
}
headers = {
    'user-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
    'cookie': 'PHPSESSID=24vp2g7ll9utu1p2ob5bniq263; tcount_unique_eb_log=1',
    'Origin': 'http://www.educationboardresults.gov.bd',
    'Referer': 'http://www.educationboardresults.gov.bd/',
    'Request URL': 'http://www.educationboardresults.gov.bd/result.php'
}
with requests.Session() as s:
    url = 'http://www.educationboardresults.gov.bd/index.php'
    r = s.get(url, headers=headers)
    soup = bs(r.content, 'lxml')
    # print(soup.prettify())

    # Scraping and bypassing the captcha
    alltable = soup.findAll('td')
    captcha = alltable[56].text.split('+')
    print(captcha)
    value_one, value_two = int(captcha[0]), int(captcha[1])
    print(value_one, value_two)
    resultdata['value_s'] = value_one + value_two
    resultdata['button2'] = 'Submit'
    print(resultdata)

    r = s.post("http://www.educationboardresults.gov.bd/result.php", data=resultdata, headers=headers)
    soup = bs(r.content, 'lxml')
    print(soup.prettify())

BeautifulSoup unsuccessful request

I'm using urlopen to extract data from a list of websites right now but keep running into problems with unsuccessful requests. Can anyone help me with it?
I saved the website as an HTML file:
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request

path = "/Users/runyao/Downloads/The Proposition 65 List | OEHHA.html"
soup = BeautifulSoup(open(path), "html.parser")
list = []
pattern = "(chemicals)+([a-z0-9-])"
for counter in range(1, 852):
    temp = str(soup.find_all('option')[counter])
    temptext = temp.replace("\n", "/")
    temptext = temptext.replace('"', "/")
    temptext = temptext.replace(">", "")
    templist = temptext.split("/")
    list.append(templist[-4])

url = 'https://oehha.ca.gov/chemicals/' + list[1] + '/'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
values = {'name': 'Michael Foord',
          'location': 'Northampton',
          'language': 'Python'}
headers = {'User-Agent': user_agent}
data = urllib.parse.urlencode(values)
data = data.encode('ascii')
req = urllib.request.Request(url, data, headers)
with urllib.request.urlopen(req) as response:
    the_page = response.read()
import urllib.parse
import urllib.request

url = "https://oehha.ca.gov/chemicals/" + list[1]
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
req = urllib.request.Request(url, headers=headers)
resp = urllib.request.urlopen(req)
respData = BeautifulSoup(resp, "html.parser")
print(respData)
<html>
<head>
<meta content="noindex,nofollow" name="robots"/>
<script src="/_Incapsula_Resource?SWJIYLWA=5074a744e2e3d891814e9a2dace20bd4,719d34d31c8e3a6e6fffd425f7e032f3">
</script>
<body>
</body></head></html>
And if you try adding more headers, like this... this works for me:
import requests

headers = {
    'authority': 'oehha.ca.gov',
    'cache-control': 'max-age=0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'sec-fetch-user': '?1',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'navigate',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'es-ES,es;q=0.9',
    'cookie': 'visid_incap_1712983=Dfo+CVLdTCSzMHcIXEgPSb1y7l0AAAAAQUIPAAAAAAAetg/EHDkFJUigbIo4eaK4; incap_ses_532_1712983=dFrgDpkdqkYCF1u+mQxiB71y7l0AAAAAksMc42V5CJx6OdUZdeHflA==; has_js=1; _ga=GA1.2.626207180.1575908117; _gid=GA1.2.512657448.1575908117; __utma=158387685.626207180.1575908117.1575908117.1575908117.1; __utmc=158387685; __utmz=158387685.1575908117.1.1.utmcsr=(direct)^|utmccn=(direct)^|utmcmd=(none); __utmt=1; __utmt_b=1; _aeaid=5e12b9d6-0171-4fde-8ccf-1bba809a1bb2; aeatstartmessage=true; __utmb=158387685.4.10.1575908117',
    'if-none-match': '^\\^1575853039-1^\\^',
}
response = requests.get('https://oehha.ca.gov/chemicals/abiraterone-acetate', headers=headers)
print(response.content)
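As a quick sanity check, a short sketch that feeds the working response back into BeautifulSoup, the same way as before, to confirm the real page came back instead of the empty Incapsula stub shown above:

from bs4 import BeautifulSoup

soup = BeautifulSoup(response.content, 'html.parser')
# If the request got past Incapsula, the page has a real <title> instead of
# the noindex/nofollow stub printed earlier.
print(soup.title.get_text(strip=True) if soup.title else 'no <title> found')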
UPDATE: I have updated the code using urllib; if it still fails, it is probably because they have blocked the IP you are making the requests from.
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request

url = "https://oehha.ca.gov/chemicals/abiraterone-acetate"
# headers = {}
# headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'cookie': 'visid_incap_1712983=Dfo+CVLdTCSzMHcIXEgPSb1y7l0AAAAAQUIPAAAAAAAetg/EHDkFJUigbIo4eaK4; incap_ses_532_1712983=dFrgDpkdqkYCF1u+mQxiB71y7l0AAAAAksMc42V5CJx6OdUZdeHflA==; has_js=1; _ga=GA1.2.626207180.1575908117; _gid=GA1.2.512657448.1575908117; __utma=158387685.626207180.1575908117.1575908117.1575908117.1; __utmc=158387685; __utmz=158387685.1575908117.1.1.utmcsr=(direct)^|utmccn=(direct)^|utmcmd=(none); __utmt=1; __utmt_b=1; _aeaid=5e12b9d6-0171-4fde-8ccf-1bba809a1bb2; aeatstartmessage=true; __utmb=158387685.4.10.1575908117',
}
req = urllib.request.Request(url, data=None, headers=headers)
resp = urllib.request.urlopen(req)
respData = BeautifulSoup(resp, "html.parser", from_encoding="iso-8859-1")
print(respData)
Result: