Web scraping returns a 403 page - Python

I am a beginner at web scraping and need to scrape https://mirror-h.org/archive/page/1 using BeautifulSoup, but the request fails and returns a 403 page. How can I solve this? I really appreciate your help.
Here is my code:
import requests
from bs4 import BeautifulSoup
import pandas

url = "https://mirror-h.org/archive/page/1"
page = pandas.read_html(url)

headers = {
    'user-agent:' 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
print(soup)
The error I get is:
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden

The 403 is actually raised by pandas.read_html(url), which fetches the page itself through urllib without any headers; your headers dict is also malformed, with the colon inside the key string. Fetch the page with requests and proper headers first, then hand the response text to pandas:

import requests
import pandas as pd
from bs4 import BeautifulSoup

# make sure you build the headers as a dict - you missed the ':' separator
# between key and value in your original code
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'
}

def main(url):
    # include the headers in the request
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    print(r)  # response 200
    # this is how you can use pandas with the previous headers on the response text
    df = pd.read_html(r.text)
    # you will get --> ValueError: No tables found, because you are dealing with
    # a JS website behind Cloudflare protection! Try selenium then!
    print(df)

main('https://mirror-h.org/archive/page/1')
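Since the comment above says to try selenium, here is a minimal sketch of that route. It assumes chromedriver is installed and on your PATH, and Cloudflare may still challenge an automated browser, so treat it as a starting point rather than a guaranteed fix:

import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()  # assumes chromedriver is on your PATH
driver.get('https://mirror-h.org/archive/page/1')
time.sleep(5)  # crude wait for the Cloudflare check and JS rendering to finish
html = driver.page_source
driver.quit()

soup = BeautifulSoup(html, 'lxml')
print(soup.title)
# once the rendered HTML contains a real <table>, read_html works;
# it still raises ValueError if the page never produced one
tables = pd.read_html(html)
print(tables[0].head())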

Related

JS site does not return data when scraping

I'm using the script I always use to scrape data from the web, but this time I'm not having any success.
I would like to get the data from the table on the website:
https://www.rad.cvm.gov.br/ENET/frmConsultaExternaCVM.aspx
I'm using the following code for scraping:
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://www.rad.cvm.gov.br/ENET/frmConsultaExternaCVM.aspx"
browser = webdriver.PhantomJS()
browser.get(url)
html = browser.page_source
bs = BeautifulSoup(html, 'lxml')
print(bs)
Currently I only receive the site's JS, not the data from the table itself.
Do an HTTP POST to https://www.rad.cvm.gov.br/ENET/frmConsultaExternaCVM.aspx/PopulaComboEmpresas. This will return the table data as JSON.
In the browser, press F12 --> Network --> Fetch/XHR to see the details, such as the HTTP headers and the POST body.
Since the API call returns a JSON response, you can do this easily with requests alone, following the POST method.
Here is the working code:
import requests
import json
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
body = {'tipoEmpresa': '0'}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'x-dtpc': '33$511511524_409h2vHHVRBIAIGILPJNCRGRCECUBIACWCBUEE-0e37',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/json'
}
url = 'https://www.rad.cvm.gov.br/ENET/frmConsultaExternaCVM.aspx/PopulaComboEmpresas'
r = requests.post(url, data=json.dumps(body), headers=headers, verify=False)
res = r.json()['d']
print(res)
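The question was ultimately about the table, so it may help to load the JSON into pandas. The exact shape of the 'd' payload is an assumption here: ASP.NET web methods often return it as a JSON-encoded string, and this sketch also assumes the decoded payload is a list of records. Continuing from the code above:

import json

import pandas as pd

# res is r.json()['d'] from above; if the endpoint returns it as a
# JSON string rather than a parsed list, decode it one more time
records = json.loads(res) if isinstance(res, str) else res
df = pd.DataFrame(records)
print(df.head())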

Want to know how to crawl TripAdvisor

I am trying to get all of the URL links of restaurants in Singapore, but my code is not working:
data = requests.get("https://www.tripadvisor.com.sg/Restaurants-g294265-Singapore.html").text
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a', {'property_title'}):
    print('https://www.tripadvisor.com/Restaurant_Review-g294265-' + link.get('href'))
    print(link.string)
It keeps on loading and loading again at the line soup = BeautifulSoup(data, "html.parser").
I don't know why this happens, even though this works well for other sites.
Is this because TripAdvisor blocks crawling, or is my code wrong?
It keeps on loading and loading again
To get a response, add the user-agent header:
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
data = requests.get(
"https://www.tripadvisor.com.sg/Restaurants-g294265-Singapore.html", headers=headers
).text
But the data is loaded dynamically, and requests doesn't execute JavaScript. However, the data is available in JSON format on the page (it's not clear exactly what you want to scrape). To get all the data you can use the json/re modules:
import json
import re
...
data = requests.get(
    "https://www.tripadvisor.com.sg/Restaurants-g294265-Singapore.html", headers=headers
).text

json_data = re.search(r"window\.__WEB_CONTEXT__=({.*});", data, flags=re.MULTILINE).group(1)
# prints all the raw JSON text
print(json_data)
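Note that json_data here is still a plain string; json.loads turns it into a dict you can navigate. A hedged parse (the unquoted pageManifest key is an assumption about how TripAdvisor has shipped this blob, not a guarantee):

import json

try:
    ctx = json.loads(json_data)
except json.JSONDecodeError:
    # the blob has been seen with the unquoted JS key pageManifest; quote it and retry
    ctx = json.loads(json_data.replace("pageManifest", '"pageManifest"'))
print(list(ctx.keys()))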
To get all the links:
import re
import requests
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
data = requests.get(
"https://www.tripadvisor.com.sg/Restaurants-g294265-Singapore.html", headers=headers
).text
for link in re.findall(r'"detailPageUrl":"(.*?)"', data):
    print("https://www.tripadvisor.com.sg/" + link)
Output (truncated):
https://www.tripadvisor.com.sg//Restaurant_Review-g294265-d1145149-Reviews-Grand_Shanghai_Restaurant-Singapore.html
https://www.tripadvisor.com.sg//Restaurant_Review-g294265-d1193730-Reviews-Entre_Nous_creperie-Singapore.html
https://www.tripadvisor.com.sg//Restaurant_Review-g294265-d1173583-Reviews-The_Courtyard-Singapore.html
https://www.tripadvisor.com.sg//Restaurant_Review-g294265-d4611806-Reviews-NOX_Dine_in_the_Dark-Singapore.html
https://www.tripadvisor.com.sg//Restaurant_Review-g294265-d13152787-Reviews-Positano_Risto-Singapore.html

web scraping using beautiful soup

I'm using beautiful soup to scrape a site.
Code:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
my_url = 'https://www.bewakoof.com/biker-t-shirts'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("div", {"class": "productGrid"})
print(len(containers))
I am getting the error mentioned below.
Error
o = containerClass(current_data)
TypeError: __init__() takes 1 positional argument but 2 were given
When I tried to run part of your code, I caught an error.
After that, I tried using requests:
>>> my_url = 'https://www.bewakoof.com/biker-t-shirts'
>>> import requests as re
>>> r = re.get(my_url)
>>> r
<Response [403]>
You got code 403, which means the server understood the request but refuses to authorize it. You can get more information about that here.
Most often, this error comes from primitive anti-scraper protection. To get around it, you must use headers to convince the site that you are a browser.
To do this, install the requests lib, then create a dict:
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
Instead of these values you can substitute your own. The easiest way to get them is from the Network tab of your browser's dev tools (press F12 in Chrome).
Then
import requests as req
url = "url"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
r = req.get(url, headers=headers)
But in this situation, the problem is different: the site you are trying to access simply does not work.
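Whether a 403 means bot-blocking or a genuinely dead site is easy to probe from code; comparing the status with and without browser headers narrows it down (URL from the question):

import requests

url = "https://www.bewakoof.com/biker-t-shirts"
browser_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
}

for label, hdrs in [("no headers", {}), ("browser headers", browser_headers)]:
    try:
        r = requests.get(url, headers=hdrs, timeout=10)
        print(label, "->", r.status_code)
    except requests.RequestException as exc:
        print(label, "->", exc)
# 403 only without headers points at simple user-agent filtering;
# 403 (or a timeout) in both cases suggests a stronger block or a broken site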

request returns 403 in python beautifulsoup

I am using beautiful soup to try to parse information from a webpage:
url='https://www.onthemarket.com/for-sale/2-bed-flats-apartments/shortlands-station/?max-bedrooms=&radius=0.5'
req=requests.get(url)
req returns <Response [403]>
The answers to "Python requests. 403 Forbidden" suggest a user-agent issue, but I cannot see one in my case.
Are there any suggestions?
In such cases, use headers that include a user-agent:
from bs4 import BeautifulSoup
import requests
url = 'https://www.onthemarket.com/for-sale/2-bed-flats-apartments/shortlands-station/?max-bedrooms=&radius=0.5'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}
html_page = requests.get(url, headers=headers).text
soup = BeautifulSoup(html_page, "html.parser")
print(soup.text)

How to do scraping from a page with BeautifulSoup

The question is very simple, but for me it doesn't work and I don't know why!
I want to scrape the beer rating from this page https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone with BeautifulSoup, but it doesn't work.
This is my code:
import requests
import bs4
from bs4 import BeautifulSoup
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'
test_html = requests.get(url).text
soup = BeautifulSoup(test_html, "lxml")
rating = soup.findAll("span", class_="ratingValue")
rating
When I run it, it doesn't work, but if I do the same thing with another page, it works... I don't know why. Can someone help me? The rating should be 4.58.
Thanks everybody!
If you print test_html, you'll find you got a 403 Forbidden response.
You should add a header (at least a user-agent :) ) to your GET request.
import requests
from bs4 import BeautifulSoup
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'
test_html = requests.get(url, headers=headers).text
soup = BeautifulSoup(test_html, 'html5lib')
rating = soup.find('span', {'itemprop': 'ratingValue'})
print(rating.text)
# 4.58
You are getting the forbidden status code (HTTP error 403) because the server refuses to fulfill your request despite understanding it. You will run into this on many of the more popular websites, which have security features to prevent bots, so you need to disguise your request!
For that, you need to use headers.
You also need to correct the tag attribute whose data you're trying to get, i.e. itemprop.
Use lxml as your tree builder, or any other parser of your choice:
import requests
from bs4 import BeautifulSoup
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'
# Add this
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
test_html = requests.get(url, headers=headers).text
soup = BeautifulSoup(test_html, 'lxml')
rating = soup.find('span', {'itemprop':'ratingValue'})
print(rating.text)
The page you are requesting responds with 403 Forbidden, so you might not get an exception, but you will get an empty result ([]). To avoid this, we add a user agent; this code will get you the desired result.
import urllib.request
from bs4 import BeautifulSoup

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
url = "https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone"
headers = {'User-Agent': user_agent}
request = urllib.request.Request(url, None, headers)  # the assembled request
response = urllib.request.urlopen(request)
soup = BeautifulSoup(response, "lxml")
rating = soup.find('span', {'itemprop': 'ratingValue'})
print(rating.text)
You are facing this error because some websites can't be scraped with requests and Beautiful Soup alone. For these kinds of websites you have to use selenium:
download the latest chromedriver from this link, matching your operating system
install selenium with the command "pip install selenium"
# import required modules
import os
import time
from bs4 import BeautifulSoup
from selenium import webdriver

current_dir = os.getcwd()
print(current_dir)
# concatenate the web driver path with your current dir; on Windows change '/' to '\'
# make sure you placed chromedriver in the current directory
driver = webdriver.Chrome(current_dir + '/chromedriver')
# driver.get opens the url in the browser
driver.get('https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone')
time.sleep(1)
# fetch the rendered html from the driver
super_html = driver.page_source
driver.quit()
# now parse the raw html with 'html.parser'
soup = BeautifulSoup(super_html, "html.parser")
rating = soup.findAll("span", itemprop="ratingValue")
print(rating[0].text)
