Not able to open the URL with requests.get - python

I'm getting HTTPError: 400 for the code below, and I don't understand why I'm not able to open the URL:
from bs4 import BeautifulSoup
import requests
import lxml
import pandas as pd
import re
search_url = f'https://www.booking.com/reviewlist.en-gb.html?aid=304142&label=gen173nr-1DCAsoAkIbY2VudHJvLXlhcy1pc2xhbmQtYWJ1LWRoYWJpSDNYBGhsiAEBmAEJuAEGyAEM2AED6AEBiAIBqAIDuAKEwOrxBcACAQ&sid=61a721d17d76bc82ccf82c3c3d92de7c&cc1=ae&dist=1&pagename=centro-yas-island-abu-dhabi&srpvid=fee14d92dc160043&type=total&rows=10&offset=0'
page = requests.get(search_url)
print(page)
if page.status_code == requests.codes.ok:
    soup = BeautifulSoup(page.text, 'lxml')
    # get_property_attributes(soup)
else:
    print('open error')
Output:
<Response [400]>
Can anyone give me some suggestions to overcome this issue?

Try adding headers parameter in the request:
from bs4 import BeautifulSoup
import requests
import lxml
import pandas as pd
import re
search_url = 'https://www.booking.com/reviewlist.en-gb.html?aid=304142&label=gen173nr-1DCAsoAkIbY2VudHJvLXlhcy1pc2xhbmQtYWJ1LWRoYWJpSDNYBGhsiAEBmAEJuAEGyAEM2AED6AEBiAIBqAIDuAKEwOrxBcACAQ&sid=61a721d17d76bc82ccf82c3c3d92de7c&cc1=ae&dist=1&pagename=centro-yas-island-abu-dhabi&srpvid=fee14d92dc160043&type=total&rows=10&offset=0'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
page = requests.get(search_url, headers=headers)
print(page)
if page.status_code == requests.codes.ok:
    soup = BeautifulSoup(page.text, 'lxml')
    # get_property_attributes(soup)
else:
    print('open error')
Output:
<Response [200]>
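As a follow-up: the reviewlist URL already carries rows=10 and offset=0 query parameters, so further pages can likely be fetched by stepping the offset. A minimal sketch, assuming the endpoint keeps honouring those parameters and that review items match the .review_list_new_item_block selector (both are assumptions to verify against the live markup):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
base_url = ('https://www.booking.com/reviewlist.en-gb.html'
            '?pagename=centro-yas-island-abu-dhabi&cc1=ae&type=total&rows=10')

for offset in range(0, 30, 10):  # first three pages of 10 reviews each
    page = requests.get(f'{base_url}&offset={offset}', headers=headers)
    if page.status_code != requests.codes.ok:
        break
    soup = BeautifulSoup(page.text, 'lxml')
    # the selector is an assumption -- inspect the page and adjust
    print(offset, len(soup.select('.review_list_new_item_block')))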

Related

I am trying to get one element of a website but it prints "none" (Python Requests)

from bs4 import BeautifulSoup
import requests
url = "https://www.gamerdvr.com/gamer/cookz/videos"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
element = soup.find('span', id_="most-recorded")
print(element)
This always prints "None", but when I go to the website, I can see the element. I even deleted all cookies and it's still there.
Without specifying a user agent, the site does not give you the tag you need.
from bs4 import BeautifulSoup
import requests
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
url = "https://www.gamerdvr.com/gamer/cookz/videos"
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
element = soup.find('span', {'id': "most-recorded"}).get_text(strip=True)
print(element)
OUTPUT:
Fortnite
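
One defensive note on the snippet above: soup.find() returns None whenever the tag is absent (for example, when the request is sent without the headers), and calling .get_text() on None raises an AttributeError. A small guard keeps the failure readable:

from bs4 import BeautifulSoup
import requests

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
response = requests.get("https://www.gamerdvr.com/gamer/cookz/videos", headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

element = soup.find('span', {'id': 'most-recorded'})
# find() returns None when the tag is missing, so guard before get_text()
if element is not None:
    print(element.get_text(strip=True))
else:
    print('span#most-recorded not found - check the headers or the markup')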

Web scraping when it goes to a 403 page

I am a beginner at web scraping and am required to scrape https://mirror-h.org/archive/page/1 using BeautifulSoup, but it gives an error and goes to a 403 page. How can I solve this? I really appreciate your help.
Here is my code:
import requests
from bs4 import BeautifulSoup
import pandas
url = "https://mirror-h.org/archive/page/1"
page = pandas.read_html(url)
headers = {
    'user-agent:' 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
print(soup)
The error I get is:
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
import requests
import pandas as pd
from bs4 import BeautifulSoup
# make sure you insert the headers as a dict, as you missed the : within your original code
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'
}

def main(url):
    # include the headers in the request
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # response 200
    print(r)
    # this is how you can use pandas with the previous headers to get the 200 response text
    df = pd.read_html(r.text)
    print(df)  # you will get an error --> ValueError: No tables found, because you are dealing with a JS website behind Cloudflare protection! Try Selenium then!

main('https://mirror-h.org/archive/page/1')
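
Since the page is rendered by JavaScript behind Cloudflare, pandas sees no tables in the raw HTML, and a browser-driven fetch is the usual fallback. A sketch using Selenium 4, assuming a Chrome driver is available on PATH; Cloudflare may still challenge an automated browser, so treat this as a starting point rather than a guaranteed fix:

from io import StringIO
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
try:
    driver.get('https://mirror-h.org/archive/page/1')
    # wait for the JS-rendered table to actually appear in the DOM
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.TAG_NAME, 'table'))
    )
    # hand the rendered source to pandas, as with r.text above
    df = pd.read_html(StringIO(driver.page_source))[0]
    print(df.head())
finally:
    driver.quit()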

How do I retrieve the text between those

https://imgur.com/a/JcTnbiw
How do I retrieve the highlighted text with BeautifulSoup?
An example would be the best answer, thank you ;)
Edit: here's the code:
import requests
import pyperclip
from bs4 import BeautifulSoup
import time
url = 'https://sales.elhst.co/'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"}
site = requests.get(url, headers=headers)
site = str(site)
if site == "<Response [200]>":
    print("Site is up..")

page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
time.sleep(2)
target = soup.find("pp", id="copies")
print(target)
and the output is:
Site is up..
<pp id="copies"></pp>
and I want to get this text:
https://imgur.com/a/JcTnbiw
Is there any way to do it?
The data you see on the page is loaded from an external URL. You can try this script to print the number of copies:
import re
import json
import requests
url = 'https://sales.elhst.co/socket.io/?EIO=3&transport=polling'
copies_url = 'https://sales.elhst.co/socket.io/?EIO=3&transport=polling&sid={sid}'
r = requests.get(url).text
sid = json.loads(re.search(r'(\{".*)', r).group(1))['sid']
r = requests.get(copies_url.format(sid=sid)).text
copies = json.loads(re.search(r'(\[".*)', r).group(1))[-1]
print(copies)
Prints:
0
from lxml import html
import requests

page = requests.get('http://url')
tree = html.fromstring(page.content)
# this will extract the text you need
buyers = tree.xpath('//pp[@id="copies"]/text()')

It should work, but I don't know the pp tag; I think it's a mistake and it should be a <p> tag.
More info about lxml is in its documentation.

Python Web Scrape - 403 Error

I'm trying to open this website using Python, BeautifulSoup, and urllib, but I keep getting a 403 error. Can someone guide me with this error?
My current code is this:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.cubesmart.com/florida-self-storage/st--petersburg-self-storage/3337.html?utm_source=local&utm_medium=organic&utm_campaign=googlemybusiness&utm_term=3337'
uClient = uReq(my_url)
but I get the 403 error.
I searched around and tried using the approach below, but it too is giving me the same error.
from urllib.request import Request, urlopen
url="https://www.cubesmart.com/florida-self-storage/st--petersburg-self-storage/3337.html?utm_source=local&utm_medium=organic&utm_campaign=googlemybusiness&utm_term=3337"
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
web_byte = urlopen(req).read()
webpage = web_byte.decode('utf-8')
Any help is appreciated.
Try using session() from requests, as below:
import requests
my_session = requests.session()
for_cookies = my_session.get("https://www.cubesmart.com")
cookies = for_cookies.cookies
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}
my_url = 'https://www.cubesmart.com/florida-self-storage/st--petersburg-self-storage/3337.html?utm_source=local&utm_medium=organic&utm_campaign=googlemybusiness&utm_term=3337'
response = my_session.get(my_url, headers=headers, cookies=cookies)
print(response.status_code) # 200
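
A slightly tighter variant of the same idea: a requests session already persists cookies between calls, so the explicit cookies= argument is optional once the first request has seeded the jar, and the default headers can be set once on the session:

import requests

my_session = requests.session()
# default headers apply to every request made through this session
my_session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'})
my_session.get("https://www.cubesmart.com")  # seeds the session's cookie jar
my_url = 'https://www.cubesmart.com/florida-self-storage/st--petersburg-self-storage/3337.html?utm_source=local&utm_medium=organic&utm_campaign=googlemybusiness&utm_term=3337'
response = my_session.get(my_url)
print(response.status_code)  # expected 200, as above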

BeautifulSoup can't extract data from this website

import csv
import requests
from bs4 import BeautifulSoup
import lxml
import urllib2

opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
f = open('ala2009link.csv', 'r')
s = open('2009alanews.csv', 'w')
for row in csv.reader(f):
    url = row[0]
    print url
    res = requests.get(url)
    print res.content
    soup = BeautifulSoup(res.content)
    print soup
    data = soup.find_all("article", {"class": "article-wrapper news"})
    # data = soup.find_all("main", {"class": "main-content"})
    for item in data:
        title = item.find_all("h2", {"class", "article-headline"})[0].text
        s.write("%s \n" % title)
    content = soup.find_all("p")
    for main in content:
        k = main.text.encode('utf-8')
        s.write("%s \n" % k)
    # k = csv.writer(s)
    # k.writerow('%s\n' % (main))
s.close()
f.close()
This is my code to extract data from a website, but I don't know why I can't extract the data. Is an ad-blocker warning blocking my BeautifulSoup?
Here is the example link: http://www.rolltide.com/news/2009/6/23/Bert_Bank_Passes_Away.aspx?path=football
The reason that no results are returned is that this website requires a User-Agent header in your request.
To fix this, add a headers parameter with a User-Agent to the requests.get(), like so:
url = 'http://www.rolltide.com/news/2009/6/23/Bert_Bank_Passes_Away.aspx?path=football'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/29.0.1547.65 Chrome/29.0.1547.65 Safari/537.36',
}
res = requests.get(url, headers=headers)
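
From there the original loop should start returning results. A short sketch reusing the question's own selectors (written as a dict rather than a set, and with a None guard in case a headline is missing):

from bs4 import BeautifulSoup
import requests

url = 'http://www.rolltide.com/news/2009/6/23/Bert_Bank_Passes_Away.aspx?path=football'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/29.0.1547.65 Chrome/29.0.1547.65 Safari/537.36',
}
res = requests.get(url, headers=headers)
soup = BeautifulSoup(res.content, 'html.parser')
for item in soup.find_all('article', {'class': 'article-wrapper news'}):
    headline = item.find('h2', {'class': 'article-headline'})
    if headline is not None:
        print(headline.text.strip())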
