How can i access this web data successfully using BeautifulSoup? - python

i want to get the informatiom from booking.com (like hotel names, prices...), but I cannot find these information when I access the website through python using BeautifulSoup.
This is what I did:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
url="https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaGKIAQGYAQm4AQfIAQzYAQPoAQGIAgGoAgO4AtDrhJMGwAIB0gIkZjQzNmY0MTQtMjY3OS00NGE0LTkwOWEtNGQ3YzQ0OTY1Mjc42AIE4AIB&lang=en-gb&sid=b9d75b447deb2624c8cfaadad9969120&sb=1&sb_lp=1&src=index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Findex.en-gb.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaGKIAQGYAQm4AQfIAQzYAQPoAQGIAgGoAgO4AtDrhJMGwAIB0gIkZjQzNmY0MTQtMjY3OS00NGE0LTkwOWEtNGQ3YzQ0OTY1Mjc42AIE4AIB%3Bsid%3Db9d75b447deb2624c8cfaadad9969120%3Bsb_price_type%3Dtotal%26%3B&ss=Hong+Kong&is_ski_area=0&ssne=Hong+Kong&ssne_untouched=Hong+Kong&dest_id=-1353149&dest_type=city&checkin_year=2022&checkin_month=4&checkin_monthday=25&checkout_year=2022&checkout_month=4&checkout_monthday=30&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1"
requests.get(url)
response = requests.get(url)
response.status_code
soup = BeautifulSoup(response.content,'html.parser')
print(soup)
after I print soup, I can only see the information like scores but I cannot find anything about the hotel names when I use find(), can you tell me what I did wrong and how can I do it right? Thank you so much!!

You just simply need to inspect the HTML of the page that is returned in the soup, for example if you inspect hotel heading in the browser you will notice top 10 results of hotels are being shown in the tag with class of card
Then finally you can use find to fetch all the info e.g. check the following modified version of your code
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
url="https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1DCAEoggI46AdIM1gEaGKIAQGYAQm4AQfIAQzYAQPoAQGIAgGoAgO4AtDrhJMGwAIB0gIkZjQzNmY0MTQtMjY3OS00NGE0LTkwOWEtNGQ3YzQ0OTY1Mjc42AIE4AIB&lang=en-gb&sid=b9d75b447deb2624c8cfaadad9969120&sb=1&sb_lp=1&src=index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Findex.en-gb.html%3Flabel%3Dgen173nr-1DCAEoggI46AdIM1gEaGKIAQGYAQm4AQfIAQzYAQPoAQGIAgGoAgO4AtDrhJMGwAIB0gIkZjQzNmY0MTQtMjY3OS00NGE0LTkwOWEtNGQ3YzQ0OTY1Mjc42AIE4AIB%3Bsid%3Db9d75b447deb2624c8cfaadad9969120%3Bsb_price_type%3Dtotal%26%3B&ss=Hong+Kong&is_ski_area=0&ssne=Hong+Kong&ssne_untouched=Hong+Kong&dest_id=-1353149&dest_type=city&checkin_year=2022&checkin_month=4&checkin_monthday=25&checkout_year=2022&checkout_month=4&checkout_monthday=30&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1"
requests.get(url)
response = requests.get(url)
response.status_code
soup = BeautifulSoup(response.content,'html.parser')
#filter all elements with tag span, class bui-card__title and itemprop as name
hotels = soup.findAll("span", {"class": "bui-card__title", "itemprop": "name"})
for hotel in hotels:
print(hotel.decode_contents().strip())
Output is following

Related

Web scraping IMDB with Python's Beautiful Soup

I am trying to parse this page "https://www.imdb.com/title/tt0068112/?ref_=fn_al_tt_1", but I can't find the href that I need (href="/title/tt0068112/episodes?ref_=tt_eps_sm").
I tried with this code:
url="https://www.imdb.com/title/tt0068112/?ref_=fn_al_tt_1"
page(requests.get(url)
soup=BeautifulSoup(page.content,"html.parser")
for a in soup.find_all('a'):
print(a['href'])
What's wrong with this? I also tried to check "manually" with print(soup.prettify()) but it seems that that link is hidden or something like that.
You can get the page html with requests, the href item is in there, no need for special apis. I tried this and it worked:
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.imdb.com/title/tt0068112/?ref_=fn_al_tt_1")
soup = BeautifulSoup(page.content, "html.parser")
scooby_link = ""
for item in soup.findAll("a", href="/title/tt0068112/episodes?ref_=tt_eps_sm"):
print(item["href"])
scooby_link = "https://www.imdb.com" + "/title/tt0068112/episodes?ref_=tt_eps_sm"
print(scooby_link)
I'm assuming you also wanted to save the link to a variable for further scraping so I did that as well. 🙂
To get the link with Episodes you can use next example:
import requests
from bs4 import BeautifulSoup
url = "https://www.imdb.com/title/tt0068112/?ref_=fn_al_tt_1"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
print(soup.select_one("a:-soup-contains(Episodes)")["href"])
Prints:
/title/tt0068112/episodes?ref_=tt_eps_sm

How to select all links of apps from app store and extract its href?

from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
url = f'https://www.apple.com/kr/search/youtube?src=globalnav'
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, 'html.parser')
links = soup.select(".rf-serp-productname-list")
print(links)
I want to crawl through all links of shown apps. When I searched for a keyword, I thought links = soup.select(".rf-serp-productname-list") would work, but links list is empty.
What should I do?
Just check this code, I think is what you want:
import re
import requests
from bs4 import BeautifulSoup
pages = set()
def get_links(page_url):
global pages
pattern = re.compile("^(/)")
html = requests.get(f"your_URL{page_url}").text # fstrings require Python 3.6+
soup = BeautifulSoup(html, "html.parser")
for link in soup.find_all("a", href=pattern):
if "href" in link.attrs:
if link.attrs["href"] not in pages:
new_page = link.attrs["href"]
print(new_page)
pages.add(new_page)
get_links(new_page)
get_links("")
Source:
https://gist.github.com/AO8/f721b6736c8a4805e99e377e72d3edbf
You can change the part:
for link in soup.find_all("a", href=pattern):
#do something
To check for a keyword I think
You are cooking a soup so first at all taste it and check if everything you expect contains in it.
ResultSet of your selection is empty cause structure in response differs a bit from your expected one from the developer tools.
To get the list of links select more specific:
links = [a.get('href') for a in soup.select('a.icon')]
Output:
['https://apps.apple.com/kr/app/youtube/id544007664', 'https://apps.apple.com/kr/app/%EC%BF%A0%ED%8C%A1%ED%94%8C%EB%A0%88%EC%9D%B4/id1536885649', 'https://apps.apple.com/kr/app/youtube-music/id1017492454', 'https://apps.apple.com/kr/app/instagram/id389801252', 'https://apps.apple.com/kr/app/youtube-kids/id936971630', 'https://apps.apple.com/kr/app/youtube-studio/id888530356', 'https://apps.apple.com/kr/app/google-chrome/id535886823', 'https://apps.apple.com/kr/app/tiktok-%ED%8B%B1%ED%86%A1/id1235601864', 'https://apps.apple.com/kr/app/google/id284815942']

Beautifulsoup extracting urls from a given website menu

Hello every one I'm new to beautifulsoup, I'm trying to write a function that will be able to extract second level urls from a given website.
For example if I have this website url : https://edition.cnn.com/ my function should be able to return
https://edition.cnn.com/world
https://edition.cnn.com/politics
https://edition.cnn.com/business
https://edition.cnn.com/health
https://edition.cnn.com/entertainment
https://edition.cnn.com/style
https://edition.cnn.com/travel
first I have tried this code to retrieve all links starting with the string of the url:
from bs4 import BeautifulSoup as bs4
import requests
import lxml
import re
def getLinks(url):
response = requests.get(url)
data = response.text
soup = bs4(data, 'lxml')
links = []
for link in soup.find_all('a', href=re.compile(str(url))):
links.append(link.get('href'))
return links
But then again the actual output is giving me all the links even links of articles which is not I'm looking for. is there a method that I can use to get what I want using regular expression or others.
The links are inside <nav> tag, so using CSS selector nav a[href] will select only links inside <nav> tag:
import requests
from bs4 import BeautifulSoup
url = 'https://edition.cnn.com'
soup = BeautifulSoup(requests.get(url).text, 'lxml')
for a in soup.select('nav a[href]'):
if a['href'].count('/') > 1 or '#' in a['href']:
continue
print(url + a['href'])
Prints:
https://edition.cnn.com/world
https://edition.cnn.com/politics
https://edition.cnn.com/business
https://edition.cnn.com/health
https://edition.cnn.com/entertainment
https://edition.cnn.com/style
https://edition.cnn.com/travel
https://edition.cnn.com/sport
https://edition.cnn.com/videos
https://edition.cnn.com/world
https://edition.cnn.com/africa
https://edition.cnn.com/americas
https://edition.cnn.com/asia
https://edition.cnn.com/australia
https://edition.cnn.com/china
https://edition.cnn.com/europe
https://edition.cnn.com/india
https://edition.cnn.com/middle-east
https://edition.cnn.com/uk
...and so on.

How to make beautiful soup grab only what is between a set of "[:" ":]" in a web page?

Good afternoon! How do I make Beautifulsoup grab only what is between multiple sets of "[:" and ":]" So far I have got the entire page in my soup, but it does not have tags, sadly.
What it looks like so far
I have tried a couple of things so far:
soup.findAll(text="[")
keys = soup.find("span", attrs = {"class": "objectBox objectBox-string"})
import bs4 as bs
import urllib.request
source = urllib.request.urlopen("https://login.microsoftonline.com/common/discovery/keys").read()
soup = bs.BeautifulSoup(source,'lxml')
# ---------------------------------------------
# prior script that I was playing with trying to tackle this issue
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
# Set URL to scrape new certs from
newcerts = "https://login.microsoftonline.com/common/discovery/keys"
# Connect to the URL
response = requests.get(newcerts)
# Parse HTML and save to BeautifulSoup Object
soup = BeautifulSoup(response.text, "html.parser")
keys = soup.find("span", attrs = {"class": "objectBox objectBox-string"})
End goal is to retrieve the public PKI keys from Azure's website at https://login.microsoftonline.com/common/discovery/keys
Not sure if this is what you meant to grab. Try the script below:
import json
import requests
url = 'https://login.microsoftonline.com/common/discovery/keys'
res = requests.get(url)
jsonobject = json.loads(res.content)
for item in jsonobject['keys']:
print(item['x5c'])

Problem with scraping data from website with BeautifulSoup

I am trying to take a movie rating from the website Letterboxd. I have used code like this on other websites and it has worked, but it is not getting the info I want off of this website.
import requests
from bs4 import BeautifulSoup
page = requests.get("https://letterboxd.com/film/avengers-endgame/")
soup = BeautifulSoup(page.content, 'html.parser')
final = soup.find("section", attrs={"class":"section ratings-histogram-
chart"})
print(final)
This prints nothing, but there is a tag in the website for this class and the info I want is under it.
The reason behind this, is that the website loads most of the content asynchronously, so you'll have to look at the http requests it sends to the server in order to load the page content after loading the page layout. You can find them in "network" section in the browser (F12 key).
For instance, one of the apis they use to load the rating is this one:
https://letterboxd.com/csi/film/avengers-endgame/rating-histogram/
You can get the weighted average from another tag
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://letterboxd.com/film/avengers-endgame/')
soup = bs(r.content, 'lxml')
print(soup.select_one('[name="twitter:data2"]')['content'])
Text of all histogram
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://letterboxd.com/csi/film/avengers-endgame/rating-histogram/')
soup = bs(r.content, 'lxml')
ratings = [item['title'].replace('\xa0',' ') for item in soup.select('.tooltip')]
print(ratings)

Categories