Unable to parse a rating information from a webpage using requests - python

I tried to scrape a certain information from a webpage but failed miserably. The text I wish to grab is available in the page source but I still can't fetch it. This is the site address. I'm after the portion visible in the image as Not Rated.
Relevant html:
<div class="subtext">
Not Rated
<span class="ghost">|</span> <time datetime="PT188M">
3h 8min
</time>
<span class="ghost">|</span>
Drama,
Musical,
Romance
<span class="ghost">|</span>
<a href="/title/tt0150992/releaseinfo?ref_=tt_ov_inf" title="See more release dates">18 June 1999 (India)
</a> </div>
I've tried with:
import requests
from bs4 import BeautifulSoup
link = "https://www.imdb.com/title/tt0150992/?ref_=ttfc_fc_tt"
with requests.Session() as s:
s.headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
r = s.get(link)
soup = BeautifulSoup(r.text,"lxml")
rating = soup.select_one(".titleBar .subtext").next_element
print(rating)
I get None using the script above.
Expected output:
Not Rated
How can I get the rating from that webpage?

If you want to get correct version of HTML page, specify Accept-Language http header:
import requests
from bs4 import BeautifulSoup
link = "https://www.imdb.com/title/tt0150992/?ref_=ttfc_fc_tt"
with requests.Session() as s:
s.headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
s.headers['Accept-Language'] = 'en-US,en;q=0.5' # <-- specify also this!
r = s.get(link)
soup = BeautifulSoup(r.text,"lxml")
rating = soup.select_one(".titleBar .subtext").next_element
print(rating)
Prints:
Not Rated

There is a better way to getting info on the page. If you dump the html content returned by the request.
import requests
from bs4 import BeautifulSoup
link = "https://www.imdb.com/title/tt0150992/?ref_=ttfc_fc_tt"
with requests.Session() as s:
s.headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
r = s.get(link)
soup = BeautifulSoup(r.text,"lxml")
with open("response.html", "w", encoding=r.encoding) as file:
file.write(r.text)
you will find a element <script type="application/ld+json"> which contains all the information about the movie.
Then, you simply get the element text, parse it as json, and use the json to extract the info you wanted.
here is a working example
import json
import requests
from bs4 import BeautifulSoup
link = "https://www.imdb.com/title/tt0150992/?ref_=ttfc_fc_tt"
with requests.Session() as s:
s.headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
r = s.get(link)
soup = BeautifulSoup(r.text,"lxml")
movie_data = soup.find("script", attrs={"type": "application/ld+json"}).next # Find the element <script type="application/ld+json"> and get it's content
movie_data = json.loads(movie_data) # parse the data to json
content_rating = movie_data["contentRating"] # get rating

IMDB is one of those webpages that makes it incredible easy to do webscraping and I love it. So what they do to make it easy for webscrapers is to put a script in the top of the html that contains the whole movie object in the format of JSON.
So to get all the relevant information and organize it you simply need to get the content of that single script tag, and convert it to JSON, then you can simply ask for the specific information like with a dictionary.
import requests
import json
from bs4 import BeautifulSoup
#This part is basically the same as yours
link = "https://www.imdb.com/title/tt0150992/?ref_=ttfc_fc_tt"
r = requests.get(link)
soup = BeautifulSoup(r.content,"lxml")
#Why not get the whole json element of the movie?
script = soup.find('script', {"type" : "application/ld+json"})
element = json.loads(script.text)
print(element['contentRating'])
#Outputs "Not Rated"
# You can also inspect te rest of the json it has all the relevant information inside
#Just -> print(json.dumps(element, indent=2))
Note:
Headers and session are not necessary in this example.

Related

Get text from <span class: with Beautifulsoup and requests

so I tried to get a specific text from a website but it only gives me the error (floor = soup.find('span', {'class': 'text-white fs-14px text-truncate attribute-value'}).text
AttributeError: 'NoneType' object has no attribute 'text')
I specifically want to get the 'Floor Price' text.
My code:
import bs4
from bs4 import BeautifulSoup
#target url
url = "https://magiceden.io/marketplace/solsamo"
#act like browser
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
response = requests.get('https://magiceden.io/marketplace/solsamo')
#parse the downloaded page
soup = BeautifulSoup(response.content, 'lxml')
floor = soup.find('span', {'class': 'text-white fs-14px text-truncate attribute-value'}).text
print(floor)
There is no needed data in HTML you receive after:
response = requests.get('https://magiceden.io/marketplace/solsamo')
You can make sure of this by looking at page source code:
view-source:https://magiceden.io/marketplace/solsamo
You should use Selenium instead requests to get your data or you can examine XHR-requests on this page, maybe you can get this data using requests by following other link.

How can I scrape this data when requests doesn't return it?

I want to scrape the information from this page:
https://databases.usatoday.com/nfl-arrests/
Each of the arrests is listed in a table on the page under the css selector: #csp-data I can see this in the page's source as well: <div id="csp-data" class="csp-data"></div> but there is nothing in-between those tags for me to parse.
When I try to run the following code, I return no results.
import requests
from bs4 import BeautifulSoup
url = "https://databases.usatoday.com/nfl-arrests/"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, "html.parser")
test = soup.select('#csp-data > div > div:nth-child(3) > div > div.table-responsive > table > tbody')
print(test)
If I use test = soup.select('#csp-data'), I return <div class="csp-data" id="csp-data"></div> If I move to the next step #csp-data > div, I return no results.
I'm assuming that the data isn't being loaded when requests gets the data, but I'm not sure. When I go in through my browser and use inspect element, I can see the table has loaded.
Does anyone have an idea on how I could move forward here?
Here is the working output from ajax calls
import requests
import json
body = 'action=cspFetchTable&security=3193d24eb0&pageID=10&blogID=&sortBy=Date&sortOrder=desc&page=1&searches={}&heads=true'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded'}
url='https://databases.usatoday.com/wp-admin/admin-ajax.php'
r = requests.post(url, data=body, headers =headers)
tables = r.json()['data']['Result']
for table in tables:
print(table['First_name'])
Output:Example
Bradley
Deonte
Barkevious
Darius
Jarron
Tamorrion
Zaven
Frank
Justin
Aldon
Jeff
Marshon
Broderick
Frank
Jaydon
Kevin
Kemah
Chad
Isaiah
Rashard

BeautifulSoup - find() function not working for some elements

I'm trying to scrape financial data off this URL:https://www.londonstockexchange.com/stock/STAN/standard-chartered-plc/fundamentals
In this webpage, scraping the h1 tag works perfectly by referencing its class.
Source HTML:
<h1 _ngcontent-ng-lseg-c11="" class="company-name font-bold hero-font"><!----><!---->STANDARD CHARTERED PLC<!----><!----><!----></h1>
My Python Code:
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
url = 'https://www.londonstockexchange.com/stock/{}/{}'
stock = 'STAN/standard-chartered-plc'
info = 'fundamentals'
full_url = url.format(stock, info)
print(full_url)
r = requests.get(full_url)
soup = BeautifulSoup(r.text, 'lxml')
title = soup.find('title')
print(title)
rows = soup.find(class_='company-name font-bold hero-font')
print(rows)
Output:
https://www.londonstockexchange.com/stock/STAN/standard-chartered-plc/fundamentals
<title>STANDARD CHARTERED PLC STAN Fundamentals - Stock | London Stock Exchange</title>
<h1 _ngcontent-sc12="" class="company-name font-bold hero-font"><!-- --><!-- -->STANDARD CHARTERED PLC<!-- --><!-- --><!-- --></h1>
But when trying to scrape another part of the webpage, namely the following tag, this function ceases to work:
<thead _ngcontent-ng-lseg-c21="" class="accordion-header gtm-trackable">
My Python Code:
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
url = 'https://www.londonstockexchange.com/stock/{}/{}'
stock = 'STAN/standard-chartered-plc'
info = 'fundamentals'
full_url = url.format(stock, info)
print(full_url)
r = requests.get(full_url)
soup = BeautifulSoup(r.text, 'lxml')
title = soup.find('title')
print(title)
rows = soup.find(class_='accordion-header gtm-trackable')
print(rows)
My output is as follows:
https://www.londonstockexchange.com/stock/STAN/standard-chartered-plc/fundamentals
<title>STANDARD CHARTERED PLC STAN Fundamentals - Stock | London Stock Exchange</title>
None
I've tried using 'html.parser' and 'lxml' and both cause the same problem.
Data is dynamically loaded from a script tag. You can regex out the string holding your data, then do a replace on some entities to get a string json can turn in to a json object. Then parse out what you what.
import requests
from bs4 import BeautifulSoup
import json
r = requests.get('https://www.londonstockexchange.com/stock/STAN/standard-chartered-plc/fundamentals')
soup = BeautifulSoup(r.text, 'lxml')
data = json.loads(soup.select_one('#ng-lseg-state').string.replace('&q;','"'))
print(data['sortedComponents']['content'][1]['status']['childComponents'][1]['content'].keys())
There may be some other entities to replace. It may be sufficient to add the following:
import html
and later
data = json.loads(html.unescape(soup.select_one('#ng-lseg-state').string.replace('&q;','"')))
Sample of data:
To match image:
from pprint import pprint
pprint(data['sortedComponents']['content'][1]['status']['childComponents'][1]['content'])
String pasted into json viewer:
json.dumps(data['sortedComponents']['content'][1]['status']['childComponents'][1]['content'])

Webscraping latitude longitude from google results

How can I scrape latitude and longitude from the google results in the image below using beautiful soup.
Google result latitude longitude
Here is the code for do it with bs4:
from requests import get
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',}
response = get("https://www.google.com/search?q=latitude+longitude+of+75270+postal+code+paris+france",headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
a = soup.find("div", class_= "Z0LcW").text
print(a)
Please provide more input on further questions since we don't want to do the pre-work to create a solution.
You will have to grab this container:
<div class="HwtpBd gsrt PZPZlf" data-attrid="kc:/location/location:coordinates" aria-level="3" role="heading"><div class="Z0LcW XcVN5d">48.8573° N, 2.3370° E</div><div></div></div>
BS4
#BeautifoulSoup Stuff
import requests
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import re
# Make the request
url = "https://www.google.com/search?q=latitude+longitude+of+75270+postal+code+paris+france&rlz=1C1CHBF_deDE740DE740&oq=latitude+longitude+of+75270+postal+code+paris+france&aqs=chrome..69i57.4020j0j8&sourceid=chrome&ie=UTF-8"
response = requests.get(url)
# Convert it to proper html
html = response.text
# Parse it in html document
soup = BeautifulSoup(html, 'html.parser')
# Grab the container and its content
target_container = soup.find("div", {"class": "Z0LcW XcVN5d"}).text
Then you have a string inside the div returned.
..Assuming google doesn't change the class declarations randomly. I tried five refreshes and the classname didn't change, but who knows.
Make sure you're using user-agent (you can also use python fake user-agents library)
Code and replit.com that grabs location from Google Search results:
from bs4 import BeautifulSoup
import requests
headers = {
'User-agent':
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
html = requests.get('https://www.google.com/search?q=latitude longitude of 75270 postal code paris france',
headers=headers).text
soup = BeautifulSoup(html, 'lxml')
location = soup.select_one('.XcVN5d').text
print(location)
Output:
48.8573° N, 2.3370° E

How to do scraping from a page with BeautifulSoup

The question asked is very simple, but for me, it doesn't work and I don't know!
I want to scrape the rating beer from this page https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone with BeautifulSoup, but it doesn't work.
This is my code:
import requests
import bs4
from bs4 import BeautifulSoup
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'
test_html = requests.get(url).text
soup = BeautifulSoup(test_html, "lxml")
rating = soup.findAll("span", class_="ratingValue")
rating
When I finish, it doesn't work, but if I do the same thing with another page is work... I don't know. Someone can help me? The result of rating is 4.58
Thanks everybody!
If you print the test_html, you'll find you get a 403 forbidden response.
You should add a header (at least a user-agent : ) ) to your GET request.
import requests
from bs4 import BeautifulSoup
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'
test_html = requests.get(url, headers=headers).text
soup = BeautifulSoup(test_html, 'html5lib')
rating = soup.find('span', {'itemprop': 'ratingValue'})
print(rating.text)
# 4.58
The reason behind getting forbidden status code (HTTP error 403) which means the server will not fulfill your request despite understanding the response. You will definitely get this error if you try scrape a lot of the more popular websites which will have security features to prevent bots. So you need to disguise your request!
For that you need use Headers.
Also you need correct your tag attribute whose data you're trying to get i.e. itemprop
use lxml as your tree builder, or any other of your choice
import requests
from bs4 import BeautifulSoup
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'
# Add this
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
test_html = requests.get(url, headers=headers).text
soup = BeautifulSoup(test_html, 'lxml')
rating = soup.find('span', {'itemprop':'ratingValue'})
print(rating.text)
The page you are requesting response as 403 forbidden so you might not be getting an error but it will provide you blank result as []. To avoid this situation we add user agent and this code will get you the desired result.
import urllib.request
from bs4 import BeautifulSoup
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
url = "https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone"
headers={'User-Agent':user_agent}
request=urllib.request.Request(url,None,headers) #The assembled request
response = urllib.request.urlopen(request)
soup = BeautifulSoup(response, "lxml")
rating = soup.find('span', {'itemprop':'ratingValue'})
rating.text
import requests
from bs4 import BeautifulSoup
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)
AppleWebKit/537.36
(KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southerntier-pumking
clone'
test_html = requests.get(url, headers=headers).text
soup = BeautifulSoup(test_html, 'html5lib')
rating = soup.find('span', {'itemprop': 'ratingValue'})
print(rating.text)
you are facing this error because some websites can't be scraped by beautiful soup. So for these kinds of websites, you have to use selenium
download latest chrome driver from this link according to your operating system
install selenium driver by this command "pip install selenium"
# import required modules
import selenium
from selenium import webdriver
from bs4 import BeautifulSoup
import time, os
curren_dir = os.getcwd()
print(curren_dir)
# concatinate web driver with your current dir && if you are using window change "/" to '\' .
# make sure , you placed chromedriver in current directory
driver = webdriver.Chrome(curren_dir+'/chromedriver')
# driver.get open url on your browser
driver.get('https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone')
time.sleep(1)
# it fetch data html data from driver
super_html = driver.page_source
# now convert raw data with 'html.parser'
soup=BeautifulSoup(super_html,"html.parser")
rating = soup.findAll("span",itemprop="ratingValue")
rating[0].text

Categories