#Days
try:
days2.append(link2.find_all('div',{'class':'list-card variable-text list-card-img-overlay'}).text)
except Exception:
days2.append('N/A')
#Views
try:
views.append(link2.find_all('div',{'class':'Text-c11n-8-53-2__sc-aiai24-0 duChdW'}[2]).text)
except Exception:
views.append('N/A')
https://www.zillow.com/manhattan-new-york-ny-10023/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%2210023%22%2C%22mapBounds%22%3A%7B%22west%22%3A-73.9951494604187%2C%22east%22%3A-73.9682415395813%2C%22south%22%3A40.763770638446054%2C%22north%22%3A40.7898340773195%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A61637%2C%22regionType%22%3A7%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A15%7D
enter image description here
keep getting N/A instead of 2hours and 88views
If you want to extract the time on Zillow data you mentioned in the comments, you could search for divs with the appropriate class name.
from bs4 import BeautifulSoup
html = """
<div class="hdp__sc-qe1dn6-1 jqZymu"><div class="Text-c11n-8-53-2__sc-aiai24-0 iBdXNb">Time on Zillow</div><div class="Text-c11n-8-53-2__sc-aiai24-0 duChdW">36 minutes</div></div>
"""
soup = BeautifulSoup(html, 'html.parser')
time = soup.find('div', class_="Text-c11n-8-53-2__sc-aiai24-0 duChdW")
print(time.text)
# 36 minutes
Related
I want to get 8.9 from follow html tag by using BeautifulSoup.
<div rating-value="8.9" ratings-count="23" product-url="lenovo-v14-ada-amd-ryzen-3-3250u-8-gb-vram-256-gb-ssd-14-inch-windows-home-1-82c6006cuk/version.asp" class="ng-isolate-scope">
import requests
from bs4 import BeautifulSoup
import pandas as pd
website = 'https://www.laptopsdirect.co.uk/ct/laptops-and-netbooks/laptops?fts=laptops'
response = requests.get(website)
soup = BeautifulSoup(response.content, 'lxml')
results = soup.find_all('div', class_='OfferBox')
name = results[0].find('a', class_='offerboxtitle').get_text()
price = results[0].find('span', class_='offerprice').get_text()
review_rating = results[0].find('')
print(review_rating)
I tried:
review_rating = results[0].find('div.rating-value')
None
review_rating = results[0].find('div')['rating-value']
KeyError: 'rating-value'
I'm not familiar with BeautifulSoup yet, so I failed.
Please teach me how to get 8.9?
Thanks
You might use .get method for retrieving attributes values as follows
from bs4 import BeautifulSoup
html = '''<div rating-value="8.9" ratings-count="23" product-url="lenovo-v14-ada-amd-ryzen-3-3250u-8-gb-vram-256-gb-ssd-14-inch-windows-home-1-82c6006cuk/version.asp" class="ng-isolate-scope">'''
soup = BeautifulSoup(html, "html.parser")
print(soup.find("div").get("rating-value"))
output
8.9
Keep in mind that what .get return is str ("8.9").
You are looking for the data in wrong tag. The HTML shows the data inside a <div> but in the soup, it is present inside <star-rating>.
The rating is present as an attribute of a tag called <star-rating>. Just extract the data from it.
price = results[0].find('span', class_='offerprice').get_text()
review_rating = results[0].find('star-rating').get('rating-value')
print(review_rating)
8.9
I am having trouble printing the element inside a div.
so this is the tag that I want to scrape
div class="page-box house-lst-page-box" comp-module="page" page-url="/ershoufang/miyun/pg{page}" page-data="{"totalPage":73,"curPage":1}"
I want my code to print the the integer inside totalPage, which is 73.
thanks in advance!
Try:
import json
from bs4 import BeautifulSoup
html_doc = """<div class="page-box house-lst-page-box" comp-module="page" page-url="/ershoufang/miyun/pg{page}" page-data="{"totalPage":73,"curPage":1}"><a class="on" href="/ershoufang/miyun/" data-page="1">1</a>23<span>...</span>73下一页</div>"""
soup = BeautifulSoup(html_doc, "html.parser")
data = soup.select_one("div[page-data]")["page-data"]
data = json.loads(data)
print("Total page:", data["totalPage"])
Prints:
Total page: 73
I am somewhat new to Python and can't for the life of me figure out why the following code isn’t pulling the element I am trying to get.
It currently returns:
for player in all_players:
player_first, player_last = player.split()
player_first = player_first.lower()
player_last = player_last.lower()
first_name_letters = player_first[:2]
last_name_letters = player_last[:5]
player_url_code = '/{}/{}{}01'.format(last_name_letters[0], last_name_letters, first_name_letters)
player_url = 'https://www.basketball-reference.com/players' + player_url_code + '.html'
print(player_url) #test
req = urlopen(player_url)
soup = bs.BeautifulSoup(req, 'lxml')
wrapper = soup.find('div', id='all_advanced_pbp')
table = wrapper.find('div', class_='table_outer_container')
for td in table.find_all('td'):
player_pbp_data.append(td.get_text())
Currently returning:
--> for td in table.find_all('td'):
player_pbp_data.append(td.get_text()) #if this works, would like to
AttributeError: 'NoneType' object has no attribute 'find_all'
Note: iterating through children of the wrapper object returns:
< div class="table_outer_container" > as part of the tree.
Thanks!
Make sure that table contains the data you expect.
For example https://www.basketball-reference.com/players/a/abdulka01.html doesn't seem to contain a div with id='all_advanced_pbp'
Try to explicitly pass the html instead:
bs.BeautifulSoup(the_html, 'html.parser')
I trie to extract data from the url you gave but it did not get full DOM. after then i try to access the page with browser with javascrip and without javascrip, i know website need javascrip to load some data. But the page like players it need not. The simple way to get dynamic data is using selenium
This is my test code
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
player_pbp_data = []
def get_list(t="a"):
with requests.Session() as se:
url = "https://www.basketball-reference.com/players/{}/".format(t)
req = se.get(url)
soup = BeautifulSoup(req.text,"lxml")
with open("a.html","wb") as f:
f.write(req.text.encode())
table = soup.find("div",class_="table_wrapper setup_long long")
players = {player.a.text:"https://www.basketball-reference.com"+player.a["href"] for player in table.find_all("th",class_="left ")}
def get_each_player(player_url="https://www.basketball-reference.com/players/a/abdulta01.html"):
with webdriver.Chrome() as ph:
ph.get(player_url)
text = ph.page_source
'''
with requests.Session() as se:
text = se.get(player_url).text
'''
soup = BeautifulSoup(text, 'lxml')
try:
wrapper = soup.find('div', id='all_advanced_pbp')
table = wrapper.find('div', class_='table_outer_container')
for td in table.find_all('td'):
player_pbp_data.append(td.get_text())
except Exception as e:
print("This page dose not contain pbp")
get_each_player()
I am practicing on Beautiful Soup and am after a products price, description and item number. The first 2 are text and are easy to get. The third is an attribute of the tag data-trade-price as seen below:-
<div class="price-group display-metro has-promo-price medium ng-scope" ng-class="{'has-trade-price': ShowTrade}" data-trade-price="221043">
I am after the numbers such as 221043 which is loaded in by the page. IE - all 24 item numbers matching all 24 products
My code is:-
import requests
r = requests.get('http://www.supercheapauto.com.au/store/car-care/wash-wax-polish/1021762?page=1&pageSize=24&sort=-ProductSummaryPurchasesWeighted%2C-ProductSummaryPurchases')
from bs4 import BeautifulSoup
soup = BeautifulSoup(r.text, 'lxml')
results = soup.find_all('div', class_='details')
for result in results:
try:
SKU = result.select_one("data-trade-price")
except AttributeError: SKU = "N/A"
DESC = result.find('div', class_='title').text.strip().upper()
PRICE = result.find('span', class_='currency').text.strip().upper()
print(SKU,'\t', DESC,'\t', PRICE)
What is the syntax to get the item number from the soup?
Sorry - I am after the syntax that can iterate through the page of 24 products and recover the 24 different item numbers. The example given was to show the part of the attribute value that I was after. I ran the given answer and it works. I am unsure of how to integrate into the code given as the variations I use do not. Any suggestions.
You can access the attribute just like a dictionary.
Ex:
from bs4 import BeautifulSoup
s = """<div class="price-group display-metro has-promo-price medium ng-scope" ng-class="{'has-trade-price': ShowTrade}" data-trade-price="221043"<\div>"""
soup = BeautifulSoup(s, "html.parser")
print( soup.find("div", class_="price-group display-metro has-promo-price medium ng-scope").attrs["data-trade-price"] )
or
print( soup.find("div", class_="price-group display-metro has-promo-price medium ng-scope")["data-trade-price"] )
Output:
221043
I have written code to extract the url and title of a book using BeautifulSoup from a page.
But it is not extracting the name of the book Astounding Stories of Super-Science April 1930 between > and </a> tags.
How can I extract the name of the book?
I have tried the findnext method recommended in another question, but I get an AttributeError on that.
HTML:
<li>
<a class="extiw" href="//www.gutenberg.org/ebooks/29390" title="ebook:29390">Astounding Stories of Super-Science April 1930</a>
<a class="image" href="/wiki/File:BookIcon.png"><img alt="BookIcon.png" height="16" src="//www.gutenberg.org/w/images/9/92/BookIcon.png" width="16"/></a>
(English)
</li>
Code below:
def make_soup(BASE_URL):
r = requests.get(BASE_URL, verify = False)
soup = BeautifulSoup(r.text, 'html.parser')
return soup
def extract_text_urls(html):
soup = make_soup(BASE_URL)
for li in soup.findAll('li'):
try:
try:
print li.a['href'], li.a['title']
print "\n"
except KeyError:
pass
except TypeError:
pass
extract_text_urls(filename)
You should use the text attribute of the element. The following works for me:
def make_soup(BASE_URL):
r = requests.get(BASE_URL)
soup = BeautifulSoup(r.text, 'html.parser')
return soup
def extract_text_urls(html):
soup = make_soup(BASE_URL)
for li in soup.findAll('li'):
try:
try:
print li.a['href'], li.a.text
print "\n"
except KeyError:
pass
except TypeError:
pass
extract_text_urls('http://www.gutenberg.org/wiki/Science_Fiction_(Bookshelf)')
I get the following output for the element in question
//www.gutenberg.org/ebooks/29390 Astounding Stories of Super-Science April 1930
According to the BeautifulSoup documentation the .string property should accomplish what you are trying to do, by editing your original listing this way:
# ...
try:
print li.a['href'], li.a['title']
print "\n"
print li.a.string
except KeyError:
pass
# ...
You probably want to surround it with something like
if li.a['class'] == "extiw":
print li.a.string
since, in your example, only the anchors of class extiw contain a book title.
Thanks #wilbur for pointing out the optimal solution.
I did not see how you can extract the text within the tag. I would do something like this:
from bs4 import BeatifulSoup as bs
from urllib2 import urlopen as uo
soup = bs(uo(html))
for li in soup.findall('li'):
a = li.find('a')
book_title = a.contents[0]
print book_title
To get just the text that is not inside any tags use the get_text() method. It is in the documentation here.
I can't test it because I don't know the url of the page you are trying to scrape, but you can probably just do it with the li tag since there doesn't seem to be any other text.
Try replacing this:
for li in soup.findAll('li'):
try:
try:
print li.a['href'], li.a['title']
print "\n"
except KeyError:
pass
except TypeError:
pass
with this:
for li in soup.findAll('li'):
try:
print(li.get_text())
print("\n")
except TypeError:
pass