Beautiful soup get text from Id - python

I try to catch the text of an id with BeautifulSoup. The result should be 30,66.
My actual code print the complete span element:
[<span class="mainValueAmount simpleTextFit" id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldValue">30,66</span>]
How do I get just the value 30,66?
from bs4 import BeautifulSoup
u = '<div class="widgetBox" data-name="pvEnergy"><div class="widgetHead">PV-Energie</div><div class="widgetBody"><div class="mainValue"><span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldValue" class="mainValueAmount simpleTextFit">30,66</span><span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldUnit" class="mainValueUnit">kWh</span><br><span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldPeriodTitle" class="mainValueDescription">Heute</span></div></div><div id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldTotalDiv" class="widgetFooter">Gesamt: <span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldTotalValue">158,953</span><span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldTotalUnit">MWh</span></div></div>'
idAktWert = 'ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldValue'
soup = BeautifulSoup(u, "html.parser")
aktWert = soup.select("#" + idAktWert)
print(aktWert)
Thanks for your help!

Use .text
Ex:
from bs4 import BeautifulSoup
u = '<div class="widgetBox" data-name="pvEnergy"><div class="widgetHead">PV-Energie</div><div class="widgetBody"><div class="mainValue"><span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldValue" class="mainValueAmount simpleTextFit">30,66</span><span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldUnit" class="mainValueUnit">kWh</span><br><span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldPeriodTitle" class="mainValueDescription">Heute</span></div></div><div id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldTotalDiv" class="widgetFooter">Gesamt: <span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldTotalValue">158,953</span><span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldTotalUnit">MWh</span></div></div>'
idAktWert = 'ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldValue'
soup = BeautifulSoup(u, "html.parser")
aktWert = soup.select("#" + idAktWert)[0] #Note: I have used Index to select the first element in list.
print(aktWert.text)
Output:
30,66

You simply need get_text() for this.
from bs4 import BeautifulSoup
u = '<div class="widgetBox" data-name="pvEnergy"><div class="widgetHead">PV-Energie</div><div class="widgetBody"><div class="mainValue"><span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldValue" class="mainValueAmount simpleTextFit">30,66</span><span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldUnit" class="mainValueUnit">kWh</span><br><span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldPeriodTitle" class="mainValueDescription">Heute</span></div></div><div id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldTotalDiv" class="widgetFooter">Gesamt: <span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldTotalValue">158,953</span><span id="ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldTotalUnit">MWh</span></div></div>'
idAktWert = 'ctl00_ContentPlaceHolder1_PublicPagePlaceholder1_PageUserControl_ctl00_PublicPageLoadFixPage_energyYieldWidget_energyYieldValue'
soup = BeautifulSoup(u, "html.parser")
aktWert = soup.select("#" + idAktWert)
// since aktWert is an array, we need to get the 1st index
print(aktWert[0].get_text()) // outputs 30,66

Related

Get the text from the nested span tag with beautifulsoup

I have an html file:
...
<span class="value">401<span class="Suffix">st</span></span>
...
and I want to get only first span tag text which is 401 but when I run:
>>> get_text = soup.find(class_ = 'value').text
>>> print(get_text)
401st
the output contains inner spans text(st).
This will work fine:
from bs4 import BeautifulSoup
s = '''<span class="value">401<span class="Suffix">st</span></span>'''
soup = BeautifulSoup(s, 'html.parser')
get_text = soup.find(class_='value')
print(get_text.contents[0])
Output
401
Use .strings. This is a #property that gives a generator of individual string elements. .text or .get_text is what joins those strings together.
>>> soup = bs4.BeautifulSoup('<span class="value">401<span class="Suffix">st</span></span>')
>>> t = soup.find(class_='value')
>>> next(t.strings)
'401'
>>> list(t.strings)
['401', 'st']
You need to use recursive=False in BeautifulSoup
<span class="value">401<span class="Suffix">st</span></span>
soup = BeautifulSoup(html)
all_parent_p = soup.find_all('p', recursive=False)
for parent_p in all_parent_p:
ptext = parent_p.find(text=True, recursive=False)

Get html text with Beautiful Soup

I'm trying to get the number from inside a div:
<div class="tv-symbol-price-quote__value js-symbol-last">122.7<span class="">8</span></div>
I need the 122.7 number, but I cant get it. I have tried with:
strings = soup.find("div", class_="tv-symbol-price-quote__value js-symbol-last").string
But, there are more than one element and I receive "none".
Is there a way to print the childs and get the string from childs?
Use .getText().
For example:
from bs4 import BeautifulSoup
sample_html = """
<div class="tv-symbol-price-quote__value js-symbol-last">122.7<span class="">8</span></div>
"""
soup = BeautifulSoup(sample_html, "html.parser")
strings = soup.find("div", class_="tv-symbol-price-quote__value js-symbol-last").getText()
print(strings)
Output:
122.78
Or use __next__() to get only the 122.7.
soup = BeautifulSoup(sample_html, "html.parser")
strings = soup.find("div", class_="tv-symbol-price-quote__value js-symbol-last").strings.__next__()
print(strings)
Output:
122.7
To only get the first text, search for the tag, and call the next_element method.
from bs4 import BeautifulSoup
html = """
<div class="tv-symbol-price-quote__value js-symbol-last">122.7<span class="">8</span></div>
"""
soup = BeautifulSoup(html, "html.parser")
print(
soup.find("div", class_="tv-symbol-price-quote__value js-symbol-last").next_element
)
Output:
122.7
You could use selenium to find the element and then use BS4 to parse it.
An example would be
import selenium.webdriver as WD
from selenium.webdrive.chrome.options import Options
import bs4 as B
driver = WD.Chrome()
objXpath = driver.find_element_by_xpath("""yourelementxpath""")
objHtml = objXpath.get_attribute("outerHTML")
soup = B.BeutifulSoup(objHtml, 'html.parser')
text = soup.get_text()
This code should work.
DISCLAIMER
I haven't done work w/ selenium and bs4 in a while so you might have to tweak it a little bit.

python nested Tags (beautiful Soup)

I used beautiful soup using python to get data from a specific website
but I don't know how to get one of these prices but I want the price in gram (g)
AS shown below this is the HTML codeL:
<div class="promoPrice margBottom7">16,000
L.L./200g<br/><span class="kiloPrice">79,999
L.L./Kg</span></div>
I use this code:
p_price = product.findAll("div{"class":"promoPricemargBottom7"})[0].text
my result was:
16,000 L.L./200g 79,999 L.L./Kg
but i want to have:
16,000 L.L./200g
only
You will need to first decompose the span inside the div element:
from bs4 import BeautifulSoup
h = """
<div class="promoPrice margBottom7">16,000 L.L./200g<br/>
<span class="kiloPrice">79,999 L.L./Kg</span></div>
"""
soup = BeautifulSoup(h, "html.parser")
element = soup.find("div", {'class': 'promoPrice'})
element.span.decompose()
print(element.text)
#16,000 L.L./200g
Try using soup.select_one('div.promoPrice').contents[0]
from bs4 import BeautifulSoup
html = """<div class="promoPrice margBottom7">16,000 L.L./200g<br/>
<span class="kiloPrice">79,999 L.L./Kg</span></div>"""
soup = BeautifulSoup(html, features='html.parser')
# value = soup.select('div.promoPrice > span') # for 79,999 L.L./Kg
value = soup.select_one('div.promoPrice').contents[0]
print(value)
Prints
16,000 L.L./200g

Python BeautifulSoup - get values from p

html = '<p class="product-new-price">96<sup>33</sup> <span class="tether-target tether-enabled tether-element-attached-top tether-element-attached-left tether-target-attached-top tether-target-attached-right">Lei</span>
</p>'
soup = BeautifulSoup(html, 'html.parser')
sup_elem = soup.find("sup").string # 33 - it works
How do I get the "96" before the element ?
You can grab the previousSibling tag
from bs4 import BeautifulSoup
html = '''<p class="product-new-price">96<sup>33</sup> <span class="tether-target tether-enabled tether-element-attached-top tether-element-attached-left tether-target-attached-top tether-target-attached-right">Lei</span>
</p>'''
soup = BeautifulSoup(html, 'html.parser')
elem1 = soup.find("sup").previousSibling
elem2 = soup.find("sup").text # 33 - it works
print ('.'.join([elem1, elem2]))
Output:
96.33
You can use children method. It will return a list of all the children of p tag. (6 will be first child of it.
html = '<p class="product-new-price">96<sup>33</sup> <span class="tether-target tether-enabled tether-element-attached-top tether-element-attached-left tether-target-attached-top tether-target-attached-right">Lei</span>
</p>'
soup = BeautifulSoup(html, 'html.parser')
elem = list(soup.find("p").children)[0] #0th element of the list will be 96
sup_elem = soup.find("sup").string
result = elem + '.' + sup_elem #96.33
Use select instead.
from bs4 import BeautifulSoup
html = '''<p class="product-new-price">96<sup>33</sup> <span class="tether-target tether-enabled tether-element-attached-top tether-element-attached-left tether-target-attached-top tether-target-attached-right">Lei</span>
</p>'''
soup = BeautifulSoup(html, 'html.parser')
print(soup.select_one('.product-new-price').text.strip().replace('Lei',''))
There is no "." in source but you can always divide by 100
print(int(soup.select_one('.product-new-price').text.strip().replace('Lei',''))/100)

how to extract an attribute value of div using BeautifulSoup

I have a div whose id is "img-cont"
<div class="img-cont-box" id="img-cont" style='background-image: url("http://example.com/example.jpg");'>
I want to extract the url in background-image using beautiful soup.How can I do it?
You can you find_all or find for the first match.
import re
soup = BeautifulSoup(html_str)
result = soup.find('div',attrs={'id':'img-cont','style':True})
if result is not None:
url = re.findall('\("(http.*)"\)',result['style']) # return a list.
Try this:
import re
from bs4 import BeautifulSoup
html = '''\
<div class="img-cont-box" \
id="img-cont" \
style='background-image: url("http://example.com/example.jpg");'>\
'''
soup = BeautifulSoup(html, 'html.parser')
div = soup.find('div', id='img-cont')
print(re.search(r'url\("(.+)"\)', div['style']).group(1))

Categories