BeautifulSoup - extracting text from multiple span elements w/o classes - python

So that's how HTML looks:
<p class="details">
<span>detail1</span>
<span class="number">1</span>
<span>detail2</span>
<span>detail3</span>
</p>
I need to extract detail2 & detail3.
But with this piece of code I only get detail1.
info = data.find("p", class_ = "details").span.text
How do I extract the needed items?
Thanks in advance!

Select your elements more specifically; in your case, all following sibling <span> elements of the <span> with class number:
soup.select('span.number ~ span')
Example
from bs4 import BeautifulSoup
html='''<p class="details">
<span>detail1</span>
<span class="number">1</span>
<span>detail2</span>
<span>detail3</span>
</p>'''
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed (and to avoid bs4's "no parser specified" warning).
soup = BeautifulSoup(html, 'html.parser')
# "span.number ~ span" matches every <span> sibling that comes after the
# <span class="number"> element; print the list so a script run shows it.
print([t.text for t in soup.select('span.number ~ span')])
Output
['detail2', 'detail3']

You can find all <span>s and do normal indexing:
from bs4 import BeautifulSoup
html_doc = """\
<p class="details">
<span>detail1</span>
<span class="number">1</span>
<span>detail2</span>
<span>detail3</span>
</p>"""
soup = BeautifulSoup(html_doc, "html.parser")
# Collect every <span> inside the <p class="details"> element.
spans = soup.find("p", class_="details").find_all("span")
# The wanted items are the last two spans, so slice from the end of the list.
for s in spans[-2:]:
    print(s.text)
Prints:
detail2
detail3
Or CSS selectors:
# :nth-last-of-type(-n+2) keeps only the last two <span> siblings inside .details.
spans = soup.select(".details span:nth-last-of-type(-n+2)")
for s in spans:
    print(s.text)
Prints:
detail2
detail3

Related

How to just get the content of the tag when you use findAll in Beautiful Soup?

So on the project I'm building I want to find the price contained on the multiple results I got with the findAll() command. Here's the code:
soup = BeautifulSoup(driver.page_source, 'html.parser')
price = soup.find_all(class_='search-result__market-price--value')
print(price)
And this is what I get:
[<span class="search-result__market-price--value" tabindex="-1"> $0.11 </span>, <span class="search-result__market-price--value" tabindex="-1"> $0.24 </span>, ... ]
I tried using this code I found somewhere else price = soup.find_all(class_='search-result__market-price--value')[0].string but it just gave the error IndexError: list index out of range.
What can I do to just get the numbers?
Iterate the ResultSet created by find_all():
soup = BeautifulSoup(driver.page_source, 'html.parser')
# find_all() returns a ResultSet of tags; read .text from each tag in turn
# rather than from the ResultSet itself.
for price in soup.find_all(class_='search-result__market-price--value'):
    print(price.text)
or to just get the numbers
print(price.text.split('$')[-1])
Example
from bs4 import BeautifulSoup
html='''
<span class="search-result__market-price--value" tabindex="-1"> $0.11 </span>
<span class="search-result__market-price--value" tabindex="-1"> $0.24 </span>
'''
soup = BeautifulSoup(html, 'html.parser')
for tag in soup.find_all('span'):
    # Everything after the last '$' is the numeric part of the price text.
    print(tag.text.split('$')[-1])
Output
0.11
0.24

How to BeautifulSoup getting value that following div class

I'm trying to extract the " 24.8 " from the following HTML code:
<div class="anlik-sicaklik">
<div class="anlik-sicaklik-deger ng-binding" ng-bind="sondurum[0].sicaklik | comma">
24,8
::after
</div>
<div class="anlik-sicaklik-havadurumu">
<div class="anlik-sicaklik-havadurumu-ikonu">
Here's my code
import requests
from bs4 import BeautifulSoup
r = requests.get("https://mgm.gov.tr/tahmin/il-ve-ilceler.aspx?il=ANTALYA&ilce=KUMLUCA")
soup = BeautifulSoup(r.content, "lxml")
sicaklik = soup.find('div', {'class':'anlik-sicaklik-deger'})
print(sicaklik)
My code's output
<div class="anlik-sicaklik-deger" ng-bind="sondurum[0].sicaklik | comma">
</div>
could you please help me to get 24,8 value?
Your question is more about parsing a string than about parsing a web page. So, once you have found the tag with bs4, it is better to parse its string with a regex.
The pattern ([0-9]+,[0-9]) matches one or more digits, then a comma, then a single digit.
Notice that the final result, nr, is a string; to make it a number you should use float(nr.replace(',', '.')).
# Find the temperature <div> and pull the "24,8"-style value out of its text.
from bs4 import BeautifulSoup
import re
html = """
<div class="anlik-sicaklik-deger ng-binding" ng-bind="sondurum[0].sicaklik | comma">
24,8
::after
</div>
"""
soup = BeautifulSoup(html, 'lxml')
# string=True restricts the match to a div that has string content of its own.
div = soup.find('div', class_='anlik-sicaklik-deger', string=True)
# Take the div's string and trim the surrounding whitespace/newlines.
text = str(div.string).strip()
# Match digits, a comma, then one digit; group(0) is the whole matched text.
nr = re.search(r'([0-9]+,[0-9])', text).group(0)
print(nr)
Output
24,8
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# .text returns the text inside the matched <div> (the stray ** markers around
# it were Markdown bold, not Python).
sicaklik = soup.find('div', {'class':'anlik-sicaklik-deger'}).text

How to extract exact information by span class using Beautiful Soup

This is my code and output for my price monitoring code:
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find(id="result_0_name").get_text()
price = soup.find("span", class_ = "normal_price")
#converted_price = price[0:3]
print(price.get_text())
print(title.strip())
the output is as follows
Starting at:
$0.70 USD
$0.67 USD
Operation Broken Fang Case
and html of the page is as so
<span class="market_table_value normal_price">Starting at:<br/>
<span class="normal_price" data-currency="1" data-price="69">$0.69 USD</span>
<span class="sale_price">$0.66 USD</span>
</span>
as you can see there is no ID, so I cannot use that, I only wish to display the 'normal_price' and not the other data in that span. Any ideas?
Just make the selection of the span more specific, for example use the fact, that it is an element inside an element:
soup.select_one('span > span.normal_price').get_text()
Example
from bs4 import BeautifulSoup
html='''
<span class="market_table_value normal_price">
Starting at:<br/>
<span class="normal_price" data-currency="1" data-price="69">$0.69 USD</span>
<span class="sale_price">$0.66 USD</span>
</span>
'''
soup = BeautifulSoup(html,'html.parser')
# "span > span.normal_price" only matches a normal_price span that is the direct
# child of another span, which excludes the outer wrapper span.
price = soup.select_one('span > span.normal_price').get_text()
# Print explicitly: a bare expression shows nothing when run as a script.
print(price)
Output
$0.69 USD
You can also try like this
from bs4 import BeautifulSoup
html ="""<span class="market_table_value normal_price">
Starting at:<br/>
<span class="normal_price" data-currency="1" data-price="69">$0.69 USD</span>
<span class="sale_price">$0.66 USD</span>
</span>"""
soup = BeautifulSoup(html, 'html.parser')
# Exact attribute match: data-currency is exactly "1".
price = soup.select_one('span[data-currency="1"]').get_text()
# Prefix attribute match: data-currency starts with "1".
price = soup.select_one('span[data-currency^="1"]').get_text()
print(price) #$0.69 USD

Extract content of div tag except other tags using BeautifulSoup

I have below HTML content, wherein div tag looks like below
<div class="block">aaa
<p> bbb</p>
<p> ccc</p>
</div>
From above I want to extract text only as "aaa" and not other tags content.
When I do,
soup.find('div', {"class": "block"})
it gives me all the content as text and I want to avoid the contents of p tag.
Is there a method available in BeautifulSoup to do this?
Check the type of each child element. You could try:
from bs4 import BeautifulSoup
from bs4 import element
s = '''
<div class="block">aaa
<p> bbb</p>
<p> ccc</p>
<h1>ddd</h1>
</div>
'''
soup = BeautifulSoup(s, "lxml")
# Iterating a tag yields its direct children: text nodes and nested tags.
for e in soup.find('div', {"class": "block"}):
    # Keep only plain text nodes (exact type, so nested tags are skipped)
    # and drop whitespace-only strings.
    if type(e) is element.NavigableString and e.strip():
        print(e.strip())
# aaa
And this will ignore all text in sub tags.
You can remove the p tags from that div, which effectively gives you the aaa text.
Here's how:
from bs4 import BeautifulSoup
sample = """<div class="block">aaa
<p> bbb</p>
<p> ccc</p>
</div>
"""
s = BeautifulSoup(sample, "html.parser")
block = s.find("div", class_="block")
# Detach every <p> from the div so that only the div's own text remains.
for p_tag in block.find_all("p"):
    p_tag.extract()
# Read the text from the div itself rather than from the whole document.
print(block.text.strip())
Output:
aaa
You can use find_next(), which returns the first match found:
from bs4 import BeautifulSoup
html = '''
<div class="block">aaa
<p> bbb</p>
<p> ccc</p>
</div>
'''
soup = BeautifulSoup(html, "html.parser")
# string=True matches the first text node after the div's opening tag;
# "string" is the current name of the argument formerly called "text".
print(soup.find('div', {"class": "block"}).find_next(string=True))
Output:
aaa

Scrape 2 inner texts within div as one value

I have the following html
<div class="price-block__highlight"><span class="promo-price" data-
test="price">102,
<sup class="promo-price__fraction" data-test="price-fraction">99</sup>
</span>
</div>
I want to print the price of this html without comma, so
print price should result in:
102.99
I have the following code
pricea = page_soup.find("div", {"class":"price-block__highlight"})
price = str(pricea.text.replace('-','').replace(',','.').strip())
print price
This however results in:
102.
99
When writing in a csv it creates multiple rows. How to get both numbers in one value?
I think you are using bs4:
from bs4 import BeautifulSoup
html_doc = """
<div class="price-block__highlight"><span class="promo-price" data-
test="price">102,
<sup class="promo-price__fraction" data-test="price-fraction">99</sup>
</span>
</div>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
price_div = soup.find("div", {"class": 'price-block__highlight'})
# Split the div's text at the comma, trim each piece, and rejoin with a dot
# so the whole and fractional parts end up in a single value.
pieces = price_div.text.split(',')
print('.'.join(piece.strip() for piece in pieces))
Output
102.99

Categories