How to extract the content using beautifulsoup - python

I want to try to extract the product name and price from the website using beautifulsoup. But I do not know how to extract the content.
Python code:
from bs4 import BeautifulSoup
import re
# Sample markup for one search result, kept on a single line so the
# single-quoted literal is valid Python.
div = '<div pagetype="simple_table_nonFashion" class="itemBox" id="itemSearchResultCon_679026"><p class="proPrice"><em class="num" id="price0_679026" productid="679026" adproductflag="0" yhdprice="49.9" productunit="" diapernum="0" diapernumunit=""><b>¥</b>49.90</em></p><p class="proName clearfix"><a id="pdlink2_679026" pmid="0" href="//item.yhd.com/679026.html"><style type="text/css">.preSellOrAppoint {border: 1px solid #FFFFFF;}</style>印尼进口</a></p></div>'
soup = BeautifulSoup(div, "lxml")
itemBox = soup.find("div", {"class": "itemBox"})
# .text joins the text of every descendant, so the <b> and <style> contents
# are included in the two values printed below.
proPrice = itemBox.find("p", {"class": "proPrice"}).find("em").text
pdlink2 = itemBox.find('a',{"id": re.compile('pdlink2_*')}).text
print(proPrice)
print(pdlink2)
Print out the result:
¥49.90
.preSellOrAppoint {border: 1px solid #FFFFFF;}印尼进口
The picture:
My expected result is the content:
49.90
印尼进口

With soup.select_one() method:
from bs4 import BeautifulSoup
# Sample markup for one search-result item.
div = '''<div pagetype="simple_table_nonFashion" class="itemBox"
id="itemSearchResultCon_679026"><p class="proPrice"><em class="num"
id="price0_679026" productid="679026" adproductflag="0" yhdprice="49.9"
productunit="" diapernum="0" diapernumunit=""><b>¥</b>49.90</em></p><p
class="proName clearfix"><a id="pdlink2_679026" pmid="0"
href="//item.yhd.com/679026.html"><style type="text/css">.preSellOrAppoint
{border: 1px solid #FFFFFF;}</style>印尼进口</a></p></div>'''
soup = BeautifulSoup(div, "lxml")
# Locate the two elements with CSS selectors first.
price_em = soup.select_one("p.proPrice em")
name_link = soup.select_one('p.proName > a')
# In both elements the wanted text is the last child node, after the
# <b> / <style> child.
proPrice = price_em.contents[-1]
pdlink2 = name_link.contents[-1]
print(proPrice)
print(pdlink2)
The output:
49.90
印尼进口
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors

Here's the code based on the BeautifulSoup object you provided:
from bs4 import BeautifulSoup
import re
# Sample markup for one search-result item.
div = '<div pagetype="simple_table_nonFashion" class="itemBox" id="itemSearchResultCon_679026"><p class="proPrice"><em class="num" id="price0_679026" productid="679026" adproductflag="0" yhdprice="49.9" productunit="" diapernum="0" diapernumunit=""><b>¥</b>49.90</em></p><p class="proName clearfix"><a id="pdlink2_679026" pmid="0" href="//item.yhd.com/679026.html"><style type="text/css">.preSellOrAppoint {border: 1px solid #FFFFFF;}</style>印尼进口</a></p></div>'
soup = BeautifulSoup(div, "lxml")
# Grab the first <b> and the first <style> tag in the document.
currency_tag = soup.find("b")
style_tag = soup.find("style")
# The wanted text is the node that immediately follows each of those tags.
proPrice = currency_tag.next_sibling
pdlink2 = style_tag.next_sibling
print(proPrice)
print(pdlink2)
.next_sibling allows you to access the text outside of the <b> and <style> tags.

Related

How to BeautifulSoup getting value that following div class

I'm trying to extract the " 24.8 " from the following HTML code:
<div class="anlik-sicaklik">
<div class="anlik-sicaklik-deger ng-binding" ng-bind="sondurum[0].sicaklik | comma">
24,8
::after
</div>
<div class="anlik-sicaklik-havadurumu">
<div class="anlik-sicaklik-havadurumu-ikonu">
Here's my code
import requests
from bs4 import BeautifulSoup
# Download the page and parse the raw response bytes with lxml.
page = requests.get("https://mgm.gov.tr/tahmin/il-ve-ilceler.aspx?il=ANTALYA&ilce=KUMLUCA")
soup = BeautifulSoup(page.content, "lxml")
# Look up the first <div> carrying the anlik-sicaklik-deger class.
sicaklik = soup.find('div', class_='anlik-sicaklik-deger')
print(sicaklik)
My code's output
<div class="anlik-sicaklik-deger" ng-bind="sondurum[0].sicaklik | comma">
</div>
could you please help me to get 24,8 value?
Your question is more about parsing a string than about parsing a web page. So, once you have found the tag with bs4, it is better to parse its string with a regex.
The pattern ([0-9]+,[0-9]) matches one or more digits, followed by a comma, followed by a single digit.
Note that the final result, nr, is a string; to turn it into a number, use float(nr.replace(',', '.')).
from bs4 import BeautifulSoup
import re
html = """
<div class="anlik-sicaklik-deger ng-binding" ng-bind="sondurum[0].sicaklik | comma">
24,8
::after
</div>
"""
# One or more digits, a comma, then a single digit (e.g. "24,8").
number_pattern = re.compile(r'([0-9]+,[0-9])')
soup = BeautifulSoup(html, 'lxml')
# string=True keeps only a <div> that has a .string to read.
div = soup.find('div', class_='anlik-sicaklik-deger', string=True)
# Plain-text content of the tag, without the surrounding whitespace.
text = str(div.string).strip()
# Pull the number out of the text.
found = number_pattern.search(text)
nr = found.group(0)
print(nr)
Output
24,8
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# .text returns the text contained in the matched <div>.
sicaklik = soup.find('div', {'class':'anlik-sicaklik-deger'}).text

beautifulsoup find_all title

html is
<div class="trn-defstat__value">
<img src="https://trackercdn.com/rainbow6-ubi/assets/images/badge-ash.16913d82e3.png" title="ASH" style="height: 35px; padding-right: 8px;">
<img src="https://trackercdn.com/rainbow6-ubi/assets/images/badge-jager.600b2773be.png" title="JÄGER" style="height: 35px; padding-right: 8px;">
<img src="https://trackercdn.com/rainbow6-ubi/assets/images/badge-bandit.385144d970.png" title="BANDIT" style="height: 35px; padding-right: 8px;">
</div>
I want to get each title value.
But before that, I write like this
from bs4 import BeautifulSoup as bs
import requests
# Fetch the profile page and parse it with the built-in HTML parser.
bsURL = "https://r6.tracker.network/profile/pc/Spoit.GODSENT"
response = requests.get(bsURL)
html = bs(response.text, 'html.parser')
# Collect every element with the trn-defstat__value class and keep the fifth.
value_blocks = html.find_all(class_='trn-defstat__value')
title = value_blocks[4]
print(title)
Result ->
<div class="trn-defstat__value">
<img src="https://trackercdn.com/rainbow6-ubi/assets/images/badge-ash.16913d82e3.png" style="height: 35px; padding-right: 8px;" title="ASH"/>
<img src="https://trackercdn.com/rainbow6-ubi/assets/images/badge-jager.600b2773be.png" style="height: 35px; padding-right: 8px;" title="JÄGER"/>
<img src="https://trackercdn.com/rainbow6-ubi/assets/images/badge-bandit.385144d970.png" style="height: 35px; padding-right: 8px;" title="BANDIT"/>
</div>
What should I do?
This script will print all <img> titles from Top Operators section:
from bs4 import BeautifulSoup as bs
import requests
# Fetch the profile page and parse it with the built-in HTML parser.
bsURL = "https://r6.tracker.network/profile/pc/Spoit.GODSENT"
respinse = requests.get(bsURL)
html = bs(respinse.text, 'html.parser')
# find Top Operators tag (string= is the current name of the old text= filter)
operators = html.find(class_='trn-defstat__name', string='Top Operators')
# The badges sit in the first <div> that follows the label.
for img in operators.find_next('div').find_all('img'):
    print(img['title'])
Prints:
ASH
JÄGER
BANDIT
Or using CSS:
# Every <img> inside the element that directly follows the "Top Operators" label.
# NOTE(review): newer Soup Sieve releases spell this pseudo-class
# :-soup-contains(); confirm against the installed version.
for img in html.select('.trn-defstat__name:contains("Top Operators") + * img'):
    print(img['title'])
Just use the .get() function to get the attribute and pass in the attribute name.
pip install html5lib
I suggest you use that, I believe it's a better parser.
from bs4 import BeautifulSoup as bs
import requests
# Fetch the profile page and parse the raw bytes with html5lib.
bsURL = "https://r6.tracker.network/profile/pc/Spoit.GODSENT"
respinse = requests.get(bsURL)
html = bs(respinse.content, 'html5lib')
# Narrow down to the container block first, then read the images inside it.
container = html.find("div", class_= "trn-defstat mb0 top-operators")
imgs = container.find_all("img")
for img in imgs:
    # .get() returns the attribute value, or None when the tag has no title.
    print(img.get("title"))
I did not fully understand which part of the site you were trying to scrape, but note that it often helps to first get the block of HTML that contains the details you want to scrape :)
This should help you:
from bs4 import BeautifulSoup
# Sample markup: one value block holding three badge images.
html = """
<div class="trn-defstat__value">
<img src="https://trackercdn.com/rainbow6-ubi/assets/images/badge-ash.16913d82e3.png" title="ASH" style="height: 35px; padding-right: 8px;">
<img src="https://trackercdn.com/rainbow6-ubi/assets/images/badge-jager.600b2773be.png" title="JÄGER" style="height: 35px; padding-right: 8px;">
<img src="https://trackercdn.com/rainbow6-ubi/assets/images/badge-bandit.385144d970.png" title="BANDIT" style="height: 35px; padding-right: 8px;">
</div>
"""
soup = BeautifulSoup(html,'html.parser')
# Print the title attribute of every <img> in the markup.
imgs = soup.find_all('img')
for img in imgs:
    print(img['title'])
Output:
ASH
JÄGER
BANDIT
Here is the complete code:
from bs4 import BeautifulSoup as bs
import requests
# Fetch the profile page and parse it with the built-in HTML parser.
bsURL = "https://r6.tracker.network/profile/pc/Spoit.GODSENT"
respinse = requests.get(bsURL)
html = bs(respinse.text, 'html.parser')
divs = html.find_all('div', class_="trn-defstat__value")
# Flatten the <img> tags of every value block into one list; a block without
# images yields an empty result and therefore contributes nothing.
imgs = [img for div in divs for img in div.find_all('img')]
for img in imgs:
    print(img['title'])
Output:
ASH
JÄGER
BANDIT

Selecting and stripping img src in HTML string

I'm interested in stripping the S3 credentials from image tags within a block of text that is represented as a string in Python.
For each tag in the string (of which there can be many), I'd like to start at ".jpeg", end at the next instance of a quotation mark, and delete everything in between those locations.
For example, the following string:
<p><img src="https://s3beanzoid.s3.us-east-2.amazonaws.com/media/django-summernote/2019-04-30/ec707c65-aa6d-4b81-a252-2fa1c1aef087.jpeg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJZALJ3EN746L6QWQ%2F20190430%2Fus-east-2%2Fs3%2Faws4_request&X-Amz-Date=20190430T021347Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=daf406a830d7d0f1ac2d631603b95e7e2ce0bdacd58d5a383d35f6dcd1466012" style="width: 50%; float: right;" class="note-float-right"><br></p><p><br></p><p> This is extra text in the body.</p>
Would become:
<p><img src="https://s3beanzoid.s3.us-east-2.amazonaws.com/media/django-summernote/2019-04-30/ec707c65-aa6d-4b81-a252-2fa1c1aef087.jpeg" style="width: 50%; float: right;" class="note-float-right"><br></p><p><br></p><p> This is extra text in the body.</p>
I'm struggling to figure out how to do this. Any help would be appreciated.
Thanks!
Regex is not the tool for the job. A more robust solution is using a HTML parser like BeautifulSoup to extract the src attribute of the img tag, and a URL parser to remove the query from the URL:
from bs4 import BeautifulSoup
from urllib.parse import urlsplit
input_str = '''<p><img src="https://s3beanzoid.s3.us-east-2.amazonaws.com/media/django-summernote/2019-04-30/ec707c65-aa6d-4b81-a252-2fa1c1aef087.jpeg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJZALJ3EN746L6QWQ%2F20190430%2Fus-east-2%2Fs3%2Faws4_request&X-Amz-Date=20190430T021347Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=daf406a830d7d0f1ac2d631603b95e7e2ce0bdacd58d5a383d35f6dcd1466012" style="width: 50%; float: right;" class="note-float-right"><br></p><p><br></p><p> This is extra text in the body.</p>'''
soup = BeautifulSoup(input_str, "html.parser")
# Work on the first <img> tag of the document.
img = soup.find('img')
# Split the URL, clear its query component, and write the rebuilt URL back.
img['src'] = urlsplit(img['src'])._replace(query=None).geturl()
print(soup)
Output:
<p><img class="note-float-right" src="https://s3beanzoid.s3.us-east-2.amazonaws.com/media/django-summernote/2019-04-30/ec707c65-aa6d-4b81-a252-2fa1c1aef087.jpeg" style="width: 50%; float: right;"/><br/></p><p><br/></p><p> This is extra text in the body.</p>
Edit: if you have more than one img tag per string, you can use:
input_str = '''<p><img src="https://s3beanzoid.s3.us-east-2.amazonaws.com/media/django-summernote/2019-04-30/ec707c65-aa6d-4b81-a252-2fa1c1aef087.jpeg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJZALJ3EN746L6QWQ%2F20190430%2Fus-east-2%2Fs3%2Faws4_request&X-Amz-Date=20190430T021347Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=daf406a830d7d0f1ac2d631603b95e7e2ce0bdacd58d5a383d35f6dcd1466012" style="width: 50%; float: right;" class="note-float-right"><br></p><p><br></p><p> This is extra text in the body.</p>
<img src="https://s3beanzoid.s3.us-east-2.amazonaws.com/media/django-summernote/2019-04-30/ec707c65-aa6d-4b81-a252-2fa1c1aef087.jpeg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJZALJ3EN746L6QWQ%2F20190430%2Fus-east-2%2Fs3%2Faws4_request&X-Amz-Date=20190430T021347Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=daf406a830d7d0f1ac2d631603b95e7e2ce0bdacd58d5a383d35f6dcd1466012" style="width: 50%; float: right;" class="note-float-right"><br><p><br></p><p> This is extra text in the body.</p>'''
soup = BeautifulSoup(input_str, "html.parser")
# Rewrite the src of every <img>, dropping the query part of its URL.
for img in soup.find_all('img'):
    img_url = img['src']
    new_url = urlsplit(img_url)._replace(query=None).geturl()
    img['src'] = new_url
# Print the modified document once, after all tags have been updated.
print(soup)
This will update the src attribute of each img tag:
<p><img class="note-float-right" src="https://s3beanzoid.s3.us-east-2.amazonaws.com/media/django-summernote/2019-04-30/ec707c65-aa6d-4b81-a252-2fa1c1aef087.jpeg" style="width: 50%; float: right;"/><br/></p><p><br/></p><p> This is extra text in the body.</p>
<img class="note-float-right" src="https://s3beanzoid.s3.us-east-2.amazonaws.com/media/django-summernote/2019-04-30/ec707c65-aa6d-4b81-a252-2fa1c1aef087.jpeg" style="width: 50%; float: right;"/><br/><p><br/></p><p> This is extra text in the body.</p>
Assuming the string is stored in s:
import re
# Drop whatever sits between ".jpeg" and the closing quotation mark. The quote
# is part of the match, so it is written back to keep the src attribute
# well-formed; re.sub returns a new string, which is stored back in s.
s = re.sub(r'\.jpeg[^"]*"', '.jpeg"', s)
This will look for spans that start with ".jpeg" and end at the next quotation mark, and remove the query string that follows ".jpeg".
Using re you can find and remove all between ? and "
# Raw string so the regex escape \? reaches the re module unchanged.
text = re.sub(r'\?[^"]+', '', text)
Example code
import re

# Input markup whose image URL carries a signed S3 query string.
text = '<p><img src="https://s3beanzoid.s3.us-east-2.amazonaws.com/media/django-summernote/2019-04-30/ec707c65-aa6d-4b81-a252-2fa1c1aef087.jpeg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJZALJ3EN746L6QWQ%2F20190430%2Fus-east-2%2Fs3%2Faws4_request&X-Amz-Date=20190430T021347Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=daf406a830d7d0f1ac2d631603b95e7e2ce0bdacd58d5a383d35f6dcd1466012" style="width: 50%; float: right;" class="note-float-right"><br></p><p><br></p><p> This is extra text in the body.</p>'
# The same markup with the query string removed.
expected_result = '<p><img src="https://s3beanzoid.s3.us-east-2.amazonaws.com/media/django-summernote/2019-04-30/ec707c65-aa6d-4b81-a252-2fa1c1aef087.jpeg" style="width: 50%; float: right;" class="note-float-right"><br></p><p><br></p><p> This is extra text in the body.</p>'
# Remove everything from "?" up to (not including) the next quotation mark.
# The pattern is a raw string so that \? is passed to the regex engine as written.
result = re.sub(r'\?[^"]+', '', text)
print(result == expected_result) # True
EDIT: if the text contains other ? and " characters outside the image URL, make the regex more specific, for example by anchoring it to .jpeg:
# Only strip a query string that directly follows ".jpeg"; raw string keeps
# the regex escapes intact.
result = re.sub(r'\.jpeg\?[^"]+', '.jpeg', text)
Use BeautifulSoup to parse the html and then use urlparse
Ex:
from bs4 import BeautifulSoup
# Import urlparse from wherever the running interpreter provides it.
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse  # Python 2
html = """<p><img src="https://s3beanzoid.s3.us-east-2.amazonaws.com/media/django-summernote/2019-04-30/ec707c65-aa6d-4b81-a252-2fa1c1aef087.jpeg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJZALJ3EN746L6QWQ%2F20190430%2Fus-east-2%2Fs3%2Faws4_request&X-Amz-Date=20190430T021347Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=daf406a830d7d0f1ac2d631603b95e7e2ce0bdacd58d5a383d35f6dcd1466012" style="width: 50%; float: right;" class="note-float-right"><br></p><p><br></p><p> This is extra text in the body.</p>"""
soup = BeautifulSoup(html, "html.parser")
for img in soup.find_all("img"):  # Find all img tags
    o = urlparse(img["src"])  # Split the URL into its components
    # Rebuild the URL from scheme, host and path only, leaving the query out.
    print(o.scheme + "://" + o.netloc + o.path)
Output:
https://s3beanzoid.s3.us-east-2.amazonaws.com/media/django-summernote/2019-04-30/ec707c65-aa6d-4b81-a252-2fa1c1aef087.jpeg

BeautifulSoup, change specific style attribute

I want to change only the background-color style with BeautifulSoup :
My html :
<td style="font-size: .8em; font-family: monospace; background-color: rgb(244, 244, 244);">
</td>
I would like to do something like this :
# Wished-for usage, shown by the asker as what they would like to write.
# NOTE(review): presumably td["style"] is a plain string, so it cannot be
# indexed by CSS property name like this — confirm.
soup_td = BeautifulSoup(html_td, "html.parser")
soup_td.td["style"]["background-color"] = "red;"
That's a rather complicated answer above; you can also just do this:
# Assigning to tag['style'] sets the whole style attribute of each matching
# tag to the given string.
for tag in soup.findAll(attrs={'class':'example'}):
    tag['style'] = "color: red;"
Combine the soup.findAll with whatever selector of BeautifulSoup you'd like to use.
Use cssutils to manipulate CSS, like this:
from bs4 import BeautifulSoup
from cssutils import parseStyle
html = '<td style="font-size: .8em; font-family: monospace; background-color: rgb(244, 244, 244);"></td>'
# Build the soup and pick the first <td>.
soup = BeautifulSoup(html, 'html.parser')
cell = soup.find('td')
# Parse the inline style so a single property can be edited.
style = parseStyle(cell['style'])
# Only 'background-color' is changed; the other declarations are left alone.
style['background-color'] = 'red'
# Write the serialized declarations back onto the tag.
cell['style'] = style.cssText
# Outputs: <td style="font-size: .8em; font-family: monospace; background-color: red"></td>
print(cell)
You could also use regex if you're comfortable with using it.

Python extract italic content from html

I am trying to extract 'Italic' Content from a pdf in python. I have converted the pdf to html so that I can use the italic tag to extract the text.
Here is what the HTML looks like:
<br></span></div><div style="position:absolute; border: textbox 1px
solid; writing-mode:lr-tb; left:71px; top:225px; width:422px;
height:15px;"><span style="font-family: TTPGFA+Symbol; font-
size:12px">•</span><span style="font-family: YUWTQX+ArialMT; font-
size:14px"> Kornai, Janos. 1992. </span><span style="font-family:
PUCJZV+Arial-ItalicMT; font-size:14px">The Socialist System: The
Political Economy of Communism</span><span style="font-family:
YUWTQX+ArialMT; font-size:14px">.
This is how the code looks:
from bs4 import BeautifulSoup
# Parse the converted PDF. The with-block closes the file once it has been
# read, and the parser is named explicitly so the result does not depend on
# which parsers happen to be installed.
with open("/../..myfile.html") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")
# Collect the text of every <span>, italic or not.
bTags = []
for i in soup.findAll('span'):
    bTags.append(i.text)
I am not sure how I can get only the italic text.
Try this:
from bs4 import BeautifulSoup
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed.
soup = BeautifulSoup(html, "html.parser")
bTags = []
# The lambda receives each span's style value (None when the attribute is
# missing) and keeps only spans whose style mentions "Italic".
for i in soup.find_all('span', style=lambda x: x and 'Italic' in x):
    bTags.append(i.text)
# Parenthesized form works as a function call on Python 3 and still prints
# the list on Python 2.
print(bTags)
Passing a function to the style argument will filter results by the result of that function, with its input as the value of the style attribute. We check to see if the string Italic is inside the attribute, and if so, return True.
You may need a more sophisticated algorithm depending on the rest of what your HTML looks like.

Categories