Trying to get content of span in Python using BeautifulSoup - python

# Load a locally saved copy of the product page and print the text found in
# every <div class="title-options">.
from bs4 import BeautifulSoup
url = 'C:\\Users\\Zandrio\\Documents\\Python-Selexion\\HTML-localhost\\Selexion.html'
page = open(url)
soup = BeautifulSoup(page.read(), features="lxml")
prettify = soup.prettify  # binds the method without calling it; not used below
# div.text is the combined text of all descendants of the div, so the text of
# every nested <span> ends up in the joined string, not just the first one.
Model = "".join([div.text for div in soup.find_all('div', {'class' : 'title-options'})])
print(Model)
Output:
PS C:\Users\Zandrio> & C:/Users/Zandrio/AppData/Local/Programs/Python/Python38/python.exe c:/Users/Zandrio/Documents/Requests/selexion.py
SQQE55Q90R
Merk:
Samsung Afdrukken
HTML:
<div class="title-options">
<span>
SQQE55Q90R
</span>
<span>
Merk: Samsung
</span>
<span class="print"> Afdrukken
</span>
</div>
I just want the Model number in this case, that is SQQE55Q90R here. Please suggest any solution.

from bs4 import BeautifulSoup

url = 'C:\\Users\\Zandrio\\Documents\\Python-Selexion\\HTML-localhost\\Selexion.html'
# Read the saved page inside a context manager so the file handle is closed
# as soon as the markup has been parsed.
with open(url) as page:
    soup = BeautifulSoup(page.read(), features="lxml")
# find() returns the first matching div; div.span is its first <span> child.
div = soup.body.find('div', attrs={'class': 'title-options'})
model_number = div.span.text.strip()  # text of first span
print(model_number)

Related

Cannot get text of a span attribute using BeautifulSoup

I am trying to get from the following
<span id="SkuNumber" itemprop="identifier" content="sku:473768" data-nodeid="176579" class="product-code col-lg-4 col-md-4">ΚΩΔ. 473768</span></div>
the value of data-nodeid
I did the following
# Locate the <span id="SkuNumber"> element itself.
price_nodes = soup.find('span', attrs={'id': 'SkuNumber'})
# select_one() searches inside price_nodes (its descendants), so a selector
# describing price_nodes itself finds nothing here.
datanode = price_nodes.select_one('span[data-nodeid]')
But I get "None"
How can I fix this? thank you
If price_nodes is correctly filled,
i.e. price_nodes =
<span id="SkuNumber" itemprop="identifier" content="sku:473768" data-nodeid="176579" class="product-code col-lg-4 col-md-4">ΚΩΔ. 473768</span>
You just have to do this:
# Read the attribute directly from the tag that was already found.
datanode = price_nodes.get('data-nodeid')
Full code should be:
from bs4 import BeautifulSoup as soup

# Sample markup: the SKU span wrapped in a div.
markup = '<div><span id="SkuNumber" itemprop="identifier" content="sku:473768" data-nodeid="176579" class="product-code col-lg-4 col-md-4">ΚΩΔ. 473768</span></div>'
document = soup(markup, 'html.parser')
# Look the span up by its id, then pull the data-nodeid attribute from it.
price_nodes = document.find('span', attrs={'id': 'SkuNumber'})
datanode = price_nodes.get('data-nodeid')
from bs4 import BeautifulSoup

html = '<span id="SkuNumber" itemprop="identifier" content="sku:473768" data-nodeid="176579" class="product-code col-lg-4 col-md-4">ΚΩΔ. 473768</span></div>'
# Name the parser explicitly so the result does not depend on which parser
# libraries happen to be installed (and no "guessed parser" warning is shown).
soup = BeautifulSoup(html, 'html.parser')
price_nodes = soup.find('span', attrs={'id': 'SkuNumber'})
# Attributes of a tag can be read like dictionary keys.
print(price_nodes['data-nodeid'])

How do I extract text after <i class> tag?

I am trying to print out the text 'Dealer' from div class by using beautifulSoup, but I do not know how to extract it.
I tried to print the i class, but the text Dealer did not come out
# Fetch the first results page and parse it with the lxml parser.
url = 'https://www.carlist.my/used-cars-for-sale/proton/malaysia'
response = requests.get(url, params={'page_number': 1})
soup = BeautifulSoup(response.text, 'lxml')
# Keep at most the first 25 <article> elements.
articles = soup.find_all('article')[:25]
# The seller-type <div> of the fifth article.
seller_type = articles[4].find('div', class_ = 'item push-quarter--ends listing__spec--dealer')
# The <i> icon inside that article; the word "Dealer" is not inside this tag
# (see the printed output), it follows it as a sibling text node.
seller_type_text = articles[4].find('i', class_ = 'icon icon--secondary muted valign--top push-quarter--right icon--user-formal')
print(seller_type.prettify())
print()
print(seller_type_text)
This is the output that I got:
<div class="item push-quarter--ends listing__spec--dealer">
<i class="icon icon--secondary muted valign--top push-quarter--right icon--user-formal">
</i>
Dealer
<span class="flyout listing__badge listing__badge--trusted-seller inline--block valign--top push-quarter--left">
<i class="icon icon--thumb-up">
</i>
<span class="flyout__content flyout__content--tip visuallyhidden--portable">
This 'Trusted Dealer' has a proven track record of upholding the best car selling practices certified by Carlist.my
</span>
</span>
<!-- used car -->
<!-- BMW -->
</div>
<i class="icon icon--secondary muted valign--top push-quarter--right icon--user-formal"></i>
How do I print the word 'Dealer' right after i class and before the span class?
Can someone please help me?
Thanks a lot!
There is a faster way of using one of the compound class names of the i tag element along with next_sibling.
If you examine the html you can see "Dealer" is part of the parent div of the i tag, and follows the i tag; so, you can grab the i tag then use next_sibling
from bs4 import BeautifulSoup as bs
import requests

r = requests.get('https://www.carlist.my/used-cars-for-sale/proton/malaysia')
soup = bs(r.content, 'lxml')
# "Dealer" is the text node that directly follows the <i> icon inside the
# parent div. Strip the newlines/indentation around it so only the word
# itself is printed.
print(soup.select_one('.icon--user-formal').next_sibling.strip())
Take a look at the contents property of your seller_type. You'll see that Dealer is at seller_type.contents[2]. In other words,
import requests
from bs4 import BeautifulSoup

# Download the first page of dealer listings and parse it with lxml.
listing_url = 'https://www.carlist.my/used-cars-for-sale/proton/malaysia?profile_type=Dealer'
page = requests.get(listing_url, params={'page_number': 1})
soup = BeautifulSoup(page.text, 'lxml')
# Only the first 25 <article> elements are considered.
articles = soup.find_all('article')[:25]
seller_type = articles[4].find(
    'div',
    class_='item push-quarter--ends listing__spec--dealer',
)
# Print the third child node of the seller-type div.
print(seller_type.contents[2])
import requests
from bs4 import BeautifulSoup

# Request page 1 of the dealer listings.
response = requests.get(
    'https://www.carlist.my/used-cars-for-sale/proton/malaysia?profile_type=Dealer',
    params={'page_number': 1},
)
soup = BeautifulSoup(response.text, 'lxml')
# Fifth entry among the first 25 <article> elements.
fifth_article = soup.find_all('article')[:25][4]
seller_type = fifth_article.find('div', class_='item push-quarter--ends listing__spec--dealer')
# The child at index 2 of the seller-type div is what gets printed.
print(seller_type.contents[2])

BeautifulSoup does not show all the elements on the website

I am making parser for website [https://edp.by/shop/womens-fragrances/][1]
first i got all the links from site to navigate through site
import requests
from bs4 import BeautifulSoup


def get_html(url):
    # NOTE(review): 'lxml' is passed as the second positional argument of
    # requests.get, which is `params`, not a parser name.
    r = requests.get(url,'lxml')
    return r.text


url='https://edp.by/'
html=get_html(url)
soup=BeautifulSoup(html, )
x = soup.findAll("div", {"class": "row mainmenu"})
#print(x)
links=[]
# For each main-menu row, take the first navbar list and collect the href of
# every drop-down toggle link.
for i in x:
    z=i.find_all("ul", {"class": "nav navbar-nav"})[0].find_all("a", {"class": "dropdown-toggle"})
    print(233,z,len(z),type(z))
    for i in z:
        q=i["href"]
        links.append(url+str(q))
then i am trying to get each product from links:
# Fetch the category page and print the action of the form that find() returns.
url='https://edp.by/shop/womens-fragrances/'
html=get_html(url)
soup=BeautifulSoup(html, )
#x = soup.findAll("div", {"class": "row"})
#print()
# find() returns only the first <form> in the document.
action = soup.find('form').get('action')
print(action)
and result is : /search/
but on the website I can see the whole structure in the browser's developer tools (Chrome element inspector):
<form method="get" action="/shop/womens-fragrances/">
<div class="rr-widget" data-rr-widget-category-id="594" data-rr-widget-id="516e7cba0d422d00402a14b4" data-rr-widget-width="100%"></div>
<div class="shop_block">
<div class="shop_table">
<div class="col-md-4 col-sm-4 col-xs-12 product">
<div class="block">
<a href="/shop/womens-fragrances/43653/">
<img src="/images/no-image.png" class="text-center" alt="" title="">
<p class="fch"></p>
<p class="tch">0,00 руб. </p>
</a>
I want to get the link to each product, its image, and its texts, but bs4 does not show them. What is the reason, and how can I get them?
I also tried mechanicalsoup, with the same result:
import mechanicalsoup

# Open the first collected link and inspect the form that gets selected.
browser = mechanicalsoup.StatefulBrowser()
browser.open(links[0])
form = browser.select_form('form')
action = form.form.attrs['action']
print(action)  # prints: /search/
.find() will only get the first appearance of that tag. There are 6 elements with the <form> tag. You can use the .find_all(), then when you iterate through that, you'll see it's the 3rd index position in that list:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def get_html(url):
    """Return the response body of *url* as text."""
    # NOTE(review): the second positional argument of requests.get is
    # `params`, so 'lxml' is sent as the query string (hence the "?lxml" in
    # the output shown below) instead of selecting a parser. It is kept so the
    # listed output still matches; drop it in real code.
    r = requests.get(url, 'lxml')
    return r.text


url = 'https://edp.by/'
soup = BeautifulSoup(get_html(url), 'html.parser')

# Collect the absolute URL of every drop-down entry in the main menu.
# urljoin avoids the double slash that plain string concatenation produces
# when the href already starts with "/".
links = []
for menu in soup.find_all("div", {"class": "row mainmenu"}):
    nav = menu.find_all("ul", {"class": "nav navbar-nav"})[0]
    for anchor in nav.find_all("a", {"class": "dropdown-toggle"}):
        links.append(urljoin(url, anchor["href"]))

url = 'https://edp.by/shop/womens-fragrances/'
soup = BeautifulSoup(get_html(url), 'html.parser')

# find() would return only the first <form>; list the action of every form.
for form in soup.find_all('form'):
    print(form.get('action'))
Output:
/search/
/filter-ajax/
/filter-ajax/
/shop/womens-fragrances/
/shop/womens-fragrances/?lxml
/users/

wrong python html parsing

My code:
from bs4 import BeautifulSoup
import urllib.request

url = "http://yaz.tek.firat.edu.tr/tr/duyurular"
url_oku = urllib.request.urlopen(url)
soup = BeautifulSoup(url_oku, 'html.parser')
# All divs whose class attribute matches the first announcement row.
icerik = soup.find_all('div',attrs={'class':'views-row views-row-1 views-row-odd views-row-first'})
# Print the list that was just built; the previous name `kardiz` was never
# defined and raised NameError.
print(icerik)
my output :
[<div class="views-row views-row-1 views-row-odd views-row-first">
<span class="views-field views-field-title"> <span class="field-content">Grup-1, Grup-2, Grup-3, Grup-4 ve Grup-6 Öğrencileri İçin Staj Sunum Tarihleri</span> </span>
<span class="views-field views-field-created"> <span class="field-content"><i class="fa fa-calendar"></i> Salı, Aralık 5, 2017 - 09:58 </span> </span> </div>]
But I want to get just " Grup-1, Grup-2, Grup-3, Grup-4 ve Grup-6 Öğrencileri İçin Staj Sunum Tarihleri ". How can I achieve that?
You can call .text on a result from BeautifulSoup. It takes the textual content of the elements found, skipping the tags of the elements.
e.g.
from bs4 import BeautifulSoup
import urllib.request

url = "http://yaz.tek.firat.edu.tr/tr/duyurular"
# Parse inside the context manager so the HTTP response is closed afterwards.
with urllib.request.urlopen(url) as url_oku:
    soup = BeautifulSoup(url_oku, 'html.parser')
icerik = soup.find_all('div',attrs={'class':'views-row views-row-1 views-row-odd views-row-first'})
# .text yields the textual content of each result without the tags.
for result in icerik:
    print(result.text)
You can try like this as well to get the title and link from that page. I used css selector to get them:
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests

url = "http://yaz.tek.firat.edu.tr/tr/duyurular"
res = requests.get(url)
soup = BeautifulSoup(res.text,'lxml')
# Each announcement title is an anchor inside a .field-content element under
# #content; urljoin turns its href into an absolute link based on the page URL.
for item in soup.select("#content .field-content a"):
    link = urljoin(url, item['href'])
    print(f"Title: {item.text}\nLink: {link}\n")
Partial output:
Title: 2017-2018 Güz Dönemi Final Sınav Programı (TASLAK)
Link: http://yaz.tek.firat.edu.tr/tr/node/481
Title: NETAŞ İşyeri Eğitimi Mülakatları Hakkında Duyuru
Link: http://yaz.tek.firat.edu.tr/tr/node/480

Extracting anchor text from span class with BeautifulSoup

This is the html I am trying to scrape:
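<span class="meta-attributes__attr-tags">
<a href="/tags/cinematic">cinematic</a>,
<a href="/tags/dissolve">dissolve</a>,
<a href="/tags/epic">epic</a>,
<a href="/tags/fly">fly</a>,
</span>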
I want to get the anchor text for each a href: cinematic, dissolve, epic, etc.
This is the code I have:
url = urllib2.urlopen("http: example.com")
content = url.read()
soup = BeautifulSoup(content)
links = soup.find_all("span", {"class": "meta-attributes__attr-tags"})
for link in links:
print link.find_all('a')['href']
If I do it with "link.find_all" I get error: TypeError: List indices must be integers, not str.
But if I do print link.find('a')['href'] I get the first one only.
How can I get all of them ?
You could do the following:
from bs4 import BeautifulSoup

# Sample markup: one span holding several tag links.
content = '''
<span class="meta-attributes__attr-tags">
<a href="/tags/cinematic">cinematic</a>,
<a href="/tags/dissolve">dissolve</a>,
<a href="/tags/epic">epic</a>,
<a href="/tags/fly">fly</a>,
</span>
'''
# Name the parser explicitly so the result is the same on every installation.
soup = BeautifulSoup(content, 'html.parser')
spans = soup.find_all("span", {"class": "meta-attributes__attr-tags"})
# find_all() returns a list, so walk it and read href from each anchor.
for span in spans:
    links = span.find_all('a')
    for link in links:
        print(link['href'])
Output
/tags/cinematic
/tags/dissolve
/tags/epic
/tags/fly
from bs4 import BeautifulSoup

# Sample markup: one span holding several tag links.
html = """
<span class="meta-attributes__attr-tags">
<a href="/tags/cinematic">cinematic</a>,
<a href="/tags/dissolve">dissolve</a>,
<a href="/tags/epic">epic</a>,
<a href="/tags/fly">fly</a>,
</span>
"""
soup = BeautifulSoup(html, "lxml")
spans = soup.find_all("span", {"class": "meta-attributes__attr-tags"})
# Print both the anchor text and its target for every link in every span.
for span in spans:
    for link in span.find_all('a'):
        print(link.text, link['href'])
Another, pricier, way could be:
from bs4 import BeautifulSoup

# Sample markup: one span holding several tag links.
html = """
<span class="meta-attributes__attr-tags">
<a href="/tags/cinematic">cinematic</a>,
<a href="/tags/dissolve">dissolve</a>,
<a href="/tags/epic">epic</a>,
<a href="/tags/fly">fly</a>,
</span>
"""
soup = BeautifulSoup(html, "lxml")
links = soup.find_all("a")
# Visit every anchor in the document and keep only those whose direct parent
# carries the tag-list class.
for link in links:
    if 'meta-attributes__attr-tags' not in link.parent.get('class', []):
        continue
    print(link.text, link['href'])
link.find_all('a') returns a list of bs4 Tags, so you cannot index the list itself by 'href'; you have to read href from each of these links individually. So maybe this comes closer to your needs:
# Each matching span can hold several anchors, so loop over both levels.
spans = soup.find_all("span", {"class": "meta-attributes__attr-tags"})
for span in spans:
    for link in span.find_all('a'):
        print(link['href'])
You may avoid nested loops or any additional if checks inside a loop by using a CSS selector:
# One CSS selector: anchors that have an href and sit inside the tag-list span.
for link in soup.select(".meta-attributes__attr-tags a[href]"):
    print(link["href"], link.get_text())

Categories