Web crawling using python beautifulsoup - python

How to extract data that is inside <p> paragraph tags and <li> which are under a named <div> class?

Use the functions find() and find_all():
import requests
from bs4 import BeautifulSoup

url = '...'
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, 'html.parser')
# Locate the named <div> first, then search only inside it.
div = soup.find('div', {'class':'class-name'})
ps = div.find_all('p')
lis = div.find_all('li')
# print the content of all <p> tags
# (loop bodies restored — the paste had lost their indentation)
for p in ps:
    print(p.text)
# print the content of all <li> tags
for li in lis:
    print(li.text)

Related

python nested Tags (beautiful Soup)

I used beautiful soup using python to get data from a specific website
but I don't know how to get one of these prices but I want the price in gram (g)
As shown below, this is the HTML code:
<div class="promoPrice margBottom7">16,000
L.L./200g<br/><span class="kiloPrice">79,999
L.L./Kg</span></div>
I use this code:
p_price = product.findAll("div", {"class": "promoPrice margBottom7"})[0].text
my result was:
16,000 L.L./200g 79,999 L.L./Kg
but i want to have:
16,000 L.L./200g
only
You will need to first decompose the span inside the div element:
from bs4 import BeautifulSoup

markup = """
<div class="promoPrice margBottom7">16,000 L.L./200g<br/>
<span class="kiloPrice">79,999 L.L./Kg</span></div>
"""
# Parse the fragment, then remove the nested <span> so that only the
# promo price text ("16,000 L.L./200g") is left inside the <div>.
dom = BeautifulSoup(markup, "html.parser")
promo_div = dom.find("div", {'class': 'promoPrice'})
promo_div.span.decompose()
print(promo_div.text)
#16,000 L.L./200g
Try using soup.select_one('div.promoPrice').contents[0]
from bs4 import BeautifulSoup

page = """<div class="promoPrice margBottom7">16,000 L.L./200g<br/>
<span class="kiloPrice">79,999 L.L./Kg</span></div>"""
parsed = BeautifulSoup(page, features='html.parser')
# contents[0] is the bare text node sitting before the <br/>.
# parsed.select('div.promoPrice > span') would give 79,999 L.L./Kg instead.
first_node = parsed.select_one('div.promoPrice').contents[0]
print(first_node)
Prints
16,000 L.L./200g

Python BeautifulSoup - get values from p

from bs4 import BeautifulSoup

# NOTE(review): the pasted snippet split a single-quoted string across two
# lines (a SyntaxError); a triple-quoted literal restores the original.
html = '''<p class="product-new-price">96<sup>33</sup> <span class="tether-target tether-enabled tether-element-attached-top tether-element-attached-left tether-target-attached-top tether-target-attached-right">Lei</span>
</p>'''
soup = BeautifulSoup(html, 'html.parser')
sup_elem = soup.find("sup").string # 33 - it works
How do I get the "96" before the element ?
You can grab the previousSibling tag
from bs4 import BeautifulSoup

html = '''<p class="product-new-price">96<sup>33</sup> <span class="tether-target tether-enabled tether-element-attached-top tether-element-attached-left tether-target-attached-top tether-target-attached-right">Lei</span>
</p>'''
soup = BeautifulSoup(html, 'html.parser')
# The text node "96" sits immediately before the <sup> element.
# previous_sibling is the modern spelling of the legacy previousSibling.
elem1 = soup.find("sup").previous_sibling
elem2 = soup.find("sup").text # 33 - it works
print('.'.join([elem1, elem2]))
Output:
96.33
You can use the children method. It will return a list of all the children of the p tag (96 will be its first child).
from bs4 import BeautifulSoup

# NOTE(review): re-joined the string literal that the paste broke across
# two lines, and added the missing import.
html = '''<p class="product-new-price">96<sup>33</sup> <span class="tether-target tether-enabled tether-element-attached-top tether-element-attached-left tether-target-attached-top tether-target-attached-right">Lei</span>
</p>'''
soup = BeautifulSoup(html, 'html.parser')
elem = list(soup.find("p").children)[0] #0th element of the list will be 96
sup_elem = soup.find("sup").string
result = elem + '.' + sup_elem #96.33
Use select instead.
from bs4 import BeautifulSoup

markup = '''<p class="product-new-price">96<sup>33</sup> <span class="tether-target tether-enabled tether-element-attached-top tether-element-attached-left tether-target-attached-top tether-target-attached-right">Lei</span>
</p>'''
dom = BeautifulSoup(markup, 'html.parser')
# Take the whole element's text, then drop the currency label.
cleaned = dom.select_one('.product-new-price').text.strip().replace('Lei','')
print(cleaned)
There is no "." in source but you can always divide by 100
print(int(soup.select_one('.product-new-price').text.strip().replace('Lei',''))/100)

How to extract link under a <li> tag with a specific class?

<li class="a-last"><a href="/macbook-pro">Buy Now</a></li>
How can you extract the link /macbook-pro inside the class a-last? Efficiency is a consideration.
One possibility is CSS selectors:
# NOTE(review): the scraped markup lost the <a> element; restored from the
# answer's printed output (/macbook-pro) so the [href] selector can match.
data = '''<li class="a-last"><a href="/macbook-pro">Buy Now</a></li>'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, 'lxml')
print(soup.select_one('li.a-last [href]')['href'])
Prints:
/macbook-pro
li.a-last [href] will select tag with attribute href that is under <li> tag with class a-last.
If you want to be more specific and want to extract only <a> tag directly under <li class="a-last">, you can use:
print(soup.select_one('li.a-last > a[href]')['href'])
You can do this:
from bs4 import BeautifulSoup

# NOTE(review): restored the stripped <a href="/macbook-pro"> element;
# without it .find('a') returns None and .get('href') raises AttributeError.
html = """<li class="a-last"><a href="/macbook-pro">Buy Now</a></li>"""
soup = BeautifulSoup(html, 'html.parser')
href = soup.find('li', {'class': 'a-last'}).find('a').get('href')
print(href)
RESULTS:
/macbook-pro
This is the list of all needed hrefs:
[el.find('a').get('href') for el in soup.find_all('li', {'class': 'a-last'})]

how to print only text beautifulsoup

I am trying to learn how beautifulsoup works in order to create an application.
I am able to find and print all elements with .find_all() however they print the html tags as well. How can I print ONLY the text within these tags.
This is what I have:
# Question code (Python 2): find_all() returns whole Tag objects, so
# printing the list shows the markup as well as the text.
from bs4 import BeautifulSoup
# Sample contents of index.html, shown here as a bare string literal.
"""<html>
<p>1</p>
<p>2</p>
<p>3</p>
"""
soup = BeautifulSoup(open('index.html'), "html.parser")
i = soup.find_all('p')
print i
This may help you:-
from bs4 import BeautifulSoup

source_code = """<html>
<p>1</p>
<p>2</p>
<p>3</p>
"""
# Name the parser explicitly: omitting it emits a warning and can pick a
# different parser depending on what is installed.
soup = BeautifulSoup(source_code, "html.parser")
# .text concatenates every text node in the document, no tags.
print(soup.text)
Output:-
1
2
3
from bs4 import BeautifulSoup

soup = BeautifulSoup(open('index.html'), "html.parser")
i = soup.find_all('p')
# Iterate the ResultSet; .text yields only the text inside each tag.
# (Loop body indentation restored — it was lost in the paste.)
for p in i:
    print(p.text)
find_all() will return a list of tag, you should iterate over it and use tag.text to get the text under the tag
Better way:
# Same idea without the intermediate variable (indentation restored).
for p in soup.find_all('p'):
    print(p.text)
I think you can do what they do in this stackoverflow question. Use findAll(text=True). So in your code:
from bs4 import BeautifulSoup

# index.html contains:
#   <html>
#   <p>1</p>
#   <p>2</p>
#   <p>3</p>
soup = BeautifulSoup(open('index.html'), "html.parser")
# text=True matches the text nodes themselves, so only strings
# (no tags) come back.
i = soup.findAll(text=True)
print(i)

Extracting anchor text from span class with BeautifulSoup

This is the html I am trying to scrape:
<span class="meta-attributes__attr-tags">
<a href="/tags/cinematic">cinematic</a>,
<a href="/tags/dissolve">dissolve</a>,
<a href="/tags/epic">epic</a>,
<a href="/tags/fly">fly</a>,
</span>
I want to get the anchor text for each a href: cinematic, dissolve, epic, etc.
This is the code I have:
# Question code (Python 2 / urllib2). find_all() returns a list, so
# indexing it with the string 'href' raises the TypeError being asked about.
url = urllib2.urlopen("http: example.com")
content = url.read()
soup = BeautifulSoup(content)
links = soup.find_all("span", {"class": "meta-attributes__attr-tags"})
for link in links:
print link.find_all('a')['href']
If I do it with "link.find_all" I get error: TypeError: List indices must be integers, not str.
But if I do print link.find('a')['href'] I get the first one only.
How can I get all of them ?
You could do the following:
from bs4 import BeautifulSoup

# NOTE(review): the scraped snippet lost the <a> elements; restored from
# the answer's printed output (/tags/...). Loop indentation also restored.
content = '''
<span class="meta-attributes__attr-tags">
<a href="/tags/cinematic">cinematic</a>,
<a href="/tags/dissolve">dissolve</a>,
<a href="/tags/epic">epic</a>,
<a href="/tags/fly">fly</a>,
</span>
'''
# Parser named explicitly to avoid the "no parser specified" warning.
soup = BeautifulSoup(content, 'html.parser')
spans = soup.find_all("span", {"class": "meta-attributes__attr-tags"})
for span in spans:
    links = span.find_all('a')
    for link in links:
        print(link['href'])
Output
/tags/cinematic
/tags/dissolve
/tags/epic
/tags/fly
from bs4 import BeautifulSoup

# NOTE(review): anchors restored (the scrape stripped them) and loop
# indentation fixed; prints converted to Python 3 calls.
html = """
<span class="meta-attributes__attr-tags">
<a href="/tags/cinematic">cinematic</a>,
<a href="/tags/dissolve">dissolve</a>,
<a href="/tags/epic">epic</a>,
<a href="/tags/fly">fly</a>,
</span>
"""
soup = BeautifulSoup(html, "lxml")
spans = soup.find_all("span", {"class": "meta-attributes__attr-tags"})
for span in spans:
    for link in span.find_all('a'):
        print(link.text, link['href'])
Another, pricier, way could be:
from bs4 import BeautifulSoup

# NOTE(review): anchors restored (the scrape stripped them) and loop
# indentation fixed; prints converted to Python 3 calls.
html = """
<span class="meta-attributes__attr-tags">
<a href="/tags/cinematic">cinematic</a>,
<a href="/tags/dissolve">dissolve</a>,
<a href="/tags/epic">epic</a>,
<a href="/tags/fly">fly</a>,
</span>
"""
soup = BeautifulSoup(html, "lxml")
# Scan every <a> in the document and keep only those whose parent carries
# the target class — costlier than scoping the search to the span.
links = soup.find_all("a")
for link in links:
    if 'meta-attributes__attr-tags' not in link.parent.get('class', []):
        continue
    print(link.text, link['href'])
link.find_all('a') returns a list with bs4 Tags. You probably want to index each of this links by href. So maybe this comes closer to your needs:
# Indentation restored — it was lost in the paste.
span = soup.find_all("span", {"class": "meta-attributes__attr-tags"})
for links in span:
    for link in links.find_all('a'):
        print(link['href'])
You may avoid nested loops or any additional if checks inside a loop by using a CSS selector:
# One CSS selector replaces the nested loops (indentation restored).
for link in soup.select(".meta-attributes__attr-tags a[href]"):
    print(link["href"], link.get_text())

Categories