get specific value with Beautiful Soup using python - python

I have this code that extracts all the numbers on the website
if I want to get a specific value how can I do it?
I did this but it doesn't work
import urllib
import re
import requests
from bs4 import *

# Download the assignment page and parse it.
response = requests.get("http://python-data.dr-chuck.net/comments_216543.html")
soup = BeautifulSoup(response.content, "html.parser")

total = 0  # renamed from `sum`, which shadowed the built-in
tags = soup('span')
for tag in tags:
    # Pull every run of digits out of the tag's markup.
    for number in re.findall(r"[0-9]+", str(tag)):
        # Bug fix: the original printed i[1] — the second character of
        # each matched string — instead of the matched number itself.
        print(number)
        total += int(number)

To get the tag containing "Coby", you can pass a custom function to .find():
import requests
from bs4 import *


def _row_mentions_coby(tag):
    # True for <tr> elements whose text contains "Coby".
    return tag.name == "tr" and "Coby" in tag.text


response = requests.get("http://python-data.dr-chuck.net/comments_216543.html")
soup = BeautifulSoup(response.content, "html.parser")

# .find() accepts a callable and returns the first tag it approves of.
coby = soup.find(_row_mentions_coby)
print(coby.get_text(separator=" "))
Output:
Coby 95
Or, to only get the comment, use .find_next():
print(coby.find_next("span", class_="comments").get_text())
Output:
95

Related

Get html text with Beautiful Soup

I'm trying to get the number from inside a div:
<div class="tv-symbol-price-quote__value js-symbol-last">122.7<span class="">8</span></div>
I need the 122.7 number, but I can't get it. I have tried with:
strings = soup.find("div", class_="tv-symbol-price-quote__value js-symbol-last").string
But there is more than one element, and I receive "None".
Is there a way to print the children and get the string from the children?
Use .getText().
For example:
from bs4 import BeautifulSoup

sample_html = """
<div class="tv-symbol-price-quote__value js-symbol-last">122.7<span class="">8</span></div>
"""

# Parse the snippet and locate the price <div>.
soup = BeautifulSoup(sample_html, "html.parser")
price_div = soup.find("div", class_="tv-symbol-price-quote__value js-symbol-last")

# getText() joins every descendant text node, producing "122.78".
strings = price_div.getText()
print(strings)
Output:
122.78
Or use __next__() to get only the 122.7.
# .strings yields the text nodes in document order; advancing it once
# with next() gives "122.7" and skips the nested <span>.
soup = BeautifulSoup(sample_html, "html.parser")
price_div = soup.find("div", class_="tv-symbol-price-quote__value js-symbol-last")
strings = next(price_div.strings)
print(strings)
Output:
122.7
To only get the first text, search for the tag, and call the next_element method.
from bs4 import BeautifulSoup

html = """
<div class="tv-symbol-price-quote__value js-symbol-last">122.7<span class="">8</span></div>
"""

soup = BeautifulSoup(html, "html.parser")
# next_element is the node immediately after the opening tag: the text
# "122.7", which comes before the nested <span>.
target = soup.find("div", class_="tv-symbol-price-quote__value js-symbol-last")
print(target.next_element)
Output:
122.7
You could use selenium to find the element and then use BS4 to parse it.
An example would be
import selenium.webdriver as WD
from selenium.webdriver.chrome.options import Options  # fixed: "webdrive" -> "webdriver"
import bs4 as B

# Launch Chrome, grab the element's outer HTML, then hand it to bs4.
driver = WD.Chrome()
# NOTE(review): find_element_by_xpath was removed in Selenium 4.3+;
# newer code should use driver.find_element(By.XPATH, ...).
objXpath = driver.find_element_by_xpath("""yourelementxpath""")
objHtml = objXpath.get_attribute("outerHTML")
soup = B.BeautifulSoup(objHtml, 'html.parser')  # fixed: "BeutifulSoup" typo
text = soup.get_text()
This code should work.
DISCLAIMER
I haven't done work w/ selenium and bs4 in a while so you might have to tweak it a little bit.

BS4 + html, b Tag issue

This question is about web scraping with bs4
this is the code I have written:
import requests
from bs4 import BeautifulSoup
import json
import csv

# Fetch the product page.
response = requests.get('https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')

# Create a BeautifulSoup object
tree = BeautifulSoup(response.text, 'html.parser')

# extract product score **(This is what I want to extract)**
rating_anchor = tree.select_one('a[class="score-lite"]', namespaces=None, flags=0)
print('Stars', rating_anchor)
My outcome:
<a class="score-lite" data-spm-click="gostr=/details.index.reviewLevel;locaid=dreviewLevel" href="https://onuliss.en.alibaba.com/company_profile/feedback.html" target="_blank"><b>4.8 </b><img src="//img.alicdn.com/tfs/TB1MJPmiQL0gK0jSZFtXXXQCXXa-8-9.svg"/></a>
The outcome I want is just the 4.8 number between the 'b' tags
What do I have to do with the = soup.select_one() function?
Thank you very much :)
Try a more specific selector, take the text of the match, and use strip() to get rid of any surrounding whitespace.
import requests
from bs4 import BeautifulSoup
import json
import csv

# Fetch the product page.
response = requests.get('https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')

# Create a BeautifulSoup object
tree = BeautifulSoup(response.text, 'html.parser')

# The "> b" child combinator targets the <b> holding the score, and
# get_text(strip=True) drops the whitespace around "4.8".
rating = tree.select_one('a[class="score-lite"] > b', namespaces=None, flags=0).get_text(strip=True)
print('Stars', rating)
Stars 4.8
How about SimplifiedDoc?
import requests
from simplified_scrapy.simplified_doc import SimplifiedDoc

# Fetch the product page.
response = requests.get('https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')

# Create a SimplifiedDoc object
document = SimplifiedDoc(response.text)

# Locate the anchor by tag name and class attribute, then read its text.
anchor = document.getElement('a', 'class', "score-lite")
print('Stars', anchor.text, anchor.b.text)  # Stars 4.8 4.8
import requests
from bs4 import BeautifulSoup

r = requests.get(
    'https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
# Only parse the body once we know the request succeeded; the original
# built the soup unconditionally and gated just the lookup.
if r.status_code == 200:
    soup = BeautifulSoup(r.text, 'html.parser')
    item = soup.find('a', {'class': 'score-lite'}).find('b')
    print(item.get_text(strip=True))
output:
4.8

Python: BeautifulSoup extract all the heading text from div class

import requests
from bs4 import BeautifulSoup

# Download the events calendar and parse it with lxml.
res = requests.get('http://aicd.companydirectors.com.au/events/events-calendar')
soup = BeautifulSoup(res.text, "lxml")

# Every event card lives in one of these grid columns.
event_containers = soup.find_all('div', class_="col-xs-12 col-sm-6 col-md-8")

# Show the heading of the first event only.
first_event = event_containers[0]
print(first_event.h3.text)
Using this code, I'm able to extract the first event name. I'm looking for a way to loop and extract all the event names and dates, and I'm also trying to extract the location information, which is visible only after clicking the "read more" link.
event_containers is a bs4.element.ResultSet object, which is basically a list of Tag objects.
Just loop over the tags in event_containers and select h3 for the title, div.date for the date and a for the URL, example:
# Each tag is one event card: <h3> holds the title, div.date the date,
# and the first <a> links to the detail page.
for card in event_containers:
    print(card.h3.text)
    print(card.select_one('div.date').text)
    print(card.a['href'])
Now, for the location information you'll have to visit each URL and collect the text in div.event-add.
Full code:
import requests
from bs4 import BeautifulSoup

res = requests.get('http://aicd.companydirectors.com.au/events/events-calendar')
soup = BeautifulSoup(res.text, "lxml")
event_containers = soup.find_all('div', class_="col-xs-12 col-sm-6 col-md-8")

base_url = 'http://aicd.companydirectors.com.au'
for card in event_containers:
    # Follow the card's link to the detail page for the location text.
    link = base_url + card.a['href']
    detail = BeautifulSoup(requests.get(link).text, "lxml")
    # div.event-add holds the address fragments; drop the first and
    # last entries and join the middle pieces into one line.
    pieces = list(detail.select_one('div.event-add').stripped_strings)
    location = ', '.join(pieces[1:-1])
    print('Title:', card.h3.text)
    print('Date:', card.select_one('div.date').text)
    print('Link:', link)
    print('Location:', location)
Try this to get all the events and dates you are after:
import requests
from bs4 import BeautifulSoup

res = requests.get('http://aicd.companydirectors.com.au/events/events-calendar')
soup = BeautifulSoup(res.text, "lxml")

# Each title carries class "lead"; its previous sibling holds the date,
# so keep only the part before the " |" separator.
for item in soup.find_all(class_='lead'):
    raw = item.find_previous_sibling().text
    date = raw.split(" |")[0]
    print(item.text, date)

How to extract href links from anchor tags using BeautifulSoup?

I've been trying to extract just the links corresponding to the jobs on each page. But for some reason they don't print when I execute the script. No errors occur.
For the inputs, I entered "engineering" and "toronto" respectively. Here is my code.
import requests
from bs4 import BeautifulSoup
import webbrowser

# Build the Indeed search URL from the user's answers.
jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)

response = requests.get(url)
page = BeautifulSoup(response.content, "html.parser")

all_job_url = []
# Each job title sits in a div marked data-tn-element="jobTitle";
# print the href of every anchor inside it.
for title_div in page.find_all('div', {'data-tn-element': "jobTitle"}):
    for anchor in title_div.find_all('a'):
        print(anchor['href'])
You should be looking for the anchor a tag. It looks like this:
<a class="turnstileLink" data-tn-element="jobTitle" href="/rc/clk?jk=3611ac98c0167102&fccid=459dce363200e1be" ...>Project <b>Engineer</b></a>
Call soup.find_all and iterate over the result set, extracting the links through the href attribute.
import requests
from bs4 import BeautifulSoup

# valid query, replace with something else
url = "https://ca.indeed.com/jobs?q=engineer&l=Calgary%2C+AB"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# Collect the href of every anchor marked as a job title.
all_job_url = [
    anchor['href']
    for anchor in soup.find_all('a', {'data-tn-element': "jobTitle"})
]

Crawling div id tags with dynamic id

I would like to crawl content from webpages with beautiful soup.
However, the div tags have dynamic ids such as post_message_1, where the trailing number is dynamically generated. How do I handle this?
I have tried this.
from bs4 import BeautifulSoup
import urllib
r = urllib.urlopen(
'http://forums.hardwarezone.com.sg/eat-drink-man-woman-16/%5Bofficial%5D-chit-chat-students-part-2-a-5526993-55.html').read()
soup = BeautifulSoup(r, "lxml")
letters = soup.find_all("div", attrs={"id":"post_message"})
print letters
letters returns an empty list.
You can use regex inside attrs like this:
from bs4 import BeautifulSoup
import urllib
import re
r = urllib.urlopen(
'http://forums.hardwarezone.com.sg/eat-drink-man-woman-16/%5Bofficial%5D-chit-chat-students-part-2-a-5526993-55.html').read()
soup = BeautifulSoup(r, "lxml")
letters = soup.find_all("div", attrs={"id": re.compile('post_message_\d+')})
print letters
you can try this.
from bs4 import BeautifulSoup
import urllib
import re
r = urllib.urlopen(
'http://forums.hardwarezone.com.sg/eat-drink-man-woman-16/%5Bofficial%5D-chit-chat-students-part-2-a-5526993-55.html').read()
soup = BeautifulSoup(r, "lxml")
letters = soup.find_all("div", attrs={"id": re.compile("^post_message_\d+")})
print letters

Categories