BS4 + html, b Tag issue

BS4 + html, b Tag issue - python

This question is about web scraping with bs4
this is the code I have written:
import requests
from bs4 import BeautifulSoup
import json
import csv
# Download the product page.
page = requests.get('https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
# Create a BeautifulSoup object
soup = BeautifulSoup(page.text, 'html.parser')
#extract product score **(This is what I want to extract)**
# select_one returns the first element matching the CSS selector, so this is
# the whole <a class="score-lite"> tag rather than the text inside it.
stars = soup.select_one('a[class="score-lite"]', namespaces=None, flags=0)
#score = json.loads(stars)
print('Stars', stars)
My outcome:
<a class="score-lite" data-spm-click="gostr=/details.index.reviewLevel;locaid=dreviewLevel" href="https://onuliss.en.alibaba.com/company_profile/feedback.html" target="_blank"><b>4.8 </b><img src="//img.alicdn.com/tfs/TB1MJPmiQL0gK0jSZFtXXXQCXXa-8-9.svg"/></a>
The outcome I want is just the 4.8 number between the 'b' tags
What do I have to do with the = soup.select_one() function?
Thank you very much :)

Try a more specific selector that targets the b tag directly, then call get_text(strip=True) on the match to get its text without any surrounding whitespace.
import requests
from bs4 import BeautifulSoup
import json
import csv

PRODUCT_URL = 'https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx'

# Fetch the product page and build the parse tree.
page = requests.get(PRODUCT_URL)
soup = BeautifulSoup(page.text, 'html.parser')

# The child combinator narrows the match to the <b> directly inside
# <a class="score-lite">, so only the score text is selected.
score_tag = soup.select_one('a[class="score-lite"] > b', namespaces=None, flags=0)

# strip=True removes the whitespace around the score text.
stars = score_tag.get_text(strip=True)
print('Stars', stars)
Stars 4.8

how about SimplifiedDoc
import requests
from simplified_scrapy.simplified_doc import SimplifiedDoc
# Download the product page.
page = requests.get('https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
# Create a SimplifiedDoc object
doc = SimplifiedDoc(page.text)
# Get the element by tag name and class attribute
stars = doc.getElement('a','class',"score-lite")
# Print the text of the element and of its <b> child.
print('Stars', stars.text, stars.b.text) # Stars 4.8 4.8

import requests
from bs4 import BeautifulSoup

r = requests.get(
    'https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')

# Only parse the body once the request is known to have succeeded.
if r.status_code == 200:
    soup = BeautifulSoup(r.text, 'html.parser')
    # The score is the text of the <b> nested inside <a class="score-lite">.
    item = soup.find('a', {'class': 'score-lite'}).find('b')
    print(item.get_text(strip=True))
output:
4.8

Related

Beautiful Soup get value from page which updates daily

I am trying to get a single value from website which will update daily.
I am trying to get latest price in Vijayawada in page
Below is my code. I am getting an empty result as output, but I expect 440.
import requests
from bs4 import BeautifulSoup
import csv
# Fetch the egg price page and parse the raw response bytes.
res = requests.get('https://www.e2necc.com/home/eggprice')
soup = BeautifulSoup(res.content, 'html.parser')
# select() takes a CSS selector, so a bare word is treated as a tag name
# (<Vijayawada>) rather than as text to search for.
price = soup.select("Vijayawada")
Looking to get value: 440 [Which is today value] any suggestions?

One approach could be to select the tr by its text and iterate over its strings to pick the last one that isdigit():
[x for x in soup.select_one('tr:-soup-contains("Vijayawada")').stripped_strings if x.isdigit()][-1]
Example
import requests
from bs4 import BeautifulSoup

res = requests.get('https://www.e2necc.com/home/eggprice')
soup = BeautifulSoup(res.content, 'html.parser')

# Find the table row whose text mentions Vijayawada.
row = soup.select_one('tr:-soup-contains("Vijayawada")')

# Keep only the purely numeric strings of that row; the last one is taken
# as the most recent price.
numeric_strings = [text for text in row.stripped_strings if text.isdigit()]
price = numeric_strings[-1]
print(price)
Another one is to pick the element by day:
import requests
from bs4 import BeautifulSoup
import datetime

# Day of the month as an int, used directly as the column index below.
d = datetime.datetime.now().day

res = requests.get('https://www.e2necc.com/home/eggprice')
soup = BeautifulSoup(res.content, 'html.parser')

# All cells of the row whose text mentions Vijayawada.
# NOTE(review): assumes cell 0 holds the city name so that cell N is day N
# of the month - confirm against the live table layout.
cells = soup.select('tr:-soup-contains("Vijayawada") td')
print(cells[d].text)
Output
440

get specific value with Beautiful Soup using python

I have this code that extracts all the numbers on the website
if I want to get a specific value how can I do it?
I did this but it doesn't work
import urllib
import re
import requests
from bs4 import *
# Download and parse the comments page.
url = requests.get("http://python-data.dr-chuck.net/comments_216543.html")
soup = BeautifulSoup(url.content, "html.parser")
sum=0
# Calling the soup object is shorthand for find_all: every <span> tag.
tags = soup('span')
for tag in tags:
# Pull every run of digits out of the tag's HTML source.
y=str(tag)
x= re.findall("[0-9]+",y)
for i in x:
# i[1] is the second character of the matched string, not the whole number.
print (i[1])

To get the row containing "Coby", you can pass a custom function to .find():
import requests
from bs4 import BeautifulSoup

response = requests.get("http://python-data.dr-chuck.net/comments_216543.html")
soup = BeautifulSoup(response.content, "html.parser")

# .find() accepts a predicate; this one matches the first <tr> whose text
# contains "Coby".
coby = soup.find(lambda tag: tag.name == "tr" and "Coby" in tag.text)
print(coby.get_text(separator=" "))
Output:
Coby 95
Or, to only get the comment, use .find_next():
print(coby.find_next("span", class_="comments").get_text())
Output:
95

Scraping multiple pages with Python and BeautifulSoup

I'm trying to scrape many pages in Python using BeautifulSoup but with no positive results.
I tried using requests.get() and session.get(). The number of pages I need to scrape is 92.
import requests
from bs4 import BeautifulSoup
import urllib.request
# NOTE(review): requests.Session is referenced here without being called -
# confirm whether requests.Session() was intended.
with requests.Session as session:
count = 0
for i in range(92):
# count is incremented before use, so the page numbers start at 1.
count+=1
page = "https://www.paginegialle.it/lazio/roma/dentisti/p-"+str(count)+".html"
r = session.get(page)
# soup is rebound on every pass, so it only ever holds one page.
soup = BeautifulSoup(r.content)
Using print(page), the page URLs are formatted correctly. But when I display soup to print all the values stored in the variable, only the values of the first page are printed.
I'm using a jupyter notebook

you can do as below:
import requests
from bs4 import BeautifulSoup
import urllib.request

# The listing pages are numbered p-1 .. p-92, so start the range at 1
# instead of requesting a non-existent p-0 and missing p-92.
for page_number in range(1, 93):
    url = f"https://www.paginegialle.it/lazio/roma/dentisti/p-{page_number}.html"
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    # Report how many <p> elements this page contains.
    p = soup.select('p')
    print(len(p))

This will work.
from bs4 import BeautifulSoup
import requests

# Pages are numbered 1..92; iterating over that range directly replaces the
# separate counter that was incremented on every pass.
for count in range(1, 93):
    source1 = requests.get("https://www.paginegialle.it/lazio/roma/dentisti/p-"+str(count)+".html").text
    soup1 = BeautifulSoup(source1, 'lxml')
    # Print each page's <body>, followed by a blank separator line.
    print(soup1.body)
    print()
print("done")

Another solution.
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc

count = 0
for i in range(92):
    count += 1
    # Build the URL from the 1-based counter so pages p-1 .. p-92 are
    # fetched; using the 0-based loop index would request p-0 .. p-91.
    html = req.get('https://www.paginegialle.it/lazio/roma/dentisti/p-'+str(count)+'.html')
    doc = SimplifiedDoc(html)
    # Print the page title, then the number of the page just processed.
    print(doc.select('title>text()'))
    print(count)

Scraping data using BeautifulSoup

I'm trying to scrape the data from this site into a dictionary:
from bs4 import BeautifulSoup
import requests
from pprint import pprint
# Fetch and parse the landing page.
page = requests.get('https://webscraper.io/')
soup = BeautifulSoup(page.text, "lxml")
info = []
# Collect the divs with classes info1 .. info6.
for x in range(1,7):
# findAll returns the matching elements themselves, so the appended values
# still carry their HTML markup.
items = soup.findAll("div",{"class":f"info{x}"})
info.append(items)
however, the HTML tags are not being removed.

You need to use .text. Then to get in the way you want, would need to do a bit of string manipulation.
from bs4 import BeautifulSoup
import requests
from pprint import pprint

url = 'https://webscraper.io/'
page = requests.get(url)
soup = BeautifulSoup(page.text, "lxml")

lines = []
for x in range(1, 7):
    div = soup.find("div", {"class": f"info{x}"})
    if div is None:
        # find() returns None when no such div exists; skip it rather than
        # fail on the .text access.
        continue
    # .text drops the markup; the inner newline becomes a ": " separator.
    lines.append(div.text.strip().replace('\n', ': '))

# One entry per line.
info = '\n'.join(lines)
print(info)

Something like this might work? (Replace the webscraper.io url with your actual request URL; Also, you'd still need to clean up the \n characters from the output):
from bs4 import BeautifulSoup
import requests
from pprint import pprint

page = requests.get('https://webscraper.io/')
soup = BeautifulSoup(page.text, "lxml")

info = []
for x in range(1, 7):
    # find_all is the current spelling of the legacy findAll alias.
    items = soup.find_all("div", {"class": f"info{x}"})
    # Keep only the text of each matching div and concatenate it onto info.
    info += [item.text for item in items]
I.e. item.text, and concatenate the resulting array with info

Crawling div id tags with dynamic id

I would like to crawl content from webpages with beautiful soup.
However, the div id tags have dynamic ids, such as post_message_1. In this case, the number 1 is dynamically generated. How do I match these?
I have tried this.
from bs4 import BeautifulSoup
import urllib
# Download the forum thread page (urllib.urlopen is the Python 2 API).
r = urllib.urlopen(
'http://forums.hardwarezone.com.sg/eat-drink-man-woman-16/%5Bofficial%5D-chit-chat-students-part-2-a-5526993-55.html').read()
soup = BeautifulSoup(r, "lxml")
# A plain string in attrs must equal the whole id value, so only divs whose
# id is exactly "post_message" would match.
letters = soup.find_all("div", attrs={"id":"post_message"})
print letters
letters is an empty list.

You can use regex inside attrs like this:
from bs4 import BeautifulSoup
import urllib.request
import re

# Python 3: urlopen lives in urllib.request and print is a function.
r = urllib.request.urlopen(
    'http://forums.hardwarezone.com.sg/eat-drink-man-woman-16/%5Bofficial%5D-chit-chat-students-part-2-a-5526993-55.html').read()
soup = BeautifulSoup(r, "lxml")

# A compiled pattern as the attribute value matches every div whose id
# contains "post_message_" followed by digits. The raw string keeps \d from
# being treated as a string escape.
letters = soup.find_all("div", attrs={"id": re.compile(r'post_message_\d+')})
print(letters)

you can try this.
from bs4 import BeautifulSoup
import urllib.request
import re

# Python 3: urlopen lives in urllib.request and print is a function.
r = urllib.request.urlopen(
    'http://forums.hardwarezone.com.sg/eat-drink-man-woman-16/%5Bofficial%5D-chit-chat-students-part-2-a-5526993-55.html').read()
soup = BeautifulSoup(r, "lxml")

# The ^ anchor restricts matches to ids that start with "post_message_"
# followed by digits. The raw string keeps \d from being treated as a
# string escape.
letters = soup.find_all("div", attrs={"id": re.compile(r"^post_message_\d+")})
print(letters)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

BS4 + html, b Tag issue - python

Related

Beautiful Soup get value from page which updates daily

get specific value with Beautiful Soup using python

Scraping multiple pages with Python and BeautifulSoup

Scraping data using BeautifulSoup

Crawling div id tags with dynamic id

Categories

Resources