from bs4 import BeautifulSoup
import requests
url = "http://www.paopaoche.net/psp/280873.html"
res = requests.get(url)
res.encoding = "gb2312"
bsObj = BeautifulSoup(res.text, "html.parser")
tag1 = bsObj.find("dd", {"class":"left"}).find(class_="xq").find("em", text="游戏类型")
print(tag1)
The terminal prints None. If I change find("em", text="游戏类型") to find("em", text="1993"), the terminal returns the correct result. Where is the problem?
Here is a slightly modified version of the code:
from bs4 import BeautifulSoup
import requests
url = "http://www.paopaoche.net/psp/280873.html"
res = requests.get(url)
res.encoding="gb2312"
bsObj = BeautifulSoup(res.content.decode('gb2312'), 'html5lib')
tag1 = bsObj.select("dd.left .xq")[0].find(lambda tag: tag.name == "em" and "游戏类型" in tag.text)
print(tag1)
"em" element contains not only text searched, but also another text and child elements, so it's needed to find elements containing search expression (not having text equal to search expression).
I want to extract the IPA keys under the French section of the wiki page:
https://en.wiktionary.org/wiki/son#French
I want only the data in the French section.
from bs4 import BeautifulSoup
import requests
import pandas as pd
def main():
    test_url_page = 'https://en.wiktionary.org/wiki/son#French'
    req = requests.get(test_url_page)
    content = req.text
    ipa_data = []
    soup = BeautifulSoup(content, 'html.parser')
    french_section = soup.find('span', {'class':'mw-headline'} and {'id':'French'})
    for fr_ipas in french_section.find_next('span', {'class':'IPA'}):
        ipa_data.append(fr_ipas)
    fr_ipas_all = french_section.find_all_next('span', {'class':'IPA'})
find_next only returns the first element under the French section.
find_all and find_all_next return a list of all the matching elements in the whole HTML.
I just want the elements under the French section. There are multiple IPA keys under the French section.
You are close to your goal, but you have to check whether each element returned by .find_next_siblings() contains your IPA element and break the iteration as soon as you reach an <hr>, which marks the start of the next section:
french_section = soup.find('span', {'id':'French'}).parent
for tag in french_section.find_next_siblings():
    if tag.name == 'hr':
        break
    if tag.find('span', {'class':'IPA'}):
        ipa_data.append(tag.find('span', {'class':'IPA'}))
Example
from bs4 import BeautifulSoup
import requests
def main():
    test_url_page = 'https://en.wiktionary.org/wiki/son#French'
    req = requests.get(test_url_page)
    content = req.text
    ipa_data = []
    soup = BeautifulSoup(content, 'html.parser')
    french_section = soup.find('span', {'id':'French'}).parent
    for tag in french_section.find_next_siblings():
        if tag.name == 'hr':
            break
        if tag.find('span', {'class':'IPA'}):
            ipa_data.append(tag.find('span', {'class':'IPA'}))
    return ipa_data
print(main())
I have this code that extracts all the numbers on the website.
If I want to get a specific value, how can I do it?
I did this, but it doesn't work:
import urllib
import re
import requests
from bs4 import *
url = requests.get("http://python-data.dr-chuck.net/comments_216543.html")
soup = BeautifulSoup(url.content, "html.parser")
sum = 0
tags = soup('span')
for tag in tags:
    y = str(tag)
    x = re.findall("[0-9]+", y)
    for i in x:
        print(i[1])
To get tag "Coby", you can use pass a custom function to .find():
import requests
from bs4 import BeautifulSoup
url = requests.get("http://python-data.dr-chuck.net/comments_216543.html")
soup = BeautifulSoup(url.content, "html.parser")
coby = soup.find(lambda tag: tag.name == "tr" and "Coby" in tag.text)
print(coby.get_text(separator=" "))
Output:
Coby 95
Or, to only get the comment, use .find_next():
print(coby.find_next("span", class_="comments").get_text())
Output:
95
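If you want every comment value rather than a single row, here is a minimal sketch along the same lines (assuming, as in the snippet above, that each number sits in a <span class="comments">):
spans = soup.find_all("span", class_="comments")
numbers = [int(span.get_text()) for span in spans]
print(numbers)       # all comment values on the page
print(sum(numbers))  # their total, which is what the original sum = 0 loop seemed to be building toward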
I want to find "Moderat" in <p class="text-spread-level">Moderat</p>
I have tried with id, name, xpath and link text.
Would you like to try this?
from bs4 import BeautifulSoup
import requests
sentences = []
res = requests.get(url)  # assign your url to a variable first
soup = BeautifulSoup(res.text, "lxml")
tag_list = soup.select("p.text-spread-level")
for tag in tag_list:
    sentences.append(tag.text)
print(sentences)
Find the element by class name and get the text.
el=driver.find_element_by_class_name('text-spread-level')
val=el.text
print(val)
I'm trying to get the number from inside a div:
<div class="tv-symbol-price-quote__value js-symbol-last">122.7<span class="">8</span></div>
I need the 122.7 number, but I can't get it. I have tried:
strings = soup.find("div", class_="tv-symbol-price-quote__value js-symbol-last").string
But, since there is more than one element inside the div, I receive None.
Is there a way to print the children and get the string from the children?
Use .getText().
For example:
from bs4 import BeautifulSoup
sample_html = """
<div class="tv-symbol-price-quote__value js-symbol-last">122.7<span class="">8</span></div>
"""
soup = BeautifulSoup(sample_html, "html.parser")
strings = soup.find("div", class_="tv-symbol-price-quote__value js-symbol-last").getText()
print(strings)
Output:
122.78
Or use __next__() to get only the 122.7.
soup = BeautifulSoup(sample_html, "html.parser")
strings = soup.find("div", class_="tv-symbol-price-quote__value js-symbol-last").strings.__next__()
print(strings)
Output:
122.7
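The builtin next() is the more idiomatic way to spell __next__(), for example:
strings = next(soup.find("div", class_="tv-symbol-price-quote__value js-symbol-last").strings)
print(strings)  # 122.7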
To get only the first text, search for the tag and access its next_element attribute.
from bs4 import BeautifulSoup
html = """
<div class="tv-symbol-price-quote__value js-symbol-last">122.7<span class="">8</span></div>
"""
soup = BeautifulSoup(html, "html.parser")
print(
soup.find("div", class_="tv-symbol-price-quote__value js-symbol-last").next_element
)
Output:
122.7
You could use selenium to find the element and then use BS4 to parse it.
An example would be:
import selenium.webdriver as WD
from selenium.webdriver.chrome.options import Options
import bs4 as B
driver = WD.Chrome()
objXpath = driver.find_element_by_xpath("""yourelementxpath""")
objHtml = objXpath.get_attribute("outerHTML")
soup = B.BeautifulSoup(objHtml, 'html.parser')
text = soup.get_text()
This code should work.
DISCLAIMER
I haven't worked with selenium and bs4 in a while, so you might have to tweak it a little bit.
I've been trying to extract just the links corresponding to the jobs on each page, but for some reason they don't print when I execute the script. No errors occur.
For the inputs I entered engineering and toronto, respectively. Here is my code.
import requests
from bs4 import BeautifulSoup
import webbrowser
jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
r = requests.get(url)
rcontent = r.content
prettify = BeautifulSoup(rcontent, "html.parser")
all_job_url = []
for tag in prettify.find_all('div', {'data-tn-element':"jobTitle"}):
    for links in tag.find_all('a'):
        print(links['href'])
You should be looking for the anchor a tag. It looks like this:
<a class="turnstileLink" data-tn-element="jobTitle" href="/rc/clk?jk=3611ac98c0167102&fccid=459dce363200e1be" ...>Project <b>Engineer</b></a>
Call soup.find_all and iterate over the result set, extracting the links through the href attribute.
import requests
from bs4 import BeautifulSoup
# valid query, replace with something else
url = "https://ca.indeed.com/jobs?q=engineer&l=Calgary%2C+AB"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
all_job_url = []
for tag in soup.find_all('a', {'data-tn-element':"jobTitle"}):
    all_job_url.append(tag['href'])
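The href values are relative paths (as in the <a> example above), so if you need complete links you can join them with the site's base URL; a small sketch:
from urllib.parse import urljoin
full_urls = [urljoin("https://ca.indeed.com", path) for path in all_job_url]
print(full_urls)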