How to find specific text under multiple spans in BeautifulSoup? - python

I want to extract the IPA keys under the French section of this Wiktionary page:
https://en.wiktionary.org/wiki/son#French
I want only the data in the French section.
from bs4 import BeautifulSoup
import requests
import pandas as pd

def main():
    test_url_page = 'https://en.wiktionary.org/wiki/son#French'
    req = requests.get(test_url_page)
    content = req.text
    ipa_data = []
    soup = BeautifulSoup(content, 'html.parser')
    french_section = soup.find('span', {'class': 'mw-headline', 'id': 'French'})
    for fr_ipas in french_section.find_next('span', {'class': 'IPA'}):
        ipa_data.append(fr_ipas)
    fr_ipas_all = french_section.find_all_next('span', {'class': 'IPA'})
find_next only returns the first matching element after the French heading.
find_all and find_all_next return a list of all matching elements in the whole HTML document.
I just want the elements under the French section, and there are multiple IPA keys there.

You are close to your goal, but you have to iterate over the heading's .find_next_siblings(), check whether each sibling contains your IPA element, and break the iteration once you hit an <hr>, which marks the start of the next section:
french_section = soup.find('span', {'id': 'French'}).parent
for tag in french_section.find_next_siblings():
    if tag.name == 'hr':
        break
    if tag.find('span', {'class': 'IPA'}):
        ipa_data.append(tag.find('span', {'class': 'IPA'}))
Example
from bs4 import BeautifulSoup
import requests

def main():
    test_url_page = 'https://en.wiktionary.org/wiki/son#French'
    req = requests.get(test_url_page)
    content = req.text
    ipa_data = []
    soup = BeautifulSoup(content, 'html.parser')
    french_section = soup.find('span', {'id': 'French'}).parent
    for tag in french_section.find_next_siblings():
        if tag.name == 'hr':
            break
        if tag.find('span', {'class': 'IPA'}):
            ipa_data.append(tag.find('span', {'class': 'IPA'}))
    return ipa_data

main()
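If you only need the transcriptions as plain text, a small follow-up (a sketch, assuming main() returns the list of tags as above) is to call .get_text() on each collected tag:
ipa_tags = main()
ipa_texts = [tag.get_text() for tag in ipa_tags]  # plain-text transcriptions such as '/sɔ̃/'
print(ipa_texts)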

Related

Using multiple for loops with Python using Beautiful Soup

from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
url = "https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
data = requests.get(url)
soup = bs(data.content,"html.parser")
The code below is a test to get one item.
property_overview = soup.find(class_="p24_regularListing").find(class_="p24_propertyOverview").find(class_='p24_propertyOverviewRow').find(class_='col-xs-6 p24_propertyOverviewKey').text
property_overview
Output: 'Listing Number'
The code below is what I have to get all of the col-xs-6 p24_propertyOverviewKey elements:
p24_regularListing_items = soup.find_all(class_="p24_regularListing")
for p24_propertyOverview_item in p24_regularListing_items:
    p24_propertyOverview_items = p24_propertyOverview_item.find_all(class_="p24_propertyOverview")
    for p24_propertyOverviewRow_item in p24_propertyOverview_items:
        p24_propertyOverviewRow_items = p24_propertyOverviewRow_item.find_all(class_="p24_propertyOverviewRow")
        for p24_propertyOverviewKey_item in p24_propertyOverviewRow_items:
            p24_propertyOverviewKey_items = p24_propertyOverviewKey_item.find_all(class_="col-xs-6 p24_propertyOverviewKey")
            p24_propertyOverviewKey_items
The code above only outputs one item, not all of them.
To put things more simply, you can use soup.select() (and via the comments, you can then use .get_text() to extract the text from each tag).
from bs4 import BeautifulSoup
import requests

resp = requests.get(
    "https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
)
resp.raise_for_status()

soup = BeautifulSoup(resp.content, "html.parser")
texts = []
for tag in soup.select(
    # NB: this selector uses Python's implicit string concatenation
    # to split it onto several lines.
    ".p24_regularListing "
    ".p24_propertyOverview "
    ".p24_propertyOverviewRow "
    ".p24_propertyOverviewKey"
):
    texts.append(tag.get_text())
print(texts)
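Because select() matches descendants at any depth, the intermediate classes are not strictly required; a shorter, hedged variant of the same query (same soup and class names assumed), using .get_text(strip=True) to trim surrounding whitespace:
texts = [
    tag.get_text(strip=True)  # strip=True removes leading/trailing whitespace
    for tag in soup.select(".p24_regularListing .p24_propertyOverviewKey")
]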

BeautifulSoup's find() can't match Chinese character

from bs4 import BeautifulSoup
import requests
url = "http://www.paopaoche.net/psp/280873.html"
res = requests.get(url)
res.encoding="gb2312"
bsObj = BeautifulSoup(res.text)
tag1 = bsObj.find("dd", {"class":"left"}).find(class_="xq").find("em", text="游戏类型")
print(tag1)
The terminal returns "None". If I change find("em", text="游戏类型") to find("em", text="1993"), the terminal returns the correct result. Where is the problem?
Here is slightly modified code:
from bs4 import BeautifulSoup
import requests
url = "http://www.paopaoche.net/psp/280873.html"
res = requests.get(url)
res.encoding="gb2312"
bsObj = BeautifulSoup(res.content.decode('gb2312'), 'html5lib')
tag1 = bsObj.select("dd.left .xq")[0].find(lambda tag: tag.name == "em" and "游戏类型" in tag.text)
print(tag1)
"em" element contains not only text searched, but also another text and child elements, so it's needed to find elements containing search expression (not having text equal to search expression).

I tried to print out the words of a title online, but instead, nothing shows up on the console

For practice, I wanted to build a word frequency counter in Python. I decided to use the title of a post on Reddit (if that isn't a problem) as an example for this. The first step for me was to get the words from that title, and put them into a list, like this:
import requests
from bs4 import BeautifulSoup

def get_words(url):
    word_list = []
    source_code = requests.get(url).text
    soup = BeautifulSoup(source_code, features='html.parser')
    for word in soup.find_all('a', {'class': 'title may-blank loggedin'}):
        content = word.string
        every_word = content.lower().split()
        for every in every_word:
            print(every)
            word_list.append(every)

get_words('https://www.reddit.com/r/nba/comments/hje9ud/kemba_walker_im_a_single_man_with_no_kids_so_ima/')
But when I run it, nothing shows up on the console (even when I try to iterate through the title and print out all the words). Is there a reason for this? Is it because I'm using a big site like Reddit as an example?
You are probably receiving a 502.
Instead of this:
source_code = requests.get(url).text
you should be doing this:
response = requests.get(url)
assert response.status_code == 200
source_code = response.text
and you will see it fail.
You need to add a "user agent" header. See this question:
502 error using Requests to search website in Python
Even at that, your soup selector is odd. You are looking for <a> elements with a 'title' class, but there are no such elements on that page. You should probably just get the <h1> elements.
So this would work:
import requests
from bs4 import BeautifulSoup

def get_words(url):
    word_list = []
    source_code = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
    soup = BeautifulSoup(source_code, features='html.parser')
    for word in soup.find_all('h1'):
        content = word.string
        every_word = content.lower().split()
        for every in every_word:
            print(every)
            word_list.append(every)

get_words('https://www.reddit.com/r/nba/comments/hje9ud/kemba_walker_im_a_single_man_with_no_kids_so_ima/')
I tested this and it finds duplicate headers, so maybe just use the first one, like this:
import requests
from bs4 import BeautifulSoup

def get_words(url):
    word_list = []
    source_code = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
    soup = BeautifulSoup(source_code, features='html.parser')
    content = soup.find_all('h1')[0].string
    every_word = content.lower().split()
    for every in every_word:
        print(every)
        word_list.append(every)

get_words('https://www.reddit.com/r/nba/comments/hje9ud/kemba_walker_im_a_single_man_with_no_kids_so_ima/')
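Since the original goal was a word frequency counter, here is a hedged sketch of the final step with collections.Counter; it assumes get_words() is modified to return word_list instead of only printing the words:
from collections import Counter

words = get_words('https://www.reddit.com/r/nba/comments/hje9ud/kemba_walker_im_a_single_man_with_no_kids_so_ima/')
frequencies = Counter(words)           # maps each word to its count
print(frequencies.most_common(5))      # the five most frequent words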

How to extract href links from anchor tags using BeautifulSoup?

I've been trying to extract just the links corresponding to the jobs on each page, but for some reason they don't print when I execute the script. No errors occur.
For the inputs I entered engineering and toronto respectively. Here is my code.
import requests
from bs4 import BeautifulSoup
import webbrowser

jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
r = requests.get(url)
rcontent = r.content
prettify = BeautifulSoup(rcontent, "html.parser")
all_job_url = []
for tag in prettify.find_all('div', {'data-tn-element': "jobTitle"}):
    for links in tag.find_all('a'):
        print(links['href'])
You should be looking for the anchor (a) tag. It looks like this:
<a class="turnstileLink" data-tn-element="jobTitle" href="/rc/clk?jk=3611ac98c0167102&fccid=459dce363200e1be" ...>Project <b>Engineer</b></a>
Call soup.find_all and iterate over the result set, extracting the links through the href attribute.
import requests
from bs4 import BeautifulSoup

# valid query, replace with something else
url = "https://ca.indeed.com/jobs?q=engineer&l=Calgary%2C+AB"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")

all_job_url = []
for tag in soup.find_all('a', {'data-tn-element': "jobTitle"}):
    all_job_url.append(tag['href'])
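Note that the href values in the sample anchor are site-relative (e.g. "/rc/clk?jk=..."), so to get clickable links you may want to join them against the base URL; a sketch using the standard library's urljoin:
from urllib.parse import urljoin

# Assumes all_job_url was filled by the loop above.
absolute_urls = [urljoin("https://ca.indeed.com", href) for href in all_job_url]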

Web crawler - following links

Please bear with me. I am quite new at Python, but having a lot of fun. I am trying to code a web crawler that crawls through election results from the last referendum in Denmark. I have managed to extract all the relevant links from the main page. Now I want Python to follow each of the 92 links and gather 9 pieces of information from each of those pages. But I am stuck. Hope you can give me a hint.
Here is my code:
import requests
import urllib2
from bs4 import BeautifulSoup

# This is the original url http://www.kmdvalg.dk/
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())

my_list = []
all_links = soup.find_all("a")
for link in all_links:
    link2 = link["href"]
    my_list.append(link2)

for i in my_list[1:93]:
    print i
# The output shows all the links that I would like to follow and gather information from. How do I do that?
Here is my solution using lxml. It's similar to BeautifulSoup.
import lxml
from lxml import html
import requests

page = requests.get('http://www.kmdvalg.dk/main')
tree = html.fromstring(page.content)
my_list = tree.xpath('//div[@class="LetterGroup"]//a/@href')  # grab all links
print 'Length of all links = ', len(my_list)
my_list is a list consisting of all the links. Now you can use a for loop to scrape the information inside each page.
We can loop through each link. Inside each page, you can extract information as in the example below. This covers only the top table.
table_information = []
for t in my_list:
    page_detail = requests.get(t)
    tree = html.fromstring(page_detail.content)
    table_key = tree.xpath('//td[@class="statusHeader"]/text()')
    table_value = tree.xpath('//td[@class="statusText"]/text()') + tree.xpath('//td[@class="statusText"]/a/text()')
    table_information.append(zip([t]*len(table_key), table_key, table_value))
For the table lower on the page:
table_information_below = []
for t in my_list:
    page_detail = requests.get(t)
    tree = html.fromstring(page_detail.content)
    l1 = tree.xpath('//tr[@class="tableRowPrimary"]/td[@class="StemmerNu"]/text()')
    l2 = tree.xpath('//tr[@class="tableRowSecondary"]/td[@class="StemmerNu"]/text()')
    table_information_below.append([t]+l1+l2)
Hope this helps!
A simple approach would be to iterate through your list of urls and parse them each individually:
for url in my_list:
    soup = BeautifulSoup(urllib2.urlopen(url).read())
    # then parse each page individually here
Alternatively, you could speed things up significantly using Futures.
from requests_futures.sessions import FuturesSession

def my_parse_function(html):
    """Use this function to parse each page"""
    soup = BeautifulSoup(html)
    all_paragraphs = soup.find_all('p')
    return all_paragraphs

session = FuturesSession(max_workers=5)
futures = [session.get(url) for url in my_list]
page_results = [my_parse_function(future.result().text) for future in futures]
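If you would rather avoid the extra dependency, here is a hedged sketch of the same idea with the standard library's ThreadPoolExecutor (Python 3; assumes my_list is defined as above):
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup

def fetch_and_parse(url):
    # Fetch one page and return its paragraph tags.
    html = requests.get(url).text
    return BeautifulSoup(html, 'html.parser').find_all('p')

with ThreadPoolExecutor(max_workers=5) as executor:
    page_results = list(executor.map(fetch_and_parse, my_list))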
This would be my solution for your problem:
import requests
from bs4 import BeautifulSoup

def spider():
    url = "http://www.kmdvalg.dk/main"
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for link in soup.findAll('div', {'class': 'LetterGroup'}):
        anc = link.find('a')
        href = anc.get('href')
        print(anc.getText())
        print(href)
        # call a second function that is similar to this one, with url set to href
        spider2(href)
        print("\n")

def spider2(linktofollow):
    url = linktofollow
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for link in soup.findAll('tr', {'class': 'tableRowPrimary'}):
        anc = link.find('td')
        print(anc.getText())
    print("\n")

spider()
It's not done: I only get a single element from the table, but you get the idea of how it's supposed to work.
Here is my final code that works smoothly. Please let me know if I could have done it smarter!
import urllib2
from bs4 import BeautifulSoup
import codecs

f = codecs.open("eu2015valg.txt", "w", encoding="iso-8859-1")
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())

liste = []
alle_links = soup.find_all("a")
for link in alle_links:
    link2 = link["href"]
    liste.append(link2)

for url in liste[1:93]:
    soup = BeautifulSoup(urllib2.urlopen(url).read().decode('iso-8859-1'))
    tds = soup.findAll('td')
    stemmernu = soup.findAll('td', class_='StemmerNu')
    print >> f, tds[5].string, ";", tds[12].string, ";", tds[14].string, ";", tds[16].string, ";", stemmernu[0].string, ";", stemmernu[1].string, ";", stemmernu[2].string, ";", stemmernu[3].string, ";", stemmernu[6].string, ";", stemmernu[8].string, ";", '\r\n'

f.close()
