bs4 scraping python get contents until specific class name - python

I want to scrape this site
https://www.eduvision.edu.pk/institutions-detail.php?city=51I&institute=5_allama-iqbal-open-university-islamabad
and I want only the BACHELOR data on this page, which sits under the class name academicsList; I don't want the MS (MASTERS) data that follows it.
I want my scraper to stop before the MS data. My idea is to keep a temporary counter on class=academicsHead and stop when the second academicsHead is reached.
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Fetch the institution page with a browser-like user agent.
ua = UserAgent()
headers = {'user-agent': ua.chrome}
url = 'https://www.eduvision.edu.pk/institutions-detail.php?city=51I&institute=5_allama-iqbal-open-university-islamabad'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Every <ul class="academicsList"> on the page; the first <li> holds its heading.
disciplines = soup.findAll("ul", {"class": "academicsList"})
# TODO: stop once the second academicsHead section is reached -- not implemented yet.
for discipline in disciplines:
    print(discipline.find('li').text)

We can check whether the class is 'academicsHead'; if it is, check whether the text is BACHELOR, and if not, break out of the loop.
Something like this would work:
import re  # the original snippet used re.compile without importing re

# Grab every <ul> whose class starts with "academics" -- this yields the
# section headers (academicsHead) interleaved with their course lists
# (academicsList) in document order.
disciplines = soup.findAll('ul', attrs={'class': re.compile(r'academics\w+')})
for section in disciplines:
    if section['class'][0] == 'academicsHead':
        # Stop as soon as a header other than BACHELOR (e.g. MS) appears.
        if section.find('li').text.strip() != 'BACHELOR':
            break
    else:
        print(section.find('li').text.strip())

Related

How to find specific text under multiple spans in Beautifulsoup?

I want to extract the IPA keys under the French section of the wiki page:
https://en.wiktionary.org/wiki/son#French
I want only the data in the french section.
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
import requests
import pandas as pd
# NOTE(review): indentation was lost when this snippet was pasted; the body
# below belongs inside main(), and the append belongs inside the for loop.
def main():
test_url_page = 'https://en.wiktionary.org/wiki/son#French'
req = requests.get(test_url_page)
content = req.text
ipa_data = []  # collected IPA <span> elements
soup = BeautifulSoup(content, 'html.parser')
# BUG: `{'class':'mw-headline'} and {'id':'French'}` evaluates to just
# {'id':'French'} -- `and` returns its second operand, it does not merge dicts.
french_section = soup.find('span', {'class':'mw-headline'} and {'id':'French'})
# BUG: find_next() returns a single Tag, so this loop iterates that one tag's
# children rather than every IPA span in the section.
for fr_ipas in french_section.find_next('span', {'class':'IPA'}):
ipa_data.append(fr_ipas)
# find_all_next() keeps matching past the end of the French section.
fr_ipas_all = french_section.find_all_next('span', {'class':'IPA'})
find_next only returns the first element under the french section.
find_all and find_all_next returns a list of all the elements within the html.
I just want the elements under the french section. There are multiple IPA keys under the french section.
You are close to your goal, but you have to check whether each of the sibling elements from .find_next_siblings() contains your IPA element, and break out of the iteration once you reach an <hr>, which marks the start of the next section:
french_section = soup.find('span', {'id': 'French'}).parent
for tag in french_section.find_next_siblings():
    # An <hr> separates language sections on Wiktionary -- stop there.
    # BUG FIX: compare the tag *name*; a Tag object never equals the string 'hr'.
    if tag.name == 'hr':
        break
    if tag.find('span', {'class': 'IPA'}):
        # BUG FIX: the original line was missing its closing parenthesis.
        ipa_data.append(tag.find('span', {'class': 'IPA'}))
Example
from bs4 import BeautifulSoup
import requests

def main():
    """Collect the IPA <span> tags that appear in the French section only."""
    test_url_page = 'https://en.wiktionary.org/wiki/son#French'
    req = requests.get(test_url_page)
    content = req.text
    ipa_data = []
    soup = BeautifulSoup(content, 'html.parser')
    # The headline span's parent element marks where the French section starts.
    french_section = soup.find('span', {'id': 'French'}).parent
    for tag in french_section.find_next_siblings():
        # BUG FIX: compare tag.name, not the Tag object itself -- a Tag never
        # equals the plain string 'hr', so the original loop never stopped and
        # collected IPA spans from every later language section too.
        if tag.name == 'hr':
            break
        if tag.find('span', {'class': 'IPA'}):
            ipa_data.append(tag.find('span', {'class': 'IPA'}))
    return ipa_data

main()

Pulling p tags from multiple URLs

I've struggled on this for days and not sure what the issue could be - basically, I'm trying to extract the profile box data (picture below) of each link -- going through inspector, I thought I could pull the p tags and do so.
I'm new to this and trying to understand, but here's what I have thus far:
-- a code that (somewhat) succesfully pulls the info for ONE link:
import requests
from bs4 import BeautifulSoup

# Fetch one player page and pull every <p> out of the main container.
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
# BUG FIX: attrs takes a dict; {'class', 'main-container'} was a *set*
# (comma typed instead of a colon), so the class filter was not applied
# the way the author intended.
container = soup.find('div', attrs={'class': 'main-container'})
playerinfo = container.find_all('p')
print(playerinfo)
I then also have a code that pulls all of the HREF tags from multiple links:
from bs4 import BeautifulSoup
import requests

def get_links(url):
    """Print every href found on *url*, then a count of how many there were."""
    links = []
    website = requests.get(url)
    website_text = website.text
    # Name the parser explicitly: relying on bs4's auto-detection emits a
    # GuessedAtParserWarning and can pick different parsers on different
    # machines, changing the results.
    soup = BeautifulSoup(website_text, 'html.parser')
    for link in soup.find_all('a'):
        links.append(link.get('href'))
    for link in links:
        print(link)
    print(len(links))

get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two, and get one code that will pull all of the P tags from multiple URLs. I've been trying to do it, and I'm really not sure at all why this isn't working here:
from bs4 import BeautifulSoup
import requests

def get_profile(url):
    """Print the text of every <p> inside the page's main container."""
    profiles = []
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    # BUG FIX: attrs must be a dict ({'class': ...}); the original passed a
    # set ({'class', 'main-container'}), a comma typo for a colon.
    container = soup.find('div', attrs={'class': 'main-container'})
    # BUG FIX: the original iterated <a> tags and called .get('p'), which reads
    # a (nonexistent) "p" HTML *attribute* and always yields None. To pull the
    # profile paragraphs, iterate the <p> tags themselves.
    for profile in container.find_all('p'):
        profiles.append(profile.get_text(strip=True))
    for profile in profiles:
        print(profile)

get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python but any advice would be greatly appreciated. Ultimately, my end goal is to have a tool that can scrape this data in a clean way all at once.
(Player name, Current Team, Born, Birthplace, etc).. maybe I'm doing it entirely wrong but any guidance is welcome!
You need to combine your two scripts together and make requests for each player. Try the following approach. This searches for <td> tags that have the data-td=Player attribute:
import requests
from bs4 import BeautifulSoup

def get_links(url):
    """Scrape one roster-listing page and return a list of per-player dicts.

    Each dict holds the player's name, profile URL, and every "Key: Value"
    pair found in the profile box on the player's own page.
    """
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")
    # The listing table marks player cells with data-th="Player".
    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")
        req_player_url = requests.get(f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")
        div_profile_box = soup_player.find("div", class_="profile-box")
        row = {"Name": name, "URL": player_url}
        for p in div_profile_box.find_all("p"):
            try:
                key, value = p.get_text(strip=True).split(':', 1)
                row[key.strip()] = value.strip()
            # Narrowed from a bare `except:` -- only the unpack can fail here
            # (no colon in the text), and a bare except would also hide real
            # bugs such as KeyboardInterrupt or typos.
            except ValueError:  # entry has no "Key: Value" form -- skip it
                pass
        data.append(row)
    return data

urls = [
    'https://basketball.realgm.com/dleague/players/2022',
    'https://basketball.realgm.com/dleague/players/2021',
    'https://basketball.realgm.com/dleague/players/2020',
]

for url in urls:
    print(f"Getting: {url}")
    data = get_links(url)
    for entry in data:
        print(entry)

python crawling beautifulsoup how to crawl several pages?

Please Help.
I want to get all the company names of each pages and they have 12 pages.
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1
http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/2
-- this website only changes the number.
So Here is my code so far.
Can I get just the title (company name) of 12 pages?
Thank you in advance.
from bs4 import BeautifulSoup
import requests
# BUG: maximum is 0, so range(1, maximum+1) below is empty and the paging
# loop never runs -- it should be the real page count (12).
maximum = 0
page = 1  # unused
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/1'
response = requests.get(URL)
source = response.text
soup = BeautifulSoup(source, 'html.parser')
whole_source = ""
# NOTE(review): indentation was lost in the paste; the three lines after the
# `for` belong inside it, and everything from `soup = ...` down runs after it.
for page_number in range(1, maximum+1):
URL = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/' + str(page_number)
response = requests.get(URL)
whole_source = whole_source + response.text
soup = BeautifulSoup(whole_source, 'html.parser')
# NOTE(review): li:nth-child(13) pins the selector to a single list item, so
# at most one company per page can ever match.
find_company = soup.select("#content > div.wrap_analysis_data > div.public_con_box.public_list_wrap > ul > li:nth-child(13) > div > strong")
for company in find_company:
print(company.text)
---------Output of one page
---------page source :)
So, you want to remove all the headers and get only the string of the company name?
Basically, you can use the soup.findAll to find the list of company in the format like this:
<strong class="company"><span>중소기업진흥공단</span></strong>
Then you use the .find function to extract information from the <span> tag:
<span>중소기업진흥공단</span>
After that, you use .contents function to get the string from the <span> tag:
'중소기업진흥공단'
So you write a loop to do the same for each page, and make a list called company_list to store the results from each page and append them together.
Here's the code:
from bs4 import BeautifulSoup
import requests

LAST_PAGE = 12
company_list = []  # one name string per company, accumulated across all pages

for page in range(1, LAST_PAGE + 1):
    page_url = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(page)
    response = requests.get(page_url)
    print(page)
    parsed = BeautifulSoup(response.text, 'html.parser')
    # Each company name sits in <strong class="company"><span>NAME</span></strong>;
    # .contents[0] pulls the raw name string out of the <span>.
    for tag in parsed.findAll('strong', attrs={'class': 'company'}):
        company_list.append(tag.find('span').contents[0])
The company_list will give you all the company names you want
I figured it out eventually. Thank you for your answer though!
image : code captured in jupyter notebook
Here is my final code.
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Collect company names from all 12 listing pages.
company_list = []
for n in range(12):
    url = 'http://www.saramin.co.kr/zf_user/jobs/company-labs/list/page/{}'.format(n+1)
    webpage = urlopen(url)
    source = BeautifulSoup(webpage, 'html.parser', from_encoding='utf-8')
    companys = source.findAll('strong', {'class': 'company'})
    for company in companys:
        # Strip surrounding whitespace and any embedded line/tab breaks.
        company_list.append(company.get_text().strip().replace('\n', '').replace('\t', '').replace('\r', ''))

# Write one company per line. `with` guarantees the file is closed (and the
# buffer flushed) even if a write fails -- the original's manual open/close
# left the file open on any error.
with open('company_name1.txt', 'w', encoding='utf-8') as file:
    for company in company_list:
        file.write(company + '\n')

ESPN.com Python web scraping issue

I am trying to pull data for the rosters for all college football teams because I want to run some analysis on team performance based on composition of their roster.
My script is working on the first page, and it iterates over each team and can open the rosters link for each team, but then the Beautiful Soup commands I am running on the rosters page for a team keep throwing Index Errors. When I look at the HTML, it seems as if the commands I am writing should work yet when I print the page source from the Beautiful Soup I don't see what I see in Developer Tools in Chrome. Is this some instance of JS being used to serve up the content? If so, I thought Selenium got around this?
My code...
import requests
import csv
from bs4 import BeautifulSoup
from selenium import webdriver

# Load the team index in a real browser so JS-served content is rendered.
teams_driver = webdriver.Firefox()
teams_driver.get("http://www.espn.com/college-football/teams")
teams_html = teams_driver.page_source
teams_soup = BeautifulSoup(teams_html, "html5lib")

i = 0
for link_html in teams_soup.find_all('a'):
    if link_html.text == 'Roster':
        roster_link = 'https://www.espn.com' + link_html['href']
        roster_driver = webdriver.Firefox()
        roster_driver.get(roster_link)
        # BUG FIX: read the page source from roster_driver, not teams_driver --
        # the original re-parsed the teams page here, so the roster selectors
        # below matched nothing and the [0] lookups raised IndexError.
        roster_html = roster_driver.page_source
        roster_soup = BeautifulSoup(roster_html, "html5lib")
        team_name_html = roster_soup.find_all('a', class_='sub-brand-title')[0]
        team_name = team_name_html.find_all('b')[0].text
        for player_html in roster_soup.find_all('tr', class_='oddrow'):
            player_name = player_html.find_all('a')[0].text
            player_pos = player_html.find_all('td')[2].text
            player_height = player_html.find_all('td')[3].text
            player_weight = player_html.find_all('td')[4].text
            player_year = player_html.find_all('td')[5].text
            player_hometown = player_html.find_all('td')[6].text
            print(team_name)
            print('\t', player_name)
        roster_driver.close()

teams_driver.close()
In your for loop you're using the html of the 1st page (roster_html = teams_driver.page_source), so you get an index error when you try to select the 1st item of team_name_html because find_all returns an empty list.
Also you don't need to have all those instances of Firefox open, you can close the driver when you have the html.
# Grab the page source once, then shut the browser down -- the driver is not
# needed after the HTML has been copied out.
teams_driver = webdriver.Firefox()
teams_driver.get("http://www.espn.com/college-football/teams")
teams_html = teams_driver.page_source
teams_driver.quit()
But you don't have to use selenium for this task, you can get all the data with requests and bs4.
import requests
from bs4 import BeautifulSoup

# Selenium is unnecessary here: the roster pages render fine from plain HTTP.
r = requests.get("http://www.espn.com/college-football/teams")
teams_soup = BeautifulSoup(r.text, "html5lib")

for anchor in teams_soup.find_all('a'):
    # Only the "Roster" links point at team roster pages.
    if anchor.text != 'Roster':
        continue
    roster_link = 'https://www.espn.com' + anchor['href']
    r = requests.get(roster_link)
    roster_soup = BeautifulSoup(r.text, "html5lib")
    team_name = roster_soup.find('a', class_='sub-brand-title').find('b').text
    # Each roster row is a <tr class="oddrow">; cells 2-6 hold the player data.
    for row in roster_soup.find_all('tr', class_='oddrow'):
        player_name = row.find_all('a')[0].text
        cells = row.find_all('td')
        player_pos = cells[2].text
        player_height = cells[3].text
        player_weight = cells[4].text
        player_year = cells[5].text
        player_hometown = cells[6].text
        print(team_name, player_name, player_pos, player_height, player_weight, player_year, player_hometown)

I want to crawl data from 1 to 10 pages automatically from website.How can i do it?

# Python 2 snippet (print statement). Scrapes entries from listing page 2.
import requests
from bs4 import BeautifulSoup
My_Url = "http://questions.consumercomplaints.in/page/2"
Data = requests.get(My_Url)
# NOTE(review): no parser named -- bs4 guesses one and warns.
Soup = BeautifulSoup(Data.content)
# NOTE(review): passing a dict as find_all's *name* argument is unusual;
# presumably this was meant to filter <div> tags -- verify the intent.
head_id = Soup.find_all({"div":"href"})
len(head_id)  # result is discarded; this statement has no effect
for i in head_id:
print i.text
With the code above I scraped the (reviews/complaints) from page 2.
How do I automatically crawl the data from all the pages (e.g. http://questions.consumercomplaints.in/page/3)?
Why not surround your function in a ranged for loop?
import requests
from bs4 import BeautifulSoup

# Pages 3-10 follow the same URL pattern, so just loop over the page number.
for page in range(3, 11):
    My_Url = "http://questions.consumercomplaints.in/page/" + str(page)
    Data = requests.get(My_Url)
    # Name the parser explicitly instead of letting bs4 guess one.
    Soup = BeautifulSoup(Data.content, 'html.parser')
    head_id = Soup.find_all({"div":"href"})
    # Renamed the inner loop variable: the original reused `i` for both the
    # page counter and the scraped entry, shadowing the outer loop variable.
    # (The original's bare `len(head_id)` no-op statement was dropped.)
    for entry in head_id:
        print(entry.text)
Have a look at how the range function works here.

Categories