Extract data from a specific cell in a table using BeautifulSoup? - python

I'm trying to extract the triage waiting times for a specific hospital to feed into other applications. Data from ALL local hospitals is available from: https://www.health.wa.gov.au/emergencyactivity/EDdata/edsv/
Here is the progress I have made so far:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.health.wa.gov.au/emergencyactivity/EDdata/edsv/'
headers = {
    "User-Agent": 'Mozilla/5.0 (X11; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'
}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
table_rows = soup.find_all('tr')
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
I want to extract only the triage time for Sir Charles Gairdner Hospital but have no clue how to do that. Any help would be much appreciated!

You are almost there. Try something like this:
from bs4 import Tag

table_rows = soup.select('tr td')
for tr in table_rows:
    if tr.text == 'Sir Charles Gairdner Hospital':
        for ns in tr.next_siblings:
            if isinstance(ns, Tag):
                print(ns.text)
Another alternative:
table = soup.select('table')[0]
for row in table:
    if isinstance(row, Tag):
        tds = row.select('td')
        if len(tds) > 0 and tds[0].text == 'Sir Charles Gairdner Hospital':
            for td in tds:
                print(td.text)
Output:
73
5
36
Edit:
To print just the triage waiting time for that location, use:
for tr in table_rows:
    if tr.text == 'Sir Charles Gairdner Hospital':
        print(tr.next_sibling.text)  # note: it's "next_sibling", not "next_siblings" this time
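If the cells happen to be separated by whitespace in the markup, tr.next_sibling can be a text node rather than the next td. A small variation (a sketch, in case that turns out to be the case on this page) is find_next_sibling('td'), which skips non-tag siblings:
for tr in table_rows:
    if tr.text == 'Sir Charles Gairdner Hospital':
        # find_next_sibling('td') ignores whitespace/NavigableString nodes between cells
        print(tr.find_next_sibling('td').text)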

Related

Python Web Scraping Yahoo - Result in empty list

import requests
import csv
from bs4 import BeautifulSoup
ticker = input('Enter the ticker symbol: ')
url = f'https://finance.yahoo.com/quote/{ticker}/history?p={ticker}'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', {'class': 'W(100%) M(0)'})
rows = table.tbody.find_all('tr')
stock_prices = []
for row in rows:
    cells = row.find_all('td')
    if cells:
        try:
            stock_prices.append(float(cells[4].text.replace(',', '')))
        except ValueError:
            print('Error parsing stock price')
print(stock_prices)
I'm trying to scrape Yahoo Finance for the "market close" prices of a given stock. I went through the HTML and am not sure which table row or cell I have wrong. The output list is empty.
Try to set User-Agent header when requesting the page from Yahoo:
import requests
from bs4 import BeautifulSoup
ticker = input("Enter the ticker symbol: ")
url = f"https://finance.yahoo.com/quote/{ticker}/history?p={ticker}"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0"
}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
table = soup.find("table", {"class": "W(100%) M(0)"})
rows = table.tbody.find_all("tr")
stock_prices = []
for row in rows:
    cells = row.find_all("td")
    if cells and len(cells) > 3:
        try:
            stock_prices.append(float(cells[4].text.replace(",", "")))
        except ValueError:
            print("Error parsing stock price")
print(stock_prices)
Prints (for example AAPL):
Enter the ticker symbol: AAPL
https://finance.yahoo.com/quote/AAPL/history?p=AAPL
[124.9, 129.93, 129.61, 126.04, 130.03, 131.86, 132.23, 135.45, 132.3, 132.37, 134.51, 136.5, 143.21, 145.47, 144.49, 142.16, 142.65, 140.94, 142.91, 146.63, 147.81, 148.31, 148.03, 141.17, 144.22, 148.11, 151.07, 150.18, 148.01, 151.29, 150.72, 148.79, 150.04, 148.28, 149.7, 146.87, 134.87, 139.5, 138.92, 138.38, 138.88, 145.03, 150.65, 153.34, 155.74, 144.8, 149.35, 152.34, 149.45, 147.27, 143.39, 143.86, 143.75, 142.41, 138.38, 142.99, 138.34, 138.98, 140.42, 140.09, 145.43, 146.4, 146.1, 142.45, 138.2, 142.48, 149.84, 151.76, 150.77, 150.43, 152.74, 153.72, 156.9, 154.48, 150.7, 152.37, 155.31, 153.84, 163.43, 157.37, 154.46, 155.96, 154.53, 155.81, 157.96, 157.22, 158.91, 161.38, 163.62, 170.03, 167.53, 167.23, 167.57, 171.52, 174.15, 174.55, 173.03, 173.19, 172.1]
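If the list still comes back empty even with the header set, it is worth checking that the table was found at all before iterating. A minimal guard (a sketch, assuming the same class-based lookup as above) could be:
table = soup.find("table", {"class": "W(100%) M(0)"})
if table is None or table.tbody is None:
    # Yahoo either blocked the request or changed the table markup
    raise SystemExit("Price table not found - inspect response.text to see what came back")
rows = table.tbody.find_all("tr")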

How can I use BeautifulSoup to scrape this table?

I am new to Python and learning data analysis. I am trying to scrape data from this web page: https://bitinfocharts.com/dogecoin/address/DN5Hp2kCkvCsdwr5SPmwHpiJgjKnC5wcT7
I am able to scrape data with simple websites but I think since BitInfoCharts has tables it may be a more complex HTML setup than the tutorials I am following.
My goal is to scrape the data from the table (Block, Time, Amount, Balance, etc.) and save it to a CSV file. I previously tried using pandas but found it difficult to select the data I want from the HTML.
To do this, I think I need to get the header/table information from the element with class="table abtb tablesorter tablesorter-default" and then pull the information from each row inside it that has class="trb". The number of trb rows changes from page to page (for example, one address may have 7 transactions and another 40). I am not exactly sure, though, as this is new territory for me.
I would really appreciate any help.
import requests
from bs4 import BeautifulSoup as bs
url = 'https://bitinfocharts.com/dogecoin/address/DN5Hp2kCkvCsdwr5SPmwHpiJgjKnC5wcT7'
headers = {"User-Agent":"Mozilla/5.0"}
r = requests.get(url, headers=headers)
soup = bs(r.content)
table = soup.find_all("table_maina")
print(table)
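Before parsing the rows by hand, the table may be parseable straight into pandas (a sketch; read_html needs lxml or html5lib installed, and the request still needs the User-Agent header to avoid being blocked):
import io

import pandas as pd
import requests

url = 'https://bitinfocharts.com/dogecoin/address/DN5Hp2kCkvCsdwr5SPmwHpiJgjKnC5wcT7'
headers = {"User-Agent": "Mozilla/5.0"}
r = requests.get(url, headers=headers)

# read_html returns one DataFrame per matching <table>; attrs narrows it
# down to the transactions table by its id
df = pd.read_html(io.StringIO(r.text), attrs={"id": "table_maina"})[0]
df.to_csv('x.csv', index=False)
print(df)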
If you do decide to do it manually, this does the same thing:
import csv
import requests
from bs4 import BeautifulSoup as bs
url = 'https://bitinfocharts.com/dogecoin/address/DN5Hp2kCkvCsdwr5SPmwHpiJgjKnC5wcT7'
headers = {"User-Agent":"Mozilla/5.0"}
r = requests.get(url, headers=headers)
soup = bs(r.content,'lxml')
table = soup.find(id="table_maina")
headers = []  # note: this reuses the name of the request-headers dict above, which is no longer needed
datarows = []
for row in table.find_all('tr'):
    heads = row.find_all('th')
    if heads:
        headers = [th.text for th in heads]
    else:
        datarows.append([td.text for td in row.find_all('td')])

fcsv = csv.writer(open('x.csv', 'w', newline=''))
fcsv.writerow(headers)
fcsv.writerows(datarows)
There is only one table element with the id 'table_maina', so you should call find() instead of find_all(). Also, you need to specify the "table" tag as the first argument of find().
Try:
table = soup.find('table', id='table_maina')
for tr in table.find_all('tr', class_='trb'):
    print(tr.text)
Output:
4066317 2022-01-17 15:41:22 UTC2022-01-17 15:41:22 UTC-33,000,000 DOGE (5,524,731.65 USD)220,000,005.04121223 DOGE$36,831,545 # $0.167$-28,974,248
4063353 2022-01-15 11:04:46 UTC2022-01-15 11:04:46 UTC+4,000,000 DOGE (759,634.87 USD)253,000,005.04121223 DOGE$48,046,907 # $0.19$-23,283,618
...
Next, to write each row out to a CSV file, try this:
import csv
import requests
from bs4 import BeautifulSoup
url = 'https://bitinfocharts.com/dogecoin/address/DN5Hp2kCkvCsdwr5SPmwHpiJgjKnC5wcT7'
headers = {"User-Agent": "Mozilla/5.0"}
r = requests.get(url, headers=headers, verify=False)
soup = BeautifulSoup(r.content, "html.parser")
table = soup.find("table", id='table_maina')
with open('out.csv', 'w', newline='') as fout:
    csv_writer = csv.writer(fout)
    csv_writer.writerow(['Block', 'Time', 'Amount', 'Balance', 'Price', 'Profit'])
    for tr in table.find_all('tr', class_='trb'):
        tds = tr.find_all('td')
        csv_writer.writerow([x.text for x in tds])
Output:
Block,Time,Amount,Balance,Price,Profit
4066317 2022-01-17 15:41:22 UTC,2022-01-17 15:41:22 UTC,"-33,000,000 DOGE (5,524,731.65 USD)","220,000,005.04121223 DOGE","$36,831,545 # $0.167","$-28,974,248"
...

web scraping can't get data of all links in page at same time

For a while now I have been trying to crawl all the vessel data from vesselfinder, including each vessel's description page, so that I can get information such as vessel type and IMO number in table form. I have tried different ways to do this but still get a lot of errors. First I worked out how to follow the links to the description pages, how to collect those links from all the listing pages, and how to get some (though not yet all) of the table data from a description page.
But today, when I combined the code and tried to get the data from all the links and their description pages at the same time, I got a lot of errors that left me confused.
I attached my code. It is not good, but it works up to the point #print(len(vessellist)); after that come the errors.
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers = {
    'user-agent': 'Mozilla/5.0',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
baseurl = 'https://www.vesselfinder.com/vessels'
vessellist = []
for x in range(1, 6):
    response = requests.get(
        f'https://www.vesselfinder.com/vessels?page={x}',
        headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    contents = soup.find_all('td', class_='v2')
    for property in contents:
        for item in property.find_all('a', href=True):
            vessellist.append(baseurl + item['href'])

for link in vessellist:
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='tparams')
    head = []
    for i in table.find_all('td', class_='n3'):
        title = i.text
        head.append(title)
    values = []
    for row in table.find_all('td', class_='v3'):
        data = row.text
        values.append(data)
    df = pd.DataFrame(values)
    print(df)
Two steps: first get the summary data (which includes the href), then get the detailed data. These two steps are implemented in the two functions below. Here I fetch only the first 10 pages; 200 are available.
import requests as rq
from bs4 import BeautifulSoup as bs

headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"}

def getSummaryData():
    data = []
    url = "https://www.vesselfinder.com/vessels"
    for page in range(1, 10 + 1, 1):  # only the first 200 pages seem to be authorized?
        print("Page : %d/10" % page)
        resp = rq.get(url + "?page=%s" % page, headers=headers)
        soup = bs(resp.content, "lxml")
        section = soup.find_all('section', {'class', 'listing'})[0]
        tbody = section.find_all('tbody')[0]
        trs = tbody.find_all('tr')
        for tr in trs:
            tds = tr.find_all('td')
            # column 1 data
            sub = tds[1].find('a')
            href = sub['href']
            divs = sub.find_all('div')
            country = divs[0]['title']
            sub_divs = divs[1].find_all('div')
            vessel_name = sub_divs[0].text
            vessel_type = sub_divs[1].text
            # column 2 data
            build_year = tds[2].text
            # column 3 data
            gt = tds[3].text
            # column 4 data
            dwt = tds[4].text
            # column 5 data
            size = tds[5].text
            # save data
            tr_data = {'country': country,
                       'vessel_name': vessel_name,
                       'vessel_type': vessel_type,
                       'build_year': build_year,
                       'gt': gt,
                       'dwt': dwt,
                       'size': size,
                       'href': href}
            data.append(tr_data)
    return data

def getDetailledData(data):
    for (iel, el) in enumerate(data):
        print("%d/%d" % (iel + 1, len(data)))
        url = "https://www.vesselfinder.com" + el['href']
        # make get call
        resp = rq.get(url, headers=headers)
        soup = bs(resp.content, "lxml")
        # position and voyage data
        table = soup.find_all('table', {'class', 'aparams'})[0]
        trs = table.find_all('tr')
        labels = ["course_speed", "current_draught", "navigation_status",
                  "position_received", "IMO_MMSI", "callsign", "flag", "length_beam"]
        for (i, tr) in enumerate(trs):
            td = tr.find_all('td')[1]
            el.update({'%s' % labels[i]: td.text})
        # vessel particulars
        table = soup.find_all('table', {'class', 'tparams'})[0]
        trs = table.find_all('tr')
        labels = ["IMO_number", "vessel_name", "ship_type", "flag",
                  "homeport", "gross_tonnage", "summer_deadweight_t",
                  "length_overall_m", "beam_m", "draught_m", "year_of_built",
                  "builder", "place_of_built", "yard", "TEU", "crude", "grain",
                  "bale", "classification_society", "registered_owner", "manager"]
        for (i, tr) in enumerate(trs):
            td = tr.find_all('td')[1]
            el.update({'%s' % labels[i]: td.text})
        # break
    return data
Call these functions:
data = getSummaryData()  # hrefs included
data = getDetailledData(data)
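Since the goal is to have the data in table form, the list of dicts returned by getDetailledData() can then be loaded into pandas (assuming pandas is installed, as in the question's own code):
import pandas as pd

df = pd.DataFrame(data)  # one row per vessel, one column per label
df.to_csv('vessels.csv', index=False)
print(df.head())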
Don't rely only on the 'class' attribute to target the data. Generally, you should go through table -> tbody and then take the trs or tds from there, to be sure you have the correct ones.
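For example, a minimal sketch of that table -> tbody route on the summary page (reusing the imports and headers from the functions above; the column positions follow the code above and may change if the site does):
resp = rq.get("https://www.vesselfinder.com/vessels?page=1", headers=headers)
soup = bs(resp.content, "lxml")

table = soup.find("section", class_="listing").find("table")
tbody = table.find("tbody")  # data rows live in <tbody>, the header row in <thead>
for tr in tbody.find_all("tr"):
    tds = tr.find_all("td")
    print(tds[2].text, tds[3].text)  # build year and gross tonnage columns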

Extract webpage elements up to a point

I want to extract elements from a webpage up to a point, and that is when it reaches this line on the webpage: <div class="clear"></div>. This appears twice on the webpage I am trying to extract, so I wanted to extract all the elements before the first one and then break.
For example:
hhref = ['https://www.ukfirestations.co.uk/stations/bedfordshire','https://www.ukfirestations.co.uk/stations/buckinghamshire']
dats = []
for i in range(0, 2, 1):
    r = requests.get(hhref[i], headers=headers)
    soup = BeautifulSoup(r.content)
    station = soup.find('div', {'id': 'stations-grid'}).find_all('a')
    for j in station:
        dats.append(j['href'])
This extracts all the information, including those after <div class="clear"></div>. The webpage splits the stations I am after by 'current' and 'old'. I wanted to grab only those in the 'current' section, though I am unsure of how I can tell BeautifulSoup to extract elements up to a point.
You can use a CSS selector with :not() to filter out the unwanted stations:
import requests
from bs4 import BeautifulSoup
hhref = [
    "https://www.ukfirestations.co.uk/stations/bedfordshire",
    "https://www.ukfirestations.co.uk/stations/buckinghamshire",
]

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"
}

for link in hhref:
    print(f"{link=}")
    print()

    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, "html.parser")

    stations = soup.select(
        'h2:-soup-contains("Current Stations") ~ .stations-row .station:not(h2:-soup-contains("Old Stations") ~ .stations-row .station)'
    )
    for s in stations:
        print(
            "{:<30} {}".format(s.select_one(".station-name").text, s.a["href"])
        )
    print()
Prints:
link='https://www.ukfirestations.co.uk/stations/bedfordshire'
Ampthill https://www.ukfirestations.co.uk/station/ampthill
Bedford https://www.ukfirestations.co.uk/station/bedford
Biggleswade https://www.ukfirestations.co.uk/station/biggleswade
Dunstable https://www.ukfirestations.co.uk/station/dunstable
Harrold https://www.ukfirestations.co.uk/station/harrold
Headquarters https://www.ukfirestations.co.uk/station/headquarters-10
Kempston https://www.ukfirestations.co.uk/station/kempston
Leighton Buzzard https://www.ukfirestations.co.uk/station/leighton-buzzard
Luton https://www.ukfirestations.co.uk/station/luton
Potton https://www.ukfirestations.co.uk/station/potton
Sandy https://www.ukfirestations.co.uk/station/sandy
Shefford https://www.ukfirestations.co.uk/station/shefford
Stopsley https://www.ukfirestations.co.uk/station/stopsley
Toddington https://www.ukfirestations.co.uk/station/toddington
Woburn https://www.ukfirestations.co.uk/station/woburn
link='https://www.ukfirestations.co.uk/stations/buckinghamshire'
Amersham https://www.ukfirestations.co.uk/station/amersham
Aylesbury & HQ https://www.ukfirestations.co.uk/station/aylesbury-hq
Beaconsfield https://www.ukfirestations.co.uk/station/beaconsfield
Brill https://www.ukfirestations.co.uk/station/brill
Broughton https://www.ukfirestations.co.uk/station/broughton
Buckingham https://www.ukfirestations.co.uk/station/buckingham
Chesham https://www.ukfirestations.co.uk/station/chesham
Gerrards Cross https://www.ukfirestations.co.uk/station/gerrards-cross
Great Missenden https://www.ukfirestations.co.uk/station/great-missenden
Haddenham https://www.ukfirestations.co.uk/station/haddenham
High Wycombe https://www.ukfirestations.co.uk/station/high-wycombe
Marlow https://www.ukfirestations.co.uk/station/marlow
Newport Pagnell https://www.ukfirestations.co.uk/station/newport-pagnell
Olney https://www.ukfirestations.co.uk/station/olney
Princes Risborough https://www.ukfirestations.co.uk/station/princes-risborough
Stokenchurch https://www.ukfirestations.co.uk/station/stokenchurch
Waddesdon https://www.ukfirestations.co.uk/station/waddesdon
West Ashland https://www.ukfirestations.co.uk/station/milton-keynes
Winslow https://www.ukfirestations.co.uk/station/winslow
Or use .find_previous() to check whether you are in the correct section:
for link in hhref:
    print(f"{link=}")
    print()

    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, "html.parser")

    for s in soup.select(".station"):
        h2 = s.find_previous("h2")
        if h2.get_text(strip=True) != "Current Stations":
            continue
        print(
            "{:<30} {}".format(s.select_one(".station-name").text, s.a["href"])
        )
    print()
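A third option, closer to the original idea of stopping at the first <div class="clear"></div>, is to walk the children of the grid and break when that divider appears (a sketch reusing hhref and headers from above, and assuming the divider is a direct child of the #stations-grid container):
from bs4 import BeautifulSoup, Tag
import requests

r = requests.get(hhref[0], headers=headers)
soup = BeautifulSoup(r.content, "html.parser")

dats = []
grid = soup.find("div", {"id": "stations-grid"})
for el in grid.children:
    if not isinstance(el, Tag):
        continue  # skip whitespace text nodes
    if el.name == "div" and "clear" in (el.get("class") or []):
        break  # everything after this divider belongs to the "old" section
    dats.extend(a["href"] for a in el.find_all("a", href=True))
print(dats)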

Having trouble in scraping table data using beautiful soup

I would like to scrape the table data from this site. I've tried the code below but for whatever reason, BS4 seems unable to fetch the table data:
import bs4 as bs
import urllib.request
sauce = urllib.request.urlopen('https://drafty.cs.brown.edu/csprofessors').read()
soup = bs.BeautifulSoup(sauce, 'lxml')
table = soup.find('table', attrs={"id": "table"})
table_rows = table.find_all('tr')
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
I would really appreciate your help :)
You used the wrong tag and id name to find the right table. The following should work:
import bs4 as bs
import urllib.request
sauce = urllib.request.urlopen('https://drafty.cs.brown.edu/csprofessors').read()
soup = bs.BeautifulSoup(sauce, 'lxml')
table = soup.find('template', attrs={"id":"table-data"})
for tr in table.find_all('tr'):
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
import requests
from bs4 import BeautifulSoup as bs4

url = 'https://drafty.cs.brown.edu/csprofessors'
response = requests.get(url)
if response.ok:
    data = list()
    soup = bs4(response.text, 'html.parser')
    fullnames = soup.select('td:nth-child(1)')
    university = soup.select('td:nth-child(2)')
    join_year = soup.select('td:nth-child(3)')
    sub_field = soup.select('td:nth-child(4)')
    bachelors = soup.select('td:nth-child(5)')
    doctorate = soup.select('td:nth-child(6)')
    # build one dict per row, taking the text of the matching cell from each column list
    for item in range(len(fullnames)):
        data.append(
            {
                'fullnames': fullnames[item].text,
                'university': university[item].text,
                'join_year': join_year[item].text,
                'sub_field': sub_field[item].text,
                'bachelors': bachelors[item].text,
                'doctorate': doctorate[item].text,
            }
        )
You can simply use selenium combined with pandas to scrape the table. Here is how you do it:
import pandas as pd
from selenium import webdriver
import time
url = 'https://drafty.cs.brown.edu/csprofessors'
driver = webdriver.Chrome()
driver.get(url)
time.sleep(2)
driver.find_element_by_xpath('//*[@id="welcome-screen"]/div/div/div[1]/button').click()
time.sleep(1)
page = driver.page_source
df = pd.read_html(page)[0]
print(df)
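One caveat: the find_element_by_* helpers were removed in Selenium 4.3, so on a current install the click would be written with By instead:
from selenium.webdriver.common.by import By

driver.find_element(By.XPATH, '//*[@id="welcome-screen"]/div/div/div[1]/button').click()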
