Python BeautifulSoup parsing HTML table - td data missing

As the subject says, I am trying to fetch this table with BeautifulSoup:
http://www.hkjc.com/english/racing/Horse.asp?HorseNo=T421
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import lxml
import xlrd
HorseNo = ["T421"]
driver = webdriver.PhantomJS(r'D:\Program Files\Python\Path\PhantomJS\bin\phantomjs.exe')
#driver = webdriver.Chrome(r'D:\Program Files\Python\Path\chromedriver.exe')
url = "http://www.hkjc.com/english/racing/horse.asp?HorseNo=" + HorseNo[0]  # str(HorseNo) would embed the list brackets in the URL
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, "lxml")
table = soup.find("table", {"class": "bigborder", "width": "970"}).findAll("tr")
print(table)
for row in table:
    cells = row.findAll("td")
    print(cells)
The print(table) result looks fine, but print(cells) does not return every td in the table. Could somebody advise me further? Thanks.

Try the below using requests instead of selenium:
from bs4 import BeautifulSoup
import requests
HorseNo = ["T421"]
url = "http://www.hkjc.com/english/racing/horse.asp?HorseNo=" + HorseNo[0]
html = requests.get(url).text
soup = BeautifulSoup(html, "lxml")
table = soup.find("table", {"class": "bigborder", "width": "970"}).findAll("tr")
cells = []
for row in table:
    cell = row.findAll("td")
    cells.append(cell)
print(cells)
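Note that cells ends up as a list of lists of Tag objects. If you want the visible text of each td instead, here is a minimal sketch building on the same soup (get_text is standard BeautifulSoup; the exact output depends on what the page serves):
rows = []
for row in table:
    # collect the stripped text of every cell in this row
    rows.append([td.get_text(strip=True) for td in row.findAll("td")])
print(rows)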

Related

Having trouble scraping table data using Beautiful Soup

I would like to scrape the table data from this site. I've tried the code below but for whatever reason, BS4 seems unable to fetch the table data:
import bs4 as bs
import urllib.request
sauce = urllib.request.urlopen('https://drafty.cs.brown.edu/csprofessors').read()
soup = bs.BeautifulSoup(sauce, 'lxml')
table = soup.find('table', attrs={"id": "table"})
table_rows = table.find_all('tr')
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
I would really appreciate your help :)
You used the wrong tag and id to locate the table: the data actually sits inside a template element with id "table-data". The following should work:
import bs4 as bs
import urllib.request
sauce = urllib.request.urlopen('https://drafty.cs.brown.edu/csprofessors').read()
soup = bs.BeautifulSoup(sauce, 'lxml')
table = soup.find('template', attrs={"id":"table-data"})
for tr in table.find_all('tr'):
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
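If you then want those rows in a DataFrame, here is a minimal sketch reusing the same table element (pandas is an extra assumption here, and the column names are left as pandas defaults since the template does not obviously label them):
import pandas as pd

rows = [[i.text for i in tr.find_all('td')] for tr in table.find_all('tr')]
df = pd.DataFrame(rows)  # columns unnamed; rename them if you know the schema
print(df.head())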
import requests
from bs4 import BeautifulSoup as bs4

url = 'https://drafty.cs.brown.edu/csprofessors'
response = requests.get(url)
if response.ok:
    data = list()
    soup = bs4(response.text, 'html.parser')
    fullnames = soup.select('td:nth-child(1)')
    university = soup.select('td:nth-child(2)')
    join_year = soup.select('td:nth-child(3)')
    sub_field = soup.select('td:nth-child(4)')
    bachelors = soup.select('td:nth-child(5)')
    doctorate = soup.select('td:nth-child(6)')
    for item in range(len(fullnames)):
        # index each column and take the cell text, one dict per row
        data.append(
            {
                'fullnames': fullnames[item].text,
                'university': university[item].text,
                'join_year': join_year[item].text,
                'sub_field': sub_field[item].text,
                'bachelors': bachelors[item].text,
                'doctorate': doctorate[item].text
            }
        )
You can simply use selenium combined with pandas to scrape the table. Here is how you do it:
import pandas as pd
from selenium import webdriver
import time

url = 'https://drafty.cs.brown.edu/csprofessors'
driver = webdriver.Chrome()
driver.get(url)
time.sleep(2)
# dismiss the welcome dialog before reading the page source
driver.find_element_by_xpath('//*[@id="welcome-screen"]/div/div/div[1]/button').click()
time.sleep(1)
page = driver.page_source
df = pd.read_html(page)[0]
print(df)

Finding Table Class Python

I am having trouble finding a table while web scraping using Python/Beautiful Soup:
import requests
from bs4 import BeautifulSoup
url = 'https://www.espn.com/nba/player/gamelog/_/id/3907387/ben-simmons'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
t = soup.find_all('table' , class_='Table Table--align-right')
This returns an empty list.
You can use pandas' read_html to read all the tables on the page into a list of DataFrames and pick the one you need from that, as below:
import requests
import pandas as pd
url = 'https://www.espn.com/nba/player/gamelog/_/id/3907387/ben-simmons'
html = requests.get(url).content
df_list = pd.read_html(html)
df = df_list[3]
print(df)
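If you are not sure which index in df_list holds the game log, a quick way to inspect the candidates (using the same html as above) is:
# print the position and shape of every table pandas found
for i, frame in enumerate(df_list):
    print(i, frame.shape)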

Loop through different links on a website and scrape certain information

Good afternoon all, I'm hoping that somebody may help me with a problem relating to looping through multiple links on a website. Many thanks in anticipation of your help. I have the code below, which gets the info I need from the first link and creates the df I need to present it. But there are more than 600 more links on the website and I'm not sure how to go about them all.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "https://auctions.royaltyexchange.com/auctions_overview/"
html = urlopen("https://auctions.royaltyexchange.com/auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview")
soup = BeautifulSoup(html, 'lxml')
type(soup)
# Get the title
title = soup.title
title = soup.find('h1', class_='title -auction-page -dark').text.strip()
title
data = {'Name':['Title',title]}
df_title = pd.DataFrame(data)
irr = soup.find('span',attrs={'id':'current-irr'}).text.strip()
irr
data = {'value' : ['theoretical IRR',irr]}
df_irr = pd.DataFrame(data)
table = soup.find('table', class_='es-overview-table')
table_rows = table.find_all('tr')
res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [cell.text.strip() for cell in td if cell.text.strip()]
    if row:
        res.append(row)
df_table = pd.DataFrame(pd.DataFrame(res).transpose())
df_final = pd.concat([df_title,df_irr ,df_table], axis=1, ignore_index = True)
df_final.head()
You can use this to collect all the links across all of the pages first.
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup
raw_url = "https://auctions.royaltyexchange.com/"
def get_link(page_num):
    global raw_url
    link_ls = []
    for page in range(1, page_num+1):
        url = "https://auctions.royaltyexchange.com/auctions_overview/?origin=overview&page=" + str(page)
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        for link in bs.find('div', {'class': '-list'}).findAll('a', href=re.compile("^(/auctions/)")):
            print(link.attrs['href'])
            link_ls.append(raw_url + link.attrs['href'])
    return link_ls
link_list = get_link(55) # the last page number
link_list
['https://auctions.royaltyexchange.com//auctions/hip-hop-royalties-danileighs-lil-bebe/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/k-pop-publishing-featuring-exo-and-tvxq/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/film-royalties-classic-comedy-trading-places/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/ben-jerrys-cherry-garcia-trademark-royalties/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/the-doobie-brothers-black-water-more/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/dirty-dancings-ive-had-the-time-of-my-life/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/multi-platinum-hip-hop-collection/?origin=overview&filter_value=overview',
...
On each page, pick out the data you want to extract (e.g. title, name, etc.) and build the dataframe from it.
A slight refactor of @yganalyst's code and yours:
import pandas as pd
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
def get_link(page_num, raw_url):
    link_ls = []
    for page in range(1, page_num+1):
        url = raw_url + "auctions_overview/?origin=overview&page=" + str(page)
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        pobj = re.compile("^(/auctions/)")
        for link in bs.find('div', {'class': '-list'}).findAll('a', href=pobj):
            link_ls.append(raw_url + link.attrs['href'])
    return link_ls

def extract_auction(url2):
    data = {}
    html = urlopen(url2)
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find('h1', class_='title -auction-page -dark').text.strip()
    data['Title'] = title
    irr = soup.find('span', attrs={'id': 'current-irr'}).text.strip()
    data['theoretical IRR'] = irr
    table = soup.find('table', class_='es-overview-table')
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [cell.text.strip() for cell in td if cell.text.strip()]
        if row:
            key = row[0].replace(':', '')
            data[key] = row[1]
    return data
base_url = "https://auctions.royaltyexchange.com/"
page_num = 1
link_list = get_link(page_num, base_url)
data = []
for ll in link_list:
    print(ll)
    data.append(extract_auction(ll))
df_final = pd.DataFrame(data)
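With 600-plus links, you may also want to pause between requests so you don't hammer the server. Here is a minimal sketch wrapping the same loop (the one-second delay is an arbitrary choice, not something the site mandates):
import time

data = []
for ll in link_list:
    print(ll)
    data.append(extract_auction(ll))
    time.sleep(1)  # be polite: wait between page fetches
df_final = pd.DataFrame(data)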

Web scraping with Python - how to parse tables

How can I parse the table from https://ege.hse.ru/rating/2019/81031971/all/?rlist=&ptype=0&vuz-abiturients-budget-order=ge&vuz-abiturients-budget-val=10 with BeautifulSoup and turn it into a pandas DataFrame?
My code:
import requests
from bs4 import BeautifulSoup
url = 'https://ege.hse.ru/rating/2019/81031971/all/?rlist=&ptype=0&vuz-abiturients-budget-order=ge&vuz-abiturients-budget-val=10'
page = requests.get(url)
soup = BeautifulSoup(page.content,"html.parser")
table = soup.find_all("table")
for each_table in table:
    for row in each_table.find_all('tr'):
        for cell in row.find_all("td"):
            print(cell.text)
I also tried this:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://ege.hse.ru/rating/2019/81031971/all/?rlist=&ptype=0&vuz-abiturients-budget-order=ge&vuz-abiturients-budget-val=10"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
tbl = soup.find("table", {"id": "MainContent_dataGridView1"})
data_frame = pd.read_html(str(tbl))[0]
print(data_frame)
But it says:
"ValueError: No tables found"
On that page I see only a table with id="transparence_t". So:
tbl = soup.find("table", {"id": "transparence_t"})
data_frame = pd.read_html(str(tbl))[0]
print(data_frame)
It returns a 698x6 dataframe.
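As a side note, read_html can locate the table by its attributes itself, so the soup step is optional. A minimal sketch (attrs is a standard read_html parameter):
import requests
import pandas as pd

url = "https://ege.hse.ru/rating/2019/81031971/all/?rlist=&ptype=0&vuz-abiturients-budget-order=ge&vuz-abiturients-budget-val=10"
page = requests.get(url)
# pandas finds the table with the matching id on its own
data_frame = pd.read_html(page.text, attrs={"id": "transparence_t"})[0]
print(data_frame)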

How to scrape the yahoo earnings calendar with beautifulsoup

How can I scrape the yahoo earnings calendar to pull out the dates?
This is for python 3.
from bs4 import BeautifulSoup as soup
import urllib.request
url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html,'lxml')
table = page_soup.find('p')
print(table)
The output is None.
Beautiful Soup has several find functions that you can use to inspect the DOM; please refer to the documentation.
from bs4 import BeautifulSoup as soup
import urllib.request
url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html,'lxml')
table = page_soup.find_all('td')
Dates = []
for something in table:
    try:
        if something['aria-label'] == "Earnings Date":
            Dates.append(something.text)
    except KeyError:
        # this td has no aria-label attribute; skip it
        pass
print(Dates)
This might be off-topic, but since you want to get a table from a webpage, you might consider using pandas, which does it in two lines:
import pandas as pd
earnings = pd.read_html('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm')[0]
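If the parsed table keeps the "Earnings Date" header (the same label the aria-label answer above relies on; this is an assumption about the page, not verified), the dates are then just a column access away:
# assumes the column header is exactly "Earnings Date"
dates = earnings['Earnings Date'].tolist()
print(dates)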
Here are two succinct ways:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm&guccounter=1')
soup = bs(r.content, 'lxml')
# using attribute = value selector
dates = [td.text for td in soup.select('[aria-label="Earnings Date"]')]
#using nth-of-type to get column
dates = [td.text for td in soup.select('#cal-res-table td:nth-of-type(3)')]
