Having trouble scraping table data using Beautiful Soup - Python

I would like to scrape the table data from this site. I've tried the code below but for whatever reason, BS4 seems unable to fetch the table data:
import bs4 as bs
import urllib.request

sauce = urllib.request.urlopen('https://drafty.cs.brown.edu/csprofessors').read()
soup = bs.BeautifulSoup(sauce, 'lxml')
table = soup.find('table', attrs={"id": "table"})
table_rows = table.find_all('tr')
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
I would really appreciate your help :)

You used the wrong tag and id to locate the table: on that page the rows live inside a template element with id="table-data", not in a table with id="table". The following should work:
import bs4 as bs
import urllib.request

sauce = urllib.request.urlopen('https://drafty.cs.brown.edu/csprofessors').read()
soup = bs.BeautifulSoup(sauce, 'lxml')
table = soup.find('template', attrs={"id": "table-data"})
for tr in table.find_all('tr'):
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
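If you'd rather end up with a dataframe than printed lists, here is a minimal follow-up sketch (reusing the table found above, and assuming pandas is available):

import pandas as pd

# one list of cell texts per tr, skipping rows that have no td cells
rows = [[td.text for td in tr.find_all('td')] for tr in table.find_all('tr')]
rows = [r for r in rows if r]
df = pd.DataFrame(rows)
print(df.head())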

import requests
from bs4 import BeautifulSoup as bs4

url = 'https://drafty.cs.brown.edu/csprofessors'
response = requests.get(url)
if response.ok:
    data = list()
    soup = bs4(response.text, 'html.parser')
    # each selector grabs one column of the table
    fullnames = soup.select('td:nth-child(1)')
    university = soup.select('td:nth-child(2)')
    join_year = soup.select('td:nth-child(3)')
    sub_field = soup.select('td:nth-child(4)')
    bachelors = soup.select('td:nth-child(5)')
    doctorate = soup.select('td:nth-child(6)')
    # build one dict per row, indexing into each column list
    for i in range(len(fullnames)):
        data.append(
            {
                'fullnames': fullnames[i].text,
                'university': university[i].text,
                'join_year': join_year[i].text,
                'sub_field': sub_field[i].text,
                'bachelors': bachelors[i].text,
                'doctorate': doctorate[i].text
            }
        )
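If it helps, the list of dicts built above drops straight into pandas (assuming you want a dataframe at the end):

import pandas as pd

df = pd.DataFrame(data)  # one column per dict key
print(df.head())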

You can simply use selenium combined with pandas to scrape the table. Here is how you do it:
import pandas as pd
from selenium import webdriver
import time
url = 'https://drafty.cs.brown.edu/csprofessors'
driver = webdriver.Chrome()
driver.get(url)
time.sleep(2)
driver.find_element_by_xpath('//*[@id="welcome-screen"]/div/div/div[1]/button').click()
time.sleep(1)
page = driver.page_source
df = pd.read_html(page)[0]
print(df)
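If the fixed two-second sleep turns out to be flaky, an explicit wait is more reliable for dismissing the welcome screen. Here is a sketch using selenium's own wait helpers (the XPath is carried over from the answer above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the welcome-screen button to become clickable,
# then click it, instead of sleeping for a fixed amount of time
wait = WebDriverWait(driver, 10)
button = wait.until(EC.element_to_be_clickable(
    (By.XPATH, '//*[@id="welcome-screen"]/div/div/div[1]/button')))
button.click()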


How to get the link behind the label using Beautifulsoup?

I've tried it in so many ways, but I only get to the hyperlink/label.
from bs4 import BeautifulSoup
import urllib3

http = urllib3.PoolManager()
url = 'http://www.warrencountyschools.org/district_staff.aspx?action=search&location=12&department=0'
response = http.request('GET', url)
soup = BeautifulSoup(response.data, 'html.parser')
table = soup.find('table', {'class': 'content staff-table'})
rows = table.findAll('tr')
for tr in rows:
    cols = tr.findAll('td')
    if len(cols) >= 3:
        link = cols[2].find('a').get('href')
        # link = cols[2].find('a', href=True)
        # link = cols[2].find('a').attrs['href']
        print(link)
Here's the output:
/staff/2068
/staff/1183
/staff/24563
/staff/1261
/staff/25535
Behind each line printed above, there's a link that I would like to get instead of the label.
Thanks for any help.
They are relative URLs and you have to convert them into absolute URLs.
Script:
from bs4 import BeautifulSoup
import urllib3

http = urllib3.PoolManager()
base_url = 'http://www.warrencountyschools.org'
url = 'http://www.warrencountyschools.org/district_staff.aspx?action=search&location=12&department=0'
response = http.request('GET', url)
soup = BeautifulSoup(response.data, 'html.parser')
table = soup.find('table', {'class': 'content staff-table'})
rows = table.findAll('tr')
for tr in rows:
    cols = tr.findAll('td')
    if len(cols) >= 3:
        link = cols[2].find('a').get('href')
        # prepend the site root to turn the relative href into an absolute URL
        abs_link = base_url + link
        print(abs_link)
Output:
http://www.warrencountyschools.org/staff/2068
http://www.warrencountyschools.org/staff/1183
http://www.warrencountyschools.org/staff/24563
http://www.warrencountyschools.org/staff/1261
http://www.warrencountyschools.org/staff/25535
http://www.warrencountyschools.org/staff/14375
http://www.warrencountyschools.org/staff/1184
http://www.warrencountyschools.org/staff/24724
http://www.warrencountyschools.org/staff/25949
http://www.warrencountyschools.org/staff/25652
http://www.warrencountyschools.org/staff/1186
http://www.warrencountyschools.org/staff/1188
http://www.warrencountyschools.org/staff/14132
http://www.warrencountyschools.org/staff/1189
http://www.warrencountyschools.org/staff/1768
http://www.warrencountyschools.org/staff/1191
http://www.warrencountyschools.org/staff/1194
http://www.warrencountyschools.org/staff/14130
http://www.warrencountyschools.org/staff/14072
http://www.warrencountyschools.org/staff/25123
http://www.warrencountyschools.org/staff/13776
http://www.warrencountyschools.org/staff/25994
http://www.warrencountyschools.org/staff/1199
http://www.warrencountyschools.org/staff/3404
http://www.warrencountyschools.org/staff/14022
http://www.warrencountyschools.org/staff/24620
http://www.warrencountyschools.org/staff/24546
http://www.warrencountyschools.org/staff/1203
http://www.warrencountyschools.org/staff/3321
http://www.warrencountyschools.org/staff/25479
http://www.warrencountyschools.org/staff/14418
http://www.warrencountyschools.org/staff/1204
http://www.warrencountyschools.org/staff/25332
http://www.warrencountyschools.org/staff/1206
http://www.warrencountyschools.org/staff/25372
http://www.warrencountyschools.org/staff/2988
http://www.warrencountyschools.org/staff/1205
http://www.warrencountyschools.org/staff/13598
http://www.warrencountyschools.org/staff/24947
http://www.warrencountyschools.org/staff/25159
http://www.warrencountyschools.org/staff/25887
http://www.warrencountyschools.org/staff/24931
http://www.warrencountyschools.org/staff/25093
http://www.warrencountyschools.org/staff/25956
http://www.warrencountyschools.org/staff/1212
http://www.warrencountyschools.org/staff/14245
http://www.warrencountyschools.org/staff/24587
http://www.warrencountyschools.org/staff/14389
http://www.warrencountyschools.org/staff/26061
http://www.warrencountyschools.org/staff/25451
http://www.warrencountyschools.org/staff/1438
http://www.warrencountyschools.org/staff/1216
http://www.warrencountyschools.org/staff/25154
http://www.warrencountyschools.org/staff/14227
http://www.warrencountyschools.org/staff/1221
http://www.warrencountyschools.org/staff/24687
http://www.warrencountyschools.org/staff/24472
http://www.warrencountyschools.org/staff/1222
http://www.warrencountyschools.org/staff/25874
http://www.warrencountyschools.org/staff/1223
http://www.warrencountyschools.org/staff/1226
http://www.warrencountyschools.org/staff/25529
http://www.warrencountyschools.org/staff/1227
http://www.warrencountyschools.org/staff/1229
http://www.warrencountyschools.org/staff/14277
http://www.warrencountyschools.org/staff/25460
http://www.warrencountyschools.org/staff/24780
http://www.warrencountyschools.org/staff/1231
http://www.warrencountyschools.org/staff/24932
http://www.warrencountyschools.org/staff/1233
http://www.warrencountyschools.org/staff/14248
http://www.warrencountyschools.org/staff/13890
http://www.warrencountyschools.org/staff/14033
http://www.warrencountyschools.org/staff/3108
http://www.warrencountyschools.org/staff/25146
http://www.warrencountyschools.org/staff/24748
http://www.warrencountyschools.org/staff/1236
http://www.warrencountyschools.org/staff/25156
http://www.warrencountyschools.org/staff/1237
http://www.warrencountyschools.org/staff/1239
http://www.warrencountyschools.org/staff/3152
http://www.warrencountyschools.org/staff/1243
http://www.warrencountyschools.org/staff/24946
http://www.warrencountyschools.org/staff/26021
http://www.warrencountyschools.org/staff/14377
http://www.warrencountyschools.org/staff/1806
http://www.warrencountyschools.org/staff/1245
http://www.warrencountyschools.org/staff/1246
http://www.warrencountyschools.org/staff/3248
http://www.warrencountyschools.org/staff/24942
http://www.warrencountyschools.org/staff/14399
http://www.warrencountyschools.org/staff/25069
http://www.warrencountyschools.org/staff/13769
http://www.warrencountyschools.org/staff/13768
http://www.warrencountyschools.org/staff/25884
http://www.warrencountyschools.org/staff/1249
http://www.warrencountyschools.org/staff/1250
http://www.warrencountyschools.org/staff/14016
http://www.warrencountyschools.org/staff/1253
http://www.warrencountyschools.org/staff/13741
http://www.warrencountyschools.org/staff/24631
http://www.warrencountyschools.org/staff/25005
http://www.warrencountyschools.org/staff/25712
http://www.warrencountyschools.org/staff/13719
http://www.warrencountyschools.org/staff/25050
http://www.warrencountyschools.org/staff/14129
http://www.warrencountyschools.org/staff/2842
http://www.warrencountyschools.org/staff/3240
http://www.warrencountyschools.org/staff/1256
http://www.warrencountyschools.org/staff/25081
http://www.warrencountyschools.org/staff/24753
http://www.warrencountyschools.org/staff/24471
http://www.warrencountyschools.org/staff/14385
http://www.warrencountyschools.org/staff/14131
http://www.warrencountyschools.org/staff/25219
http://www.warrencountyschools.org/staff/2741
http://www.warrencountyschools.org/staff/1260
http://www.warrencountyschools.org/staff/1262
http://www.warrencountyschools.org/staff/13491
http://www.warrencountyschools.org/staff/1263
http://www.warrencountyschools.org/staff/13928
http://www.warrencountyschools.org/staff/1079
http://www.warrencountyschools.org/staff/24506
http://www.warrencountyschools.org/staff/1267
http://www.warrencountyschools.org/staff/24570
http://www.warrencountyschools.org/staff/13983
http://www.warrencountyschools.org/staff/25415
http://www.warrencountyschools.org/staff/25284
http://www.warrencountyschools.org/staff/13935
http://www.warrencountyschools.org/staff/24970
http://www.warrencountyschools.org/staff/1273
http://www.warrencountyschools.org/staff/24606
http://www.warrencountyschools.org/staff/25741
http://www.warrencountyschools.org/staff/14249
If I've understood what you're asking correctly, the links like /staff/2068 ARE the link addresses. They are relative to the address of the page. For example, go to:
https://www.warrencountyschools.org/staff/2068
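As a side note, urljoin from the standard library is a safer way to build the absolute URL than string concatenation, since it also handles trailing slashes and hrefs that are already absolute:

from urllib.parse import urljoin

base_url = 'http://www.warrencountyschools.org'
# urljoin copes with absolute hrefs and duplicate slashes, unlike plain '+'
abs_link = urljoin(base_url, '/staff/2068')
print(abs_link)  # http://www.warrencountyschools.org/staff/2068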

Get value between TD tags (Python)

I want to extract the values between td tags. I wrote the code below, but I think it can be improved and made cleaner; please advise.
from bs4 import BeautifulSoup
import requests

invite_date = str()
url = 'http://reestr.nostroy.ru/reestr/clients/233/members/5801625'
html = requests.get(url)
soup = BeautifulSoup(html.content, 'html.parser')
news = soup.find('table', class_='items table')
i = 0
for tr in news.find_all('tr'):
    if tr.find('td'):
        if i == 6:
            cols = tr.findAll('td')
            for t in cols:
                invite_date = t.text
        i += 1
print(invite_date)
To get only the invite date, you can try this:
from bs4 import BeautifulSoup
import requests

url = 'http://reestr.nostroy.ru/reestr/clients/233/members/5801625'
html = requests.get(url)
soup = BeautifulSoup(html.content, 'html.parser')
news = soup.find('table', class_='items table')
# row index 7 holds the invite date; .td grabs its first cell
invite_date = news.find_all('tr')[7].td.text
print(invite_date)
Output will be:
21.05.2019
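If you ever need the whole table rather than a single cell, pandas.read_html can parse it in one call. A minimal sketch, assuming the class attribute matches "items table" exactly:

import pandas as pd
import requests

url = 'http://reestr.nostroy.ru/reestr/clients/233/members/5801625'
html = requests.get(url)
# read_html returns a list of dataframes, one per matching table
df = pd.read_html(html.text, attrs={'class': 'items table'})[0]
print(df)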

Loop through different links on a website and scrape certain information

Good afternoon all. I'm hoping that somebody may help me with a problem relating to looping through multiple links on a website. Many thanks in anticipation of your help. The code below gets the info I need from the first link and creates the df I need to present it. But there are more than 600 more links on the website, and I'm not sure how to go about them all.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "https://auctions.royaltyexchange.com/auctions_overview/"
html = urlopen("https://auctions.royaltyexchange.com/auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview")
soup = BeautifulSoup(html, 'lxml')
type(soup)

# Get the title
title = soup.find('h1', class_='title -auction-page -dark').text.strip()
title

data = {'Name': ['Title', title]}
df_title = pd.DataFrame(data)

irr = soup.find('span', attrs={'id': 'current-irr'}).text.strip()
irr

data = {'value': ['theoretical IRR', irr]}
df_irr = pd.DataFrame(data)

table = soup.find('table', class_='es-overview-table')
table_rows = table.find_all('tr')
res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)

df_table = pd.DataFrame(pd.DataFrame(res).transpose())
df_final = pd.concat([df_title, df_irr, df_table], axis=1, ignore_index=True)
df_final.head()
You can use this to collect all the auction links from every page first:
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup

raw_url = "https://auctions.royaltyexchange.com/"

def get_link(page_num):
    global raw_url
    link_ls = []
    for page in range(1, page_num + 1):
        url = "https://auctions.royaltyexchange.com/auctions_overview/?origin=overview&page=" + str(page)
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        for link in bs.find('div', {'class': '-list'}).findAll('a', href=re.compile("^(/auctions/)")):
            print(link.attrs['href'])
            link_ls.append(raw_url + link.attrs['href'])
    return link_ls

link_list = get_link(55)  # the last page number
link_list
['https://auctions.royaltyexchange.com//auctions/hip-hop-royalties-danileighs-lil-bebe/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/k-pop-publishing-featuring-exo-and-tvxq/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/film-royalties-classic-comedy-trading-places/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/ben-jerrys-cherry-garcia-trademark-royalties/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/the-doobie-brothers-black-water-more/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/dirty-dancings-ive-had-the-time-of-my-life/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/multi-platinum-hip-hop-collection/?origin=overview&filter_value=overview',
...
Then, on each page, pull out the data you want to extract (e.g. title, IRR, etc.) and build the dataframe from it.
A slight refactor of @yganalyst's code and yours:
import pandas as pd
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

def get_link(page_num, raw_url):
    link_ls = []
    for page in range(1, page_num + 1):
        url = raw_url + "auctions_overview/?origin=overview&page=" + str(page)
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        pobj = re.compile("^(/auctions/)")
        for link in bs.find('div', {'class': '-list'}).findAll('a', href=pobj):
            link_ls.append(raw_url + link.attrs['href'])
    return link_ls

def extract_auction(url2):
    data = {}
    html = urlopen(url2)
    soup = BeautifulSoup(html, 'lxml')

    title = soup.find('h1', class_='title -auction-page -dark').text.strip()
    data['Title'] = title

    irr = soup.find('span', attrs={'id': 'current-irr'}).text.strip()
    data['theoretical IRR'] = irr

    table = soup.find('table', class_='es-overview-table')
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [tr.text.strip() for tr in td if tr.text.strip()]
        if row:
            key = row[0].replace(':', '')
            data[key] = row[1]
    return data

base_url = "https://auctions.royaltyexchange.com/"
page_num = 1
link_list = get_link(page_num, base_url)

data = []
for ll in link_list:
    print(ll)
    data.append(extract_auction(ll))

df_final = pd.DataFrame(data)
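One caution: with 600-odd links this fires requests in a tight loop. A small pause between fetches is kinder to the server (the 1-second figure is an arbitrary choice):

import time

data = []
for ll in link_list:
    data.append(extract_auction(ll))
    time.sleep(1)  # pause between requests so the site isn't hammered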

Web scraping with Python - how to parse tables

How to parse table from https://ege.hse.ru/rating/2019/81031971/all/?rlist=&ptype=0&vuz-abiturients-budget-order=ge&vuz-abiturients-budget-val=10 with BeautifulSoup and make pandas DataFrame?
My code:
import requests
from bs4 import BeautifulSoup

url = 'https://ege.hse.ru/rating/2019/81031971/all/?rlist=&ptype=0&vuz-abiturients-budget-order=ge&vuz-abiturients-budget-val=10'
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
table = soup.find_all("table")
for each_table in table:
    for row in each_table.find_all('tr'):
        for cell in row.find_all("td"):
            print(cell.text)
I also tried this:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://ege.hse.ru/rating/2019/81031971/all/?rlist=&ptype=0&vuz-abiturients-budget-order=ge&vuz-abiturients-budget-val=10"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
tbl = soup.find("table", {"id": "MainContent_dataGridView1"})
data_frame = pd.read_html(str(tbl))[0]
print(data_frame)
But it says:
"ValueError: No tables found"
I see only a table with id="transparence_t"
So:
tbl = soup.find("table", {"id": "transparence_t"})
data_frame = pd.read_html(str(tbl))[0]
print(data_frame)
It returns a 698x6 dataframe.
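As a shortcut, pandas.read_html can also filter by id directly, so you can skip BeautifulSoup entirely; a minimal sketch assuming the same id:

import pandas as pd
import requests

url = 'https://ege.hse.ru/rating/2019/81031971/all/?rlist=&ptype=0&vuz-abiturients-budget-order=ge&vuz-abiturients-budget-val=10'
page = requests.get(url)
# attrs narrows read_html down to the table with this id
df = pd.read_html(page.text, attrs={'id': 'transparence_t'})[0]
print(df.shape)  # expect (698, 6) as in the answer above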

Python BeautifulSoup parsing HTML table - td data missing

As per the subject, I'm trying to fetch the table from the page below using BeautifulSoup.
http://www.hkjc.com/english/racing/Horse.asp?HorseNo=T421
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import lxml
import xlrd

HorseNo = ["T421"]
driver = webdriver.PhantomJS(r'D:\Program Files\Python\Path\PhantomJS\bin\phantomjs.exe')
# driver = webdriver.Chrome(r'D:\Program Files\Python\Path\chromedriver.exe')
url = "http://www.hkjc.com/english/racing/horse.asp?HorseNo=" + str(HorseNo)
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, "lxml")
table = soup.find("table", {"class": "bigborder", "width": "970"}).findAll("tr")
print(table)
for row in table:
    cells = row.findAll("td")
    print(cells)
The print(table) result is fine, but print(cells) is not able to return every td in the table. Would somebody advise me further? Thanks.
Try this below using requests instead:
from bs4 import BeautifulSoup
import requests

HorseNo = ["T421"]
# index into the list, otherwise str(HorseNo) puts "['T421']" in the URL
url = "http://www.hkjc.com/english/racing/horse.asp?HorseNo=" + HorseNo[0]
html = requests.get(url).text
soup = BeautifulSoup(html, "lxml")
table = soup.find("table", {"class": "bigborder", "width": "970"}).findAll("tr")
cells = []
for row in table:
    cell = row.findAll("td")
    cells.append(cell)
print(cells)
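If you want the cell text rather than the raw tags, you can flatten each row; a small follow-up sketch based on the answer above:

rows = []
for row in table:
    # keep only the visible text of each cell, skipping empty strings
    texts = [td.get_text(strip=True) for td in row.findAll('td')]
    if texts:
        rows.append(texts)
print(rows)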
