How to get the link behind the label using Beautifulsoup? - python

I've tried it in so many ways, but I only get the hyperlink label.
from bs4 import BeautifulSoup
import urllib3
http = urllib3.PoolManager()
url = 'http://www.warrencountyschools.org/district_staff.aspx?action=search&location=12&department=0'
response = http.request('GET', url)
soup = BeautifulSoup(response.data)
# the second tr in the table - index starts at 0
table = soup.find('table', {'class': 'content staff-table'})
rows = table.findAll('tr')
for tr in rows:
    cols = tr.findAll('td')
    if len(cols) >= 3:
        link = cols[2].find('a').get('href')
        #link = cols[2].find('a', href=True)
        #link = cols[2].find('a').attrs['href']
        print(link)
Here's the output:
/staff/2068
/staff/1183
/staff/24563
/staff/1261
/staff/25535
Behind each line printed above, there's a link that I would like to get instead of the label.
Thanks for any help.

They are relative URLs and you have to convert them into absolute URLs.
Script:
from bs4 import BeautifulSoup
import urllib3
http = urllib3.PoolManager()
base_url='http://www.warrencountyschools.org'
url = 'http://www.warrencountyschools.org/district_staff.aspx?action=search&location=12&department=0'
response = http.request('GET', url)
soup = BeautifulSoup(response.data)
# the second tr in the table - index starts at 0
table = soup.find('table', {'class': 'content staff-table'})
rows = table.findAll('tr')
for tr in rows:
    cols = tr.findAll('td')
    if len(cols) >= 3:
        link = cols[2].find('a').get('href')
        abs_link = base_url + link
        #link = cols[2].find('a', href=True)
        #link = cols[2].find('a').attrs['href']
        print(abs_link)
Output:
http://www.warrencountyschools.org/staff/2068
http://www.warrencountyschools.org/staff/1183
http://www.warrencountyschools.org/staff/24563
http://www.warrencountyschools.org/staff/1261
http://www.warrencountyschools.org/staff/25535
http://www.warrencountyschools.org/staff/14375
http://www.warrencountyschools.org/staff/1184
http://www.warrencountyschools.org/staff/24724
http://www.warrencountyschools.org/staff/25949
http://www.warrencountyschools.org/staff/25652
http://www.warrencountyschools.org/staff/1186
http://www.warrencountyschools.org/staff/1188
http://www.warrencountyschools.org/staff/14132
http://www.warrencountyschools.org/staff/1189
http://www.warrencountyschools.org/staff/1768
http://www.warrencountyschools.org/staff/1191
http://www.warrencountyschools.org/staff/1194
http://www.warrencountyschools.org/staff/14130
http://www.warrencountyschools.org/staff/14072
http://www.warrencountyschools.org/staff/25123
http://www.warrencountyschools.org/staff/13776
http://www.warrencountyschools.org/staff/25994
http://www.warrencountyschools.org/staff/1199
http://www.warrencountyschools.org/staff/3404
http://www.warrencountyschools.org/staff/14022
http://www.warrencountyschools.org/staff/24620
http://www.warrencountyschools.org/staff/24546
http://www.warrencountyschools.org/staff/1203
http://www.warrencountyschools.org/staff/3321
http://www.warrencountyschools.org/staff/25479
http://www.warrencountyschools.org/staff/14418
http://www.warrencountyschools.org/staff/1204
http://www.warrencountyschools.org/staff/25332
http://www.warrencountyschools.org/staff/1206
http://www.warrencountyschools.org/staff/25372
http://www.warrencountyschools.org/staff/2988
http://www.warrencountyschools.org/staff/1205
http://www.warrencountyschools.org/staff/13598
http://www.warrencountyschools.org/staff/24947
http://www.warrencountyschools.org/staff/25159
http://www.warrencountyschools.org/staff/25887
http://www.warrencountyschools.org/staff/24931
http://www.warrencountyschools.org/staff/25093
http://www.warrencountyschools.org/staff/25956
http://www.warrencountyschools.org/staff/1212
http://www.warrencountyschools.org/staff/14245
http://www.warrencountyschools.org/staff/24587
http://www.warrencountyschools.org/staff/14389
http://www.warrencountyschools.org/staff/26061
http://www.warrencountyschools.org/staff/25451
http://www.warrencountyschools.org/staff/1438
http://www.warrencountyschools.org/staff/1216
http://www.warrencountyschools.org/staff/25154
http://www.warrencountyschools.org/staff/14227
http://www.warrencountyschools.org/staff/1221
http://www.warrencountyschools.org/staff/24687
http://www.warrencountyschools.org/staff/24472
http://www.warrencountyschools.org/staff/1222
http://www.warrencountyschools.org/staff/25874
http://www.warrencountyschools.org/staff/1223
http://www.warrencountyschools.org/staff/1226
http://www.warrencountyschools.org/staff/25529
http://www.warrencountyschools.org/staff/1227
http://www.warrencountyschools.org/staff/1229
http://www.warrencountyschools.org/staff/14277
http://www.warrencountyschools.org/staff/25460
http://www.warrencountyschools.org/staff/24780
http://www.warrencountyschools.org/staff/1231
http://www.warrencountyschools.org/staff/24932
http://www.warrencountyschools.org/staff/1233
http://www.warrencountyschools.org/staff/14248
http://www.warrencountyschools.org/staff/13890
http://www.warrencountyschools.org/staff/14033
http://www.warrencountyschools.org/staff/3108
http://www.warrencountyschools.org/staff/25146
http://www.warrencountyschools.org/staff/24748
http://www.warrencountyschools.org/staff/1236
http://www.warrencountyschools.org/staff/25156
http://www.warrencountyschools.org/staff/1237
http://www.warrencountyschools.org/staff/1239
http://www.warrencountyschools.org/staff/3152
http://www.warrencountyschools.org/staff/1243
http://www.warrencountyschools.org/staff/24946
http://www.warrencountyschools.org/staff/26021
http://www.warrencountyschools.org/staff/14377
http://www.warrencountyschools.org/staff/1806
http://www.warrencountyschools.org/staff/1245
http://www.warrencountyschools.org/staff/1246
http://www.warrencountyschools.org/staff/3248
http://www.warrencountyschools.org/staff/24942
http://www.warrencountyschools.org/staff/14399
http://www.warrencountyschools.org/staff/25069
http://www.warrencountyschools.org/staff/13769
http://www.warrencountyschools.org/staff/13768
http://www.warrencountyschools.org/staff/25884
http://www.warrencountyschools.org/staff/1249
http://www.warrencountyschools.org/staff/1250
http://www.warrencountyschools.org/staff/14016
http://www.warrencountyschools.org/staff/1253
http://www.warrencountyschools.org/staff/13741
http://www.warrencountyschools.org/staff/24631
http://www.warrencountyschools.org/staff/25005
http://www.warrencountyschools.org/staff/25712
http://www.warrencountyschools.org/staff/13719
http://www.warrencountyschools.org/staff/25050
http://www.warrencountyschools.org/staff/14129
http://www.warrencountyschools.org/staff/2842
http://www.warrencountyschools.org/staff/3240
http://www.warrencountyschools.org/staff/1256
http://www.warrencountyschools.org/staff/25081
http://www.warrencountyschools.org/staff/24753
http://www.warrencountyschools.org/staff/24471
http://www.warrencountyschools.org/staff/14385
http://www.warrencountyschools.org/staff/14131
http://www.warrencountyschools.org/staff/25219
http://www.warrencountyschools.org/staff/2741
http://www.warrencountyschools.org/staff/1260
http://www.warrencountyschools.org/staff/1262
http://www.warrencountyschools.org/staff/13491
http://www.warrencountyschools.org/staff/1263
http://www.warrencountyschools.org/staff/13928
http://www.warrencountyschools.org/staff/1079
http://www.warrencountyschools.org/staff/24506
http://www.warrencountyschools.org/staff/1267
http://www.warrencountyschools.org/staff/24570
http://www.warrencountyschools.org/staff/13983
http://www.warrencountyschools.org/staff/25415
http://www.warrencountyschools.org/staff/25284
http://www.warrencountyschools.org/staff/13935
http://www.warrencountyschools.org/staff/24970
http://www.warrencountyschools.org/staff/1273
http://www.warrencountyschools.org/staff/24606
http://www.warrencountyschools.org/staff/25741
http://www.warrencountyschools.org/staff/14249

If I've understood what you're asking correctly, the links like /staff/2068 ARE the link addresses. They are relative to the address of the page. For example, go to:
https://www.warrencountyschools.org/staff/2068
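If you prefer not to build the absolute URL by string concatenation, urllib.parse.urljoin from the standard library does the joining for you. A minimal sketch (variable names are just illustrative):
from urllib.parse import urljoin

page_url = 'http://www.warrencountyschools.org/district_staff.aspx?action=search&location=12&department=0'
relative_href = '/staff/2068'  # what .get('href') returns for the first row

absolute_url = urljoin(page_url, relative_href)
print(absolute_url)  # -> http://www.warrencountyschools.org/staff/2068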

Related

Having trouble in scraping table data using beautiful soup

I would like to scrape the table data from this site. I've tried the code below but for whatever reason, BS4 seems unable to fetch the table data:
import bs4 as bs
import urllib.request
sauce = urllib.request.urlopen('https://drafty.cs.brown.edu/csprofessors').read()
soup = bs.BeautifulSoup(sauce, 'lxml')
table = soup.find('table', attrs={"id": "table"})
table_rows = table.find_all('tr')
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
I would really appreciate your help :)
You used the wrong tag and id name to find the right table. The following should work:
import bs4 as bs
import urllib.request
sauce = urllib.request.urlopen('https://drafty.cs.brown.edu/csprofessors').read()
soup = bs.BeautifulSoup(sauce, 'lxml')
table = soup.find('template', attrs={"id":"table-data"})
for tr in table.find_all('tr'):
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
import requests
from bs4 import BeautifulSoup as bs4
url = ('https://drafty.cs.brown.edu/csprofessors')
response = requests.get(url)
if response.ok:
    data = list()
    soup = bs4(response.text, 'html.parser')
    fullnames = soup.select('td:nth-child(1)')
    university = soup.select('td:nth-child(2)')
    join_year = soup.select('td:nth-child(3)')
    sub_field = soup.select('td:nth-child(4)')
    bachelors = soup.select('td:nth-child(5)')
    doctorate = soup.select('td:nth-child(6)')
    # index into each column list so every dict holds one row's text
    # (this assumes all six column lists have the same length)
    for item in range(len(fullnames)):
        data.append(
            {
                'fullnames': fullnames[item].text,
                'university': university[item].text,
                'join_year': join_year[item].text,
                'sub_field': sub_field[item].text,
                'bachelors': bachelors[item].text,
                'doctorate': doctorate[item].text
            }
        )
You can simply use selenium combined with pandas to scrape the table. Here is how you do it:
import pandas as pd
from selenium import webdriver
import time
url = 'https://drafty.cs.brown.edu/csprofessors'
driver = webdriver.Chrome()
driver.get(url)
time.sleep(2)
driver.find_element_by_xpath('//*[@id="welcome-screen"]/div/div/div[1]/button').click()
time.sleep(1)
page = driver.page_source
df = pd.read_html(page)[0]
print(df)
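Note that the find_element_by_xpath helpers were removed in newer Selenium releases (4.x); on those versions the equivalent call against the same XPath looks like this:
from selenium.webdriver.common.by import By

# same XPath as above, written for Selenium 4+
driver.find_element(By.XPATH, '//*[@id="welcome-screen"]/div/div/div[1]/button').click()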

Get value between tags TD (python)

I want to get the values between td tags. I wrote the code below, but I think it can be improved and made more elegant; please advise.
from bs4 import BeautifulSoup
import requests
invite_date = str()
url = 'http://reestr.nostroy.ru/reestr/clients/233/members/5801625'
html = requests.get(url)
soup = BeautifulSoup(html.content, 'html.parser')
news = soup.find('table', class_='items table')
i = 0
for tr in news.find_all('tr'):
    if tr.find('td'):
        if i == 6:
            cols = tr.findAll('td')
            for t in cols:
                invite_date = t.text
        i += 1
print(invite_date)
To get only invite_date, you can try this:
from bs4 import BeautifulSoup
import requests
invite_date = str()
url = 'http://reestr.nostroy.ru/reestr/clients/233/members/5801625'
html = requests.get(url)
soup = BeautifulSoup(html.content, 'html.parser')
news = soup.find('table', class_='items table')
invite_date = news.find_all('tr')[7].td.text
print(invite_date)
Output will be:
21.05.2019
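If you would rather not hard-code the row index [7], here is a sketch that looks for a date-shaped cell instead, assuming the value keeps the DD.MM.YYYY format shown above (if the table contains several dates, this returns the first one, which may not be the invite date):
import re
import requests
from bs4 import BeautifulSoup

url = 'http://reestr.nostroy.ru/reestr/clients/233/members/5801625'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
table = soup.find('table', class_='items table')

date_pattern = re.compile(r'\d{2}\.\d{2}\.\d{4}')  # e.g. 21.05.2019
invite_date = next(
    (td.get_text(strip=True)
     for td in table.find_all('td')
     if date_pattern.fullmatch(td.get_text(strip=True))),
    None,  # fall back to None if no date-shaped cell is found
)
print(invite_date)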

Loop through different links on a website and scrape certain information

Good afternoon all. I'm hoping that somebody may help me with a problem relating to looping through multiple links on a website. Many thanks in anticipation of your help. I have the code below, which gets the info I need from the first link and creates the df I need to present it. But there are more than 600 more links on the website and I'm not sure how to go about them.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "https://auctions.royaltyexchange.com/auctions_overview/"
html = urlopen("https://auctions.royaltyexchange.com/auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview")
soup = BeautifulSoup(html, 'lxml')
type(soup)
# Get the title
title = soup.title
title = soup.find('h1', class_='title -auction-page -dark').text.strip()
title
data = {'Name':['Title',title]}
df_title = pd.DataFrame(data)
irr = soup.find('span',attrs={'id':'current-irr'}).text.strip()
irr
data = {'value' : ['theoretical IRR',irr]}
df_irr = pd.DataFrame(data)
table = soup.find('table', class_='es-overview-table')
table_rows = table.find_all('tr')
res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)
df_table = pd.DataFrame(pd.DataFrame(res).transpose())
df_final = pd.concat([df_title,df_irr ,df_table], axis=1, ignore_index = True)
df_final.head()
You can use this to collect all the links from all the pages first.
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup
raw_url = "https://auctions.royaltyexchange.com/"
def get_link(page_num):
    global raw_url
    link_ls = []
    for page in range(1, page_num+1):
        url = "https://auctions.royaltyexchange.com/auctions_overview/?origin=overview&page=" + str(page)
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        for link in bs.find('div', {'class': '-list'}).findAll('a', href=re.compile("^(/auctions/)")):
            print(link.attrs['href'])
            link_ls.append(raw_url + link.attrs['href'])
    return link_ls

link_list = get_link(55)  # the last page number
link_list
['https://auctions.royaltyexchange.com//auctions/hip-hop-royalties-danileighs-lil-bebe/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/k-pop-publishing-featuring-exo-and-tvxq/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/film-royalties-classic-comedy-trading-places/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/ben-jerrys-cherry-garcia-trademark-royalties/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/the-doobie-brothers-black-water-more/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/dirty-dancings-ive-had-the-time-of-my-life/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/multi-platinum-hip-hop-collection/?origin=overview&filter_value=overview',
...
On each page, specify the data you want to extract (e.g. title, name, etc.) and build the dataframe from it.
A slight refactor of @yganalyst's code and yours:
import pandas as pd
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
def get_link(page_num, raw_url):
    link_ls = []
    for page in range(1, page_num+1):
        url = raw_url + "auctions_overview/?origin=overview&page=" + str(page)
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        pobj = re.compile("^(/auctions/)")
        for link in bs.find('div', {'class': '-list'}).findAll('a', href=pobj):
            link_ls.append(raw_url + link.attrs['href'])
    return link_ls

def extract_auction(url2):
    data = {}
    html = urlopen(url2)
    soup = BeautifulSoup(html, 'lxml')

    title = soup.find('h1', class_='title -auction-page -dark').text.strip()
    data['Title'] = title

    irr = soup.find('span', attrs={'id': 'current-irr'}).text.strip()
    data['theoretical IRR'] = irr

    table = soup.find('table', class_='es-overview-table')
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [tr.text.strip() for tr in td if tr.text.strip()]
        if row:
            key = row[0].replace(':', '')
            data[key] = row[1]
    return data

base_url = "https://auctions.royaltyexchange.com/"
page_num = 1
link_list = get_link(page_num, base_url)

data = []
for ll in link_list:
    print(ll)
    data.append(extract_auction(ll))

df_final = pd.DataFrame(data)

Extract both text and urls from table in page - beautiful soup

I am trying to extract both text and urls in a table from a website, but I only seem to be able to get the text. I am guessing this has something to do with the text.strip in my code, but I am not sure how I can clean up the html tags without removing the url links in there. Here's what I've put together so far:
import requests
from bs4 import BeautifulSoup
start_number = 0
max_number = 5
urls=[]
for number in range(start_number, max_number + start_number):
    url = 'http://www.ispo-org.or.id/index.php?option=com_content&view=article&id=79:pengumumanpublik&catid=10&Itemid=233&showall=&limitstart=' + str(number) + '&lang=en'
    urls.append(url)

data = []
for url in urls:
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    table = soup.find("table")
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # Get rid of empty values
Simply extract the href from the <a> element. For the purpose of this answer, I simplified the code so it doesn't deal with subsequent pages.
from collections import namedtuple
import requests
from bs4 import BeautifulSoup
url = 'http://www.ispo-org.or.id/index.php?option=com_content&view=article&id=79:pengumumanpublik&catid=10&Itemid=233&showall=&limitstart=0&lang=en'
data = []
Record = namedtuple('Record', 'id company agency date pdf_link')
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
rows = soup.select('table > tbody > tr')
for row in rows[1:]:  # omit header row
    cols = row.find_all('td')
    fields = [td.text.strip() for td in cols if td.text.strip()]
    if fields:  # if the row is not empty
        pdf_link = row.find('a')['href']
        record = Record(*fields, pdf_link)
        data.append(record)
>>> data[0].pdf_link
'images/notifikasi/619.%20Pengumuman%20Publik%20PT%20IGP.compressed.pdf'
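The extracted pdf_link is relative, so if you need a full URL you can join it with the page URL. A minimal sketch continuing from the code above (whether the file is actually served from that path is an assumption about the site):
from urllib.parse import urljoin

full_pdf_url = urljoin(url, data[0].pdf_link)
print(full_pdf_url)
# e.g. http://www.ispo-org.or.id/images/notifikasi/619.%20Pengumuman%20Publik%20PT%20IGP.compressed.pdf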

requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied

I am working on a web scraping project and have run into the following error.
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
Below is my code. I retrieve all of the links from the html table and they print out as expected. But when I try to loop through them (links) with requests.get I get the error above.
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame
page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
for ref in table.find_all('a', href=True):
    links = (ref['href'])
    print (links)

for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    table = []
    # Find all the divs we need in one go.
    divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        # find all the enclosing a tags.
        anchors = div.find_all('a')
        for anchor in anchors:
            # Now we have groups of 3 list items (li) tags
            lis = anchor.find_all('li')
            # we clean up the text from the group of 3 li tags and add them as a list to our table list.
            table.append([unicodedata.normalize("NFKD", lis[0].text).strip(), lis[1].text, lis[2].text.strip()])
# We have all the data so we add it to a DataFrame.
headers = ['Number', 'Tenant', 'Square Footage']
df = DataFrame(table, columns=headers)
print (df)
Your mistake is the second for loop in the code:
for ref in table.find_all('a', href=True):
    links = (ref['href'])
    print (links)

for link in links:
ref['href'] gives you a single url, but you use it as a list in the next for loop.
So you effectively have
for link in ref['href']:
and it gives you the first char from the url http://properties.kimcore... which is h.
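For example, iterating over a string yields its characters one by one (the url here is just an illustrative value):
url = "http://example.com"
for link in url:
    print(link)  # prints 'h', then 't', 't', 'p', ...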
Full working code
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame
page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
for ref in table.find_all('a', href=True):
    link = ref['href']
    print(link)
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    table = []
    # Find all the divs we need in one go.
    divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        # find all the enclosing a tags.
        anchors = div.find_all('a')
        for anchor in anchors:
            # Now we have groups of 3 list items (li) tags
            lis = anchor.find_all('li')
            # we clean up the text from the group of 3 li tags and add them as a list to our table list.
            table.append([unicodedata.normalize("NFKD", lis[0].text).strip(), lis[1].text, lis[2].text.strip()])
# We have all the data so we add it to a DataFrame.
headers = ['Number', 'Tenant', 'Square Footage']
df = DataFrame(table, columns=headers)
print (df)
BTW: if you add a comma, as in (ref['href'], ), then you get a tuple and the second for loop works correctly.
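For example (ref_href is just an illustrative value):
ref_href = "http://example.com/page1"

links = (ref_href)   # still just a string - the parentheses do nothing
links = (ref_href,)  # the trailing comma makes this a one-element tuple
for link in links:
    print(link)      # prints the whole url instead of single characters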
EDIT: it creates the list table_data at the start and adds all the data to this list, converting it into a DataFrame at the end.
But now I see it reads the same page a few times - because in every row the same url is in every column. You would have to get the url from only one column.
EDIT: now it doesn't read the same url many times.
EDIT: now it gets the text and href from the first link and adds them to every element in the list when you use append().
from bs4 import BeautifulSoup
import requests
import unicodedata
from pandas import DataFrame
page = requests.get("http://properties.kimcorealty.com/property/output/find/search4/view:list/")
soup = BeautifulSoup(page.content, 'html.parser')
table_data = []
# all rows in table except first ([1:]) - headers
rows = soup.select('table tr')[1:]
for row in rows:
    # link in first column (td[0])
    #link = row.select('td')[0].find('a')
    link = row.find('a')

    link_href = link['href']
    link_text = link.text

    print('text:', link_text)
    print('href:', link_href)

    page = requests.get(link_href)
    soup = BeautifulSoup(page.content, 'html.parser')

    divs = soup.find_all('div', {'id':['units_box_1', 'units_box_2', 'units_box_3']})
    for div in divs:
        anchors = div.find_all('a')
        for anchor in anchors:
            lis = anchor.find_all('li')
            item1 = unicodedata.normalize("NFKD", lis[0].text).strip()
            item2 = lis[1].text
            item3 = lis[2].text.strip()
            table_data.append([item1, item2, item3, link_text, link_href])

    print('table_data size:', len(table_data))
headers = ['Number', 'Tenant', 'Square Footage', 'Link Text', 'Link Href']
df = DataFrame(table_data, columns=headers)
print(df)
