Extract both text and urls from table in page - beautiful soup - python

I am trying to extract both text and URLs from a table on a website, but I only seem to be able to get the text. I am guessing this has something to do with the
.text.strip() in my code, but I am not sure how I can clean up the HTML tags without removing the URL links in there. Here's what I've put together so far:
import requests
from bs4 import BeautifulSoup

start_number = 0
max_number = 5

urls = []
for number in range(start_number, max_number + start_number):
    url = 'http://www.ispo-org.or.id/index.php?option=com_content&view=article&id=79:pengumumanpublik&catid=10&Itemid=233&showall=&limitstart=' + str(number) + '&lang=en'
    urls.append(url)

data = []
for url in urls:
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    table = soup.find("table")
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # Get rid of empty values

Simply extract the href from the <a> element. For the purpose of the answer, I simplified the code not to worry about subsequent pages.
from collections import namedtuple

import requests
from bs4 import BeautifulSoup

url = 'http://www.ispo-org.or.id/index.php?option=com_content&view=article&id=79:pengumumanpublik&catid=10&Itemid=233&showall=&limitstart=0&lang=en'

data = []
Record = namedtuple('Record', 'id company agency date pdf_link')

r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
rows = soup.select('table > tbody > tr')

for row in rows[1:]:  # omit header row
    cols = row.find_all('td')
    fields = [td.text.strip() for td in cols if td.text.strip()]
    if fields:  # if the row is not empty
        pdf_link = row.find('a')['href']
        record = Record(*fields, pdf_link)
        data.append(record)
>>> data[0].pdf_link
'images/notifikasi/619.%20Pengumuman%20Publik%20PT%20IGP.compressed.pdf'
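To cover the paginated URLs from the question and to make the PDF links absolute, the answer above can be extended along these lines (a sketch only: it assumes every page uses the same table layout as the first one, and urljoin is the only real addition):

from collections import namedtuple
from urllib.parse import urljoin  # Python 3

import requests
from bs4 import BeautifulSoup

Record = namedtuple('Record', 'id company agency date pdf_link')
base = ('http://www.ispo-org.or.id/index.php?option=com_content&view=article'
        '&id=79:pengumumanpublik&catid=10&Itemid=233&showall=&limitstart={}&lang=en')

data = []
for offset in range(0, 5):  # same page range as in the question
    r = requests.get(base.format(offset))
    soup = BeautifulSoup(r.content, 'html.parser')
    for row in soup.select('table > tbody > tr')[1:]:  # skip the header row
        fields = [td.text.strip() for td in row.find_all('td') if td.text.strip()]
        if fields:
            # urljoin resolves the relative href against the page it came from
            pdf_link = urljoin(r.url, row.find('a')['href'])
            data.append(Record(*fields, pdf_link))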

How to get the link behind the label using Beautifulsoup?

I've tried it in so many ways, but I only get to the hyperlink/label.
from bs4 import BeautifulSoup
import urllib3

http = urllib3.PoolManager()

url = 'http://www.warrencountyschools.org/district_staff.aspx?action=search&location=12&department=0'
response = http.request('GET', url)
soup = BeautifulSoup(response.data)

# the second tr in the table - index starts at 0
table = soup.find('table', {'class': 'content staff-table'})
rows = table.findAll('tr')
for tr in rows:
    cols = tr.findAll('td')
    if len(cols) >= 3:
        link = cols[2].find('a').get('href')
        #link = cols[2].find('a', href=True)
        #link = cols[2].find('a').attrs['href']
        print(link)
Here's the output:
/staff/2068
/staff/1183
/staff/24563
/staff/1261
/staff/25535
Behind each line printed above, there's a link that I would like to get instead of the label.
Thanks for any help.
They are relative URLs and you have to convert them into absolute URLs.
Script:
from bs4 import BeautifulSoup
import urllib3

http = urllib3.PoolManager()

base_url = 'http://www.warrencountyschools.org'
url = 'http://www.warrencountyschools.org/district_staff.aspx?action=search&location=12&department=0'
response = http.request('GET', url)
soup = BeautifulSoup(response.data)

# the second tr in the table - index starts at 0
table = soup.find('table', {'class': 'content staff-table'})
rows = table.findAll('tr')
for tr in rows:
    cols = tr.findAll('td')
    if len(cols) >= 3:
        link = cols[2].find('a').get('href')
        abs_link = base_url + link
        #link = cols[2].find('a', href=True)
        #link = cols[2].find('a').attrs['href']
        print(abs_link)
Output:
http://www.warrencountyschools.org/staff/2068
http://www.warrencountyschools.org/staff/1183
http://www.warrencountyschools.org/staff/24563
http://www.warrencountyschools.org/staff/1261
http://www.warrencountyschools.org/staff/25535
http://www.warrencountyschools.org/staff/14375
http://www.warrencountyschools.org/staff/1184
http://www.warrencountyschools.org/staff/24724
http://www.warrencountyschools.org/staff/25949
http://www.warrencountyschools.org/staff/25652
http://www.warrencountyschools.org/staff/1186
http://www.warrencountyschools.org/staff/1188
http://www.warrencountyschools.org/staff/14132
http://www.warrencountyschools.org/staff/1189
http://www.warrencountyschools.org/staff/1768
http://www.warrencountyschools.org/staff/1191
http://www.warrencountyschools.org/staff/1194
http://www.warrencountyschools.org/staff/14130
http://www.warrencountyschools.org/staff/14072
http://www.warrencountyschools.org/staff/25123
http://www.warrencountyschools.org/staff/13776
http://www.warrencountyschools.org/staff/25994
http://www.warrencountyschools.org/staff/1199
http://www.warrencountyschools.org/staff/3404
http://www.warrencountyschools.org/staff/14022
http://www.warrencountyschools.org/staff/24620
http://www.warrencountyschools.org/staff/24546
http://www.warrencountyschools.org/staff/1203
http://www.warrencountyschools.org/staff/3321
http://www.warrencountyschools.org/staff/25479
http://www.warrencountyschools.org/staff/14418
http://www.warrencountyschools.org/staff/1204
http://www.warrencountyschools.org/staff/25332
http://www.warrencountyschools.org/staff/1206
http://www.warrencountyschools.org/staff/25372
http://www.warrencountyschools.org/staff/2988
http://www.warrencountyschools.org/staff/1205
http://www.warrencountyschools.org/staff/13598
http://www.warrencountyschools.org/staff/24947
http://www.warrencountyschools.org/staff/25159
http://www.warrencountyschools.org/staff/25887
http://www.warrencountyschools.org/staff/24931
http://www.warrencountyschools.org/staff/25093
http://www.warrencountyschools.org/staff/25956
http://www.warrencountyschools.org/staff/1212
http://www.warrencountyschools.org/staff/14245
http://www.warrencountyschools.org/staff/24587
http://www.warrencountyschools.org/staff/14389
http://www.warrencountyschools.org/staff/26061
http://www.warrencountyschools.org/staff/25451
http://www.warrencountyschools.org/staff/1438
http://www.warrencountyschools.org/staff/1216
http://www.warrencountyschools.org/staff/25154
http://www.warrencountyschools.org/staff/14227
http://www.warrencountyschools.org/staff/1221
http://www.warrencountyschools.org/staff/24687
http://www.warrencountyschools.org/staff/24472
http://www.warrencountyschools.org/staff/1222
http://www.warrencountyschools.org/staff/25874
http://www.warrencountyschools.org/staff/1223
http://www.warrencountyschools.org/staff/1226
http://www.warrencountyschools.org/staff/25529
http://www.warrencountyschools.org/staff/1227
http://www.warrencountyschools.org/staff/1229
http://www.warrencountyschools.org/staff/14277
http://www.warrencountyschools.org/staff/25460
http://www.warrencountyschools.org/staff/24780
http://www.warrencountyschools.org/staff/1231
http://www.warrencountyschools.org/staff/24932
http://www.warrencountyschools.org/staff/1233
http://www.warrencountyschools.org/staff/14248
http://www.warrencountyschools.org/staff/13890
http://www.warrencountyschools.org/staff/14033
http://www.warrencountyschools.org/staff/3108
http://www.warrencountyschools.org/staff/25146
http://www.warrencountyschools.org/staff/24748
http://www.warrencountyschools.org/staff/1236
http://www.warrencountyschools.org/staff/25156
http://www.warrencountyschools.org/staff/1237
http://www.warrencountyschools.org/staff/1239
http://www.warrencountyschools.org/staff/3152
http://www.warrencountyschools.org/staff/1243
http://www.warrencountyschools.org/staff/24946
http://www.warrencountyschools.org/staff/26021
http://www.warrencountyschools.org/staff/14377
http://www.warrencountyschools.org/staff/1806
http://www.warrencountyschools.org/staff/1245
http://www.warrencountyschools.org/staff/1246
http://www.warrencountyschools.org/staff/3248
http://www.warrencountyschools.org/staff/24942
http://www.warrencountyschools.org/staff/14399
http://www.warrencountyschools.org/staff/25069
http://www.warrencountyschools.org/staff/13769
http://www.warrencountyschools.org/staff/13768
http://www.warrencountyschools.org/staff/25884
http://www.warrencountyschools.org/staff/1249
http://www.warrencountyschools.org/staff/1250
http://www.warrencountyschools.org/staff/14016
http://www.warrencountyschools.org/staff/1253
http://www.warrencountyschools.org/staff/13741
http://www.warrencountyschools.org/staff/24631
http://www.warrencountyschools.org/staff/25005
http://www.warrencountyschools.org/staff/25712
http://www.warrencountyschools.org/staff/13719
http://www.warrencountyschools.org/staff/25050
http://www.warrencountyschools.org/staff/14129
http://www.warrencountyschools.org/staff/2842
http://www.warrencountyschools.org/staff/3240
http://www.warrencountyschools.org/staff/1256
http://www.warrencountyschools.org/staff/25081
http://www.warrencountyschools.org/staff/24753
http://www.warrencountyschools.org/staff/24471
http://www.warrencountyschools.org/staff/14385
http://www.warrencountyschools.org/staff/14131
http://www.warrencountyschools.org/staff/25219
http://www.warrencountyschools.org/staff/2741
http://www.warrencountyschools.org/staff/1260
http://www.warrencountyschools.org/staff/1262
http://www.warrencountyschools.org/staff/13491
http://www.warrencountyschools.org/staff/1263
http://www.warrencountyschools.org/staff/13928
http://www.warrencountyschools.org/staff/1079
http://www.warrencountyschools.org/staff/24506
http://www.warrencountyschools.org/staff/1267
http://www.warrencountyschools.org/staff/24570
http://www.warrencountyschools.org/staff/13983
http://www.warrencountyschools.org/staff/25415
http://www.warrencountyschools.org/staff/25284
http://www.warrencountyschools.org/staff/13935
http://www.warrencountyschools.org/staff/24970
http://www.warrencountyschools.org/staff/1273
http://www.warrencountyschools.org/staff/24606
http://www.warrencountyschools.org/staff/25741
http://www.warrencountyschools.org/staff/14249
If I've understood what you're asking correctly, the links like /staff/2068 ARE the link addresses. They are relative to the address of the page. For example go to:
https://www.warrencountyschools.org/staff/2068
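Both answers boil down to joining the relative href onto the site's base address. The standard library can do that join for you, and it also leaves already-absolute hrefs untouched (a small sketch using one of the hrefs printed above):

from urllib.parse import urljoin  # in Python 2 this lives in the urlparse module

page_url = 'http://www.warrencountyschools.org/district_staff.aspx?action=search&location=12&department=0'
href = '/staff/2068'  # example relative href taken from the output above

# urljoin resolves the href against the page it was found on
print(urljoin(page_url, href))  # http://www.warrencountyschools.org/staff/2068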

I have tried to scrape a table using BeautifulSoup and only one line of the table appears as output

I have tried to scrape the table at http://www.geonames.org/search.html?q=kwadukuza&country=ZA, but only the last line of the table appears.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'http://www.geonames.org/search.html?q=kwadukuza&country=ZA'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table_data = soup.find('table', class_="restable")

headers = []
for i in table_data.find_all('th'):
    title = i.text.strip()
    headers.append(title)

df = pd.DataFrame(columns=headers)
for j in table_data.find_all('tr', class_='odd'):
    row_data = j.find_all('td')
row = [tr.text.strip() for tr in row_data]
You can use a separate list to collect each row, building a list of lists, and then pass that as the data for your DataFrame:
all_rows = []
for j in table_data.find_all('tr', class_="odd"):
    row_data = j.find_all('td')
    row = [tr.text.strip() for tr in row_data]
    all_rows.append(row)
For the DataFrame:
df = pd.DataFrame(columns=headers, data=all_rows)
Output:
df.shape
(25, 6)
As the comment already says, you need to put row = [tr.text.strip() for tr in row_data] inside the for loop; otherwise you only keep the last entry.
To add the rows to the DataFrame, build a list of all rows and combine it with the headers into a DataFrame. You could also append the rows to the DataFrame one at a time, but that is less efficient.
Solution
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'http://www.geonames.org/search.html?q=kwadukuza&country=ZA'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table_data = soup.find('table', class_="restable")

headers = []
for i in table_data.find_all('th'):
    title = i.text.strip()
    headers.append(title)

data = []
for j in table_data.find_all('tr', class_='odd'):
    row_data = j.find_all('td')
    row = [tr.text.strip() for tr in row_data]  # Put into the for loop
    data.append(row)

# DataFrame
df = pd.DataFrame(columns=headers, data=data)
print(df)
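As an aside, when the markup is this regular, pandas.read_html can often read the table in a single call and skip the manual header/row loops entirely. A sketch, not verified against this particular page; it assumes lxml is available and that the attrs filter matches the results table:

import pandas as pd

url = 'http://www.geonames.org/search.html?q=kwadukuza&country=ZA'
# read_html returns a list of DataFrames, one per matching <table>
tables = pd.read_html(url, attrs={'class': 'restable'})
df = tables[0]
print(df.shape)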

Scraping a table with select-all checkboxes using Python

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')

table = soup.find('table', {'class': 'table'})
rows = table.find_all('th')

headers = []
for i in table.find_all('th'):
    title = i.text
    headers.append(title)

df = pd.DataFrame(columns=headers)
for row in table.find_all('tr')[1:]:
    data = row.find_all('td')
    row_data = [td.text.strip() for td in data]
    length = len(df)
    df.loc[length] = row_data

print(df)
I need to scrape a table from a website, but it has a select-all checkbox for each row. What should I do?
Any help will be appreciated, thank you.
(If I understand your question correctly: you want to remove the checkboxes from the output of the table.)
Since the checkboxes sit in the first column of each row, you can skip them with index slicing: [1:] keeps everything from index 1 onward and drops the zero-based first cell.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = (
    "https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha"
)
page = requests.get(url)
soup = BeautifulSoup(page.text, "lxml")

table = soup.find("table", {"class": "table"})

headers = []
for i in table.find_all("th"):
    title = i.text.strip()
    headers.append(title)

rows = []
for row in table.find_all("tr")[1:]:
    data = row.find_all("td")
    rows.append([td.text.strip() for td in data[1:]])  # skip the checkbox column

df = pd.DataFrame(rows[:-1], columns=headers[1:])
print(df)
Output:
Scrip P.Close Open High Low LTP # REAL LTP(NOW) Result
0 BRITANNIA 3379.10 3385.00 3447.00 3385.00 3439.50 3439.50 0
1 EICHERMOT 2551.20 2565.00 2634.00 2565.00 2625.05 2625.05 0
You don't need to check those boxes in order to return all rows.
You can grab the table with pandas and drop the first column by name (if desired).
You can also do some tidying to match the web page.
import pandas as pd

df = pd.read_html('https://www.pivottrading.co.in/beta/tools/open-high-low-scanner.php?broker=zerodha')[0]
df.drop(columns=['Sr.No.'], inplace=True)
df.iloc[-1, 0:4] = ''
df.fillna(0, inplace=True)
df

BeautifulSoup html table scrape - will only return last row

I am attempting a simple scrape of an HTML table using BeautifulSoup with the following:
import urllib
import urllib.request
from bs4 import BeautifulSoup

def make_soup(url):
    page = urllib.request.urlopen(url)
    sdata = BeautifulSoup(page, 'html.parser')
    return sdata

url = 'http://www.satp.org/satporgtp/countries/pakistan/database/bombblast.htm'
soup = make_soup(url)

table = soup.findAll('table', attrs={'class': 'pagraph1'})
table = table[0]
trows = table.findAll('tr')

bbdata_ = []
bbdata = []

for trow in trows:
    bbdata_ = trow.findAll('td')
    bbdata = [ele.text.strip() for ele in bbdata_]
print(bbdata)
However, I can only extract the last row in the table, i.e.
['Total*', '369', '1032+']
All of the data is included in the trows, so I must be forming my loop incorrectly, but I am not sure how.
Your problem is here:
bbdata = [ele.text.strip() for ele in bbdata_]
You want to append to the list or extend it:
bbdata.append([ele.text.strip() for ele in bbdata_])
You are overwriting bbdata each time through the loop, which is why it ends up holding only the final row.
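Putting that together, a corrected version of the scrape might look like this (a sketch based on the question's URL and table class; the if cells test skips rows that contain only <th> header cells):

import urllib.request
from bs4 import BeautifulSoup

url = 'http://www.satp.org/satporgtp/countries/pakistan/database/bombblast.htm'
soup = BeautifulSoup(urllib.request.urlopen(url), 'html.parser')
table = soup.findAll('table', attrs={'class': 'pagraph1'})[0]

bbdata = []
for trow in table.findAll('tr'):
    cells = trow.findAll('td')
    if cells:  # skip rows without <td> cells
        bbdata.append([ele.text.strip() for ele in cells])

print(len(bbdata))  # every row is kept now
print(bbdata[-1])   # ['Total*', '369', '1032+'] -- the row that used to be the only output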

Scraping multiple pages in one Beautiful Soup script -- getting same result

I'm trying to loop through a script that parses tables with Beautiful Soup in Python 2.7.
The first table parse works and produces the expected results. The second loop produces exactly the same results as the first loop.
Additional details:
If I manually use the url that the second loop used to parse, I get
the intended page that I want to scrape. There is a little delay in refresh.
I use this on other websites and the loop works as intended.
Here is the script:
import urllib2
import csv
from bs4 import BeautifulSoup  # latest version bs4

week = raw_input("Which week?")
week = str(week)
data = []

first = "http://fantasy.nfl.com/research/projections#researchProjections=researchProjections%2C%2Fresearch%2Fprojections%253Foffset%253D"
middle = "%2526position%253DO%2526sort%253DprojectedPts%2526statCategory%253DprojectedStats%2526statSeason%253D2015%2526statType%253DweekProjectedStats%2526statWeek%253D"
last = "%2Creplace"

page_num = 1
for page_num in range(1, 3):
    page_mult = (page_num - 1) * 25 + 1
    next = str(page_mult)
    url = first + next + middle + week + last
    print url  # I added this in order to check my output

    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, "lxml")
    table = soup.find('table', attrs={'class': 'tableType-player hasGroups'})
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # Get rid of empty values

    b = open('NFLtable.csv', 'w')
    a = csv.writer(b)
    a.writerows(data)
    b.close()

    page_num = page_num + 1

print data
On the actual page they are using AJAX to request additional results; the response is JSON with the table HTML as one of its values.
I modified your code a bit, give it a try:
import urllib2
import urllib
import csv
from bs4 import BeautifulSoup  # latest version bs4
import json

week = raw_input("Which week?")
week = str(week)
data = []

url_format = "http://fantasy.nfl.com/research/projections?offset={offset}&position=O&sort=projectedPts&statCategory=projectedStats&statSeason=2015&statType=weekProjectedStats&statWeek={week}"

for page_num in range(1, 3):
    page_mult = (page_num - 1) * 25 + 1
    next = str(page_mult)
    url = url_format.format(week=week, offset=page_mult)
    print url  # I added this in order to check my output

    request = urllib2.Request(url, headers={'Ajax-Request': 'researchProjections'})
    raw_json = urllib2.urlopen(request).read()
    parsed_json = json.loads(raw_json)
    html = parsed_json['content']

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table', attrs={'class': 'tableType-player hasGroups'})
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # Get rid of empty values

print data
I tested with week=4.
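If you still want the CSV file from the original script, note that it reopened NFLtable.csv in 'w' mode inside the page loop, truncating it on every pass; writing the accumulated rows once after the loop avoids that (a sketch in the same Python 2 style as the script above):

import csv

# 'data' holds the rows collected across both pages
with open('NFLtable.csv', 'wb') as f:  # 'wb' for the Python 2 csv module
    writer = csv.writer(f)
    writer.writerows(data)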
