I'm trying to extract the title of a link using BeautifulSoup. The code that I'm working with is as follows:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
# Scrape the EPA WA media-statement listing pages into a list of row dicts,
# then write them to a CSV file.
hdr={'User-Agent':'Chrome/84.0.4147.135'}
frame=[]
# range(19) yields 0..18; page_number+1 turns that into pages 1..19.
for page_number in range(19):
http= "https://www.epa.wa.gov.au/media-statements?page={}".format(page_number+1)
print('Downloading page %s...' % http)
# Despite its name, `url` holds the Response object returned by requests.
url= requests.get(http,headers=hdr)
soup = BeautifulSoup(url.content, 'html.parser')
for row in soup.select('.view-content .views-row'):
content = row.select_one('.views-field-body').get_text(strip=True)
# row.text is the text of the whole row, not only of the link;
# strip(':') merely trims leading/trailing colon characters.
title = row.text.strip(':')
# row.a is the first <a> tag found inside the row.
link = 'https://www.epa.wa.gov.au' + row.a['href']
date = row.select_one('.date-display-single').get_text(strip=True)
frame.append({
'title': title,
'link': link,
'date': date,
'content': content
})
dfs = pd.DataFrame(frame)
# utf-8-sig writes a BOM at the start of the file.
dfs.to_csv('epa_scrapper.csv',index=False,encoding='utf-8-sig')
However, nothing gets displayed after I run the above code. How can I extract the value stored inside the title attribute of the anchor tag stored in link?
Also, I would like to know how I can write "title", "link", "date" and "content" to a CSV file.
Thank you so much in advance.
To get the link text, you can use selector "h5 a". For example:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
# Scrape every media-statement listing page and save the title, link, date
# and content of each entry to epa_scrapper.csv.
hdr = {'User-Agent': 'Chrome/84.0.4147.135'}
frame = []

for page_number in range(1, 20):
    http = "https://www.epa.wa.gov.au/media-statements?page={}".format(page_number)
    print('Downloading page %s...' % http)
    # A timeout keeps a stalled connection from hanging the script, and
    # raise_for_status() stops on an HTTP error instead of parsing an error page.
    response = requests.get(http, headers=hdr, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    for row in soup.select('.view-content .views-row'):
        title_tag = row.select_one('h5 a')
        if title_tag is None:
            # No headline link in this row, so there is nothing to record.
            continue
        body_tag = row.select_one('.views-field-body')
        date_tag = row.select_one('.date-display-single')
        frame.append({
            'title': title_tag.get_text(strip=True),
            # urljoin handles relative and absolute hrefs alike.
            'link': urljoin(http, row.a['href']),
            'date': date_tag.get_text(strip=True) if date_tag else '',
            # separator='\n' keeps the paragraphs of the body apart.
            'content': body_tag.get_text(strip=True, separator='\n') if body_tag else '',
        })

# Fixing the column order keeps the CSV header stable even if no rows were found.
dfs = pd.DataFrame(frame, columns=['title', 'link', 'date', 'content'])
dfs.to_csv('epa_scrapper.csv', index=False, encoding='utf-8-sig')
Creates epa_scrapper.csv (screenshot from LibreOffice):
Related
I'm trying to use BeautifulSoup4 in Orange to scrape data from a list of URLs scraped from that same website.
I have managed to scrape the data from a single page when I set the URL manually.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import csv
import re
# Fetch one zone-points page and print every direct child of its standings
# table.
# The query string must read "&section=1901"; a literal "§" here is the
# result of "&sect" being decoded as an HTML entity.
url = "https://data.ushja.org/awards-standings/zone-points.aspx?year=2021&zone=1&section=1901"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
rank = soup.find("table", class_="table-standings-body")
for child in rank.children:
    print(url, child)
and I have been able to scrape the list of URLs I need
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import csv
import re
# Fetch the zone overview page and print the href of every link inside the
# "contentSection" div.
url = "https://data.ushja.org/awards-standings/zones.aspx?year=2021&zone=1"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
# `rank` is looked up here but not used in the rest of this snippet.
rank = soup.find("table", class_="table-standings-body")
link = soup.find('div',class_='contentSection')
# Holds the href of the first link only; the loop below rebinds url_list
# to each <a> tag in turn.
url_list = link.find('a').get('href')
for url_list in link.find_all('a'):
print (url_list.get('href'))
But so far I haven't been able to combine both to scrape the data from that URL list. Can I do that only by nesting for loops, and if so, how? Or how can I do it?
I am sorry if this is a stupid question, but I only started trying with Python and Web-Scraping yesterday and I have not been able to figure this by consulting similar-ish topics.
Try:
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Collect every section link from the zone overview page, scrape the
# standings rows behind each link, and save everything to data.csv.
url = "https://data.ushja.org/awards-standings/zones.aspx?year=2021&zone=1"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")

# get all links
# The parser decodes the "&sect" at the start of "&section=..." as the
# legacy HTML entity for "§"; undo that so the query string is intact.
url_list = [
    a["href"].replace("§", "&sect")
    for a in soup.find("div", class_="contentSection").find_all("a")
]

# get all data from URLs
all_data = []
for url in url_list:
    print(url)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "html.parser")

    # The first <h2> and the <p> following it are used as title and
    # sub-title; they are the same for every row of this page, so read
    # them once per page.
    h2 = soup.h2
    sub = h2.find_next("p")
    title = h2.get_text(strip=True)
    sub_title = sub.get_text(strip=True)

    # "tr:has(td)" skips rows that contain no <td> cells.
    for tr in soup.select("tr:has(td)"):
        cells = [td.get_text(strip=True) for td in tr.select("td")]
        all_data.append([title, sub_title, *cells])

# save data to CSV
df = pd.DataFrame(
    all_data,
    columns=[
        "title",
        "sub_title",
        "Rank",
        "Horse / Owner",
        "Points",
        "Total Comps",
    ],
)
print(df)
df.to_csv("data.csv", index=False)
This traverses all URLs and saves all data to data.csv (screenshot from LibreOffice):
I'm trying to scrape the "team per game stats" table from this website using this code:
from urllib.request import urlopen as uo
from bs4 import BeautifulSoup as BS
import pandas as pd
# Attempt to build a DataFrame of team stats from every <tr> on the page.
url = 'https://www.basketball-reference.com/leagues/NBA_2020.html'
html = uo(url)
soup = BS(html, 'html.parser')
# The result of this call is discarded.
soup.findAll('tr')
# The loop variable is named `th`, but it iterates over <tr> rows, so each
# "header" is the full text of one table row.
headers = [th.getText() for th in soup.findAll('tr')]
headers = headers[1:]
print(headers)
rows = soup.findAll('tr')[1:]
team_stats = [[td.getText() for td in rows[i].findAll('td')]
for i in range(len(rows))]
# NOTE(review): `headers` has one entry per <tr>, not one per column, so its
# length need not match the width of the rows in team_stats.
stats = pd.DataFrame(team_stats, columns=headers)
But it returns this error:
AssertionError: 71 columns passed, passed data had 212 columns
The problem is that the data is hidden in a commented section of the HTML. The table you want to extract is rendered with Javascript in your browser. Requesting the page with requests or urllib just yields the raw HTML.
So be aware that you have to examine the source code of the page with "View page source" rather than the rendered page with "Inspect Element" if you search for the proper tags to find with BeautifulSoup.
Try this:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# The "Team Per Game Stats" table sits inside an HTML comment, so it is cut
# out of the raw page text first and then parsed as its own document.
url = 'https://www.basketball-reference.com/leagues/NBA_2020.html'
html = requests.get(url)

section_start = '<span class="section_anchor" id="team-stats-per_game_link" data-label="Team Per Game Stats">'
# Take the text between the first "<!--" after the section anchor and the
# next "-->": that is the commented-out table markup.
block_start = html.text.split(section_start)[1].split("<!--")[1]
block = block_start.split("-->")[0]

# Name the parser explicitly; BeautifulSoup(markup) alone picks whichever
# parser happens to be installed and emits a warning.
soup = BeautifulSoup(block, 'html.parser')

# Read the table cell by cell instead of joining the text with "," and
# splitting it again, so a cell that contains a comma or several text nodes
# cannot shift the columns.
rows = [
    [cell.get_text(strip=True) for cell in tr.find_all(['th', 'td'])]
    for tr in soup.find_all('tr')
]
header, data = rows[0], rows[1:]

stats = pd.DataFrame(data, columns=header)
print(stats)
Explanation: You first need to find the commented section by simply splitting the raw HTML just before the section. You extract the section as text, convert to soup and then parse.
I have the following code:
import requests
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import re
# Fetch the Bloomberg quote page for one market and print the market name.
market = 'INDU:IND'
quote_page = 'http://www.bloomberg.com/quote/' + market
page = urllib.request.urlopen(quote_page)
soup = BeautifulSoup(page, 'html.parser')
# The name is read from the first <h1> whose class is "name".
name_box = soup.find('h1', attrs={'class': 'name'})
name = name_box.text.strip()
print('Market: ' + name)
This code works and lets me get the market name from the url. I'm trying to do something similar to this website. Here is my code:
# Same approach, pointed at the Yahoo Finance quote page for BTC-GBP.
market = 'BTC-GBP'
quote_page = 'https://uk.finance.yahoo.com/quote/' + market
page = urllib.request.urlopen(quote_page)
soup = BeautifulSoup(page, 'html.parser')
# Looks for a <span> with exactly this class string. find() returns None
# when nothing matches, and the .text access below then raises
# AttributeError.
name_box = soup.find('span', attrs={'class': 'Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(ib)'})
name = name_box.text.strip()
print('Market: ' + name)
I'm not sure what to do. I want to retrieve the current rate, the amount it's increased/decreased by as a number & a percentage. And finally as of when the information was updated. How do I do this, I don't mind if you do a different method to the one I used previously as long as you explain it. If my code is inefficient/unpythonic could you also tell me what to do to fix this. I'm pretty new to web scraping and these new modules. Thanks!
You can use BeautifulSoup and when searching for the desired data, use regex to match the dynamic span classnames generated by the site's backend script:
from bs4 import BeautifulSoup as soup
import requests
import re
# Pull the current price, the change and the "As of ..." timestamp for
# BTC-GBP out of the quote page.
data = requests.get('https://uk.finance.yahoo.com/quote/BTC-GBP').text
s = soup(data, 'lxml')

# Raw strings are required here: "\(", "\d" and "\$" are invalid escape
# sequences in an ordinary string literal. The two alternatives describe
# the class strings of the spans that hold the quoted values.
span_class = re.compile(
    r'Trsdu\(0\.\d+s\) Trsdu\(0\.\d+s\) Fw\(\w+\) Fz\(\d+px\) Mb\(-\d+px\) D\(\w+\)'
    r'|Trsdu\(0\.\d+s\) Fw\(\d+\) Fz\(\d+px\) C\(\$data\w+\)'
)
d = [i.text for i in s.find_all('span', {'class': span_class})]

# [AP]M covers both the AM and the PM form of the timestamp.
date_published = re.findall(r'As of\s+\d+:\d+[AP]M GMT\.', data)

# zip pairs the labels with the values in the order they were found.
final_results = dict(zip(['current', 'change', 'published'], d + date_published))
Output:
{'current': u'6,785.02', 'change': u'-202.99 (-2.90%)', 'published': u'As of 3:55PM GMT.'}
Edit: given the new URL, you need to change the span classname:
# Same extraction for AAPL; only the span class pattern and the timestamp
# wording differ from the BTC-GBP page.
data = requests.get('https://uk.finance.yahoo.com/quote/AAPL?p=AAPL').text

# Raw strings keep "\(", "\d" and "\$" from being treated as (invalid)
# string escape sequences.
span_class = re.compile(
    r'Trsdu\(0\.\d+s\) Trsdu\(0\.\d+s\) Fw\(b\) Fz\(\d+px\) Mb\(-\d+px\) D\(b\)'
    r'|Trsdu\(0\.\d+s\) Fw\(\d+\) Fz\(\d+px\) C\(\$data\w+\)'
)
values = [i.text for i in soup(data, 'lxml').find_all('span', {'class': span_class})]

# \d{1,2} also accepts two-digit hours such as "10:00PM".
published = re.findall(r'At close:\s+\d{1,2}:\d+PM EST', data)

final_results = dict(zip(['current', 'change', 'published'], values + published))
Output:
{'current': u'175.50', 'change': u'+3.00 (+1.74%)', 'published': u'At close: 4:00PM EST'}
You can also use the API provided by Yahoo Finance directly.
For reference, see this answer:
Yahoo finance webservice API
import requests
from bs4 import BeautifulSoup
# Download the events calendar page and print the title of the first event.
res = requests.get('http://aicd.companydirectors.com.au/events/events-calendar')
soup = BeautifulSoup(res.text,"lxml")
# find_all collects every <div> that carries this class string.
event_containers = soup.find_all('div', class_ = "col-xs-12 col-sm-6 col-md-8")
first_event = event_containers[0]
# .h3 is the first <h3> tag inside the container.
print(first_event.h3.text)
Using this code I'm able to extract the event name. I'm looking for a way to loop over all the events and extract every event name and date. I'm also trying to extract the location information, which is only visible after clicking the "read more" link.
event_containers is a bs4.element.ResultSet object, which is basically a list of Tag objects.
Just loop over the tags in event_containers and select h3 for the title, div.date for the date and a for the URL, example:
# Print the title, the date and the link of every event container.
for event in event_containers:
    heading = event.h3
    date_box = event.select_one('div.date')
    anchor = event.a
    print(heading.text)
    print(date_box.text)
    print(anchor['href'])
Now, for the location information you'll have to visit each URL and collect the text in div.event-add.
Full code:
import requests
from bs4 import BeautifulSoup
# Walk the events calendar, open the detail page of every event and print
# its title, date, link and location.
res = requests.get('http://aicd.companydirectors.com.au/events/events-calendar')
soup = BeautifulSoup(res.text, "lxml")
event_containers = soup.find_all('div', class_="col-xs-12 col-sm-6 col-md-8")
base_url = 'http://aicd.companydirectors.com.au'

for tag in event_containers:
    # The hrefs are appended to the site root to form the detail-page URL.
    link = ''.join([base_url, tag.a['href']])
    detail_page = requests.get(link)
    soup = BeautifulSoup(detail_page.text, "lxml")

    # Drop the first and the last text fragment of the address box and
    # join whatever lies between them.
    address_parts = list(soup.select_one('div.event-add').stripped_strings)
    location = ', '.join(address_parts[1:-1])

    print('Title:', tag.h3.text)
    print('Date:', tag.select_one('div.date').text)
    print('Link:', link)
    print('Location:', location)
Try this to get all the events and dates you are after:
import requests
from bs4 import BeautifulSoup
# Print every element with class "lead" together with the leading part of
# the text of the sibling that precedes it.
res = requests.get('http://aicd.companydirectors.com.au/events/events-calendar')
soup = BeautifulSoup(res.text, "lxml")

for item in soup.find_all(class_='lead'):
    preceding_text = item.find_previous_sibling().text
    # Keep only what comes before the first " |" separator.
    date = preceding_text.partition(" |")[0]
    print(item.text, date)
I'd like to write the url links in this url into a file but there are 2 'td a' tags for each line on the table. I just want the one where a class="pagelink" href="/search" etc.
I tried the following code, hoping to pick up only the ones where "class":"pagelink", but produced an error:
AttributeError: 'Doctype' object has no attribute 'find_all'
Can anyone help please?
import requests
from bs4 import BeautifulSoup as soup
import csv
# NOTE(review): `writer` and `session` are used below but are not defined
# anywhere in this snippet.
writer.writerow(['URL', 'Reference', 'Description', 'Address'])
# NOTE(review): the URL is not quoted here, so this line is not a valid
# string assignment as written.
url = https://www.saa.gov.uk/search/?SEARCHED=1&ST=&SEARCH_TERM=city+of+edinburgh%2C+EDINBURGH&ASSESSOR_ID=&SEARCH_TABLE=valuation_roll_cpsplit&PAGE=0&DISPLAY_COUNT=1000&TYPE_FLAG=CP&ORDER_BY=PROPERTY_ADDRESS&H_ORDER_BY=SET+DESC&ORIGINAL_SEARCH_TERM=city+of+edinburgh&DRILL_SEARCH_TERM=BOSWALL+PARKWAY%2C+EDINBURGH&DD_TOWN=EDINBURGH&DD_STREET=BOSWALL+PARKWAY#results
response = session.get(url) #not used until after the iteration begins
html = soup(response.text, 'lxml')
# Iterating the parsed document yields its top-level nodes, not <a> tags.
for link in html:
# find_all takes a tag name as its first argument; "td a" is CSS selector
# syntax, which find_all does not interpret.
prop_link = link.find_all("td a", {"class":"pagelink"})
writer.writerow([prop_link])
Iterating over your html variable yields the document's top-level nodes; the first of them is a Doctype object, which has no find_all method.
You'll need to call find_all or select on the html object itself to find the nodes that you want.
Example:
import requests
from bs4 import BeautifulSoup as soup
import csv
# Collect the property links from one SAA search-results page, print them
# and write them to a CSV file, one link per row.
outputfilename = 'Ed_Streets2.csv'
#inputfilename = 'Edinburgh.txt'
baseurl = 'https://www.saa.gov.uk'

session = requests.session()
url = "https://www.saa.gov.uk/search/?SEARCHED=1&ST=&SEARCH_TERM=city+of+edinburgh%2C+EDINBURGH&ASSESSOR_ID=&SEARCH_TABLE=valuation_roll_cpsplit&PAGE=0&DISPLAY_COUNT=100&TYPE_FLAG=CP&ORDER_BY=PROPERTY_ADDRESS&H_ORDER_BY=SET+DESC&ORIGINAL_SEARCH_TERM=city+of+edinburgh&DRILL_SEARCH_TERM=BOSWALL+PARKWAY%2C+EDINBURGH&DD_TOWN=EDINBURGH&DD_STREET=BOSWALL+PARKWAY#results"
response = session.get(url)
html = soup(response.text, 'lxml')

# Search the parsed document itself for the anchors with this class string.
prop_link = html.find_all("a", class_="pagelink button small")

# On Python 3 the csv module needs a text-mode file opened with newline='';
# the with-block closes the file so that all rows reach the disk.
with open(outputfilename, 'w', newline='') as outputfile:
    writer = csv.writer(outputfile)
    writer.writerow(['URL', 'Reference', 'Description', 'Address'])
    for link in prop_link:
        # The hrefs are appended to the site root to form the full URL.
        prop_url = baseurl + link["href"]
        print(prop_url)
        # Only the URL column is known at this point; the rest stays empty.
        writer.writerow([prop_url, "", "", ""])
Try this.
You need to look for the links before starting the loop.
import requests
from bs4 import BeautifulSoup as soup
import csv
# Fetch one SAA search-results page and write the href of every matching
# link to a CSV file.
outputfilename = 'Ed_Streets2.csv'  # output path; adjust as needed
url = "https://www.saa.gov.uk/search/?SEARCHED=1&ST=&SEARCH_TERM=city+of+edinburgh%2C+EDINBURGH&ASSESSOR_ID=&SEARCH_TABLE=valuation_roll_cpsplit&PAGE=0&DISPLAY_COUNT=1000&TYPE_FLAG=CP&ORDER_BY=PROPERTY_ADDRESS&H_ORDER_BY=SET+DESC&ORIGINAL_SEARCH_TERM=city+of+edinburgh&DRILL_SEARCH_TERM=BOSWALL+PARKWAY%2C+EDINBURGH&DD_TOWN=EDINBURGH&DD_STREET=BOSWALL+PARKWAY#results"
response = requests.get(url)
html = soup(response.text, 'lxml')

# Look the links up on the parsed document before looping; href=True
# leaves out anchors that have no href attribute.
prop_link = html.find_all("a", {"class": "pagelink button small"}, href=True)

# The writer has to exist before the first writerow() call; the with-block
# also closes the file once all rows are written.
with open(outputfilename, 'w', newline='') as outputfile:
    writer = csv.writer(outputfile)
    writer.writerow(['URL', 'Reference', 'Description', 'Address'])
    for link in prop_link:
        writer.writerow([link["href"]])