How do I get hrefs from hrefs using a Python class and method?
I have tried:
root_url = 'https://www.iea.org'

class IEAData:
    def __init__(self):
        try:
            ...
        except:
            ...

    def get_links(self, url):
        all_links = []
        page = requests.get(root_url)
        soup = BeautifulSoup(page.text, 'html.parser')
        for href in soup.find_all(class_='omrlist'):
            all_links.append(root_url + href.find('a').get('href'))
        return all_links
        #print(all_links)

iea_obj = IEAData()
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')
reportLinks = []
for url in yearLinks:
    links = iea_obj.get_links(yearLinks)
    print(links)
Expected: the links variable should contain all the month hrefs, but it doesn't. Please tell me how I should do this.
There were a couple of issues with your code. Your get_links() function was not using the url that was passed to it. When looping over the returned links, you were passing yearLinks rather than the url.
The following should get you going:
from bs4 import BeautifulSoup
import requests

root_url = 'https://www.iea.org'

class IEAData:
    def get_links(self, url):
        all_links = []
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        for li in soup.find_all(class_='omrlist'):
            all_links.append(root_url + li.find('a').get('href'))
        return all_links

iea_obj = IEAData()
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')

for url in yearLinks:
    links = iea_obj.get_links(url)
    print(url, links)
This would give you output starting:
https://www.iea.org/oilmarketreport/reports/2018/ ['https://www.iea.org/oilmarketreport/reports/2018/0118/', 'https://www.iea.org/oilmarketreport/reports/2018/0218/', 'https://www.iea.org/oilmarketreport/reports/2018/0318/', 'https://www.iea.org/oilmarketreport/reports/2018/0418/', 'https://www.iea.org/oilmarketreport/reports/2018/0518/', 'https://www.iea.org/oilmarketreport/reports/2018/0618/', 'https://www.iea.org/oilmarketreport/reports/2018/0718/', 'https://www.iea.org/oilmarketreport/reports/2018/0818/', 'https://www.iea.org/oilmarketreport/reports/2018/1018/']
https://www.iea.org/oilmarketreport/reports/2017/ ['https://www.iea.org/oilmarketreport/reports/2017/0117/', 'https://www.iea.org/oilmarketreport/reports/2017/0217/', 'https://www.iea.org/oilmarketreport/reports/2017/0317/', 'https://www.iea.org/oilmarketreport/reports/2017/0417/', 'https://www.iea.org/oilmarketreport/reports/2017/0517/', 'https://www.iea.org/oilmarketreport/reports/2017/0617/', 'https://www.iea.org/oilmarketreport/reports/2017/0717/', 'https://www.iea.org/oilmarketreport/reports/2017/0817/', 'https://www.iea.org/oilmarketreport/reports/2017/0917/', 'https://www.iea.org/oilmarketreport/reports/2017/1017/', 'https://www.iea.org/oilmarketreport/reports/2017/1117/', 'https://www.iea.org/oilmarketreport/reports/2017/1217/']
I'm fairly new to programming and still learning how classes and such work together, but I gave it a shot (that's how we learn, right?).
I'm not sure if this is the output you're looking for. I changed two things and was able to put all the links from within yearLinks into a list. Note that this will also include the PDF links as well as the month links I think you wanted; if you only want the month links and not the PDFs, just filter out anything containing .pdf.
Here's the code I did it with; maybe you can adapt it to fit how you have yours structured.
import bs4
import requests

root_url = 'https://www.iea.org'

class IEAData:
    def get_links(self, url):
        all_links = []
        page = requests.get(url)
        soup = bs4.BeautifulSoup(page.text, 'html.parser')
        for href in soup.find_all(class_='omrlist'):
            all_links.append(root_url + href.find('a').get('href'))
        return all_links
        #print(all_links)

iea_obj = IEAData()
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')

reportLinks = []
for url in yearLinks:
    links = iea_obj.get_links(url)
    # uncomment the line below if you do not want the .pdf links
    #links = [x for x in links if ".pdf" not in x]
    reportLinks += links
I am trying to create a function that scrapes college baseball team roster pages for a project. I have written a function that crawls the roster page and gets a list of the links I want to scrape, but when I try to scrape the individual links for each player, it runs without errors yet cannot find the data that is on their page.
This is the link to the page I am crawling from at the start:
https://gvsulakers.com/sports/baseball/roster
These are just functions that I call within the function that I am having a problem with:
def parse_row(rows):
    return [str(x.string) for x in rows.find_all('td')]

def scrape(url):
    page = requests.get(url, headers = headers)
    html = page.text
    soop = BeautifulSoup(html, 'lxml')
    return(soop)

def find_data(url):
    page = requests.get(url, headers = headers)
    html = page.text
    soop = BeautifulSoup(html, 'lxml')
    row = soop.find_all('tr')
    lopr = [parse_row(rows) for rows in row]
    return(lopr)
Here is what I am having an issue with: when I assign the result of type1_roster to a variable and print it, I only get an empty list. Ideally it should contain data about a player or players from a player's roster page.
# Roster page crawler
def type1_roster(team_id):
    url = "https://" + team_id + ".com/sports/baseball/roster"
    soop = scrape(url)
    href_tags = soop.find_all(href = True)
    hrefs = [tag.get('href') for tag in href_tags]
    # get all player links
    player_hrefs = []
    for href in hrefs:
        if 'sports/baseball/roster' in href:
            if 'sports/baseball/roster/coaches' not in href:
                if 'https:' not in href:
                    player_hrefs.append(href)
    # get rid of duplicates
    player_links = list(set(player_hrefs))
    # scrape the roster links
    for link in player_links:
        player_ = url + link[24:]
        return(find_data(player_))
A number of things:
I would pass the headers as a global.
I think you are slicing the link one character too late for player_.
You need to re-work the logic of find_data(), as the data is present in a mixture of element types (e.g. found in spans) and not in table/tr/td elements. The HTML attributes are nice and descriptive and will support targeting content easily.
You can target the player links from the landing page more tightly with the CSS selector list shown below. This removes the need for multiple loops as well as the use of list(set()).
import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def scrape(url):
    page = requests.get(url, headers=HEADERS)
    html = page.text
    soop = BeautifulSoup(html, 'lxml')
    return(soop)

def find_data(url):
    page = requests.get(url, headers=HEADERS)
    #print(page)
    html = page.text
    soop = BeautifulSoup(html, 'lxml')
    # re-think logic here to return desired data e.g.
    # soop.select_one('.sidearm-roster-player-jersey-number').text
    first_name = soop.select_one('.sidearm-roster-player-first-name').text
    # soop.select_one('.sidearm-roster-player-last-name').text
    # need targeted string cleaning possibly
    bio = soop.select_one('#sidearm-roster-player-bio').get_text('')
    return (first_name, bio)

def type1_roster(team_id):
    url = "https://" + team_id + ".com/sports/baseball/roster"
    soop = scrape(url)
    player_links = [i['href'] for i in soop.select(
        '.sidearm-roster-players-container .sidearm-roster-player h3 > a')]
    # scrape the roster links
    for link in player_links:
        player_ = url + link[23:]
        # print(player_)
        return(find_data(player_))

print(type1_roster('gvsulakers'))
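Note that type1_roster() above returns inside the loop, so it only ever gives you the first player's data. If you want every player on the roster, collect the results in a list instead. A minimal sketch of that change, reusing the scrape() and find_data() functions above (the collect_roster name is just illustrative):

def collect_roster(team_id):
    # same crawl as type1_roster(), but gathers data for every player
    url = "https://" + team_id + ".com/sports/baseball/roster"
    soop = scrape(url)
    player_links = [i['href'] for i in soop.select(
        '.sidearm-roster-players-container .sidearm-roster-player h3 > a')]
    players = []
    for link in player_links:
        player_ = url + link[23:]
        players.append(find_data(player_))  # (first_name, bio) tuples
    return players

# e.g. print(len(collect_roster('gvsulakers')))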
I am trying to scrape a page that includes 12 links. I need to open each of these links and scrape all of their titles. Each of the 12 links leads to multiple pages, but my code only scrapes the first page of each one.
With the code below, I can print the URLs of all 12 links that exist on the main page.
url = 'http://mlg.ucd.ie/modules/COMP41680/assignment2/index.html'
res = requests.get(url)
soup = BeautifulSoup(res.text, 'html.parser')
links = soup.find_all("a")
all_urls = []
for link in links[1:]:
    link_address = 'http://mlg.ucd.ie/modules/COMP41680/assignment2/' + link.get("href")
    all_urls.append(link_address)
Then, I looped over all of them.
for i in range(0, 12):
    url = all_urls[i]
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
The titles can be extracted with the lines below:
title_news = []
news_div = soup.find_all('div', class_='article')
for container in news_div:
    title = container.h5.a.text
    title_news.append(title)
The output of this code only includes the titles from the first page of each of these 12 links, while I need my code to go through all the pages in these 12 URLs.
The line below gives me the link to the next page within each of these 12 links if it is placed in an appropriate loop (it reads the pagination section and looks for the next-page URL):
page = soup.find('ul', {'class' : 'pagination'}).select('li', {'class': "page-link"})[2].find('a')['href']
How should I use the page variable inside my code so that it goes through the multiple pages in all of these 12 links and reads all the titles, not only the first-page titles?
You can use this code to get all titles from all the pages:
import requests
from bs4 import BeautifulSoup

base_url = "http://mlg.ucd.ie/modules/COMP41680/assignment2/"

soup = BeautifulSoup(
    requests.get(base_url + "index.html").content, "html.parser"
)

title_news = []
for a in soup.select("#all a"):
    next_link = a["href"]
    print("Getting", base_url + next_link)
    while True:
        soup = BeautifulSoup(
            requests.get(base_url + next_link).content, "html.parser"
        )
        for title in soup.select("h5 a"):
            title_news.append(title.text)

        next_link = soup.select_one('a[aria-label="Next"]')["href"]
        if next_link == "#":
            break

print("Length of title_news:", len(title_news))
Prints:
Getting http://mlg.ucd.ie/modules/COMP41680/assignment2/month-jan-001.html
Getting http://mlg.ucd.ie/modules/COMP41680/assignment2/month-feb-001.html
Getting http://mlg.ucd.ie/modules/COMP41680/assignment2/month-mar-001.html
Getting http://mlg.ucd.ie/modules/COMP41680/assignment2/month-apr-001.html
Getting http://mlg.ucd.ie/modules/COMP41680/assignment2/month-may-001.html
Getting http://mlg.ucd.ie/modules/COMP41680/assignment2/month-jun-001.html
Getting http://mlg.ucd.ie/modules/COMP41680/assignment2/month-jul-001.html
Getting http://mlg.ucd.ie/modules/COMP41680/assignment2/month-aug-001.html
Getting http://mlg.ucd.ie/modules/COMP41680/assignment2/month-sep-001.html
Getting http://mlg.ucd.ie/modules/COMP41680/assignment2/month-oct-001.html
Getting http://mlg.ucd.ie/modules/COMP41680/assignment2/month-nov-001.html
Getting http://mlg.ucd.ie/modules/COMP41680/assignment2/month-dec-001.html
Length of title_news: 16226
I have a set of href links (which I have already collected), and each of those pages contains further href links that I need. So I need to request each href link I already have and collect the hrefs present on those pages. With my code I only get the first level of hrefs; I want to follow each one and collect the hrefs present on the page it points to. How can I do that?
I tried:
from bs4 import BeautifulSoup
import requests

url = 'https://www.iea.org/oilmarketreport/reports/'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
#soup.prettify()
#table = soup.find("table")
#print(table)
links = []
for href in soup.find_all(class_='omrlist'):
    #print(href)
    links.append(href.find('a').get('href'))
print(links)
How do I loop over these to get the report URLs?
import requests
from bs4 import BeautifulSoup

root_url = 'https://www.iea.org'

def getLinks(url):
    all_links = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    for href in soup.find_all(class_='omrlist'):
        all_links.append(root_url + href.find('a').get('href'))  # add prefix 'http://....'
    return all_links

yearLinks = getLinks(root_url + '/oilmarketreport/reports/')

# get report URLs
reportLinks = []
for url in yearLinks:
    links = getLinks(url)
    reportLinks.extend(links)
print(reportLinks)

for url in reportLinks:
    if '.pdf' in url:
        url = url.replace('../../..', '')
        # download the pdf file here
        ...
    else:
        # extract the pdf url from the html page and download it
        ...
Now you can loop over reportLinks to get each PDF URL.
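As a rough illustration of the download step left as a placeholder above, here is a minimal sketch that saves the direct .pdf links with requests. It assumes the cleaned-up URL is absolute; the omr_pdfs output directory and the filename handling are just examples:

import os
import requests

out_dir = 'omr_pdfs'  # example output directory
os.makedirs(out_dir, exist_ok=True)

for url in reportLinks:
    if '.pdf' not in url:
        continue  # html report pages would still need the pdf link extracted first
    url = url.replace('../../..', '')
    filename = os.path.join(out_dir, url.rstrip('/').split('/')[-1])
    response = requests.get(url)
    if response.ok:
        with open(filename, 'wb') as f:
            f.write(response.content)  # save the raw pdf bytes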
I am trying to get all the unique urls of the website by calling the all_pages function recursively but this function is not giving all the urls of the website.
All I want to do is get all the unique urls of the website using BeautifulSoup. My code looks like this:
base_url = "http://www.readings.com.pk/"
unique_urls=[]
def all_pages(base_url,unique_urls=[]):
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all("a"):
url = link["href"]
absolute_url = urljoin(base_url, url)
if absolute_url not in unique_urls:
if base_url in absolute_url:
unique_urls.append(absolute_url)
print (absolute_url)
all_pages(absolute_url,unique_urls,book_urls)
all_pages(base_url,unique_urls)
Use response.text instead of response.content.
Also, you need to return at some point, and instead of making unique_urls a list, make it a set so the entries are always unique.
Your method is also recursive, and Python has a maximum recursion depth, so maybe you should instead do this:
base_url = "http://www.readings.com.pk/"
def all_pages(base_url):
response = requests.get(base_url)
unique_urls = {base_url}
visited_urls = set()
while len(unique_urls) > len(visited_urls)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.find_all("a"):
try:
url = link["href"]
except:
continue
absolute_url = base_url + url
unique_urls.add(absolute_url)
unvisited_url = (unique_urls - visited_urls).pop()
visited_urls.add(unvisited_url)
response = requests.get(unvisited_url)
return unique_urls
all_pages(base_url)
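One caveat with the code above: absolute_url = base_url + url will mangle hrefs that are already absolute or that are relative to a subdirectory. Since the original question already used urljoin, the link-handling step could be pulled into a small helper along these lines (collect_links is just an illustrative name, not part of the answer above):

from urllib.parse import urljoin

def collect_links(soup, page_url, base_url, unique_urls):
    # resolve each href against the page it came from and keep same-site links only
    for link in soup.find_all("a"):
        href = link.get("href")
        if not href:
            continue
        absolute_url = urljoin(page_url, href)
        if absolute_url.startswith(base_url):
            unique_urls.add(absolute_url)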
I'm trying to work on a project to scrape www.boattrader.com to push 800 listings with the Make, Price, and Phone Number of each boat to a CSV file.
I'm looking for guidance on the best way to scrape the links to each boat listing from the search results and then parse through each individual page to grab the Make, Price and Phone number.
Any guidance would be much appreciated!
Thanks again!
from bs4 import BeautifulSoup, SoupStrainer
import requests

def extract_from_search(search_results):
    # make this into a function
    r = requests.get(search_results)
    ad_page_html = r.text
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    for link in possible_links:
        if link.has_attr('href'):
            boat_links = link.attrs['href']
    return boat_links

search_results = 'http://www.boattrader.com/search-results/NewOrUsed-any/Type-all/Zip-90007/Radius-2000/Sort-Length:DESC/Page-1,50'
boat_links = extract_from_search(search_results)
print boat_links  # why does this only print one link? What would be the best way to iterate over the search results, so I can put those links into the boat_listing variable to grab the information I'm looking for?

def extract_from_listing(boat_listing):
    r = requests.get(boat_listing)
    ad_page_html = r.text
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    table_heads = soup.find_all('th')
    for th in table_heads:
        if th.text == "Make":
            make = th.find_next_sibling("td").text
    price = soup.find('span', {'class': 'bd-price'})
    formatted_price = price.string.strip()
    contact_info = soup.find('div', {'class': 'phone'})
    reversed_phone = contact_info.string[::-1]
    temp_phone = reversed_phone.replace(')', '}')
    temp_phone2 = temp_phone.replace('(', ')')
    correct_phone = temp_phone2.replace("}", "(")
    return make, formatted_price, correct_phone

boat_listing = 'http://www.boattrader.com/listing/2009-Briggs-BR9134-Sportfish-102290211'
make, price, phone = extract_from_listing(boat_listing)
print make
print price
print phone
You are only returning the last link; you need to append:
def extract_from_search(search_results):
    # make this into a function
    r = requests.get(search_results)
    ad_page_html = r.text
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    boat_links = []  # create list to append all links to
    for link in possible_links:
        if link.has_attr('href'):
            boat_links.append(link.attrs['href'])  # append each link
    return boat_links
Or use a list comp:
def extract_from_search(search_results):
    # make this into a function
    r = requests.get(search_results)
    ad_page_html = r.content  # use content to let requests handle the decoding
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    return [link.attrs['href'] for link in possible_links if link.has_attr('href')]
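From there, to get to the CSV of Make, Price, and Phone Number you described, one option is to feed each link returned by extract_from_search() into your extract_from_listing() and write the rows out with the csv module. A rough Python 3 sketch, assuming the hrefs returned are absolute URLs (if they are relative you would need to prefix the domain) and using boats.csv as an example filename:

import csv

search_results = 'http://www.boattrader.com/search-results/NewOrUsed-any/Type-all/Zip-90007/Radius-2000/Sort-Length:DESC/Page-1,50'
boat_links = extract_from_search(search_results)

with open('boats.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Make', 'Price', 'Phone'])
    for boat_listing in boat_links:
        make, price, phone = extract_from_listing(boat_listing)
        writer.writerow([make, price, phone])  # one row per boat listing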