How to scrape whole website using beautifulsoup - python

I am trying to get all the unique urls of the website by calling the all_pages function recursively but this function is not giving all the urls of the website.
All I want to do is get all the unique urls of the website using BeautifulSoup. My code looks like this:
base_url = "http://www.readings.com.pk/"
unique_urls=[]
def all_pages(base_url,unique_urls=[]):
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all("a"):
url = link["href"]
absolute_url = urljoin(base_url, url)
if absolute_url not in unique_urls:
if base_url in absolute_url:
unique_urls.append(absolute_url)
print (absolute_url)
all_pages(absolute_url,unique_urls,book_urls)
all_pages(base_url,unique_urls)

Use response.text instead of response.content
Also, you need to return at some point. Additionally, instead of making unique_urls a list, make it a set and they will always be unique.
Additionally, your method is recursive and python has a max recursion depth, so maybe you should instead do this:
base_url = "http://www.readings.com.pk/"
def all_pages(base_url):
response = requests.get(base_url)
unique_urls = {base_url}
visited_urls = set()
while len(unique_urls) > len(visited_urls)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.find_all("a"):
try:
url = link["href"]
except:
continue
absolute_url = base_url + url
unique_urls.add(absolute_url)
unvisited_url = (unique_urls - visited_urls).pop()
visited_urls.add(unvisited_url)
response = requests.get(unvisited_url)
return unique_urls
all_pages(base_url)

Related

My while loop to scrape all pages in the website is not working

Here is the website I am trying to scrape: https://books.toscrape.com/
Below are my functions. The scrape_all_pages() is not working. Is there a better way to get the page number from the website directly so I can use the range function instead?
I did checkout Finding number of pages using Python BeautifulSoup
import requests
from bs4 import BeautifulSoup
def get_soup(url):
"""Takes a URL and returns a BeautifulSoup() instance representing the HTML of the page."""
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")
return soup
def scrape_page(num):
"""Takes a page and returns a list of links to the book that are on the page."""
BASE_URL = 'http://books.toscrape.com/catalogue/'
PAGE_URL = BASE_URL + str('page-')
book_url = []
soup = get_soup(PAGE_URL + str(num)+ '.html')
for x in soup.findAll("article", class_ = "product_pod"):
url = x.div.a.get('href')
link = BASE_URL + url
if x not in book_url:
book_url.append(link)
return book_url
def scrape_all_pages():
"""Scrapes all pages, returning a list of book links."""
page_num = 0
all_urls = []
while True:
url = scrape_page(page_num)
if not url:
break
all_urls += url
page_num += 1
return all_urls
It do not need range() in most cases. Would recommend to change strategy and take a look if there is a link to next page available or not:
if soup.select_one('li.next a[href]'):
nextPage = BASE_URL + soup.select_one('li.next a[href]')['href']
else:
nextPage = None
or from python3.8 and later:
nextPage = BASE_URL + a['href'] if(a := soup.select_one('li.next a[href]')) else None
Example
Note Starts from https://books.toscrape.com/catalogue/page-45.html to limit for demo. You could simply change it to get https://books.toscrape.com/ getting all pages scraped
import requests
from bs4 import BeautifulSoup
def get_soup(url):
"""Takes a URL and returns a BeautifulSoup() instance representing the HTML of the page."""
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")
return soup
def scrape_page(url):
"""Takes a page and append link of the books that are on the page to global list"""
BASE_URL = 'http://books.toscrape.com/catalogue/'
soup = get_soup(url)
for x in soup.find_all("article", class_ = "product_pod"):
url = x.div.a.get('href')
link = BASE_URL + url
if x not in book_url:
book_url.append(link)
if soup.select_one('li.next a[href]'):
nextPage = BASE_URL + soup.select_one('li.next a[href]')['href']
else:
nextPage = None
return nextPage
def scrape_all_pages(url):
"""Scrapes all pages, returning a list of book links."""
while True:
if url:
print(url)
url = scrape_page(url)
else:
break
return book_url
book_url = []
scrape_all_pages('https://books.toscrape.com/catalogue/page-45.html')
Output
['http://books.toscrape.com/catalogue/annie-on-my-mind_120/index.html',
'http://books.toscrape.com/catalogue/and-then-there-were-none_119/index.html',
'http://books.toscrape.com/catalogue/a-walk-in-the-woods-rediscovering-america-on-the-appalachian-trail_118/index.html',
'http://books.toscrape.com/catalogue/a-visit-from-the-goon-squad_117/index.html',
'http://books.toscrape.com/catalogue/a-storm-of-swords-a-song-of-ice-and-fire-3_116/index.html',
'http://books.toscrape.com/catalogue/a-heartbreaking-work-of-staggering-genius_115/index.html',
'http://books.toscrape.com/catalogue/8-keys-to-mental-health-through-exercise_114/index.html',
'http://books.toscrape.com/catalogue/girlboss_113/index.html',
'http://books.toscrape.com/catalogue/the-suffragettes-little-black-classics-96_112/index.html',
'http://books.toscrape.com/catalogue/the-sense-of-an-ending_111/index.html',
'http://books.toscrape.com/catalogue/the-sandman-vol-2-the-dolls-house-the-sandman-volumes-2_110/index.html',
'http://books.toscrape.com/catalogue/the-course-of-love_109/index.html',
'http://books.toscrape.com/catalogue/sugar-rush-offensive-line-2_108/index.html',
'http://books.toscrape.com/catalogue/saga-volume-2-saga-collected-editions-2_107/index.html',
'http://books.toscrape.com/catalogue/run-spot-run-the-ethics-of-keeping-pets_106/index.html',...]

Cannot find the table data within the soup, but I know its there

I am trying create a function that scrapes college baseball team roster pages for a project. And I have created a function that crawls the roster page, gets a list of the links I want to scrape. But when I try to scrape the individual links for each player, it works but cannot find the data that is on their page.
This is the link to the page I am crawling from at the start:
https://gvsulakers.com/sports/baseball/roster
These are just functions that I call within the function that I am having a problem with:
def parse_row(rows):
return [str(x.string)for x in rows.find_all('td')]
def scrape(url):
page = requests.get(url, headers = headers)
html = page.text
soop = BeautifulSoup(html, 'lxml')
return(soop)
def find_data(url):
page = requests.get(url, headers = headers)
html = page.text
soop = BeautifulSoup(html, 'lxml')
row = soop.find_all('tr')
lopr = [parse_row(rows) for rows in row]
return(lopr)
Here is what I am having an issue with. when I assign type1_roster with a variable and print it, i only get an empty list. Ideally it should contain data about a player or players from a players roster page.
# Roster page crawler
def type1_roster(team_id):
url = "https://" + team_id + ".com/sports/baseball/roster"
soop = scrape(url)
href_tags = soop.find_all(href = True)
hrefs = [tag.get('href') for tag in href_tags]
# get all player links
player_hrefs = []
for href in hrefs:
if 'sports/baseball/roster' in href:
if 'sports/baseball/roster/coaches' not in href:
if 'https:' not in href:
player_hrefs.append(href)
# get rid of duplicates
player_links = list(set(player_hrefs))
# scrape the roster links
for link in player_links:
player_ = url + link[24:]
return(find_data(player_))
A number of things:
I would pass the headers as a global
You are slicing 1 character too late the link I think for player_
You need to re-work the logic of find_data(), as data is present in a mixture of element types and not in table/tr/td elements e.g. found in spans. The html attributes are nice and descriptive and will support targeting content easily
You can target the player links from the landing page more tightly with the css selector list shown below. This removes the need for multiple loops as well as the use of list(set())
import requests
from bs4 import BeautifulSoup
HEADERS = {'User-Agent': 'Mozilla/5.0'}
def scrape(url):
page = requests.get(url, headers=HEADERS)
html = page.text
soop = BeautifulSoup(html, 'lxml')
return(soop)
def find_data(url):
page = requests.get(url, headers=HEADERS)
#print(page)
html = page.text
soop = BeautifulSoup(html, 'lxml')
# re-think logic here to return desired data e.g.
# soop.select_one('.sidearm-roster-player-jersey-number').text
first_name = soop.select_one('.sidearm-roster-player-first-name').text
# soop.select_one('.sidearm-roster-player-last-name').text
# need targeted string cleaning possibly
bio = soop.select_one('#sidearm-roster-player-bio').get_text('')
return (first_name, bio)
def type1_roster(team_id):
url = "https://" + team_id + ".com/sports/baseball/roster"
soop = scrape(url)
player_links = [i['href'] for i in soop.select(
'.sidearm-roster-players-container .sidearm-roster-player h3 > a')]
# scrape the roster links
for link in player_links:
player_ = url + link[23:]
# print(player_)
return(find_data(player_))
print(type1_roster('gvsulakers'))

How do I get href links from href using python/pandas

I need to get href links which is present in href(which i have already) So I need to hit that href links and collect the other href. I tried but from that code only first href are getting, want to hit that one and collect href which present in that previous one. so how could I do that.
I Tried:
from bs4 import BeautifulSoup
import requests
url = 'https://www.iea.org/oilmarketreport/reports/'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
#soup.prettify()
#table = soup.find("table")
#print(table)
links = []
for href in soup.find_all(class_='omrlist'):
#print(href)
links.append(href.find('a').get('href'))
print(links)
here how to loop to get report url
import requests
root_url = 'https://www.iea.org'
def getLinks(url):
all_links = []
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
for href in soup.find_all(class_='omrlist'):
all_links.append(root_url + href.find('a').get('href')) # add prefix 'http://....'
return all_links
yearLinks = getLinks(root_url + '/oilmarketreport/reports/')
# get report URL
reportLinks = []
for url in yearLinks:
links = getLinks(url)
reportLinks.extend(links)
print(reportLinks)
for url in reportLinks:
if '.pdf' in url:
url = url.replace('../../..', '')
# do download pdf file
....
else:
# do extract pdf url from html and download it
....
....
now you can loop reportLinks to get pdf url

How do I get hrefs from hrefs?

How do I get hrefs from hrefs using Python in class and method format?
I have tried:
root_url = 'https://www.iea.org'
class IEAData:
def __init__(self):
try:--
except:
def get_links(self, url):
all_links = []
page = requests.get(root_url)
soup = BeautifulSoup(page.text, 'html.parser')
for href in soup.find_all(class_='omrlist'):
all_links.append(root_url + href.find('a').get('href'))
return all_links
#print(all_links)
iea_obj = IEAData()
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')
reportLinks = []
for url in yearLinks:
links =iea_obj.get_links(yearLinks)
print(links)
Recommended: links variable must have all month hrefs but not getting, so please tell me how I should do it.
There were a couple of issues with your code. Your get_links() function was not using the url that was passed to it. When looping over the returned links, you were passing yearLinks rather than the url.
The following should get you going:
from bs4 import BeautifulSoup
import requests
root_url = 'https://www.iea.org'
class IEAData:
def get_links(self, url):
all_links = []
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
for li in soup.find_all(class_='omrlist'):
all_links.append(root_url + li.find('a').get('href'))
return all_links
iea_obj = IEAData()
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')
for url in yearLinks:
links = iea_obj.get_links(url)
print(url, links)
This would give you output starting:
https://www.iea.org/oilmarketreport/reports/2018/ ['https://www.iea.org/oilmarketreport/reports/2018/0118/', 'https://www.iea.org/oilmarketreport/reports/2018/0218/', 'https://www.iea.org/oilmarketreport/reports/2018/0318/', 'https://www.iea.org/oilmarketreport/reports/2018/0418/', 'https://www.iea.org/oilmarketreport/reports/2018/0518/', 'https://www.iea.org/oilmarketreport/reports/2018/0618/', 'https://www.iea.org/oilmarketreport/reports/2018/0718/', 'https://www.iea.org/oilmarketreport/reports/2018/0818/', 'https://www.iea.org/oilmarketreport/reports/2018/1018/']
https://www.iea.org/oilmarketreport/reports/2017/ ['https://www.iea.org/oilmarketreport/reports/2017/0117/', 'https://www.iea.org/oilmarketreport/reports/2017/0217/', 'https://www.iea.org/oilmarketreport/reports/2017/0317/', 'https://www.iea.org/oilmarketreport/reports/2017/0417/', 'https://www.iea.org/oilmarketreport/reports/2017/0517/', 'https://www.iea.org/oilmarketreport/reports/2017/0617/', 'https://www.iea.org/oilmarketreport/reports/2017/0717/', 'https://www.iea.org/oilmarketreport/reports/2017/0817/', 'https://www.iea.org/oilmarketreport/reports/2017/0917/', 'https://www.iea.org/oilmarketreport/reports/2017/1017/', 'https://www.iea.org/oilmarketreport/reports/2017/1117/', 'https://www.iea.org/oilmarketreport/reports/2017/1217/']
I'm fairly new to programming, and I'm still learning and trying to understand how classes and whatnot all work together. But gave it a shot (that's how we learn, right?)
Not sure if this is what you're looking for as your output. I changed 2 things and was able to put all the links from within the yearLinks into a list. Note that it'll also include the PDF links as well as the months links that I think you wanted. If you don't want those PDF links, and exclusively the months, then just don't include the pdf.
So here's the code I did it with, and maybe you can use that to fit into how you have it structured.
root_url = 'https://www.iea.org'
class IEAData:
def get_links(self, url):
all_links = []
page = requests.get(url)
soup = bs4.BeautifulSoup(page.text, 'html.parser')
for href in soup.find_all(class_='omrlist'):
all_links.append(root_url + href.find('a').get('href'))
return all_links
#print(all_links)
iea_obj = IEAData()
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')
reportLinks = []
for url in yearLinks:
links = iea_obj.get_links(url)
# uncomment line below if you do not want the .pdf links
#links = [ x for x in links if ".pdf" not in x ]
reportLinks += links

Deep parse with beautifulsoup

I try to parse https://www.drugbank.ca/drugs. The idea is to extract all the drug names and some additional informationfor each drug. As you can see each webpage represents a table with drug names and the when we hit the drugname we can access to this drug information.
Let's say I will keep the following code to handle the pagination:
import requests
from bs4 import BeautifulSoup
def drug_data():
url = 'https://www.drugbank.ca/drugs/'
while url:
print(url)
r = requests.get(url)
soup = BeautifulSoup(r.text ,"lxml")
#data = soup.select('name-head a')
#for link in data:
# href = 'https://www.drugbank.ca/drugs/' + link.get('href')
# pages_data(href)
# next page url
url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
print(url)
if url:
url = 'https://www.drugbank.ca' + url[0].get('href')
else:
break
drug_data()
The issue is that in each page, and for each drug in the table of this page I need to capture :
Name.
Accession Number.
Structured Indications,
Generic Prescription Products,
I used the classical request/beautifusoup but can't go deep ..
Some Help please
Create function with requests and BeautifulSoup to get data from subpage
import requests
from bs4 import BeautifulSoup
def get_details(url):
print('details:', url)
# get subpage
r = requests.get(url)
soup = BeautifulSoup(r.text ,"lxml")
# get data on subpabe
dts = soup.findAll('dt')
dds = soup.findAll('dd')
# display details
for dt, dd in zip(dts, dds):
print(dt.text)
print(dd.text)
print('---')
print('---------------------------')
def drug_data():
url = 'https://www.drugbank.ca/drugs/'
while url:
print(url)
r = requests.get(url)
soup = BeautifulSoup(r.text ,"lxml")
# get links to subpages
links = soup.select('strong a')
for link in links:
# exeecute function to get subpage
get_details('https://www.drugbank.ca' + link['href'])
# next page url
url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
print(url)
if url:
url = 'https://www.drugbank.ca' + url[0].get('href')
else:
break
drug_data()
To crawl effectively, you'll want to implement a few measures, such as maintaining a queue of urls to visit and be aware of what urls you have already visited.
Keeping in mind that links can be absolute or relative and that redirects are very likely, you also probably want to construct the urls dynamically rather than string concatenation.
Here is a generic (we usually only want to use example.com on SO) crawling workflow...
from urllib.parse import urljoin, urlparse # python
# from urlparse import urljoin, urlparse # legacy python2
import requests
from bs4 import BeautifulSoup
def process_page(soup):
'''data extraction process'''
pass
def is_external(link, base='example.com'):
'''determine if the link is external to base'''
site = urlparse(link).netloc
return base not in site
def resolve_link(current_location, href):
'''resolves final location of a link including redirects'''
req_loc = urljoin(current_location, href)
response = requests.head(req_loc)
resolved_location = response.url # location after redirects
# if you don't want to visit external links...
if is_external(resolved_location):
return None
return resolved_location
url_queue = ['https://example.com']
visited = set()
while url_queue:
url = url_queue.pop() # removes a url from the queue and assign it to `url`
response = requests.get(url)
current_location = response.url # final location after redirects
visited.add(url) # note that we've visited the given url
visited.add(current_location) # and the final location
soup = BeautifulSoup(response.text, 'lxml')
process_page(soup) # scrape the page
link_tags = soup.find_all('a') # gather additional links
for anchor in link_tags:
href = anchor.get('href')
link_location = resolve_link(current_location, href)
if link_location and link_location not in visited:
url_queue.append(link_location)

Categories