How to download .qrs files from a website using Python and BeautifulSoup?

I would like to download all the files ending with .qrs, .dat, and .hea from this website and store them in a local folder:
https://physionet.org/physiobank/database/shareedb/
I tried modifying the solution from the following link.
Download .xls files from a webpage using Python and BeautifulSoup
This is how I modified the code:
import os
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve
URL = 'https://physionet.org/physiobank/database/shareedb/'
OUTPUT_DIR = '' # path to output folder, '.' or '' uses current folder
u = urlopen(URL)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()

soup = BeautifulSoup(html, "html.parser")
for link in soup.select('a[href^="https://"]'):  # or a[href*="shareedb/0"]
    href = link.get('href')
    if not any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        continue
    filename = os.path.join(OUTPUT_DIR, href.rsplit('/', 1)[-1])
    # We need a https:// URL for this site
    # href = href.replace('http://','https://')
    print("Downloading %s to %s..." % (href, filename))
    urlretrieve(href, filename)
print("Done.")
When I run this code, it does not extract the files from the target page, nor does it output any failure message (e.g. 'failed to download').
After some debugging I saw that none of the files are being selected in my case. I suspect it has more to do with the structure of the HTML.
How can I download these files to a local directory using Python?

You could use the excellent requests library as follows:
import bs4
import requests
url = "https://physionet.org/physiobank/database/shareedb/"
html = requests.get(url)
soup = bs4.BeautifulSoup(html.text, "html.parser")
for link in soup.find_all('a', href=True):
    href = link['href']
    if any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        print("Downloading '{}'".format(href))
        remote_file = requests.get(url + href)
        with open(href, 'wb') as f:
            for chunk in remote_file.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
This would download all .dat, .hea and .qrs files to your computer.
Install using the standard:
pip install requests
Note that all of the hrefs on that page are already plain filenames, so they can be used directly as local filenames (no need at the moment to strip away any / characters).
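If the hrefs on another page were not plain filenames (e.g. they contained path segments or a query string), a small hedged helper like the sketch below (the safe_local_name name is just illustrative, not from the answer above) could derive a safe local filename first:

import os
from urllib.parse import urlsplit

def safe_local_name(href):
    # keep only the last path component and drop any query string,
    # e.g. 'database/shareedb/01911.hea?download=1' -> '01911.hea'
    return os.path.basename(urlsplit(href).path)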

To expand on the answer by wolf tian: the select does not find anything because the links on that page do not start with "https://" (nor do they contain "shareedb") in their href. The files you are trying to download all have hrefs of the form 01911.hea, i.e. relative paths. So what you need to do is first extract those filenames, for example like this:
for link in soup.select('a'):
    href = link.get('href')
    if not href or not any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        continue
    filename = os.path.join(OUTPUT_DIR, href)
And then you need to prepend the base URL before retrieving each file:
urlretrieve(URL + href, filename)
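Putting both fixes together, a minimal sketch of the corrected script from the question (same URL and OUTPUT_DIR as in the question; not verified against the live site) could look like this:

import os
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup

URL = 'https://physionet.org/physiobank/database/shareedb/'
OUTPUT_DIR = ''  # '' or '.' means the current folder

with urlopen(URL) as u:
    html = u.read().decode('utf-8')

soup = BeautifulSoup(html, 'html.parser')
for link in soup.select('a'):
    href = link.get('href')
    if not href or not any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        continue
    filename = os.path.join(OUTPUT_DIR, href)
    print('Downloading %s to %s...' % (URL + href, filename))
    urlretrieve(URL + href, filename)  # prepend the base URL because the hrefs are relative
print('Done.')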

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
start_url = 'https://physionet.org/physiobank/database/shareedb/'
r = requests.get(start_url)
soup = BeautifulSoup(r.text, 'lxml')
# get full url of file
pre = soup.find('pre')
file_urls = pre.select('a[href*="."]')
full_urls = [urljoin(start_url, url['href']) for url in file_urls]

# download each file
for full_url in full_urls:
    file_name = full_url.split('/')[-1]
    print("Downloading {} to {}...".format(full_url, file_name))
    with open(file_name, 'wb') as f:
        fr = requests.get(full_url, stream=True)
        for chunk in fr.iter_content(chunk_size=1024):
            f.write(chunk)
    print('Done')
out:
Downloading https://physionet.org/physiobank/database/shareedb/01911.dat to 01911.dat...
Done
Downloading https://physionet.org/physiobank/database/shareedb/01911.hea to 01911.hea...
Done
Downloading https://physionet.org/physiobank/database/shareedb/01911.qrs to 01911.qrs...
Done
Downloading https://physionet.org/physiobank/database/shareedb/02012.dat to 02012.dat...
Done
Downloading https://physionet.org/physiobank/database/shareedb/02012.hea to 02012.hea...
Done
Downloading https://physionet.org/physiobank/database/shareedb/02012.qrs to 02012.qrs...

From your symptom, the likely reason is that no URL matched, so the code never entered the loop.
Since I use Python 2.7, I have not verified the code.
You may try printing the links you matched and then check whether those URLs can actually be downloaded and extracted.
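For example, a quick hedged debugging snippet (reusing the soup object from the question) is to print every matched href before attempting any download:

# print every candidate href so you can see what the selector actually matches
for link in soup.find_all('a', href=True):
    href = link['href']
    if any(href.endswith(x) for x in ('.dat', '.hea', '.qrs')):
        print(href)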

Related

Why do I scrape corrupted PDFs of same size with BeautifulSoup?

I went through similar topics here but did not find anything helpful for my case.
I managed to download all the PDFs (for personal learning purposes) to a local folder, but I cannot open them. They also all have the same size (310 kB). Perhaps you can find some mistake in my code. Thanks.
import os
import requests
from bs4 import BeautifulSoup
# define the URL to scrape
url = 'https://www.apotheken-umschau.de/medikamente/arzneimittellisten/medikamente_i.html'
# define the folder to save the PDFs to
save_path = r'C:\PDFs'
# create the folder if it doesn't exist
if not os.path.exists(save_path):
    os.makedirs(save_path)
# make a request to the URL
response = requests.get(url)
# parse the HTML content of the page
soup = BeautifulSoup(response.content, 'html.parser')
# find all links on the page that contain 'href="/medikamente/beipackzettel/"'
links = soup.find_all('a', href=lambda href: href and '/medikamente/beipackzettel/' in href)
# loop through each link and download the PDF
for link in links:
    href = link['href']
    file_name = href.split('?')[0].split('/')[-1] + '.pdf'
    pdf_url = 'https://www.apotheken-umschau.de' + href + '&file=pdf'
    response = requests.get(pdf_url)
    with open(os.path.join(save_path, file_name), 'wb') as f:
        f.write(response.content)
        f.close()
    print(f'Downloaded {file_name} to {save_path}')
There are some issues here:
Select your elements from the list more specifically, using CSS selectors:
soup.select('article li a[href*="/medikamente/beipackzettel/"]')
Check the responses you get from your requests to see whether the expected elements are available and what the behavior looks like.
You will notice that you have to iterate over more levels than you have done.
for link in soup.select('article li a[href*="/medikamente/beipackzettel/"]'):
    soup_detail_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + link.get('href')).content)
    for file in soup_detail_page.select('a:-soup-contains("Original Beipackzettel")'):
        soup_file_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + file.get('href')).content)
You will notice that the PDF is displayed in an iframe, so you have to scrape it via its external URL:
pdf_url = soup_file_page.iframe.get('src').split('?file=')[-1]
You will also notice that there is more than just the Beipackzettel available for download.
Example
import os
import requests
from bs4 import BeautifulSoup
# define the URL to scrape
url = 'https://www.apotheken-umschau.de/medikamente/arzneimittellisten/medikamente_i.html'
# define the folder to save the PDFs to
save_path = r'C:\PDFs'
# create the folder if it doesn't exist
if not os.path.exists(save_path):
    os.makedirs(save_path)

# parse the HTML content of the page
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# loop through each link and download the PDF
for link in soup.select('article li a[href*="/medikamente/beipackzettel/"]'):
    soup_detail_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + link.get('href')).content, 'html.parser')
    for file in soup_detail_page.select('a:-soup-contains("Original Beipackzettel")'):
        soup_file_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + file.get('href')).content, 'html.parser')
        pdf_url = soup_file_page.iframe.get('src').split('?file=')[-1]
        file_name = file.get('href').split('.html')[0].split('/')[-1] + '.pdf'
        with open(os.path.join(save_path, file_name), 'wb') as f:
            f.write(requests.get(pdf_url).content)
        print(f'Downloaded {file_name} to {save_path}')

BeautifulSoup cannot find every link on the page

Here is my code:
from bs4 import BeautifulSoup
import requests
from requests import get
import os
def file_download():
    domain = "ec.europa.eu"
    page = requests.get("https://ec.europa.eu/eurostat/web/main/data/database")
    html = page.text
    soup = BeautifulSoup(html, "html.parser")
    for link in soup.find_all('a'):
        url = link.get('href')
        print(url)
        if ".gz" in url:
            file_name = url.split("file=", 1)[1]
            if os.path.exists(file_name):
                print("File already exists.")
                continue
            else:
                with open(file_name, 'wb') as file:
                    print('Downloading...')
                    response = get(url)
                    file.write(response.content)
                continue
        else:
            continue
    print('\nEvery file has been downloaded!')
In the above code I cannot seem to find every possible link on the page.
In Chrome's inspector, copying the element gives me what I wrote as a comment.
That is what I want to find with BeautifulSoup, as well as other similar links.
It is probably best to avoid accessing the files via the tree structure (as it would require a lot of JSON interactions).
An easier approach is to use their file listing of all of their files:
from bs4 import BeautifulSoup
import requests
import os

session = requests.Session()
req_all = session.get("https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?dir=data&sort=1&sort=2&start=all")

soup = BeautifulSoup(req_all.content, "lxml")
table = soup.find('table', id='filetable')

for a in table.find_all('a', href=True):
    if a.text == "Download":
        href = a['href']
        if '.gz' in href:
            filename = href.rsplit('%2F', 1)[1]
            if not os.path.exists(filename):
                with open(filename, 'wb') as f_gz:
                    f_gz.write(requests.get(href).content)
                print(filename)

Scraping and downloading excel files using python from url

First question, so take it easy on me.
I'm trying to programmatically download all the excel files from a website using Python. I'm very new to web scraping, so my code may not be up to snuff - I've dropped it in below. When I run the script, I'm not seeing any output, and the files that I was meaning to download are nowhere to be found.
I'm not sure what I've got wrong or if I'm running the script incorrectly. I'm running it through Anaconda Navigator, navigating to the directory with the script, and then running it using the command below:
python file-scraper.py
Here is the code for my script. Any help or suggestions are appreciated!
from bs4 import BeautifulSoup as bs
import requests
DOMAIN = 'https://lfportal.loudoun.gov/LFPortalinternet/'
URL = 'https://lfportal.loudoun.gov/LFPortalinternet/Browse.aspx?startid=213973&row=1&dbid=0'
FILETYPE = '.xls'
def get_soup(url):
    return bs(requests.get(url).text, 'html.parser')

for link in get_soup(URL).find_all('a'):
    file_link = link.get('href')
    if FILETYPE in file_link:
        print(file_link)
        with open(link.text, 'wb') as file:
            response = requests.get(DOMAIN + file_link)
            file.write(response.content)
You have the most common problem: the browser uses JavaScript to add links to the page (when you click a year), but requests/BeautifulSoup can't run JavaScript.
You can turn off JavaScript in the browser and check whether you can still reach the files without it. Then you have to see how the page works and do the same in code. Sometimes this needs Selenium to control a real browser, which can run JavaScript.
When I open the URL in a browser without JavaScript, I don't see any .xls files. I have to click a year, and then it loads a different URL that contains the .xls links.
2017: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/213974/Row1.aspx
2018: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/285051/Row1.aspx
2019: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/312510/Row1.aspx
2020: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/384496/Row1.aspx
2021: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/466963/Row1.aspx
You have to use BeautifulSoup to find these URLs, load them with requests, and then search for .xls links on each of them.
EDIT:
The code below searches the subpages and uses them to download the files.
It downloads every year into a separate folder.
import requests
from bs4 import BeautifulSoup as bs
import os

# --- functions ---

def get_soup(url):
    response = requests.get(url)
    #print(response.status_code)
    #print(response.text)
    html = response.text
    soup = bs(html, 'html.parser')
    #soup = bs(html, 'lxml')
    #soup = bs(html, 'html5lib')
    return soup

# --- main ---

# - data -

DOMAIN = 'https://lfportal.loudoun.gov/LFPortalinternet/'
URL = 'https://lfportal.loudoun.gov/LFPortalinternet/Browse.aspx?startid=213973&row=1&dbid=0'
FILETYPE = '.xls'

# - code -

soup = get_soup(URL)

for folder_link in soup.find_all('a', {'class': 'DocumentBrowserNameLink'}):
    folder_name = folder_link.get('aria-label').split(' ')[0]
    folder_link = folder_link.get('href')
    print('folder:', folder_name)

    os.makedirs(folder_name, exist_ok=True)

    subsoup = get_soup(DOMAIN + folder_link)

    for file_link in subsoup.find_all('a', {'class': 'DocumentBrowserNameLink'}):
        file_name = file_link.get('aria-label')[:-4]  # skip extra `.xls` at the end
        file_link = file_link.get('href')

        if file_link.endswith(FILETYPE):
            print('  file:', file_name)
            file_name = os.path.join(folder_name, file_name)

            with open(file_name, 'wb') as file:
                response = requests.get(DOMAIN + file_link)
                file.write(response.content)
BTW: I put it on GitHub furas/python-examples
Your webpage only contains the folders, which you, as a human, have to click in order to get to the files.
With BS you have to get the links of the folders, then request them to get the file list.
What simplifies your case is that both folders and files have the class attribute DocumentBrowserNameLink.
You can write a function to find them:
from bs4 import BeautifulSoup as bs
import requests
DOMAIN = 'https://lfportal.loudoun.gov/LFPortalinternet/'
URL = 'https://lfportal.loudoun.gov/LFPortalinternet/Browse.aspx?startid=213973&row=1&dbid=0'
FILETYPE = '.xls'
def get_soup(url):
    return bs(requests.get(url).text, 'html.parser')

def get_links(page):
    result = page.find_all(class_="DocumentBrowserNameLink")
    return result

page = get_soup(URL)
folder_links = get_links(page)

for link in folder_links:
    page2 = get_soup(DOMAIN + link['href'])
    file_links = get_links(page2)
    for file in file_links:
        filepath = file['href']
        if FILETYPE in filepath:
            print(DOMAIN + filepath)
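If you also want to save the spreadsheets rather than just print their URLs, a hedged extension of the inner loop (reusing DOMAIN, FILETYPE, requests and the file_links list from the answer above) could look like this:

import os

for file in file_links:
    filepath = file['href']
    if FILETYPE in filepath:
        local_name = os.path.basename(filepath)  # e.g. 'SomeReport.xls'
        response = requests.get(DOMAIN + filepath)
        with open(local_name, 'wb') as fh:
            fh.write(response.content)
        print('Saved', local_name)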

Download all files from web using Python error

I am trying to download all files from this website https://superbancos.gob.pa/es/fin-y-est/reportes-estadisticos
I found this code on a page and I am trying to adapt it to my process
If you could help me I would appreciate it
# Import the libraries here
import requests
from bs4 import BeautifulSoup
# specify the URL of the archive here
archive_url = "https://www.superbancos.gob.pa/es/fin-y-est/reportes-estadisticos"
def get_video_links():
    r = requests.get(archive_url)
    soup = BeautifulSoup(r.content, 'html5lib')
    links = soup.findAll('a')
    video_links = [archive_url + link['href'] for link in links if link['href'].endswith('xlsx')]
    return video_links

def download_video_series(video_links):
    for link in video_links:
        '''iterate through all links in video_links
        and download them one by one'''
        # obtain filename by splitting url and getting
        # last string
        file_name = link.split('/')[-1]
        print("Downloading file:{!s}".format(file_name))
        # create response object
        r = requests.get(link, stream=True)
        # download started
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)
        print("{!s} downloaded!\n".format(file_name))
    print("All files downloaded!")
    return

if __name__ == "__main__":
    video_links = get_video_links()
    download_video_series(video_links)
But when I start the program, it says "All files downloaded!" and does not download any of them.
The information you are looking for is dynamically loaded with JS code. So you should use something that can run JS and render the page like you see it in the browser.
The most straightforward way is using selenium:
from bs4 import BeautifulSoup
from selenium import webdriver
def get_soup(link):
    driver = webdriver.Chrome()
    driver.get(link)
    soup = BeautifulSoup(driver.page_source, 'html5lib')
    driver.close()
    return soup
So your first function could be rewritten as
def get_video_links():
    soup = get_soup(archive_url)
    links = soup.findAll('a')
    video_links = [archive_url + link['href'] for link in links if link['href'].endswith('xlsx')]
    return video_links
Just make sure to set up your ChromeDriver properly! Here is the documentation.
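As a hedged sketch (not from the original answer): with Selenium 4.6+ the bundled Selenium Manager usually resolves a matching ChromeDriver automatically, so a headless setup can be as simple as:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

archive_url = "https://www.superbancos.gob.pa/es/fin-y-est/reportes-estadisticos"

options = Options()
options.add_argument("--headless=new")      # run Chrome without opening a window
driver = webdriver.Chrome(options=options)  # Selenium Manager fetches a matching driver
driver.get(archive_url)
html = driver.page_source                   # rendered HTML, ready for BeautifulSoup
driver.quit()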
The problem here is that the page requires JavaScript. Your best bet is to use the Selenium webdriver to handle this instead of plain requests with bs4, as shown in the answer above.

Downloading files from a website using python

I need to download all the files from the given website (https://www.sec.gov/litigation/suspensions.shtml). It has data from the years 1995 to 2017, and inside each year there are multiple links to the files that need to be downloaded. The files are in .pdf, .htm and .txt format. I tried scraping the data by following various tutorials, but what I need to do is different from the usual web scraping tutorials. I used the following code, but it did not serve my purpose. I am new to Python and I am stuck here as to how to move forward. Can anyone please suggest what needs to be done?
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.sec.gov/litigation/suspensions.shtml")
r.content
soup = BeautifulSoup(r.content)
print soup.prettify()
links = soup.find_all("a")
for link in links:
    print "<a href= '%s'>%s</a>" % (link.get("href"), link.text)
g_data = soup.find_all("p", {"id": "archive-links"})
print g_data
for item in g_data:
    print item.text
That should do the job. I checked it on Python 3.6, but the code should be Python 2.7 compatible.
The main idea is to find the link for every year and then grab all the links to pdf, htm and txt files on each year's page.
from __future__ import print_function
import os
import requests
from bs4 import BeautifulSoup


def file_links_filter(tag):
    """
    Tags filter. Return True for links that end with 'pdf', 'htm' or 'txt'
    """
    if isinstance(tag, str):
        return tag.endswith('pdf') or tag.endswith('htm') or tag.endswith('txt')


def get_links(tags_list):
    return [WEB_ROOT + tag.attrs['href'] for tag in tags_list]


def download_file(file_link, folder):
    file = requests.get(file_link).content
    name = file_link.split('/')[-1]
    save_path = folder + name
    print("Saving file:", save_path)
    with open(save_path, 'wb') as fp:
        fp.write(file)


WEB_ROOT = 'https://www.sec.gov'
# expand '~' so open() gets a real path, and make sure the folder exists
SAVE_FOLDER = os.path.expanduser('~/download_files/')  # directory in which files will be downloaded
if not os.path.isdir(SAVE_FOLDER):
    os.makedirs(SAVE_FOLDER)

r = requests.get("https://www.sec.gov/litigation/suspensions.shtml")
soup = BeautifulSoup(r.content, 'html.parser')

years = soup.select("p#archive-links > a")  # css selector for all <a> inside <p id='archive-links'> tag
years_links = get_links(years)

links_to_download = []
for year_link in years_links:
    page = requests.get(year_link)
    beautiful_page = BeautifulSoup(page.content, 'html.parser')

    links = beautiful_page.find_all("a", href=file_links_filter)
    links = get_links(links)
    links_to_download.extend(links)

# make a set to exclude duplicate links
links_to_download = set(links_to_download)
print("Got links:", links_to_download)

for link in links_to_download:
    download_file(link, SAVE_FOLDER)
