Downloading files from a website using python

I need to download all the files from the given website (https://www.sec.gov/litigation/suspensions.shtml). It has data from the years 1995 to 2017, and inside each year there are multiple links to the files that need to be downloaded. The files are in .pdf, .htm and .txt format. I tried scraping the data by following various tutorials, but what I need to do is different from the usual web scraping tutorials. I used the following code but it did not serve my purpose. I am new to Python and I am stuck as to how to move forward. Can anyone please suggest what needs to be done?
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.sec.gov/litigation/suspensions.shtml")
r.content
soup = BeautifulSoup(r.content)
print soup.prettify()

links = soup.find_all("a")
for link in links:
    print "<a href= '%s'>%s</a>" % (link.get("href"), link.text)

g_data = soup.find_all("p", {"id": "archive-links"})
print g_data
for item in g_data:
    print item.text

That should do the job. Checked on Python 3.6, but the code should be Python 2.7 compatible.
The main idea is to find the link for every year and then grab all links to the pdf, htm and txt files for each year.
from __future__ import print_function

import os

import requests
from bs4 import BeautifulSoup


def file_links_filter(tag):
    """
    Tags filter. Return True for links that end with 'pdf', 'htm' or 'txt'.
    """
    if isinstance(tag, str):
        return tag.endswith('pdf') or tag.endswith('htm') or tag.endswith('txt')


def get_links(tags_list):
    return [WEB_ROOT + tag.attrs['href'] for tag in tags_list]


def download_file(file_link, folder):
    file = requests.get(file_link).content
    name = file_link.split('/')[-1]
    save_path = folder + name
    print("Saving file:", save_path)
    with open(save_path, 'wb') as fp:
        fp.write(file)


WEB_ROOT = 'https://www.sec.gov'
# directory in which files will be downloaded; expand '~' so open() gets a real path
SAVE_FOLDER = os.path.expanduser('~/download_files/')
if not os.path.isdir(SAVE_FOLDER):
    os.makedirs(SAVE_FOLDER)

r = requests.get("https://www.sec.gov/litigation/suspensions.shtml")
soup = BeautifulSoup(r.content, 'html.parser')

years = soup.select("p#archive-links > a")  # css selector for all <a> inside the <p id='archive-links'> tag
years_links = get_links(years)

links_to_download = []
for year_link in years_links:
    page = requests.get(year_link)
    beautiful_page = BeautifulSoup(page.content, 'html.parser')

    links = beautiful_page.find_all("a", href=file_links_filter)
    links = get_links(links)
    links_to_download.extend(links)

# make a set to exclude duplicate links
links_to_download = set(links_to_download)
print("Got links:", links_to_download)

for link in links_to_download:
    download_file(link, SAVE_FOLDER)


Why do I scrape corrupted PDFs of same size with BeautifulSoup?

I went through similar topics here but did not find anything helpful for my case.
I managed to download all the PDFs (for personal learning purposes) into a local folder, but I cannot open them. They also all have the same size (310 kB). Perhaps you can spot a mistake in my code. Thanks.
import os
import requests
from bs4 import BeautifulSoup

# define the URL to scrape
url = 'https://www.apotheken-umschau.de/medikamente/arzneimittellisten/medikamente_i.html'

# define the folder to save the PDFs to
save_path = r'C:\PDFs'

# create the folder if it doesn't exist
if not os.path.exists(save_path):
    os.makedirs(save_path)

# make a request to the URL
response = requests.get(url)

# parse the HTML content of the page
soup = BeautifulSoup(response.content, 'html.parser')

# find all links on the page whose href contains '/medikamente/beipackzettel/'
links = soup.find_all('a', href=lambda href: href and '/medikamente/beipackzettel/' in href)

# loop through each link and download the PDF
for link in links:
    href = link['href']
    file_name = href.split('?')[0].split('/')[-1] + '.pdf'
    pdf_url = 'https://www.apotheken-umschau.de' + href + '&file=pdf'
    response = requests.get(pdf_url)
    with open(os.path.join(save_path, file_name), 'wb') as f:
        f.write(response.content)
        f.close()
    print(f'Downloaded {file_name} to {save_path}')
There are some issues here:
Select your elements from the list more specifically, using CSS selectors:
soup.select('article li a[href*="/medikamente/beipackzettel/"]')
Check the responses you get from your requests to see whether the expected elements are available and what the behavior looks like.
You will notice that you have to iterate over more levels than you have done:
for link in soup.select('article li a[href*="/medikamente/beipackzettel/"]'):
    soup_detail_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + link.get('href')).content)
    for file in soup_detail_page.select('a:-soup-contains("Original Beipackzettel")'):
        soup_file_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + file.get('href')).content)
You will notice that the PDF is displayed in an iframe and you have to scrape it via an external URL:
pdf_url = soup_file_page.iframe.get('src').split('?file=')[-1]
You will notice that not only Beipackzettel are available for download.
Example
import os
import requests
from bs4 import BeautifulSoup

# define the URL to scrape
url = 'https://www.apotheken-umschau.de/medikamente/arzneimittellisten/medikamente_i.html'

# define the folder to save the PDFs to
save_path = r'C:\PDFs'

# create the folder if it doesn't exist
if not os.path.exists(save_path):
    os.makedirs(save_path)

# parse the HTML content of the page
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# loop through each link, follow the detail and file pages, and download the PDF
for link in soup.select('article li a[href*="/medikamente/beipackzettel/"]'):
    soup_detail_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + link.get('href')).content, 'html.parser')
    for file in soup_detail_page.select('a:-soup-contains("Original Beipackzettel")'):
        soup_file_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + file.get('href')).content, 'html.parser')
        pdf_url = soup_file_page.iframe.get('src').split('?file=')[-1]
        file_name = file.get('href').split('.html')[0].split('/')[-1] + '.pdf'
        with open(os.path.join(save_path, file_name), 'wb') as f:
            f.write(requests.get(pdf_url).content)
        print(f'Downloaded {file_name} to {save_path}')

nested web scraping with beautifulSoup

I am looking to download the "Latest File" from the provided URL below:
https://www.abs.gov.au/statistics/economy/national-accounts/australian-national-accounts-national-income-expenditure-and-product
The file I want to download is at the following exact location:
https://www.abs.gov.au/statistics/economy/national-accounts/australian-national-accounts-national-income-expenditure-and-product/sep-2022#data-downloads
For example, the file name is "Table 1".
How can I download this when I am only given the base URL as above, using BeautifulSoup?
I am unable to figure out how to work through the nested URLs within the HTML page to find the one I need to download.
First you need to get the latest link:
latest_link = 'https://www.abs.gov.au/' + soup.find('span', class_='flag_latest').find_previous('a').get('href')
Then find the document to download - in my example I download all, but you can change it:
download_all_link = 'https://www.abs.gov.au/' + soup.find('div', class_='anchor-button-wrapper').find('a').get('href')
And the last step - download it.
FULL CODE:
import requests
from bs4 import BeautifulSoup

url = 'https://www.abs.gov.au/statistics/economy/national-accounts/australian-national-accounts-national-income-expenditure-and-product'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')

latest_link = 'https://www.abs.gov.au/' + soup.find('span', class_='flag_latest').find_previous('a').get('href')
response = requests.get(latest_link)
soup = BeautifulSoup(response.text, 'lxml')

download_all_link = 'https://www.abs.gov.au/' + soup.find('div', class_='anchor-button-wrapper').find('a').get('href')
file_data = requests.get(download_all_link).content

with open(download_all_link.split("/")[-1], 'wb') as handler:
    handler.write(file_data)
I've never used BeautifulSoup before. Pretty cool stuff. This seems to do it for me:
from bs4 import BeautifulSoup

with open("demo.html") as fp:
    soup = BeautifulSoup(fp, "html.parser")

# let's look for the span with the 'flag_latest' class attribute
for span in soup.find_all('span'):
    if span.get('class', None) and 'flag_latest' in span['class']:
        # step up a level to the div and grab the a tag
        print(span.parent.a['href'])
So we just look for the span with the 'flag_latest' class, step up a level in the tree (to the div), grab the first a tag and extract the href.
Check out the docs and read the sections on "Navigating the Tree" and "Searching the Tree".
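If you want a specific table rather than the "Download all" archive (the question mentions "Table 1"), the same two requests can be combined with a text-based selector. This is only a sketch: the assumption that the latest-release page exposes one <a> per table whose visible text contains the table name is not verified against the live page.

import requests
from bs4 import BeautifulSoup

url = 'https://www.abs.gov.au/statistics/economy/national-accounts/australian-national-accounts-national-income-expenditure-and-product'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# follow the link flagged as the latest release, as in the answers above
latest_link = 'https://www.abs.gov.au/' + soup.find('span', class_='flag_latest').find_previous('a').get('href')
latest_soup = BeautifulSoup(requests.get(latest_link).text, 'html.parser')

# assumption: the "Data downloads" section has a link whose text contains the table name;
# note that "Table 1" would also match "Table 10", so tighten the text if needed
table_link = latest_soup.select_one('a:-soup-contains("Table 1")')
if table_link is not None:
    file_url = table_link.get('href')
    if file_url.startswith('/'):
        file_url = 'https://www.abs.gov.au' + file_url
    with open(file_url.split('/')[-1], 'wb') as fh:
        fh.write(requests.get(file_url).content)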

Scraping and downloading excel files using python from url

First question, so take it easy on me.
I'm trying to programmatically download all the excel files from a website using Python. I'm very new to web scraping so my code may not be up to snuff - I've dropped it in below. When I run the script, I'm not seeing any output, and the files that I was meaning to download are nowhere to be found.
Not sure what I've got wrong or if I'm running the script incorrectly. I'm running it through Anaconda Navigator, navigating to the directory with the script and then running it using the below:
python file-scraper.py
Here is the code for my script. Any help or suggestions are appreciated!
from bs4 import BeautifulSoup as bs
import requests

DOMAIN = 'https://lfportal.loudoun.gov/LFPortalinternet/'
URL = 'https://lfportal.loudoun.gov/LFPortalinternet/Browse.aspx?startid=213973&row=1&dbid=0'
FILETYPE = '.xls'

def get_soup(url):
    return bs(requests.get(url).text, 'html.parser')

for link in get_soup(URL).find_all('a'):
    file_link = link.get('href')
    if FILETYPE in file_link:
        print(file_link)
        with open(link.text, 'wb') as file:
            response = requests.get(DOMAIN + file_link)
            file.write(response.content)
You have the most common problem - the browser uses JavaScript to add links to the page (when you click a year) but requests/BeautifulSoup can't run JavaScript.
You have to turn off JavaScript in the browser and check if you can get the files in the browser without JavaScript. Then you have to see how it works and do the same in code. Sometimes it may need Selenium to control a real browser which can run JavaScript.
When I open the URL in a browser without JavaScript I don't see any .xls. I have to click a year and then it loads a different URL with the .xls files.
2017: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/213974/Row1.aspx
2018: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/285051/Row1.aspx
2019: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/312510/Row1.aspx
2020: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/384496/Row1.aspx
2021: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/466963/Row1.aspx
You have to use BeautifulSoup to find these URLs, load them with requests, and then search for .xls.
EDIT:
The code searches the subpages and uses them to download the files.
It downloads every year into a separate folder.
import requests
from bs4 import BeautifulSoup as bs
import os

# --- functions ---

def get_soup(url):
    response = requests.get(url)
    #print(response.status_code)
    #print(response.text)
    html = response.text
    soup = bs(html, 'html.parser')
    #soup = bs(html, 'lxml')
    #soup = bs(html, 'html5lib')
    return soup

# --- main ---

# - data -

DOMAIN = 'https://lfportal.loudoun.gov/LFPortalinternet/'
URL = 'https://lfportal.loudoun.gov/LFPortalinternet/Browse.aspx?startid=213973&row=1&dbid=0'
FILETYPE = '.xls'

# - code -

soup = get_soup(URL)

for folder_link in soup.find_all('a', {'class': 'DocumentBrowserNameLink'}):
    folder_name = folder_link.get('aria-label').split(' ')[0]
    folder_link = folder_link.get('href')
    print('folder:', folder_name)

    os.makedirs(folder_name, exist_ok=True)

    subsoup = get_soup(DOMAIN + folder_link)

    for file_link in subsoup.find_all('a', {'class': 'DocumentBrowserNameLink'}):
        file_name = file_link.get('aria-label')[:-4]  # skip extra `.xls` at the end
        file_link = file_link.get('href')

        if file_link.endswith(FILETYPE):
            print('  file:', file_name)
            file_name = os.path.join(folder_name, file_name)
            with open(file_name, 'wb') as file:
                response = requests.get(DOMAIN + file_link)
                file.write(response.content)
BTW: I put it on GitHub furas/python-examples
Your webpage only contains the folders that, as a human, you have to click in order to get to the files.
With BS you have to get the links of the folders, then request them to get the file list.
What simplifies your case is that both folders and files have the class attribute DocumentBrowserNameLink.
You can have a function to find them:
from bs4 import BeautifulSoup as bs
import requests

DOMAIN = 'https://lfportal.loudoun.gov/LFPortalinternet/'
URL = 'https://lfportal.loudoun.gov/LFPortalinternet/Browse.aspx?startid=213973&row=1&dbid=0'
FILETYPE = '.xls'

def get_soup(url):
    return bs(requests.get(url).text, 'html.parser')

def get_links(page):
    result = page.find_all(class_="DocumentBrowserNameLink")
    return result

page = get_soup(URL)
folder_links = get_links(page)

for link in folder_links:
    page2 = get_soup(DOMAIN + link['href'])
    file_links = get_links(page2)
    for file in file_links:
        filepath = file['href']
        if FILETYPE in filepath:
            print(DOMAIN + filepath)
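The snippet above only prints the matching file URLs. To actually save the files, the download step from the question's own code can be dropped into the inner loop - a sketch reusing the names defined above, and assuming (as the question did) that the link text is usable as a local file name:

for link in folder_links:
    page2 = get_soup(DOMAIN + link['href'])
    for file in get_links(page2):
        filepath = file['href']
        if FILETYPE in filepath:
            response = requests.get(DOMAIN + filepath)
            # the anchor text is used as the local file name, as in the question's code
            with open(file.text, 'wb') as fh:
                fh.write(response.content)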

I want to download many Files of same file extension with either Wget or Python, from a given Website Link

I would like to download files of the same file types, .utu and .zip, from the following Microsoft Flight Simulator AI traffic websites:
http://web.archive.org/web/20050315112710/http://www.projectai.com:80/libraries/acfiles.php?cat=6 (Current Repaints)
http://web.archive.org/web/20050315112940/http://www.projectai.com:80/libraries/acfiles.php?cat=1 (Vintage Repaints)
On each of those pages there are subcategories (Airbus, Boeing etc.) for the AI aircraft types, and the repaint .zip file choices are shown when you click on the aircraft image.
The folder name then becomes http://web.archive.org/web/20041114195147/http://www.projectai.com:80/libraries/repaints.php?ac=number&cat=(number). Then, when you click the download, repaints.php? becomes download.php?fileid=(4 digit number).
What do I need to type to download all the .zip files at once? Clicking on them individually to download would take ages.
I would also like to download all the files with the .utu extension, for Flight 1 Ultimate Traffic AI aircraft repaints, from the following webpage:
http://web.archive.org/web/20060512161232/http://ultimatetraffic.flight1.net:80/utfiles.asp?mode=1&index=0
When you click to download an Ultimate Traffic aircraft texture, the last part of the path becomes /utfiles.asp?mode=download&id=F1AIRepaintNumbers-Numbers-Numbers.utu, and I would like to do the same as for the other websites.
I used the following code, written in Python 2.7.9 and found in a video on YouTube, inserting my info to achieve my aim, but unsurprisingly it didn't work when I ran it (timeouts and errors etc.), probably due to its simplicity:
import requests
from bs4 import BeautifulSoup
import wget

def download_links(url):
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for link in soup.findAll('a'):
        href = link.get('href')
        print(href)
        wget.download(href)

download_links('http://web.archive.org/web/20041225023002/http://www.projectai.com:80/libraries/acfiles.php?cat=6')
Update: try this; it should now download all the zip files from all the links on the first page:
from bs4 import BeautifulSoup
import requests, zipfile, io

def get_zips(zips_page):
    # print(zips_page)
    zips_source = requests.get(zips_page).text
    zip_soup = BeautifulSoup(zips_source, "html.parser")
    # quote the attribute value so the CSS selector parses
    for zip_file in zip_soup.select('a[href*="download.php?fileid="]'):
        zip_url = link_root + zip_file['href']
        print('downloading', zip_file.text, '...')
        r = requests.get(zip_url)
        with open(zip_file.text, 'wb') as zipFile:
            zipFile.write(r.content)

def download_links(root, cat):
    url = ''.join([root, cat])
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for zips_suffix in soup.select('a[href*="repaints.php?ac="]'):
        # get_zips(root, zips_suffix['href'])
        next_page = ''.join([root, zips_suffix['href']])
        get_zips(next_page)

link_root = 'http://web.archive.org/web/20041225023002/http://www.projectai.com:80/libraries/'
category = 'acfiles.php?cat=6'

download_links(link_root, category)
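The question also asks about the .utu repaints on the Flight1 Ultimate Traffic page. The same two-step pattern should carry over; the sketch below assumes, based only on the URL pattern described in the question, that each texture's download link contains mode=download in its href (not verified against the archived page):

from bs4 import BeautifulSoup
import requests

utu_root = 'http://web.archive.org/web/20060512161232/http://ultimatetraffic.flight1.net:80/'
utu_page = 'utfiles.asp?mode=1&index=0'

soup = BeautifulSoup(requests.get(utu_root + utu_page).text, 'html.parser')
# assumption: download links look like utfiles.asp?mode=download&id=....utu
for utu_link in soup.select('a[href*="mode=download"]'):
    utu_url = utu_root + utu_link['href']
    file_name = utu_link['href'].split('id=')[-1]  # e.g. F1AIRepaint....utu
    print('downloading', file_name, '...')
    with open(file_name, 'wb') as fh:
        fh.write(requests.get(utu_url).content)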

How to download .qrs files from a website using python and BeautifulSoup?

I would like to download all the files ending with .qrs, .dat, .hea and store them in a local folder, from this website:
https://physionet.org/physiobank/database/shareedb/
I tried modifying the solution from the following link:
Download .xls files from a webpage using Python and BeautifulSoup
This is how I modified the code:
import os
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve

URL = 'https://physionet.org/physiobank/database/shareedb/'
OUTPUT_DIR = ''  # path to output folder, '.' or '' uses current folder

u = urlopen(URL)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()

soup = BeautifulSoup(html, "html.parser")
for link in soup.select('a[href^="https://"]'):  # or a[href*="shareedb/0"]
    href = link.get('href')
    if not any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        continue
    filename = os.path.join(OUTPUT_DIR, href.rsplit('/', 1)[-1])
    # We need a https:// URL for this site
    # href = href.replace('http://', 'https://')
    print("Downloading %s to %s..." % (href, filename))
    urlretrieve(href, filename)
print("Done.")
When I run this code, it does not extract the files from the target page, nor does it output any failure message (e.g. 'failed to download').
After some debugging I saw that in my case none of the files are being selected. I suspect it has more to do with the structure of the HTML.
How can I download these files to a local directory using Python?
You could use the excellent requests library as follows:
import bs4
import requests

url = "https://physionet.org/physiobank/database/shareedb/"
html = requests.get(url)
soup = bs4.BeautifulSoup(html.text, "html.parser")

for link in soup.find_all('a', href=True):
    href = link['href']
    if any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        print("Downloading '{}'".format(href))
        remote_file = requests.get(url + href)
        with open(href, 'wb') as f:
            for chunk in remote_file.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
This will download all the .dat, .hea and .qrs files to your computer.
Install requests using the standard:
pip install requests
Note that all of the hrefs on that URL are already in a form suitable for use directly as a filename (so there is no need, at the moment, to parse away any / characters).
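If the hrefs on a different page did include path components or query strings, a defensive variant of the filename step would be (a sketch; href as in the loop above):

import os
# keep only the last path component, dropping any query string, as the local file name
local_name = os.path.basename(href.split('?')[0])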
To expand on the answer by wolf tian: the select does not find anything because the links on that site do not have "https://" (nor "shareedb") in their href. The files you are trying to download all have hrefs of the form 01911.hea - their paths are relative. So what you need to do is first extract those filenames, for example like this:
for link in soup.select('a'):
    href = link.get('href')
    if not href or not any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        continue
    filename = os.path.join(OUTPUT_DIR, href)
And then you need to prepend the host part to the URL before retrieving it:
urlretrieve(URL + href, filename)
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

start_url = 'https://physionet.org/physiobank/database/shareedb/'
r = requests.get(start_url)
soup = BeautifulSoup(r.text, 'lxml')

# get full url of each file
pre = soup.find('pre')
file_urls = pre.select('a[href*="."]')
full_urls = [urljoin(start_url, url['href']) for url in file_urls]

# download each file
for full_url in full_urls:
    file_name = full_url.split('/')[-1]
    print("Downloading {} to {}...".format(full_url, file_name))
    with open(file_name, 'wb') as f:
        fr = requests.get(full_url, stream=True)
        for chunk in fr.iter_content(chunk_size=1024):
            f.write(chunk)
    print('Done')
out:
Downloading https://physionet.org/physiobank/database/shareedb/01911.dat to 01911.dat...
Done
Downloading https://physionet.org/physiobank/database/shareedb/01911.hea to 01911.hea...
Done
Downloading https://physionet.org/physiobank/database/shareedb/01911.qrs to 01911.qrs...
Done
Downloading https://physionet.org/physiobank/database/shareedb/02012.dat to 02012.dat...
Done
Downloading https://physionet.org/physiobank/database/shareedb/02012.hea to 02012.hea...
Done
Downloading https://physionet.org/physiobank/database/shareedb/02012.qrs to 02012.qrs...
From your symptoms, the likely reason is that no URLs matched, so the code never enters the loop.
Since I use Python 2.7, I haven't verified the code.
You may try to print the links you matched and then check whether those URLs can be downloaded and extracted.
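For example, a minimal check along those lines (a sketch against the Python 3 answer above; the variable names are just illustrative):

import requests
from bs4 import BeautifulSoup

url = 'https://physionet.org/physiobank/database/shareedb/'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# print every matched href before downloading anything, so you can see whether
# the selector finds links and whether they are relative or absolute
matched = [a['href'] for a in soup.find_all('a', href=True)
           if a['href'].endswith(('.dat', '.hea', '.qrs'))]
print(len(matched), 'links matched')
for href in matched[:10]:
    print(href, '->', url + href)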
