Download all files from web using Python error - python

I am trying to download all the files from this website: https://superbancos.gob.pa/es/fin-y-est/reportes-estadisticos
I found this code on a page and I am trying to adapt it to my process.
If you could help me I would appreciate it.
# Import the libraries here
import requests
from bs4 import BeautifulSoup

# specify the URL of the archive here
archive_url = "https://www.superbancos.gob.pa/es/fin-y-est/reportes-estadisticos"

def get_video_links():
    r = requests.get(archive_url)
    soup = BeautifulSoup(r.content, 'html5lib')
    links = soup.findAll('a')
    video_links = [archive_url + link['href'] for link in links if link['href'].endswith('xlsx')]
    return video_links

def download_video_series(video_links):
    for link in video_links:
        '''iterate through all links in video_links
        and download them one by one'''
        # obtain filename by splitting url and getting
        # last string
        file_name = link.split('/')[-1]
        print("Downloading file:{!s}".format(file_name))

        # create response object
        r = requests.get(link, stream=True)

        # download started
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)

        print("{!s} downloaded!\n".format(file_name))

    print("All files downloaded!")
    return

if __name__ == "__main__":
    video_links = get_video_links()
    download_video_series(video_links)
but when I run the program it says "All files downloaded!" and doesn't download anything.

The information you are looking for is loaded dynamically with JS code, so you should use something that can run JS and render the page the way you see it in the browser.
The most straightforward way is using Selenium:
from bs4 import BeautifulSoup
from selenium import webdriver

def get_soup(link):
    driver = webdriver.Chrome()
    driver.get(link)
    soup = BeautifulSoup(driver.page_source, 'html5lib')
    driver.close()
    return soup
So your first function could be rewritten as
def get_video_links():
    soup = get_soup(archive_url)
    links = soup.findAll('a')
    video_links = [archive_url + link['href'] for link in links if link['href'].endswith('xlsx')]
    return video_links
Just make sure to set up your ChromeDriver properly! See the Selenium documentation.
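If webdriver.Chrome() cannot find the driver on your PATH, one option is to point Selenium at the binary explicitly. A minimal sketch for Selenium 4 (the driver path below is only a placeholder):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Placeholder path; replace it with the location of your chromedriver binary.
service = Service("/path/to/chromedriver")
driver = webdriver.Chrome(service=service)
driver.get(archive_url)
print(driver.title)   # quick check that the page actually rendered
driver.quit()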

The problem here is that the page requires JavaScript. Your best bet is to use the Selenium WebDriver to handle this instead of plain requests + bs4:
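For example, a minimal sketch of that approach (assuming ChromeDriver is set up and that the rendered page exposes direct .xlsx hrefs; an explicit wait may be needed if the links load slowly):

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib.parse import urljoin

archive_url = "https://www.superbancos.gob.pa/es/fin-y-est/reportes-estadisticos"

driver = webdriver.Chrome()
driver.get(archive_url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

# collect every link that ends in .xlsx after the JS has rendered the page
xlsx_links = [urljoin(archive_url, a['href'])
              for a in soup.find_all('a', href=True)
              if a['href'].endswith('.xlsx')]

for link in xlsx_links:
    file_name = link.split('/')[-1]
    print("Downloading", file_name)
    r = requests.get(link, stream=True)
    with open(file_name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)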

Related

Why do I scrape corrupted PDFs of same size with BeautifulSoup?

I went through similar topics here but did not find anything helpful for my case.
I managed to download all the PDFs (for personal learning purposes) into a local folder, but I cannot open them. They also all have the same size (310 kB). Perhaps you can find some mistake in my code. Thanks.
import os
import requests
from bs4 import BeautifulSoup

# define the URL to scrape
url = 'https://www.apotheken-umschau.de/medikamente/arzneimittellisten/medikamente_i.html'

# define the folder to save the PDFs to
save_path = r'C:\PDFs'

# create the folder if it doesn't exist
if not os.path.exists(save_path):
    os.makedirs(save_path)

# make a request to the URL
response = requests.get(url)

# parse the HTML content of the page
soup = BeautifulSoup(response.content, 'html.parser')

# find all links on the page that contain '/medikamente/beipackzettel/' in their href
links = soup.find_all('a', href=lambda href: href and '/medikamente/beipackzettel/' in href)

# loop through each link and download the PDF
for link in links:
    href = link['href']
    file_name = href.split('?')[0].split('/')[-1] + '.pdf'
    pdf_url = 'https://www.apotheken-umschau.de' + href + '&file=pdf'
    response = requests.get(pdf_url)
    with open(os.path.join(save_path, file_name), 'wb') as f:
        f.write(response.content)
    print(f'Downloaded {file_name} to {save_path}')
There are some issues here:
Select your elements more specifically, using CSS selectors:
soup.select('article li a[href*="/medikamente/beipackzettel/"]')
Check the responses you get from your requests to see whether the expected elements are available and what the behavior looks like.
You will notice that you have to iterate over more levels than you did:
for link in soup.select('article li a[href*="/medikamente/beipackzettel/"]'):
    soup_detail_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + link.get('href')).content)

    for file in soup_detail_page.select('a:-soup-contains("Original Beipackzettel")'):
        soup_file_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + file.get('href')).content)
You will notice that the PDF is displayed in an IFRAME, so you have to scrape it via its external URL:
pdf_url = soup_file_page.iframe.get('src').split('?file=')[-1]
You will also notice that there is more than just the Beipackzettel available for download.
Example
import os
import requests
from bs4 import BeautifulSoup

# define the URL to scrape
url = 'https://www.apotheken-umschau.de/medikamente/arzneimittellisten/medikamente_i.html'

# define the folder to save the PDFs to
save_path = r'C:\PDFs'

# create the folder if it doesn't exist
if not os.path.exists(save_path):
    os.makedirs(save_path)

# parse the HTML content of the page
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# loop through each link, follow the detail and file pages, and download the PDF
for link in soup.select('article li a[href*="/medikamente/beipackzettel/"]'):
    soup_detail_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + link.get('href')).content, 'html.parser')

    for file in soup_detail_page.select('a:-soup-contains("Original Beipackzettel")'):
        soup_file_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + file.get('href')).content, 'html.parser')
        pdf_url = soup_file_page.iframe.get('src').split('?file=')[-1]
        file_name = file.get('href').split('.html')[0].split('/')[-1] + '.pdf'

        with open(os.path.join(save_path, file_name), 'wb') as f:
            f.write(requests.get(pdf_url).content)

        print(f'Downloaded {file_name} to {save_path}')

search pdf links from all over the website

I want to search a website and look for all PDF links. I know there are several solutions with BeautifulSoup to look for PDF files using <a> tags, but I want to search the whole domain using the base URL, instead of just the page linked.
My idea was to a) first search the whole website for all sub-links and then b) filter out the links that have a .pdf extension. For the first part, I tried this: https://github.com/mujeebishaque/extract-urls
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    user_input_url = "https://www.aurednik.de/"
    if not user_input_url or len(user_input_url) < 1:
        raise Exception("INFO: Invalid Input")

    _start = user_input_url.find('//')
    _end = user_input_url.find('.com')
    readable_website_name = user_input_url[_start+2:_end].strip()

    try:
        website_content = requests.get(user_input_url.strip()).text
    except:
        check_internet = requests.get('https://google.com').status_code
        if check_internet != requests.codes.ok:
            raise ConnectionError("ERROR: Check internet connection.")

    _soup = BeautifulSoup(website_content, features='lxml')

    internal_url_links = []
    external_url_links = []

    for link in _soup.find_all('a', href=True):
        if readable_website_name in link.get('href'):
            internal_url_links.append(link['href'])
        if readable_website_name not in link.get('href') and len(link.get('href')) > 3:
            external_url_links.append(link['href'])

    print(internal_url_links, '\n')
    print(external_url_links, '\n')
I was expecting that it would be able to crawl and return all links such as https://www.aurednik.de/info-service/downloads/#unserekataloge and https://www.aurednik.de/downloads/AUREDNIK_Haupt2021.pdf, but that is not the case. I don't see the second PDF link at all, and for the first link I only see
/info-service/downloads/#unserekataloge
when I print out the external links. I want the full link, and preferably also all PDF links on the website domain. How else could I achieve this? I am open to using any tools or libraries.
Maybe the website has dynamic content. Check whether the HTML loaded by BeautifulSoup is the same as what you see when you inspect the website in your browser. If not, use for example Selenium to scrape the website with dynamically loaded content.
from bs4 import BeautifulSoup
from selenium import webdriver

user_input_url = "https://www.aurednik.de/"
readable_website_name = "aurednik"

driver = webdriver.Firefox()
driver.get(user_input_url)  # load the page so the JS-rendered content is available
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

internal_url_links = []
external_url_links = []

for link in soup.find_all('a', href=True):
    if readable_website_name in link.get('href'):
        internal_url_links.append(link['href'])
    if readable_website_name not in link.get('href') and len(link.get('href')) > 3:
        external_url_links.append(link['href'])

print(internal_url_links, '\n')
print(external_url_links, '\n')

driver.close()

Scraping and downloading excel files using python from url

First question, so take it easy on me.
I'm trying to programmatically download all the Excel files from a website using Python. I'm very new to web scraping, so my code may not be up to snuff; I've dropped it in below. When I run the script, I don't see any output, and the files I was meaning to download are nowhere to be found.
Not sure what I've got wrong or whether I'm running the script wrong. I'm running it through Anaconda Navigator, navigating to the directory with the script and then running it using:
python file-scraper.py
Here is the code for my script. Any help or suggestions are appreciated!
from bs4 import BeautifulSoup as bs
import requests

DOMAIN = 'https://lfportal.loudoun.gov/LFPortalinternet/'
URL = 'https://lfportal.loudoun.gov/LFPortalinternet/Browse.aspx?startid=213973&row=1&dbid=0'
FILETYPE = '.xls'

def get_soup(url):
    return bs(requests.get(url).text, 'html.parser')

for link in get_soup(URL).find_all('a'):
    file_link = link.get('href')
    if FILETYPE in file_link:
        print(file_link)
        with open(link.text, 'wb') as file:
            response = requests.get(DOMAIN + file_link)
            file.write(response.content)
You have the most common problem: the browser uses JavaScript to add links to the page (when you click a year), but requests/BeautifulSoup can't run JavaScript.
You have to turn off JavaScript in the browser and check whether you can still get to the files without it. Then you have to see how it works and do the same in code. Sometimes it may need Selenium to control a real browser which can run JavaScript.
When I open the URL in a browser without JavaScript, I don't see any .xls. I have to click a year, and then it loads a different URL with the .xls files:
2017: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/213974/Row1.aspx
2018: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/285051/Row1.aspx
2019: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/312510/Row1.aspx
2020: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/384496/Row1.aspx
2021: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/466963/Row1.aspx
You have to use BeautifulSoup to find these URLs, load them with requests, and then search for the .xls files there.
EDIT:
The code below searches the subpages and uses them to download the files.
It downloads every year into a separate folder.
import requests
from bs4 import BeautifulSoup as bs
import os

# --- functions ---

def get_soup(url):
    response = requests.get(url)
    #print(response.status_code)
    #print(response.text)

    html = response.text

    soup = bs(html, 'html.parser')
    #soup = bs(html, 'lxml')
    #soup = bs(html, 'html5lib')

    return soup

# --- main ---

# - data -

DOMAIN = 'https://lfportal.loudoun.gov/LFPortalinternet/'
URL = 'https://lfportal.loudoun.gov/LFPortalinternet/Browse.aspx?startid=213973&row=1&dbid=0'
FILETYPE = '.xls'

# - code -

soup = get_soup(URL)

for folder_link in soup.find_all('a', {'class': 'DocumentBrowserNameLink'}):
    folder_name = folder_link.get('aria-label').split(' ')[0]
    folder_link = folder_link.get('href')
    print('folder:', folder_name)

    os.makedirs(folder_name, exist_ok=True)

    subsoup = get_soup(DOMAIN + folder_link)

    for file_link in subsoup.find_all('a', {'class': 'DocumentBrowserNameLink'}):
        file_name = file_link.get('aria-label')[:-4]  # skip extra `.xls` at the end
        file_link = file_link.get('href')

        if file_link.endswith(FILETYPE):
            print('   file:', file_name)

            file_name = os.path.join(folder_name, file_name)

            with open(file_name, 'wb') as file:
                response = requests.get(DOMAIN + file_link)
                file.write(response.content)
BTW: I put it on GitHub furas/python-examples
Your webpage only contains the folders that you, as a human, have to click in order to get to the files.
With BS you have to get the links of the folders, then request them to get the file list.
What simplifies your case is that both folders and files have the class attribute DocumentBrowserNameLink.
You can use a function to find them:
from bs4 import BeautifulSoup as bs
import requests

DOMAIN = 'https://lfportal.loudoun.gov/LFPortalinternet/'
URL = 'https://lfportal.loudoun.gov/LFPortalinternet/Browse.aspx?startid=213973&row=1&dbid=0'
FILETYPE = '.xls'

def get_soup(url):
    return bs(requests.get(url).text, 'html.parser')

def get_links(page):
    result = page.find_all(class_="DocumentBrowserNameLink")
    return result

page = get_soup(URL)
folder_links = get_links(page)

for link in folder_links:
    page2 = get_soup(DOMAIN + link['href'])
    file_links = get_links(page2)

    for file in file_links:
        filepath = file['href']
        if FILETYPE in filepath:
            print(DOMAIN + filepath)

Web scraping for downloading images from NHTSA website (CIREN crash cases)

I am trying to download some images from the NHTSA Crash Viewer (CIREN cases). An example case: https://crashviewer.nhtsa.dot.gov/nass-CIREN/CaseForm.aspx?xsl=main.xsl&CaseID=99817
If I try to download a front crash image, no file is downloaded. I am using the beautifulsoup4 and requests libraries. This code works for other websites.
The image links have the following format: https://crashviewer.nhtsa.dot.gov/nass-CIREN/GetBinary.aspx?Image&ImageID=555004572&CaseID=555003071&Version=0
I have also tried the previous answers from SO, but none of the solutions work. Error obtained:
No response from server
Code used for web scraping:
from bs4 import *
import requests as rq
import os

r2 = rq.get("https://crashviewer.nhtsa.dot.gov/nass-CIREN/GetBinary.aspx?Image&ImageID=555004572&CaseID=555003071&Version=0")
soup2 = BeautifulSoup(r2.text, "html.parser")

links = []
x = soup2.select('img[src^="https://crashviewer.nhtsa.dot.gov"]')
for img in x:
    links.append(img['src'])

os.mkdir('ciren_photos')

i = 1
for index, img_link in enumerate(links):
    if i <= 200:
        img_data = rq.get(img_link).content
        with open("ciren_photos\\" + str(index+1) + '.jpg', 'wb+') as f:
            f.write(img_data)
        i += 1
    else:
        f.close()
        break
This is a task that would require Selenium, but luckily there is a shortcut. At the top of the page there is a "Text and Images Only" link that goes to a page like this one: https://crashviewer.nhtsa.dot.gov/nass-CIREN/CaseForm.aspx?ViewText&CaseID=99817&xsl=textonly.xsl&websrc=true, which contains all the images and text content in one page. You can select that link with soup.find('a', text='Text and Images Only').
That link and the image links are relative (links to the same site are usually relative links), so you'll have to use urljoin() to get the full urls.
from bs4 import BeautifulSoup
import requests as rq
from urllib.parse import urljoin

url = 'https://crashviewer.nhtsa.dot.gov/nass-CIREN/CaseForm.aspx?xsl=main.xsl&CaseID=99817'

with rq.session() as s:
    r = s.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    url = urljoin(url, soup.find('a', text='Text and Images Only')['href'])
    r = s.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    links = [urljoin(url, i['src']) for i in soup.select('img[src^="GetBinary.aspx"]')]
    for link in links:
        content = s.get(link).content
        # write `content` to file
So, the site doesn't return valid pictures unless the request has valid cookies. There are two ways to get the cookies: either reuse cookies from a previous request or use a Session object. It's best to use a Session because it also handles the TCP connection and other parameters.
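For illustration, a minimal sketch of both options using the URLs from the question (the output filename is just a placeholder):

import requests

case_url = 'https://crashviewer.nhtsa.dot.gov/nass-CIREN/CaseForm.aspx?xsl=main.xsl&CaseID=99817'
image_url = 'https://crashviewer.nhtsa.dot.gov/nass-CIREN/GetBinary.aspx?Image&ImageID=555004572&CaseID=555003071&Version=0'

# option 1: forward cookies from a previous request by hand
first = requests.get(case_url)
img = requests.get(image_url, cookies=first.cookies)

# option 2 (preferred): let a Session keep the cookies and the connection
with requests.Session() as s:
    s.get(case_url)         # sets the cookies on the session
    img = s.get(image_url)  # reuses them automatically

with open('front_crash.jpg', 'wb') as f:  # placeholder filename
    f.write(img.content)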

How to download .qrs files from a website using python and BeautifulSoup?

I would like to download all the files ending with .qrs, .dat, or .hea from this website and store them in a local folder:
https://physionet.org/physiobank/database/shareedb/
I tried modifying the solution from the following link:
Download .xls files from a webpage using Python and BeautifulSoup
This is how I modified the code:
import os
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve

URL = 'https://physionet.org/physiobank/database/shareedb/'
OUTPUT_DIR = ''  # path to output folder, '.' or '' uses current folder

u = urlopen(URL)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()

soup = BeautifulSoup(html, "html.parser")
for link in soup.select('a[href^="https://"]'):  # or a[href*="shareedb/0"]
    href = link.get('href')
    if not any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        continue
    filename = os.path.join(OUTPUT_DIR, href.rsplit('/', 1)[-1])
    # We need a https:// URL for this site
    # href = href.replace('http://', 'https://')
    print("Downloading %s to %s..." % (href, filename))
    urlretrieve(href, filename)
print("Done.")
When I run this code, it does not extract the files from the target page, nor does it output any failure message (e.g. 'failed to download').
After some debugging I saw that in my case none of the files are being selected. I suspect it has more to do with the structure of the HTML.
How can I download these files to a local directory using Python?
You could use the excellent requests library as follows:
import bs4
import requests

url = "https://physionet.org/physiobank/database/shareedb/"
html = requests.get(url)
soup = bs4.BeautifulSoup(html.text, "html.parser")

for link in soup.find_all('a', href=True):
    href = link['href']
    if any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        print("Downloading '{}'".format(href))
        remote_file = requests.get(url + href)
        with open(href, 'wb') as f:
            for chunk in remote_file.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
This would download all .dat, .hea and .qrs files to your computer.
Install using the standard:
pip install requests
Note, all of the hrefs on that URL are already in a form suitable for using directly as a filename (so no need at the moment to parse away any / characters).
To expand on the answer by wolf tian: the select does not find anything because the links on that site do not have "https://" (nor "shareedb") in their href. The files you are trying to download all have hrefs of the form 01911.hea, i.e. relative paths. So what you need to do first is extract those filenames, for example like this:
for link in soup.select('a'):
    href = link.get('href')
    if not href or not any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        continue
    filename = os.path.join(OUTPUT_DIR, href)
And then you need to prepend the host part to the URL before retrieving it:
urlretrieve(URL + href, filename)
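Putting those two pieces together with the question's original imports, a minimal sketch of the whole loop (reusing the question's URL and OUTPUT_DIR; not a verified run against the site) could look like:

import os
from bs4 import BeautifulSoup
from urllib.request import urlopen, urlretrieve

URL = 'https://physionet.org/physiobank/database/shareedb/'
OUTPUT_DIR = ''  # '' or '.' uses the current folder

with urlopen(URL) as u:
    html = u.read().decode('utf-8')

soup = BeautifulSoup(html, 'html.parser')

for link in soup.select('a'):
    href = link.get('href')
    # the hrefs are relative, e.g. '01911.hea', so filter on the extension only
    if not href or not any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        continue
    filename = os.path.join(OUTPUT_DIR, href)
    print("Downloading %s to %s..." % (URL + href, filename))
    urlretrieve(URL + href, filename)  # prepend the base URL to the relative href

print("Done.")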
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

start_url = 'https://physionet.org/physiobank/database/shareedb/'

r = requests.get(start_url)
soup = BeautifulSoup(r.text, 'lxml')

# get full url of file
pre = soup.find('pre')
file_urls = pre.select('a[href*="."]')
full_urls = [urljoin(start_url, url['href']) for url in file_urls]

# download file
for full_url in full_urls:
    file_name = full_url.split('/')[-1]
    print("Downloading {} to {}...".format(full_url, file_name))
    with open(file_name, 'wb') as f:
        fr = requests.get(full_url, stream=True)
        for chunk in fr.iter_content(chunk_size=1024):
            f.write(chunk)
    print('Done')
out:
Downloading https://physionet.org/physiobank/database/shareedb/01911.dat to 01911.dat...
Done
Downloading https://physionet.org/physiobank/database/shareedb/01911.hea to 01911.hea...
Done
Downloading https://physionet.org/physiobank/database/shareedb/01911.qrs to 01911.qrs...
Done
Downloading https://physionet.org/physiobank/database/shareedb/02012.dat to 02012.dat...
Done
Downloading https://physionet.org/physiobank/database/shareedb/02012.hea to 02012.hea...
Done
Downloading https://physionet.org/physiobank/database/shareedb/02012.qrs to 02012.qrs...
From your symptom, the possible reason may be that no URL matched, so the loop body is never entered.
Since I use Python 2.7, I have not verified the code.
You may try printing the links you matched and then check whether those URLs can be downloaded and extracted.
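For example, a small debugging sketch along those lines, which only prints each matched href and the HTTP status of the full URL instead of downloading anything:

import requests
from bs4 import BeautifulSoup

URL = 'https://physionet.org/physiobank/database/shareedb/'
soup = BeautifulSoup(requests.get(URL).text, 'html.parser')

for link in soup.find_all('a', href=True):
    href = link['href']
    if any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        # a HEAD request is enough to see whether the full URL resolves
        status = requests.head(URL + href, allow_redirects=True).status_code
        print(href, '->', status)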
