BeautifulSoup cannot find every link on the page - Python

Here is my code:
from bs4 import BeautifulSoup
import requests
from requests import get
import os

def file_download():
    domain = "ec.europa.eu"
    page = requests.get("https://ec.europa.eu/eurostat/web/main/data/database")
    html = page.text
    soup = BeautifulSoup(html, "html.parser")
    for link in soup.find_all('a'):
        url = link.get('href')
        print(url)
        if url and ".gz" in url:  # skip anchors that have no href at all
            file_name = url.split("file=", 1)[1]
            if os.path.exists(file_name):
                print("File already exists.")
                continue
            else:
                with open(file_name, 'wb') as file:
                    print('Downloading...')
                    response = get(url)
                    file.write(response.content)
                continue
        else:
            continue
    print('\nEvery file has been downloaded!')

file_download()
With the above code I cannot seem to find every possible link on the page.
When I inspect the page in Chrome and copy the element, I get the kind of link I had noted as a comment in my code. That is what I want to find with BeautifulSoup, along with other similar links.

It is probably best to avoid accessing the files via the tree structure (as it would require a lot of JSON interactions).
An easier approach is to use their bulk listing of all the files:
from bs4 import BeautifulSoup
import requests
import os

session = requests.Session()
req_all = session.get("https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?dir=data&sort=1&sort=2&start=all")

soup = BeautifulSoup(req_all.content, "lxml")
table = soup.find('table', id='filetable')

for a in table.find_all('a', href=True):
    if a.text == "Download":
        href = a['href']
        if '.gz' in href:
            filename = href.rsplit('%2F', 1)[1]
            if not os.path.exists(filename):
                with open(filename, 'wb') as f_gz:
                    f_gz.write(requests.get(href).content)
                print(filename)

Related

Why do I scrape corrupted PDFs of the same size with BeautifulSoup?

I went through similar topics here but did not find anything helpful for my case.
I managed to save all the PDFs (for personal learning purposes) to a local folder, but I cannot open them. They also all have the same size (310 kB). Perhaps you can find some mistake in my code. Thanks.
import os
import requests
from bs4 import BeautifulSoup

# define the URL to scrape
url = 'https://www.apotheken-umschau.de/medikamente/arzneimittellisten/medikamente_i.html'

# define the folder to save the PDFs to
save_path = r'C:\PDFs'

# create the folder if it doesn't exist
if not os.path.exists(save_path):
    os.makedirs(save_path)

# make a request to the URL
response = requests.get(url)

# parse the HTML content of the page
soup = BeautifulSoup(response.content, 'html.parser')

# find all links on the page whose href contains '/medikamente/beipackzettel/'
links = soup.find_all('a', href=lambda href: href and '/medikamente/beipackzettel/' in href)

# loop through each link and download the PDF
for link in links:
    href = link['href']
    file_name = href.split('?')[0].split('/')[-1] + '.pdf'
    pdf_url = 'https://www.apotheken-umschau.de' + href + '&file=pdf'
    response = requests.get(pdf_url)
    with open(os.path.join(save_path, file_name), 'wb') as f:
        f.write(response.content)
    print(f'Downloaded {file_name} to {save_path}')
There are some issues here:
Select your elements from the list more specifically, using CSS selectors:
soup.select('article li a[href*="/medikamente/beipackzettel/"]')
Check the responses you get from your requests to see whether the expected elements are available and what the behavior looks like.
You will notice that you have to iterate through more levels than you did:
for link in soup.select('article li a[href*="/medikamente/beipackzettel/"]'):
    soup_detail_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + link.get('href')).content)
    for file in soup_detail_page.select('a:-soup-contains("Original Beipackzettel")'):
        soup_file_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + file.get('href')).content)
You will notice that the PDF is displayed in an iframe and you have to scrape it via its external URL:
pdf_url = soup_file_page.iframe.get('src').split('?file=')[-1]
You will also notice that there is more than just the Beipackzettel available for download.
Example
import os
import requests
from bs4 import BeautifulSoup

# define the URL to scrape
url = 'https://www.apotheken-umschau.de/medikamente/arzneimittellisten/medikamente_i.html'

# define the folder to save the PDFs to
save_path = r'C:\PDFs'

# create the folder if it doesn't exist
if not os.path.exists(save_path):
    os.makedirs(save_path)

# parse the HTML content of the page
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# loop through each link and download the PDF
for link in soup.select('article li a[href*="/medikamente/beipackzettel/"]'):
    soup_detail_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + link.get('href')).content, 'html.parser')
    for file in soup_detail_page.select('a:-soup-contains("Original Beipackzettel")'):
        soup_file_page = BeautifulSoup(requests.get('https://www.apotheken-umschau.de' + file.get('href')).content, 'html.parser')
        pdf_url = soup_file_page.iframe.get('src').split('?file=')[-1]
        file_name = file.get('href').split('.html')[0].split('/')[-1] + '.pdf'
        with open(os.path.join(save_path, file_name), 'wb') as f:
            f.write(requests.get(pdf_url).content)
        print(f'Downloaded {file_name} to {save_path}')

Is there such a thing as an "if x (or any variable) has any value" function in Python?

I'm trying to build a web crawler that generates a text file for multiple different websites. After it crawls a website it is supposed to get all the links in a website. However, I have encountered a problem while web crawling Wikipedia. The python script gives me the error:
Traceback (most recent call last):
  File "/home/banana/Desktop/Search engine/data/crawler?.py", line 22, in <module>
    urlwaitinglist.write(link.get('href'))
TypeError: write() argument must be str, not None
I looked deeper into it by having it print the discovered links and it has "None" at the top. I'm wondering if there is a function to see if the variable has any value.
Here is the code I have written so far:
from bs4 import BeautifulSoup
import os
import requests
import random
import re

toscan = "https://en.wikipedia.org/wiki/Wikipedia:Contents"
url = toscan
source_code = requests.get(url)
plain_text = source_code.text

removal_list = ["http://", "https://", "/"]
for word in removal_list:
    toscan = toscan.replace(word, "")

soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.find_all('a'):
    print(link.get('href'))
    urlwaitinglist = open("/home/banana/Desktop/Search engine/data/toscan", "a")
    urlwaitinglist.write('\n')
    urlwaitinglist.write(link.get('href'))
    urlwaitinglist.close()

print(soup.get_text())

directory = "/home/banana/Desktop/Search engine/data/Crawled Data/"
results = soup.get_text()
results = results.strip()

f = open("/home/banana/Desktop/Search engine/data/Crawled Data/" + toscan + ".txt", "w")
f.write(url)
f.write('\n')
f.write(results)
f.close()
Looks like not every <a> tag you are grabbing returns a value. I would suggest converting each link you grab to a string and checking that it is not None. It is also bad practice to open a file without using the 'with' statement. I have added an example below that grabs every http/https link and writes it to a file, using some of your code:
from bs4 import BeautifulSoup
import os
import requests
import random
import re

file_directory = './'   # your specified directory location
filename = 'urls.txt'   # your specified filename

url = "https://en.wikipedia.org/wiki/Wikipedia:Contents"
res = requests.get(url)
html = res.text
soup = BeautifulSoup(html, 'html.parser')

links = []
for link in soup.find_all('a'):
    link = link.get('href')
    print(link)
    match = re.search('^(http|https)://', str(link))
    if match:
        links.append(str(link))

with open(file_directory + filename, 'w') as file:
    for link in links:
        file.write(link + '\n')
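To answer the narrower question in the title: there is no dedicated function for this, but you can test the value directly with an explicit None check or a plain truthiness check. A minimal sketch (the HTML string here is only an illustration):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<a href="https://example.com">ok</a><a>no href</a>', 'html.parser')
for a in soup.find_all('a'):
    href = a.get('href')      # returns None when the tag has no href attribute
    if href is not None:      # explicit check; 'if href:' also works and additionally skips empty strings
        print(href)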

Scraping and downloading excel files using python from url

First question, so take it easy on me.
I'm trying to programmatically download all the Excel files from a website using Python. I'm very new to web scraping, so my code may not be up to snuff - I've dropped it in below. When I run the script, I'm not seeing any output, and the files I was meaning to download are nowhere to be found.
I'm not sure what I've got wrong or whether I'm running the script wrong. I'm running it through Anaconda Navigator, navigating to the directory with the script and then running it with the command below:
python file-scraper.py
Here is the code for my script. Any help or suggestions are appreciated!
from bs4 import BeautifulSoup as bs
import requests

DOMAIN = 'https://lfportal.loudoun.gov/LFPortalinternet/'
URL = 'https://lfportal.loudoun.gov/LFPortalinternet/Browse.aspx?startid=213973&row=1&dbid=0'
FILETYPE = '.xls'

def get_soup(url):
    return bs(requests.get(url).text, 'html.parser')

for link in get_soup(URL).find_all('a'):
    file_link = link.get('href')
    if FILETYPE in file_link:
        print(file_link)
        with open(link.text, 'wb') as file:
            response = requests.get(DOMAIN + file_link)
            file.write(response.content)
You have the most common problem - the browser uses JavaScript to add links to the page (when you click a year), but requests/BeautifulSoup can't run JavaScript.
You have to turn off JavaScript in the browser and check whether you can still get to the files without it. Then you have to see how the site works and do the same in code. Sometimes this requires Selenium to control a real browser, which can run JavaScript.
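If it does turn out that Selenium is needed, a minimal sketch might look like this (assuming the selenium package and a matching browser driver are installed; the parsing afterwards stays the same):

from selenium import webdriver
from bs4 import BeautifulSoup

URL = 'https://lfportal.loudoun.gov/LFPortalinternet/Browse.aspx?startid=213973&row=1&dbid=0'

driver = webdriver.Chrome()                               # real browser that can run JavaScript
driver.get(URL)                                           # page is rendered, JS-added links included
soup = BeautifulSoup(driver.page_source, 'html.parser')   # parse the rendered HTML as usual
driver.quit()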
When I open the URL in a browser without JavaScript, I don't see any .xls files. I have to click a year, and then it loads a different URL that contains the .xls links.
2017: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/213974/Row1.aspx
2018: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/285051/Row1.aspx
2019: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/312510/Row1.aspx
2020: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/384496/Row1.aspx
2021: https://lfportal.loudoun.gov/LFPortalinternet/0/fol/466963/Row1.aspx
You have to use BeautifulSoup to find these URLs, load them with requests, and then search for the .xls links there.
EDIT:
The code below searches the subpages and uses them to download the files.
It downloads every year into a separate folder.
import requests
from bs4 import BeautifulSoup as bs
import os

# --- functions ---

def get_soup(url):
    response = requests.get(url)
    #print(response.status_code)
    #print(response.text)

    html = response.text

    soup = bs(html, 'html.parser')
    #soup = bs(html, 'lxml')
    #soup = bs(html, 'html5lib')

    return soup

# --- main ---

# - data -

DOMAIN = 'https://lfportal.loudoun.gov/LFPortalinternet/'
URL = 'https://lfportal.loudoun.gov/LFPortalinternet/Browse.aspx?startid=213973&row=1&dbid=0'
FILETYPE = '.xls'

# - code -

soup = get_soup(URL)

for folder_link in soup.find_all('a', {'class': 'DocumentBrowserNameLink'}):
    folder_name = folder_link.get('aria-label').split(' ')[0]
    folder_link = folder_link.get('href')
    print('folder:', folder_name)

    os.makedirs(folder_name, exist_ok=True)

    subsoup = get_soup(DOMAIN + folder_link)

    for file_link in subsoup.find_all('a', {'class': 'DocumentBrowserNameLink'}):
        file_name = file_link.get('aria-label')[:-4]  # skip extra `.xls` at the end
        file_link = file_link.get('href')

        if file_link.endswith(FILETYPE):
            print(' file:', file_name)

            file_name = os.path.join(folder_name, file_name)

            with open(file_name, 'wb') as file:
                response = requests.get(DOMAIN + file_link)
                file.write(response.content)
BTW: I put it on GitHub furas/python-examples
Your web page only contains the folders that you, as a human, have to click in order to get to the files.
With BS you have to get the links of the folders, then request them to get the file list.
What simplifies your case is that both folders and files have the class attribute DocumentBrowserNameLink.
You can have a function to find them
from bs4 import BeautifulSoup as bs
import requests

DOMAIN = 'https://lfportal.loudoun.gov/LFPortalinternet/'
URL = 'https://lfportal.loudoun.gov/LFPortalinternet/Browse.aspx?startid=213973&row=1&dbid=0'
FILETYPE = '.xls'

def get_soup(url):
    return bs(requests.get(url).text, 'html.parser')

def get_links(page):
    result = page.find_all(class_="DocumentBrowserNameLink")
    return result

page = get_soup(URL)
folder_links = get_links(page)

for link in folder_links:
    page2 = get_soup(DOMAIN + link['href'])
    file_links = get_links(page2)
    for file in file_links:
        filepath = file['href']
        if FILETYPE in filepath:
            print(DOMAIN + filepath)
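If you also want to save the files rather than only print their URLs, the inner loop could be extended along these lines (reusing DOMAIN, FILETYPE, get_soup, get_links and folder_links from the code above; deriving the local filename from the last path segment of the href is an assumption):

for link in folder_links:
    page2 = get_soup(DOMAIN + link['href'])
    for file in get_links(page2):
        filepath = file['href']
        if FILETYPE in filepath:
            local_name = filepath.rsplit('/', 1)[-1]   # assumed: last path segment works as a filename
            with open(local_name, 'wb') as f:
                f.write(requests.get(DOMAIN + filepath).content)
            print('saved', local_name)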

Downloading all images without src tag from website

I'm scraping images from websites. I'm locating them by src, but what if they don't have a src attribute? How should I get them? Right now I'm using this code:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import os
import re

def url_to_page_name(url):
    parsed = urlparse(str(url))
    return parsed.netloc

def get_images_job(page_url):
    """Request given page and extract images"""
    directory_name = url_to_page_name(page_url)
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    img_tags = soup.find_all('img')
    urls = [img['src'] for img in img_tags]

    if not os.path.exists(directory_name):
        os.makedirs(directory_name)

    for url in urls:
        file_name = re.search(r'/([\w_-]+[.](jpg|jpeg|gif|png|bmp|webp|svg))$', url)
        if file_name:
            file_name = file_name.group(1)
            with open(os.path.join(f'{directory_name}/' + file_name), 'wb') as f:
                if 'http' not in url:
                    # sometimes an image source can be relative;
                    # if it is, prepend the base url (the page_url variable here)
                    url = '{}{}'.format(page_url, url)
                response = requests.get(url)
                f.write(response.content)

get_images_job("https://pixabay.com/")
And what if:
they are used as a background, e.g. background="img/tile.jpg"
they are located inside CSS
they are embedded as base64 data
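For the background case, one possible approach is to scan inline style attributes with a regular expression and decode data: URIs with the base64 module; images referenced from linked CSS files would need those stylesheets fetched and scanned the same way. A rough sketch (the HTML snippet is only an illustration, not taken from the site above):

import re
import base64
from bs4 import BeautifulSoup

html = ('<div style="background-image: url(img/tile.jpg)"></div>'
        '<div style="background: url(data:image/png;base64,iVBORw0KGgo=)"></div>')
soup = BeautifulSoup(html, 'html.parser')

for tag in soup.select('[style*="url("]'):                     # tags whose inline style references an image
    match = re.search(r'url\(["\']?(.*?)["\']?\)', tag['style'])
    if not match:
        continue
    src = match.group(1)
    if src.startswith('data:'):                                # base64-embedded image
        data = base64.b64decode(src.split(',', 1)[1])          # raw image bytes, ready to write to disk
        print('embedded image of', len(data), 'bytes')
    else:
        print('background image url:', src)                    # relative or absolute URL to download as usual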

How to download .qrs files from a website using python and BeautifulSoup?

I would like to download all the files ending with .qrs, .dat, or .hea from this website and store them in a local folder.
https://physionet.org/physiobank/database/shareedb/
I tried modifying the solution from the following link.
Download .xls files from a webpage using Python and BeautifulSoup
This is how I modified the code:
import os
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve

URL = 'https://physionet.org/physiobank/database/shareedb/'
OUTPUT_DIR = ''  # path to output folder, '.' or '' uses current folder

u = urlopen(URL)
try:
    html = u.read().decode('utf-8')
finally:
    u.close()

soup = BeautifulSoup(html, "html.parser")
for link in soup.select('a[href^="https://"]'):  # or a[href*="shareedb/0"]
    href = link.get('href')
    if not any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        continue
    filename = os.path.join(OUTPUT_DIR, href.rsplit('/', 1)[-1])
    # We need a https:// URL for this site
    # href = href.replace('http://', 'https://')
    print("Downloading %s to %s..." % (href, filename))
    urlretrieve(href, filename)

print("Done.")
When I run this code, it does not extract the files from the target page, nor does it output any failure message (e.g. 'failed to download').
After some debugging I saw that in my case none of the files were being selected. I suspect it has more to do with the structure of the HTML.
How can I download these files to a local directory using Python?
You could use the excellent requests library as follows:
import bs4
import requests

url = "https://physionet.org/physiobank/database/shareedb/"
html = requests.get(url)
soup = bs4.BeautifulSoup(html.text, "html.parser")

for link in soup.find_all('a', href=True):
    href = link['href']
    if any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        print("Downloading '{}'".format(href))
        remote_file = requests.get(url + href)
        with open(href, 'wb') as f:
            for chunk in remote_file.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
This would download all .dat, .hea and .qrs files to your computer.
Install using the standard:
pip install requests
Note, all of the hrefs on that URL are already in a form suitable for using directly as a filename (so no need at the moment to parse away any / characters).
To expand on the answer by wolf tian: the select does not find anything because the links on that site do not have "https://" (nor "shareedb") in their href. The files you are trying to download all have hrefs of the form 01911.hea, i.e. relative paths. So what you need to do first is extract those filenames, for example like this:
for link in soup.select('a'):
    href = link.get('href')
    if not href or not any(href.endswith(x) for x in ['.dat', '.hea', '.qrs']):
        continue
    filename = os.path.join(OUTPUT_DIR, href)
And then you need to prepend the host part to the URL before retrieving it:
urlretrieve(URL + href, filename)
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

start_url = 'https://physionet.org/physiobank/database/shareedb/'
r = requests.get(start_url)
soup = BeautifulSoup(r.text, 'lxml')

# get full url of each file
pre = soup.find('pre')
file_urls = pre.select('a[href*="."]')
full_urls = [urljoin(start_url, url['href']) for url in file_urls]

# download files
for full_url in full_urls:
    file_name = full_url.split('/')[-1]
    print("Downloading {} to {}...".format(full_url, file_name))
    with open(file_name, 'wb') as f:
        fr = requests.get(full_url, stream=True)
        for chunk in fr.iter_content(chunk_size=1024):
            f.write(chunk)
    print('Done')
out:
Downloading https://physionet.org/physiobank/database/shareedb/01911.dat to 01911.dat...
Done
Downloading https://physionet.org/physiobank/database/shareedb/01911.hea to 01911.hea...
Done
Downloading https://physionet.org/physiobank/database/shareedb/01911.qrs to 01911.qrs...
Done
Downloading https://physionet.org/physiobank/database/shareedb/02012.dat to 02012.dat...
Done
Downloading https://physionet.org/physiobank/database/shareedb/02012.hea to 02012.hea...
Done
Downloading https://physionet.org/physiobank/database/shareedb/02012.qrs to 02012.qrs...
From your symptom, the possible reason may be that no URL is matched, so the code never enters the loop.
Since I use Python 2.7, I haven't verified the code.
You may try to print the links you matched and then check whether those URLs can be downloaded and extracted.
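A quick way to do that check is to print what the question's selector actually matches, compared with a selector for the relative hrefs (a small debugging sketch):

import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get('https://physionet.org/physiobank/database/shareedb/').text, 'html.parser')
print(len(soup.select('a[href^="https://"]')))                  # the question's selector: matches nothing here
print([a['href'] for a in soup.select('a[href$=".qrs"]')][:5])  # relative hrefs such as 01911.qrs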
