Download an image using BeautifulSoup - python

I'm using BeautifulSoup in my python code to download an image from a website which changes regularly. It all works well.
However, on the page (https://apod.nasa.gov/apod/astropix.html) there is one lower resolution image (which my code currently downloads) but then if you click the image it takes you to a higher resolution version of that same image.
Can someone please suggest how I can change my code so that it downloads the higher resolution image?
from bs4 import BeautifulSoup as BSHTML
import requests
import subprocess
import urllib2
# Fetch the APOD front page and parse it (urllib2 exists on Python 2 only).
page = urllib2.urlopen('https://apod.nasa.gov/apod/astropix.html')
soup = BSHTML(page, features="html.parser")
images = soup.findAll('img')
# The src of the first <img> is appended to the site's base path to build
# the download URL; this is the lower resolution picture shown on the page.
url = 'https://apod.nasa.gov/apod/' + images[0]['src']
r = requests.get(url, allow_redirects=True)
# r.content is raw bytes, so the file is opened in binary mode; text mode
# can corrupt the image on platforms that translate line endings.
with open('/home/me/Downloads/apod.jpg', "wb") as f:
    f.write(r.content)

You can select the <a> tag that contains the <img>; its "href" attribute holds your image URL:
import requests
from bs4 import BeautifulSoup as BSHTML
# Download and parse the APOD front page.
response = requests.get("https://apod.nasa.gov/apod/astropix.html")
document = BSHTML(response.content, features="html.parser")

# The <a> that directly wraps an <img> links to the image; its href is
# appended to the site's base path.
anchor = document.select_one("a:has(>img)")
image_url = "https://apod.nasa.gov/apod/" + anchor["href"]

# Fetch the image and store the raw bytes.
download = requests.get(image_url, allow_redirects=True)
with open("/home/paul/Downloads/apod.jpg", "wb") as out_file:
    out_file.write(download.content)

You need to download and write to disk:
import requests
from bs4 import BeautifulSoup
from os.path import basename

# "xxx" is a placeholder: substitute the URL of the page to scrape.
r = requests.get("xxx")
soup = BeautifulSoup(r.content, "html.parser")

# Walk every <img> tag and download those whose src contains "http".
for link in soup.find_all("img"):
    lnk = link.get("src")
    # get() returns None for an <img> without a src attribute; skip those
    # instead of failing on the substring test.
    if lnk and "http" in lnk:
        # Save under the last path component of the URL, in binary mode.
        with open(basename(lnk), "wb") as f:
            f.write(requests.get(lnk).content)
You can also use a select to filter your tags to only get the ones with http links:
# The attribute selector keeps only <img> tags whose src starts with "http".
# Uses `soup`, `basename` and `requests` from the preceding snippet.
for link in soup.select("img[src^=http]"):
    lnk = link["src"]
    # The mode must be exactly "wb"; a leading space makes open() raise
    # ValueError (invalid mode).
    with open(basename(lnk), "wb") as f:
        f.write(requests.get(lnk).content)

Related

Filter images by name before scraping with beautifulsoup?

I'm trying to create a program that scrapes a site for images using bs4. The site contains two types of images, low quality ones and high quality ones. The high quality files are named the same as their low quality versions, but contain the word "website" at the end, just before the .png. I'd like to only download the "website" files. Here's what I tried.
from bs4 import BeautifulSoup
import requests
# Page whose images are scraped.
URL = "https://www.ssbwiki.com/Category:Head_icons_(SSBU)"
# The request is sent with a browser-like User-Agent header.
getURL = requests.get(URL, headers={"User-Agent":"Mozilla/5.0"})
soup = BeautifulSoup(getURL.text, 'html.parser')
images = soup.find_all('img')
# Join every <img> src against the page URL and collect the results.
resolvedURLs = []
for image in images:
src = image.get('src')
resolvedURLs.append(requests.compat.urljoin(URL, src))
# NOTE(review): indentation was lost in this listing. Presumably the second
# `if` was nested under the first one (after `continue`), which would make
# the download unreachable -- confirm against the original post.
for image in resolvedURLs:
if not image.endswith("Website.png"):
continue
if image.endswith("Website.png"):
webs = requests.get(image)
# The file name is the last "/"-separated segment of the URL.
open('scraped_images/' + image.split('/')[-1], 'wb').write(webs.content)
I don't get any error messages, but no files download. Any tips?
You are only checking if it ends with "Website.png" after you have already established that it doesn't. Better not to even check if it doesn't:
from bs4 import BeautifulSoup
import requests
URL = "https://www.ssbwiki.com/Category:Head_icons_(SSBU)"
page = requests.get(URL, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(page.text, 'html.parser')

# Resolve each <img> src against the page URL.
resolved_urls = []
for tag in soup.find_all('img'):
    resolved_urls.append(requests.compat.urljoin(URL, tag.get('src')))

# Download only the files whose URL ends with "Website.png"; each is saved
# under the last "/"-separated segment of its URL.
for image_url in resolved_urls:
    if image_url.endswith("Website.png"):
        downloaded = requests.get(image_url)
        file_name = image_url.split('/')[-1]
        open('scraped_images/' + file_name, 'wb').write(downloaded.content)
Actually using list comprehensions you can make your code less procedural and prevent mistakes of the sort you made in the future:
from bs4 import BeautifulSoup
import requests
from requests.compat import urljoin
URL = "https://www.ssbwiki.com/Category:Head_icons_(SSBU)"
getURL = requests.get(URL, headers={"User-Agent":"Mozilla/5.0"})
soup = BeautifulSoup(getURL.text, 'html.parser')
image_urls = [urljoin(URL, image.get('src')) for image in soup.find_all('img')]
# let's make this one a generator so we don't keep too many downloaded
# images in memory; each item pairs the URL with its response because the
# file name is derived from the URL string (a Response has no split())
images = ((url, requests.get(url)) for url in image_urls if url.endswith("Website.png"))
for url, image in images:
    # use the context manager so the files are closed after write
    with open('scraped_images/' + url.split('/')[-1], 'wb') as f:
        f.write(image.content)

how to download images using requests library that are behind cloudflare?

I'm learning about Python's requests library so that I can automatically download some images through their links.
But the images that I'm trying to download are behind Cloudflare, and so I get ERROR 1020 Access Denied
Here's my code
import requests
from bs4 import BeautifulSoup
# -------------------------------------------------------------------------------------------------------
# Fetch the page HTML and parse it.
page_html = requests.get("https://main_link").text
document = BeautifulSoup(page_html, 'html.parser')

# Save every <img> on the page as Test0.png, Test1.png, ...
for index, img_tag in enumerate(document.find_all('img')):
    image_url = img_tag.get('src')  # image link -> https://link/link/image.jpg
    image_bytes = requests.get(image_url).content
    with open(f'Test{index}.png', 'wb') as out_file:
        out_file.write(image_bytes)
I looked at some resources like this StackOverflow question
which says to use cfscrape
And this is my code:
import requests
import cfscrape
from bs4 import BeautifulSoup
# ------------------------------------------------------------------------------------------------------
# Every request goes through the cfscrape scraper instead of plain requests.
scraper = cfscrape.create_scraper()
page_html = scraper.get("https://main_link").text
document = BeautifulSoup(page_html, 'html.parser')

# Save every <img> on the page as Test0.png, Test1.png, ...
for index, img_tag in enumerate(document.find_all('img')):
    image_url = img_tag.get('src')  # https://link/link/image.jpg
    image_bytes = scraper.get(image_url).content
    with open(f'Test{index}.png', 'wb') as out_file:
        out_file.write(image_bytes)
But I still get the 1020 ERROR
I even tried the cloudscraper library, but that does not work either.
I've looked at other resources but can't seem to understand what to do.
Any help is appreciated

How do I filter tags with class in Python and BeautifulSoup?

I'm trying to scrape images from a site using beautifulsoup HTML parser.
There are 2 kinds of image tags for each image on the site. One is for the thumbnail and the other is the bigger size image that only appears after I click on the thumbnail and expand. The bigger size tag contains a class="expanded-image" attribute.
I'm trying to parse through the HTML and get the "src" attribute of the expanded image which contains the source for the image.
When I try to execute my code, nothing happens. It just says the process finished without scraping any image. But when I don't try to filter the code and just give tag as an argument, it downloads all the thumbnails.
Here's my code:
import webbrowser, requests, os
from bs4 import BeautifulSoup
# Return the body text of the page at `url`.
def getdata(url):
r = requests.get(url)
return r.text
htmldata = getdata('https://boards.4chan.org/a/thread/30814')
soup = BeautifulSoup(htmldata, 'html.parser')
# NOTE(review): `list` shadows the built-in type of the same name.
list = []
# Collect the src of every <img> carrying class "expanded-thumb", replacing
# each "//" in it with "https://".
for i in soup.find_all("img",{"class":"expanded-thumb"}):
list.append(i['src'].replace("//","https://"))
# Download `url` into the directory `pathname` (created when missing); the
# file is named after the last "/"-separated segment of the URL.
def download(url, pathname):
if not os.path.isdir(pathname):
os.makedirs(pathname)
filename = os.path.join(pathname, url.split("/")[-1])
response = requests.get(url, stream=True)
with open(filename, "wb") as f:
f.write(response.content)
# Download every collected URL into the "file" directory.
for a in list:
download(a,"file")
You might be running into a problem using "list" as a variable name. It's a type in python. Start with this (replacing TEST_4CHAN_URL with whatever thread you want), incorporating my suggestion from the comment above.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

TEST_4CHAN_URL = "https://boards.4chan.org/a/thread/<INSERT_THREAD_ID_HERE>"


def getdata(url):
    """Return the response body of `url` as text."""
    r = requests.get(url)
    return r.text


htmldata = getdata(TEST_4CHAN_URL)
soup = BeautifulSoup(htmldata, "html.parser")

# urljoin() resolves each href against the thread URL, so a scheme-relative
# "//host/path" becomes "https://host/path" while an already absolute URL is
# left untouched. A blanket replace("//", "https://") would rewrite every
# "//" in the string, including the one inside an existing "https://" prefix.
src_list = [
    urljoin(TEST_4CHAN_URL, a["href"])
    for a in soup.find_all("a", {"class": "fileThumb"})
]
print(src_list)

How to get video src using BeautifulSoup in Python

I am trying to find downloadable video links in a website. For example, I am working with URLs like https://www.loc.gov/item/2015669100/. You can see that there is an m3u8 video link under the mejs__mediaelement div tag.
However my code is not printing anything. Meaning that it's not finding the Video urls for the website.
My code is below
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
# Read the page URLs, one per line, stripping trailing whitespace.
with open('pages2crawl.txt', 'r') as url_file:
    page_urls = [raw.rstrip() for raw in url_file]

for page_url in page_urls:
    request = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(request).read()
    document = BeautifulSoup(html, 'html.parser')
    # In every div with class "mejs__mediaelement", read the src of its
    # <video> tag and print it when it ends with "m3u8".
    for container in document.find_all('div', attrs={'class': 'mejs__mediaelement'}):
        video_src = container.find("video").get("src")
        if video_src.endswith("m3u8"):
            print(video_src)
If you just want to make a simple script it would probably be easier to use regex.
import re, requests
url = "https://www.loc.gov/item/2015669100/"  # page to search for the stream link
s = requests.Session()  # start the session
data = s.get(url).text  # http get request to download data, keep the raw text
# Capture the whole link including its extension. The dot is escaped so it
# matches a literal "." only (a bare "." matches any character), and the
# pattern is a raw string so the backslash reaches the regex engine intact.
vidlinks = re.findall(r"src='(.*?\.m3u8)'/>", data)
if vidlinks:
    print(vidlinks[0])  # print the full link with extension
else:
    # findall() returns an empty list when nothing matches; indexing it
    # would raise IndexError.
    print("no m3u8 link found")
You can use CSS selector source[type="application/x-mpegURL"] to extract MPEG link (or source[type="video/mp4"] to extract mp4 link):
import requests
from bs4 import BeautifulSoup
url = "https://www.loc.gov/item/2015669100/"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")

# Each <source> tag is selected by its type attribute; its src is the link.
mpeg_source = soup.select_one('source[type="application/x-mpegURL"]')
link_mpeg = mpeg_source["src"]
mp4_source = soup.select_one('source[type="video/mp4"]')
link_mp4 = mp4_source["src"]

# Both links are looked up before anything is printed.
print(link_mpeg, link_mp4, sep="\n")
Prints:
https://tile.loc.gov/streaming-services/iiif/service:afc:afc2010039:afc2010039_crhp0001:afc2010039_crhp0001_mv04/full/full/0/full/default.m3u8
https://tile.loc.gov/storage-services/service/afc/afc2010039/afc2010039_crhp0001/afc2010039_crhp0001_mv04.mp4

Python download file with original filename

I have used Python and Beautiful Soup to detect links on a website. Now I want to download the image files from the detected URLs and store them in a specific folder. What would be the easiest way to do so?
The code I develop so far:
from bs4 import BeautifulSoup as soup # HTML data structure
from urllib.request import urlopen as uReq # Web client
from PIL import Image
import requests
my_url = "https://abc/videos/vod/movies/actress/letter=a/sort=popular/page=1/"

# Download the raw page, then close the connection.
connection = uReq(my_url)
page_html = connection.read()
connection.close()

page_soup = soup(page_html, "html.parser")

# Walk div.main -> ul -> li -> img (only images that have an alt attribute);
# `link` is overwritten on each pass, so it ends up holding the src of the
# last image visited.
for main_div in page_soup.find_all('div', attrs={'class': 'main'}):
    for list_tag in main_div.find_all('ul'):
        for item in list_tag.find_all('li'):
            for img in item.find_all('img', alt=True):
                link = img['src']
The url links detected:
https://abcde/mono/actjpgs/abb1.jpg
https://abcde/mono/actjpgs/t31sw.jpg
https://abcde/mono/actjpgs/beaas.jpg
End result file name:
abb1.jpg
t31sw.jpg
beaas.jpg
As Karl suggested a quick google search would have told you this but since I am being helpful in my early SO career I will try and do it for you.
import requests
# Example image URL (must be a quoted string); replace it with a detected link.
link = "https://abcde/mono/actjpgs/abb1.jpg"
# Get image and file name
r = requests.get(link, allow_redirects=True)
fname = link.split('/')[-1]
# save the file; the context manager closes it once the bytes are written
with open(fname, 'wb') as f:
    f.write(r.content)
I have not tested this code.
import os
import shutil
import urllib.request
from urllib.parse import urlparse


def filename_from_url(url):
    """Return the last path component of `url`, ignoring query and fragment.

    Example: "https://abcde/mono/actjpgs/abb1.jpg" gives "abb1.jpg".
    """
    return os.path.basename(urlparse(url).path)


def download(url, directory="."):
    """Download `url` into `directory`, keeping the original file name.

    Returns the path of the file that was written.
    """
    destination = os.path.join(directory, filename_from_url(url))
    # Copy the response body straight into the file rather than reading it
    # all into memory first.
    with urllib.request.urlopen(url) as response, open(destination, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
    return destination


if __name__ == "__main__":
    download("https://abcde/mono/actjpgs/abb1.jpg")

Categories