Download images from a webpage - Python

I've tried to download images from a webpage. What am I missing here, please?
import urllib
from urllib.request import urlopen, Request
import requests
from bs4 import BeautifulSoup
import os
urlpage ='https://www.google.com/search?site=imghp&tbm=isch&source=hp&biw=1414&bih=709&q=little+cofee'
header = {'User-Agent': 'Mozilla/5.0'}
page = urlopen(Request(urlpage,headers=header))
soup = BeautifulSoup(page)
images = soup.find_all("div", {"class":"thumb-pic"})
for image in images:
    imgUrl = image.a['href'].split("imgurl=")[1]
    urllib.request.urlretrieve(imgUrl, os.path.basename(imgUrl))

It's tricky. Sometimes pages use short relative URLs like "images/img.jpg", "/images/img.jpg", or "../images/img.jpg". But the Google page you are trying to scrape has no HTML tags at all; it contains just JavaScript.
I made a quick and dirty example just to show you how it might work, in Python 2.7. Alternatively, you can simply save the page opened in your browser, and all the images will be saved neatly in a folder.
#!/usr/bin/python
import urllib

url = 'http://www.blogto.com/cafes/little-nickys-coffee-toronto'
ext = ['.jpg', '.png', '.gif']  # image types to download

response = urllib.urlopen(url)
html = response.read()

# collect unique image URLs from every src attribute in the raw HTML
IMGs = []
L = html.split('src="')
for item in L:
    item = item[:item.find('"')]
    item = item.strip()
    if item.find('http') == -1:
        # crude relative-URL resolution: prefix with the site root
        item = url[:url.find('/', 10)] + item
    for e in ext:
        if item.find(e) != -1:
            if item not in IMGs:
                IMGs.append(item)

n = len(IMGs)
print 'Found', n, 'images'

# save each image under a zero-padded sequential filename
i = 1
for img in IMGs:
    ext = img[img.rfind('.'):]
    filename = '0' * (len(str(n)) - len(str(i))) + str(i)
    i += 1
    try:
        print img
        f = open(filename + ext, 'wb')
        f.write(urllib.urlopen(img).read())
        f.close()
    except:
        print "Unpredictable error:", img

print 'Done!'
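As an aside, the three relative-URL forms mentioned above can all be resolved with a standard library call instead of slicing the URL by hand. A minimal Python 3 sketch, using the page URL from the example:

from urllib.parse import urljoin

page_url = 'http://www.blogto.com/cafes/little-nickys-coffee-toronto'

# urljoin resolves each relative form against the page the src came from
print(urljoin(page_url, 'images/img.jpg'))     # http://www.blogto.com/cafes/images/img.jpg
print(urljoin(page_url, '/images/img.jpg'))    # http://www.blogto.com/images/img.jpg
print(urljoin(page_url, '../images/img.jpg'))  # http://www.blogto.com/images/img.jpg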

Related

Downloading all Images from a page with beautifulSoup not working

I am trying to download the show images from this page with BeautifulSoup.
When I run the code below, the only image that downloads is the spinning loading icon.
When I check the requests tab on the page, I can see requests for all the other images, so I assume they should be downloaded as well. I am not sure why they do not download, as they are contained within img tags in the HTML on the page.
import re
import requests
from bs4 import BeautifulSoup
site = 'https://www.tvnz.co.nz/categories/sci-fi-and-fantasy'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
image_tags = soup.find_all('img')
urls = [img['src'] for img in image_tags]
for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
    if not filename:
        print("Regular expression didn't match with the url: {}".format(url))
        continue
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
print("Download complete, downloaded images can be found in current directory!")
You can try the API they seem to be using to populate the page; the img tags in the raw HTML likely hold only a loading placeholder until JavaScript swaps in the real image sources:
api_url = 'https://apis-edge-prod.tech.tvnz.co.nz/api/v1/web/play/page/categories/sci-fi-and-fantasy'
r = requests.get(api_url)
try:
    embVals = r.json()['_embedded'].values()
except Exception as e:
    embVals = []
    print('failed to get embedded items\n', str(e))
# flatten the src of every image-like key in every embedded item
urls = [img for images in [[
    v['src'] for k, v in ev.items() if
    k is not None and 'image' in k.lower()
    and v is not None and 'src' in v
] for ev in embVals] for img in images]
# for url in urls: # should work the same
(Images seem to be in nested dictionaries with keys like 'portraitTileImage', 'image', 'tileImage', 'coverImage'. You can also use for-loops, as sketched below, to go through embVals and extract other data if you want to include more in the filename/metadata/etc.)
I don't know if it will get you ALL the images on the page, but when I tried it, urls had 297 links.
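If the nested comprehension is hard to read, it can be unrolled into plain for-loops; this sketch makes the same assumptions about the API response shape:

urls = []
for ev in embVals:
    for k, v in ev.items():
        # keep any entry whose key looks image-related and that has a 'src'
        if k is not None and 'image' in k.lower() and v is not None and 'src' in v:
            urls.append(v['src'])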

Filter images by name before scraping with beautifulsoup?

I'm trying to create a program that scrapes a site for images using bs4. The site contains two types of images: low quality ones and high quality ones. The high quality files are named the same as their low quality versions, but contain the word "website" at the end, before the .png. I'd like to only download the "website" files. Here's what I tried.
from bs4 import BeautifulSoup
import requests
URL = "https://www.ssbwiki.com/Category:Head_icons_(SSBU)"
getURL = requests.get(URL, headers={"User-Agent":"Mozilla/5.0"})
soup = BeautifulSoup(getURL.text, 'html.parser')
images = soup.find_all('img')
resolvedURLs = []
for image in images:
    src = image.get('src')
    resolvedURLs.append(requests.compat.urljoin(URL, src))
for image in resolvedURLs:
    if not image.endswith("Website.png"):
        continue
        if image.endswith("Website.png"):
            webs = requests.get(image)
            open('scraped_images/' + image.split('/')[-1], 'wb').write(webs.content)
I don't get any error messages, but no files download. Any tips?
You are only checking if it ends with "Website.png" after you have already established that it doesn't. Better not to even check if it doesn't:
from bs4 import BeautifulSoup
import requests
URL = "https://www.ssbwiki.com/Category:Head_icons_(SSBU)"
getURL = requests.get(URL, headers={"User-Agent":"Mozilla/5.0"})
soup = BeautifulSoup(getURL.text, 'html.parser')
images = soup.find_all('img')
resolvedURLs = []
for image in images:
    src = image.get('src')
    resolvedURLs.append(requests.compat.urljoin(URL, src))
for image in resolvedURLs:
    if image.endswith("Website.png"):
        webs = requests.get(image)
        open('scraped_images/' + image.split('/')[-1], 'wb').write(webs.content)
Actually, by using list comprehensions you can make your code less procedural and prevent mistakes of this sort in the future:
from bs4 import BeautifulSoup
import requests
from requests.compat import urljoin
URL = "https://www.ssbwiki.com/Category:Head_icons_(SSBU)"
getURL = requests.get(URL, headers={"User-Agent":"Mozilla/5.0"})
soup = BeautifulSoup(getURL.text, 'html.parser')
image_urls = [urljoin(URL, image.get('src')) for image in soup.find_all('img')]
# let's make this one a generator so we don't keep too many downloaded
# images in memory; pair each response with its URL so we can still
# derive the local filename
images = ((url, requests.get(url)) for url in image_urls if url.endswith("Website.png"))
for url, image in images:
    # use the context manager so the files are closed after write
    with open('scraped_images/' + url.split('/')[-1], 'wb') as f:
        f.write(image.content)

Unable to make my script download the rest of the images if some of them are already in a folder

I've written a script in Python to download different movie images from a torrent site and store them in a folder on the desktop. My script can download and save the images in a folder.
If none of the images are in the folder yet, or all of them already are, my script handles downloading or skipping correctly.
How can I make my script download the rest of the images if some of them are already in the folder?
This is my try:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
link = "https://www.yify-torrent.org/search/1080p/"
dirf = os.environ['USERPROFILE'] + '\\Desktop\\Images'
if not os.path.exists(dirf):
    os.makedirs(dirf)
os.chdir(dirf)
items = len([name for name in os.listdir(dirf) if os.path.isfile(os.path.join(dirf, name))])
if not items:
    response = requests.get(link)
    soup = BeautifulSoup(response.text, "lxml")
    for item in soup.select(".img-item .poster-thumb"):
        filename = item['src'].split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(requests.get(urljoin(link, item['src'])).content)
else:
    print("All images are there")
Examine each image separately.
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
link = "https://www.yify-torrent.org/search/1080p/"
dirf = os.environ['USERPROFILE'] + '\\Desktop\\Images'
if not os.path.exists(dirf):
    os.makedirs(dirf)
response = requests.get(link)
soup = BeautifulSoup(response.text, "lxml")
counter = 0
for item in soup.select(".img-item .poster-thumb"):
    filename = item['src'].split('/')[-1]
    localfile = os.path.join(dirf, filename)
    if os.path.isfile(localfile):
        continue
    # else download the missing image
    counter += 1
    with open(localfile, 'wb') as f:
        f.write(requests.get(urljoin(link, item['src'])).content)
if counter:
    print("Downloaded {} images".format(counter))
else:
    print("All images are there")
Inside the for loop, for each extracted image link, we check whether the image exists locally; if it already does, we don't do anything with it.
(I also took out the chdir because it wasn't doing anything useful. If you want to chdir you can simplify the rest of the code to not append dirf in front of the local file name.)
Try this. (Note that I haven't tested actually retrieving the images.)
Holler if you need something clarified.
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
link = "https://www.yify-torrent.org/search/1080p/"
dirf = os.environ['USERPROFILE'] + '\\Desktop\\Images'
if not os.path.exists(dirf):
    os.makedirs(dirf)
os.chdir(dirf)
# get list of previously downloaded images
items = [name for name in os.listdir(dirf) if os.path.isfile(os.path.join(dirf, name))]
# get list of available images as a dictionary since we need the full src
filenames = {}
response = requests.get(link)
soup = BeautifulSoup(response.text, "lxml")
for item in soup.select(".img-item .poster-thumb"):
    filename = item['src'].split('/')[-1]
    filenames[filename] = item['src']
# the set difference is exactly the images still to download
remaining = set(filenames) - set(items)
if remaining:
    for filename in remaining:
        with open(filename, 'wb') as f:
            f.write(requests.get(urljoin(link, filenames[filename])).content)
else:
    print("All images are there")

Beautifulsoup - How to open images and download them

I am looking to grab the full size product images from here
My thinking was:
Follow the image link
Download the picture
Go back
Repeat for n+1 pictures
I know how to open the image thumbnails but not how to get the full size images. Any ideas on how this could be done?
This will get you all the URLs of the images:
import urllib2
from bs4 import BeautifulSoup
url = "http://icecat.biz/p/toshiba/pscbxe-01t00een/satellite-pro-notebooks-4051528049077-Satellite+Pro+C8501GR-17732197.html"
html = urllib2.urlopen(url)
soup = BeautifulSoup(html)
imgs = soup.findAll("div", {"class":"thumb-pic"})
for img in imgs:
    print img.a['href'].split("imgurl=")[1]
Output:
http://www.toshiba.fr/contents/fr_FR/SERIES_DESCRIPTION/images/g1_satellite-pro-c850.jpg
http://www.toshiba.fr/contents/fr_FR/SERIES_DESCRIPTION/images/g4_satellite-pro-c850.jpg
http://www.toshiba.fr/contents/fr_FR/SERIES_DESCRIPTION/images/g2_satellite-pro-c850.jpg
http://www.toshiba.fr/contents/fr_FR/SERIES_DESCRIPTION/images/g5_satellite-pro-c850.jpg
http://www.toshiba.fr/contents/fr_FR/SERIES_DESCRIPTION/images/g3_satellite-pro-c850.jpg
And this code is for downloading and saving those images:
import os
import urllib
import urllib2
from bs4 import BeautifulSoup
url = "http://icecat.biz/p/toshiba/pscbxe-01t00een/satellite-pro-notebooks-4051528049077-Satellite+Pro+C8501GR-17732197.html"
html = urllib2.urlopen(url)
soup = BeautifulSoup(html)
imgs = soup.findAll("div", {"class":"thumb-pic"})
for img in imgs:
    imgUrl = img.a['href'].split("imgurl=")[1]
    urllib.urlretrieve(imgUrl, os.path.basename(imgUrl))
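Since urllib2 exists only in Python 2, here is a rough Python 3 port of the same approach; untested, and it assumes the page still uses the thumb-pic markup shown above:

import os
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup

url = "http://icecat.biz/p/toshiba/pscbxe-01t00een/satellite-pro-notebooks-4051528049077-Satellite+Pro+C8501GR-17732197.html"
soup = BeautifulSoup(urlopen(url), "html.parser")
for div in soup.find_all("div", {"class": "thumb-pic"}):
    # the full-size image URL is embedded in the thumbnail link's query string
    img_url = div.a['href'].split("imgurl=")[1]
    urlretrieve(img_url, os.path.basename(img_url))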

Scraping a page for images but files are returned as empty

I'm modifying this script to scrape pages like this one for the book page images. Using the script directly from Stack Overflow, it returns all the images correctly except the one image I want. That one is saved as an empty file with a name like this: img.php?dir=39d761947ad84e71e51e3c300f7af8ff&file=1.png.
In my modified version below I'm only pulling the book page image.
Here's my script:
from bs4 import BeautifulSoup as bs
import urlparse
from urllib2 import urlopen
from urllib import urlretrieve
import os
import sys
out_folder = '/Users/Craig/Desktop/img'
def main(url, out_folder):
    soup = bs(urlopen(url))
    parsed = list(urlparse.urlparse(url))
    for image in soup.findAll('img', id='page_image'):
        print "Image: %(src)s" % image
        filename = image["src"].split("/")[-1]
        parsed[2] = image["src"]
        outpath = os.path.join(out_folder, filename)
        if image["src"].lower().startswith("http"):
            urlretrieve(image["src"], outpath)
        else:
            urlretrieve(urlparse.urlunparse(parsed), outpath)

def _usage():
    print "usage: python dumpimages.py http://example.com [outpath]"

if __name__ == "__main__":
    url = sys.argv[-1]
    if not url.lower().startswith("http"):
        out_folder = sys.argv[-1]
        url = sys.argv[-2]
        if not url.lower().startswith("http"):
            _usage()
            sys.exit(-1)
    main(url, out_folder)
Any ideas?
The issue here is that the URL you are using to retrieve the image is:
http://bookre.org/loader/img.php?dir=39d761947ad84e71e51e3c300f7af8ff&file=1.png?file=1077091&pg=1
When you actually want it to be:
http://bookre.org/loader/img.php?dir=39d761947ad84e71e51e3c300f7af8ff&file=1.png
Here's something I hacked together in 2 minutes to download the image you required from the website you listed:
import urllib
import urllib2
import urlparse
from bs4 import BeautifulSoup
def main(url):
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html.read())
    parsed = list(urlparse.urlparse(url))
    for image in soup.find_all(id="page_image"):
        if image["src"].lower().startswith("http"):
            urllib.urlretrieve(image["src"], "image.png")
        else:
            # rebuild from scheme + host + the img src only, dropping
            # the reader page's own query string
            new = (parsed[0], parsed[1], image["src"], "", "", "")
            urllib.urlretrieve(urlparse.urlunparse(new), "image.png")

if __name__ == '__main__':
    main("http://bookre.org/reader?file=1077091&pg=1")
The script saves the image as "image.png" in the directory the script is located in.
Hope this is what you were after; let us know if you run into any difficulties.
In your:
else:
    urlretrieve(urlparse.urlunparse(parsed), outpath)
you need to replace some of the elements in parsed with those from image["src"].
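Concretely, parsed[4] still holds the reader page's own query string (file=1077091&pg=1), so urlunparse glues it onto the image path, producing the doubled URL shown above. A minimal sketch of the fix (Python 2, with an illustrative src value):

import urlparse

page_url = "http://bookre.org/reader?file=1077091&pg=1"
src = "/loader/img.php?dir=39d761947ad84e71e51e3c300f7af8ff&file=1.png"  # illustrative

parsed = list(urlparse.urlparse(page_url))
parsed[2] = src  # take the path from the img tag...
parsed[4] = ''   # ...and drop the page's stale query string
print urlparse.urlunparse(parsed)
# http://bookre.org/loader/img.php?dir=39d761947ad84e71e51e3c300f7af8ff&file=1.png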
So much easier with pyquery:
from pyquery import PyQuery as pq
image, = [img.attrib['src'] for img in pq(url=url)('img#page_image')]
...
(Note the funky use of name, = ['string'] to unpack the one-element list.)
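For comparison, a rough BeautifulSoup equivalent of that selector, assuming the page HTML has already been fetched into html:

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')
# select() takes the same CSS selector; the trailing comma asserts exactly one match
image, = [img['src'] for img in soup.select('img#page_image')]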
