Downloading an image using Python Mechanize

I'm trying to write a Python script to download an image and set it as my wallpaper. Unfortunately, the Mechanize documentation is quite poor. My script follows the link correctly, but I'm having a hard time actually saving the image on my computer. From what I've researched, the .retrieve() method should do the job, but how do I specify the path the file should be downloaded to? Here is what I have...
def followLink(browser, fixedLink):
    browser.open(fixedLink)
    if browser.find_link(url_regex=r'1600x1200'):
        browser.follow_link(url_regex=r'1600x1200')
    elif browser.find_link(url_regex=r'1400x1050'):
        browser.follow_link(url_regex=r'1400x1050')
    elif browser.find_link(url_regex=r'1280x960'):
        browser.follow_link(url_regex=r'1280x960')
    return

import mechanize, os
from BeautifulSoup import BeautifulSoup

browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
    # strip the scheme prefix; note that lstrip('http://') would be wrong here,
    # since lstrip removes a set of characters, not a prefix
    filename = image['src'].split('://', 1)[-1]
    filename = os.path.join(dir, filename.replace('/', '_'))
    data = browser.open(image['src']).read()
    browser.back()
    save = open(filename, 'wb')
    save.write(data)
    save.close()
This will download all the images from a web page. For parsing the HTML you'd be better off using BeautifulSoup or lxml, and downloading is just a matter of reading the data and then writing it to a local file. You should assign your own value to dir; it is the directory where your images will be saved.
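One caveat with the code above: img src values are often relative, in which case browser.open(image['src']) will fail. A minimal sketch using urljoin from the standard library to resolve each src against the page URL (url here is the same page address as above):
from urlparse import urljoin  # urllib.parse in Python 3

for image in image_tags:
    # resolve relative srcs like "/img/foo.jpg" against the page URL
    data = browser.open(urljoin(url, image['src'])).read()
    browser.back()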

Not sure why this solution hasn't come up, but you can use the mechanize.Browser.retrieve function as well. Perhaps this only works in newer versions of mechanize and has thus not been mentioned?
Anyway, if you wanted to shorten the answer by zhangyangyu, you could do this:
import mechanize, os
from BeautifulSoup import BeautifulSoup

browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
    # strip the scheme prefix (lstrip would remove characters, not a prefix)
    filename = image['src'].split('://', 1)[-1]
    filename = os.path.join(dir, filename.replace('/', '_'))
    # retrieve()'s second argument is the destination path on disk
    browser.retrieve(image['src'], filename)
    browser.back()
Also keep in mind that you'll likely want to put all of this into a try/except block like this one:
import mechanize, os
from BeautifulSoup import BeautifulSoup

browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
    filename = image['src'].split('://', 1)[-1]  # strip the scheme prefix
    filename = os.path.join(dir, filename.replace('/', '_'))
    try:
        browser.retrieve(image['src'], filename)
        browser.back()
    except (mechanize.HTTPError, mechanize.URLError) as e:
        pass
        # Use e.code and e.read() with HTTPError
        # Use e.reason.args with URLError
Of course you'll want to adjust this to your needs. Perhaps you want it to bomb out if it encounters an issue; it totally depends on what you want to achieve.
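For instance, a minimal sketch of the bail-out variant (same loop and browser as above), reporting the failure and then re-raising instead of swallowing it:
for image in image_tags:
    filename = os.path.join(dir, image['src'].split('://', 1)[-1].replace('/', '_'))
    try:
        browser.retrieve(image['src'], filename)
        browser.back()
    except (mechanize.HTTPError, mechanize.URLError) as e:
        print 'Failed on %s: %s' % (image['src'], e)
        raise  # bomb out instead of silently continuing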

You can get/download the image by opening the URL of the img src:
image_response = browser.open_novisit(img['src'])
To save the file now, just use open:
with open('image_out.png', 'wb') as f:
    f.write(image_response.read())

It's really crappy, but it "works" for me, building on 0xc0000022l's answer:
import mechanize, os
from BeautifulSoup import BeautifulSoup
import urllib2

def DownloadIMGs(url): # IMPORTANT: URL WITH HTTP OR HTTPS
    print "From", url
    dir = r'F:\Downloadss' # dir for downloads (raw string so backslashes survive)
    basicImgFileTypes = ['png','bmp','cur','ico','gif','jpg','jpeg','psd','raw','tif']
    browser = mechanize.Browser()
    html = browser.open(url)
    soup = BeautifulSoup(html)
    image_tags = soup.findAll('img')
    print "N Images:", len(image_tags)
    print
    #---------SAVE PATH
    # create the download directory if it doesn't exist yet
    if not os.path.exists(dir):
        os.makedirs(dir)
    #---------SAVE PATH
    for image in image_tags:
        #---------SAVE PATH + FILENAME (where it is downloading)
        filename = image['src']
        fileExt = filename.split('.')[-1]
        fileExt = fileExt[0:3] # crude: also truncates query-string residue like 'jpg?v=1'
        if fileExt in basicImgFileTypes:
            print 'File Extension:', fileExt
            filename = filename.replace('?', '_')
            filename = os.path.join(dir, filename.split('/')[-1])
            num = filename.find(fileExt) + len(fileExt)
            filename = filename[:num]
        else:
            filename = filename.replace('?', '_')
            filename = os.path.join(dir, filename.split('/')[-1]) + '.' + basicImgFileTypes[0]
        print 'File Saving:', filename
        #---------SAVE PATH + FILENAME (where it is downloading)
        #--------- FULL URL PATH OF THE IMG
        imageUrl = image['src']
        print 'IMAGE SRC:', imageUrl
        if imageUrl.find('http://') > -1 or imageUrl.find('https://') > -1:
            pass
        else:
            # src is relative: rebuild an absolute URL from the page's host
            # (note: slicing must drop the scheme, url[len('http://'):], not keep it)
            if url.find('http://') > -1:
                host = url[len('http://'):].split('/')[0]
                imageUrl = 'http://' + host + image['src']
            elif url.find('https://') > -1:
                host = url[len('https://'):].split('/')[0]
                imageUrl = 'https://' + host + image['src']
            else:
                imageUrl = image['src']
        print 'IMAGE URL:', imageUrl
        #--------- FULL URL PATH OF THE IMG
        #--------- TRY DOWNLOAD
        try:
            browser.retrieve(imageUrl, filename)
            print "Downloaded:", image['src'].split('/')[-1]
            print
        except (mechanize.HTTPError, mechanize.URLError) as e:
            print "Can't Download:", image['src'].split('/')[-1]
            print
        #--------- TRY DOWNLOAD
    browser.close()

DownloadIMGs('https://stackoverflow.com/questions/15593925/downloading-a-image-using-python-mechanize')
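Since the question's end goal was to set the downloaded image as wallpaper, here is a minimal sketch of that last step, assuming Windows and Python 3 (SystemParametersInfoW is the standard WinAPI call; on Linux or macOS you would shell out to your desktop environment instead):
import ctypes, os

SPI_SETDESKWALLPAPER = 20  # WinAPI constant

def set_wallpaper(path):
    # Windows only; the path must be absolute
    ctypes.windll.user32.SystemParametersInfoW(
        SPI_SETDESKWALLPAPER, 0, os.path.abspath(path), 3)

set_wallpaper('image_out.png')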

Related

How to create one folder to store images from a website scrape

I have written the following code to extract images for each product from a website scrape. I am very new to this and am unsure how to stop it creating a new folder for each product. Currently it creates a new folder called Whiteline Images inside the previous folder, also titled Whiteline Images. That's easy enough to fix manually when it's 5 products, but not so much when I change it to 500+! I know where in the code it's doing this, I'm just unsure how to fix it. Any help is appreciated!
import requests
from bs4 import BeautifulSoup
import os

def imagedown(url, folder):
    try:
        os.mkdir(os.path.join(os.getcwd(), folder))
    except:
        pass
    os.chdir(os.path.join(os.getcwd(), folder))
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    images = soup.findAll('img', {"src": True})
    for index, image in enumerate(images, start=1):
        if image.get('src').startswith('https://imageapi.partsdb.com.au/api/Image'):
            link = image.get('src')
            name = f'{soup.find("div", {"class": "head2BR"}).text} ({index})'
            with open(name + '.jpg', 'wb') as f:
                im = requests.get(link)
                f.write(im.content)
                print('Writing:', name)

imagedown('https://www.whiteline.com.au/product_detail4.php?part_number=KBR15', 'whiteline_images')
imagedown('https://www.whiteline.com.au/product_detail4.php?part_number=W13374', 'whiteline_images')
imagedown('https://www.whiteline.com.au/product_detail4.php?part_number=BMR98', 'whiteline_images')
imagedown('https://www.whiteline.com.au/product_detail4.php?part_number=W51210', 'whiteline_images')
imagedown('https://www.whiteline.com.au/product_detail4.php?part_number=W51211', 'whiteline_images')
Instead of changing directories, use os.path.join when writing the image to the directory:
import requests, os
from bs4 import BeautifulSoup

def imagedown(url, folder):
    if not os.path.isdir(folder): # cleaner to use os.path.isdir when checking for folder existence
        os.mkdir(folder)
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    for index, image in enumerate(soup.findAll('img', {"src": True}), start=1):
        if image.get('src').startswith('https://imageapi.partsdb.com.au/api/Image'):
            link = image.get('src')
            name = f'{soup.find("div", {"class": "head2BR"}).text} ({index})'
            with open(os.path.join(folder, name + '.jpg'), 'wb') as f: # join folder name to new image name
                im = requests.get(link)
                f.write(im.content)
Edit: updated solution:
def imagedown(url, folder):
    if not os.path.isdir(folder): # cleaner to use os.path.isdir when checking for folder existence
        os.mkdir(folder)
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    for i, a in enumerate(soup.select('img:is(.mainman, .thumbbot)'), 1):
        name = soup.select_one('div.head2BR').text + f'({i})'
        with open(os.path.join(folder, name + '.jpg'), 'wb') as f: # join folder name to new image name
            im = requests.get(a['src'])
            f.write(im.content)

imagedown('https://www.whiteline.com.au/product_detail4.php?part_number=KBR15', 'whiteline_images')
imagedown('https://www.whiteline.com.au/product_detail4.php?part_number=W13374', 'whiteline_images')
imagedown('https://www.whiteline.com.au/product_detail4.php?part_number=BMR98', 'whiteline_images')
imagedown('https://www.whiteline.com.au/product_detail4.php?part_number=W51210', 'whiteline_images')
imagedown('https://www.whiteline.com.au/product_detail4.php?part_number=W51211', 'whiteline_images')

How to put the image files I scraped using Beautiful soup into a list?

This is the code I used to take all the pics from r/pics on reddit and put them into a directory. I want to be able to take the actual files in the directory and put them into a list. I'm stuck on how to do this.
import requests
from bs4 import BeautifulSoup as bs
import os

url = "https://www.reddit.com/r/pics/"
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')
image_tags = soup.findAll('img')

if not os.path.exists('direct'):
    os.makedirs('direct')
os.chdir('direct')

x = 0
for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        if source.status_code == 200:
            img_path = 'direct-' + str(x) + '.jpg'
            with open(img_path, 'wb') as f:
                f.write(requests.get(url).content)
                f.close()
            x += 1
    except:
        pass
Edit: Here is the updated code, but I'm still dealing with the problem:
import requests
from bs4 import BeautifulSoup as bs
import os

url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')
image_tags = soup.findAll('img')

if not os.path.exists('directory'):
    os.makedirs('directory')
os.chdir('directory')

x = 0
mylist = []
for image in image_tags:
    url = image['src']
    source = requests.get(url)
    if source.status_code == 200:
        img_path = 'direct-' + str(x) + '.jpg'
        with open(img_path, 'wb') as f:
            f.write(requests.get(url).content)
            mylist.append(img_path)
            f.close()
        x += 1
print(mylist)
Create a list at the beginning of your code:
...
mylist = []
...
then after you get each image, add it to the list
...
img_path = 'direct-' + str(x) +'.jpg'
mylist.append(img_path)
....
EDIT:
I executed your updated code, and image_tags comes back empty. In fact, the page returned by
url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
doesn't contain any images. I guess reddit has some kind of protection to prevent you from fetching images this way. Try adding print(data) and you will see what I mean.
You should use the reddit API so that reddit doesn't limit your requests.
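As a sketch of that route (hedged: the /r/<subreddit>.json listing and the data/children/url fields below follow reddit's listing format as I know it; verify them against an actual response):
import requests

headers = {'User-Agent': 'my-image-scraper/0.1'}  # a descriptive UA helps avoid throttling
resp = requests.get('https://www.reddit.com/r/drawing.json', headers=headers)
posts = resp.json()['data']['children']
image_urls = [p['data']['url'] for p in posts
              if p['data']['url'].endswith(('.jpg', '.png', '.gif'))]
print(image_urls)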

How to extract and download all images from a website using beautifulSoup?

I am trying to extract and download all images from a URL. I wrote this script:
import urllib2
import re
from os.path import basename
from urlparse import urlsplit

url = "http://filmygyan.in/katrina-kaifs-top-10-cutest-pics-gallery/"
urlContent = urllib2.urlopen(url).read()
# HTML image tag: <img src="url" alt="some_text"/>
imgUrls = re.findall('img .*?src="(.*?)"', urlContent)

# download all images
for imgUrl in imgUrls:
    try:
        imgData = urllib2.urlopen(imgUrl).read()
        fileName = basename(urlsplit(imgUrl)[2])
        output = open(fileName, 'wb')
        output.write(imgData)
        output.close()
    except:
        pass
I don't want to extract only the image of this one page (see this screenshot: http://i.share.pho.to/1c9884b1_l.jpeg). I just want to get all the images without clicking on the "Next" button. How can I get all the pics within the "Next" class? What changes should I make to findall?
The following should extract all images from a given page and write them to the directory where the script is being run.
import re
import requests
from bs4 import BeautifulSoup

site = 'http://pixabay.com'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]

for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
    if not filename:
        print("Regex didn't match with the url: {}".format(url))
        continue
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative
            # if it is, provide the base url which also happens
            # to be the site variable atm.
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
Slight modification to Jonathan's answer (because I can't comment): adding 'www' to the website will fix most "File Type Not Supported" errors.
import re
import requests
from bs4 import BeautifulSoup

site = 'http://www.google.com'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]

for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
    if not filename:
        print("Regex didn't match with the url: {}".format(url))
        continue
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative
            # if it is, provide the base url which also happens
            # to be the site variable atm.
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
from bs4 import *
import requests
import os

def folder_create(images):
    try:
        folder_name = input("Enter Folder Name:- ")
        # folder creation
        os.mkdir(folder_name)
    except:
        print("Folder exists with that name!")
        folder_create(images)  # retry with a different name (the original call was missing its argument)
        return
    download_images(images, folder_name)

def download_images(images, folder_name):
    count = 0
    print(f"Total {len(images)} Images Found!")
    if len(images) != 0:
        for i, image in enumerate(images):
            # an <img> tag may carry its URL in any of these attributes
            try:
                image_link = image["data-srcset"]
            except:
                try:
                    image_link = image["data-src"]
                except:
                    try:
                        image_link = image["data-fallback-src"]
                    except:
                        try:
                            image_link = image["src"]
                        except:
                            continue  # no usable URL on this tag
            try:
                r = requests.get(image_link).content
                try:
                    # if the body decodes as UTF-8 it is text (e.g. an error page), not image data
                    r = str(r, 'utf-8')
                except UnicodeDecodeError:
                    with open(f"{folder_name}/images{i+1}.jpg", "wb+") as f:
                        f.write(r)
                    count += 1
            except:
                pass
        if count == len(images):
            print("All Images Downloaded!")
        else:
            print(f"Total {count} Images Downloaded Out of {len(images)}")

def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    images = soup.findAll('img')
    folder_create(images)

url = input("Enter URL:- ")
main(url)
If you want only the pictures, then you can just download them without even scraping the webpage. They all have the same URL pattern:
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute1.jpg
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute2.jpg
...
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute10.jpg
So code as simple as this will give you all the images:
import os
import urllib

baseUrl = "http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-"\
          "cutest-pics-gallery/cute%s.jpg"

for i in range(1, 11):
    url = baseUrl % i
    urllib.urlretrieve(url, os.path.basename(url))
With BeautifulSoup you will have to click or go to the next page to scrape the images. If you want to scrape each page individually, try scraping them using their class, which is shutterset_katrina-kaifs-top-10-cutest-pics-gallery.
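A sketch of that class-based approach, in the same Python 2 / BeautifulSoup style as above (assumption: the class really is carried by the gallery's <a> or <img> tags, and the URLs are absolute; inspect the page to confirm):
import os
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup

url = "http://filmygyan.in/katrina-kaifs-top-10-cutest-pics-gallery/"
soup = BeautifulSoup(urllib2.urlopen(url).read())
gallery_class = "shutterset_katrina-kaifs-top-10-cutest-pics-gallery"
for tag in soup.findAll(attrs={'class': gallery_class}):
    link = tag.get('href') or tag.get('src')  # the class may sit on <a> or <img>
    if link:
        urllib.urlretrieve(link, os.path.basename(link))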

Cannot get image

Why does the code below, which I got from here:
http://blog.yhathq.com/posts/image-classification-in-Python.html
give no results? Everything seems to be OK. It runs without error, but I get no images.
What am I doing wrong?
from bs4 import BeautifulSoup
import requests
import re
import urllib2
import os

def get_soup(url):
    return BeautifulSoup(requests.get(url).text)

image_type = "check"
query = "check"
url = "http://www.bing.com/images/search?q=" + query + \
    "&qft=+filterui:color2-bw+filterui:imagesize-large&FORM=R5IR3"
soup = get_soup(url)
images = [a['src'] for a in soup.find_all("img", {"src": re.compile("mm.bing.net")})]

for img in images:
    raw_img = urllib2.urlopen(img).read()
    cntr = len([i for i in os.listdir("images") if image_type in i]) + 1
    f = open("images/" + image_type + "_" + str(cntr), 'wb')
    f.write(raw_img)
    f.close()
I got a SyntaxError and changed the url line to:
url = ("http://www.bing.com/images/search?q=" + query +
       "&qft=+filterui:color2-bw+filterui:imagesize-large&FORM=R5IR3")
Then it just ran fine.
The attr "src" should be "src2". Try this code.
images = [a['src2'] for a in soup.find_all("img", {"src2": re.compile("mm.bing.net")})]

Need help with a Python scraper

I am trying to use urllib with Python to make a scraper. I can download the images, but they are thumbnails, 250x250 or less. (I am trying it on 4chan, because I like some of the picture threads.)
How can I get the full image?
Here is my code:
import urllib2, urllib
from BeautifulSoup import BeautifulSoup
import re
import urlparse

i = 0
ext = "'src' : re.compile(r'(jpe?g)|(png)|$'"
url = raw_input("Enter URL here:")
ender = raw_input("Enter File Type Here(For Images enter 'img'):")
if ender == "img":
    ender = 'img', {'src': re.compile(r'(.jpe?g)|(.png)|(.gif)$')}
else:
    if "." in ender:
        end = ender
    else:
        end = ".%s" % ender

raw = urllib.urlopen(url)
soup = BeautifulSoup(raw)
parse = list(urlparse.urlparse(url))
for ender in soup.findAll(ender):
    links = "%(src)s" % ender
    print links
    str(links)
    if ".jpg" in links:
        end = ".jpg"
    if ".jpeg" in links:
        end = ".jpeg"
    if ".gif" in links:
        end = ".gif"
    if ".png" in links:
        end = ".png"
    i += 1
    urllib.urlretrieve(links, "%s%s" % (i, end))
Because you can click through to see a larger image, the URL in the <a href="url"> that wraps the image tag points to the full image.
So just read the value of the href attribute, and download that instead of the src attribute of the image.
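A minimal sketch of that idea, in the same Python 2 / BeautifulSoup style as the question (assumption: every thumbnail <img> sits inside an <a> whose href is the full-size file; thread layouts vary, so verify on the actual page):
import os
import urllib
import urlparse
from BeautifulSoup import BeautifulSoup

url = raw_input("Enter thread URL: ")
soup = BeautifulSoup(urllib.urlopen(url))
for img in soup.findAll('img'):
    parent = img.parent
    if parent.name == 'a' and parent.get('href'):
        # the surrounding anchor points at the full-size image
        full = urlparse.urljoin(url, parent['href'])
        urllib.urlretrieve(full, os.path.basename(full))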
