Cannot get image - Python

Why does the code below, which I got from http://blog.yhathq.com/posts/image-classification-in-Python.html, give no result? Everything seems to be OK: it runs without error, but I get no images.
What am I doing wrong?
from bs4 import BeautifulSoup
import requests
import re
import urllib2
import os

def get_soup(url):
    return BeautifulSoup(requests.get(url).text)

image_type = "check"
query = "check"
url = "http://www.bing.com/images/search?q=" + query + \
    "&qft=+filterui:color2-bw+filterui:imagesize-large&FORM=R5IR3"

soup = get_soup(url)
images = [a['src'] for a in soup.find_all("img", {"src": re.compile("mm.bing.net")})]

for img in images:
    raw_img = urllib2.urlopen(img).read()
    cntr = len([i for i in os.listdir("images") if image_type in i]) + 1
    f = open("images/" + image_type + "_" + str(cntr), 'wb')
    f.write(raw_img)
    f.close()

I got a SyntaxError and changed the url line to:
url = ("http://www.bing.com/images/search?q=" + query +
"&qft=+filterui:color2-bw+filterui:imagesize-large&FORM=R5IR3")
Then it just ran fine.

The attr "src" should be "src2". Try this code.
images = [a['src2'] for a in soup.find_all("img", {"src2": re.compile("mm.bing.net")})]
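For reference, here is a minimal sketch of the full script with both fixes applied (the parenthesized url line and the src2 attribute). It assumes Bing still serves its thumbnails from mm.bing.net with an src2 attribute, which may have changed since the blog post was written:

from bs4 import BeautifulSoup
import requests
import re
import urllib2
import os

def get_soup(url):
    return BeautifulSoup(requests.get(url).text)

image_type = "check"
query = "check"
url = ("http://www.bing.com/images/search?q=" + query +
       "&qft=+filterui:color2-bw+filterui:imagesize-large&FORM=R5IR3")

soup = get_soup(url)
# the real thumbnail URL is carried in "src2", not "src"
images = [a['src2'] for a in soup.find_all("img", {"src2": re.compile("mm.bing.net")})]

# create the output directory if it does not exist yet
if not os.path.exists("images"):
    os.mkdir("images")

for img in images:
    raw_img = urllib2.urlopen(img).read()
    cntr = len([i for i in os.listdir("images") if image_type in i]) + 1
    with open("images/" + image_type + "_" + str(cntr), 'wb') as f:
        f.write(raw_img)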

Related

How to crawl images inside links of a page

I need the crawler to go to the links inside a website and scan the images there. I've managed to get this far, but I'm confused. I'm trying to do something like the following, but I'm sure there's going to be an easier way.
from bs4 import *
import requests as rq
import os
import sys
from urllib.parse import urlparse

page_url = sys.argv[1]
depth = int(sys.argv[2])

crawl = str(page_url)
r2 = rq.get('https://www.' + crawl + '/')
soup2 = BeautifulSoup(r2.text, "html.parser")

links = []
images = []
link_urls = soup2.select('a')

def url_validator(link):
    try:
        result = urlparse(link)
        return all([result.scheme, result.netloc])
    except:
        return False

def crawl_images(link):
    requested_link = rq.get(link)
    images = BeautifulSoup(requested_link.text, "html.parser")
    image = images.select('img')
    for img in image:
        print(img['src'])
        return img['src']

for link_url in link_urls[:depth]:
    links.append(link_url['href'])

for link in links:
    # print(link)
    if url_validator(link):
        crawl_images(link)
I run python3 new_crawler.py imdb.com 3, which should print the sources of the images crawled from 3 links inside imdb.com, but it doesn't print anything.
You want to crawl through the images, correct? Try this:
from bs4 import BeautifulSoup
import requests as rq

URL = ""
source = rq.get(URL)
soup = BeautifulSoup(source.text, "html.parser")

image_links = soup.find_all("img")
for img in image_links:
    print(img['src'])
Add the URL of the website you are trying to scrape to the URL constant. All of the page's img tags should then be collected in the image_links variable.
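One caveat: img src values are often relative paths, which will not print as usable URLs (and will fail if you try to download them directly). A small sketch, assuming Python 3, that resolves them against the page URL with urllib.parse.urljoin:

from urllib.parse import urljoin

for img in image_links:
    src = img.get('src')          # .get() avoids a KeyError on tags without src
    if src:
        print(urljoin(URL, src))  # resolves relative and protocol-relative paths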
This is what I ended up with. It's not working how it's supposed to, but the time for the task is up and I decided to share it anyway.
from bs4 import *
import requests as rq
import sys
from urllib.parse import urlparse
import json

page_url = sys.argv[1]
depth = int(sys.argv[2])

crawl = str(page_url)
r2 = rq.get('https://www.' + crawl + '/')
soup2 = BeautifulSoup(r2.text, "html.parser")

link_urls = soup2.select('a')
links = []
images_sources = []

def url_validator(link):
    try:
        result = urlparse(link)
        return all([result.scheme, result.netloc])
    except:
        return False

def crawl_images(link):
    requested_link = rq.get(link)
    images = BeautifulSoup(requested_link.text, "html.parser")
    image = images.select('img')
    for img in image:
        images_sources.append(img['src'])
        results = {
            "imageUrl": img['src'],
            "sourceUrl": link,
            "depth": depth
        }
        json_object = json.dumps(results)
        with open("results.json", "w") as f:
            f.write(json_object)
    return results

for link_url in link_urls[:depth]:
    links.append(link_url['href'])

for link in links:
    if url_validator(link):
        crawl_images(link)
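One likely reason it "isn't working how it's supposed to": results.json is reopened in "w" mode for every image, so each write overwrites the previous one and only the last record survives. A sketch of one way around this, collecting the records in a list and dumping them once after the crawl (everything else unchanged):

all_results = []

def crawl_images(link):
    requested_link = rq.get(link)
    images = BeautifulSoup(requested_link.text, "html.parser")
    for img in images.select('img'):
        src = img.get('src')
        if src:
            all_results.append({
                "imageUrl": src,
                "sourceUrl": link,
                "depth": depth,
            })

for link in links:
    if url_validator(link):
        crawl_images(link)

# one write at the end instead of one per image
with open("results.json", "w") as f:
    json.dump(all_results, f, indent=2)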

How to put the image files I scraped using Beautiful Soup into a list?

This is the code I used to take all the pics from r/pics on reddit and put them into a directory. I want to be able to take the actual files in the directory and put them into a list, but I'm stuck on how to do this.
import requests
from bs4 import BeautifulSoup as bs
import os

url = "https://www.reddit.com/r/pics/"
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')
image_tags = soup.findAll('img')

if not os.path.exists('direct'):
    os.makedirs('direct')
os.chdir('direct')
x = 0

for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        if source.status_code == 200:
            img_path = 'direct-' + str(x) + '.jpg'
            with open(img_path, 'wb') as f:
                f.write(requests.get(url).content)
            x += 1
    except:
        pass
Edit: Here is the updated code, but I'm still dealing with the same problem.
import requests
from bs4 import BeautifulSoup as bs
import os

url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')
image_tags = soup.findAll('img')

if not os.path.exists('directory'):
    os.makedirs('directory')
os.chdir('directory')
x = 0
mylist = []

for image in image_tags:
    url = image['src']
    source = requests.get(url)
    if source.status_code == 200:
        img_path = 'direct-' + str(x) + '.jpg'
        with open(img_path, 'wb') as f:
            f.write(requests.get(url).content)
            mylist.append(img_path)
        x += 1

print(mylist)
Create a list at the beginning of your code:

...
mylist = []
...

Then, after you get each image, add it to the list:

...
img_path = 'direct-' + str(x) + '.jpg'
mylist.append(img_path)
...
EDIT:
I executed your updated code and image_tags comes back empty. In fact, the page returned by

url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text

doesn't contain any images. I guess reddit has some kind of protection to prevent you from fetching images this way. Try adding print(data) and you will see what I mean.
You should use the reddit API so that reddit doesn't limit your requests.
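For example, a minimal sketch using PRAW (assuming a recent PRAW version; the client id, secret, and user agent strings below are placeholders you get by registering an app at https://www.reddit.com/prefs/apps):

import praw

reddit = praw.Reddit(client_id="YOUR_CLIENT_ID",          # placeholder
                     client_secret="YOUR_CLIENT_SECRET",  # placeholder
                     user_agent="image scraper by u/yourname")

image_urls = []
for submission in reddit.subreddit("drawing").hot(limit=25):
    # keep only direct links to image files
    if submission.url.endswith((".jpg", ".png", ".gif")):
        image_urls.append(submission.url)

print(image_urls)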

image_url = soup.select('.image a')[0]['href'] IndexError: list index out of range

I used part of this code (https://github.com/asweigart/imgur-hosted-reddit-posted-downloader/blob/master/imgur-hosted-reddit-posted-downloader.py) to retrieve images from imgur when the URL doesn't end in an image extension like .png/.jpg. However, I am getting the error shown below. Please have a look and suggest fixes:
import datetime
import praw
import re
import urllib
import requests
from bs4 import BeautifulSoup

sub = 'dog'
imgurUrlPattern = re.compile(r'(http://i.imgur.com/(.*))(\?.*)?')
r = praw.Reddit(user_agent="download all images from a subreddit",
                user_site="lamiastella")
already_done = []
#checkWords = ['i.imgur.com', 'jpg', 'png',]
check_words = ['jpg', 'png']

subreddit = r.get_subreddit(sub)
for submission in subreddit.get_hot(limit=10000):
    is_image = any(string in submission.url for string in check_words)
    print '[LOG] Getting url: ' + submission.url
    if submission.id not in already_done and is_image:
        if submission.url.endswith('/'):
            modified_url = submission.url[:len(submission.url) - 1]
            try:
                urllib.urlretrieve(modified_url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + modified_url[-4:])
            except IOError:
                pass
        else:
            try:
                urllib.urlretrieve(submission.url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + submission.url[-4:])
            except IOError:
                pass
        already_done.append(submission.id)
        print '[LOG] Done Getting ' + submission.url
        print('{0}: {1}'.format('submission id is', submission.id))
    elif 'http://imgur.com/' in submission.url:
        # This is an Imgur page with a single image.
        html_source = requests.get(submission.url).text  # download the image's page
        soup = BeautifulSoup(html_source, "lxml")
        image_url = soup.select('.image a')[0]['href']
        if image_url.startswith('//'):
            # if no schema is supplied in the url, prepend 'http:' to it
            image_url = 'http:' + image_url
        image_id = image_url[image_url.rfind('/') + 1:image_url.rfind('.')]
        urllib.urlretrieve(image_url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + image_url[-4:])
The error is:
[LOG] Getting url: http://imgur.com/a/yOLjm
Traceback (most recent call last):
File "download_images.py", line 43, in <module>
image_url = soup.select('.image a')[0]['href']
IndexError: list index out of range
You have the wrong selector; you need soup.select('img')[0]['src']:
import datetime
import urllib
import requests
from bs4 import BeautifulSoup

url = 'http://imgur.com/gyUWtWP'
html_source = requests.get(url).text
soup = BeautifulSoup(html_source, "lxml")

image_url = soup.select('img')[0]['src']
if image_url.startswith('//'):
    image_url = 'http:' + image_url
image_id = image_url[image_url.rfind('/') + 1:image_url.rfind('.')]
urllib.urlretrieve(image_url, datetime.datetime.now().strftime('%y-%m-%d-%s') + image_url[-4:])
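Note that the URL in your traceback, http://imgur.com/a/yOLjm, is an album (/a/) rather than a single-image page, so a selector that expects exactly one match can come up empty. A hedged sketch, reusing the imports above, that simply loops over every img on the page instead (imgur's markup may have changed since this was written):

html_source = requests.get('http://imgur.com/a/yOLjm').text
soup = BeautifulSoup(html_source, "lxml")

for img in soup.select('img'):
    image_url = img.get('src')
    if not image_url:
        continue
    if image_url.startswith('//'):
        # no schema supplied in the url, prepend 'http:'
        image_url = 'http:' + image_url
    urllib.urlretrieve(image_url, image_url.split('/')[-1])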

How to extract and download all images from a website using beautifulSoup?

I am trying to extract and download all images from a URL. I wrote a script:
import urllib2
import re
from os.path import basename
from urlparse import urlsplit

url = "http://filmygyan.in/katrina-kaifs-top-10-cutest-pics-gallery/"
urlContent = urllib2.urlopen(url).read()

# HTML image tag: <img src="url" alt="some_text"/>
imgUrls = re.findall('img .*?src="(.*?)"', urlContent)

# download all images
for imgUrl in imgUrls:
    try:
        imgData = urllib2.urlopen(imgUrl).read()
        fileName = basename(urlsplit(imgUrl)[2])
        output = open(fileName, 'wb')
        output.write(imgData)
        output.close()
    except:
        pass
I don't want to extract just the image shown on this page (see this screenshot: http://i.share.pho.to/1c9884b1_l.jpeg); I want to get all the images without clicking on the "Next" button. How can I get all the pics within the "Next" class? What changes should I make to findall?
The following should extract all images from a given page and write them to the directory where the script is run.
import re
import requests
from bs4 import BeautifulSoup

site = 'http://pixabay.com'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')

urls = [img['src'] for img in img_tags]

for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
    if not filename:
        print("Regex didn't match with the url: {}".format(url))
        continue
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative;
            # if it is, provide the base url (which also happens
            # to be the site variable atm)
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
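As a refinement, 'http' not in url is a fragile test for relative sources (it also misses protocol-relative //... URLs). A sketch of the same loop using urllib.parse.urljoin instead, assuming Python 3:

from urllib.parse import urljoin

for url in urls:
    url = urljoin(site, url)  # resolves relative and protocol-relative sources
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
    if not filename:
        print("Regex didn't match with the url: {}".format(url))
        continue
    response = requests.get(url)
    with open(filename.group(1), 'wb') as f:
        f.write(response.content)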
Slight modification to Jonathan's answer (because I can't comment): adding 'www' to the website will fix most "File Type Not Supported" errors.
import re
import requests
from bs4 import BeautifulSoup

site = 'http://www.google.com'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')

urls = [img['src'] for img in img_tags]

for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
    if not filename:
        print("Regex didn't match with the url: {}".format(url))
        continue
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative;
            # if it is, provide the base url (which also happens
            # to be the site variable atm)
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
from bs4 import *
import requests
import os

def folder_create(images):
    try:
        folder_name = input("Enter Folder Name:- ")
        # folder creation
        os.mkdir(folder_name)
    except:
        # a folder with that name already exists; ask again
        print("Folder exists with that name!")
        folder_create(images)
        return
    download_images(images, folder_name)

def download_images(images, folder_name):
    count = 0
    print(f"Total {len(images)} Images Found!")
    if len(images) != 0:
        for i, image in enumerate(images):
            # the real URL may sit in one of several lazy-loading attributes
            try:
                image_link = image["data-srcset"]
            except:
                try:
                    image_link = image["data-src"]
                except:
                    try:
                        image_link = image["data-fallback-src"]
                    except:
                        try:
                            image_link = image["src"]
                        except:
                            pass
            try:
                r = requests.get(image_link).content
                try:
                    # if the content decodes as text, it is not image data
                    r = str(r, 'utf-8')
                except UnicodeDecodeError:
                    with open(f"{folder_name}/images{i + 1}.jpg", "wb+") as f:
                        f.write(r)
                    count += 1
            except:
                pass
    if count == len(images):
        print("All Images Downloaded!")
    else:
        print(f"Total {count} Images Downloaded Out of {len(images)}")

def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    images = soup.findAll('img')
    folder_create(images)

url = input("Enter URL:- ")
main(url)
If you want only the pictures, then you can just download them without even scraping the webpage. They all have the same URL pattern:
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute1.jpg
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute2.jpg
...
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute10.jpg
So code as simple as this will give you all the images:
import os
import urllib

baseUrl = "http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-"\
          "cutest-pics-gallery/cute%s.jpg"

for i in range(1, 11):
    url = baseUrl % i
    urllib.urlretrieve(url, os.path.basename(url))
With BeautifulSoup you will have to click or go to the next page to scrape the images. If you want to scrape each page individually, try to scrape them using their class, which is shutterset_katrina-kaifs-top-10-cutest-pics-gallery.
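For instance, a hedged sketch of that class-based approach (it assumes the gallery anchors really carry that class and that their href points at the full-size image; the site's markup may differ):

import urllib2
from bs4 import BeautifulSoup

url = "http://filmygyan.in/katrina-kaifs-top-10-cutest-pics-gallery/"
soup = BeautifulSoup(urllib2.urlopen(url).read())

# each gallery anchor should link to one full-size picture
for a in soup.find_all('a', {'class': 'shutterset_katrina-kaifs-top-10-cutest-pics-gallery'}):
    print a['href']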

Downloading an image using Python Mechanize

I'm trying to write a Python script to download an image and set it as my wallpaper. Unfortunately, the Mechanize documentation is quite poor. My script follows the link correctly, but I'm having a hard time actually saving the image on my computer. From what I've researched, the .retrieve() method should do the work, but how do I specify the path the file should be downloaded to? Here is what I have...
def followLink(browser, fixedLink):
    browser.open(fixedLink)
    if browser.find_link(url_regex=r'1600x1200'):
        browser.follow_link(url_regex=r'1600x1200')
    elif browser.find_link(url_regex=r'1400x1050'):
        browser.follow_link(url_regex=r'1400x1050')
    elif browser.find_link(url_regex=r'1280x960'):
        browser.follow_link(url_regex=r'1280x960')
    return
import mechanize, os
from BeautifulSoup import BeautifulSoup

browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
    filename = image['src'].lstrip('http://')
    filename = os.path.join(dir, filename.replace('/', '_'))
    data = browser.open(image['src']).read()
    browser.back()
    save = open(filename, 'wb')
    save.write(data)
    save.close()
This can help you download all the images from a web page. As for parsing HTML, you'd better use BeautifulSoup or lxml. Downloading is just reading the data and then writing it to a local file. You should assign your own value to dir; it is where your images will be saved.
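For instance, a minimal setup before the loop above (the folder name and page URL are placeholders):

import os

dir = 'wallpapers'            # any folder you like
if not os.path.exists(dir):
    os.makedirs(dir)          # make sure it exists before files are written into it
url = 'http://example.com/'   # the page you want to pull images from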
Not sure why this solution hasn't come up, but you can use the mechanize.Browser.retrieve function as well. Perhaps this only works in newer versions of mechanize and has thus not been mentioned?
Anyway, if you wanted to shorten the answer by zhangyangyu, you could do this:
import mechanize, os
from BeautifulSoup import BeautifulSoup

browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
    filename = image['src'].lstrip('http://')
    filename = os.path.join(dir, filename.replace('/', '_'))
    browser.retrieve(image['src'], filename)
    browser.back()
Also keep in mind that you'll likely want to put all of this into a try except block like this one:
import mechanize, os
from BeautifulSoup import BeautifulSoup

browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
    filename = image['src'].lstrip('http://')
    filename = os.path.join(dir, filename.replace('/', '_'))
    try:
        browser.retrieve(image['src'], filename)
        browser.back()
    except (mechanize.HTTPError, mechanize.URLError) as e:
        pass
        # Use e.code and e.read() with HTTPError
        # Use e.reason.args with URLError
Of course you'll want to adjust this to your needs. Perhaps you want it to bomb out if it encounters an issue. It totally depends on what you want to achieve.
You can get/download the image by opening the URL of the img src:

image_response = browser.open_novisit(img['src'])

To save the file, just use open:

with open('image_out.png', 'wb') as f:
    f.write(image_response.read())
It's really crappy, but it "works" for me, building on 0xc0000022l's answer:
import mechanize, os
from BeautifulSoup import BeautifulSoup
import urllib2

def DownloadIMGs(url):  # IMPORTANT: URL WITH HTTP OR HTTPS
    print "From", url
    dir = 'F:\Downloadss'  # Dir for Downloads
    basicImgFileTypes = ['png', 'bmp', 'cur', 'ico', 'gif', 'jpg', 'jpeg', 'psd', 'raw', 'tif']
    browser = mechanize.Browser()
    html = browser.open(url)
    soup = BeautifulSoup(html)
    image_tags = soup.findAll('img')
    print "N Images:", len(image_tags)
    print

    #--------- SAVE PATH: create the download folder if needed
    if not os.path.exists(dir):
        os.makedirs(dir)

    for image in image_tags:
        #--------- SAVE PATH + FILENAME (where it is downloading)
        filename = image['src']
        fileExt = filename.split('.')[-1]
        fileExt = fileExt[0:3]
        if fileExt in basicImgFileTypes:
            print 'File Extension:', fileExt
            filename = filename.replace('?', '_')
            filename = os.path.join(dir, filename.split('/')[-1])
            num = filename.find(fileExt) + len(fileExt)
            filename = filename[:num]
        else:
            filename = filename.replace('?', '_')
            filename = os.path.join(dir, filename.split('/')[-1]) + '.' + basicImgFileTypes[0]
        print 'File Saving:', filename

        #--------- FULL URL PATH OF THE IMG
        imageUrl = image['src']
        print 'IMAGE SRC:', imageUrl
        if imageUrl.find('http://') > -1 or imageUrl.find('https://') > -1:
            pass
        else:
            # relative src: rebuild an absolute URL from the page's domain
            if url.find('http://') > -1:
                imageUrl = url[len('http://'):]
                imageUrl = 'http://' + imageUrl.split('/')[0] + image['src']
            elif url.find('https://') > -1:
                imageUrl = url[len('https://'):]
                imageUrl = 'https://' + imageUrl.split('/')[0] + image['src']
            else:
                imageUrl = image['src']
        print 'IMAGE URL:', imageUrl

        #--------- TRY DOWNLOAD
        try:
            browser.retrieve(imageUrl, filename)
            print "Downloaded:", image['src'].split('/')[-1]
            print
        except (mechanize.HTTPError, mechanize.URLError) as e:
            print "Can't Download:", image['src'].split('/')[-1]
            print
            pass

    browser.close()

DownloadIMGs('https://stackoverflow.com/questions/15593925/downloading-a-image-using-python-mechanize')
