How to extract and download all images from a website using beautifulSoup?

How to extract and download all images from a website using beautifulSoup? - python

I am trying to extract and download all images from a url.
I wrote a script
import urllib2
import re
from os.path import basename
from urlparse import urlsplit
url = "http://filmygyan.in/katrina-kaifs-top-10-cutest-pics-gallery/"
urlContent = urllib2.urlopen(url).read()
# HTML image tag: <img src="url" alt="some_text"/>
imgUrls = re.findall('img .*?src="(.*?)"', urlContent)
# download all images
for imgUrl in imgUrls:
try:
imgData = urllib2.urlopen(imgUrl).read()
fileName = basename(urlsplit(imgUrl)[2])
output = open(fileName,'wb')
output.write(imgData)
output.close()
except:
pass
I don't want to extract image of this page see this image http://i.share.pho.to/1c9884b1_l.jpeg
I just want to get all the images without clicking on "Next" button
I am not getting how can I get the all pics within "Next" class.?What changes I should do in findall?

The following should extract all images from a given page and write it to the directory where the script is being run.
import re
import requests
from bs4 import BeautifulSoup
site = 'http://pixabay.com'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]
for url in urls:
filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
if not filename:
print("Regex didn't match with the url: {}".format(url))
continue
with open(filename.group(1), 'wb') as f:
if 'http' not in url:
# sometimes an image source can be relative
# if it is provide the base url which also happens
# to be the site variable atm.
url = '{}{}'.format(site, url)
response = requests.get(url)
f.write(response.content)

Slight modification to Jonathan's answer (because I can't comment): adding 'www' to the website will fix most "File Type Not Supported" errors.
import re
import requests
from bs4 import BeautifulSoup
site = 'http://www.google.com'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]
for url in urls:
filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
if not filename:
print("Regex didn't match with the url: {}".format(url))
continue
with open(filename.group(1), 'wb') as f:
if 'http' not in url:
# sometimes an image source can be relative
# if it is provide the base url which also happens
# to be the site variable atm.
url = '{}{}'.format(site, url)
response = requests.get(url)
f.write(response.content)

from bs4 import *
import requests
import os
def folder_create(images):
try:
folder_name = input("Enter Folder Name:- ")
# folder creation
os.mkdir(folder_name)
except:
print("Folder Exist with that name!")
folder_create()
download_images(images, folder_name)
def download_images(images, folder_name):
count = 0
print(f"Total {len(images)} Image Found!")
if len(images) != 0:
for i, image in enumerate(images):
try:
image_link = image["data-srcset"]
except:
try:
image_link = image["data-src"]
except:
try:
image_link = image["data-fallback-src"]
except:
try:
image_link = image["src"]
except:
pass
try:
r = requests.get(image_link).content
try:
# possibility of decode
r = str(r, 'utf-8')
except UnicodeDecodeError:
with open(f"{folder_name}/images{i+1}.jpg", "wb+") as f:
f.write(r)
count += 1
except:
pass
if count == len(images):
print("All Images Downloaded!")
else:
print(f"Total {count} Images Downloaded Out of {len(images)}")
def main(url):
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
images = soup.findAll('img')
folder_create(images)
url = input("Enter URL:- ")
main(url)`

If you want only pictures then you can just download them without even scrapping the webpage. The all have the same URL:
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute1.jpg
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute2.jpg
...
http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-cutest-pics-gallery/cute10.jpg
So simple code as that will give you all images:
import os
import urllib
import urllib2
baseUrl = "http://filmygyan.in/wp-content/gallery/katrina-kaifs-top-10-"\
"cutest-pics-gallery/cute%s.jpg"
for i in range(1,11):
url = baseUrl % i
urllib.urlretrieve(url, os.path.basename(url))
With Beautifulsoup you will have to click or go to the next page to scrap the images. If you want ot scrap each page individually try to scrathem using there class which is shutterset_katrina-kaifs-top-10-cutest-pics-gallery

Related

How to crawl images inside links of a page

I need the crawler to go to the links inside a website and scan images there. I've managed to get this far but I'm confused.
I'm trying to do something like this but I'm sure there's gonna be an easier way.
from bs4 import *
import requests as rq
import os
import sys
from urllib.parse import urlparse
page_url = sys.argv[1]
depth = int(sys.argv[2])
crawl = str(page_url)
r2 = rq.get('https://www.' + crawl + '' + '/')
soup2 = BeautifulSoup(r2.text, "html.parser")
links = []
images = []
link_urls = soup2.select('a')
def url_validator(link):
try:
result = urlparse(link)
return all([result.scheme, result.netloc])
except:
return False
def crawl_images(link):
requested_link = rq.get(link)
images = BeautifulSoup(requested_link.text, "html.parser")
image = images.select('img')
for img in image:
print(img['src'])
return img['src']
for link_url in link_urls[:depth]:
links.append(link_url['href'])
for link in links:
# print(link)
if url_validator(link):
crawl_images(link)
I try python3 new_crawler.py imdb.com 3 which should print sources of images crawled in 3 links inside imdb.com but it's not printing anything.

You want to crawl through the images, correct? Try this:
from bs4 import BeautifulSoup
import requests as rq
URL = ""
source = rq.get(URL)
soup = BeautifulSoup(source.text, "html.parser")
image_links = soup.find_all("img")
for img in image_links:
print(img['src'])
Add the website's url to the constant URL that you are trying to scrap. The page's img tags should all be saved in the image_links variable.

This is what I ended up with. It's not working how it's supposed to but the time for the task is up and I decided to share anyway.
from bs4 import *
import requests as rq
import sys
from urllib.parse import urlparse
import json
page_url = sys.argv[1]
depth = int(sys.argv[2])
crawl = str(page_url)
r2 = rq.get('https://www.' + crawl + '' + '/')
soup2 = BeautifulSoup(r2.text, "html.parser")
link_urls = soup2.select('a')
links = []
images_sources = []
def url_validator(link):
try:
result = urlparse(link)
return all([result.scheme, result.netloc])
except:
return False
def crawl_images(link):
requested_link = rq.get(link)
images = BeautifulSoup(requested_link.text, "html.parser")
image = images.select('img')
for img in image:
images_sources.append(img['src'])
results = {
"imageUrl": img['src'],
"sourceUrl": link,
"depth": depth
}
json_object = json.dumps(results)
with open("results.json", "w") as f:
f.write(json_object)
f.close()
return results
for link_url in link_urls[:depth]:
links.append(link_url['href'])
for link in links:
if url_validator(link):
crawl_images(link)

How to make this program use instagram pic urls and download? [duplicate]

This question already has answers here:
Download large file in python with requests
(8 answers)
Closed 2 years ago.
The goal is for the program to take user given instagram url and allow to download and save a picture.
I've got the main part in place but cant understand how I can go further and use the filtered and right url to download and save the picture on my computer.
My code so far:
EDIT: I added a download line but can't seem to get the right file type? I mean it saves as whatever I want it to but I can't open it:
import requests
import re
import shutil
def get_response(url):
r = requests.get(url)
while r.status_code != 200:
r.raw.decode_content = True
r = requests.get(url, stream = True)
return r.text
def prepare_urls(matches):
return list({match.replace("\\u0026", "&") for match in matches})
url = input('Enter Instagram URL: ')
response = get_response(url)
vid_matches = re.findall('"video_url":"([^"]+)"', response)
pic_matches = re.findall('"display_url":"([^"]+)"', response)
vid_urls = prepare_urls(vid_matches)
pic_urls = prepare_urls(pic_matches)
if vid_urls:
print('Detected Videos:\n{0}'.format('\n'.join(vid_urls)))
print("Can't download video, the provided URL must be of a picture.")
if pic_urls:
print('Detected Pictures:\n{0}'.format('\n'.join(pic_urls)))
from urllib.request import urlretrieve
dst = 'Instagram picture.jpg'
urlretrieve(url, dst)
#EDIT ^
if not (vid_urls or pic_urls):
print('Could not recognize the media in the provided URL.')

I think this might help...
import requests
from bs4 import BeautifulSoup as bs
import json
import os.path
insta_url = 'https://www.instagram.com'
inta_username = input('enter username of instagram : ')
response = requests.get(f"{insta_url}/{inta_username}/")
if response.ok:
html = response.text
bs_html = bs(html, features="lxml")
bs_html = bs_html.text
index = bs_html.find('profile_pic_url_hd')+21
remaining_text = bs_html[index:]
remaining_text_index = remaining_text.find('requested_by_viewer')-3
string_url = remaining_text[:remaining_text_index].replace("\\u0026", "&")
print(string_url, "\ndownloading...")
while True:
filename = 'pic_ins.jpg'
file_exists = os.path.isfile(filename)
if not file_exists:
with open(filename, 'wb+') as handle:
response = requests.get(string_url, stream=True)
if not response.ok:
print(response)
for block in response.iter_content(1024):
if not block:
break
handle.write(block)
else:
continue
break
print("completed")
You can change the name of the image downloaded by changing the filename variable

Extract .jpg from HTML source code with Python

I set up this code to extract the links from the following website. The problem is that it breaks into register 19 and doesn't continue with the listing.
You can help me.
import urllib.request
import os
tematica = 'fun'
url = "https://www.shutterstock.com/es/search/" + tematica
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
data_content = response.read()
Html_file= open("html_file.html","wb")
Html_file.write(data_content)
Html_file.close()
html=codecs.open("html_file.html", 'r', 'utf-8').read()
soup = BeautifulSoup(html)
for i,img_element in enumerate(soup.findAll('img', None)):
try:
img_src = img_element['src']
print(i,img_src)
except:
pass

How to put the image files I scraped using Beautiful soup into a list?

This is the code I used to take all the pics from r/pics on reddit and put it into a directory. I want to be able to take the actual files in the directory and put it into a list. Stuck on how to do this.
import requests
from bs4 import BeautifulSoup as bs
import os
url = "https://www.reddit.com/r/pics/"
r = requests.get(url)
data = r.text
soup = bs(data,'lxml')
image_tags = soup.findAll('img')
if not os.path.exists('direct'):
os.makedirs('direct')
os.chdir('direct')
x = 0
for image in image_tags:
try:
url = image['src']
source = requests.get(url)
if source.status_code == 200:
img_path = 'direct-' + str(x) +'.jpg'
with open(img_path, 'wb') as f:
f.write(requests.get(url).content)
f.close()
x+=1
except:
pass
Edit: Here is updated code but still dealing with problem
import requests
from bs4 import BeautifulSoup as bs
import os
url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
soup = bs(data,'lxml')
image_tags = soup.findAll('img')
if not os.path.exists('directory'):
os.makedirs('directory')
os.chdir('directory')
x = 0
mylist = []
for image in image_tags:
url = image['src']
source = requests.get(url)
if source.status_code == 200:
img_path = 'direct-' + str(x) +'.jpg'
with open(img_path, 'wb') as f:
f.write(requests.get(url).content)
mylist.append(img_path)
f.close()
x += 1
print(mylist)

create a list in the beginning of your code:
...
mylist = []
...
then after you get each image, add it to the list
...
img_path = 'direct-' + str(x) +'.jpg'
mylist.append(img_path)
....
EDIT:
I executed your updated code and the image_tags is returning empty - in fact the page returned by
url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
Doesn't contain any images. I guess reddit has some kind of protection to prevent you from fetching images this way.
Try adding print(data) and you will see what I mean
You should use the reddit api so that reddit doesn't limit your requests.

Downloading a image using Python Mechanize

I'm trying to write a Python script to download a image and set it as my wallpaper. Unfortunately, the Mechanize documentation is quite poor. My script is following the link correctly, but I'm having a hard time to actually save the image on my computer. From what I researched, the .retrieve() method should do the work, but How do I specify the path to where the file should be downloaded to? Here is what I have...
def followLink(browser, fixedLink):
browser.open(fixedLink)
if browser.find_link(url_regex = r'1600x1200'):
browser.follow_link(url_regex = r'1600x1200')
elif browser.find_link(url_regex = r'1400x1050'):
browser.follow_link(url_regex = r'1400x1050')
elif browser.find_link(url_regex = r'1280x960'):
browser.follow_link(url_regex = r'1280x960')
return

import mechanize, os
from BeautifulSoup import BeautifulSoup
browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
filename = image['src'].lstrip('http://')
filename = os.path.join(dir, filename.replace('/', '_'))
data = browser.open(image['src']).read()
browser.back()
save = open(filename, 'wb')
save.write(data)
save.close()
This can help you download all the images from a web page. As for parsing html you'd better use BeautifulSoup or lxml. And download is just read the data and then write it to a local file. You should assign your own value to dir. It is where you images exist.

Not sure why this solution hasn't come up, but you can use the mechanize.Browser.retrieve function as well. Perhaps this only works in newer versions of mechanize and has thus not been mentioned?
Anyway, if you wanted to shorten the answer by zhangyangyu, you could do this:
import mechanize, os
from BeautifulSoup import BeautifulSoup
browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
filename = image['src'].lstrip('http://')
filename = os.path.join(dir, filename.replace('/', '_'))
browser.retrieve(image['src'], filename)
browser.back()
Also keep in mind that you'll likely want to put all of this into a try except block like this one:
import mechanize, os
from BeautifulSoup import BeautifulSoup
browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
filename = image['src'].lstrip('http://')
filename = os.path.join(dir, filename.replace('/', '_'))
try:
browser.retrieve(image['src'], filename)
browser.back()
except (mechanize.HTTPError,mechanize.URLError) as e:
pass
# Use e.code and e.read() with HTTPError
# Use e.reason.args with URLError
Of course you'll want to adjust this to your needs. Perhaps you want it to bomb out if it encounters an issue. It totally depends on what you want to achieve.

You can get/download the image by opening the url of the img src.
image_response = browser.open_novisit(img['src'])
to save the file now, just use fopen:
with open('image_out.png', 'wb') as f:
f.write(image_response.read())

It's really crappy but It "works" for me, with 0xc0000022l anwer's
import mechanize, os
from BeautifulSoup import BeautifulSoup
import urllib2
def DownloadIMGs(url): # IMPORTANT URL WITH HTTP OR HTTPS
print "From", url
dir = 'F:\Downloadss' #Dir for Downloads
basicImgFileTypes = ['png','bmp','cur','ico','gif','jpg','jpeg','psd','raw','tif']
browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
print "N Images:", len(image_tags)
print
#---------SAVE PATH
#check if available
if not os.path.exists(dir):
os.makedirs(dir)
#---------SAVE PATH
for image in image_tags:
#---------SAVE PATH + FILENAME (Where It is downloading)
filename = image['src']
fileExt = filename.split('.')[-1]
fileExt = fileExt[0:3]
if (fileExt in basicImgFileTypes):
print 'File Extension:', fileExt
filename = filename.replace('?', '_')
filename = os.path.join(dir, filename.split('/')[-1])
num = filename.find(fileExt) + len(fileExt)
filename = filename[:num]
else:
filename = filename.replace('?', '_')
filename = os.path.join(dir, filename.split('/')[-1]) + '.' + basicImgFileTypes[0]
print 'File Saving:', filename
#---------SAVE PATH + FILENAME (Where It is downloading)
#--------- FULL URL PATH OF THE IMG
imageUrl = image['src']
print 'IMAGE SRC:', imageUrl
if (imageUrl.find('http://') > -1 or imageUrl.find('https://') > -1):
pass
else:
if (url.find('http://') > -1):
imageUrl = url[:len('http://')]
imageUrl = 'http://' + imageUrl.split('/')[0] + image['src']
elif(url.find('https://') > -1):
imageUrl = url[:len('https://')]
imageUrl = 'https://' + imageUrl.split('/')[0] + image['src']
else:
imageUrl = image['src']
print 'IMAGE URL:', imageUrl
#--------- FULL URL PATH OF THE IMG
#--------- TRY DOWNLOAD
try:
browser.retrieve(imageUrl, filename)
print "Downloaded:", image['src'].split('/')[-1]
print
except (mechanize.HTTPError,mechanize.URLError) as e:
print "Can't Download:", image['src'].split('/')[-1]
print
pass
#--------- TRY DOWNLOAD
browser.close()
DownloadIMGs('https://stackoverflow.com/questions/15593925/downloading-a-image-using-python-mechanize')

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to extract and download all images from a website using beautifulSoup? - python

Related

How to crawl images inside links of a page

How to make this program use instagram pic urls and download? [duplicate]

Extract .jpg from HTML source code with Python

How to put the image files I scraped using Beautiful soup into a list?

Downloading a image using Python Mechanize

Categories

Resources