I need the crawler to go to the links inside a website and scan images there. I've managed to get this far but I'm confused.
I'm trying to do something like this but I'm sure there's gonna be an easier way.
from bs4 import *
import requests as rq
import os
import sys
from urllib.parse import urlparse
page_url = sys.argv[1]
depth = int(sys.argv[2])
crawl = str(page_url)
r2 = rq.get('https://www.' + crawl + '' + '/')
soup2 = BeautifulSoup(r2.text, "html.parser")
links = []
images = []
link_urls = soup2.select('a')
def url_validator(link):
try:
result = urlparse(link)
return all([result.scheme, result.netloc])
except:
return False
def crawl_images(link):
requested_link = rq.get(link)
images = BeautifulSoup(requested_link.text, "html.parser")
image = images.select('img')
for img in image:
print(img['src'])
return img['src']
for link_url in link_urls[:depth]:
links.append(link_url['href'])
for link in links:
# print(link)
if url_validator(link):
crawl_images(link)
I try python3 new_crawler.py imdb.com 3 which should print sources of images crawled in 3 links inside imdb.com but it's not printing anything.
You want to crawl through the images, correct? Try this:
from bs4 import BeautifulSoup
import requests as rq
URL = ""
source = rq.get(URL)
soup = BeautifulSoup(source.text, "html.parser")
image_links = soup.find_all("img")
for img in image_links:
print(img['src'])
Add the website's url to the constant URL that you are trying to scrap. The page's img tags should all be saved in the image_links variable.
This is what I ended up with. It's not working how it's supposed to but the time for the task is up and I decided to share anyway.
from bs4 import *
import requests as rq
import sys
from urllib.parse import urlparse
import json
page_url = sys.argv[1]
depth = int(sys.argv[2])
crawl = str(page_url)
r2 = rq.get('https://www.' + crawl + '' + '/')
soup2 = BeautifulSoup(r2.text, "html.parser")
link_urls = soup2.select('a')
links = []
images_sources = []
def url_validator(link):
try:
result = urlparse(link)
return all([result.scheme, result.netloc])
except:
return False
def crawl_images(link):
requested_link = rq.get(link)
images = BeautifulSoup(requested_link.text, "html.parser")
image = images.select('img')
for img in image:
images_sources.append(img['src'])
results = {
"imageUrl": img['src'],
"sourceUrl": link,
"depth": depth
}
json_object = json.dumps(results)
with open("results.json", "w") as f:
f.write(json_object)
f.close()
return results
for link_url in link_urls[:depth]:
links.append(link_url['href'])
for link in links:
if url_validator(link):
crawl_images(link)
This question already has answers here:
Download large file in python with requests
(8 answers)
Closed 2 years ago.
The goal is for the program to take user given instagram url and allow to download and save a picture.
I've got the main part in place but cant understand how I can go further and use the filtered and right url to download and save the picture on my computer.
My code so far:
EDIT: I added a download line but can't seem to get the right file type? I mean it saves as whatever I want it to but I can't open it:
import requests
import re
import shutil
def get_response(url):
r = requests.get(url)
while r.status_code != 200:
r.raw.decode_content = True
r = requests.get(url, stream = True)
return r.text
def prepare_urls(matches):
return list({match.replace("\\u0026", "&") for match in matches})
url = input('Enter Instagram URL: ')
response = get_response(url)
vid_matches = re.findall('"video_url":"([^"]+)"', response)
pic_matches = re.findall('"display_url":"([^"]+)"', response)
vid_urls = prepare_urls(vid_matches)
pic_urls = prepare_urls(pic_matches)
if vid_urls:
print('Detected Videos:\n{0}'.format('\n'.join(vid_urls)))
print("Can't download video, the provided URL must be of a picture.")
if pic_urls:
print('Detected Pictures:\n{0}'.format('\n'.join(pic_urls)))
from urllib.request import urlretrieve
dst = 'Instagram picture.jpg'
urlretrieve(url, dst)
#EDIT ^
if not (vid_urls or pic_urls):
print('Could not recognize the media in the provided URL.')
I think this might help...
import requests
from bs4 import BeautifulSoup as bs
import json
import os.path
insta_url = 'https://www.instagram.com'
inta_username = input('enter username of instagram : ')
response = requests.get(f"{insta_url}/{inta_username}/")
if response.ok:
html = response.text
bs_html = bs(html, features="lxml")
bs_html = bs_html.text
index = bs_html.find('profile_pic_url_hd')+21
remaining_text = bs_html[index:]
remaining_text_index = remaining_text.find('requested_by_viewer')-3
string_url = remaining_text[:remaining_text_index].replace("\\u0026", "&")
print(string_url, "\ndownloading...")
while True:
filename = 'pic_ins.jpg'
file_exists = os.path.isfile(filename)
if not file_exists:
with open(filename, 'wb+') as handle:
response = requests.get(string_url, stream=True)
if not response.ok:
print(response)
for block in response.iter_content(1024):
if not block:
break
handle.write(block)
else:
continue
break
print("completed")
You can change the name of the image downloaded by changing the filename variable
I set up this code to extract the links from the following website. The problem is that it breaks into register 19 and doesn't continue with the listing.
You can help me.
import urllib.request
import os
tematica = 'fun'
url = "https://www.shutterstock.com/es/search/" + tematica
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
data_content = response.read()
Html_file= open("html_file.html","wb")
Html_file.write(data_content)
Html_file.close()
html=codecs.open("html_file.html", 'r', 'utf-8').read()
soup = BeautifulSoup(html)
for i,img_element in enumerate(soup.findAll('img', None)):
try:
img_src = img_element['src']
print(i,img_src)
except:
pass
This is the code I used to take all the pics from r/pics on reddit and put it into a directory. I want to be able to take the actual files in the directory and put it into a list. Stuck on how to do this.
import requests
from bs4 import BeautifulSoup as bs
import os
url = "https://www.reddit.com/r/pics/"
r = requests.get(url)
data = r.text
soup = bs(data,'lxml')
image_tags = soup.findAll('img')
if not os.path.exists('direct'):
os.makedirs('direct')
os.chdir('direct')
x = 0
for image in image_tags:
try:
url = image['src']
source = requests.get(url)
if source.status_code == 200:
img_path = 'direct-' + str(x) +'.jpg'
with open(img_path, 'wb') as f:
f.write(requests.get(url).content)
f.close()
x+=1
except:
pass
Edit: Here is updated code but still dealing with problem
import requests
from bs4 import BeautifulSoup as bs
import os
url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
soup = bs(data,'lxml')
image_tags = soup.findAll('img')
if not os.path.exists('directory'):
os.makedirs('directory')
os.chdir('directory')
x = 0
mylist = []
for image in image_tags:
url = image['src']
source = requests.get(url)
if source.status_code == 200:
img_path = 'direct-' + str(x) +'.jpg'
with open(img_path, 'wb') as f:
f.write(requests.get(url).content)
mylist.append(img_path)
f.close()
x += 1
print(mylist)
create a list in the beginning of your code:
...
mylist = []
...
then after you get each image, add it to the list
...
img_path = 'direct-' + str(x) +'.jpg'
mylist.append(img_path)
....
EDIT:
I executed your updated code and the image_tags is returning empty - in fact the page returned by
url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
Doesn't contain any images. I guess reddit has some kind of protection to prevent you from fetching images this way.
Try adding print(data) and you will see what I mean
You should use the reddit api so that reddit doesn't limit your requests.
I'm trying to write a Python script to download a image and set it as my wallpaper. Unfortunately, the Mechanize documentation is quite poor. My script is following the link correctly, but I'm having a hard time to actually save the image on my computer. From what I researched, the .retrieve() method should do the work, but How do I specify the path to where the file should be downloaded to? Here is what I have...
def followLink(browser, fixedLink):
browser.open(fixedLink)
if browser.find_link(url_regex = r'1600x1200'):
browser.follow_link(url_regex = r'1600x1200')
elif browser.find_link(url_regex = r'1400x1050'):
browser.follow_link(url_regex = r'1400x1050')
elif browser.find_link(url_regex = r'1280x960'):
browser.follow_link(url_regex = r'1280x960')
return
import mechanize, os
from BeautifulSoup import BeautifulSoup
browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
filename = image['src'].lstrip('http://')
filename = os.path.join(dir, filename.replace('/', '_'))
data = browser.open(image['src']).read()
browser.back()
save = open(filename, 'wb')
save.write(data)
save.close()
This can help you download all the images from a web page. As for parsing html you'd better use BeautifulSoup or lxml. And download is just read the data and then write it to a local file. You should assign your own value to dir. It is where you images exist.
Not sure why this solution hasn't come up, but you can use the mechanize.Browser.retrieve function as well. Perhaps this only works in newer versions of mechanize and has thus not been mentioned?
Anyway, if you wanted to shorten the answer by zhangyangyu, you could do this:
import mechanize, os
from BeautifulSoup import BeautifulSoup
browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
filename = image['src'].lstrip('http://')
filename = os.path.join(dir, filename.replace('/', '_'))
browser.retrieve(image['src'], filename)
browser.back()
Also keep in mind that you'll likely want to put all of this into a try except block like this one:
import mechanize, os
from BeautifulSoup import BeautifulSoup
browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
filename = image['src'].lstrip('http://')
filename = os.path.join(dir, filename.replace('/', '_'))
try:
browser.retrieve(image['src'], filename)
browser.back()
except (mechanize.HTTPError,mechanize.URLError) as e:
pass
# Use e.code and e.read() with HTTPError
# Use e.reason.args with URLError
Of course you'll want to adjust this to your needs. Perhaps you want it to bomb out if it encounters an issue. It totally depends on what you want to achieve.
You can get/download the image by opening the url of the img src.
image_response = browser.open_novisit(img['src'])
to save the file now, just use fopen:
with open('image_out.png', 'wb') as f:
f.write(image_response.read())
It's really crappy but It "works" for me, with 0xc0000022l anwer's
import mechanize, os
from BeautifulSoup import BeautifulSoup
import urllib2
def DownloadIMGs(url): # IMPORTANT URL WITH HTTP OR HTTPS
print "From", url
dir = 'F:\Downloadss' #Dir for Downloads
basicImgFileTypes = ['png','bmp','cur','ico','gif','jpg','jpeg','psd','raw','tif']
browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
print "N Images:", len(image_tags)
print
#---------SAVE PATH
#check if available
if not os.path.exists(dir):
os.makedirs(dir)
#---------SAVE PATH
for image in image_tags:
#---------SAVE PATH + FILENAME (Where It is downloading)
filename = image['src']
fileExt = filename.split('.')[-1]
fileExt = fileExt[0:3]
if (fileExt in basicImgFileTypes):
print 'File Extension:', fileExt
filename = filename.replace('?', '_')
filename = os.path.join(dir, filename.split('/')[-1])
num = filename.find(fileExt) + len(fileExt)
filename = filename[:num]
else:
filename = filename.replace('?', '_')
filename = os.path.join(dir, filename.split('/')[-1]) + '.' + basicImgFileTypes[0]
print 'File Saving:', filename
#---------SAVE PATH + FILENAME (Where It is downloading)
#--------- FULL URL PATH OF THE IMG
imageUrl = image['src']
print 'IMAGE SRC:', imageUrl
if (imageUrl.find('http://') > -1 or imageUrl.find('https://') > -1):
pass
else:
if (url.find('http://') > -1):
imageUrl = url[:len('http://')]
imageUrl = 'http://' + imageUrl.split('/')[0] + image['src']
elif(url.find('https://') > -1):
imageUrl = url[:len('https://')]
imageUrl = 'https://' + imageUrl.split('/')[0] + image['src']
else:
imageUrl = image['src']
print 'IMAGE URL:', imageUrl
#--------- FULL URL PATH OF THE IMG
#--------- TRY DOWNLOAD
try:
browser.retrieve(imageUrl, filename)
print "Downloaded:", image['src'].split('/')[-1]
print
except (mechanize.HTTPError,mechanize.URLError) as e:
print "Can't Download:", image['src'].split('/')[-1]
print
pass
#--------- TRY DOWNLOAD
browser.close()
DownloadIMGs('https://stackoverflow.com/questions/15593925/downloading-a-image-using-python-mechanize')