I have a python script that searches for images on a web page and it's supposed to download them to folder named 'downloaded'. Last 2-3 lines are problematic, I don't know how to write the correct 'with open' code.
The biggest part of the script is fine, lines 42-43 give an error
import os
import requests
from bs4 import BeautifulSoup
downloadDirectory = "downloaded"
baseUrl = "http://pythonscraping.com"
def getAbsoluteURL(baseUrl, source):
if source.startswith("http://www."):
url = "http://"+source[11:]
elif source.startswith("http://"):
url = source
elif source.startswith("www."):
url = source[4:]
url = "http://"+source
else:
url = baseUrl+"/"+source
if baseUrl not in url:
return None
return url
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
path = absoluteUrl.replace("www.", "")
path = path.replace(baseUrl, "")
path = downloadDirectory+path
directory = os.path.dirname(path)
if not os.path.exists(directory):
os.makedirs(directory)
return path
html = requests.get("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html.content, 'html.parser')
downloadList = bsObj.find_all(src=True)
for download in downloadList:
fileUrl = getAbsoluteURL(baseUrl,download["src"])
if fileUrl is not None:
print(fileUrl)
with open(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory), 'wb') as out_file:
out_file.write(fileUrl.content)
It opens downloaded folder on my computer and misc folder within it. And it gives a traceback error.
Traceback:
http://pythonscraping.com/misc/jquery.js?v=1.4.4
Traceback (most recent call last):
File "C:\Python36\kodovi\downloaded.py", line 43, in <module>
with open(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory), 'wb
') as out_file:
TypeError: an integer is required (got type str)
It seems your downloadList includes some URLs that aren't images. You could instead look for any <img> tags in the HTML:
downloadList = bsObj.find_all('img')
Then use this to download those images:
for download in downloadList:
fileUrl = getAbsoluteURL(baseUrl,download["src"])
r = requests.get(fileUrl, allow_redirects=True)
filename = os.path.join(downloadDirectory, fileUrl.split('/')[-1])
open(filename, 'wb').write(r.content)
EDIT: I've updated the filename = ... line so that it writes the file of the same name to the directory in the string downloadDirectory. By the way, the normal convention for Python variables is not to use camel case.
Related
I'm making this project for a course and I can't get my Instagram pic downloader program to save the photo in a usable format. Even if I give it .jpg, it doesn't help. Still says "It appears we don't support this file format" when trying to open the picture.
Been stuck on this for a while, I've tried other ways of download too but the downloaded file still cant be used.
Here's the code:
import requests
import re
import shutil
url = input('Enter Instagram Photo URL: ')
def get_response(url):
r = requests.get(url)
while r.status_code != 200:
r.raw.decode_content = True
r = requests.get(url, stream = True)
return r.text
response = get_response(url)
def prepare_urls(matches):
return list({match.replace("\\u0026", "&") for match in matches})
vid_matches = re.findall('"video_url":"([^"]+)"', response)
pic_matches = re.findall('"display_url":"([^"]+)"', response)
vid_urls = prepare_urls(vid_matches)
pic_urls = prepare_urls(pic_matches)
if vid_urls:
print('Detected Videos:\n{0}'.format('\n'.join(vid_urls)))
print("Can't download video, the provided URL must be of a picture.")
if pic_urls:
print('Detected Pictures:\n{0}'.format('\n'.join(pic_urls)))
from urllib.request import urlretrieve
dst = 'INSTA.jpg'
urlretrieve(url, dst)
if not (vid_urls or pic_urls):
print('Could not recognize the media in the provided URL.')
I'm trying to create a web scraper to download certain images from a webpage using Python and BeautifulSoup. I'm a beginner and have built this just through finding code online and trying to adapt it. My problem is that when I run the code, it produces this error:
line 24, in <module>
if len(nametemp) == 0:
TypeError: object of type 'NoneType' has no len()
This is what my code looks like:
i = 1
def makesoup(url):
thepage = urllib.request.urlopen(url)
soupdata = BeautifulSoup(thepage, "html.parser")
return soupdata
soup = makesoup("https://www.stiga.pl/sklep/koszenie-trawnika/agregaty-tnace/agregaty-tnace-park-villa/agregat-park-100-combi-3-el-qf")
for img in soup.findAll('img'):
temp=img.get('src')
if temp[:1]=="/":
image = "https://www.stiga.pl/sklep/koszenie-trawnika/agregaty-tnace/agregaty-tnace-park-villa/agregat-park-100-combi-3-el-qf" + temp
else:
image = temp
nametemp = img.get('alt', [])
if len(nametemp) == 0:
filename = str(i)
i = i + 1
else:
filename = nametemp
This works now! Thanks for the replies!
Now when I run the code, only some of the images from the webpage appear in my folder. And it returns this:
Traceback (most recent call last):
File "scrape_stiga.py", line 31, in <module>
imagefile.write(urllib .request.urlopen(image).read())
File "/Users/opt/anaconda3/lib/python3.7/urllib/request.py", line
222, in urlopen
return opener.open(url, data, timeout)
File "/Users/opt/anaconda3/lib/python3.7/urllib/request.py", line
510, in open
req = Request(fullurl, data)
File "/Users/opt/anaconda3/lib/python3.7/urllib/request.py", line
328, in __init__
self.full_url = url
File "/Users/opt/anaconda3/lib/python3.7/urllib/request.py", line
354, in full_url
self._parse()
File "/Users/opt/anaconda3/lib/python3.7/urllib/request.py", line
383, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: 'assets/img/logo-white.png'
Replace nametemp = img.get('alt') with nametemp = img.get('alt', '').
Some <img> elements could be missing the alt attribute. In such a case, img.get('alt') will return None and len function doesn't work on None.
By using img.get('alt', ''), you are returning an empty string when the image lacks alt attribute. len('') will return 0 and your code will not break.
Looks like nametemp is being assigned none ( that’s the default behaviour of get ).
In order to ensure nametemp is iterable, try changing your assignment line:
nametemp = img.get('alt',[])
This will ensure that if “alt” isn’t found that you will return a list and thus you can call “len”.
To control which directory your file is stored to, simply change your filename to contain the whole path i.e: “C:/Desktop/mySpecialFile.jpeg”
You are taking the length of nametemp when the error is raised. It says you can't take the length of a NoneType object. This tells you that nametemp at that point must be None.
Why is it None? Let's go back to:
nametemp = img.get('alt')
OK. img is the current <img> tag, since you're iterating over image tags. At some point you iterate over an image tag which does not have an alt attribute. Therefore, img.get('alt') returns None, and None is assigned to nametemp.
Check the HTML you are parsing and confirm that all image tags have an alt attribute. If you only want to iterate over image tags with an alt attribute, you can use a css-selector to find only image tags with an alt attribute, or you could add a try-catch to your loop, and simply continue if you come across an image tag you don't like.
EDIT - You said you want to scrape product images, but it isn't really clear what page you are trying to scrape these images from exactly. You did update your post with a URL - thank you - but what exactly are you trying to achieve? Do you want to scrape the page that contains all (or some) of the products within a certain category, and simply scrape the thumbnails? Or do you want to visit each product page individually and download the higher resolution image?
Here's something I put together: It just looks at the first page of all products within a certain category, and then scrapes and downloads the thumbnails (low resolution) images into a downloaded_images folder. If the folder doesn't exist, it will create it automatically. This does require the third party module requests, which you can install using pip install requests - though you should be able to do something similar using urllib.request if you don't want to install requests:
def download_image(image_url):
import requests
from pathlib import Path
dir_path = Path("downloaded_images/")
dir_path.mkdir(parents=True, exist_ok=True)
image_name = image_url[image_url.rfind("/")+1:]
image_path = str(dir_path) + "/" + image_name
with requests.get(image_url, stream=True) as response:
response.raise_for_status()
with open(image_path, "wb") as file:
for chunk in response.iter_content(chunk_size=8192):
file.write(chunk)
file.flush()
print(f"Finished downloading \"{image_url}\" to \"{image_path}\".\n")
def main():
import requests
from bs4 import BeautifulSoup
root_url = "https://www.stiga.pl/"
url = f"{root_url}sklep/koszenie-trawnika/agregaty-tnace/agregaty-tnace-park-villa"
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
for product in soup.findAll("div", {"class": "products__item"}):
image_url = root_url + product.find("img")["data-src"]
download_image(image_url)
return 0
if __name__ == "__main__":
import sys
sys.exit(main())
To recap, you are using BeautifulSoup to find the URLs to the images, and then you use a simple requests.get to download the image.
How would I go by changing the twitter banner using an image from url using tweepy library: https://github.com/tweepy/tweepy/blob/v2.3.0/tweepy/api.py#L392
So far I got this and it returns:
def banner(self):
url = 'https://blog.snappa.com/wp-content/uploads/2019/01/Twitter-Header-Size.png'
file = requests.get(url)
self.api.update_profile_banner(filename=file.content)
ValueError: stat: embedded null character in path
It seems like filename requires an image to be downloaded. Anyway to process this without downloading the image and then removing it?
Looking at library's code you can do what you want.
def update_profile_banner(self, filename, *args, **kargs):
f = kargs.pop('file', None)
So what you need to do is supply the filename and the file kwarg:
filename = url.split('/')[-1]
self.api.update_profile_banner(filename, file=file.content)
import tempfile
def banner():
url = 'file_url'
file = requests.get(url)
temp = tempfile.NamedTemporaryFile(suffix=".png")
try:
temp.write(file.content)
self.api.update_profile_banner(filename=temp.name)
finally:
temp.close()
I'm trying to make web scraper that downloads images from searched keywords. The code works completely fine until it has to download that image from extracted URL
from bs4 import BeautifulSoup
import requests
import os
import urllib
search = raw_input("search for images: ")
params = {"q": search}
r = requests.get("http://wwww.bing.com/images/search", params=params)
dir_name = search.replace(" ", "_").lower()
if not os.path.isdir(dir_name):
os.makedirs(dir_name)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for items in links:
img_obj = requests.get(items.attrs["href"])
print "Getting: ", items.attrs["href"]
title = items.attrs["href"].split("/")[-1]
urllib.urlretrieve(items.attrs["href"], "./scraped_images/")
OUTPUT:
search for images: cats
Getting: http://c1.staticflickr.com/3/2755/4353908962_2a0003aebf.jpg
Traceback (most recent call last):
File "C:/Users/qazii/PycharmProjects/WebScraping/exm.py", line 21, in <module>
urllib.urlretrieve(items.attrs["href"], "./scraped_images/")
File "E:\anaconda\envs\WebScraping\lib\urllib.py", line 98, in urlretrieve
return opener.retrieve(url, filename, reporthook, data)
File "E:\anaconda\envs\WebScraping\lib\urllib.py", line 249, in retrieve
tfp = open(filename, 'wb')
IOError: [Errno 13] Permission denied: './scraped_images/'
You're attempting to save the image to a "file" named ./scraped_images/. Since this is a directory and not a file, you get a permissions error (you can't open a directory with write permissions). Instead, try saving to a specific file name.
urllib.urlretrieve(items.attrs["href"], os.path.join("./scrapped_images", title))
I don't know what this error means. Any advice about the error or the rest of the code is greatly appreciated.
import urllib
import urllib2
import os
import re
from bs4 import BeautifulSoup
def image_scrape():
url = raw_input("Type url for image scrape: ")
content = urllib2.urlopen(url).read()
soup = BeautifulSoup(content)
name = 0
for tag in soup.find_all(re.compile("img")):
path = 'C:\Users\Sorcerer\Downloads'
name += 1
filename = name
file_path = "%s%s" % (path, filename)
downloaded_image = file(file_path, "wb")
downloaded_image.write(buf)
downloaded_image.close()
image_scrape()
You have a line in your code:
downloaded_image.write(buf)
The Python interpreter has not seen this variable buf before in your code. And hence the error.
Thoughts on the rest of your code:
It is advisable to use the os module to do what you are doing with this line:
file_path = "%s%s" % (path, filename)
like this:
import os
path = os.path.normpath('C:\\Users\\Sorcerer\\Downloads')
file_path = os.path.join(path, name)
Looks like you are trying to find all the image links in the page and trying to save it to the file system at the location referenced by file_path. Assuming the link to the image is in the variable tag, this is what you do:
import requests
r = requests.get(tag, stream=True)
if r.status_code == 200:
with open(name, 'wb') as f:
for chunk in r.iter_content():
f.write(chunk)
f.close()