I'm making this project for a course and I can't get my Instagram pic downloader program to save the photo in a usable format. Even if I give it .jpg, it doesn't help. Still says "It appears we don't support this file format" when trying to open the picture.
Been stuck on this for a while, I've tried other ways of download too but the downloaded file still cant be used.
Here's the code:
import requests
import re
import shutil
url = input('Enter Instagram Photo URL: ')
def get_response(url):
r = requests.get(url)
while r.status_code != 200:
r.raw.decode_content = True
r = requests.get(url, stream = True)
return r.text
response = get_response(url)
def prepare_urls(matches):
return list({match.replace("\\u0026", "&") for match in matches})
vid_matches = re.findall('"video_url":"([^"]+)"', response)
pic_matches = re.findall('"display_url":"([^"]+)"', response)
vid_urls = prepare_urls(vid_matches)
pic_urls = prepare_urls(pic_matches)
if vid_urls:
print('Detected Videos:\n{0}'.format('\n'.join(vid_urls)))
print("Can't download video, the provided URL must be of a picture.")
if pic_urls:
print('Detected Pictures:\n{0}'.format('\n'.join(pic_urls)))
from urllib.request import urlretrieve
dst = 'INSTA.jpg'
urlretrieve(url, dst)
if not (vid_urls or pic_urls):
print('Could not recognize the media in the provided URL.')
Related
I have problem with my script when i try download images from web url. It works on other page (offex.pl) but in my shop images are not working.
i just have all files but i can't open Files
my code:
import os
import time
import requests
from termcolor import colored
def get_folder(url):
all_folders= os.path.dirname(url)
folder=os.path.basename(all_folders)
return folder
def filename(url):
file=url[url.rfind("/") + 1:]
return file
def download(link):
error = []
ok = 0
fail = 0
root_folder = get_folder(link)
path = "{}/{}".format("download", root_folder)
if not os.path.exists(path):
os.makedirs(path)
url = link
file = filename(link)
result = requests.get(url, stream=True)
completeName = os.path.join("download", root_folder, file)
print(completeName)
if result.status_code == 200:
image = result.raw.read()
open(completeName, "wb").write(image)
ok += 1
succes = "{} {} {}".format(ok, colored("Pobrano:", "green"), url)
print(succes)
time.sleep(1)
else:
found_error = "{} {}".format(colored("Brak pliku!:", "red"), url)
print(found_error)
fail += 1
error.append("ID:{} NUMBER:{} link: {}".format(id, url))
with open("log.txt", "w") as filehandle:
for listitem in error:
filehandle.write('%s\n' % listitem)
print(colored("Pobrano plików: ", "green"), ok)
print(colored("Błędy pobierania: ", "red"), fail)
img_url="https://sw19048.smartweb-static.com/upload_dir/shop/misutonida_ec-med-384-ix.jpg"
download(img_url)
What Im doing wrong?
for example (https://offex.pl/images/detailed/11/94102_jeep_sbhn-8h.jpg) download OK
but for my shop url https://sw19048.smartweb-static.com/upload_dir/shop/misutonida_ec-med-384-ix.jpg is not working.
If you want to use requests module,you can use this:
import requests
response = requests.get("https://sw19048.smartweb-static.com/upload_dir/shop/misutonida_ec-med-384-ix.jpg")
with open('./Image.jpg','wb') as f:
f.write(response.content)
The issue is with the URL which you are using to download. Its not an issue, but a difference from other URL you have mentioned.
Let me explain
The URL https://offex.pl/images/detailed/11/94102_jeep_sbhn-8h.jpg returns an image as response with out any compression.
On the other hand, the shop URL https://sw19048.smartweb-static.com/upload_dir/shop/misutonida_ec-med-384-ix.jpg returns the image with gzip compression enabled in the headers.
So the raw response you get is compressed with gzip compression. You can decompress the response with gzip, if you know the compression is always gzip like below
import gzip
import io
image = result.raw.read()
buffer = io.BytesIO(image)
deflatedContent = gzip.GzipFile(fileobj=buffer)
open("D:/sample.jpg", "wb").write(deflatedContent.read())
Or you can use alternative libraries like urllib2 or similar ones, which takes care of decompression. I was trying to explain why it failed for your URL , but not for other. Hope this makes sense.
try :
import urllib2
def download_web_image(url):
request = urllib2.Request(url)
img = urllib2.urlopen(request).read()
with open('test.jpg', 'wb') as f:
f.write(img)
download_web_image("https://sw19048.smartweb-static.com/upload_dir/shop/misutonida_ec-med-384-ix.jpg")
It is working for your URL. I think the issue is with the request response of the used library.
from io import BytesIO
import requests
from PIL import Image
fileRequest = requests.get("https://sw19048.smartweb-static.com/upload_dir/shop/misutonida_ec-med-384-ix.jpg")
doc = Image.open(BytesIO(fileRequest.content))
doc.save("newFile.jpg")
How would I go by changing the twitter banner using an image from url using tweepy library: https://github.com/tweepy/tweepy/blob/v2.3.0/tweepy/api.py#L392
So far I got this and it returns:
def banner(self):
url = 'https://blog.snappa.com/wp-content/uploads/2019/01/Twitter-Header-Size.png'
file = requests.get(url)
self.api.update_profile_banner(filename=file.content)
ValueError: stat: embedded null character in path
It seems like filename requires an image to be downloaded. Anyway to process this without downloading the image and then removing it?
Looking at library's code you can do what you want.
def update_profile_banner(self, filename, *args, **kargs):
f = kargs.pop('file', None)
So what you need to do is supply the filename and the file kwarg:
filename = url.split('/')[-1]
self.api.update_profile_banner(filename, file=file.content)
import tempfile
def banner():
url = 'file_url'
file = requests.get(url)
temp = tempfile.NamedTemporaryFile(suffix=".png")
try:
temp.write(file.content)
self.api.update_profile_banner(filename=temp.name)
finally:
temp.close()
I have a python script that searches for images on a web page and it's supposed to download them to folder named 'downloaded'. Last 2-3 lines are problematic, I don't know how to write the correct 'with open' code.
The biggest part of the script is fine, lines 42-43 give an error
import os
import requests
from bs4 import BeautifulSoup
downloadDirectory = "downloaded"
baseUrl = "http://pythonscraping.com"
def getAbsoluteURL(baseUrl, source):
if source.startswith("http://www."):
url = "http://"+source[11:]
elif source.startswith("http://"):
url = source
elif source.startswith("www."):
url = source[4:]
url = "http://"+source
else:
url = baseUrl+"/"+source
if baseUrl not in url:
return None
return url
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
path = absoluteUrl.replace("www.", "")
path = path.replace(baseUrl, "")
path = downloadDirectory+path
directory = os.path.dirname(path)
if not os.path.exists(directory):
os.makedirs(directory)
return path
html = requests.get("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html.content, 'html.parser')
downloadList = bsObj.find_all(src=True)
for download in downloadList:
fileUrl = getAbsoluteURL(baseUrl,download["src"])
if fileUrl is not None:
print(fileUrl)
with open(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory), 'wb') as out_file:
out_file.write(fileUrl.content)
It opens downloaded folder on my computer and misc folder within it. And it gives a traceback error.
Traceback:
http://pythonscraping.com/misc/jquery.js?v=1.4.4
Traceback (most recent call last):
File "C:\Python36\kodovi\downloaded.py", line 43, in <module>
with open(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory), 'wb
') as out_file:
TypeError: an integer is required (got type str)
It seems your downloadList includes some URLs that aren't images. You could instead look for any <img> tags in the HTML:
downloadList = bsObj.find_all('img')
Then use this to download those images:
for download in downloadList:
fileUrl = getAbsoluteURL(baseUrl,download["src"])
r = requests.get(fileUrl, allow_redirects=True)
filename = os.path.join(downloadDirectory, fileUrl.split('/')[-1])
open(filename, 'wb').write(r.content)
EDIT: I've updated the filename = ... line so that it writes the file of the same name to the directory in the string downloadDirectory. By the way, the normal convention for Python variables is not to use camel case.
I'm trying to download a video from s URL using Python's urllib package. My Python version is 3.6.
Here's what I have tried:
from views.py:
def post(self, request, *args, **kwargs):
serializer = VideoConverterSerializer(data=self.request.data)
validation = serializer.is_valid()
print(serializer.errors)
if validation is True:
url = request.POST.get('video_url')
try:
r = urllib.request.urlopen(url)
with open('my_video.mp4', 'wb') as f:
f.write(r.read())
rea_response = HttpResponse('my_video.mp4', content_type='video/mp4')
rea_response['Content-Disposition'] = 'attachment; filename=my_video.mp4'
return rea_response
except TimeoutError:
return HttpResponse(TimeoutError)
else:
return HttpResponse('Not a valid request')
Here's an example URL I'm trying with:
https://expirebox.com/files/386713962c5f8b7556bc77c4a6c2a576.mp4
The code above download the video file as my_video.mp4 but the video is not playable. The actual size of the video is ~5.9 MB but the size of download video is 11 KB only, so definitely something wrong with the downloaded video.
What can be wrong here?
help me, please!
Thanks in advance!
You don't actually include the video file in your response; you simply create an HttpResponse with the literal text 'my_video.mp4'.
Rather than reading into a file and then trying to pass that file in the response, you should probably use an in-memory buffer:
from io import BytesIO
...
r = urllib.request.urlopen(url)
data = BytesIO(r.read())
rea_response = HttpResponse(data, content_type='video/mp4')
However, you should note that your URL appears to point to an HTML page, not a video file.
OK I'm trying to scrape jpg image from Gucci website. Take this one as example.
http://www.gucci.com/images/ecommerce/styles_new/201501/web_full/277520_F4CYG_4080_001_web_full_new_theme.jpg
I tried urllib.urlretrieve, which doesn't work becasue Gucci blocked the function. So I wanted to use requests to scrape the source code for the image and then write it into a .jpg file.
image = requests.get("http://www.gucci.com/images/ecommerce/styles_new/201501/web_full/277520_F4CYG_4080_001_web_full_new_theme.jpg").text.encode('utf-8')
I encoded it because if I don't, it keeps telling me that gbk cannot encode the string.
Then:
with open('1.jpg', 'wb') as f:
f.write(image)
looks good right? But the result is -- the jpg file cannot be opened. There's no image! Windows tells me the jpg file is damaged.
What could be the problem?
I'm thinking that maybe when I scraped the image, I lost some information, or some characters are wrongly scraped. But how can I find out which?
I'm thinking that maybe some information is lost via encoding. But if I don't encode, I cannot even print it, not to mention writing it into a file.
What could go wrong?
I am not sure about the purpose of your use of encode. You're not working with text, you're working with an image. You need to access the response as binary data, not as text, and use image manipulation functions rather than text ones. Try this:
from PIL import Image
from io import BytesIO
import requests
response = requests.get("http://www.gucci.com/images/ecommerce/styles_new/201501/web_full/277520_F4CYG_4080_001_web_full_new_theme.jpg")
bytes = BytesIO(response.content)
image = Image.open(bytes)
image.save("1.jpg")
Note the use of response.content instead of response.text. You will need to have PIL or Pillow installed to use the Image module. BytesIO is included in Python 3.
Or you can just save the data straight to disk without looking at what's inside:
import requests
response = requests.get("http://www.gucci.com/images/ecommerce/styles_new/201501/web_full/277520_F4CYG_4080_001_web_full_new_theme.jpg")
with open('1.jpg','wb') as f:
f.write(response.content)
A JPEG file is not text, it's binary data. So you need to use the request.content attribute to access it.
The code below also includes a get_headers() function, which can be handy when you're exploring a Web site.
import requests
def get_headers(url):
resp = requests.head(url)
print("Status: %d" % resp.status_code)
resp.raise_for_status()
for t in resp.headers.items():
print('%-16s : %s' % t)
def download(url, fname):
''' Download url to fname '''
print("Downloading '%s' to '%s'" % (url, fname))
resp = requests.get(url)
resp.raise_for_status()
with open(fname, 'wb') as f:
f.write(resp.content)
def main():
site = 'http://www.gucci.com/images/ecommerce/styles_new/201501/web_full/'
basename = '277520_F4CYG_4080_001_web_full_new_theme.jpg'
url = site + basename
fname = 'qtest.jpg'
try:
#get_headers(url)
download(url, fname)
except requests.exceptions.HTTPError as e:
print("%s '%s'" % (e, url))
if __name__ == '__main__':
main()
We call the .raise_for_status() method so that get_headers() and download() raise an Exception if something goes wrong; we catch the Exception in main() and print the relevant info.