I need to download several files via HTTP in Python.
The most obvious way to do it is just using urllib2:
import urllib2
u = urllib2.urlopen('http://server.com/file.html')
localFile = open('file.html', 'w')
localFile.write(u.read())
localFile.close()
But I'll have to deal with URLs that are nasty in some way, like this: http://server.com/!Run.aspx/someoddtext/somemore?id=121&m=pdf. When downloaded via a browser, the file has a human-readable name, i.e. accounts.pdf.
Is there any way to handle that in Python, so I don't need to know the file names and hardcode them into my script?
Download scripts like that tend to push a header telling the user-agent what to name the file:
Content-Disposition: attachment; filename="the filename.ext"
If you can grab that header, you can get the proper filename.
There's another thread that has a little bit of code to offer up for Content-Disposition-grabbing.
remotefile = urllib2.urlopen('http://example.com/somefile.zip')
remotefile.info()['Content-Disposition']
Based on comments and Oli's answer, I made a solution like this:
import urllib2
from os.path import basename
from urlparse import urlsplit

def url2name(url):
    return basename(urlsplit(url)[2])

def download(url, localFileName=None):
    localName = url2name(url)
    req = urllib2.Request(url)
    r = urllib2.urlopen(req)
    if 'Content-Disposition' in r.info():
        # If the response has Content-Disposition, we take the file name from it
        localName = r.info()['Content-Disposition'].split('filename=')[1]
        if localName[0] == '"' or localName[0] == "'":
            localName = localName[1:-1]
    elif r.url != url:
        # If we were redirected, take the real file name from the final URL
        localName = url2name(r.url)
    if localFileName:
        # Force saving the file under the specified name
        localName = localFileName
    f = open(localName, 'wb')
    f.write(r.read())
    f.close()
It takes the file name from Content-Disposition; if that header is not present, it uses the filename from the URL (and if a redirect happened, the final URL is taken into account).
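For example (hypothetical URLs, reusing the download() defined above):

# Name taken from Content-Disposition, or the final URL after a redirect, or the original URL
download('http://server.com/!Run.aspx/someoddtext/somemore?id=121&m=pdf')

# Force a specific local name instead
download('http://server.com/file.html', localFileName='file.html')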
Combining much of the above, here is a more Pythonic solution:
import urllib2
import shutil
import urlparse
import os

def download(url, fileName=None):
    def getFileName(url, openUrl):
        if 'Content-Disposition' in openUrl.info():
            # If the response has Content-Disposition, try to get the filename from it
            cd = dict(map(
                lambda x: x.strip().split('=') if '=' in x else (x.strip(), ''),
                openUrl.info()['Content-Disposition'].split(';')))
            if 'filename' in cd:
                filename = cd['filename'].strip("\"'")
                if filename:
                    return filename
        # If no filename was found above, parse it out of the final URL
        return os.path.basename(urlparse.urlsplit(openUrl.url)[2])

    r = urllib2.urlopen(urllib2.Request(url))
    try:
        fileName = fileName or getFileName(url, r)
        with open(fileName, 'wb') as f:
            shutil.copyfileobj(r, f)
    finally:
        r.close()
To kender:

if localName[0] == '"' or localName[0] == "'":
    localName = localName[1:-1]

This is not safe: the web server can send a badly formatted name like ["file.ext] or [file.ext'], or even an empty one, and localName[0] will raise an exception.

Safer code could look like this:

localName = localName.replace('"', '').replace("'", "")
if localName == '':
    localName = SOME_DEFAULT_FILE_NAME
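Alternatively, the standard library can parse the header for you. A minimal sketch using cgi.parse_header (present in both Python 2 and 3, though deprecated in newer 3.x releases), reusing the r response from kender's code:

import cgi

# parse_header('attachment; filename="accounts.pdf"') returns
# ('attachment', {'filename': 'accounts.pdf'}) with the quotes already stripped
_, params = cgi.parse_header(r.info()['Content-Disposition'])
localName = params.get('filename') or SOME_DEFAULT_FILE_NAME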
Using the wget module:

import wget

custom_file_name = "/custom/path/custom_name.ext"
wget.download(url, custom_file_name)

Using urlretrieve:

import urllib

urllib.urlretrieve(url, custom_file_name)

Note that urlretrieve expects the destination directory to already exist; create it first (e.g. with os.makedirs) if it might be missing.
You need to look at the 'Content-Disposition' header; see the solution by kender in How to download a file using python in a 'smarter' way?
Below is his solution, modified with the capability to specify an output folder:
import os
from os.path import basename
from urllib.parse import urlsplit
import urllib.request

def url2name(url):
    return basename(urlsplit(url)[2])

def download(url, out_path):
    localName = url2name(url)
    req = urllib.request.Request(url)
    r = urllib.request.urlopen(req)
    if 'Content-Disposition' in r.info():
        # If the response has Content-Disposition, we take the file name from it
        # (dict-style has_key() is gone in Python 3, so use the in operator)
        localName = r.info()['Content-Disposition'].split('filename=')[1]
        if localName[0] == '"' or localName[0] == "'":
            localName = localName[1:-1]
    elif r.url != url:
        # If we were redirected, take the real file name from the final URL
        localName = url2name(r.url)
    localName = os.path.join(out_path, localName)
    f = open(localName, 'wb')
    f.write(r.read())
    f.close()

download("https://example.com/demofile", '/home/username/tmp')
I have just updated kender's answer for Python 3.
How would I go about changing the Twitter banner using an image from a URL with the tweepy library: https://github.com/tweepy/tweepy/blob/v2.3.0/tweepy/api.py#L392
So far I got this, and it returns:

def banner(self):
    url = 'https://blog.snappa.com/wp-content/uploads/2019/01/Twitter-Header-Size.png'
    file = requests.get(url)
    self.api.update_profile_banner(filename=file.content)

ValueError: stat: embedded null character in path

It seems like filename requires an image to be downloaded. Is there any way to process this without downloading the image and then removing it?
Looking at the library's code, you can do what you want:

def update_profile_banner(self, filename, *args, **kargs):
    f = kargs.pop('file', None)

So what you need to do is supply the filename and the file kwarg:

filename = url.split('/')[-1]
self.api.update_profile_banner(filename, file=file.content)
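Note that file.content is raw bytes; if tweepy expects a file-like object for the file kwarg (an assumption based on how uploaders usually read it), wrapping the bytes in io.BytesIO is a possible fix:

import io

filename = url.split('/')[-1]
# io.BytesIO gives the bytes a read() interface without touching the disk
self.api.update_profile_banner(filename, file=io.BytesIO(file.content))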
import tempfile

import requests

def banner(self):
    url = 'file_url'
    file = requests.get(url)
    temp = tempfile.NamedTemporaryFile(suffix=".png")
    try:
        temp.write(file.content)
        temp.flush()  # make sure the bytes hit disk before tweepy reopens the file
        self.api.update_profile_banner(filename=temp.name)
    finally:
        temp.close()
I don't know what this error means. Any advice about the error or the rest of the code is greatly appreciated.
import urllib
import urllib2
import os
import re
from bs4 import BeautifulSoup

def image_scrape():
    url = raw_input("Type url for image scrape: ")
    content = urllib2.urlopen(url).read()
    soup = BeautifulSoup(content)
    name = 0
    for tag in soup.find_all(re.compile("img")):
        path = 'C:\Users\Sorcerer\Downloads'
        name += 1
        filename = name
        file_path = "%s%s" % (path, filename)
        downloaded_image = file(file_path, "wb")
        downloaded_image.write(buf)
        downloaded_image.close()

image_scrape()
You have this line in your code:

downloaded_image.write(buf)

The Python interpreter has never seen the variable buf before this point in your code, hence the NameError.
Thoughts on the rest of your code:

It is advisable to use the os module instead of string formatting for this line:

file_path = "%s%s" % (path, filename)

like this:

import os

path = os.path.normpath('C:\\Users\\Sorcerer\\Downloads')
file_path = os.path.join(path, str(name))  # name is an int, so convert it for join()
Looks like you are trying to find all the image links in the page and save them to the file system at the location referenced by file_path. Assuming the link to the image is in the variable tag, this is what you do:

import requests

r = requests.get(tag, stream=True)
if r.status_code == 200:
    with open(str(name), 'wb') as f:
        for chunk in r.iter_content():
            f.write(chunk)
# the with-block closes the file, so no explicit f.close() is needed
I can run the following code:
import httplib2

h = httplib2.Http('.cache')
response, content = h.request('http://2.bp.blogspot.com/-CXFfl9luHPM/TV-Os6opQfI/AAAAAAAAA2E/oCgrgvWqzrY/s1600/cow.jpg')
print(response.status)
with open('cow.jpg', 'wb') as f:
    f.write(content)
When I run the code, I download a file called cow.jpg, which is what I want, but I also get a duplicate image with a different name: 2.bp.blogspot.com,-CXFfl9luHPM,TV-Os6opQfI,AAAAAAAAA2E,oCgrgvWqzrY,s1600,cow.jpg,77ba31012a25509bfdc78bea4e1bfdd1. It's the HTTP address with commas plus other junk. Any ideas on how I can create only one image using httplib2? Thanks.
Just write the content to a file, as you already do:

with open('cow.jpg', 'wb') as f:
    f.write(content)

The duplicate with the commas in its name is not created by that write: it is httplib2's cache entry. Passing '.cache' to httplib2.Http() enables on-disk caching, and each response is stored in that directory under a mangled version of its URL.
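If you don't want the cached copy at all, a minimal sketch is to construct Http() without a cache directory:

import httplib2

h = httplib2.Http()  # no cache argument, so nothing extra is written to disk
response, content = h.request('http://2.bp.blogspot.com/-CXFfl9luHPM/TV-Os6opQfI/AAAAAAAAA2E/oCgrgvWqzrY/s1600/cow.jpg')
with open('cow.jpg', 'wb') as f:
    f.write(content)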
Use urllib and its urlretrieve method; the second argument is the file location.

For Python 2.x:

import urllib

urllib.urlretrieve(URL, path_destination)
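For Python 3.x, the same legacy helper lives in urllib.request:

from urllib.request import urlretrieve

urlretrieve(URL, path_destination)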
Is using urllib2 ok for you, too? If yes, you can use this function:
import mimetypes
import os
import urllib2
import urlparse

def filename_from_url(url):
    return os.path.basename(urlparse.urlsplit(url)[2])

def download_file(url):
    """Create an urllib2 request and return the request plus some useful info"""
    name = filename_from_url(url)
    r = urllib2.urlopen(urllib2.Request(url))
    info = r.info()
    if 'Content-Disposition' in info:
        # If the response has Content-Disposition, we take the filename from it
        name = info['Content-Disposition'].split('filename=')[1]
        if name[0] == '"' or name[0] == "'":
            name = name[1:-1]
    elif r.geturl() != url:
        # If we were redirected, take the filename from the final URL
        name = filename_from_url(r.geturl())
    content_type = None
    if 'Content-Type' in info:
        content_type = info['Content-Type'].split(';')[0]
    # Try to guess missing info
    if not name and not content_type:
        name = 'unknown'
    elif not name:
        # guess_extension() may return None, so the parentheses around the "or" matter
        name = 'unknown' + (mimetypes.guess_extension(content_type) or '')
    elif not content_type:
        content_type = mimetypes.guess_type(name)[0]
    return r, name, content_type
Usage:

import shutil

fp, filename, content_type = download_file('http://url/to/some/file')
with open('somefile', 'wb') as dst:
    shutil.copyfileobj(fp, dst)

This code has the advantage that it never reads the whole file into memory, so it works fine for huge files, too. Besides that, it also gives you the filename received from the server and the content type, in case you want or need it.
I'm writing an install script in Python.
How do I download a file from FTP in Python?
Operating system: Windows XP, if that makes a difference.
from urllib2 import urlopen
req = urlopen('ftp://ftp.gnu.org/README')
Then you can use req.read() to load the file content into a variable, or do anything else with it, or use shutil.copyfileobj to save the content to disk without loading it all into memory.
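For example, a short sketch streaming the same README to disk (assuming the Python 2 urllib2 import above):

import shutil
from urllib2 import urlopen

req = urlopen('ftp://ftp.gnu.org/README')
with open('README', 'wb') as local_file:
    shutil.copyfileobj(req, local_file)  # copies in chunks, never the whole file at once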
Here's a code snippet I'm currently using.
import mimetypes
import os
import urllib2
import urlparse

def filename_from_url(url):
    return os.path.basename(urlparse.urlsplit(url)[2])

def download_file(url):
    """Create an urllib2 request and return the request plus some useful info"""
    name = filename_from_url(url)
    r = urllib2.urlopen(urllib2.Request(url))
    info = r.info()
    if 'Content-Disposition' in info:
        # If the response has Content-Disposition, we take the filename from it
        name = info['Content-Disposition'].split('filename=')[1]
        if name[0] == '"' or name[0] == "'":
            name = name[1:-1]
    elif r.geturl() != url:
        # If we were redirected, take the filename from the final URL
        name = filename_from_url(r.geturl())
    content_type = None
    if 'Content-Type' in info:
        content_type = info['Content-Type'].split(';')[0]
    # Try to guess missing info
    if not name and not content_type:
        name = 'unknown'
    elif not name:
        # guess_extension() may return None, so the parentheses around the "or" matter
        name = 'unknown' + (mimetypes.guess_extension(content_type) or '')
    elif not content_type:
        content_type = mimetypes.guess_type(name)[0]
    return r, name, content_type
Usage:
req, filename, content_type = download_file('http://some.url')
Then you can use req as a file-like object and e.g. use shutil.copyfileobj() to copy the file contents into a local file. If the MIME type doesn't matter simply remove that part of the code.
Since you seem to be lazy, here's code downloading the file directly to a local file:
import shutil

def download_file_locally(url, dest):
    req, filename, content_type = download_file(url)
    if dest.endswith('/'):
        dest = os.path.join(dest, filename)
    with open(dest, 'wb') as f:
        shutil.copyfileobj(req, f)
    req.close()
This method is smart enough to use the filename sent by the server if you specify a path ending with a slash, otherwise it uses the destination you specified.
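Hypothetical usage illustrating both cases:

# Trailing slash: the server-supplied filename is appended to the directory
download_file_locally('http://example.com/somefile.zip', '/tmp/downloads/')

# Explicit path: used exactly as given
download_file_locally('http://example.com/somefile.zip', '/tmp/my_archive.zip')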
Use ftplib.

Code sample from the documentation:

>>> from ftplib import FTP
>>> ftp = FTP('ftp.cwi.nl')   # connect to host, default port
>>> ftp.login()               # user anonymous, passwd anonymous@
>>> ftp.retrlines('LIST')     # list directory contents
total 24418
drwxrwsr-x   5 ftp-usr  pdmaint     1536 Mar 20 09:48 .
dr-xr-srwt 105 ftp-usr  pdmaint     1536 Mar 21 14:32 ..
-rw-r--r--   1 ftp-usr  pdmaint     5305 Mar 20 09:48 INDEX
.
.
.
>>> ftp.retrbinary('RETR README', open('README', 'wb').write)
'226 Transfer complete.'
>>> ftp.quit()
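The same flow as a non-interactive sketch (assuming the server allows anonymous logins, as ftp.gnu.org does):

from ftplib import FTP

ftp = FTP('ftp.gnu.org')  # connect to host, default port
ftp.login()               # anonymous login
with open('README', 'wb') as f:
    ftp.retrbinary('RETR README', f.write)  # stream the file to disk in blocks
ftp.quit()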
from urllib.request import urlopen

try:
    req = urlopen('ftp://ftp.expasy.org/databases/enzyme/enzclass.txt')
except Exception as e:
    # a bare except would swallow even KeyboardInterrupt; catch Exception instead
    print("Error:", e)
I can’t really understand how YouTube serves videos, but I have been reading through what I can.
It seems like the old method get_video is now obsolete and can't be used any more. Is there another Pythonic and simple method for collecting YouTube videos?
You might have some luck with youtube-dl
http://rg3.github.com/youtube-dl/documentation.html
I'm not sure if there's a good API, but it's written in Python, so theoretically you could do something a little better than Popen :)
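If you do end up shelling out, a minimal sketch (assuming the youtube-dl executable is on your PATH; flags vary by version):

import subprocess

# Downloads the video to the current directory using youtube-dl's defaults
subprocess.call(['youtube-dl', 'http://www.youtube.com/watch?v=Je_iqbgGXFw'])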
Here is a quick Python script which downloads a YouTube video. No bells and whistles, just scrapes out the necessary URLs, hits the generate_204 URL and then streams the data to a file:
import lxml.html
import re
import sys
import urllib
import urllib2

_RE_G204 = re.compile('"(http:.+.youtube.com.*\/generate_204[^"]+")', re.M)
_RE_URLS = re.compile('"fmt_url_map": "(\d*[^"]+)",.*', re.M)

def _fetch_url(url, ref=None, path=None):
    opener = urllib2.build_opener()
    headers = {}
    if ref:
        headers['Referer'] = ref
    request = urllib2.Request(url, headers=headers)
    handle = urllib2.urlopen(request)
    if not path:
        return handle.read()
    sys.stdout.write('saving: ')
    # Write the result to a file in 64 KB chunks
    with open(path, 'wb') as out:
        while True:
            part = handle.read(65536)
            if not part:
                break
            out.write(part)
            sys.stdout.write('.')
            sys.stdout.flush()
    sys.stdout.write('\nFinished.\n')

def _extract(html):
    tree = lxml.html.fromstring(html)
    res = {'204': _RE_G204.findall(html)[0].replace('\\', '')}
    for script in tree.findall('.//script'):
        text = script.text_content()
        if 'fmt_url_map' not in text:
            continue
        # Found it. Extract the URLs we need
        for tmp in _RE_URLS.findall(text)[0].split(','):
            url_id, url = tmp.split('|')
            res[url_id] = url.replace('\\', '')
        break
    return res

def main():
    target = sys.argv[1]
    dest = sys.argv[2]
    html = _fetch_url(target)
    res = dict(_extract(html))
    # Hit the 'generate_204' URL first and remove it
    _fetch_url(res['204'], ref=target)
    del res['204']
    # Download the video: grab the first download URL and use it
    first = res.values()[0]
    _fetch_url(first, ref=target, path=dest)

if __name__ == '__main__':
    main()
Running it:
python youdown.py 'http://www.youtube.com/watch?v=Je_iqbgGXFw' stevegadd.flv
saving: ........................... finished.
I would recommend writing your own parser using urllib2 or Beautiful Soup. You can look at the source code for DownThemAll to see how that plugin finds the video URL.