How to download file from ftp? - python

I'm scripting an install script in python.
How do I download file from ftp in python?
Operating system -- Windows XP - if that makes a difference.

from urllib2 import urlopen
req = urlopen('ftp://ftp.gnu.org/README')
Then you can use req.read() to load the file content into a variable or do anything else with it, or shutil.copyfileobj to save the content to a disk without loading it to memory.

Here's a code snippet I'm currently using.
import mimetypes
import os
import urllib2
import urlparse
def filename_from_url(url):
return os.path.basename(urlparse.urlsplit(url)[2])
def download_file(url):
"""Create an urllib2 request and return the request plus some useful info"""
name = filename_from_url(url)
r = urllib2.urlopen(urllib2.Request(url))
info = r.info()
if 'Content-Disposition' in info:
# If the response has Content-Disposition, we take filename from it
name = info['Content-Disposition'].split('filename=')[1]
if name[0] == '"' or name[0] == "'":
name = name[1:-1]
elif r.geturl() != url:
# if we were redirected, take the filename from the final url
name = filename_from_url(r.geturl())
content_type = None
if 'Content-Type' in info:
content_type = info['Content-Type'].split(';')[0]
# Try to guess missing info
if not name and not content_type:
name = 'unknown'
elif not name:
name = 'unknown' + mimetypes.guess_extension(content_type) or ''
elif not content_type:
content_type = mimetypes.guess_type(name)[0]
return r, name, content_type
Usage:
req, filename, content_type = download_file('http://some.url')
Then you can use req as a file-like object and e.g. use shutil.copyfileobj() to copy the file contents into a local file. If the MIME type doesn't matter simply remove that part of the code.
Since you seem to be lazy, here's code downloading the file directly to a local file:
import shutil
def download_file_locally(url, dest):
req, filename, content_type = download_file(url)
if dest.endswith('/'):
dest = os.path.join(dest, filename)
with open(dest, 'wb') as f:
shutil.copyfileobj(req, f)
req.close()
This method is smart enough to use the filename sent by the server if you specify a path ending with a slash, otherwise it uses the destination you specified.

Use ftplib
Code Sample from the documentation:
>>> from ftplib import FTP
>>> ftp = FTP('ftp.cwi.nl') # connect to host, default port
>>> ftp.login() # user anonymous, passwd anonymous#
>>> ftp.retrlines('LIST') # list directory contents
total 24418
drwxrwsr-x 5 ftp-usr pdmaint 1536 Mar 20 09:48 .
dr-xr-srwt 105 ftp-usr pdmaint 1536 Mar 21 14:32 ..
-rw-r--r-- 1 ftp-usr pdmaint 5305 Mar 20 09:48 INDEX
.
.
.
>>> ftp.retrbinary('RETR README', open('README', 'wb').write)
'226 Transfer complete.'
>>> ftp.quit()

from urllib.request import urlopen
try:
req = urlopen('ftp://ftp.expasy.org/databases/enzyme/enzclass.txt')
except:
print ("Error")

Related

How to download file by using python? [duplicate]

I have a small utility that I use to download an MP3 file from a website on a schedule and then builds/updates a podcast XML file which I've added to iTunes.
The text processing that creates/updates the XML file is written in Python. However, I use wget inside a Windows .bat file to download the actual MP3 file. I would prefer to have the entire utility written in Python.
I struggled to find a way to actually download the file in Python, thus why I resorted to using wget.
So, how do I download the file using Python?
One more, using urlretrieve:
import urllib.request
urllib.request.urlretrieve("http://www.example.com/songs/mp3.mp3", "mp3.mp3")
(for Python 2 use import urllib and urllib.urlretrieve)
Use urllib.request.urlopen():
import urllib.request
with urllib.request.urlopen('http://www.example.com/') as f:
html = f.read().decode('utf-8')
This is the most basic way to use the library, minus any error handling. You can also do more complex stuff such as changing headers.
On Python 2, the method is in urllib2:
import urllib2
response = urllib2.urlopen('http://www.example.com/')
html = response.read()
In 2012, use the python requests library
>>> import requests
>>>
>>> url = "http://download.thinkbroadband.com/10MB.zip"
>>> r = requests.get(url)
>>> print len(r.content)
10485760
You can run pip install requests to get it.
Requests has many advantages over the alternatives because the API is much simpler. This is especially true if you have to do authentication. urllib and urllib2 are pretty unintuitive and painful in this case.
2015-12-30
People have expressed admiration for the progress bar. It's cool, sure. There are several off-the-shelf solutions now, including tqdm:
from tqdm import tqdm
import requests
url = "http://download.thinkbroadband.com/10MB.zip"
response = requests.get(url, stream=True)
with open("10MB", "wb") as handle:
for data in tqdm(response.iter_content()):
handle.write(data)
This is essentially the implementation #kvance described 30 months ago.
import urllib2
mp3file = urllib2.urlopen("http://www.example.com/songs/mp3.mp3")
with open('test.mp3','wb') as output:
output.write(mp3file.read())
The wb in open('test.mp3','wb') opens a file (and erases any existing file) in binary mode so you can save data with it instead of just text.
Python 3
urllib.request.urlopen
import urllib.request
response = urllib.request.urlopen('http://www.example.com/')
html = response.read()
urllib.request.urlretrieve
import urllib.request
urllib.request.urlretrieve('http://www.example.com/songs/mp3.mp3', 'mp3.mp3')
Note: According to the documentation, urllib.request.urlretrieve is a "legacy interface" and "might become deprecated in the future" (thanks gerrit)
Python 2
urllib2.urlopen (thanks Corey)
import urllib2
response = urllib2.urlopen('http://www.example.com/')
html = response.read()
urllib.urlretrieve (thanks PabloG)
import urllib
urllib.urlretrieve('http://www.example.com/songs/mp3.mp3', 'mp3.mp3')
use wget module:
import wget
wget.download('url')
import os,requests
def download(url):
get_response = requests.get(url,stream=True)
file_name = url.split("/")[-1]
with open(file_name, 'wb') as f:
for chunk in get_response.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
download("https://example.com/example.jpg")
An improved version of the PabloG code for Python 2/3:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import ( division, absolute_import, print_function, unicode_literals )
import sys, os, tempfile, logging
if sys.version_info >= (3,):
import urllib.request as urllib2
import urllib.parse as urlparse
else:
import urllib2
import urlparse
def download_file(url, dest=None):
"""
Download and save a file specified by url to dest directory,
"""
u = urllib2.urlopen(url)
scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
filename = os.path.basename(path)
if not filename:
filename = 'downloaded.file'
if dest:
filename = os.path.join(dest, filename)
with open(filename, 'wb') as f:
meta = u.info()
meta_func = meta.getheaders if hasattr(meta, 'getheaders') else meta.get_all
meta_length = meta_func("Content-Length")
file_size = None
if meta_length:
file_size = int(meta_length[0])
print("Downloading: {0} Bytes: {1}".format(url, file_size))
file_size_dl = 0
block_sz = 8192
while True:
buffer = u.read(block_sz)
if not buffer:
break
file_size_dl += len(buffer)
f.write(buffer)
status = "{0:16}".format(file_size_dl)
if file_size:
status += " [{0:6.2f}%]".format(file_size_dl * 100 / file_size)
status += chr(13)
print(status, end="")
print()
return filename
if __name__ == "__main__": # Only run if this file is called directly
print("Testing with 10MB download")
url = "http://download.thinkbroadband.com/10MB.zip"
filename = download_file(url)
print(filename)
Simple yet Python 2 & Python 3 compatible way comes with six library:
from six.moves import urllib
urllib.request.urlretrieve("http://www.example.com/songs/mp3.mp3", "mp3.mp3")
Following are the most commonly used calls for downloading files in python:
urllib.urlretrieve ('url_to_file', file_name)
urllib2.urlopen('url_to_file')
requests.get(url)
wget.download('url', file_name)
Note: urlopen and urlretrieve are found to perform relatively bad with downloading large files (size > 500 MB). requests.get stores the file in-memory until download is complete.
Wrote wget library in pure Python just for this purpose. It is pumped up urlretrieve with these features as of version 2.0.
In python3 you can use urllib3 and shutil libraires.
Download them by using pip or pip3 (Depending whether python3 is default or not)
pip3 install urllib3 shutil
Then run this code
import urllib.request
import shutil
url = "http://www.somewebsite.com/something.pdf"
output_file = "save_this_name.pdf"
with urllib.request.urlopen(url) as response, open(output_file, 'wb') as out_file:
shutil.copyfileobj(response, out_file)
Note that you download urllib3 but use urllib in code
I agree with Corey, urllib2 is more complete than urllib and should likely be the module used if you want to do more complex things, but to make the answers more complete, urllib is a simpler module if you want just the basics:
import urllib
response = urllib.urlopen('http://www.example.com/sound.mp3')
mp3 = response.read()
Will work fine. Or, if you don't want to deal with the "response" object you can call read() directly:
import urllib
mp3 = urllib.urlopen('http://www.example.com/sound.mp3').read()
If you have wget installed, you can use parallel_sync.
pip install parallel_sync
from parallel_sync import wget
urls = ['http://something.png', 'http://somthing.tar.gz', 'http://somthing.zip']
wget.download('/tmp', urls)
# or a single file:
wget.download('/tmp', urls[0], filenames='x.zip', extract=True)
Doc:
https://pythonhosted.org/parallel_sync/pages/examples.html
This is pretty powerful. It can download files in parallel, retry upon failure , and it can even download files on a remote machine.
You can get the progress feedback with urlretrieve as well:
def report(blocknr, blocksize, size):
current = blocknr*blocksize
sys.stdout.write("\r{0:.2f}%".format(100.0*current/size))
def downloadFile(url):
print "\n",url
fname = url.split('/')[-1]
print fname
urllib.urlretrieve(url, fname, report)
If speed matters to you, I made a small performance test for the modules urllib and wget, and regarding wget I tried once with status bar and once without. I took three different 500MB files to test with (different files- to eliminate the chance that there is some caching going on under the hood). Tested on debian machine, with python2.
First, these are the results (they are similar in different runs):
$ python wget_test.py
urlretrive_test : starting
urlretrive_test : 6.56
==============
wget_no_bar_test : starting
wget_no_bar_test : 7.20
==============
wget_with_bar_test : starting
100% [......................................................................] 541335552 / 541335552
wget_with_bar_test : 50.49
==============
The way I performed the test is using "profile" decorator. This is the full code:
import wget
import urllib
import time
from functools import wraps
def profile(func):
#wraps(func)
def inner(*args):
print func.__name__, ": starting"
start = time.time()
ret = func(*args)
end = time.time()
print func.__name__, ": {:.2f}".format(end - start)
return ret
return inner
url1 = 'http://host.com/500a.iso'
url2 = 'http://host.com/500b.iso'
url3 = 'http://host.com/500c.iso'
def do_nothing(*args):
pass
#profile
def urlretrive_test(url):
return urllib.urlretrieve(url)
#profile
def wget_no_bar_test(url):
return wget.download(url, out='/tmp/', bar=do_nothing)
#profile
def wget_with_bar_test(url):
return wget.download(url, out='/tmp/')
urlretrive_test(url1)
print '=============='
time.sleep(1)
wget_no_bar_test(url2)
print '=============='
time.sleep(1)
wget_with_bar_test(url3)
print '=============='
time.sleep(1)
urllib seems to be the fastest
Just for the sake of completeness, it is also possible to call any program for retrieving files using the subprocess package. Programs dedicated to retrieving files are more powerful than Python functions like urlretrieve. For example, wget can download directories recursively (-R), can deal with FTP, redirects, HTTP proxies, can avoid re-downloading existing files (-nc), and aria2 can do multi-connection downloads which can potentially speed up your downloads.
import subprocess
subprocess.check_output(['wget', '-O', 'example_output_file.html', 'https://example.com'])
In Jupyter Notebook, one can also call programs directly with the ! syntax:
!wget -O example_output_file.html https://example.com
Late answer, but for python>=3.6 you can use:
import dload
dload.save(url)
Install dload with:
pip3 install dload
Source code can be:
import urllib
sock = urllib.urlopen("http://diveintopython.org/")
htmlSource = sock.read()
sock.close()
print htmlSource
I wrote the following, which works in vanilla Python 2 or Python 3.
import sys
try:
import urllib.request
python3 = True
except ImportError:
import urllib2
python3 = False
def progress_callback_simple(downloaded,total):
sys.stdout.write(
"\r" +
(len(str(total))-len(str(downloaded)))*" " + str(downloaded) + "/%d"%total +
" [%3.2f%%]"%(100.0*float(downloaded)/float(total))
)
sys.stdout.flush()
def download(srcurl, dstfilepath, progress_callback=None, block_size=8192):
def _download_helper(response, out_file, file_size):
if progress_callback!=None: progress_callback(0,file_size)
if block_size == None:
buffer = response.read()
out_file.write(buffer)
if progress_callback!=None: progress_callback(file_size,file_size)
else:
file_size_dl = 0
while True:
buffer = response.read(block_size)
if not buffer: break
file_size_dl += len(buffer)
out_file.write(buffer)
if progress_callback!=None: progress_callback(file_size_dl,file_size)
with open(dstfilepath,"wb") as out_file:
if python3:
with urllib.request.urlopen(srcurl) as response:
file_size = int(response.getheader("Content-Length"))
_download_helper(response,out_file,file_size)
else:
response = urllib2.urlopen(srcurl)
meta = response.info()
file_size = int(meta.getheaders("Content-Length")[0])
_download_helper(response,out_file,file_size)
import traceback
try:
download(
"https://geometrian.com/data/programming/projects/glLib/glLib%20Reloaded%200.5.9/0.5.9.zip",
"output.zip",
progress_callback_simple
)
except:
traceback.print_exc()
input()
Notes:
Supports a "progress bar" callback.
Download is a 4 MB test .zip from my website.
You can use PycURL on Python 2 and 3.
import pycurl
FILE_DEST = 'pycurl.html'
FILE_SRC = 'http://pycurl.io/'
with open(FILE_DEST, 'wb') as f:
c = pycurl.Curl()
c.setopt(c.URL, FILE_SRC)
c.setopt(c.WRITEDATA, f)
c.perform()
c.close()
Use Python Requests in 5 lines
import requests as req
remote_url = 'http://www.example.com/sound.mp3'
local_file_name = 'sound.mp3'
data = req.get(remote_url)
# Save file data to local copy
with open(local_file_name, 'wb')as file:
file.write(data.content)
Now do something with the local copy of the remote file
This may be a little late, But I saw pabloG's code and couldn't help adding a os.system('cls') to make it look AWESOME! Check it out :
import urllib2,os
url = "http://download.thinkbroadband.com/10MB.zip"
file_name = url.split('/')[-1]
u = urllib2.urlopen(url)
f = open(file_name, 'wb')
meta = u.info()
file_size = int(meta.getheaders("Content-Length")[0])
print "Downloading: %s Bytes: %s" % (file_name, file_size)
os.system('cls')
file_size_dl = 0
block_sz = 8192
while True:
buffer = u.read(block_sz)
if not buffer:
break
file_size_dl += len(buffer)
f.write(buffer)
status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
status = status + chr(8)*(len(status)+1)
print status,
f.close()
If running in an environment other than Windows, you will have to use something other then 'cls'. In MAC OS X and Linux it should be 'clear'.
urlretrieve and requests.get are simple, however the reality not.
I have fetched data for couple sites, including text and images, the above two probably solve most of the tasks. but for a more universal solution I suggest the use of urlopen. As it is included in Python 3 standard library, your code could run on any machine that run Python 3 without pre-installing site-package
import urllib.request
url_request = urllib.request.Request(url, headers=headers)
url_connect = urllib.request.urlopen(url_request)
#remember to open file in bytes mode
with open(filename, 'wb') as f:
while True:
buffer = url_connect.read(buffer_size)
if not buffer: break
#an integer value of size of written data
data_wrote = f.write(buffer)
#you could probably use with-open-as manner
url_connect.close()
This answer provides a solution to HTTP 403 Forbidden when downloading file over http using Python. I have tried only requests and urllib modules, the other module may provide something better, but this is the one I used to solve most of the problems.
New Api urllib3 based implementation
>>> import urllib3
>>> http = urllib3.PoolManager()
>>> r = http.request('GET', 'your_url_goes_here')
>>> r.status
200
>>> r.data
*****Response Data****
More info: https://pypi.org/project/urllib3/
You can python requests
import os
import requests
outfile = os.path.join(SAVE_DIR, file_name)
response = requests.get(URL, stream=True)
with open(outfile,'wb') as output:
output.write(response.content)
You can use shutil
import os
import requests
import shutil
outfile = os.path.join(SAVE_DIR, file_name)
response = requests.get(url, stream = True)
with open(outfile, 'wb') as f:
shutil.copyfileobj(response.content, f)
If you are downloading from restricted url, don't forget to include access token in headers
I wanted do download all the files from a webpage. I tried wget but it was failing so I decided for the Python route and I found this thread.
After reading it, I have made a little command line application, soupget, expanding on the excellent answers of PabloG and Stan and adding some useful options.
It uses BeatifulSoup to collect all the URLs of the page and then download the ones with the desired extension(s). Finally it can download multiple files in parallel.
Here it is:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import (division, absolute_import, print_function, unicode_literals)
import sys, os, argparse
from bs4 import BeautifulSoup
# --- insert Stan's script here ---
# if sys.version_info >= (3,):
#...
#...
# def download_file(url, dest=None):
#...
#...
# --- new stuff ---
def collect_all_url(page_url, extensions):
"""
Recovers all links in page_url checking for all the desired extensions
"""
conn = urllib2.urlopen(page_url)
html = conn.read()
soup = BeautifulSoup(html, 'lxml')
links = soup.find_all('a')
results = []
for tag in links:
link = tag.get('href', None)
if link is not None:
for e in extensions:
if e in link:
# Fallback for badly defined links
# checks for missing scheme or netloc
if bool(urlparse.urlparse(link).scheme) and bool(urlparse.urlparse(link).netloc):
results.append(link)
else:
new_url=urlparse.urljoin(page_url,link)
results.append(new_url)
return results
if __name__ == "__main__": # Only run if this file is called directly
# Command line arguments
parser = argparse.ArgumentParser(
description='Download all files from a webpage.')
parser.add_argument(
'-u', '--url',
help='Page url to request')
parser.add_argument(
'-e', '--ext',
nargs='+',
help='Extension(s) to find')
parser.add_argument(
'-d', '--dest',
default=None,
help='Destination where to save the files')
parser.add_argument(
'-p', '--par',
action='store_true', default=False,
help="Turns on parallel download")
args = parser.parse_args()
# Recover files to download
all_links = collect_all_url(args.url, args.ext)
# Download
if not args.par:
for l in all_links:
try:
filename = download_file(l, args.dest)
print(l)
except Exception as e:
print("Error while downloading: {}".format(e))
else:
from multiprocessing.pool import ThreadPool
results = ThreadPool(10).imap_unordered(
lambda x: download_file(x, args.dest), all_links)
for p in results:
print(p)
An example of its usage is:
python3 soupget.py -p -e <list of extensions> -d <destination_folder> -u <target_webpage>
And an actual example if you want to see it in action:
python3 soupget.py -p -e .xlsx .pdf .csv -u https://healthdata.gov/dataset/chemicals-cosmetics
Another possibility is with built-in http.client:
from http import HTTPStatus, client
from shutil import copyfileobj
# using https
connection = client.HTTPSConnection("www.example.com")
with connection.request("GET", "/noise.mp3") as response:
if response.status == HTTPStatus.OK:
copyfileobj(response, open("noise.mp3")
else:
raise Exception("request needs work")
The HTTPConnection object is considered “low-level” in that it performs the desired request once and assumes the developer will subclass it or script in a way to handle the nuances of HTTP. Libraries such as requests tend to handle more special cases such as automatically following redirects and so on.
You can use keras.utils.get_file to do it:
from tensorflow import keras
path_to_downloaded_file = keras.utils.get_file(
fname="file name",
origin="https://www.linktofile.com/link/to/file",
extract=True,
archive_format="zip", # downloaded file format
cache_dir="/", # cache and extract in current directory
)
Another way is to call an external process such as curl.exe. Curl by default displays a progress bar, average download speed, time left, and more all formatted neatly in a table.
Put curl.exe in the same directory as your script
from subprocess import call
url = ""
call(["curl", {url}, '--output', "song.mp3"])
Note: You cannot specify an output path with curl, so do an os.rename afterwards

Change twitter banner from url

How would I go by changing the twitter banner using an image from url using tweepy library: https://github.com/tweepy/tweepy/blob/v2.3.0/tweepy/api.py#L392
So far I got this and it returns:
def banner(self):
url = 'https://blog.snappa.com/wp-content/uploads/2019/01/Twitter-Header-Size.png'
file = requests.get(url)
self.api.update_profile_banner(filename=file.content)
ValueError: stat: embedded null character in path
It seems like filename requires an image to be downloaded. Anyway to process this without downloading the image and then removing it?
Looking at library's code you can do what you want.
def update_profile_banner(self, filename, *args, **kargs):
f = kargs.pop('file', None)
So what you need to do is supply the filename and the file kwarg:
filename = url.split('/')[-1]
self.api.update_profile_banner(filename, file=file.content)
import tempfile
def banner():
url = 'file_url'
file = requests.get(url)
temp = tempfile.NamedTemporaryFile(suffix=".png")
try:
temp.write(file.content)
self.api.update_profile_banner(filename=temp.name)
finally:
temp.close()

How to download a few files simultaneusly from ftp in Python

I'm a newbie in Python programming.
My question is, how to download a few files at the same time. Not file by file but simultaneously from one directory on ftp. Now I use this script but I don't know how I can rebuild this code:
filenames = []
ftp.retrlines("NLST", filenames.append)
print filenames
print path
for filename in filenames:
local_filename = filename
print filename
print local_filename
f = open(local_filename, "wb")
s = ftp.size(local_filename)
sMB = s/(1024*1024)
print "file name: " + local_filename + "\nfile size: " + str(sMB) + " MB"
ftp.retrbinary("RETR %s" % local_filename, f.write)
print "\n Done :) "
time.sleep(2)
f.close()
ftp.quit() #closing connection
time.sleep(5)
It works fine, but not what I need.
You could use multiple threads or processes. Make sure you create a new ftplib.FTP object in each thread. The simplest way (code-wise) is to use multiprocessing.Pool:
#!/usr/bin/env python
from multiprocessing.dummy import Pool # use threads
try:
from urllib import urlretrieve
except ImportError: # Python 3
from urllib.request import urlretrieve
def download(url):
url = url.strip()
try:
return urlretrieve(url, url2filename(url)), None
except Exception as e:
return None, e
if __name__ == "__main__":
p = Pool(20) # specify number of concurrent downloads
print(p.map(download, open('urls'))) # perform parallel downloads
where urls contains ftp urls for the files to download e.g., ftp://example.com/path/to/file and url2filename() extracts the filename part from an url e.g.:
import os
import posixpath
try:
from urlparse import urlsplit
from urllib import unquote
except ImportError: # Python 3
from urllib.parse import urlsplit, unquote
def url2filename(url, encoding='utf-8'):
"""Return basename corresponding to url.
>>> print url2filename('http://example.com/path/to/dir%2Ffile%C3%80?opt=1')
fileÀ
"""
urlpath = urlsplit(url).path
basename = posixpath.basename(unquote(urlpath))
if os.path.basename(basename) != basename:
raise ValueError(url) # reject 'dir%5Cbasename.ext' on Windows
return basename

How do I save a file using python's httplib2?

I can run the following code:
import httplib2
h = httplib2.Http('.cache')
response, content = h.request('http://2.bp.blogspot.com/-CXFfl9luHPM/TV-Os6opQfI/AAAAAAAAA2E/oCgrgvWqzrY/s1600/cow.jpg')
print(response.status)
with open('cow.jpg', 'wb') as f:
f.write(content)
When I run the code, I download a file called cow.jpg which is what I want, but I also get a duplicate image with a different name called: 2.bp.blogspot.com,-CXFfl9luHPM,TV-Os6opQfI,AAAAAAAAA2E,oCgrgvWqzrY,s1600,cow.jpg,77ba31012a25509bfdc78bea4e1bfdd1. It's the http address with commas plus other junk. Any ideas on how I can create only one image using httplib2? Thanks.
Just write the content to a file:
with open('cow.jpg', 'wb') as f:
f.write(content)
Use urllib and method urlretrieve, the second argument is the file location.
for python 2.x
import urllib
urllib.urlretrieve(URL, path_destination)
Is using urllib2 ok for you, too? If yes, you can use this function:
def download_file(url):
"""Create an urllib2 request and return the request plus some useful info"""
name = filename_from_url(url)
r = urllib2.urlopen(urllib2.Request(url))
info = r.info()
if 'Content-Disposition' in info:
# If the response has Content-Disposition, we take filename from it
name = info['Content-Disposition'].split('filename=')[1]
if name[0] == '"' or name[0] == "'":
name = name[1:-1]
elif r.geturl() != url:
# if we were redirected, take the filename from the final url
name = filename_from_url(r.geturl())
content_type = None
if 'Content-Type' in info:
content_type = info['Content-Type'].split(';')[0]
# Try to guess missing info
if not name and not content_type:
name = 'unknown'
elif not name:
name = 'unknown' + mimetypes.guess_extension(content_type) or ''
elif not content_type:
content_type = mimetypes.guess_type(name)[0]
return r, name, content_type
Usage:
fp, filename, content_type = download_file('http://url/to/some/file')
with open('somefile', 'w') as dst:
shutil.copyfileobj(fp, dst)
This code has the advantage that is never reads the whole file into memory - so it works fine for huge files, too. Besides that, it also gives you the filename received from the server and the content-type in case you want/need it.

How to download a file using python in a 'smarter' way?

I need to download several files via http in Python.
The most obvious way to do it is just using urllib2:
import urllib2
u = urllib2.urlopen('http://server.com/file.html')
localFile = open('file.html', 'w')
localFile.write(u.read())
localFile.close()
But I'll have to deal with the URLs that are nasty in some way, say like this: http://server.com/!Run.aspx/someoddtext/somemore?id=121&m=pdf. When downloaded via the browser, the file has a human-readable name, ie. accounts.pdf.
Is there any way to handle that in python, so I don't need to know the file names and hardcode them into my script?
Download scripts like that tend to push a header telling the user-agent what to name the file:
Content-Disposition: attachment; filename="the filename.ext"
If you can grab that header, you can get the proper filename.
There's another thread that has a little bit of code to offer up for Content-Disposition-grabbing.
remotefile = urllib2.urlopen('http://example.com/somefile.zip')
remotefile.info()['Content-Disposition']
Based on comments and #Oli's anwser, I made a solution like this:
from os.path import basename
from urlparse import urlsplit
def url2name(url):
return basename(urlsplit(url)[2])
def download(url, localFileName = None):
localName = url2name(url)
req = urllib2.Request(url)
r = urllib2.urlopen(req)
if r.info().has_key('Content-Disposition'):
# If the response has Content-Disposition, we take file name from it
localName = r.info()['Content-Disposition'].split('filename=')[1]
if localName[0] == '"' or localName[0] == "'":
localName = localName[1:-1]
elif r.url != url:
# if we were redirected, the real file name we take from the final URL
localName = url2name(r.url)
if localFileName:
# we can force to save the file as specified name
localName = localFileName
f = open(localName, 'wb')
f.write(r.read())
f.close()
It takes file name from Content-Disposition; if it's not present, uses filename from the URL (if redirection happened, the final URL is taken into account).
Combining much of the above, here is a more pythonic solution:
import urllib2
import shutil
import urlparse
import os
def download(url, fileName=None):
def getFileName(url,openUrl):
if 'Content-Disposition' in openUrl.info():
# If the response has Content-Disposition, try to get filename from it
cd = dict(map(
lambda x: x.strip().split('=') if '=' in x else (x.strip(),''),
openUrl.info()['Content-Disposition'].split(';')))
if 'filename' in cd:
filename = cd['filename'].strip("\"'")
if filename: return filename
# if no filename was found above, parse it out of the final URL.
return os.path.basename(urlparse.urlsplit(openUrl.url)[2])
r = urllib2.urlopen(urllib2.Request(url))
try:
fileName = fileName or getFileName(url,r)
with open(fileName, 'wb') as f:
shutil.copyfileobj(r,f)
finally:
r.close()
2 Kender:
if localName[0] == '"' or localName[0] == "'":
localName = localName[1:-1]
it is not safe -- web server can pass wrong formatted name as ["file.ext] or [file.ext'] or even be empty and localName[0] will raise exception.
Correct code can looks like this:
localName = localName.replace('"', '').replace("'", "")
if localName == '':
localName = SOME_DEFAULT_FILE_NAME
Using wget:
custom_file_name = "/custom/path/custom_name.ext"
wget.download(url, custom_file_name)
Using urlretrieve:
urllib.urlretrieve(url, custom_file_name)
urlretrieve also creates the directory structure if not exists.
You need to look into 'Content-Disposition' header, see the solution by kender.
How to download a file using python in a 'smarter' way?
Posting his solution modified with a capability to specify an output folder:
from os.path import basename
import os
from urllib.parse import urlsplit
import urllib.request
def url2name(url):
return basename(urlsplit(url)[2])
def download(url, out_path):
localName = url2name(url)
req = urllib.request.Request(url)
r = urllib.request.urlopen(req)
if r.info().has_key('Content-Disposition'):
# If the response has Content-Disposition, we take file name from it
localName = r.info()['Content-Disposition'].split('filename=')[1]
if localName[0] == '"' or localName[0] == "'":
localName = localName[1:-1]
elif r.url != url:
# if we were redirected, the real file name we take from the final URL
localName = url2name(r.url)
localName = os.path.join(out_path, localName)
f = open(localName, 'wb')
f.write(r.read())
f.close()
download("https://example.com/demofile", '/home/username/tmp')
I have just updated the answer of kender for python3

Categories