I'm using the Django admin to upload large files to another server (a download host).
The files are usually around 100 MB.
I'm currently using FTP, based on this.
It works fine with files smaller than 1 MB, but, as the documentation itself notes, it doesn't work with larger files, and I get a 503 when the upload finishes in the Django admin.
I've searched a lot for another way to do this, but it seems there is no other way in Django.
Can you help me?
This is my settings.py
FTP_STORAGE_LOCATION = 'ftp://<myuser>:<mypass>@<host>:<port>/[path]'
my models.py
from . import ftp
fs = ftp.FTPStorage()
def my_awesome_upload_function(instance, filename):
return os.path.join('public_ftp/public/{}/'.format(instance.get_directory()), filename)
class Video(models.Model):
video_file_ftp = models.FileField(upload_to = my_awesome_upload_function, storage=fs)
this is ftp.py
# FTP storage class for Django pluggable storage system.
# Author: Rafal Jonca <jonca.rafal@gmail.com>
# License: MIT
# Comes from http://www.djangosnippets.org/snippets/1269/
#
# Usage:
#
# Add below to settings.py:
# FTP_STORAGE_LOCATION = '[a]ftp://<user>:<pass>@<host>:<port>/[path]'
#
# In models.py you can write:
# from FTPStorage import FTPStorage
# fs = FTPStorage()
# class FTPTest(models.Model):
# file = models.FileField(upload_to='a/b/c/', storage=fs)
import ftplib
import io
import os
from datetime import datetime
from urllib.parse import urljoin, urlparse
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from django.core.files.base import File
from django.core.files.storage import Storage
from django.utils.deconstruct import deconstructible
from storages.utils import setting
class FTPStorageException(Exception):
pass
@deconstructible
class FTPStorage(Storage):
"""FTP Storage class for Django pluggable storage system."""
def __init__(self, location=None, base_url=None, encoding=None):
location = location or setting('FTP_STORAGE_LOCATION')
if location is None:
raise ImproperlyConfigured("You must set a location at "
"instantiation or in "
"settings.FTP_STORAGE_LOCATION.")
self.location = location
self.encoding = encoding or setting('FTP_STORAGE_ENCODING') or 'latin-1'
base_url = base_url or settings.MEDIA_URL
self._config = self._decode_location(location)
self._base_url = base_url
self._connection = None
def _decode_location(self, location):
"""Return splitted configuration data from location."""
splitted_url = urlparse(location)
config = {}
if splitted_url.scheme not in ('ftp', 'aftp'):
raise ImproperlyConfigured(
'FTPStorage works only with FTP protocol!'
)
if not splitted_url.hostname:
raise ImproperlyConfigured('You must at least provide hostname!')
if splitted_url.scheme == 'aftp':
config['active'] = True
else:
config['active'] = False
config['path'] = splitted_url.path
config['host'] = splitted_url.hostname
config['user'] = splitted_url.username
config['passwd'] = splitted_url.password
config['port'] = int(splitted_url.port)
return config
def _start_connection(self):
# Check if connection is still alive and if not, drop it.
if self._connection is not None:
try:
self._connection.pwd()
except ftplib.all_errors:
self._connection = None
# Real reconnect
if self._connection is None:
ftp = ftplib.FTP()
ftp.encoding = self.encoding
try:
ftp.connect(self._config['host'], self._config['port'])
ftp.login(self._config['user'], self._config['passwd'])
if self._config['active']:
ftp.set_pasv(False)
if self._config['path'] != '':
ftp.cwd(self._config['path'])
self._connection = ftp
return
except ftplib.all_errors:
raise FTPStorageException(
'Connection or login error using data %s'
% repr(self._config)
)
def disconnect(self):
self._connection.quit()
self._connection = None
def _mkremdirs(self, path):
pwd = self._connection.pwd()
path_splitted = path.split(os.path.sep)
for path_part in path_splitted:
try:
self._connection.cwd(path_part)
except ftplib.all_errors:
try:
self._connection.mkd(path_part)
self._connection.cwd(path_part)
except ftplib.all_errors:
raise FTPStorageException(
'Cannot create directory chain %s' % path
)
self._connection.cwd(pwd)
return
def _put_file(self, name, content):
# Connection must be open!
try:
self._mkremdirs(os.path.dirname(name))
pwd = self._connection.pwd()
self._connection.cwd(os.path.dirname(name))
self._connection.storbinary('STOR ' + os.path.basename(name),
content.file,
content.DEFAULT_CHUNK_SIZE)
self._connection.cwd(pwd)
except ftplib.all_errors:
raise FTPStorageException('Error writing file %s' % name)
def _open(self, name, mode='rb'):
remote_file = FTPStorageFile(name, self, mode=mode)
return remote_file
def _read(self, name):
memory_file = io.BytesIO()
try:
pwd = self._connection.pwd()
self._connection.cwd(os.path.dirname(name))
self._connection.retrbinary('RETR ' + os.path.basename(name),
memory_file.write)
self._connection.cwd(pwd)
memory_file.seek(0)
return memory_file
except ftplib.all_errors:
raise FTPStorageException('Error reading file %s' % name)
def _save(self, name, content):
content.open()
self._start_connection()
self._put_file(name, content)
content.close()
return name
def _get_dir_details(self, path):
# Connection must be open!
try:
lines = []
self._connection.retrlines('LIST ' + path, lines.append)
dirs = {}
files = {}
for line in lines:
words = line.split()
if len(words) < 6:
continue
if words[-2] == '->':
continue
if words[0][0] == 'd':
dirs[words[-1]] = 0
elif words[0][0] == '-':
files[words[-1]] = int(words[-5])
return dirs, files
except ftplib.all_errors:
raise FTPStorageException('Error getting listing for %s' % path)
def modified_time(self, name):
self._start_connection()
resp = self._connection.sendcmd('MDTM ' + name)
if resp[:3] == '213':
s = resp[3:].strip()
# workaround for broken FTP servers returning responses
# starting with e.g. 1904... instead of 2004...
if len(s) == 15 and s[:2] == '19':
s = str(1900 + int(s[2:5])) + s[5:]
return datetime.strptime(s, '%Y%m%d%H%M%S')
raise FTPStorageException(
'Error getting modification time of file %s' % name
)
def listdir(self, path):
self._start_connection()
try:
dirs, files = self._get_dir_details(path)
return list(dirs.keys()), list(files.keys())
except FTPStorageException:
raise
def delete(self, name):
if not self.exists(name):
return
self._start_connection()
try:
self._connection.delete(name)
except ftplib.all_errors:
raise FTPStorageException('Error when removing %s' % name)
def exists(self, name):
self._start_connection()
try:
nlst = self._connection.nlst(
os.path.dirname(name) + '/'
)
if name in nlst or os.path.basename(name) in nlst:
return True
else:
return False
except ftplib.error_temp:
return False
except ftplib.error_perm:
# error_perm: 550 Can't find file
return False
except ftplib.all_errors:
raise FTPStorageException('Error when testing existence of %s'
% name)
def size(self, name):
self._start_connection()
try:
dirs, files = self._get_dir_details(os.path.dirname(name))
if os.path.basename(name) in files:
return files[os.path.basename(name)]
else:
return 0
except FTPStorageException:
return 0
def url(self, name):
if self._base_url is None:
raise ValueError("This file is not accessible via a URL.")
return urljoin(self._base_url, name).replace('\\', '/')
class FTPStorageFile(File):
def __init__(self, name, storage, mode):
self.name = name
self._storage = storage
self._mode = mode
self._is_dirty = False
self.file = io.BytesIO()
self._is_read = False
@property
def size(self):
if not hasattr(self, '_size'):
self._size = self._storage.size(self.name)
return self._size
def readlines(self):
if not self._is_read:
self._storage._start_connection()
self.file = self._storage._read(self.name)
self._is_read = True
return self.file.readlines()
def read(self, num_bytes=None):
if not self._is_read:
self._storage._start_connection()
self.file = self._storage._read(self.name)
self._is_read = True
return self.file.read(num_bytes)
def write(self, content):
if 'w' not in self._mode:
raise AttributeError("File was opened for read-only access.")
self.file = io.BytesIO(content)
self._is_dirty = True
self._is_read = True
def close(self):
if self._is_dirty:
self._storage._start_connection()
self._storage._put_file(self.name, self)
self._storage.disconnect()
self.file.close()
I simply get a 503 Service Unavailable in my browser when I try to upload large files, but with files smaller than 1 MB everything works.
I'm trying to get the list of files that have been fully uploaded to the FTP server.
I have access to an FTP server where a third party writes data and marker files every 15 minutes. Once a data file is completely uploaded, a marker file gets created; when the marker file is there, we know the data files are ready and we can download them. I'm looking for an efficient way to approach this: check every minute whether there are any new stable files on the FTP server, and if there are, download them. One preferred approach is to treat a marker file that is at least 2 minutes old as safe to download, along with its corresponding data file.
I'm new to Python and looking for help.
So far I have code that lists the files:
import paramiko
from datetime import datetime, timedelta
FTP_HOST = 'host_address'
FTP_PORT = 21
FTP_USERNAME = 'username'
FTP_PASSWORD = 'password'
FTP_ROOT_PATH = 'path_to_dir'
def today():
return datetime.strftime(datetime.now(), '%Y%m%d')
def open_ftp_connection(ftp_host, ftp_port, ftp_username, ftp_password):
"""
Opens ftp connection and returns connection object
"""
client = paramiko.SSHClient()
client.load_system_host_keys()
try:
transport = paramiko.Transport(ftp_host, ftp_port)
except Exception as e:
return 'conn_error'
try:
transport.connect(username=ftp_username, password=ftp_password)
except Exception as identifier:
return 'auth_error'
ftp_connection = paramiko.SFTPClient.from_transport(transport)
return ftp_connection
def show_ftp_files_stat():
ftp_connection = open_ftp_connection(FTP_HOST, int(FTP_PORT), FTP_USERNAME, FTP_PASSWORD)
full_ftp_path = FTP_ROOT_PATH + "/" + today()
file_attr_list = ftp_connection.listdir_attr(full_ftp_path)
print(file_attr_list)
for file_attr in file_attr_list:
print(file_attr.filename, file_attr.st_size, file_attr.st_mtime)
if __name__ == '__main__':
show_ftp_files_stat()
Sample file name
org-reference-delta-quotes.REF.48C2.20200402.92.1.1.txt.gz
Sample corresponding marker file name
org-reference-delta-quotes.REF.48C2.20200402.92.note.txt.gz
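For illustration only, one way to pair a marker file with its data files is to match on the shared prefix before the ".note." part. This is a hedged sketch that assumes the naming convention shown in the samples above; adjust the split logic if the real convention differs:
def data_files_for_marker(marker_name, all_names):
    # Assumption: the marker differs from its data files only in the trailing ".note.txt.gz" part.
    prefix = marker_name.split('.note.')[0]  # e.g. "org-reference-delta-quotes.REF.48C2.20200402.92"
    return [name for name in all_names
            if name.startswith(prefix + '.') and '.note.' not in name]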
I solved my use case with a 2-minute stability rule: if a file's modified time is more than 2 minutes before the current time, I consider it stable.
import logging
import time
from datetime import datetime, timezone
from ftplib import FTP
FTP_HOST = 'host_address'
FTP_PORT = 21
FTP_USERNAME = 'username'
FTP_PASSWORD = 'password'
FTP_ROOT_PATH = 'path_to_dir'
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
def today():
return datetime.strftime(datetime.now(tz=timezone.utc), '%Y%m%d')
def current_utc_ts():
return datetime.utcnow().timestamp()
def current_utc_ts_minus_120():
return int(datetime.utcnow().timestamp()) - 120
def yyyymmddhhmmss_string_epoch_ts(dt_string):
return time.mktime(time.strptime(dt_string, '%Y%m%d%H%M%S'))
def get_ftp_connection(ftp_host, ftp_username, ftp_password):
try:
ftp = FTP(ftp_host, ftp_username, ftp_password)
except Exception as e:
print(e)
logger.error(e)
return 'conn_error'
return ftp
def get_list_of_files(ftp_connection, date_to_process):
full_ftp_path = FTP_ROOT_PATH + "/" + date_to_process + "/"
ftp_connection.cwd(full_ftp_path)
entries = list(ftp_connection.mlsd())
entry_list = [line for line in entries if line[0].endswith('.gz') or line[0].endswith('.zip')]
ftp_connection.quit()
print('Total file count', len(entry_list))
return entry_list
def parse_file_list_to_dict(entries):
try:
file_dict_list = []
for line in entries:
file_dict = dict({"file_name": line[0],
"server_timestamp": int(yyyymmddhhmmss_string_epoch_ts(line[1]['modify'])),
"server_date": line[0].split(".")[3])
file_dict_list.append(file_dict)
except IndexError as e:
# Output expected IndexErrors.
logging.exception(e)
except Exception as exception:
# Output unexpected Exceptions.
logging.exception(exception)
return file_dict_list
def get_stable_files_dict_list(dict_list):
stable_list = list(filter(lambda d: d['server_timestamp'] < current_utc_ts_minus_120(), dict_list))
print('stable file count: {}'.format(len(stable_list)))
return stable_list
if __name__ == '__main__':
ftp_connection = get_ftp_connection(FTP_HOST, FTP_USERNAME, FTP_PASSWORD)
if ftp_connection == 'conn_error':
logger.error('Failed to connect FTP Server!')
else:
file_list = get_list_of_files(ftp_connection, today())
parse_file_list = parse_file_list_to_dict(file_list)
stable_file_list = get_stable_files_dict_list(parse_file_list)
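To complete the flow described in the question (poll every minute, then download what is stable), here is a minimal sketch built on the same helpers; the download directory, the 60-second interval, and opening a fresh connection per cycle are my assumptions, not part of the original code:
import os
def download_stable_files(stable_files, date_to_process, download_dir='.'):
    # Open a fresh connection because get_list_of_files() already called quit().
    ftp = get_ftp_connection(FTP_HOST, FTP_USERNAME, FTP_PASSWORD)
    ftp.cwd(FTP_ROOT_PATH + "/" + date_to_process + "/")
    for entry in stable_files:
        local_path = os.path.join(download_dir, entry['file_name'])
        with open(local_path, 'wb') as fh:
            ftp.retrbinary('RETR ' + entry['file_name'], fh.write)
    ftp.quit()
# Hypothetical polling loop, checking once a minute as described in the question:
# while True:
#     conn = get_ftp_connection(FTP_HOST, FTP_USERNAME, FTP_PASSWORD)
#     if conn != 'conn_error':
#         files = get_list_of_files(conn, today())
#         stable = get_stable_files_dict_list(parse_file_list_to_dict(files))
#         download_stable_files(stable, today())
#     time.sleep(60)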
I'm using a combination of the GCS python SDK and google API client to loop through a version-enabled bucket and download specific objects based on metadata.
from google.cloud import storage
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
def downloadepoch_objects():
request = service.objects().list(
bucket=bucket_name,
versions=True
)
response = request.execute()
for item in response['items']:
if item['metadata']['epoch'] == restore_epoch:
print(item['bucket'])
print(item['name'])
print(item['metadata']['epoch'])
print(item['updated'])
blob = source_bucket.blob(item['name'])
blob.download_to_filename(
'/Users/admin/git/data-processing/{}'.format(item['name']))
downloadepoch_objects()
The above function works properly for a blob that is not within a directory (gs://bucketname/test1.txt), since the item that gets passed in is simply test1.txt. The issue I'm running into is when trying to download files from a complex directory tree (gs://bucketname/nfs/media/docs/test1.txt); the item that gets passed is nfs/media/docs/test1.txt. Is it possible to have the .download_to_filename() method create directories if they are not present?
Below is the working solution. I ended up stripping the path from the object name and creating the directory structure on the fly. A better way might be, as @Brandon Yarbrough suggested, using 'prefix + response['prefixes'][0]', but I couldn't quite figure that out. Hope this helps others.
#!/usr/local/bin/python3
from google.cloud import storage
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
import json
import os
import pathlib
bucket_name = 'test-bucket'
restore_epoch = '1519189202'
restore_location = '/Users/admin/data/'
credentials = GoogleCredentials.get_application_default()
service = discovery.build('storage', 'v1', credentials=credentials)
storage_client = storage.Client()
source_bucket = storage_client.get_bucket(bucket_name)
def listall_objects():
request = service.objects().list(
bucket=bucket_name,
versions=True
)
response = request.execute()
print(json.dumps(response, indent=2))
def listname_objects():
request = service.objects().list(
bucket=bucket_name,
versions=True
)
response = request.execute()
for item in response['items']:
print(item['name'] + ' Uploaded on: ' + item['updated'] +
' Epoch: ' + item['metadata']['epoch'])
def downloadepoch_objects():
request = service.objects().list(
bucket=bucket_name,
versions=True
)
response = request.execute()
try:
for item in response['items']:
if item['metadata']['epoch'] == restore_epoch:
print('Downloading ' + item['name'] + ' from ' +
item['bucket'] + '; Epoch= ' + item['metadata']['epoch'])
print('Saving to: ' + restore_location)
blob = source_bucket.blob(item['name'])
path = pathlib.Path(restore_location + item['name']).parent
if os.path.isdir(path):
blob.download_to_filename(restore_location + '{}'.format(item['name']))
print('Download complete')
else:
os.makedirs(path)
blob.download_to_filename(restore_location + '{}'.format(item['name']))
print('Download complete')
except Exception:
pass
# listall_objects()
# listname_objects()
downloadepoch_objects()
GCS does not have a notion of "directories," although tools like gsutil do a good job of pretending for convenience. If you want all of the objects under the "nfs/media/docs/" path, you can specify that as a prefix, like so:
request = service.objects().list(
bucket=bucket_name,
versions=True,
prefix='nfs/media/docs/', # Only show objects beginning like this
delimiter='/' # Consider this character a directory marker.
)
response = request.execute()
subdirectories = response['prefixes']
objects = response['items']
Because of the prefix parameter, only objects that begin with 'nfs/media/docs' will be returned in response['items']. Because of the delimiter parameter, "subdirectories" will be returned in response['prefixes']. You can get more details in the Python documentation of the objects.list method.
If you were to use the newer google-cloud Python library, which I'd recommend for new code, the same call would look pretty similar:
from google.cloud import storage
client = storage.Client()
bucket = client.bucket(bucket_name)
iterator = bucket.list_blobs(
versions=True,
prefix='nfs/media/docs/',
delimiter='/'
)
subdirectories = iterator.prefixes
objects = list(iterator)
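To tie this back to the question about nested object names, here is a minimal sketch that recreates the local directory tree before each download; the bucket name 'test-bucket' and local target '/Users/admin/data' are just the examples from the question:
import os
from google.cloud import storage

client = storage.Client()
bucket = client.bucket('test-bucket')  # bucket name taken from the question
for blob in bucket.list_blobs(prefix='nfs/media/docs/'):
    if blob.name.endswith('/'):
        continue  # skip zero-byte "directory placeholder" objects, if any
    local_path = os.path.join('/Users/admin/data', blob.name)
    # Create any missing parent directories, then download into them.
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    blob.download_to_filename(local_path)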
The following solution worked for me. I recursively download all blobs from a path prefix into a model directory at the project root, while maintaining the folder structure.
Multiple blobs are downloaded concurrently.
GCS client version
google-cloud-storage==1.41.1
import logging
import os
from datetime import datetime
from google.cloud import storage
from concurrent.futures import ThreadPoolExecutor
logger = logging.getLogger(__name__)
BUCKET_NAME = "ml-model"
def timer(func):
def time_wrapper(*arg, **kwargs):
start = datetime.now()
func(*arg, **kwargs)
diff = datetime.now() - start
logger.info(f"{func.__name__} took {diff.seconds} s and {diff.microseconds//1000} ms")
return time_wrapper
def fetch_environment() -> str:
env = os.environ.get("environment", "staging")
return env
def create_custom_folder(dir_name: str):
if not os.path.exists(dir_name):
os.makedirs(dir_name)
def fetch_gcs_credential_file_path():
return os.environ.get("GCS_CREDENTIAL_FILE_PATH")
class GCS:
def __init__(self):
cred_file_path = fetch_gcs_credential_file_path()
self.client = storage.Client.from_service_account_json(cred_file_path)
self.bucket = self.client.bucket(BUCKET_NAME)
def download_blob(self, blob):
filename = blob.name.replace(self.path_prefix, '')
delimiter_based_splits = filename.split('/')
if len(delimiter_based_splits) > 1:
dir_name = "model/" + "/".join(delimiter_based_splits[: len(delimiter_based_splits)-1])
create_custom_folder(dir_name)
blob.download_to_filename(f"{dir_name}/{delimiter_based_splits[-1]}")
else:
blob.download_to_filename(f"model/" + filename)
@timer
def download_blobs_multithreaded(self, prefix: str):
'''
CREATE FOLDER IF NOT EXISTS
'''
create_custom_folder("model")
blobs = self.bucket.list_blobs(prefix=prefix)
self.path_prefix = prefix
with ThreadPoolExecutor() as executor:
executor.map(self.download_blob, blobs)
def download_model():
env = fetch_environment()
folder_path_prefix = f"ml/{env}/{ML_MODEL_NAME}/v1/tf-saved-model/"
gcs = GCS()
gcs.download_blobs_multithreaded(folder_path_prefix)
if __name__ == '__main__':
download_model()
I am uploading a large file using the Python requests package, and I can't find any way to give data back about the progress of the upload. I have seen a number of progress meters for downloading a file, but these will not work for a file upload.
The ideal solution would be some sort of callback method such as:
def progress(percent):
print percent
r = requests.post(URL, files={'f':hugeFileHandle}, callback=progress)
Thanks in advance for your help :)
requests doesn't support upload streaming, e.g. the following fails:
import os
import sys
import requests # pip install requests
class upload_in_chunks(object):
def __init__(self, filename, chunksize=1 << 13):
self.filename = filename
self.chunksize = chunksize
self.totalsize = os.path.getsize(filename)
self.readsofar = 0
def __iter__(self):
with open(self.filename, 'rb') as file:
while True:
data = file.read(self.chunksize)
if not data:
sys.stderr.write("\n")
break
self.readsofar += len(data)
percent = self.readsofar * 1e2 / self.totalsize
sys.stderr.write("\r{percent:3.0f}%".format(percent=percent))
yield data
def __len__(self):
return self.totalsize
# XXX fails
r = requests.post("http://httpbin.org/post",
data=upload_in_chunks(__file__, chunksize=10))
By the way, if you don't need to report progress, you could use a memory-mapped file to upload a large file.
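A minimal sketch of that memory-mapped approach (the file name and URL are placeholders); requests treats the mmap object as a file-like body and can send its length up front:
import mmap
import requests

with open("large_file.bin", "rb") as f:
    with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
        r = requests.post("http://httpbin.org/post", data=mm)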
To work around it, you could create a file adapter similar to the one from
urllib2 POST progress monitoring:
class IterableToFileAdapter(object):
def __init__(self, iterable):
self.iterator = iter(iterable)
self.length = len(iterable)
def read(self, size=-1): # TBD: add buffer for `len(data) > size` case
return next(self.iterator, b'')
def __len__(self):
return self.length
Example
it = upload_in_chunks(__file__, 10)
r = requests.post("http://httpbin.org/post", data=IterableToFileAdapter(it))
# pretty print
import json
json.dump(r.json(), sys.stdout, indent=4, ensure_ascii=False)
I recommend using a package named requests-toolbelt, which makes monitoring uploaded bytes very easy, like this:
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor
import requests
def my_callback(monitor):
# Your callback function
print(monitor.bytes_read)
e = MultipartEncoder(
fields={'field0': 'value', 'field1': 'value',
'field2': ('filename', open('file.py', 'rb'), 'text/plain')}
)
m = MultipartEncoderMonitor(e, my_callback)
r = requests.post('http://httpbin.org/post', data=m,
headers={'Content-Type': m.content_type})
And you may want to read this to show a progress bar.
I got it working with the code from here: Simple file upload progressbar in PyQt.
I changed it a bit, to use BytesIO instead of StringIO.
from io import BytesIO
import requests
class CancelledError(Exception):
def __init__(self, msg):
self.msg = msg
Exception.__init__(self, msg)
def __str__(self):
return self.msg
__repr__ = __str__
class BufferReader(BytesIO):
def __init__(self, buf=b'',
callback=None,
cb_args=(),
cb_kwargs={}):
self._callback = callback
self._cb_args = cb_args
self._cb_kwargs = cb_kwargs
self._progress = 0
self._len = len(buf)
BytesIO.__init__(self, buf)
def __len__(self):
return self._len
def read(self, n=-1):
chunk = BytesIO.read(self, n)
self._progress += int(len(chunk))
self._cb_kwargs.update({
'size' : self._len,
'progress': self._progress
})
if self._callback:
try:
self._callback(*self._cb_args, **self._cb_kwargs)
except: # catches exception from the callback
raise CancelledError('The upload was cancelled.')
return chunk
def progress(size=None, progress=None):
print("{0} / {1}".format(size, progress))
files = {"upfile": ("file.bin", open("file.bin", 'rb').read())}
(data, ctype) = requests.packages.urllib3.filepost.encode_multipart_formdata(files)
headers = {
"Content-Type": ctype
}
body = BufferReader(data, progress)
requests.post(url, data=body, headers=headers)
The trick is to generate the data and headers from the files dict manually, using encode_multipart_formdata() from urllib3.
I know this is an old question, but I couldn't find an easy answer anywhere else, so hopefully this will help somebody else:
import requests
from tqdm import tqdm
with open(file_name, 'rb') as f:
r = requests.post(url, data=tqdm(f.readlines()))
This solution uses requests_toolbelt and tqdm, both well-maintained and popular libraries.
from pathlib import Path
from tqdm import tqdm
import requests
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor
def upload_file(upload_url, fields, filepath):
path = Path(filepath)
total_size = path.stat().st_size
filename = path.name
with tqdm(
desc=filename,
total=total_size,
unit="B",
unit_scale=True,
unit_divisor=1024,
) as bar:
with open(filepath, "rb") as f:
fields["file"] = ("filename", f)
e = MultipartEncoder(fields=fields)
m = MultipartEncoderMonitor(
e, lambda monitor: bar.update(monitor.bytes_read - bar.n)
)
headers = {"Content-Type": m.content_type}
requests.post(upload_url, data=m, headers=headers)
Example usage
upload_url = 'https://uploadurl'
fields = {
"field1": value1,
"field2": value2
}
filepath = '97a6fce8_owners_2018_Van Zandt.csv'
upload_file(upload_url, fields, filepath)
Usually you would build a streaming data source (a generator) that reads the file in chunks and reports its progress along the way (see kennethreitz/requests#663). This does not work with requests' file API, because requests doesn't support streaming uploads (see kennethreitz/requests#295); a file to upload needs to be complete in memory before it starts getting processed.
But requests can stream content from a generator, as J.F. Sebastian has shown above; the generator just needs to produce the complete data stream, including the multipart encoding and boundaries. This is where poster comes into play.
poster was originally written to be used with Python's urllib2 and supports streaming generation of multipart requests, providing progress indication as it goes along. poster's homepage provides examples of using it together with urllib2, but you really don't want to use urllib2. Check out this example code on how to do HTTP Basic Authentication with urllib2. Horrible.
So we really want to use poster together with requests to do file uploads with tracked progress. And here is how:
# load requests-module, a streamlined http-client lib
import requests
# load posters encode-function
from poster.encode import multipart_encode
# an adapter which makes the multipart generator issued by poster accessible to requests
# based upon code from http://stackoverflow.com/a/13911048/1659732
class IterableToFileAdapter(object):
def __init__(self, iterable):
self.iterator = iter(iterable)
self.length = iterable.total
def read(self, size=-1):
return next(self.iterator, b'')
def __len__(self):
return self.length
# define a helper function simulating the interface of poster's multipart_encode() function
# but wrapping its generator with the file-like adapter
def multipart_encode_for_requests(params, boundary=None, cb=None):
datagen, headers = multipart_encode(params, boundary, cb)
return IterableToFileAdapter(datagen), headers
# this is your progress callback
def progress(param, current, total):
if not param:
return
# check out http://tcd.netinf.eu/doc/classnilib_1_1encode_1_1MultipartParam.html
# for a complete list of the properties param provides to you
print "{0} ({1}) - {2:d}/{3:d} - {4:.2f}%".format(param.name, param.filename, current, total, float(current)/float(total)*100)
# generate headers and a data generator in a requests-compatible format
# and provide our progress-callback
datagen, headers = multipart_encode_for_requests({
"input_file": open('recordings/really-large.mp4', "rb"),
"another_input_file": open('recordings/even-larger.mp4', "rb"),
"field": "value",
"another_field": "another_value",
}, cb=progress)
# use the requests lib to issue a POST request with our data attached
r = requests.post(
'https://httpbin.org/post',
auth=('user', 'password'),
data=datagen,
headers=headers
)
# show response-code and -body
print(r, r.text)
My upload server doesn't support chunked transfer encoding, so I came up with this solution. It's basically just a wrapper around Python's IOBase that allows tqdm.wrapattr to work seamlessly.
import io
import os
import requests
from typing import Iterable, Union
from tqdm import tqdm
from tqdm.utils import CallbackIOWrapper
class UploadChunksIterator(Iterable):
"""
This is an interface between python requests and tqdm.
Make tqdm to be accessed just like IOBase for requests lib.
"""
def __init__(
self, file: Union[io.BufferedReader, CallbackIOWrapper], total_size: int, chunk_size: int = 16 * 1024
):  # 16 KiB
self.file = file
self.chunk_size = chunk_size
self.total_size = total_size
def __iter__(self):
return self
def __next__(self):
data = self.file.read(self.chunk_size)
if not data:
raise StopIteration
return data
# we don't retrieve len from io.BufferedReader because CallbackIOWrapper only has a read() method.
def __len__(self):
return self.total_size
fp = "data/mydata.mp4"
s3url = "example.com"
_quiet = False
with open(fp, "rb") as f:
total_size = os.fstat(f.fileno()).st_size
if not _quiet:
f = tqdm.wrapattr(f, "read", desc=fp, miniters=1, total=total_size, ascii=True)
with f as f_iter:
res = requests.put(
url=s3url,
data=UploadChunksIterator(f_iter, total_size=total_size),
)
res.raise_for_status()
Making @jfs' answer better in terms of an informative progress bar.
import math
import os
import requests
import sys
class ProgressUpload:
def __init__(self, filename, chunk_size=1250):
self.filename = filename
self.chunk_size = chunk_size
self.file_size = os.path.getsize(filename)
self.size_read = 0
self.divisor = min(math.floor(math.log(self.file_size, 1000)) * 3, 9) # cap unit at a GB
self.unit = {0: 'B', 3: 'KB', 6: 'MB', 9: 'GB'}[self.divisor]
self.divisor = 10 ** self.divisor
def __iter__(self):
progress_str = f'0 / {self.file_size / self.divisor:.2f} {self.unit} (0 %)'
sys.stderr.write(f'\rUploading {self.filename}: {progress_str}')
with open(self.filename, 'rb') as f:
for chunk in iter(lambda: f.read(self.chunk_size), b''):
self.size_read += len(chunk)
yield chunk
sys.stderr.write('\b' * len(progress_str))
percentage = self.size_read / self.file_size * 100
completed_str = f'{self.size_read / self.divisor:.2f}'
to_complete_str = f'{self.file_size / self.divisor:.2f} {self.unit}'
progress_str = f'{completed_str} / {to_complete_str} ({percentage:.2f} %)'
sys.stderr.write(progress_str)
sys.stderr.write('\n')
def __len__(self):
return self.file_size
# sample usage
requests.post(upload_url, data=ProgressUpload('file_path'))
The key is the __len__ method. Without it, I was getting connection closed errors. That's the only reason you can't just use tqdm + iter to get a simple progress bar.
My Python code, which works great. Credit: twine.
import sys
import tqdm
import requests
import requests_toolbelt
class ProgressBar(tqdm.tqdm):
def update_to(self, n: int) -> None:
self.update(n - self.n)
with open("test.zip", "rb") as fp:
data_to_send = []
session = requests.session()
data_to_send.append(
("files", ("test.zip", fp))
)
encoder = requests_toolbelt.MultipartEncoder(data_to_send)
with ProgressBar(
total=encoder.len,
unit="B",
unit_scale=True,
unit_divisor=1024,
miniters=1,
file=sys.stdout,
) as bar:
monitor = requests_toolbelt.MultipartEncoderMonitor(
encoder, lambda monitor: bar.update_to(monitor.bytes_read)
)
r = session.post(
'http://httpbin.org/post',
data=monitor,
headers={"Content-Type": monitor.content_type},
)
print(r.text)
I'm new to most of this, so forgive me if I'm doing something really dumb. The following is a simple Twisted XML-RPC server which is supposed to return file info. It works fine except that the xmlrpc_hash function gives the same result for every file. Example output is below the code. Any help would be great!
from twisted.web import xmlrpc, server
import os
class rfi(xmlrpc.XMLRPC):
"""
rfi - Remote File Info server
"""
def xmlrpc_echo(self, x):
"""
Return all passed args as a test
"""
return x
def xmlrpc_location(self):
"""
Return current directory name
"""
return os.getcwd()
def xmlrpc_ls(self, path):
"""
Run ls on the path
"""
result = []
listing = os.listdir(path)
for l in listing:
result.append(l)
return result
def xmlrpc_stat(self, path):
"""
Stat the path
"""
result = str(os.stat(path))
return result
def xmlrpc_hash(self, path):
"""
Hash the path
"""
from hashlib import sha1
if os.path.isfile(path):
f = open(path,'rb')
h = sha1()
block_size = 2**20
f.close()
return h.hexdigest()
else:
return 'Not a file'
if __name__ == '__main__':
from twisted.internet import reactor
r = rfi()
reactor.listenTCP(7081, server.Site(r))
reactor.run()
Example output:
import xmlrpclib
s = xmlrpclib.Server('http://localhost:7081/')
s.hash('file_1.txt')
'da39a3ee5e6b4b0d3255bfef95601890afd80709'
s.hash('file_2.txt')
'da39a3ee5e6b4b0d3255bfef95601890afd80709'
This is because you're never actually updating the hash object:
from hashlib import sha1
if os.path.isfile(path):
f = open(path,'rb')
h = sha1()
h.update(f.read()) # You're missing this line
f.close()
return h.hexdigest()
else:
return 'Not a file'
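Since the question already defines block_size, here is a sketch of the same fix that feeds the hash one block at a time, so large files are not loaded into memory all at once:
from hashlib import sha1

def hash_file(path, block_size=2**20):
    h = sha1()
    with open(path, 'rb') as f:
        # Update the hash block by block instead of reading the whole file.
        for block in iter(lambda: f.read(block_size), b''):
            h.update(block)
    return h.hexdigest()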