I am uploading a large file using the Python requests package, and I can't find any way to give data back about the progress of the upload. I have seen a number of progress meters for downloading a file, but these will not work for a file upload.
The ideal solution would be some sort of callback method such as:
def progress(percent):
print percent
r = requests.post(URL, files={'f':hugeFileHandle}, callback=progress)
Thanks in advance for your help :)
requests doesn't support upload streaming, e.g., the following fails:
import os
import sys
import requests # pip install requests
class upload_in_chunks(object):
def __init__(self, filename, chunksize=1 << 13):
self.filename = filename
self.chunksize = chunksize
self.totalsize = os.path.getsize(filename)
self.readsofar = 0
def __iter__(self):
with open(self.filename, 'rb') as file:
while True:
data = file.read(self.chunksize)
if not data:
sys.stderr.write("\n")
break
self.readsofar += len(data)
percent = self.readsofar * 1e2 / self.totalsize
sys.stderr.write("\r{percent:3.0f}%".format(percent=percent))
yield data
def __len__(self):
return self.totalsize
# XXX fails
r = requests.post("http://httpbin.org/post",
data=upload_in_chunks(__file__, chunksize=10))
By the way, if you don't need to report progress, you could use a memory-mapped file to upload a large file.
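For example, a minimal sketch of the memory-mapped approach (my own illustration, not code from this answer; the file name and URL are placeholders):
import mmap
import requests

with open('large.bin', 'rb') as f:
    mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
    # mmap objects support len() and read(), so requests can stream
    # the mapping without copying the whole file into a Python string
    requests.post('http://httpbin.org/post', data=mm)
    mm.close()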
To work around it, you could create a file adapter similar to the one from urllib2 POST progress monitoring:
class IterableToFileAdapter(object):
def __init__(self, iterable):
self.iterator = iter(iterable)
self.length = len(iterable)
def read(self, size=-1): # TBD: add buffer for `len(data) > size` case
return next(self.iterator, b'')
def __len__(self):
return self.length
Example
it = upload_in_chunks(__file__, 10)
r = requests.post("http://httpbin.org/post", data=IterableToFileAdapter(it))
# pretty print
import json
json.dump(r.json(), sys.stdout, indent=4, ensure_ascii=False)
I recommend using the requests-toolbelt package, which makes monitoring upload bytes very easy, like:
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor
import requests
def my_callback(monitor):
# Your callback function
print monitor.bytes_read
e = MultipartEncoder(
fields={'field0': 'value', 'field1': 'value',
'field2': ('filename', open('file.py', 'rb'), 'text/plain')}
)
m = MultipartEncoderMonitor(e, my_callback)
r = requests.post('http://httpbin.org/post', data=m,
headers={'Content-Type': m.content_type})
And you may want to read this to show a progress bar.
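For instance, here is a sketch of driving a progress bar from the monitor callback (my own example, assuming tqdm is installed; the field and file names are placeholders):
import requests
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor
from tqdm import tqdm

encoder = MultipartEncoder(fields={
    'field2': ('filename', open('file.py', 'rb'), 'text/plain')})
with tqdm(total=encoder.len, unit='B', unit_scale=True) as bar:
    # advance the bar by however many bytes were read since the last callback
    monitor = MultipartEncoderMonitor(
        encoder, lambda m: bar.update(m.bytes_read - bar.n))
    requests.post('http://httpbin.org/post', data=monitor,
                  headers={'Content-Type': monitor.content_type})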
I got it working with the code from here: Simple file upload progressbar in PyQt.
I changed it a bit to use BytesIO instead of StringIO.
from io import BytesIO

import requests

class CancelledError(Exception):
def __init__(self, msg):
self.msg = msg
Exception.__init__(self, msg)
def __str__(self):
return self.msg
__repr__ = __str__
class BufferReader(BytesIO):
def __init__(self, buf=b'',
callback=None,
cb_args=(),
cb_kwargs={}):
self._callback = callback
self._cb_args = cb_args
self._cb_kwargs = cb_kwargs
self._progress = 0
self._len = len(buf)
BytesIO.__init__(self, buf)
def __len__(self):
return self._len
def read(self, n=-1):
chunk = BytesIO.read(self, n)
self._progress += int(len(chunk))
self._cb_kwargs.update({
'size' : self._len,
'progress': self._progress
})
if self._callback:
try:
self._callback(*self._cb_args, **self._cb_kwargs)
except: # catches exception from the callback
raise CancelledError('The upload was cancelled.')
return chunk
def progress(size=None, progress=None):
print("{0} / {1}".format(size, progress))
files = {"upfile": ("file.bin", open("file.bin", 'rb').read())}
(data, ctype) = requests.packages.urllib3.filepost.encode_multipart_formdata(files)
headers = {
"Content-Type": ctype
}
body = BufferReader(data, progress)
requests.post(url, data=body, headers=headers)
The trick is to generate the data and headers from the files dict manually, using encode_multipart_formdata() from urllib3.
I know this is an old question, but I couldn't find an easy answer anywhere else, so hopefully this will help somebody else:
import requests
from tqdm import tqdm
with open(file_name, 'rb') as f:
r = requests.post(url, data=tqdm(f.readlines()))
This solution uses requests_toolbelt and tqdm, both well-maintained and popular libraries.
from pathlib import Path
from tqdm import tqdm
import requests
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor
def upload_file(upload_url, fields, filepath):
path = Path(filepath)
total_size = path.stat().st_size
filename = path.name
with tqdm(
desc=filename,
total=total_size,
unit="B",
unit_scale=True,
unit_divisor=1024,
) as bar:
with open(filepath, "rb") as f:
fields["file"] = ("filename", f)
e = MultipartEncoder(fields=fields)
m = MultipartEncoderMonitor(
e, lambda monitor: bar.update(monitor.bytes_read - bar.n)
)
headers = {"Content-Type": m.content_type}
requests.post(upload_url, data=m, headers=headers)
Example usage
upload_url = 'https://uploadurl'
fields = {
"field1": value1,
"field2": value2
}
filepath = '97a6fce8_owners_2018_Van Zandt.csv'
upload_file(upload_url, fields, filepath)
Usually you would build a streaming data source (a generator) that reads the file in chunks and reports its progress along the way (see kennethreitz/requests#663). This does not work with the requests file API, because requests doesn't support streaming uploads (see kennethreitz/requests#295) – a file to upload needs to be completely in memory before it starts getting processed.
But requests can stream content from a generator, as J.F. Sebastian has shown before; this generator, however, needs to produce the complete data stream, including the multipart encoding and boundaries. This is where poster comes into play.
poster was originally written to be used with Python's urllib2 and supports streaming generation of multipart requests, providing progress indication as it goes along. Poster's homepage provides examples of using it together with urllib2, but you really don't want to use urllib2. Check out this example code on how to do HTTP Basic Authentication with urllib2. Horrrrrrrrible.
So we really want to use poster together with requests to do file uploads with tracked progress. And here is how:
# load requests-module, a streamlined http-client lib
import requests
# load posters encode-function
from poster.encode import multipart_encode
# an adapter which makes the multipart generator issued by poster accessible to requests
# based upon code from http://stackoverflow.com/a/13911048/1659732
class IterableToFileAdapter(object):
def __init__(self, iterable):
self.iterator = iter(iterable)
self.length = iterable.total
def read(self, size=-1):
return next(self.iterator, b'')
def __len__(self):
return self.length
# define a helper function simulating the interface of posters multipart_encode()-function
# but wrapping its generator with the file-like adapter
def multipart_encode_for_requests(params, boundary=None, cb=None):
datagen, headers = multipart_encode(params, boundary, cb)
return IterableToFileAdapter(datagen), headers
# this is your progress callback
def progress(param, current, total):
if not param:
return
# check out http://tcd.netinf.eu/doc/classnilib_1_1encode_1_1MultipartParam.html
# for a complete list of the properties param provides to you
print "{0} ({1}) - {2:d}/{3:d} - {4:.2f}%".format(param.name, param.filename, current, total, float(current)/float(total)*100)
# generate headers and data generator in a requests-compatible format
# and provide our progress-callback
datagen, headers = multipart_encode_for_requests({
"input_file": open('recordings/really-large.mp4', "rb"),
"another_input_file": open('recordings/even-larger.mp4', "rb"),
"field": "value",
"another_field": "another_value",
}, cb=progress)
# use the requests lib to issue a POST request with our data attached
r = requests.post(
'https://httpbin.org/post',
auth=('user', 'password'),
data=datagen,
headers=headers
)
# show response-code and -body
print r, r.text
My upload server doesn't support chunked transfer encoding, so I came up with this solution. It's basically just a wrapper around Python's IOBase that allows tqdm.wrapattr to work seamlessly.
import io
import os
import requests
from collections.abc import Iterable
from typing import Union
from tqdm import tqdm
from tqdm.utils import CallbackIOWrapper
class UploadChunksIterator(Iterable):
"""
This is an interface between python requests and tqdm.
Make tqdm to be accessed just like IOBase for requests lib.
"""
def __init__(
self, file: Union[io.BufferedReader, CallbackIOWrapper], total_size: int, chunk_size: int = 16 * 1024
): # 16 KiB
self.file = file
self.chunk_size = chunk_size
self.total_size = total_size
def __iter__(self):
return self
def __next__(self):
data = self.file.read(self.chunk_size)
if not data:
raise StopIteration
return data
# we don't retrieve len from io.BufferedReader because CallbackIOWrapper only has a read() method.
def __len__(self):
return self.total_size
fp = "data/mydata.mp4"
s3url = "example.com"
_quiet = False
with open(fp, "rb") as f:
total_size = os.fstat(f.fileno()).st_size
if not _quiet:
f = tqdm.wrapattr(f, "read", desc=fp, miniters=1, total=total_size, ascii=True)
with f as f_iter:
res = requests.put(
url=s3url,
data=UploadChunksIterator(f_iter, total_size=total_size),
)
res.raise_for_status()
Improving @jfs' answer with a more informative progress bar.
import math
import os
import requests
import sys
class ProgressUpload:
def __init__(self, filename, chunk_size=1250):
self.filename = filename
self.chunk_size = chunk_size
self.file_size = os.path.getsize(filename)
self.size_read = 0
self.divisor = min(math.floor(math.log(self.file_size, 1000)) * 3, 9) # cap unit at a GB
self.unit = {0: 'B', 3: 'KB', 6: 'MB', 9: 'GB'}[self.divisor]
self.divisor = 10 ** self.divisor
def __iter__(self):
progress_str = f'0 / {self.file_size / self.divisor:.2f} {self.unit} (0 %)'
sys.stderr.write(f'\rUploading {self.filename}: {progress_str}')
with open(self.filename, 'rb') as f:
for chunk in iter(lambda: f.read(self.chunk_size), b''):
self.size_read += len(chunk)
yield chunk
sys.stderr.write('\b' * len(progress_str))
percentage = self.size_read / self.file_size * 100
completed_str = f'{self.size_read / self.divisor:.2f}'
to_complete_str = f'{self.file_size / self.divisor:.2f} {self.unit}'
progress_str = f'{completed_str} / {to_complete_str} ({percentage:.2f} %)'
sys.stderr.write(progress_str)
sys.stderr.write('\n')
def __len__(self):
return self.file_size
# sample usage
requests.post(upload_url, data=ProgressUpload('file_path'))
The key is the __len__ method. Without it, I was getting connection closed errors. That's the only reason you can't just use tqdm + iter to get a simple progress bar.
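If you do want tqdm, here is a sketch of one way to pair it with a __len__-providing wrapper (my own untested variation on the class above; the URL and file path are placeholders):
import os
import requests
from tqdm import tqdm

class TqdmUpload:
    def __init__(self, filename, chunk_size=8192):
        self.filename = filename
        self.chunk_size = chunk_size
        self.file_size = os.path.getsize(filename)

    def __len__(self):
        return self.file_size  # lets requests send a Content-Length header

    def __iter__(self):
        with open(self.filename, 'rb') as f, tqdm(
                total=self.file_size, unit='B', unit_scale=True) as bar:
            for chunk in iter(lambda: f.read(self.chunk_size), b''):
                bar.update(len(chunk))
                yield chunk

# requests.post(upload_url, data=TqdmUpload('file_path'))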
Here is my Python code, which works great. Credit: twine.
import sys
import tqdm
import requests
import requests_toolbelt
class ProgressBar(tqdm.tqdm):
def update_to(self, n: int) -> None:
self.update(n - self.n)
with open("test.zip", "rb") as fp:
data_to_send = []
session = requests.session()
data_to_send.append(
("files", ("test.zip", fp))
)
encoder = requests_toolbelt.MultipartEncoder(data_to_send)
with ProgressBar(
total=encoder.len,
unit="B",
unit_scale=True,
unit_divisor=1024,
miniters=1,
file=sys.stdout,
) as bar:
monitor = requests_toolbelt.MultipartEncoderMonitor(
encoder, lambda monitor: bar.update_to(monitor.bytes_read)
)
r = session.post(
'http://httpbin.org/post',
data=monitor,
headers={"Content-Type": monitor.content_type},
)
print(r.text)
Related
I have a URL: https://findicons.com/files/icons/2787/beautiful_flat_icons/128/running.png
I want to get the image and write it to a file, so I wrote the following code:
import urllib.request
web = urllib.request.urlopen(iturl)
itdata = web.read()
f = open(str(cou) + '.png', "wb")
cou = cou + 1
f.write(itdata)
f.close()
My question is: if I have many URLs to download, how can I implement this with Tornado coroutines?
This isn't the entire code, just something I came up with in 5 minutes, but it should give you enough information to satisfy your requirements. If you have any questions or further explanation is required, please let me know.
from tornado import gen, httpclient, ioloop
@gen.coroutine
def main():
client = httpclient.AsyncHTTPClient()
response = yield client.fetch(
'https://findicons.com/files/icons/2787/beautiful_flat_icons/128/running.png',
download_image,
follow_redirects = True)
@gen.coroutine
def download_image(response):
buffer_size = 1024
filename = response.request.url.split('/')[-1] # this is not always reliable
with open(filename, 'ab') as img:
while True:
chunk = response.buffer.read(buffer_size)
if not chunk:
break
img.write(chunk)
yield
ioloop.IOLoop.current().run_sync(main)
References
Tornado Issue #1616
Examples of RequestHandler
I want to POST a large file from a Python client to CherryPy. I'm using the requests library.
This is my client code:
def upload(fileName=None):
url = 'http://localhost:8080/upload'
files = {'myFile': ( fileName, open(fileName, 'rb') )}
r = requests.post(url, files=files)
#with open(fileName,'rb') as payload:
#headers = {'content-type': 'multipart/form-data'}
#r = requests.post('http://127.0.0.1:8080', data=payload,verify=False,headers=headers)
if __name__ == '__main__':
upload(sys.argv[1])
The problem is that this puts the whole file in the RAM memory. Is there any way to POST the file in pieces?
class FileDemo(object):
@cherrypy.expose
def upload(self, myFile):
print myFile.filename
#size = 0
#decoder = MultipartDecoder(myFile, 'image/jpeg')
#for part in decoder.parts:
#print(part.header['content-type'])
#while True:
#advances to the content that hasn't been read
#myFile.file.seek(size, 0)
#reads 100mb at a time so it doesn't fill up the RAM
#data = myFile.file.read(10240000)
#newFile = open("/home/ivo/Desktop/"+str(myFile.filename), 'a+')
#newFile.write(data)
#newFile.close
#size += len(data)
#if len(data) < 10240000:
#break
if __name__ == '__main__':
cherrypy.quickstart(FileDemo())
This is the code in the server side. It has a lot of comments because I've been trying a lot of stuff. Right now I'm just printing the file name and the client still transfers the whole file to RAM.
I don't know what else to try. Thank you in advance for your help.
If it's a CherryPy-specific upload, you can skip the multipart/form-data encoding obstacles and just send a streaming POST body with the file contents.
client
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import io
import os
class FileLenIO(io.FileIO):
def __init__(self, name, mode = 'r', closefd = True):
io.FileIO.__init__(self, name, mode, closefd)
self.__size = os.stat(name).st_size
def __len__(self):
return self.__size
f = FileLenIO('/home/user/Videos/video.mp4', 'rb')
request = urllib2.Request('http://127.0.0.1:8080/upload', f)
request.add_header('Content-Type', 'application/octet-stream')
# you can add custom header with filename if you need it
response = urllib2.urlopen(request)
print response.read()
server
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import tempfile
import shutil
import cherrypy
config = {
'global' : {
'server.socket_host' : '127.0.0.1',
'server.socket_port' : 8080,
'server.thread_pool' : 8,
# remove any limit on the request body size; cherrypy's default is 100MB
'server.max_request_body_size' : 0,
# increase server socket timeout to 60s; cherrypy's default is 10s
'server.socket_timeout' : 60
}
}
class App:
@cherrypy.config(**{'response.timeout': 3600}) # default is 300s
@cherrypy.expose
def upload(self):
'''Handle non-multipart upload'''
destination = os.path.join('/home/user/test-upload')
with open(destination, 'wb') as f:
shutil.copyfileobj(cherrypy.request.body, f)
return 'Okay'
if __name__ == '__main__':
cherrypy.quickstart(App(), '/', config)
Tested on 1.3GiB video file. Server-side memory consumption is under 10MiB, client's under 5MiB.
This is how I solved the problem:
client
import sys
import urllib2
from poster.encode import multipart_encode
from poster.streaminghttp import register_openers
def upload(fileName=None):
register_openers()
url = 'http://localhost:8080/upload'
data, headers = multipart_encode({"myFile": open(fileName, "rb")})
request = urllib2.Request(url, data, headers)
request.unverifiable = True
response = urllib2.urlopen(request)
the_page = response.read()
if __name__ == '__main__':
upload(sys.argv[1])
server
@cherrypy.expose
def upload(self, myFile):
cherrypy.response.timeout = 3600
newFile = open("/home/ivo/Desktop/"+str(myFile.filename), 'a+')
newFile.write(myFile.file.read())
newFile.close()
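An optional tweak to the upload() method (my suggestion, not part of the original solution): copy the uploaded part in chunks instead of read()-ing it all at once, so the server never holds the whole file in memory:
import shutil

@cherrypy.expose
def upload(self, myFile):
    cherrypy.response.timeout = 3600
    with open("/home/ivo/Desktop/" + str(myFile.filename), 'wb') as newFile:
        # copy in fixed-size chunks so the whole upload never sits in RAM
        shutil.copyfileobj(myFile.file, newFile, length=10 * 1024 * 1024)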
I am trying to upload a file using multipart_encode to handle the MIME encoding. However, I got the following error: AttributeError: multipart_yielder instance has no attribute '__len__'. Below is my approach; I would really appreciate it if anyone could give me some suggestions.
url = "https://pi-user-files.s3-external-1.amazonaws.com/"
post_data = {}
#data is a dict
post_data['AWSAccessKeyId']=(data['ticket']['AWSAccessKeyId'])
post_data['success_action_redirect']=(data['ticket']['success_action_redirect'])
post_data['acl']=(data['ticket']['acl'])
post_data['key']=(data['ticket']['key'])
post_data['signature']=(data['ticket']['signature'])
post_data['policy']=(data['ticket']['policy'])
post_data['Content-Type']=(data['ticket']['Content-Type'])
#I would like to upload a text file "new 2"
post_data['file']=open("new 2.txt", "rb")
datagen, headers = multipart_encode(post_data)
request2 = urllib2.Request(url, datagen, headers)
result = urllib2.urlopen(request2)
If you want to send a file, you should wrap the other parameters in MultipartParam objects. Example code for creating a send-file request:
from poster.encode import multipart_encode, MultipartParam
import urllib2
def postFileRequest(url, paramName, fileObj, additionalHeaders={}, additionalParams={}):
items = []
#wrap post parameters
for name, value in additionalParams.items():
items.append(MultipartParam(name, value))
#add file
items.append(MultipartParam.from_file(paramName, fileObj))
datagen, headers = multipart_encode(items)
#add headers
for item, value in additionalHeaders.iteritems():
headers[item] = value
return urllib2.Request(url, datagen, headers)
Also, I think you should execute register_openers() once at the beginning. You can find some details in the docs.
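A hedged usage sketch showing how this fits together (the URL, extra fields, and file name are placeholders):
import urllib2
from poster.streaminghttp import register_openers

register_openers()  # install poster's streaming handlers once, at startup

fields = {'acl': 'private'}  # placeholder extra form fields
f = open("new 2.txt", "rb")
request = postFileRequest("https://example.com/upload", "file", f,
                          additionalParams=fields)
print urllib2.urlopen(request).read()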
The problem is that in httplib.py the generator is not detected as such; it is instead treated like a string that holds the full data to be sent (and therefore httplib tries to find its length):
if hasattr(data,'read') and not isinstance(data, array): # generator
if self.debuglevel > 0: print "sendIng a read()able"
....
A solution is to make the generator act like a read()able:
class GeneratorToReadable():
def __init__(self, datagen):
self.generator = datagen
self._end = False
self.data = ''
def read(self, n_bytes):
while not self._end and len(self.data) < n_bytes:
try:
next_chunk = self.generator.next()
if next_chunk:
self.data += next_chunk
else:
self._end = True
except StopIteration:
self._end = True
result = self.data[0:n_bytes]
self.data = self.data[n_bytes:]
return result
and use like so:
datagen, headers = multipart_encode(post_data)
readable = GeneratorToReadable(datagen)
req = urllib2.Request(url, readable, headers)
result = urllib2.urlopen(req)
I'm uploading a fairly large file with urllib2 to a server-side script via POST. I want to display a progress indicator that shows the current upload progress. Is there a hook or a callback provided by urllib2 that allows me to monitor upload progress? I know you can do it for downloads using successive calls to the connection's read() method, but I don't see a write() method; you just add data to the request.
It is possible but you need to do a few things:
Fake out the urllib2 subsystem into passing a file handle down to httplib by attaching a __len__ attribute which makes len(data) return the correct size, used to populate the Content-Length header.
Override the read() method on your file handle: as httplib calls read() your callback will be invoked, letting you calculate the percentage and update your progress bar.
This could work with any file-like object, but I've wrapped file to show how it could work with a really large file streamed from disk:
import os, urllib2
from cStringIO import StringIO
class Progress(object):
def __init__(self):
self._seen = 0.0
def update(self, total, size, name):
self._seen += size
pct = (self._seen / total) * 100.0
print '%s progress: %.2f' % (name, pct)
class file_with_callback(file):
def __init__(self, path, mode, callback, *args):
file.__init__(self, path, mode)
self.seek(0, os.SEEK_END)
self._total = self.tell()
self.seek(0)
self._callback = callback
self._args = args
def __len__(self):
return self._total
def read(self, size):
data = file.read(self, size)
self._callback(self._total, len(data), *self._args)
return data
path = 'large_file.txt'
progress = Progress()
stream = file_with_callback(path, 'rb', progress.update, path)
req = urllib2.Request(url, stream)
res = urllib2.urlopen(req)
Output:
large_file.txt progress: 0.68
large_file.txt progress: 1.36
large_file.txt progress: 2.04
large_file.txt progress: 2.72
large_file.txt progress: 3.40
...
large_file.txt progress: 99.20
large_file.txt progress: 99.87
large_file.txt progress: 100.00
requests 2.0.0 has streaming uploads. This means you can use a generator to yield tiny chunks and print the progress between chunks.
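A minimal sketch of what that looks like (my own example; since the generator has no length, requests will send the body with chunked transfer encoding, and the URL and path are placeholders):
import os
import requests

def read_in_chunks(path, chunk_size=8192):
    total = os.path.getsize(path)
    sent = 0
    with open(path, 'rb') as f:
        # yield the file in small chunks and report progress between them
        for chunk in iter(lambda: f.read(chunk_size), b''):
            sent += len(chunk)
            print('%.1f%%' % (sent * 100.0 / total))
            yield chunk

requests.post('http://httpbin.org/post', data=read_in_chunks('large_file.bin'))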
I don't think this is possible, but pycurl does have upload/download progress callbacks you can use.
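A sketch of what the pycurl route might look like (my own example, assuming a libcurl recent enough for XFERINFOFUNCTION; the URL and file path are placeholders):
import os
import pycurl

def progress(download_total, downloaded, upload_total, uploaded):
    # called periodically by libcurl with byte counts for both directions
    if upload_total:
        print('%.1f%%' % (uploaded * 100.0 / upload_total))

path = 'large_file.bin'
with open(path, 'rb') as f:
    c = pycurl.Curl()
    c.setopt(pycurl.URL, 'http://httpbin.org/put')
    c.setopt(pycurl.UPLOAD, 1)
    c.setopt(pycurl.READFUNCTION, f.read)
    c.setopt(pycurl.INFILESIZE, os.path.getsize(path))
    c.setopt(pycurl.NOPROGRESS, False)
    c.setopt(pycurl.XFERINFOFUNCTION, progress)
    c.perform()
    c.close()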
poster supports this
import json
import os
import sys
import urllib2
from poster.encode import multipart_encode
from poster.streaminghttp import register_openers
def _upload_progress(param, current, total):
sys.stdout.write(
"\r{} - {:.0f}% "
.format(param.name,
(float(current) / float(total)) * 100.0))
sys.stdout.flush()
def upload(request_resource, large_file_path):
register_openers()
with open(large_file_path, 'rb') as large_file:
request_data, request_headers = multipart_encode(
[('file', large_file)],
cb=_upload_progress)
request_headers.update({
'X-HockeyAppToken': 'we use this for hockeyapp upload'
})
upload_request = urllib2.Request(request_resource,
request_data,
request_headers)
upload_connection = urllib2.urlopen(upload_request)
upload_response = json.load(upload_connection)
print "Done"
I want to stream a big file via werkzeug.
Currently my WSGI application looks like this:
from werkzeug.wrappers import Request, Response
from werkzeug.wsgi import ClosingIterator, wrap_file
import os
class Streamer(object):
def __init__(self):
pass
def __call__(self, environ, start_response):
request = Request(environ)
filename = os.getcwd() + "/bigfile.xml"
try:
response = wrap_file(environ, open(filename) )
return response
except HTTPException, e:
response = e
return ClosingIterator(response(environ, start_response))
I'm not sure what I should do with the object returned by the wrap_file function.
I haven't tried it myself, but I think the following will work.
g = file(path_to_bigfile) # or any generator
return Response(g, direct_passthrough=True)
Just in case one would additionally like to:
1. preserve the file name
2. issue the download without a page redirect
# file_name assumed to be known
# file_path assumed to be known
file_size = os.path.getsize(file_path)
fh = file(file_path, 'rb')
return Response(fh,
mimetype='application/octet-stream',
headers=[
('Content-Length', str(file_size)),
('Content-Disposition', "attachment; filename=\"%s\"" % file_name),
],
direct_passthrough=True)