I've written a custom Django file upload handler for my current project. It's a proof of concept that computes a hash of an uploaded file without ever storing that file on disk; once I get it working, I can move on to the real purpose of my work.
Essentially, here's what I have so far, which is working fine with one major exception:
from django.core.files.uploadhandler import *
from hashlib import sha256

from myproject.upload.files import MyProjectUploadedFile


class MyProjectUploadHandler(FileUploadHandler):
    def __init__(self, *args, **kwargs):
        super(MyProjectUploadHandler, self).__init__(*args, **kwargs)

    def handle_raw_input(self, input_data, META, content_length, boundary,
                         encoding=None):
        self.activated = True

    def new_file(self, *args, **kwargs):
        super(MyProjectUploadHandler, self).new_file(*args, **kwargs)
        self.digester = sha256()
        raise StopFutureHandlers()

    def receive_data_chunk(self, raw_data, start):
        self.digester.update(raw_data)

    def file_complete(self, file_size):
        return MyProjectUploadedFile(self.digester.hexdigest())
The custom upload handler works great: the hash is accurate, nothing from the uploaded file is ever written to disk, and only 64 KB of memory is used at any one time.
The only problem I'm having is that I need to access another field from the POST request before processing the file, a text salt input by the user. My form looks like this:
<form id="myForm" method="POST" enctype="multipart/form-data" action="/upload/">
  <fieldset>
    <input name="salt" type="text" placeholder="Salt">
    <input name="uploadfile" type="file">
    <input type="submit">
  </fieldset>
</form>
The "salt" POST variable is only made available to me after the request has been processed and the file has been uploaded, which doesn't work for my use case. I can't seem to find a way to access this variable in any way, shape, or form in my upload handler.
Is there a way for me to access each multipart variable as it comes across, instead of just accessing the files which are uploaded?
My solution didn't come easy, but here it is:
# imports needed by the code below; they mirror what django.http.multipartparser itself uses
import base64

from django.core.files.uploadhandler import (
    FileUploadHandler, SkipFile, StopFutureHandlers, StopUpload,
)
from django.http.multipartparser import (
    FIELD, FILE, ChunkIter, LazyStream, MultiPartParserError, Parser, exhaust,
)
from django.utils.datastructures import MultiValueDict
from django.utils.encoding import force_text
from django.utils.text import unescape_entities


class IntelligentUploadHandler(FileUploadHandler):
    """
    An upload handler which overrides the default multipart parser to allow
    simultaneous parsing of fields and files... intelligently. Subclass this
    for real and true awesomeness.
    """
    def __init__(self, *args, **kwargs):
        super(IntelligentUploadHandler, self).__init__(*args, **kwargs)

    def field_parsed(self, field_name, field_value):
        """
        A callback method triggered when a non-file field has been parsed
        successfully by the parser. Use this to listen for new fields being
        parsed.
        """
        pass

    def handle_raw_input(self, input_data, META, content_length, boundary,
                         encoding=None):
        """
        Parse the raw input from the HTTP request and split items into fields
        and files, executing callback methods as necessary.

        Shamelessly adapted and borrowed from
        django.http.multipartparser.MultiPartParser.
        """
        # following suit from the source class, this is imported here to avoid
        # a potential circular import
        from django.http import QueryDict

        # create return values
        self.POST = QueryDict('', mutable=True)
        self.FILES = MultiValueDict()

        # initialize the parser and stream
        stream = LazyStream(ChunkIter(input_data, self.chunk_size))

        # whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counter = 0

        try:
            for item_type, meta_data, field_stream in Parser(stream, boundary):
                if old_field_name:
                    # we run this test at the beginning of the next loop since
                    # we cannot be sure a file is complete until we hit the next
                    # boundary/part of the multipart content.
                    file_obj = self.file_complete(counter)

                    if file_obj:
                        # if we return a file object, add it to the files dict
                        self.FILES.appendlist(
                            force_text(old_field_name, encoding, errors='replace'),
                            file_obj)

                    # wipe it out to prevent havoc
                    old_field_name = None

                try:
                    disposition = meta_data['content-disposition'][1]
                    field_name = disposition['name'].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get('content-transfer-encoding')
                if transfer_encoding is not None:
                    transfer_encoding = transfer_encoding[0].strip()

                field_name = force_text(field_name, encoding, errors='replace')

                if item_type == FIELD:
                    # this is a POST field
                    if transfer_encoding == "base64":
                        raw_data = field_stream.read()
                        try:
                            data = str(raw_data).decode('base64')
                        except:
                            data = raw_data
                    else:
                        data = field_stream.read()

                    self.POST.appendlist(field_name,
                                         force_text(data, encoding, errors='replace'))

                    # trigger listener
                    self.field_parsed(field_name, self.POST.get(field_name))
                elif item_type == FILE:
                    # this is a file
                    file_name = disposition.get('filename')

                    if not file_name:
                        continue

                    # transform the file name
                    file_name = force_text(file_name, encoding, errors='replace')
                    file_name = self.IE_sanitize(unescape_entities(file_name))

                    content_type = meta_data.get('content-type', ('',))[0].strip()

                    try:
                        charset = meta_data.get('content-type', (0, {}))[1].get('charset', None)
                    except:
                        charset = None

                    try:
                        file_content_length = int(meta_data.get('content-length')[0])
                    except (IndexError, TypeError, ValueError):
                        file_content_length = None

                    counter = 0

                    # now, do the important file stuff
                    try:
                        # alert on the new file
                        self.new_file(field_name, file_name, content_type,
                                      file_content_length, charset)

                        # chubber-chunk it
                        for chunk in field_stream:
                            if transfer_encoding == "base64":
                                # base 64 decode it if need be
                                over_bytes = len(chunk) % 4

                                if over_bytes:
                                    over_chunk = field_stream.read(4 - over_bytes)
                                    chunk += over_chunk

                                try:
                                    chunk = base64.b64decode(chunk)
                                except Exception as e:
                                    # since this is only a chunk, any error is an unfixable error
                                    raise MultiPartParserError("Could not decode base64 data: %r" % e)

                            chunk_length = len(chunk)
                            self.receive_data_chunk(chunk, counter)
                            counter += chunk_length

                        # ... and we're done
                    except SkipFile:
                        # just eat the rest
                        exhaust(field_stream)
                    else:
                        # handle file upload completions on next iteration
                        old_field_name = field_name
        except StopUpload as e:
            # if we get a request to stop the upload, exhaust it if no connection reset
            if not e.connection_reset:
                exhaust(input_data)
        else:
            # make sure that the request data is all fed
            exhaust(input_data)

        # signal the upload has been completed
        self.upload_complete()

        return self.POST, self.FILES

    def IE_sanitize(self, filename):
        """Cleanup filename from Internet Explorer full paths."""
        return filename and filename[filename.rfind("\\") + 1:].strip()
Essentially, by subclassing this class, you get a more... intelligent upload handler. Fields are announced to subclasses through the field_parsed method, which is exactly what I needed for my purposes.
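For example, here is a minimal, untested sketch of how the handler from the question could be rebuilt on top of this class (sha256 and MyProjectUploadedFile are the names from the question; it relies on the salt input appearing before the file input in the form, so the field is parsed before the file arrives):

from hashlib import sha256

from myproject.upload.files import MyProjectUploadedFile


class SaltedHashUploadHandler(IntelligentUploadHandler):
    """Hypothetical subclass: grab the 'salt' field, then hash the file."""

    def field_parsed(self, field_name, field_value):
        # the salt input comes before the file input in the form, so it is
        # parsed first and is available by the time new_file() is called
        if field_name == "salt":
            self.salt = field_value

    def new_file(self, *args, **kwargs):
        super(SaltedHashUploadHandler, self).new_file(*args, **kwargs)
        self.digester = sha256()
        # seed the hash with the user-supplied salt, if any
        self.digester.update(getattr(self, "salt", "").encode("utf-8"))

    def receive_data_chunk(self, raw_data, start):
        self.digester.update(raw_data)

    def file_complete(self, file_size):
        return MyProjectUploadedFile(self.digester.hexdigest())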
I've reported this as a feature request to the Django team; hopefully this functionality becomes part of the regular toolbox in Django, rather than having to adapt the source code as done above.
Based on the code for FileUploadHandler, found here at line 62:
https://github.com/django/django/blob/master/django/core/files/uploadhandler.py
It looks like the request object is passed into the handler and stored as self.request. In that case, you should be able to access the salt at any point in your upload handler by doing
salt = self.request.POST.get('salt')
Unless I'm misunderstanding your question.
Related
I have a Django app that asks the user to upload an image. I get the image from the HTML form in Django and pass it to my Python code as a parameter. I do a lot of things with this image (for example using the PIL libraries); the class of the parameter is:
'django.core.files.uploadedfile.InMemoryUploadedFile'
But the problem comes when I try to use a function that expects the type returned by Python's built-in open(), that is:
'_io.BufferedReader'
Concretely, the function I'm using is:
block_blob_service.create_blob_from_stream() (a Microsoft Azure function)
So my question is: can I convert from the Django uploaded-file type to the standard Python file type, ideally without saving the file and opening it again?
And, if by any chance somebody has worked with this library: I've also tried block_blob_service.create_blob_from_bytes() and it's not working (to convert from Django to bytes I've just done img = django_input.read(), which gives me a bytes object). block_blob_service.create_blob_from_path() is not an option, because I can't get the path of the file and I don't want to save the image just to get a new path.
According to the source code for django.core.files.uploadedfile, the class InMemoryUploadedFile inherits from UploadedFile, which in turn inherits from django.core.files.base.File, as the code and figure below show.
from django.core.files.base import File


class UploadedFile(File):
    """
    An abstract uploaded file (``TemporaryUploadedFile`` and
    ``InMemoryUploadedFile`` are the built-in concrete subclasses).

    An ``UploadedFile`` object behaves somewhat like a file object and
    represents some file data that the user submitted with a form.
    """

    def __init__(self, file=None, name=None, content_type=None, size=None, charset=None, content_type_extra=None):
        super().__init__(file, name)
        self.size = size
        self.content_type = content_type
        self.charset = charset
        self.content_type_extra = content_type_extra

    def __repr__(self):
        return "<%s: %s (%s)>" % (self.__class__.__name__, self.name, self.content_type)

    def _get_name(self):
        return self._name

    def _set_name(self, name):
        # Sanitize the file name so that it can't be dangerous.
        if name is not None:
            # Just use the basename of the file -- anything else is dangerous.
            name = os.path.basename(name)

            # File names longer than 255 characters can cause problems on older OSes.
            if len(name) > 255:
                name, ext = os.path.splitext(name)
                ext = ext[:255]
                name = name[:255 - len(ext)] + ext

        self._name = name

    name = property(_get_name, _set_name)


class InMemoryUploadedFile(UploadedFile):
    """
    A file uploaded into memory (i.e. stream-to-memory).
    """
    def __init__(self, file, field_name, name, content_type, size, charset, content_type_extra=None):
        super().__init__(file, name, content_type, size, charset, content_type_extra)
        self.field_name = field_name

    def open(self, mode=None):
        self.file.seek(0)
        return self

    def chunks(self, chunk_size=None):
        self.file.seek(0)
        yield self.read()

    def multiple_chunks(self, chunk_size=None):
        # Since it's in memory, we'll never have multiple chunks.
        return False
(Figure: the uploaded-file class hierarchy, from https://docs.djangoproject.com/en/2.2/ref/files/uploads/.)
So you can get the data bytes from the InMemoryUploadedFile object and pass them to block_blob_service.create_blob_from_bytes.
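For example, a minimal sketch of that approach, assuming a view that receives the upload and an existing BlockBlobService instance (the account credentials, container name and form field name are placeholders):

from azure.storage.blob import BlockBlobService
from django.http import HttpResponse

# placeholders -- use your own account credentials and container name
block_blob_service = BlockBlobService(account_name='myaccount', account_key='mykey')

def upload_view(request):
    uploaded = request.FILES['image']   # an InMemoryUploadedFile for small uploads
    data = uploaded.read()              # the raw bytes of the whole upload
    block_blob_service.create_blob_from_bytes('mycontainer', uploaded.name, data)
    return HttpResponse('uploaded')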
That said, as far as I know, it's not the best approach. The simple solution for creating a blob from an uploaded file in Django is to use django-storages with the Azure Storage backend; see its documentation about Azure Storage to learn how to use it. There is also an existing, similar SO thread, Django Azure upload file to blob storage, which you can refer to.
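For reference, a rough sketch of the django-storages configuration (the account values and container name are placeholders; check the django-storages Azure documentation for the full option list):

# settings.py
DEFAULT_FILE_STORAGE = 'storages.backends.azure_storage.AzureStorage'

AZURE_ACCOUNT_NAME = 'myaccount'    # placeholder
AZURE_ACCOUNT_KEY = 'mykey'         # placeholder
AZURE_CONTAINER = 'mycontainer'     # placeholder

With that in place, saving a FileField or ImageField should write straight to blob storage, with no explicit Azure calls in the view.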
This is my models.py file:
class Post(models.Model):
    """docstring for Post"""
    poster = models.ForeignKey(User, null=False, blank=True, default=User.objects.get(username="admin"))
    post_image = models.ImageField(upload_to='posts', null=True, blank=True)

    def save(self, url='', *args, **kwargs):
        if self.post_image != '' and url != '':  # Don't do anything if we don't get passed anything!
            image = download_image(url)  # See function definition below
            try:
                filename = urlparse.urlparse(url).path.split('/')[-1]
                self.post_image = filename
                tempfile = image
                tempfile_io = io.StringIO()  # Will make a file-like object in memory that you can then save
                tempfile.save(tempfile_io, format=image.format)
                self.post_image.save(filename, ContentFile(tempfile_io.getvalue()), save=False)  # Set save=False otherwise you will have a looping save method
            except Exception as e:
                print("Error trying to save model: saving image failed: " + str(e))
                pass
        super(Post, self).save(*args, **kwargs)

    def download_image(url):
        """Downloads an image and makes sure it's verified.

        Returns a PIL Image if the image is valid, otherwise raises an exception.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0'}  # More likely to get a response if server thinks you're a browser
        r = urllib.Request(url, headers=headers)
        request = urllib.urlopen(r, timeout=10)
        image_data = io.StringIO(request.read())  # StringIO imitates a file, needed for verification step
        img = Image.open(image_data)  # Creates an instance of PIL Image class - PIL does the verification of file
        img_copy = copy.copy(img)  # Verify the copied image, not original - verification requires you to open the image again after verification, but since we don't have the file saved yet we won't be able to. This is because once we read() urllib2.urlopen we can't access the response again without remaking the request (i.e. downloading the image again). Rather than do that, we duplicate the PIL Image in memory.
        if valid_img(img_copy):
            return img
        else:
            # Maybe this is not the best error handling...you might want to just provide a path to a generic image instead
            raise Exception('An invalid image was detected when attempting to save a Product!')

    def valid_img(img):
        """Verifies that an instance of a PIL Image Class is actually an image and returns either True or False."""
        type = img.format
        if type in ('GIF', 'JPEG', 'JPG', 'PNG'):
            try:
                img.verify()
                return True
            except:
                return False
        else:
            return False

    def __unicode__(self):
        return self.post_image.url
And my view.py is:
def createpost(request):
    # Handle file upload
    new_img_id = 0
    if request.method == 'POST':
        external_url = request.POST['url']
        p = Post(poster=request.user)
        p.save(external_url)
        new_img_id = p.id

    post = Post.objects.filter(id=new_img_id)
    return render_to_response('create.html', {'post': post}, context_instance=RequestContext(request))
And this is where the URL gets called:
$.ajax({
    type: "POST",
    url: "/create/",
    data: {'url': newURL, 'csrfmiddlewaretoken': csrftoken},
    success: function(){}
});
In the console I am getting this:
in save
NameError: name 'download_image' is not defined
And in the browser console I'm getting this:
POST http://localhost:8000/create/ 500 (INTERNAL SERVER ERROR)
If anyone can see where the origin of this problem may be, please help :D
I did try changing the order of the defs, but it made no difference.
Since your functions are methods of Post, you need to call them as such. Methods are always referred to via the instance, so in this case self.download_image(url), and they always need to take self as the first parameter, so def download_image(self, url). Both of these points also apply to valid_img.
Note also that it is a very bad idea to override the signature of the save method. Lots of code both in Django and in third-party applications will not be expecting that parameter. Instead, get it from kwargs:
def save(self, *args, **kwargs):
    url = kwargs.pop('url', '')
If you want to make it an object method, the first parameter should be self, and you can then call the method via self.download_image(...).
Alternatively, you could define download_image inside the save method, if you want to call it like this:
def save(self, ...):
    def download_image():
        ...
    download_image()
I have a Class that I'm instantiating and then passing into a Tornado Web template. Both functions return a list, but I'm missing something in making the Class itself an iterable object. I'm afraid it's something fundamental I'm doing incorrectly. I'm making REST API calls, parsing the returned XML and returning some of the data to the webapp. Here's the code:
The API calls:
class GetVMList:
    def __init__(self):
        user = 'contoso\\administrator'
        password = "apassword"
        url = "http://scspf:8090/SC2012/VMM/Microsoft.Management.Odata.svc/VirtualMachines?$filter=VMMServer%20eq%20'scvmm'"
        passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
        passman.add_password(None, url, user, password)
        # create the NTLM authentication handler
        auth_NTLM = HTTPNtlmAuthHandler.HTTPNtlmAuthHandler(passman)
        # create and install the opener
        opener = urllib2.build_opener(auth_NTLM)
        urllib2.install_opener(opener)
        # retrieve the result
        self.response = urllib2.urlopen(url)
        self.data = self.response.read()

    def name(self):
        dom = parseString(self.data)
        raw_xml = dom.getElementsByTagName('d:Name')
        clean_xml = []
        clean_data = []
        for i in raw_xml:
            clean_xml.append(i.toxml())
        for i in clean_xml:
            clean_data.append(i.replace('<d:Name>', '').replace('</d:Name>', ''))
        return clean_data

    def os(self):
        dom = parseString(self.data)
        raw_xml = dom.getElementsByTagName('d:OperatingSystem')
        clean_xml = []
        clean_data = []
        for i in raw_xml:
            clean_xml.append(i.toxml())
        for i in clean_xml:
            clean_data.append(i.replace('<d:OperatingSystem>', '').replace('</d:OperatingSystem>', ''))
        return clean_data
The instantiation:
class ListHandler(tornado.web.RequestHandler):
    def get(self):
        self.render('temp/search.html', data='')

    def post(self):
        vm_list = GetVMList()
        self.render('temp/search.html', data=vm_list)
And then the template contains this:
{% for vm in data %}
<li>{{ vm.name }} running {{ vm.os }}</li>
{% end %}
The error is: TypeError: iteration over non-sequence. I would imagine I need to use __iter__ in my Class, but I'm not sure I understand exactly how it works.
I believe you're missing the definition of __iter__ in your class.
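For example, here is a minimal, untested sketch of what that could look like, staying close to your existing class (the VM namedtuple and the stub name()/os() bodies are just illustrations standing in for your real parsing code):

from collections import namedtuple

# a tiny record type so the template can keep using vm.name / vm.os
VM = namedtuple('VM', ['name', 'os'])

class GetVMList:
    # stand-ins for the real name()/os() methods from the question,
    # which each return a list of strings parsed from the XML
    def name(self):
        return ['vm1', 'vm2']

    def os(self):
        return ['Windows Server 2012', 'Ubuntu 12.04']

    def __iter__(self):
        # pair each VM name with its operating system; iterating over the
        # instance now yields one record per machine
        for name, os in zip(self.name(), self.os()):
            yield VM(name, os)

# the template's "{% for vm in data %}" would then iterate like this:
for vm in GetVMList():
    print(vm.name, 'running', vm.os)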
My advice would be the following:
Create a class VM for storing information about a single VM. Its __init__ should take the information you want to store about each VM and set it as attributes on the instance. If you don't need any actual code to go along with the data about the VM, you can use a collections.namedtuple, which will save you writing an __init__() method.
Write getVMs() as a generator that, given a user, password, and URL, yields a sequence of VM instances. This result can be iterated over as-is, or can easily be converted to a regular list if you need one (just pass it to list()) or used to create a dictionary that maps VM names to OSs or vice versa.
e.g. (this code hasn't been tested):
class VM(object):
    def __init__(self, name, os):
        self.name = name
        self.os = os

def getVMs(user, password, url):
    passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
    passman.add_password(None, url, user, password)
    auth_NTLM = HTTPNtlmAuthHandler.HTTPNtlmAuthHandler(passman)
    urllib2.install_opener(urllib2.build_opener(auth_NTLM))
    dom = parseString(urllib2.urlopen(url).read())
    for vmnode in dom.getElementsByTagName('d:VM'):  # the tag representing a VM
        name = vmnode.getElementsByTagName('d:Name')[0].toxml()  # get name of current VM
        name = name.replace('<d:Name>', '').replace('</d:Name>', '')
        os = vmnode.getElementsByTagName('d:OperatingSystem')[0].toxml()  # same for OS
        os = os.replace('<d:OperatingSystem>', '').replace('</d:OperatingSystem>', '')
        yield VM(name, os)
... you could also give your VM objects the XML for the name and OS, or the XML for the whole VM, but this sample implementation only does the name and OS as strings.
(There are better ways to get the contents of a DOM node without resorting to replacing the XML tags with blank strings, but I don't have time to do that right now.)
Calling it:
user = r"contoso\administrator"
pass = "apassword"
url = ("http://scspf:8090/SC2012/VMM/Microsoft.Management.Odata.svc"
"/VirtualMachines?$filter=VMMServer%20eq%20'scvmm'")
vmlist = list(getVMs(user, pass, url))
Or to just print the info for each VM without storing an intermediate list:
for vm in getVMs(user, password, url):
    print vm.name, vm.os
Or to build a dictionary of names to VM instances (assuming a recent version of Python that has dict comprehensions):
vmdict = {vm.name: vm for vm in getVMs(user, password, url)}
Using the generator model makes it maximally flexible for the caller. Even if that caller is you, it'll make your life easier.
I'm implementing a REST-style interface and would like to be able to create (via upload) files via a HTTP PUT request. I would like to create either a TemporaryUploadedFile or a InMemoryUploadedFile which I can then pass to my existing FileField and .save() on the object that is part of the model, thereby storing the file.
I'm not quite sure how to handle the file upload part. Specifically, this being a PUT request, I do not have access to request.FILES, since it does not exist for a PUT request.
So, some questions:
Can I leverage existing functionality in the HttpRequest class, specifically the part that handles file uploads? I know a direct PUT is not a multipart MIME request, so I don't think so, but it is worth asking.
How can I deduce the mime type of what is being sent? If I've got it right, a PUT body is simply the file without prelude. Do I therefore require that the user specify the mime type in their headers?
How do I extend this to large amounts of data? I don't want to read it all into memory since that is highly inefficient. Ideally I'd do what TemporaryUploadedFile and related code does: write it out a chunk at a time?
I've taken a look at this code sample which tricks Django into handling PUT as a POST request. If I've got it right though, it'll only handle form encoded data. This is REST, so the best solution would be to not assume form encoded data will exist. However, I'm happy to hear appropriate advice on using mime (not multipart) somehow (but the upload should only contain a single file).
Django 1.3 is acceptable. So I can either do something with request.raw_post_data or request.read() (or alternatively some other better method of access). Any ideas?
Django 1.3 is acceptable. So I can either do something with request.raw_post_data or request.read() (or alternatively some other better method of access). Any ideas?
You don't want to be touching request.raw_post_data - that implies reading the entire request body into memory, which if you're talking about file uploads might be a very large amount, so request.read() is the way to go. You can do this with Django <= 1.2 as well, but it means digging around in HttpRequest to figure out the right way to use the private interfaces, and it's a real drag to then ensure your code will also be compatible with Django >= 1.3.
I'd suggest that what you want to do is to replicate the existing file upload behaviour parts of the MultiPartParser class:
Retrieve the upload handlers from request.upload_handlers (which by default will be MemoryFileUploadHandler & TemporaryFileUploadHandler)
Determine the request's content length (search for Content-Length in HttpRequest or MultiPartParser to see the right way to do this)
Determine the uploaded file's filename, either by letting the client specify this using the last path part of the url, or by letting the client specify it in the "filename=" part of the Content-Disposition header.
For each handler, call handler.new_file with the relevant args (mocking up a field name)
Read the request body in chunks using request.read() and calling handler.receive_data_chunk() for each chunk.
For each handler call handler.file_complete(), and if it returns a value, that's the uploaded file.
How can I deduce the mime type of what is being sent? If I've got it right, a PUT body is simply the file without prelude. Do I therefore require that the user specify the mime type in their headers?
Either let the client specify it in the Content-Type header, or use Python's mimetypes module to guess the media type.
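For example, a small sketch using the standard library (falling back to a generic type when the name doesn't match anything is just one possible policy):

import mimetypes

def guess_content_type(file_name):
    # guess_type returns (type, encoding); type is None if nothing matched
    content_type, _ = mimetypes.guess_type(file_name)
    return content_type or 'application/octet-stream'

print(guess_content_type('photo.jpg'))    # image/jpeg
print(guess_content_type('data.xyz123'))  # falls back to application/octet-stream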
I'd be interested to find out how you get on with this - it's something I've been meaning to look into myself, be great if you could comment to let me know how it goes!
Edit by Ninefingers: as requested, this is what I did; it's based entirely on the above and the Django source.
upload_handlers = request.upload_handlers
content_type = str(request.META.get('CONTENT_TYPE', ""))
content_length = int(request.META.get('CONTENT_LENGTH', 0))

if content_type == "":
    return HttpResponse(status=400)
if content_length == 0:
    # both returned 0
    return HttpResponse(status=400)

# pull the charset out of the header before stripping it down to the bare media type
try:
    charset = content_type.split(";")[1].strip()
except IndexError:
    charset = ""
content_type = content_type.split(";")[0].strip()

# we can get the file name via the path; we don't actually have a form field to take it from
file_name = path.split("/")[-1:][0]
field_name = file_name
Since I'm defining the API here, cross-browser support isn't a concern. As far as my protocol is concerned, not supplying the correct information is a broken request. I'm in two minds as to whether I want to say image/jpeg; charset=binary or whether I'm going to allow non-existent charsets. In any case, I'm making setting Content-Type validly a client-side responsibility.
Similarly, for my protocol, the file name is passed in. I'm not sure what the field_name parameter is for and the source didn't give many clues.
What happens below is actually much simpler than it looks. You ask each handler if it will handle the raw input. As the author of the above states, you've got MemoryFileUploadHandler & TemporaryFileUploadHandler by default. Well, it turns out that MemoryFileUploadHandler, when asked to create a new_file, decides whether or not it will handle the file (based on various settings). If it decides it will, it throws an exception; otherwise it won't create the file and lets another handler take over.
I'm not sure what the purpose of counters was, but I've kept it from the source. The rest should be straightforward.
counters = [0] * len(upload_handlers)

for handler in upload_handlers:
    result = handler.handle_raw_input("", request.META, content_length, "", "")

for handler in upload_handlers:
    try:
        handler.new_file(field_name, file_name,
                         content_type, content_length, charset)
    except StopFutureHandlers:
        break

for i, handler in enumerate(upload_handlers):
    while True:
        chunk = request.read(handler.chunk_size)
        if chunk:
            handler.receive_data_chunk(chunk, counters[i])
            counters[i] += len(chunk)
        else:
            # no chunk
            break

for i, handler in enumerate(upload_handlers):
    file_obj = handler.file_complete(counters[i])
    if not file_obj:
        # some indication this didn't work?
        return HttpResponse(status=500)
    else:
        # handle file obj!
Newer Django versions allow this to be handled a lot more easily, thanks to https://gist.github.com/g00fy-/1161423
I modified the given solution like this:
if request.content_type.startswith('multipart'):
    put, files = request.parse_file_upload(request.META, request)
    request.FILES.update(files)
    request.PUT = put.dict()
else:
    request.PUT = QueryDict(request.body).dict()
to be able to access files and other data like in POST. You can remove the calls to .dict() if you want your data to be read-only.
I hit this problem while working with Django 2.2, and was looking for something that just worked for uploading a file via PUT request.
from django.http import HttpResponse, QueryDict
from django.http.multipartparser import MultiValueDict
from django.core.files.uploadhandler import (
    SkipFile,
    StopFutureHandlers,
    StopUpload,
)


class PutUploadMiddleware(object):
    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request):
        method = request.META.get("REQUEST_METHOD", "").upper()
        if method == "PUT":
            self.handle_PUT(request)
        return self.get_response(request)

    def handle_PUT(self, request):
        content_type = str(request.META.get("CONTENT_TYPE", ""))
        content_length = int(request.META.get("CONTENT_LENGTH", 0))
        file_name = request.path.split("/")[-1:][0]
        field_name = file_name
        content_type_extra = None

        if content_type == "":
            return HttpResponse(status=400)
        if content_length == 0:
            # both returned 0
            return HttpResponse(status=400)

        # pull the charset out before stripping the header down to the media type
        try:
            charset = content_type.split(";")[1].strip()
        except IndexError:
            charset = ""
        content_type = content_type.split(";")[0].strip()

        upload_handlers = request.upload_handlers

        for handler in upload_handlers:
            result = handler.handle_raw_input(
                request.body,
                request.META,
                content_length,
                boundary=None,
                encoding=None,
            )

        counters = [0] * len(upload_handlers)
        for handler in upload_handlers:
            try:
                handler.new_file(
                    field_name,
                    file_name,
                    content_type,
                    content_length,
                    charset,
                    content_type_extra,
                )
            except StopFutureHandlers:
                break

        for chunk in request:
            for i, handler in enumerate(upload_handlers):
                chunk_length = len(chunk)
                chunk = handler.receive_data_chunk(chunk, counters[i])
                counters[i] += chunk_length
                if chunk is None:
                    # Don't continue if the chunk received by
                    # the handler is None.
                    break

        for i, handler in enumerate(upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                request.FILES.appendlist(file_name, file_obj)
                break

        any(handler.upload_complete() for handler in upload_handlers)
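To activate it, the class just needs to be added to the middleware stack; the module path below is a placeholder for wherever you put the class:

# settings.py (assumed module path)
MIDDLEWARE = [
    # ... the usual Django middleware ...
    'myapp.middleware.PutUploadMiddleware',
]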
Is there an easy way to cache things when using urllib2 that I am overlooking, or do I have to roll my own?
If you don't mind working at a slightly lower level, httplib2 (https://github.com/httplib2/httplib2) is an excellent HTTP library that includes caching functionality.
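For example, a quick sketch of httplib2's built-in caching (the directory name is arbitrary; responses are stored there and reused subject to their cache headers):

import httplib2

# pass a directory name to turn on httplib2's on-disk cache
h = httplib2.Http(".cache")

# the first request goes to the network; repeat requests can be served
# from the cache, depending on the response's cache-control headers
response, content = h.request("http://example.com/")
print(response.status)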
You could use a decorator function such as:
class cache(object):
    def __init__(self, fun):
        self.fun = fun
        self.cache = {}

    def __call__(self, *args, **kwargs):
        key = str(args) + str(kwargs)
        try:
            return self.cache[key]
        except KeyError:
            self.cache[key] = rval = self.fun(*args, **kwargs)
            return rval
        except TypeError:  # in case key isn't a valid key - don't cache
            return self.fun(*args, **kwargs)
and define a function along the lines of:
@cache
def get_url_src(url):
    return urllib.urlopen(url).read()
This is assuming you're not paying attention to HTTP Cache-Control headers, but just want to cache the page for the duration of the application.
This ActiveState Python recipe might be helpful:
http://code.activestate.com/recipes/491261/
I've always been torn between using httplib2, which does a solid job of handling HTTP caching and authentication, and urllib2, which is in the stdlib, has an extensible interface, and supports HTTP Proxy servers.
The ActiveState recipe starts to add caching support to urllib2, but only in a very primitive fashion. It fails to allow for extensibility in storage mechanisms, hard-coding the file-system-backed storage. It also does not honor HTTP cache headers.
In an attempt to bring together the best features of httplib2 caching and urllib2 extensibility, I've adapted the ActiveState recipe to implement most of the same caching functionality as is found in httplib2. The module is in jaraco.net as jaraco.net.http.caching. The link points to the module as it exists at the time of this writing. While that module is currently part of the larger jaraco.net package, it has no intra-package dependencies, so feel free to pull the module out and use it in your own projects.
Alternatively, if you have Python 2.6 or later, you can easy_install jaraco.net>=1.3 and then utilize the CachingHandler with something like the code in caching.quick_test().
"""Quick test/example of CacheHandler"""
import logging
import urllib2
from httplib2 import FileCache
from jaraco.net.http.caching import CacheHandler
logging.basicConfig(level=logging.DEBUG)
store = FileCache(".cache")
opener = urllib2.build_opener(CacheHandler(store))
urllib2.install_opener(opener)
response = opener.open("http://www.google.com/")
print response.headers
print "Response:", response.read()[:100], '...\n'
response.reload(store)
print response.headers
print "After reload:", response.read()[:100], '...\n'
Note that jaraco.net.http.caching does not provide a specification for the backing store for the cache, but instead follows the interface used by httplib2. For this reason, httplib2.FileCache can be used directly with urllib2 and the CacheHandler. Also, other backing caches designed for httplib2 should be usable by the CacheHandler.
I was looking for something similar, and came across "Recipe 491261: Caching and throttling for urllib2", which danivo posted. The problem is I really dislike the caching code (lots of duplication, lots of manual joining of file paths instead of using os.path.join, use of staticmethods, not very PEP 8-ish, and other things that I try to avoid).
This code is a bit nicer (in my opinion, anyway) and is functionally much the same, with a few additions, mainly the "recache" method (example usage can be seen here, or in the if __name__ == "__main__": section at the end of the code).
The latest version can be found at http://github.com/dbr/tvdb_api/blob/master/cache.py, and I'll paste it here for posterity (with my application specific headers removed):
#!/usr/bin/env python
"""
urllib2 caching handler
Modified from http://code.activestate.com/recipes/491261/ by dbr
"""

import os
import time
import httplib
import urllib2
import StringIO
from hashlib import md5

def calculate_cache_path(cache_location, url):
    """Checks if [cache_location]/[hash_of_url].headers and .body exist
    """
    thumb = md5(url).hexdigest()
    header = os.path.join(cache_location, thumb + ".headers")
    body = os.path.join(cache_location, thumb + ".body")
    return header, body

def check_cache_time(path, max_age):
    """Checks if a file has been created/modified in the [last max_age] seconds.
    False means the file is too old (or doesn't exist), True means it is
    up-to-date and valid"""
    if not os.path.isfile(path):
        return False
    cache_modified_time = os.stat(path).st_mtime
    time_now = time.time()
    if cache_modified_time < time_now - max_age:
        # Cache is old
        return False
    else:
        return True

def exists_in_cache(cache_location, url, max_age):
    """Returns if header AND body cache file exist (and are up-to-date)"""
    hpath, bpath = calculate_cache_path(cache_location, url)
    if os.path.exists(hpath) and os.path.exists(bpath):
        return (
            check_cache_time(hpath, max_age)
            and check_cache_time(bpath, max_age)
        )
    else:
        # File does not exist
        return False

def store_in_cache(cache_location, url, response):
    """Tries to store response in cache."""
    hpath, bpath = calculate_cache_path(cache_location, url)
    try:
        outf = open(hpath, "w")
        headers = str(response.info())
        outf.write(headers)
        outf.close()

        outf = open(bpath, "w")
        outf.write(response.read())
        outf.close()
    except IOError:
        return True
    else:
        return False

class CacheHandler(urllib2.BaseHandler):
    """Stores responses in a persistent on-disk cache.

    If a subsequent GET request is made for the same URL, the stored
    response is returned, saving time, resources and bandwidth
    """
    def __init__(self, cache_location, max_age=21600):
        """The location of the cache directory"""
        self.max_age = max_age
        self.cache_location = cache_location
        if not os.path.exists(self.cache_location):
            os.mkdir(self.cache_location)

    def default_open(self, request):
        """Handles GET requests, if the response is cached it returns it
        """
        if request.get_method() != "GET":
            return None  # let the next handler try to handle the request

        if exists_in_cache(
            self.cache_location, request.get_full_url(), self.max_age
        ):
            return CachedResponse(
                self.cache_location,
                request.get_full_url(),
                set_cache_header=True
            )
        else:
            return None

    def http_response(self, request, response):
        """Gets a HTTP response, if it was a GET request and the status code
        starts with 2 (200 OK etc) it caches it and returns a CachedResponse
        """
        if (request.get_method() == "GET"
            and str(response.code).startswith("2")
        ):
            if 'x-local-cache' not in response.info():
                # Response is not cached
                set_cache_header = store_in_cache(
                    self.cache_location,
                    request.get_full_url(),
                    response
                )
            else:
                set_cache_header = True
            #end if x-cache in response

            return CachedResponse(
                self.cache_location,
                request.get_full_url(),
                set_cache_header=set_cache_header
            )
        else:
            return response

class CachedResponse(StringIO.StringIO):
    """An urllib2.response-like object for cached responses.

    To determine if a response is cached or coming directly from
    the network, check the x-local-cache header rather than the object type.
    """
    def __init__(self, cache_location, url, set_cache_header=True):
        self.cache_location = cache_location
        hpath, bpath = calculate_cache_path(cache_location, url)

        StringIO.StringIO.__init__(self, file(bpath).read())

        self.url = url
        self.code = 200
        self.msg = "OK"
        headerbuf = file(hpath).read()
        if set_cache_header:
            headerbuf += "x-local-cache: %s\r\n" % (bpath)
        self.headers = httplib.HTTPMessage(StringIO.StringIO(headerbuf))

    def info(self):
        """Returns headers
        """
        return self.headers

    def geturl(self):
        """Returns original URL
        """
        return self.url

    def recache(self):
        new_request = urllib2.urlopen(self.url)
        set_cache_header = store_in_cache(
            self.cache_location,
            new_request.url,
            new_request
        )
        CachedResponse.__init__(self, self.cache_location, self.url, True)


if __name__ == "__main__":
    def main():
        """Quick test/example of CacheHandler"""
        opener = urllib2.build_opener(CacheHandler("/tmp/"))
        response = opener.open("http://google.com")
        print response.headers
        print "Response:", response.read()

        response.recache()
        print response.headers
        print "After recache:", response.read()
    main()
This article on Yahoo Developer Network - http://developer.yahoo.com/python/python-caching.html - describes how to cache http calls made through urllib to either memory or disk.
@dbr: you may also need to add caching of HTTPS responses, with:
def https_response(self, request, response):
    return self.http_response(request, response)