Python file cache

I'm creating some objects from files (validators from template XSD files, to draw together other XSD files, as it happens), and I'd like to recreate the objects when the file on disk changes.
I could create something like:
def getobj(fname, cache={}):
    try:
        obj, lastloaded = cache[fname]
        if lastloaded < last_time_written(fname):
            # same stuff as in the except clause
            obj = create_from_file(fname)
            cache[fname] = (obj, currenttime)
    except KeyError:
        obj = create_from_file(fname)
        cache[fname] = (obj, currenttime)
    return obj
However, I would prefer to use someone else's tested code if it exists. Is there an existing library that does something like this?
Update: I'm using python 2.7.1.

Your code (including the cache logic) looks fine.
Consider moving the cache variable outside the function definition. That will make it possible to add other functions to clear or inspect the cache.
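For illustration, a minimal sketch of that module-level variant (the helper names last_time_written and create_from_file are taken from the question; everything else is illustrative):

import time

_cache = {}

def getobj(fname):
    try:
        obj, lastloaded = _cache[fname]
        if lastloaded < last_time_written(fname):
            raise KeyError          # force a reload below
    except KeyError:
        obj = create_from_file(fname)
        _cache[fname] = (obj, time.time())
    return obj

def clear_cache():
    _cache.clear()

def cached_files():
    return list(_cache)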
If you want to look at code that does something similar, look at the source for the filecmp module (http://hg.python.org/cpython/file/2.7/Lib/filecmp.py). The interesting part is how the stat module is used to determine whether a file has changed. Here is the signature function:
def _sig(st):
    return (stat.S_IFMT(st.st_mode),
            st.st_size,
            st.st_mtime)
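For illustration, a minimal sketch of how such a signature could drive cache invalidation (the has_changed helper is hypothetical, not part of filecmp; it assumes _sig and the stat import from above):

import os

def has_changed(fname, _sigs={}):
    # compare against the (type, size, mtime) signature recorded on the previous call
    sig = _sig(os.stat(fname))
    changed = _sigs.get(fname) != sig
    _sigs[fname] = sig
    return changed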

Three thoughts.
Use try... except... else for a neater control flow.
File modification times are notoriously unstable -- in particular, they don't necessarily correspond to the most recent time the file was modified!
Python 3 contains a caching decorator: functools.lru_cache. Here's the source.
def lru_cache(maxsize=100):
    """Least-recently-used cache decorator.

    If *maxsize* is set to None, the LRU features are disabled and the cache
    can grow without bound.

    Arguments to the cached function must be hashable.

    View the cache statistics named tuple (hits, misses, maxsize, currsize)
    with f.cache_info().  Clear the cache and statistics with f.cache_clear().
    Access the underlying function with f.__wrapped__.

    See: http://en.wikipedia.org/wiki/Cache_algorithms#Least_Recently_Used
    """
    # Users should only access the lru_cache through its public API:
    #     cache_info, cache_clear, and f.__wrapped__
    # The internals of the lru_cache are encapsulated for thread safety and
    # to allow the implementation to change (including a possible C version).

    def decorating_function(user_function,
                            tuple=tuple, sorted=sorted, len=len, KeyError=KeyError):
        hits = misses = 0
        kwd_mark = (object(),)          # separates positional and keyword args
        lock = Lock()                   # needed because ordereddicts aren't threadsafe

        if maxsize is None:
            cache = dict()              # simple cache without ordering or size limit

            @wraps(user_function)
            def wrapper(*args, **kwds):
                nonlocal hits, misses
                key = args
                if kwds:
                    key += kwd_mark + tuple(sorted(kwds.items()))
                try:
                    result = cache[key]
                    hits += 1
                except KeyError:
                    result = user_function(*args, **kwds)
                    cache[key] = result
                    misses += 1
                return result
        else:
            cache = OrderedDict()       # ordered least recent to most recent
            cache_popitem = cache.popitem
            cache_renew = cache.move_to_end

            @wraps(user_function)
            def wrapper(*args, **kwds):
                nonlocal hits, misses
                key = args
                if kwds:
                    key += kwd_mark + tuple(sorted(kwds.items()))
                try:
                    with lock:
                        result = cache[key]
                        cache_renew(key)        # record recent use of this key
                        hits += 1
                except KeyError:
                    result = user_function(*args, **kwds)
                    with lock:
                        cache[key] = result     # record recent use of this key
                        misses += 1
                        if len(cache) > maxsize:
                            cache_popitem(0)    # purge least recently used cache entry
                return result

        def cache_info():
            """Report cache statistics"""
            with lock:
                return _CacheInfo(hits, misses, maxsize, len(cache))

        def cache_clear():
            """Clear the cache and cache statistics"""
            nonlocal hits, misses
            with lock:
                cache.clear()
                hits = misses = 0

        wrapper.cache_info = cache_info
        wrapper.cache_clear = cache_clear
        return wrapper

    return decorating_function
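As a rough sketch of how this could solve the original problem (assuming Python 3.2+, or a backport such as functools32 on 2.7): include the file's modification time in the cache key, so a changed file produces a cache miss. create_from_file is the question's helper; everything else is illustrative.

import os
from functools import lru_cache

@lru_cache(maxsize=32)
def _load(fname, mtime):
    # mtime is only there to vary the key when the file changes
    return create_from_file(fname)

def getobj(fname):
    return _load(fname, os.path.getmtime(fname))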

Unless there is a specific reason to pass it as an argument, I would make cache a global (module-level) object.

Related

How can I override global variables just for the scope of callees of a function in Python?

I'm writing a decorator which needs to pass data to other utility functions; something like:
STORE = []

def utility(message):
    STORE.append(message)

def decorator(func):
    def decorator_wrap(*args, **kwargs):
        global STORE
        saved_STORE = STORE
        STORE = list()
        func(*args, **kwargs)
        for line in STORE:
            print(line)
        STORE = saved_STORE
    return decorator_wrap

@decorator
def foo(x):
    # ...
    utility(x)
    # ...
But that's kind of yuck, and not thread safe. Is there a way to override utility()'s view of STORE for the duration of decorator_wrap()? Or some other way to signal to utility() that there's an alternate STORE it should use?
Alternatively, to present an different utility() to foo() and all its callees; but that seems like exactly the same problem.
From this answer I find that I can implement it this way:
import inspect

STORE = []

def utility(message):
    global STORE
    store = STORE
    frame = inspect.currentframe()
    while frame:
        if 'LOCAL_STORE' in frame.f_locals:
            store = frame.f_locals['LOCAL_STORE']
            break
        frame = frame.f_back
    store.append(message)

def decorator(func):
    def decorator_wrap(*args, **kwargs):
        LOCAL_STORE = []
        func(*args, **kwargs)
        for line in LOCAL_STORE:
            print(line)
    return decorator_wrap
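For reference, a quick usage sketch of the frame-walking version above (the message text is illustrative):

@decorator
def foo(x):
    utility("processing %r" % (x,))

foo(42)    # decorator_wrap prints "processing 42" collected in LOCAL_STORE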
But while reading the documentation I see that f_globals is present in every stack frame. A more efficient method might be to inject my local into my callee's f_globals; this would be similar to setting an environment variable before executing another command, but I don't know if it's legal.

Python Runtime Profiler?

Most Python profilers are made for standalone programs or scripts. In my case I'm working with a Python plugin for a third-party app (Blender 3D), so the profiling needs to happen in real time while the user is interacting with the plugin.
I'm currently trying an injection strategy, which consists of procedurally searching through all plugin modules and injecting a profiling wrapper into every function.
Below is what my current profiler looks like.
I'm wondering if there are other profilers out there that can be used for runtime scenarios such as plugins.
class ModuleProfiler:

    # is the profiler running?
    allow = False        # must be True in order to start the profiler
    activated = False    # read-only indication that the profiler has been activated

    # please define your plugin main module here
    plugin_main_module = "MyBlenderPlugin"

    # function call registry
    registry = {}

    # ignore parameters, typically ui functions/modules
    ignore_fcts = [
        "draw",
        "foo",
    ]
    ignore_module = [
        "interface_drawing",
    ]

    event_prints = True  # print every event?

    @classmethod
    def print_registry(cls):
        """print all registered benchmarks"""
        # generate totals
        for k, v in cls.registry.copy().items():
            cls.registry[k]["averagetime"] = v["runtime"] / v["calls"]
        print("")
        print("PROFILER: PRINTING OUTCOME")
        sorted_registry = dict(sorted(cls.registry.items(), key=lambda item: item[1]["runtime"], reverse=False))
        for k, v in sorted_registry.items():
            print("\n", k, ":")
            for a, val in v.items():
                print(" " * 6, a, ":", val)
        return None

    @classmethod
    def update_registry(cls, fct, exec_time=0):
        """update internal benchmark with new data"""
        key = f"{fct.__module__}.{fct.__name__}"
        r = cls.registry.get(key)
        if (r is None):
            cls.registry[key] = {}
            cls.registry[key]["calls"] = 0
            cls.registry[key]["runtime"] = 0
            r = cls.registry[key]
        r["calls"] += 1
        r["runtime"] += exec_time
        return None

    @classmethod
    def profile_wrap(cls, fct):
        """wrap any function with our benchmark & call counter"""
        # ignore some functions?
        if (fct.__name__ in cls.ignore_fcts):
            return fct
        import functools
        import time

        @functools.wraps(fct)
        def inner(*args, **kwargs):
            t = time.time()
            r = fct(*args, **kwargs)
            cls.update_registry(fct, exec_time=time.time() - t)
            if cls.event_prints:
                print(f"PROFILER : {fct.__module__}.{fct.__name__} : {time.time() - t}")
            return r

        return inner

    @classmethod
    def start(cls):
        """inject the wrapper into every function of every sub-module of our plugin;
        used for benchmark or debugging purposes only"""
        if (not cls.allow):
            return None
        cls.activated = True
        import types
        import sys

        def is_function(obj):
            """check if the given object is a function"""
            return isinstance(obj, types.FunctionType)

        print("")
        # for all modules in sys.modules
        for mod_k, mod in sys.modules.copy().items():
            # separate module component names
            mod_list = mod_k.split('.')
            # filter out what isn't ours
            if (mod_list[0] != cls.plugin_main_module):
                continue
            # ignore some modules?
            if any([m in cls.ignore_module for m in mod_list]):
                continue
            print("PROFILER_SEARCH : ", mod_k)
            # for each object found in the module
            for ele_k, ele in mod.__dict__.items():
                # if it does not have a name, skip it
                if (not hasattr(ele, "__name__")):
                    continue
                # we have a global function
                elif is_function(ele):
                    print(f"  INJECT LOCAL_FUNCTION: {mod_k}.{ele_k}")
                    mod.__dict__[ele_k] = cls.profile_wrap(ele)
                # then we have a homebrewed class? search for class fcts
                # class fct injection is not flawless, need to investigate issue(s)
                elif repr(ele).startswith(f"<class '{cls.plugin_main_module}."):
                    for class_k, class_e in ele.__dict__.items():
                        if is_function(class_e):
                            print(f"  INJECT CLASS_FUNCTION: {mod_k}.{ele_k}.{class_k}")
                            # class __dict__s are mapping proxies, so assign via setattr
                            setattr(mod.__dict__[ele_k], class_k, cls.profile_wrap(class_e))
                continue
        print("")
        return None

ModuleProfiler.allow = True
ModuleProfiler.plugin_main_module = "MyModule"
ModuleProfiler.start()
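Once the plugin has been exercised for a while, the collected data can be dumped with the print_registry classmethod shown above, for example:

ModuleProfiler.print_registry()   # prints calls, runtime and averagetime per wrapped function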

How do I write this as a context manager?

The race-condition-free way of updating a variable in redis is:
r = redis.Redis()
with r.pipeline() as p:
    while 1:
        try:
            p.watch(KEY)
            val = p.get(KEY)
            newval = int(val) + 42
            p.multi()
            p.set(KEY, newval)
            p.execute()     # raises WatchError if anyone else changed KEY
            break
        except redis.WatchError:
            continue        # retry
This is significantly more complex than the straightforward version (which contains a race condition):
r = redis.Redis()
val = r.get(KEY)
newval = int(val) + 42
r.set(KEY, newval)
so I thought a context manager would make this easier to work with; however, I'm having problems...
My initial idea was:
with update(KEY) as val:
    newval = val + 42
    # ...and somehow return newval to the context manager?
There wasn't an obvious way to do that last line, so I tried:
@contextmanager
def update(key, cn=None):
    """Usage::

        with update(KEY) as (p, val):
            newval = int(val) + 42
            p.set(KEY, newval)

    """
    r = cn or redis.Redis()
    with r.pipeline() as p:
        while 1:
            try:
                p.watch(key)          # --> immediate mode
                val = p.get(key)
                p.multi()             # --> back to buffered mode
                yield (p, val)
                p.execute()           # raises WatchError if anyone has changed `key`
                break                 # success, break out of while loop
            except redis.WatchError:
                pass                  # someone else got there before us, retry.
This works great as long as I don't catch a WatchError. When I do, I get:
  File "c:\python27\Lib\contextlib.py", line 28, in __exit__
    raise RuntimeError("generator didn't stop")
RuntimeError: generator didn't stop
What am I doing wrong?
I think the problem is that you yield multiple times (when the operation is retried), but a context manager is only entered once (everything up to the yield effectively plays the role of __enter__). So as soon as the yield can execute more than once, you have a problem.
I’m not perfectly sure how to solve this in a good way, and I can’t test it either, so I’m only giving some suggestions.
First of all, I would avoid yielding the rather internal p; you should yield some object that is specifically made for the update process. For example something like this:
with update(KEY) as updater:
    updater.value = int(updater.original) + 42
Of course this still doesn’t solve the multiple yields, and you cannot yield that object earlier as you won’t have the original value at that point either. So instead, we could specify a delegate responsible for updating the value.
with update(KEY) as updater:
    updater.process = lambda value: value + 42
This would store a function inside the yielded object which you can then use inside the context manager to keep trying to update the value until it succeeded. And you can yield that updater from the context manager early, before entering the while loop.
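A minimal, untested sketch of that idea (the _Updater class is just an illustration): the yield happens exactly once, and the retry loop runs after the with-block has assigned the delegate.

from contextlib import contextmanager
import redis

@contextmanager
def update(key, cn=None):
    r = cn or redis.Redis()

    class _Updater(object):
        process = None          # the with-block assigns a callable here

    updater = _Updater()
    yield updater               # single yield, so contextlib is satisfied

    # this part runs on exit, after updater.process has been set
    with r.pipeline() as p:
        while 1:
            try:
                p.watch(key)
                val = p.get(key)
                p.multi()
                p.set(key, updater.process(int(val)))
                p.execute()     # raises WatchError if anyone changed `key`
                break
            except redis.WatchError:
                continue        # retry with a fresh value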
Of course, if you have made it this far, there isn’t actually any need for a context manager left. Instead, you can just make a function:
update(key, lambda value: value + 42)

python function that changes itself to list

So I'm working on a chemistry project for fun, and I have a function that initializes a list from a text file. What I want to do is make it so the function replaces itself with a list. Here's my first attempt at it, which randomly will or won't work, and I don't know why:
def periodicTable():
    global periodicTable
    tableAtoms = open('/Users/username/Dropbox/Python/Chem Project/atoms.csv', 'r')
    listAtoms = tableAtoms.readlines()
    tableAtoms.close()
    del listAtoms[0]
    atoms = []
    for atom in listAtoms:
        atom = atom.split(',')
        atoms.append(Atom(*atom))
    periodicTable = atoms
It gets called in this way:
def findAtomBySymbol(symbol):
    try:
        periodicTable()
    except:
        pass
    for atom in periodicTable:
        if atom.symbol == symbol:
            return atom
    return None
Is there a way to make this work?
Don't do that. The correct thing to do would be using a decorator that ensures the function is only executed once and caches the return value:
def cachedfunction(f):
    cache = []
    def deco(*args, **kwargs):
        if cache:
            return cache[0]
        result = f(*args, **kwargs)
        cache.append(result)
        return result
    return deco

@cachedfunction
def periodicTable():
    # etc.
That said, there's nothing stopping you from replacing the function itself after it has been called, so your approach should generally work. I think the reason it doesn't is because an exception is thrown before you assign the result to periodicTable and thus it never gets replaced. Try removing the try/except block or replacing the blanket except with except TypeError to see what exactly happens.
This is very bad practice.
What would be better is to have your function remember if it has already loaded the table:
def periodicTable(_table=[]):
    if _table:
        return _table
    tableAtoms = open('/Users/username/Dropbox/Python/Chem Project/atoms.csv', 'r')
    listAtoms = tableAtoms.readlines()
    tableAtoms.close()
    del listAtoms[0]
    atoms = []
    for atom in listAtoms:
        atom = atom.split(',')
        atoms.append(Atom(*atom))
    _table[:] = atoms
    return _table
The first two lines check to see if the table has already been loaded, and if it has it simply returns it.
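With either approach, the caller no longer needs the try/except; a sketch of the simplified lookup:

def findAtomBySymbol(symbol):
    for atom in periodicTable():
        if atom.symbol == symbol:
            return atom
    return None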

De-uglify memcache boilerplate in Python

I'm just getting started with Python. I'm making heavy use of caching in my app and my code is increasingly littered with this same pattern, which is the standard caching pattern I've seen used all over the shop. Are there some sexy syntactic tricks in Python that can DRY out some of this boilerplate?
(btw, this is not actual code)
# Determine if we are allowed to use the cache
cacheable = settings.cache.lifetime is not None

# Generate a unique cache key
cache_key = 'something_unique_{some_arg}'.format(some_arg=args[0])

# Return the cached version if allowed and available
if cacheable:
    cached = memcache.get(cache_key)
    if cached:
        return cached

# Generate output
output = do_something_fooey(args[0])

# Cache the output if allowed
if cacheable:
    memcache.set(cache_key, output, settings.cache.lifetime)

return output
I'm going to have a stab at this too, probably writing a caching wrapper function and passing the output generation to it as a "delegate" (dunno if that's Python lingo), but it'd be great to get some advice from Python experts.
You want a decorator:
def cached(func):
    def _cached(*args):
        # Determine if we are allowed to use the cache
        cacheable = settings.cache.lifetime is not None
        # Generate a unique cache key
        cache_key = '{0}-{1}-{2}'.format(func.__module__, func.__name__, args[0])
        # Return the cached version if allowed and available
        if cacheable:
            result = memcache.get(cache_key)
            if result is not None:
                return result
        # Generate output
        result = func(args[0])
        # Cache the output if allowed
        if cacheable and result is not None:
            memcache.set(cache_key, result, settings.cache.lifetime)
        return result
    return _cached

@cached
def do_something_fooey(*args):
    return something
You may want to use functools.wraps (http://docs.python.org/2/library/functools.html#functools.wraps) for a well-behaved decorator.
I've found a couple of alternate pre-rolled solutions:
https://github.com/jayferd/python-cache
and
https://gist.github.com/abahgat/1395810
In the end I created the below, which is a fleshed-out version of @bruno's example. The nice thing about this one is that you can pass an extra_key to the decorator, which forms part of the caching key and can be either a string or a delegate function (lifetime can also be a delegate function or an integer). This allows you to add stuff at runtime such as caching uniquely by user id.
import functools
import hashlib
import pickle

def cached(lifetime=settings.cache.default_lifetime, extra_key=None):
    def _cached(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Resolve lifetime if it's a function
            resolved_lifetime = lifetime(*args) if hasattr(lifetime, '__call__') else lifetime
            if resolved_lifetime is not None:
                # Hash the function args
                items = kwargs.items()
                items.sort()
                hashable_args = (args, tuple(items))
                args_key = hashlib.md5(pickle.dumps(hashable_args)).hexdigest()
                # Generate a unique cache key
                cache_key = '{0}-{1}-{2}-{3}'.format(
                    func.__module__,
                    func.__name__,
                    args_key,
                    extra_key() if hasattr(extra_key, '__call__') else extra_key
                )
                # Return the cached version if allowed and available
                result = memcache.get(cache_key)
                if result is not None:
                    return result
            # Generate output
            result = func(*args, **kwargs)
            # Cache the output if allowed
            if resolved_lifetime is not None and result is not None:
                memcache.set(cache_key, result, resolved_lifetime)
            return result
        return wrapper
    return _cached
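For illustration, a hypothetical use of this decorator (current_user_id and expensive_lookup are placeholder names, not part of the snippet above):

@cached(lifetime=300, extra_key=lambda: 'user-%s' % current_user_id())
def fetch_record(record_id):
    # cached per argument and per user for five minutes
    return expensive_lookup(record_id)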
