Lazy data-flow (spreadsheet-like) properties with dependencies in Python

My problem is the following: I have some Python classes whose properties are derived from other properties; these should be cached once they are calculated, and the cached results should be invalidated each time the base properties change.
I could do it manually, but that seems difficult to maintain as the number of properties grows. So I would like to have something like Makefile rules inside my objects to automatically keep track of what needs to be recalculated.
The desired syntax and behaviour should be something like this:
# this does dirty magic, like generating the reverse dependency graph,
# and preparing the setters that invalidate the cached values
@dataflow_class
class Test(object):

    def calc_a(self):
        return self.b + self.c

    def calc_c(self):
        return self.d * 2

    a = managed_property(calculate=calc_a, depends_on=('b', 'c'))
    b = managed_property(default=0)
    c = managed_property(calculate=calc_c, depends_on=('d',))
    d = managed_property(default=0)
t = Test()
print t.a
# a has not been initialized, so it calls calc_a
# gets b value
# c has not been initialized, so it calls calc_c
# c value is calculated and stored in t.__c
# a value is calculated and stored in t.__a
t.b = 1
# invalidates the calculated value stored in self.__a
print t.a
# a has been invalidated, so it calls calc_a
# gets b value
# gets c value, from t.__c
# a value is calculated and stored in t.__a
print t.a
# gets value from t.__a
t.d = 2
# invalidates the calculated values stored in t.__a and t.__c
So, is there something like this already available or should I start implementing my own? In the second case, suggestions are welcome :-)

Here, this should do the trick.
The descriptor mechanism (through which the language implements "property") is more than enough for what you want. If the code below does not work in some corner cases, just write me.
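(As a quick refresher on the mechanism, not part of the solution that follows: a descriptor is any object with __get__/__set__/__delete__ methods; placed as a class attribute, it intercepts attribute access on instances. A toy sketch, in the same Python 2 style as the answer:)
class Verbose(object):
    # A toy descriptor: intercepts reads and writes of one attribute.
    def __get__(self, instance, owner):
        print "reading"
        return 42
    def __set__(self, instance, value):
        print "writing", value

class C(object):
    x = Verbose()

c = C()
c.x      # prints "reading"; the expression evaluates to 42
c.x = 1  # prints "writing 1"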
class DependentProperty(object):
    def __init__(self, calculate=None, default=None, depends_on=()):
        # "name" and "dependence_tree" attributes are
        # set up by the metaclass of the owner class
        if calculate:
            self.calculate = calculate
        else:
            self.default = default
        self.depends_on = set(depends_on)

    def __get__(self, instance, owner):
        if hasattr(self, "default"):
            return self.default
        if not hasattr(instance, "_" + self.name):
            setattr(instance, "_" + self.name,
                self.calculate(instance, getattr(instance, "_" + self.name + "_last_value")))
        return getattr(instance, "_" + self.name)

    def __set__(self, instance, value):
        setattr(instance, "_" + self.name + "_last_value", value)
        setattr(instance, "_" + self.name, self.calculate(instance, value))
        # invalidate everything that (transitively) depends on this property
        for attr in self.dependence_tree[self.name]:
            delattr(instance, attr)

    def __delete__(self, instance):
        try:
            delattr(instance, "_" + self.name)
        except AttributeError:
            pass
def assemble_tree(name, dict_, all_deps=None):
    if all_deps is None:
        all_deps = set()
    for dependence in dict_[name].depends_on:
        all_deps.add(dependence)
        assemble_tree(dependence, dict_, all_deps)
    return all_deps

def invert_tree(tree):
    new_tree = {}
    for key, val in tree.items():
        for dependence in val:
            if dependence not in new_tree:
                new_tree[dependence] = set()
            new_tree[dependence].add(key)
    return new_tree
class DependenceMeta(type):
    def __new__(cls, name, bases, dict_):
        dependence_tree = {}
        properties = []
        for key, val in dict_.items():
            if not isinstance(val, DependentProperty):
                continue
            val.name = key
            val.dependence_tree = dependence_tree
            dependence_tree[key] = set()
            properties.append(val)
        inverted_tree = {}
        for property in properties:
            inverted_tree[property.name] = assemble_tree(property.name, dict_)
        dependence_tree.update(invert_tree(inverted_tree))
        return type.__new__(cls, name, bases, dict_)
if __name__ == "__main__":
    # Example and visual test:

    class Bla:
        __metaclass__ = DependenceMeta

        def calc_b(self, x):
            print "Calculating b"
            return x + self.a

        def calc_c(self, x):
            print "Calculating c"
            return x + self.b

        a = DependentProperty(default=10)
        b = DependentProperty(depends_on=("a",), calculate=calc_b)
        c = DependentProperty(depends_on=("b",), calculate=calc_c)

    bla = Bla()
    bla.b = 5
    bla.c = 10
    print bla.a, bla.b, bla.c
    bla.b = 10
    print bla.b
    print bla.c
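If I have traced the caching and invalidation correctly, running this example should print:
Calculating b
Calculating c
10 15 25
Calculating b
20
Calculating c
30
Note that the reads between mutations are served from the cached _b and _c values, so no "Calculating" line appears for them.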

I would like to have something like Makefile rules
then use one! You may consider this model:
one rule = one Python file
one result = one *.data file
the pipe is implemented as a makefile or with another dependency analysis tool (cmake, scons)
The hardware test team in our company uses such a framework for intensive exploratory tests:
you can integrate other languages and tools easily
you get a stable and proven solution
computations may be distributed on multiple CPUs/computers
you track dependencies on values and rules
debugging of intermediate values is easy
The (big) downside to this method is that you have to give up Python's import keyword, because it creates an implicit (and untracked) dependency (there are workarounds for this).
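As a rough illustration of the model (file names, the pickle format, and the make rule are all hypothetical), the c = d * 2 rule from the question could become a rule file like this:
# calc_c.py -- one rule in its own file: reads d.data, writes c.data.
# A makefile would tie it into the pipeline with a rule such as:
#   c.data: d.data calc_c.py
#           python calc_c.py
import pickle

with open("d.data", "rb") as f:
    d = pickle.load(f)

with open("c.data", "wb") as f:
    pickle.dump(d * 2, f)
make then re-runs the rule only when d.data (or the rule file itself) is newer than c.data, which is exactly the invalidation behaviour asked for.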

import collections

sentinel = object()

class ManagedProperty(object):
    '''
    If deptree = {'a': set(['b', 'c'])}, then ManagedProperties `b` and
    `c` will be reset whenever `a` is modified.
    '''
    def __init__(self, property_name, calculate=None, depends_on=tuple(),
                 default=sentinel):
        self.property_name = property_name
        self.private_name = '_' + property_name
        self.calculate = calculate
        self.depends_on = depends_on
        self.default = default

    def __get__(self, obj, objtype):
        if obj is None:
            # Allows getattr(cls, mprop) to return the ManagedProperty instance
            return self
        try:
            return getattr(obj, self.private_name)
        except AttributeError:
            result = (getattr(obj, self.calculate)()
                      if self.default is sentinel else self.default)
            setattr(obj, self.private_name, result)
            return result

    def __set__(self, obj, value):
        # obj._dependencies is defined by @register
        # (Python 2: map() runs eagerly here, deleting every dependent value)
        map(obj.__delattr__, getattr(obj, '_dependencies').get(self.property_name, tuple()))
        setattr(obj, self.private_name, value)

    def __delete__(self, obj):
        if hasattr(obj, self.private_name):
            delattr(obj, self.private_name)
def register(*mproperties):
    def flatten_dependencies(name, deptree, all_deps=None):
        '''
        A deptree such as {'c': set(['a']), 'd': set(['c'])} means
        'a' depends on 'c' and 'c' depends on 'd'.
        Given such a deptree, flatten_dependencies('d', deptree) returns the set
        of all property_names that depend on 'd' (i.e. set(['a', 'c']) in the
        above case).
        '''
        if all_deps is None:
            all_deps = set()
        for dep in deptree.get(name, tuple()):
            all_deps.add(dep)
            flatten_dependencies(dep, deptree, all_deps)
        return all_deps

    def classdecorator(cls):
        deptree = collections.defaultdict(set)
        for mprop in mproperties:
            setattr(cls, mprop.property_name, mprop)
        # Find all ManagedProperties in dir(cls). Note that some of these may be
        # inherited from bases of cls; they may not be listed in mproperties.
        # Doing it this way allows ManagedProperties to be overridden by subclasses.
        for propname in dir(cls):
            mprop = getattr(cls, propname)
            if not isinstance(mprop, ManagedProperty):
                continue
            for underlying_prop in mprop.depends_on:
                deptree[underlying_prop].add(mprop.property_name)
        # Flatten the dependency tree so no recursion is necessary. If one were
        # to use recursion instead, then a naive algorithm would make duplicate
        # calls to __delete__. By flattening the tree, there are no duplicate
        # calls to __delete__.
        dependencies = {key: flatten_dependencies(key, deptree)
                        for key in deptree.keys()}
        setattr(cls, '_dependencies', dependencies)
        return cls
    return classdecorator
These are the unit tests I used to verify its behavior.
if __name__ == "__main__":
    import unittest
    import sys

    def count(meth):
        def wrapper(self, *args):
            countname = meth.func_name + '_count'
            setattr(self, countname, getattr(self, countname, 0) + 1)
            return meth(self, *args)
        return wrapper

    class Test(unittest.TestCase):
        def setUp(self):
            @register(
                ManagedProperty('d', default=0),
                ManagedProperty('b', default=0),
                ManagedProperty('c', calculate='calc_c', depends_on=('d',)),
                ManagedProperty('a', calculate='calc_a', depends_on=('b', 'c')))
            class Foo(object):
                @count
                def calc_a(self):
                    return self.b + self.c

                @count
                def calc_c(self):
                    return self.d * 2

            @register(ManagedProperty('c', calculate='calc_c', depends_on=('b',)),
                      ManagedProperty('a', calculate='calc_a', depends_on=('b', 'c')))
            class Bar(Foo):
                @count
                def calc_c(self):
                    return self.b * 3

            self.Foo = Foo
            self.Bar = Bar
            self.foo = Foo()
            self.foo2 = Foo()
            self.bar = Bar()

        def test_two_instances(self):
            self.foo.b = 1
            self.assertEqual(self.foo.a, 1)
            self.assertEqual(self.foo.b, 1)
            self.assertEqual(self.foo.c, 0)
            self.assertEqual(self.foo.d, 0)

            self.assertEqual(self.foo2.a, 0)
            self.assertEqual(self.foo2.b, 0)
            self.assertEqual(self.foo2.c, 0)
            self.assertEqual(self.foo2.d, 0)

        def test_initialization(self):
            self.assertEqual(self.foo.a, 0)
            self.assertEqual(self.foo.calc_a_count, 1)
            self.assertEqual(self.foo.a, 0)
            self.assertEqual(self.foo.calc_a_count, 1)
            self.assertEqual(self.foo.b, 0)
            self.assertEqual(self.foo.c, 0)
            self.assertEqual(self.foo.d, 0)

            self.assertEqual(self.bar.a, 0)
            self.assertEqual(self.bar.b, 0)
            self.assertEqual(self.bar.c, 0)
            self.assertEqual(self.bar.d, 0)

        def test_dependence(self):
            self.assertEqual(self.Foo._dependencies,
                             {'c': set(['a']), 'b': set(['a']), 'd': set(['a', 'c'])})

            self.assertEqual(self.Bar._dependencies,
                             {'c': set(['a']), 'b': set(['a', 'c'])})

        def test_setting_property_updates_dependent(self):
            self.assertEqual(self.foo.a, 0)
            self.assertEqual(self.foo.calc_a_count, 1)

            self.foo.b = 1
            # invalidates the calculated value stored in foo.a
            self.assertEqual(self.foo.a, 1)
            self.assertEqual(self.foo.calc_a_count, 2)
            self.assertEqual(self.foo.b, 1)
            self.assertEqual(self.foo.c, 0)
            self.assertEqual(self.foo.d, 0)

            self.foo.d = 2
            # invalidates the calculated values stored in foo.a and foo.c
            self.assertEqual(self.foo.a, 5)
            self.assertEqual(self.foo.calc_a_count, 3)
            self.assertEqual(self.foo.b, 1)
            self.assertEqual(self.foo.c, 4)
            self.assertEqual(self.foo.d, 2)

            self.assertEqual(self.bar.a, 0)
            self.assertEqual(self.bar.calc_a_count, 1)
            self.assertEqual(self.bar.b, 0)
            self.assertEqual(self.bar.c, 0)
            self.assertEqual(self.bar.calc_c_count, 1)
            self.assertEqual(self.bar.d, 0)

            self.bar.b = 2
            self.assertEqual(self.bar.a, 8)
            self.assertEqual(self.bar.calc_a_count, 2)
            self.assertEqual(self.bar.b, 2)
            self.assertEqual(self.bar.c, 6)
            self.assertEqual(self.bar.calc_c_count, 2)
            self.assertEqual(self.bar.d, 0)

            self.bar.d = 2
            self.assertEqual(self.bar.a, 8)
            self.assertEqual(self.bar.calc_a_count, 2)
            self.assertEqual(self.bar.b, 2)
            self.assertEqual(self.bar.c, 6)
            self.assertEqual(self.bar.calc_c_count, 2)
            self.assertEqual(self.bar.d, 2)

    sys.argv.insert(1, '--verbose')
    unittest.main(argv=sys.argv)

Related

How to convert an object back into the code used to create it?

For example, if I have a custom Python object like this:
#!/usr/bin/env python3
import os
base_dir = os.path.abspath(".")

class MyFile(dict):
    def __init__(self, name, size = None, dir = base_dir):
        self.name = name
        self.path = os.path.join(dir, name)
        self.bytes = size
and somewhere in my program, I create an instance of my class:
a = MyFile(name = "foo", size = 10)
I want to be able to return the code used to create the object in the first place. For example:
print(a)
# <__main__.MyFile object at 0x102b84470>
# should instead print:
# MyFile(name = "foo", size = 10)
But since my object has some default attribute values, I only want those to show up in the output if they were explicitly included when the object was initialized:
b = MyFile(name = "bar", dir = "/home")
print(b)
# <__main__.MyFile object at 0x102b845c0>
# should instead print:
# MyFile(name = "bar", dir = "/home")
And to be clear, I am not trying to pull this from the source code, because a lot of my objects will be created dynamically, and I want to be able to return the same thing for them as well:
l = [ ("baz", 4), ("buzz", 12) ]
f = [ MyFile(name = n, size = s) for n, s in l ]
print(f)
# [<__main__.MyFile object at 0x1023844a8>, <__main__.MyFile object at 0x102384828>]
# should instead print:
# [ MyFile(name = "baz", size = 4), MyFile(name = "buzz", size = 12) ]
I saw the inspect library (https://docs.python.org/3/library/inspect.html) but it does not seem to have anything that does this. What am I missing? This functionality would be pretty analogous to R's dput function.
At a very basic level you can do this:
class MyClass:
    def __init__(self, a, b):
        self.a = a
        self.b = b

    def __repr__(self):
        return f'{self.__class__.__name__}({self.a}, {self.b})'

class MyOtherClass(MyClass):
    def method(self):
        pass

c = MyClass(1, 2)
oc = MyOtherClass(3, 4)
print(c, oc)
Result:
MyClass(1, 2) MyOtherClass(3, 4)
This does what you ask, as well as taking subclassing into account to provide the correct class name. But of course things can get complicated for several reasons:
class MyClass:
    def __init__(self, a, b):
        self.a = a + 1
        self.b = b if b < 10 else a
        self.c = 0

    def inc_c(self):
        self.c += 1

    def __repr__(self):
        return f'{self.__class__.__name__}({self.a - 1}, {self.b})'
The value of c isn't covered by the constructor, so the proposed call would set it to 0. And although you could compensate for the + 1 applied to a, recovering the value of b is more complicated, even more so if you realise someone could have changed the value later.
And then you need to consider that subclasses can override behaviour, etc. So doing something like this only makes sense in very limited use cases.
As simple as replacing your code snippet with the following:
import os
base_dir = os.path.abspath(".")

class MyFile(object):
    def __init__(self, name, size = None, dir = base_dir):
        self.name = name
        self.path = os.path.join(dir, name)
        self.bytes = size
        self.remember(name, size, dir)

    def remember(self, name, size, dir):
        self.s = '{}(name = \'{}\'{}{})'.format(
            self.__class__.__name__, name,
            ", size=" + str(size) if size != None else "",
            ', dir="' + dir + '"' if dir != base_dir else "")

    def __repr__(self):
        return self.s
a) for a it returns:
MyFile(name = 'foo', size=10)
b) for b it returns:
MyFile(name = 'bar', dir="/home")
c) for f it returns:
[MyFile(name = 'baz', size=4), MyFile(name = 'buzz', size=12)]
Thanks to everyone who commented and answered. Ultimately, I incorporated their ideas and feedback into the following method, which allowed me to preserve the object's native __repr__ while still getting the behaviors I wanted.
#!/usr/bin/env python3
import os
base_dir = os.path.abspath(".")

class MyFile(dict):
    """
    A custom dict class that auto-populates some keys based on simple input args,
    compatible with unittest.TestCase.assertDictEqual
    """
    def __init__(self, name, size = None, dir = base_dir):
        """
        standard init method
        """
        self.name = name
        self.path = os.path.join(dir, name)
        self.bytes = size
        # auto-populate this key
        self['somekey'] = self.path + ' ' + str(self.bytes)
        # more logic for more complex keys goes here...
        # use these later with `init` and `repr`
        self.args = None
        self.kwargs = None

    @classmethod
    def init(cls, *args, **kwargs):
        """
        alternative method to initialize the object while retaining the args passed
        """
        obj = cls(*args, **kwargs)
        obj.args = args
        obj.kwargs = kwargs
        return obj

    def repr(self):
        """
        returns a text representation of the object that can be used to
        create a new copy of an identical object, displaying only the
        args that were originally used to create the current object instance
        (do not show args that were not passed e.g. default value args)
        """
        n = 'MyFile('
        if self.args:
            for i, arg in enumerate(self.args):
                n += arg.__repr__()
                if i < len(self.args) - 1 or self.kwargs:
                    n += ', '
        if self.kwargs:
            for i, (k, v) in enumerate(self.kwargs.items()):
                n += str(k) + '=' + v.__repr__()
                if i < len(self.kwargs.items()) - 1:
                    n += ', '
        n += ')'
        return n
Usage:
# normal object initialization
obj1 = MyFile('foo', size=10)
print(obj1) # {'somekey': '/Users/me/test/foo 10'}
# initialize with classmethod instead to preserve args
obj2 = MyFile.init("foo", size = 10)
print(obj2) # {'somekey': '/Users/me/test/foo 10'}
# view the text representation
repr = obj2.repr()
print(repr) # MyFile('foo', size=10)
# re-load a copy of the object from the text representation
obj3 = eval(repr)
print(obj3) # {'somekey': '/Users/me/test/foo 10'}
My use case is representing large, simple data structures (dicts) in my Python code (integration tests), where the data values are dynamically generated from a smaller set of variables. When I have many hundreds of such data structures to include in a test case, it becomes infeasible to write out MyFile(...) by hand hundreds of times. This method lets me ingest the data with a script and then print out the compact Python code needed to recreate it with my custom object class, which I can copy/paste into my test cases.

How do I clear the cache from #cached_property decorator?

I have a function called "value" that does a heavy calculation...
The result of the function is always the same if the dataset is not changed for the identifier.
Once the dataset is changed for some identifier, I want to clear the cache, and let the function calculate it again.
You can better understand me by looking at this code:
from functools import cached_property

class Test:
    identifiers = {}
    dataset = an empty object of dataset type

    def __init__(self, identifier, ...):
        self.identifier = identifier
        ...
        Test.identifiers[identifier] = self
        ...

    @cached_property
    def value(self):
        result = None
        # heavy calculation based on dataset
        return result

    @classmethod
    def get(cls, identifier):
        if identifier in cls.identifiers:
            return cls.identifiers[identifier]
        else:
            return cls(identifier, ...)

    @classmethod
    def update(cls, dataset):
        for block in dataset:
            # assume there is block['identifier'] in each block
            # here I want to clear the cache of the value() function
            instance = cls.get(block['identifier'])
            # clear @cached_property of instance
            cls.dataset.append(block)
As you can read in the CPython source, the value for a cached_property in Python 3.8 is stored in an instance variable of the same name. This is not documented, so it may be an implementation detail that you should not rely upon.
But if you just want to get it done without regards to compatibility, you can remove the cache with del instance.value.
As of Python 3.9, this is documented.
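A minimal sketch of that pattern (the Dataset class and its attribute are illustrative, not from the question):
from functools import cached_property

class Dataset:
    def __init__(self, blocks):
        self.blocks = blocks

    @cached_property
    def value(self):
        print("recomputing")
        return sum(self.blocks)

d = Dataset([1, 2, 3])
print(d.value)   # prints "recomputing", then 6
print(d.value)   # cached: prints 6 only
del d.value      # drops the cached entry from d.__dict__
print(d.value)   # prints "recomputing", then 6 again
Note that del d.value raises AttributeError if the value has not been cached yet, so guard it (or pop from the instance dict) when that is possible.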
(Additional to @Blckknght's answer)
In case you have a mutable object and you need to refresh all of its @cached_property attributes (because the object has been mutated), you can delete the properties that are already cached in the self.__dict__ dictionary (that's where the property values are stored):
from functools import cached_property
from typing import List

class Test:
    datalist: List[int]

    @cached_property
    def value(self):
        result = None
        # heavy calculation based on datalist
        return result

    def add_element(self, new: int) -> None:
        # reset the cache if the value was already calculated
        self.__dict__.pop('value', None)  # deletes the cached value if present, otherwise does nothing
        self.datalist.append(new)
Or, if you want to do it more elegantly, you can override the __setattr__ method:
from functools import cached_property
from typing import List

class Test:
    datalist: List[int]

    @cached_property
    def value(self):
        result = None
        # heavy calculation based on datalist
        return result

    def __setattr__(self, name, val):
        self.__dict__[name] = val
        self.__dict__.pop('value', None)  # any attribute assignment invalidates the cache
I offer an alternative approach, which might be useful in some cases.
If the type of the dataset you need to do the computation on is hashable, you can make use of the regular functools.cache or lru_cache decorator, applied to a static method that takes the dataset as input.
Here is an example of what I mean:
from functools import lru_cache

class MyClass():
    def __init__(self, data):
        self.data = data

    @property
    def slow_attribute(self):
        return self._slow_attribute(self.data)

    @staticmethod
    @lru_cache
    def _slow_attribute(data):
        # long computation, using data,
        # here is just an example
        return sum(data)
Here there is no need to concern yourself with when to clear the cache: if the underlying dataset changes, the staticmethod automatically knows it cannot use the cached value anymore.
This has the additional perk that, if the dataset were to be restored to a previously-used state, the lookup may still be able to use a cached value.
Here is a demo of the code above working:
from time import perf_counter_ns

def print_time_and_value_of_computation(c):
    t1 = perf_counter_ns()
    val = c.slow_attribute
    t2 = perf_counter_ns()
    print(f'Time taken: {(t2 - t1)/1000} microseconds')
    print(f'Value: {val}')

c = MyClass(range(10_000))

print_time_and_value_of_computation(c)
print_time_and_value_of_computation(c)

print('Changing the dataset!')
c.data = range(20_000)

print_time_and_value_of_computation(c)
print_time_and_value_of_computation(c)

print('Going back to the original dataset!')
c.data = range(10_000)
print_time_and_value_of_computation(c)
which returns:
Time taken: 162.074 microseconds
Value: 49995000
Time taken: 2.152 microseconds
Value: 49995000
Changing the dataset!
Time taken: 264.121 microseconds
Value: 199990000
Time taken: 1.989 microseconds
Value: 199990000
Going back to the original dataset!
Time taken: 1.144 microseconds
Value: 49995000
I ran across this problem and came across this thread as I was trying to solve it. In my case the data is effectively immutable, except that setting up the object sometimes involves using the properties, which are then out of date after setup. @Pablo's answer was helpful, but I wanted that process to dynamically reset everything cached.
Here's a generic example:
Setup and broken thing:
from functools import cached_property

class BaseThing:
    def __init__(self, *starting_numbers: int):
        self.numbers = []
        self.numbers.extend(starting_numbers)

    @property
    def numbers_as_strings(self) -> dict[int, str]:
        """This property method will be referenced repeatedly"""

    def process_arbitrary_numbers(self, *arbitrary_numbers: int) -> list[str]:
        return [self.numbers_as_strings.get(number) for number in arbitrary_numbers]

    def extend_numbers(self, *additional_numbers: int):
        self.numbers.extend(additional_numbers)

class BrokenThing(BaseThing):
    @cached_property
    def numbers_as_strings(self) -> dict[int, str]:
        print("Working on:", " ".join(map(str, self.numbers)))
        return {number: str(number) for number in self.numbers}
output:
>>> thing = BrokenThing(1, 2, 3, 4)
>>> thing.process_arbitrary_numbers(1, 3) == ["1", "3"]
Working on: 1 2 3 4
True
>>> thing.extend_numbers(4, 5, 6)
>>> thing.process_arbitrary_numbers(5, 6) == ["5", "6"]
False
@cached_property replaced with @property to make it work, leaving it inefficient:
class InefficientThing(BaseThing):
    @property
    def numbers_as_strings(self) -> dict[int, str]:
        print("Working on:", " ".join(map(str, self.numbers)))
        return {number: str(number) for number in self.numbers}
output:
>>> thing = InefficientThing(1, 2, 3)
>>> thing.process_arbitrary_numbers(1, 3) == ["1", "3"]
Working on: 1 2 3
Working on: 1 2 3
True
>>> thing.extend_numbers(4, 5, 6)
>>> thing.process_arbitrary_numbers(5, 6) == ["5", "6"]
Working on: 1 2 3 4 5 6
Working on: 1 2 3 4 5 6
True
Solution:
class EfficientThing(BaseThing):
    def _clear_cached_properties(self):
        for name in dir(type(self)):
            if isinstance(getattr(type(self), name), cached_property):
                print(f"Clearing self.{name}")
                vars(self).pop(name, None)

    def extend_numbers(self, *additional_numbers: int):
        self._clear_cached_properties()
        return super().extend_numbers(*additional_numbers)

    @cached_property
    def numbers_as_strings(self) -> dict[int, str]:
        print("Working on:", " ".join(map(str, self.numbers)))
        return {number: str(number) for number in self.numbers}
output:
>>> thing = EfficientThing(1, 2, 3, 4)
>>> thing.process_arbitrary_numbers(1, 3) == ["1", "3"]
Working on: 1 2 3 4
True
>>> thing.extend_numbers(4, 5, 6)
Clearing self.numbers_as_strings
>>> thing.process_arbitrary_numbers(5, 6) == ["5", "6"]
Working on: 1 2 3 4 4 5 6
True
This loops through all attributes of the object's class. If a class attribute is an instance of cached_property, the entry of the same name is popped from the instance dictionary. None is passed to pop in case the property hadn't been cached yet.
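If several unrelated classes need this clearing step, the same loop can be pulled out into a free function; a minimal sketch of that idea (the function name is mine):
from functools import cached_property

def clear_cached_properties(obj):
    """Drop every cached_property value stored in obj's instance dict."""
    cls = type(obj)
    for name in dir(cls):
        if isinstance(getattr(cls, name), cached_property):
            vars(obj).pop(name, None)  # no-op if that property was never cached
EfficientThing._clear_cached_properties above is just the method form of this loop.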

Python: check if one or more values is None, and know which ones are

I have a Python app with a Firebase-database backend.
When I retrieve the data from my database, I want to check if those values
are available (if not, that means that the database is somehow corrupted, as mandatory fields are missing)
My current implementation is the following:
self.foo = myDbRef.get('foo')
self.bar = myDbRef.get('bar')
self.bip = myDbRef.get('bip')
self.plop = myDbRef.get('plop')
if self.foo is None or self.bar is None or self.bip is None or self.plop is None:
    self.isValid = False
    return ErrorCode.CORRUPTED_DATABASE
This works fine and is compact, but it has a major issue: I learn that the database is corrupted,
but not which field is missing (it could be just one of them, several, or all!)
The idiomatic approach should be
if self.foo is None:
    self.isValid = False
    return ErrorCode.CORRUPTED_DATABASE, "FOO IS MISSING" # could be a string, an enum value, whatever; I have the information
if self.bar is None:
    self.isValid = False
    return ErrorCode.CORRUPTED_DATABASE, "BAR IS MISSING"
if self.bip is None:
    self.isValid = False
    return ErrorCode.CORRUPTED_DATABASE, "BIP IS MISSING"
But this is not pretty and not factorized (all my 'init from db' functions use the same pattern... I don't want to multiply my
line count by a factor of 10 for such a case).
This is not a '100% Python' question, but I hope the language has something for me to handle this like a boss (it's Python: it usually does!)
You could extract the checks into a generator and leave the flag and return statements outside.
def invalid_fields():
    if self.foo is None: yield "FOO"
    if self.bar is None: yield "BAR"
    if self.bip is None: yield "BIP"

invalid = list(invalid_fields())
if invalid:
    self.isValid = False
    return ErrorCode.CORRUPTED_DATABASE, "MISSING {}".format(", ".join(invalid))
This has the advantage of telling you about all the missing fields if there are more than one.
I made a class to stand in for the functionality of yours that I can't access. I also made ErrorCode a string as a hack, since it isn't defined in my environment, and I'm not sure how you want the names of the None fields returned alongside the ErrorCode.
Build a dict of names and values, check that the dict contains no None values, and if it does, return which keys:
myDbRef = {'foo': None,
           'bar': 1,
           'bip': 2,
           'plop': 3}

class Foo():
    def __init__(self):
        self.foo = myDbRef.get('foo')
        self.bar = myDbRef.get('bar')
        self.bip = myDbRef.get('bip')
        self.plop = myDbRef.get('plop')

    def check(self):
        temp_dict = {}
        for key in ['foo', 'bar', 'bip', 'plop']:
            temp_dict[key] = myDbRef.get(key)
        vals = {k: v for k, v in temp_dict.items() if v is None}
        if vals:
            self.isValid = False
            return ("ErrorCode.CORRUPTED_DATABASE", [k for k in vals.keys()])

f = Foo()
print(f.check())
Result: ('ErrorCode.CORRUPTED_DATABASE', ['foo'])
Use a function and a loop:
def checknone(**things_with_names):
    for name, thing in things_with_names.items():
        if thing is None:
            return ErrorCode.CORRUPTED_DATABASE, name + " IS MISSING"
    return True
And use as such:
result = checknone(foo=self.foo, bar=self.bar, bip=self.bip, plop=self.plop)
if result is not True:
    self.isValid = False
    return result
For maximum gains, make it a method on a mixin class that you mix into every class using this pattern; that way it can also set isValid. A sketch follows.
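A sketch of that mixin idea (the class and method names are mine; ErrorCode and isValid come from the question):
class NoneCheckMixin(object):
    def check_required(self, *names):
        # collect every required attribute that is still None
        missing = [name for name in names if getattr(self, name) is None]
        if missing:
            self.isValid = False
            return ErrorCode.CORRUPTED_DATABASE, "MISSING {}".format(
                ", ".join(name.upper() for name in missing))
        return None
A class that inherits the mixin can then do:
result = self.check_required('foo', 'bar', 'bip', 'plop')
if result is not None:
    return result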
You can dynamically create and search your instance attributes like so:
class Foo():
    def __init__(self):
        # First, define the list of attributes you want to look for and an empty list of errors
        self.attrbs = ['foo', 'bar', 'bip', 'plop']
        self.errors = []

        # Iterate through the attributes list
        for attrb in self.attrbs:
            # Create and assign self.foo to myDbRef.get('foo'), etc.
            self.__dict__[attrb] = myDbRef.get(attrb)
            # Check if the attribute is empty; if so, record an error
            if not self.__dict__[attrb]:
                self.errors.append(attrb.upper())

        # Check if there are any errors (note: __init__ cannot return a value,
        # so the error report is stored on the instance instead)
        if self.errors:
            self.is_valid = False
            self.error = (ErrorCode.CORRUPTED_DATABASE,
                          "MISSING {errs}".format(errs='/'.join(self.errors)))
        else:
            self.is_valid = True

python cache dictionary - counting number of hits

I'm implementing a caching service in Python, using a simple dictionary so far. What I'd like to do is count the number of hits (the number of times a stored value was retrieved by its key). The Python builtin dict has no such capability (as far as I know). I searched 'python dictionary count' and found Counter (also on Stack Overflow), but I think it doesn't satisfy my requirements: I don't need to count what already exists, I need to increment something that comes from the outside. And storing a second dictionary just for hit counts doesn't seem like the best data structure I can get :)
Do you have any ideas how to do it efficiently?
For an alternative method, if you're using Python 3 (or are willing to add this module to your Python 2 project, which has a slightly different interface), I strongly recommend the lru_cache decorator.
See the docs here. For example, this code:
from functools import lru_cache

@lru_cache(maxsize=32)
def meth(a, b):
    print("Taking some time", a, b)
    return a + b

print(meth(2, 3))
print(meth(2, 4))
print(meth(2, 3))
...will output:
Taking some time 2 3
5
Taking some time 2 4
6
5 <--- Notice that this function result is cached
As per the documentation, you can get the number of hits and misses with meth.cache_info(), and clear the cache with meth.cache_clear().
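For instance, after the three calls above (the exact counts assume a fresh cache, so they follow from the one repeated call):
print(meth.cache_info())  # CacheInfo(hits=1, misses=2, maxsize=32, currsize=2)
meth.cache_clear()        # empties the cache; subsequent calls recompute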
You can subclass a built-in dict class:
class CustomDict(dict):
    def __init__(self, *args, **kwargs):
        self.hits = {}
        super(CustomDict, self).__init__(*args, **kwargs)

    def __getitem__(self, key):
        if key not in self.hits:
            self.hits[key] = 0
        self.hits[key] += 1
        return super(CustomDict, self).__getitem__(key)
usage:
>>> d = CustomDict()
>>> d["test"] = "test"
>>> d["test"]
'test'
>>> d["test"]
'test'
>>> d.hits["test"]
2
Having another dictionary to store the hit counts is probably not a bad option, but you could also do something like:
class CacheService(object):
    def __init__(self):
        self.data = {}

    def __setitem__(self, key, item):
        self.data[key] = [item, 0]

    def __getitem__(self, key):
        value = self.data[key]
        value[1] += 1
        return value[0]

    def getcount(self, key):
        return self.data[key][1]
You can use it something like this:
>>> cs = CacheService()
>>> cs[1] = 'one'
>>> cs[2] = 'two'
>>> print cs.getcount(1)
0
>>> cs[1]
'one'
>>> print cs.getcount(1)
1
It will be much easier to just subclass the built-in dict type. This will solve your problem:
class CountDict(dict):
    count = {}  # class-level, shared by all instances

    def __getitem__(self, key):
        CountDict.count[key] = CountDict.count.get(key, 0) + 1
        return super(CountDict, self).__getitem__(key)

    def __setitem__(self, key, value):
        return super(CountDict, self).__setitem__(key, value)

    def get_count(self, key):
        return CountDict.count.get(key, 0)
This gives you a lot more flexibility. For example, you could keep two counts, one for the number of reads and another for the number of writes, without much added complexity. To learn more about super, see here.
Edited to meet the OP's need of counting reads of a key. The count can be obtained by calling the get_count method.
Edited to meet OP's need of keeping a count for reading a key. The output can be obtained by calling get_count method.
>>> my_dict = CountDict()
>>> my_dict["a"] = 1
>>> my_dict["a"]
1
>>> my_dict["a"]
1
>>> my_dict.get_count("a")
2
You could try this approach.
class AccessCounter(object):
    '''A class that contains a value and implements an access counter.
    The counter increments each time the value is changed.'''

    def __init__(self, val):
        super(AccessCounter, self).__setattr__('counter', 0)
        super(AccessCounter, self).__setattr__('value', val)

    def __setattr__(self, name, value):
        if name == 'value':
            super(AccessCounter, self).__setattr__('counter', self.counter + 1)
        # Make this unconditional.
        # If you want to prevent other attributes from being set, raise AttributeError(name)
        super(AccessCounter, self).__setattr__(name, value)

    def __delattr__(self, name):
        if name == 'value':
            super(AccessCounter, self).__setattr__('counter', self.counter + 1)
        super(AccessCounter, self).__delattr__(name)

Elegant way to avoid .put() on unchanged entities

A recurring pattern in my Python programming on GAE is getting some entity from the datastore, then possibly changing that entity based on various conditions. In the end I need to .put() the entity back to the datastore to ensure that any changes that might have been made to it get saved.
However, often no changes were actually made, and the final .put() is just a waste of money. How can I easily make sure that I only put an entity if it has really changed?
The code might look something like
def handle_get_request():
    entity = Entity.get_by_key_name("foobar")

    if phase_of_moon() == "full":
        entity.werewolf = True
    if random.choice([True, False]):
        entity.lucky = True
    if some_complicated_condition:
        entity.answer = 42

    entity.put()
I could maintain a "changed" flag which I set if any condition changed the entity, but that seems very brittle. If I forget to set it somewhere, then changes would be lost.
What I ended up using
def handle_get_request():
    entity = Entity.get_by_key_name("foobar")
    original_xml = entity.to_xml()

    if phase_of_moon() == "full":
        entity.werewolf = True
    if random.choice([True, False]):
        entity.lucky = True
    if some_complicated_condition:
        entity.answer = 42

    if entity.to_xml() != original_xml:
        entity.put()
I would not call this "elegant". Elegant would be if the object just saved itself automatically in the end, but I felt this was simple and readable enough to do for now.
Why not check whether the result equals (==) the original, and decide whether to save it based on that? This depends on a correctly implemented __eq__, but by default a field-by-field comparison based on the __dict__ should do it:
def __eq__(self, other):
    return self.__dict__ == other.__dict__
(Be sure that the other rich comparison and hash operators work correctly if you do this. See here.)
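For instance, on Python 2 (which GAE used at the time) defining __eq__ does not give you __ne__ for free, so a matching definition is worth adding; a minimal sketch:
def __ne__(self, other):
    return not self.__eq__(other)
If instances also need to live in sets or as dict keys, define a __hash__ that is consistent with this __eq__ as well.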
One possible solution is using a wrapper that tracks any attribute change:
class Wrapper(object):
    def __init__(self, x):
        self._x = x
        self._changed = False

    def __setattr__(self, name, value):
        if name[:1] == "_":
            object.__setattr__(self, name, value)
        else:
            if getattr(self._x, name) != value:
                setattr(self._x, name, value)
                self._changed = True

    def __getattribute__(self, name):
        if name[:1] == "_":
            return object.__getattribute__(self, name)
        return getattr(self._x, name)

class Contact:
    def __init__(self, name, address):
        self.name = name
        self.address = address

c = Contact("Me", "Here")
w = Wrapper(c)
print w.name              # --> Me
w.name = w.name
print w.name, w._changed  # --> Me False
w.name = "6502"
print w.name, w._changed  # --> 6502 True
This answer is part of a question I posted about a Python checksum of a dict.
Using the answers to that question, I developed a method to generate a checksum
from a db.Model.
This is an example:
>>> class Actor(db.Model):
...     name = db.StringProperty()
...     age = db.IntegerProperty()
...
>>> u = Actor(name="John Doe", age=26)
>>> util.checksum_from_model(u, Actor)
'-42156217'
>>> u.age = 47
>>> checksum_from_model(u, Actor)
'-63393076'
I defined these methods:
def checksum_from_model(ref, model, exclude_keys=[], exclude_properties=[]):
    """Returns the checksum of a db.Model.

    Attributes:
        ref: The instance of the db.Model.
        model: The model class (subclass of db.Model).
        exclude_keys: Excludes a list of property names, like 'updated'.
        exclude_properties: Excludes a list of property types, like 'db.DateTimeProperty'.
    Returns:
        A checksum as a signed integer.
    """
    l = []
    for key, prop in model.properties().iteritems():
        if not (key in exclude_keys) and \
           not any([True for x in exclude_properties if isinstance(prop, x)]):
            l.append(getattr(ref, key))
    return checksum_from_list(l)

def checksum_from_list(l):
    """Returns a checksum from a list of data as an int."""
    return reduce(lambda x, y: x ^ y, [hash(repr(x)) for x in l])
Note: for the base36 implementation, see http://en.wikipedia.org/wiki/Base_36#Python_implementation
Edit: I removed the base36 return, so these functions now run without dependencies (advice from @Skirmantas).
I didn't work with GAE, but in the same situation I'd use something like:
from copy import deepcopy

entity = Entity.get_by_key_name("foobar")
prev_entity_state = deepcopy(entity.__dict__)

if phase_of_moon() == "full":
    entity.werewolf = True
if random.choice([True, False]):
    entity.lucky = True
if some_complicated_condition:
    entity.answer = 42

# put only when something actually changed
if entity.__dict__ != prev_entity_state:
    entity.put()
