Related
Does NetworkX support customization of where nodes, edges and attributes are stored? For example, I would like to try 2 options:
Using something like LevelDB / Kyoto Cabinet as a backing store.
Using some distributed database (Neo4j or even HBase - I only need distributed storage of the nodes/edges) as a backing store.
What are the extensibility points of NetworkX to support such things?
I will post the nuances of setting up NetworkX for external storage. Kikohs is correct in pointing out that there are factories for each of the dictionaries. These can be overridden.
For persisted storage the only dictionary that really needs special attention is the node dictionary.
Special attention must be given to how the dict-like implementation behaves. There is code in the NetworkX classes that changes values returned from the dictionary in memory without setting them back.
For example things like:
self.succ[u][v]=datadict
self.pred[v][u]=datadict
These values will not be persisted back to the storage backend. To accommodate this, I have used a memory cache that holds objects in memory and, when they are evicted, writes them to the underlying storage.
For the in-memory cache I used cachetools. For the eviction see: Python functools.lru_cache eviction callback or equivalent
For the underlying storage I used plyvel (https://plyvel.readthedocs.org/en/latest/) which is a Python interface for LevelDB.
I also give the implementation of the dictionary below. Note that there are still errors and bugs in the code, and it has not been tested properly, but you get the general idea.
class PlyvelBatchWrite(object):
    """Buffered batch writer for a PlyvelDict.

    Collects put/delete operations in a LevelDB write batch and applies
    them atomically when ``commit`` is called.
    """

    def __init__(self, plv_dict):
        # Keep a handle on the owning dict so we can reuse its serializer.
        self.__owner = plv_dict
        self.__batch = plv_dict._db.write_batch()

    def __pack(self, value):
        # Serialize through the owner's serializer (same encoding as PlyvelDict).
        return self.__owner.serializer.pack(value)

    def set(self, key, val):
        """Queue a key/value write in the batch."""
        self.__batch.put(self.__pack(key), self.__pack(val))

    def delete(self, key):
        """Queue a key deletion in the batch."""
        self.__batch.delete(self.__pack(key))

    def clear(self):
        """Discard every operation queued so far."""
        self.__batch.clear()

    def commit(self):
        """Atomically write all queued operations to the database."""
        self.__batch.write()
class PlyvelDict(MutableMapping):
    """Dict-like mapping persisted in a LevelDB database via plyvel.

    Keys and values pass through a pluggable serializer. An optional
    in-memory cache fronts the database: lookups miss into LevelDB
    (__cache_miss) and evicted entries are written back (__cache_evict).
    NOTE(review): the author states this code is untested and still has
    bugs; treat the comments below as intent, not guarantees.
    """

    def __init__(self, directory='', db=None, serializer_factory=None, cache_factory=None, **kwargs):
        # directory: folder in which the LevelDB files live.
        # db: an open plyvel.DB, a database name (str), or None to
        #     auto-generate a UUID-based name.
        self.__directory = directory
        ensure_directory(directory)
        if isinstance(db, str) or db is None:
            if db is None:
                # generate UUID
                db = str(uuid.uuid4())
            self.__db = db
            db = plyvel.DB(self.name(), **kwargs)
        else:
            # An already-open DB was supplied; its name must arrive via
            # kwargs['db'] (NOTE(review): KeyError if omitted — __copy__
            # below always passes it, other callers must too).
            self.__db = kwargs['db']
        self._db = db
        if serializer_factory:
            self.serializer = serializer_factory()
        else:
            self.serializer = None
        if cache_factory:
            # The factory receives a miss handler and an eviction handler.
            self.__cache = cache_factory(self.__cache_miss, self.__cache_evict)
        else:
            self.__cache = None

    def name(self):
        # Full filesystem path of the LevelDB database.
        full_path = os.path.join(self.__directory, self.__db)
        return full_path

    def __cache_miss(self, key):
        # Load a missing cache entry from LevelDB; KeyError if absent.
        b_item = self._db.get(self.serializer.pack(key))
        if b_item is not None:
            return self.serializer.unpack(b_item)
        else:
            raise KeyError(key)

    def __cache_evict(self, key, val):
        # Write-back: persist an entry when the cache evicts it.
        self._db.put(self.serializer.pack(key), self.serializer.pack(val))

    def __copy__(self):
        # Shares the open DB handle; the stored name travels in kwargs['db'].
        return type(self)(self.__directory, self._db, type(self.serializer), type(self.__cache), db=self.__db)

    def __getitem__(self, key):
        # Served from the cache; misses fall through to __cache_miss.
        return self.__cache[key]

    def __setitem__(self, key, value):
        # Refresh the cached copy if present, and always write through to
        # LevelDB so uncached keys are not lost.
        if key in self.__cache:
            self.__cache[key] = value
        self.__write_to_db(key, value)

    def __write_to_db(self, key, value):
        self._db.put(self.serializer.pack(key), self.serializer.pack(value))

    def __delitem__(self, key):
        # Drop from the cache (if present) and always from the database.
        if key in self.__cache:
            del self.__cache[key]
        self._db.delete(self.serializer.pack(key))

    def __iter__(self):
        return self.iterkeys()

    def __keytransform__(self, key):
        # Identity hook; subclasses may normalise keys here.
        return key

    def __len__(self):
        return self.count()

    def __del__(self):
        # Best effort: flush the cache and close the DB on garbage collection.
        self.flush()
        if not self._db.closed:
            self._db.close()

    # improved methods
    def flush(self, write_to_db=False):
        """Empty the cache; with write_to_db=True, batch-write entries first.

        NOTE(review): with the default write_to_db=False cached entries are
        simply discarded — safe only because __setitem__ writes through.
        """
        if self.__cache:
            if write_to_db:
                batch = self.set_batch()
                for key, val in self.__cache.items():
                    batch.set(key, val)
                batch.commit()
            self.__cache.clear()

    def set_batch(self):
        # Batched writes use the same serializer as single writes.
        return PlyvelBatchWrite(self)

    def iteritems(self):
        # Flush first so iteration over LevelDB sees a consistent view.
        self.flush()
        for key, value in self._db.iterator(include_key=True, include_value=True):
            yield (self.serializer.unpack(key), self.serializer.unpack(value))

    def iterkeys(self):
        self.flush()
        for key in self._db.iterator(include_key=True, include_value=False):
            yield self.serializer.unpack(key)

    def itervalues(self):
        self.flush()
        for val in self._db.iterator(include_key=False, include_value=True):
            yield self.serializer.unpack(val)

    def keys(self):
        self.flush()
        # fixes default method which calls __len__
        return list(self.iterkeys())

    def values(self):
        self.flush()
        return list(self.itervalues())

    def has_key(self, key):
        # Python 2-style membership; `in` also works via MutableMapping.
        return key in self

    def clear(self):
        # O(n): deletes every key one by one.
        # NOTE(review): deletes while iterating the DB iterator — confirm
        # plyvel iterators tolerate concurrent deletion.
        self.flush()
        for k in self:
            del self[k]

    def count(self):
        # O(n) scan; LevelDB does not maintain a key count.
        self.flush()
        return sum(1 for key in self)
And the graph class:
class PersistedGraph(nx.Graph):
    """networkx Graph whose internal dicts can be backed by persistent
    dict-like factories (e.g. PlyvelDict).

    The factory attributes must be installed on the instance *before*
    nx.Graph.__init__ runs, because the base constructor instantiates the
    node/adjacency/edge-attribute dicts from them.
    """

    def __init__(self, data=None, node_dict_factory=None, adjlist_dict_factory=None, edge_attr_dict_factory=None,
                 **attr):
        # Override the class-level factories only when explicitly provided;
        # otherwise the networkx defaults (plain dict) remain in effect.
        if node_dict_factory:
            self.node_dict_factory = node_dict_factory
        if adjlist_dict_factory:
            self.adjlist_dict_factory = adjlist_dict_factory
        if edge_attr_dict_factory:
            self.edge_attr_dict_factory = edge_attr_dict_factory
        nx.Graph.__init__(self, data, **attr)
It should be possible to extend networkx by subclassing the Graph class and providing user-defined factories functions.
Those functions could query a database and cache the results in the dictionaries used by networkx.
I couldn't find those lines from the online documentation but in the code you have:
Subclasses (Advanced):
The Graph class uses a dict-of-dict-of-dict data structure.
The outer dict (node_dict) holds adjacency lists keyed by node.
The next dict (adjlist) represents the adjacency list and holds
edge data keyed by neighbor. The inner dict (edge_attr) represents
the edge data and holds edge attribute values keyed by attribute names.
Each of these three dicts can be replaced by a user defined
dict-like object. In general, the dict-like features should be
maintained but extra features can be added. To replace one of the
dicts create a new graph class by changing the class(!) variable
holding the factory for that dict-like structure. The variable names
are node_dict_factory, adjlist_dict_factory and edge_attr_dict_factory.
node_dict_factory : function, (default: dict)
Factory function to be used to create the outer-most dict
in the data structure that holds adjacency lists keyed by node.
It should require no arguments and return a dict-like object.
adjlist_dict_factory : function, (default: dict)
Factory function to be used to create the adjacency list
dict which holds edge data keyed by neighbor.
It should require no arguments and return a dict-like object
edge_attr_dict_factory : function, (default: dict)
Factory function to be used to create the edge attribute
dict which holds attribute values keyed by attribute name.
It should require no arguments and return a dict-like object.
I am not aware of any official extension for networkx.
Well, the question is in the title: how do I define a python dictionary with immutable keys but mutable values? I came up with this (in python 2.x):
class FixedDict(dict):
    """A dictionary with a fixed set of keys.

    The keys present at construction time are the only keys that can be
    assigned through item syntax; assigning to any other key raises
    KeyError. Values remain freely mutable.

    NOTE: methods such as ``update`` or ``setdefault`` inherited from
    ``dict`` bypass ``__setitem__`` and can still add keys.
    """

    def __init__(self, dictionary):
        dict.__init__(self)
        # Seed the allowed keys (and initial values) through the base class
        # so our guarded __setitem__ is not triggered during construction.
        for key, value in dictionary.items():
            dict.__setitem__(self, key, value)

    def __setitem__(self, key, item):
        if key not in self:
            # Use str.format rather than '+' concatenation so non-string
            # keys (ints, tuples, ...) do not raise TypeError here.
            raise KeyError("The key '{}' is not defined".format(key))
        dict.__setitem__(self, key, item)
but it looks to me (unsurprisingly) rather sloppy. In particular, is this safe or is there the risk of actually changing/adding some keys, since I'm inheriting from dict?
Thanks.
Consider proxying dict instead of subclassing it. That means that only the methods that you define will be allowed, instead of falling back to dict's implementations.
class FixedDict(object):
    """Proxy around a dict that permits updating existing keys only.

    Unlike subclassing ``dict``, only the methods defined here exist, so
    there is no inherited pathway for sneaking in new keys.
    """

    def __init__(self, dictionary):
        self._dictionary = dictionary

    def __getitem__(self, key):
        return self._dictionary[key]

    def __setitem__(self, key, item):
        # Refuse to grow the key set; only known keys may be reassigned.
        if key in self._dictionary:
            self._dictionary[key] = item
        else:
            raise KeyError("The key {} is not defined.".format(key))
Also, you should use string formatting instead of + to generate the error message, since otherwise it will crash for any value that's not a string.
The problem with direct inheritance from dict is that it's quite hard to comply with the full dict's contract (e.g. in your case, update method won't behave in a consistent way).
What you want, is to extend the collections.MutableMapping:
import collections
import collections.abc


class FixedDict(collections.abc.MutableMapping):
    """Mapping with a frozen key set: existing keys are mutable, new keys
    are rejected, and deletion is unsupported.

    Extending MutableMapping (rather than dict) means derived helpers such
    as ``update`` are implemented on top of our ``__setitem__``, so the
    fixed-key contract holds consistently.

    FIX: ``collections.MutableMapping`` was deprecated since Python 3.3
    and removed in 3.10 — the ABC now lives in ``collections.abc``.
    """

    def __init__(self, data):
        # Stored by reference: mutations through this object are visible in
        # the original dict (copy/deepcopy first if that is unwanted).
        self.__data = data

    def __len__(self):
        return len(self.__data)

    def __iter__(self):
        return iter(self.__data)

    def __setitem__(self, k, v):
        if k not in self.__data:
            # Unknown key: the key set is fixed at construction time.
            raise KeyError(k)
        self.__data[k] = v

    def __delitem__(self, k):
        # Removing keys would shrink the fixed key set.
        raise NotImplementedError

    def __getitem__(self, k):
        return self.__data[k]

    def __contains__(self, k):
        return k in self.__data
Note that the original (wrapped) dict will be modified, if you don't want that to happen, use copy or deepcopy.
How you prevent someone from adding new keys depends entirely on why someone might try to add new keys. As the comments state, most dictionary methods that modify the keys don't go through __setitem__, so a .update() call will add new keys just fine.
If you only expect someone to use d[new_key] = v, then your __setitem__ is fine. If they might use other ways to add keys, then you have to put in more work. And of course, they can always use this to do it anyway:
dict.__setitem__(d, new_key, v)
You can't make things truly immutable in Python, you can only stop particular changes.
I am doing a really simple API to test respire a SPORE client generation for python.
In WSGI, what would be the best way to keep data throught the server?
I try to make a RedisDict that way:
import json
from redis import Redis
redis = Redis()
class RedisDict:
    """A redis based dict.

    The entire mapping lives under the single redis key 'TODOS' as one
    JSON document; every operation round-trips the whole document, so this
    is only suitable for small data sets.
    """

    def dict(self):
        """Return the current contents as a plain dict."""
        TODOS = redis.get('TODOS')
        if TODOS is None:
            # Key not written yet: behave like an empty dict instead of
            # crashing in json.loads(None).
            return {}
        return json.loads(TODOS)

    def keys(self):
        return self.dict().keys()

    def __getitem__(self, key):
        return self.dict()[key]

    def __setitem__(self, key, value):
        # Read-modify-write. NOTE(review): not atomic — concurrent writers
        # can lose updates (consider redis WATCH/MULTI or a redis hash).
        obj = self.dict()
        obj[key] = value
        redis.set('TODOS', json.dumps(obj))

    def __delitem__(self, key):
        obj = self.dict()
        del obj[key]
        redis.set('TODOS', json.dumps(obj))
todos = RedisDict()
How can I make dict(todos) to return a dict?
Is that enough in a WSGI environment ?
Assuming that method dict returns a dictionary, why not just do this:
dict_i_wanted = todos.dict()
If you must support dict_i_wanted = dict(todos) then add this method:
def __iter__(self):
    # Iterating yields (key, value) pairs so that dict(todos) can rebuild
    # the stored mapping (Python 2 ``iteritems``; under Python 3 use
    # ``iter(self.dict().items())`` instead).
    return self.dict().iteritems()
If you want to make your own "dict-like" class, you need to implement the dictionary protocol. The easiest way would be inheriting from collections.Mapping and implementing the methods that the table mentions are abstract. You need to implement those to behave the same as the corresponding dict methods; e.g. __iter__ should return an iterable over all (key, value) tuples. (This is the one method you need to make dict(todos) work. There is no magic method to coerce to a dict like you tried to do with your dict() method.)
I had the need to implement a hashable dict so I could use a dictionary as a key for another dictionary.
A few months ago I used this implementation: Python hashable dicts
However I got a notice from a colleague saying 'it is not really immutable, thus it is not safe. You can use it, but it does make me feel like a sad Panda'.
So I started looking around to create one that is immutable. I have no need to compare the 'key-dict' to another 'key-dict'. Its only use is as a key for another dictionary.
I have come up with the following:
class HashableDict(dict):
    """Factory for hashable, effectively immutable dict snapshots.

    Instantiation returns a closure-based object (not a dict subclass
    instance) that captures a private copy of the input mapping, so later
    changes to the source dict do not affect it. The returned object is
    hashable and usable as a key in other dictionaries.

    FIXES: ``__new__``'s first parameter is now conventionally named
    ``cls``, and ``__eq__`` returns NotImplemented instead of raising
    AttributeError when compared with unrelated objects.
    """

    def __new__(cls, *args, **kwargs):
        # Snapshot the input; the closure below holds the only reference,
        # so callers cannot mutate the captured state afterwards.
        immutableDict = dict(*args, **kwargs)

        class HashableDictBase(object):
            """Hashable view over the captured dict; immutable from outside."""

            def __key(self):
                # Canonical, order-independent identity of the contents.
                return tuple((k, immutableDict[k]) for k in sorted(immutableDict))

            def __hash__(self):
                return hash(self.__key())

            def __eq__(self, other):
                # Compare via the (name-mangled) key tuple. Instances built
                # by separate HashableDict calls still match because the
                # class name — hence the mangled attribute name — is equal.
                key_getter = getattr(other, '_HashableDictBase__key', None)
                if key_getter is None:
                    return NotImplemented
                return self.__key() == key_getter()

            def __ne__(self, other):
                # Explicit for Python 2 compatibility with the original.
                result = self.__eq__(other)
                if result is NotImplemented:
                    return result
                return not result

            def __repr__(self):
                """#see: dict.__repr__"""
                return immutableDict.__repr__()

            def __str__(self):
                """#see: dict.__str__"""
                return immutableDict.__str__()

            def __setattr__(self, *args):
                raise TypeError("can't modify immutable instance")

            __delattr__ = __setattr__

        return HashableDictBase()
I used the following to test the functionality:
d = {"a" : 1}
a = HashableDict(d)
b = HashableDict({"b" : 2})
print a
d["b"] = 2
print a
c = HashableDict({"a" : 1})
test = {a : "value with a dict as key (key a)",
b : "value with a dict as key (key b)"}
print test[a]
print test[b]
print test[c]
which gives:
{'a': 1}
{'a': 1}
value with a dict as key (key a)
value with a dict as key (key b)
value with a dict as key (key a)
as output
Is this the 'best possible' immutable dictionary that I can use that satisfies my requirements? If not, what would be a better solution?
If you are only using it as a key for another dict, you could go for frozenset(mutabledict.items()). If you need to access the underlying mappings, you could then use that as the parameter to dict.
mutabledict = dict(zip('abc', range(3)))
immutable = frozenset(mutabledict.items())
read_frozen = dict(immutable)
read_frozen['a'] # => 1
Note that you could also combine this with a class derived from dict, and use the frozenset as the source of the hash, while disabling __setitem__, as suggested in another answer. (#RaymondHettinger's answer for code which does just that).
The Mapping abstract base class makes this easy to implement:
import collections
import collections.abc


class ImmutableDict(collections.abc.Mapping):
    """Hashable, read-only mapping suitable for use as a dictionary key.

    Mapping (not MutableMapping) supplies only read operations, so there
    is no mutating API to disable. The input dict is copied, making the
    instance safe against later changes to the source.

    FIXES: ``collections.Mapping`` was removed in Python 3.10 (use
    ``collections.abc``), and ``__eq__`` no longer raises AttributeError
    when compared with objects lacking a ``_dict`` attribute.
    """

    def __init__(self, somedict):
        self._dict = dict(somedict)   # make a copy
        self._hash = None             # computed lazily on first hash()

    def __getitem__(self, key):
        return self._dict[key]

    def __len__(self):
        return len(self._dict)

    def __iter__(self):
        return iter(self._dict)

    def __hash__(self):
        # frozenset of items is order-independent and hashable as long as
        # every value is hashable; cache it — the contents never change.
        if self._hash is None:
            self._hash = hash(frozenset(self._dict.items()))
        return self._hash

    def __eq__(self, other):
        if isinstance(other, ImmutableDict):
            return self._dict == other._dict
        if isinstance(other, collections.abc.Mapping):
            return self._dict == dict(other)
        return NotImplemented
I realize this has already been answered, but types.MappingProxyType is an analogous implementation for Python 3.3. Regarding the original question of safety, there is a discussion in PEP 416 -- Add a frozendict builtin type on why the idea of a frozendict was rejected.
In order for your immutable dictionary to be safe, all it needs to do is never change its hash. Why don't you just disable __setitem__ as follows:
class ImmutableDict(dict):
    """dict whose item assignment is disabled, keeping its hash stable.

    Hashability comes from a sorted tuple of the items, so all keys and
    values must themselves be hashable and mutually sortable.
    """

    def __hash__(self):
        # Sorting makes the hash independent of insertion order.
        return hash(tuple(sorted(self.items())))

    def __setitem__(self, key, value):
        raise Exception("Can't touch this")
a = ImmutableDict({'a':1})
b = {a:1}
print b
print b[a]
a['a'] = 0
The output of the script is:
{{'a': 1}: 1}
1
Traceback (most recent call last):
File "ex.py", line 11, in <module>
a['a'] = 0
File "ex.py", line 3, in __setitem__
raise Exception("Can't touch this")
Exception: Can't touch this
Here is a link to pip install-able implementation of #RaymondHettinger's answer: https://github.com/pcattori/icicle
Simply pip install icicle and you can from icicle import FrozenDict!
Update: icicle has been deprecated in favor of maps: https://github.com/pcattori/maps (documentation, PyPI).
It appears I am late to post. Not sure if anyone else has come up with ideas. But here is my take on it. The Dict is immutable and hashable. I made it immutable by overriding all the methods, magic and otherwise, with a custom '_readonly' function that raises an Exception. This is done when the object is instantiated. To get around the problem of not being able to apply the values, I set the hash under '__new__'. I then override the '__hash__' function. That's it!
class ImmutableDict(dict):
    """Read-only, hashable dict.

    All mutating dict methods are replaced by a raising stub; the hash is
    computed once at construction time.

    BUG FIXED: the original stored the hash in the class attribute _HASH,
    so creating a second instance silently changed the hash of every
    existing instance. The hash is now stored per instance; _HASH is kept
    only for backward compatibility and still reflects the most recently
    constructed instance.
    """

    _HASH = None  # legacy class-level slot, kept for backward compatibility

    def __new__(cls, *args, **kwargs):
        # Materialise the contents once to derive an order-independent hash.
        contents = dict(*args, **kwargs)
        h = hash(frozenset(contents.items()))
        ImmutableDict._HASH = h  # legacy behaviour, preserved
        obj = super(ImmutableDict, cls).__new__(cls, args)
        # __setattr__ is disabled below, so bypass it explicitly here.
        object.__setattr__(obj, '_instance_hash', h)
        return obj

    def __hash__(self):
        return self._instance_hash

    def _readonly(self, *args, **kwargs):
        raise TypeError("Cannot modify Immutable Instance")

    __delattr__ = __setattr__ = __setitem__ = pop = update = setdefault = clear = popitem = _readonly
Test:
immutabled1 = ImmutableDict({"This": "That", "Cheese": "Blarg"})
dict1 = {immutabled1: "Yay"}
dict1[immutabled1]
"Yay"
dict1
{{'Cheese': 'Blarg', 'This': 'That'}: 'Yay'}
Variation of Raymond Hettinger's answer by wrapping the self._dict with types.MappingProxyType.
import collections.abc
from types import MappingProxyType


class ImmutableDict(collections.abc.Mapping):
    """
    Copies a dict and proxies it via types.MappingProxyType to make it immutable.

    FIXES: ``collections.Mapping`` was removed in Python 3.10 (the ABC
    lives in ``collections.abc``); ``MappingProxyType`` is now imported
    explicitly; ``__eq__`` no longer raises AttributeError for objects
    without a ``_dict`` attribute.
    """

    def __init__(self, somedict):
        dictcopy = dict(somedict)  # make a copy
        self._dict = MappingProxyType(dictcopy)  # lock it
        self._hash = None  # computed lazily on first hash()

    def __getitem__(self, key):
        return self._dict[key]

    def __len__(self):
        return len(self._dict)

    def __iter__(self):
        return iter(self._dict)

    def __hash__(self):
        # Cached on first use; contents can never change, so this is safe.
        if self._hash is None:
            self._hash = hash(frozenset(self._dict.items()))
        return self._hash

    def __eq__(self, other):
        if isinstance(other, ImmutableDict):
            return self._dict == other._dict
        if isinstance(other, collections.abc.Mapping):
            return dict(self._dict) == dict(other)
        return NotImplemented

    def __repr__(self):
        return str(self._dict)
You can use an enum:
import enum
KeyDict1 = enum.Enum('KeyDict1', {'InnerDictKey1':'bla', 'InnerDictKey2 ':2})
d = { KeyDict1: 'whatever', KeyDict2: 1, ...}
You can access the enums like you would a dictionary:
KeyDict1['InnerDictKey2'].value # This is 2
You can iterate over the names, and get their values... It does everything you'd expect.
You can try using https://github.com/Lightricks/freeze
It provides recursively immutable and hashable dictionaries
from freeze import FDict
a_mutable_dict = {
"list": [1, 2],
"set": {3, 4},
}
a_frozen_dict = FDict(a_mutable_dict)
print(a_frozen_dict)
print(hash(a_frozen_dict))
# FDict: {'list': FList: (1, 2), 'set': FSet: {3, 4}}
# -4855611361973338606
I have an algorithm in python which creates measures for pairs of values, where m(v1, v2) == m(v2, v1) (i.e. it is symmetric). I had the idea to write a dictionary of dictionaries where these values are stored in a memory-efficient way, so that they can easily be retrieved with keys in any order. I like to inherit from things, and ideally, I'd love to write a symmetric_dict where s_d[v1][v2] always equals s_d[v2][v1], probably by checking which of the v's is larger according to some kind of ordering relation and then switching them around so that the smaller element one is always mentioned first. i.e. when calling s_d[5][2] = 4, the dict of dicts will turn them around so that they are in fact stored as s_d[2][5] = 4, and the same for retrieval of the data.
I'm also very open for a better data structure, but I'd prefer an implementation with "is-a" relationship to something which just uses a dict and preprocesses some function arguments.
You could use a frozenset as the key for your dict:
>>> s_d = {}
>>> s_d[frozenset([5,2])] = 4
>>> s_d[frozenset([2,5])]
4
It would be fairly straightforward to write a subclass of dict that took iterables as key arguments and then turned then into a frozenset when storing values:
class SymDict(dict):
    """dict keyed by unordered pairs: d[a, b] and d[b, a] hit the same entry.

    Every key given as an iterable is normalised to a frozenset before any
    dict operation, so lookup order never matters.
    """

    def __setitem__(self, key, value):
        dict.__setitem__(self, frozenset(key), value)

    def __getitem__(self, key):
        return dict.__getitem__(self, frozenset(key))
Which gives you:
>>> s_d = SymDict()
>>> s_d[5,2] = 4
>>> s_d[2,5]
4
Doing it with nested indexing as shown will be extremely difficult. It's better to use a tuple as the key instead. That way the tuple can be sorted and an encapsulated dict can be accessed for the value.
d[2, 5] = 4
print d[5, 2]
Here's a slightly different approach that looks promising. Although the SymDict class isn't a dict subclass, it mostly behaves like one, and there's only a single private dictionary involved. I think one interesting feature is that fact that it preserves the natural [][] lookup syntax you seemed to want.
class SymDict(object):
    """Facade providing symmetric d[a][b] lookup over a _SubSymDict.

    Indexing with the first key stashes it on the inner mapping and hands
    the mapping back, so the second ``[]`` resolves the frozenset pair.
    Direct single-key assignment is deliberately unsupported.
    """

    def __init__(self, *args, **kwrds):
        self._mapping = _SubSymDict(*args, **kwrds)

    def __getitem__(self, key1):
        # Remember the first index; the returned mapping consumes the second.
        inner = self._mapping
        inner.set_key1(key1)
        return inner

    def __setitem__(self, key1, value):
        # Only the two-step d[a][b] = v form is meaningful here.
        raise NotImplementedError

    def __str__(self):
        return '_mapping: ' + str(self._mapping)

    def __getattr__(self, name):
        # Everything else (keys, items, len, ...) is delegated verbatim.
        return getattr(self._mapping, name)
class _SubSymDict(dict):
def __init__(self, *args, **kwrds):
dict.__init__(self, *args, **kwrds)
def set_key1(self, key1):
self.key1 = key1
def __getitem__(self, key2):
return dict.__getitem__(self, frozenset((self.key1, key2)))
def __setitem__(self, key2, value):
dict.__setitem__(self, frozenset((self.key1, key2)), value)
symdict = SymDict()
symdict[2][4] = 24
symdict[4][2] = 42
print 'symdict[2][4]:', symdict[2][4]
# symdict[2][4]: 42
print 'symdict[4][2]:', symdict[4][2]
# symdict[4][2]: 42
print 'symdict:', symdict
# symdict: _mapping: {frozenset([2, 4]): 42}
print symdict.keys()
# [frozenset([2, 4])]
Just as an alternative to Dave Webb's frozenset, why not do a SymDict like the following:
class SymDict(dict):
    """dict over 2-tuples where (a, b) and (b, a) name the same entry.

    Every key is canonicalised so its smaller element comes first before
    delegating to the plain dict machinery; the key elements must be
    mutually comparable.
    """

    @staticmethod
    def _ordered(key):
        # Canonical form: smaller element first; already-ordered keys pass
        # through unchanged.
        if key[0] < key[1]:
            return key
        return (key[1], key[0])

    def __getitem__(self, key):
        return dict.__getitem__(self, SymDict._ordered(key))

    def __setitem__(self, key, value):
        dict.__setitem__(self, SymDict._ordered(key), value)
From a quick test, this is more than 10% faster for getting and setting items than using a frozenset. Anyway, just another idea. However, it is less adaptable than the frozenset as it is really only set up to be used with tuples of length 2. As far as I can tell from the OP, that doesn't seem to be an issue here.
Improving on Justin Peel's solution, you need to add __delitem__ and __contains__ methods for a few more dictionary operations to work. So, for completeness,
class SymDict(dict):
    """Symmetric-pair dict supporting get, set, delete and membership.

    Each 2-tuple key is normalised (smaller element first) before any
    underlying dict operation, so (a, b) and (b, a) are interchangeable.
    """

    @staticmethod
    def _canon(key):
        # Normalise so the smaller element leads; ordered keys pass through.
        if key[0] < key[1]:
            return key
        return (key[1], key[0])

    def __getitem__(self, key):
        return dict.__getitem__(self, SymDict._canon(key))

    def __setitem__(self, key, value):
        dict.__setitem__(self, SymDict._canon(key), value)

    def __delitem__(self, key):
        return dict.__delitem__(self, SymDict._canon(key))

    def __contains__(self, key):
        return dict.__contains__(self, SymDict._canon(key))
So then
>>> s_d = SymDict()
>>> s_d[2,5] = 4
>>> s_d[5,2]
4
>>> (5,2) in s_d
True
>>> del s_d[5,2]
>>> s_d
{}
I'm not sure, though, whether that covers all the bases, but it was good enough for my own code.
An obvious alternative is to use a (v1,v2) tuple as the key into a single standard dict, and insert both (v1,v2) and (v2,v1) into the dictionary, making them refer to the same object on the right-hand side.
I'd extract the function for more readability(for patvarilly answer)
class SymDict(dict):
    """Symmetric-pair dict: (a, b) and (b, a) address the same entry,
    with the pair-ordering logic extracted into the ``symm`` helper.

    FIXES over the snippet as posted: the decorator was written as the
    comment '#staticmethod' (so ``self.symm(key)`` would have bound
    ``self`` to ``key``), and the return line ended with a stray period,
    which is a SyntaxError.
    """

    def __getitem__(self, key):
        return dict.__getitem__(self, self.symm(key))

    def __setitem__(self, key, value):
        dict.__setitem__(self, self.symm(key), value)

    def __delitem__(self, key):
        return dict.__delitem__(self, self.symm(key))

    def __contains__(self, key):
        return dict.__contains__(self, self.symm(key))

    @staticmethod
    def symm(key):
        # Canonical order: smaller element first.
        return key if key[0] < key[1] else (key[1], key[0])