NetworkX extensibility for custom storage of nodes/edges - python

Does NetworkX support customization of where nodes, edges and attributes are stored? For example, I would like to try 2 options:
Using something like LevelDB / Kyoto Cabinet as a backing store.
Using some distributed database (Neo4j or even HBase - I only need distributed storage of the nodes/edges) as a backing store.
What are the extensibility points of NetworkX to support such things?

I will post the nuances of setting up NetworkX for external storage. Kikohs is correct in pointing out that there are factories for each of the dictionaries. These can be overridden.
For persisted storage the only dictionary that really needs special attention is the node dictionary.
Special attention must be given to how the dict-like implementation behaves. There is code in the NetworkX classes that changes values returned from the dictionary in memory without setting them back.
For example, things like:
self.succ[u][v]=datadict
self.pred[v][u]=datadict
These values will not be persisted back to the storage backend. To accommodate this, I have used an in-memory cache that holds objects in memory and writes them to the underlying storage when they are evicted.
For the in-memory cache I used cachetools. For the eviction see: Python functools.lru_cache eviction callback or equivalent
For the underlying storage I used plyvel (https://plyvel.readthedocs.org/en/latest/) which is a Python interface for LevelDB.
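The eviction hook can be sketched on top of cachetools by overriding popitem (which cachetools calls when it evicts the least recently used entry) and __missing__ (which it calls on a cache miss). This is only a sketch: the WriteBackCache name and the maxsize default are mine, and the two callbacks line up with the PlyvelDict implementation shown below.
import cachetools

class WriteBackCache(cachetools.LRUCache):
    """LRU cache that loads misses and persists evicted entries via callbacks."""

    def __init__(self, miss_cb, evict_cb, maxsize=1024):
        super().__init__(maxsize)
        self.__miss_cb = miss_cb
        self.__evict_cb = evict_cb

    def __missing__(self, key):
        # cache miss: load from the backing store and keep it in memory
        value = self.__miss_cb(key)
        self[key] = value
        return value

    def popitem(self):
        # cachetools evicts through popitem, so this is the eviction hook
        key, value = super().popitem()
        self.__evict_cb(key, value)  # write the evicted entry back to storage
        return key, value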
I also give the implementation of the dictionary below. Note that there are still errors and bugs in the code and it has not been tested properly, but you get the general idea.
import os
import uuid
from collections.abc import MutableMapping

import plyvel


def ensure_directory(directory):
    # helper assumed by the original post: create the directory if it is missing
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)


class PlyvelBatchWrite(object):
    """Wraps a plyvel write batch so cached values can be flushed atomically."""

    def __init__(self, plv_dict):
        self.__batch = plv_dict._db.write_batch()
        self.__plvd = plv_dict

    def set(self, key, val):
        self.__batch.put(self.__plvd.serializer.pack(key),
                         self.__plvd.serializer.pack(val))

    def delete(self, key):
        self.__batch.delete(self.__plvd.serializer.pack(key))

    def clear(self):
        self.__batch.clear()

    def commit(self):
        self.__batch.write()


class PlyvelDict(MutableMapping):
    """A dict-like object backed by LevelDB, with an optional write-back cache."""

    def __init__(self, directory='', db=None, serializer_factory=None,
                 cache_factory=None, **kwargs):
        self.__directory = directory
        ensure_directory(directory)
        if isinstance(db, str) or db is None:
            if db is None:
                # no name given: generate a UUID for the database name
                db = str(uuid.uuid4())
            self.__db = db
            db = plyvel.DB(self.name(), **kwargs)
        else:
            # an already-open plyvel DB was passed in; its name arrives as
            # 'db_name' (the original read kwargs['db'], which collided with
            # the positional argument in __copy__)
            self.__db = kwargs['db_name']
        self._db = db
        self.serializer = serializer_factory() if serializer_factory else None
        if cache_factory:
            self.__cache = cache_factory(self.__cache_miss, self.__cache_evict)
        else:
            self.__cache = None

    def name(self):
        return os.path.join(self.__directory, self.__db)

    def __cache_miss(self, key):
        # cache callback: a key is not in memory, so read it from LevelDB
        b_item = self._db.get(self.serializer.pack(key))
        if b_item is not None:
            return self.serializer.unpack(b_item)
        raise KeyError(key)

    def __cache_evict(self, key, val):
        # cache callback: an entry is being evicted, so persist it to LevelDB
        self._db.put(self.serializer.pack(key), self.serializer.pack(val))

    def __copy__(self):
        # share the open handle; the database name travels as 'db_name'
        return type(self)(self.__directory, self._db, type(self.serializer),
                          type(self.__cache), db_name=self.__db)

    def __getitem__(self, key):
        return self.__cache[key]

    def __setitem__(self, key, value):
        # write-through: refresh the cached entry (if any) and LevelDB
        if key in self.__cache:
            self.__cache[key] = value
        self.__write_to_db(key, value)

    def __write_to_db(self, key, value):
        self._db.put(self.serializer.pack(key), self.serializer.pack(value))

    def __delitem__(self, key):
        if key in self.__cache:
            del self.__cache[key]
        self._db.delete(self.serializer.pack(key))

    def __iter__(self):
        return self.iterkeys()

    def __keytransform__(self, key):
        return key

    def __len__(self):
        return self.count()

    def __del__(self):
        # flush cached values before closing, otherwise they are lost
        # (the original called flush() without write_to_db)
        self.flush(write_to_db=True)
        if not self._db.closed:
            self._db.close()

    # improved methods
    def flush(self, write_to_db=True):
        if self.__cache:
            if write_to_db:
                # batch-write every cached entry so DB iteration sees it
                batch = self.set_batch()
                for key, val in self.__cache.items():
                    batch.set(key, val)
                batch.commit()
            self.__cache.clear()

    def set_batch(self):
        return PlyvelBatchWrite(self)

    def iteritems(self):
        self.flush()
        for key, value in self._db.iterator(include_key=True, include_value=True):
            yield (self.serializer.unpack(key), self.serializer.unpack(value))

    def iterkeys(self):
        self.flush()
        for key in self._db.iterator(include_key=True, include_value=False):
            yield self.serializer.unpack(key)

    def itervalues(self):
        self.flush()
        for val in self._db.iterator(include_key=False, include_value=True):
            yield self.serializer.unpack(val)

    def keys(self):
        # fixes the mixin default, which would call __len__
        return list(self.iterkeys())

    def values(self):
        return list(self.itervalues())

    def has_key(self, key):
        return key in self

    def clear(self):
        # no point persisting entries that are about to be deleted
        self.flush(write_to_db=False)
        for k in self:
            del self[k]

    def count(self):
        self.flush()
        return sum(1 for key in self)
And the graph class:
import networkx as nx


class PersistedGraph(nx.Graph):
    def __init__(self, data=None, node_dict_factory=None,
                 adjlist_dict_factory=None, edge_attr_dict_factory=None, **attr):
        # install the factories on the instance before nx.Graph.__init__
        # uses them to build its internal data structure
        if node_dict_factory:
            self.node_dict_factory = node_dict_factory
        if adjlist_dict_factory:
            self.adjlist_dict_factory = adjlist_dict_factory
        if edge_attr_dict_factory:
            self.edge_attr_dict_factory = edge_attr_dict_factory
        nx.Graph.__init__(self, data, **attr)
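For illustration, the pieces above could be wired together like this. This is only a sketch: PickleSerializer is a stand-in for whatever pack/unpack implementation you prefer, the path is arbitrary, WriteBackCache is the cachetools sketch from earlier, and (as noted above) the code is not production-ready.
import pickle

class PickleSerializer(object):
    # illustrative serializer: anything with pack/unpack would do
    def pack(self, obj):
        return pickle.dumps(obj)

    def unpack(self, data):
        return pickle.loads(data)

def node_dict_factory():
    # each call opens (or creates) a LevelDB-backed dict with a write-back cache
    return PlyvelDict('/tmp/graph-nodes',
                      serializer_factory=PickleSerializer,
                      cache_factory=WriteBackCache,
                      create_if_missing=True)

G = PersistedGraph(node_dict_factory=node_dict_factory)
G.add_node(1, label='first')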

It should be possible to extend networkx by subclassing the Graph class and providing user-defined factory functions.
Those functions could query a database and cache the results in the dictionaries used by networkx.
I couldn't find these lines in the online documentation, but in the code you have:
Subclasses (Advanced):
The Graph class uses a dict-of-dict-of-dict data structure.
The outer dict (node_dict) holds adjacency lists keyed by node.
The next dict (adjlist) represents the adjacency list and holds
edge data keyed by neighbor. The inner dict (edge_attr) represents
the edge data and holds edge attribute values keyed by attribute names.
Each of these three dicts can be replaced by a user defined
dict-like object. In general, the dict-like features should be
maintained but extra features can be added. To replace one of the
dicts create a new graph class by changing the class(!) variable
holding the factory for that dict-like structure. The variable names
are node_dict_factory, adjlist_dict_factory and edge_attr_dict_factory.
node_dict_factory : function, (default: dict)
Factory function to be used to create the outer-most dict
in the data structure that holds adjacency lists keyed by node.
It should require no arguments and return a dict-like object.
adjlist_dict_factory : function, (default: dict)
Factory function to be used to create the adjacency list
dict which holds edge data keyed by neighbor.
It should require no arguments and return a dict-like object.
edge_attr_dict_factory : function, (default: dict)
Factory function to be used to create the edge attribute
dict which holds attribute values keyed by attribute name.
It should require no arguments and return a dict-like object.
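As a minimal, self-contained illustration of the class-variable mechanism described above (LoggingDict and LoggedGraph are made-up names; a real factory could return a database-backed mapping instead):
import networkx as nx

class LoggingDict(dict):
    # toy stand-in for a dict-like object backed by external storage
    def __setitem__(self, key, value):
        print('storing %r' % (key,))
        dict.__setitem__(self, key, value)

class LoggedGraph(nx.Graph):
    node_dict_factory = LoggingDict  # class variable, as the docstring says

G = LoggedGraph()
G.add_node('a')  # "storing 'a'" is printed as NetworkX fills its node dict
Note that direct assignments such as self.node[n] = ... go through the custom __setitem__, while update() calls on a dict subclass do not; this is the same pitfall flagged in the first answer.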
I am not aware of any official extension for networkx.

Related

How to implement a secondary custom method for object slicing, other than __getitem__ in Python

I am looking to implement a custom method in my class which helps users slice based on index. The primary slicing will be based on dictionary keys. I want to implement it similar to how Pandas does it, using df.iloc[n]. Here's my code:
class Vector:
    def __init__(self, map_object: dict):
        self.dictionary = map_object

    def __getitem__(self, key):
        data = self.dictionary[key]
        return data

    def iloc(self, n):
        key = list(self.dictionary)[n]
        return self.dictionary[key]
However, if I then write object.iloc[3] after creating the object, I get an error saying 'method' object is not subscriptable. So how can I implement this?
The [ ] syntax requires a proper object with a __getitem__ method. In order to have a "slice method", use a property that returns a helper which supports slicing.
The helper simply holds a reference to the actual parent object, and defines a __getitem__ with the desired behaviour:
class VectorIloc:
    def __init__(self, parent):
        self.parent = parent

    # custom logic for the desired "iloc" behaviour
    def __getitem__(self, item):
        key = list(self.parent.dictionary)[item]
        return self.parent[key]
On the actual class, merely define the desired "method" as a property that returns the helper or as an attribute:
class Vector:
    def __init__(self, map_object: dict):
        self.dictionary = map_object
        # if .iloc is used often
        # self.iloc = VectorIloc(self)

    def __getitem__(self, key):
        return self.dictionary[key]

    # if .iloc is used rarely
    @property
    def iloc(self):
        return VectorIloc(self)
Whether to use a property or an attribute is an optimisation that trades memory for performance: an attribute constructs and stores the helper always, while a property constructs it only on-demand but on each access. A functools.cached_property can be used as a middle-ground, creating the attribute on first access.
The property is advantageous when the helper is used rarely per object, and especially if it often is not used at all.
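As a sketch of that middle ground (requires Python 3.8+ for functools.cached_property):
from functools import cached_property

class Vector:
    def __init__(self, map_object: dict):
        self.dictionary = map_object

    def __getitem__(self, key):
        return self.dictionary[key]

    @cached_property
    def iloc(self):
        # built on first access, then stored on the instance for reuse
        return VectorIloc(self)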
Now, when calling vector.iloc[3], the vector.iloc part provides the helper and the [3] part invokes the helper's __getitem__.
>>> vector = Vector({0:0, 1: 1, 2: 2, "three": 3})
>>> vector.iloc[3]
3
I was looking for this implementation, which I'm used to from Pandas. However, after searching a lot, I could not find a suitable answer. So I went looking through the Pandas source code and found that the primary requirements for implementing this are as follows:
Create the method with the @property decorator, so that it accepts the slice object without throwing the above error
Create a second class to slice based on the index, pass self to this class, and return this class from the method
My final code ended up looking something like this:
class TimeSeries:
    def __init__(self, data: dict):
        self.data = data

    def __getitem__(self, key):
        data = self.data[key]
        return data

    @property
    def iloc(self):
        return Slicer(self)


class Slicer:
    def __init__(self, obj):
        self.time_series = obj

    def __getitem__(self, n):
        key = list(self.time_series.data)[n]
        return self.time_series[key]
With the classes defined this way, I could write the following code:
>>> ts = TimeSeries({'a': 1, 'b': 2, 'c': 3, 'd': 4})
>>> print("value of a:", ts['a'])
value of a: 1
>>> print("value at position 0:", ts.iloc[0])
value at position 0: 1

How to allow for extra characters in dictionary keys

Let's say I have a dictionary like this:
my_dict = {
    'hey': onevalue,
    'hat': twovalue,
    'how': threevalue
}
Is there any way to account for a reference to a key that has a variable part, say a number, in it?
For example, my_dict['hey1'] = onevalue
or my_dict['hey2'] = onevalue
I'm trying to allow for such a variable part instead of stripping the numbers from the key reference. Thank you in advance.
There is not. Due to how dicts work (as a hash table), they have no way to accommodate what you want. You will have to create your own type and define the __*item__() methods to implement your logic.
Building on this answer, you can define your own dict-compatible type that strips trailing digits:
from collections.abc import MutableMapping


class TransformedDict(MutableMapping):
    """A dictionary which applies an arbitrary key-altering function before accessing the keys"""

    def __init__(self, *args, **kwargs):
        self.store = dict()
        self.update(dict(*args, **kwargs))  # use the free update to set keys

    def __getitem__(self, key):
        return self.store[self.__keytransform__(key)]

    def __setitem__(self, key, value):
        self.store[self.__keytransform__(key)] = value

    def __delitem__(self, key):
        del self.store[self.__keytransform__(key)]

    def __iter__(self):
        return iter(self.store)

    def __len__(self):
        return len(self.store)

    def __keytransform__(self, key):
        # scan from the end until a non-digit is found
        for i in range(len(key) - 1, -1, -1):
            if not key[i].isdigit():
                break
        return key[:i + 1]  # keep the non-digit prefix (key[:i] would drop a character)
a = TransformedDict()
a['hello'] = 5
print(a['hello123'])  # prints 5

Define a python dictionary with immutable keys but mutable values

Well, the question is in the title: how do I define a python dictionary with immutable keys but mutable values? I came up with this (in python 2.x):
class FixedDict(dict):
    """
    A dictionary with a fixed set of keys
    """

    def __init__(self, dictionary):
        dict.__init__(self)
        for key in dictionary.keys():
            dict.__setitem__(self, key, dictionary[key])

    def __setitem__(self, key, item):
        if key not in self:
            raise KeyError("The key '" + key + "' is not defined")
        dict.__setitem__(self, key, item)
but it looks to me (unsurprisingly) rather sloppy. In particular, is this safe or is there the risk of actually changing/adding some keys, since I'm inheriting from dict?
Thanks.
Consider proxying dict instead of subclassing it. That means that only the methods that you define will be allowed, instead of falling back to dict's implementations.
class FixedDict(object):
    def __init__(self, dictionary):
        self._dictionary = dictionary

    def __setitem__(self, key, item):
        if key not in self._dictionary:
            raise KeyError("The key {} is not defined.".format(key))
        self._dictionary[key] = item

    def __getitem__(self, key):
        return self._dictionary[key]
Also, you should use string formatting instead of + to generate the error message, since otherwise it will crash for any value that's not a string.
The problem with direct inheritance from dict is that it's quite hard to comply with the full dict contract (e.g. in your case, the update method won't behave in a consistent way).
What you want is to extend collections.abc.MutableMapping:
from collections.abc import MutableMapping


class FixedDict(MutableMapping):
    def __init__(self, data):
        self.__data = data

    def __len__(self):
        return len(self.__data)

    def __iter__(self):
        return iter(self.__data)

    def __setitem__(self, k, v):
        if k not in self.__data:
            raise KeyError(k)
        self.__data[k] = v

    def __delitem__(self, k):
        raise NotImplementedError

    def __getitem__(self, k):
        return self.__data[k]

    def __contains__(self, k):
        return k in self.__data
Note that the original (wrapped) dict will be modified, if you don't want that to happen, use copy or deepcopy.
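A quick demonstration of this class's behaviour; note that update() is also rejected here, because MutableMapping implements it in terms of __setitem__:
d = FixedDict({'a': 1, 'b': 2})
d['a'] = 10          # fine: 'a' already exists

try:
    d['c'] = 3       # rejected: 'c' is not a predefined key
except KeyError as e:
    print('rejected:', e)

try:
    d.update(c=3)    # also rejected: MutableMapping.update() uses __setitem__
except KeyError as e:
    print('rejected:', e)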
How you prevent someone from adding new keys depends entirely on why someone might try to add new keys. As the comments state, most dictionary methods that modify the keys don't go through __setitem__, so a .update() call will add new keys just fine.
If you only expect someone to use d[new_key] = v, then your __setitem__ is fine. If they might use other ways to add keys, then you have to put in more work. And of course, they can always use this to do it anyway:
dict.__setitem__(d, new_key, v)
You can't make things truly immutable in Python, you can only stop particular changes.

How should I do a RedisDict in python?

I am doing a really simple API to test respire, a SPORE client generator for Python.
In WSGI, what would be the best way to keep data around between requests?
I tried to make a RedisDict this way:
import json
from redis import Redis

redis = Redis()


class RedisDict:
    """A redis based dict."""

    def dict(self):
        TODOS = redis.get('TODOS')
        return json.loads(TODOS)

    def keys(self):
        return self.dict().keys()

    def __getitem__(self, key):
        return self.dict()[key]

    def __setitem__(self, key, value):
        obj = self.dict()
        obj[key] = value
        redis.set('TODOS', json.dumps(obj))

    def __delitem__(self, key):
        obj = self.dict()
        del obj[key]
        redis.set('TODOS', json.dumps(obj))


todos = RedisDict()
How can I make dict(todos) return a dict?
Is that enough in a WSGI environment?
Assuming that the method dict returns a dictionary, why not just do this:
dict_i_wanted = todos.dict()
If you must support dict_i_wanted = dict(todos), then add this method:
def __iter__(self):
    return iter(self.dict().items())
If you want to make your own "dict-like" class, you need to implement the mapping protocol. The easiest way is to inherit from collections.abc.Mapping and implement the methods its table marks as abstract (__getitem__, __len__ and __iter__). These need to behave the same as the corresponding dict methods; in particular, __iter__ should yield the keys. With the mapping protocol in place, dict(todos) works via keys() and __getitem__. (There is no magic method to coerce to a dict like you tried to do with your dict() method.)
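Putting it together, a Mapping-based sketch of the question's RedisDict might look like this; the 'TODOS' key comes from the question, while the class layout and helper names are mine:
import json
from collections.abc import MutableMapping
from redis import Redis


class RedisDict(MutableMapping):
    """Dict-like view over a single JSON document stored under one Redis key."""

    def __init__(self, conn=None, name='TODOS'):
        self._redis = conn or Redis()
        self._name = name

    def _load(self):
        raw = self._redis.get(self._name)
        return json.loads(raw) if raw else {}

    def _save(self, obj):
        self._redis.set(self._name, json.dumps(obj))

    def __getitem__(self, key):
        return self._load()[key]

    def __setitem__(self, key, value):
        obj = self._load()
        obj[key] = value
        self._save(obj)

    def __delitem__(self, key):
        obj = self._load()
        del obj[key]
        self._save(obj)

    def __iter__(self):
        return iter(self._load())

    def __len__(self):
        return len(self._load())


todos = RedisDict()
todos['write docs'] = 'pending'
print(dict(todos))  # works via the keys()/__getitem__ the mixins provide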

Keyed Collection in Python?

Is there any equivalent to KeyedCollection in Python, i.e. a set where the elements have (or dynamically generate) their own keys?
i.e. the goal here is to avoid storing the key in two places, and therefore dictionaries are less than ideal (hence the question).
You can simulate that very easily:
class KeyedObject(object):
    def get_key(self):
        raise NotImplementedError("You must subclass this before you can use it.")


class KeyedDict(dict):
    def append(self, obj):
        self[obj.get_key()] = obj
Now you can use a KeyedDict instead of dict with subclasses of KeyedObject (where get_key returns a valid key based on some object property).
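For example (the User class is made up for illustration):
class User(KeyedObject):
    def __init__(self, name):
        self.name = name

    def get_key(self):
        return self.name


users = KeyedDict()
users.append(User('alice'))
print(users['alice'].name)  # alice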
Given your constraints, everyone trying to implement what you're looking for using a dict is barking up the wrong tree. Instead, you should write a list subclass that overrides __getitem__ to provide the behavior you want. I've written it so it tries to get the desired item by index first, then falls back to searching for the item by the key attribute of the contained objects. (This could be a property if the object needs to determine this dynamically.)
There's no way to avoid a linear search if you don't want to duplicate something somewhere; I am sure the C# implementation does exactly the same thing if you don't allow it to use a dictionary to store the keys.
class KeyedCollection(list):
    def __getitem__(self, key):
        # integer and slice indexing behave like a normal list
        if isinstance(key, (int, slice)):
            return list.__getitem__(self, key)  # self must be passed explicitly here
        # otherwise fall back to a linear search on the items' `key` attribute
        for item in self:
            if getattr(item, "key", 0) == key:
                return item
        raise KeyError('item with key `%s` not found' % key)
You would probably also want to override __contains__ in a similar manner so you could say if "key" in kc.... If you want to make it even more like a dict, you could also implement keys() and so on. They will be equally inefficient, but you will have an API like a dict, that also works like a list.
@Mehrdad said:
Because semantically, it doesn't make as much sense. When an object
knows its key, it doesn't make sense to put it in a dictionary -- it's
not a key-value pair. It's more of a semantic issue than anything
else.
With this constraint, there is nothing in Python that does what you want. I suggest you use a dict and not worry about this level of detail of the semantics. @Gabi Purcaru's answer shows how you can create an object with the interface you want. Why worry about how it works internally?
It could be that C#'s KeyedCollection is doing the same thing under the covers: asking the object for its key and then storing the key for fast access. In fact, from the docs:
By default, the KeyedCollection(Of TKey, TItem) includes a lookup
dictionary that you can obtain with the Dictionary property. When an
item is added to the KeyedCollection(Of TKey, TItem), the item's key
is extracted once and saved in the lookup dictionary for faster
searches. This behavior is overridden by specifying a dictionary
creation threshold when you create the KeyedCollection(Of TKey,
TItem). The lookup dictionary is created the first time the number of
elements exceeds that threshold. If you specify –1 as the threshold,
the lookup dictionary is never created.
I'm not much of a C#'er, but I think dictionaries are what you need.
http://docs.python.org/tutorial/datastructures.html#dictionaries
http://docs.python.org/tutorial/datastructures.html
Or maybe lists:
http://docs.python.org/library/functions.html#list
Why not simply use a dict? If the key already exists, a reference to the key will be used in the dict; it won't be senselessly duplicated.
class MyExample(object):
    def __init__(self, key, value):
        self.key = key
        self.value = value


m = MyExample("foo", "bar")
d = {}
d[m.key] = m

first_key = list(d.keys())[0]
first_key is m.key  # returns True
If the key doesn't already exist, a copy of it will be saved, but I don't see that as a problem.
def lame_hash(s):
    h = 0
    for ch in s:
        h ^= ord(ch)
    return h


d = {}
d[lame_hash(m.key)] = m
print(d)  # the key is 102, which is stored in the dict
lame_hash(m.key) in d  # returns True
I'm not sure if this is what you meant, but this dictionary will create its own keys as you add to it...
class KeyedCollection(dict):
    def __init__(self):
        self.current_key = 0

    def add(self, item):
        self[self.current_key] = item
        self.current_key += 1  # advance the key, otherwise every item lands on key 0


abc = KeyedCollection()
abc.add('bob')
abc.add('jane')
>>> abc
{0: 'bob', 1: 'jane'}
How about a set()? The elements can have their own keys.
To go into a little more detail than the already correct answer from @Gabi Purcaru, here is a class that does the same as Gabi's but also checks that the given key and value have the correct types (like the TKey and TValue of the .NET KeyedCollection).
import abc
from collections.abc import MutableMapping


class KeyedCollection(MutableMapping):
    """
    Provides the abstract base class for a collection (:class:`MutableMapping`) whose keys are embedded in the values.
    """
    # MutableMapping already uses ABCMeta, so no explicit __metaclass__ is needed
    _dict = None  # type: dict

    def __init__(self, seq={}):
        self._dict = dict(seq)

    @abc.abstractmethod
    def __is_type_key_correct__(self, key):
        """
        Returns: True if ``key`` has the key type expected by the collection
        """

    @abc.abstractmethod
    def __is_type_value_correct__(self, value):
        """
        Returns: True if ``value`` has the value type expected by the collection
        """

    @abc.abstractmethod
    def get_key_for_item(self, value):
        """
        When implemented in a derived class, extracts the key from the specified element.

        Args:
            value: the element from which to extract the key

        Returns: The key of the specified element
        """

    def __assert_type_key(self, key, arg_name='key'):
        if not self.__is_type_key_correct__(key):
            raise ValueError("{} type is not correct".format(arg_name))

    def __assert_type_value(self, value, arg_name='value'):
        if not self.__is_type_value_correct__(value):
            raise ValueError("{} type is not correct".format(arg_name))

    def add(self, value):
        """
        Adds an object to the KeyedCollection.

        Args:
            value: The object to be added to the KeyedCollection.
        """
        key = self.get_key_for_item(value)
        self._dict[key] = value

    # Implements abstract method __setitem__ from MutableMapping parent class
    def __setitem__(self, key, value):
        self.__assert_type_key(key)
        self.__assert_type_value(value)
        # derive the key from the value itself (the original called value.get_key(),
        # which is inconsistent with get_key_for_item)
        if self.get_key_for_item(value) != key:
            raise ValueError("provided key does not correspond to the given value")
        self._dict[key] = value

    # Implements abstract method __delitem__ from MutableMapping parent class
    def __delitem__(self, key):
        self.__assert_type_key(key)
        self._dict.pop(key)

    # Implements abstract method __getitem__ from MutableMapping parent class (Mapping base class)
    def __getitem__(self, key):
        self.__assert_type_key(key)
        return self._dict[key]

    # Implements abstract method __len__ from MutableMapping parent class (Sized mixin on Mapping base class)
    def __len__(self):
        return len(self._dict)

    # Implements abstract method __iter__ from MutableMapping parent class (Iterable mixin on Mapping base class)
    def __iter__(self):
        return iter(self._dict)

    # Implements abstract method __contains__ from MutableMapping parent class (Container mixin on Mapping base class)
    def __contains__(self, x):
        self.__assert_type_key(x, 'x')
        return x in self._dict
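For completeness, a minimal concrete subclass might look like this (User and its name attribute are illustrative):
class UserCollection(KeyedCollection):
    def __is_type_key_correct__(self, key):
        return isinstance(key, str)

    def __is_type_value_correct__(self, value):
        return hasattr(value, 'name')

    def get_key_for_item(self, value):
        return value.name


class User(object):
    def __init__(self, name):
        self.name = name


users = UserCollection()
users.add(User('alice'))
print(users['alice'].name)  # alice
# users[42] would raise ValueError, since 42 fails the key type check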
