Pickling a graph with cycles - python

I have a custom node class in Python that is built into a graph (which is a dictionary). Since these take a while to create, I'd like to pickle them so that I don't have to reconstruct them every time I run my code.
Unfortunately, because this graph has cycles, cPickle hits the maximum recursion depth:
RuntimeError: maximum recursion depth exceeded while pickling an object
This is my node object:
class Node:
    """Graph vertex identified by ``name``.

    Hashing and equality look at ``name`` only. ``parents`` and ``children``
    are sets holding references to other ``Node`` objects.
    """

    def __init__(self, name):
        self.name = name
        self.uid = 0
        self.parents = set()
        self.children = set()

    def __hash__(self):
        return hash(self.name)

    def __eq__(self, that):
        return self.name == that.name

    def __str__(self):
        # One line for the node itself, then one tab-indented line per relation.
        child_names = ", ".join([c.name for c in self.children])
        parent_names = ", ".join([p.name for p in self.parents])
        lines = [
            "Name: " + self.name,
            "\tChildren:" + child_names,
            "\tParents:" + parent_names,
        ]
        return "\n".join(lines)
This is how I build my graph:
def buildGraph(input):
    """Build a ``name -> Node`` mapping from an iterable of edge lines.

    Each non-blank line must consist of three whitespace-separated tokens,
    ``source.node -> target.node``; the middle token is ignored. A node is
    created the first time its name is seen and receives ``uid`` equal to the
    number of nodes created before it. For every line the target is added to
    the source's ``children`` and the source to the target's ``parents``.

    Blank lines are skipped. A line with any other token count raises
    ``ValueError`` from the unpacking below.
    """
    graph = {}

    def get_or_create(name):
        # Shared by both endpoints so the uid assignment cannot drift apart.
        node = graph.get(name)
        if node is None:
            node = Node(name)
            node.uid = len(graph)
            graph[name] = node
        return node

    for line in input:
        if not line.strip():
            continue  # tolerate empty lines, e.g. at the end of a file
        source, _arrow, target = line.split()
        # Source first, then target: this order determines the uid sequence.
        nsource = get_or_create(source)
        ntarget = get_or_create(target)
        nsource.children.add(ntarget)
        ntarget.parents.add(nsource)
    return graph
Then in my main, I have
graph = buildGraph(input_file)
bo = cPickle.dumps(graph)
and the second line is where I get my recursion depth error.
Are there any solutions outside of changing the structure of Node?

You need to prepare the object for pickle: if you have a cycle you need to break cycles and store this information in some other form.
Pickle uses the __getstate__ method to prepare an object for pickling (it is called before the object is serialized) and the __setstate__ method to initialize the object when it is unpickled.
class SomethingPickled(object):
    """Template for pickling an object whose ``cycled`` attribute contains cycles.

    ``yourUncycleMethod`` and ``yourCycleMethod`` are placeholders that are not
    defined in this snippet; a concrete class is expected to supply them as a
    matching pair (flatten the cyclic data / rebuild it from the flat form).
    """

    ## Compress and uncycle data before pickle.
    def __getstate__(self):
        # Shallow copy, so replacing keys below does not touch the live object.
        state = self.__dict__.copy()
        # Break cycles: store the flat form in place of the cyclic attribute.
        state['uncycled'] = self.yourUncycleMethod(state.pop('cycled'))
        # Send to pickle.
        return state

    ## Expand data before unpickle.
    def __setstate__(self, state):
        # Work on a copy so the caller's dict is left untouched.
        state = dict(state)
        # Restore cycles under the same attribute name __getstate__ removed.
        state['cycled'] = self.yourCycleMethod(state.pop('uncycled'))
        self.__dict__.update(state)
I am sure that you will find a way to break and rejoin the cycles :)

I don't think that the fact that your graph is cyclic is the problem -- pickle (and cPickle) should handle cyclic data structures just fine. I tried the following (with your definition of Node) and it worked just fine:
>>> n1 = Node('a')
>>> n2 = Node('b')
>>> n1.parents.add(n2)
>>> n2.parents.add(n1)
>>> n2.children.add(n1)
>>> n1.children.add(n1)
>>> import cPickle as pickle
>>> pickle.dumps(n1)
Indeed, even with large cycles I didn't run into a problem. E.g., this works fine for me:
>>> def node_cycle(n):
... start_node = prev_node = Node('node0')
... for i in range(n):
... node = Node('node%d' % (i+1))
... node.parents.add(prev_node)
... prev_node.children.add(node)
... prev_node = node
... start_node.parents.add(node)
... node.children.add(start_node)
>>> cycle = node_cycle(100000) # cycle of 100k nodes
>>> pickle.dumps(cycle)
(This was all tested on Python 2.7.1)
There are other reasons why pickle might end up with very deep recursion though, depending on the shape of your data structure. If this is the real problem, then you might be able to fix it with something like this:
>>> import sys
>>> sys.setrecursionlimit(10000)

Here, this modified node class holds only the names of the objects as strings in a node, and gives you a set with full "Node" objects when you retrieve either the "children" or the "parents" attribute of a node.
Internally there are no cycles, so it should avoid the infinite-loop trap. You can implement additional auxiliary methods to ease navigation as you want.
class Node(object):
    """Graph node that stores its neighbours by name instead of by reference.

    ``_parents`` and ``_children`` hold name strings only. The ``parents`` and
    ``children`` properties resolve those names through the class-level
    ``all_nodes`` registry and return a fresh set of ``Node`` objects on each
    access.

    Pickling: the registry is added to each node's pickled state and
    reinstated as ``Node.all_nodes`` on load, replacing whatever registry the
    class held before.
    """

    # Class-wide registry: name -> Node, filled in by __new__.
    all_nodes = {}

    def __new__(cls, name):
        self = object.__new__(cls)
        cls.all_nodes[name] = self
        return self

    def __getnewargs__(self):
        # __new__ requires ``name``; pickle protocols 2+ recreate the object
        # through __new__ and would otherwise call it with no arguments.
        return (self.name,)

    def __getstate__(self):
        # Build a separate dict so pickling does not leave an ``all_nodes``
        # attribute behind on the instance.
        state = dict(self.__dict__)
        state["all_nodes"] = self.__class__.all_nodes
        return state

    def __setstate__(self, dct):
        state = dict(dct)
        self.__class__.all_nodes = state.pop("all_nodes")
        self.__dict__ = state

    def __init__(self, name):
        self.name = name
        self.uid = 0
        self._parents = set()
        self._children = set()

    def __hash__(self):
        return hash(self.name)

    def __eq__(self, that):
        return self.name == that.name

    def __repr__(self):
        return "\n" + "\n".join([
            "Name: " + self.name,
            "\tChildren:" + ", ".join([c.name for c in self.children]),
            "\tParents:" + ", ".join([p.name for p in self.parents]),
        ])

    def get_relations(self, which):
        """Return the set of Node objects named in attribute ``which``."""
        names = getattr(self, which)
        return set(self.__class__.all_nodes[name] for name in names)

    @property
    def children(self):
        return self.get_relations("_children")

    @property
    def parents(self):
        return self.get_relations("_parents")

    def __contains__(self, item):
        return item.name in self._children

    def add(self, child):
        """Record ``child`` as a child of this node, and this node as its parent."""
        self._children.add(child.name)
        child._parents.add(self.name)

    connect_child = add
#example and testing:
from cPickle import loads, dumps
# Build a three-node cycle: n1 -> n2 -> n3 -> n1.
n1 = Node("n1")
n2 = Node("n2")
n3 = Node("n3")
n1.add(n2)
n2.add(n3)
n3.add(n1)
# NOTE: the print statements below are Python 2 syntax.
print n1, n2, n3
# Serialise starting from n1.
p1 = dumps(n1)
# Empty the class-level registry before loading the pickled data back.
Node.all_nodes.clear()
p2 = loads(p1)
print p2
print p2.children
# Follow one child link from the restored node and show that node's children.
print p2.children.pop().children
print Node.all_nodes
The drawback is that it maintains a class dictionary named "all_nodes" where there are references to all actually created nodes. (Pickle is smart enough to only pickle this dictionary once for a given graph, since it is referenced by all Node objects) .
The problem with the class-wide "all_nodes" reference arises if you need to pickle and unpickle different sets of graphs (let's say you create a graph g1 with one set of nodes and, in another run, a graph g2 with another set of nodes; if you then unpickle g1 and later g2, the unpickling of g2 will override the node references for g1). If you need this to work, ask in a comment and I could come up with something - the easiest thing I can think of is having a "graph" class that holds a dictionary of all the nodes (instead of having it in the Node class).

Related

Adding children to a tree structure and printing it

I am trying to implement an n-ary tree based on this post:
[here][1]
and I am getting an error when trying to define a function that adds children:
class node(object):
    """Tree node holding a ``value`` and a list of child nodes."""

    def __init__(self, value, children=None):
        self.value = value
        # A fresh list per instance: a ``[]`` default would be created once
        # and shared by every node constructed without explicit children.
        self.children = [] if children is None else children

    def __str__(self, level=0):
        """Render this subtree, one node per line, indented by one tab per level."""
        ret = "\t" * level + repr(self.value) + "\n"
        for child in self.children:
            ret += child.__str__(level + 1)
        return ret

    # trying to implement this method so that I can get rid of
    # calling root.children[0].children
    def add_child(self, obj):
        """Append the single node ``obj`` to this node's children."""
        self.children.append(obj)

    def __repr__(self):
        return '<tree node representation>'
root = node('grandmother')
root.children = [node('daughter'), node('son')]
root.children[0].children = [node('granddaughter'), node('grandson')]
root.children[1].children = [node('granddaughter'), node('grandson')]
root.add_child([node((1)), node(2)]) # error
print (root)
I want to be able to to create a tree and print it.
[1]: Printing a Tree data structure in Python
If you name a method add_child, it should add a child, not children. And if you add children, you should extend the list, not just append the given list on its end.
Working example:
class Node(object):
    """Tree node: a ``value`` plus an ordered list of child nodes."""

    def __init__(self, value, children=None):
        self.value = value
        self.children = [] if children is None else children

    def __str__(self, level=0):
        """Return the subtree as text, one node per line, one tab per depth level."""
        pieces = ["\t" * level + repr(self.value) + "\n"]
        for child in self.children:
            pieces.append(child.__str__(level + 1))
        return "".join(pieces)

    def add_children(self, obj):
        """Append every node of the iterable ``obj`` to ``children``."""
        kids = self.children
        kids.extend(obj)
root = Node('grandmother')
root.children = [Node('daughter'), Node('son')]
root.children[0].children = [Node('granddaughter'), Node('grandson')]
root.children[1].children = [Node('granddaughter'), Node('grandson')]
root.add_children([Node(1), Node(2)])
print(root)
Output:
'grandmother'
'daughter'
'granddaughter'
'grandson'
'son'
'granddaughter'
'grandson'
1
2
You call add_child with an entire list object. Within add_child you use the method list.append which adds the entire list object to the list itself.
Solution 1: call add_child by specifying the nodes directly:
root.add_child(node(1))
root.add_child(node(2))
Solution 2: change the implementation of add_child by using list.extend instead of list.append. The former adds each element within the supplied argument to the list, while the latter adds the entire argument to the list.
def add_child(self, obj):
    """Append each element of the iterable ``obj`` to ``self.children``."""
    kids = self.children
    kids.extend(obj)

Modify multiple nodes with python ast.NodeTransformer

I have a input source code like this
def foo(my_input):
return my_input + 42
and want it to transform like this
def method_name(arg0):
return my_input + 42
The ast node transformer for this purpose is written like this.
class MyRenamer(ast.NodeTransformer):
    """Rename function definitions and number ``arg`` nodes it is asked to visit.

    ``visit_FunctionDef`` returns without calling ``generic_visit``, so nodes
    nested under a function definition are not visited by this transformer.
    """

    def __init__(self):
        # Running index used to build the next ``arg_N`` name.
        self._arg_count = 0

    def visit_FunctionDef(self, node):
        setattr(node, "name", "method_name")
        return node

    def visit_arg(self, node):
        label = "arg_{}".format(self._arg_count)
        node.arg = label
        self._arg_count += 1
        return node
but when I call the above transformer like this.
node = ast.parse(code)
renamer = MyRenamer()
node2 = renamer.visit(node)
print(astor.to_source(node2))
The output I get is
def method_name(my_input):
return my_input + 42
Here the argument of the function has not been changed.
The visitor needs to walk the AST by visiting all children of the currently visited node. The method generic_visit() does that for you, but you have to call it in every visit_... method, or at least for those where children are a possibility.
import ast
import astor
class MyRenamer(ast.NodeTransformer):
    """Rename every function to ``method_name`` and its arguments to ``arg_N``.

    ``N`` comes from a counter shared across the whole traversal, so numbering
    continues from one function to the next.
    """

    def __init__(self):
        self._arg_count = 0

    def visit_FunctionDef(self, node):
        node.name = "method_name"
        # Descend into the children so the ``arg`` nodes get visited too.
        self.generic_visit(node)
        return node

    def visit_arg(self, node):
        label = "arg_{}".format(self._arg_count)
        node.arg = label
        self._arg_count += 1
        self.generic_visit(node)
        return node
code = """
def foo(my_input):
return my_input + 42
"""
node = ast.parse(code)
renamer = MyRenamer()
node2 = renamer.visit(node)
print(astor.to_source(node2))
def method_name(arg_0):
return my_input + 42
This gives your expected output, but in a larger context will rename all functions to "method_name", which might not be desired. And there's still the identifier in the function body that presumably also needs to be renamed.

Python list of classes , index() not working

Not sure what i'm doing wrong here. I have this class:
class Node:
    '''
    Class to contain the lspid seq and all data.

    The printable form is "<name>-<pseudonode>-<fragment>". Ordering via
    __cmp__ (Python 2 only) looks at ``name`` alone.
    '''

    def __init__(self, name, pseudonode, fragment, seq_no, data):
        self.name = name
        # Keep what the caller passed in; previously the argument was
        # accepted but always replaced by an empty dict.
        self.data = data if data is not None else {}
        self.pseudonode = pseudonode
        self.seq_no = seq_no
        self.fragment = fragment

    def _full_name(self):
        # Single source for the text used by both __unicode__ and __repr__.
        return str('%s-%d-%d' % (self.name, self.pseudonode, self.fragment))

    def __unicode__(self):
        return self._full_name()

    def __cmp__(self, other):
        if self.name > other.name:
            return 1
        elif self.name < other.name:
            return -1
        return 0

    def __repr__(self):
        return self._full_name()
and putting some entries in a list :
nodes = []
node = Node('0000.0000.0001',0,0,100,{})
nodes.append(node)
>>> nodes
[0000.0000.0001-0-0]
node = Node('0000.0000.0001',1,0,100,{})
nodes.append(node)
>>> nodes
[0000.0000.0001-0-0, 0000.0000.0001-1-0]
i'm trying to get the index of a node in list nodes[]
>>> node
0000.0000.0001-1-0
>>> nodes.index(node)
0
0 is not what i was expecting. Not sure why this is happening.
edit
i'm after getting the index of the list where '0000.0000.0001-1-0' is.
The index function, when used on a container, relies on its element's __cmp__ function to return the index of the first element that it thinks is equal to the input-object. You probably know as much, since you implemented it for the node. But what you are expecting is that __cmp__ considers not only the name, but also the pseudonode and the fragment, right?
A straight-forward approach would be to consider them a tuple, which performs a comparison of elements from left to right, until the first inequality was found:
def __cmp__(self, other):
    """Three-way compare on (name, pseudonode, fragment); returns 1, -1 or 0."""
    mine = (self.name, self.pseudonode, self.fragment)
    theirs = (other.name, other.pseudonode, other.fragment)
    # Tuples compare element by element from the left, so ``name`` decides
    # first and the later fields only break ties.
    return (mine > theirs) - (mine < theirs)
If you want another order, you can use the tuples-ordering to define it.

using dictionaries and linked list python

I have a dictionary whose keys are words, and I want each value to be a linked list, something like this:
Dictionary
key element
hi linkedlist1
hello linkedlist2
I have already done it with arrays:
# Map each word to the collection of file titles (file name without ".txt")
# in which the word occurs.
dictionari = {}
for textoactual in texto:
    # The title depends only on the file name, so compute it once per file.
    aux_txt = textoactual.replace('.txt', '')
    # ``with`` closes the file even if processing a line raises.
    with open(textoactual, 'r') as archi:
        for row in archi:
            # NOTE: split(' ') keeps the trailing newline on the last token
            # of each row; left as in the original.
            for word in row.split(' '):
                if word in dictionari:
                    aux = dictionari[word]
                    if aux_txt not in aux:
                        # ``aux`` is the stored object itself, so appending
                        # is enough; no re-assignment is needed.
                        aux.append(aux_txt)
                else:
                    dictionari[word] = makelist(aux_txt)
EDIT3 this might come too late to the show, since the question was accepted over a month ago, yet I've got a thing to add.
In fact Python has a standard C-ish linked list implementation that is the deque class in the collections module. Here is the source
A dequeobject is composed of a doubly-linked list of block nodes.
So if you need a fast linked list in Python stick with deque.
EDIT2 based on OP's comment.
...because i want to see what is faster linked list or arrays when i
search information
Search complexity in a linked list is equal to that in an array (or array-based structures) and is approximately O(n), where n is the number of elements in your container. But since Python built-in data structures are heavily optimized and C-loaded, they will run a lot faster in real-life usage. Linked lists are helpful when you need constant time insertion/deletion at any position of the list or when you don't want to mess with dynamically sized arrays, but it doesn't seem like your case. Since you are actually looking for fast search, you need a hash-table, hence use sets to store file names. In order to do this replace the following line in the match_words_and_files
res.setdefault(word, llist.LinkedList()).insert_with_lookup(file_title)
with
res.setdefault(word, set()).add(file_title)
EDIT. OP updated the request. Provided the LinkedList stuff is saved in a separate module named llist:
import os
import llist
def match_words_and_files(directory):
    """Map every word found in the files of ``directory`` to a linked list of file titles.

    A file title is the file name without its extension. Only regular files
    directly inside ``directory`` are read. Words are the whitespace-separated
    tokens of each line.
    """
    directory = os.path.abspath(directory)
    res = {}
    for file_name in os.listdir(directory):
        # listdir() yields bare names; join with the directory before testing,
        # otherwise isfile() is evaluated relative to the working directory.
        path = os.path.join(directory, file_name)
        if not os.path.isfile(path):
            continue
        file_title = os.path.splitext(file_name)[0]
        with open(path) as inp:
            for line in inp:
                for word in line.split():
                    if word in res:
                        res[word].insert_with_lookup(file_title)
                    else:
                        # Seed the list with its first title rather than
                        # creating an empty list and inserting into it.
                        res[word] = llist.LinkedList(file_title)
    return res
Original post.
If you want a linked list in Python it can be implemented this way (obviously this is not the only way to do it)
class Node(object):
    """Singly linked list cell: a payload plus a reference to the next cell."""

    __slots__ = ["_data", "_next_node"]

    def __init__(self, data, next_node=None):
        self._data = data
        self._next_node = next_node

    def __str__(self):
        return str(self._data)

    def __repr__(self):
        return repr(self._data)

    @property
    def data(self):
        return self._data

    @property
    def next_node(self):
        return self._next_node

    def link_node(self, next_node):
        """Make ``next_node`` the successor of this cell.

        ``None`` clears the link. Anything that does not look like a cell
        (no ``_next_node`` attribute) is wrapped in a new ``Node`` first.
        """
        if next_node is not None and not hasattr(next_node, "_next_node"):
            next_node = Node(next_node)
        self._next_node = next_node


class LinkedList(object):
    """Singly linked list that inserts at the head."""

    def __init__(self, head=None):
        if head is not None and not isinstance(head, Node):
            self._head = Node(head)
        else:
            self._head = head

    def __repr__(self):
        return repr([repr(node) for node in self.iter_links()])

    def __str__(self):
        return ','.join(str(node) for node in self.iter_links())

    def __len__(self):
        return sum(1 for _ in self.iter_links())

    def set_head(self, head):
        self._head = head

    def insert(self, node):
        """Put ``node`` (wrapped in a ``Node`` if necessary) in front of the current head."""
        if not isinstance(node, Node):
            node = Node(node)
        node.link_node(self._head)
        self._head = node

    def insert_with_lookup(self, node):
        """
        Inserts a node if the data it contains is not equal to the one
        stored in the head node. An empty list always accepts the node.
        """
        if not isinstance(node, Node):
            node = Node(node)
        if self._head is None or node.data != self._head.data:
            self.insert(node)

    def iter_links(self):
        """Yield the cells from head to tail."""
        current_node = self._head
        while current_node is not None:
            yield current_node
            current_node = current_node.next_node
linked_list = LinkedList(1)
linked_list.insert(2)
linked_list.insert(3)
Let's create one and grow it a little
print(list(linked_list.iter_links()))
The output:
[3, 2, 1]
P.S.
I don't see a single reason to use a linked list in your case.

Implementation of a Trie in Python

I programmed a Trie as a class in Python. The search and insert functions are clear, but now I have tried to write the __str__ method so that I can print the Trie on the screen. But my function doesn't work!
class Trie(object):
    """Character trie: ``children`` maps one key element to a sub-trie, ``val`` holds the stored value."""

    def __init__(self):
        self.children = {}
        self.val = None

    def __str__(self):
        # A node without children renders as a separator; any other node
        # renders each child key followed by that child's rendering.
        if not self.children:
            return ' | '
        return ''.join(
            key + child.__str__() for key, child in self.children.items()
        )

    def insert(self, key, val):
        """Store ``val`` under ``key``, creating intermediate nodes as needed."""
        if not key:
            self.val = val
            return
        head, rest = key[0], key[1:]
        if head not in self.children:
            self.children[head] = Trie()
        self.children[head].insert(rest, val)
Now if I create a Object of Trie:
tr = Trie()
tr.insert('hallo', 54)
tr.insert('hello', 69)
tr.insert('hellas', 99)
When I now print the Trie, the problem occurs that the entries hello and hellas are not printed completely.
print tr
hallo | ellas | o
How can i solve that problem?.
Why not have str actually dump out the data in the format that it is stored:
def __str__(self):
    """Render the trie as nested ``{'key':child,...}`` text; a childless node renders as its value."""
    if not self.children:
        return str(self.val)
    entries = [
        "'" + key + "':" + child.__str__()
        for key, child in self.children.items()
    ]
    return '{' + ','.join(entries) + '}'
Which results in:
{'h':{'a':{'l':{'l':{'o':54}}},'e':{'l':{'l':{'a':{'s':99},'o':69}}}}}
There are several issues you're running into. The first is that if you have several children at the same level, you'll only be prefixing one of them with the initial part of the string, and just showing the suffix of the others. Another issue is that you're only showing leaf nodes, even though you can have terminal values that are not at a leaf (consider what happens when you use both "foo" and "foobar" as keys into a Trie). Finally, you're not outputting the values at all.
To solve the first issue, I suggest using a recursive generator that does the traversal of the Trie. Separating the traversal from __str__ makes things easier since the generator can simply yield each value we come across, rather than needing to build up a string as we go. The __str__ method can assemble the final result easily using str.join.
For the second issue, you should yield the current node's key and value whenever self.val is not None, rather than only at leaf nodes. As long as you don't have any way to remove values, all leaf nodes will have a value, but we don't actually need any special casing to detect that.
And for the final issue, I suggest using string formatting to make a key:value pair. (I suppose you can skip this if you really don't need the values.)
Here's some code:
def traverse(self, prefix=""):
    """Yield ``"key:value"`` for this node and every descendant that has a value.

    ``prefix`` is the key accumulated on the way down to this node.
    """
    if self.val is not None:
        yield "{}:{}".format(prefix, self.val)
    for letter in self.children:
        yield from self.children[letter].traverse(prefix + letter)

def __str__(self):
    entries = self.traverse()
    return " | ".join(entries)
If you're using a version of Python before 3.3, you'll need to replace the yield from statement with an explicit loop to yield the items from the recursive calls:
for item in child.traverse(prefix + letter)
yield item
Example output:
>>> t = Trie()
>>> t.insert("foo", 5)
>>> t.insert("bar", 10)
>>> t.insert("foobar", 100)
>>> str(t)
'bar:10 | foo:5 | foobar:100'
You could go with a simpler representation that just provides a summary of what the structure contains:
class Trie:
    """Trie over arbitrary key sequences that records membership only.

    Each node has a ``final`` flag (a key ends here) and a mapping from key
    element to child node. ``len()`` is the number of direct children and
    truthiness is the ``final`` flag.
    """

    def __init__(self):
        self.__final = False
        self.__nodes = {}

    def __repr__(self):
        return 'Trie<len={}, final={}>'.format(len(self), self.__final)

    def __getstate__(self):
        return self.__final, self.__nodes

    def __setstate__(self, state):
        self.__final, self.__nodes = state

    def __len__(self):
        return len(self.__nodes)

    def __bool__(self):
        return self.__final

    def __contains__(self, array):
        # ``in`` coerces the returned node to bool, i.e. to its final flag.
        try:
            found = self[array]
        except KeyError:
            return False
        return found

    def __iter__(self):
        """Yield this node, then every descendant node, depth first."""
        yield self
        for child in self.__nodes.values():
            yield from child

    def __getitem__(self, array):
        return self.__get(array, False)

    def create(self, array):
        """Mark ``array`` as a stored key, adding any missing nodes."""
        target = self.__get(array, True)
        target.__final = True

    def read(self):
        """Yield every stored key as a list of its elements."""
        yield from self.__read([])

    def update(self, array):
        """Mark the existing node for ``array`` as final; KeyError if the path is missing."""
        self[array].__final = True

    def delete(self, array):
        """Clear the final flag on the existing node for ``array``; KeyError if the path is missing."""
        self[array].__final = False

    def prune(self):
        # NOTE(review): the original indentation was lost. This layout assumes
        # the emptiness check runs once, after the loop - confirm against the
        # upstream snippet before relying on prune().
        for key, value in tuple(self.__nodes.items()):
            if not value.prune():
                del self.__nodes[key]
        if not len(self):
            self.delete([])
        return self

    def __get(self, array, create):
        # Walk one element per call; an empty remainder means we have arrived.
        if not array:
            return self
        head, *tail = array
        if create and head not in self.__nodes:
            self.__nodes[head] = Trie()
        return self.__nodes[head].__get(tail, create)

    def __read(self, name):
        if self.__final:
            yield name
        for key, value in self.__nodes.items():
            yield from value.__read(name + [key])
Instead of your current strategy for printing, I suggest the following strategy instead:
Keep a list of all characters in order that you have traversed so far. When descending to one of your children, push its character on the end of its list. When returning, pop the end character off of the list. When you are at a leaf node, print the contents of the list as a string.
So say you have a trie built out of hello and hellas. This means that as you descend to hello, you build a list h, e, l, l, o, and at the leaf node you print hello, return once to get (hell), push a, s and at the next leaf you print hellas. This way you re-print letters earlier in the tree rather than having no memory of what they were and missing them.
(Another possibility is to just descend the tree and, whenever you reach a leaf node, walk back up through your parent, your parent's parent, your parent's parent's parent, and so on, keeping track of the letters you encounter, then reverse the list you built and print it. But this may be less efficient.)

Categories