Create different lists of dicts from a master list of dicts - Python

I have a list of dictionaries as follows:
listDict = [{'name': 'A',
             'fun': 'funA',
             'childs': [{'name': 'B',
                         'fun': 'funB',
                         'childs': [{'name': 'D',
                                     'fun': 'funD'}]},
                        {'name': 'C',
                         'fun': 'funC',
                         'childs': [{'name': 'E',
                                     'fun': 'funE'},
                                    {'name': 'F',
                                     'fun': 'funF'},
                                    {'name': 'G',
                                     'fun': 'funG',
                                     'childs': [{'name': 'H',
                                                 'fun': 'funH'}]}]}]},
            {'name': 'Z',
             'fun': 'funZ'}]
I want to create three lists of dicts from this:
1. With no child and no parent:
lod1 = [{'name': 'Z',
         'fun': 'funZ'}]
2. With no child but having a parent, with the parent as key:
lod2 = [{'B': [{'name': 'D',
                'fun': 'funD'}]},
        {'C': [{'name': 'E',
                'fun': 'funE'},
               {'name': 'F',
                'fun': 'funF'}]},
        {'G': [{'name': 'H',
                'fun': 'funH'}]}]
3. With only the parent-child nodes, as a flat list with the parent as key:
lod3 = [{'A': [{'name': 'B',
                'fun': 'funB'},
               {'name': 'C',
                'fun': 'funC'}]},
        {'C': [{'name': 'G',
                'fun': 'funG'}]}]
Is there any possible way to do this, with or without recursion? The purpose of this division is that I am trying to create a flat class structure, where all nodes in category 1 (no child and no parent) are added as functions of a final class. All nodes with no child but having a parent (category 2) are added as functions of their respective parent class. And the remaining parent-child nodes (category 3) will be created as classes, with their children holding an instance of the parent.

This is a task for which the Visitor Pattern is appropriate. You have a tree-like structure and you wish to traverse it, accumulating the three different sets of information.
To implement this well, you should separate the traversal of the structure from the data collection. That way you only need to define the different forms of data collection rather than reimplementing the visitor each time. So let's start with that.
The visitor will take a dictionary, consider it, and then visit all of the dictionaries in its childs list (you may wish to rename this to children).
from abc import ABCMeta, abstractmethod

class DictionaryVisitor(object):
    __metaclass__ = ABCMeta

    @abstractmethod
    def visit(self, node, parents, result):
        """ This inspects the current node and
            accumulates any data into result """
        pass  # Implement this in the subclasses

    def accept(self, node, parents, result):
        """ This tests if the node should be traversed. This is an efficiency
            improvement to prevent traversing lots of nodes that you have no
            interest in """
        return True

    def traverse(self, node, parents, result):
        """ This traverses the dictionary, visiting each node in turn """
        if not self.accept(node, parents, result):
            return
        self.visit(node, parents, result)
        if 'childs' in node:
            for child in node['childs']:
                self.traverse(child, parents + [node], result)

    def start(self, dict_list):  # bad method name
        """ This just handles the parents and result arguments of traverse """
        # Assuming that result is always a list is not normally appropriate
        result = []
        for node in dict_list:
            self.traverse(node, [], result)
        return result
You can then implement the different required outputs as subclasses of this abstract base class:
class ParentlessChildlessVisitor(DictionaryVisitor):
    def visit(self, node, parents, result):
        """ Collect the nodes that have no parents or children """
        # parent filtering is performed in accept
        if 'childs' not in node:
            result.append(node)

    def accept(self, node, parents, result):
        """ Reject all nodes with parents """
        return not parents
Then you can call it:
visitor = ParentlessChildlessVisitor()
results = visitor.start(listDict)
print results
# prints [{'fun': 'funZ', 'name': 'Z'}]
The next one:
from collections import defaultdict

class ChildlessChildVisitor(DictionaryVisitor):
    def visit(self, node, parents, result):
        """ Collect the nodes that have parents but no children """
        if parents and 'childs' not in node:
            # slightly odd data structure here, a list of dicts where the only
            # dict key is unique. It would be better to be a plain dict, which
            # is what is done here:
            result[parents[-1]['name']].append(node)

    def start(self, dict_list):
        """ This just handles the parents and result arguments of traverse """
        # Here it is much better to have a dict as the result.
        # This is an example of why wrapping all this logic in the start method
        # is not normally appropriate.
        result = defaultdict(list)
        for node in dict_list:
            self.traverse(node, [], result)
        return result
visitor = ChildlessChildVisitor()
results = visitor.start(listDict)
print dict(results)
# prints {'C': [{'fun': 'funE', 'name': 'E'}, {'fun': 'funF', 'name': 'F'}], 'B': [{'fun': 'funD', 'name': 'D'}], 'G': [{'fun': 'funH', 'name': 'H'}]}
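If you do want the exact list-of-single-key-dicts shape asked for as lod2, the defaultdict can be converted afterwards; a minimal sketch, assuming the results value produced above:
# Convert the defaultdict into the list-of-single-key-dicts shape from the
# question (the order follows the dict's iteration order):
lod2 = [{parent: children} for parent, children in results.items()]
# e.g. [{'C': [...]}, {'B': [...]}, {'G': [...]}]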
It is not entirely clear to me what you want to collect with the last example so you will have to handle that one yourself.
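That said, if lod3 is meant to collect the nodes that have both a parent and children of their own, grouped under their parent's name (one possible reading of the sample output), a third subclass along the same lines might look like the sketch below; treat that reading, and the class name, as assumptions.
class ParentedParentVisitor(DictionaryVisitor):
    def visit(self, node, parents, result):
        """ Collect nodes that have a parent and children, without their 'childs' key """
        if parents and 'childs' in node:
            stripped = {k: v for k, v in node.items() if k != 'childs'}
            result[parents[-1]['name']].append(stripped)

    def start(self, dict_list):
        result = defaultdict(list)
        for node in dict_list:
            self.traverse(node, [], result)
        return result
The same items() conversion shown above would then give the list-of-dicts shape.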

Related

Modifying Instance Attribute Unexpected Behavior

I'm trying to implement a Graph/Node data structure to store some connections between synsets in WordNet.
import nltk
from nltk.corpus import wordnet as wn

# a list of [wn.synset, Node] pairs
syns_node_dict = []

# gets a node from a wn.synset
def nodeFromSyn(syn):
    for row in syns_node_dict:
        if row[0] == syn:
            return row[1]
    return False

def display_dict():
    for row in syns_node_dict:
        curr_nde = row[1]
        display([curr_nde._value, curr_nde._parents, curr_nde._children])
# Graph Node data struct
class Node:
    # value is a wn.synset
    # parents, children are both lists of wn.synset
    def __init__(self, value, parents=[], children=[]):
        curr = nodeFromSyn(value)
        # if a Node for value already exists, merge attributes
        if curr:
            for p in parents:
                if p not in curr._parents: curr._parents.append(p)
            for c in children:
                if c not in curr._children: curr._children.append(c)
            self = curr
        # if a Node for value does not exist, create a new Node for value
        else:
            syns_node_dict.append([value, self])
            self._value = value
            self._parents = parents
            self._children = children

            # Create a Node for each of self's parents if it does not already exist
            # and add self as a child
            for parent in self._parents:
                parent_node = nodeFromSyn(parent)
                if parent_node:
                    if value not in parent_node._children:
                        parent_node._children.append(value)
                else:
                    parent_node = Node(parent, children=[value])

            # Create a Node for each of self's children if it does not already exist
            # and add self as a parent
            for child in self._children:
                child_node = nodeFromSyn(child)
                if child_node:
                    if value not in child_node._parents:
                        child_node._parents.append(value)
                else:
                    child_node = Node(child, parents=[value])
However, I'm seeing some strange behavior when I attempt to run my code:
Node(wn.synset('condition.n.01'), [], children=[wn.synset('difficulty.n.03')])
display_dict()
>[Synset('condition.n.01'), [], [Synset('difficulty.n.03')]]
>[Synset('difficulty.n.03'), [Synset('condition.n.01')], []]
Node(wn.synset('state.n.02'), children=[wn.synset('condition.n.01')])
display_dict()
>[Synset('condition.n.01'), [Synset('state.n.02')], [Synset('difficulty.n.03')]]
>[Synset('difficulty.n.03'), [Synset('condition.n.01')], []]
>[Synset('state.n.02'), [], [Synset('condition.n.01')]]
When I run the following lines of code I expect the last line to be
[Synset('attribute.n.02'), [], [Synset('state.n.02')]].
Why does the Node for wn.synset('attribute.n.02') contain itself among its parents?
Node(wn.synset('attribute.n.02'), children=[wn.synset('state.n.02')])
display_dict()
>[Synset('condition.n.01'), [Synset('state.n.02')], [Synset('difficulty.n.03')]]
>[Synset('difficulty.n.03'), [Synset('condition.n.01')], []]
>[Synset('state.n.02'), [Synset('attribute.n.02')], [Synset('condition.n.01')]]
>[Synset('attribute.n.02'), [Synset('attribute.n.02')], [Synset('state.n.02')]]
I isolated the issue to be within the block below but I can't figure out what's causing this behavior.
# Create a Node for each of self's children if it does not already exist
# and add self as a parent
for child in self._children:
    child_node = nodeFromSyn(child)
    if child_node:
        if value not in child_node._parents:
            child_node._parents.append(value)
    else:
        child_node = Node(child, parents=[value])
The reason this is happening is that default parameter values are evaluated once, when the function is defined, not each time it is called. Because of this, all the nodes you create with the parents (or children) parameter left blank are actually sharing the same list as their parent list. For the first two nodes this isn't a problem, because you pass an explicit empty list for the first node and the first node creates a list for the second, but all the nodes after that share the same list (which becomes more obvious if you keep creating more nodes).
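A minimal, self-contained demonstration of the pitfall (the append_one function is hypothetical, not part of the original code):
def append_one(items=[]):    # the default list is created once, at definition time
    items.append(1)
    return items

print(append_one())  # [1]
print(append_one())  # [1, 1] -- the same list object is reused across calls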
A way to solve this problem, outlined here, is to make the default parameters None, then check whether each parameter is None and create an empty list if so:
# modify the init function so that the default parameters are None
def __init__(self, value, parents=None, children=None):
    # if they are left as None, set them to an empty list
    if parents is None:
        parents = []
    if children is None:
        children = []
    # The rest of your code here...

loop/recursion to handle hierarchical structures in Python

I was wondering if there's a more efficient way of looping through a set of objects that might have some dependencies (child-parent relationship) between them, e.g.:
dependencies = [(child1,parent1),(child2,parent2),...]
obj_stack = [{'id':'child1'},{'id':'child2'},{'id':'parent1'},..]
new_stack = [{'id':'child1'},{'id':'child3'}]
check_and_add_to_new_stack(obj) # function that decides if an obj should be added to the new stack
I want to check each obj in obj_stack and add objs into new_stack under two conditions:
1. it has no parent and check_and_add_to_new_stack(obj) adds it (let's assume it does this correctly)
2. it has a parent; then add it directly (no need to use the above function)
This means that I need to check if an element has a parent, and then check and add its parent first. If the parent gets added, I come back to that element. I am getting stuck on the recursive loop.
Here is the pseudo-code:
def check_and_add_to_new_stack(obj, stack):
    if passed_checks(obj):
        return add_to_new_stack(obj, stack)
    return stack

def myFunction(obj_stack, new_stack, dependencies):
    for obj in obj_stack:
        if obj is not in new_stack:              # pseudo-code
            if obj has parent in dependencies:   # pseudo-code
                myFunction([parent], new_stack, dependencies)
            else:  # here the original obj should be thrown back into the function
                new_stack += check_and_add_to_new_stack(obj)
    return new_stack
Edit: adding the result that I am expecting and more details:
Let's assume that
passed_checks(parent1) = False
passed_checks(parent2) = True
passed_checks(child1) = True
passed_checks(child2) = False
The expected result is:
myFunction(obj_stack, new_stack, dependencies)
> [{'id':'child1'},{'id':'child3'},{'id':'child2'},{'id':'parent2'}]
Even though passed_checks(child2) = False, it has parent2, for which passed_checks = True, so both get added to the resulting set. child1 was already in new_stack. parent1 did not get added because passed_checks = False.
I think you might be looking for something like this.
walk_ancestry yields node IDs for a given node and all of its parents based on the parents dict (which is basically dict(dependencies) from your original code)
check is your check function – I just copied your condition there
The all_passing set comprehension iterates over all of our known object names (obj_stack in your original code) and uses the built-in any function to see if any of the nodes in that node's ancestry pass the check() test. If so, it's considered passing.
This could be made faster by caching and memoization, but for small enough graphs, I'm pretty sure this works out alright.
def walk_ancestry(parents, node):
    while True:
        yield node
        node = parents.get(node)
        if not node:
            break

def check(node):
    return node in {"parent2", "child1"}

parents = {
    "child1": "parent1",
    "child2": "parent2",
    "parent2": "parent3",
}

all_objects = {
    "parent1",
    "parent2",
    "parent3",
    "child1",
    "child2",
}

all_passing = {
    node
    for node in all_objects
    if any(check(n) for n in walk_ancestry(parents, node))
}

print(all_passing)
The output (a set, so the order may vary) is:
{'parent2', 'child2', 'child1'}
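As noted above, for larger graphs the ancestry walk can be cached; a minimal sketch using functools.lru_cache (the passes_with_ancestry helper is an assumption, not part of the original answer):
from functools import lru_cache

@lru_cache(maxsize=None)
def passes_with_ancestry(node):
    # A node passes if it passes itself or if any ancestor does;
    # results are cached, so shared ancestors are only checked once.
    if check(node):
        return True
    parent = parents.get(node)
    return passes_with_ancestry(parent) if parent else False

all_passing = {node for node in all_objects if passes_with_ancestry(node)}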

Trigger method when child is added to tree

Suppose I have the following piece of Python code to create a forest containing a bunch of trees.
NEXT_INDEX = 0

class Node:
    """ A node of a tree """
    def __init__(self):
        # Each node gets a unique id
        global NEXT_INDEX
        self._index = NEXT_INDEX
        NEXT_INDEX += 1
        # any node may have an arbitrary number of children
        self._children = list()
        self._parent = None

    def add_child(self, node):
        node._parent = self
        self._children.append(node)

    def __str__(self):
        return "node {}".format(self._index)

class Forest:
    """ A bunch of trees """
    def __init__(self):
        # contains the root nodes of a whole bunch of trees
        self._trees = list()

    def add_node(self, node):
        # the new node will be the root node for a new tree in self._trees
        self._trees.append(node)

    def find_node(self, idx):
        """
        Search all trees in self._trees for a node with index = idx
        and return that node.
        """
        # Implementation not relevant here
        pass

    def on_add_child(self, child):
        # should be executed each time add_child is called on a node with the
        # new child as a parameter
        print("on_add_child with child = {}".format(child))
I would like to execute a method, "on_add_child", each time a child is added to any node in any of the trees stored in Forest._trees.
Important: The print statement has to be in the Forest class. In the real code Forest maintains a search index of nodes and whenever a new child node is added, the new node has to be added to the search index. Adding a reference to Forest to Node (so that Node.add_child could call Forest.on_add_child) is unfortunately not an option either, because it would introduce a circular dependency between Node and Forest.
Example: Say I executed the following code:
forest = Forest()
node_0 = Node()
node_1 = Node()
node_2 = Node()
node_3 = Node()
node_4 = Node()
# We add the first node to the forest: It will become the root of the first tree
forest.add_node(node_0)
# Add node_1 as a child to node_0; This should execute on_add_child(node_1) and
# print "on_add_child with child = node 1"
forest.find_node(0).add_child(node_1)
# Should print "on_add_child with child = node 2"
# => on_add_child is also triggered when we add a child to a non-root node
forest.find_node(1).add_child(node_2)
# Create a second tree
forest.add_node(node_3)
# Should print "on_add_child with child = node 4"
forest.find_node(3).add_child(node_4)
How can this be accomplished? I am aware of Python properties, and I have found several related questions about how to use properties together with Python lists (e.g. Python property on a list, Python decorating property setter with list, python: how to have a property and with a setter function that detects all changes that happen to the value), but in my case it is not just a list but also a tree structure, and I couldn't get this combination to work.
If you need an action in the Forest class to be initiated, you can call it in add_child as well. So if vertices need to be updated, just update them every time add_child is called. You will also need to keep track of which forest the node is in by passing that into the constructor.
def add_child(self, node):
    node._parent = self
    self._children.append(node)
    self._forest.on_add_child(node)
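A minimal sketch of the constructor change this implies, reusing the Forest class from the question (the forest parameter is an assumption, and the index bookkeeping from the question is omitted for brevity):
class Node:
    def __init__(self, forest):
        # remember the owning forest so add_child can notify it
        self._forest = forest
        self._children = list()
        self._parent = None

    def add_child(self, node):
        node._parent = self
        self._children.append(node)
        self._forest.on_add_child(node)

# usage: nodes are created with the forest they belong to
forest = Forest()
root = Node(forest)
forest.add_node(root)
root.add_child(Node(forest))  # triggers forest.on_add_child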

Implementing a depth-first tree iterator in Python

I'm trying to implement an iterator class for not-necessarily-binary trees in Python. After the iterator is constructed with a tree's root node, its next() function can be called repeatedly to traverse the tree in depth-first order (e.g., this order), finally returning None when there are no nodes left.
Here is the basic Node class for a tree:
class Node(object):
    def __init__(self, title, children=None):
        self.title = title
        self.children = children or []
        self.visited = False

    def __str__(self):
        return self.title
As you can see above, I introduced a visited property to the nodes for my first approach, since I didn't see a way around it. With that extra measure of state, the Iterator class looks like this:
class Iterator(object):
    def __init__(self, root):
        self.stack = []
        self.current = root

    def next(self):
        if self.current is None:
            return None
        self.stack.append(self.current)
        self.current.visited = True
        # Root case
        if len(self.stack) == 1:
            return self.current
        while self.stack:
            self.current = self.stack[-1]
            for child in self.current.children:
                if not child.visited:
                    self.current = child
                    return child
            self.stack.pop()
This is all well and good, but I want to get rid of the need for the visited property, without resorting to recursion or any other alterations to the Node class.
All the state I need should be taken care of in the iterator, but I'm at a loss about how that can be done. Keeping a visited list for the whole tree is non-scalable and out of the question, so there must be a clever way to use the stack.
What especially confuses me is this: since the next() function, of course, returns, how can I remember where I've been without marking anything or using excess storage? Intuitively, I think of looping over children, but that logic is broken/forgotten when the next() function returns!
UPDATE - Here is a small test:
tree = Node(
    'A', [
        Node('B', [
            Node('C', [
                Node('D')
            ]),
            Node('E'),
        ]),
        Node('F'),
        Node('G'),
    ])

iter = Iterator(tree)
out = object()
while out:
    out = iter.next()
    print out
If you really must avoid recursion, this iterator works:
from collections import deque

def node_depth_first_iter(node):
    stack = deque([node])
    while stack:
        # Pop out the first element in the stack
        node = stack.popleft()
        yield node
        # push children onto the front of the stack.
        # Note that with deque.extendleft, the first one in is the last
        # one out, so we need to push them in reverse order.
        stack.extendleft(reversed(node.children))
With that said, I think that you're thinking about this too hard. A good-ole' (recursive) generator also does the trick:
class Node(object):
    def __init__(self, title, children=None):
        self.title = title
        self.children = children or []

    def __str__(self):
        return self.title

    def __iter__(self):
        yield self
        for child in self.children:
            for node in child:
                yield node
both of these pass your tests:
expected = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

# Test recursive generator using Node.__iter__
assert [str(n) for n in tree] == expected

# test non-recursive Iterator
assert [str(n) for n in node_depth_first_iter(tree)] == expected
and you can easily make Node.__iter__ use the non-recursive form if you prefer:
def __iter__(self):
    return node_depth_first_iter(self)
"That could still potentially hold every label, though. I want the iterator to keep only a subset of the tree at a time."
But you already are holding everything. Remember that an object is essentially a dictionary with an entry for each attribute. Having self.visited = False in the __init__ of Node means you are storing a redundant "visited" key and False value for every single Node object no matter what. A set, at least, also has the potential of not holding every single node ID. Try this:
class Iterator(object):
    def __init__(self, root):
        self.visited_ids = set()
        ...

    def next(self):
        ...
        #self.current.visited = True
        self.visited_ids.add(id(self.current))
        ...
        #if not child.visited:
        if id(child) not in self.visited_ids:
Looking up the ID in the set should be just as fast as accessing a node's attribute. The only way this can be more wasteful than your solution is the overhead of the set object itself (not its elements), which is only a concern if you have multiple concurrent iterators (which you obviously don't, otherwise the node visited attribute couldn't be useful to you).

My recursive function (populates a tree structure) is adding to the root node during every loop/call

I have an algorithm to populate a tree-like structure (class: Scan_instance_tree), but unfortunately, during each call, it is incorrectly adding to the root node's children, as well as to the new child nodes created further down in the tree.
As a clue, I saw another thread...
Persistent objects in recursive python functions
...where this problem was mentioned briefly, and it was suggested that the parameters passed had to be mutable. Is that the answer, and if so, how would I do this in this example?
Here is my current code:
class Field_node(object):
    field_phenotype_id = -1
    field_name = ''
    field_parent_id = -1
    child_nodes = []

class Scan_instance_tree(object):
    root_node = None

    def __init__(self, a_db):
        self.root_node = Field_node()
        # This just creates a temporary user-friendly version of a database table
        scan_field_values = self.create_scan_field_values(a_db)
        self.build_tree(scan_field_values)

    def build_tree(self, a_scan_field_values):
        self.root_node.field_name = 'ROOT'
        self.add_child_nodes(a_scan_field_values, self.root_node)

    def add_child_nodes(self, a_scan_field_values, a_parent_node):
        i = 0
        while i < len(a_scan_field_values):
            if a_scan_field_values[i]['field_parent_dependancy'] == a_parent_node.field_phenotype_id:
                #highest_level_children.append(a_scan_field_values.pop(a_scan_field_values.index(scan_field)))
                child_node = Field_node()
                child_node.field_phenotype_id = a_scan_field_values[i]['field_phenotype_id']
                child_node.field_name = a_scan_field_values[i]['field_name']
                child_node.field_parent_dependancy = a_scan_field_values[i]['field_parent_dependancy']
                a_parent_node.child_nodes.append(child_node)
                a_scan_field_values.remove(a_scan_field_values[i])
                # RECURSION: get the child nodes
                self.add_child_nodes(a_scan_field_values, child_node)
            else:
                i = i + 1
If I remove the recursive call to self.add_child_nodes(...), the root's children are added correctly, i.e. they consist only of those nodes where field_parent_dependancy == -1.
If I allow the recursive call, the root's children contain all the nodes, regardless of the field_parent_dependancy value.
Best regards
Ann
When you define your Field_node class, the line
child_nodes = []
actually creates a single list as a class attribute, rather than an instance attribute, and that list is shared by all instances of the class.
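A quick demonstration of that sharing, with the class as originally defined:
a = Field_node()
b = Field_node()
a.child_nodes.append('added via a')
print(b.child_nodes)  # ['added via a'] -- both instances see the same class-level list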
What you should do instead is create instance attributes in __init__, e.g.:
class Field_node(object):
    def __init__(self):
        self.field_phenotype_id = -1
        self.field_name = ''
        self.field_parent_id = -1
        self.child_nodes = []
