I am trying to generate custom JSON in Python using the following code:
root={}
Levels=[['L1','L1','L2'],
['L1','L1','L3'],
['L1','L2'],
['L2','L2','L3'],
['L2','L2','L1'],
['L3','L2'],
['L4','L2','L1'],
['L4','L2','L4']]
def append_path(root, paths):
    """Merge one path of labels into the nested-dict tree ``root``.

    Every label becomes a key whose value is the dict for the next level.
    Keys that already exist are reused, so paths sharing a prefix share
    the same sub-dicts.  ``root`` is modified in place; nothing is returned.
    An empty ``paths`` leaves the tree untouched.
    """
    node = root
    for label in paths or ():
        # setdefault returns the existing sub-dict or installs a fresh one
        node = node.setdefault(label, {})
for p in Levels:
append_path(root, p)
def convert(d):
    """Turn a nested-dict tree into a list of ``{"name", "children"}`` nodes.

    Returns a ``(nodes, leaf_count)`` tuple.  Each node name is the key
    followed by the number of leaves below it in parentheses.  An empty
    dict is a leaf: it yields ``([{}], 1)``.
    """
    if len(d.items()) == 0:
        return ([{}], 1)

    nodes = []
    leaf_total = 0
    for label, subtree in d.items():
        kids, leaf_count = convert(subtree)
        leaf_total += leaf_count
        nodes.append({
            "name": label + "(" + str(leaf_count) + ")",
            'children': kids or [{}],
        })
    return (nodes, leaf_total)
# Print results
import json
print(json.dumps(convert(root)[0], indent=2))
and the OUTPUT is
[
{
"name": "L1(3)",
"children": [
{
"name": "L1(2)",
"children": [
{
"name": "L2(1)",
"children": [
{}
]
},
{
"name": "L3(1)",
"children": [
{}
]
}
]
},
{
"name": "L2(1)",
"children": [
{}
]
}
]
},
{
"name": "L2(2)",
"children": [
{
"name": "L2(2)",
"children": [
{
"name": "L3(1)",
"children": [
{}
]
},
{
"name": "L1(1)",
"children": [
{}
]
}
]
}
]
},
{
"name": "L3(1)",
"children": [
{
"name": "L2(1)",
"children": [
{}
]
}
]
},
{
"name": "L4(2)",
"children": [
{
"name": "L2(2)",
"children": [
{
"name": "L1(1)",
"children": [
{}
]
},
{
"name": "L4(1)",
"children": [
{}
]
}
]
}
]
}
]
My dataset has changed a little bit
Levels=[[['L1','L1','L2'],[10,20,30]],
[[['L1','L1','L3'],[10,15,20]],
[[['L1','L2'],[20,10]],
[[['L2','L2','L3'],[20,20,30]],
[[['L2','L2','L1'],[10,20,30]]
[[['L3','L2'],[10,20]]
[[['L4','L2','L1'],[10,20,10]]
[[['L4','L2','L4'],[20,40,50]]]
and the output that I want is the average of the levels along with the count
[
{
"name": "L1(3)#(13)", // taking avg of 10,10,20
"children": [
{
"name": "L1(2)#(17)", // taking avg of 20,15
"children": [
{
"name": "L2(1)#(30)",
"children": [
{}
]
},
{
"name": "L3(1)#(20)",
"children": [
{}
]
}
]
},
{
"name": "L2(1)#10",
"children": [
{}
]
}
]
},
{
"name": "L2(2)#(15)", // avg of 20,10
"children": [
{
"name": "L2(2)#(20)", // avg of 20,20
"children": [
{
"name": "L3(1)#(30)",
"children": [
{}
]
},
{
"name": "L1(1)#(30)",
"children": [
{}
]
}
]
}
]
},
{
"name": "L3(1)#(10)",
"children": [
{
"name": "L2(1)#(10)",
"children": [
{}
]
}
]
},
{
"name": "L4(2)#(15)",// avg of 10,20
"children": [
{
"name": "L2(2)#(30)", // avg of 20,40
"children": [
{
"name": "L1(1)# (10)",
"children": [
{}
]
},
{
"name": "L4(1)#(50)",
"children": [
{}
]
}
]
}
]
}
]
How can I change my code to add this information?
Preface
Before moving onto the solution, here are some points I want to mention:
Make use of Python's object-oriented programming features! This makes the data structure much clearer to both yourself and future readers.
Using a custom class also makes it easier for us to store the metadata – i.e. the number of instances of a node and its total value – while constructing the intermediate tree structure, rather than while converting it. This is also more efficient because with the latter method, a simple naive traversal algorithm would make duplicate visits to the nodes!
If you want your output to (reliably) maintain the order in which the paths were inserted, you should use an OrderedDict (from collections) instead of an ordinary dict ({}).
It is more logical to output an empty list for nodes with no children than a list with a single empty dict:
// Before
"children": [
{}
]
// After
"children": []
The reason being that any software which will parse this data later can safely assume that all objects have the "name" and "children" fields, which an empty dict does not.
The list boundaries and elements in the Levels array seem to be poorly formed; did you mean:
Levels = [
[['L1','L1','L2'],[10,20,30]],
[['L1','L1','L3'],[10,15,20]],
[['L1','L2'],[20,10]],
[['L2','L2','L3'],[20,20,30]],
[['L2','L2','L1'],[10,20,30]],
[['L3','L2'],[10,20]],
[['L4','L2','L1'],[10,20,10]],
[['L4','L2','L4'],[20,40,50]],
]
While on the subject of the data, since the nodes and values obey 1-to-1 mappings (within each path), it would be more appropriate to use a list of tuples rather than a list of two parallel lists:
Levels = [
[('L1', 10), ('L1', 20), ('L2', 30)],
[('L1', 10), ('L1', 15), ('L3', 20)],
[('L1', 20), ('L2', 10)],
[('L2', 20), ('L2', 20), ('L3', 30)],
[('L2', 10), ('L2', 20), ('L1', 30)],
[('L3', 10), ('L2', 20)],
[('L4', 10), ('L2', 20), ('L1', 10)],
[('L4', 20), ('L2', 40), ('L4', 50)]
]
There seems to be a mistake in your expected output:
{
"name": "L3(1)#(10)",
"children": [
{
"name": "L2(1)#(10)", <--- this should be #(20)
"children": [
{}
]
}
]
},
Implementations
For your current data format (pair of lists):
# A dictionary here corresponds to an array of nodes in JSON
# the "name" fields serve as the keys and "children" as the values
class data_node(OrderedDict):
    """Ordered mapping of child name -> ``data_node`` that also keeps statistics.

    ``instances`` counts how many inserted paths passed through this node and
    ``total`` is the sum of the values those paths supplied for it.
    """

    def __init__(self, **kwargs):
        super(data_node, self).__init__(**kwargs)
        self.instances = 0
        self.total = 0

    def insert(self, names, values):
        """Add one path below this node, updating counters along the way.

        ``names`` and ``values`` are parallel sequences: ``values[i]`` is the
        value attached to ``names[i]``.  Insertion stops at the end of the
        shorter sequence, so mismatched lengths no longer raise IndexError.
        Empty (or ``None``) input is a no-op.
        """
        node = self
        for name, value in zip(names or (), values or ()):
            child = node.get(name)
            # compare against None: an existing but childless node is an
            # empty mapping and therefore falsy
            if child is None:
                child = node[name] = data_node()
            # add the value to the total and increment the instance counter
            child.instances += 1
            child.total += value
            node = child

    def _mean(self):
        """Return ``total / instances``, floored when ``total`` is an integer.

        Flooring integer totals reproduces the integer averages of the
        expected output (e.g. 35 / 2 -> 17) on Python 3 as well, where ``/``
        would otherwise yield 17.5.  Non-integer totals use true division.
        """
        if isinstance(self.total, int):
            return self.total // self.instances
        return self.total / self.instances

    def convert(self):
        """Return the children as a list of ``{"name", "children"}`` dicts.

        Each name has the form ``<key>(<instances>)#(<mean>)``.
        """
        return [
            {
                "name": "{}({})#({})".format(
                    name,
                    child.instances,
                    child._mean()
                ),
                "children": child.convert()
            }
            for name, child in self.items()
        ]
# Build the tree from the [names, values] pairs, then dump it as JSON.
tree = data_node()
for path in Levels:
    tree.insert(path[0], path[1])
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the print statement is a SyntaxError on Python 3.
print(json.dumps(tree.convert(), indent=2))
For my proposed data format (list of tuples):
# only the insertion method differs
# all other parts of the class are unchanged
def insert(self, path):
    """Add one path of ``(name, value)`` pairs below this node.

    The first pair updates (or creates) the matching child's counters;
    the remaining pairs are inserted recursively below that child.
    An empty path is a no-op.
    """
    if path:
        (name, value), remainder = path[0], path[1:]
        node = self[name] if name in self else data_node()
        node.instances += 1
        node.total += value
        node.insert(remainder)
        self[name] = node
...
for path in Levels:
tree.insert(path) # simpler function signature
EDIT:
If there is a reason that you want the leaf node format to be [{}] instead of just [], only a simple change would be needed:
# in convert()
{
"name": ..., # as before
# again exploiting the truthy-ness property of arrays
"children": child.convert() or [{}]
}
Output
Both implementations give the correct JSON output, according to my comments in the preface:
[
{
"name": "L1(3)#(13)",
"children": [
{
"name": "L1(2)#(17)",
"children": [
{
"name": "L2(1)#(30)",
"children": []
},
{
"name": "L3(1)#(20)",
"children": []
}
]
},
{
"name": "L2(1)#(10)",
"children": []
}
]
},
{
"name": "L2(2)#(15)",
"children": [
{
"name": "L2(2)#(20)",
"children": [
{
"name": "L3(1)#(30)",
"children": []
},
{
"name": "L1(1)#(30)",
"children": []
}
]
}
]
},
{
"name": "L3(1)#(10)",
"children": [
{
"name": "L2(1)#(20)",
"children": []
}
]
},
{
"name": "L4(2)#(15)",
"children": [
{
"name": "L2(2)#(30)",
"children": [
{
"name": "L1(1)#(10)",
"children": []
},
{
"name": "L4(1)#(50)",
"children": []
}
]
}
]
}
]
Related
I have a list of paths:
paths = [
"root/child1/file1",
"root/child1/file2",
"root/child2/file1"
]
And I want to parse it with Python into a dict (or list of dicts) that looks like:
{
"text": "root",
"children": [
{
"text": "child1",
"children": [
{
"text": "file1",
"children": []
},
{
"text": "file2",
"children": []
}
]
},
{
"text": "child2",
"children": [
{
"text": "file2",
"children": []
}
]
}
I tried to write some recursive function, but no success. Example:
def path2dict(path, depth):
    """Return ``{'text', 'children'}`` for the first component of ``path``.

    ``children`` is built by calling this function on every component from
    index ``depth + 1`` onwards, each passed as its own one-component path.
    """
    parts = path.split('/')
    next_depth = depth + 1
    return {
        'text': parts[0],
        'children': [path2dict(part, next_depth) for part in parts[next_depth:]],
    }
paths = [
"root/child1/file1",
"root/child1/file2",
"root/child2/file1"
]
depth = 0
for path in paths:
d = path2dict(path, depth)
print(d)
Sorry for not using your existing solution, but I have some other:
def stage1(paths):
    """Fold '/'-separated path strings into one nested dict.

    Each path component becomes a key; its value is the dict holding the
    components that follow it.  Components without successors map to ``{}``.
    """
    tree = {}
    for path in paths:
        node = tree
        for part in path.split('/'):
            # descend, creating the level on first sight
            node = node.setdefault(part, {})
    return tree
def stage2(dct):
    """Convert a nested dict into a list of ``{'text', 'children'}`` nodes.

    Every key becomes one node; its value is converted recursively into
    that node's ``children`` list (empty for an empty dict).
    """
    return [
        dict(text=name, children=stage2(subtree))
        for name, subtree in dct.items()
    ]
after_stage1 = stage1(paths)
# after_stage1 is
# {
# 'root': {
# 'child1': {
# 'file1': {},
# 'file2': {}
# },
# 'child2': {
# 'file1': {}
# }
# }
# }
after_stage2 = stage2(after_stage1)
# after_stage2 contains exactly what you need
You can use itertools.groupby:
from itertools import groupby
import json
d = ['root/child1/file1', 'root/child1/file2', 'root/child2/file1']
def create_paths(paths):
    """Group pre-split paths into a list of ``{'text', 'children'}`` nodes.

    ``paths`` is a list of component lists, e.g. ``['root', 'child1']``.
    Paths are sorted and grouped by their first component; the remaining
    components of each group are converted recursively into ``children``.

    A path that ends at a node (no remaining components) contributes no
    child.  Such exhausted tails are dropped before recursing, so a path
    that is a prefix of another one (``root/a`` next to ``root/a/b``)
    neither hides the longer path nor raises IndexError.
    """
    nodes = []
    # groupby only merges adjacent items, hence the sort on the same key
    ordered = sorted(paths, key=lambda parts: parts[0])
    for head, group in groupby(ordered, key=lambda parts: parts[0]):
        tails = [parts[1:] for parts in group if len(parts) > 1]
        nodes.append({'text': head, 'children': create_paths(tails)})
    return nodes
print(json.dumps(create_paths([i.split('/') for i in d]), indent=4))
Output:
[
{
"text": "root",
"children": [
{
"text": "child1",
"children": [
{
"text": "file1",
"children": []
},
{
"text": "file2",
"children": []
}
]
},
{
"text": "child2",
"children": [
{
"text": "file1",
"children": []
}
]
}
]
}
]
I have the following code to generate json representation of list of lists.
Levels=[['L1','L1','L2'],
['L1','L1','L3'],
['L1','L2'],
['L2','L2','L3'],
['L2','L2','L1'],
['L3','L2'],
['L4','L2','L1'],
['L4','L2','L4']]
def append_path(root, paths):
    """Merge one path of labels into the nested-dict tree ``root``.

    Each label becomes a key mapping to the dict of the next level;
    existing keys are reused.  ``root`` is modified in place and nothing
    is returned.  An empty ``paths`` is a no-op.
    """
    level = root
    for name in paths or ():
        level = level.setdefault(name, {})
for p in Levels:
append_path(root, p)
def convert(d):
    """Convert a nested-dict tree into a list of ``{'name', 'children'}`` nodes.

    A key whose sub-dict is empty gets ``[{}]`` as its children; otherwise
    the sub-dict is converted recursively.
    """
    nodes = []
    for label, subtree in d.items():
        kids = convert(subtree) if subtree else [{}]
        nodes.append({'name': label, 'children': kids})
    return nodes
# Print results
import json
print(json.dumps(convert(root), indent=4))
Output:
[
"name": "L1",
"children": [
{
"name": "L1",
"children":[
{
"name":"L3",
"children":[{}]
},
{
"name":"L1",
"children":[{}]
}]
},
{
"name":"L2",
"children":[{}]
}
]
for the levels
Levels=[['L1','L1','L2'],
['L1','L1','L3'],
['L1','L2'],
I also need to encode the count of each level
For example, the path starting at L1 has two first-level children, L1(2) and L2(1), and that child L1 in turn has the children L2(1) and L3(1) at the next level.
L1(3)-->L1(2)-->L2(1)
-->L3(1)
-->L2(1)
How can I encode this count in my json output.
I want my final output to look like this
"name": "L1(3)",
"children": [
{
"name": "L1(2)",
"children":[
root={}
Levels=[['L1','L1','L2'],
['L1','L1','L3'],
['L1','L2'],
['L2','L2','L3'],
['L2','L2','L1'],
['L3','L2'],
['L4','L2','L1'],
['L4','L2','L4']]
def append_path(root, paths):
    """Walk ``paths`` label by label, creating missing levels in ``root``.

    ``root`` is a nested dict that is updated in place; levels that already
    exist are reused.  Nothing is returned, and an empty ``paths`` changes
    nothing.
    """
    cursor = root
    for step in paths or ():
        cursor = cursor.setdefault(step, {})
for p in Levels:
append_path(root, p)
def convert(d):
    """Convert a nested-dict tree into ``{"name", "children"}`` nodes with counts.

    Returns ``(nodes, leaf_count)``.  Every node name is its key followed by
    the number of leaves underneath it, e.g. ``"L1(3)"``.  An empty dict is
    treated as a leaf and yields ``([{}], 1)``.
    """
    if len(d.items()) == 0:
        return ([{}], 1)

    result = []
    total_leaves = 0
    for key, sub in d.items():
        sub_nodes, sub_leaves = convert(sub)
        total_leaves += sub_leaves
        label = key + "(" + str(sub_leaves) + ")"
        result.append({"name": label, 'children': sub_nodes or [{}]})
    return (result, total_leaves)
# Print results
import json
print(json.dumps(convert(root)[0], indent=2))
OUTPUT
[
{
"name": "L1(3)",
"children": [
{
"name": "L1(2)",
"children": [
{
"name": "L2(1)",
"children": [
{}
]
},
{
"name": "L3(1)",
"children": [
{}
]
}
]
},
{
"name": "L2(1)",
"children": [
{}
]
}
]
},
{
"name": "L2(2)",
"children": [
{
"name": "L2(2)",
"children": [
{
"name": "L3(1)",
"children": [
{}
]
},
{
"name": "L1(1)",
"children": [
{}
]
}
]
}
]
},
{
"name": "L3(1)",
"children": [
{
"name": "L2(1)",
"children": [
{}
]
}
]
},
{
"name": "L4(2)",
"children": [
{
"name": "L2(2)",
"children": [
{
"name": "L1(1)",
"children": [
{}
]
},
{
"name": "L4(1)",
"children": [
{}
]
}
]
}
]
}
]
I have a json file that contains many children, like this:
{
"tree": {
"name": "Top Level",
"children": [
{
"name": "[('server', 'Cheese')]",
"children": [
{
"name": "[('waiter', 'mcdonalds')]",
"percentage": "100.00%",
"duration": 100,
"children": [
{
"name": "[('server', 'kfc')]",
"percentage": "15.73%",
"duration": 100,
"children": [
{
"name": "[('server', 'wendys')]",
"percentage": "12.64%",
"duration": 100
},
{
"name": "[('boss', 'dennys')]",
"percentage": "10.96%",
"duration": 100
}
]
},
{
"name": "[('cashier', 'chickfila')]",
"percentage": "10.40%",
"duration": 100,
"children": [
{
"name": "[('cashier', 'burger king')]",
"percentage": "11.20%",
"duration": 100
}
]
}
]
}
]
}
]
}
}
I want to add a unique ID to each child that corresponds to the level they are in so it ends up looking like this, where each ID can tell how many parents the data has and how deep into the json you are (for example, 21.2.3.102 would be the 102nd child of a 3rd child of a 2nd child of the 21st parent):
{
"tree": {
"name": "Top Level",
"id": 1
"children": [
{
"name": "[('server', 'Cheese')]",
"id": 1.1
"children": [
{
"name": "[('waiter', 'mcdonalds')]",
"percentage": "100.00%",
"duration": 100,
"id": 1.1.1
"children": [
{
"name": "[('server', 'kfc')]",
"percentage": "15.73%",
"duration": 100,
"id": 1.1.1.1
"children": [
{
"name": "[('server', 'wendys')]",
"percentage": "12.64%",
"duration": 100,
"id":1.1.1.1.1
},
{
"name": "[('boss', 'dennys')]",
"percentage": "10.96%",
"duration": 100,
"id":1.1.1.1.2
}
]
},
{
"name": "[('cashier', 'chickfila')]",
"percentage": "10.40%",
"duration": 100,
"id":1.1.1.2
"children": [
{
"name": "[('cashier', 'burger king')]",
"percentage": "11.20%",
"duration": 100,
"id":1.1.1.2.1
}
]
}
]
}
]
}
]
}
}
Is there a streamlined way to do this to a very long json file with many many children?
PLEASE
Thanks!
You can use recursion walking, where d - your dictionary from json:
def walk(d, level="1"):
    """Stamp every node of the tree with a dotted, position-based ``"id"``.

    ``d`` receives ``level`` as its id.  The i-th entry (1-based) of a
    node's ``"children"`` list receives ``<parent id>.<i>``.  Nodes without
    a ``"children"`` key are treated as having none.  The tree is modified
    in place.
    """
    pending = [(d, level)]
    while pending:
        node, node_id = pending.pop()
        node["id"] = node_id
        numbered = list(enumerate(node.get("children", []), 1))
        # push in reverse so nodes are popped in the same pre-order a
        # recursive descent would visit them
        for position, kid in reversed(numbered):
            pending.append((kid, node_id + "." + str(position)))
walk(d["tree"])
I'm trying to use Python to turn data from a CSV into a JSON with the format found here:
https://gist.github.com/mbostock/1093025 so that I can modify some http://d3js.org/ examples.
I have found some posts on how to do similar transformations, but nothing exactly like the nested {'name': name, 'children': []} format.
For the test.csv:
Team,Task,Country,ID
Team A,Processing,CA,5
Team A,Review,CA,45
Team B,Processing,US,76
Team B,Processing,CA,676
Team B,Support,US,2345
Team C,Processing,US,67
Team C,Review,US,734
Team C,Support,US,34
Output should look like:
{
"name": "Flare",
"children": [
{
"name": "Team A",
"children": [
{
"name": "Processing",
"children": [
{"name": "CA", "size": 5}
]
},
{
"name": "Review",
"children": [
{"name": "CA", "size": 45}
]
}
]
},
{
"name": "Team B",
"children": [
{
"name": "Processing",
"children": [
{"name": "US", "size": 76},
{"name": "CA", "size": 676}
]
},
{
"name": "Support",
"children": [
{"name": "US", "size": 2345}
]
}
]
},
{
"name": "Team C",
"children": [
{
"name": "Processing",
"children": [
{"name": "US", "size": 67}
]
},
{
"name": "Review",
"children": [
{"name": "US", "size": 734}
]
},
{
"name": "Support",
"children": [
{"name": "US", "size": 34}
]
}
]
}
]
}
This is as far as I have been able to get (I know it's pretty bad):
import csv
import json
# Rows of the CSV (header excluded), each a list of column strings.
children = []
# Read the CSV into a list of lists; the first row read is discarded
# (the header line in the sample file).
reader = csv.reader(open('//corp.bloomberg.com/pn-dfs/AllLinks/Users/jdesilvio/Desktop/test.csv', 'rb'))
reader.next()
for row in reader:
children.append(row)
# Root of the output tree.
tree = {'name': "flare", 'children': []}
# Accumulator for the group currently being collected; the name 0 is a
# placeholder that matches no row.
subtree = {'name': 0, 'children': []}
for i in children:
# A first column different from the accumulator's name marks a new group.
if i[0] != subtree['name']:
# Flush the group collected so far into the tree.
tree['children'].append({'name': subtree['name'], 'children': subtree['children']})
# Start a new, empty group ...
subtree['children'] = []
# ... named after the current row.
# NOTE: the row that triggers the switch is not itself appended to any group.
subtree['name'] = i[0]
else:
# Same group as before: store the remaining columns as one flat list.
subtree['children'].append(i[1:len(i)])
# Drop the placeholder group that was flushed on the first row.
# NOTE: the group still held in subtree when the loop ends is never flushed.
tree['children'] = tree['children'][1:]
print json.dumps(tree, indent=1)
Which yields:
{
"name": "flare",
"children": [
{
"name": "Team A",
"children": [
[
"Review",
"CA",
"45"
]
]
},
{
"name": "Team B",
"children": [
[
"Processing",
"CA",
"676"
],
[
"Support",
"US",
"2345"
]
]
}
]
}
This looks like it is headed in the right direction, but even if I was able to get the first level nested, I'm not sure how to nest more levels in a generic way.
Populating a tree is the clearest solution. However, using a dict for traversal is not a good idea. I suggest creating a helper class for each tree node, using it to populate the data, and then converting the result to JSON:
import csv
import json
class Node(object):
    """One named node of the hierarchy.

    A node whose ``size`` is ``None`` is serialised with its ``children``;
    any other node is serialised with its ``size`` instead.
    """

    def __init__(self, name, size=None):
        self.name = name
        self.size = size
        self.children = []

    def child(self, cname, size=None):
        """Return the direct child named ``cname``, creating it if absent.

        ``size`` is only used when a new child has to be created; an
        existing child is returned unchanged.
        """
        for existing in self.children:
            if existing.name == cname:
                return existing
        created = Node(cname, size)
        self.children.append(created)
        return created

    def as_dict(self):
        """Return this subtree as nested dicts ready for ``json.dumps``."""
        if self.size is not None:
            return {'name': self.name, 'size': self.size}
        return {
            'name': self.name,
            'children': [kid.as_dict() for kid in self.children],
        }
# Build the Team -> Task -> Country hierarchy from the CSV and print it.
root = Node('Flare')
with open('/tmp/test.csv', 'r') as f:
    reader = csv.reader(f)
    # Skip the header row.  The next() builtin works on Python 2.6+ and 3
    # (reader.next() is Python 2 only); the default avoids StopIteration
    # on an empty file.
    next(reader, None)
    for row in reader:
        grp1, grp2, grp3, size = row
        # csv yields strings; convert so the JSON carries "size": 5, not "5"
        root.child(grp1).child(grp2).child(grp3, int(size))
print(json.dumps(root.as_dict(), indent=4))
I am working on a file representing a tree-like structure very similar to flare.json which is known for D3.js community. What's the best way to delete all the leaves of the tree in python? In other words, I want to remove all the keys that don't have a 'children' key in their value.
example :
{
"name": "flare",
"children": [
{
"name": "analytics",
"children": [
{
"name": "cluster",
"children": [
{"name": "AgglomerativeCluster", "size": 3938},
{"name": "CommunityStructure", "size": 3812},
{"name": "HierarchicalCluster", "size": 6714},
{"name": "MergeEdge", "size": 743}
]
},
{
"name": "graph",
"children": [
{"name": "BetweennessCentrality", "size": 3534},
{"name": "LinkDistance", "size": 5731},
{"name": "MaxFlowMinCut", "size": 7840},
{"name": "ShortestPaths", "size": 5914},
{"name": "SpanningTree", "size": 3416}
]
},
{
"name": "optimization",
"children": [
{"name": "AspectRatioBanker", "size": 7074}
] ...
which should become:
{
"name": "flare",
"children": [
{
"name": "analytics",
"children": [
{
"name": "cluster",
},
{
"name": "graph",
},
{
"name": "optimization",
] ...
In other words, I'm just cutting the leaves off the tree. If a children list is empty, it should be removed.
I tried this only to remove the keys and it did not work:
def deleteLeaves(pTree):
    """Recurse through ``pTree['children']`` looking for leaf nodes.

    Note that ``del pTree`` only unbinds the local name inside this call;
    it does not remove the node from its parent's ``children`` list, so
    the tree is left unchanged.
    """
    if not pTree.has_key('children'):
        del pTree
        return
    for child in pTree['children']:
        deleteLeaves(child)
This seems to approximate what you want:
def pruneLeaves(obj):
    """Remove leaf nodes from a nested dict/list structure, in place.

    Returns True when ``obj`` should be removed by its container:
    a dict without a 'children' key, or a list that is empty once its
    own prunable elements have been removed.  Any other value (strings,
    numbers, ...) is treated as an attribute and is never pruned.

    Every key of a dict is examined, so any value that reports itself as
    prunable (not only 'children') is deleted from the dict.
    """
    if isinstance(obj, dict):
        isLeaf = 'children' not in obj
        # Iterate over a snapshot of the keys: deleting entries while
        # iterating the live keys view raises RuntimeError on Python 3.
        for key in list(obj):
            if pruneLeaves(obj[key]):
                del obj[key]
        return isLeaf
    if isinstance(obj, list):
        # slice assignment keeps the same list object the parent refers to
        obj[:] = [element for element in obj if not pruneLeaves(element)]
        return not obj
    # String values look like attributes in the dict, so never prune them
    return False
Tested with a truncated sample of your data:
data = {
"name": "flare",
"children": [
{
"name": "analytics",
"children": [
{
"name": "cluster",
"children": [
{"name": "AgglomerativeCluster", "size": 3938},
{"name": "CommunityStructure", "size": 3812},
{"name": "HierarchicalCluster", "size": 6714},
{"name": "MergeEdge", "size": 743}
]
},
{
"name": "graph",
"children": [
{"name": "BetweennessCentrality", "size": 3534},
{"name": "LinkDistance", "size": 5731},
{"name": "MaxFlowMinCut", "size": 7840},
{"name": "ShortestPaths", "size": 5914},
{"name": "SpanningTree", "size": 3416}
]
}
]
}
]
}
# Prune in place, then show the result.  print(...) with one argument
# works on both Python 2 and 3; the print statement is Python 2 only.
pruneLeaves(data)
print(data)
And got these results:
{'name': 'flare', 'children': [{'name': 'analytics', 'children': [{'name': 'cluster'}, {'name': 'graph'}]}]}
I just edited @rchang's answer so that it no longer deletes lists other than children.
def pruneLeaves(self, obj):
    """Remove leaf nodes from a nested dict/list tree, in place.

    Returns True when ``obj`` should be removed by its container:
    a dict without a 'children' key, or a list that is empty once its
    own prunable elements have been removed.  Other values are never
    pruned.

    Only the 'children' entry of a dict is descended into, so lists or
    dicts stored under other keys are left untouched.
    """
    if isinstance(obj, dict):
        if 'children' not in obj:
            return True
        # Look the key up directly instead of deleting while iterating
        # obj.keys(), which raises RuntimeError on Python 3.
        if self.pruneLeaves(obj['children']):
            del obj['children']
        return False
    if isinstance(obj, list):
        # slice assignment keeps the same list object the parent refers to
        obj[:] = [element for element in obj if not self.pruneLeaves(element)]
        return not obj
    # String values look like attributes in the dict, so never prune them
    return False