Let's say we have a JSON object in Python:
myJson = [
{
"id": "123",
"name": "alex",
"meta": {
"city": "boston"
}
},
{
"id": "234",
"name": "mike",
"meta": {
"city": "seattle"
}
},
{
"id": "345",
"name": "jess",
"meta": {
"city": "boston"
}
}
]
What is the most efficient way to group this data by city, so that we end up with a JSON like this:
myNewJson = [
{
"city": "boston",
"people": [ ... ... ]
},
{
"city": "seattle",
"people": [ ... ]
}
]
... in which the records of the people in each city are included under the "people" key.
Thanks!
Try:
# Sample records: each person stores their city under "meta".
myJson = [
    {"id": "123", "name": "alex", "meta": {"city": "boston"}},
    {"id": "234", "name": "mike", "meta": {"city": "seattle"}},
    {"id": "345", "name": "jess", "meta": {"city": "boston"}},
]

# Bucket the names by city. Dicts keep insertion order, so the cities come
# out in the order they were first seen.
by_city = {}
for person in myJson:
    town = person["meta"]["city"]
    if town not in by_city:
        by_city[town] = []
    by_city[town].append(person["name"])

# Reshape {city: [names]} into the requested list of records.
out = []
for town, names in by_city.items():
    out.append({"city": town, "people": names})

print(out)
Prints:
[
{"city": "boston", "people": ["alex", "jess"]},
{"city": "seattle", "people": ["mike"]},
]
Seems like a dictionary could work. Use city names as the keys, and a list as the value. Then at the end, go through the dictionary and convert it to a list.
# Sample records: each person stores their city under "meta".
myJson = [
    {"id": "123", "name": "alex", "meta": {"city": "boston"}},
    {"id": "234", "name": "mike", "meta": {"city": "seattle"}},
    {"id": "345", "name": "jess", "meta": {"city": "boston"}},
]

# Dictionary of {city: list of people}; setdefault creates the list the
# first time a city is seen.
d = {}
for record in myJson:
    d.setdefault(record['meta']['city'], []).append(record['name'])

# Convert the dictionary into a list of JSON-style records.
result = [{'city': city, 'people': people} for city, people in d.items()]
print(result)
I have n very complex Python dictionaries with a large nesting depth (~5), and I don't know how to merge them properly and quickly, without iterating over them a million times.
What is worth mentioning - that dicts have strict structure as you will see below.
I was trying solutions connected with:
defaultdict
merge operator
Version of Python - 3.9
d1 = {
"name": "Louis",
"places": [
{
"code": "A",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
}
d2 = {
"name": "Louis",
"places": [
{
"code": "B",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
}
d3 = {
"name": "Louis",
"places": [
{
"code": "A",
"subplaces": [
{
"name": "Subplace name X",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
}
And in that case output should be
d_merged = {
"name": "Louis",
"places": [
{
"code": "A",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name X",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
},
{
"code": "B",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
}
Your task is quite specific, so a universal solution is not possible. I'd suggest merging all "places", "subplaces" and "subsubplaces" into a nested dictionary to remove any duplicates, and then converting the data to the desired format.
from itertools import groupby
from operator import itemgetter
from collections import defaultdict
def merge_places(*dicts):
    """Merge dictionaries that share one "name" into a single nested dict.

    Each argument must look like::

        {"name": ..., "places": [
            {"code": ..., "subplaces": [
                {"name": ..., "subsubplaces": [{"name": ...}, ...]},
                ...]},
            ...]}

    Places are merged by "code", subplaces by "name", and duplicate
    subsubplace names are removed. Every level keeps first-seen order, and
    places/subplaces whose child list is empty are preserved (with an empty
    list) instead of being dropped.

    Returns:
        The merged dictionary, or ``None`` when called without arguments.

    Raises:
        ValueError: if the dictionaries do not all have the same "name".
        KeyError: if an expected key is missing from the input.
    """
    if not dicts:
        return None

    # All inputs must describe the same entity.
    name = dicts[0]["name"]
    if any(d["name"] != name for d in dicts[1:]):
        raise ValueError("Dictionaries names are not equal")

    # code -> subplace name -> {subsubplace name: None}
    # Plain dicts are used as ordered sets: keys are unique and keep
    # insertion order, so the output is deterministic (a set is not).
    places = {}
    for d in dicts:
        for place in d["places"]:
            # setdefault registers the level even when it has no children
            subplaces = places.setdefault(place["code"], {})
            for subplace in place["subplaces"]:
                seen = subplaces.setdefault(subplace["name"], {})
                for subsubplace in subplace["subsubplaces"]:
                    seen[subsubplace["name"]] = None

    return {
        "name": name,
        "places": [
            {
                "code": code,
                "subplaces": [
                    {
                        "name": subplace_name,
                        "subsubplaces": [
                            {"name": subsubplace_name}
                            for subsubplace_name in seen
                        ],
                    }
                    for subplace_name, seen in subplaces.items()
                ],
            }
            for code, subplaces in places.items()
        ],
    }
Usage:
result = merge_places(d1, d2, d3)
Output:
{
"name": "Louis",
"places": [
{
"code": "A",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name X",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
},
{
"code": "B",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
}
I think your representation of the data has a lot of unnecessary details; we can reduce them with this solution:
from typing import Dict, List
dicts = [
{
"name": "Louis",
"places": [
{
"code": "A",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
},
{
"name": "Louis",
"places": [
{
"code": "B",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
},
{
"name": "Louis",
"places": [
{
"code": "A",
"subplaces": [
{
"name": "Subplace name X",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
}]
def merger(dicts: List[Dict]) -> Dict:
    """Collapse the input dictionaries into ``{name: {code: [subplaces]}}``.

    Entries sharing a "name" are merged, and within a name the "subplaces"
    lists of places sharing a "code" are concatenated in input order.
    Subplaces are not de-duplicated.
    """
    result = {}
    for entry in dicts:
        # One bucket per name; created on first sight even if the entry
        # has no places.
        by_code = result.setdefault(entry["name"], {})
        for place in entry["places"]:
            by_code.setdefault(place["code"], []).extend(place["subplaces"])
    return result
print(merger(dicts=dicts))
The output will be:
{
'Louis':{
'A':[
{'name': 'Subplace name', 'subsubplaces': [{'name': 'subsub1'}]},
{'name': 'Subplace name2', 'subsubplaces': [{'name': 'subsub1'}]},
{'name': 'Subplace name X', 'subsubplaces': [{'name': 'subsub1'}]}
],
'B':[
{'name': 'Subplace name', 'subsubplaces': [{'name': 'subsub1'}]},
{'name': 'Subplace name2', 'subsubplaces': [{'name': 'subsub1'}]}]
}
}
If you want your desired output, it's easy to convert this structure into it, but this one is more readable and maintainable.
I have created a list using boto3 that contains all the subfolders in my S3 bucket. The list is sorted as below:
s3_list = ['a', 'a/a1/a11', 'b', 'b/b1', 'b/b2', 'b/b2/b22']
I'm trying to convert this list into JSON hierarchy structure:
{
"root": [
{
"name": "a",
"path": "a",
"child": [
{
"name": "a1",
"path": "a/a1",
"child": [
{
"name": "a11",
"path": "a/a1/a11"
}
]
}
]
},
{
"name": "b",
"path": "b",
"child": [
{
"name": "b1",
"path": "b/b1"
},
{
"name": "b2",
"path": "b/b2",
"child": [
{
"name": "b22",
"path": "b/b2/b22"
}
]
}
]
}
]
}
What's the best way (or library) to do this in Python?
You can use recursion with collections.defaultdict:
from collections import defaultdict
s3_list = ['a', 'a/a1/a11', 'b', 'b/b1', 'b/b2', 'b/b2/b22']
def to_dict(d, c=None):
    """Build a list of nested nodes from split path components.

    Args:
        d: iterable of non-empty lists of path components, e.g.
            ``[['a'], ['a', 'a1', 'a11']]``.
        c: components of the parent path (used by the recursion); defaults
            to no parent.

    Returns:
        A list with one ``{'name', 'path'}`` dict per distinct first
        component, in first-seen order. A node gets a ``'children'`` key
        (built recursively) only when some path continues below it.
        For empty input an empty dict is returned.
    """
    if not d:
        return {}
    # Default is None rather than a shared mutable list.
    prefix = [] if c is None else list(c)

    # Group the remaining components of each path by its first component.
    grouped = defaultdict(list)
    for head, *tail in d:
        grouped[head].append(tail)

    nodes = []
    for name, tails in grouped.items():
        here = prefix + [name]
        node = {'name': name, 'path': '/'.join(here)}
        # Paths that end at this node contribute an empty tail; skip those.
        deeper = [tail for tail in tails if tail]
        if deeper:
            node['children'] = to_dict(deeper, here)
        nodes.append(node)
    return nodes
result = {'root':to_dict([i.split('/') for i in s3_list])}
import json
print(json.dumps(result, indent=4))
Output:
{
"root": [
{
"name": "a",
"path": "a",
"children": [
{
"name": "a1",
"path": "a/a1",
"children": [
{
"name": "a11",
"path": "a/a1/a11"
}
]
}
]
},
{
"name": "b",
"path": "b",
"children": [
{
"name": "b1",
"path": "b/b1"
},
{
"name": "b2",
"path": "b/b2",
"children": [
{
"name": "b22",
"path": "b/b2/b22"
}
]
}
]
}
]
}
I am trying to generate custom JSON in python using the following code
root={}
Levels=[['L1','L1','L2'],
['L1','L1','L3'],
['L1','L2'],
['L2','L2','L3'],
['L2','L2','L1'],
['L3','L2'],
['L4','L2','L1'],
['L4','L2','L4']]
def append_path(root, paths):
    """Insert the key sequence ``paths`` into the nested dict ``root``.

    Walks down from ``root`` one key at a time, creating an empty dict for
    every key that does not exist yet. ``root`` is modified in place and
    nothing is returned; an empty ``paths`` leaves ``root`` untouched.
    """
    node = root
    for key in paths:
        node = node.setdefault(key, {})
for p in Levels:
append_path(root, p)
def convert(d):
    """Convert a nested dict tree into a list of name/children nodes.

    Args:
        d: nested dict as built by ``append_path``; an empty dict is a leaf.

    Returns:
        A tuple ``(nodes, count)``. ``nodes`` is a list with one
        ``{"name": "<key>(<count>)", "children": [...]}`` dict per key of
        ``d``, where the count in the name is the number of leaves below
        that key. ``count`` is the total number of leaves below ``d``.
        A leaf (empty dict) yields ``([{}], 1)``.
    """
    if not d:
        return ([{}], 1)

    nodes = []
    leaf_total = 0
    for key, subtree in d.items():
        children, leaves = convert(subtree)
        leaf_total += leaves
        # `children` is never empty (a leaf yields [{}]), so no fallback
        # branch is needed here.
        nodes.append({"name": "{}({})".format(key, leaves), "children": children})
    return (nodes, leaf_total)
# Print results
import json
print(json.dumps(convert(root)[0], indent=2))
and the OUTPUT is
[
{
"name": "L1(3)",
"children": [
{
"name": "L1(2)",
"children": [
{
"name": "L2(1)",
"children": [
{}
]
},
{
"name": "L3(1)",
"children": [
{}
]
}
]
},
{
"name": "L2(1)",
"children": [
{}
]
}
]
},
{
"name": "L2(2)",
"children": [
{
"name": "L2(2)",
"children": [
{
"name": "L3(1)",
"children": [
{}
]
},
{
"name": "L1(1)",
"children": [
{}
]
}
]
}
]
},
{
"name": "L3(1)",
"children": [
{
"name": "L2(1)",
"children": [
{}
]
}
]
},
{
"name": "L4(2)",
"children": [
{
"name": "L2(2)",
"children": [
{
"name": "L1(1)",
"children": [
{}
]
},
{
"name": "L4(1)",
"children": [
{}
]
}
]
}
]
}
]
My dataset has changed a little bit
Levels=[[['L1','L1','L2'],[10,20,30]],
[[['L1','L1','L3'],[10,15,20]],
[[['L1','L2'],[20,10]],
[[['L2','L2','L3'],[20,20,30]],
[[['L2','L2','L1'],[10,20,30]]
[[['L3','L2'],[10,20]]
[[['L4','L2','L1'],[10,20,10]]
[[['L4','L2','L4'],[20,40,50]]]
and the output that I want is the average of the levels along with the count
[
{
"name": "L1(3)#(13)", // taking avg of 10,10,20
"children": [
{
"name": "L1(2)#(17)", // taking avg of 20,15
"children": [
{
"name": "L2(1)#(30)",
"children": [
{}
]
},
{
"name": "L3(1)#(20)",
"children": [
{}
]
}
]
},
{
"name": "L2(1)#10",
"children": [
{}
]
}
]
},
{
"name": "L2(2)#(15)", // avg of 20,10
"children": [
{
"name": "L2(2)#(20)", // avg of 20,20
"children": [
{
"name": "L3(1)#(30)",
"children": [
{}
]
},
{
"name": "L1(1)#(30)",
"children": [
{}
]
}
]
}
]
},
{
"name": "L3(1)#(10)",
"children": [
{
"name": "L2(1)#(10)",
"children": [
{}
]
}
]
},
{
"name": "L4(2)#(15)",// avg of 10,20
"children": [
{
"name": "L2(2)#(30)", // avg of 20,40
"children": [
{
"name": "L1(1)# (10)",
"children": [
{}
]
},
{
"name": "L4(1)#(50)",
"children": [
{}
]
}
]
}
]
}
]
How can I change my code to add this information?
Preface
Before moving onto the solution, here are some points I want to mention:
Make use of Python's object-oriented programming features! This makes the data structure much clearer to both yourself and future readers.
Using a custom class also makes it easier for us to store the metadata – i.e. the number of instances of a node and its total value – while constructing the intermediate tree structure, rather than while converting it. This is also more efficient because with the latter method, a simple naive traversal algorithm would make duplicate visits to the nodes!
If you want your output to (reliably) maintain the order in which the paths were inserted, you should use an OrderedDict (from collections) instead of an ordinary dict ({}).
It is more logical to output an empty list for nodes with no children than a list with a single empty dict:
// Before
"children": [
{}
]
// After
"children": []
The reason being that any software which will parse this data later can safely assume that all objects have the "name" and "children" fields, which an empty dict does not.
The list boundaries and elements in the Levels array seem to be poorly formed; did you mean:
Levels = [
[['L1','L1','L2'],[10,20,30]],
[['L1','L1','L3'],[10,15,20]],
[['L1','L2'],[20,10]],
[['L2','L2','L3'],[20,20,30]],
[['L2','L2','L1'],[10,20,30]],
[['L3','L2'],[10,20]],
[['L4','L2','L1'],[10,20,10]],
[['L4','L2','L4'],[20,40,50]],
]
While on the subject of the data, since the nodes and values obey 1-to-1 mappings (within each path), it would be more appropriate to use a list of tuples rather than a list of two parallel lists:
Levels = [
[('L1', 10), ('L1', 20), ('L2', 30)],
[('L1', 10), ('L1', 15), ('L3', 20)],
[('L1', 20), ('L2', 10)],
[('L2', 20), ('L2', 20), ('L3', 30)],
[('L2', 10), ('L2', 20), ('L1', 30)],
[('L3', 10), ('L2', 20)],
[('L4', 10), ('L2', 20), ('L1', 10)],
[('L4', 20), ('L2', 40), ('L4', 50)]
]
There seems to be a mistake in your expected output:
{
"name": "L3(1)#(10)",
"children": [
{
"name": "L2(1)#(10)", <--- this should be #(20)
"children": [
{}
]
}
]
},
Implementations
For your current data format (pair of lists):
# A dictionary here corresponds to an array of nodes in JSON
# the "name" fields serve as the keys and "children" as the values
class data_node(OrderedDict):
    """Ordered mapping of child name -> ``data_node`` with per-node statistics.

    Attributes are plain instance attributes:
        instances -- how many inserted paths passed through this node
        total     -- sum of the values inserted at this node
    """

    def __init__(self, **kwargs):
        super(data_node, self).__init__(**kwargs)
        self.instances = 0
        self.total = 0

    def insert(self, names, values):
        """Insert one path, given as parallel sequences of names and values.

        For every (name, value) pair the matching child is created if
        needed, its instance counter is incremented and the value is added
        to its total; the walk then continues inside that child.

        Raises:
            IndexError: if ``names`` and ``values`` differ in length. The
                check happens up front, so the tree is left unmodified.
        """
        if len(names) != len(values):
            raise IndexError("names and values must have the same length")

        node = self
        for name, value in zip(names, values):
            # Create the child only when it is missing. An empty data_node
            # is falsy, so test membership rather than truthiness.
            if name not in node:
                node[name] = data_node()
            child = node[name]
            child.instances += 1
            child.total += value
            node = child

    def _mean(self):
        """Return total / instances, truncating when the total is an int.

        Integer totals use floor division so the label reads "#(13)" rather
        than "#(13.333333333333334)" on Python 3; other numeric totals use
        true division.
        """
        if isinstance(self.total, int):
            return self.total // self.instances
        return self.total / self.instances

    def convert(self):
        """Return the children as a list of JSON-serialisable dicts.

        Each child becomes ``{"name": "<name>(<instances>)#(<mean>)",
        "children": [...]}``, in insertion order; a child without children
        gets an empty list.
        """
        return [
            {
                "name": "{}({})#({})".format(
                    name,
                    child.instances,
                    child._mean()
                ),
                "children": child.convert()
            }
            for name, child in self.items()
        ]
# Build the tree from the [names, values] pairs, then dump it as JSON.
tree = data_node()
for path in Levels:
    tree.insert(path[0], path[1])
# print is a function on Python 3; with a single argument this form also
# works on Python 2.
print(json.dumps(tree.convert(), indent=2))
For my proposed data format (list of tuples):
# only the insertion method differs
# all other parts of the class are unchanged
def insert(self, path):
    """Insert one path given as a sequence of ``(name, value)`` tuples.

    For every pair the matching child is created if needed, its instance
    counter is incremented and the value is added to its total; the walk
    then continues inside that child. An empty path is a no-op.
    """
    node = self
    for name, value in path:
        # Create the child only when it is missing, instead of building a
        # throwaway default node on every step.
        if name not in node:
            node[name] = data_node()
        child = node[name]
        child.instances += 1
        child.total += value
        node = child
...
for path in Levels:
tree.insert(path) # simpler function signature
EDIT:
If there is a reason that you want the leaf node format to be [{}] instead of just [], only a simple change would be needed:
# in convert()
{
"name": ..., # as before
# again exploiting the truthy-ness property of arrays
"children": child.convert() or [{}]
}
Output
Both implementations give the correct JSON output, according to my comments in the preface:
[
{
"name": "L1(3)#(13)",
"children": [
{
"name": "L1(2)#(17)",
"children": [
{
"name": "L2(1)#(30)",
"children": []
},
{
"name": "L3(1)#(20)",
"children": []
}
]
},
{
"name": "L2(1)#(10)",
"children": []
}
]
},
{
"name": "L2(2)#(15)",
"children": [
{
"name": "L2(2)#(20)",
"children": [
{
"name": "L3(1)#(30)",
"children": []
},
{
"name": "L1(1)#(30)",
"children": []
}
]
}
]
},
{
"name": "L3(1)#(10)",
"children": [
{
"name": "L2(1)#(20)",
"children": []
}
]
},
{
"name": "L4(2)#(15)",
"children": [
{
"name": "L2(2)#(30)",
"children": [
{
"name": "L1(1)#(10)",
"children": []
},
{
"name": "L4(1)#(50)",
"children": []
}
]
}
]
}
]
I am trying to add a key id to each inner dictionary, using the same uuid.uuid4() value when 'node' values are equal and a new uuid.uuid4() when a distinct node is found.
Let's say 2 keys ('node' in this case) have same value like-> node: 'Bangalore', so I want to generate the same ID for it and a fresh ID for every other distinct node.
This is the code I'm working on now:
import uuid
import json
node_list = [
{
"nodes": [
{
"node": "Kunal",
"label": "PERSON"
},
{
"node": "Bangalore",
"label": "LOC"
}
]
},
{
"nodes": [
{
"node": "John",
"label": "PERSON"
},
{
"node": "Bangalore",
"label": "LOC"
}
]
}
]
# Every entry receives its own freshly generated UUID, so two entries with
# the same "node" value still end up with different ids.
for group in node_list:
    for entry in group["nodes"]:
        entry["id"] = str(uuid.uuid4())
print(json.dumps(node_list, indent=True))
This is the response I want:
"[
{
"nodes": [
{
"node": "Kunal",
"label": "PERSON",
"id": "fbf094eb-8670-4c31-a641-4cf16c3596d1"
},
{
"node": "Bangalore",
"label": "LOC",
"id": "24867c2a-f66a-4370-8c5d-8af5b9a25675"
}
]
},
{
"nodes": [
{
"node": "John",
"label": "PERSON",
"id": "5eddc375-ed3e-4f6a-81dc-3966590e8f35"
},
{
"node": "Bangalore",
"label": "LOC",
"id": "24867c2a-f66a-4370-8c5d-8af5b9a25675"
}
]
}
]"
But currently its generating like this:
"[
{
"nodes": [
{
"node": "Kunal",
"label": "PERSON",
"id": "3cce6e36-9d1c-4058-a11b-2bcd0da96c83"
},
{
"node": "Bangalore",
"label": "LOC",
"id": "4d860d3b-1835-4816-a372-050c1cc88fbb"
}
]
},
{
"nodes": [
{
"node": "John",
"label": "PERSON",
"id": "67fc9ba9-b591-44d4-a0ae-70503cda9dfe"
},
{
"node": "Bangalore",
"label": "LOC",
"id": "f83025a0-7d8e-4ec8-b4a0-0bced982825f"
}
]
}
]"
How to remember key's value and apply the same ID for it in the dictionary?
Looks like you want the uuid to be the same for the same "node" value. So, instead of generating a new one every time, store it in a dict:
node_uuids = defaultdict(lambda: uuid.uuid4())
and then, in your inner loop, instead of
inner_dict['id'] = str(uuid.uuid4())
you write
# str() is required: json.dumps cannot serialise uuid.UUID objects
inner_dict['id'] = str(node_uuids[inner_dict['node']])
A complete working example is as follows:
from collections import defaultdict
import uuid
import json
# Sample input: "Bangalore" appears in both groups and must share one id.
node_list = [
    {
        "nodes": [
            {"node": "Kunal", "label": "PERSON"},
            {"node": "Bangalore", "label": "LOC"},
        ]
    },
    {
        "nodes": [
            {"node": "John", "label": "PERSON"},
            {"node": "Bangalore", "label": "LOC"},
        ]
    },
]

# Cache of node value -> UUID: the first lookup of a value generates a UUID,
# every later lookup of the same value returns the stored one.
node_uuids = defaultdict(uuid.uuid4)

for group in node_list:
    for entry in group["nodes"]:
        # UUID objects are not JSON serialisable, so store the string form.
        entry["id"] = str(node_uuids[entry["node"]])

print(json.dumps(node_list, indent=True))