Python recursive aggregation - python

I am working with a nested data structure which needs to be flattened. The values need to be aggregated so totals are produced across each level of the nested data. I'm trying to do this recursively but it's not clear how best to achieve this?
The following is an example of the data I'm working with.
def get_result():
return {
"a1": {
"b1": {
"c1": {
"d1": 1,
"d2": 1,
},
"c2": {
"d3": 1,
}
},
"b2": {
"c3": {
"d4": 1
}
}
},
"a2": {}
}
The data I'd like to produce would be as follows:
[
{
"key": "a1",
"total": 4
},
{
"key": "b1",
"total": 3
},
{
"key": "c1",
"total": 2
},
{
"key": "d1",
"total": 1
},
{
"key": "d2",
"total": 1
}
{
"key": "c2",
"total": 1
},
{
"key": "d3",
"total": 1
},
{
"key": "b2",
"total": 1
},
{
"key": "c3",
"total": 1
},
{
"key": "d4",
"total": 1
}
]

You can use recursion
from collections import defaultdict
def agg(data):
result = defaultdict(int)
agg_sum = 0
for k, v in data.items():
if isinstance(v, dict):
d, sub = agg(v)
if sub:
result.update(d)
result[k] += sub
agg_sum += sub
else:
result[k] += v
agg_sum += v
return result, agg_sum

You can use a recursive generator function for a shorter solution:
d = {'a1': {'b1': {'c1': {'d1': 1, 'd2': 1}, 'c2': {'d3': 1}}, 'b2': {'c3': {'d4': 1}}}, 'a2': {}}
def get_aggr(d):
return d if not isinstance(d, dict) else sum(map(get_aggr, d.values()))
def aggr_keys(d):
for a, b in d.items():
yield {'key':a, 'total':get_aggr(b)}
yield from (() if not isinstance(b, dict) else aggr_keys(b))
print(list(aggr_keys(d)))
Output:
[{'key': 'a1', 'total': 4},
{'key': 'b1', 'total': 3},
{'key': 'c1', 'total': 2},
{'key': 'd1', 'total': 1},
{'key': 'd2', 'total': 1},
{'key': 'c2', 'total': 1},
{'key': 'd3', 'total': 1},
{'key': 'b2', 'total': 1},
{'key': 'c3', 'total': 1},
{'key': 'd4', 'total': 1},
{'key': 'a2', 'total': 0}]

Related

Python Group and aggregate unidirectionally a list of dictionaries by multiple keys

Am building a tree selector, I need to structure my data like a tree of grouped items. I have bellow input which is a list of dictionaries.
data = [
{'region': 'R1', 'group': 'G1', 'category': 'C1', 'item': 'I2'},
{'region': 'R1', 'group': 'G1', 'category': 'C1', 'item': 'I1'},
{'region': 'R1', 'group': 'G2', 'category': 'C2', 'item': 'I3'},
{'region': 'R2', 'group': 'G1', 'category': 'C1', 'item': 'I1'},
{'region': 'R2', 'group': 'G2', 'category': 'C2', 'item': 'I3'},
{'region': 'R2', 'group': 'G2', 'category': 'C2', 'item': 'I4'},
{'region': 'R2', 'group': 'G2', 'category': 'C3', 'item': 'I5'},
]
I want to get the following output
result = {
"regions": [
{
"name": "R1",
"groups": [
{
"name": "G1",
"categories": [
{"name": "C1","items": [{ "name": "I2"},{"name": "I1"}]}
]
},
{
"name": "G2",
"categories": [
{"name": "C2", "items": [{"name": "I3"}]}
]
}
]
},
{
"name": "R2",
"groups": [
{
"name": "G1",
"categories": [
{"name": "C1","items": [{"name": "I1"}]}
]
},
{
"name": "G2",
"categories": [
{"name": "C2","items": [{"name": "I3"},{"name": "I4"}]},
{"name": "C3", "items": [{"name": "I5"}]}
]
}
]
}
]
}
After some researches I come up with this solution
from collections import OrderedDict
d = OrderedDict()
for aggr in data:
d.setdefault(
key=(aggr['region'], aggr['group'], aggr['category']),
default=list()
).append({"name": aggr['item']})
d1 = OrderedDict()
for k, v in d.items():
d1.setdefault(
key=(k[0], k[1]),
default=list()
).append({"name": k[2], "items": v})
d2 = OrderedDict()
for k, v in d1.items():
d2.setdefault(
key=k[0],
default=list()
).append({"name": k[1], "categories": v})
result = {"regions": [{"name": k, "groups": v} for k, v in d2.items()]}
It's working but I believe it's not the most pythonic solution. I did not manage to simplify it.
Any help to propose another solution or improvement on above codes will be appreciated
As long as the items are sorted, like in your example, you could use groupby from itertools in a recursive function, like:
from itertools import groupby
from operator import itemgetter
def plural(word):
return f"{word}s" if word[-1] != 'y' else f"{word[:-1]}ies"
def grouping(records, *keys):
if len(keys) == 1:
return [{"name": record[keys[0]]} for record in records]
return [
{"name": key, plural(keys[1]): grouping(group, *keys[1:])}
for key, group in groupby(records, itemgetter(keys[0]))
]
result = {"regions": grouping(data, "region", "group", "category", "item")}
If the sorting isn't guaranteed, then you could adjust grouping in the following way
def grouping(records, *keys):
if len(keys) == 1:
return [{"name": record[keys[0]]} for record in records]
key_func = itemgetter(keys[0])
records = sorted(records, key=key_func)
return [
{"name": key, plural(keys[1]): grouping(group, *keys[1:])}
for key, group in groupby(records, key_func)
]
or sort the data beforehand
keys = ["region", "group", "category", "item"]
data = sorted(data, key=itemgetter(*keys))
result = {"regions": grouping(data, *keys)}
Result of first version for data as provided in the question:
result = {
"regions": [
{
"name": "R1",
"groups": [
{
"name": "G1",
"categories": [
{"name": "C1", "items": [{"name": "I2"}, {"name": "I1"}]
}
]
},
{
"name": "G2",
"categories": [
{"name": "C2", "items": [{"name": "I3"}]}
]
}
]
},
{
"name": "R2",
"groups": [
{
"name": "G1",
"categories": [
{"name": "C1", "items": [{"name": "I1"}]}
]
},
{
"name": "G2",
"categories": [
{"name": "C2", "items": [{"name": "I3"}, {"name": "I4"}]},
{"name": "C3", "items": [{"name": "I5"}]}
]
}
]
}
]
}

loop over a nested dictionary to create a new one

I've got a nested dictionary like that:
d={'a1': {'b': ['x', 1]}, 'a2': {'b1': ['x1', 2]}}
Expected result:
[
{
"measurements": "XXXXX",
"tags": {
"MPC": b,
"host": a1
},
"time": "timexxxxx",
"fields": {
x: 1
}
},
{
"measurements": "XXXXX",
"tags": {
"MPC": b,
"host": a2
},
"time": "timexxxxx",
"fields": {
x: 1
}
}
]
that is what I'm trying, however it's being overwritten
for k,v in d.items():
metrics['measurements'] = "XXXXX"
if isinstance(v,dict):
for j,h in v.items():
metrics['tags'] = {'MPC':j,'host':k}
metrics['time'] = "timexxxxx"
for value in h:
metrics['fields'] = {j:h}
and I'm getting:
{'fields': {'b1': ['x1', 2]},
'measurements': 'XXXXX',
'tags': {'MPC': 'b1', 'host': 'a2'},
'time': 'timexxxxx'}
Could you give me some pointers on how to deal with this?
Thanks
see below
import pprint
d = {'a1': {'b': ['x', 1]}, 'a2': {'b1': ['x1', 2]}}
data = []
for k, v in d.items():
entry = {"measurements": "XXXXX"}
entry['tags'] = {'MPC': list(v.keys())[0],"host": k}
entry["time"] = "timexxxxx"
values= list(v.values())
entry["fields"] = {values[0][0]:values[0][1]}
data.append(entry)
pprint.pprint(data)
output
[{'fields': {'x': 1},
'measurements': 'XXXXX',
'tags': {'MPC': 'b', 'host': 'a1'},
'time': 'timexxxxx'},
{'fields': {'x1': 2},
'measurements': 'XXXXX',
'tags': {'MPC': 'b1', 'host': 'a2'},
'time': 'timexxxxx'}]
This code can help you:
d={'a1': {'b': ['x', 1]}, 'a2': {'b1': ['x1', 2]}}
def convert(dictionary):
return [
{
"measurements": "XXXXX",
"tags": {
"MPC": list(value.keys())[0],
"host": key
},
"time": "timexxxxx",
"fields": dict(value.values())
} for key, value in dictionary.items()
]
print(convert(d))
Results in [{'measurements': 'XXXXX', 'tags': {'MPC': 'b', 'host': 'a1'}, 'time': 'timexxxxx', 'fields': {'x': 1}}, {'measurements': 'XXXXX', 'tags': {'MPC': 'b1', 'host': 'a2'}, 'time': 'timexxxxx', 'fields': {'x1': 2}}]
You can do it like this
#Empty List
li=[]
#Add Items in list
for i in range(2):
d = {}
d["measurment"] = "XXXXX"
d["tags"] = {1: "x"}
d["time"] = "timexxx"
d["field"] = {2: "y"}
li.append(d)
#Print list elements
for i in li:
for key, value in i.items():
print(key, ":", value)
print()

Converting pandas dataframe to JSON Object Column

I have a pandas dataframe that has information about a user with multiple orders and within each order there are multiple items purchases. An example of the dataframe format:
user_id | order_num | item_id | item_desc
1 1 1 red
1 1 2 blue
1 1 3 green
I want to convert it to JSONb Object in a column so that I can query it in postgresql.
Currently I am using the following code:
j = (reg_test.groupby(['user_id', 'order_num'], as_index=False)
.apply(lambda x: x[['item_id','item_desc']].to_dict('r'))
.reset_index()
.rename(columns={0:'New-Data'})
.to_json(orient='records'))
This is the result I am getting:
'''
[
{
"New-Data": [
{
"item_id": "1",
"item_desc": "red",
},
{
"item_id": "2",
"item_desc": "blue",
},
{
"item_id": "3",
"item_desc": "green",
}
],
"order_number": "1",
"user_id": "1"
}
]
'''
While that is correct json format, I want the result to look like this:
'''
[
{
"New-Data": [{
"1":
{
"item_id": "1",
"item_desc": "red",
},
"2": {
"item_id": "2",
"item_desc": "blue",
},
"3":
{
"item_id": "3",
"item_desc": "green",
}
}
],
"order_number": "1",
"user_id": "1"
}
]
'''
as an alternative to #rpanai's solution, i moved the processing into vanilla python :
convert dataframe to dict :
M = df.to_dict("records")
create the dict for the items
items = [
{key: value
for key, value in entry.items()
if key not in ("user_id", "order_num")}
for entry in M
]
item_details = [{str(num + 1): entry}
for num, entry
in enumerate(items)]
print(item_details)
[{'1': {'item_id': 1, 'item_desc': 'red'}},
{'2': {'item_id': 2, 'item_desc': 'blue'}},
{'3': {'item_id': 3, 'item_desc': 'green'}}]
Initialize dict and add the remaining data
d = dict()
d['New-Data'] = item_details
d['order_number'] = M[0]['order_num']
d['user_id'] = M[0]['user_id']
wrapper = [d]
print(wrapper)
[{'New-Data': [{'1': {'item_id': 1, 'item_desc': 'red'}},
{'2': {'item_id': 2, 'item_desc': 'blue'}},
{'3': {'item_id': 3, 'item_desc': 'green'}}],
'order_number': 1,
'user_id': 1}]
Have you considered to use a custom function
import pandas as pd
df = pd.DataFrame({'user_id': {0: 1, 1: 1, 2: 1},
'order_num': {0: 1, 1: 1, 2: 1},
'item_id': {0: 1, 1: 2, 2: 3},
'item_desc': {0: 'red', 1: 'blue', 2: 'green'}})
out = df.groupby(['user_id', 'order_num'])[["item_id", "item_desc"]]\
.apply(lambda x: x.to_dict("records"))\
.apply(lambda x: [{str(l["item_id"]):l for l in x}])\
.reset_index(name="New-Data")\
.to_dict("records")
where out returns
[{'user_id': 1,
'order_num': 1,
'New-Data': [{'1': {'item_id': 1, 'item_desc': 'red'},
'2': {'item_id': 2, 'item_desc': 'blue'},
'3': {'item_id': 3, 'item_desc': 'green'}}]}]

List of dicts to multilevel dict based on depth info

I have some data, more or less like this:
[
{"tag": "A", "level":0},
{"tag": "B", "level":1},
{"tag": "D", "level":2},
{"tag": "F", "level":3},
{"tag": "G", "level":4},
{"tag": "E", "level":2},
{"tag": "H", "level":3},
{"tag": "I", "level":3},
{"tag": "C", "level":1},
{"tag": "J", "level":2},
]
I want to turn it into a multilevel dict based on depth level (key "level"):
{
"A": {"level": 0, "children": {
"B": {"level": 1, "children": {
"D": {"level": 2, "children": {
"F": {"level": 3, "children": {
"G": {"level": 4, "children": {}}}}}},
"E": {"level": 2, "children": {
"H": {"level": 3, "children": {}},
"I": {"level": 3, "children": {}}}}}},
"C": {"level": 1, "children": {
"J": {"level": 2, "children": {}}}}}}
}
All I can come up with right now is this little piece of code... which obviously breaks after few items:
def list2multilevel(list):
children = {}
parent = list.pop(0)
tag = parent.get("Tag")
level = parent.get("Level")
for child in list:
ctag = child.get("Tag")
clevel = child.get("Level")
if clevel == level + 1:
children.update(list2multilevel(list))
elif clevel <= level:
print(clevel, level)
break
return {tag: children}
Originally sat down to it on Friday and it was supposed to be just a small exercise....
data = [
{"tag": "A", "level": 0},
{"tag": "B", "level": 1},
{"tag": "D", "level": 2},
{"tag": "F", "level": 3},
{"tag": "G", "level": 4},
{"tag": "E", "level": 2},
{"tag": "H", "level": 3},
{"tag": "I", "level": 3},
{"tag": "C", "level": 1},
{"tag": "J", "level": 2},
]
root = {'level': -1, 'children': {}}
parents = {-1: root}
for datum in data:
level = datum['level']
parents[level] = parents[level - 1]['children'][datum['tag']] = {
'level': datum['level'],
'children': {},
}
result = root['children']
print(result)
output:
{'A': {'level': 0, 'children': {'B': {'level': 1, 'children': {'D': {'level': 2, 'children': {'F': {'level': 3, 'children': {'G': {'level': 4, 'children': {}}}}}}, 'E': {'level': 2, 'children': {'H': {'level': 3, 'children': {}}, 'I': {'level': 3, 'children': {}}}}}}, 'C': {'level': 1, 'children': {'J': {'level': 2, 'children': {}}}}}}}
restriction:
level >= 0
Any level cannot be bigger than +1 of max level appeared before.
explanation:
parents is a dictionary to remember last element for each level.
root is a starting point(dummy element).
logic:
Start with -1 level which indicates the root.
Make an item and register it into parent's children.
Update same item to parents dictionary.
Repeat.
Extract root['children'].
Other solution using recursion (same restrictions as with Boseong Choi's answer):
data = [
{"tag": "A", "level": 0},
{"tag": "B", "level": 1},
{"tag": "D", "level": 2},
{"tag": "F", "level": 3},
{"tag": "G", "level": 4},
{"tag": "E", "level": 2},
{"tag": "H", "level": 3},
{"tag": "I", "level": 3},
{"tag": "C", "level": 1},
{"tag": "J", "level": 2},
]
def make_node(dic):
node = dic.copy()
node["children"] = {}
tag = node.pop("tag")
return tag, node
def add_child(parent, child, tag):
assert child["level"] > parent["level"]
if child["level"] == parent["level"] + 1:
parent["children"][tag] = child
return True
for node in parent["children"].values():
if add_child(node, child, tag):
return True
return False
def parse(lst):
assert lst[0]["level"] == 0
root_tag, root = make_node(lst[0])
for item in lst[1:]:
tag, node = make_node(item)
add_child(root, node, tag)
print(parse(data))
You can use recursion:
from itertools import groupby as gb
data = [{'tag': 'A', 'level': 0}, {'tag': 'B', 'level': 1}, {'tag': 'D', 'level': 2}, {'tag': 'F', 'level': 3}, {'tag': 'G', 'level': 4}, {'tag': 'E', 'level': 2}, {'tag': 'H', 'level': 3}, {'tag': 'I', 'level': 3}, {'tag': 'C', 'level': 1}, {'tag': 'J', 'level': 2}]
def to_tree(d, s = 0):
v = [list(b) for _, b in gb(d, key=lambda x:x['level'] == s)]
if len(v) == 1:
return {i['tag']:{'level':s, 'children':{}} for i in v[0]}
return {v[i][0]['tag']:{'level':s, 'children':to_tree(v[i+1], s+1)} for i in range(0, len(v), 2)}
import json
print(json.dumps(to_tree(data), indent=4))
Output:
{
"A": {
"level": 0,
"children": {
"B": {
"level": 1,
"children": {
"D": {
"level": 2,
"children": {
"F": {
"level": 3,
"children": {
"G": {
"level": 4,
"children": {}
}
}
}
}
},
"E": {
"level": 2,
"children": {
"H": {
"level": 3,
"children": {}
},
"I": {
"level": 3,
"children": {}
}
}
}
}
},
"C": {
"level": 1,
"children": {
"J": {
"level": 2,
"children": {}
}
}
}
}
}
}

n-depth tree: set parent value based on children values

In a n-depth dict where values are set in the deepest level of a hierarchy:
{
"name": "root",
"value": None, # expected value to be 80
"children": [
{
"name": "a",
"value": None, # expected value to be 30
"children": [
{ "name": "a.1", "value": 10 },
{ "name": "a.2", "value": 20 }
]
},
{
"name": "b",
"value": None, # expected value to be 50
"children": [
{ "name": "b.1", "value": 25 },
{
"name": "b.2",
"value": None, # expected value to be 25
"children": [
{"name": "b.2.1", "value": 5},
{"name": "b.2.2", "value": 5},
{"name": "b.2.3", "value": 5},
{"name": "b.2.4", "value": 5},
{"name": "b.2.5", "value": 5}
]
}
]
}
]
}
What could be the approach to recursively set each parent value based on the result of an operation perfomed with its children value (i.e. sum)?
I finally managed to do it using the iterative level order traversal pattern (BFS), I was missing just a couple of details.
This approach works because the depth iteration order is guaranteed, so once we are getting to a node wich has children, all its sub-level children are already calculated.
The solution:
def reverseTraversal(obj):
def parentOperation(node):
out = 0
for child in node['children']:
out = out + child['value']
return out
if obj is None:
return
queue = []
stack = []
queue.append(obj)
while len(queue) > 0:
temp = queue.pop(0)
stack.append(temp)
if 'children' in temp and len(temp['children']) > 0:
for child in temp['children']:
queue.append(child)
while len(stack)>0:
node = stack.pop()
if 'children' in node and len(node['children']) > 0:
node['value'] = parentOperation(node)
# obj is the original dict
obj = reverseTraversal(obj)
print(obj)
Results in:
{
"name": "root",
"value": 80,
"children": [
{
"name": "a",
"value": 30,
"children": [
{"name": "a.1","value": 10},
{"name": "a.2","value": 20}
]
},
{
"name": "b",
"value": 50,
"children": [
{"name": "b.1","value": 25},
{
"name": "b.2",
"value": 25,
"children": [
{"name": "b.2.1","value": 5},
{"name": "b.2.2","value": 5},
{"name": "b.2.3","value": 5},
{"name": "b.2.4","value": 5},
{"name": "b.2.5","value": 5}
]
}
]
}
]
}
Given your datastructure and a list of values to update, you can use next in recursion:
def update(d, targets):
return {a:[update(i, targets) for i in b] if isinstance(b, list) else update(b, targets) if isinstance(b, dict) else next(targets) if not b else b for a, b in d.items()}
targets = [80, 30, 50, 25]
results = update(nlist, iter(targets))
Output:
{'children': [{'children': [{'name': 'a.1', 'value': 10},
{'name': 'a.2', 'value': 20}],
'name': 'a',
'value': 30},
{'children': [{'name': 'b.1', 'value': 25},
{'children': [{'name': 'b.2.1', 'value': 5},
{'name': 'b.2.2', 'value': 5},
{'name': 'b.2.3', 'value': 5},
{'name': 'b.2.4', 'value': 5},
{'name': 'b.2.5', 'value': 5}],
'name': 'b.2',
'value': 25}],
'name': 'b',
'value': 50}],
'name': 'root',
'value': 80}

Categories