rebuilding arrays with nested defaultdict - python

This question is an extension of a previous question: rebuild python array based on common elements
- but different enough to warrant a new question:
I've been struggling with this for a bit now. My data is an array of dictionaries from an sql query. Each element in the array represents a shipment, and there are common values based on the keys.
data = [
{"CustName":"customer1", "PartNum":"part1", "delKey":"0001", "qty":"10", "memo":"blah1"},
{"CustName":"customer1", "PartNum":"part1", "delKey":"0002", "qty":"10", "memo":"blah2"},
{"CustName":"customer1", "PartNum":"part1", "delKey":"0003", "qty":"10", "memo":"blah3"},
{"CustName":"customer2", "PartNum":"part3", "delKey":"0004", "qty":"20", "memo":"blah4"},
{"CustName":"customer2", "PartNum":"part3", "delKey":"0005", "qty":"20", "memo":"blah5"},
{"CustName":"customer3", "PartNum":"partXYZ", "delKey":"0006", "qty":"50", "memo":"blah6"},
{"CustName":"customer3", "PartNum":"partABC", "delKey":"0007", "qty":"100", "memo":"blah7"}]
The output I want is grouped according to specific keys
dataOut = [
{"CustName":"customer1", "Parts":[
{"PartNum":"part1", "deliveries":[
{"delKey":"0001", "qty":"10", "memo":"blah1"},
{"delKey":"0002", "qty":"10", "memo":"blah2"},
{"delKey":"0003", "qty":"10", "memo":"blah3"}]}]},
{"CustName":"customer2", "Parts":[
{"PartNum":"part3", "deliveries":[
{"delKey":"0004", "qty":"20", "memo":"blah4"},
{"delKey":"0005", "qty":"20", "memo":"blah5"}]}]},
{"CustName":"customer3", "Parts":[
{"PartNum":"partXYZ", "deliveries":[
{"delKey":"0006", "qty":"50", "memo":"blah6"}]},
{"PartNum":"partABC", "deliveries":[
{"delKey":"0007", "qty":"100", "memo":"blah7"}]}]}]
I can get the grouping with a single level using defaultdict and list comprehension as provided by the previous question and modified slightly
d = defaultdict(list)
for item in data:
d[item['CustName']].append(item)
print([{'CustName': key, 'parts': value} for key, value in d.items()])
But I can't seem to get the second level in the output array - the grouping by the PartNum key. Through some research, I think what I need to do is use defaultdict as the type of the outer `defaultdict` like so:
d = defaultdict(defaultdict(list))
which throws errors because defaultdict returns a function, so I need to use lambda (yes?)
d = defaultdict(lambda:defaultdict(list))
for item in data:
d[item['CustName']].append(item) <----this?
My question is how to "access" the second level array in the loop and tell the "inner" defaultdict what to group on (PartNum)? The data comes to me from the database programmer and the project keeps evolving to add more and more data (keys), so I'd like this solution to be as general as possible in case more data gets thrown my way. I was hoping to be able to "chain" the defaultdicts depending on how many levels I need to go. I'm learning as I'm going, so I'm struggling trying to understand the lambda and the basics of the defaultdict type and where to go from here.

Using groupby as suggested by @Pynchia and using sorted for unordered data as suggested by @hege_hegedus:
from itertools import groupby

# groupby() only merges *consecutive* items with equal keys, so the rows are
# sorted on the same (customer, part) key before grouping.
dataOut = []
dataSorted = sorted(data, key=lambda x: (x["CustName"], x["PartNum"]))
for cust_name, cust_group in groupby(dataSorted, lambda x: x["CustName"]):
    parts = []
    # Second level: regroup this customer's rows by part number.
    for part_num, part_group in groupby(cust_group, lambda x: x["PartNum"]):
        parts.append({
            "PartNum": part_num,
            "deliveries": [{
                "delKey": delivery["delKey"],
                "memo": delivery["memo"],
                "qty": delivery["qty"],
            } for delivery in part_group],
        })
    dataOut.append({
        "CustName": cust_name,
        "Parts": parts,
    })
If you look at the second for loop, this will hopefully answer your question about accessing the second level array in the loop.

You could use a tree-like data structure based on an OrderedDefaultdict instead of a defaultdict(list). (The definition's from an unrelated answer of mine.)
from collections import OrderedDict
class OrderedDefaultdict(OrderedDict):
    """An ``OrderedDict`` that creates missing values like ``defaultdict``.

    The first positional argument, if given, is the default factory: a
    callable taking no arguments, or ``None`` to disable defaults. All
    remaining arguments are forwarded to ``OrderedDict``.

    Raises:
        TypeError: if the first positional argument is neither callable
            nor ``None``.
    """

    def __init__(self, *args, **kwargs):
        if not args:
            self.default_factory = None
        else:
            if not (args[0] is None or callable(args[0])):
                raise TypeError('first argument must be callable or None')
            self.default_factory = args[0]
            # Strip the factory so OrderedDict only sees its own arguments.
            args = args[1:]
        super(OrderedDefaultdict, self).__init__(*args, **kwargs)

    def __missing__(self, key):
        """Create, store and return the default value for a missing *key*.

        Raises:
            KeyError: if no default factory is set.
        """
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = default = self.default_factory()
        return default
def Tree():
    """Return a node whose missing keys are themselves new ``Tree`` nodes."""
    return OrderedDefaultdict(Tree)


d = Tree()
for rec in data:
    custName, partNum, delKey = rec['CustName'], rec['PartNum'], rec['delKey']
    details = {"qty": rec["qty"], "memo": rec["memo"]}
    # Every missing intermediate level is created on first access.
    d[custName]['Parts'][partNum]['deliveries'][delKey] = details
So, for the data shown in your question, d would end up containing:
d = {
"customer1": {
"Parts": {
"part1": {
"deliveries": {"0001": {"memo": "blah1", "qty": "10"},
"0002": {"memo": "blah2", "qty": "10"},
"0003": {"memo": "blah3", "qty": "10"}}}}},
"customer2": {
"Parts": {
"part3": {
"deliveries": {"0004": {"memo": "blah4", "qty": "20"},
"0005": {"memo": "blah5", "qty": "20"}}}}},
"customer3": {
"Parts": {
"partXYZ": {
"deliveries": {"0006": {"memo": "blah6", "qty": "50"}}},
"partABC": {
"deliveries": {"0007": {"memo": "blah7", "qty": "100"}}}}}
}
Which could just simply be printed out since it's now grouped the way you want.

Sort by "CustName", "PartNum", "delKey". Iterate over the delivery items for each part, for each customer and accumulate to match your output spec.
I like to use operator.itemgetter - for me it makes things clearer.
import collections
import itertools
import operator

cust_name = operator.itemgetter('CustName')
part_num = operator.itemgetter('PartNum')
group_sort = operator.itemgetter('CustName', 'PartNum', 'delKey')
del_key = operator.itemgetter('delKey')
qty = operator.itemgetter('qty')
memo = operator.itemgetter('memo')

# sort on the relevant keys (in place: this reorders the `data` list itself)
data.sort(key=group_sort)

result = []
# iterate over customers
for custname, group1 in itertools.groupby(data, cust_name):
    cust_dict = {'CustName': custname, 'Parts': []}
    # iterate over parts for this customer
    for partnum, group2 in itertools.groupby(group1, part_num):
        part_dict = {'PartNum': partnum, 'deliveries': []}
        # iterate over delivery items for this part
        for thing in group2:
            part_dict['deliveries'].append({'delKey': del_key(thing),
                                            'qty': qty(thing),
                                            'memo': memo(thing)})
        cust_dict['Parts'].append(part_dict)
    result.append(cust_dict)
This clearly iterates over the items in the original data multiple times which may be a performance hit -- but I don't see a way around multiple iteration for what you need to do.

This is the prettiest way I could do it. It uses the same defaultdict idea to implement proper grouping, because itertools.groupby only groups consecutive items and therefore needs pre-sorted data.
Note that this version will mutate the items in the input dataset, so the leaf items in the result are the same dict instances as the input, but with "CustName" and "PartNum" entries deleted.
from collections import defaultdict
def groupby_mutate(seq, key):
    """Group the dicts in *seq* by ``item[key]``, deleting *key* from each item.

    Returns a ``defaultdict(list)`` mapping every distinct key value to the
    list of items that carried it, in first-seen order. The items are the
    same dict objects as in *seq* and are modified in place.

    Raises:
        KeyError: if an item does not contain *key*.
    """
    d = defaultdict(list)
    for item in seq:
        # pop() reads the grouping value and removes it in a single step.
        d[item.pop(key)].append(item)
    return d
def your_operation(data):
    """Regroup flat shipment rows into customer -> parts -> deliveries.

    Uses groupby_mutate, so the dicts in *data* lose their "CustName" and
    "PartNum" entries and become the delivery items of the result.
    """
    return [
        {
            'CustName': CustName,
            'Parts': [
                {
                    'PartNum': PartNum,
                    'deliveries': deliveries,
                }
                for PartNum, deliveries in groupby_mutate(custItems, 'PartNum').items()
            ],
        }
        for CustName, custItems in groupby_mutate(data, 'CustName').items()
    ]
# try it
from pprint import *
data = [
{"CustName":"customer1", "PartNum":"part1", "delKey":"0001", "qty":"10", "memo":"blah1"},
{"CustName":"customer1", "PartNum":"part1", "delKey":"0002", "qty":"10", "memo":"blah2"},
{"CustName":"customer1", "PartNum":"part1", "delKey":"0003", "qty":"10", "memo":"blah3"},
{"CustName":"customer2", "PartNum":"part3", "delKey":"0004", "qty":"20", "memo":"blah4"},
{"CustName":"customer2", "PartNum":"part3", "delKey":"0005", "qty":"20", "memo":"blah5"},
{"CustName":"customer3", "PartNum":"partXYZ", "delKey":"0006", "qty":"50", "memo":"blah6"},
{"CustName":"customer3", "PartNum":"partABC", "delKey":"0007", "qty":"100", "memo":"blah7"}
]
pprint(your_operation(data))
EDIT:
Just in the case somebody needs it in the future, here is a version that does not mutate the original data:
from collections import defaultdict
def groupby_getitem(seq, key):
    """Group the items in *seq* by ``item[key]`` without modifying them.

    Returns a ``defaultdict(list)`` mapping every distinct key value to the
    list of items that carried it, in first-seen order.

    Raises:
        KeyError: if an item does not contain *key*.
    """
    d = defaultdict(list)
    for item in seq:
        d[item[key]].append(item)
    return d
def your_operation(data):
    """Regroup flat shipment rows into customer -> parts -> deliveries.

    The input dicts are left untouched: each delivery in the result is a new
    dict holding every entry of the row except the two grouping keys.
    """
    grouping_keys = ('CustName', 'PartNum')
    return [
        {
            'CustName': CustName,
            'Parts': [
                {
                    'PartNum': PartNum,
                    'deliveries': [
                        {k: v for k, v in delivery.items() if k not in grouping_keys}
                        for delivery in deliveries
                    ],
                }
                for PartNum, deliveries in groupby_getitem(custItems, 'PartNum').items()
            ],
        }
        for CustName, custItems in groupby_getitem(data, 'CustName').items()
    ]

Related

Drift management of JSON configurations by comparing with dictionary data

I am trying to write a python code for drift management, which compares the application's configuration in JSON with the predefined dictionary of key-value pairs.
Ex: Application configuration in JSON:
{
"location": "us-east-1",
"properties": [
{
"type": "t2.large",
"os": "Linux"
}
],
"sgs": {
"sgid": "x-1234"
}
}
Ex: Dictionary with desired values to compare:
{
"os": "Windows",
"location": "us-east-1"
}
Expected output:
Difference is:
{
"os": "Windows"
}
I have been trying to convert the entire JSON (including sub dicts) into a single dict without sub dicts, and then iterate over it with each values of desired dict. I am able to print all the key, values in line but couldn't convert into a dict.
Is there a better way to do this? Or any references that could help me out here?
import json
def openJsonFile(file):
    """Parse the JSON document stored at path *file* and return the result."""
    with open(file) as json_data:
        return json.load(json_data)
def recursive_iter(obj):
    """Yield every non-container value reachable from *obj*, depth-first.

    Dict entries are walked as ``(key, value)`` tuples, so each key is
    yielded as well, immediately before whatever its value yields.
    """
    if isinstance(obj, dict):
        for item in obj.items():
            yield from recursive_iter(item)
    elif isinstance(obj, (list, tuple)):
        for item in obj:
            yield from recursive_iter(item)
    else:
        yield obj
data = openJsonFile('file.json')
# Prints keys and values one per line, in document order.
for item in recursive_iter(data):
    print(item)
Expected output:
{
"location": "us-east-1",
"type": "t2.large",
"os": "Linux",
"sgid": "x-1234"
}
I think this will do what you say you want. I used the dictionary flattening code in this answer with a small modification — I changed it to not concatenate the keys of parent dictionaries with those of the nested ones since that seems to be what you want. This assumes that the keys used in all nested dictionaries are unique from one another, which in my opinion is a weakness of your approach.
You asked for references that could help you: Searching this website for related questions is often a productive way to find solutions to your own problems. This is especially true when what you want to know is something that has probably been asked before (such as how to flatten nested dictionaries).
Also note that I have written the code to closely follow the PEP 8 - Style Guide for Python Code guidelines — which I strongly suggest you read (and start following as well).
import json
desired = {
"os": "Windows",
"location": "us-east-1"
}
def read_json_file(file):
    """Parse the JSON document stored at path *file* and return the result."""
    with open(file) as json_data:
        return json.load(json_data)
def flatten(d):
    """Collapse nested dicts (and dicts inside lists) into one flat dict.

    Keys of nested dicts are NOT prefixed with their parent's key, so when
    the same key occurs at several places the one visited last wins.
    List elements that are not dicts are kept, as a list, under the key of
    the list that contained them.
    """
    out = {}
    for key, val in d.items():
        if isinstance(val, dict):
            # Treat a single nested dict like a one-element list of dicts.
            val = [val]
        if isinstance(val, list):
            plain_items = []
            for element in val:
                if isinstance(element, dict):
                    out.update(flatten(element))
                else:
                    # Non-dict elements have no keys to merge; keep them.
                    plain_items.append(element)
            if plain_items:
                out[key] = plain_items
        else:
            out[key] = val
    return out
def check_drift(desired, config):
    """Return the entries of *desired* that *config* does not satisfy.

    A key counts as drifted when it is missing from *config* or when its
    value there differs from the desired one. The returned dict maps each
    drifted key to its desired value.
    """
    drift = {}
    for key, value in desired.items():
        # A missing key is drift too; indexing directly would raise KeyError.
        if key not in config or config[key] != value:
            drift[key] = value
    return drift
if __name__ == '__main__':
    from pprint import pprint

    # Flatten the application's configuration, then diff it against `desired`.
    config = flatten(read_json_file('config.json'))
    print('Current configuration (flattened):')
    pprint(config, width=40, sort_dicts=False)

    drift = check_drift(desired, config)
    print()
    print('Drift:')
    pprint(drift, width=40, sort_dicts=False)
This is the output it produces:
Current configuration (flattened):
{'location': 'us-east-1',
'type': 't2.large',
'os': 'Linux',
'sgid': 'x-1234'}
Drift:
{'os': 'Windows'}

Querying a nested JSON file in Python without Indexing

I have the below Json file which I need to query to get the values of the keys inside 'validations' in a list
for example the column_values_not_null output will need to be this:
['lu_name', 'transaction_amount']
"validation_file_name": "ctm",
"connection_type": "s3",
"low_threshold": 500000,
"high_threshold": 1000000,
"frequency": "weekly",
"validations": [
{
"columns_to_match_ordered_list" :[
"lu_name",
"site_name",
"transaction_date_time",
"margin",
"transaction_currency_code",
"reversal_indicator_description",
"reversal_amount",
"original_amount"
]
},
{
"column_values_not_null":[
"lu_name",
"transaction_amount"
]
},
{
"column_values_not_duplicate": [
"lu_name",
"response_code_description"
]
}
]
I am able to do the below but I need to do this without using the index value
import json

# The context manager closes the file handle once the document is parsed.
with open('test.json') as f:
    json_content = json.load(f)
print(json_content['validations'][1]['column_values_not_null'])
Get a list by querying the validations key. The nested sum(..., []) calls are used to flatten the lists, so that no index is needed (as required by the condition "without using the index value", if I understood it correctly); for details about this idiom, with its pros and cons, see the docs.
data = #
def validations(data: dict, key_query: str) -> list:
    """Collect the values listed under *key_query* inside ``data['validations']``.

    ``data['validations']`` is read as a list of dicts; the list stored under
    *key_query* in every dict that has that key is appended to the result.
    Returns an empty list when there is no 'validations' entry or no dict
    contains *key_query*.
    """
    found = []
    for entry in data.get('validations', []):
        if key_query in entry:
            found.extend(entry[key_query])
    return found
print(validations(data, key_query='column_values_not_null'))
# ['lu_name', 'transaction_amount']

How to extract lists and modify them from different levels of nested dictionary

I want to extract all the lists and modify them from a nested dictionary which has multiple levels and the list can be a value in any of the levels.
For example:
test = {
'Type 1': {
'Type1_mainkey1': {
'Type1_key2': {
'Type1_key2_key1': [
'Type1_list1'
],
'Type1_key2_key2': [
'Type1_list2'
],
'Type1_key2_key2': [
'Type1_list3',
'Type1_list4'
]
}
}
},
'Type 2': {
'Type2_mainkey1': {
'Type2_key2': [
'Type2_list1',
'Type2_list2'
]},
'Type2_key3': {
'Type2_key3_key1': [
'Type2_list3',
'Type2_list4'
]
}
}
}
This is the kind of dictionary that might be present. I was wondering if there is a way of extracting the lists and updating the dictionary.
My function so far:
def find_list(data):
    """Return the first innermost dict found by descending through *data*.

    A dict with no dict values is returned as-is. Otherwise the function
    recurses into the FIRST value that is a dict and returns that result,
    so sibling branches are never visited.
    """
    if not any([isinstance(data.get(k), dict) for k in data]):
        return data
    for dkey in data:
        if isinstance(data.get(dkey), dict):
            # Returning here ends the loop: only this branch is explored.
            return find_list(data.get(dkey))
And on running this:
out = find_list(test)
My output is:
{'Type1_key2_key1': ['Type1_list1'],
'Type1_key2_key2': ['Type1_list3', 'Type1_list4']}
Whereas, the expected output is all the list items and their keys (so that I can modify the list and update)
In my example below the only thing I am doing differently is collecting the results during each recursive step and adding to previous returns. I added some inline comments to hopefully better explain.
def find_list(data):
    """Collect every non-dict entry found at any depth of *data*.

    Nested dicts are descended into recursively; every other value (the
    lists, in the example) is copied into one flat result dict under its
    own key. If a key occurs in several places, the one visited last wins.
    """
    d = {}  # initialize collection object to hold results
    for dkey, value in data.items():
        if isinstance(value, dict):
            # add results of each recursive call to the collection
            d.update(find_list(value))
        else:
            # keep values that sit next to nested dicts as well
            d[dkey] = value
    return d  # return the collection of results
Using your test dictionary with this example gives this output:
{
'Type1_key2_key1': ['Type1_list1'],
'Type1_key2_key2': ['Type1_list3', 'Type1_list4'],
'Type2_key2': ['Type2_list1', 'Type2_list2'],
'Type2_key3_key1': ['Type2_list3', 'Type2_list4']
}

Processing dictionary keys in arbitrary order

I want to transform dictionary into a string. What would be beginner-level question is complicated by few rules that I have to adhere to:
There is a list of known keys that must come out in particular, arbitrary order
Each of known keys is optional, i.e. it may not be present in dictionary
It is guaranteed that at least one of known keys will be present in dictionary
Dictionary may contain additional keys; they must come after known keys and their order is not important
I cannot make assumptions about order in which keys will be added to dictionary
What is the pythonic way of processing some dictionary keys before others?
So far, I have following function:
def format_data(input_data):
    """Render *input_data* as "Key: value" lines, known keys first.

    Known keys appear in their fixed order when present; every remaining
    key follows in the dict's own iteration order. Works on a copy, so the
    caller's data is not modified.
    """
    data = dict(input_data)
    output = []
    for key in ["title", "slug", "date", "modified", "category", "tags"]:
        if key in data:
            output.append("{}: {}".format(key.title(), data[key]))
            # Remove handled keys so only the extras are left afterwards.
            del data[key]
    for key in data:
        output.append("{}: {}".format(key.title(), data[key]))
    return "\n".join(output)
data = {
"tags": "one, two",
"slug": "post-title",
"date": "2017-02-01",
"title": "Post Title",
}
print(format_data(data))
data = {
"format": "book",
"title": "Another Post Title",
"date": "2017-02-01",
"slug": "another-post-title",
"custom": "data",
}
print(format_data(data))
Title: Post Title
Slug: post-title
Date: 2017-02-01
Tags: one, two
Title: Another Post Title
Slug: another-post-title
Date: 2017-02-01
Custom: data
Format: book
While this function does provide the expected results, it has some issues that make me think there might be a better approach. Namely, the output.append() line is duplicated, and the input data structure is copied to allow its modification without side effects.
To sum up, how can I process some keys in particular order and before other keys?
I suggest that you simply run a pair of list comprehensions: one for the desired keys, and one for the rest. Concatenate them in the desired order in bulk, rather than one at a time. This reduces the critical step to a single command to build output.
The first comprehension looks for desired keys in the dict; the second looks for any dict keys not in the "desired" list.
def format_data(input_data):
    """Render *input_data* as "Key: value" lines, known keys first.

    The first comprehension emits the known keys that are present, in their
    fixed order; the second emits every other key in the dict's own order.
    """
    data = dict(input_data)
    key_list = ["title", "slug", "date", "modified", "category", "tags"]
    output = (
        ["{}: {}".format(key.title(), data[key]) for key in key_list if key in data]
        + ["{}: {}".format(key.title(), data[key]) for key in data if key not in key_list]
    )
    return "\n".join(output)
I'd suggest list comprehensions and pop():
def format_data(input_data):
    """Render *input_data* as "Key: value" lines, known keys first.

    Known keys are popped from a private copy as they are emitted, so
    whatever is left in the copy afterwards is exactly the set of extras.
    """
    data = dict(input_data)
    keys = ["title", "slug", "date", "modified", "category", "tags"]
    output = ['{}: {}'.format(key.title(), data.pop(key)) for key in keys if key in data]
    output.extend('{}: {}'.format(key.title(), val) for key, val in data.items())
    return "\n".join(output)
To the concern about deleting during iteration - note that the iteration is over the list of keys, not the dictionary being evaluated, so I wouldn't consider that a red flag.
To completely edit, the below will take a list of primary keys (you can pass them in if you want or set it in a config file) and then it will set those in the beginning of your dictionary.
I think I see what you mean now:
Try this:
from collections import OrderedDict
data = {'aaa': 'bbbb',
'custom': 'data',
'date': '2017-02-01',
'foo': 'bar',
'format': 'book',
'slug': 'another-post-title',
'title': 'Another Post Title'}
def format_data(input_data):
    """Render *input_data* as "Key: value" lines, primary keys first.

    Primary keys that are present come first, in their fixed order; all
    other keys follow in the input's own iteration order. Keys absent from
    the input are skipped; present keys are printed whatever their value.
    """
    primary_keys = ["title", "slug", "date", "modified", "category", "tags"]
    # list(...) is needed on Python 3, where dict.keys() returns a view that
    # cannot be concatenated to a list. Primary keys occur twice in this
    # sequence; OrderedDict keeps the position of the first occurrence.
    candidate_keys = primary_keys + list(input_data)
    data = OrderedDict(
        (k, input_data[k]) for k in candidate_keys if k in input_data
    )
    return "\n".join(
        "{}: {}".format(key.title(), value) for key, value in data.items()
    )
print(format_data(data))
Title: Another Post Title
Slug: another-post-title
Date: 2017-02-01
Aaa: bbbb
Format: book
Custom: data
Foo: bar
Find the difference between the known keys and the keys in the input dictionary; Use itertools.chain to iterate over both sets of keys; catch KeyErrors for missing keys and just pass. No need to copy the input and no duplication.
import itertools
def format_data(input_data):
    """Render *input_data* as "Key: value" lines, known keys first.

    Known keys are tried in their fixed order and skipped when absent; the
    remaining keys of *input_data* follow in the input's iteration order.
    The input is neither copied nor modified.
    """
    known_keys = ["title", "slug", "date", "modified", "category", "tags"]
    # Keys of the input that are not known keys, in a deterministic order.
    xtra_keys = [key for key in input_data if key not in known_keys]
    output = []
    for key in itertools.chain(known_keys, xtra_keys):
        try:
            # Read from the parameter, not from a module-level `data`.
            value = input_data[key]
        except KeyError:
            # Optional known key that is not present: nothing to print.
            continue
        output.append("{}: {}".format(key.title(), value))
    return '\n'.join(output)
data = {"tags": "one, two",
"slug": "post-title",
"date": "2017-02-01",
"title": "Post Title",
"foo": "bar"}
>>> print(format_data(data))
Title: Post Title
Slug: post-title
Date: 2017-02-01
Tags: one, two
Foo: bar
>>>

Sorting and organizing DNS records

So I have a fairly simple set of data, such as:
['test.sh','api.test.sh','blah.api.test.sh','test.com','api.test.com']
and I need to transform into a hierarchical data structure, I was thinking of doing it with a dictionary:
{ 'name':'test.sh',
'children': { 'name':'api.test.sh',
'children': { 'name':'blah.api.test.sh' }
}
},
{
'name':'test.com',
'children': { 'name':'api.test.com' }
}
And essentially for each high level name I can work my way down and perform the operations I need to do.
My question has more to do with creating a simple way to sort, match, and transform the data. I can think of a few ways to do this but I can't think of anything quite elegant. Also I'm doing this in python.
Thanks
I think this could be what you are looking for:
def sort_dns(l):
    """Arrange the DNS names in *l* into a tree of name/children dicts.

    Returns a list of ``{'name': ..., 'childrens': ...}`` dicts, one per
    name that is not a sub-domain of any other name in *l*. 'childrens' is
    the same structure built from that name's sub-domains, or None when it
    has none. An empty input gives an empty list.
    """
    names = list(l)

    def is_subdomain(name, parent):
        # Match on a label boundary so 'mytest.sh' is not filed under 'test.sh'.
        return name.endswith('.' + parent)

    to_return = []
    # Get the top-level domains: the names that are not below any other name.
    top_domains = [
        i for i in names
        if not any(is_subdomain(i, other) for other in names)
    ]
    # Now for each domain, we find its subdomains.
    for domain in top_domains:
        sub_domains = [i for i in names if is_subdomain(i, domain)]
        # Until we are at the deepest level, keep looking for sub-domains
        # and repeat the structure.
        sub_sub_domains = sort_dns(sub_domains) if sub_domains else None
        to_return.append({'name': domain, 'childrens': sub_sub_domains})
    return to_return
As you can see, this function calls itself recursively, so it can go as deep as needed.
With your example, the result is the following
[
{
'name': 'test.sh',
'childrens': [
{
'name': 'api.test.sh',
'childrens': [
{'name': 'blah.api.test.sh', 'childrens': None}
]
}
]
},
{
'name': 'test.com',
'childrens': [
{'name': 'api.test.com', 'childrens': None}
]
}
]
As you can see, it handles both the case of multiple children and the case of no children at all.
Note that if you don't want the 'childrens': None entries, you can change the function to this:
def sort_dns(l):
    """Arrange the DNS names in *l* into a tree of name/children dicts.

    Returns a list with one dict per name that is not a sub-domain of any
    other name in *l*. Each dict has a 'name' entry and, only when the name
    has sub-domains, a 'childrens' entry holding the same structure built
    from them. An empty input gives an empty list.
    """
    names = list(l)

    def is_subdomain(name, parent):
        # Match on a label boundary so 'mytest.sh' is not filed under 'test.sh'.
        return name.endswith('.' + parent)

    to_return = []
    # Get the top-level domains: the names that are not below any other name.
    top_domains = [
        i for i in names
        if not any(is_subdomain(i, other) for other in names)
    ]
    # Now for each domain, we find its subdomains.
    for domain in top_domains:
        sub_domains = [i for i in names if is_subdomain(i, domain)]
        # Until we are at the deepest level, keep looking for sub-domains
        # and repeat the structure.
        sub_sub_domains = sort_dns(sub_domains)
        if sub_sub_domains:
            to_return.append({'name': domain, 'childrens': sub_sub_domains})
        else:
            to_return.append({'name': domain})
    return to_return
Note that this is Python3 Code.
EDIT: I've read roippi's answer and it works great too; his solution is surely the most pythonic. This one's advantage is that it doesn't require any imports, but you should really consider roippi's answer as the most elegant.
So, I see a proper approach to this problem happening in three steps: sort, group, format.
First, sorting your inputs to arrange them in logical groups. You can define a quick helper function to define your sort key:
def sorter(netloc):
    """Sort key for a host name: its labels from right to left.

    Comparing the reversed label lists places each name directly before
    its own sub-domains. The negated label count is the second element of
    the key.
    """
    split = netloc.split('.')
    return (split[::-1], -len(split))
And use it thusly:
data = ['test.sh','api.test.sh','blah.api.test.sh','test.com','api.test.com', 'another.com', 'sub.another.com', 'sub.sub.another.com']
#shuffling data, to show that sorting works
import random
random.shuffle(data)
sorted(data, key=sorter)
Out[14]:
['another.com',
'sub.another.com',
'sub.sub.another.com',
'test.com',
'api.test.com',
'test.sh',
'api.test.sh',
'blah.api.test.sh']
Now that everything's in the correct order, do a similar grouping operation with itertools.groupby which groups by the blah.com part of x.y.z.blah.com:
def grouper(netloc):
    """Return the last two labels of *netloc*, e.g. 'blah.com' for 'x.y.blah.com'.

    The labels are re-joined with '.', so distinct domains such as 'ab.c'
    and 'a.bc' get distinct grouping keys.
    """
    # NOTE(review): assumes the registrable domain is always the last two
    # labels; suffixes like 'co.uk' would need different handling.
    return '.'.join(netloc.split('.')[-2:])
#in-place sort, replicating sorted() call above
data.sort(key=sorter)
from itertools import groupby
[list(g) for k,g in groupby(data, grouper)]
Out[27]:
[['another.com', 'sub.another.com', 'sub.sub.another.com'],
['test.com', 'api.test.com'],
['test.sh', 'api.test.sh', 'blah.api.test.sh']]
Lastly, you need to format these groups into your desired hierarchy. Here is a quick-and-dirty implementation:
def make_hierarchy(groups):
    """Turn each group of names into a chain of nested name/children dicts.

    Every group yields one dict. The LAST name of the group becomes the
    outermost 'name'; each earlier name is nested one level deeper under
    'children'. An empty group yields an empty dict. *groups* is only read,
    never modified.
    """
    ret = []
    for group in groups:
        current = {}
        ret.append(current)
        # Walk from the last name to the first, opening one level per name.
        for depth, name in enumerate(reversed(list(group))):
            if depth:
                nxt = {}
                current['children'] = nxt
                current = nxt
            current['name'] = name
    return ret
print(json.dumps(make_hierarchy(grouped), indent=2))
[
{
"children": {
"children": {
"name": "another.com"
},
"name": "sub.another.com"
},
"name": "sub.sub.another.com"
},
{
"children": {
"name": "test.com"
},
"name": "api.test.com"
},
{
"children": {
"children": {
"name": "test.sh"
},
"name": "api.test.sh"
},
"name": "blah.api.test.sh"
}
]
This last implementation depends on a couple of assumptions, namely that there will not be any equivalent-length netlocs in a given group, i.e. sub1.example.com and sub2.example.com will never happen. Obviously you can tweak the implementation as needed.

Categories