get list of all nested keys in a json - python

I have a huge JSON document in a format something like this:
{
"Name1": {
"NNum": "11",
"Node1": {
"SubNodeA": "Thomas",
"SubNodeB": "27"
},
"Node2": {
"SubNodeA": "ZZZ",
"SubNodeD": "XXX",
"SubNodeE": "yy"
},
"Node3": {
"child1": 11,
"child2": {
"grandchild": {
"greatgrandchild1": "Rita",
"greatgrandchild2": "US"
}
}
}
}
}
The format or keys are not defined and can go to any depth
I would like to get the list of keys like
keyList= ["Name1.NNum","Name1.Node1.SubNodeA","Name1.Node1.SubNodeB","Name1.Node2.SubNodeA","Name1.Node2.SubNodeD","Name1.Node2.SubNodeE","Name1.Node3.child1","Name1.Node3.child2.grandchild.greatgrandchild1","Name1.Node3.child2.grandchild.greatgrandchild2"]
A snapshot of the code
# NOTE(review): the indentation of this snippet was lost when it was pasted, so
# it cannot run as shown. The comments below describe what the visible
# statements do and why the result differs from the expected keyList.
def extract_values(obj):
"""Pull all values of specified key from nested JSON."""
arr = []
# NOTE(review): key_list is never used in the visible code.
key_list = []
parent = ""
def extract(obj, arr,parent):
"""Recursively search for values of key in JSON tree."""
if isinstance(obj, dict):
grandparent = ""
for k, v in obj.items():
print ("k ............",k)
# The `parent` argument is overwritten here before it is ever read, so the
# path handed down by the caller is discarded at every level.
parent = grandparent
temp_parent = k
print ("parent >>>>> ",parent)
if isinstance(v, (dict, list)):
# Only the current key is passed down, not the accumulated path.
parent = temp_parent
print ("IF VALUE DICT .. parent ", parent)
extract(v, arr,parent)
else:
grandparent = parent
# Joins with "_" (the expected keyList uses "."); when parent is "" the
# stored entry starts with a leading "_".
parent = parent + "_" + temp_parent
print ("!!!! NOT DICT :).... **** parent ... ", parent)
arr.append(parent)
elif isinstance(obj, list):
for item in obj:
# NOTE(review): extract() is declared with three parameters
# (obj, arr, parent) but only two are passed here.
extract(item, arr)
#print ("arr >>>>>>>>>> ", arr)
# NOTE(review): no import of `time` is visible in this snippet; the 5-second
# sleep looks like leftover debugging - confirm before keeping it.
time.sleep(5)
return arr
results = extract(obj, arr,parent)
return results
but this does not give the expected output.
Expected Output:
keyList= ["Name1.NNum","Name1.Node1.SubNodeA","Name1.Node1.SubNodeB","Name1.Node2.SubNodeA","Name1.Node2.SubNodeD","Name1.Node2.SubNodeE","Name1.Node3.child1","Name1.Node3.child2.grandchild.greatgrandchild1","Name1.Node3.child2.grandchild.greatgrandchild2"]
Can anybody help me with this?
Thanks in advance

You can use recursion:
d = {'Name1': {'NNum': '11', 'Node1': {'SubNodeA': 'Thomas', 'SubNodeB': '27'}, 'Node2': {'SubNodeA': 'ZZZ', 'SubNodeD': 'XXX', 'SubNodeE': 'yy'}, 'Node3': {'child1': 11, 'child2': {'grandchild': {'greatgrandchild1': 'Rita', 'greatgrandchild2': 'US'}}}}}
def keys(d, c=None):
    """Return the key path of every leaf value in the nested dict ``d``.

    Args:
        d: Dictionary to walk. Only values that are themselves ``dict``
            instances are descended into; anything else counts as a leaf.
        c: Key path (a list) leading to ``d``; omit it for the top-level call.

    Returns:
        A list of key paths, each path being a list of keys from the root
        down to one leaf. A nested dict that is empty contributes no path.
    """
    # A None sentinel replaces the shared mutable default list.
    prefix = [] if c is None else c
    paths = []
    for a, b in d.items():
        if isinstance(b, dict):
            paths.extend(keys(b, prefix + [a]))
        else:
            paths.append(prefix + [a])
    return paths
result = list(map('.'.join, keys(d)))
Output:
['Name1.NNum', 'Name1.Node1.SubNodeA', 'Name1.Node1.SubNodeB', 'Name1.Node2.SubNodeA', 'Name1.Node2.SubNodeD', 'Name1.Node2.SubNodeE', 'Name1.Node3.child1', 'Name1.Node3.child2.grandchild.greatgrandchild1', 'Name1.Node3.child2.grandchild.greatgrandchild2']

def getKeys(object, prev_key=None, keys=None):
    """Return the dotted key path of every leaf in a nested dictionary.

    Args:
        object: The value to inspect. A ``dict`` is walked recursively; any
            other value is treated as a leaf.
        prev_key: Dotted path leading to ``object`` (``None`` at the root).
        keys: List that receives ``prev_key`` when ``object`` is a leaf. A
            fresh list is created when omitted, so repeated calls no longer
            share (and keep growing) one default list.

    Returns:
        For a dict, a new list of dotted paths, one per leaf. For a leaf,
        ``keys`` with ``prev_key`` appended.
    """
    if keys is None:
        keys = []
    if not isinstance(object, dict):
        keys.append(prev_key)
        return keys
    new_keys = []
    for k, v in object.items():
        new_key = k if prev_key is None else "{}.{}".format(prev_key, k)
        # Each child gets its own accumulator; results are merged here.
        new_keys.extend(getKeys(v, new_key, []))
    return new_keys
This solution assumes that the inner types that might have children are dictionaries.

You can do simple recursion:
d = {
"Name1": {
"NNum": "11",
"Node1": {
"SubNodeA": "Thomas",
"SubNodeB": "27"
},
"Node2": {
"SubNodeA": "ZZZ",
"SubNodeD": "XXX",
"SubNodeE": "yy"
},
"Node3": {
"child1": 11,
"child2": {
"grandchild": {
"greatgrandchild1": "Rita",
"greatgrandchild2": "US"
}
}
}
}
}
def get_keys(d, curr_key=None):
    """Yield the dotted key path of every leaf in a nested dictionary.

    Dicts are walked recursively. For a list value, every dict item is walked
    under the list's own key; if the list holds any item that is not a dict
    (for example plain strings or numbers), the list's key is yielded once as
    a leaf instead of failing on the missing ``.items()``. An empty list
    yields nothing.

    Args:
        d: Dictionary to walk.
        curr_key: List of keys leading to ``d``; omit for the top-level call.

    Yields:
        ``'.'``-joined key paths (keys are expected to be strings).
    """
    # A None sentinel replaces the shared mutable default list.
    prefix = [] if curr_key is None else curr_key
    for k, v in d.items():
        path = prefix + [k]
        if isinstance(v, dict):
            yield from get_keys(v, path)
        elif isinstance(v, list):
            if any(not isinstance(item, dict) for item in v):
                # The list carries plain values, so the key itself is a leaf.
                yield '.'.join(path)
            for item in v:
                if isinstance(item, dict):
                    yield from get_keys(item, path)
        else:
            yield '.'.join(path)
print([*get_keys(d)])
Prints:
['Name1.NNum', 'Name1.Node1.SubNodeA', 'Name1.Node1.SubNodeB', 'Name1.Node2.SubNodeA', 'Name1.Node2.SubNodeD', 'Name1.Node2.SubNodeE', 'Name1.Node3.child1', 'Name1.Node3.child2.grandchild.greatgrandchild1', 'Name1.Node3.child2.grandchild.greatgrandchild2']

What about this?
# Mapping lives in collections.abc; the old alias in ``collections`` was
# removed in Python 3.10.
from collections.abc import Mapping


def extract_paths(base_path, dd):
    """Return the dotted path of every leaf value in a nested mapping.

    Args:
        base_path: Dotted path leading to ``dd``; pass ``''`` at the root.
        dd: Mapping to walk. Values that are themselves mappings are
            descended into; everything else is a leaf.

    Returns:
        A list of ``'.'``-joined key paths, in iteration order.
    """
    new_paths = []
    for key, value in dd.items():
        # Only insert the separator once there is something to separate.
        new_path = base_path + ('.' if base_path else '') + key
        if isinstance(value, Mapping):
            new_paths.extend(extract_paths(new_path, value))
        else:
            new_paths.append(new_path)
    return new_paths
extract_paths('', your_dict)

Use isinstance to check whether each value is a dict. If it is, append its key to the path and recurse into it; otherwise yield the completed path.
def print_nested_keys(dic, path=''):
    """Yield the dotted key path of every leaf value in a nested dict.

    Args:
        dic: Dictionary to walk; values that are dicts are descended into.
        path: Dotted prefix (ending in ``'.'``) leading to ``dic``; leave it
            empty for the top-level call.

    Yields:
        One ``'.'``-joined path per leaf value.
    """
    for k, v in dic.items():
        # Build each child's path from the unchanged prefix. Accumulating
        # into `path` inside the loop would leak earlier sibling keys into
        # every later path.
        if isinstance(v, dict):
            yield from print_nested_keys(v, path + k + ".")
        else:
            yield path + k


# Output:
# >>> [*print_nested_keys(d)]  # Here, d is your nested dictionary
# ['Name1.NNum',
#  'Name1.Node1.SubNodeA',
#  'Name1.Node1.SubNodeB',
#  'Name1.Node2.SubNodeA',
#  'Name1.Node2.SubNodeD',
#  'Name1.Node2.SubNodeE',
#  'Name1.Node3.child1',
#  'Name1.Node3.child2.grandchild.greatgrandchild1',
#  'Name1.Node3.child2.grandchild.greatgrandchild2']

Related

Get key values for certain fields in JSON response

My json data would look like this:
{
"a":1,
"b":[
{
"c":2,
"d":{
"e":3
},
"f":{
"g":4
},
"h":[
{
"i":5
},
{
"j":6
}
]
}
]
}
Is there a way I can get values for certain fields in the response along with their keys. So from this response, the fields for which I expect values are a, c,e,g,i,j along with the respective keys.
Eg: [a:1,c:2,e:3,g:4,i:5,j:6]. Could this be done?
My response contained something like:
{
"a":1,
"b":[
{
"c":2,
"d":{
"e":3
},
"f":{
"g":4,
"k":[
"l","m"]
},
"h":[
{
"i":5
},
{
"j":6
}
]
}
]
}
This resulted in an error, so I made the following fix:
def get_key_value(dct, res_dct, lst):
    """Collect every scalar key/value pair from a nested dict/list structure.

    Args:
        dct: Dictionary to walk. Nested dicts, and dicts inside lists, are
            walked recursively.
        res_dct: Output dict; receives ``key -> value`` for every scalar
            value found directly under a key (later duplicates overwrite).
        lst: Output list; receives a ``'key:value'`` string for every scalar
            value, including each scalar element of a list.

    Returns:
        None. Results are written into ``res_dct`` and ``lst``.
    """
    for k, v in dct.items():
        if isinstance(v, list):
            for d in v:
                if isinstance(d, dict):
                    get_key_value(d, res_dct, lst)
                else:
                    # Record the element itself. Formatting the whole list
                    # `v` here would repeat it once per element.
                    lst.append(f'{k}:{d}')
        elif isinstance(v, dict):
            get_key_value(v, res_dct, lst)
        else:
            res_dct[k] = v
            # If you want to store in 'list' you can store as string
            lst.append(f'{k}:{v}')
res_dct = {}
lst = []
get_key_value(staging_dict, res_dct, lst)
You can use a recursive function and store the key and value only when the value is neither a list nor a dict.
def get_key_value(dct, res_dct, lst):
    """Collect every scalar key/value pair from a nested dict/list structure.

    Args:
        dct: Dictionary to walk. Nested dicts, and dicts inside lists, are
            walked recursively.
        res_dct: Output dict; receives ``key -> value`` for every scalar
            value found directly under a key (later duplicates overwrite).
        lst: Output list; receives a ``'key:value'`` string for every scalar
            value, including each scalar element of a list.

    Returns:
        None. Results are written into ``res_dct`` and ``lst``.
    """
    for k, v in dct.items():
        if isinstance(v, dict):
            get_key_value(v, res_dct, lst)
        elif isinstance(v, list):
            for d in v:
                if isinstance(d, dict):
                    get_key_value(d, res_dct, lst)
                else:
                    # Lists may hold plain values (e.g. "k": ["l", "m"]).
                    # Recursing into them would fail because they have no
                    # .items(), so record them as strings instead.
                    lst.append(f'{k}:{d}')
        else:
            res_dct[k] = v
            # If you want to store in 'list' you can store as string
            lst.append(f'{k}:{v}')
res_dct = {}
lst = []
get_key_value(dct, res_dct, lst)
print(res_dct)
print(lst)
Output:
# res_dct
{'a': 1, 'c': 2, 'e': 3, 'g': 4, 'i': 5, 'j': 6}
# lst
['a:1', 'c:2', 'e:3', 'g:4', 'i:5', 'j:6']

Recursively find and return key and value from nested dictionaries python

Could someone help me with my code below?
This is originally meant to work with data in a json file but I have converted it to work with a json / dictionary variable.
Right now the get_data_value() function is working but instead of just returning the value, I would like to return a singular dict containing the key and value.
I am just not sure how to convert the item_generator function to make this possible without ruining the recursion; I found this function from an example here on stackoverflow.
def get_data_value(data,data_name):
d = data['test']
print(item_generator(d,data_name))
for _ in item_generator(d,data_name):
return (_)
def item_generator(json_input, lookup_key):
if isinstance(json_input, dict):
for key, value in json_input.items():
if key == lookup_key:
data_single_item = {key:value} # what i want to return
print(data_single_item)
yield value # only value is returned
else:
yield from item_generator(value, lookup_key)
elif isinstance(json_input, list):
for item in json_input:
yield from item_generator(item, lookup_key)
json_data = { "test": [ { "Tier1": [ { "Tier1-Main-Title-1": [ { "title": "main", "example": 400 } ] } ] }, { "Tier2": [] }, { "Tier3": [ { "Example1-Sub1": 44 } ] } ] }
print(get_data_value(json_data,'title'))
It's worth pointing out you have a bug here:
for _ in item_generator(d,data_name):
return (_)
This is an important case to be aware of, because the return statement here only returns once. Therefore, this for loop only runs for the first iteration, and only returns the first yield result - i.e. only the first occurrence of the lookup key in the json_data.
You can fix it using generator (or iterable) unpacking into a list, as below:
def get_data_value(data, data_name):
d = data['test']
return [*item_generator(d, data_name)]
def item_generator(json_input, lookup_key):
    """Yield ``{lookup_key: value}`` for every occurrence of ``lookup_key``.

    Dicts and lists are searched recursively. When a dict contains the key,
    its other values are still searched, so matches nested under sibling
    keys are not lost. The value stored under a matched key is not itself
    searched.

    Args:
        json_input: Parsed JSON data (dict, list or scalar). Scalars yield
            nothing.
        lookup_key: Key to look for.

    Yields:
        Single-entry dicts ``{lookup_key: value}``, in traversal order.
    """
    if isinstance(json_input, dict):
        for key, value in json_input.items():
            if key == lookup_key:
                yield {key: value}
            else:
                yield from item_generator(value, lookup_key)
    elif isinstance(json_input, list):
        for item in json_input:
            yield from item_generator(item, lookup_key)
json_data = {"test": [{"Tier1": [{"Tier1-Main-Title-1": [{"title": "main", "example": 400}]}]}, {"Tier2": []},
{"Tier3": [{"Example1-Sub1": 44, "title": "TEST2"}]}]}
print(get_data_value(json_data, 'title'))
Result:
[{'title': 'main'}, {'title': 'TEST2'}]
Or, if you'd prefer not to call get_data_value at all:
print(*item_generator(json_data['test'], 'title'))
Where passing the key 'test' is optional, thanks to the function being recursive by nature.
The results are separated by a single space by default, but you can control the separator by passing the sep parameter to the print function.
{'title': 'main'} {'title': 'TEST2'}
I am not sure if I am missing something, but why don't you just return what you want? Like this, for example:
def get_data_value(data,data_name):
d = data['test']
print(item_generator(d,data_name))
for _ in item_generator(d,data_name):
return (_)
def item_generator(json_input, lookup_key):
if isinstance(json_input, dict):
for key, value in json_input.items():
if key == lookup_key:
yield {key:value}
else:
yield from item_generator(value, lookup_key)
elif isinstance(json_input, list):
for item in json_input:
yield from item_generator(item, lookup_key)
json_data = { "test": [ { "Tier1": [ { "Tier1-Main-Title-1": [ { "title": "main", "example": 400 } ] } ] }, { "Tier2": [] }, { "Tier3": [ { "Example1-Sub1": 44 } ] } ] }
print(get_data_value(json_data,'title'))
Also if you have multiple instances of "title" in different sub-objects and you want all of them back maybe in a list this can also work:
def get_data_value(data, data_name):
d = data["test"]
results = []
for item in item_generator(d, data_name):
results.append(item)
return results
def item_generator(json_input, lookup_key):
if isinstance(json_input, dict):
for key, value in json_input.items():
if key == lookup_key:
yield {key: value}
else:
yield from item_generator(value, lookup_key)
elif isinstance(json_input, list):
for item in json_input:
yield from item_generator(item, lookup_key)
json_data = {
"test": [
{
"Tier1": [
{
"Tier1-Main-Title-1": [
{"title": "main", "example": 400},
{"title": "example2"},
]
}
]
},
{"Tier2": []},
{
"Tier3": [
{"Example1-Sub1": 44},
{"AnotherExample": 9856, "title": "example3"},
]
},
]
}
print(get_data_value(json_data, "title"))
This returns: [{'title': 'main'}, {'title': 'example2'}, {'title': 'example3'}]

Remove integer list keys from column headers

I have a Python script, which uses a function from a previous Stack Overflow solution.
from pandas import json_normalize
from collections.abc import MutableMapping as mm
def flatten(dictionary, parent_key=False, separator='.'):
items = []
for key, value in dictionary.items():
new_key = str(parent_key) + separator + key if parent_key else key
if isinstance(value, mm):
items.extend(flatten(value, new_key, separator).items())
elif isinstance(value, list):
for k, v in enumerate(value):
items.extend(flatten({str(k): v}, new_key).items())
else:
items.append((new_key, value))
return dict(items)
d = {
"_id" : 1,
"labelId" : [
6422
],
"levels" : [
{
"active" : "true",
"level" : 3,
"actions" : [
{
"isActive" : "true"
}]
}]
}
x = flatten(d)
x = json_normalize(x)
print(x)
Current Output:
_id labelId.0 levels.0.active levels.0.level levels.0.actions.0.isActive
0 1 6422 true 3 true
The issue I am having is the numeric keys which gets included in the column name. Is there a way I can amend my code in order to achieve my desired output?
Desired Output:
_id labelId levels.active levels.level levels.actions.isActive
0 1 6422 true 3 true
First of all, using parent_key as a bool and then assigning it a value of another type is not best practice. It works, but it can become messy. I modified the code a bit, adding a separate boolean argument, parent_key, to track whether there is a parent, and p_key, which carries the string you wanted. Here is the snippet:
from pandas import json_normalize
from collections.abc import MutableMapping as mm
def flatten(dictionary, p_key=None, parent_key=False, separator='.'):
items = []
for key, value in dictionary.items():
if parent_key:
new_key = f"{str(p_key)}{separator}{key}"
else:
new_key = p_key if p_key else key
if isinstance(value, mm):
items.extend(flatten(
dictionary=value,
p_key=new_key,
parent_key=True,
separator=separator).items())
elif isinstance(value, list):
for k, v in enumerate(value):
items.extend(flatten(
dictionary={str(k): v},
p_key=new_key,
parent_key=False,
separator=separator).items())
else:
items.append((new_key, value))
return dict(items)
d = {
"_id" : 1,
"labelId" : [
6422
],
"levels" : [
{
"active" : "true",
"level" : 3,
"actions" : [
{
"isActive" : "true"
}]
}]
}
x = flatten(d)
x = json_normalize(x)
print(x)

Flatten nested dictionary to key and joined string value

I need help with a function to flatten a nested dictionary in the following format:
dict_test = {
"id" : "5d4c2c0fd89234260ec81",
"Reference Number" : "JA-L800D-191",
"entities_discovered" : {
"OTHER_ID" : [
"L800DFAG02191"
],
"CODE_ID" : [
"160472708",
"276954773"
]
},
"label_field" : [
"ELECTRONICS",
"HDMI"
],
"numeric_field" : [
491,
492
],
}
The function I was working with flattens the dictionary to one dimension (key:value) as I want, but doesn't join the values within the same key iteration.
def flatten(d):
agg = {}
def _flatten(d, prev_key=''):
if isinstance(d, list):
for i, item in enumerate(d):
new_k = '%s.%s' % (prev_key, i) if prev_key else i
_flatten(item, prev_key=new_k)
elif isinstance(d, dict):
for k, v in d.items():
new_k = '%s.%s' % (prev_key, k) if prev_key else k
_flatten(v, prev_key=new_k)
else:
agg[prev_key] = d
_flatten(d)
return agg
My current output is:
{
"id" : "5d4c2c0fd89234260ec81",
"Reference Number" : "JA-L800D-191",
"entities_discovered.OTHER_ID.0" : "L800DFAG02191",
"entities_discovered.CODE_ID.0" : "160472708",
"entities_discovered.CODE_ID.1" : "276954773",
"label_field.0" : "ELECTRONICS",
"label_field.1" : "HDMI",
"numeric_field.0" : 491,
"numeric_field.1" : 492
}
But actually I'm looking for something like this (joining the values into the same string, separated by , or |):
{
"id" : "5d4c2c0fd89234260ec81",
"Reference Number" : "JA-L800D-191",
"OTHER_ID" : "L800DFAG02191",
"CODE_ID" : "160472708, 276954773",
"label_field" : "ELECTRONICS, HDMI",
"numeric_field" : "491, 492"
}
You can use join() built-in method to join values together.
def do():
dict_test = {
"id": "5d4c2c0fd89234260ec81",
"Reference Number": "JA-L800D-191",
"entities_discovered": {
"OTHER_ID": [
"L800DFAG02191"
],
"CODE_ID": [
"160472708",
"276954773"
]
},
"label_field": [
"ELECTRONICS",
"HDMI"
],
"numeric_field": [
491,
492
],
}
new_dict = {}
for key, value in dict_test.items():
if isinstance(value, dict):
for _key, _value in value.items():
if isinstance(_value, list):
new_dict.update({_key: ', '.join([str(item) for item in _value])})
elif isinstance(value, list):
new_dict.update({key: ', '.join([str(item) for item in value])})
else:
new_dict.update({key: value})
return new_dict
if __name__ == '__main__':
print(do())
Output:
{
'id': '5d4c2c0fd89234260ec81',
'Reference Number': 'JA-L800D-191',
'OTHER_ID': 'L800DFAG02191',
'CODE_ID': '160472708, 276954773',
'label_field': 'ELECTRONICS, HDMI',
'numeric_field': '491, 492'
}
def recursive_flatten_dict(tmp, dict_test):
for i,v in dict_test.items():
if type(v) == type({}):
recursive_flatten_dict(tmp,v)
else:
tmp[i] = v
return tmp
recursive_flatten_dict({},dict_test)
Simple recursion using a generator:
def flatten(d):
for a, b in d.items():
if isinstance(b, dict):
yield from flatten(b)
else:
yield (a, b if not isinstance(b, list) else ', '.join(map(str, b)))
print(dict(flatten(dict_test)))
Output:
{
'id': '5d4c2c0fd89234260ec81',
'Reference Number': 'JA-L800D-191',
'OTHER_ID': 'L800DFAG02191',
'CODE_ID': '160472708, 276954773',
'label_field': 'ELECTRONICS, HDMI',
'numeric_field': '491, 492'
}
def flatten(dict_test):
    """Return a flat copy of ``dict_test`` with list values joined by ``', '``.

    The lists under ``'label_field'`` and ``'numeric_field'`` are joined into
    strings, and every list inside ``'entities_discovered'`` is lifted to the
    top level under its own key, also joined. The nested
    ``'entities_discovered'`` entry itself is left out of the result. The
    input dictionary is not modified.

    Args:
        dict_test: Dictionary containing the keys ``'label_field'``,
            ``'numeric_field'`` and ``'entities_discovered'``.

    Returns:
        A new one-level dictionary.

    Raises:
        KeyError: If one of the three expected keys is missing.
    """
    nested = dict_test['entities_discovered']
    flat = {k: v for k, v in dict_test.items() if k != 'entities_discovered'}
    for key in ('label_field', 'numeric_field'):
        flat[key] = ', '.join(str(c) for c in dict_test[key])
    for key, values in nested.items():
        # str() so that non-string entries do not break the join.
        flat[key] = ', '.join(str(c) for c in values)
    return flat
The above function does the job. I hope this is what you are looking for.

How can I write a recursive python function that splits a dictionary into an array of dictionaries?

I am looking to write a recursive function:
arguments: d, dictionary
result: list of dictionaries
def expand_dictionary(d):
return []
The function recursively goes through a dictionary and flattens nested objects using an _, in addition it expands out nested lists into the array, and includes the parent label.
Think of creating a relational model from a document.
Here is an example input and output:
original_object = {
"id" : 1,
"name" : {
"first" : "Alice",
"last" : "Sample"
},
"cities" : [
{
"id" : 55,
"name" : "New York"
},
{
"id" : 60,
"name" : "Chicago"
}
],
"teachers" : [
{
"id" : 2,
"name" : "Bob",
"classes" : [
{
"id" : 13,
"name" : "math"
},
{
"id" : 16,
"name" : "spanish"
}
]
}
]
}
expected_output = [
{
"id" : 1,
"name_first" : "Alice",
"name_last" : "Sample"
},
{
"_parent_object" : "cities",
"id" : 55,
"name" : "New York"
},
{
"_parent_object" : "cities",
"id" : 60,
"name" : "Chicago"
},
{
"parent_object" :"teachers",
"id" : 2,
"name" : "Bob"
},
{
"parent_object" :"teachers_classes",
"id" : 13,
"name" : "math"
},
{
"parent_object" :"teachers_classes",
"id" : 16,
"name" : "spanish"
}
]
the code currently being used for flattening is:
def flatten_dictionary(d):
def expand(key, value):
if isinstance(value, dict):
return [ (key + '_' + k, v) for k, v in flatten_dictionary(value).items() ]
else:
#If value is null or empty array don't include it
if value is None or value == [] or value == '':
return []
return [ (key, value) ]
items = [ item for k, v in d.items() for item in expand(k, v) ]
return dict(items)
That will do
def expand_dictionary(d, name=None, l=None):
    """Split a nested document into a list of flat records.

    The record for ``d`` is added to the result first. Each item of a list
    value then becomes its own record, tagged with ``'_parent_object'`` (the
    ``_``-joined names of the enclosing list keys). Nested dict values are
    folded into the current record through ``flatten_dictionary``, which is
    expected to be defined alongside this function.

    Args:
        d: Dictionary to expand. Items of list values are expected to be
            dictionaries.
        name: Parent label of ``d``; ``None`` for the root document.
        l: List that collects the records; created on the top-level call.

    Returns:
        The list of records, root first.
    """
    obj = {}
    if l is None:
        l = [obj]
    else:
        l.append(obj)
    prefix = name + '_' if name else ''
    if prefix:
        obj['_parent_object'] = name
    # .items() works on both Python 2 and 3; iteritems() is Python 2 only.
    for i, v in d.items():
        if isinstance(v, list):
            # An explicit loop is needed: map() is lazy on Python 3, so using
            # it purely for its side effects would never expand the children.
            for x in v:
                expand_dictionary(x, prefix + i, l)
        elif isinstance(v, dict):
            obj.update(flatten_dictionary({i: v}))
        else:
            obj[i] = v
    return l
After working through it a bit, here is what I have come up with. It can probably be significantly optimized. Based on @paulo-scardine's comment, I added the parent primary key to keep the relational model. I would love to hear optimization thoughts.
def expand_dictionary(original_object, object_name, objects=None):
    """Split a nested document into a list of flat, relational-style records.

    Nested dicts are folded into their parent record with ``_``-joined keys;
    ``None``, ``''`` and ``[]`` values are dropped. Each list made up
    entirely of dicts is removed from the parent, and every item becomes its
    own record carrying ``'_meta_parent_foreign_key'`` (the parent's
    ``'id'``) and ``'_meta_object_name'`` (``<object_name>_<key>``). Lists
    containing plain values stay on the parent record unchanged. The input
    document is not modified.

    Args:
        original_object: Dictionary to expand.
        object_name: Name stored under ``'_meta_object_name'`` for this
            record and used as the prefix for child record names.
        objects: List that collects the records; created on the top-level
            call.

    Returns:
        The list of records. Child records come before their parent.

    Raises:
        KeyError: If a record owning a list of dicts has no ``'id'`` key.
    """
    if objects is None:
        objects = []

    def flatten_dictionary(dictionary):
        """Collapse nested dicts into one level, joining keys with ``_``."""
        flat = {}
        for key, value in dictionary.items():
            if isinstance(value, dict):
                for sub_key, sub_value in flatten_dictionary(value).items():
                    flat[key + '_' + sub_key] = sub_value
            elif value is None or value == [] or value == '':
                # Null and empty values are deliberately left out.
                continue
            else:
                flat[key] = value
        return flat

    root = flatten_dictionary(original_object)
    root['_meta_object_name'] = object_name

    # Iterate over a snapshot because list-valued keys are removed below.
    # After flattening no value can be a dict, so only lists need handling.
    for key, value in list(root.items()):
        if not isinstance(value, list):
            continue
        if not all(isinstance(item, dict) for item in value):
            # A list of plain values is an ordinary attribute of the record.
            continue
        del root[key]
        child_name = object_name + "_" + key
        for nested_object in value:
            # Work on a copy so the caller's nested dicts are not changed.
            child = dict(nested_object)
            child['_meta_parent_foreign_key'] = root['id']
            expand_dictionary(child, child_name, objects)

    objects.append(root)
    return objects

Categories