Merging nested dictionaries by keys preserving different values - python

I have two dictionaries with the same keys, each mapping to a list of single-key nested dictionaries with different values:
d1 = {
'distilled ': [{'water': '45'}, {'vodka': '9'}, {'vinegar': '7'}, {'beer': '6'}, {'alcohol': '5'}, {'whiskey': '5'}],
'planted': [{'tree': '30'}, {'seed': '28'}, {'flower': '20'}, {'plant': '7'}, {'bomb': '4'}, {'garden': '2'}]
}
and
d2 = {
'distilled ': [{'water': '14'}, {'vinegar': '9'}, {'wine': '8'}, {'alcohol': '8'}, {'liquid': '7'}, {'whiskey': '6'}, {'beer': '5'}],
'planted ': [{'flower': '28'}, {'tree': '18'}, {'seed': '9'}, {'vegetable': '4'}, {'bush': '3'}, {'grass': '3'}, {'garden': '3'}]
}
I want to merge them in a way that preserves the values and merges only the keys in the nested dictionaries. So that the outcome would look like:
{
'distilled ': [('water', '45', '14'), ('vodka', '9'), ('vinegar', '7', '9'), ('beer', '6', '5'), ('alcohol', '5'), ('whiskey', '5'), ('wine', '8')],
'planted': [('tree', '30', '18'), ('seed', '28', '9'), ('flower', '20', '7'), ('plant', '7'), ('bomb', '4'), ('garden', '2', '3')]
}
I tried merging the two using:
d_merged = { k: [ d1[k], d2_to_compare[k] ] for k in d1 }
but in the outcome only the values of the first dictionary are presented, obviously. Do you have any ideas on how to fix this? Thank you very much in advance.
I am not sure which way to take from here. Would really appreciate any suggestions! Thanks a lot.

Using dicts that each hold only one key-value pair is not a good idea, but anyway, we can work it out like this:
d1 = {
    'distilled': [{'water': '45'}, {'vodka': '9'}, {'vinegar': '7'}, {'beer': '6'}, {'alcohol': '5'}, {'whiskey': '5'}],
    'planted': [{'tree': '30'}, {'seed': '28'}, {'flower': '20'}, {'plant': '7'}, {'bomb': '4'}, {'garden': '2'}],
}
d2 = {
    'distilled': [{'water': '14'}, {'vinegar': '9'}, {'wine': '8'}, {'alcohol': '8'}, {'liquid': '7'}, {'whiskey': '6'}, {'beer': '5'}],
    'planted': [{'flower': '28'}, {'tree': '18'}, {'seed': '9'}, {'vegetable': '4'}, {'bush': '3'}, {'grass': '3'}, {'garden': '3'}],
}


def _flatten(pairs):
    """Collapse a list of small dicts into one dict, keeping first-seen key order."""
    flat = {}
    for single in pairs:
        flat.update(single)
    return flat


# For every outer key of d1, build (name, value_in_d1, value_in_d2) tuples.
# None marks a name that is missing from that side.
d3 = {}
for k in d1:
    k1 = _flatten(d1[k])
    # An outer key absent from d2 simply contributes no values.
    k2 = _flatten(d2.get(k, []))
    # Names from d1 first, then the names only d2 has, so the order is stable.
    names = list(k1) + [name for name in k2 if name not in k1]
    d3[k] = [(name, k1.get(name), k2.get(name)) for name in names]
print(d3)

Related

How to deserialize json coming from dynamodb stream

event = event = {'Records': [{'eventID': '2339bc590c21035b84f8cc602b12c1d2', 'eventName': 'INSERT', 'eventVersion': '1.1', 'eventSource': 'aws:dynamodb', 'awsRegion': 'us-east-1', 'dynamodb': {'ApproximateCreationDateTime': 1595908037.0, 'Keys': {'id': {'S': '9'}}, 'NewImage': {'last_name': {'S': 'Hus'}, 'id': {'S': '9'}, 'age': {'S': '95'}}, 'SequenceNumber': '3100000000035684810908', 'SizeBytes': 23, 'StreamViewType': 'NEW_IMAGE'}, 'eventSourceARN': 'arn:aws:dynamodb:us-east-1:656441365658:table/glossary/stream/2020-07-28T00:26:55.462'}, {'eventID': 'bbd4073256ef3182b3c00f13ead09501', 'eventName': 'MODIFY', 'eventVersion': '1.1', 'eventSource': 'aws:dynamodb', 'awsRegion': 'us-east-1', 'dynamodb': {'ApproximateCreationDateTime': 1595908037.0, 'Keys': {'id': {'S': '2'}}, 'NewImage': {'last_name': {'S': 'JJ'}, 'id': {'S': '2'}, 'age': {'S': '5'}}, 'SequenceNumber': '3200000000035684810954', 'SizeBytes': 21, 'StreamViewType': 'NEW_IMAGE'}, 'eventSourceARN': 'arn:aws:dynamodb:us-east-1:656441365658:table/glossary/stream/2020-07-28T00:26:55.462'}, {'eventID': 'a9c90c0c4a5a4b64d0314c4557e94e28', 'eventName': 'INSERT', 'eventVersion': '1.1', 'eventSource': 'aws:dynamodb', 'awsRegion': 'us-east-1', 'dynamodb': {'ApproximateCreationDateTime': 1595908037.0, 'Keys': {'id': {'S': '10'}}, 'NewImage': {'last_name': {'S': 'Hus'}, 'id': {'S': '10'}, 'age': {'S': '95'}}, 'SequenceNumber': '3300000000035684810956', 'SizeBytes': 25, 'StreamViewType': 'NEW_IMAGE'}, 'eventSourceARN': 'arn:aws:dynamodb:us-east-1:656441365658:table/glossary/stream/2020-07-28T00:26:55.462'}, {'eventID': '288f4a424992e5917af0350b53f754dc', 'eventName': 'MODIFY', 'eventVersion': '1.1', 'eventSource': 'aws:dynamodb', 'awsRegion': 'us-east-1', 'dynamodb': {'ApproximateCreationDateTime': 1595908037.0, 'Keys': {'id': {'S': '1'}}, 'NewImage': {'last_name': {'S': 'V'}, 'id': {'S': '1'}, 'age': {'S': '2'}}, 'SequenceNumber': '3400000000035684810957', 'SizeBytes': 20, 'StreamViewType': 'NEW_IMAGE'}, 
'eventSourceARN': 'arn:aws:dynamodb:us-east-1:656441365658:table/glossary/stream/2020-07-28T00:26:55.462'}]}
The above comes from a DynamoDB stream. I need to extract some values from it.
The code is below; it returns nothing.
def deserialize(event):
data = {}
data["M"] = event
return extract_some(data)
def extract_some(event):
for key, value in list(event.items()):
if (key == "NULL"):
return None
if (key == "S" or key == "BOOL"):
return value
for record in event['Records']:
doc = deserialise(record['dynamodb']['NewImage'])
print (doc)
Expected Out
{'last_name': 'Hus', 'id': '9', 'age': '95'}
{'last_name': 'JJ', 'id': '2', 'age': '5'}
{'last_name': 'Hus', 'id': '10', 'age': '95'}
{'last_name': 'V', 'id': '1', 'age': '2'}
try this,
from pprint import pprint
# Flatten each record's typed NewImage attributes into a plain dict.
result = []
for record in event['Records']:
    plain = {}
    for name, typed in record['dynamodb']['NewImage'].items():
        if 'S' in typed:
            plain[name] = typed['S']
        elif 'BOOL' in typed:
            plain[name] = typed['BOOL']
        elif 'NULL' in typed:
            plain[name] = None
        # Attributes of any other type are left out of the flattened record.
    result.append(plain)
pprint(result)
[{'age': '95', 'id': '9', 'last_name': 'Hus'},
{'age': '5', 'id': '2', 'last_name': 'JJ'},
{'age': '95', 'id': '10', 'last_name': 'Hus'},
{'age': '2', 'id': '1', 'last_name': 'V'}]

list of dictionary: aggregate value by grouping by inner dictionary key

I have this signature:
def aggregate_by_player_id(input, playerid, fields):
By 'fields', I mean the fields to sum up, grouped by 'playerid', within the 'input'.
I call the function like this:
aggregate_by_player_id(input, 'player', ['stat1','stat3'])
Input look like this:
[{'player': '1', 'stat1': '3', 'stat2': '4', 'stat3': '5'},
{'player': '1', 'stat1': '1', 'stat2': '4', 'stat3': '1'},
{'player': '2', 'stat1': '1', 'stat2': '2', 'stat3': '3'},
{'player': '2', 'stat1': '1', 'stat2': '2', 'stat3': '1'},
{'player': '3', 'stat1': '4', 'stat2': '1', 'stat3': '6'}]
My output structure is:
nested_dic = {value_of_playerid1: {'playerid': value_of_playerid1, 'stat1': value_of_stat1, 'stat2': value_of_stat2},
value_of_playerid2: {'playerid': value_of_playerid2, 'stat2': value_of_stat2, 'stat2': value_of_stat2},
value_of_playerid3: {'playerid': value_of_playerid3, 'stat3': value_of_stat3, 'stat3': value_of_stat3}}
Hence the output should look like:
{'1': {'player': '1', 'stat1': 4, 'stat3': 6},
'2': {'player': '2', 'stat1': 2, 'stat3': 4},
'3': {'player': '3', 'stat1': 4, 'stat3': 6}}
We can use itertools.groupby for this to group on playerid and then sum values across the fields.
from itertools import groupby
from operator import itemgetter
def aggregate_by_player_id(input_, playerid, fields):
    """Sum the requested fields per player id.

    Args:
        input_: iterable of dicts; every dict must contain the ``playerid`` key.
            The records may be in any order and are not modified.
        playerid: name of the key to group on.
        fields: names of the keys to total. Values are converted with ``int``;
            a record that lacks a field contributes 0.

    Returns:
        A dict mapping each player id to ``{playerid: id, field: total, ...}``.
    """
    player = itemgetter(playerid)
    output = {}
    # groupby only merges *adjacent* records, so group over a sorted copy;
    # otherwise a later run of the same id would overwrite the earlier one.
    for k, v in groupby(sorted(input_, key=player), key=player):
        data = list(v)
        stats = {playerid: k}
        for field in fields:
            stats[field] = sum(int(d.get(field, 0)) for d in data)
        output[k] = stats
    return output
# groupby only merges adjacent records, so sort on the grouping key first.
data.sort(key=itemgetter('player'))
results = aggregate_by_player_id(data, 'player', ['stat1', 'stat3'])
{'1': {'player': '1', 'stat1': 4, 'stat3': 6},
'2': {'player': '2', 'stat1': 2, 'stat3': 4},
'3': {'player': '3', 'stat1': 4, 'stat3': 6}}
Capturing the result you're after in a single comprehension might be possible, but is likely not very readable. Here's a simple function that does the work:
data = [
{'player': '1', 'stat1': '3', 'stat2': '4', 'stat3': '5'},
{'player': '1', 'stat1': '1', 'stat2': '4', 'stat3': '1'},
{'player': '2', 'stat1': '1', 'stat2': '2', 'stat3': '3'},
{'player': '2', 'stat1': '1', 'stat2': '2', 'stat3': '1'},
{'player': '3', 'stat1': '4', 'stat2': '1', 'stat3': '6'}
]
def aggregate_dicts(ds, id_field, aggr_fields):
    """Total the ``aggr_fields`` of the dicts in ``ds``, grouped by ``id_field``.

    Returns a dict keyed by each distinct ``id_field`` value; every entry maps
    the aggregated field names to the integer sum of their values.
    """
    totals = {}
    for record in ds:
        # Start a zeroed bucket the first time an identifier shows up.
        bucket = totals.setdefault(record[id_field], dict.fromkeys(aggr_fields, 0))
        for name in aggr_fields:
            bucket[name] = bucket[name] + int(record[name])
    return totals
print(aggregate_dicts(data, 'player', ['stat1', 'stat3']))
Result:
{'1': {'stat1': 4, 'stat3': 6}, '2': {'stat1': 2, 'stat3': 4}, '3': {'stat1': 4, 'stat3': 6}}
If you want to repeat the identifier inside the dict, just add this line to the if block:
result[identifier][id_field] = identifier

Finding missing value in JSON using python

I am facing this problem: I want to separate the records that are complete from those that are not.
So I want to put a flag such as 'completed' in the JSON, as in the example output below.
This is the data that i have
data=[{'id': 'abc001',
'demo':{'gender':'1',
'job':'6',
'area':'3',
'study':'3'},
'ex_data':{'fam':'small',
'scholar':'2'}},
{'id': 'abc002',
'demo':{'gender':'1',
'edu':'6',
'qual':'3',
'living':'3'},
'ex_data':{'fam':'',
'scholar':''}},
{'id': 'abc003',
'demo':{'gender':'1',
'edu':'6',
'area':'3',
'sal':'3'}
'ex_data':{'fam':'big',
'scholar':NaN}}]
Output
How can I put the flag and also detect NaN and NULL in JSON?
Output=[{'id': 'abc001',
'completed':'yes',
'demo':{'gender':'1',
'job':'6',
'area':'3',
'study':'3'},
'ex_data':{'fam':'small',
'scholar':'2'}},
{'id': 'abc002',
'completed':'no',
'demo':{'gender':'1',
'edu':'6',
'qual':'3',
'living':'3'},
'ex_data':{'fam':'',
'scholar':''}},
{'id': 'abc003',
'completed':'no',
'demo':{'gender':'1',
'edu':'6',
'area':'3',
'sal':'3'}
'ex_data':{'fam':'big',
'scholar':NaN}}]
Something like this should work for you:
data = [
{
'id': 'abc001',
'demo': {
'gender': '1',
'job': '6',
'area': '3',
'study': '3'},
'ex_data': {'fam': 'small',
'scholar': '2'}
},
{
'id': 'abc002',
'demo': {
'gender': '1',
'edu': '6',
'qual': '3',
'living': '3'},
'ex_data': {'fam': '',
'scholar': ''}},
{
'id': 'abc003',
'demo': {
'gender': '1',
'edu': '6',
'area': '3',
'sal': '3'},
'ex_data': {'fam': 'big',
'scholar': None}
}
]
def _is_blank(value):
    """Return True for None, the empty string, or a float NaN."""
    if value is None or value == "":
        return True
    # NaN is the only float that does not compare equal to itself.
    return isinstance(value, float) and value != value


def _has_blank(mapping):
    """Return True if any value in ``mapping``, at any dict depth, is blank."""
    for value in mapping.values():
        if isinstance(value, dict):
            if _has_blank(value):
                return True
        elif _is_blank(value):
            return True
    return False


def browse_dict(dico):
    """Flag ``dico`` in place with a ``"completed"`` key.

    ``dico["completed"]`` is set to ``"no"`` when any value, in ``dico`` itself
    or in a dict nested inside it at any depth, is None, an empty string or
    NaN; otherwise it is set to ``"yes"``. Nothing is returned.
    """
    dico["completed"] = "no" if _has_blank(dico) else "yes"
for d in data:
browse_dict(d)
print(d)
Output :
{'id': 'abc001', 'demo': {'gender': '1', 'job': '6', 'area': '3', 'study': '3'}, 'ex_data': {'fam': 'small', 'scholar': '2'}, 'completed': 'yes'}
{'id': 'abc002', 'demo': {'gender': '1', 'edu': '6', 'qual': '3', 'living': '3'}, 'ex_data': {'fam': '', 'scholar': ''}, 'completed': 'no'}
{'id': 'abc003', 'demo': {'gender': '1', 'edu': '6', 'area': '3', 'sal': '3'}, 'ex_data': {'fam': 'big', 'scholar': None}, 'completed': 'no'}
Note that I changed NaN to None, because here you are most likely showing a python dictionary, not a JSON file since you are using data =
In a Python dictionary, a missing value is usually represented by None rather than NaN.
If you have to convert your JSON to a dictionary, refer to the JSON module documentation.
Also please check your dictionary syntax. You missed several commas to separate data.
You should try
The Input is
data = [{'demo': {'gender': '1', 'job': '6', 'study': '3', 'area': '3'}, 'id': 'abc001', 'ex_data': {'scholar': '2', 'fam': 'small'}}, {'demo': {'living': '3', 'gender': '1', 'qual': '3', 'edu': '6'}, 'id': 'abc002', 'ex_data': {'scholar': '', 'fam': ''}}, {'demo': {'gender': '1', 'area': '3', 'sal': '3', 'edu': '6'}, 'id': 'abc003', 'ex_data': {'scholar': None, 'fam': 'big'}}]
Also, a bare NaN is not a defined name in Python, so we have used None instead.
# Mark every record 'yes' up front, then downgrade it to "no" as soon as a
# falsy value turns up at the top level or inside a nested dict.
for item in data:
    item["completed"] = 'yes'
    for value in item.values():
        if isinstance(value, dict):
            if not all(value.values()):
                item["completed"] = "no"
        elif not value:
            item["completed"] = "no"
            break
The Output will be
data = [{'demo': {'gender': '1', 'job': '6', 'study': '3', 'area': '3'}, 'completed': 'yes', 'id': 'abc001', 'ex_data': {'scholar': '2', 'fam': 'small'}}, {'demo': {'living': '3', 'edu': '6', 'qual': '3', 'gender': '1'}, 'completed': 'no', 'id': 'abc002', 'ex_data': {'scholar': '', 'fam': ''}}, {'demo': {'edu': '6', 'gender': '1', 'sal': '3', 'area': '3'}, 'completed': 'no', 'id': 'abc003', 'ex_data': {'scholar': None, 'fam': 'big'}}]

How to remove duplicate elements of, list of dictionaries in python

I have a list of campuses:
campus = [{'id': '1', 'dlin': '1'}, {'id': '2', 'dlin': '1'},{'id': '3', 'dlin': '1'},{'id': '4', 'dlin': '2'},{'id': '5', 'dlin': '2'},{'id': '6', 'dlin': '1'}, ]
each campus belongs to a school with a unique dlin. I want to have a list in which I have some other lists, each having a few dictionaries.
I run the below code:
schools = []
for i in campus:
ls = []
for j in campus:
if i['dlin'] == j['dlin']:
ls.append(j)
# campus_copy.remove(j)
schools.append(ls)
[print(item) for item in schools]
the result is:
[{'id': '1', 'dlin': '1'}, {'id': '2', 'dlin': '1'}, {'id': '3', 'dlin': '1'}, {'id': '6', 'dlin': '1'}]
[{'id': '1', 'dlin': '1'}, {'id': '2', 'dlin': '1'}, {'id': '3', 'dlin': '1'}, {'id': '6', 'dlin': '1'}]
[{'id': '1', 'dlin': '1'}, {'id': '2', 'dlin': '1'}, {'id': '3', 'dlin': '1'}, {'id': '6', 'dlin': '1'}]
[{'id': '4', 'dlin': '2'}, {'id': '5', 'dlin': '2'}]
[{'id': '4', 'dlin': '2'}, {'id': '5', 'dlin': '2'}]
[{'id': '1', 'dlin': '1'}, {'id': '2', 'dlin': '1'}, {'id': '3', 'dlin': '1'}, {'id': '6', 'dlin': '1'}]
I have to either remove the duplicate members from schools or modify the code such that I do not get duplicates.
When I try to remove duplicates from schools, I see that dic item is not hashable so I can not do it.
Two solutions are available that are somewhat similar to my problem:
Remove duplicates from list of dictionaries within list of dictionaries
Remove duplicate dict in list in Python
However, I cannot figure out what to do?
does anybody know how to solve the problem?
what I expect to get is:
[{'id': '1', 'dlin': '1'}, {'id': '2', 'dlin': '1'}, {'id': '3', 'dlin': '1'}, {'id': '6', 'dlin': '1'}]
[{'id': '4', 'dlin': '2'}, {'id': '5', 'dlin': '2'}]
One possible solution is storing the dlin as key in dictionary (and dictionaries cannot have multiple equal keys) rather than removing duplicates explicitly afterwards:
campus = [
    {'id': '1', 'dlin': '1'},
    {'id': '2', 'dlin': '1'},
    {'id': '3', 'dlin': '1'},
    {'id': '4', 'dlin': '2'},
    {'id': '5', 'dlin': '2'},
    {'id': '6', 'dlin': '1'},
]

# Bucket the campuses by dlin; a dict key can occur only once, so each
# school ends up with exactly one list.
schools = {}
for entry in campus:
    dlin = entry['dlin']
    if dlin not in schools:
        schools[dlin] = []
    schools[dlin].append(entry)

for group in schools.values():
    print(group)
Prints:
[{'id': '1', 'dlin': '1'}, {'id': '2', 'dlin': '1'}, {'id': '3', 'dlin': '1'}, {'id': '6', 'dlin': '1'}]
[{'id': '4', 'dlin': '2'}, {'id': '5', 'dlin': '2'}]
Based on the answer of Andrej, I solved another part of the question I had and I wanted just to share it here:
My question:
I am now involved in another issue related to the previous one:
I have this list of dictionaries, each holding the information of a campus. Multiple campuses might belong to one school. I have to distinguish and cluster them based on the similarity of their names.
campus = [
{'id': '1', 'name': 'seneca - york'},
{'id': '2', 'name': 'seneca college - north gate campus'},
{'id': '3', 'name': 'humber college - toronto campus'},
{'id': '4', 'name': 'humber college'},
{'id': '5', 'name': 'humber collge - waterloo campus'},
{'id': '6', 'name': 'university of waterloo toronto campus'},
]
my expected result can be reached by this small and neat code:
# Cluster the campuses on the first four characters of their name.
schools = {}
for entry in campus:
    prefix = entry['name'][:4]
    if prefix not in schools:
        schools[prefix] = []
    schools[prefix].append(entry)
print(schools)

Formatting a python dictionary received via xmlrpc for nice output

Is there an easy way to format a dictionary in python for nice output?
I am learning how to interact with an API/XMLRPC in python at the moment. After making a request, I get a dictionary back formatted like the following:
{'category_id': '9', 'parent_id': '3', 'name': 'Headboard', 'is_active': '1', 'position': '6', 'level': '3', 'children': []}, {'category_id': '10', 'parent_id': '3', 'name': 'Mattress', 'is_active': '1', 'position': '7', 'level': '3', 'children': []},
This is a wall of text, easily a few pages. Is there an easy way to display this data nicely, or perhaps just to output the name of each category on one line?
edit:
Here is an attempt to print it via pprint, which ended up omitting a lot of the data:
import xmlrpc.client
import pprint
svc = xmlrpc.client.ServerProxy('https://example.com/api/xmlrpc/')
session = svc.login('apiuser', 'apikey')
temp = svc.call(session, 'catalog_category.tree')
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(temp)
You can use pprint.pprint:
>>> pprint([{'category_id': '9', 'parent_id': '3', 'name': 'Headboard', 'is_active': '1', 'position': '6', 'level': '3', 'children': []}, {'category_id': '10', 'parent_id': '3', 'name': 'Mattress', 'is_active': '1', 'position': '7', 'level': '3', 'children': []}])
[{'category_id': '9',
'children': [],
'is_active': '1',
'level': '3',
'name': 'Headboard',
'parent_id': '3',
'position': '6'},
{'category_id': '10',
'children': [],
'is_active': '1',
'level': '3',
'name': 'Mattress',
'parent_id': '3',
'position': '7'}]
To display just the category names you can do:
>>> [x['name'] for x in ...]
Alternatively you can use json.dump(s) + the JSON viewer of your choice (plenty of online choices available, or just your local browser).
Edit
Processing the whole tree iteratively (with an explicit stack):
import copy
t2 = copy.deepcopy(temp)  # Modify a copy for printing; the original stays intact.
# Keys to strip from every node; add whatever other keys you want to delete.
unwanted = ('category_id', 'is_active', 'level', 'position')
items = [t2]
while items:
    item = items.pop()
    for key in unwanted:
        # pop with a default tolerates nodes that lack one of the keys.
        item.pop(key, None)
    # Queue the children so every level of the tree gets the same treatment.
    items += item.get('children', [])
pprint(t2)
This will give you a list of category names:
list_of_dicts = [
    {'category_id': '9', 'parent_id': '3', 'name': 'Headboard', 'is_active': '1', 'position': '6', 'level': '3', 'children': []},
    {'category_id': '10', 'parent_id': '3', 'name': 'Mattress', 'is_active': '1', 'position': '7', 'level': '3', 'children': []},
]
# Pull the 'name' entry out of every category dict.
category_names = [category['name'] for category in list_of_dicts]
print(category_names)
OUTPUT:
['Headboard', 'Mattress']
If the data is actually a dictionary of dictionaries, such that it is in the form: { "key_1": {}, "key_2": {} ... "key_n": {} }
then the following code will create a list of the names of categories:
dict_of_dicts = {
    "key_a": {'category_id': '9', 'parent_id': '3', 'name': 'Headboard', 'is_active': '1', 'position': '6', 'level': '3', 'children': []},
    "key_b": {'category_id': '10', 'parent_id': '3', 'name': 'Mattress', 'is_active': '1', 'position': '7', 'level': '3', 'children': []},
}
# The outer keys are irrelevant here; only each inner dict's "name" is collected.
category_names = [category["name"] for category in dict_of_dicts.values()]
print(category_names)
OUTPUT:
['Headboard', 'Mattress']

Categories