Calculate average values in a nested dict of dicts - python

I have a dictionary which has the following structure;
d = {'actor1': {'salary': {'year1': 60, 'year2': 65}, 'age': 30},
'actor2': {'salary': {'year1': 20, 'year2': 30}, 'age': 17},
'actor3': {'salary': {'year1': 50, 'year2': 80}, 'age': 25}}
I want the output to be as follows;
b = {'average': {'salary': {'year1': 43.3, 'year2': 58.3}, 'age': 24}}
So the inner dict can contain values which are both numbers, or dictionaries. If it is a dictionary we are guaranteed to have the same keys for each constituent dictionary (ie : the same years will always appear in salary for each actor).
I don't have a problem finding the correct value for the age key, which can be done as follows;
# Average the top-level 'age' value across every actor in d.
actor_keys = list(d)
b = {}
b['average'] = {}
# Read the ages from the source dict d; b is the (still empty) result dict.
b['average']['age'] = np.mean([d[i]['age'] for i in actor_keys])
Is there a nice similar kind of calculation that aggregates over the keys inside salary?

You can use recursion for a more robust solution to handle input of an unknown depth:
from itertools import groupby
data = {'actor1': {'salary': {'year1': 60, 'year2': 65}, 'age': 30}, 'actor2': {'salary': {'year1': 20, 'year2': 30}, 'age': 17}, 'actor3': {'salary': {'year1': 50, 'year2': 80}, 'age': 25}}
def ave(d):
    """Recursively average a list of identically-shaped (possibly nested) dicts.

    Args:
        d: a list of dicts. For each key, the values must be either all
            numbers or all dicts (only the first value is inspected to decide).

    Returns:
        A dict with the same keys, in sorted key order. Numeric values are
        replaced by their mean rounded to one decimal place; dict values are
        averaged recursively.
    """
    # groupby only merges adjacent items, so the pairs must be sorted by key first.
    pairs = sorted(
        [item for entry in d for item in entry.items()],
        key=lambda pair: pair[0],
    )
    grouped = [
        (key, [value for _, value in group])
        for key, group in groupby(pairs, key=lambda pair: pair[0])
    ]
    return {
        key: ave(values) if isinstance(values[0], dict)
        else round(sum(values) / float(len(values)), 1)
        for key, values in grouped
    }
result = {'average':ave(list(data.values()))}
Output:
{'average': {'age': 24.0, 'salary': {'year1': 43.3, 'year2': 58.3}}}

Here is another recursive solution:
def average_dicts(dicts):
    """Average an iterable of (possibly nested) dicts into a single dict.

    Each leaf value is folded into a running mean, so the input is consumed
    in a single pass and may be any iterable (e.g. ``d.values()``).

    Args:
        dicts: iterable of dicts whose leaves are numbers and whose nested
            values are dicts.

    Returns:
        A dict with the same nesting where every leaf is the mean of the
        corresponding leaves. An empty iterable yields ``{}``.
    """
    result = {}
    for i, d in enumerate(dicts):
        for k, v in d.items():
            update_dict_average(result, k, v, i)
    return result


def update_dict_average(current, key, update, n):
    """Fold ``update`` into the running mean stored under ``current[key]``.

    Args:
        current: dict holding the means accumulated so far; mutated in place.
        key: key being updated.
        update: a number, or a dict that is descended into recursively.
        n: zero-based index of the dict being folded in, i.e. how many values
            the existing mean is treated as covering.
    """
    if isinstance(update, dict):
        subcurrent = current.setdefault(key, {})
        for subkey, subupdate in update.items():
            update_dict_average(subcurrent, subkey, subupdate, n)
    else:
        # Incremental mean: new = (old * n + x) / (n + 1). This is only a
        # true mean when the key was present in all n earlier dicts.
        current[key] = (current.get(key, 0) * n + update) / (n + 1)
d = {'actor1': {'salary': {'year1': 60, 'year2': 65}, 'age': 30},
'actor2': {'salary': {'year1': 20, 'year2': 30}, 'age': 17},
'actor3': {'salary': {'year1': 50, 'year2': 80}, 'age': 25}}
result = {'average': average_dicts(d.values())}
print(result)
# {'average': {'salary': {'year1': 43.333333333333336, 'year2': 58.333333333333336}, 'age': 24.0}}

Here's what I would do.
def avg(nums):
    """Return the mean of ``nums`` rounded to one decimal place.

    Args:
        nums: any iterable of numbers, including a generator.

    Raises:
        ZeroDivisionError: if ``nums`` is empty.
    """
    # Materialize first: a generator can only be consumed once and has no len().
    nums = list(nums)
    return round(sum(nums) / len(nums), 1)
d = {'actor1': {'salary': {'year1': 60, 'year2': 65}, 'age': 30},
'actor2': {'salary': {'year1': 20, 'year2': 30}, 'age': 17},
'actor3': {'salary': {'year1': 50, 'year2': 80}, 'age': 25}}
# Build the averaged record: one mean for 'age', one mean per salary year.
average = {'salary': {}}
average['age'] = avg(actor['age'] for actor in d.values())
# Take the year keys from the first actor's salary dict.
for year in next(iter(d.values()))['salary']:
    average['salary'][year] = avg(actor['salary'][year] for actor in d.values())
b = {'average': average}
>>> print(b)
{'average': {'salary': {'year1': 43.3, 'year2': 58.3}, 'age': 24.0}}
This can handle an arbitrary positive number of years and actors, and doesn't require itertools or numpy.

Functional approach:
import itertools
import operator
from statistics import mean

d = {'actor1': {'salary': {'year1': 60, 'year2': 65}, 'age': 30},
     'actor2': {'salary': {'year1': 20, 'year2': 30}, 'age': 17},
     'actor3': {'salary': {'year1': 50, 'year2': 80}, 'age': 25}}

# helpers
age = operator.itemgetter('age')
salary = operator.itemgetter('salary')
# Accessors for the (year, value) pairs produced by dict.items().
year = operator.itemgetter(0)
value = operator.itemgetter(1)

ages = map(age, d.values())
avg_age = mean(ages)
print(f'avg_age: {avg_age}')

# Flatten every actor's salary dict into one list of (year, value) pairs.
salaries = map(dict.items, map(salary, d.values()))
# groupby only merges adjacent items, so sort by year before grouping.
salaries = sorted(itertools.chain.from_iterable(salaries), key=year)
for key, group in itertools.groupby(salaries, year):
    avg = mean(map(value, group))
    print(f'avg for {key}: {avg}')

Here is my solution reusing what you did for age :
b = {}
b['average'] = {}
b['average']["salary"] = {"year1":np.mean([d.get(i).get('salary').get('year1') for i in d]),"year2":np.mean([d.get(i).get('salary').get('year2') for i in d])}

Related

How can i get values from 2 lists and put them into a dictionary in python

I have 2 lists
list1 = ["ben", "tim", "john", "wally"]
list2 = [18,12,34,55]
the output im looking for is this
[{'Name': 'ben', 'Age': 18, 'Name': 'tim', 'Age': 12, 'Name': 'john', 'Age': 34, 'Name': 'wally', 'Age': 55}]
As mentioned in the comments, you can't have duplicate keys in a dictionary; even your output snippet would just return [{'Name': 'wally', 'Age': 55}]
However, {k: v for k, v in zip(list1, list2)} will return
{'ben': 18, 'tim': 12, 'john': 34, 'wally': 55}
And [{'Name': n, 'Age': a} for n, a in zip(list1, list2)] will return
[{'Name': 'ben', 'Age': 18},
{'Name': 'tim', 'Age': 12},
{'Name': 'john', 'Age': 34},
{'Name': 'wally', 'Age': 55}]

Get list of dictionaries based on their key values

I have several dictionaries, let's say 5:
dict1={'Age': 20, 'Name': 'Bob'}
dict2={'Age': 10, 'Name': 'Ane'}
dict3={'Age': 40, 'Name': 'Lee'}
dict4={'Age': 50, 'Name': 'Rob'}
dict5={'Age': 30, 'Name': 'Sia'}
and
arr=[50,40,30,20,10]
Can I get a list of dictionaries based on the values of age in arr?
Desired output:
[dict4,dict3,dict5,dict1,dict2]
Try using a lambda to sort by a certain property (in this case, Age):
Code:
dict1={'Age': 20, 'Name': 'Bob'}
dict2={'Age': 10, 'Name': 'Ane'}
dict3={'Age': 40, 'Name': 'Lee'}
dict4={'Age': 50, 'Name': 'Rob'}
dict5={'Age': 30, 'Name': 'Sia'}
dicts = [dict1, dict2, dict3, dict4, dict5]
dicts.sort(reverse=True, key=lambda x: x['Age'])
print(dicts)
Output:
[{'Age': 50, 'Name': 'Rob'}, {'Age': 40, 'Name': 'Lee'}, {'Age': 30, 'Name': 'Sia'}, {'Age': 20, 'Name': 'Bob'}, {'Age': 10, 'Name': 'Ane'}]
Based on your comments:
arr = [50, 40, 30, 20, 10]
dicts = [dict1, dict2, dict3, dict4, dict5]
dicts = {d["Age"]: d for d in dicts}
dicts = [dicts[v] for v in arr]
print(dicts)
Prints:
[
{"Age": 50, "Name": "Rob"},
{"Age": 40, "Name": "Lee"},
{"Age": 30, "Name": "Sia"},
{"Age": 20, "Name": "Bob"},
{"Age": 10, "Name": "Ane"},
]
If the expected output is the dict names and not the values, you can create a mapping between Age and the dict name and iterate through arr and get the name of the dict by its age:
dict1 = {'Age': 20, 'Name': 'Bob'}
dict2 = {'Age': 10, 'Name': 'Ane'}
dict3 = {'Age': 40, 'Name': 'Lee'}
dict4 = {'Age': 50, 'Name': 'Rob'}
dict5 = {'Age': 30, 'Name': 'Sia'}
arr = [50, 40, 30, 20, 10]
age_to_dict_name = {globals()[x]['Age']: x for x in globals() if x.startswith("dict")}
expected_output = [age_to_dict_name[x] for x in arr]
print(expected_output) # ['dict4', 'dict3', 'dict5', 'dict1', 'dict2']

How to create a list of dictionaries from a dictionary with lists of different lengths

I want to create a list of dictionaries with the same index element from each list.
I have this dictionary:
d = {'name': ['bob', 'john', 'harry', 'mary'],
'age': [13, 19, 23],
'height': [164, 188],
'job': ['programmer']}
The desired output is:
d2 = [{'name': 'bob', 'age': 13, 'height': 164, 'job': 'programmer'},
{'name': 'john', 'age': 19, 'height': 188},
{'name': 'harry', 'age': 23},
{'name': 'mary'}]
I have tried something like this:
d2 = [dict(zip(d, t)) for t in zip(*d.values())]
But my output is:
d2 = [{'name': 'bob', 'age': 13, 'height': 164, 'job': 'programmer'}]
I think this is happening because the lists have different lengths.
You can use itertools.zip_longest and filter out None values:
from itertools import zip_longest
[{x: y for x, y in zip(d, t) if y is not None} for t in zip_longest(*d.values())]
# [{'name': 'bob', 'age': 13, 'height': 164, 'job': 'programmer'},
# {'name': 'john', 'age': 19, 'height': 188},
# {'name': 'harry', 'age': 23},
# {'name': 'mary'}]
You can use zip_longest here:
from itertools import zip_longest
keys = d.keys()
d2 = [
{k: v for k, v in zip(keys, vs) if v is not None}
for vs in zip_longest(*d.values())
]
If the values can be None as well, we can circumvent that by using a dummy value:
from itertools import zip_longest
keys = d.keys()
dummy = object()
d2 = [
{k: v for k, v in zip(keys, vs) if v is not dummy}
for vs in zip_longest(*d.values(), fillvalue=dummy)
]
Here the dummy is an object which we are sure that is not part of the items in d (since we construct it after we constructed d). By using an is comparison, we thus can know if that value was the "fillvalue".
This will give us:
>>> d2
[{'name': 'bob', 'age': 13, 'height': 164, 'job': 'programmer'}, {'name': 'john', 'age': 19, 'height': 188}, {'name': 'harry', 'age': 23}, {'name': 'mary'}]
A simple solution without using zip_longest, for the record:
d = {'name': ['bob', 'john', 'harry', 'mary'], 'age': [13, 19, 23], 'height': [164, 188], 'job': ['programmer']}
recordset = [{k: v[i] for k, v in d.items() if i < len(v)} for i in range(max([len(l) for l in d.values()]))]
print(recordset) # >> [{'name': 'bob', 'age': 13, 'height': 164, 'job': 'programmer'},
# {'name': 'john', 'age': 19, 'height': 188},
# {'name': 'harry', 'age': 23},
# {'name': 'mary'}]
Here is another approach :
d = {'name': ['bob', 'john', 'harry', 'mary'], 'age': [13, 19, 23], 'height': [164, 188], 'job': ['programmer']}
m = max(map(len, d.values()))
d1 = {k : (v if len(v)==m else v+['']*(m-len(v))) for k,v in d.items()}
d2 = [{k:v for k,v in zip(d, t) if v} for t in zip(*d1.values())]
print(d2)
Output :
[{'height': 164, 'age': 13, 'job': 'programmer', 'name': 'bob'}, {'height': 188, 'age': 19, 'name': 'john'}, {'age': 23, 'name': 'harry'}, {'name': 'mary'}]
Just keep everything and add this import statement:
from itertools import zip_longest as zip

Python find duplicated dicts in list and separate them with counting

I have dicts in a list, and some of the dicts are identical. I want to find the duplicated ones and add them to a new list or dictionary, along with how many duplicates each one has.
import itertools
# Compare every pair of dicts and tag each one with a 'count' key.
myListCombined = list()
for a, b in itertools.combinations(myList, 2):
    # Empty difference: every (key, value) pair of a also appears in b.
    is_equal = set(a.items()) - set(b.items())
    if len(is_equal) == 0:
        a.update(count=2)
        myListCombined.append(a)
    else:
        a.update(count=1)
        b.update(count=1)
        myListCombined.append(a)
        myListCombined.append(b)
# Keep only the last occurrence of each dict that was appended more than once.
myListCombined = [i for n, i in enumerate(myListCombined) if i not in myListCombined[n + 1:]]
This code kind of works, but only when a dict is duplicated exactly twice; a.update(count=2) won't work when there are more duplicates.
I'm also deleting the duplicated dicts in the last line after separating them, but I'm not sure if that is going to work well.
Input:
[{'name': 'Mary', 'age': 25, 'salary': 1000},
{'name': 'John', 'age': 25, 'salary': 2000},
{'name': 'George', 'age': 30, 'salary': 2500},
{'name': 'John', 'age': 25, 'salary': 2000},
{'name': 'John', 'age': 25, 'salary': 2000}]
Desired Output:
[{'name': 'Mary', 'age': 25, 'salary': 1000, 'count':1},
{'name': 'John', 'age': 25, 'salary': 2000, 'count': 3},
{'name': 'George', 'age': 30, 'salary': 2500, 'count': 1}]
You could try the following, which first converts each dictionary to a frozenset of key,value tuples (so that they are hashable as required by collections.Counter).
import collections
a = [{'a':1}, {'a':1},{'b':2}]
print(collections.Counter(map(lambda x: frozenset(x.items()),a)))
Edit to reflect your desired input/output:
from copy import deepcopy
def count_duplicate_dicts(list_of_dicts):
    """Annotate each dict with how many equal dicts the list contains.

    Args:
        list_of_dicts: list of dicts; each one is mutated in place.

    Returns:
        The same list object, with a ``'count'`` key added to every dict.
        Duplicates are not removed.
    """
    # Compare against an unmodified snapshot: adding 'count' changes equality.
    snapshot = deepcopy(list_of_dicts)
    # Compute every count before mutating anything, so a dict object that
    # appears more than once in the list is not compared after being tagged.
    counts = [snapshot.count(entry) for entry in snapshot]
    for entry, occurrences in zip(list_of_dicts, counts):
        entry['count'] = occurrences
    return list_of_dicts
x = [{'a':1},{'a':1}, {'c':3}]
print(count_duplicate_dicts(x))
If your dict data is well structured and the contents of the dict are simple data types, e.g., numbers and string, and you have following data analysis processing, I would suggest you use pandas, which provide rich functions. Here is a sample code for your case:
In [32]: data = [{'name': 'Mary', 'age': 25, 'salary': 1000},
...: {'name': 'John', 'age': 25, 'salary': 2000},
...: {'name': 'George', 'age': 30, 'salary': 2500},
...: {'name': 'John', 'age': 25, 'salary': 2000},
...: {'name': 'John', 'age': 25, 'salary': 2000}]
...:
...: df = pd.DataFrame(data)
...: df['counts'] = 1
...: df = df.groupby(df.columns.tolist()[:-1]).sum().reset_index(drop=False)
...:
In [33]: df
Out[33]:
age name salary counts
0 25 John 2000 3
1 25 Mary 1000 1
2 30 George 2500 1
In [34]: df.to_dict(orient='records')
Out[34]:
[{'age': 25, 'counts': 3, 'name': 'John', 'salary': 2000},
{'age': 25, 'counts': 1, 'name': 'Mary', 'salary': 1000},
{'age': 30, 'counts': 1, 'name': 'George', 'salary': 2500}]
The logic is:
(1) First build the DataFrame from your data
(2) The groupby function can apply an aggregate function to each group.
(3) To convert the result back to a list of dicts, call DataFrame.to_dict (here df.to_dict(orient='records'))
Pandas is a big package that takes some time to learn, but it is worth knowing. It is powerful enough to make your data analysis much faster and more elegant.
Thanks.
You can try this:
import collections
d = [{'name': 'Mary', 'age': 25, 'salary': 1000},
{'name': 'John', 'age': 25, 'salary': 2000},
{'name': 'George', 'age': 30, 'salary': 2500},
{'name': 'John', 'age': 25, 'salary': 2000},
{'name': 'John', 'age': 25, 'salary': 2000}]
count = dict(collections.Counter([i["name"] for i in d]))
a = list(set(map(tuple, [i.items() for i in d])))
final_dict = [dict(list(i)+[("count", count[dict(i)["name"]])]) for i in a]
Output:
[{'salary': 2000, 'count': 3, 'age': 25, 'name': 'John'}, {'salary': 2500, 'count': 1, 'age': 30, 'name': 'George'}, {'salary': 1000, 'count': 1, 'age': 25, 'name': 'Mary'}]
You can take the count values using collections.Counter and then rebuild the dicts after adding the count value from the Counter to each frozenset:
from collections import Counter
l = [dict(d | {('count', c)}) for d, c in Counter(frozenset(d.items())
for d in myList).items()]
print(l)
# [{'salary': 1000, 'name': 'Mary', 'age': 25, 'count': 1},
# {'name': 'John', 'salary': 2000, 'age': 25, 'count': 3},
# {'salary': 2500, 'name': 'George', 'age': 30, 'count': 1}]

Parse list into dictionaries within a dictionary Python

dataset:
id = [1,2,3]
header = ['name','attack','defense']
stats = [['John',12,30], ['Amy',32,89], ['Lisa',45,21]]
I would like to obtain an output in the form of a nested dictionary. The keys of the outer dictionary will be the id and the values will be dictionaries that contain the other data, i.e.:
dict = {
1: {'name': 'John', 'attack': 12, 'defense': 30},
2: {'name': 'Amy', 'attack': 32, 'defense': 89},
3: {'name': 'Lisa', 'attack': 45, 'defense': 21}
}
this is my current code:
dict = {}
for i in id:
next_input = {}
for index, h in enumerate (header):
for sublist in stats:
next_input[h] = sublist[index]
dict[i] = next_input
It is not working because of the last for loop. the value of the inner dictionaries are just replacing themselves until the last sublist.
How can I correct this code?
You don't need to loop over the stats sublists; using the enumerate() option you picked, you'd have to add an index to the id loop and pick the right stats:
# NOTE: `dict` and `id` shadow builtins; the names are kept to match the question's code.
dict = {}
for id_index, i in enumerate(id):
    next_input = {}
    # id_index selects this id's row in stats; index selects the column for header h.
    for index, h in enumerate(header):
        next_input[h] = stats[id_index][index]
    dict[i] = next_input
However, you can use the zip() function to pair up two lists for parallel iteration:
result = {i: dict(zip(header, stat)) for i, stat in zip(id, stats)}
This uses a dictionary comprehension to build the outer mapping from id value to corresponding stats entry. The inner dictionary is simply built from the paired headers and statistics (dict() takes a sequence of (key, value) pairs).
Demo:
>>> id = [1,2,3]
>>> header = ['name','attack','defense']
>>> stats = [['John',12,30], ['Amy',32,89], ['Lisa',45,21]]
>>> {i: dict(zip(header, stat)) for i, stat in zip(id, stats)}
{1: {'attack': 12, 'defense': 30, 'name': 'John'}, 2: {'attack': 32, 'defense': 89, 'name': 'Amy'}, 3: {'attack': 45, 'defense': 21, 'name': 'Lisa'}}
>>> from pprint import pprint
>>> pprint(_)
{1: {'attack': 12, 'defense': 30, 'name': 'John'},
2: {'attack': 32, 'defense': 89, 'name': 'Amy'},
3: {'attack': 45, 'defense': 21, 'name': 'Lisa'}}
You can try this:
id = [1,2,3]
header = ['name','attack','defense']
stats = [['John',12,30], ['Amy',32,89], ['Lisa',45,21]]
new_dict = {a:{d:c for c, d in zip(b, header)} for a, b in zip(id, stats)}
Output:
{1: {'attack': 12, 'defense': 30, 'name': 'John'}, 2: {'attack': 32, 'defense': 89, 'name': 'Amy'}, 3: {'attack': 45, 'defense': 21, 'name': 'Lisa'}}
Another zip() variation:
# Map each id to a dict pairing the headers with that id's stats row.
d = {}
for key, s in zip(id, stats):
    d[key] = dict(zip(header, s))
print(d)
The output:
{1: {'attack': 12, 'name': 'John', 'defense': 30}, 2: {'attack': 32, 'name': 'Amy', 'defense': 89}, 3: {'attack': 45, 'name': 'Lisa', 'defense': 21}}
Use zip() and a list comprehension:
>>> dict(zip(id ,[dict(zip(header,item)) for item in stats]))
{1: {'attack': 12, 'defense': 30, 'name': 'John'}, 2: {'attack': 32, 'defense': 89, 'name': 'Amy'}, 3: {'attack': 45, 'defense': 21, 'name': 'Lisa'}}
first zip every item in stats with header
>>> [dict(zip(header,item)) for item in stats]
[{'attack': 12, 'defense': 30, 'name': 'John'}, {'attack': 32, 'defense': 89, 'name': 'Amy'}, {'attack': 45, 'defense': 21, 'name': 'Lisa'}]
second zip id with the output of first
>>> zip(id,[dict(zip(header,item)) for item in stats])
[(1, {'attack': 12, 'defense': 30, 'name': 'John'}), (2, {'attack': 32, 'defense': 89, 'name': 'Amy'}), (3, {'attack': 45, 'defense': 21, 'name': 'Lisa'})]

Categories