Merge a random number of dicts in a list - Python

The task is to create a list containing a random number of dicts (from 2 to 10).
Each dict should have a random number of keys; keys should be letters and values should
be numbers (0-100), for example: [{'a': 5, 'b': 7, 'g': 11}, {'a': 3, 'c': 35, 'g': 42}]
Then take the previously generated list of dicts and build one common dict:
if several dicts share a key, take the max value and rename the key by appending the number of the dict that holds that max value;
if a key occurs in only one dict, take it as is.
Example: {'a_1': 5, 'b': 7, 'c': 35, 'g_2': 42}
I've written the following code:
from random import randint, choice
from string import ascii_lowercase

final_dict, indexes_dict = {}, {}
rand_list = [{choice(ascii_lowercase): randint(0, 100) for i in range(len(ascii_lowercase))}
             for j in range(randint(2, 10))]

for dictionary in rand_list:
    for key, value in dictionary.items():
        if key not in final_dict:
            final_dict.update({key: value})  # add first occurrence
        else:
            if value < final_dict.get(key):
                # TODO indexes_dict.update({:})
                continue
            else:
                final_dict.update({key: value})
                # TODO indexes_dict.update({:})

for key in indexes_dict:
    final_dict[key + '_' + str(indexes_dict[key])] = final_dict.pop(key)

print(final_dict)
I only need to add some logic to keep track of the indexes for the final_dict values (I created the separate indexes_dict for that).
I'm also wondering whether there is a more Pythonic way to solve such tasks.

This approach seems completely reasonable.
I, personally, would probably go about it this way, however:
final_dict, tmp_dict = {}, {}
# Transform the list of dicts into a dict of (value, dict-number) lists.
for i, dictionary in enumerate(rand_list, 1):
    for k, v in dictionary.items():
        tmp_dict.setdefault(k, []).append((v, i))
# Now choose only the biggest one (and remember which dict it came from)
for k, v in tmp_dict.items():
    if len(v) > 1:
        value, index = max(v)
        final_dict[k + "_" + str(index)] = value
    else:
        final_dict[k] = v[0][0]
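For reference, feeding it the hard-coded example list from the question (instead of the randomly generated rand_list) should give the expected result:

rand_list = [{'a': 5, 'b': 7, 'g': 11}, {'a': 3, 'c': 35, 'g': 42}]
# ... run the two loops above ...
print(final_dict)
# {'a_1': 5, 'b': 7, 'g_2': 42, 'c': 35}  (key order may vary)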

You will need some auxiliary data structure to keep track of how many times each key occurs and which dict its maximum came from. This uses collections.defaultdict and enumerate to aid the task:
from collections import defaultdict

def merge(dicts):
    helper = defaultdict(lambda: [-1, -1, 0])  # key -> [max, index_max, count]
    for i, d in enumerate(dicts, 1):  # start indexing at 1
        for k, v in d.items():
            helper[k][2] += 1  # always increase count
            if v > helper[k][0]:
                helper[k][:2] = [v, i]  # update max and index_max
    # build result from helper data structure
    result = {}
    for k, (max_, index, count) in helper.items():
        key = k if count == 1 else "{}_{}".format(k, index)
        result[key] = max_
    return result
>>> merge([{'a': 5, 'b': 7, 'g': 11}, {'a': 3, 'c': 35, 'g': 42}])
{'a_1': 5, 'b': 7, 'g_2': 42, 'c': 35}

Related

Python: How to sum dict values with shared key

I have JSON-format key-value pairs, and I need to sum the values of the 'X' key across objects that share the same other key.
For example:
obj=[{'A': 1, 'X': 5}, {'B' : 5, 'X': 2 },{'A': 1, 'X': 8}]
If the 'A' entries above match, I would like to sum the 'X' values, i.e. 5 + 8 = 13. I want to remove the duplicate 'A' entry and keep only the summed 'X' value, so the final output looks like this:
obj=[{'A': 1, 'X': 13}, {'B' : 5, 'X': 2 }]
I have tried something like the following, but it is not working:
>>> for i in range(0, len(obj)):
...     for z in range(0, len(obj)):
...         if obj[i] == obj[z]:
...             print(obj[i]['A'])
Here's what I came up with: it sorts the list, then uses itertools.groupby to group by the first key/value pair, then builds a new dictionary from each group.
import itertools

obj = [{'A': 1, 'X': 5}, {'B': 5, 'X': 2}, {'A': 1, 'X': 8}]
sorted_list = sorted(obj, key=lambda x: next(iter(x.items())))
res = []
for key, group in itertools.groupby(sorted_list, key=lambda x: next(iter(x.items()))):
    d = next(group).copy()
    for o in group:
        d['X'] += o['X']
    res.append(d)
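With the example obj above, this should yield the desired result:

print(res)
# [{'A': 1, 'X': 13}, {'B': 5, 'X': 2}]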
Convert the key-value pairs (except for "X") to tuples, and then use each tuple as a key in a new dict to add up the values for "X". After that, it's just reformatting to get the answer.
d = dict.fromkeys(((k, v) for el in obj for k, v in el.items() if k != "X"), 0)
for k, v in d.keys():
    for item in obj:
        if item.get(k) and item[k] == v:
            d[(k, v)] += item["X"]

ans = []
for k, v in d.items():
    curr = {}
    curr[k[0]] = k[1]
    curr["X"] = v
    ans.append(curr)

ans
# [{'A': 1, 'X': 13}, {'B': 5, 'X': 2}]
If it's a large(ish) dataset pandas might provide some efficiency gains and save some of the nested iteration.
For example:
Read the obj list into a DataFrame
Only the columns need to be iterated
Create a view for each column exposing the non-null values
Append a dict containing the column value and the summed 'X' values
import pandas as pd

l = []
df = pd.DataFrame(obj, dtype=object)
for col in df:
    if col == 'X':
        continue
    tmp = df.loc[~df[col].isnull(), [col, 'X']]
    l.append({col: tmp[col].iloc[0],
              'X': tmp['X'].sum()})
Output:
[{'A': 1, 'X': 13}, {'B': 5, 'X': 2}]

Python merging dictionary of dictionaries into one dictionary by summing the value

I want to merge all the dictionaries inside a dictionary, ignoring the outer dictionary's keys and summing the values of the inner dictionaries by key.
Input:
{'first':{'a': 5}, 'second':{'a': 10}, 'third':{'b': 5, 'c': 1}}
Output:
{'a': 15, 'b': 5, 'c': 1}
I did:
def merge_dicts(large_dictionary):
    result = {}
    for name, dictionary in large_dictionary.items():
        for key, value in dictionary.items():
            if key not in result:
                result[key] = value
            else:
                result[key] += value
    return result
This works, but I don't think it's a very good (or particularly "Pythonic") way to do it.
By the way, I don't like the title I wrote. If anybody thinks of a better wording please edit.
You can sum Counters, which are a dict subclass (here d is your input dictionary):
>>> from collections import Counter
>>> sum(map(Counter, d.values()), Counter())
Counter({'a': 15, 'b': 5, 'c': 1})
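This works because adding two Counters merges them by summing per-key values, for example:

>>> Counter({'a': 5}) + Counter({'a': 10, 'b': 5})
Counter({'a': 15, 'b': 5})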
This will work (here a is the input dictionary of dictionaries):
from collections import defaultdict

values = defaultdict(int)

def combine(d, values):
    for k, v in d.items():
        values[k] += v

for v in a.values():  # a is the input dictionary of dictionaries
    combine(v, values)

print(dict(values))
Almost the same, but it's a bit shorter and I like it a little better.
def merge_dicts(large_dictionary):
    result = {}
    for d in large_dictionary.values():
        for key, value in d.items():
            result[key] = result.get(key, 0) + value
    return result
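With the input from the question, this returns the expected result (key order may vary):

>>> merge_dicts({'first': {'a': 5}, 'second': {'a': 10}, 'third': {'b': 5, 'c': 1}})
{'a': 15, 'b': 5, 'c': 1}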

Denormalize/flatten list of nested objects into dot separated key value pairs

It would have been simpler if my nested objects were dictionaries, but these are lists of dictionaries.
Example:
all_objs1 = [{
    'a': 1,
    'b': [{'ba': 2, 'bb': 3}, {'ba': 21, 'bb': 31}],
    'c': 4
}, {
    'a': 11,
    'b': [{'ba': 22, 'bb': 33, 'bc': [{'h': 1, 'e': 2}]}],
    'c': 44
}]
I expect output in the following format:
[
    {'a': 1, 'b.ba': 2, 'b.bb': 3, 'c': 4},
    {'a': 1, 'b.ba': 21, 'b.bb': 31, 'c': 4},
    {'a': 11, 'b.ba': 22, 'b.bb': 33, 'bc.h': 1, 'bc.e': 2, 'c': 44},
]
Basically, the number of flattened objects generated will be equal to (obj * depth).
With my current code:
def flatten(obj, flattened_obj, last_key=''):
    for k, v in obj.iteritems():
        if not isinstance(v, list):
            flattened_obj.update({last_key + k: v})
        else:
            last_key += k + '.'
            for nest_obj in v:
                flatten(nest_obj, flattened_obj, last_key)
            last_key = remove_last_key(last_key)

def remove_last_key(key_path):
    second_dot = key_path[:-1].rfind('.')
    if second_dot > 0:
        return key_path[:second_dot+1]
    return key_path
Output:
[
    {'a': 1, 'b.bb': 31, 'c': 4, 'b.ba': 21},
    {'a': 11, 'b.bc.e': 2, 'c': 44, 'b.bc.h': 1, 'b.bb': 33, 'b.ba': 22}
]
I am able to flatten the objects (though not accurately), but I am not able to create a new object for each nested object.
I cannot use the pandas library, as my app is deployed on App Engine.
code.py:
from itertools import product
from pprint import pprint as pp

all_objs = [{
    "a": 1,
    "b": [{"ba": 2, "bb": 3}, {"ba": 21, "bb": 31}],
    "c": 4,
    #"d": [{"da": 2}, {"da": 5}],
}, {
    "a": 11,
    "b": [{"ba": 22, "bb": 33, "bc": [{"h": 1, "e": 2}]}],
    "c": 44,
}]

def flatten_dict(obj, parent_key=None):
    base_dict = dict()
    complex_items = list()
    very_complex_items = list()
    for key, val in obj.items():
        new_key = ".".join((parent_key, key)) if parent_key is not None else key
        if isinstance(val, list):
            if len(val) > 1:
                very_complex_items.append((key, val))
            else:
                complex_items.append((key, val))
        else:
            base_dict[new_key] = val
    if not complex_items and not very_complex_items:
        return [base_dict]
    base_dicts = list()
    partial_dicts = list()
    for key, val in complex_items:
        new_key = ".".join((parent_key, key)) if parent_key is not None else key  # full key path for this list-valued key
        partial_dicts.append(flatten_dict(val[0], parent_key=new_key))
    for product_tuple in product(*tuple(partial_dicts)):
        new_base_dict = base_dict.copy()
        for new_dict in product_tuple:
            new_base_dict.update(new_dict)
        base_dicts.append(new_base_dict)
    if not very_complex_items:
        return base_dicts
    ret = list()
    very_complex_keys = [item[0] for item in very_complex_items]
    very_complex_vals = tuple([item[1] for item in very_complex_items])
    for product_tuple in product(*very_complex_vals):
        for base_dict in base_dicts:
            new_dict = base_dict.copy()
            new_items = zip(very_complex_keys, product_tuple)
            for key, val in new_items:
                new_key = ".".join((parent_key, key)) if parent_key is not None else key
                new_dict.update(flatten_dict(val, parent_key=new_key)[0])
            ret.append(new_dict)
    return ret

def main():
    flatten = list()
    for obj in all_objs:
        flatten.extend(flatten_dict(obj))
    pp(flatten)

if __name__ == "__main__":
    main()
Notes:
As expected, recursion is used
It's general: it also works for the case I mentioned in my 2nd comment (one input dict having more than one key whose value is a list with more than one element), which can be tested by uncommenting the "d" key in all_objs. Theoretically it should also support any depth
flatten_dict: takes an input dictionary and outputs a list of dictionaries (as the input dictionary might yield more than one output dictionary):
Every key having a "simple" (non-list) value goes into the output dictionar(y/ies) unchanged
At this point, a base output dictionary is complete (if the input dictionary generates more than one output dictionary, all of them share the base dictionary's keys/values; if it only generates one output dictionary, that will be the base one)
Next, the keys with "problematic" values - those that may generate more than one output dictionary - are processed (if any):
Keys having a list with a single element ("problematic") - each might generate more than one output dictionary:
Each of the values will be flattened (which might yield more than one output dictionary); the corresponding key will be used in the process
Then, the cartesian product will be computed over all the flattened dictionary lists (for the current input, there will only be one list with one element); see the short product sketch after these notes
Now, each product item needs to be in a distinct output dictionary, so the base dictionary is duplicated and updated with the keys/values of every element in the product item (for the current input, there will be only one element per product item)
The new dictionary is appended to a list
At this point, a list of base dictionaries (there might be only one) is complete. If no values consisting of lists with more than one element are present, this is the return list; otherwise everything below has to be done for each base dictionary in the list
Keys having a list with more elements ("very problematic") - each will generate more than one output dictionary:
First, the cartesian product will be computed over all such values (lists with more than one element). In the current case, since there is only one such list, each product item will only contain an element from that list
Then, for each product item element, its key will need to be established based on the list order (for the current input, the product item will only contain one element, and there will also only be one key)
Again, each product item needs to be in a distinct output dictionary, so the base dictionary is duplicated and updated with the keys/values of the flattened product item
The new dictionary is appended to the output dictionaries list
Works with Python 3 and Python 2
Might be slow (especially for big input objects), as performance was not the goal. Also, since it was built bottom-up (adding functionality as new cases were handled), it is pretty convoluted; there might be a simpler implementation that I missed
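To illustrate the cartesian-product step mentioned above, here is a minimal standalone sketch (it uses the "b" values and the commented-out "d" values from all_objs; the variable names are made up for illustration):

from itertools import product

# Two flattened-dictionary lists, one per list-valued key.
flat_b = [{"b.ba": 2, "b.bb": 3}, {"b.ba": 21, "b.bb": 31}]
flat_d = [{"d.da": 2}, {"d.da": 5}]

# Each product item picks one dictionary from every list; merging an item
# into a copy of the base dictionary gives one output dictionary.
base = {"a": 1, "c": 4}
for item in product(flat_b, flat_d):
    out = base.copy()
    for part in item:
        out.update(part)
    print(out)
# 4 output dictionaries: one per combination of a "b" element and a "d" element.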
Output:
c:\Work\Dev\StackOverflow\q046341856>c:\Work\Dev\VEnvs\py35x64_test\Scripts\python.exe code.py
[{'a': 1, 'b.ba': 2, 'b.bb': 3, 'c': 4},
{'a': 1, 'b.ba': 21, 'b.bb': 31, 'c': 4},
{'a': 11, 'b.ba': 22, 'b.bb': 33, 'b.bc.e': 2, 'b.bc.h': 1, 'c': 44}]
#EDIT0:
Made it more general (although it's not visible for the current input): values containing only one element can yield more than one output dictionary when flattened; addressed that case (before, I was only considering the 1st output dictionary and simply ignoring the rest)
Corrected a logical error that was masked by tuple unpacking combined with the cartesian product: the if not complex_items ... part
#EDIT1:
Modified the code to match a requirement change: the key in the flattened dictionary must have the full nesting path in the input dictionary
Use this code to get your desired output. It generates the output based on recursive calls.
import json
from copy import deepcopy

def flatten(final_list, all_obj, temp_dct, last_key):
    for dct in all_obj:
        deep_temp_dct = deepcopy(temp_dct)
        for k, v in dct.items():
            if isinstance(v, list):
                final_list, deep_temp_dct = flatten(final_list, v, deep_temp_dct, k)
            else:
                prefix = ""
                if last_key:
                    prefix = last_key + "."
                key = prefix + k
                deep_temp_dct[key] = v
        if deep_temp_dct not in final_list:
            final_list.append(deep_temp_dct)
    return final_list, deep_temp_dct

final_list, _ = flatten([], all_objs1, {}, "")
print json.dumps(final_list, indent=4)
let me know if it works for you.

Pythonic way to get the union of dictionaries

Is there a better or more Pythonic way to do the following?
I have a function that merges dictionaries:
def merge_dicts(a, *dict_args):
    for dictionary in dict_args:
        for k, v in dictionary.items():
            if k not in a:
                a[k] = v
    return a
Here is a sample run:
a = {'A': 1, 'B': 2}
b = {'B': 3, 'C': 4}
c = merge_dicts(a, b) # {'A': 1, 'B': 2, 'C': 4}
I am using Python 2.7.
You can use update. Since the earlier dicts have priority, you have to update in reverse order, and then update with a last:
def merge_dicts(a, *dict_args):
    d = {}
    for dictionary in reversed(dict_args):
        d.update(dictionary)
    d.update(a)
    return d
Or as a one-liner, using itertools.chain:
from itertools import chain
def merge_dicts(a, *dict_args):
    # chain (key, value) items in order of increasing priority
    return dict(chain.from_iterable(d.iteritems() for d in dict_args[::-1] + (a,)))
> merge_dicts(a, b)
{'A': 1, 'C': 4, 'B': 2}
If I may add, why not remove a from the function signature altogether:
def merge_dicts(*dict_args):
    # If you provide 0 or 1 dict, this will return
    # an empty dict or the single dict (a copy thereof) itself
    return dict(chain.from_iterable(d.iteritems() for d in dict_args[::-1]))
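As the comment notes, the degenerate cases behave sensibly (key order may vary):

>>> merge_dicts()
{}
>>> merge_dicts(a)
{'A': 1, 'B': 2}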
You don't need to check for the existence of keys in the dictionaries. Since you want to preserve the first value seen for each key, you can use a dict comprehension and loop through the list of dictionaries backward:
{k: v for d in list_of_dict[::-1] for k, v in d.items()}
Python replaces existing keys with new ones each time it encounters a duplicate, and since you are looping through the list backward, you end up with the first value for each key in your aggregated dictionary.
Based on your example (with l = [a, b]):
>>> {k: v for d in l[::-1] for k, v in d.items()}
{'A': 1, 'C': 4, 'B': 2}

converting list to dict and averaging the values of duplicates in python

I have a list:
list = [(a,1),(b,2),(a,3)]
I want to convert it to a dict where, when there is a duplicate (e.g. (a,1) and (a,3)), the values are averaged, so the dict will have just one key:value pair for that key, which in this case would be a:2.
from collections import defaultdict

l = [('a', 1), ('b', 2), ('a', 3)]
d = defaultdict(list)
for pair in l:
    d[pair[0]].append(pair[1])  # add each number to the list under the correct key
for (k, v) in d.items():
    d[k] = sum(d[k]) / len(d[k])  # re-assign the value for key k as the sum of the list elements divided by its length
So
print(d)
>>> defaultdict(<type 'list'>, {'a': 2, 'b': 2})
Or, even nicer, producing a plain dictionary in the end:
from collections import defaultdict

l = [('a', 1), ('b', 2), ('a', 3)]
temp_d = defaultdict(list)
for pair in l:
    temp_d[pair[0]].append(pair[1])
# CHANGES HERE
final = dict((k, sum(v) / len(v)) for k, v in temp_d.items())
print(final)
>>>
{'a': 2, 'b': 2}
Note that if you are using 2.x (as you are), you will need to adjust the following to force float division:
(k,sum(v)/float(len(v)))
OR
sum(d[k])/float(len(d[k]))
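As an aside (not part of the original answer), on Python 2 you can instead enable true division for the whole module, which makes the plain sum(v)/len(v) behave as it does in Python 3:

from __future__ import division  # must appear at the top of the module
# now e.g. sum([1, 3]) / len([1, 3]) == 2.0 instead of 2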
