Related
Suppose I have an unstructured nested dict as following:
{
'A_brand': {'score1': {'A': 13, 'K': 50}},
'B_brand': {'before_taste': {'score2': {'A': 43, 'D': 23}}, 'after_taste': {'score3': {'H': 36, 'J': 34}}},
'Score4': {'G': 2, 'W': 19}
}
How can I get/show information such as: which letter gets the highest score for each score?
like:
{'key':'value',
'A_brand/score1':'K',
'B_brand/before_taste/score2':'A',
'B_brand/after_taste/score3':'H',
'Score4':'W'}
What I did was the dumb way: I created a new dict, accessed each path by hand, sorted its items by value, selected the first item, and then added it to the new dict.
For example:
# Manual approach: visit each score dict by its explicit path, rank its
# items by value (highest first) and record the winning letter under the
# slash-joined path.
new_csv = {'key': 'value'}
# 'score1' sits directly under 'A_brand' in the sample data; there is no
# intermediate 'before_lunch_break' level.
first = data['A_brand']['score1']
first_new = sorted(first.items(), key=lambda x: x[1], reverse=True)
# The result key mirrors the full path, matching the desired output.
new_csv['A_brand/score1'] = first_new[0][0]
second = data['B_brand']['before_taste']['score2']
second_new = sorted(second.items(), key=lambda x: x[1], reverse=True)
new_csv['B_brand/before_taste/score2'] = second_new[0][0]
...
I'm wondering if there is any faster or automatic ways to do that?
You can use a generator with recursion:
data = {'A_brand': {'score1': {'A': 13, 'K': 50}}, 'B_brand': {'before_taste': {'score2': {'A': 43, 'D': 23}}, 'after_taste': {'score3': {'H': 36, 'J': 34}}}, 'Score4': {'G': 2, 'W': 19}}
def get_max(d, c=None):
    """Yield ``(path, best_key)`` for every innermost dict nested in *d*.

    A dict counts as innermost when none of its values is itself a dict.
    For each such dict the key with the largest value is reported, paired
    with the '/'-joined chain of keys that leads to it.

    Args:
        d: Arbitrarily nested dict.
        c: Keys already traversed (path prefix); callers normally omit it.

    Yields:
        Tuples ``(path, key_with_highest_value)``.
    """
    prefix = [] if c is None else list(c)
    for name, child in d.items():
        # A scalar sitting beside nested dicts, or an empty dict, has
        # nothing to rank; skipping it avoids AttributeError / ValueError.
        if not isinstance(child, dict) or not child:
            continue
        path = prefix + [name]
        if any(isinstance(value, dict) for value in child.values()):
            # Still at least one level to go: descend with the longer path.
            yield from get_max(child, path)
        else:
            yield '/'.join(path), max(child, key=child.get)
print(dict(get_max(data)))
Output:
{'A_brand/score1': 'K', 'B_brand/before_taste/score2': 'A', 'B_brand/after_taste/score3': 'H', 'Score4': 'W'}
The goal I want to achieve is to replace every item of the form #item_name# with the form (item_value) in the dict. I use two dicts named test1 and test2 to test my function. Here is the code:
test1={'integer_set': '{#integer_list#?}', 'integer_list': '#integer_range#(?,#integer_range#)*', 'integer_range': '#integer#(..#integer#)?', 'integer': '[+-]?\\d+'}
test2={'b': '#a#', 'f': '#e#', 'c': '#b#', 'e': '#d#', 'd': '#c#', 'g': '#f#', 'a': 'correct'}
# Question's attempt: print the mapping, rewrite its values in place so that
# '#name#' placeholders become '(definition)', then print the mapping again.
# NOTE(review): indentation was lost in this listing, so the nesting of the
# statements below is not visible; the comments assume they nest in the order
# written, with num+=1 closing each pass of the while loop.
def change(pat_dict:{str:str}):
print('Expanding: ',pat_dict)
num=0
while num<len(pat_dict):
# Reverse lookup value -> key; entries that share a value collapse to one key.
inv_pat_dict = {v: k for k, v in pat_dict.items()}
for value in pat_dict.values():
for key in pat_dict.keys():
# Bare substring test: the key is searched without its '#' delimiters, so
# 'integer' also matches inside '#integer_range#'.
if key in value:
repl='#'+key+'#'
repl2='('+pat_dict[key]+')'
# value0 is built from the loop variable `value`, not from what was last
# stored for this entry, so a later matching key replaces an earlier result.
value0=value.replace(repl,repl2)
pat_dict[inv_pat_dict[value]]=value0
num+=1
print('Result: ',pat_dict)
change(test1)
change(test2)
sometimes I can get correct result like:
Expanding: {'integer': '[+-]?\\d+', 'integer_list': '#integer_range#(?,#integer_range#)*', 'integer_set': '{#integer_list#?}', 'integer_range': '#integer#(..#integer#)?'}
Result: {'integer': '[+-]?\\d+', 'integer_list': '(([+-]?\\d+)(..([+-]?\\d+))?)(?,(([+-]?\\d+)(..([+-]?\\d+))?))*', 'integer_set': '{((([+-]?\\d+)(..([+-]?\\d+))?)(?,(([+-]?\\d+)(..([+-]?\\d+))?))*)?}', 'integer_range': '([+-]?\\d+)(..([+-]?\\d+))?'}
Expanding: {'c': '#b#', 'f': '#e#', 'e': '#d#', 'b': '#a#', 'g': '#f#', 'd': '#c#', 'a': 'correct'}
Result: {'c': '((correct))', 'f': '(((((correct)))))', 'e': '((((correct))))', 'b': '(correct)', 'g': '((((((correct))))))', 'd': '(((correct)))', 'a': 'correct'}
But most of time I get wrong results like that:
Expanding: {'integer_range': '#integer#(..#integer#)?', 'integer': '[+-]?\\d+', 'integer_set': '{#integer_list#?}', 'integer_list': '#integer_range#(?,#integer_range#)*'}
Result: {'integer_range': '([+-]?\\d+)(..([+-]?\\d+))?', 'integer': '[+-]?\\d+', 'integer_set': '{(#integer_range#(?,#integer_range#)*)?}', 'integer_list': '#integer_range#(?,#integer_range#)*'}
Expanding: {'f': '#e#', 'a': 'correct', 'd': '#c#', 'g': '#f#', 'b': '#a#', 'c': '#b#', 'e': '#d#'}
Result: {'f': '(((((correct)))))', 'a': 'correct', 'd': '(((correct)))', 'g': '((((((correct))))))', 'b': '(correct)', 'c': '((correct))', 'e': '((((correct))))'}
How could I update my code to achieve my goal?
Your problem is caused by the fact that Python dictionaries are unordered. Try using an OrderedDict instead of a dict and you should be fine. The OrderedDict works just like a normal dict but retains insertion order, at a small performance cost.
Note that while you could create an OrderedDict from a dict literal (like I did here at first), that dict would be unordered, so the ordering might not be guaranteed. Using a list of (key, value) pairs preserves the ordering in all cases.
from collections import OrderedDict
test1=OrderedDict([('integer_set', '{#integer_list#?}'), ('integer_list', '#integer_range#(?,#integer_range#)*'), ('integer_range', '#integer#(..#integer#)?'), ('integer', '[+-]?\\d+')])
test2=OrderedDict([('b', '#a#'), ('f', '#e#'), ('c', '#b#'), ('e', '#d#'), ('d', '#c#'), ('g', '#f#'), ('a', 'correct')])
def change(pat_dict: "dict[str, str]"):
    """Expand every ``#name#`` placeholder in *pat_dict* in place.

    Each occurrence of ``#name#`` inside a value is replaced by
    ``(definition)``, where ``definition`` is the current value stored under
    ``name``.  Substitution is repeated until nothing changes, so the result
    does not depend on the order of the keys.

    Args:
        pat_dict: Mapping of pattern name to pattern text; modified in place.

    Returns:
        None.  The mapping is printed before and after expansion.
    """
    print('Expanding: ', pat_dict)
    # Every round resolves at least one level of nesting, so a mapping with
    # n entries needs at most n rounds; the bound also keeps accidental
    # cyclic definitions from looping forever.
    for _ in range(len(pat_dict)):
        changed = False
        for name in list(pat_dict):
            expanded = pat_dict[name]
            for key, definition in pat_dict.items():
                # Match the delimited placeholder, not the bare key, so that
                # 'integer' does not fire on '#integer_range#'.
                placeholder = '#' + key + '#'
                if placeholder in expanded:
                    # Accumulate into `expanded` so that several placeholders
                    # in one value are all kept.
                    expanded = expanded.replace(placeholder, '(' + definition + ')')
            if expanded != pat_dict[name]:
                pat_dict[name] = expanded
                changed = True
        if not changed:
            break
    print('Result: ', pat_dict)
change(test1)
change(test2)
Try this one. Your problem is due to mutating starting dict. You need to change its copy.
test1={'integer_set': '{#integer_list#?}', 'integer_list': '#integer_range#(?,#integer_range#)*', 'integer_range': '#integer#(..#integer#)?', 'integer': '[+-]?\\d+'}
test2={'b': '#a#', 'f': '#e#', 'c': '#b#', 'e': '#d#', 'd': '#c#', 'g': '#f#', 'a': 'correct'}
def change(d):
    """Return a copy of *d* with every ``#name#`` placeholder expanded.

    For each key of *d*, every occurrence of ``#key#`` in the copied values
    is replaced by ``(value)``, using the value currently stored under that
    key in the copy.  The input mapping is left untouched.

    Args:
        d: Mapping of pattern name to pattern text.

    Returns:
        A new mapping of the same type with the placeholders substituted.
    """
    new_d = d.copy()
    for k in d:
        placeholder = '#{}#'.format(k)
        for nk in new_d:
            v = new_d[nk]
            # Test for the delimited placeholder so that a key which merely
            # appears as a substring of another name is not considered.
            if placeholder in v:
                # Only existing keys are reassigned, so the dict does not
                # change size while it is being iterated.
                new_d[nk] = v.replace(placeholder, '({})'.format(new_d[k]))
    return new_d
test1 = change(test1)
test2 = change(test2)
Let's assume the following :
sp_sample=[{"t":1434946093036,"v":54.0},{"t":1434946095013,"v":53.0},{"t":1434946096823,"v":52.0}
I wish I could get the following result :
sp_sample=[{"t":1434946093036,"v":5400.0},{"t":1434946095013,"v":5300.0},{"t":1434946096823,"v":5200.0}
In other words, I wish I could iterate through the array and multiply v by a factor of 100.
The following only performs the multiplication on the first item, ie yields 54000 :
# NOTE(review): the printed value shown further down has sp_sample as a
# 1-tuple wrapping the list, so this loop presumably runs once with i == 0
# and `a` bound to the whole list; a[i] is then only the first dictionary.
for i, a in enumerate(sp_sample):
a[i]['v'] = a[i]['v'] * 100
The sp_sample is of type tuple. Using the following yields the whole array, which is not what I expect :
print sp_sample[0]
Also, tried printing sp_sample :
print sp_sample
Which returns the following (replaced the ....... for brevity) :
([{'t': 1434946093036, 'v': 54.0}, {'t': 1434946095013, 'v': 53.0}, {'t': 1434946096823, 'v': 52.0}, {'t': 1434946098612, 'v': 52.0}, {'t': 1434946100400, 'v': 51.0}, {'t': 1434946102372, 'v': 49.0},........, {'t': 1434947987823, 'v': 15.0}, {'t': 1434947989851, 'v': 12.0}, {'t': 1434947991899, 'v': 10.0}, {'t': 1434947993744, 'v': 5.0}, {'t': 1434947995599, 'v': 0.0}, {'t': 1434947997455, 'v': 0.0}, {'t': 1434947999494, 'v': 0.0}, {'t': 1434948001542, 'v': 0.0}, {'t': 1434948003417, 'v': 0.0}, {'t': 1434948005264, 'v': 0.0}, {'t': 1434948007120, 'v': 0.0}],)
print type(sp_sample) returns: <type 'tuple'>
Simply iterate over the list and update the dictionaries as you go:
sp_sample = [{"t":1434946093036,"v":54.0},{"t":1434946095013,"v":53.0},{"t":1434946096823,"v":52.0}]
for d in sp_sample:
    # d is the dictionary stored in the list itself, so the in-place
    # multiplication is visible through sp_sample afterwards.
    d['v'] *= 100
>>> print(sp_sample)
[{'t': 1434946093036, 'v': 5400.0}, {'t': 1434946095013, 'v': 5300.0}, {'t': 1434946096823, 'v': 5200.0}]
This will bind in turn each dictionary in list (tuple?) sp_sample to d, which you then update in place. You do not need to use enumerate().
Note that you really need to multiply by 100, not 10000, to achieve the output that you have shown.
Update
sp_sample is actually a tuple with a list of dictionaries as its only item. So you need to access the list in the tuple like this:
sp_sample = ([{"t":1434946093036,"v":54.0},{"t":1434946095013,"v":53.0},{"t":1434946096823,"v":52.0}],)
for d in sp_sample[0]:  # N.B. access first item of tuple
    # The tuple's only element is the list; its dictionaries are updated
    # in place, so the tuple itself never has to be modified.
    d['v'] *= 100
>>> print(sp_sample)
([{'t': 1434946093036, 'v': 5400.0}, {'t': 1434946095013, 'v': 5300.0}, {'t': 1434946096823, 'v': 5200.0}],)
Or, since the tuple contains only a single item you could just get rid of the tuple by:
# Unwrap the single-element tuple so sp_sample names the list directly.
sp_sample = sp_sample[0]
for d in sp_sample:
    # Each d is a dictionary held by the list, so it is updated in place.
    d['v'] *= 100
If I have two lists (with the same length):
ls1 = ['a','b','c','a','d','c']
ls2 = [1,2,3,5,1,2]
I would like to get the following dictionary (sum over the values if it is the same key):
d = {'a':6,'b':2,'c':5,'d':1}
I did the following:
# Question's baseline: sum the ls2 entries that share a label in ls1, using
# one numpy lookup per distinct label.
ls1 = np.array(ls1)
ls2 = np.array(ls2)
uniqe_vals = list(set(ls1))
d = {}
for u in uniqe_vals:
# np.where compares all of ls1 against u, once for every distinct label.
ind = np.where(ls1 == u)[0]
# Builtin sum() over the ls2 entries selected by those indices.
d[u] = sum(ls2[ind])
It works fine for small data, but it is taking too long for the whole data (I have a list of size ~5 million).
Do you have any suggestions for a more efficient way to do it ?
Also with a defaultdict, but different and simpler:
from collections import defaultdict
# Missing keys start at int() == 0, so += works the first time a key is seen.
d = defaultdict(int)
# Single pass over both lists in lockstep.
for n, v in zip(ls1, ls2):
    d[n] += v
Or, as suggested:
from collections import defaultdict
try:
    # Python 2: izip pairs the lists lazily instead of building a list of
    # tuples up front.
    from itertools import izip
except ImportError:
    # Python 3 has no izip; the builtin zip is already lazy there.
    izip = zip
# Missing keys start at int() == 0, so += works the first time a key is seen.
d = defaultdict(int)
for n, v in izip(ls1, ls2):
    d[n] += v
You could try:
import numpy as np
# uni holds the distinct labels; i gives, for every element of ls1, the
# position of its label inside uni.
uni, i = np.unique(ls1, return_inverse=True)
# bincount adds up the ls2 weights that fall on the same label position.
vals = np.bincount(i, weights=ls2)
dict(zip(uni, vals))
Since you asked how to make it more efficient, I compared the time your original solution took with the version suggested in my comment (equivalent with Juergen's second solution) with 5 million random characters from a-z as keys and 5 million random values from 0-20, using my shell's time function:
~/test $ time python defdict.py
defaultdict(<type 'int'>, {'a': 381956, 'c': 383815, 'b': 378277, 'e': 384629, 'd': 383557, 'g': 381139, 'f': 386268, 'i': 383902, 'h': 385809, 'k': 385138, 'j': 384690, 'm': 388552, 'l': 384393, 'o': 384533, 'n': 385011, 'q': 385685, 'p': 386188, 's': 387132, 'r': 383886, 'u': 386176, 't': 387144, 'w': 386371, 'v': 388263, 'y': 381337, 'x': 385281, 'z': 384048})
python defdict.py 13,24s user 0,35s system 96% cpu 14,045 total
~/test $ time python original.py
{'a': 386316, 'c': 383596, 'b': 383424, 'e': 385598, 'd': 383324, 'g': 382233, 'f': 385435, 'i': 386761, 'h': 384047, 'k': 386640, 'j': 386313, 'm': 381032, 'l': 383035, 'o': 389142, 'n': 385000, 'q': 386088, 'p': 387435, 's': 385429, 'r': 384260, 'u': 385442, 't': 384793, 'w': 385052, 'v': 380830, 'y': 386500, 'x': 386871, 'z': 379870}
python original.py 14,68s user 0,38s system 96% cpu 15,529 total
So there seems to be a difference, although not a big one. To make it fairer, numpy was also imported in defdict.py.
I have a list of strings with prefix characters representing the multiplying factor for the number. So if I have data like:
data = ['101n', '100m', '100.100f']
I want to use the dictionary
prefix_dict = {'y': 'e-24', 'z': 'e-21', 'a': 'e-18', 'f': 'e-15', 'p': 'e-12',
'n': 'e-9', 'u': 'e-6', 'm': 'e-3', 'c': 'e-2', 'd': 'e-1',
'da': 'e1', 'h': 'e2', 'k': 'e3', 'M': 'e6', 'G': 'e9',
'T': 'e12', 'P': 'e15', 'E': 'e18', 'Z': 'e21', 'Y': 'e24'}
To insert their corresponding strings. When I look at the other questions similar to mine there is one character being translated into another character. Is there a way to use the translate function to translate one character into multiple characters or should I be approaching this differently?
You can use regex for this, this works for 'da' as well:
>>> data = ['101n', '100m', '100.100f', '1d', '1da']
>>> import re
>>> r = re.compile(r'([a-zA-Z]+)$')
>>> for d in data:
print r.sub(lambda m: prefix_dict.get(m.group(1), m.group(1)), d)
...
101e-9
100e-3
100.100e-15
1e-1
1e1
And a non-regex version using itertools.takewhile:
>>> from itertools import takewhile
>>> def find_suffix(s):
return ''.join(takewhile(str.isalpha, s[::-1]))[::-1]
...
>>> for d in data:
sfx = find_suffix(d)
print (d.replace(sfx, prefix_dict.get(sfx, sfx)))
...
101e-9
100e-3
100.100e-15
1e-1
1e1
Try:
# Try the longest prefixes first so that 'da' is matched before 'd' or 'a'.
# The ordering does not depend on the entry, so it is computed once.
by_length = sorted(prefix_dict.items(),
                   key=lambda x: len(x[0]), reverse=True)
for i, entry in enumerate(data):
    for key, value in by_length:
        if key in entry:
            data[i] = entry.replace(key, value)
            # Stop at the first (longest) hit: continuing would rebuild the
            # result from the untouched `entry` and overwrite this
            # replacement with that of a shorter key such as 'd' or 'a'.
            break
print(data)
This works for arbitrary combinations in the dictionary and the data. If the dictionary key is always only 1 character long, you have lots of other solutions posted here.
import re
data = ['101da', '100m', '100.100f']
prefix_dict = {'y': 'e-24', 'z': 'e-21', 'a': 'e-18', 'f': 'e-15', 'p': 'e-12',
               'n': 'e-9', 'u': 'e-6', 'm': 'e-3', 'c': 'e-2', 'd': 'e-1',
               'da': 'e1', 'h': 'e2', 'k': 'e3', 'M': 'e6', 'G': 'e9',
               'T': 'e12', 'P': 'e15', 'E': 'e18', 'Z': 'e21', 'Y': 'e24'}
# The unit prefix is the run of letters at the very end of the string.
comp = re.compile(r"[A-Za-z]+$")
for ind, d in enumerate(data):
    found = comp.search(d)
    if found is None:
        # No trailing letters: nothing to translate, keep the entry as is.
        continue
    pre = found.group()
    # Rebuild from the numeric part so that only the suffix is touched, and
    # fall back to the suffix itself when it is not a known prefix (a bare
    # .get(pre) would return None and make the string operation fail).
    data[ind] = d[:found.start()] + prefix_dict.get(pre, pre)
print(data)
['101e1', '100e-3', '100.100e-15']
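You can use pre = ''.join(x for x in d if x.isalpha()) instead of using re (taking only element [0] of the list of letters would pick up just the first letter and miss two-letter prefixes such as 'da').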