Select random element from "non uniform" python dictionary - python

I have a python dictionary in which values are lists of integers:
key1 -> [1, 2, 3]
key2 -> [1, 2, 3, ... 17]
key3 -> [1, 2, 3, 4, 5]
I want to select a random tuple(key, val) where val is a random value from the list of values (for example: key2, 8). The random selection must be uniform across all the values, so for example, this method is not uniform:
random_key = random.choice(d.keys())
random_val = random.choice(d[random_key])
because the lists are not of the same length.
I know the length of the concatenation of the lists, n, so my current approach is the following:
# Draw a flat position in [0, n), then walk every (key, value) pair in order
# until that position is reached.
idx = np.random.randint(n)  # the upper bound is exclusive
c = 0  # number of values visited so far
found = False
for k in D:
    for v in D[k]:
        if c == idx:
            found = True
            do_something_with_val(k, v)
            break
        c += 1
    if found:
        # The inner break only leaves the inner loop; stop the outer one too.
        break
My question is: is there a better/faster method of doing this?

You can try (in Python 3—for Python 2, use iteritems()):
# Pick a flat position in [0, n) and skip whole lists until the one holding
# that position is reached; only list lengths are inspected along the way.
# randrange(n) excludes n, whereas randint(0, n) can return n, which matches
# no value at all (n is the total number of values across all lists).
idx = random.randrange(n)
for k, v in D.items():
    if idx < len(v):
        do_something_with_val(k, v[idx])
        break
    idx -= len(v)
Speed measuring:
def ref():
    """Benchmark the value-by-value walk: count values until position idx.

    Reads the module-level dictionary D and its total value count n.
    """
    # randrange(n) keeps idx inside [0, n); randint(0, n) could also yield n.
    idx = random.randrange(n)
    c = 0
    found = False
    for k in D:
        for v in D[k]:
            if c == idx:
                found = True
                # do_something_with_val(k, v)
                break
            c += 1
        if found:
            break
def uut():
    """Benchmark the length-skipping walk: subtract list lengths from idx.

    Reads the module-level dictionary D and its total value count n.
    """
    # randrange(n) keeps idx inside [0, n); randint(0, n) could also yield n.
    idx = random.randrange(n)
    for k, v in D.items():
        if idx < len(v):
            # do_something_with_val(k, v[idx])
            break
        idx -= len(v)
if __name__ == '__main__':
print(timeit.timeit('ref()', setup="from __main__ import ref", number=1000))
print(timeit.timeit('uut()', setup="from __main__ import uut", number=1000))
Results:
1.7672173159990052
0.011254642000494641
I checked the distribution using a small D such as {'key2': [3, 4, 5], 'key1': [0, 1, 2]}, and it looks good to me:
0,166851
1,166141
2,166269
3,167094
4,167130
5,166515

You can build a helper "values" container and use it for the random choice...
import random

# Each key maps to its own list of values.
d = {
    1: [1, 2],
    2: [1, 2, 3, 4],
}

# One (key, value) pair per stored value, so a single random.choice over
# this list is uniform across all values.
values = []
for key, items in d.items():
    values.extend((key, item) for item in items)

k, v = random.choice(values)
print(k, v)
This approach is very fast but takes more memory...
Have fun ;)

Related

How to group list of duplicate continuous value in a list with a recursion function?

I want to group runs of consecutive values that are duplicated elsewhere in the list, with each value belonging to only one group. See my example below:
Note: the results contain indices into test_list, not the values themselves.
test_list = ["1","2","1","2","1","1","5325235","2","62623","1","1"]
--->results = [[[0, 1], [2, 3]],
[[4, 5], [9, 10]]]
test_list = ["1","2","1","1","2","1","5325235","2","62623","1","2","1","236","2388","626236437","1","2","1","236","2388"]
--->results = [[[9, 10, 11, 12, 13], [15, 16, 17, 18, 19]],
[[0, 1, 2], [3, 4, 5]]]
I build a recursive function:
def group_duplicate_continuous_value(list_label_group):
# Recursively look for repeated runs of consecutive labels in list_label_group.
# Returns the dict built by find_index_duplicate / the recursive call, or None.
# NOTE(review): the nesting of the statements below cannot be read from this
# listing (its indentation is missing) - confirm it against the original code.
# Label every adjacent pair (i, j) as "<a - b>_<j>_<i>", where a and b are the
# integers before the first "_" of j and i respectively.
# NOTE(review): each new label embeds both neighbouring labels, so labels get
# longer at every recursion level - presumably the source of the memory growth.
list_flag_grouping = [str(int(j.split("_")[0]) - int(i.split("_")[0])) +f"_{j}_{i}" for i,j in zip(list_label_group,list_label_group[1:])]
# Count the pair labels and keep the ones that occur more than once.
counter_elements = Counter(list_flag_grouping)
list_have_duplicate = [k for k,v in counter_elements.items() if v > 1]
if len(list_have_duplicate) > 0:
# At least one pair label repeats: recurse on the pair labels themselves.
list_final_index = group_duplicate_continuous_value(list_flag_grouping)
# Extend every index list from the recursive call with "last index + 1",
# because a list of pair labels is one element shorter than its input.
for k, v in list_final_index.items():
temp_list = [v[i] + [v[i][-1] + 1] for i in range(0,len(v))]
list_final_index[k] = temp_list
check_first_cursive = list_label_group[0].split("_")
# When the first label contains "_" (presumably a derived pair-label list, not
# the raw input), also add the duplicate groups found at this level whose
# indices do not overlap any index already collected.
if len(check_first_cursive) > 1:
list_temp_index = find_index_duplicate(list_label_group)
list_duplicate_index = list_final_index.values()
list_duplicate_index = [val for sublist in list_duplicate_index for val1 in sublist for val in val1]
for k,v in list_temp_index.items():
list_index_v = [val for sublist in v for val in sublist]
if any(x in list_index_v for x in list_duplicate_index) is False:
list_final_index[k] = v
return list_final_index
else:
# No pair label repeats: fall back to the duplicates of the labels themselves,
# but only when the first label contains "_".
if len(list_label_group) > 0:
check_first_cursive = list_label_group[0].split("_")
if len(check_first_cursive) > 1:
list_final_index = find_index_duplicate(list_label_group)
return list_final_index
list_final_index = None
return list_final_index
Support function:
def find_index_duplicate(list_data):
    """Map each element that occurs more than once to its positions.

    Every position is wrapped in its own one-element list, e.g.
    ["a", "b", "a"] gives {"a": [[0], [2]]}. Elements that occur only once
    are left out.
    """
    dups = defaultdict(list)
    for i, e in enumerate(list_data):
        dups[e].append([i])
    return {key: val for key, val in dups.items() if len(val) > 1}
But when I run it with test_list = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,1,2,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,1,2,5,5,5], it is very slow and runs out of memory (~6 GB). I know that one reason is a stack overflow in my recursive function group_duplicate_continuous_value, but I don't know how to fix it.
You can create a dict of lists, where every item from the original list is a key in the dict, and every key is mapped to the list of its indices in the original list. For instance, your list ["1","3","5","5","7","1","3","5"] would result in the dict {"1": [0, 5], "3": [1, 6], "5": [2, 3, 7], "7": [4]}.
Creating a dict of lists in this way is very idiomatic in python, and fast, too: it can be done by iterating just once on the list.
def build_dict(l):
    """Return a dict mapping each item of l to the list of its indices in l.

    Indices are appended in increasing order during a single pass over l.
    """
    d = {}
    for i, x in enumerate(l):
        d.setdefault(x, []).append(i)
    return d
l = ["1","3","5","5","7","1","3","5"]
d = build_dict(l)
print(d)
# {'1': [0, 5], '3': [1, 6], '5': [2, 3, 7], '7': [4]}
Then you can iterate on the dict to build two lists of indices:
def build_index_results(l):
    """Return (first_indices, second_indices) for the repeated items of l.

    For every item that occurs at least twice, its first index goes into the
    first list and its second index into the second list; any further
    occurrences are ignored.
    """
    d = build_dict(l)
    idx1, idx2 = [], []
    for v in d.values():
        if len(v) > 1:
            idx1.append(v[0])
            idx2.append(v[1])
    return idx1, idx2
print(build_index_results(l))
# ([0, 1, 2], [5, 6, 3])
Or using zip:
from operator import itemgetter
def build_index_results(l):
    """Return [first_indices, second_indices] (as tuples) for repeated items.

    Takes the first two indices of every item that occurs at least twice and
    transposes the resulting pairs with zip. Returns [] when nothing repeats.
    """
    d = build_dict(l)
    repeated = (v for v in d.values() if len(v) > 1)
    return list(zip(*map(itemgetter(0, 1), repeated)))
print(build_index_results(l))
# [(0, 1, 2), (5, 6, 3)]
I can't resist showcasing more_itertools.map_reduce for this:
from more_itertools import map_reduce
from operator import itemgetter
def build_index_results(l):
# Group the positions of l by element: keyfunc picks the element and valuefunc
# picks the index from each enumerate pair. reducefunc keeps the first two
# positions of elements seen more than once and turns the rest into None.
d = map_reduce(enumerate(l),
keyfunc=itemgetter(1),
valuefunc=itemgetter(0),
reducefunc=lambda v: v[:2] if len(v) > 1 else None
)
# Drop the None entries, then transpose the remaining pairs with zip.
return list(zip(*filter(None, d.values())))
print(build_index_results(l))
# [(0, 1, 2), (5, 6, 3)]

how do I multiply integers in a set?

You are given a sequence of positive ints where every element appears three times, except one that appears only once (let's call it x) and one that appears only twice (let's call it y).
Your task is to find x * x * y.
e.g.
arr = [1,1,1,2,2,2,3,3,4] -> 4 x 4 x 3
I have written some code below. I have a question regarding the final part of the code- so after the completion of the loop, there should be one integer left in seen_once and one integer left in seen_twice, but how do I then multiply these numbers, as they are now sitting in a set()?
def Missing_Values(arr):
# Track which values have been seen once, twice and three times so far.
seen_once = set()
seen_twice = set()
seen_thrice = set()
for i in arr:
# NOTE(review): `not in` binds tighter than `or`, so this condition reads
# (i not in seen_once) or seen_twice or seen_thrice - the last two operands
# test whether those sets are non-empty, not whether they contain i.
if i not in seen_once or seen_twice or seen_thrice:
seen_once.add(i)
elif i in seen_once:
seen_twice.add(i)
seen_once.remove(i)
elif i in seen_twice:
seen_thrice.add(i)
seen_twice.remove(i)
# NOTE(review): seen_once and seen_twice are sets and `*` is not defined
# between sets, so this line raises TypeError.
return seen_once*seen_once*seen_twice
Missing_Values(arr)
One way would be to pop the values.
x = seen_once.pop()
y = seen_twice.pop()
return x * x * y
You can use Counter from collections, which performs better and also improves readability.
Following is the code:
from collections import Counter

arr = [1, 1, 1, 2, 2, 2, 3, 3, 4]
d = Counter(arr)

# Multiply in y once (the value seen twice) and x twice (the value seen once);
# values seen three times do not contribute.
ans = 1
for x, cnt in d.items():
    if cnt == 2:
        ans *= x
    elif cnt == 1:
        ans *= x * x
print(ans)
You can also use a generator expression with tuple unpacking, as follows:
from collections import Counter
arr = [1, 1, 1, 2, 4, 4, 3, 3, 4]
d = Counter(arr)
x, y = (x**(3 - cnt) for x, cnt in d.items() if(cnt <= 2))
print(x*y)
Counter Explanation:
Counter returns a dictionary-like object in which each array item is a key and its frequency is the value. For example, if arr = [1, 1, 1, 2, 2, 2, 3, 3, 4], then Counter provides the following mapping:
d = {
1: 3,
2: 3,
3: 2,
4: 1
}
You have a bug in your code, this is a working piece:
def Missing_Values(arr):
    """Return x * x * y, where x occurs once in arr and y occurs twice.

    Every other value is expected to occur three times. Raises StopIteration
    when no value occurs exactly once or exactly twice.
    """
    seen_once = set()
    seen_twice = set()
    seen_thrice = set()
    for i in arr:
        # Each membership test must name i explicitly; "a or b or c" on the
        # bare sets would only test whether they are non-empty.
        if i not in seen_once and i not in seen_twice and i not in seen_thrice:
            seen_once.add(i)
        elif i in seen_once:
            seen_twice.add(i)
            seen_once.remove(i)
        elif i in seen_twice:
            seen_thrice.add(i)
            seen_twice.remove(i)
    x = next(iter(seen_once))
    y = next(iter(seen_twice))
    return x * x * y
arr = [1,1,1,2,2,2,3,3,4]
print(Missing_Values(arr))

Building a custom Counter function without using built-ins

I have this code:
L = [1, 4, 7, 5, 5, 4, 5, 1, 1, 1]
def frequency(L):
    """Return the most frequent item of L (the earliest one on ties).

    Raises IndexError when L is empty, because L[0] is the starting guess.
    """
    counter = 0
    number = L[0]
    for i in L:
        # list.count rescans the whole list for every item.
        amount_times = L.count(i)
        if amount_times > counter:
            counter = amount_times
            number = i
    return number
print(frequency(L))
But I don't want to use counter function. I want to make code run without any built-in functions. How can I do this?
If you really want to reinvent collections.Counter, this is possible with and without list.count. However, I see no rationale.
Using list.count, you can use a dictionary comprehension. This is inefficient because the whole list is traversed once for each distinct value.
def frequency2(L):
    """Return a dict mapping each distinct item of L to its count in L."""
    return {i: L.count(i) for i in set(L)}
If you do not wish to use list.count, this is possible using if / else:
def frequency3(L):
    """Return a dict mapping each item of L to how many times it occurs."""
    d = {}
    for i in L:
        if i in d:
            d[i] += 1
        else:
            # The first occurrence counts as 1; starting at 0 would leave
            # every count one too low.
            d[i] = 1
    return d
Then to extract the highest count(s):
maxval = max(d.values())
res = [k for k, v in d.items() if v == maxval]
You could try this one. Not sure if this one is acceptable to you.
This finds the most frequent item in a list without using built-ins:
L = [1, 4, 7, 5, 5, 4, 5, 1, 1, 1]
def frequency(L):
    """Return the most frequent item of L, or '' when L is empty.

    L is scanned back to front and the running best is replaced on ">=", so
    among items with the same final count the one that completes its count
    last in the backward scan wins.
    """
    count, item = 0, ''
    d = {i: 0 for i in L}
    for i in L[::-1]:
        d[i] = d[i] + 1
        if d[i] >= count:
            count = d[i]
            item = i
    return item
print(frequency(L))
# 1

How to get the keys from value in python?

I'm trying to solve a question which gives the following output:
>>> frequency([13,12,11,13,14,13,7,11,13,14,12,14,14])
ANSWER: ([7], [13, 14])
Basically, it returns the list of values with the LOWEST frequency and the list of values with the HIGHEST frequency.
I'm using the collections.Counter() function, so I got this:
Counter({13: 4, 14: 4, 11: 2, 12: 2, 7: 1})
I extracted key and values and I also got my values sorted in one list. Now I want to get keys which are having least and highest values so that I can generate list from that.
I don't know how to do that.
Not the most pythonic way, but easy to understand for the beginner.
from collections import Counter

L = [13, 12, 11, 13, 14, 13, 7, 11, 13, 14, 12, 14, 14]
answer_min = []
answer_max = []
d = Counter(L)
min_value = min(d.values())
max_value = max(d.values())
# Collect every key whose count equals the smallest / largest count.
for k, v in d.items():
    if v == min_value:
        answer_min.append(k)
    if v == max_value:
        answer_max.append(k)
answer = (answer_min, answer_max)
answer
Gives us ([7], [13, 14]). It looks like you only needed to know about dictionary.items() to solve this.
You can take the minimum and maximum values first, then build the list of keys at those values with list comprehensions:
c = Counter({13: 4, 14: 4, 11: 2, 12: 2, 7: 1})
values = c.values()
mn, mx = min(values), max(values)
mins = [k for k, v in c.items() if v == mn]
maxs = [k for k, v in c.items() if v == mx]
print (mins, maxs)
# ([7], [13, 14])
You can try this:
import collections

s = [13, 12, 11, 13, 14, 13, 7, 11, 13, 14, 12, 14, 14]
count = collections.Counter(s)
# Compute the extreme counts once instead of once per item inside the
# comprehensions.
lowest = min(count.values())
highest = max(count.values())
mins = [a for a, b in count.items() if b == lowest]
maxes = [a for a, b in count.items() if b == highest]
final_vals = [mins, maxes]
Output:
[[7], [13, 14]]

How to optimize this Python code?

def maxVote(nLabels):
    """Return the most frequent label in nLabels, breaking ties at random.

    Raises IndexError (from random.choice) when nLabels is empty.
    """
    count = {}
    maxList = []
    maxCount = 0
    for nLabel in nLabels:
        count[nLabel] = count.get(nLabel, 0) + 1
        # Keep maxList equal to the labels whose running count is the maximum.
        if count[nLabel] > maxCount:
            maxCount = count[nLabel]
            maxList = [nLabel]
        elif count[nLabel] == maxCount:
            maxList.append(nLabel)
    return random.choice(maxList)
nLabels contains a list of integers.
The above function returns the integer with highest frequency, if more than one have same frequency then a randomly selected integer from them is returned.
E.g. maxVote([1,3,4,5,5,5,3,12,11]) is 5
import random
import collections
def maxvote(nlabels):
    """Return the most frequent item of nlabels, breaking ties at random.

    Raises ValueError (from max) when nlabels is empty.
    """
    cnt = collections.defaultdict(int)
    for i in nlabels:
        cnt[i] += 1
    # values()/items() exist on both Python 2 and 3, unlike itervalues/iteritems.
    maxv = max(cnt.values())
    return random.choice([k for k, v in cnt.items() if v == maxv])
print maxvote([1,3,4,5,5,5,3,3,11])
In Python 3.1 or future 2.7 you'd be able to use Counter:
>>> from collections import Counter
>>> Counter([1,3,4,5,5,5,3,12,11]).most_common(1)
[(5, 3)]
If you don't have access to those versions of Python you could do:
>>> from collections import defaultdict
>>> d = defaultdict(int)
>>> for i in nLabels:
d[i] += 1
>>> max(d, key=lambda x: d[x])
5
It appears to run in O(n) time. The membership test `nLabel in count` would be a bottleneck if count were a list, since each lookup could then take O(n) time and make the total O(n^2); because count is a dictionary, each lookup is O(1) on average.
Using a dictionary instead of a list in this case is the only major efficiency boost I can spot.
I'm not sure what exactly you want to optimize, but this should work:
from collections import defaultdict
def maxVote(nLabels):
    """Return the most frequent label in nLabels, breaking ties at random.

    Raises ValueError (from max) when nLabels is empty.
    """
    count = defaultdict(int)
    for nLabel in nLabels:
        count[nLabel] += 1
    # values() exists on both Python 2 and 3, unlike itervalues().
    maxCount = max(count.values())
    maxList = [k for k in count if count[k] == maxCount]
    return random.choice(maxList)
Idea 1
Does the return really need to be random, or can you just return a maximum? If you just need to nondeterministically return a max frequency, you could just store a single label and remove the list logic, including
elif count[nLabel]==maxCount:
maxList.append(nLabel)
Idea 2
If this method is called frequently, would it be possible to only work on new data, as opposed to the entire data set? You could cache your count map and then only process new data. Assuming your data set is large and the calculations are done online, this could net huge improvements.
Complete example:
#!/usr/bin/env python
def max_vote(l):
    """
    Return the element with the (or a) maximum frequency in ``l``.

    Raises IndexError when ``l`` is empty (pop from an empty list).
    """
    # One (element, count) pair per distinct element; l.count rescans the
    # list for each of them.
    unsorted = [(a, l.count(a)) for a in set(l)]
    # Sort by count in ascending order and take the element of the last pair.
    return sorted(unsorted, key=lambda x: x[1]).pop()[0]
if __name__ == '__main__':
votes = [1, 3, 4, 5, 5, 5, 3, 12, 11]
print max_vote(votes)
# => 5
Benchmarks:
#!/usr/bin/env python
import random
import collections
def max_vote_2(l):
    """
    Return the element with the (or a) maximum frequency in ``l``.

    Raises IndexError when ``l`` is empty (pop from an empty list).
    """
    # One (element, count) pair per distinct element; l.count rescans the
    # list for each of them.
    unsorted = [(a, l.count(a)) for a in set(l)]
    # Sort by count in ascending order and take the element of the last pair.
    return sorted(unsorted, key=lambda x: x[1]).pop()[0]
def max_vote_1(nlabels):
    """Return the most frequent item of nlabels, breaking ties at random.

    Raises ValueError (from max) when nlabels is empty.
    """
    cnt = collections.defaultdict(int)
    for i in nlabels:
        cnt[i] += 1
    # values()/items() exist on both Python 2 and 3, unlike itervalues/iteritems.
    maxv = max(cnt.values())
    return random.choice([k for k, v in cnt.items() if v == maxv])
if __name__ == '__main__':
from timeit import Timer
votes = [1, 3, 4, 5, 5, 5, 3, 12, 11]
print max_vote_1(votes)
print max_vote_2(votes)
t = Timer("votes = [1, 3, 4, 5, 5, 5, 3, 12, 11]; max_vote_2(votes)", \
"from __main__ import max_vote_2")
print 'max_vote_2', t.timeit(number=100000)
t = Timer("votes = [1, 3, 4, 5, 5, 5, 3, 12, 11]; max_vote_1(votes)", \
"from __main__ import max_vote_1")
print 'max_vote_1', t.timeit(number=100000)
Yields:
5
5
max_vote_2 1.79455208778
max_vote_1 2.31705093384

Categories