I'm trying to solve a question which gives the following output:
>>> frequency([13,12,11,13,14,13,7,11,13,14,12,14,14])
ANSWER: ([7], [13, 14])
Basically it returns a tuple of two lists: the elements with the LOWEST frequency and the elements with the HIGHEST frequency.
I'm using the collections.Counter() class, so I got this:
Counter({13: 4, 14: 4, 11: 2, 12: 2, 7: 1})
I extracted key and values and I also got my values sorted in one list. Now I want to get keys which are having least and highest values so that I can generate list from that.
I don't know how to do that.
Not the most pythonic way, but easy to understand for the beginner.
from collections import Counter

# Tally how often each element occurs.
L = [13, 12, 11, 13, 14, 13, 7, 11, 13, 14, 12, 14, 14]
d = Counter(L)

# The smallest and the largest occurrence counts.
min_value, max_value = min(d.values()), max(d.values())

# Collect the keys that sit at either extreme.
answer_min = []
answer_max = []
for key, count in d.items():
    if count == min_value:
        answer_min.append(key)
    if count == max_value:
        answer_max.append(key)

answer = (answer_min, answer_max)
answer
Gives us ([7], [13, 14]). It looks like you only needed to know about dictionary.items() to solve this.
You can take the minimum and maximum values first, then build the list of keys at those values with list comprehensions:
# Take the minimum and maximum counts once, then gather the matching keys.
c = Counter({13: 4, 14: 4, 11: 2, 12: 2, 7: 1})
values = c.values()
mn = min(values)
mx = max(values)
mins, maxs = [], []
for key, cnt in c.items():
    if cnt == mn:
        mins.append(key)
    if cnt == mx:
        maxs.append(key)
print(mins, maxs)
# ([7], [13, 14])
You can try this:
import collections

s = [13,12,11,13,14,13,7,11,13,14,12,14,14]
count = collections.Counter(s)
# Hoist min()/max() out of the comprehensions: the original recomputed them
# for every element, turning an O(n) pass into O(n^2).
lo = min(count.values())
hi = max(count.values())
mins = [a for a, b in count.items() if b == lo]
maxes = [a for a, b in count.items() if b == hi]
final_vals = [mins, maxes]
Output:
[[7], [13, 14]]
Related
I want to group consecutive values when they are duplicated runs, with each value belonging to exactly one group. See my example below:
Note: results is an index of the value in test_list
test_list = ["1","2","1","2","1","1","5325235","2","62623","1","1"]
--->results = [[[0, 1], [2, 3]],
[[4, 5], [9, 10]]]
test_list = ["1","2","1","1","2","1","5325235","2","62623","1","2","1","236","2388","626236437","1","2","1","236","2388"]
--->results = [[[9, 10, 11, 12, 13], [15, 16, 17, 18, 19]],
[[0, 1, 2], [3, 4, 5]]]
I build a recursive function:
def group_duplicate_continuous_value(list_label_group):
    """Recursively find groups of indices where consecutive runs repeat.

    Each recursion level encodes adjacent pairs as "<diff>_<next>_<prev>"
    strings, so identical consecutive transitions get identical flags.
    Returns a dict mapping a flag to lists of index groups, or None.
    """
    # Detect which consecutive transitions repeat: encode each adjacent pair
    # as the numeric difference of their leading fields plus both labels.
    list_flag_grouping = [str(int(j.split("_")[0]) - int(i.split("_")[0])) +f"_{j}_{i}" for i,j in zip(list_label_group,list_label_group[1:])]
    # Find the flags that occur more than once in list_flag_grouping.
    counter_elements = Counter(list_flag_grouping)
    list_have_duplicate = [k for k,v in counter_elements.items() if v > 1]
    if len(list_have_duplicate) > 0:
        # NOTE(review): each level re-encodes the whole list and recurses, so
        # long uniform runs (e.g. many repeated 5s) cause the deep recursion
        # and memory blow-up described by the question.
        list_final_index = group_duplicate_continuous_value(list_flag_grouping)
        # Map flag positions back to the original list: extend every index
        # group by the index that follows its last member.
        for k, v in list_final_index.items():
            temp_list = [v[i] + [v[i][-1] + 1] for i in range(0,len(v))]
            list_final_index[k] = temp_list
        check_first_cursive = list_label_group[0].split("_")
        # When duplicated runs have different lengths, also merge the plain
        # duplicates of this level that the recursion did not already cover.
        if len(check_first_cursive) > 1:
            list_temp_index = find_index_duplicate(list_label_group)
            list_duplicate_index = list_final_index.values()
            # Flatten the nested index lists for the overlap test below.
            list_duplicate_index = [val for sublist in list_duplicate_index for val1 in sublist for val in val1]
            for k,v in list_temp_index.items():
                list_index_v = [val for sublist in v for val in sublist]
                # Keep only groups that do not overlap the recursive result.
                if any(x in list_index_v for x in list_duplicate_index) is False:
                    list_final_index[k] = v
        return list_final_index
    else:
        # Base case: no transition repeats at this level.
        if len(list_label_group) > 0:
            check_first_cursive = list_label_group[0].split("_")
            if len(check_first_cursive) > 1:
                # Still at an encoded (recursive) level: report plain duplicates.
                list_final_index = find_index_duplicate(list_label_group)
                return list_final_index
        list_final_index = None
        return list_final_index
Support function:
def find_index_duplicate(list_data):
    """Map every duplicated value of list_data to its singleton-wrapped indices.

    Each index is wrapped in its own one-element list (the caller extends
    these lists), and values occurring only once are dropped.
    """
    dups = defaultdict(list)
    for position, element in enumerate(list_data):
        dups[element].append([position])
    # Keep only the entries seen more than once.
    return {key: val for key, val in dups.items() if len(val) > 1}
But when I run it with test_list = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,1,2,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,1,2,5,5,5], it is very slow and runs out of memory (~6 GB). I know the cause is the deep recursion in group_duplicate_continuous_value, but I don't know how to fix it.
You can create a dict of lists, where every item from the original list is a key in the dict, and every key is mapped to the list of its indices in the original list. For instance, your list ["1","3","5","5","7","1","3","5"] would result in the dict {"1": [0, 5], "3": [1, 6], "5": [2, 3, 7], "7": [4]}.
Creating a dict of lists in this way is very idiomatic in python, and fast, too: it can be done by iterating just once on the list.
def build_dict(l):
    """Map each item of l to the list of its indices, in a single pass."""
    indices = {}
    for position, item in enumerate(l):
        if item in indices:
            indices[item].append(position)
        else:
            indices[item] = [position]
    return indices
# Demo: every value mapped to all of its positions.
l = ["1", "3", "5", "5", "7", "1", "3", "5"]
d = build_dict(l)
print(d)  # {'1': [0, 5], '3': [1, 6], '5': [2, 3, 7], '7': [4]}
Then you can iterate on the dict to build two lists of indices:
def build_index_results(l):
    """Return two parallel lists: first and second index of each duplicate."""
    first_hits, second_hits = [], []
    for positions in build_dict(l).values():
        if len(positions) > 1:
            first_hits.append(positions[0])
            second_hits.append(positions[1])
    return first_hits, second_hits

print(build_index_results(l))
# ([0, 1, 2], [5, 6, 3])
Or using zip:
from operator import itemgetter

def build_index_results(l):
    """Same result via zip: transpose (first, second) index pairs of duplicates."""
    first_two = itemgetter(0, 1)
    duplicated = (positions for positions in build_dict(l).values() if len(positions) > 1)
    return list(zip(*map(first_two, duplicated)))

print(build_index_results(l))
# [(0, 1, 2), (5, 6, 3)]
I can't resist showcasing more_itertools.map_reduce for this:
from more_itertools import map_reduce
from operator import itemgetter

def build_index_results(l):
    """Return the first two indices of every duplicated item, transposed."""
    # Group indices by value: keyfunc picks the item, valuefunc its index;
    # reducefunc keeps the first two indices when the item is duplicated,
    # and maps unique items to None.
    d = map_reduce(enumerate(l),
                   keyfunc=itemgetter(1),
                   valuefunc=itemgetter(0),
                   reducefunc=lambda v: v[:2] if len(v) > 1 else None
                   )
    # filter(None, ...) discards the None entries (unique items) before zip
    # transposes the remaining (first, second) pairs.
    return list(zip(*filter(None, d.values())))

print(build_index_results(l))
# [(0, 1, 2), (5, 6, 3)]
I am seeking a way to improve performance of my code: Given two dictionaries I need to find the keys of matching value pairs. So far I am iterating over both dictionaries, which will be very slow when both have up to 100000 key-value-pairs.
Given:
keys of both dictionaries are always numeric and sorted ascending
keys of both dictionaries refer to features of a QGIS layer I need to work with, so I really need to keep them this way
values of both dictionaries can have any datatype but both always do have the same datatype
values of both dictionaries are randomly filled
values can contain duplicates which may not be removed
Does anyone have a brilliant idea how I could improve the performance? Note that also a "no, absolutely not possible" is an acceptable answer, if well founded, so I can finally stop trying and searching.
dict_a = {1:'abc',2:'def',3:'abc',4:'ghj',5:'klm',6:'nop',7:'def',8:'abc',9:'xyz',10:'abc'}
dict_b = {1:'abc',2:'a',3:'b',4:'xyz',5:'abc',6:'b',7:'c',8:'def',9:'d',10:'e'}
# imagine both dictionaries have up to 100000 entries...
# NOTE(review): a dict literal cannot hold duplicate keys — the repeated
# keys below (1, 3, 8, 10) overwrite earlier entries, so this "desired
# output" is not representable as a plain dict; a dict of lists is needed.
desired_matching_dict = {1:1,1:5,2:8,3:1,3:5,7:8,8:1,8:5,9:4,10:1,10:5} # example of my desired output
matching_dict_slow = {}
matching_dict_fast = {}
# This will be very slow when having huge dictionaries...
# O(len(a) * len(b)) nested scan; each key of a keeps only the LAST match.
for key_a, value_a in dict_a.items():
    for key_b, value_b in dict_b.items():
        if value_a == value_b:
            matching_dict_slow[key_a] = key_b
# Seeking an attempt to speed this up
# But getting lost...
# NOTE(review): this attempt is broken — `value in dict_b.items()` tests the
# value against (key, value) TUPLES, which is never true for a bare value,
# so the body never runs.
for key, value in dict_a.items():
    if value in dict_b.items():
        if dict_a[key] == dict_b[key]:
            matching_dict_fast[key]=dict_a[key]
print('Slow method works: ' + str(desired_matching_dict == matching_dict_slow))
print('Fast method works: ' + str(desired_matching_dict == matching_dict_fast))
From the competitive programming uses I've generally faced, this simple approach should work fine:
dict_a = {1:'abc',2:'def',3:'abc',4:'ghj',5:'klm',6:'nop',7:'def',8:'abc',9:'xyz',10:'abc'}
dict_b = {1:'abc',2:'a',3:'b',4:'xyz',5:'abc',6:'b',7:'c',8:'def',9:'d',10:'e'}

# Invert dict_b once: value -> list of the dict_b keys holding that value.
dic2 = {}
for key_b, shared_value in dict_b.items():
    dic2.setdefault(shared_value, []).append(key_b)

# One linear pass over dict_a: look each value up in the inverted index.
matches = {}
for key_a, shared_value in dict_a.items():
    if shared_value in dic2:
        matches[key_a] = dic2[shared_value]

print(matches) #prints {1: [1, 5], 2: [8], 3: [1, 5], 7: [8], 8: [1, 5], 9: [4], 10: [1, 5]}
You can then access your features like:
# Print every (dict_a key, dict_b key) pair. The original computed a dead
# `l = len(v) - 1`, reused `l` as the loop variable without using it, and
# kept a manual index `i`; iterating the values directly is equivalent.
for k, v in matches.items():
    for matched_key in v:
        print('desired pair: ' + 'key (dict_a feature) = ' + str(k) + ' | value(dict_b feature) = ' + str(matched_key))
def dict_gen(a, b):
    """Yield [key_of_a, list_of_b_keys] for every value of `a` found in `b`."""
    for key_a in a:
        matching_keys = [key_b for key_b in b if a[key_a] == b[key_b]]
        if matching_keys:
            yield [key_a, matching_keys]
# Materialise the generator's [key, matches] pairs into a dict.
d = {key: found for key, found in dict_gen(dict_a, dict_b)}
print(d)
Output:
{1: [1, 5], 2: [8], 3: [1, 5], 7: [8], 8: [1, 5], 9: [4], 10: [1, 5]}
[Finished in 0.1s]
I have this code:
L = [1, 4, 7, 5, 5, 4, 5, 1, 1, 1]

def frequency(L):
    """Return the most frequent element of L.

    Ties break to the element that first reaches the maximum count in
    list order, exactly like the original. Counts are gathered in a
    single pass instead of calling L.count() per element, which made
    the original O(n^2).
    Raises IndexError on an empty list (unchanged behavior).
    """
    counts = {}
    for i in L:
        counts[i] = counts.get(i, 0) + 1
    counter = 0
    number = L[0]
    # Same selection order as the original, so tie-breaking is identical.
    for i in L:
        if counts[i] > counter:
            counter = counts[i]
            number = i
    return number

print(frequency(L))
But I don't want to use counter function. I want to make code run without any built-in functions. How can I do this?
If you really want to reinvent collections.Counter, this is possible with and without list.count. However, I see no rationale.
Using list.count, you can use a dictionary comprehension. This is inefficient, as the list is scanned once for each distinct element.
def frequency2(L):
    """Count occurrences of each distinct element via list.count (O(n^2))."""
    counts = {}
    for distinct in set(L):
        counts[distinct] = L.count(distinct)
    return counts
If you do not wish to use list.count, this is possible using if / else:
def frequency3(L):
    """Count occurrences of each element of L without using list.count.

    Bug fix: the original initialised a new key to 0, so the first
    occurrence was never counted and every count came out one too low;
    a first sighting must be recorded as 1.
    """
    d = {}
    for i in L:
        if i in d:
            d[i] += 1
        else:
            d[i] = 1
    return d
Then to extract the highest count(s):
# All keys whose count equals the largest count in d.
maxval = max(d.values())
res = [key for key, cnt in d.items() if cnt == maxval]
You could try this one. Not sure if this one is acceptable to you.
This finds the most frequent item in a list without using built-ins:
L = [1, 4, 7, 5, 5, 4, 5, 1, 1, 1]

def frequency(L):
    """Return the most frequent element; ties go to the earliest element in L."""
    best_count = 0
    best_item = ''
    tally = {value: 0 for value in L}
    # Walk the list back to front: with >=, the element counted last
    # (i.e. earliest in the original order) wins ties.
    for value in reversed(L):
        tally[value] += 1
        if tally[value] >= best_count:
            best_count = tally[value]
            best_item = value
    return best_item

print(frequency(L))
# 1
I have a python dictionary in which values are lists of integers:
key1 -> [1, 2, 3]
key2 -> [1, 2, 3, ... 17]
key3 -> [1, 2, 3, 4, 5]
I want to select a random tuple(key, val) where val is a random value from the list of values (for example: key2, 8). The random selection must be uniform across all the values, so for example, this method is not uniform:
random_key = random.choice(d.keys())
random_val = random.choice(d[random_key])
because the lists are not of the same length.
I know the length of the concatenation of the lists, n, so my current approach is the following:
# Pick a uniformly random flat index across all values of D, then walk the
# dict until the idx-th value is reached — an O(n) scan per draw.
idx = np.random.randint(n)  # n is the total number of values across all lists of D
c = 0
found = False
for k in D:
    for v in D[k]:
        if c == idx:
            found = True
            do_something_with_val(k, v);
            break
        c += 1
    if found:
        break
My question is: is there a better/faster method of doing this?
You can try (in Python 3—for Python 2, use iteritems()):
# Pick a flat index, then skip whole lists by subtracting their lengths.
# Bug fix: random.randint(0, n) INCLUDES n, so idx could equal n and fall
# past every list without selecting anything (and skew the distribution);
# random.randrange(n) yields 0..n-1.
idx = random.randrange(n)
for k, v in D.items():
    if idx < len(v):
        do_something_with_val(k, v[idx])
        break
    else:
        idx -= len(v)
Speed measuring:
def ref():
    """Reference implementation: linear walk over every (key, value) pair.

    Bug fix: random.randint(0, n) includes n itself, an index that matches
    no element; random.randrange(n) gives a valid flat index 0..n-1.
    Reads globals D (dict of lists) and n (total number of values).
    """
    idx = random.randrange(n)
    c = 0
    found = False
    for k in D:
        for v in D[k]:
            if c == idx:
                found = True
                # do_something_with_val(k, v);
                break
            c += 1
        if found:
            break
def uut():
    """Faster variant: skip whole lists by subtracting their lengths.

    Bug fix: random.randint(0, n) includes n (selects nothing when drawn);
    random.randrange(n) gives a valid flat index 0..n-1.
    Reads globals D (dict of lists) and n (total number of values).
    """
    idx = random.randrange(n)
    for k, v in D.items():
        if idx < len(v):
            # do_something_with_val(k, v[idx])
            break
        else:
            idx -= len(v)
if __name__ == '__main__':
    # Time both implementations over 1000 draws each.
    for fn_name in ('ref', 'uut'):
        print(timeit.timeit(f'{fn_name}()', setup=f'from __main__ import {fn_name}', number=1000))
Results:
1.7672173159990052
0.011254642000494641
I checked distribution using small D like {'key2': [3, 4, 5], 'key1': [0, 1, 2]} and the distribution looks good for me:
0,166851
1,166141
2,166269
3,167094
4,167130
5,166515
You can build an helper "values" container and use it for the random choice...
import random

d = {1: [1, 2],
     2: [1, 2, 3, 4]}

# Flatten the dict into (key, value) pairs so a single uniform choice suffices.
values = []
for key, value_list in d.items():
    for item in value_list:
        values.append((key, item))

k, v = random.choice(values)
print (k, v)
This approach is very fast but takes more memory...
Have fun ;)
def maxVote(nLabels):
    """Return the most frequent label; ties are broken by a random choice."""
    count = {}
    maxList = []
    maxCount = 0
    for nLabel in nLabels:
        count[nLabel] = count.get(nLabel, 0) + 1
        current = count[nLabel]
        # Track the running winners: reset on a new maximum, extend on a tie.
        if current > maxCount:
            maxCount = current
            maxList = [nLabel]
        elif current == maxCount:
            maxList.append(nLabel)
    return random.choice(maxList)
nLabels contains a list of integers.
The above function returns the integer with highest frequency, if more than one have same frequency then a randomly selected integer from them is returned.
E.g. maxVote([1,3,4,5,5,5,3,12,11]) is 5
import random
import collections

def maxvote(nlabels):
    """Return a random label among those with the highest frequency.

    Ported to Python 3: itervalues()/iteritems() were removed, and print
    is a function.
    """
    cnt = collections.defaultdict(int)
    for i in nlabels:
        cnt[i] += 1
    maxv = max(cnt.values())
    return random.choice([k for k, v in cnt.items() if v == maxv])

print(maxvote([1,3,4,5,5,5,3,3,11]))
In Python 3.1 or future 2.7 you'd be able to use Counter:
>>> from collections import Counter
>>> Counter([1,3,4,5,5,5,3,12,11]).most_common(1)
[(5, 3)]
If you don't have access to those versions of Python you could do:
>>> from collections import defaultdict
>>> d = defaultdict(int)
>>> for i in nLabels:
d[i] += 1
>>> max(d, key=lambda x: d[x])
5
It runs in O(n) expected time: checking `nLabel in count` is an average O(1) dictionary lookup (only pathological hash collisions degrade it), so the whole pass stays linear. Had a list been used for the membership test instead, each check would be O(n) and the total O(n^2).
Using a dictionary instead of a list in this case is the only major efficiency boost I can spot.
I'm not sure what exactly you want to optimize, but this should work:
from collections import defaultdict
def maxVote(nLabels):
count = defaultdict(int)
for nLabel in nLabels:
count[nLabel] += 1
maxCount = max(count.itervalues())
maxList = [k for k in count if count[k] == maxCount]
return random.choice(maxList)
Idea 1
Does the return really need to be random, or can you just return a maximum? If you just need to nondeterministically return a max frequency, you could just store a single label and remove the list logic, including
elif count[nLabel]==maxCount:
maxList.append(nLabel)
Idea 2
If this method is called frequently, would it be possible to only work on new data, as opposed to the entire data set? You could cache your count map and then only process new data. Assuming your data set is large and the calculations are done online, this could net huge improvements.
Complete example:
#!/usr/bin/env python
def max_vote(l):
    """
    Return the element with the (or a) maximum frequency in ``l``.
    """
    # Count each distinct element once, sort ascending by count,
    # and take the element of the last (largest) pair.
    unsorted = [(a, l.count(a)) for a in set(l)]
    return sorted(unsorted, key=lambda x: x[1]).pop()[0]

if __name__ == '__main__':
    votes = [1, 3, 4, 5, 5, 5, 3, 12, 11]
    # Python 3: print is a function (the original used the py2 statement).
    print(max_vote(votes))
    # => 5
# => 5
Benchmarks:
#!/usr/bin/env python
import random
import collections
def max_vote_2(l):
    """
    Return the element with the (or a) maximum frequency in ``l``.
    """
    # Pair each distinct element with its count, order by count ascending,
    # and pop the (element, count) pair with the largest count.
    pairs = [(element, l.count(element)) for element in set(l)]
    pairs.sort(key=lambda pair: pair[1])
    return pairs.pop()[0]
def max_vote_1(nlabels):
    """Return a random label among those with the maximum frequency.

    Ported to Python 3: itervalues()/iteritems() were removed in favor
    of values()/items().
    """
    cnt = collections.defaultdict(int)
    for i in nlabels:
        cnt[i] += 1
    maxv = max(cnt.values())
    return random.choice([k for k, v in cnt.items() if v == maxv])
if __name__ == '__main__':
    from timeit import Timer

    # Sanity check both implementations, then benchmark 100k calls each.
    # Python 3: print is a function (the original used py2 print statements).
    votes = [1, 3, 4, 5, 5, 5, 3, 12, 11]
    print(max_vote_1(votes))
    print(max_vote_2(votes))

    t = Timer("votes = [1, 3, 4, 5, 5, 5, 3, 12, 11]; max_vote_2(votes)", \
              "from __main__ import max_vote_2")
    print('max_vote_2', t.timeit(number=100000))

    t = Timer("votes = [1, 3, 4, 5, 5, 5, 3, 12, 11]; max_vote_1(votes)", \
              "from __main__ import max_vote_1")
    print('max_vote_1', t.timeit(number=100000))
Yields:
5
5
max_vote_2 1.79455208778
max_vote_1 2.31705093384