I created a dictionary myDict holding 10 million entries in the following form. Each entry in the dictionary represent {(id, age): code}
>>> myDict = {('1039', '68.0864'): '42731,42781,V4501',
('1039', '68.1704'): '4770,4778,V071',
('0845', '60.4476'): '2724,27800,4019',
('0983', '63.3936'): '41401,4168,4240,V1582,V7281'
}
A constant ageOffset is defined with value = 0.1
Given an (id,age) tuple, how can I fetch all values from myDict which have key (id, X) where:
age <= X <= age+ageOffset
I need to perform this fetch operation 20 billion times.
Examples:
1.
myTup = ('1039', '68.0')
the answer is: '42731,42781,V4501'
2.
myTup = ('0845', '60.0')
Ans : No value returned
Edit:
Can I create a sub-dictionary on the basis of a partial match on the first element of the key? I mean: if the first element of the tuple key matches, put the entry into a sub-dictionary for that id. According to my data, no sub-dictionary would hold more than a couple of hundred entries. I could then perform a linear range search, comparing the second element of the tuple key and collecting the corresponding values.
To do this operation 20 billion(!) times, you will have to preprocess your data a bit.
First, I would group by id:
def preprocess(data):
    """Group ``{(id, age): value}`` entries by id for fast range lookups.

    Args:
        data: Mapping whose keys are ``(id, age)`` tuples (``age`` may be
            anything ``float()`` accepts) and whose values are the payloads.

    Returns:
        A ``defaultdict(list)`` mapping each id to a list of
        ``(float(age), value)`` tuples sorted in ascending order, so the
        lists can be binary-searched.
    """
    from collections import defaultdict  # Python 2.5+ only

    preprocessed = defaultdict(list)
    # group by id
    # ``items()`` exists on Python 2 and 3; ``iteritems()`` is Python 2 only.
    for (id_, age), value in data.items():
        preprocessed[id_].append((float(age), value))
    # sort lists for binary search
    for entries in preprocessed.values():
        entries.sort()
    return preprocessed
Result should look like this:
>>> preprocess(myDict)
defaultdict(<type 'list'>, {
'0845': [(60.4476, '2724,27800,4019')],
'0983': [(63.3936, '41401,4168,4240,V1582,V7281')],
'1039': [(68.0864, '42731,42781,V4501'), (68.1704, '4770,4778,V071')]}
If relatively few items share the same id, thus resulting in short lists, you might get away with filtering the list.
def lookup(data, id, age, age_offset=0.1):
    """Return the values stored under ``id`` with age in [age, age + age_offset].

    ``data`` maps an id to a list of ``(age, value)`` pairs.  Every pair of the
    id is inspected.  Returns ``None`` when ``id`` is not a key of ``data``.
    """
    if id not in data:
        return None
    matches = []
    for entry_age, value in data[id]:
        if age <= entry_age <= age + age_offset:
            matches.append(value)
    return matches
lookup(preprocessed, '1039', 68.0) # Note that I use floats for age
['42731,42781,V4501']
However, if many items share the same id, you will have to traverse long lists, making the lookup relatively slow. In this case, you will have to apply further optimizations.
Edit: as suggested by #Andrey Petrov
from bisect import bisect_left
from itertools import islice, takewhile
def optimized_lookup(data, id, age, age_offset=0.1):
    """Return the values stored under ``id`` with age in [age, age + age_offset].

    Args:
        data: Mapping of id to a list of ``(age, value)`` tuples sorted
            ascending (as produced by grouping and sorting per id).
        id: Key to look up in ``data``.
        age: Lower bound of the age window (inclusive).
        age_offset: Width of the window; the upper bound is inclusive.

    Returns:
        List of matching values in ascending age order, or ``None`` when
        ``id`` is not a key of ``data``.
    """
    if id not in data:
        return None
    entries = data[id]
    upper = age + age_offset
    # The list holds tuples, so search with a 1-tuple: (age,) sorts before
    # every (age, value), which yields the first index whose age is >= age.
    # (Searching with the bare float compares a tuple with a number.)
    start = bisect_left(entries, (age,))
    in_window = takewhile(lambda entry: entry[0] <= upper,
                          islice(entries, start, None))
    # Return the stored values (not the ages), matching the plain lookup.
    return [value for _, value in in_window]
Here's a way to do it in numpy, and though I haven't tested it I'm pretty confident it will be vastly faster than looping over the dictionary. I replaced the dictionary structure with a Numpy record array, and used np.where to locate the rows where they match the parameters you gave.
import numpy as np
# Sample data: {(id, age): code}
myDict = {('1039', '68.0864'): '42731,42781,V4501',
          ('1039', '68.1704'): '4770,4778,V071',
          ('0845', '60.4476'): '2724,27800,4019',
          ('0983', '63.3936'): '41401,4168,4240,V1582,V7281'
          }
# One [id, age-as-float, code] row per dictionary entry.
# ``items()`` works on Python 2 and 3; ``iteritems()`` is Python 2 only.
records = [[k[0], float(k[1]), v] for k, v in myDict.items()]
# Text columns use the unicode dtype ('U') so that comparing the ID column
# with an ordinary string also works on Python 3, where 'S' columns hold bytes.
myArr = np.rec.fromrecords(records, formats='U10, f4, U100',
                           names="ID, Age, Code")
def findInMyArray(arr, requestedID, requestedAge, tolerance=0.1):
    """Locate rows of ``arr`` for ``requestedID`` inside the age window.

    Args:
        arr: Structured/record array with at least the fields ``"ID"`` and
            ``"Age"``.
        requestedID: Value the ``"ID"`` field must equal.
        requestedAge: Lower bound of the age window (inclusive).
        tolerance: Width of the window; the upper bound is inclusive.

    Returns:
        The tuple produced by ``np.where`` (row indices in element 0).
    """
    age_delta = arr["Age"] - requestedAge
    # Both bounds are needed: with only the upper bound every row whose age is
    # below the requested age would match as well.
    in_window = (age_delta >= 0) & (age_delta <= tolerance)
    idx = np.where(in_window & (arr["ID"] == requestedID))
    return idx
# Look up the rows for id "1039" whose age lies within 0.1 above 68.0.
idx = findInMyArray(myArr, "1039", 68.0, tolerance=0.1)
# A single pre-formatted argument makes print() emit the same text on
# Python 2 and Python 3 (the bare print statement is Python 2 only).
print("The index found is:  %s" % (idx,))
print("The values are:  %s" % (myArr["Code"][idx[0]],))
def getr(t):
    """Return the codes for ``t``'s id whose age lies in [age, age + 0.1].

    Args:
        t: ``(id, age)`` pair; both items are converted with ``float()``.

    Returns:
        List of codes read from the module-level ``fixed`` mapping
        (``{float(id): {float(age): code}}``).  The list is empty when no
        age falls inside the window or when the id is not in ``fixed``.
    """
    key = float(t[0])
    age = float(t[1])
    offset = 0.1
    # An unknown id simply has no matches instead of raising KeyError.
    by_age = fixed.get(key, {})
    # The window is inclusive on both ends: age <= X <= age + offset.
    return [code for k, code in by_age.items() if age <= k <= age + offset]
# Sample data: {(id, age): codes}
ct = {
    ('1039', '68.0864'): '42731,42781,V4501',
    ('1039', '68.1704'): '4770,4778,V071',
    ('0845', '60.4476'): '2724,27800,4019',
    ('0983', '63.3936'): '41401,4168,4240,V1582,V7281',
}
# Re-key the data as {float(id): {float(age): codes}} so all ages of one id
# can be scanned together.
fixed = {}
for k in ct:
    fixed.setdefault(float(k[0]), {})[float(k[1])] = ct[k]
# Example 1: one entry of id 1039 falls inside the window.
# print("1") is valid on Python 2 and 3; the bare print statement is not.
print("1")
myTup = ('1039', '68.0')
assert getr(myTup) == ['42731,42781,V4501']
#the answer is: '42731,42781,V4501'
# Example 2: id 0845 has no entry inside the window.
print("2")
myTup = ('0845', '60.0')
assert getr(myTup) == []
#Ans : No value returned
Related
Background
We have a family tradition where my and my siblings' Christmas presents are identified by a code that can be solved using only numbers related to us. For example, the code could be birth month * age + graduation year (This is a simple one). If the numbers were 8 * 22 + 2020 = 2196, the number 2196 would be written on all my Christmas presents.
I've already created a Python class that solves the code with certain constraints, but I'm wondering if it's possible to do it recursively.
Current Code
The first function returns a result set for all possible combinations of numbers and operations that produce a value in target_values
#Master algorithm (Get the result set of all combinations of numbers and cartesian products of operations that reach a target_value, using only the number_of_numbers_in_solution)
#Example: sibling1.results[1] = [(3, 22, 4), (<built-in function add>, <built-in function add>), 29]. This means that 3 + 22 + 4 = 29, and 29 is in target_values
import operator
from itertools import product
from itertools import combinations
# Number of binary operators applied in one candidate formula.
NUMBER_OF_OPERATIONS_IN_SOLUTION = 2 #Total numbers involved is this plus 1
# Number of operands in one candidate formula (one more than the operators).
NUMBER_OF_NUMBERS_IN_SOLUTION = NUMBER_OF_OPERATIONS_IN_SOLUTION + 1
# A formula is recorded by getresults() only when its result is one of these.
TARGET_VALUES = {22,27,29,38,39}
def getresults(list):
    """Find every left-to-right formula over ``list`` that hits a target value.

    Each combination of ``NUMBER_OF_NUMBERS_IN_SOLUTION`` numbers is joined
    with each sequence of ``NUMBER_OF_OPERATIONS_IN_SOLUTION`` operators
    (add, sub, mul, truediv) and evaluated strictly left to right.

    Args:
        list: The numbers to draw combinations from.

    Returns:
        A list of ``[numbers, operators, result]`` entries, one for every
        formula whose result is in ``TARGET_VALUES``.  The number of entries
        is also printed.
    """
    opslist = [operator.add, operator.sub, operator.mul, operator.truediv]
    # Cartesian product of all operator choices.  A comprehension is used
    # because the parameter name shadows the built-in ``list`` in here.
    ops = [val for val in product(opslist, repeat=NUMBER_OF_OPERATIONS_IN_SOLUTION)]
    results = []
    for x in combinations(list, NUMBER_OF_NUMBERS_IN_SOLUTION):
        for y in ops:
            try:
                # Fold left to right: ((x0 op0 x1) op1 x2) ...
                result = x[0]
                for op, operand in zip(y, x[1:]):
                    result = op(result, operand)
            except ZeroDivisionError:
                # A zero operand after a division has no value; skip the
                # formula instead of aborting the whole search.
                continue
            if result in TARGET_VALUES:
                results.append([x, y, result])
    print(len(results))
    return results
Then a class that takes in personal parameters for each person and gets the result set
def getalpha(str, inverse):
    """Map each character of ``str`` to ``ord(char) - 96`` (so 'a' gives 1).

    When ``inverse`` is truthy each number ``n`` is replaced by ``27 - n``.
    Returns the numbers as a list, in character order.
    """
    positions = [ord(char) - 96 for char in str]
    if inverse:
        return [27 - position for position in positions]
    return positions
class Person:
    """One sibling: the numbers derived from their data and the formulas found.

    ``listofnums`` holds the personal numbers followed by the letter values of
    ``cityfirst3``; ``results`` is whatever ``getresults`` returns for them.
    ``name``, ``middlename`` and ``state`` are accepted but not stored.
    """

    def __init__(self, name, middlename, birthmonth, birthday, birthyear, age, orderofbirth, gradyear, state, zip, workzip, cityfirst3):
        numbers = [
            birthmonth,
            birthday,
            birthyear,
            birthyear - 1900,
            age,
            orderofbirth,
            gradyear,
            gradyear - 2000,
            zip,
            workzip,
        ]
        numbers.extend(getalpha(cityfirst3, False))
        #final list
        self.listofnums = numbers
        self.results = getresults(self.listofnums)
Finally, a "solve code" method that takes from the result sets and finds any possible combinations that produce the full list of target_values.
#Compares the values of two sets
def compare(l1, l2):
    """Return True when both sequences have equal length and equal items."""
    pairwise_equal = all(left == right for left, right in zip(l1, l2))
    if not pairwise_equal:
        return False
    return len(l1) == len(l2)
#Check every result in sibling2 with a different result target_value and equal operation sets
def comparetwosiblings(current_values, sibling1, sibling2, a, b):
    """Tell whether result ``b`` of ``sibling2`` pairs with result ``a`` of ``sibling1``.

    The pairing is accepted when sibling2's result value is not yet in
    ``current_values``, both results use equal operator sequences, and for
    every operand position the two operands occur at a shared index of the
    siblings' ``listofnums``.
    """
    candidate = sibling2.results[b]
    if candidate[2] in current_values:
        return False
    reference = sibling1.results[a]
    if not compare(reference[1], candidate[1]):
        return False
    #If the indexes aren't alphanumeric, ensure they're the same before adding to new result set
    for c in range(0, NUMBER_OF_NUMBERS_IN_SOLUTION):
        positions1 = set(index for index, value in enumerate(sibling1.listofnums) if value == reference[0][c])
        positions2 = set(index for index, value in enumerate(sibling2.listofnums) if value == candidate[0][c])
        if len(positions1 & positions2) == 0:
            return False
    return True
#For every result, we start by adding the result number to the current_values list for sibling1, then cycle through each person and see if a matching operator list leads to a different result number. (Matching indices as well)
#If there's a result set for everyone that leads to five different numbers in the code, the values will be added to the newresult set
def solvecode( sibling1, sibling2, sibling3, sibling4, sibling5 ):
    """Print every way to pick one result per sibling that forms a full code.

    Starting from each result of ``sibling1``, a result of every other sibling
    is accepted when ``comparetwosiblings`` pairs it with that first result
    and its value differs from the values picked so far.  Each complete
    selection is recorded as the five operand tuples followed by sibling1's
    operator tuple.  The count and the list of selections are printed.
    """
    newresults = []
    for a, first in enumerate(sibling1.results):
        # seenN holds the result values chosen for siblings 1..N.
        seen1 = [first[2]]
        for b, second in enumerate(sibling2.results):
            if not comparetwosiblings(seen1, sibling1, sibling2, a, b):
                continue
            seen2 = seen1 + [second[2]]
            for c, third in enumerate(sibling3.results):
                if not comparetwosiblings(seen2, sibling1, sibling3, a, c):
                    continue
                seen3 = seen2 + [third[2]]
                for d, fourth in enumerate(sibling4.results):
                    if not comparetwosiblings(seen3, sibling1, sibling4, a, d):
                        continue
                    seen4 = seen3 + [fourth[2]]
                    for e, fifth in enumerate(sibling5.results):
                        if comparetwosiblings(seen4, sibling1, sibling5, a, e):
                            newresults.append([first[0], second[0], third[0], fourth[0], fifth[0], first[1]])
    print(len(newresults))
    print(newresults)
It's the last "solvecode" method that I'm wondering if I can optimize and make into a recursive algorithm. In some cases it can be helpful to add or remove a sibling, which would look nice recursively (My mom sometimes makes a mistake with one sibling, or we get a new brother/sister-in-law)
Thank you for any and all help! I hope you at least get a laugh out of my weird family tradition.
Edit: In case you want to test the algorithm, here's an example group of siblings that result in exactly one correct solution
#ALL PERSONAL INFO CHANGED FOR STACKOVERFLOW
# Five sample Person instances. Arguments are positional, in the order of
# Person.__init__: name, middlename, birthmonth, birthday, birthyear, age,
# orderofbirth, gradyear, state, zip, workzip, cityfirst3.
sibling1 = Person("sibling1", "horatio", 7, 8, 1998, 22, 5, 2020, "ma", 11111, 11111, "red")
sibling2 = Person("sibling2", "liem", 2, 21, 1995, 25, 4, 2018, "ma", 11111, 11111, "pho")
sibling3 = Person("sibling3", "kyle", 4, 21, 1993, 26, 3, 2016, "ma", 11111, 11111, "okl")
sibling4 = Person("sibling4", "jamal", 4, 7, 1991, 29, 2, 2014, "ma", 11111, 11111, "pla")
sibling5 = Person("sibling5", "roberto", 9, 23, 1990, 30, 1, 2012, "ma", 11111, 11111, "boe")
I just spent a while improving the code. Few things I need to mention:
It's not good practice to use the names of Python built-ins (like list, str and zip) as variable names: doing so shadows the built-in, can cause subtle bugs, and makes the code harder to debug.
I feel like you should use the permutation function as combination gives unordered pairs while permutation gives ordered pairs which are more in number and will give more results. For example, for the sibling info you gave combination gives only 1 solution through solvecode() while permutation gives 12.
Because you are working with operators, there can be more cases with brackets. To solve that problem and to make the getresults() function a bit more optimized, I suggest you explore the reverse polish notation. Computerphile has an excellent video on it.
You don't need a compare function. list1==list2 works.
Here's the optimized code:
import operator
from itertools import product
from itertools import permutations
# Number of binary operators applied in one candidate formula.
NUMBER_OF_OPERATIONS_IN_SOLUTION = 2 #Total numbers involved is this plus 1
# Number of operands in one candidate formula (one more than the operators).
NUMBER_OF_NUMBERS_IN_SOLUTION = NUMBER_OF_OPERATIONS_IN_SOLUTION + 1
# A formula is recorded by getresults() only when its result is one of these.
TARGET_VALUES = {22,27,29,38,39}
def getresults(listofnums):
    """Find every left-to-right formula over ``listofnums`` that hits a target.

    Each ordered selection (permutation) of ``NUMBER_OF_NUMBERS_IN_SOLUTION``
    numbers is joined with each sequence of
    ``NUMBER_OF_OPERATIONS_IN_SOLUTION`` operators (add, sub, mul, truediv)
    and evaluated strictly left to right.

    Args:
        listofnums: The numbers to draw permutations from.

    Returns:
        A list of ``[numbers, operators, result]`` entries, one for every
        formula whose result is in ``TARGET_VALUES``.
    """
    opslist = [operator.add, operator.sub, operator.mul, operator.truediv]
    # Cartesian product of all operator choices.
    ops = list(product(opslist, repeat=NUMBER_OF_OPERATIONS_IN_SOLUTION))
    results = []
    for x in permutations(listofnums, NUMBER_OF_NUMBERS_IN_SOLUTION):
        for y in ops:
            try:
                # Fold left to right: ((x0 op0 x1) op1 x2) ...
                result = y[0](x[0], x[1])
                for z in range(1, len(y)):
                    result = y[z](result, x[z + 1])
            except ZeroDivisionError:
                # Dividing by a zero operand has no value; skip the formula
                # instead of aborting the whole search.
                continue
            if result in TARGET_VALUES:
                results.append([x, y, result])
    return results
def getalpha(string, inverse):
    """Map each character of ``string`` to ``ord(char) - 96`` (so 'a' gives 1).

    When ``inverse`` is truthy each number ``n`` is replaced by ``27 - n``.
    Returns the numbers as a list, in character order.
    """
    def convert(char):
        alpha = ord(char) - 96
        if inverse:
            return 27 - alpha
        return alpha

    return [convert(char) for char in string]
class Person:
    """One sibling: the numbers derived from their data and the formulas found.

    ``listofnums`` holds the personal numbers followed by the letter values of
    ``cityfirst3``; ``results`` is whatever ``getresults`` returns for them.
    ``name``, ``middlename`` and ``state`` are accepted but not stored.
    """

    def __init__(self, name, middlename, birthmonth, birthday, birthyear, age, orderofbirth, gradyear, state, zipcode, workzip, cityfirst3):
        personal_numbers = [
            birthmonth,
            birthday,
            birthyear,
            birthyear - 1900,
            age,
            orderofbirth,
            gradyear,
            gradyear - 2000,
            zipcode,
            workzip,
        ]
        #final list
        self.listofnums = personal_numbers + getalpha(cityfirst3, False)
        self.results = getresults(self.listofnums)
#Check every result in sibling2 with a different result target_value and equal operation sets
def comparetwosiblings(current_values, sibling1, sibling2, a, b):
    """Tell whether result ``b`` of ``sibling2`` pairs with result ``a`` of ``sibling1``.

    The pairing is accepted when sibling2's result value is not yet in
    ``current_values``, both results use equal operator tuples, and for every
    operand position the two operands occur at a shared index of the
    siblings' ``listofnums``.
    """
    candidate = sibling2.results[b]
    if candidate[2] in current_values:
        return False
    reference = sibling1.results[a]
    if not reference[1] == candidate[1]:
        return False
    #If the indexes aren't alphanumeric, ensure they're the same before adding to new result set
    for c in range(0, NUMBER_OF_NUMBERS_IN_SOLUTION):
        positions1 = set(index for index, value in enumerate(sibling1.listofnums) if value == reference[0][c])
        positions2 = set(index for index, value in enumerate(sibling2.listofnums) if value == candidate[0][c])
        if len(positions1 & positions2) == 0:
            return False
    return True
And now, the million dollar function or should i say two functions:
# var contains the loop variables a-e, depth keeps track of sibling number
def rec(arg, var, current_values, newresults, depth):
    """Recursively extend a partial selection with a result of ``arg[depth]``.

    Args:
        arg: Sequence of siblings (objects with a ``results`` list of
            ``[numbers, operators, value]`` entries); works for any length
            of two or more.
        var: Result indices chosen so far, one per sibling ``0 .. depth-1``.
            It is not modified.
        current_values: Result values chosen so far; restored to its entry
            state before returning.
        newresults: Output list; one entry is appended per complete
            selection: every sibling's operand tuple followed by the operator
            tuple of sibling 0's result.
        depth: Index of the sibling to choose a result for.
    """
    for i in range(len(arg[depth].results)):
        if not comparetwosiblings(current_values, arg[0], arg[depth], var[0], i):
            continue
        # Build a fresh index list so sibling iterations never see each
        # other's choices (extending ``var`` in place kept the first match).
        chosen = var[:depth] + [i]
        if depth < len(arg) - 1:
            value = arg[depth].results[i][2]
            current_values.append(value)
            rec(arg, chosen, current_values, newresults, depth + 1)
            current_values.remove(value)
        else:
            # Last sibling reached: record the selection for however many
            # siblings were passed in.
            row = [sibling.results[index][0] for sibling, index in zip(arg, chosen)]
            row.append(arg[0].results[var[0]][1])
            newresults.append(row)
def solvecode(*arg):
    """Print every complete selection of one result per sibling.

    For each result of the first sibling the recursive search ``rec`` is
    started at the second sibling; afterwards the number of selections and
    the selections themselves are printed.
    """
    newresults = []
    for a, first_result in enumerate(arg[0].results):
        rec(arg, var=[a], current_values=[first_result[2]], newresults=newresults, depth=1)
    print(len(newresults))
    print(newresults)
There is a need for two functions as the first one is the recursive one and the second one is like a packaging. I've also fulfilled your second wish, that was being able to have variable number of siblings' data that can be input into the new solvecode function. I've checked the new functions and they work together exactly like the original solvecode function. Something to be noted is that there is no significant difference in the version's runtimes although the second one has 8 less lines of code. Hope this helped. lmao took me 3 hours.
import numpy as np
# Sample inputs.
x = [1, 2, 3, 3]
y = [1, 2, 3]
z = [6, 6, 1, 2, 9, 9]
(only positive values)
In each array i need to return the most common value, or, if values come up the same amount of times - return the minimum.
This is home assignment and I can't use anything but numpy.
outputs:
f(x) = 3,
f(y) = 1,
f(z) = 6
for a numpy exclusive solution something like this will work:
# bincount: element i of the result is how many times the value i occurs in x.
occurances = np.bincount(x)
# argmax returns the first position of the maximum, i.e. the smallest value
# among equally frequent ones.
print (np.argmax(occurances))
The above mentioned method won't work if there is a negative number in the list. So in order to account for such an occurrence kindly use:
# np.unique returns the distinct values in ascending order together with
# their counts, so negative numbers are handled as well.
not_required, counts = np.unique(x, return_counts=True)
x=np.array(x)
# argmax picks the first maximum, i.e. the smallest of the most frequent
# values.  The former if/else on the sign of x printed this same expression
# in both branches, so a single print is sufficient.
print(not_required[np.argmax(counts)])
It's called a mode function. See https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mode.html
Without numpy
# Count how often every value of x occurs.
n_dict = {}
for k in x:
    n_dict[k] = n_dict.get(k, 0) + 1
# Invert the counts: occurrence count -> list of values with that count.
rev_n_dict = {}
for k in n_dict:
    rev_n_dict.setdefault(n_dict[k], []).append(k)
# Highest occurrence count seen (stays 0 when there are no counts).
local_max = 0
for k in rev_n_dict:
    local_max = max(local_max, k)
# Smallest of the values that share the highest count.
if not len(rev_n_dict[local_max]) > 0:
    print (rev_n_dict[local_max])
else:
    print (min(rev_n_dict[local_max]))
To add to the previous results, you could use a collections.Counter object:
from collections import Counter

# Sample data (the original listing elided the middle values with "....",
# which is not valid Python).
my_array = [3, 24, 543, 3, 1, 6, 7, 8, 223213, 13213]
my_counter = Counter(my_array)
# most_common() orders equally frequent values by first appearance, so the
# minimum among the most frequent values is selected explicitly.
highest_count = my_counter.most_common(1)[0][1]
most_common_value = min(value for value, count in my_counter.items() if count == highest_count)
It is quite simple but certainly not pretty. I have used variable names that will be self explanatory along with the comments. Feel free to ask if there is a doubt.
import numpy as np
# Sample input in which 6 and 9 both occur twice.
x=([6,6,1,2,9,9])
def tester(x):
not_required, counts = np.unique(x, return_counts=True)
x=np.array(x)
if (x >= 0).all():
highest_occurance=[not_required[np.argmax(counts)]]
number_of_counts=np.max(counts)
else:
highest_occurance=not_required[np.argmax(counts)]
number_of_counts=np.max(counts)
return highest_occurance,number_of_counts
# Most frequent value of x and its count.
most_abundant, first_test_counts = tester(x)
# Drop that value and ask again to see whether another value ties with it.
new_x = [vals for vals in x if vals not in most_abundant]
second_most_abundant, second_test_counts = tester(new_x)
if second_test_counts != first_test_counts:
    print("%s occurrs for the max of %s times"%(most_abundant,first_test_counts))
else:
    print("Atleast two elements have the same number of counts",most_abundant," and", second_most_abundant, "have %s"%first_test_counts,"occurances")
we can also loop it to check if there are more than two elements with the same occurrence, instead of using an if else for a specific case of only looking at two elements
Hi I'm trying to make a list of the maximum value of a unique string within a list.
example:
a = ['DS10.json', 'DS11.json', 'DT4.json', 'DT5.json', 'DT6.json', 'CJ6.json', 'CJ7.json']
should return me a list of the following:
['DS11.json', 'DT6.json', 'CJ7.json']
I have tried the following code:
def j(l):
    """Return, per two-character prefix, the code with the largest number.

    Args:
        l: Iterable of names such as ``'DS10.json'``: a two-character prefix,
            an integer, then an optional ``.``-separated suffix.

    Returns:
        List of codes without the suffix (e.g. ``'DS11'``).  A prefix whose
        entry is replaced by a larger one moves to the end of the list.

    Raises:
        ValueError: If two names share a prefix and the part after the
            prefix of either one is not an integer.
    """
    p = []
    for i in l:
        digcode = i.split('.')[0]
        prefix = digcode[:2]
        # Position of the entry already kept for this prefix, if any.
        pos = next((n for n, kept in enumerate(p) if kept.startswith(prefix)), None)
        if pos is None:
            p.append(digcode)
        # Compare as integers: as strings '9' sorts after '11'.
        elif int(digcode[2:]) > int(p[pos][2:]):
            p.pop(pos)
            p.append(digcode)
    return p
But when I apply it to a larger sample it does not do an accurate job
>>> o = ['AS6.json', 'AS7.json', 'AS8.json', 'AS9.json', 'BS1.json', 'BS2.json', 'BS3.json', 'BS4.json', 'BS5.json', 'CS1.json', 'CS2.json', 'CS3.json', 'CS4.json', 'CS5.json', 'CS6.json', 'DS10.json', 'DS11.json', 'DS4.json', 'DS5.json', 'DS6.json', 'DS7.json', 'DS8.json', 'DS9.json', 'ES4.json', 'ES5.json', 'ES6.json', 'FS5.json', 'FS6.json', 'FS7.json', 'FS8.json', 'MS4.json', 'MS5.json', 'MS6.json', 'MS7.json', 'MS8.json', 'MS9.json', 'NR1.json', 'NR2.json', 'NR3.json', 'NR4.json', 'NR5.json', 'NR6.json', 'NR7.json', 'NR8.json', 'VR1.json', 'VR2.json', 'VR3.json', 'VR4.json', 'VR5.json', 'VR6.json', 'VR7.json', 'VR8.json', 'XS11.json', 'XS9.json']
>>> j(o)
['AS9', 'BS5', 'CS6', 'DS9', 'ES6', 'FS8', 'MS9', 'NR8', 'VR8', 'XS9']
which is incorrect as there is a XS11 and DS11 as an example.
I would appreciate if someone could help me rectify my problem or perhaps find a simpler solution to my problem. Thank you
You are making string comparisons; '9' is greater than '11' because the character '9' comes later in the alphabet. You'll have to convert those to integers first.
I'd use a dictionary to map prefixes to the maximum number:
def find_latest(lst):
    """Return, per two-character prefix, the entry with the largest number.

    The number is the integer between the prefix and the first ``.`` of an
    entry.  On equal numbers the entry seen first is kept.  Entries are
    returned in the iteration order of the internal prefix dictionary.
    """
    best = {}
    for entry in lst:
        code = entry[:2]
        stem = entry.partition('.')[0]
        value = int(stem[2:])
        current = best.get(code)
        if current is None or value > current[0]:
            best[code] = (value, entry)
    return [entry for _, entry in best.values()]
This is far more efficient as it doesn't loop over your whole input list each time; you are processing the list N^2 times (add one element and you are adding N tests to work through); it processes your list in N steps instead. So instead of 100 tests for 10 elements, this just executes 10 tests.
Demo:
>>> sample = ['AS6.json', 'AS7.json', 'AS8.json', 'AS9.json', 'BS1.json', 'BS2.json', 'BS3.json', 'BS4.json', 'BS5.json', 'CS1.json', 'CS2.json', 'CS3.json', 'CS4.json', 'CS5.json', 'CS6.json', 'DS10.json', 'DS11.json', 'DS4.json', 'DS5.json', 'DS6.json', 'DS7.json', 'DS8.json', 'DS9.json', 'ES4.json', 'ES5.json', 'ES6.json', 'FS5.json', 'FS6.json', 'FS7.json', 'FS8.json', 'MS4.json', 'MS5.json', 'MS6.json', 'MS7.json', 'MS8.json', 'MS9.json', 'NR1.json', 'NR2.json', 'NR3.json', 'NR4.json', 'NR5.json', 'NR6.json', 'NR7.json', 'NR8.json', 'VR1.json', 'VR2.json', 'VR3.json', 'VR4.json', 'VR5.json', 'VR6.json', 'VR7.json', 'VR8.json', 'XS11.json', 'XS9.json']
>>> def find_latest(lst):
... prefixes = {}
... for entry in lst:
... code, value = entry[:2], int(entry.partition('.')[0][2:])
... if value > prefixes.get(code, (float('-inf'), ''))[0]:
... prefixes[code] = (value, entry)
... return [entry for value, entry in prefixes.values()]
...
>>> find_latest(sample)
['FS8.json', 'VR8.json', 'AS9.json', 'MS9.json', 'BS5.json', 'CS6.json', 'XS11.json', 'NR8.json', 'DS11.json', 'ES6.json']
It looks as though your digcode[2:] values are being compared lexicographically (dictionary order), rather than numerically.
So 9 is considered to be "larger than" 11, because in a list of words, a word that began with "9" would come after a word that began with "11".
For comparison purposes you should convert digcode[2:] to a number i.e. int(digcode[2:])
if digcode[2:] > p[[n for n, l in enumerate(p) if l.startswith(digcode[:2])][0]][2:]:
to
if int(digcode[2:]) > int(p[[n for n, l in enumerate(p) if l.startswith(digcode[:2])][0]][2:]):
This gives:
>>> j(o)
['AS9', 'BS5', 'CS6', 'DS11', 'ES6', 'FS8', 'MS9', 'NR8', 'VR8', 'XS11']
I am looking for a function that makes a new array of values based on ordered_ids, when the array has a length of one million.
Input:
>>> ids=array(["WYOMING01","TEXAS01","TEXAS02",...])
>>> values=array([12,20,30,...])
>>> ordered_ids=array(["TEXAS01","TEXAS02","ALABAMA01",...])
Output:
ordered [ 20 , 30 , nan , ...]
Closing Summary
#Dietrich's use of a dictionary in list comprehension is 10x faster than using numpy index search (numpy.where). I compared the times of three results in my answer below.
You could try:
import numpy as np
def order_array(ids, values, master_order_ids):
    """Return ``values`` rearranged to follow ``master_order_ids``.

    Args:
        ids: 1-D sequence of identifiers, in any order.
        values: 1-D sequence of numbers; ``values[i]`` belongs to ``ids[i]``.
        master_order_ids: 1-D sequence of identifiers giving the output order.

    Returns:
        Float array of ``len(master_order_ids)``.  Entry ``j`` is the value
        belonging to ``master_order_ids[j]``, or ``nan`` when that identifier
        does not occur in ``ids``.  If an identifier is repeated in ``ids``,
        its first occurrence is used.  The array is also printed.
    """
    ids = np.asarray(ids)
    values = np.asarray(values)
    master_order_ids = np.asarray(master_order_ids)
    ordered_values = np.full(len(master_order_ids), np.nan)
    if len(ids) and len(master_order_ids):
        # searchsorted needs a sorted array to search in, so sort ids (stable,
        # to keep first occurrences first) instead of assuming that
        # master_order_ids happens to be sorted.
        order = np.argsort(ids, kind="mergesort")
        sorted_ids = ids[order]
        pos = np.searchsorted(sorted_ids, master_order_ids)
        # An insertion point past the end can never be a match; clip it so the
        # equality test below stays in range.
        pos = np.minimum(pos, len(sorted_ids) - 1)
        # searchsorted only yields insertion points; keep real matches only.
        found = sorted_ids[pos] == master_order_ids
        ordered_values[found] = values[order[pos[found]]]
    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print("ordered %s" % (ordered_values,))
    return ordered_values
Searchsorted gives you the indices where you should insert ids into master_order_ids to keep the array ordered. Then you just drop those (idx, values) pairs that are out of the range of master_order_ids.
You could try using a dict() to associate the strings with your numbers. It simplifies the code considerably:
import numpy as np
def order_bydict(ids, values, master_order_ids):
    """Return the values re-ordered to follow ``master_order_ids``.

    ``ids`` and ``values`` are paired up position by position; identifiers of
    ``master_order_ids`` without a partner in ``ids`` yield 0.  The result is
    a numpy array.
    """
    value_by_id = dict(zip(ids, values))
    ordered_values = []
    for master_id in master_order_ids:
        ordered_values.append(value_by_id.get(master_id, 0))
    return np.asarray(ordered_values)
The speedwise improvement is hard to predict without testing longer arrays (with your example it was 25% faster based on %timeit).
import numpy
from numpy import copy, random, arange
import time
# SETUP
# time.clock() was removed in Python 3.8; prefer perf_counter() and fall back
# to clock() only on interpreters that predate it.
timer = getattr(time, "perf_counter", None) or time.clock
N=10**4
ids = arange(0,N).astype(str)
values = arange(0,N)
numpy.random.shuffle(ids)
numpy.random.shuffle(values)
ordered_ids=arange(0,N).astype(str)
ordered_values = numpy.empty((N,1))
# numpy.nan is the spelling available in every NumPy release
# (the numpy.NAN alias was removed in NumPy 2.0).
ordered_values[:] = numpy.nan
# METHOD 1
start = timer()
for i in range(len(values)):
    ordered_values[ordered_ids==ids[i]]=values[i]
print("not using dictionary: %s" % (timer() - start))
# METHOD 2
start = timer()
d = dict(zip(ids, values))
# items() works on Python 2 and 3; iteritems() is Python 2 only.
for k, v in d.items():
    ordered_values[ordered_ids==k] = v
print("using dictionary: %s" % (timer() - start))
# METHOD 3 #Dietrich's approach in the answer above
start = timer()
dd = dict(zip(ids, values))
ordered_values = [dd.get(m, 0) for m in ordered_ids]
print("using dictionary with list comprehension: %s" % (timer() - start))
Results
not using dictionary: 1.320237 # Method 1
using dictionary: 1.327119 # Method 2
using dictionary with list comprehension: 0.013287 # #Dietrich
The following solution using the numpy_indexed package (disclaimer: I am its author) is purely vectorized, and likely to be much more efficient than the solutions posted thus far:
import numpy_indexed as npi
# NOTE(review): numpy_indexed is a third-party package; presumably indices()
# returns, for each entry of ordered_ids, its position in ids as a masked
# array with missing entries masked - confirm against the package docs.
idx = npi.indices(ids, ordered_ids, missing='mask')
# Pick the values in the order given by idx.
new_values = values[idx]
# Overwrite the entries flagged in idx.mask with a placeholder.
new_values[idx.mask] = -1 # or cast to float and set to nan, but you get the idea...
As an example my list is:
[25.75443, 26.7803, 25.79099, 24.17642, 24.3526, 22.79056, 20.84866,
19.49222, 18.38086, 18.0358, 16.57819, 15.71255, 14.79059, 13.64154,
13.09409, 12.18347, 11.33447, 10.32184, 9.544922, 8.813385, 8.181152,
6.983734, 6.048035, 5.505096, 4.65799]
and I'm looking for the index of the value closest to 11.5. I've tried other methods such as binary search and bisect_left but they don't work.
I cannot sort this array, because the index of the value will be used on a similar array to fetch the value at that index.
Try the following:
min(range(len(a)), key=lambda i: abs(a[i]-11.5))
For example:
>>> a = [25.75443, 26.7803, 25.79099, 24.17642, 24.3526, 22.79056, 20.84866, 19.49222, 18.38086, 18.0358, 16.57819, 15.71255, 14.79059, 13.64154, 13.09409, 12.18347, 11.33447, 10.32184, 9.544922, 8.813385, 8.181152, 6.983734, 6.048035, 5.505096, 4.65799]
>>> min(range(len(a)), key=lambda i: abs(a[i]-11.5))
16
Or to get the index and the value:
>>> min(enumerate(a), key=lambda x: abs(x[1]-11.5))
(16, 11.33447)
import numpy as np
a = [
    25.75443, 26.7803, 25.79099, 24.17642, 24.3526, 22.79056, 20.84866,
    19.49222, 18.38086, 18.0358, 16.57819, 15.71255, 14.79059, 13.64154,
    13.09409, 12.18347, 11.33447, 10.32184, 9.544922, 8.813385, 8.181152,
    6.983734, 6.048035, 5.505096, 4.65799,
]
# Position of the element with the smallest absolute distance to 11.5.
index = np.abs(np.asarray(a) - 11.5).argmin()
a[index] # here is your result
In case a is already an array, the corresponding transformation can be omitted.
How about: you zip the two lists, then sort the result?
If you can't sort the array, then there is no quick way to find the closest item - you have to iterate over all entries.
There is a workaround but it's quite a bit of work: Write a sort algorithm which sorts the array and (at the same time) updates a second array which tells you where this entry was before the array was sorted.
That way, you can use binary search to look up index of the closest entry and then use this index to look up the original index using the "index array".
[EDIT] Using zip(), this is pretty simple to achieve:
# Pair every value with its original position and order the pairs by value.
# sorted() is used because zip() returns an iterator without .sort() on
# Python 3, and the key must be a callable (lambda), not "i:i[0]".
array_to_sort = sorted(zip(original_array, range(len(original_array))), key=lambda i: i[0])
Now you can binary search for the value (using item[0]). item[1] will give you the original index.
Going through all the items is only linear. If you would sort the array that would be worse.
I don't see a problem with keeping an additional deltax (the min difference so far) and idx (the index of that element) and just looping once through the list.
Keep in mind that if space isn't important you can sort any list without moving the contents by creating a secondary list of the sorted indices.
Also bear in mind that if you are doing this look up just once, then you will just have to traverse every element in the list O(n). (If multiple times then you probably would want to sort for increase efficiency later)
If you are searching a long list a lot of times, then min scales very bad (O(n^2), if you append some of your searches to the search list, I think).
Bisect is your friend. Here's my solution. It scales O(n*log(n)):
import bisect


class Closest:
    """Find the number closest to a target in a list via binary search.

    Assumes *no* redundant entries - all inputs must be unique.

    Instance state (set in ``__init__``):
        numindexes: dict mapping each number to its position in the input.
        nums: the numbers in ascending order.
        firstdistance: what ``distance()`` reports when there is no number
            to compare against.
    """

    def __init__(self, numlist=None, firstdistance=0):
        # ``is None`` rather than ``== None``: equality may be overridden by
        # the argument's type (e.g. element-wise comparison for arrays).
        if numlist is None:
            numlist = []
        self.numindexes = dict((val, n) for n, val in enumerate(numlist))
        self.nums = sorted(self.numindexes)
        self.firstdistance = firstdistance

    def append(self, num):
        """Add ``num``; its index is the count of numbers stored before it.

        Raises:
            ValueError: If ``num`` is already stored.
        """
        if num in self.numindexes:
            raise ValueError("Cannot append '%s' it is already used" % str(num))
        self.numindexes[num] = len(self.nums)
        bisect.insort(self.nums, num)

    def rank(self, target):
        """Return the position in ``nums`` of the number closest to ``target``.

        On equal distance the larger neighbour wins.  Returns 0 when no
        numbers are stored.
        """
        rank = bisect.bisect(self.nums, target)
        if rank == 0:
            # target is below every number (or there are none).
            return rank
        if rank == len(self.nums):
            # target is at or above the largest number.
            return rank - 1
        dist_below = target - self.nums[rank - 1]
        dist_above = self.nums[rank] - target
        if dist_below < dist_above:
            rank -= 1
        return rank

    def closest(self, target):
        """Return the original index of the number closest to ``target``.

        Returns 0 when no numbers are stored.
        """
        try:
            return self.numindexes[self.nums[self.rank(target)]]
        except IndexError:
            return 0

    def distance(self, target):
        """Return the absolute distance from ``target`` to the closest number.

        Returns ``firstdistance`` when no numbers are stored.
        """
        rank = self.rank(target)
        try:
            dist = abs(self.nums[rank] - target)
        except IndexError:
            dist = self.firstdistance
        return dist
Use it like this:
a = [
    25.75443, 26.7803, 25.79099, 24.17642, 24.3526,
    22.79056, 20.84866, 19.49222, 18.38086, 18.0358,
    16.57819, 15.71255, 14.79059, 13.64154, 13.09409,
    12.18347, 1.33447, 10.32184, 9.544922, 8.813385,
    8.181152, 6.983734, 6.048035, 5.505096, 4.65799,
]
targets = [1.0, 100.0, 15.0, 15.6, 8.0]
cl = Closest(a)
# For every target show its rank, the closest number and that number's
# index in the list a.
for x in targets:
    rank = cl.rank(x)
    print("Closest to %5.1f : rank=%2i num=%8.5f index=%2i " % (x, rank, cl.nums[rank], cl.closest(x)))
Will output:
Closest to 1.0 : rank= 0 num= 1.33447 index=16
Closest to 100.0 : rank=24 num=26.78030 index= 1
Closest to 15.0 : rank=12 num=14.79059 index=12
Closest to 15.6 : rank=13 num=15.71255 index=11
Closest to 8.0 : rank= 5 num= 8.18115 index=20
And:
# Add a new number; append() stores it under index len(nums) taken before the
# insert, which is 25 here.
cl.append(99.9)
x = 100.0
# 99.9 is now the stored number closest to 100.0.
rank = cl.rank(x)
print("Closest to %5.1f : rank=%2i num=%8.5f index=%2i " % (x, rank,
cl.nums[rank], cl.closest(x)))
Output:
Closest to 100.0 : rank=25 num=99.90000 index=25