Unique column of a sparse matrix in Python - python

What is a good way of identifying the unique columns of a sparse matrix represented in csc_matrix format and the times each column is repeated?
I have no information a priori about the elements of the matrix. It is the result of a sampling with replacement of the columns of another matrix, so I can have duplicated columns both due to the fact that a column is sampled many times either there are duplicated columns in the original matrix. Therefore I cannot apply numpy.unique to the indices of the sampled columns and I think it is not a good choice to convert the entire matrix to a dense format and then apply numpy.unique to it.

You can sort and group by the number of nonzeros in each column. Each group then sort by indices and values and split into blocks of no-change:
import numpy as np
from scipy import sparse
def sparse_unique_columns(M):
M = M.tocsc()
m, n = M.shape
if not M.has_sorted_indices:
M.sort_indices()
if not M.has_canonical_format:
M.sum_duplicates()
sizes = np.diff(M.indptr)
idx = np.argsort(sizes)
Ms = M#sparse.csc_matrix((np.ones((n,)), idx, np.arange(n+1)), (n, n))
ssizes = np.diff(Ms.indptr)
ssizes[1:] -= ssizes[:-1]
grpidx, = np.where(ssizes)
grpidx = np.concatenate([grpidx, [n]])
if ssizes[0] == 0:
counts = [np.array([0, grpidx[0]])]
else:
counts = [np.zeros((1,), int)]
ssizes = ssizes[grpidx[:-1]].cumsum()
for i, ss in enumerate(ssizes):
gil, gir = grpidx[i:i+2]
pl, pr = Ms.indptr[[gil, gir]]
dv = Ms.data[pl:pr].view(f'V{ss*Ms.data.dtype.itemsize}')
iv = Ms.indices[pl:pr].view(f'V{ss*Ms.indices.dtype.itemsize}')
idxi = np.lexsort((dv, iv))
dv = dv[idxi]
iv = iv[idxi]
chng, = np.where(np.concatenate(
[[True], (dv[1:] != dv[:-1]) | (iv[1:] != iv[:-1]), [True]]))
counts.append(np.diff(chng))
idx[gil:gir] = idx[gil:gir][idxi]
counts = np.concatenate(counts)
nu = counts.size - 1
uniques = M#sparse.csc_matrix((np.ones((nu,)), idx[counts[:-1].cumsum()],
np.arange(nu + 1)), (n, nu))
return uniques, idx, counts[1:]
a = np.random.uniform(0, 10, (1000, 200))
a[a>1] = 0
a = sparse.csc_matrix(a)
b = sparse.csc_matrix((np.ones(1000), np.random.randint(0, 200, (1000,)), np.arange(1001)))
c = a#b
unq, idx, cnt = sparse_unique_columns(c)
unqd, idxd, cntd = np.unique(c.A, axis=1, return_counts=True, return_inverse=True)
from timeit import timeit
print('sparse:', timeit(lambda: sparse_unique_columns(c), number=1000), 'ms')
print('dense: ', timeit(lambda: np.unique(c.A, axis=1, return_counts=True), number=100)*10, 'ms')
Sample output:
sparse: 2.735588440205902 ms
dense: 49.32689592242241 ms

Related

Need to use apply or broadcasting and masking to iterate over a DataFrame

I have a data frame that I need to iterate over. I want to use either apply or broadcasting and masking. This is the pseudocode I am trying to improve upon.
2 The algorithm
Algorithm 1: The algorithm
initialize the population (of size n) uniformly randomly, obeying the bounds;
while a pre-determined number of iterations is not complete do
set the random parameters (two independent parameters for each of the d
variables); find the best and the worst vectors in the population;
for each vector in the population do create a new vector using the
current vector, the best vector, the worst vector, and the random
parameters;
if the new vector is at least as good as the current vector then
current vector = new vector;
This is the code I have so far.
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.uniform(-5.0, 10.0, size = (20, 5)), columns = list('ABCDE'))
pd.set_option('display.max_columns', 500)
df
#while portion of pseudocode
f_func = np.square(df).sum(axis=1)
final_func = np.square(f_func)
xti_best = final_func.idxmin()
xti_worst = final_func.idxmax()
print(final_func)
print(df.head())
print(df.tail())
*#for loop of pseudocode
#for row in df.iterrows():
#implement equation from assignment
#define in array math
#xi_new = row.to_numpy() + np.random.uniform(0, 1, size = (1, 5)) * (df.iloc[xti_best].values - np.absolute(row.to_numpy())) - np.random.uniform(0, 1, size = (1, 5)) * (df.iloc[xti_worst].values - np.absolute(row.to_numpy()))
#print(xi_new)*
df2 = df.apply(lambda row: 0 if row == 0 else row + np.random.uniform(0, 1, size = (1, 5)) * (df.iloc[xti_best].values - np.absolute(axis = 1)))
print(df2)
The formula I am trying to use for xi_new is:
#xi_new = xi_current + random value between 0,1(xti_best -abs(xi_current)) - random value(xti_worst - abs(xi_current))
I'm not sure I'm implementing your formula correctly, but hopefully this helps
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.uniform(-5.0, 10.0, size = (20, 5)), columns = list('ABCDE'))
#while portion of pseudocode
f_func = np.square(df).sum(axis=1)
final_func = np.square(f_func)
xti_best_idx = final_func.idxmin()
xti_worst_idx = final_func.idxmax()
xti_best = df.loc[xti_best_idx]
xti_worst = df.loc[xti_worst_idx]
#Calculate random values for the whole df for the two different areas where you need randomness
nrows,ncols = df.shape
r1 = np.random.uniform(0, 1, size = (nrows, ncols))
r2 = np.random.uniform(0, 1, size = (nrows, ncols))
#xi_new = xi_current + random value between 0,1(xti_best -abs(xi_current)) - random value(xti_worst - abs(xi_current))
df= df+r1*xti_best.sub(df.abs())-r2*xti_worst.sub(df.abs())
df

Efficient ways of calculating affinity only for local neighbourhood in a large graph

I am relatively new to python and numpy and am trying to cluster a dense matrix with floating point numbers and having dimensions of 256x256 using spectral clustering. Since the affinity matrix will be of size 65536x65536, a full affinity matrix cannot be computed (due to memory limitations). As such, I am currently calculating the affinity between a given matrix entry and its 5x5 local neighbourhood, and build a sparse graph (in 3-tuple representation).
To do so, I am using for loops (basically, a sliding widow approach) which I think is not the most efficient way of doing so.
import numpy as np
def getAffinity(f1, f2):
return np.exp(-np.linalg.norm(np.absolute(f1 - f2))/ 2.1)
G = np.arange(256*256).reshape((256,256))
dim1 = 256 # Dimension 1 of matrix
dim2 = 256 # Dimension 1 of matrix
values = np.zeros(1623076, dtype=np.float32) # To hold affinities
rows = np.zeros(1623076, dtype=np.int32) # To hold row index
cols = np.zeros(1623076, dtype=np.int32) # To hold column index
index = 0 # To hold column index
for i in range(dim1):
for j in range(dim2):
current = G[i, j]
for k in range(np.maximum(0, i-2), np.minimum(dim1 , i+3)): # traverse rows
for l in range(np.maximum(0, j-2), np.minimum(dim2 , j+3)): # traverse columns
rows[index] = i*d1 + j
cols[index] = k*d1 + l
values[index] = getAffinity(current, G[k, l])
index += 1
I was wondering whether there are any other efficient ways of achieving the same goal.
Here is a sparse matrix approach. It is >800x faster than the loopy code.
import numpy as np
from scipy import sparse
from time import perf_counter as pc
T = []
T.append(pc())
def getAffinity(f1, f2):
return np.exp(-np.linalg.norm(np.absolute(f1 - f2))/ 2.1)
G = 2*np.arange(256*256).reshape((256,256))
dim1 = 256 # Dimension 1 of matrix
dim2 = 256 # Dimension 1 of matrix
values = np.zeros(1623076, dtype=np.float32) # To hold affinities
rows = np.zeros(1623076, dtype=np.int32) # To hold row index
cols = np.zeros(1623076, dtype=np.int32) # To hold column index
index = 0 # To hold column index
for i in range(dim1):
for j in range(dim2):
current = G[i, j]
for k in range(np.maximum(0, i-2), np.minimum(dim1 , i+3)): # traverse rows
for l in range(np.maximum(0, j-2), np.minimum(dim2 , j+3)): # traverse columns
rows[index] = i*dim1 + j
cols[index] = k*dim1 + l
values[index] = getAffinity(current, G[k, l])
index += 1
T.append(pc())
affs_OP = sparse.coo_matrix((values,(rows,cols))).tocsr()
import scipy.sparse as sp
def getAffinity(f1, f2): # similar to #PaulPanzer, I don't think OP is right
return np.exp(-np.abs(f1 - f2)/ 2.1)
def affinity_block(dim = 256, dist = 2):
i = np.arange(-dist, dist+1)
init_block = sp.dia_matrix((np.ones((i.size, dim)), i), (dim, dim))
out = sp.kron(init_block, init_block).tocoo()
out.data = getAffinity(Gf[out.row], Gf[out.col])
return out
T.append(pc())
Gf = G.ravel()
offsets = np.concatenate((np.mgrid[1:3,-2:3].reshape(2,-1).T,np.mgrid[:1,1:3].reshape(2,-1).T), axis=0)
def make_diag(yo,xo):
o = 256*yo+xo
diag = np.exp(-np.abs(Gf[o:]-Gf[:-o])/2.1)
if xo>0:
diag[:xo-256].reshape(-1,256)[:,-xo:] = 0
elif xo<0:
diag[:xo].reshape(-1,256)[:,:-xo] = 0
diag[xo:] = 0
return diag
diags = [make_diag(*o) for o in offsets]
offsets = np.sum(offsets*[256,1], axis=1)
affs_pp = sparse.diags([*diags,[np.ones(256*256)],*diags],np.concatenate([offsets,[0],-offsets]))
T.append(pc())
affs_df = affinity_block()
T.append(pc())
print("OP: {:.3f} s convert OP to sparse matrix: {:.3f} s pp {:.3f} s df: {:.3f} s".format(*np.diff(T)))
diff = affs_pp-affs_OP
diff *= diff.sign()
md = diff.max()
print(f"max deviation pp-OP: {md}")
print(f"number of different entries pp-df: {(affs_pp-affs_df).nnz}")
Sample run:
OP: 23.392 s convert OP to sparse matrix: 0.020 s pp 0.025 s df: 0.093 s
max deviation pp-OP: 2.0616356788405454e-08
number of different entries pp-df: 0
A bit of explanation, 1D first to keep it simple. Let's imagine an actually sliding window, so we can use time as an intuitive axis:
space
+-------------->
|
t | xo... x: window center
i | oxo.. o: window off center
m | .oxo. .: non window
e | ..oxo
| ...ox
v
time here actually is equivalent to space because we move with constant speed. We can now see that all the window points can be described as three diagonals. Offsets are 0, 1 and -1 but note that because the affinities are symmetric and the one for 0 is trivial, we need only calculate them for 1.
Now lets skip to 2D, the smallest example we can do is 3x3 window in 4x4 array. In row major this looks like.
xo..oo..........
oxo.ooo.........
.oxo.ooo........
..ox..oo........
oo..xo..oo......
ooo.oxo.ooo.....
.ooo.oxo.ooo....
..oo..ox..oo....
....oo..xo..oo..
....ooo.oxo.ooo.
.....ooo.oxo.ooo
......oo..ox..oo
........oo..xo..
........ooo.oxo.
.........ooo.oxo
..........oo..ox
The relevant offsets are (0,1),(1,-1),(1,0),(1,1) or in row major 0x4+1 = 1, 1x4-1 = 3, 1x4+0 = 4, 1x4+1 = 5. Also note that most of these diagonals are not complete, the missing bits explained by row major wrapping around, i.e. at z = y,x x = 3 the right neighbor z+1 is not actually a right neighbor y,x+1 ; instead, because of line jump, it is y+1,0 The if-else clause in the code above blanks the right bits of each diagonal.
#DanielF's strategy is similar but takes advantage of the block structure evident in the figure.
xo.. oo.. .... ....
oxo. ooo. .... ....
.oxo .ooo .... ....
..ox ..oo .... ....
oo.. xo.. oo.. ....
ooo. oxo. ooo. ....
.ooo .oxo .ooo ....
..oo ..ox ..oo ....
.... oo.. xo.. oo..
.... ooo. oxo. ooo.
.... .ooo .oxo .ooo
.... ..oo ..ox ..oo
.... .... oo.. xo..
.... .... ooo. oxo.
.... .... .ooo .oxo
.... .... ..oo ..ox
This seems to be a bit more elegant and extensible, albeit a bit (4x) slower, way to do the same thing as #PaulPanzer
import scipy.sparse as sp
from functools import reduce
def getAffinity(f1, f2): # similar to #PaulPanzer, I don't think OP is right
return np.exp(-np.abs(f1 - f2)/ 2.1)
def affinity_block(G, dist = 2):
Gf = G.ravel()
i = np.arange(-dist, dist+1)
init_blocks = [1]
for dim in G.shape:
init_blocks.append(sp.dia_matrix((np.ones((i.size, dim)), i), (dim, dim)))
out = reduce(sp.kron, init_blocks).tocoo()
out.data = getAffinity(Gf[out.row], Gf[out.col])
return out
This allows non-square G matrices, and higher dimensions.

Improve performance calculating a random sample matching specific conditions in pandas

For some dataset group_1 I need to iterate over all rows k times for robustness and find a matching random sample of another data frame group_2 according to some criteria expressed as data frame columns.
Unfortunately, this is fairly slow.
How can I improve performance?
The bottleneck is the apply-ed function, i.e. randomMatchingCondition.
import tqdm
import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
seed = 47
np.random.seed(seed)
###################################################################
# generate dummy data
size = 10000
df = pd.DataFrame({i: np.random.randint(1,100,size=size) for i in ['metric']})
df['label'] = np.random.randint(0,2, size=size)
df['group_1'] = pd.Series(np.random.randint(1,12, size=size)).astype(object)
df['group_2'] = pd.Series(np.random.randint(1,10, size=size)).astype(object)
group_0 = df[df['label'] == 0]
group_0 = group_0.reset_index(drop=True)
group_0 = group_0.rename(index=str, columns={"metric": "metric_group_0"})
join_columns_enrich = ['group_1', 'group_2']
join_real = ['metric_group_0']
join_real.extend(join_columns_enrich)
group_0 = group_0[join_real]
display(group_0.head())
group_1 = df[df['label'] == 1]
group_1 = group_1.reset_index(drop=True)
display(group_1.head())
###################################################################
# naive find random element matching condition
def randomMatchingCondition(original_element, group_0, join_columns, random_state):
limits_dict = original_element[join_columns_enrich].to_dict()
query = ' & '.join([f"{k} == {v}" for k, v in limits_dict.items()])
candidates = group_0.query(query)
if len(candidates) > 0:
return candidates.sample(n=1, random_state=random_state)['metric_group_0'].values[0]
else:
return np.nan
###################################################################
# iterate over pandas dataframe k times for more robust sampling
k = 3
resulting_df = None
for i in range(1, k+1):
group_1['metric_group_0'] = group_1.progress_apply(randomMatchingCondition,
args=[group_0, join_columns_enrich, None],
axis = 1)
group_1['run'] = i
if resulting_df is None:
resulting_df = group_1.copy()
else:
resulting_df = pd.concat([resulting_df, group_1])
resulting_df.head()
Experimenting with pre-sorting the data:
group_0 = group_0.sort_values(join_columns_enrich)
group_1 = group_1.sort_values(join_columns_enrich)
does not show any difference.
IIUC you want to end up with k number of random samples for each row (combination of metrics) in your input dataframe. So why not candidates.sample(n=k, ...), and get rid of the for loop? Alternatively you could concatenate you dataframe k times with pd.concat([group1] * k).
It depends on your real data but I would give a shot for grouping the input dataframe by metric columns with group1.groupby(join_columns_enrich) (if their cardinality is sufficiently low), and apply the random sampling on these groups, picking k * len(group.index) random samples for each. groupby is expensive, OTOH you might save a lot on the iteration/sampling once it's done.
#smiandras, you are correct. Getting rid of the for loop is important.
Variant 1: multiple samples:
def randomMatchingCondition(original_element, group_0, join_columns, k, random_state):
limits_dict = original_element[join_columns_enrich].to_dict()
query = ' & '.join([f"{k} == {v}" for k, v in limits_dict.items()])
candidates = group_0.query(query)
if len(candidates) > 0:
return candidates.sample(n=k, random_state=random_state, replace=True)['metric_group_0'].values
else:
return np.nan
###################################################################
# iterate over pandas dataframe k times for more robust sampling
k = 3
resulting_df = None
#######################
# trying to improve performance: sort both dataframes
group_0 = group_0.sort_values(join_columns_enrich)
group_1 = group_1.sort_values(join_columns_enrich)
#######################
group_1['metric_group_0'] = group_1.progress_apply(randomMatchingCondition,
args=[group_0, join_columns_enrich, k, None],
axis = 1)
print(group_1.isnull().sum())
group_1 = group_1[~group_1.metric_group_0.isnull()]
display(group_1.head())
s=pd.DataFrame({'metric_group_0':np.concatenate(group_1.metric_group_0.values)},index=group_1.index.repeat(group_1.metric_group_0.str.len()))
s = s.join(group_1.drop('metric_group_0',1),how='left')
s['pos_in_array'] = s.groupby(s.index).cumcount()
s.head()
Variant 2: all possible samples optimized by native JOIN operation.
WARN this is a bit unsafe as it might generate a gigantic number of rows:
size = 1000
df = pd.DataFrame({i: np.random.randint(1,100,size=size) for i in ['metric']})
df['label'] = np.random.randint(0,2, size=size)
df['group_1'] = pd.Series(np.random.randint(1,12, size=size)).astype(object)
df['group_2'] = pd.Series(np.random.randint(1,10, size=size)).astype(object)
group_0 = df[df['label'] == 0]
group_0 = group_0.reset_index(drop=True)
join_columns_enrich = ['group_1', 'group_2']
join_real = ['metric']
join_real.extend(join_columns_enrich)
group_0 = group_0[join_real]
display(group_0.head())
group_1 = df[df['label'] == 1]
group_1 = group_1.reset_index(drop=True)
display(group_1.head())
df = group_1.merge(group_0, on=join_columns_enrich)
display(df.head())
print(group_1.shape)
df.shape

How to get the index of a list items in another list?

Consider I have these lists:
l = [5,6,7,8,9,10,5,15,20]
m = [10,5]
I want to get the index of m in l. I used list comprehension to do that:
[(i,i+1) for i,j in enumerate(l) if m[0] == l[i] and m[1] == l[i+1]]
Output : [(5,6)]
But if I have more numbers in m, I feel its not the right way. So is there any easy approach in Python or with NumPy?
Another example:
l = [5,6,7,8,9,10,5,15,20,50,16,18]
m = [10,5,15,20]
The output should be:
[(5,6,7,8)]
The easiest way (using pure Python) would be to iterate over the items and first only check if the first item matches. This avoids doing sublist comparisons when not needed. Depending on the contents of your l this could outperform even NumPy broadcasting solutions:
def func(haystack, needle): # obviously needs a better name ...
if not needle:
return
# just optimization
lengthneedle = len(needle)
firstneedle = needle[0]
for idx, item in enumerate(haystack):
if item == firstneedle:
if haystack[idx:idx+lengthneedle] == needle:
yield tuple(range(idx, idx+lengthneedle))
>>> list(func(l, m))
[(5, 6, 7, 8)]
In case your interested in speed I checked the performance of the approaches (borrowing from my setup here):
import random
import numpy as np
# strided_app is from https://stackoverflow.com/a/40085052/
def strided_app(a, L, S ): # Window len = L, Stride len/stepsize = S
nrows = ((a.size-L)//S)+1
n = a.strides[0]
return np.lib.stride_tricks.as_strided(a, shape=(nrows,L), strides=(S*n,n))
def pattern_index_broadcasting(all_data, search_data):
n = len(search_data)
all_data = np.asarray(all_data)
all_data_2D = strided_app(np.asarray(all_data), n, S=1)
return np.flatnonzero((all_data_2D == search_data).all(1))
# view1D is from https://stackoverflow.com/a/45313353/
def view1D(a, b): # a, b are arrays
a = np.ascontiguousarray(a)
void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
return a.view(void_dt).ravel(), b.view(void_dt).ravel()
def pattern_index_view1D(all_data, search_data):
a = strided_app(np.asarray(all_data), L=len(search_data), S=1)
a0v, b0v = view1D(np.asarray(a), np.asarray(search_data))
return np.flatnonzero(np.in1d(a0v, b0v))
def find_sublist_indices(haystack, needle):
if not needle:
return
# just optimization
lengthneedle = len(needle)
firstneedle = needle[0]
restneedle = needle[1:]
for idx, item in enumerate(haystack):
if item == firstneedle:
if haystack[idx+1:idx+lengthneedle] == restneedle:
yield tuple(range(idx, idx+lengthneedle))
def Divakar1(l, m):
return np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
def Divakar2(l, m):
return np.squeeze(pattern_index_view1D(l, m)[:,None] + np.arange(len(m)))
def MSeifert(l, m):
return list(find_sublist_indices(l, m))
# Timing setup
timings = {Divakar1: [], Divakar2: [], MSeifert: []}
sizes = [2**i for i in range(5, 20, 2)]
# Timing
for size in sizes:
l = [random.randint(0, 50) for _ in range(size)]
m = [random.randint(0, 50) for _ in range(10)]
larr = np.asarray(l)
marr = np.asarray(m)
for func in timings:
# first timings:
# res = %timeit -o func(l, m)
# second timings:
if func is MSeifert:
res = %timeit -o func(l, m)
else:
res = %timeit -o func(larr, marr)
timings[func].append(res)
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure(1)
ax = plt.subplot(111)
for func in timings:
ax.plot(sizes,
[time.best for time in timings[func]],
label=str(func.__name__))
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('size')
ax.set_ylabel('time [seconds]')
ax.grid(which='both')
ax.legend()
plt.tight_layout()
In case your l and m are lists my function outperforms the NumPy solutions for all sizes:
But in case you have these as numpy arrays you'll get faster results for large arrays (size > 1000 elements) when using Divakars NumPy solutions:
You are basically looking for the starting indices of a list in another list.
Approach #1 : One approach to solve it would be to create sliding windows of the elements in list in which we are searching, giving us a 2D array and then simply use NumPy broadcasting to perform broadcasted comparison against the search list against each row of the 2D sliding window version obtained earlier. Thus, one method would be -
# strided_app is from https://stackoverflow.com/a/40085052/
def strided_app(a, L, S ): # Window len = L, Stride len/stepsize = S
nrows = ((a.size-L)//S)+1
n = a.strides[0]
return np.lib.stride_tricks.as_strided(a, shape=(nrows,L), strides=(S*n,n))
def pattern_index_broadcasting(all_data, search_data):
n = len(search_data)
all_data = np.asarray(all_data)
all_data_2D = strided_app(np.asarray(all_data), n, S=1)
return np.flatnonzero((all_data_2D == search_data).all(1))
out = np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
Sample runs -
In [340]: l = [5,6,7,8,9,10,5,15,20,50,16,18]
...: m = [10,5,15,20]
...:
In [341]: np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
Out[341]: array([5, 6, 7, 8])
In [342]: l = [5,6,7,8,9,10,5,15,20,50,16,18,10,5,15,20]
...: m = [10,5,15,20]
...:
In [343]: np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
Out[343]:
array([[ 5, 6, 7, 8],
[12, 13, 14, 15]])
Approach #2 : Another method would be to get the sliding window and then get the row-wise scalar view into the data to be search data and the data to be search for, giving us 1D data to work with, like so -
# view1D is from https://stackoverflow.com/a/45313353/
def view1D(a, b): # a, b are arrays
a = np.ascontiguousarray(a)
void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
return a.view(void_dt).ravel(), b.view(void_dt).ravel()
def pattern_index_view1D(all_data, search_data):
a = strided_app(np.asarray(all_data), L=len(search_data), S=1)
a0v, b0v = view1D(np.asarray(a), np.asarray(search_data))
return np.flatnonzero(np.in1d(a0v, b0v))
out = np.squeeze(pattern_index_view1D(l, m)[:,None] + np.arange(len(m)))
2020 Versions
In search of more easy/compact approaches, we could look into scikit-image's view_as_windows for getting sliding windows with a built-in. I am assuming arrays as inputs for less messy code. For lists as input, we have to use np.asarray() as shown earlier.
Approach #3 : Basically a derivative of pattern_index_broadcasting with view_as_windows for a one-liner with a as the larger data and b is the array to be searched -
from skimage.util import view_as_windows
np.flatnonzero((view_as_windows(a,len(b))==b).all(1))[:,None]+np.arange(len(b))
Approach #4 : For a small number of matches from b in a, we could optimize, by looking for first element match from b to reduce the dataset size for searches -
mask = a[:-len(b)+1]==b[0]
mask[mask] = (view_as_windows(a,len(b))[mask]).all(1)
out = np.flatnonzero(mask)[:,None]+np.arange(len(b))
Approach #5 : For a small sized b, we could simply run a loop for each of the elements in b and perform bitwise and-reduction -
mask = np.bitwise_and.reduce([a[i:len(a)-len(b)+1+i]==b[i] for i in range(len(b))])
out = np.flatnonzero(mask)[:,None]+np.arange(len(b))
Just making the point that #MSeifert's approach can, of course, also be implemented in numpy:
def pp(h,n):
nn = len(n)
NN = len(h)
c = (h[:NN-nn+1]==n[0]).nonzero()[0]
if c.size==0: return
for i,l in enumerate(n[1:].tolist(),1):
c = c[h[i:][c]==l]
if c.size==0: return
return np.arange(c[0],c[0]+nn)
def get_data(l1,l2):
d=defaultdict(list)
[d[item].append(index) for index,item in enumerate(l1)]
print(d)
Using defaultdict to store indices of elements from other list.

numpy how to find index of nearest value according to a thresold value of multi dimensional array? [duplicate]

How do I find the nearest value in a numpy array? Example:
np.find_nearest(array, value)
import numpy as np
def find_nearest(array, value):
array = np.asarray(array)
idx = (np.abs(array - value)).argmin()
return array[idx]
Example usage:
array = np.random.random(10)
print(array)
# [ 0.21069679 0.61290182 0.63425412 0.84635244 0.91599191 0.00213826
# 0.17104965 0.56874386 0.57319379 0.28719469]
print(find_nearest(array, value=0.5))
# 0.568743859261
IF your array is sorted and is very large, this is a much faster solution:
def find_nearest(array,value):
idx = np.searchsorted(array, value, side="left")
if idx > 0 and (idx == len(array) or math.fabs(value - array[idx-1]) < math.fabs(value - array[idx])):
return array[idx-1]
else:
return array[idx]
This scales to very large arrays. You can easily modify the above to sort in the method if you can't assume that the array is already sorted. It’s overkill for small arrays, but once they get large this is much faster.
With slight modification, the answer above works with arrays of arbitrary dimension (1d, 2d, 3d, ...):
def find_nearest(a, a0):
"Element in nd array `a` closest to the scalar value `a0`"
idx = np.abs(a - a0).argmin()
return a.flat[idx]
Or, written as a single line:
a.flat[np.abs(a - a0).argmin()]
Summary of answer: If one has a sorted array then the bisection code (given below) performs the fastest. ~100-1000 times faster for large arrays, and ~2-100 times faster for small arrays. It does not require numpy either.
If you have an unsorted array then if array is large, one should consider first using an O(n logn) sort and then bisection, and if array is small then method 2 seems the fastest.
First you should clarify what you mean by nearest value. Often one wants the interval in an abscissa, e.g. array=[0,0.7,2.1], value=1.95, answer would be idx=1. This is the case that I suspect you need (otherwise the following can be modified very easily with a followup conditional statement once you find the interval). I will note that the optimal way to perform this is with bisection (which I will provide first - note it does not require numpy at all and is faster than using numpy functions because they perform redundant operations). Then I will provide a timing comparison against the others presented here by other users.
Bisection:
def bisection(array,value):
'''Given an ``array`` , and given a ``value`` , returns an index j such that ``value`` is between array[j]
and array[j+1]. ``array`` must be monotonic increasing. j=-1 or j=len(array) is returned
to indicate that ``value`` is out of range below and above respectively.'''
n = len(array)
if (value < array[0]):
return -1
elif (value > array[n-1]):
return n
jl = 0# Initialize lower
ju = n-1# and upper limits.
while (ju-jl > 1):# If we are not yet done,
jm=(ju+jl) >> 1# compute a midpoint with a bitshift
if (value >= array[jm]):
jl=jm# and replace either the lower limit
else:
ju=jm# or the upper limit, as appropriate.
# Repeat until the test condition is satisfied.
if (value == array[0]):# edge cases at bottom
return 0
elif (value == array[n-1]):# and top
return n-1
else:
return jl
Now I'll define the code from the other answers, they each return an index:
import math
import numpy as np
def find_nearest1(array,value):
idx,val = min(enumerate(array), key=lambda x: abs(x[1]-value))
return idx
def find_nearest2(array, values):
indices = np.abs(np.subtract.outer(array, values)).argmin(0)
return indices
def find_nearest3(array, values):
values = np.atleast_1d(values)
indices = np.abs(np.int64(np.subtract.outer(array, values))).argmin(0)
out = array[indices]
return indices
def find_nearest4(array,value):
idx = (np.abs(array-value)).argmin()
return idx
def find_nearest5(array, value):
idx_sorted = np.argsort(array)
sorted_array = np.array(array[idx_sorted])
idx = np.searchsorted(sorted_array, value, side="left")
if idx >= len(array):
idx_nearest = idx_sorted[len(array)-1]
elif idx == 0:
idx_nearest = idx_sorted[0]
else:
if abs(value - sorted_array[idx-1]) < abs(value - sorted_array[idx]):
idx_nearest = idx_sorted[idx-1]
else:
idx_nearest = idx_sorted[idx]
return idx_nearest
def find_nearest6(array,value):
xi = np.argmin(np.abs(np.ceil(array[None].T - value)),axis=0)
return xi
Now I'll time the codes:
Note methods 1,2,4,5 don't correctly give the interval. Methods 1,2,4 round to nearest point in array (e.g. >=1.5 -> 2), and method 5 always rounds up (e.g. 1.45 -> 2). Only methods 3, and 6, and of course bisection give the interval properly.
array = np.arange(100000)
val = array[50000]+0.55
print( bisection(array,val))
%timeit bisection(array,val)
print( find_nearest1(array,val))
%timeit find_nearest1(array,val)
print( find_nearest2(array,val))
%timeit find_nearest2(array,val)
print( find_nearest3(array,val))
%timeit find_nearest3(array,val)
print( find_nearest4(array,val))
%timeit find_nearest4(array,val)
print( find_nearest5(array,val))
%timeit find_nearest5(array,val)
print( find_nearest6(array,val))
%timeit find_nearest6(array,val)
(50000, 50000)
100000 loops, best of 3: 4.4 µs per loop
50001
1 loop, best of 3: 180 ms per loop
50001
1000 loops, best of 3: 267 µs per loop
[50000]
1000 loops, best of 3: 390 µs per loop
50001
1000 loops, best of 3: 259 µs per loop
50001
1000 loops, best of 3: 1.21 ms per loop
[50000]
1000 loops, best of 3: 746 µs per loop
For a large array bisection gives 4us compared to next best 180us and longest 1.21ms (~100 - 1000 times faster). For smaller arrays it's ~2-100 times faster.
Here is a fast vectorized version of #Dimitri's solution if you have many values to search for (values can be multi-dimensional array):
# `values` should be sorted
def get_closest(array, values):
# make sure array is a numpy array
array = np.array(array)
# get insert positions
idxs = np.searchsorted(array, values, side="left")
# find indexes where previous index is closer
prev_idx_is_less = ((idxs == len(array))|(np.fabs(values - array[np.maximum(idxs-1, 0)]) < np.fabs(values - array[np.minimum(idxs, len(array)-1)])))
idxs[prev_idx_is_less] -= 1
return array[idxs]
Benchmarks
> 100 times faster than using a for loop with #Demitri's solution`
>>> %timeit ar=get_closest(np.linspace(1, 1000, 100), np.random.randint(0, 1050, (1000, 1000)))
139 ms ± 4.04 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
>>> %timeit ar=[find_nearest(np.linspace(1, 1000, 100), value) for value in np.random.randint(0, 1050, 1000*1000)]
took 21.4 seconds
Here's an extension to find the nearest vector in an array of vectors.
import numpy as np
def find_nearest_vector(array, value):
idx = np.array([np.linalg.norm(x+y) for (x,y) in array-value]).argmin()
return array[idx]
A = np.random.random((10,2))*100
""" A = array([[ 34.19762933, 43.14534123],
[ 48.79558706, 47.79243283],
[ 38.42774411, 84.87155478],
[ 63.64371943, 50.7722317 ],
[ 73.56362857, 27.87895698],
[ 96.67790593, 77.76150486],
[ 68.86202147, 21.38735169],
[ 5.21796467, 59.17051276],
[ 82.92389467, 99.90387851],
[ 6.76626539, 30.50661753]])"""
pt = [6, 30]
print find_nearest_vector(A,pt)
# array([ 6.76626539, 30.50661753])
If you don't want to use numpy this will do it:
def find_nearest(array, value):
n = [abs(i-value) for i in array]
idx = n.index(min(n))
return array[idx]
Here's a version that will handle a non-scalar "values" array:
import numpy as np
def find_nearest(array, values):
indices = np.abs(np.subtract.outer(array, values)).argmin(0)
return array[indices]
Or a version that returns a numeric type (e.g. int, float) if the input is scalar:
def find_nearest(array, values):
values = np.atleast_1d(values)
indices = np.abs(np.subtract.outer(array, values)).argmin(0)
out = array[indices]
return out if len(out) > 1 else out[0]
Here is a version with scipy for #Ari Onasafari, answer "to find the nearest vector in an array of vectors"
In [1]: from scipy import spatial
In [2]: import numpy as np
In [3]: A = np.random.random((10,2))*100
In [4]: A
Out[4]:
array([[ 68.83402637, 38.07632221],
[ 76.84704074, 24.9395109 ],
[ 16.26715795, 98.52763827],
[ 70.99411985, 67.31740151],
[ 71.72452181, 24.13516764],
[ 17.22707611, 20.65425362],
[ 43.85122458, 21.50624882],
[ 76.71987125, 44.95031274],
[ 63.77341073, 78.87417774],
[ 8.45828909, 30.18426696]])
In [5]: pt = [6, 30] # <-- the point to find
In [6]: A[spatial.KDTree(A).query(pt)[1]] # <-- the nearest point
Out[6]: array([ 8.45828909, 30.18426696])
#how it works!
In [7]: distance,index = spatial.KDTree(A).query(pt)
In [8]: distance # <-- The distances to the nearest neighbors
Out[8]: 2.4651855048258393
In [9]: index # <-- The locations of the neighbors
Out[9]: 9
#then
In [10]: A[index]
Out[10]: array([ 8.45828909, 30.18426696])
For large arrays, the (excellent) answer given by #Demitri is far faster than the answer currently marked as best. I've adapted his exact algorithm in the following two ways:
The function below works whether or not the input array is sorted.
The function below returns the index of the input array corresponding to the closest value, which is somewhat more general.
Note that the function below also handles a specific edge case that would lead to a bug in the original function written by #Demitri. Otherwise, my algorithm is identical to his.
def find_idx_nearest_val(array, value):
idx_sorted = np.argsort(array)
sorted_array = np.array(array[idx_sorted])
idx = np.searchsorted(sorted_array, value, side="left")
if idx >= len(array):
idx_nearest = idx_sorted[len(array)-1]
elif idx == 0:
idx_nearest = idx_sorted[0]
else:
if abs(value - sorted_array[idx-1]) < abs(value - sorted_array[idx]):
idx_nearest = idx_sorted[idx-1]
else:
idx_nearest = idx_sorted[idx]
return idx_nearest
All the answers are beneficial to gather the information to write efficient code. However, I have written a small Python script to optimize for various cases. It will be the best case if the provided array is sorted. If one searches the index of the nearest point of a specified value, then bisect module is the most time efficient. When one search the indices correspond to an array, the numpy searchsorted is most efficient.
import numpy as np
import bisect
xarr = np.random.rand(int(1e7))
srt_ind = xarr.argsort()
xar = xarr.copy()[srt_ind]
xlist = xar.tolist()
bisect.bisect_left(xlist, 0.3)
In [63]: %time bisect.bisect_left(xlist, 0.3)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 22.2 µs
np.searchsorted(xar, 0.3, side="left")
In [64]: %time np.searchsorted(xar, 0.3, side="left")
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 98.9 µs
randpts = np.random.rand(1000)
np.searchsorted(xar, randpts, side="left")
%time np.searchsorted(xar, randpts, side="left")
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.2 ms
If we follow the multiplicative rule, then numpy should take ~100 ms which implies ~83X faster.
I think the most pythonic way would be:
num = 65 # Input number
array = np.random.random((10))*100 # Given array
nearest_idx = np.where(abs(array-num)==abs(array-num).min())[0] # If you want the index of the element of array (array) nearest to the the given number (num)
nearest_val = array[abs(array-num)==abs(array-num).min()] # If you directly want the element of array (array) nearest to the given number (num)
This is the basic code. You can use it as a function if you want
This is a vectorized version of unutbu's answer:
def find_nearest(array, values):
array = np.asarray(array)
# the last dim must be 1 to broadcast in (array - values) below.
values = np.expand_dims(values, axis=-1)
indices = np.abs(array - values).argmin(axis=-1)
return array[indices]
image = plt.imread('example_3_band_image.jpg')
print(image.shape) # should be (nrows, ncols, 3)
quantiles = np.linspace(0, 255, num=2 ** 2, dtype=np.uint8)
quantiled_image = find_nearest(quantiles, image)
print(quantiled_image.shape) # should be (nrows, ncols, 3)
Maybe helpful for ndarrays:
def find_nearest(X, value):
return X[np.unravel_index(np.argmin(np.abs(X - value)), X.shape)]
For 2d array, to determine the i, j position of nearest element:
import numpy as np
def find_nearest(a, a0):
idx = (np.abs(a - a0)).argmin()
w = a.shape[1]
i = idx // w
j = idx - i * w
return a[i,j], i, j
Here is a version that works with 2D arrays, using scipy's cdist function if the user has it, and a simpler distance calculation if they don't.
By default, the output is the index that is closest to the value you input, but you can change that with the output keyword to be one of 'index', 'value', or 'both', where 'value' outputs array[index] and 'both' outputs index, array[index].
For very large arrays, you may need to use kind='euclidean', as the default scipy cdist function may run out of memory.
This is maybe not the absolute fastest solution, but it is quite close.
def find_nearest_2d(array, value, kind='cdist', output='index'):
# 'array' must be a 2D array
# 'value' must be a 1D array with 2 elements
# 'kind' defines what method to use to calculate the distances. Can choose one
# of 'cdist' (default) or 'euclidean'. Choose 'euclidean' for very large
# arrays. Otherwise, cdist is much faster.
# 'output' defines what the output should be. Can be 'index' (default) to return
# the index of the array that is closest to the value, 'value' to return the
# value that is closest, or 'both' to return index,value
import numpy as np
if kind == 'cdist':
try: from scipy.spatial.distance import cdist
except ImportError:
print("Warning (find_nearest_2d): Could not import cdist. Reverting to simpler distance calculation")
kind = 'euclidean'
index = np.where(array == value)[0] # Make sure the value isn't in the array
if index.size == 0:
if kind == 'cdist': index = np.argmin(cdist([value],array)[0])
elif kind == 'euclidean': index = np.argmin(np.sum((np.array(array)-np.array(value))**2.,axis=1))
else: raise ValueError("Keyword 'kind' must be one of 'cdist' or 'euclidean'")
if output == 'index': return index
elif output == 'value': return array[index]
elif output == 'both': return index,array[index]
else: raise ValueError("Keyword 'output' must be one of 'index', 'value', or 'both'")
For those searching for multiple nearest, modifying the accepted answer:
import numpy as np
def find_nearest(array, value, k):
array = np.asarray(array)
idx = np.argsort(abs(array - value))[:k]
return array[idx]
See:
https://stackoverflow.com/a/66937734/11671779
import numpy as np
def find_nearest(array, value):
array = np.array(array)
z=np.abs(array-value)
y= np.where(z == z.min())
m=np.array(y)
x=m[0,0]
y=m[1,0]
near_value=array[x,y]
return near_value
array =np.array([[60,200,30],[3,30,50],[20,1,-50],[20,-500,11]])
print(array)
value = 0
print(find_nearest(array, value))
This one handles any number of queries, using numpy searchsorted, so after sorting the input arrays, is just as fast.
It works on regular grids in 2d, 3d ... too:
#!/usr/bin/env python3
# keywords: nearest-neighbor regular-grid python numpy searchsorted Voronoi
import numpy as np
#...............................................................................
class Near_rgrid( object ):
""" nearest neighbors on a Manhattan aka regular grid
1d:
near = Near_rgrid( x: sorted 1d array )
nearix = near.query( q: 1d ) -> indices of the points x_i nearest each q_i
x[nearix[0]] is the nearest to q[0]
x[nearix[1]] is the nearest to q[1] ...
nearpoints = x[nearix] is near q
If A is an array of e.g. colors at x[0] x[1] ...,
A[nearix] are the values near q[0] q[1] ...
Query points < x[0] snap to x[0], similarly > x[-1].
2d: on a Manhattan aka regular grid,
streets running east-west at y_i, avenues north-south at x_j,
near = Near_rgrid( y, x: sorted 1d arrays, e.g. latitide longitude )
I, J = near.query( q: nq × 2 array, columns qy qx )
-> nq × 2 indices of the gridpoints y_i x_j nearest each query point
gridpoints = np.column_stack(( y[I], x[J] )) # e.g. street corners
diff = gridpoints - querypoints
distances = norm( diff, axis=1, ord= )
Values at an array A definded at the gridpoints y_i x_j nearest q: A[I,J]
3d: Near_rgrid( z, y, x: 1d axis arrays ) .query( q: nq × 3 array )
See Howitworks below, and the plot Voronoi-random-regular-grid.
"""
def __init__( self, *axes: "1d arrays" ):
axarrays = []
for ax in axes:
axarray = np.asarray( ax ).squeeze()
assert axarray.ndim == 1, "each axis should be 1d, not %s " % (
str( axarray.shape ))
axarrays += [axarray]
self.midpoints = [_midpoints( ax ) for ax in axarrays]
self.axes = axarrays
self.ndim = len(axes)
def query( self, queries: "nq × dim points" ) -> "nq × dim indices":
""" -> the indices of the nearest points in the grid """
queries = np.asarray( queries ).squeeze() # or list x y z ?
if self.ndim == 1:
assert queries.ndim <= 1, queries.shape
return np.searchsorted( self.midpoints[0], queries ) # scalar, 0d ?
queries = np.atleast_2d( queries )
assert queries.shape[1] == self.ndim, [
queries.shape, self.ndim]
return [np.searchsorted( mid, q ) # parallel: k axes, k processors
for mid, q in zip( self.midpoints, queries.T )]
def snaptogrid( self, queries: "nq × dim points" ):
""" -> the nearest points in the grid, 2d [[y_j x_i] ...] """
ix = self.query( queries )
if self.ndim == 1:
return self.axes[0][ix]
else:
axix = [ax[j] for ax, j in zip( self.axes, ix )]
return np.array( axix )
def _midpoints( points: "array-like 1d, *must be sorted*" ) -> "1d":
points = np.asarray( points ).squeeze()
assert points.ndim == 1, points.shape
diffs = np.diff( points )
assert np.nanmin( diffs ) > 0, "the input array must be sorted, not %s " % (
points.round( 2 ))
return (points[:-1] + points[1:]) / 2 # floats
#...............................................................................
Howitworks = \
"""
How Near_rgrid works in 1d:
Consider the midpoints halfway between fenceposts | | |
The interval [left midpoint .. | .. right midpoint] is what's nearest each post --
| | | | points
| . | . | . | midpoints
^^^^^^ . nearest points[1]
^^^^^^^^^^^^^^^ nearest points[2] etc.
2d:
I, J = Near_rgrid( y, x ).query( q )
I = nearest in `x`
J = nearest in `y` independently / in parallel.
The points nearest [yi xj] in a regular grid (its Voronoi cell)
form a rectangle [left mid x .. right mid x] × [left mid y .. right mid y]
(in any norm ?)
See the plot Voronoi-random-regular-grid.
Notes
-----
If a query point is exactly halfway between two data points,
e.g. on a grid of ints, the lines (x + 1/2) U (y + 1/2),
which "nearest" you get is implementation-dependent, unpredictable.
"""
Murky = \
""" NaNs in points, in queries ?
"""
__version__ = "2021-10-25 oct denis-bz-py"

Categories