I have an image mask stored as a 2D numpy array where the values indicate the presence of objects that have been segmented in the image (0 = no object, 1..n = object 1 through n). I want to get a single coordinate for each object representing the center of the object. It doesn't have to be a perfectly accurate centroid or center of gravity. I'm just taking the mean of the x and y indices of all cells in the array that contain each object. I'm wondering if there's a faster way to do this than my current method:
for obj in np.unique(mask):
    if obj == 0:
        continue
    x, y = np.mean(np.where(mask == obj), axis=1)
Here is a reproducible example:
import numpy as np
mask = np.array([
    [0,0,0,0,0,2,0,0,0,0],
    [0,1,1,0,2,2,2,0,0,0],
    [0,0,1,0,2,2,2,0,0,0],
    [0,0,0,0,0,0,0,0,0,0],
    [0,3,3,3,0,0,4,0,0,0],
    [0,0,0,0,0,4,4,4,0,0],
    [0,0,0,0,0,0,4,0,0,0],
])
points = []
for obj in np.unique(mask):
    if obj == 0:
        continue
    points.append(np.mean(np.where(mask == obj), axis=1))

print(points)
This outputs:
[array([1.33333333, 1.66666667]),
array([1.28571429, 5. ]),
array([4., 2.]),
array([5., 6.])]
I came up with another way to do it that seems to be about 3x faster:
import numpy as np
mask = np.array([
    [0,0,0,0,0,2,0,0,0,0],
    [0,1,1,0,2,2,2,0,0,0],
    [0,0,1,0,2,2,2,0,0,0],
    [0,0,0,0,0,0,0,0,0,0],
    [0,3,3,3,0,0,4,0,0,0],
    [0,0,0,0,0,4,4,4,0,0],
    [0,0,0,0,0,0,4,0,0,0],
])
flat = mask.flatten()
split = np.unique(np.sort(flat), return_index=True)[1]
points = []
for inds in np.split(flat.argsort(), split)[2:]:
    points.append(np.array(np.unravel_index(inds, mask.shape)).mean(axis=1))

print(points)
I wonder if the for loop can be replaced with a numpy operation which would likely be even faster.
You can copy this answer (and give it an upvote too if it works for you) and use sparse matrices instead of np arrays. However, this only proves to be quicker for large arrays, with speed boosts that increase the larger your array is:
import numpy as np, time
from scipy.sparse import csr_matrix

def compute_M(data):
    cols = np.arange(data.size)
    return csr_matrix((cols, (np.ravel(data), cols)),
                      shape=(data.max() + 1, data.size))

def get_indices_sparse(data, M):
    # M = compute_M(data)
    return [np.mean(np.unravel_index(row.data, data.shape), 1)
            for R, row in enumerate(M) if R > 0]

def gen_random_mask(C, n, m):
    mask = np.zeros([n, m], int)
    for i in range(C):
        x = np.random.randint(n)
        y = np.random.randint(m)
        mask[x:x+np.random.randint(n-x), y:y+np.random.randint(m-y)] = i
    return mask
N = 100
C = 4
for S in [10, 100, 1000, 10000]:
    mask = gen_random_mask(C, S, S)
    print('Time for size {:d}x{:d}:'.format(S, S))

    s = time.time()
    for _ in range(N):
        points = []
        for obj in np.unique(mask):
            if obj == 0:
                continue
            points.append(np.mean(np.where(mask == obj), axis=1))
        points_np = np.array(points)
    print('NP: {:f}'.format((time.time() - s)/N))

    mask_s = compute_M(mask)
    s = time.time()
    for _ in range(N):
        points = get_indices_sparse(mask, mask_s)
    print('Sparse: {:f}'.format((time.time() - s)/N))

    np.testing.assert_equal(points, points_np)
Which results in the timings of:
Time for size 10x10:
NP: 0.000066
Sparse: 0.000226
Time for size 100x100:
NP: 0.000207
Sparse: 0.000253
Time for size 1000x1000:
NP: 0.018662
Sparse: 0.004472
Time for size 10000x10000:
NP: 2.545973
Sparse: 0.501061
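For clarity, here is a small usage sketch of my own (not part of the original benchmark) showing what compute_M stores when applied to the example mask from the question: row k of M holds, as its data, the flat indices of every cell labelled k, so each object's centroid drops out of one unravel_index plus a mean.

import numpy as np

mask = np.array([
    [0,0,0,0,0,2,0,0,0,0],
    [0,1,1,0,2,2,2,0,0,0],
    [0,0,1,0,2,2,2,0,0,0],
    [0,0,0,0,0,0,0,0,0,0],
    [0,3,3,3,0,0,4,0,0,0],
    [0,0,0,0,0,4,4,4,0,0],
    [0,0,0,0,0,0,4,0,0,0],
])

M = compute_M(mask)          # compute_M as defined above
# row k of M stores the flat indices of all cells whose label is k
points = [np.mean(np.unravel_index(row.data, mask.shape), axis=1)
          for k, row in enumerate(M) if k > 0]
print(points)                # same four centroids as the loop in the question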
The problem likely comes from np.where(mask == obj), which scans the whole mask array again and again. This becomes a problem when there are many objects. You can solve it efficiently with a group-by strategy; however, NumPy does not yet provide such an operation. You can implement one using a sort followed by a split, but a sort is generally not efficient. An alternative is to ask np.unique to return the inverse indices, so that you can then accumulate values per object (like a reduce-by-key where the reduction operator is addition and the keys are the object labels). The mean is obtained with a simple division at the end.
objects, inverts, counts = np.unique(mask, return_counts=True, return_inverse=True)
# Reduction by object
x = np.full(len(objects), 0.0)
y = np.full(len(objects), 0.0)
xPos = np.repeat(np.arange(mask.shape[0]), mask.shape[1])
yPos = np.tile(np.arange(mask.shape[1]), reps=mask.shape[0])
np.add.at(x, inverts, xPos)
np.add.at(y, inverts, yPos)
# Compute the final mean from the sum
x /= counts
y /= counts
# Discard the first item (when obj == 0)
x = x[1:]
y = y[1:]
If you need something faster, you could use Numba and perform the reduction manually (and possibly in parallel).
EDIT: if you really need a list as output, you can use points = list(np.stack([x, y]).T), but using lists instead of NumPy arrays is rather slow (and not memory efficient either).
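For reference, here is a minimal sketch (my own, not part of the original answer) of the manual reduction mentioned above, using Numba; it assumes the labels are the consecutive integers 0..n, as in the question.

import numpy as np
import numba as nb

@nb.njit
def centroids_numba(mask, n_labels):
    # single pass over the mask, accumulating row/column sums and counts per label
    sum_x = np.zeros(n_labels)
    sum_y = np.zeros(n_labels)
    count = np.zeros(n_labels)
    for i in range(mask.shape[0]):
        for j in range(mask.shape[1]):
            k = mask[i, j]
            sum_x[k] += i
            sum_y[k] += j
            count[k] += 1
    return sum_x / count, sum_y / count   # index 0 is the background

# usage: x_means, y_means = centroids_numba(mask, mask.max() + 1)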
Because the mask values number the segments, they can be used directly as indices into numpy arrays. Combined with Cython, this can be used to achieve a strong speed-up.
In Jupyter start with loading Cython:
%load_ext Cython
then use the Cython cell magic and a single pass over the whole array to calculate the means:
%%cython -a
import cython
import numpy as np
cimport numpy as np

@cython.boundscheck(False)  # turn off bounds-checking for entire function
@cython.wraparound(False)   # turn off negative index wrapping for entire function
def calc_xy_mean4(int[:,:] mask, int number_of_maskvalues):
    cdef int[:] sum_x = np.zeros(number_of_maskvalues, dtype='int')
    cdef int[:] sum_y = np.zeros(number_of_maskvalues, dtype='int')
    n = np.zeros(number_of_maskvalues, dtype='int')
    cdef int[:] n_mv = n
    mean_x = np.zeros(number_of_maskvalues, dtype='float')
    mean_y = np.zeros(number_of_maskvalues, dtype='float')
    cdef double[:] mean_x_mv = mean_x
    cdef double[:] mean_y_mv = mean_y

    cdef int x_max = mask.shape[0]
    cdef int y_max = mask.shape[1]
    cdef int segment_index
    cdef int x
    cdef int y

    for x in range(x_max):
        for y in range(y_max):
            segment_index = mask[x, y]
            n_mv[segment_index] += 1
            sum_x[segment_index] += x
            sum_y[segment_index] += y

    for segment_index in range(number_of_maskvalues):
        mean_x_mv[segment_index] = sum_x[segment_index]/n[segment_index]
        mean_y_mv[segment_index] = sum_y[segment_index]/n[segment_index]

    return mean_x, mean_y, n
and call it with the %timeit magic:
mask = np.array([
    [0,0,0,0,0,2,0,0,0,0],
    [0,1,1,0,2,2,2,0,0,0],
    [0,0,1,0,2,2,2,0,0,0],
    [0,0,0,0,0,0,0,0,0,0],
    [0,3,3,3,0,0,4,0,0,0],
    [0,0,0,0,0,4,4,4,0,0],
    [0,0,0,0,0,0,4,0,0,0],
])
%timeit calc_xy_mean4(mask, 5)
On my machine, this Cython solution is 9 times faster than the original code:
6.32 µs ± 18.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
and if we run the same instruction without the timeit magic:
calc_xy_mean4(mask, 5)
we obtain as output:
(array([3.07692308, 1.33333333, 1.28571429, 4. , 5. ]),
array([4.59615385, 1.66666667, 5. , 2. , 6. ]),
array([52, 3, 7, 3, 5]))
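The same single-pass accumulation can also be written in pure NumPy with np.bincount. This is a sketch of my own (not part of the answer above) that assumes the labels are consecutive integers starting at 0, as in the question; on the example mask it reproduces the four centroids from the question.

import numpy as np

def centroids_bincount(mask):
    # accumulate row/column sums and cell counts per label in vectorized form
    labels = mask.ravel()
    rows = np.repeat(np.arange(mask.shape[0]), mask.shape[1])
    cols = np.tile(np.arange(mask.shape[1]), mask.shape[0])
    counts = np.bincount(labels)
    mean_rows = np.bincount(labels, weights=rows) / counts
    mean_cols = np.bincount(labels, weights=cols) / counts
    return mean_rows[1:], mean_cols[1:]   # drop the background label 0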
I have two sets of 2000 3D vectors each, and I need to compute the cross product between each possible pair. I currently do it like this
for tx in tangents_x:
    for ty in tangents_y:
        cross = np.cross(tx, ty)
        (... do something with the cross variable...)
This works, but it's pretty slow. Is there a way to make it faster?
If I was interested in the element-wise product, I could just do the following
# Define initial vectors
tx = np.array([np.random.randn(3) for i in range(2000)])
ty = np.array([np.random.randn(3) for i in range(2000)])
# Store them into matrices
X = np.array([tx for i in range(2000)])
Y = np.array([ty for i in range(2000)]).T
# Compute the element-wise product
ew = X * Y
# Use the element-wise product as usual
for i, tx in enumerate(tangents_x):
    for j, ty in enumerate(tangents_y):
        (... use the element-wise product of tx and ty as ew[i,j])
How can I apply this to the cross product instead of the element-wise one? Or, do you see another alternative?
Thanks much :)
Like many numpy functions, cross supports broadcasting, so you can simply do:
np.cross(tangents_x[:, None, :], tangents_y)
or - more verbose but maybe easier to read
np.cross(tangents_x[:, None, :], tangents_y[None, :, :])
This reshapes tangents_x and tangents_y to shapes (2000, 1, 3) and (1, 2000, 3). By the rules of broadcasting, these are interpreted as two arrays of shape (2000, 2000, 3), where tangents_x is repeated along axis 1 and tangents_y is repeated along axis 0.
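A quick sanity check of my own (not part of the original answer) that each entry of the broadcast result matches the pairwise np.cross from the loop:

import numpy as np

tangents_x = np.random.randn(4, 3)
tangents_y = np.random.randn(5, 3)

crosses = np.cross(tangents_x[:, None, :], tangents_y[None, :, :])  # shape (4, 5, 3)
assert np.allclose(crosses[2, 3], np.cross(tangents_x[2], tangents_y[3]))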
Just write it out and compile it
import numpy as np
import numba as nb

@nb.njit(fastmath=True, parallel=True)
def calc_cros(vec_1, vec_2):
    res = np.empty((vec_1.shape[0], vec_2.shape[0], 3), dtype=vec_1.dtype)
    for i in nb.prange(vec_1.shape[0]):
        for j in range(vec_2.shape[0]):
            res[i,j,0] = vec_1[i,1] * vec_2[j,2] - vec_1[i,2] * vec_2[j,1]
            res[i,j,1] = vec_1[i,2] * vec_2[j,0] - vec_1[i,0] * vec_2[j,2]
            res[i,j,2] = vec_1[i,0] * vec_2[j,1] - vec_1[i,1] * vec_2[j,0]
    return res
Performance
import time

# create data
tx = np.random.rand(3000, 3)
ty = np.random.rand(3000, 3)

# don't measure compilation overhead
comb = calc_cros(tx, ty)

t1 = time.time()
comb = calc_cros(tx, ty)
print(time.time() - t1)
This gives 0.08s for the two (3000,3) matrices.
np.dot is almost always going to be faster, so you can convert one of the vectors into its skew-symmetric matrix, since a × b = skew(a) · b:
def skew(x):
    return np.array([[0, -x[2], x[1]],
                     [x[2], 0, -x[0]],
                     [-x[1], x[0], 0]])
On my machine this runs faster:
import time
import numpy as np

tx = np.array([np.random.randn(3) for i in range(100)])
ty = np.array([np.random.randn(3) for i in range(100)])

tt = time.perf_counter()
for x in tx:
    for y in ty:
        cross = np.cross(x, y)
print(time.perf_counter() - tt)
0.207 sec
tt = time.perf_counter()
for x in tx:
    m = skew(x)
    for y in ty:
        cross = np.dot(m, y)
print(time.perf_counter() - tt)
0.015 sec
This result may vary depending on the computer.
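Note that with the skew form the inner loop can also be dropped, since skew(x) applied to all of ty at once gives the cross products of x with every row of ty. This is an observation of mine, not part of the original answer (it assumes the skew function from above):

import numpy as np

tx = np.array([np.random.randn(3) for i in range(100)])
ty = np.array([np.random.randn(3) for i in range(100)])

for x in tx:
    crosses_with_all_y = np.dot(skew(x), ty.T).T  # row j is x × ty[j]
    assert np.allclose(crosses_with_all_y, np.cross(x, ty))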
You could use np.meshgrid() to build the combination matrix and then decompose the cross product. The rest is fiddling around with the axes, etc.:
import numpy as np

# build two lists of 5 3D vectors as example values:
a_list = np.random.randint(0, 10, (5, 3))
b_list = np.random.randint(0, 10, (5, 3))

# here the original approach using slow list comprehensions:
slow = np.array([[np.cross(a, b) for a in a_list] for b in b_list])

# now the faster proposed version:
g = np.array([np.meshgrid(a_list[:, i], b_list[:, i]) for i in range(3)])
fast = np.array([g[1, 0] * g[2, 1] - g[2, 0] * g[1, 1],
                 g[2, 0] * g[0, 1] - g[0, 0] * g[2, 1],
                 g[0, 0] * g[1, 1] - g[1, 0] * g[0, 1]]).transpose(1, 2, 0)
I tested this with two lists of 10000 vectors each (instead of the 5 in the example above) and the fast version took 6.4 seconds. The slow version already took 27 seconds for 500 vectors.
For your 2000×2000 pairs, the fast version takes 0.23 s on my computer. Fast enough for you?
Use a Cartesian product to get all possible pairs:
import itertools as it
all_pairs = it.product(tx, ty)
And then use map to loop over all pairs and compute the cross product:
map(lambda x: np.cross(x[0], x[1]), all_pairs)
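A small usage sketch of my own: in Python 3 the map object is lazy, so you need to iterate over it (or wrap it in list()) before the cross products are actually computed.

import itertools as it
import numpy as np

tx = np.random.randn(4, 3)
ty = np.random.randn(5, 3)

all_pairs = it.product(tx, ty)
crosses = list(map(lambda p: np.cross(p[0], p[1]), all_pairs))
# crosses is a flat list of 4*5 = 20 cross products, in row-major pair order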
I'm doing statistical science and have this code snippet, which consumes about 80% of my computing time. As the program will run for weeks, I want to make it as fast as possible. agg1 and agg2 are numpy arrays with 4 rows and between 20 and 400 columns:
for i, j in itertools.product(xrange(agg1.shape[1]), xrange(agg2.shape[1])):
    iterator.append((i, j))
    particle_distances.append(agg1[0:2, i] - agg2[0:2, j])
Does it pay off to, e.g., filter the uninteresting last entries out of my numpy arrays beforehand? Should I use agg1.shape[1] directly, or is it better to assign it to a variable first? The function that contains this code is called 4500 times. Any other, faster approach that gives me the differences of every element pair together with the corresponding indices is also welcome.
Here is an example text file you can use; import it with numpy.loadtxt.
Thank you for your help
This computation can be concisely vectorized:
a = agg1[0:2, :].T
b = agg2[0:2, :].T
particle_distances = (a[:, None, :] - b[None, :, :]).reshape(-1, 2)
To retrieve the mapping of the indices you can call
idx1, idx2 = np.unravel_index(np.arange(agg1.shape[1] * agg2.shape[1]),
(agg1.shape[1], agg2.shape[1]))
This gives two arrays that hold, for each row of particle_distances, the corresponding column indices of agg1 and agg2.
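In other words (a small check of my own, with made-up shapes), row k of particle_distances is the difference for the pair of columns (idx1[k], idx2[k]):

import numpy as np

agg1 = np.random.rand(4, 20)
agg2 = np.random.rand(4, 30)

a = agg1[0:2, :].T
b = agg2[0:2, :].T
particle_distances = (a[:, None, :] - b[None, :, :]).reshape(-1, 2)

idx1, idx2 = np.unravel_index(np.arange(agg1.shape[1] * agg2.shape[1]),
                              (agg1.shape[1], agg2.shape[1]))
k = 7
assert np.allclose(particle_distances[k], agg1[0:2, idx1[k]] - agg2[0:2, idx2[k]])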
Let's compare performance:
import numpy as np
import itertools
from time_stats import compare_calls

agg1 = np.random.rand(100, 10)
agg2 = np.random.rand(100, 15)

def original(agg1, agg2):
    particle_distances = []
    for i, j in itertools.product(range(agg1.shape[1]), range(agg2.shape[1])):
        particle_distances.append(agg1[0:2, i] - agg2[0:2, j])
    return particle_distances

def prealloc(agg1, agg2):
    n = agg1.shape[1] * agg2.shape[1]
    particle_distances = np.empty((n, 2))
    for k, (i, j) in enumerate(itertools.product(range(agg1.shape[1]), range(agg2.shape[1]))):
        particle_distances[k, :] = agg1[0:2, i] - agg2[0:2, j]
    return particle_distances

def vectorized(agg1, agg2):
    a = agg1[0:2, :].T
    b = agg2[0:2, :].T
    particle_distances = (a[:, None, :] - b[None, :, :]).reshape(-1, 2)
    return particle_distances

r = compare_calls(['original(agg1, agg2)', 'prealloc(agg1, agg2)', 'vectorized(agg1, agg2)'],
                  globals=globals())
r.print()
r.hist()

# original(agg1, agg2)   : 0.00038 s/call median, 0.00034 ... 0.00047 IQR
# prealloc(agg1, agg2)   : 0.00047 s/call median, 0.00041 ... 0.00068 IQR
# vectorized(agg1, agg2) : 6e-06   s/call median, 5.8e-06 ... 6.7e-06 IQR
I would guess it's slow because you're growing your lists inside a loop, so Python often has to move them around in memory as they grow. One option is to preallocate an array. If you use a multidimensional numpy array, you can do away with iterator entirely, since it just becomes the indices:
import numpy as np
import itertools

# the differences agg1[0:2, i] - agg2[0:2, j] have length 2, so the last axis is 2
particle_distances = np.zeros((agg1.shape[1], agg2.shape[1], 2))
for i, j in itertools.product(range(agg1.shape[1]), range(agg2.shape[1])):
    particle_distances[i, j, :] = agg1[0:2, i] - agg2[0:2, j]
However you can probably get more speedup and simplify your code by dropping the loop for a vectorized solution:
particle_distances = (np.transpose(np.expand_dims(agg1[0:2, :], axis=2), (1, 2, 0))
                      - np.transpose(np.expand_dims(agg2[0:2, :], axis=2), (2, 1, 0)))
Here I've used np.transpose to change the shape of the matrices such that automatic broadcasting will do the job that itertools.product was doing for you. np.expand_dims is just to add a third dimension to each so that we can reshape them appropriately.
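A quick consistency check of my own (with made-up array sizes) that the broadcasted expression matches the explicit loop:

import numpy as np
import itertools

agg1 = np.random.rand(4, 20)
agg2 = np.random.rand(4, 30)

loop = np.zeros((agg1.shape[1], agg2.shape[1], 2))
for i, j in itertools.product(range(agg1.shape[1]), range(agg2.shape[1])):
    loop[i, j, :] = agg1[0:2, i] - agg2[0:2, j]

vec = (np.transpose(np.expand_dims(agg1[0:2, :], axis=2), (1, 2, 0))
       - np.transpose(np.expand_dims(agg2[0:2, :], axis=2), (2, 1, 0)))
assert np.allclose(loop, vec)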
It's a classic question, but I believe many people are still searching for answers.
This question is different from that one, since mine is about an operation between two sparse vectors (not matrices).
I wrote a blog post about how the cosine from SciPy Spatial Distance (SSD) gets slower as the dimension of the data grows, because it works on dense vectors. The post is in Indonesian, but the code, my experiment settings and the results should be easy to follow regardless of the language (they are at the bottom of the post).
Currently this solution is more than 70 times faster than SSD for high-dimensional data, and more memory efficient:
import numpy as np

def fCosine(u, v):  # u, v CSR vectors, Cosine Dissimilarity
    uData = u.data; vData = v.data
    denominator = np.sqrt(np.sum(uData**2)) * np.sqrt(np.sum(vData**2))
    if denominator > 0:
        uCol = u.indices; vCol = v.indices  # np arrays
        intersection = set(np.intersect1d(uCol, vCol))
        uI = np.array([u1 for i, u1 in enumerate(uData) if uCol[i] in intersection])
        vI = np.array([v2 for j, v2 in enumerate(vData) if vCol[j] in intersection])
        return 1 - np.dot(uI, vI)/denominator
    else:
        return float("inf")
Is it possible to improve that function even further (with more Pythonic code, or via JIT/Cython)?
Here is an alternative, alt_fCosine, which (on my machine) is about 3x faster for CSR vectors of size 10**5 with 10**4 non-zero elements:
import scipy.sparse as sparse
import numpy as np
import math

def fCosine(u, v):  # u, v CSR vectors, Cosine Dissimilarity
    uData = u.data; vData = v.data
    denominator = np.sqrt(np.sum(uData**2)) * np.sqrt(np.sum(vData**2))
    if denominator > 0:
        uCol = u.indices; vCol = v.indices  # np arrays
        intersection = set(np.intersect1d(uCol, vCol))
        uI = np.array([u1 for i, u1 in enumerate(uData) if uCol[i] in intersection])
        vI = np.array([v2 for j, v2 in enumerate(vData) if vCol[j] in intersection])
        return 1 - np.dot(uI, vI)/denominator
    else:
        return float("inf")

def alt_fCosine(u, v):
    uData, vData = u.data, v.data
    denominator = math.sqrt(np.sum(uData**2) * np.sum(vData**2))
    if denominator > 0:
        uCol, vCol = u.indices, v.indices
        uI = uData[np.in1d(uCol, vCol)]
        vI = vData[np.in1d(vCol, uCol)]
        return 1 - np.dot(uI, vI)/denominator
    else:
        return float("inf")

# Check that they return the same result
N = 10**5
u = np.round(10*sparse.random(1, N, density=0.1, format='csr'))
v = np.round(10*sparse.random(1, N, density=0.1, format='csr'))
assert np.allclose(fCosine(u, v), alt_fCosine(u, v))
alt_fCosine replaces two list comprehensions, a call to np.intersect1d and the formation of a Python set with two calls to np.in1d plus advanced indexing.
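The two boolean masks line up because the indices of a canonical CSR row are stored in sorted order, so uI and vI both come out in ascending column order (call sort_indices() first if you are not sure your matrices are canonical). A tiny illustration of my own:

import numpy as np

uCol = np.array([1, 4, 7, 9]);  uData = np.array([10., 11., 12., 13.])
vCol = np.array([0, 4, 9]);     vData = np.array([20., 21., 22.])

uI = uData[np.in1d(uCol, vCol)]  # values of u at the shared columns {4, 9} -> [11., 13.]
vI = vData[np.in1d(vCol, uCol)]  # values of v at the shared columns {4, 9} -> [21., 22.]
print(np.dot(uI, vI))            # 11*21 + 13*22 = 517.0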
For N = 10**5:
In [322]: %timeit fCosine(u, v)
100 loops, best of 3: 5.73 ms per loop
In [323]: %timeit alt_fCosine(u, v)
1000 loops, best of 3: 1.62 ms per loop
In [324]: 5.73/1.62
Out[324]: 3.537037037037037