Using Cython to speed up connected components algorithm - python
First off, I am using python[2.7.2], numpy[1.6.2rc1], cython[0.16], gcc[MinGW] compiler, on a windows xp machine.
I needed a 3D connected components algorithm to process some 3D binary data (i.e. 1s and 0s) stored in numpy arrays. Unfortunately, I could not find any existing code so I adapted the code found here to work with 3D arrays. Everything works great, however speed is desirable for processing huge data sets. As a result I stumbled upon cython and decided to give it a try.
So far cython has improved the speed:
Cython: 0.339 s
Python: 0.635 s
Using cProfile, my time consuming line in the pure python version is:
new_region = min(filter(lambda i: i > 0, array_region[xMin:xMax,yMin:yMax,zMin:zMax].ravel()))
The Question: What is the correct way to "cythonize" the lines:
new_region = min(filter(lambda i: i > 0, array_region[xMin:xMax,yMin:yMax,zMin:zMax].ravel()))
for x,y,z in zip(ind[0],ind[1],ind[2]):
Any help would be appreciated and hopefully this work will help others.
Pure python version [*.py]:
import numpy as np
def find_regions_3D(Array):
x_dim=np.size(Array,0)
y_dim=np.size(Array,1)
z_dim=np.size(Array,2)
regions = {}
array_region = np.zeros((x_dim,y_dim,z_dim),)
equivalences = {}
n_regions = 0
#first pass. find regions.
ind=np.where(Array==1)
for x,y,z in zip(ind[0],ind[1],ind[2]):
# get the region number from all surrounding cells including diagnols (27) or create new region
xMin=max(x-1,0)
xMax=min(x+1,x_dim-1)
yMin=max(y-1,0)
yMax=min(y+1,y_dim-1)
zMin=max(z-1,0)
zMax=min(z+1,z_dim-1)
max_region=array_region[xMin:xMax+1,yMin:yMax+1,zMin:zMax+1].max()
if max_region > 0:
#a neighbour already has a region, new region is the smallest > 0
new_region = min(filter(lambda i: i > 0, array_region[xMin:xMax+1,yMin:yMax+1,zMin:zMax+1].ravel()))
#update equivalences
if max_region > new_region:
if max_region in equivalences:
equivalences[max_region].add(new_region)
else:
equivalences[max_region] = set((new_region, ))
else:
n_regions += 1
new_region = n_regions
array_region[x,y,z] = new_region
#Scan Array again, assigning all equivalent regions the same region value.
for x,y,z in zip(ind[0],ind[1],ind[2]):
r = array_region[x,y,z]
while r in equivalences:
r= min(equivalences[r])
array_region[x,y,z]=r
#return list(regions.itervalues())
return array_region
Pure python speedups:
#Original line:
new_region = min(filter(lambda i: i > 0, array_region[xMin:xMax+1,yMin:yMax+1,zMin:zMax+1].ravel()))
#ver A:
new_region = array_region[xMin:xMax+1,yMin:yMax+1,zMin:zMax+1]
min(new_region[new_region>0])
#ver B:
new_region = min( i for i in array_region[xMin:xMax,yMin:yMax,zMin:zMax].ravel() if i>0)
#ver C:
sub=array_region[xMin:xMax,yMin:yMax,zMin:zMax]
nlist=np.where(sub>0)
minList=[]
for x,y,z in zip(nlist[0],nlist[1],nlist[2]):
minList.append(sub[x,y,z])
new_region=min(minList)
Time results:
O: 0.0220445
A: 0.0002161
B: 0.0173195
C: 0.0002560
Cython version [*.pyx]:
import numpy as np
cimport numpy as np
DTYPE = np.int
ctypedef np.int_t DTYPE_t
cdef inline int int_max(int a, int b): return a if a >= b else b
cdef inline int int_min(int a, int b): return a if a <= b else b
def find_regions_3D(np.ndarray Array not None):
cdef int x_dim=np.size(Array,0)
cdef int y_dim=np.size(Array,1)
cdef int z_dim=np.size(Array,2)
regions = {}
cdef np.ndarray array_region = np.zeros((x_dim,y_dim,z_dim),dtype=DTYPE)
equivalences = {}
cdef int n_regions = 0
#first pass. find regions.
ind=np.where(Array==1)
cdef int xMin, xMax, yMin, yMax, zMin, zMax, max_region, new_region, x, y, z
for x,y,z in zip(ind[0],ind[1],ind[2]):
# get the region number from all surrounding cells including diagnols (27) or create new region
xMin=int_max(x-1,0)
xMax=int_min(x+1,x_dim-1)+1
yMin=int_max(y-1,0)
yMax=int_min(y+1,y_dim-1)+1
zMin=int_max(z-1,0)
zMax=int_min(z+1,z_dim-1)+1
max_region=array_region[xMin:xMax,yMin:yMax,zMin:zMax].max()
if max_region > 0:
#a neighbour already has a region, new region is the smallest > 0
new_region = min(filter(lambda i: i > 0, array_region[xMin:xMax,yMin:yMax,zMin:zMax].ravel()))
#update equivalences
if max_region > new_region:
if max_region in equivalences:
equivalences[max_region].add(new_region)
else:
equivalences[max_region] = set((new_region, ))
else:
n_regions += 1
new_region = n_regions
array_region[x,y,z] = new_region
#Scan Array again, assigning all equivalent regions the same region value.
cdef int r
for x,y,z in zip(ind[0],ind[1],ind[2]):
r = array_region[x,y,z]
while r in equivalences:
r= min(equivalences[r])
array_region[x,y,z]=r
#return list(regions.itervalues())
return array_region
Cython speedups:
Using:
cdef np.ndarray region = np.zeros((3,3,3),dtype=DTYPE)
...
region=array_region[xMin:xMax,yMin:yMax,zMin:zMax]
new_region=np.min(region[region>0])
Time: 0.170, original: 0.339 s
Results
After considering the many useful comments and answers provided, my current algorithms are running at:
Cython: 0.0219
Python: 0.4309
Cython is providing a 20x increase in speed over the pure python.
Current Cython Code:
import numpy as np
import cython
cimport numpy as np
cimport cython
from libcpp.map cimport map
DTYPE = np.int
ctypedef np.int_t DTYPE_t
cdef inline int int_max(int a, int b): return a if a >= b else b
cdef inline int int_min(int a, int b): return a if a <= b else b
#cython.boundscheck(False)
def find_regions_3D(np.ndarray[DTYPE_t,ndim=3] Array not None):
cdef unsigned int x_dim=np.size(Array,0),y_dim=np.size(Array,1),z_dim=np.size(Array,2)
regions = {}
cdef np.ndarray[DTYPE_t,ndim=3] array_region = np.zeros((x_dim,y_dim,z_dim),dtype=DTYPE)
cdef np.ndarray region = np.zeros((3,3,3),dtype=DTYPE)
cdef map[int,int] equivalences
cdef unsigned int n_regions = 0
#first pass. find regions.
ind=np.where(Array==1)
cdef np.ndarray[DTYPE_t,ndim=1] ind_x = ind[0], ind_y = ind[1], ind_z = ind[2]
cells=range(len(ind_x))
cdef unsigned int xMin, xMax, yMin, yMax, zMin, zMax, max_region, new_region, x, y, z, i, xi, yi, zi, val
for i in cells:
x=ind_x[i]
y=ind_y[i]
z=ind_z[i]
# get the region number from all surrounding cells including diagnols (27) or create new region
xMin=int_max(x-1,0)
xMax=int_min(x+1,x_dim-1)+1
yMin=int_max(y-1,0)
yMax=int_min(y+1,y_dim-1)+1
zMin=int_max(z-1,0)
zMax=int_min(z+1,z_dim-1)+1
max_region = 0
new_region = 2000000000 # huge number
for xi in range(xMin, xMax):
for yi in range(yMin, yMax):
for zi in range(zMin, zMax):
val = array_region[xi,yi,zi]
if val > max_region: # val is the new maximum
max_region = val
if 0 < val < new_region: # val is the new minimum
new_region = val
if max_region > 0:
if max_region > new_region:
if equivalences.count(max_region) == 0 or new_region < equivalences[max_region]:
equivalences[max_region] = new_region
else:
n_regions += 1
new_region = n_regions
array_region[x,y,z] = new_region
#Scan Array again, assigning all equivalent regions the same region value.
cdef int r
for i in cells:
x=ind_x[i]
y=ind_y[i]
z=ind_z[i]
r = array_region[x,y,z]
while equivalences.count(r) > 0:
r= equivalences[r]
array_region[x,y,z]=r
return array_region
Setup file [setup.py]
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
import numpy
setup(
cmdclass = {'build_ext': build_ext},
ext_modules = [Extension("ConnectComp", ["ConnectedComponents.pyx"],
include_dirs =[numpy.get_include()],
language="c++",
)]
)
Build command:
python setup.py build_ext --inplace
As #gotgenes points out, you should definitely be using cython -a <file>, and trying to reduce the amount of yellow you see. Yellow corresponds to worse and worse generated C.
Things I found that reduced the amount of yellow:
This looks like a situation where there will never be any out of bounds array access, as long as the input Array has 3 dimensions, so one can turn off bounds checking:
cimport cython
#cython.boundscheck(False)
def find_regions_3d(...):
Give the compiler more information for efficient indexing, i.e. whenever you cdef an ndarray give as much information as you can:
def find_regions_3D(np.ndarray[DTYPE_t,ndim=3] Array not None):
[...]
cdef np.ndarray[DTYPE_t,ndim=3] array_region = ...
[etc.]
Give the compiler more information about positive/negative-ness. I.e. if you know a certain variable is always going to be positive, cdef it as unsigned int rather than int, as this means that Cython can eliminate any negative-indexing checks.
Unpack the ind tuple immediately, i.e.
ind = np.where(Array==1)
cdef np.ndarray[DTYPE_t,ndim=1] ind_x = ind[0], ind_y = ind[1], ind_z = ind[2]
Avoid using the for x,y,z in zip(..[0],..[1],..[2]) construct. In both cases, replace it with
cdef int i
for i in range(len(ind_x)):
x = ind_x[i]
y = ind_y[i]
z = ind_z[i]
Avoid doing the fancy indexing/slicing. And especially avoid doing it twice! And avoid using filter! I.e. replace
max_region=array_region[xMin:xMax,yMin:yMax,zMin:zMax].max()
if max_region > 0:
new_region = min(filter(lambda i: i > 0, array_region[xMin:xMax,yMin:yMax,zMin:zMax].ravel()))
if max_region > new_region:
if max_region in equivalences:
equivalences[max_region].add(new_region)
else:
equivalences[max_region] = set((new_region, ))
with the more verbose
max_region = 0
new_region = 2000000000 # "infinity"
for xi in range(xMin, xMax):
for yi in range(yMin, yMax):
for zi in range(zMin, zMax):
val = array_region[xi,yi,zi]
if val > max_region: # val is the new maximum
max_region = val
if 0 < val < new_region: # val is the new minimum
new_region = val
if max_region > 0:
if max_region > new_region:
if max_region in equivalences:
equivalences[max_region].add(new_region)
else:
equivalences[max_region] = set((new_region, ))
else:
n_regions += 1
new_region = n_regions
This doesn't look so nice, but the triple loop compiles down to about 10 or so lines of C, while the compiled version of the original is hundreds of lines long and has a lot of Python object manipulation.
(Obviously you must cdef all the variables you use, especially xi, yi, zi and val in this code.)
You don't need to store all the equivalences, since the only thing you do with the set is find the minimum element. So if you instead have equivalences mapping int to int, you can replace
if max_region in equivalences:
equivalences[max_region].add(new_region)
else:
equivalences[max_region] = set((new_region, ))
[...]
while r in equivalences:
r = min(equivalences[r])
with
if max_region not in equivalences or new_region < equivalences[max_region]:
equivalences[max_region] = new_region
[...]
while r in equivalences:
r = equivalences[r]
The last thing to do after all that would be to not use any Python objects at all, specifically, don't use a dictionary for equivalences. This is now easy, since it is mapping int to int, so one could use from libcpp.map cimport map and then cdef map[int,int] equivalences, and replace .. not in equivalences with equivalences.count(..) == 0 and .. in equivalences with equivalences.count(..) > 0. (Note that it will then require a C++ compiler.)
(copied from the above comment for others ease of reading)
I believe scipy's ndimage.label does what you want (I did not test it against your code but it should be quite efficient). Note that you have to import it explicitely:
from scipy import ndimage
ndimage.label(your_data, connectivity_struct)
then later you can apply other built-in functions (like finding the bounding rectangle, centre-of-mass, etc)
When optimizing for cython you want to make sure that in your loops mostly native C data types are used, not Python objects that come with a higher overhead. The best way to find such places is to look at the generated C code and look for lines that were translated into lots of Py* function calls. These places could usually be optimized by using cdef variables instead of python objects.
In your code I would for example suspect that the loop with zip produces lots of python objects and it would be much faster to iterate with an int index that is then used to get the elements in ind[0],.... But look at the generated C code and see what seems to call unnecessarily many python functions.
Related
Is there a faster method to find the difference of each pixel neighbor in an image?
I'm kinda new to image processing and I'm having trouble with the speed. What I'm trying to do is "the difference between pixel values and the neighbors of them are calculated to discover whether there is a great contrast (>100 for this case) between them and accumulated" Equation It is working but it is very slow. Is there any optimal way to do this? %%cython -a import cython import numpy as np #cython.boundscheck(False) cpdef unsigned char[:, :] test(unsigned char [:, :] image): w = image.shape[1] h = image.shape[0]; Hi = [None] * w Vj = [None] * h #For Hi for y in range(0, w): value1 = 0 for x in range(1, h): value = abs(image[x, y]- image[(x-1), y]) if(value > 100): value1+= value Hi[y]= value1
Nested loops with cython for image processing
I'm trying to iterate over a 2D image containing floating-point depth data, it has a somewhat normal resolution (640, 480), but python has been too slow, so I've been trying to optimize the problem by using cython. I've tried to move the looping to other functions, shifting around the nogil statement, didn't seem to work, after reworking the problem, I was able to get a portion of it working. But this last part is escaping me to no avail. I've attempted to get rid of python objects from the prange() loop by moving them to the with gil section beforehand, hence: cdef int[:] w_list = array.array(range(0, w_inc, interpolation)) instead of for r in range(0, w_inc, interpolation): but the error persists My code works in two parts: The split_data() method subsections the image into num quadrants that are stored in a 3D array bits. These are use to make splitting up the work to multiple thread/processes easier. This part works okay. #cython.cdivision(True) #cython.boundscheck(False) cpdef split_data(double[:, :] frame, int h, int w, int num): cdef double[:, :, :] bits = np.zeros(shape=(num, h // num, w // num), dtype=float) cdef int c_count = os.cpu_count() cdef int i, j, k for i in prange(num, nogil=True, num_threads=c_count): for j in prange(h // num): for k in prange(w // num): bits[i, j, k] = frame[i * (h // num) + j, i * (w // num) + k] return bits The scatter_data() method takes the bits array from the previous function and then creates another 3D array with length num where num is the length of bits, called points which is a series of 3D coordinates representing valid depth points. It then uses prange() to extract the valid depth data from each of these bits and stores them into points #cython.cdivision(True) #cython.boundscheck(False) cpdef scatter_data(double[:, :] depths, object validator=None, int h=-1, int w=-1, int interpolation=1): # Handles if h or w is -1 (default) if h < 0 or w < 0: h = depths.shape[0] if h < 0 else h w = depths.shape[1] if w < 0 else w cdef int max_num = w * h cdef int c_count = os.cpu_count() cdef int h_inc = h // c_count, w_inc = w // c_count cdef double[:, :, :] points = np.zeros(shape=(c_count, max_num, 3), dtype=float) cdef double[:, :, :] bits = split_data(depths, h, w, c_count) cdef int count = 0 cdef int i, r, c cdef int[:] w_list = array.array(range(0, w_inc, interpolation)) cdef int[:] h_list = array.array(range(0, h_inc, interpolation)) for i in prange(c_count, nogil=True, num_threads=c_count): count = 0 for r in w_list: for c in h_list: if depths[c, r] != 0: points[i, count, 0] = w - r points[i, count, 1] = c points[i, count, 2] = depths[c, r] count = count + 1 points = points[:count] return points and for completeness 3. Here are my import statements import cython from cython.parallel import prange from cpython cimport array import array cimport numpy as np import numpy as np import os When compiling the code I keep getting error messages something along the lines of: Error compiling Cython file: ------------------------------------------------------------ ... cdef int[:] w_list = array.array(range(0, w_inc, interpolation)) cdef int[:] h_list = array.array(range(0, h_inc, interpolation)) for i in prange(c_count, nogil=True, num_threads=c_count): count = 0 for r in w_list: ^ ------------------------------------------------------------ data_util/cy_scatter.pyx:70:17: Iterating over Python object not allowed without gil and Error compiling Cython file: ------------------------------------------------------------ ... cdef int[:] w_list = array.array(range(0, w_inc, interpolation)) cdef int[:] h_list = array.array(range(0, h_inc, interpolation)) for i in prange(c_count, nogil=True, num_threads=c_count): count = 0 for r in w_list: ^ ------------------------------------------------------------ data_util/cy_scatter.pyx:70:17: Coercion from Python not allowed without the GIL and Error compiling Cython file: ------------------------------------------------------------ ... cdef int[:] w_list = array.array(range(0, w_inc, interpolation)) cdef int[:] h_list = array.array(range(0, h_inc, interpolation)) for i in prange(c_count, nogil=True, num_threads=c_count): count = 0 for r in w_list: ^ ------------------------------------------------------------ data_util/cy_scatter.pyx:70:17: Converting to Python object not allowed without gil Is there a way to do this? And if so, how do I do this?
You just want to iterate by index rather than by iterating over a Python iterator: for ri in range(w_list.shape[0]): r = w_list[ri] This is somewhere where best practice in Python differs from best practice in Cython - Cython only accelerates iterating over numeric loops. The way you're trying to do it will fall back to being a Python iterator which is both slower, and requires the GIL.
Parallelising an Exhaustive Search in Cython
I'm fairly new to Cython, and I'm trying to Cythonize some code of mine. I have a 3D array, X, of complex values (which I'm treating as a large 'stack' of square arrays) which has shape on the scale of (small, small, huge), and I need to find the location and the absolute value of the largest above-diagonal item. I currently have an exhaustive search like this: cdef double complex[:,:,:] Xcp = X.copy() cdef Py_ssize_t h = Xcp.shape[0] cdef Py_ssize_t w = Xcp.shape[1] cdef Py_ssize_t l = Xcp.shape[2] cdef Py_ssize_t j, k, m cdef double tmptop = 0.0 cdef Py_ssize_t[:] coords = np.zeros((3), dtype="intp") cdef double it for j in range(l): for k in range(w): for m in range(k+1, h): it = cabs(Xcp[m, k, j]) if it > tmptop: tmptop = it coords[0] = m coords[1] = k coords[2] = j Note that I'm getting cabs from here: cdef extern from "complex.h": double cabs(double complex) This code is already quite a lot faster than what I had previously in Numpy, but I do feel as though it could be sped up, in particular paralellised. I have tried changing the loop to this: with nogil: for j in prange(l): for k in range(w): for m in range(k+1, h): it = abs(Xcp[m, k, j]) if it > tmptop: tmptop = it coords[0] = m coords[1] = k coords[2] = j Though I'm now getting the wrong result. What's going on here?
Cythonize two small numpy functions, help needed
The problem I'm trying to Cythonize two small functions that mostly deal with numpy ndarrays for some scientific purpose. These two smalls functions are called millions of times in a genetic algorithm and account for the majority of the time taken by the algo. I made some progress on my own and both work nicely, but i get only a tiny speed improvement (10%). More importantly, cython --annotate show that the majority of the code is still going through Python. The code First function: The aim of this function is to get back slices of data and it is called millions of times in an inner nested loop. Depending on the bool in data[1][1], we either get the slice in the forward or reverse order. #Ipython notebook magic for cython %%cython --annotate import numpy as np from scipy import signal as scisignal cimport cython cimport numpy as np def get_signal(data): #data[0] contains the data structure containing the numpy arrays #data[1][0] contains the position to slice #data[1][1] contains the orientation to slice, forward = 0, reverse = 1 cdef int halfwinwidth = 100 cdef int midpoint = data[1][0] cdef int strand = data[1][1] cdef int start = midpoint - halfwinwidth cdef int end = midpoint + halfwinwidth #the arrays we want to slice cdef np.ndarray r0 = data[0]['normals_forward'] cdef np.ndarray r1 = data[0]['normals_reverse'] cdef np.ndarray r2 = data[0]['normals_combined'] if strand == 0: normals_forward = r0[start:end] normals_reverse = r1[start:end] normals_combined = r2[start:end] else: normals_forward = r1[end - 1:start - 1: -1] normals_reverse = r0[end - 1:start - 1: -1] normals_combined = r2[end - 1:start - 1: -1] #return the result as a tuple row = (normals_forward, normals_reverse, normals_combined) return row Second function This one gets a list of tuples of numpy arrays, and we want to add up the arrays element wise, then normalize them and get the integration of the intersection. def calculate_signal(list signal): cdef int halfwinwidth = 100 cdef np.ndarray profile_normals_forward = np.zeros(halfwinwidth * 2, dtype='f') cdef np.ndarray profile_normals_reverse = np.zeros(halfwinwidth * 2, dtype='f') cdef np.ndarray profile_normals_combined = np.zeros(halfwinwidth * 2, dtype='f') #b is a tuple of 3 np.ndarrays containing 200 floats #here we add them up elementwise for b in signal: profile_normals_forward += b[0] profile_normals_reverse += b[1] profile_normals_combined += b[2] #normalize the arrays cdef int count = len(signal) #print "Normalizing to number of elements" profile_normals_forward /= count profile_normals_reverse /= count profile_normals_combined /= count intersection_signal = scisignal.detrend(np.fmin(profile_normals_forward, profile_normals_reverse)) intersection_signal[intersection_signal < 0] = 0 intersection = np.sum(intersection_signal) results = {"intersection": intersection, "profile_normals_forward": profile_normals_forward, "profile_normals_reverse": profile_normals_reverse, "profile_normals_combined": profile_normals_combined, } return results Any help is appreciated - I tried using memory views but for some reason the code got much, much slower.
After fixing the array cdef (as has been indicated, with the dtype specified), you should probably put the routine in a cdef function (which will only be callable by a def function in the same script). In the declaration of the function, you'll need to provide the type (and the dimensions if it's an array numpy): cdef get_signal(numpy.ndarray[DTYPE_t, ndim=3] data): I'm not sure using a dict is a good idea though. You could make use of numpy's column or row slices like data[:, 0].
numpy template matching using matrix multiplications
I am trying to match a template with a binary image (only black and white) by shifting the template along the image. And return the minimum distance between the template and the image with the corresponding position on which this minimum distance did occur. For example: img: 0 1 0 0 0 1 0 1 1 template: 0 1 1 1 This template matches the image best at position (1,1) and the distance will then be 0. So far things are not too difficult and I already got some code that does the trick. def match_template(img, template): mindist = float('inf') idx = (-1,-1) for y in xrange(img.shape[1]-template.shape[1]+1): for x in xrange(img.shape[0]-template.shape[0]+1): #calculate Euclidean distance dist = np.sqrt(np.sum(np.square(template - img[x:x+template.shape[0],y:y+template.shape[1]]))) if dist < mindist: mindist = dist idx = (x,y) return [mindist, idx] But for images of the size I need (image among 500 x 200 pixels and template among 250 x 100) this already takes approximately 4.5 seconds, which is way too slow. And I know the same thing can be done much quicker using matrix multiplications (in matlab I believe this can be done using im2col and repmat). Can anyone explain me how to do it in python/numpy? btw. I know there is an opencv matchTemplate function that does exactly what I need, but since I might need to alter the code slightly later on I would prefer a solution which I fully understand and can alter. Thanks! edit: If anyone can explain me how opencv does this in less than 0.2 seconds that would also be great. I have had a short look at the source code, but those things somehow always look quite complicated to me. edit2: Cython code import numpy as np cimport numpy as np DTYPE = np.int ctypedef np.int_t DTYPE_t def match_template(np.ndarray img, np.ndarray template): cdef float mindist = float('inf') cdef int x_coord = -1 cdef int y_coord = -1 cdef float dist cdef unsigned int x, y cdef int img_width = img.shape[0] cdef int img_height = img.shape[1] cdef int template_width = template.shape[0] cdef int template_height = template.shape[1] cdef int range_x = img_width-template_width+1 cdef int range_y = img_height-template_height+1 for y from 0 <= y < range_y: for x from 0 <= x < range_x: dist = np.sqrt(np.sum(np.square(template - img[ x:<unsigned int>(x+template_width), y:<unsigned int>(y+template_height) ]))) #calculate euclidean distance if dist < mindist: mindist = dist x_coord = x y_coord = y return [mindist, (x_coord,y_coord)] img = np.asarray(img, dtype=DTYPE) template = np.asarray(template, dtype=DTYPE) match_template(img, template)
One possible way of doing what you want is via convolution (which can be brute force or FFT). Matrix multiplications AFAIK won't work. You need to convolve your data with the template. And find the maximum (you'll also need to do some scaling to make it work properly). xs=np.array([[0,1,0],[0,0,1],[0,1,1]])*1. ys=np.array([[0,1],[1,1]])*1. print scipy.ndimage.convolve(xs,ys,mode='constant',cval=np.inf) >>> array([[ 1., 1., inf], [ 0., 2., inf], [ inf, inf, inf]]) print scipy.signal.fftconvolve(xs,ys,mode='valid') >>> array([[ 1., 1.], [ 0., 2.]])
There may be a fancy way to get this done using pure numpy/scipy magic. But it might be easier (and more understandable when you look at the code in the future) to just drop into Cython to get this done. There's a good tutorial for integrating Cython with numpy at http://docs.cython.org/src/tutorial/numpy.html. EDIT: I did a quick test with your Cython code and it ran ~15 sec for a 500x400 img with a 100x200 template. After some tweaks (eliminating the numpy method calls and numpy bounds checking), I got it down under 3 seconds. That may not be enough for you, but it shows the possibility. import numpy as np cimport numpy as np cimport cython from libc.math cimport sqrt DTYPE = np.int ctypedef np.int_t DTYPE_t #cython.boundscheck(False) def match_template(np.ndarray[DTYPE_t, ndim=2] img, np.ndarray[DTYPE_t, ndim=2] template): cdef float mindist = float('inf') cdef int x_coord = -1 cdef int y_coord = -1 cdef float dist cdef unsigned int x, y cdef int img_width = img.shape[0] cdef int img_height = img.shape[1] cdef int template_width = template.shape[0] cdef int template_height = template.shape[1] cdef int range_x = img_width-template_width+1 cdef int range_y = img_height-template_height+1 cdef DTYPE_t total cdef int delta cdef unsigned int j, k, j_plus, k_plus for y from 0 <= y < range_y: for x from 0 <= x < range_x: #dist = np.sqrt(np.sum(np.square(template - img[ x:<unsigned int>(x+template_width), y:<unsigned int>(y+template_height) ]))) #calculate euclidean distance # Do the same operations, but in plain C total = 0 for j from 0 <= j < template_width: j_plus = <unsigned int>x + j for k from 0 <= k < template_height: k_plus = <unsigned int>y + k delta = template[j, k] - img[j_plus, k_plus] total += delta*delta dist = sqrt(total) if dist < mindist: mindist = dist x_coord = x y_coord = y return [mindist, (x_coord,y_coord)]