Cython call optimization

Cython call optimization - python

I've got a Python function I try to export to Cython. I have tested two implementations but I don't understand why the second one is slower than the first one. Furthermore, I am looking for ways to improve speed a little more but I have no clue how ?
Base code
import numpy as np
cimport numpy as np
cimport cython
DTYPE = np.int
ctypedef np.int_t DTYPE_t
cdef inline int int_max(int a, int b): return a if a >= b else b
cdef inline int int_min(int a, int b): return a if a <= b else b
cdef extern from "math.h":
double exp(double x)
#cython.boundscheck(False)
#cython.wraparound(False)
def bilateral_filter_C(np.ndarray[np.float_t, ndim=1] samples, int w=20):
# Filter Parameters
cdef Py_ssize_t size = samples.shape[0]
cdef float rang
cdef float sigma = 2*3.0*3.0
cdef int j, L
cdef unsigned int a, b
cdef np.float_t W, num, sub_sample, intensity
# Initialization
cdef np.ndarray[np.float_t, ndim=1] gauss = np.zeros(2*w+1, dtype=np.float)
cdef np.ndarray[np.float_t, ndim=1] sub_samples, intensities = np.empty(size, dtype=np.float)
cdef np.ndarray[np.float_t, ndim=1] samples_filtered = np.empty(size, dtype=np.float)
L = 2*w+1
for j in xrange(L):
rang = -w+1.0/L
rang *= rang
gauss[j] = exp(-rang/sigma)
<CODE TO IMPROVE>
return samples_filtered
I tried to inject those two code samples in the <CODE TO IMPROVE> section:
Most efficient approach
for i in xrange(size):
a = <unsigned int>int_max(i-w, 0)
b = <unsigned int>int_min(i+w, size-1)
L = b-a
sub_samples = samples[a:b]-samples[i]
sub_samples *= sub_samples
for j in xrange(L):
sub_samples[j] = exp(-sub_samples[j]/sigma)
intensities = gauss[w-i+a:w-i+b]*sub_samples
num = 0.0
W = 0.0
for j in xrange(L):
W += intensities[j]
num += intensities[j]*samples[a+j]
samples_filtered[i] = num/W
Result
%timeit -n1 -r10 bilateral_filter_C(x, 20)
1 loop, best of 10: 45 ms per loop
Less efficient
for i in xrange(size):
a = <unsigned int>int_max(i-w, 0)
b = <unsigned int>int_min(i+w, size-1)
num = 0.0
W = 0.0
for j in xrange(b-a):
sub_sample = samples[a+j]-samples[i]
intensity1 = gauss[w-i+a+j]*exp(-sub_sample*sub_sample/sigma)
W += intensity
num += intensity*samples[a+j]
samples_filtered[i] = num/W
Result
%timeit -n1 -r10 bilateral_filter_C(x, 20)
1 loop, best of 10: 125 ms per loop

You have a few typos:
1) You forgot to define i, just add cdef int i, j, L
2) In the second algorithm you wrote intensity1 = gauss[w-i+a+j]*exp(-sub_sample*sub_sample/sigma), it should be intensity, without the 1
3) I would add #cython.cdivision(True) to avoid the check of division by zero
With those changes and with x = np.random.rand(10000)I got the following results
%timeit bilateral_filter_C1(x, 20) # First code
10 loops, best of 3: 74.1 ms per loop
%timeit bilateral_filter_C2(x, 20) # Second code
100 loops, best of 3: 9.5 ms per loop
And, to check the results
np.all(np.equal(bilateral_filter_C1(x, 20), bilateral_filter_C2(x, 20)))
True
To avoid these problems I suggest to use the option cython my_file.pyx -a, it generates an html file that shows you the possible problems you have in your code
EDIT
Reading again the code, it seems to have more errors:
for j in xrange(L):
rang = -w+1.0/L
rang *= rang
gauss[j] = exp(-rang/sigma)
gauss has the same value always, what is the definition of rang?

Related

Nested loops with cython for image processing

I'm trying to iterate over a 2D image containing floating-point depth data, it has a somewhat normal resolution (640, 480), but python has been too slow, so I've been trying to optimize the problem by using cython.
I've tried to move the looping to other functions, shifting around the nogil statement, didn't seem to work, after reworking the problem, I was able to get a portion of it working. But this last part is escaping me to no avail.
I've attempted to get rid of python objects from the prange() loop by moving them to the with gil section beforehand, hence:
cdef int[:] w_list = array.array(range(0, w_inc, interpolation))
instead of
for r in range(0, w_inc, interpolation):
but the error persists
My code works in two parts:
The split_data() method subsections the image into num quadrants that are stored in a 3D array bits. These are use to make splitting up the work to multiple thread/processes easier. This part works okay.
#cython.cdivision(True)
#cython.boundscheck(False)
cpdef split_data(double[:, :] frame, int h, int w, int num):
cdef double[:, :, :] bits = np.zeros(shape=(num, h // num, w // num), dtype=float)
cdef int c_count = os.cpu_count()
cdef int i, j, k
for i in prange(num, nogil=True, num_threads=c_count):
for j in prange(h // num):
for k in prange(w // num):
bits[i, j, k] = frame[i * (h // num) + j, i * (w // num) + k]
return bits
The scatter_data() method takes the bits array from the previous function and then creates another 3D array with length num where num is the length of bits, called points which is a series of 3D coordinates representing valid depth points. It then uses prange() to extract the valid depth data from each of these bits and stores them into points
#cython.cdivision(True)
#cython.boundscheck(False)
cpdef scatter_data(double[:, :] depths, object validator=None,
int h=-1, int w=-1, int interpolation=1):
# Handles if h or w is -1 (default)
if h < 0 or w < 0:
h = depths.shape[0] if h < 0 else h
w = depths.shape[1] if w < 0 else w
cdef int max_num = w * h
cdef int c_count = os.cpu_count()
cdef int h_inc = h // c_count, w_inc = w // c_count
cdef double[:, :, :] points = np.zeros(shape=(c_count, max_num, 3), dtype=float)
cdef double[:, :, :] bits = split_data(depths, h, w, c_count)
cdef int count = 0
cdef int i, r, c
cdef int[:] w_list = array.array(range(0, w_inc, interpolation))
cdef int[:] h_list = array.array(range(0, h_inc, interpolation))
for i in prange(c_count, nogil=True, num_threads=c_count):
count = 0
for r in w_list:
for c in h_list:
if depths[c, r] != 0:
points[i, count, 0] = w - r
points[i, count, 1] = c
points[i, count, 2] = depths[c, r]
count = count + 1
points = points[:count]
return points
and for completeness
3. Here are my import statements
import cython
from cython.parallel import prange
from cpython cimport array
import array
cimport numpy as np
import numpy as np
import os
When compiling the code I keep getting error messages something along the lines of:
Error compiling Cython file:
------------------------------------------------------------
...
cdef int[:] w_list = array.array(range(0, w_inc, interpolation))
cdef int[:] h_list = array.array(range(0, h_inc, interpolation))
for i in prange(c_count, nogil=True, num_threads=c_count):
count = 0
for r in w_list:
^
------------------------------------------------------------
data_util/cy_scatter.pyx:70:17: Iterating over Python object not allowed without gil
and
Error compiling Cython file:
------------------------------------------------------------
...
cdef int[:] w_list = array.array(range(0, w_inc, interpolation))
cdef int[:] h_list = array.array(range(0, h_inc, interpolation))
for i in prange(c_count, nogil=True, num_threads=c_count):
count = 0
for r in w_list:
^
------------------------------------------------------------
data_util/cy_scatter.pyx:70:17: Coercion from Python not allowed without the GIL
and
Error compiling Cython file:
------------------------------------------------------------
...
cdef int[:] w_list = array.array(range(0, w_inc, interpolation))
cdef int[:] h_list = array.array(range(0, h_inc, interpolation))
for i in prange(c_count, nogil=True, num_threads=c_count):
count = 0
for r in w_list:
^
------------------------------------------------------------
data_util/cy_scatter.pyx:70:17: Converting to Python object not allowed without gil
Is there a way to do this? And if so, how do I do this?

You just want to iterate by index rather than by iterating over a Python iterator:
for ri in range(w_list.shape[0]):
r = w_list[ri]
This is somewhere where best practice in Python differs from best practice in Cython - Cython only accelerates iterating over numeric loops. The way you're trying to do it will fall back to being a Python iterator which is both slower, and requires the GIL.

Parallelising an Exhaustive Search in Cython

I'm fairly new to Cython, and I'm trying to Cythonize some code of mine. I have a 3D array, X, of complex values (which I'm treating as a large 'stack' of square arrays) which has shape on the scale of (small, small, huge), and I need to find the location and the absolute value of the largest above-diagonal item. I currently have an exhaustive search like this:
cdef double complex[:,:,:] Xcp = X.copy()
cdef Py_ssize_t h = Xcp.shape[0]
cdef Py_ssize_t w = Xcp.shape[1]
cdef Py_ssize_t l = Xcp.shape[2]
cdef Py_ssize_t j, k, m
cdef double tmptop = 0.0
cdef Py_ssize_t[:] coords = np.zeros((3), dtype="intp")
cdef double it
for j in range(l):
for k in range(w):
for m in range(k+1, h):
it = cabs(Xcp[m, k, j])
if it > tmptop:
tmptop = it
coords[0] = m
coords[1] = k
coords[2] = j
Note that I'm getting cabs from here:
cdef extern from "complex.h":
double cabs(double complex)
This code is already quite a lot faster than what I had previously in Numpy, but I do feel as though it could be sped up, in particular paralellised.
I have tried changing the loop to this:
with nogil:
for j in prange(l):
for k in range(w):
for m in range(k+1, h):
it = abs(Xcp[m, k, j])
if it > tmptop:
tmptop = it
coords[0] = m
coords[1] = k
coords[2] = j
Though I'm now getting the wrong result. What's going on here?

Iteratively-defined Numpy Array Creation

I'm having trouble phrasing this problem in Numpy. I need to simulate an analog maximum tracker (resistor diode capacitor). I have some very long 1-D array X from which I want to calculate the output array Y, such that
Y[0] = X[0]
Y[i] = max(0.99 * Y[i - 1], X[i])
I've faked it by approximating my above rules with Y^30 = ExpDecayFunc * X^30 where the asteriks is convolution. Surely there is something much more straight forward I'm missing? Thanks so much!

Are you trying to simulate an asymmetric signal filter (resistor, diode, capacitor)? It is a nasty non-linear operation, which cannot be calculated in parallel. So, this is really not something nice for NumPy to solve.
The trivial solution is:
import numpy as np
# just do something random
X = np.random.random(1000000)
def my_filter(X):
Y = np.empty(len(X))
Y[0] = X[0]
for i in range(1, len(X)):
Y[i] = max(.99*Y[i-1], X[i])
return Y
This takes time, my machine needs whopping 1.36 s for this (1.36 us for item). Not very nice. (Edit: The stupid use of np.arange changed to range.)
The algorithm can be made a bit faster by rearranging it to avoid lookups:
def my_filter_2(X):
Y = np.empty(len(X))
Y[0] = X[0]
a = .99 * Y[0]
for i in range(1, len(X)):
a = max(a, X[i])
Y[i] = a
a *= .99
return Y
Now we have 1.16 ms (1.16 us per element). An improvement, but not very fast after all.
But then we have cython. This is done with IPython's %%cython (not my solution, Andrew Jaffe shows this in his great answer):
%%cython
import numpy as np
cimport numpy as np
# just do something random
cdef np.ndarray cX = np.random.random(1000000)
def cy_filter(np.ndarray[np.double_t] X):
cdef int i
cdef np.ndarray[np.double_t] Y = np.empty(len(X))
Y[0] = X[0]
for i in range(1, len(X)):
Y[i] = max(.99*Y[i-1], X[i])
return Y
This is fast! My computer claims 6.43 ms (6.43 ns/element).
Another almost-Pythonic solution is numba as suggested by DSM in their answer:
from numba import autojit
import numpy as np
#autojit
def my_filter_nb(X, Y):
Y[0] = X[0]
for i in range(1, len(X)):
Y[i] = max(.99*Y[i-1], X[i])
return Y
def my_filter_fast(X):
Y = np.empty(len(X))
my_filter_nb(X, Y)
return Y
This gives 4.18 ms (4.18 ns/element).
But if we still need speed, let's C:
import numpy as np
import scipy.weave
X = np.random.random(1000000)
def my_filter_c(X):
x_len = len(X)
Y = np.empty(x_len)
c_source = """
#include <math.h>
int i;
double a, x;
Y(0) = X(0);
a = .99 * Y(0);
for (i = 1; i < x_len; i++)
{
x = X(i);
if (x > a)
a = x;
Y(i) = a;
a *= .99;
}
"""
scipy.weave.inline(c_source, ["X","Y","x_len"],
compiler="gcc",
headers=["<math.h>"],
type_converters=scipy.weave.converters.blitz)
return Y
This one gives 3.72 ms (3.72 ns/round). (BTW, my brain is not multi-threaded, and writing inlined C into Python would require two threads - it's amazing how many semicolons one can miss when writing a simple program in C.) The improvement is not that big, the trouble is.
To see how bad or good this is compared to plain C:
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>
#include <time.h>
#define NUMITER 100000000
int main(void)
{
double *x, *y;
double a, b, time_delta;
int i;
struct rusage ru0, ru1;
x = (double *)malloc(NUMITER * sizeof(double));
y = (double *)malloc(NUMITER * sizeof(double));
for (i = 0; i < NUMITER; i++)
x[i] = rand() / (double)(RAND_MAX - 1);
getrusage(RUSAGE_SELF, &ru0);
y[0] = x[0];
a = .99 * y[0];
for (i = 0; i < NUMITER; i++)
{
b = x[i];
if (b > a)
a = b;
y[i] = a;
a *= .99;
}
getrusage(RUSAGE_SELF, &ru1);
time_delta = ru1.ru_utime.tv_sec + ru1.ru_utime.tv_usec * 1e-6
- ru0.ru_utime.tv_sec - ru0.ru_utime.tv_usec * 1e-6;
printf("Took %.6lf seconds, %.2lf nanoseconds per element", time_delta, 1e9 * time_delta / NUMITER);
return (int)y[1234] % 2; // just to make sure the optimizer is not too clever
}
This compiled with gcc -Ofast takes 318 ms or 3.18 ns/element (note the larger number of elements) and is thus the winner.
All Python timings have been performed with IPython's %timeit and they include some overhead from the np.empty, but that is quite insignificant. However probably due to memory management issues the results vary somewhat from one run to another, so they need to be taken with a pinch a salt in any case.
I also tried the faster solutions with 500 million elements to avoid call overheads:
%cython: 7.5 ns/element
numba: 7.3 ns/element
inlined C (weave): 5.7 ns/element
plain C: 3.2 ns/element
I also tried some hand-optimizing tricks with plain C, but at least without looking at the compiling results it seems that gcc is at least as clever as I am.
Out of this stack I'd probably take numba or plain C depending on the rush I am having. With this specific problem scipy.weave.inline is too much trouble compared to the advantage.
Also -- depending on the data -- this could possibly be made slightly faster with parallel processing, but the worst case is then worse, and the whole thing may be memory-bandwidth-limited anyway.

Cython is very fast. I ran this in iPython using the cython magic.
%% cython
import numpy as np
cimport numpy as np
# just do something random
cdef np.ndarray cX = np.random.random(1000000)
def cy_filter(np.ndarray[np.double_t] X):
cdef int i
cdef np.ndarray[np.double_t] Y = np.empty(len(X))
Y[0] = X[0]
for i in range(1, len(X)):
Y[i] = max(.99*Y[i-1], X[i])
return Y
Using %timeit, I get a speedup from
1 loops, best of 3: 1.52 s per loop
to
100 loops, best of 3: 4.67 ms per loop
(For what it's worth, when I missed the cdef int i it was only about a factor 3 speedup, instead of 300!)

You could also use numba, although it would require a few changes:
from numba import autojit
import numpy as np
#autojit
def my_filter_nb(X, Y):
Y[0] = X[0]
for i in range(1, len(X)):
Y[i] = max(.99*Y[i-1], X[i])
return Y
def my_filter_fast(X):
Y = np.empty(len(X))
my_filter_nb(X, Y)
return Y
def my_filter(X):
Y = np.empty(len(X))
Y[0] = X[0]
for i in np.arange(1, len(X)):
Y[i] = max(.99*Y[i-1], X[i])
return Y
which gives me:
>>> X = np.random.random(1000000)
>>> %timeit my_filter(X)
1 loops, best of 3: 936 ms per loop
>>> %timeit my_filter_fast(X)
100 loops, best of 3: 3.83 ms per loop
>>> (my_filter(X) == my_filter_fast(X)).all()
True

Subset of a matrix multiplication, fast, and sparse

Converting a collaborative filtering code to use sparse matrices I'm puzzling on the following problem: given two full matrices X (m by l) and Theta (n by l), and a sparse matrix R (m by n), is there a fast way to calculate the sparse inner product . Large dimensions are m and n (order 100000), while l is small (order 10). This is probably a fairly common operation for big data since it shows up in the cost function of most linear regression problems, so I'd expect a solution built into scipy.sparse, but I haven't found anything obvious yet.
The naive way to do this in python is R.multiply(XTheta.T), but this will result in evaluation of the full matrix XTheta.T (m by n, order 100000**2) which occupies too much memory, then dumping most of the entries since R is sparse.
There is a pseudo solution already here on stackoverflow, but it is non-sparse in one step:
def sparse_mult_notreally(a, b, coords):
rows, cols = coords
rows, r_idx = np.unique(rows, return_inverse=True)
cols, c_idx = np.unique(cols, return_inverse=True)
C = np.array(np.dot(a[rows, :], b[:, cols])) # this operation is dense
return sp.coo_matrix( (C[r_idx,c_idx],coords), (a.shape[0],b.shape[1]) )
This works fine, and fast, for me on small enough arrays, but it barfs on my big datasets with the following error:
... in sparse_mult(a, b, coords)
132 rows, r_idx = np.unique(rows, return_inverse=True)
133 cols, c_idx = np.unique(cols, return_inverse=True)
--> 134 C = np.array(np.dot(a[rows, :], b[:, cols])) # this operation is not sparse
135 return sp.coo_matrix( (C[r_idx,c_idx],coords), (a.shape[0],b.shape[1]) )
ValueError: array is too big.
A solution which IS actually sparse, but very slow, is:
def sparse_mult(a, b, coords):
rows, cols = coords
n = len(rows)
C = np.array([ float(a[rows[i],:]*b[:,cols[i]]) for i in range(n) ]) # this is sparse, but VERY slow
return sp.coo_matrix( (C,coords), (a.shape[0],b.shape[1]) )
Does anyone know a fast, fully sparse way to do this?

I profiled 4 different solutions to your problem, and it looks like for any size of the array, the numba jit solution is the best. A close second is #Alexander's cython solution.
Here are the results (M is the number of rows in the x array):
M = 1000
function sparse_dense took 0.03 sec.
function sparse_loop took 0.07 sec.
function sparse_numba took 0.00 sec.
function sparse_cython took 0.09 sec.
M = 10000
function sparse_dense took 2.88 sec.
function sparse_loop took 0.68 sec.
function sparse_numba took 0.00 sec.
function sparse_cython took 0.01 sec.
M = 100000
function sparse_dense ran out of memory
function sparse_loop took 6.84 sec.
function sparse_numba took 0.09 sec.
function sparse_cython took 0.12 sec.
The script I used to profile these methods is:
import numpy as np
from scipy.sparse import coo_matrix
from numba import autojit, jit, float64, int32
import pyximport
pyximport.install(setup_args={"script_args":["--compiler=mingw32"],
"include_dirs":np.get_include()},
reload_support=True)
def sparse_dense(a,b,c):
return coo_matrix(c.multiply(np.dot(a,b)))
def sparse_loop(a,b,c):
"""Multiply sparse matrix `c` by np.dot(a,b) by looping over non-zero
entries in `c` and using `np.dot()` for each entry."""
N = c.size
data = np.empty(N,dtype=float)
for i in range(N):
data[i] = c.data[i]*np.dot(a[c.row[i],:],b[:,c.col[i]])
return coo_matrix((data,(c.row,c.col)),shape=(a.shape[0],b.shape[1]))
##autojit
def _sparse_mult4(a,b,cd,cr,cc):
N = cd.size
data = np.empty_like(cd)
for i in range(N):
num = 0.0
for j in range(a.shape[1]):
num += a[cr[i],j]*b[j,cc[i]]
data[i] = cd[i]*num
return data
_fast_sparse_mult4 = \
jit(float64[:,:](float64[:,:],float64[:,:],float64[:],int32[:],int32[:]))(_sparse_mult4)
def sparse_numba(a,b,c):
"""Multiply sparse matrix `c` by np.dot(a,b) using Numba's jit."""
assert c.shape == (a.shape[0],b.shape[1])
data = _fast_sparse_mult4(a,b,c.data,c.row,c.col)
return coo_matrix((data,(c.row,c.col)),shape=(a.shape[0],b.shape[1]))
def sparse_cython(a, b, c):
"""Computes c.multiply(np.dot(a,b)) using cython."""
from sparse_mult_c import sparse_mult_c
data = np.empty_like(c.data)
sparse_mult_c(a,b,c.data,c.row,c.col,data)
return coo_matrix((data,(c.row,c.col)),shape=(a.shape[0],b.shape[1]))
def unique_rows(a):
a = np.ascontiguousarray(a)
unique_a = np.unique(a.view([('', a.dtype)]*a.shape[1]))
return unique_a.view(a.dtype).reshape((unique_a.shape[0], a.shape[1]))
if __name__ == '__main__':
import time
for M in [1000,10000,100000]:
print 'M = %i' % M
N = M + 2
L = 10
x = np.random.rand(M,L)
t = np.random.rand(N,L).T
# number of non-zero entries in sparse r matrix
S = M*10
row = np.random.randint(M,size=S)
col = np.random.randint(N,size=S)
# remove duplicate rows and columns
row, col = unique_rows(np.dstack((row,col)).squeeze()).T
data = np.random.rand(row.size)
r = coo_matrix((data,(row,col)),shape=(M,N))
a2 = sparse_loop(x,t,r)
for f in [sparse_dense,sparse_loop,sparse_numba,sparse_cython]:
t0 = time.time()
try:
a = f(x,t,r)
except MemoryError:
print 'function %s ran out of memory' % f.__name__
continue
elapsed = time.time()-t0
try:
diff = abs(a-a2)
if diff.nnz > 0:
assert np.max(abs(a-a2).data) < 1e-5
except AssertionError:
print f.__name__
raise
print 'function %s took %.2f sec.' % (f.__name__,elapsed)
The cython function is a slightly modified version of #Alexander's code:
# working from tutorial at: http://docs.cython.org/src/tutorial/numpy.html
cimport numpy as np
# Turn bounds checking back on if there are ANY problems!
cimport cython
#cython.boundscheck(False) # turn of bounds-checking for entire function
def sparse_mult_c(np.ndarray[np.float64_t, ndim=2] a,
np.ndarray[np.float64_t, ndim=2] b,
np.ndarray[np.float64_t, ndim=1] data,
np.ndarray[np.int32_t, ndim=1] rows,
np.ndarray[np.int32_t, ndim=1] cols,
np.ndarray[np.float64_t, ndim=1] out):
cdef int n = rows.shape[0]
cdef int k = a.shape[1]
cdef int i,j
cdef double num
for i in range(n):
num = 0.0
for j in range(k):
num += a[rows[i],j] * b[j,cols[i]]
out[i] = data[i]*num

Based on the extra information on the comments, I think what's throwing you off is the call to np.unique. Try the following approach:
import numpy as np
import scipy.sparse as sps
from numpy.core.umath_tests import inner1d
n = 100000
x = np.random.rand(n, 10)
theta = np.random.rand(n, 10)
rows = np.arange(n)
cols = np.arange(n)
np.random.shuffle(rows)
np.random.shuffle(cols)
def sparse_multiply(x, theta, rows, cols):
data = inner1d(x[rows], theta[cols])
return sps.coo_matrix((data, (rows, cols)),
shape=(x.shape[0], theta.shape[0]))
I get the following timings:
n = 1000
%timeit sparse_multiply(x, theta, rows, cols)
1000 loops, best of 3: 465 us per loop
n = 10000
%timeit sparse_multiply(x, theta, rows, cols)
100 loops, best of 3: 4.29 ms per loop
n = 100000
%timeit sparse_multiply(x, theta, rows, cols)
10 loops, best of 3: 61.5 ms per loop
And of course, with n = 100:
>>> np.allclose(sparse_multiply(x, theta, rows, cols).toarray()[rows, cols],
x.dot(theta.T)[rows, cols])
>>> True

Haven't tested Jaime's answer yet (thanks again!), but I implemented another answer that works in the meantime using cython.
file sparse_mult_c.pyx:
# working from tutorial at: http://docs.cython.org/src/tutorial/numpy.html
cimport numpy as np
# Turn bounds checking back on if there are ANY problems!
cimport cython
#cython.boundscheck(False) # turn of bounds-checking for entire function
def sparse_mult_c(np.ndarray[np.float64_t, ndim=2] a,
np.ndarray[np.float64_t, ndim=2] b,
np.ndarray[np.int32_t, ndim=1] rows,
np.ndarray[np.int32_t, ndim=1] cols,
np.ndarray[np.float64_t, ndim=1] C ):
cdef int n = rows.shape[0]
cdef int k = a.shape[1]
cdef int i,j
for i in range(n):
for j in range(k):
C[i] += a[rows[i],j] * b[j,cols[i]]
Then compile it as per http://docs.cython.org/src/userguide/tutorial.html
Then in my python code, I include the following:
def sparse_mult(a, b, coords):
#a,b are np.ndarrays
from sparse_mult_c import sparse_mult_c
rows, cols = coords
C = np.zeros(rows.shape[0])
sparse_mult_c(a,b,rows,cols,C)
return sp.coo_matrix( (C,coords), (a.shape[0],b.shape[1]) )
This works fully sparse and also runs faster than even the original (memory-inefficient for me) solution.

numpy template matching using matrix multiplications

I am trying to match a template with a binary image (only black and white) by shifting the template along the image. And return the minimum distance between the template and the image with the corresponding position on which this minimum distance did occur. For example:
img:
0 1 0
0 0 1
0 1 1
template:
0 1
1 1
This template matches the image best at position (1,1) and the distance will then be 0. So far things are not too difficult and I already got some code that does the trick.
def match_template(img, template):
mindist = float('inf')
idx = (-1,-1)
for y in xrange(img.shape[1]-template.shape[1]+1):
for x in xrange(img.shape[0]-template.shape[0]+1):
#calculate Euclidean distance
dist = np.sqrt(np.sum(np.square(template - img[x:x+template.shape[0],y:y+template.shape[1]])))
if dist < mindist:
mindist = dist
idx = (x,y)
return [mindist, idx]
But for images of the size I need (image among 500 x 200 pixels and template among 250 x 100) this already takes approximately 4.5 seconds, which is way too slow. And I know the same thing can be done much quicker using matrix multiplications (in matlab I believe this can be done using im2col and repmat). Can anyone explain me how to do it in python/numpy?
btw. I know there is an opencv matchTemplate function that does exactly what I need, but since I might need to alter the code slightly later on I would prefer a solution which I fully understand and can alter.
Thanks!
edit: If anyone can explain me how opencv does this in less than 0.2 seconds that would also be great. I have had a short look at the source code, but those things somehow always look quite complicated to me.
edit2: Cython code
import numpy as np
cimport numpy as np
DTYPE = np.int
ctypedef np.int_t DTYPE_t
def match_template(np.ndarray img, np.ndarray template):
cdef float mindist = float('inf')
cdef int x_coord = -1
cdef int y_coord = -1
cdef float dist
cdef unsigned int x, y
cdef int img_width = img.shape[0]
cdef int img_height = img.shape[1]
cdef int template_width = template.shape[0]
cdef int template_height = template.shape[1]
cdef int range_x = img_width-template_width+1
cdef int range_y = img_height-template_height+1
for y from 0 <= y < range_y:
for x from 0 <= x < range_x:
dist = np.sqrt(np.sum(np.square(template - img[ x:<unsigned int>(x+template_width), y:<unsigned int>(y+template_height) ]))) #calculate euclidean distance
if dist < mindist:
mindist = dist
x_coord = x
y_coord = y
return [mindist, (x_coord,y_coord)]
img = np.asarray(img, dtype=DTYPE)
template = np.asarray(template, dtype=DTYPE)
match_template(img, template)

One possible way of doing what you want is via convolution (which can be brute force or FFT). Matrix multiplications AFAIK won't work. You need to convolve your data with the template. And find the maximum (you'll also need to do some scaling to make it work properly).
xs=np.array([[0,1,0],[0,0,1],[0,1,1]])*1.
ys=np.array([[0,1],[1,1]])*1.
print scipy.ndimage.convolve(xs,ys,mode='constant',cval=np.inf)
>>> array([[ 1., 1., inf],
[ 0., 2., inf],
[ inf, inf, inf]])
print scipy.signal.fftconvolve(xs,ys,mode='valid')
>>> array([[ 1., 1.],
[ 0., 2.]])

There may be a fancy way to get this done using pure numpy/scipy magic. But it might be easier (and more understandable when you look at the code in the future) to just drop into Cython to get this done. There's a good tutorial for integrating Cython with numpy at http://docs.cython.org/src/tutorial/numpy.html.
EDIT:
I did a quick test with your Cython code and it ran ~15 sec for a 500x400 img with a 100x200 template. After some tweaks (eliminating the numpy method calls and numpy bounds checking), I got it down under 3 seconds. That may not be enough for you, but it shows the possibility.
import numpy as np
cimport numpy as np
cimport cython
from libc.math cimport sqrt
DTYPE = np.int
ctypedef np.int_t DTYPE_t
#cython.boundscheck(False)
def match_template(np.ndarray[DTYPE_t, ndim=2] img, np.ndarray[DTYPE_t, ndim=2] template):
cdef float mindist = float('inf')
cdef int x_coord = -1
cdef int y_coord = -1
cdef float dist
cdef unsigned int x, y
cdef int img_width = img.shape[0]
cdef int img_height = img.shape[1]
cdef int template_width = template.shape[0]
cdef int template_height = template.shape[1]
cdef int range_x = img_width-template_width+1
cdef int range_y = img_height-template_height+1
cdef DTYPE_t total
cdef int delta
cdef unsigned int j, k, j_plus, k_plus
for y from 0 <= y < range_y:
for x from 0 <= x < range_x:
#dist = np.sqrt(np.sum(np.square(template - img[ x:<unsigned int>(x+template_width), y:<unsigned int>(y+template_height) ]))) #calculate euclidean distance
# Do the same operations, but in plain C
total = 0
for j from 0 <= j < template_width:
j_plus = <unsigned int>x + j
for k from 0 <= k < template_height:
k_plus = <unsigned int>y + k
delta = template[j, k] - img[j_plus, k_plus]
total += delta*delta
dist = sqrt(total)
if dist < mindist:
mindist = dist
x_coord = x
y_coord = y
return [mindist, (x_coord,y_coord)]

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Cython call optimization - python

Related

Nested loops with cython for image processing

Parallelising an Exhaustive Search in Cython

Iteratively-defined Numpy Array Creation

Subset of a matrix multiplication, fast, and sparse

numpy template matching using matrix multiplications

Categories

Resources