Converting collaborative filtering code to use sparse matrices

I'm puzzling over the following problem: given two dense matrices X (m by l) and Theta (n by l), and a sparse matrix R (m by n), is there a fast way to calculate the sparse inner product R.multiply(X.dot(Theta.T)), i.e. evaluating the inner products only at the nonzero entries of R? The large dimensions are m and n (order 100000), while l is small (order 10). This is probably a fairly common operation for big data, since it shows up in the cost function of most linear regression problems, so I'd expect a solution built into scipy.sparse, but I haven't found anything obvious yet.
The naive way to do this in Python is R.multiply(X.dot(Theta.T)), but this evaluates the full m-by-n matrix X.dot(Theta.T) (order 100000**2 entries), which occupies far too much memory, and then dumps most of those entries since R is sparse.
There is a pseudo-solution already here on Stack Overflow, but it is non-sparse in one step:
def sparse_mult_notreally(a, b, coords):
    rows, cols = coords
    rows, r_idx = np.unique(rows, return_inverse=True)
    cols, c_idx = np.unique(cols, return_inverse=True)
    C = np.array(np.dot(a[rows, :], b[:, cols]))  # this operation is dense
    return sp.coo_matrix((C[r_idx, c_idx], coords), (a.shape[0], b.shape[1]))
This works fine, and fast, for me on small enough arrays, but it barfs on my big datasets with the following error:
... in sparse_mult(a, b, coords)
132 rows, r_idx = np.unique(rows, return_inverse=True)
133 cols, c_idx = np.unique(cols, return_inverse=True)
--> 134 C = np.array(np.dot(a[rows, :], b[:, cols])) # this operation is not sparse
135 return sp.coo_matrix( (C[r_idx,c_idx],coords), (a.shape[0],b.shape[1]) )
ValueError: array is too big.
A solution which IS actually sparse, but very slow, is:
def sparse_mult(a, b, coords):
    rows, cols = coords
    n = len(rows)
    C = np.array([float(a[rows[i], :] * b[:, cols[i]]) for i in range(n)])  # this is sparse, but VERY slow
    return sp.coo_matrix((C, coords), (a.shape[0], b.shape[1]))
Does anyone know a fast, fully sparse way to do this?
I profiled 4 different solutions to your problem, and it looks like the Numba JIT solution is the best for any size of array. A close second is @Alexander's Cython solution.
Here are the results (M is the number of rows in the x array):
M = 1000
function sparse_dense took 0.03 sec.
function sparse_loop took 0.07 sec.
function sparse_numba took 0.00 sec.
function sparse_cython took 0.09 sec.
M = 10000
function sparse_dense took 2.88 sec.
function sparse_loop took 0.68 sec.
function sparse_numba took 0.00 sec.
function sparse_cython took 0.01 sec.
M = 100000
function sparse_dense ran out of memory
function sparse_loop took 6.84 sec.
function sparse_numba took 0.09 sec.
function sparse_cython took 0.12 sec.
The script I used to profile these methods is:
import numpy as np
from scipy.sparse import coo_matrix
from numba import autojit, jit, float64, int32
import pyximport
pyximport.install(setup_args={"script_args": ["--compiler=mingw32"],
                              "include_dirs": np.get_include()},
                  reload_support=True)

def sparse_dense(a, b, c):
    return coo_matrix(c.multiply(np.dot(a, b)))

def sparse_loop(a, b, c):
    """Multiply sparse matrix `c` by np.dot(a, b) by looping over non-zero
    entries in `c` and using `np.dot()` for each entry."""
    N = c.size
    data = np.empty(N, dtype=float)
    for i in range(N):
        data[i] = c.data[i] * np.dot(a[c.row[i], :], b[:, c.col[i]])
    return coo_matrix((data, (c.row, c.col)), shape=(a.shape[0], b.shape[1]))

#@autojit
def _sparse_mult4(a, b, cd, cr, cc):
    N = cd.size
    data = np.empty_like(cd)
    for i in range(N):
        num = 0.0
        for j in range(a.shape[1]):
            num += a[cr[i], j] * b[j, cc[i]]
        data[i] = cd[i] * num
    return data

_fast_sparse_mult4 = \
    jit(float64[:](float64[:, :], float64[:, :], float64[:], int32[:], int32[:]))(_sparse_mult4)

def sparse_numba(a, b, c):
    """Multiply sparse matrix `c` by np.dot(a, b) using Numba's jit."""
    assert c.shape == (a.shape[0], b.shape[1])
    data = _fast_sparse_mult4(a, b, c.data, c.row, c.col)
    return coo_matrix((data, (c.row, c.col)), shape=(a.shape[0], b.shape[1]))

def sparse_cython(a, b, c):
    """Computes c.multiply(np.dot(a, b)) using Cython."""
    from sparse_mult_c import sparse_mult_c
    data = np.empty_like(c.data)
    sparse_mult_c(a, b, c.data, c.row, c.col, data)
    return coo_matrix((data, (c.row, c.col)), shape=(a.shape[0], b.shape[1]))

def unique_rows(a):
    a = np.ascontiguousarray(a)
    unique_a = np.unique(a.view([('', a.dtype)] * a.shape[1]))
    return unique_a.view(a.dtype).reshape((unique_a.shape[0], a.shape[1]))

if __name__ == '__main__':
    import time

    for M in [1000, 10000, 100000]:
        print 'M = %i' % M
        N = M + 2
        L = 10
        x = np.random.rand(M, L)
        t = np.random.rand(N, L).T
        # number of non-zero entries in sparse r matrix
        S = M * 10
        row = np.random.randint(M, size=S)
        col = np.random.randint(N, size=S)
        # remove duplicate (row, col) pairs
        row, col = unique_rows(np.dstack((row, col)).squeeze()).T
        data = np.random.rand(row.size)
        r = coo_matrix((data, (row, col)), shape=(M, N))

        # reference result, computed with the plain-Python loop
        a2 = sparse_loop(x, t, r)

        for f in [sparse_dense, sparse_loop, sparse_numba, sparse_cython]:
            t0 = time.time()
            try:
                a = f(x, t, r)
            except MemoryError:
                print 'function %s ran out of memory' % f.__name__
                continue
            elapsed = time.time() - t0
            try:
                diff = abs(a - a2)
                if diff.nnz > 0:
                    assert np.max(abs(a - a2).data) < 1e-5
            except AssertionError:
                print f.__name__
                raise
            print 'function %s took %.2f sec.' % (f.__name__, elapsed)
The Cython function is a slightly modified version of @Alexander's code:
# working from tutorial at: http://docs.cython.org/src/tutorial/numpy.html
cimport numpy as np

# Turn bounds checking back on if there are ANY problems!
cimport cython
@cython.boundscheck(False)  # turn off bounds-checking for entire function
def sparse_mult_c(np.ndarray[np.float64_t, ndim=2] a,
                  np.ndarray[np.float64_t, ndim=2] b,
                  np.ndarray[np.float64_t, ndim=1] data,
                  np.ndarray[np.int32_t, ndim=1] rows,
                  np.ndarray[np.int32_t, ndim=1] cols,
                  np.ndarray[np.float64_t, ndim=1] out):
    cdef int n = rows.shape[0]
    cdef int k = a.shape[1]
    cdef int i, j
    cdef double num
    for i in range(n):
        num = 0.0
        for j in range(k):
            num += a[rows[i], j] * b[j, cols[i]]
        out[i] = data[i] * num
Based on the extra information in the comments, I think what's throwing you off is the call to np.unique. Try the following approach:
import numpy as np
import scipy.sparse as sps
from numpy.core.umath_tests import inner1d

n = 100000
x = np.random.rand(n, 10)
theta = np.random.rand(n, 10)
rows = np.arange(n)
cols = np.arange(n)
np.random.shuffle(rows)
np.random.shuffle(cols)

def sparse_multiply(x, theta, rows, cols):
    data = inner1d(x[rows], theta[cols])
    return sps.coo_matrix((data, (rows, cols)),
                          shape=(x.shape[0], theta.shape[0]))
I get the following timings:
n = 1000
%timeit sparse_multiply(x, theta, rows, cols)
1000 loops, best of 3: 465 us per loop
n = 10000
%timeit sparse_multiply(x, theta, rows, cols)
100 loops, best of 3: 4.29 ms per loop
n = 100000
%timeit sparse_multiply(x, theta, rows, cols)
10 loops, best of 3: 61.5 ms per loop
And of course, with n = 100:
>>> np.allclose(sparse_multiply(x, theta, rows, cols).toarray()[rows, cols],
...             x.dot(theta.T)[rows, cols])
True
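One caveat for newer NumPy: inner1d comes from numpy.core.umath_tests, an internal module that has since been deprecated. If it's unavailable, the same row-wise inner products can be computed with, e.g.:

data = np.einsum('ij,ij->i', x[rows], theta[cols])
# or equivalently: (x[rows] * theta[cols]).sum(axis=1)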
I haven't tested Jaime's answer yet (thanks again!), but in the meantime I implemented another solution that works, using Cython.
file sparse_mult_c.pyx:
# working from tutorial at: http://docs.cython.org/src/tutorial/numpy.html
cimport numpy as np

# Turn bounds checking back on if there are ANY problems!
cimport cython
@cython.boundscheck(False)  # turn off bounds-checking for entire function
def sparse_mult_c(np.ndarray[np.float64_t, ndim=2] a,
                  np.ndarray[np.float64_t, ndim=2] b,
                  np.ndarray[np.int32_t, ndim=1] rows,
                  np.ndarray[np.int32_t, ndim=1] cols,
                  np.ndarray[np.float64_t, ndim=1] C):
    cdef int n = rows.shape[0]
    cdef int k = a.shape[1]
    cdef int i, j
    for i in range(n):
        for j in range(k):
            C[i] += a[rows[i], j] * b[j, cols[i]]
Then compile it as per http://docs.cython.org/src/userguide/tutorial.html
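A minimal build script for that step might look like the following (a sketch, assuming the file is saved as sparse_mult_c.pyx; the NumPy include path is needed because the module cimports numpy):

# setup.py -- build with: python setup.py build_ext --inplace
from distutils.core import setup
from Cython.Build import cythonize
import numpy

setup(
    ext_modules=cythonize("sparse_mult_c.pyx"),
    include_dirs=[numpy.get_include()],
)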
Then in my python code, I include the following:
def sparse_mult(a, b, coords):
#a,b are np.ndarrays
from sparse_mult_c import sparse_mult_c
rows, cols = coords
C = np.zeros(rows.shape[0])
sparse_mult_c(a,b,rows,cols,C)
return sp.coo_matrix( (C,coords), (a.shape[0],b.shape[1]) )
This runs fully sparse, and also faster than even the original (memory-inefficient) solution.
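For example, wiring it into the original problem's shapes (a sketch; the coordinate arrays must be int32 to match the Cython signature, and b is passed as Theta.T so its shape is l by n):

rows = R.row.astype(np.int32)   # R in COO format
cols = R.col.astype(np.int32)
M = sparse_mult(X, Theta.T, (rows, cols))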
Related
I am trying to compute the commutation matrix in Python for a large dataset. I wrote the following code, but found that it performs terribly (and runs into memory errors for examples of around 500 by 500). In my code, a and b are equivalent to the m and n notation in the linked Wikipedia page. Can anyone provide a quicker and more memory-efficient alternative to my current attempt?
import numpy as np
import scipy.sparse

def vec(matrix):
    # Return the vectorised matrix
    return matrix.transpose().reshape(matrix.shape[0] * matrix.shape[1], 1)

def commutation(a, b):
    # Example matrix with unique elements
    m = np.arange(a * b).reshape(a, b)
    # Vec(m)
    vecm = vec(m)
    vecm = vecm.reshape(vecm.shape[0])
    # Get row inds
    rowInds = np.arange(a * b)
    # Get column inds
    colInds = np.argsort(vecm)
    colInds = colInds.reshape(colInds.shape[0])
    # Work out the mapping between them.
    K = scipy.sparse.csr_matrix((np.ones(a * b), (rowInds, colInds)))
    return K
Below is an improved version of your code:
import numpy as np
from scipy.sparse import csr_matrix

def vec(A):
    m, n = A.shape[0], A.shape[1]
    return A.reshape(m*n, order='F')

def commutation_matrix_sp(A):
    m, n = A.shape[0], A.shape[1]
    row = np.arange(m*n)
    col = row.reshape((m, n), order='F').ravel()
    data = np.ones(m*n, dtype=np.int8)
    K = csr_matrix((data, (row, col)), shape=(m*n, m*n))
    return K
Test:
A = np.random.rand(500, 500)
K = commutation_matrix_sp(A)

print(f'{K.data.nbytes/2**20:.2f} MB')
# 0.24 MB

print(np.all(K @ vec(A) == vec(A.T)))
# True
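To put that 0.24 MB in perspective: with a 500 by 500 input, m*n = 250000, so a dense commutation matrix would hold 250000**2 = 6.25e10 entries, roughly 58 GiB even at one byte per entry, whereas the sparse permutation matrix stores exactly m*n = 250000 nonzeros.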
I'm trying to find the fastest way to get the functionality of NumPy's where statement on a 2D NumPy array; namely, retrieving the indices where a condition is met. It is simply much slower than in other languages I have used (e.g., IDL, Matlab).
I have cythonized a function that marches through the array in nested for-loops. There is almost an order of magnitude increase in speed, but I would like to increase performance even more, if possible.
TEST.py:
from cython_where import *
import time
import numpy as np
data = np.zeros((2600,5200))
data[100:200,100:200] = 10
t0 = time.time()
inds,ct = cython_where(data,'EQ',10)
print time.time() - t0
t1 = time.time()
tmp = np.where(data == 10)
print time.time() - t1
My cython_where.pyx program:
from __future__ import division
import numpy as np
cimport numpy as np
cimport cython

DTYPE1 = np.float
ctypedef np.float_t DTYPE1_t

DTYPE2 = np.int
ctypedef np.int_t DTYPE2_t

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def cython_where(np.ndarray[DTYPE1_t, ndim=2] data, oper, DTYPE1_t val):
    assert data.dtype == DTYPE1
    cdef int xmax = data.shape[0]
    cdef int ymax = data.shape[1]
    cdef unsigned int x, y
    cdef int count = 0
    cdef np.ndarray[DTYPE2_t, ndim=1] xind = np.zeros(100000, dtype=int)
    cdef np.ndarray[DTYPE2_t, ndim=1] yind = np.zeros(100000, dtype=int)
    if oper == 'EQ' or oper == 'eq':  # I didn't want to include GT, GE, LT, LE here
        for x in xrange(xmax):
            for y in xrange(ymax):
                if data[x, y] == val:
                    xind[count] = x
                    yind[count] = y
                    count += 1
    return tuple([xind[0:count], yind[0:count]]), count
Output of TEST.py:
$ python TEST.py
0.0139019489288
0.0982608795166
I've also tried numpy's argwhere, which is about as fast as where. I'm pretty new to numpy and cython, so if you have any other ideas to really increase performance, I'm all ears!
Contributions:
NumPy can be sped up by working on the flattened array, for a 4x gain:
%timeit np.where(data==10)
1 loops, best of 3: 105 ms per loop
%timeit np.unravel_index(np.where(data.ravel()==10),data.shape)
10 loops, best of 3: 26.0 ms per loop
I think you can optimize your Cython code with the same idea, avoiding the computation of k = i*ncol + j for each cell.
Numba gives a simple alternative:
import numpy as np
from numba import jit

@jit(nopython=True)
def numbaeq(flatdata, x, nrow, ncol):
    size = ncol * nrow
    # preallocated index arrays; indices are integers, so use an integer dtype
    ix = np.empty(size, dtype=np.int64)
    jx = np.empty(size, dtype=np.int64)
    count = 0
    k = 0
    while k < size:
        if flatdata[k] == x:
            ix[count] = k // ncol
            jx[count] = k % ncol
            count += 1
        k += 1
    return ix[:count], jx[:count]

def whereequal(data, x):
    return numbaeq(data.ravel(), x, *data.shape)
which gives:
%timeit whereequal(data,10)
10 loops, best of 3: 20.2 ms per loop
The optimisation is not great for Numba on such a problem; it stays under the Cython performance.
k//ncol and k%ncol can be computed at the same time with an optimized divmod operation.
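For example, in plain Python the two index computations collapse into one call (whether Numba's nopython mode benefits from this is a separate question):

# quotient and remainder computed together
ix[count], jx[count] = divmod(k, ncol)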
The ultimate steps are assembly language and parallelisation, but those are other sports.
I have written a Python function that computes pairwise electromagnetic interactions between a largish number (N ~ 10^3) of particles and stores the results in an NxN complex128 ndarray. It runs, but it is the slowest part of a larger program, taking about 40 seconds when N=900 [corrected]. The original code looks like this:
import numpy as np

def interaction(s, alpha, kprop):  # s is an Nx3 real array
                                   # alpha is complex
                                   # kprop is float
    ndipoles = s.shape[0]

    Amat = np.zeros((ndipoles, 3, ndipoles, 3), dtype=np.complex128)
    I = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    im = complex(0, 1)

    k2 = kprop * kprop

    for i in range(ndipoles):
        xi = s[i, :]
        for j in range(ndipoles):
            if i != j:
                xj = s[j, :]
                dx = xi - xj
                R = np.sqrt(dx.dot(dx))
                n = dx / R
                kR = kprop * R
                kR2 = kR * kR
                A = (1. / kR2) - im / kR
                nxn = np.outer(n, n)
                nxn = (3 * A - 1) * nxn + (1 - A) * I
                nxn *= -alpha * (k2 * np.exp(im * kR)) / R
            else:
                nxn = I
            Amat[i, :, j, :] = nxn

    return Amat.reshape((3 * ndipoles, 3 * ndipoles))
I had never previously used Cython, but that seemed like a good place to start in my effort to speed things up, so I pretty much blindly adapted the techniques I found in online tutorials. I got some speedup (30 seconds vs. 40 seconds), but not nearly as dramatic as I expected, so I'm wondering whether I'm doing something wrong or am missing a critical step. The following is my best attempt at cythonizing the above routine:
import numpy as np
cimport numpy as np

DTYPE = np.complex128
ctypedef np.complex128_t DTYPE_t

def interaction(np.ndarray s, DTYPE_t alpha, float kprop):
    cdef float k2 = kprop * kprop
    cdef int i, j
    cdef np.ndarray xi, xj, dx, n, nxn
    cdef float R, kR, kR2
    cdef DTYPE_t A

    cdef int ndipoles = s.shape[0]
    cdef np.ndarray Amat = np.zeros((ndipoles, 3, ndipoles, 3), dtype=DTYPE)
    cdef np.ndarray I = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    cdef DTYPE_t im = complex(0, 1)

    for i in range(ndipoles):
        xi = s[i, :]
        for j in range(ndipoles):
            if i != j:
                xj = s[j, :]
                dx = xi - xj
                R = np.sqrt(dx.dot(dx))
                n = dx / R
                kR = kprop * R
                kR2 = kR * kR
                A = (1. / kR2) - im / kR
                nxn = np.outer(n, n)
                nxn = (3 * A - 1) * nxn + (1 - A) * I
                nxn *= -alpha * (k2 * np.exp(im * kR)) / R
            else:
                nxn = I
            Amat[i, :, j, :] = nxn

    return Amat.reshape((3 * ndipoles, 3 * ndipoles))
The real power of NumPy lies in performing an operation across a huge number of elements in a vectorized manner, instead of applying that operation in chunks spread across loops. In your case, you are using two nested loops and one IF conditional statement. I would propose extending the dimensions of the intermediate arrays, which brings NumPy's powerful broadcasting capability into play; the same operations can then be applied to all elements in one go instead of to small chunks of data within the loops.
For extending the dimensions, None/np.newaxis can be used. So, the vectorized implementation following that premise would look like this -
def vectorized_interaction(s, alpha, kprop):
    im = complex(0, 1)
    I = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    k2 = kprop * kprop
    N = s.shape[0]

    # Vectorized calculations for dx, R, n, kR, A
    sd = s[:, None] - s
    Rv = np.sqrt((sd**2).sum(2))
    nv = sd / Rv[:, :, None]
    kRv = Rv * kprop
    Av = (1. / (kRv * kRv)) - im / kRv

    # Vectorized calculation for: "nxn = np.outer(n, n)"
    nxnv = nv[:, :, :, None] * nv[:, :, None, :]

    # Vectorized calculation for: "(3*A-1)*nxn + (1-A)*I"
    P = (3 * Av[:, :, None, None] - 1) * nxnv + (1 - Av[:, :, None, None]) * I

    # Vectorized calculation for: "-alpha*(k2*np.exp(im*kR))/R"
    multv = -alpha * (k2 * np.exp(im * kRv)) / Rv

    # Vectorized calculation for: "nxn *= -alpha*(k2*np.exp(im*kR))/R"
    outv = P * multv[:, :, None, None]

    # Simulate the ELSE part of the conditional "if i != j:"
    # by setting the diagonal blocks to I
    outv[np.eye(N, dtype=bool)] = I
    return outv.transpose(0, 2, 1, 3).reshape(N * 3, -1)
Runtime tests and output verification -
Case #1:
In [703]: N = 10
...: s = np.random.rand(N,3) + complex(0,1)*np.random.rand(N,3)
...: alpha = 3j
...: kprop = 5.4
...:
In [704]: out_org = interaction(s,alpha,kprop)
...: out_vect = vectorized_interaction(s,alpha,kprop)
...: print np.allclose(np.real(out_org),np.real(out_vect))
...: print np.allclose(np.imag(out_org),np.imag(out_vect))
...:
True
True
In [705]: %timeit interaction(s,alpha,kprop)
100 loops, best of 3: 7.6 ms per loop
In [706]: %timeit vectorized_interaction(s,alpha,kprop)
1000 loops, best of 3: 304 µs per loop
Case #2:
In [707]: N = 100
...: s = np.random.rand(N,3) + complex(0,1)*np.random.rand(N,3)
...: alpha = 3j
...: kprop = 5.4
...:
In [708]: out_org = interaction(s,alpha,kprop)
...: out_vect = vectorized_interaction(s,alpha,kprop)
...: print np.allclose(np.real(out_org),np.real(out_vect))
...: print np.allclose(np.imag(out_org),np.imag(out_vect))
...:
True
True
In [709]: %timeit interaction(s,alpha,kprop)
1 loops, best of 3: 826 ms per loop
In [710]: %timeit vectorized_interaction(s,alpha,kprop)
100 loops, best of 3: 14 ms per loop
Case #3:
In [711]: N = 900
...: s = np.random.rand(N,3) + complex(0,1)*np.random.rand(N,3)
...: alpha = 3j
...: kprop = 5.4
...:
In [712]: out_org = interaction(s,alpha,kprop)
...: out_vect = vectorized_interaction(s,alpha,kprop)
...: print np.allclose(np.real(out_org),np.real(out_vect))
...: print np.allclose(np.imag(out_org),np.imag(out_vect))
...:
True
True
In [713]: %timeit interaction(s,alpha,kprop)
1 loops, best of 3: 1min 7s per loop
In [714]: %timeit vectorized_interaction(s,alpha,kprop)
1 loops, best of 3: 1.59 s per loop
I was wondering if I'm missing something when using Cython with Numpy because I haven't seen much of an improvement. I wrote this code as an example.
Naive version:
import numpy as np
from skimage.util import view_as_windows

it = 16
arr = np.arange(1000 * 1000, dtype=np.float64).reshape(1000, 1000)
windows = view_as_windows(arr, (it, it), it)
container = np.zeros((windows.shape[0], windows.shape[1]))

def test(windows):
    for i in range(windows.shape[0]):
        for j in range(windows.shape[1]):
            container[i, j] = np.mean(windows[i, j])
    return container
%%timeit
test(windows)
1 loops, best of 3: 131 ms per loop
Cythonized version:
%%cython --annotate
import numpy as np
cimport numpy as np
from skimage.util import view_as_windows
import cython
cdef int step = 16
arr = np.arange(1000*1000, dtype=np.float64).reshape(1000,1000)
windows = view_as_windows(arr, (step, step), step)
#cython.boundscheck(False)
def cython_test(np.ndarray[np.float64_t, ndim=4] windows):
cdef np.ndarray[np.float64_t, ndim=2] container = np.zeros((windows.shape[0], windows.shape[1]),dtype=np.float64)
cdef int i, j
I = windows.shape[0]
J = windows.shape[1]
for i in range(I):
for j in range(J):
container[i,j] = np.mean(windows[i,j])
return container
%timeit cython_test(windows)
10 loops, best of 3: 126 ms per loop
As you can see, there is a very modest improvement, so maybe I'm doing something wrong. By the way, the annotation that Cython produces looks like the following:
As you can see, the numpy lines have a yellow background even after including the efficient indexing syntax np.ndarray[DTYPE_t, ndim=2]. Why?
By the way, in my view the ideal outcome is being able to use most numpy functions but still get some reasonable improvement after taking advantage of efficient indexing syntax or maybe memory views as in HYRY's answer.
UPDATE
It seems I'm not doing anything wrong in the code I posted above, and the yellow background in some lines is normal, so I was left wondering the following: in which situations can I get a benefit from typing cdef np.ndarray[np.float64_t, ndim=2] in front of NumPy arrays? I suppose there are specific instances where this is helpful; otherwise, there wouldn't be much purpose in doing it.
You need to implement the mean() function yourself to speed up the code, because the overhead of calling a NumPy function is very high.
import numpy as np
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def cython_test(double[:, :, :, :] windows):
    cdef double[:, ::1] container
    cdef int i, j, k, l
    cdef int n0, n1, n2, n3
    cdef double inv_n
    cdef double s
    n0, n1, n2, n3 = windows.base.shape
    container = np.zeros((n0, n1))
    inv_n = 1.0 / (n2 * n3)
    for i in range(n0):
        for j in range(n1):
            s = 0
            for k in range(n2):
                for l in range(n3):
                    s += windows[i, j, k, l]
            container[i, j] = s * inv_n
    return container.base
Here is the %timeit results:
python_test(windows): 63.7 ms
cython_test(windows): 1.24 ms
np.mean(windows, axis=(2, 3)): 2.66 ms
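For completeness, the last line above is the pure-NumPy reduction over the two window axes, which needs no Cython at all and lands within roughly 2x of the typed loop:

container = np.mean(windows, axis=(2, 3))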
I need to quickly compute a matrix whose entries are obtained by convolving a filter with a vector for each row, subsampling the entries of the resulting vector, and then taking the dot product of the result with another vector. Specifically, I want to compute
M = [conv(e_j, f)*P_i*v_i ]_{i,j},
where i varies from 1 to n and j varies from 1 to m. Here e_j is the indicator (row) vector of size n with a one only in column j, f is the filter of length s, P_i is an (n+s-1)-by-k matrix which samples the appropriate k entries from the convolution, and v_i is a column vector of length k.
It takes O(n*s) operations to compute each entry of M, so O(n*s*n*m) overall to compute M. For n=6, m=7, s=3, one core of my computer (8 GFLOPs) should be able to compute M in roughly 0.094 microseconds. Yet my very simple Cython implementation, following the example given in the Cython documentation, takes more than 2 milliseconds to compute an example with those parameters. That is about 4 orders of magnitude difference!
Here is a shar file with the Cython implementation and test code. Copy and paste it to a file and run 'bash <fname>' in a clean directory to get the code, then run 'bash ./test.sh' to see the abysmal performance.
cat > fastcalcM.pyx <<'EOF'
import numpy as np
cimport numpy as np
cimport cython
from scipy.signal import convolve

DTYPE = np.float32
ctypedef np.float32_t DTYPE_t

@cython.boundscheck(False)
def calcM(np.ndarray[DTYPE_t, ndim=1, negative_indices=False] filtertaps,
          int n, int m,
          np.ndarray[np.int_t, ndim=2, negative_indices=False] keep_indices,
          np.ndarray[DTYPE_t, ndim=2, negative_indices=False] V):
    """ Computes an n-by-m matrix M whose entries satisfy
        M_{i,j} = [conv(e_j, f)^T * P_i * v_i],
        where v_i^T is the i-th row of V, and P_i samples the entries from
        conv(e_j, f)^T indicated by the i-th row of the keep_indices matrix """
    cdef int k = keep_indices.shape[1]
    cdef np.ndarray M = np.zeros((n, m), dtype=DTYPE)
    cdef np.ndarray ej = np.zeros((m,), dtype=DTYPE)
    cdef np.ndarray convolution
    cdef int rowidx, colidx, kidx

    for rowidx in range(n):
        for colidx in range(m):
            ej[colidx] = 1
            convolution = convolve(ej, filtertaps, mode='full')
            for kidx in range(k):
                M[rowidx, colidx] += convolution[keep_indices[rowidx, kidx]] * V[rowidx, kidx]
            ej[colidx] = 0

    return M
EOF
#-----------------------------------------------------------------------------
cat > test_calcM.py << 'EOF'
import numpy as np
from fastcalcM import calcM

filtertaps = np.array([-1, 2, -1]).astype(np.float32)
n, m = 6, 7
keep_indices = np.array([[1, 3],
                         [4, 5],
                         [2, 2],
                         [5, 5],
                         [3, 4],
                         [4, 5]]).astype(np.int)
V = np.random.random_integers(-5, 5, size=(6, 2)).astype(np.float32)

print calcM(filtertaps, n, m, keep_indices, V)
EOF
#-----------------------------------------------------------------------------
cat > test.sh << 'EOF'
python setup.py build_ext --inplace
echo -e "%run test_calcM\n%timeit calcM(filtertaps, n, m, keep_indices, V)" > script.ipy
ipython script.ipy
EOF
#-----------------------------------------------------------------------------
cat > setup.py << 'EOF'
from distutils.core import setup
from Cython.Build import cythonize
import numpy

setup(
    name="Fast convolutions",
    include_dirs=[numpy.get_include()],
    ext_modules=cythonize("fastcalcM.pyx")
)
EOF
I thought maybe the call to SciPy's convolve might be the culprit (I'm not certain that Cython and SciPy play well together), so I implemented my own convolution code à la the same example in the Cython documentation, but this resulted in the overall code being about 10 times slower.
Any ideas on how to get closer to the theoretically possible speed, or reasons why the difference is so great?
For one thing, the typing of M, ej and convolution doesn't allow fast indexing. The typing you've done is not particularly helpful at all, actually.
But it doesn't matter, because you have two overheads. The first is converting between Cython and Python types. You should keep untyped arrays around if you want to pass them to Python a lot, to prevent the need to convert. Just moving this to Python sped it up for that reason (1ms → 0.65μs).
Then I profiled it:
Line # Hits Time Per Hit % Time Line Contents
==============================================================
15 def calcM(filtertaps, n, m, keep_indices, V):
16 4111 3615 0.9 0.1 k = keep_indices.shape[1]
17 4111 8024 2.0 0.1 M = np.zeros((n, m), dtype=np.float32)
18 4111 6090 1.5 0.1 ej = np.zeros((m,), dtype=np.float32)
19
20 28777 18690 0.6 0.3 for rowidx in range(n):
21 197328 123284 0.6 2.2 for colidx in range(m):
22 172662 112348 0.7 2.0 ej[colidx] = 1
23 172662 4076225 23.6 73.6 convolution = convolve(ej, filtertaps, mode='full')
24 517986 395513 0.8 7.1 for kidx in range(k):
25 345324 668309 1.9 12.1 M[rowidx, colidx] += convolution[keep_indices[rowidx, kidx]] * V[rowidx, kidx]
26 172662 120271 0.7 2.2 ej[colidx] = 0
27
28 4111 2374 0.6 0.0 return M
Before you consider anything else, deal with convolve.
Why is convolve slow? Well, it's got a lot of overhead. numpy/scipy normally does; they're best for large datasets. If you know the size of your array is going to stay small, just reimplement convolve in Cython.
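For reference, a direct full-mode convolution is only a few lines. A minimal pure-Python sketch of the loop one would port to typed Cython (the function name is my own):

import numpy as np

def convolve_full(x, f):
    # Direct 'full'-mode 1D convolution: out[i + j] += x[i] * f[j],
    # i.e. out[k] = sum over i of x[i] * f[k - i].
    n, s = len(x), len(f)
    out = np.zeros(n + s - 1, dtype=x.dtype)
    for i in range(n):
        for j in range(s):
            out[i + j] += x[i] * f[j]
    return out

It's O(len(x) * len(f)) work, but there is no per-call library overhead once the loops are typed.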
Oh, and try to use the buffer syntax. Use DTYPE[:, :] for a 2D array, DTYPE[:] for a 1D array, etc. It's the memoryview protocol, and it's way better. There are cases where it has more overhead, but those are typically possible to work around, and it's way better in most other ways.
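A minimal illustration of that syntax (a Cython sketch with hypothetical names):

cimport cython

@cython.boundscheck(False)
def scale_rows(double[:, :] a, double[:] w):
    # typed memoryviews: a is 2D, w is 1D; indexing compiles to plain C array access
    cdef Py_ssize_t i, j
    for i in range(a.shape[0]):
        for j in range(a.shape[1]):
            a[i, j] *= w[i]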
EDIT:
You can try (recursively) inlining the scipy function:
import numpy as np
from scipy.signal.sigtools import _correlateND

def calcM(filtertaps, n, m, keep_indices, V):
    k = keep_indices.shape[1]
    M = np.zeros((n, m), dtype=np.float32)
    ej = np.zeros((m,), dtype=np.float32)

    slice_obj = [slice(None, None, -1)] * len(filtertaps.shape)
    sliced_filtertaps_view = filtertaps[slice_obj]

    ps = ej.shape[0] + sliced_filtertaps_view.shape[0] - 1
    in1zpadded = np.zeros(ps, ej.dtype)
    out = np.empty(ps, ej.dtype)

    for rowidx in range(n):
        for colidx in range(m):
            in1zpadded[colidx] = 1
            convolution = _correlateND(in1zpadded, sliced_filtertaps_view, out, 2)
            for kidx in range(k):
                M[rowidx, colidx] += convolution[keep_indices[rowidx, kidx]] * V[rowidx, kidx]
            in1zpadded[colidx] = 0

    return M
Note that this uses private implementation details.
This is tailored for the particular dimensions, so I don't know if it'll work on your actual data. But it removes the vast majority of overhead. You can then improve this by typing things again:
import numpy as np
cimport numpy as np
from scipy.signal.sigtools import _correlateND

DTYPE = np.float32
ctypedef np.float32_t DTYPE_t

def calcM(filtertaps, int n, int m, np.int_t[:, :] t_keep_indices, DTYPE_t[:, :] t_V):
    cdef int rowidx, colidx, kidx, k
    cdef DTYPE_t[:, :] t_M
    cdef DTYPE_t[:] t_in1zpadded, t_convolution

    k = t_keep_indices.shape[1]
    t_M = M = np.zeros((n, m), dtype=np.float32)
    ej = np.zeros((m,), dtype=np.float32)

    slice_obj = [slice(None, None, -1)] * len(filtertaps.shape)
    sliced_filtertaps_view = filtertaps[slice_obj]

    ps = ej.shape[0] + sliced_filtertaps_view.shape[0] - 1
    t_in1zpadded = in1zpadded = np.zeros(ps, ej.dtype)
    out = np.empty(ps, ej.dtype)

    for rowidx in range(n):
        for colidx in range(m):
            t_in1zpadded[colidx] = 1
            t_convolution = _correlateND(in1zpadded, sliced_filtertaps_view, out, 2)
            for kidx in range(k):
                t_M[rowidx, colidx] += t_convolution[<int>t_keep_indices[rowidx, kidx]] * t_V[rowidx, kidx]
            t_in1zpadded[colidx] = 0

    return M
It's over 10x as fast, but not as fast as your pie-in-the-sky estimate. Then again, that calculation was a bit bogus to begin with ;).