I have a list of sparse symmetric matrices sigma such that
len(sigma) == N
and for all i, j, k,
sigma[i].shape[0] == sigma[i].shape[1] == m   # Square
sigma[i][j, k] == sigma[i][k, j]              # Symmetric
I have an indexing array P such that
P.shape[0] == N
P.shape[1] == k
My objective is to extract the k x k dense submatrices of sigma[i] using the indexing given by P[i,:]. This can be done as follows:
sub_matrices = np.empty([N, k, k])
for i in range(N):
    sub_matrices[i,:,:] = sigma[i][np.ix_(P[i,:], P[i,:])].todense()
Note however that while k is small, N (and m) are very large. If the sparse symmetric matrices are stored in CSR format this takes a very long time. I feel there must be a better solution. For example is there a sparse format that lends itself well to symmetric matrices that need to be sliced on both dimensions?
I am using Python but would be open to any C library suggestions that I could interface using Cython.
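For reference, one variant of the baseline that comes to mind is to slice the rows first (row slicing is cheap for CSR) and only then take the columns of the much smaller k x m block; I have not benchmarked whether this actually beats np.ix_, so it is only a sketch:
sub_matrices = np.empty([N, k, k])
for i in range(N):
    rows = sigma[i][P[i, :], :]                         # row slicing is cheap for CSR
    sub_matrices[i, :, :] = rows[:, P[i, :]].toarray()  # column take on the small k x m block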
EXTRA
Note that my current Cython approach is as follows:
cimport cython
import numpy as np
cimport numpy as np

@cython.boundscheck(False) # turn off bounds-checking for entire function
cpdef sparse_slice_fast_cy(sigma,
                           long[:,:] P,
                           double[:,:,:] sub_matrices):
    """
    Inputs:
        sigma: A list (N,) of sparse sp.csr_matrix (m x m)
        P: A 2D array of integers (N, k)
        sub_matrices: A 3D array of doubles (N, k, k) containing the slicing
    """
    # Create variables for keeping code tidy
    cdef long N = P.shape[0]
    cdef long k = P.shape[1]

    cdef long i
    cdef long j
    cdef long index_pointer
    cdef long sparse_row_pointer

    # Create objects for holding sparse matrix data
    cdef double[:] data
    cdef long[:] indices
    cdef long[:] indptr

    # Object for the ordered P
    cdef long[:] perm

    # Make sure sub_matrices is all 0
    sub_matrices[:] = 0

    for i in range(N):
        # Sort the P
        perm = np.argsort(P[i,:])

        # Get the sparse matrix values
        data    = sigma[i].data
        indices = sigma[i].indices.astype(long)
        indptr  = sigma[i].indptr.astype(long)

        for j in range(k):
            # Loop over row P[i, perm[j]] in sigma searching for values
            # in P[i, :] vector i.e. compare
            #     sigma[P[i, perm[j]], :]
            # against
            #     P[i, :]
            # To do this we need our sparse row vector with columns
            #     indices[indptr[P[i, perm[j]]] : indptr[P[i, perm[j]] + 1]]
            # and data/values
            #     data[indptr[P[i, perm[j]]] : indptr[P[i, perm[j]] + 1]]
            # which comes from the csr matrix format.
            # We also need our sorted indexing vector
            #     P[i, perm[:]]
            # We begin by pointing at the top of both
            # our vectors and gradually move down them. In the event of
            # an equality we add the data to sub_matrices[i,:,:] and
            # increment the INDEXING VECTOR pointer, not the sparse
            # row vector pointer, as there can be multiple values that
            # are the same in the indexing vector but not the sparse row
            # column vector (only 1 column can appear in 1 row!).
            index_pointer = 0
            sparse_row_pointer = indptr[P[i, perm[j]]]

            while ((index_pointer < k) and (sparse_row_pointer < indptr[P[i, perm[j]] + 1])):
                if indices[sparse_row_pointer] == P[i, perm[index_pointer]]:
                    # We can add data to sub_matrices
                    sub_matrices[i, perm[j], perm[index_pointer]] = \
                        data[sparse_row_pointer]

                    # Only increment the index pointer
                    index_pointer += 1
                elif indices[sparse_row_pointer] > P[i, perm[index_pointer]]:
                    # Need to increment index pointer
                    index_pointer += 1
                else:
                    # Need to increment sparse row pointer
                    sparse_row_pointer += 1
I believe that np.argsort may be inefficient when called often on relatively small vectors and would like to swap it for a C implementation. I also don't take advantage of parallel processing that could potentially speed it up over the N sparse matrices. Unfortunately, as there are Python coercions inside the outer loop, I don't know how I can use prange.
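As a rough way of quantifying the argsort overhead before rewriting it in C, a timeit check along these lines (k=100 and m=10000 as in the test script below) should give the per-call cost; I have not included measured numbers here:
import timeit
setup = 'import numpy as np; p = np.random.randint(0, 10000, size=100)'
per_call = timeit.timeit('np.argsort(p)', setup=setup, number=100000) / 100000
print('argsort, k=100: {:.2e} s per call'.format(per_call))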
Another point to note is that the Cython approach seems to use a HUGE amount of memory but I have no idea where it's being allocated.
Latest Version
As per the suggestions of ead, below is the latest version of the Cython code
cimport cython
import numpy as np
cimport numpy as np

@cython.boundscheck(False) # turn off bounds-checking for entire function
cpdef sparse_slice_fast_cy(sigma,
                           np.ndarray[np.int32_t, ndim=2] P,
                           np.float64_t[:,:,:] sub_matrices,
                           int symmetric):
    """
    Inputs:
        sigma: A list (N,) of sparse sp.csr_matrix (m x m)
        P: A 2D array of integers (N, k)
        sub_matrices: A 3D array of doubles (N, k, k) containing the slicing
        symmetric: 1 if the sigma matrices are symmetric
    """
    # Create variables for keeping code tidy
    cdef np.int32_t N = P.shape[0]
    cdef np.int32_t k = P.shape[1]

    cdef np.int32_t i
    cdef np.int32_t j
    cdef np.int32_t index_pointer
    cdef np.int32_t sparse_row_pointer

    # Create objects for holding sparse matrix data
    cdef np.float64_t[:] data
    cdef np.int32_t[:] indices
    cdef np.int32_t[:] indptr

    # Object for the ordered P
    cdef np.int32_t[:,:] perm = np.argsort(P, axis=1).astype(np.int32)

    # Make sure sub_matrices is all 0
    sub_matrices[:] = 0

    for i in range(N):
        # Get the sparse matrix values
        data    = sigma[i].data
        indices = sigma[i].indices
        indptr  = sigma[i].indptr

        for j in range(k):
            # Loop over row P[i, perm[i, j]] in sigma searching for values
            # in P[i, :] vector i.e. compare
            #     sigma[P[i, perm[i, j]], :]
            # against
            #     P[i, :]
            # To do this we need our sparse row vector with columns
            #     indices[indptr[P[i, perm[i, j]]] : indptr[P[i, perm[i, j]] + 1]]
            # and data/values
            #     data[indptr[P[i, perm[i, j]]] : indptr[P[i, perm[i, j]] + 1]]
            # which comes from the csr matrix format.
            # We also need our sorted indexing vector
            #     P[i, perm[i, :]]
            # We begin by pointing at the top of both
            # our vectors and gradually move down them. In the event of
            # an equality we add the data to sub_matrices[i,:,:] and
            # increment the INDEXING VECTOR pointer, not the sparse
            # row vector pointer, as there can be multiple values that
            # are the same in the indexing vector but not the sparse row
            # column vector (only 1 column can appear in 1 row!).
            if symmetric:
                index_pointer = j  # Only search the upper triangle
            else:
                index_pointer = 0
            sparse_row_pointer = indptr[P[i, perm[i, j]]]

            while ((index_pointer < k) and (sparse_row_pointer < indptr[P[i, perm[i, j]] + 1])):
                if indices[sparse_row_pointer] == P[i, perm[i, index_pointer]]:
                    # We can add data to sub_matrices
                    sub_matrices[i, perm[i, j], perm[i, index_pointer]] = \
                        data[sparse_row_pointer]
                    if symmetric:
                        sub_matrices[i, perm[i, index_pointer], perm[i, j]] = \
                            data[sparse_row_pointer]

                    # Only increment the index pointer
                    index_pointer += 1
                elif indices[sparse_row_pointer] > P[i, perm[i, index_pointer]]:
                    # Need to increment index pointer
                    index_pointer += 1
                else:
                    # Need to increment sparse row pointer
                    sparse_row_pointer += 1
Parallel Version
Below is a parallel version, although it doesn't seem to provide any speedup and the code is no longer as nice looking:
# See https://stackoverflow.com/questions/48805636/efficient-slicing-of-symmetric-sparse-matrices
cimport cython
import numpy as np
cimport numpy as np
from libc.stdlib cimport malloc, free
from cython.parallel import prange

@cython.boundscheck(False) # turn off bounds-checking for entire function
cpdef sparse_slice_fast_cy(sigma,
                           np.ndarray[np.int32_t, ndim=2] P,
                           np.float64_t[:,:,:] sub_matrices,
                           int symmetric):
    """
    Inputs:
        sigma: A list (N,) of sparse sp.csr_matrix (m x m)
        P: A 2D array of integers (N, k)
        sub_matrices: A 3D array of doubles (N, k, k) containing the slicing
        symmetric: 1 if the sigma matrices are symmetric
    """
    # Create variables for keeping code tidy
    cdef np.int32_t N = P.shape[0]
    cdef np.int32_t k = P.shape[1]

    cdef np.int32_t i
    cdef np.int32_t j
    cdef np.int32_t index_pointer
    cdef np.int32_t sparse_row_pointer

    # Create objects for holding sparse matrix data
    cdef np.float64_t[:] data_mem_view
    cdef np.int32_t[:] indices_mem_view
    cdef np.int32_t[:] indptr_mem_view

    cdef np.float64_t **data = <np.float64_t **> malloc(N * sizeof(np.float64_t *))
    cdef np.int32_t **indices = <np.int32_t **> malloc(N * sizeof(np.int32_t *))
    cdef np.int32_t **indptr = <np.int32_t **> malloc(N * sizeof(np.int32_t *))

    for i in range(N):
        data_mem_view = sigma[i].data
        data[i] = &(data_mem_view[0])

        indices_mem_view = sigma[i].indices
        indices[i] = &(indices_mem_view[0])

        indptr_mem_view = sigma[i].indptr
        indptr[i] = &(indptr_mem_view[0])

    # Object for the ordered P
    cdef np.int32_t[:,:] perm = np.argsort(P, axis=1).astype(np.int32)

    # Make sure sub_matrices is all 0
    sub_matrices[:] = 0

    for i in prange(N, nogil=True):
        for j in range(k):
            # Loop over row P[i, perm[i, j]] in sigma searching for values
            # in P[i, :] vector i.e. compare
            #     sigma[P[i, perm[i, j]], :]
            # against
            #     P[i, :]
            # To do this we need our sparse row vector with columns
            #     indices[i][indptr[i][P[i, perm[i, j]]] : indptr[i][P[i, perm[i, j]] + 1]]
            # and data/values
            #     data[i][indptr[i][P[i, perm[i, j]]] : indptr[i][P[i, perm[i, j]] + 1]]
            # which comes from the csr matrix format.
            # We also need our sorted indexing vector
            #     P[i, perm[i, :]]
            # We begin by pointing at the top of both
            # our vectors and gradually move down them. In the event of
            # an equality we add the data to sub_matrices[i,:,:] and
            # increment the INDEXING VECTOR pointer, not the sparse
            # row vector pointer, as there can be multiple values that
            # are the same in the indexing vector but not the sparse row
            # column vector (only 1 column can appear in 1 row!).
            if symmetric:
                index_pointer = j  # Only search the upper triangle
            else:
                index_pointer = 0
            sparse_row_pointer = indptr[i][P[i, perm[i, j]]]

            while ((index_pointer < k) and
                   (sparse_row_pointer < indptr[i][P[i, perm[i, j]] + 1])):

                if indices[i][sparse_row_pointer] == P[i, perm[i, index_pointer]]:
                    # We can add data to sub_matrices
                    sub_matrices[i, perm[i, j], perm[i, index_pointer]] = \
                        data[i][sparse_row_pointer]
                    if symmetric:
                        sub_matrices[i, perm[i, index_pointer], perm[i, j]] = \
                            data[i][sparse_row_pointer]

                    # Only increment the index pointer
                    index_pointer = index_pointer + 1
                elif indices[i][sparse_row_pointer] > P[i, perm[i, index_pointer]]:
                    # Need to increment index pointer
                    index_pointer = index_pointer + 1
                else:
                    # Need to increment sparse row pointer
                    sparse_row_pointer = sparse_row_pointer + 1

    # Free malloc'd data
    free(data)
    free(indices)
    free(indptr)
Test
To test the code run
cythonize -i sparse_slice.pyx
where sparse_slice.pyx is the filename. Then you can use this script:
import time
import numpy as np
import scipy as sp
import scipy.sparse
from sparse_slice import sparse_slice_fast_cy

k = 100
N = 20000
m = 10000
samples = 20

# Create sigma matrices
## The sampling of random sparse takes a while so just do a few and
## then populate with these.
now = time.time()
sigma_samples = []
for i in range(samples):
    sigma_samples.append(sp.sparse.rand(m, m, density=0.001, format='csr'))
    sigma_samples[-1] = sigma_samples[-1] + sigma_samples[-1].T  # Symmetric

## Now make the sigma list from these.
sigma = []
for i in range(N):
    j = np.random.randint(samples)
    sigma.append(sigma_samples[j])
print('Time to make sigma: {}'.format(time.time() - now))

# Create indexer
now = time.time()
P = np.empty([N, k]).astype(int)
for i in range(N):
    P[i, :] = np.random.choice(np.arange(m), k, replace=True)
print('Time to make P: {}'.format(time.time() - now))

# Create objects for holding the slices
sub_matrices_slow = np.empty([N, k, k])
sub_matrices_fast = np.empty([N, k, k])

# Run both slicings
## Slow
now = time.time()
for i in range(N):
    sub_matrices_slow[i,:,:] = sigma[i][np.ix_(P[i,:], P[i,:])].todense()
print('Time to make sub_matrices_slow: {}'.format(time.time() - now))

## Fast
symmetric = 1
now = time.time()
sparse_slice_fast_cy(sigma, P.astype(np.int32), sub_matrices_fast, symmetric)
print('Time to make sub_matrices_fast: {}'.format(time.time() - now))

assert(np.all((sub_matrices_slow - sub_matrices_fast)**2 < 1e-6))
Cannot test right now, but there are two suggestions:
A) sort all rows at once outside of the i-loop:
# Object for the ordered P
cdef long[:,:] perm = np.argsort(P, axis=1)
maybe you will need to pass P as np.ndarray[np.int64_t, ndim=2] P (or whatever type it is) to avoid copying. You will have to access the data via perm[i,X] instead of perm[X].
B) define
cdef np.int32_t[:] indices
cdef np.int32_t[:] indptr
so that you don't need to copy the data via .astype, i.e.
for i in range(N):
    data = sigma[i].data
    indices = sigma[i].indices
    indptr = sigma[i].indptr
I think that because sigma[i] has O(m) elements, the copying is the bottleneck of your function: you get running time O(N*(m+k^2)) instead of O(N*k^2), so it is worth avoiding.
Otherwise the function doesn't look too bad.
To get prange to work with the i-loop, you should move the accesses to sigma[i] outside of the loop by creating arrays of pointers to the first elements of data, indices and indptr and populating them in a cheap preprocessing step. One can make it work, but the question is how much the parallelization actually gains; it may well be that the problem is memory-bound, so one has to look at timings.
You could also use the symmetry by processing only the upper triangle matrix:
...
index_pointer = j  # only upper triangle!
...
...
# We can add data to sub_matrices
# upper-triangle sub-matrix:
sub_matrices[i, perm[j], perm[index_pointer]] = \
    data[sparse_row_pointer]
# lower-triangle sub-matrix:
sub_matrices[i, perm[index_pointer], perm[j]] = \
    data[sparse_row_pointer]
...
I would start with B) and see how it works out...
Edit:
On memory usage: one can measure the peak memory usage via
/usr/bin/time -f "peak_used_memory:%M(in Kb)" python test.py
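For a cross-check from inside the process, the standard-library resource module can report the same number (Unix only; note that ru_maxrss is in kilobytes on Linux but in bytes on macOS):
import resource
peak_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print('peak resident set size: {} kB'.format(peak_kb))   # kB on Linux, bytes on macOS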
I ran my tests with N=2000 and get (Python 3.6 + Cython 0.27.1):

                          peak memory usage
only slow                 245 MB
only fast                 245 MB
slow + fast, no check     402 MB
slow + fast + assert      576 MB

So there is about 50 MB of overhead, 200 MB used by either function, and an additional 176 MB for evaluating the assert. I can see the same behavior also for other values of N.
So I would say there is no huge memory usage by cython.
This task is very probably (at least partly) memory bound, so the parallelization will not help much. You should reduce the amount of memory loaded to cache.
One possibility is not to use perm at all; after all, it also needs to be loaded into the cache. You could do that if
you can live with any row/col permutation of the extracted sub-matrices, in which case you just sort P and use it directly;
there are very few elements per row, so that a linear search for every element would be OK; or
you do a binary search for every element (a rough sketch follows below).
I guess you could win about 20-30% in the best case.
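A rough pure-NumPy sketch of the binary-search variant, meant as a reference for a C/Cython port; it assumes each CSR row has sorted column indices (call sigma[i].sort_indices() first if unsure), and the function name is mine:
import numpy as np

def csr_row_lookup(csr, r, wanted_cols):
    """Return csr[r, wanted_cols] as a dense 1-D array by binary-searching
    (np.searchsorted) the sorted column indices of row r."""
    wanted = np.asarray(wanted_cols)
    start, end = csr.indptr[r], csr.indptr[r + 1]
    row_cols = csr.indices[start:end]   # sorted column indices of row r
    row_vals = csr.data[start:end]
    out = np.zeros(wanted.size, dtype=csr.dtype)
    if row_cols.size == 0:
        return out
    pos = np.searchsorted(row_cols, wanted)
    pos = np.minimum(pos, row_cols.size - 1)
    hit = row_cols[pos] == wanted       # which wanted columns actually exist in the row
    out[hit] = row_vals[pos[hit]]
    return out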
Sometimes Cython produces code which is not easy for the C compiler to optimize, and one often achieves better results by writing directly in C and then wrapping it with Python.
But I would do all that only if this operation is really, really the bottleneck of your program.
By the way, by declaring
cdef np.int64_t[:,:] perm = np.argsort(P, axis=1)
you will not need the additional copy.
Related
In Python, I have a matrix K of dimensions (N x N). I want to normalize K by dividing every entry K_ij by sqrt(K_ii * K_jj). What is a fast way to achieve this in Python without iterating through every entry?
My current solution is:
import numpy as np

K = np.random.rand(3,3)
diag = np.diag(K)
for i in range(np.shape(K)[0]):
    for j in range(np.shape(K)[1]):
        K[i,j] = K[i,j]/np.sqrt(diag[i]*diag[j])
Of course you have to iterate through every entry, at least internally. For square matrices:
K / np.sqrt(np.einsum('ii,jj->ij', K, K))
If the matrix is not square, you first have to define what should replace the "missing" values K[i,i] where i > j etc.
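If the einsum form looks opaque, an equivalent variant for square K uses np.outer on the diagonal:
d = np.diag(K)
K_normalized = K / np.sqrt(np.outer(d, d))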
Alternative: use numba to leave your loop as is, get free speedup, and even avoid intermediate allocation:
from numba import njit

@njit
def normalize(K):
    M = np.empty_like(K)
    m, n = K.shape
    for i in range(m):
        Kii = K[i,i]
        for j in range(n):
            Kjj = K[j,j]
            M[i,j] = K[i,j] / np.sqrt(Kii * Kjj)
    return M
I have a large 2D array of size Nx3. This array contains point cloud data in (X,Y,Z) format. I am using Python in Ubuntu in a virtual environment to read data from a .ply file.
When I am trying to find the covariance of this array with rowvar set to True (meaning each row being considered a variable), I am getting MemoryError.
I understand that this is creating a very large array, apparently too large for my 8 GB of allocated memory to handle. Without increasing memory allocation, is there a different way of getting around this issue? Are there different methods of calculating the covariance matrix elements so that the memory is not overloaded?
You could chop it up in a loop and keep the upper triangle only.
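For scale, note that the full result alone is a dense N x N float64 array, which is why np.cov with rowvar=True runs out of memory here; a quick check, using the same N = 23000 as below:
N = 23000
print('{:.1f} GB'.format(N * N * 8 / 1e9))   # about 4.2 GB for one dense N x N float64 result
The chunked computation below avoids materializing that full matrix at once: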
import numpy as np

N = 23000
a = np.random.random((N, 3))
c = a - a.mean(axis=-1, keepdims=True)
out = np.empty((N*(N+1) // 2,))

def ravel_triu(i, j, n):
    i, j = np.where(i>j, np.broadcast_arrays(j, i), np.broadcast_arrays(i, j))
    return i*n - i*(i+1) // 2 + j

def unravel_triu(k, n):
    i = n - (0.5 + np.sqrt(n*(n+1) - 2*k - 1)).astype(int)
    return i, k - (i*n - i*(i+1) // 2)

ii, jj = np.ogrid[:N, :N]

for j in range(0, N, 500):
    out[ravel_triu(j, j, N):ravel_triu(min(N, j+500), min(N, j+500), N)] \
        = np.einsum(
            'i...k,...jk->ij', c[j:j+500], c[j:])[ii[j:j+500] <= jj[:, j:]]
Obviously your covariances will be quite undersampled and the covariance matrix highly rank-deficient...
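For reference, a small usage sketch of how a single packed entry can be read back out afterwards (for i <= j; note that no 1/(n-1) scaling is applied above, so these are plain dot products of the centred rows):
i, j = 3, 10
entry = out[int(ravel_triu(i, j, N))]
# entry should equal np.dot(c[i], c[j]) computed directly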
I need to quickly compute a matrix whose entries are obtained by convolving a filter with a vector for each row, subsampling the entries of the resulting vector, and then taking the dot product of the result with another vector. Specifically, I want to compute
M = [conv(e_j, f)*P_i*v_i ]_{i,j},
where i varies from 1 to n and j varies from 1 to m. Here e_j is the indicator (row) vector of size n with a one only in column j, f is the filter of length s, P_i is an (n+s-1)-by-k matrix which samples the appropriate k entries from the convolution, and v_i is a column vector of length k.
It takes O(n*s) operations to compute each entry of M, so O(n*s*n*m) overall to compute M. For n=6, m=7, s=3, one core of my computer (8 GFLOPS) should be able to compute M in roughly 0.094 microseconds. Yet my very simple Cython implementation, following the example given in the Cython documentation, takes more than 2 milliseconds to compute an example with those parameters. That is about 4 orders of magnitude difference!
Here is a shar file with the Cython implementation and test code. Copy and paste it to a file and run 'bash <fname>' in a clean directory to get the code, then run 'bash ./test.sh' to see the abysmal performance.
cat > fastcalcM.pyx <<'EOF'
import numpy as np
cimport numpy as np
cimport cython
from scipy.signal import convolve

DTYPE=np.float32
ctypedef np.float32_t DTYPE_t

@cython.boundscheck(False)
def calcM(np.ndarray[DTYPE_t, ndim=1, negative_indices=False] filtertaps,
          int n, int m,
          np.ndarray[np.int_t, ndim=2, negative_indices=False] keep_indices,
          np.ndarray[DTYPE_t, ndim=2, negative_indices=False] V):
    """ Computes a numrows-by-k matrix M whose entries satisfy
        M_{i,k} = [conv(e_j, f)^T * P_i * v_i],
        where v_i^T is the i-th row of V, and P_i samples the entries from
        conv(e_j, f)^T indicated by the ith row of the keep_indices matrix """

    cdef int k = keep_indices.shape[1]
    cdef np.ndarray M = np.zeros((n, m), dtype=DTYPE)
    cdef np.ndarray ej = np.zeros((m,), dtype=DTYPE)
    cdef np.ndarray convolution
    cdef int rowidx, colidx, kidx

    for rowidx in range(n):
        for colidx in range(m):
            ej[colidx] = 1
            convolution = convolve(ej, filtertaps, mode='full')
            for kidx in range(k):
                M[rowidx, colidx] += convolution[keep_indices[rowidx, kidx]] * V[rowidx, kidx]
            ej[colidx] = 0
    return M
EOF

#-----------------------------------------------------------------------------
cat > test_calcM.py << 'EOF'
import numpy as np
from fastcalcM import calcM

filtertaps = np.array([-1, 2, -1]).astype(np.float32)
n, m = 6, 7
keep_indices = np.array([[1, 3],
                         [4, 5],
                         [2, 2],
                         [5, 5],
                         [3, 4],
                         [4, 5]]).astype(np.int)
V = np.random.random_integers(-5, 5, size=(6, 2)).astype(np.float32)

print calcM(filtertaps, n, m, keep_indices, V)
EOF

#-----------------------------------------------------------------------------
cat > test.sh << 'EOF'
python setup.py build_ext --inplace
echo -e "%run test_calcM\n%timeit calcM(filtertaps, n, m, keep_indices, V)" > script.ipy
ipython script.ipy
EOF

#-----------------------------------------------------------------------------
cat > setup.py << 'EOF'
from distutils.core import setup
from Cython.Build import cythonize
import numpy

setup(
    name="Fast convolutions",
    include_dirs = [numpy.get_include()],
    ext_modules = cythonize("fastcalcM.pyx")
)
EOF
I thought maybe the call to scipy's convolve might be the culprit (I'm not certain that Cython and SciPy play well together), so I implemented my own convolution code along the lines of the same example in the Cython documentation, but this resulted in the overall code being about 10 times slower.
Any ideas on how to get closer to the theoretically possible speed, or reasons why the difference is so great?
For one thing, the typing of M, ej and convolution doesn't allow fast indexing. The typing you've done is not particularly helpful at all, actually.
But it doesn't matter, because you have two overheads. The first is converting between Cython and Python types. You should keep untyped arrays around if you want to pass them to Python a lot, to prevent the need to convert. Just moving this to Python sped it up for that reason (1ms → 0.65μs).
Then I profiled it:
Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    15                                           def calcM(filtertaps, n, m, keep_indices, V):
    16      4111         3615      0.9      0.1      k = keep_indices.shape[1]
    17      4111         8024      2.0      0.1      M = np.zeros((n, m), dtype=np.float32)
    18      4111         6090      1.5      0.1      ej = np.zeros((m,), dtype=np.float32)
    19
    20     28777        18690      0.6      0.3      for rowidx in range(n):
    21    197328       123284      0.6      2.2          for colidx in range(m):
    22    172662       112348      0.7      2.0              ej[colidx] = 1
    23    172662      4076225     23.6     73.6              convolution = convolve(ej, filtertaps, mode='full')
    24    517986       395513      0.8      7.1              for kidx in range(k):
    25    345324       668309      1.9     12.1                  M[rowidx, colidx] += convolution[keep_indices[rowidx, kidx]] * V[rowidx, kidx]
    26    172662       120271      0.7      2.2              ej[colidx] = 0
    27
    28      4111         2374      0.6      0.0      return M
Before you consider anything else, deal with convolve.
Why is convolve slow? Well, it's got a lot of overhead. numpy/scipy normally does; it's best for large datasets. If you know the size of your array is going to stay small, just reimplement convolve in Cython.
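For example, a direct 'full' convolution for small 1-D inputs is only a few lines; the plain-Python sketch below is meant as the reference to port to Cython (typed loops or memoryviews), not as something fast in itself:
import numpy as np

def conv_full(x, h):
    """Direct O(len(x)*len(h)) 'full' convolution; for small 1-D inputs this
    matches np.convolve(x, h)."""
    n, s = len(x), len(h)
    out = np.zeros(n + s - 1, dtype=np.result_type(x, h))
    for i in range(n):
        for j in range(s):
            out[i + j] += x[i] * h[j]
    return out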
Oh, and try to use the memoryview syntax: DTYPE_t[:, :] for a 2D array, DTYPE_t[:] for a 1D array, etc. It's the memoryview protocol, and it's way better. There are cases where it has more overhead, but those are typically possible to work around, and it's better in most other ways.
EDIT:
You can try (recursively) inlining the scipy function:
import numpy as np
from scipy.signal.sigtools import _correlateND

def calcM(filtertaps, n, m, keep_indices, V):
    k = keep_indices.shape[1]
    M = np.zeros((n, m), dtype=np.float32)
    ej = np.zeros((m,), dtype=np.float32)

    slice_obj = [slice(None, None, -1)] * len(filtertaps.shape)
    sliced_filtertaps_view = filtertaps[slice_obj]

    ps = ej.shape[0] + sliced_filtertaps_view.shape[0] - 1
    in1zpadded = np.zeros(ps, ej.dtype)
    out = np.empty(ps, ej.dtype)

    for rowidx in range(n):
        for colidx in range(m):
            in1zpadded[colidx] = 1
            convolution = _correlateND(in1zpadded, sliced_filtertaps_view, out, 2)
            for kidx in range(k):
                M[rowidx, colidx] += convolution[keep_indices[rowidx, kidx]] * V[rowidx, kidx]
            in1zpadded[colidx] = 0
    return M
Note that this uses private implementation details.
This is tailored for the particular dimensions, so I don't know if it'll work on your actual data. But it removes the vast majority of overhead. You can then improve this by typing things again:
import numpy as np
cimport numpy as np
from scipy.signal.sigtools import _correlateND

DTYPE=np.float32
ctypedef np.float32_t DTYPE_t

def calcM(filtertaps, int n, int m, np.int_t[:, :] t_keep_indices, DTYPE_t[:, :] t_V):
    cdef int rowidx, colidx, kidx, k
    cdef DTYPE_t[:, :] t_M
    cdef DTYPE_t[:] t_in1zpadded, t_convolution

    k = t_keep_indices.shape[1]
    t_M = M = np.zeros((n, m), dtype=np.float32)
    ej = np.zeros((m,), dtype=np.float32)

    slice_obj = [slice(None, None, -1)] * len(filtertaps.shape)
    sliced_filtertaps_view = filtertaps[slice_obj]

    ps = ej.shape[0] + sliced_filtertaps_view.shape[0] - 1
    t_in1zpadded = in1zpadded = np.zeros(ps, ej.dtype)
    out = np.empty(ps, ej.dtype)

    for rowidx in range(n):
        for colidx in range(m):
            t_in1zpadded[colidx] = 1
            t_convolution = _correlateND(in1zpadded, sliced_filtertaps_view, out, 2)
            for kidx in range(k):
                t_M[rowidx, colidx] += t_convolution[<int>t_keep_indices[rowidx, kidx]] * t_V[rowidx, kidx]
            t_in1zpadded[colidx] = 0
    return M
It's over 10x as fast, but not as high as your pie-in-the-sky estimate. Then again, that calculation was a bit bogus to begin with ;).
The problem
I'm trying to Cythonize two small functions that mostly deal with numpy ndarrays for some scientific purpose. These two small functions are called millions of times in a genetic algorithm and account for the majority of the time taken by the algorithm.
I made some progress on my own and both work nicely, but I get only a tiny speed improvement (10%). More importantly, cython --annotate shows that the majority of the code still goes through Python.
The code
First function:
The aim of this function is to get back slices of data and it is called millions of times in an inner nested loop. Depending on the bool in data[1][1], we either get the slice in the forward or reverse order.
%%cython --annotate
# IPython notebook cell magic for Cython
import numpy as np
from scipy import signal as scisignal
cimport cython
cimport numpy as np

def get_signal(data):
    # data[0] contains the data structure containing the numpy arrays
    # data[1][0] contains the position to slice
    # data[1][1] contains the orientation to slice, forward = 0, reverse = 1
    cdef int halfwinwidth = 100
    cdef int midpoint = data[1][0]
    cdef int strand = data[1][1]
    cdef int start = midpoint - halfwinwidth
    cdef int end = midpoint + halfwinwidth

    # the arrays we want to slice
    cdef np.ndarray r0 = data[0]['normals_forward']
    cdef np.ndarray r1 = data[0]['normals_reverse']
    cdef np.ndarray r2 = data[0]['normals_combined']

    if strand == 0:
        normals_forward = r0[start:end]
        normals_reverse = r1[start:end]
        normals_combined = r2[start:end]
    else:
        normals_forward = r1[end - 1:start - 1: -1]
        normals_reverse = r0[end - 1:start - 1: -1]
        normals_combined = r2[end - 1:start - 1: -1]

    # return the result as a tuple
    row = (normals_forward,
           normals_reverse,
           normals_combined)
    return row
Second function
This one gets a list of tuples of numpy arrays, and we want to add up the arrays element wise, then normalize them and get the integration of the intersection.
def calculate_signal(list signal):
    cdef int halfwinwidth = 100
    cdef np.ndarray profile_normals_forward = np.zeros(halfwinwidth * 2, dtype='f')
    cdef np.ndarray profile_normals_reverse = np.zeros(halfwinwidth * 2, dtype='f')
    cdef np.ndarray profile_normals_combined = np.zeros(halfwinwidth * 2, dtype='f')

    # b is a tuple of 3 np.ndarrays containing 200 floats
    # here we add them up elementwise
    for b in signal:
        profile_normals_forward += b[0]
        profile_normals_reverse += b[1]
        profile_normals_combined += b[2]

    # normalize the arrays
    cdef int count = len(signal)
    # print "Normalizing to number of elements"
    profile_normals_forward /= count
    profile_normals_reverse /= count
    profile_normals_combined /= count

    intersection_signal = scisignal.detrend(np.fmin(profile_normals_forward, profile_normals_reverse))
    intersection_signal[intersection_signal < 0] = 0
    intersection = np.sum(intersection_signal)

    results = {"intersection": intersection,
               "profile_normals_forward": profile_normals_forward,
               "profile_normals_reverse": profile_normals_reverse,
               "profile_normals_combined": profile_normals_combined,
               }
    return results
Any help is appreciated - I tried using memory views but for some reason the code got much, much slower.
After fixing the array cdef (as has been indicated, with the dtype specified), you should probably put the routine in a cdef function (which will only be callable by a def function in the same script).
In the declaration of the function, you'll need to provide the type (and the dimensions if it's a numpy array):
cdef get_signal(numpy.ndarray[DTYPE_t, ndim=3] data):
I'm not sure using a dict is a good idea though. You could make use of numpy's column or row slices like data[:, 0].
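If the call sites can be batched, the slicing itself can also be done for many positions at once with plain NumPy fancy indexing. The sketch below uses hypothetical stand-ins (midpoints, strands and the three normals_* arrays) for the structures described in the question:
import numpy as np

# hypothetical stand-ins for the structures described in the question
normals_forward = np.random.rand(10000).astype('f')
normals_reverse = np.random.rand(10000).astype('f')
normals_combined = np.random.rand(10000).astype('f')
midpoints = np.array([500, 1500, 2500])   # positions to slice around
strands = np.array([0, 1, 0])             # 0 = forward, 1 = reverse

halfwinwidth = 100
offsets = np.arange(-halfwinwidth, halfwinwidth)       # shape (200,)
idx = midpoints[:, None] + offsets[None, :]            # shape (n_sites, 200)

fwd = normals_forward[idx]       # one 200-wide window per site
rev = normals_reverse[idx]
comb = normals_combined[idx]

# reverse-strand sites: swap forward/reverse and flip the window left-to-right
flip = strands == 1
fwd[flip], rev[flip] = rev[flip][:, ::-1], fwd[flip][:, ::-1]
comb[flip] = comb[flip][:, ::-1]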
Converting a collaborative filtering code to use sparse matrices, I'm puzzling over the following problem: given two full matrices X (m by l) and Theta (n by l), and a sparse matrix R (m by n), is there a fast way to calculate the sparse inner product R.multiply(X.dot(Theta.T)), i.e. the product X.dot(Theta.T) evaluated only at the non-zero entries of R? Large dimensions are m and n (order 100000), while l is small (order 10). This is probably a fairly common operation for big data since it shows up in the cost function of most linear regression problems, so I'd expect a solution built into scipy.sparse, but I haven't found anything obvious yet.
The naive way to do this in Python is R.multiply(X.dot(Theta.T)), but this results in evaluation of the full matrix X.dot(Theta.T) (m by n, order 100000**2), which occupies too much memory, only to then dump most of the entries since R is sparse.
There is a pseudo solution already here on stackoverflow, but it is non-sparse in one step:
def sparse_mult_notreally(a, b, coords):
    rows, cols = coords
    rows, r_idx = np.unique(rows, return_inverse=True)
    cols, c_idx = np.unique(cols, return_inverse=True)
    C = np.array(np.dot(a[rows, :], b[:, cols]))  # this operation is dense
    return sp.coo_matrix( (C[r_idx,c_idx],coords), (a.shape[0],b.shape[1]) )
This works fine, and fast, for me on small enough arrays, but it barfs on my big datasets with the following error:
... in sparse_mult(a, b, coords)
    132     rows, r_idx = np.unique(rows, return_inverse=True)
    133     cols, c_idx = np.unique(cols, return_inverse=True)
--> 134     C = np.array(np.dot(a[rows, :], b[:, cols])) # this operation is not sparse
    135     return sp.coo_matrix( (C[r_idx,c_idx],coords), (a.shape[0],b.shape[1]) )

ValueError: array is too big.
A solution which IS actually sparse, but very slow, is:
def sparse_mult(a, b, coords):
    rows, cols = coords
    n = len(rows)
    C = np.array([ float(a[rows[i],:]*b[:,cols[i]]) for i in range(n) ])  # this is sparse, but VERY slow
    return sp.coo_matrix( (C,coords), (a.shape[0],b.shape[1]) )
Does anyone know a fast, fully sparse way to do this?
I profiled 4 different solutions to your problem, and it looks like for any size of the array, the numba jit solution is the best. A close second is @Alexander's Cython solution.
Here are the results (M is the number of rows in the x array):
M = 1000
function sparse_dense took 0.03 sec.
function sparse_loop took 0.07 sec.
function sparse_numba took 0.00 sec.
function sparse_cython took 0.09 sec.
M = 10000
function sparse_dense took 2.88 sec.
function sparse_loop took 0.68 sec.
function sparse_numba took 0.00 sec.
function sparse_cython took 0.01 sec.
M = 100000
function sparse_dense ran out of memory
function sparse_loop took 6.84 sec.
function sparse_numba took 0.09 sec.
function sparse_cython took 0.12 sec.
The script I used to profile these methods is:
import numpy as np
from scipy.sparse import coo_matrix
from numba import autojit, jit, float64, int32
import pyximport
pyximport.install(setup_args={"script_args":["--compiler=mingw32"],
                              "include_dirs":np.get_include()},
                  reload_support=True)

def sparse_dense(a,b,c):
    return coo_matrix(c.multiply(np.dot(a,b)))

def sparse_loop(a,b,c):
    """Multiply sparse matrix `c` by np.dot(a,b) by looping over non-zero
    entries in `c` and using `np.dot()` for each entry."""
    N = c.size
    data = np.empty(N,dtype=float)
    for i in range(N):
        data[i] = c.data[i]*np.dot(a[c.row[i],:],b[:,c.col[i]])
    return coo_matrix((data,(c.row,c.col)),shape=(a.shape[0],b.shape[1]))

#@autojit
def _sparse_mult4(a,b,cd,cr,cc):
    N = cd.size
    data = np.empty_like(cd)
    for i in range(N):
        num = 0.0
        for j in range(a.shape[1]):
            num += a[cr[i],j]*b[j,cc[i]]
        data[i] = cd[i]*num
    return data

_fast_sparse_mult4 = \
    jit(float64[:,:](float64[:,:],float64[:,:],float64[:],int32[:],int32[:]))(_sparse_mult4)

def sparse_numba(a,b,c):
    """Multiply sparse matrix `c` by np.dot(a,b) using Numba's jit."""
    assert c.shape == (a.shape[0],b.shape[1])
    data = _fast_sparse_mult4(a,b,c.data,c.row,c.col)
    return coo_matrix((data,(c.row,c.col)),shape=(a.shape[0],b.shape[1]))

def sparse_cython(a, b, c):
    """Computes c.multiply(np.dot(a,b)) using cython."""
    from sparse_mult_c import sparse_mult_c
    data = np.empty_like(c.data)
    sparse_mult_c(a,b,c.data,c.row,c.col,data)
    return coo_matrix((data,(c.row,c.col)),shape=(a.shape[0],b.shape[1]))

def unique_rows(a):
    a = np.ascontiguousarray(a)
    unique_a = np.unique(a.view([('', a.dtype)]*a.shape[1]))
    return unique_a.view(a.dtype).reshape((unique_a.shape[0], a.shape[1]))

if __name__ == '__main__':
    import time

    for M in [1000,10000,100000]:
        print 'M = %i' % M
        N = M + 2
        L = 10

        x = np.random.rand(M,L)
        t = np.random.rand(N,L).T

        # number of non-zero entries in sparse r matrix
        S = M*10
        row = np.random.randint(M,size=S)
        col = np.random.randint(N,size=S)

        # remove duplicate rows and columns
        row, col = unique_rows(np.dstack((row,col)).squeeze()).T
        data = np.random.rand(row.size)
        r = coo_matrix((data,(row,col)),shape=(M,N))

        a2 = sparse_loop(x,t,r)

        for f in [sparse_dense,sparse_loop,sparse_numba,sparse_cython]:
            t0 = time.time()
            try:
                a = f(x,t,r)
            except MemoryError:
                print 'function %s ran out of memory' % f.__name__
                continue
            elapsed = time.time()-t0
            try:
                diff = abs(a-a2)
                if diff.nnz > 0:
                    assert np.max(abs(a-a2).data) < 1e-5
            except AssertionError:
                print f.__name__
                raise
            print 'function %s took %.2f sec.' % (f.__name__,elapsed)
The cython function is a slightly modified version of @Alexander's code:
# working from tutorial at: http://docs.cython.org/src/tutorial/numpy.html
cimport numpy as np

# Turn bounds checking back on if there are ANY problems!
cimport cython
@cython.boundscheck(False) # turn off bounds-checking for entire function
def sparse_mult_c(np.ndarray[np.float64_t, ndim=2] a,
                  np.ndarray[np.float64_t, ndim=2] b,
                  np.ndarray[np.float64_t, ndim=1] data,
                  np.ndarray[np.int32_t, ndim=1] rows,
                  np.ndarray[np.int32_t, ndim=1] cols,
                  np.ndarray[np.float64_t, ndim=1] out):

    cdef int n = rows.shape[0]
    cdef int k = a.shape[1]
    cdef int i,j
    cdef double num

    for i in range(n):
        num = 0.0
        for j in range(k):
            num += a[rows[i],j] * b[j,cols[i]]
        out[i] = data[i]*num
Based on the extra information in the comments, I think what's throwing you off is the call to np.unique. Try the following approach:
import numpy as np
import scipy.sparse as sps
from numpy.core.umath_tests import inner1d

n = 100000
x = np.random.rand(n, 10)
theta = np.random.rand(n, 10)

rows = np.arange(n)
cols = np.arange(n)
np.random.shuffle(rows)
np.random.shuffle(cols)

def sparse_multiply(x, theta, rows, cols):
    data = inner1d(x[rows], theta[cols])
    return sps.coo_matrix((data, (rows, cols)),
                          shape=(x.shape[0], theta.shape[0]))
I get the following timings:
n = 1000
%timeit sparse_multiply(x, theta, rows, cols)
1000 loops, best of 3: 465 us per loop
n = 10000
%timeit sparse_multiply(x, theta, rows, cols)
100 loops, best of 3: 4.29 ms per loop
n = 100000
%timeit sparse_multiply(x, theta, rows, cols)
10 loops, best of 3: 61.5 ms per loop
And of course, with n = 100:
>>> np.allclose(sparse_multiply(x, theta, rows, cols).toarray()[rows, cols],
...             x.dot(theta.T)[rows, cols])
True
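One caveat: inner1d comes from the private numpy.core.umath_tests module, which is deprecated in newer NumPy releases; an equivalent row-wise dot product can be written with np.einsum, which should behave the same here:
def sparse_multiply_einsum(x, theta, rows, cols):
    data = np.einsum('ij,ij->i', x[rows], theta[cols])   # row-wise dot products
    return sps.coo_matrix((data, (rows, cols)),
                          shape=(x.shape[0], theta.shape[0]))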
Haven't tested Jaime's answer yet (thanks again!), but in the meantime I implemented another approach that works, using Cython.
file sparse_mult_c.pyx:
# working from tutorial at: http://docs.cython.org/src/tutorial/numpy.html
cimport numpy as np

# Turn bounds checking back on if there are ANY problems!
cimport cython
@cython.boundscheck(False) # turn off bounds-checking for entire function
def sparse_mult_c(np.ndarray[np.float64_t, ndim=2] a,
                  np.ndarray[np.float64_t, ndim=2] b,
                  np.ndarray[np.int32_t, ndim=1] rows,
                  np.ndarray[np.int32_t, ndim=1] cols,
                  np.ndarray[np.float64_t, ndim=1] C ):

    cdef int n = rows.shape[0]
    cdef int k = a.shape[1]
    cdef int i,j

    for i in range(n):
        for j in range(k):
            C[i] += a[rows[i],j] * b[j,cols[i]]
Then compile it as per http://docs.cython.org/src/userguide/tutorial.html
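For reference, a minimal setup.py in the same style as the one shown earlier on this page should do (assuming the file is saved as sparse_mult_c.pyx):
from distutils.core import setup
from Cython.Build import cythonize
import numpy

setup(
    name="sparse_mult_c",
    include_dirs=[numpy.get_include()],
    ext_modules=cythonize("sparse_mult_c.pyx"),
)
followed by python setup.py build_ext --inplace.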
Then in my python code, I include the following:
def sparse_mult(a, b, coords):
    # a, b are np.ndarrays
    from sparse_mult_c import sparse_mult_c
    rows, cols = coords
    C = np.zeros(rows.shape[0])
    sparse_mult_c(a,b,rows,cols,C)
    return sp.coo_matrix( (C,coords), (a.shape[0],b.shape[1]) )
This works fully sparse and also runs faster than even the original (memory-inefficient for me) solution.