Cythonize two small numpy functions, help needed - python

The problem
I'm trying to Cythonize two small functions that mostly deal with numpy ndarrays for some scientific purpose. These two small functions are called millions of times in a genetic algorithm and account for the majority of the time taken by the algorithm.
I made some progress on my own and both work nicely, but I get only a tiny speed improvement (10%). More importantly, cython --annotate shows that the majority of the code still goes through Python.
The code
First function:
The aim of this function is to get back slices of data and it is called millions of times in an inner nested loop. Depending on the bool in data[1][1], we either get the slice in the forward or reverse order.
%%cython --annotate
# IPython notebook magic for cython (the magic must be the first line of the cell)
import numpy as np
from scipy import signal as scisignal

cimport cython
cimport numpy as np

def get_signal(data):
    #data[0] contains the data structure containing the numpy arrays
    #data[1][0] contains the position to slice
    #data[1][1] contains the orientation to slice, forward = 0, reverse = 1
    cdef int halfwinwidth = 100
    cdef int midpoint = data[1][0]
    cdef int strand = data[1][1]
    cdef int start = midpoint - halfwinwidth
    cdef int end = midpoint + halfwinwidth

    #the arrays we want to slice
    cdef np.ndarray r0 = data[0]['normals_forward']
    cdef np.ndarray r1 = data[0]['normals_reverse']
    cdef np.ndarray r2 = data[0]['normals_combined']

    if strand == 0:
        normals_forward = r0[start:end]
        normals_reverse = r1[start:end]
        normals_combined = r2[start:end]
    else:
        normals_forward = r1[end - 1:start - 1:-1]
        normals_reverse = r0[end - 1:start - 1:-1]
        normals_combined = r2[end - 1:start - 1:-1]

    #return the result as a tuple
    row = (normals_forward,
           normals_reverse,
           normals_combined)
    return row
Second function
This one gets a list of tuples of numpy arrays; we want to add up the arrays element-wise, then normalize them and take the integral of their intersection.
def calculate_signal(list signal):
    cdef int halfwinwidth = 100
    cdef np.ndarray profile_normals_forward = np.zeros(halfwinwidth * 2, dtype='f')
    cdef np.ndarray profile_normals_reverse = np.zeros(halfwinwidth * 2, dtype='f')
    cdef np.ndarray profile_normals_combined = np.zeros(halfwinwidth * 2, dtype='f')

    #b is a tuple of 3 np.ndarrays containing 200 floats
    #here we add them up elementwise
    for b in signal:
        profile_normals_forward += b[0]
        profile_normals_reverse += b[1]
        profile_normals_combined += b[2]

    #normalize the arrays
    cdef int count = len(signal)
    #print "Normalizing to number of elements"
    profile_normals_forward /= count
    profile_normals_reverse /= count
    profile_normals_combined /= count

    intersection_signal = scisignal.detrend(np.fmin(profile_normals_forward, profile_normals_reverse))
    intersection_signal[intersection_signal < 0] = 0
    intersection = np.sum(intersection_signal)

    results = {"intersection": intersection,
               "profile_normals_forward": profile_normals_forward,
               "profile_normals_reverse": profile_normals_reverse,
               "profile_normals_combined": profile_normals_combined,
               }
    return results
Any help is appreciated - I tried using memory views but for some reason the code got much, much slower.

After fixing the array cdef (as has been indicated, with the dtype specified), you should probably put the routine in a cdef function (which will only be callable by a def function in the same script).
In the declaration of the function, you'll need to provide the type (and the dimensions, if it's a numpy array):
cdef get_signal(numpy.ndarray[DTYPE_t, ndim=3] data):
I'm not sure using a dict is a good idea though. You could make use of numpy's column or row slices like data[:, 0].
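For concreteness, here is a minimal sketch of that suggestion - assuming the three arrays are 1D float64 and that the dict/tuple lookups are hoisted out to the caller (the names and dtype here are assumptions, not taken from your data):

# in the same %%cython cell as above
ctypedef np.float64_t DTYPE_t

cdef tuple get_signal_typed(np.ndarray[DTYPE_t, ndim=1] r0,
                            np.ndarray[DTYPE_t, ndim=1] r1,
                            np.ndarray[DTYPE_t, ndim=1] r2,
                            int midpoint, int strand):
    cdef int halfwinwidth = 100
    cdef int start = midpoint - halfwinwidth
    cdef int end = midpoint + halfwinwidth
    if strand == 0:
        return (r0[start:end], r1[start:end], r2[start:end])
    # reversed slices for the reverse strand, as in the original
    return (r1[end - 1:start - 1:-1],
            r0[end - 1:start - 1:-1],
            r2[end - 1:start - 1:-1])

With the types and the dict lookups out of the hot path, what remains is mostly the slicing itself, which numpy already performs in C.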

Related

iterating through specified axis in cython

I am learning cython and I have modified the code in the tutorial to try to do numerical differentiation:
import numpy as np
cimport numpy as np
import cython

np.import_array()

def test3(a, int order=2, int axis=-1):
    cdef int i
    if axis < 0:
        axis = len(a.shape) + axis
    out = np.empty(a.shape, np.double)
    cdef np.flatiter ita = np.PyArray_IterAllButAxis(a, &axis)
    cdef np.flatiter ito = np.PyArray_IterAllButAxis(out, &axis)
    cdef int a_axis_stride = a.strides[axis]
    cdef int o_axis_stride = out.strides[axis]
    cdef int axis_length = out.shape[axis]
    cdef double value
    while np.PyArray_ITER_NOTDONE(ita):
        # first element
        pt1 = <double*>((<char*>np.PyArray_ITER_DATA(ita)))
        pt2 = (<double*>((<char*>np.PyArray_ITER_DATA(ita)) + 1*a_axis_stride))
        pt3 = (<double*>((<char*>np.PyArray_ITER_DATA(ita)) + 2*a_axis_stride))
        value = -1.5*pt1[0] + 2*pt2[0] - 0.5*pt3[0]
        (<double*>((<char*>np.PyArray_ITER_DATA(ito))))[0] = value
        # middle elements
        for i in range(axis_length-2):
            pt1 = (<double*>((<char*>np.PyArray_ITER_DATA(ita)) + i*a_axis_stride))
            pt2 = (<double*>((<char*>np.PyArray_ITER_DATA(ita)) + (i+2)*a_axis_stride))
            value = -0.5*pt1[0] + 0.5*pt2[0]
            (<double*>((<char*>np.PyArray_ITER_DATA(ito)) + (i+1)*o_axis_stride))[0] = value
        # last element
        pt1 = (<double*>((<char*>np.PyArray_ITER_DATA(ita)) + (axis_length-3)*a_axis_stride))
        pt2 = (<double*>((<char*>np.PyArray_ITER_DATA(ita)) + (axis_length-2)*a_axis_stride))
        pt3 = (<double*>((<char*>np.PyArray_ITER_DATA(ita)) + (axis_length-1)*a_axis_stride))
        value = 1.5*pt3[0] - 2*pt2[0] + 0.5*pt1[0]
        (<double*>((<char*>np.PyArray_ITER_DATA(ito)) + (axis_length-1)*o_axis_stride))[0] = value
        np.PyArray_ITER_NEXT(ita)
        np.PyArray_ITER_NEXT(ito)
    return out
The code produces correct results, and it is indeed faster than my own numpy implementation without cython. The problems are the following:

1. I thought about only having one pt1 = (<double*>((<char*>np.PyArray_ITER_DATA(ita)) + i*a_axis_stride)) statement and then using pt1[0], pt1[-1], pt1[1] to access the array elements, but this only works if the specified axis is the last one. If I am differentiating along a different axis (not the last one), then (<double*>((<char*>np.PyArray_ITER_DATA(ita)) + i*a_axis_stride)) points to the right element, but pt1[-1] and pt1[1] point to the elements right before and after it along the last axis. The current version works, but if I want to implement higher-order differentiation, which requires more points to evaluate, then I will end up having many such lines, and I'm not sure whether there are better/more efficient ways to do it using pt1[1] or something like pt1[xxx] to access neighbouring points (along the specified axis).
2. Are there other ways to speed up this piece of code? I am looking for minor details that I may have overlooked or subtle things that can have a big effect.
To my slight surprise I can't actually beat your version using Cython typed memoryviews - the numpy iterators look pretty quick. However, I think I can manage a significant increase in readability to let you use the Python slicing syntax. The only restriction is that the input array must be C contiguous to allow it to be reshaped easily (I think Fortran contiguous might also work, but I haven't tested).
The basic trick is to flatten all the axes before and after the selected axis so the array has a known 3D shape, at which point you can use Cython memoryviews.
import numpy as np
cimport cython

@cython.boundscheck(False)
def test4(a, order=2, axis=-1):
    assert a.flags['C_CONTIGUOUS'] # otherwise the reshape doesn't work
    if axis < 0:
        axis += a.ndim # normalise the axis so the shape splits below are correct
    before = int(np.product(a.shape[:axis]))
    after = int(np.product(a.shape[(axis+1):]))
    # this should not involve copying memory - it's just a new view
    cdef double[:,:,::1] a_new = a.reshape((before, a.shape[axis], after))
    cdef double[:] a_slice
    cdef double[:,:,::1] out = np.empty_like(a_new)
    assert a_new.shape[1] > 3
    cdef int m, n, i
    for m in range(a_new.shape[0]):
        for n in range(a_new.shape[2]):
            a_slice = a_new[m,:,n]
            # first element
            out[m,0,n] = -1.5*a_slice[0] + 2*a_slice[1] - 0.5*a_slice[2]
            # middle elements
            for i in range(a_slice.shape[0]-2):
                out[m,i+1,n] = -0.5*a_slice[i] + 0.5*a_slice[i+2]
            # last element
            out[m,-1,n] = 1.5*a_slice[-1] - 2*a_slice[-2] + 0.5*a_slice[-3]
    return np.asarray(out).reshape(a.shape)
The speed is very slightly slower than your version, I think.
In terms of improving your code, you could work out the stride in doubles instead of bytes (a_axis_stride_dbl = a_axis_stride/sizeof(double)) and then index as pt[i*a_axis_stride_dbl]. It probably won't gain much speed but will be more readable. (This is what you ask about in point 1.)
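A rough sketch of that conversion (assuming double data, as in your code):

cdef int a_axis_stride_dbl = a_axis_stride / sizeof(double)
pt1 = <double*>np.PyArray_ITER_DATA(ita)
# neighbouring points along the chosen axis, indexed in units of doubles
value = -0.5*pt1[i*a_axis_stride_dbl] + 0.5*pt1[(i + 2)*a_axis_stride_dbl]

Note this only works when the stride is an exact multiple of sizeof(double), which it is for a standard double array.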

Efficient slicing of symmetric sparse matrices

I have a list of sparse symmetric matrices sigma such that

len(sigma) == N

and, for all i, j, k:

sigma[i].shape[0] == sigma[i].shape[1] == m # Square
sigma[i][j,k] == sigma[i][k,j]              # Symmetric

I have an indexing array P such that

P.shape[0] == N
P.shape[1] == k

My objective is to extract the k x k dense submatrices of sigma[i] using the indexing given by P[i,:]. This can be done as follows:
sub_matrices = np.empty([N, k, k])
for i in range(N):
    sub_matrices[i,:,:] = sigma[i][np.ix_(P[i,:], P[i,:])].todense()
Note, however, that while k is small, N (and m) are very large. If the sparse symmetric matrices are stored in CSR format, this takes a very long time. I feel there must be a better solution. For example, is there a sparse format that lends itself well to symmetric matrices that need to be sliced on both dimensions?
I am using Python but would be open to any C library suggestions that I could interface using Cython.
EXTRA
Note that my current Cython approach is as follows:
cimport cython
import numpy as np
cimport numpy as np

@cython.boundscheck(False) # turn off bounds-checking for entire function
cpdef sparse_slice_fast_cy(sigma,
                           long[:,:] P,
                           double[:,:,:] sub_matrices):
    """
    Inputs:
        sigma: A list (N,) of sparse sp.csr_matrix (m x m)
        P: A 2D array of integers (N, k)
        sub_matrices: A 3D array of doubles (N, k, k) containing the slicing
    """
    # Create variables for keeping code tidy
    cdef long N = P.shape[0]
    cdef long k = P.shape[1]
    cdef long i
    cdef long j
    cdef long index_pointer
    cdef long sparse_row_pointer
    # Create objects for holding sparse matrix data
    cdef double[:] data
    cdef long[:] indices
    cdef long[:] indptr
    # Object for the ordered P
    cdef long[:] perm
    # Make sure sub_matrices is all 0
    sub_matrices[:] = 0
    for i in range(N):
        # Sort the P
        perm = np.argsort(P[i,:])
        # Get the sparse matrix values
        data = sigma[i].data
        indices = sigma[i].indices.astype(long)
        indptr = sigma[i].indptr.astype(long)
        for j in range(k):
            # Loop over row P[i, perm[j]] in sigma searching for values
            # in the P[i, :] vector, i.e. compare
            #     sigma[P[i, perm[j]], :]
            # against
            #     P[i, :]
            # To do this we need our sparse row vector with columns
            #     indices[indptr[P[i, perm[j]]]:indptr[P[i, perm[j]] + 1]]
            # and data/values
            #     data[indptr[P[i, perm[j]]]:indptr[P[i, perm[j]] + 1]]
            # which come from the csr matrix format.
            # We also need our sorted indexing vector
            #     P[i, perm[:]]
            # We begin by pointing at the top of both
            # our vectors and gradually move down them. In the event of
            # an equality we add the data to sub_matrices[i,:,:] and
            # increment the INDEXING VECTOR pointer, not the sparse
            # row vector pointer, as there can be multiple values that
            # are the same in the indexing vector but not the sparse row
            # column vector (only 1 column can appear in 1 row!).
            index_pointer = 0
            sparse_row_pointer = indptr[P[i, perm[j]]]
            while ((index_pointer < k) and
                   (sparse_row_pointer < indptr[P[i, perm[j]] + 1])):
                if indices[sparse_row_pointer] == P[i, perm[index_pointer]]:
                    # We can add data to sub_matrices
                    sub_matrices[i, perm[j], perm[index_pointer]] = \
                        data[sparse_row_pointer]
                    # Only increment the index pointer
                    index_pointer += 1
                elif indices[sparse_row_pointer] > P[i, perm[index_pointer]]:
                    # Need to increment index pointer
                    index_pointer += 1
                else:
                    # Need to increment sparse row pointer
                    sparse_row_pointer += 1
I believe that np.argsort may be inefficient when called often on relatively small vectors and would like to swap it for a C implementation. I also don't take advantage of parallel processing that could potentially speed things up over the N sparse matrices. Unfortunately, as there are Python coercions inside the outer loop, I don't know how I can use prange.
Another point to note is that the Cython approach seems to use a HUGE amount of memory, but I have no idea where it's being allocated.
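One way to avoid calling np.argsort per row would be a small C-level argsort; here is a sketch, assuming int32 data as in the later versions below (the helper name is made up, and it lives in the same .pyx module with cimport cython / cimport numpy as np):

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void argsort_small(np.int32_t[:] values, np.int32_t[:] order, int k) nogil:
    # Fill `order` with the permutation that sorts `values` using an
    # insertion sort: O(k^2) worst case, but cheap for small k and
    # callable from nogil/prange code.
    cdef int a, b
    cdef np.int32_t tmp
    for a in range(k):
        order[a] = a
    for a in range(1, k):
        b = a
        while b > 0 and values[order[b]] < values[order[b - 1]]:
            tmp = order[b]
            order[b] = order[b - 1]
            order[b - 1] = tmp
            b = b - 1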
Latest Version
As per the suggestions of ead, below is the latest version of the Cython code:
cimport cython
import numpy as np
cimport numpy as np

@cython.boundscheck(False) # turn off bounds-checking for entire function
cpdef sparse_slice_fast_cy(sigma,
                           np.ndarray[np.int32_t, ndim=2] P,
                           np.float64_t[:,:,:] sub_matrices,
                           int symmetric):
    """
    Inputs:
        sigma: A list (N,) of sparse sp.csr_matrix (m x m)
        P: A 2D array of integers (N, k)
        sub_matrices: A 3D array of doubles (N, k, k) containing the slicing
        symmetric: 1 if the sigma matrices are symmetric
    """
    # Create variables for keeping code tidy
    cdef np.int32_t N = P.shape[0]
    cdef np.int32_t k = P.shape[1]
    cdef np.int32_t i
    cdef np.int32_t j
    cdef np.int32_t index_pointer
    cdef np.int32_t sparse_row_pointer
    # Create objects for holding sparse matrix data
    cdef np.float64_t[:] data
    cdef np.int32_t[:] indices
    cdef np.int32_t[:] indptr
    # Object for the ordered P
    cdef np.int32_t[:,:] perm = np.argsort(P, axis=1).astype(np.int32)
    # Make sure sub_matrices is all 0
    sub_matrices[:] = 0
    for i in range(N):
        # Get the sparse matrix values
        data = sigma[i].data
        indices = sigma[i].indices
        indptr = sigma[i].indptr
        for j in range(k):
            # (Same walk as in the first version: compare the sparse row's
            # column indices against the sorted indexing vector, incrementing
            # the indexing-vector pointer on a match, since the indexing
            # vector may contain duplicates while a column appears at most
            # once per row.)
            if symmetric:
                index_pointer = j # Only search upper triangular
            else:
                index_pointer = 0
            sparse_row_pointer = indptr[P[i, perm[i, j]]]
            while ((index_pointer < k) and
                   (sparse_row_pointer < indptr[P[i, perm[i, j]] + 1])):
                if indices[sparse_row_pointer] == P[i, perm[i, index_pointer]]:
                    # We can add data to sub_matrices
                    sub_matrices[i, perm[i, j], perm[i, index_pointer]] = \
                        data[sparse_row_pointer]
                    if symmetric:
                        sub_matrices[i, perm[i, index_pointer], perm[i, j]] = \
                            data[sparse_row_pointer]
                    # Only increment the index pointer
                    index_pointer += 1
                elif indices[sparse_row_pointer] > P[i, perm[i, index_pointer]]:
                    # Need to increment index pointer
                    index_pointer += 1
                else:
                    # Need to increment sparse row pointer
                    sparse_row_pointer += 1
Parallel Version
Below is a parallel version, although it doesn't seem to provide any speedup and the code is no longer as nice looking:
# See https://stackoverflow.com/questions/48805636/efficient-slicing-of-symmetric-sparse-matrices
cimport cython
import numpy as np
cimport numpy as np
from libc.stdlib cimport malloc, free
from cython.parallel import prange

@cython.boundscheck(False) # turn off bounds-checking for entire function
cpdef sparse_slice_fast_cy(sigma,
                           np.ndarray[np.int32_t, ndim=2] P,
                           np.float64_t[:,:,:] sub_matrices,
                           int symmetric):
    """
    Inputs:
        sigma: A list (N,) of sparse sp.csr_matrix (m x m)
        P: A 2D array of integers (N, k)
        sub_matrices: A 3D array of doubles (N, k, k) containing the slicing
        symmetric: 1 if the sigma matrices are symmetric
    """
    # Create variables for keeping code tidy
    cdef np.int32_t N = P.shape[0]
    cdef np.int32_t k = P.shape[1]
    cdef np.int32_t i
    cdef np.int32_t j
    cdef np.int32_t index_pointer
    cdef np.int32_t sparse_row_pointer
    # Create objects for holding sparse matrix data
    cdef np.float64_t[:] data_mem_view
    cdef np.int32_t[:] indices_mem_view
    cdef np.int32_t[:] indptr_mem_view
    # Arrays of pointers to the first element of each matrix's data, so no
    # Python object access is needed inside the prange loop.
    cdef np.float64_t **data = <np.float64_t **> malloc(N * sizeof(np.float64_t *))
    cdef np.int32_t **indices = <np.int32_t **> malloc(N * sizeof(np.int32_t *))
    cdef np.int32_t **indptr = <np.int32_t **> malloc(N * sizeof(np.int32_t *))
    for i in range(N):
        data_mem_view = sigma[i].data
        data[i] = &(data_mem_view[0])
        indices_mem_view = sigma[i].indices
        indices[i] = &(indices_mem_view[0])
        indptr_mem_view = sigma[i].indptr
        indptr[i] = &(indptr_mem_view[0])
    # Object for the ordered P
    cdef np.int32_t[:,:] perm = np.argsort(P, axis=1).astype(np.int32)
    # Make sure sub_matrices is all 0
    sub_matrices[:] = 0
    for i in prange(N, nogil=True):
        for j in range(k):
            # (Same walk as in the first version.)
            if symmetric:
                index_pointer = j # Only search upper triangular
            else:
                index_pointer = 0
            sparse_row_pointer = indptr[i][P[i, perm[i, j]]]
            while ((index_pointer < k) and
                   (sparse_row_pointer < indptr[i][P[i, perm[i, j]] + 1])):
                if indices[i][sparse_row_pointer] == P[i, perm[i, index_pointer]]:
                    # We can add data to sub_matrices
                    sub_matrices[i, perm[i, j], perm[i, index_pointer]] = \
                        data[i][sparse_row_pointer]
                    if symmetric:
                        sub_matrices[i, perm[i, index_pointer], perm[i, j]] = \
                            data[i][sparse_row_pointer]
                    # Only increment the index pointer
                    index_pointer = index_pointer + 1
                elif indices[i][sparse_row_pointer] > P[i, perm[i, index_pointer]]:
                    # Need to increment index pointer
                    index_pointer = index_pointer + 1
                else:
                    # Need to increment sparse row pointer
                    sparse_row_pointer = sparse_row_pointer + 1
    # Free malloc'd data
    free(data)
    free(indices)
    free(indptr)
Test
To test the code, run

cythonize -i sparse_slice.pyx

where sparse_slice.pyx is the filename. Then you can use this script:
import time
import numpy as np
import scipy as sp
import scipy.sparse
from sparse_slice import sparse_slice_fast_cy

k = 100
N = 20000
m = 10000
samples = 20

# Create sigma matrices
## The sampling of random sparse matrices takes a while, so just do a few
## and then populate with these.
now = time.time()
sigma_samples = []
for i in range(samples):
    sigma_samples.append(sp.sparse.rand(m, m, density=0.001, format='csr'))
    sigma_samples[-1] = sigma_samples[-1] + sigma_samples[-1].T # Symmetric

## Now make the sigma list from these.
sigma = []
for i in range(N):
    j = np.random.randint(samples)
    sigma.append(sigma_samples[j])
print('Time to make sigma: {}'.format(time.time() - now))

# Create indexer
now = time.time()
P = np.empty([N, k]).astype(int)
for i in range(N):
    P[i, :] = np.random.choice(np.arange(m), k, replace=True)
print('Time to make P: {}'.format(time.time() - now))

# Create objects for holding the slices
sub_matrices_slow = np.empty([N, k, k])
sub_matrices_fast = np.empty([N, k, k])

# Run both slicings
## Slow
now = time.time()
for i in range(N):
    sub_matrices_slow[i,:,:] = sigma[i][np.ix_(P[i,:], P[i,:])].todense()
print('Time to make sub_matrices_slow: {}'.format(time.time() - now))

## Fast
symmetric = 1
now = time.time()
sparse_slice_fast_cy(sigma, P.astype(np.int32), sub_matrices_fast, symmetric)
print('Time to make sub_matrices_fast: {}'.format(time.time() - now))

assert(np.all((sub_matrices_slow - sub_matrices_fast)**2 < 1e-6))
Cannot test right now, but there are two suggestions:
A) Sort all rows at once, outside of the i-loop:

# Object for the ordered P
cdef long[:,:] perm = np.argsort(P, axis=1)

Maybe you will need to pass P as np.ndarray[np.int64_t, ndim=2] P (or whatever type it is) to avoid copying. You will have to access the data via perm[i,X] instead of perm[X].
B) Define

cdef np.int32_t[:] indices
cdef np.int32_t[:] indptr

so you don't need to copy the data via .astype, i.e.

for i in range(N):
    data = sigma[i].data
    indices = sigma[i].indices
    indptr = sigma[i].indptr
I think that because sigma[i] has O(m) elements, the copying is the bottleneck of your function: you get running time O(N*(m+k^2)) instead of O(N*k^2) - it is good to avoid it.
Otherwise the function doesn't look too bad.
To get prange working with the i-loop, you should move the accesses to sigma[i] outside of the loop by creating a kind of array of pointers to the first elements of data, indices and indptr and populating them in a cheap preprocessing step. One can make it work, but the question is how much gain there is from the parallelization - it might well be the case that the problem is memory-bound - one has to see timings.
You could also use the symmetry by processing only the upper triangle of the matrix:

...
index_pointer = j # only upper triangle!
...
...
# We can add data to sub_matrices
# upper triangle sub-matrix:
sub_matrices[i, perm[j], perm[index_pointer]] = \
    data[sparse_row_pointer]
# lower triangle sub-matrix:
sub_matrices[i, perm[index_pointer], perm[j]] = \
    data[sparse_row_pointer]
...
I would start with B) and see how it works out...
Edit:
On memory usage: one can measure the peak memory usage via
/usr/bin/time -f "peak_used_memory:%M(in Kb)" python test.py
I ran my tests with N=2000 and got (python3.6+cython0.27.1):

                        peak memory usage
only slow               245Mb
only fast               245Mb
slow+fast, no check     402Mb
slow+fast + assert      576Mb

So there is 50Mb overhead, 200Mb used by either function, and an additional 176Mb for evaluating the assert. I can see the same behavior for other values of N as well.
So I would say there is no huge memory usage by cython.
This task is very probably (at least partly) memory-bound, so the parallelization will not help much. You should reduce the amount of memory loaded into the cache.
One possibility is not to use perm - after all, it also needs to be loaded into the cache. You could do this if:

- you can live with any row/col permutation in matrix sigma; then just sort P and use it;
- there are very few elements per row, so a linear search for every element would be OK;
- you do a binary search for every element (see the sketch below this list).

I guess you could win about 20-30% in the best case.
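For that binary-search variant, a sketch of the per-row lookup (assuming int32 CSR indices as above; the helper name is made up):

@cython.boundscheck(False)
cdef int csr_row_find(np.int32_t[:] indices, int lo, int hi, np.int32_t col) nogil:
    # Binary search for column `col` among the sorted column indices of one
    # CSR row, i.e. within indices[lo:hi); returns the position of the match
    # or -1 if the column is not stored in this row.
    cdef int mid
    while lo < hi:
        mid = (lo + hi) // 2
        if indices[mid] < col:
            lo = mid + 1
        elif indices[mid] > col:
            hi = mid
        else:
            return mid
    return -1

Each of the k lookups per row then costs O(log(nnz per row)) instead of a linear merge.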
Sometimes cython produces code which is not easy for the C compiler to optimize, and one often achieves better results writing directly in C and then wrapping it with python.
But I would do all that only if this operation is really, really the bottleneck of your program.
By the way, declaring

cdef np.int64_t[:,:] perm = np.argsort(P, axis=1)

means you will not need the additional copying.

Making a boolean array

I want to make a boolean numpy array in cython with the size of another numpy array, but it raises an error message:
CosmoTest.pyx
import numpy as np
cimport numpy as np
cimport cython
from libcpp cimport bool
x=np.array([[-0.3,1.2],[2.5,0.82],[0.61,-0.7]])
mask= np.ones_like(x,dtype=bool)
error:
mask= np.ones_like(x,dtype=bool)
^
------------------------------------------------------------
CosmoTest.pyx:318:39: 'bool' is not a constant, variable or function identifier
How should it be defined in cython?
Update:
cpdef np.ndarray arc(np.ndarray x):
    cdef np.ndarray[double, ndim=1, mode='c'] out = np.zeros_like(x)
    cdef np.ndarray[np.uint8_t, cast=True, ndim=1] mask = (x < 0.999).view(dtype=np.uint8)
    if mask.any():
        out[mask] = 0.5*np.log((1.+((1.-x[mask])/(x[mask]+1.))**0.5)/(1.-((1.-x[mask])/(x[mask]+1.))**0.5))/(1-x[mask]**2)**0.5
    cdef np.ndarray[np.uint8_t, cast=True, ndim=1] mask = (x > 1.001).view(dtype=np.uint8)
    if mask.any():
        out[mask] = np.arctan(((x[mask]-1.)/(x[mask]+1.))**0.5)/(x[mask]**2 - 1)**0.5
    cdef np.ndarray[np.uint8_t, cast=True, ndim=1] mask = ((x >= 0.999) & (x <= 1.001)).view(dtype=np.uint8)
    if mask.any():
        out[mask] = 5./6. - x[mask]/3.
    return out
Error Message:
Error compiling Cython file:
------------------------------------------------------------
...
if mask.any():
out[mask] = 0.5*np.log((1.+((1.-x[mask])/(x[mask]+1.))**0.5)/(1.-((1.-x[mask])/(x[mask]+1.))**0.5))/(1-x[mask]**2)**0.5
cdef np.ndarray[np.uint8_t,cast=True, ndim=1] mask = (x > 1.001).view(dtype=np.uint8)
if mask.any():
out[mask] = np.arctan(((x[mask]-1.)/(x[mask]+1.))**0.5)/(x[mask]**2 - 1)**0.5
^
------------------------------------------------------------
CosmoTest.pyx:9:55: local variable 'mask' referenced before assignment
If you change (the last line of) your code to

mask = np.ones_like(x, dtype=np.bool)

it will work (taking bool from numpy rather than trying to use the libcpp definition). However, actually statically typing boolean numpy arrays doesn't quite work currently (see Passing a numpy pointer (dtype=np.bool) to C++).
The best way forward currently is to statically type them as
def f(np.ndarray[dtype=np.int8_t, ndim=1] x):
    cdef np.ndarray[dtype=np.int8_t, ndim=1] y
    y = np.ones_like(x, dtype=np.int8)
    return y.view(dtype=np.bool) # returns as boolean array
Internally numpy uses an 8 bit integer to store a bool, and thus you can just use view to reinterpret the array without copying.
If you had a boolean array and wanted to call f you'd do
mask = np.array([True,False,True])
f(mask.view(dtype=np.int8))
You could always write a small wrapper function as your public interface to f to do that reinterpretation automatically.
It's more fiddly than it needs to be, but it is possible to work with.
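For example (a hypothetical wrapper around f from above):

def f_public(mask):
    # reinterpret the boolean input as int8 for the statically typed f;
    # f already returns its result viewed back as a boolean array
    return f(mask.view(dtype=np.int8))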
Addition in response to comments
The article I linked to suggested using cast=True:
cdef np.ndarray[np.uint8_t,cast=True] mask = (x > 0.01)
This also works fine. Written in my approach that would be
cdef np.ndarray[np.uint8_t] mask = (x > 0.01).view(dtype=np.uint8)
(i.e. no cast, but with a view). As far as I can tell there's no practical difference, so pick which one you think looks nicer.
And edited to respond to additional issues
The working code is below (I've checked that it compiles - I haven't checked to make sure it runs). You were getting compiler errors because you'd defined the type of mask multiple times. You're only allowed to use cdef once per variable per function, but having defined the type you can assign to it as often as you like.
cpdef np.ndarray arc(np.ndarray x):
    cdef np.ndarray[double, ndim=1, mode='c'] out = np.zeros_like(x)
    cdef np.ndarray[np.uint8_t, ndim=1] mask = (x < 0.999).view(dtype=np.uint8)
    if mask.any():
        out[mask] = 0.5*np.log((1.+((1.-x[mask])/(x[mask]+1.))**0.5)/(1.-((1.-x[mask])/(x[mask]+1.))**0.5))/(1-x[mask]**2)**0.5
    mask = (x > 1.001).view(dtype=np.uint8) # REMOVED cdef!
    if mask.any():
        out[mask] = np.arctan(((x[mask]-1.)/(x[mask]+1.))**0.5)/(x[mask]**2 - 1)**0.5
    mask = ((x >= 0.999) & (x <= 1.001)).view(dtype=np.uint8) # REMOVED cdef!
    if mask.any():
        out[mask] = 5./6. - x[mask]/3.
    return out
(I've also removed cast=True from the definition. This isn't important. You can either use that, or use view(dtype=np.uint8). You can use both if you like, but it's more typing!)

Apply 1D function on one axis of nd-array

What I want:
I want to apply a 1D function to an arbitrarily shaped ndarray, such that it modifies a certain axis. Similar to the axis argument in numpy.fft.fft.
Take the following example:
import numpy as np

def transf1d(f, x, y, out):
    """Transform `f(x)` to `g(y)`.

    This function is actually a C-function that is far more complicated
    and should not be modified. It only takes 1D arrays as parameters.
    """
    out[...] = (f[None,:]*np.exp(-1j*x[None,:]*y[:,None])).sum(-1)

def transf_all(F, x, y, axis=-1, out=None):
    """General N-D transform.

    Perform `transf1d` along the given `axis`.

    Given the following:
        F.shape == (2, 3, 100, 4, 5)
        x.shape == (100,)
        y.shape == (50,)
        axis == 2
    Then the output shape would be:
        out.shape == (2, 3, 50, 4, 5)

    This function should wrap `transf1d` such that it works on arbitrarily
    shaped (compatible) arrays `F`, and `out`.
    """
    if out is None:
        shape = list(np.shape(F))
        shape[axis] = np.size(y)
        out = np.empty(shape, dtype=complex) # allocate the output if not given
    for f, o in magic_iterator(F, out):
        # Given above shapes:
        #     f.shape == (100,)
        #     o.shape == (50,)
        transf1d(f, x, y, o)
    return out
The function transf1d takes a 1D ndarray f, and two more 1D arrays x and y. It performs a Fourier transform of f(x) from the x-axis to the y-axis, storing the result in the out argument.
Now I want to wrap this in a more general function transf_all that can take ndarrays of arbitrary shape along with an axis argument specifying along which axis to transform.
Notes:

- My code is actually written in Cython. Ideally, the magic_iterator would be fast in Cython.
- The function transf1d actually is a C-function that returns its output in the out argument. Hence, I couldn't get it to work with numpy.apply_along_axis.
- Because transf1d is actually a pretty complicated C-function, I cannot rewrite it to work on arbitrary arrays. I need to wrap it in a Cython function that deals with the additional dimensions.
- Note that the arrays x and y can differ in their lengths.
My question:
How can I do this? How can I iterate over arbitrary dimensions of an ndarray such that at each iteration I will get a 1D array containing the specified axis?
I had a look at nditer, but I'm not sure if that is actually the right tool for this job.
Cheers!
import numpy as np

def transf1d(f, x, y, out):
    """Transform `f(x)` to `g(y)`.

    This function is actually a C-function that is far more complicated
    and should not be modified. It only takes 1D arrays as parameters.
    """
    out[...] = (f[None,:]*np.exp(-1j*x[None,:]*y[:,None])).sum(-1)

def transf_all(F, x, y, axis=-1, out=None):
    """General N-D transform.

    Perform `transf1d` along the given `axis`.

    Given the following:
        F.shape == (2, 3, 100, 4, 5)
        x.shape == (100,)
        y.shape == (50,)
        axis == 2
    Then the output shape would be:
        out.shape == (2, 3, 50, 4, 5)

    This function should wrap `transf1d` such that it works on arbitrarily
    shaped (compatible) arrays `F`, and `out`.
    """
    def wrapper(f):
        """
        wrap transf1d for apply_along_axis compatibility
        that is, having a signature of F.shape[axis] -> out.shape[axis]
        """
        out = np.empty(len(y), dtype=complex) # complex, since transf1d writes complex values
        transf1d(f, x, y, out)
        return out
    return np.apply_along_axis(wrapper, axis, F)
I believe this should do what you want, although I haven't tested it. Note that the looping happening inside apply_along_axis has Python-level performance, though, so this only vectorizes the operation in terms of style, not in terms of performance. However, that is quite probably of no concern, assuming the decision to resort to external C code for the inner loop is justified by it being a nontrivial operation in the first place.
To answer your question:
If you really just want to iterate over all but a given axis, you can use:

import itertools

for s in itertools.product(*map(range, arr.shape[:axis] + arr.shape[axis+1:])):
    arr[s[:axis] + (slice(None),) + s[axis:]]

Maybe there's a more elegant way to do it, but this should work.
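Applied to your transform, that loop might look like this (hypothetical glue code, assuming a non-negative axis and transf1d as defined above):

import itertools
import numpy as np

def transf_all_loop(F, x, y, axis, out):
    # visit every 1D lane along `axis` and transform it into `out`
    other = F.shape[:axis] + F.shape[axis+1:]
    for s in itertools.product(*map(range, other)):
        idx = s[:axis] + (slice(None),) + s[axis:]
        transf1d(F[idx], x, y, out[idx])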
But, don't iterate:
For your problem, I would just rewrite your function to work on a given axis of an ndarray. I think this should work:
def transfnd(f, x, y, axis, out):
    s = list(f.shape)
    s.insert(axis, 1)
    yx = [y.size, x.size] + [1]*(f.ndim - axis - 1)
    out[...] = np.sum(f.reshape(*s)*np.exp(-1j*x[None,:]*y[:,None]).reshape(*yx), axis+1)
It's really just a generalization of your current implementation, but instead of inserting a new axis in F at the beginning, it inserts it at axis (there might be a better way to do this than with the list(shape) method, but that was all I could come up with). Finally, you have to add trailing new axes to your yx outer product, to match however many trailing indices you have in F.
I didn't really know how to test this, but the shapes all work out, so please test it and let me know whether it works.
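A quick way to exercise it against the apply_along_axis version above (hypothetical shapes; both functions are assumed to be defined as above):

import numpy as np

F = np.random.rand(2, 3, 8, 4)
x = np.linspace(0.0, 1.0, 8)
y = np.linspace(-1.0, 1.0, 5)
out = np.empty((2, 3, 5, 4), dtype=complex)
transfnd(F, x, y, 2, out)
# broadcasting gives (2, 3, 5, 8, 4); summing axis 3 yields (2, 3, 5, 4)
np.testing.assert_allclose(out, transf_all(F, x, y, axis=2))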
I found a way of iterating over all but one axis in Cython using the Numpy C-API (code down below). However, it's not pretty. Whether it's worth the effort depends on the inner function and the size of the data.
If anyone knows a more elegant way to do this in Cython, please let me know.
I compared to Eelco's solution: they run at a comparable speed for large arguments, while for smaller arguments the C-API solution is faster:
In [5]: y=linspace(-1,1,100);
In [6]: %timeit transf.apply_along(f, x, y, axis=1)
1 loops, best of 3: 5.28 s per loop
In [7]: %timeit transf.transfnd(f, x, y, axis=1)
1 loops, best of 3: 5.16 s per loop
As you can see, for this input both functions are roughly at the same speed.
In [8]: f=np.random.rand(10,20,50);x=linspace(0,1,20);y=linspace(-1,1,10);
In [9]: %timeit transf.apply_along(f, x, y, axis=1)
100 loops, best of 3: 15.1 ms per loop
In [10]: %timeit transf.transfnd(f, x, y, axis=1)
100 loops, best of 3: 8.55 ms per loop
However, for smaller input arrays the C-API approach is faster.
The code
#cython: boundscheck=False
#cython: wraparound=False
#cython: cdivision=True
import numpy as np
cimport numpy as np

np.import_array()

cdef extern from "complex.h":
    double complex cexp(double complex z) nogil

cdef void transf1d(double complex[:] f,
                   double[:] x,
                   double[:] y,
                   double complex[:] out,
                   int Nx,
                   int Ny) nogil:
    cdef int i, j
    for i in xrange(Ny):
        out[i] = 0
        for j in xrange(Nx):
            out[i] = out[i] + f[j]*cexp(-1j*x[j]*y[i])

def transfnd(F, x, y, axis=-1, out=None):
    # Make sure everything is a numpy array.
    F = np.asanyarray(F, dtype=complex)
    x = np.asanyarray(x, dtype=float)
    y = np.asanyarray(y, dtype=float)
    # Calculate absolute axis.
    cdef int ax = axis
    if ax < 0:
        ax = np.ndim(F) + ax
    # Calculate lengths of the axes `x`, and `y`.
    cdef int Nx = np.size(x), Ny = np.size(y)
    # Output array.
    if out is None:
        shape = list(np.shape(F))
        shape[axis] = Ny
        out = np.empty(shape, dtype=complex)
    else:
        out = np.asanyarray(out, dtype=complex)
    # Error check.
    assert np.shape(F)[axis] == Nx, \
        'Array length mismatch between `F`, and `x`!'
    assert np.shape(out)[axis] == Ny, \
        'Array length mismatch between `out`, and `y`!'
    f_shape = list(np.shape(F))
    o_shape = list(np.shape(out))
    f_shape[axis] = 0
    o_shape[axis] = 0
    assert f_shape == o_shape, 'Array shape mismatch between `F`, and `out`!'
    # Construct iterator over all but one axis.
    cdef np.flatiter itf = np.PyArray_IterAllButAxis(F, &ax)
    cdef np.flatiter ito = np.PyArray_IterAllButAxis(out, &ax)
    cdef int f_stride = F.strides[axis]
    cdef int o_stride = out.strides[axis]
    # Memoryviews to access one slice per iteration.
    cdef double complex[:] fdat
    cdef double complex[:] odat
    cdef double[:] xdat = x
    cdef double[:] ydat = y
    while np.PyArray_ITER_NOTDONE(itf):
        # View the current `x`, and `y` axes.
        fdat = <double complex[:Nx]> np.PyArray_ITER_DATA(itf)
        fdat.strides[0] = f_stride
        odat = <double complex[:Ny]> np.PyArray_ITER_DATA(ito)
        odat.strides[0] = o_stride
        # Perform the 1D-transformation on one slice.
        transf1d(fdat, xdat, ydat, odat, Nx, Ny)
        # Go to next step.
        np.PyArray_ITER_NEXT(itf)
        np.PyArray_ITER_NEXT(ito)
    return out

# For comparison
def apply_along(F, x, y, axis=-1):
    # Make sure everything is a numpy array.
    F = np.asanyarray(F, dtype=complex)
    x = np.asanyarray(x, dtype=float)
    y = np.asanyarray(y, dtype=float)
    # Calculate absolute axis.
    cdef int ax = axis
    if ax < 0:
        ax = np.ndim(F) + ax
    # Calculate lengths of the axes `x`, and `y`.
    cdef int Nx = np.size(x), Ny = np.size(y)
    # Error check.
    assert np.shape(F)[axis] == Nx, \
        'Array length mismatch between `F`, and `x`!'
    def wrapper(f):
        out = np.empty(Ny, complex)
        transf1d(f, x, y, out, Nx, Ny)
        return out
    return np.apply_along_axis(wrapper, axis, F)
Build with the following setup.py:

from distutils.core import setup
from Cython.Build import cythonize
import numpy as np

setup(
    name = 'transf',
    ext_modules = cythonize('transf.pyx'),
    include_dirs = [np.get_include()],
)
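Assuming the Cython source is saved as transf.pyx next to this setup.py, it builds in place with:

python setup.py build_ext --inplace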

Return variable length array in Numpy C-extension?

I have made some Numpy C-extensions before with great help from this site, but as far as I can see the returned parameters are all fixed length.
Is there any way to have a Numpy C-extension return a variable length numpy array instead?
You may find it easier to make numpy extensions in Cython using the Numpy C-API, which simplifies the process by allowing you to mix python and c objects. In that case there is little difficulty in making a variable length array; you can simply specify an array with an arbitrary shape.
The Cython numpy tutorial is probably the best source on this topic.
For example, here is a function I recently wrote:
import numpy as np
cimport numpy as np
cimport cython

dtype = np.double
ctypedef double dtype_t

np.import_ufunc()
np.import_array()

def ewma(a, d, axis):
    # Calculates the exponentially weighted moving average of array a
    # along axis using the parameter d.
    cdef void *args[1]
    cdef double weight[1]
    weight[0] = <double>np.exp(-d)
    args[0] = &weight[0]
    return apply_along_axis(&ewma_func, np.array(a, dtype = float),
                            np.double, np.double, False, &(args[0]), <int>axis)

cdef void ewma_func(int n, void* aData, int astride, void* oData, int ostride,
                    void** args):
    # Exponentially weighted moving average calculation function
    cdef double avg = 0.0
    cdef double weight = (<double*>(args[0]))[0]
    cdef int i = 0
    for i in range(n):
        avg = (<double*>((<char*>aData) + i * astride))[0]*weight + avg * (1.0 - weight)
        (<double*>((<char*>oData) + i * ostride))[0] = avg

ctypedef void (*func_1d)(int, void*, int, void*, int, void **)

cdef apply_along_axis(func_1d function, a, adtype, odtype, reduce, void** args,
                      int axis):
    # generic function for applying a cython function along a particular dimension
    oshape = list(a.shape)
    if reduce:
        oshape[axis] = 1
    out = np.empty(oshape, odtype)
    cdef np.flatiter ita, ito
    ita = np.PyArray_IterAllButAxis(a, &axis)
    ito = np.PyArray_IterAllButAxis(out, &axis)
    cdef int axis_length = a.shape[axis]
    cdef int a_axis_stride = a.strides[axis]
    cdef int o_axis_stride = out.strides[axis]
    if reduce:
        o_axis_stride = 0
    while np.PyArray_ITER_NOTDONE(ita):
        function(axis_length, np.PyArray_ITER_DATA(ita), a_axis_stride,
                 np.PyArray_ITER_DATA(ito), o_axis_stride, args)
        np.PyArray_ITER_NEXT(ita)
        np.PyArray_ITER_NEXT(ito)
    if reduce:
        oshape.pop(axis)
        out.shape = oshape
    return out
If this doesn't suit you, there is a function for making a new empty array with arbitrary shape (link).
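As a minimal Cython sketch of that idea - returning an array whose length is only known at runtime - using PyArray_EMPTY from numpy's Cython interface (the function name here is made up):

import numpy as np
cimport numpy as np

np.import_array()

def variable_length_result(int m):
    # Allocate and return a fresh 1D int64 array of runtime-determined
    # length m; numpy owns the memory, so nothing needs freeing by hand.
    cdef np.npy_intp dims[1]
    dims[0] = m
    return np.PyArray_EMPTY(1, &dims[0], np.NPY_INT64, 0)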
I am interpreting your question to mean "I have a function that takes a NumPy array of length n, but it will return another array of length m different from n." If that is the case, you will need to malloc a new C array in the extension, e.g.

int64_t *new_array = malloc(m * sizeof(int64_t)); // or whatever your data type is

then create a new NumPy array with that. This example assumes a 1D array:

npy_intp dims[1];
dims[0] = m;
PyArrayObject *out = (PyArrayObject *)PyArray_SimpleNewFromData(1,         // 1D array
                                                                dims,      // dimensions
                                                                NPY_INT64, // type
                                                                new_array);
PyArray_ENABLEFLAGS(out, NPY_ARRAY_OWNDATA);

Then return the new array. The important part here is to set the NPY_ARRAY_OWNDATA flag so that the memory you allocated is freed when the Python object is garbage collected.
