Define a pointer as a method of a class? - python

I am trying to speed up my Cython code. I came across this link, where the author describes how using pointers instead of NumPy arrays can improve the speed of Cython code. In my cosmology class the bottleneck is the Da function. I am not very familiar with pointers in C, so I would appreciate it if somebody could give me an idea:
Is it possible to define a method of a class so that it returns a pointer? For instance, in my case, can I convert np.ndarray[double, ndim=1] Da to something like double* Da?
from __future__ import division

import copy

import numpy as np

cimport cython
cimport numpy as np
from libc.math cimport fabs, sin, sinh
# Minimal declaration of the GSL callback descriptor from gsl/gsl_math.h.
cdef extern from "gsl/gsl_math.h":
    ctypedef struct gsl_function:
        # Integrand: takes the abscissa x and the opaque `params` pointer below.
        double (* function) (double x, void * params)
        # Opaque user data stored next to the callback.
        void * params
# Subset of gsl/gsl_integration.h that this module calls.
cdef extern from "gsl/gsl_integration.h":
    # Opaque workspace type; only ever handled through pointers.
    ctypedef struct gsl_integration_workspace
    # Allocation and release of a workspace; each alloc needs a matching free.
    gsl_integration_workspace * gsl_integration_workspace_alloc(size_t n)
    void gsl_integration_workspace_free(gsl_integration_workspace * w)
    # Integrates f over [a, b]; the estimate goes to *result and the error
    # estimate to *abserr.
    int gsl_integration_qags(const gsl_function * f, double a, double b, double epsabs, double epsrel, size_t limit, gsl_integration_workspace * workspace, double *result, double *abserr)
cdef double func_callback(double x, void* params):
    """GSL integrand trampoline: evaluate the angular kernel of the cosmology in ``params``."""
    # `params` is the cosmology instance that Da stored in F.params.
    cdef cosmology cosmo = <cosmology>params
    return cosmo.__angKernel(x)
cdef class cosmology(object):
    """Cosmological parameter set with expansion-rate and distance helpers.

    ``omega_c`` is derived in ``__init__`` as ``1 - omega_m - omega_l``; its
    sign selects the curvature correction applied in ``Da``.
    """
    cdef public double omega_m, omega_l, h, w, omega_r, G, v_c
    cdef object omega_c

    def __init__(self,double omega_m = 0.3, double omega_l = 0.7, double h = 0.7, double w = -1, double omega_r = 0., double G = std_G):
        self.omega_m = omega_m
        self.omega_l = omega_l
        self.omega_r = omega_r
        self.omega_c = (1. - omega_m - omega_l)
        self.h = h
        self.w = w
        self.G = G
        # NOTE(review): `std_G` and `v_c` are not defined in the visible
        # source; presumably module-level constants -- confirm.
        self.v_c = v_c

    def __copy__(self):
        """Return a new instance built from this one's constructor arguments."""
        return cosmology(omega_m = self.omega_m, omega_l = self.omega_l, h = self.h, w = self.w, omega_r = self.omega_r, G = self.G)

    property H0:
        def __get__(self):
            return 100*self.h #km/s/MPC

    cpdef double a(self, double z):
        """Return ``1/(1+z)`` for redshift ``z``."""
        return 1./(1.+z)

    cpdef double E(self, double a):
        """Return the square root of the density terms, each scaled by its power of ``a``."""
        return (self.omega_r*a**(-4) + self.omega_m*a**(-3) + self.omega_c*a**(-2) + self.omega_l)**0.5

    @cython.boundscheck(False)
    @cython.wraparound(False)
    @cython.nonecheck(False)
    cdef double __angKernel(self, double x):
        """Integration kernel for angular diameter distance computation.
        """
        return self.E(x**-1)**-1

    @cython.boundscheck(False)
    @cython.wraparound(False)
    @cython.nonecheck(False)
    cpdef np.ndarray[double, ndim=1] Da(self, np.ndarray[double, ndim=1] z, double z_ref=0):
        """Integrate the angular kernel for every redshift in ``z``.

        For each ``z[i]`` the kernel is integrated from ``z_ref + 1`` to
        ``z[i] + 1``, the curvature correction is applied when
        ``sqrt(|omega_c|) * integral > 0.01``, and the array is finally
        divided by ``1 + z``.

        Raises:
            ValueError: if any ``z[i]`` is negative or smaller than ``z_ref``.
            MemoryError: if the GSL workspace cannot be allocated.
        """
        cdef gsl_function F
        cdef double result = 3, error = 5
        cdef double rk, zs, omc
        cdef Py_ssize_t i, num
        cdef np.ndarray[double,ndim=1] d = np.ones_like(z, dtype=np.float64, order='C')
        cdef gsl_integration_workspace* w = gsl_integration_workspace_alloc(1000)
        if w == NULL:
            raise MemoryError("Could not allocate the GSL integration workspace")

        F.function = &func_callback
        F.params = <void*>self
        omc = self.omega_c
        # The curvature scale does not depend on the redshift: compute it once.
        rk = (fabs(omc))**0.5
        num = len(z)
        try:
            for i in range(num):
                zs = z[i]
                if zs < 0:
                    raise ValueError("Redshift z must not be negative")
                if zs < z_ref:
                    raise ValueError("Redshift z must not be smaller than the reference redshift")
                gsl_integration_qags(&F, z_ref+1, zs+1, 0, 1e-7, 1000, w, &result, &error)
                d[i] = result
                # check for curvature
                if (rk*d[i] > 0.01):
                    if omc > 0:
                        d[i] = sinh(rk*d[i])/rk
                    elif omc < 0:
                        d[i] = sin(rk*d[i])/rk
        finally:
            # Release the workspace on every exit path, including the
            # ValueError raised for an invalid redshift.
            gsl_integration_workspace_free(w)
        return d/(1.+z)
Thanks in advance.

It has been a while since I developed in cython, but if memory serves me I believe you could declare the function as follows:
# Function-pointer type: a variable of type `Da` must be assigned a function
# with this signature before it can be called.
ctypedef double* ( * Da)(double* z, double z_ref, int length)
This function will return an array of type double and allow you to pass an array of doubles in as z. This is a function pointer, so maybe not quite what you want.
# Plain function type (not a pointer) with the same parameters and return type.
ctypedef double* Da(double* z, double z_ref, int length)
This will accomplish the same thing, but as a regular function rather than a function pointer. The difference between a function and a function pointer is that a function pointer has to be assigned a function to point to before it can be called.

Related

Matrix multiplication of a 2d numpy array to cpp using ctypes

What is the correct way to do matrix multiplication using ctypes?
In my current implementation, passing the data back and forth consumes a lot of time. Is there any way to do it optimally, by passing the array address and getting a pointer in return, instead of generating the entire array using the .contents method?
cpp_function.cpp
compile using g++ -shared -fPIC cpp_function.cpp -o cpp_function.so
#include <iostream>
extern "C" {
// Multiply two row-major matrices: (a1_h x a1_w) * (a2_h x a2_w).
//
// Returns a newly allocated row-major (a1_h x a2_w) array that the caller
// owns and must release with free_matrix(). Returns nullptr when an input
// pointer is null or the inner dimensions differ (a1_w != a2_h).
//
// `size` is the element count the caller intends to read. The buffer holds
// at least that many elements, and never fewer than a1_h * a2_w, so existing
// callers that read `size` doubles stay in bounds.
double* mult_matrix(double *a1, double *a2, size_t a1_h, size_t a1_w,
                    size_t a2_h, size_t a2_w, int size)
{
    if (a1 == nullptr || a2 == nullptr || a1_w != a2_h) {
        return nullptr;
    }

    const size_t needed = a1_h * a2_w;
    const size_t requested = size > 0 ? static_cast<size_t>(size) : 0;
    const size_t count = requested > needed ? requested : needed;

    // Value-initialised so any slack beyond `needed` reads as 0.0.
    double* ret_arr = new double[count]();

    for (size_t i = 0; i < a1_h; i++) {
        for (size_t j = 0; j < a2_w; j++) {
            double val = 0;
            for (size_t k = 0; k < a1_w; k++) {
                // A row-major stride is the row *width* of each matrix.
                val += a1[i * a1_w + k] * a2[k * a2_w + j];
            }
            ret_arr[i * a2_w + j] = val;
        }
    }
    return ret_arr;
}

// Release a buffer previously returned by mult_matrix(). Null is accepted.
void free_matrix(double* matrix)
{
    delete[] matrix;
}
}
Python file to call the so file
main.py
import ctypes
import numpy
from time import time
libmatmult = ctypes.CDLL("./cpp_function.so")

# Both operands must be C-contiguous 2-D float64 arrays; ctypes rejects
# anything else before the call reaches the library.
ND_POINTER_1 = numpy.ctypeslib.ndpointer(dtype=numpy.float64,
                                         ndim=2,
                                         flags="C")
ND_POINTER_2 = numpy.ctypeslib.ndpointer(dtype=numpy.float64,
                                         ndim=2,
                                         flags="C")

# One entry per C parameter. Arguments without an entry are converted with
# ctypes' default rule (Python int -> C int), which does not match the
# size_t parameters of mult_matrix.
libmatmult.mult_matrix.argtypes = [
    ND_POINTER_1,     # a1
    ND_POINTER_2,     # a2
    ctypes.c_size_t,  # a1_h
    ctypes.c_size_t,  # a1_w
    ctypes.c_size_t,  # a2_h
    ctypes.c_size_t,  # a2_w
    ctypes.c_int,     # size
]
def mult_matrix_cpp(a,b):
    """Multiply ``a`` by ``b`` in the C++ library and return the product as a flat list.

    The library result is read through a ctypes array of
    ``a.shape[0] * a.shape[1]`` doubles and copied element by element.
    """
    n_out = a.shape[0] * a.shape[1]
    libmatmult.mult_matrix.restype = ctypes.POINTER(ctypes.c_double * n_out)
    result_ptr = libmatmult.mult_matrix(a, b, *a.shape, *b.shape, n_out)
    # Copying every element into a Python list is the time-consuming step.
    return list(result_ptr.contents)
# Benchmark inputs: two random matrices of identical square shape.
size_a = (300,300)
size_b = size_a
a = numpy.random.uniform(low=1, high=255, size=size_a)
b = numpy.random.uniform(low=1, high=255, size=size_b)
# Time the ctypes path: the library call plus the copy into a Python list.
t2 = time()
out_cpp = mult_matrix_cpp(a,b)
print("cpp time taken:{:.2f} ms".format((time() - t2) * 1000))
# Rebuild a 2-D array from the flat list; this runs after the timing print,
# so it is not part of the figure reported above.
out_cpp = numpy.array(out_cpp).reshape(size_a[0], size_a[1])
# Reference timing with NumPy's own matrix product.
t3 = time()
out_np = numpy.dot(a,b)
# print(out_np)
print("Numpy dot() time taken:{:.2f} ms".format((time() - t3) * 1000))
This solution works, but it is time-consuming. Is there any way to make it faster?
One reason for the time consumption is not using an ndpointer for the return value and copying it into a Python list. Instead use the following restype. You won't need the later reshape as well. But take the commenters' advice and don't reinvent the wheel.
def mult_matrix_cpp(a, b):
    """Multiply ``a`` by ``b`` in the C++ library and return the product as a 2-D array.

    Declaring the return type as an ``ndpointer`` makes ctypes wrap the
    returned buffer directly, so no Python list or later reshape is needed.
    The array is a view of memory allocated by the library; nothing in this
    function releases it.
    """
    # A product has the rows of `a` and the columns of `b`.
    out_shape = (a.shape[0], b.shape[1])
    libmatmult.mult_matrix.restype = numpy.ctypeslib.ndpointer(
        dtype=numpy.float64, ndim=2, shape=out_shape, flags="C"
    )
    return libmatmult.mult_matrix(a, b, *a.shape, *b.shape, out_shape[0] * out_shape[1])
use restype
def mult_matrix_cpp(a, b):
    """Return the product of ``a`` and ``b`` computed by the C++ library as a 2-D array.

    The ``ndpointer`` restype lets ctypes hand back an array over the
    library's buffer without copying. That buffer is allocated by the
    library and is not freed here.
    """
    rows, cols = a.shape[0], b.shape[1]
    libmatmult.mult_matrix.restype = numpy.ctypeslib.ndpointer(
        dtype=numpy.float64, ndim=2, shape=(rows, cols), flags="C"
    )
    return libmatmult.mult_matrix(a, b, *a.shape, *b.shape, rows * cols)

Cython boundscheck & nonecheck

I have been struggling for several days with a script that gives me unexpected results.
Today I realized that a Cython function does not give the same results depending on whether or not I use the boundscheck and nonecheck decorators!
Here is an example :
import numpy as np
cimport numpy as np
cimport cython
# Module-level C array holding the four coefficients read by cpc1 and cpc2.
cdef double[4] c
c[0] = 0.1
c[1] = 0.2
c[2] = 0.3
c[3] = 0.4
def cp1(double[:,::1] u, double[:,::1] K, int ixmin, int ixmax, int izmin, int izmax):
    """Python-callable entry point: forward every argument unchanged to ``cpc1``."""
    cpc1(u, K, ixmin, ixmax, izmin, izmax)
def cp2(double[:,::1] u, double[:,::1] K, int ixmin, int ixmax, int izmin, int izmax):
    """Python-callable entry point: forward every argument unchanged to ``cpc2``."""
    cpc2(u, K, ixmin, ixmax, izmin, izmax)
# Variant compiled with bounds and None checks disabled.
@cython.boundscheck(False)
@cython.nonecheck(False)
cdef void cpc1(double[:,::1] u, double[:,::1] K, int ixmin, int ixmax, int izmin, int izmax) nogil:
    # Writes K[ix, iz] for ix in [ixmin+2, ixmax-1) and iz in [izmin, izmax)
    # from the three x-neighbours u[ix-1], u[ix], u[ix+1] and the module-level
    # coefficients c. The expressions are kept in their original order because
    # floating-point results depend on evaluation order.
    cdef Py_ssize_t ix, iz
    cdef double dpu, dmu
    for ix in range(ixmin+2, ixmax-1):
        for iz in range(izmin, izmax):
            dpu = c[0]*u[ix-1, iz] + c[1]*u[ix, iz] + c[2]*u[ix+1, iz]
            dmu = c[1]*u[ix-1, iz] + c[2]*u[ix, iz] + c[3]*u[ix+1, iz]
            K[ix, iz] = 0.5*dpu - 0.5*dmu
# Variant compiled with bounds and None checks enabled; the body is the same
# as cpc1 so that only the directives differ.
@cython.boundscheck(True)
@cython.nonecheck(True)
cdef void cpc2(double[:,::1] u, double[:,::1] K, int ixmin, int ixmax, int izmin, int izmax) nogil:
    # Writes K[ix, iz] for ix in [ixmin+2, ixmax-1) and iz in [izmin, izmax)
    # from the three x-neighbours u[ix-1], u[ix], u[ix+1] and the module-level
    # coefficients c.
    cdef Py_ssize_t ix, iz
    cdef double dpu, dmu
    for ix in range(ixmin+2, ixmax-1):
        for iz in range(izmin, izmax):
            dpu = c[0]*u[ix-1, iz] + c[1]*u[ix, iz] + c[2]*u[ix+1, iz]
            dmu = c[1]*u[ix-1, iz] + c[2]*u[ix, iz] + c[3]*u[ix+1, iz]
            K[ix, iz] = 0.5*dpu - 0.5*dmu
If I run these lines :
# One shared random input and a separate zero-initialised output per variant.
u = np.random.rand(256, 256)
K1 = np.zeros_like(u)
K2 = np.zeros_like(u)
# Both variants are called with the same index window.
cp1(u, K1, 100, 150, 100, 150)
cp2(u, K2, 100, 150, 100, 150)
the instruction np.all(K1 == K2) returns False. The difference between the two arrays is close to machine precision (about 5e-17), but calling this function thousands of times is enough to give me large differences in the final results.
Now, if I remove the nogil instructions in cpc1 and cpc2 and replace cdef double[4] c by c = np.zeros(4), both cp1 and cp2 return the same results. The problem is that I lose about 50% of the performance by using an ndarray instead of a C array.
I think the problem comes from the precision of the C array, but why do the values of boundscheck and nonecheck have an impact on the results in this case (no out-of-bounds access, no None, ...)?
Is there a way to solve this ?
EDIT
As highlighted by ead, if I compile the code without -O3 -ffast-math -march=native, both cp1 and cp2 return the same results! But at the cost of doubling the execution time! I can more or less understand why -O3 and -ffast-math lead to unexpected results by doing aggressive optimizations, but I don't understand why -march=native also breaks the code.
Is there a way to preserve both performance and precision ?

Cython optimization of the code

I'm struggling to boost the performance of my python particle tracking code with Cython.
Here's my pure Python code:
from scipy.integrate import odeint
import numpy as np
from numpy import sqrt, pi, sin, cos
from time import time as Time
import multiprocessing as mp
from functools import partial
# Speed of light; the value 299792458 corresponds to m/s.
cLight = 299792458.
# Length of the state vector X and of its derivative dXds.
Dim = 6
class Integrator:
    """Equations of motion and ODE driver for one particle in a ring."""

    def __init__(self, ring):
        self.ring = ring

    def equations(self, X, s):
        """Return dX/ds for the state vector ``X`` at path position ``s``.

        ``X[0]`` and ``X[2]`` are passed to the ring as transverse positions
        and ``X[4]`` as the time. ``X[1]``, ``X[3]`` and ``X[5]`` enter the
        longitudinal momentum ``p_s``.
        # NOTE(review): X[1], X[3] look like transverse momenta and X[5] like
        # the total energy -- confirm against the caller.
        """
        ring = self.ring
        particle = ring.particle

        E, B = ring.getEMField( [X[0], X[2], s], X[4] )
        h = 1 + X[0]/ring.ringRadius
        p_s = np.sqrt(X[5]**2 - particle.mass**2 - X[1]**2 - X[3]**2)
        dtds = h*X[5]/p_s
        beta = np.array( [X[1], X[3], p_s] ) / X[5]

        dXds = np.zeros(Dim)
        dXds[0] = dtds*beta[0]
        dXds[2] = dtds*beta[1]
        # dXds[1] and dXds[3] reuse dXds[2] and dXds[0], so those two must be
        # assigned first.
        dXds[1] = p_s/ring.ringRadius + particle.charge*(dtds*E[0] + dXds[2]*B[2] - h*B[1])
        dXds[3] = particle.charge*(dtds*E[1] + h*B[0] - dXds[0]*B[2])
        dXds[4] = dtds
        dXds[5] = particle.charge*(dXds[0]*E[0] + dXds[2]*E[1] + h*E[2])
        return dXds

    def odeSolve(self, X0, sRange):
        """Integrate ``equations`` from ``X0`` over ``sRange`` with ``odeint``."""
        return odeint(self.equations, X0, sRange)
class Ring:
    """Ring of radius 7.112 with a constant vertical B field and sectored E field."""

    def __init__(self, particle):
        self.particle = particle
        self.ringRadius = 7.112
        self.magicB0 = self.particle.magicMomentum/self.ringRadius

    def getEMField(self, pos, time):
        """Return ``(E, B)`` as two length-3 arrays at ``pos = (x, y, s)``.

        ``B`` is always ``[0, magicB0, 0]``. ``E`` is non-zero only when the
        azimuth derived from ``s`` lies in one of the angular windows that
        repeat every 90 degrees and both ``x`` and ``y`` are strictly inside
        (-0.05, 0.05). ``time`` is not used.
        """
        x, y, s = pos
        theta = (s/self.ringRadius*180/pi) % 360
        r = sqrt(x**2 + y**2)
        if r == 0:
            arg = 0
        else:
            arg = np.angle( complex(x/r, y/r) )
        rn = r/0.045
        k2 = 37*24e3
        k10 = -4*24e3
        B = np.array( [ 0, self.magicB0, 0 ] )

        inside_aperture = -0.05 < x < 0.05 and -0.05 < y < 0.05
        inside_window = any(
            21.9+90*i < theta < 34.9+90*i or 38.9+90*i < theta < 64.9+90*i
            for i in range(4)
        )
        if inside_window and inside_aperture:
            E = np.array( [ k2*x/0.045 + k10*rn**9*cos(9*arg), -k2*y/0.045 -k10*rn**9*sin(9*arg), 0] )
        else:
            E = np.zeros(3)
        return E, B
class Particle:
    """Particle constants plus the derived "magic" momentum, energy, gamma and beta."""

    def __init__(self):
        # NOTE(review): the mass value looks like eV -- confirm the unit system.
        mass = 105.65837e6
        gm2 = 0.001165921
        momentum = mass/sqrt(gm2)
        energy = sqrt(momentum**2 + mass**2)
        gamma = energy/mass

        self.mass = mass
        self.charge = 1.
        self.gm2 = gm2
        self.magicMomentum = momentum
        self.magicEnergy = energy
        self.magicGamma = gamma
        self.magicBeta = momentum/(gamma*mass)
def runSimulation(nParticles, tEnd):
    """Track ``nParticles`` particles in parallel and return the elapsed wall time.

    Each particle starts with a random first coordinate in
    ``[-45e-3, 45e-3)``, the magic energy in the last slot and zeros
    elsewhere. The elapsed time covers creating the worker pool and
    integrating every particle; it is printed and returned in seconds.
    """
    particle = Particle()
    ring = Ring(particle)
    integrator = Integrator(ring)
    Xs = np.array( [ np.array( [45e-3*(np.random.rand()-0.5)*2, 0, 0, 0, 0, particle.magicEnergy] ) for _ in range(nParticles) ] )
    sRange = np.arange(0, tEnd, 1e-9)*particle.magicBeta*cLight
    ode = partial(integrator.odeSolve, sRange=sRange)
    t1 = Time()
    # The context manager shuts the worker processes down even if map() raises.
    with mp.Pool() as pool:
        # The result array is built inside the timed region, as before.
        sol = np.array(pool.map(ode, Xs))
        t2 = Time()
    print ("%.3f sec" %(t2-t1))
    return t2-t1
Obviously, the most time-consuming process is integrating the ODE, defined as odeSolve() and equations() in class Integrator. Also, getEMField() method in class Ring is called as much as equations() method during the solving process.
I tried to get significant amount of speed up (at least 10x~20x) using Cython, but I only got ~1.5x level of speed up by the following Cython script:
import cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt, pi, sin, cos
from scipy.integrate import odeint
from time import time as Time
import multiprocessing as mp
from functools import partial
# Speed of light; the value 299792458 corresponds to m/s.
cdef double cLight = 299792458.
# Length of the state vector X and of its derivative dXds.
cdef int Dim = 6
@cython.boundscheck(False)
cdef class Integrator:
    """Equations of motion and ODE driver for one particle in a Ring."""
    cdef Ring ring

    def __init__(self, ring):
        self.ring = ring

    cpdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] equations(self,
            np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] X,
            double s):
        """Return dX/ds for the state vector ``X`` at path position ``s``.

        ``X[0]`` and ``X[2]`` are passed to the ring as transverse positions
        and ``X[4]`` as the time. ``X[1]``, ``X[3]`` and ``X[5]`` enter the
        longitudinal momentum ``p_s``.
        """
        cdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] dXds = np.zeros(Dim)
        cdef double h, p_s, dtds
        cdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] beta, E, B
        E, B = self.ring.getEMField( [X[0], X[2], s], X[4] )
        h = 1 + X[0]/self.ring.ringRadius
        p_s = np.sqrt(X[5]*X[5] - self.ring.particle.mass*self.ring.particle.mass - X[1]*X[1] - X[3]*X[3])
        dtds = h*X[5]/p_s
        beta = np.array( [X[1], X[3], p_s] ) / X[5]
        dXds[0] = dtds*beta[0]
        dXds[2] = dtds*beta[1]
        # dXds[1] and dXds[3] reuse dXds[2] and dXds[0], so those two must be
        # assigned first.
        dXds[1] = p_s/self.ring.ringRadius + self.ring.particle.charge*(dtds*E[0] + dXds[2]*B[2] - h*B[1])
        dXds[3] = self.ring.particle.charge*(dtds*E[1] + h*B[0] - dXds[0]*B[2])
        dXds[4] = dtds
        dXds[5] = self.ring.particle.charge*(dXds[0]*E[0] + dXds[2]*E[1] + h*E[2])
        return dXds

    cpdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] odeSolve(self,
            np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] X0,
            np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] sRange):
        """Integrate ``equations`` from ``X0`` over ``sRange`` with ``odeint``."""
        sol = odeint(self.equations, X0, sRange)
        return sol
@cython.boundscheck(False)
cdef class Ring:
    """Ring of radius 7.112 with a constant vertical B field and sectored E field."""
    cdef Particle particle
    cdef double ringRadius
    cdef double magicB0

    def __init__(self, particle):
        self.particle = particle
        self.ringRadius = 7.112
        self.magicB0 = self.particle.magicMomentum/self.ringRadius

    cpdef tuple getEMField(self,
            list pos,
            double time):
        """Return ``(E, B)`` as two length-3 arrays at ``pos = [x, y, s]``.

        ``B`` is always ``[0, magicB0, 0]``. ``E`` is non-zero only when the
        azimuth derived from ``s`` lies in one of the angular windows that
        repeat every 90 degrees and both ``x`` and ``y`` are strictly inside
        (-0.05, 0.05). ``time`` is not used.
        """
        cdef double x, y, s
        cdef double theta, r, rn, arg, k2, k10
        cdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] E, B
        x, y, s = pos
        theta = (s/self.ringRadius*180/pi) % 360
        r = sqrt(x*x + y*y)
        arg = 0 if r == 0 else np.angle( complex(x/r, y/r) )
        rn = r/0.045
        k2 = 37*24e3
        k10 = -4*24e3
        E = np.zeros(3)
        B = np.array( [ 0, self.magicB0, 0 ] )
        for i in range(4):
            if ((21.9+90*i < theta < 34.9+90*i or 38.9+90*i < theta < 64.9+90*i) and (-0.05 < x < 0.05 and -0.05 < y < 0.05)):
                E = np.array( [ k2*x/0.045 + k10*rn**9*cos(9*arg), -k2*y/0.045 -k10*rn**9*sin(9*arg), 0] )
                #E = np.array( [ k2*x/0.045, -k2*y/0.045, 0] )
                # The field does not depend on i, so the first match is enough.
                break
        return E, B
cdef class Particle:
    """Particle constants plus the derived "magic" momentum, energy, gamma and beta."""
    cdef double mass
    cdef double charge
    cdef double gm2
    cdef double magicMomentum
    cdef double magicEnergy
    cdef double magicGamma
    cdef double magicBeta

    def __init__(self):
        # Compute the derived quantities on locals first, then publish them.
        cdef double mass = 105.65837e6
        cdef double gm2 = 0.001165921
        cdef double momentum = mass/sqrt(gm2)
        cdef double energy = sqrt(momentum**2 + mass**2)
        cdef double gamma = energy/mass

        self.mass = mass
        self.charge = 1.
        self.gm2 = gm2
        self.magicMomentum = momentum
        self.magicEnergy = energy
        self.magicGamma = gamma
        self.magicBeta = momentum/(gamma*mass)
def runSimulation(nParticles, tEnd):
    """Track ``nParticles`` particles in parallel and return the elapsed wall time.

    Each particle starts with a random first coordinate in
    ``[-45e-3, 45e-3)``, the magic energy in the last slot and zeros
    elsewhere. The elapsed time covers creating the worker pool and
    integrating every particle; it is printed and returned in seconds.
    """
    particle = Particle()
    ring = Ring(particle)
    integrator = Integrator(ring)
    Xs = np.array( [ np.array( [45e-3*(np.random.rand()-0.5)*2, 0, 0, 0, 0, particle.magicEnergy] ) for _ in range(nParticles) ] )
    sRange = np.arange(0, tEnd, 1e-9)*particle.magicBeta*cLight
    ode = partial(integrator.odeSolve, sRange=sRange)
    t1 = Time()
    # The context manager shuts the worker processes down even if map() raises.
    with mp.Pool() as pool:
        # The result array is built inside the timed region, as before.
        sol = np.array(pool.map(ode, Xs))
        t2 = Time()
    print ("%.3f sec" %(t2-t1))
    return t2-t1
What should I do to get the maximum effect from Cython?
(I tried Numba instead of Cython, and the performance gain from Numba was actually enormous (around a 20x speedup). But I had an extremely hard time using Numba with Python class instances, so I decided to use Cython instead.)
For reference, the following is cython annotation on its compilation:
This is a very incomplete answer since I haven't profiled or timed anything or even checked that it gives the same answer. However here are some suggestions that reduce the amount of Python code that Cython generates:
Add the @cython.cdivision(True) compilation directive. This means that a ZeroDivisionError won't be raised on float division and you'll get a NaN value instead. (Only do this if you don't want the error to be raised).
Change p_s = np.sqrt(...) to p_s = sqrt(...). This removes a numpy call that only operates on a single value. You seem to have done this elsewhere so I don't know why you missed this line.
Where possible use fixed size C arrays instead of numpy arrays:
# Fixed-size C array in place of a NumPy array: no np.zeros call and no
# type check on assignment.
cdef double beta[3]
# ...
beta[0] = X[1]/X[5]
beta[1] = X[3]/X[5]
beta[2] = p_s/X[5]
You can do this when the size is known at compile time (and fairly small) and when you don't want to return it. This avoids a call to np.zeros and some subsequent type-checking to assign it to the typed numpy array. I think beta is the only place you can do this.
np.angle( complex(x/r, y/r) ) can be replaced by atan2(y/r, x/r) (using atan2 from libc.math). You can also lose the division by r.
cdef int i helps make your for loop faster in getEMField (Cython is often good at automatically picking up the types of loop variables but seems to have failed here)
I suspect it's quicker to assign E element-by-element than as a whole array:
# Fill the existing E in place instead of building a new array; E[2] is not
# assigned here.
E[0] = k2*x/0.045 + k10*rn**9*cos(9*arg)
E[1] = -k2*y/0.045 -k10*rn**9*sin(9*arg)
There isn't much value in specifying types like list and tuple and it may actually make the code slightly slower (because it will waste time checking the types).
A bigger change would be to pass E and B into GetEMField as pointers rather than allocating them with np.zeros. This would let you allocate them as static C arrays in equations (cdef double E[3]). The downside is that GetEMField would have to be cdef, so it would no longer be callable from Python (but you could add a Python-callable wrapper function too if you like).

Nested loop optimization in cython: is there a faster way to setup this code example?

As part of a large piece of code, I need to call the (simplified) function example (pasted below) multiple (hundreds of thousands of) times, with different arguments. As such, I need this module to run quickly.
The main issue with the module seems to be the multiple nested loops. However, I am not sure if there is actually unnecessary overhead associated with these loops (as written), or if the code is really as fast it can get.
In general, when dealing with multiple nested for loops in cython, are there loop optimization techniques that can be used to reduce overhead and speed up the code? Do any of these techniques apply to the example code pasted below?
I am also compiling the cython with extra_compile_args=["-ffast-math",'-O3'], though this doesn't seem to make a huge difference.
If this code really can't get any faster in cython (which I hope is not the case), would there be any advantage to writing all or part of this module in C or Fortran?
import numpy as np
import math
cimport numpy as np
cimport cython
# np.float was only an alias of the builtin float and was removed in
# NumPy 1.24; float64 is the dtype it resolved to.
DTYPE = np.float64
ctypedef np.float64_t DTYPE_t
# C math functions declared nogil so they can be called without the GIL.
cdef extern from "math.h":
    double log(double x) nogil
    double exp(double x) nogil
    double pow(double x, double y) nogil
def example(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc = 1000.0):
    """Python-callable entry point: forward every argument to ``example_int`` and return its result."""
    cdef double result = example_int(xbg_PSF_compressed, theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data, Sc)
    return result
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):
    # Returns the sum over pixels p of log(nu_mat[data[p], p]).
    #
    # Stage 1 accumulates, over every entry of f_ary, the per-pixel sums
    # x_m_sum[p] and the per-count terms x_m_ary[k, p].
    # Stage 2 builds nu_mat[k, p] by recursion on k from those terms.
    # theta supplies (A, n1, n2, Sb) in that order.
    cdef int k_max = np.max(data) + 1
    # Memoryview reads already yield C doubles; no conversion call is needed.
    cdef double A = theta[0]
    cdef double n1 = theta[1]
    cdef double n2 = theta[2]
    cdef double Sb = theta[3]
    cdef int npixROI = len(xbg_PSF_compressed)
    cdef double f2 = 0.0
    cdef double df_rho_div_f2 = 0.0
    cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=np.float64)
    cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=np.float64)
    cdef double[:,::1] x_m_ary_f = np.zeros((k_max + 1, npixROI), dtype=np.float64)
    cdef double[::1] x_m_sum_f = np.zeros(npixROI, dtype=np.float64)
    cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=np.float64)
    cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=np.float64)
    cdef double[::1] f0_ary = np.zeros(npixROI, dtype=np.float64)
    cdef double[::1] f1_ary = np.zeros(npixROI, dtype=np.float64)
    cdef double[:,::1] nu_mat = np.zeros((k_max+1, npixROI), dtype=np.float64)
    cdef Py_ssize_t f_index, p, k, n
    cdef double term1 = 0.0
    cdef double second_1_a = 0.0
    cdef double second_1_b = 0.0
    cdef double second_1_c = 0.0
    cdef double second_1_d = 0.0
    cdef double ll = 0.

    #calculations for PS
    for f_index in range(len(f_ary)):
        f2 = f_ary[f_index]
        df_rho_div_f2 = df_rho_div_f_ary[f_index]
        g1_ary_f = np.random.random(k_max+1)
        g2_ary_f = np.random.random(k_max+1)
        term1 = (A * Sb * f2) \
                * (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
                - (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
        second_1_a = A * pow(Sb * f2, n1)
        second_1_b = A * pow(Sb * f2, n2)
        for p in range(npixROI):
            x_m_sum_f[p] = term1 * PS_dist_compressed[p]
            x_m_sum[p] += df_rho_div_f2*x_m_sum_f[p]
            second_1_c = second_1_a * PS_dist_compressed[p]
            second_1_d = second_1_b * PS_dist_compressed[p]
            for k in range(data[p]+1):
                x_m_ary_f[k,p] = second_1_c * g1_ary_f[k] + second_1_d * g2_ary_f[k]
                x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f[k,p]

    for p in range(npixROI):
        f0_ary[p] = -(xbg_PSF_compressed[p] + x_m_sum[p])
        f1_ary[p] = (xbg_PSF_compressed[p] + x_m_ary[1,p])
        nu_mat[0,p] = exp(f0_ary[p])
        nu_mat[1,p] = nu_mat[0,p] * f1_ary[p]
        for k in range(2,data[p]+1):
            for n in range(0, k - 1):
                nu_mat[k,p] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n,p]
            # The n = k-1 term uses f1_ary instead of x_m_ary[1, p].
            nu_mat[k,p] += f1_ary[p] * nu_mat[k-1,p] / float(k)
        ll+=log( nu_mat[data[p],p])

    # Replace a non-finite result by a large negative sentinel. Note the
    # precedence: this is -(10.1**10.).
    if math.isnan(ll) or math.isinf(ll):
        ll = -10.1**10.
    return ll
For reference, when trying to run this code, example arguments are
# Example inputs for example(). Every array argument must support the buffer
# protocol, so theta is built as a float64 array rather than a plain list.
f_ary = np.array([0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95])
df_rho_div_f_ary = np.array([24.27277928, 2.83852471, 1.14224844, 0.61687863, 0.39948536,
                             0.30138642, 0.24715899, 0.22077999, 0.21594814, 0.19035121])
theta = np.array([.002, 3.01, 0.01, 10.013])
n_p = 1000
# int32 matches the int[::1] signature of the `data` parameter.
data = np.random.randint(1, 400, n_p).astype(dtype='int32')
k_max = int(np.max(data)) + 1
xbg_PSF_compressed = np.ones(n_p)*20
PS_dist_compressed = np.ones(n_p)
and the example may then be called as example(xbg_PSF_compressed, theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data). For timing, I find that this example evaluates in ~10 loops, best of 3: 147 ms per loop. Since the full code takes hours to run, any decrease in this run time would make a big overall difference in performance.
Calling cython -a on your code shows that almost all the relevant parts run in pure C, so there's not much to gain here.
Still, you're overusing arrays where a scalar would be enough, and you're using matrices where a 1D array would be enough. Doing this optimization removes a lot of memory accesses, as showcased here:
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):
    # Returns the sum over pixels p of log(nu_mat[data[p]]).
    #
    # Per-pixel intermediates are scalars, and nu_mat is a single 1-D scratch
    # row reused for every pixel instead of a (k_max+1, npixROI) matrix.
    # theta supplies (A, n1, n2, Sb) in that order.
    cdef int k_max = np.max(data) + 1
    # Memoryview reads already yield C doubles; no conversion call is needed.
    cdef double A = theta[0]
    cdef double n1 = theta[1]
    cdef double n2 = theta[2]
    cdef double Sb = theta[3]
    cdef int npixROI = len(xbg_PSF_compressed)
    cdef double f2 = 0.0
    cdef double df_rho_div_f2 = 0.0
    cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=np.float64)
    cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=np.float64)
    cdef double x_m_ary_f
    cdef double x_m_sum_f
    cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=np.float64)
    cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=np.float64)
    cdef double[::1] nu_mat = np.zeros((k_max+1), dtype=np.float64)
    cdef Py_ssize_t f_index, p, k, n
    cdef double term1 = 0.0
    cdef double second_1_a = 0.0
    cdef double second_1_b = 0.0
    cdef double second_1_c = 0.0
    cdef double second_1_d = 0.0
    cdef double f0_ary
    cdef double f1_ary
    cdef double nu_k
    cdef double ll = 0.

    #calculations for PS
    for f_index in range(len(f_ary)):
        f2 = f_ary[f_index]
        df_rho_div_f2 = df_rho_div_f_ary[f_index]
        g1_ary_f = np.random.random(k_max+1)
        g2_ary_f = np.random.random(k_max+1)
        term1 = (A * Sb * f2) \
                * (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
                - (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
        second_1_a = A * pow(Sb * f2, n1)
        second_1_b = A * pow(Sb * f2, n2)
        for p in range(npixROI):
            x_m_sum_f = term1 * PS_dist_compressed[p]
            x_m_sum[p] += df_rho_div_f2*x_m_sum_f
            second_1_c = second_1_a * PS_dist_compressed[p]
            second_1_d = second_1_b * PS_dist_compressed[p]
            for k in range(data[p]+1):
                x_m_ary_f = second_1_c * g1_ary_f[k] + second_1_d * g2_ary_f[k]
                x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f

    for p in range(npixROI):
        f0_ary = -(xbg_PSF_compressed[p] + x_m_sum[p])
        f1_ary = (xbg_PSF_compressed[p] + x_m_ary[1,p])
        nu_mat[0] = exp(f0_ary)
        nu_mat[1] = nu_mat[0] * f1_ary
        for k in range(2,data[p]+1):
            # Start every entry from zero. nu_mat is shared by all pixels, so
            # accumulating with += directly into nu_mat[k] would carry the
            # previous pixel's value into this one.
            nu_k = 0.0
            for n in range(0, k - 1):
                nu_k += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n]
            nu_k += f1_ary * nu_mat[k-1] / float(k)
            nu_mat[k] = nu_k
        ll+=log( nu_mat[data[p]])

    # Replace a non-finite result by a large negative sentinel. Note the
    # precedence: this is -(10.1**10.).
    if math.isnan(ll) or math.isinf(ll):
        ll = -10.1**10.
    return ll
Running your benchmark on this version yields:
>>> %timeit example(xbg_PSF_compressed, theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data)
10 loops, best of 3: 74.1 ms per loop
When the original code was running much slower:
>>> %timeit example(xbg_PSF_compressed, theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data)
1 loops, best of 3: 146 ms per loop

Cython int ** and int * types

I am trying to wrap some C code with Cython, but I am running into an error that I don't understand, and despite a lot of searching I cannot seem to find anything on it. Here is my C code:
void cssor(double *U, int m, int n, double omega, double tol, int maxiters, int *info){
double maxerr, temp, lcf, rcf;
int i, j, k;
lcf = 1.0 - omega;
rcf = 0.25 * omega;
for (k =0; k < maxiters ; k ++){
maxerr = 0.0;
for (j =1; j < n-1; j++) {
for (i =1; i < m-1; i++) {
temp = U[i*n+ j];
U[i*n+j] = lcf * U[i*n+j] + rcf * (U[i*n+j-1] + U [i*n+j+1] + U [(i-1)*n + j] + U [(i+1)*n+j]);
maxerr = fmax(fabs(U[i*n+j] - temp), maxerr);
}
}
if(maxerr < tol){break;}
}
if (maxerr < tol) {*info =0;}
else{*info =1;}
}
My .pyx file is
# Declaration of the C routine; `info` is an output parameter of type int *.
cdef extern from "cssor.h":
    void cssor(double *U, int m, int n, double omega, double tol, int maxiters, int *info)
# Thin wrapper that runs the C routine in place on the contiguous 2-D buffer U.
cpdef cyssor(double[:, ::1] U, double omega, double tol, int maxiters, int *info):
    cdef int n, m
    # Row and column counts, passed to cssor as m and n.
    m = U.shape[0]
    n = U.shape[1]
    # NOTE(review): `info` is declared `int *`, so `&info` has type `int **`,
    # while cssor is declared to take `int *`.
    cssor(&U[0, 0], m, n, omega, tol, maxiters, &info)
However, when I try to run the associated setup file I get an error referring to maxiters in the last line of the code that says:
Cannot assign type 'int **' to type 'int *'
Can you tell me how to fix this?
Roy Roth
The problem comes from here:
# The wrapper as posted in the question, quoted to point at the faulty call.
cpdef cyssor(double[:, ::1] U, double omega, double tol, int maxiters, int *info):
    cdef int n, m
    m = U.shape[0]
    n = U.shape[1]
    # `info` is already `int *`; taking its address yields `int **`.
    cssor(&U[0, 0], m, n, omega, tol, maxiters, &info)
You declare info as type int*. But you then pass it into the cssor function as a reference to an int*, making it an int**.
The correct code is:
# `info` is already an `int *`, so it is passed through unchanged.
cssor(&U[0, 0], m, n, omega, tol, maxiters, info)

Categories