Cython optimization of Python particle tracking code

I'm struggling to boost the performance of my python particle tracking code with Cython.
Here's my pure Python code:
from scipy.integrate import odeint
import numpy as np
from numpy import sqrt, pi, sin, cos
from time import time as Time
import multiprocessing as mp
from functools import partial
cLight = 299792458.
Dim = 6
class Integrator:
def __init__(self, ring):
self.ring = ring
def equations(self, X, s):
dXds = np.zeros(Dim)
E, B = self.ring.getEMField( [X[0], X[2], s], X[4] )
h = 1 + X[0]/self.ring.ringRadius
p_s = np.sqrt(X[5]**2 - self.ring.particle.mass**2 - X[1]**2 - X[3]**2)
dtds = h*X[5]/p_s
gamma = X[5]/self.ring.particle.mass
beta = np.array( [X[1], X[3], p_s] ) / X[5]
dXds[0] = dtds*beta[0]
dXds[2] = dtds*beta[1]
dXds[1] = p_s/self.ring.ringRadius + self.ring.particle.charge*(dtds*E[0] + dXds[2]*B[2] - h*B[1])
dXds[3] = self.ring.particle.charge*(dtds*E[1] + h*B[0] - dXds[0]*B[2])
dXds[4] = dtds
dXds[5] = self.ring.particle.charge*(dXds[0]*E[0] + dXds[2]*E[1] + h*E[2])
return dXds
def odeSolve(self, X0, sRange):
sol = odeint(self.equations, X0, sRange)
return sol
class Ring:
def __init__(self, particle):
self.particle = particle
self.ringRadius = 7.112
self.magicB0 = self.particle.magicMomentum/self.ringRadius
def getEMField(self, pos, time):
x, y, s = pos
theta = (s/self.ringRadius*180/pi) % 360
r = sqrt(x**2 + y**2)
arg = 0 if r == 0 else np.angle( complex(x/r, y/r) )
rn = r/0.045
k2 = 37*24e3
k10 = -4*24e3
E = np.zeros(3)
B = np.array( [ 0, self.magicB0, 0 ] )
for i in range(4):
if ((21.9+90*i < theta < 34.9+90*i or 38.9+90*i < theta < 64.9+90*i) and (-0.05 < x < 0.05 and -0.05 < y < 0.05)):
E = np.array( [ k2*x/0.045 + k10*rn**9*cos(9*arg), -k2*y/0.045 -k10*rn**9*sin(9*arg), 0] )
break
return E, B
class Particle:
def __init__(self):
self.mass = 105.65837e6
self.charge = 1.
self.gm2 = 0.001165921
self.magicMomentum = self.mass/sqrt(self.gm2)
self.magicEnergy = sqrt(self.magicMomentum**2 + self.mass**2)
self.magicGamma = self.magicEnergy/self.mass
self.magicBeta = self.magicMomentum/(self.magicGamma*self.mass)
def runSimulation(nParticles, tEnd):
particle = Particle()
ring = Ring(particle)
integrator = Integrator(ring)
Xs = np.array( [ np.array( [45e-3*(np.random.rand()-0.5)*2, 0, 0, 0, 0, particle.magicEnergy] ) for i in range(nParticles) ] )
sRange = np.arange(0, tEnd, 1e-9)*particle.magicBeta*cLight
ode = partial(integrator.odeSolve, sRange=sRange)
t1 = Time()
pool = mp.Pool()
sol = np.array(pool.map(ode, Xs))
t2 = Time()
print ("%.3f sec" %(t2-t1))
return t2-t1
Obviously, the most time-consuming part is integrating the ODE, implemented in odeSolve() and equations() of class Integrator. The getEMField() method of class Ring is also called as often as equations() during the solve.
I tried to get a significant speedup (at least 10x-20x) using Cython, but I only got about 1.5x with the following Cython script:
import cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt, pi, sin, cos
from scipy.integrate import odeint
from time import time as Time
import multiprocessing as mp
from functools import partial
cdef double cLight = 299792458.
cdef int Dim = 6
#cython.boundscheck(False)
cdef class Integrator:
cdef Ring ring
def __init__(self, ring):
self.ring = ring
cpdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] equations(self,
np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] X,
double s):
cdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] dXds = np.zeros(Dim)
cdef double h, p_s, dtds, gamma
cdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] beta, E, B
E, B = self.ring.getEMField( [X[0], X[2], s], X[4] )
h = 1 + X[0]/self.ring.ringRadius
p_s = np.sqrt(X[5]*X[5] - self.ring.particle.mass*self.ring.particle.mass - X[1]*X[1] - X[3]*X[3])
dtds = h*X[5]/p_s
gamma = X[5]/self.ring.particle.mass
beta = np.array( [X[1], X[3], p_s] ) / X[5]
dXds[0] = dtds*beta[0]
dXds[2] = dtds*beta[1]
dXds[1] = p_s/self.ring.ringRadius + self.ring.particle.charge*(dtds*E[0] + dXds[2]*B[2] - h*B[1])
dXds[3] = self.ring.particle.charge*(dtds*E[1] + h*B[0] - dXds[0]*B[2])
dXds[4] = dtds
dXds[5] = self.ring.particle.charge*(dXds[0]*E[0] + dXds[2]*E[1] + h*E[2])
return dXds
cpdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] odeSolve(self,
np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] X0,
np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] sRange):
sol = odeint(self.equations, X0, sRange)
return sol
#cython.boundscheck(False)
cdef class Ring:
cdef Particle particle
cdef double ringRadius
cdef double magicB0
def __init__(self, particle):
self.particle = particle
self.ringRadius = 7.112
self.magicB0 = self.particle.magicMomentum/self.ringRadius
cpdef tuple getEMField(self,
list pos,
double time):
cdef double x, y, s
cdef double theta, r, rn, arg, k2, k10
cdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] E, B
x, y, s = pos
theta = (s/self.ringRadius*180/pi) % 360
r = sqrt(x*x + y*y)
arg = 0 if r == 0 else np.angle( complex(x/r, y/r) )
rn = r/0.045
k2 = 37*24e3
k10 = -4*24e3
E = np.zeros(3)
B = np.array( [ 0, self.magicB0, 0 ] )
for i in range(4):
if ((21.9+90*i < theta < 34.9+90*i or 38.9+90*i < theta < 64.9+90*i) and (-0.05 < x < 0.05 and -0.05 < y < 0.05)):
E = np.array( [ k2*x/0.045 + k10*rn**9*cos(9*arg), -k2*y/0.045 -k10*rn**9*sin(9*arg), 0] )
#E = np.array( [ k2*x/0.045, -k2*y/0.045, 0] )
break
return E, B
cdef class Particle:
cdef double mass
cdef double charge
cdef double gm2
cdef double magicMomentum
cdef double magicEnergy
cdef double magicGamma
cdef double magicBeta
def __init__(self):
self.mass = 105.65837e6
self.charge = 1.
self.gm2 = 0.001165921
self.magicMomentum = self.mass/sqrt(self.gm2)
self.magicEnergy = sqrt(self.magicMomentum**2 + self.mass**2)
self.magicGamma = self.magicEnergy/self.mass
self.magicBeta = self.magicMomentum/(self.magicGamma*self.mass)
def runSimulation(nParticles, tEnd):
particle = Particle()
ring = Ring(particle)
integrator = Integrator(ring)
#nParticles = 5
Xs = np.array( [ np.array( [45e-3*(np.random.rand()-0.5)*2, 0, 0, 0, 0, particle.magicEnergy] ) for i in range(nParticles) ] )
sRange = np.arange(0, tEnd, 1e-9)*particle.magicBeta*cLight
ode = partial(integrator.odeSolve, sRange=sRange)
t1 = Time()
pool = mp.Pool()
sol = np.array(pool.map(ode, Xs))
t2 = Time()
print ("%.3f sec" %(t2-t1))
return t2-t1
What should I do to get the maximum effect from Cython?
(I tried Numba instead of Cython, and the performance gain from Numba was enormous (around a 20x speedup). However, I had an extremely hard time using Numba with Python class instances, so I decided to use Cython instead.)
For reference, here is the Cython annotation (cython -a output) from the compilation: [annotation screenshot not included]
This is a very incomplete answer since I haven't profiled or timed anything, or even checked that it gives the same answer. However, here are some suggestions that reduce the amount of Python code that Cython generates:
Add the @cython.cdivision(True) compilation directive. This means that a ZeroDivisionError won't be raised on float division and you'll get a NaN value instead. (Only do this if you don't want the error to be raised).
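For example, a minimal sketch of the decorator form (ratio is just a toy function here; you can equally apply the directive file-wide with a "# cython: cdivision=True" comment at the top of the .pyx):
cimport cython

@cython.cdivision(True)      # note the @ - a leading # would just be a comment and have no effect
cpdef double ratio(double a, double b):
    # with cdivision(True) this yields inf/nan instead of raising ZeroDivisionError when b == 0
    return a / b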
Change p_s = np.sqrt(...) to p_s = sqrt(...). This removes a numpy call that only operates on a single value. You seem to have done this elsewhere so I don't know why you missed this line.
Where possible use fixed size C arrays instead of numpy arrays:
cdef double beta[3]
# ...
beta[0] = X[1]/X[5]
beta[1] = X[3]/X[5]
beta[2] = p_s/X[5]
You can do this when the size is known at compile time (and fairly small) and when you don't want to return it. This avoids a call to np.zeros and some subsequent type-checking to assign it to the typed numpy array. I think beta is the only place you can do this.
np.angle( complex(x/r, y/r) ) can be replaced by atan2(y/r, x/r) (using atan2 from libc.math). You can also drop the division by r entirely, since atan2(y, x) gives the same angle.
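In code that is roughly (sketch):
from libc.math cimport atan2
# ...
arg = atan2(y, x)   # same angle as np.angle(complex(x/r, y/r)), with no division by r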
cdef int i helps make your for loop faster in getEMField (Cython is often good at automatically picking up the types of loop variables but seems to have failed here)
I suspect it's quicker to assign E element-by-element than as a whole array:
E[0] = k2*x/0.045 + k10*rn**9*cos(9*arg)
E[1] = -k2*y/0.045 -k10*rn**9*sin(9*arg)
There isn't much value in specifying types like list and tuple and it may actually make the code slightly slower (because it will waste time checking the types).
A bigger change would be to pass E and B into getEMField as pointers rather than allocating them with np.zeros. This would let you allocate them as static C arrays in equations (cdef double E[3]). The downside is that getEMField would have to be cdef and so no longer callable from Python (but you could add a Python-callable wrapper function too if you like).
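A rough sketch of that idea (the _c suffix and the wrapper are just placeholder names, and the field-region logic is elided):
# inside cdef class Ring
cdef void getEMField_c(self, double x, double y, double s, double time,
                       double* E, double* B):
    # fill the caller-provided length-3 buffers; no numpy allocation here
    E[0] = 0.0; E[1] = 0.0; E[2] = 0.0
    B[0] = 0.0; B[1] = self.magicB0; B[2] = 0.0
    # ... set E inside the field regions exactly as before ...

def getEMField(self, pos, double time):
    # optional thin Python-callable wrapper
    cdef double E[3]
    cdef double B[3]
    self.getEMField_c(pos[0], pos[1], pos[2], time, &E[0], &B[0])
    return np.array([E[0], E[1], E[2]]), np.array([B[0], B[1], B[2]])

# and in Integrator.equations:
cdef double E[3]
cdef double B[3]
self.ring.getEMField_c(X[0], X[2], s, X[4], &E[0], &B[0])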

Related

Cython anomalous speed differences when casting variable from float to short?

I've noticed that when attempting to optimize a Cython loop, casting a float to a short takes significantly more time for a defined (and ctyped) variable. Here is an example function with OPTION 1 and OPTION 2 denoted; one of them will be commented out when comparing the performance:
cpdef np.ndarray[np.int16_t, ndim=2] test_func(short[:, :, :] data_array):
cdef Py_ssize_t i, k, n
cdef Py_ssize_t n_pts = data_array.shape[0], length = data_array.shape[1], width = data_array.shape[2]
cdef float x_diff, y_diff, xy_sum
coeffs_array = np.zeros((length, width), dtype=np.int16)
cdef short[:, :] coeffs = coeffs_array
for i in range(length):
for k in range(width):
xy_sum = 0
for n in range(n_pts):
x_diff = data_array[n, i, k]
y_diff = data_array[n, 0, 0]
xy_sum = xy_sum + (x_diff * y_diff)
# OPTION 1
coeffs[i, k] = <short> xy_sum
# OPTION 2
coeffs[i, k] = <short> (7.235 + 2.31 + 78.123)
return coeffs_array
After compiling with one of the two options active, I tested with the following:
import numpy as np
from gen_libs.test import test_func
import time
np.random.seed(0)
jn = np.random.choice(100, size=(500, 5000, 500)).astype(np.int16)
start_time = time.time()
a = test_func(jn)
print(time.time() - start_time)
The performance of the two options changes drastically:
OPTION 1: 1.5317 seconds
OPTION 2: 0.0025 seconds
What am I missing here? It seems that xy_sum should be a simple ctyped float, just like the sum of the decimal numbers. I tested again by defining a new float variable like so:
cdef float a
a = (7.235 + 2.31 + 78.123)
coeffs[i, k] = <short> a
But again the timing was 0.0026 seconds, so what is it about xy_sum that is causing this ~1,000x slowdown? Is there something connected to the array access of x_diff and y_diff that could be an issue? I'm stumped.
EDIT:
Here's a test that appears to narrow it down to whether or not the variable being cast from float to short is accumulated. No idea why this would make any difference:
Accumulative (xy_sum += x_diff)
cpdef np.ndarray[np.int16_t, ndim=2] test_func(short[:, :, :] data_array):
cdef Py_ssize_t i, k, n
cdef Py_ssize_t n_pts = data_array.shape[0], length = data_array.shape[1], width = data_array.shape[2]
cdef float x_diff, y_diff, xy_sum, a
coeffs_array = np.zeros((length, width), dtype=np.int16)
cdef short[:, :] coeffs = coeffs_array
for i in range(length):
for k in range(width):
xy_sum = 0
for n in range(n_pts):
x_diff = 0.152
# XY_SUM ACCUMULATING
xy_sum += x_diff
coeffs[i, k] = <short> xy_sum
return coeffs_array
Time: 1.068 seconds
Non-accumulative (xy_sum = x_diff)
cpdef np.ndarray[np.int16_t, ndim=2] test_func(short[:, :, :] data_array):
cdef Py_ssize_t i, k, n
cdef Py_ssize_t n_pts = data_array.shape[0], length = data_array.shape[1], width = data_array.shape[2]
cdef float x_diff, y_diff, xy_sum, a
coeffs_array = np.zeros((length, width), dtype=np.int16)
cdef short[:, :] coeffs = coeffs_array
for i in range(length):
for k in range(width):
xy_sum = 0
for n in range(n_pts):
x_diff = 0.152
# XY_SUM NON-ACCUMULATING
xy_sum = x_diff
coeffs[i, k] = <short> xy_sum
return coeffs_array
Time: 0.0025 seconds

Error "unable to find vcvarsall.bat" despite newest visual studio version

I'm working in Jupyter Notebook. When I want to compile a .pyx in cython, it throws an error similar to this:
%run -i setup.py build_ext --inplace
unable to find vcvarsall.bat
The setup.py file looks like this:
from distutils.core import setup
from Cython.Build import cythonize
setup(
ext_modules=cythonize("hh_vers_vector.pyx"),
)
This only happens, however, on my computer at work. At the one at home, it works just fine.
It is probably an issue with Visual Studio as explained here. The thing is, I installed the very same version of Visual Studio 2017 Community on both computers. The latest Anaconda 3 version is installed on both computers. Both use Python 3.6.2 and IPython 6.1.0. So how can that be? Both run with Windows 10. I'll also show you my .pyx file. If you need more information I will edit my post.
from math import exp
import numpy as np
import time
def hhModel(*params, Iext, float dt, int Vref):
## Unwrap params argument: these variables are going to be optimized
cdef float ENa = params[0]
cdef float EK = params[1]
cdef float EL = params[2]
cdef float GNa = params[3]
cdef float GK = params[4]
cdef float GL = params[5]
## Input paramters
# I : a list containing external current steps, your stimulus vector [nA]
# dt : a crazy time parameter [ms]
# Vref : reference potential [mV]
def alphaM(float v, float vr): return 0.1 * (v-vr-25) / ( 1 - exp(-(v-vr-25)/10) )
def betaM(float v, float vr): return 4 * exp(-(v-vr)/18)
def alphaH(float v, float vr): return 0.07 * exp(-(v-vr)/20)
def betaH(float v, float vr): return 1 / ( 1 + exp( -(v-vr-30)/10 ) )
def alphaN(float v, float vr): return 0.01 * (v-vr-10) / ( 1 - exp(-(v-vr-10)/10) )
def betaN(float v, float vr): return 0.125 * exp(-(v-vr)/80)
## steady-state values and time constants of m,h,n
def m_infty(float v, float vr): return alphaM(v,vr) / ( alphaM(v,vr) + betaM(v,vr) )
def h_infty(float v, float vr): return alphaH(v,vr) / ( alphaH(v,vr) + betaH(v,vr) )
def n_infty(float v, float vr): return alphaN(v,vr) / ( alphaN(v,vr) + betaN(v,vr) )
## parameters
cdef float Cm, gK, gL, INa, IK, IL, dv_dt, dm_dt, dh_dt, dn_dt, aM, bM, aH, bH, aN, bN
cdef float Smemb = 4000 # [um^2] surface area of the membrane
cdef float Cmemb = 1 # [uF/cm^2] membrane capacitance density
Cm = Cmemb * Smemb * 1e-8 # [uF] membrane capacitance
gNa = GNa * Smemb * 1e-8 # Na conductance [mS]
gK = GK * Smemb * 1e-8 # K conductance [mS]
gL = GL * Smemb * 1e-8 # leak conductance [mS]
# numSamples = int(T/dt);
cdef int numSamples = len(Iext);
# DEF numSamples = 200000
# initial values
cdef float[:] v = np.empty(numSamples, dtype=np.float)
cdef float[:] m = np.empty(numSamples, dtype=np.float)
cdef float[:] h = np.empty(numSamples, dtype=np.float)
cdef float[:] n = np.empty(numSamples, dtype=np.float)
#cdef float v[numSamples]
#cdef float m[numSamples]
#cdef float h[numSamples]
#cdef float n[numSamples]
v[0] = Vref # initial membrane potential
m[0] = m_infty(v[0], Vref) # initial m
h[0] = h_infty(v[0], Vref) # initial h
n[0] = n_infty(v[0], Vref) # initial n
## calculate membrane response step-by-step
for j in range(0, numSamples-1):
# ionic currents: g[mS] * V[mV] = I[uA]
INa = gNa * m[j]*m[j]*m[j] * h[j] * (ENa-v[j])
IK = gK * n[j]*n[j]*n[j]*n[j] * (EK-v[j])
IL = gL * (EL-v[j])
# derivatives
# I[uA] / C[uF] * dt[ms] = dv[mV]
dv_dt = ( INa + IK + IL + Iext[j]*1e-3) / Cm;
aM = 0.1 * (v[j]-Vref-25) / ( 1 - exp(-(v[j]-Vref-25)/10))
bM = 4 * exp(-(v[j]-Vref)/18)
aH = 0.07 * exp(-(v[j]-Vref)/20)
bH = 1 / ( 1 + exp( -(v[j]-Vref-30)/10 ) )
aN = 0.01 * (v[j]-Vref-10) / ( 1 - exp(-(v[j]-Vref-10)/10) )
bN = 0.125 * exp(-(v[j]-Vref)/80)
dm_dt = (1-m[j])* aM - m[j]*bM
dh_dt = (1-h[j])* aH - h[j]*bH
dn_dt = (1-n[j])* aN - n[j]*bN
# calculate next step
v[j+1] = (v[j] + dv_dt * dt)
m[j+1] = (m[j] + dm_dt * dt)
h[j+1] = (h[j] + dh_dt * dt)
n[j+1] = (n[j] + dn_dt * dt)
return v

Cython not fast enough

I rewrote my python loop in cython expecting a large improvement in speed.
I only get about a factor of four. Am I doing something wrong?
This is the code without cython:
import numpy as np
import itertools as itr
import math
def Pk (b, f, mu, k): # k is in Mpc
isoPk = 200*math.exp(-(k-0.02)**2/2/0.01**2) # Isotropic power spectrum
power = (b+mu**2*f)**2*isoPk
return power
def Gendk (N, kvec, Pk, b, f, deltak3d):
Nhalf = int(N/2)
for xx, yy, zz in itr.product(range(0,N), range(0,N), range(0,Nhalf+1)):
kx = kvec[xx]
ky = kvec[yy]
kz = kvec[zz]
kk = math.sqrt(kx**2+ky**2+kz**2)
if kk == 0:
continue
mu = kz/kk
power = Pk(b, f, mu, kk)
if power==0:
deltaRe = 0
deltaIm = 0
else:
deltaRe = np.random.normal(0, power/2.0)
if (xx==0 or xx==Nhalf) and (yy==0 or yy==Nhalf) and (zz==0 or zz==Nhalf):
deltaIm = 0
else:
deltaIm = np.random.normal(0, power/2.0)
x_conj = (2*N-xx)%N
y_conj = (2*N-yy)%N
z_conj = (2*N-zz)%N
deltak3d[xx,yy,zz] = deltaRe + deltaIm*1j
deltak3d[x_conj,y_conj,z_conj] = deltaRe - deltaIm*1j
Ntot = 300000
L = 1000
N = 128
Nhalf = int(N/2)
kmax = 5.0
dk = kmax/N
kvec = np.fft.fftfreq(N, L/N)
dL = L/N
deltak3d = np.zeros((N,N,N), dtype=complex)
deltak3d[0,0,0] = Ntot
Gendk(N, kvec, Pk, 2, 1, deltak3d)
This is the version with cython:
import numpy as np
import pyximport; pyximport.install(setup_args={"include_dirs":np.get_include()})
import testGauss as tG
Ntot = 300000
L = 1000
N = 128
Nhalf = int(N/2)
kmax = 5.0
dk = kmax/N
kvec = np.fft.fftfreq(N, L/N)
dL = L/N
deltak3d = np.zeros((N,N,N), dtype=complex)
deltak3d[0,0,0] = Ntot
tG.Gendk(N, kvec, tG.Pk, 2, 1, deltak3d)
and the testGauss.pyx file is:
import math
import numpy as np
cimport numpy as np
import itertools as itr
def Pk (double b, double f, double mu, double k): # k is in Mpc
cdef double isoPk, power
isoPk = 200*math.exp(-(k-0.02)**2/2/0.01**2) # Isotropic power spectrum
power = (b+mu**2*f)**2*isoPk
return power
def Gendk (int N, np.ndarray[np.float64_t,ndim=1] kvec, Pk, double b, double f, np.ndarray[np.complex128_t,ndim=3] deltak3d):
cdef int Nhalf = int(N/2)
cdef int xx, yy, zz
cdef int x_conj, y_conj, z_conj
cdef double kx, ky, kz, kk
cdef mu
cdef power
cdef deltaRe, deltaIm
for xx, yy, zz in itr.product(range(0,N), range(0,N), range(0,Nhalf+1)):
kx = kvec[xx]
ky = kvec[yy]
kz = kvec[zz]
kk = math.sqrt(kx**2+ky**2+kz**2)
if kk == 0:
continue
mu = kz/kk
power = Pk(b, f, mu, kk)
if power==0:
deltaRe = 0
deltaIm = 0
else:
deltaRe = np.random.normal(0, power/2.0)
if (xx==0 or xx==Nhalf) and (yy==0 or yy==Nhalf) and (zz==0 or zz==Nhalf):
deltaIm = 0
else:
deltaIm = np.random.normal(0, power/2.0)
x_conj = (2*N-xx)%N
y_conj = (2*N-yy)%N
z_conj = (2*N-zz)%N
deltak3d[xx,yy,zz] = deltaRe + deltaIm*1j
deltak3d[x_conj,y_conj,z_conj] = deltaRe - deltaIm*1j
Thank you very much in advance!
You could get some speedup by replacing
import math
with
from libc cimport math
That will avoid a python function call when you do sqrt and exp, replacing it with a direct c call (which should be a lot faster).
I'm also slightly concerned about the calls to np.random.normal inside your loop, which add a noticeable Python overhead each time. It might well be quicker to call it once before the loop to generate a large array of random numbers (with the overhead of a single Python call) and then overwrite them with 0 where they aren't needed inside the loop.
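A sketch of that idea (a slight variant: draw standard normals up front and scale them inside the loop, which gives the same distribution since normal(0, sigma) is sigma * normal(0, 1)):
# before the loop
cdef np.ndarray[np.float64_t, ndim=3] re_draws = np.random.normal(0.0, 1.0, (N, N, Nhalf + 1))
cdef np.ndarray[np.float64_t, ndim=3] im_draws = np.random.normal(0.0, 1.0, (N, N, Nhalf + 1))
# inside the loop, instead of np.random.normal(0, power/2.0):
deltaRe = (power / 2.0) * re_draws[xx, yy, zz]
# deltaIm likewise, keeping the existing check that forces it to 0 on the boundary points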
The general advice for optimising Cython still applies: run
cython -a your_file.pyx
Look at the HTML, and worry about bits highlighted yellow (but only if they're called often)
Use cProfile to profile your Python code. Maybe the most CPU-intensive tasks are already in NumPy, in which case there is not much to gain from Cython.
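For example (the script name is just a placeholder):
python -m cProfile -s cumtime your_script.py
or, from inside the script, assuming the names below are defined at module level:
import cProfile
cProfile.run('Gendk(N, kvec, Pk, 2, 1, deltak3d)', sort='cumtime')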
Turning your code (slightly modified) into a native module with Pythran gives me a ~50x speedup.
import numpy as np
import itertools as itr
import math
from random import gauss as normal
def Pk (b, f, mu, k): # k is in Mpc
isoPk = 200*math.exp(-(k-0.02)**2/2/0.01**2) # Isotropic power spectrum
power = (b+mu**2*f)**2*isoPk
return power
#pythran export Gendk(int, float[], int, int, complex[][][])
def Gendk (N, kvec, b, f, deltak3d):
Nhalf = int(N/2)
for xx, yy, zz in itr.product(range(0, N), range(0, N), range(0, Nhalf+1)):
kx = kvec[xx]
ky = kvec[yy]
kz = kvec[zz]
kk = math.sqrt(kx**2+ky**2+kz**2)
if kk == 0:
continue
mu = kz/kk
power = Pk(b, f, mu, kk)
if power == 0:
deltaRe = 0
deltaIm = 0
else:
# deltaRe = np.random.normal(0, power/2.0)
deltaRe = normal(0, power/2.0)
if (xx == 0 or xx == Nhalf) and (yy == 0 or yy == Nhalf) and (zz == 0 or zz == Nhalf):
deltaIm = 0
else:
#deltaIm = np.random.normal(0, power/2.0)
deltaIm = normal(0, power/2.0)
x_conj = (2*N-xx)%N
y_conj = (2*N-yy)%N
z_conj = (2*N-zz)%N
deltak3d[xx, yy, zz] = deltaRe + deltaIm*1j
deltak3d[x_conj, y_conj, z_conj] = deltaRe - deltaIm*1j
Compiled with:
$ pythran tg.py
And tested with:
$ python -m timeit -s 'import numpy as np; Ntot = 30000; L = 1000; N = 12; Nhalf = int(N/2); kmax = 5.0; dk = kmax/N; kvec = np.fft.fftfreq(N, L/N); dL = L/N; deltak3d = np.zeros((N, N, N), dtype=complex); deltak3d[0, 0, 0] = Ntot; from tg import Gendk' 'Gendk(N, kvec, 2, 1, deltak3d)'
I get 10 loops, best of 3: 29.4 msec per loop for the CPython run and 1000 loops, best of 3: 587 usec per loop for the Pythran run.
Disclaimer: I'm a Pythran dev.

Cython code 3x slower than corresponding NumPy version

I'm currently writing my thesis on the use of particle filters for filtering out latent states in stochastic volatility models. To improve the filtering results I've added option prices as an observed process. This means that for a given time series, I have to calculate the option prices at each time step - a "normal" time series is 100-200 points.
Without going too deep into the algorithm, I'm having a serious problem with performance. The last for-loop loops over all of the particles that I use, which is somewhere around 1,000 (as determined by M). Running this code for only one particle takes 0.25 seconds, which means it takes around 4 minutes per time step with 1,000 particles (which is rather infeasible).
from __future__ import division
import numpy as np
import numexpr as ne
from fftInC import fft
import time
import math
import pyfftw
def HestonCallPrice(M, N, S, V, t, T, strikes, r, param, b, NFFT, inp, v, alphaC, eta, k, weights):
"""
This will be the pricing function for the European call option. Since we found the
quadrature procedure to be too slow we shall move on to use FFT instead.
So, we begin defining all of the constants etc.
"""
vT, weightsT, inpJ, vJT = v.T, weights.T, inp * 1j, v.T * 1j
p1, p2, p3_2, p3, p4 = param[1,:], param[2,:], param[3,:], np.sqrt(param[3,:]), param[4,:]
"""
Next we move on to the calculations. These have been found to be rather fast, and hence do not
need any Cythonization.
"""
gamma = p3_2 / 2
beta = ne.evaluate("p1 - p4 * p3 * 1j * inp")
alpha = ne.evaluate("(-inp**2 - inpJ)/2")
d = ne.evaluate("sqrt(beta**2 - 4 * alpha * gamma)")
r_pos, r_neg = ne.evaluate("(beta + d)/(2 * gamma)"), ne.evaluate("(beta - d)/(2 * gamma)")
g, inpJT = ne.evaluate("r_neg / r_pos"), inpJ.T
D = ne.evaluate("r_neg * (1 - exp( -d * (T - t) ) ) / (1 - g * exp( -d * (T - t) ) )" )
C = ne.evaluate("p1 * (r_neg*(T - t) - 2 / p3_2 * log( (1 - g*exp(-d*(T - t)))/(1 - g) ) )")
A = 1j * inp.T * (math.log(S) + r * (T - t))
C_tmp = (C * p2).T
"""
The matrices and vectors that are sent into the Cython version of the code are
A = (1, 2048)
C_tmp = (4, 2048)
D.T = (4, 2048)
V = (4, 1000)
vJT[0, :] = (2048,)
k[:, 0] = (2048,)
weights.T[0, :] = (2048,)
This is now where we call the Cython script.
"""
start = time.time()
prices = fft(A, float(r), float(t), float(T), C_tmp, D.T, V, float(alphaC), vJT[0, :], k[:, 0],
float(b), strikes, float(eta), weights.T[0, :])
print 'Cythonized version: ', time.time() - start, ' seconds'
"""
The below code is the original code which has been "cythonized".
"""
start = time.time()
outPrices = np.empty( (M, N) )
prices = np.empty( (M * N, len(strikes)) )
"""
Regularly I use pyFFTW since it's a bit faster, but I couldn't figure out how to use the C
version of this, so to be fair when comparing speeds I disable pyFFTW. However, turning this on
using the below settings it's 20-30% faster.
"""
# fftIn = pyfftw.n_byte_align_empty((N, NFFT), 16, 'complex128')
#
# fftOut = fftIn.copy()
#
# fft_object = pyfftw.FFTW(fftIn, fftOut, nthreads=8)
for j in range( len(strikes) ):
position = (np.log(strikes[j]) + b) / ( 2 * b / NFFT)
x_1 = np.exp( k[ int(math.floor(position)) ] )
x_2 = np.exp( k[ int(math.ceil(position)) ] )
for m in range(M):
C_m, D_m, V_m = C_tmp[m, :], D[:, m].T, V[m, :][:, np.newaxis]
F_cT = ne.evaluate("exp( -r*(T - t) ) * exp(C_m + D_m * V_m + A) / \
( (alphaC + vJT) * (alphaC + 1 + vJT) )")
toFFT = ne.evaluate("exp( b * vJT ) * F_cT * eta / 3 * weightsT")
price = np.exp( -alphaC * k.T ) / math.pi * np.real ( np.fft.fft(toFFT) )
y_1 = price[ :, int(math.floor(position)) ]
y_2 = price[ :, int(math.ceil(position)) ]
dydx = (y_2 - y_1)/(x_2 - x_1)
outPrices[m, :] = dydx * (strikes[j] - x_1) + y_1
prices[:, j] = outPrices.reshape(M * N)
print 'Non-cythonized version: ', time.time() - start, ' seconds'
return prices
" ------ Defining constants etc, nothing to say really ----- "
M, N, S, t, T, r, NFFT, alphaC = 1, 1000, 1000, 0, 1, 0, 2048, 1.5
strikes = np.array([900, 1100])
c, V = 600, np.random.normal(loc=0.2, scale=0.05, size=(M, N))
param = np.repeat(np.array([0.05, 0.5, 0.15, 0.15**2, 0]), M).reshape((5, M))
eta = c / NFFT
b = np.pi / eta
j = np.arange(1, NFFT+1)[:, np.newaxis]
v, k = eta * (j - 1), -b + 2 * b/ NFFT*(j - 1)
inp = v - (alphaC + 1)*1j
weights = 3 + (-1)**j - np.array([1] + [0]*(NFFT-1))[:, np.newaxis]
" ------------------------------------------------------------- "
HestonCallPrice(M, N, S, V, t, T, strikes, r, param, b, NFFT, inp, v, alphaC, eta, k, weights)
I found that the bottleneck is the last for-loop. I got a tip to rewrite the for-loop in Cython instead, see below
" --------------------------------- C IMPORTED PACKAGES ------------------------------------------ "
from __future__ import division
import cython
cimport cython
import math
cimport numpy as np
import numpy as np
import pyfftw
" ------------------------------------------------------------------------------------------------ "
"""
I heard that the boundscheck and wraparound functions could improve the performance, but I didn't
notice any performance gain whatsoever.
"""
#cython.profile(False)
#cython.boundscheck(False)
#cython.wraparound(False)
def fft(np.ndarray[double complex, ndim=2] A, float r, float t, float T,
np.ndarray[double complex, ndim=2] C, np.ndarray[double complex, ndim=2] D,
np.ndarray[double, ndim=2] V, float alphaC, np.ndarray[double complex, ndim=1] vJT,
np.ndarray[double, ndim=1] k, float b,
np.ndarray[long, ndim=1] strikes, float eta,
np.ndarray[long, ndim=1] weightsT):
cdef int M = V.shape[0]
cdef int N = V.shape[1]
cdef int NFFT = D.shape[1]
cdef np.ndarray[double complex, ndim=1] F_cT
cdef np.ndarray[double complex, ndim=2] toFFT = np.empty( (N, NFFT), dtype=complex)
cdef np.ndarray[double, ndim=2] prices
cdef float x_1, x_2, position
cdef np.ndarray[double, ndim=1] y_1
cdef np.ndarray[double, ndim=1] y_2
cdef np.ndarray[double, ndim=1] dydx
cdef int m, j, n
cdef np.ndarray[double, ndim=2] price = np.empty( (M * N, len(strikes)) )
cdef np.ndarray[double complex, ndim=1] A_inp = A[0, :]
for j in range( len(strikes) ):
position = (math.log(strikes[j]) + b) / ( 2 * b / NFFT)
x_1 = math.exp ( k[ int(math.floor(position)) ] )
x_2 = math.exp ( k[ int(math.ceil(position)) ] )
for m in range(M):
"""
M is the number of rows we have in A, C, D and V, so we need to loop over all of those.
"""
for n in range(N):
"""
Next we loop over all of the elements for each row in V, corresponding to N. For
us this corresponds to 1000 (if you haven't changed to N in the main program).
Each of the rows of A, C and D are 2048 in length. So I tried to loop over all of
those as well as for each n, but this made the code 4 times slower.
"""
F_cT = math.exp( -r*(T - t) ) * np.exp (A_inp + C[m, :] + D[m, :] * V[m, n]) / \
( (alphaC + vJT) * (alphaC + 1 + vJT) )
toFFT[n, :] = np.exp (b * vJT) * F_cT * eta / 3 * weightsT
"""
I'm guessing FFT'ing is rather slow using NumPy in Cython?
"""
prices = np.exp ( -alphaC * k ) / math.pi * np.real ( np.fft.fft(toFFT) )
y_1 = prices[ :, int(math.floor(position)) ]
y_2 = prices[ :, int(math.ceil(position)) ]
dydx = (y_2 - y_1)/(x_2 - x_1)
price[m * N:(m + 1) * N, j] = dydx * (strikes[j] - x_1) + y_1
return price
I'm compiling the code as
from distutils.core import setup, Extension
from Cython.Distutils import build_ext
import numpy.distutils.misc_util
include_dirs = numpy.distutils.misc_util.get_numpy_include_dirs()
setup(
name = 'fftInC',
ext_modules = [Extension('fftInC', ['fftInC.pyx'], include_dirs=include_dirs)],
cmdclass = {'build_ext':build_ext}
)
But to my surprise, the Cython version is about 3x slower than the original one. And I can't really figure out where I'm going wrong. I think I've defined the input types correctly (which I understand should give a considerable performance boost).
My question is therefore: Can you identify where I'm going wrong? Is it the type definition, for-loops or FFT'ing (or something else)?

Define a pointer as a method of a class?

I am trying to speed up my Cython code. I came across this link, where the author describes how using pointers instead of NumPy arrays can improve the speed of Cython code. In my cosmology class the bottleneck is the Da function. I am not very familiar with pointers in C, so I would appreciate it if somebody could give me an idea:
Is it possible to define a method of a class as a pointer, for instance in my case converting np.ndarray[double, ndim=1] Da to something like double* Da?
from __future__ import division
import numpy as np
cimport numpy as np
cimport cython
import copy
cdef extern from "gsl/gsl_math.h":
ctypedef struct gsl_function:
double (* function) (double x, void * params)
void * params
cdef extern from "gsl/gsl_integration.h":
ctypedef struct gsl_integration_workspace
gsl_integration_workspace * gsl_integration_workspace_alloc(size_t n)
void gsl_integration_workspace_free(gsl_integration_workspace * w)
int gsl_integration_qags(const gsl_function * f, double a, double b, double epsabs, double epsrel, size_t limit, gsl_integration_workspace * workspace, double *result, double *abserr)
cdef double func_callback(double x, void* params):
return (<cosmology>params).__angKernel(x)
cdef class cosmology(object):
cdef public double omega_m, omega_l, h, w, omega_r, G, v_c
cdef object omega_c
def __init__(self,double omega_m = 0.3, double omega_l = 0.7, double h = 0.7, double w = -1, double omega_r = 0., double G = std_G):
self.omega_m = omega_m
self.omega_l = omega_l
self.omega_r = omega_r
self.omega_c = (1. - omega_m - omega_l)
self.h = h
self.w = w
self.G = G
self.v_c = v_c
def __copy__(self):
return cosmology(omega_m = self.omega_m, omega_l = self.omega_l, h = self.h, w = self.w, omega_r = self.omega_r, G = self.G)
property H0:
def __get__(self):
return 100*self.h #km/s/MPC
cpdef double a(self, double z):
return 1./(1.+z)
cpdef double E(self, double a):
return (self.omega_r*a**(-4) + self.omega_m*a**(-3) + self.omega_c*a**(-2) + self.omega_l)**0.5
#cython.boundscheck(False)
#cython.wraparound(False)
#cython.nonecheck(False)
cdef double __angKernel(self, double x):
"""Integration kernel for angular diameter distance computation.
"""
return self.E(x**-1)**-1
#cython.boundscheck(False)
#cython.wraparound(False)
#cython.nonecheck(False)
cpdef np.ndarray[double, ndim=1] Da(self, np.ndarray[double, ndim=1] z, double z_ref=0):
cdef gsl_integration_workspace* w =gsl_integration_workspace_alloc(1000)
cdef gsl_function F
F.function = &func_callback
F.params = <void*>self
cdef double result = 3, error = 5
cdef double err, rk, zs, omc
omc=self.omega_c
cdef np.ndarray[double,ndim=1] d = np.ones_like(z, dtype=np.float64, order='C')
cdef int i, num
num = len(z)
for i in range(num):
zs=z[i]
if zs < 0:
raise ValueError("Redshift z must not be negative")
if zs < z_ref:
raise ValueError("Redshift z must not be smaller than the reference redshift")
gsl_integration_qags(&F, z_ref+1, zs+1, 0, 1e-7, 1000, w, &result, &error)
d[i], err = result, error
# check for curvature
rk = (fabs(omc))**0.5
if (rk*d[i] > 0.01):
if omc > 0:
d[i] = sinh(rk*d[i])/rk
if omc < 0:
d[i] = sin(rk*d[i])/rk
gsl_integration_workspace_free(w)
return d/(1.+z)
Thanks in advance.
It has been a while since I developed in Cython, but if memory serves I believe you could declare the function as follows:
ctypedef double* ( * Da)(double* z, double z_ref, int length)
This function will return an array of type double and allow you to pass an array of doubles in as z. This is a function pointer, so maybe not quite what you want.
ctypedef double* Da(double* z, double z_ref, int length)
This will accomplish the same thing but as a regular function declaration, not a function pointer. The difference between a function and a function pointer is that you have to assign the function pointer a function to point to before you can call it.
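To illustrate that last point, a minimal sketch (toy names, unrelated to the Da method above):
ctypedef double (*kernel_t)(double x)

cdef double square(double x):
    return x * x

def demo():
    cdef kernel_t f = square   # the pointer must be given a function to point to...
    return f(3.0)              # ...before you can call through it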
