I have a piecewise function with three parts that I'm trying to write in Python using the Numba @njit decorator. The function is evaluated over an array. It is defined by:
@njit(parallel=True)
def f(x_vec):
    N = len(x_vec)
    y_vec = np.zeros(N)
    for i in prange(N):
        x = x_vec[i]
        if x <= 2000:
            y = 64 / x
        elif x >= 4000:
            y = np.log(x)
        else:
            y = np.log(1.2 * x)
        y_vec[i] = y
    return y_vec
I'm using Numba to make this code very fast and run it on all 8 threads of my CPU.
Now, my question is: if I wanted to define each part of the function separately as f1, f2 and f3, and call those inside the if statements (while still benefiting from Numba's speed), how could I do that? The reason is that the subfunctions can be more complicated and I don't want to make my code hard to read. I want it to be as fast as this one (or only slightly slower, but not by a lot).
In order to test the function, we can use this array:
Np = 10000000
x_vec = 100 * np.power(1e8 / 100, np.random.rand(Np))
%timeit f(x_vec)  # 0.06 s on an Intel Core i7 3610
For completeness, the following imports are used:
import numpy as np
from numba import njit, prange
So in this case, the functions would be:
def f1(x):
    return 64 / x

def f2(x):
    return np.log(x)

def f3(x):
    return np.log(1.2 * x)
The actual functions are these, which compute the smooth-pipe friction factor for the laminar, transition and turbulent regimes:
@njit
def f1(x):
    return 64 / x

@njit
def f2(x):
    # x is the Reynolds number (Re), y is the Darcy friction factor (f)
    # for the transition regime, we can assume Re = 4000 (maximum possible friction)
    y = 0.02
    y = (-2 / np.log(10)) * np.log(2.51 / (4000 * np.sqrt(y)))
    return 1 / (y * y)

@njit
def f3(x):  # Colebrook-White approximation
    # x is the Reynolds number (Re), y is the Darcy friction factor (f)
    y = 0.02
    y = (-2 / np.log(10)) * np.log(2.51 / (x * np.sqrt(y)))
    return 1 / (y * y)
Thanks for everyone's contributions. This is the NumPy solution (the last three lines are slow for some reason, but it doesn't need a warm-up):
y = np.empty_like(x_vec)
a1 = np.where(x_vec <= 2000, True, False)  # equivalent to simply: x_vec <= 2000
a3 = np.where(x_vec >= 4000, True, False)  # equivalent to simply: x_vec >= 4000
a2 = ~(a1 | a3)
y[a1] = f1(x_vec[a1])
y[a2] = f2(x_vec[a2])
y[a3] = f3(x_vec[a3])
The fastest Numba solution, allowing function names to be passed in and taking advantage of prange (but hindered by JIT warm-up), is this one; it can be as fast as the first solution (at the top of the question):
@njit(parallel=True)
def f(x_vec, f1, f2, f3):
    N = len(x_vec)
    y_vec = np.zeros(N)
    for i in prange(N):
        x = x_vec[i]
        if x <= 2000:
            y = f1(x)
        elif x >= 4000:
            y = f3(x)
        else:
            y = f2(x)
        y_vec[i] = y
    return y_vec
You can write f() to accept function parameters, e.g.:
@njit
def f(arr, f1, f2, f3):
    N = len(arr)
    y_vec = np.zeros(N)
    for i in range(N):
        x = arr[i]  # index the arr parameter, not a global x_vec
        if x <= 2000:
            y = f1(x)
        elif x >= 4000:
            y = f2(x)
        else:
            y = f3(x)
        y_vec[i] = y
    return y_vec
Make sure that the functions you pass are Numba-compatible (i.e. themselves decorated with @njit).
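For illustration, here is a minimal usage sketch (assuming the @njit version of f above; the subfunction bodies are just the simple ones from the question, standing in for the real friction formulas):

import numpy as np
from numba import njit

@njit
def f1(x):
    return 64 / x           # laminar branch (x <= 2000)

@njit
def f2(x):
    return np.log(x)        # turbulent branch (x >= 4000)

@njit
def f3(x):
    return np.log(1.2 * x)  # transition branch

x_vec = 100 * np.power(1e8 / 100, np.random.rand(1000))
y_vec = f(x_vec, f1, f2, f3)  # f1/f2/f3 are Numba dispatchers, so f can call them in nopython mode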
Is this too slow? It can be done in pure NumPy by avoiding loops and using masks for indexing:
def f(x):
    y = np.empty_like(x)
    mask = x <= 2000
    y[mask] = 64 / x[mask]
    mask = (x > 2000) & (x < 4000)
    y[mask] = np.log(1.2 * x[mask])
    mask = x >= 4000
    y[mask] = np.log(x[mask])
    return y
You can also handle the "else" case by first applying the middle expression to the whole array without any mask; it's probably a bit slower:
def f_else(x):
    y = np.log(1.2 * x)
    mask = x <= 2000
    y[mask] = 64 / x[mask]
    mask = x >= 4000
    y[mask] = np.log(x[mask])
    return y
With

Np = 10000000
x_vec = 100 * np.power(1e8 / 100, np.random.rand(Np))

I get (on a laptop with an i7-8850H, 6 cores + 6 hyper-threads):

f:      1 loop, best of 5: 294 ms per loop
f_else: 1 loop, best of 5: 400 ms per loop
If your intended subfunctions are mainly NumPy operations, this will still be fast.
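If it helps readability, the same masked pattern also works with separate subfunctions; a rough sketch (f_low, f_mid and f_high are placeholder names for whichever vectorized subfunction handles each regime):

def f_pieces(x):
    y = np.empty_like(x)
    low = x <= 2000
    high = x >= 4000
    mid = ~(low | high)
    y[low] = f_low(x[low])      # e.g. 64 / x[low]
    y[mid] = f_mid(x[mid])      # e.g. np.log(1.2 * x[mid])
    y[high] = f_high(x[high])   # e.g. np.log(x[high])
    return y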
I'm new to Numba and I'm trying to port an old Fortran code to Python using Numba (version 0.54.1), but when I add parallel=True the program actually slows down. My program is very simple: I vary the positions x and y on an L x L grid, and for each position in the grid I perform a summation:
import numpy as np
import numba as nb

@nb.njit(parallel=True)
def lyapunov_grid(x_grid, y_grid, k, N):
    L = len(x_grid)
    lypnv = np.zeros((L, L))
    for ii in nb.prange(L):
        for jj in range(L):
            x = x_grid[ii]
            y = y_grid[jj]
            beta0 = 0
            sumT11 = 0
            for j in range(N):
                y = (y - k*np.sin(x)) % (2*np.pi)
                x = (x + y) % (2*np.pi)
                J = np.array([[1.0, -k*np.cos(x)], [1.0, 1.0 - k*np.cos(x)]])
                beta = np.arctan((-J[1,0]*np.cos(beta0) + J[1,1]*np.sin(beta0))/(J[0,0]*np.cos(beta0) - J[0,1]*np.sin(beta0)))
                T11 = np.cos(beta0)*(J[0,0]*np.cos(beta) - J[1,0]*np.sin(beta)) - np.sin(beta0)*(J[0,1]*np.cos(beta) - J[1,1]*np.sin(beta))
                sumT11 += np.log(abs(T11))/np.log(2)
                beta0 = beta
            lypnv[ii, jj] = sumT11/N
    return lypnv

# Compile
_ = lyapunov_grid(np.linspace(0, 1, 10), np.linspace(0, 1, 10), 1, 10)

# Parameters
N = int(1e3)
L = 128
pi = np.pi
k = 1.5

# Limits of the phase space
x0 = -pi
xf = pi
y0 = -pi
yf = pi

# Grid positions
x = np.linspace(x0, xf, L, endpoint=True)
y = np.linspace(y0, yf, L, endpoint=True)

lypnv = lyapunov_grid(x, y, k, N)
With parallel=False it takes about 8 s to run, whereas with parallel=True it takes about 14 s. I also tested another code from https://github.com/animator/mandelbrot-numba, and in that case the parallelization works.
import math
import numpy as np
import numba as nb

WIDTH = 1000
MAX_ITER = 1000

@nb.njit(parallel=True)
def mandelbrot(width, max_iter):
    pixels = np.zeros((width, width, 3), dtype=np.uint8)
    for y in nb.prange(width):
        for x in range(width):
            c0 = complex(3.0*x/width - 2, 3.0*y/width - 1.5)
            c = 0
            for i in range(1, max_iter):
                if abs(c) > 2:
                    log_iter = math.log(i)
                    pixels[y, x, :] = np.array([int(255*(1+math.cos(3.32*log_iter))/2),
                                                int(255*(1+math.cos(0.774*log_iter))/2),
                                                int(255*(1+math.cos(0.412*log_iter))/2)],
                                               dtype=np.uint8)
                    break
                c = c * c + c0
    return pixels

# compile
_ = mandelbrot(WIDTH, 10)

calcpixels = mandelbrot(WIDTH, MAX_ITER)
One main issue is that the second function call compiles the function again. Indeed, the types of the provided arguments change: in the first call the third argument is an integer (int converted to np.int_), while in the second call the third argument (k) is a floating-point number (float converted to np.float64). Numba recompiles the function for different parameter types because the types are deduced from the arguments, and it does not know you want to use np.float64 for the third argument (since the function is first compiled for an np.int_ type). One simple solution is to change the first call to:
_ = lyapunov_grid(np.linspace(0, 1, 10), np.linspace(0, 1, 10), 1.0, 10)
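To confirm that the double compilation really happens, you can inspect the dispatcher's compiled signatures (a small check, assuming lazy compilation, i.e. no explicit signature was given):

# First call with an integer k, second call with a float k:
_ = lyapunov_grid(np.linspace(0, 1, 10), np.linspace(0, 1, 10), 1, 10)
_ = lyapunov_grid(np.linspace(0, 1, 10), np.linspace(0, 1, 10), 1.5, 10)
print(lyapunov_grid.signatures)  # two entries, i.e. the function was compiled twice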
However, changing the warm-up call like this is not a robust way to fix the problem. You can instead specify the parameter types to Numba so that it compiles the function at declaration time. This also removes the need to artificially call the function (with useless parameters):
@nb.njit('float64[:,:](float64[::1], float64[::1], float64, int64)', parallel=True)  # N is an integer count, so it is typed as int64 here
Note that (J[0,0]*np.cos(beta0) - J[0,1]*np.sin(beta0)) is zero on the first iteration, resulting in a division by 0.
Another main issue comes from the allocation of many small arrays in the loop, which causes contention in the standard allocator (see this post for more information). While Numba could theoretically optimize this away (i.e. replace the array with local variables), it does not, resulting in a huge slowdown and contention. Fortunately, in your case, you do not need to create the array in the innermost loop at all: you can create it once in the encompassing loop and modify it in the innermost loop. Here is the optimized code:
@nb.njit('float64[:,:](float64[::1], float64[::1], float64, int64)', parallel=True)
def lyapunov_grid(x_grid, y_grid, k, N):
    L = len(x_grid)
    lypnv = np.zeros((L, L))
    for ii in nb.prange(L):
        J = np.ones((2, 2), dtype=np.float64)
        for jj in range(L):
            x = x_grid[ii]
            y = y_grid[jj]
            beta0 = 0
            sumT11 = 0
            for j in range(N):
                y = (y - k*np.sin(x)) % (2*np.pi)
                x = (x + y) % (2*np.pi)
                J[0, 1] = -k*np.cos(x)
                J[1, 1] = 1.0 - k*np.cos(x)
                beta = np.arctan((-J[1,0]*np.cos(beta0) + J[1,1]*np.sin(beta0))/(J[0,0]*np.cos(beta0) - J[0,1]*np.sin(beta0)))
                T11 = np.cos(beta0)*(J[0,0]*np.cos(beta) - J[1,0]*np.sin(beta)) - np.sin(beta0)*(J[0,1]*np.cos(beta) - J[1,1]*np.sin(beta))
                sumT11 += np.log(abs(T11))/np.log(2)
                beta0 = beta
            lypnv[ii, jj] = sumT11/N
    return lypnv
Here are the results on an old 2-core machine (with 4 hardware threads):
Original sequential: 15.9 s
Original parallel: 11.9 s
Fix-build sequential: 15.7 s
Fix-build parallel: 10.1 s
Optimized sequential: 2.73 s
Optimized parallel: 0.94 s
The optimized implementation is much faster than the others. The parallel optimized version scales very well compared to the original one (2.9 times faster than its sequential counterpart). Finally, the best version is about 12 times faster than the original parallel version. I would expect an even faster computation on a recent machine with many more cores.
Given two opposite corners of a rectangle (x1, y1) and (x2, y2) and two radii r1 and r2, find the ratio of points that lie between the circles defined by the radii r1 and r2 to the total number of points in the rectangle.
Simple NumPy approach:
def func_1(x1, y1, x2, y2, r1, r2, n):
    x11, y11 = np.meshgrid(np.linspace(x1, x2, n), np.linspace(y1, y2, n))
    z1 = np.sqrt(x11**2 + y11**2)
    a = np.where((z1 > r1) & (z1 < r2))
    fill_factor = len(a[0]) / (n * n)
    return fill_factor
Next I tried to optimize this function with the jit decorator from numba. When I use:
nopython = True
The function is faster and gives the right output. But when I also add:
parallel = True
The function is faster but gives the wrong result.
I know that this has something to do with my z matrix since that is not being updated properly.
@jit(nopython=True, parallel=True)
def func_2(x1, y1, x2, y2, r1, r2, n):
    x_ = np.linspace(x1, x2, n)
    y_ = np.linspace(y1, y2, n)
    z1 = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            z1[i][j] = np.sqrt(x_[i]*x_[i] + y_[j]*y_[j])
    a = np.where((z1 > r1) & (z1 < r2))
    fill_factor = len(a[0]) / (n * n)
    return fill_factor
Test values:
x1 = 1.0
x2 = -1.0
y1 = 1.0
y2 = -1.0
r1 = 0.5
r2 = 0.75
n = 25000
Additional info: Python 3.6.1, Numba 0.34.0+5.g1762237, NumPy 1.13.1
The problem with parallel=True is that it's a black box. Numba doesn't even guarantee that it will actually parallelize anything. It uses heuristics to find out whether the code is parallelizable and what can be done in parallel. These heuristics can fail, and in your example they do fail, just like in my own experiments with parallel and Numba. That makes parallel untrustworthy, and I would advise against using it!
In newer versions (0.34) prange was added, and you could have more luck with that. It can't be applied in this case, though, because prange works like range, and that's different from np.linspace...
Just a note: you can avoid building z and doing the np.where in your function completely; you could just do the checks explicitly:
import numpy as np
import numba as nb

@nb.njit  # equivalent to "jit(nopython=True)"
def func_2(x1, y1, x2, y2, r1, r2, n):
    x_ = np.linspace(x1, x2, n)
    y_ = np.linspace(y1, y2, n)
    cnts = 0
    for i in range(n):
        for j in range(n):
            z = np.sqrt(x_[i] * x_[i] + y_[j] * y_[j])
            if r1 < z < r2:
                cnts += 1
    fill_factor = cnts / (n * n)
    return fill_factor
That should also provide some speedup compared to your function, maybe even more than parallel=True would (if it worked correctly).
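As a side note, with a more recent Numba release the explicit-check loop above can also be parallelized with prange over the outer index; this is a hedged sketch (not part of the original answer, and untimed):

import numpy as np
from numba import njit, prange

@njit(parallel=True)
def func_2_parallel(x1, y1, x2, y2, r1, r2, n):
    x_ = np.linspace(x1, x2, n)
    y_ = np.linspace(y1, y2, n)
    cnts = 0
    for i in prange(n):
        row = 0
        for j in range(n):
            z = np.sqrt(x_[i] * x_[i] + y_[j] * y_[j])
            if r1 < z < r2:
                row += 1
        cnts += row  # Numba treats this as a parallel reduction
    return cnts / (n * n)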
For approximating the value of Pi, consider this stochastic method that populates an array with random values and tests for unit-circle inclusion:
import random as rd
import numpy as np

def r(_): return rd.random()

def np_pi(n):
    v_r = np.vectorize(r)
    x = v_r(np.zeros(n))
    y = v_r(np.zeros(n))
    return sum(x*x + y*y <= 1) * 4. / n
Note that the random number generation relies on the Python standard library; consider instead NumPy's random generation:
def np_pi(n):
    x = np.random.random(n)
    y = np.random.random(n)
    return sum(x*x + y*y <= 1) * 4. / n
Consider now the non-vectorized approach:
import random as rd

def dart_board():
    x, y = rd.random(), rd.random()
    return (x*x + y*y <= 1)

def pi(n):
    s = sum([dart_board() for _ in range(n)])
    return s * 4. / n
The non-vectorized form proves on average 4 times faster than the vectorized counterpart; for instance, consider n = 5000000 and timing from the OS command line as follows (Python 2.7, quad-core, 8 GB RAM, Red Hat Linux):
time python pi.py
time python np_pi.py
Thus the question: how can the vectorized approach be improved to increase its performance?
You are invoking the python builtin sum, rather than numpy's vectorized method sum:
import numpy as np
import random as rd

def np_pi(n):
    x = np.random.random(n)
    y = np.random.random(n)
    return (x*x + y*y <= 1).sum() * 4. / n  # ndarray.sum() instead of the builtin sum

def dart_board():
    x, y = rd.random(), rd.random()
    return (x*x + y*y <= 1)

def pi(n):
    s = sum([dart_board() for _ in range(n)])
    return s * 4. / n
Timing results are now much different:
In [12]: %timeit np_pi(10000)
1000 loops, best of 3: 250 us per loop
In [13]: %timeit pi(10000)
100 loops, best of 3: 3.54 ms per loop
It is my guess that calling the builtin sum on a NumPy array causes overhead by iterating over the array element by element, rather than using vectorized routines.
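A quick way to see the difference (illustrative only; the exact numbers depend on the machine):

import numpy as np
from timeit import timeit

a = np.random.random(1000000)
print(timeit(lambda: sum(a), number=10))   # builtin sum: Python-level loop over each element
print(timeit(lambda: a.sum(), number=10))  # ndarray.sum(): single vectorized reduction in C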
I started with this code, which calculates a simple escape-time fractal image. It runs with %timeit in around 7.85 s on my machine.
To try to speed this up I tried Cython, which reduced the time to 0.4 s. I also want to try the Numba JIT compiler to see if I can get similar speed-ups (with less effort). But adding the @jit decorator appears to give exactly the same timings (~7.8 s). I know it can't figure out the types in the calculate_z_numpy() call, but I'm not sure what I can do to coerce it. Any ideas?
from numba import jit
import numpy as np

@jit('f8(c8[:],c8[:],uint)')
def calculate_z_numpy(q, z, maxiter):
    """use vector operations to update all zs and qs to create new output array"""
    output = np.resize(np.array(0, dtype=np.int32), q.shape)
    for iteration in range(maxiter):
        z = z*z + q
        done = np.greater(abs(z), 2.0)
        q = np.where(done, 0+0j, q)
        z = np.where(done, 0+0j, z)
        output = np.where(done, iteration, output)
    return output

def calc_test():
    w = h = 1000
    maxiter = 1000
    # make a list of x and y values which will represent q
    # xx and yy are the co-ordinates, for the default configuration they'll look like:
    # if we have a 1000x1000 plot
    # xx = [-2.13, -2.1242, -2.1184000000000003, ..., 0.7526000000000064, 0.7584000000000064, 0.7642000000000064]
    # yy = [1.3, 1.2948, 1.2895999999999999, ..., -1.2844000000000058, -1.2896000000000059, -1.294800000000006]
    x1, x2, y1, y2 = -2.13, 0.77, -1.3, 1.3
    x_step = (float(x2 - x1) / float(w)) * 2
    y_step = (float(y1 - y2) / float(h)) * 2
    y = np.arange(y2, y1-y_step, y_step, dtype=np.complex)
    x = np.arange(x1, x2, x_step)
    q1 = np.empty(y.shape[0], dtype=np.complex)
    q1.real = x
    q1.imag = y
    # Transpose y
    x_y_square_matrix = x + y[:, np.newaxis]  # it is np.complex128
    # convert square matrix to a flattened vector using ravel
    q2 = np.ravel(x_y_square_matrix)
    # create z as a 0+0j array of the same length as q
    # note that it defaults to reals (float64) unless told otherwise
    z = np.zeros(q2.shape, np.complex128)
    output = calculate_z_numpy(q2, z, maxiter)
    print(output)

calc_test()
I figured out how to do this with some help from someone else.
@jit('i4[:](c16[:],c16[:],i4,i4[:])', nopython=True)
def calculate_z_numpy(q, z, maxiter, output):
    """use explicit loops to update all zs and qs and fill the output array"""
    for iteration in range(maxiter):
        for i in range(len(z)):
            z[i] = z[i]*z[i] + q[i]   # same update as the vectorized z = z*z + q
            if abs(z[i]) > 2:
                output[i] = iteration
                z[i] = 0+0j
                q[i] = 0+0j
    return output
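Note the signature change: the output array is now allocated by the caller and passed in, so the call in calc_test becomes something like this (a sketch using the variables defined there):

output = np.zeros(q2.shape, dtype=np.int32)  # matches the i4[:] slot in the signature
output = calculate_z_numpy(q2, z, maxiter, output)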
What I learnt is to use NumPy data structures as inputs (for typing), but to use C-like looping paradigms inside the function.
This runs in 402 ms, which is a touch faster than the Cython code (0.45 s), so for fairly minimal work in rewriting the loop explicitly we have a Python version that is (just) faster than the C one.
I need this function to be optimized as I am trying to make my OpenGL simulation run faster. I want to use Parakeet, but I can't quite understand how I would need to modify the code below to do so. Can you see what I should do?
def distanceMatrix(self, x, y, z):
    """Computes distances between all particles and places the result in a matrix
    such that the ij-th matrix entry corresponds to the distance between particle i and j"""
    xtemp = tile(x, (self.N, 1))
    dx = xtemp - xtemp.T
    ytemp = tile(y, (self.N, 1))
    dy = ytemp - ytemp.T
    ztemp = tile(z, (self.N, 1))
    dz = ztemp - ztemp.T
    # Particles 'feel' each other across the periodic boundaries
    if self.periodicX:
        dx[dx > self.L/2] = dx[dx > self.L/2] - self.L
        dx[dx < -self.L/2] = dx[dx < -self.L/2] + self.L
    if self.periodicY:
        dy[dy > self.L/2] = dy[dy > self.L/2] - self.L
        dy[dy < -self.L/2] = dy[dy < -self.L/2] + self.L
    if self.periodicZ:
        dz[dz > self.L/2] = dz[dz > self.L/2] - self.L
        dz[dz < -self.L/2] = dz[dz < -self.L/2] + self.L
    # Total distances
    d = sqrt(dx**2 + dy**2 + dz**2)
    # Mark zero entries with -1 to avoid divergences
    d[d == 0] = -1
    return d, dx, dy, dz
From what I can tell, Parakeet should be able to use the above function without modifications - it only uses Numpy and math. But, I always get the following error when calling the function from the Parakeet jit wrapper:
AssertionError: Unsupported function: <bound method Particles.distanceMatrix of <particles.Particles instance at 0x04CD8E90>>
Parakeet is still young, its NumPy support is incomplete, and your code touches on several features that don't yet work.
1) You're wrapping a method, while Parakeet so far only knows how to deal with functions. The common workaround is to make a @jit-wrapped helper function and have your method call into that with all of the required member data (see the sketch after this list). The reason that methods don't work is that it's non-trivial to assign a meaningful type to 'self'. It's not impossible, but tricky enough that methods won't make their way into Parakeet until lower-hanging fruit are plucked. Speaking of low-hanging fruit...
2) Boolean indexing. Not yet implemented but will be in the next release.
3) np.tile: Also doesn't work, will also probably be in the next release. If you want to see which builtins and NumPy library functions will work, take a look at Parakeet's mappings module.
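Here is a minimal sketch of the helper-function workaround mentioned in 1); the class layout and helper body are illustrative, not taken from the original code:

from parakeet import jit
import numpy as np

@jit
def _distance_helper(x, y, z, L, periodicX, periodicY, periodicZ):
    # the all-pairs work happens here, on plain arrays and scalars only
    dx = np.array([[a - b for a in x] for b in x])
    dy = np.array([[a - b for a in y] for b in y])
    dz = np.array([[a - b for a in z] for b in z])
    return np.sqrt(dx**2 + dy**2 + dz**2)

class Particles(object):
    def distanceMatrix(self, x, y, z):
        # the method just forwards its member data to the jitted free function
        return _distance_helper(x, y, z, self.L,
                                self.periodicX, self.periodicY, self.periodicZ)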
I rewrote your code to be a little friendlier to Parakeet:
@jit
def parakeet_dist(x, y, z, L, periodicX, periodicY, periodicZ):
    # perform all-pairs computations more explicitly
    # instead of tile + broadcasting
    def periodic_diff(x1, x2, periodic):
        diff = x1 - x2
        if periodic:
            if diff > (L / 2): diff -= L
            if diff < (-L / 2): diff += L
        return diff
    dx = np.array([[periodic_diff(x1, x2, periodicX) for x1 in x] for x2 in x])
    dy = np.array([[periodic_diff(y1, y2, periodicY) for y1 in y] for y2 in y])
    dz = np.array([[periodic_diff(z1, z2, periodicZ) for z1 in z] for z2 in z])
    d = np.sqrt(dx**2 + dy**2 + dz**2)
    # since we can't yet use boolean indexing for masking out zero distances
    # we have to fall back on explicit loops instead
    for i in xrange(len(x)):
        for j in xrange(len(x)):
            if d[i, j] == 0: d[i, j] = -1
    return d, dx, dy, dz
On my machine this runs only ~3x faster than NumPy for N = 2000 (0.39 s for NumPy vs. 0.14 s for Parakeet). If I rewrite the array traversals to use loops more explicitly, performance goes up to ~6x faster than NumPy (Parakeet runs in ~0.06 s):
@jit
def loopy_dist(x, y, z, L, periodicX, periodicY, periodicZ):
    N = len(x)
    dx = np.zeros((N, N))
    dy = np.zeros((N, N))
    dz = np.zeros((N, N))
    d = np.zeros((N, N))
    def periodic_diff(x1, x2, periodic):
        diff = x1 - x2
        if periodic:
            if diff > (L / 2): diff -= L
            if diff < (-L / 2): diff += L
        return diff
    for i in xrange(N):
        for j in xrange(N):
            dx[i, j] = periodic_diff(x[j], x[i], periodicX)
            dy[i, j] = periodic_diff(y[j], y[i], periodicY)
            dz[i, j] = periodic_diff(z[j], z[i], periodicZ)
            d[i, j] = dx[i, j] ** 2 + dy[i, j] ** 2 + dz[i, j] ** 2
            if d[i, j] == 0: d[i, j] = -1
            else: d[i, j] = np.sqrt(d[i, j])
    return d, dx, dy, dz
With a little creative rewriting you can also get the above code running in Numba, but it only goes ~1.5x faster than NumPy (0.25 seconds). Compile times were: Parakeet with comprehensions, 1 second; Parakeet with loops, 0.5 seconds; Numba with loops, 0.9 seconds.
Hopefully the next few releases will enable more idiomatic use of NumPy library functions, but for now comprehensions or loops are often the way to go.