Below are some of the functions I wrote for (squared) distance calculation in 3-D toroidal geometry for a collection of particles in that 3-D space:
import itertools
import time

import numpy as np
import scipy
import numba
from numba import njit

@njit(cache=True)
def get_dr2(i=np.array([]), j=np.array([]), cellsize=np.array([])):
    k = np.zeros(3, dtype=np.float64)
    dr2 = 0.0
    for idx in numba.prange(cellsize.shape[0]):
        k[idx] = (j[idx]-i[idx]) - cellsize[idx]*np.rint((j[idx]-i[idx])/cellsize[idx])
        dr2 += k[idx]**2
    return dr2

@numba.guvectorize(["void(float64[:],float64[:],float64[:],float64[:])"],
                   "(m),(m),(m)->()", nopython=True, cache=True)
def get_dr2_vec(i, j, cellsize, dr2):
    dr2[:] = 0.0
    k = np.zeros(3, dtype=np.float64)
    for idx in numba.prange(cellsize.shape[0]):
        k[idx] = (j[idx]-i[idx]) - cellsize[idx]*np.rint((j[idx]-i[idx])/cellsize[idx])
        dr2[0] += k[idx]**2

@njit(cache=True)
def pair_vec_gen(pIList=np.array([[]]), pJList=np.array([[]])):
    assert pIList.shape[1] == pJList.shape[1]
    vecI = np.zeros((pIList.shape[0]*pJList.shape[0], pIList.shape[1]))
    vecJ = np.zeros_like(vecI)
    for i in numba.prange(pIList.shape[0]):
        for j in numba.prange(pJList.shape[0]):
            for k in numba.prange(pIList.shape[1]):
                vecI[j+pJList.shape[0]*i][k] = pIList[i][k]
                vecJ[j+pJList.shape[0]*i][k] = pJList[j][k]
    return vecI, vecJ

@njit(cache=True)
def pair_vec_dist(pIList=np.array([[]]), pJList=np.array([[]]), cellsize=np.array([])):
    assert pIList.shape[1] == pJList.shape[1]
    vecI = np.zeros((pIList.shape[0]*pJList.shape[0], pIList.shape[1]))
    vecJ = np.zeros_like(vecI)
    r2List = np.zeros(vecI.shape[0])
    for i in numba.prange(pIList.shape[0]):
        for j in numba.prange(pJList.shape[0]):
            for k in numba.prange(pIList.shape[1]):
                vecI[j+pJList.shape[0]*i][k] = pIList[i][k]
                vecJ[j+pJList.shape[0]*i][k] = pJList[j][k]
    r2List = get_dr2_vec2(vecI, vecJ, cellsize)
    return r2List

@njit(cache=True)
def get_dr2_vec2(i=np.array([[]]), j=np.array([[]]), cellsize=np.array([])):
    dr2 = np.zeros(i.shape[0], dtype=np.float64)
    k = np.zeros(i.shape[1], dtype=np.float64)
    for m in numba.prange(i.shape[0]):
        for n in numba.prange(i.shape[1]):
            k[n] = (j[m,n]-i[m,n]) - cellsize[n]*np.rint((j[m,n]-i[m,n])/cellsize[n])
            dr2[m] += k[n]**2
    return dr2

def pair_dist_calculator_cdist(pIList=np.array([[]]), pJList=np.array([[]]), cellsize=np.array([])):
    assert pIList.shape[1] == pJList.shape[1]
    r2List = (scipy.spatial.distance.cdist(pIList, pJList, metric=get_dr2_wrapper(cellsize=cellsize))).flatten()
    return np.array(r2List).flatten()

def get_dr2_wrapper(cellsize=np.array([])):
    return lambda u, v: get_dr2(u, v, cellsize)
frames = 50
timedata = np.zeros((5, frames), dtype=np.float64)
N, dim = 100, 3  # 100 particles in 3D
cellsize = np.array([26.4, 19.4, 102.4])

for i in range(frames):
    print("\rIter {}".format(i), end='')
    vec = np.random.random((N, dim))
    rList1 = []; rList2 = []; rList3 = []; rList4 = []; rList5 = []
    #method 1
    #print("method 1")
    start = time.perf_counter()
    for (pI, pJ) in itertools.product(vec, vec):
        rList1.append(get_dr2(pI, pJ, cellsize))
    end = time.perf_counter()
    timedata[0, i] = (end-start)
    #method 2
    #print("method 2")
    pIvec = []; pJvec = []; rList2 = []
    start = time.perf_counter()
    for (pI, pJ) in itertools.product(vec, vec):
        pIvec.append(pI)
        pJvec.append(pJ)
    rList2 = get_dr2_vec(np.array(pIvec), np.array(pJvec), cellsize)
    end = time.perf_counter()
    timedata[1, i] = (end-start)
    #method 3
    #print("method 3")
    start = time.perf_counter()
    rList3 = get_dr2_vec(*pair_vec_gen(vec, vec), cellsize)
    end = time.perf_counter()
    timedata[2, i] = (end-start)
    #method 4
    #print("method 4")
    start = time.perf_counter()
    rList4 = pair_vec_dist(vec, vec, cellsize)
    end = time.perf_counter()
    timedata[3, i] = (end-start)
    #method 5
    #print("method 5")
    #start = time.perf_counter()
    #rList5 = pair_dist_calculator_cdist(np.array(pIvec), np.array(pJvec), cellsize)
    #end = time.perf_counter()
    #timedata[4, i] = (end-start)
    assert (rList1 == rList2).all()
    assert (rList2 == rList3).all()
    assert (rList3 == rList4).all()
    #assert rList4 == rList5

print("\n")
for i in range(4):
    print("Method {} Average time {:.3g}s \u00B1 {:.3g}s".format(i+1, np.mean(timedata[i, 1:]), np.std(timedata[i, 1:])))
exit()
The essential idea is that at a particular time you have a snapshot (frame) of the particles which contains their positions. To calculate all the distances between the particles we can use the following approaches:
1. Calculate the distance between points iteratively in pure Python, passing the positions of the two particles one pair at a time to Numba.
2. Create the iteration list (in pure Python) beforehand and pass the whole list to a Numba @guvectorize function.
3. Do (2) but with all steps in Numba.
4. Integrate all steps in (3) into one simple Numba function.
5. (optional) Pass the positions to scipy.spatial.distance.cdist with the distance function as the distance metric.
For 50 frames containing 100 particles we have the respective times (frames, N = 50, 100):
Method 1 Average time 0.017s ± 0.00555s
Method 2 Average time 0.0181s ± 0.00573s
Method 3 Average time 0.00182s ± 0.000944s
Method 4 Average time 0.000485s ± 0.000348s
For 50 frames containing 1000 particles we have the respective times (frames, N = 50, 1000):
Method 1 Average time 2.11s ± 0.977s
Method 2 Average time 2.42s ± 0.859s
Method 3 Average time 0.349s ± 0.12s
Method 4 Average time 0.0694s ± 0.022s
and for 1000 frames containing 100 particles we have the respective times (frames, N = 1000, 100):
Method 1 Average time 0.0244s ± 0.0166s
Method 2 Average time 0.0288s ± 0.0254s
Method 3 Average time 0.00258s ± 0.00231s
Method 4 Average time 0.000636s ± 0.00086s
(All the times shown above exclude the contribution from the first iteration.)
Method 5 simply fails due to memory requirements and is much slower than any other method.
Given the above dataset, I tend to prefer Method 4, though I am a bit concerned about the increase in average time when I go from 50 to 1000 frames. Are there any further optimizations I can do in these implementations, or does someone have ideas for much faster and more memory-conscious implementations? Any suggestions are welcome.
Update
Based on Jerome's answer the modified function is now:
@njit(cache=True, parallel=True)
def pair_vec_dist(pIList=np.array([[]]), pJList=np.array([[]]), cellsize=np.array([])):
    assert pIList.shape[1] == pJList.shape[1]
    assert cellsize.size == 3
    dr2 = np.zeros(pIList.shape[0]*pJList.shape[0], dtype=np.float64)
    inv_cellsize = 1.0 / cellsize
    for i in numba.prange(pIList.shape[0]):
        for j in range(pJList.shape[0]):
            offset = j + pJList.shape[0] * i
            xdist = pJList[j,0]-pIList[i,0]
            ydist = pJList[j,1]-pIList[i,1]
            zdist = pJList[j,2]-pIList[i,2]
            xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
            yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
            zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
            dr2[offset] = xk**2+yk**2+zk**2
    return dr2
As Jerome pointed out, a very simple optimization is to run the loops over just the lower half of the symmetric matrix that the distance calculation creates. However, in a realistic situation I might have vector lists pI and pJ where pI is a subset of pJ, which complicates the situation. Either I have to create two separate functions and control them via a wrapper function, or somehow manage that in one single function (a minimal wrapper of the first kind is sketched below). If there are any suggestions on how to do so, that would be really helpful.
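For reference, a minimal sketch of the wrapper idea. The name pair_vec_dist_sym is hypothetical here (a separate kernel that would exploit the symmetry, not shown at this point); pair_vec_dist is the function above.

def pair_vec_dist_any(pIList, pJList, cellsize):
    # Dispatch: use a symmetric (triangular) kernel when both inputs are the
    # same array, otherwise fall back to the general rectangular kernel above.
    if pIList is pJList:
        return pair_vec_dist_sym(pIList, cellsize)    # hypothetical symmetric kernel
    return pair_vec_dist(pIList, pJList, cellsize)    # general kernel from above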
Update 2
I should clarify the problem further. In this code I am trying to calculate the distance between all points in a frame/snapshot, which is then used for pair-distance distribution analysis. But in some cases we might want to focus on a subset of coordinates in a frame and calculate the distribution from their perspective. In such a case we select this subset smallVec from the pool of all coordinates vec (such that smallVec + restOfVec = vec) and calculate pair_vec_dist(smallVec, vec) instead of pair_vec_dist(vec, vec). Equivalently, one can combine the results of pair_vec_dist(smallVec, smallVec) and pair_vec_dist(smallVec, restOfVec).
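For concreteness, a minimal sketch of that combination, assuming the subset happens to be the first 20 rows of vec (the split itself is arbitrary here and relies on the names from the code above):

smallVec = vec[:20]     # the focus subset
restOfVec = vec[20:]    # the remaining coordinates, so smallVec + restOfVec = vec
dr2_subset = np.concatenate((pair_vec_dist(smallVec, smallVec),
                             pair_vec_dist(smallVec, restOfVec)))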
Based on the discussion with Jerome, I modified my function as:
@njit(cache=True, parallel=True)
def pair_vec_dist_cmb(pIList=np.array([[]]), pJList=np.array([[]]), cellsize=np.array([]), is_sq=True, is_nonsq=True):
    assert pIList.shape[1] == pJList.shape[1]
    assert cellsize.size == 3
    dr2_1 = 0; dr2_2 = 0
    dr2_1 = int(0.5*pIList.shape[0]*(pIList.shape[0]+1))
    if is_nonsq:
        dr2_2 = int(pIList.shape[0]*pJList.shape[0])
    dr2 = np.zeros((dr2_1+dr2_2), dtype=np.float64)
    inv_cellsize = 1.0 / cellsize
    for j in numba.prange(0, pIList.shape[0], 1):
        if is_sq:
            for i in range(j, pIList.shape[0], 1):
                index_1 = int(0.5*i*(i+1)+j)
                xdist = pIList[j,0]-pIList[i,0]
                ydist = pIList[j,1]-pIList[i,1]
                zdist = pIList[j,2]-pIList[i,2]
                xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
                yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
                zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
                dr2[index_1] = xk**2+yk**2+zk**2
        if is_nonsq:
            # jj runs over pJList; the outer index j addresses pIList here
            for jj in range(pJList.shape[0]):
                index_2 = dr2_1 + jj + pJList.shape[0]*j
                xdist = pJList[jj,0]-pIList[j,0]
                ydist = pJList[jj,1]-pIList[j,1]
                zdist = pJList[jj,2]-pIList[j,2]
                xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
                yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
                zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
                dr2[index_2] = xk**2+yk**2+zk**2
    return dr2
Here pI (size (N,3)) is a subset of pJ (size (M,3)). In this code we subdivide the calculation into two sections. One is the pI-pI pair distances, which are symmetric, hence we only need to calculate the lower-triangular part, i.e. N(N-1)/2 unique values. The other section is the pI-pJ distances, where we have to go through N(M-N) unique values. To further optimize the function, I made two additional changes (a small sanity check of the index mapping is sketched after this list):
1. Combining the outer loop for both sections. To do so I am now iterating over the upper-triangular matrix, which translates to N(N+1)/2 values. One could also add an if check to see whether the coordinates are identical, though I am not sure how much time that would save.
2. To avoid appending the results from the two sections together, I am predefining the returned array and partitioning it by length.
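As a quick standalone sanity check (not part of the benchmark) of the triangular index used above, one can verify that int(0.5*i*(i+1) + j) with j <= i hits every slot of a length-N(N+1)/2 array exactly once:

N = 5
seen = set()
for i in range(N):
    for j in range(i + 1):                      # j <= i, i.e. the triangular pairs
        seen.add(int(0.5 * i * (i + 1) + j))
assert seen == set(range(N * (N + 1) // 2))     # all slots covered, no collisions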
A further assumption I have made is that the time needed to partition vec into smallVec and restOfVec is negligible compared to the pair-distance calculation. Obviously, if that is wrong, one might need to rethink the optimization pathway.
The resultant function is 1.5 times faster than the previous one. I am looking to optimize it further, but I am very new to loop tiling and other advanced optimizations, so if you have any suggestions, please let me know.
Update 3
So I figured that I should focus on optimizing the function for serial execution, as I can simply use Dask or multiprocessing to work on multiple sections of an input collection of frames. The reference function is now:
@njit(cache=True, parallel=False, fastmath=True, boundscheck=False, nogil=True)
def pair_vec_dist_test(pIList, pJList, cellsize):
    _I = pIList.shape[0]
    _J = pJList.shape[0]
    dr2 = np.empty(int(_I*_J), dtype=np.float32)
    inv_cellsize = 1.0 / cellsize
    for i in numba.prange(pIList.shape[0]):
        for j in range(pJList.shape[0]):
            index = j + pJList.shape[0] * i
            xdist = pJList[j,0]-pIList[i,0]
            ydist = pJList[j,1]-pIList[i,1]
            zdist = pJList[j,2]-pIList[i,2]
            xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
            yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
            zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
            dr2[index] = xk**2+yk**2+zk**2
    return dr2
Going back to the main problem while ignoring the symmetry aspect, I tried to further optimize the distance function as:
@njit(cache=True, parallel=False, fastmath=True, boundscheck=False, nogil=True)
def pair_vec_dist_test_v2(pIList, pJList, cellsize):
    _I = pIList.shape[0]
    _J = pJList.shape[0]
    dr2 = np.empty(int(_I*_J), dtype=np.float32)
    inv_cellsize = 1.0 / cellsize
    tile = 32
    for ii in range(0, _I, tile):
        for jj in range(0, _J, tile):
            for i in range(ii, min(_I, ii+tile)):
                for j in range(jj, min(_J, jj+tile)):
                    index = j + _J * i
                    xdist = pJList[j,0]-pIList[i,0]
                    ydist = pJList[j,1]-pIList[i,1]
                    zdist = pJList[j,2]-pIList[i,2]
                    xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
                    yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
                    zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
                    dr2[index] = xk**2+yk**2+zk**2
    return dr2
which essentially tiles the two vector arrays. However, I couldn't get any speedup, as the execution time for both functions is roughly the same. I also thought about working with the transpose of the vector arrays, but I couldn't figure out how to align them in a loop when the vector lengths are not a multiple of the tile length. Does anyone have any further suggestions or ideas on how to proceed?
Edit: Another failed trial
@njit(cache=True, parallel=False, fastmath=True, boundscheck=False, nogil=True)
def pair_vec_dist_test_v3(pIList, pJList, cellsize):
    inv_cellsize = 1.0 / cellsize
    tile = 32
    _I = pIList.shape[0]
    _J = pJList.shape[0]
    vecI = np.empty((_I+2*tile, 3), dtype=np.float64)  # for rolling effect
    vecJ = np.empty((_J+2*tile, 3), dtype=np.float64)  # for rolling effect
    vecI_mask = np.ones((_I+2*tile), dtype=np.uint8)
    vecJ_mask = np.ones((_J+2*tile), dtype=np.uint8)
    vecI[:_I] = pIList
    vecJ[:_J] = pJList
    vecI[_I:] = 0.
    vecJ[_J:] = 0.
    vecI_mask[_I:] = 0
    vecJ_mask[_J:] = 0
    #print(vecI,vecJ)
    ILim = _I+(tile-_I%tile)
    JLim = _J+(tile-_J%tile)
    dr2 = np.empty((ILim*JLim), dtype=np.float64)
    vecI = vecI.T
    vecJ = vecJ.T
    for ii in range(ILim):
        for jj in range(0, JLim, tile):
            index = jj + JLim*ii
            #print(ii,jj,index)
            mask = np.multiply(vecJ_mask[jj:jj+tile], vecI_mask[ii:ii+tile])
            xdist = vecJ[0,jj:jj+tile]-vecI[0,ii:ii+tile]
            ydist = vecJ[1,jj:jj+tile]-vecI[1,ii:ii+tile]
            zdist = vecJ[2,jj:jj+tile]-vecI[2,ii:ii+tile]
            xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
            yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
            zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
            arr = xk**2+yk**2+zk**2
            dr2[index:index+tile] = np.multiply(arr, mask)
    return dr2
First things first: there are race conditions in your current code. This basically means the produced results can be corrupted (and it also impacts performance). In practice, this causes undefined behaviour. For example, k[n] is read and written by multiple threads in get_dr2_vec2. One needs to be very careful when using prange. In this case, the race condition can be removed by simply not using a temporary array, which is not really useful here, and by not using prange in the inner loop, since dr2[m] is updated there (updating it from multiple threads also causes a race condition).
Moreover, prange is often not practically useful when parallel=True is not set in the Numba decorator. Indeed, the current functions are not parallel since this flag is missing.
Finally, you can merge the functions pair_vec_dist and get_dr2_vec2 and their internal loops so as to avoid creating and filling large temporary arrays. Indeed, RAM throughput is pretty small nowadays compared to the computing power of modern processors, and this gap has been growing for the last two decades. This effect is called the "memory wall" and it is not expected to disappear any time soon. Code that is less memory-bound generally tends to be faster and to scale better.
Here is the resulting code:
@njit(cache=True, parallel=True)
def pair_vec_dist(pIList=np.array([[]]), pJList=np.array([[]]), cellsize=np.array([])):
    assert pIList.shape[1] == pJList.shape[1]
    dr2 = np.zeros(pIList.shape[0]*pJList.shape[0], dtype=np.float64)
    inv_cellsize = 1.0 / cellsize
    for i in numba.prange(pIList.shape[0]):
        for j in range(pJList.shape[0]):
            offset = j + pJList.shape[0] * i
            for k in range(pIList.shape[1]):
                tmp = pJList[j,k]-pIList[i,k]
                dk = tmp-cellsize[k]*np.rint(tmp*inv_cellsize[k])
                dr2[offset] += dk**2
    return dr2
It is 11 times faster with frames=50 and N=1000 on my 6-core machine (i5-9600KF).
The code can be optimized further. For example, dr2 is a flattened symmetric square matrix, so only the upper-right part needs to be computed and the bottom-left part can just be copied (a rough sketch of this idea follows below). Note that to do that efficiently in parallel, the work needs to be balanced between the threads (otherwise the slowest thread becomes the bottleneck). One can also generate an optimized version of the function that only supports cellsize.size == 3. Moreover, one can use register tiling so as to make the code more cache-friendly. Finally, one can transpose the input so the layout is more SIMD-friendly (this certainly requires the loop to be manually unrolled and the register tiling optimization to be done first).
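For the symmetric pI == pJ case, here is a rough sketch of that idea (my own illustration under the question's conventions, not code from the answer; work per prange iteration is uneven here, so the load balancing mentioned above would still be needed):

import numpy as np
import numba
from numba import njit

@njit(cache=True, parallel=True)
def pair_vec_dist_sym(pList, cellsize):
    # Compute only the pairs with j >= i and mirror each value into the
    # flattened n x n result, so no pair is computed twice.
    n = pList.shape[0]
    dr2 = np.zeros(n * n, dtype=np.float64)
    inv_cellsize = 1.0 / cellsize
    for i in numba.prange(n):
        for j in range(i, n):
            d2 = 0.0
            for k in range(pList.shape[1]):
                tmp = pList[j, k] - pList[i, k]
                d = tmp - cellsize[k] * np.rint(tmp * inv_cellsize[k])
                d2 += d * d
            dr2[i * n + j] = d2
            dr2[j * n + i] = d2   # mirrored entry
    return dr2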
I'm trying to construct a (p+1,n) matrix with the code below:
import numpy as np

p = 3
xh = np.linspace(0.5,1.5,p+1)
n = 7
x = np.linspace(0,2,n)
M = np.zeros([p+1,n])
l2 = 1
for i in range(len(x)):
    for k in range(len(xh)):
        for j in range(len(xh)):
            if k != j:
                l = (x[i]-xh[j])/(xh[k]-xh[j])
                l2 *= l
            elif k == j:
                l = 1
                l2 *= l
        M[k][i] = l2
        l2 = 1
print(M)
This method produces the matrix I want but is very slow (6 sec for p=40 and n=2000).
The matrix itself is a matrix of Lagrange polynomials, for approximating some function. The nodal points, xh, are the points used in forming/calculating the interpolation of a function. They have the property that their values on the original function and on the interpolation are always the same. The number of distinct nodal points (p+1) indicates the degree (p) of the polynomial for the Lagrange interpolation. The x points are where a function is to be evaluated; that could be the interpolation of the function or the function itself. This is the formula I'm following (the k-th Lagrange basis polynomial over the nodal points xh):
l_k(x) = prod_{j != k} (x - xh[j]) / (xh[k] - xh[j])
I don't know a faster way to construct such a matrix in numpy; other methods seem to keep going wrong when I apply them to the code I've got, and I don't know enough to see why. What faster method can I use here?
Your code can be nicely compiled by decorating the function with @nb.njit from the numba package. Some minor redundant parts were removed.
import numpy as np
import numba as nb

@nb.njit
def test(p,n):
    xh = np.linspace(0.5,1.5,p+1)
    x = np.linspace(0,2,n)
    M = np.zeros((p+1,n), dtype=nb.float64)
    l2 = 1
    for k in range(len(x)):
        for i in range(len(xh)):
            for j in range(len(xh)):
                if i != j:
                    l = (x[k]-xh[j])/(xh[i]-xh[j])
                else:
                    l = 1
                l2 *= l
            M[i][k] = l2
            l2 = 1
    return M
Benchmark for p=40, n=2000 on a 2-core colab instance. Array M was computed with your original code.
a = [0]
%timeit a[0] = test(40,2000)
np.testing.assert_allclose(M, a[0])
Runs in 5.57 ms per loop vs 2.24 s per loop or ~402x speed up.
So what I have, or want to create, is a 3D array of different parameters that I can then apply a function to, producing a new array (of the same size) with the results from the function. Basically I have something like this (R code):
x <- seq(0,1,0.01)
y <- seq(0,1,0.01)
z <- seq(0,100,0.1)
And let's say I have a function that is just:
result = x*data_point + y^2 + z^3
In principle I could probably just make three loops and save the result into an array (or something like that), but I would think that would take A LOT of computation time, especially if this step has to be done for several data points. In this case that would mean approximately 10,000,000 calculations per data point, and I have about a thousand, so in total around 10 billion calculations.
I understand that getting this resulting matrix will take some time no matter what, but are there some steps I can take to do it as fast as possible, or is looping the best way? I also need to be able to go back and say: "I want x = 0.2, y = 0.2, and z = 10 on data-point 5".
A solution in R would be the best, but if it can be done a lot faster in Python, that will work just as well.
The fastest way is to use NumPy's broadcasting. I modified the code from @EternusVia and it is about 14 times faster than his faster version. Avoid for loops wherever possible :)
import numpy as np
import time

# number of parameter values and patients
nx = 100
ny = 100
nz = 100
n_data = 100

# dummy data
x = np.linspace(0,1,nx)
y = np.linspace(1,2,ny)
z = np.linspace(2,3,nz)
data = np.linspace(0,100,n_data)

result2 = np.empty((n_data,nx,ny,nz))

# method 2 from @EternusVia
start = time.time()
y2 = np.power(y,2)
z3 = np.power(z,3)
for l in range(0,n_data):
    for i in range(0,nx):
        for j in range(0,ny):
            result2[l,i,j,:] = x[i]*data[l]+y2[j]+z3[:]
end = time.time()
print(end-start)

# method 3 using NumPy broadcasting
# expand the dimensions of the arrays depending on where
# they are in the final array
x_bc = x[np.newaxis, :, np.newaxis, np.newaxis]
y_bc = y[np.newaxis, np.newaxis, :, np.newaxis]
z_bc = z[np.newaxis, np.newaxis, np.newaxis, :]
data_bc = data[:, np.newaxis, np.newaxis, np.newaxis]

start = time.time()
# just write the equation, broadcasting will do the rest
# of the magic and calculate the results element-wise
result3 = x_bc * data_bc + np.power(y_bc, 2) + np.power(z_bc, 3)
end = time.time()
print(end-start)

print(np.array_equal(result2,result3))
Here are two ways to implement your problem in Python; I timed both. Running the first method on my machine for 100^4 elements took about 2 minutes, while the second method took only 4 seconds.
import numpy as np
import time

# number of parameter values and patients
nx = 100
ny = 100
nz = 100
n_data = 100

# dummy data
x = np.linspace(0,1,nx)
y = np.linspace(1,2,ny)
z = np.linspace(2,3,nz)
data = np.linspace(0,100,n_data)

result1 = np.empty((n_data,nx,ny,nz))
result2 = np.empty((n_data,nx,ny,nz))

# method 1
start = time.time()
y2 = np.power(y,2)
z3 = np.power(z,3)
for l in range(0,n_data):
    for i in range(0,nx):
        for j in range(0,ny):
            for k in range(0,nz):
                result1[l,i,j,k] = x[i]*data[l]+y2[j]+z3[k]
end = time.time()
print(end-start)

# method 2
start = time.time()
y2 = np.power(y,2)
z3 = np.power(z,3)
for l in range(0,n_data):
    for i in range(0,nx):
        for j in range(0,ny):
            result2[l,i,j,:] = x[i]*data[l]+y2[j]+z3[:]
end = time.time()
print(end-start)

print(np.array_equal(result1,result2))
Output:
133.110018015
4.36485505104
True
Are you looking for numpy.mgrid?
import numpy as np
x, y, z = np.mgrid[0:1:0.01, 0:1:0.01, 0:100:0.1]
data = np.mgrid[0:100:0.1] # could use np.arange here, but why?
# this will take some time
result = x * data[..., np.newaxis, np.newaxis, np.newaxis] + y**2 + z**3
print(result.shape)  # (1000, 100, 100, 1000)
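The question also asked how to read back one particular combination afterwards. A hedged sketch, assuming the (data, x, y, z) axis order that the broadcasting above produces and the 0.01/0.01/0.1 step sizes used for x, y and z:

ix = int(round(0.2 / 0.01))    # x = 0.2  -> index 20
iy = int(round(0.2 / 0.01))    # y = 0.2  -> index 20
iz = int(round(10 / 0.1))      # z = 10   -> index 100
value = result[5, ix, iy, iz]  # data-point 5 (0-based index here)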
I have a system of equations in the form of A*x = B where [A] is a tridiagonal coefficient matrix. Using the Numpy solver numpy.linalg.solve I can solve the system of equations for x.
See the example below of how I develop the tridiagonal [A] matrix, the {B} vector, and solve for x:
# Solve system of equations with a tridiagonal coefficient matrix
# uses numpy.linalg.solve
# use Python 3 print function
from __future__ import print_function
from __future__ import division
# modules
import numpy as np
import time
ti = time.clock()
#---- Build [A] array and {B} column vector
m = 1000 # size of array, make this 8000 to see time benefits
A = np.zeros((m, m)) # pre-allocate [A] array
B = np.zeros((m, 1)) # pre-allocate {B} column vector
A[0, 0] = 1
A[0, 1] = 2
B[0, 0] = 1
for i in range(1, m-1):
    A[i, i-1] = 7  # node-1
    A[i, i] = 8    # node
    A[i, i+1] = 9  # node+1
    B[i, 0] = 2
A[m-1, m-2] = 3
A[m-1, m-1] = 4
B[m-1, 0] = 3
print('A \n', A)
print('B \n', B)
#---- Solve using numpy.linalg.solve
x = np.linalg.solve(A, B) # solve A*x = B for x
print('x \n', x)
#---- Elapsed time for each approach
print('NUMPY time', time.clock()-ti, 'seconds')
So my question relates to two sections of the above example:
Since I am dealing with a tridiagonal matrix for [A], also called a banded matrix, is there a more efficient way to solve the system of equations instead of using numpy.linalg.solve?
Also, is there a better way to create the tridiagonal matrix instead of using a for-loop?
The above example runs on Linux in about 0.08 seconds according to the time.clock() function.
The numpy.linalg.solve function works fine, but I'm trying to find an approach that takes advantage of the tridiagonal form of [A] in hopes of speeding up the solution even further and then apply that approach to a more complicated example.
There are two immediate performance improvements: (1) do not use a loop to build the matrix, (2) use scipy.linalg.solve_banded().
I would write the code something more like this:
import numpy as np
import scipy.linalg as la

# Create arrays and set values
ab = np.zeros((3, m))
b = 2 * np.ones(m)
ab[0] = 9
ab[1] = 8
ab[2] = 7

# Fix end points
ab[0, 1] = 2
ab[1, 0] = 1
ab[1, -1] = 4
ab[2, -2] = 3
b[0] = 1
b[-1] = 3

x = la.solve_banded((1, 1), ab, b)
There may be more elegant ways to construct the matrix, but this works.
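As a quick check (a hypothetical addition, assuming the question's A and B are still in scope from the code above), the banded solution can be compared against the dense solver:

x_dense = np.linalg.solve(A, B)          # reference solution from the question
print(np.allclose(x, x_dense.ravel()))   # should print True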
Using %timeit in ipython the original code took 112 ms for m=1000. This code takes 2.94 ms for m=10,000, an order of magnitude larger problem yet still almost two orders of magnitude faster! I did not have the patience to wait on the original code for m=10,000. Most of the time in the original may be in constructing the array, I did not test this. Regardless, for large arrays it is much more efficient to only store the non-zero values of the matrix.
There is a scipy.sparse matrix type called scipy.sparse.dia_matrix which captures the structure of your matrix well (it will store 3 arrays, in "positions" 0 (diagonal), 1 (above) and -1 (below)). Using this type of matrix you can try scipy.sparse.linalg.lsqr for solving. If your problem has an exact solution, it will be found, otherwise it will find the solution in least squares sense.
from scipy import sparse
from scipy.sparse import linalg as splinalg

A_sparse = sparse.dia_matrix(A)
ret_values = splinalg.lsqr(A_sparse, B)
x = ret_values[0]
However, this may not be completely optimal in terms of exploiting the tridiagonal structure; there may be a theoretical way of making this faster. What this conversion does do for you is cut down the matrix multiplication expenses to the essential: only the 3 bands are used. This, in combination with the iterative solver lsqr, should already yield a speedup.
Note: I am not proposing scipy.sparse.linalg.spsolve, because it converts your matrix to csr format. However, replacing lsqr with spsolve is worth a try, especially because spsolve can bind UMFPACK, see relevant doc on spsolve. Also, it may be of interest to take a look at this stackoverflow question and answer relating to UMFPACK
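For reference, a minimal sketch of that spsolve variant, assuming the same A_sparse and B as above (converting to CSC first, the format spsolve works with directly):

from scipy.sparse.linalg import spsolve
x_direct = spsolve(A_sparse.tocsc(), B.ravel())  # direct sparse solve of A*x = B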
You could use scipy.linalg.solveh_banded.
EDIT: You CANNOT use the above, as your matrix is not symmetric and I thought it was. However, as was mentioned above in the comments, the Thomas algorithm is great for this:
a = [7] * (m - 2) + [3]
b = [1] + [8] * (m - 2) + [4]
c = [2] + [9] * (m - 2)
d = [1] + [2] * (m - 2) + [3]

# This is taken directly from the Wikipedia page also cited above
# this overwrites b and d
def TDMASolve(a, b, c, d):
    n = len(d)  # n is the number of rows; a and c have length n-1
    for i in xrange(n-1):
        d[i+1] -= 1. * d[i] * a[i] / b[i]
        b[i+1] -= 1. * c[i] * a[i] / b[i]
    for i in reversed(xrange(n-1)):
        d[i] -= d[i+1] * c[i] / b[i+1]
    return [d[i] / b[i] for i in xrange(n)]
This code is not optimized, nor does it use np, but if I (or any of the other fine folks here) have time, I will edit it so that it does those things. It currently times at ~10 ms for m=10000.
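For reference, a numpy-flavoured variant of the same Thomas algorithm (just a sketch, not the promised edit; it copies b and d instead of overwriting the caller's lists):

import numpy as np

def tdma_solve_np(a, b, c, d):
    # a: sub-diagonal (n-1), b: diagonal (n), c: super-diagonal (n-1), d: rhs (n)
    b = np.asarray(b, dtype=float).copy()
    d = np.asarray(d, dtype=float).copy()
    a = np.asarray(a, dtype=float)
    c = np.asarray(c, dtype=float)
    n = d.size
    for i in range(n - 1):              # forward elimination
        w = a[i] / b[i]
        b[i + 1] -= w * c[i]
        d[i + 1] -= w * d[i]
    x = np.empty(n)
    x[-1] = d[-1] / b[-1]
    for i in range(n - 2, -1, -1):      # back substitution
        x[i] = (d[i] - c[i] * x[i + 1]) / b[i]
    return x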
This probably will help.
There is a function create_tridiagonal which will create a tridiagonal matrix, and another function, diagonal_form, which converts a square matrix into the diagonal ordered form requested by the SciPy solve_banded function.
import numpy as np

def lu_decomp3(a):
    """
    c,d,e = lu_decomp3(a).
    LU decomposition of tridiagonal matrix a = [c\d\e]. On output
    {c},{d} and {e} are the diagonals of the decomposed matrix a.
    """
    n = np.diagonal(a).size
    assert(np.all(a.shape == (n,n)))  # check if square matrix
    d = np.copy(np.diagonal(a))  # without copy (assignment destination is read-only) error is raised
    e = np.copy(np.diagonal(a, 1))
    c = np.copy(np.diagonal(a, -1))
    for k in range(1,n):
        lam = c[k-1]/d[k-1]
        d[k] = d[k] - lam*e[k-1]
        c[k-1] = lam
    return c,d,e

def lu_solve3(c,d,e,b):
    """
    x = lu_solve(c,d,e,b).
    Solves [c\d\e]{x} = {b}, where {c}, {d} and {e} are the
    vectors returned from lu_decomp3.
    """
    n = len(d)
    y = np.zeros_like(b)
    y[0] = b[0]
    for k in range(1,n):
        y[k] = b[k] - c[k-1]*y[k-1]
    x = np.zeros_like(b)
    x[n-1] = y[n-1]/d[n-1]  # there is no x[n] out of range
    for k in range(n-2,-1,-1):
        x[k] = (y[k] - e[k]*x[k+1])/d[k]
    return x

from scipy.sparse import diags

def create_tridiagonal(size=4):
    diag = np.random.randn(size)*100
    diag_pos1 = np.random.randn(size-1)*10
    diag_neg1 = np.random.randn(size-1)*10
    a = diags([diag_neg1, diag, diag_pos1], offsets=[-1, 0, 1], shape=(size,size)).todense()
    return a

a = create_tridiagonal(4)
b = np.random.randn(4)*10
print('matrix a is\n = {} \n\n and vector b is \n {}'.format(a, b))

c, d, e = lu_decomp3(a)
x = lu_solve3(c, d, e, b)
print("x from our function is {}".format(x))
print("check is answer correct ({})".format(np.allclose(np.dot(a, x), b)))

## Test Scipy
from scipy.linalg import solve_banded

def diagonal_form(a, upper=1, lower=1):
    """
    a is a numpy square matrix
    this function converts a square matrix to diagonal ordered form
    returned matrix in ab shape which can be used directly for scipy.linalg.solve_banded
    """
    n = a.shape[1]
    assert(np.all(a.shape == (n,n)))

    ab = np.zeros((2*n-1, n))

    for i in range(n):
        ab[i,(n-1)-i:] = np.diagonal(a,(n-1)-i)

    for i in range(n-1):
        ab[(2*n-2)-i,:i+1] = np.diagonal(a,i-(n-1))

    mid_row_inx = int(ab.shape[0]/2)
    upper_rows = [mid_row_inx - i for i in range(1, upper+1)]
    upper_rows.reverse()
    upper_rows.append(mid_row_inx)
    lower_rows = [mid_row_inx + i for i in range(1, lower+1)]
    keep_rows = upper_rows + lower_rows
    ab = ab[keep_rows,:]

    return ab

ab = diagonal_form(a, upper=1, lower=1)  # for tridiagonal matrix upper and lower = 1
x_sp = solve_banded((1,1), ab, b)
print("is our answer the same as scipy answer ({})".format(np.allclose(x, x_sp)))
I need help vectorizing this code. Right now, with N=100, its takes a minute or so to run. I would like to speed that up. I have done something like this for a double loop, but never with a 3D loop, and I am having difficulties.
import numpy as np

N = 100
n = 12
r = np.sqrt(2)

x = np.arange(-N,N+1)
y = np.arange(-N,N+1)
z = np.arange(-N,N+1)

C = 0
for i in x:
    for j in y:
        for k in z:
            if (i+j+k)%2==0 and (i*i+j*j+k*k!=0):
                p = np.sqrt(i*i+j*j+k*k)
                p = p/r
                q = (1/p)**n
                C += q

print '\n'
print C
The meshgrid/where/indexing solution is already extremely fast. I made it about 65 % faster. This is not too much, but I explain it anyway, step by step:
It was easiest for me to approach this problem with all 3D vectors in the grid being columns in one large 2D 3 x M array. meshgrid is the right tool for creating all the combinations (note that numpy version >= 1.7 is required for a 3D meshgrid), and vstack + reshape bring the data into the desired form. Example:
>>> np.vstack(np.meshgrid(*[np.arange(0, 2)]*3)).reshape(3,-1)
array([[0, 0, 1, 1, 0, 0, 1, 1],
       [0, 0, 0, 0, 1, 1, 1, 1],
       [0, 1, 0, 1, 0, 1, 0, 1]])
Each column is one 3D vector. Each of these eight vectors represents one corner of a 1x1x1 cube (a 3D grid with step size 1 and length 1 in all dimensions).
Let's call this array vectors (it contains all 3D vectors representing all points in the grid). Then, prepare a bool mask for selecting those vectors fulfilling your mod2 criterion:
mod2bool = np.sum(vectors, axis=0) % 2 == 0
np.sum(vectors, axis=0) creates a 1 x M array containing the element sum for each column vector. Hence, mod2bool is a 1 x M array with a bool value for each column vector. Now use this bool mask:
vectorsubset = vectors[:,mod2bool]
This selects all rows (:) and uses boolean indexing for filtering the columns, both are fast operations in numpy. Calculate the lengths of the remaining vectors, using the native numpy approach:
lengths = np.sqrt(np.sum(vectorsubset**2, axis=0))
This is quite fast -- however, scipy.stats.ss and bottleneck.ss can perform the squared sum calculation even faster than this.
Transform the lengths using your instructions:
with np.errstate(divide='ignore'):
    p = (r/lengths)**n
This involves finite number division by zero, resulting in Infs in the output array. This is entirely fine. We use numpy's errstate context manager for making sure that these zero divisions do not throw an exception or a runtime warning.
Now sum up the finite elements (ignore the infs) and return the sum:
return np.sum(p[np.isfinite(p)])
I have implemented this method two times below. Once exactly like just explained, and once involving bottleneck's ss and nansum functions. I have also added your method for comparison, and a modified version of your method that skips the np.where((x*x+y*y+z*z)!=0) indexing, but rather creates Infs, and finally sums up the isfinite way.
import sys
import numpy as np
import bottleneck as bn

N = 100
n = 12
r = np.sqrt(2)

x,y,z = np.meshgrid(*[np.arange(-N, N+1)]*3)
gridvectors = np.vstack((x,y,z)).reshape(3, -1)

def measure_time(func):
    import time
    def modified_func(*args, **kwargs):
        t0 = time.time()
        result = func(*args, **kwargs)
        duration = time.time() - t0
        print("%s duration: %.3f s" % (func.__name__, duration))
        return result
    return modified_func

@measure_time
def method_columnvecs(vectors):
    mod2bool = np.sum(vectors, axis=0) % 2 == 0
    vectorsubset = vectors[:,mod2bool]
    lengths = np.sqrt(np.sum(vectorsubset**2, axis=0))
    with np.errstate(divide='ignore'):
        p = (r/lengths)**n
    return np.sum(p[np.isfinite(p)])

@measure_time
def method_columnvecs_opt(vectors):
    # On my system, bn.nansum is even slightly faster than np.sum.
    mod2bool = bn.nansum(vectors, axis=0) % 2 == 0
    # Use ss from bottleneck or scipy.stats (axis=0 is default).
    lengths = np.sqrt(bn.ss(vectors[:,mod2bool]))
    with np.errstate(divide='ignore'):
        p = (r/lengths)**n
    return bn.nansum(p[np.isfinite(p)])

@measure_time
def method_original(x,y,z):
    ind = np.where((x+y+z)%2==0)
    x = x[ind]
    y = y[ind]
    z = z[ind]
    ind = np.where((x*x+y*y+z*z)!=0)
    x = x[ind]
    y = y[ind]
    z = z[ind]
    p = np.sqrt(x*x+y*y+z*z)/r
    return np.sum((1/p)**n)

@measure_time
def method_original_finitesum(x,y,z):
    ind = np.where((x+y+z)%2==0)
    x = x[ind]
    y = y[ind]
    z = z[ind]
    lengths = np.sqrt(x*x+y*y+z*z)
    with np.errstate(divide='ignore'):
        p = (r/lengths)**n
    return np.sum(p[np.isfinite(p)])

print method_columnvecs(gridvectors)
print method_columnvecs_opt(gridvectors)
print method_original(x,y,z)
print method_original_finitesum(x,y,z)
This is the output:
$ python test.py
method_columnvecs duration: 1.295 s
12.1318801965
method_columnvecs_opt duration: 1.162 s
12.1318801965
method_original duration: 1.936 s
12.1318801965
method_original_finitesum duration: 1.714 s
12.1318801965
All methods produce the same result. Your method becomes a bit faster when doing the isfinite style sum. My methods are faster, but I would say that this is an exercise of academic nature rather than an important improvement :-)
I have one question left: you were saying that for N=3, the calculation should produce a 12. Even yours doesn't do this. All methods above produce 12.1317530867 for N=3. Is this expected?
Thanks to @Bill, I was able to get this to work. Very fast now. Perhaps it could be done better, especially with the two masks used to get rid of the two conditions that I originally had inside the for loops.
from __future__ import division
import numpy as np
N = 100
n = 12
r = np.sqrt(2)
x, y, z = np.meshgrid(*[np.arange(-N, N+1)]*3)
ind = np.where((x+y+z)%2==0)
x = x[ind]
y = y[ind]
z = z[ind]
ind = np.where((x*x+y*y+z*z)!=0)
x = x[ind]
y = y[ind]
z = z[ind]
p=np.sqrt(x*x+y*y+z*z)/r
ans = (1/p)**n
ans = np.sum(ans)
print 'ans'
print ans