Below are some of the functions I wrote for squared-distance calculation in 3-D toroidal geometry for a collection of particles in that 3-D space:
import itertools
import time
import numpy as np
import scipy
import numba
from numba import njit

@njit(cache=True)
def get_dr2(i=np.array([]), j=np.array([]), cellsize=np.array([])):
    k = np.zeros(3, dtype=np.float64)
    dr2 = 0.0
    for idx in numba.prange(cellsize.shape[0]):
        k[idx] = (j[idx]-i[idx]) - cellsize[idx]*np.rint((j[idx]-i[idx])/cellsize[idx])
        dr2 += k[idx]**2
    return dr2
@numba.guvectorize(["void(float64[:],float64[:],float64[:],float64[:])"],
                   "(m),(m),(m)->()", nopython=True, cache=True)
def get_dr2_vec(i, j, cellsize, dr2):
    dr2[:] = 0.0
    k = np.zeros(3, dtype=np.float64)
    for idx in numba.prange(cellsize.shape[0]):
        k[idx] = (j[idx]-i[idx]) - cellsize[idx]*np.rint((j[idx]-i[idx])/cellsize[idx])
        dr2[0] += k[idx]**2
@njit(cache=True)
def pair_vec_gen(pIList=np.array([[]]), pJList=np.array([[]])):
    assert pIList.shape[1] == pJList.shape[1]
    vecI = np.zeros((pIList.shape[0]*pJList.shape[0], pIList.shape[1]))
    vecJ = np.zeros_like(vecI)
    for i in numba.prange(pIList.shape[0]):
        for j in numba.prange(pJList.shape[0]):
            for k in numba.prange(pIList.shape[1]):
                vecI[j+pJList.shape[0]*i][k] = pIList[i][k]
                vecJ[j+pJList.shape[0]*i][k] = pJList[j][k]
    return vecI, vecJ
@njit(cache=True)
def pair_vec_dist(pIList=np.array([[]]), pJList=np.array([[]]), cellsize=np.array([])):
    assert pIList.shape[1] == pJList.shape[1]
    vecI = np.zeros((pIList.shape[0]*pJList.shape[0], pIList.shape[1]))
    vecJ = np.zeros_like(vecI)
    r2List = np.zeros(vecI.shape[0])
    for i in numba.prange(pIList.shape[0]):
        for j in numba.prange(pJList.shape[0]):
            for k in numba.prange(pIList.shape[1]):
                vecI[j+pJList.shape[0]*i][k] = pIList[i][k]
                vecJ[j+pJList.shape[0]*i][k] = pJList[j][k]
    r2List = get_dr2_vec2(vecI, vecJ, cellsize)
    return r2List
@njit(cache=True)
def get_dr2_vec2(i=np.array([[]]), j=np.array([[]]), cellsize=np.array([])):
    dr2 = np.zeros(i.shape[0], dtype=np.float64)
    k = np.zeros(i.shape[1], dtype=np.float64)
    for m in numba.prange(i.shape[0]):
        for n in numba.prange(i.shape[1]):
            k[n] = (j[m,n]-i[m,n]) - cellsize[n]*np.rint((j[m,n]-i[m,n])/cellsize[n])
            dr2[m] += k[n]**2
    return dr2
def pair_dist_calculator_cdist(pIList=np.array([[]]), pJList=np.array([[]]), cellsize=np.array([])):
    assert pIList.shape[1] == pJList.shape[1]
    r2List = (scipy.spatial.distance.cdist(pIList, pJList, metric=get_dr2_wrapper(cellsize=cellsize))).flatten()
    return np.array(r2List).flatten()

def get_dr2_wrapper(cellsize=np.array([])):
    return lambda u, v: get_dr2(u, v, cellsize)
frames = 50
timedata = np.zeros((5, frames), dtype=np.float64)
N, dim = 100, 3  # 100 particles in 3D
cellsize = np.array([26.4, 19.4, 102.4])
for i in range(frames):
    print("\rIter {}".format(i), end='')
    vec = np.random.random((N, dim))
    rList1 = []; rList2 = []; rList3 = []; rList4 = []; rList5 = []
    # method 1
    #print("method 1")
    start = time.perf_counter()
    for (pI, pJ) in itertools.product(vec, vec):
        rList1.append(get_dr2(pI, pJ, cellsize))
    end = time.perf_counter()
    timedata[0, i] = (end-start)
    # method 2
    #print("method 2")
    pIvec = []; pJvec = []; rList2 = []
    start = time.perf_counter()
    for (pI, pJ) in itertools.product(vec, vec):
        pIvec.append(pI)
        pJvec.append(pJ)
    rList2 = get_dr2_vec(np.array(pIvec), np.array(pJvec), cellsize)
    end = time.perf_counter()
    timedata[1, i] = (end-start)
    # method 3
    #print("method 3")
    start = time.perf_counter()
    rList3 = get_dr2_vec(*pair_vec_gen(vec, vec), cellsize)
    end = time.perf_counter()
    timedata[2, i] = (end-start)
    # method 4
    #print("method 4")
    start = time.perf_counter()
    rList4 = pair_vec_dist(vec, vec, cellsize)
    end = time.perf_counter()
    timedata[3, i] = (end-start)
    # method 5
    #print("method 5")
    #start = time.perf_counter()
    #rList5 = pair_dist_calculator_cdist(np.array(pIvec), np.array(pJvec), cellsize)
    #end = time.perf_counter()
    #timedata[4, i] = (end-start)
    assert (rList1 == rList2).all()
    assert (rList2 == rList3).all()
    assert (rList3 == rList4).all()
    #assert rList4 == rList5
print("\n")
for i in range(4):
    print("Method {} Average time {:.3g}s \u00B1 {:.3g}s".format(i+1, np.mean(timedata[i,1:]), np.std(timedata[i,1:])))
exit()
The essential idea is that at a particular time you have a snapshot (frame) of the particles, which contains their positions. To calculate all the pairwise distances between the particles we can use the following approaches:
1. Calculate the distance between points iteratively in pure Python, passing the positions of the two particles one pair at a time to Numba.
2. Create an iteration list (in pure Python) beforehand and pass the whole list to a Numba @guvectorize function.
3. Do (2), but with all steps in Numba.
4. Integrate all steps of (3) into a single Numba function.
5. (optional) Pass the positions to scipy.spatial.distance.cdist with the distance function as the distance metric.
For 50 frames containing 100 particles we have the respective times (frames, N = 50, 100):
Method 1 Average time 0.017s ± 0.00555s
Method 2 Average time 0.0181s ± 0.00573s
Method 3 Average time 0.00182s ± 0.000944s
Method 4 Average time 0.000485s ± 0.000348s
For 50 frames containing 1000 particles we have the respective times (frames, N = 50, 1000):
Method 1 Average time 2.11s ± 0.977s
Method 2 Average time 2.42s ± 0.859s
Method 3 Average time 0.349s ± 0.12s
Method 4 Average time 0.0694s ± 0.022s
and for 1000 frames containing 100 particles we have the respective times (frames, N = 1000, 100):
Method 1 Average time 0.0244s ± 0.0166s
Method 2 Average time 0.0288s ± 0.0254s
Method 3 Average time 0.00258s ± 0.00231s
Method 4 Average time 0.000636s ± 0.00086s
(All the times shown above exclude the contribution from the first iteration.)
Method 5 simply fails due to memory requirements and is much slower than any other method.
Given the above data, I tend to prefer Method 4, though I am a bit concerned about the increase in average time when I go from 50 to 1000 frames. Are there any further optimizations I can do in these implementations, or does someone have ideas for much faster and more memory-conscious implementations? Any suggestions are welcome.
Update
Based on Jerome's answer the modified function is now:
@njit(cache=True, parallel=True)
def pair_vec_dist(pIList=np.array([[]]), pJList=np.array([[]]), cellsize=np.array([])):
    assert pIList.shape[1] == pJList.shape[1]
    assert cellsize.size == 3
    dr2 = np.zeros(pIList.shape[0]*pJList.shape[0], dtype=np.float64)
    inv_cellsize = 1.0 / cellsize
    for i in numba.prange(pIList.shape[0]):
        for j in range(pJList.shape[0]):
            offset = j + pJList.shape[0] * i
            xdist = pJList[j,0]-pIList[i,0]
            ydist = pJList[j,1]-pIList[i,1]
            zdist = pJList[j,2]-pIList[i,2]
            xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
            yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
            zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
            dr2[offset] = xk**2+yk**2+zk**2
    return dr2
As Jerome pointed out, a very simple optimization would be to run the loops over just the "lower half of the symmetric matrix" that the distance calculation creates. However, in a realistic situation I might have vector lists pI and pJ where pI is a subset of pJ, which complicates the situation. Either I have to create two separate functions and control them via a wrapper function (as sketched below), or somehow manage it in one single function. If there are any suggestions on how to do so, that would be really helpful.
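A bare-bones sketch of that wrapper idea (pair_vec_dist_sym here is a hypothetical symmetric-only variant, not a function defined above):

def pair_dist_dispatch(pIList, pJList, cellsize):
    # symmetric case: compute only the triangular part with a dedicated kernel
    if pIList is pJList:
        return pair_vec_dist_sym(pIList, cellsize)
    # general case: full pI-pJ rectangle
    return pair_vec_dist(pIList, pJList, cellsize)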
Update 2
I should clarify the problem further. In this code I am trying to calculate the distances between all points in a frame/snapshot, which are then used for pair-distance distribution analysis. But in some cases we might want to focus on a subset of coordinates in a frame and calculate the distribution from their perspective. In such a case we select this subset smallVec from the pool of all coordinates vec (such that smallVec + restOfVec = vec) and calculate pair_vec_dist(smallVec, vec) instead of pair_vec_dist(vec, vec). For this calculation one can concatenate the results of pair_vec_dist(smallVec, smallVec) and pair_vec_dist(smallVec, restOfVec).
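Concretely, something like the following (reusing the names from above):

# combine the symmetric part and the mixed part into one flat result array
r2_all = np.concatenate((pair_vec_dist(smallVec, smallVec, cellsize),
                         pair_vec_dist(smallVec, restOfVec, cellsize)))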
Based on the discussion with Jerome, I modified my function as:
@njit(cache=True, parallel=True)
def pair_vec_dist_cmb(pIList=np.array([[]]), pJList=np.array([[]]), cellsize=np.array([]), is_sq=True, is_nonsq=True):
    assert pIList.shape[1] == pJList.shape[1]
    assert cellsize.size == 3
    dr2_1 = 0; dr2_2 = 0
    dr2_1 = int(0.5*pIList.shape[0]*(pIList.shape[0]+1))
    if is_nonsq:
        dr2_2 = int(pIList.shape[0]*pJList.shape[0])
    dr2 = np.zeros((dr2_1+dr2_2), dtype=np.float64)
    inv_cellsize = 1.0 / cellsize
    for j in numba.prange(0, pIList.shape[0], 1):
        if is_sq:
            for i in range(j, pIList.shape[0], 1):
                index_1 = int(0.5*i*(i+1)+j)
                xdist = pIList[j,0]-pIList[i,0]
                ydist = pIList[j,1]-pIList[i,1]
                zdist = pIList[j,2]-pIList[i,2]
                xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
                yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
                zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
                dr2[index_1] = xk**2+yk**2+zk**2
        if is_nonsq:
            for i in range(pJList.shape[0]):
                index_2 = dr2_1 + i + pJList.shape[0]*j
                xdist = pJList[i,0]-pIList[j,0]
                ydist = pJList[i,1]-pIList[j,1]
                zdist = pJList[i,2]-pIList[j,2]
                xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
                yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
                zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
                dr2[index_2] = xk**2+yk**2+zk**2
    return dr2
Here pI (size (N,3)) is a subset of pJ (size (M,3)). In this code we subdivide the calculation into two sections: the pair distances within pI-pI, which are symmetric, so we can calculate only a triangular part of the matrix, i.e. N(N-1)/2 unique values; and the pI-pJ distances, where we have to go through M(M-N) unique values. To further optimize the function, I have made two additional changes:
Combining the outer loop for both sections. In order to do so I am now iterating over the upper triangular matrix, which translates to N(N+1)/2 values (the index packing is illustrated below). One could also add an if check to see whether the two coordinates are identical, though I am not sure how much time that would save.
To avoid appending the results from the two sections together, I am predefining the returned array and partitioning it by length.
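For reference, this is the packing that index_1 = int(0.5*i*(i+1) + j) (with j <= i) implements; a quick sanity check for N = 3:

# triangular packing used above: pairs (i, j) with j <= i are stored
# contiguously, N*(N+1)//2 entries in total
N = 3
for i in range(N):
    for j in range(i + 1):
        print((i, j), "->", i * (i + 1) // 2 + j)
# (0,0) -> 0, (1,0) -> 1, (1,1) -> 2, (2,0) -> 3, (2,1) -> 4, (2,2) -> 5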
A further assumption I have made is that the time needed for partitioning vec into smallVec and restOfVec is negligible compared to the pair distance calculation. Obviously, if that is wrong, one might need to rethink the optimization pathway.
The resulting function is 1.5 times faster than the previous one. I am looking to optimize it further, but I am very new to loop tiling and other advanced optimizations, so if you have any suggestions, please let me know.
Update 3
So I figured that I should focus on optimizing the function for serial execution, as I can simply use Dask or multiprocessing to work on multiple sections of an input collection of frames. The reference function is now:
@njit(cache=True, parallel=False, fastmath=True, boundscheck=False, nogil=True)
def pair_vec_dist_test(pIList, pJList, cellsize):
    _I = pIList.shape[0]
    _J = pJList.shape[0]
    dr2 = np.empty(int(_I*_J), dtype=np.float32)
    inv_cellsize = 1.0 / cellsize
    for i in numba.prange(pIList.shape[0]):
        for j in range(pJList.shape[0]):
            index = j + pJList.shape[0] * i
            xdist = pJList[j,0]-pIList[i,0]
            ydist = pJList[j,1]-pIList[i,1]
            zdist = pJList[j,2]-pIList[i,2]
            xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
            yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
            zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
            dr2[index] = xk**2+yk**2+zk**2
    return dr2
Going back to the main problem while ignoring the symmetry aspect, I tried to further optimize the distance function as:
@njit(cache=True, parallel=False, fastmath=True, boundscheck=False, nogil=True)
def pair_vec_dist_test_v2(pIList, pJList, cellsize):
    _I = pIList.shape[0]
    _J = pJList.shape[0]
    dr2 = np.empty(int(_I*_J), dtype=np.float32)
    inv_cellsize = 1.0 / cellsize
    tile = 32
    for ii in range(0, _I, tile):
        for jj in range(0, _J, tile):
            for i in range(ii, min(_I, ii+tile)):
                for j in range(jj, min(_J, jj+tile)):
                    index = j + _J * i
                    xdist = pJList[j,0]-pIList[i,0]
                    ydist = pJList[j,1]-pIList[i,1]
                    zdist = pJList[j,2]-pIList[i,2]
                    xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
                    yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
                    zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
                    dr2[index] = xk**2+yk**2+zk**2
    return dr2
which essentially tiles the two vector arrays. However, I couldn't get any speedup, as the execution times for both functions are roughly the same. I also thought about working with the transposes of the vector arrays, but I couldn't figure out how to align them in a loop when the vector lengths are not a multiple of the tile length. Does anyone have any further suggestions or ideas on how to proceed?
Edit: Another failed trial
@njit(cache=True, parallel=False, fastmath=True, boundscheck=False, nogil=True)
def pair_vec_dist_test_v3(pIList, pJList, cellsize):
    inv_cellsize = 1.0 / cellsize
    tile = 32
    _I = pIList.shape[0]
    _J = pJList.shape[0]
    vecI = np.empty((_I+2*tile, 3), dtype=np.float64)  # for rolling effect
    vecJ = np.empty((_J+2*tile, 3), dtype=np.float64)  # for rolling effect
    vecI_mask = np.ones((_I+2*tile), dtype=np.uint8)
    vecJ_mask = np.ones((_J+2*tile), dtype=np.uint8)
    vecI[:_I] = pIList
    vecJ[:_J] = pJList
    vecI[_I:] = 0.
    vecJ[_J:] = 0.
    vecI_mask[_I:] = 0
    vecJ_mask[_J:] = 0
    #print(vecI,vecJ)
    ILim = _I+(tile-_I % tile)
    JLim = _J+(tile-_J % tile)
    dr2 = np.empty((ILim*JLim), dtype=np.float64)
    vecI = vecI.T
    vecJ = vecJ.T
    for ii in range(ILim):
        for jj in range(0, JLim, tile):
            index = jj + JLim*ii
            #print(ii,jj,index)
            mask = np.multiply(vecJ_mask[jj:jj+tile], vecI_mask[ii:ii+tile])
            xdist = vecJ[0, jj:jj+tile]-vecI[0, ii:ii+tile]
            ydist = vecJ[1, jj:jj+tile]-vecI[1, ii:ii+tile]
            zdist = vecJ[2, jj:jj+tile]-vecI[2, ii:ii+tile]
            xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
            yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
            zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
            arr = xk**2+yk**2+zk**2
            dr2[index:index+tile] = np.multiply(arr, mask)
    return dr2
First things first: there are race conditions in your current code. This basically means the produced results can be corrupted (and it also impacts performance). In practice, it causes undefined behaviour. For example, k[n] is read by multiple threads in get_dr2_vec2. One needs to be very careful when using prange. In this case, the race condition can be removed by simply not using a temporary array, which is not really useful anyway, and by not using prange in the inner loop, since dr2[m] is updated there (updating it from multiple threads also causes a race condition).
Moreover, prange is generally not useful in practice when parallel=True is not set in the Numba decorator. Indeed, the current functions are not parallel since this flag is missing.
Finally, you can merge the functions pair_vec_dist and get_dr2_vec2 and their internal loops so as to avoid creating and filling large temporary arrays. Indeed, RAM throughput is pretty small nowadays compared to the computing power of modern processors, and this gap has been getting bigger over the last two decades. The effect is called the "memory wall" and it is not expected to disappear any time soon. Code that is less memory-bound generally tends to be faster and scale better.
Here is the resulting code:
@njit(cache=True, parallel=True)
def pair_vec_dist(pIList=np.array([[]]), pJList=np.array([[]]), cellsize=np.array([])):
    assert pIList.shape[1] == pJList.shape[1]
    dr2 = np.zeros(pIList.shape[0]*pJList.shape[0], dtype=np.float64)
    inv_cellsize = 1.0 / cellsize
    for i in numba.prange(pIList.shape[0]):
        for j in range(pJList.shape[0]):
            offset = j + pJList.shape[0] * i
            for k in range(pIList.shape[1]):
                tmp = pJList[j,k]-pIList[i,k]
                tmp = tmp - cellsize[k]*np.rint(tmp*inv_cellsize[k])
                dr2[offset] += tmp**2
    return dr2
It is 11 times faster with frames=50 and N=1000 on my 6-core machine (i5-9600KF).
The code can be optimized further. For example, dr2 is a flattened symmetric square matrix, so only the upper-right part needs to be computed and the bottom-left part can just be copied. Note that to do that efficiently in parallel, the work needs to be balanced between the threads (otherwise the slowest one will be the bottleneck). One can also generate an optimized version of the function that only supports cellsize.size == 3. Moreover, one can use register tiling to make the code more cache-friendly. Finally, one can transpose the input so the layout is more SIMD-friendly (this certainly requires the loop to be manually unrolled and the register tiling optimization to be done first).
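As a minimal sketch of that triangular idea (assuming a single point list compared with itself, and ignoring the load-balancing caveat above, since a plain prange over rows leaves the last rows with little work):

@njit(cache=True, parallel=True)
def pair_vec_dist_sym(pList, cellsize):
    n = pList.shape[0]
    dr2 = np.zeros((n, n), dtype=np.float64)
    inv_cellsize = 1.0 / cellsize
    for i in numba.prange(n):
        for j in range(i + 1, n):      # compute the upper triangle only
            s = 0.0
            for k in range(3):
                d = pList[j, k] - pList[i, k]
                d -= cellsize[k] * np.rint(d * inv_cellsize[k])
                s += d * d
            dr2[i, j] = s
            dr2[j, i] = s              # mirror into the lower triangle
    return dr2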
I'm looking to calculate the similarity between two routes, where a route is defined as an origin-destination pair. I've looked around and couldn't find a similarity measure which takes both direction and length into account, so I invented my own (but to be honest, I'd prefer doing something standard): sim = (l1 + l2) / (l1 + l2 + od + dd), where l1 and l2 are the two route lengths, od is the distance between the two origins, and dd is the distance between the two destinations.
Here's the code to calculate similarity score for this metric:
import mpu

def calculate_similarity(o1_lat, o1_long, d1_lat, d1_long, o2_lat, o2_long, d2_lat, d2_long):
    l1 = mpu.haversine_distance((o1_lat, o1_long), (d1_lat, d1_long))
    l2 = mpu.haversine_distance((o2_lat, o2_long), (d2_lat, d2_long))
    od = mpu.haversine_distance((o1_lat, o1_long), (o2_lat, o2_long))
    dd = mpu.haversine_distance((d1_lat, d1_long), (d2_lat, d2_long))
    sim = (l1+l2)/(l1+l2+od+dd)
    return sim
My main problem now is that this is too slow - I have 200k origin-destination pairs that all need to be compared to each other, and on my current hardware that will take 45 days.
Does anyone know of a different approach that I could use?
import mpu
import time
from itertools import combinations
import numpy as np
from numpy import sin, cos, sqrt, arctan

# initial function
def calculate_similarity(o1_lat, o1_long, d1_lat, d1_long, o2_lat, o2_long, d2_lat, d2_long):
    l1 = mpu.haversine_distance((o1_lat, o1_long), (d1_lat, d1_long))
    l2 = mpu.haversine_distance((o2_lat, o2_long), (d2_lat, d2_long))
    od = mpu.haversine_distance((o1_lat, o1_long), (o2_lat, o2_long))
    dd = mpu.haversine_distance((d1_lat, d1_long), (d2_lat, d2_long))
    sim = (l1+l2)/(l1+l2+od+dd)
    return sim

# get n random origin, destination coordinates
def get_random_coordinates(n_coordinates):
    MIN_LATITUDE = -90
    MAX_LATITUDE = 90
    MIN_LONGITUDE = -180
    MAX_LONGITUDE = 180
    coordinates = []
    for i in range(n_coordinates):
        o_lat = np.random.uniform(MIN_LATITUDE, MAX_LATITUDE)
        o_long = np.random.uniform(MIN_LONGITUDE, MAX_LONGITUDE)
        d_lat = np.random.uniform(MIN_LATITUDE, MAX_LATITUDE)
        d_long = np.random.uniform(MIN_LONGITUDE, MAX_LONGITUDE)
        coordinates.append((o_lat, o_long, d_lat, d_long))
    return coordinates

# compute similarity matrix (iterative approach)
def calculate_similarity_matrix(coordinates, similarity_function):
    n = len(coordinates)
    similarity_matrix = np.diag(np.ones(n))
    for i, j in combinations(range(n), 2):
        o1_lat, o1_long, d1_lat, d1_long = coordinates[i]
        o2_lat, o2_long, d2_lat, d2_long = coordinates[j]
        similarity = similarity_function(
            o1_lat, o1_long, d1_lat, d1_long, o2_lat, o2_long, d2_lat, d2_long
        )
        similarity_matrix[i, j] = similarity
        similarity_matrix[j, i] = similarity
    return similarity_matrix

# faster approach
def csm(coordinates):
    n = len(coordinates)
    rc = np.radians(coordinates)
    c = np.array(list(combinations(np.arange(n), 2)))
    a = sin((rc[:,2]-rc[:,0])/2)**2+cos(rc[:,0])*cos(rc[:,2])*sin((rc[:,3]-rc[:,1])/2)**2
    d = arctan(sqrt(a/(1-a)))
    l1_p_l2 = d[c[:,0]]+d[c[:,1]]
    aod = sin((rc[c[:,1],0]-rc[c[:,0],0])/2)**2+cos(rc[c[:,0],0])*cos(rc[c[:,1],0])*sin((rc[c[:,1],1]-rc[c[:,0],1])/2)**2
    add = sin((rc[c[:,1],2]-rc[c[:,0],2])/2)**2+cos(rc[c[:,0],2])*cos(rc[c[:,1],2])*sin((rc[c[:,1],3]-rc[c[:,0],3])/2)**2
    tri = np.diag(np.full(n, 1/2))
    tri[np.triu_indices(n, 1)] = l1_p_l2/(l1_p_l2+arctan(sqrt(aod/(1-aod)))+arctan(sqrt(add/(1-add))))
    return tri + tri.T

# simulate coordinates
n_coordinates = 1000
np.random.seed(69)
coordinates = get_random_coordinates(n_coordinates)

# number of routes
target = 200000

# time provided function
start = time.time()
similarity_matrix = calculate_similarity_matrix(coordinates, calculate_similarity)
end = time.time()
runtime = end - start

# time faster approach
start2 = time.time()
similarity_matrix2 = csm(coordinates)
end2 = time.time()
runtime2 = end2 - start2

# ensure correctness
assert np.max(np.abs(similarity_matrix - similarity_matrix2)) < 1e-8

# calculate estimated speedup & runtime
speedup = runtime / runtime2
n_sim = n_coordinates * (n_coordinates - 1) / 2
n_target = target * (target - 1) / 2
ratio = n_target / n_sim
runtime = runtime2 * ratio / 3600
print(f'Estimated speedup: {speedup:.2f}x')
print(f'Estimated runtime: {runtime:.2f} hours')
prints
Estimated speedup: 31.31x
Estimated runtime: 4.70 hours
This code employs a wide range of tricks to speed up the runtime; above all, it vectorizes the computation of the similarity measures using NumPy. Some of those tricks are:
Avoid unnecessary recomputation of the route distances: if calculate_similarity were called on each pair of routes out of a total of n routes, then each route distance would be computed n-1 times. By precomputing the distances only once, one saves n-2 distance computations per route.
Avoid unnecessarily converting the distances to kilometers by removing the multiplication with the earth's diameter. This works due to the way SimScore is defined, i.e. the fact that multiplying each additive component l1, l2, od, dd by a constant does not affect the ratio (l1 + l2) / (l1 + l2 + od + dd).
Avoid unnecessary conversions of latitude and longitude degrees to radians by computing them only once, up front.
Avoid performing checks for coordinate validity. Note that this requires all coordinates to be within the valid range.
Simplifying the math to save on expensive operations, e.g. rewriting atan2(sqrt(a), sqrt(1 - a)) as atan(sqrt(a / (1 - a))) (saving square-root operations) or sin(x) * sin(x) as sin(x) ** 2 (saving sine operations).
To estimate the speedup and runtime, n=1000 random routes were simulated and both approaches were timed. To verify correctness, the code asserts that the infinity norm of the difference between the two similarity matrices is essentially zero. Note that these estimates should only serve as rough proxies: the chosen timing method is not very robust, things will likely look a bit different for larger n (the estimated runtime is simply a linear extrapolation), and these numbers were achieved on a GPU via Google Colab, which additionally affects the estimation bias.
I believe there are still optimizations that could be made. In particular, if highly accurate similarity measures are not required for the use case at hand, and the routes (i.e. l1, l2) and/or the distances between start and end points (i.e. od, dd) are small enough, then a cheaper-to-compute distance metric may approximate the Haversine distance well enough, and could thus be employed where applicable.
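For instance, a hedged sketch of one such cheaper metric, an equirectangular approximation on the unit sphere (reasonable for short arcs, poor for very long routes or near the poles):

import numpy as np

def equirectangular_dist(lat1, lon1, lat2, lon2):
    """Approximate unit-sphere distance; inputs in radians.
    As with the trick above, the earth radius can be dropped here
    because SimScore is a ratio of distances."""
    x = (lon2 - lon1) * np.cos(0.5 * (lat1 + lat2))
    y = lat2 - lat1
    return np.hypot(x, y)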
However, as 200000 * (200000 - 1) / 2 = 19'999'900'000 similarity measures are to be computed, this will inevitably take a while.
I have a problem with optimizing the rejection method for generating continuous random variables. I've got the density f(x) = 3/2 (1 - x^2). Here's my code:
import random
import matplotlib.pyplot as plt
import numpy as np
import time
import scipy.stats as ss

a = 0    # xmin
b = 1    # xmax
m = 3/2  # ymax
variables = []  # list for variables

def f(x):
    return 3/2 * (1 - x**2)  # probability density function

reject = 0  # number of rejections
start = time.time()
while len(variables) < 100000:  # I want to generate 100 000 variables
    u1 = random.uniform(a, b)
    u2 = random.uniform(0, m)
    if u2 <= f(u1):
        variables.append(u1)
    else:
        reject += 1
end = time.time()

print("Time: ", end-start)
print("Rejection: ", reject)

x = np.linspace(a, b, 1000)
plt.hist(variables, 50, density=1)
plt.plot(x, f(x))
plt.show()

ss.probplot(variables, plot=plt)
plt.show()
My first question: Is my probability plot made properly?
And the second is what's in the title: how to optimize this method? I would like some advice on optimizing the code. Right now the code takes about 0.5 seconds and there are about 50 000 rejections. Is it possible to reduce the time and the number of rejections? If needed, I can switch to a different method of generating the variables.
My first question: Is my probability plot made properly?
No. It is made against the default normal distribution. You have to pack your function f(x) into a class derived from stats.rv_continuous, make it the _pdf method, and pass it to probplot.
And the second is what's in the title: how to optimise that method? Is it possible to reduce the time and number of rejections?
Sure, you have the power of NumPy vector abilities at your hands. Don't ever write explicit loops - vectorize, vectorize and vectorize!
Look at the modified code below: not a single loop, everything is done via NumPy vectors. On my computer (Xeon, Win10 x64, Anaconda Python 3.7) the time for 100000 samples went down from 0.19 s to 0.003 s.
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
import time

a = 0.        # xmin
b = 1.        # xmax
m = 3.0/2.0   # ymax

def f(x):
    return 1.5 * (1.0 - x*x)  # probability density function

start = time.time()

N = 100000
u1 = np.random.uniform(a, b, N)
u2 = np.random.uniform(0.0, m, N)

negs = np.empty(N)
negs.fill(-1)
variables = np.where(u2 <= f(u1), u1, negs)  # accepted samples are positive or 0, rejected are -1

end = time.time()

accept = np.extract(variables >= 0.0, variables)
reject = N - len(accept)

print("Time: ", end-start)
print("Rejection: ", reject)

x = np.linspace(a, b, 1000)
plt.hist(accept, 50, density=True)
plt.plot(x, f(x))
plt.show()

ss.probplot(accept, plot=plt)  # against normal distribution
plt.show()
Concerning reducing the number of rejections, you could sample with zero rejections by using the inverse-transform method; the CDF leads to a cubic equation, so it can be solved in closed form fairly easily.
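As a minimal sketch of that inverse-transform route: the CDF here is F(x) = (3x - x^3)/2 on [0, 1], so F(x) = u is a depressed cubic whose root in [0, 1] has a closed trigonometric form:

import numpy as np

N = 100000
u = np.random.uniform(0.0, 1.0, N)
# root of x**3 - 3*x + 2*u = 0 lying in [0, 1] (trigonometric cubic formula)
x = 2.0 * np.cos(np.arccos(-u) / 3.0 - 2.0 * np.pi / 3.0)

This draws the required samples with no rejections at all, at the cost of an arccos and a cos per sample.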
UPDATE
Here is the code to use for probplot:
class my_pdf(ss.rv_continuous):
    def _pdf(self, x):
        return 1.5 * (1.0 - x*x)

ss.probplot(accept, dist=my_pdf(a=a, b=b, name='my_pdf'), plot=plt)
and you should get a probability plot made against your custom distribution rather than the normal one.
Regarding your first question, scipy.stats.probplot compares your sample against the quantiles of the normal distribution. If you'd like it to compare against the quantiles of your f(x) distribution, check out the dist parameter of probplot.
In terms of making this sampling procedure faster, avoiding loops is generally the way to go. Replacing the code between start = ... and end = ... with the following resulted in a >20x speedup for me.
n_before_accept_reject = 150000
u1 = np.random.uniform(a, b, size=n_before_accept_reject)
u2 = np.random.uniform(0, m, size=n_before_accept_reject)
variables = u1[u2 <= f(u1)]
reject = n_before_accept_reject - len(variables)
Note that this will give you approximately 100000 accepted samples each time you run it. You could raise the value of n_before_accept_reject slightly to effectively guarantee that variables will always have >100000 accepted values, and then just cap the size of variables to return exactly 100000 if necessary.
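For example, a small sketch of that top-up-and-cap idea (reusing a, b, m, f and n_before_accept_reject from the snippet above):

n_target = 100000
samples = np.empty(0)
while len(samples) < n_target:
    u1 = np.random.uniform(a, b, size=n_before_accept_reject)
    u2 = np.random.uniform(0, m, size=n_before_accept_reject)
    samples = np.concatenate((samples, u1[u2 <= f(u1)]))
variables = samples[:n_target]  # cap to exactly 100000 accepted values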
Others have spoken to the probability plotting, I'm going to address the efficiency of the rejection algorithm.
Acceptance/rejection schemes are based on m(x), a "majorizing function". A majorizing function should have two properties: 1) m(x)≥ f(x) ∀ x; and 2) m(x), when scaled to be a distribution, should be easy to generate values from.
You went with the constant function m = 3/2, which meets both requirements but does not bound f(x) very closely. Integrated from zero to one, it has an area of 3/2. Your f(x), being a valid density function, has an area of 1. Consequently, ∫f(x) dx / ∫m(x) dx = 1 / (3/2) = 2/3. In other words, 2/3 of the values you generate from the majorizing function are accepted, and you are rejecting 1/3 of the attempts.
You need an m(x) which provides a tighter bound for f(x). I went with a line which is tangent to f(x) at x = 1/2. With a little bit of calculus to get the slope, I derived m(x) = 15/8 - 3x/2.
This choice of m(x) has an area of 9/8, so only 1/9 of the values will be rejected. A bit more calculus yields the inverse-transform generator for x's based on this m(x): x = (5 - sqrt(25 - 24U)) / 4, where U is a uniform(0,1) random variable.
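For completeness, that generator follows from normalizing m(x): the majorizing density is g(x) = (15/8 - 3x/2) / (9/8) = 5/3 - 4x/3, its CDF is G(x) = 5x/3 - 2x^2/3, and setting G(x) = U gives the quadratic 2x^2 - 5x + 3U = 0, whose root in [0, 1] is x = (5 - sqrt(25 - 24U)) / 4.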
Here's an implementation, based off your original version. I wrapped the rejection scheme in a function, and created the values with a list comprehension rather than appending to a list. As you'll see if you run this, it produces a lot fewer rejections than your original version.
import random
import matplotlib.pyplot as plt
import numpy as np
import time
import math
import scipy.stats as ss

a = 0  # xmin
b = 1  # xmax

reject = 0  # number of rejections

def f(x):
    return 3.0 / 2.0 * (1.0 - x**2)  # probability density function

def m(x):
    return 1.875 - 1.5 * x

def generate_x():
    global reject
    while True:
        x = (5.0 - math.sqrt(25.0 - random.uniform(0.0, 24.0))) / 4.0
        u = random.uniform(0, m(x))
        if u <= f(x):
            return x
        reject += 1

start = time.time()
variables = [generate_x() for _ in range(100000)]
end = time.time()

print("Time: ", end-start)
print("Rejection: ", reject)

x = np.linspace(a, b, 1000)
plt.hist(variables, 50, density=1)
plt.plot(x, f(x))
plt.show()
I am trying to perform a Monte Carlo simulation in order to calculate the uncertainty in the electricity costs of a heat pump system. I have several input parameters (COPs, electricity costs) which follow a triangular probability distribution. The total electricity costs are the sum of the calculated costs of the three subcomponents (heat pump and pumps) and follow an (approximately) normal probability distribution.
I was wondering if I am performing the MC simulation correctly. Since I have to loop this MC simulation over 70 different heat pump systems, I am also wondering if there is a faster method.
As I am an absolute greenhorn in coding, please excuse my messy code.
I am thankful for any help!
My code:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import triangular

N = 1_000_000

def energy_output(coef_performance, energy_input):
    return energy_input * coef_performance / (coef_performance - 1)

COP_DISTRIBUTION_PARAM = dict(left=4, mode=4.5, right=5)

def seed_cop():
    return triangular(**COP_DISTRIBUTION_PARAM)

INPUT_ENERGY_HEATING = 866
INPUT_ENERGY_COOLING = 912

def random_energy_output():
    return energy_output(seed_cop(), energy_input=INPUT_ENERGY_HEATING)

energy_outputs = [random_energy_output() for _ in range(N)]
a = min(energy_outputs)
b = max(energy_outputs)
med = np.median(energy_outputs)

############################
def elec_costs_heatpump(elec_costs, coef_performance, energy_output):
    return energy_output * 1000 / coef_performance * elec_costs

ELEC_DISTRIBUTION_PARAM = dict(left=0.14, mode=0.15, right=0.16)

def seed_elec():
    return triangular(**ELEC_DISTRIBUTION_PARAM)

HP_OUTPUT_DISTRIBUTION_PARAM = dict(left=a, mode=med, right=b)

def seed_output():
    return triangular(**HP_OUTPUT_DISTRIBUTION_PARAM)

def random_elec_costs_heatpump():
    return elec_costs_heatpump(seed_elec(), seed_cop(), seed_output())

elec_costs_heatpump = [random_elec_costs_heatpump() for _ in range(N)]
mean_hp = np.mean(elec_costs_heatpump)
std_hp = np.std(elec_costs_heatpump)

############################
def elec_costs_coldpump(elec_costs, coef_performance_pump, energy_input):
    return energy_input * 1000 / coef_performance_pump * elec_costs

COP_PUMP_DISTRIBUTION_PARAM = dict(left=35, mode=40, right=45)

def seed_cop_pump():
    return triangular(**COP_PUMP_DISTRIBUTION_PARAM)

def random_elec_costs_coldpump():
    return elec_costs_coldpump(seed_elec(), seed_cop_pump(), energy_input=INPUT_ENERGY_COOLING)

elec_costs_coldpump = [random_elec_costs_coldpump() for _ in range(N)]
mean_cp = np.mean(elec_costs_coldpump)
sdt_cp = np.std(elec_costs_coldpump)

#########################
def elec_costs_warmpump(elec_costs, coef_performance_pump, energy_input):
    return energy_input * 1000 / coef_performance_pump * elec_costs

def random_elec_costs_warmpump():
    return elec_costs_warmpump(seed_elec(), seed_cop_pump(), energy_input=INPUT_ENERGY_HEATING)

elec_costs_warmpump = [random_elec_costs_warmpump() for _ in range(N)]
mean_wp = np.mean(elec_costs_warmpump)
sdt_wp = np.std(elec_costs_warmpump)

#########################
def total_costs(costs_heatpump, costs_coldpump, costs_warmpump):
    return costs_heatpump + costs_coldpump + costs_warmpump

ELEC_COSTS_HEATPUMP_PARAM = dict(loc=mean_hp, scale=std_hp)

def seed_costs_hp():
    return np.random.normal(**ELEC_COSTS_HEATPUMP_PARAM)

ELEC_COSTS_COLDPUMP_PARAM = dict(loc=mean_cp, scale=sdt_cp)

def seed_costs_cp():
    return np.random.normal(**ELEC_COSTS_COLDPUMP_PARAM)

ELEC_COSTS_WARMPUMP_PARAM = dict(loc=mean_wp, scale=sdt_wp)

def seed_cost_wp():
    return np.random.normal(**ELEC_COSTS_WARMPUMP_PARAM)

def random_total_costs():
    return seed_costs_hp(), seed_costs_cp(), seed_cost_wp()

total_costs = [random_total_costs() for _ in range(N)]
print(total_costs)
#Plot = plt.hist(total_costs, bins=75, density=True)
Great that you have a prototype for your code!
Some impressions on code structure and readability:
the quickest improvement is separating the functions from the scripting part; this allows you to split your algorithm into simple testable blocks and keep control of the calculations in one place, followed by plotting
some repeated code can go into functions of its own
it pays off to stick to a widely accepted naming convention (PEP8); this way people are less astonished by the style and can devote more attention to the code's substance. Specifically, you normally name functions lowercase with underscores, def do_something():, and UPPERCASE is reserved for constants.
I need to be able to run your code to check for speedups; see the comments above about what is lacking.
Update, on the more complete code in the question, some additional comments:
let functions be functions: do not pass the function arguments as globals
consider separating your model structure (your equations) from the parameters (fixed inputs and seed values); it pays off in the longer run
avoid 'magic numbers': put them into constants (866 is an example) or pass them explicitly as args.
Here is a refactoring to consider:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import triangular

N = 1_000_000

#def EnergyHP():
#    COP_Hp = triangular(4.0, 4.5, 5)   # COP of heatpump
#    Ein = 866                          # Energy input heatpump
#    return Ein * COP_Hp / (COP_Hp-1)   # Calculation of Energy output of heatpump
#
#Eout = np.zeros(N)
#for i in range(N):
#    Eout[i] = EnergyHP()
#minval = np.min(Eout[np.nonzero(Eout)])
#maxval = np.max(Eout[np.nonzero(Eout)])
#Medi = np.median(Eout, axis=0)

def energy_output(coef_performance, energy_input):
    """Return energy output of heatpump given efficiency and input.

    Args:
        coef_performance - <description, units of measurement>
        energy_input - <description, units of measurement>
    """
    return energy_input * coef_performance / (coef_performance - 1)

# you will use this variable again, so we put it into a function to recycle
COP_DISTRIBUTION_PARAM = dict(left=4, mode=4.5, right=5)

def seed_cop():
    """Get random value for coef of performance."""
    return triangular(**COP_DISTRIBUTION_PARAM)

# rename it if it is a constant, pass it as an argument if it is not
SOME_MEANINGFUL_CONSTANT = 866

def random_energy_output():
    return energy_output(seed_cop(), energy_input=SOME_MEANINGFUL_CONSTANT)

# pure python list and metrics
energy_outputs = [random_energy_output() for _ in range(N)]
a = min(energy_outputs)
b = max(energy_outputs)
med = np.median(energy_outputs)

# the above does not use numpy, but you can convert it to a vector
energy_outputs_np = np.array(energy_outputs)

# or you can construct the np array directly, this is a very readable way
# make a vector of random arguments
cop_seeded = triangular(**COP_DISTRIBUTION_PARAM, size=N)
# apply function
energy_outputs_np_2 = energy_output(cop_seeded, SOME_MEANINGFUL_CONSTANT)
The next pressing intent is writing a HeatPump class, but I suggest sticking to functions as long as you can - that usually makes our thinking about a class's state and methods better.
It also appears that the seeded values for the parameters may not be independent; in such an experiment you can draw them from a joint distribution.
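As a rough sketch of what a fully vectorized run could look like (a simplified reading of the model, reusing the constants from the question; reusing one electricity-price draw per scenario is one simple way of making the cost components jointly distributed rather than independent):

import numpy as np

N = 1_000_000

# one draw per scenario for each triangular input
elec_price = np.random.triangular(0.14, 0.15, 0.16, size=N)
cop_hp     = np.random.triangular(4.0, 4.5, 5.0, size=N)
cop_pump   = np.random.triangular(35.0, 40.0, 45.0, size=N)

E_HEAT, E_COOL = 866.0, 912.0   # energy inputs, as in the question

# cost components computed as whole arrays, no Python-level loop
costs_heatpump = E_HEAT * 1000.0 / cop_hp   * elec_price
costs_coldpump = E_COOL * 1000.0 / cop_pump * elec_price
costs_warmpump = E_HEAT * 1000.0 / cop_pump * elec_price

total = costs_heatpump + costs_coldpump + costs_warmpump
print(total.mean(), total.std())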
I have the following piece of code, which evaluates a numerical expression and uses it to integrate over a certain range of values. The current piece of code runs in about 8.6 s, but I am just using mock values; my actual arrays are much larger. In particular, the actual size of freq_c is (3800, 101) and the size of number_bin is (3800, 100), which makes the following code really inefficient, as the total execution time will be close to 9 minutes for the actual arrays. One part of the code that is quite slow is the evaluation of k_one_third and k_two_third, for which I have also tried numexpr.evaluate(".."), which speeds up the code by about 10-20%. However, I have avoided numexpr below so that anyone can run it without having to install the package. Are there any more ways to improve the speed of this code? An improvement of a few factors would be good enough. Please note that the for loop is almost unavoidable due to memory issues: the arrays are really large, so I manipulate one axis at a time through the loop. I also wonder if numba jit optimization is possible here.
import numpy as np
import scipy
from scipy.integrate import simps
import time

def k_one_third(x):
    return (2.*np.exp(-x**2)/x**(1/3) + 4./x**(1/6)*np.exp(-x)/(1+x**(1/3)))**2

def k_two_third(x):
    return (np.exp(-x**2)/x**(2/3) + 2.*x**(5/2)*np.exp(-x)/(6.+x**3))**2

def spectrum(freq_c, number_bin, frequency, gamma, theta):
    theta_gamma_factor = np.einsum('i,j->ij', theta**2, gamma**2)
    theta_gamma_factor += 1.
    t_g_bessel_factor = 1.-1./theta_gamma_factor
    number = np.concatenate((number_bin, np.zeros((number_bin.shape[0], 1), dtype=number_bin.dtype)), axis=1)
    number_theta_gamma = np.einsum('jk, ik->ijk', theta_gamma_factor**2*1./gamma**3, number)
    final = np.zeros((np.size(freq_c[:,0]), np.size(theta), np.size(frequency)))
    for i in range(np.size(frequency)):
        b_n_omega_theta_gamma = frequency[i]**2*number_theta_gamma
        eta = theta_gamma_factor**(1.5)*frequency[i]/2.
        eta = np.einsum('jk, ik->ijk', eta, 1./freq_c)
        bessel_eta = np.einsum('jl, ijl->ijl', t_g_bessel_factor, k_one_third(eta))
        bessel_eta += k_two_third(eta)
        eta = None
        integrand = np.multiply(bessel_eta, b_n_omega_theta_gamma, out=bessel_eta)
        final[:,:,i] = simps(integrand, gamma)
        integrand = None
    return final

frequency = np.linspace(1, 100, 100)
theta = np.linspace(1, 3, 100)
gamma = np.linspace(2, 200, 101)
freq_c = np.random.randint(1, 200, size=(50, 101))
number_bin = np.random.randint(1, 100, size=(50, 100))

time1 = time.time()
spectra = spectrum(freq_c, number_bin, frequency, gamma, theta)
print(time.time()-time1)
I profiled the code and found that k_one_third() and k_two_third() are slow. There are some duplicated calculations in the two functions.
By merging the two functions into one and decorating it with @numba.jit(parallel=True), I got a 4x speedup.
import numpy as np
from numba import jit

@jit(parallel=True)
def k_one_two_third(x):
    x0 = x ** (1/3)
    x1 = np.exp(-x ** 2)
    x2 = np.exp(-x)
    one = (2*x1/x0 + 4*x2/(x**(1/6)*(x0 + 1)))**2
    two = (2*x**(5/2)*x2/(x**3 + 6) + x1/x**(2/3))**2
    return one, two
As said in the comments, large parts of the code should be rewritten to get the best performance.
I have only modified the Simpson integration and adapted @HYRY's answer a bit. This speeds up the calculation from 26.15 s to 1.76 s (15x) with the test data you provided. By replacing the np.einsums with simple loops this should end up at less than a second (about 0.4 s from the improved integration, 24 s from k_one_two_third(x)).
For getting performance with Numba, read the Numba performance tips. The latest Numba version (0.39), the Intel SVML package and things like fastmath=True make quite a big impact on your example.
Code
import numpy as np
import numba as nb

# a bit faster than HYRY's version
@nb.njit(parallel=True, fastmath=True, error_model='numpy')
def k_one_two_third(x):
    one = np.empty(x.shape, dtype=x.dtype)
    two = np.empty(x.shape, dtype=x.dtype)
    for i in nb.prange(x.shape[0]):
        for j in range(x.shape[1]):
            for k in range(x.shape[2]):
                x0 = x[i,j,k] ** (1/3)
                x1 = np.exp(-x[i,j,k] ** 2)
                x2 = np.exp(-x[i,j,k])
                one[i,j,k] = (2*x1/x0 + 4*x2/(x[i,j,k]**(1/6)*(x0 + 1)))**2
                two[i,j,k] = (2*x[i,j,k]**(5/2)*x2/(x[i,j,k]**3 + 6) + x1/x[i,j,k]**(2/3))**2
    return one, two

# improved integration
@nb.njit(fastmath=True)
def simpson_nb(y_in, dx):
    s = y_in[0]+y_in[-1]
    n = y_in.shape[0]//2
    for i in range(n-1):
        s += 4.*y_in[i*2+1]
        s += 2.*y_in[i*2+2]
    s += 4*y_in[(n-1)*2+1]
    return (dx/3.)*s

@nb.jit(fastmath=True)
def spectrum(freq_c, number_bin, frequency, gamma, theta):
    theta_gamma_factor = np.einsum('i,j->ij', theta**2, gamma**2)
    theta_gamma_factor += 1.
    t_g_bessel_factor = 1.-1./theta_gamma_factor
    number = np.concatenate((number_bin, np.zeros((number_bin.shape[0], 1), dtype=number_bin.dtype)), axis=1)
    number_theta_gamma = np.einsum('jk, ik->ijk', theta_gamma_factor**2*1./gamma**3, number)
    final = np.empty((np.size(frequency), np.size(freq_c[:,0]), np.size(theta)))
    # assume that dx is constant over the integration
    # speed improvement over scipy.simps is about 4x
    # the numba version vs. scipy.simps(y,x) is about 60x
    dx = gamma[1]-gamma[0]
    for i in range(np.size(frequency)):
        b_n_omega_theta_gamma = frequency[i]**2*number_theta_gamma
        eta = theta_gamma_factor**(1.5)*frequency[i]/2.
        eta = np.einsum('jk, ik->ijk', eta, 1./freq_c)
        one, two = k_one_two_third(eta)
        bessel_eta = np.einsum('jl, ijl->ijl', t_g_bessel_factor, one)
        bessel_eta += two
        integrand = np.multiply(bessel_eta, b_n_omega_theta_gamma, out=bessel_eta)
        # reorder array
        for j in range(integrand.shape[0]):
            for k in range(integrand.shape[1]):
                final[i,j,k] = simpson_nb(integrand[j,k,:], dx)
    return final