Vectorizing calculation of values using numpy which requires previously calculated value

Vectorizing calculation of values using numpy which requires previously calculated value - python

I'm trying to calculate a particular formula for EMA from Investopedia which looks like
EmaToday = (ValueToday * (Smoothing / (1 + Days)))
+ (EmaYesterday * (1 - (Smoothing / (1 + Days))))
We can simplify this to:
Smoothing and Days are constants.
Let's call Smoothing / (1 + Days) 'M'.
The simplified equation becomes:
EmaToday = ((ValueToday - EmaYesterday) * M) + EmaYesterday
We can do this in traditional python using loops as follows:
# Pre-allocate a 1-D array with one slot per computed EMA value.
# (A 2-D shape such as (1, n) would make emaTodayArray[i] select a row and
# fail with an IndexError as soon as i reaches 1.)
emaTodayArray = np.zeros(valueTodayArray.size - Days, dtype=np.float32)
ema = emaYesterday
# Each step needs the EMA produced by the previous step, so the recurrence
# is evaluated strictly in order.
for i, valueToday in enumerate(np.nditer(valueList)):
    ema = ((valueToday - ema) * M) + ema
    emaTodayArray[i] = ema
emaTodayArray holds all the computed EMA values.
I'm having a hard time trying to figure out how to vectorize this completely as the emaYesterday value is needed for every new calculation.
If a full vectorization using numpy is possible first of all, I'd really appreciate it if someone can show me the way.

Note: I had to fill in a few dummies to make your code run, pls check whether they are ok.
The loop can be vectorized by transforming ema[i] ~> ema'[i] = ema[i] x (1-M)^-i after which it becomes just a cumsum.
This is implemented below as ema_pp_naive.
The problem with this method is that for medium sized i (~10^3) the (1-M)^-i term may overflow rendering the result useless.
We can circumvent this problem by going to log space (using np.logaddexp for the summation). This ema_pp_safe is quite a bit more expensive than the naive method but still >10x faster than the original loop. In my quick and dirty testing this gave correct results for a million terms and beyond.
Code:
import time

import numpy as np

# Dummy inputs so that the comparison runs stand-alone.
K = 1000
Days = 0
emaYesterday = np.random.random()
valueTodayArray = np.random.random(K)
M = np.random.random()
valueList = valueTodayArray

# T collects perf_counter() stamps; consecutive differences are the timings
# of the three methods, in the order: loop, naive, safe.
T = []
T.append(time.perf_counter())

# --- Method 0: the original Python loop (used as the reference result) ---
emaTodayArray = np.zeros((valueTodayArray.size - Days), dtype=np.float32)
ema = emaYesterday
for i, valueToday in enumerate(np.nditer(valueList)):
    ema = ((valueToday - ema) * M) + ema
    emaTodayArray[i] = ema
T.append(time.perf_counter())

# --- Method 1: naive vectorisation ---
# scaling[i] == (1 - M) ** -(i + 1).  Multiplying term i by scaling[i] turns
# the recurrence into a plain cumulative sum; dividing afterwards undoes it.
# scaling grows geometrically, so it can overflow for large K.
scaling = np.broadcast_to(1/(1-M), valueTodayArray.size+1).cumprod()
ema_pp_naive = ((np.concatenate([[emaYesterday], valueTodayArray * M]) * scaling).cumsum() / scaling)[1:]
T.append(time.perf_counter())

# --- Method 2: the same idea in log space ---
# logscaling[i] == i * log(1 - M); logaddexp.accumulate is the log-space
# equivalent of cumsum, so no intermediate value has to hold (1 - M) ** -i.
logscaling = np.log(1-M)*np.arange(valueTodayArray.size+1)
log_ema_pp = np.logaddexp.accumulate(np.log(np.concatenate([[emaYesterday], valueTodayArray * M])) - logscaling) + logscaling
ema_pp_safe = np.exp(log_ema_pp[1:])
T.append(time.perf_counter())

print(f'K = {K}')
print('naive method correct:', np.allclose(ema_pp_naive, emaTodayArray))
print('safe method correct:', np.allclose(ema_pp_safe, emaTodayArray))
print('OP {:.3f} ms naive {:.3f} ms safe {:.3f} ms'.format(*np.diff(T)*1000))
Sample runs:
K = 100
naive method correct: True
safe method correct: True
OP 0.236 ms naive 0.061 ms safe 0.053 ms
K = 1000
naive method correct: False
safe method correct: True
OP 2.397 ms naive 0.224 ms safe 0.183 ms
K = 1000000
naive method correct: False
safe method correct: True
OP 2145.956 ms naive 18.342 ms safe 108.528 ms

Related

Speeding up vector distance calculation using Numba

Below are some of the functions I wrote for distance (square) calculation in 3-D toroidal geometry for a collection of particles in that 3-D space:
import itertools
import time
import numpy as np
import scipy
import numba
from numba import njit
@njit(cache=True)
def get_dr2(i=np.array([]), j=np.array([]), cellsize=np.array([])):
    """Return the squared periodic (wrapped) distance between points i and j.

    For every axis the raw separation j - i is reduced by cellsize times the
    rounded ratio separation / cellsize, and the squares of the reduced
    components are summed.

    The scratch array has a fixed length of 3, so at most three axes are
    supported.
    """
    k = np.zeros(3, dtype=np.float64)
    dr2 = 0.0
    # No parallel=True on the decorator, so prange runs like a plain range.
    for idx in numba.prange(cellsize.shape[0]):
        k[idx] = (j[idx]-i[idx])-cellsize[idx]*np.rint((j[idx]-i[idx])/cellsize[idx])
        dr2 += k[idx]**2
    return dr2
@numba.guvectorize(
    ["void(float64[:],float64[:],float64[:],float64[:])"],
    "(m),(m),(m)->()",
    nopython=True,
    cache=True,
)
def get_dr2_vec(i, j, cellsize, dr2):
    """Generalised-ufunc kernel: squared periodic distance of one (i, j) pair.

    The layout "(m),(m),(m)->()" means the kernel receives one point from i,
    one point from j and the cell sizes, and writes a single scalar result
    into dr2[0].  The scratch array has a fixed length of 3.
    """
    dr2[:] = 0.0
    k = np.zeros(3, dtype=np.float64)
    for idx in numba.prange(cellsize.shape[0]):
        k[idx] = (j[idx]-i[idx])-cellsize[idx]*np.rint((j[idx]-i[idx])/cellsize[idx])
        dr2[0] += k[idx]**2
@njit(cache=True)
def pair_vec_gen(pIList=np.array([[]]), pJList=np.array([[]])):
    """Expand two point lists into aligned arrays holding every (i, j) pair.

    Returns (vecI, vecJ), both of shape (len(pIList) * len(pJList), dim).
    Row j + len(pJList) * i of vecI is pIList[i] and the same row of vecJ is
    pJList[j].
    """
    assert pIList.shape[1] == pJList.shape[1]
    vecI = np.zeros((pIList.shape[0]*pJList.shape[0], pIList.shape[1]))
    vecJ = np.zeros_like(vecI)
    for i in numba.prange(pIList.shape[0]):
        for j in numba.prange(pJList.shape[0]):
            row = j + pJList.shape[0]*i
            for k in numba.prange(pIList.shape[1]):
                vecI[row][k] = pIList[i][k]
                vecJ[row][k] = pJList[j][k]
    return vecI, vecJ
@njit(cache=True)
def pair_vec_dist(pIList=np.array([[]]), pJList=np.array([[]]), cellsize=np.array([])):
    """Return the squared periodic distance for every (i, j) pair as a flat array.

    The pairs are first materialised into two aligned arrays (row
    j + len(pJList) * i holds pIList[i] and pJList[j]) and the distance
    computation is then delegated to get_dr2_vec2.
    """
    assert pIList.shape[1] == pJList.shape[1]
    vecI = np.zeros((pIList.shape[0]*pJList.shape[0], pIList.shape[1]))
    vecJ = np.zeros_like(vecI)
    for i in numba.prange(pIList.shape[0]):
        for j in numba.prange(pJList.shape[0]):
            row = j + pJList.shape[0]*i
            for k in numba.prange(pIList.shape[1]):
                vecI[row][k] = pIList[i][k]
                vecJ[row][k] = pJList[j][k]
    # The result array is produced by get_dr2_vec2; no pre-allocation needed.
    return get_dr2_vec2(vecI, vecJ, cellsize)
@njit(cache=True)
def get_dr2_vec2(i=np.array([[]]), j=np.array([[]]), cellsize=np.array([])):
    """Return the row-wise squared periodic distance between arrays i and j.

    i and j are indexed as [row, axis]; the result has one entry per row.

    NOTE(review): k is a single scratch array shared by all rows and dr2[m]
    is accumulated inside the inner prange.  That is harmless while the
    decorator does not request parallel=True, but both would be races if it
    did.
    """
    dr2 = np.zeros(i.shape[0], dtype=np.float64)
    k = np.zeros(i.shape[1], dtype=np.float64)
    for m in numba.prange(i.shape[0]):
        for n in numba.prange(i.shape[1]):
            k[n] = (j[m,n]-i[m,n])-cellsize[n]*np.rint((j[m,n]-i[m,n])/cellsize[n])
            dr2[m] += k[n]**2
    return dr2
def pair_dist_calculator_cdist(pIList=np.array([[]]),pJList=np.array([[]]),cellsize=np.array([])):
assert pIList.shape[1] == pJList.shape[1]
r2List = (scipy.spatial.distance.cdist(pIList, pJList, metric=get_dr2_wrapper(cellsize=cellsize))).flatten()
return np.array(r2List).flatten()
def get_dr2_wrapper(cellsize=np.array([])):
return lambda u, v: get_dr2(u,v,cellsize)
frames=50
timedata=np.zeros((5,frames),dtype=np.float64)
N, dim = 100, 3 # 100 particles in 3D
cellsize=np.array([26.4,19.4,102.4])
for i in range(frames):
print("\rIter {}".format(i),end='')
vec = np.random.random((N, dim))
rList1=[];rList2=[];rList3=[];rList4=[];rList5=[]
#method 1
#print("method 1")
start = time.perf_counter()
for (pI, pJ) in itertools.product(vec, vec):
rList1.append(get_dr2(pI,pJ,cellsize))
end =time.perf_counter()
timedata[0,i]=(end-start)
#method 2
#print("method 2")
pIvec=[];pJvec=[];rList2=[]
start = time.perf_counter()
for (pI, pJ) in itertools.product(vec, vec):
pIvec.append(pI)
pJvec.append(pJ)
rList2=get_dr2_vec(np.array(pIvec),np.array(pJvec),cellsize)
end =time.perf_counter()
timedata[1,i]=(end-start)
#method 3
#print("method 3")
start = time.perf_counter()
rList3=get_dr2_vec(*pair_vec_gen(vec,vec),cellsize)
end =time.perf_counter()
timedata[2,i]=(end-start)
#method 4
#print("method 4")
start = time.perf_counter()
rList4=pair_vec_dist(vec,vec,cellsize)
end =time.perf_counter()
timedata[3,i]=(end-start)
#method 5
#print("method 5")
#start = time.perf_counter()
#rList5=pair_dist_calculator_cdist(np.array(pIvec),np.array(pJvec),cellsize)
#end =time.perf_counter()
#timedata[4,i]=(end-start)
assert (rList1 == rList2).all()
assert (rList2 == rList3).all()
assert (rList3 == rList4).all()
#assert rList4 == rList5
print("\n")
for i in range(4):
print("Method {} Average time {:.3g}s \u00B1 {:.3g}s".format(i+1,np.mean(timedata[i,1:]),np.std(timedata[i,1:])))
exit()
The essential idea is that at a particular time you have a snapshot (frame) containing the positions of the particles. To calculate all the distances between the particles, we can use the following approaches:
Calculate distance between points iteratively in pure python; passing the combination of the position of the two particles one by one via Numba.
Create an iteration list (in pure python) beforehand and pass the whole list to a Numba #guvectorize function
Do (2) but all steps in Numba
Integrate all step in (3) to a simple Numba function
(optional) parse the positions to scipy.spatial.distance.cdist with the distance function as the distance metric.
For 50 frames containing 100 particles we have the respective times (frames, N = 50, 100):
Method 1 Average time 0.017s ± 0.00555s
Method 2 Average time 0.0181s ± 0.00573s
Method 3 Average time 0.00182s ± 0.000944s
Method 4 Average time 0.000485s ± 0.000348s
For 50 frames containing 1000 particles we have the respective times (frames, N = 50, 1000):
Method 1 Average time 2.11s ± 0.977s
Method 2 Average time 2.42s ± 0.859s
Method 3 Average time 0.349s ± 0.12s
Method 4 Average time 0.0694s ± 0.022s
and for 1000 frames containing 100 particles we have the respective times (frames, N = 1000, 100):
Method 1 Average time 0.0244s ± 0.0166s
Method 2 Average time 0.0288s ± 0.0254s
Method 3 Average time 0.00258s ± 0.00231s
Method 4 Average time 0.000636s ± 0.00086s
(All the time shown above are after removing the contribution from the first iteration)
Method 5 simply fails due to memory requirements and is much slower in comparison to any other method
Given the above dataset, I tend to prefer Method 4 though I am a bit concerned about the average time increase when I increase frames from 50 to 1000. Is there any further optimizations I can do in these implementations or if someone has ideas for much faster and memory conscious implementations? Any suggestions are welcome.
Update
Based on Jerome's answer the modified function is now:
@njit(cache=True, parallel=True)
def pair_vec_dist(pIList=np.array([[]]), pJList=np.array([[]]), cellsize=np.array([])):
    """Return the squared periodic distance for every (i, j) pair, 3-D only.

    The result is a flat float64 array of length len(pIList) * len(pJList);
    entry j + len(pJList) * i is the distance between pIList[i] and
    pJList[j].  The outer loop is a prange and every iteration writes to its
    own slice of the output, so iterations do not touch shared state.
    """
    assert pIList.shape[1] == pJList.shape[1]
    assert cellsize.size == 3
    n_j = pJList.shape[0]
    dr2 = np.zeros(pIList.shape[0]*n_j, dtype=np.float64)
    # Multiply by the reciprocal inside the loops instead of dividing.
    inv_cellsize = 1.0 / cellsize
    for i in numba.prange(pIList.shape[0]):
        for j in range(n_j):
            offset = j + n_j * i
            xdist = pJList[j,0]-pIList[i,0]
            ydist = pJList[j,1]-pIList[i,1]
            zdist = pJList[j,2]-pIList[i,2]
            xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
            yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
            zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
            dr2[offset] = xk**2+yk**2+zk**2
    return dr2
As Jerome pointed out that a very simple optimization could be running the loops through just the "lower half of the symmetric matrix" the distance calculation creates, though in a realistic situation I might have vector lists as pI and pJ where pI is a subset of pJ, which complicates this situation. Either I have to create two separate functions and control them via a wrapper function or somehow manage that in one single function. If there are any suggestions on how to do so that would be really helpful.
Update 2
I should clarify the problem further. In this code I am trying to calculate the distance between all points in a frame/snapshot, which is then used for pair distance distribution analysis. But in some cases we might want to focus on a subset of coordinates in a frame and calculate the distribution from their perspective. In such a case we select this subset smallVec from the pool of all coordinates vec (such that smallVec + restOfVec = vec) and calculate pair_vec_dist(smallVec, vec) instead of pair_vec_dist(vec, vec). For this calculation one can use np.concatenate([pair_vec_dist(smallVec, smallVec), pair_vec_dist(smallVec, restOfVec)]).
Based on the discussion with Jerome, I modified my function as:
@njit(cache=True, parallel=True)
def pair_vec_dist_cmb(pIList=np.array([[]]), pJList=np.array([[]]), cellsize=np.array([]), is_sq=True, is_nonsq=True):
    """Squared periodic distances for pI-pI (triangular) and pI-pJ (full), 3-D only.

    The returned flat array has two sections:

    * the first N*(N+1)/2 entries (N = len(pIList)) hold the pI-pI distances
      for every pair a <= b, stored at index b*(b+1)/2 + a.  This section is
      always allocated and stays zero when is_sq is False;
    * if is_nonsq is True, the next N*len(pJList) entries hold the pI-pJ
      distances, entry c + len(pJList) * a being the distance between
      pIList[a] and pJList[c].
    """
    assert pIList.shape[1] == pJList.shape[1]
    assert cellsize.size == 3
    n_i = pIList.shape[0]
    n_j = pJList.shape[0]
    # n_i * (n_i + 1) is always even, so integer division is exact.
    dr2_1 = n_i * (n_i + 1) // 2
    dr2_2 = 0
    if is_nonsq:
        dr2_2 = n_i * n_j
    dr2 = np.zeros((dr2_1+dr2_2), dtype=np.float64)
    inv_cellsize = 1.0 / cellsize
    # `a` is the only parallel index.  The inner loops use their own names
    # so that they neither shadow it nor rely on a value left over from the
    # other section.
    for a in numba.prange(n_i):
        if is_sq:
            for b in range(a, n_i):
                index_1 = b * (b + 1) // 2 + a
                xdist = pIList[a,0]-pIList[b,0]
                ydist = pIList[a,1]-pIList[b,1]
                zdist = pIList[a,2]-pIList[b,2]
                xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
                yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
                zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
                dr2[index_1] = xk**2+yk**2+zk**2
        if is_nonsq:
            for c in range(n_j):
                index_2 = dr2_1 + c + n_j * a
                xdist = pJList[c,0]-pIList[a,0]
                ydist = pJList[c,1]-pIList[a,1]
                zdist = pJList[c,2]-pIList[a,2]
                xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
                yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
                zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
                dr2[index_2] = xk**2+yk**2+zk**2
    return dr2
Where pI (size: (N,3)) is a subset of pJ (size: (M,3)). In this code we subdivide the calculation into two sections: the pI-pI pair distances, which are symmetric, so we only need the lower triangular matrix, i.e. N(N-1)/2 unique values; and the pI-pJ distances, where we have to go through N(M-N) unique values. To further optimize the function, I have made two additional changes:
Combining the outer loop for both sections. In order to do so I am now iterating over the upper triangular matrix which translates to N(N+1)/2 values. One can also implement an if check to see if coordinates are identical, though I am not sure how much time it would save.
To avoid appending the results from the two section together, I am predefining and partitioning the returned array by length.
A further assumption I have made is that the time needed for partitioning vec into smallVec and restOfVec is negligible compared to the pair distance calculation. Obviously, if that is wrong, one might need to rethink the optimization pathway.
The resultant function is 1.5 times faster than the previous function. I am looking to further optimize the function, but I am very new to loop tiling and other advanced optimizations, so if you have any suggestions, please let me know.
Update 3
So I figured that I should focus on making the function more optimized in terms of serial calculations as I might simply use Dask or multiprocessing to implement to work on multiple sections of an input collection of frames. So the reference function now is:
@njit(cache=True, parallel=False, fastmath=True, boundscheck=False, nogil=True)
def pair_vec_dist_test(pIList, pJList, cellsize):
    """Serial reference kernel: squared periodic distance of every (i, j) pair.

    Returns a flat float32 array of length len(pIList) * len(pJList); entry
    j + len(pJList) * i belongs to pIList[i] and pJList[j].  Only the first
    three columns of the inputs are used.
    """
    _I = pIList.shape[0]
    _J = pJList.shape[0]
    # np.empty is sufficient because every slot is written below.
    dr2 = np.empty(_I * _J, dtype=np.float32)
    inv_cellsize = 1.0 / cellsize
    # The decorator sets parallel=False, so prange behaves like range here.
    for i in numba.prange(_I):
        for j in range(_J):
            index = j + _J * i
            xdist = pJList[j,0]-pIList[i,0]
            ydist = pJList[j,1]-pIList[i,1]
            zdist = pJList[j,2]-pIList[i,2]
            xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
            yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
            zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
            dr2[index] = xk**2+yk**2+zk**2
    return dr2
Going back to the main problem while ignoring the symmetry aspect, I tried to further optimize the distance function as:
@njit(cache=True, parallel=False, fastmath=True, boundscheck=False, nogil=True)
def pair_vec_dist_test_v2(pIList, pJList, cellsize):
    """Tiled variant of the serial kernel; same output layout as the untiled one.

    The (i, j) index space is walked in tile x tile squares.  The min()
    bounds handle a final partial tile, so the input lengths do not have to
    be multiples of the tile size.  Returns a flat float32 array with entry
    j + len(pJList) * i for pIList[i] and pJList[j].
    """
    _I = pIList.shape[0]
    _J = pJList.shape[0]
    dr2 = np.empty(_I * _J, dtype=np.float32)
    inv_cellsize = 1.0 / cellsize
    tile = 32
    for ii in range(0, _I, tile):
        i_end = min(_I, ii+tile)
        for jj in range(0, _J, tile):
            j_end = min(_J, jj+tile)
            for i in range(ii, i_end):
                for j in range(jj, j_end):
                    index = j + _J * i
                    xdist = pJList[j,0]-pIList[i,0]
                    ydist = pJList[j,1]-pIList[i,1]
                    zdist = pJList[j,2]-pIList[i,2]
                    xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
                    yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
                    zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
                    dr2[index] = xk**2+yk**2+zk**2
    return dr2
which is essentially tiling up the two vector arrays. However, I couldn't get any speedup, as the execution times of both functions are roughly the same. I also thought about working with the transpose of the vector arrays, but I couldn't figure out how to align them in a loop when the vector lengths are not a multiple of the tile length. Does anyone have any further suggestions or ideas on how to proceed?
Edit: Another failed trial
@njit(cache=True, parallel=False, fastmath=True, boundscheck=False, nogil=True)
def pair_vec_dist_test_v3(pIList, pJList, cellsize):
    """Experimental slice-based kernel working on padded, transposed copies.

    Both inputs are copied into zero-padded buffers (2 * tile extra rows)
    with matching 0/1 masks, then transposed so that each axis is one row.
    For every ii and every tile start jj, a tile-long slice of J starting at
    jj is combined element-wise with a tile-long slice of I starting at ii;
    results from padded positions are zeroed by the mask.

    NOTE(review): the author describes this as a failed trial.  The slices
    are paired element by element (J[jj + t] with I[ii + t]), so the output
    layout is not the same as the untiled kernels - confirm before reuse.
    """
    inv_cellsize = 1.0 / cellsize
    tile = 32
    _I = pIList.shape[0]
    _J = pJList.shape[0]
    vecI = np.empty((_I+2*tile, 3), dtype=np.float64)  # for rolling effect
    vecJ = np.empty((_J+2*tile, 3), dtype=np.float64)  # for rolling effect
    vecI_mask = np.ones((_I+2*tile), dtype=np.uint8)
    vecJ_mask = np.ones((_J+2*tile), dtype=np.uint8)
    vecI[:_I] = pIList
    vecJ[:_J] = pJList
    vecI[_I:] = 0.
    vecJ[_J:] = 0.
    vecI_mask[_I:] = 0
    # Mask the padding of J.  (This used to write to vecI_mask a second
    # time, which left the J padding unmasked.)
    vecJ_mask[_J:] = 0
    # Round up to the next multiple of tile; an exact multiple gains a full
    # extra tile.
    ILim = _I+(tile-_I%tile)
    JLim = _J+(tile-_J%tile)
    dr2 = np.empty((ILim*JLim), dtype=np.float64)
    vecI = vecI.T
    vecJ = vecJ.T
    for ii in range(ILim):
        for jj in range(0, JLim, tile):
            index = jj + JLim*ii
            mask = np.multiply(vecJ_mask[jj:jj+tile], vecI_mask[ii:ii+tile])
            xdist = vecJ[0,jj:jj+tile]-vecI[0,ii:ii+tile]
            ydist = vecJ[1,jj:jj+tile]-vecI[1,ii:ii+tile]
            zdist = vecJ[2,jj:jj+tile]-vecI[2,ii:ii+tile]
            xk = xdist-cellsize[0]*np.rint(xdist*inv_cellsize[0])
            yk = ydist-cellsize[1]*np.rint(ydist*inv_cellsize[1])
            zk = zdist-cellsize[2]*np.rint(zdist*inv_cellsize[2])
            arr = xk**2+yk**2+zk**2
            dr2[index:index+tile] = np.multiply(arr, mask)
    return dr2

First things first: there are race conditions in your current code. This basically means the produced results can be corrupted (and it also impacts performance). In practice, this causes undefined behaviour. For example, k[n] is written and read by multiple threads in get_dr2_vec2. One needs to be very careful when using prange. In this case, the race condition can be removed by not using the temporary array (which is not really useful) and by not using prange in the inner loop, because dr2[m] is updated there (updating it from multiple threads also causes a race condition).
Moreover, prange is often not practically useful when parallel=True is not set in the Numba decorator. Indeed, the current functions are not parallel since this flag is missing.
Finally, you can merge the functions pair_vec_dist and get_dr2_vec2, together with their internal loops, so as to avoid creating and filling large temporary arrays. Indeed, RAM throughput is small nowadays compared to the computing power of modern processors, and this gap has been growing for the last two decades. This effect is called the "memory wall" and it is not expected to disappear any time soon. Code that is less memory-bound generally tends to be faster and to scale better.
Here is the resulting code:
@njit(cache=True, parallel=True)
def pair_vec_dist(pIList=np.array([[]]), pJList=np.array([[]]), cellsize=np.array([])):
    """Return the squared periodic distance for every (i, j) pair as a flat array.

    Entry j + len(pJList) * i belongs to pIList[i] and pJList[j].  No
    intermediate pair arrays are built: each distance is accumulated straight
    into its output slot.  Only the outer loop is a prange, and each of its
    iterations writes to a disjoint range of dr2.
    """
    assert pIList.shape[1] == pJList.shape[1]
    dr2 = np.zeros(pIList.shape[0]*pJList.shape[0], dtype=np.float64)
    inv_cellsize = 1.0 / cellsize
    for i in numba.prange(pIList.shape[0]):
        for j in range(pJList.shape[0]):
            offset = j + pJList.shape[0] * i
            for k in range(pIList.shape[1]):
                tmp = pJList[j,k]-pIList[i,k]
                # Kept in its own variable so the loop index k is not
                # overwritten with a float.
                wrapped = tmp-cellsize[k]*np.rint(tmp*inv_cellsize[k])
                dr2[offset] += wrapped**2
    return dr2
It is 11 times faster with frames=50 and N=1000 on my 6-core machine (i5-9600KF).
The code can be optimized further. For example, dr2 is a flatten symmetric square matrix, so only the upper-right part needs to be computed and the bottom-left part can just be copied. Note that to do that efficiently in parallel, the work needs to be balanced between the thread (otherwise, the slowest will not be faster and will be the bottleneck). One can also generate an optimized version of the function only supporting cellsize.size == 3. Moreover, one can use register tiling so to make the code more cache-friendly. Finally, one can transpose the input so the layout is more SIMD-friendly (this certainly require the loop to be manually unrolled and the register tiling optimization to be done before).

How I can improve a loop for in a Python code?

I am translating this code from Matlab to Python. The code functions fine, but it is painfully slow in Python. In Matlab, the code runs in way less than a minute; in Python it took 30 minutes! Could someone with more experience in Python help me?
# P({ai})
somai = 0
for i in range(1, n):
somaj = 0
for j in range(1, n):
exponencial = math.exp(-((a[i] - a[j]) * (a[i] - a[j])) / dev_a2 - ((b[i] - b[j]) * (b[i] - b[j])) / dev_b2)
somaj = somaj + exponencial
somai = somai + somaj

As with MATLAB, I'd recommend you vectorize your code. Iterating by for-loops can be much slower than the lower level implementation of MATLAB and numpy.
Your operations (a[i] - a[j])*(a[i] - a[j]) are pairwise squared-Euclidean distance for all N data points. You can calculate a pairwise distance matrix using scipy's pdist and squareform functions -- pdist, squareform.
Then you add the two scaled pairwise distance matrices A and B (the exponent in the loop is -(a-term) - (b-term), i.e. -(A + B)), exponentiate the negated sum, and sum all the elements. So you could get a vectorized code like:
import numpy as np
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform

# Example data
N = 1000
a = np.random.rand(N, 1)
b = np.random.rand(N, 1)
dev_a2 = np.random.rand()
dev_b2 = np.random.rand()

# `a` is an [N,1] matrix (i.e. column vector).  pdist returns the condensed
# squared distances, squareform expands them to the full N x N matrix, and
# every element is then divided by the same divisor.
A = squareform(pdist(a, 'sqeuclidean')) / dev_a2

# Then do the same for `b`
B = squareform(pdist(b, 'sqeuclidean')) / dev_b2

# Calculate exponential decay.  The loop being vectorised evaluates
# exp(-(a-term) - (b-term)), so the two matrices are added before negating.
expo = np.exp(-(A + B))

# Sum all elements
total = np.sum(expo)
Here's a quick timing comparison between the iterative method and this vectorized code.
N: 1000 | Iter Output: 2729989.851117 | Vect Output: 2732194.924364
Iter time: 6.759 secs | Vect time: 0.031 secs
N: 5000 | Iter Output: 24855530.997400 | Vect Output: 24864471.007726
Iter time: 171.795 secs | Vect time: 0.784 secs
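Note that the final results are not exactly the same. I'm not sure why this is; it might be rounding error or a math error on my part, so check two things when comparing: the loop in the question runs over range(1, n) and therefore skips index 0, whereas the vectorized version uses every point, and the exponent has to combine the a-term and the b-term exactly as the loop does.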

TLDR
Use numpy
Why Numpy?
Python, by default, is slow. One of the powers of Python is that it plays nicely with C and has tons of libraries. The one that will help you here is numpy. Numpy is mostly implemented in C and, when used properly, is blazing fast. The trick is to phrase the code in such a way that you keep the execution inside numpy and outside of Python proper.
Code and Results
import math
import numpy as np

# Benchmark inputs: the same random data as numpy arrays (for new) and as
# plain Python lists (for old).
n = 1000
np_a = np.random.rand(n)
a = list(np_a)
np_b = np.random.rand(n)
b = list(np_b)
dev_a2, dev_b2 = (1, 1)


def old():
    """Reference implementation: double Python loop over all (i, j) pairs."""
    somai = 0.0
    for i in range(0, n):
        somaj = 0.0
        for j in range(0, n):
            tmp_1 = -((a[i] - a[j]) * (a[i] - a[j])) / dev_a2
            tmp_2 = -((b[i] - b[j]) * (b[i] - b[j])) / dev_b2
            exponencial = math.exp(tmp_1 + tmp_2)
            somaj += exponencial
        somai += somaj
    return somai


def new():
    """Vectorised equivalent of old(): the same sum, computed on n x n arrays."""
    # subtract.outer builds the matrix of all pairwise differences x[i] - x[j].
    tmp_1 = -np.square(np.subtract.outer(np_a, np_a)) / dev_a2
    # The b-term is scaled by dev_b2, matching old().
    tmp_2 = -np.square(np.subtract.outer(np_b, np_b)) / dev_b2
    exponential = np.exp(tmp_1 + tmp_2)
    somai = np.sum(exponential)
    return somai
old = 1.76 s ± 48.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
new = 24.6 ms ± 66.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
This is about a 70x improvement
old yields 740919.6020840995
new yields 740919.602084099
Explanation
You'll notice I broke up your code with the tmp_1 and tmp_2 a bit for clarity.
np.random.rand(n): This creates an array of length n that has random floats going from 0 to 1 (excluding 1) (documented here).
np.subtract.outer(a, b): Numpy has modules for all the operators that allow you do various things with them. Lets say you had np_a = [1, 2, 3], np.subtract.outer(np_a, np_a) would yield
array([[ 0, -1, -2],
[ 1, 0, -1],
[ 2, 1, 0]])
Here's a stackoverflow link if you want to go deeper on this. (also the word "outer" comes from "outer product" like from linear algebra)
np.square: simply squares every element in the matrix.
/: In numpy when you do arithmetic operators between scalars and matrices it does the appropriate thing and applies that operation to every element in the matrix.
np.exp: like np.square
np.sum: sums every element together and returns a scalar.

Efficient online linear regression algorithm in python

I got a 2-D dataset with two columns x and y. I would like to get the linear regression coefficients and interception dynamically when new data feed in. Using scikit-learn I could calculate all current available data like this:
from sklearn.linear_model import LinearRegression
regr = LinearRegression()
x = np.arange(100)
y = np.arange(100)+10*np.random.random_sample((100,))
regr.fit(x,y)
print(regr.coef_)
print(regr.intercept_)
However, I got quite big dataset (more than 10k rows in total) and I want to calculate coefficient and intercept as fast as possible whenever there's new rows coming in. Currently calculate 10k rows takes about 600 microseconds, and I want to accelerate this process.
Scikit-learn looks like does not have online update function for linear regression module. Is there any better ways to do this?

I've found solution from this paper: updating simple linear regression. The implementation is as below:
def lr(x_avg, y_avg, Sxy, Sx, n, new_x, new_y):
    """Update a simple linear regression y = alpha + beta * x with a new batch.

    x_avg: average of previous x, if no previous sample, set to 0
    y_avg: average of previous y, if no previous sample, set to 0
    Sxy: running sum of (x - x*) * (y - y*) products as returned by the
         previous call, if no previous sample, set to 0
    Sx: running sum of (x - x*)**2 as returned by the previous call,
        if no previous sample, set to 0
    n: number of previous samples
    new_x: new incoming numpy array x
    new_y: new incoming numpy array y

    Returns (new_Sxy, new_Sx, new_n, alpha, beta, new_x_avg, new_y_avg).
    The first three and the last two values are the state to pass to the
    next call; alpha is the intercept and beta the slope.

    Raises ValueError if n is not zero or positive.
    """
    new_n = n + len(new_x)
    new_x_avg = (x_avg*n + np.sum(new_x))/new_n
    new_y_avg = (y_avg*n + np.sum(new_y))/new_n
    if n > 0:
        # x_star / y_star are the reference points about which the new
        # samples are centred, so that adding their squared deviations to
        # the previous sums gives the sums about the updated mean.
        w_old = np.sqrt(n)
        w_new = np.sqrt(new_n)
        x_star = (x_avg*w_old + new_x_avg*w_new)/(w_old + w_new)
        y_star = (y_avg*w_old + new_y_avg*w_new)/(w_old + w_new)
    elif n == 0:
        x_star = new_x_avg
        y_star = new_y_avg
    else:
        raise ValueError("n must be zero or positive")
    dx = new_x - x_star
    dy = new_y - y_star
    new_Sx = Sx + np.sum(dx**2)
    # reshape(-1) lets a column-vector x be combined with a 1-D y.
    new_Sxy = Sxy + np.sum(dx.reshape(-1) * dy.reshape(-1))
    beta = new_Sxy/new_Sx
    alpha = new_y_avg - beta * new_x_avg
    return new_Sxy, new_Sx, new_n, alpha, beta, new_x_avg, new_y_avg
Performance comparison:
Scikit learn version that calculate 10k samples altogether.
from sklearn.linear_model import LinearRegression
x = np.arange(10000).reshape(-1,1)
y = np.arange(10000)+100*np.random.random_sample((10000,))
regr = LinearRegression()
%timeit regr.fit(x,y)
# 419 µs ± 14.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
My version assume 9k sample is already calculated:
Sxy, Sx, n, alpha, beta, new_x_avg, new_y_avg = lr(0, 0, 0, 0, 0, x.reshape(-1,1)[:9000], y[:9000])
new_x, new_y = x.reshape(-1,1)[9000:], y[9000:]
%timeit lr(new_x_avg, new_y_avg, Sxy,Sx,n,new_x, new_y)
# 38.7 µs ± 1.31 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
10 times faster, which is expected.

Nice! Thanks for sharing your findings :) Here is an equivalent implementation of this solution written with dot products:
class SimpleLinearRegressor:
    """Incremental least-squares fit of y = intercept + slope * x.

    Only five running totals are stored (count, sum x, sum y, sum x*x,
    sum x*y), so each update costs time proportional to the new batch only.
    """

    def __init__(self):
        # Running totals, in the order: count, sum_x, sum_y, sum_xx, sum_xy.
        self.dots = np.zeros(5)
        # Both stay None until the accumulated data determine a line.
        self.intercept = None
        self.slope = None

    def update(self, x: np.ndarray, y: np.ndarray):
        """Add a batch of 1-D samples and refresh intercept and slope."""
        self.dots += np.array(
            [
                x.shape[0],
                x.sum(),
                y.sum(),
                np.dot(x, x),
                np.dot(x, y),
            ]
        )
        size, sum_x, sum_y, sum_xx, sum_xy = self.dots
        # Determinant of the 2x2 normal equations.
        det = size * sum_xx - sum_x ** 2
        if det > 1e-10:  # determinant may be zero initially
            self.intercept = (sum_xx * sum_y - sum_xy * sum_x) / det
            self.slope = (sum_xy * size - sum_x * sum_y) / det
When working with time series data, we can extend this idea to do sliding window regression with a soft (EMA-like) window.

You can use accelerated libraries that implement faster algorithms - particularly
https://github.com/intel/scikit-learn-intelex
For linear regression you would get much better performance
First install package
pip install scikit-learn-intelex
And then add in your python script
from sklearnex import patch_sklearn
patch_sklearn()

Optimization of a timeline builder function

I've got a squared signal with a frequency f, and I'm interested in the time at which the square starts.
def time_builder(f, t0=0, tf=300):
    """
    Function building the time line in ms between t0 and tf with a frequency f.
    f: Hz (must be positive)
    t0 and tf: ms

    Returns a list that always starts with t0, followed by every
    t0 + (i/f)*1000 (i = 1, 2, ...) that is strictly smaller than tf.
    Raises ValueError if f is not positive.
    """
    if f <= 0:
        raise ValueError("f must be a positive frequency in Hz")
    times = [t0]  # /!\ time in ms
    i = 1
    next_start = t0 + (i/f)*1000
    while next_start < tf:
        times.append(next_start)
        i += 1
        next_start = t0 + (i/f)*1000
    return times
So this function loops between t0 and tf to create a list in which is the timing at which a square starts. I'm quite sure it's not the best way to do it, and I'd like to know how to improve it.
Thanks.

If I am interpreting this correct, you are looking for a list of the times of the waves, starting at t0 and ending at tf.
def time_builder(f, t0=0, tf=300):
    """
    Function building the time line in ms between t0 and tf with a frequency f.
    f: Hz (must be positive)
    t0 and tf: ms

    Returns [t0, t0 + T, t0 + 2*T, ...] with T = 1000 / f, containing t0 and
    every later wavefront that is strictly smaller than tf.
    Raises ValueError if f is not positive.
    """
    import math

    if f <= 0:
        raise ValueError("f must be a positive frequency in Hz")
    T = 1000 / f  # period [ms]
    # Number of wavefronts in [t0, tf).  Rounding to nearest would drop the
    # last one whenever the remainder is below half a period.  t0 itself is
    # always kept, hence at least 1.
    n = max(1, math.ceil((tf - t0) / T))
    times = [t0 + i*T for i in range(n)]
    # Guard against float round-off pushing the last value up to tf.
    while len(times) > 1 and times[-1] >= tf:
        times.pop()
    return times

Using standard library python for this might not be the best approach... particularly considering that you might want to do other things later on.
An alternative is to use numpy. This will let you to do the following
import numpy as np
from scipy import signal

t = np.linspace(0, 1, 500, endpoint=False)
s = signal.square(2 * np.pi * 5 * t)  # we create a square signal using scipy
d = np.diff(s)  # obtaining the differences, this tells when there is a step.
                # In this particular case, 2 means step up, -2 step down.
# d[i] is s[i + 1] - s[i], so a step up found at index i means the square
# starts at sample i + 1.  A square that is already high at t[0] has no step
# before it and is therefore not reported.
starts = t[np.where(d == 2)[0] + 1]

Convert to Web Mercator With Numpy

My program vertically stretches a Numpy array, representing a 180 by 360 map image, so it represents a Web Mercator map image.
I wrote a function (below) that does what I want - but it is crazy slow (takes like five minutes). Is there a much faster and easier way to do this? Maybe using Numpy interpolate2d or MatPlotLib?
def row2lat(row):
    """Convert a Mercator vertical coordinate (in degrees) to a latitude in degrees."""
    return 180.0/math.pi*(2.0*math.atan(math.exp(row*math.pi/180.0))-math.pi/2.0)


def mercator(geodetic):
    """Vertically stretch an equally spaced lat/lon image into a Web Mercator image.

    The rows of geodetic are doubled first; each output row is then a linear
    interpolation between the two doubled source rows that bracket the
    latitude belonging to that output row.  The output has as many rows as
    the doubled input and side = number of columns.
    """
    geo = np.repeat(geodetic, 2, axis=0)
    merc = np.zeros_like(geo)
    side = geo[0].size
    last = geo.shape[0] - 1
    for row in range(side):
        lat = row2lat(180 - ((row * 1.0)/side) * 360)
        # Fractional source row that holds this latitude.
        g_row = (abs(90 - lat)/180)*side
        fraction = g_row-math.floor(g_row)
        # Clamp so that small images cannot index past the last source row.
        lower = min(math.floor(g_row), last)
        upper = min(math.ceil(g_row), last)
        for col in range(side):
            # The nearer row gets the larger weight: the row at floor(g_row)
            # is `fraction` away, so it is weighted with 1 - fraction.
            low_part = geo[lower][col] * (1-fraction)
            high_part = geo[upper][col] * (fraction)
            merc[row][col] = low_part + high_part
    return merc

Try to avoid the inner for loop and vectorize your functions. Numpy is highly optimized to run those things efficiently. Your function would then read like
def mercator_faster(geodetic):
    """Vertically stretch an equally spaced lat/lon image into a Web Mercator image.

    Same row-by-row linear interpolation as the per-pixel version, but each
    output row is computed with one numpy expression instead of a Python
    loop over the columns.
    """
    geo = np.repeat(geodetic, 2, axis=0)
    merc = np.zeros_like(geo)
    side = geo[0].size
    last = geo.shape[0] - 1
    for row in range(side):
        lat = row2lat(180 - ((row * 1.0)/side) * 360)
        # Fractional source row that holds this latitude.
        g_row = (abs(90 - lat)/180)*side
        fraction = g_row-math.floor(g_row)
        # Clamp so that small images cannot index past the last source row.
        lower = min(math.floor(g_row), last)
        upper = min(math.ceil(g_row), last)
        # Here I optimized the code by using the numpy vector operations
        # instead of the for loop.  The nearer row gets the larger weight:
        # the row at floor(g_row) is `fraction` away, so it is weighted
        # with 1 - fraction.
        low_part = geo[lower, :] * (1-fraction)
        high_part = geo[upper, :] * (fraction)
        merc[row, :] = low_part + high_part
    return merc
If I run it on my machine it takes less then a second:
%timeit mercator_faster(geo)
1 loops, best of 3: 727 ms per loop
And it looks like this (I had to rescale it, because it was too big for SO):
Possibly the outer for loop might be vectorized as well, but I guess this is much harder.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Vectorizing calculation of values using numpy which requires previously calculated value - python

Related

Speeding up vector distance calculation using Numba

How I can improve a loop for in a Python code?

Efficient online linear regression algorithm in python

Optimization of a timeline builder function

Convert to Web Mercator With Numpy

Categories

Resources