Vectorize the following Python code?

I am trying to vectorize the following operations with two matrix in python.
f = matrix([[ 96],
            [192],
            [288],
            [384]], dtype=int32)
g = matrix([[  0.],
            [ 70.],
            [200.],
            [ 60.]])
I need to create z without a loop, such that each z[i] is the maximum of f[i] and the running value z[i-1] + g[i] from the other matrix g. This loop is called thousands of times, which slows down the run time.
for i in range(4):
    if i != 0:
        z[i] = max(f[i], z[i-1] + g[i])
    else:
        z[0] = f[i]
Any guidance on how to vectorize this code would be really helpful.
Thanks in advance.

Here is a vectorized version. It uses the cumulative maximum of f - cumsum(g) to detect the points where f[i] is larger than z[i-1] + g[i], i.e. where the running value resets to f[i].
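To see the idea on the OP's 4-element example, here is a quick illustrative sketch using plain 1-D arrays rather than matrix (the intermediate names match the f_vect function below):

import numpy as np

f = np.array([96., 192., 288., 384.])
g = np.array([0., 70., 200., 60.])

gg = np.cumsum(g)                     # [  0.  70. 270. 330.]
rmx = np.maximum.accumulate(f - gg)   # [ 96. 122. 122. 122.] -> f "wins" only at i = 0, 1
sw = np.r_[0, 1 + np.flatnonzero(rmx[:-1] != rmx[1:]), 4]    # segment boundaries [0 1 4]
z = gg + np.repeat(f[sw[:-1]] - gg[sw[:-1]], np.diff(sw))    # [ 96. 192. 392. 452.]

The result [96, 192, 392, 452] matches what the loop produces.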
Timings:
N = 10
loopy 0.00594156 ms
vect 0.03193051 ms
N = 100
loopy 0.05560229 ms
vect 0.03186400 ms
N = 1000
loopy 0.57484017 ms
vect 0.04492043 ms
N = 10000
loopy 5.75115310 ms
vect 0.15519847 ms
N = 100000
loopy 57.30253551 ms
vect 1.69428380 ms
Code:
import numpy as np
import types
from timeit import timeit
def setup_data(N):
    g = np.random.random((N,))
    f = 2 + np.cumsum(np.random.random((N,)))
    return f, g

def f_loopy(f, g):
    N, = f.shape
    z = np.empty_like(f)
    for i in range(N):
        if i != 0:
            z[i] = max(f[i], z[i-1] + g[i])
        else:
            z[0] = f[i]
    return z

def f_vect(f, g):
    N, = f.shape
    gg = np.cumsum(g)
    rmx = np.maximum.accumulate(f - gg)
    sw = np.r_[0, 1 + np.flatnonzero(rmx[:-1] != rmx[1:]), N]
    return gg + np.repeat(f[sw[:-1]] - gg[sw[:-1]], np.diff(sw))

for N in [10, 100, 1000, 10000, 100000]:
    data = setup_data(N)
    ref = f_loopy(*data)
    print(f'N = {N}')
    for name, func in list(globals().items()):
        if not name.startswith('f_') or not isinstance(func, types.FunctionType):
            continue
        try:
            assert np.allclose(ref, func(*data))
            print("{:16s}{:16.8f} ms".format(name[2:], timeit(
                'f(*data)', globals={'f': func, 'data': data}, number=100)*10))
        except:
            print("{:16s} apparently failed".format(name[2:]))

Related

a 2d numpy array element-wise raised to the power of another 2d array

I need to raise a large 2d numpy array element-wise to the power of another 2d array with the same dimensions. I tried to use numba to improve the speed. Is there an efficient implementation using numpy built-in functions instead of numba?
import numpy as np
from time import time

def elewise_power_raw(x, y):
    n = x.shape[0]
    z = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            z[i, j] = x[i, j]**y[i, j]
    return z

import numba

@numba.njit
def elewise_power_numba(x, y):
    n = x.shape[0]
    z = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            z[i, j] = x[i, j]**y[i, j]
    return z
def measure_time(n=5000):
    x = np.random.rand(n, n)
    y = np.random.rand(n, n)
    t0 = time()
    elewise_power_raw(x, y)
    print('Raw: ', round(time() - t0, 2), 's')
    t1 = time()
    elewise_power_numba(x, y)
    print('numba: ', round(time() - t1, 2), 's')

measure_time(5000)
# Raw: 22.31 s
# numba: 1.4 s
You can always vectorize it.
x = np.random.rand(5000, 5000)
y = np.random.rand(5000, 5000)
%timeit x**y
977 ms ± 7.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
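For reference, the ** operator dispatches to the np.power ufunc, so an explicit call is equivalent. A small sanity check against the loopy version (illustrative sizes only, not from the original post):

import numpy as np

x = np.random.rand(4, 4)
y = np.random.rand(4, 4)

z_op = x ** y                  # vectorized operator form
z_ufunc = np.power(x, y)       # same computation via the ufunc

# loopy reference for comparison
z_loop = np.array([[x[i, j] ** y[i, j] for j in range(4)] for i in range(4)])

print(np.allclose(z_op, z_ufunc), np.allclose(z_op, z_loop))   # True True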

Cluster data based on distance threshold

I would like to delete any point that is within 10 cm (a threshold of 0.1 here) of an earlier point.
This is what I have, but it takes a lot of computational time because my dataset is very large:
for i in range(len(data)):
    for j in range(i, len(data)):
        if (i == j):
            continue
        elif np.sqrt((data[i, 0]-data[j, 0])**2 + (data[i, 1]-data[j, 1])**2) <= 0.1:
            data[j, 0] = np.nan
data = data[~np.isnan(data).any(axis=1)]
Is there a pythonic way to do this?
Here is an approach using a KDTree:
import numpy as np
from scipy.spatial import cKDTree as KDTree

def cluster_data_KDTree(a, thr=0.1):
    t = KDTree(a)
    mask = np.ones(a.shape[:1], bool)
    idx = 0
    nxt = 1
    while nxt:
        mask[t.query_ball_point(a[idx], thr)] = False
        nxt = mask[idx:].argmax()
        mask[idx] = True
        idx += nxt
    return a[mask]
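As a quick sanity check on a tiny hand-made point set (illustrative values, not from the original post), points within 0.1 of an earlier kept point are dropped:

a = np.array([[0.00, 0.00],
              [0.05, 0.00],   # within 0.1 of the first point -> dropped
              [0.50, 0.50],
              [0.55, 0.50],   # within 0.1 of the third point -> dropped
              [1.00, 1.00]])

print(cluster_data_KDTree(a, thr=0.1))
# [[0.   0.  ]
#  [0.5  0.5 ]
#  [1.   1.  ]]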
Borrowing @Divakar's test case we see that this delivers another 100x speedup on top of the 400x Divakar reports. Compared to OP we extrapolate a ridiculous 40,000x:
np.random.seed(0)
data1 = np.random.rand(10000,2)
data2 = data1.copy()
from timeit import timeit
kwds = dict(globals=globals(), number=10)
print(timeit("cluster_data_KDTree(data1)", **kwds))
print(timeit("cluster_data_pdist_v1(data2)", **kwds))
np.random.seed(0)
data1 = np.random.rand(10000,2)
data2 = data1.copy()
out1 = cluster_data_KDTree(data1, thr=0.1)
out2 = cluster_data_pdist_v1(data2, dist_thresh = 0.1)
print(np.allclose(out1, out2))
Sample output:
0.05073001119308174
5.646531613077968
True
It turns out that this test case happens to be quite favorable to my approach because there are very few clusters and thus very few iterations.
If we drastically increase the number of clusters to about 3800 by changing the threshold to 0.01 KDTree still wins but the speedup is reduced from 100x to 15x:
0.33647687803022563
5.28947562398389
True
We can use pdist with one-loop -
from scipy.spatial.distance import pdist

def cluster_data_pdist_v1(a, dist_thresh = 0.1):
    d = pdist(a)
    mask = d <= dist_thresh
    n = len(a)
    idx = np.concatenate(( [0], np.arange(n-1,0,-1).cumsum() ))
    start, stop = idx[:-1], idx[1:]
    idx_out = np.zeros(mask.sum(), dtype=int) # use np.empty for a bit more speedup
    cur_start = 0
    for iterID, (i, j) in enumerate(zip(start, stop)):
        if iterID not in idx_out[:cur_start]:
            rm_idx = np.flatnonzero(mask[i:j]) + iterID + 1
            L = len(rm_idx)
            idx_out[cur_start:cur_start+L] = rm_idx
            cur_start += L
    return np.delete(a, idx_out[:cur_start], axis=0)
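The idx array maps each row i to its slice of the condensed distance vector that pdist returns. A minimal sketch of that layout for n = 4 points (values are illustrative):

import numpy as np
from scipy.spatial.distance import pdist, squareform

pts = np.array([[0., 0.], [1., 0.], [0., 1.], [1., 1.]])
d = pdist(pts)          # condensed order: d01, d02, d03, d12, d13, d23

n = len(pts)
idx = np.concatenate(([0], np.arange(n-1, 0, -1).cumsum()))   # [0 3 5 6]

print(d[idx[0]:idx[1]])         # distances from point 0 to points 1..3
print(squareform(d)[0, 1:])     # same values read from the square matrix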
Benchmarking
Original approach -
def cluster_data_org(data, dist_thresh = 0.1):
    for i in range(len(data)):
        for j in range(i, len(data)):
            if (i == j):
                continue
            elif np.sqrt((data[i, 0]-data[j, 0])**2 +
                         (data[i, 1]-data[j, 1])**2) <= 0.1:
                data[j, 0] = np.nan
    return data[~np.isnan(data).any(axis=1)]
Runtime test, verification on random data in the range : [0,1) with 10,000 points -
In [207]: np.random.seed(0)
...: data1 = np.random.rand(10000,2)
...: data2 = data1.copy()
...:
...: out1 = cluster_data_org(data1, dist_thresh = 0.1)
...: out2 = cluster_data_pdist_v1(data2, dist_thresh = 0.1)
...: print np.allclose(out1, out2)
True
In [208]: np.random.seed(0)
...: data1 = np.random.rand(10000,2)
...: data2 = data1.copy()
In [209]: %timeit cluster_data_org(data1, dist_thresh = 0.1)
1 loop, best of 3: 1min 50s per loop
In [210]: %timeit cluster_data_pdist_v1(data2, dist_thresh = 0.1)
1 loop, best of 3: 287 ms per loop
Around 400x speedup for such a setup!

Arnaud Legoux Moving Average and numpy

I'd like to write a vectorized version of code that calculates the Arnaud Legoux Moving Average using NumPy (or Pandas). Could you help me with this, please? Thanks.
The non-vectorized version looks like the following (see below).
def NPALMA(pnp_array, **kwargs):
    '''
    ALMA - Arnaud Legoux Moving Average,
    http://www.financial-hacker.com/trend-delusion-or-reality/
    https://github.com/darwinsys/Trading_Strategies/blob/master/ML/Features.py
    '''
    length = kwargs['length']
    # just some number (6.0 is useful)
    sigma = kwargs['sigma']
    # sensitivity (close to 1) or smoothness (close to 0)
    offset = kwargs['offset']

    asize = length - 1
    m = offset * asize
    s = length / sigma
    dss = 2 * s * s

    alma = np.zeros(pnp_array.shape)
    wtd_sum = np.zeros(pnp_array.shape)

    for l in range(len(pnp_array)):
        if l >= asize:
            for i in range(length):
                im = i - m
                wtd = np.exp(-(im * im) / dss)
                alma[l] += pnp_array[l - length + i] * wtd
                wtd_sum[l] += wtd
            alma[l] = alma[l] / wtd_sum[l]

    return alma
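For reference, the loopy version could be called like this on a synthetic series (parameter values below are just illustrative, not from the original post):

import numpy as np

np.random.seed(0)
prices = 100 + np.cumsum(np.random.randn(200))   # synthetic price series

alma = NPALMA(prices, length=9, sigma=6.0, offset=0.85)
print(alma[:12])   # the first length-1 entries stay zero; smoothing starts at index length-1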
Starting Approach
We can create sliding windows along the first axis and then use tensor multiplication with the range of wtd values for the sum-reductions.
The implementation would look something like this -
# Get all wtd values in an array
wtds = np.exp(-(np.arange(length) - m)**2/dss)
# Get the sliding windows for input array along first axis
pnp_array3D = strided_axis0(pnp_array,len(wtds))
# Initialize o/p array
out = np.zeros(pnp_array.shape)
# Get sum-reductions for the windows which don't need wrapping over
out[length:] = np.tensordot(pnp_array3D,wtds,axes=((1),(0)))[:-1]
# Last element of the output needed wrapping. So, do it separately.
out[length-1] = wtds.dot(pnp_array[np.r_[-1,range(length-1)]])
# Finally perform the divisions
out /= wtds.sum()
Function to get the sliding windows: strided_axis0 is from here.
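Since the link for strided_axis0 is external, here is one possible implementation of such a sliding-window helper (an assumption based on how it is used above, built on NumPy's stride tricks):

import numpy as np
from numpy.lib.stride_tricks import as_strided

def strided_axis0(a, L):
    # View of all length-L windows along axis 0; shape (a.shape[0]-L+1, L, *a.shape[1:])
    nd0 = a.shape[0] - L + 1
    return as_strided(a, shape=(nd0, L) + a.shape[1:],
                      strides=(a.strides[0],) + a.strides)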
Boost with 1D convolution
Those multiplications with wtds values and then their sum-reductions are basically convolution along the first axis. As such, we can use scipy.ndimage.convolve1d along axis=0. This would be much faster given the memory efficiency, as we won't be creating huge sliding windows.
The implementation would be -
from scipy.ndimage import convolve1d as conv
avgs = conv(pnp_array, weights=wtds/wtds.sum(),axis=0, mode='wrap')
Thus, out[length-1:], which are the non-zero rows, would be the same as avgs[:-length+1].
There could be some precision difference if we are working with really small kernel numbers from wtds. So, keep that in mind if using this convolution method.
Runtime test
Approaches -
def original_app(pnp_array, length, m, dss):
    alma = np.zeros(pnp_array.shape)
    wtd_sum = np.zeros(pnp_array.shape)
    for l in range(len(pnp_array)):
        if l >= asize:
            for i in range(length):
                im = i - m
                wtd = np.exp(-(im * im) / dss)
                alma[l] += pnp_array[l - length + i] * wtd
                wtd_sum[l] += wtd
            alma[l] = alma[l] / wtd_sum[l]
    return alma

def vectorized_app1(pnp_array, length, m, dss):
    wtds = np.exp(-(np.arange(length) - m)**2/dss)
    pnp_array3D = strided_axis0(pnp_array, len(wtds))
    out = np.zeros(pnp_array.shape)
    out[length:] = np.tensordot(pnp_array3D, wtds, axes=((1),(0)))[:-1]
    out[length-1] = wtds.dot(pnp_array[np.r_[-1, range(length-1)]])
    out /= wtds.sum()
    return out

def vectorized_app2(pnp_array, length, m, dss):
    wtds = np.exp(-(np.arange(length) - m)**2/dss)
    return conv(pnp_array, weights=wtds/wtds.sum(), axis=0, mode='wrap')
Timings -
In [470]: np.random.seed(0)
...: m,n = 1000,100
...: pnp_array = np.random.rand(m,n)
...:
...: length = 6
...: sigma = 0.3
...: offset = 0.5
...:
...: asize = length - 1
...: m = np.floor(offset * asize)
...: s = length / sigma
...: dss = 2 * s * s
...:
In [471]: %timeit original_app(pnp_array, length, m, dss)
...: %timeit vectorized_app1(pnp_array, length, m, dss)
...: %timeit vectorized_app2(pnp_array, length, m, dss)
...:
10 loops, best of 3: 36.1 ms per loop
1000 loops, best of 3: 1.84 ms per loop
1000 loops, best of 3: 684 µs per loop
In [472]: np.random.seed(0)
...: m,n = 10000,1000 # rest same as previous one
In [473]: %timeit original_app(pnp_array, length, m, dss)
...: %timeit vectorized_app1(pnp_array, length, m, dss)
...: %timeit vectorized_app2(pnp_array, length, m, dss)
...:
1 loop, best of 3: 503 ms per loop
1 loop, best of 3: 222 ms per loop
10 loops, best of 3: 106 ms per loop

Double dot product with broadcasting in numpy

I have the following operation :
import numpy as np
x = np.random.rand(3,5,5)
w = np.random.rand(5,5)
y=np.zeros((3,5,5))
for i in range(3):
    y[i] = np.dot(w.T, np.dot(x[i], w))
This corresponds to the pseudo-expression y[m,i,j] = sum( w[k,i] * x[m,k,l] * w[l,j], axes=[k,l] ), or equivalently simply the dot product of w.T, x, w broadcast over the first dimension of x.
How can I implement it with numpy's broadcasting rules ?
Thanks in advance.
Here's one vectorized approach with np.tensordot which should be better than broadcasting + summation any day -
# Take care of "np.dot(x[i],w)" term
x_w = np.tensordot(x,w,axes=((2),(0)))
# Perform "np.dot(w.T,np.dot(x[i],w))" : "np.dot(w.T,x_w)"
y_out = np.tensordot(x_w,w,axes=((1),(0))).swapaxes(1,2)
Alternatively, all of the mess being taken care of with one np.einsum call, but could be slower -
y_out = np.einsum('ab,cae,eg->cbg',w,x,w)
Runtime test -
In [114]: def tensordot_app(x, w):
     ...:     x_w = np.tensordot(x,w,axes=((2),(0)))
     ...:     return np.tensordot(x_w,w,axes=((1),(0))).swapaxes(1,2)
     ...:
     ...: def einsum_app(x, w):
     ...:     return np.einsum('ab,cae,eg->cbg',w,x,w)
     ...:
In [115]: x = np.random.rand(30,50,50)
...: w = np.random.rand(50,50)
...:
In [116]: %timeit tensordot_app(x, w)
1000 loops, best of 3: 477 µs per loop
In [117]: %timeit einsum_app(x, w)
1 loop, best of 3: 219 ms per loop
Giving the broadcasting a chance
The sum-notation was -
y[m,i,j] = sum( w[k,i] * x[m,k,l] * w[l,j], axes=[k,l] )
Thus, the three terms would be stacked for broadcasting, like so -
w : [ N x k x i x N x N]
x : [ m x k x N x l x N]
w : [ N x N x N x l x j]
, where N represents new-axis being appended to facilitate broadcasting along those dims.
The terms with new axes being added with None/np.newaxis would then look like this -
w : w[None, :, :, None, None]
x : x[:, :, None, :, None]
w : w[None, None, None, :, :]
Thus, the broadcasted product would be -
p = w[None,:,:,None,None]*x[:,:,None,:,None]*w[None,None,None,:,:]
Finally, the output would be sum-reduction to lose (k,l), i.e. axes =(1,3) -
y = p.sum((1,3))
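For a quick check that all three routes agree with the original loop (a small illustrative sketch, sizes chosen only for the demo):

import numpy as np

x = np.random.rand(3, 5, 5)
w = np.random.rand(5, 5)

# original loop
y_loop = np.zeros((3, 5, 5))
for i in range(3):
    y_loop[i] = np.dot(w.T, np.dot(x[i], w))

# tensordot route
x_w = np.tensordot(x, w, axes=((2), (0)))
y_td = np.tensordot(x_w, w, axes=((1), (0))).swapaxes(1, 2)

# einsum route
y_es = np.einsum('ab,cae,eg->cbg', w, x, w)

# broadcasting route
p = w[None, :, :, None, None] * x[:, :, None, :, None] * w[None, None, None, :, :]
y_bc = p.sum((1, 3))

print(np.allclose(y_loop, y_td), np.allclose(y_loop, y_es), np.allclose(y_loop, y_bc))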

Optimize a numpy ndarray indexing operation

I have a numpy operation that looks like the following:
for i in range(i_max):
    for j in range(j_max):
        r[i, j, x[i, j], y[i, j]] = c[i, j]
where x, y and c have the same shape.
Is it possible to use numpy's advanced indexing to speed this operation up?
I tried using:
i = numpy.arange(i_max)
j = numpy.arange(j_max)
r[i, j, x, y] = c
However, I didn't get the result I expected.
Using linear indexing -
d0,d1,d2,d3 = r.shape
np.put(r,np.arange(i_max)[:,None]*d1*d2*d3 + np.arange(j_max)*d2*d3 + x*d3 +y,c)
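As a side note, the same flat indices can also be built with np.ravel_multi_index, which avoids writing the stride arithmetic by hand (a hedged alternative sketch, not from the original answer):

flat = np.ravel_multi_index((np.arange(i_max)[:, None], np.arange(j_max), x, y), r.shape)
np.put(r, flat, c)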
Benchmarking and verification
Define functions -
def linear_indx(r, x, y, c, i_max, j_max):
    d0, d1, d2, d3 = r.shape
    np.put(r, np.arange(i_max)[:,None]*d1*d2*d3 + np.arange(j_max)*d2*d3 + x*d3 + y, c)
    return r

def org_app(r, x, y, c, i_max, j_max):
    for i in range(i_max):
        for j in range(j_max):
            r[i, j, x[i,j], y[i,j]] = c[i,j]
    return r
Setup input arrays and benchmark -
In [134]: # Setup input arrays
...: i_max = 40
...: j_max = 50
...: D0 = 60
...: D1 = 70
...: N = 80
...:
...: r = np.zeros((D0,D1,N,N))
...: c = np.random.rand(i_max,j_max)
...:
...: x = np.random.randint(0,N,(i_max,j_max))
...: y = np.random.randint(0,N,(i_max,j_max))
...:
In [135]: # Make copies for testing, as both functions make in-situ changes
...: r1 = r.copy()
...: r2 = r.copy()
...:
In [136]: # Verify results by comparing with original loopy approach
...: np.allclose(linear_indx(r1,x,y,c,i_max,j_max),org_app(r2,x,y,c,i_max,j_max))
Out[136]: True
In [137]: # Make copies for testing, as both functions make in-situ changes
...: r1 = r.copy()
...: r2 = r.copy()
...:
In [138]: %timeit linear_indx(r1,x,y,c,i_max,j_max)
10000 loops, best of 3: 115 µs per loop
In [139]: %timeit org_app(r2,x,y,c,i_max,j_max)
100 loops, best of 3: 2.25 ms per loop
The indexing arrays need to be broadcastable for this to work. The only change needed is to add an axis to the first index i to match the shape with the rest. The quick way to accomplish this is by indexing with None (which is equivalent to numpy.newaxis):
i = numpy.arange(i_max)
j = numpy.arange(j_max)
r[i[:,None], j, x, y] = c
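A small self-contained check that this broadcasted assignment matches the loop (illustrative sizes, not from the original post):

import numpy as np

i_max, j_max, N = 4, 5, 6
x = np.random.randint(0, N, (i_max, j_max))
y = np.random.randint(0, N, (i_max, j_max))
c = np.random.rand(i_max, j_max)

r_loop = np.zeros((i_max, j_max, N, N))
for a in range(i_max):
    for b in range(j_max):
        r_loop[a, b, x[a, b], y[a, b]] = c[a, b]

r_idx = np.zeros((i_max, j_max, N, N))
i = np.arange(i_max)
j = np.arange(j_max)
r_idx[i[:, None], j, x, y] = c

print(np.allclose(r_loop, r_idx))   # True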
