as stated in the title I want to make a calculation where instead of multiplying corresponding elements I binary XOR them and then add them.
Example for illustration:
EDIT: The big picture above IS the calculation but here we go: Take first row from the left [1 0 1] and first column from top matrix [1 0 0]. 1 XOR 1 = 0, 0 XOR 0 = 0, 1 XOR 0 = 1. Add them all 0 + 0 + 1 = 1. First row from the left [1 0 1], second column [0 0 0]: 1 XOR 0 = 1, 0 XOR 0 = 0, 1 XOR 0 = 1. Add them all 1 + 0 + 1 = 2. And so on
Is it possible to do that in numpy?
Try this:
M1 = np.array([[1, 0, 0], [0, 0, 0], [0, 0, 0]])
M2 = np.array([[1, 0, 1], [0, 0, 1], [1, 1, 1]])
# result[i, j] = sum_k(M2[i, k] ^ M1[k, j]): row i of M2 against column j of M1.
# M1 is transposed so that its columns line up with the broadcast last axis;
# without the transpose the rows of M1 would be used, which only gives the
# same answer when M1 is symmetric (as it happens to be in this example).
(M1.T ^ M2[:, None]).sum(-1)
Output:
array([[1, 2, 2],
[2, 1, 1],
[2, 3, 3]])
EDIT
If you want to preallocate memory:
# Scratch buffer for the broadcast XOR, shaped
# (rows of M2, columns of M1, shared inner length) instead of a fixed (3, 3, 3).
intermediary = np.empty((M2.shape[0], M1.shape[1], M1.shape[0]), dtype=np.int32)
# M1 is transposed so that columns of M1 (not rows) are XOR-ed with rows of M2.
np.bitwise_xor(M1.T, M2[:, None], out=intermediary).sum(-1)
This is just a longer comment on Arty's answer. There are a few things that can be done to speed up the Numba function.
Steps to improve performance
import numpy as np, numba
# randint's upper bound is exclusive: high=1 would produce all-zero matrices,
# so high=2 is needed to benchmark on genuinely random bits.
m1 = np.random.randint(low=0, high=2, size=(1_000, 1_000))
m2 = np.random.randint(low=0, high=2, size=(1_000, 1_000))
##Arty
@numba.njit(cache = True)
def matxor_1(m1, m2):
    """Return the XOR-sum "matrix product" of ``m2`` with ``m1``.

    Entry ``[i, j]`` of the result is ``sum_k(m2[i, k] ^ m1[k, j])``: row
    ``i`` of ``m2`` is XOR-ed elementwise with column ``j`` of ``m1`` and the
    resulting values are added up. The result is an int64 array of shape
    ``(m2.shape[0], m1.shape[1])``.

    Raises ValueError when ``m2`` does not have as many columns as ``m1``
    has rows.
    """
    # Reject mismatched inner dimensions up front; a length-1 axis would
    # otherwise broadcast silently and produce a wrong answer.
    if m1.shape[0] != m2.shape[1]:
        raise ValueError("m2 must have as many columns as m1 has rows")
    mr = np.empty((m2.shape[0], m1.shape[1]), dtype=np.int64)
    for i in range(mr.shape[0]):
        for j in range(mr.shape[1]):
            mr[i, j] = np.sum(m1[:, j] ^ m2[i, :])
    return mr
%timeit matxor_1(m1, m2)
#1.06 s ± 9.39 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#Aligned memory access (a real transpose: the ascontiguousarray call is important)
@numba.njit(cache = True)
def matxor_2(m1, m2):
    """Return the XOR-sum "matrix product" of ``m2`` with ``m1``.

    Same result as the plain double loop over ``m1[:, j] ^ m2[i, :]``, but
    ``m1`` is first copied into a contiguous transposed array so that both
    operands of the XOR are read along rows.

    Raises ValueError when ``m2`` does not have as many columns as ``m1``
    has rows.
    """
    # Reject mismatched inner dimensions up front; a length-1 axis would
    # otherwise broadcast silently and produce a wrong answer.
    if m1.shape[0] != m2.shape[1]:
        raise ValueError("m2 must have as many columns as m1 has rows")
    mr = np.empty((m2.shape[0], m1.shape[1]), dtype=np.int64)
    # Materialise the transpose: row j of m1_T is column j of m1.
    m1_T = np.ascontiguousarray(m1.T)
    for i in range(mr.shape[0]):
        for j in range(mr.shape[1]):
            mr[i, j] = np.sum(m1_T[j, :] ^ m2[i, :])
    return mr
%timeit matxor_2(m1, m2)
#312 ms ± 7.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#Writing out the inner loop
@numba.njit(fastmath=True,cache = True)
def matxor_3(m1, m2):
    """Return the XOR-sum "matrix product" of ``m2`` with ``m1``.

    Entry ``[i, j]`` is ``sum_k(m2[i, k] ^ m1[k, j])``, accumulated in an
    explicit scalar loop over ``k`` rather than through temporary arrays.

    Raises ValueError when ``m2`` does not have as many columns as ``m1``
    has rows.
    """
    # The k-loop below indexes m1_T with the column count of m2, so the inner
    # dimensions must agree; otherwise it would read past the end of m1_T
    # (or silently skip part of it).
    if m1.shape[0] != m2.shape[1]:
        raise ValueError("m2 must have as many columns as m1 has rows")
    mr = np.empty((m2.shape[0], m1.shape[1]), dtype=np.int64)
    # Materialise the transpose: row j of m1_T is column j of m1.
    m1_T = np.ascontiguousarray(m1.T)
    for i in range(mr.shape[0]):
        for j in range(mr.shape[1]):
            acc = 0
            for k in range(m2.shape[1]):
                acc += m1_T[j, k] ^ m2[i, k]
            mr[i, j] = acc
    return mr
%timeit matxor_3(m1, m2)
#125 ms ± 3.85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#Parallelization
@numba.njit(fastmath=True,cache = True,parallel=True)
def matxor_4(m1, m2):
    """Return the XOR-sum "matrix product" of ``m2`` with ``m1``.

    Entry ``[i, j]`` is ``sum_k(m2[i, k] ^ m1[k, j])``. The outer loop over
    rows of the result uses ``numba.prange``; each iteration writes only its
    own row of ``mr``.

    Raises ValueError when ``m2`` does not have as many columns as ``m1``
    has rows.
    """
    # The k-loop below indexes m1_T with the column count of m2, so the inner
    # dimensions must agree; otherwise it would read past the end of m1_T
    # (or silently skip part of it).
    if m1.shape[0] != m2.shape[1]:
        raise ValueError("m2 must have as many columns as m1 has rows")
    mr = np.empty((m2.shape[0], m1.shape[1]), dtype=np.int64)
    # Materialise the transpose: row j of m1_T is column j of m1.
    m1_T = np.ascontiguousarray(m1.T)
    for i in numba.prange(mr.shape[0]):
        for j in range(mr.shape[1]):
            acc = 0
            for k in range(m2.shape[1]):
                acc += m1_T[j, k] ^ m2[i, k]
            mr[i, j] = acc
    return mr
%timeit matxor_4(m1, m2)
#23.8 ms ± 711 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
print(np.allclose(matxor_1(m1, m2),matxor_2(m1, m2)))
#True
print(np.allclose(matxor_1(m1, m2),matxor_3(m1, m2)))
#True
print(np.allclose(matxor_1(m1, m2),matxor_4(m1, m2)))
#True
You can just make a combination of two loops and Numpy 1D xor-sum, like below:
Try it online!
import numpy as np
m1 = np.array([[1, 0, 0], [0, 0, 0], [0, 0, 0]])
m2 = np.array([[1, 0, 1], [0, 0, 1], [1, 1, 1]])
# mr[i, j] = sum_k(m2[i, k] ^ m1[k, j]): row i of m2 against column j of m1.
mr = np.empty((m2.shape[0], m1.shape[1]), dtype = np.int64)
for i in range(mr.shape[0]):
    for j in range(mr.shape[1]):
        mr[i, j] = np.sum(m1[:, j] ^ m2[i, :])
print(mr)
Output:
[[1 2 2]
[2 1 1]
[2 3 3]]
As @MadPhysicist suggested, you can use the Numba JIT optimizer (pip install numba) to speed up the code above; you'll get very fast code for your operations with small memory consumption:
Try it online!
import numpy as np, numba
@numba.njit(cache = True)
def matxor(m1, m2):
    """Return the XOR-sum "matrix product" of ``m2`` with ``m1``.

    Entry ``[i, j]`` of the result is ``sum_k(m2[i, k] ^ m1[k, j])``: row
    ``i`` of ``m2`` is XOR-ed elementwise with column ``j`` of ``m1`` and the
    resulting values are added up. The result is an int64 array of shape
    ``(m2.shape[0], m1.shape[1])``.

    Raises ValueError when ``m2`` does not have as many columns as ``m1``
    has rows.
    """
    # Reject mismatched inner dimensions up front; a length-1 axis would
    # otherwise broadcast silently and produce a wrong answer.
    if m1.shape[0] != m2.shape[1]:
        raise ValueError("m2 must have as many columns as m1 has rows")
    mr = np.empty((m2.shape[0], m1.shape[1]), dtype = np.int64)
    for i in range(mr.shape[0]):
        for j in range(mr.shape[1]):
            mr[i, j] = np.sum(m1[:, j] ^ m2[i, :])
    return mr
m1 = np.array([[1, 0, 0], [0, 0, 0], [0, 0, 0]])
m2 = np.array([[1, 0, 1], [0, 0, 1], [1, 1, 1]])
print(matxor(m1, m2))
The Numba code above can also be made up to 44x faster thanks to the following great improvements suggested and coded by @max9111:
import numpy as np, numba
# randint's upper bound is exclusive: high=1 would produce all-zero matrices,
# so high=2 is needed to benchmark on genuinely random bits.
m1 = np.random.randint(low=0, high=2, size=(1_000, 1_000))
m2 = np.random.randint(low=0, high=2, size=(1_000, 1_000))
##Arty
@numba.njit(cache = True)
def matxor_1(m1, m2):
    """Return the XOR-sum "matrix product" of ``m2`` with ``m1``.

    Entry ``[i, j]`` of the result is ``sum_k(m2[i, k] ^ m1[k, j])``: row
    ``i`` of ``m2`` is XOR-ed elementwise with column ``j`` of ``m1`` and the
    resulting values are added up. The result is an int64 array of shape
    ``(m2.shape[0], m1.shape[1])``.

    Raises ValueError when ``m2`` does not have as many columns as ``m1``
    has rows.
    """
    # Reject mismatched inner dimensions up front; a length-1 axis would
    # otherwise broadcast silently and produce a wrong answer.
    if m1.shape[0] != m2.shape[1]:
        raise ValueError("m2 must have as many columns as m1 has rows")
    mr = np.empty((m2.shape[0], m1.shape[1]), dtype=np.int64)
    for i in range(mr.shape[0]):
        for j in range(mr.shape[1]):
            mr[i, j] = np.sum(m1[:, j] ^ m2[i, :])
    return mr
%timeit matxor_1(m1, m2)
#1.06 s ± 9.39 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#Aligned memory access (a real transpose: the ascontiguousarray call is important)
@numba.njit(cache = True)
def matxor_2(m1, m2):
    """Return the XOR-sum "matrix product" of ``m2`` with ``m1``.

    Same result as the plain double loop over ``m1[:, j] ^ m2[i, :]``, but
    ``m1`` is first copied into a contiguous transposed array so that both
    operands of the XOR are read along rows.

    Raises ValueError when ``m2`` does not have as many columns as ``m1``
    has rows.
    """
    # Reject mismatched inner dimensions up front; a length-1 axis would
    # otherwise broadcast silently and produce a wrong answer.
    if m1.shape[0] != m2.shape[1]:
        raise ValueError("m2 must have as many columns as m1 has rows")
    mr = np.empty((m2.shape[0], m1.shape[1]), dtype=np.int64)
    # Materialise the transpose: row j of m1_T is column j of m1.
    m1_T = np.ascontiguousarray(m1.T)
    for i in range(mr.shape[0]):
        for j in range(mr.shape[1]):
            mr[i, j] = np.sum(m1_T[j, :] ^ m2[i, :])
    return mr
%timeit matxor_2(m1, m2)
#312 ms ± 7.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#Writing out the inner loop
@numba.njit(fastmath=True,cache = True)
def matxor_3(m1, m2):
    """Return the XOR-sum "matrix product" of ``m2`` with ``m1``.

    Entry ``[i, j]`` is ``sum_k(m2[i, k] ^ m1[k, j])``, accumulated in an
    explicit scalar loop over ``k`` rather than through temporary arrays.

    Raises ValueError when ``m2`` does not have as many columns as ``m1``
    has rows.
    """
    # The k-loop below indexes m1_T with the column count of m2, so the inner
    # dimensions must agree; otherwise it would read past the end of m1_T
    # (or silently skip part of it).
    if m1.shape[0] != m2.shape[1]:
        raise ValueError("m2 must have as many columns as m1 has rows")
    mr = np.empty((m2.shape[0], m1.shape[1]), dtype=np.int64)
    # Materialise the transpose: row j of m1_T is column j of m1.
    m1_T = np.ascontiguousarray(m1.T)
    for i in range(mr.shape[0]):
        for j in range(mr.shape[1]):
            acc = 0
            for k in range(m2.shape[1]):
                acc += m1_T[j, k] ^ m2[i, k]
            mr[i, j] = acc
    return mr
%timeit matxor_3(m1, m2)
#125 ms ± 3.85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#Parallelization
@numba.njit(fastmath=True,cache = True,parallel=True)
def matxor_4(m1, m2):
    """Return the XOR-sum "matrix product" of ``m2`` with ``m1``.

    Entry ``[i, j]`` is ``sum_k(m2[i, k] ^ m1[k, j])``. The outer loop over
    rows of the result uses ``numba.prange``; each iteration writes only its
    own row of ``mr``.

    Raises ValueError when ``m2`` does not have as many columns as ``m1``
    has rows.
    """
    # The k-loop below indexes m1_T with the column count of m2, so the inner
    # dimensions must agree; otherwise it would read past the end of m1_T
    # (or silently skip part of it).
    if m1.shape[0] != m2.shape[1]:
        raise ValueError("m2 must have as many columns as m1 has rows")
    mr = np.empty((m2.shape[0], m1.shape[1]), dtype=np.int64)
    # Materialise the transpose: row j of m1_T is column j of m1.
    m1_T = np.ascontiguousarray(m1.T)
    for i in numba.prange(mr.shape[0]):
        for j in range(mr.shape[1]):
            acc = 0
            for k in range(m2.shape[1]):
                acc += m1_T[j, k] ^ m2[i, k]
            mr[i, j] = acc
    return mr
%timeit matxor_4(m1, m2)
#23.8 ms ± 711 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
print(np.allclose(matxor_1(m1, m2),matxor_2(m1, m2)))
#True
print(np.allclose(matxor_1(m1, m2),matxor_3(m1, m2)))
#True
print(np.allclose(matxor_1(m1, m2),matxor_4(m1, m2)))
#True
I was trying to find something that is simpler for me to understand and runs faster. So, I came up with this.
To try and avoid loops, we can convert this problem into a matrix multiplication problem. Matrix multiplications are highly optimized in libraries like numpy(I guess).
XOR operation followed by summing calculates number of dissimilar bits between two bit vectors. We could count this value indirectly by using bipolar vectors, i.e. converting the 1,0 bit vectors to 1,-1 vectors.
Steps:
The matrix can be seen as a stack of bit vectors of length m. You can assume these bit vectors to be the rows of M1 and the columns of M2.
Obtain the product matrix M1 x M2.
Recover the number of dissimilar bits from the values in the product matrix using the following formula.
Each bit vector has m bits. m = length of bit vector
When comparing two bit vectors,
let k = number of similar bits
=> m-k = number of dissimilar bits
let r = intermediate result = result obtained by taking dot product of two 1,-1 vectors
clearly,
r = 1*k + (-1) (m-k)
r = k -m + k
r = 2k -m --(1)
OR
k = (m+r)/2 --(2) = sum(XNOR)
Given the bit vectors, we know m. We can compute r as described above.
We can compute k using (2)
But we need the number of dissimilar bits (m-k)
m-k = m - ((m+r)/2) = (2m-m-r)/2 = (m-r)/2
number of dissimilar bits = (m-k) = (m-r)/2 = sum(XOR)
Doing the same at the matrix level, we get:
Actual code:
import numpy as np
def matXOR_matXNOR(M1,M2):
    """Return the XOR-sum and XNOR-sum "matrix products" of two bit matrices.

    ``M1`` is an (m1 x n) matrix whose rows are bit vectors and ``M2`` is an
    (n x m2) matrix whose columns are bit vectors, both holding 0/1 values.
    Both matrices are mapped to bipolar (+1/-1) form and multiplied; for two
    length-n vectors with k equal bits the bipolar dot product is
    r = 2k - n, so the number of differing bits is (n - r) / 2 and the
    number of equal bits is (n + r) / 2.

    Returns a tuple ``(xorMat, xnorMat)`` of (m1 x m2) integer arrays.
    """
    # Cast to a signed type before mapping 0/1 -> -1/+1: with unsigned input
    # 2*0 - 1 would wrap around instead of giving -1.
    N1 = 2 * np.asarray(M1, dtype=np.int64) - 1
    N2 = 2 * np.asarray(M2, dtype=np.int64) - 1
    _, n = N1.shape  # n is the shared bit-vector length; N2 is n x m2
    intr = np.matmul(N1, N2)  # m1 x m2 matrix of bipolar dot products
    xorMat = (n - intr) // 2
    xnorMat = (n + intr) // 2
    return xorMat, xnorMat
Related
I have this piece of code:
# m, n and o (the three axis lengths) are expected to be defined beforehand.
# np.random.rand takes the dimensions as separate arguments, not as a tuple.
other = np.random.rand(m, n, o)
prev = np.random.rand(m, n, o, m, n, o)
mu = np.zeros((m, n, o, m, n, o))
# Only the "diagonal" entries mu[c, i, j, c, i, j] are filled; the rest stay 0.
for c in range(m):
    for i in range(n):
        for j in range(o):
            mu[c, i, j, c, i, j] = other[c, i, j] * prev[c, i, j, c, i, j]
And I'd like to simplify it using einsum notation (possibly saving time by skipping the for loops in python). However after a few tries I'm eventually not sure how to approach the problem. My current try is:
np.einsum('cijklm,cij->cijklm', prev, other)
It does not achieve the same result as the "for-loop" piece of code.
With shapes (2,3,4), I get:
In [52]: mu.shape
Out[52]: (2, 3, 4, 2, 3, 4)
This einsum expression complains that dimensions are repeated in the output:
In [53]: np.einsum('cijcij,cij->cijcij', prev, other).shape
Traceback (most recent call last):
File "<ipython-input-53-92862a0865a2>", line 1, in <module>
np.einsum('cijcij,cij->cijcij', prev, other).shape
File "<__array_function__ internals>", line 180, in einsum
File "/usr/local/lib/python3.8/dist-packages/numpy/core/einsumfunc.py", line 1359, in einsum
return c_einsum(*operands, **kwargs)
ValueError: einstein sum subscripts string includes output subscript 'c' multiple times
Without the repeat:
In [55]: x=np.einsum('cijcij,cij->cij', prev, other)
In [56]: x.shape
Out[56]: (2, 3, 4)
Nonzero values match:
In [57]: np.allclose(mu[np.nonzero(mu)].ravel(), x.ravel())
Out[57]: True
Or by extracting the diagonals from mu:
In [59]: I,J,K = np.ix_(np.arange(2),np.arange(3),np.arange(4))
In [60]: mu[I,J,K,I,J,K].shape
Out[60]: (2, 3, 4)
In [61]: np.allclose(mu[I,J,K,I,J,K],x)
Out[61]: True
Your einsum satisfies the same 'diagonals' test:
In [68]: y=np.einsum('cijklm,cij->cijklm', prev, other)
In [69]: y.shape
Out[69]: (2, 3, 4, 2, 3, 4)
In [70]: np.allclose(y[I,J,K,I,J,K],x)
Out[70]: True
So the mu values are also present in y, but distributed in a different way. But the arrays are too big to readily view and compare.
OK, each y[i,j,k] is the same, and equal to x. In mu most of these values are 0, with only selected diagonals being nonzero.
While einsum can generate the same nonzero values, it cannot distribute them in the same 3d diagonals way as your loop.
Changing your mu calculation to produce a 3d array:
In [76]: nu = np.zeros((m,n,o))
...: for c in range(m):
...: for i in range(n):
...: for j in range(o):
...: nu[c,i,j] = other[c,i,j]*prev[c,i,j,c,i,j]
...:
In [77]: np.allclose(nu,x)
Out[77]: True
edit
We can assign einsum result to the diagonals with:
In [134]: out = np.zeros((2,3,4,2,3,4))
In [135]: out[I,J,K,I,J,K] = x
In [136]: np.allclose(out, mu)
Out[136]: True
Conceptually it may be simpler than the as_strided solution. And may be just as fast. as_strided, while making a view, is not as fast as a reshape kind of view.
In [143]: %%timeit
...: out = np.zeros((m, n, o, m, n, o))
...: mu_view = np.lib.stride_tricks.as_strided(out,
...: shape=(m, n, o),
...: strides=[sum(mu.strides[i::3]) for i in range(3)
...: ]
...: )
...: np.einsum('cijcij,cij->cij', prev, other, out=mu_view)
...:
...:
31.6 µs ± 69.1 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
In [144]: %%timeit
...: out = np.zeros((2,3,4,2,3,4))
...: out[I,J,K,I,J,K] =np.einsum('cijcij,cij->cij', prev, other)
...:
...:
18.5 µs ± 178 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
or including the I,J,K generation in the time loop
In [146]: %%timeit
...: I,J,K = np.ix_(np.arange(2),np.arange(3),np.arange(4))
...: out = np.zeros((2,3,4,2,3,4))
...: out[I,J,K,I,J,K] =np.einsum('cijcij,cij->cij', prev, other)
40.4 µs ± 1.45 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
or creating the IJK directly:
In [151]: %%timeit
...: I,J,K = np.arange(2)[:,None,None],np.arange(3)[:,None],np.arange(4)
...: out = np.zeros((2,3,4,2,3,4))
...: out[I,J,K,I,J,K] =np.einsum('cijcij,cij->cij', prev, other)
25.1 µs ± 38.3 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
It is not possible to get this result using np.einsum() alone, but you can try this:
import numpy as np
from numpy.lib.stride_tricks import as_strided
# Lengths of the three axes of the index space.
m, n, o = 2, 3, 5

np.random.seed(0)
other = np.random.rand(m, n, o)
prev = np.random.rand(m, n, o, m, n, o)

mu = np.zeros((m, n, o, m, n, o))
# View of mu whose element [c, i, j] is mu[c, i, j, c, i, j]: advancing view
# axis k by one advances axes k and k + 3 of mu together, so its stride is
# the sum of those two strides.
mu_view = as_strided(
    mu,
    shape=(m, n, o),
    strides=[mu.strides[k] + mu.strides[k + 3] for k in range(3)],
)
# Write the diagonal products straight into mu through the view.
np.einsum('cijcij,cij->cij', prev, other, out=mu_view)
The array mu should be then the same as the one produced by the code using nested loops in the question.
Some explanation. Regardless of the shape of a numpy array, internally its elements are stored in a contiguous block of memory. Part of the structure of an array are its strides, which specify how many bytes one needs to jump when one of the indices of an array element is incremented by 1. Thus, in a 2-dimensional array arr, arr.strides[0] is the number of bytes separating an element arr[i, j] from arr[i+1, j] and arr.strides[1] is the number of bytes separating arr[i, j] from arr[i, j+1]. Using the strides information numpy can find a given element in an array based on its indices. See e.g. this post for more details.
numpy.lib.stride_tricks.as_strided is a function that creates a view of a given array with custom-made strides. By specifying strides, one can change which array element corresponds to which indices. In the code above this is used to create mu_view, which is a view of mu with the property, that the element mu_view[c, i, j] is the element mu[c, i, j, c, i, j]. This is done by specifying strides of mu_view in terms of strides of mu. For example, the distance between mu_view[c, i, j] and mu_view[c+1, i, j] is set to be the distance between mu[c, i, j, c, i, j] and mu[c+1, i, j, c+1, i, j], which is mu.strides[0] + mu.strides[3].
Is there an efficient way to calculate sum of bits in each column over array in Python?
Example (Python 3.7 and Numpy 1.20.1):
Create numpy array with values 0 or 1
import numpy as np
array = np.array(
[
[1, 0, 1],
[1, 1, 1],
[0, 0, 1],
]
)
Compress size by np.packbits
pack_array = np.packbits(array, axis=1)
Expected result: sum of bits in each position (column) without np.unpackbits to get the same as array.sum(axis=0):
array([2, 1, 3])
I found just very slow solution:
# Number of bit positions (columns) in the unpacked array.
dim = array.shape[1]
# Identity matrix: row j has a single 1 in column j, i.e. a one-hot mask for bit position j.
candidates = np.zeros((dim, dim)).astype(int)
np.fill_diagonal(candidates, 1)
# Pack every one-hot row along axis 1, the same way pack_array was packed, so the masks line up bytewise.
pack_candidates = np.packbits(candidates, axis=1)
# For every packed mask c, count the rows of pack_array in which all bits of c are set.
np.apply_along_axis(lambda c:np.sum((np.bitwise_and(pack_array, c) == c).all(axis=1)), 1, pack_candidates)
Using np.unpackbits can be problematic if the input array is big, since the resulting array can be too big to fit in RAM, and even if it does fit in RAM, this would be far from efficient since the huge array has to be written to and read from the (slow) main memory. The same thing applies to CPU caches: smaller arrays can generally be computed faster. Moreover, np.unpackbits has quite a big overhead for small arrays.
AFAIK, it is not possible to do this operation very efficiently in Numpy while using a small amount of RAM (i.e. other than by using np.unpackbits, as pointed out by @mathfux). However, Numba can be used to speed up this computation, especially for small arrays. Here is the code:
@nb.njit('int32[::1](uint8[:,::1], int_)')
def bitSum(packed, m):
    """Count, per bit position, how many rows of ``packed`` have that bit set.

    ``packed`` is a 2-D array of bytes in which each row stores its bits
    8 per byte, most significant bit first (bit position ``j`` lives in byte
    ``j // 8`` under the mask ``128 >> (j % 8)``). ``m`` is the number of
    meaningful bit positions per row and must fit the byte width of
    ``packed``.

    Returns an int32 array of length ``m``.
    """
    n = packed.shape[0]
    # m must fall inside the last byte of each row.
    assert packed.shape[1]*8-7 <= m <= packed.shape[1]*8
    res = np.zeros(m, dtype=np.int32)
    for i in range(n):
        for j in range(m):
            # bool(...) turns the masked byte into 0/1 before accumulating.
            res[j] += bool(packed[i, j//8] & (128>>(j%8)))
    return res
If you want a faster implementation, you can optimize the code by working on fixed-size tiles. However, this makes the code also more complex. Here is the resulting code:
@nb.njit('int32[::1](uint8[:,::1], int_)')
def bitSumOpt(packed, m):
    """Count, per bit position, how many rows of ``packed`` have that bit set.

    Same contract as a plain row-by-row, bit-by-bit count, but the work is
    done on tiles of 4 rows x 8 bit positions (one byte column) where a full
    tile is available, with a bit-by-bit fallback for the ragged edges.

    ``packed`` is a 2-D array of bytes storing bits 8 per byte, most
    significant bit first; ``m`` is the number of meaningful bit positions
    per row. Returns an int32 array of length ``m``.
    """
    n = packed.shape[0]
    # m must fall inside the last byte of each row.
    assert packed.shape[1]*8-7 <= m <= packed.shape[1]*8
    res = np.zeros(m, dtype=np.int32)
    for i in range(0, n, 4):
        for j in range(0, m, 8):
            if i+3 < n and j+7 < m:
                # Full 4x8 tile: load the four bytes of this byte column once.
                k = j//8
                b0, b1, b2, b3 = packed[i,k], packed[i+1,k], packed[i+2,k], packed[i+3,k]
                for j2 in range(8):
                    shift = 7 - j2
                    # Shift before masking so every term is 0 or 1; summing
                    # the masked (unshifted) bytes could exceed the range of
                    # an 8-bit value when the bytes are not widened first.
                    res[j+j2] += ((b0 >> shift) & 1) + ((b1 >> shift) & 1) + ((b2 >> shift) & 1) + ((b3 >> shift) & 1)
            else:
                # Slow fallback for partial tiles at the bottom/right edges.
                for i2 in range(i, min(i+4, n)):
                    for j2 in range(j, min(j+8, m)):
                        res[j2] += bool(packed[i2, j2//8] & (128>>(j2%8)))
    return res
Here are performance results on my machine:
On the example array:
Initial code: 62.90 us (x1)
numpy_sumbits: 4.37 us (x14)
bitSumOpt: 0.84 us (x75)
bitSum: 0.77 us (x82)
On a random 2000x2000 array:
Initial code: 1203.8 ms (x1)
numpy_sumbits: 3.9 ms (x308)
bitSum: 2.7 ms (x446)
bitSumOpt: 1.5 ms (x802)
The memory footprint of the Numba implementations is much better too (at least 8 times smaller).
It seems there is no better option in numpy than numpy.unpackbits.
To be more clear, let's take another example:
array = np.array([[1, 0, 1, 0, 1, 1, 1, 0, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1],
[0, 0, 1, 0, 0, 0, 0, 0, 0]])
pack_array = np.packbits(array, axis=1)
dim = array.shape[1]
Now, pack_array is calculated in this way:
[[1,0,1,0,1,1,1,0], [1,0,0,0,0,0,0,0]] -> [174, 128]
[[1,1,1,1,1,1,1,1], [1,0,0,0,0,0,0,0]] -> [255, 128]
[[0,0,1,0,0,0,0,0], [0,0,0,0,0,0,0,0]] -> [32, 0]
I've tested various algorithms and unpacking bits seems to be the fastest:
def numpy_sumbits(pack_array, dim):
    """Return the per-column bit counts of a row-packed bit matrix.

    ``pack_array`` is unpacked along axis 1 back to ``dim`` bit columns
    (dropping the padding bits of the last byte) and summed over the rows.
    """
    out = np.unpackbits(pack_array, axis=1, count=dim)
    arr = np.sum(out, axis=0)
    return arr
def manual_sumbits(pack_array, dim):
    """Return the per-column bit counts of a row-packed bit matrix.

    Works on the packed bytes directly: for each of the 8 bit positions
    inside a byte, the corresponding bit of every byte is extracted, summed
    over the rows, and written to the matching output slots.

    ``pack_array`` is a 2-D array of bytes with bits packed most significant
    bit first along axis 1; ``dim`` is the number of meaningful bit columns.
    """
    # Size the buffer from the actual number of byte columns. Deriving it from
    # dim (dim // 8 + 1 bytes) over-allocates by one byte whenever dim is a
    # multiple of 8, and the strided assignment below then no longer matches.
    n_bytes = pack_array.shape[1]
    out = np.empty(n_bytes * 8, dtype=int)
    for shift in range(8):
        # Bit `shift` (counted from the least significant end) of byte b is
        # bit column 8*b + 7 - shift of the unpacked matrix.
        out[7 - shift::8] = np.sum((pack_array >> shift) & 1, axis=0)
    return out[:dim]
def numpy_sumshifts(pack_array, dim):
    """Return the per-column bit counts of a row-packed bit matrix.

    Every byte is expanded into its 8 bits by right-shifting it by 0..7 and
    taking the result modulo 2; the bits are then summed over the rows and
    reordered so that the most significant bit of each byte comes first.
    """
    # One row per byte, one column per shift amount (least significant bit first).
    res = (pack_array.reshape(pack_array.size, -1) >> np.arange(8)) % 2
    res = res.reshape(*pack_array.shape, 8)
    # Sum over rows, flip each byte to MSB-first order, flatten, drop padding.
    return np.sum(res, axis=0)[:,::-1].ravel()[:dim]
# Call the functions under the names they are defined with above.
print(numpy_sumbits(pack_array, dim))
print(manual_sumbits(pack_array, dim))
print(numpy_sumshifts(pack_array, dim))
>>>
[2 1 3 1 2 2 2 1 2]
[2 1 3 1 2 2 2 1 2]
[2 1 3 1 2 2 2 1 2]
%%timeit
numpy_sumbits(pack_array, dim)
>>> 3.49 ms ± 57.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
manual_sumbits(pack_array, dim)
>>> 10 ms ± 22.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
numpy_sumshifts(pack_array, dim)
>>> 20.1 ms ± 97.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
How can I compute the variance without the zero elements?
For example
np.var([[1, 1], [1, 2]], axis=1) -> [0, 0.25]
I need:
var([[1, 1, 0], [1, 2, 0]], axis=1) -> [0, 0.25]
Is this what you are looking for? You can filter out the columns where all values are 0 (i.e. keep those where at least one value is not 0).
m = np.array([[1, 1, 0], [1, 2, 0]])
np.var(m[:, np.any(m != 0, axis=0)], axis=1)
# Output
array([0. , 0.25])
V1
You can use a masked array:
data = np.array([[1, 1, 0], [1, 2, 0]])
np.ma.array(data, mask=(data == 0)).var(axis=1)
The result is
masked_array(data=[0. , 0.25],
mask=False,
fill_value=1e+20)
The raw numpy array is the data attribute of the resulting masked array:
>>> np.ma.array(data, mask=(data == 0)).var(axis=1).data
array([0. , 0.25])
V2
Without masked arrays, the operation of removing a variable number of elements in each row is a bit tricky. It would be simpler to implement the variance in terms of the formula sum(x**2) / N - (sum(x) / N)**2 and partial reduction of ufuncs.
First we need to find the split indices and segment lengths. In the general case, that looks like
lens = np.count_nonzero(data, axis=1)
inds = np.r_[0, lens[:-1].cumsum()]
Now you can operate on the raveled masked data:
mdata = data[data != 0]
mdata2 = mdata**2
var = np.add.reduceat(mdata2, inds) / lens - (np.add.reduceat(mdata, inds) / lens)**2
This gives you the same result for var (probably more efficiently than the masked version by the way):
array([0. , 0.25])
V3
The var function appears to use the more traditional formula ((x - x.mean())**2).mean(). You can implement that using the quantities above with just a bit more work:
means = (np.add.reduceat(mdata, inds) / lens).repeat(lens)
var = np.add.reduceat((mdata - means)**2, inds) / lens
Comparison
Here is a quick benchmark for the three approaches:
def nzvar_v1(data):
    """Return the row-wise variance of ``data`` with zero entries ignored.

    Zeros are masked out with a NumPy masked array; the plain data array of
    the masked result is returned.
    """
    return np.ma.array(data, mask=(data == 0)).var(axis=1).data
def nzvar_v2(data):
    """Return the row-wise variance of ``data`` with zero entries ignored.

    Uses ``sum(x**2) / N - (sum(x) / N)**2`` on the flattened non-zero
    values, with ``np.add.reduceat`` producing the per-row sums.

    Rows that contain no non-zero entry have no defined variance and yield
    NaN.
    """
    lens = np.count_nonzero(data, axis=1)
    # Start offset of every row inside the flattened non-zero data
    # (exclusive prefix sum of the row lengths).
    starts = np.cumsum(lens) - lens
    mdata = data[data != 0]
    out = np.full(lens.shape, np.nan)
    # reduceat cannot express an empty segment (a repeated start index returns
    # a single element, and a start equal to len(mdata) is out of bounds), so
    # only rows that actually have non-zero entries are reduced.
    keep = lens > 0
    if keep.any():
        cnt = lens[keep]
        seg = starts[keep]
        mean_sq = np.add.reduceat(mdata**2, seg) / cnt
        mean = np.add.reduceat(mdata, seg) / cnt
        out[keep] = mean_sq - mean**2
    return out
def nzvar_v3(data):
    """Return the row-wise variance of ``data`` with zero entries ignored.

    Uses the mean of squared deviations from the row mean, computed on the
    flattened non-zero values with ``np.add.reduceat`` producing the per-row
    sums.

    Rows that contain no non-zero entry have no defined variance and yield
    NaN.
    """
    lens = np.count_nonzero(data, axis=1)
    # Start offset of every row inside the flattened non-zero data
    # (exclusive prefix sum of the row lengths).
    starts = np.cumsum(lens) - lens
    mdata = data[data != 0]
    out = np.full(lens.shape, np.nan)
    # reduceat cannot express an empty segment (a repeated start index returns
    # a single element, and a start equal to len(mdata) is out of bounds), so
    # only rows that actually have non-zero entries are reduced.
    keep = lens > 0
    if keep.any():
        cnt = lens[keep]
        seg = starts[keep]
        # Broadcast each row mean back over that row's non-zero values.
        means = (np.add.reduceat(mdata, seg) / cnt).repeat(cnt)
        out[keep] = np.add.reduceat((mdata - means)**2, seg) / cnt
    return out
np.random.seed(100)
data = np.random.randint(10, size=(1000, 1000))
%timeit nzvar_v1(data)
18.3 ms ± 278 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit nzvar_v2(data)
5.89 ms ± 69.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit nzvar_v3(data)
11.8 ms ± 62.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
So for a large dataset, the second approach, while requiring a bit more code, appears to be ~3x faster than masked arrays and ~2x faster than using the traditional formulation.
I have a data of shape d X N (each column is a vector of features)
I have this code for calculating the kernel matrix:
def kernel(x1, x2):
    """Linear kernel: return the inner product of ``x1`` and ``x2``."""
    # '@' is the matrix-multiplication operator; for 1-D vectors the transpose
    # is a no-op and the product is a scalar.
    return x1.T @ x2
data = np.array([[1,2,3], [1,2,3], [1,2,3]])
# Build the kernel (Gram) matrix one entry at a time: entry [i][j] is the
# kernel of columns i and j of data.
result = []
for i in range(data.shape[1]):
    current_result = []
    for j in range(data.shape[1]):
        x1 = data[:, i]
        x2 = data[:, j]
        current_result.append(kernel(x1, x2))
    result.append(current_result)
np.array(result)
and I am getting this result:
array([[ 3, 6, 9],
[ 6, 12, 18],
[ 9, 18, 27]])
The problem is that this code is too slow, so I tried to use np.vectorize:
vec = np.vectorize(kernel, signature='(n),(n)->()')
vec(data, data)
But I am getting the wrong result:
array([14, 14, 14])
what am I doing wrong?
When tested for bigger dimensions of your problem, and random numbers to ensure the robustness, for instance with dimensions (100,200), there are several ways:
import numpy as np
def kernel(x1, x2):
    """Linear kernel: return the inner product of ``x1`` and ``x2``."""
    # '@' is the matrix-multiplication operator; for 1-D vectors the transpose
    # is a no-op and the product is a scalar.
    return x1.T @ x2
def kernel_kenny(a):
    """Return the kernel matrix of the columns of ``a``, built with plain loops.

    Entry ``[i, j]`` of the result is ``kernel(a[:, i], a[:, j])``. This is
    the explicit double-loop formulation kept as the baseline for the
    vectorised alternatives.
    """
    result = []
    for i in range(a.shape[1]):
        current_result = []
        for j in range(a.shape[1]):
            x1 = a[:, i]
            x2 = a[:, j]
            current_result.append(kernel(x1, x2))
        result.append(current_result)
    return np.array(result)
a = np.random.random((100,200))
res1 = kernel_kenny(a)
# perhaps einsum signature might help you to understand the calculations
res2 = np.einsum('ji,jk->ik', a, a, optimize=True)
# or the following if you want to explicitly specify the transpose
# res2 = np.einsum('ij,jk->ik', a.T, a, optimize=True)
# or simply the matrix product of the transpose with the matrix itself
res3 = a.T @ a
Here are the sanity checks:
np.allclose(res1,res2)
>>> True
np.allclose(res1,res3)
>>> True
and timings:
%timeit kernel_kenny(a)
>>> 83.2 ms ± 425 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit np.einsum('ji,jk->ik', a, a, optimize=True)
>>> 325 µs ± 4.15 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit a.T @ a
>>> 82 µs ± 9.39 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
I've been computing pairwise distances with scipy, and I am trying to get distances to two of the closest neighbors. My current working solution is:
dists = squareform(pdist(xs.todense()))
dists = np.sort(dists, axis=1)[:, 1:3]
However, the squareform method is spatially very expensive and somewhat redundant in my case. I only need the two closest distances, not all of them. Is there a simple workaround?
Thanks!
The relation between linear index and the (i, j) of the upper triangle distance matrix is not directly, or easily, invertible (see note 2 in squareform doc).
However, by looping over all indices the inverse relation can be obtained:
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist
def inverse_condensed_indices(idx, n):
    """Map a condensed (upper-triangle) index back to its ``(i, j)`` pair.

    The pairs ``(i, j)`` with ``0 <= i < j < n`` are numbered row by row:
    ``(0, 1), (0, 2), ..., (0, n-1), (1, 2), ...``. Returns the pair whose
    number equals ``idx``, or ``None`` when no pair has that number.
    """
    k = 0  # condensed index of the first pair of the current row
    for i in range(n):
        row_len = n - 1 - i  # number of pairs (i, j) with j > i
        if idx < k + row_len:
            # idx can only belong to this row; rows before it are skipped
            # wholesale instead of being enumerated pair by pair.
            for j in range(i + 1, n):
                if k == idx:
                    return (i, j)
                k += 1
            return None
        k += row_len
    return None
# test
points = np.random.rand(8, 2)
distances = pdist(points)
sorted_idx = np.argsort(distances)
n = points.shape[0]
# (i, j) point pairs belonging to the two smallest condensed distances.
ij = [inverse_condensed_indices(idx, n)
      for idx in sorted_idx[:2]]
# graph
plt.figure(figsize=(5, 5))
# Draw a red segment between the two points of each selected pair.
for i, j in ij:
    x = [points[i, 0], points[j, 0]]
    y = [points[i, 1], points[j, 1]]
    plt.plot(x, y, '-', color='red');
plt.plot(points[:, 0], points[:, 1], '.', color='black');
plt.xlim(0, 1); plt.ylim(0, 1);
It seems to be a little faster than using squareform:
%timeit squareform(range(28))
# 9.23 µs ± 63 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit inverse_condensed_indices(27, 8)
# 2.38 µs ± 25 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)