Optimize a distance matrix calculation - python

I'm trying to calculate a distance matrix from the first two components of a Fourier transform. The matrix is 40k by 40k, and the way I'm doing it is extremely slow. Is there a more efficient way to calculate the matrix?
import numpy as np
from scipy.linalg import dft
# Transform the data using the Fourier transform: multiply each row by the
# 8-point DFT matrix and scale by 1/sqrt(8).
# np.sqrt is used because a bare `sqrt` is never imported in this snippet.
ft = norm_data.dot(dft(8).transpose()) / np.sqrt(8)
def ft_distance_calc(x, y):
    """Return the x-by-y matrix of Euclidean distances between rows of ``ft``.

    Entry ``[i, z]`` is the distance between ``ft[i, 0:2]`` and ``ft[z, 0:2]``,
    so only the first two components of each row are compared.  ``abs`` is
    applied before squaring, which makes complex entries contribute ``|d|**2``.

    Parameters:
        x: number of leading rows of ``ft`` used for the matrix rows.
        y: number of leading rows of ``ft`` used for the matrix columns.

    Returns:
        A float array of shape ``(x, y)``.
    """
    temp = np.zeros((x, y))
    for i in range(x):
        for z in range(y):
            # np.sqrt: a bare `sqrt` is not imported here and raised NameError.
            temp[i, z] = np.sqrt(np.square(abs(ft[i, 0:2] - ft[z, 0:2])).sum())
    return temp
# NOTE: the result is a 40000 x 40000 array of float64 (the np.zeros default),
# i.e. 40000 * 40000 * 8 bytes = 12.8 GB, filled by 1.6e9 Python-level iterations.
ft_distance = ft_distance_calc(40000,40000)

You can use built-in functions for it:
from scipy.spatial.distance import cdist
def ft_distance_calc_2(x, y):
    """Return the x-by-y Euclidean distance matrix between rows of ``ft``.

    Only the first two components of each row are used.  ``cdist`` operates on
    real coordinates, so complex Fourier coefficients are first split into
    their real and imaginary parts; the Euclidean distance between the
    resulting real vectors equals the complex distance
    ``sqrt(sum(|u - v|**2))``.

    Parameters:
        x: number of leading rows of ``ft`` used for the matrix rows.
        y: number of leading rows of ``ft`` used for the matrix columns.

    Returns:
        A float array of shape ``(x, y)``.
    """
    left = ft[:x, 0:2]
    right = ft[:y, 0:2]
    if np.iscomplexobj(ft):
        # |u - v|**2 = Re(u - v)**2 + Im(u - v)**2, component by component.
        left = np.hstack((left.real, left.imag))
        right = np.hstack((right.real, right.imag))
    return cdist(left, right)
Comparison using benchit:
# OP's solution
def ft_distance_calc(x, y):
    """Baseline: fill the x-by-y distance matrix one entry at a time.

    Entry ``[row, col]`` is the Euclidean distance between the first two
    components of ``ft[row]`` and ``ft[col]``.
    """
    out = np.zeros((x, y))
    for row in range(x):
        left = ft[row, 0:2]
        for col in range(y):
            diff = left - ft[col, 0:2]
            out[row, col] = np.sqrt(np.square(abs(diff)).sum())
    return out
## Ehsan's solution
def ft_distance_calc_2(x, y):
    """Return the x-by-y Euclidean distance matrix between rows of ``ft``.

    Uses ``cdist`` on the first two components of each row.  ``cdist`` works
    on real coordinates, so complex entries are split into real and imaginary
    columns first; real-valued ``ft`` is passed through unchanged.
    """
    left = ft[:x, 0:2]
    right = ft[:y, 0:2]
    if np.iscomplexobj(ft):
        # |u - v|**2 = Re(u - v)**2 + Im(u - v)**2, component by component.
        left = np.hstack((left.real, left.imag))
        right = np.hstack((right.real, right.imag))
    return cdist(left, right)
## Quang's solution
def dist_cal(x, y):
    """Return the x-by-y Euclidean distance matrix between rows of ``ft``.

    Broadcasting builds an intermediate of shape ``(x, y, 2)`` holding every
    pairwise difference of the first two components, which is then reduced
    over the last axis.
    """
    diff = ft[:x, None, :2] - ft[None, :y, :2]
    # Take the magnitude before squaring: for complex entries np.square(d) is
    # d*d rather than |d|**2.  For real entries the result is unchanged.
    return np.sqrt(np.square(np.abs(diff)).sum(-1))
# Random real-valued stand-in for the transformed data: 1000 rows, 2 columns.
ft = np.random.rand(1000, 2)
# One entry per problem size n; each value is the [x, y] argument pair.
in_ = dict((n, [n, n]) for n in (10, 100, 1000))
Seems like ft_distance_calc_2 is the fastest.

How about using broadcasting:
def dist_cal(x, y):
    """Return the x-by-y Euclidean distance matrix between rows of ``ft``.

    Broadcasting builds an intermediate of shape ``(x, y, 2)`` holding every
    pairwise difference of the first two components, which is then reduced
    over the last axis.

    Parameters:
        x: number of leading rows of ``ft`` used for the matrix rows.
        y: number of leading rows of ``ft`` used for the matrix columns.

    Returns:
        A float array of shape ``(x, y)``.
    """
    diff = ft[:x, None, :2] - ft[None, :y, :2]
    # Take the magnitude before squaring: for complex entries np.square(d) is
    # d*d rather than |d|**2.  For real entries the result is unchanged.
    return np.sqrt(np.square(np.abs(diff)).sum(-1))
# test: the broadcast result should agree with the loop-based reference
a = ft_distance_calc(400, 200)
b = dist_cal(400, 200)
np.all(np.abs(a - b) < 1e-6)
# True

Related

discrete cosine transform implementation differs from library function

I've implemented my own DCT function, but the output differs from scipy's fftpack dct function. I was wondering if anyone knows whether fftpack.dct( ) does any additional transformations, and if so what they are ?
Note: I've tried subtracting 128 from the data but that just changes the colors, not the frequency locations.
import numpy as np
from numpy import empty,arange,exp,real,imag,pi
from numpy.fft import rfft,irfft
import matplotlib.pyplot as plt
from scipy import fftpack
def dct(x):
    """Return the 1-D discrete cosine transform of ``x`` computed via a real FFT.

    The input is mirrored into a length-2N sequence (``x`` followed by ``x``
    reversed) and transformed with ``rfft``.  The first N coefficients are
    multiplied by ``exp(-1j*pi*k/(2*N))`` and the real part is returned.
    """
    n_samples = len(x)
    mirrored = empty(2 * n_samples, float)
    mirrored[:n_samples] = x[:]
    mirrored[n_samples:] = x[::-1]
    spectrum = rfft(mirrored)[:n_samples]
    phase = exp(-1j * pi * arange(n_samples) / (2 * n_samples))
    return real(phase * spectrum)
def dct2(x):
    """Return the 2-D DCT of ``x``: ``dct`` along every row, then every column."""
    n_rows, n_cols = x.shape[0], x.shape[1]
    # First pass: transform each row.
    row_pass = empty([n_rows, n_cols], float)
    for r in range(n_rows):
        row_pass[r, :] = dct(x[r, :])
    # Second pass: transform each column of the row-transformed data.
    result = empty([n_rows, n_cols], float)
    for c in range(n_cols):
        result[:, c] = dct(row_pass[:, c])
    return result
if __name__ == "__main__":
    # Small 5x7 test image with its largest values in the centre.
    data = np.array([
        [0, 0, 0, 20, 0, 0, 0],
        [0, 0, 20, 50, 20, 0, 0],
        [0, 7, 50, 90, 50, 7, 0],
        [0, 0, 20, 50, 20, 0, 0],
        [0, 0, 0, 20, 0, 0, 0],
    ])
    # Hand-written 2-D transform versus SciPy's dct with default arguments.
    X = dct2(data)
    X2 = fftpack.dct(data)
    # One figure per result, in the same order as before (X first, then X2).
    plt.matshow(X)
    plt.matshow(X2)
data:
X:
X2:
The scipy.fftpack.dct performs the 1D dct transform whereas you implemented the 2d dct transform. To perform the 2D dct using scipy use:
# Apply the 1-D DCT along axis 0 first, then along axis 1 of that result.
X2 = fftpack.dct(data, axis=0)
X2 = fftpack.dct(X2, axis=1)
This should solve your problem, since the resulting matrix using your example will be:
Which is similar to your implementation up to a constant factor. The constant factor can be controlled using the norm argument to the dct, read more here

3D Fourier transformation of a gaussian function in python

I'm trying to get the 3D Fourier Transform of the gaussian function e^(-r^(2)/2) in python using the numpy.fft library.
I've attempted using different ffts from the library with different inputs, shifting the results with np.fft.fftshift, trying to find a multiplicative factor and many other things, the last thing I tried was using the 1D fft function, and then cubing the result, here's the corresponding source code:
import numpy as np
R, N = float(10), float(100)
y = np.dtype(np.float64)
# Step between consecutive grid points: R divided into N intervals.
dr = R / N
def F(x):
    """Evaluate exp(-(x*dr)**2 / 2), the Gaussian at grid index ``x``."""
    radius = x * dr
    return np.exp(-(radius ** 2) / 2)
Frange = np.arange(1, int(N) + 1)
y = np.zeros(int(N))
# Sample F at the grid indices 1..N, one element at a time.
for i in range(int(N)):
    y[i] = F(Frange[i])
y = y / 3
# Magnitude of the 1-D FFT, re-centred with fftshift, then cubed.
magnitude = np.abs(np.fft.fft(y))
y_fft = np.fft.fftshift(magnitude) ** 3
print (y_fft)
The first values I get:
4.62e-03, 4.63e-03, 4.65e-03, 4.69e-03, 4.74e-03
According to Lado, Fred (1971), *Numerical Fourier transforms in one, two, and three dimensions for liquid state calculations*, the analytic solution to the problem is $(2\pi)^{3/2}\, e^{-k^{2}/2}$.
And the first values of the analytic solution with the same values of R and N are:
14.99, 12.92, 10.10, 7.15, 4.58
I also created a DFT program using a formula provided in the previous article which gives the expected results, but I haven't been able to replicate the analytic results in any of my attempts using the NumPy or SciPy fft libraries.
Here's my program for the analytic and DFT results:
import math
import numpy as np
def F(r):
    """Return the Gaussian exp(-r**2 / 2) evaluated at ``r``."""
    return math.exp(-(r ** 2) / 2)
def FT(r):
    """Return the analytic transform (2*pi)**(3/2) * exp(-r**2 / 2) at ``r``."""
    prefactor = (2 * math.pi) ** (3 / 2)
    return prefactor * math.exp(-(r ** 2) / 2)
R = float(10)
N = int(100)
# Step sizes in r and in k.
dr = R / N
dk = math.pi / R
# Output buffers for the discrete (ft) and analytic (fta) transforms.
ft = np.zeros(N)
fta = np.zeros(N)
print ("\tk \t\t\t Discrete \t\t\t Analytic")
for j in range(1, N):
    kj = j * dk
    # Discrete transform: (4*pi/k) * sum_i dr * r_i * F(r_i) * sin(k * r_i).
    # The accumulator is named `total` so the builtin `sum` is not shadowed;
    # terms are still added one by one in the same order as before.
    total = 0
    for i in range(1, N):
        ri = i * dr
        total = total + (dr * ri * (F(ri)) * (math.sin(kj * ri)))
    ft[j] = ((4 * math.pi) / kj) * total
    # Analytic transform
    fta[j] = FT(kj)
    # Print results
    print(kj, f" \t\t{ft[j]:.10E} \t\t{fta[j]:.10E}")
And these are the first few results:
k Discrete Analytic
0.3141592653589793 1.4991263193E+01 1.4991263193E+01
0.6283185307179586 1.2928362116E+01 1.2928362116E+01
0.9424777960769379 1.0101494686E+01 1.0101494686E+01
1.2566370614359172 7.1509645344E+00 7.1509645344E+00
1.5707963267948966 4.5864901093E+00 4.5864901093E+00

Compute commutation matrix in numpy/scipy efficiently

I am trying to compute the commutation matrix in Python for a large dataset. I wrote the following code but found it performs terribly (and runs into memory errors for examples of around 500 by 500). In my code a and b are equivalent to the m and n notation in the linked Wikipedia page. Can anyone provide a quicker and more memory-efficient alternative to my current attempt?
def vec(matrix):
    """Return the column-stacked vectorisation of ``matrix`` as an (m*n, 1) array."""
    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    # Transposing first makes the row-major reshape walk the columns of `matrix`.
    return matrix.T.reshape(n_rows * n_cols, 1)
def commutation(a, b):
    """Return the (a*b)-by-(a*b) commutation matrix as a SciPy CSR matrix.

    For an a-by-b matrix ``A`` the result ``K`` satisfies
    ``K @ vec(A) == vec(A.T)``, where ``vec`` stacks columns.

    Parameters:
        a: number of rows of the matrices being vectorised.
        b: number of columns of the matrices being vectorised.

    Returns:
        A sparse permutation matrix with one entry equal to 1.0 in every row.
    """
    # Imported locally: the bare name `scipy` is never imported by this
    # snippet, so `scipy.sparse.csr_matrix` raised NameError.
    from scipy.sparse import csr_matrix

    size = a * b
    # Example matrix with unique elements
    m = np.arange(size).reshape(a, b)
    # Vec(m), flattened to 1-D
    vecm = vec(m)
    vecm = vecm.reshape(vecm.shape[0])
    # Get row inds
    rowInds = np.arange(size)
    # Get column inds: position in vec(m) of each value 0..size-1
    # (argsort of a 1-D array is already 1-D, so no reshape is needed).
    colInds = np.argsort(vecm)
    # Work out mapping between them; the shape is passed explicitly instead
    # of being inferred from the largest index.
    K = csr_matrix((np.ones(size), (rowInds, colInds)), shape=(size, size))
    return K
Below is an improved version of your code:
import numpy as np
from scipy.sparse import csr_matrix
def vec(A):
    """Stack the columns of ``A`` into a 1-D array (column-major flattening)."""
    size = A.shape[0] * A.shape[1]
    return np.reshape(A, size, order='F')
def commutation_matrix_sp(A):
    """Return the sparse commutation matrix for matrices shaped like ``A``.

    Only ``A.shape`` is read.  The result is an (m*n)-by-(m*n) CSR permutation
    matrix with int8 entries.
    """
    n_rows, n_cols = A.shape[0], A.shape[1]
    size = n_rows * n_cols
    row = np.arange(size)
    # Output row i*n + j picks input position i + j*m of the column-stacked vector.
    col = row.reshape((n_cols, n_rows)).T.ravel()
    ones = np.ones(size, dtype=np.int8)
    return csr_matrix((ones, (row, col)), shape=(size, size))
Test:
A = np.random.rand(500, 500)
K = commutation_matrix_sp(A)
print(f'{K.data.nbytes/2**20:.2f} MB')
# 0.24 MB
# `@` is matrix multiplication: K applied to vec(A) must equal vec(A.T).
print(np.all(K @ vec(A) == vec(A.T)))
# True

What is the best way to implement an element-wise cosine similarity in Python?

The code below is very inefficient given large matrices. Is there a better way to implement this ?
I have already searched the web for this here.
import numpy as np
def cosine_similarity(x, y):
    """Return the cosine of the angle between the 1-D vectors ``x`` and ``y``."""
    norm_x = np.sqrt(np.dot(x, x))
    norm_y = np.sqrt(np.dot(y, y))
    return np.dot(x, y) / (norm_x * norm_y)
def compare(a, b):
    """Return the matrix ``c`` with ``c[i, j] = cosine_similarity(a[i], b[j])``.

    The result has shape ``(a.shape[0], b.shape[0])``.
    """
    n_a, n_b = a.shape[0], b.shape[0]
    c = np.zeros((n_a, n_b))
    for i in range(n_a):
        for j in range(n_b):
            c[i, j] = cosine_similarity(a[i], b[j])
    return c
# Example input: 100 and 800 random row vectors, each of length 2000.
a = np.random.rand(100,2000)
b = np.random.rand(800,2000)
# One similarity per (row of a, row of b) pair.
compare(a,b) # shape -> (100, 800)
As in the comments, if you want to take the product of two matrices, then numpy already has an efficient implementation of this, but it might be too slow for you (O(n^3)).
import numpy as np
a = np.array([3, 2, 1])
b = np.array([1, 2, 3])
# Inner product of the two vectors: 3*1 + 2*2 + 1*3.
c = np.dot(a, b)
print(c)  # output = 10
I saw in the comments that you were interested in the cosine distance between vectors. SciPy provides it directly; note that `scipy.spatial.distance.cosine` returns the cosine *distance*, which is 1 minus the cosine similarity:
from scipy.spatial.distance import cosine
# Two orthogonal vectors: their dot product is 0.
a=[1,0,1]
b=[0,1,0]
# SciPy's `cosine` is a distance (1 - cosine similarity), hence 1.0 here.
print(cosine(a,b)) #output = 1.0
This might be faster for your needs. Here is the documentation.
[Personal edit]
In order to compute the cosine similarity efficiently, here is a solution I have written:
def compare(a, b):
    """Return the matrix of cosine similarities between rows of ``a`` and ``b``.

    Entry ``[i, j]`` is ``a[i] . b[j] / (|a[i]| * |b[j]|)``; the result has
    shape ``(a.shape[0], b.shape[0])``.
    """
    norms_a = np.sqrt((a * a).sum(axis=1))
    norms_b = np.sqrt((b * b).sum(axis=1))
    # Outer product of the row norms gives the denominator for every pair.
    return a.dot(b.T) / (norms_a[:, np.newaxis] * norms_b[np.newaxis, :])

Need help to compute derivative and integral using Numpy

I need help to compute derivative and integral of function using finite difference method and Numpy without using loops.
The whole task: Tabulate Gauss function f(x) = (1./(sqrt(2.*pi)*s))*e**(-0.5*((x-m)/s)**2) on the interval [-10,10] for m = 0 and s=[0.5,5]. Compute derivative and integral of the function using finite difference method without using loops. Create plots of the function and its derivative. Use Numpy and Matplotlib.
Here's the beginning of the program:
def f(x, s, m):
    """Return the Gauss function with mean ``m`` and width ``s`` at ``x``.

    Computes ``1 / (sqrt(2*pi) * s) * exp(-0.5 * ((x - m) / s)**2)``.  The
    NumPy functions are used so that the names resolve without a star import
    and so that ``x`` and ``s`` may be arrays (evaluated element-wise with
    broadcasting).
    """
    return (1. / (np.sqrt(2. * np.pi) * s)) * np.exp(-0.5 * ((x - m) / s) ** 2)
def main():
    """Tabulate the Gauss function on [-10, 10] for three widths and print it.

    Prints one header line per width ``s`` followed by one ``x``/``y`` line
    per sample point.
    """
    m = 0
    s = np.linspace(0.5, 5, 3)
    x = np.linspace(-10, 10, 20)
    # Iterate over the arrays themselves rather than hard-coded lengths, and
    # evaluate f once per point (it used to be called twice, with the first
    # result thrown away).
    for si in s:
        print('s = ', si)
        for xj in x:
            print('x = ',xj,', y = ',f(xj,si,m))
By using the numpy arrays you can apply that operation directly with algebraic notation:
# Arithmetic on NumPy arrays is element-wise, so this evaluates the function
# for every entry of x (and s, if it is an array) without an explicit loop.
result = (1./(np.sqrt(2.*np.pi)*s))*np.exp(-0.5*((x-m)/s)**2)
The simplest way (without using SciPy) seems to me to directly sum for the integral and central difference method for the derivative:
import numpy as np
import pylab
def gaussian(x, s, m):
    """Return the Gauss function with mean ``m`` and width ``s`` evaluated at ``x``."""
    scale = 1. / (np.sqrt(2. * np.pi) * s)
    return scale * np.exp(-0.5 * ((x - m) / s) ** 2)
m = 0
s = np.linspace(0.5, 5, 3)
# 1000 sample points on [-10, 10]; dx is the spacing returned by linspace.
x, dx = np.linspace(-10, 10, 1000, retstep=True)
# Column vector, so x broadcasts against the three widths in s.
x = x.reshape(-1, 1)
y = gaussian(x, s, m)
# Central difference with half-step h for the derivative.
h = 1.e-6
dydx = (gaussian(x + h, s, m) - gaussian(x - h, s, m)) / 2 / h
# Integral approximated by summing the samples and multiplying by the spacing.
int_y = y.sum(axis=0) * dx
print(int_y)
pylab.plot(x, y)
pylab.plot(x, dydx)
pylab.show()

Categories