Calculate similarity/distance between rows using pandas faster

I am fairly new to Python and Pandas.
I have the following columns in a pandas DataFrame:
SongNumber songID albumID artistID similarArtists artistHotttnesss songHotness loudness tempo year
The columns from artistHotttnesss to year contain numerical data.
So I tried calculating the distance/similarity between songs using the code below:
from time import time
from scipy.spatial.distance import euclidean
import numpy as np

# data is the DataFrame described above
t1 = time()
m = 1000
mat = np.zeros((m, m))
for i in range(0, m):
    for j in range(0, m):
        if i != j:
            # .ix is deprecated in pandas; .iloc does the same positional indexing
            mat[i][j] = euclidean(data.iloc[i, 5:], data.iloc[j, 5:])
            '''if data.iloc[i, 2] == data.iloc[j, 2]:
                mat[i][j] += 1
            if data.iloc[i, 3] == data.iloc[j, 3]:
                mat[i][j] += 1
            # l1, l2 - lists of similar artists
            l1_str = data.iloc[i, 4].strip(']')[1:]
            l2_str = data.iloc[j, 4].strip(']')[1:]
            l1 = l1_str.split()
            l2 = l2_str.split()
            common = len(set(l1).intersection(l2))
            mat[i][j] += common
            mat[i][j] /= 3'''
        else:
            mat[i][j] = 0.0
t2 = time()
print(t2 - t1)
For m = 10000 this essentially requires looping 10^4 * 10^4 = 10^8 times.
For m = 1000 I get results in 2249 s (about 37.5 min), so I am not getting the results for m = 10000 in any reasonable time.
How can I speed this up, e.g. by avoiding the loops or by using pandas/NumPy functions?
Thanks for the help.

You can avoid the loops by using the euclidean_distances function from scikit-learn, which computes all pairwise distances in one vectorized call:
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np
mat = np.random.rand(5, 5)
pairwise_dist_mat = euclidean_distances(mat)
pairwise_dist_mat
array([[ 0. , 1.19602663, 1.08341967, 1.07792121, 1.1245057 ],
[ 1.19602663, 0. , 0.52135682, 0.82797734, 0.78247091],
[ 1.08341967, 0.52135682, 0. , 0.87764513, 0.81903634],
[ 1.07792121, 0.82797734, 0.87764513, 0. , 0.1386294 ],
[ 1.1245057 , 0.78247091, 0.81903634, 0.1386294 , 0. ]])
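Applied to the DataFrame from the question, a minimal sketch (assuming, as in the original loop, that the numeric columns start at position 5):

from sklearn.metrics.pairwise import euclidean_distances

X = data.iloc[:, 5:].to_numpy()    # m x k matrix of numeric song features
dist_mat = euclidean_distances(X)  # m x m pairwise distances, zero diagonal

This replaces the O(m^2) Python-level loop with a single vectorized call.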

Related

Best way to find a point near all four points known coordinates

I have the coordinates of four points. Can anyone help me find the coordinates of a point such that its distance to each of the four input points lies between 1.9 and 2.5?
import numpy as np
dist_min = 1.9
dist_max = 2.5
# no point satisfies the condition for this set
input_points1 = [[ 7.57447956, 6.67658376, 10.79921475],
[ 8.98026868, 7.69010703, 12.89377068],
[ 6.22242062, 7.73362942, 12.87947421],
[ 10.0000000, 9.00000000, 8.500000000]]
# this set has satisfying points
input_points2 = [[ 7.57447956, 6.67658376, 10.79921475],
[ 8.98026868, 7.69010703, 12.89377068],
[ 6.22242062, 7.73362942, 12.87947421],
[ 6.22473072, 4.74175054, 12.96455411]]
def Distance(point1, point2):
    return np.linalg.norm(point1 - point2)
Here is a method that finds a random point:
import numpy as np

dist_min = 1.9
dist_max = 2.5

# no point satisfies the condition for this set
input_points1 = [[ 7.57447956,  6.67658376, 10.79921475],
                 [ 8.98026868,  7.69010703, 12.89377068],
                 [ 6.22242062,  7.73362942, 12.87947421],
                 [10.0000000,   9.00000000,  8.500000000]]
# this set has satisfying points
input_points2 = [[ 7.57447956,  6.67658376, 10.79921475],
                 [ 8.98026868,  7.69010703, 12.89377068],
                 [ 6.22242062,  7.73362942, 12.87947421],
                 [ 6.22473072,  4.74175054, 12.96455411]]

def Distance(point1, point2):
    return np.linalg.norm(np.array(point1) - np.array(point2))

def find_point(input_points):
    dmax = max([Distance(input_points[i], input_points[j])
                for i in range(len(input_points) - 1)
                for j in range(i + 1, len(input_points))])
    if dmax > 2 * dist_max:
        return None
    found = False
    while not found:
        ip = np.random.choice(len(input_points))
        p = np.random.normal(size=3)
        r = np.random.uniform(dist_min, dist_max)
        # random point at distance r from a randomly chosen input point
        x = p / np.linalg.norm(p) * r + np.array(input_points[ip])
        found = True
        for i in input_points:
            d = Distance(i, x)
            if d <= dist_min or d >= dist_max:
                found = False
                break
    return x

a = find_point(input_points1)
print(a)
# None
b = find_point(input_points2)
print([Distance(i, b) for i in input_points2])
# [2.4877643881304805, 2.1439232926982417, 2.2860134633791795, 1.9466840567560841]
This looks like something you could use K-Means for, with just one centroid, and then check that the centroid is the right distance away from all the points in the data. Perhaps not the most elegant or efficient solution, but it should work.
K-Means code adapted from this tutorial on K-Means:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

# data is whatever set of points you have
df = pd.DataFrame(data)

# fit k-means with a single centroid
k_means = KMeans(n_clusters=1)
labels = k_means.fit_predict(df)
centroid = k_means.cluster_centers_[0]  # note: [:, 0] would pick one coordinate, not the centroid

def distance(p, q):
    return np.linalg.norm(np.asarray(p) - np.asarray(q))

# compare to all points
for point in data:
    assert distance(point, centroid) >= 1.9
    assert distance(point, centroid) <= 2.5
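Since k-means with a single cluster just converges to the mean of the points, a dependency-free sketch of the same check (assuming data is an array-like of 3D points, as above):

import numpy as np

points = np.asarray(data, dtype=float)
centroid = points.mean(axis=0)  # what KMeans(n_clusters=1) converges to
dists = np.linalg.norm(points - centroid, axis=1)
print((dists > 1.9).all() and (dists < 2.5).all())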

Importing an array using Python

I have an array Pe with dimensions (2, 3, 3). I import Pe, and I want the code to run the calculation for each 3x3 slice of Pe and give me the visited indices. Right now I have to manually write Pe.Pe[0], Pe.Pe[1] in the line Visited_Indices,timestamps=iterate_array(0,0, Pe.Pe[0], lambda x : x < 150). How do I implement Pe.Pe[i] where i goes from 0 to 1?
Pe array is
import numpy as np

Pe = np.array([[[128.22918457, 168.52413295, 209.72343319],
                [129.01598287, 179.03716051, 150.68633749],
                [131.00688309, 187.42601593, 193.68172751]],
               [[ 87.70103267, 115.2603484 , 143.4381863 ],
                [ 88.23915528, 122.45062554, 103.06037156],
                [ 89.60081102, 128.18809696, 132.46662659]]])
print([Pe])
The code is
import numpy as np
import time
import Pe

def get_neighbor_indices(position, dimensions):
    '''
    dimensions is a shape of np.array
    '''
    i, j = position
    indices = [(i+1,j), (i-1,j), (i,j+1), (i,j-1)]
    return [
        (i,j) for i,j in indices
        if i >= 0 and i < dimensions[0]
        and j >= 0 and j < dimensions[1]
    ]

def iterate_array(init_i, init_j, arr, condition_func):
    '''
    arr is an instance of np.array
    condition_func is a function (value) => boolean
    '''
    indices_to_check = [(init_i, init_j)]
    checked_indices = set()
    result = []
    t0 = None
    t1 = None
    timestamps = []
    while indices_to_check:
        pos = indices_to_check.pop()
        if pos in checked_indices:
            continue
        item = arr[pos]
        checked_indices.add(pos)
        if condition_func(item):
            result.append(item)
            t1 = time.time()
            if t0 is None:
                t0 = t1
            timestamps.append(t1 - t0)
            indices_to_check.extend(
                get_neighbor_indices(pos, arr.shape)
            )
    return result, timestamps

Visited_Indices, timestamps = iterate_array(0, 0, Pe.Pe[0], lambda x: x < 150)
out = list(zip(*np.where(np.isin(Pe, Visited_Indices))))
print("Visited =", [Visited_Indices])

Monte Carlo with Metropolis algorithm extremely slow in Python

I'm trying to implement a simple Monte Carlo in Python (which I'm fairly new to). Coming from C I'm probably following the wrong path, since my code is far too slow for what I'm asking of it: I have a hard-sphere-like potential (see V_pot(r) in the code) for 60 3D particles with periodic boundary conditions (PBC), so I defined the following functions
import timeit
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from numpy import inf

L, kb, d, eps, DIM = 100, 1, 1, 1, 3
r_c, T = L/2, eps/(.5*kb)
beta = 1/(kb*T)

def dist(A, B):
    d = A - B
    d -= L*np.around(d/L)
    return np.sqrt(np.sum(d**2))

def V_pot(r):
    V = -eps*(d**6/r**6 - d**6/r_c**6)
    if r > r_c:
        V = 0
    elif r < d:
        V = inf
    return V

def ener(config):
    V_jk_val, j = 0, N
    while (j > 0):
        j -= 1
        i = 0
        while (i < j):
            V_jk_val += V_pot(dist(config[j,:], config[i,:]))
            i += 1
    return V_jk_val

def acc(en_n, en_o):
    d_en = en_n - en_o
    if (d_en <= 0):
        acc_val = 1
    else:
        acc_val = np.exp(-beta*(d_en))
    return acc_val
then, starting from the following configuration (where every row of the array holds the coordinates of a 3D particle)
config = np.array([[16.24155657, 57.41672173, 94.39565792],
[76.38121764, 55.88334066, 5.72255163],
[38.41393783, 58.09432145, 6.26448054],
[86.44286438, 61.37100899, 91.97737383],
[37.7315366 , 44.52697269, 23.86320444],
[ 0.59231801, 39.20183376, 89.63974115],
[38.00998141, 3.84363202, 52.74021401],
[99.53480756, 69.97688928, 21.43528924],
[49.62030291, 93.60889503, 15.73723259],
[54.49195524, 0.6431965 , 25.37401196],
[33.82527814, 25.37776021, 67.4320553 ],
[64.61952893, 46.8407798 , 4.93960443],
[60.47322732, 16.48140136, 33.26481306],
[19.71667792, 46.56999616, 35.61044526],
[ 5.33252557, 4.44393836, 60.55759256],
[44.95897856, 7.81728046, 10.26000715],
[86.5548395 , 49.74079452, 4.80480133],
[52.47965686, 42.831448 , 22.03890639],
[ 2.88752006, 59.84605062, 22.75760029],
[ 9.49231045, 42.08653603, 40.63380097],
[13.90093641, 74.40377984, 32.62917915],
[97.44839233, 90.47695772, 91.60794836],
[51.29501624, 27.03796277, 57.09525454],
[10.30180295, 21.977336 , 69.54173272],
[59.61327648, 14.29582325, 11.70942289],
[89.52722796, 26.87758644, 76.34934637],
[82.03736088, 78.5665713 , 23.23587395],
[79.77571695, 66.140968 , 53.6784269 ],
[82.86070472, 40.82189833, 51.48739072],
[99.05647523, 98.63386809, 6.33888993],
[31.02997123, 66.99709163, 95.88332332],
[97.71654767, 59.24793618, 5.20183793],
[ 6.79964473, 45.01258652, 48.69477807],
[93.34977049, 55.20537774, 82.35693526],
[17.35577815, 20.45936211, 29.27981422],
[55.51942207, 52.22875901, 3.6616131 ],
[61.45612224, 36.50170405, 62.89796773],
[23.55822368, 7.09069623, 37.38274914],
[39.57082799, 58.95457592, 48.0304924 ],
[93.94997617, 64.34383203, 77.63346308],
[17.47989107, 90.01113402, 81.00648645],
[86.79068539, 66.35768515, 56.64402907],
[98.71924121, 38.33749023, 73.4715132 ],
[ 0.42356139, 78.32172925, 15.19883322],
[77.75572529, 2.60088767, 56.4683935 ],
[49.76486142, 3.01800153, 93.48019286],
[42.54483899, 4.27174457, 4.38942325],
[66.75777178, 41.1220603 , 19.64484167],
[19.69520773, 41.09230171, 2.51986091],
[73.20493772, 73.16590392, 99.19174281],
[94.16756184, 72.77653334, 10.32128552],
[29.95281655, 27.58596604, 85.12791195],
[ 2.44803886, 32.82333962, 41.6654683 ],
[23.9665915 , 49.94906612, 37.42701059],
[30.40282934, 39.63854309, 47.16572743],
[56.04809276, 30.19705527, 29.15729635],
[ 2.50566522, 70.37965564, 16.78016719],
[28.39713572, 4.04948368, 27.72615789],
[26.11873563, 41.49557167, 14.38703697],
[81.91731981, 12.10514972, 12.03083427]])
I run the 5000 time steps of the simulation with the following code
N = 60
TIME_MC = 5000
DELTA_LIST = [d]
# d/6, d/3, d, 2*d, 3*d
np.random.seed(19680801)
en_mc_delta = np.zeros((TIME_MC, len(DELTA_LIST)))
start = timeit.default_timer()
config_tmp = config.copy()  # a plain assignment here would alias config and bypass the accept/reject step

for iD, Delta in enumerate(DELTA_LIST):
    t = 0
    while (t < TIME_MC):
        for k in range(N):
            RND = np.random.rand()
            config_tmp[k,:] = config[k,:] + Delta*(np.random.random_sample((1,3)) - .5)
            en_o, en_n = ener(config), ener(config_tmp)
            ACC = acc(en_n, en_o)
            if (RND < ACC):
                config[k,:] = config_tmp[k,:]
                en_o = en_n
            else:
                config_tmp[k,:] = config[k,:]  # undo the rejected move
        en_mc_delta[t][iD] = en_o
        t += 1

stop = timeit.default_timer()
print('Time: ', stop - start)
following the Metropolis rule for accepting the proposed move extracted with config_tmp[k,:] = config[k,:] + Delta*(np.random.random_sample((1,3))-.5).
I made some attempts to check where the code gets stuck, and I found that the function ener (largely because of the function dist) is extremely slow: it takes something like ~0.02 s to calculate the energy of a configuration, which means around ~6000 s to run the complete simulation (60 particles, 5000 proposed moves).
The outer for loop is just there to compute the results for different values of Delta.
Running this code with TIME_MC=60 gives you an idea of how slow it is (~218 s), while it takes just a few seconds when implemented in C. I read some other questions about how to speed up Python code, but I can't understand how to apply them here.
EDIT:
I'm now almost sure that the problem is in the function dist: just calculating the PBC distance between two 3D vectors takes around ~0.0012 s, which adds up to crazy long times when you call it 5000*60 times.
Note that this is a partial answer, continued from comments on the original question.
Here's an example of how "unrolling" numpy's functions into a more direct calculation of the distance can improve performance. Note that I have not verified the two to be equivalent, especially regarding the rounding. The principle still applies, I think.
import random
import time
import numpy as np

L = 100
inv_L = 0.01
vec_length = 10
repetitions = 100000

def dist_np(A, B):
    d = A - B
    d -= L*np.around(d/L)
    return np.sqrt(np.sum(d**2))

def dist_direct(A, B):
    total = 0
    # iterate over the columns of the (1, n) arrays; len(A) would be 1 here
    for i in range(A.shape[1]):
        diff = (A[0,i] - B[0,i])
        diff -= L * int(diff * inv_L)
        total += diff * diff
    return np.sqrt(total)

vec1 = np.zeros((1, vec_length))
vec2 = np.zeros((1, vec_length))
for i in range(0, vec_length):
    vec1[0,i] = random.random()
    vec2[0,i] = random.random()

print("with numpy method:")
start = time.time()
for i in range(0, repetitions):
    dist_np(vec1, vec2)
print("done in {}".format(time.time() - start))

print("with direct method:")
start = time.time()
for i in range(0, repetitions):
    dist_direct(vec1, vec2)
print("done in {}".format(time.time() - start))
Output:
with numpy method:
done in 6.332799911499023
with direct method:
done in 1.0938000679016113
Play around with the average vector length and the repetitions to see where the sweet spot is. I expect the performance gain is not constant when varying these meta-parameters.
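For the original problem, the pairwise loop in ener can also be removed entirely with NumPy broadcasting. A minimal sketch under the same PBC convention and potential as the question (ener_vec is my name for it, and it is not benchmarked here; it assumes L, d, eps, r_c as defined above):

import numpy as np

def ener_vec(config):
    # minimum-image displacement for every pair at once: shape (N, N, 3)
    diff = config[:, None, :] - config[None, :, :]
    diff -= L * np.around(diff / L)
    r = np.sqrt(np.sum(diff**2, axis=-1))
    rr = r[np.triu_indices(len(config), k=1)]  # each pair counted once
    V = -eps * (d**6 / rr**6 - d**6 / r_c**6)
    V[rr > r_c] = 0.0     # cutoff
    V[rr < d] = np.inf    # hard core
    return V.sum()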

Translating Matlab (Octave) group coloring code into python (numpy, pyplot)

I want to translate the following Octave group-coloring function to Python and use it with pyplot.
Function input:
x - data matrix (m x n)
a - a parameter
index - a vector of length m with values in the range [0, a)
(For example, if a = 4, index can be [random.choice(range(4)) for i in range(m)].)
The values in index indicate which of the a groups each of the m data points belongs to.
The function should plot all the data points from x and color them by group (the number of different colors is a).
The function in Octave:
p = hsv(a);            % this is an a x 3 matrix
colors = p(index, :);  % **** this is an m x 3 matrix ****
scatter(X(:,1), X(:,2), 10, colors);
I couldn't find a function like hsv in Python, so I wrote it myself (I think I did...):
# assumes: from matplotlib import colors; import numpy
p = colors.hsv_to_rgb(numpy.column_stack((
    numpy.linspace(0, 1, a), numpy.ones((a, 2)))))
But I can't figure out how to do the matrix selection p(index, :) in Python (numpy), especially because the length of index is bigger than a.
Thanks in advance for your help.
So, you want to take an m x 3 matrix and convert each row from RGB to HSV?
import numpy as np
import colorsys

mymatrix = np.matrix([[11, 12, 13],
                      [21, 22, 23],
                      [31, 32, 33]])

def to_hsv(x):
    return colorsys.rgb_to_hsv(*x)

# Apply the to_hsv function to each matrix row.
print(np.apply_along_axis(to_hsv, axis=1, arr=mymatrix))
This produces:
[[ 0.5 0. 13. ]
[ 0.5 0. 23. ]
[ 0.5 0. 33. ]]
Following up on your comment:
If I understand correctly, you have a matrix p that is an a x 3 matrix, and you want to randomly select rows from it over and over again, until you have a new matrix that is m x 3?
Ok. Let's say you have a matrix p defined as follows:
a = 5
p = np.random.randint(5, size=(a, 3))
Now, make a list of random integers between 0 and a-1 (indices start at 0 and end at a-1) that is m in length:
m = 20
index = np.random.randint(a, size=m)
Now access the right indexes and plug them into a new matrix:
p_prime = np.matrix([p[i] for i in index])
Produces a 20 x 3 matrix.
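For what it's worth, NumPy's fancy indexing expresses the Octave p(index, :) selection directly, so the whole function shrinks to a few lines. A sketch with hypothetical example data (m, a, X, and index are made up here):

import numpy as np
import matplotlib.pyplot as plt

m, a = 20, 4
X = np.random.rand(m, 2)
index = np.random.randint(a, size=m)

p = plt.get_cmap('hsv')(np.linspace(0, 1, a))  # a x 4 RGBA rows, like Octave's hsv(a)
colors = p[index]                              # m x 4: row i is p[index[i]]
plt.scatter(X[:, 0], X[:, 1], s=10, c=colors)
plt.show()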

How should I multiply scipy.fftpack output vectors together?

The scipy.fftpack.rfft function returns the DFT as a vector of floats, alternating between the real and imaginary parts. This means that to multiply two DFTs together (for convolution) I will have to do the complex multiplication "manually", which seems quite tricky. This must be something people do often - I presume/hope there is a simple trick to do this efficiently that I haven't spotted?
Basically I want to fix this code so that both methods give the same answer:
import numpy as np
import scipy.fftpack as sfft
X = np.random.normal(size = 2000)
Y = np.random.normal(size = 2000)
NZ = np.fft.irfft(np.fft.rfft(Y) * np.fft.rfft(X))
SZ = sfft.irfft(sfft.rfft(Y) * sfft.rfft(X)) # This multiplication is wrong
NZ
array([-43.23961083, 53.62608086, 17.92013729, ..., -16.57605207,
8.19605764, 5.23929023])
SZ
array([-19.90115323, 16.98680347, -8.16608202, ..., -47.01643274,
-3.50572376, 58.1961597 ])
N.B. I am aware that fftpack contains a convolve function, but I only need to fft one half of the transform - my filter can be fft'd once in advance and then used over and over again.
You don't have to flip back to np.float64 and hstack. You can create an empty destination array with the same shape as sfft.rfft(Y) and sfft.rfft(X), then create a np.complex128 view of it and fill this view with the result of the multiplication. This will automatically fill the destination array as wanted.
To retake your example:
import numpy as np
import scipy.fftpack as sfft

X = np.random.normal(size=2000)
Y = np.random.normal(size=2000)

Xf = sfft.rfft(X)  # sfft.rfft (not np.fft.rfft), so the output is the packed float format
Xf_cpx = Xf[1:-1].view(np.complex128)
Yf = sfft.rfft(Y)
Yf_cpx = Yf[1:-1].view(np.complex128)

Zf = np.empty(X.shape)
Zf_cpx = Zf[1:-1].view(np.complex128)

Zf[0] = Xf[0]*Yf[0]
# the [...] is important to use the view as a reference to Zf and not overwrite it
Zf_cpx[...] = Xf_cpx * Yf_cpx
Zf[-1] = Xf[-1]*Yf[-1]

Z = sfft.irfft(Zf)
and that's it!
You can use a simple if statement if you want your code to be more general and handle odd lengths as explained in Jaime's answer.
Here is a function that does what you want:
def rfft_mult(a, b):
    """Multiplies two outputs of scipy.fftpack.rfft"""
    assert a.shape == b.shape
    c = np.empty(a.shape)
    c[..., 0] = a[..., 0]*b[..., 0]
    # To comply with the rfft support of multi-dimensional arrays
    ar = a.reshape(-1, a.shape[-1])
    br = b.reshape(-1, b.shape[-1])
    cr = c.reshape(-1, c.shape[-1])
    # Note that we cannot use ellipses to achieve that because of
    # the way `view` works. If there are many dimensions, one should
    # consider manually performing the complex multiplication with slices.
    if c.shape[-1] & 0x1:  # if odd
        for i in range(len(ar)):
            ac = ar[i, 1:].view(np.complex128)
            bc = br[i, 1:].view(np.complex128)
            cc = cr[i, 1:].view(np.complex128)
            cc[...] = ac*bc
    else:
        for i in range(len(ar)):
            ac = ar[i, 1:-1].view(np.complex128)
            bc = br[i, 1:-1].view(np.complex128)
            cc = cr[i, 1:-1].view(np.complex128)
            cc[...] = ac*bc
        c[..., -1] = a[..., -1]*b[..., -1]  # the Nyquist term exists only for even lengths
    return c
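A quick usage check against the pure-numpy result from the question (a sketch; it assumes rfft_mult plus the X and Y arrays defined earlier are in scope):

SZ = sfft.irfft(rfft_mult(sfft.rfft(Y), sfft.rfft(X)))
NZ = np.fft.irfft(np.fft.rfft(Y) * np.fft.rfft(X))
print(np.allclose(SZ, NZ))  # expected: True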
You can take a view of a slice of your return array, e.g.:
>>> scipy.fftpack.fft(np.arange(8))
array([ 28.+0.j , -4.+9.65685425j, -4.+4.j ,
-4.+1.65685425j, -4.+0.j , -4.-1.65685425j,
-4.-4.j , -4.-9.65685425j])
>>> a = scipy.fftpack.rfft(np.arange(8))
>>> a
array([ 28. , -4. , 9.65685425, -4. ,
4. , -4. , 1.65685425, -4. ])
>>> a.dtype
dtype('float64')
>>> a[1:-1].view(np.complex128) # First and last entries are real
array([-4.+9.65685425j, -4.+4.j , -4.+1.65685425j])
You will need to handle even or odd sized FFTs differently:
>>> scipy.fftpack.fft(np.arange(7))
array([ 21.0+0.j , -3.5+7.26782489j, -3.5+2.79115686j,
-3.5+0.79885216j, -3.5-0.79885216j, -3.5-2.79115686j,
-3.5-7.26782489j])
>>> a = scipy.fftpack.rfft(np.arange(7))
>>> a
array([ 21. , -3.5 , 7.26782489, -3.5 ,
2.79115686, -3.5 , 0.79885216])
>>> a.dtype
dtype('float64')
>>> a[1:].view(np.complex128)
array([-3.5+7.26782489j, -3.5+2.79115686j, -3.5+0.79885216j])