Normalize Euclidean distance - python

import numpy as np

def euclidean_distance(n):
    # Cholesky factor of the target correlation matrix (correlation 0.6)
    L = np.linalg.cholesky([[1.0, 0.60], [0.60, 1.0]])
    uncorrelated = np.random.standard_normal((2, n))
    correlated = np.dot(L, uncorrelated)
    A = correlated[0]
    B = correlated[1]
    v = np.linalg.norm(A - B)
    return v

v50 = euclidean_distance(50)
v1000 = euclidean_distance(1000)
The Euclidean distance gets larger the more data points I use in the computation. How can I normalize the distances so that I can compare the similarity between v50 and v1000?

You can normalize the distances by dividing them by the square root of the number of data points used in each computation. Since each entry of A - B is an independent draw, the squared norm grows roughly linearly with n, so dividing by sqrt(n) gives a root-mean-square difference per point that is comparable across sample sizes. Try this:
v = np.linalg.norm(A-B) / np.sqrt(n)
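
As a quick check, applying the same scaling to the results of euclidean_distance above should bring v50 and v1000 onto the same scale (a minimal sketch reusing the function defined in the question):

import numpy as np

v50 = euclidean_distance(50) / np.sqrt(50)
v1000 = euclidean_distance(1000) / np.sqrt(1000)

# Both are now RMS differences per point; for a correlation of 0.6 they should
# land near sqrt(2 * (1 - 0.6)) ≈ 0.89, regardless of n.
print(v50, v1000)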

Related

k-means clustering: calculate distances between all points and the initial centroids

I want to do image segmentation with k-means clustering, and I want to:
Calculate distances between all points and the initial centroids.
Assign all points to their closest centroid.
Here is my code:
import numpy as np

def init(ds, k, random_state=42):
    # k-means++-style initialization: pick each new centroid with probability
    # proportional to its squared distance from the nearest existing centroid
    np.random.seed(random_state)
    centroids = [ds[0]]
    for _ in range(1, k):
        dist_sq = np.array([min([np.inner(c - x, c - x) for c in centroids]) for x in ds])
        probs = dist_sq / dist_sq.sum()
        cumulative_probs = probs.cumsum()
        r = np.random.rand()
        for j, p in enumerate(cumulative_probs):
            if r < p:
                i = j
                break
        centroids.append(ds[i])
    return np.array(centroids)

k = 4
# `pixels` is the image data (not shown), reshaped to (n_points, n_features)
centroids = init(pixels, k, random_state=42)
print(centroids)

# First centroid
centroids[0]

# Calculate distances between all points and the initial centroids.
# Assign all points to their closest centroid.
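
For the two remaining steps, one vectorized option (a sketch, assuming pixels is an array of shape (n_points, n_features) and centroids comes from init above) is to broadcast the differences and take the argmin over the centroid axis:

import numpy as np

# distances[i, j] = Euclidean distance between point i and centroid j
distances = np.linalg.norm(pixels[:, None, :] - centroids[None, :, :], axis=2)

# Index of the closest centroid for every point
labels = np.argmin(distances, axis=1)

# e.g. all points assigned to the first centroid
cluster_0 = pixels[labels == 0]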

Returning the size of detected clusters

I applied bisecting k-means clustering to my high-dimensional database and want to display the size of the resulting clusters, e.g. cluster 1 = 2000 elements, cluster 2 = 3489 elements, and so on.
Which function do I need in order to display the sizes? The visualisation is possible, as can be seen in the 2-D plot:
Cluster size
The functions are defined as follows:
import numpy as np
import matplotlib.pyplot as plt

def convert_to_2d_array(points):
    """
    Converts `points` to a 2-D numpy array.
    """
    points = np.array(points)
    if len(points.shape) == 1:
        points = np.expand_dims(points, -1)
    return points

def visualize_clusters(clusters):
    """
    Visualizes the first 2 dimensions of the data as a 2-D scatter plot.
    """
    plt.figure()
    for cluster in clusters:
        points = convert_to_2d_array(cluster)
        if points.shape[1] < 2:
            points = np.hstack([points, np.zeros_like(points)])
        plt.plot(points[:, 0], points[:, 1], 'o')
    plt.show()

def SSE(points):
    """
    Calculates the sum of squared errors for the given list of data points.
    """
    points = convert_to_2d_array(points)
    centroid = np.mean(points, 0)
    errors = np.linalg.norm(points - centroid, ord=2, axis=1)
    return np.sum(errors)

def kmeans(points, k=2, epochs=10, max_iter=100, verbose=False):
    """
    Clusters the list of points into `k` clusters using the k-means
    clustering algorithm.
    """
    points = convert_to_2d_array(points)
    assert len(points) >= k, "Number of data points can't be less than k"
    best_sse = np.inf
    for ep in range(epochs):
        # Randomly initialize k centroids
        np.random.shuffle(points)
        centroids = points[0:k, :]
        last_sse = np.inf
        for it in range(max_iter):
            # Cluster assignment
            clusters = [None] * k
            for p in points:
                index = np.argmin(np.linalg.norm(centroids - p, 2, 1))
                if clusters[index] is None:
                    clusters[index] = np.expand_dims(p, 0)
                else:
                    clusters[index] = np.vstack((clusters[index], p))
            # Centroid update
            centroids = [np.mean(c, 0) for c in clusters]
            # SSE calculation
            sse = np.sum([SSE(c) for c in clusters])
            gain = last_sse - sse
            if verbose:
                print((f'Epoch: {ep:3d}, Iter: {it:4d}, '
                       f'SSE: {sse:12.4f}, Gain: {gain:12.4f}'))
            # Check for improvement
            if sse < best_sse:
                best_clusters, best_sse = clusters, sse
            # Epoch termination condition
            if np.isclose(gain, 0, atol=0.00001):
                break
            last_sse = sse
    return best_clusters

def bisecting_kmeans(points, k=2, epochs=10, max_iter=100, verbose=False):
    """
    Clusters the list of points into `k` clusters using the bisecting k-means
    clustering algorithm. Internally, it uses the standard k-means with k=2 in
    each iteration.
    """
    points = convert_to_2d_array(points)
    clusters = [points]
    while len(clusters) < k:
        max_sse_i = np.argmax([SSE(c) for c in clusters])
        cluster = clusters.pop(max_sse_i)
        two_clusters = kmeans(
            cluster, k=2, epochs=epochs, max_iter=max_iter, verbose=verbose)
        clusters.extend(two_clusters)
    return clusters
I thank you in advance for your help!
Best regards,
Fatih
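
Since bisecting_kmeans returns a plain Python list of per-cluster arrays, the cluster sizes are simply the lengths of those arrays. A minimal sketch, assuming `data` is a stand-in name for the observations that were clustered:

clusters = bisecting_kmeans(data, k=4)

# Number of elements in each cluster
for i, cluster in enumerate(clusters, start=1):
    print(f"cluster {i} = {len(cluster)} elements")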

Maximum likelihood estimation for trajectory estimation in Python

I am trying to predict trajectories using maximum likelihood estimation. How should I use the mean and variance from MLE to compute the parameters for my trajectory estimation?
Let's say I have a function representing the X coordinate of a gesture, where:
X(t) = a*X(t-1) + a1*X(t-2) + a2*Y(t-1) + ε and a2 = da(2-1) + u, where ε and u are noise terms.
Here t represents the next time period, t-1 the current one, and Y is the Y coordinate of the hand. I need to estimate a, a1 and a2 using MLE in order to predict X(t).
Any suggestions? I am really new to this.
Currently I am using the following Python code for the mean and variance computation:
import pandas as pd
import numpy as np

def expectation_max(data, max_iter=1000):
    data = pd.DataFrame(data)
    mu0 = data.mean()
    c0 = data.cov()
    for j in range(max_iter):
        w = []
        # E-step: compute a weight for every observation (row)
        for _, x in data.iterrows():
            wk = (5 + len(data)) / (5 + np.dot(np.dot(np.transpose(x - mu0), np.linalg.inv(c0)), (x - mu0)))
            w.append(wk)
        w = np.array(w)
        # M-step: weighted update of the mean and covariance
        mu = np.dot(w, data) / np.sum(w)
        c = 0
        for i in range(len(data)):
            diff = data.iloc[i] - mu0
            c += w[i] * np.outer(diff, diff)
        cov = c / len(data)
        mu0 = mu
        c0 = cov
    return mu0, c0
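
For the linear model above, one standard route (a sketch, separate from the EM code: with Gaussian noise ε, the maximum-likelihood estimate of the linear coefficients coincides with ordinary least squares) is to stack the lagged regressors and solve for a, a1 and a2 directly. The function name fit_ar_coefficients and the synthetic x, y arrays are made up here for illustration:

import numpy as np

def fit_ar_coefficients(x, y):
    """Least-squares (Gaussian MLE) fit of X(t) = a*X(t-1) + a1*X(t-2) + a2*Y(t-1) + eps."""
    # Design matrix of lagged regressors, one row per time step t >= 2
    F = np.column_stack([x[1:-1], x[:-2], y[1:-1]])
    target = x[2:]
    coeffs, residuals, rank, _ = np.linalg.lstsq(F, target, rcond=None)
    a, a1, a2 = coeffs
    # Noise variance estimate (MLE): mean squared residual
    sigma2 = np.mean((target - F @ coeffs) ** 2)
    return a, a1, a2, sigma2

# Example call with synthetic data, only to show the usage
x = np.cumsum(np.random.randn(100))
y = np.cumsum(np.random.randn(100))
a, a1, a2, sigma2 = fit_ar_coefficients(x, y)
print(a, a1, a2, sigma2)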

Divisive clustering from scratch

I'm programming divisive (top-down) clustering from scratch. In divisive clustering we start at the top with all examples (variables) in one cluster. The cluster is then split recursively until each example ends up in its own singleton cluster.
I use Pearson's correlation coefficient as a measure for splitting clusters. Pasted below is my initial attempt. I read the data and compute the matrix of correlation coefficients.
Now I need to split the first cluster according to the minimal value of the correlation coefficient. Any idea how to proceed? Any pointers and suggestions are welcome.
import pandas as pd
from math import sqrt

# Read data from GitHub
df = pd.read_csv('https://raw.githubusercontent.com/nico/collectiveintelligence-book/master/blogdata.txt', sep='\t', index_col=0)
data = df.values.tolist()
data = data[1:10]

# Define correlation coefficient
def pearson(v1, v2):
    # Simple sums
    sum1 = sum(v1)
    sum2 = sum(v2)
    # Sums of the squares
    sum1Sq = sum([pow(v, 2) for v in v1])
    sum2Sq = sum([pow(v, 2) for v in v2])
    # Sum of the products
    pSum = sum([v1[i] * v2[i] for i in range(len(v1))])
    # Calculate r (Pearson score)
    num = pSum - (sum1 * sum2 / len(v1))
    den = sqrt((sum1Sq - pow(sum1, 2) / len(v1)) * (sum2Sq - pow(sum2, 2) / len(v1)))
    if den == 0:
        return 0
    return num / den

# Dict for distances
dist = {}
min_dist = pearson(data[0], data[0])

# Loop over upper triangle of data matrix
for i in range(len(data)):
    for j in range(i + 1, len(data)):
        # Compute distance for each pair
        dist_curr = pearson(data[i], data[j])
        # Store distance in dict
        dist[(i, j)] = dist_curr
        # Store min distance
        if dist_curr < min_dist:
            min_dist = dist_curr
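
One common way to proceed (a sketch of the splinter-group idea; the helper name split_cluster is made up here) is to take the least-correlated pair as seeds of two new clusters, assign every remaining example to the seed it correlates with more strongly, and then recurse on the resulting clusters:

def split_cluster(indices, data):
    """Split one cluster (a list of row indices) into two, using the
    least-correlated pair of examples as seeds."""
    if len(indices) < 2:
        return [indices]
    # Find the pair of examples with minimal correlation
    min_pair, min_corr = None, float('inf')
    for a in range(len(indices)):
        for b in range(a + 1, len(indices)):
            corr = pearson(data[indices[a]], data[indices[b]])
            if corr < min_corr:
                min_corr, min_pair = corr, (indices[a], indices[b])
    seed1, seed2 = min_pair
    cluster1, cluster2 = [seed1], [seed2]
    # Assign every other example to the seed it correlates with more strongly
    for i in indices:
        if i in (seed1, seed2):
            continue
        if pearson(data[i], data[seed1]) >= pearson(data[i], data[seed2]):
            cluster1.append(i)
        else:
            cluster2.append(i)
    return [cluster1, cluster2]

# Usage: split the cluster containing all examples once
print(split_cluster(list(range(len(data))), data))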

Duplicate removal within a certain distance in Python

I have two numpy arrays of points (shapes (m, 2) and (n, 2)) like this:
A = numpy.array([[1,2],[3,4]])
B = numpy.array([[5,6],[7,8],[9,2]])
I need to merge them into one array with the following condition:
if two points are within distance epsilon of each other, keep only one of them.
I have this code, but it is very slow:
import numpy as np

eps = 0.1
A = np.array([[1,2],[3,4]])
B = np.array([[5,6],[7,8],[9,2]])

for point in B:
    # distance from `point` to every point in A (note axis=1 for per-row norms)
    if not (np.amin(np.linalg.norm(A - point, axis=1)) <= eps):
        A = np.append(A, [point], axis=0)
What is the best way to do that using numpy?
Thanks a lot!
You could calculate a Delaunay triangulation first, from which a list of neighboring points can easily be extracted:
import numpy as np
from itertools import product
from scipy.spatial import Delaunay

eps = 3.  # choose a value which filters out some points
A = np.array([[1,2],[3,4]])
B = np.array([[5,6],[7,8],[9,2]])

# triangulate points:
pts = np.vstack([A, B])
tri = Delaunay(pts)

# extract all edges:
si_idx = [[0, 1], [0, 2], [1, 2]]  # edge indices in tri.simplices
edges = [si[i] for si, i in product(tri.simplices, si_idx)]
dist_edges = [np.linalg.norm(tri.points[ii[0]] - tri.points[ii[1]])
              for ii in edges]  # calculate distances

# list points which are closer than eps:
for ee, d in zip(edges, dist_edges):
    if d < eps:
        print("|p[{}] - p[{}]| = {}".format(ee[0], ee[1], d))
As @David Wolever already noted, it is not clear from your question how exactly the points should be removed from the merged list.
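
If the goal is simply to drop every point of B that lies within eps of a point already in A, a KD-tree query avoids the Python loop entirely. This is a separate suggestion from the Delaunay approach above, a minimal sketch assuming SciPy is available:

import numpy as np
from scipy.spatial import cKDTree

eps = 0.1
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8], [9, 2]])

# Distance from every point in B to its nearest neighbour in A
tree = cKDTree(A)
dist, _ = tree.query(B, k=1)

# Keep only the B points that are farther than eps from all of A
merged = np.vstack([A, B[dist > eps]])
print(merged)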
