k-means clustering: calculate distance between all points and the initial centroids - python

I want to do image segmentation with k-means clustering, and I want to:
Calculate distances between all points and the initial centroids.
Assign all points to their closest centroid.
Here is my code:
import numpy as np

def init(ds, k, random_state=42):
    np.random.seed(random_state)
    centroids = [ds[0]]
    for _ in range(1, k):
        dist_sq = np.array([min([np.inner(c-x, c-x) for c in centroids]) for x in ds])
        probs = dist_sq/dist_sq.sum()
        cumulative_probs = probs.cumsum()
        r = np.random.rand()
        for j, p in enumerate(cumulative_probs):
            if r < p:
                i = j
                break
        centroids.append(ds[i])
    return np.array(centroids)

k = 4
centroids = init(pixels, k, random_state=42)
print(centroids)

# First centroid
centroids[0]

# Calculate distances between all points and the initial centroids.
# Assign all points to their closest centroid.
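For the two remaining steps, here is a minimal sketch, assuming `pixels` is an (N, D) NumPy array (e.g. image pixels reshaped to one row per pixel) and `centroids` is the (k, D) array returned by `init` above:

# Distances between all points and the initial centroids:
# dists[i, j] = Euclidean distance from pixel i to centroid j
dists = np.linalg.norm(pixels[:, None, :] - centroids[None, :, :], axis=2)

# Assign all points to their closest centroid (one centroid index per pixel)
labels = np.argmin(dists, axis=1)

The `labels` array can then be used to recolour each pixel with its centroid value for the segmentation.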

Related

normalize Euclidean distance - python

import numpy as np

def euclidean_distance(n):
    L = np.linalg.cholesky([[1.0, 0.60], [0.60, 1.0]])
    uncorrelated = np.random.standard_normal((2, n))
    correlated = np.dot(L, uncorrelated)
    A = correlated[0]
    B = correlated[1]
    v = np.linalg.norm(A-B)
    return v

v50 = euclidean_distance(50)
v1000 = euclidean_distance(1000)
The Euclidean distance gets larger the more data points I use in the computation. How can I normalize the distances so that I can compare the similarity between v50 and v1000?
You can normalize the distances by dividing them by the square root of the number of data points used in each computation. Try this:
v = np.linalg.norm(A-B) / np.sqrt(n)
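The reason the square root works: each component of A - B has variance 1 + 1 - 2(0.60) = 0.8, so E[||A - B||^2] = 0.8*n and the raw norm grows roughly like sqrt(n). A minimal sketch of the normalized function (identical to the one above except for the return value); with this change v50 and v1000 both land near sqrt(0.8) ≈ 0.89:

import numpy as np

def euclidean_distance(n):
    L = np.linalg.cholesky([[1.0, 0.60], [0.60, 1.0]])
    uncorrelated = np.random.standard_normal((2, n))
    A, B = np.dot(L, uncorrelated)
    # dividing by sqrt(n) removes the dependence on the sample size
    return np.linalg.norm(A - B) / np.sqrt(n)

v50 = euclidean_distance(50)
v1000 = euclidean_distance(1000)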

Wondering if my code is correct for Simple Kriging

I am a student and we learned about simple kriging with an example of three points with known elevations and one point with unknown elevation, under the assumption that the empirical semivariogram is represented by the linear regression line with Y intercept = 0 and slope = 4.0.
I attempted to generalize this for any number of points with known elevations and any number of points with unknown elevations. I will show my code, and then an edited version with 50 known points and a grid of 40,000 unknown points plotted so that it looks like an interpolated surface. I just want to know whether my code is working as it should, because I can't seem to find any examples of simple kriging done the way we did it in class.
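Concretely, the assumed semivariogram model is γ(h) = 0 + 4.0·h, so, for example, two points that are 2.5 distance units apart are assigned a semivariance of 4.0 × 2.5 = 10.0.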
'''
Simple Kriging using any number of known and unknown points
Assumption: Let us assume that the empirical semivariogram is represented
by the linear regression line with Y intercept = 0 and slope = 4.0
'''
__author__ = "Frank D"

import numpy as np
import matplotlib.pyplot as plt
import random

# Define the data in the form of a list of tuples
# k for known points xyz and u for unknown points xy
# k = [(3.00, 4.00, 120), (6.30, 3.40, 103.00), (2.00, 1.30, 142)]
# u = [(3.00, 3.00)]
k = [(random.uniform(0, 20), random.uniform(0, 20), random.uniform(100, 200)) for _ in range(50)]
u = [(random.uniform(0, 20), random.uniform(0, 20)) for _ in range(30)]

def distance(p1, p2):
    '''
    This function calculates the distance between two points.
    Keyword arguments:
    p1, p2 (tuple, list): coordinate with x and y as first and second indices.
    '''
    return ((p2[0]-p1[0])**2 + (p2[1]-p1[1])**2)**(1/2)

distance_matrix = np.zeros((len(k), len(k)))  # Creating empty matrix
for i in range(len(k)):      # Looping through the k rows of the matrix
    for j in range(len(k)):  # Looping through the elements in each row
        distance_matrix[i, j] = distance(k[i], k[j])
semivariance_matrix = 4*distance_matrix

distance_to_unknowns = np.zeros((len(u), len(k)))  # Creating empty matrix
for i in range(len(u)):
    for j in range(len(k)):
        distance_to_unknowns[i, j] = distance(u[i], k[j])
semivariance_to_unknowns = 4*distance_to_unknowns

# Assembling Gamma and appending Lagrange multipliers
# by adding ones to the right side and bottom and then
# setting the diagonal to zeros
gamma = np.append(semivariance_matrix, np.ones(
    [semivariance_matrix.shape[0], 1]), axis=1)
gamma = np.append(gamma, np.ones([1, gamma.shape[1]]), axis=0)
np.fill_diagonal(gamma, 0)

# Assembling vector Beta and appending Lagrange multipliers
beta = np.append(semivariance_to_unknowns,
                 np.ones([len(u), 1]), axis=1).transpose()

# Calculating lambda:
lambda_vector = np.matmul(np.linalg.inv(gamma), beta)

# Finding the variance
variance = np.zeros([len(u), 1])
for i in range(len(u)):
    for j in range(len(k)):
        variance[i][0] += lambda_vector[j][i]*semivariance_to_unknowns[i][j]

# Finding the standard error
std_error = np.sqrt(variance)

# Print known points to the console
for i in range(len(k)):
    print(f'k{i} = {k[i]}')

# Assembling results vector containing elevations for unknown points
# and printing the results to the console
results = np.zeros([len(u), 1])
for i in range(len(u)):
    for j in range(len(k)):
        results[i][0] += lambda_vector[j][i]*k[j][2]
    print(f'u{i} = ({u[i][0]}, {u[i][1]}, {results[i][0]}), variance =' +
          f' {round(variance[i][0], 2)}, standard error =' +
          f' {round(std_error[i][0], 2)}, 95% CI = ' +
          f'({round(results[i][0] - 1.96*std_error[i][0], 2)},' +
          f' {round(results[i][0] + 1.96*std_error[i][0], 2)})')

# Plotting the results
x_known = [point[0] for point in k]
y_known = [point[1] for point in k]
x_unknown = [point[0] for point in u]
y_unknown = [point[1] for point in u]
plt.scatter(x_known, y_known)
plt.scatter(x_unknown, y_unknown, color='red')
plt.title('Scatterplot of x vs y')
plt.xlabel("x")
plt.ylabel("y", rotation='horizontal')
for i in range(len(k)):  # adding elevation labels for known points
    plt.annotate(round(k[i][2], 2), (k[i][0], k[i][1]))
for i in range(len(u)):  # adding elevation labels for unknown points
    plt.annotate(round(results[i][0], 2), (u[i][0], u[i][1]))
plt.show()
Here is the edited version with 40 000 points and a color gradient added to make it look like an interpolated surface.
'''
Simple Kriging using any number of known and a grid of unknown points
Assumption: Let us assume that the empirical semivariogram is represented
by the linear regression line with Y intercept = 0 and slope = 4.0
'''
__author__ = "Frank D"

import numpy as np
import matplotlib.pyplot as plt
import random

# Define the data in the form of a list of tuples
# k for known points xyz and u for unknown points xy
k = [(random.uniform(0, 20), random.uniform(0, 20), random.uniform(100, 200)) for _ in range(50)]
u = []
for j in range(200):
    u += [(0.1*i, 0.1*j) for i in range(200)]
x_u_fill = [point[0] for point in u]
y_u_fill = [point[1] for point in u]

def distance(p1, p2):
    '''
    This function calculates the distance between two points.
    Keyword arguments:
    p1, p2 (tuple, list): coordinate with x and y as first and second indices.
    '''
    return ((p2[0]-p1[0])**2 + (p2[1]-p1[1])**2)**(1/2)

distance_matrix = np.zeros((len(k), len(k)))  # Creating empty matrix
for i in range(len(k)):      # Looping through the k rows of the matrix
    for j in range(len(k)):  # Looping through the elements in each row
        distance_matrix[i, j] = distance(k[i], k[j])
semivariance_matrix = 4*distance_matrix

distance_to_unknowns = np.zeros((len(u), len(k)))  # Creating empty matrix
for i in range(len(u)):
    for j in range(len(k)):
        distance_to_unknowns[i, j] = distance(u[i], k[j])
semivariance_to_unknowns = 4*distance_to_unknowns

# Assembling Gamma and appending Lagrange multipliers
# by adding ones to the right side and bottom and then
# setting the diagonal to zeros
gamma = np.append(semivariance_matrix, np.ones(
    [semivariance_matrix.shape[0], 1]), axis=1)
gamma = np.append(gamma, np.ones([1, gamma.shape[1]]), axis=0)
np.fill_diagonal(gamma, 0)

# Assembling vector Beta and appending Lagrange multipliers
beta = np.append(semivariance_to_unknowns,
                 np.ones([len(u), 1]), axis=1).transpose()

# Calculating lambda:
lambda_vector = np.matmul(np.linalg.inv(gamma), beta)

# Finding the variance
variance = np.zeros([len(u), 1])
for i in range(len(u)):
    for j in range(len(k)):
        variance[i][0] += lambda_vector[j][i]*semivariance_to_unknowns[i][j]

# Finding the standard error
std_error = np.sqrt(variance)

# Print known points to the console
for i in range(len(k)):
    print(f'k{i} = {k[i]}')

# Assembling results vector containing elevations for unknown points
# and printing the results to the console
results = np.zeros([len(u), 1])
for i in range(len(u)):
    for j in range(len(k)):
        results[i][0] += lambda_vector[j][i]*k[j][2]
    print(f'u{i} = ({u[i][0]}, {u[i][1]}, {results[i][0]}), ' +
          f'variance = {round(variance[i][0], 2)}, standard error = ' +
          f'{round(std_error[i][0], 2)}, 95% CI = ' +
          f'({round(results[i][0] - 1.96*std_error[i][0], 2)}, ' +
          f'{round(results[i][0] + 1.96*std_error[i][0], 2)})')

# Plotting the results
fig, ax = plt.subplots()
scat_fill = ax.scatter(x_u_fill, y_u_fill, c=results, cmap='jet')
scat_fill.set_clim(100, 200)
x_known = [point[0] for point in k]
y_known = [point[1] for point in k]
plt.scatter(x_known, y_known, s=20)
plt.title('Scatterplot of x vs y')
plt.xlabel("x")
plt.ylabel("y", rotation='horizontal')
for i in range(len(k)):  # adding elevation labels for known points
    plt.annotate(round(k[i][2], 2), (round(k[i][0], 2), round(k[i][1], 2)))
plt.show()
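If the nested distance loops become slow for the 40,000-point grid, both distance matrices can each be built in a single call with SciPy's scipy.spatial.distance.cdist. A minimal sketch reusing the variable names from the script above (only the matrix construction changes; the rest of the code stays the same):

import numpy as np
from scipy.spatial.distance import cdist

k_xy = np.array([(x, y) for x, y, z in k])  # x, y of the known points only
u_xy = np.array(u)                          # x, y of the unknown grid points

# Pairwise Euclidean distances, scaled by the semivariogram slope of 4.0
semivariance_matrix = 4 * cdist(k_xy, k_xy)       # shape (len(k), len(k))
semivariance_to_unknowns = 4 * cdist(u_xy, k_xy)  # shape (len(u), len(k))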

Returning the size of detected clusters

I applied bisecting k-means clustering to my high-dimensional database and want to display the size of the resulting clusters, e.g. cluster 1 = 2000 elements, cluster 2 = 3489 elements, and so on.
Which function do I need in order to display the sizes? The visualisation itself works, as can be seen in the 2-D plot (plot of the cluster sizes omitted).
The functions look like the following:
import numpy as np
import matplotlib.pyplot as plt

def convert_to_2d_array(points):
    """
    Converts `points` to a 2-D numpy array.
    """
    points = np.array(points)
    if len(points.shape) == 1:
        points = np.expand_dims(points, -1)
    return points

def visualize_clusters(clusters):
    """
    Visualizes the first 2 dimensions of the data as a 2-D scatter plot.
    """
    plt.figure()
    for cluster in clusters:
        points = convert_to_2d_array(cluster)
        if points.shape[1] < 2:
            points = np.hstack([points, np.zeros_like(points)])
        plt.plot(points[:, 0], points[:, 1], 'o')
    plt.show()

def SSE(points):
    """
    Calculates the sum of squared errors for the given list of data points.
    """
    points = convert_to_2d_array(points)
    centroid = np.mean(points, 0)
    errors = np.linalg.norm(points-centroid, ord=2, axis=1)
    return np.sum(errors)

def kmeans(points, k=2, epochs=10, max_iter=100, verbose=False):
    """
    Clusters the list of points into `k` clusters using k-means clustering
    algorithm.
    """
    points = convert_to_2d_array(points)
    assert len(points) >= k, "Number of data points can't be less than k"
    best_sse = np.inf
    for ep in range(epochs):
        # Randomly initialize k centroids
        np.random.shuffle(points)
        centroids = points[0:k, :]
        last_sse = np.inf
        for it in range(max_iter):
            # Cluster assignment
            clusters = [None] * k
            for p in points:
                index = np.argmin(np.linalg.norm(centroids-p, 2, 1))
                if clusters[index] is None:
                    clusters[index] = np.expand_dims(p, 0)
                else:
                    clusters[index] = np.vstack((clusters[index], p))
            # Centroid update
            centroids = [np.mean(c, 0) for c in clusters]
            # SSE calculation
            sse = np.sum([SSE(c) for c in clusters])
            gain = last_sse - sse
            if verbose:
                print((f'Epoch: {ep:3d}, Iter: {it:4d}, '
                       f'SSE: {sse:12.4f}, Gain: {gain:12.4f}'))
            # Check for improvement
            if sse < best_sse:
                best_clusters, best_sse = clusters, sse
            # Epoch termination condition
            if np.isclose(gain, 0, atol=0.00001):
                break
            last_sse = sse
    return best_clusters

def bisecting_kmeans(points, k=2, epochs=10, max_iter=100, verbose=False):
    """
    Clusters the list of points into `k` clusters using bisecting k-means
    clustering algorithm. Internally, it uses the standard k-means with k=2 in
    each iteration.
    """
    points = convert_to_2d_array(points)
    clusters = [points]
    while len(clusters) < k:
        max_sse_i = np.argmax([SSE(c) for c in clusters])
        cluster = clusters.pop(max_sse_i)
        two_clusters = kmeans(
            cluster, k=2, epochs=epochs, max_iter=max_iter, verbose=verbose)
        clusters.extend(two_clusters)
    return clusters
I thank you in advance for your help!
Best regards,
Fatih
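Since bisecting_kmeans returns a plain Python list with one NumPy array per cluster (one row per point), the cluster sizes are just the row counts of those arrays. A minimal sketch, assuming X is the high-dimensional data you clustered:

clusters = bisecting_kmeans(X, k=4)
for i, cluster in enumerate(clusters, start=1):
    # each cluster is a 2-D array; its number of rows is the cluster size
    print(f'cluster {i} = {len(cluster)} elements')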

Fitting an Orthogonal Grid to Noisy Coordinates

Problem
I have a list of coordinates that are meant to form a grid. Each coordinate has a random error component and some of the coordinates are missing. The grid could also be rotated (update). I want to fit an orthogonal grid to the data points and return a list of the grid's vertices. For example: (illustration omitted)
Application
The purpose is to find a grid in a scanned image. The data points come from the results of contour or edge detection in OpenCV. An example is an image with a grid of photos.
Goal
I wrote some Python code that works, but I would like to find a linear algebra algorithm using SciPy, statsmodels or other modules that would be more robust and handle a small rotation of the grid (less than 10°).
Python Code Using Lists Only
# Noisy [x, y] coordinates (origin is upper-left corner)
pts = [[103, 101],
       [198, 103],
       [300,  99],
       [ 97, 205],
       [304, 202],
       [102, 295],
       [200, 303],
       [104, 405],
       [205, 394],
       [298, 401]]

def row_col_avgs(num_list, ratio):
    # Finds the average of each row and column. Coordinates are
    # assigned to a row and column by specifying an error ratio.
    last_num, sum_nums, count_nums, avgs = 0, 0, 0, []
    num_list.sort()
    for num in num_list:
        # Calculate average for last row or column and begin new row or column
        if num > (1+ratio)*last_num and count_nums != 0:
            avgs.append(int(round(sum_nums/count_nums, 0)))
            sum_nums = num
            count_nums = 1
        # Or continue with current row or column
        else:
            sum_nums += num
            count_nums += 1
        last_num = num
    avgs.append(int(round(sum_nums/count_nums, 0)))
    return avgs

# Split coordinates into two lists of x's and y's
xs, ys = map(list, zip(*pts))

# Find averages of each row and column of the grid
x_avgs = row_col_avgs(xs, 0.1)
y_avgs = row_col_avgs(ys, 0.1)

# Return vertices of completed averaged grid
avg_grid = []
for y_avg in y_avgs:
    avg_row = []
    for x_avg in x_avgs:
        avg_row.append([int(x_avg), int(y_avg)])
    avg_grid.append(avg_row)
print(avg_grid)
Output
[[[102, 101], [201, 101], [301, 101]],
[[102, 204], [201, 204], [301, 204]],
[[102, 299], [201, 299], [301, 299]],
[[102, 400], [201, 400], [301, 400]]]
Parallel Slopes Ordinary Least Squares (OLS) Model:
y = mx + grp + b, where m = slope, b = y-intercept, and grp = categorical variable.
This is an alternative algorithm that can handle a rotated grid.
The OLS model includes both the data points in the original orientation
and a 90° rotation of the same data points. This is necessary so all gridlines are parallel and have the same slope.
Algorithm:
1. Find a reference gridline to compare with the remaining points by choosing two neighboring points in the first or last row with a slope closest to zero.
2. Calculate the distances between this reference line and the remaining points.
3. Segment the points into groups w.r.t. the calculated distances (one group per gridline).
4. Repeat steps 1 to 3 for the 90° rotated grid and combine the results.
5. Create a parallel slopes OLS model to determine the linear equations for the gridlines.
6. Rotate the rotated gridlines back to their original orientation.
7. Calculate the intersection points.
Note: this fails if the noise, the rotation angle and/or the amount of missing data are too large.
Example: (plot omitted)
Python Code to Create Example
def create_random_example():
    # Requires import of numpy and random packages
    # Creates grid with random noise and missing points
    # Example will fail if std_dev, rotation, pct_removed too large
    # Parameters
    first_row, last_row = 100, 900
    first_col, last_col = 100, 600
    num_rows = 6
    num_cols = 4
    rotation = 3     # degrees that grid is rotated
    sd = 3           # percent std dev of avg x and avg y coordinates
    pct_remove = 30  # percent of points to randomly remove from data
    # Create grid
    x = np.linspace(first_col, last_col, num_cols)
    y = np.linspace(first_row, last_row, num_rows)
    xx, yy = np.meshgrid(x, y)
    # Add noise
    x = xx.flatten() + sd * np.mean(xx) * np.random.randn(xx.size) / 100
    y = yy.flatten() + sd * np.mean(yy) * np.random.randn(yy.size) / 100
    # Randomly remove points
    random_list = random.sample(range(0, num_cols*num_rows),
                                int(pct_remove*num_cols*num_rows/100))
    x, y = np.delete(x, random_list), np.delete(y, random_list)
    pts = np.column_stack((x, y))
    # Rotate points
    radians = np.radians(rotation)
    rot_mat = np.array([[np.cos(radians), -np.sin(radians)],
                        [np.sin(radians),  np.cos(radians)]])
    einsum = np.einsum('ji, mni -> jmn', rot_mat, [pts])
    pts = np.squeeze(einsum).T
    return np.rint(pts)
Python Code to Fit Gridlines
import numpy as np
import pandas as pd
import itertools
import math
import random
from statsmodels.formula.api import ols
from scipy.spatial import KDTree
import matplotlib.pyplot as plt

def pt_line_dist(pt, ref_line):
    pt1, pt2 = [ref_line[:2], ref_line[2:]]
    # Distance from point to line defined by two other points
    return np.linalg.norm(np.cross(pt1 - pt2, [pt[0], pt[1]])) \
        / np.linalg.norm(pt1 - pt2)

def segment_pts(amts, grp_var, grp_label):
    # Segment on amounts (distances here) in last column of array
    # Note: need to label groups with string for OLS model
    amts = amts[amts[:, -1].argsort()]
    first_amt_in_grp = amts[0][-1]
    group, groups, grp = [], [], 0
    for amt in amts:
        if amt[-1] - first_amt_in_grp > grp_var:
            groups.append(group)
            first_amt_in_grp = amt[-1]
            group = []; grp += 1
        group.append(np.append(amt[:-1], [[grp_label + str(grp)]]))
    groups.append(group)
    return groups

def find_reference_line(pts):
    # Find point with minimum absolute slope relative both min y and max y
    y = np.hsplit(pts, 2)[1]  # y column of array
    m = []
    for i, y_pt in enumerate([pts[np.argmin(y)], pts[np.argmax(y)]]):
        m.append(np.zeros((pts.shape[0]-1, 5)))  # dtype default is float64
        m[i][:, 2:4] = np.delete(pts, np.where((pts == y_pt).all(axis=1))[0], axis=0)
        m[i][:, 4] = abs((m[i][:, 3]-y_pt[1]) / (m[i][:, 2]-y_pt[0]))
        m[i][:, :2] = y_pt
    m = np.vstack((m[0], m[1]))
    return m[np.argmin(m[:, 4]), :4]

# Ignore division by zero (slopes of vertical lines)
np.seterr(divide='ignore')

# Create dataset and plot
pts = create_random_example()
plt.scatter(pts[:, 0], pts[:, 1], c='r')  # plot now because pts array changes

# Average distance to the nearest neighbor of each point
tree = KDTree(pts)
nn_avg_dist = np.mean(tree.query(pts, 2)[0][:, 1])

# Find groups of points representing each gridline
groups = []
for orientation in ['o', 'r']:  # original and rotated orientations
    # Rotate points 90 degrees (note: this moves pts to 2nd quadrant)
    if orientation == 'r':
        pts[:, 1] = -1 * pts[:, 1]
        pts[:, [1, 0]] = pts[:, [0, 1]]
    # Find reference line to compare remaining points for grouping
    ref_line = find_reference_line(pts)  # line is defined by two points
    # Distances between points and reference line
    pt_dists = np.zeros((pts.shape[0], 3))
    pt_dists[:, :2] = pts
    pt_dists[:, 2] = np.apply_along_axis(pt_line_dist, 1, pts, ref_line).T
    # Segment pts into groups w.r.t. distances (one group per gridline)
    # Groups have range less than nn_avg_dist.
    groups += segment_pts(pt_dists, 0.7*nn_avg_dist, orientation)

# Create dataframe of groups (OLS model requires a dataframe)
df = pd.DataFrame(np.row_stack(groups), columns=['x', 'y', 'grp'])
df['x'] = pd.to_numeric(df['x'])
df['y'] = pd.to_numeric(df['y'])

# Parallel slopes OLS model
ols_model = ols("y ~ x + grp + 0", data=df).fit()

# OLS parameters
grid_lines = ols_model.params[:-1].to_frame()  # pandas series to dataframe
grid_lines = grid_lines.rename(columns={0: 'b'})
grid_lines['grp'] = grid_lines.index.str[4:6]
grid_lines['m'] = ols_model.params[-1]  # slope

# Rotate the rotated lines back to their original orientation
grid_lines.loc[grid_lines['grp'].str[0] == 'r', 'b'] = grid_lines['b'] / grid_lines['m']
grid_lines.loc[grid_lines['grp'].str[0] == 'r', 'm'] = -1 / grid_lines['m']

# Find grid intersection points by combinations of gridlines
comb = list(itertools.combinations(grid_lines['grp'], 2))
comb = [i for i in comb if i[0][0] != 'r']
comb = [i for i in comb if i[1][0] != 'o']
df_comb = pd.DataFrame(comb, columns=['grp', 'r_grp'])

# Merge gridline parameters with grid points
grid_pts = df_comb.merge(grid_lines.drop_duplicates('grp'), how='left', on='grp')
grid_lines.rename(columns={'grp': 'r_grp'}, inplace=True)
grid_pts.rename(columns={'b': 'o_b', 'm': 'o_m', 'grp': 'o_grp'}, inplace=True)
grid_pts = grid_pts.merge(grid_lines.drop_duplicates('r_grp'), how='left', on='r_grp')
grid_pts.rename(columns={'b': 'r_b', 'm': 'r_m'}, inplace=True)

# Calculate x, y coordinates of gridline interception points
grid_pts['x'] = (grid_pts['r_b'] - grid_pts['o_b']) \
    / (grid_pts['o_m'] - grid_pts['r_m'])
grid_pts['y'] = grid_pts['o_m'] * grid_pts['x'] + grid_pts['o_b']

# Results output
print(grid_lines)
print(grid_pts)
plt.scatter(grid_pts['x'], grid_pts['y'], s=8, c='b')  # for setting axes
axes = plt.gca()
axes.invert_yaxis()
axes.xaxis.tick_top()
axes.set_aspect('equal')
axes.set_xlim(axes.get_xlim())
axes.set_ylim(axes.get_ylim())
x_vals = np.array(axes.get_xlim())
for idx in grid_lines.index:
    y_vals = grid_lines['b'][idx] + grid_lines['m'][idx] * x_vals
    plt.plot(x_vals, y_vals, c='gray')
plt.show()
A numpy implementation of your code can be found below. As the size of AvgGrid is known in advance, I pre-allocate the required memory (rather than appending). This should have speed advantages, especially when the number of output vertices is large.
import numpy as np

# Input of [x, y] coordinates of a sparse grid with errors
xys = np.array([[103, 101],
                [198, 103],
                [300,  99],
                [ 97, 205],
                [304, 202],
                [102, 295],
                [200, 303],
                [104, 405],
                [205, 394],
                [298, 401]])

# Function to average
def ColAvgs(CoordinateList, CutoffRatio=1.1):
    # Length of CoordinateList
    L = len(CoordinateList)
    # Sort input
    SortedList = np.sort(CoordinateList)
    # Determine indices to average
    RelativeIncrease = SortedList[-(L-1):]/SortedList[:(L-1)]
    CriticalIndices = np.flatnonzero(RelativeIncrease > CutoffRatio) + 1
    Indices = np.hstack((0, CriticalIndices))
    if (Indices[-1] != L):
        Indices = np.hstack((Indices, L))
    # print(Indices)  # Uncomment to show index construction
    # Compute averages
    Avgs = np.empty((len(Indices)-1)); Avgs[:] = np.NaN
    for iter in range(len(Avgs)):
        Avgs[iter] = int(round(np.mean(SortedList[Indices[iter]:Indices[(iter+1)]])))
    # Return output
    return Avgs

# Compute x- and y-coordinates of vertices
AvgsXcoord = ColAvgs(xys[:, 0])
AvgsYcoord = ColAvgs(xys[:, 1])

# Return all vertices
AvgGrid = np.empty((len(AvgsXcoord)*len(AvgsYcoord), 2)); AvgGrid[:] = np.NaN
iter = 0
for y in AvgsYcoord:
    for x in AvgsXcoord:
        AvgGrid[iter, :] = np.hstack((x, y))
        iter = iter + 1
print(AvgGrid)
If you project all points onto a vertical or horizontal axis, the problem turns into one of clustering with equally spaced clusters.
To perform these clusterings, you can consider the distances between successive (sorted) points. They will form two clusters: short distances corresponding to noise, and longer ones corresponding to the grid spacing. You can solve this two-way clustering with the Otsu method.
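A minimal sketch of that idea for one axis, using the pts list from the question and skimage's threshold_otsu to split the small noise gaps from the large grid-spacing gaps (running the same code on the y coordinates gives the row positions):

import numpy as np
from skimage.filters import threshold_otsu

pts = [[103, 101], [198, 103], [300, 99], [97, 205], [304, 202],
       [102, 295], [200, 303], [104, 405], [205, 394], [298, 401]]

xs = np.sort(np.array([p[0] for p in pts]))  # project onto the x axis and sort
gaps = np.diff(xs)                           # distances between successive points
t = threshold_otsu(gaps)                     # Otsu threshold separates the two gap clusters
breaks = np.flatnonzero(gaps > t) + 1        # positions where a new grid column starts
columns = np.split(xs, breaks)               # one group of x coordinates per column
x_positions = [int(round(c.mean())) for c in columns]
print(x_positions)                           # roughly [102, 201, 301] for this data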

Finding Optimal Value of K

How do I calculate mean_distances, the mean of the summed distances from each centroid to the points in its cluster, over k clusters?
Formula: (image omitted)
My Code:
def mean_distances(k, X):
    """
    Arguments:
    k -- int, number of clusters
    X -- np.array, matrix of input features
    Returns:
    Array of shape (k, ), containing mean of sum distances
    from centroid to each point in the cluster for k clusters
    """
    ### START CODE HERE ###
    mod = KMeans(X, k)
    clusters, final_centrs = mod.final_centroids()
    dist = []
    for i in range(k):
        d = np.sum(np.linalg.norm((clusters[i] - final_centrs[i, :])**2)).mean()
        dist.append(d)
    return dist
    ### END CODE HERE ###
But it doesn't work correctly.
(P.S. without sklearn, just numpy)
You are taking the mean of each element of the outer sum (i.e. of each inner sum), as opposed to the mean of the outer sum:
import numpy as np
from sklearn.cluster import KMeans

def mean_distances(k, X):
    """
    Arguments:
    k -- int, number of clusters
    X -- np.array, matrix of input features
    Returns:
    Array of shape (k, ), containing mean of sum distances
    from centroid to each point in the cluster for k clusters
    """
    mod = KMeans(X, k)
    clusters, final_centrs = mod.final_centroids()
    dist = []
    for i in range(k):
        d = np.sum(np.linalg.norm((clusters[i] - final_centrs[i, :])**2))
        dist.append(d)
    return np.mean(dist)  # mean of the per-cluster sums (dist is a plain list)
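For the "optimal k" search itself, a hypothetical usage sketch (assuming the same custom KMeans(X, k) class from the question and a data matrix X are available) is to evaluate mean_distances over a range of k values and look for the elbow:

ks = range(2, 11)
scores = [mean_distances(k, X) for k in ks]
# Plot scores against ks and pick the k where the curve bends (the "elbow").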
