Dynamic time warping with python (final mapping) - python

I need to align two sound signals in order to map one onto the other (both signals correspond to the same behavior). I am trying to implement the Python code from:
https://nipunbatra.github.io/blog/2014/dtw.html
as a function to be called by my code. An example:
#time warping sound function trial
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
my_path ='/home/...'
def time_warping (x,y,fs,name):
distances = np.zeros((len(y), len(x)))
accumulated_cost = np.zeros((len(y), len(x)))
accumulated_cost[0,0] = distances[0,0]
def distance_cost_plot(distances):
    """Render the distance matrix as a red heat map on the current figure.

    The y axis is flipped so that row 0 is drawn at the bottom, the axes are
    labelled "X" and "Y", and a grid and colour bar are added.  The figure is
    closed right away without being shown or saved; nothing is returned.
    """
    plt.imshow(distances, cmap='Reds', interpolation='nearest')
    axes = plt.gca()
    axes.invert_yaxis()
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.grid()
    plt.colorbar()
    # The interactive display is intentionally disabled: plt.show()
    plt.close()
def path_cost(x, y, accumulated_cost, distances):
    """Backtrack the DTW warping path and sum the distances along it.

    Both matrices are indexed ``[i, j]`` with ``i`` running over ``y`` and
    ``j`` running over ``x``.

    Parameters:
        x, y: the two sequences; only their lengths are used here.
        accumulated_cost: accumulated-cost matrix of shape (len(y), len(x)).
        distances: pairwise distance matrix of the same shape.

    Returns:
        (path, cost) where ``path`` is a list of ``[j, i]`` pairs (index into
        ``x`` first, index into ``y`` second) going from the last cell
        ``[len(x)-1, len(y)-1]`` down to ``[0, 0]``, and ``cost`` is the sum
        of ``distances[i, j]`` over the cells of the path.

    Raises:
        ValueError: if either sequence is empty.
    """
    if len(x) == 0 or len(y) == 0:
        raise ValueError("path_cost needs two non-empty sequences")

    i = len(y) - 1
    j = len(x) - 1
    path = [[j, i]]

    # Keep walking until BOTH indices reach zero.  Stopping as soon as one of
    # them hits zero (the previous ``and`` condition) skipped every cell along
    # the matrix border and made the two edge branches below unreachable.
    while i > 0 or j > 0:
        if i == 0:
            j -= 1
        elif j == 0:
            i -= 1
        else:
            up = accumulated_cost[i - 1, j]
            left = accumulated_cost[i, j - 1]
            diagonal = accumulated_cost[i - 1, j - 1]
            best = min(diagonal, up, left)
            # Tie-breaking order is kept: up, then left, then diagonal.
            if up == best:
                i -= 1
            elif left == best:
                j -= 1
            else:
                i -= 1
                j -= 1
        path.append([j, i])

    # The loop always ends on [0, 0], so that cell is not appended a second
    # time (a duplicate would count distances[0, 0] twice).
    cost = 0
    for col, row in path:
        cost = cost + distances[row, col]
    return path, cost
#Here I apply the function over function x and y
path, cost = path_cost(x, y, accumulated_cost, distances)
for i in range(len(y)):
for j in range(len(x)):
distances[i,j] = (x[j]-y[i])**2
#Here I plot the distance
g=distance_cost_plot(distances)
accumulated_cost = np.zeros((len(y), len(x)))
accumulated_cost[0,0] = distances[0,0]
for i in range(1, len(y)):
accumulated_cost[i,0] = distances[i, 0] + accumulated_cost[i-1, 0]
for i in range(1, len(x)):
accumulated_cost[0,i] = distances[0,i] + accumulated_cost[0, i-1]
for i in range(1, len(y)):
for j in range(1, len(x)):
accumulated_cost[i, j] = min(accumulated_cost[i-1, j-1], accumulated_cost[i-1, j], accumulated_cost[i, j-1]) + distances[i, j]
#empty lists for the mapping
map_x_final =[]
map_y_final =[]
map_x_f_final =[]
map_y_f_final =[]
paths = path_cost(x, y, accumulated_cost, distances)[0] #no entiendo la sintaxis de esta linea
print 'path',paths
print 'accumulated_cost',accumulated_cost
print 'distances',distances
#print 'paths.shape',path.shape
plt.figure(figsize=(14,8)) # 8 plots in one
plt.subplot(2,1,1)
grid(True)
map_x_fx =[]
map_y_fy =[]
map_y_fy_newlist =[]
for [map_x, map_y] in paths:
#print map_x, x[map_x], ":", map_y, y[map_y]
plt.plot([map_x*float(1)/float(fs), map_y*float(1)/float(fs)], [x[map_x], y[map_y]], 'r')
#plt.plot([map_x, map_y], [x[map_x], y[map_y]], 'r')
#saving in empy list
map_x_fx.append([map_x,x[map_x]])
map_y_fy.append([map_x,y[map_y]])
map_x_final.append(map_x)
map_y_final.append(map_y)
map_x_f_final.append(x[map_x])
map_y_f_final.append(y[map_y])
dif_a_sumar = (map_y-map_x)*float(1)/float(fs)
map_x_final = np.asarray(map_x_final)
map_y_final = np.asarray(map_y_final)
map_x_f_final = np.asarray(map_x_f_final)
map_y_f_final = np.asarray(map_y_f_final)
####
map_x_final_vec = np.asarray(map_x_fx)
map_y_final_vec = np.asarray(map_y_fy)
#Skip the elements that have already been mapped
lista_aux=[]
for j,[a,b] in enumerate(map_y_fy):
print j,':', [a,b]
print len( map_x_final[:j])
if a not in map_x_final[:j]:
lista_aux.append([a,b])
else:
pass
print'++++++'
print'lista aux len: ',len(lista_aux)
map_y_final_vec_ =np.asarray(lista_aux)
print'++++'
print 'map_y_fy',len(map_y_fy)
print'*************************'
#print ' a veer map_x_fx: ',map_x_fx
#print ' a veer map_x_fx type: ',type(map_x_fx)
#print ' map_y_f_final_vec shape',map_y_f_final_vec.shape
#print ' a veer map_x_final_vec: ',map_x_final_vec
#print ' a veer map_x_final_vec[0]: ',map_x_final_vec[0]
print'*************************'
print 'x shape',x.shape
print 'y shape',y.shape
print 'map_x_f_final',map_x_f_final.shape
print 'map_y_f_final',map_y_f_final.shape
print 'map_y_final_vec shape',map_y_final_vec.shape
print 'map_y_final_vec_ shape',map_y_final_vec_.shape
print'*************************'
#print map_x_final.size, map_y_final.size, map_x_f_final.size, map_y_f_final.size
time_x = np.arange(x.size)*float(1)/float(fs)
time_y = np.arange(y.size)*float(1)/float(fs)
time_map_x = np.arange(map_x_f_final.size)*float(1)/float(fs)
time_map_y = np.arange(map_y_f_final.size)*float(1)/float(fs)
plt.plot(time_x,x, 'bo-',linewidth=1 ,label='funcion target: X ')#'bo-'
plt.plot(time_y,y, 'go-',linewidth=1,markersize=3, label = 'funcion a proyectar :Y')#'g^-'
plt.legend(fontsize= 'small')
plt.ylabel('Signal')
plt.xlabel('time [s]')
plt.subplot(2,1,2) #los graficos mapeados
grid(True)
plt.plot(time_x,x, 'b',linewidth=1 ,label='funcion target: X sonido-vs')#o-
plt.plot(time_y,y, 'g',linewidth=1,markersize=3, label = 'funcion a proyectar :Y sonido-p')#'g^-'
plt.plot(map_y_final_vec_[:, 0]*float(1)/float(fs), map_y_final_vec_[:,1],'yo-',markersize=5, label='funcion Y mapeada donde convergen con DTW sobre X')#'m^'
plt.ylabel('Signal')
plt.xlabel('time [s]')
plt.legend(fontsize= 'small')
figname = "%s.jpg"%('alineado_dtw_'+name)
plt.savefig(my_path+figname,dpi=200)
#plt.show()
plt.close()
mapeo_time = map_y_final_vec_[:, 0]*float(1)/float(fs)
mapeo_amplitude = map_y_final_vec_[:,1]
return mapeo_time, mapeo_amplitude
I am able to obtain the distance between both signals:
But I'm not sure with the final mapping.
Am I doing something wrong with my mapping? I need to project one signal over the other, rescaling the first one with the other. I also tried with these two real signals:
I tried to compare with https://pypi.python.org/pypi/fastdtw and also with the mlpy library, but I get a different signal mapping.
I also put everything on https://github.com/katejarne/dtw with the data set to generate the last figure and the mapping.

Related

Python: got an output image with unexpected grid lines

I am writing a function that scales the input image to s times
its input size. The function Resize(Mat I, float s) first fills in the X and Y Mats
that contain the query point coordinates. Then I calculate the query value by
using bilinear interpolation.
The output image seems to be alright except it has an unexpected # shape grid on it. Can you provide any hint for the resolution?
Output image:
Code:
import numpy as np
import cv2 as cv
import math
import matplotlib.pyplot as plt
#Mat I, float s
def Resize(I, s):
    """Scale a 2-D grayscale image by the factor ``s`` with bilinear interpolation.

    Query points are spread evenly so that the first and last output pixels
    land exactly on the first and last input pixels along each axis.

    Parameters:
        I: 2-D array of pixel intensities, indexed ``[row, column]``.
        s: scale factor; each output dimension is ``int(dimension * s)``.

    Returns:
        A ``np.uint8`` array of shape ``(int(rows * s), int(columns * s))``.
        For square inputs this is the same shape as before; for non-square
        inputs rows and columns are no longer swapped.
    """
    src_rows = I.shape[0]
    src_cols = I.shape[1]
    dst_rows = int(src_rows * s)
    dst_cols = int(src_cols * s)

    # Fractional source coordinates of every output row / column.  linspace
    # also copes with a single-pixel target, where the former explicit
    # interval (orig - 1) / (tar - 1) divided by zero.
    row_pos = np.linspace(0, src_rows - 1, dst_rows)
    col_pos = np.linspace(0, src_cols - 1, dst_cols)

    # Lower neighbour, and upper neighbour clamped to the image border.
    row_lo = np.floor(row_pos).astype(int)
    col_lo = np.floor(col_pos).astype(int)
    row_hi = np.minimum(row_lo + 1, src_rows - 1)
    col_hi = np.minimum(col_lo + 1, src_cols - 1)

    # Weights are written as (1 - w, w) so they always sum to one.  With the
    # floor/ceil form both weights were zero whenever a coordinate was an
    # integer, which painted black rows and columns across the output.
    row_w = (row_pos - row_lo)[:, np.newaxis]
    col_w = (col_pos - col_lo)[np.newaxis, :]

    pixels = I.astype(np.float64)
    upper = ((1 - col_w) * pixels[np.ix_(row_lo, col_lo)]
             + col_w * pixels[np.ix_(row_lo, col_hi)])
    lower = ((1 - col_w) * pixels[np.ix_(row_hi, col_lo)]
             + col_w * pixels[np.ix_(row_hi, col_hi)])
    blended = (1 - row_w) * upper + row_w * lower

    # Round to nearest instead of truncating when converting back to 8 bit.
    return np.clip(np.rint(blended), 0, 255).astype(np.uint8)
s= 640 / 256
I = cv.imread("aerial_256.png", cv.IMREAD_GRAYSCALE)
output = Resize(I,s)
output = cv.cvtColor(output, cv.COLOR_BGR2RGB)
plt.imshow(output)
plt.savefig("aerial_640.png",bbox_inches='tight',transparent=True, pad_inches=0)
plt.show()
You are getting a black pixel where x is an integer and where y is an integer.
Take a look at the following code:
x1 = math.floor(x)
x2 = math.ceil(x)
vq1= (x-x1)*I[y1,x2] + (x2-x)*I[y1,x1]
vq2= (x-x1)*I[y2,x2] + (x2-x)*I[y2,x1]
Assume: x = 85.0
x1 = floor(x) = 85
x2 = ceil(x) = 85
(x-x1) = (85-85) = 0
(x2-x) = (85-85) = 0
vq1 = (x-x1)*I[y1,x2] + (x2-x)*I[y1,x1] = 0*I[y1,x2] + 0*I[y1,x1] = 0
vq2 = (x-x1)*I[y2,x2] + (x2-x)*I[y2,x1] = 0*I[y2,x2] + 0*I[y2,x1] = 0
output[i,j] = (y-y1)*vq2 + (y2-y)*vq1 = (y-y1)*0 + (y2-y)*0 = 0
Result:
In the entire column where x = 85.0 the value of output[i,j] is zero (we are getting a black column).
Same result applied to y = 85.0 - we are getting a black row.
When is the value of x an integer?
Take a look at the following code:
# calc interval between output points
interval = (orig_x-1) / (tar_x-1)
# Setting the query points
for i in range(0, tar_y):
for j in range(0, tar_x):
#set X[i, j] and Y[i,j]
X[i][j] = j * interval
interval = (orig_x-1) / (tar_x-1) = 255/639 = (3*5*17)/(3*3*71) = 85/213
j * interval = j * 85/213
Each time j is a multiple of 213, j * interval is an integer (we are getting a black column).
It happens when j=0, j=213, j=426, j=639, so there are two black columns (beside margins).
There are also two visible black rows (beside margins).
Suggested solution:
Replace x2 = math.ceil(x) with x2 = min(x1 + 1, orig_x-1).
Replace y2 = math.ceil(y) with y2 = min(y1 + 1, orig_y-1).
Corrected loop:
for i in range(0, tar_y):
for j in range(0, tar_x):
#set output[i,j] using X[i, j] and Y[i,j]
x = X[i][j]
y = Y[i][j]
x1 = math.floor(x)
x2 = min(x1 + 1, orig_x-1)
y1 = math.floor(y)
y2 = min(y1 + 1, orig_y-1)
vq1= (x-x1)*I[y1,x2] + (x2-x)*I[y1,x1]
vq2= (x-x1)*I[y2,x2] + (x2-x)*I[y2,x1]
output[i,j] = (y-y1)*vq2 + (y2-y)*vq1
Result:

Is there a way I can plot a figure at initial and final conditions?

The code I am using works and is written correctly, only I want to have plots of the initial and final conditions (time = 0, time = .01)
Whenever I run the code to show the plot at n=0,10 I get the error "show() got an unexpected keyword argument 'n'."
import numpy
import matplotlib.pyplot as plt
n = 10 #number of timesteps
dt = .001 #(timestep)
L = 1.0 #domain (total length)
dx = 0.1 #spacial resolution
T0 = float(q + 1)
T1s = float(q + 1 - r)
T2s = float(q + 1 + s)
t_final = n*dt
alpha = float(p + 1)
x = np.linspace(0, L, n)
T = np.ones(n)*T0
dTdt = np.empty(n)
t = np.arange(0,t_final, dt)
for j in range(1,len(t)):
plt.clf()
for i in range(1,n-1):
dTdt[i] = alpha*(-(T[i]-T[i-1])/dx**2+(T[i+1]-T[i])/dx**2)
dTdt[0] = alpha*(-(T[0]-T1s)/dx**2+(T[1]-T[0])/dx**2)
dTdt[n-1] = alpha*(-(T[n-1]-T[n-2])/dx**2+(T2s-T[n-1])/dx**2)
T = T + dTdt*dt
plt.figure(1)
plt.plot(x,T)
plt.axis([0, L, 0, 14])
plt.xlabel('Distance')
plt.ylabel('Temperature')
plt.show(n=0)
plt.show(n=10)
Of course, because matplotlib doesn't know what "n" is. I suspect what you want is to replace the last seven lines with:
if j == 0 or j == n-1:
plt.figure(1)
plt.plot(x,T)
plt.axis([0, L, 0, 14])
plt.xlabel('Distance')
plt.ylabel('Temperature')
plt.show()

Barabasi-Albert model, wrong degree exponent

I'm trying to generate a scale-free network using the Barabasi-Albert model. The model predicts a degree distribution that follows p(k) ~ k^-3 but mine shows k^-2.
The algorithm was taken from Barabasi's book at this URL: http://barabasi.com/networksciencebook,
here is the relevant paragraph:
Barabasi's algorithm
Here is my code, could someone please help me figure out what is wrong?
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
plt.rcParams["figure.figsize"] = (15,6)
#initialize values
N = 10000
k = 2
m = int(k / 2)
#initialize matrices
adjacency = np.zeros((N,N))
degrees = np.zeros(N)
#add links
for i in range(N):
degrees[i] = m
for c in range(m):
# choose a node with probability proportional to it's degree
j = np.random.choice(N, p = degrees / (2 * m * i + m + c))
degrees[j] += 1
adjacency[i][j] += 1
adjacency[j][i] += 1
def get_binned_data(labels, values, num):
    """Aggregate (label, value) pairs into logarithmically spaced bins.

    Bin edges are successive powers of ``(max_label / min_label) ** (1 / num)``
    starting at 1.  Each bin collects the labels less than or equal to its
    edge that were not taken by an earlier bin; empty bins are skipped.

    Parameters:
        labels: positive labels in ascending order (the walk below relies on
            that ordering).
        values: weight/count of each label, aligned with ``labels``.
        num: number of bins the range between the smallest and the largest
            label is divided into.

    Returns:
        (binned_labels, binned_values): for every non-empty bin the
        value-weighted mean label and the summed value.

    Raises:
        ValueError: if the smallest label is not positive.
    """
    if len(labels) == 0:
        return [], []

    min_label, max_label = min(labels), max(labels)
    if min_label <= 0:
        raise ValueError("labels must be positive for logarithmic binning")

    base = (max_label / min_label) ** (1.0 / num)
    if base > 1:
        bins = [base**i for i in range(int(np.log(max_label) / np.log(base)) + 1)]
        # Float truncation can leave the last edge just below max_label,
        # which used to drop the largest labels; add a closing edge then.
        if bins[-1] < max_label:
            bins.append(max_label)
    else:
        # All labels equal (or indistinguishable): log(base) would be zero,
        # so fall back to a single bin holding everything.
        bins = [max_label]

    binned_values, binned_labels = [], []
    counter = 0
    for b in bins:
        bin_size = 0
        bin_sum = 0
        while counter < len(labels) and labels[counter] <= b:
            bin_size += values[counter]
            bin_sum += values[counter] * labels[counter]
            counter += 1
        if bin_size:
            binned_values.append(bin_size)
            binned_labels.append(bin_sum / bin_size)
    return binned_labels, binned_values
labels, values = zip(*sorted(Counter(degrees).items(), key = lambda pair:
pair[0]))
binned_labels, binned_values = get_binned_data(labels, values, 15)
fig, (ax1, ax2) = plt.subplots(ncols = 2, nrows = 1)
fig.suptitle('Barabasi-Albert Model',fontsize = 25)
ax1.loglog(binned_labels, binned_values, basex = 10, basey = 10, linestyle =
'None', marker = 'o', color = 'red')
ax1.set(xlabel = 'degree', ylabel = '# of nodes')
ax1.set_title('log-log scale (log-binned)',{'fontsize':'15'})
ax2.plot(labels, values, 'ro')
ax2.set(xlabel = 'degree', ylabel = '# of nodes')
ax2.set_title('linear scale',{'fontsize':'15'})
plt.show()
Your code does not run (probabilities in np.random.choice do not sum to 1). Why not p = degrees/np.sum(degrees)?
According to Wikipedia, you need to start with some already connected nodes, whereas you start from nothing. Also, you should probably put degrees[i] = m after the inner loop to avoid forming links from node i to itself.
This might help, but it's not clear to me how you generate your degree plot, so I can't verify it.

Why is this Linear Classifier algorithm wrong?

I specify 'n' amount of points. Label them +1 or -1. I store all this in a dictionary that looks like: {'point1' : [(0.565,-0.676), +1], ... }. I am trying to find a line that separates them - i.e. points labeled +1 above the line, those -1 below the line. Can anyone help?
I'm trying to apply w = w + y(r) as the "learning algorithm", w is the weight vector y is +1 or -1, r is the point
The code runs but the separating line is not precise - it doesn't separate correctly. Also, as I increase the number of points to separate, the line gets less efficient.
If you run the code, the green line is supposed to be the separating line. The closer it gets to the slope of the blue line (the perfect line by definition), the better.
from matplotlib import pyplot as plt
import numpy as np
import random
n = 4
x_values = [round(random.uniform(-1,1),3) for _ in range(n)]
y_values = [round(random.uniform(-1,1),3) for _ in range(n)]
pts10 = zip(x_values, y_values)
label_dict = {}
x1, y1, x2, y2 = (round(random.uniform(-1,1),3) for _ in range(4))
b = [x1, y1]
d = [x2, y2]
slope, intercept = np.polyfit(b, d, 1)
fig, ax = plt.subplots(figsize=(8,8))
ax.scatter(*zip(*pts10), color = 'black')
ax.plot(b,d,'b-')
label_plus = '+'
label_minus = '--'
i = 1
for x,y in pts10:
if(y > (slope*x + intercept)):
ax.annotate(label_plus, xy=(x,y), xytext=(0, -10), textcoords='offset points', color = 'blue', ha='center', va='center')
label_dict['point{}'.format(i)] = [(x,y), "+1"]
else:
ax.annotate(label_minus, xy=(x,y), xytext=(0, -10), textcoords='offset points', color = 'red', ha='center', va='center')
label_dict['point{}'.format(i)] = [(x,y), "-1"]
i += 1
# this is the algorithm
def check(ww, rr):
    """Subtract ``rr`` from ``ww`` until their dot product becomes negative.

    Parameters:
        ww: weight vector.
        rr: point vector that should end up on the negative side.

    Returns:
        The adjusted weight vector.  ``ww`` is returned untouched when ``rr``
        has zero length, because no amount of subtracting it can change the
        dot product (the loop would otherwise never terminate).
    """
    if not np.dot(rr, rr) > 0:
        return ww
    while np.dot(ww, rr) >= 0:
        # Single-argument call form prints the same on Python 2 and 3.
        print("being refined 1")
        ww = np.subtract(ww, rr)
    return ww
def check_two(ww, rr):
    """Add ``rr`` to ``ww`` until their dot product is no longer negative.

    Returns the adjusted weight vector; ``ww`` itself is returned when the
    dot product is not negative to begin with.
    """
    adjusted = ww
    while True:
        if not (np.dot(adjusted, rr) < 0):
            return adjusted
        print("being refined 2")
        adjusted = np.add(adjusted, rr)
w = np.array([0,0])
ii = 1
for x,y in pts10:
r = np.array([x,y])
print w
if (np.dot(w,r) >= 0) != int(label_dict['point{}'.format(ii)][1]) < 0:
print "Point " + str(ii) + " should have been below the line"
w = np.subtract(w,r)
w = check(w,r)
elif (np.dot(w,r) < 0) != int(label_dict['point{}'.format(ii)][1]) >= 0:
print "Point " + str(ii) + " should have been above the line"
w = np.add(w,r)
w = check_two(w,r)
else:
print "Point " + str(ii) + " is in the correct position"
ii += 1
ax.plot(w,'g--')
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.set_title('Labelling 10 points')
ax.set_xticks(np.arange(-1, 1.1, 0.2))
ax.set_yticks(np.arange(-1, 1.1, 0.2))
ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)
ax.legend()
You can for example use the SGDClassifier from scikit-learn (sklearn). The linear classifiers compute predictions as follows (see the source code):
def predict(self, X):
scores = self.decision_function(X)
if len(scores.shape) == 1:
indices = (scores > 0).astype(np.int)
else:
indices = scores.argmax(axis=1)
return self.classes_[indices]
where the decision_function is given by:
def decision_function(self, X):
[...]
scores = safe_sparse_dot(X, self.coef_.T,
dense_output=True) + self.intercept_
return scores.ravel() if scores.shape[1] == 1 else scores
So for the two-dimensional case of your example this means that a data point is classified +1 if
x*w1 + y*w2 + i > 0
where
x, y = X
w1, w2 = self.coef_
i = self.intercept_
and -1 otherwise. So the decision depends on x*w1 + y*w2 + i being greater than or less than (or equal to) zero. Thus the "border" is found by setting x*w1 + y*w2 + i == 0. We are free to choose one of the components and the other one is determined by this equation.
The following snippet fits a SGDClassifier and plots the resulting "border". It assumes that the data points are scattered around the origin (x, y = 0, 0), i.e. that their mean is (approximately) zero. Actually, in order to obtain good results, one should first subtract the mean from the data points, then perform the fit and then add the mean back to the result. The following snippet just scatters the points around the origin.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import SGDClassifier
n = 100
x = np.random.uniform(-1, 1, size=(n, 2))
# We assume points are scatter around zero.
b = np.zeros(2)
d = np.random.uniform(-1, 1, size=2)
slope, intercept = (d[1] / d[0]), 0.
fig, ax = plt.subplots(figsize=(8,8))
ax.scatter(x[:, 0], x[:, 1], color = 'black')
ax.plot([b[0], d[0]], [b[1], d[1]], 'b-', label='Ideal')
labels = []
for point in x:
if(point[1] > (slope * point[0] + intercept)):
ax.annotate('+', xy=point, xytext=(0, -10), textcoords='offset points', color = 'blue', ha='center', va='center')
labels.append(1)
else:
ax.annotate('--', xy=point, xytext=(0, -10), textcoords='offset points', color = 'red', ha='center', va='center')
labels.append(-1)
labels = np.array(labels)
classifier = SGDClassifier()
classifier.fit(x, labels)
x1 = np.random.uniform(-1, 1)
x2 = (-classifier.intercept_ - x1 * classifier.coef_[0, 0]) / classifier.coef_[0, 1]
ax.plot([0, x1], [0, x2], 'g--', label='Fit')
plt.legend()
plt.show()
This plot shows the result for n = 100 data points:
The following plot shows the results for different n where the points have been chosen randomly from the pool which contains 1000 data points:
This is the answer I've come up with. Some notes I realised:
w = w + y(r) algorithm only works for normalised vectors. 'w' is the weight vector, 'r' is [x,y] of the point in question, 'y' is the sign of the label.
You can find the slope and intercept from the resulting vector 'w' by putting the coefficients in ax+by+c = 0 form and solving for 'y'.
w = np.array([0,0,0])
restart = True
while restart:
ii = 0
restart = False
for x,y in pts10:
if(restart == False):
ii += 1
r = np.array([x,y,1])
if (np.dot(w,r) >= 0) and int(label_dict['point{}'.format(ii)][1]) >= 0:
print "Point " + str(ii) + " is correctly above the line --> no adjustments"
elif (np.dot(w,r) < 0) and int(label_dict['point{}'.format(ii)][1]) < 0:
print "Point " + str(ii) + " is correctly below the line --> no adjustments"
elif (np.dot(w,r) >= 0) and int(label_dict['point{}'.format(ii)][1]) < 0:
print "Point " + str(ii) + " should have been below the line"
w = np.subtract(w,r)
restart = True
break
elif (np.dot(w,r) < 0) and int(label_dict['point{}'.format(ii)][1]) >= 0:
print "Point " + str(ii) + " should have been above the line"
w = np.add(w,r)
restart = True
break
else:
print "THERE IS AN ERROR, A POINT PASSED THROUGH HERE"
print w
slope_w = (-w[0])/w[1]
intercept_w = (-w[2])/w[1]

Centroids in K-Means

import math, random, os, operator, matplotlib, matplotlib.pyplot
from string import split
def EuDist(vecA, vecB):
    """Return the Euclidean distance between two coordinate sequences.

    Coordinates are paired positionally; surplus entries of the longer
    sequence are ignored.
    """
    deltas = (a - b for a, b in zip(vecA, vecB))
    return math.sqrt(sum(d * d for d in deltas))
filename = "points.txt"
FILE = open(filename, "w")
for i in range(33):
line = str(random.uniform(1, 2) + random.uniform(-1, 1)) + "\t" + str(random.uniform(4, 5) + random.uniform(-1, 1)) + "\n"
FILE.write(line)
for i in range(33):
line = str(random.uniform(4, 6) + random.uniform(-1, 1)) + "\t" + str(random.uniform(4, 6) + random.uniform(-1, 1)) + "\n"
FILE.write(line)
for i in range(34):
line = str(random.uniform(2, 3) + random.uniform(-1, 1)) + "\t" + str(random.uniform(2, 3) + random.uniform(-1, 1)) + "\n"
FILE.write(line)
FILE.close()
dataFile = open("points.txt")
dataset = []
for line in dataFile:
lineSplit = split(line[: -2], "\t")
dataset.append([float(value) for value in lineSplit])
maxIters = input("Enter the maximum number of iterations: ")
center = input("Enter a number of clusters: ")
centoids = random.sample(dataset, center)
m = len(dataset)
cluster = [[] for i in range(len(centoids))]
for i in range(maxIters):
cluster = [[] for v in range(len(centoids))]
for j in range(m):
minK = 0
minDis = 100
for k in range(len(centoids)):
if operator.le(EuDist(dataset[j], centoids[k]), minDis):
minDis = EuDist(dataset[j], centoids[k])
minK = k
cluster[minK].append(j)
for t in range(len(centoids)):
x0 = sum([dataset[x][0] for x in cluster[t]])
y0 = sum([dataset[x][1] for x in cluster[t]])
centoids[k] = [x0 / len(cluster[t]), y0 / len(cluster[t])]
matplotlib.pyplot.plot(hold = False)
colorarr=["b", "r", "y", "g", "p"]
for k in range(len(cluster)):
clusterPoint = [dataset[x] for x in cluster[k]]
x0 = [x[0] for x in clusterPoint]
y0 = [x[1] for x in clusterPoint]
center = [(x0, y0) for x in clusterPoint]
matplotlib.pyplot.show(centoids)
matplotlib.pyplot.hold(True)
matplotlib.pyplot.scatter(x0, y0, center, c = colorarr[k])
picname = "picture_number_" + str(i + 1) + ".png"
matplotlib.pyplot.savefig(picname)
Code works fine, but I have a problem. I don't know how to display the centroids of clusters on this graph. I know that I need to use a variable centoids, but I don't know exactly how. Please give me a hint.
I'm not 100% sure what you want, but I'd think you just want to overplot the centroids of your clusters on the combined scatter plots of those clusters, all in one figure (each cluster with its own colour).
Something along these lines can work:
from matplotlib import pyplot as plt
import numpy as np
data = {
'x': np.random.rand(4, 100),
'y': np.random.rand(4, 100),
}
centoids = {
'x': np.random.rand(4),
'y': np.random.rand(4),
}
colorarr = ["b", "r", "y", "g"]
for i, cluster in enumerate(zip(data['x'], data['y'])):
plt.scatter(cluster[0], cluster[1], s=50, c=colorarr[i])
plt.grid(True)
plt.scatter(centoids['x'], centoids['y'], marker='+', color=colorarr, s=330)
plt.savefig("random.png")
Just use the few plt. lines shown here; you don't need more, and certainly not the hold variables or show. Basically, you're simply overplotting each cluster on top of the previous one, and on top of that, the cluster centroids.
In the last scatter, I've supplied the full colorarr to the color keyword: this way, each centroid gets the corresponding colour of the cluster.
In your code, it would look something like this:
colorarr=["b", "r", "y", "g", "p"]
for k in range(len(cluster)):
clusterPoint = [dataset[x] for x in cluster[k]]
x0 = [x[0] for x in clusterPoint]
y0 = [x[1] for x in clusterPoint]
center = [(x0, y0) for x in clusterPoint]
matplotlib.pyplot.scatter(x0, y0, center, c = colorarr[k])
xcentoids, ycentoids = zip(*centoids)
matplotlib.pyplot.scatter(xcentoids, ycentoids, marker='+', color=colorarr, s=330)
picname = "picture_number_" + str(i + 1) + ".png"
matplotlib.pyplot.savefig(picname)

Categories