Centroids in K-Means - python

import math, random, os, operator, matplotlib, matplotlib.pyplot
from string import split
def EuDist(vecA, vecB):
return math.sqrt(sum(map(lambda x: x * x, [i - j for i, j in zip(vecA, vecB)])))
filename = "points.txt"
FILE = open(filename, "w")
for i in range(33):
line = str(random.uniform(1, 2) + random.uniform(-1, 1)) + "\t" + str(random.uniform(4, 5) + random.uniform(-1, 1)) + "\n"
FILE.write(line)
for i in range(33):
line = str(random.uniform(4, 6) + random.uniform(-1, 1)) + "\t" + str(random.uniform(4, 6) + random.uniform(-1, 1)) + "\n"
FILE.write(line)
for i in range(34):
line = str(random.uniform(2, 3) + random.uniform(-1, 1)) + "\t" + str(random.uniform(2, 3) + random.uniform(-1, 1)) + "\n"
FILE.write(line)
FILE.close()
dataFile = open("points.txt")
dataset = []
for line in dataFile:
lineSplit = split(line[: -2], "\t")
dataset.append([float(value) for value in lineSplit])
maxIters = input("Enter the maximum number of iterations: ")
center = input("Enter a number of clusters: ")
centoids = random.sample(dataset, center)
m = len(dataset)
cluster = [[] for i in range(len(centoids))]
for i in range(maxIters):
cluster = [[] for v in range(len(centoids))]
for j in range(m):
minK = 0
minDis = 100
for k in range(len(centoids)):
if operator.le(EuDist(dataset[j], centoids[k]), minDis):
minDis = EuDist(dataset[j], centoids[k])
minK = k
cluster[minK].append(j)
for t in range(len(centoids)):
x0 = sum([dataset[x][0] for x in cluster[t]])
y0 = sum([dataset[x][1] for x in cluster[t]])
centoids[k] = [x0 / len(cluster[t]), y0 / len(cluster[t])]
matplotlib.pyplot.plot(hold = False)
colorarr=["b", "r", "y", "g", "p"]
for k in range(len(cluster)):
clusterPoint = [dataset[x] for x in cluster[k]]
x0 = [x[0] for x in clusterPoint]
y0 = [x[1] for x in clusterPoint]
center = [(x0, y0) for x in clusterPoint]
matplotlib.pyplot.show(centoids)
matplotlib.pyplot.hold(True)
matplotlib.pyplot.scatter(x0, y0, center, c = colorarr[k])
picname = "picture_number_" + str(i + 1) + ".png"
matplotlib.pyplot.savefig(picname)
Code works fine, but I have a problem. I don't know how to display the centroids of clusters on this graph. I know that I need to use a variable centoids, but I don't know exactly how. Please give me a hint.

I'm not 100% sure what you want, but I'd think you just want to overplot the centroids of your clusters on the combined scatter plots of those clusters, all in one figure (each cluster with its own colour).
Something along these lines can work:
from matplotlib import pyplot as plt
import numpy as np
data = {
'x': np.random.rand(4, 100),
'y': np.random.rand(4, 100),
}
centoids = {
'x': np.random.rand(4),
'y': np.random.rand(4),
}
colorarr = ["b", "r", "y", "g"]
for i, cluster in enumerate(zip(data['x'], data['y'])):
plt.scatter(cluster[0], cluster[1], s=50, c=colorarr[i])
plt.grid(True)
plt.scatter(centoids['x'], centoids['y'], marker='+', color=colorarr, s=330)
plt.savefig("random.png")
Just use the few plt. lines shown here; you don't need more, and certainly not the hold variables or show. Basically, you're simply overplotting each cluster on top of the previous one, and on top of that, the cluster centroids.
In the last scatter, I've supplied the full colorarr to the color keyword: this way, each centroid gets the corresponding colour of the cluster.
In your code, it would look something like this:
colorarr=["b", "r", "y", "g", "p"]
for k in range(len(cluster)):
clusterPoint = [dataset[x] for x in cluster[k]]
x0 = [x[0] for x in clusterPoint]
y0 = [x[1] for x in clusterPoint]
center = [(x0, y0) for x in clusterPoint]
matplotlib.pyplot.scatter(x0, y0, center, c = colorarr[k])
xcentoids, ycentoids = zip(*centoids)
matplotlib.pyplot.scatter(xcentoids, ycentoids, marker='+', color=colorarr, s=330)
picname = "picture_number_" + str(i + 1) + ".png"
matplotlib.pyplot.savefig(picname)

Related

How to stop the roots of a cubic from becoming mixed up when plotting the 3 roots as contour plots?

I have a function which determines the roots of a complex cubic. I am solving the cubic for a variety of k0 and k1 values and showing the solutions as contour plots. Since the cubic has three roots, I produce 3 contour plots for the real parts and 3 for the imaginary parts. However, sometimes you can clearly see that sections of the contour plots for one root really should be swapped with a different contour plot - all the contours should be continuous. I have tried various "sorting methods" which you can see, but none of them fully fix it. What can I do so that the roots don't get mixed up resulting in non-continuous contours.
import numpy as np
import matplotlib.pyplot as plt
# Constants
Ra = 2e4
Pr = 0.1
Omega = 1e5
zeta = 1e-4
deltaN = 0.05
L = 55
def polynomial(k):
m = 1
delta_k = m**2 * np.pi**2 + k[0]**2
a_3 = delta_k
a_2 = 1j*(Ra * Pr * delta_k * k[0])/Omega + (Pr + zeta + 1)*delta_k**2
a_1 = 1j*(Ra * Pr * delta_k**2 * k[0] * (Pr + zeta)/Omega) + k[1] * Pr * zeta * (delta_k**2/L**2 + delta_k) - deltaN * Ra * Pr * k[0]**2 + (Pr * zeta + Pr + zeta) * delta_k**3
a_0 = 1j*(Pr * zeta * k[0] * (Ra * Pr * delta_k**3/Omega + k[1] * Omega * deltaN * delta_k / L**2)) + Pr * zeta * (k[1] * (Pr * delta_k**3 / L**2 + delta_k**2) - deltaN * Ra * delta_k * k[0]**2 + delta_k**4)
x_K = np.roots([a_3, a_2, a_1, a_0])
# x_K = np.sort_complex(x_K)
x_K = sorted(x_K, key=lambda x: x.imag)
# x_K = sorted(x_K, key=lambda x: x.real)
# if x_K[2].imag >= 0:
# x_K[-1], x_K[-2] = x_K[-2], x_K[-1]
# if x_K[0].imag >= x_K[2].imag:
# x_K[0], x_K[-1] = x_K[-1], x_K[0]
if x_K[0].real >= x_K[1].real:
x_K[0], x_K[1] = x_K[1], x_K[0]
# if x_K[1].real >= x_K[2].real:
# x_K[1], x_K[2] = x_K[2], x_K[1]
return x_K
# Create arrays of k[0] and k[1] values for contour plot
k0, k1 = np.linspace(0, 5, 100), np.linspace(0, 5e2, 100)
K0, K1 = np.meshgrid(k0, k1)
# Get roots for each pair of k[0], k[1] value
roots = np.array([polynomial([K0[i, j], K1[i, j]]) for i in range(100) for j in range(100)], dtype=complex)
ky_max = []
Qz_max = []
# Plot real and imaginary parts of roots separately in one figure
fig, axs = plt.subplots(2, 3, figsize=(13.6, 7.6), constrained_layout=True)
axs = axs.ravel()
for i in range(3):
cnt = axs[i].contourf(K0, K1, roots[:, i].real.reshape(K0.shape), levels=20, cmap='coolwarm')
axs[i].set_title(f'Real part of root {i+1}')
axs[i].set_xlabel('$k_y$')
axs[i].set_ylabel('$Q_z$')
# axs[i].set_yscale('log')
fig.colorbar(cnt, ax=axs[i])
cnt = axs[i+3].contourf(K0, K1, roots[:, i].imag.reshape(K0.shape), levels=20, cmap='coolwarm')
axs[i+3].set_title(f'Imaginary part of root {i+1}')
axs[i+3].set_xlabel('$k_y$')
axs[i+3].set_ylabel('$Q_z$')
# axs[i+3].set_yscale('log')
cbar1 = fig.colorbar(cnt, ax=axs[i+3])
cbar1.formatter.set_powerlimits((0, 0))
max_val = np.max(roots[:, i].real)
print(f'Maximum value for real part of root {i+1} is: {max_val}')
max_val = np.max(roots[:, i].real)
max_index = np.argmax(roots[:, i].real)
k0_max, k1_max = K0.flatten()[max_index], K1.flatten()[max_index]
axs[i].scatter(k0_max, k1_max, s=150, color='yellow', marker='x', label=f'Max value {max_val:.4f}')
axs[i].legend(loc=0)
ky_max.append(K0.flatten()[max_index])
Qz_max.append(K1.flatten()[max_index])
print(f'k_y for root {i+1} is: {k0_max}')
print(f'Q_z for root {i+1} is: {k1_max}')
for axis in ['top','bottom','left','right']:
axs[2].spines[axis].set_linewidth(3)
axs[2].spines[axis].set_color("green")
axs[5].spines[axis].set_linewidth(3)
axs[5].spines[axis].set_color("green")
# Create a caption
caption = f'Contour plot showing the real and imaginary components of the roots of the cubic for a range of $k_y$ and $Q_z$ values. Where the other variables are given by: Ra$^* = $ {Ra:.1e}, $\Delta N =$ {deltaN}, Pr = {Pr:.1e}, $\zeta =$ {zeta:.1e}, $\Omega =$ {Omega:.1e}, $L$ = {L}.'
# Create a file name
figure_name = f'decay_contour_Ra={Ra:.1e}_Pr={Pr:.1e}_dN={deltaN}'
pdf_file = f'{figure_name}.pdf'
tex_file = f'{figure_name}.tex'
# save the plot as a PDF
plt.savefig(pdf_file)
# create a text file containing the LaTeX code to include the figure
with open(tex_file, 'w') as f:
f.write("\\begin{figure}[h]\n")
f.write("\\centering\n")
f.write("\\includegraphics[width=0.85\linewidth]{"+ pdf_file+"}\n")
f.write("\\caption{"+ caption +"}\n")
f.write("\\end{figure}\n")
fig2, axs2 = plt.subplots(2, 3, figsize=(11, 8), constrained_layout=True)
for idx_1 in range(3):
k1_slice = 0
indices = np.where(K1.flatten() == k1_slice)
root_slice = roots[indices][:,idx_1].real
k1_slice = K0.flatten()[indices]
root_slice = roots[indices][:,idx_1].real
axs2[0][idx_1].plot(k1_slice, root_slice, color = 'red')
k1_slice_imag = K0.flatten()[indices]
root_slice_imag = roots[indices][:,idx_1].imag
axs2[1][idx_1].plot(k1_slice, root_slice_imag, color = 'red')
axs2[1][idx_1].set_xlabel('$k_y$')
axs2[0][0].set_ylabel('Re$(s)$')
axs2[1][0].set_ylabel('Im$(s)$')
for idx_1 in range(3):
axs2[0][idx_1].plot(k0, -zeta*(np.pi**2 + k0**2), 'x', markevery=10, color = 'black')
# Create a caption
caption = f'Profiles at the $k_y$ at $Q_z = 0$ showing the real and imaginary components of the roots of the cubic for a range of $k_y$ and $Q_z$ values. Where the other variables are given by: Ra$^* = $ {Ra:.1e}, $\Delta N =$ {deltaN}, Pr = {Pr:.1e}, $\zeta =$ {zeta:.1e}, $\Omega =$ {Omega:.1e}, $L$ = {L}.'
# Create a file name
figure_name = f'decay_profiles_Ra={Ra:.1e}_Pr={Pr:.1e}_dN={deltaN}'
pdf_file = f'{figure_name}.pdf'
tex_file = f'{figure_name}.tex'
# create a text file containing the LaTeX code to include the figure
with open(tex_file, 'w') as f:
f.write("\\begin{figure}[h]\n")
f.write("\\centering\n")
f.write("\\includegraphics[width=0.99\linewidth]{"+ pdf_file+"}\n")
f.write("\\caption{"+ caption +"}\n")
f.write("\\end{figure}\n")
for axis in ['top','bottom','left','right']:
axs2[0][2].spines[axis].set_linewidth(3)
axs2[0][2].spines[axis].set_color("green")
axs2[1][2].spines[axis].set_linewidth(3)
axs2[1][2].spines[axis].set_color("green")
# save the plot as a PDF
plt.savefig(pdf_file)
plt.show()
I've tried np.sort, np.sorted, flapping the roots using if statements etc, nothing works 100%
For two successive polynomials P and Q, I suggest simply solving the assignment problem to pair each root of P to the closest root of Q.
You can use scipy's linear_sum_assignment along with distance_matrix to find the best assignment of P's roots with Q's roots.
import numpy as np
from scipy.optimize import linear_sum_assignment
from scipy.spatial import distance_matrix
import matplotlib.pyplot as plt
def get_root_sequence(sequence_of_polynomials):
r0 = np.roots(sequence_of_polynomials[0])
roots = [r0]
for P in sequence_of_polynomials[1:]:
r1 = np.roots(P)
_, idx = linear_sum_assignment(distance_matrix(r0.reshape(3, 1), r1.reshape(3,1)))
r1 = r1[idx]
roots.append(r1)
r0 = r1
return np.array(roots)
sequence_of_polynomials = np.linspace((1,0,0,-1), (1,-7-2j,15+9j,-10-10j), 100)
roots = get_root_sequence(sequence_of_polynomials)
plt.axes().set_aspect('equal')
for i in range(3):
r = roots[:, i]
ordinal = ('first', 'second', 'third')[i]
plt.plot(r.real, r.imag, label=f'{ordinal} root')
for triangle, label in zip((roots[0], roots[-1]), ('x³-1', '(x-2)(x-2-i)(x-3-i)')):
triangle = triangle[[0,1,2,0]]
plt.plot(triangle.real, triangle.imag, label=label)
plt.legend(loc='best')
plt.xlabel('Real part')
plt.ylabel('Imaginary part')
plt.show()

Applying linear transformation to whole surface of weird shape

So, I have this torus that is moved from its usual position given by the function
def toro(u,v,R,r,t):
n = unitary_normal_t(t)
n0, n1, n2 = [n[k][0] for k in [0,1,2]]
b = binormal_t(t)
b0, b1, b2 = [b[k][0] for k in [0,1,2]]
tang = tangente(t)
t0, t1, t2 = [tang[k][0] for k in [0,1,2]]
x = n0*np.cos(u)*(R + r*np.cos(v)) + b0*(R + r*np.cos(v))*np.sin(u) + r*t0*np.sin(v) + n0*R
y = n1*np.cos(u)*(R + r*np.cos(v)) + b1*(R + r*np.cos(v))*np.sin(u) + r*t1*np.sin(v) + n1*R
z = n2*np.cos(u)*(R + r*np.cos(v)) + b2*(R + r*np.cos(v))*np.sin(u) + r*t2*np.sin(v) + n2*R
return np.array([x,y,z])
The actual expression of this tori is not relevant. Since I wish to plot this tori, I do the following:
fig = go.Figure() # this creates the figure object
d = 1 # we add the torus
r = 0.1
R = 0.5
colorscales = ['plotly3','bluered','magma','blugrn','blues','gray']
u = np.linspace(0, 2*np.pi, 240) # discretization of the u parameter
v = np.linspace(0, 2*np.pi, 240) # discretization of the v parameter
uplot,vplot = np.meshgrid(u,v)
t = 0.1
torito = toro(uplot,vplot,R,r,t)
X = torito[0]
Y = torito[1]
Z = torito[2]
colorscale = 'plotly3'
fig.add_surface(x=X, y=Y, z=Z, colorscale = colorscale) # we add a surfaceplot to it
fig.update_traces(showscale=False)
Recall I am using numpy and Plotly. In the code above, torito is the torus and it is an array of shape (3,240,240). Now, I have this 3x3 matrix, let's call it M, that I would like to apply to torito, something like transformed_tori = np.matmul(M,torito) so that I could apply the same code as above to transformed_tori, yet I cannot do so because shapes do not match.
Does anyone know how I could do such a thing? Tryouts can be made with the following matrix: M = np.array([[ 0.00348674, -0.2282992 , 0.97358478], [ 0.07565293, 0.97086078, 0.2273895 ], [-0.99712811, 0.07286169, 0.02065664]])
Thank you in advance!!!
You can use np.einsum() to do the appropriate matrix multiplication along the wanted axis.
transformed_tori = np.einsum("ij,jkl->ikl", M, torito)
Where transformed_tori is a (3, 240, 240) array where transformed_tori[:, i,j] = np.matmul(M, tori[:, i,j]

Python: got an output image with unexpected grid lines

I am writing a function that scales the input image into times of
its input size. The function Resize(Mat I, float s) first fills in the and Mat’s
that contained the query point coordinates. Then I calculate the query value by
using bilinear interpolation.
The output image seems to be alright except it has an unexpected # shape grid on it. Can you provide any hint for the resolution?
Output image:
Code:
import numpy as np
import cv2 as cv
import math
import matplotlib.pyplot as plt
#Mat I, float s
def Resize(I, s):
orig_x = I.shape[0];
orig_y = I.shape[1];
tar_x = int (orig_x * s) #int tar_x and tar_y
tar_y = int (orig_y * s);
#print(tar_x)
# Query points
X = np.empty((tar_y, tar_x), np.float32)
Y = np.empty((tar_y, tar_x), np.float32)
# calc interval between output points
interval = (orig_x-1) / (tar_x-1)
# Setting the query points
for i in range(0, tar_y):
for j in range(0, tar_x):
#set X[i, j] and Y[i,j]
X[i][j] = j * interval
Y[i][j] = i * interval
# Output image
output = np.empty((tar_y, tar_x), np.uint8)
# Performing the interpolation
for i in range(0, tar_y):
for j in range(0, tar_x):
#set output[i,j] using X[i, j] and Y[i,j]
x = X[i][j]
y = Y[i][j]
x1 = math.floor(x)
x2 = math.ceil(x)
y1 = math.floor(y)
y2 = math.ceil(y)
vq1= (x-x1)*I[y1,x2] + (x2-x)*I[y1,x1]
vq2= (x-x1)*I[y2,x2] + (x2-x)*I[y2,x1]
output[i,j] = (y-y1)*vq2 + (y2-y)*vq1
return output
s= 640 / 256
I = cv.imread("aerial_256.png", cv.IMREAD_GRAYSCALE)
output = Resize(I,s)
output = cv.cvtColor(output, cv.COLOR_BGR2RGB)
plt.imshow(output)
plt.savefig("aerial_640.png",bbox_inches='tight',transparent=True, pad_inches=0)
plt.show()
You are getting a black pixel where x is an integer and where y is an integer.
Take a look at the following code:
x1 = math.floor(x)
x2 = math.ceil(x)
vq1= (x-x1)*I[y1,x2] + (x2-x)*I[y1,x1]
vq2= (x-x1)*I[y2,x2] + (x2-x)*I[y2,x1]
Assume: x = 85.0
x1 = floor(x) = 85
x2 = ceil(x) = 85
(x-x1) = (85-85) = 0
(x2-x) = (85-85) = 0
vq1 = (x-x1)*I[y1,x2] + (x2-x)*I[y1,x1] = 0*I[y1,x2] + 0*I[y1,x1] = 0
vq2 = (x-x1)*I[y2,x2] + (x2-x)*I[y2,x1] = 0*I[y2,x2] + 0*I[y2,x1] = 0
output[i,j] = (y-y1)*vq2 + (y2-y)*vq1 = (y-y1)*0 + (y2-y)*0 = 0
Result:
In the entire column where x = 85.0 the value of output[i,j] is zero (we are getting a black column).
Same result applied to y = 85.0 - we are getting a black row.
When does x value is an integer?
Take a look at the following code:
# calc interval between output points
interval = (orig_x-1) / (tar_x-1)
# Setting the query points
for i in range(0, tar_y):
for j in range(0, tar_x):
#set X[i, j] and Y[i,j]
X[i][j] = j * interval
interval = (orig_x-1) / (tar_x-1) = 255/639 = (3*5*17/(3*3*71) = 85/213
j * interval = j * 85/213
Each time j is a multiple of 213, j * interval is an integer (we are getting a black column).
It happens when j=0, j=213, j=426, j=639, so there are two black columns (beside margins).
There are also two visible black rows (beside margins).
Suggested solution:
Replace x2 = math.ceil(x) with x2 = min(x1 + 1, orig_x-1).
Replace y2 = math.ceil(y) with y2 = min(y1 + 1, orig_y-1).
Corrected loop:
for i in range(0, tar_y):
for j in range(0, tar_x):
#set output[i,j] using X[i, j] and Y[i,j]
x = X[i][j]
y = Y[i][j]
x1 = math.floor(x)
x2 = min(x1 + 1, orig_x-1)
y1 = math.floor(y)
y2 = min(y1 + 1, orig_y-1)
vq1= (x-x1)*I[y1,x2] + (x2-x)*I[y1,x1]
vq2= (x-x1)*I[y2,x2] + (x2-x)*I[y2,x1]
output[i,j] = (y-y1)*vq2 + (y2-y)*vq1
Result:

Is there a way I can plot a figure at initial and final conditions?

The code I am using works and is written correctly, only I want to have plots of the initial and final conditions (time = 0, time = .01)
Whenever run the code to show plot at n=0,10 I get the error "show() got an unexpected keyword argument 'n'."
import numpy
import matplotlib.pyplot as plt
n = 10 #number of timesteps
dt = .001 #(timestep)
L = 1.0 #domain (total length)
dx = 0.1 #spacial resolution
T0 = float(q + 1)
T1s = float(q + 1 - r)
T2s = float(q + 1 + s)
t_final = n*dt
alpha = float(p + 1)
x = np.linspace(0, L, n)
T = np.ones(n)*T0
dTdt = np.empty(n)
t = np.arange(0,t_final, dt)
for j in range(1,len(t)):
plt.clf()
for i in range(1,n-1):
dTdt[i] = alpha*(-(T[i]-T[i-1])/dx**2+(T[i+1]-T[i])/dx**2)
dTdt[0] = alpha*(-(T[0]-T1s)/dx**2+(T[1]-T[0])/dx**2)
dTdt[n-1] = alpha*(-(T[n-1]-T[n-2])/dx**2+(T2s-T[n-1])/dx**2)
T = T + dTdt*dt
plt.figure(1)
plt.plot(x,T)
plt.axis([0, L, 0, 14])
plt.xlabel('Distance')
plt.ylabel('Temperature')
plt.show(n=0)
plt.show(n=10)
Of course, because matplotlib doesn't know what "n" is. I suspect what you want is to replace the last seven lines with:
if j == 0 or j == n-1:
plt.figure(1)
plt.plot(x,T)
plt.axis([0, L, 0, 14])
plt.xlabel('Distance')
plt.ylabel('Temperature')
plt.show()

Why is this Linear Classifier algorithm wrong?

I specify 'n' amount of points. Label them +1 or -1. I store all this in a dictionary that looks like: {'point1' : [(0.565,-0.676), +1], ... }. I am trying to find a line that separates them - i.e. points labeled +1 above the line, those -1 below the line. Can anyone help?
I'm trying to apply w = w + y(r) as the "learning algorithm", w is the weight vector y is +1 or -1, r is the point
The code runs but the separating line is not precise - it doesn't separate correctly. Also, as I increase the number of points to separate, the line gets less efficient.
If you run the code, the green line is supposed to be the separating line. The closer it get to the slope of the blue line (the perfect line by definition), the better.
from matplotlib import pyplot as plt
import numpy as np
import random
n = 4
x_values = [round(random.uniform(-1,1),3) for _ in range(n)]
y_values = [round(random.uniform(-1,1),3) for _ in range(n)]
pts10 = zip(x_values, y_values)
label_dict = {}
x1, y1, x2, y2 = (round(random.uniform(-1,1),3) for _ in range(4))
b = [x1, y1]
d = [x2, y2]
slope, intercept = np.polyfit(b, d, 1)
fig, ax = plt.subplots(figsize=(8,8))
ax.scatter(*zip(*pts10), color = 'black')
ax.plot(b,d,'b-')
label_plus = '+'
label_minus = '--'
i = 1
for x,y in pts10:
if(y > (slope*x + intercept)):
ax.annotate(label_plus, xy=(x,y), xytext=(0, -10), textcoords='offset points', color = 'blue', ha='center', va='center')
label_dict['point{}'.format(i)] = [(x,y), "+1"]
else:
ax.annotate(label_minus, xy=(x,y), xytext=(0, -10), textcoords='offset points', color = 'red', ha='center', va='center')
label_dict['point{}'.format(i)] = [(x,y), "-1"]
i += 1
# this is the algorithm
def check(ww,rr):
while(np.dot(ww,rr) >= 0):
print "being refined 1"
ww = np.subtract(ww,rr)
return ww
def check_two(ww,rr):
while(np.dot(ww,rr) < 0):
print "being refined 2"
ww = np.add(ww,rr)
return ww
w = np.array([0,0])
ii = 1
for x,y in pts10:
r = np.array([x,y])
print w
if (np.dot(w,r) >= 0) != int(label_dict['point{}'.format(ii)][1]) < 0:
print "Point " + str(ii) + " should have been below the line"
w = np.subtract(w,r)
w = check(w,r)
elif (np.dot(w,r) < 0) != int(label_dict['point{}'.format(ii)][1]) >= 0:
print "Point " + str(ii) + " should have been above the line"
w = np.add(w,r)
w = check_two(w,r)
else:
print "Point " + str(ii) + " is in the correct position"
ii += 1
ax.plot(w,'g--')
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.set_title('Labelling 10 points')
ax.set_xticks(np.arange(-1, 1.1, 0.2))
ax.set_yticks(np.arange(-1, 1.1, 0.2))
ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)
ax.legend()
You can for example use the SGDClassifier from scikit-learn (sklearn). The linear classifiers compute predictions as follows (see the source code):
def predict(self, X):
scores = self.decision_function(X)
if len(scores.shape) == 1:
indices = (scores > 0).astype(np.int)
else:
indices = scores.argmax(axis=1)
return self.classes_[indices]
where the decision_function is given by:
def decision_function(self, X):
[...]
scores = safe_sparse_dot(X, self.coef_.T,
dense_output=True) + self.intercept_
return scores.ravel() if scores.shape[1] == 1 else scores
So for the two-dimensional case of your example this means that a data point is classified +1 if
x*w1 + y*w2 + i > 0
where
x, y = X
w1, w2 = self.coef_
i = self.intercept_
and -1 otherwise. So the decision depends on x*w1 + y*w2 + i being greater than or less than (or equal to) zero. Thus the "border" is found by setting x*w1 + y*w2 + i == 0. We are free to choose one of the components and the other one is determined by this equation.
The following snippet fits a SGDClassifier and plots the resulting "border". It assumes that the data points are scattered around the origin (x, y = 0, 0), i.e. that their mean is (approximately) zero. Actually, in order to obtain good results, one should first subtract the mean from the data points, then perform the fit and then add the mean back to the result. The following snippet just scatters the points around the origin.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import SGDClassifier
n = 100
x = np.random.uniform(-1, 1, size=(n, 2))
# We assume points are scatter around zero.
b = np.zeros(2)
d = np.random.uniform(-1, 1, size=2)
slope, intercept = (d[1] / d[0]), 0.
fig, ax = plt.subplots(figsize=(8,8))
ax.scatter(x[:, 0], x[:, 1], color = 'black')
ax.plot([b[0], d[0]], [b[1], d[1]], 'b-', label='Ideal')
labels = []
for point in x:
if(point[1] > (slope * point[0] + intercept)):
ax.annotate('+', xy=point, xytext=(0, -10), textcoords='offset points', color = 'blue', ha='center', va='center')
labels.append(1)
else:
ax.annotate('--', xy=point, xytext=(0, -10), textcoords='offset points', color = 'red', ha='center', va='center')
labels.append(-1)
labels = np.array(labels)
classifier = SGDClassifier()
classifier.fit(x, labels)
x1 = np.random.uniform(-1, 1)
x2 = (-classifier.intercept_ - x1 * classifier.coef_[0, 0]) / classifier.coef_[0, 1]
ax.plot([0, x1], [0, x2], 'g--', label='Fit')
plt.legend()
plt.show()
This plot shows the result for n = 100 data points:
The following plot shows the results for different n where the points have been chosen randomly from the pool which contains 1000 data points:
This is the answer I've come up with. Some notes I realised:
w = w + y(r) algorithm only works for normalised vectors. 'w' is the weight vector, 'r' is [x,y] of the point in question, 'y' is the sign of the label.
You can find the slope and intercept from the resulting vector 'w' by putting the coefficients in ax+by+c = 0 form and solving for 'y'.
w = np.array([0,0,0])
restart = True
while restart:
ii = 0
restart = False
for x,y in pts10:
if(restart == False):
ii += 1
r = np.array([x,y,1])
if (np.dot(w,r) >= 0) and int(label_dict['point{}'.format(ii)][1]) >= 0:
print "Point " + str(ii) + " is correctly above the line --> no adjustments"
elif (np.dot(w,r) < 0) and int(label_dict['point{}'.format(ii)][1]) < 0:
print "Point " + str(ii) + " is correctly below the line --> no adjustments"
elif (np.dot(w,r) >= 0) and int(label_dict['point{}'.format(ii)][1]) < 0:
print "Point " + str(ii) + " should have been below the line"
w = np.subtract(w,r)
restart = True
break
elif (np.dot(w,r) < 0) and int(label_dict['point{}'.format(ii)][1]) >= 0:
print "Point " + str(ii) + " should have been above the line"
w = np.add(w,r)
restart = True
break
else:
print "THERE IS AN ERROR, A POINT PASSED THROUGH HERE"
print w
slope_w = (-w[0])/w[1]
intercept_w = (-w[2])/w[1]

Categories