Clustering With K-Means in Python

Clustering With K-Means in Python - python

I have the code but i am trying to find a method to plot the results
import numpy as np
import random
def cluster_points(X, mu):
clusters = {}
for x in X:
bestmukey = min([(i[0], np.linalg.norm(x - mu[i[0]])) \
for i in enumerate(mu)], key=lambda t: t[1])[0]
try:
clusters[bestmukey].append(x)
except KeyError:
clusters[bestmukey] = [x]
return clusters
def reevaluate_centers(mu, clusters):
newmu = []
keys = sorted(clusters.keys())
for k in keys:
newmu.append(np.mean(clusters[k], axis=0))
return newmu
#def has_converged(mu, oldmu):
def find_centers(x, k):
# Initialize to K random centers
oldmu = random.sample(x, k)
mu = random.sample(x, k)
while not (set([tuple(a) for a in mu]) == set([tuple(a) for a in oldmu])):
oldmu = mu
# Assign all points in X to clusters
clusters = cluster_points(x, mu)
# Reevaluate centers
mu = reevaluate_centers(oldmu, clusters)
return (mu, clusters)
def init_board(N):
X = np.array([(random.uniform(-1, 1), random.uniform(-1, 1)) for i in range(N)])
return X
def init_board_gauss(N, k):
n = float(N)/k
X = []
for i in range(k):
c = (random.uniform(-1, 1), random.uniform(-1, 1))
s = random.uniform(0.05,0.5)
x = []
while len(x) < n:
a, b = np.array([np.random.normal(c[0], s), np.random.normal(c[1], s)])
# Continue drawing points from the distribution in the range [-1,1]
if abs(a) < 1 and abs(b) < 1:
x.append([a,b])
X.extend(x)
X = np.array(X)[:N]
return X

Related

Unexpected value of cost function in Logistic regression

I been trying to write a python code for logistic regression but the results are showing very high value of cost function which is unexpected. I have created a random variable X and Y and added a noise term to Y which will flip the element of based on the probability theta. This is my code:
import numpy as np
from scipy.stats import bernoulli
rg = np.random.default_rng(100)
def data_generate(n, m, theta):
X_0 = np.ones((n, 1))
X = np.random.normal(loc=0.0, scale=1.0, size=(n, m))
X = np.concatenate((X_0, X), axis = 1)
beta = rg.random((m+1, 1))
Y = np.zeros((n, 1))
P = 1.0/(1.0 + np.exp(-np.dot(X, beta)))
for i in range(len(P)):
if P[i] >= 0.5:
Y[i] = 1
else:
Y[i] = 0
# Noise addition
noise = bernoulli.rvs(size=(n,1), p=theta)
for j in range(len(noise)):
if noise[i] == 1:
Y[i] = int(not(Y[i]))
else:
pass
return X, Y, beta
def Gradient_Descent(X, Y, k, tollerence, learning_rate):
n,m = np.shape(X)
beta = rg.random((m, 1))
costs = []
initial_cost = 0.0
for i in range(k):
Y_pred = 1.0/(1.0 + np.exp(-np.dot(X, beta)))
cost = np.mean(np.dot(Y.T, np.log(Y_pred)) + np.dot((1-Y).T, np.log(1-Y_pred)))
if (abs(cost - initial_cost) <= tollerence):
break
else:
beta = beta - learning_rate*(np.mean(np.dot(X.T, (Y_pred - Y))))
initial_cost = cost
costs.append(cost)
return cost, beta, i
X = data_generate(200, 3, 0.1)[0]
Y = data_generate(200, 3, 0.1)[1]
Gradient_Descent(X, Y, 10000, 1e-6, 0.01)
# Output of code :
(-154.7689765716959,
array([[-0.02218003],
[-0.1182535 ],
[ 0.1169462 ],
[ 0.58610747]]),
14)`
Please tell what is the problem with the code.

Numpy - vectorize the bivariate poisson pmf equation

I'm trying to write a function to evaluate the probability mass function for the bivariate poisson distribution.
This is easy when all of the parameters (x, y, theta1, theta2, theta0) are scalars, but tricky to scale up without loops to allow these parameters to be vectors. I need it to scale such that, for:
theta0 being a scalar - the "correlation parameter" in the equation
theta1 and theta2 having length l
x, y both having length n
the output array would have shape (l, n, n). For example, a slice [j, :, :] from the output array would look like:
The first part (the constant, before the summation) I think i've figured out:
import numpy as np
from scipy.special import factorial
def constant(theta1, theta2, theta0, x, y):
exponential_part = np.exp(-(theta1 + theta2 + theta0)).reshape(-1, 1, 1)
x = np.tile(x, (len(x), 1)).transpose()
y = np.tile(y, (len(y), 1))
double_factorial = (np.power(np.array(theta1).reshape(-1, 1, 1), x)/factorial(x)) * \
(np.power(np.array(theta2).reshape(-1, 1, 1), y)/factorial(y))
return exponential_part * double_factorial
But I'm struggling with the summation part. How can I vectorize a summation where the limits depend on variable arrays?

I think I have this figured out, based on the approach that #w-m suggests: calculate every possible summation term which could appear, based on the maximum x or y value which appears, and use a mask to get rid of the ones you don't want. Assuming you have your x and y terms go from 0 to N, in consecutive order, this is calculating up to three times more terms than are actually required, but this is offset by getting to use vectorization.
Reference implementation
I wrote this by first writing a pure-Python reference implementation, which just implements your problem using loops. With 4 nested loops, it's not exactly fast, but it's handy to have while testing the numpy version.
import numpy as np
from scipy.special import factorial, comb
import operator as op
from functools import reduce
def choose(n, r):
# https://stackoverflow.com/a/4941932/530160
r = min(r, n-r)
numer = reduce(op.mul, range(n, n-r, -1), 1)
denom = reduce(op.mul, range(1, r+1), 1)
return numer // denom # or / in Python 2
def reference_impl_constant(s_theta1, s_theta2, s_theta0, s_x, s_y):
# Cast to float to prevent overflow
s_theta1 = float(s_theta1)
s_theta2 = float(s_theta2)
s_theta0 = float(s_theta0)
s_x = float(s_x)
s_y = float(s_y)
term1 = np.exp(-(s_theta1 + s_theta2 + s_theta0))
term2 = (s_theta1 ** s_x / factorial(s_x))
term3 = (s_theta2 ** s_y / factorial(s_y))
assert term1 >= 0
assert term2 >= 0
assert term3 >= 0
return term1 * term2 * term3
def reference_impl_constant_loop(theta1, theta2, theta0, x, y):
theta_len = theta1.shape[0]
xy_len = x.shape[0]
constant_array = np.zeros((theta_len, xy_len, xy_len))
for i in range(theta_len):
for j in range(xy_len):
for k in range(xy_len):
s_theta1 = theta1[i]
s_theta2 = theta2[i]
s_theta0 = theta0
s_x = x[j]
s_y = y[k]
constant_term = reference_impl_constant(s_theta1, s_theta2, s_theta0, s_x, s_y)
assert constant_term >= 0
constant_array[i, j, k] = constant_term
return constant_array
def reference_impl_summation(s_theta1, s_theta2, s_theta0, s_x, s_y):
sum_ = 0
for i in range(min(s_x, s_y) + 1):
sum_ += choose(s_x, i) * choose(s_y, i) * factorial(i) * ((s_theta0/s_theta1/s_theta2) ** i)
assert sum_ >= 0
return sum_
def reference_impl_summation_loop(theta1, theta2, theta0, x, y):
theta_len = theta1.shape[0]
xy_len = x.shape[0]
summation_array = np.zeros((theta_len, xy_len, xy_len))
for i in range(theta_len):
for j in range(xy_len):
for k in range(xy_len):
s_theta1 = theta1[i]
s_theta2 = theta2[i]
s_theta0 = theta0
s_x = x[j]
s_y = y[k]
summation_term = reference_impl_summation(s_theta1, s_theta2, s_theta0, s_x, s_y)
assert summation_term >= 0
summation_array[i, j, k] = summation_term
return summation_array
def reference_impl(theta1, theta2, theta0, x, y):
# all array inputs must be 1D
assert len(theta1.shape) == 1
assert len(theta2.shape) == 1
assert len(x.shape) == 1
assert len(y.shape) == 1
# theta vectors must have same length
theta_len = theta1.shape[0]
assert theta2.shape[0] == theta_len
# x and y must have same length
xy_len = x.shape[0]
assert y.shape[0] == xy_len
# theta0 is scalar
assert isinstance(theta0, (int, float))
constant_array = np.zeros((theta_len, xy_len, xy_len))
output = np.zeros((theta_len, xy_len, xy_len))
constant_array = reference_impl_constant_loop(theta1, theta2, theta0, x, y)
summation_array = reference_impl_summation_loop(theta1, theta2, theta0, x, y)
output = constant_array * summation_array
return output
Numpy implementation
I split the implementation of this across two functions.
The fast_constant() function calculates everything to the left of the summation symbol. The fast_summation() function calculates everything inside the summation symbol.
import numpy as np
from scipy.special import factorial, comb
def fast_summation(theta1, theta2, theta0, x, y):
x = np.tile(x, (len(x), 1)).transpose()
y = np.tile(y, (len(y), 1))
sum_limit = np.minimum(x, y)
max_sum_limit = np.max(sum_limit)
i = np.arange(max_sum_limit + 1).reshape(-1, 1, 1)
summation_mask = (i <= sum_limit)
theta_ratio = (theta0 / (theta1 * theta2)).reshape(-1, 1, 1, 1)
theta_to_power = np.power(theta_ratio, i)
terms = comb(x, i) * comb(y, i) * factorial(i) * theta_to_power
# mask out terms which aren't part of sum
terms *= summation_mask
# axis 0 is theta
# axis 1 is i
# axis 2 & 3 are x and y
# so sum across axis 1
terms = terms.sum(axis=1)
return terms
def fast_constant(theta1, theta2, theta0, x, y):
theta1 = theta1.astype('float64')
theta2 = theta2.astype('float64')
exponential_part = np.exp(-(theta1 + theta2 + theta0)).reshape(-1, 1, 1)
# x and y must be 1D
assert len(x.shape) == 1
assert len(y.shape) == 1
# x and y must have same shape
assert x.shape == y.shape
x_len, y_len = x.shape[0], y.shape[0]
x = x.reshape((x_len, 1))
y = y.reshape((1, y_len))
double_factorial = (np.power(np.array(theta1).reshape(-1, 1, 1), x)/factorial(x)) * \
(np.power(np.array(theta2).reshape(-1, 1, 1), y)/factorial(y))
return exponential_part * double_factorial
def fast_impl(theta1, theta2, theta0, x, y):
return fast_summation(theta1, theta2, theta0, x, y) * fast_constant(theta1, theta2, theta0, x, y)
Benchmarking
Assuming that X and Y range from 0 to 20, and that theta is centered somewhere inside that range, I get the result that the numpy version is roughly 280 times faster than the pure python reference.
Numerical stability
I'm unsure how numerically stable this is. For example, when I center theta at 100, I get a floating-point overflow. Typically, when computing an expression which has lots of choose and factorial expressions inside it, you'll use some mathematical equivalent which results in smaller intermediate sums. In this case I have so little understanding of the math that I don't know how you'd do that.

General minimal residual method with right-preconditioner of SSOR

I am trying to implement the algorithm of GMRES with right-preconditioner P for solving the linear system Ax = b . The code is running without error; however, it pops into unprecise result for me because the error I have is very large. For the GMRES method (without preconditioning matrix - remove P in the algorithm), the error I get is around 1e^{-12} and it converges with the same matrix.
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
from scipy.linalg import norm as norm
import scipy.sparse as sp
from scipy.sparse import diags
"""The program is to split the matrix into D-diagonal; L: strictly lower matrix; U strictly upper matrix
satisfying: A = D - L - U """
def splitMat(A):
n,m = A.shape
if (n == m):
diagval = np.diag(A)
D = diags(diagval,0).toarray()
L = (-1)*np.tril(A,-1)
U = (-1)*np.triu(A,1)
else:
print("A needs to be a square matrix")
return (L,D,U)
"""Preconditioned Matrix for symmetric successive over-relaxation (SSOR): """
def P_SSOR(A,w):
## Split up matrix A:
L,D,U = splitMat(A)
Comp1 = (D - w*U)
Comp2 = (D - w*L)
Comp1inv = np.linalg.inv(Comp1)
Comp2inv = np.linalg.inv(Comp2)
P = w*(2-w)*np.matmul(Comp1inv, np.matmul(D,Comp2inv))
return P
"""GMRES_SSOR using right preconditioning P:
A - matrix of linear system Ax = b
x0 - initial guess
tol - tolerance
maxit - maximum iteration """
def myGMRES_SSOR(A,x0, b, tol, maxit):
matrixSize = A.shape[0]
e = np.zeros((maxit+1,1))
rr = 1
rstart = 2
X = x0
w = 1.9 ## in ssor
P = P_SSOR(A,w) ### preconditioned matrix
### Starting the GMRES ####
for rs in range(0,rstart+1):
### first check the residual:
if rr<tol:
break
else:
r0 = (b-A.dot(x0))
rho = norm(r0)
e[0] = rho
H = np.zeros((maxit+1,maxit))
Qcol = np.zeros((matrixSize, maxit+1))
Qcol[:,0:1] = r0/rho
for k in range(1, maxit+1):
### Arnodi procedure ##
Qcol[:,k] =np.matmul(np.matmul(A,P), Qcol[:,k-1]) ### This step applies P here:
for j in range(0,k):
H[j,k-1] = np.dot(np.transpose(Qcol[:,k]),Qcol[:,j])
Qcol[:,k] = Qcol[:,k] - (np.dot(H[j,k-1], Qcol[:,j]))
H[k,k-1] =norm(Qcol[:,k])
Qcol[:,k] = Qcol[:,k]/H[k,k-1]
### QR decomposition step ###
n = k
Q = np.zeros((n+1, n))
R = np.zeros((n, n))
R[0, 0] = norm(H[0:n+2, 0])
Q[:, 0] = H[0:n+1, 0] / R[0,0]
for j in range (0, n+1):
t = H[0:n+1, j-1]
for i in range (0, j-1):
R[i, j-1] = np.dot(Q[:, i], t)
t = t - np.dot(R[i, j-1], Q[:, i])
R[j-1, j-1] = norm(t)
Q[:, j-1] = t / R[j-1, j-1]
g = np.dot(np.transpose(Q), e[0:k+1])
Y = np.dot(np.linalg.inv(R), g)
Res= e[0:n] - np.dot(H[0:n, 0:n], Y[0:n])
rr = norm(Res)
#### second check on the residual ###
if rr < tol:
break
#### Updating the solution with the preconditioned matrix ####
X = X + np.matmul(np.matmul(P,Qcol[:, 0:k]), Y) ### This steps applies P here:
return X
######
A = np.random.rand(100,100)
x = np.random.rand(100,1)
b = np.matmul(A,x)
x0 = np.zeros((100,1))
maxit = 100
tol = 0.00001
x = myGMRES_SSOR(A,x0,b,tol,maxit)
res = b - np.matmul(A,x)
print(norm(res))
print("Solution with gmres\n", np.matmul(A,x))
print("---------------------------------------")
print("b matrix:", b)
I hope anyone could help me figure out this!!!

I'm not sure where you got you "Symmetric_successive_over-relaxation" SSOR code from, but it appears to be wrong. You also seem to be assuming that A is symmetric matrix, but in your random test case it is not.
Following SSOR's Wikipedia entry, I replaced your P_SSOR function with
def P_SSOR(A,w):
L,D,U = splitMat(A)
P = 2/(2-w) * (1/w*D+L)*np.linalg.inv(D)*(1/w*D+L).T
return P
and your test matrix with
A = np.random.rand(100,100)
A = A + A.T
and your code works up to a 12 digit residual error.

filling a k,k matrix with lower triangular data

I want to put the output of d2arm as a lower triangular in a k by k matrix. Any advice is welcome.
def r(k):
v = np.arange(0, k, 1)
w = abs(v - (k - 1))
return(np.repeat(v, w, axis = None))
def s(k) :
l = pd.DataFrame()
for i in np.arange(1, k):
r = pd.DataFrame(np.arange(i, k, 1))
l = l.append(r)
return(np.array(l)).T
def d2arm(d, f):
k = d.shape[1]
m1 = pd.DataFrame(r(k)).T
m2 = pd.DataFrame(s(k))
H = m2.append(m1)
a = np.array([])
for i in range(H.shape[1]):
lr = round(f(d.iloc[:, H.iloc[:, i]]), 2)
a = np.append(a, lr)
return(a)

How to scatter plot a dict of lists containing arrays in Matplotlib? (Screenshot in details)

What I want to do is plotting data in a dict, preferably using Matplotlib.
Below is a screenshot since I think looking at the data structure makes it easier to understand. But here is also a description.
A dict contains 7 lists.
Each list represents a cluster.
Each list contains a number of arrays with two items in it.
Each array represents a two-dimensional point.
I want to recreate the results in this Blog post. Unfortunately the author didn't provide the code he used for the plots.
https://datasciencelab.wordpress.com/2013/12/12/clustering-with-k-means-in-python/
Here is the screenshot:
In case it helps, here is the full code I used to generate the clusters:
import numpy as np
import random
import matplotlib.pyplot as plt # STACKOVERFLOW
# k-Means Algorithm (Lloyd's Algorithm)
def cluster_points(X, mu):
clusters = {}
for x in X:
bestmukey = min([(i[0], np.linalg.norm(x-mu[i[0]])) \
for i in enumerate(mu)], key=lambda t:t[1])[0]
try:
clusters[bestmukey].append(x)
except KeyError:
clusters[bestmukey] = [x]
return clusters
def reevaluate_centers(mu, clusters):
newmu = []
keys = sorted(clusters.keys())
for k in keys:
newmu.append(np.mean(clusters[k], axis = 0))
return newmu
def has_converged(mu, oldmu):
return set([tuple(a) for a in mu]) == set([tuple(a) for a in oldmu])
def find_centers(X, K):
# Initialize to K random centers
oldmu = random.sample(X, K)
mu = random.sample(X, K)
while not has_converged(mu, oldmu):
oldmu = mu
# Assign all points in X to clusters
clusters = cluster_points(X, mu)
# Reevaluate centers
mu = reevaluate_centers(oldmu, clusters)
return(mu, clusters)
# initialization
def init_board_gauss(N, k):
n = float(N)/k
X = []
for i in range(k):
c = (random.uniform(-1, 1), random.uniform(-1, 1))
s = random.uniform(0.05,0.5)
x = []
while len(x) < n:
a, b = np.array([np.random.normal(c[0], s), np.random.normal(c[1], s)])
# Continue drawing points from the distribution in the range [-1,1]
if abs(a) < 1 and abs(b) < 1:
x.append([a,b])
X.extend(x)
X = np.array(X)[:N]
return X
X = init_board_gauss(200,3)
# generating clusters
mu, clusters = find_centers(X, 7)
clusters = cluster_points(X, mu)

I came up with a solution.
Here is what I changed/added to the code:
# Initialize points
n_points = 200
n_clusters = 7
X = init_board_gauss(n_points, n_clusters)
# Cluster points
mu, clusters = find_centers(X, n_clusters)
clusters = cluster_points(X, mu)
# Generate random colors
def generate_random_color():
r = lambda: random.randint(0,255)
return '#%02X%02X%02X' % (r(),r(),r())
# Plot each cluster
for i in range(0, n_clusters):
colx = tuple(x[0] for x in clusters[i])
coly = tuple(x[1] for x in clusters[i])
cluster_color = generate_random_color()
plt.scatter(colx,coly, color=cluster_color)
And here is the full code:
import matplotlib.pyplot as plt
import numpy as np
import random
# k-Means Algorithm (Lloyd's Algorithm)
def cluster_points(X, mu):
clusters = {}
for x in X:
bestmukey = min([(i[0], np.linalg.norm(x-mu[i[0]])) \
for i in enumerate(mu)], key=lambda t:t[1])[0]
try:
clusters[bestmukey].append(x)
except KeyError:
clusters[bestmukey] = [x]
return clusters
def reevaluate_centers(mu, clusters):
newmu = []
keys = sorted(clusters.keys())
for k in keys:
newmu.append(np.mean(clusters[k], axis = 0))
return newmu
def has_converged(mu, oldmu):
return set([tuple(a) for a in mu]) == set([tuple(a) for a in oldmu])
def find_centers(X, K):
# Initialize to K random centers
oldmu = random.sample(X, K)
mu = random.sample(X, K)
while not has_converged(mu, oldmu):
oldmu = mu
# Assign all points in X to clusters
clusters = cluster_points(X, mu)
# Reevaluate centers
mu = reevaluate_centers(oldmu, clusters)
return(mu, clusters)
# Initialization
def init_board(N):
X = np.array([(random.uniform(-1, 1), random.uniform(-1, 1)) for i in range(N)])
return X
def init_board_gauss(N, k):
n = float(N)/k
X = []
for i in range(k):
c = (random.uniform(-1, 1), random.uniform(-1, 1))
s = random.uniform(0.05,0.5)
x = []
while len(x) < n:
a, b = np.array([np.random.normal(c[0], s), np.random.normal(c[1], s)])
# Continue drawing points from the distribution in the range [-1,1]
if abs(a) < 1 and abs(b) < 1:
x.append([a,b])
X.extend(x)
X = np.array(X)[:N]
return X
# Initialize points
n_points = 200
n_clusters = 7
X = init_board_gauss(n_points, n_clusters)
# Cluster points
mu, clusters = find_centers(X, n_clusters)
clusters = cluster_points(X, mu)
# Generate random colors
def generate_random_color():
r = lambda: random.randint(0,255)
return '#%02X%02X%02X' % (r(),r(),r())
# Plot each cluster
for i in range(0, n_clusters):
colx = tuple(x[0] for x in clusters[i])
coly = tuple(x[1] for x in clusters[i])
cluster_color = generate_random_color()
plt.scatter(colx,coly, color=cluster_color)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Clustering With K-Means in Python - python

Related

Unexpected value of cost function in Logistic regression

Numpy - vectorize the bivariate poisson pmf equation

General minimal residual method with right-preconditioner of SSOR

filling a k,k matrix with lower triangular data

How to scatter plot a dict of lists containing arrays in Matplotlib? (Screenshot in details)

Categories

Resources