there. So I build a Kmean cluster program; however, each time I run the program the plot changes. I dont know why its happening and if anyone could help that would be very appreciate.
import numpy as np
import matplotlib.pyplot as plt
import random
def cal_centroids(clusters, cluster_array,k):
new_centroids= []
for c in range(k):
x= 0
y=0
count=0
for i in range(len(clusters)):
if clusters[i]==c:
x+=cluster_array[i][0]
y+=cluster_array[i][1]
count+=1
x/=count
y/=count
new_centroids.append([x,y])
return new_centroids
def assign_clusters(centroids,cluster_array):
clusters=[]
for i in range(cluster_array.shape[0]):
distances=[]
for centroid in centroids:
distances.append(calc_distance(centroid,cluster_array[i]))
cluster=[z for z, val in enumerate(distances) if val==min(distances)]
clusters.append(cluster[0])
return clusters
def calc_distance(x1,x2):
return (sum((x1-x2)**2))**0.5
#from here on its mostly storing data, initializing centroids and assigning cluster label to data
def kmean(data,no_clusters,iterations):
s= random.sample(range(data.shape[0]),no_clusters)
centroids= []
for i in s:
centroids.append(data[i,:])
clusters= assign_clusters(centroids,data)
initial_centroids= [i for i in centroids]
for i in range(0,iterations):
centroids= cal_centroids(clusters,data,no_clusters)
cluster= assign_clusters(centroids,data)
dict_centroids= {}
for i in range(no_clusters):
dict_centroids[i]=[]
for i in range(no_clusters):
for j in range(data.shape[0]):
if(clusters[j]==i):
dict_centroids[i].append(data[j,:])
return dict_centroids,centroids,clusters
def extract_file(file_name):
file = open(file_name,'r')
lines = [list(map(int, line.strip("\n").split(","))) for line in file]
x= np.array(lines)
return x
data= extract_file("backyard.txt")
dict_centroids,centroids,clusters= kmean(data,2,8)
x= data[:,0]
y= data[:,1]
fig=plt.figure()
scatter= plt.scatter(x,y,c=clusters,s=40)
for i,j in centroids:
plt.scatter(i,j,s=50,c='red',marker= '+')
plt.xlabel("Vitamin C")
plt.ylabel("GLA")
plt.title("File backyard 2 groups Displayed")
fig.show()
the backyard list is this:
40,40
10,10
200,200
230,231
40,43
15,45
220,190
I haven't run your code, however, if the graph changes on every run there is nothing to worry about. K-means is an algorithm that uses a random start (which I'm assuming you did in your code with this line: s= random.sample(range(data.shape[0]),no_clusters)). There is no guarantee that K-means will converge to a global minimum, but it will converge to a local minimum depending on the random start.
You could maybe try to fix your random start by setting a random seed with NumPy: numpy.random.seed(42)
Related
Im working in some Euler Equations about Draining a Conical Tank but I'm having problem when I plot the simulation.
My intention is reduce the radius of the tank while time increases. I want to delete the first blue line after the first cycle, but I don't know how :(
this is the code:
import matplotlib.pyplot as plt
import numpy as np
Hi=0.5
hf=0
r=0.003
ang=8*np.pi/180
teta=np.tan(ang)**2
dt=1
g=9.8
t=0
t1=0
h=Hi
h2=0
R=np.tan(ang)*h
R1=np.tan(ang)*
Vh=[h]
Vt=[t]
Rt=[R]
f1 = plt.figure(1)
plot1=plt.plot([R-r,0,2*R,R+r,R-r],[0,Hi,Hi,0,0],"k")
plot1=line=plt.plot([0,2*R],[h,h],"b")
plt.grid()
plt.ylabel("ALtura del agua")
while h>0 and R>0:
h=((5*((2*g)**(1/2))*r**2)/(-2*teta*((Hi**(3/2)))))*dt+h
R=(np.tan(ang)*h)
t=t+dt
plot1=line=plt.plot([R1-R,R1+R],[h,h],"b")
Vh.append(h)
Vt.append(t)
Rt.append(R)
line[0].set_ydata([h])
plt.pause(1/24)
plt.title("Tiempo = "+str(t))
Try this to delete the last line:
if h <= 0:
h = 0
Vh.append(h)
Vt.append(t)
Rt.append(R)
line[0].set_ydata([h])
plt.pause(1/24)
plt.title("Tiempo = " + str(t))
Tell me if it was useful.
Greetings
Here is the code for a random walk I made which I attempted to constrain where -5 < y < 5:
import random
import numpy as np
import matplotlib.pyplot as plt
import math
import decimal
def dirs(x):
return np.array( [math.cos(x), math.sin(x)] )
def constrainedRandomWalk(x):
numSteps = x
locations = np.zeros( (numSteps, 2) )
for i in range(1, numSteps):
r = random.randrange(628318)/100000
move = dirs(r)
locations[i] = locations[i-1] + move
if -5<locations[i][1]<5:
continue
#return locations
plt.figure(figsize=(8,8))
plt.plot( locations[:,0], locations[:,1], alpha=0.7 );
plt.xlim([-20,20])
plt.ylim([-20,20])
I attempted to constrain the "walking character" by setting a condition on the loop that
if -5<locations[i][1]<5:
continue
However, as you can see here, the character leaves the -5<y<5 region:
Can anyone let me know how to actually constrain the random walk and why this method doesn't work? Thank you!
You're updating locations before you test if the move is valid:
import math
import random
import matplotlib.pyplot as plt
import numpy as np
def dirs(x):
return np.array([math.cos(x), math.sin(x)])
def constrained_random_walk(num_steps):
# Specify Start Point
locations = [np.array([0, 0])]
# Go Until locations is long enough
while len(locations) < num_steps:
r = random.randrange(628318) / 100000
move = dirs(r)
# Test if the new move is in bounds first
new_location = locations[-1] + move
if -5 < new_location[1] < 5:
locations.append(new_location)
locations = np.array(locations)
plt.figure(figsize=(8, 8))
plt.plot(locations[:, 0], locations[:, 1], alpha=0.7)
plt.xlim([-20, 20])
plt.ylim([-20, 20])
Sample Output on:
constrained_random_walk(2000)
Edit: Updated so all skipped values are not (0,0) but every value in locations is populated by a generated move. Except for the first, which is specified as the start point. (Currently (0,0))
I want to read an grayscale image, say something with (248, 480, 3) shape, then use each element of it as the lam value for making a Poisson random value and do this for each element and make a new data set with the same shape. I want to do this as much as nscan, then I want to add them all together and put them in a new data set and plot it again to get something that is similar to the first image that I put in the beginning. This code is working but it is extremely slow, I was wondering if there is any way to make it faster?
import numpy as np
import matplotlib.pyplot as plt
my_image = plt.imread('myimage.png')
def genP(data):
new_data = np.zeros(data.shape)
for i in range(data.shape[0]):
for j in range(data.shape[1]):
for k in range(data.shape[2]):
new_data[i, j, k] = np.random.poisson(lam = data[i, j, k])
return new_data
def get_total(data, nscan = 1):
total = genP(data)
for i in range(nscan):
total += genP(data)
total = total/nscan
plt.imshow(total)
plt.show()
get_total(my_image, 100)
numpy.random.poisson can entirely replace your genP() function... This is basically guaranteed to be much faster.
If size is None (default), a single value is returned if lam is a scalar. Otherwise, np.array(lam).size samples are drawn
def get_total(data, nscan = 1):
total = np.random.poisson(lam=data)
for i in range(nscan):
total += np.random.poisson(lam=data)
total = total/nscan
plt.imshow(total)
plt.show()
i have a geodesic distance of graph data in .csv format
i want to reduce it into 2D using Multidimensional Scaling (MDS) and cluster it using Kmedoids
This is my code:
# coding: utf-8
import numpy as np
import csv
from sklearn import manifold
from sklearn.metrics.pairwise import pairwise_distances
import kmedoidss
rawdata = csv.reader(open('data.csv', 'r').readlines()[1:])
# Process the data into a 2D array, omitting the header row
data, labels = [], []
for row in rawdata:
labels.append(row[1])
data.append([int(i) for i in row[1:]])
#print data
# Now run very basic MDS
# Documentation here: http://scikit-learn.org/dev/modules/generated/sklearn.manifold.MDS.html#sklearn.manifold.MDS
mds = manifold.MDS(n_components=2, dissimilarity="precomputed")
pos = mds.fit_transform(data)
# distance matrix
D = pairwise_distances(pos, metric='euclidean')
# split into c clusters
M, C = kmedoidss.kMedoids(D, 3)
print ('Data awal : ')
for index, point_idx in enumerate(pos, 1):
print(index, point_idx)
print ('\n medoids:' )
for point_idx in M:
print('{} index ke - {} '.format (pos[point_idx], point_idx+1))
print('')
print('clustering result:')
for label in C:
for point_idx in C[label]:
print('cluster- {}:{} index- {}'.format(label, pos[point_idx], point_idx+1))
kmedoidss.py
import numpy as np
import random
def kMedoids(D, k, tmax=100):
# determine dimensions of distance matrix D
m, n = D.shape
# randomly initialize an array of k medoid indices
M = np.sort(np.random.choice(n, k))
# create a copy of the array of medoid indices
Mnew = np.copy(M)
# initialize a dictionary to represent clusters
C = {}
for t in xrange(tmax):
# determine clusters, i. e. arrays of data indices
J = np.argmin(D[:,M], axis=1)
for kappa in range(k):
C[kappa] = np.where(J==kappa)[0]
# update cluster medoids
for kappa in range(k):
J = np.mean(D[np.ix_(C[kappa],C[kappa])],axis=1)
j = np.argmin(J)
Mnew[kappa] = C[kappa][j]
np.sort(Mnew)
# check for convergence
if np.array_equal(M, Mnew):
break
M = np.copy(Mnew)
else:
# final update of cluster memberships
J = np.argmin(D[:,M], axis=1)
for kappa in range(k):
C[kappa] = np.where(J==kappa)[0]
# return results
return M, C
how to visualize the cluster result as a graph with different node color based on its cluster?
You don't need MDS to run kMedoids - just run it on the original distance matrix (kMedoids can also be made to work on a similarity matrix by switching min for max).
Use MDS only for plotting.
The usual approach for visualization is to use a loop over clusters, and plot each cluster in a different color; or to use a color predicate. There are many examples in the scipy documentation.
http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html
colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)
y_pred = labels.astype(np.int)
plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
where X is your pos variable (2d mds result) and labels are an integer cluster number for every point. Since you don't have your data in thid "labels" layout, consider using a loop instead:
for label, pts in C.items():
plt.scatter(pos[pts, 0], pos[pts, 1], color=colors[label])
plt.show()
I get this horrible massive error when trying to plot using matplotlib:
Traceback (most recent call last):
File "24oct_specanal.py", line 90, in <module>
main()
File "24oct_specanal.py", line 83, in main
plt.plot(Svar,Sav)
File "/usr/lib64/python2.6/site-packages/matplotlib/pyplot.py", line 2458, in plot
ret = ax.plot(*args, **kwargs)
File "/usr/lib64/python2.6/site-packages/matplotlib/axes.py", line 3849, in plot
self.add_line(line)
File "/usr/lib64/python2.6/site-packages/matplotlib/axes.py", line 1443, in add_line
self._update_line_limits(line)
File "/usr/lib64/python2.6/site-packages/matplotlib/axes.py", line 1451, in _update_line_limits
p = line.get_path()
File "/usr/lib64/python2.6/site-packages/matplotlib/lines.py", line 644, in get_path
self.recache()
File "/usr/lib64/python2.6/site-packages/matplotlib/lines.py", line 392, in recache
x = np.asarray(xconv, np.float_)
File "/usr/lib64/python2.6/site-packages/numpy/core/numeric.py", line 235, in asarray
return array(a, dtype, copy=False, order=order)
ValueError: setting an array element with a sequence.
This is the code I am using:
import numpy as np
import numpy.linalg
import random
import matplotlib.pyplot as plt
import pylab
from scipy.optimize import curve_fit
from array import array
def makeAImatrix(n):
A=np.zeros((n,n))
I=np.ones((n))
for i in range(0,n):
for j in range(i+1,n):
A[j,i]=random.random()
for i in range(0,n):
for j in range(i+1,n):
A[i,j] = A[j,i]
for i in range(n):
A[i,i]=1
return (A, I)
def main():
n=5 #number of species
t=1 # number of matrices to check
Aflat = []
Aflatlist = [] #list of matrices
Aflatav = []
Aflatvar = []
Aflatskew = []
remspec = []
Afreeze = [] #this is a LIST OF VECTORS that stores the vector corresponding to each extinct species as
#it is taken out. it is NOT the same as the original A matrix as it is only
#coherant in one direction. it is also NOT A SQUARE.
Sex = [] # (Species extinct) this is a vector that corresponds to the Afreeze matrix. if a species is extinct then
#the value stored here will be -1.
Sav = [] # (Species average) The average value of the A cooefficiants for each species
Svar = [] # (Species variance)
for k in range (0,t):
allpos = 0
A, I = makeAImatrix(n)
while allpos !=1: #while all solutions are not positive
x = numpy.linalg.solve(A,I)
if any(t<0 for t in x): #if any of the solutions in x are negative
p=np.where(x==min(x)) # find the most negative solution, p is the position
#now store the A coefficiants of the extinct species in the Afreeze list
Afreeze.append(A[p])
Sex.append(-1) #given -1 value as species is extinct.
x=np.delete(x, p, 0)
A=np.delete(A, p, 0)
A=np.delete(A, p, 1)
I=np.delete(I, p, 0)
else:
allpos = 1 #set allpos to one so loop is broken
l=len(x)
#now fill Afreeze and Sex with the remaining species that have survived
for m in range (0, l):
Afreeze.append(A[m])
Sex.append(1) # value of 1 as this species has survived
#now time to analyse the coefficiants for each species.
for m in range (0, len(Sex)):
X1 = sum(Afreeze[m])/len(Afreeze[m]) # this is the mean
X2 = 0
for p in range (len(Afreeze[m])):
X2 = X2 + Afreeze[m][p]
X2 = X2/len(Afreeze[m])
Sav.append(X1)
Svar.append(X2 - X1*X1)
spec = []
for b in range(0,n):
spec.append(b)
plt.plot(Svar,Sav)
plt.show()
#plt.scatter(spec, Sav)
#plt.show()
if __name__ == '__main__':
main()
I cannot figure this out at all! I think it was working before but then just stopped working. Any ideas?
Your problem is in this section:
if any(t<0 for t in x): #if any of the solutions in x are negative
p=np.where(x==min(x)) # find the most negative solution, p is the position
#now store the A coefficiants of the extinct species in the Afreeze list
Afreeze.append(A[p])
You're indexing a 2D array, and the result is still a 2D array. So, your Afreeze will get a 2D array appended, instead of a 1D array. Later, where you sum the separate elements of Afreeze, a summed 2D array will result in a 1D array, and that gets added to Sav and Svar. By the time you feed these variables to plt.plot(), matplotlib will get an array as one of the elements instead of a single number, which it of course can't cope with.
You probably want:
if any(t<0 for t in x):
p=np.where(x==min(x))
Afreeze.append(A[p][0])
but I haven't tried to follow the logic of the script very much; that's up to you.
Perhaps good to see if this is indeed what you want: print the value of A[p][0] in the line before it gets appended to Afreeze.
I noted that because of the random.random() in the matrix creation, the if statement isn't always true, so the problem doesn't always show up. Minor detail, but could confuse people.
Fix your identation?
import numpy as np
import numpy.linalg
import random
import matplotlib.pyplot as plt
import pylab
from scipy.optimize import curve_fit
from array import array
def main():
n=20 #number of species
spec=np.zeros((n+1))
for i in range(0,n):
spec[i]=i
t=100 #initial number of matrices to check
B = np.zeros((n+1)) #matrix to store the results of how big the matrices have to be
for k in range (0,t):
A=np.zeros((n,n))
I=np.ones((n))
for i in range(0,n):
for j in range(i+1,n):
A[j,i]=random.random()
for i in range(0,n):
for j in range(i+1,n):
A[i,j] = A[j,i]
for i in range(n):
A[i,i]=1
allpos = 0
while allpos !=1: #while all solutions are not positive
x = numpy.linalg.solve(A,I)
if any(t<0 for t in x): #if any of the solutions in x are negative
p=np.where(x==min(x)) # find the most negative solution, p is the position
x=np.delete(x, p, 0)
A=np.delete(A, p, 0)
A=np.delete(A, p, 1)
I=np.delete(I, p, 0)
else:
allpos = 1 #set allpos to one so loop is broken
l=len(x)
B[l] = B[l]+1
B = B/n
pi=3.14
resfile=open("results.txt","w")
for i in range (0,len(spec)):
resfile.write("%d " % spec[i])
resfile.write("%0.6f \n" %B[i])
resfile.close()
plt.hist(B, bins=n)
plt.title("Histogram")
plt.show()
plt.plot(spec,B)
plt.xlabel("final number of species")
plt.ylabel("fraction of total matrices")
plt.title("plot")
plt.show()
if __name__ == '__main__':
main()
Got this: