i'm stuck in a minimum spanning tree problem.
I've found a code that resolve the problem and return a graph of the solution, I've edited it a bit but I don't find a way to return the distance along to the graphic, as I don't have much python experience.
Can you help me ? Thanks
import numpy as np
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
def minimum_spanning_tree(X, copy_X=True):
if copy_X:
X = X.copy()
if X.shape[0] != X.shape[1]:
raise ValueError("X needs to be square matrix of edge weights")
n_vertices = X.shape[0]
spanning_edges = []
visited_vertices = [0]
num_visited = 1
diag_indices = np.arange(n_vertices)
X[diag_indices, diag_indices] = np.inf
while num_visited != n_vertices:
new_edge = np.argmin(X[visited_vertices], axis=None)
new_edge = divmod(new_edge, n_vertices)
new_edge = [visited_vertices[new_edge[0]], new_edge[1]]
spanning_edges.append(new_edge)
visited_vertices.append(new_edge[1])
X[visited_vertices, new_edge[1]] = np.inf
X[new_edge[1], visited_vertices] = np.inf
num_visited += 1
return np.vstack(spanning_edges)
def test_mst():
n = int(input())
P = np.array([], int)
for i in range(0,n):
y = list(map(int,input().split()))
P = np.append(P,[y[0],y[1]], axis=0)
P = P.reshape(n,2)
X = squareform(pdist(P))
edge_list = minimum_spanning_tree(X)
plt.scatter(P[:, 0], P[:, 1])
for edge in edge_list:
i, j = edge
plt.plot([P[i, 0], P[j, 0]], [P[i, 1], P[j, 1]], c='r')
plt.show()
if __name__ == "__main__":
test_mst()
Related
I did a logistic regression on my data and now I find the best Theta array to find the class of a new data.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def h_theta(x,theta):
return np.dot(x,np.transpose(theta))
def g_z(x,theta):
return 1/(1+pow(np.e,-h_theta(x,theta)))
def cost_function(x,y,theta):
cost = 0
for i in range(len(y)):
l = np.log(g_z(x[i],theta))
cost += -y[i]*l -(1-y[i])*np.log((1-(g_z(x[i],theta))))
return cost/(2*len(y))
def updata_theta(x,y,theta,alpha):
for i in range(6):
u = 0
for j in range(len(y)):
u += (h_theta(x[j],theta)-y[j])*x[j,i]
theta[0,i] -= alpha*u/(len(y))
data = pd.read_csv("D:\REZA\programming\machine learning-andrew ng\coding\machine-learning-ex2\ex2\ex2data2.csv")
y = np.array(data["1"])
s = np.array(data.drop("1",axis=1))
x1T2 = np.zeros((117,1))
x2T2 = np.zeros((117,1))
x1x2 = np.zeros((117,1))
one = np.ones((117,1))
m = len(y)
for i in range(m):
x1T2[i] = s[i,0]*s[i,0]
x2T2[i] = s[i,1]*s[i,1]
x1x2[i] = s[i,0]*s[i,1]
x = np.append(one,s,axis=1)
f = np.append(x1T2,x2T2,axis=1)
f = np.append(f,x1x2,axis=1)
x = np.append(x,f,axis=1)
x = np.array(x,dtype=np.float)
theta = np.zeros((1,6),dtype=float)
n=0
alpha = 0.003
while(n<100 and cost_function(x,y,theta)>0.01):
updata_theta(x,y,theta,alpha)
n+=1
I can plot my data with plt.scatter
plt.scatter(x[:,1],x[:,2],c=y)
plt.show()
scatter plot output
Now I want to plot decision boundary using this theta array, but I don't know how to do it.
Assume there is a set of N points named A. I'd like to select K points and make a subset of A, which is A', where the minimum distance between points are maximum over subsets.
the distance between all points within A' is defined as y(x, x'), x∈A, x'∈A
the minimum distance is defined as k(A') = min_{x∈A', x'∈A', x!=x'}(y(x,x'))
C = argmin_A'(k(A')) where A'⊂A, |A'| = K
N = 100
K = 10
I want to solve and get set C. Is there an efficient algorithm to solve this?
The sample result is like the following, which is quite heavy to solve ; O(N^K).
import numpy as np
from matplotlib import pyplot as plt
from itertools import combinations
N=30
K=5
# generate sample point
mean = [0, 0]
cov = [[1, 0], [0, 1]]
rs=np.random.multivariate_normal(mean,cov, N)
def calc_min_dist(arr):
dist_arr = ((arr[None,:,:] - arr[:,None,:])**2).sum(axis=2)**.5
min_dist = min(dist_arr[dist_arr>0])
return min_dist
def min_points(arr, K):
max_value = 0
max_index = [range(K)]
index = list(range(len(arr)))
for sub_index in combinations(index, K):
sub_arr = arr[tuple([sub_index])]
dist = calc_min_dist(sub_arr)
if max_value < dist:
max_value = dist
max_index = sub_index
return max_value, max_index
# solve
min_value, sub_index = min_points(rs, K)
# plot result
fig, ax = plt.subplots()
ax.scatter(*rs.T)
ax.scatter(*rs[tuple([sub_index])].T)
I have following set of points that lie on a boundary and want to create the polygon that connects these points. For a person it is quite obvious what path to follow, but I am unable to find an algorithm that does the same and trying to solve it myself it all seems quite tricky and ambiguous occasionally. What is the best solution for this?
As a background.
This is the boundary for the julia set with constant = -0.624+0.435j with stable area defined after 100 iterations. I got these points by setting the stable points to 1 and all other to zero and then convolving with a 3x3 matrix [[1, 1, 1], [1, 1, 1], [1, 1, 1]] and select the points that have value 1. My experimenting code is as follows:
import numpy as np
from scipy.signal import convolve2d
import matplotlib.pyplot as plt
r_min, r_max = -1.5, 1.5
c_min, c_max = -2.0, 2.0
dpu = 50 # dots per unit - 50 dots per 1 units means 200 points per 4 units
max_iterations = 100
cmap='hot'
intval = 1 / dpu
r_range = np.arange(r_min, r_max + intval, intval)
c_range = np.arange(c_min, c_max + intval, intval)
constant = -0.624+0.435j
def z_func(point, constant):
z = point
stable = True
num_iterations = 1
while stable and num_iterations < max_iterations:
z = z**2 + constant
if abs(z) > max(abs(constant), 2):
stable = False
return (stable, num_iterations)
num_iterations += 1
return (stable, 0)
points = np.array([])
colors = np.array([])
stables = np.array([], dtype='bool')
progress = 0
for imag in c_range:
for real in r_range:
point = complex(real, imag)
points = np.append(points, point)
stable, color = z_func(point, constant)
stables = np.append(stables, stable)
colors = np.append(colors, color)
print(f'{100*progress/len(c_range)/len(r_range):3.2f}% completed\r', end='')
progress += len(r_range)
print(' \r', end='')
rows = len(r_range)
start = len(colors)
orig_field = []
for i_num in range(len(c_range)):
start -= rows
real_vals = [color for color in colors[start:start+rows]]
orig_field.append(real_vals)
orig_field = np.array(orig_field, dtype='int')
rows = len(r_range)
start = len(stables)
stable_field = []
for i_num in range(len(c_range)):
start -= rows
real_vals = [1 if val == True else 0 for val in stables[start:start+rows]]
stable_field.append(real_vals)
stable_field = np.array(stable_field, dtype='int')
kernel = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]])
stable_boundary = convolve2d(stable_field, kernel, mode='same')
boundary_points = []
cols, rows = stable_boundary.shape
assert cols == len(c_range), "check c_range and cols"
assert rows == len(r_range), "check r_range and rows"
zero_field = np.zeros((cols, rows))
for col in range(cols):
for row in range(rows):
if stable_boundary[col, row] in [1]:
real_val = r_range[row]
# invert cols as min imag value is highest col and vice versa
imag_val = c_range[cols-1 - col]
stable_boundary[col, row] = 1
boundary_points.append((real_val, imag_val))
else:
stable_boundary[col, row] = 0
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(5, 5))
ax1.matshow(orig_field, cmap=cmap)
ax2.matshow(stable_field, cmap=cmap)
ax3.matshow(stable_boundary, cmap=cmap)
x = [point[0] for point in boundary_points]
y = [point[1] for point in boundary_points]
ax4.plot(x, y, 'o', c='r', markersize=0.5)
ax4.set_aspect(1)
plt.show()
Output with dpu = 200 and max_iterations = 100:
inspired by this Youtube video: What's so special about the Mandelbrot Set? - Numberphile
Thanks for the input. As it turned out this is indeed not as easy as it seems. In the end I have used the convex_hull and the alpha shape algorithms to deterimine boundary polygon(s) around the boundary points as shown the picture below. Top left is the juliaset where colors represent the number of iterations; top right black is unstable and white is stable; bottom left is a collection of points representing the boundary between unstable and stable; and bottom right is the collection of boundary polygons around the boundary points.
The code shows below:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib import patches as mpl_patches
from matplotlib.collections import PatchCollection
import shapely.geometry as geometry
from shapely.ops import cascaded_union, polygonize
from scipy.signal import convolve2d
from scipy.spatial import Delaunay # pylint: disable-msg=no-name-in-module
from descartes.patch import PolygonPatch
def juliaset_func(point, constant, max_iterations):
z = point
stable = True
num_iterations = 1
while stable and num_iterations < max_iterations:
z = z**2 + constant
if abs(z) > max(abs(constant), 2):
stable = False
return (stable, num_iterations)
num_iterations += 1
return (stable, num_iterations)
def create_juliaset(r_range, c_range, constant, max_iterations):
''' create a juliaset that returns two fields (matrices) - orig_field and
stable_field, where orig_field contains the number of iterations for
a point in the complex plane (r, c) and stable_field for each point
either whether the point is stable (True) or not stable (False)
'''
points = np.array([])
colors = np.array([])
stables = np.array([], dtype='bool')
progress = 0
for imag in c_range:
for real in r_range:
point = complex(real, imag)
points = np.append(points, point)
stable, color = juliaset_func(point, constant, max_iterations)
stables = np.append(stables, stable)
colors = np.append(colors, color)
print(f'{100*progress/len(c_range)/len(r_range):3.2f}% completed\r', end='')
progress += len(r_range)
print(' \r', end='')
rows = len(r_range)
start = len(colors)
orig_field = []
stable_field = []
for i_num in range(len(c_range)):
start -= rows
real_colors = [color for color in colors[start:start+rows]]
real_stables = [1 if val == True else 0 for val in stables[start:start+rows]]
orig_field.append(real_colors)
stable_field.append(real_stables)
orig_field = np.array(orig_field, dtype='int')
stable_field = np.array(stable_field, dtype='int')
return orig_field, stable_field
def find_boundary_points_of_stable_field(stable_field, r_range, c_range):
''' find the boundary points by convolving the stable_field with a 3x3
kernel of all ones and define the point on the boundary where the
convolution is 1.
'''
kernel = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]], dtype='int8')
stable_boundary = convolve2d(stable_field, kernel, mode='same')
rows = len(r_range)
cols = len(c_range)
boundary_points = []
for col in range(cols):
for row in range(rows):
# Note you can make the boundary 'thicker ' by
# expanding the range of possible values like [1, 2, 3]
if stable_boundary[col, row] in [1]:
real_val = r_range[row]
# invert cols as min imag value is highest col and vice versa
imag_val = c_range[cols-1 - col]
boundary_points.append((real_val, imag_val))
else:
pass
return [geometry.Point(val[0], val[1]) for val in boundary_points]
def alpha_shape(points, alpha):
''' determine the boundary of a cluster of points whereby 'sharpness' of
the boundary depends on alpha.
paramaters:
:points: list of shapely Point objects
:alpha: scalar
returns:
shapely Polygon object or MultiPolygon
edge_points: list of start and end point of each side of the polygons
'''
if len(points) < 4:
# When you have a triangle, there is no sense
# in computing an alpha shape.
return geometry.MultiPoint(list(points)).convex_hull
def add_edge(edges, edge_points, coords, i, j):
"""
Add a line between the i-th and j-th points,
if not in the list already
"""
if (i, j) in edges or (j, i) in edges:
# already added
return
edges.add((i, j))
edge_points.append((coords[[i, j]]))
coords = np.array([point.coords[0]
for point in points])
tri = Delaunay(coords)
edges = set()
edge_points = []
# loop over triangles:
# ia, ib, ic = indices of corner points of the
# triangle
for ia, ib, ic in tri.vertices:
pa = coords[ia]
pb = coords[ib]
pc = coords[ic]
# Lengths of sides of triangle
a = np.sqrt((pa[0]-pb[0])**2 + (pa[1]-pb[1])**2)
b = np.sqrt((pb[0]-pc[0])**2 + (pb[1]-pc[1])**2)
c = np.sqrt((pc[0]-pa[0])**2 + (pc[1]-pa[1])**2)
# Semiperimeter of triangle
s = (a + b + c)/2.0
# Area of triangle by Heron's formula
area = np.sqrt(s*(s-a)*(s-b)*(s-c))
circum_r = a*b*c/(4.0*area)
# Here's the radius filter.
if circum_r < alpha:
add_edge(edges, edge_points, coords, ia, ib)
add_edge(edges, edge_points, coords, ib, ic)
add_edge(edges, edge_points, coords, ic, ia)
m = geometry.MultiLineString(edge_points)
triangles = list(polygonize(m))
return cascaded_union(triangles), edge_points
def main():
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(5, 5))
# define limits, range and resolution in the complex plane
r_min, r_max = -1.5, 1.5
c_min, c_max = -1.1, 1.1
dpu = 100 # dots per unit - 50 dots per 1 units means 200 points per 4 units
intval = 1 / dpu
r_range = np.arange(r_min, r_max + intval, intval)
c_range = np.arange(c_min, c_max + intval, intval)
# create two matrixes (orig_field and stable_field) for the juliaset with
# constant
constant = -0.76 -0.10j
max_iterations = 50
orig_field, stable_field = create_juliaset(r_range, c_range,
constant,
max_iterations)
cmap='nipy_spectral'
ax1.matshow(orig_field, cmap=cmap, interpolation='bilinear')
ax2.matshow(stable_field, cmap=cmap)
# find points that are on the boundary of the stable field
boundary_points = find_boundary_points_of_stable_field(stable_field,
r_range, c_range)
x = [p.x for p in boundary_points]
y = [p.y for p in boundary_points]
ax3.plot(x, y, 'o', c='r', markersize=0.5)
ax3.set_xlim(r_min, r_max)
ax3.set_ylim(c_min, c_max)
ax3.set_aspect(1)
# find the boundary polygon using alpha_shape where 'sharpness' of the
# boundary is determined by the factor ALPHA
# a green boundary consists of multiple polygons, a red boundary on a single
# polygon
alpha = 0.03 # determines shape of the boundary polygon
bnd_polygon, _ = alpha_shape(boundary_points, alpha)
patches = []
if bnd_polygon.geom_type == 'Polygon':
patches.append(PolygonPatch(bnd_polygon))
ec = 'red'
else:
for poly in bnd_polygon:
patches.append(PolygonPatch(poly))
ec = 'green'
p = PatchCollection(patches, facecolor='none', edgecolor=ec, lw=1)
ax4.add_collection(p)
ax4.set_xlim(r_min, r_max)
ax4.set_ylim(c_min, c_max)
ax4.set_aspect(1)
plt.show()
if __name__ == "__main__":
main()
I am calculating two distances and binning them in intervals of 0.1 in a 2D array. Currently I am doing this. However it takes a lot of time for large number of points
import numpy as np
from scipy.spatial import distance as d
dat=np.random.rand(100,3)
dd2d=np.zeros((10,10))
while len(dat)>0:
i=len(dat)-1
while i>0:
dist0=d.euclidean(dat[0],dat[i])
dist1=d.cosine(dat[0],dat[i])
ind0=int(dist0/0.1)
ind1=int(dist1/0.1)
if ind0>9 or ind1>9:
pass
else:
dd2d[ind0,ind1]+=1
i-=1
dat=np.delete(dat,0,axis=0)
print len(dat)
What is the most efficient way of doing this?
Also how can I convert the while loops in my code into for loops so that I can add progressbar/tqdm to keep track of run time.
If you are already importing scipy.spatial.distance, might as well use pdist. And then you're just making a 2d histogram. Use np.histogram2d.
def binDists2d(dat, f1 = 'euclidean', f2 = 'cosine'):
dist0 = d.pdist(dat, f1)
dist1 = d.pdist(dat, f2)
rng = np.array([[0, 1], [0, 1]])
return np.histogram2d(dist0, dist1, bins = 10, range = rng)
pdist only returns the upper triangular elements. If you want to do this manually, use np.triu_indices, which you could use to generate the distances if scipy is unavailable.
def cosdist(u, v):
return 1 - u.dot(v) / (np.linalg.norm(u) * np.linlg.norm(v))
def binDists2d(dat, f0 = lambda u, v: np.linalg.norm(u - v), f1 = cosdist):
i, j = np.triu_indices(dat.shape[0], 1)
dist0 = f0(dat[i], dat[j])
dist1 = f1(dat[i], dat[j])
rng = np.array([[0, 1], [0, 1]])
return np.histogram2d(dist0, dist1, bins = 10, range = rng)
EDIT: Less memory-hungry version:
def binDists2d(dat, f0, f1, n = 1, bins = 10, rng = np.array([[0, 1], [0, 1]])):
i_, j_ = np.triu_indices(dat.shape[0], 1)
out = np.zeros((bins, bins))
i_, j_ = np.array_split(i_, n), np.array_split(j_, n)
for k, (i, j) in enumerate(zip(i_, j_)):
dist0 = f0(dat[i], dat[j])
dist1 = f1(dat[i], dat[j])
out += np.histogram2d(dist0, dist1, bins = bins, range = rng)
print(str(k) + " of " + str(n) + "completed")
return out
Is there an elegant way finding the number of connected components using a pre-calculated KDTree? Right now find the connected components using a breath-first search algorithm with the adjacency matrix given by the KDTree of k-nearest neighbours, but is there a better possibility?
import collections
import numpy as np
from sklearn import neighbors
N = 100
N_k = 8
ra = np.random.random
X0,X1 = ra(N),ra(N)
X0[0:N//2]+= 2
X1[0:N//2]+= 2
X = np.array([X0,X1]).T
tree = neighbors.KDTree(X)
dist, adj = tree.query(X, k = N_k+1)
dist = dist[:,1::]
adj = adj[:,1::]
print("Inside of find_components_lifo")
print("N = %d/ N_k = %d"%(N,N_k))
labels = np.zeros(N, dtype = np.int) - 1
n = 0
steps = 0
remains = (labels == -1)
while n < N:
i = np.arange(0,N,1)[remains][np.random.randint(0,N - n)]
# This is important for directed graphs
labels[i] = i
lifo = collections.deque([i])
while lifo:
ele = lifo.pop()
for k in adj[ele,:]:
if labels[k] == -1:
labels[k] = labels[i]
lifo.append(k)
elif labels[k] != labels[i]:
labels[labels == labels[i]] = labels[k]
remains = (labels == -1)
n = N - len(np.nonzero(remains)[0])
unique = np.unique(labels)
labels_ = np.zeros(N, dtype = np.int) - 1
for i, label in enumerate(unique):
choice = (labels == label)
N_cl = len(np.nonzero(choice)[0])
print("We found a cluster with N = %d"%N_cl)
labels_[choice] = i
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
fixticks(ax)
plt.show()
colors_ = np.array(colors)
for i in range(N):
for j in range(N_k):
ax.plot([X0[i],X0[adj[i,j]]],[X1[i],X1[adj[i,j]]], color = colors_[labels_[i]], alpha = 0.3)
ax.grid(True)
ax.scatter(X0,X1, s = 60, color = "Black")
plt.show()
I
I think you can use scipy's connected_components and scikit-learn's kneighbors_graph together. Does this produce what you're looking for?
from sklearn import neighbors
from scipy.sparse import csgraph
adj = neighbors.kneighbors_graph(X, N_k)
n_components, labels = csgraph.connected_components(adj)