colormap scatter plot dependent on cluster membership - python

I'm conducting soft clustering on a data set and I want to create a graphic similar to the image posted, showing a data point's membership between two (or more) clusters. I'm not really sure how to go about this, however. I've used criteria to assign colours to data points, but I am unsure how to create the more dynamic sort of graphic seen below. Any help is appreciated.

I think markers are just the thing you're looking for:
import numpy as np
import matplotlib.pyplot as plt

# two cluster centres with random scatter around each
x1 = y1 = 1
x2 = y2 = 2
dx = np.random.rand(10)
dy = np.random.rand(10)
x = np.array([x1 + dx, x2 + dx]).ravel()
y = np.array([y1 + dy, y2 + dy]).ravel()

# assign a marker per point depending on which side of the threshold it falls
threshold = 4
markers = np.array(["o" if xy > threshold else "h" for xy in x + y])

fig, ax = plt.subplots()
for marker in np.unique(markers):
    index = markers == marker
    ax.scatter(x[index], y[index], marker=marker)
Adding some additional code to control colour and transparency (alpha):
import numpy as np
import matplotlib.pyplot as plt

x1 = y1 = 1
x2 = y2 = 2
dx = np.random.rand(10)
dy = np.random.rand(10)
x = np.array([x1 + dx, x2 + dx]).ravel()
y = np.array([y1 + dy, y2 + dy]).ravel()
threshold = 4
markers = np.array(["o" if xy > threshold else "h" for xy in x + y])

blue_color = "midnightblue"  # predefined
pink_color = "orchid"
colors = [blue_color if marker == "o" else pink_color for marker in markers]

# transparency scales with the distance from the threshold
alphas = np.array([abs(xy - threshold) for xy in x + y])
alphas = 1 - alphas/np.max(alphas)

fig, ax = plt.subplots()
for i in range(len(x)):
    ax.scatter(x[i], y[i], marker=markers[i], color=colors[i], alpha=alphas[i])
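As a side note, Matplotlib 3.4+ accepts an array for scatter's alpha, so the point-by-point loop can be collapsed into one call per marker group; a minimal sketch, assuming the arrays defined above:

colors = np.array(colors)  # to allow boolean-mask indexing
fig, ax = plt.subplots()
for marker in np.unique(markers):
    index = markers == marker
    ax.scatter(x[index], y[index], marker=marker,
               c=colors[index], alpha=alphas[index])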

The GaussianMixture in scikit-learn does something close to what the question asks.
Specifically, predict_proba(X) returns an array with the probability of each point in X belonging to each component. In the example below we fit two mixture components, so the last two plots should be opposites of each other:
from sklearn.mixture import GaussianMixture
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
X, _ = make_moons(noise=0.05)
mix = GaussianMixture(n_components=2).fit(X)
probs = mix.predict_proba(X)
fig, ax = plt.subplots(1, 3, sharey=True)
ax[0].scatter(X[:, 0], X[:, 1])
ax[1].scatter(X[:, 0], X[:, 1], c=probs[:, 0])
ax[2].scatter(X[:, 0], X[:, 1], c=probs[:, 1])
plt.show()
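The two membership probabilities can also be blended into per-point colours, giving a single plot with a smooth transition between the clusters; a minimal sketch, assuming the X and probs arrays from above (the two RGB colours are arbitrary choices):

import numpy as np

blue = np.array([0.1, 0.1, 0.55])   # RGB for component 0
pink = np.array([0.85, 0.45, 0.8])  # RGB for component 1
point_colors = probs[:, [0]] * blue + probs[:, [1]] * pink  # convex blend per point

fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=point_colors)
plt.show()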

How to create a numpy array filled with average value of a vector

I am not sure how to phrase my question any better. Basically, I have three lists of the same length, x, y and z, and I want to fill a 2D numpy array over the x/y plane with the average of the associated z values.
Here is how I can achieve what I want to do:
import numpy as np
import matplotlib.pyplot as plt

x = [37.59390426045407, 38.00530354847739, 38.28412244348653, 38.74871247986305, 38.73175910429809, 38.869008864244016, 39.188234404976555, 39.92835838352555, 40.881394113153334, 41.686136269465884]
y = [0.1305391767832006, 0.13764519613447768, 0.14573326951792354, 0.15090729309032114, 0.16355823707239897, 0.17327106424274763, 0.17749746339532224, 0.17310384614773594, 0.16545780437882962, 0.1604752704890856]
z = [0.05738534353865021, 0.012572155256903583, -0.021709582561809437, -0.11191337750722108, -0.07931921785775153, -0.06241610118871843, 0.014216349927058225, 0.042002641153291886, -0.029354425271534645, 0.061894011359833856]

n = 5
image = np.zeros(shape=(n, n))

# Fill the 2D array
x0 = min(x)
y0 = min(y)
dx = (max(x) - min(x))/n
dy = (max(y) - min(y))/n

# Loop over each 2D cell
for index_x in range(n):
    for index_y in range(n):
        # find the limits of the cell
        x1 = x0 + index_x * dx
        x2 = x0 + (index_x+1) * dx
        y1 = y0 + index_y * dy
        y2 = y0 + (index_y+1) * dy
        # find the points of z that lie within the range of the cell
        vec_z = [z[idx] for idx in range(len(z)) if x[idx]>=x1 and x[idx]<x2 and y[idx]>=y1 and y[idx]<y2]
        if vec_z:
            image[index_x, index_y] = np.mean(vec_z)

# In the end, used to create a surface plot
fig, ax = plt.subplots()
ax.imshow(image, cmap=plt.cm.gray, interpolation='nearest')
plt.show()
Is there an easier way to achieve this? I imagine there is a numpy method for that.
If I understand correctly what you want to do, maybe a 2D interpolation with scipy.interpolate.interp2d is what you are looking for (note that interp2d has been deprecated in recent SciPy versions, but the idea carries over to its replacements, e.g. scipy.interpolate.griddata for scattered data).
You define the interpolation function of your points:
from scipy.interpolate import interp2d

f = interp2d(x=x, y=y, z=z)
Then you define the X and Y meshgrid:
N = 50
x_axis = np.linspace(np.min(x), np.max(x), N)
y_axis = np.linspace(np.min(y), np.max(y), N)
X, Y = np.meshgrid(x_axis, y_axis)
Finally you can compute Z interpolated values on the meshgrid:
Z = np.zeros((N, N))
for i in range(N):
    for j in range(N):
        Z[i, j] = f(X[i, j], Y[i, j])
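As a side note, f can also evaluate the whole grid in one call when given the 1D axes, which is equivalent to the loop above:

Z = f(x_axis, y_axis)  # returns an (N, N) array over the full grid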
If you plot the interpolated surface in 3D, you get:
fig = plt.figure()
ax = fig.add_subplot(projection = '3d')
ax.plot_surface(X, Y, Z, cmap = 'jet', shade = False)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
plt.show()
Interpolated surface compared to interpolation data points:
ax.scatter(x, y, z, color = 'black', s = 100, alpha = 1)
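If the goal is the binned average from the question itself rather than a smooth interpolation, the double loop can also be collapsed into a single call to scipy.stats.binned_statistic_2d; a minimal sketch, assuming the x, y, z lists from the question:

import numpy as np
from scipy.stats import binned_statistic_2d

n = 5
stat, x_edges, y_edges, _ = binned_statistic_2d(x, y, z, statistic='mean', bins=n)
image = np.nan_to_num(stat)  # cells without points come back as NaN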

How do I fill/shade a certain section of my graph in python?

I have a function that I'd like to plot in python and shade the region of interest. I've tried using pyplot.fill_between() but cannot quite get what I want. I've attached an image with the region I want filled shaded in orange:
I plot the function (in blue); the graph is bounded by y = 0, y ≈ 0.05 and x = 0, and I wish to shade the relevant region (in orange).
Any tips on how to go about this?
Thanks in advance.
import numpy as np
import matplotlib.pyplot as plt

def fn(M, r_min):
    d = (1 - 2*M/r_min)
    x = M/(r_min)**2
    A_0 = d**-0.5
    A_dot = np.arange(-0.6, 0.5, 0.0001)  # x axis
    a = np.zeros(len(A_dot))
    for i in range(1, len(A_dot)):
        a[i] = -3*d*A_dot[i]**2 - 2*x*A_dot[i] + A_0**2*x**2  # y axis
    plt.plot(A_dot, a)
    plt.xlim(-0.55, 0.55)
    plt.axhline(y=0, color='black', linestyle='--')
    plt.axhline(y=0.049382716, color='black', linestyle='--')
    plt.axvline(x=0, color='black', linestyle='--')
    idx = np.argwhere(np.diff(np.sign(a))).flatten()  # finding intersections with the x axis
    plt.plot(A_dot[idx], a[idx], 'ro')
    plt.xlabel(r'$\frac{dA_0}{d\tau}$')  # raw string so \t is not read as a tab
    plt.ylabel('$|a|^2$')
    plt.show()
    return (A_dot, a)

fn(1, 3)
You need to give the x and y vectors as inputs to fill_between. To do that, you can define a mask selecting the range between the intersection point and 0 (add this to your fn function before plt.show()):
x_min = A_dot[idx[1]]
x_max = 0.0
mask_x = np.logical_and(A_dot >= x_min, A_dot <= x_max)
plt.fill_between(x=A_dot[mask_x], y1=a[mask_x], y2=0, color='orange')
Result:
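As a possible refinement (not part of the answer above): if the fill should start exactly at the root rather than at the nearest sample, the crossing can be located by linear interpolation between the two samples that bracket it. A sketch using the names from fn, assuming idx[1] indexes the sample just before the relevant crossing:

i = idx[1]
# linear interpolation of the zero crossing between samples i and i+1
x_root = A_dot[i] - a[i] * (A_dot[i + 1] - A_dot[i]) / (a[i + 1] - a[i])
mask_x = np.logical_and(A_dot >= x_root, A_dot <= 0.0)
plt.fill_between(x=A_dot[mask_x], y1=a[mask_x], y2=0, color='orange')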

Generating a random float below and above a line created by numpy arrays python

I would like to generate random points with float coordinates above and below a line created with numpy arrays.
For example, I have these line equations:
x_values = np.linspace(-1, 1, 100)
y1 = 2 * x_values -5
y2= -3 * x_values +2
plt.plot(x_values,y1, '-k')
plt.plot(x_values,y2, '-g')
I have tried this method from Generate random points above and below a line in Python, and it works if np.arange is used, like so:
import random
import numpy as np
import matplotlib.pyplot as plt

lower, upper = -25, 25
num_points = 1
x1 = [random.randrange(start=1, stop=9) for i in range(num_points)]
x2 = [random.randrange(start=1, stop=9) for i in range(num_points)]
y1 = [random.randrange(start=lower, stop=(2 * x -5) )for x in x1]
y2 = [random.randrange(start=(2 * x -5), stop=upper) for x in x2]
plt.plot(np.arange(10), 2 * np.arange(10) -5)
plt.scatter(x1, y1, c='blue')
plt.scatter(x2, y2, c='red')
However, I want to find a way to generate a random point if np.linspace(-1, 1, 100) is used to create the line graph. The difference is allowing float coordinates to be picked too, but I am unsure how.
Any ideas will be appreciated.
Here is an approach, using functions for the y-values. Random x positions are chosen uniformly over the x-range. For each random x, a y-value is chosen uniformly between the two lines at that x.
import numpy as np
import matplotlib.pyplot as plt
x_values = np.linspace(-1, 1, 100)
f1 = lambda x: 2 * x - 5
f2 = lambda x: -3 * x + 2
y1 = f1(x_values)
y2 = f2(x_values)
plt.plot(x_values, y1, '-k')
plt.plot(x_values, y2, '-g')
plt.fill_between(x_values, y1, y2, color='gold', alpha=0.2)
num_points = 20
xs = np.random.uniform(x_values[0], x_values[-1], num_points)
ys = np.random.uniform(f1(xs), f2(xs))
plt.scatter(xs, ys, color='crimson')
plt.show()
PS: Note that for simplicity this approach chooses x uniformly over its range. If you need an even distribution over the area of the trapezium, x needs to be less probable at the right (where the strip between the lines is narrow) and more probable at the left (where it is wide). You can visualize this with many more points and some transparency: with the simplistic approach, the right will look denser than the left.
The following code first generates x,y points in a parallelogram, and remaps the points on the wrong side back to their mirror position. The code looks like:
import numpy as np
import matplotlib.pyplot as plt
x0, x1 = -1, 1
x_values = np.linspace(x0, x1, 100)
f1 = lambda x: 2 * x - 5
f2 = lambda x: -3 * x + 2
y1 = f1(x_values)
y2 = f2(x_values)
plt.plot(x_values, y1, '-k')
plt.plot(x_values, y2, '-g')
plt.fill_between(x_values, y1, y2, color='gold', alpha=0.2)
num_points = 100_000
h0 = f2(x0) - f1(x0)
h1 = f2(x1) - f1(x1)
xs1 = np.random.uniform(x0, x1, num_points)
ys1 = np.random.uniform(0, h0 + h1, num_points) + f1(xs1)
xs = np.where(ys1 <= f2(xs1), xs1, x0 + x1 - xs1)
ys = np.where(ys1 <= f2(xs1), ys1, f1(xs) + h0 + h1 + f1(xs1) - ys1)
plt.scatter(xs, ys, color='crimson', alpha=0.2, ec='none', s=1)
plt.show()
Plot comparing the two approaches:
First of all, if you have 2 intersecting lines, there will most likely be a triangle in which you can pick random points. This is dangerously close to Bertrand's paradox, so make sure that your RNG suits its purpose.
If you don't really care about how "skewed" your randomness is, try this:
import numpy as np

left, right = -1, 1
# x_values = np.linspace(left, right, 100)
k1, k2 = 2, -3
b1, b2 = -5, 2
y1 = lambda x: k1*x + b1
y2 = lambda x: k2*x + b2

# If you need a point above the 1st equation, but below the second one,
# check the limits where you can pick the points under this condition.
nosol = False
if k1 == k2:  # parallel lines
    if b1 >= b2:
        inters = -100
        nosol = True
        print('No solution')
    else:
        rand_x = np.random.uniform(left, right)
        rand_y = np.random.uniform(y1(rand_x), y2(rand_x))
        print(f'Random point is ({round(rand_x, 2)}, {round(rand_y, 2)})')
else:
    inters = (b2 - b1)/(k1 - k2)
    if inters <= left:
        if k1 >= k2:
            nosol = True
    elif inters >= right:
        if k1 <= k2:
            nosol = True
    if nosol:
        print('No solution')
    else:
        if k1 > k2:
            right = inters
        else:
            left = inters
        # Pick a random X between "left" and "right";
        # pick whatever distribution you like or need
        rand_x = np.random.uniform(left, right)
        rand_y = np.random.uniform(y1(rand_x), y2(rand_x))
        print(f'Random point is ({round(rand_x, 2)}, {round(rand_y, 2)})')
If your random X needs to belong to a specific number sequence, use some other np.random function, e.g. randint or choice.
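For instance, to restrict x to the exact grid produced by np.linspace(-1, 1, 100), one option (a sketch, with the line coefficients from the question hard-coded) is to draw x with np.random.choice and then y uniformly between the two lines at that x:

import numpy as np

x_grid = np.linspace(-1, 1, 100)
rand_x = np.random.choice(x_grid)  # x restricted to the grid values
rand_y = np.random.uniform(2*rand_x - 5, -3*rand_x + 2)  # between y1 and y2 at that x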

Visualize the decision border (frontier) of my special SVC model using scikit learn [duplicate]

I could really use a tip to help me plot a decision boundary to separate two classes of data. I created some sample data (from a Gaussian distribution) via Python NumPy. In this case, every data point is a 2D coordinate, i.e., a column vector consisting of 2 rows, e.g.
[1
 2]
Let's assume I have 2 classes, class1 and class2, and I created 100 data points for class1 and 100 data points for class2 via the code below (assigned to the variables x1_samples and x2_samples).
mu_vec1 = np.array([0,0])
cov_mat1 = np.array([[2,0],[0,2]])
x1_samples = np.random.multivariate_normal(mu_vec1, cov_mat1, 100)
mu_vec1 = mu_vec1.reshape(1,2).T # to 1-col vector
mu_vec2 = np.array([1,2])
cov_mat2 = np.array([[1,0],[0,1]])
x2_samples = np.random.multivariate_normal(mu_vec2, cov_mat2, 100)
mu_vec2 = mu_vec2.reshape(1,2).T
When I plot the data points for each class, it would look like this:
Now, I came up with an equation for a decision boundary to separate both classes and would like to add it to the plot. However, I am not really sure how I can plot this function:
def decision_boundary(x_vec, mu_vec1, mu_vec2):
    g1 = (x_vec - mu_vec1).T.dot((x_vec - mu_vec1))
    g2 = 2*((x_vec - mu_vec2).T.dot((x_vec - mu_vec2)))
    return g1 - g2
I would really appreciate any help!
EDIT:
Intuitively (If I did my math right) I would expect the decision boundary to look somewhat like this red line when I plot the function...
Your question is more complicated than a simple plot: you need to draw the contour that maximizes the inter-class distance. Fortunately it's a well-studied field, particularly for SVM machine learning.
The easiest method is to install the scikit-learn module, which provides a lot of handy methods to draw boundaries: scikit-learn: Support Vector Machines
Code:
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import scipy
from sklearn import svm

mu_vec1 = np.array([0, 0])
cov_mat1 = np.array([[2, 0], [0, 2]])
x1_samples = np.random.multivariate_normal(mu_vec1, cov_mat1, 100)
mu_vec1 = mu_vec1.reshape(1, 2).T  # to 1-col vector

mu_vec2 = np.array([1, 2])
cov_mat2 = np.array([[1, 0], [0, 1]])
x2_samples = np.random.multivariate_normal(mu_vec2, cov_mat2, 100)
mu_vec2 = mu_vec2.reshape(1, 2).T

fig = plt.figure()
plt.scatter(x1_samples[:, 0], x1_samples[:, 1], marker='+')
plt.scatter(x2_samples[:, 0], x2_samples[:, 1], c='green', marker='o')

X = np.concatenate((x1_samples, x2_samples), axis=0)
Y = np.array([0]*100 + [1]*100)

C = 1.0  # SVM regularization parameter
clf = svm.SVC(kernel='linear', gamma=0.7, C=C)
clf.fit(X, Y)
Linear Plot
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]
plt.plot(xx, yy, 'k-')
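To also show the margins and support vectors of the linear SVM, the standard scikit-learn example can be adapted; a minimal sketch continuing from clf, xx, yy and a above:

# the geometric margin is 1/||w||; offset the line vertically by its projection
margin = 1 / np.sqrt(np.sum(clf.coef_ ** 2))
yy_down = yy - np.sqrt(1 + a ** 2) * margin
yy_up = yy + np.sqrt(1 + a ** 2) * margin
plt.plot(xx, yy_down, 'k--')
plt.plot(xx, yy_up, 'k--')
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=80, facecolors='none', edgecolors='k')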
Non-Linear Plot (RBF kernel)
C = 1.0  # SVM regularization parameter
clf = svm.SVC(kernel='rbf', gamma=0.7, C=C)
clf.fit(X, Y)

h = .02  # step size in the mesh
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.contour(xx, yy, Z, cmap=plt.cm.Paired)
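If filled decision regions are preferred over a single contour line, plt.contourf with some transparency works the same way; a small sketch reusing the mesh above:

plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired, edgecolors='k')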
Implementation
If you want to implement it yourself, you need to solve the following quadratic programming problem (the dual form of the SVM; see the Wikipedia article on support vector machines):

    maximize    sum_i a_i - 1/2 * sum_i sum_j a_i a_j y_i y_j (x_i . x_j)
    subject to  a_i >= 0  and  sum_i a_i y_i = 0

Unfortunately, for non-linear boundaries like the one you drew, it's a difficult problem relying on a kernel trick, and there isn't a clear-cut solution.
Based on the way you've written decision_boundary you'll want to use the contour function, as Joe noted above. If you just want the boundary line, you can draw a single contour at the 0 level:
f, ax = plt.subplots(figsize=(7, 7))
c1, c2 = "#3366AA", "#AA3333"
ax.scatter(*x1_samples.T, c=c1, s=40)
ax.scatter(*x2_samples.T, c=c2, marker="D", s=40)
x_vec = np.linspace(*ax.get_xlim())
ax.contour(x_vec, x_vec,
           decision_boundary(x_vec, mu_vec1, mu_vec2),
           levels=[0], cmap="Greys_r")
Which makes:
Those were some great suggestions, thanks a lot for your help! I ended up solving the equation analytically and this is the solution I ended up with (I just want to post it for future reference):
# 2-category classification with random 2D-sample data
# from a multivariate normal distribution

import numpy as np
from matplotlib import pyplot as plt

def decision_boundary(x_1):
    """Calculates the x_2 value for plotting the decision boundary."""
    return 4 - np.sqrt(-x_1**2 + 4*x_1 + 6 + np.log(16))

# Generating a Gaussian dataset:
# creating random vectors from the multivariate normal distribution
# given mean and covariance

mu_vec1 = np.array([0, 0])
cov_mat1 = np.array([[2, 0], [0, 2]])
x1_samples = np.random.multivariate_normal(mu_vec1, cov_mat1, 100)
mu_vec1 = mu_vec1.reshape(1, 2).T  # to 1-col vector

mu_vec2 = np.array([1, 2])
cov_mat2 = np.array([[1, 0], [0, 1]])
x2_samples = np.random.multivariate_normal(mu_vec2, cov_mat2, 100)
mu_vec2 = mu_vec2.reshape(1, 2).T  # to 1-col vector

# Main scatter plot and plot annotation
f, ax = plt.subplots(figsize=(7, 7))
ax.scatter(x1_samples[:, 0], x1_samples[:, 1], marker='o', color='green', s=40, alpha=0.5)
ax.scatter(x2_samples[:, 0], x2_samples[:, 1], marker='^', color='blue', s=40, alpha=0.5)
plt.legend(['Class1 (w1)', 'Class2 (w2)'], loc='upper right')
plt.title('Densities of 2 classes with 25 bivariate random patterns each')
plt.ylabel('x2')
plt.xlabel('x1')
ftext = 'p(x|w1) ~ N(mu1=(0,0)^t, cov1=I)\np(x|w2) ~ N(mu2=(1,1)^t, cov2=I)'
plt.figtext(.15, .8, ftext, fontsize=11, ha='left')

# Adding decision boundary to plot
x_1 = np.arange(-5, 5, 0.1)
bound = decision_boundary(x_1)
plt.plot(x_1, bound, 'r--', lw=3)
plt.show()
And the code can be found here
EDIT:
I also have a convenience function for plotting decision regions for classifiers that implement a fit and predict method, e.g., the classifiers in scikit-learn, which is useful if the solution cannot be found analytically. A more detailed description how it works can be found here.
You can create your own equation for the boundary, written in polar form around a centre point (x0, y0):

    r(theta) = sum_{i=0..n} ( a_i*sin(i*theta) + b_i*cos(i*theta) )

where you have to find the positions x0 and y0, as well as the constants a_i and b_i for the radius equation. So, you have 2*(n+1)+2 variables. Using scipy.optimize.leastsq is straightforward for this type of problem.
The code attached below builds the residual for leastsq, penalizing the points outside the boundary. The result for your problem, obtained with:
x, y = find_boundary(x2_samples[:,0], x2_samples[:,1], n)
ax.plot(x, y, '-k', lw=2.)
x, y = find_boundary(x1_samples[:,0], x1_samples[:,1], n)
ax.plot(x, y, '--k', lw=2.)
using n=1:
using n=2:
using n=5:
using n=7:
import numpy as np
from numpy import sin, cos, pi
from scipy.optimize import leastsq

def find_boundary(x, y, n, plot_pts=1000):

    def sines(theta):
        ans = np.array([sin(i*theta) for i in range(n+1)])
        return ans

    def cosines(theta):
        ans = np.array([cos(i*theta) for i in range(n+1)])
        return ans

    def residual(params, x, y):
        x0 = params[0]
        y0 = params[1]
        c = params[2:]
        r_pts = ((x-x0)**2 + (y-y0)**2)**0.5
        thetas = np.arctan2((y-y0), (x-x0))
        m = np.vstack((sines(thetas), cosines(thetas))).T
        r_bound = m.dot(c)
        delta = r_pts - r_bound
        delta[delta > 0] *= 10  # heavier penalty for points outside the boundary
        return delta

    # initial guess for x0 and y0
    x0 = x.mean()
    y0 = y.mean()
    params = np.zeros(2 + 2*(n+1))
    params[0] = x0
    params[1] = y0
    params[2:] += 1000

    popt, pcov = leastsq(residual, x0=params, args=(x, y),
                         ftol=1.e-12, xtol=1.e-12)

    thetas = np.linspace(0, 2*pi, plot_pts)
    m = np.vstack((sines(thetas), cosines(thetas))).T
    c = np.array(popt[2:])
    r_bound = m.dot(c)
    x_bound = popt[0] + r_bound*cos(thetas)
    y_bound = popt[1] + r_bound*sin(thetas)

    return x_bound, y_bound
I like the mglearn library to draw decision boundaries. Here is one example from the book "Introduction to Machine Learning with Python" by A. Mueller:
import mglearn
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

# X, y: any 2-feature dataset; the book uses X, y = mglearn.datasets.make_forge()
fig, axes = plt.subplots(1, 3, figsize=(10, 3))
for n_neighbors, ax in zip([1, 3, 9], axes):
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
    mglearn.plots.plot_2d_separator(clf, X, fill=True, eps=0.5, ax=ax, alpha=.4)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title("{} neighbor(s)".format(n_neighbors))
    ax.set_xlabel("feature 0")
    ax.set_ylabel("feature 1")
axes[0].legend(loc=3)
If you want to use scikit learn, you can write your code like this:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

# read data
data = pd.read_csv('ex2data1.txt', header=None)
X = data[[0, 1]].values
y = data[2]

# use LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X, y)

# Coefficients of the features in the decision function (theta 1 to theta n)
parameters = log_reg.coef_[0]
# Intercept (a.k.a. bias) added to the decision function (theta 0)
parameter0 = log_reg.intercept_

# Plotting the decision boundary
fig = plt.figure(figsize=(10, 7))
x_values = [np.min(X[:, 0]) - 5, np.max(X[:, 0]) + 5]
# calculate y values on the boundary: theta0 + theta1*x + theta2*y = 0
y_values = np.dot((-1. / parameters[1]), (np.dot(parameters[0], x_values) + parameter0))
colors = ['red' if l == 0 else 'blue' for l in y]
plt.scatter(X[:, 0], X[:, 1], label='Logistics regression', color=colors)
plt.plot(x_values, y_values, label='Decision Boundary')
plt.show()
see: Building-a-Logistic-Regression-with-Scikit-learn
Just solved a very similar problem with a different approach (root finding) and wanted to post this alternative as an answer here for future reference:
import numpy as np
import scipy.optimize

def discr_func(x, y, cov_mat, mu_vec):
    """
    Calculates the value of the discriminant function for a dx1 dimensional
    sample given covariance matrix and mean vector.

    Keyword arguments:
        x_vec: A dx1 dimensional numpy array representing the sample.
        cov_mat: numpy array of the covariance matrix.
        mu_vec: dx1 dimensional numpy array of the sample mean.

    Returns a float value as result of the discriminant function.
    """
    x_vec = np.array([[x], [y]])

    W_i = (-1/2) * np.linalg.inv(cov_mat)
    assert W_i.shape[0] > 1 and W_i.shape[1] > 1, 'W_i must be a matrix'

    w_i = np.linalg.inv(cov_mat).dot(mu_vec)
    assert w_i.shape[0] > 1 and w_i.shape[1] == 1, 'w_i must be a column vector'

    omega_i_p1 = (((-1/2) * (mu_vec).T).dot(np.linalg.inv(cov_mat))).dot(mu_vec)
    omega_i_p2 = (-1/2) * np.log(np.linalg.det(cov_mat))
    omega_i = omega_i_p1 - omega_i_p2
    assert omega_i.shape == (1, 1), 'omega_i must be a scalar'

    g = ((x_vec.T).dot(W_i)).dot(x_vec) + (w_i.T).dot(x_vec) + omega_i
    return float(g)

#g1 = discr_func(x, y, cov_mat=cov_mat1, mu_vec=mu_vec_1)
#g2 = discr_func(x, y, cov_mat=cov_mat2, mu_vec=mu_vec_2)

x_est50 = list(np.arange(-6, 6, 0.1))
y_est50 = []
for i in x_est50:
    y_est50.append(scipy.optimize.bisect(lambda y: discr_func(i, y, cov_mat=cov_est_1, mu_vec=mu_est_1) -
                                         discr_func(i, y, cov_mat=cov_est_2, mu_vec=mu_est_2), -10, 10))
y_est50 = [float(i) for i in y_est50]
Here is the result:
(blue: the quadratic case, red: the linear case with equal variances)
I know this question has been answered in a very thorough way analytically. I just wanted to share a possible 'hack' to the problem. It is unwieldy but gets the job done.
Start by building a mesh grid of the 2d area, and then, based on the classifier, build a class map of the entire space. Subsequently, detect changes in the decision made row-wise, store the edge points in a list, and scatter plot the points.
import numpy as np

def disc(x):  # returns the class of the point based on location x = [x, y]
    temp = 0.5 + 0.5*np.sign(disc0(x) - disc1(x))
    # disc0() and disc1() are the discriminant functions of the respective classes
    return 0*temp + 1*(1 - temp)

num = 200
a = np.linspace(-4, 4, num)
b = np.linspace(-6, 6, num)
X, Y = np.meshgrid(a, b)

def decColor(x, y):
    temp = np.zeros((num, num))
    print(x.shape, np.size(x, axis=0))
    for l in range(num):
        for m in range(num):
            p = np.array([x[l, m], y[l, m]])
            # print(p)
            temp[l, m] = disc(p)
    return temp

boundColorMap = decColor(X, Y)

group = 0
boundary = []
for x in range(num):
    group = boundColorMap[x, 0]
    for y in range(num):
        if boundColorMap[x, y] != group:
            boundary.append([X[x, y], Y[x, y]])
            group = boundColorMap[x, y]
boundary = np.array(boundary)
Sample Decision Boundary for a simple bivariate gaussian classifier
Given two bi-variate normal distributions, you can use Gaussian Discriminant Analysis (GDA) to come up with a decision boundary as the difference between the logs of the two pdfs.
Here's a way to do it using scipy's multivariate_normal (the code is not optimized):
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from numpy.linalg import norm
from numpy.linalg import inv
from scipy.spatial.distance import mahalanobis

def normal_scatter(mean, cov, p):
    size = 100
    sigma_x = cov[0, 0]
    sigma_y = cov[1, 1]
    mu_x = mean[0]
    mu_y = mean[1]
    x_ps, y_ps = np.random.multivariate_normal(mean, cov, size).T
    x, y = np.mgrid[mu_x-3*sigma_x:mu_x+3*sigma_x:1/size, mu_y-3*sigma_y:mu_y+3*sigma_y:1/size]
    grid = np.empty(x.shape + (2,))
    grid[:, :, 0] = x; grid[:, :, 1] = y
    z = p*multivariate_normal.pdf(grid, mean, cov)
    return x_ps, y_ps, x, y, z

# Dist 1
mu_1 = np.array([1, 1])
cov_1 = .5*np.array([[1, 0], [0, 1]])
p_1 = .5
x_ps, y_ps, x, y, z = normal_scatter(mu_1, cov_1, p_1)
plt.plot(x_ps, y_ps, 'x')
plt.contour(x, y, z, cmap='Blues', levels=3)

# Dist 2
mu_2 = np.array([2, 1])
#cov_2 = np.array([[2, -1], [-1, 1]])
cov_2 = cov_1
p_2 = .5
x_ps, y_ps, x, y, z = normal_scatter(mu_2, cov_2, p_2)
plt.plot(x_ps, y_ps, '.')
plt.contour(x, y, z, cmap='Oranges', levels=3)

# Decision Boundary
X = np.empty(x.shape + (2,))
X[:, :, 0] = x; X[:, :, 1] = y
g = np.log(p_1*multivariate_normal.pdf(X, mu_1, cov_1)) - np.log(p_2*multivariate_normal.pdf(X, mu_2, cov_2))
plt.contour(x, y, g, [0])

plt.grid()
plt.axhline(y=0, color='k')
plt.axvline(x=0, color='k')
plt.plot([mu_1[0], mu_2[0]], [mu_1[1], mu_2[1]], 'k')
plt.show()
If p_1 != p_2, then you get a non-linear boundary. The decision boundary is given by g above.
Then to plot the decision hyper-plane (a line in 2D), you need to evaluate g on a 2D mesh and then take the contour, which will give the separating line.
You can also assume equal covariance matrices for both distributions, which will give a linear decision boundary. In this case, you can replace the calculation of g in the above code with the following:
W = inv(cov_1).dot(mu_1-mu_2)
x_0 = 1/2*(mu_1+mu_2) - cov_1.dot(np.log(p_1/p_2)).dot((mu_1-mu_2)/mahalanobis(mu_1, mu_2, cov_1))
X = np.empty(x.shape + (2,))
X[:, :, 0] = x; X[:, :, 1] = y
g = (X-x_0).dot(W)
I use the method from the book Python Machine Learning, 2nd Edition:
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.8,
                    c=colors[idx],
                    marker=markers[idx],
                    label=cl,
                    edgecolor='black')

    # highlight test samples
    if test_idx:
        # plot all samples
        X_test, y_test = X[test_idx, :], y[test_idx]
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    c='none',  # 'none' (not '') for unfilled markers in current Matplotlib
                    edgecolor='black',
                    alpha=1.0,
                    linewidth=1,
                    marker='o',
                    s=100,
                    label='test set')
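For completeness, a hedged usage sketch of plot_decision_regions with iris-style data and an SVM (the dataset and classifier here are just illustrative choices, not part of the original answer):

from sklearn import datasets
from sklearn.svm import SVC

iris = datasets.load_iris()
X = iris.data[:, [2, 3]]  # petal length and width
y = iris.target

clf = SVC(kernel='rbf', gamma=0.7, C=1.0).fit(X, y)
plot_decision_regions(X, y, classifier=clf)
plt.xlabel('petal length')
plt.ylabel('petal width')
plt.legend(loc='upper left')
plt.show()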
Since version 1.1, sklearn has a function for this:
https://scikit-learn.org/stable/modules/generated/sklearn.inspection.DecisionBoundaryDisplay.html#sklearn.inspection.DecisionBoundaryDisplay
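A minimal sketch of that helper, assuming a fitted 2-feature classifier clf and data X, y as in the answers above:

from sklearn.inspection import DecisionBoundaryDisplay
import matplotlib.pyplot as plt

DecisionBoundaryDisplay.from_estimator(clf, X, response_method="predict", alpha=0.4)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor="k")
plt.show()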

Way to contour outer edge of selected grid region in Python

I have the following code:
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(-np.pi/2, np.pi/2, 30)
y = np.linspace(-np.pi/2, np.pi/2, 30)
x,y = np.meshgrid(x,y)
z = np.sin(x**2+y**2)[:-1,:-1]
fig,ax = plt.subplots()
ax.pcolormesh(x,y,z)
Which gives this image:
Now let's say I want to highlight the edges of certain grid boxes:
highlight = (z > 0.9)
I could use the contour function, but this would result in a "smoothed" contour. I just want to highlight the edge of a region, following the edge of the grid boxes.
The closest I've come is adding something like this:
highlight = np.ma.masked_less(highlight, 1)
ax.pcolormesh(x, y, highlight, facecolor = 'None', edgecolors = 'w')
Which gives this plot:
Which is close, but what I really want is for only the outer and inner edges of that "donut" to be highlighted.
So essentially I am looking for some hybrid of the contour and pcolormesh functions - something that follows the contour of some value, but follows grid bins in "steps" rather than connecting point-to-point. Does that make sense?
Side note: in the pcolormesh arguments I have edgecolors = 'w', but the edges still come out blue. What's going on there?
EDIT:
JohanC's initial answer using add_iso_line() works for the question as posed. However, the actual data I'm using is on a very irregular x,y grid, which cannot be converted to 1D (as is required for add_iso_line()).
I am using data which has been converted from polar coordinates (rho, phi) to cartesian (x,y). The 2D solution posed by JohanC does not appear to work for the following case:
import numpy as np
import matplotlib.pyplot as plt
from scipy import ndimage

def pol2cart(rho, phi):
    x = rho * np.cos(phi)
    y = rho * np.sin(phi)
    return (x, y)

phi = np.linspace(0, 2*np.pi, 30)
rho = np.linspace(0, 2, 30)
pp, rr = np.meshgrid(phi, rho)
xx, yy = pol2cart(rr, pp)
z = np.sin(xx**2 + yy**2)

scale = 5
zz = ndimage.zoom(z, scale, order=0)

fig, ax = plt.subplots()
ax.pcolormesh(xx, yy, z[:-1, :-1])
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xmin, xmax = xx.min(), xx.max()
ymin, ymax = yy.min(), yy.max()
ax.contour(np.linspace(xmin, xmax, zz.shape[1]) + (xmax-xmin)/z.shape[1]/2,
           np.linspace(ymin, ymax, zz.shape[0]) + (ymax-ymin)/z.shape[0]/2,
           np.where(zz < 0.9, 0, 1), levels=[0.5], colors='red')
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
This post shows a way to draw such lines. As it is not straightforward to adapt it to the current pcolormesh, the following code demonstrates a possible adaptation.
Note that the 2d versions of x and y have been renamed, as the 1d versions are needed for the line segments.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

x = np.linspace(-np.pi / 2, np.pi / 2, 30)
y = np.linspace(-np.pi / 2, np.pi / 2, 30)
xx, yy = np.meshgrid(x, y)
z = np.sin(xx ** 2 + yy ** 2)[:-1, :-1]

fig, ax = plt.subplots()
ax.pcolormesh(x, y, z)

def add_iso_line(ax, value, color):
    v = np.diff(z > value, axis=1)
    h = np.diff(z > value, axis=0)

    l = np.argwhere(v.T)
    vlines = np.array(list(zip(np.stack((x[l[:, 0] + 1], y[l[:, 1]])).T,
                               np.stack((x[l[:, 0] + 1], y[l[:, 1] + 1])).T)))
    l = np.argwhere(h.T)
    hlines = np.array(list(zip(np.stack((x[l[:, 0]], y[l[:, 1] + 1])).T,
                               np.stack((x[l[:, 0] + 1], y[l[:, 1] + 1])).T)))
    lines = np.vstack((vlines, hlines))
    ax.add_collection(LineCollection(lines, lw=1, colors=color))

add_iso_line(ax, 0.9, 'r')
plt.show()
Here is an adaptation of the second answer, which can work with only 2d arrays. The array is upscaled with nearest-neighbour zoom (order=0), so the contour of the upscaled array follows the blocky cell edges instead of being smoothed:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from scipy import ndimage

x = np.linspace(-np.pi / 2, np.pi / 2, 30)
y = np.linspace(-np.pi / 2, np.pi / 2, 30)
x, y = np.meshgrid(x, y)
z = np.sin(x ** 2 + y ** 2)

scale = 5
zz = ndimage.zoom(z, scale, order=0)

fig, ax = plt.subplots()
ax.pcolormesh(x, y, z[:-1, :-1])
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xmin, xmax = x.min(), x.max()
ymin, ymax = y.min(), y.max()
ax.contour(np.linspace(xmin, xmax, zz.shape[1]) + (xmax-xmin)/z.shape[1]/2,
           np.linspace(ymin, ymax, zz.shape[0]) + (ymax-ymin)/z.shape[0]/2,
           np.where(zz < 0.9, 0, 1), levels=[0.5], colors='red')
ax.set_xlim(*xlim)
ax.set_ylim(*ylim)
plt.show()
I'll try to refactor the add_iso_line method in order to make it clearer and open to optimisations. So, at first, there comes a must-do part:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
x = np.linspace(-np.pi/2, np.pi/2, 30)
y = np.linspace(-np.pi/2, np.pi/2, 30)
x, y = np.meshgrid(x,y)
z = np.sin(x**2+y**2)[:-1,:-1]
fig, ax = plt.subplots()
ax.pcolormesh(x,y,z)
xlim, ylim = ax.get_xlim(), ax.get_ylim()
highlight = (z > 0.9)
Now highlight is a binary array that looks like this:
After that we can extract the indexes of True cells, look for False neighbours, and identify the positions of the 'red' lines. I'm not comfortable enough doing it in a vectorised manner (like the add_iso_line method above), so here is a simple loop:
lines = []
cells = zip(*np.where(highlight))
for x, y in cells:
    # compare each True cell with its four neighbours; the "- 1" keeps the
    # border checks from indexing past the edge of the array
    if x == 0 or highlight[x - 1, y] == 0:
        lines.append(([x, y], [x, y + 1]))
    if x == highlight.shape[0] - 1 or highlight[x + 1, y] == 0:
        lines.append(([x + 1, y], [x + 1, y + 1]))
    if y == 0 or highlight[x, y - 1] == 0:
        lines.append(([x, y], [x + 1, y]))
    if y == highlight.shape[1] - 1 or highlight[x, y + 1] == 0:
        lines.append(([x, y + 1], [x + 1, y + 1]))
And, finally, I resize and center the coordinates of the lines in order to fit the pcolormesh:
lines = (np.array(lines) / highlight.shape - [0.5, 0.5]) * [xlim[1] - xlim[0], ylim[1] - ylim[0]]
ax.add_collection(LineCollection(lines, colors='r'))
plt.show()
In conclusion, this is very similar to JohanC's solution and, in general, slower. Fortunately, we can reduce the number of cells significantly by extracting only the contours, using the opencv-python package:
import cv2
highlight = highlight.astype(np.uint8)
contours, hierarchy = cv2.findContours(highlight, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
cells = np.vstack(contours).squeeze()
This is an illustration of cells being checked:
