How to draw decision boundary in SVM sklearn data in python? - python

I am reading email data from a training set and creating train_matrix, train_labels and test_labels. How do I display the decision boundary using matplotlib in Python? I am using the SVM from sklearn. There are online examples for built-in data sets such as iris, but the plot fails on my custom data. Here is my code
Error :
Traceback (most recent call last):
File "classifier-plot.py", line 115, in <module>
Z = Z.reshape(xx.shape)
ValueError: cannot reshape array of size 260 into shape (150,1750)
Code:
import os
import numpy as np
from collections import Counter
from sklearn import svm
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
def make_Dictionary(root_dir):
    """Build the vocabulary of the most frequent words in a mail directory.

    Every file directly inside ``root_dir`` is read line by line and split on
    whitespace.  Tokens that are not purely alphabetic, or that are only one
    character long, are discarded.

    Args:
        root_dir: Directory containing one e-mail per file.

    Returns:
        A list of ``(word, count)`` tuples for at most the 3000 most common
        remaining words, most frequent first.
    """
    word_counts = Counter()
    for name in os.listdir(root_dir):
        with open(os.path.join(root_dir, name)) as mail:
            for line in mail:
                word_counts.update(line.split())
    # Build a filtered copy instead of deleting entries while looping over
    # the keys: on Python 3 ``dict.keys()`` is a live view, and removing
    # items during iteration raises RuntimeError.
    kept = Counter({
        word: count for word, count in word_counts.items()
        if word.isalpha() and len(word) > 1
    })
    return kept.most_common(3000)
def extract_features(mail_dir):
    """Turn every mail in ``mail_dir`` into a word-count feature vector.

    Only the third line of each file (line index 2) is tokenised; all other
    lines are ignored.  Each word of that line that appears in the
    module-level ``dictionary`` (a list of ``(word, count)`` tuples) stores
    its number of occurrences in the column given by the word's position in
    ``dictionary``.  Words that are not in ``dictionary`` are ignored.

    Args:
        mail_dir: Directory containing one e-mail per file.

    Returns:
        ``(features_matrix, train_labels)``: a ``(n_files, 3000)`` float
        array of counts and a length ``n_files`` float array that is 1 for
        files whose name starts with ``"spmsg"`` and 0 otherwise.
    """
    files = [os.path.join(mail_dir, name) for name in os.listdir(mail_dir)]
    features_matrix = np.zeros((len(files), 3000))
    train_labels = np.zeros(len(files))
    # One word -> column table replaces a full scan of the vocabulary for
    # every single token.
    word_index = {entry[0]: col for col, entry in enumerate(dictionary)}
    for doc_id, path in enumerate(files):
        with open(path) as mail:
            for line_no, line in enumerate(mail):
                if line_no != 2:
                    continue
                for word, occurrences in Counter(line.split()).items():
                    col = word_index.get(word)
                    if col is not None:
                        features_matrix[doc_id, col] = occurrences
                break  # nothing after the third line is used
        # basename() copes with any path separator, unlike split('/').
        if os.path.basename(path).startswith("spmsg"):
            train_labels[doc_id] = 1
    return features_matrix, train_labels
# Driver script: build the vocabulary, train an RBF SVM on the mails and try
# to draw its decision boundary.  Written for Python 2 (print statements and
# integer `/` inside the slices below).
TRAIN_DIR = "../train-mails"
TEST_DIR = "../test-mails"
dictionary = make_Dictionary(TRAIN_DIR)
print "reading and processing emails from file."
features_matrix, labels = extract_features(TRAIN_DIR)
test_feature_matrix, test_labels = extract_features(TEST_DIR)
model = svm.SVC(kernel="rbf", C=10000)
print "Training model."
# Keep only the first tenth of the training samples and labels.
features_matrix = features_matrix[:len(features_matrix)/10]
labels = labels[:len(labels)/10]
#train model
model.fit(features_matrix, labels)
predicted_labels = model.predict(test_feature_matrix)
print "FINISHED classifying. accuracy score : "
print accuracy_score(test_labels, predicted_labels)
##----------------
# Plotting part, adapted from a two-feature example.
h = .02 # step size in the mesh
# we create an instance of SVM and fit out data. We do not scale our
# data since we want to plot the support vectors
C = 1.0 # SVM regularization parameter; only referenced by the commented-out line below
X = features_matrix
y = labels
# Fits the already-trained model a second time on the same data.
svc = model.fit(X, y)
#svm.SVC(kernel='linear', C=C).fit(X, y)
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
# NOTE(review): the vertical range is taken from the labels y, whereas the
# scatter plot below puts X[:, 1] on that axis.
y_min, y_max = y[:].min() - 1, y[:].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# title for the plots
# NOTE(review): the title says "linear kernel" but the model above uses kernel="rbf".
titles = ['SVC with linear kernel']
# predicted_labels holds one prediction per test mail, not one per mesh
# point; the mesh prediction is commented out.
Z = predicted_labels#svc.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
# This reshape raises the ValueError from the traceback, because Z does not
# have xx.size elements.
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm)
# NOTE(review): the axis labels are left over from the iris example.
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
plt.title(titles[0])
plt.show()

In the tutorial that you were following Z is computed by applying the classifier to a set of feature vectors generated to form a regular NxM grid. This makes the plot smooth.
When you replaced
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
with
Z = predicted_labels
you replaced this regular grid with the predictions taken on your dataset. The next line failed with an error since it could not reshape an array of size len(files) to an NxM matrix. There is no reason len(files) = NxM.
There is a reason why you could not follow the tutorial directly. Your data dimension is 3000, so your decision boundary would be a 2999-dimensional surface in a 3000-dimensional space (a flat hyperplane for a linear kernel, a curved hypersurface for the RBF kernel you are using). This is not easy to visualize.
In the tutorial the dimension is 4 and it is reduced to 2 for visualization.
The best way to reduce the dimension of your data depends on the data. In the tutorial we just pick the first two components of the 4-dimensional vector.
Another option that works well in many cases is to use Principal Component Analysis to reduce the dimension of data.
from sklearn.decomposition import PCA
# Project the 3000-dimensional word counts onto their first two principal
# components so that the classifier works in a plane that can be plotted.
pca = PCA(n_components=2)
# fit_transform() fits the projection and applies it in one pass, so a
# separate pca.fit() call beforehand would only repeat the same work.
reduced_matrix = pca.fit_transform(features_matrix)
model.fit(reduced_matrix, labels)
Such model can be used for 2D visualization. You can just follow the tutorial directly and define
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
A complete but not a very impressive example
We do not have access to your email data, so for illustration we could just use random data.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.decomposition import PCA
# initialize algorithms and data with random
model = svm.SVC(gamma=0.001, C=100.0)
pca = PCA(n_components=2)
rng = np.random.RandomState(0)
U = rng.rand(200, 2000)               # 200 samples, 2000 features
v = (rng.rand(200) * 2).astype('int')  # random 0/1 labels
# reduce to two dimensions, then train the classifier in that plane
U2 = pca.fit_transform(U)
model.fit(U2, v)
# generate grid for plotting
h = 0.2
x_min, x_max = U2[:, 0].min() - 1, U2[:, 0].max() + 1
y_min, y_max = U2[:, 1].min() - 1, U2[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h))
# create decision boundary plot: classify every grid point with the trained
# model and reshape the predictions back onto the grid
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
# same colour map as the regions, so that points and regions can be compared
plt.scatter(U2[:, 0], U2[:, 1], c=v, cmap=plt.cm.coolwarm)
plt.show()
Would produce a decision boundary that does not look very impressive.
Indeed the first two principal components capture just about 1% of the information contained in the data
>>> print(pca.explained_variance_ratio_)
[ 0.00841935 0.00831764]
If now you introduce just a little bit of carefully disguised asymmetry you would already see an effect.
Modify the data by shifting, for every sample of class 1, a single randomly selected coordinate
# Random column indices (values fall in [0, 200)); entry i is used for row i of U.
random_shifts = (rng.rand(2000)*200).astype('int')
# Shift that single coordinate, for the samples labelled 1 only.
for i in range(U.shape[0]):
    if v[i] == 1:
        U[i, random_shifts[i]] += 5.0
And applying PCA you would get somewhat more informative picture.
Note that here the first two principal components already explain about 5% of the variance and the red part of the picture contains many more red points than blue ones.

Related

How to plot perceptron decision boundary and data set in python

I wrote multilayer-perceptron, using three layers (0,1,2). I want to plot the decision boundary and the data-set(eight features long) that i classified, Using python.
How do i plot it on the screen, using one of the python libraries?
Weight function -> matrix[3][8]
Sample x -> vector[8]
#-- Trains the boundary decision, and test it. --#
def perceptron(x, y):
    """Train a three-class perceptron and return its training error rate.

    For 2500 rounds a sample is drawn at random with ``random.randint``; when
    the class with the largest score differs from the sample's label, the
    label's weight row is moved towards the sample and the predicted row is
    moved away from it, both with step size 0.1.

    Args:
        x: Sequence of samples; each sample is a sequence of values that can
            be converted with ``float``.
        y: Sequence of class labels used as row indices (0, 1 or 2).

    Returns:
        The fraction of samples in ``x`` that the trained weights misclassify.
    """
    n_samples = len(x)
    n_features = len(x[0])
    learning_rate = 0.1
    n_rounds = 2500
    weights = np.zeros((3, n_features))
    for _ in range(n_rounds):
        pick = random.randint(0, n_samples - 1)
        sample = np.array([float(value) for value in x[pick]])
        guess = np.argmax(np.dot(weights, sample))
        target = y[pick]
        if guess == target:
            continue
        step = sample * learning_rate
        weights[target] = np.add(weights[target], step)
        weights[guess] = np.subtract(weights[guess], step)
    # Count the training samples that the final weights get wrong.
    mistakes = 0
    for idx in range(n_samples):
        if y[idx] != np.argmax(np.dot(weights, x[idx])):
            mistakes += 1
    return float(mistakes) / n_samples
def main():
    """Read the training samples and labels named on the command line and
    print the training error of the perceptron fitted to them."""
    labels = []
    samples = readTrain_X(sys.argv[1], [[]])  # Reads the training data set.
    # The return value is discarded, so readTrain_Y presumably fills `labels` in place.
    readTrain_Y(sys.argv[2], labels)
    print(perceptron(samples, labels))
You cannot plot 8 features directly: there is no way to visualize an 8D space. What you can do is perform dimensionality reduction to 2D, using PCA or t-SNE, for visualization. Once the data is reduced to 2D, you can create a grid of values and use the probabilities returned by the model to visualize the decision boundary.
Reference: Link

Decision boundaries for nearest centroid

I am trying to draw decision boundaries for different classifiers including the nearestcentroid, but when I use this code
if hasattr(clf, "decision_function"):
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
else:
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
I get an error saying 'NearestCentroid' object has no attribute 'predict_proba'. How can I fix this?
You can make your own predict_proba:
from sklearn.utils.extmath import softmax
from sklearn.metrics.pairwise import pairwise_distances
def predict_proba(self, X):
    """Return softmax pseudo-probabilities derived from centroid distances.

    Args:
        X: Samples to score, one row per sample.

    Returns:
        Array with one row per sample and one column per centroid in
        ``self.centroids_``; each row sums to 1.
    """
    distances = pairwise_distances(X, self.centroids_, metric=self.metric)
    # Negate before the softmax: the nearest centroid must receive the
    # largest probability, so a smaller distance has to give a larger score.
    probs = softmax(-distances)
    return probs
clf = NearestCentroid()
clf.predict_proba = predict_proba.__get__(clf)
clf.fit(X_train, y_train)
clf.predict_proba(X_test)
Assuming your X has two features, you can generate a meshgrid where each axis pertains to one of the features.
Assuming X is your features array with two features - shape would be (N, 2), where N is the number of samples - and y is your target array.:
# first determine the min and max boundaries for generating the meshgrid
feat1_min, feat1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
feat2_min, feat2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
Now generate your meshgrid and make predictions along the grid:
xx, yy = np.meshgrid(np.arange(feat1_min, feat1_max , 0.02),
np.arange(feat2_min, feat2_max , 0.02)) # 0.02 is step size
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Now make the plot:
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap="autumn")
plt.scatter(X[:, 0], X[:, 1], c=y, cmap="autumn",
edgecolor='k', s=10)
plt.show()
As BearBrown pointed out, you only check whether "decision_function" is an attribute of clf. You never check whether "predict_proba" is an attribute of clf:
if hasattr(clf, "decision_function"):
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
elif hasattr(clf, "predict_proba"):  # only call predict_proba when clf really provides it
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
else:  # neither scoring method exists: fail with an explicit message
    raise AttributeError("Neither 'decision_function' nor 'predict_proba' found in clf")
After this, you should check why the attribute you expect is missing from clf.

How can I make my 2D Gaussian fit to my image

I am trying to fit a 2D Gaussian to an image to find the location of the brightest point in it. My code looks like this:
import numpy as np
import astropy.io.fits as fits
import os
from astropy.stats import mad_std
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
from matplotlib.patches import Circle
from lmfit.models import GaussianModel
from astropy.modeling import models, fitting
def gaussian(xycoor, x0, y0, sigma, amp):
    """Evaluate a circular 2D Gaussian.

    Args:
        xycoor: Pair ``(x, y)`` of coordinates (scalars or equally shaped arrays).
        x0: x coordinate of the centre.
        y0: y coordinate of the centre.
        sigma: Width of the Gaussian, in the same units as the coordinates.
        amp: Value at the centre.

    Returns:
        ``amp * exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2))``.
    """
    x, y = xycoor
    inv_two_var = 1 / (2 * sigma**2)
    return amp * np.exp(-inv_two_var * ((x - x0)**2 + (y - y0)**2))


def fit(image):
    """Fit a circular 2D Gaussian to the brightest feature of an image.

    The median of the input is subtracted first.  A 4D input is reduced to
    its ``[0, 0, :, :]`` plane; a 2D input is used as it is.

    Args:
        image: 2D array, or 4D array whose first two axes are indexed at 0.

    Returns:
        Array ``[x0, y0, sigma, amp]`` of fitted parameters, where ``x0`` is
        the column and ``y0`` the row of the centre, in pixels.

    Raises:
        ValueError: If the input is neither 2D nor 4D, or if no pixel lies
            above the median (there is nothing to fit).
    """
    image = np.asarray(image, dtype=float)
    image = image - np.median(image)
    if image.ndim == 4:
        image = image[0, 0, :, :]
    if image.ndim != 2:
        raise ValueError("expected a 2D image or a 4D cube, got %d dimensions" % image.ndim)
    n_rows, n_cols = image.shape

    amp = np.max(image)
    if not amp > 0:
        raise ValueError("image has no pixel above its median; nothing to fit")

    # Position of the (first) brightest pixel as starting centre.
    y0, x0 = np.unravel_index(np.argmax(image), image.shape)

    # sigma is a width in PIXELS, so it must not be guessed or bounded by
    # pixel intensities.  Estimate it from the area above half maximum:
    # area = pi * r**2 with r = sigma * sqrt(2 * ln 2).
    sigma_max = float(max(n_rows, n_cols))
    bright = np.count_nonzero(image >= amp / 2.0)
    sigma = np.sqrt(bright / np.pi) / np.sqrt(2 * np.log(2))
    sigma = min(max(sigma, 0.5), sigma_max)

    x = np.arange(0, n_cols, 1)
    y = np.arange(0, n_rows, 1)
    xx, yy = np.meshgrid(x, y)

    guess = [x0, y0, sigma, amp]
    # Lower bound on sigma is slightly above 0 to avoid dividing by zero.
    low = [0, 0, 1e-6, 0]
    # x0 is limited by the number of columns, y0 by the number of rows.
    upper = [n_cols, n_rows, sigma_max, amp * 2]
    # pcov (the covariance estimate of the parameters) is not used.
    params, pcov = curve_fit(gaussian, (xx.ravel(), yy.ravel()), image.ravel(),
                             p0=guess, bounds=(low, upper))
    return params
def plotting(image, params):
    """Show ``image`` with the fit result drawn on top in red.

    ``params[0]`` and ``params[1]`` give the position of the 'x' marker and
    of the circle's centre; ``params[2]`` is used as the circle's radius.
    """
    centre = (params[0], params[1])
    fig, ax = plt.subplots()
    ax.imshow(image)
    ax.scatter(centre[0], centre[1], s=10, c='red', marker='x')
    outline = Circle(centre, params[2], facecolor='none',
                     edgecolor='red', linewidth=1)
    ax.add_patch(outline)
    plt.show()
# Load the FITS image, subtract its median and keep the [0, 0] plane.
data = fits.getdata('AzTECC100.fits') #read in file
med = np.median(data)
data = data - med
data = data[0,0,:,:]
# NOTE(review): fit() performs its own median subtraction and its own
# reduction of the input, so this preprocessing duplicates work done there.
parameters = fit(data)
# Display the image with the fitted centre and width overlaid.
plotting(data, parameters)
The image is plotting and the code is giving no errors but the fitting isn't working. It's just putting an x wherever the x0 and y0 are. The pixel values in my image are very small. The max value is 0.0007 and std dev is 0.0001 and the x and y are a few orders of magnitude larger. So I believe my problem is that because of this my eq is going to zero everywhere so the curve_fit is failing. I'm wondering if there's a better way to construct my gaussian so that it plots correctly?
I do not have access to your image. Instead I have generated some test "image" as follows:
y, x = np.indices((51,51))
x -= 25
y -= 25
data = 3 * np.exp(-0.7 * ((x+2)**2 + (y-1)**2))
Also, I have modified your code for plotting to increase the radius of the circle by 10:
circle = Circle((params[0], params[1]), 10 * params[2], ...)
and I commented out two more lines:
# image = image[0,0,:,:]
# data = data[0,0,:,:]
The result that I get is shown in the attached image and it looks reasonable to me:
Could it be that the issue is in how you access data from the FITS file? (e.g., image = image[0,0,:,:]) Are the data 4D array? Why do you have 4 indices?
I also saw that you have asked a similar question here: Astropy.model 2DGaussian issue in which you tried to use just astropy.modeling. I will look into that question.
NOTE: you can replace code such as
max_index = np.where(image >= np.max(image))
x0 = max_index[1] #Middle of X axis
y0 = max_index[0] #Middle of Y axis
with
y0, x0 = np.unravel_index(np.argmax(data), data.shape)

scikits learn SVM - 1-dimensional Separating Hyperplane

How to plot the separating "hyperplane" for 1-dimensional data using scikit svm ?
I follow this guide for 2-dimensional data : http://scikit-learn.org/stable/auto_examples/svm/plot_svm_margin.html, but don't know how to make it works for 1-dimensional data
pos = np.random.randn(20, 1) + 1
neg = np.random.randn(20, 1) - 1
X = np.r_[pos, neg]
Y = [0] * 20 + [1] * 20
clf = svm.SVC(kernel='linear', C=0.05)
clf.fit(X, Y)
# how to get "hyperplane" and margins values ??
thanks
The separating hyperplane for two-dimensional data is a line, whereas for one-dimensional data the hyperplane boils down to a point. The easiest way to plot the separating hyperplane for one-dimensional data is a bit of a hack: the data are made two-dimensional by adding a second feature which takes the value 0 for all the samples. By doing so, the second component of the weight vector is zero, i.e. w = [w0, 0] (see the appendix at the end of this post). As w1 = 0 and w1 is in the denominator of the expression that defines the slope and the y-intercept term of the separating line (see appendix), both coefficients are ∞. In this case it is convenient to solve the equation of the separating hyperplane for x, which results in x = x0 = -b/w0. The margin turns out to be 2/w0 (see appendix for details).
The following script implements this approach:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
np.random.seed(0)
# Pad the one-dimensional samples with a constant second feature (0) so that
# the usual two-dimensional plotting recipe can be reused.
pos = np.hstack((np.random.randn(20, 1) + 1, np.zeros((20, 1))))
neg = np.hstack((np.random.randn(20, 1) - 1, np.zeros((20, 1))))
X = np.r_[pos, neg]
Y = [0] * 20 + [1] * 20
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)
w = clf.coef_[0]
# Decision boundary: w0 * x + b = 0  =>  x = -b / w0
x_0 = -clf.intercept_[0]/w[0]
# The margin lines are where w0 * x + b = +/-1, i.e. at distance 1/|w0| from
# the boundary (the full margin width is 2/|w0|).
margin = 1 / abs(w[0])
plt.figure()
x_min, x_max = np.floor(X.min()), np.ceil(X.max())
y_min, y_max = -3, 3
yy = np.linspace(y_min, y_max)
XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
# The second feature is 0 for every grid point, as in the training data.
Z = clf.predict(np.c_[XX.ravel(), np.zeros(XX.size)]).reshape(XX.shape)
plt.pcolormesh(XX, YY, Z, cmap=plt.cm.Paired)
plt.plot(x_0*np.ones(shape=yy.shape), yy, 'k-')
plt.plot(x_0*np.ones(shape=yy.shape) - margin, yy, 'k--')
plt.plot(x_0*np.ones(shape=yy.shape) + margin, yy, 'k--')
plt.scatter(pos, np.zeros(shape=pos.shape), s=80, marker='o', facecolors='none')
plt.scatter(neg, np.zeros(shape=neg.shape), s=80, marker='^', facecolors='none')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.show()
Although the code above is self-explanatory, here are some tips. X has 40 rows and 2 columns: the values in the first column are random numbers, while all the elements of the second column are zeros. In the code, the weight vector w = [w0, 0] and the intercept b are clf.coef_[0] and clf.intercept_[0], respectively, where clf is the fitted sklearn.svm.SVC object.
And this is the plot you get when the script is run:
For the sake of clarity I'd suggest to tweak the code above by adding/subtracting a small constant to the second feature, for example:
plt.scatter(pos, .3 + np.zeros(shape=pos.shape), ...)
plt.scatter(neg, -.3 + np.zeros(shape=neg.shape), ...)
By doing so the visualization is significantly improved since the different classes are shown without overlap.
Appendix
The separating hyperplane is usually expressed as
$$\mathbf{w} \cdot \mathbf{x} + b = 0$$
where x is an n-dimensional vector, w is the weight vector and b is the bias or intercept. For n = 2 we have w0.x + w1.y + b = 0. After some algebra we obtain y = -(w0/w1).x + (-b/w1). It clearly emerges from this expression that the discriminant hyperplane in a 2D feature space is a line of equation y = a.x + y0, where the slope is given by a = -w0/w1 and the y-intercept term is y0 = -b/w1. In SVM, the margin of a separating hyperplane is 2/‖w‖, which for 2D reduces to
$$\frac{2}{\sqrt{w_0^2 + w_1^2}}$$
and, since w1 = 0 here, to 2/|w0|.
the .coef_ member of clf will return the "hyperplane," which, in one dimension, is just a point. Check out this post for info on how to plot points on a numberline.

Creating video of graph over time

I want to create a video of a graph as it evolves over time. I have tried stitching together PNG images of the graph, but it has 10,000 frames, which takes a VERY long time. I now want to try to use animate.FuncAnimation(), but I have been having a lot of trouble. Here is what I have so far:
def plot(fname, haveMLPY=False):
    """Render one 3D frame per time step and encode the frames into a movie.

    Each frame shows the scatter of (degrees, ksB, mp) together with a
    least-squares surface fitted to it.  Frames are written as temporary PNG
    files, joined into ``animation.mpg`` with ``mencoder`` and then deleted.

    Args:
        fname: Path of the ``.npz`` file providing ``X``, ``vipWeights`` and
            ``ksB``.
        haveMLPY: Unused; kept so that existing callers keep working.
    """
    # Load data from .npz file.
    data = np.load(fname)
    X = data["X"]
    weights = data["vipWeights"]
    degrees = weights.sum(1)
    ksB = data["ksB"]
    # filename for the name of the resulting movie
    filename = 'animation'
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (importing registers the '3d' projection)

    # Never run past the end of X; at most 10**4 frames as before.
    n_frames = min(10**4, X.shape[0])
    # Zero-pad the frame number widely enough that the alphabetical order of
    # the file names equals the frame order (3 digits break beyond 999).
    digits = max(3, len(str(max(n_frames - 1, 0))))

    # degrees and ksB do not change between frames, so the evaluation grid
    # and the design matrices of the surface fit are built only once.
    xy = np.c_[degrees, ksB]
    mn = np.min(xy, axis=0)
    mx = np.max(xy, axis=0)
    X_grid, Y_grid = np.meshgrid(np.linspace(mn[0], mx[0], 20), np.linspace(mn[1], mx[1], 20))
    XX = X_grid.flatten()
    YY = Y_grid.flatten()
    order = 2 # 1: linear, 2: quadratic
    if order == 1:
        # best-fit linear plane
        A = np.c_[xy[:, 0], xy[:, 1], np.ones(xy.shape[0])]
        G = np.c_[XX, YY, np.ones(XX.shape)]
    elif order == 2:
        # best-fit quadratic surface
        A = np.c_[np.ones(xy.shape[0]), xy, np.prod(xy, axis=1), xy**2]
        G = np.c_[np.ones(XX.shape), XX, YY, XX*YY, XX**2, YY**2]

    files = []
    for i in range(n_frames):
        mp = X[i, :, 0]
        # Fit the surface coefficients for this frame and evaluate on the grid.
        C, _, _, _ = scipy.linalg.lstsq(A, mp)
        Z = np.dot(G, C).reshape(X_grid.shape)
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.plot_surface(X_grid, Y_grid, Z, rstride=1, cstride=1, alpha=0.2)
        ax.scatter(degrees, ksB, mp)
        ax.set_xlabel('degrees')
        ax.set_ylabel('ksB')
        ax.set_zlabel('mp')
        # form a filename
        fname2 = '_tmp%0*d.png' % (digits, i)
        # save the frame, then release the figure so memory does not grow
        # with the number of frames
        fig.savefig(fname2)
        plt.close(fig)
        # append the filename to the list
        files.append(fname2)
    # call mencoder
    os.system("mencoder 'mf://_tmp*.png' -mf type=png:fps=10 -ovc lavc -lavcopts vcodec=wmv2 -oac copy -o " + filename + ".mpg")
    # cleanup
    for fname2 in files:
        os.remove(fname2)
All the code from
# Create best fit surface for data2
to
fig = plt.figure()
can be mostly ignored because it is just used to calculate the best fit plane for the data.
Basically, there are N neurons, each of which has three important properties I want to plot: degrees, ksB, and mp. Only mp changes with time. All the data for mp is stored in X. The format X[i, i, i] means X[time, neuron, data type]. Right now, I am looping through X[i,:,0] (mp is the 0th variable). Taking screenshots of all 10^4 images takes forever, and the axis for mp keeps changing.
Is there a way to speed this up (either using animation.FuncAnimation or some other thing) and also prevent the axis from shifting each frame?
Thanks!

Categories