Visualize 2D / 3D decision surface in SVM scikit-learn - python

I got an sklearn SVM classifier working. It simply classifies two options, 0 or 1,
from feature vectors, and it works fine.
I want to visualize the result with graphs.
The problem is that my feature vector is 512 items long, so it is hard to show on an x, y graph.
Is there any way to visualize the classification hyperplane for such a long feature vector (512 items)?

You cannot visualize the decision surface for that many features: the dimensionality is too high, and there is no way to draw an N-dimensional surface directly.
However, you can use 2 features and plot nice decision surfaces as follows.
I have also written an article about this here:
https://towardsdatascience.com/support-vector-machines-svm-clearly-explained-a-python-tutorial-for-classification-problems-29c539f3ad8?source=friends_link&sk=80f72ab272550d76a0cc3730d7c8af35
Case 1: 2D plot for 2 features and using the iris dataset
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features.
y = iris.target
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out
model = svm.SVC(kernel='linear')
clf = model.fit(X, y)
fig, ax = plt.subplots()
# title for the plots
title = ('Decision surface of linear SVC ')
# Set-up grid for plotting.
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)
plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
ax.set_ylabel('y label here')
ax.set_xlabel('x label here')
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(title)
ax.legend()
plt.show()
Case 2: 3D plot for 3 features and using the iris dataset
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from mpl_toolkits.mplot3d import Axes3D
iris = datasets.load_iris()
X = iris.data[:, :3] # we only take the first three features.
Y = iris.target
#make it binary classification problem
X = X[np.logical_or(Y==0,Y==1)]
Y = Y[np.logical_or(Y==0,Y==1)]
model = svm.SVC(kernel='linear')
clf = model.fit(X, Y)
# The separating plane is the set of points x with np.dot(clf.coef_[0], x) + clf.intercept_[0] = 0.
# Solve for the third coordinate, w3 (z):
z = lambda x,y: (-clf.intercept_[0]-clf.coef_[0][0]*x -clf.coef_[0][1]*y) / clf.coef_[0][2]
tmp = np.linspace(-5,5,30)
x,y = np.meshgrid(tmp,tmp)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot3D(X[Y==0,0], X[Y==0,1], X[Y==0,2],'ob')
ax.plot3D(X[Y==1,0], X[Y==1,1], X[Y==1,2],'sr')
ax.plot_surface(x, y, z(x,y))
ax.view_init(30, 60)
plt.show()
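If you still want a picture for your 512-dimensional vectors, one common workaround (a sketch under assumptions, not part of the answer above) is to project the data down to 2 dimensions first, e.g. with PCA, and plot the decision surface of a classifier refit on the projection. Here X is assumed to be your (n_samples, 512) feature matrix and y your 0/1 labels; note that the boundary you see belongs to the model refit in the 2D projection, not to the original 512-dimensional SVM.
from sklearn.svm import SVC
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
# X: (n_samples, 512) feature matrix, y: 0/1 labels -- both assumed to exist already
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X)                      # project 512 dims down to 2
clf_2d = SVC(kernel='linear').fit(X_2d, y)       # refit a classifier on the projection
xx, yy = np.meshgrid(np.linspace(X_2d[:, 0].min() - 1, X_2d[:, 0].max() + 1, 200),
                     np.linspace(X_2d[:, 1].min() - 1, X_2d[:, 1].max() + 1, 200))
Z = clf_2d.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
plt.show()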

Related

How to plot 3d scatter with QDA decision boundary?

Using generated data, I am trying to plot the 3D decision boundary of QDA in 3D space. I used scikit-learn to fit the QDA classifier, but I couldn't plot its 3D decision boundary.
Here is my code:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.datasets import make_classification
# Generate a random dataset for classification
X, y = make_classification(n_features=3, n_informative=2, n_redundant=0, n_repeated=0, random_state=0)
# Create and fit a QDA classifier
qda = QuadraticDiscriminantAnalysis()
qda.fit(X, y)
# Plot the decision boundary of the QDA classifier
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
z_min, z_max = X[:, 2].min() - 1, X[:, 2].max() + 1
xx, yy, zz = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1),
                         np.arange(z_min, z_max, 0.1))
X_grid = np.c_[xx.ravel(), yy.ravel(), zz.ravel()]
Z = qda.predict(X_grid)
Z = Z.reshape(xx.shape)
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.Paired)
ax.set_xlabel('X axis')
ax.set_ylabel('Y axis')
ax.set_zlabel('Z axis')
ax.plot_surface(xx, yy, zz, facecolors=plt.cm.Paired(Z), alpha=0.2)
plt.show()
The above code says zz must be 2D instead of 3D, but I don't really get why it has to be 2D.
At the end I want to see figure with scatter plot of binary classes and decision boundary of QDA on data points.
With respect to z: it also needs to be a 2D array, because Axes3D.plot_surface maps each element of a 2D z array onto the 2D grid defined by x and y.
Hence, when you build your own x, y and z, make sure you use numpy.meshgrid for x and y, and then define z = f(x, y) over that grid.
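If you want the actual QDA boundary as a surface, one way to get a 2D zz that plot_surface accepts (a sketch reusing qda, X and ax from the question; the grid sizes are arbitrary choices) is to scan along z above each (x, y) grid point and record where the decision function changes sign:
import numpy as np
# build a 2D array of boundary heights: for each (x, y), scan along z for the
# sign change of qda.decision_function; cells with no crossing stay NaN
gx, gy = np.meshgrid(np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 50),
                     np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 50))
z_scan = np.linspace(X[:, 2].min() - 1, X[:, 2].max() + 1, 200)
gz = np.full(gx.shape, np.nan)
for i in range(gx.shape[0]):
    for j in range(gx.shape[1]):
        pts = np.column_stack([np.full_like(z_scan, gx[i, j]),
                               np.full_like(z_scan, gy[i, j]),
                               z_scan])
        d = qda.decision_function(pts)             # signed score along the z direction
        cross = np.where(np.diff(np.sign(d)) != 0)[0]
        if cross.size:
            gz[i, j] = z_scan[cross[0]]            # first crossing height
ax.plot_surface(gx, gy, gz, alpha=0.3)             # gz is 2D, as plot_surface expects
Run this in place of the failing ax.plot_surface(xx, yy, zz, ...) line, before plt.show().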

plotting PCA and kmeans clustering output in scatter plot whilst colouring according to date with gradient legend in python matplotlib

I'd like to add a gradient legend based on the date of each point in the cluster. I've already applied PCA and clustered the data. I'd also like to know how to test the performance of the clustering.
This is what I have now [plot with no legend], and this is what I want [plot with a gradient legend]; one way to do the legend is sketched after the code and CSV sample below.
I'm trying to predict when wells will fail.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
ccData = pd.read_csv("/Users/frun/Desktop/flum/E-43.csv", index_col=0)
print("Rows :", ccData.shape[0])
print("Columns :", ccData.shape[1])
ccData.drop(ccData.columns[ccData.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True) # drop unnamed column
#Standardize and Normalize
scaler = StandardScaler()
scaled_ccData = scaler.fit_transform(ccData)
norm_ccData = normalize(scaled_ccData)
#PCA
pca = PCA()
pca.fit(norm_ccData)
n_components = 2
pca_final = PCA(n_components = n_components)
pca_final.fit(norm_ccData)
pca_ccData = pca_final.fit_transform(norm_ccData)
#Kmeans Clustering
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(pca_ccData)
plt.scatter(pca_ccData[:, 0], pca_ccData[:, 1],
            c=KMeans(n_clusters=3).fit_predict(pca_ccData),
            cmap=plt.cm.summer)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
h = .01
x_min, x_max = pca_ccData[:,0].min() - 1, pca_ccData[:,0].max() + 1
y_min, y_max = pca_ccData[:,1].min() - 1, pca_ccData[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = kmeans.predict(np.array(list(zip(xx.ravel(), yy.ravel()))))
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.summer,
           aspect='auto', origin='lower')
plt.plot(pca_ccData[:,0], pca_ccData[:,1], 'k.', markersize=2)
A bit of the CSV file:
Date ,Frequency ,Avg VSD Current ,Drive Volts ,Intake Pressure ,Discharge Pressure,Intake Temperature ,Motor Temperature ,Vibration x ,Vibration y ,SYS Current,
11/13/2015,42,114,338,791.1,961,46.46,32.37,0.077,0.065,11.851,
11/13/2015,42,113,339,791.3,949.2,46.43,32.37,0.063,0.066,11.847,
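There is no answer posted here, but a minimal sketch of one way to get the gradient legend: colour the PCA scatter by the (numeric) date and attach a colorbar with date ticks. This assumes pca_ccData and ccData from the code above exist and that the Date column (used as the index via index_col=0) parses with pandas.to_datetime:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
dates = pd.to_datetime(ccData.index)           # the CSV's Date column is the index
date_nums = mdates.date2num(dates)             # dates as floats, usable as colour values
sc = plt.scatter(pca_ccData[:, 0], pca_ccData[:, 1], c=date_nums, cmap='viridis', s=15)
cbar = plt.colorbar(sc)                        # the gradient legend
loc = mdates.AutoDateLocator()
cbar.ax.yaxis.set_major_locator(loc)
cbar.ax.yaxis.set_major_formatter(mdates.ConciseDateFormatter(loc))
cbar.set_label('Date')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()
For the performance question, a common check for K-means is sklearn.metrics.silhouette_score(pca_ccData, kmeans.labels_), which is closer to 1 when clusters are well separated.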

Plotting classification area based on logistic regression

Let's consider data following :
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features.
y = iris.target
I want to fit a logistic regression on that data set and then create a plot which shows the classification area. So I used:
model = LogisticRegression(solver='liblinear', random_state=0)
est=model.fit(X, y)
plt.scatter(X[:, 0], X[:, 1], c=est.predict(X))
plt.show()
But how can I make it look like the example below?
Edit
I created the plot below, but I still don't know how to change specific classes to squares or x's, or how to create a legend. Do you maybe know how it can be done? I know I have to do something with marker='s' and marker='x', but that changes the look of the whole plot, and I only want to change specific classes (one way to do this is sketched after the code below).
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features.
Y = iris.target
logreg = LogisticRegression(C=1e5)
# Create an instance of Logistic Regression Classifier and fit the data.
logreg.fit(X, Y)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
h = .02 # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(4, 3))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
plt.show()
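No answer is posted for this one either, but here is a minimal sketch of the marker/legend part: plot each class with its own scatter call, so each class gets its own marker and its own legend entry. It reuses iris, X, Y, xx, yy and Z from the code above; the marker and colour choices are arbitrary:
import matplotlib.pyplot as plt
plt.figure(figsize=(4, 3))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)  # classification areas, as before
markers = ['o', 's', 'x']                      # circle, square, x -- one per class
colors = ['tab:blue', 'tab:orange', 'tab:green']
for cls in range(3):
    mask = (Y == cls)
    plt.scatter(X[mask, 0], X[mask, 1], marker=markers[cls], c=colors[cls],
                label=iris.target_names[cls])
plt.legend()
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.show()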

How can I visualize border/decision function of two classes using scikit-learn

I am pretty new to machine learning, so I still don't understand how I can visualize the border between 2 classes in the bag-of-words case.
I found the following example to plot data:
plot a document tfidf 2D graph
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=['alt.atheism', 'sci.space'])
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])
X = pipeline.fit_transform(newsgroups_train.data).todense()
pca = PCA(n_components=2).fit(X)
data2D = pca.transform(X)
plt.scatter(data2D[:,0], data2D[:,1], c=newsgroups_train.target)
plt.show()
In my project I use SVC estimator
clf = SVC(random_state=241, kernel = 'linear')
clf.fit(X,newsgroups_train.target)
I have tried to use the example
http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html
but it didn't work in the text classification case.
So how can I add the border between the two classes to this plot?
Thank you!
The problem is that you need to select only 2 features in order to create the 2-dimensional decision surface plot. I will provide 2 examples: the first uses the iris data and the second uses your data.
I have also written an article about this here:
https://towardsdatascience.com/support-vector-machines-svm-clearly-explained-a-python-tutorial-for-classification-problems-29c539f3ad8?source=friends_link&sk=80f72ab272550d76a0cc3730d7c8af35
In both cases, I select only 2 features in order to create the plot.
Example 1 using iris data:
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features.
y = iris.target
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out
model = svm.SVC(kernel='linear')
clf = model.fit(X, y)
fig, ax = plt.subplots()
# title for the plots
title = ('Decision surface of linear SVC ')
# Set-up grid for plotting.
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)
plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
ax.set_ylabel('y label here')
ax.set_xlabel('x label here')
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(title)
ax.legend()
plt.show()
RESULTS
Example 2 using your data:
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=['alt.atheism', 'sci.space'])
pipeline = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer())])
X = pipeline.fit_transform(newsgroups_train.data).todense()
# Select ONLY 2 features
X = np.array(X)
X = X[:, [0,1]]
y = newsgroups_train.target
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out
model = svm.SVC(kernel='linear')
clf = model.fit(X, y)
fig, ax = plt.subplots()
# title for the plots
title = ('Decision surface of linear SVC ')
# Set-up grid for plotting.
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)
plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
ax.set_ylabel('y label here')
ax.set_xlabel('x label here')
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(title)
ax.legend()
plt.show()
RESULTS
Important note:
In the second case the plot is not nice, since we picked only 2 features at random to create it. One way to make it nicer: use a univariate ranking method (e.g. the ANOVA F-value test) to find the best 2 of the 22464 features you start with, and then build the separating-surface plot from those top-2 features, as in the sketch below.
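A sketch of that ranking step with scikit-learn's SelectKBest (it reuses pipeline, newsgroups_train and y from Example 2; X_full here is the full tf-idf matrix, i.e. before the two-column slice):
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
X_full = np.asarray(pipeline.fit_transform(newsgroups_train.data).todense())
selector = SelectKBest(f_classif, k=2)          # rank columns by ANOVA F-value, keep top 2
X_top2 = selector.fit_transform(X_full, y)      # shape (n_samples, 2)
print(selector.get_support(indices=True))       # indices of the two selected features
# X_top2 can now replace X in the plotting code of Example 2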

scikit-learn: How to use the fitted probability model?

So I have used scikit-learn's Gaussian mixture models (http://scikit-learn.org/stable/modules/mixture.html) to fit my data, and now I want to use the fitted model. Specifically:
How can I plot the probability density distribution?
How can I calculate the mean squared error of the fitted model?
Here is the code you may need:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from sklearn import mixture
import matplotlib as mpl
from matplotlib.patches import Ellipse
%matplotlib inline
n_samples = 300
# generate random sample, two components
np.random.seed(0)
shifted_gaussian = np.random.randn(n_samples, 2) + np.array([20, 5])
sample= shifted_gaussian
# fit a Gaussian Mixture Model with two components
clf = mixture.GMM(n_components=2, covariance_type='full')
clf.fit(sample)
# plot sample scatter
plt.scatter(sample[:, 0], sample[:, 1])
# 1. Plot the probability density distribution
# 2. Calculate the mean squared error of the fitted model
UPDATE:
I can plot the distribution by:
x = np.linspace(-20.0, 30.0)
y = np.linspace(-20.0, 40.0)
X, Y = np.meshgrid(x, y)
XX = np.array([X.ravel(), Y.ravel()]).T
Z = -clf.score_samples(XX)[0]
Z = Z.reshape(X.shape)
CS = plt.contour(X, Y, Z, norm=LogNorm(vmin=1.0, vmax=1000.0),
                 levels=np.logspace(0, 3, 10))
CB = plt.colorbar(CS, shrink=0.8, extend='both')
But isn't it quite strange? Is there a better way to do it? Can I plot something like this?
I think the result is reasonable, if you adjust the xlim and ylim a little bit:
# plot sample scatter
plt.scatter(sample[:, 0], sample[:, 1], marker='+', alpha=0.5)
# 1. Plot the probability density distribution
# 2. Calculate the mean squared error of the fitted model
x = np.linspace(-20.0, 30.0, 100)
y = np.linspace(-20.0, 40.0, 100)
X, Y = np.meshgrid(x, y)
XX = np.array([X.ravel(), Y.ravel()]).T
Z = -clf.score_samples(XX)[0]
Z = Z.reshape(X.shape)
CS = plt.contour(X, Y, Z, norm=LogNorm(vmin=1.0, vmax=10.0),
                 levels=np.logspace(0, 1, 10))
CB = plt.colorbar(CS, shrink=0.8, extend='both')
plt.xlim((10,30))
plt.ylim((-5, 15))
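On the second part of the question (a quantitative measure of fit): a density model has no natural mean squared error, so the usual numbers are the average log-likelihood and the AIC/BIC. A sketch using the current API (sklearn.mixture.GaussianMixture, which replaced GMM), reusing sample from the question:
from sklearn.mixture import GaussianMixture
gm = GaussianMixture(n_components=2, covariance_type='full').fit(sample)
print('average log-likelihood:', gm.score(sample))
print('AIC:', gm.aic(sample))
print('BIC:', gm.bic(sample))
# lower AIC/BIC (or higher log-likelihood) across candidate n_components indicates a better fit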
