How to fix my decision boundary line? (sklearn and Python)

I'm currently working on a little project with sklearn and Python, and I'm wondering why my decision boundary line is broken. My dataset is mostly measurements of a strain gauge, the associated temperature, and whether the measurement comes from a "faulty" sensor.
# coding=utf-8
# Libraries
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier as KN
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sn
from Model import LogisticRegressionUsingGD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
url = "measurementsStrainGauge3.csv"
columnsHead = ['µm', 'tmp','fault']
dataset = pd.read_csv(url, names=columnsHead)
dataset.head()
# X = feature values, all the columns except the last column
X = dataset.iloc[:, :-1]
# y = target values, last column of the data frame
y = dataset.iloc[:, -1]
#Filtering data
faultyData = dataset.loc[y == 1]
notFaultyData = dataset.loc[y == 0]
#Model building
X = np.c_[np.ones((X.shape[0], 1)), X]
y = y[:, np.newaxis]
theta = np.zeros((X.shape[1], 1))
model = LogisticRegression()
model.fit(X, y.ravel())
predicted_classes = model.predict(X)
accuracy = accuracy_score(y.flatten(),predicted_classes)
parameters = model.coef_
params = parameters.flatten()
print(params)
print("Precision : %", accuracy)
x_values = [np.min(X[:, 1] - 5), np.max(X[:, 2] + 5)]
y_values = - (params[0] + np.dot(params[1], x_values)) / params[2]
# Plots
plt.plot(x_values, y_values, label=u'Ligne de décision')
plt.scatter(faultyData.iloc[:, 0], faultyData.iloc[:, 1],
            s=10, label='Faute')
plt.scatter(notFaultyData.iloc[:, 0], notFaultyData.iloc[:, 1],
            s=10, label='Non faute')
plt.xlabel(u'Déformation (µ/m)')
plt.ylabel(u'Température (C°)')
plt.legend()
plt.show()
Edit: Here is the data I use (1 is "faulty" and 0 is "non faulty"):
6973,15.02,0
3017,41.75,0
5900,61.35,1
8610,63.57,1
5405,44.42,0
3965,-5.13,0
3079,12.64,0
4562,13.09,0
4185,46.78,0
6734,34.73,0
5711,-7.34,0
5006,25.04,0
7614,51.4,1
3265,27.81,0
7218,60.65,1
5852,35.75,0
7880,46.89,0
7819,11.53,0
4775,2.16,0
5128,-14.42,0
6385,-7.32,0
3511,17.18,0
6303,28.88,0
3476,29.81,0
6285,61.21,1
3437,-2.2,0
8914,66.67,1
6306,67.7,1
3327,36.57,0
7842,-16.59,0
7336,67.02,1
4949,57,1
4036,66.4,1
3644,-0.57,0
6082,13.8,0
8044,65.51,1
7659,52.96,1
3319,40.44,0
7928,8.28,0
6812,35.83,0
7080,70.66,1
6876,79.59,1
7826,27.75,0
4514,69,1
5885,-18.39,0
4063,77.65,1
6827,-7.36,0
5085,50.1,1
7353,71.37,1
8878,11.08,0
4385,48.06,0
4204,27.01,0
6614,15.66,0
3379,-12.1,0
8312,-13.57,0
5565,21.29,0
3670,-18.79,0
4152,31.22,0
5448,-17.83,0
3081,32.11,0
8674,32.2,0
4224,21.73,0
7701,63.21,1
8984,18.09,0
6266,5.5,0
8223,32.91,0
3709,76.47,0
4888,-5.16,0
4824,-1.02,0
8579,4.81,0
8588,48.98,0
7805,73.59,1
3859,-1.31,0
4666,43.92,0
3473,-7.51,0
4301,-12.26,0
6421,65.2,1
8345,35.49,0
5840,45.75,0
4702,-1.85,0
6538,7.98,0
3217,44.56,0
6450,70.51,1
3444,12.54,0
5220,-13.33,0
8724,-16.96,0
6043,73.71,1
3187,23.54,0
6696,6.83,0
7928,34.15,0
3013,36.46,0
7376,76.77,1
7752,22.78,0
7328,-14.24,0
6690,71.65,1
6253,-1.57,0
4238,60.1,1
6569,33.7,0
6213,13.37,0
4075,48.68,0
7964,16.1,0
7810,65.45,1
6350,25.03,0
6275,61.15,1
6883,56.02,1
3622,2.82,0
4570,0.04,0
6514,37.81,0
3999,-19.13,0
5082,-6.88,0
6987,25.56,0
5706,42.42,0
5474,28.61,0
5932,4.84,0
4110,-2.27,0
7662,0.89,0
8851,-5.14,0
4370,58.47,1
5541,40.52,0
5408,11.39,0
7986,76.91,1
7124,79.9,1
3654,22.37,0
8165,2.77,0
8452,32.72,0
8849,49.46,0
8517,3.56,0
6027,2.8,0
8405,26.44,0
8313,76.85,1
3545,59.98,0
4033,77.04,1
3083,61.34,0
3041,47.35,0
4901,5.1,0
8225,0.49,0
8525,36.75,0
8402,-4.46,0
6794,36.73,0
6317,79.12,1
4961,18.47,0
5790,11.45,0
6661,-16.26,0
6211,45.59,0
4277,43.98,0
3116,-19.83,0
3971,34.46,0
5417,39.99,0
8881,73.96,1
7119,-12.92,0
7011,48.87,0
6932,31.42,0
4118,32.2,0
4412,70.49,1
5908,20.69,0
5367,3.74,0
7461,24.85,0
5154,26.32,0
6019,46.53,0
4566,-19.92,0
5633,48.09,0
6558,50.27,1
7257,-10.97,0
3896,74.1,0
8084,-5.84,0
3163,40.61,0
3983,45.91,0
4684,23.51,0
5147,75.9,1
6120,72.83,1
8039,63.16,1
6498,-1.05,0
3332,54.26,0
7504,52.7,1
3477,79.28,0
5549,13.41,0
6377,75.99,1
5114,19.59,0
8631,-3.75,0
4806,12.49,0
4923,6.8,0
8470,14.24,0
8032,-12.38,0
5387,-11.47,0
3330,21.95,0
3716,16.77,0
8085,39.17,0
3869,5.53,0
6466,71.76,1
6988,31.83,0
4922,10.24,0
8340,-9.13,0
4136,62.2,1
3747,45.66,0
5042,32.84,0
8492,14.71,0
6282,37.44,0
8732,36.03,0
7694,62.94,1
6814,67.12,1
6757,-2.81,0
5299,8.04,0
5733,71.57,1
3282,61.78,0
7036,53.86,1
3740,47.41,0
4021,53.49,1
5853,-2.98,0
7212,50.47,1
7237,21.88,0
5048,76.42,1
5289,-18.42,0
6370,40.66,0
5922,-0.84,0
4287,40.22,0
3039,50.98,0
7127,68.39,1
7718,45.12,0
5731,75.06,1
7578,76.26,1
7934,18.88,0
3404,72.66,0
8704,-3.06,0
8933,77.09,1
3789,6.55,0
4859,12.35,0
5283,32.99,0
4998,-4.25,0
6613,-1.29,0
5432,23.25,0
7086,17.65,0
4057,-2.48,0
4436,-4.3,0
8527,31.34,0
6375,63.06,1
7101,-13.35,0
5043,30.15,0
7747,29.09,0
4056,30.35,0
8823,21.67,0
4860,48.11,0
3699,69.05,0
4808,69.35,1
6619,25.9,0
4098,3.9,0
8463,73.25,1
5328,41.71,0
5073,68.73,1
4063,49.4,0
3353,29.46,0
6205,21.64,0
7663,5.2,0
6336,28.68,0
6559,64.37,1
5606,29.07,0
4768,5.83,0
5040,8.76,0
7409,36.27,0
7438,56.12,1
8719,42.81,0
3859,5.62,0
5280,-10.07,0
7795,-7.19,0
3874,-17.21,0
3356,6.77,0
3642,19.1,0
3619,65.96,0
5938,5.05,0
7545,65.69,1
5440,36.21,0
7870,30.08,0
3159,20.17,0
8689,44.11,0
5367,76.86,1
8470,-5.38,0
3394,76.58,0
8644,58.69,1
6883,0.8,0
8900,34.32,0
6060,-11.32,0
6081,45.06,0
5936,-8.27,0
3523,47.16,0
6247,77.33,1
4984,31.52,0
4176,21.07,0
3317,36.41,0
8621,10.17,0
6562,1.93,0
5837,8.01,0
5336,64.17,1
6620,44.64,0
5312,59.82,1
6323,11.16,0
7213,55.46,1
6894,30.54,0
7062,40.89,0
6575,36.44,0
3679,77.68,0
6566,29.49,0
7351,-6.37,0
5227,14.63,0
5461,0.9,0
7577,-18.63,0
4630,18.04,0
5132,37.62,0
8925,-17.93,0
8626,62.48,1
6980,21.47,0
8169,72.86,1
5566,63.81,1
7655,37.05,0
7134,-18.12,0
5795,26.67,0
6392,64.86,1
3324,-0.46,0
4810,22.8,0
8712,67.22,1
3803,62.02,0
4065,23.9,0
4695,59.94,1
7620,57.72,1
6799,67.89,1
5147,30.54,0
4629,-14.92,0
3560,-17.5,0
8586,54.64,1
3822,45.33,0
5930,-14.71,0
7754,41.33,0
3547,23.34,0
4163,32.52,0
8550,63.04,1
7552,-1.77,0
7803,-0.39,0
3628,45.4,0
6413,-17.97,0
6258,-14.1,0
7000,-16.14,0
8570,-2.87,0
3395,16.93,0
4259,41.77,0
8980,63.7,1
7635,58.79,1
3271,-5.45,0
3743,-4.47,0
3847,20.11,0
8649,26.46,0
4804,22.25,0
8054,68.84,1
5955,50.28,1
4421,13.44,0
8391,22.63,0
6611,27.72,0
4832,37.76,0
4960,9.2,0
6035,-8.52,0
6136,75.5,1
8702,52.76,1
4351,49.14,0
4085,5.4,0
7357,-11.35,0
5080,25.12,0
5243,79.92,1
6144,36.6,0
4686,27.78,0
4740,77.34,1
8634,22.09,0
3611,38.18,0
5529,13.2,0
3044,2.07,0
5618,1.39,0
3534,5.96,0
3281,21.92,0
6296,-4.04,0
6422,53.66,1
4770,36.74,0
5285,38.3,0
3466,-0.31,0
8347,78.31,1
4789,44.55,0
8260,-4.02,0
8314,8.51,0
4146,2.78,0
8530,-14.13,0
4529,71.55,1
7826,21.49,0
5980,72.18,1
7218,-1.31,0
5861,19.5,0
5662,50.07,1
6087,56.6,1
8219,66.81,1
7180,1.24,0
6594,54.13,1
8408,70.9,1
3766,-0.97,0
3113,35.67,0
7871,71.23,1
4898,-8.25,0

I found a couple of issues in your code.
I couldn't understand why you are # Filtering the data or what exactly you are trying to do in # Model building: you end up modifying the data (adding a bias column to X and reshaping y) before fitting.
When it comes to the plot, you are drawing a line between two x coordinates whose y coordinates are generated from the model's coefficients. This will not work as intended, because the two plot axes are already taken by the two features (deformation and temperature) and the model predicts the class a point belongs to.
The right way to visualize a classification task like this is to plot the data together with the decision surface of the model.
I have fixed your code to generate such a graph; it's a basic implementation of the complete method.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

url = "measurementsStrainGauge3.csv"
columnsHead = ['µm', 'tmp', 'fault']
dataset = pd.read_csv(url, names=columnsHead)
print(dataset.head())
# X = feature values, all the columns except the last column
X = dataset.iloc[:, :-1].values
# y = target values, last column of the data frame
y = dataset.iloc[:, -1]
model = LogisticRegression()
model.fit(X, y)
# Creating the mesh (step of 100 on the µm axis, 1 on the temperature axis,
# to match the very different feature scales)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 100),
                     np.arange(y_min, y_max, 1))
# Plotting decision boundary
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure()
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.title("Decision surface of LogisticRegression")
plt.axis('tight')
colors = "br"
for i, color in zip(model.classes_, colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1], c=color, cmap=plt.cm.Paired,
                edgecolor='black', s=20)
plt.show()
Resulting plot
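For what it's worth, the broken line in your original plot comes from two things: you fit the model on X after manually adding a bias column (while LogisticRegression already fits its own intercept), and you build x_values by mixing the two features (np.min(X[:, 1] - 5) but np.max(X[:, 2] + 5)). If you still want a single straight decision line, you can fit on the two raw features and derive the line from intercept_ and coef_. A minimal sketch, assuming the same CSV as above (not a drop-in patch of your script):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

dataset = pd.read_csv("measurementsStrainGauge3.csv", names=['µm', 'tmp', 'fault'])
X = dataset.iloc[:, :-1].values  # deformation, temperature
y = dataset.iloc[:, -1].values

model = LogisticRegression()
model.fit(X, y)

# For a linear classifier on 2 features the boundary is
# intercept_ + w0 * x + w1 * y = 0  =>  y = -(intercept_ + w0 * x) / w1
w0, w1 = model.coef_[0]
b = model.intercept_[0]
x_vals = np.array([X[:, 0].min(), X[:, 0].max()])
y_vals = -(b + w0 * x_vals) / w1

plt.scatter(X[y == 1, 0], X[y == 1, 1], s=10, label='Faulty')
plt.scatter(X[y == 0, 0], X[y == 0, 1], s=10, label='Not faulty')
plt.plot(x_vals, y_vals, label='Decision line')
plt.xlabel('Deformation (µm)')
plt.ylabel('Temperature (°C)')
plt.legend()
plt.show()
This straight line only exists because logistic regression is a linear classifier; the contourf approach above works for any classifier.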

Related

selecting data points neighbourhood to support vectors

I have been thinking of this but am not sure how to do it. I have binary imbalanced data and would like to use an SVM to select just a subset of the majority-class data points nearest to the support vector. Thereafter, I can fit a binary classifier on this "balanced" data.
To illustrate what I mean, an MWE:
# packages import
from collections import Counter
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
import seaborn as sns
# sample data
X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.9], flip_y=0, random_state=1)
# class distribution summary
print(Counter(y))
# Counter({0: 91, 1: 9})
# fit svm model
svc_model = SVC(kernel='linear', random_state=32)
svc_model.fit(X, y)
plt.figure(figsize=(10, 8))
# Plotting our two-features-space
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, s=50)
# Constructing a hyperplane using a formula.
w = svc_model.coef_[0] # w consists of 2 elements
b = svc_model.intercept_[0] # b consists of 1 element
x_points = np.linspace(-1, 1) # generating x-points from -1 to 1
y_points = -(w[0] / w[1]) * x_points - b / w[1] # getting corresponding y-points
# Plotting a red hyperplane
plt.plot(x_points, y_points, c='r')
The two classes are well separated by the hyperplane. We can see the support vectors for both classes (even better for class 1).
Since the minority class 1 has only 9 data points, I want to down-sample class 0 by selecting its support vector and the 8 other data points nearest to it, so that the class distribution becomes {0: 9, 1: 9}, ignoring all other data points of class 0. I will then use this to fit a binary classifier like LR (or even SVC).
My question is how to select those data points of class 0 nearest to the class 0 support vector, in a way that ends up balanced with the data points of minority class 1.
This can be achieved as follows: Get the support vector for class 0, (sv0), iterate over all data points in class 0 (X[y == 0]), compute the distances (d) to the point represented by the support vector, sort them, take the 9 with the smallest values, and concatenate them with the points of class 1 to create the downsampled data (X_ds, y_ds).
sv0 = svc_model.support_vectors_[0]
distances = []
for i, x in enumerate(X[y == 0]):
    d = np.linalg.norm(sv0 - x)
    distances.append((i, d))
distances.sort(key=lambda tup: tup[1])
index = [i for i, d in distances][:9]
X_ds = np.concatenate((X[y == 0][index], X[y == 1]))
y_ds = np.concatenate((y[y == 0][index], y[y == 1]))
plt.plot(x_points[19:-29], y_points[19:-29], c='r')
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, s=50)
plt.scatter(X_ds[y_ds == 0][:, 0], X_ds[y_ds == 0][:, 1], color='yellow', alpha=0.4)
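As a side note, the distance computation can be vectorized so the explicit loop is not needed. A minimal sketch, reusing the svc_model, X and y from above (and, like the answer, assuming support_vectors_[0] belongs to class 0):
import numpy as np

sv0 = svc_model.support_vectors_[0]
X0 = X[y == 0]

# Euclidean distance of every class-0 point to the support vector
d = np.linalg.norm(X0 - sv0, axis=1)

# indices of the 9 class-0 points closest to the support vector
index = np.argsort(d)[:9]

X_ds = np.concatenate((X0[index], X[y == 1]))
y_ds = np.concatenate((y[y == 0][index], y[y == 1]))
This gives the same X_ds / y_ds as the loop-based version.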

How to plot the decision boundary of a One Class SVM?

I am having some trouble plotting the results from a one-class SVM that I have programmed. I have tried different examples found on the web, but with no good results at all. I have the following small dataset, where id is the identification of a sample and f1 to f9 are certain features:
id,f1,f2,f3,f4,f5,f6,f7,f8,f9
d1,0,0,0,0,0,0,0,0.045454545,0
d2,0.047619048,0,0,0.047619048,0,0.047619048,0,0.047619048,0.047619048
d3,0,0,0,0.045454545,0,0,0,0,0
d4,0,0.045454545,0,0.045454545,0,0,0,0.045454545,0.045454545
d5,0,0,0,0,0,0,0,0,0
d6,0,0.045454545,0,0,0,0,0,0.045454545,0
d7,0,0,0,0,0,0,0.045454545,0,0
d8,0,0,0,0.045454545,0,0,0,0,0
d9,0,0,0,0.045454545,0,0,0,0,0
d10,0,0,0,0.045454545,0,0,0,0,0
d11,0,0,0,0.045454545,0,0,0,0,0
d12,0.045454545,0,0,0.045454545,0.045454545,0.045454545,0,0.045454545,0
d13,0,0,0,0.045454545,0,0,0,0.045454545,0.045454545
d14,0,0,0,0.045454545,0.045454545,0,0,0,0
d15,0,0,0,0,0,0,0,0.047619048,0.047619048
d16,0,0,0,0,0,0,0,0.045454545,0
d17,0,0,0.045454545,0,0,0,0,0,0.045454545
d18,0,0,0,0,0,0,0,0,0
d19,0.045454545,0,0.090909091,0,0,0,0.090909091,0,0
d20,0,0,0,0.090909091,0,0,0.045454545,0.045454545,0.045454545
d21,0,0,0.045454545,0.045454545,0,0.045454545,0.045454545,0,0
d22,0,0.090909091,0,0,0,0.045454545,0,0,0.045454545
d23,0,0.047619048,0,0.047619048,0,0,0,0.047619048,0.095238095
d24,0,0,0,0,0,0.045454545,0.045454545,0.045454545,0
d25,0,0,0,0,0,0,0,0.043478261,0
d26,0,0,0,0,0.043478261,0,0.043478261,0.043478261,0
d27,0.043478261,0,0,0.043478261,0,0,0.043478261,0.043478261,0
My code is the following:
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn import preprocessing
listDrop=['id']
df1=df.drop(listDrop,axis="columns")
colNames=list(df1.columns.values)
min_max_scaler=preprocessing.MinMaxScaler()
x_scaled=min_max_scaler.fit_transform(df1)
df1[colNames]=x_scaled
svm = OneClassSVM(kernel='rbf', nu=0.2, gamma=1e-04)
svm.fit(df1)
pred=svm.predict(df1)
listA=[i+1 for i,x in enumerate(pred) if x == -1]
listB=[i+1 for i,x in enumerate(pred) if x == 1]
xx, yy = np.meshgrid(np.linspace(-5, 5, 1), np.linspace(-5, 5, 7500))
Xpred=np.array([xx.ravel(),yy.ravel()]+ [np.repeat(0, xx.ravel().size) for _ in range(7)]).T
Z = svm.decision_function(Xpred).reshape(xx.shape)
assert len(Z) == (len(xx) * len(yy))
Z = np.array(Z)
Z = Z.reshape(xx.shape)((len(xx), len(yy)))
a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.Blues_r)
b1 = plt.scatter(pred[:, 0], pred[:, 1], c='red')
b3 = plt.scatter(listB[:,0], listB[:, 1], c="green")
plt.legend([a.collections[0], b1, b3],
           ["learned frontier", "test", "outliers"],
           loc="lower right",
           prop=matplotlib.font_manager.FontProperties(size=11))
I would like to get a plot like the following:
I found this code on the web, and I was playing with the following lines:
Xpred=np.array([xx.ravel(),yy.ravel()]+ [np.repeat(0, xx.ravel().size) for _ in range(7)]).T
This is because it was throwing an error about the dimensions, and I read that, since it is a 2D plot and I have 9 features, I should fill the remaining ones with some constant data.
I also added the assert, but it fails with an error:
assert len(Z) == (len(xx) * len(yy))
AssertionError
How can I plot the results from this one-class SVM? It only returns an array composed of 1 and -1, like the following:
[ 1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 1 -1 1 1 -1 -1 -1 -1 -1 -1 -1 -1
1 -1 -1]
The standard approach is to use t-SNE to reduce the dimensionality of the data for visualization purposes. Once you have reduced the data to two dimensions you can easily replicate the visualization in the scikit-learn tutorial, see the code below for an example.
import pandas as pd
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# load the data
df = pd.read_csv('data.csv')
x = df.drop(labels='id', axis=1).values
# rescale the data
x_scaled = MinMaxScaler().fit_transform(x)
# reduce the data to 2 dimensions using t-SNE
x_reduced = TSNE(n_components=2, random_state=0).fit_transform(x_scaled)
# fit the model to the reduced data
svm = OneClassSVM(kernel='rbf', nu=0.2, gamma=1e-04)
svm.fit(x_reduced)
# extract the model predictions
x_predicted = svm.predict(x_reduced)
# define the meshgrid
x_min, x_max = x_reduced[:, 0].min() - 5, x_reduced[:, 0].max() + 5
y_min, y_max = x_reduced[:, 1].min() - 5, x_reduced[:, 1].max() + 5
x_ = np.linspace(x_min, x_max, 500)
y_ = np.linspace(y_min, y_max, 500)
xx, yy = np.meshgrid(x_, y_)
# evaluate the decision function on the meshgrid
z = svm.decision_function(np.c_[xx.ravel(), yy.ravel()])
z = z.reshape(xx.shape)
# plot the decision function and the reduced data
plt.contourf(xx, yy, z, cmap=plt.cm.PuBu)
a = plt.contour(xx, yy, z, levels=[0], linewidths=2, colors='darkred')
b = plt.scatter(x_reduced[x_predicted == 1, 0], x_reduced[x_predicted == 1, 1], c='white', edgecolors='k')
c = plt.scatter(x_reduced[x_predicted == -1, 0], x_reduced[x_predicted == -1, 1], c='gold', edgecolors='k')
plt.legend([a.collections[0], b, c], ['learned frontier', 'regular observations', 'abnormal observations'], bbox_to_anchor=(1.05, 1))
plt.axis('tight')
plt.show()
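Alternatively, if you want to keep the model trained on all nine features and only take a two-dimensional slice of its decision function (the approach you started with), the meshgrid needs a sensible number of points in both directions (your np.linspace(-5, 5, 1) produces only one x value), the padding columns must bring each grid point up to the model's nine input dimensions, and the decision function output just needs one reshape to xx.shape (the assert and the double reshape are not needed). A minimal sketch, assuming the scaled frame df1 and the fitted svm from your code, varying the first two features and holding the other seven at 0:
import numpy as np
import matplotlib.pyplot as plt

# 200 x 200 grid over the first two (MinMax-scaled) features
xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 200), np.linspace(-0.1, 1.1, 200))

# pad the remaining 7 features with zeros so each grid point has 9 columns
grid = np.c_[xx.ravel(), yy.ravel(), np.zeros((xx.size, 7))]
Z = svm.decision_function(grid).reshape(xx.shape)

plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.Blues_r)
plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
plt.scatter(df1.iloc[:, 0], df1.iloc[:, 1], c='white', edgecolors='k', s=20)
plt.xlabel('f1 (scaled)')
plt.ylabel('f2 (scaled)')
plt.show()
Keep in mind that such a slice only shows the decision function along two features with the others held fixed, which is why the t-SNE projection above is usually the more informative visualization.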

Plot the decision surface of a classification decision tree with 3 features on a 2D plot

My problem is that I have 3 features, but I only want to plot a 2D graph, using 2 features at a time and showing all the possible combinations.
The problem is that I did classifier.fit(X_train, Y_train), so it expects to be trained with 3 features, not just 2. X_train has shape (70, 3), which is (n_samples, n_features).
So far I have tweaked the original code to add z_min and z_max, since I need this third feature to be able to call classifier.predict().
The error I get at the plt.contourf instruction is Input z must be a 2D array.
import matplotlib as pl
import matplotlib.colors as colors
import matplotlib.cm as cmx
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
z_min, z_max = X_train[:, 2].min() - 1, X_train[:, 2].max() + 1
xx, yy, zz = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1),
                         np.arange(z_min, z_max, 0.1))
fig, ax = plt.subplots()
# here "model" is your model's prediction (classification) function
Z = classifier.predict(np.c_[np.c_[xx.ravel(), yy.ravel()], zz.ravel()])
# Put the result into a color plot
Z = Z.reshape(len(Z.shape), 2)
plt.contourf(xx, yy, Z, cmap=pl.cm.Paired)
plt.axis('off')
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired)
print(Z.shape)   # prints (4612640,)
print(xx.shape)  # prints (20, 454, 508)
How can I train with 3 features but plot only 2 of them on a 2D graph, and keep the right shape for my array Z? How can I get Z to the right size?
What I tried so far:
I read the scikit-learn example with the iris dataset
I want something like this, but instead I have 2 features and I can only predict 2 values, not 3 like the example.
Kaggle link 1
Retrieve Decision Boundary Lines (x,y coordinate format) from SKlearn Decision Tree
plot decision boundary matplotlib
But again, all the examples I'm seeing train with only 2 features, so from my understanding they are good to go; they are not facing my problem of Z not having the right shape.
Would it also be possible to visualize this with a 3D graph so we can see the 3 features?
I don't think the shape/size is the main issue here. You have to do some calculation before you can plot a 2D decision surface (contourf) for a 3D feature space. A correct contour plot requires that you have a single defined value (Z) for each pair of (X, Y). Take your example and look at just xx and yy:
import pandas as pd
df = pd.DataFrame({'x': xx.ravel(),
                   'y': yy.ravel(),
                   'Class': Z.ravel()})
xy_summ = df.groupby(['x', 'y']).agg(lambda x: x.value_counts().to_dict())
xy_summ = (xy_summ.drop('Class', axis=1)
                  .reset_index()
                  .join(pd.DataFrame(list(xy_summ.Class)))
                  .fillna(0))
xy_summ[[0, 1, 2]] = xy_summ[[0, 1, 2]].astype(int)
xy_summ.head()
You would find out that for each pair of xx and yy you would get 2 or 3 possible classes, depending on what zz is there:
    xx   yy   0   1   2
0  3.3  1.0  25  15  39
1  3.3  1.1  25  15  39
2  3.3  1.2  25  15  39
3  3.3  1.3  25  15  39
4  3.3  1.4  25  15  39
Therefore, to make a 2D contourf work, you have to decide what Z you'd like to call from 2 or 3 possibilities. For example, you can have a weighted class call like:
xy_summ['weighed_class'] = (xy_summ[1] + 2 * xy_summ[2]) / xy_summ[[0, 1, 2]].sum(1)
This will then allow you to draw a successful 2D plot:
import itertools
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

iris = load_iris()
X = iris.data[:, 0:3]
Y = iris.target
clf = DecisionTreeClassifier().fit(X, Y)

plot_step = 0.1
a, b, c = np.hsplit(X, 3)
ar = np.arange(a.min() - 1, a.max() + 1, plot_step)
br = np.arange(b.min() - 1, b.max() + 1, plot_step)
cr = np.arange(c.min() - 1, c.max() + 1, plot_step)
aa, bb, cc = np.meshgrid(ar, br, cr)
Z = clf.predict(np.c_[aa.ravel(), bb.ravel(), cc.ravel()])

datasets = [[0, len(ar), aa],
            [1, len(br), bb],
            [2, len(cr), cc]]

for i, (xsets, ysets) in enumerate(itertools.combinations(datasets, 2)):
    xi, xl, xx = xsets
    yi, yl, yy = ysets
    df = pd.DataFrame({'x': xx.ravel(),
                       'y': yy.ravel(),
                       'Class': Z.ravel()})
    xy_summ = df.groupby(['x', 'y']).agg(lambda x: x.value_counts().to_dict())
    xy_summ = (xy_summ.drop('Class', axis=1)
                      .reset_index()
                      .join(pd.DataFrame(list(xy_summ.Class)))
                      .fillna(0))
    xy_summ['weighed_class'] = (xy_summ[1] + 2 * xy_summ[2]) / xy_summ[[0, 1, 2]].sum(1)
    xyz = (xy_summ.x.values.reshape(xl, yl),
           xy_summ.y.values.reshape(xl, yl),
           xy_summ.weighed_class.values.reshape(xl, yl))

    ax = plt.subplot(1, 3, i + 1)
    ax.contourf(*xyz, cmap=mpl.cm.Paired)
    ax.scatter(X[:, xi], X[:, yi], c=Y, cmap=mpl.cm.Paired, edgecolor='black')
    ax.set_xlabel(iris.feature_names[xi])
    ax.set_ylabel(iris.feature_names[yi])
plt.show()
If I understand this correctly, "visualize this with a 3D graph" will be difficult. You've got not only 3 features, which makes it 3D, but also a class call. In the end, you actually have to work with 4D data, or density-like data in 3D space. I guess this might be the reason why a 3D decision-space (not really a surface anymore) graph is not very common.
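That said, a plain 3D scatter of the three features, colored by the class the tree predicts, can still give a rough feel for how the space is partitioned. A minimal sketch with matplotlib's 3D axes (no decision surface, just predictions on the training points):
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (enables the 3d projection on older matplotlib)
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data[:, 0:3]
Y = iris.target
clf = DecisionTreeClassifier().fit(X, Y)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# color each training point by the class the tree predicts for it
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=clf.predict(X),
           cmap=plt.cm.Paired, edgecolor='black')
ax.set_xlabel(iris.feature_names[0])
ax.set_ylabel(iris.feature_names[1])
ax.set_zlabel(iris.feature_names[2])
plt.show()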

two dimensional linear regression coefficients

I am doing linear regression with two dimensional variables:
filtered[['p_tag_x', 'p_tag_y', 's_tag_x', 's_tag_y']].head()
        p_tag_x      p_tag_y  s_tag_x  s_tag_y
35   589.665646  1405.580171    517.5   1636.5
36   589.665646  1405.580171    679.5   1665.5
100  610.546851  2425.303250    569.5   2722.0
101  610.546851  2425.303250    728.0   2710.0
102  717.237730  1411.842428    820.0   1616.5
clt = linear_model.LinearRegression()
clt.fit(filtered[['p_tag_x', 'p_tag_y']], filtered[['s_tag_x', 's_tag_y']])
I am getting the following coefficients for the regression:
clt.coef_
array([[ 0.4529769 , -0.22406594],
       [-0.00859452, -0.00816968]])
And the residues (X_0 and Y_0):
clt.residues_
array([ 1452.97816371,    69.12754694])
How should I understand the above coefficient matrix in terms of the regression line?
As I already explained in the comments, you get an extra dimension in your coef_ as well as intercept_ because you have 2 targets (y has shape (n_samples, n_targets)). In this case sklearn fits 2 independent regressors, one for each target.
You can then take those n regressors apart and handle each one on its own.
The formula of your regression line is still:
y(w, x) = intercept_ + coef_[0] * x[0] + coef_[1] * x[1] ...
Sadly your example is a bit harder to visualize because of the dimensionality.
Consider this a demo, with a lot of ugly hard-coding for this specific case (and bad example data!):
Code:
# Warning: ugly demo-like code using a lot of hard-coding!!!!!
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import linear_model

X = np.array([[589.665646, 1405.580171],
              [589.665646, 1405.580171],
              [610.546851, 2425.303250],
              [610.546851, 2425.303250],
              [717.237730, 1411.842428]])
y = np.array([[517.5, 1636.5],
              [679.5, 1665.5],
              [569.5, 2722.0],
              [728.0, 2710.0],
              [820.0, 1616.5]])

clt = linear_model.LinearRegression()
clt.fit(X, y)
print(clt.coef_)
print(clt.residues_)

def curve_0(x, y):  # target 0; single-point evaluation hardcoded for 2 features!
    return clt.intercept_[0] + x * clt.coef_[0, 0] + y * clt.coef_[0, 1]

def curve_1(x, y):  # target 1; single-point evaluation hardcoded for 2 features!
    return clt.intercept_[1] + x * clt.coef_[1, 0] + y * clt.coef_[1, 1]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

xs = [np.amin(X[:, 0]), np.amax(X[:, 0])]
ys = [np.amin(X[:, 1]), np.amax(X[:, 1])]

# regressor 0
ax.scatter(X[:, 0], X[:, 1], y[:, 0], c='blue')
ax.plot([xs[0], xs[1]], [ys[0], ys[1]], [curve_0(xs[0], ys[0]), curve_0(xs[1], ys[1])], c='cyan')

# regressor 1
ax.scatter(X[:, 0], X[:, 1], y[:, 1], c='red')
ax.plot([xs[0], xs[1]], [ys[0], ys[1]], [curve_1(xs[0], ys[0]), curve_1(xs[1], ys[1])], c='magenta')

ax.set_xlabel('X[:, 0] feature 0')
ax.set_ylabel('X[:, 1] feature 1')
ax.set_zlabel('Y')
plt.show()
Output:
Remarks:
You don't have to calculate the formula by yourself: clt.predict() will do that!
The code lines involving ax.plot(...) rely on the assumption that our line is defined by just 2 points (it's linear)!
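To make the first remark concrete, here is a small check (reusing the clt and X fitted above) that the hand-written formula and clt.predict() agree:
import numpy as np

# manual evaluation: one row per sample, one column per target
manual = clt.intercept_ + X @ clt.coef_.T

# LinearRegression.predict() computes exactly this linear combination
assert np.allclose(manual, clt.predict(X))
print(manual)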

[scikit learn]: Anomaly Detection - Alternative for OneClassSVM

I have implemented LinearSVC and SVC from the sklearn framework for text classification.
I am using TfidfVectorizer to get a sparse representation of the input data, which consists of two different classes (benign data and malicious data). That part works pretty well, but now I wanted to implement some kind of anomaly detection by using the OneClassSVM classifier and training a model with only one class (outlier detection...). Unfortunately, it does not work with sparse data. Some developers are working on a patch (https://github.com/scikit-learn/scikit-learn/pull/1586), but there are some bugs, so there is no solution yet for using the OneClassSVM implementation.
Are there any other methods in the sklearn framework for doing something like that? I am looking over the examples, but nothing seems to fit.
Thanks!
A bit late, but in case anyone else is looking for information on this... There's a third-party anomaly detection module for sklearn here: http://www.cit.mak.ac.ug/staff/jquinn/software/lsanomaly.html, based on least-squares methods. It should be a plug-in replacement for OneClassSVM.
Unfortunately, scikit-learn currently implements only the one-class SVM and a robust covariance estimator for outlier detection.
You can try a comparison of these methods (as provided in the docs) by examining their differences on 2D data:
import numpy as np
import pylab as pl
import matplotlib.font_manager
from scipy import stats
from sklearn import svm
from sklearn.covariance import EllipticEnvelope

# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# define two outlier detection tools to be compared
classifiers = {
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1),
    "robust covariance estimator": EllipticEnvelope(contamination=.1)}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = 0

# Fit the problem with varying cluster separation
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
    # Data generation
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
    X = np.r_[X1, X2]
    # Add outliers
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

    # Fit the models
    pl.figure(figsize=(10, 5))
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        # fit the data and tag outliers
        clf.fit(X)
        y_pred = clf.decision_function(X).ravel()
        threshold = stats.scoreatpercentile(y_pred,
                                            100 * outliers_fraction)
        y_pred = y_pred > threshold
        n_errors = (y_pred != ground_truth).sum()
        # plot the level lines and the points
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        subplot = pl.subplot(1, 2, i + 1)
        subplot.set_title("Outlier detection")
        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                         cmap=pl.cm.Blues_r)
        a = subplot.contour(xx, yy, Z, levels=[threshold],
                            linewidths=2, colors='red')
        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                         colors='orange')
        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white')
        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black')
        subplot.axis('tight')
        subplot.legend(
            [a.collections[0], b, c],
            ['learned decision function', 'true inliers', 'true outliers'],
            prop=matplotlib.font_manager.FontProperties(size=11))
        subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
        subplot.set_xlim((-7, 7))
        subplot.set_ylim((-7, 7))
    pl.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
pl.show()
