Bayes PCA doesn't recover the Principal Components in PyMC3

I'd like to implement a hierarchical Bayesian model with PyMC3. Before designing a complex model, I'm trying to get accustomed to PyMC3 by implementing Bayesian PCA and comparing the results with sklearn.decomposition.PCA.
In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
# Generate data
nsamp_cl = 1000 #Number of samples per class and per site
cov = np.matrix([[1, 0.9, 0.05],
                 [0.9, 1, 0.05],
                 [0.05, 0.05, 1]])
nfeat = cov.shape[0] #Number of features
X0 = np.random.multivariate_normal(np.zeros(nfeat),cov,nsamp_cl)
X1 = np.random.multivariate_normal(np.zeros(nfeat),cov,nsamp_cl)
# Rotate class 1
theta = np.radians(90)
cos, sin = np.cos(theta), np.sin(theta)
R = np.matrix('{} {}; {} {}'.format(cos, -sin, sin, cos))
X1[:,0:2] = np.dot(X1[:,0:2],R.T)
X = np.concatenate([X0,X1])
Y = np.concatenate([np.zeros(X0.shape[0]),np.ones(X1.shape[0])])
n = X.shape[0]
d = X.shape[1]
In [2]:
plt.figure()
cols = ['b','r']
colors = [cols[y.astype(int)] for y in Y ]
plt.scatter(X[:,0],X[:,1],20,colors)
plt.title('Features 0 and 1')
plt.figure()
cols = ['b','r']
colors = [cols[y.astype(int)] for y in Y ]
plt.scatter(X[:,1],X[:,2],20,colors)
plt.title('Features 1 and 2')
Out [2]: [two scatter plots colored by class: features 0 vs 1 and features 1 vs 2]
In [3]:
from pymc3 import Model, Normal, Gamma, math, variational
common_latent_model = Model()
# Building a latent model to extract site-robust principal components
with common_latent_model:
    n_latent = 3
    # ARD prior
    alphas = Gamma('alphas', alpha=1e-6, beta=1e-6, shape=n_latent)
    # Weight vector
    w = Normal('w', mu=0, tau=alphas, shape=(d, n_latent))
    # Latent space
    z = Normal('z', mu=0, tau=1, shape=(n, n_latent))
    # Multiply latent variables by W to go from latent to observation space
    t = math.dot(z, w.T)
    # Add bias
    mu = Normal('mu', mu=0, tau=0.01, shape=d)
    u = t + mu
    # Precision of the observation
    sigma = Gamma('sigma', alpha=1e-6, beta=1e-6, shape=1)
    # Likelihood (sampling distribution) of observations
    X_obs = Normal('X_obs', mu=u, tau=sigma, observed=X)
with common_latent_model:
    means, sds, elbos = variational.advi(n=10000, learning_rate=0.1, accurate_elbo=True)  # 100000
plt.plot(elbos)
plt.ylabel('ELBO')
plt.xlabel('iteration')
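Side note for readers on newer PyMC3 releases: the variational interface was later reorganized and variational.advi was removed. A rough sketch of the equivalent call for this model (not the API the question used) would be:

import pymc3 as pm
with common_latent_model:
    approx = pm.fit(n=10000, method='advi')  # replaces variational.advi
trace = approx.sample(1000)        # draw samples from the fitted approximation
w_mean = trace['w'].mean(axis=0)   # posterior-mean estimate of the weights
plt.plot(approx.hist)              # loss (negative ELBO) history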
In [4]:
for key in means:
    print("key: %s , value: %s" % (key, means[key]))
key: mu , value: [ 0.03288066 -0.05347487 0.00260641]
key: alphas_log_ , value: [ 6.94631195 6.85621834 6.84792233]
key: sigma_log_ , value: [-0.009662]
key: z , value: [[-0.01260083 -0.00460729 -0.01360558]
[-0.02817471 0.04281501 0.01643355]
[-0.05178572 -0.02470609 -0.05092171]
...,
[-0.05201711 0.00150599 -0.01167801]
[-0.01097088 -0.02666511 0.03660954]
[ 0.0609949 0.01156182 0.01814843]]
key: w , value: [[-0.06004834 0.00599346 -0.03071374]
[ 0.00668656 -0.01306511 0.00400904]
[-0.00141243 -0.00778869 0.03257137]]
In [5]:
PC_bayes = means['z']
plt.figure()
cols = ['b','r']
colors = [cols[y.astype(int)] for y in Y ]
plt.scatter(PC_bayes[:,0],PC_bayes[:,1],20,colors,alpha=.1)
Out [5]: [scatter plot of the first two Bayes-PCA latent dimensions, colored by class]
In [6]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X)
PC = pca.transform(X)
In [7]:
plt.figure()
cols = ['b','r']
colors = [cols[y.astype(int)] for y in Y ]
plt.scatter(PC[:,0],PC[:,1],20,colors,alpha=.1)
Out [7]: [scatter plot of the first two sklearn principal components, colored by class]
(You can find the iPython Notebook here:
https://github.com/peppeFarAway/pymc3/blob/master/BayesPCA.ipynb)
Why can't my Bayes PCA implementation recover the principal components, while sklearn.decomposition.PCA can? Where am I making a mistake?
The main reference I used to implement the model is:
https://blogs.msdn.microsoft.com/infernet_team_blog/2011/09/30/bayesian-pca/
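Side note when making this comparison: the standard Gaussian prior on z is rotation-invariant, so Bayesian PCA can only identify the principal subspace up to an arbitrary rotation and sign, not the individual components. A minimal sketch that compares subspaces instead of components, assuming X and the ADVI means from the cells above (principal_angles is a hypothetical helper, not part of any library):

import numpy as np
from sklearn.decomposition import PCA

def principal_angles(A, B):
    # Principal angles (in degrees) between the column spaces of A and B;
    # angles near 0 mean the two subspaces coincide.
    Qa, _ = np.linalg.qr(A)
    Qb, _ = np.linalg.qr(B)
    s = np.linalg.svd(Qa.T.dot(Qb), compute_uv=False)
    return np.degrees(np.arccos(np.clip(s, -1.0, 1.0)))

W = means['w']                                      # (d, n_latent) posterior mean of the weights
top2 = np.argsort(-np.linalg.norm(W, axis=0))[:2]   # two dominant weight columns
V = PCA(n_components=2).fit(X).components_.T        # (d, 2) sklearn principal axes
print(principal_angles(W[:, top2], V))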

Related

Selecting data points in the neighbourhood of support vectors

I have been thinking about this but am not sure how to do it. I have binary imbalanced data and would like to use an SVM to select just the subset of the majority-class data points nearest to the support vector. Thereafter, I can fit a binary classifier on this "balanced" data.
To illustrate what I mean, a MWE:
# packages import
from collections import Counter
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
import seaborn as sns
# sample data
X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.9], flip_y=0, random_state=1)
# class distribution summary
print(Counter(y))
Counter({0: 91, 1: 9})
# fit svm model
svc_model = SVC(kernel='linear', random_state=32)
svc_model.fit(X, y)
plt.figure(figsize=(10, 8))
# Plotting our two-features-space
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, s=50)
# Constructing a hyperplane using a formula.
w = svc_model.coef_[0] # w consists of 2 elements
b = svc_model.intercept_[0] # b consists of 1 element
x_points = np.linspace(-1, 1) # generating x-points from -1 to 1
y_points = -(w[0] / w[1]) * x_points - b / w[1] # getting corresponding y-points
# Plotting a red hyperplane
plt.plot(x_points, y_points, c='r')
The two classes are well separated by the hyperplane. We can see the support vectors for both classes (even better for class 1).
Since the minority class 1 has 9 data points, I want to down-sample the majority class 0 by selecting its support vector and the 8 other data points nearest to it, so that the class distribution becomes {0: 9, 1: 9}, ignoring all other class-0 points. I will then use this to fit a binary classifier like LR (or even SVC).
My question is how to select those data points of class 0 nearest to the class support vector, in a way that balances them with the data points of the minority class 1.
This can be achieved as follows: get the support vector for class 0 (sv0), iterate over all data points in class 0 (X[y == 0]), compute the distances (d) to the point represented by the support vector, sort them, take the 9 with the smallest values, and concatenate them with the points of class 1 to create the down-sampled data (X_ds, y_ds).
sv0 = svc_model.support_vectors_[0]
distances = []
for i, x in enumerate(X[y == 0]):
    d = np.linalg.norm(sv0 - x)
    distances.append((i, d))
distances.sort(key=lambda tup: tup[1])
index = [i for i, d in distances[:9]]
X_ds = np.concatenate((X[y == 0][index], X[y == 1]))
y_ds = np.concatenate((y[y == 0][index], y[y == 1]))
plt.plot(x_points[19:-29], y_points[19:-29], c='r')
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, s=50)
plt.scatter(X_ds[y_ds == 0][:, 0], X_ds[y_ds == 0][:, 1], color='yellow', alpha=0.4)
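As a side note, the same selection can be written without the explicit loop. A small vectorized sketch, assuming X, y and svc_model from above:

sv0 = svc_model.support_vectors_[0]
X0 = X[y == 0]
d = np.linalg.norm(X0 - sv0, axis=1)   # distances of all class-0 points to sv0
index = np.argsort(d)[:9]              # indices of the 9 nearest class-0 points
X_ds = np.concatenate((X0[index], X[y == 1]))
y_ds = np.concatenate((y[y == 0][index], y[y == 1]))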

GEKKO ARX Model Forecasting: data arrays must have the same length

I am trying to use sysid to regress an ARX model periodically, then evaluate the predictive ability of that model by simulating with the future inputs and comparing the output with the experimental data. When I try to solve using m.solve() I get the following error: Exception: Data arrays must have the same length, and match time discretization in dynamic problems
The following is an MRE:
import numpy as np
from gekko import GEKKO
m = GEKKO()
X = [[0.9, 0.], [0.9, 0.], [0.9, 0.], [0.9, 0.], [0.9, 0.],
     [0.5, 0.], [0.5, 0.], [0.5, 0.], [0.5, 0.], [0.5, 0.]]  # 2 values for inputs at each time step
Y = [20.3, 20.3, 20.2, 20.2, 20.1, 20.1, 20.1, 20., 19.9, 19.8]  # 1 output at each time step
t = np.linspace(0, 9*300, 10)  # 10 points 5 minutes apart each
na = 1  # output coefficients
nb = 2  # input coefficients
res, p, K = m.sysid(t, X, Y, na, nb, pred='meas')
m.time = t - t[0]
y_, u_ = m.arx(p)
u_[0].value = X[0]
u_[1].value = X[1]
m.options.imode = 4
m.options.nodes = 2
# simulate
m.solve()
I don't want to run control; rather, I want to apply the experimental input values to future timesteps and see how the ARX model extrapolates.
Thanks for your help.
The problem was with this section and how the data is loaded into value:
u_[0].value = X[:,0]
u_[1].value = X[:,1]
y_[0].value = Y
Try printing X[0] and X[1] to see that they are only the first and second elements of the original list. Converting to a numpy array helps with the slicing.
import numpy as np
from gekko import GEKKO
m = GEKKO()
X = np.array([[0.9, 0.], [0.9, 0.],
              [0.9, 0.], [0.9, 0.],
              [0.9, 0.], [0.5, 0.],
              [0.5, 0.], [0.5, 0.],
              [0.5, 0.], [0.5, 0.]])  # 2 values for inputs at each time step
Y = np.array([20.3, 20.3, 20.2, 20.2,
              20.1, 20.1, 20.1, 20.,
              19.9, 19.8])  # 1 output at each time step
t = np.linspace(0, 9*300, 10) # 10 points 5 minutes apart each
na = 1 # output coefficients
nb = 2 # input coefficients
res, p, K = m.sysid(t, X, Y, na, nb, pred='meas')
m.time = t - t[0]
y_, u_ = m.arx(p)
u_[0].value = X[:,0]
u_[1].value = X[:,1]
y_[0].value = Y
print(X[0])
print(X[1])
m.options.imode = 4
m.options.nodes = 2
# simulate
m.solve()
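Since the goal is to judge how well the ARX model extrapolates, a short hedged addition that overlays the simulated output on the measurements (assuming the script above has run):

import matplotlib.pyplot as plt
plt.plot(m.time, Y, 'o', label='measured')
plt.plot(m.time, y_[0].value, '-', label='ARX simulation')
plt.xlabel('time (s)')
plt.ylabel('output')
plt.legend()
plt.show()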
Here is another example with Pandas DataFrames:
from gekko import GEKKO
import pandas as pd
import matplotlib.pyplot as plt
# load data and parse into columns
url = 'http://apmonitor.com/do/uploads/Main/tclab_dyn_data2.txt'
data = pd.read_csv(url)
t = data['Time']
u = data[['H1','H2']]
y = data['T1']
# generate time-series model
m = GEKKO(remote=False) # remote=True for MacOS
# system identification
na = 2 # output coefficients
nb = 2 # input coefficients
yp,p,K = m.sysid(t,u,y,na,nb,diaglevel=1)
plt.figure()
plt.subplot(2,1,1)
plt.plot(t,u)
plt.legend([r'$u_0$',r'$u_1$'])
plt.ylabel('MVs')
plt.subplot(2,1,2)
plt.plot(t,y)
plt.plot(t,yp)
plt.legend([r'$y_0$',r'$z_0$'])
plt.ylabel('CVs')
plt.xlabel('Time')
plt.savefig('sysid.png')
plt.show()
We're also working on a package with a Seeq add-on for system identification that runs in Python and Jupyter notebooks.

Can I inverse transform the intercept and coefficients of LASSO regression after using Robust Scaler?

Is it possible to inverse transform the intercept and coefficients in LASSO regression, after fitting the model on scaled data using Robust Scaler?
I'm using LASSO regression to predict values on data that is not normalized and doesn't perform well with LASSO unless it's scaled beforehand. After scaling the data and fitting the LASSO model, I ideally want to be able to see what the model intercept and coefficients are but in the original units (not the scaled versions). I asked a similar question here and it doesn't appear this is possible. If not, why? Can someone explain this to me? I'm trying to broaden my understanding of how LASSO and Robust Scaler work.
Below is the code I was using. Here I was trying to inverse transform the coefficients using transformer_x and the intercept using transformer_y. However, it sounds like this is incorrect.
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso
df = pd.DataFrame({'Y':[5, -10, 10, .5, 2.5, 15], 'X1':[1., -2., 2., .1, .5, 3], 'X2':[1, 1, 2, 1, 1, 1],
                   'X3':[6, 6, 6, 5, 6, 4], 'X4':[6, 5, 4, 3, 2, 1]})
X = df[['X1','X2', 'X3' ,'X4']]
y = df[['Y']]
#Scaling
transformer_x = RobustScaler().fit(X)
transformer_y = RobustScaler().fit(y)
X_scal = transformer_x.transform(X)
y_scal = transformer_y.transform(y)
#LASSO
lasso = Lasso()
lasso = lasso.fit(X_scal, y_scal)
def pred_val(X1, X2, X3, X4):
    print('X1 entered: ', X1)
    #Scale X value that user entered - by hand
    med_X = X.median()
    Q1_X = X.quantile(0.25)
    Q3_X = X.quantile(0.75)
    IQR_X = Q3_X - Q1_X
    X_scaled = (X1 - med_X)/IQR_X
    print('X1 scaled by hand: ', X_scaled[0].round(2))
    #Scale X value that user entered - by function (all four features)
    X_scaled2 = transformer_x.transform(np.array([[X1, X2, X3, X4]]))
    print('X1 scaled by function: ', X_scaled2[0][0].round(2))
    #Intercept by hand
    med_y = y.median()
    Q1_y = y.quantile(0.25)
    Q3_y = y.quantile(0.75)
    IQR_y = Q3_y - Q1_y
    inv_int = med_y + IQR_y*lasso.intercept_[0]
    #Intercept by function
    inv_int2 = transformer_y.inverse_transform(lasso.intercept_.reshape(-1, 1))[0][0]
    #Coefficient by hand
    inv_coef = lasso.coef_[0]*IQR_y
    #Coefficient by function
    inv_coef2 = transformer_x.inverse_transform(lasso.coef_.reshape(1, -1))[0]
    #Prediction by hand
    preds = inv_int + inv_coef*X_scaled[0]
    #Prediction by function
    preds_inner = lasso.predict(X_scaled2)
    preds_f = transformer_y.inverse_transform(preds_inner.reshape(-1, 1))[0][0]
    print('\nIntercept by hand: ', inv_int[0].round(2))
    print('Intercept by function: ', inv_int2.round(2))
    print('\nCoefficients by hand: ', inv_coef[0].round(2))
    print('Coefficients by function: ', inv_coef2[0].round(2))
    print('\nYour predicted value by hand is: ', preds[0].round(2))
    print('Your predicted value by function is: ', preds_f.round(2))
    print('Perfect Prediction would be 80')
pred_val(10, 1, 1, 1)
Update: I've updated my code to show the type of prediction function I'm trying to create. I'm just trying to create a function that does exactly what .predict does, but also shows the intercept and coefficients in their unscaled units.
Current output:
Out[1]:
X1 entered: 10
X1 scaled by hand: 5.97
X1 scaled by function: 5.97
Intercept by hand: 34.19
Intercept by function: 34.19
Coefficients by hand: 7.6
Coefficients by function: 8.5
Your predicted value by hand is: 79.54
Your predicted value by function is: 79.54
Perfect Prediction would be 80
Ideal output:
Out[1]:
X1 entered: 10
X1 scaled by hand: 5.97
X1 scaled by function: 5.97
Intercept by hand: 34.19
Intercept by function: 34.19
Coefficients by hand: 7.6
Coefficients by function: 7.6
Your predicted value by hand is: 79.54
Your predicted value by function is: 79.54
Perfect Prediction would be 80
Based on the linked SO thread, all you want to do is to get the unscaled prediction value. Is that right?
If yes, then all you need to do is:
# Scale the test dataset
X_test_scaled = transformer_x.transform(X_test)
# Predict with the trained model
prediction = lasso.predict(X_test_scaled)
# Inverse transform the prediction (inverse_transform expects a 2D array)
prediction_in_dollars = transformer_y.inverse_transform(prediction.reshape(-1, 1))
UPDATE:
Suppose the train data contain just a single feature named X. Here is what the RobustScaler will do:
X_scaled = (X - median(X))/IQR(X)
y_scaled = (y - median(y))/IQR(y)
Then, the lasso regression will give a prediction like this:
a * X_scaled + b = y_scaled
You have to work out the equations to see what the model coefficients are on the unscaled data:
# Substituting X_scaled and y_scaled from the 1st equation
# In this equation `median(X), IQR(X), median(y) and IQR(y) are plain numbers you already know from the training phase
a * (X - median(X))/IQR(X) + b = (y - median(y))/IQR(y)
If you try to turn this into an equation of the form a_new * X + b_new = y, you end up with:
a_new = (a * (X - median(X)) / (X * IQR(X))) * IQR(y)
b_new = b * IQR(y) + median(y)
a_new * X + b_new = y
You can see that the unscaled coefficient (a_new) depends on X. So, you can use the unscaled X to make predictions directly but in between you are applying the transformation indirectly.
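For completeness, the same algebra can also be rearranged into coefficients that do not depend on X at all, which is usually what "coefficients in the original units" means. A sketch, assuming med_X, IQR_X, med_y and IQR_y are the training-set medians/IQRs (computed as in the script below) and lasso is the fitted model:

# a_new_j = a_j * IQR(y) / IQR(X_j)                            (fixed slope per feature)
# b_new   = median(y) + IQR(y)*b - sum_j a_new_j * median(X_j)
a_new = lasso.coef_ * float(IQR_y) / IQR_X.values
b_new = float(med_y) + float(IQR_y) * lasso.intercept_[0] - np.dot(a_new, med_X.values)
# X_test.values.dot(a_new) + b_new then reproduces the inverse-transformed lasso.predict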
UPDATE 2
I've adapted your code and it now shows how you can get the coefficients in the original scale. The script is just the implementation of the formulas I'm showing above.
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso
df = pd.DataFrame({'Y':[5, -10, 10, .5, 2.5, 15], 'X1':[1., -2., 2., .1, .5, 3], 'X2':[1, 1, 2, 1, 1, 1],
                   'X3':[6, 6, 6, 5, 6, 4], 'X4':[6, 5, 4, 3, 2, 1]})
X = df[['X1','X2','X3','X4']]
y = df[['Y']]
#Scaling
transformer_x = RobustScaler().fit(X)
transformer_y = RobustScaler().fit(y)
X_scal = transformer_x.transform(X)
y_scal = transformer_y.transform(y)
#LASSO
lasso = Lasso()
lasso = lasso.fit(X_scal, y_scal)
def pred_val(X_test):
    print('X entered: ')
    print(X_test.values[0])
    #Scale X value that user entered - by hand
    med_X = X.median()
    Q1_X = X.quantile(0.25)
    Q3_X = X.quantile(0.75)
    IQR_X = Q3_X - Q1_X
    X_scaled = ((X_test - med_X)/IQR_X).fillna(0).values
    print('X_test scaled by hand: ')
    print(X_scaled[0])
    #Scale X value that user entered - by function
    X_scaled2 = transformer_x.transform(X_test)
    print('X_test scaled by function: ')
    print(X_scaled2[0])
    #Intercept by hand
    med_y = y.median()
    Q1_y = y.quantile(0.25)
    Q3_y = y.quantile(0.75)
    IQR_y = Q3_y - Q1_y
    a = lasso.coef_
    coef_new = ((a * (X_test - med_X).values) / (X_test * IQR_X).values) * float(IQR_y)
    coef_new = np.nan_to_num(coef_new)[0]
    b = lasso.intercept_[0]
    intercept_new = b * float(IQR_y) + float(med_y)
    custom_pred = sum((coef_new * X_test.values)[0]) + intercept_new
    pred = lasso.predict(X_scaled2)
    final_pred = transformer_y.inverse_transform(pred.reshape(-1, 1))[0][0]
    print('Original intercept: ', lasso.intercept_[0].round(2))
    print('New intercept: ', intercept_new.round(2))
    print('Original coefficients: ', lasso.coef_.round(2))
    print('New coefficients: ', coef_new.round(2))
    print('Your predicted value by function is: ', final_pred.round(2))
    print('Your predicted value by hand is: ', custom_pred.round(2))

X_test = pd.DataFrame([10, 1, 1, 1]).T
X_test.columns = ['X1', 'X2', 'X3', 'X4']
pred_val(X_test)
You can see that the custom prediction uses the original values (X_test.values).
Result:
X entered:
[10 1 1 1]
X_test scaled by hand:
[ 5.96774194 0. -6.66666667 -1. ]
X_test scaled by function:
[ 5.96774194 0. -6.66666667 -1. ]
Original intercept: 0.01
New intercept: 3.83
Original coefficients: [ 0.02 0. -0. -0. ]
New coefficients: [0.1 0. 0. 0. ]
Your predicted value by function is: 4.83
Your predicted value by hand is: 4.83
As I explained above, the new coefficients depend on X_test. This means that you cannot use their current values with another test sample. Their values will be different for different inputs.

issue with sklearn.mixture.GMM (Gaussian Mixture Model)

I'm new to scikit-learn and GMMs in general... I have some problems with the fit quality of a Gaussian Mixture Model in python (scikit-learn).
I have an array of data, which you may find at DATA HERE, that I want to fit with a GMM with n = 2 components.
As a benchmark, I superimpose a Normal fit.
Errors/weirdness:
setting n = 1 components, I cannot recover the Normal benchmark fit with GMM(1)
setting n = 2 components, the Normal fit is better than the GMM(2) fit
GMM(n) always seems to provide the same fit...
Here is what I get: what am I doing wrong here? (The picture displays the fits with GMM(2).) Thanks in advance for your help.
Code below (to run it, save data in the same folder)
from numpy import *
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from collections import OrderedDict
from scipy.stats import norm
from sklearn.mixture import GMM
# Upload the data: "epsi" (array of floats)
file_xlsx = './db_X.xlsx'
data = pd.read_excel(file_xlsx)
epsi = data["epsi"].values;
t_ = len(epsi);
# Normal fit (for benchmark)
epsi_grid = arange(min(epsi),max(epsi)+0.001,0.001);
mu = mean(epsi);
sigma2 = var(epsi);
normal = norm.pdf(epsi_grid, mu, sqrt(sigma2));
# TENTATIVE - Gaussian mixture fit
gmm = GMM(n_components = 2); # fit quality doesn't improve if I set: covariance_type = 'full'
gmm.fit(reshape(epsi,(t_,1)));
gauss_mixt = exp(gmm.score(reshape(epsi_grid,(len(epsi_grid),1))));
# same result if I apply the definition of pdf of a Gaussian mixture:
# pdf_mixture = w_1 * N(mu_1, sigma_1) + w_2 * N(mu_2, sigma_2)
# as suggested in:
# http://stackoverflow.com/questions/24878729/how-to-construct-and-plot-uni-variate-gaussian-mixture-using-its-parameters-in-p
#
#gauss_mixt = array([p * norm.pdf(epsi_grid, mu, sd) for mu, sd, p in zip(gmm.means_.flatten(), sqrt(gmm.covars_.flatten()), gmm.weights_)]);
#gauss_mixt = sum(gauss_mixt, axis = 0);
# Create a figure showing the comparison between the estimated distributions
# setting the figure object
fig = plt.figure(figsize = (10,8))
fig.set_facecolor('white')
ax = plt.subplot(111)
# colors
red = [0.9, 0.3, 0.0];
grey = [0.9, 0.9, 0.9];
green = [0.2, 0.6, 0.3];
# x-axis limits
q_inf = float(pd.DataFrame(epsi).quantile(0.0025));
q_sup = float(pd.DataFrame(epsi).quantile(0.9975));
ax.set_xlim([q_inf, q_sup])
# empirical pdf of data
nb = int(10*log(t_));
ax.hist(epsi, bins = nb, normed = True, color = grey, edgecolor = 'k', label = "Empirical");
# Normal fit
ax.plot(epsi_grid, normal, color = green, lw = 1.0, label = "Normal fit");
# Gaussian Mixture fit
ax.plot(epsi_grid, gauss_mixt, color = red, lw = 1.0, label = "GMM(2)");
# title
ax.set_title("Issue: Normal fit out-performs the GMM fit?", size = 14)
# legend
ax.legend(loc='upper left');
plt.tight_layout()
plt.show()
The problem was the lower bound on the individual components' variances, min_covar, which defaults to 1e-3 and is meant to prevent overfitting.
Lowering that limit solved the problem:
gmm = GMM(n_components = 2, min_covar = 1e-12)
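Note for newer scikit-learn versions: GMM was removed in 0.20; the replacement is GaussianMixture, whose reg_covar parameter plays a similar (though additive rather than floor-based) role. A hedged sketch of the equivalent fit for the script above:

from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=2, reg_covar=1e-12)
gmm.fit(epsi.reshape(-1, 1))
# score_samples returns the per-sample log-density, like the old GMM.score
gauss_mixt = exp(gmm.score_samples(epsi_grid.reshape(-1, 1)))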

[scikit learn]: Anomaly Detection - Alternative for OneClassSVM

I have implemented LinearSVC and SVC from the sklearn framework for text classification.
I am using TfidfVectorizer to get a sparse representation of the input data, which consists of two different classes (benign data and malicious data). This part works fine, but now I want to implement some kind of anomaly detection using the OneClassSVM classifier, training a model with only one class (outlier detection...). Unfortunately, it does not work with sparse data. Some developers are working on a patch (https://github.com/scikit-learn/scikit-learn/pull/1586), but there are some bugs, so there is no solution yet for using the OneClassSVM implementation.
Are there any other methods in the sklearn framework for doing something like that? I am looking over the examples but nothing seems to fit.
Thanks!
A bit late, but in case anyone else is looking for information on this... There's a third-party anomaly detection module for sklearn here: http://www.cit.mak.ac.ug/staff/jquinn/software/lsanomaly.html, based on least-squares methods. It should be a plug-in replacement for OneClassSVM.
Unfortunately, scikit-learn currently implements only the one-class SVM and a robust covariance estimator for outlier detection.
You can try a comparison of these methods (as provided in the docs) by examining their differences on 2D data:
import numpy as np
import pylab as pl
import matplotlib.font_manager
from scipy import stats
from sklearn import svm
from sklearn.covariance import EllipticEnvelope

# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# define two outlier detection tools to be compared
classifiers = {
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1),
    "robust covariance estimator": EllipticEnvelope(contamination=.1)}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = 0

# Fit the problem with varying cluster separation
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
    # Data generation (integer sample counts for randn)
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
    X = np.r_[X1, X2]
    # Add outliers
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]
    # Fit the models
    pl.figure(figsize=(10, 5))
    for j, (clf_name, clf) in enumerate(classifiers.items()):
        # fit the data and tag outliers
        clf.fit(X)
        y_pred = clf.decision_function(X).ravel()
        threshold = stats.scoreatpercentile(y_pred,
                                            100 * outliers_fraction)
        y_pred = y_pred > threshold
        n_errors = (y_pred != ground_truth).sum()
        # plot the level lines and the points
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        subplot = pl.subplot(1, 2, j + 1)
        subplot.set_title("Outlier detection")
        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                         cmap=pl.cm.Blues_r)
        a = subplot.contour(xx, yy, Z, levels=[threshold],
                            linewidths=2, colors='red')
        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                         colors='orange')
        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white')
        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black')
        subplot.axis('tight')
        subplot.legend(
            [a.collections[0], b, c],
            ['learned decision function', 'true inliers', 'true outliers'],
            prop=matplotlib.font_manager.FontProperties(size=11))
        subplot.set_xlabel("%d. %s (errors: %d)" % (j + 1, clf_name, n_errors))
        subplot.set_xlim((-7, 7))
        subplot.set_ylim((-7, 7))
    pl.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
pl.show()
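An update for anyone finding this now: recent scikit-learn releases also ship IsolationForest and LocalOutlierFactor, and OneClassSVM accepts scipy sparse input, so TF-IDF features can be used directly. A minimal sketch on the same X as above:

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

iso = IsolationForest(contamination=outliers_fraction, random_state=42).fit(X)
labels_iso = iso.predict(X)                   # +1 for inliers, -1 for outliers
lof = LocalOutlierFactor(n_neighbors=20, contamination=outliers_fraction)
labels_lof = lof.fit_predict(X)               # +1 for inliers, -1 for outliers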
