issue with sklearn.mixture.GMM (Gaussian Mixture Model) - python

I'm new to scikit-learn and to GMMs in general, and I have a problem with the fit quality of a Gaussian Mixture Model in Python (scikit-learn).
I have an array of data, which you may find at DATA HERE that I want to fit with a GMM with n = 2 components.
As benchmark I superimpose a Normal fit.
Errors/weirdness:
setting n = 1 components, I cannot recover with GMM(1) the Normal benchmark fit
setting n = 2 components, the Normal fit is better than GMM(2) fit
GMM(n) seems to provide always the same fit...
Here is what I get (the picture displays the fits with GMM(2)). What am I doing wrong here? Thanks in advance for your help.
Code below (to run it, save data in the same folder)
from numpy import *
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from collections import OrderedDict
from scipy.stats import norm
from sklearn.mixture import GMM
# Load the data: column "epsi" of the spreadsheet (array of floats)
file_xlsx = './db_X.xlsx'
data = pd.read_excel(file_xlsx)
epsi = data["epsi"].values;
t_ = len(epsi);  # number of observations
# Normal fit (for benchmark): Gaussian with the sample mean and variance,
# evaluated on a grid of step 0.001 spanning the range of the data
epsi_grid = arange(min(epsi),max(epsi)+0.001,0.001);
mu = mean(epsi);
sigma2 = var(epsi);
normal = norm.pdf(epsi_grid, mu, sqrt(sigma2));
# TENTATIVE - Gaussian mixture fit
gmm = GMM(n_components = 2); # fit quality doesn't improve if I set: covariance_type = 'full'
# the samples are passed to the estimator as a (t_, 1) column
gmm.fit(reshape(epsi,(t_,1)));
# exp(score) on the grid is used as the mixture density; this assumes score()
# returns one log-density per grid point -- TODO confirm for the installed version
gauss_mixt = exp(gmm.score(reshape(epsi_grid,(len(epsi_grid),1))));
# same result if I apply the definition of pdf of a Gaussian mixture:
# pdf_mixture = w_1 * N(mu_1, sigma_1) + w_2 * N(mu_2, sigma_2)
# as suggested in:
# http://stackoverflow.com/questions/24878729/how-to-construct-and-plot-uni-variate-gaussian-mixture-using-its-parameters-in-p
#
#gauss_mixt = array([p * norm.pdf(epsi_grid, mu, sd) for mu, sd, p in zip(gmm.means_.flatten(), sqrt(gmm.covars_.flatten()), gmm.weights_)]);
#gauss_mixt = sum(gauss_mixt, axis = 0);
# Create a figure showing the comparison between the estimated distributions
# setting the figure object
fig = plt.figure(figsize = (10,8))
fig.set_facecolor('white')
ax = plt.subplot(111)
# colors (RGB triples)
red = [0.9, 0.3, 0.0];
grey = [0.9, 0.9, 0.9];
green = [0.2, 0.6, 0.3];
# x-axis limits: the 0.25% and 99.75% quantiles of the data, so the extreme
# tails do not stretch the plot
q_inf = float(pd.DataFrame(epsi).quantile(0.0025));
q_sup = float(pd.DataFrame(epsi).quantile(0.9975));
ax.set_xlim([q_inf, q_sup])
# empirical pdf of data: normalised histogram, bin count = 10 * log(sample size)
nb = int(10*log(t_));
ax.hist(epsi, bins = nb, normed = True, color = grey, edgecolor = 'k', label = "Empirical");
# Normal fit
ax.plot(epsi_grid, normal, color = green, lw = 1.0, label = "Normal fit");
# Gaussian Mixture fit
ax.plot(epsi_grid, gauss_mixt, color = red, lw = 1.0, label = "GMM(2)");
# title
ax.set_title("Issue: Normal fit out-performs the GMM fit?", size = 14)
# legend
ax.legend(loc='upper left');
plt.tight_layout()
plt.show()

The problem was min_covar, the lower bound on the variance of each individual component, which defaults to 1e-3 and is meant to prevent overfitting.
Lowering that limit solved the problem (see picture):
# Lower the per-component variance floor (min_covar) below its default so that
# components narrower than the default bound can be fitted.
gmm = GMM(n_components = 2, min_covar = 1e-12)

Related

Getting a smooth Poisson Distribution over a Histogram with Small Number of Bins

I have a Poisson distribution of a background count which mostly contains counts equal to zero, I've fitted a Poisson distribution to this data and gotten the following result:
I have another dataset from a source which has higher count rates, in this case it works fine:
Here's my (inelegant) code in full;
mean_values = []
# For every named dataset: summary statistics, then a normalised histogram of
# the counts with a Poisson pmf drawn on top.
# NOTE(review): the original indentation was lost; the whole block is assumed
# to run once per dataset (one figure each) -- confirm.
for a in data_arrays:
    # `a` is the *name* of a module-level array; its column 1 holds the counts.
    dataset = globals()[a]
    cps_vals = dataset[:, 1]
    max_cps = int(max(cps_vals))

    # The statistics are published as module-level names built from the
    # dataset name (e.g. bg_mean, bg_std, bg_serr) so they can be used later.
    mean_name = f"{a}_mean"
    std_name = f"{a}_std"
    serr_name = f"{a}_serr"
    mean = np.mean(cps_vals)
    globals()[mean_name] = mean
    sample_std = np.std(cps_vals, ddof=1)
    globals()[std_name] = sample_std
    std_error = sample_std / np.sqrt(len(cps_vals))
    globals()[serr_name] = std_error
    print(a, "mean:", mean, "sqrt(mean):", np.sqrt(mean), "std:", sample_std,
          "serr:", std_error, "sqrt(lambda)/sigma =", np.sqrt(mean) / sample_std)

    # plotting with Poisson: unit-width bins starting at 0
    plt.figure()
    bin_edges = np.arange(0, max_cps + 1.1, 1)
    histogram = plt.hist(cps_vals, density=True, bins=bin_edges)
    plt.xlabel("Counts Per Second")
    plt.ylabel("Probability of Occurence")
    pops, bins = histogram[0], histogram[1]
    maxidx = np.argmax(pops)
    maxpop = pops[maxidx]
    maxbin = np.max(bins)
    most_populated_bin = bins[maxidx]
    # Poisson pmf with the sample mean as rate, evaluated at 0, 1, ..., < maxbin
    count_axis = np.arange(0, maxbin)
    plt.plot(count_axis, poisson.pmf(count_axis, np.mean(cps_vals)), c="black")
This is the relevant line for the Poisson plot:
plt.plot(np.arange(0, maxbin), poisson.pmf(np.arange(0,maxbin), np.mean(cps_vals)),c="black")
If I try to make the np.arange spacing smaller, I get ringing in the Poisson curves:
I think this is because it needs integer values of counts?
How can I produce a smooth Gaussian curve for the background count? The one I'm getting doesn't look right.
# Draw Poisson samples, histogram them on unit-width bins, and overlay the
# pmf scaled by the number of samples.
mu = 15
n_draws = 100000
unit_edges = np.linspace(0, 35, 36)  # 36 evenly spaced values from 0 to 35
r = poisson.rvs(mu, size=n_draws)
plt.hist(r, bins=unit_edges, alpha=0.5, label='counting process',
         ec='black', align='left')
# Only y-values are passed, so the curve is drawn against the array index.
plt.plot(n_draws * poisson.pmf(unit_edges, mu))
plt.legend()
Gives:

How to implement kmeans clustering as a feature for classification techniques in SVM?

I've already created a clustering and saved the model, but I'm confused about what I should do with this model and how to use it as a feature for classification.
The clustering is based on the coordinates of crime locations. After the data has been clustered, I want to use the cluster assignments as features in an SVM.
import pandas as pd
import matplotlib.pyplot as plt
import random
import numpy as np
import xlrd
import pickle
import tkinter as tk
from tkinter import *
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')

# --- k-means section ---
# Read the X/Y coordinate columns and pair them row by row into an (n, 2) array.
data = pd.read_excel("sanfrancisco.xlsx")
x1 = data['X']
y1 = data['Y']
X = np.array(list(zip(x1, y1)))

# Elbow method: record the inertia of a k-means fit for 1..10 clusters.
from sklearn.cluster import KMeans
wcss = []  # one inertia value (within-cluster sum of squares) per cluster count
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++')
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plot1 = plt.figure(1)
plt.xlabel("Number of Clusters")
plt.ylabel("Euclidean Distance")
plt.plot(range(1, 11), wcss)

k = 3
# data visual section.. Eg: how many crimes in diff month, most number of crime in a day in a week
# most number crime in what address, most number of crimes in what city, how many crime occur
# in how much time. , etc..

# Random integer "initial centroids". They are only printed: the KMeans model
# below is not given C and picks its own starting points (init='random').
C_x = np.random.randint(0, np.max(X)-20, size=k)
C_y = np.random.randint(0, np.max(X)-20, size=k)
C = np.array(list(zip(C_x, C_y)), dtype=np.float32)
print("Initial Centroids")
print(C)

# n_clusters is the number of clusters; init='random' starts from random data
# points. n_init=1 requests a single initialisation instead of several restarts.
model = KMeans(n_clusters=k, init='random', n_init=1)
# Only the fitted model is needed; the transformed distances were never used.
model.fit(X)
centroids = model.cluster_centers_  # final centroids

# One colour per cluster label (labels are mapped as floats below). The palette
# covers up to six clusters, so every k from 1 to 6 gets a complete mapping.
palette = ['y', 'c', 'fuchsia', 'lime', 'orange', 'tomato']
rgb_colors = {float(label): colour
              for label, colour in enumerate(palette[:max(k, 3)])}

new_labels = pd.Series(model.labels_.astype(float))  # labels predicted by k-means
plot2 = plt.figure(2)
plt.scatter(x1, y1, c=new_labels.map(rgb_colors), s=20)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', c='black', s=200)
plt.xlabel('Final Cluster Centers\n Iteration Count=' + str(model.n_iter_) +
           '\n Objective Function Value: ' + str(model.inertia_))
plt.ylabel('y')
plt.title("k-Means")
plt.show()

# save the model to disk; the context manager closes the file once it is written
filename = 'clusteredmatrix.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
Your question is not very clear, but if you want to see how the clusters behave, I recommend using a tool like Weka, so that you can cluster the data freely and draw meaningful inferences before going into complex coding!

Display issue of fitted curve: cannot solve coarseness

Despite having a working script for curve fitting using the lmfit library, I am not able to solve a display issue. Indeed, having only 5 dependent values, the resulting graph is rather coarse.
Before switching to lmfit, I was using curve_fit and could solve the display issue by simply using np.linspace and plot the optimized values resulting from the fit procedure. Then, I was displaying the "real" values through plt.errorbar. With lmfit, the above solution yields a mismatch error, since it recognizes the "fake" independent variables and launches a mismatch type error.
My full script is the following:
import lmfit as lf
from lmfit import Model, Parameters
import numpy as np
import matplotlib.pyplot as plt
from math import atan
def on_res(omega_eff, thetas, R2avg=5, k_ex=0.1, phi_ex=500):
    """Evaluate the model that is fitted to the R1rho data.

    The result is sin(thetas)**2 times R2avg, plus sin(thetas)**2 times the
    term phi_ex * k_ex / (k_ex**2 + omega_eff**2).

    omega_eff and thetas are the independent variables (scalars or arrays that
    broadcast together); R2avg, k_ex and phi_ex are the fit parameters.
    """
    sin_sq = (np.sin(thetas))**2
    exchange = phi_ex*k_ex/(k_ex**2 + omega_eff**2)
    return R2avg*sin_sq + sin_sq*exchange
# Wrap on_res as an lmfit model with two independent variables and give the
# three fit parameters their starting values.
model = Model(on_res, independent_vars=['omega_eff', 'thetas'])
params = model.make_params(R2avg=5, k_ex=0.01, phi_ex=1500)

carrier = 6146.53
O_1 = 5846
spin_locks = (1000, 2000, 3000, 4000, 5000)
delta_omega = (O_1 - carrier)

# One omega_eff and one theta value per spin-lock entry.
omega_eff1, omega_eff2, omega_eff3, omega_eff4, omega_eff5 = [
    ((delta_omega**2) + (lock**2))**0.5 for lock in spin_locks
]
theta_rad1, theta_rad2, theta_rad3, theta_rad4, theta_rad5 = [
    atan(lock/delta_omega) for lock in spin_locks
]

# omega_eff values divided by 1000 are used both for the fit and as plot x-axis
x = tuple(value/1000 for value in
          (omega_eff1, omega_eff2, omega_eff3, omega_eff4, omega_eff5))
theta = (theta_rad1, theta_rad2, theta_rad3, theta_rad4, theta_rad5)
R1rho_vals = (7.9328, 6.2642, 6.0005, 5.9972, 5.988)
e = (0.2, 0.2, 0.2, 0.2, 0.2)

new_x = np.linspace(0, 6, 1000)  # dense grid intended for a smooth curve

omega_eff = np.array(x, dtype=float)
thetas = np.array(theta, dtype=float)
R1rho_vals = np.array(R1rho_vals, dtype=float)
error = np.array(e, dtype=float)

R2avg = []
k_ex = []
phi_ex = []

# Weighted fit (weights = 1/error) using the "emcee" method with 1000 steps.
result = model.fit(R1rho_vals, params, weights=1/error, thetas=thetas,
                   omega_eff=omega_eff, method="emcee", steps=1000)
print(result.fit_report())

plt.errorbar(x, R1rho_vals, yerr=error, fmt=".k", markersize=8, capsize=3)
# NOTE(review): new_x has 1000 points; result.best_fit presumably has one value
# per fitted data point, which would explain the reported shape mismatch.
plt.plot(new_x, result.best_fit)
plt.show()
As you can see running it, it launches the mismatch shape error message. Changing the plt.plot line to plt.plot(x, result.best_fit) yields the graph correctly, but displaying a very coarse profile (as one would expect, having only 5 points on the x-axis).
Are you aware of any way to solve this? Checking the documentation, I noticed the examples provided all plot the results via the actual independent variables values, since they have enough experimental values.
You need to re-evaluate the ModelResult with your new values for the independent variables:
# Evaluate the fitted model on a dense grid. Every independent variable has to
# be supplied with the same length, so omega_eff and thetas are both rebuilt
# from a dense spin-lock grid with the same formulas used for the data points
# (omega_eff is divided by 1000 exactly once, as for the measured x values).
dense_spin_locks = np.linspace(spin_locks[0], spin_locks[-1], 1000)
dense_omega_eff = ((delta_omega**2) + (dense_spin_locks**2))**0.5 / 1000
dense_thetas = np.arctan(dense_spin_locks / delta_omega)
plt.plot(dense_omega_eff,
         result.eval(omega_eff=dense_omega_eff, thetas=dense_thetas))

how to isolate data that are 2 and 3 sigma deviated from mean and then mark them in a plot in python?

I am reading from a dataset which looks like the following when plotted in matplotlib and then taken the best fit curve using linear regression.
The sample of data looks like following:
# ID X Y px py pz M R
1.04826492772e-05 1.04828050287e-05 1.048233088e-05 0.000107002791008 0.000106552433081 0.000108704469007 387.02 4.81947797625e+13
1.87380963036e-05 1.87370588085e-05 1.87372620448e-05 0.000121616280029 0.000151924707761 0.00012371156585 428.77 6.54636174067e+13
3.95579877816e-05 3.95603773653e-05 3.95610756809e-05 0.000163470663023 0.000265203868883 0.000228031803626 470.74 8.66961875758e+13
My code looks the following:
# Regression Function
def regress(x, y):
    """Fit a straight line y = b1*x + b0 by least squares.

    Returns a tuple (y_pred, p): y_pred is the fitted line evaluated at x, and
    p is the full result returned by scipy.stats.linregress, whose first two
    entries are the slope and the intercept.
    """
    p = sp.stats.linregress(x, y)
    b1, b0 = p[0], p[1]
    # np.polyval is called directly rather than through the scipy re-export.
    y_pred = np.polyval([b1, b0], x)
    return y_pred, p
# plotting z
xz, yz = M, Y_z  # raw, non-transformed data
# The line is fitted to log(y); exp() maps the prediction back for plotting.
y_pred, _ = regress(xz, np.log(yz))
plt.semilogy(xz, yz, linestyle='None', marker='o', markersize=4, color='b',
             label="l.o.s within R500")
plt.semilogy(xz, np.exp(y_pred), "b", label='best fit')
However I can see a lot upward scatter in the data and the best fit curve is affected by those. So first I want to isolate the data points which are 2 and 3 sigma away from my mean data, and mark them with circle around them.
Then take the best fit curve considering only the points which fall within 1 sigma of my mean data
Is there a good function in python which can do that for me?
Also in addition to that may I also isolate the data from my actual dataset, like if the third row in the sample input represents 2 sigma deviation may I have that row as an output too to save later and investigate more?
Your help is most appreciated.
Here's some code that goes through the data in a given number of windows, calculates statistics in said windows, and separates data in well- and misbehaved lists.
Hope this helps.
from scipy import stats
from scipy import polyval
import numpy as np
import matplotlib.pyplot as plt
num_data = 10000
# x: uniform samples shifted by 12.8, sorted ascending.
# y: exp(x) plus zero-mean Gaussian noise with standard deviation 50000.
fake_data_x = 12.8 + np.random.random(num_data)
fake_data_x.sort()
fake_data_y = np.exp(fake_data_x) + np.random.normal(loc=0, scale=50000, size=num_data)
# Regression Function
def regress(x, y):
    """Fit a straight line y = b1*x + b0 by least squares.

    Returns a tuple (y_pred, p): y_pred is the fitted line evaluated at x, and
    p is the full result returned by scipy.stats.linregress, whose first two
    entries are the slope and the intercept.
    """
    p = stats.linregress(x, y)
    b1, b0 = p[0], p[1]
    # np.polyval is called directly rather than through the scipy re-export.
    y_pred = np.polyval([b1, b0], x)
    return y_pred, p
# plotting z
xz, yz = fake_data_x, fake_data_y  # data, non-transformed
# The line is fitted to log(y); exp() maps the prediction back for plotting.
y_pred, _ = regress(xz, np.log(yz))
plt.figure()
plt.semilogy(xz, yz, marker='o', color='b', markersize=4, linestyle='None',
             label="l.o.s within R500")
plt.semilogy(xz, np.exp(y_pred), "b", label='best fit')
plt.show()

# Split the x-range into equal-width windows and, inside each window, separate
# the points whose y lies 2 or more standard deviations from the window mean.
num_bin_intervals = 10  # number of averaging windows
# N windows need N + 1 boundaries.
window_boundaries = np.linspace(min(fake_data_x), max(fake_data_x),
                                num_bin_intervals + 1)
y_good = []  # list to collect the "well-behaved" y-axis data
x_good = []  # list to collect the "well-behaved" x-axis data
y_outlier = []
x_outlier = []
for i in range(len(window_boundaries) - 1):
    lower, upper = window_boundaries[i], window_boundaries[i + 1]
    # Boolean mask for the window (lower, upper]; the first window also
    # includes its lower edge so the smallest x value is not dropped.
    if i == 0:
        window_indices = (fake_data_x >= lower) & (fake_data_x <= upper)
    else:
        window_indices = (fake_data_x > lower) & (fake_data_x <= upper)
    if not window_indices.any():
        continue  # nothing to average in an empty window
    # separate the pieces of data in the window
    fake_data_x_slice = fake_data_x[window_indices]
    fake_data_y_slice = fake_data_y[window_indices]
    # mean and spread of y in the window
    y_mean = np.mean(fake_data_y_slice)
    y_std = np.std(fake_data_y_slice)
    # one mask decides both groups, so every point lands in exactly one list
    is_outlier = np.abs(fake_data_y_slice - y_mean) >= 2 * y_std
    y_outlier.extend(fake_data_y_slice[is_outlier])
    x_outlier.extend(fake_data_x_slice[is_outlier])
    y_good.extend(fake_data_y_slice[~is_outlier])
    x_good.extend(fake_data_x_slice[~is_outlier])
plt.figure()
plt.semilogy(x_good, y_good, 'o')
plt.semilogy(x_outlier, y_outlier, 'r*')
plt.show()

[scikit learn]: Anomaly Detection - Alternative for OneClassSVM

I have implemented LinearSVC and SVC from the sklearn-framework for text classification.
I am using TfidfVectorizer to get a sparse representation of the input data, which consists of two different classes (benign data and malicious data). This part is working fine, but now I want to implement some kind of anomaly detection by using the OneClassSVM classifier and training a model with only one class (outlier detection). Unfortunately it does not work with sparse data. Some developers are working on a patch (https://github.com/scikit-learn/scikit-learn/pull/1586), but there are some bugs, so there is no solution yet for using the OneClassSVM implementation.
Are there any other methods in the sklearn-framework for doing something like that? I am looking over the examples but nothing seems to fit.
Thanks!
A bit late, but in case anyone else is looking for information on this... There's a third-party anomaly detection module for sklearn here: http://www.cit.mak.ac.ug/staff/jquinn/software/lsanomaly.html, based on least-squares methods. It should be a plug-in replacement for OneClassSVM.
Unfortunately, scikit-learn currently implements only a one-class SVM and a robust covariance estimator for outlier detection.
You can try a comparison of these methods (as provided in the docs) by examining their differences on 2D data:
import numpy as np
import pylab as pl
import matplotlib.font_manager
from scipy import stats
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# define two outlier detection tools to be compared
classifiers = {
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1),
    "robust covariance estimator": EllipticEnvelope(contamination=.1)}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
# 1 marks an inlier, 0 an outlier; the outliers are appended last below
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = 0

# Fit the problem with varying cluster separation
for offset in clusters_separation:
    np.random.seed(42)
    # Data generation: two inlier clusters shifted by -offset and +offset.
    # Array dimensions must be integers; the second cluster takes the remainder
    # so the two clusters always add up to n_inliers.
    half_inliers = n_inliers // 2
    X1 = 0.3 * np.random.randn(half_inliers, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers - half_inliers, 2) + offset
    X = np.r_[X1, X2]
    # Add outliers
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

    # One figure per separation, one subplot per classifier
    pl.figure(figsize=(10, 5))
    for clf_index, (clf_name, clf) in enumerate(classifiers.items()):
        # fit the data and tag outliers
        clf.fit(X)
        y_pred = clf.decision_function(X).ravel()
        # scores above the outliers_fraction percentile are predicted inliers
        threshold = stats.scoreatpercentile(y_pred,
                                            100 * outliers_fraction)
        y_pred = y_pred > threshold
        n_errors = (y_pred != ground_truth).sum()
        # plot the levels lines and the points
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        subplot = pl.subplot(1, 2, clf_index + 1)
        subplot.set_title("Outlier detection")
        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                         cmap=pl.cm.Blues_r)
        a = subplot.contour(xx, yy, Z, levels=[threshold],
                            linewidths=2, colors='red')
        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                         colors='orange')
        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white')
        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black')
        subplot.axis('tight')
        subplot.legend(
            [a.collections[0], b, c],
            ['learned decision function', 'true inliers', 'true outliers'],
            prop=matplotlib.font_manager.FontProperties(size=11))
        subplot.set_xlabel("%d. %s (errors: %d)" % (clf_index + 1, clf_name, n_errors))
        subplot.set_xlim((-7, 7))
        subplot.set_ylim((-7, 7))
    pl.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)

pl.show()

Categories