Related
I'm trying to follow this as an example but can't seem to adapt it to work with my dataset since I need truncated normals:
https://stackoverflow.com/questions/35990467/fit-two-gaussians-to-a-histogram-from-one-set-of-data-python#=
I have a dataset that is definitely a mixture of 2 truncated normals. The minimum value in the domain is 0 and the maximum is 1. I want to create an object that I can fit to optimize the parameters and get the likelihood of a sequence of numbers being drawn from that distribution. One option may be to just use the KDE model and using the pdf to get the likelihood. However, I want the exact mean and standard deviations of the 2 distributions. I guess I could, split the data in half and then model the 2 normals separately but I also want to learn how to use optimize in SciPy. I'm just starting to experiment with this type of statistical analysis so my apologies if this seems naive.
I'm not sure how to get a pdf this way that can integrate to 1 and have a domain constrained between 0 and 1.
import requests
from ast import literal_eval
from scipy import optimize, stats
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Actual Data
u = np.asarray(literal_eval(requests.get("https://pastebin.com/raw/hP5VJ9vr").text))
# u.size ==> 6000
u.min(), u.max()
# (1.3628525454666037e-08, 0.99973136607553781)
# Distribution
with plt.style.context("seaborn-white"):
fig, ax = plt.subplots()
sns.kdeplot(u, color="black", ax=ax)
ax.axvline(0, linestyle=":", color="red")
ax.axvline(1, linestyle=":", color="red")
kde = stats.gaussian_kde(u)
# KDE Model
def truncated_gaussian_lower(x,mu,sigma,A):
return np.clip(A*np.exp(-(x-mu)**2/2/sigma**2), a_min=0, a_max=None)
def truncated_gaussian_upper(x,mu,sigma,A):
return np.clip(A*np.exp(-(x-mu)**2/2/sigma**2), a_min=None, a_max=1)
def mixture_model(x,mu1,sigma1,A1,mu2,sigma2,A2):
return truncated_gaussian_lower(x,mu1,sigma1,A1) + truncated_gaussian_upper(x,mu2,sigma2,A2)
kde = stats.gaussian_kde(u)
# Estimates: mu sigma A
estimates= [0.1, 1, 3,
0.9, 1, 1]
params,cov= optimize.curve_fit(mixture_model,u,kde.pdf(u),estimates )
# ---------------------------------------------------------------------------
# RuntimeError Traceback (most recent call last)
# <ipython-input-265-b2efb2ca0e0a> in <module>()
# 32 estimates= [0.1, 1, 3,
# 33 0.9, 1, 1]
# ---> 34 params,cov= optimize.curve_fit(mixture_model,u,kde.pdf(u),estimates )
# /Users/mu/anaconda/lib/python3.6/site-packages/scipy/optimize/minpack.py in curve_fit(f, xdata, ydata, p0, sigma, absolute_sigma, check_finite, bounds, method, jac, **kwargs)
# 738 cost = np.sum(infodict['fvec'] ** 2)
# 739 if ier not in [1, 2, 3, 4]:
# --> 740 raise RuntimeError("Optimal parameters not found: " + errmsg)
# 741 else:
# 742 # Rename maxfev (leastsq) to max_nfev (least_squares), if specified.
# RuntimeError: Optimal parameters not found: Number of calls to function has reached maxfev = 1400.
In response to #Uvar 's very helpful explanation below. I am trying to test the integral from 0 - 1 to see if it equals 1 but I'm getting 0.3. I think I'm missing a crucial step in logic:
# KDE Model
def truncated_gaussian(x,mu,sigma,A):
return A*np.exp(-(x-mu)**2/2/sigma**2)
def mixture_model(x,mu1,sigma1,A1,mu2,sigma2,A2):
if type(x) == np.ndarray:
norm_probas = truncated_gaussian(x,mu1,sigma1,A1) + truncated_gaussian(x,mu2,sigma2,A2)
mask_lower = x < 0
mask_upper = x > 1
mask_floor = (mask_lower.astype(int) + mask_upper.astype(int)) > 1
norm_probas[mask_floor] = 0
return norm_probas
else:
if (x < 0) or (x > 1):
return 0
return truncated_gaussian_lower(x,mu1,sigma1,A1) + truncated_gaussian_upper(x,mu2,sigma2,A2)
kde = stats.gaussian_kde(u, bw_method=2e-2)
# # Estimates: mu sigma A
estimates= [0.1, 1, 3,
0.9, 1, 1]
params,cov= optimize.curve_fit(mixture_model,u,kde.pdf(u)/integrate.quad(kde, 0 , 1)[0],estimates ,maxfev=5000)
# params
# array([ 9.89751700e-01, 1.92831695e-02, 7.84324114e+00,
# 3.73623345e-03, 1.07754038e-02, 3.79238972e+01])
# Test the integral from 0 - 1
x = np.linspace(0,1,1000)
with plt.style.context("seaborn-white"):
fig, ax = plt.subplots()
ax.plot(x, kde(x), color="black", label="Data")
ax.plot(x, mixture_model(x, *params), color="red", label="Model")
ax.legend()
# Integrating from 0 to 1
integrate.quad(lambda x: mixture_model(x, *params), 0,1)[0]
# 0.3026863969781809
It seems you are misspecifying the fitting procedure.
You are trying to fit the kde.pdf(u) while constraining half-bounds.
foo = kde.pdf(u)
min(foo)
Out[329]: 0.22903365654960098
max(foo)
Out[330]: 4.0119283429320332
As you can see, the probability density function of u is not constrained to [0,1].
As such, just deleting the clipping action, will result in an exact fit.
def truncated_gaussian_lower(x,mu,sigma,A):
return A*np.exp((-(x-mu)**2)/(2*sigma**2))
def truncated_gaussian_upper(x,mu,sigma,A):
return A * np.exp((-(x-mu)**2)/(2*sigma**2))
def mixture_model(x,mu1,sigma1,A1,mu2,sigma2,A2):
return truncated_gaussian_lower(x,mu1,sigma1,A1) + truncated_gaussian_upper(x,mu2,sigma2,A2)
estimates= [0.15, 1, 3,
0.95, 1, 1]
params,cov= optimize.curve_fit(f=mixture_model, xdata=u, ydata=kde.pdf(u), p0=estimates)
params
Out[327]:
array([ 0.00672248, 0.07462657, 4.01188383, 0.98006841, 0.07654998,
1.30569665])
y3 = mixture_model(u, params[0], params[1], params[2], params[3], params[4], params[5])
plt.plot(kde.pdf(u)+0.1) #add offset for visual inspection purpose
plt.plot(y3)
So, let's now say I change what I am plotting to:
plt.figure(); plt.plot(u,y3,'.')
Because, indeed:
np.allclose(y3, kde(u), atol=1e-2)
>>True
You can edit the mixture model a bit to be 0 outside of the domain [0, 1]:
def mixture_model(x,mu1,sigma1,A1,mu2,sigma2,A2):
if (x < 0) or (x > 1):
return 0
return truncated_gaussian_lower(x,mu1,sigma1,A1) + truncated_gaussian_upper(x,mu2,sigma2,A2)
Doing so, however, will lose the option of instantly evaluating the function over an array of x.. So for the sake of argument, I will leave it out for now.
Anyway, we want our integral to sum up to 1 in the domain [0, 1], and one way to do this (feel free to play around with the bandwidth estimator in stats.gaussian_kde as well..) is to divide the probability density estimate by its integral over the domain. Take care that optimize.curve_fit only takes 1400 iterations in this implementation, so the initial parameter estimates matter.
from scipy import integrate
sum_prob = integrate.quad(kde, 0 , 1)[0]
y = kde(u)/sum_prob
# Estimates: mu sigma A
estimates= [0.15, 1, 5,
0.95, 0.5, 3]
params,cov= optimize.curve_fit(f=mixture_model, xdata=u, ydata=y, p0=estimates)
>>array([ 6.72247814e-03, 7.46265651e-02, 7.23699661e+00,
9.80068414e-01, 7.65499825e-02, 2.35533297e+00])
y3 = mixture_model(np.arange(0,1,0.001), params[0], params[1], params[2],
params[3], params[4], params[5])
with plt.style.context("seaborn-white"):
fig, ax = plt.subplots()
sns.kdeplot(u, color="black", ax=ax)
ax.axvline(0, linestyle=":", color="red")
ax.axvline(1, linestyle=":", color="red")
plt.plot(np.arange(0,1,0.001), y3) #The red line is now your custom pdf with area-under-curve = 0.998 in the domain..
To check for the area under the curve, I used this hacky solution of redefining mixture_model..:
def mixture_model(x):
mu1=params[0]; sigma1=params[1]; A1=params[2]; mu2=params[3]; sigma2=params[4]; A2=params[5]
return truncated_gaussian_lower(x,mu1,sigma1,A1) + truncated_gaussian_upper(x,mu2,sigma2,A2)
from scipy import integrate
integrated_value, error = integrate.quad(mixture_model, 0, 1) #0 lower bound, 1 upper bound
>>(0.9978588016186962, 5.222293368393178e-14)
Or doing the integral a second way:
import sympy
x = sympy.symbols('x', real=True, nonnegative=True)
foo = sympy.integrate(params[2]*sympy.exp((-(x-params[0])**2)/(2*params[1]**2))+params[5]*sympy.exp((-(x-params[3])**2)/(2*params[4]**2)),(x,0,1), manual=True)
foo.doit()
>>0.562981541724715*sqrt(pi) #this evaluates to 0.9978588016186956
And actually doing it your way as described in your edited question:
def mixture_model(x,mu1,sigma1,A1,mu2,sigma2,A2):
return truncated_gaussian_lower(x,mu1,sigma1,A1) + truncated_gaussian_upper(x,mu2,sigma2,A2)
integrate.quad(lambda x: mixture_model(x, *params), 0,1)[0]
>>0.9978588016186962
If I set my bandwidth to your level (2e-2), indeed the evaluation comes down to 0.92, which is a worse result than the 0.998 we had earlier, but that is still significantly different from the 0.3 you report which is something I cannot recreate, even while copying your code snippets. Do you perhaps accidentally redefine functions/variables somewhere?
I am looking for a way to graph grid_scores_ from GridSearchCV in sklearn. In this example I am trying to grid search for best gamma and C parameters for an SVR algorithm. My code looks as follows:
C_range = 10.0 ** np.arange(-4, 4)
gamma_range = 10.0 ** np.arange(-4, 4)
param_grid = dict(gamma=gamma_range.tolist(), C=C_range.tolist())
grid = GridSearchCV(SVR(kernel='rbf', gamma=0.1),param_grid, cv=5)
grid.fit(X_train,y_train)
print(grid.grid_scores_)
After I run the code and print the grid scores I get the following outcome:
[mean: -3.28593, std: 1.69134, params: {'gamma': 0.0001, 'C': 0.0001}, mean: -3.29370, std: 1.69346, params: {'gamma': 0.001, 'C': 0.0001}, mean: -3.28933, std: 1.69104, params: {'gamma': 0.01, 'C': 0.0001}, mean: -3.28925, std: 1.69106, params: {'gamma': 0.1, 'C': 0.0001}, mean: -3.28925, std: 1.69106, params: {'gamma': 1.0, 'C': 0.0001}, mean: -3.28925, std: 1.69106, params: {'gamma': 10.0, 'C': 0.0001},etc]
I would like to visualize all the scores (mean values) depending on gamma and C parameters. The graph I am trying to obtain should look as follows:
Where x-axis is gamma, y-axis is mean score (root mean square error in this case), and different lines represent different C values.
The code shown by #sascha is correct. However, the grid_scores_ attribute will be soon deprecated. It is better to use the cv_results attribute.
It can be implemente in a similar fashion to that of #sascha method:
def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2):
# Get Test Scores Mean and std for each grid search
scores_mean = cv_results['mean_test_score']
scores_mean = np.array(scores_mean).reshape(len(grid_param_2),len(grid_param_1))
scores_sd = cv_results['std_test_score']
scores_sd = np.array(scores_sd).reshape(len(grid_param_2),len(grid_param_1))
# Plot Grid search scores
_, ax = plt.subplots(1,1)
# Param1 is the X-axis, Param 2 is represented as a different curve (color line)
for idx, val in enumerate(grid_param_2):
ax.plot(grid_param_1, scores_mean[idx,:], '-o', label= name_param_2 + ': ' + str(val))
ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
ax.set_xlabel(name_param_1, fontsize=16)
ax.set_ylabel('CV Average Score', fontsize=16)
ax.legend(loc="best", fontsize=15)
ax.grid('on')
# Calling Method
plot_grid_search(pipe_grid.cv_results_, n_estimators, max_features, 'N Estimators', 'Max Features')
The above results in the following plot:
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
digits = datasets.load_digits()
X = digits.data
y = digits.target
clf_ = SVC(kernel='rbf')
Cs = [1, 10, 100, 1000]
Gammas = [1e-3, 1e-4]
clf = GridSearchCV(clf_,
dict(C=Cs,
gamma=Gammas),
cv=2,
pre_dispatch='1*n_jobs',
n_jobs=1)
clf.fit(X, y)
scores = [x[1] for x in clf.grid_scores_]
scores = np.array(scores).reshape(len(Cs), len(Gammas))
for ind, i in enumerate(Cs):
plt.plot(Gammas, scores[ind], label='C: ' + str(i))
plt.legend()
plt.xlabel('Gamma')
plt.ylabel('Mean score')
plt.show()
Code is based on this.
Only puzzling part: will sklearn always respect the order of C & Gamma -> official example uses this "ordering"
Output:
For plotting the results when tuning several hyperparameters, what I did was fixed all parameters to their best value except for one and plotted the mean score for the other parameter for each of its values.
def plot_search_results(grid):
"""
Params:
grid: A trained GridSearchCV object.
"""
## Results from grid search
results = grid.cv_results_
means_test = results['mean_test_score']
stds_test = results['std_test_score']
means_train = results['mean_train_score']
stds_train = results['std_train_score']
## Getting indexes of values per hyper-parameter
masks=[]
masks_names= list(grid.best_params_.keys())
for p_k, p_v in grid.best_params_.items():
masks.append(list(results['param_'+p_k].data==p_v))
params=grid.param_grid
## Ploting results
fig, ax = plt.subplots(1,len(params),sharex='none', sharey='all',figsize=(20,5))
fig.suptitle('Score per parameter')
fig.text(0.04, 0.5, 'MEAN SCORE', va='center', rotation='vertical')
pram_preformace_in_best = {}
for i, p in enumerate(masks_names):
m = np.stack(masks[:i] + masks[i+1:])
pram_preformace_in_best
best_parms_mask = m.all(axis=0)
best_index = np.where(best_parms_mask)[0]
x = np.array(params[p])
y_1 = np.array(means_test[best_index])
e_1 = np.array(stds_test[best_index])
y_2 = np.array(means_train[best_index])
e_2 = np.array(stds_train[best_index])
ax[i].errorbar(x, y_1, e_1, linestyle='--', marker='o', label='test')
ax[i].errorbar(x, y_2, e_2, linestyle='-', marker='^',label='train' )
ax[i].set_xlabel(p.upper())
plt.legend()
plt.show()
I wanted to do something similar (but scalable to a large number of parameters) and here is my solution to generate swarm plots of the output:
score = pd.DataFrame(gs_clf.grid_scores_).sort_values(by='mean_validation_score', ascending = False)
for i in parameters.keys():
print(i, len(parameters[i]), parameters[i])
score[i] = score.parameters.apply(lambda x: x[i])
l =['mean_validation_score'] + list(parameters.keys())
for i in list(parameters.keys()):
sns.swarmplot(data = score[l], x = i, y = 'mean_validation_score')
#plt.savefig('170705_sgd_optimisation//'+i+'.jpg', dpi = 100)
plt.show()
The order that the parameter grid is traversed is deterministic, such that it can be reshaped and plotted straightforwardly. Something like this:
scores = [entry.mean_validation_score for entry in grid.grid_scores_]
# the shape is according to the alphabetical order of the parameters in the grid
scores = np.array(scores).reshape(len(C_range), len(gamma_range))
for c_scores in scores:
plt.plot(gamma_range, c_scores, '-')
here's a solution that makes use of seaborn pointplot. the advantage of this method is that it will allow you to plot results when searching across more than 2 parameters
import seaborn as sns
import pandas as pd
def plot_cv_results(cv_results, param_x, param_z, metric='mean_test_score'):
"""
cv_results - cv_results_ attribute of a GridSearchCV instance (or similar)
param_x - name of grid search parameter to plot on x axis
param_z - name of grid search parameter to plot by line color
"""
cv_results = pd.DataFrame(cv_results)
col_x = 'param_' + param_x
col_z = 'param_' + param_z
fig, ax = plt.subplots(1, 1, figsize=(11, 8))
sns.pointplot(x=col_x, y=metric, hue=col_z, data=cv_results, ci=99, n_boot=64, ax=ax)
ax.set_title("CV Grid Search Results")
ax.set_xlabel(param_x)
ax.set_ylabel(metric)
ax.legend(title=param_z)
return fig
Example usage with xgboost:
from xgboost import XGBRegressor
from sklearn import GridSearchCV
params = {
'max_depth': [3, 6, 9, 12],
'gamma': [0, 1, 10, 20, 100],
'min_child_weight': [1, 4, 16, 64, 256],
}
model = XGBRegressor()
grid = GridSearchCV(model, params, scoring='neg_mean_squared_error')
grid.fit(...)
fig = plot_cv_results(grid.cv_results_, 'gamma', 'min_child_weight')
This will produce a figure that shows the gamma regularization parameter on the x-axis, the min_child_weight regularization parameter in the line color, and any other grid search parameters (in this case max_depth) will be described by the spread of the 99% confidence interval of the seaborn pointplot.
*Note in the example below I have changed the aesthetics slightly from the code above.
I used grid search on xgboost with different learning rates, max depths and number of estimators.
gs_param_grid = {'max_depth': [3,4,5],
'n_estimators' : [x for x in range(3000,5000,250)],
'learning_rate':[0.01,0.03,0.1]
}
gbm = XGBRegressor()
grid_gbm = GridSearchCV(estimator=gbm,
param_grid=gs_param_grid,
scoring='neg_mean_squared_error',
cv=4,
verbose=1
)
grid_gbm.fit(X_train,y_train)
To create the graph for error vs number of estimators with different learning rates, I used the following approach:
y=[]
cvres = grid_gbm.cv_results_
best_md=grid_gbm.best_params_['max_depth']
la=gs_param_grid['learning_rate']
n_estimators=gs_param_grid['n_estimators']
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
if params["max_depth"]==best_md:
y.append(np.sqrt(-mean_score))
y=np.array(y).reshape(len(la),len(n_estimators))
%matplotlib inline
plt.figure(figsize=(8,8))
for y_arr, label in zip(y, la):
plt.plot(n_estimators, y_arr, label=label)
plt.title('Error for different learning rates(keeping max_depth=%d(best_param))'%best_md)
plt.legend()
plt.xlabel('n_estimators')
plt.ylabel('Error')
plt.show()
The plot can be viewed here:
Note that the graph can similarly be created for error vs number of estimators with different max depth (or any other parameters as per the user's case).
Here's fully working code that will produce plots so you can fully visualize the varying of up to 3 parameters using GridSearchCV. This is what you will see when running the code:
Parameter1 (x-axis)
Cross Validaton Mean Score (y-axis)
Parameter2 (extra line plotted for each different Parameter2 value, with a legend for reference)
Parameter3 (extra charts will pop up for each different Parameter3 value, allowing you to view differences between these different charts)
For each line plotted, also shown is a standard deviation of what you can expect the Cross Validation Mean Score to do based on the multiple CV's you're running. Enjoy!
from sklearn import tree
from sklearn import model_selection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data, digits.target
Algo = [['DecisionTreeClassifier', tree.DecisionTreeClassifier(), # algorithm
'max_depth', [1, 2, 4, 6, 8, 10, 12, 14, 18, 20, 22, 24, 26, 28, 30], # Parameter1
'max_features', ['sqrt', 'log2', None], # Parameter2
'criterion', ['gini', 'entropy']]] # Parameter3
def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2, title):
# Get Test Scores Mean and std for each grid search
grid_param_1 = list(str(e) for e in grid_param_1)
grid_param_2 = list(str(e) for e in grid_param_2)
scores_mean = cv_results['mean_test_score']
scores_std = cv_results['std_test_score']
params_set = cv_results['params']
scores_organized = {}
std_organized = {}
std_upper = {}
std_lower = {}
for p2 in grid_param_2:
scores_organized[p2] = []
std_organized[p2] = []
std_upper[p2] = []
std_lower[p2] = []
for p1 in grid_param_1:
for i in range(len(params_set)):
if str(params_set[i][name_param_1]) == str(p1) and str(params_set[i][name_param_2]) == str(p2):
mean = scores_mean[i]
std = scores_std[i]
scores_organized[p2].append(mean)
std_organized[p2].append(std)
std_upper[p2].append(mean + std)
std_lower[p2].append(mean - std)
_, ax = plt.subplots(1, 1)
# Param1 is the X-axis, Param 2 is represented as a different curve (color line)
# plot means
for key in scores_organized.keys():
ax.plot(grid_param_1, scores_organized[key], '-o', label= name_param_2 + ': ' + str(key))
ax.fill_between(grid_param_1, std_lower[key], std_upper[key], alpha=0.1)
ax.set_title(title)
ax.set_xlabel(name_param_1)
ax.set_ylabel('CV Average Score')
ax.legend(loc="best")
ax.grid('on')
plt.show()
dataset = 'Titanic'
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
cv_split = model_selection.KFold(n_splits=10, random_state=2)
for i in range(len(Algo)):
name = Algo[0][0]
alg = Algo[0][1]
param_1_name = Algo[0][2]
param_1_range = Algo[0][3]
param_2_name = Algo[0][4]
param_2_range = Algo[0][5]
param_3_name = Algo[0][6]
param_3_range = Algo[0][7]
for p in param_3_range:
# grid search
param = {
param_1_name: param_1_range,
param_2_name: param_2_range,
param_3_name: [p]
}
grid_test = GridSearchCV(alg, param_grid=param, scoring='accuracy', cv=cv_split)
grid_test.fit(X_train, y_train)
plot_grid_search(grid_test.cv_results_, param[param_1_name], param[param_2_name], param_1_name, param_2_name, dataset + ' GridSearch Scores: ' + name + ', ' + param_3_name + '=' + str(p))
param = {
param_1_name: param_1_range,
param_2_name: param_2_range,
param_3_name: param_3_range
}
grid_final = GridSearchCV(alg, param_grid=param, scoring='accuracy', cv=cv_split)
grid_final.fit(X_train, y_train)
best_params = grid_final.best_params_
alg.set_params(**best_params)
#nathandrake Try the following which is adapted based off the code from #david-alvarez :
def plot_grid_search(cv_results, metric, grid_param_1, grid_param_2, name_param_1, name_param_2):
# Get Test Scores Mean and std for each grid search
scores_mean = cv_results[('mean_test_' + metric)]
scores_sd = cv_results[('std_test_' + metric)]
if grid_param_2 is not None:
scores_mean = np.array(scores_mean).reshape(len(grid_param_2),len(grid_param_1))
scores_sd = np.array(scores_sd).reshape(len(grid_param_2),len(grid_param_1))
# Set plot style
plt.style.use('seaborn')
# Plot Grid search scores
_, ax = plt.subplots(1,1)
if grid_param_2 is not None:
# Param1 is the X-axis, Param 2 is represented as a different curve (color line)
for idx, val in enumerate(grid_param_2):
ax.plot(grid_param_1, scores_mean[idx,:], '-o', label= name_param_2 + ': ' + str(val))
else:
# If only one Param1 is given
ax.plot(grid_param_1, scores_mean, '-o')
ax.set_title("Grid Search", fontsize=20, fontweight='normal')
ax.set_xlabel(name_param_1, fontsize=16)
ax.set_ylabel('CV Average ' + str.capitalize(metric), fontsize=16)
ax.legend(loc="best", fontsize=15)
ax.grid('on')
As you can see, I added the ability to support grid searches that include multiple metrics. You simply specify the metric you want to plot in the call to the plotting function.
Also, if your grid search only tuned a single parameter you can simply specify None for grid_param_2 and name_param_2.
Call it as follows:
plot_grid_search(grid_search.cv_results_,
'Accuracy',
list(np.linspace(0.001, 10, 50)),
['linear', 'rbf'],
'C',
'kernel')
This worked for me when I was trying to plot mean scores vs no. of trees in the Random Forest. The reshape() function helps to find out the averages.
param_n_estimators = cv_results['param_n_estimators']
param_n_estimators = np.array(param_n_estimators)
mean_n_estimators = np.mean(param_n_estimators.reshape(-1,5), axis=0)
mean_test_scores = cv_results['mean_test_score']
mean_test_scores = np.array(mean_test_scores)
mean_test_scores = np.mean(mean_test_scores.reshape(-1,5), axis=0)
mean_train_scores = cv_results['mean_train_score']
mean_train_scores = np.array(mean_train_scores)
mean_train_scores = np.mean(mean_train_scores.reshape(-1,5), axis=0)
I'm new to scikit-lear and GMM in general... I have some problem with the fit quality of a Gaussian Mixture Model in python (scikit-learn) .
I have an array of data, which you may find at DATA HERE that I want to fit with a GMM with n = 2 components.
As benchmark I superimpose a Normal fit.
Errors/weirdness:
setting n = 1 components, I cannot recover with GMM(1) the Normal benchmark fit
setting n = 2 components, the Normal fit is better than GMM(2) fit
GMM(n) seems to provide always the same fit...
Here is what I get: what I'm doing wrong here? (the picture displays the fits with GMM(2)). Thanks in advance for your help.
Code below (to run it, save data in the same folder)
from numpy import *
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from collections import OrderedDict
from scipy.stats import norm
from sklearn.mixture import GMM
# Upload the data: "epsi" (array of floats)
file_xlsx = './db_X.xlsx'
data = pd.read_excel(file_xlsx)
epsi = data["epsi"].values;
t_ = len(epsi);
# Normal fit (for benchmark)
epsi_grid = arange(min(epsi),max(epsi)+0.001,0.001);
mu = mean(epsi);
sigma2 = var(epsi);
normal = norm.pdf(epsi_grid, mu, sqrt(sigma2));
# TENTATIVE - Gaussian mixture fit
gmm = GMM(n_components = 2); # fit quality doesn't improve if I set: covariance_type = 'full'
gmm.fit(reshape(epsi,(t_,1)));
gauss_mixt = exp(gmm.score(reshape(epsi_grid,(len(epsi_grid),1))));
# same result if I apply the definition of pdf of a Gaussian mixture:
# pdf_mixture = w_1 * N(mu_1, sigma_1) + w_2 * N(mu_2, sigma_2)
# as suggested in:
# http://stackoverflow.com/questions/24878729/how-to-construct-and-plot-uni-variate-gaussian-mixture-using-its-parameters-in-p
#
#gauss_mixt = array([p * norm.pdf(epsi_grid, mu, sd) for mu, sd, p in zip(gmm.means_.flatten(), sqrt(gmm.covars_.flatten()), gmm.weights_)]);
#gauss_mixt = sum(gauss_mixt, axis = 0);
# Create a figure showing the comparison between the estimated distributions
# setting the figure object
fig = plt.figure(figsize = (10,8))
fig.set_facecolor('white')
ax = plt.subplot(111)
# colors
red = [0.9, 0.3, 0.0];
grey = [0.9, 0.9, 0.9];
green = [0.2, 0.6, 0.3];
# x-axis limits
q_inf = float(pd.DataFrame(epsi).quantile(0.0025));
q_sup = float(pd.DataFrame(epsi).quantile(0.9975));
ax.set_xlim([q_inf, q_sup])
# empirical pdf of data
nb = int(10*log(t_));
ax.hist(epsi, bins = nb, normed = True, color = grey, edgecolor = 'k', label = "Empirical");
# Normal fit
ax.plot(epsi_grid, normal, color = green, lw = 1.0, label = "Normal fit");
# Gaussian Mixture fit
ax.plot(epsi_grid, gauss_mixt, color = red, lw = 1.0, label = "GMM(2)");
# title
ax.set_title("Issue: Normal fit out-performs the GMM fit?", size = 14)
# legend
ax.legend(loc='upper left');
plt.tight_layout()
plt.show()
The problem was the bound on the single components variances min_covar, which is by default 1e-3 and is meant to prevent overfitting.
Lowering that limit solved the problem (see picture):
gmm = GMM(n_components = 2, min_covar = 1e-12)
I'm running a ridge regression on somewhat collinear data. One of the methods used to identify a stable fit is a ridge trace and thanks to the great example on scikit-learn, I'm able to do that. Another method is to calculate variance inflation factors (VIFs) for each variable as k increases. When the VIFs decrease to <5 it is an indication the fit is satisfactory. Statsmodels has code for VIFs, but it is for an OLS regression. I've attempted to alter it to handle a ridge regression.
I'm checking my results against Regression Analysis by Example, 5th edition, chapter 10. My code generates the correct results for k = 0.000, but not after that. Working SAS code is available, but I'm not a SAS user and I don't know the differences between that implementation and scikit-learn's (and/or statsmodels's).
I've been stuck on this for a few days so any help would be much appreciated.
#http://www.ats.ucla.edu/stat/sas/examples/chp/chp_ch10.htm
from __future__ import division
import numpy as np
import pandas as pd
example = pd.read_csv('by_example_import.csv')
example.dropna(inplace=True)
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(example)
scaler.transform(example)
X = example.drop(['year', 'import'], axis=1)
#c_matrix = X.corr()
y = example['import']
#w, v = np.linalg.eig(c_matrix)
import pylab as pl
from sklearn import linear_model
###############################################################################
# Compute paths
alphas = [0.000, 0.001, 0.003, 0.005, 0.007, 0.009, 0.010, 0.012, 0.014, 0.016, 0.018,
0.020, 0.022, 0.024, 0.026, 0.028, 0.030, 0.040, 0.050, 0.060, 0.070, 0.080,
0.090, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.5, 2.0]
clf = linear_model.Ridge(fit_intercept=False)
clf2 = linear_model.Ridge(fit_intercept=False)
coefs = []
vif_list = [[] for x in range(X.shape[1])]
for a in alphas:
clf.set_params(alpha=a)
clf.fit(X, y)
coefs.append(clf.coef_)
for j, data in enumerate(X.columns):
cols = [col for col in X.columns if col not in [data]]
Z = X[cols]
yy = X.iloc[:,j]
clf2.set_params(alpha=a)
clf2.fit(Z, yy)
r_squared_j = clf2.score(Z, yy)
vif = 1. / (1. - r_squared_j)
print r_squared_j
vif_list[j].append(vif)
pd.DataFrame(vif_list, columns = alphas).T
pd.DataFrame(coefs, index=alphas)
###############################################################################
# Display results
ax = pl.gca()
ax.set_color_cycle(['b', 'r', 'g', 'c', 'k', 'y', 'm'])
ax.plot(alphas, coefs)
pl.vlines(ridge_cv.alpha_, np.min(coefs), np.max(coefs), linestyle='dashdot')
pl.xlabel('alpha')
pl.ylabel('weights')
pl.title('Ridge coefficients as a function of the regularization')
pl.axis('tight')
pl.show()
Variance inflation factor for Ridge regression is just three lines. I checked it with the example on the UCLA statistics page.
A variation of this will make it into the next statsmodels release. Here is my current function:
def vif_ridge(corr_x, pen_factors, is_corr=True):
"""variance inflation factor for Ridge regression
assumes penalization is on standardized variables
data should not include a constant
Parameters
----------
corr_x : array_like
correlation matrix if is_corr=True or original data if is_corr is False.
pen_factors : iterable
iterable of Ridge penalization factors
is_corr : bool
Boolean to indicate how corr_x is interpreted, see corr_x
Returns
-------
vif : ndarray
variance inflation factors for parameters in columns and ridge
penalization factors in rows
could be optimized for repeated calculations
"""
corr_x = np.asarray(corr_x)
if not is_corr:
corr = np.corrcoef(corr_x, rowvar=0, bias=True)
else:
corr = corr_x
eye = np.eye(corr.shape[1])
res = []
for k in pen_factors:
minv = np.linalg.inv(corr + k * eye)
vif = minv.dot(corr).dot(minv)
res.append(np.diag(vif))
return np.asarray(res)
I have implemented LinearSVC and SVC from the sklearn-framework for text classification.
I am using TfidfVectorizer to get sparse representation of the input data that consists of two different classes(benign data and malicious data). This part is working pretty fine but now i wanted to implement some kind of anomaly detection by using the OneClassSVM classificator and training a model with only one class (outliers detection...). Unfortunately it is not working with sparse-data. Some developers are working on a patch (https://github.com/scikit-learn/scikit-learn/pull/1586) but there a some bugs so there is no solution yet for using the OneClassSVM-implementation.
Are there any other methods in the sklearn-framework for doing something like that? I am looking over the examples but nothing seems to fit.
Thanks!
A bit late, but in case anyone else is looking for information on this... There's a third-party anomaly detection module for sklearn here: http://www.cit.mak.ac.ug/staff/jquinn/software/lsanomaly.html, based on least-squares methods. It should be a plug-in replacement for OneClassSVM.
Unfortunately, scikit-learn currently implements only one-class SVM and robust covariance estimator for outlier detection
You can try a comparision of these methods (as provided in the doc) by examining differences on the 2d data:
import numpy as np
import pylab as pl
import matplotlib.font_manager
from scipy import stats
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]
# define two outlier detection tools to be compared
classifiers = {
"One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
kernel="rbf", gamma=0.1),
"robust covariance estimator": EllipticEnvelope(contamination=.1)}
# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = 0
# Fit the problem with varying cluster separation
for i, offset in enumerate(clusters_separation):
np.random.seed(42)
# Data generation
X1 = 0.3 * np.random.randn(0.5 * n_inliers, 2) - offset
X2 = 0.3 * np.random.randn(0.5 * n_inliers, 2) + offset
X = np.r_[X1, X2]
# Add outliers
X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]
# Fit the model with the One-Class SVM
pl.figure(figsize=(10, 5))
for i, (clf_name, clf) in enumerate(classifiers.iteritems()):
# fit the data and tag outliers
clf.fit(X)
y_pred = clf.decision_function(X).ravel()
threshold = stats.scoreatpercentile(y_pred,
100 * outliers_fraction)
y_pred = y_pred > threshold
n_errors = (y_pred != ground_truth).sum()
# plot the levels lines and the points
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
subplot = pl.subplot(1, 2, i + 1)
subplot.set_title("Outlier detection")
subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
cmap=pl.cm.Blues_r)
a = subplot.contour(xx, yy, Z, levels=[threshold],
linewidths=2, colors='red')
subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
colors='orange')
b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white')
c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black')
subplot.axis('tight')
subplot.legend(
[a.collections[0], b, c],
['learned decision function', 'true inliers', 'true outliers'],
prop=matplotlib.font_manager.FontProperties(size=11))
subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
subplot.set_xlim((-7, 7))
subplot.set_ylim((-7, 7))
pl.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
pl.show()