How to graph grid scores from GridSearchCV? - python

I am looking for a way to graph grid_scores_ from GridSearchCV in sklearn. In this example I am trying to grid search for best gamma and C parameters for an SVR algorithm. My code looks as follows:
import numpy as np
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV  # grid_scores_ lives in the old grid_search module

C_range = 10.0 ** np.arange(-4, 4)
gamma_range = 10.0 ** np.arange(-4, 4)
param_grid = dict(gamma=gamma_range.tolist(), C=C_range.tolist())
grid = GridSearchCV(SVR(kernel='rbf', gamma=0.1), param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.grid_scores_)
After I run the code and print the grid scores I get the following outcome:
[mean: -3.28593, std: 1.69134, params: {'gamma': 0.0001, 'C': 0.0001}, mean: -3.29370, std: 1.69346, params: {'gamma': 0.001, 'C': 0.0001}, mean: -3.28933, std: 1.69104, params: {'gamma': 0.01, 'C': 0.0001}, mean: -3.28925, std: 1.69106, params: {'gamma': 0.1, 'C': 0.0001}, mean: -3.28925, std: 1.69106, params: {'gamma': 1.0, 'C': 0.0001}, mean: -3.28925, std: 1.69106, params: {'gamma': 10.0, 'C': 0.0001},etc]
I would like to visualize all the scores (mean values) depending on gamma and C parameters. The graph I am trying to obtain should look as follows:
Here the x-axis is gamma, the y-axis is the mean score (root mean squared error in this case), and different lines represent different C values.
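For reference, a minimal sketch of how such a plot could be produced (my own addition, assuming the newer cv_results_ attribute rather than grid_scores_, and the C_range and gamma_range defined above):

import numpy as np
import matplotlib.pyplot as plt

results = grid.cv_results_
# the parameter grid is traversed with keys in sorted order and gamma varying fastest,
# so the mean scores can be reshaped into a (C, gamma) matrix
mean_scores = np.array(results['mean_test_score']).reshape(len(C_range), len(gamma_range))
for C_val, row in zip(C_range, mean_scores):
    plt.plot(gamma_range, row, '-o', label='C: %g' % C_val)
plt.xscale('log')
plt.xlabel('gamma')
plt.ylabel('Mean CV score')
plt.legend()
plt.show()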

The code shown by @sascha is correct. However, the grid_scores_ attribute will soon be deprecated. It is better to use the cv_results_ attribute.
It can be implemented in a similar fashion to @sascha's method:
import numpy as np
import matplotlib.pyplot as plt

def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2):
    # Get Test Scores Mean and std for each grid search
    scores_mean = cv_results['mean_test_score']
    scores_mean = np.array(scores_mean).reshape(len(grid_param_2), len(grid_param_1))

    scores_sd = cv_results['std_test_score']
    scores_sd = np.array(scores_sd).reshape(len(grid_param_2), len(grid_param_1))

    # Plot Grid search scores
    _, ax = plt.subplots(1, 1)

    # Param1 is the X-axis, Param 2 is represented as a different curve (color line)
    for idx, val in enumerate(grid_param_2):
        ax.plot(grid_param_1, scores_mean[idx, :], '-o', label=name_param_2 + ': ' + str(val))

    ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average Score', fontsize=16)
    ax.legend(loc="best", fontsize=15)
    ax.grid('on')
# Calling Method
plot_grid_search(pipe_grid.cv_results_, n_estimators, max_features, 'N Estimators', 'Max Features')
The above results in the following plot:

from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

digits = datasets.load_digits()
X = digits.data
y = digits.target

clf_ = SVC(kernel='rbf')
Cs = [1, 10, 100, 1000]
Gammas = [1e-3, 1e-4]
clf = GridSearchCV(clf_,
                   dict(C=Cs,
                        gamma=Gammas),
                   cv=2,
                   pre_dispatch='1*n_jobs',
                   n_jobs=1)

clf.fit(X, y)

scores = [x[1] for x in clf.grid_scores_]
scores = np.array(scores).reshape(len(Cs), len(Gammas))

for ind, i in enumerate(Cs):
    plt.plot(Gammas, scores[ind], label='C: ' + str(i))
plt.legend()
plt.xlabel('Gamma')
plt.ylabel('Mean score')
plt.show()
Code is based on this.
The only puzzling part: will sklearn always respect the order of C and gamma? The official example uses this ordering.
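If you would rather not rely on the traversal order at all, a minimal sketch (my own addition, not part of the original answer; it requires sklearn.model_selection.GridSearchCV, which exposes cv_results_) is to look up each (C, gamma) pair explicitly:

import numpy as np

results = clf.cv_results_
scores = np.empty((len(Cs), len(Gammas)))
for mean, params in zip(results['mean_test_score'], results['params']):
    # place each score by its own parameter values instead of assuming an order
    scores[Cs.index(params['C']), Gammas.index(params['gamma'])] = mean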
Output:

For plotting the results when tuning several hyperparameters, what I did was fix all parameters at their best value except for one, and plot the mean score over that remaining parameter's values.
import numpy as np
import matplotlib.pyplot as plt

def plot_search_results(grid):
    """
    Params:
        grid: A trained GridSearchCV object.
    """
    ## Results from grid search
    results = grid.cv_results_
    means_test = results['mean_test_score']
    stds_test = results['std_test_score']
    means_train = results['mean_train_score']
    stds_train = results['std_train_score']

    ## Getting indexes of values per hyper-parameter
    masks = []
    masks_names = list(grid.best_params_.keys())
    for p_k, p_v in grid.best_params_.items():
        masks.append(list(results['param_' + p_k].data == p_v))

    params = grid.param_grid

    ## Plotting results
    fig, ax = plt.subplots(1, len(params), sharex='none', sharey='all', figsize=(20, 5))
    fig.suptitle('Score per parameter')
    fig.text(0.04, 0.5, 'MEAN SCORE', va='center', rotation='vertical')
    for i, p in enumerate(masks_names):
        # fix every other parameter at its best value and vary only parameter p
        m = np.stack(masks[:i] + masks[i+1:])
        best_parms_mask = m.all(axis=0)
        best_index = np.where(best_parms_mask)[0]
        x = np.array(params[p])
        y_1 = np.array(means_test[best_index])
        e_1 = np.array(stds_test[best_index])
        y_2 = np.array(means_train[best_index])
        e_2 = np.array(stds_train[best_index])
        ax[i].errorbar(x, y_1, e_1, linestyle='--', marker='o', label='test')
        ax[i].errorbar(x, y_2, e_2, linestyle='-', marker='^', label='train')
        ax[i].set_xlabel(p.upper())

    plt.legend()
    plt.show()

I wanted to do something similar (but scalable to a large number of parameters) and here is my solution to generate swarm plots of the output:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

score = pd.DataFrame(gs_clf.grid_scores_).sort_values(by='mean_validation_score', ascending=False)
for i in parameters.keys():
    print(i, len(parameters[i]), parameters[i])
    score[i] = score.parameters.apply(lambda x: x[i])

l = ['mean_validation_score'] + list(parameters.keys())
for i in list(parameters.keys()):
    sns.swarmplot(data=score[l], x=i, y='mean_validation_score')
    # plt.savefig('170705_sgd_optimisation//'+i+'.jpg', dpi = 100)
    plt.show()

The order in which the parameter grid is traversed is deterministic, so the scores can be reshaped and plotted straightforwardly. Something like this:
scores = [entry.mean_validation_score for entry in grid.grid_scores_]
# the shape is according to the alphabetical order of the parameters in the grid
scores = np.array(scores).reshape(len(C_range), len(gamma_range))

for c_scores in scores:
    plt.plot(gamma_range, c_scores, '-')

Here's a solution that makes use of seaborn pointplot. The advantage of this method is that it allows you to plot results when searching across more than two parameters.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def plot_cv_results(cv_results, param_x, param_z, metric='mean_test_score'):
    """
    cv_results - cv_results_ attribute of a GridSearchCV instance (or similar)
    param_x - name of grid search parameter to plot on x axis
    param_z - name of grid search parameter to plot by line color
    """
    cv_results = pd.DataFrame(cv_results)
    col_x = 'param_' + param_x
    col_z = 'param_' + param_z
    fig, ax = plt.subplots(1, 1, figsize=(11, 8))
    sns.pointplot(x=col_x, y=metric, hue=col_z, data=cv_results, ci=99, n_boot=64, ax=ax)
    ax.set_title("CV Grid Search Results")
    ax.set_xlabel(param_x)
    ax.set_ylabel(metric)
    ax.legend(title=param_z)
    return fig
Example usage with xgboost:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [3, 6, 9, 12],
    'gamma': [0, 1, 10, 20, 100],
    'min_child_weight': [1, 4, 16, 64, 256],
}
model = XGBRegressor()
grid = GridSearchCV(model, params, scoring='neg_mean_squared_error')
grid.fit(...)
fig = plot_cv_results(grid.cv_results_, 'gamma', 'min_child_weight')
This will produce a figure that shows the gamma regularization parameter on the x-axis, the min_child_weight regularization parameter in the line color, and any other grid search parameters (in this case max_depth) will be described by the spread of the 99% confidence interval of the seaborn pointplot.
*Note in the example below I have changed the aesthetics slightly from the code above.

I used grid search on xgboost with different learning rates, max depths and number of estimators.
gs_param_grid = {'max_depth': [3, 4, 5],
                 'n_estimators': [x for x in range(3000, 5000, 250)],
                 'learning_rate': [0.01, 0.03, 0.1]
                 }
gbm = XGBRegressor()
grid_gbm = GridSearchCV(estimator=gbm,
                        param_grid=gs_param_grid,
                        scoring='neg_mean_squared_error',
                        cv=4,
                        verbose=1
                        )
grid_gbm.fit(X_train, y_train)
To create the graph for error vs number of estimators with different learning rates, I used the following approach:
y = []
cvres = grid_gbm.cv_results_
best_md = grid_gbm.best_params_['max_depth']
la = gs_param_grid['learning_rate']
n_estimators = gs_param_grid['n_estimators']

for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    if params["max_depth"] == best_md:
        y.append(np.sqrt(-mean_score))

y = np.array(y).reshape(len(la), len(n_estimators))

%matplotlib inline
plt.figure(figsize=(8, 8))
for y_arr, label in zip(y, la):
    plt.plot(n_estimators, y_arr, label=label)

plt.title('Error for different learning rates (keeping max_depth=%d (best_param))' % best_md)
plt.legend()
plt.xlabel('n_estimators')
plt.ylabel('Error')
plt.show()
The plot can be viewed here:
Note that the graph can similarly be created for error vs number of estimators with different max depth (or any other parameters as per the user's case).
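For example, a minimal sketch of that variant (my own addition, not part of the original answer) plots error vs number of estimators for the different max_depth values while keeping learning_rate fixed at its best value:

y = []
best_lr = grid_gbm.best_params_['learning_rate']
md = gs_param_grid['max_depth']
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    if params["learning_rate"] == best_lr:
        y.append(np.sqrt(-mean_score))
# with learning_rate fixed, the remaining results vary max_depth (outer) and n_estimators (inner)
y = np.array(y).reshape(len(md), len(n_estimators))

plt.figure(figsize=(8, 8))
for y_arr, label in zip(y, md):
    plt.plot(n_estimators, y_arr, label='max_depth=%d' % label)
plt.legend()
plt.xlabel('n_estimators')
plt.ylabel('Error')
plt.show()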

Here's fully working code that will produce plots so you can visualize the variation of up to 3 parameters using GridSearchCV. This is what you will see when running the code:
Parameter1 (x-axis)
Cross Validation Mean Score (y-axis)
Parameter2 (extra line plotted for each different Parameter2 value, with a legend for reference)
Parameter3 (extra charts will pop up for each different Parameter3 value, allowing you to view differences between these different charts)
For each line plotted, also shown is a standard deviation of what you can expect the Cross Validation Mean Score to do based on the multiple CV's you're running. Enjoy!
from sklearn import tree
from sklearn import model_selection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import load_digits

digits = load_digits()
X, y = digits.data, digits.target

Algo = [['DecisionTreeClassifier', tree.DecisionTreeClassifier(),  # algorithm
         'max_depth', [1, 2, 4, 6, 8, 10, 12, 14, 18, 20, 22, 24, 26, 28, 30],  # Parameter1
         'max_features', ['sqrt', 'log2', None],  # Parameter2
         'criterion', ['gini', 'entropy']]]  # Parameter3


def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2, title):
    # Get Test Scores Mean and std for each grid search
    grid_param_1 = list(str(e) for e in grid_param_1)
    grid_param_2 = list(str(e) for e in grid_param_2)
    scores_mean = cv_results['mean_test_score']
    scores_std = cv_results['std_test_score']
    params_set = cv_results['params']

    scores_organized = {}
    std_organized = {}
    std_upper = {}
    std_lower = {}
    for p2 in grid_param_2:
        scores_organized[p2] = []
        std_organized[p2] = []
        std_upper[p2] = []
        std_lower[p2] = []
        for p1 in grid_param_1:
            for i in range(len(params_set)):
                if str(params_set[i][name_param_1]) == str(p1) and str(params_set[i][name_param_2]) == str(p2):
                    mean = scores_mean[i]
                    std = scores_std[i]
                    scores_organized[p2].append(mean)
                    std_organized[p2].append(std)
                    std_upper[p2].append(mean + std)
                    std_lower[p2].append(mean - std)

    _, ax = plt.subplots(1, 1)

    # Param1 is the X-axis, Param 2 is represented as a different curve (color line)
    # plot means
    for key in scores_organized.keys():
        ax.plot(grid_param_1, scores_organized[key], '-o', label=name_param_2 + ': ' + str(key))
        ax.fill_between(grid_param_1, std_lower[key], std_upper[key], alpha=0.1)

    ax.set_title(title)
    ax.set_xlabel(name_param_1)
    ax.set_ylabel('CV Average Score')
    ax.legend(loc="best")
    ax.grid('on')
    plt.show()
dataset = 'Titanic'

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
cv_split = model_selection.KFold(n_splits=10, random_state=2)

for i in range(len(Algo)):
    # index by i so the loop also works when Algo holds more than one algorithm
    name = Algo[i][0]
    alg = Algo[i][1]
    param_1_name = Algo[i][2]
    param_1_range = Algo[i][3]
    param_2_name = Algo[i][4]
    param_2_range = Algo[i][5]
    param_3_name = Algo[i][6]
    param_3_range = Algo[i][7]

    for p in param_3_range:
        # grid search
        param = {
            param_1_name: param_1_range,
            param_2_name: param_2_range,
            param_3_name: [p]
        }
        grid_test = GridSearchCV(alg, param_grid=param, scoring='accuracy', cv=cv_split)
        grid_test.fit(X_train, y_train)
        plot_grid_search(grid_test.cv_results_, param[param_1_name], param[param_2_name], param_1_name, param_2_name,
                         dataset + ' GridSearch Scores: ' + name + ', ' + param_3_name + '=' + str(p))

    param = {
        param_1_name: param_1_range,
        param_2_name: param_2_range,
        param_3_name: param_3_range
    }
    grid_final = GridSearchCV(alg, param_grid=param, scoring='accuracy', cv=cv_split)
    grid_final.fit(X_train, y_train)
    best_params = grid_final.best_params_
    alg.set_params(**best_params)

@nathandrake Try the following, which is adapted from the code by @david-alvarez:
import numpy as np
import matplotlib.pyplot as plt

def plot_grid_search(cv_results, metric, grid_param_1, grid_param_2, name_param_1, name_param_2):
    # Get Test Scores Mean and std for each grid search
    scores_mean = cv_results[('mean_test_' + metric)]
    scores_sd = cv_results[('std_test_' + metric)]
    if grid_param_2 is not None:
        scores_mean = np.array(scores_mean).reshape(len(grid_param_2), len(grid_param_1))
        scores_sd = np.array(scores_sd).reshape(len(grid_param_2), len(grid_param_1))

    # Set plot style
    plt.style.use('seaborn')

    # Plot Grid search scores
    _, ax = plt.subplots(1, 1)

    if grid_param_2 is not None:
        # Param1 is the X-axis, Param 2 is represented as a different curve (color line)
        for idx, val in enumerate(grid_param_2):
            ax.plot(grid_param_1, scores_mean[idx, :], '-o', label=name_param_2 + ': ' + str(val))
    else:
        # If only one Param1 is given
        ax.plot(grid_param_1, scores_mean, '-o')

    ax.set_title("Grid Search", fontsize=20, fontweight='normal')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average ' + str.capitalize(metric), fontsize=16)
    ax.legend(loc="best", fontsize=15)
    ax.grid('on')
As you can see, I added the ability to support grid searches that include multiple metrics. You simply specify the metric you want to plot in the call to the plotting function.
Also, if your grid search only tuned a single parameter you can simply specify None for grid_param_2 and name_param_2.
Call it as follows:
plot_grid_search(grid_search.cv_results_,
                 'Accuracy',
                 list(np.linspace(0.001, 10, 50)),
                 ['linear', 'rbf'],
                 'C',
                 'kernel')
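For the single-parameter case mentioned above, a call could look like this (a hypothetical example of my own, assuming the grid only searched over C):

plot_grid_search(grid_search.cv_results_,
                 'Accuracy',
                 list(np.linspace(0.001, 10, 50)),
                 None,
                 'C',
                 None)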

This worked for me when I was trying to plot mean scores vs. the number of trees in a random forest. The reshape() call is what groups the results so they can be averaged.
import numpy as np

# cv_results = grid.cv_results_
param_n_estimators = cv_results['param_n_estimators']
param_n_estimators = np.array(param_n_estimators)
mean_n_estimators = np.mean(param_n_estimators.reshape(-1, 5), axis=0)

mean_test_scores = cv_results['mean_test_score']
mean_test_scores = np.array(mean_test_scores)
mean_test_scores = np.mean(mean_test_scores.reshape(-1, 5), axis=0)

mean_train_scores = cv_results['mean_train_score']
mean_train_scores = np.array(mean_train_scores)
mean_train_scores = np.mean(mean_train_scores.reshape(-1, 5), axis=0)
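A short plotting sketch for the averaged values computed above (my own addition, not part of the original answer):

import matplotlib.pyplot as plt

plt.plot(mean_n_estimators, mean_test_scores, '-o', label='test')
plt.plot(mean_n_estimators, mean_train_scores, '-o', label='train')
plt.xlabel('n_estimators')
plt.ylabel('Mean score')
plt.legend()
plt.show()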

Related

How to implement kmeans clustering as a feature for classification techniques in SVM?

I've already created a clustering and saved the model, but I'm confused about what I should do with this model and how to use it as a feature for classification.
This clustering is based on the coordinates of crime locations. After the data has been clustered, I want to use the clustered model as features in an SVM.
import pandas as pd
import matplotlib.pyplot as plt
import random
import numpy as np
import xlrd
import pickle
import tkinter as tk
from tkinter import *

plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')

# kmeans section

# Creating and labelling latitudes of X and Y and plotting it
data = pd.read_excel("sanfrancisco.xlsx")
x1 = data['X']
y1 = data['Y']
X = np.array(list(zip(x1, y1)))

# Elbow method
from sklearn.cluster import KMeans
wcss = []  # empty list
# to check in range for 10 clusters
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++')  # will generate centroids
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)  # to find euclidean distance

plot1 = plt.figure(1)
plt.xlabel("Number of Clusters")
plt.ylabel("Euclidean Distance")
plt.plot(range(1, 11), wcss)

k = 3

# data visual section.. Eg: how many crimes in diff month, most number of crime in a day in a week
# most number crime in what address, most number of crimes in what city, how many crime occur
# in how much time. , etc..

# X coordinates of random centroids
C_x = np.random.randint(0, np.max(X)-20, size=k)
# Y coordinates of random centroids
C_y = np.random.randint(0, np.max(X)-20, size=k)
C = np.array(list(zip(C_x, C_y)), dtype=np.float32)
print("Initial Centroids")
print(C)

# n_clusters takes the number of clusters, init chooses random data points for the initial centroids
# by default scikit-learn runs 10 initializations and keeps the best one; here n_init is set to 1
model = KMeans(n_clusters=k, init='random', n_init=1)
model.fit_transform(X)
centroids = model.cluster_centers_  # final centroids

rgb_colors = {0.: 'y',
              1.: 'c',
              2.: 'fuchsia',
              }
if k == 4:
    rgb_colors[3.] = 'lime'
if k == 6:
    rgb_colors[3.] = 'lime'
    rgb_colors[4.] = 'orange'
    rgb_colors[5.] = 'tomato'

new_labels = pd.Series(model.labels_.astype(float))  # labels predicted by kmeans

plot2 = plt.figure(2)
plt.scatter(x1, y1, c=new_labels.map(rgb_colors), s=20)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', c='black', s=200)
plt.xlabel('Final Cluster Centers\n Iteration Count=' + str(model.n_iter_) +
           '\n Objective Function Value: ' + str(model.inertia_))
plt.ylabel('y')
plt.title("k-Means")
plt.show()

# save the model to disk
filename = 'clusteredmatrix.sav'
pickle.dump(model, open(filename, 'wb'))
Your problem is not very clear, but if you want to see the behavior of the clusters, I recommend you use a tool like Weka, so that you can freely cluster them and draw meaningful inferences before going into complex coding!
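That said, if the goal is simply to feed the cluster assignments into an SVM as an extra feature, a minimal sketch (my own reading of the question, not a confirmed solution; y_crime is a hypothetical target column you would need to supply) could look like this:

from sklearn.svm import SVC
import numpy as np

cluster_labels = model.labels_.reshape(-1, 1)     # one cluster id per row
X_with_cluster = np.hstack([X, cluster_labels])   # original coordinates + cluster id
# (one-hot encoding the cluster id is often preferable to a raw integer column)
svm_clf = SVC(kernel='rbf')
svm_clf.fit(X_with_cluster, y_crime)              # y_crime is a hypothetical label column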

How to plot feature importance for random forest in python

I have created a random forest model, and would like to plot the feature importances
model_RF_tune = RandomForestClassifier(random_state=0, n_estimators=80,
                                       min_samples_split=10, max_depth=None, max_features="auto")
I have tried defining a function:
def plot_feature_importances_health(model):
    n_features = model.data.shape
    plt.barh(range(n_features), model.feature_importances_, align="center")
    plt.yticks(np.arrange(n_features), df_health_reconstructed.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, n_features)
but this
plot_feature_importances_health(model_RF_tune)
Gives this result:
AttributeError: 'RandomForestClassifier' object has no attribute 'data'
How do I plot it correctly?
Not all models expose model.data. Would you like to try my code instead? Note that it plots only the top 10 features.
# use RandomForestClassifier to look for important key features
n = 10  # choose top n features
rfc = RandomForestClassifier(random_state=SEED, n_estimators=200, max_depth=3)
rfc_model = rfc.fit(X, y)
(pd.Series(rfc_model.feature_importances_, index=X.columns)
   .nlargest(n)
   .plot(kind='barh', figsize=[8, n/2.5], color='navy')
   .invert_yaxis())  # most important feature is on top, ie, descending order

ticks_x = np.linspace(0, 0.5, 6)  # (start, end, number of ticks)
plt.xticks(ticks_x, fontsize=15, color='black')
plt.yticks(size=15, color='navy')
plt.title('Top Features derived by RandomForestClassifier', family='fantasy', size=15)
print(list((pd.Series(rfc_model.feature_importances_, index=X.columns).nlargest(n)).index))
This one seems to work for me
%matplotlib inline
# do code to support model
# "data" is the X dataframe and model is the SKlearn object

feats = {}  # a dict to hold feature_name: feature_importance
for feature, importance in zip(dataframe_name.columns, model_name.feature_importances_):
    feats[feature] = importance  # add the name/value pair

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
importances.sort_values(by='Gini-importance').plot(kind='barh', color="SeaGreen", figsize=(10, 8))

How to add columns into a clustering algorithm

I'm working with the Flask Python framework on a data science project, and I need to add selected columns from a CSV file to this clustering code. Can anyone help me? The clustering code can already read the columns; I can save the CSV file and select the columns on the server side, so I just need to add them to this clustering code.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


class Clustering():

    def __init__(self, filename, start_column, end_column):
        self.n = start_column
        self.m = end_column
        self.filename = filename
        self.dataset = pd.read_csv(self.filename)
        self.X = self.dataset.iloc[:, [self.n, self.m]].values

    def show_test(a):
        return "just a test object" + a[0] + "," + a[1]

    #def return_x(self):
        ######## return concerned columns of the dataset ########
        #return self.X

    def print_elbow(self, number_of_k):
        # Plot the graph to visualize the Elbow Method to find the optimal number of clusters
        self.k = number_of_k
        wcss = []
        silhouette_values = {}
        for i in range(2, self.k):
            self.kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=None)
            self.kmeans.fit(self.X)
            wcss.append(self.kmeans.inertia_)  # Sum of squared distances of samples to their closest cluster center.
            # Compute the silhouette scores for each sample
            cluster_labels = self.kmeans.fit_predict(self.X)
            silhouette_avg = silhouette_score(self.X, cluster_labels)
            silhouette_values[i] = silhouette_avg
            print("For n_clusters =", i, "The average silhouette_score is :", silhouette_avg)
        print("Best silhouette score:", max(silhouette_values, key=silhouette_values.get))
        plt.plot(range(2, self.k), wcss)
        plt.title('The Elbow Method')
        plt.xlabel('Number of clusters')
        plt.ylabel('WCSS')
        plt.show()
        return

    def print_kmeans(self, Optimal_k):
        plt.style.use('seaborn-deep')
        # Applying KMeans to the dataset with the optimal number of clusters
        self.opt_k = Optimal_k
        self.kmeans = KMeans(n_clusters=self.opt_k, init='k-means++', max_iter=300, n_init=10, random_state=0)
        y_kmeans = self.kmeans.fit_predict(self.X)
        for i in range(self.opt_k):
            plt.scatter(self.X[y_kmeans == i, 0], self.X[y_kmeans == i, 1], s=80, marker='o', alpha=0.7, label='Cluster {}'.format(i+1))
        plt.scatter(self.kmeans.cluster_centers_[:, 0], self.kmeans.cluster_centers_[:, 1], s=100, c='black', edgecolors='none', label='Centroids')
        plt.title('Clusters')
        plt.xlabel('first column')
        plt.ylabel('second column')
        plt.legend()
        plt.show()
        return
Please try this:
Clustering('file.csv', start, end+1)
'file.csv' represents the file path.
start denotes the starting column number (int).
end is the last column you want to read; the +1 is there to include that column.
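For illustration, a minimal usage sketch of the class above (my own example; the file name and column indices are hypothetical). Note that, as written, __init__ selects exactly the two columns at positions start_column and end_column via iloc[:, [n, m]]:

clustering = Clustering('file.csv', 2, 5)  # picks the columns at positions 2 and 5
clustering.print_elbow(10)   # elbow curve and silhouette scores for k = 2..9
clustering.print_kmeans(3)   # fit k-means with 3 clusters and plot them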

How to color certain bars in barchart of matplotlib in python?

I am looking into the features selected by a hybrid feature selection that consists of an embedded step and a wrapper step. I get the features with their importances from the embedded selection, then run the wrapper selection on those features and keep the subset with the best model accuracy.
I got the bar chart from the embedded selection, and now I want to colour only the bars for the features selected in the wrapper step. How can I approach this? See my following code,
############################################# Hybrid Feature Selection Methodology #####################################
#################### Embedded Method ########################
# imports added for completeness; EFS here is assumed to be mlxtend's ExhaustiveFeatureSelector
from sklearn.inspection import permutation_importance
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
import matplotlib.pyplot as plt

# perform permutation importance
results = permutation_importance(knn, X_train, y_train, scoring='accuracy')
# get importance
importance = results.importances_mean
print(importance)
# summarize feature importance
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))
# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.title('Permutation Feature Importance with KNN')
plt.xlabel('Features')
plt.ylabel('Feature Importance')
plt.show()

#################### Wrapper Method ########################
efs = EFS(knn, min_features=1, max_features=len(X_train_knn.columns), scoring='accuracy', print_progress=True, cv=2)
# fit the object to the training data.
efs = efs.fit(X_train_knn, y_train)
print('\n')
print('Best accuracy score: ', efs.best_score_ * 100)
print('Best subset (indices):', efs.best_idx_)
print('Best subset (corresponding names):', efs.best_feature_names_)
# transform our data to the newly selected features.
optimum_number_features = list(efs.best_idx_)
optimum_number_features_knn = list(efs.best_feature_names_)
A Minimal, Reproducible Example partly means that everyone should be able to execute your code and get the same result.
import matplotlib.pyplot as plt

# importance_list = list(zip(feature_name_list, results.importances_mean))
importance_list = [('quiz', 0.4080183920815765), ('time', 0.1779846287534165), ('hm', 0.1392329389521148), ('submitNum', 0.09889260035850235), ('class', 0.09379925836350246), ('post', 0.049803191453511066), ('startTime', 0.03226899003737626)]
plt.figure()
colors = ['b' for i in importance_list]
# selected_list is what your wrapper function returns
selected_list = ['quiz', 'time', 'hm']
for i, v in enumerate(importance_list):
    if v[0] in selected_list:
        colors[i] = 'r'
plt.bar([i[0] for i in importance_list], [i[1] for i in importance_list], color=colors)
plt.title('Permutation Feature Importance with KNN')
plt.xlabel('Features')
plt.ylabel('Feature Importance')
plt.show()
As for the zip function and the importances_mean attribute, I tested them with examples from sklearn.inspection.permutation_importance.
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance

X = [[1, 9, 9], [1, 9, 9], [1, 9, 9],
     [0, 9, 9], [0, 9, 9], [0, 9, 9]]
y = [1, 1, 1, 0, 0, 0]
clf = LogisticRegression().fit(X, y)
result = permutation_importance(clf, X, y, n_repeats=10, random_state=0)
list(zip(['a', 'b', 'c'], result.importances_mean))
# Result:
# [('a', 0.4666666666666666), ('b', 0.0), ('c', 0.0)]

[scikit learn]: Anomaly Detection - Alternative for OneClassSVM

I have implemented LinearSVC and SVC from the sklearn framework for text classification.
I am using TfidfVectorizer to get a sparse representation of the input data, which consists of two different classes (benign data and malicious data). This part works fine, but now I want to implement some kind of anomaly detection by using the OneClassSVM classifier and training a model with only one class (outlier detection...). Unfortunately it does not work with sparse data. Some developers are working on a patch (https://github.com/scikit-learn/scikit-learn/pull/1586), but there are some bugs, so there is no solution yet for using the OneClassSVM implementation.
Are there any other methods in the sklearn framework for doing something like that? I am looking over the examples but nothing seems to fit.
Thanks!
A bit late, but in case anyone else is looking for information on this... There's a third-party anomaly detection module for sklearn here: http://www.cit.mak.ac.ug/staff/jquinn/software/lsanomaly.html, based on least-squares methods. It should be a plug-in replacement for OneClassSVM.
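For illustration, usage might look roughly like the sketch below. This is my assumption based on the module's description of a scikit-learn-style interface; check the linked page for the actual API.

# hypothetical usage, assuming lsanomaly exposes a sklearn-style estimator
import lsanomaly

anomaly_model = lsanomaly.LSAnomaly()
anomaly_model.fit(X_train)                    # train on a single (benign) class
predictions = anomaly_model.predict(X_test)   # flag anomalous samples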
Unfortunately, scikit-learn currently implements only one-class SVM and a robust covariance estimator for outlier detection.
You can try a comparison of these methods (as provided in the docs) by examining the differences on 2D data:
import numpy as np
import pylab as pl
import matplotlib.font_manager
from scipy import stats

from sklearn import svm
from sklearn.covariance import EllipticEnvelope

# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# define two outlier detection tools to be compared
classifiers = {
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1),
    "robust covariance estimator": EllipticEnvelope(contamination=.1)}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = 0

# Fit the problem with varying cluster separation
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
    # Data generation
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
    X = np.r_[X1, X2]
    # Add outliers
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

    # Fit the model with the One-Class SVM
    pl.figure(figsize=(10, 5))
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        # fit the data and tag outliers
        clf.fit(X)
        y_pred = clf.decision_function(X).ravel()
        threshold = stats.scoreatpercentile(y_pred,
                                            100 * outliers_fraction)
        y_pred = y_pred > threshold
        n_errors = (y_pred != ground_truth).sum()
        # plot the levels lines and the points
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        subplot = pl.subplot(1, 2, i + 1)
        subplot.set_title("Outlier detection")
        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                         cmap=pl.cm.Blues_r)
        a = subplot.contour(xx, yy, Z, levels=[threshold],
                            linewidths=2, colors='red')
        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                         colors='orange')
        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white')
        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black')
        subplot.axis('tight')
        subplot.legend(
            [a.collections[0], b, c],
            ['learned decision function', 'true inliers', 'true outliers'],
            prop=matplotlib.font_manager.FontProperties(size=11))
        subplot.set_xlabel("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
        subplot.set_xlim((-7, 7))
        subplot.set_ylim((-7, 7))
    pl.subplots_adjust(0.04, 0.1, 0.96, 0.94, 0.1, 0.26)
pl.show()
