GridSearchCV initialization - python

I want to use GridSearchCV over a range of alphas (Laplace smoothing parameters) to check which gives me the best accuracy with a Bernoulli Naive Bayes model.
def binarize_pixels(data, threshold=0.784):
    # Initialize a new feature array with the same shape as the original data.
    binarized_data = np.zeros(data.shape)
    # Apply a threshold to each feature.
    for feature in range(data.shape[1]):
        binarized_data[:, feature] = data[:, feature] > threshold
    return binarized_data

binarized_train_data = binarize_pixels(mini_train_data)

def BNB():
    clf = BernoulliNB()
    clf.fit(binarized_train_data, mini_train_labels)
    scoring = clf.score(mini_train_data, mini_train_labels)
    predsNB = clf.predict(dev_data)
    print "Bernoulli binarized model accuracy: {:.4}".format(np.mean(predsNB == dev_labels))
The model runs fine, but my GridSearchCV cross-validation does not:
pipeline = Pipeline([('classifier', BNB())])

def P8(alphas):
    gs_clf = GridSearchCV(pipeline, param_grid=alphas, refit=True)
    y_predictions = gs_clf.best_estimator_.predict(dev_data)
    print classification_report(dev_labels, y_predictions)

alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}
P8(alphas)
I get AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

The problem is in the following two lines:
gs_clf = GridSearchCV(pipeline, param_grid = alphas, refit=True)
y_predictions = gs_clf.best_estimator_.predict(dev_data)
Note that before calling predict, you first need to fit the model, i.e. to call gs_clf.fit. See the following example from the documentation:
>>> from sklearn import svm, datasets
>>> from sklearn.model_selection import GridSearchCV
>>> iris = datasets.load_iris()
>>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
>>> svr = svm.SVC()
>>> clf = GridSearchCV(svr, parameters)
>>> clf.fit(iris.data, iris.target)
...
GridSearchCV(cv=None, error_score=...,
       estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=...,
                     decision_function_shape=None, degree=..., gamma=...,
                     kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=...,
                     verbose=False),
       fit_params={}, iid=..., n_jobs=1,
       param_grid=..., pre_dispatch=..., refit=..., return_train_score=...,
       scoring=..., verbose=...)
>>> sorted(clf.cv_results_.keys())
...
['mean_fit_time', 'mean_score_time', 'mean_test_score',...
'mean_train_score', 'param_C', 'param_kernel', 'params',...
'rank_test_score', 'split0_test_score',...
'split0_train_score', 'split1_test_score', 'split1_train_score',...
'split2_test_score', 'split2_train_score',...
'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score'...]
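
Applied to the question's code, a minimal sketch of the fix might look like the following. Two further adjustments here are my own assumptions, not part of the answer above: the pipeline should wrap a BernoulliNB instance rather than the BNB() function (which returns None), and the parameter grid key needs the 'classifier__' prefix so GridSearchCV routes it to the pipeline step. The data variables (binarized_train_data, mini_train_labels, dev_data, dev_labels) are taken from the question.

from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

pipeline = Pipeline([('classifier', BernoulliNB())])

def P8(alphas):
    gs_clf = GridSearchCV(pipeline, param_grid=alphas, refit=True)
    # Fit first: this is the step that populates best_estimator_.
    gs_clf.fit(binarized_train_data, mini_train_labels)
    y_predictions = gs_clf.best_estimator_.predict(dev_data)
    print(classification_report(dev_labels, y_predictions))

# Keys are prefixed with the pipeline step name 'classifier'.
# alpha=0.0 from the original grid is omitted: zero smoothing can yield zero probabilities.
alphas = {'classifier__alpha': [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}
P8(alphas)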

Related

Results from GridSearchCV/RandomizedSearchCV cannot be reproduced by running a single model using the same parameters

I am running RandomizedSearchCV with 5 folds to find the best parameters. I have a hold-out set (X_test) that I use for prediction. The relevant portion of my code is:
svc = SVC(class_weight=class_weights, random_state=42)
Cs = [0.01, 0.1, 1, 10, 100, 1000, 10000]
gammas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
param_grid = {'C': Cs,
              'gamma': gammas,
              'kernel': ['linear', 'rbf', 'poly']}

my_cv = TimeSeriesSplit(n_splits=5).split(X_train)
rs_svm = RandomizedSearchCV(SVC(), param_grid, cv=my_cv, scoring='accuracy',
                            refit='accuracy', verbose=3, n_jobs=1, random_state=42)
rs_svm.fit(X_train, y_train)
y_pred = rs_svm.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
clfreport = classification_report(y_test, y_pred)
print(rs_svm.best_params_)
The result is a classification report. Now I am interested in reproducing this result with a standalone model (no RandomizedSearchCV), using the selected parameters:
from sklearn.model_selection import TimeSeriesSplit

tcsv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tcsv.split(X_train):
    train_index_ = int(train_index.shape[0])
    test_index_ = int(test_index.shape[0])

X_train_, y_train_ = X_train[0:train_index_], y_train[0:train_index_]
X_test_, y_test_ = X_train[test_index_:], y_train[test_index_:]

class_weights = compute_class_weight('balanced', np.unique(y_train_), y_train_)
class_weights = dict(enumerate(class_weights))

svc = SVC(C=0.01, gamma=0.1, kernel='linear', class_weight=class_weights, verbose=True,
          random_state=42)
svc.fit(X_train_, y_train_)
y_pred_ = svc.predict(X_test)

cm = confusion_matrix(y_test, y_pred_)
clfreport = classification_report(y_test, y_pred_)
In my understanding, the two classification reports should be identical, but the results after this run differ.
Does anyone have any suggestions why that might be happening?
Given your first code snippet, where you use RandomizedSearchCV to find the best hyperparameters, you don't need to do any splitting again; so, in your second snippet, you should just fit using the found hyperparameters and the class weights on the whole of your training set, and then predict on your test set:
class_weights = compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = dict(enumerate(class_weights))

svc = SVC(C=0.01, gamma=0.1, kernel='linear', class_weight=class_weights, verbose=True, random_state=42)
svc.fit(X_train, y_train)
y_pred_ = svc.predict(X_test)

cm = confusion_matrix(y_test, y_pred_)
clfreport = classification_report(y_test, y_pred_)
The discussion in "Order between using validation, training and test sets" might be useful for clarifying the procedure.
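
If you prefer not to transcribe the selected hyperparameters by hand, they can also be plugged in programmatically via best_params_. A minimal sketch, assuming the rs_svm, class_weights, X_train/y_train and X_test/y_test objects from the snippets above:

from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Rebuild a standalone SVC from the hyperparameters selected by the search.
svc = SVC(**rs_svm.best_params_, class_weight=class_weights, random_state=42)
svc.fit(X_train, y_train)        # fit on the whole training set
y_pred_ = svc.predict(X_test)    # predict on the held-out test set
print(classification_report(y_test, y_pred_))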

How to run RFECV with SVC in sklearn

I am trying to perform Recursive Feature Elimination with Cross-Validation (RFECV) combined with GridSearchCV, using SVC as the classifier. My code is as follows:
X = df[my_features]
y = df['gold_standard']

x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

clf = SVC(class_weight="balanced")
rfecv = RFECV(estimator=clf, step=1, cv=k_fold, scoring='roc_auc')

param_grid = {'estimator__C': [0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1.0, 10.0, 100.0, 1000.0],
              'estimator__gamma': [0.001, 0.01, 0.1, 1.0, 2.0, 3.0, 10.0, 100.0, 1000.0],
              'estimator__kernel': ('rbf', 'sigmoid', 'poly')}

CV_rfc = GridSearchCV(estimator=rfecv, param_grid=param_grid, cv=k_fold, scoring='roc_auc', verbose=10)
CV_rfc.fit(x_train, y_train)
However, I got an error saying: RuntimeError: The classifier does not expose "coef_" or "feature_importances_" attributes
Is there a way to resolve this error? If not what are the other feature selection techniques that I can use with SVC?
I am happy to provide more details if needed.
For more feature selection implementations, have a look at:
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection
As an example, the following link combines PCA and k-best feature selection with an SVC.
https://scikit-learn.org/stable/auto_examples/compose/plot_feature_union.html#sphx-glr-auto-examples-compose-plot-feature-union-py
An example of use, modified from the previous link for simplicity:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

iris = load_iris()
X, y = iris.data, iris.target

# Maybe some original features were good, too?
selection = SelectKBest()

# Build the SVC
svm = SVC(kernel="linear")

# Do a grid search over k and C:
pipeline = Pipeline([("features", selection), ("svm", svm)])

param_grid = dict(features__k=[1, 2],
                  svm__C=[0.1, 1, 10])

grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, verbose=10)
grid_search.fit(X, y)
print(grid_search.best_estimator_)
Hmm... in sklearn 0.19.2 the problem seems not to occur. My code is similar to yours, but it works:
svc = SVC(kernel='linear', probability=True, random_state=1)

rfecv = RFECV(estimator=svc, scoring='roc_auc')
rfecv.fit(train_values, train_Labels)

selecInfo = rfecv.support_
selecIndex = np.where(selecInfo == 1)
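
The difference that likely matters here is the linear kernel: RFECV ranks features through the fitted estimator's coef_ or feature_importances_, and SVC only exposes coef_ when kernel='linear', which is why the rbf/sigmoid/poly grid in the question triggers the RuntimeError. A minimal sketch of RFECV inside GridSearchCV under that constraint (using load_breast_cancer as stand-in data, an assumption rather than the original df):

from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.svm import SVC

X, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# A linear SVC exposes coef_, so RFECV can rank and eliminate features.
rfecv = RFECV(estimator=SVC(kernel='linear', class_weight='balanced'),
              step=1, cv=k_fold, scoring='roc_auc')

# Parameters of the inner SVC are addressed through RFECV as 'estimator__<param>'.
param_grid = {'estimator__C': [0.01, 0.1, 1.0, 10.0]}

CV_rfc = GridSearchCV(rfecv, param_grid=param_grid, cv=k_fold, scoring='roc_auc')
CV_rfc.fit(x_train, y_train)
print(CV_rfc.best_params_, CV_rfc.best_estimator_.n_features_)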

SVC with class_weight in scikit-learn

I would like to use class_weight to create a weighted SVC classifier in scikit-learn. However, I'm not sure whether I'm configuring my model correctly. Please consider the example below:
x = np.array([[0, 0, 1], [0, 1, 1], [1, 0, 0]])
y = np.array([1, 1, 0])

cw = {}
for l in set(y):
    cw[l] = np.sum(y == l)
print(cw)

m = SVC(probability=True, max_iter=1000, class_weight=cw)
m = m.fit(x, y)
I obtained the model:
SVC(C=1.0, cache_size=200, class_weight={0: 1, 1: 2}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=1000, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
With class_weight={0: 1, 1: 2} corresponding to the number of data points in each class.
QUESTION: Is it correct to proceed in this way?
As you have a 2:1 ratio of class labels, this weighting appears to be correct.
One other thing you can do, if you don't want to calculate the class weights manually, is to pass class_weight='balanced' and let the SVC balance the weights for you.
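A minimal sketch reusing the toy data above; the explicit compute_class_weight call is only there to show what 'balanced' computes, namely n_samples / (n_classes * bincount(y)):

import numpy as np
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_class_weight

x = np.array([[0, 0, 1], [0, 1, 1], [1, 0, 0]])
y = np.array([1, 1, 0])

# Let the SVC derive weights inversely proportional to class frequencies.
m = SVC(probability=True, max_iter=1000, class_weight='balanced')
m.fit(x, y)

# The same weights, computed explicitly.
weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
print(dict(zip(np.unique(y), weights)))  # class 0 -> 1.5, class 1 -> 0.75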

How to print the best parameters through GridSearchCV for k-fold cross validation

I have used GridSearchCV for parameter optimization when predicting values with 10-fold cross-validation using sklearn, as shown below:
svr_params = {
    'C': [0.1, 1, 10],
    'epsilon': [0.01, 0.05, 0.1, 0.5, 1],
}
svr = SVR(kernel='linear', coef0=0.1, shrinking=True, tol=0.001, cache_size=200, verbose=False, max_iter=-1)
best_svr = GridSearchCV(svr, param_grid=svr_params, cv=10, verbose=0, n_jobs=-1)
predicted = cross_val_predict(best_svr, X, y, cv=10)
I want to print out the best parameters selected by the grid search for C and epsilon. I would really appreciate some help. Thanks in advance.
The best parameters are available as best_params_ attribute of GridSearchCV.
best_svr = GridSearchCV(svr, param_grid=svr_params, cv=10, verbose=0, n_jobs=-1, refit=True)
best_svr.fit(X, y)
print(best_svr.best_params_)
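
A quick self-contained run might look like the following (load_diabetes is used here only as stand-in regression data, an assumption rather than the poster's X and y):

from sklearn.datasets import load_diabetes
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

X, y = load_diabetes(return_X_y=True)

svr_params = {'C': [0.1, 1, 10], 'epsilon': [0.01, 0.05, 0.1, 0.5, 1]}
svr = SVR(kernel='linear')

best_svr = GridSearchCV(svr, param_grid=svr_params, cv=10, n_jobs=-1, refit=True)
best_svr.fit(X, y)

print(best_svr.best_params_)  # e.g. {'C': ..., 'epsilon': ...}
print(best_svr.best_score_)   # mean cross-validated R^2 of the best parameter combination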

Python Naive Bayes with cross validation using GaussianNB classifier

I would like to apply Naive Bayes with 10-fold stratified cross-validation to my data, and then I want to see how the model performs on the test data I set aside initially.
However, the results I am getting (i.e. the predicted outcomes and probability values y_pred_nb2 and y_scores_nb2) are identical to when I run the code without any cross-validation.
QUESTION: How can I correct this?
The code is below, where X_train consists of 75% of the entire dataset and X_test consists of 25%.
from sklearn.model_selection import StratifiedKFold
params = {}
#gridsearch searches for the best hyperparameters and keeps the classifier with the highest recall score
skf = StratifiedKFold(n_splits=10)
nb2 = GridSearchCV(GaussianNB(), cv=skf, param_grid=params)
%time nb2.fit(X_train, y_train)
# predict values on the test set
y_pred_nb2 = nb2.predict(X_test)
print(y_pred_nb2)
# predicted probabilities on the test set
y_scores_nb2 = nb2.predict_proba(X_test)[:, 1]
print(y_scores_nb2)
First off, GaussianNB only accepts priors as an argument, so unless you have some priors to set for your model ahead of time, you will have nothing to grid-search over.
Furthermore, your param_grid is set to an empty dictionary, which ensures that you only fit one estimator with GridSearchCV. This is the same as fitting an estimator without using a grid search at all. For example (I use MultinomialNB below so that there is a hyperparameter to demonstrate):
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB
skf = StratifiedKFold(n_splits=10)
params = {}
nb = MultinomialNB()
gs = GridSearchCV(nb, cv=skf, param_grid=params, return_train_score=True)
data = load_iris()
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target)
gs.fit(x_train, y_train)
gs.cv_results_
{'mean_fit_time': array([0.]),
'mean_score_time': array([0.]),
'mean_test_score': array([0.85714286]),
'mean_train_score': array([0.85992157]),
'params': [{}],
'rank_test_score': array([1]),
'split0_test_score': array([0.91666667]),
'split0_train_score': array([0.84]),
'split1_test_score': array([0.75]),
'split1_train_score': array([0.86]),
'split2_test_score': array([0.83333333]),
'split2_train_score': array([0.84]),
'split3_test_score': array([0.91666667]),
'split3_train_score': array([0.83]),
'split4_test_score': array([0.83333333]),
'split4_train_score': array([0.85]),
'split5_test_score': array([0.91666667]),
'split5_train_score': array([0.84]),
'split6_test_score': array([0.9]),
'split6_train_score': array([0.88235294]),
'split7_test_score': array([0.8]),
'split7_train_score': array([0.88235294]),
'split8_test_score': array([0.8]),
'split8_train_score': array([0.89215686]),
'split9_test_score': array([0.9]),
'split9_train_score': array([0.88235294]),
'std_fit_time': array([0.]),
'std_score_time': array([0.]),
'std_test_score': array([0.05832118]),
'std_train_score': array([0.02175538])}
nb.fit(x_train, y_train)
nb.score(x_test, y_test)
0.8157894736842105
gs.score(x_test, y_test)
0.8157894736842105
gs.param_grid = {'alpha': [0.1, 2]}
gs.fit(x_train, y_train)
gs.score(x_test, y_test)
0.8421052631578947
gs.cv_results_
{'mean_fit_time': array([0.00090394, 0.00049713]),
'mean_score_time': array([0.00029924, 0.0003005 ]),
'mean_test_score': array([0.86607143, 0.85714286]),
'mean_train_score': array([0.86092157, 0.85494118]),
'param_alpha': masked_array(data=[0.1, 2],
mask=[False, False],
fill_value='?',
dtype=object),
'params': [{'alpha': 0.1}, {'alpha': 2}],
'rank_test_score': array([1, 2]),
'split0_test_score': array([0.91666667, 0.91666667]),
'split0_train_score': array([0.84, 0.83]),
'split1_test_score': array([0.75, 0.75]),
'split1_train_score': array([0.86, 0.86]),
'split2_test_score': array([0.83333333, 0.83333333]),
'split2_train_score': array([0.85, 0.84]),
'split3_test_score': array([0.91666667, 0.91666667]),
'split3_train_score': array([0.83, 0.81]),
'split4_test_score': array([0.83333333, 0.83333333]),
'split4_train_score': array([0.85, 0.84]),
'split5_test_score': array([0.91666667, 0.91666667]),
'split5_train_score': array([0.84, 0.84]),
'split6_test_score': array([0.9, 0.9]),
'split6_train_score': array([0.88235294, 0.88235294]),
'split7_test_score': array([0.9, 0.8]),
'split7_train_score': array([0.88235294, 0.88235294]),
'split8_test_score': array([0.8, 0.8]),
'split8_train_score': array([0.89215686, 0.89215686]),
'split9_test_score': array([0.9, 0.9]),
'split9_train_score': array([0.88235294, 0.87254902]),
'std_fit_time': array([0.00030147, 0.00049713]),
'std_score_time': array([0.00045711, 0.00045921]),
'std_test_score': array([0.05651628, 0.05832118]),
'std_train_score': array([0.02103457, 0.02556351])}
How about something like this:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
# Because only var_smoothing can really be tuned,
# do a cross-validation over different var_smoothing values.
def cross_val(params):
    model = GaussianNB()
    model.set_params(**params)
    cv_results = cross_val_score(model, X_train, y_train,
                                 cv=10,  # 10 folds
                                 scoring="accuracy",
                                 verbose=2)
    # Return the mean of the 10-fold cross-validation scores.
    return cv_results.mean()

# Baseline parameters
params = {
    "priors": None,
    "var_smoothing": 1e-9
}

# List of var_smoothing values to cross-validate
steps = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4]
# Will contain the cv results
results = []

for step in steps:
    params["var_smoothing"] = step
    cv_result = cross_val(params)
    # Save the result
    results.append(cv_result)

# Convert the results to a pandas DataFrame for easier visualization
df = pd.DataFrame({"var_smoothing": steps, "accuracy": results})
# Sort by accuracy
df_sorted = df.sort_values("accuracy", ascending=False)
# Reset the index of the sorted DataFrame
df_sorted.reset_index(inplace=True, drop=True)
df_sorted.head()
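
If you prefer to stay with GridSearchCV instead of the manual loop, the same search can be expressed directly. A minimal sketch, assuming a scikit-learn version (0.20+) where GaussianNB exposes var_smoothing, and using load_iris purely as stand-in data:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.naive_bayes import GaussianNB

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

skf = StratifiedKFold(n_splits=10)
# A non-empty grid, so the search actually compares different models.
param_grid = {'var_smoothing': np.logspace(-9, -4, 6)}

nb2 = GridSearchCV(GaussianNB(), param_grid=param_grid, cv=skf, scoring='recall_macro')
nb2.fit(X_train, y_train)

print(nb2.best_params_)
print(nb2.score(X_test, y_test))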
