Python Naive Bayes with cross validation using GaussianNB classifier

I would like to apply Naive Bayes with 10-fold stratified cross-validation to my data, and then I want to see how the model performs on the test data I set aside initially.
However, the results I am getting (i.e. the predicted outcomes and probability values, y_pred_nb2 and y_scores_nb2) are identical to when I run the code without any cross-validation.
QUESTION: How can I correct this?
The code is below, where X_train consists of 75% of the entire dataset and X_test consists of 25%.
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
params = {}
#gridsearch searches for the best hyperparameters and keeps the classifier with the highest recall score
skf = StratifiedKFold(n_splits=10)
nb2 = GridSearchCV(GaussianNB(), cv=skf, param_grid=params)
%time nb2.fit(X_train, y_train)
# predict values on the test set
y_pred_nb2 = nb2.predict(X_test)
print(y_pred_nb2)
# predicted probabilities on the test set
y_scores_nb2 = nb2.predict_proba(X_test)[:, 1]
print(y_scores_nb2)

First off, GaussianNB only accepts priors as an argument, so unless you have some priors to set for your model ahead of time, you will have nothing to grid search over.
Furthermore, your param_grid is set to an empty dictionary, which ensures that you only fit one estimator with GridSearchCV. This is the same as fitting an estimator without using a grid search (below I use MultinomialNB in order to show the use of hyperparameters):
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB
skf = StratifiedKFold(n_splits=10)
params = {}
nb = MultinomialNB()
gs = GridSearchCV(nb, cv=skf, param_grid=params, return_train_score=True)
data = load_iris()
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target)
gs.fit(x_train, y_train)
gs.cv_results_
{'mean_fit_time': array([0.]),
'mean_score_time': array([0.]),
'mean_test_score': array([0.85714286]),
'mean_train_score': array([0.85992157]),
'params': [{}],
'rank_test_score': array([1]),
'split0_test_score': array([0.91666667]),
'split0_train_score': array([0.84]),
'split1_test_score': array([0.75]),
'split1_train_score': array([0.86]),
'split2_test_score': array([0.83333333]),
'split2_train_score': array([0.84]),
'split3_test_score': array([0.91666667]),
'split3_train_score': array([0.83]),
'split4_test_score': array([0.83333333]),
'split4_train_score': array([0.85]),
'split5_test_score': array([0.91666667]),
'split5_train_score': array([0.84]),
'split6_test_score': array([0.9]),
'split6_train_score': array([0.88235294]),
'split7_test_score': array([0.8]),
'split7_train_score': array([0.88235294]),
'split8_test_score': array([0.8]),
'split8_train_score': array([0.89215686]),
'split9_test_score': array([0.9]),
'split9_train_score': array([0.88235294]),
'std_fit_time': array([0.]),
'std_score_time': array([0.]),
'std_test_score': array([0.05832118]),
'std_train_score': array([0.02175538])}
nb.fit(x_train, y_train)
nb.score(x_test, y_test)
0.8157894736842105
gs.score(x_test, y_test)
0.8157894736842105
gs.param_grid = {'alpha': [0.1, 2]}
gs.fit(x_train, y_train)
gs.score(x_test, y_test)
0.8421052631578947
gs.cv_results_
{'mean_fit_time': array([0.00090394, 0.00049713]),
'mean_score_time': array([0.00029924, 0.0003005 ]),
'mean_test_score': array([0.86607143, 0.85714286]),
'mean_train_score': array([0.86092157, 0.85494118]),
'param_alpha': masked_array(data=[0.1, 2],
mask=[False, False],
fill_value='?',
dtype=object),
'params': [{'alpha': 0.1}, {'alpha': 2}],
'rank_test_score': array([1, 2]),
'split0_test_score': array([0.91666667, 0.91666667]),
'split0_train_score': array([0.84, 0.83]),
'split1_test_score': array([0.75, 0.75]),
'split1_train_score': array([0.86, 0.86]),
'split2_test_score': array([0.83333333, 0.83333333]),
'split2_train_score': array([0.85, 0.84]),
'split3_test_score': array([0.91666667, 0.91666667]),
'split3_train_score': array([0.83, 0.81]),
'split4_test_score': array([0.83333333, 0.83333333]),
'split4_train_score': array([0.85, 0.84]),
'split5_test_score': array([0.91666667, 0.91666667]),
'split5_train_score': array([0.84, 0.84]),
'split6_test_score': array([0.9, 0.9]),
'split6_train_score': array([0.88235294, 0.88235294]),
'split7_test_score': array([0.9, 0.8]),
'split7_train_score': array([0.88235294, 0.88235294]),
'split8_test_score': array([0.8, 0.8]),
'split8_train_score': array([0.89215686, 0.89215686]),
'split9_test_score': array([0.9, 0.9]),
'split9_train_score': array([0.88235294, 0.87254902]),
'std_fit_time': array([0.00030147, 0.00049713]),
'std_score_time': array([0.00045711, 0.00045921]),
'std_test_score': array([0.05651628, 0.05832118]),
'std_train_score': array([0.02103457, 0.02556351])}
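Side note: if your scikit-learn version is recent enough that GaussianNB exposes a var_smoothing parameter, there is something to grid search over after all. A minimal sketch, assuming the skf and the iris split defined above (the grid values are only illustrative):
import numpy as np
from sklearn.naive_bayes import GaussianNB
# search over var_smoothing, reusing skf and the iris split from above
gnb_gs = GridSearchCV(GaussianNB(),
                      param_grid={'var_smoothing': np.logspace(-9, -3, 7)},
                      cv=skf)
gnb_gs.fit(x_train, y_train)
print(gnb_gs.best_params_)
print(gnb_gs.score(x_test, y_test))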

How about something like this:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
# because only var_smoothing can be 'tuned',
# do a cross validation on different var_smoothing values
def cross_val(params):
    model = GaussianNB()
    model.set_params(**params)
    cv_results = cross_val_score(model, X_train, y_train,
                                 cv=10,  # 10 folds
                                 scoring="accuracy",
                                 verbose=2)
    # return the mean of the 10-fold cross validation
    return cv_results.mean()
# baseline parameters
params = {
    "priors": None,  # None, not the string "None"
    "var_smoothing": 1e-9
}
# create a list of var_smoothing values to cross-validate
steps = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4]
# will contain the cv results
results = []
for step in steps:
    params["var_smoothing"] = step
    cv_result = cross_val(params)
    # save result
    results.append(cv_result)
# print results
# convert results to a pandas dataframe for easier visualization
df = pd.DataFrame({"var_smoothing": steps, "accuracy": results})
# sort it
df_sorted = df.sort_values("accuracy", ascending=False)
# reset the index of the sorted dataframe
df_sorted.reset_index(inplace=True, drop=True)
df_sorted.head()
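To address the second part of the question (seeing how the model performs on the test data set aside initially), here is a minimal follow-up sketch, assuming the X_train/X_test/y_train/y_test split from the question and the df_sorted built above:
from sklearn.naive_bayes import GaussianNB
# refit on the full training set with the best var_smoothing found above
best_vs = df_sorted.loc[0, "var_smoothing"]
best_model = GaussianNB(var_smoothing=best_vs)
best_model.fit(X_train, y_train)
# evaluate on the held-out test set
y_pred_nb2 = best_model.predict(X_test)
y_scores_nb2 = best_model.predict_proba(X_test)[:, 1]
print("test accuracy:", best_model.score(X_test, y_test))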

Related

ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.

I am trying to use grid search cross-validation to find the best value of the hyperparameter C. I split the dataset into two subsets, each containing 50% of MNIST 784, and used only one of the two subsets, with 60% for training and 40% for testing respectively.
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
import numpy as np
mnist = fetch_openml('mnist_784')
X, y = mnist['data'], mnist['target']
X_1, X_2, y_1, y_2 = train_test_split(X, y, test_size=0.5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.4)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
svm = LinearSVC(dual=False, max_iter=10000)
param_grid = {'C': [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]}
grid = GridSearchCV(svm, param_grid, scoring='accuracy')
grid.fit(X_train_scaled, y_train)
print("Best value of C:", grid.best_params_['C'])
accuracy = grid.score(X_test_scaled, y_test)
print("Test accuracy:", accuracy)
I have tried everything without any progress.
I tried reducing the data and then normalizing it, and also tried increasing max_iter to 10000 and 15000.

SKLearn & ElasticNet: Cross validation fails when using Accuracy as a metric

I have a binary classification problem. I've been using cross validation to optimize the ElasticNet parameters. However, ElasticNet only seems to work when I supply roc_auc as the scoring method to be used during CV. I also want to test out a wide range of other scoring methods, in particular accuracy. Specifically, when using accuracy, ElasticNet returns this error:
ValueError: Classification metrics can't handle a mix of binary and continuous targets
However my y targets are indeed binary. Below is a replication of my problem using the dataset from here:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
data = pd.read_csv('data 2.csv')
# by default majority class (benign) will be negative
lb = LabelBinarizer()
data['diagnosis'] = lb.fit_transform(data['diagnosis'].values)
targets = data['diagnosis']
data.drop(['id', 'diagnosis', 'Unnamed: 32'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(data, targets, stratify=targets)
#elastic net logistic regression
lr = ElasticNet(max_iter=2000)
scorer = 'accuracy'
param_grid = {
    'alpha': [1e-4, 1e-3, 1e-2, 0.01, 0.1, 1, 5, 10],
    'l1_ratio': np.arange(0.2, 0.9, 0.1)
}
skf = StratifiedKFold(n_splits=10)
clf = GridSearchCV(lr, param_grid, scoring=scorer, cv=skf,
                   return_train_score=True, n_jobs=-1)
clf.fit(X_train.values, y_train.values)
I figured that ElasticNet might be trying to solve a linear regression problem, so I tried lr = LogisticRegression(penalty='elasticnet', l1_ratios=[0.1, 0.5, 0.9], solver='saga') as the classifier, but the same problem persists.
If I use as the scoring metric scorer = 'roc_auc' then the model is built as expected.
Also, as a sanity check to see if there is something wrong with the data, I tried the same thing with a random forest classifier, and there the problem disappears:
# random forest
clf = RandomForestClassifier(n_jobs=-1)
param_grid = {
    'min_samples_split': [3, 5, 10],
    'n_estimators': [100, 300],
    'max_depth': [3, 5, 15, 25],
    'max_features': [3, 5, 10, 20]
}
skf = StratifiedKFold(n_splits=10)
scorer = 'accuracy'
grid_search = GridSearchCV(clf, param_grid, scoring=scorer,
                           cv=skf, return_train_score=True, n_jobs=-1)
grid_search.fit(X_train.values, y_train.values)
Has anyone got any ideas on what's happening here?
ElasticNet is a regression model.
If you want an ElasticNet penalty in classification, use LogisticRegression:
lr = LogisticRegression(solver="saga", penalty="elasticnet")
Minimal Reproducible Example:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
lr = LogisticRegression(solver="saga", penalty="elasticnet", max_iter=2000)
param_grid = {
    'l1_ratio': np.arange(0.2, 0.9, 0.1)
}
clf = GridSearchCV(lr, param_grid, scoring='accuracy',
                   cv=StratifiedKFold(n_splits=10),
                   return_train_score=True, n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
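Applied to the question's own grid, a hedged sketch (assuming the X_train/y_train from the question; note that LogisticRegression regularizes via C, the inverse of the regularization strength, so the original alpha values do not carry over one-to-one and the C grid below is only illustrative):
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
lr = LogisticRegression(solver="saga", penalty="elasticnet", max_iter=2000)
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],          # inverse regularization strength
    'l1_ratio': np.arange(0.2, 0.9, 0.1),  # same l1_ratio grid as before
}
clf = GridSearchCV(lr, param_grid, scoring='accuracy',
                   cv=StratifiedKFold(n_splits=10),
                   return_train_score=True, n_jobs=-1)
clf.fit(X_train.values, y_train.values)
print(clf.best_params_)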

'GridSearchCV' object has no attribute 'best_params_' when using LogisticRegression

Below is the code that I am trying to execute
# Train a logistic regression model, report the coefficients and model performance
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
clf = LogisticRegression().fit(X_train, y_train)
params = {'penalty': ['l1', 'l2'], 'dual': [True, False],
          'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
          'fit_intercept': [True, False], 'solver': ['saga']}
gridlog = GridSearchCV(clf, params, cv=5, n_jobs=2, scoring='roc_auc')
cv_scores = cross_val_score(gridlog, X_train, y_train)
#find best parameters
print('Logistic Regression parameters: ',gridlog.best_params_) # throws error
The last line of code above is where the error is thrown. I have used this exact same code to run other models. Any idea why I might be facing this issue?
You need to fit gridlog first. cross_val_score will not do this; it returns the scores and nothing else.
Hence, since gridlog isn't trained, it throws the error.
Below code works perfectly fine:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
diabetes = datasets.load_breast_cancer()
x = diabetes.data[:150]
y = diabetes.target[:150]
clf = LogisticRegression().fit(x, y)
params = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
gridlog = GridSearchCV(clf, params, cv=2, n_jobs=2,
                       scoring='roc_auc')
gridlog.fit(x,y) # <- missing in your code
cv_scores = cross_val_score(gridlog, x, y)
print(cv_scores)
#find best parameters
print('Logistic Regression parameters: ',gridlog.best_params_)
# result:
Logistic Regression parameters:  {'C': 1}
Your code should be updated so that the LogisticRegression classifier itself is passed to the grid search (not an already fitted instance):
from sklearn.datasets import load_breast_cancer # For example only
X_train, y_train = load_breast_cancer(return_X_y=True)
params = {'penalty': ['l1', 'l2'], 'dual': [True, False],
          'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
          'fit_intercept': [True, False], 'solver': ['saga']}
gridlog = GridSearchCV(LogisticRegression(), params, cv=5, n_jobs=2, scoring='roc_auc')
gridlog.fit(X_train, y_train)
#find best parameters
print('Logistic Regression parameters: ', gridlog.best_params_) # Now it displays all the parameters selected by the grid search
Results
Logistic Regression parameters: {'C': 0.1, 'dual': False, 'fit_intercept': True, 'penalty': 'l2', 'solver': 'saga'}
Note that, as @desertnaut pointed out, you don't use cross_val_score with GridSearchCV.
See a complete example of how to use GridSearchCV here.
The example uses an SVC classifier instead of a LogisticRegression, but the approach is the same.
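If the reason you reached for cross_val_score was to see the individual fold scores, a minimal sketch of pulling them out of the fitted grid search instead (assuming the gridlog fitted above with cv=5):
best = gridlog.best_index_
# per-fold ROC AUC for the best parameter combination
fold_scores = [gridlog.cv_results_['split%d_test_score' % i][best] for i in range(5)]
print(fold_scores)
print(gridlog.cv_results_['mean_test_score'][best])  # same value as gridlog.best_score_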

How to do only simple cross validation using GridSearchCV

I am using the code below to perform both simple cross validation and K-fold cross validation:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import numpy as np
# our hyperparameters to choose from
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2]
n_estimators = [30, 50, 100, 150, 200]
param_grid = dict(learning_rate = learning_rate, n_estimators = n_estimators)
xgb_model = xgb.XGBClassifier(random_state=42, n_jobs = -1)
clf = GridSearchCV(xgb_model, param_grid, scoring = 'roc_auc', cv=3, return_train_score=True)
sc = clf.fit(X_train, y_train)
# getting all the results
scores = clf.cv_results_
# getting train scores and cross validation scores
train_score = scores['mean_train_score']
cv_score = scores['mean_test_score']
Access the classifier trained with the best set of hyper-parameters, then call the score method, which will make predictions from X_cv and score accuracy compared to y_cv:
clf.best_estimator_.score(X_cv,y_cv)
If you just want the predictions, then call the predict method instead with X_cv as argument.
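A minimal sketch, assuming X_cv and y_cv are the hold-out arrays referred to above:
best_xgb = clf.best_estimator_        # refit on the whole training set (refit=True by default)
print(best_xgb.score(X_cv, y_cv))     # accuracy on the hold-out set
y_pred_cv = best_xgb.predict(X_cv)    # just the predicted labels
Note that clf.score(X_cv, y_cv) would instead apply the ROC AUC scorer, because scoring='roc_auc' was passed to GridSearchCV.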

Results from GridSearchCV/RandomizedSearchCV cannot be reproduced by running a single model using the same parameters

I am running RandomizedSearchCV with 5 folds in order to find the best parameters. I have a hold-out set (X_test) that I use for prediction. The relevant portion of my code is:
svc= SVC(class_weight=class_weights, random_state=42)
Cs = [0.01, 0.1, 1, 10, 100, 1000, 10000]
gammas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
param_grid = {'C': Cs,
              'gamma': gammas,
              'kernel': ['linear', 'rbf', 'poly']}
my_cv = TimeSeriesSplit(n_splits=5).split(X_train)
rs_svm = RandomizedSearchCV(SVC(), param_grid, cv=my_cv, scoring='accuracy',
                            refit='accuracy', verbose=3, n_jobs=1, random_state=42)
rs_svm.fit(X_train, y_train)
y_pred = rs_svm.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
clfreport = classification_report(y_test, y_pred)
print (rs_svm.best_params_)
The result is a classification report (omitted here).
Now, I am interested in reproducing this result using a stand-alone model (no RandomizedSearchCV) with the selected parameters:
from sklearn.model_selection import TimeSeriesSplit
tcsv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tcsv.split(X_train):
    train_index_ = int(train_index.shape[0])
    test_index_ = int(test_index.shape[0])
    X_train_, y_train_ = X_train[0:train_index_], y_train[0:train_index_]
    X_test_, y_test_ = X_train[test_index_:], y_train[test_index_:]
    class_weights = compute_class_weight('balanced', np.unique(y_train_), y_train_)
    class_weights = dict(enumerate(class_weights))
    svc = SVC(C=0.01, gamma=0.1, kernel='linear', class_weight=class_weights, verbose=True,
              random_state=42)
    svc.fit(X_train_, y_train_)
    y_pred_ = svc.predict(X_test)
    cm = confusion_matrix(y_test, y_pred_)
    clfreport = classification_report(y_test, y_pred_)
In my understanding, the clfreports should be identical, but my results after this run differ (report omitted here).
Does anyone have any suggestions why that might be happening?
Given your first code snippet, where you use RandomizedSearchCV to find the best hyperparameters, you don't need to do any splitting again; so, in your second snippet, you should just fit using the found hyperparameters and the class weights on the whole of your training set, and then predict on your test set:
class_weights = compute_class_weight('balanced', np.unique(y_train), y_train)
class_weights = dict(enumerate(class_weights))
svc= SVC(C=0.01, gamma=0.1, kernel='linear', class_weight=class_weights, verbose=True, random_state=42)
svc.fit(X_train, y_train)
y_pred_ = svc.predict(X_test)
cm = confusion_matrix(y_test, y_pred_)
clfreport = classification_report(y_test, y_pred_)
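As a side note, a hedged sketch: because refit is enabled in your search (refit='accuracy', and refit defaults to True anyway), rs_svm.best_estimator_ has already been refitted on the whole training set, so your original rs_svm.predict(X_test) call is the equivalent shortcut, apart from class_weight, which the search above was run without:
from sklearn.metrics import classification_report
best_svc = rs_svm.best_estimator_        # already refitted on all of X_train, y_train
y_pred_best = best_svc.predict(X_test)   # equivalent to rs_svm.predict(X_test)
print(classification_report(y_test, y_pred_best))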
The discussion in Order between using validation, training and test sets might be useful for clarifying the procedure...
