Here is how I am using the code below to perform both simple cross-validation and K-fold cross-validation:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import numpy as np

# our hyperparameters to choose from
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2]
n_estimators = [30, 50, 100, 150, 200]
param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)

xgb_model = xgb.XGBClassifier(random_state=42, n_jobs=-1)
clf = GridSearchCV(xgb_model, param_grid, scoring='roc_auc', cv=3, return_train_score=True)
clf.fit(X_train, y_train)

# getting all the results
scores = clf.cv_results_
# getting train scores and cross-validation scores
train_score = scores['mean_train_score']
cv_score = scores['mean_test_score']
Access the classifier trained with the best set of hyperparameters, then call the score method, which will make predictions on X_cv and report accuracy against y_cv:
clf.best_estimator_.score(X_cv, y_cv)
If you just want the predictions, then call the predict method instead with X_cv as argument.
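For reference, here is a short sketch of both options (it assumes the grid search above has already been fitted; X_cv and y_cv are the held-out data as before). Note that with refit=True, the default, the fitted GridSearchCV delegates predict to the best estimator, while clf.score uses the search's scoring metric ('roc_auc' here) rather than plain accuracy:
# accuracy of the refitted best model on the held-out set
val_accuracy = clf.best_estimator_.score(X_cv, y_cv)
# predicted labels for the held-out set (equivalent to clf.predict(X_cv))
y_pred = clf.best_estimator_.predict(X_cv)
# clf.score uses the scoring passed to GridSearchCV, i.e. ROC AUC here
val_auc = clf.score(X_cv, y_cv)
print(clf.best_params_, val_accuracy, val_auc)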
I'm attempting to classify seismic events, which is quite challenging. I get a validation accuracy of 68% using a CNN, but when I try to get a baseline with a random forest I get very poor results, somewhere around 35%. I'm new to using random forests and I'm looking for some help. The data has shape (500, 15001): 500 is the number of samples and 15001 is the number of points in each time series (i.e. the seismic data). The labels have shape (500,). There are 4 different classes, ranging from rockfall to earthquake.
xTrain, xTest, yTrain, yTest = train_test_split(data, data_labels_np, test_size = 0.3, random_state = 2)
num_classes = len(np.unique(yTrain))
scaler = StandardScaler()
xTrain = scaler.fit_transform(xTrain.reshape(-1, xTrain.shape[-1])).reshape(xTrain.shape)
xTest = scaler.transform(xTest.reshape(-1, xTest.shape[-1])).reshape(xTest.shape)
n_estimators = [25]
max_depth = [25]
min_samples_leaf = [2]
bootstrap = [True, False]
param_grid = {
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "min_samples_leaf": min_samples_leaf,
    "bootstrap": bootstrap,
}
rf = RandomForestRegressor(random_state=42)
rf_model = GridSearchCV(estimator=rf, param_grid=param_grid, cv=4, verbose=10, n_jobs=-1,error_score='raise')
rf_model.fit(xTrain, yTrain)
print("Using hyperparameters --> \n", rf_model.best_params_)
rf = RandomForestRegressor(random_state = 42)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, n_iter=50, cv =4, verbose = 10, random_state=42, n_jobs = 10)
rf_random.fit(xTrain, yTrain)
rf_model.best_params_
rf_model.best_score_
rf_model.best_estimator_
print('Best score for training data:', rf_random.best_score_,"\n")
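For reference, since the labels here are 4 discrete classes, a classification-oriented baseline would normally use RandomForestClassifier with a classification metric rather than a regressor. Below is a minimal, hypothetical sketch of such a baseline; it assumes data and data_labels_np as defined above, and the grid values are purely illustrative:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# split and scale as above (the arrays are already 2-D, so no reshaping is needed)
xTrain, xTest, yTrain, yTest = train_test_split(data, data_labels_np, test_size=0.3, random_state=2)
scaler = StandardScaler()
xTrain = scaler.fit_transform(xTrain)
xTest = scaler.transform(xTest)

# a classifier matched to the 4 discrete classes, scored on accuracy
param_grid = {"n_estimators": [100, 200], "max_depth": [None, 25], "min_samples_leaf": [1, 2]}
search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid,
                      cv=4, scoring="accuracy", n_jobs=-1)
search.fit(xTrain, yTrain)
print(search.best_params_, search.best_score_)
print("test accuracy:", search.score(xTest, yTest))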
I would like to use GridSearchCV to fine-tune my SVM model. I copied the code below from other GitHub repositories, and it has been working perfectly fine for my cross-fold validation.
X = Corpus.drop(['text', 'ManipulativeTag', 'compound'], axis=1).values  # !!! this drops compound because of Naive Bayes
y = Corpus['ManipulativeTag'].values
kf = KFold(n_splits=5, shuffle=True, random_state=1)
# Create splits
splits = kf.split(X)
# Access the training and validation indices of splits
kfold_accuracy = {}
kfold_precision = {}
kfold_f = {}
kfold_recall = {}
for i, (train_index, val_index) in enumerate(splits):
    print("Split n°: ", i)
    # Setup the training and validation data
    X_train, y_train = X[train_index], y[train_index]
    # print("training:", train_index, "validations:", val_index)
    X_val, y_val = X[val_index], y[val_index]
    SVM = svm.SVC(C=1.0, kernel='linear', random_state=1111, probability=True)  ### the base estimator
    SVM.fit(X_train, y_train)
    # predict the labels on the validation dataset
    predictions = SVM.predict(X_val)
    # Use accuracy_score function to get the accuracy
    kfold_accuracy[i] = accuracy_score(y_val, predictions)
    kfold_precision[i] = precision_score(y_val, predictions)
    kfold_f[i] = f1_score(y_val, predictions)
    kfold_recall[i] = recall_score(y_val, predictions)
However, when trying to implement GridSearchCV, most of the articles I ran into use train_test_split() rather than my kf.split(), and I am having trouble finding the right place to put the GridSearchCV() call:
GridSearchCV(estimator=classifier,
             param_grid=grid_param,
             scoring='accuracy',
             cv=5,
             n_jobs=-1)
I found my solution here: Grid search and cross validation SVM
I have copied this from the post:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
                     'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]},
                    {'kernel': ['sigmoid'], 'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
                     'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]},
                    {'kernel': ['linear'], 'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]}]
I have kept everything else from my code and only changed the loop by adding GridSearchCV() inside it:
for i, (train_index, val_index) in enumerate(splits):
    print("Split n°: ", i)
    # Setup the training and validation data
    X_train, y_train = X[train_index], y[train_index]
    X_val, y_val = X[val_index], y[val_index]
    # this is where I put GridSearchCV()
    # here cv cannot be 1, so I put 2 instead
    SVM = GridSearchCV(SVC(), tuned_parameters, cv=2, scoring='accuracy')
    SVM.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print()
    print(SVM.best_params_)
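A simpler alternative, sketched below, is to skip the manual loop entirely and hand the KFold object to GridSearchCV through its cv parameter, since cv accepts any cross-validation splitter; the grid search then performs the 5-fold splitting itself (X, y, kf and tuned_parameters are reused from the snippets above):
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVC

kf = KFold(n_splits=5, shuffle=True, random_state=1)

# GridSearchCV runs the same 5-fold split internally for every parameter combination
SVM = GridSearchCV(SVC(), tuned_parameters, cv=kf, scoring='accuracy')
SVM.fit(X, y)

print("Best parameters set found:", SVM.best_params_)
print("Mean cross-validated accuracy of the best model:", SVM.best_score_)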
I have a dataset with multiple outputs and am trying to use gradient boosting to predict all the values at once. I imported MultiOutputRegressor so multiple outputs can be predicted at once; I'm able to make it work for the default gradient boosting function. However, I'm running into an error when I try to optimize the gradient boosting function for each output.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn import ensemble
params = {'max_depth': 3, 'n_estimators': 100, 'learning_rate': 0.1}
gradient_regressor = MultiOutputRegressor(ensemble.GradientBoostingRegressor(**params))
GradBoostModel = gradient_regressor.fit(X_train, y_train)
prediction_GradBoost = GradBoostModel.predict(X_test)
LR = {'learning_rate':[0.15, 0.125, 0.1, 0.75, 0.05], 'n_estimators':[50, 75, 100, 150, 200, 250, 300, 400]}
tuning = GridSearchCV(estimator = GradBoostModel, param_grid = LR, scoring = 'r2')
tuning.fit(X_train, y_train)
tuning.best_params_, tuning.best_score_
I'm trying to use GridSearchCV to cycle through the listed learning rates and number of estimators to find the optimal values. But, I get the following error:
Invalid parameter learning_rate for estimator MultiOutputRegressor.
Check the list of available parameters with `estimator.get_params().keys()`
I think I understand the reason for the error: when I try to optimize the gradient boosting parameters, they are passed through the MultiOutputRegressor, which doesn't recognize them. Is this the case? Also, how can I change my code, such that I can optimize these parameters for each output?
Indeed, the params are prefixed with estimator__. In general, to find out which params to use downstream in your pipeline, call the .get_params().keys() method on your model, e.g.:
print(GradBoostModel.get_params().keys())
dict_keys(['estimator__alpha', 'estimator__ccp_alpha', 'estimator__criterion', 'estimator__init', 'estimator__learning_rate',...
Full working example with the linnerud dataset:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.datasets import load_linnerud
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
# Data
rng = np.random.RandomState(0)
X, y = load_linnerud(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
# Model
params = {'max_depth': 3, 'n_estimators': 100, 'learning_rate': 0.1}
gradient_regressor = MultiOutputRegressor(GradientBoostingRegressor(**params))
GradBoostModel = gradient_regressor.fit(X_train, y_train)
prediction_GradBoost = GradBoostModel.predict(X_test)
LR = {'estimator__learning_rate': [0.15, 0.125, 0.1, 0.75, 0.05], 'estimator__n_estimators': [50, 75, 100, 150, 200, 250, 300, 400]}
print('Params from GradBoostModel', GradBoostModel.get_params().keys())
tuning = GridSearchCV(estimator=GradBoostModel, param_grid=LR, scoring='r2')
tuning.fit(X_train, y_train)
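After the fit above, the usual GridSearchCV attributes are available; a brief usage sketch (the exact values depend on the run):
# best estimator__-prefixed parameter combination and its mean cross-validated r2
print(tuning.best_params_)
print(tuning.best_score_)
# the best model is refitted on the full training data by default and can predict directly
prediction_tuned = tuning.predict(X_test)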
Below is the code that I am trying to execute:
# Train a logistic regression model, report the coefficients and model performance
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
clf = LogisticRegression().fit(X_train, y_train)
params = {'penalty': ['l1', 'l2'], 'dual': [True, False], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
          'fit_intercept': [True, False], 'solver': ['saga']}
gridlog = GridSearchCV(clf, params, cv=5, n_jobs=2, scoring='roc_auc')
cv_scores = cross_val_score(gridlog, X_train, y_train)
#find best parameters
print('Logistic Regression parameters: ',gridlog.best_params_) # throws error
The last code line above is where the error is being thrown from. I have used this exact same code to run other models. Any idea why I may be facing this issue?
You need to fit gridlog first. cross_val_score will not do this; it only returns the scores and nothing else.
Hence, since gridlog isn't fitted, it throws an error.
The code below works perfectly fine:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

cancer = datasets.load_breast_cancer()
x = cancer.data[:150]
y = cancer.target[:150]

clf = LogisticRegression().fit(x, y)
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
gridlog = GridSearchCV(clf, params, cv=2, n_jobs=2, scoring='roc_auc')

gridlog.fit(x, y)  # <- missing in your code
cv_scores = cross_val_score(gridlog, x, y)
print(cv_scores)

# find best parameters
print('Logistic Regression parameters: ', gridlog.best_params_)
# result:
Logistic Regression parameters:  {'C': 1}
Your code should be updated so that the LogisticRegression classifier itself is passed to GridSearchCV (not the result of its fit):
from sklearn.datasets import load_breast_cancer # For example only
X_train, y_train = load_breast_cancer(return_X_y=True)
params = {'penalty': ['l1', 'l2'], 'dual': [True, False], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
          'fit_intercept': [True, False], 'solver': ['saga']}
gridlog = GridSearchCV(LogisticRegression(), params, cv=5, n_jobs=2, scoring='roc_auc')
gridlog.fit(X_train, y_train)
#find best parameters
print('Logistic Regression parameters: ', gridlog.best_params_) # Now it displays all the parameters selected by the grid search
Results
Logistic Regression parameters: {'C': 0.1, 'dual': False, 'fit_intercept': True, 'penalty': 'l2', 'solver': 'saga'}
Note that, as #desertnaut pointed out, you don't use cross_val_score with GridSearchCV.
See a complete example of how to use GridSearch here.
The example uses an SVC classifier instead of a LogisticRegression, but the approach is the same.
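If the aim is simply to inspect cross-validated scores, the fitted grid search already holds them, so wrapping it in cross_val_score is unnecessary. A small sketch using the gridlog fitted above (the pandas import is only for tabulating the results):
import pandas as pd

# mean cross-validated ROC AUC of the best parameter combination
print(gridlog.best_score_)

# scores for every combination on the grid
results = pd.DataFrame(gridlog.cv_results_)
print(results[['params', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False))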
I am running RandomizedSearchCV with 5 folds in order to find the best parameters. I have a hold-out set (X_test) that I use for prediction. The relevant portion of my code is:
svc= SVC(class_weight=class_weights, random_state=42)
Cs = [0.01, 0.1, 1, 10, 100, 1000, 10000]
gammas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
param_grid = {'C': Cs,
              'gamma': gammas,
              'kernel': ['linear', 'rbf', 'poly']}
my_cv = TimeSeriesSplit(n_splits=5).split(X_train)
rs_svm = RandomizedSearchCV(SVC(), param_grid, cv=my_cv, scoring='accuracy',
                            refit='accuracy', verbose=3, n_jobs=1, random_state=42)
rs_svm.fit(X_train, y_train)
y_pred = rs_svm.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
clfreport = classification_report(y_test, y_pred)
print (rs_svm.best_params_)
The result is a classification report for the hold-out set.
Now, I am interested in reproducing this result using a standalone model (no RandomizedSearchCV) with the selected parameters:
from sklearn.model_selection import TimeSeriesSplit
tcsv=TimeSeriesSplit(n_splits=5)
for train_index, test_index in tcsv.split(X_train):
    train_index_ = int(train_index.shape[0])
    test_index_ = int(test_index.shape[0])
    X_train_, y_train_ = X_train[0:train_index_], y_train[0:train_index_]
    X_test_, y_test_ = X_train[test_index_:], y_train[test_index_:]
    class_weights = compute_class_weight('balanced', np.unique(y_train_), y_train_)
    class_weights = dict(enumerate(class_weights))
    svc = SVC(C=0.01, gamma=0.1, kernel='linear', class_weight=class_weights, verbose=True,
              random_state=42)
    svc.fit(X_train_, y_train_)
    y_pred_ = svc.predict(X_test)
    cm = confusion_matrix(y_test, y_pred_)
    clfreport = classification_report(y_test, y_pred_)
In my understanding, the two classification reports should be identical, but the results after this run differ.
Does anyone have any suggestions why that might be happening?
Given your 1st code snippet, where you use RandomizedSearchCV to find the best hyperparameters, you don't need to do any splitting again; so, in your 2nd snippet, you should simply fit on the whole of your training set using the found hyperparameters and class weights, and then predict on your test set:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(class_weights))

svc = SVC(C=0.01, gamma=0.1, kernel='linear', class_weight=class_weights, verbose=True, random_state=42)
svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
clfreport = classification_report(y_test, y_pred)
The discussion in Order between using validation, training and test sets might be useful for clarifying the procedure...