Related
I am following along with the book titled: Hands-On Machine Learning with SciKit-Learn, Keras and TensorFlow by Aurelien Geron (link). In chapter 2 you get hands on with actually building an ML system using a dataset from StatLib's California Housing Prices (link).
I have been running cross validation tests using BOTH GridSearchCV and RandomSearchCV to test and see which performs better (they both perform about the same, depending on the run GridSearch will perform better than RandomSearch and vice versa). During my cross validation of the training set, all of my RMSE's come back (after about 10 folds) looking like so:
49871.10156541779 {'max_features': 6, 'n_estimators': 100} GRID SEARCH CV
49573.67188289324 {'max_features': 6, 'n_estimators': 300} GRID SEARCH CV
49759.116323927 {'max_features': 8, 'n_estimators': 100} GRID SEARCH CV
49388.93702859155 {'max_features': 8, 'n_estimators': 300} GRID SEARCH CV
49759.445071611895 {'max_features': 10, 'n_estimators': 100} GRID SEARCH CV
49517.74394767381 {'max_features': 10, 'n_estimators': 300} GRID SEARCH CV
49796.22587441326 {'max_features': 12, 'n_estimators': 100} GRID SEARCH CV
49616.61833604992 {'max_features': 12, 'n_estimators': 300} GRID SEARCH CV
49795.571075148444 {'max_features': 14, 'n_estimators': 300} GRID SEARCH CV
49790.38581725693 {'n_estimators': 100, 'max_features': 12} RANDOM SEARCH CV
49462.758078362356 {'n_estimators': 300, 'max_features': 8} RANDOM SEARCH CV
Please note that I am selecting the best results out of about 50 or so results to present here. I am using the following code to generate this:
param_grid = [{'n_estimators' : [3, 10, 30, 100, 300],
'max_features' : [2, 4, 6, 8, 10, 12, 14]},
{'bootstrap' : [False], 'n_estimators' : [3, 10, 12],
'max_features' : [2, 3, 4]}]
forest_regressor = RandomForestRegressor({'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse',
'max_depth': None, 'max_features': 8, 'max_leaf_nodes': None,
'max_samples': None, 'min_impurity_decrease': 0.0,
'min_impurity_split': None, 'min_samples_leaf': 1,
'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0,
'n_estimators': 300, 'n_jobs': None, 'oob_score': False,
'random_state': None, 'verbose': 0, 'warm_start': False})
grid_search = GridSearchCV(forest_regressor, param_grid, cv=10, scoring="neg_mean_squared_error",
return_train_score=True, refit=True)
grid_search.fit(Dataframe, TrainingLabels)
prediction = grid_search.predict(Dataframe)
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
print(np.sqrt(-mean_score), params, "GRID SEARCH CV")
##################################################################################
#Randomized Search Cross Validation
param_grid = [{'n_estimators' : [3, 10, 30, 100, 300],
'max_features' : [2, 4, 6, 8, 10, 12, 14]},
{'bootstrap' : [False], 'n_estimators' : [3, 10, 12],
'max_features' : [2, 3, 4]}]
forest_regressor = RandomForestRegressor({'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse',
'max_depth': None, 'max_features': 8, 'max_leaf_nodes': None,
'max_samples': None, 'min_impurity_decrease': 0.0,
'min_impurity_split': None, 'min_samples_leaf': 1,
'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0,
'n_estimators': 300, 'n_jobs': None, 'oob_score': False,
'random_state': None, 'verbose': 0, 'warm_start': False})
rand_search = RandomizedSearchCV(forest_regressor, param_grid, cv=10, refit=True,
scoring='neg_mean_squared_error', return_train_score=True)
rand_search.fit(Dataframe, TrainingLabels)
prediction = rand_search.predict(Dataframe)
cvres = rand_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
print(np.sqrt(-mean_score), params, "RANDOM SEARCH CV")
Now, I am doing things a little differently than what the book states; my pipeline looks as such:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
class Dataframe_Manipulation:
def __init__(self):
self.dataframe = pd.read_csv(r'C:\Users\bohayes\AppData\Local\Programs\Python\Python38\Excel and Text\housing.csv')
def Cat_Creation(self):
# Creation of an Income Category to organize the median incomes into strata (bins) to sample from
self.income_cat = self.dataframe['income_category'] = pd.cut(self.dataframe['median_income'],
bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
labels=[1, 2, 3, 4, 5])
self.rooms_per_house_cat = self.dataframe['rooms_per_house'] = self.dataframe['total_rooms']/self.dataframe['households']
self.bedrooms_per_room_cat = self.dataframe['bedrooms_per_room'] = self.dataframe['total_bedrooms']/self.dataframe['total_rooms']
self.pop_per_house = self.dataframe['pop_per_house'] = self.dataframe['population'] / self.dataframe['households']
return self.dataframe
def Fill_NA(self):
self.imputer = KNNImputer(n_neighbors=5, weights='uniform')
self.dataframe['total_bedrooms'] = self.imputer.fit_transform(self.dataframe[['total_bedrooms']])
self.dataframe['bedrooms_per_room'] = self.imputer.fit_transform(self.dataframe[['bedrooms_per_room']])
return self.dataframe
def Income_Cat_Split(self):
self.inc_cat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for self.train_index, self.test_index in self.inc_cat_split.split(self.dataframe, self.dataframe['income_category']):
self.strat_train_set = self.dataframe.loc[self.train_index].reset_index(drop=True)
self.strat_test_set = self.dataframe.loc[self.test_index].reset_index(drop=True)
# the proportion is the % of total instances and which strata they are assigned to
self.proportions = self.strat_test_set['income_category'].value_counts() / len(self.strat_test_set)
# Only pulling out training set!!!!!!!!!!!!!!!
return self.strat_train_set, self.strat_test_set
def Remove_Cats_Test(self):
self.test_labels = self.strat_test_set['median_house_value'].copy()
self.strat_test_set = self.strat_test_set.drop(['median_house_value'], axis=1)
return self.test_labels
def Remove_Cats_Training(self):
self.training_labels = self.strat_train_set['median_house_value'].copy()
self.strat_train_set = self.strat_train_set.drop(['median_house_value'], axis=1)
return self.training_labels
def Encode_Transform(self):
self.column_trans = make_column_transformer((OneHotEncoder(), ['ocean_proximity']), remainder='passthrough')
self.training_set_encoded = self.column_trans.fit_transform(self.strat_train_set)
self.test_set_encoded = self.column_trans.fit_transform(self.strat_test_set)
return self.training_set_encoded, self.test_set_encoded
def Standard_Scaler(self):
self.scaler = StandardScaler()
self.scale_training_set = self.scaler.fit(self.training_set_encoded)
self.scale_test_set = self.scaler.fit(self.test_set_encoded)
self.scaled_training_set = self.scaler.transform(self.training_set_encoded)
self.scaled_test_set = self.scaler.transform(self.test_set_encoded)
return self.scaled_training_set
def Test_Set(self):
return self.scaled_test_set
A = Dataframe_Manipulation()
B = A.Cat_Creation()
C = A.Fill_NA()
D = A.Income_Cat_Split()
TestLabels = A.Remove_Cats_Test()
TrainingLabels = A.Remove_Cats_Training()
G = A.Encode_Transform()
TrainingSet = A.Standard_Scaler()
TestSet = A.Test_Set()
The Grid and Random Searches come after this bit, however my RMSE scores come back drastically different when I test them on the TestSet, which leads me to believe that I am overfitting, however maybe the RSME's look different because I am using a smaller test set? Here you go:
19366.910530221918
19969.043158986697
Now here is the code that generates that: and it comes after I run Grid and Random Searches and fit the test labels and test set to the model:
#Final Grid Model
final_grid_model = grid_search.best_estimator_
final_grid_prediction = final_grid_model.predict(TestSet)
final_grid_mse = mean_squared_error(TestLabels, final_grid_prediction)
final_grid_rmse = np.sqrt(final_grid_mse)
print(final_grid_rmse)
###################################################################################
#Final Random Model
final_rand_model = rand_search.best_estimator_
final_rand_prediction = final_rand_model.predict(TestSet)
final_rand_mse = mean_squared_error(TestLabels, final_rand_prediction)
final_rand_rmse = np.sqrt(final_rand_mse)
print(final_rand_rmse)
Just to make sure I also did a confidence score on the model as well and these are the code and results:
#Confidence Grid Search
confidence = 0.95
squared_errors = (final_grid_prediction - TestLabels) ** 2
print(np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
loc=squared_errors.mean(),
scale=stats.sem(squared_errors))))
###################################################################################
#Confidence Random Search
confidence1 = 0.95
squared_errors1 = (final_rand_prediction - TestLabels) ** 2
print(np.sqrt(stats.t.interval(confidence1, len(squared_errors1) - 1,
loc=squared_errors1.mean(),
scale=stats.sem(squared_errors1))))
>>>[18643.4914044 20064.26363526]
[19222.30464011 20688.84660134]
Why is it that my average RMSE score on the TrainingSet is about 49,000 and that same score on the test set is averaging at about 19,000? I must be overfitting, but I am not sure how or where I am going wrong.
tl;dr: Your code is unnecessarily convoluted for such a (standard) job; do not re-invent the wheel, go with a pipeline instead.
There is an error in how you scale your data, which most probably is the root cause of the observed behavior here; in the second line:
self.scale_training_set = self.scaler.fit(self.training_set_encoded)
self.scale_test_set = self.scaler.fit(self.test_set_encoded)
you essentially overwrite your scaler with the results on the test set fit, and subsequently you actually scale your training data with this test-fitted scaler:
self.scaled_training_set = self.scaler.transform(self.training_set_encoded)
Since your test set is only 20% of the dataset, what happens is that it does not contain enough values to adequately cover the whole range (min-max) of the (bigger) training set; as a result, the training set is mis-scaled (actually containing values well above the max value of the test set), which probably leads to a higher RMSE (which is not scale invariant, and by definition depends on the scale pf the predictions).
You may think that using StratifiedShuffleSplit upstream should have protected you from such a case, but truth is that StratifiedShuffleSplit is only good for classification datasets, and it is actually meaningless in regression ones (I am genuinely surprised that it does not throw an error here).
To remedy this issue, you should just remove the line
self.scale_test_set = self.scaler.fit(self.test_set_encoded)
from your Standard_Scaler() function.
Keep in mind that, in general, we never fit on a test set - we only transform; scikit-learn pipelines, apart from saving you from having to write all this boilerplate code (thus increasing the probability of coding errors), will protect you from this kind of error...
I have 2 regressors:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
params = {
'num_leaves': [7, 14, 21, 28, 31, 50],
'learning_rate': [0.1, 0.03, 0.003],
'max_depth': [-1, 3, 5],
'n_estimators': [50, 100, 200, 500],
}
grid = GridSearchCV(lgb.LGBMRegressor(random_state=0), params, scoring='r2', cv=5)
grid.fit(X_train, y_train)
reg = lgb.LGBMRegressor(random_state=0)
As you see, I defined a random_state for both regressors. GridSearchCV must find the best params for estimator to increace its scroring. But
r2_score(y_train, grid.predict(X_train)) # output is 0.69
r2_score(y_train, reg.predict(X_train)) # output is 0.84
So, how can find best params for LGBMRegressor?
Based on the documentation here, after calling grid.fit() you can find the best estimator (ready model) and params here:
grid.best_estimator_
grid.best_params_
FYI: random_state works just for random cases (when shuffling for example).
In your case the params for models are different and the results of your metric R2 are accordingly different.
So, I believe you would have to script it like:
params = {
'num_leaves': [7, 14, 21, 28, 31, 50],
'learning_rate': [0.1, 0.03, 0.003],
'max_depth': [-1, 3, 5],
'n_estimators': [50, 100, 200, 500],
}
grid = GridSearchCV(lgb.LGBMRegressor(random_state=0), params, scoring='r2', cv=5)
grid.fit(X_train, y_train)
reg = lgb.LGBMRegressor(random_state=0)
reg.fit(X_train,y_train)
lgbm_tuned = grid.best_estimator_
r2_tuned = grid.best_params_
r2_regular = r2_score(y_train, reg.predict(X_train))
when r2_tuned is the best score found with Grid Search, lgbm_tuned is your model defined with the best parameters and r2_regular is your score with default parameters.
It is weird to find a worst result after gridsearch, specially when the parameters for the gridsearch includes the default parameters for LightGBM.
I am running this:
# Hyperparameter tuning - Random Forest #
# Hyperparameters' grid
parameters = {'n_estimators': list(range(100, 250, 25)), 'criterion': ['gini', 'entropy'],
'max_depth': list(range(2, 11, 2)), 'max_features': [0.1, 0.2, 0.3, 0.4, 0.5],
'class_weight': [{0: 1, 1: i} for i in np.arange(1, 4, 0.2).tolist()], 'min_samples_split': list(range(2, 7))}
# Instantiate random forest
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state=0)
# Execute grid search and retrieve the best classifier
from sklearn.model_selection import GridSearchCV
classifiers_grid = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='balanced_accuracy',
cv=5, refit=True, n_jobs=-1)
classifiers_grid.fit(X, y)
and I am receiving this warning:
.../anaconda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py:536:
FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
TypeError: '<' not supported between instances of 'str' and 'int'
Why is this and how can I fix it?
I had similar issue of FitFailedWarning with different details, after many runs I found, the parameter value passing has the error, try
parameters = {'n_estimators': [100,125,150,175,200,225,250],
'criterion': ['gini', 'entropy'],
'max_depth': [2,4,6,8,10],
'max_features': [0.1, 0.2, 0.3, 0.4, 0.5],
'class_weight': [0.2,0.4,0.6,0.8,1.0],
'min_samples_split': [2,3,4,5,6,7]}
This will pass for sure, for me it happened in XGBClassifier, somehow the values datatype mixing up
One more is if the value exceeds the range, for example in XGBClassifier 'subsample' paramerters max value is 1.0, if it is set as 1.1, FitFailedWarning will occur
For me this was giving same error but after removing none from max_dept it is fitting properly.
param_grid={'n_estimators':[100,200,300,400,500],
'criterion':['gini', 'entropy'],
'max_depth':['None',5,10,20,30,40,50,60,70],
'min_samples_split':[5,10,20,25,30,40,50],
'max_features':[ 'sqrt', 'log2'],
'max_leaf_nodes':[5,10,20,25,30,40,50],
'min_samples_leaf':[1,100,200,300,400,500]
}
code which is running properly:
param_grid={'n_estimators':[100,200,300,400,500],
'criterion':['gini', 'entropy'],
'max_depth':[5,10,20,30,40,50,60,70],
'min_samples_split':[5,10,20,25,30,40,50],
'max_features':[ 'sqrt', 'log2'],
'max_leaf_nodes':[5,10,20,25,30,40,50],
'min_samples_leaf':[1,100,200,300,400,500]
}
I too got same error and when I passed hyperparameters as in MachineLearningMastery, I got output without warning...
Try this way if anyone get similar issues...
# grid search logistic regression model on the sonar dataset
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
# load dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
dataframe = read_csv(url, header=None)
# split into input and output elements
data = dataframe.values
X, y = data[:, :-1], data[:, -1]
# define model
model = LogisticRegression()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search space
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=-1, cv=cv)
# execute search
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
Make sure the y-variable is an int, not bool or str.
Change your last line of code to make the y series a 0 or 1, for example:
classifiers_grid.fit(X, list(map(int, y)))
I need some help to do a bagging aggregation of different XGBoost models (with types Booster). The idea is after to store one model, the final one, into a pickle file.
I start by creating a dummy dataframe:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold
import pickle
dummy_df = pd.DataFrame(np.random.randn(100, 3), columns=list('ABC'))
dummy_df['D'] = -15 * dummy_df['A'] + 32 * dummy_df['B']
X = dummy_df.drop('D', axis=1)
y = dummy_df['D']
I establish some parameters I'd like to test (resulting for instance from a gridsearch):
params = {'eta': 0.06, # learning rate
'tree_method': "auto",#considering my dummy df, might be more interesting to use "gblinear" of course...
'max_depth': 3,
'subsample': 0.75,
'colsample_bytree': 0.75,
'colsample_bylevel': 0.75,
'min_child_weight': 5,
'alpha': 10,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'random_state': 99,
'silent': True}
Finally, I create my cross-validation scheme:
accu = 0
n_splits = 5
folds = KFold(n_splits=n_splits, shuffle=True, random_state=1)
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
train_x, train_y = X.iloc[train_idx], y.iloc[train_idx]
valid_x, valid_y = X.iloc[valid_idx], y.iloc[valid_idx]
dtrain = xgb.DMatrix(train_x, train_y)
dvalid = xgb.DMatrix(valid_x, valid_y)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
model = xgb.train(params, dtrain, 2500, watchlist, maximize=False, early_stopping_rounds=40, verbose_eval=50)
if accu == 0:
model_to_save = model
accu += 1
else:
model_to_save += model
It trains properly for the first and second iterations in my for loop, but when needs to add the 2 first iterations (final line), I get the following error:
TypeError: unsupported operand type(s) for +=: 'Booster' and 'Booster'
Is there any way in Python to add 2 Boosters? And also to divide a Booster by an integer since I'll have to divide at the end model_to_save by n_splits?
PS: Storing all the XGBoost models is not an option considering other constraints I can face later on.
params = {'eta': 0.06, # learning rate
'tree_method': "auto",#considering my dummy df, might be more interesting to use "gblinear" of course...
'max_depth': 3,
'subsample': 0.75,
'colsample_bytree': 0.75,
'colsample_bylevel': 0.75,
'min_child_weight': 5,
'alpha': 10,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'random_state': 99,
'silent': True}
I'm trying to train a XGBoost model using the params below:
xgb_params = {
'objective': 'binary:logistic',
'eval_metric': 'auc',
'lambda': 0.8,
'alpha': 0.4,
'max_depth': 10,
'max_delta_step': 1,
'verbose': True
}
Since my input data is too big to be fully loaded into the memory, I adapt the incremental training:
xgb_clf = xgb.train(xgb_params, input_data, num_boost_round=rounds_per_batch,
xgb_model=model_path)
The code for prediction is
xgb_clf = xgb.XGBClassifier()
booster = xgb.Booster()
booster.load_model(model_path)
xgb_clf._Booster = booster
raw_probas = xgb_clf.predict_proba(x)
The result seemed good. But when I tried to invoke xgb_clf.get_xgb_params(), I got a param dict in which all params were set to default values.
I can guess that the root cause is when I initialized the model, I didn't pass any params in. So the model was initialized using the default values but when it predicted, it used an internal booster that had been fitted using some pre-defined params.
However, I wonder is there any way that, after I assign a pre-trained booster model to a XGBClassifier, I can see the real params that are used to train the booster, but not those which are used to initialize the classifier.
You seem to be mixing the sklearn API with the functional API in your code, if you stick to either one you should get the parameters to persist in the pickle. Here's an example using the sklearn API.
import pickle
import numpy as np
import xgboost as xgb
from sklearn.datasets import load_digits
digits = load_digits(2)
y = digits['target']
X = digits['data']
xgb_params = {
'objective': 'binary:logistic',
'reg_lambda': 0.8,
'reg_alpha': 0.4,
'max_depth': 10,
'max_delta_step': 1,
}
clf = xgb.XGBClassifier(**xgb_params)
clf.fit(X, y, eval_metric='auc', verbose=True)
pickle.dump(clf, open("xgb_temp.pkl", "wb"))
clf2 = pickle.load(open("xgb_temp.pkl", "rb"))
assert np.allclose(clf.predict(X), clf2.predict(X))
print(clf2.get_xgb_params())
which produces
{'base_score': 0.5,
'colsample_bylevel': 1,
'colsample_bytree': 1,
'gamma': 0,
'learning_rate': 0.1,
'max_delta_step': 1,
'max_depth': 10,
'min_child_weight': 1,
'missing': nan,
'n_estimators': 100,
'objective': 'binary:logistic',
'reg_alpha': 0.4,
'reg_lambda': 0.8,
'scale_pos_weight': 1,
'seed': 0,
'silent': 1,
'subsample': 1}
If you are training like this -
dtrain = xgb.DMatrix(x_train, label=y_train)
model = xgb.train(model_params, dtrain, model_num_rounds)
Then the model returned is a Booster.
import json
json.loads(model.save_config())
the model.save_config() function lists down model parameters in addition to other configurations.
To add to #ytsaig's answer, if you are using early_stopping_rounds argument in clf.fit() method then certain additional parameters are generated but not returned as part of clf.get_xgb_params() method. These can be accessed directly as follows: clf.best_score, clf.best_iteration and clf.best_ntree_limit.
Ref: https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier.fit