I am trying to fit a LightGBM Regressor in Python and it gives me an error. Basically, I have a dataset where all the predictors are categorical and my target variable is continuous numeric. Since all my X variables are categorical, I converted them into numeric form using label encoding.
After that, I passed my categorical variables to LGBMRegressor so that the algorithm would handle them accordingly.
# lightgbm for regression
import numpy as np
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
df = pd.read_csv("TrainModelling.csv")
df.drop(df.columns[0],axis=1,inplace=True) #Remove index column
y = df["Target"]
X = df.drop("Target", axis=1)
le = preprocessing.LabelEncoder()
X = X.apply(le.fit_transform)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
hyper_params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': ['l2', 'auc'],
'learning_rate': 0.005,
'feature_fraction': 0.9,
'bagging_fraction': 0.7,
'bagging_freq': 10,
'verbose': 0,
"max_depth": 8,
"num_leaves": 128,
"max_bin": 512,
"num_iterations": 100000,
"n_estimators": 1000
}
cat_feature_list = np.where(X.dtypes != float)[0]
gbm = lgb.LGBMRegressor(**hyper_params, categorical_feature=cat_feature_list)
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric='l1',
early_stopping_rounds=1000)
The error:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
This line is problematic:
cat_feature_list = np.where(X.dtypes != float)[0]
(I wish you had shared the whole traceback of the error; it would have saved time.)
X.dtypes != float gives a pandas Series of booleans; its truthiness then gets evaluated as a whole array, hence the error. To get the names of the categorical columns as a list:
cat_feature_list = X.select_dtypes("object").columns.tolist()
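As a side note, LightGBM can also consume the raw string columns directly once they are cast to the pandas category dtype, which makes the label-encoding step unnecessary. A minimal sketch, assuming the same TrainModelling.csv layout as above (the hyperparameters here are placeholders, not the ones from the question):

import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("TrainModelling.csv")
df.drop(df.columns[0], axis=1, inplace=True)  # remove index column
y = df["Target"]
X = df.drop("Target", axis=1)

# cast the string predictors to pandas categoricals;
# LGBMRegressor then picks them up through the default categorical_feature='auto'
cat_cols = X.select_dtypes("object").columns.tolist()
X[cat_cols] = X[cat_cols].astype("category")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gbm = lgb.LGBMRegressor(objective="regression", n_estimators=1000, learning_rate=0.05)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)])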
I want to make predictions with a regression model.
I am trying to tune my LightGBM model's hyperparameters, aiming for the lowest generalization RMSE score without overfitting or underfitting.
All the examples I've seen use classification, split the data randomly with no awareness of time series, and use grid search, none of which is applicable to my problem.
How can I get bayesian hyperparameter optimization for my final model while using nested CV and TimeSeriesSplit?
My code for simple CV so far:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, Trials
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
... import data via pandas ...
y = df["target"] # predictor y
features = df.drop("target", axis=1).columns
X = df[features]  # features X
days = len(df) - 60  # 2 months for test data / ~20%
X_train, X_test = X[:days], X[days:]
y_train, y_test = y[:days], y[days:]
# hyperopt
random_state = 42
cvTSS = TimeSeriesSplit(max_train_size=None, n_splits=10)  # define the folds before they are used as a default argument below
def lightgbm_cv(params, random_state=random_state, cv=cvTSS, X=X_train, y=y_train):
    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'learning_rate': params['learning_rate'],
        'min_child_weight': params['min_child_weight'],
        'feature_fraction': params['feature_fraction'],
        'bagging_fraction': params['bagging_fraction'],
        'bagging_freq': int(params['bagging_freq']),
        'num_leaves': int(params['num_leaves']),
        'max_bin': int(params['max_bin']),
        'num_iterations': int(params['num_iterations']),
        'objective': 'rmse',
    }
    # we use these params to create a new LGBMRegressor
    model = lgb.LGBMRegressor(random_state=random_state, **params)
    # and then conduct the cross-validation with the same folds as before
    score = -cross_val_score(model, X, y, cv=cv, scoring="neg_root_mean_squared_error", n_jobs=-1).mean()
    print(score)
    return score
space={
'n_estimators': hp.quniform('n_estimators', 100, 10_000, 1),
'max_depth' : hp.quniform('max_depth', 2, 100, 1),
'learning_rate': hp.loguniform('learning_rate', -5, 2),
'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
'feature_fraction': hp.quniform('feature_fraction', 0.1, 1, 0.1),
'bagging_fraction': hp.quniform('bagging_fraction', 0.1, 1, 0.1),
'bagging_freq': hp.quniform('bagging_freq', 1, 1_000, 1),
"num_leaves": hp.quniform('num_leaves', 10, 1_000, 1),
"max_bin": hp.quniform('max_bin', 10, 2_000, 1),
"num_iterations": hp.quniform('num_iterations', 100, 10_000, 1),
'objective': 'rmse',
#'verbose': 0,
}
# trials will contain logging information
trials = Trials()
n_iter = 100
best=fmin(fn=lightgbm_cv, # function to optimize
space=space,
algo=tpe.suggest, # optimization, hyperotp will select its parameters automatically
max_evals=n_iter, # maximum number of iterations
trials=trials, # logging
rstate=np.random.RandomState(random_state) # fixing random state for the reproducibility
)
# computing the score on the test set - some parameters from "space" are missing here, not important atm
model = lgb.LGBMRegressor(random_state=random_state, n_estimators=int(best['n_estimators']),
max_depth=int(best['max_depth']),learning_rate=best['learning_rate'])
model.fit(X_train, y_train)
tpe_test_score = mean_squared_error(y_test, model.predict(X_test), squared=False)
print("Best RMSE {:.3f} params {}".format( lightgbm_cv(best), best))
I am running this:
# Hyperparameter tuning - Random Forest #
# Hyperparameters' grid
parameters = {'n_estimators': list(range(100, 250, 25)), 'criterion': ['gini', 'entropy'],
'max_depth': list(range(2, 11, 2)), 'max_features': [0.1, 0.2, 0.3, 0.4, 0.5],
'class_weight': [{0: 1, 1: i} for i in np.arange(1, 4, 0.2).tolist()], 'min_samples_split': list(range(2, 7))}
# Instantiate random forest
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state=0)
# Execute grid search and retrieve the best classifier
from sklearn.model_selection import GridSearchCV
classifiers_grid = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='balanced_accuracy',
cv=5, refit=True, n_jobs=-1)
classifiers_grid.fit(X, y)
and I am receiving this warning:
.../anaconda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py:536:
FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
TypeError: '<' not supported between instances of 'str' and 'int'
Why is this and how can I fix it?
I had a similar FitFailedWarning with different details. After many runs I found that the problem was in the parameter values being passed; try
parameters = {'n_estimators': [100,125,150,175,200,225,250],
'criterion': ['gini', 'entropy'],
'max_depth': [2,4,6,8,10],
'max_features': [0.1, 0.2, 0.3, 0.4, 0.5],
'class_weight': [0.2,0.4,0.6,0.8,1.0],
'min_samples_split': [2,3,4,5,6,7]}
This should pass; for me the issue occurred with XGBClassifier, where the parameter value datatypes were somehow getting mixed up.
Another cause is a value that exceeds the allowed range: for example, the XGBClassifier 'subsample' parameter has a maximum value of 1.0, so setting it to 1.1 will also raise a FitFailedWarning.
For me this was giving the same error, but after removing 'None' from max_depth it fits properly (the string 'None' mixed into an otherwise integer max_depth list is the kind of str/int mix that can trigger this TypeError).
param_grid={'n_estimators':[100,200,300,400,500],
'criterion':['gini', 'entropy'],
'max_depth':['None',5,10,20,30,40,50,60,70],
'min_samples_split':[5,10,20,25,30,40,50],
'max_features':[ 'sqrt', 'log2'],
'max_leaf_nodes':[5,10,20,25,30,40,50],
'min_samples_leaf':[1,100,200,300,400,500]
}
Code which runs properly:
param_grid={'n_estimators':[100,200,300,400,500],
'criterion':['gini', 'entropy'],
'max_depth':[5,10,20,30,40,50,60,70],
'min_samples_split':[5,10,20,25,30,40,50],
'max_features':[ 'sqrt', 'log2'],
'max_leaf_nodes':[5,10,20,25,30,40,50],
'min_samples_leaf':[1,100,200,300,400,500]
}
I got the same error too, and when I passed the hyperparameters as in the MachineLearningMastery example, I got output without the warning.
Try it this way if you run into a similar issue:
# grid search logistic regression model on the sonar dataset
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
# load dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
dataframe = read_csv(url, header=None)
# split into input and output elements
data = dataframe.values
X, y = data[:, :-1], data[:, -1]
# define model
model = LogisticRegression()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search space
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=-1, cv=cv)
# execute search
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
Make sure the y-variable is an int, not bool or str.
Change your last line of code to make the y series a 0 or 1, for example:
classifiers_grid.fit(X, list(map(int, y)))
I need some help doing a bagging aggregation of different XGBoost models (of type Booster). The idea is then to store one model, the final one, in a pickle file.
I start by creating a dummy dataframe:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold
import pickle
dummy_df = pd.DataFrame(np.random.randn(100, 3), columns=list('ABC'))
dummy_df['D'] = -15 * dummy_df['A'] + 32 * dummy_df['B']
X = dummy_df.drop('D', axis=1)
y = dummy_df['D']
I establish some parameters I'd like to test (resulting for instance from a gridsearch):
params = {'eta': 0.06, # learning rate
'tree_method': "auto",#considering my dummy df, might be more interesting to use "gblinear" of course...
'max_depth': 3,
'subsample': 0.75,
'colsample_bytree': 0.75,
'colsample_bylevel': 0.75,
'min_child_weight': 5,
'alpha': 10,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'random_state': 99,
'silent': True}
Finally, I create my cross-validation scheme:
accu = 0
n_splits = 5
folds = KFold(n_splits=n_splits, shuffle=True, random_state=1)
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    train_x, train_y = X.iloc[train_idx], y.iloc[train_idx]
    valid_x, valid_y = X.iloc[valid_idx], y.iloc[valid_idx]
    dtrain = xgb.DMatrix(train_x, train_y)
    dvalid = xgb.DMatrix(valid_x, valid_y)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    model = xgb.train(params, dtrain, 2500, watchlist, maximize=False, early_stopping_rounds=40, verbose_eval=50)
    if accu == 0:
        model_to_save = model
        accu += 1
    else:
        model_to_save += model
It trains properly for the first and second iterations of my for loop, but when it needs to add the first two models together (final line), I get the following error:
TypeError: unsupported operand type(s) for +=: 'Booster' and 'Booster'
Is there any way in Python to add two Boosters? And also to divide a Booster by an integer, since at the end I'll have to divide model_to_save by n_splits?
PS: Storing all the XGBoost models is not an option considering other constraints I can face later on.
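Booster objects do not implement + or /, so one common workaround (just a sketch reusing the imports, dummy X/y and params from above, and not necessarily compatible with every storage constraint) is to bag at prediction time instead: accumulate each fold's predictions and divide by n_splits, so no Booster arithmetic is ever needed:

n_splits = 5
folds = KFold(n_splits=n_splits, shuffle=True, random_state=1)
dfull = xgb.DMatrix(X)              # data we ultimately want bagged predictions for
bagged_pred = np.zeros(len(X))      # running average of the per-fold predictions

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    train_x, train_y = X.iloc[train_idx], y.iloc[train_idx]
    valid_x, valid_y = X.iloc[valid_idx], y.iloc[valid_idx]
    dtrain = xgb.DMatrix(train_x, train_y)
    dvalid = xgb.DMatrix(valid_x, valid_y)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    model = xgb.train(params, dtrain, 2500, watchlist, maximize=False,
                      early_stopping_rounds=40, verbose_eval=50)
    bagged_pred += model.predict(dfull) / n_splits   # no Booster + Booster needed

# bagged_pred now holds the fold-averaged prediction for every row of X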
I would like to apply Naive Bayes with 10-fold stratified cross-validation to my data, and then I want to see how the model performs on the test data I set aside initially.
However, the results I am getting (i.e. the predicted outcome and probability values y_pred_nb2 and y_score_nb2) are identical to when I run the code without any cross-validation.
QUESTION: How can I correct this?
The code is below, where X_train consists of 75% of the entire dataset and X_test consists of 25%.
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.naive_bayes import GaussianNB
params = {}
#gridsearch searches for the best hyperparameters and keeps the classifier with the highest recall score
skf = StratifiedKFold(n_splits=10)
nb2 = GridSearchCV(GaussianNB(), cv=skf, param_grid=params)
%time nb2.fit(X_train, y_train)
# predict values on the test set
y_pred_nb2 = nb2.predict(X_test)
print(y_pred_nb2)
# predicted probabilities on the test set
y_scores_nb2 = nb2.predict_proba(X_test)[:, 1]
print(y_scores_nb2)
First off, GaussianNB only accepts priors as an argument, so unless you have some priors to set for your model ahead of time, you will have nothing to grid search over.
Furthermore, your param_grid is set to an empty dictionary, which ensures that you only fit one estimator with GridSearchCV. This is the same as fitting an estimator without using a grid search (here I use MultinomialNB in order to show the use of hyperparameters):
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB
skf = StratifiedKFold(n_splits=10)
params = {}
nb = MultinomialNB()
gs = GridSearchCV(nb, cv=skf, param_grid=params, return_train_score=True)
data = load_iris()
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target)
gs.fit(x_train, y_train)
gs.cv_results_
{'mean_fit_time': array([0.]),
'mean_score_time': array([0.]),
'mean_test_score': array([0.85714286]),
'mean_train_score': array([0.85992157]),
'params': [{}],
'rank_test_score': array([1]),
'split0_test_score': array([0.91666667]),
'split0_train_score': array([0.84]),
'split1_test_score': array([0.75]),
'split1_train_score': array([0.86]),
'split2_test_score': array([0.83333333]),
'split2_train_score': array([0.84]),
'split3_test_score': array([0.91666667]),
'split3_train_score': array([0.83]),
'split4_test_score': array([0.83333333]),
'split4_train_score': array([0.85]),
'split5_test_score': array([0.91666667]),
'split5_train_score': array([0.84]),
'split6_test_score': array([0.9]),
'split6_train_score': array([0.88235294]),
'split7_test_score': array([0.8]),
'split7_train_score': array([0.88235294]),
'split8_test_score': array([0.8]),
'split8_train_score': array([0.89215686]),
'split9_test_score': array([0.9]),
'split9_train_score': array([0.88235294]),
'std_fit_time': array([0.]),
'std_score_time': array([0.]),
'std_test_score': array([0.05832118]),
'std_train_score': array([0.02175538])}
nb.fit(x_train, y_train)
nb.score(x_test, y_test)
0.8157894736842105
gs.score(x_test, y_test)
0.8157894736842105
gs.param_grid = {'alpha': [0.1, 2]}
gs.fit(x_train, y_train)
gs.score(x_test, y_test)
0.8421052631578947
gs.cv_results_
{'mean_fit_time': array([0.00090394, 0.00049713]),
'mean_score_time': array([0.00029924, 0.0003005 ]),
'mean_test_score': array([0.86607143, 0.85714286]),
'mean_train_score': array([0.86092157, 0.85494118]),
'param_alpha': masked_array(data=[0.1, 2],
mask=[False, False],
fill_value='?',
dtype=object),
'params': [{'alpha': 0.1}, {'alpha': 2}],
'rank_test_score': array([1, 2]),
'split0_test_score': array([0.91666667, 0.91666667]),
'split0_train_score': array([0.84, 0.83]),
'split1_test_score': array([0.75, 0.75]),
'split1_train_score': array([0.86, 0.86]),
'split2_test_score': array([0.83333333, 0.83333333]),
'split2_train_score': array([0.85, 0.84]),
'split3_test_score': array([0.91666667, 0.91666667]),
'split3_train_score': array([0.83, 0.81]),
'split4_test_score': array([0.83333333, 0.83333333]),
'split4_train_score': array([0.85, 0.84]),
'split5_test_score': array([0.91666667, 0.91666667]),
'split5_train_score': array([0.84, 0.84]),
'split6_test_score': array([0.9, 0.9]),
'split6_train_score': array([0.88235294, 0.88235294]),
'split7_test_score': array([0.9, 0.8]),
'split7_train_score': array([0.88235294, 0.88235294]),
'split8_test_score': array([0.8, 0.8]),
'split8_train_score': array([0.89215686, 0.89215686]),
'split9_test_score': array([0.9, 0.9]),
'split9_train_score': array([0.88235294, 0.87254902]),
'std_fit_time': array([0.00030147, 0.00049713]),
'std_score_time': array([0.00045711, 0.00045921]),
'std_test_score': array([0.05651628, 0.05832118]),
'std_train_score': array([0.02103457, 0.02556351])}
How about something like this:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
#because only var_smoothing can be 'tuned'
#do a cross validation on different var_smoothing values
def cross_val(params):
    model = GaussianNB()
    model.set_params(**params)
    cv_results = cross_val_score(model, X_train, y_train,
                                 cv=10,  # 10 folds
                                 scoring="accuracy",
                                 verbose=2
                                 )
    # return the mean of the 10-fold cross validation
    return cv_results.mean()
# baseline parameters
params = {
    "priors": None,  # must be the Python None, not the string "None"
    "var_smoothing": 1e-9
}
#create an list of var_smoothing to cross validate
steps = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4]
#will contain the cv results
results = []
for step in steps:
    params["var_smoothing"] = step
    cv_result = cross_val(params)
    # save result
    results.append(cv_result)
#print results
#convert results to pandas dataframe for easier visualization
df = pd.DataFrame({"var_smoothing" : steps, "accuracy" : results})
#sort it
df_sorted = df.sort_values("accuracy", ascending=False)
#reset the index of the sorted dataframe
df_sorted.reset_index(inplace=True, drop=True)
df_sorted.head()
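For comparison, a hedged sketch of the same var_smoothing sweep expressed directly through GridSearchCV with a stratified 10-fold split (assuming the X_train/y_train from the question), which keeps the structure of the original code:

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import GaussianNB

param_grid = {"var_smoothing": [1e-8, 1e-7, 1e-6, 1e-5, 1e-4]}
gs = GridSearchCV(GaussianNB(), param_grid=param_grid,
                  cv=StratifiedKFold(n_splits=10), scoring="accuracy")
gs.fit(X_train, y_train)
print(gs.best_params_, gs.best_score_)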