Extending xgboost.XGBClassifier - python

I am trying to define a class called XGBExtended that extends the class xgboost.XGBClassifier, the scikit-learn API for xgboost. I am running into some issues with the get_params method. Below is an IPython session illustrating the issue. Basically, get_params seems to only be returning the attributes I define within XGBExtended.__init__, and attributes defined during the parent init method (xgboost.XGBClassifier.__init__) are ignored. I am using IPython and running python 2.7. Full system specs at bottom.
In [182]: import xgboost as xgb
     ...:
     ...: class XGBExtended(xgb.XGBClassifier):
     ...:     def __init__(self, foo):
     ...:         super(XGBExtended, self).__init__()
     ...:         self.foo = foo
     ...:
     ...: clf = XGBExtended(foo = 1)
     ...:
     ...: clf.get_params()
     ...:
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-182-431c4c3f334b> in <module>()
      8 clf = XGBExtended(foo = 1)
      9
---> 10 clf.get_params()

/Users/andrewhannigan/lib/xgboost/python-package/xgboost/sklearn.pyc in get_params(self, deep)
    188         if isinstance(self.kwargs, dict):  # if kwargs is a dict, update params accordingly
    189             params.update(self.kwargs)
--> 190         if params['missing'] is np.nan:
    191             params['missing'] = None  # sklearn doesn't handle nan. see #4725
    192         if not params.get('eval_metric', True):

KeyError: 'missing'
So I've hit an error because 'missing' is not a key in the params dict within the XGBClassifier.get_params method. I enter the debugger to poke around:
In [183]: %debug
> /Users/andrewhannigan/lib/xgboost/python-package/xgboost/sklearn.py(190)get_params()
    188         if isinstance(self.kwargs, dict):  # if kwargs is a dict, update params accordingly
    189             params.update(self.kwargs)
--> 190         if params['missing'] is np.nan:
    191             params['missing'] = None  # sklearn doesn't handle nan. see #4725
    192         if not params.get('eval_metric', True):

ipdb> params
{'foo': 1}
ipdb> self.__dict__
{'n_jobs': 1, 'seed': None, 'silent': True, 'missing': nan, 'nthread': None, 'min_child_weight': 1, 'random_state': 0, 'kwargs': {}, 'objective': 'binary:logistic', 'foo': 1, 'max_depth': 3, 'reg_alpha': 0, 'colsample_bylevel': 1, 'scale_pos_weight': 1, '_Booster': None, 'learning_rate': 0.1, 'max_delta_step': 0, 'base_score': 0.5, 'n_estimators': 100, 'booster': 'gbtree', 'colsample_bytree': 1, 'subsample': 1, 'reg_lambda': 1, 'gamma': 0}
ipdb>
As you can see, the params dict contains only the foo variable, while the object itself contains all of the parameters defined by xgboost.XGBClassifier.__init__. But for some reason the BaseEstimator.get_params method, which is called from xgboost.XGBClassifier.get_params, only picks up the parameters defined explicitly in the XGBExtended.__init__ signature. Unfortunately, even calling get_params explicitly with deep=True does not work correctly:
ipdb> super(XGBModel, self).get_params(deep=True)
{'foo': 1}
ipdb>
Can anyone tell why this is happening?
System specs:
In [186]: print IPython.sys_info()
{'commit_hash': u'1149d1700',
'commit_source': 'installation',
'default_encoding': 'UTF-8',
'ipython_path': '/Users/andrewhannigan/virtualenvironment/nimble_ai/lib/python2.7/site-packages/IPython',
'ipython_version': '5.4.1',
'os_name': 'posix',
'platform': 'Darwin-14.5.0-x86_64-i386-64bit',
'sys_executable': '/usr/local/Cellar/python/2.7.10/Frameworks/Python.framework/Versions/2.7/Resources/Python.app/Contents/MacOS/Python',
'sys_platform': 'darwin',
'sys_version': '2.7.10 (default, Jul 3 2015, 12:05:53) \n[GCC 4.2.1 Compatible Apple LLVM 6.1.0 (clang-602.0.53)]'}

The problem here is the incorrect declaration of the child class.
When you declare __init__ with only foo, you override the parent's constructor signature. scikit-learn discovers an estimator's parameters by inspecting the signature of __init__ (via BaseEstimator._get_param_names), not by looking at instance attributes, so the parent's parameters are invisible to get_params even though the base class constructor gives them default values.
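To see why, here is a minimal sketch of roughly what scikit-learn's BaseEstimator._get_param_names does (a simplified approximation using inspect, not the library's exact code): it reads the signature of __init__ on your class, not the instance attributes, so a subclass that declares only foo exposes only foo.

import inspect

class Parent(object):
    def __init__(self, max_depth=3, learning_rate=0.1, missing=None):
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.missing = missing

class Child(Parent):
    def __init__(self, foo):
        super(Child, self).__init__()
        self.foo = foo

def param_names(cls):
    # Roughly what BaseEstimator._get_param_names does: introspect the
    # signature of cls.__init__ and treat its arguments as the parameters.
    # (getargspec is fine on Python 2.7; newer Python would use inspect.signature.)
    args = inspect.getargspec(cls.__init__).args
    return sorted(a for a in args if a != 'self')

print(param_names(Parent))  # ['learning_rate', 'max_depth', 'missing']
print(param_names(Child))   # ['foo']  <- the parent's parameters are invisible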
You should use the following:
class XGBExtended(xgb.XGBClassifier):
    def __init__(self, foo, max_depth=3, learning_rate=0.1,
                 n_estimators=100, silent=True,
                 objective="binary:logistic",
                 nthread=-1, gamma=0, min_child_weight=1,
                 max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1,
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                 base_score=0.5, seed=0, missing=None, **kwargs):
        # Pass the required parameters to the super class
        super(XGBExtended, self).__init__(max_depth, learning_rate,
                                          n_estimators, silent, objective,
                                          nthread, gamma, min_child_weight,
                                          max_delta_step, subsample,
                                          colsample_bytree, colsample_bylevel,
                                          reg_alpha, reg_lambda,
                                          scale_pos_weight, base_score, seed, missing, **kwargs)
        # Use other custom parameters
        self.foo = foo
After that you will not get any error.
clf = XGBExtended(foo = 1)
print(clf.get_params(deep=True))
>>> {'reg_alpha': 0, 'colsample_bytree': 1, 'silent': True,
'colsample_bylevel': 1, 'scale_pos_weight': 1, 'learning_rate': 0.1,
'missing': None, 'max_delta_step': 0, 'nthread': -1, 'base_score': 0.5,
'n_estimators': 100, 'subsample': 1, 'reg_lambda': 1, 'seed': 0,
'min_child_weight': 1, 'objective': 'binary:logistic',
'foo': 1, 'max_depth': 3, 'gamma': 0}
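As a quick check that scikit-learn utilities now see all the parameters, cloning the estimator, which relies entirely on get_params, also round-trips correctly (a small sketch, not part of the original answer):

from sklearn.base import clone

clf = XGBExtended(foo=1, max_depth=5)
clf2 = clone(clf)  # clone rebuilds the estimator purely from get_params()
print(clf2.get_params()['foo'])        # 1
print(clf2.get_params()['max_depth'])  # 5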

Related

TabNetRegressor does not recognize parameters on python

I can't understand why, when using TabNetRegressor, it does not recognize the parameters created with Optuna in any way. Using TabNetClassifier gives me no problem, but if I use TabNetRegressor it tells me there is an unexpected argument:
clf = TabNetRegressor(**final_params)  # TabNetRegressor()
clf.fit(
    X_train=X_train.values, y_train=y_train.values,
    eval_set=[(X_test.values, y_test.values)],
    patience=TabNet_params['patience'], max_epochs=epochs,
    eval_metric=['rmse']
)
res.append(roc_auc_score(y_test.values, clf.predict(X_test.values)))

  File line 504, in main_pipeline2
    clf = TabNetRegressor(**final_params)  # TabNetRegressor()
TypeError: __init__() got an unexpected keyword argument 'n_d'
This is how I create the hyperparameters, using TabNetClassifier because the regressor gives me problems:
def Objective(trial):
    mask_type = trial.suggest_categorical("mask_type", ["entmax", "sparsemax"])
    n_da = trial.suggest_int("n_da", 56, 64, step=4)
    n_steps = trial.suggest_int("n_steps", 1, 3, step=1)
    gamma = trial.suggest_float("gamma", 1., 1.4, step=0.2)
    n_shared = trial.suggest_int("n_shared", 1, 3)
    lambda_sparse = trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True)
    tabnet_params = dict(n_d=n_da, n_a=n_da, n_steps=n_steps, gamma=gamma,
                         lambda_sparse=lambda_sparse, optimizer_fn=torch.optim.Adam,
                         optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                         mask_type=mask_type, n_shared=n_shared,
                         scheduler_params=dict(mode="min",
                                               patience=trial.suggest_int("patienceScheduler", low=3, high=10),
                                               # changing scheduler patience to be lower than early stopping patience
                                               min_lr=1e-5,
                                               factor=0.5, ),
                         scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                         verbose=0,
                         )  # early stopping
    regressor = TabNetClassifier(**tabnet_params)
    regressor.fit(X_train=X_train.values, y_train=y_train.values,
                  eval_set=[(X_test.values, y_test.values)],
                  patience=trial.suggest_int("patience", low=15, high=30),
                  max_epochs=trial.suggest_int('epochs', 1, 100),
                  eval_metric=['rmse'])
    avg = roc_auc_score(y_test.values, regressor.predict(X_test.values))
    return avg

study = optuna.create_study(direction="maximize", study_name='TabNet optimization')

# TabNet_params = {'mask_type': 'entmax', 'n_da': 56, 'n_steps': 1, 'gamma': 1.2, 'n_shared': 1, 'lambda_sparse': 0.00018593172980376437, 'patienceScheduler': 8, 'patience': 17, 'epochs': 13}
TabNet_params = {'mask_type': 'entmax', 'n_da': 56, 'n_steps': 3, 'gamma': 1.4, 'n_shared': 2,
                 'lambda_sparse': 7.628773104483722e-05, 'patienceScheduler': 10, 'patience': 29, 'epochs': 45}

final_params = dict(n_d=TabNet_params['n_da'], n_a=TabNet_params['n_da'], n_steps=TabNet_params['n_steps'],
                    gamma=TabNet_params['gamma'],
                    lambda_sparse=TabNet_params['lambda_sparse'], optimizer_fn=torch.optim.Adam,
                    optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                    mask_type=TabNet_params['mask_type'], n_shared=TabNet_params['n_shared'],
                    scheduler_params=dict(mode="min",
                                          patience=TabNet_params['patienceScheduler'],
                                          min_lr=1e-5,
                                          factor=0.5, ),
                    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                    verbose=0)
epochs = TabNet_params['epochs']

How to get "is_active_trail" in pgmpy working?

I'm unable to get the code on this webpage to run. I don't see the function "is_active_trail" in pgmpy. Has it been changed or removed? How best to get the code to work?
from pgmpy.models import BayesianModel
model = BayesianModel()
model.add_nodes_from(['rain', 'traffic_jam'])
model.add_edge('rain', 'traffic_jam')
model.add_edge('rain', 'traffic_jam')
model.add_edge('accident', 'traffic_jam')
model.nodes()
from pgmpy.factors.discrete import TabularCPD
cpd_rain = TabularCPD('rain', 2, [[0.4], [0.6]])
cpd_accident = TabularCPD('accident', 2, [[0.2], [0.8]])
cpd_traffic_jam = TabularCPD(
    'traffic_jam', 2,
    [[0.9, 0.6, 0.7, 0.1],
     [0.1, 0.4, 0.3, 0.9]],
    evidence=['rain', 'accident'],
    evidence_card=[2, 2])
model.add_cpds(cpd_rain, cpd_accident, cpd_traffic_jam)
model.get_cpds()
model.add_node('long_queues')
model.add_edge('traffic_jam', 'long_queues')
cpd_long_queues = TabularCPD('long_queues', 2,
                             [[0.9, 0.2],
                              [0.1, 0.8]],
                             evidence=['traffic_jam'],
                             evidence_card=[2])
model.add_cpds(cpd_long_queues)
model.add_nodes_from(['getting_up_late',
                      'late_for_school'])
model.add_edges_from(
    [('getting_up_late', 'late_for_school'),
     ('traffic_jam', 'late_for_school')])
cpd_getting_up_late = TabularCPD('getting_up_late', 2,
                                 [[0.6], [0.4]])
cpd_late_for_school = TabularCPD(
    'late_for_school', 2,
    [[0.9, 0.45, 0.8, 0.1],
     [0.1, 0.55, 0.2, 0.9]],
    evidence=['getting_up_late',
              'traffic_jam'],
    evidence_card=[2, 2])
model.add_cpds(cpd_getting_up_late, cpd_late_for_school)
model.get_cpds()
model.check_model()
model.is_active_trail('accident', 'rain')
error:
/usr/local/lib/python3.7/dist-packages/pgmpy/models/BayesianModel.py:10: FutureWarning: BayesianModel has been renamed to BayesianNetwork. Please use BayesianNetwork class, BayesianModel will be removed in future.
  FutureWarning,

AttributeError                            Traceback (most recent call last)
in ()
     46 model.check_model()
     47
---> 48 model.is_active_trail('accident', 'rain')

AttributeError: 'BayesianModel' object has no attribute 'is_active_trail'

How to use xgboost.train()

I am new to XGBoost and I want to use the train() function, but when I try to I get the following error:
    146     else:
    147         try:
    148             main()
    149         except KeyboardInterrupt:
    150             print("KeyboardInterrupt, exiting")

/usr/local/lib/python3.6/dist-packages/xgboost/core.py in __init__(self, params, cache, model_file)
    938         for d in cache:
    939             if not isinstance(d, DMatrix):
    940                 raise TypeError('invalid cache item: {}'.format(type(d).__name__))
    941             self._validate_features(d)
    942

TypeError: invalid cache item: DataFrame
My code is:
import xgboost as xgb

xgb_params = {
    "objective": "multi:softmax",
    "eta": 0.3,
    "num_class": 62,
    "max_depth": 10,
    "nthread": 4,
    "eval_metric": "merror",
    "print.every.n": 1
    #"silent": 1
}
clf = xgb.train(params=xgb_params, dtrain=df, num_boost_round=10)
XGBoost requires you to turn your DataFrame into a DMatrix before it can process it:
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)
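A minimal sketch of the full flow, assuming X_train/y_train/X_test/y_test are splits of the original df (those names are not in the question) and the xgb_params dict from above:

import xgboost as xgb

# Wrap the pandas data in DMatrix objects so xgboost can consume them
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

# Train on the DMatrix instead of the raw DataFrame
clf = xgb.train(params=xgb_params, dtrain=d_train, num_boost_round=10,
                evals=[(d_test, "eval")])
preds = clf.predict(d_test)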

Scikit-learn custom estimator "Invalid parameter for estimator" error

I'm implementing a custom classifier for an SVM ensemble with different voting schemes for my university project. My code for the estimator:
svm_possible_args = {"C", "kernel", "degree", "gamma", "coef0", "shrinking", "probability", "tol", "cache_size",
                     "class_weight", "max_iter", "decision_function_shape", "break_ties"}

bagging_possible_args = {"n_estimators", "max_samples", "max_features", "bootstrap", "bootstrap_features",
                         "oob_score", "warm_start", "n_jobs"}

common_possible_args = {"random_state", "verbose"}


class SVMEnsemble(BaggingClassifier):
    def __init__(self, kernel="linear", voting_method=None, **kwargs):
        if voting_method not in {None, "hard", "soft"}:
            raise ValueError(f"voting_method {voting_method} is not recognized.")

        svm_args = dict()
        bagging_args = dict()
        for arg_name, arg_val in kwargs.items():
            if arg_name in svm_possible_args:
                svm_args[arg_name] = arg_val
            elif arg_name in bagging_possible_args:
                bagging_args[arg_name] = arg_val
            elif arg_name in common_possible_args:
                svm_args[arg_name] = arg_val
                bagging_args[arg_name] = arg_val
            else:
                raise ValueError(f"argument {voting_method} is not recognized.")

        probability = True if voting_method == "soft" else False
        svm_args = dict() if not svm_args else svm_args

        base_estimator = SVC(kernel=kernel, probability=probability, **svm_args)
        super().__init__(base_estimator=base_estimator, **bagging_args)

        self.voting_method = voting_method

    def predict(self, X):
        if self.voting_method in {None, "hard"}:
            return super().predict(X)
        elif self.voting_method == "soft":
            probabilities = np.zeros((X.shape[0], self.classes_.shape[0]))
            for estimator in self.estimators_:
                estimator_probabilities = estimator.predict_proba(X)
                probabilities += estimator_probabilities
            return self.classes_[probabilities.argmax(axis=1)]
        else:
            raise ValueError(f"voting_method {self.voting_method} is not recognized.")
I want to inherit most functionality from BaggingClassifier and plug in SVC. The user should be able to specify both SVM and bagging hyperparameters, so I've used a for loop with svm_possible_args etc. to filter the arguments passed to SVC and BaggingClassifier. The argument sets are almost disjoint (they only have random_state and verbose in common, which is not a problem).
I'm trying to find optimal hyperparameters with GridSearchCV:
def get_best_ensemble(X_train, y_train):
    parameters = {
        "voting_method": ["hard", "soft"],
        "max_samples": np.linspace(0.5, 1, 6, endpoint=True).round(1),
        "max_features": [0.7, 0.8, 0.9, 1],
        "n_estimators": [5, 10, 15],
        "kernel": ["linear", "poly", "rbf", "sigmoid"],
        "C": [0.01, 0.1, 0.5, 1, 10],
        "gamma": [0.01, 0.1, 0.3, 0.6, 1]
    }

    model = SVMEnsemble()
    grid = GridSearchCV(model, parameters, verbose=2, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)

    print("Best hyperparameters:")
    print(grid.best_params_)

    return grid.best_estimator_
I'm getting the following error:
ValueError: Invalid parameter C for estimator SVMEnsemble(kernel=None, voting_method=None). Check the list of available parameters with `estimator.get_params().keys()`.
Using print(model.get_params().keys()) I get dict_keys(['kernel', 'voting_method']). Does that mean that I have to list all parameters for SVC and BaggingClassifier explicitly in my __init__ for SVMEnsemble for GridSearchCV to be able to "see" them and actually work? Or is there any cleaner solution?
You could override the get_params and set_params methods, or have the actual SVM object as an initialization parameter. You'll need to do something so that when the grid search tries to set_params, the estimator inside your instance gets updated correctly (not just the parameters in your instance; note that __init__ doesn't get rerun).
There's some discussion on making inherited class parameter discovery easier, but it's tricky, and wouldn't solve the second problem:
https://github.com/scikit-learn/scikit-learn/issues/13555
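As an illustration of the second suggestion (holding the actual SVM object as a constructor parameter), here is a hedged sketch that uses a plain BaggingClassifier instead of the custom SVMEnsemble, so it drops the voting_method logic; the point is only that nested parameters become addressable to GridSearchCV via the base_estimator__<param> naming (in newer scikit-learn the argument is called estimator, so the prefix becomes estimator__):

import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# The SVC is passed in as a constructor parameter, so GridSearchCV can see
# and set its hyperparameters through the nested "base_estimator__" names.
model = BaggingClassifier(base_estimator=SVC(kernel="linear"))

parameters = {
    "n_estimators": [5, 10, 15],
    "max_samples": np.linspace(0.5, 1, 6, endpoint=True).round(1),
    "base_estimator__kernel": ["linear", "poly", "rbf", "sigmoid"],
    "base_estimator__C": [0.01, 0.1, 0.5, 1, 10],
    "base_estimator__gamma": [0.01, 0.1, 0.3, 0.6, 1],
}

grid = GridSearchCV(model, parameters, verbose=2, cv=5, n_jobs=-1)
# grid.fit(X_train, y_train)  # X_train / y_train as in the question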

LightGBMError "Check failed: num_data > 0" with Sklearn RandomizedSearchCV

I'm trying LGBMRegressor parameter tuning with Sklearn's RandomizedSearchCV. I got an error with the message below.
error:
LightGBMError: b'Check failed: num_data > 0 at /src/LightGBM/src/io/dataset.cpp, line 27 .\n'
I cannot tell why, or which specific parameters caused this error. Was any of the param_dist below not suitable for train_x of shape (1630, 1565)?
Please tell me any hints or solutions. Thank you.
LightGBM version: '2.0.12'
The function that caused this error:
def get_lgbm(train_x, train_y, val_x, val_y):
    lgbm = lgb.LGBMRegressor(
        objective='regression',
        device='gpu',
        n_jobs=1,
    )
    param_dist = {'boosting_type': ['gbdt', 'dart', 'rf'],
                  'num_leaves': sp.stats.randint(2, 1001),
                  'subsample_for_bin': sp.stats.randint(10, 1001),
                  'min_split_gain': sp.stats.uniform(0, 5.0),
                  'min_child_weight': sp.stats.uniform(1e-6, 1e-2),
                  'reg_alpha': sp.stats.uniform(0, 1e-2),
                  'reg_lambda': sp.stats.uniform(0, 1e-2),
                  'tree_learner': ['data', 'feature', 'serial', 'voting'],
                  'application': ['regression_l1', 'regression_l2', 'regression'],
                  'bagging_freq': sp.stats.randint(1, 11),
                  'bagging_fraction': sp.stats.uniform(1e-3, 0.99),
                  'feature_fraction': sp.stats.uniform(1e-3, 0.99),
                  'learning_rate': sp.stats.uniform(1e-6, 0.99),
                  'max_depth': sp.stats.randint(1, 501),
                  'n_estimators': sp.stats.randint(100, 20001),
                  'gpu_use_dp': [True, False],
                  }
    rscv = RandomizedSearchCV(
        estimator=lgbm,
        param_distributions=param_dist,
        cv=3,
        n_iter=3000,
        n_jobs=4,
        verbose=1,
        refit=True,
        fit_params={'eval_set': (val_x, val_y.ravel()),
                    'early_stopping_rounds': 1,
                    'eval_metric': ['l2', 'l1'],
                    'verbose': False,
                    },
    )
    # This line throws error
    rscv = rscv.fit(train_x,
                    train_y.ravel(),
                    )
    return rscv.best_estimator_
The full stack trace is too long to include; here is the part in the LightGBM source.
...........................................................................
/opt/conda/lib/python3.6/site-packages/lightgbm/sklearn.py in fit(self=LGBMRegressor(application='regression_l1',
..., subsample_freq=1,
tree_learner='voting'), X=memmap([[-0.80256822, 1.63302752, -0.55377441, ...12.251635 ,
12.27866017, 1. ]]), y=array([-1.81712472, 0. , -1.7366136 , 0... , 0.36258158, -0.13661202, 0.2919708 ]), sample_weight=None, init_score=None, eval_set=(memmap([[-1.16531701, -0.97454256, -1.36807818, ...11.55465037,
11.55160629, 2. ]]), array([ 0.58517555, -1.01419878, -0.05787037, -0...64139942, 1.04166667, 0. , -0.11668611])), eval_names=None, eval_sample_weight=None, eval_init_score=None, eval_metric=['l2', 'l1'], early_stopping_rounds=1, verbose=False, feature_name='auto', categorical_feature='auto', callbacks=None)
613 eval_init_score=eval_init_score,
614 eval_metric=eval_metric,
615 early_stopping_rounds=early_stopping_rounds,
616 verbose=verbose, feature_name=feature_name,
617 categorical_feature=categorical_feature,
--> 618 callbacks=callbacks)
callbacks = None
619 return self
620
621 base_doc = LGBMModel.fit.__doc__
622 fit.__doc__ = (base_doc[:base_doc.find('eval_class_weight :')] +
...........................................................................
/opt/conda/lib/python3.6/site-packages/lightgbm/sklearn.py in fit(self=LGBMRegressor(application='regression_l1',
..., subsample_freq=1,
tree_learner='voting'), X=array([[-0.80256822, 1.63302752, -0.55377441, .... 12.251635 ,
12.27866017, 1. ]]), y=array([-1.81712472, 0. , -1.7366136 , 0... , 0.36258158, -0.13661202, 0.2919708 ]), sample_weight=None, init_score=None, group=None, eval_set=[(memmap([[-1.16531701, -0.97454256, -1.36807818, ...11.55465037,
11.55160629, 2. ]]), array([ 0.58517555, -1.01419878, -0.05787037, -0...64139942, 1.04166667, 0. , -0.11668611]))], eval_names=None, eval_sample_weight=None, eval_class_weight=None, eval_init_score=None, eval_group=None, eval_metric=['l2', 'l1'], early_stopping_rounds=1, verbose=False, feature_name='auto', categorical_feature='auto', callbacks=None)
468 self.n_estimators, valid_sets=valid_sets, valid_names=eval_names,
469 early_stopping_rounds=early_stopping_rounds,
470 evals_result=evals_result, fobj=self._fobj, feval=feval,
471 verbose_eval=verbose, feature_name=feature_name,
472 categorical_feature=categorical_feature,
--> 473 callbacks=callbacks)
callbacks = None
474
475 if evals_result:
476 self._evals_result = evals_result
477
...........................................................................
/opt/conda/lib/python3.6/site-packages/lightgbm/engine.py in train(params={'application': 'regression_l1', 'bagging_fraction': 0.0013516565394267757, 'bagging_freq': 8, 'boosting_type': 'dart', 'colsample_bytree': 1.0, 'device': 'gpu', 'feature_fraction': 0.18574060093496944, 'gpu_use_dp': True, 'learning_rate': 0.06354739024799887, 'max_depth': 267, ...}, train_set=<lightgbm.basic.Dataset object>, num_boost_round=11610, valid_sets=[<lightgbm.basic.Dataset object>], valid_names=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=1, evals_result={}, verbose_eval=False, learning_rates=None, keep_training_booster=False, callbacks={<function print_evaluation.<locals>.callback>, <function early_stopping.<locals>.callback>, <function record_evaluation.<locals>.callback>})
175 callbacks_before_iter = sorted(callbacks_before_iter, key=attrgetter('order'))
176 callbacks_after_iter = sorted(callbacks_after_iter, key=attrgetter('order'))
177
178 # construct booster
179 try:
--> 180 booster = Booster(params=params, train_set=train_set)
booster = undefined
params = {'application': 'regression_l1', 'bagging_fraction': 0.0013516565394267757, 'bagging_freq': 8, 'boosting_type': 'dart', 'colsample_bytree': 1.0, 'device': 'gpu', 'feature_fraction': 0.18574060093496944, 'gpu_use_dp': True, 'learning_rate': 0.06354739024799887, 'max_depth': 267, ...}
train_set = <lightgbm.basic.Dataset object>
181 if is_valid_contain_train:
182 booster.set_train_data_name(train_data_name)
183 for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets):
184 booster.add_valid(valid_set, name_valid_set)
...........................................................................
/opt/conda/lib/python3.6/site-packages/lightgbm/basic.py in __init__(self=<lightgbm.basic.Booster object>, params={'application': 'regression_l1', 'bagging_fraction': 0.0013516565394267757, 'bagging_freq': 8, 'boosting_type': 'dart', 'colsample_bytree': 1.0, 'device': 'gpu', 'feature_fraction': 0.18574060093496944, 'gpu_use_dp': True, 'learning_rate': 0.06354739024799887, 'max_depth': 267, ...}, train_set=<lightgbm.basic.Dataset object>, model_file=None, silent=False)
1290 # construct booster object
1291 self.handle = ctypes.c_void_p()
1292 _safe_call(_LIB.LGBM_BoosterCreate(
1293 train_set.construct().handle,
1294 c_str(params_str),
-> 1295 ctypes.byref(self.handle)))
self.handle = c_void_p(None)
1296 # save reference to data
1297 self.train_set = train_set
1298 self.valid_sets = []
1299 self.name_valid_sets = []
...........................................................................
/opt/conda/lib/python3.6/site-packages/lightgbm/basic.py in _safe_call(ret=-1)
43 ----------
44 ret : int
45 return value from API calls
46 """
47 if ret != 0:
---> 48 raise LightGBMError(_LIB.LGBM_GetLastError())
49
50
51 def is_numeric(obj):
52 """Check is a number or not, include numpy number etc."""
LightGBMError: b'Check failed: num_data > 0 at /usr/local/src/lightgbm/LightGBM/src/io/dataset.cpp, line 27 .\n'
The minimum values of bagging_fraction and feature_fraction could be too small. I changed the distribution to sp.stats.uniform(loc=0.1, scale=0.9) and it works.
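A minimal sketch of that change applied to the param_dist in the question (only these two entries change; loc=0.1 keeps the sampled fractions in [0.1, 1.0) so every bagged or feature-sampled subset still contains data):

import scipy.stats as stats

param_dist.update({
    # Bound both fractions away from zero so subsampling never produces
    # an empty dataset (which triggers "Check failed: num_data > 0").
    'bagging_fraction': stats.uniform(loc=0.1, scale=0.9),
    'feature_fraction': stats.uniform(loc=0.1, scale=0.9),
})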
I got the same error in the LightGBM Python package. In my case, the test dataset had 0 rows. So make sure the size of the test/train dataset is not 0 rows.
In my case, this error always happened when min_sum_hessian_in_leaf = 0, since I grid search over min_sum_hessian_in_leaf in [0, 2, 4, 5, 6, 7, 8, 9, 10]. After removing the 0 from the list, the error never happened again.
Maybe train_x or train_y is null. You can check it by printing the data.
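For example, assuming numpy arrays (a quick sanity check, not part of the original answer):

import numpy as np

print(train_x.shape, train_y.shape)                      # neither should have a 0 dimension
print(np.isnan(train_x).sum(), np.isnan(train_y).sum())  # count missing values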
