LightGBMError "Check failed: num_data > 0" with Sklearn RandomizedSearchCV - python

I'm tuning LGBMRegressor parameters with sklearn's RandomizedSearchCV and I got the error below.
error:
LightGBMError: b'Check failed: num_data > 0 at /src/LightGBM/src/io/dataset.cpp, line 27 .\n'
I cannot tell why, or which specific parameters caused this error. Is any entry of param_dist below unsuitable for train_x.shape: (1630, 1565)?
Please give me any hints or solutions. Thank you.
LightGBM version: '2.0.12'
The function that caused this error:
def get_lgbm(train_x, train_y, val_x, val_y):
    lgbm = lgb.LGBMRegressor(
        objective='regression',
        device='gpu',
        n_jobs=1,
    )
    param_dist = {'boosting_type': ['gbdt', 'dart', 'rf'],
                  'num_leaves': sp.stats.randint(2, 1001),
                  'subsample_for_bin': sp.stats.randint(10, 1001),
                  'min_split_gain': sp.stats.uniform(0, 5.0),
                  'min_child_weight': sp.stats.uniform(1e-6, 1e-2),
                  'reg_alpha': sp.stats.uniform(0, 1e-2),
                  'reg_lambda': sp.stats.uniform(0, 1e-2),
                  'tree_learner': ['data', 'feature', 'serial', 'voting'],
                  'application': ['regression_l1', 'regression_l2', 'regression'],
                  'bagging_freq': sp.stats.randint(1, 11),
                  'bagging_fraction': sp.stats.uniform(1e-3, 0.99),
                  'feature_fraction': sp.stats.uniform(1e-3, 0.99),
                  'learning_rate': sp.stats.uniform(1e-6, 0.99),
                  'max_depth': sp.stats.randint(1, 501),
                  'n_estimators': sp.stats.randint(100, 20001),
                  'gpu_use_dp': [True, False],
                  }
    rscv = RandomizedSearchCV(
        estimator=lgbm,
        param_distributions=param_dist,
        cv=3,
        n_iter=3000,
        n_jobs=4,
        verbose=1,
        refit=True,
        fit_params={'eval_set': (val_x, val_y.ravel()),
                    'early_stopping_rounds': 1,
                    'eval_metric': ['l2', 'l1'],
                    'verbose': False,
                    },
    )
    # This line throws the error
    rscv = rscv.fit(train_x,
                    train_y.ravel(),
                    )
    return rscv.best_estimator_
The full stack trace is too long to include; here is the part from the LightGBM source.
...........................................................................
/opt/conda/lib/python3.6/site-packages/lightgbm/sklearn.py in fit(self=LGBMRegressor(application='regression_l1',
..., subsample_freq=1,
tree_learner='voting'), X=memmap([[-0.80256822, 1.63302752, -0.55377441, ...12.251635 ,
12.27866017, 1. ]]), y=array([-1.81712472, 0. , -1.7366136 , 0... , 0.36258158, -0.13661202, 0.2919708 ]), sample_weight=None, init_score=None, eval_set=(memmap([[-1.16531701, -0.97454256, -1.36807818, ...11.55465037,
11.55160629, 2. ]]), array([ 0.58517555, -1.01419878, -0.05787037, -0...64139942, 1.04166667, 0. , -0.11668611])), eval_names=None, eval_sample_weight=None, eval_init_score=None, eval_metric=['l2', 'l1'], early_stopping_rounds=1, verbose=False, feature_name='auto', categorical_feature='auto', callbacks=None)
613 eval_init_score=eval_init_score,
614 eval_metric=eval_metric,
615 early_stopping_rounds=early_stopping_rounds,
616 verbose=verbose, feature_name=feature_name,
617 categorical_feature=categorical_feature,
--> 618 callbacks=callbacks)
callbacks = None
619 return self
620
621 base_doc = LGBMModel.fit.__doc__
622 fit.__doc__ = (base_doc[:base_doc.find('eval_class_weight :')] +
...........................................................................
/opt/conda/lib/python3.6/site-packages/lightgbm/sklearn.py in fit(self=LGBMRegressor(application='regression_l1',
..., subsample_freq=1,
tree_learner='voting'), X=array([[-0.80256822, 1.63302752, -0.55377441, .... 12.251635 ,
12.27866017, 1. ]]), y=array([-1.81712472, 0. , -1.7366136 , 0... , 0.36258158, -0.13661202, 0.2919708 ]), sample_weight=None, init_score=None, group=None, eval_set=[(memmap([[-1.16531701, -0.97454256, -1.36807818, ...11.55465037,
11.55160629, 2. ]]), array([ 0.58517555, -1.01419878, -0.05787037, -0...64139942, 1.04166667, 0. , -0.11668611]))], eval_names=None, eval_sample_weight=None, eval_class_weight=None, eval_init_score=None, eval_group=None, eval_metric=['l2', 'l1'], early_stopping_rounds=1, verbose=False, feature_name='auto', categorical_feature='auto', callbacks=None)
468 self.n_estimators, valid_sets=valid_sets, valid_names=eval_names,
469 early_stopping_rounds=early_stopping_rounds,
470 evals_result=evals_result, fobj=self._fobj, feval=feval,
471 verbose_eval=verbose, feature_name=feature_name,
472 categorical_feature=categorical_feature,
--> 473 callbacks=callbacks)
callbacks = None
474
475 if evals_result:
476 self._evals_result = evals_result
477
...........................................................................
/opt/conda/lib/python3.6/site-packages/lightgbm/engine.py in train(params={'application': 'regression_l1', 'bagging_fraction': 0.0013516565394267757, 'bagging_freq': 8, 'boosting_type': 'dart', 'colsample_bytree': 1.0, 'device': 'gpu', 'feature_fraction': 0.18574060093496944, 'gpu_use_dp': True, 'learning_rate': 0.06354739024799887, 'max_depth': 267, ...}, train_set=<lightgbm.basic.Dataset object>, num_boost_round=11610, valid_sets=[<lightgbm.basic.Dataset object>], valid_names=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=1, evals_result={}, verbose_eval=False, learning_rates=None, keep_training_booster=False, callbacks={<function print_evaluation.<locals>.callback>, <function early_stopping.<locals>.callback>, <function record_evaluation.<locals>.callback>})
175 callbacks_before_iter = sorted(callbacks_before_iter, key=attrgetter('order'))
176 callbacks_after_iter = sorted(callbacks_after_iter, key=attrgetter('order'))
177
178 # construct booster
179 try:
--> 180 booster = Booster(params=params, train_set=train_set)
booster = undefined
params = {'application': 'regression_l1', 'bagging_fraction': 0.0013516565394267757, 'bagging_freq': 8, 'boosting_type': 'dart', 'colsample_bytree': 1.0, 'device': 'gpu', 'feature_fraction': 0.18574060093496944, 'gpu_use_dp': True, 'learning_rate': 0.06354739024799887, 'max_depth': 267, ...}
train_set = <lightgbm.basic.Dataset object>
181 if is_valid_contain_train:
182 booster.set_train_data_name(train_data_name)
183 for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets):
184 booster.add_valid(valid_set, name_valid_set)
...........................................................................
/opt/conda/lib/python3.6/site-packages/lightgbm/basic.py in __init__(self=<lightgbm.basic.Booster object>, params={'application': 'regression_l1', 'bagging_fraction': 0.0013516565394267757, 'bagging_freq': 8, 'boosting_type': 'dart', 'colsample_bytree': 1.0, 'device': 'gpu', 'feature_fraction': 0.18574060093496944, 'gpu_use_dp': True, 'learning_rate': 0.06354739024799887, 'max_depth': 267, ...}, train_set=<lightgbm.basic.Dataset object>, model_file=None, silent=False)
1290 # construct booster object
1291 self.handle = ctypes.c_void_p()
1292 _safe_call(_LIB.LGBM_BoosterCreate(
1293 train_set.construct().handle,
1294 c_str(params_str),
-> 1295 ctypes.byref(self.handle)))
self.handle = c_void_p(None)
1296 # save reference to data
1297 self.train_set = train_set
1298 self.valid_sets = []
1299 self.name_valid_sets = []
...........................................................................
/opt/conda/lib/python3.6/site-packages/lightgbm/basic.py in _safe_call(ret=-1)
43 ----------
44 ret : int
45 return value from API calls
46 """
47 if ret != 0:
---> 48 raise LightGBMError(_LIB.LGBM_GetLastError())
49
50
51 def is_numeric(obj):
52 """Check is a number or not, include numpy number etc."""
LightGBMError: b'Check failed: num_data > 0 at /usr/local/src/lightgbm/LightGBM/src/io/dataset.cpp, line 27 .\n'

The minimum values of bagging_fraction and feature_fraction could be too small. I changed the distribution to "sp.stats.uniform(loc=0.1, scale=0.9)" and it works.
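For example, a minimal sketch of the adjusted entries, assuming the rest of param_dist from the question stays unchanged:
# keep bagging_fraction and feature_fraction away from ~0 so each sampled
# subset of rows/columns is never empty (values drawn from [0.1, 1.0))
param_dist['bagging_fraction'] = sp.stats.uniform(loc=0.1, scale=0.9)
param_dist['feature_fraction'] = sp.stats.uniform(loc=0.1, scale=0.9)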

I got the same error with the LightGBM Python package. In my case, the test dataset had 0 rows, so make sure neither your test nor your train dataset has 0 rows.

In my case, this error always happened when min_sum_hessian_in_leaf = 0, since I was grid-searching min_sum_hessian_in_leaf over [0, 2, 4, 5, 6, 7, 8, 9, 10].
After removing the 0 from the list, the error never happened again.
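A minimal sketch of the adjusted grid (the variable name is illustrative; the values mirror the list above with the 0 dropped):
# every candidate now keeps a strictly positive hessian sum per leaf
param_grid = {'min_sum_hessian_in_leaf': [2, 4, 5, 6, 7, 8, 9, 10]}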

Maybe train_x or train_y is empty. You can check it by printing the data.
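A minimal sketch of such a check, assuming train_x and train_y are NumPy float arrays as in the traceback above:
import numpy as np

print(train_x.shape, train_y.shape)                      # neither dimension should be 0
print(np.isnan(train_x).any(), np.isnan(train_y).any())  # look for missing values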

Related

Rapids CUML Random Forest Regression Model doesn't return the scoring function value

I used the code below to tune the model and search for the best combination with the grid search algorithm. However, the model does not evaluate and return a score for each combination.
I did the same thing with the SVM regressor model and it returned the score during training.
Here is the code:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

param_grid2 = {
    'bootstrap': [True, False],
    'max_depth': [16, 30, 50, 80, 90, 100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_leaf': [1, 3, 4, 5],
    'min_samples_split': [2, 4, 8, 10, 12],
    'n_estimators': [100, 200, 300, 400, 500]
}

from cuml.ensemble import RandomForestRegressor
# Create a base model
rfr_cu = RandomForestRegressor()  # remove the output_type. Check this link for more details: https://docs.rapids.ai/api/cuml/stable/api.html#output-data-type-configuration
# Instantiate the grid search model
rfr_tune_cu = GridSearchCV(estimator=rfr_cu, param_grid=param_grid2, scoring='neg_mean_squared_error',
                           cv=3, n_jobs=-1, verbose=3, error_score='raise', return_train_score=True)
The output message
RuntimeError Traceback (most recent call last)
Input In [80], in <cell line: 2>()
1 # Fit the grid search to the data
----> 2 rfr_tune_cu.fit(cu_X_train, cu_y_train)
File /usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_search.py:875, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
869 results = self._format_results(
870 all_candidate_params, n_splits, all_out, all_more_results
871 )
873 return results
--> 875 self._run_search(evaluate_candidates)
877 # multimetric is determined here because in the case of a callable
878 # self.scoring the return type is only known after calling
879 first_test_score = all_out[0]["test_scores"]
File /usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_search.py:1375, in GridSearchCV._run_search(self, evaluate_candidates)
1373 def _run_search(self, evaluate_candidates):
1374 """Search all candidates in param_grid"""
-> 1375 evaluate_candidates(ParameterGrid(self.param_grid))
File /usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_search.py:822, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
814 if self.verbose > 0:
815 print(
816 "Fitting {0} folds for each of {1} candidates,"
817 " totalling {2} fits".format(
818 n_splits, n_candidates, n_candidates * n_splits
819 )
820 )
--> 822 out = parallel(
823 delayed(_fit_and_score)(
824 clone(base_estimator),
825 X,
826 y,
827 train=train,
828 test=test,
829 parameters=parameters,
830 split_progress=(split_idx, n_splits),
831 candidate_progress=(cand_idx, n_candidates),
832 **fit_and_score_kwargs,
833 )
834 for (cand_idx, parameters), (split_idx, (train, test)) in product(
835 enumerate(candidate_params), enumerate(cv.split(X, y, groups))
836 )
837 )
839 if len(out) < 1:
840 raise ValueError(
841 "No fits were performed. "
842 "Was the CV iterator empty? "
843 "Were there no candidates?"
844 )
File /usr/local/lib/python3.9/dist-packages/joblib/parallel.py:1056, in Parallel.__call__(self, iterable)
1053 self._iterating = False
1055 with self._backend.retrieval_context():
-> 1056 self.retrieve()
1057 # Make sure that we get a last message telling us we are done
1058 elapsed_time = time.time() - self._start_time
File /usr/local/lib/python3.9/dist-packages/joblib/parallel.py:935, in Parallel.retrieve(self)
933 try:
934 if getattr(self._backend, 'supports_timeout', False):
--> 935 self._output.extend(job.get(timeout=self.timeout))
936 else:
937 self._output.extend(job.get())
File /usr/local/lib/python3.9/dist-packages/joblib/_parallel_backends.py:542, in LokyBackend.wrap_future_result(future, timeout)
539 """Wrapper for Future.result to implement the same behaviour as
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
File /usr/lib/python3.9/concurrent/futures/_base.py:446, in Future.result(self, timeout)
444 raise CancelledError()
445 elif self._state == FINISHED:
--> 446 return self.__get_result()
447 else:
448 raise TimeoutError()
File /usr/lib/python3.9/concurrent/futures/_base.py:391, in Future.__get_result(self)
389 if self._exception:
390 try:
--> 391 raise self._exception
392 finally:
393 # Break a reference cycle with the exception in self._exception
394 self = None
RuntimeError: CUDA error encountered at: file=/project/cpp/src/decisiontree/batched-levelalgo/quantiles.cuh line=52: call='cub::DeviceRadixSort::SortKeys( nullptr, temp_storage_bytes, data, sorted_column.data(), n_rows, 0, 8 * sizeof(T), stream)', Reason=cudaErrorInvalidDeviceFunction:invalid device function

MultiInputOutput Model RandomSearch with Scikit Pipelines

I am trying to compare different regression strategies for a forecasting problem:
Using algorithms that support multiple-input multiple-output regression by default (e.g. linear regression, trees, etc.).
Using a wrapper around algorithms that don't support it natively (e.g. SVR, XGBoost).
Using the chained regressor to exploit correlations between my targets (as my forecast at t+1 is auto-correlated with the target at t+2).
The scikit-learn documentation for the multi-output wrappers is actually not that good, but it mentions:
https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputRegressor.html
set_params(**params)
Set the parameters of this estimator.
The method works on simple estimators as well as on nested objects (such as Pipeline).
The latter have parameters of the form <component>__<parameter> so that it’s possible to
update each component of a nested object.
Therefore I am building my pipeline as:
pipeline_xgboost = Pipeline([('scaler', StandardScaler()),
                             ('variance_selector', VarianceThreshold(threshold=0.03)),
                             ('estimator', xgb.XGBRegressor())])
And then creating the wrapper as:
mimo_wrapper = MultiOutputRegressor(pipeline_xgboost)
Following the documentation of scikit-learn pipelines, I am defining my XGBoost parameters as:
parameters = [
    {
        'estimator__reg_alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
        'estimator__max_depth': [10, 100, 1000]
        etc...
    }
And then I am running my cross-validation as:
randomized_search = RandomizedSearchCV(mimo_wrapper, parameters, random_state=0, n_iter=5,
                                       n_jobs=-1, refit=True, cv=3, verbose=True,
                                       pre_dispatch='2*n_jobs', error_score='raise',
                                       return_train_score=True,
                                       scoring='neg_mean_absolute_error')
However I am getting the following issue:
ValueError: Invalid parameter reg_alpha for estimator Pipeline(steps=[('scaler', StandardScaler()),
('variance_selector', VarianceThreshold(threshold=0.03)),
('estimator',
XGBRegressor(base_score=None, booster=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, gamma=None, gpu_id=None,
importance_type='gain',
interaction_constraints=None, learning_rate=None,
max_delta_step=None, max_depth=None,
min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100,
n_jobs=None, num_parallel_tree=None,
random_state=None, reg_alpha=None,
reg_lambda=None, scale_pos_weight=None,
subsample=None, tree_method=None,
validate_parameters=None, verbosity=None))]). Check the list of available parameters with `estimator.get_params().keys()`.
Did I misunderstand the scikit-learn documentation? I have also tried setting the parameters as estimator__estimator__param, as maybe this is the way to access the parameters when they sit inside the mimo_wrapper, but this has proved unsuccessful. (Example below:)
parameters = {
    'estimator__estimator__reg_alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'estimator__estimator__max_depth': [10, 100, 1000]
}
random_grid = RandomizedSearchCV(estimator=pipeline_xgboost, param_distributions=parameters, random_state=0, n_iter=5,
                                 n_jobs=-1, refit=True, cv=3, verbose=True,
                                 pre_dispatch='2*n_jobs', error_score='raise',
                                 return_train_score=True,
                                 scoring='neg_mean_absolute_error')
hyperparameters_tuning = random_grid.fit(df.drop(columns=TARGETS+UMAPS),
                                         df[TARGETS])
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
/tmp/ipykernel_11898/2539017483.py in <module>
----> 1 hyperparameters_tuning = random_grid.fit(final_file_df_with_aggregates.drop(columns=TARGETS+UMAPS),
2 final_file_df_with_aggregates[TARGETS])
/anaconda/envs/azureml_py38/lib/python3.8/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
889 return results
890
--> 891 self._run_search(evaluate_candidates)
892
893 # multimetric is determined here because in the case of a callable
/anaconda/envs/azureml_py38/lib/python3.8/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1764 def _run_search(self, evaluate_candidates):
1765 """Search n_iter candidates from param_distributions"""
-> 1766 evaluate_candidates(
1767 ParameterSampler(
1768 self.param_distributions, self.n_iter, random_state=self.random_state
/anaconda/envs/azureml_py38/lib/python3.8/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results)
836 )
837
--> 838 out = parallel(
839 delayed(_fit_and_score)(
840 clone(base_estimator),
/anaconda/envs/azureml_py38/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
1054
1055 with self._backend.retrieval_context():
-> 1056 self.retrieve()
1057 # Make sure that we get a last message telling us we are done
1058 elapsed_time = time.time() - self._start_time
/anaconda/envs/azureml_py38/lib/python3.8/site-packages/joblib/parallel.py in retrieve(self)
933 try:
934 if getattr(self._backend, 'supports_timeout', False):
--> 935 self._output.extend(job.get(timeout=self.timeout))
936 else:
937 self._output.extend(job.get())
/anaconda/envs/azureml_py38/lib/python3.8/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
/anaconda/envs/azureml_py38/lib/python3.8/concurrent/futures/_base.py in result(self, timeout)
437 raise CancelledError()
438 elif self._state == FINISHED:
--> 439 return self.__get_result()
440 else:
441 raise TimeoutError()
/anaconda/envs/azureml_py38/lib/python3.8/concurrent/futures/_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
Funnily enough, I have noticed that setting the estimator parameters outside the random-search function works well:
parameters = dict({
    'estimator__max_depth': [10, 100, 1000]
})
mimo_wrapper.estimator.set_params(estimator__max_depth=200)
And as you can see the max_depth is now changed.
Pipeline(steps=[('scaler', StandardScaler()),
('variance_selector', VarianceThreshold(threshold=0.03)),
('estimator',
XGBRegressor(base_score=None, booster=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, gamma=None, gpu_id=None,
importance_type='gain',
interaction_constraints=None, learning_rate=None,
max_delta_step=None, max_depth=200,
min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100,
n_jobs=None, num_parallel_tree=None,
random_state=None, reg_alpha=None,
reg_lambda=None, scale_pos_weight=None,
subsample=None, tree_method=None,
validate_parameters=None, verbosity=None))])
Dear colleagues, it seems this was due to a problem in XGBRegressor. In any case, the right way to create parameters for the MultiOutputRegressor wrapping a pipeline is:
parameters = {
    'estimator__estimator__reg_alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'estimator__estimator__max_depth': [10, 100, 1000]
}
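A minimal sketch of plugging these doubly-prefixed parameters back into the search from the question, with mimo_wrapper (not the bare pipeline) as the estimator:
# the outer "estimator__" addresses the pipeline wrapped by MultiOutputRegressor,
# the inner "estimator__" addresses the pipeline step holding the XGBRegressor
randomized_search = RandomizedSearchCV(mimo_wrapper, parameters, random_state=0, n_iter=5,
                                       n_jobs=-1, refit=True, cv=3, verbose=True,
                                       error_score='raise', return_train_score=True,
                                       scoring='neg_mean_absolute_error')
# randomized_search.fit(X, y)  # y holds one column per target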

What's the error when I used GridSearchCV?

When I used GridSearchCV to find the best parameter settings for SVR, some errors happened.
The following is the code.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
param_grid = [
    {'kernel': ["poly"], 'degree': [2, 3, 4], 'C': [0.001, 0.1, 0.5, 1, 5, 10, 50, 100], 'episilon': [0.1, 0.2, 0.4]},
    {'kernal': ["rbf"], 'gamma': [1, 2, 5, 10], 'C': [0.001, 0.1, 0.5, 1, 5, 10, 50, 100]}
]
svm_reg = SVR()
grid_search = GridSearchCV(svm_reg, param_grid, cv=10, scoring = 'neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)
The input data X_train and y_train have been processed properly, and the error shown in JupyterLab is as follows:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-52-39598b5bc032> in <module>
7 svm_reg = SVR()
8 grid_search = GridSearchCV(svm_reg, param_grid, cv=10, scoring = 'neg_mean_squared_error', return_train_score=True)
----> 9 grid_search.fit(X_train, y_train)
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1286 def _run_search(self, evaluate_candidates):
1287 """Search all candidates in param_grid"""
-> 1288 evaluate_candidates(ParameterGrid(self.param_grid))
1289
1290
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
793 n_splits, n_candidates, n_candidates * n_splits))
794
--> 795 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
796 X, y,
797 train=train, test=test,
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
579 cloned_parameters[k] = clone(v, safe=False)
580
--> 581 estimator = estimator.set_params(**cloned_parameters)
582
583 start_time = time.time()
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\base.py in set_params(self, **params)
228 key, delim, sub_key = key.partition('__')
229 if key not in valid_params:
--> 230 raise ValueError('Invalid parameter %s for estimator %s. '
231 'Check the list of available parameters '
232 'with `estimator.get_params().keys()`.' %
ValueError: Invalid parameter episilon for estimator SVR(C=0.001, degree=2). Check the list of available parameters with `estimator.get_params().keys()`.
It says I should check the keys of SVR, but I cannot find the error.
It's just typos. In your code:
param_grid = [
    {'kernel': ["poly"], 'degree': [2, 3, 4], 'C': [0.001, 0.1, 0.5, 1, 5, 10, 50, 100], 'episilon': [0.1, 0.2, 0.4]},
    {'kernal': ["rbf"], 'gamma': [1, 2, 5, 10], 'C': [0.001, 0.1, 0.5, 1, 5, 10, 50, 100]}
]
Please check episilon and kernal: they should be epsilon and kernel.
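A corrected version of the grid with the same values, only the parameter names fixed:
param_grid = [
    {'kernel': ["poly"], 'degree': [2, 3, 4], 'C': [0.001, 0.1, 0.5, 1, 5, 10, 50, 100], 'epsilon': [0.1, 0.2, 0.4]},
    {'kernel': ["rbf"], 'gamma': [1, 2, 5, 10], 'C': [0.001, 0.1, 0.5, 1, 5, 10, 50, 100]}
]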

How to use xgboost.train()

I am new to XGBoost and I want to use the train() function, but when I try to I get the following error:
146 else:
147 try:
148 main()
149 except KeyboardInterrupt:
150 print("KeyboardInterrupt, exiting")
/usr/local/lib/python3.6/dist-packages/xgboost/core.py in __init__(self, params, cache, model_file)
938 for d in cache:
939 if not isinstance(d, DMatrix):
940 raise TypeError('invalid cache item: {}'.format(type(d).__name__))
941 self._validate_features(d)
942
TypeError: invalid cache item: DataFrame
My code is:
import xgboost as xgb

xgb_params = {
    "objective": "multi:softmax",
    "eta": 0.3,
    "num_class": 62,
    "max_depth": 10,
    "nthread": 4,
    "eval_metric": "merror",
    "print.every.n": 1
    # "silent": 1
}
clf = xgb.train(params=xgb_params, dtrain=df, num_boost_round=10)
XGBoost requires you to turn your DataFrame into a DMatrix before it can process it:
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)
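Then pass the DMatrix, not the raw DataFrame, to train(). A minimal sketch, assuming X_train/y_train hold the features and labels that were in df:
# train on the DMatrix built above, then predict on the held-out DMatrix
clf = xgb.train(params=xgb_params, dtrain=d_train, num_boost_round=10)
preds = clf.predict(d_test)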

IndexError: positional indexers are out-of-bounds - RandomizedSearchCV() - Random Forest

I am trying to build a random forest model using a walk-forward validation approach.
I use TimeBasedCV() to split my data accordingly: TimeBasedCV()
My code looks like this:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 10]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

from sklearn.model_selection import RandomizedSearchCV
from random import randint, uniform

tscv = TimeBasedCV(train_period=60, test_period=12, freq='months')
index_output = tscv.split(X_train, date_column='Date')

rf = RandomForestRegressor()

model = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    n_jobs=-1,
    cv=index_output,
    verbose=5,
    random_state=42,
    return_train_score=True)

model.fit(X_train.drop('Date', axis=1), y_train)
model.cv_results_
The error message from my model.fit is
IndexError: positional indexers are out-of-bounds
Do I have to adjust my randomized search, or is this error due to an error in my data?
IndexError Traceback (most recent call last)
<ipython-input-71-eebc6186b2c3> in <module>
18 return_train_score = True)
19
---> 20 model.fit(X_train,y_train)
21 model.cv_results_
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
708 return results
709
--> 710 self._run_search(evaluate_candidates)
711
712 # For multi-metric evaluation, store the best_index_, best_params_ and
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1482 evaluate_candidates(ParameterSampler(
1483 self.param_distributions, self.n_iter,
-> 1484 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
687 for parameters, (train, test)
688 in product(candidate_params,
--> 689 cv.split(X, y, groups)))
690
691 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
IndexError: positional indexers are out-of-bounds
