H2O Python API: retrieve best models from GridSearch

I am performing a grid search with H2O via the Python API, using the following code:
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid import H2OGridSearch

hyper_parameters = {'ntrees': [10, 50, 100, 200],
                    'max_depth': [5, 10, 15, 20, 25],
                    'balance_classes': [True, False]}
search_criteria = {
    "strategy": "RandomDiscrete",
    "max_runtime_secs": 600,
    "max_models": 30,
    "stopping_metric": 'AUTO',
    "stopping_tolerance": 0.0001,
    'seed': 42
}
grid_search = H2OGridSearch(H2ORandomForestEstimator, hyper_parameters,
                            search_criteria=search_criteria)
grid_search.train(x=events_names_x,
                  y="total_rsvps",
                  training_frame=train,
                  validation_frame=test)
Once it has run, I want to print the models in order of AUC and predict with the best one:
grid_search.sort_by('auc', False)
but I get the following error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-272-b250bf2b838e> in <module>()
----> 1 grid_search.sort_by('auc', False)
/Users/stereo/.pyenv/versions/3.5.2/lib/python3.5/site-packages/h2o/grid/grid_search.py in sort_by(self, metric, increasing)
663
664 if metric[-1] != ')': metric += '()'
--> 665 c_values = [list(x) for x in zip(*sorted(eval('self.' + metric + '.items()'), key=lambda k_v: k_v[1]))]
666 c_values.insert(1, [self.get_hyperparams(model_id, display=False) for model_id in c_values[0]])
667 if not increasing:
/Users/stereo/.pyenv/versions/3.5.2/lib/python3.5/site-packages/h2o/grid/grid_search.py in <module>()
/Users/stereo/.pyenv/versions/3.5.2/lib/python3.5/site-packages/h2o/grid/grid_search.py in auc(self, train, valid, xval)
606 :return: The AUC.
607 """
--> 608 return {model.model_id: model.auc(train, valid, xval) for model in self.models}
609
610 def aic(self, train=False, valid=False, xval=False):
/Users/stereo/.pyenv/versions/3.5.2/lib/python3.5/site-packages/h2o/grid/grid_search.py in <dictcomp>(.0)
606 :return: The AUC.
607 """
--> 608 return {model.model_id: model.auc(train, valid, xval) for model in self.models}
609
610 def aic(self, train=False, valid=False, xval=False):
/Users/stereo/.pyenv/versions/3.5.2/lib/python3.5/site-packages/h2o/model/model_base.py in auc(self, train, valid, xval)
669 tm = ModelBase._get_metrics(self, train, valid, xval)
670 m = {}
--> 671 for k, v in viewitems(tm): m[k] = None if v is None else v.auc()
672 return list(m.values())[0] if len(m) == 1 else m
673
/Users/stereo/.pyenv/versions/3.5.2/lib/python3.5/site-packages/h2o/model/metrics_base.py in auc(self)
158 :return: Retrieve the AUC for this set of metrics.
159 """
--> 160 return self._metric_json['AUC']
161
162 def aic(self):
KeyError: 'AUC'
Any advice on how I can:
print the models in order of performance
forecast with the model with the highest AUC

What you need is:
sorted_grid = grid_search.get_grid(sort_by='auc', decreasing=True)
print(sorted_grid)
You can change decreasing to False if you prefer ascending order.
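To then forecast with the highest-AUC model, a minimal sketch along these lines should work (assuming test is the H2OFrame you want to score; get_grid returns the models already sorted):
sorted_grid = grid_search.get_grid(sort_by='auc', decreasing=True)
best_model = sorted_grid.models[0]        # first entry has the highest AUC
predictions = best_model.predict(test)    # H2OFrame of predictions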

Related

Rapids CUML Random Forest Regression Model doesn't return the scoring function value

I used the code below to tune the model and search for the best combination of hyperparameters using the grid search algorithm. However, the model does not evaluate and return the score value of each combination.
I used the same approach with the SVM regressor model and it returned the score during training.
Here is the code:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from cuml.ensemble import RandomForestRegressor

param_grid2 = {
    'bootstrap': [True, False],
    'max_depth': [16, 30, 50, 80, 90, 100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_leaf': [1, 3, 4, 5],
    'min_samples_split': [2, 4, 8, 10, 12],
    'n_estimators': [100, 200, 300, 400, 500]
}

# Create a base model
rfr_cu = RandomForestRegressor()  # removed the output_type; see https://docs.rapids.ai/api/cuml/stable/api.html#output-data-type-configuration

# Instantiate the grid search model
rfr_tune_cu = GridSearchCV(estimator=rfr_cu, param_grid=param_grid2,
                           scoring='neg_mean_squared_error',
                           cv=3, n_jobs=-1, verbose=3,
                           error_score='raise', return_train_score=True)

# Fit the grid search to the data (this is the call that raises the error below)
rfr_tune_cu.fit(cu_X_train, cu_y_train)
The output message:
RuntimeError Traceback (most recent call last)
Input In [80], in <cell line: 2>()
1 # Fit the grid search to the data
----> 2 rfr_tune_cu.fit(cu_X_train, cu_y_train)
File /usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_search.py:875, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
869 results = self._format_results(
870 all_candidate_params, n_splits, all_out, all_more_results
871 )
873 return results
--> 875 self._run_search(evaluate_candidates)
877 # multimetric is determined here because in the case of a callable
878 # self.scoring the return type is only known after calling
879 first_test_score = all_out[0]["test_scores"]
File /usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_search.py:1375, in GridSearchCV._run_search(self, evaluate_candidates)
1373 def _run_search(self, evaluate_candidates):
1374 """Search all candidates in param_grid"""
-> 1375 evaluate_candidates(ParameterGrid(self.param_grid))
File /usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_search.py:822, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
814 if self.verbose > 0:
815 print(
816 "Fitting {0} folds for each of {1} candidates,"
817 " totalling {2} fits".format(
818 n_splits, n_candidates, n_candidates * n_splits
819 )
820 )
--> 822 out = parallel(
823 delayed(_fit_and_score)(
824 clone(base_estimator),
825 X,
826 y,
827 train=train,
828 test=test,
829 parameters=parameters,
830 split_progress=(split_idx, n_splits),
831 candidate_progress=(cand_idx, n_candidates),
832 **fit_and_score_kwargs,
833 )
834 for (cand_idx, parameters), (split_idx, (train, test)) in product(
835 enumerate(candidate_params), enumerate(cv.split(X, y, groups))
836 )
837 )
839 if len(out) < 1:
840 raise ValueError(
841 "No fits were performed. "
842 "Was the CV iterator empty? "
843 "Were there no candidates?"
844 )
File /usr/local/lib/python3.9/dist-packages/joblib/parallel.py:1056, in Parallel.__call__(self, iterable)
1053 self._iterating = False
1055 with self._backend.retrieval_context():
-> 1056 self.retrieve()
1057 # Make sure that we get a last message telling us we are done
1058 elapsed_time = time.time() - self._start_time
File /usr/local/lib/python3.9/dist-packages/joblib/parallel.py:935, in Parallel.retrieve(self)
933 try:
934 if getattr(self._backend, 'supports_timeout', False):
--> 935 self._output.extend(job.get(timeout=self.timeout))
936 else:
937 self._output.extend(job.get())
File /usr/local/lib/python3.9/dist-packages/joblib/_parallel_backends.py:542, in LokyBackend.wrap_future_result(future, timeout)
539 """Wrapper for Future.result to implement the same behaviour as
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
File /usr/lib/python3.9/concurrent/futures/_base.py:446, in Future.result(self, timeout)
444 raise CancelledError()
445 elif self._state == FINISHED:
--> 446 return self.__get_result()
447 else:
448 raise TimeoutError()
File /usr/lib/python3.9/concurrent/futures/_base.py:391, in Future.__get_result(self)
389 if self._exception:
390 try:
--> 391 raise self._exception
392 finally:
393 # Break a reference cycle with the exception in self._exception
394 self = None
RuntimeError: CUDA error encountered at: file=/project/cpp/src/decisiontree/batched-levelalgo/quantiles.cuh line=52: call='cub::DeviceRadixSort::SortKeys( nullptr, temp_storage_bytes, data, sorted_column.data(), n_rows, 0, 8 * sizeof(T), stream)', Reason=cudaErrorInvalidDeviceFunction:invalid device function

Error using sci-kit learn MLPClassifier: 'str' object has no attribute 'decode'

I am creating, tuning, and fitting various scikit-learn models for a classification problem. The structure of the code below works fine for all other methods (SVM, logistic regression, random forest, decision tree, etc.).
When running MLPClassifier() and its hyperparameter tuning, I get the following error message, originating from the last line, where I fit:
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_8160/4102235488.py in <module>
19
20 reverse_mlp_en_clf = GridSearchCV(reverse_mlp_en, reverse_mlp_en_hyperparameters, cv = 5, verbose = 1, n_jobs = 5, scoring = 'f1')
---> 21 reverse_mlp_en_best_model = reverse_mlp_en_clf.fit(X_en, y_en)
~\AppData\Roaming\Python\Python38\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
737 refit_start_time = time.time()
738 if y is not None:
--> 739 self.best_estimator_.fit(X, y, **fit_params)
740 else:
741 self.best_estimator_.fit(X, **fit_params)
~\AppData\Roaming\Python\Python38\site-packages\sklearn\neural_network\_multilayer_perceptron.py in fit(self, X, y)
992 self : returns a trained MLP model.
993 """
--> 994 return self._fit(X, y, incremental=(self.warm_start and
995 hasattr(self, "classes_")))
996
~\AppData\Roaming\Python\Python38\site-packages\sklearn\neural_network\_multilayer_perceptron.py in _fit(self, X, y, incremental)
372 # Run the LBFGS solver
373 elif self.solver == 'lbfgs':
--> 374 self._fit_lbfgs(X, y, activations, deltas, coef_grads,
375 intercept_grads, layer_units)
376 return self
~\AppData\Roaming\Python\Python38\site-packages\sklearn\neural_network\_multilayer_perceptron.py in _fit_lbfgs(self, X, y, activations, deltas, coef_grads, intercept_grads, layer_units)
468 },
469 args=(X, y, activations, deltas, coef_grads, intercept_grads))
--> 470 self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
471 self.loss_ = opt_res.fun
472 self._unpack(opt_res.x)
~\AppData\Roaming\Python\Python38\site-packages\sklearn\utils\optimize.py in _check_optimize_result(solver, result, max_iter, extra_warning_msg)
241 " https://scikit-learn.org/stable/modules/"
242 "preprocessing.html"
--> 243 ).format(solver, result.status, result.message.decode("latin1"))
244 if extra_warning_msg is not None:
245 warning_msg += "\n" + extra_warning_msg
AttributeError: 'str' object has no attribute 'decode'
Code is as follows:
# Reverse hyperparameter tuning - MLP English
# Create model
reverse_mlp_en = MLPClassifier()

# Define parameters, store in dictionary
reverse_mlp_en_hidden_layer_sizes = [(50,), (100,), (150,), (200,)]
reverse_mlp_en_activation = ['logistic', 'tanh', 'relu']
reverse_mlp_en_alpha = [0.0001, 0.05]
reverse_mlp_en_learning_rate = ['constant', 'adaptive']
reverse_mlp_en_hyperparameters = dict(hidden_layer_sizes=reverse_mlp_en_hidden_layer_sizes,
                                      activation=reverse_mlp_en_activation,
                                      solver=reverse_mlp_en_solver,  # solver list not shown in this snippet; the traceback shows 'lbfgs' is among them
                                      alpha=reverse_mlp_en_alpha,
                                      learning_rate=reverse_mlp_en_learning_rate)

# Get model with the best parameters
reverse_mlp_en_clf = GridSearchCV(reverse_mlp_en, reverse_mlp_en_hyperparameters,
                                  cv=5, verbose=1, n_jobs=5, scoring='f1')
reverse_mlp_en_best_model = reverse_mlp_en_clf.fit(X_en, y_en)
Libraries are up to date, if I am not mistaken.
If anyone has an idea, that would be greatly appreciated. I just find it weird that:
the same code, but using different models, works fine, and
the error only shows up after everything has been fitted (which takes 40 minutes or so).

AttributeError: 'PipelinedRDD' object has no attribute '_jdf'

I am fairly new to PySpark and am getting an attribute error while trying to run a logistic regression. I am running the regression on MinMaxScaler vectors to get the probability values of a likely match-up between the data points.
number_games = df2.filter(df2.GAME_ID > 22000000).filter(
    df2.GAME_ID < 40000000).groupby("TEAM_ABBREVIATION").agg(
    (F.sum("FGM") / F.countDistinct("GAME_ID")).alias('Points_Per_Game'))

vectorassembler = VectorAssembler(inputCols=["Points_Per_Game"],
                                  outputCol="Performance")
scaler = MinMaxScaler(inputCol="Performance", outputCol="Output")

vectors = vectorassembler.transform(number_games)
scaler_model = scaler.fit(vectors)
scaler_data = scaler_model.transform(vectors)

statistics_teams = scaler_data.select('TEAM_ABBREVIATION',
                                      'Output')  # teams match up against one another
statistics_teams

RDD2 = sc.parallelize(statistics_teams.collect())
# RDD4 = RDD2.map(lambda x: x.split())  # even as a PipelinedRDD I get the same attribute error

lr = LogisticRegression(maxIter=20, regParam=0.001)
logistic_model = lr.fit(RDD2)
logistic_model.show()
The error returned is:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-46-3c0eb05824a8> in <module>
1 lr = LogisticRegression(maxIter=20, regParam=0.001)
----> 2 logistic_model = lr.fit(RDD4)
3
4 logistic_model.show()
c:\users\user\appdata\local\programs\python\python39\lib\site-packages\pyspark\ml\base.py in fit(self, dataset, params)
159 return self.copy(params)._fit(dataset)
160 else:
--> 161 return self._fit(dataset)
162 else:
163 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
c:\users\user\appdata\local\programs\python\python39\lib\site-packages\pyspark\ml\wrapper.py in _fit(self, dataset)
333
334 def _fit(self, dataset):
--> 335 java_model = self._fit_java(dataset)
336 model = self._create_model(java_model)
337 return self._copyValues(model)
c:\users\user\appdata\local\programs\python\python39\lib\site-packages\pyspark\ml\wrapper.py in _fit_java(self, dataset)
330 """
331 self._transfer_params_to_java()
--> 332 return self._java_obj.fit(dataset._jdf)
333
334 def _fit(self, dataset):
AttributeError: 'PipelinedRDD' object has no attribute '_jdf'
Can you try calling .fit() on the actual DataFrame, in this case statistics_teams? I think LogisticRegression works with DataFrames, not RDDs.
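A rough sketch of that suggestion (the "Output" feature column comes from the MinMaxScaler step in the question; the "label" column is an assumption for illustration, since LogisticRegression also needs a numeric label column to train on):
from pyspark.ml.classification import LogisticRegression

# Fit on the DataFrame itself rather than an RDD
lr = LogisticRegression(featuresCol="Output", labelCol="label",
                        maxIter=20, regParam=0.001)
logistic_model = lr.fit(statistics_teams)

# A fitted model has no .show(); transform the DataFrame to inspect predictions instead
logistic_model.transform(statistics_teams).show()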

IndexError: positional indexers are out-of-bounds - RandomizedSearchCV() - Random Forest

I am trying to build a random forest model using a walk-forward validation approach.
I use TimeBasedCV() to split my data accordingly: TimeBasedCV()
My code looks like this:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 10]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

from sklearn.model_selection import RandomizedSearchCV
from random import randint, uniform

tscv = TimeBasedCV(train_period=60, test_period=12, freq='months')
index_output = tscv.split(X_train, date_column='Date')

rf = RandomForestRegressor()

model = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    n_jobs=-1,
    cv=index_output,
    verbose=5,
    random_state=42,
    return_train_score=True)

model.fit(X_train.drop('Date', axis=1), y_train)
model.cv_results_
The error message for my model.fit() is:
IndexError: positional indexers are out-of-bounds
Do I have to adjust my randomized search, or is this error due to an error in my data?
IndexError Traceback (most recent call last)
<ipython-input-71-eebc6186b2c3> in <module>
18 return_train_score = True)
19
---> 20 model.fit(X_train,y_train)
21 model.cv_results_
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
708 return results
709
--> 710 self._run_search(evaluate_candidates)
711
712 # For multi-metric evaluation, store the best_index_, best_params_ and
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1482 evaluate_candidates(ParameterSampler(
1483 self.param_distributions, self.n_iter,
-> 1484 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
687 for parameters, (train, test)
688 in product(candidate_params,
--> 689 cv.split(X, y, groups)))
690
691 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
IndexError: positional indexers are out-of-bounds

XGBoost Python: XGBoostError: we need weight to evaluate ams

I am trying to use the XGBoost package in Python.
I get this error when running the following code:
import xgboost as xgb

data = np.array(traindata.drop('Category', axis=1))
labels = np.array(traindata['Category'].cat.codes)
dtrain = xgb.DMatrix(data, label=labels)

param = {'bst:max_depth': 6, 'bst:eta': 0.5, 'silent': 1, 'objective': 'multi:softprob'}
param['nthread'] = 4
param['eval_metric'] = 'mlogloss'
param['lambda'] = 1
param['num_class'] = 39

evallist = [(dtrain, 'train')]

plst = param.items()
plst += [('eval_metric', 'ams#0')]

num_round = 10
bst = xgb.train(plst, dtrain, num_round, evallist)
bst.save_model('0001.model')
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
in ()
17
18 num_round = 10
---> 19 bst = xgb.train( plst, dtrain, num_round, evallist )
20
21 bst.save_model('0001.model')
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/xgboost/training.pyc
in train(params, dtrain, num_boost_round, evals, obj, feval, maximize,
early_stopping_rounds, evals_result, verbose_eval, learning_rates,
xgb_model)
122 nboost += 1
123 if len(evals) != 0:
--> 124 bst_eval_set = bst.eval_set(evals, i, feval)
125 if isinstance(bst_eval_set, STRING_TYPES):
126 msg = bst_eval_set
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/xgboost/core.pyc
in eval_set(self, evals, iteration, feval)
753 _check_call(_LIB.XGBoosterEvalOneIter(self.handle, iteration,
754 dmats, evnames, len(evals),
--> 755 ctypes.byref(msg)))
756 return msg.value
757 else:
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/xgboost/core.pyc
in _check_call(ret)
95 """
96 if ret != 0:
---> 97 raise XGBoostError(_LIB.XGBGetLastError())
98
99
XGBoostError: we need weight to evaluate ams
I don't see anything about it in the docs:
https://xgboost.readthedocs.io/en/latest/python/python_intro.html
http://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
When computing the ams metric, you need to have a weight for each labeled training point. You set the weights using the weight keyword argument when creating your DMatrix. A simple example:
weights = np.ones(len(labels))
dtrain = xgb.DMatrix(data, label = labels, weight = weights)
And an in-depth example from a recent Kaggle competition: https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-numpy.py.
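In the context of the question's snippet, the fix would look roughly like this (a sketch; the uniform weights are placeholders for whatever per-sample weights suit your data):
weights = np.ones(len(labels))                              # placeholder per-sample weights
dtrain = xgb.DMatrix(data, label=labels, weight=weights)    # DMatrix now carries weights, so 'ams' can be evaluated
evallist = [(dtrain, 'train')]
bst = xgb.train(plst, dtrain, num_round, evallist)
bst.save_model('0001.model')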
