When I used GridSearchCV to find the best parameter settings for SVR, an error occurred.
The following is the code.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
param_grid = [
{'kernel':["poly"], 'degree':[2, 3, 4],'C':[0.001, 0.1, 0.5, 1, 5, 10, 50, 100], 'episilon':[0.1, 0.2, 0.4]},
{'kernal':["rbf"], 'gamma':[1, 2, 5, 10], 'C':[0.001, 0.1, 0.5, 1, 5, 10, 50, 100]}
]
svm_reg = SVR()
grid_search = GridSearchCV(svm_reg, param_grid, cv=10, scoring = 'neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)
The input data X_train and y_train have been processed properly. The error shown in JupyterLab is as follows:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-52-39598b5bc032> in <module>
7 svm_reg = SVR()
8 grid_search = GridSearchCV(svm_reg, param_grid, cv=10, scoring = 'neg_mean_squared_error', return_train_score=True)
----> 9 grid_search.fit(X_train, y_train)
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1286 def _run_search(self, evaluate_candidates):
1287 """Search all candidates in param_grid"""
-> 1288 evaluate_candidates(ParameterGrid(self.param_grid))
1289
1290
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
793 n_splits, n_candidates, n_candidates * n_splits))
794
--> 795 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
796 X, y,
797 train=train, test=test,
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
579 cloned_parameters[k] = clone(v, safe=False)
580
--> 581 estimator = estimator.set_params(**cloned_parameters)
582
583 start_time = time.time()
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\base.py in set_params(self, **params)
228 key, delim, sub_key = key.partition('__')
229 if key not in valid_params:
--> 230 raise ValueError('Invalid parameter %s for estimator %s. '
231 'Check the list of available parameters '
232 'with `estimator.get_params().keys()`.' %
ValueError: Invalid parameter episilon for estimator SVR(C=0.001, degree=2). Check the list of available parameters with `estimator.get_params().keys()`.
The message says that I should check the parameter keys of SVR, but I cannot find what the error is.
It's just typos. In your code:
param_grid = [
{'kernel':["poly"], 'degree':[2, 3, 4],'C':[0.001, 0.1, 0.5, 1, 5, 10, 50, 100], 'episilon':[0.1, 0.2, 0.4]},
{'kernal':["rbf"], 'gamma':[1, 2, 5, 10], 'C':[0.001, 0.1, 0.5, 1, 5, 10, 50, 100]}
]
Please check `episilon` and `kernal`: the correct parameter names are `epsilon` and `kernel`.
Related
I am trying to use random forest classifier, logistic regression and SVC in a voting classifier (hard voting), but I encounter an error saying: AttributeError: 'str' object has no attribute 'decode'
Here is my code:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
log_clf = LogisticRegression(solver = 'lbfgs', random_state = 42)
rnd_clf = RandomForestClassifier(n_estimators = 100, random_state = 42)
svm_clf = SVC(gamma = "scale", random_state = 42)
voting_clf = VotingClassifier(estimators = [('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
voting = 'hard')
y_train_raveled = y_train.ravel()
voting_clf.fit(X_train, y_train_raveled)
Here is my X_train and y_train:
>>> X_train
array([[-1.00334117e-01, 1.00000000e+00, 0.00000000e+00, ...,
1.30000000e+01, 4.20000000e+01, 5.60000000e+01],
[-6.94836737e-03, 0.00000000e+00, 0.00000000e+00, ...,
8.00000000e+00, 4.90000000e+01, 1.90000000e+01],
[-7.61506695e-02, 0.00000000e+00, 0.00000000e+00, ...,
1.40000000e+01, 3.30000000e+01, 5.00000000e+01],
...,
>>> y_train.ravel()
array([0., 0., 0., ..., 0., 1., 0.])
This error comes up. I already used StandardScaler() to scale the numerical data and OrdinalEncoder() to convert the categorical data. Could anyone please tell me what I did wrong?
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-56-5836b28a28cb> in <module>
1 y_train_raveled = y_train.ravel()
----> 2 voting_clf.fit(X_train, y_train_raveled)
~\Anaconda3\lib\site-packages\sklearn\ensemble\_voting.py in fit(self, X, y, sample_weight)
220 transformed_y = self.le_.transform(y)
221
--> 222 return super().fit(X, transformed_y, sample_weight)
223
224 def predict(self, X):
~\Anaconda3\lib\site-packages\sklearn\ensemble\_voting.py in fit(self, X, y, sample_weight)
66 delayed(_parallel_fit_estimator)(clone(clf), X, y,
67 sample_weight=sample_weight)
---> 68 for clf in clfs if clf not in (None, 'drop')
69 )
70
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\ensemble\_base.py in _parallel_fit_estimator(estimator, X, y, sample_weight)
34 raise
35 else:
---> 36 estimator.fit(X, y)
37 return estimator
38
~\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py in fit(self, X, y, sample_weight)
1599 penalty=penalty, max_squared_sum=max_squared_sum,
1600 sample_weight=sample_weight)
-> 1601 for class_, warm_start_coef_ in zip(classes_, warm_start_coef))
1602
1603 fold_coefs_, _, n_iter_ = zip(*fold_coefs_)
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py in _logistic_regression_path(X, y, pos_class, Cs, fit_intercept, max_iter, tol, verbose, solver, coef, class_weight, dual, penalty, intercept_scaling, multi_class, random_state, check_input, max_squared_sum, sample_weight, l1_ratio)
938 n_iter_i = _check_optimize_result(
939 solver, opt_res, max_iter,
--> 940 extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
941 w0, loss = opt_res.x, opt_res.fun
942 elif solver == 'newton-cg':
~\Anaconda3\lib\site-packages\sklearn\utils\optimize.py in _check_optimize_result(solver, result, max_iter, extra_warning_msg)
241 " https://scikit-learn.org/stable/modules/"
242 "preprocessing.html"
--> 243 ).format(solver, result.status, result.message.decode("latin1"))
244 if extra_warning_msg is not None:
245 warning_msg += "\n" + extra_warning_msg
AttributeError: 'str' object has no attribute 'decode'
I am trying to implement the code shown here. The code was working fine and then stopped; I have restarted, updated, etc., but I keep getting:
TypeError: issubclass() arg 2 must be a class or tuple of classes
smote = SMOTE(random_state = 45)
X_train1, X_test1, y_train1, y_test1 = train_test_split(Xall, yall,
test_size = 0.3, random_state = 123)
# fit smote on training data
balanced_X1, balanced_y1 = smote.fit_sample(X_train1, y_train1)
# smote outputs numpy array therefore transformed to df
balanced_X1 = pd.DataFrame(data=balanced_X1, columns= X_train1.columns )
balanced_y1 = pd.DataFrame(data = balanced_y1,columns=['y'])
# hypertuning parameters; Create hyperparameter grid and fit
param_grid = {'penalty' : ['l1', 'l2'], 'C' : [0.001, 0.01, 0.1, 1, 10,
100]}
clf = GridSearchCV(LogisticRegression(random_state = 123,
),
param_grid,
cv=5)
best = clf.fit(balanced_X1, balanced_y1)
print('Best Penalty:', best.best_estimator_.get_params()['penalty'])
print('Best C:', best.best_estimator_.get_params()['C'])
I am just trying to run a grid search on said code but can't get past this error. PLEASE help
error message in full:
TypeError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in
_fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
fit_params, return_train_score, return_parameters, return_n_test_samples,
return_times, return_estimator, error_score)
513 else:
--> 514 estimator.fit(X_train, y_train, **fit_params)
515
~\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self,
X, y, sample_weight)
1492 """
-> 1493 solver = _check_solver(self.solver, self.penalty, self.dual)
1494
~\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in
_check_solver(solver, penalty, dual)
431 "Specify a solver to silence this warning.",
--> 432 FutureWarning)
433
TypeError: issubclass() arg 2 must be a class or tuple of classes
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-13-925cae965240> in <module>
6 param_grid,
7 cv=5)
----> 8 cv.fit(X_train1, y_train1)
9 print('Best Penalty:', cv.best_estimator_.get_params()['penalty'])
10 print('Best C:', cv.best_estimator_.get_params()['C'])
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self,
X, y, groups, **fit_params)
685 return results
686
--> 687 self._run_search(evaluate_candidates)
688
689 # For multi-metric evaluation, store the best_index_,
best_params_ and
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in
_run_search(self, evaluate_candidates)
1146 def _run_search(self, evaluate_candidates):
1147 """Search all candidates in param_grid"""
-> 1148 evaluate_candidates(ParameterGrid(self.param_grid))
1149
1150
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in
evaluate_candidates(candidate_params)
664 for parameters, (train, test)
665 in product(candidate_params,
--> 666 cv.split(X, y, groups)))
667
668 if len(out) < 1:
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self,
iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in
apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self,
batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in
_fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
fit_params, return_train_score, return_parameters, return_n_test_samples,
return_times, return_estimator, error_score)
526 "raised or error_score=np.nan to adopt
the "
527 "behavior from version 0.22.",
--> 528 FutureWarning)
529 raise
530 elif isinstance(error_score, numbers.Number):
TypeError: issubclass() arg 2 must be a class or tuple of classes
UPDATE: I just ran this code without SMOTE, which I thought was the likely culprit, and it turns out that the issue lies somewhere in scikit-learn.
I'm trying to run sklearn's ColumnTransformer on individual Pipelines. My dataframe is called listings_prepared. All features are float or int. The dataframe is clean; the only issue is a few missing values, which are marked NaN, so SimpleImputer should handle them.
Sklearn provides documentation here on running pipelines through ColumnTransformer, which is what I've followed.
First, I create the pipelines using Pipeline:
num_pipeline = Pipeline([
('num_imputer', SimpleImputer(strategy='median')),
('num_scaler', StandardScaler()),
])
# NOTE(review): Pipeline steps are unpacked as (name, step) pairs (see the
# `for name, step in self.steps` frame in the traceback below), so the
# three-element 'disc_scaler' entry is the likely cause of
# "too many values to unpack (expected 2)". The column selection
# (disc_attribs) is already passed to ColumnTransformer, so it presumably
# should be dropped from this step -- confirm.
disc_pipeline = Pipeline([
('disc_imputer', SimpleImputer(strategy='most_frequent')),
('disc_scaler', StandardScaler(), disc_attribs),
])
cat_pipeline = Pipeline([
('cat_imputer', SimpleImputer(strategy='most_frequent')),
('cat_ohe', OneHotEncoder(categories='auto', drop='first',
sparse=False)),
])
amen_pipeline= Pipeline([
('amen_imputer', SimpleImputer(strategy='most_frequent')),
])
Then, I run them through ColumnTransformer:
listings_pipeline = ColumnTransformer([
('num', num_pipeline, num_attribs),
('disc', disc_pipeline, disc_attribs),
('cat', cat_pipeline, cat_attribs),
('amen', amen_pipeline, amen_attribs),
])
X_train = listings_pipeline.fit_transform(listings_explore)
Here's the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-511-9d5100fe0f5d> in <module>
12 ('amen', amen_pipeline, amen_attribs),
13 ])
---> 14 X_train = listings_pipeline.fit_transform(listings_explore_pipeline)
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
466 self._validate_remainder(X)
467
--> 468 result = self._fit_transform(X, y, _fit_transform_one)
469
470 if not result:
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
410 message=self._log_message(name, idx, len(transformers)))
411 for idx, (name, trans, column, weight) in enumerate(
--> 412 self._iter(fitted=fitted, replace_strings=True), 1))
413 except ValueError as e:
414 if "Expected 2D array, got 1D array instead" in str(e):
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
922 self._iterating = self._original_iterator is not None
923
--> 924 while self.dispatch_one_batch(iterator):
925 pass
926
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 """
386 last_step = self._final_estimator
--> 387 Xt, fit_params = self._fit(X, y, **fit_params)
388 with _print_elapsed_time('Pipeline',
389 self._log_message(len(self.steps) - 1)):
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
270 fit_transform_one_cached = memory.cache(_fit_transform_one)
271
--> 272 fit_params_steps = {name: {} for name, step in self.steps
273 if step is not None}
274 for pname, pval in fit_params.items():
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in <dictcomp>(.0)
270 fit_transform_one_cached = memory.cache(_fit_transform_one)
271
--> 272 fit_params_steps = {name: {} for name, step in self.steps
273 if step is not None}
274 for pname, pval in fit_params.items():
ValueError: too many values to unpack (expected 2)
Why isn't this working?
I'm new to PyTorch. I'm trying to do cross-validation, and I found the skorch library, which allows users to use sklearn functions with a torch model. So, I defined a neural network class:
torch.manual_seed(42)
class Netcross(nn.Module):
    """Small fully connected network: 5 inputs -> 30 -> 30 -> 1 output.

    The layers are applied in the order fc1, Tanh, fc2, Sigmoid, out,
    Sigmoid, so every output value lies in the open interval (0, 1).
    """

    def __init__(self):
        """Create the three linear layers and their activation modules."""
        super().__init__()
        # First hidden layer: 5 input features -> 30 units.
        self.fc1 = nn.Linear(5,30)
        # NOTE: despite the "sig" prefix, this activation is Tanh.
        self.sig1 = nn.Tanh()
        #self.dout = nn.Dropout(0.2)
        # Second hidden layer: 30 -> 30 units, followed by a sigmoid.
        self.fc2 = nn.Linear(30,30)
        self.sig2 = nn.Sigmoid()
        # Output layer: 30 -> 1 unit, squashed by a final sigmoid.
        self.out = nn.Linear(30, 1)
        self.out_act = nn.Sigmoid()
        #self.fc1.weight = torch.nn.Parameter(torch.rand(50,5))

    def forward(self, x):
        """Apply the layers in sequence and return the sigmoid output.

        The last dimension of ``x`` must have size 5, matching the
        ``in_features`` of ``fc1``.
        """
        x = self.fc1(x)
        x = self.sig1(x)
        #x = self.dout(x)
        x = self.fc2(x)
        x = self.sig2(x)
        x = self.out(x)
        y = self.out_act(x)
        return y
crossnet1 = NeuralNet(
Netcross,
max_epochs = 5,
criterion=torch.nn.BCELoss,
#user defined coeff.
callbacks = [epoch_acc, epoch_f1, epoch_phi],
optimizer=torch.optim.SGD,
optimizer__momentum=0.9,
lr=0.85,
)
inputs = Variable(x_traintensor)
labels = Variable(y_traintensor)
crossnet1.fit(inputs, labels)
So far everything is fine: the function returns credible results without any errors. The problem appears when I try to use the GridSearchCV function:
from sklearn.model_selection import GridSearchCV
param_grid = {'max_epochs':[5, 10, 20],
'lr': [0.1, 0.65, 0.8],
}
gs = GridSearchCV(estimator = crossnet1, param_grid = param_grid, refit = False, cv = 3, scoring = "accuracy")
gs.fit(inputs, labels)
I get the following error:
TypeError Traceback (most recent call last)
<ipython-input-41-e1f3dbd9a2b0> in <module>
3 labels1 = torch.from_numpy(np.array(labels))
4
----> 5 gs.fit(inputs1, labels1)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
516 start_time = time.time()
517
--> 518 X_train, y_train = _safe_split(estimator, X, y, train)
519 X_test, y_test = _safe_split(estimator, X, y, test, train)
520
~\Anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
201 X_subset = X[np.ix_(indices, train_indices)]
202 else:
--> 203 X_subset = safe_indexing(X, indices)
204
205 if y is not None:
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices)
214 indices.dtype.kind == 'i'):
215 # This is often substantially faster than X[indices]
--> 216 return X.take(indices, axis=0)
217 else:
218 return X[indices]
TypeError: take(): argument 'index' (position 1) must be Tensor, not numpy.ndarray
What is wrong?
Change your input and labels to np.ndarray (see examples here).
They will be cast to torch.Tensor automatically by skorch when needed.
All in all change your
inputs = Variable(x_traintensor)
labels = Variable(y_traintensor)
to:
inputs = x_traintensor.numpy() # assuming x is torch.Tensor
labels = y_traintensor.numpy() # assuming y is torch.Tensor
BTW, torch.autograd.Variable is deprecated; to create a tensor that tracks gradients you should use torch.tensor(data, requires_grad=True) instead. In this case, inputs and labels do not need gradients, hence Variable is even more out of place.
I am using GridSearchCV to search for the best hyperparameters of a classifier, as described here: http://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html
Here is what a piece of the code looks like:
X = X.values # convert from pandas Dataframe to numpy array
y = np.array(y)
n_samples, n_features = X.shape
n_outputs = y.shape[0]
# NOTE(review): both splitters are constructed from the full label vector y,
# so the indices they yield range over all samples in y.
inner_cv = cross_validation.StratifiedKFold(y, n_folds=4, shuffle=True, random_state=rnd)
outer_cv = cross_validation.StratifiedKFold(y, n_folds=kFold, shuffle=True, random_state=rnd)
# Non_nested parameter search and scoring
# NOTE(review): when clf is fitted inside cross_val_score it only receives the
# outer training subset, while inner_cv still yields indices computed from the
# full y. This looks like the source of the "index 4549 is out of bounds for
# size 4549" error in the traceback below -- confirm.
clf = GridSearchCV(estimator=pipeline, param_grid=param_dict, scoring= scores, cv=inner_cv)
# Nested CV with parameter optimization
nested_score = cross_validation.cross_val_score(clf, X=X, y=y, cv=outer_cv)
# NOTE(review): cross_val_score returns an array of scores (the traceback shows
# `return np.array(scores)[:, 0]`), so calling .fit on the result presumably
# fails and this line should be removed -- confirm.
nested_score.fit(X,y)
nested_scores = nested_score.mean()
However, for some reason I am getting this error:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-1-cad4e848fb54> in <module>()
124
125 # Nested CV with parameter optimization
--> 126 nested_score = cross_validation.cross_val_score(clf, X=X, y=y, cv=outer_cv)
127 nested_score.fit(X,y)
128 nested_scores = nested_score.mean()
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\cross_validation.py in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
1431 train, test, verbose, None,
1432 fit_params)
-> 1433 for train, test in cv)
1434 return np.array(scores)[:, 0]
1435
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1529 estimator.fit(X_train, **fit_params)
1530 else:
-> 1531 estimator.fit(X_train, y_train, **fit_params)
1532
1533 except Exception as e:
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\grid_search.py in fit(self, X, y)
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
805
806
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\grid_search.py in _fit(self, X, y, parameter_iterable)
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
554 for train, test in cv)
555
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1522 start_time = time.time()
1523
-> 1524 X_train, y_train = _safe_split(estimator, X, y, train)
1525 X_test, y_test = _safe_split(estimator, X, y, test, train)
1526
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\cross_validation.py in _safe_split(estimator, X, y, indices, train_indices)
1589 X_subset = X[np.ix_(indices, train_indices)]
1590 else:
-> 1591 X_subset = safe_indexing(X, indices)
1592
1593 if y is not None:
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices)
161 indices.dtype.kind == 'i'):
162 # This is often substantially faster than X[indices]
--> 163 return X.take(indices, axis=0)
164 else:
165 return X[indices]
IndexError: index 4549 is out of bounds for size 4549
X and y have the following dimensions:
X: (6066, 5)
y: (6066,)
Everything looks normal. Where does the problem originate?
Thanks for sharing your opinion.
Not sure what you are trying to do here but GridsearchCV is not a classifier and therefore you can't pass it to cross_val_score.
GridSearchCV runs cross-validation multiple times using different parameters. Therefore it represents multiple classifiers. Once it has been fitted, it exposes the best one through its best_estimator_ attribute.