I am trying to use a random forest classifier, logistic regression, and SVC in a hard-voting VotingClassifier, but I get this error: AttributeError: 'str' object has no attribute 'decode'
Here is my code:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(solver='lbfgs', random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma='scale', random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')

y_train_raveled = y_train.ravel()
voting_clf.fit(X_train, y_train_raveled)
Here are my X_train and y_train:
>>> X_train
array([[-1.00334117e-01, 1.00000000e+00, 0.00000000e+00, ...,
1.30000000e+01, 4.20000000e+01, 5.60000000e+01],
[-6.94836737e-03, 0.00000000e+00, 0.00000000e+00, ...,
8.00000000e+00, 4.90000000e+01, 1.90000000e+01],
[-7.61506695e-02, 0.00000000e+00, 0.00000000e+00, ...,
1.40000000e+01, 3.30000000e+01, 5.00000000e+01],
...,
>>> y_train.ravel()
array([0., 0., 0., ..., 0., 1., 0.])
This is where the error comes up. I have already used StandardScaler() to scale the numerical data and OrdinalEncoder() to encode the categorical data. Could anyone please tell me what I did wrong?
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-56-5836b28a28cb> in <module>
1 y_train_raveled = y_train.ravel()
----> 2 voting_clf.fit(X_train, y_train_raveled)
~\Anaconda3\lib\site-packages\sklearn\ensemble\_voting.py in fit(self, X, y, sample_weight)
220 transformed_y = self.le_.transform(y)
221
--> 222 return super().fit(X, transformed_y, sample_weight)
223
224 def predict(self, X):
~\Anaconda3\lib\site-packages\sklearn\ensemble\_voting.py in fit(self, X, y, sample_weight)
66 delayed(_parallel_fit_estimator)(clone(clf), X, y,
67 sample_weight=sample_weight)
---> 68 for clf in clfs if clf not in (None, 'drop')
69 )
70
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\ensemble\_base.py in _parallel_fit_estimator(estimator, X, y, sample_weight)
34 raise
35 else:
---> 36 estimator.fit(X, y)
37 return estimator
38
~\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py in fit(self, X, y, sample_weight)
1599 penalty=penalty, max_squared_sum=max_squared_sum,
1600 sample_weight=sample_weight)
-> 1601 for class_, warm_start_coef_ in zip(classes_, warm_start_coef))
1602
1603 fold_coefs_, _, n_iter_ = zip(*fold_coefs_)
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py in _logistic_regression_path(X, y, pos_class, Cs, fit_intercept, max_iter, tol, verbose, solver, coef, class_weight, dual, penalty, intercept_scaling, multi_class, random_state, check_input, max_squared_sum, sample_weight, l1_ratio)
938 n_iter_i = _check_optimize_result(
939 solver, opt_res, max_iter,
--> 940 extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
941 w0, loss = opt_res.x, opt_res.fun
942 elif solver == 'newton-cg':
~\Anaconda3\lib\site-packages\sklearn\utils\optimize.py in _check_optimize_result(solver, result, max_iter, extra_warning_msg)
241 " https://scikit-learn.org/stable/modules/"
242 "preprocessing.html"
--> 243 ).format(solver, result.status, result.message.decode("latin1"))
244 if extra_warning_msg is not None:
245 warning_msg += "\n" + extra_warning_msg
AttributeError: 'str' object has no attribute 'decode'
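For reference, the failing frame is scikit-learn's _check_optimize_result calling result.message.decode("latin1"); newer SciPy returns that LBFGS message as str rather than bytes, so the .decode() call breaks on older scikit-learn. A minimal diagnostic sketch, assuming a scikit-learn/SciPy version mismatch is the cause:
# Hedged check: if scikit-learn predates the fix that dropped the .decode()
# call while SciPy is recent, the two disagree about result.message's type.
import scipy
import sklearn

print("scikit-learn:", sklearn.__version__)
print("scipy:", scipy.__version__)
Upgrading scikit-learn (e.g. pip install -U scikit-learn) is the usual fix. Alternatively, since the decode only happens while formatting a non-convergence warning, raising max_iter or scaling the features so LBFGS converges may sidestep the crash, without curing the underlying mismatch.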
Related
All,
I'm facing an issue while trying to use a KNN imputer in a pipeline. My workflow is listed below.
I have separated my numerical and categorical variables and built a pipeline as follows:
# Imports assumed for this snippet; KNN is fancyimpute's k-nearest-neighbours imputer.
from fancyimpute import KNN
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numeric_transformer = Pipeline(steps=[
    ('imputer', KNN(k=3)),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_attr),
        ('cat', categorical_transformer, cat_attr)])
I want to use the KNN imputer to fill the missing values in the numerical columns.
Then I ran logistic regression:
clf_logreg = Pipeline(steps=[('preprocessor', preprocessor),
                             ('classifier', LogisticRegression())])
clf_logreg.fit(X_train, Y_train)
The above code chunk worked fine, but when I try to predict on X_train I get the error below. Please help me out. Thanks.
train_pred_logreg = clf_logreg.predict(X_train)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-121-f17e49913947> in <module>
1 #train_pred_logreg = clf_logreg.predict(X_train)
----> 2 test_pred_logreg = clf_logreg.predict(X_test)
3
4 print(confusion_matrix(y_true=Y_train, y_pred = train_pred_logreg))
5
/opt/conda/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
419 Xt = X
420 for _, name, transform in self._iter(with_final=False):
--> 421 Xt = transform.transform(Xt)
422 return self.steps[-1][-1].predict(Xt, **predict_params)
423
/opt/conda/lib/python3.6/site-packages/sklearn/compose/_column_transformer.py in transform(self, X)
537 'remainder keyword')
538
--> 539 Xs = self._fit_transform(X, None, _transform_one, fitted=True)
540 self._validate_output(Xs)
541
/opt/conda/lib/python3.6/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
418 message=self._log_message(name, idx, len(transformers)))
419 for idx, (name, trans, column, weight) in enumerate(
--> 420 self._iter(fitted=fitted, replace_strings=True), 1))
421 except ValueError as e:
422 if "Expected 2D array, got 1D array instead" in str(e):
/opt/conda/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
/opt/conda/lib/python3.6/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
/opt/conda/lib/python3.6/site-packages/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
/opt/conda/lib/python3.6/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
/opt/conda/lib/python3.6/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
/opt/conda/lib/python3.6/site-packages/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
/opt/conda/lib/python3.6/site-packages/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in _transform_one(transformer, X, y, weight, **fit_params)
693
694 def _transform_one(transformer, X, y, weight, **fit_params):
--> 695 res = transformer.transform(X)
696 # if we have a weight for this transformer, multiply output
697 if weight is None:
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in _transform(self, X)
538 Xt = X
539 for _, _, transform in self._iter():
--> 540 Xt = transform.transform(Xt)
541 return Xt
542
/opt/conda/lib/python3.6/site-packages/fancyimpute/solver.py in transform(self, X, y)
223 "doesn't support inductive mode. Only %s.fit_transform is "
224 "supported at this time." % (
--> 225 self.__class__.__name__, self.__class__.__name__))
ValueError: KNN.transform not implemented! This imputation algorithm likely doesn't support inductive mode. Only KNN.fit_transform is supported at this time.
When I try to use fit_transform as the error message suggests, I get the error below:
clf_logreg.fit_transform(X_train, Y_train)
AttributeError: 'LogisticRegression' object has no attribute 'transform'
Try clf_logreg.named_steps['preprocessor'].fit_transform(X_train, Y_train) to test whether KNN works on its own. As for LogisticRegression, it has no transform method, since it is a classifier rather than a transformer.
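If upgrading packages is an option, a hedged alternative sketch: scikit-learn's own KNNImputer (added in version 0.22) does implement transform(), so it supports the inductive mode that fancyimpute's KNN lacks and can replace it inside the pipeline unchanged:
# Sketch assuming scikit-learn >= 0.22: KNNImputer implements transform(),
# so predict() on the fitted pipeline works, unlike fancyimpute's KNN.
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=3)),  # stands in for KNN(k=3)
    ('scaler', StandardScaler())])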
I'm trying to implement the code from here. The code was working fine, then it stopped; I've restarted, updated, and so on. I keep getting:
TypeError: issubclass() arg 2 must be a class or tuple of classes
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split

smote = SMOTE(random_state=45)
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    Xall, yall, test_size=0.3, random_state=123)

# fit SMOTE on training data
# (fit_sample is renamed fit_resample in newer imblearn releases)
balanced_X1, balanced_y1 = smote.fit_sample(X_train1, y_train1)

# SMOTE outputs numpy arrays, therefore transformed to DataFrames
balanced_X1 = pd.DataFrame(data=balanced_X1, columns=X_train1.columns)
balanced_y1 = pd.DataFrame(data=balanced_y1, columns=['y'])

# hyperparameter tuning: create hyperparameter grid and fit
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.001, 0.01, 0.1, 1, 10, 100]}
clf = GridSearchCV(LogisticRegression(random_state=123),
                   param_grid,
                   cv=5)
best = clf.fit(balanced_X1, balanced_y1)

print('Best Penalty:', best.best_estimator_.get_params()['penalty'])
print('Best C:', best.best_estimator_.get_params()['C'])
I am just trying to run a grid search on this code but can't get past the error. Please help.
The error message in full:
TypeError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in
_fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
fit_params, return_train_score, return_parameters, return_n_test_samples,
return_times, return_estimator, error_score)
513 else:
--> 514 estimator.fit(X_train, y_train, **fit_params)
515
~\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self,
X, y, sample_weight)
1492 """
-> 1493 solver = _check_solver(self.solver, self.penalty, self.dual)
1494
~\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in
_check_solver(solver, penalty, dual)
431 "Specify a solver to silence this warning.",
--> 432 FutureWarning)
433
TypeError: issubclass() arg 2 must be a class or tuple of classes
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-13-925cae965240> in <module>
6 param_grid,
7 cv=5)
----> 8 cv.fit(X_train1, y_train1)
9 print('Best Penalty:', cv.best_estimator_.get_params()['penalty'])
10 print('Best C:', cv.best_estimator_.get_params()['C'])
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self,
X, y, groups, **fit_params)
685 return results
686
--> 687 self._run_search(evaluate_candidates)
688
689 # For multi-metric evaluation, store the best_index_,
best_params_ and
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in
_run_search(self, evaluate_candidates)
1146 def _run_search(self, evaluate_candidates):
1147 """Search all candidates in param_grid"""
-> 1148 evaluate_candidates(ParameterGrid(self.param_grid))
1149
1150
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in
evaluate_candidates(candidate_params)
664 for parameters, (train, test)
665 in product(candidate_params,
--> 666 cv.split(X, y, groups)))
667
668 if len(out) < 1:
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self,
iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in
apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self,
batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in
_fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
fit_params, return_train_score, return_parameters, return_n_test_samples,
return_times, return_estimator, error_score)
526 "raised or error_score=np.nan to adopt
the "
527 "behavior from version 0.22.",
--> 528 FutureWarning)
529 raise
530 elif isinstance(error_score, numbers.Number):
TypeError: issubclass() arg 2 must be a class or tuple of classes
UPDATE: I just ran this code without SMOTE, which I thought was the likely culprit, and it turns out the issue lies somewhere in scikit-learn.
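One hedged observation: both raising frames in the traceback are plain warnings.warn(..., FutureWarning) calls, which points at a broken or shadowed warnings setup rather than SMOTE; cleanly reinstalling scikit-learn is the likely cure. As a stopgap, passing the solver explicitly avoids the _check_solver FutureWarning path that triggers first. 'liblinear' is assumed here because it supports both penalties in the grid ('lbfgs', the newer default, is 'l2'-only):
# Hedged stopgap sketch: an explicit solver silences the FutureWarning whose
# issubclass() check is blowing up; 'liblinear' supports 'l1' and 'l2'.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.001, 0.01, 0.1, 1, 10, 100]}
clf = GridSearchCV(LogisticRegression(solver='liblinear', random_state=123),
                   param_grid, cv=5)
best = clf.fit(balanced_X1, balanced_y1.values.ravel())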
I'm trying to run sklearn's ColumnTransformer over individual Pipelines. My dataframe is called listings_prepared. All features are float or int. The dataframe is clean; the only issue is a few missing values marked NaN, so SimpleImputer should handle it...
Sklearn provides documentation here on running pipelines through ColumnTransformer, which is what I've followed.
First, I create the pipelines using Pipeline:
num_pipeline = Pipeline([
    ('num_imputer', SimpleImputer(strategy='median')),
    ('num_scaler', StandardScaler()),
])
disc_pipeline = Pipeline([
    ('disc_imputer', SimpleImputer(strategy='most_frequent')),
    ('disc_scaler', StandardScaler(), disc_attribs),
])
cat_pipeline = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('cat_ohe', OneHotEncoder(categories='auto', drop='first', sparse=False)),
])
amen_pipeline = Pipeline([
    ('amen_imputer', SimpleImputer(strategy='most_frequent')),
])
Then, I run them through ColumnTransformer:
listings_pipeline = ColumnTransformer([
('num', num_pipeline, num_attribs),
('disc', disc_pipeline, disc_attribs),
('cat', cat_pipeline, cat_attribs),
('amen', amen_pipeline, amen_attribs),
])
X_train = listings_pipeline.fit_transform(listings_explore)
Here's the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-511-9d5100fe0f5d> in <module>
12 ('amen', amen_pipeline, amen_attribs),
13 ])
---> 14 X_train = listings_pipeline.fit_transform(listings_explore_pipeline)
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
466 self._validate_remainder(X)
467
--> 468 result = self._fit_transform(X, y, _fit_transform_one)
469
470 if not result:
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
410 message=self._log_message(name, idx, len(transformers)))
411 for idx, (name, trans, column, weight) in enumerate(
--> 412 self._iter(fitted=fitted, replace_strings=True), 1))
413 except ValueError as e:
414 if "Expected 2D array, got 1D array instead" in str(e):
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
922 self._iterating = self._original_iterator is not None
923
--> 924 while self.dispatch_one_batch(iterator):
925 pass
926
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 """
386 last_step = self._final_estimator
--> 387 Xt, fit_params = self._fit(X, y, **fit_params)
388 with _print_elapsed_time('Pipeline',
389 self._log_message(len(self.steps) - 1)):
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
270 fit_transform_one_cached = memory.cache(_fit_transform_one)
271
--> 272 fit_params_steps = {name: {} for name, step in self.steps
273 if step is not None}
274 for pname, pval in fit_params.items():
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in <dictcomp>(.0)
270 fit_transform_one_cached = memory.cache(_fit_transform_one)
271
--> 272 fit_params_steps = {name: {} for name, step in self.steps
273 if step is not None}
274 for pname, pval in fit_params.items():
ValueError: too many values to unpack (expected 2)
Why isn't this working?
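One hedged reading of the error: Pipeline steps must be (name, transformer) 2-tuples, and disc_pipeline above passes a 3-tuple, ('disc_scaler', StandardScaler(), disc_attribs), which is exactly what the 'too many values to unpack (expected 2)' in the <dictcomp> over self.steps complains about. Column selection belongs in the ColumnTransformer entry, not the Pipeline step; a corrected sketch:
# Sketch: each Pipeline step is a (name, transformer) pair; the column list
# (disc_attribs) is already supplied via the ColumnTransformer entry.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

disc_pipeline = Pipeline([
    ('disc_imputer', SimpleImputer(strategy='most_frequent')),
    ('disc_scaler', StandardScaler()),  # no third element here
])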
I'm new to PyTorch. I'm trying to do cross-validation, and I found the skorch library, which allows users to use sklearn functions with a torch model. So I define a neural network class:
import torch
import torch.nn as nn

torch.manual_seed(42)

class Netcross(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(5, 30)
        self.sig1 = nn.Tanh()
        #self.dout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(30, 30)
        self.sig2 = nn.Sigmoid()
        self.out = nn.Linear(30, 1)
        self.out_act = nn.Sigmoid()
        #self.fc1.weight = torch.nn.Parameter(torch.rand(50,5))

    def forward(self, x):
        x = self.fc1(x)
        x = self.sig1(x)
        #x = self.dout(x)
        x = self.fc2(x)
        x = self.sig2(x)
        x = self.out(x)
        y = self.out_act(x)
        return y
from skorch import NeuralNet
from torch.autograd import Variable

crossnet1 = NeuralNet(
    Netcross,
    max_epochs=5,
    criterion=torch.nn.BCELoss,
    # user-defined callbacks
    callbacks=[epoch_acc, epoch_f1, epoch_phi],
    optimizer=torch.optim.SGD,
    optimizer__momentum=0.9,
    lr=0.85,
)

inputs = Variable(x_traintensor)
labels = Variable(y_traintensor)
crossnet1.fit(inputs, labels)
So far everything is fine: the function returns credible results without any errors. The problem appears when I try to use the GridSearchCV function:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_epochs': [5, 10, 20],
              'lr': [0.1, 0.65, 0.8]}

gs = GridSearchCV(estimator=crossnet1, param_grid=param_grid, refit=False,
                  cv=3, scoring="accuracy")
gs.fit(inputs, labels)
I get the following error:
TypeError Traceback (most recent call last)
<ipython-input-41-e1f3dbd9a2b0> in <module>
3 labels1 = torch.from_numpy(np.array(labels))
4
----> 5 gs.fit(inputs1, labels1)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
516 start_time = time.time()
517
--> 518 X_train, y_train = _safe_split(estimator, X, y, train)
519 X_test, y_test = _safe_split(estimator, X, y, test, train)
520
~\Anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
201 X_subset = X[np.ix_(indices, train_indices)]
202 else:
--> 203 X_subset = safe_indexing(X, indices)
204
205 if y is not None:
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices)
214 indices.dtype.kind == 'i'):
215 # This is often substantially faster than X[indices]
--> 216 return X.take(indices, axis=0)
217 else:
218 return X[indices]
TypeError: take(): argument 'index' (position 1) must be Tensor, not numpy.ndarray
What is wrong?
Change your input and labels to np.ndarray (see examples here).
Those will be cast to torch.Tensor automatically by skorch when needed.
All in all, change your
inputs = Variable(x_traintensor)
labels = Variable(y_traintensor)
to:
inputs = x_traintensor.numpy() # assuming x is torch.Tensor
labels = y_traintensor.numpy() # assuming y is torch.Tensor
BTW, torch.autograd.Variable is deprecated; use torch.tensor(data, requires_grad=True) instead. In this case inputs and labels do not need gradients, so Variable is even more out of place.
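With the numpy inputs in place, the grid search from the question should then index cleanly; a short usage sketch reusing the names defined above:
# Usage sketch: inputs/labels are now numpy arrays, so sklearn's indexing
# during CV splitting works; skorch converts them back to tensors internally.
from sklearn.model_selection import GridSearchCV

param_grid = {'max_epochs': [5, 10, 20], 'lr': [0.1, 0.65, 0.8]}
gs = GridSearchCV(estimator=crossnet1, param_grid=param_grid,
                  refit=False, cv=3, scoring="accuracy")
gs.fit(inputs, labels)
print(gs.best_params_)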
I am using a grid search to find the best hyperparameters of a classifier, as described here: http://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html
Here is how a piece of the code looks:
X = X.values  # convert from pandas DataFrame to numpy array
y = np.array(y)
n_samples, n_features = X.shape
n_outputs = y.shape[0]

inner_cv = cross_validation.StratifiedKFold(y, n_folds=4, shuffle=True,
                                            random_state=rnd)
outer_cv = cross_validation.StratifiedKFold(y, n_folds=kFold, shuffle=True,
                                            random_state=rnd)

# Non-nested parameter search and scoring
clf = GridSearchCV(estimator=pipeline, param_grid=param_dict,
                   scoring=scores, cv=inner_cv)

# Nested CV with parameter optimization
nested_score = cross_validation.cross_val_score(clf, X=X, y=y, cv=outer_cv)
nested_score.fit(X, y)
nested_scores = nested_score.mean()
However, for some reason I am getting this error:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-1-cad4e848fb54> in <module>()
124
125 # Nested CV with parameter optimization
--> 126 nested_score = cross_validation.cross_val_score(clf, X=X, y=y, cv=outer_cv)
127 nested_score.fit(X,y)
128 nested_scores = nested_score.mean()
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\cross_validation.py in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
1431 train, test, verbose, None,
1432 fit_params)
-> 1433 for train, test in cv)
1434 return np.array(scores)[:, 0]
1435
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1529 estimator.fit(X_train, **fit_params)
1530 else:
-> 1531 estimator.fit(X_train, y_train, **fit_params)
1532
1533 except Exception as e:
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\grid_search.py in fit(self, X, y)
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
805
806
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\grid_search.py in _fit(self, X, y, parameter_iterable)
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
554 for train, test in cv)
555
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1522 start_time = time.time()
1523
-> 1524 X_train, y_train = _safe_split(estimator, X, y, train)
1525 X_test, y_test = _safe_split(estimator, X, y, test, train)
1526
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\cross_validation.py in _safe_split(estimator, X, y, indices, train_indices)
1589 X_subset = X[np.ix_(indices, train_indices)]
1590 else:
-> 1591 X_subset = safe_indexing(X, indices)
1592
1593 if y is not None:
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices)
161 indices.dtype.kind == 'i'):
162 # This is often substantially faster than X[indices]
--> 163 return X.take(indices, axis=0)
164 else:
165 return X[indices]
IndexError: index 4549 is out of bounds for size 4549
X and y have the following dimensions:
X: (6066, 5)
y: (6066,)
Everything looks normal. Where is the problem originating from?
Thanks for sharing your opinion.
Not sure what you are trying to do here, but two things stand out. GridSearchCV is not a plain classifier: it runs cross-validation multiple times using different parameters, so it represents multiple classifiers, and only after fitting does it expose the winner via its best_estimator_ attribute. Also, cross_val_score returns a numpy array of scores, not a fitted model, so the nested_score.fit(X, y) call in your code cannot work.
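For what it's worth, nested CV itself is exactly what the linked example does; the IndexError suggests the old pre-bound cross_validation.StratifiedKFold(y, ...) iterators are the problem: they yield indices for the full 6066-row y, which GridSearchCV then applies to an outer training subset of only ~4549 rows. A hedged sketch with the newer model_selection splitters, which generate folds per fit call (reusing the question's pipeline, param_dict, scores, rnd and kFold names):
# Hedged sketch: model_selection splitters are not pre-bound to y, so the
# inner CV is computed on whatever subset GridSearchCV actually receives.
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=rnd)
outer_cv = StratifiedKFold(n_splits=kFold, shuffle=True, random_state=rnd)

clf = GridSearchCV(estimator=pipeline, param_grid=param_dict,
                   scoring=scores, cv=inner_cv)
nested_score = cross_val_score(clf, X=X, y=y, cv=outer_cv)  # array of scores
nested_scores = nested_score.mean()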