GridSearchCV parameters - python

I'm trying to use GridSearchCV with KMeans clustering to explore the optimal number of clusters to use in order to get the best results on a classification problem.
I've got the following code:
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

faces = fetch_olivetti_faces()
X_data, y_data = faces.data, faces.target

log_reg = LogisticRegression()

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(X_data, y_data):
    X_train_set, y_train_set = X_data[train_index], y_data[train_index]
    X_test_set, y_test_set = X_data[test_index], y_data[test_index]

pipeline = Pipeline([
    ('kmeans', KMeans(n_clusters=30)),
    ('log_reg', LogisticRegression())
])

cluster_grid = dict(n_clusters=range(2, 100))
grid = GridSearchCV(pipeline, cluster_grid)
grid.fit(X_train_set, y_train_set, cv=5, verbose=2)
Here's the entire traceback:
-------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-42-80e6a3932897> in <module>
----> 1 grid.fit(X_train_set, y_train_set, cv=5, verbose=2)
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
686 return results
687
--> 688 self._run_search(evaluate_candidates)
689
690 # For multi-metric evaluation, store the best_index_, best_params_ and
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1147 def _run_search(self, evaluate_candidates):
1148 """Search all candidates in param_grid"""
-> 1149 evaluate_candidates(ParameterGrid(self.param_grid))
1150
1151
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
665 for parameters, (train, test)
666 in product(candidate_params,
--> 667 cv.split(X, y, groups)))
668
669 if len(out) < 1:
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
501 train_scores = {}
502 if parameters is not None:
--> 503 estimator.set_params(**parameters)
504
505 start_time = time.time()
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in set_params(self, **kwargs)
162 self
163 """
--> 164 self._set_params('steps', **kwargs)
165 return self
166
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in _set_params(self, attr, **params)
48 self._replace_estimator(attr, name, params.pop(name))
49 # 3. Step parameters and other initialisation arguments
---> 50 super().set_params(**params)
51 return self
52
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/base.py in set_params(self, **params)
222 'Check the list of available parameters '
223 'with `estimator.get_params().keys()`.' %
--> 224 (key, self))
225
226 if delim:
ValueError: Invalid parameter n_clusters for estimator Pipeline(memory=None,
steps=[('kmeans',
KMeans(algorithm='auto', copy_x=True, init='k-means++',
max_iter=300, n_clusters=30, n_init=10, n_jobs=None,
precompute_distances='auto', random_state=None,
tol=0.0001, verbose=0)),
('log_reg',
LogisticRegression(C=1.0, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None,
penalty='l2', random_state=None,
solver='warn', tol=0.0001, verbose=0,
warm_start=False))],
verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.
I have no idea what the heck is going on. I'm not sure how to interpret this error message, but my parameter grid doesn't seem to be out of whack. PLEASE HELP!

When you use a pipeline, you need to prefix each parameter with the name of the step it belongs to, like this:
cluster_grid = {
    'kmeans__n_clusters': range(2, 100)
}
# adding n_jobs to run in parallel
grid = GridSearchCV(pipeline, cluster_grid, n_jobs=-1)
where kmeans is the step name taken from ('kmeans', KMeans()).
So your code should look like the following:
pipeline = Pipeline([
    ('kmeans', KMeans()),
    ('log_reg', LogisticRegression())
])

cluster_grid = {
    'kmeans__n_clusters': range(2, 100)
}

# adding n_jobs to run in parallel
grid = GridSearchCV(pipeline, cluster_grid, n_jobs=-1)
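If you are unsure which parameter names the pipeline accepts, the error message itself points at the check; a quick sketch:
# Listing the pipeline's parameters shows the step-prefixed names that
# GridSearchCV accepts, e.g. 'kmeans__n_clusters' and 'log_reg__C'.
for key in pipeline.get_params().keys():
    print(key)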

The parameter n_clusters only applies to KMeans, not to LogisticRegression.
Specify in your cluster_grid that the grid parameter is meant for the kmeans step:
# Parameters of pipelines can be set using ‘__’ separated parameter names:
cluster_grid = dict(kmeans__n_clusters=range(2,100))
Reference : https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html

Related

score() missing 1 required positional argument: 'y_true' (scikit-learn, cross_validate)

I am trying to run a simple cross-validation with scikit-learn's cross_validate, and I get the following TypeError:
TypeError Traceback (most recent call last)
<ipython-input-59-0471fb78d8f0> in <module>
5
6 model = NMF(n_components=185, init='random', random_state=0)
----> 7 scores = cross_validate(model, df4_array, cv=5, scoring=('neg_mean_squared_error'))
8 W = model.fit_transform(df4_array)
9 H = model.components_
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
234 return_times=True, return_estimator=return_estimator,
235 error_score=error_score)
--> 236 for train, test in cv.split(X, y, groups))
237
238 zipped_scores = list(zip(*scores))
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
542 else:
543 fit_time = time.time() - start_time
--> 544 test_scores = _score(estimator, X_test, y_test, scorer)
545 score_time = time.time() - start_time - fit_time
546 if return_train_score:
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
587 scorer = _MultimetricScorer(**scorer)
588 if y_test is None:
--> 589 scores = scorer(estimator, X_test)
590 else:
591 scores = scorer(estimator, X_test, y_test)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
85 if isinstance(scorer, _BaseScorer):
86 score = scorer._score(cached_call, estimator,
---> 87 *args, **kwargs)
88 else:
89 score = scorer(estimator, *args, **kwargs)
TypeError: _score() missing 1 required positional argument: 'y_true'
I do not know why. I am building a recommender system for the university using non-negative matrix factorization, which is an unsupervised method... shouldn't the code work without y?
Code:
from sklearn.decomposition import NMF
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
model = NMF(n_components=185, init='random', random_state=0)
scores = cross_validate(model, df4_array, cv=5, scoring=('neg_mean_squared_error'))
W = model.fit_transform(df4_array)
H = model.components_
So I think the problem is with the scoring method.
Any form of MSE (mean squared error) is a metric for regression-type problems, because the formula involves the true values of a dependent variable (y).
I would suggest looking through this link for scoring methods that suit unsupervised models:
https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
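As a minimal sketch of one alternative, assuming df4_array is the non-negative 2-D array from the question: since NMF is unsupervised, you can score each fold by reconstruction error yourself instead of passing a supervised scorer to cross_validate:
import numpy as np
from sklearn.decomposition import NMF
from sklearn.model_selection import KFold

# Score each fold by how well the factorization reconstructs held-out rows.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
fold_errors = []
for train_idx, test_idx in kf.split(df4_array):
    model = NMF(n_components=185, init='random', random_state=0)
    model.fit(df4_array[train_idx])
    W_test = model.transform(df4_array[test_idx])  # encode held-out rows
    reconstruction = W_test @ model.components_    # decode them back
    fold_errors.append(np.mean((df4_array[test_idx] - reconstruction) ** 2))
print(np.mean(fold_errors))  # mean reconstruction MSE across folds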
Hope this helped.
May the force be with you.

Code works sometimes, but sometimes I get TypeError: issubclass() when using imblearn's SMOTE

I'm trying to implement the code from here. The code was working fine, then stopped; I restarted, updated, etc. I keep getting
TypeError: issubclass() arg 2 must be a class or tuple of classes
smote = SMOTE(random_state=45)
X_train1, X_test1, y_train1, y_test1 = train_test_split(Xall, yall,
                                                        test_size=0.3,
                                                        random_state=123)
# fit smote on training data
balanced_X1, balanced_y1 = smote.fit_sample(X_train1, y_train1)

# smote outputs numpy array therefore transformed to df
balanced_X1 = pd.DataFrame(data=balanced_X1, columns=X_train1.columns)
balanced_y1 = pd.DataFrame(data=balanced_y1, columns=['y'])

# hypertuning parameters; create hyperparameter grid and fit
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.001, 0.01, 0.1, 1, 10, 100]}
clf = GridSearchCV(LogisticRegression(random_state=123),
                   param_grid,
                   cv=5)
best = clf.fit(balanced_X1, balanced_y1)
print('Best Penalty:', best.best_estimator_.get_params()['penalty'])
print('Best C:', best.best_estimator_.get_params()['C'])
I am just trying to run a grid search on said code but can't get past this error. PLEASE help
error message in full:
TypeError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
513 else:
--> 514 estimator.fit(X_train, y_train, **fit_params)
515
~\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
1492 """
-> 1493 solver = _check_solver(self.solver, self.penalty, self.dual)
1494
~\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in _check_solver(solver, penalty, dual)
431 "Specify a solver to silence this warning.",
--> 432 FutureWarning)
433
TypeError: issubclass() arg 2 must be a class or tuple of classes
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-13-925cae965240> in <module>
6 param_grid,
7 cv=5)
----> 8 cv.fit(X_train1, y_train1)
9 print('Best Penalty:', cv.best_estimator_.get_params()['penalty'])
10 print('Best C:', cv.best_estimator_.get_params()['C'])
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
685 return results
686
--> 687 self._run_search(evaluate_candidates)
688
689 # For multi-metric evaluation, store the best_index_, best_params_ and
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1146 def _run_search(self, evaluate_candidates):
1147 """Search all candidates in param_grid"""
-> 1148 evaluate_candidates(ParameterGrid(self.param_grid))
1149
1150
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
664 for parameters, (train, test)
665 in product(candidate_params,
--> 666 cv.split(X, y, groups)))
667
668 if len(out) < 1:
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
526 "raised or error_score=np.nan to adopt the "
527 "behavior from version 0.22.",
--> 528 FutureWarning)
529 raise
530 elif isinstance(error_score, numbers.Number):
TypeError: issubclass() arg 2 must be a class or tuple of classes
UPDATE: I just ran this code without SMOTE, which I thought was the likely culprit, and it turns out that the issue lies somewhere in scikit-learn.
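Reading the traceback, one hedged theory: the TypeError is raised while sklearn is merely trying to issue a FutureWarning, and Python's warnings machinery calls issubclass() against the category of every registered filter. That usually means some warnings filter was registered with a string instead of a Warning class, e.g. warnings.filterwarnings('ignore', category='FutureWarning'). A quick way to test this:
import warnings

# Clear all (possibly malformed) warning filters, then retry the grid search.
# If the fit now runs, hunt down the filterwarnings() call that passed a
# string as the category.
warnings.resetwarnings()
best = clf.fit(balanced_X1, balanced_y1)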

TypeError: take(): argument 'index' (position 1) must be Tensor, not numpy.ndarray

I'm new to pytorch. I'm trying to do cross-validation, and I found the skorch library, which allows users to use sklearn functions with a torch model. So I define a neural network class:
import torch
import torch.nn as nn
from torch.autograd import Variable
from skorch import NeuralNet

torch.manual_seed(42)

class Netcross(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(5, 30)
        self.sig1 = nn.Tanh()
        #self.dout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(30, 30)
        self.sig2 = nn.Sigmoid()
        self.out = nn.Linear(30, 1)
        self.out_act = nn.Sigmoid()
        #self.fc1.weight = torch.nn.Parameter(torch.rand(50, 5))

    def forward(self, x):
        x = self.fc1(x)
        x = self.sig1(x)
        #x = self.dout(x)
        x = self.fc2(x)
        x = self.sig2(x)
        x = self.out(x)
        y = self.out_act(x)
        return y

crossnet1 = NeuralNet(
    Netcross,
    max_epochs=5,
    criterion=torch.nn.BCELoss,
    # user-defined callbacks
    callbacks=[epoch_acc, epoch_f1, epoch_phi],
    optimizer=torch.optim.SGD,
    optimizer__momentum=0.9,
    lr=0.85,
)

inputs = Variable(x_traintensor)
labels = Variable(y_traintensor)

crossnet1.fit(inputs, labels)
So far everything is fine: the fit returns credible results without any errors. The problem appears when I try to use the GridSearchCV function:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_epochs': [5, 10, 20],
              'lr': [0.1, 0.65, 0.8]}

gs = GridSearchCV(estimator=crossnet1, param_grid=param_grid, refit=False,
                  cv=3, scoring="accuracy")
gs.fit(inputs, labels)
I get the following error:
TypeError Traceback (most recent call last)
<ipython-input-41-e1f3dbd9a2b0> in <module>
3 labels1 = torch.from_numpy(np.array(labels))
4
----> 5 gs.fit(inputs1, labels1)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
516 start_time = time.time()
517
--> 518 X_train, y_train = _safe_split(estimator, X, y, train)
519 X_test, y_test = _safe_split(estimator, X, y, test, train)
520
~\Anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
201 X_subset = X[np.ix_(indices, train_indices)]
202 else:
--> 203 X_subset = safe_indexing(X, indices)
204
205 if y is not None:
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices)
214 indices.dtype.kind == 'i'):
215 # This is often substantially faster than X[indices]
--> 216 return X.take(indices, axis=0)
217 else:
218 return X[indices]
TypeError: take(): argument 'index' (position 1) must be Tensor, not numpy.ndarray
What is wrong?
Change your inputs and labels to np.ndarray (see the examples here).
Those will be cast to torch.Tensor automatically by skorch when needed.
All in all, change your
inputs = Variable(x_traintensor)
labels = Variable(y_traintensor)
to:
inputs = x_traintensor.numpy() # assuming x is torch.Tensor
labels = y_traintensor.numpy() # assuming y is torch.Tensor
BTW, torch.autograd.Variable is deprecated; you should use a plain torch.Tensor (created with requires_grad=True when you need gradients). In this case, inputs and labels do not need gradients, so Variable is even more out of place.
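Putting it together, a minimal sketch of the fixed call, assuming x_traintensor and y_traintensor are the tensors from the question: sklearn's splitters index numpy arrays without trouble, and skorch converts each batch back to tensors internally, so the grid search no longer hits Tensor.take():
import numpy as np

inputs = x_traintensor.numpy().astype(np.float32)
# BCELoss expects float targets of shape (n, 1); this reshape is an assumption
# about the label tensor's original shape.
labels = y_traintensor.numpy().astype(np.float32).reshape(-1, 1)

gs = GridSearchCV(estimator=crossnet1, param_grid=param_grid, refit=False,
                  cv=3, scoring="accuracy")
gs.fit(inputs, labels)
print(gs.best_params_)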

How to use numeric indices in cross-validation with pd.DataFrame in scikit-learn (disable _safe_split)?

I want to disable safe_indexing and force the indices that I've given my model to be used as-is.
I can't simply do X.values and y.values because I have a custom classifier where the column/attribute labels are used during __init__ (they are crucial for the algorithm).
This is from the following line of code:
model_selection.cross_val_score(model, X=X, y=y, cv=cv, n_jobs=1, scoring="accuracy")
where cv is a list of lists with numeric indices
X has to be a pd.DataFrame and cv has to be predefined indices. How can I make this work?
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-74-e1775ca32abb> in <module>()
1 smc.fit(X,y)
----> 2 smc.cross_validate(X,y,cv=cv, n_jobs=1)
<ipython-input-72-61f814fd075c> in cross_validate(self, X, y, cv, scoring, n_jobs, **args)
150 cv_idx.append((idx_tr.map(lambda x:X.index.get_loc(x)), idx_te.map(lambda x:X.index.get_loc(x))))
151 cv = cv_idx
--> 152 return model_selection.cross_val_score(self, X=X, y=y, cv=cv, n_jobs=n_jobs, scoring=scoring, **args)
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
340 n_jobs=n_jobs, verbose=verbose,
341 fit_params=fit_params,
--> 342 pre_dispatch=pre_dispatch)
343 return cv_results['test_score']
344
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
204 fit_params, return_train_score=return_train_score,
205 return_times=True)
--> 206 for train, test in cv.split(X, y, groups))
207
208 if return_train_score:
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
446 start_time = time.time()
447
--> 448 X_train, y_train = _safe_split(estimator, X, y, train)
449 X_test, y_test = _safe_split(estimator, X, y, test, train)
450
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
198 X_subset = X[np.ix_(indices, train_indices)]
199 else:
--> 200 X_subset = safe_indexing(X, indices)
201
202 if y is not None:
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/utils/__init__.py in safe_indexing(X, indices)
144 if hasattr(X, "iloc"):
145 # Work-around for indexing with read-only indices in pandas
--> 146 indices = indices if indices.flags.writeable else indices.copy()
147 # Pandas Dataframes and Series
148 try:
AttributeError: 'list' object has no attribute 'flags'
In response to the suggestions in the comments (2018-June-04):
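A hedged workaround, based on the final AttributeError in the traceback: each fold in cv is a plain Python list, and safe_indexing checks indices.flags.writeable, an attribute that only numpy arrays have. Converting the predefined folds to integer arrays keeps X as a pd.DataFrame, since safe_indexing takes the pandas .iloc path:
import numpy as np

# Assumes cv is the predefined list of (train, test) index lists from the
# question; wrapping each fold in np.asarray satisfies safe_indexing.
cv_arrays = [(np.asarray(train), np.asarray(test)) for train, test in cv]
scores = model_selection.cross_val_score(model, X=X, y=y, cv=cv_arrays,
                                         n_jobs=1, scoring="accuracy")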

ValueError in pipeline - FeatureHasher not working?

I think I'm having issues getting my vectorizer working within a grid-search pipeline.
Data as pandas df x_train:
        bathrooms  bedrooms  price                       building_id                        manager_id
10            1.5         3   3000  53a5b119ba8f7b61d4e010512e0dfc85  5ba989232d0489da1b5f2c45f6688adc
10000         1.0         2   5465  c5c8a357cba207596b04d1afd1e4f130  7533621a882f71e25173b27e3139d83d
100004        1.0         1   2850  c3ba40552e2120b0acfc3cb5730bb2aa  d9039c43983f6e564b1482b273bd7b01
100007        1.0         1   3275  28d9ad350afeaab8027513a3e52ac8d5  1067e078446a7897d2da493d2f741316
100013        1.0         4   3350                                 0  98e13ad4b495b9613cef886d79a6291f
numeric_predictors = ['bathrooms', 'bedrooms', 'price']
categorical_predictors = ['building_id', 'manager_id']
MinMaxScaler fit & transform:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler

class MyScaler(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        self.scaler = MinMaxScaler()
        self.scaler.fit(X[self.cols])
        return self

    def transform(self, X):
        return self.scaler.transform(X[self.cols])
My categorical feature hashing vectorizer:
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import HashingVectorizer

class MyVectorizer(BaseEstimator, TransformerMixin):
    """
    Vectorize a set of categorical variables
    """

    def __init__(self, cols, hashing=None):
        """
        args:
            cols: a list of column names of the categorical variables
            hashing:
                If None, then vectorization is a simple one-hot-encoding.
                If an integer, then hashing is the number of features in the output.
        """
        self.cols = cols
        self.hashing = hashing

    def fit(self, X, y=None):
        data = X[self.cols]
        # Choose a vectorizer
        if self.hashing is None:
            self.myvec = HashingVectorizer()
        else:
            self.myvec = FeatureHasher(n_features=self.hashing)
        self.myvec.fit(X[self.cols].to_dict(orient='records'))
        return self

    def transform(self, X):
        # Vectorize input
        if self.hashing is None:
            return pd.DataFrame(
                self.myvec.transform(X[self.cols].to_dict(orient='records')),
                columns=self.myvec.feature_names_
            )
        else:
            return pd.DataFrame(
                self.myvec.transform(X[self.cols].to_dict(orient='records')).toarray()
            )
GridSearch hyperparameters:
search_params = {
    'preprocess__vectorize__hashing': [20, 40, 80],
    'predict__alpha': [.01, .1, 1, 2, 10]
}
pipeline:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('preprocess', FeatureUnion([
        ('scale', MyScaler(cols=numeric_predictors)),
        ('vectorize', MyVectorizer(cols=categorical_predictors, hashing=5))
    ])),
    ('predict', MultinomialNB())
])
And last, calling this with the GridSearchCV classifier:
grid_search = GridSearchCV(pipeline, search_params)
grid_search.fit(x_train, y_train)
I get a ValueError: Input X must be non-negative. I checked, and my numeric_predictors columns' data are all non-negative, so I am narrowing it down to an issue with the hashing of the categorical predictors.
ValueError Traceback (most recent call last)
<ipython-input-62-50522376d1e5> in <module>()
1 grid_search = GridSearchCV(pipeline, search_params)
----> 2 grid_search.fit(x_train, y_train)
3 grid_search.best_params_
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.pyc in fit(self, X, y, groups, **fit_params)
636 error_score=self.error_score)
637 for parameters, (train, test) in product(candidate_params,
--> 638 cv.split(X, y, groups)))
639
640 # if one choose to see train score, "out" will contain train score info
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
435 estimator.fit(X_train, **fit_params)
436 else:
--> 437 estimator.fit(X_train, y_train, **fit_params)
438
439 except Exception as e:
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit(self, X, y, **fit_params)
257 Xt, fit_params = self._fit(X, y, **fit_params)
258 if self._final_estimator is not None:
--> 259 self._final_estimator.fit(Xt, y, **fit_params)
260 return self
261
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in fit(self, X, y, sample_weight)
602 self.feature_count_ = np.zeros((n_effective_classes, n_features),
603 dtype=np.float64)
--> 604 self._count(X, Y)
605 alpha = self._check_alpha()
606 self._update_feature_log_prob(alpha)
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in _count(self, X, Y)
706 """Count and smooth feature occurrences."""
707 if np.any((X.data if issparse(X) else X) < 0):
--> 708 raise ValueError("Input X must be non-negative")
709 self.feature_count_ += safe_sparse_dot(Y.T, X)
710 self.class_count_ += Y.sum(axis=0)
ValueError: Input X must be non-negative
Yes, when hashing is not None, FeatureHasher() is chosen, and it can output negative values.
But you can convert those negative values to positive by using the non_negative parameter of FeatureHasher, as given in the documentation:
non_negative : boolean, optional, default False
When True, an absolute value is applied to the features matrix prior
to returning it. When used in conjunction with
alternate_sign=True, this significantly reduces the inner product
preservation property.
So change this line in MyVectorizer:
self.myvec = FeatureHasher(n_features = self.hashing)
to this:
self.myvec = FeatureHasher(n_features = self.hashing, non_negative=True)
Note:
The non_negative parameter has been deprecated since version 0.19 and will be removed in 0.21.
You should check how this parameter will affect your classification problem.
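Since non_negative is on its way out, a hedged alternative on scikit-learn 0.19 and later is to disable the alternating-sign trick instead, which also keeps the hashed counts non-negative:
# Assumes scikit-learn >= 0.19, where FeatureHasher gained the alternate_sign
# flag; with it turned off, hashed feature values stay non-negative, so
# MultinomialNB accepts them without the deprecated non_negative workaround.
self.myvec = FeatureHasher(n_features=self.hashing, alternate_sign=False)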
