IndexError: positional indexers are out-of-bounds - RandomizedSearchCV() - Random Forest - python

I am trying to build a random forest model using a walk-forward validation approach.
I use TimeBasedCV() to split my data accordingly.
My code looks like this:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 10]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from random import randint, uniform

tscv = TimeBasedCV(train_period=60, test_period=12, freq='months')
index_output = tscv.split(X_train, date_column='Date')

rf = RandomForestRegressor()

model = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    n_jobs=-1,
    cv=index_output,
    verbose=5,
    random_state=42,
    return_train_score=True)

model.fit(X_train.drop('Date', axis=1), y_train)
model.cv_results_
The error message for my model.fit call is
IndexError: positional indexers are out-of-bounds
Do I have to adjust my randomized search, or is this error due to a problem in my data?
IndexError Traceback (most recent call last)
<ipython-input-71-eebc6186b2c3> in <module>
18 return_train_score = True)
19
---> 20 model.fit(X_train,y_train)
21 model.cv_results_
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
708 return results
709
--> 710 self._run_search(evaluate_candidates)
711
712 # For multi-metric evaluation, store the best_index_, best_params_ and
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1482 evaluate_candidates(ParameterSampler(
1483 self.param_distributions, self.n_iter,
-> 1484 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
687 for parameters, (train, test)
688 in product(candidate_params,
--> 689 cv.split(X, y, groups)))
690
691 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
IndexError: positional indexers are out-of-bounds
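One way to narrow this down: scikit-learn consumes the indices yielded by a cv iterable positionally (iloc-style), so every index must be smaller than len(X) for the X actually passed to fit (here X_train.drop('Date', axis=1)). A quick sanity check, assuming index_output is a list of (train, test) index arrays:

X_fit = X_train.drop('Date', axis=1)
for fold, (train_idx, test_idx) in enumerate(index_output):
    # sklearn uses these indices positionally, so each must be < len(X_fit)
    assert max(train_idx) < len(X_fit), f'fold {fold}: train indices out of bounds'
    assert max(test_idx) < len(X_fit), f'fold {fold}: test indices out of bounds'

If an assertion fires, a common cause is a splitter that returns the DataFrame's index labels rather than positions; resetting the index before splitting (X_train = X_train.reset_index(drop=True)) aligns labels with positions.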

Related

Rapids CUML Random Forest Regression Model doesn't return the scoring function value

I used the code below to tune the model and search for the best combination using the grid search algorithm. However, the model does not evaluate and return the score value of each combination.
I used the same approach with the SVM regressor model, and it returned the score during training.
Here is the code:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
param_grid2 = {
    'bootstrap': [True, False],
    'max_depth': [16, 30, 50, 80, 90, 100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_leaf': [1, 3, 4, 5],
    'min_samples_split': [2, 4, 8, 10, 12],
    'n_estimators': [100, 200, 300, 400, 500]
}
from cuml.ensemble import RandomForestRegressor
# Create a base model
rfr_cu = RandomForestRegressor()  # remove the output_type. Check this link for more details: https://docs.rapids.ai/api/cuml/stable/api.html#output-data-type-configuration
# Instantiate the grid search model
rfr_tune_cu = GridSearchCV(estimator=rfr_cu, param_grid=param_grid2, scoring='neg_mean_squared_error',
                           cv=3, n_jobs=-1, verbose=3, error_score='raise', return_train_score=True)
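For reference, once a fit succeeds, the per-combination scores live in cv_results_ (and verbose=3 prints a score per fit). A short sketch of how to read them, assuming the names above:

rfr_tune_cu.fit(cu_X_train, cu_y_train)

import pandas as pd
results = pd.DataFrame(rfr_tune_cu.cv_results_)
print(results[['params', 'mean_test_score', 'std_test_score']])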
The output message
RuntimeError Traceback (most recent call last)
Input In [80], in <cell line: 2>()
1 # Fit the grid search to the data
----> 2 rfr_tune_cu.fit(cu_X_train, cu_y_train)
File /usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_search.py:875, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
869 results = self._format_results(
870 all_candidate_params, n_splits, all_out, all_more_results
871 )
873 return results
--> 875 self._run_search(evaluate_candidates)
877 # multimetric is determined here because in the case of a callable
878 # self.scoring the return type is only known after calling
879 first_test_score = all_out[0]["test_scores"]
File /usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_search.py:1375, in GridSearchCV._run_search(self, evaluate_candidates)
1373 def _run_search(self, evaluate_candidates):
1374 """Search all candidates in param_grid"""
-> 1375 evaluate_candidates(ParameterGrid(self.param_grid))
File /usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_search.py:822, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
814 if self.verbose > 0:
815 print(
816 "Fitting {0} folds for each of {1} candidates,"
817 " totalling {2} fits".format(
818 n_splits, n_candidates, n_candidates * n_splits
819 )
820 )
--> 822 out = parallel(
823 delayed(_fit_and_score)(
824 clone(base_estimator),
825 X,
826 y,
827 train=train,
828 test=test,
829 parameters=parameters,
830 split_progress=(split_idx, n_splits),
831 candidate_progress=(cand_idx, n_candidates),
832 **fit_and_score_kwargs,
833 )
834 for (cand_idx, parameters), (split_idx, (train, test)) in product(
835 enumerate(candidate_params), enumerate(cv.split(X, y, groups))
836 )
837 )
839 if len(out) < 1:
840 raise ValueError(
841 "No fits were performed. "
842 "Was the CV iterator empty? "
843 "Were there no candidates?"
844 )
File /usr/local/lib/python3.9/dist-packages/joblib/parallel.py:1056, in Parallel.__call__(self, iterable)
1053 self._iterating = False
1055 with self._backend.retrieval_context():
-> 1056 self.retrieve()
1057 # Make sure that we get a last message telling us we are done
1058 elapsed_time = time.time() - self._start_time
File /usr/local/lib/python3.9/dist-packages/joblib/parallel.py:935, in Parallel.retrieve(self)
933 try:
934 if getattr(self._backend, 'supports_timeout', False):
--> 935 self._output.extend(job.get(timeout=self.timeout))
936 else:
937 self._output.extend(job.get())
File /usr/local/lib/python3.9/dist-packages/joblib/_parallel_backends.py:542, in LokyBackend.wrap_future_result(future, timeout)
539 """Wrapper for Future.result to implement the same behaviour as
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
File /usr/lib/python3.9/concurrent/futures/_base.py:446, in Future.result(self, timeout)
444 raise CancelledError()
445 elif self._state == FINISHED:
--> 446 return self.__get_result()
447 else:
448 raise TimeoutError()
File /usr/lib/python3.9/concurrent/futures/_base.py:391, in Future.__get_result(self)
389 if self._exception:
390 try:
--> 391 raise self._exception
392 finally:
393 # Break a reference cycle with the exception in self._exception
394 self = None
RuntimeError: CUDA error encountered at: file=/project/cpp/src/decisiontree/batched-levelalgo/quantiles.cuh line=52: call='cub::DeviceRadixSort::SortKeys( nullptr, temp_storage_bytes, data, sorted_column.data(), n_rows, 0, 8 * sizeof(T), stream)', Reason=cudaErrorInvalidDeviceFunction:invalid device function
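The last line points at the GPU rather than at the grid search: cudaErrorInvalidDeviceFunction typically means the installed cuML build contains no kernels compiled for this GPU's compute capability (a build/architecture mismatch), so no scoring or parameter change will avoid it. A hedged check of the device's compute capability, assuming CuPy is available alongside cuML:

import cupy
props = cupy.cuda.runtime.getDeviceProperties(0)
# Compare major.minor against the architectures the installed RAPIDS build supports
print(props['name'], props['major'], props['minor'])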

What's the error when I use GridSearchCV?

When I used GridSearchCV to find the best parameter settings for SVR, some errors occurred.
The following is the code:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
param_grid = [
    {'kernel': ["poly"], 'degree': [2, 3, 4], 'C': [0.001, 0.1, 0.5, 1, 5, 10, 50, 100], 'episilon': [0.1, 0.2, 0.4]},
    {'kernal': ["rbf"], 'gamma': [1, 2, 5, 10], 'C': [0.001, 0.1, 0.5, 1, 5, 10, 50, 100]}
]
svm_reg = SVR()
grid_search = GridSearchCV(svm_reg, param_grid, cv=10, scoring = 'neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)
The input data X_train and y_train have been processed properly. The error shown in JupyterLab is:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-52-39598b5bc032> in <module>
7 svm_reg = SVR()
8 grid_search = GridSearchCV(svm_reg, param_grid, cv=10, scoring = 'neg_mean_squared_error', return_train_score=True)
----> 9 grid_search.fit(X_train, y_train)
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1286 def _run_search(self, evaluate_candidates):
1287 """Search all candidates in param_grid"""
-> 1288 evaluate_candidates(ParameterGrid(self.param_grid))
1289
1290
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
793 n_splits, n_candidates, n_candidates * n_splits))
794
--> 795 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
796 X, y,
797 train=train, test=test,
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\anaconda3\envs\jupyterlab\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
579 cloned_parameters[k] = clone(v, safe=False)
580
--> 581 estimator = estimator.set_params(**cloned_parameters)
582
583 start_time = time.time()
~\anaconda3\envs\jupyterlab\lib\site-packages\sklearn\base.py in set_params(self, **params)
228 key, delim, sub_key = key.partition('__')
229 if key not in valid_params:
--> 230 raise ValueError('Invalid parameter %s for estimator %s. '
231 'Check the list of available parameters '
232 'with `estimator.get_params().keys()`.' %
ValueError: Invalid parameter episilon for estimator SVR(C=0.001, degree=2). Check the list of available parameters with `estimator.get_params().keys()`.
The message says I should check the keys of SVR, but I cannot find the error.
They're just typos. In your code:
param_grid = [
    {'kernel': ["poly"], 'degree': [2, 3, 4], 'C': [0.001, 0.1, 0.5, 1, 5, 10, 50, 100], 'episilon': [0.1, 0.2, 0.4]},
    {'kernal': ["rbf"], 'gamma': [1, 2, 5, 10], 'C': [0.001, 0.1, 0.5, 1, 5, 10, 50, 100]}
]
Please check episilon (it should be epsilon) and kernal (it should be kernel).
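For illustration, the corrected grid with SVR's actual parameter names, kernel and epsilon:

param_grid = [
    {'kernel': ["poly"], 'degree': [2, 3, 4], 'C': [0.001, 0.1, 0.5, 1, 5, 10, 50, 100], 'epsilon': [0.1, 0.2, 0.4]},
    {'kernel': ["rbf"], 'gamma': [1, 2, 5, 10], 'C': [0.001, 0.1, 0.5, 1, 5, 10, 50, 100]}
]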

TypeError: take(): argument 'index' (position 1) must be Tensor, not numpy.ndarray

I'm new to PyTorch. I'm trying to do cross-validation, and I found the skorch library, which allows users to use sklearn functions with a torch model. So, I define a neural network class:
import torch
import torch.nn as nn

torch.manual_seed(42)

class Netcross(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(5, 30)
        self.sig1 = nn.Tanh()
        #self.dout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(30, 30)
        self.sig2 = nn.Sigmoid()
        self.out = nn.Linear(30, 1)
        self.out_act = nn.Sigmoid()
        #self.fc1.weight = torch.nn.Parameter(torch.rand(50,5))

    def forward(self, x):
        x = self.fc1(x)
        x = self.sig1(x)
        #x = self.dout(x)
        x = self.fc2(x)
        x = self.sig2(x)
        x = self.out(x)
        y = self.out_act(x)
        return y
from skorch import NeuralNet
from torch.autograd import Variable

crossnet1 = NeuralNet(
    Netcross,
    max_epochs=5,
    criterion=torch.nn.BCELoss,
    # user-defined metric callbacks, defined elsewhere
    callbacks=[epoch_acc, epoch_f1, epoch_phi],
    optimizer=torch.optim.SGD,
    optimizer__momentum=0.9,
    lr=0.85,
)

inputs = Variable(x_traintensor)
labels = Variable(y_traintensor)
crossnet1.fit(inputs, labels)
So far everything is fine; the fit returns credible results without any errors. The problem appears when I try to use GridSearchCV:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_epochs': [5, 10, 20],
              'lr': [0.1, 0.65, 0.8],
              }
gs = GridSearchCV(estimator=crossnet1, param_grid=param_grid, refit=False, cv=3, scoring="accuracy")
gs.fit(inputs, labels)
I get the following error:
TypeError Traceback (most recent call last)
<ipython-input-41-e1f3dbd9a2b0> in <module>
3 labels1 = torch.from_numpy(np.array(labels))
4
----> 5 gs.fit(inputs1, labels1)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
516 start_time = time.time()
517
--> 518 X_train, y_train = _safe_split(estimator, X, y, train)
519 X_test, y_test = _safe_split(estimator, X, y, test, train)
520
~\Anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
201 X_subset = X[np.ix_(indices, train_indices)]
202 else:
--> 203 X_subset = safe_indexing(X, indices)
204
205 if y is not None:
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices)
214 indices.dtype.kind == 'i'):
215 # This is often substantially faster than X[indices]
--> 216 return X.take(indices, axis=0)
217 else:
218 return X[indices]
TypeError: take(): argument 'index' (position 1) must be Tensor, not numpy.ndarray
What is wrong?
Change your input and labels to np.ndarray (see examples here).
Those will be cast to torch.Tensor automatically by skorch when needed.
All in all, change your
inputs = Variable(x_traintensor)
labels = Variable(y_traintensor)
to:
inputs = x_traintensor.numpy() # assuming x is torch.Tensor
labels = y_traintensor.numpy() # assuming y is torch.Tensor
By the way, torch.autograd.Variable is deprecated; you should use torch.tensor(data, requires_grad=True). In this case, inputs and labels do not need gradients, so Variable is even more out of place.
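Putting the answer together, a minimal sketch of the corrected calls (assuming x_traintensor and y_traintensor are torch.Tensors, and crossnet1 and param_grid are defined as above):

inputs = x_traintensor.numpy()   # skorch casts back to torch.Tensor internally
labels = y_traintensor.numpy()

crossnet1.fit(inputs, labels)

gs = GridSearchCV(estimator=crossnet1, param_grid=param_grid, refit=False, cv=3, scoring="accuracy")
gs.fit(inputs, labels)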

ValueError in pipeline - FeatureHasher not working?

I think I'm having issues getting my vectorizer working within a grid search pipeline.
Data as a pandas DataFrame x_train:
bathrooms bedrooms price building_id manager_id
10 1.5 3 3000 53a5b119ba8f7b61d4e010512e0dfc85 5ba989232d0489da1b5f2c45f6688adc
10000 1.0 2 5465 c5c8a357cba207596b04d1afd1e4f130 7533621a882f71e25173b27e3139d83d
100004 1.0 1 2850 c3ba40552e2120b0acfc3cb5730bb2aa d9039c43983f6e564b1482b273bd7b01
100007 1.0 1 3275 28d9ad350afeaab8027513a3e52ac8d5 1067e078446a7897d2da493d2f741316
100013 1.0 4 3350 0 98e13ad4b495b9613cef886d79a6291f
numeric_predictors = ['bathrooms', 'bedrooms', 'price']
categorical_predictors = ['building_id', 'manager_id']
MinMaxScaler fit & transform:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler

class MyScaler(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        self.scaler = MinMaxScaler()
        self.scaler.fit(X[self.cols])
        return self

    def transform(self, X):
        return self.scaler.transform(X[self.cols])
My categorical feature hashing vectorizer:
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import HashingVectorizer

class MyVectorizer(BaseEstimator, TransformerMixin):
    """
    Vectorize a set of categorical variables
    """

    def __init__(self, cols, hashing=None):
        """
        args:
            cols: a list of column names of the categorical variables
            hashing:
                If None, then vectorization is a simple one-hot-encoding.
                If an integer, then hashing is the number of features in the output.
        """
        self.cols = cols
        self.hashing = hashing

    def fit(self, X, y=None):
        data = X[self.cols]
        # Choose a vectorizer
        if self.hashing is None:
            self.myvec = HashingVectorizer()
        else:
            self.myvec = FeatureHasher(n_features=self.hashing)
        self.myvec.fit(X[self.cols].to_dict(orient='records'))
        return self

    def transform(self, X):
        # Vectorize input
        if self.hashing is None:
            return pd.DataFrame(
                self.myvec.transform(X[self.cols].to_dict(orient='records')),
                columns=self.myvec.feature_names_
            )
        else:
            return pd.DataFrame(
                self.myvec.transform(X[self.cols].to_dict(orient='records')).toarray()
            )
GridSearch hyperparameters:
search_params = {
    'preprocess__vectorize__hashing': [20, 40, 80],
    'predict__alpha': [.01, .1, 1, 2, 10]
}
The pipeline:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('preprocess', FeatureUnion([
        ('scale', MyScaler(cols=numeric_predictors)),
        ('vectorize', MyVectorizer(cols=categorical_predictors, hashing=5))
    ])),
    ('predict', MultinomialNB())
])
And last, calling this with the GridSearchCV classifier:
grid_search = GridSearchCV(pipeline, search_params)
grid_search.fit(x_train, y_train)
I get a ValueError: Input X must be non-negative. I checked, and my numeric_predictor columns' data are all non-negative, so I am narrowing it down to an issue with the hashing of the categorical predictors.
ValueError Traceback (most recent call last)
<ipython-input-62-50522376d1e5> in <module>()
1 grid_search = GridSearchCV(pipeline, search_params)
----> 2 grid_search.fit(x_train, y_train)
3 grid_search.best_params_
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.pyc in fit(self, X, y, groups, **fit_params)
636 error_score=self.error_score)
637 for parameters, (train, test) in product(candidate_params,
--> 638 cv.split(X, y, groups)))
639
640 # if one choose to see train score, "out" will contain train score info
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
435 estimator.fit(X_train, **fit_params)
436 else:
--> 437 estimator.fit(X_train, y_train, **fit_params)
438
439 except Exception as e:
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit(self, X, y, **fit_params)
257 Xt, fit_params = self._fit(X, y, **fit_params)
258 if self._final_estimator is not None:
--> 259 self._final_estimator.fit(Xt, y, **fit_params)
260 return self
261
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in fit(self, X, y, sample_weight)
602 self.feature_count_ = np.zeros((n_effective_classes, n_features),
603 dtype=np.float64)
--> 604 self._count(X, Y)
605 alpha = self._check_alpha()
606 self._update_feature_log_prob(alpha)
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in _count(self, X, Y)
706 """Count and smooth feature occurrences."""
707 if np.any((X.data if issparse(X) else X) < 0):
--> 708 raise ValueError("Input X must be non-negative")
709 self.feature_count_ += safe_sparse_dot(Y.T, X)
710 self.class_count_ += Y.sum(axis=0)
ValueError: Input X must be non-negative
> /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.py(708)_count()
706 """Count and smooth feature occurrences."""
707 if np.any((X.data if issparse(X) else X) < 0):
--> 708 raise ValueError("Input X must be non-negative")
709 self.feature_count_ += safe_sparse_dot(Y.T, X)
710 self.class_count_ += Y.sum(axis=0)
Yes, when hashing is not None, FeatureHasher() is chosen, which can output negative values.
But you can convert those negative values to positive by using the non_negative parameter of FeatureHasher, as given in the documentation:
non_negative : boolean, optional, default False
When True, an absolute value is applied to the features matrix prior
to returning it. When used in conjunction with
alternate_sign=True, this significantly reduces the inner product
preservation property.
So change this line in MyVectorizer:
self.myvec = FeatureHasher(n_features=self.hashing)
to this:
self.myvec = FeatureHasher(n_features=self.hashing, non_negative=True)
Note:
This parameter has been deprecated since version 0.19 and will be removed in 0.21.
You need to study how this parameter will affect your classification problem.
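Since non_negative is deprecated, an alternative sketch is to take the absolute value of the hashed matrix yourself in MyVectorizer.transform (shown here only for the FeatureHasher branch; same effect as non_negative=True without the deprecated flag):

import numpy as np

def transform(self, X):
    # Vectorize input; abs() clamps hashed features to non-negative values for MultinomialNB
    hashed = self.myvec.transform(X[self.cols].to_dict(orient='records'))
    return pd.DataFrame(np.abs(hashed.toarray()))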

sklearn GridSearchCV : ValueError: X has 21 features per sample; expecting 19

I'm attempting to run GridSearchCV for Logistic Regression in sklearn and the code is giving me the following error:
ValueError: X has 21 features per sample; expecting 19
The shapes of the training and testing data are
X_train.shape
(891L, 21L)
X_test.shape
(418L, 21L)
The code I'm using to run GridSearchCV is:
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

logistic = LogisticRegression()
parameters = [{'C': [1.0, 10.0, 100.0, 1000.0],
               'fit_intercept': ['True', 'False'],
               'intercept_scaling': [0, 1, 10, 100, 1000],
               'class_weight': ['auto'],
               'random_state': [26],
               'tol': [0.001, 0.01, 0.1, 1, 10, 100]
               }]

logistic = GridSearchCV(LogisticRegression(),
                        parameters,
                        cv=3,
                        refit=True,
                        verbose=1)
logistic = logistic.fit(X_train, y_train)
logit_pred = logistic.predict(X_test)
The traceback I'm getting is:
ValueError Traceback (most recent call last)
C:\Code\kaggle\titanic\titanic.py in <module>()
351
352
--> 353 logistic = logistic.fit(X_train, y_train)
354
355 logit_pred = logistic.predict(X_test)
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\grid_search.pyc in fit(self, X, y)
594
595 """
--> 596 return self._fit(X, y, ParameterGrid(self.param_grid))
597
598
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\grid_search.pyc in _fit(self, X, y, parameter_iterable)
376 train, test, self.verbose, parameters,
377 self.fit_params, return_parameters=True)
--> 378 for parameters in parameter_iterable
379 for train, test in cv)
380
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self, iterable)
651 self._iterating = True
652 for function, args, kwargs in iterable:
--> 653 self.dispatch(function, args, kwargs)
654
655 if pre_dispatch == "all" or n_jobs == 1:
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\externals\joblib\parallel.pyc in dispatch(self, func, args, kwargs)
398 """
399 if self._pool is None:
--> 400 job = ImmediateApply(func, args, kwargs)
401 index = len(self._jobs)
402 if not _verbosity_filter(index, self.verbose):
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __init__(self, func, args, kwargs)
136 # Don't delay the application, to avoid keeping the input
137 # arguments in memory
--> 138 self.results = func(*args, **kwargs)
139
140 def get(self):
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters)
1238 else:
1239 estimator.fit(X_train, y_train, **fit_params)
-> 1240 test_score = _score(estimator, X_test, y_test, scorer)
1241 if return_train_score:
1242 train_score = _score(estimator, X_train, y_train, scorer)
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
1294 score = scorer(estimator, X_test)
1295 else:
-> 1296 score = scorer(estimator, X_test, y_test)
1297 if not isinstance(score, numbers.Number):
1298 raise ValueError("scoring must return a number, got %s (%s) instead."
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\metrics\scorer.pyc in _passthrough_scorer(estimator, *args, **kwargs)
174 def _passthrough_scorer(estimator, *args, **kwargs):
175 """Function that wraps estimator.score"""
--> 176 return estimator.score(*args, **kwargs)
177
178
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\base.pyc in score(self, X, y, sample_weight)
289 """
290 from .metrics import accuracy_score
--> 291 return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
292
293
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\linear_model\base.pyc in predict(self, X)
213 Predicted class label per sample.
214 """
--> 215 scores = self.decision_function(X)
216 if len(scores.shape) == 1:
217 indices = (scores > 0).astype(np.int)
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\linear_model\base.pyc in decision_function(self, X)
194 if X.shape[1] != n_features:
195 raise ValueError("X has %d features per sample; expecting %d"
--> 196 % (X.shape[1], n_features))
197
198 scores = safe_sparse_dot(X, self.coef_.T,
ValueError: X has 21 features per sample; expecting 19
Why is GridSearchCV expecting a different number of features than the dataset contains?
UPDATE:
Thanks for the response Andy. The datasets are all type numpy.ndarray and dtype is float64.
type(X_Train) type(y_train) type(X_test)
numpy.ndarray numpy.ndarray numpy.ndarray
The steps right before I bring them into sklearn:
train_data = traindf.values
test_data = testdf.values
X_train = train_data[0::, 1::] # training features
y_train = train_data[0::, 0] # training targets
X_test = test_data[0::, 0::] # test features
The next step is the GridSearchCV code I typed above...
UPDATE 2: Link to Data
Here is a link to the datasets
The error is caused by intercept_scaling=0. It looks like a bug in scikit-learn.
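For illustration, a grid consistent with that diagnosis drops 0 from intercept_scaling (and, while at it, passes real booleans rather than the strings 'True' and 'False' for fit_intercept):

parameters = [{'C': [1.0, 10.0, 100.0, 1000.0],
               'fit_intercept': [True, False],
               'intercept_scaling': [1, 10, 100, 1000],  # 0 removed
               'class_weight': ['auto'],
               'random_state': [26],
               'tol': [0.001, 0.01, 0.1, 1, 10, 100]}]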
