Python Pipeline Error When Used in Cross-Validation
I have a complex error that I can't interpret or figure out. My preprocessing pipeline works when I fit a model directly but fails when I attempt cross-validation. I can't decipher the error and don't understand the issue. Please help.
Preprocessing
I have created a pipeline that performs some pre-processing tasks on the data, and it works. It includes some custom transformers. Here is the code.
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

class column_selector(BaseEstimator, TransformerMixin):
    def __init__(self, columns: list):
        self.cols = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.loc[:, self.cols]

class dummy_creator(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        X_categorical_scaled_df = pd.get_dummies(X)
        return X_categorical_scaled_df

class DFStandardScaler(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ss = None

    def fit(self, X, y=None):
        self.ss = StandardScaler().fit(X)
        return self

    def transform(self, X):
        Xss = self.ss.transform(X)
        X_continuous_scaled_df = pd.DataFrame(Xss, index=X.index, columns=X.columns)
        return X_continuous_scaled_df

pipeline_categorical = Pipeline(steps=[
    ('column_selector', column_selector(categorical_features)),
    ('create_dummies', dummy_creator())
])

pipeline_continuous = Pipeline(steps=[
    ('column_selector', column_selector(numeric_features)),
    ('scaler', DFStandardScaler())
])

feature_union = FeatureUnion([('cat', pipeline_categorical),
                              ('cont', pipeline_continuous)])
If I fit_transform the feature union I get good results:
X_train_enc = feature_union.fit_transform(X_train)
X_train_enc
>>>array([[ 0. , 1. , 0. , ..., -0.05977797,
-0.21011127, -0.24460191],
[ 1. , 0. , 0. , ..., -0.68765273,
-0.00946558, -0.82457039],
[ 0. , 1. , 0. , ..., -1.06122696,
Model Without Cross Validation
If I now make a pipeline with the above pre-processing pipeline and a model (in this case linear regression), I still get good results (just the predictions are shown below, to indicate that the data was properly pre-processed and the model fit):
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

pipe = make_pipeline(feature_union, LinearRegression())
pipe.fit(X_train, y_train)
pipe.predict(X_validation)
>>>array([ 9.17773438, 9.38226318, 8.35693359, 10.62176514, 11.29095459,
7.45025635, 6.03497314, 10.04321289, 10.57568359, 9.86663818,
7.01202393, 8.08374023, 8.80700684, 10.80102539, 12.32678223,
6.7588501 , 10.44604492, 6.86547852, 9.20465088, 9.04406738,
Model With Cross Validation
Now I attempt to test the same model using cross-validation. You will note that I put the pipeline into a list (pipelines) and run the cross-validation in a loop. This is because I intend to create a list of similar pipelines with different models and loop through them; that is outside the scope of my issue, but it explains why I have coded it this way.
from sklearn.model_selection import KFold, cross_val_score

seed = 7
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Preprocess', feature_union), ('LR', LinearRegression())])))
results = []
names = []
scoring = 'neg_mean_squared_error'
for name, model in pipelines:
    kfold = KFold(n_splits=10, random_state=7)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print("%s %f (%r)" % (name, cv_results.mean(), cv_results.std()))
And I get the following error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2655 try:
-> 2656 return self._engine.get_loc(key)
2657 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: None
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-70-1aa4a50ac843> in <module>
29
30 kfold=KFold(n_splits=10, random_state=7)
---> 31 cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
32 results.append(cv_results)
33 names.append(name)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
400 fit_params=fit_params,
401 pre_dispatch=pre_dispatch,
--> 402 error_score=error_score)
403 return cv_results['test_score']
404
~\Anaconda2\envs\py36\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
238 return_times=True, return_estimator=return_estimator,
239 error_score=error_score)
--> 240 for train, test in cv.split(X, y, groups))
241
242 zipped_scores = list(zip(*scores))
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
526 estimator.fit(X_train, **fit_params)
527 else:
--> 528 estimator.fit(X_train, y_train, **fit_params)
529
530 except Exception as e:
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
263 This estimator
264 """
--> 265 Xt, fit_params = self._fit(X, y, **fit_params)
266 if self._final_estimator is not None:
267 self._final_estimator.fit(Xt, y, **fit_params)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
228 Xt, fitted_transformer = fit_transform_one_cached(
229 cloned_transformer, Xt, y, None,
--> 230 **fit_params_steps[name])
231 # Replace the transformer of the step with the fitted
232 # transformer. This is necessary when loading the transformer
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\memory.py in __call__(self, *args, **kwargs)
340
341 def __call__(self, *args, **kwargs):
--> 342 return self.func(*args, **kwargs)
343
344 def call_and_shelve(self, *args, **kwargs):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
791 delayed(_fit_transform_one)(trans, X, y, weight,
792 **fit_params)
--> 793 for name, trans, weight in self._iter())
794
795 if not result:
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
296 """
297 last_step = self._final_estimator
--> 298 Xt, fit_params = self._fit(X, y, **fit_params)
299 if hasattr(last_step, 'fit_transform'):
300 return last_step.fit_transform(Xt, y, **fit_params)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
228 Xt, fitted_transformer = fit_transform_one_cached(
229 cloned_transformer, Xt, y, None,
--> 230 **fit_params_steps[name])
231 # Replace the transformer of the step with the fitted
232 # transformer. This is necessary when loading the transformer
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\memory.py in __call__(self, *args, **kwargs)
340
341 def __call__(self, *args, **kwargs):
--> 342 return self.func(*args, **kwargs)
343
344 def call_and_shelve(self, *args, **kwargs):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
463 else:
464 # fit method of arity 2 (supervised transformation)
--> 465 return self.fit(X, y, **fit_params).transform(X)
466
467
<ipython-input-24-666c2228e73d> in transform(self, X, y)
13
14 def transform(self, X, y=None):
---> 15 return X.loc[:, self.cols]
16
17
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1492 except (KeyError, IndexError, AttributeError):
1493 pass
-> 1494 return self._getitem_tuple(key)
1495 else:
1496 # we by definition only have the 0th axis
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
866 def _getitem_tuple(self, tup):
867 try:
--> 868 return self._getitem_lowerdim(tup)
869 except IndexingError:
870 pass
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup)
986 for i, key in enumerate(tup):
987 if is_label_like(key) or isinstance(key, tuple):
--> 988 section = self._getitem_axis(key, axis=i)
989
990 # we have yielded a scalar ?
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1911 # fall thru to straight lookup
1912 self._validate_key(key, axis)
-> 1913 return self._get_label(key, axis=axis)
1914
1915
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
139 raise IndexingError('no slices here, handle elsewhere')
140
--> 141 return self.obj._xs(label, axis=axis)
142
143 def _get_loc(self, key, axis=None):
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
3574
3575 if axis == 1:
-> 3576 return self[key]
3577
3578 self._consolidate_inplace()
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2925 if self.columns.nlevels > 1:
2926 return self._getitem_multilevel(key)
-> 2927 indexer = self.columns.get_loc(key)
2928 if is_integer(indexer):
2929 indexer = [indexer]
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2656 return self._engine.get_loc(key)
2657 except KeyError:
-> 2658 return self._engine.get_loc(self._maybe_cast_indexer(key))
2659 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2660 if indexer.ndim > 1 or indexer.size > 1:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: None
The error appears to originate in the pre-processing FeatureUnion, but I'm not sure exactly where or why. I suspect it might be in the dummy_creator class, around the pd.get_dummies() call, but I'm not sure.
Can someone advise as to what is going on?
I am aware this is an old question, but it was among the first that popped up when I was faced with the same problem (code working without cross-validation and failing with it).
The reason this fails is that the __init__ signature of your custom estimator column_selector does not match its attribute names: the parameter is called columns, but it is stored as self.cols. From the sklearn documentation on developing estimators:
In addition, every keyword argument accepted by __init__ should correspond to an attribute on the instance. Scikit-learn relies on this to find the relevant attributes to set on an estimator when doing model selection.
During cross-validation, scikit-learn clones the estimator for each fold. clone() reads the constructor parameters via get_params(), which looks up an attribute named after each __init__ argument; because your instance stores cols rather than columns, the original column list is lost and the clone is rebuilt with columns=None. Its transform() then evaluates X.loc[:, None], which is exactly the KeyError: None at the bottom of your traceback. A plain fit() never clones, which is why the pipeline works outside cross-validation. It should work fine if you change it to:
class column_selector(BaseEstimator, TransformerMixin):
    def __init__(self, columns: list):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.loc[:, self.columns]
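You can watch the mechanism in isolation. The bad_selector class below is a hypothetical minimal reproduction (not from the question), and the exact symptom depends on the scikit-learn version: older releases silently fall back to None for the missing parameter, newer ones raise AttributeError, but either way the clone never receives the original column list.

from sklearn.base import BaseEstimator, TransformerMixin, clone

class bad_selector(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.cols = columns  # parameter name and attribute name disagree

sel = bad_selector(columns=['a', 'b'])
try:
    print(sel.get_params())  # older scikit-learn: {'columns': None}
    print(clone(sel).cols)   # None -> X.loc[:, None] later raises KeyError: None
except AttributeError as err:
    # newer scikit-learn raises here instead of silently returning None
    print(err)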
After a day and a half of brain-busting I created a work-around.
I removed the pre-processing feature union from the final pipeline, which had contained both the feature union and the model. Instead I run the feature union on the training set inside the pipeline loop and then pass the encoded/transformed variables to the model in cross_val_score. Here is the code.
from sklearn.pipeline import Pipeline

seed = 7

# prepare models
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('LR', LinearRegression())])))

# evaluate each model
results = []
names = []
scoring = 'neg_mean_squared_error'
for name, model in pipelines:
    X_train_enc = feature_union.fit_transform(X_train)
    y_train = Y_train.values
    kfold = KFold(n_splits=10, random_state=7)
    cv_results = cross_val_score(model, X_train_enc, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print("%s %f (%r)" % (name, cv_results.mean(), cv_results.std()))
Of course I don't technically need the final pipeline that simply wraps the model, but I am keeping it for now since this is a work in progress and I might add other steps.
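One caveat about this work-around: fitting feature_union on all of X_train before calling cross_val_score means the scaler statistics and dummy columns are computed from rows in every fold, so each validation fold has leaked into the preprocessing. With the attribute-name fix from the answer above, the original all-in-one pipeline runs under cross-validation and refits the preprocessing on each training fold. A minimal sketch, assuming the corrected column_selector and the same feature_union:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline

# Preprocessing stays inside the pipeline, so cross_val_score refits it on
# each training fold and only transforms the corresponding held-out fold.
pipe = Pipeline([('Preprocess', feature_union), ('LR', LinearRegression())])
kfold = KFold(n_splits=10, random_state=7)  # newer scikit-learn also requires shuffle=True when random_state is set
scores = cross_val_score(pipe, X_train, y_train, cv=kfold,
                         scoring='neg_mean_squared_error')
print(scores.mean(), scores.std())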