scikit-learn FeatureUnion not working to combine text & numeric features - python

I'm trying to combine a textual column of movie plots I have in a dataset with a categorical column of each movie's rating (the MPAA rating - G, PG, PG-13, R; not an IMDb user's score). I'm using sklearn's FeatureUnion object, but I keep getting en error about the fit_transform method being called with too many named arguments. Here's my code:
# create training and testing sets
X_train, X_test, y_train, y_test = train_test_split(movie_ratings[['Genre', 'Plot']], pd.get_dummies(movie_ratings['Rated']), random_state=56)
''' create a processing pipeline and feature union '''
# create function transformers
get_genre_data = FunctionTransformer(lambda x: x['Genre'], validate=False)
get_plot_data = FunctionTransformer(lambda x: x['Plot'], validate=False)
# obtain the data
genres = get_genre_data.fit_transform(movie_ratings)
plots = get_plot_data.fit_transform(movie_ratings)
# # join the processing in a feature union
join_data_formats = FeatureUnion(
transformer_list = [
('genres', Pipeline([
('selector', get_genre_data),
('one_hot_encoder', LabelEncoder())
])),
('plots', Pipeline([
('selector', get_plot_data),
('count_vectorizer', CountVectorizer(tokenizer=nltk.tokenize)),
('tfidf_transformer', TfidfTransformer())
]))
]
)
# # instantiate a nested pipeline
pipeline = Pipeline([
('feature_union', join_data_formats),
('neural_network', MLPClassifier(alpha=0.01, hidden_layer_sizes=(100,), early_stopping=False, verbose=True))
])
# # fit the pipeline to the training data
pipeline.fit(X_train, y_train)
...and the error being thrown is:
34 # # fit the pipeline to the training data
---> 35 pipeline.fit(X_train, y_train)
...
TypeError: fit_transform() takes 2 positional arguments but 3 were given
Where am I going wrong? Thanks much for the help!
UPDATE: here's the full stack trace:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-171-f57d9b24a9c8> in <module>()
28 # print(y_test.shape)
29
---> 30 pipeline.fit(X_train, y_train)
31 y_pred = pipeline.predict(X_test)
32
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
246 This estimator
247 """
--> 248 Xt, fit_params = self._fit(X, y, **fit_params)
249 if self._final_estimator is not None:
250 self._final_estimator.fit(Xt, y, **fit_params)
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
211 Xt, fitted_transformer = fit_transform_one_cached(
212 cloned_transformer, None, Xt, y,
--> 213 **fit_params_steps[name])
214 # Replace the transformer of the step with the fitted
215 # transformer. This is necessary when loading the transformer
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\memory.py in __call__(self, *args, **kwargs)
360
361 def __call__(self, *args, **kwargs):
--> 362 return self.func(*args, **kwargs)
363
364 def call_and_shelve(self, *args, **kwargs):
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
579 **fit_params):
580 if hasattr(transformer, 'fit_transform'):
--> 581 res = transformer.fit_transform(X, y, **fit_params)
582 else:
583 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
737 delayed(_fit_transform_one)(trans, weight, X, y,
738 **fit_params)
--> 739 for name, trans, weight in self._iter())
740
741 if not result:
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
579 **fit_params):
580 if hasattr(transformer, 'fit_transform'):
--> 581 res = transformer.fit_transform(X, y, **fit_params)
582 else:
583 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
281 Xt, fit_params = self._fit(X, y, **fit_params)
282 if hasattr(last_step, 'fit_transform'):
--> 283 return last_step.fit_transform(Xt, y, **fit_params)
284 elif last_step is None:
285 return Xt
TypeError: fit_transform() takes 2 positional arguments but 3 were given

Related

KNN imputation of numerical variables in pipleine in Dataframe- Python

All,
I'm facing an issue while trying to use KNN imputer in a pipleline. I have listed my workflow as below.
I have separated my Numerical and Categorical Variables and built a pipleline as below
numeric_transformer = Pipeline(steps=[
('imputer', KNN(k=3)),
('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, num_attr),
('cat', categorical_transformer, cat_attr)])
I want to use KNN imputer to impute the missing values in the numerical columns.
I ran logistic regression
clf_logreg = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LogisticRegression())])
clf_logreg.fit(X_train, Y_train)
The above code chunk worked fine but when i'm trying to predict on X_train, i'm getting the below error. Please help me out. Thanks
train_pred_logreg = clf_logreg.predict(X_train)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-121-f17e49913947> in <module>
1 #train_pred_logreg = clf_logreg.predict(X_train)
----> 2 test_pred_logreg = clf_logreg.predict(X_test)
3
4 print(confusion_matrix(y_true=Y_train, y_pred = train_pred_logreg))
5
/opt/conda/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
419 Xt = X
420 for _, name, transform in self._iter(with_final=False):
--> 421 Xt = transform.transform(Xt)
422 return self.steps[-1][-1].predict(Xt, **predict_params)
423
/opt/conda/lib/python3.6/site-packages/sklearn/compose/_column_transformer.py in transform(self, X)
537 'remainder keyword')
538
--> 539 Xs = self._fit_transform(X, None, _transform_one, fitted=True)
540 self._validate_output(Xs)
541
/opt/conda/lib/python3.6/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
418 message=self._log_message(name, idx, len(transformers)))
419 for idx, (name, trans, column, weight) in enumerate(
--> 420 self._iter(fitted=fitted, replace_strings=True), 1))
421 except ValueError as e:
422 if "Expected 2D array, got 1D array instead" in str(e):
/opt/conda/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
/opt/conda/lib/python3.6/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
/opt/conda/lib/python3.6/site-packages/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
/opt/conda/lib/python3.6/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
/opt/conda/lib/python3.6/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
/opt/conda/lib/python3.6/site-packages/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
/opt/conda/lib/python3.6/site-packages/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in _transform_one(transformer, X, y, weight, **fit_params)
693
694 def _transform_one(transformer, X, y, weight, **fit_params):
--> 695 res = transformer.transform(X)
696 # if we have a weight for this transformer, multiply output
697 if weight is None:
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in _transform(self, X)
538 Xt = X
539 for _, _, transform in self._iter():
--> 540 Xt = transform.transform(Xt)
541 return Xt
542
/opt/conda/lib/python3.6/site-packages/fancyimpute/solver.py in transform(self, X, y)
223 "doesn't support inductive mode. Only %s.fit_transform is "
224 "supported at this time." % (
--> 225 self.__class__.__name__, self.__class__.__name__))
ValueError: KNN.transform not implemented! This imputation algorithm likely doesn't support inductive mode. Only KNN.fit_transform is supported at this time.
When I try to use fit_transform as shown in the error message I get the below error
clf_logreg.fit_transform(X_train, Y_train)
AttributeError: 'LogisticRegression' object has no attribute 'transform'
Try clf_logreg.preprocessor.fit_transform(X_train, Y_train) to test if KNN works. As per the logistic regression, it doesn't have a transform method as it's a classification method.

Python Pipeline Error When Used in Cross Validation

I have a complex error that can't interpret and can't figure out. Evidently my preprocessing pipeline works when fitting a model simplistically but fails when I attempt cross validation. I can't deciper the error and don't understand the issue. Please help.
Preprocessing
I have created a pipeline that performs some pre-processing tasks on data. It works. Includes some customer transformers. Here is the code.
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
class column_selector(BaseEstimator,TransformerMixin):
def __init__(self, columns: list):
self.cols = columns
def fit(self,X,y=None):
return self
def transform(self, X, y=None):
return X.loc[:, self.cols]
class dummy_creator(BaseEstimator,TransformerMixin):
def __init__(self):
pass
def fit(self, X, y=None):
# stateless transformer
return self
def transform(self, X):
X_categorical_scaled_df = pd.get_dummies(X)
return X_categorical_scaled_df
class DFStandardScaler(BaseEstimator,TransformerMixin):
def __init__(self):
self.ss = None
def fit(self,X,y=None):
self.ss = StandardScaler().fit(X)
return self
def transform(self, X):
Xss = self.ss.transform(X)
X_continuous_scaled_df = pd.DataFrame(Xss, index=X.index, columns=X.columns)
return X_continuous_scaled_df
pipeline_categorical = Pipeline(steps = [
('column_selector', column_selector(categorical_features)),
('create_dummies', dummy_creator())
])
pipeline_continuous = Pipeline(steps = [
('column_selector', column_selector(numeric_features)),
('scaler',DFStandardScaler())
])
feature_union = FeatureUnion([('cat', pipeline_categorical),
('cont', pipeline_continuous)])
If I fit_transform the the pipeline I get good results:
X_train_enc = feature_union.fit_transform(X_train)
X_train_enc
>>>array([[ 0. , 1. , 0. , ..., -0.05977797,
-0.21011127, -0.24460191],
[ 1. , 0. , 0. , ..., -0.68765273,
-0.00946558, -0.82457039],
[ 0. , 1. , 0. , ..., -1.06122696,
Model Without Cross Validation
If I now make a pipeline with the above pre-processing pipeline and a model (in this case Linear Regression) I still get good results (just predictions shown below to indicated data properly pre-processed and model fit):
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(feature_union, LinearRegression())
pipe.fit(X_train, y_train)
pipe.predict(X_validation)
>>>array([ 9.17773438, 9.38226318, 8.35693359, 10.62176514, 11.29095459,
7.45025635, 6.03497314, 10.04321289, 10.57568359, 9.86663818,
7.01202393, 8.08374023, 8.80700684, 10.80102539, 12.32678223,
6.7588501 , 10.44604492, 6.86547852, 9.20465088, 9.04406738,
Model With Cross Validation
Now I attempt to test the same model using cross validation. You will note that I put the pipeline into a list (`pipelines') and the cross-validation in a loop. This is because I intend to create a list of pipelines similar to this with different models and loop through them but that is outside the scope of my issue (but just in case you are wondering why I have coded it this way)
seed = 7
pipelines = []
pipelines.append(('ScaledLR',Pipeline([('Preprocess', feature_union),('LR', LinearRegression())])))
results=[]
names=[]
scoring='neg_mean_squared_error'
for name, model in pipelines:
kfold=KFold(n_splits=10, random_state=7)
cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
print("%s %f (%r)" % (name, cv_results.mean(), cv_results.std()))
And I get the following error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2655 try:
-> 2656 return self._engine.get_loc(key)
2657 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: None
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-70-1aa4a50ac843> in <module>
29
30 kfold=KFold(n_splits=10, random_state=7)
---> 31 cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
32 results.append(cv_results)
33 names.append(name)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
400 fit_params=fit_params,
401 pre_dispatch=pre_dispatch,
--> 402 error_score=error_score)
403 return cv_results['test_score']
404
~\Anaconda2\envs\py36\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
238 return_times=True, return_estimator=return_estimator,
239 error_score=error_score)
--> 240 for train, test in cv.split(X, y, groups))
241
242 zipped_scores = list(zip(*scores))
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
526 estimator.fit(X_train, **fit_params)
527 else:
--> 528 estimator.fit(X_train, y_train, **fit_params)
529
530 except Exception as e:
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
263 This estimator
264 """
--> 265 Xt, fit_params = self._fit(X, y, **fit_params)
266 if self._final_estimator is not None:
267 self._final_estimator.fit(Xt, y, **fit_params)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
228 Xt, fitted_transformer = fit_transform_one_cached(
229 cloned_transformer, Xt, y, None,
--> 230 **fit_params_steps[name])
231 # Replace the transformer of the step with the fitted
232 # transformer. This is necessary when loading the transformer
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\memory.py in __call__(self, *args, **kwargs)
340
341 def __call__(self, *args, **kwargs):
--> 342 return self.func(*args, **kwargs)
343
344 def call_and_shelve(self, *args, **kwargs):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
791 delayed(_fit_transform_one)(trans, X, y, weight,
792 **fit_params)
--> 793 for name, trans, weight in self._iter())
794
795 if not result:
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
296 """
297 last_step = self._final_estimator
--> 298 Xt, fit_params = self._fit(X, y, **fit_params)
299 if hasattr(last_step, 'fit_transform'):
300 return last_step.fit_transform(Xt, y, **fit_params)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
228 Xt, fitted_transformer = fit_transform_one_cached(
229 cloned_transformer, Xt, y, None,
--> 230 **fit_params_steps[name])
231 # Replace the transformer of the step with the fitted
232 # transformer. This is necessary when loading the transformer
~\Anaconda2\envs\py36\lib\site-packages\sklearn\externals\joblib\memory.py in __call__(self, *args, **kwargs)
340
341 def __call__(self, *args, **kwargs):
--> 342 return self.func(*args, **kwargs)
343
344 def call_and_shelve(self, *args, **kwargs):
~\Anaconda2\envs\py36\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda2\envs\py36\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
463 else:
464 # fit method of arity 2 (supervised transformation)
--> 465 return self.fit(X, y, **fit_params).transform(X)
466
467
<ipython-input-24-666c2228e73d> in transform(self, X, y)
13
14 def transform(self, X, y=None):
---> 15 return X.loc[:, self.cols]
16
17
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1492 except (KeyError, IndexError, AttributeError):
1493 pass
-> 1494 return self._getitem_tuple(key)
1495 else:
1496 # we by definition only have the 0th axis
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
866 def _getitem_tuple(self, tup):
867 try:
--> 868 return self._getitem_lowerdim(tup)
869 except IndexingError:
870 pass
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup)
986 for i, key in enumerate(tup):
987 if is_label_like(key) or isinstance(key, tuple):
--> 988 section = self._getitem_axis(key, axis=i)
989
990 # we have yielded a scalar ?
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1911 # fall thru to straight lookup
1912 self._validate_key(key, axis)
-> 1913 return self._get_label(key, axis=axis)
1914
1915
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
139 raise IndexingError('no slices here, handle elsewhere')
140
--> 141 return self.obj._xs(label, axis=axis)
142
143 def _get_loc(self, key, axis=None):
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
3574
3575 if axis == 1:
-> 3576 return self[key]
3577
3578 self._consolidate_inplace()
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2925 if self.columns.nlevels > 1:
2926 return self._getitem_multilevel(key)
-> 2927 indexer = self.columns.get_loc(key)
2928 if is_integer(indexer):
2929 indexer = [indexer]
~\Anaconda2\envs\py36\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2656 return self._engine.get_loc(key)
2657 except KeyError:
-> 2658 return self._engine.get_loc(self._maybe_cast_indexer(key))
2659 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2660 if indexer.ndim > 1 or indexer.size > 1:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: None
The error appears to be located in the pre-processing featureunion - but I'm not sure exactly where or why. I believe it might be in the create_dummies class around the pd.get_dummies() function but not sure.
Can someone advise as to what is going on?
I am aware this is an old question but it was among the first that popped up when I was challenged with the same problem (code working without cross-validation and failing with it).
The reason this fails is because the signature of your custom estimator column_selector does not match the attributes. From sklearn documentation on developing estimators:
In addition, every keyword argument accepted by __init__ should correspond to an attribute on the instance. Scikit-learn relies on this to find the relevant attributes to set on an estimator when doing model selection.
It should work fine if you change it to:
class column_selector(BaseEstimator,TransformerMixin):
def __init__(self, columns: list):
self.columns = columns
def fit(self,X,y=None):
return self
def transform(self, X, y=None):
return X.loc[:, self.columns]
After a day and a half of brain busting I created a work-around.
I removed the pre-processing feature union from the final pipeline that included the feature union and the model. I run the feature union on the training set within each pipeline loop, and then invoke the model in cross_validation while feeding in the encoded/transformed variables. Here is the code.
from sklearn.pipeline import Pipeline
seed = 7
#prepare Models
pipelines = []
pipelines.append(('ScaledLR',Pipeline([('LR',LinearRegression())])))
#evaluate each model
results=[]
names=[]
scoring='neg_mean_squared_error'
for name, model in pipelines:
X_train_enc = feature_union.fit_transform(X_train)
y_train = Y_train.values
kfold=KFold(n_splits=10, random_state=7)
cv_results = cross_val_score(model, X_train_enc, y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
print("%s %f (%r)" % (name, cv_results.mean(),cv_results.std()))
Of course I don't technically need the final pipeline that simply calls the model. However I am keeping for now as this is work in progress and I might put some other steps in.

Sklearn Pipeline/ColumnTransformer throwing ValueError: too many values to unpack (expected 2)

I'm trying to run sklearn ColumnTransformer for individual Pipelines. My dataframe is called listings_prepared. All features are float or int. The dataframe is clean--the only issue is a few missing values which are marked Nan, so SimpleImputer should handle it...
Sklearn provides documentation here on running pipelines through ColumnTransformer, which is what I've followed.
First, I create the pipelines using Pipeline:
num_pipeline = Pipeline([
('num_imputer', SimpleImputer(strategy='median')),
('num_scaler', StandardScaler()),
])
disc_pipeline = Pipeline([
('disc_imputer', SimpleImputer(strategy='most_frequent')),
('disc_scaler', StandardScaler(), disc_attribs),
])
cat_pipeline = Pipeline([
('cat_imputer', SimpleImputer(strategy='most_frequent')),
('cat_ohe', OneHotEncoder(categories='auto', drop='first',
sparse=False)),
])
amen_pipeline= Pipeline([
('amen_imputer', SimpleImputer(strategy='most_frequent')),
])
Then, I run them through ColumnTransformer:
listings_pipeline = ColumnTransformer([
('num', num_pipeline, num_attribs),
('disc', disc_pipeline, disc_attribs),
('cat', cat_pipeline, cat_attribs),
('amen', amen_pipeline, amen_attribs),
])
X_train = listings_pipeline.fit_transform(listings_explore)
Here's the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-511-9d5100fe0f5d> in <module>
12 ('amen', amen_pipeline, amen_attribs),
13 ])
---> 14 X_train = listings_pipeline.fit_transform(listings_explore_pipeline)
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
466 self._validate_remainder(X)
467
--> 468 result = self._fit_transform(X, y, _fit_transform_one)
469
470 if not result:
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
410 message=self._log_message(name, idx, len(transformers)))
411 for idx, (name, trans, column, weight) in enumerate(
--> 412 self._iter(fitted=fitted, replace_strings=True), 1))
413 except ValueError as e:
414 if "Expected 2D array, got 1D array instead" in str(e):
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
922 self._iterating = self._original_iterator is not None
923
--> 924 while self.dispatch_one_batch(iterator):
925 pass
926
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 """
386 last_step = self._final_estimator
--> 387 Xt, fit_params = self._fit(X, y, **fit_params)
388 with _print_elapsed_time('Pipeline',
389 self._log_message(len(self.steps) - 1)):
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
270 fit_transform_one_cached = memory.cache(_fit_transform_one)
271
--> 272 fit_params_steps = {name: {} for name, step in self.steps
273 if step is not None}
274 for pname, pval in fit_params.items():
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in <dictcomp>(.0)
270 fit_transform_one_cached = memory.cache(_fit_transform_one)
271
--> 272 fit_params_steps = {name: {} for name, step in self.steps
273 if step is not None}
274 for pname, pval in fit_params.items():
ValueError: too many values to unpack (expected 2)
Why isn't this working?

How to use numeric indices in cross-validation with pd.DataFrame in scikit-learn (disable _safe_split)?

I want to disable the safe_indexing and force the indicies that I've given my model.
I can't simply do X.values and y.values because I have a custom classifier that I've made where the column/attribute labels used during the __init__ (crucial for the algorithm).
This is from the following line of code:
model_selection.cross_val_score(model, X=X, y=y, cv=cv, n_jobs=1, scoring="accuracy")
where cv is a list of lists with numeric indices
X has to be a pd.DataFrame and cv has to be predefined indicies. How I can make this work?
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-74-e1775ca32abb> in <module>()
1 smc.fit(X,y)
----> 2 smc.cross_validate(X,y,cv=cv, n_jobs=1)
<ipython-input-72-61f814fd075c> in cross_validate(self, X, y, cv, scoring, n_jobs, **args)
150 cv_idx.append((idx_tr.map(lambda x:X.index.get_loc(x)), idx_te.map(lambda x:X.index.get_loc(x))))
151 cv = cv_idx
--> 152 return model_selection.cross_val_score(self, X=X, y=y, cv=cv, n_jobs=n_jobs, scoring=scoring, **args)
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
340 n_jobs=n_jobs, verbose=verbose,
341 fit_params=fit_params,
--> 342 pre_dispatch=pre_dispatch)
343 return cv_results['test_score']
344
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
204 fit_params, return_train_score=return_train_score,
205 return_times=True)
--> 206 for train, test in cv.split(X, y, groups))
207
208 if return_train_score:
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
446 start_time = time.time()
447
--> 448 X_train, y_train = _safe_split(estimator, X, y, train)
449 X_test, y_test = _safe_split(estimator, X, y, test, train)
450
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
198 X_subset = X[np.ix_(indices, train_indices)]
199 else:
--> 200 X_subset = safe_indexing(X, indices)
201
202 if y is not None:
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/utils/__init__.py in safe_indexing(X, indices)
144 if hasattr(X, "iloc"):
145 # Work-around for indexing with read-only indices in pandas
--> 146 indices = indices if indices.flags.writeable else indices.copy()
147 # Pandas Dataframes and Series
148 try:
AttributeError: 'list' object has no attribute 'flags'
In response to the suggestions in the comments (2018-June-04):

Python GridSearchCV index xxxxx is out of bounds for size xxxxxx

I am using a GridSerach to search for the best hyperparameters of a classifier as described in here: http://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html
Here is a how a piece of code looks like:
X = X.values # convert from pandas Dataframe to numpy array
y = np.array(y)
n_samples, n_features = X.shape
n_outputs = y.shape[0]
inner_cv = cross_validation.StratifiedKFold(y, n_folds=4, shuffle=True, random_state=rnd)
outer_cv = cross_validation.StratifiedKFold(y, n_folds=kFold, shuffle=True, random_state=rnd)
# Non_nested parameter search and scoring
clf = GridSearchCV(estimator=pipeline, param_grid=param_dict, scoring= scores, cv=inner_cv)
# Nested CV with parameter optimization
nested_score = cross_validation.cross_val_score(clf, X=X, y=y, cv=outer_cv)
nested_score.fit(X,y)
nested_scores = nested_score.mean()
however for some reason I am getting this error:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-1-cad4e848fb54> in <module>()
124
125 # Nested CV with parameter optimization
--> 126 nested_score = cross_validation.cross_val_score(clf, X=X, y=y, cv=outer_cv)
127 nested_score.fit(X,y)
128 nested_scores = nested_score.mean()
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\cross_validation.py in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
1431 train, test, verbose, None,
1432 fit_params)
-> 1433 for train, test in cv)
1434 return np.array(scores)[:, 0]
1435
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1529 estimator.fit(X_train, **fit_params)
1530 else:
-> 1531 estimator.fit(X_train, y_train, **fit_params)
1532
1533 except Exception as e:
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\grid_search.py in fit(self, X, y)
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
805
806
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\grid_search.py in _fit(self, X, y, parameter_iterable)
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
554 for train, test in cv)
555
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1522 start_time = time.time()
1523
-> 1524 X_train, y_train = _safe_split(estimator, X, y, train)
1525 X_test, y_test = _safe_split(estimator, X, y, test, train)
1526
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\cross_validation.py in _safe_split(estimator, X, y, indices, train_indices)
1589 X_subset = X[np.ix_(indices, train_indices)]
1590 else:
-> 1591 X_subset = safe_indexing(X, indices)
1592
1593 if y is not None:
C:\Users\Yas\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices)
161 indices.dtype.kind == 'i'):
162 # This is often substantially faster than X[indices]
--> 163 return X.take(indices, axis=0)
164 else:
165 return X[indices]
IndexError: index 4549 is out of bounds for size 4549
X and y has the following dimensions:
X: (6066, 5)
y: (6066,)
everything looks normal. Where is the problem originating from?
Thanks for sharing your opinion.
​
Not sure what you are trying to do here but GridsearchCV is not a classifier and therefore you can't pass it to cross_val_score.
GridsearchCV runs a cross validation multiple times using different parameters. Therefore it represents multiple classifiers. It does have a best_classifier attribute once it has been fitted.

Categories