ValueError in pipeline - FeatureHasher not working? - python

I think I'm having issues getting my vectorizer working within a grid-search pipeline.
Data as a pandas DataFrame, x_train:
        bathrooms  bedrooms  price                       building_id                        manager_id
10            1.5         3   3000  53a5b119ba8f7b61d4e010512e0dfc85  5ba989232d0489da1b5f2c45f6688adc
10000         1.0         2   5465  c5c8a357cba207596b04d1afd1e4f130  7533621a882f71e25173b27e3139d83d
100004        1.0         1   2850  c3ba40552e2120b0acfc3cb5730bb2aa  d9039c43983f6e564b1482b273bd7b01
100007        1.0         1   3275  28d9ad350afeaab8027513a3e52ac8d5  1067e078446a7897d2da493d2f741316
100013        1.0         4   3350                                 0  98e13ad4b495b9613cef886d79a6291f
numeric_predictors = ['bathrooms', 'bedrooms', 'price']
categorical_predictors = ['building_id', 'manager_id']
MinMaxScaler fit & transform:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler

class MyScaler(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        self.scaler = MinMaxScaler()
        self.scaler.fit(X[self.cols])
        return self

    def transform(self, X):
        return self.scaler.transform(X[self.cols])
My categorical feature hashing vectorizer:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import HashingVectorizer

class MyVectorizer(BaseEstimator, TransformerMixin):
    """
    Vectorize a set of categorical variables
    """

    def __init__(self, cols, hashing=None):
        """
        args:
            cols: a list of column names of the categorical variables
            hashing:
                If None, then vectorization is a simple one-hot-encoding.
                If an integer, then hashing is the number of features in the output.
        """
        self.cols = cols
        self.hashing = hashing

    def fit(self, X, y=None):
        data = X[self.cols]

        # Choose a vectorizer
        if self.hashing is None:
            self.myvec = HashingVectorizer()
        else:
            self.myvec = FeatureHasher(n_features=self.hashing)

        self.myvec.fit(X[self.cols].to_dict(orient='records'))
        return self

    def transform(self, X):
        # Vectorize input
        if self.hashing is None:
            return pd.DataFrame(
                self.myvec.transform(X[self.cols].to_dict(orient='records')),
                columns=self.myvec.feature_names_
            )
        else:
            return pd.DataFrame(
                self.myvec.transform(X[self.cols].to_dict(orient='records')).toarray()
            )
GridSearch hyperparameters:
search_params = {
    'preprocess__vectorize__hashing': [20, 40, 80],
    'predict__alpha': [.01, .1, 1, 2, 10]
}
pipeline:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('preprocess', FeatureUnion([
        ('scale', MyScaler(cols=numeric_predictors)),
        ('vectorize', MyVectorizer(cols=categorical_predictors, hashing=5))
    ])),
    ('predict', MultinomialNB())
])
And last, calling this with the GridSearchCV classifier:
grid_search = GridSearchCV(pipeline, search_params)
grid_search.fit(x_train, y_train)
I get a ValueError: Input X must be non-negative. I checked, and my numeric_predictors columns' data are all non-negative, so I am narrowing it down to an issue with the hashing of the categorical predictors.
ValueError Traceback (most recent call last)
<ipython-input-62-50522376d1e5> in <module>()
1 grid_search = GridSearchCV(pipeline, search_params)
----> 2 grid_search.fit(x_train, y_train)
3 grid_search.best_params_
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.pyc in fit(self, X, y, groups, **fit_params)
636 error_score=self.error_score)
637 for parameters, (train, test) in product(candidate_params,
--> 638 cv.split(X, y, groups)))
639
640 # if one choose to see train score, "out" will contain train score info
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
435 estimator.fit(X_train, **fit_params)
436 else:
--> 437 estimator.fit(X_train, y_train, **fit_params)
438
439 except Exception as e:
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit(self, X, y, **fit_params)
257 Xt, fit_params = self._fit(X, y, **fit_params)
258 if self._final_estimator is not None:
--> 259 self._final_estimator.fit(Xt, y, **fit_params)
260 return self
261
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in fit(self, X, y, sample_weight)
602 self.feature_count_ = np.zeros((n_effective_classes, n_features),
603 dtype=np.float64)
--> 604 self._count(X, Y)
605 alpha = self._check_alpha()
606 self._update_feature_log_prob(alpha)
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in _count(self, X, Y)
706 """Count and smooth feature occurrences."""
707 if np.any((X.data if issparse(X) else X) < 0):
--> 708 raise ValueError("Input X must be non-negative")
709 self.feature_count_ += safe_sparse_dot(Y.T, X)
710 self.class_count_ += Y.sum(axis=0)
ValueError: Input X must be non-negative
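For reference, a quick sanity check outside the pipeline (a hypothetical snippet, reusing MyVectorizer and the column lists from above) seems to confirm where the negative values come from:
import numpy as np

vec = MyVectorizer(cols=categorical_predictors, hashing=5)
hashed = vec.fit(x_train).transform(x_train)
# True -> FeatureHasher emits signed hash values
print(np.any(hashed.values < 0))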

Yes, when hashing is not None, FeatureHasher() is chosen, which can output negative values.
But you can convert those negative values to positive by using the non_negative parameter of FeatureHasher, as given in the documentation:
non_negative : boolean, optional, default False
    When True, an absolute value is applied to the features matrix prior
    to returning it. When used in conjunction with
    alternate_sign=True, this significantly reduces the inner product
    preservation property.
So change this line in MyVectorizer:
self.myvec = FeatureHasher(n_features=self.hashing)
to this:
self.myvec = FeatureHasher(n_features=self.hashing, non_negative=True)
Note:
This parameter has been deprecated since version 0.19 and will be removed in 0.21.
You need to study how this parameter will affect your classification problem.
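If you are on a newer scikit-learn where non_negative has already been removed, a minimal workaround sketch (assuming the MyVectorizer class from the question; the subclass name MyNonNegVectorizer is made up for illustration) is to take the absolute value of the hashed output yourself:
import numpy as np
import pandas as pd

class MyNonNegVectorizer(MyVectorizer):
    """Hypothetical variant of MyVectorizer: same fit(), but transform()
    applies np.abs to the hashed features so MultinomialNB receives
    non-negative input (same inner-product caveat as non_negative=True)."""
    def transform(self, X):
        hashed = self.myvec.transform(X[self.cols].to_dict(orient='records'))
        # .toarray() densifies the sparse matrix; np.abs flips negative hashes
        return pd.DataFrame(np.abs(hashed.toarray()))
Swapping this in for MyVectorizer inside the FeatureUnion leaves the grid-search parameter name ('preprocess__vectorize__hashing') unchanged, since the hashing attribute is inherited.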

Related

Feature-Engine RareLabelEncoder: ValueError: could not convert string to float: 'Rare'

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import RareLabelEncoder
from feature_engine.encoding import MeanEncoder
from feature_engine.encoding import OrdinalEncoder
from feature_engine.transformation import BoxCoxTransformer, PowerTransformer

high_card_cols = ['brand', 'model', 'location']
cat_cols = ['fuel_type', 'transmission', 'is_first_owner']

processor = make_column_transformer(
    (RareLabelEncoder(n_categories=9), ['brand', 'model']),  # to group rare categorical observations
    (MeanEncoder(), high_card_cols),  # to encode categorical observations with the target mean
    (OrdinalEncoder(), cat_cols),  # to encode low-cardinality variables
    (PowerTransformer(), ['milage_kmpl']),  # transform continuous variables using an exponential transformation
    (BoxCoxTransformer(), ['kilometers_driven', 'engine', 'power']),  # transform continuous variables using Box-Cox
    remainder="passthrough"
)
I am currently working on a regression task. I have two categorical columns with high cardinality and rare observations. I created a pipeline that includes a RareLabelEncoder followed by a MeanEncoder and other encoders.
When I try to fit a simple linear regression model, I get the following error:
ValueError: could not convert string to float: 'Rare'
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

lr_pipe = make_pipeline(
    processor,
    StandardScaler(),
    LinearRegression()
)
lr_pipe.fit(X_train, y_train.price)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-91-1c31eaf7c59a> in <module>
8 )
9
---> 10 lr_pipe.fit(X_train, y_train.price)
~\anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
339 """
340 fit_params_steps = self._check_fit_params(**fit_params)
--> 341 Xt = self._fit(X, y, **fit_params_steps)
342 with _print_elapsed_time('Pipeline',
343 self._log_message(len(self.steps) - 1)):
~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
301 cloned_transformer = clone(transformer)
302 # Fit or load from cache the current transformer
--> 303 X, fitted_transformer = fit_transform_one_cached(
304 cloned_transformer, X, y, None,
305 message_clsname='Pipeline',
~\anaconda3\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~\anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
700 else:
701 # fit method of arity 2 (supervised transformation)
--> 702 return self.fit(X, y, **fit_params).transform(X)
703
704
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y, sample_weight)
728 # Reset internal state before fitting
729 self._reset()
--> 730 return self.partial_fit(X, y, sample_weight)
731
732 def partial_fit(self, X, y=None, sample_weight=None):
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in partial_fit(self, X, y, sample_weight)
764 """
765 first_call = not hasattr(self, "n_samples_seen_")
--> 766 X = self._validate_data(X, accept_sparse=('csr', 'csc'),
767 estimator=self, dtype=FLOAT_DTYPES,
768 force_all_finite='allow-nan', reset=first_call)
~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
419 out = X
420 elif isinstance(y, str) and y == 'no_validation':
--> 421 X = check_array(X, **check_params)
422 out = X
423 else:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
671 array = array.astype(dtype, casting="unsafe", copy=False)
672 else:
--> 673 array = np.asarray(array, order=order, dtype=dtype)
674 except ComplexWarning as complex_warning:
675 raise ValueError("Complex data not supported\n"
~\anaconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
ValueError: could not convert string to float: 'Rare'
How to overcome this issue?
The beauty of Feature-engine transformers is that you can select the variables directly within each transformer, so there is no need to use sklearn's column transformer at all. You can place all Feature-engine transformers directly within a Pipeline:
lr_pipe = make_pipeline(
    RareLabelEncoder(n_categories=9, variables=['brand', 'model']),
    MeanEncoder(variables=high_card_cols),
    OrdinalEncoder(variables=cat_cols),
    # etc... (remaining Feature-engine transformers)
    StandardScaler(),
    LinearRegression()
)
lr_pipe.fit(X_train, y_train.price)
Update:
I managed to solve the problem the following way: I added the RareLabelEncoder to the pipeline rather than to the column transformer. This solved the issue for me.
lr_pipe = make_pipeline(
    RareLabelEncoder(0.002, variables=['brand', 'model']),
    nontree_processor,
    StandardScaler(),
    LinearRegression()
)
ColumnTransformer applies its transformers in parallel, so the brand column actually shows up twice coming out of the processor: once with rare labels grouped, but not otherwise encoded (throwing the error), and then again mean-encoded (but with rare groups getting different values). You can use pipelines to get around that:
cat_cols = ['fuel_type', 'transmission', 'is_first_owner']

brandmodel_pipe = make_pipeline(
    RareLabelEncoder(n_categories=9),
    MeanEncoder(),
)

processor = make_column_transformer(
    (brandmodel_pipe, ['brand', 'model']),
    (MeanEncoder(), ['location']),
    (OrdinalEncoder(), cat_cols),
    (PowerTransformer(), ['milage_kmpl']),
    (BoxCoxTransformer(), ['kilometers_driven', 'engine', 'power']),
    remainder="passthrough"
)
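With that nested pipeline inside the column transformer, the original lr_pipe from the question should fit without the 'Rare' error (a minimal sketch, reusing X_train and y_train from the question):
lr_pipe = make_pipeline(
    processor,        # the column transformer with brandmodel_pipe above
    StandardScaler(),
    LinearRegression()
)
lr_pipe.fit(X_train, y_train.price)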

TypeError, Hands-On ML with Scikit-Learn Regression

Trying to work through the project in Chapter 2 of Hands-On ML with Scikit-Learn and TensorFlow, I am getting a TypeError when trying to run the data through a pipeline prior to building a model.
I keep getting a TypeError telling me that fit_transform() takes 2 positional arguments yet 3 were given. I'm not sure what I am doing wrong, as I am following along as best I can with the exercise. Please advise, or let me know if more information is needed; I tried to stick to the minimum amount of code required to reproduce the error. Thanks for whatever insight you can kindly provide.
Code is as follows:
#Load the Data
import os
import tarfile
from six.moves import urllib

download_root = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
housing_path = os.path.join('datasets', 'housing')
housing_url = download_root + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=housing_url, housing_path=housing_path):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
# Pipeline Build
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        return x[self.attribute_names].values

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_house = X[:, rooms_ix] / X[:, household_ix]
        pop_per_house = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_house, pop_per_house, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_house, pop_per_house]

num_attribs = list(house_num)
cat_attribs = ['ocean_proximity']

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', LabelBinarizer())
])

from sklearn.compose import ColumnTransformer
full_pipeline = ColumnTransformer([
    ('num_pipeline', num_pipeline, num_attribs),
    ('cat_pipeline', cat_pipeline, cat_attribs),
])
The TypeError encountered is as follows:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-50-925e65d2e69a> in <module>
----> 1 house_prep = full_pipeline.fit_transform(house)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in fit_transform(self, X, y)
474 self._validate_remainder(X)
475
--> 476 result = self._fit_transform(X, y, _fit_transform_one)
477
478 if not result:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in _fit_transform(self, X, y, func, fitted)
418 message=self._log_message(name, idx, len(transformers)))
419 for idx, (name, trans, column, weight) in enumerate(
--> 420 self._iter(fitted=fitted, replace_strings=True), 1))
421 except ValueError as e:
422 if "Expected 2D array, got 1D array instead" in str(e):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
922 self._iterating = self._original_iterator is not None
923
--> 924 while self.dispatch_one_batch(iterator):
925 pass
926
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
391 return Xt
392 if hasattr(last_step, 'fit_transform'):
--> 393 return last_step.fit_transform(Xt, y, **fit_params)
394 else:
395 return last_step.fit(Xt, y, **fit_params).transform(Xt)
TypeError: fit_transform() takes 2 positional arguments but 3 were given

GridSearchCV parameters

I'm trying to use GridSearchCV with KMeans clustering to explore the optimal number of clusters to use in order to get the best results on a classification problem.
I've got the following code:
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

faces = fetch_olivetti_faces()
X_data, y_data = faces.data, faces.target

log_reg = LogisticRegression()

split = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=42)
for train_index, test_index in split.split(X_data, y_data):
    X_train_set, y_train_set = X_data[train_index,], y_data[train_index,]
    X_test_set, y_test_set = X_data[test_index,], y_data[test_index,]

pipeline = Pipeline([
    ('kmeans', KMeans(n_clusters=30)),
    ('log_reg', LogisticRegression())
])

cluster_grid = dict(n_clusters=range(2,100))
grid = GridSearchCV(pipeline, cluster_grid)
grid.fit(X_train_set, y_train_set, cv=5, verbose=2)
Here's the entire traceback:
-------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-42-80e6a3932897> in <module>
----> 1 grid.fit(X_train_set, y_train_set, cv=5, verbose=2)
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
686 return results
687
--> 688 self._run_search(evaluate_candidates)
689
690 # For multi-metric evaluation, store the best_index_, best_params_ and
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1147 def _run_search(self, evaluate_candidates):
1148 """Search all candidates in param_grid"""
-> 1149 evaluate_candidates(ParameterGrid(self.param_grid))
1150
1151
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
665 for parameters, (train, test)
666 in product(candidate_params,
--> 667 cv.split(X, y, groups)))
668
669 if len(out) < 1:
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
501 train_scores = {}
502 if parameters is not None:
--> 503 estimator.set_params(**parameters)
504
505 start_time = time.time()
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in set_params(self, **kwargs)
162 self
163 """
--> 164 self._set_params('steps', **kwargs)
165 return self
166
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in _set_params(self, attr, **params)
48 self._replace_estimator(attr, name, params.pop(name))
49 # 3. Step parameters and other initialisation arguments
---> 50 super().set_params(**params)
51 return self
52
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/base.py in set_params(self, **params)
222 'Check the list of available parameters '
223 'with `estimator.get_params().keys()`.' %
--> 224 (key, self))
225
226 if delim:
ValueError: Invalid parameter n_clusters for estimator Pipeline(memory=None,
steps=[('kmeans',
KMeans(algorithm='auto', copy_x=True, init='k-means++',
max_iter=300, n_clusters=30, n_init=10, n_jobs=None,
precompute_distances='auto', random_state=None,
tol=0.0001, verbose=0)),
('log_reg',
LogisticRegression(C=1.0, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None,
penalty='l2', random_state=None,
solver='warn', tol=0.0001, verbose=0,
warm_start=False))],
verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.
I have no idea what the heck is going on... I'm not sure how to interpret this error message, but my parameter grid doesn't seem to be out of whack. PLEASE HELP!
When you are using a pipeline, you need to prefix the parameter names with the step name, as follows:
cluster_grid = {
    'kmeans__n_clusters': range(2,100)
}
# adding n_jobs to run in parallel
grid = GridSearchCV(pipeline, cluster_grid, n_jobs=-1)
where kmeans is taken from ('kmeans', KMeans()).
So your code should look like the following:
pipeline = Pipeline([
    ('kmeans', KMeans()),
    ('log_reg', LogisticRegression())
])

cluster_grid = {
    'kmeans__n_clusters': range(2,100)
}

# adding n_jobs to run in parallel
grid = GridSearchCV(pipeline, cluster_grid, n_jobs=-1)
The parameter n_clusters is only applicable to KMeans, not to LogisticRegression.
Specify in your cluster_grid that the parameter is meant only for KMeans:
# Parameters of pipelines can be set using ‘__’ separated parameter names:
cluster_grid = dict(kmeans__n_clusters=range(2,100))
Reference : https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
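Putting both answers together, a minimal end-to-end sketch (assuming the X_train_set / y_train_set from the question; note that cv and verbose belong in the GridSearchCV constructor, not in fit):
pipeline = Pipeline([
    ('kmeans', KMeans()),
    ('log_reg', LogisticRegression())
])

# Parameters of pipeline steps are addressed as <step_name>__<param_name>
cluster_grid = dict(kmeans__n_clusters=range(2, 100))

grid = GridSearchCV(pipeline, cluster_grid, cv=5, verbose=2, n_jobs=-1)
grid.fit(X_train_set, y_train_set)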

TypeError: take(): argument 'index' (position 1) must be Tensor, not numpy.ndarray

I'm new to PyTorch. I'm trying to do cross-validation, and I found the skorch library, which allows users to use sklearn functions with a torch model. So I define a neural network class:
import torch
import torch.nn as nn
from torch.autograd import Variable
from skorch import NeuralNet

torch.manual_seed(42)

class Netcross(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(5, 30)
        self.sig1 = nn.Tanh()
        #self.dout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(30, 30)
        self.sig2 = nn.Sigmoid()
        self.out = nn.Linear(30, 1)
        self.out_act = nn.Sigmoid()
        #self.fc1.weight = torch.nn.Parameter(torch.rand(50, 5))

    def forward(self, x):
        x = self.fc1(x)
        x = self.sig1(x)
        #x = self.dout(x)
        x = self.fc2(x)
        x = self.sig2(x)
        x = self.out(x)
        y = self.out_act(x)
        return y

crossnet1 = NeuralNet(
    Netcross,
    max_epochs=5,
    criterion=torch.nn.BCELoss,
    # user-defined callbacks
    callbacks=[epoch_acc, epoch_f1, epoch_phi],
    optimizer=torch.optim.SGD,
    optimizer__momentum=0.9,
    lr=0.85,
)

inputs = Variable(x_traintensor)
labels = Variable(y_traintensor)

crossnet1.fit(inputs, labels)
So far everything is fine: the function returns credible results without any errors. The problem appears when I try to use the GridSearchCV function:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_epochs': [5, 10, 20],
    'lr': [0.1, 0.65, 0.8],
}

gs = GridSearchCV(estimator=crossnet1, param_grid=param_grid, refit=False, cv=3, scoring="accuracy")
gs.fit(inputs, labels)
I get the following error:
TypeError Traceback (most recent call last)
<ipython-input-41-e1f3dbd9a2b0> in <module>
3 labels1 = torch.from_numpy(np.array(labels))
4
----> 5 gs.fit(inputs1, labels1)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
516 start_time = time.time()
517
--> 518 X_train, y_train = _safe_split(estimator, X, y, train)
519 X_test, y_test = _safe_split(estimator, X, y, test, train)
520
~\Anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
201 X_subset = X[np.ix_(indices, train_indices)]
202 else:
--> 203 X_subset = safe_indexing(X, indices)
204
205 if y is not None:
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices)
214 indices.dtype.kind == 'i'):
215 # This is often substantially faster than X[indices]
--> 216 return X.take(indices, axis=0)
217 else:
218 return X[indices]
TypeError: take(): argument 'index' (position 1) must be Tensor, not numpy.ndarray
What is wrong?
Change your input and labels to np.ndarray (see examples here).
Those will be cast to torch.Tensor automatically by skorch when needed.
All in all, change your
inputs = Variable(x_traintensor)
labels = Variable(y_traintensor)
to:
inputs = x_traintensor.numpy()  # assuming x_traintensor is a torch.Tensor
labels = y_traintensor.numpy()  # assuming y_traintensor is a torch.Tensor
BTW, torch.autograd.Variable is deprecated; you should use torch.tensor(data, requires_grad=True) instead. In this case, inputs and labels do not need gradients, hence Variable is even more out of place.
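So the grid-search part would look something like this (a minimal sketch, assuming x_traintensor / y_traintensor are torch.Tensor objects and crossnet1 / param_grid are defined as above):
inputs = x_traintensor.numpy()   # skorch casts NumPy arrays back to tensors internally
labels = y_traintensor.numpy()

gs = GridSearchCV(estimator=crossnet1, param_grid=param_grid,
                  refit=False, cv=3, scoring="accuracy")
gs.fit(inputs, labels)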

Sklearn_pandas in a pipeline returns TypeError: 'builtin_function_or_method' object is not iterable

I have a data set with categorical and numerical features on which I want to apply some transformations followed by XGBClassifier.
Link to data set: https://www.kaggle.com/blastchar/telco-customer-churn
As the transformations are different for the numerical and categorical features, I used sklearn_pandas and its DataFrameMapper.
To perform one-hot encoding on the categorical features, I want to use DictVectorizer. But to use DictVectorizer, I first need to convert the dataframe into a dict, which I try to do with a custom transformer Dictifier.
When I run the Pipeline I get the error 'builtin_function_or_method' object is not iterable. Does anyone know what might be causing this error?
import numpy as np
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer
from sklearn_pandas import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
import xgboost as xgb

# Importing the data
df = pd.read_csv('../input/WA_Fn-UseC_-Telco-Customer-Churn.csv', na_values=' ')
X, y = df.iloc[:, 1:-1], df.iloc[:, -1]

# Label encoding of the target classes
le = LabelEncoder()
y = le.fit_transform(y.values.reshape(y.shape[0], ))

# Defining the num and cat column names
cat_cols = X.columns[X.dtypes == object].tolist()
num_cols = X.columns[X.dtypes != object].tolist()

# DataFrameMappers for num and cat columns
num_transf_mapper = DataFrameMapper([([num_col], [Imputer(strategy="median"), StandardScaler()]) for num_col in num_cols],
                                    input_df=True,
                                    df_out=True)

cat_transf_mapper = DataFrameMapper([(cat_col, [CategoricalImputer()]) for cat_col in cat_cols],
                                    input_df=True,
                                    df_out=True)

# FeatureUnion of num and cat columns
num_cat_union = FeatureUnion([("num_mapper", num_transf_mapper),
                              ("cat_mapper", cat_transf_mapper)])

# Custom transformer to convert a pandas DataFrame into a dict (needed for DictVectorizer)
class Dictifier(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.to_dict('records')

# Pipeline
pipeline = Pipeline([("featureunion", num_cat_union),
                     ("dictifier", Dictifier()),
                     ("vectorizer", DictVectorizer(sort=False)),
                     ("clf", xgb.XGBClassifier(max_depth=3))])

# Perform cross-validation
cross_val_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=3)
Error trace
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py:542: FutureWarning: From version 0.22, errors during fit will result in a cross validation score of NaN by default. Use error_score='raise' if you want an exception raised or error_score=np.nan to adopt the behavior from version 0.22.
  FutureWarning)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-187-96272018fb87> in <module>()
53
54 # Perform cross-validation
---> 55 cross_val_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=3)
/opt/conda/lib/python3.6/site-packages/sklearn_pandas/cross_validation.py in cross_val_score(model, X, *args, **kwargs)
19 warnings.warn(DEPRECATION_MSG, DeprecationWarning)
20 X = DataWrapper(X)
---> 21 return sk_cross_val_score(model, X, *args, **kwargs)
22
23
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
400 fit_params=fit_params,
401 pre_dispatch=pre_dispatch,
--> 402 error_score=error_score)
403 return cv_results['test_score']
404
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
238 return_times=True, return_estimator=return_estimator,
239 error_score=error_score)
--> 240 for train, test in cv.split(X, y, groups))
241
242 zipped_scores = list(zip(*scores))
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
979 # remaining jobs.
980 self._iterating = False
--> 981 if self.dispatch_one_batch(iterator):
982 self._iterating = self._original_iterator is not None
983
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
821 return False
822 else:
--> 823 self._dispatch(tasks)
824 return True
825
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
778 with self._lock:
779 job_idx = len(self._jobs)
--> 780 job = self._backend.apply_async(batch, callback=cb)
781 # A job can complete so quickly than its callback is
782 # called before we get here, causing self._jobs to
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
181 def apply_async(self, func, callback=None):
182 """Schedule a func to be run"""
--> 183 result = ImmediateResult(func)
184 if callback:
185 callback(result)
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
541 # Don't delay the application, to avoid keeping the input
542 # arguments in memory
--> 543 self.results = batch()
544
545 def get(self):
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
526 estimator.fit(X_train, **fit_params)
527 else:
--> 528 estimator.fit(X_train, y_train, **fit_params)
529
530 except Exception as e:
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
263 This estimator
264 """
--> 265 Xt, fit_params = self._fit(X, y, **fit_params)
266 if self._final_estimator is not None:
267 self._final_estimator.fit(Xt, y, **fit_params)
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
228 Xt, fitted_transformer = fit_transform_one_cached(
229 cloned_transformer, Xt, y, None,
--> 230 **fit_params_steps[name])
231 # Replace the transformer of the step with the fitted
232 # transformer. This is necessary when loading the transformer
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/memory.py in __call__(self, *args, **kwargs)
320
321 def __call__(self, *args, **kwargs):
--> 322 return self.func(*args, **kwargs)
323
324 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
790 delayed(_fit_transform_one)(trans, X, y, weight,
791 **fit_params)
--> 792 for name, trans, weight in self._iter())
793
794 if not result:
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
979 # remaining jobs.
980 self._iterating = False
--> 981 if self.dispatch_one_batch(iterator):
982 self._iterating = self._original_iterator is not None
983
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
821 return False
822 else:
--> 823 self._dispatch(tasks)
824 return True
825
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
778 with self._lock:
779 job_idx = len(self._jobs)
--> 780 job = self._backend.apply_async(batch, callback=cb)
781 # A job can complete so quickly than its callback is
782 # called before we get here, causing self._jobs to
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
181 def apply_async(self, func, callback=None):
182 """Schedule a func to be run"""
--> 183 result = ImmediateResult(func)
184 if callback:
185 callback(result)
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
541 # Don't delay the application, to avoid keeping the input
542 # arguments in memory
--> 543 self.results = batch()
544
545 def get(self):
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
460 else:
461 # fit method of arity 2 (supervised transformation)
--> 462 return self.fit(X, y, **fit_params).transform(X)
463
464
/opt/conda/lib/python3.6/site-packages/sklearn_pandas/dataframe_mapper.py in transform(self, X)
342 stacked,
343 columns=self.transformed_names_,
--> 344 index=index)
345 # preserve types
346 for col, dtype in zip(self.transformed_names_, dtypes):
/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
377 else:
378 mgr = self._init_ndarray(data, index, columns, dtype=dtype,
--> 379 copy=copy)
380 elif isinstance(data, (list, types.GeneratorType)):
381 if isinstance(data, types.GeneratorType):
/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py in _init_ndarray(self, values, index, columns, dtype, copy)
525 raise_with_traceback(e)
526
--> 527 index, columns = _get_axes(*values.shape)
528 values = values.T
529
/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py in _get_axes(N, K, index, columns)
482 index = com._default_index(N)
483 else:
--> 484 index = _ensure_index(index)
485
486 if columns is None:
/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in _ensure_index(index_like, copy)
4972 index_like = copy(index_like)
4973
-> 4974 return Index(index_like)
4975
4976
/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, fastpath, tupleize_cols, **kwargs)
449 data, names=name or kwargs.get('names'))
450 # other iterable of some kind
--> 451 subarr = com._asarray_tuplesafe(data, dtype=object)
452 return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs)
453
/opt/conda/lib/python3.6/site-packages/pandas/core/common.py in _asarray_tuplesafe(values, dtype)
303
304 if not (isinstance(values, (list, tuple)) or hasattr(values, '__array__')):
--> 305 values = list(values)
306 elif isinstance(values, Index):
307 return values.values
TypeError: 'builtin_function_or_method' object is not iterable
This seems like a bug in sklearn_pandas.cross_val_score.
sklearn_pandas wraps the dataframe you supply in a DataWrapper object, as seen in the source code here:
def cross_val_score(model, X, *args, **kwargs):
    warnings.warn(DEPRECATION_MSG, DeprecationWarning)
    X = DataWrapper(X)
    return sk_cross_val_score(model, X, *args, **kwargs)
which is apparently done to handle older versions of sklearn.cross_validation.cross_val_score, which did not handle pandas DataFrames well. DataWrapper returns a list instance when split into train and test.
But then it is not handled correctly during transform() of DataFrameMapper, as seen in the source code here:
if self.df_out:
    # if no rows were dropped preserve the original index,
    # otherwise use a new integer one
    no_rows_dropped = len(X) == len(stacked)
    if no_rows_dropped:
        index = X.index  # <== This here is the source of the error
    else:
        index = None
Here, X is not a DataFrame but a list object, so X.index is not the actual pandas index as intended, but the built-in index method of the list (a function), hence the error you got.
But since the newer sklearn cross_val_score handles DataFrames correctly, you don't have to use the sklearn_pandas import.
Change it from:
from sklearn_pandas import cross_val_score
to this:
from sklearn.model_selection import cross_val_score
So now you won't get that error anymore.
But still, further down the code, you will get another error:
AttributeError: 'numpy.ndarray' object has no attribute 'to_dict'
This is because you wrap both your DataFrameMapper objects into a FeatureUnion by doing this:
num_cat_union = FeatureUnion([("num_mapper", num_transf_mapper),
                              ("cat_mapper", cat_transf_mapper)])
and then do this:
pipeline = Pipeline([("featureunion", num_cat_union),
                     ("dictifier", Dictifier()),
                     ("vectorizer", DictVectorizer(sort=False)),
                     ("clf", xgb.XGBClassifier(max_depth=3))])
Your Dictifier expects a DataFrame to be passed to it so that it can call to_dict() on it, but the previous step in the pipeline, the FeatureUnion, will not preserve the DataFrame; it will convert it into a numpy array.
Generally, DataFrameMapper and FeatureUnion don't work well together. I would advise you to remove the FeatureUnion altogether and instead combine both DataFrameMapper objects into a single one. This will effectively work the way you wanted FeatureUnion to work.
Something like this:
transformers = []

# Combine both your operations here only
transformers.extend([([num_col], [Imputer(strategy="median"),
                                  StandardScaler()]) for num_col in num_cols])
transformers.extend([(cat_col, [CategoricalImputer()]) for cat_col in cat_cols])

num_cat_union = DataFrameMapper(transformers,
                                input_df=True,
                                df_out=True)

# Your other code
...
...
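A hedged follow-up sketch of how the single combined mapper could then replace the FeatureUnion step in the original pipeline (reusing Dictifier, DictVectorizer and the classifier from the question):
pipeline = Pipeline([
    ("mapper", num_cat_union),           # the combined DataFrameMapper above
    ("dictifier", Dictifier()),          # mapper keeps df_out=True, so to_dict('records') still works
    ("vectorizer", DictVectorizer(sort=False)),
    ("clf", xgb.XGBClassifier(max_depth=3))
])

cross_val_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=3)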
Let me show just part of the code, the way I do it:
class MultiColumn(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns  # array of column names to encode
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.columns]

NUMERIC = df[['var1', 'var2']]
CATEGORICAL = df[['var3', 'var4']]

class Imputation(BaseEstimator, TransformerMixin):
    def transform(self, X, y=None, **fit_params):
        return X.fillna(NUMERIC.median())
    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)
    def fit(self, X, y=None, **fit_params):
        return self

class Cat(BaseEstimator, TransformerMixin):
    def transform(self, X, y=None, **fit_params):
        enc = DictVectorizer(sparse=False)
        encc = enc.fit(CATEGORICAL.T.to_dict().values())
        enc_data = encc.transform(X.T.to_dict().values())
        enc_data[np.isnan(enc_data)] = 1
        return enc_data
    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)
    def fit(self, X, y=None, **fit_params):
        return self
And the Pipeline:
pipeline = Pipeline([
    # Use FeatureUnion to combine the features
    ('union', FeatureUnion(
        transformer_list=[
            # numeric
            ('numeric', Pipeline([
                ('selector', MultiColumn(columns=['var1', 'var2'])),
                ('imp', Imputation()),
                ('scaling', preprocessing.StandardScaler(with_mean=0.))
            ])),
            # categorical
            ('categorical', Pipeline([
                ('selector', MultiColumn(columns=['var3', 'var4'])),
                ('cat_imputer', CategoricalImputer()),
                ('one_hot', Cat())
            ])),
        ])),
    ('model_fitting', xgb.XGBClassifier(max_depth=3)),
])
All together in a pipeline. I hope this might help.
# Import necessary modules
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer

# Check number of nulls in each feature column
nulls_per_column = X.isnull().sum()
print(nulls_per_column)

# Create a boolean mask for categorical columns
categorical_feature_mask = X.dtypes == object

# Get list of categorical column names
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Get list of non-categorical column names
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Apply numeric imputer
numeric_imputation_mapper = DataFrameMapper(
    [([numeric_feature], Imputer(strategy="median")) for numeric_feature in non_categorical_columns],
    input_df=True,
    df_out=True
)

# Apply categorical imputer
categorical_imputation_mapper = DataFrameMapper(
    [(category_feature, CategoricalImputer()) for category_feature in categorical_columns],
    input_df=True,
    df_out=True
)

# Import FeatureUnion
from sklearn.pipeline import FeatureUnion

# Combine the numeric and categorical transformations
numeric_categorical_union = FeatureUnion([
    ("num_mapper", numeric_imputation_mapper),
    ("cat_mapper", categorical_imputation_mapper)
])

# Create full pipeline
pipeline = Pipeline([
    ("featureunion", numeric_categorical_union),
    ("dictifier", Dictifier()),
    ("vectorizer", DictVectorizer(sort=False)),
    ("clf", xgb.XGBClassifier())
])

# Perform cross-validation
cross_val_scores = cross_val_score(pipeline, kidney_data, y, scoring="roc_auc", cv=3)

# Print avg. AUC
print("3-fold AUC: ", np.mean(cross_val_scores))
