from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import RareLabelEncoder
from feature_engine.encoding import MeanEncoder
from feature_engine.encoding import OrdinalEncoder
from feature_engine.transformation import BoxCoxTransformer, PowerTransformer
high_card_cols = ['brand', 'model', 'location']
cat_cols = ['fuel_type', 'transmission', 'is_first_owner']
processor = make_column_transformer(
    (RareLabelEncoder(n_categories=9), ['brand', 'model']),  # group rare categorical observations
    (MeanEncoder(), high_card_cols),  # encode high-cardinality categoricals with the target mean
    (OrdinalEncoder(), cat_cols),  # encode low-cardinality variables
    (PowerTransformer(), ['milage_kmpl']),  # transform continuous variables with an exponential (power) transformation
    (BoxCoxTransformer(), ['kilometers_driven', 'engine', 'power']),  # transform continuous variables with Box-Cox
    remainder="passthrough"
)
I am currently working on a regression task. I have two categorical columns with high cardinality and rare observations. I created a pipeline that includes a RareLabelEncoder followed by a MeanEncoder and other encoders.
When I try to fit a simple linear regression model, I get the following error:
ValueError: could not convert string to float: 'Rare'
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
lr_pipe = make_pipeline(
    processor,
    StandardScaler(),
    LinearRegression()
)
lr_pipe.fit(X_train, y_train.price)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-91-1c31eaf7c59a> in <module>
8 )
9
---> 10 lr_pipe.fit(X_train, y_train.price)
~\anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
339 """
340 fit_params_steps = self._check_fit_params(**fit_params)
--> 341 Xt = self._fit(X, y, **fit_params_steps)
342 with _print_elapsed_time('Pipeline',
343 self._log_message(len(self.steps) - 1)):
~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
301 cloned_transformer = clone(transformer)
302 # Fit or load from cache the current transformer
--> 303 X, fitted_transformer = fit_transform_one_cached(
304 cloned_transformer, X, y, None,
305 message_clsname='Pipeline',
~\anaconda3\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~\anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
700 else:
701 # fit method of arity 2 (supervised transformation)
--> 702 return self.fit(X, y, **fit_params).transform(X)
703
704
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y, sample_weight)
728 # Reset internal state before fitting
729 self._reset()
--> 730 return self.partial_fit(X, y, sample_weight)
731
732 def partial_fit(self, X, y=None, sample_weight=None):
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in partial_fit(self, X, y, sample_weight)
764 """
765 first_call = not hasattr(self, "n_samples_seen_")
--> 766 X = self._validate_data(X, accept_sparse=('csr', 'csc'),
767 estimator=self, dtype=FLOAT_DTYPES,
768 force_all_finite='allow-nan', reset=first_call)
~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
419 out = X
420 elif isinstance(y, str) and y == 'no_validation':
--> 421 X = check_array(X, **check_params)
422 out = X
423 else:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
671 array = array.astype(dtype, casting="unsafe", copy=False)
672 else:
--> 673 array = np.asarray(array, order=order, dtype=dtype)
674 except ComplexWarning as complex_warning:
675 raise ValueError("Complex data not supported\n"
~\anaconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
ValueError: could not convert string to float: 'Rare'
How can I overcome this issue?
The beauty of Feature-engine transformers is that you can select the variables directly within each transformer, so there is no need to use sklearn's ColumnTransformer at all. You can place all Feature-engine transformers directly within a Pipeline.
lr_pipe = make_pipeline(
    RareLabelEncoder(n_categories=9, variables=['brand', 'model']),
    MeanEncoder(variables=high_card_cols),
    OrdinalEncoder(variables=cat_cols),
    # etc...
    StandardScaler(),
    LinearRegression()
)
lr_pipe.fit(X_train, y_train.price)
Update:
I managed to solve the problem as follows: I added the RareLabelEncoder to the pipeline itself rather than to the ColumnTransformer. This solved the issue for me.
lr_pipe = make_pipeline(
    RareLabelEncoder(0.002, variables=['brand', 'model']),
    nontree_processor,
    StandardScaler(),
    LinearRegression()
)
ColumnTransformer applies its transformers in parallel, so the brand column actually shows up twice coming out of the processor: once with rare labels grouped, but not otherwise encoded (throwing the error), and then again mean-encoded (but with rare groups getting different values). You can use pipelines to get around that:
cat_cols = ['fuel_type', 'transmission', 'is_first_owner']
brandmodel_pipe = make_pipeline(
    RareLabelEncoder(n_categories=9),
    MeanEncoder(),
)
processor = make_column_transformer(
    (brandmodel_pipe, ['brand', 'model']),
    (MeanEncoder(), ['location']),
    (OrdinalEncoder(), cat_cols),
    (PowerTransformer(), ['milage_kmpl']),
    (BoxCoxTransformer(), ['kilometers_driven', 'engine', 'power']),
    remainder="passthrough"
)
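With this processor in place, the original pipeline fits cleanly. A minimal sketch, assuming the same X_train and y_train as in the question:
lr_pipe = make_pipeline(processor, StandardScaler(), LinearRegression())
lr_pipe.fit(X_train, y_train.price)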
Trying to work through the project in Chapter 2 of Hands-On Machine Learning with Scikit-Learn and TensorFlow, I am getting a TypeError when trying to run the data through a pipeline prior to building a model.
The TypeError tells me that fit_transform() takes 2 positional arguments yet 3 are given. I am not sure what I am doing wrong, as I am following along with the exercise as best I can. Please let me know if more information is needed; I tried to stick to the minimum amount of code required to reproduce the error. Thanks for any insight you can kindly provide.
The code is as follows:
# Load the data
import os
import tarfile
from six.moves import urllib

download_root = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
housing_path = os.path.join('datasets', 'housing')
housing_url = download_root + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=housing_url, housing_path=housing_path):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
# Pipeline build
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        return x[self.attribute_names].values

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_house = X[:, rooms_ix] / X[:, household_ix]
        pop_per_house = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_house, pop_per_house, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_house, pop_per_house]

num_attribs = list(house_num)
cat_attribs = ['ocean_proximity']

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', LabelBinarizer())
])

from sklearn.compose import ColumnTransformer

full_pipeline = ColumnTransformer([
    ('num_pipeline', num_pipeline, num_attribs),
    ('cat_pipeline', cat_pipeline, cat_attribs),
])
The TypeError encountered is as follows:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-50-925e65d2e69a> in <module>
----> 1 house_prep = full_pipeline.fit_transform(house)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in fit_transform(self, X, y)
474 self._validate_remainder(X)
475
--> 476 result = self._fit_transform(X, y, _fit_transform_one)
477
478 if not result:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in _fit_transform(self, X, y, func, fitted)
418 message=self._log_message(name, idx, len(transformers)))
419 for idx, (name, trans, column, weight) in enumerate(
--> 420 self._iter(fitted=fitted, replace_strings=True), 1))
421 except ValueError as e:
422 if "Expected 2D array, got 1D array instead" in str(e):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
922 self._iterating = self._original_iterator is not None
923
--> 924 while self.dispatch_one_batch(iterator):
925 pass
926
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
391 return Xt
392 if hasattr(last_step, 'fit_transform'):
--> 393 return last_step.fit_transform(Xt, y, **fit_params)
394 else:
395 return last_step.fit(Xt, y, **fit_params).transform(Xt)
TypeError: fit_transform() takes 2 positional arguments but 3 were given
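The step that fails is the label_binarizer: LabelBinarizer.fit_transform has the signature fit_transform(self, y), while Pipeline calls every transformer as fit_transform(X, y), i.e. with three positional arguments. A common workaround is to wrap LabelBinarizer in a transformer that tolerates the extra argument; a minimal sketch, where CustomLabelBinarizer is a name introduced here for illustration, not part of scikit-learn:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer

class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    """Wrap LabelBinarizer so it accepts the (X, y) calling convention of Pipeline."""
    def fit(self, X, y=None):
        self.encoder_ = LabelBinarizer()
        self.encoder_.fit(np.ravel(X))  # flatten the (n, 1) output of DataFrameSelector
        return self
    def transform(self, X):
        return self.encoder_.transform(np.ravel(X))

# then in cat_pipeline, use it in place of LabelBinarizer:
# ('label_binarizer', CustomLabelBinarizer())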
I'm trying to use GridSearchCV with KMeans clustering to explore the optimal number of clusters to use in order to get the best results on a classification problem.
I've got the following code:
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
faces = fetch_olivetti_faces()
X_data, y_data = faces.data, faces.target

log_reg = LogisticRegression()
split = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=42)
for train_index, test_index in split.split(X_data, y_data):
    X_train_set, y_train_set = X_data[train_index], y_data[train_index]
    X_test_set, y_test_set = X_data[test_index], y_data[test_index]

pipeline = Pipeline([
    ('kmeans', KMeans(n_clusters=30)),
    ('log_reg', LogisticRegression())
])

cluster_grid = dict(n_clusters=range(2, 100))
grid = GridSearchCV(pipeline, cluster_grid)
grid.fit(X_train_set, y_train_set, cv=5, verbose=2)
Here's the entire traceback:
-------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-42-80e6a3932897> in <module>
----> 1 grid.fit(X_train_set, y_train_set, cv=5, verbose=2)
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
686 return results
687
--> 688 self._run_search(evaluate_candidates)
689
690 # For multi-metric evaluation, store the best_index_, best_params_ and
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1147 def _run_search(self, evaluate_candidates):
1148 """Search all candidates in param_grid"""
-> 1149 evaluate_candidates(ParameterGrid(self.param_grid))
1150
1151
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
665 for parameters, (train, test)
666 in product(candidate_params,
--> 667 cv.split(X, y, groups)))
668
669 if len(out) < 1:
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
501 train_scores = {}
502 if parameters is not None:
--> 503 estimator.set_params(**parameters)
504
505 start_time = time.time()
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in set_params(self, **kwargs)
162 self
163 """
--> 164 self._set_params('steps', **kwargs)
165 return self
166
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in _set_params(self, attr, **params)
48 self._replace_estimator(attr, name, params.pop(name))
49 # 3. Step parameters and other initialisation arguments
---> 50 super().set_params(**params)
51 return self
52
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/base.py in set_params(self, **params)
222 'Check the list of available parameters '
223 'with `estimator.get_params().keys()`.' %
--> 224 (key, self))
225
226 if delim:
ValueError: Invalid parameter n_clusters for estimator Pipeline(memory=None,
steps=[('kmeans',
KMeans(algorithm='auto', copy_x=True, init='k-means++',
max_iter=300, n_clusters=30, n_init=10, n_jobs=None,
precompute_distances='auto', random_state=None,
tol=0.0001, verbose=0)),
('log_reg',
LogisticRegression(C=1.0, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None,
penalty='l2', random_state=None,
solver='warn', tol=0.0001, verbose=0,
warm_start=False))],
verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.
I have no idea what the heck is going on... I'm not sure how to interpret this error message, but my parameter grid doesn't seem to be out of whack. PLEASE HELP!
When you are using a pipeline, you need to prefix each parameter name with the name of the pipeline step it belongs to:
cluster_grid = {
    'kmeans__n_clusters': range(2, 100)
}
# adding n_jobs to run in parallel
grid = GridSearchCV(pipeline, cluster_grid, n_jobs=-1)
where kmeans is taken from the step name in ('kmeans', KMeans()).
So, your code should look as the following:
pipeline = Pipeline([
    ('kmeans', KMeans()),
    ('log_reg', LogisticRegression())
])

cluster_grid = {
    'kmeans__n_clusters': range(2, 100)
}
# adding n_jobs to run in parallel
grid = GridSearchCV(pipeline, cluster_grid, n_jobs=-1)
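Note also that cv and verbose are constructor arguments of GridSearchCV, not of fit, so grid.fit(X_train_set, y_train_set, cv=5, verbose=2) would raise a separate error. A minimal sketch:
grid = GridSearchCV(pipeline, cluster_grid, cv=5, verbose=2, n_jobs=-1)
grid.fit(X_train_set, y_train_set)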
The parameter n_clusters only applies to KMeans, not to LogisticRegression.
Specify in your cluster_grid that the grid parameter is meant only for the kmeans step:
# Parameters of pipelines can be set using ‘__’ separated parameter names:
cluster_grid = dict(kmeans__n_clusters=range(2,100))
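As the error message itself suggests, you can list every parameter name the pipeline accepts; a quick sketch:
# valid grid keys follow the <step>__<param> pattern
print(sorted(pipeline.get_params().keys()))
# ... includes 'kmeans__n_clusters', 'log_reg__C', and so on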
Reference : https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
I'm new to PyTorch. I'm trying to do cross-validation, and I found the skorch library, which allows users to use sklearn functions with a torch model. So, I define a neural network class:
import torch
import torch.nn as nn

torch.manual_seed(42)

class Netcross(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(5, 30)
        self.sig1 = nn.Tanh()
        #self.dout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(30, 30)
        self.sig2 = nn.Sigmoid()
        self.out = nn.Linear(30, 1)
        self.out_act = nn.Sigmoid()
        #self.fc1.weight = torch.nn.Parameter(torch.rand(50,5))

    def forward(self, x):
        x = self.fc1(x)
        x = self.sig1(x)
        #x = self.dout(x)
        x = self.fc2(x)
        x = self.sig2(x)
        x = self.out(x)
        y = self.out_act(x)
        return y
from skorch import NeuralNet
from torch.autograd import Variable

crossnet1 = NeuralNet(
    Netcross,
    max_epochs=5,
    criterion=torch.nn.BCELoss,
    # user-defined callbacks
    callbacks=[epoch_acc, epoch_f1, epoch_phi],
    optimizer=torch.optim.SGD,
    optimizer__momentum=0.9,
    lr=0.85,
)

inputs = Variable(x_traintensor)
labels = Variable(y_traintensor)

crossnet1.fit(inputs, labels)
So far everything is fine; the function returns credible results without any errors. The problem appears when I try to use the GridSearchCV function:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_epochs': [5, 10, 20],
    'lr': [0.1, 0.65, 0.8],
}
gs = GridSearchCV(estimator=crossnet1, param_grid=param_grid, refit=False, cv=3, scoring="accuracy")
gs.fit(inputs, labels)
I get the following error:
TypeError Traceback (most recent call last)
<ipython-input-41-e1f3dbd9a2b0> in <module>
3 labels1 = torch.from_numpy(np.array(labels))
4
----> 5 gs.fit(inputs1, labels1)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
516 start_time = time.time()
517
--> 518 X_train, y_train = _safe_split(estimator, X, y, train)
519 X_test, y_test = _safe_split(estimator, X, y, test, train)
520
~\Anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
201 X_subset = X[np.ix_(indices, train_indices)]
202 else:
--> 203 X_subset = safe_indexing(X, indices)
204
205 if y is not None:
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices)
214 indices.dtype.kind == 'i'):
215 # This is often substantially faster than X[indices]
--> 216 return X.take(indices, axis=0)
217 else:
218 return X[indices]
TypeError: take(): argument 'index' (position 1) must be Tensor, not numpy.ndarray
What is wrong?
Change your input and labels to np.ndarray (see examples here). Those will be cast to torch.Tensor automatically by skorch when needed.
All in all, change your
inputs = Variable(x_traintensor)
labels = Variable(y_traintensor)
to:
inputs = x_traintensor.numpy() # assuming x is torch.Tensor
labels = y_traintensor.numpy() # assuming y is torch.Tensor
BTW, torch.autograd.Variable is deprecated; you should use torch.tensor(data, requires_grad=True) instead. In this case, inputs and labels do not need gradients, so Variable is even more out of place.
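Putting it together, a minimal sketch assuming x_traintensor and y_traintensor are the tensors from the question:
inputs = x_traintensor.numpy()
labels = y_traintensor.numpy()

gs = GridSearchCV(estimator=crossnet1, param_grid=param_grid, refit=False, cv=3, scoring="accuracy")
gs.fit(inputs, labels)  # skorch converts the numpy splits back to tensors internally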
I have a data set with categorical and numerical features to which I want to apply some transformations, followed by an XGBClassifier.
Link to data set: https://www.kaggle.com/blastchar/telco-customer-churn
As the transformations are different for the numerical and categorical features, I used sklearn_pandas and its DataFrameMapper.
To perform one-hot encoding on the categorical features, I want to use DictVectorizer. But to use DictVectorizer, I first need to convert the dataframe into a dict, which I try to do with a custom transformer Dictifier.
When I run the Pipeline I get the error 'builtin_function_or_method' object is not iterable. Does anyone know what might be causing this error?
import numpy as np
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer
from sklearn_pandas import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
import xgboost as xgb
# Importing the data
df = pd.read_csv('../input/WA_Fn-UseC_-Telco-Customer-Churn.csv', na_values=' ')
X, y = df.iloc[:,1:-1], df.iloc[:,-1]
# Label encoding of the target classes
le = LabelEncoder()
y = le.fit_transform(y.values.reshape(y.shape[0], ))
# Defining the num and cat column names
cat_cols = X.columns[X.dtypes == object].tolist()
num_cols = X.columns[X.dtypes != object].tolist()
# DataFrameMappers for num and cat columns
num_transf_mapper = DataFrameMapper(
    [([num_col], [Imputer(strategy="median"), StandardScaler()]) for num_col in num_cols],
    input_df=True,
    df_out=True)

cat_transf_mapper = DataFrameMapper(
    [(cat_col, [CategoricalImputer()]) for cat_col in cat_cols],
    input_df=True,
    df_out=True)

# FeatureUnion of num and cat columns
num_cat_union = FeatureUnion([("num_mapper", num_transf_mapper),
                              ("cat_mapper", cat_transf_mapper)])

# Custom transformer to convert a pandas DataFrame into a dict (needed for DictVectorizer)
class Dictifier(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.to_dict('records')

# Pipeline
pipeline = Pipeline([("featureunion", num_cat_union),
                     ("dictifier", Dictifier()),
                     ("vectorizer", DictVectorizer(sort=False)),
                     ("clf", xgb.XGBClassifier(max_depth=3))])

# Perform cross-validation
cross_val_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=3)
Error trace
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py:542: FutureWarning: From version 0.22, errors during fit will result in a cross validation score of NaN by default. Use error_score='raise' if you want an exception raised or error_score=np.nan to adopt the behavior from version 0.22.FutureWarning)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-187-96272018fb87> in <module>()
53
54 # Perform cross-validation
---> 55 cross_val_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=3)
/opt/conda/lib/python3.6/site-packages/sklearn_pandas/cross_validation.py in cross_val_score(model, X, *args, **kwargs)
19 warnings.warn(DEPRECATION_MSG, DeprecationWarning)
20 X = DataWrapper(X)
---> 21 return sk_cross_val_score(model, X, *args, **kwargs)
22
23
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
400 fit_params=fit_params,
401 pre_dispatch=pre_dispatch,
--> 402 error_score=error_score)
403 return cv_results['test_score']
404
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
238 return_times=True, return_estimator=return_estimator,
239 error_score=error_score)
--> 240 for train, test in cv.split(X, y, groups))
241
242 zipped_scores = list(zip(*scores))
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
979 # remaining jobs.
980 self._iterating = False
--> 981 if self.dispatch_one_batch(iterator):
982 self._iterating = self._original_iterator is not None
983
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
821 return False
822 else:
--> 823 self._dispatch(tasks)
824 return True
825
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
778 with self._lock:
779 job_idx = len(self._jobs)
--> 780 job = self._backend.apply_async(batch, callback=cb)
781 # A job can complete so quickly than its callback is
782 # called before we get here, causing self._jobs to
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
181 def apply_async(self, func, callback=None):
182 """Schedule a func to be run"""
--> 183 result = ImmediateResult(func)
184 if callback:
185 callback(result)
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
541 # Don't delay the application, to avoid keeping the input
542 # arguments in memory
--> 543 self.results = batch()
544
545 def get(self):
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
526 estimator.fit(X_train, **fit_params)
527 else:
--> 528 estimator.fit(X_train, y_train, **fit_params)
529
530 except Exception as e:
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
263 This estimator
264 """
--> 265 Xt, fit_params = self._fit(X, y, **fit_params)
266 if self._final_estimator is not None:
267 self._final_estimator.fit(Xt, y, **fit_params)
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
228 Xt, fitted_transformer = fit_transform_one_cached(
229 cloned_transformer, Xt, y, None,
--> 230 **fit_params_steps[name])
231 # Replace the transformer of the step with the fitted
232 # transformer. This is necessary when loading the transformer
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/memory.py in __call__(self, *args, **kwargs)
320
321 def __call__(self, *args, **kwargs):
--> 322 return self.func(*args, **kwargs)
323
324 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
790 delayed(_fit_transform_one)(trans, X, y, weight,
791 **fit_params)
--> 792 for name, trans, weight in self._iter())
793
794 if not result:
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
979 # remaining jobs.
980 self._iterating = False
--> 981 if self.dispatch_one_batch(iterator):
982 self._iterating = self._original_iterator is not None
983
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
821 return False
822 else:
--> 823 self._dispatch(tasks)
824 return True
825
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
778 with self._lock:
779 job_idx = len(self._jobs)
--> 780 job = self._backend.apply_async(batch, callback=cb)
781 # A job can complete so quickly than its callback is
782 # called before we get here, causing self._jobs to
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
181 def apply_async(self, func, callback=None):
182 """Schedule a func to be run"""
--> 183 result = ImmediateResult(func)
184 if callback:
185 callback(result)
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
541 # Don't delay the application, to avoid keeping the input
542 # arguments in memory
--> 543 self.results = batch()
544
545 def get(self):
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, **fit_params)
612 def _fit_transform_one(transformer, X, y, weight, **fit_params):
613 if hasattr(transformer, 'fit_transform'):
--> 614 res = transformer.fit_transform(X, y, **fit_params)
615 else:
616 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
460 else:
461 # fit method of arity 2 (supervised transformation)
--> 462 return self.fit(X, y, **fit_params).transform(X)
463
464
/opt/conda/lib/python3.6/site-packages/sklearn_pandas/dataframe_mapper.py in transform(self, X)
342 stacked,
343 columns=self.transformed_names_,
--> 344 index=index)
345 # preserve types
346 for col, dtype in zip(self.transformed_names_, dtypes):
/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
377 else:
378 mgr = self._init_ndarray(data, index, columns, dtype=dtype,
--> 379 copy=copy)
380 elif isinstance(data, (list, types.GeneratorType)):
381 if isinstance(data, types.GeneratorType):
/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py in _init_ndarray(self, values, index, columns, dtype, copy)
525 raise_with_traceback(e)
526
--> 527 index, columns = _get_axes(*values.shape)
528 values = values.T
529
/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py in _get_axes(N, K, index, columns)
482 index = com._default_index(N)
483 else:
--> 484 index = _ensure_index(index)
485
486 if columns is None:
/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in _ensure_index(index_like, copy)
4972 index_like = copy(index_like)
4973
-> 4974 return Index(index_like)
4975
4976
/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, fastpath, tupleize_cols, **kwargs)
449 data, names=name or kwargs.get('names'))
450 # other iterable of some kind
--> 451 subarr = com._asarray_tuplesafe(data, dtype=object)
452 return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs)
453
/opt/conda/lib/python3.6/site-packages/pandas/core/common.py in _asarray_tuplesafe(values, dtype)
303
304 if not (isinstance(values, (list, tuple)) or hasattr(values, '__array__')):
--> 305 values = list(values)
306 elif isinstance(values, Index):
307 return values.values
TypeError: 'builtin_function_or_method' object is not iterable
This seems like a bug in sklearn_pandas.cross_val_score.
sklearn_pandas wraps the dataframe you supply in a DataWrapper object, as seen in the source code here:

def cross_val_score(model, X, *args, **kwargs):
    warnings.warn(DEPRECATION_MSG, DeprecationWarning)
    X = DataWrapper(X)
    return sk_cross_val_score(model, X, *args, **kwargs)

which is apparently done to handle older versions of sklearn.cross_validation.cross_val_score, which did not handle pandas DataFrames well. DataWrapper returns a list instance when split into train and test sets.
But it is then not handled correctly during transform() of DataFrameMapper, as seen in the source code here:

if self.df_out:
    # if no rows were dropped preserve the original index,
    # otherwise use a new integer one
    no_rows_dropped = len(X) == len(stacked)
    if no_rows_dropped:
        index = X.index  # <== This here is the source of error
    else:
        index = None

Here X is not a DataFrame but a list object, so X.index is not the actual pandas index as intended, but the built-in index method of list, hence the error you got.
But since the newer sklearn cross_val_score handles DataFrames correctly, you don't have to use the sklearn_pandas import at all.
Change it from:
from sklearn_pandas import cross_val_score
to this:
from sklearn.model_selection import cross_val_score
So now you won't get that error anymore.
But, still further down the code, you will get another error about:
AttributeError: 'numpy.ndarray' object has no attribute 'to_dict'
This is because you wrap both your DataFrameMapper objects into a FeatureUnion by doing this:
num_cat_union = FeatureUnion([("num_mapper", num_transf_mapper),
                              ("cat_mapper", cat_transf_mapper)])
and then do this:
pipeline = Pipeline([("featureunion", num_cat_union),
                     ("dictifier", Dictifier()),
                     ("vectorizer", DictVectorizer(sort=False)),
                     ("clf", xgb.XGBClassifier(max_depth=3))])
Your Dictifier expects a DataFrame to be passed to it so that it can call to_dict() on it, but the previous step in the pipeline, the FeatureUnion, does not preserve the DataFrame; it converts the data into a numpy array.
Generally, DataFrameMapper and FeatureUnion don't work well together. I would advise you to remove the FeatureUnion altogether and instead combine both DataFrameMapper objects into a single one. This will effectively work the way you wanted FeatureUnion to work.
Something like this:
transformers = []

# Combine both your operations here only
transformers.extend([([num_col], [Imputer(strategy="median"), StandardScaler()])
                     for num_col in num_cols])
transformers.extend([(cat_col, [CategoricalImputer()]) for cat_col in cat_cols])

num_cat_union = DataFrameMapper(transformers,
                                input_df=True,
                                df_out=True)
# Your other code
...
...
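For completeness, the combined mapper then slots straight into the pipeline from the question; a minimal sketch, where the step name "mapper" is just an illustrative choice:
pipeline = Pipeline([("mapper", num_cat_union),
                     ("dictifier", Dictifier()),
                     ("vectorizer", DictVectorizer(sort=False)),
                     ("clf", xgb.XGBClassifier(max_depth=3))])

cross_val_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=3)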
Let me show just part of the code, the way I do it:
class MultiColumn(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns  # array of column names to encode
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.columns]

NUMERIC = df[['var1', 'var2']]
CATEGORICAL = df[['var3', 'var4']]

class Imputation(BaseEstimator, TransformerMixin):
    def transform(self, X, y=None, **fit_params):
        return X.fillna(NUMERIC.median())
    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)
    def fit(self, X, y=None, **fit_params):
        return self

class Cat(BaseEstimator, TransformerMixin):
    def transform(self, X, y=None, **fit_params):
        enc = DictVectorizer(sparse=False)
        encc = enc.fit(CATEGORICAL.T.to_dict().values())
        enc_data = encc.transform(X.T.to_dict().values())
        enc_data[np.isnan(enc_data)] = 1
        return enc_data
    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)
    def fit(self, X, y=None, **fit_params):
        return self
And Pipeline
pipeline = Pipeline([
    # Use FeatureUnion to combine the features
    ('union', FeatureUnion(
        transformer_list=[
            # numeric
            ('numeric', Pipeline([
                ('selector', MultiColumn(columns=['var1', 'var2'])),
                ('imp', Imputation()),
                ('scaling', preprocessing.StandardScaler(with_mean=0.))
            ])),
            # categorical
            ('categorical', Pipeline([
                ('selector', MultiColumn(columns=['var3', 'var4'])),
                ('imputer', CategoricalImputer()),  # impute before encoding
                ('one_hot', Cat())
            ])),
        ])),
    ('model_fitting', xgb.XGBClassifier(max_depth=3)),
])
All together in one pipeline. I hope this helps.
# Import necessary modules
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer

# Check number of nulls in each feature column
nulls_per_column = X.isnull().sum()
print(nulls_per_column)

# Create a boolean mask for categorical columns
categorical_feature_mask = X.dtypes == object

# Get list of categorical column names
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Get list of non-categorical column names
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Apply numeric imputer
numeric_imputation_mapper = DataFrameMapper(
    [([numeric_feature], Imputer(strategy="median")) for numeric_feature in non_categorical_columns],
    input_df=True,
    df_out=True
)

# Apply categorical imputer
categorical_imputation_mapper = DataFrameMapper(
    [(category_feature, CategoricalImputer()) for category_feature in categorical_columns],
    input_df=True,
    df_out=True
)

# Import FeatureUnion
from sklearn.pipeline import FeatureUnion

# Combine the numeric and categorical transformations
numeric_categorical_union = FeatureUnion([
    ("num_mapper", numeric_imputation_mapper),
    ("cat_mapper", categorical_imputation_mapper)
])

# Create full pipeline
pipeline = Pipeline([
    ("featureunion", numeric_categorical_union),
    ("dictifier", Dictifier()),
    ("vectorizer", DictVectorizer(sort=False)),
    ("clf", xgb.XGBClassifier())
])

# Perform cross-validation
cross_val_scores = cross_val_score(pipeline, kidney_data, y, scoring="roc_auc", cv=3)

# Print avg. AUC
print("3-fold AUC: ", np.mean(cross_val_scores))