I am using Jupyter notebook in Windows 10 PC.
I am trying to run 3 models in a dictionary (Logistic, KNeighbors, and RandomForestClassifier)
I went into the miniconda environment and started jupyter notebook.
The dataset is properly loaded, split to 'X' and 'y' and fitted properly.
I am trying to predict target values from a dataset by using 3 types of models which are packed in a dictionary.
I then created a function which has a loop which cycles through the dictionary and provides answers for each model, that will be fed to the empty dictionary *model_scores{}
When I try to run the function I get an error which is provided below.
ny code is:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# models from scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
# model evaluations
from sklearn.model_selection import train_test_split, cross_val_score
models = {"Logistic Regression": LogisticRegression(),
"KNN": KNeighborsClassifier(),
"Random Forest": RandomForestClassifier()}
def fit_and_score(models, X_train, X_test, y_train, y_test):
np.random.seed(42)
model_scores = {}
for name, model in models.items():
model.fit(X_train, y_train)
model_scores[name] = model.score(X_test, y_test)
return model_scores
model_scores = fit_and_score(models=models,
X_train=X_train,
X_test=X_test,
y_train=y_train,
y_test=y_test)
the error I get:
AttributeError: 'str' object has no attribute 'decode'
full error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-68-45006ce4a749> in <module>
1 # run the function
2
----> 3 model_scores = fit_and_score(models=models,
4 X_train=X_train,
5 X_test=X_test,
<ipython-input-67-005647140cd3> in fit_and_score(models, X_train, X_test, y_train, y_test)
22 for name, model in models.items():
23 # fit the model to the data
---> 24 model.fit(X_train, y_train)
25 # evaluate the model and append its score to model_scores
26 model_scores[name] = model.score(X_test, y_test)
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\sklearn\linear_model\_logistic.py in fit(self, X, y, sample_weight)
1405 else:
1406 prefer = 'processes'
-> 1407 fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
1408 **_joblib_parallel_args(prefer=prefer))(
1409 path_func(X, y, pos_class=class_, Cs=[C_],
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\joblib\parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\sklearn\linear_model\_logistic.py in _logistic_regression_path(X, y, pos_class, Cs, fit_intercept, max_iter, tol, verbose, solver, coef, class_weight, dual, penalty, intercept_scaling, multi_class, random_state, check_input, max_squared_sum, sample_weight, l1_ratio)
760 options={"iprint": iprint, "gtol": tol, "maxiter": max_iter}
761 )
--> 762 n_iter_i = _check_optimize_result(
763 solver, opt_res, max_iter,
764 extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\sklearn\utils\optimize.py in _check_optimize_result(solver, result, max_iter, extra_warning_msg)
241 " https://scikit-learn.org/stable/modules/"
242 "preprocessing.html"
--> 243 ).format(solver, result.status, result.message.decode("latin1"))
244 if extra_warning_msg is not None:
245 warning_msg += "\n" + extra_warning_msg
AttributeError: 'str' object has no attribute 'decode'
After importing the train_test_split from sklearn.model_selection, you are using X_train, X_test, y_train, y_test without defining them.
here i used a Social_Network_Ads.csv random dataset found on internet and it works without showing any error:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
from sklearn.model_selection import train_test_split,cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
def fit_and_score(models, X_train, X_test, y_train, y_test):
np.random.seed(42)
model_scores = {}
for name, model in models.items():
model.fit(X_train, y_train)
model_scores[name] = model.score(X_test, y_test)
return model_scores
model_scores = fit_and_score(models=models,
X_train=X_train,
X_test=X_test,
y_train=y_train,
y_test=y_test)
Related
My CNN mode, used for binary classification of images, is trained on NumPy arrays with an input shape of (224, 224, 3). On the other hand, my logistic regression mode, used for binary classification of tabular data, is trained on a pandas DataFrame that is two-dimensional. I really want to stack these two models using StackingCVClassifier from the mlxtend package. But the main hurdle I am having right now is that I cannot easily combine the two datasets I have since they are of different shapes. That's why, instead of training my StackingCVClassifier on the combined datasets (tabular and image data), I trained on the predictions of my classifiers:
from sklearn.model_selection import train_test_split
from mlxtend.classifier import StackingCVClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from tensorflow import keras
from keras import layers
from keras.constraints import maxnorm
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Input
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.base import BaseEstimator, TransformerMixin
#import tensorflow as tf
from sklearn.model_selection import LeaveOneOut
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
X_train_tabular, X_test_tabular, y_train_tabular, y_test_tabular = train_test_split(
tabular, label.values, test_size=0.2)
X_train_images, X_test_images, y_train_images, y_test_images = train_test_split(
numpy_arrays, label.values, test_size=0.2)
y_train_tabular = np.ravel(y_train_tabular)
y_test_tabular = np.ravel(y_test_tabular)
logistic = LogisticRegression()
logistic.fit(X_train_tabular, y_train_tabular)
def sample_model():
clf = Sequential()
clf.add(Input(shape=(224, 224, 3)))
clf.add(Dropout(0.2))
clf.add(Flatten())
clf.add(Dense(units = 1, activation = 'relu')) # Compile model
clf.compile(optimizer="adam", loss='binary_crossentropy',metrics=[keras.metrics.AUC(), 'accuracy'])
return clf
clf_model = KerasClassifier(build_fn=sample_model)
clf_model._estimator_type = "classifier"
clf_model.fit(X_train_images, y_train_images)
print(clf_model.score(X_train_images, y_train_images))
logistic_predictions = logistic.predict_proba(X_test_tabular)[:, 1]
cnn_predictions = clf_model.predict_proba(X_test_images)[:, 1]
X_test_combined = np.column_stack((logistic_predictions, cnn_predictions))
loo = LeaveOneOut()
scores = []
for train_index, test_index in loo.split(X_test_combined):
X_train, X_test = X_test_combined[train_index], X_test_combined[test_index]
y_train, y_test = y_test_tabular[train_index], y_test_tabular[test_index]
meta_model = StackingCVClassifier(classifiers=[logistic, clf_model], use_features_in_secondary = False,
meta_classifier=RandomForestClassifier(), use_probas=True, cv=5, verbose=1)
meta_model.fit(X_train, y_train)
y_pred = meta_model.predict(X_test)
scores.append(accuracy_score(y_test, y_pred))
print("LOOCV Accuracy:", np.mean(scores))
However, my code, once again, encounters an error.
ValueError Traceback (most recent call last)
Input In [114], in <cell line: 61>()
64 meta_model = StackingCVClassifier(classifiers=[logistic, clf_model], use_features_in_secondary = False,
65 meta_classifier=RandomForestClassifier(), use_probas=True, cv=5, verbose=1)
66 # meta_model = RandomForestClassifier()
---> 67 meta_model.fit(X_train, y_train)
68 y_pred = meta_model.predict(X_test)
69 scores.append(accuracy_score(y_test, y_pred))
File ~/opt/anaconda3/lib/python3.9/site-packages/mlxtend/classifier/stacking_cv_classification.py:261, in StackingCVClassifier.fit(self, X, y, groups, sample_weight)
258 if self.verbose > 1:
259 print(_name_estimators((model,))[0][1])
--> 261 prediction = cross_val_predict(
262 model,
263 X,
264 y,
265 groups=groups,
266 cv=final_cv,
267 n_jobs=self.n_jobs,
268 fit_params=fit_params,
269 verbose=self.verbose,
270 pre_dispatch=self.pre_dispatch,
271 method="predict_proba" if self.use_probas else "predict",
272 )
274 if not self.use_probas:
275 prediction = prediction[:, np.newaxis]
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py:968, in cross_val_predict(estimator, X, y, groups, cv, n_jobs, verbose, fit_params, pre_dispatch, method)
965 # We clone the estimator to make sure that all the folds are
966 # independent, and that it is pickle-able.
967 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 968 predictions = parallel(
969 delayed(_fit_and_predict)(
970 clone(estimator), X, y, train, test, verbose, fit_params, method
971 )
972 for train, test in splits
973 )
975 inv_test_indices = np.empty(len(test_indices), dtype=int)
976 inv_test_indices[test_indices] = np.arange(len(test_indices))
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:1043, in Parallel.__call__(self, iterable)
1034 try:
1035 # Only set self._iterating to True if at least a batch
1036 # was dispatched. In particular this covers the edge
(...)
1040 # was very quick and its callback already dispatched all the
1041 # remaining jobs.
1042 self._iterating = False
-> 1043 if self.dispatch_one_batch(iterator):
1044 self._iterating = self._original_iterator is not None
1046 while self.dispatch_one_batch(iterator):
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:861, in Parallel.dispatch_one_batch(self, iterator)
859 return False
860 else:
--> 861 self._dispatch(tasks)
862 return True
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:779, in Parallel._dispatch(self, batch)
777 with self._lock:
778 job_idx = len(self._jobs)
--> 779 job = self._backend.apply_async(batch, callback=cb)
780 # A job can complete so quickly than its callback is
781 # called before we get here, causing self._jobs to
782 # grow. To ensure correct results ordering, .insert is
783 # used (rather than .append) in the following line
784 self._jobs.insert(job_idx, job)
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py:572, in ImmediateResult.__init__(self, batch)
569 def __init__(self, batch):
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:262, in BatchedCalls.__call__(self)
258 def __call__(self):
259 # Set the default nested backend to self._backend but do not set the
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:262, in <listcomp>(.0)
258 def __call__(self):
259 # Set the default nested backend to self._backend but do not set the
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/fixes.py:117, in _FuncWrapper.__call__(self, *args, **kwargs)
115 def __call__(self, *args, **kwargs):
116 with config_context(**self.config):
--> 117 return self.function(*args, **kwargs)
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py:1050, in _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method)
1048 estimator.fit(X_train, **fit_params)
1049 else:
-> 1050 estimator.fit(X_train, y_train, **fit_params)
1051 func = getattr(estimator, method)
1052 predictions = func(X_test)
File ~/opt/anaconda3/lib/python3.9/site-packages/keras/wrappers/scikit_learn.py:248, in KerasClassifier.fit(self, x, y, **kwargs)
246 raise ValueError("Invalid shape for y: " + str(y.shape))
247 self.n_classes_ = len(self.classes_)
--> 248 return super().fit(x, y, **kwargs)
File ~/opt/anaconda3/lib/python3.9/site-packages/keras/wrappers/scikit_learn.py:175, in BaseWrapper.fit(self, x, y, **kwargs)
172 fit_args = copy.deepcopy(self.filter_sk_params(Sequential.fit))
173 fit_args.update(kwargs)
--> 175 history = self.model.fit(x, y, **fit_args)
177 return history
File ~/opt/anaconda3/lib/python3.9/site-packages/keras/utils/traceback_utils.py:70, in filter_traceback.<locals>.error_handler(*args, **kwargs)
67 filtered_tb = _process_traceback_frames(e.__traceback__)
68 # To get the full stack trace, call:
69 # `tf.debugging.disable_traceback_filtering()`
---> 70 raise e.with_traceback(filtered_tb) from None
71 finally:
72 del filtered_tb
File /var/folders/v0/88rsx9gs4ql39bfw7jvs94c80000gn/T/__autograph_generated_filel6i44dds.py:15, in outer_factory.<locals>.inner_factory.<locals>.tf__train_function(iterator)
13 try:
14 do_return = True
---> 15 retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
16 except:
17 do_return = False
ValueError: in user code:
File "/Users/User/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1160, in train_function *
return step_function(self, iterator)
File "/Users/User/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1146, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/Users/User/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1135, in run_step **
outputs = model.train_step(data)
File "/Users/User/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 993, in train_step
y_pred = self(x, training=True)
File "/Users/User/opt/anaconda3/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/Users/User/opt/anaconda3/lib/python3.9/site-packages/keras/engine/input_spec.py", line 295, in assert_input_compatibility
raise ValueError(
ValueError: Input 0 of layer "sequential_5" is incompatible with the layer: expected shape=(None, 224, 224, 3), found shape=(None, 2)
This is caused by the differing input shapes of what my CNN is trained on and the shape of my predictions X_test_combined. If this approach won't work, please help me in any way you can. I want to use StackingCVClassifier on different datasets and won't require combining the two different datasets.
I am trying to make a simple crossvalidation process by scikitlearn, crossvalidate, and I get the following TypeError:
TypeError Traceback (most recent call last)
<ipython-input-59-0471fb78d8f0> in <module>
5
6 model = NMF(n_components=185, init='random', random_state=0)
----> 7 scores = cross_validate(model, df4_array, cv=5, scoring=('neg_mean_squared_error'))
8 W = model.fit_transform(df4_array)
9 H = model.components_
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
234 return_times=True, return_estimator=return_estimator,
235 error_score=error_score)
--> 236 for train, test in cv.split(X, y, groups))
237
238 zipped_scores = list(zip(*scores))
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
542 else:
543 fit_time = time.time() - start_time
--> 544 test_scores = _score(estimator, X_test, y_test, scorer)
545 score_time = time.time() - start_time - fit_time
546 if return_train_score:
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
587 scorer = _MultimetricScorer(**scorer)
588 if y_test is None:
--> 589 scores = scorer(estimator, X_test)
590 else:
591 scores = scorer(estimator, X_test, y_test)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
85 if isinstance(scorer, _BaseScorer):
86 score = scorer._score(cached_call, estimator,
---> 87 *args, **kwargs)
88 else:
89 score = scorer(estimator, *args, **kwargs)
TypeError: _score() missing 1 required positional argument: 'y_true'
I do not know why, because I am trying to do a Recommender System for tue University using Non negative matrix factorization, which is an unsupervised method ... shouldn`t the code work without y?
Code:
from sklearn.decomposition import NMF
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
model = NMF(n_components=185, init='random', random_state=0)
scores = cross_validate(model, df4_array, cv=5, scoring=('neg_mean_squared_error'))
W = model.fit_transform(df4_array)
H = model.components_
So I think the problem is with the scoring method.
Any form of MSE(mean_squared_error) is a function for regression type problems as the formula involves a dependent variable(y) component of regression.
I would suggest to look through this link for different unsupervised scoring methods.
https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
Hope this helped.
May the force be with you.
I'm trying to use GridSearchCV with KMeans clustering to explore the optimal number to clusters to use in order to get the best results on a classification problem.
I've got the following code:
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
faces = fetch_olivetti_faces()
X_data, y_data = faces.data, faces.target
log_reg = LogisticRegression()
split = StratifiedShuffleSplit(n_splits = 1, test_size=.2, random_state=42)
for train_index, test_index in split.split(X_train, y_train):
X_train_set , y_train_set = X_data[train_index,], y_data[train_index,]
X_test_set, y_test_set = X_data[test_index,], y_data[test_index, ]
pipeline = Pipeline([
('kmeans', KMeans(n_clusters = 30)),
('log_reg', LogisticRegression())
])
cluster_grid = dict(n_clusters=range(2,100))
grid = GridSearchCV(pipeline, cluster_grid)
grid.fit(X_train_set, y_train_set, cv=5, verbose=2)
Here's the entire traceback:
-------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-42-80e6a3932897> in <module>
----> 1 grid.fit(X_train_set, y_train_set, cv=5, verbose=2)
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
686 return results
687
--> 688 self._run_search(evaluate_candidates)
689
690 # For multi-metric evaluation, store the best_index_, best_params_ and
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1147 def _run_search(self, evaluate_candidates):
1148 """Search all candidates in param_grid"""
-> 1149 evaluate_candidates(ParameterGrid(self.param_grid))
1150
1151
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
665 for parameters, (train, test)
666 in product(candidate_params,
--> 667 cv.split(X, y, groups)))
668
669 if len(out) < 1:
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
501 train_scores = {}
502 if parameters is not None:
--> 503 estimator.set_params(**parameters)
504
505 start_time = time.time()
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in set_params(self, **kwargs)
162 self
163 """
--> 164 self._set_params('steps', **kwargs)
165 return self
166
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in _set_params(self, attr, **params)
48 self._replace_estimator(attr, name, params.pop(name))
49 # 3. Step parameters and other initialisation arguments
---> 50 super().set_params(**params)
51 return self
52
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/base.py in set_params(self, **params)
222 'Check the list of available parameters '
223 'with `estimator.get_params().keys()`.' %
--> 224 (key, self))
225
226 if delim:
ValueError: Invalid parameter n_clusters for estimator Pipeline(memory=None,
steps=[('kmeans',
KMeans(algorithm='auto', copy_x=True, init='k-means++',
max_iter=300, n_clusters=30, n_init=10, n_jobs=None,
precompute_distances='auto', random_state=None,
tol=0.0001, verbose=0)),
('log_reg',
LogisticRegression(C=1.0, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None,
penalty='l2', random_state=None,
solver='warn', tol=0.0001, verbose=0,
warm_start=False))],
verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.
I have no idea what the heck is going on...I'm not sure how to interpret this error message but my parameter grid doesn't seem to be out of wack. PLEASE HELP!
When you are using pipeline you need to give the parameters as following:
cluster_grid = {
'kmeans__n_clusters': range(2,100)
}
# adding n_jobs to run in parallel
grid = GridSearchCV(pipeline, cluster_grid, n_jobs=-1)
where kmeans is taken from ('kmeans', KMeans())
So, your code should look as the following:
pipeline = Pipeline([
('kmeans', KMeans(),
('log_reg', LogisticRegression())
])
cluster_grid = {
'kmeans__n_clusters': range(2,100)
}
# adding n_jobs to run in parallel
grid = GridSearchCV(pipeline, cluster_grid, n_jobs=-1)
The parameter n_clusters is only applicable to KMeans and not LogisticRegression
Specify in your cluster_grid that the grid param is only meant for KMeans:
# Parameters of pipelines can be set using ‘__’ separated parameter names:
cluster_grid = dict(kmeans__n_clusters=range(2,100))
Reference : https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
I think I'm having issues getting my vectorizer working within a gridsearch pipeline:
data as panda df x_train:
bathrooms bedrooms price building_id manager_id
10 1.5 3 3000 53a5b119ba8f7b61d4e010512e0dfc85 5ba989232d0489da1b5f2c45f6688adc
10000 1.0 2 5465 c5c8a357cba207596b04d1afd1e4f130 7533621a882f71e25173b27e3139d83d
100004 1.0 1 2850 c3ba40552e2120b0acfc3cb5730bb2aa d9039c43983f6e564b1482b273bd7b01
100007 1.0 1 3275 28d9ad350afeaab8027513a3e52ac8d5 1067e078446a7897d2da493d2f741316
100013 1.0 4 3350 0 98e13ad4b495b9613cef886d79a6291f
numeric_predictors = ['bathrooms', 'bedrooms', 'price']
categorical_predictors = ['building_id', 'manager_id']
minMaxScaler fit & transform:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
class MyScaler(BaseEstimator, TransformerMixin):
def __init__(self, cols):
self.cols = cols
def fit(self, X, y=None):
self.scaler = MinMaxScaler()
self.scaler.fit(X[self.cols])
return self
def transform(self, X):
return self.scaler.transform(X[self.cols])
My categorical feature hashing vectorizer:
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import HashingVectorizer
class MyVectorizer(BaseEstimator, TransformerMixin):
"""
Vectorize a set of categorical variables
"""
def __init__(self, cols, hashing=None):
"""
args:
cols: a list of column names of the categorical variables
hashing:
If None, then vectorization is a simple one-hot-encoding.
If an integer, then hashing is the number of features in the output.
"""
self.cols = cols
self.hashing = hashing
def fit(self, X, y=None):
data = X[self.cols]
# Choose a vectorizer
if self.hashing is None:
self.myvec = HashingVectorizer()
else:
self.myvec = FeatureHasher(n_features = self.hashing)
self.myvec.fit(X[self.cols].to_dict(orient='records'))
return self
def transform(self, X):
# Vectorize Input
if self.hashing is None:
return pd.DataFrame(
self.myvec.transform(X[self.cols].to_dict(orient='records')),
columns = self.myvec.feature_names_
)
else:
return pd.DataFrame(
self.myvec.transform(X[self.cols].to_dict(orient='records')).toarray()
)
GridSearch hyperparameters:
search_params = {
'preprocess__vectorize__hashing': [20, 40, 80],
'predict__alpha': [.01, .1, 1, 2, 10]
}
pipeline:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LinearRegression
pipeline = Pipeline([
('preprocess', FeatureUnion([
('scale', MyScaler(cols=numeric_predictors)),
('vectorize', MyVectorizer(cols=categorical_predictors, hashing=5))
])),
('predict', MultinomialNB())
])
And last, calling this with the gridsearchCV classifier:
grid_search = GridSearchCV(pipeline, search_params)
grid_search.fit(x_train, y_train)
I get a ValueError: Input X must be non-negative. I checked and my numeric_predictor columns' data all non-negative, so I am narrowing it down to an issue with the hashing of the categorical predictors.
ValueError Traceback (most recent call last)
<ipython-input-62-50522376d1e5> in <module>()
1 grid_search = GridSearchCV(pipeline, search_params)
----> 2 grid_search.fit(x_train, y_train)
3 grid_search.best_params_
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.pyc in fit(self, X, y, groups, **fit_params)
636 error_score=self.error_score)
637 for parameters, (train, test) in product(candidate_params,
--> 638 cv.split(X, y, groups)))
639
640 # if one choose to see train score, "out" will contain train score info
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
435 estimator.fit(X_train, **fit_params)
436 else:
--> 437 estimator.fit(X_train, y_train, **fit_params)
438
439 except Exception as e:
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit(self, X, y, **fit_params)
257 Xt, fit_params = self._fit(X, y, **fit_params)
258 if self._final_estimator is not None:
--> 259 self._final_estimator.fit(Xt, y, **fit_params)
260 return self
261
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in fit(self, X, y, sample_weight)
602 self.feature_count_ = np.zeros((n_effective_classes, n_features),
603 dtype=np.float64)
--> 604 self._count(X, Y)
605 alpha = self._check_alpha()
606 self._update_feature_log_prob(alpha)
/home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in _count(self, X, Y)
706 """Count and smooth feature occurrences."""
707 if np.any((X.data if issparse(X) else X) < 0):
--> 708 raise ValueError("Input X must be non-negative")
709 self.feature_count_ += safe_sparse_dot(Y.T, X)
710 self.class_count_ += Y.sum(axis=0)
ValueError: Input X must be non-negative
> /home/fred/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.py(708)_count()
706 """Count and smooth feature occurrences."""
707 if np.any((X.data if issparse(X) else X) < 0):
--> 708 raise ValueError("Input X must be non-negative")
709 self.feature_count_ += safe_sparse_dot(Y.T, X)
710 self.class_count_ += Y.sum(axis=0)
Yes, when hashing is not None, FeatureHasher() is chosen, which can output negative values.
But you can remove convert those negative values to positive by using the non_negative parameter of FeatureHashser as given in documentation:
non_negative : boolean, optional, default False
When True, an absolute value is applied to the features matrix prior
to returning it. When used in conjunction with
alternate_sign=True, this significantly reduces the inner product
preservation property.
So change this line in MyVectorizer:
self.myvec = FeatureHasher(n_features = self.hashing)
to this:
self.myvec = FeatureHasher(n_features = self.hashing, non_negative=True)
Note:
This parameter has been deprecated since version 0.19 and will be removed in 0.21.
You need to study how this parameter will affect your classification problem.
I'm attempting to run GridSearchCV for Logistic Regression in sklearn and the code is giving me the following error:
ValueError: X has 21 features per sample; expecting 19
The shapes of the training and testing data are
X_train.shape
(891L, 21L)
X_test.shape
(418L, 21L)
The code I'm using to run the GridSearchCV with is
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
logistic = LogisticRegression()
parameters = [{'C' : [1.0, 10.0, 100.0, 1000.0],
'fit_intercept' : ['True', 'False'],
'intercept_scaling' : [0, 1, 10, 100, 1000],
'class_weight' : ['auto'],
'random_state' : [26],
'tol' : [0.001, 0.01, 0.1, 1, 10, 100]
}]
logistic = GridSearchCV(LogisticRegression(),
parameters,
cv=3,
refit=True,
verbose=1)
logistic = logistic.fit(X_train, y_train)
logit_pred = logistic.predict(X_test)
The traceback I'm getting is:
ValueError Traceback (most recent call last)
C:\Code\kaggle\titanic\titanic.py in <module>()
351
352
--> 353 logistic = logistic.fit(X_train, y_train)
354
355 logit_pred = logistic.predict(X_test)
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\grid_search.pyc in fit(self, X, y)
594
595 """
--> 596 return self._fit(X, y, ParameterGrid(self.param_grid))
597
598
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\grid_search.pyc in _fit(self, X, y, parameter_iterable)
376 train, test, self.verbose, parameters,
377 self.fit_params, return_parameters=True)
--> 378 for parameters in parameter_iterable
379 for train, test in cv)
380
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self, iterable)
651 self._iterating = True
652 for function, args, kwargs in iterable:
--> 653 self.dispatch(function, args, kwargs)
654
655 if pre_dispatch == "all" or n_jobs == 1:
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\externals\joblib\parallel.pyc in dispatch(self, func, args, kwargs)
398 """
399 if self._pool is None:
--> 400 job = ImmediateApply(func, args, kwargs)
401 index = len(self._jobs)
402 if not _verbosity_filter(index, self.verbose):
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __init__(self, func, args, kwargs)
136 # Don't delay the application, to avoid keeping the input
137 # arguments in memory
--> 138 self.results = func(*args, **kwargs)
139
140 def get(self):
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters)
1238 else:
1239 estimator.fit(X_train, y_train, **fit_params)
-> 1240 test_score = _score(estimator, X_test, y_test, scorer)
1241 if return_train_score:
1242 train_score = _score(estimator, X_train, y_train, scorer)
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
1294 score = scorer(estimator, X_test)
1295 else:
-> 1296 score = scorer(estimator, X_test, y_test)
1297 if not isinstance(score, numbers.Number):
1298 raise ValueError("scoring must return a number, got %s (%s) instead."
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\metrics\scorer.pyc in _passthrough_scorer(estimator, *args, **kwargs)
174 def _passthrough_scorer(estimator, *args, **kwargs):
175 """Function that wraps estimator.score"""
--> 176 return estimator.score(*args, **kwargs)
177
178
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\base.pyc in score(self, X, y, sample_weight)
289 """
290 from .metrics import accuracy_score
--> 291 return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
292
293
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\linear_model\base.pyc in predict(self, X)
213 Predicted class label per sample.
214 """
--> 215 scores = self.decision_function(X)
216 if len(scores.shape) == 1:
217 indices = (scores > 0).astype(np.int)
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\linear_model\base.pyc in decision_function(self, X)
194 if X.shape[1] != n_features:
195 raise ValueError("X has %d features per sample; expecting %d"
--> 196 % (X.shape[1], n_features))
197
198 scores = safe_sparse_dot(X, self.coef_.T,
ValueError: X has 21 features per sample; expecting 19
Why is GridSearchCV expecting a different number of features than the dataset contains?
UPDATE:
Thanks for the response Andy. The datasets are all type numpy.ndarray and dtype is float64.
type(X_Train) type(y_train) type(X_test)
numpy.ndarray numpy.ndarray numpy.ndarray
The steps right before I bring them into sklearn:
train_data = traindf.values
test_data = testdf.values
X_train = train_data[0::, 1::] # training features
y_train = train_data[0::, 0] # training targets
X_test = test_data[0::, 0::] # test features
The next step is the GridSearchCV code I typed above...
UPDATE 2: Link to Data
Here is a link to the datasets
The error is cause by intercept_scaling=0. Looks like a bug in scikit-learn.