My CNN model, used for binary classification of images, is trained on NumPy arrays with an input shape of (224, 224, 3). My logistic regression model, used for binary classification of tabular data, is trained on a two-dimensional pandas DataFrame. I really want to stack these two models using StackingCVClassifier from the mlxtend package, but the main hurdle is that I cannot easily combine my two datasets, since they have different shapes. That's why, instead of training my StackingCVClassifier on the combined datasets (tabular and image data), I trained it on the predictions of my classifiers:
import numpy as np
from sklearn.model_selection import train_test_split
from mlxtend.classifier import StackingCVClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from tensorflow import keras
from keras import layers
from keras.constraints import maxnorm
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Input
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.base import BaseEstimator, TransformerMixin
#import tensorflow as tf
from sklearn.model_selection import LeaveOneOut
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
X_train_tabular, X_test_tabular, y_train_tabular, y_test_tabular = train_test_split(
    tabular, label.values, test_size=0.2)
X_train_images, X_test_images, y_train_images, y_test_images = train_test_split(
    numpy_arrays, label.values, test_size=0.2)
y_train_tabular = np.ravel(y_train_tabular)
y_test_tabular = np.ravel(y_test_tabular)
logistic = LogisticRegression()
logistic.fit(X_train_tabular, y_train_tabular)
def sample_model():
    clf = Sequential()
    clf.add(Input(shape=(224, 224, 3)))
    clf.add(Dropout(0.2))
    clf.add(Flatten())
    clf.add(Dense(units=1, activation='relu'))
    # Compile model
    clf.compile(optimizer="adam", loss='binary_crossentropy',
                metrics=[keras.metrics.AUC(), 'accuracy'])
    return clf
clf_model = KerasClassifier(build_fn=sample_model)
clf_model._estimator_type = "classifier"
clf_model.fit(X_train_images, y_train_images)
print(clf_model.score(X_train_images, y_train_images))
logistic_predictions = logistic.predict_proba(X_test_tabular)[:, 1]
cnn_predictions = clf_model.predict_proba(X_test_images)[:, 1]
X_test_combined = np.column_stack((logistic_predictions, cnn_predictions))
loo = LeaveOneOut()
scores = []
for train_index, test_index in loo.split(X_test_combined):
    X_train, X_test = X_test_combined[train_index], X_test_combined[test_index]
    y_train, y_test = y_test_tabular[train_index], y_test_tabular[test_index]
    meta_model = StackingCVClassifier(classifiers=[logistic, clf_model],
                                      use_features_in_secondary=False,
                                      meta_classifier=RandomForestClassifier(),
                                      use_probas=True, cv=5, verbose=1)
    meta_model.fit(X_train, y_train)
    y_pred = meta_model.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
print("LOOCV Accuracy:", np.mean(scores))
However, my code once again raises an error:
ValueError Traceback (most recent call last)
Input In [114], in <cell line: 61>()
64 meta_model = StackingCVClassifier(classifiers=[logistic, clf_model], use_features_in_secondary = False,
65 meta_classifier=RandomForestClassifier(), use_probas=True, cv=5, verbose=1)
66 # meta_model = RandomForestClassifier()
---> 67 meta_model.fit(X_train, y_train)
68 y_pred = meta_model.predict(X_test)
69 scores.append(accuracy_score(y_test, y_pred))
File ~/opt/anaconda3/lib/python3.9/site-packages/mlxtend/classifier/stacking_cv_classification.py:261, in StackingCVClassifier.fit(self, X, y, groups, sample_weight)
258 if self.verbose > 1:
259 print(_name_estimators((model,))[0][1])
--> 261 prediction = cross_val_predict(
262 model,
263 X,
264 y,
265 groups=groups,
266 cv=final_cv,
267 n_jobs=self.n_jobs,
268 fit_params=fit_params,
269 verbose=self.verbose,
270 pre_dispatch=self.pre_dispatch,
271 method="predict_proba" if self.use_probas else "predict",
272 )
274 if not self.use_probas:
275 prediction = prediction[:, np.newaxis]
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py:968, in cross_val_predict(estimator, X, y, groups, cv, n_jobs, verbose, fit_params, pre_dispatch, method)
965 # We clone the estimator to make sure that all the folds are
966 # independent, and that it is pickle-able.
967 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 968 predictions = parallel(
969 delayed(_fit_and_predict)(
970 clone(estimator), X, y, train, test, verbose, fit_params, method
971 )
972 for train, test in splits
973 )
975 inv_test_indices = np.empty(len(test_indices), dtype=int)
976 inv_test_indices[test_indices] = np.arange(len(test_indices))
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:1043, in Parallel.__call__(self, iterable)
1034 try:
1035 # Only set self._iterating to True if at least a batch
1036 # was dispatched. In particular this covers the edge
(...)
1040 # was very quick and its callback already dispatched all the
1041 # remaining jobs.
1042 self._iterating = False
-> 1043 if self.dispatch_one_batch(iterator):
1044 self._iterating = self._original_iterator is not None
1046 while self.dispatch_one_batch(iterator):
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:861, in Parallel.dispatch_one_batch(self, iterator)
859 return False
860 else:
--> 861 self._dispatch(tasks)
862 return True
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:779, in Parallel._dispatch(self, batch)
777 with self._lock:
778 job_idx = len(self._jobs)
--> 779 job = self._backend.apply_async(batch, callback=cb)
780 # A job can complete so quickly than its callback is
781 # called before we get here, causing self._jobs to
782 # grow. To ensure correct results ordering, .insert is
783 # used (rather than .append) in the following line
784 self._jobs.insert(job_idx, job)
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py:572, in ImmediateResult.__init__(self, batch)
569 def __init__(self, batch):
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:262, in BatchedCalls.__call__(self)
258 def __call__(self):
259 # Set the default nested backend to self._backend but do not set the
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
File ~/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py:262, in <listcomp>(.0)
258 def __call__(self):
259 # Set the default nested backend to self._backend but do not set the
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/fixes.py:117, in _FuncWrapper.__call__(self, *args, **kwargs)
115 def __call__(self, *args, **kwargs):
116 with config_context(**self.config):
--> 117 return self.function(*args, **kwargs)
File ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py:1050, in _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method)
1048 estimator.fit(X_train, **fit_params)
1049 else:
-> 1050 estimator.fit(X_train, y_train, **fit_params)
1051 func = getattr(estimator, method)
1052 predictions = func(X_test)
File ~/opt/anaconda3/lib/python3.9/site-packages/keras/wrappers/scikit_learn.py:248, in KerasClassifier.fit(self, x, y, **kwargs)
246 raise ValueError("Invalid shape for y: " + str(y.shape))
247 self.n_classes_ = len(self.classes_)
--> 248 return super().fit(x, y, **kwargs)
File ~/opt/anaconda3/lib/python3.9/site-packages/keras/wrappers/scikit_learn.py:175, in BaseWrapper.fit(self, x, y, **kwargs)
172 fit_args = copy.deepcopy(self.filter_sk_params(Sequential.fit))
173 fit_args.update(kwargs)
--> 175 history = self.model.fit(x, y, **fit_args)
177 return history
File ~/opt/anaconda3/lib/python3.9/site-packages/keras/utils/traceback_utils.py:70, in filter_traceback.<locals>.error_handler(*args, **kwargs)
67 filtered_tb = _process_traceback_frames(e.__traceback__)
68 # To get the full stack trace, call:
69 # `tf.debugging.disable_traceback_filtering()`
---> 70 raise e.with_traceback(filtered_tb) from None
71 finally:
72 del filtered_tb
File /var/folders/v0/88rsx9gs4ql39bfw7jvs94c80000gn/T/__autograph_generated_filel6i44dds.py:15, in outer_factory.<locals>.inner_factory.<locals>.tf__train_function(iterator)
13 try:
14 do_return = True
---> 15 retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
16 except:
17 do_return = False
ValueError: in user code:
File "/Users/User/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1160, in train_function *
return step_function(self, iterator)
File "/Users/User/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1146, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/Users/User/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1135, in run_step **
outputs = model.train_step(data)
File "/Users/User/opt/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 993, in train_step
y_pred = self(x, training=True)
File "/Users/User/opt/anaconda3/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/Users/User/opt/anaconda3/lib/python3.9/site-packages/keras/engine/input_spec.py", line 295, in assert_input_compatibility
raise ValueError(
ValueError: Input 0 of layer "sequential_5" is incompatible with the layer: expected shape=(None, 224, 224, 3), found shape=(None, 2)
This is caused by the mismatch between the input shape my CNN was trained on and the shape of my prediction matrix X_test_combined. If this approach won't work, please help me in any way you can: I want to use StackingCVClassifier on different datasets without having to combine them.
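One direction I am considering, hinted at by the commented-out meta_model = RandomForestClassifier() line in the traceback above: skip StackingCVClassifier entirely and fit a plain meta-classifier on the stacked prediction columns, since StackingCVClassifier refits its base classifiers on whatever X it receives. A minimal, untested sketch reusing the variables defined above:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import LeaveOneOut

# X_test_combined holds one probability column per base model (shape: n_samples, 2),
# so the meta-model never touches the raw image or tabular features
loo = LeaveOneOut()
scores = []
for train_index, test_index in loo.split(X_test_combined):
    meta_model = RandomForestClassifier()
    meta_model.fit(X_test_combined[train_index], y_test_tabular[train_index])
    y_pred = meta_model.predict(X_test_combined[test_index])
    scores.append(accuracy_score(y_test_tabular[test_index], y_pred))
print("LOOCV Accuracy:", np.mean(scores))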
Related
I am using Jupyter Notebook on a Windows 10 PC.
I am trying to run 3 models held in a dictionary (LogisticRegression, KNeighborsClassifier, and RandomForestClassifier).
I went into the miniconda environment and started Jupyter Notebook.
The dataset is properly loaded, split into 'X' and 'y', and fitted properly.
I am trying to predict target values from a dataset by using the 3 types of models packed in the dictionary.
I then created a function with a loop that cycles through the dictionary and stores each model's answer in the empty dictionary model_scores = {}.
When I try to run the function, I get the error provided below.
My code is:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# models from scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
# model evaluations
from sklearn.model_selection import train_test_split, cross_val_score
models = {"Logistic Regression": LogisticRegression(),
"KNN": KNeighborsClassifier(),
"Random Forest": RandomForestClassifier()}
def fit_and_score(models, X_train, X_test, y_train, y_test):
np.random.seed(42)
model_scores = {}
for name, model in models.items():
model.fit(X_train, y_train)
model_scores[name] = model.score(X_test, y_test)
return model_scores
model_scores = fit_and_score(models=models,
X_train=X_train,
X_test=X_test,
y_train=y_train,
y_test=y_test)
The error I get:
AttributeError: 'str' object has no attribute 'decode'
Full error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-68-45006ce4a749> in <module>
1 # run the function
2
----> 3 model_scores = fit_and_score(models=models,
4 X_train=X_train,
5 X_test=X_test,
<ipython-input-67-005647140cd3> in fit_and_score(models, X_train, X_test, y_train, y_test)
22 for name, model in models.items():
23 # fit the model to the data
---> 24 model.fit(X_train, y_train)
25 # evaluate the model and append its score to model_scores
26 model_scores[name] = model.score(X_test, y_test)
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\sklearn\linear_model\_logistic.py in fit(self, X, y, sample_weight)
1405 else:
1406 prefer = 'processes'
-> 1407 fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
1408 **_joblib_parallel_args(prefer=prefer))(
1409 path_func(X, y, pos_class=class_, Cs=[C_],
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\joblib\parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\sklearn\linear_model\_logistic.py in _logistic_regression_path(X, y, pos_class, Cs, fit_intercept, max_iter, tol, verbose, solver, coef, class_weight, dual, penalty, intercept_scaling, multi_class, random_state, check_input, max_squared_sum, sample_weight, l1_ratio)
760 options={"iprint": iprint, "gtol": tol, "maxiter": max_iter}
761 )
--> 762 n_iter_i = _check_optimize_result(
763 solver, opt_res, max_iter,
764 extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
F:\UdemyProjects\DataScienceZTM\heart-disease-project\env\lib\site-packages\sklearn\utils\optimize.py in _check_optimize_result(solver, result, max_iter, extra_warning_msg)
241 " https://scikit-learn.org/stable/modules/"
242 "preprocessing.html"
--> 243 ).format(solver, result.status, result.message.decode("latin1"))
244 if extra_warning_msg is not None:
245 warning_msg += "\n" + extra_warning_msg
AttributeError: 'str' object has no attribute 'decode'
After importing train_test_split from sklearn.model_selection, you are using X_train, X_test, y_train, and y_test without defining them.
Here I used a random Social_Network_Ads.csv dataset found on the internet, and it works without showing any error:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
from sklearn.model_selection import train_test_split,cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
def fit_and_score(models, X_train, X_test, y_train, y_test):
    np.random.seed(42)
    model_scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        model_scores[name] = model.score(X_test, y_test)
    return model_scores

model_scores = fit_and_score(models=models,
                             X_train=X_train,
                             X_test=X_test,
                             y_train=y_train,
                             y_test=y_test)
I am trying to run a simple cross-validation with scikit-learn's cross_validate, and I get the following TypeError:
TypeError Traceback (most recent call last)
<ipython-input-59-0471fb78d8f0> in <module>
5
6 model = NMF(n_components=185, init='random', random_state=0)
----> 7 scores = cross_validate(model, df4_array, cv=5, scoring=('neg_mean_squared_error'))
8 W = model.fit_transform(df4_array)
9 H = model.components_
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
234 return_times=True, return_estimator=return_estimator,
235 error_score=error_score)
--> 236 for train, test in cv.split(X, y, groups))
237
238 zipped_scores = list(zip(*scores))
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
542 else:
543 fit_time = time.time() - start_time
--> 544 test_scores = _score(estimator, X_test, y_test, scorer)
545 score_time = time.time() - start_time - fit_time
546 if return_train_score:
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
587 scorer = _MultimetricScorer(**scorer)
588 if y_test is None:
--> 589 scores = scorer(estimator, X_test)
590 else:
591 scores = scorer(estimator, X_test, y_test)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
85 if isinstance(scorer, _BaseScorer):
86 score = scorer._score(cached_call, estimator,
---> 87 *args, **kwargs)
88 else:
89 score = scorer(estimator, *args, **kwargs)
TypeError: _score() missing 1 required positional argument: 'y_true'
I do not know why, because I am building a recommender system for the university using non-negative matrix factorization, which is an unsupervised method... shouldn't the code work without y?
Code:
from sklearn.decomposition import NMF
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
model = NMF(n_components=185, init='random', random_state=0)
scores = cross_validate(model, df4_array, cv=5, scoring=('neg_mean_squared_error'))
W = model.fit_transform(df4_array)
H = model.components_
So I think the problem is with the scoring method.
Any form of MSE (mean_squared_error) is a metric for regression-type problems, since its formula involves the dependent variable (y) of a regression.
I would suggest looking through this link for the different unsupervised scoring methods:
https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
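For instance, here is a minimal sketch of a custom callable scorer that works without y, scoring each fold by negative reconstruction error (reconstruction_error is a name I made up, and it assumes df4_array is the non-negative 2-D array from the question):
import numpy as np
from sklearn.decomposition import NMF
from sklearn.model_selection import cross_validate

def reconstruction_error(estimator, X, y=None):
    # measure how well the learned factors rebuild the held-out rows
    W = estimator.transform(X)
    return -np.mean((X - W @ estimator.components_) ** 2)  # negated: higher is better

model = NMF(n_components=185, init='random', random_state=0)
scores = cross_validate(model, df4_array, cv=5, scoring=reconstruction_error)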
Hope this helped.
May the force be with you.
I'm trying to use GridSearchCV with KMeans clustering to explore the optimal number of clusters to use in order to get the best results on a classification problem.
I've got the following code:
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
faces = fetch_olivetti_faces()
X_data, y_data = faces.data, faces.target
log_reg = LogisticRegression()
split = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=42)
for train_index, test_index in split.split(X_data, y_data):
    X_train_set, y_train_set = X_data[train_index], y_data[train_index]
    X_test_set, y_test_set = X_data[test_index], y_data[test_index]

pipeline = Pipeline([
    ('kmeans', KMeans(n_clusters=30)),
    ('log_reg', LogisticRegression())
])

cluster_grid = dict(n_clusters=range(2, 100))
grid = GridSearchCV(pipeline, cluster_grid)
grid.fit(X_train_set, y_train_set, cv=5, verbose=2)
Here's the entire traceback:
-------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-42-80e6a3932897> in <module>
----> 1 grid.fit(X_train_set, y_train_set, cv=5, verbose=2)
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
686 return results
687
--> 688 self._run_search(evaluate_candidates)
689
690 # For multi-metric evaluation, store the best_index_, best_params_ and
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1147 def _run_search(self, evaluate_candidates):
1148 """Search all candidates in param_grid"""
-> 1149 evaluate_candidates(ParameterGrid(self.param_grid))
1150
1151
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
665 for parameters, (train, test)
666 in product(candidate_params,
--> 667 cv.split(X, y, groups)))
668
669 if len(out) < 1:
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
501 train_scores = {}
502 if parameters is not None:
--> 503 estimator.set_params(**parameters)
504
505 start_time = time.time()
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in set_params(self, **kwargs)
162 self
163 """
--> 164 self._set_params('steps', **kwargs)
165 return self
166
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in _set_params(self, attr, **params)
48 self._replace_estimator(attr, name, params.pop(name))
49 # 3. Step parameters and other initialisation arguments
---> 50 super().set_params(**params)
51 return self
52
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/base.py in set_params(self, **params)
222 'Check the list of available parameters '
223 'with `estimator.get_params().keys()`.' %
--> 224 (key, self))
225
226 if delim:
ValueError: Invalid parameter n_clusters for estimator Pipeline(memory=None,
steps=[('kmeans',
KMeans(algorithm='auto', copy_x=True, init='k-means++',
max_iter=300, n_clusters=30, n_init=10, n_jobs=None,
precompute_distances='auto', random_state=None,
tol=0.0001, verbose=0)),
('log_reg',
LogisticRegression(C=1.0, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None,
penalty='l2', random_state=None,
solver='warn', tol=0.0001, verbose=0,
warm_start=False))],
verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.
I have no idea what the heck is going on... I'm not sure how to interpret this error message, but my parameter grid doesn't seem to be out of whack. PLEASE HELP!
When you are using a pipeline, you need to give the parameters as follows:
cluster_grid = {
    'kmeans__n_clusters': range(2, 100)
}
# adding n_jobs to run in parallel
grid = GridSearchCV(pipeline, cluster_grid, n_jobs=-1)
where kmeans is taken from ('kmeans', KMeans()).
So your code should look like the following:
pipeline = Pipeline([
    ('kmeans', KMeans()),
    ('log_reg', LogisticRegression())
])
cluster_grid = {
    'kmeans__n_clusters': range(2, 100)
}
# adding n_jobs to run in parallel
grid = GridSearchCV(pipeline, cluster_grid, n_jobs=-1)
The parameter n_clusters is only applicable to KMeans, not to LogisticRegression.
Specify in your cluster_grid that the grid parameter is only meant for KMeans:
# Parameters of pipelines can be set using '__'-separated parameter names:
cluster_grid = dict(kmeans__n_clusters=range(2, 100))
Reference : https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
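Putting it together, a minimal sketch of the corrected search, reusing X_train_set and y_train_set from the question (note that cv, verbose, and n_jobs belong in the GridSearchCV constructor, not in grid.fit):
pipeline = Pipeline([
    ('kmeans', KMeans(n_clusters=30)),
    ('log_reg', LogisticRegression())
])

# prefix the parameter with the pipeline step name
cluster_grid = dict(kmeans__n_clusters=range(2, 100))

grid = GridSearchCV(pipeline, cluster_grid, cv=5, verbose=2, n_jobs=-1)
grid.fit(X_train_set, y_train_set)
print(grid.best_params_, grid.best_score_)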
I am finding the optimal value of the hyperparameter alpha for my Multinomial Naive Bayes model, using cross-validation with neg_log_loss as the metric. I wrote this code:
alphas = list(range(1, 500))

# perform k-fold cross-validation for different metrics
def cross_val(metric):
    MSE = []
    cv_scores = []
    training_scores = []
    for alpha in alphas:
        naive_bayes = MultinomialNB(alpha=alpha)
        scores = cross_val_score(naive_bayes, x_train_counts, y_train, cv=20, scoring='neg_log_loss')
        # score() returns the mean accuracy on the given test data and labels
        scores_training = naive_bayes.fit(x_train_counts, y_train).score(x_train_counts, y_train)
        cv_scores.append(scores.mean())
        training_scores.append(scores_training)
    # changing to misclassification error
    MSE = [1 - x for x in cv_scores]
    # determining best alpha
    optimal_alpha = alphas[MSE.index(min(MSE))]
    print('\nThe optimal value of alpha for %s is %f' % (metric, optimal_alpha))
    return optimal_alpha

optimal_alpha = cross_val('neg_log_loss')
The above code was initially working. Now it's throwing the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-43-facbaa3537ca> in <module>()
----> 1 optimal_alpha = cross_val('neg_log_loss')
2 prediction(optimal_alpha, 'neg_log_loss')
<ipython-input-41-ff0a9191d45c> in cross_val(metric)
13 for alpha in alphas:
14 naive_bayes = MultinomialNB(alpha=alpha)
---> 15 scores = cross_val_score(naive_bayes, x_train_counts, y_train, cv=20, scoring='neg_log_loss')
16
17 #score() returns the mean accuracy on the given test data and labels
~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/cross_validation.py in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
1579 train, test, verbose, None,
1580 fit_params)
-> 1581 for train, test in cv)
1582 return np.array(scores)[:, 0]
1583
~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1692
1693 else:
-> 1694 test_score = _score(estimator, X_test, y_test, scorer)
1695 if return_train_score:
1696 train_score = _score(estimator, X_train, y_train, scorer)
~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/cross_validation.py in _score(estimator, X_test, y_test, scorer)
1749 score = scorer(estimator, X_test)
1750 else:
-> 1751 score = scorer(estimator, X_test, y_test)
1752 if hasattr(score, 'item'):
1753 try:
~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/metrics/scorer.py in __call__(self, clf, X, y, sample_weight)
142 **self._kwargs)
143 else:
--> 144 return self._sign * self._score_func(y, y_pred, **self._kwargs)
145
146 def _factory_args(self):
~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/sklearn/metrics/classification.py in log_loss(y_true, y_pred, eps, normalize, sample_weight, labels)
1684 "y_true: {2}".format(transformed_labels.shape[1],
1685 y_pred.shape[1],
-> 1686 lb.classes_))
1687 else:
1688 raise ValueError('The number of classes in labels is different '
ValueError: y_true and y_pred contain different number of classes 26, 27. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [ 2 4 5 6 7 8 9 10 11 12 14 15 16 17 19 21 22 23 24 27 29 30 31 32
33 35]
This code worked a few times initially; then it suddenly stopped working. How can I make it work?
Check the shapes of x_train_counts and y_train; there is a mismatch in the number of records between them.
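Separately, the error message itself points to a fix: give log_loss the full label set so a fold whose test split is missing a class can still be scored. A minimal sketch (the name neg_log_loss_all is made up, and it assumes the classic make_scorer API with needs_proba and the x_train_counts / y_train variables from the question):
import numpy as np
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

# tell log_loss about every class up front, as the error message suggests
neg_log_loss_all = make_scorer(log_loss, greater_is_better=False,
                               needs_proba=True, labels=np.unique(y_train))

scores = cross_val_score(MultinomialNB(alpha=1), x_train_counts, y_train,
                         cv=20, scoring=neg_log_loss_all)
Reducing cv so that every class appears in each fold should also make the class counts match.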
I'm attempting to run GridSearchCV for Logistic Regression in sklearn and the code is giving me the following error:
ValueError: X has 21 features per sample; expecting 19
The shapes of the training and testing data are
X_train.shape
(891L, 21L)
X_test.shape
(418L, 21L)
The code I'm using to run GridSearchCV is:
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

logistic = LogisticRegression()
parameters = [{'C': [1.0, 10.0, 100.0, 1000.0],
               'fit_intercept': ['True', 'False'],
               'intercept_scaling': [0, 1, 10, 100, 1000],
               'class_weight': ['auto'],
               'random_state': [26],
               'tol': [0.001, 0.01, 0.1, 1, 10, 100]}]

logistic = GridSearchCV(LogisticRegression(),
                        parameters,
                        cv=3,
                        refit=True,
                        verbose=1)

logistic = logistic.fit(X_train, y_train)
logit_pred = logistic.predict(X_test)
The traceback I'm getting is:
ValueError Traceback (most recent call last)
C:\Code\kaggle\titanic\titanic.py in <module>()
351
352
--> 353 logistic = logistic.fit(X_train, y_train)
354
355 logit_pred = logistic.predict(X_test)
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\grid_search.pyc in fit(self, X, y)
594
595 """
--> 596 return self._fit(X, y, ParameterGrid(self.param_grid))
597
598
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\grid_search.pyc in _fit(self, X, y, parameter_iterable)
376 train, test, self.verbose, parameters,
377 self.fit_params, return_parameters=True)
--> 378 for parameters in parameter_iterable
379 for train, test in cv)
380
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self, iterable)
651 self._iterating = True
652 for function, args, kwargs in iterable:
--> 653 self.dispatch(function, args, kwargs)
654
655 if pre_dispatch == "all" or n_jobs == 1:
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\externals\joblib\parallel.pyc in dispatch(self, func, args, kwargs)
398 """
399 if self._pool is None:
--> 400 job = ImmediateApply(func, args, kwargs)
401 index = len(self._jobs)
402 if not _verbosity_filter(index, self.verbose):
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __init__(self, func, args, kwargs)
136 # Don't delay the application, to avoid keeping the input
137 # arguments in memory
--> 138 self.results = func(*args, **kwargs)
139
140 def get(self):
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters)
1238 else:
1239 estimator.fit(X_train, y_train, **fit_params)
-> 1240 test_score = _score(estimator, X_test, y_test, scorer)
1241 if return_train_score:
1242 train_score = _score(estimator, X_train, y_train, scorer)
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
1294 score = scorer(estimator, X_test)
1295 else:
-> 1296 score = scorer(estimator, X_test, y_test)
1297 if not isinstance(score, numbers.Number):
1298 raise ValueError("scoring must return a number, got %s (%s) instead."
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\metrics\scorer.pyc in _passthrough_scorer(estimator, *args, **kwargs)
174 def _passthrough_scorer(estimator, *args, **kwargs):
175 """Function that wraps estimator.score"""
--> 176 return estimator.score(*args, **kwargs)
177
178
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\base.pyc in score(self, X, y, sample_weight)
289 """
290 from .metrics import accuracy_score
--> 291 return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
292
293
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\linear_model\base.pyc in predict(self, X)
213 Predicted class label per sample.
214 """
--> 215 scores = self.decision_function(X)
216 if len(scores.shape) == 1:
217 indices = (scores > 0).astype(np.int)
C:\Users\User\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\linear_model\base.pyc in decision_function(self, X)
194 if X.shape[1] != n_features:
195 raise ValueError("X has %d features per sample; expecting %d"
--> 196 % (X.shape[1], n_features))
197
198 scores = safe_sparse_dot(X, self.coef_.T,
ValueError: X has 21 features per sample; expecting 19
Why is GridSearchCV expecting a different number of features than the dataset contains?
UPDATE:
Thanks for the response Andy. The datasets are all of type numpy.ndarray with dtype float64:
type(X_train)      type(y_train)      type(X_test)
numpy.ndarray      numpy.ndarray      numpy.ndarray
The steps right before I bring them into sklearn:
train_data = traindf.values
test_data = testdf.values
X_train = train_data[0::, 1::] # training features
y_train = train_data[0::, 0] # training targets
X_test = test_data[0::, 0::] # test features
The next step is the GridSearchCV code I typed above...
UPDATE 2: Link to Data
Here is a link to the datasets
The error is caused by intercept_scaling=0. It looks like a bug in scikit-learn.
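A minimal sketch of the repaired grid with that value dropped; note too that fit_intercept should be the booleans True/False rather than the strings 'True' and 'False' (both strings are truthy, so the original grid never actually disabled the intercept):
parameters = [{'C': [1.0, 10.0, 100.0, 1000.0],
               'fit_intercept': [True, False],           # booleans, not strings
               'intercept_scaling': [1, 10, 100, 1000],  # 0 removed
               'class_weight': ['auto'],
               'random_state': [26],
               'tol': [0.001, 0.01, 0.1, 1, 10, 100]}]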