sklearn VotingClassifier with RandomizedSearchCV gives pickle error - python

I'm trying to get randomized hyperparameter search to work with the voting classifier from sklearn by adapting the example given in the sklearn documentation.
I've seen this minimal working example, but it breaks in many ways using my version of sklearn.
Here is a stripped-down example:
import numpy as np
from sklearn import __version__ as skv
from sklearn.ensemble import RandomForestClassifier as RFClassi
from sklearn.ensemble import HistGradientBoostingClassifier as HGBClassi
from sklearn.tree import DecisionTreeClassifier as DTClassi
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.datasets import load_iris
# Report the installed scikit-learn version.
print(f"sklearn version: {skv}")
# Load the iris data as (features, target); as_frame=True requests pandas objects.
df_X, target = load_iris(return_X_y=True, as_frame=True)
# Names of the ensemble members whose hyperparameters are searched.
ensemble = ['rf','dtree','hgb']
# Candidate hyperparameter values, keyed by ensemble member name.
hy_pa_grid = {
'hgb': dict(learning_rate = list(np.linspace(0.01,0.5,10).round(3))),
'rf':dict(criterion = ['gini', 'entropy']),
'dtree':dict(criterion = ['gini', 'entropy']),
}
# One default-constructed classifier per ensemble member name.
clfs = {'hgb' : HGBClassi(), 'rf': RFClassi(), 'dtree' : DTClassi()}
# NOTE(review): estimators receives the dict's items() view directly
# (a dict_items object), not a list of (name, estimator) tuples.
vc = VotingClassifier(estimators = clfs.items(), voting = 'soft')
# Flatten the grid into "<member>__<parameter>" keys for the search.
params = {
f"{c}__{p}" : hy_pa_grid[c][p]
for c in ensemble
for p in hy_pa_grid[c].keys()
}
# Print each (key, candidate values) pair on its own line.
print("\n".join(map(str,params.items())))
# Randomized search over the voting ensemble; fit() is the call named in the
# posted traceback.
clf = RandomizedSearchCV(estimator = vc, param_distributions = params)
clf.fit(df_X,target)
The output I get is this:
sklearn version: 1.1.3
{'rf__criterion': ['gini', 'entropy'], 'dtree__criterion': ['gini', 'entropy'], 'hgb__learning_rate': [0.01, 0.064, 0.119, 0.173, 0.228, 0.282, 0.337, 0.391, 0.446, 0.5]}
Traceback (most recent call last):
File "vc.py", line 34, in <module>
clf.fit(df_X,target)
File "/home/USER/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py", line 789, in fit
base_estimator = clone(self.estimator)
File "/home/USER/.local/lib/python3.8/site-packages/sklearn/base.py", line 87, in clone
new_object_params[name] = clone(param, safe=False)
File "/home/USER/.local/lib/python3.8/site-packages/sklearn/base.py", line 68, in clone
return copy.deepcopy(estimator)
File "/usr/lib/python3.8/copy.py", line 161, in deepcopy
rv = reductor(4)
TypeError: cannot pickle 'dict_items' object
Any ideas for getting round this? I also tried doing it with GridSearchCV, as in the example, but I get the same error.

Oops, it turns out the problem was in
estimators = clfs.items()
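All was well once I wrapped it in tuple() (list() works just as well) so that it is an actual sequence of (name, estimator) pairs rather than a dict_items view, which cannot be deep-copied.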

Related

Feature selection with cross validation: AttributeError: 'dict' object has no attribute 'fit'

I want to perform feature selection and nested cross validation on a data set. I wrote this script:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, RFECV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pickle
#import neptune.new as neptune
import pandas as pd
import shap
#full_X_train = df
full_X_train,full_y_train = make_classification(n_samples =500,n_features = 20, random_state=1, n_informative=10,n_redundant=10)
# Nested cross-validation: an outer 5-fold split, with feature selection and a
# 3-fold grid search run on each outer training fold.
# NOTE(review): as posted, the body's indentation and the colon ending the def
# line are missing.
# output_plt_file and model_id are not used in the body shown, and the
# accuracies computed at the end are neither printed nor returned.
def run_model_with_grid_search(param_grid={},output_plt_file = 'plt.png',model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='random_forest')
# Outer folds: each iteration holds out one fold for testing.
cv_outer = KFold(n_splits=5,shuffle=True,random_state=1)
for train_ix,test_ix in cv_outer.split(X_train):
split_x_train, split_x_test = X_train[train_ix,:],X_train[test_ix,:] #add in .iloc
split_y_train, split_y_test = y_train[train_ix],y_train[test_ix] #add in .iloc
# Inner folds used by the grid search on the outer training fold.
cv_inner = KFold(n_splits=3,shuffle=True,random_state=1)
model = model_name
#model.set_params(**best_params)
# NOTE(review): rfecv is bound to a plain dict of settings here, not to an
# RFECV object; the .fit() call below is the line named in the posted traceback.
rfecv = {'RFECV Features': {'cv': 5,
'estimator': model,
'step': 1,
'scoring': 'accuracy',
'verbose': 50}}
rfecv.fit(split_x_train,split_y_train)
print(rfecv.n_features_)
# Reduce both folds to the selected features.
X_selected_train = rfecv.transform(split_x_train)
X_selected_test = rfecv.transform(split_x_test)
# Tune the model on the selected training features, scoring by ROC AUC.
search = GridSearchCV(model,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True)
result = search.fit(X_selected_train,split_y_train)
best_model = result.best_estimator_
# Accuracy of the refit best model on the training and held-out folds.
y_pred_train = best_model.predict(X_selected_train)
y_pred_test = best_model.predict(X_selected_test)
accuracy_train = metrics.accuracy_score(split_y_train, y_pred_train)
accuracy_test = metrics.accuracy_score(split_y_test, y_pred_test)
return
# Search grid: only min_samples_leaf is active; the other entries are
# commented out.
param_grid = [{
# 'random_forest_with_hpo_no_fs_geno_class__bootstrap':[True,False],
# 'random_forest_with_hpo_no_fs_geno_class__max_depth':[10,20,30,40,50,60,70,80],
# 'random_forest_with_hpo_no_fs_geno_class__max_features':['auto','sqrt'],
'min_samples_leaf':[1,3,5],
# 'random_forest_with_hpo_no_fs_geno_class__n_estimators':[200,500,700,1000,1500,2000]
}]
# Run with the function's default model and data, overriding only the grid.
run_model_with_grid_search(param_grid=param_grid)
And I receive the error:
File "test3.py", line 83, in <module>
run_model_with_grid_search(param_grid=param_grid)
File "test3.py", line 57, in run_model_with_grid_search
rfecv.fit(split_x_train,split_y_train)
AttributeError: 'dict' object has no attribute 'fit'
Could someone please tell me how to fix this? Thank you.

Implementing GridSearchCV and Pipelines to perform Hyperparameters Tuning for KNN Algorithm

I have been reading about performing hyperparameter tuning for the KNN algorithm, and understood that the best practice for implementing it is to make sure that, for each fold, my dataset is normalized and oversampled using a pipeline (to avoid data leakage and overfitting).
What I'm trying to do is identify the number of neighbors (n_neighbors) that gives me the best accuracy in training. In the code I have set the candidate numbers of neighbors to list(range(1,50)), and the number of cross-validation folds to cv=10.
My code below:
# dataset reading & preprocessing libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
#oversmapling
from imblearn.over_sampling import SMOTE
#KNN Model related Libraries
import cuml
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from cuml.neighbors import KNeighborsClassifier
# Load the dataset.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/dataset/IanDataset.csv")
# Fill missing values with zeros.
df = df.fillna(0)
# Map the byte-literal strings to plain digit strings. The result is assigned
# back to the column instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place form is deprecated in recent pandas
# and does not update df once copy-on-write is enabled.
byte_literal_to_digit = {"b'0'": "0", "b'1'": "1"}
df["command response"] = df["command response"].replace(byte_literal_to_digit)
df["binary result"] = df["binary result"].replace(byte_literal_to_digit)
# Convert the cleaned columns to numeric dtypes so they can be used later.
df["command response"] = pd.to_numeric(df["command response"]).astype(float)
df["binary result"] = pd.to_numeric(df["binary result"]).astype(int)
# Split columns: the first 17 are the features, column 17 is the binary label.
X = df.iloc[:, 0:17]
y_bin = df.iloc[:, 17]
# Split the dataset into train and test sets (80/20) for binary classification.
X_train, X_test, y_bin_train, y_bin_test = train_test_split(X, y_bin, random_state=0, test_size=0.2)
# Build a pipeline that normalizes, oversamples, and then classifies, before
# handing it to GridSearchCV.
# NOTE(review): the metric string is kept exactly as posted; it is the value
# quoted in the ValueError reported below.
pipe = Pipeline([
('normalization', MinMaxScaler()),
('oversampling', SMOTE()),
('classifier', KNeighborsClassifier(metric='eculidean', output='input'))
])
# Grid-search the number of neighbors with 10-fold cross-validation.
neighbors = list(range(1, 50))
# The "classifier__" prefix routes the parameter to the pipeline step named
# 'classifier'.
parameters = {
    'classifier__n_neighbors': neighbors
}
grid_search = GridSearchCV(pipe, parameters, cv=10)
grid_search.fit(X_train, y_bin_train)
print("Best Accuracy: {}".format(grid_search.best_score_))
# Read the winning value from best_params_ under its prefixed key. The
# pipeline's get_params() has no bare 'n_neighbors' entry, so the original
# lookup would raise KeyError.
print("Best num of neighbors: {}".format(grid_search.best_params_['classifier__n_neighbors']))
At the step grid_search.fit(X_train, y_bin_train), the program repeatedly emits the following error:
/usr/local/lib/python3.7/site-packages/sklearn/model_selection/_validation.py:619: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/usr/local/lib/python3.7/dist-packages/imblearn/pipeline.py", line 266, in fit
self._final_estimator.fit(Xt, yt, **fit_params_last_step)
File "/usr/local/lib/python3.7/site-packages/cuml/internals/api_decorators.py", line 409, in inner_with_setters
return func(*args, **kwargs)
File "cuml/neighbors/kneighbors_classifier.pyx", line 176, in cuml.neighbors.kneighbors_classifier.KNeighborsClassifier.fit
File "/usr/local/lib/python3.7/site-packages/cuml/internals/api_decorators.py", line 409, in inner_with_setters
return func(*args, **kwargs)
File "cuml/neighbors/nearest_neighbors.pyx", line 397, in cuml.neighbors.nearest_neighbors.NearestNeighbors.fit
ValueError: Metric is not valid. Use sorted(cuml.neighbors.VALID_METRICSeculidean[brute]) to get valid options.
I'm not sure where this error is coming from. Is it because I'm importing the KNN algorithm from the cuML library instead of sklearn? Or is there something wrong with my Pipeline and GridSearchCV implementation?
This error indicates you've passed an invalid value for the metric parameter (in both scikit-learn and cuML). You've misspelled "euclidean".
import cuml
from sklearn import datasets

from sklearn.preprocessing import MinMaxScaler

from imblearn.over_sampling import SMOTE

from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from cuml.neighbors import KNeighborsClassifier

# Small synthetic classification problem, so the example is self-contained.
X, y = datasets.make_classification(
    n_samples=100
)

# Same pipeline steps as in the question, with the metric spelled correctly.
pipe = Pipeline([
    ('normalization', MinMaxScaler()),
    ('oversampling', SMOTE()),
    ('classifier', KNeighborsClassifier(metric='euclidean', output='input'))
])

# The "classifier__" prefix targets the pipeline step named 'classifier'.
parameters = {
    'classifier__n_neighbors': [1,3,6]
}

grid_search = GridSearchCV(pipe, parameters, cv=2)
grid_search.fit(X, y)
GridSearchCV(cv=2,
estimator=Pipeline(steps=[('normalization', MinMaxScaler()),
('oversampling', SMOTE()),
('classifier', KNeighborsClassifier())]),
param_grid={'classifier__n_neighbors': [1, 3, 6]})

Why is a float required, and how do I fix it?

I have this Decision Tree algorithm:
import sys
from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData
import numpy as np
import pylab as pl
# Train/test features and labels supplied by the makeTerrainData helper.
features_train, labels_train, features_test, labels_test = makeTerrainData()
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Fit a decision tree with default settings.
clf = DecisionTreeClassifier()
clf = clf.fit(features_train, labels_train)
# NOTE(review): predict() is called with the test labels as a second
# positional argument; scikit-learn documents predict as taking the feature
# matrix only. Confirm this is intended.
prediction = clf.predict(features_test, labels_test)
# Accuracy of the predictions against the test labels.
acc = accuracy_score(labels_test, prediction)
# Returns the accuracy rounded to 3 decimals under the key "acc".
# NOTE(review): the function body's indentation is missing as posted.
def submitAccuracies():
return {"acc":round(acc,3)}
When I run it, I get the following error:
Traceback (most recent call last):
File "vm_main.py", line 33, in <module>
import main
File "/tmp/vmuser_yafetikvhw/main.py", line 2, in <module>
import studentMain
File "/tmp/vmuser_yafetikvhw/studentMain.py", line 10, in <module>
student_output = student_code.submitAccuracies()
File "/tmp/vmuser_yafetikvhw/decisionTreeAccuracyQuiz.py", line 34, in submitAccuracies
return float({"acc":round(acc,3)})
TypeError: a float is required
I'm stuck here. I've tried turning my input into a float but still get same error. For example: acc = acc/1.0 or acc = float(acc)
Thanks.
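According to the documentation, accuracy_score returns a score: the fraction of correctly classified samples (float) if normalize is True (the default), otherwise the number of correctly classified samples (int).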
From here:
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
Try this one:
return {"acc":round(float(acc), 3)}

Feature Selection

I tried to do recursive feature selection in scikit learn with following code.
from sklearn import datasets, svm
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import RFE
import numpy as np
# Load the comma-separated file into a 2-D array: columns 0-3 become the
# features and column 4 the target.
input_file_iris = "/home/anuradha/Project/NSL_KDD_master/Modified/iris.csv"
dataset = np.loadtxt(input_file_iris, delimiter=",")
X = dataset[:,0:4]
y = dataset[:,4]
# Base estimator handed to RFE: a one-class SVM with an RBF kernel.
estimator= svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
# Recursive feature elimination, removing one feature per step. The second
# positional argument (3) is presumably n_features_to_select; confirm against
# the installed scikit-learn version.
selector = RFE(estimator,3, step=1)
# fit() is the call that raises the posted RuntimeError.
selector = selector.fit(X,y)
But it gives following error
Traceback (most recent call last):
File "/home/anuradha/PycharmProjects/LearnPython/Scikit-learn/univariate.py", line 30, in <module>
File "/usr/local/lib/python2.7/dist-packages/sklearn/feature_selection/rfe.py", line 131, in fit
return self._fit(X, y)
File "/usr/local/lib/python2.7/dist-packages/sklearn/feature_selection/rfe.py", line 182, in _fit
raise RuntimeError('The classifier does not expose '
RuntimeError: The classifier does not expose "coef_" or
"feature_importances_" attributes
Could someone please help me solve this, or guide me to another solution?
Change your kernel to linear and your code would work.
Besides, svm.OneClassSVM is used for unsupervised outlier detection. Are you sure that you want to use it as the estimator? Perhaps you want to use svm.SVC() instead. See the following link for documentation.
http://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html
Lastly, iris data set is already available in sklearn. You have imported the sklearn.datasets. So you can simply load iris as:
# Load the bundled iris dataset instead of reading it from a CSV file.
iris = datasets.load_iris()
# Feature matrix and target vector taken from the loaded dataset.
X = iris.data
y = iris.target

GridSearch for an estimator inside a OneVsRestClassifier

I want to perform GridSearchCV in a SVC model, but that uses the one-vs-all strategy. For the latter part, I can just do this:
model_to_set = OneVsRestClassifier(SVC(kernel="poly"))
My problem is with the parameters. Let's say I want to try the following values:
parameters = {"C":[1,2,4,8], "kernel":["poly","rbf"],"degree":[1,2,3,4]}
In order to perform GridSearchCV, I should do something like:
cv_generator = StratifiedKFold(y, k=10)
model_tunning = GridSearchCV(model_to_set, param_grid=parameters, score_func=f1_score, n_jobs=1, cv=cv_generator)
However, when I execute it I get:
Traceback (most recent call last):
File "/.../main.py", line 66, in <module>
argclass_sys.set_model_parameters(model_name="SVC", verbose=3, file_path=PATH_ROOT_MODELS)
File "/.../base.py", line 187, in set_model_parameters
model_tunning.fit(self.feature_encoder.transform(self.train_feats), self.label_encoder.transform(self.train_labels))
File "/usr/local/lib/python2.7/dist-packages/sklearn/grid_search.py", line 354, in fit
return self._fit(X, y)
File "/usr/local/lib/python2.7/dist-packages/sklearn/grid_search.py", line 392, in _fit
for clf_params in grid for train, test in cv)
File "/usr/local/lib/python2.7/dist-packages/sklearn/externals/joblib/parallel.py", line 473, in __call__
self.dispatch(function, args, kwargs)
File "/usr/local/lib/python2.7/dist-packages/sklearn/externals/joblib/parallel.py", line 296, in dispatch
job = ImmediateApply(func, args, kwargs)
File "/usr/local/lib/python2.7/dist-packages/sklearn/externals/joblib/parallel.py", line 124, in __init__
self.results = func(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/sklearn/grid_search.py", line 85, in fit_grid_point
clf.set_params(**clf_params)
File "/usr/local/lib/python2.7/dist-packages/sklearn/base.py", line 241, in set_params
% (key, self.__class__.__name__))
ValueError: Invalid parameter kernel for estimator OneVsRestClassifier
Basically, since the SVC is inside a OneVsRestClassifier and that's the estimator I send to the GridSearchCV, the SVC's parameters can't be accessed.
In order to accomplish what I want, I see two solutions:
When creating the SVC, somehow tell it not to use the one-vs-one strategy but the one-vs-all.
Somehow indicate the GridSearchCV that the parameters correspond to the estimator inside the OneVsRestClassifier.
I'm yet to find a way to do any of the mentioned alternatives. Do you know if there's a way to do any of them? Or maybe you could suggest another way to get to the same result?
Thanks!
When you use nested estimators with grid search you can scope the parameters with __ as a separator. In this case the SVC model is stored as an attribute named estimator inside the OneVsRestClassifier model:
from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score
# Python 2 snippet (print statements), written against the old
# sklearn.grid_search module imported above.
iris = load_iris()
# One-vs-rest wrapper around a polynomial-kernel SVC.
model_to_set = OneVsRestClassifier(SVC(kernel="poly"))
# The "estimator__" prefix routes each key to the SVC stored in the wrapper's
# `estimator` attribute.
parameters = {
"estimator__C": [1,2,4,8],
"estimator__kernel": ["poly","rbf"],
"estimator__degree":[1, 2, 3, 4],
}
# Grid search over the wrapped SVC's parameters, scored with f1_score.
model_tunning = GridSearchCV(model_to_set, param_grid=parameters,
score_func=f1_score)
model_tunning.fit(iris.data, iris.target)
# Best cross-validated score and the parameter combination that achieved it.
print model_tunning.best_score_
print model_tunning.best_params_
That yields:
0.973290762737
{'estimator__kernel': 'poly', 'estimator__C': 1, 'estimator__degree': 2}
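For Python 3 (and recent scikit-learn versions, where GridSearchCV is imported from sklearn.model_selection and takes scoring instead of score_func), the following code should be used: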
from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
# Load the iris data and wrap a polynomial-kernel SVC in a one-vs-rest
# meta-estimator.
iris = load_iris()
base_svc = SVC(kernel="poly")
model_to_set = OneVsRestClassifier(base_svc)

# Each keyword carries the "estimator__" prefix so the search reaches the SVC
# stored in the wrapper's `estimator` attribute.
parameters = dict(
    estimator__C=[1, 2, 4, 8],
    estimator__kernel=["poly", "rbf"],
    estimator__degree=[1, 2, 3, 4],
)

# Exhaustive search over the grid, scored by weighted F1.
model_tunning = GridSearchCV(
    model_to_set,
    param_grid=parameters,
    scoring='f1_weighted',
)
model_tunning.fit(iris.data, iris.target)

# Report the best cross-validated score, then the winning parameters.
for outcome in (model_tunning.best_score_, model_tunning.best_params_):
    print(outcome)
# Candidate regularization strengths for the wrapped SGDClassifier; the
# "estimator__" prefix targets the estimator inside OneVsRestClassifier.
param_grid = {"estimator__alpha": [10**-5, 10**-3, 10**-1, 10**1, 10**2]}
# NOTE(review): SGDClassifier is not imported in this snippet, and
# x_train_multilabel / y_train are not defined here; they are presumably
# supplied by the poster's surrounding code.
# NOTE(review): loss='log' is the spelling used by older scikit-learn
# releases; confirm it is accepted by the installed version.
clf = OneVsRestClassifier(SGDClassifier(loss='log',penalty='l1'))
# Two-fold grid search scored by micro-averaged F1, using all available cores.
model = GridSearchCV(clf,param_grid, scoring = 'f1_micro', cv=2,n_jobs=-1)
model.fit(x_train_multilabel, y_train)

Categories