Pipeline: Multiple classifiers? - python

I read following example on Pipelines and GridSearchCV in Python:
http://www.davidsbatista.net/blog/2017/04/01/document_classification/
Logistic Regression:
pipeline = Pipeline([
('tfidf', TfidfVectorizer(stop_words=stop_words)),
('clf', OneVsRestClassifier(LogisticRegression(solver='sag')),
])
parameters = {
'tfidf__max_df': (0.25, 0.5, 0.75),
'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
"clf__estimator__C": [0.01, 0.1, 1],
"clf__estimator__class_weight": ['balanced', None],
}
SVM:
pipeline = Pipeline([
('tfidf', TfidfVectorizer(stop_words=stop_words)),
('clf', OneVsRestClassifier(LinearSVC()),
])
parameters = {
'tfidf__max_df': (0.25, 0.5, 0.75),
'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
"clf__estimator__C": [0.01, 0.1, 1],
"clf__estimator__class_weight": ['balanced', None],
}
Is there a way that Logistic Regression and SVM could be combined into one Pipeline? Say, I have a TfidfVectorizer and like to test against multiple classifiers that each then output the best model/parameters.

Here is an easy way to optimize over any classifier and for each classifier any settings of parameters.
Create a switcher class that works for any estimator
from sklearn.base import BaseEstimator
class ClfSwitcher(BaseEstimator):
def __init__(
self,
estimator = SGDClassifier(),
):
"""
A Custom BaseEstimator that can switch between classifiers.
:param estimator: sklearn object - The classifier
"""
self.estimator = estimator
def fit(self, X, y=None, **kwargs):
self.estimator.fit(X, y)
return self
def predict(self, X, y=None):
return self.estimator.predict(X)
def predict_proba(self, X):
return self.estimator.predict_proba(X)
def score(self, X, y):
return self.estimator.score(X, y)
Now you can pass in anything for the estimator parameter. And you can optimize any parameter for any estimator you pass in as follows:
Perform hyper-parameter optimization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
pipeline = Pipeline([
('tfidf', TfidfVectorizer()),
('clf', ClfSwitcher()),
])
parameters = [
{
'clf__estimator': [SGDClassifier()], # SVM if hinge loss / logreg if log loss
'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
'tfidf__stop_words': ['english', None],
'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
'clf__estimator__max_iter': [50, 80],
'clf__estimator__tol': [1e-4],
'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
},
{
'clf__estimator': [MultinomialNB()],
'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
'tfidf__stop_words': [None],
'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
},
]
gscv = GridSearchCV(pipeline, parameters, cv=5, n_jobs=12, return_train_score=False, verbose=3)
gscv.fit(train_data, train_labels)
How to interpret clf__estimator__loss
clf__estimator__loss is interpreted as the loss parameter for whatever estimator is, where estimator = SGDClassifier() in the top most example and is itself a parameter of clf which is a ClfSwitcher object.

Yes, you can do that by building a wrapper function. The idea is to pass it two dictionaries: the models and the the parameters;
Then you iteratively call the models with all the parameters to test, using GridSearchCV for this.
Check this example, there is added extra functionality so that at the end you output a data frame with the summary of the different models/parameters and different performance scores.
EDIT: It's too much code to paste here, you can check a full working example here:
http://www.davidsbatista.net/blog/2018/02/23/model_optimization/

This is how I did it without a wrapper function.
You can evaluate any number of classifiers. Each one can have multiple parameters for hyperparameter optimization.
The one with best score will be saved to disk using pickle
from sklearn.svm import SVC
from operator import itemgetter
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
#pipeline parameters
parameters = \
[ \
{
'clf': [MultinomialNB()],
'tf-idf__stop_words': ['english', None],
'clf__alpha': [0.001, 0.1, 1, 10, 100]
},
{
'clf': [SVC()],
'tf-idf__stop_words': ['english', None],
'clf__C': [0.001, 0.1, 1, 10, 100, 10e5],
'clf__kernel': ['linear', 'rbf'],
'clf__class_weight': ['balanced'],
'clf__probability': [True]
},
{
'clf': [DecisionTreeClassifier()],
'tf-idf__stop_words': ['english', None],
'clf__criterion': ['gini','entropy'],
'clf__splitter': ['best','random'],
'clf__class_weight':['balanced', None]
}
]
#evaluating multiple classifiers
#based on pipeline parameters
#-------------------------------
result=[]
for params in parameters:
#classifier
clf = params['clf'][0]
#getting arguments by
#popping out classifier
params.pop('clf')
#pipeline
steps = [('tf-idf', TfidfVectorizer()), ('clf',clf)]
#cross validation using
#Grid Search
grid = GridSearchCV(Pipeline(steps), param_grid=params, cv=3)
grid.fit(features, labels)
#storing result
result.append\
(
{
'grid': grid,
'classifier': grid.best_estimator_,
'best score': grid.best_score_,
'best params': grid.best_params_,
'cv': grid.cv
}
)
#sorting result by best score
result = sorted(result, key=itemgetter('best score'),reverse=True)
#saving best classifier
grid = result[0]['grid']
joblib.dump(grid, 'classifier.pickle')

Related

Issue with grid cross validation in decision tree regressor

Let's assume that I have defined a regressor like that
tree = MultiOutputRegressor(DecisionTreeRegressor(random_state=0))
tree.fit(X_train, y_train)
And now I want to do a grid cross validation to optimize the parameter ccp_alpha (I don't know if it is the best parameter to optimize but I take it as example). Thus I do it like that:
alphas = np.arange(0,2,0.1)
pipe_tree = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('tree', tree)], memory = 'tmp')
treeCV = GridSearchCV(pipe_tree, dict( pca__n_components=n_components, tree__ccp_alpha=alphas ), cv=5, scoring ='r2', n_jobs=-1)
start_time = time.time()
treeCV.fit(X_train, y_train)
The problem is that I take this issue:
ValueError: Invalid parameter ccp_alpha for estimator Pipeline(memory='tmp',
steps=[('scaler', StandardScaler()), ('pca', PCA()),
('tree',
MultiOutputRegressor(estimator=DecisionTreeRegressor(random_state=0)))]). Check the list of available parameters with `estimator.get_params().keys()`.
If I use the command tree.get_params().keys() it prints a list of possible parameters to change in my model. I think the problem is this tree__ccp_alpha=alphas in GridSearchCV() command. But whatever change I do, it doesn't work.
I am not sure what is tree in your post, but it seems like a multipleregressor on top of your decision tree. If you set that up correctly it should work. First we define the params:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np
alphas = np.arange(0,2,0.1)
n_components = [3,4,5]
Then set up the steps:
scaler = StandardScaler()
pca = PCA()
from sklearn.multioutput import MultiOutputRegressor
tree = MultiOutputRegressor(DecisionTreeClassifier())
Toy data:
X_train = np.random.normal(0,1,(100,10))
y_train = np.random.binomial(1,0.5,(100,2))
Pipeline:
pipe_tree = Pipeline(steps=[('scaler', scaler), ('pca', pca), ('tree', tree)])
tree.get_params()
{'estimator__ccp_alpha': 0.0,
'estimator__class_weight': None,
'estimator__criterion': 'gini',
'estimator__max_depth': None,
'estimator__max_features': None,
'estimator__max_leaf_nodes': None,
'estimator__min_impurity_decrease': 0.0,
'estimator__min_impurity_split': None,
'estimator__min_samples_leaf': 1,
'estimator__min_samples_split': 2,
'estimator__min_weight_fraction_leaf': 0.0,
'estimator__presort': 'deprecated',
'estimator__random_state': None,
'estimator__splitter': 'best',
'estimator': DecisionTreeClassifier(),
'n_jobs': None}
The param should be estimator__ccp_alpha. So if we append tree before it, with tree__estimator__ccp_alpha = alphas it works:
treeCV = GridSearchCV(pipe_tree, dict( pca__n_components=n_components, tree__estimator__ccp_alpha=alphas ),
cv=5, scoring ='r2', n_jobs=-1)
treeCV.fit(X_train, y_train)
If I use yours:
treeCV = GridSearchCV(pipe_tree, dict( pca__n_components=n_components, tree__ccp_alpha=alphas ),
cv=5, scoring ='r2', n_jobs=-1)
I get the same error

How to perform a preprocessing method on the data with Perceptron in GridSearchCV?

I have already checked this question but the answers didn't help.
I am trying to use a preprocessing method such as StandardScaler and Normalizer with Perceptron in GridSearchCV:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.linear_model import Perceptron
param_grid = [{
'tol': [1e-1, 1e-3, 1e-5],
'penalty': ['l2', 'l1', 'elasticnet'],
'eta0': [0.0001, 0.001, 0.01, 0.1, 1.0]
}]
scoring = {
'AUC-ROC': 'roc_auc',
'Accuracy': 'accuracy',
'AUC-PR': 'average_precision'
}
pipe = Pipeline([('scale', StandardScaler()), ('clf', Perceptron())])
search = GridSearchCV(pipe,
param_grid,
scoring=scoring,
refit='AUC-ROC',
cv=skf,
return_train_score=True)
results = search.fit(Xtrain, ytrain)
When I run the code I get:
ValueError: Invalid parameter class_weight for estimator Pipeline(steps=[('scale', StandardScaler()), ('clf', Perceptron())]). Check the list of available parameters with `estimator.get_params().keys()`.
I think this error is raised as the param_grid provided is not applicable to StandardScaler(). In addition, when I print search.get_params().keys() I get:
dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__scale', 'estimator__clf', 'estimator__scale__copy', 'estimator__scale__with_mean', 'estimator__scale__with_std', 'estimator__clf__alpha', 'estimator__clf__class_weight', 'estimator__clf__early_stopping', 'estimator__clf__eta0', 'estimator__clf__fit_intercept', 'estimator__clf__l1_ratio', 'estimator__clf__max_iter', 'estimator__clf__n_iter_no_change', 'estimator__clf__n_jobs', 'estimator__clf__penalty', 'estimator__clf__random_state', 'estimator__clf__shuffle', 'estimator__clf__tol', 'estimator__clf__validation_fraction', 'estimator__clf__verbose', 'estimator__clf__warm_start', 'estimator', 'n_jobs', 'param_grid', 'pre_dispatch', 'refit', 'return_train_score', 'scoring', 'verbose'])
How do I fix it?
You should specify to which transform in the pipeline the param_grid parameters should be applied:
param_grid = [{
'clf__tol': [1e-1, 1e-3, 1e-5],
'clf__penalty': ['l2', 'l1', 'elasticnet'],
'clf__eta0': [0.0001, 0.001, 0.01, 0.1, 1.0]
}]

How to perform nested Cross Validation (LightGBM Regression) with Bayesian Hyperparameter optimization and TimeSeriesSplit?

I want to do predictions with a Regression model.
I try to optimize my LightGBM model for the best hyperparameters while aiming for the lowest generalization RMSE score without overfitting/underfitting.
All examples I've seen use Classifications and split randomly without awareness for Time Series data + use GridSearch which are all not applicable to my problem.
How can I get bayesian hyperparameter optimization for my final model while using nested CV and TimeSeriesSplit?
My code for simple CV so far:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, Trials
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
... import data via pandas ...
y = df["target"] # predictor y
features = df.drop("target", axis=1).columns
X = df[traffic_features] # features X
days = len(df)- 60 # 2 Months for test data / ~20%
X_train, X_test = X[:days], X[days:]
y_train, y_test = y[:days], y[days:]
# hyperopt
random_state = 42
def lightgbm_cv(params, random_state=random_state, cv=cvTSS, X=X_train, y=y_train):
params = {
'n_estimators': int(params['n_estimators']),
'max_depth': int(params['max_depth']),
'learning_rate': params['learning_rate'],
'min_child_weight': params['min_child_weight'],
'feature_fraction': params['feature_fraction'],
'bagging_fraction': params['bagging_fraction'],
'bagging_freq': int(params['bagging_freq']),
'num_leaves': int(params['num_leaves']),
'max_bin': int(params['max_bin']),
'num_iterations': int(params['num_iterations']),
'objective': 'rmse',
}
# we use this params to create a new LGBM Regressor
model = lgb.LGBMRegressor(random_state=random_state, **params)
# and then conduct the cross validation with the same folds as before
score = -cross_val_score(model, X, y, cv=cv, scoring="neg_root_mean_squared_error", n_jobs=-1).mean()
print(score)
return score
space={
'n_estimators': hp.quniform('n_estimators', 100, 10_000, 1),
'max_depth' : hp.quniform('max_depth', 2, 100, 1),
'learning_rate': hp.loguniform('learning_rate', -5, 2),
'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
'feature_fraction': hp.quniform('feature_fraction', 0.1, 1, 0.1),
'bagging_fraction': hp.quniform('bagging_fraction', 0.1, 1, 0.1),
'bagging_freq': hp.quniform('bagging_freq', 1, 1_000, 1),
"num_leaves": hp.quniform('num_leaves', 10, 1_000, 1),
"max_bin": hp.quniform('max_bin', 10, 2_000, 1),
"num_iterations": hp.quniform('num_iterations', 100, 10_000, 1),
'objective': 'rmse',
#'verbose': 0,
}
# trials will contain logging information
trials = Trials()
cvTSS = TimeSeriesSplit(max_train_size=None, n_splits=10) #
n_iter = 100
best=fmin(fn=lightgbm_cv, # function to optimize
space=space,
algo=tpe.suggest, # optimization, hyperotp will select its parameters automatically
max_evals=n_iter, # maximum number of iterations
trials=trials, # logging
stratified = False,
rstate=np.random.RandomState(random_state) # fixing random state for the reproducibility
)
# computing the score on the test set - some parameters from "space" are missing here, not important atm
model = lgb.LGBMRegressor(random_state=random_state, n_estimators=int(best['n_estimators']),
max_depth=int(best['max_depth']),learning_rate=best['learning_rate'])
model.fit(X_train, y_train)
tpe_test_score = mean_squared_error(y_test, model.predict(X_test), squared=False)
print("Best RMSE {:.3f} params {}".format( lightgbm_cv(best), best))

GridSearchCV - FitFailedWarning: Estimator fit failed

I am running this:
# Hyperparameter tuning - Random Forest #
# Hyperparameters' grid
parameters = {'n_estimators': list(range(100, 250, 25)), 'criterion': ['gini', 'entropy'],
'max_depth': list(range(2, 11, 2)), 'max_features': [0.1, 0.2, 0.3, 0.4, 0.5],
'class_weight': [{0: 1, 1: i} for i in np.arange(1, 4, 0.2).tolist()], 'min_samples_split': list(range(2, 7))}
# Instantiate random forest
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state=0)
# Execute grid search and retrieve the best classifier
from sklearn.model_selection import GridSearchCV
classifiers_grid = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='balanced_accuracy',
cv=5, refit=True, n_jobs=-1)
classifiers_grid.fit(X, y)
and I am receiving this warning:
.../anaconda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py:536:
FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
TypeError: '<' not supported between instances of 'str' and 'int'
Why is this and how can I fix it?
I had similar issue of FitFailedWarning with different details, after many runs I found, the parameter value passing has the error, try
parameters = {'n_estimators': [100,125,150,175,200,225,250],
'criterion': ['gini', 'entropy'],
'max_depth': [2,4,6,8,10],
'max_features': [0.1, 0.2, 0.3, 0.4, 0.5],
'class_weight': [0.2,0.4,0.6,0.8,1.0],
'min_samples_split': [2,3,4,5,6,7]}
This will pass for sure, for me it happened in XGBClassifier, somehow the values datatype mixing up
One more is if the value exceeds the range, for example in XGBClassifier 'subsample' paramerters max value is 1.0, if it is set as 1.1, FitFailedWarning will occur
For me this was giving same error but after removing none from max_dept it is fitting properly.
param_grid={'n_estimators':[100,200,300,400,500],
'criterion':['gini', 'entropy'],
'max_depth':['None',5,10,20,30,40,50,60,70],
'min_samples_split':[5,10,20,25,30,40,50],
'max_features':[ 'sqrt', 'log2'],
'max_leaf_nodes':[5,10,20,25,30,40,50],
'min_samples_leaf':[1,100,200,300,400,500]
}
code which is running properly:
param_grid={'n_estimators':[100,200,300,400,500],
'criterion':['gini', 'entropy'],
'max_depth':[5,10,20,30,40,50,60,70],
'min_samples_split':[5,10,20,25,30,40,50],
'max_features':[ 'sqrt', 'log2'],
'max_leaf_nodes':[5,10,20,25,30,40,50],
'min_samples_leaf':[1,100,200,300,400,500]
}
I too got same error and when I passed hyperparameters as in MachineLearningMastery, I got output without warning...
Try this way if anyone get similar issues...
# grid search logistic regression model on the sonar dataset
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
# load dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
dataframe = read_csv(url, header=None)
# split into input and output elements
data = dataframe.values
X, y = data[:, :-1], data[:, -1]
# define model
model = LogisticRegression()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search space
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=-1, cv=cv)
# execute search
result = search.fit(X, y)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
Make sure the y-variable is an int, not bool or str.
Change your last line of code to make the y series a 0 or 1, for example:
classifiers_grid.fit(X, list(map(int, y)))

Multiple classification models in a scikit pipeline python

I am solving a binary classification problem over some text documents using Python and implementing the scikit-learn library, and I wish to try different models to compare and contrast results - mainly using a Naive Bayes Classifier, SVM with K-Fold CV, and CV=5. I am finding a difficulty in combining all of the methods into one pipeline, given that the latter two models use gridSearchCV(). I cannot have multiple Pipelines running during a single implementation due to concurrency issues, hence I need to implement all the different models using one pipeline.
This is what I have till now,
# pipeline for naive bayes
naive_bayes_pipeline = Pipeline([
('bow_transformer', CountVectorizer(analyzer=split_into_lemmas, stop_words='english')),
('tf_idf', TfidfTransformer()),
('classifier', MultinomialNB())
])
# accessing and using the pipelines
naive_bayes = naive_bayes_pipeline.fit(train_data['data'], train_data['gender'])
# pipeline for SVM
svm_pipeline = Pipeline([
('bow_transformer', CountVectorizer(analyzer=split_into_lemmas, stop_words='english')),
('tf_idf', TfidfTransformer()),
('classifier', SVC())
])
param_svm = [
{'classifier__C': [1, 10], 'classifier__kernel': ['linear']},
{'classifier__C': [1, 10], 'classifier__gamma': [0.001, 0.0001], 'classifier__kernel': ['rbf']},
]
grid_svm_skf = GridSearchCV(
svm_pipeline, # pipeline from above
param_grid=param_svm, # parameters to tune via cross validation
refit=True, # fit using all data, on the best detected classifier
n_jobs=-1, # number of cores to use for parallelization; -1 uses "all cores"
scoring='accuracy',
cv=StratifiedKFold(train_data['gender'], n_folds=5), # using StratifiedKFold CV with 5 folds
)
svm_skf = grid_svm_skf.fit(train_data['data'], train_data['gender'])
predictions_svm_skf = svm_skf.predict(test_data['data'])
EDIT 1:
The second pipeline is the only pipeline using gridSearchCV(), and never seems to be executed.
EDIT 2:
Added more code to show gridSearchCV() use.
Consider checking out similar questions here:
Compare multiple algorithms with sklearn pipeline
Pipeline: Multiple classifiers?
To summarize,
Here is an easy way to optimize over any classifier and for each classifier any settings of parameters.
Create a switcher class that works for any estimator
from sklearn.base import BaseEstimator
class ClfSwitcher(BaseEstimator):
def __init__(
self,
estimator = SGDClassifier(),
):
"""
A Custom BaseEstimator that can switch between classifiers.
:param estimator: sklearn object - The classifier
"""
self.estimator = estimator
def fit(self, X, y=None, **kwargs):
self.estimator.fit(X, y)
return self
def predict(self, X, y=None):
return self.estimator.predict(X)
def predict_proba(self, X):
return self.estimator.predict_proba(X)
def score(self, X, y):
return self.estimator.score(X, y)
Now you can pass in anything for the estimator parameter. And you can optimize any parameter for any estimator you pass in as follows:
Perform hyper-parameter optimization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
pipeline = Pipeline([
('tfidf', TfidfVectorizer()),
('clf', ClfSwitcher()),
])
parameters = [
{
'clf__estimator': [SGDClassifier()], # SVM if hinge loss / logreg if log loss
'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
'tfidf__stop_words': ['english', None],
'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
'clf__estimator__max_iter': [50, 80],
'clf__estimator__tol': [1e-4],
'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
},
{
'clf__estimator': [MultinomialNB()],
'tfidf__max_df': (0.25, 0.5, 0.75, 1.0),
'tfidf__stop_words': [None],
'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
},
]
gscv = GridSearchCV(pipeline, parameters, cv=5, n_jobs=12, return_train_score=False, verbose=3)
gscv.fit(train_data, train_labels)
How to interpret clf__estimator__loss
clf__estimator__loss is interpreted as the loss parameter for whatever estimator is, where estimator = SGDClassifier() in the top most example and is itself a parameter of clf which is a ClfSwitcher object.

Categories