I have a problem where I'd like to test multiple models that don't all have the same named parameters. How would you use a list of parameters for a pipeline in RandomizedSearchCV like you can use in this example with GridSearchCV?
Example from:
https://scikit-learn.org/stable/auto_examples/compose/plot_compare_reduction.html
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
pipe = Pipeline([
# the reduce_dim stage is populated by the param_grid
('reduce_dim', None),
('classify', LinearSVC())
])
N_FEATURES_OPTIONS = [2, 4, 8]
C_OPTIONS = [1, 10, 100, 1000]
param_grid = [
{
'reduce_dim': [PCA(iterated_power=7), NMF()],
'reduce_dim__n_components': N_FEATURES_OPTIONS,
'classify__C': C_OPTIONS
},
{
'reduce_dim': [SelectKBest(chi2)],
'reduce_dim__k': N_FEATURES_OPTIONS,
'classify__C': C_OPTIONS
},
]
grid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)
digits = load_digits()
grid.fit(digits.data, digits.target)
This is an old issue that was resolved for a while now (not sure starting from which scikit-learn version).
You can now pass a list of dictionaries for RandomizedSearchCV in the param_distributions parameter. Your example code would become:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
pipe = Pipeline([
# the reduce_dim stage is populated by the param_grid
('reduce_dim', None),
('classify', LinearSVC())
])
N_FEATURES_OPTIONS = [2, 4, 8]
C_OPTIONS = [1, 10, 100, 1000]
param_grid = [
{
'reduce_dim': [PCA(iterated_power=7), NMF()],
'reduce_dim__n_components': N_FEATURES_OPTIONS,
'classify__C': C_OPTIONS
},
{
'reduce_dim': [SelectKBest(chi2)],
'reduce_dim__k': N_FEATURES_OPTIONS,
'classify__C': C_OPTIONS
},
]
grid = RandomizedSearchCV(pipe, cv=3, n_jobs=2, param_distributions=param_grid)
digits = load_digits()
grid.fit(digits.data, digits.target)
I'm using sklearn version 0.23.1 .
I have found a way around, that relies on duck-typing, and doesn't get too much in the way.
It relies on passing complete estimators as parameters to the pipeline. We first sample the kind of model, and then its parameters. For that we define two classes that can be sampled :
from sklearn.model_selection import ParameterSampler
class EstimatorSampler:
"""
Class that holds a model and its parameters distribution.
When sampled, the parameters are first sampled and set to the model,
which is returned.
# Arguments
===========
model : sklearn.base.BaseEstimator
param_distributions : dict
Input to ParameterSampler
# Returns
=========
sampled : sklearn.base.BaseEstimator
"""
def __init__(self, model, param_distributions):
self.model = model
self.param_distributions = param_distributions
def rvs(self, random_state=None):
sampled_params = next(iter(
ParameterSampler(self.param_distributions,
n_iter=1,
random_state=random_state)))
return self.model.set_params(**sampled_params)
class ListSampler:
"""
List container that when sampled, returns one of its item,
with probabilities defined by `probs`.
# Arguments
===========
items : 1-D array-like
probs : 1-D array-like of floats
If not None, it should be the same length of `items`
and sum to 1.
# Returns
=========
sampled item
"""
def __init__(self, items, probs=None):
self.items = items
self.probs = probs
def rvs(self, random_state=None):
item = np.random.choice(self.items, p=self.probs)
if hasattr(item, 'rvs'):
return item.rvs(random_state=random_state)
return item
And the rest of the code is defined below.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
pipe = Pipeline([
# the reduce_dim stage is populated by the param_grid
('reduce_dim', None),
('classify', None)
])
N_FEATURES_OPTIONS = [2, 4, 8]
dim_reducers = ListSampler([EstimatorSampler(est, {'n_components': N_FEATURES_OPTIONS})
for est in [PCA(iterated_power=7), NMF()]] +
[EstimatorSampler(SelectKBest(chi2), {'k': N_FEATURES_OPTIONS})])
C_OPTIONS = [1, 10, 100, 1000]
classifiers = EstimatorSampler(LinearSVC(), {'C': C_OPTIONS})
param_dist = {
'reduce_dim': dim_reducers,
'classify': classifiers
}
grid = RandomizedSearchCV(pipe, cv=3, n_jobs=2, scoring='accuracy', param_distributions=param_dist)
digits = load_digits()
grid.fit(digits.data, digits.target)
Hyperopt supports Hyperparameter Tuning across multiple estimators, check this wiki for more details (2.2 A Search Space Example: scikit-learn section).
Check out this post if you want to use sklearn's GridSearch to do that. It suggests an implementation of EstimatorSelectionHelper estimator which can run different estimators, each with its own grid of parameters.
Related
I'm trying to tune or search for parameters for a scoring function in scikit-learn.
For example, in the pipeline below, I first perform feature selection with SelectKBest, which requires a scoring function (e.g., mutual_info_regression), and finally pass the best features to LinearRegression().
I want to tune the hyperparameter n_neighbors in the mutual_info_regression function, which is the scoring function provided to SelectKBest, but it isn't clear to me how I can tune n_neighbors?
Appreciate any help anyone can provide. Thanks!
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.model_selection import GridSearchCV
# generate some data
X = np.random.normal(size=(10, 15))
y = np.random.normal(size=10)
# test scoring function
# default hyperparameter is n_neighbors=3
mutual_info_regression(X, y, n_neighbors=3)
# create pipeline
kbest = SelectKBest(mutual_info_regression) # using default n_neighbors value
pipe = make_pipeline(kbest, LinearRegression())
# how to tune search space for mutual_info_regression n_neighbors?
params = {"selectkbest__score_func": []} # how to define n_neighbors?
grid = GridSearchCV(pipe, params)
Below is a solution I've come up with but I'm not sure if it's the simplest/best way to tune such a hyperparameter.
I've used functools.partial to create partial objects when creating the parameters grid dictionary params, and each object has a different n_neighbors value.
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.model_selection import GridSearchCV
from functools import partial # to create partial objects
X = np.random.normal(size=(10, 15))
y = np.random.normal(size=10)
kbest = SelectKBest(mutual_info_regression)
pipe = make_pipeline(kbest, LinearRegression())
# use functools.partial
params = {
"selectkbest__score_func": [
partial(mutual_info_regression, n_neighbors=n) for n in range(1, 5)
]
}
grid = GridSearchCV(pipe, params)
I'm trying to use random forest with grid search but this error shows up
ValueError: Invalid parameter classifier for estimator Pipeline(steps=[('tfidf_vectorizer', TfidfVectorizer()),
('rf_classifier', RandomForestClassifier())]).
Check the list of available parameters with `estimator.get_params().keys()`.
import numpy as np # linear algebra
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import pipeline,ensemble,preprocessing,feature_extraction,metrics
train=pd.read_json('cleaned_data1')
#split dataset into X , Y
X=train.iloc[:,0]
Y=train.iloc[:,2]
estimators=pipeline.Pipeline([
('tfidf_vectorizer', feature_extraction.text.TfidfVectorizer(lowercase=True)),
('rf_classifier', ensemble.RandomForestClassifier())
])
print(estimators.get_params().keys())
params = {"classifier__max_depth": [3, None],
"classifier__max_features": [1, 3, 10],
"classifier__min_samples_split": [1, 3, 10],
"classifier__min_samples_leaf": [1, 3, 10],
# "bootstrap": [True, False],
"classifier__criterion": ["gini", "entropy"]}
X_train,X_test,y_train,y_test=train_test_split(X,Y, test_size=0.2)
rf_classifier=GridSearchCV(estimators,params, cv=10 , n_jobs=-1 ,scoring='accuracy',iid=True)
rf_classifier.fit(X_train,y_train)
y_pred=rf_classifier.predict(X_test)
metrics.confusion_matrix(y_test,y_pred)
print(metrics.accuracy_score(y_test,y_pred))
I've tried to add those params
param_grid = {
'n_estimators': [200, 500],
'max_features': ['auto', 'sqrt', 'log2'],
'max_depth' : [4,5,6,7,8],
'criterion' :['gini', 'entropy']
}
but still the same error
Please ensure that when you reference something in the pipeline, you use the same naming convention when you are initializing a parameter grid.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Define a pipeline to search for the best combination of PCA truncation
# and classifier regularization.
pca = PCA()
# set the tolerance to a large value to make the example faster
logistic = LogisticRegression(max_iter=10000, tol=0.1)
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
X_digits, y_digits = datasets.load_digits(return_X_y=True)
# Parameters of pipelines can be set using ‘__’ separated parameter names:
param_grid = {
'pca__n_components': [5, 15, 30, 45, 64],
'logistic__C': np.logspace(-4, 4, 4),
}
search = GridSearchCV(pipe, param_grid, n_jobs=-1)
search.fit(X_digits, y_digits)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)
In this example, we reference LogisticRegression model as 'logistic'. Also on a side note, please note that for RandomForestClassifiers, a value of min_samples_split = 1 is not possible and will result in an error.
This is from the sklearn documentation
Where you have called the random forest ensemble 'rf_classifier' within the pipeline, you should rename this to 'classifier' which should solve the issue.
The params look for something named 'classifier' in the pipeline so they can apply themselves however at current there is nothing named this and therefore this error is thrown.
If you want (I'm not sure if this will work but worth testing), you could change "classifier__" in the params list to "rf_classifier__" to see if the params will then recognise the passed classifier.
I need to perform leave-one-out cross validation of RF model.
I successfully built a model with high predictive ability.
Now I need to perform LOO test prior to the publication.
Here is my code:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
FC_data = pd.read_excel('C:\\Users\\Dre\\Desktop\\My Papers\\Furocoumarins_paper_2018\\Furocoumarins_NEW1.xlsx', index_col=0)
FC_data.head()
# Create correlation matrix
corr_matrix = FC_data.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
# Drop features
FC_data1 = FC_data.drop(FC_data[to_drop], axis=1)
y = FC_data1.LogFiT
X = FC_data1.drop(['LogFiT', 'LogS'], axis=1)
X_train = X.drop(["3-Acetoisopseudopsoralen", "3-Carbethoxypsoralen", "4,4'-Dimethylangelicin",
"4,7,4'-Trimethylallopsoralen", "Psoralen"], axis=0)
X_train.head(21)
y_train = y.drop(["3-Acetoisopseudopsoralen", "3-Carbethoxypsoralen", "4,4'-Dimethylangelicin",
"4,7,4'-Trimethylallopsoralen", "Psoralen"], axis=0)
y_train.head(21)
X_test = X.loc[["3-Acetoisopseudopsoralen", "3-Carbethoxypsoralen", "4,4'-Dimethylangelicin",
"4,7,4'-Trimethylallopsoralen", "Psoralen"]]
X_test.head(5)
y_test = y.loc[["3-Acetoisopseudopsoralen", "3-Carbethoxypsoralen", "4,4'-Dimethylangelicin",
"4,7,4'-Trimethylallopsoralen", "Psoralen"]]
y_test.head(5)
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
randomforest = RandomForestRegressor(n_jobs=-1)
selector = SelectFromModel(randomforest)
features_important = selector.fit_transform(X_train, y_train)
model = randomforest.fit(features_important, y_train)
from sklearn.model_selection import GridSearchCV
clf_rf = RandomForestRegressor()
parameters = {"n_estimators":[1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 40, 50, 100], "max_depth":[1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 40, 50, 100]}
grid_search_cv_clf = GridSearchCV(clf_rf, parameters, cv=5)
grid_search_cv_clf.fit(features_important, y_train)
from sklearn.metrics import r2_score
y_pred = grid_search_cv_clf.predict(features_important)
r2_score(y_train, y_pred)
grid_search_cv_clf.best_params_
best_clf = grid_search_cv_clf.best_estimator_
X_test_filtered = X_test.iloc[:,selector.get_support()]
best_clf.score(X_test_filtered, y_test)
feature_importances = best_clf.feature_importances_
feature_importances_df = pd.DataFrame({'features': X_test_filtered.columns.values,
'feature_importances':feature_importances})
importances = feature_importances_df.sort_values('feature_importances', ascending=False)
importances.head(25)
Now I need q2 value.
Finally, I wrote this code and got a reasonably high score 0.9071543776303185
.
from sklearn.model_selection import LeaveOneOut
parameters = {"n_estimators":[4], "max_depth":[20]}
loo_clf = GridSearchCV(best_clf, parameters, cv=LeaveOneOut())
loo_clf.fit(features_important, y_train)
loo_clf.score(features_important, y_train)
I'm not sure if it is q2 or not. How do you think?
I also decided to obtain 5-fold cross-validation score. However, it gives ridiculous values like, for example: -36.58997717, 0.76801832, -1.59900448, 0.1834304 , -2.38256389 and a mean of -7.924019361863889.
from sklearn.model_selection import cross_val_score
cvs=cross_val_score(best_clf, features_important, y_train)
mean_cross_val_score = cvs.mean()
mean_cross_val_score
Probably, there is a way to fix it?
You should not run the hyper-parameters search before to make the model evaluation. Instead, you should the 2 cross-validations, otherwise, you are leaking some information. To know more about this, you should look at the following example from the scikit-learn documentation: https://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html#sphx-glr-auto-examples-model-selection-plot-nested-cross-validation-iris-py
Therefore, in your particular use-case, you should use: GridSearchCV, SelectFromModel, and cross_val_score:
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
X, y = make_regression(n_samples=100)
feature_selector = SelectFromModel(
RandomForestRegressor(n_jobs=-1), threshold="mean"
)
pipe = make_pipeline(
feature_selector, RandomForestRegressor(n_jobs=-1)
)
param_grid = {
# define the grid of the random-forest for the feature selection
"selectfrommodel__estimator__n_estimators": [10, 20],
"selectfrommodel__estimator__max_depth": [3, 5],
# define the grid of the random-forest for the prediction
"randomforestregressor__n_estimators": [10, 20],
"randomforestregressor__max_depth": [5, 8],
}
grid_search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1, cv=3)
# You can use the LOO in this way. Be aware that this not a good practise,
# it leads to large variance when evaluating your model.
# scores = cross_val_score(pipe, X, y, cv=LeaveOneOut(), error_score='raise')
scores = cross_val_score(pipe, X, y, cv=2, error_score='raise')
score.mean()
You need to specify the scoring and the cv arguments.
Use this:
from sklearn.model_selection import cross_val_score
mycv = LeaveOneOut()
cvs=cross_val_score(best_clf, features_important, y_train, scoring='r2',cv = mycv)
mean_cross_val_score = cvs.mean()
print(mean_cross_val_score)
This will return the mean cross-validated R2 score using LOOCV.
For more scoring options see here: https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values
So the basic requirement is that, I get a dictionary of models from user, and a dictionary of their hyper parameters and give a report. Currently goal is for binary classification, but this can be extended later.
This is what I am currently doing:
import numpy as np
import pandas as pd
# import pandas_profiling as pp
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, make_scorer
from sklearn import datasets
# import joblib
import warnings
warnings.filterwarnings('ignore')
cancer = datasets.load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
target = df['target']
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='target', axis=1), target, test_size=0.4, random_state=13, stratify=target)
def build_model(model_name, model_class, params=None):
"""
return model instance
"""
if 'Ridge' in model_name:
model = model_class(penalty='l2')
elif 'Lasso' in model_name:
model = model_class(penalty='l1')
elif 'Ensemble' in model_name:
model = model_class(estimators=[('rf', RandomForestClassifier()), ('gbm', GradientBoostingClassifier())], voting='hard')
else:
model = model_class()
if params is not None:
print('Custom Model Parameters provided. Implementing Randomized Search for {} model'.format(model_name))
rscv = RandomizedSearchCV(estimator=model, param_distributions=params[model_name],
random_state=22, n_iter=10, cv=5, verbose=1, n_jobs=-1,
scoring=make_scorer(f1_score), error_score=0.0)
return rscv
print('No model parameters provided. Using sklearn default values for {} model'.format(model_name))
return model
def fit_model(model_name, model_instance, xTrain, yTrain):
"""
fit model
"""
if model_name == 'SVM':
scaler = StandardScaler()
model = model_instance.fit(scaler.fit_transform(xTrain), yTrain)
else:
model = model_instance.fit(xTrain, yTrain)
return model
def predict_vals(fitted_model, xTest):
"""
predict and return vals
"""
if model_name == 'SVM':
scaler = StandardScaler()
y_prediction = fitted_model.predict(scaler.fit_transform(xTest))
else:
y_prediction = fitted_model.predict(xTest)
return y_prediction
def get_metrics(yTest, y_prediction):
"""
get metrics after getting prediction
"""
return [recall_score(yTest, y_prediction),
precision_score(yTest, y_prediction),
f1_score(yTest, y_prediction),
roc_auc_score(yTest, y_prediction)]
def model_report(list_of_metrics):
"""
add metrics to df, return df
"""
df = pd.DataFrame(list_of_metrics, columns=['Model', 'Recall', 'Precision', 'f1', 'roc_auc'])
df = df.round(3)
return df
models = {
'Logistic Regression Ridge': LogisticRegression,
'Logistic Regression Lasso': LogisticRegression,
'Random Forest': RandomForestClassifier,
'SVM': SVC,
'GBM': GradientBoostingClassifier,
'EnsembleRFGBM': VotingClassifier
}
model_parameters = {
'SVM': {
'C': np.random.uniform(50, 1, [25]),#[1, 10, 100, 1000],
'class_weight': ['balanced'],
'gamma': [0.0001, 0.001],
'kernel': ['linear']
},
'Random Forest': {
'n_estimators': [5, 10, 50, 100, 200],
'max_depth': [3, 5, 10, 20, 40],
'criterion': ['gini', 'entropy'],
'bootstrap': [True, False],
'min_samples_leaf': [np.random.randint(1,10)]
},
'Logistic Regression Ridge': {
'C': np.random.rand(25),
'class_weight': ['balanced']
},
'Logistic Regression Lasso': {
'C': np.random.rand(25),
'class_weight': ['balanced']
},
'GBM': {
'n_estimators': [10, 50, 100, 200, 500],
'max_depth': [3, 5, 10, None],
'min_samples_leaf': [np.random.randint(1,10)]
},
'EnsembleRFGBM': {
'rf__n_estimators': [5, 10, 50, 100, 200],
'rf__max_depth': [3, 5, 10, 20, 40],
'rf__min_samples_leaf': [np.random.randint(1,10)],
'gbm__n_estimators': [10, 50, 100, 200, 500],
'gbm__max_depth': [3, 5, 10, None],
'gbm__min_samples_leaf': [np.random.randint(1,10)]
}
}
Without parameters I get the following report.
# without parameters
lst = []
for model_name, model_class in models.items():
model_instance = build_model(model_name, model_class)
fitted_model = fit_model(model_name, model_instance, X_train, y_train)
y_predicted = predict_vals(fitted_model, X_test)
metrics = get_metrics(y_test, y_predicted)
lst.append([model_name] + metrics)
model_report(lst)
With parameters given as input
# with parameters
lst = []
for model_name, model_class in models.items():
model_instance = build_model(model_name, model_class, model_parameters)
fitted_model = fit_model(model_name, model_instance, X_train, y_train)
y_predicted = predict_vals(fitted_model, X_test)
metrics = get_metrics(y_test, y_predicted)
lst.append([model_name] + metrics)
model_report(lst)
The task given to me right now is as follows.
Take from user, a dictionary of models, and their parameters. If parameters are not provided, then use defaults of the models.
Give as output the report (as seen in images)
I was told that I should change the functions to classes. And avoid for loops if possible.
My challenges:
How do I change all the functions into a class and methods? Basically my senior wants something like
report.getReport # gives the dataFrame of the report
But the above sounds to me like it can be done in a function as follows (I don't understand why/how a class would be beneficial)
customReport(whatever inputs I'd like to give) # gives df of report
How do I avoid for loops to get through the user inputs for various models? What I thought was that I could use sklearn pipeline, since according to my understanding, pipeline is a series of steps, so from user take the params and models, and execute them as a series of steps. This avoids the for loops.
Something like this
customPipeline = Pipeline([ ('rf', RandomForestClassifier(with relevant params from params dict),
'SVC', SVC(with relevant params from params dict)) ] )
Similar solution I found is here but I would like to avoid for loops as such.
Another related solution here is using a class which can switch between different models. But here I would require that the user be able to give option whether he wants to do Gridsearch/RandomizedSearch/CV/None. My thinking is that I use this class, then inherit this to another class which the user can give input to choose Gridsearch/RandomizedSearch/CV/None etc. I'm not sure if I'm thinking in the right direction.
NOTE A full working solution is desirable (would love it) but not mandatory. It is ok if your answer has a skeleton which can give me a direction how to proceed. I am ok with exploring and learning from it.
I have implemented a working solution. I should have worded my question better. I initially misunderstood how GridsearchCV or RandomizedSearchCV works internally. cv_results_ gives all the results of the grid available. I thought only the best estimator was available to us.
Using this, for each type of model, I took the max rank_test_score, and got the parameters making up the model. In this example, it is 4 models. Now I ran each of those models, i.e. the best combination of parameters for each model, with my test data, and predicted the required scores. I think this solution can be extended to RandomizedSearchCV and a lot more other options.
NOTE: This is just a trivial solution. Lot of modifications necessary, like needing to scale data for specific models, etc. This solution will just serve as a starting point which can be modified according to the user's needs.
Credits to this answer for the ClfSwitcher() class.
Following is the implementation of the class (suggestions to improve are welcomed).
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
import warnings
warnings.filterwarnings('ignore')
cancer = datasets.load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
target = df['target']
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='target', axis=1), target, test_size=0.4, random_state=13, stratify=target)
class ClfSwitcher(BaseEstimator):
def __init__(self, model=RandomForestClassifier()):
"""
A Custom BaseEstimator that can switch between classifiers.
:param estimator: sklearn object - The classifier
"""
self.model = model
def fit(self, X, y=None, **kwargs):
self.model.fit(X, y)
return self
def predict(self, X, y=None):
return self.model.predict(X)
def predict_proba(self, X):
return self.model.predict_proba(X)
def score(self, X, y):
return self.estimator.score(X, y)
class report(ClfSwitcher):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.grid = None
self.full_report = None
self.concise_report = None
self.scoring_metrics = {
'precision': precision_score,
'recall': recall_score,
'f1': f1_score,
'roc_auc': roc_auc_score
}
def griddy(self, pipeLine, parameters, **kwargs):
self.grid = GridSearchCV(pipeLine, parameters, scoring='accuracy', n_jobs=-1)
def fit_grid(self, X_train, y_train=None, **kwargs):
self.grid.fit(X_train, y_train)
def make_grid_report(self):
self.full_report = pd.DataFrame(self.grid.cv_results_)
#staticmethod
def get_names(col):
return col.__class__.__name__
#staticmethod
def calc_score(col, metric):
return round(metric(y_test, col.fit(X_train, y_train).predict(X_test)), 4)
def make_concise_report(self):
self.concise_report = pd.DataFrame(self.grid.cv_results_)
self.concise_report['model_names'] = self.concise_report['param_cst__model'].apply(self.get_names)
self.concise_report = self.concise_report.sort_values(['model_names', 'rank_test_score'], ascending=[True, False]) \
.groupby(['model_names']).head(1)[['param_cst__model', 'model_names']] \
.reset_index(drop=True)
for metric_name, metric_func in self.scoring_metrics.items():
self.concise_report[metric_name] = self.concise_report['param_cst__model'].apply(self.calc_score, metric=metric_func)
self.concise_report = self.concise_report[['model_names', 'precision', 'recall', 'f1', 'roc_auc', 'param_cst__model']]
pipeline = Pipeline([
('cst', ClfSwitcher()),
])
parameters = [
{
'cst__model': [RandomForestClassifier()],
'cst__model__n_estimators': [10, 20],
'cst__model__max_depth': [5, 10],
'cst__model__criterion': ['gini', 'entropy']
},
{
'cst__model': [SVC()],
'cst__model__C': [10, 20],
'cst__model__kernel': ['linear'],
'cst__model__gamma': [0.0001, 0.001]
},
{
'cst__model': [LogisticRegression()],
'cst__model__C': [13, 17],
'cst__model__penalty': ['l1', 'l2']
},
{
'cst__model': [GradientBoostingClassifier()],
'cst__model__n_estimators': [10, 50],
'cst__model__max_depth': [3, 5],
'cst__model__min_samples_leaf': [1, 2]
}
]
my_report = report()
my_report.griddy(pipeline, parameters, scoring='f1')
my_report.fit_grid(X_train, y_train)
my_report.make_concise_report()
my_report.concise_report
Output Report as desired.
You can consider using map(), details here: https://www.geeksforgeeks.org/python-map-function/
Some programmers have the habit of avoiding raw loops - "A raw loop is any loop inside a function where the function serves purpose larger than the algorithm
implemented by the loop". More details here: https://sean-parent.stlab.cc/presentations/2013-09-11-cpp-seasoning/cpp-seasoning.pdf
I think that's the reason you are asked to remove for loop.
I want to write a Naive Base text classificator.
Because sklearn does not accept 'text form' features I am transforming them using TfidfVectorizer.
I was successfully able to create such classificatory using only the transformed data as features. The code looks like this:
### text vectorization--go from strings to lists of numbers
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english')
X_train_transformed = vectorizer.fit_transform(X_train_raw['url'])
X_test_transformed = vectorizer.transform(X_test_raw['url'])
### feature selection, because text is super high dimensional and
### can be really computationally chewy as a result
selector = SelectPercentile(f_classif, percentile=1)
selector.fit(X_train_transformed, y_train_raw)
X_train = selector.transform(X_train_transformed).toarray()
X_test = selector.transform(X_test_transformed).toarray()
clf = GaussianNB()
clf.fit(X_train, y_train_raw)
.....
Everything works as intended but I am having problems when I want to add another feature eg. flag indicating weather the given text contains a certain keyword.
I tried multiple things to properly transform the 'url' feature and then combine the transformed feature with another boolean feature but I was unsuccessfully.
Any tips how it should be done assuming that I have a pandas frame containing two features: 'url' (which I want to transform) and 'contains_keyword' flag?
The solution which failed looks like this:
vectorizer = CountVectorizer(min_df=1)
X_train_transformed = vectorizer.fit_transform(X_train_raw['url'])
X_test_transformed = vectorizer.transform(X_test_raw['url'])
selector = SelectPercentile(f_classif, percentile=1)
selector.fit(X_train_transformed, y_train_raw)
X_train_selected = selector.transform(X_train_transformed)
X_test_selected = selector.transform(X_test_transformed)
X_train_raw['transformed_url'] = X_train_selected.toarray().tolist()
X_train_without = X_train_raw.drop(['url'], axis=1)
X_train = X_train_without.values
This produces rows containing a boolean flag and a list which is a wrong input for sklearn model. I have no idea how should i properly transform this. Grateful for any help.
Here are test data:
url,target,ads_keyword
googleadapis l google com,1,True
googleadapis l google com,1,True
clients1 google com,1,False
c go-mpulse net,1,False
translate google pl,1,False
url - splitted domain taken from dns query
target - target class for classification
ads_keyword - flag indicating weather the 'url' contains the 'ads' word.
I want to transform the 'url' using the TfidfVectorizer and use the transformed data together with 'ads_keyword' (and possibly more features in the future) as features used to train the Naive Bayes model.
Here is a demo, showing how to union features and how to tune up hyperparameters using GridSearchCV.
Unfortunately your sample data set is too tiny to train a real model...
try:
from pathlib import Path
except ImportError: # Python 2
from pathlib2 import Path
import os
import re
from pprint import pprint
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.externals import joblib
from scipy.sparse import csr_matrix, hstack
class ColumnSelector(BaseEstimator, TransformerMixin):
def __init__(self, name=None, position=None,
as_cat_codes=False, sparse=False):
self.name = name
self.position = position
self.as_cat_codes = as_cat_codes
self.sparse = sparse
def fit(self, X, y=None):
return self
def transform(self, X, **kwargs):
if self.name is not None:
col_pos = X.columns.get_loc(self.name)
elif self.position is not None:
col_pos = self.position
else:
raise Exception('either [name] or [position] parameter must be not-None')
if self.as_cat_codes and X.dtypes.iloc[col_pos] == 'category':
ret = X.iloc[:, col_pos].cat.codes
else:
ret = X.iloc[:, col_pos]
if self.sparse:
ret = csr_matrix(ret.values.reshape(-1,1))
return ret
union = FeatureUnion([
('text',
Pipeline([
('select', ColumnSelector('url')),
#('pct', SelectPercentile(percentile=1)),
('vect', TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english')),
]) ),
('ads',
Pipeline([
('select', ColumnSelector('ads_keyword', sparse=True,
as_cat_codes=True)),
#('scale', StandardScaler(with_mean=False)),
]) )
])
pipe = Pipeline([
('union', union),
('clf', MultinomialNB())
])
param_grid = [
{
'union__text__vect': [TfidfVectorizer(sublinear_tf=True,
max_df=0.5,
stop_words='english')],
'clf': [SGDClassifier(max_iter=500)],
'union__text__vect__ngram_range': [(1,1), (2,5)],
'union__text__vect__analyzer': ['word','char_wb'],
'clf__alpha': np.logspace(-5, 0, 6),
#'clf__max_iter': [500],
},
{
'union__text__vect': [TfidfVectorizer(sublinear_tf=True,
max_df=0.5,
stop_words='english')],
'clf': [MultinomialNB()],
'union__text__vect__ngram_range': [(1,1), (2,5)],
'union__text__vect__analyzer': ['word','char_wb'],
'clf__alpha': np.logspace(-4, 2, 7),
},
#{ # NOTE: does NOT support sparse matrices!
# 'union__text__vect': [TfidfVectorizer(sublinear_tf=True,
# max_df=0.5,
# stop_words='english')],
# 'clf': [GaussianNB()],
# 'union__text__vect__ngram_range': [(1,1), (2,5)],
# 'union__text__vect__analyzer': ['word','char_wb'],
#},
]
gs_kwargs = dict(scoring='roc_auc', cv=3, n_jobs=1, verbose=2)
X_train, X_test, y_train, y_test = \
train_test_split(df[['url','ads_keyword']], df['target'], test_size=0.33)
grid = GridSearchCV(pipe, param_grid=param_grid, **gs_kwargs)
grid.fit(X_train, y_train)
# prediction
predicted = grid.predict(X_test)