Including log transformation within GridSearchCV tuning

I would like to include log transformation as part of my hyperparameter tuning. I'm currently running GridSearchCV twice and then selecting the best model from the two runs. Is there a way to do this within a single GridSearchCV instead?
Example of my current model below:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_regression
X, y = make_regression(n_features=4, n_informative=2,
                       random_state=0, shuffle=False)
rf = RandomForestRegressor()
def do_log(y):
    y_t = np.log(y + 1)
    return y_t

def do_exp(y_t):
    y = np.exp(y_t) - 1
    return y
transformed_rf = TransformedTargetRegressor(rf, func=do_log, inverse_func=do_exp)

# the bare regressor and the wrapped regressor need different parameter names
param_grid = {'n_estimators': [100, 500, 1000]}
param_grid_wrapped = {'regressor__n_estimators': [100, 500, 1000]}

grid_search1 = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search1.fit(X, y)
grid_search2 = GridSearchCV(estimator=transformed_rf, param_grid=param_grid_wrapped, cv=10)
grid_search2.fit(X, y)

if grid_search1.best_score_ > grid_search2.best_score_:
    best_model = grid_search1.best_estimator_
elif grid_search1.best_score_ < grid_search2.best_score_:
    best_model = grid_search2.best_estimator_
else:
    print("same performance for both models")
    best_model = grid_search1.best_estimator_
I'm looking for something like this:
param_grid = {'estimators': [100, 500, 1000],
              'log_transform': [True, False],
              }
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X, y)
best_model = grid_search.best_estimator_
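One way to come close to this (a sketch building on the snippet above, not a confirmed recipe) is to always search over the TransformedTargetRegressor and let the grid toggle its func/inverse_func between the identity and the log transform. GridSearchCV accepts a list of parameter grids, which keeps the two settings from being crossed inconsistently:

# single search: the log/identity choice becomes part of the grid
param_grid = [
    {'regressor__n_estimators': [100, 500, 1000],
     'func': [None], 'inverse_func': [None]},        # identity target
    {'regressor__n_estimators': [100, 500, 1000],
     'func': [do_log], 'inverse_func': [do_exp]},    # log-transformed target
]

grid_search = GridSearchCV(estimator=transformed_rf, param_grid=param_grid, cv=10)
grid_search.fit(X, y)
best_model = grid_search.best_estimator_

Note that the scores being compared are computed on the untransformed y in both cases, since TransformedTargetRegressor applies the inverse transform before scoring.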

Related

TypeError when debugging scikit-learn GridSearchCV in Visual Studio Code

I am getting the following error when I try to debug my Python code in Visual Studio Code. Surprisingly, the code works fine when I run it without debugging. Any clues?
TypeError: unbound method new_CreateProcess() must be called with _winapi instance as first argument (got str instance instead)
PS C:\GitHub\Project>
Here is the piece of code causing the error. It is a standard grid-search initialization in scikit-learn. I am not sure what I am missing. I am using Python 2.7.
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(estimator=model, param_grid=p_grid, scoring='roc_auc', n_jobs=4, cv=inner_cv, refit=True, return_train_score=True)
clf.fit(x, y)
The clf.fit() line is where the error happens. Here is the complete code for context:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv([some file])
x = df.iloc[:, 3:-1]
y = df.loc[:, 'Label']
colNames = x.columns

scaler = MinMaxScaler()
for i in range(2, x.shape[1]):
    columnName = x.columns[i]
    x[columnName] = scaler.fit_transform(x[columnName].values.reshape(-1, 1))

p_grid = {'n_estimators': list(range(10, 101, 10)),
          'max_features': list(range(6, 30, 5))}
model = RandomForestClassifier()

numTrials = 30
non_nested_scores = np.zeros(numTrials)
nested_scores = []
scores = {'auc': 'roc_auc', 'f1': 'f1', 'precision': 'precision', 'recall': 'recall', 'accuracy': 'accuracy'}
resultsLog = pd.DataFrame(columns=['trial', 'auc', 'f1', 'precision', 'recall', 'accuracy'])

for i in range(numTrials):
    n = 5
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    # Non-nested parameter search and scoring
    clf = GridSearchCV(estimator=model, param_grid=p_grid, scoring='roc_auc', n_jobs=-1,
                       cv=inner_cv, refit=True, return_train_score=True)
    clf = clf.fit(x, y)
    non_nested_scores[i] = clf.best_score_

    # Nested CV with parameter optimization
    nested_score = cross_validate(clf,
                                  X=x,
                                  y=y,
                                  scoring=scores,
                                  cv=outer_cv,
                                  n_jobs=-1)
    m1 = np.mean(nested_score['test_auc'])
    m2 = np.mean(nested_score['test_f1'])
    m3 = np.mean(nested_score['test_precision'])
    m4 = np.mean(nested_score['test_recall'])
    m5 = np.mean(nested_score['test_accuracy'])
    newRow = pd.DataFrame({'trial': [i], 'auc': [m1], 'f1': [m2], 'precision': [m3], 'recall': [m4], 'accuracy': [m5]})
    resultsLog = pd.concat([resultsLog, newRow], ignore_index=True)
    nested_scores.append(nested_score)
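A likely culprit (my reading, not confirmed in the original thread) is that n_jobs greater than 1 makes GridSearchCV and cross_validate spawn worker processes, and the debugger's patched process creation on Windows with Python 2.7 cannot handle that. A minimal workaround sketch, assuming you only need it while debugging, is to guard the script entry point and fall back to a single worker:

# hypothetical workaround: avoid subprocesses while the debugger is attached
def main(debugging=True):
    n_jobs = 1 if debugging else -1          # a single worker avoids multiprocessing entirely
    clf = GridSearchCV(estimator=model, param_grid=p_grid, scoring='roc_auc',
                       n_jobs=n_jobs, cv=inner_cv, refit=True, return_train_score=True)
    clf.fit(x, y)
    return clf

if __name__ == '__main__':                   # standard guard for multiprocessing on Windows
    main()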

How to do only simple cross validation using GridSearchCV

I am using the code below to perform both simple cross-validation and K-fold cross-validation.
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import numpy as np
# our hyperparameters to choose from
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2]
n_estimators = [30, 50, 100, 150, 200]
param_grid = dict(learning_rate = learning_rate, n_estimators = n_estimators)
xgb_model = xgb.XGBClassifier(random_state=42, n_jobs = -1)
clf = GridSearchCV(xgb_model, param_grid, scoring = 'roc_auc', cv=3, return_train_score=True)
sc = clf.fit(X_train, y_train)
# getting all the results
scores = clf.cv_results_
# getting train scores and cross validation scores
train_score = scores['mean_train_score']
cv_score = scores['mean_test_score']
Access the classifier trained with the best set of hyper-parameters, then call the score method, which will make predictions from X_cv and score accuracy compared to y_cv:
clf.best_estimator_.score(X_cv,y_cv)
If you just want the predictions, then call the predict method instead with X_cv as argument.
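For example (a sketch assuming the X_cv/y_cv hold-out split referenced above):

# hold-out accuracy of the refit best model, then its raw predictions
holdout_accuracy = clf.best_estimator_.score(X_cv, y_cv)
holdout_predictions = clf.best_estimator_.predict(X_cv)

Because refit=True by default, clf.predict(X_cv) delegates to the same best estimator; clf.score(X_cv, y_cv), however, uses the scoring metric passed to the search ('roc_auc' here) rather than plain accuracy.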

How to perform cross-validation of a random-forest model in scikit-learn?

I need to perform leave-one-out cross-validation of an RF model.
I successfully built a model with high predictive ability.
Now I need to perform a LOO test prior to publication.
Here is my code:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
FC_data = pd.read_excel('C:\\Users\\Dre\\Desktop\\My Papers\\Furocoumarins_paper_2018\\Furocoumarins_NEW1.xlsx', index_col=0)
FC_data.head()
# Create correlation matrix
corr_matrix = FC_data.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
# Drop features
FC_data1 = FC_data.drop(to_drop, axis=1)
y = FC_data1.LogFiT
X = FC_data1.drop(['LogFiT', 'LogS'], axis=1)
X_train = X.drop(["3-Acetoisopseudopsoralen", "3-Carbethoxypsoralen", "4,4'-Dimethylangelicin",
                  "4,7,4'-Trimethylallopsoralen", "Psoralen"], axis=0)
X_train.head(21)
y_train = y.drop(["3-Acetoisopseudopsoralen", "3-Carbethoxypsoralen", "4,4'-Dimethylangelicin",
                  "4,7,4'-Trimethylallopsoralen", "Psoralen"], axis=0)
y_train.head(21)
X_test = X.loc[["3-Acetoisopseudopsoralen", "3-Carbethoxypsoralen", "4,4'-Dimethylangelicin",
                "4,7,4'-Trimethylallopsoralen", "Psoralen"]]
X_test.head(5)
y_test = y.loc[["3-Acetoisopseudopsoralen", "3-Carbethoxypsoralen", "4,4'-Dimethylangelicin",
                "4,7,4'-Trimethylallopsoralen", "Psoralen"]]
y_test.head(5)
y_test.head(5)
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
randomforest = RandomForestRegressor(n_jobs=-1)
selector = SelectFromModel(randomforest)
features_important = selector.fit_transform(X_train, y_train)
model = randomforest.fit(features_important, y_train)
from sklearn.model_selection import GridSearchCV
clf_rf = RandomForestRegressor()
parameters = {"n_estimators":[1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 40, 50, 100], "max_depth":[1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 40, 50, 100]}
grid_search_cv_clf = GridSearchCV(clf_rf, parameters, cv=5)
grid_search_cv_clf.fit(features_important, y_train)
from sklearn.metrics import r2_score
y_pred = grid_search_cv_clf.predict(features_important)
r2_score(y_train, y_pred)
grid_search_cv_clf.best_params_
best_clf = grid_search_cv_clf.best_estimator_
X_test_filtered = X_test.iloc[:,selector.get_support()]
best_clf.score(X_test_filtered, y_test)
feature_importances = best_clf.feature_importances_
feature_importances_df = pd.DataFrame({'features': X_test_filtered.columns.values,
                                       'feature_importances': feature_importances})
importances = feature_importances_df.sort_values('feature_importances', ascending=False)
importances.head(25)
Now I need the q2 value. Finally, I wrote this code and got a reasonably high score of 0.9071543776303185.
from sklearn.model_selection import LeaveOneOut
parameters = {"n_estimators":[4], "max_depth":[20]}
loo_clf = GridSearchCV(best_clf, parameters, cv=LeaveOneOut())
loo_clf.fit(features_important, y_train)
loo_clf.score(features_important, y_train)
I'm not sure whether this is q2 or not. What do you think?
I also decided to obtain the 5-fold cross-validation score. However, it gives ridiculous values such as -36.58997717, 0.76801832, -1.59900448, 0.1834304, -2.38256389, with a mean of -7.924019361863889.
from sklearn.model_selection import cross_val_score
cvs=cross_val_score(best_clf, features_important, y_train)
mean_cross_val_score = cvs.mean()
mean_cross_val_score
Presumably there is a way to fix this?
You should not run the hyper-parameter search before making the model evaluation. Instead, you should nest the two cross-validations; otherwise you are leaking some information. To learn more about this, look at the following example from the scikit-learn documentation: https://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html#sphx-glr-auto-examples-model-selection-plot-nested-cross-validation-iris-py
Therefore, in your particular use case, you should use GridSearchCV, SelectFromModel, and cross_val_score:
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
X, y = make_regression(n_samples=100)
feature_selector = SelectFromModel(
    RandomForestRegressor(n_jobs=-1), threshold="mean"
)
pipe = make_pipeline(
    feature_selector, RandomForestRegressor(n_jobs=-1)
)

param_grid = {
    # define the grid of the random-forest for the feature selection
    "selectfrommodel__estimator__n_estimators": [10, 20],
    "selectfrommodel__estimator__max_depth": [3, 5],
    # define the grid of the random-forest for the prediction
    "randomforestregressor__n_estimators": [10, 20],
    "randomforestregressor__max_depth": [5, 8],
}
grid_search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1, cv=3)

# You can use LOO in this way. Be aware that this is not good practice:
# it leads to a large variance when evaluating your model.
# scores = cross_val_score(grid_search, X, y, cv=LeaveOneOut(), error_score='raise')
scores = cross_val_score(grid_search, X, y, cv=2, error_score='raise')
scores.mean()
You need to specify the scoring and the cv arguments.
Use this:
from sklearn.model_selection import cross_val_score
mycv = LeaveOneOut()
cvs=cross_val_score(best_clf, features_important, y_train, scoring='r2',cv = mycv)
mean_cross_val_score = cvs.mean()
print(mean_cross_val_score)
This will return the mean cross-validated R2 score using LOOCV.
For more scoring options see here: https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values
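One caveat worth noting (my addition, not part of the original answer): with LeaveOneOut each test fold holds a single sample, for which R2 is not well defined, so recent scikit-learn versions may emit warnings or NaNs for scoring='r2'. A safer sketch under that assumption is to score each left-out sample with an error metric and aggregate afterwards:

from sklearn.model_selection import LeaveOneOut, cross_val_score
import numpy as np

# per-fold MSE with one sample per fold is just that sample's squared error
mse_per_sample = -cross_val_score(best_clf, features_important, y_train,
                                  scoring='neg_mean_squared_error', cv=LeaveOneOut())
rmse_loo = np.sqrt(mse_per_sample.mean())   # overall RMSE of the LOO predictions
print(rmse_loo)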

Print the selected parameters in nested cross-validation

Below is an example of using scikit-learn to get cross-validated predictions from k-nearest neighbors, with k chosen by cross-validation. The code seems to work, but how can I also print the k that was selected in each of the outer folds?
import numpy as np, sklearn
n = 100
X = np.random.randn(n, 2)
y = np.where(np.sum(X, axis = 1) + np.random.randn(n) > 0, "blue", "red")
preds = sklearn.model_selection.cross_val_predict(
    X = X,
    y = y,
    estimator = sklearn.model_selection.GridSearchCV(
        estimator = sklearn.neighbors.KNeighborsClassifier(),
        param_grid = {'n_neighbors': range(1, 7)},
        cv = sklearn.model_selection.KFold(10, random_state = 133),
        scoring = 'accuracy'),
    cv = sklearn.model_selection.KFold(10, random_state = 144))
You can't get this directly from that function, so you would need to replace cross_val_predict with cross_validate and set the return_estimator flag to True. You can then select the estimators used from the returned dictionary with the key estimator. The selected parameters of the estimators are stored in the attribute best_params_. So:
import numpy as np
import sklearn
# sklearn 0.20.3 doesn't seem to import submodules in __init__
# So importing them directly is required.
import sklearn.model_selection
import sklearn.neighbors
n = 100
X = np.random.randn(n, 2)
y = np.where(np.sum(X, axis = 1) + np.random.randn(n) > 0, "blue", "red")
scores = sklearn.model_selection.cross_validate(
    X = X,
    y = y,
    estimator = sklearn.model_selection.GridSearchCV(
        estimator = sklearn.neighbors.KNeighborsClassifier(),
        param_grid = {'n_neighbors': range(1, 7)},
        cv = sklearn.model_selection.KFold(10, random_state = 133),
        scoring = 'accuracy'),
    cv = sklearn.model_selection.KFold(10, random_state = 144),
    return_estimator=True)
# Selected hyper-parameters for the estimator from the first fold
print(scores['estimator'][0].best_params_)
Unfortunately you can't get the actual predictions AND the hyper-parameters selected from the same function. If you want that, you will have to do the nested cross-validation manually:
cv = sklearn.model_selection.KFold(10, random_state = 144)
estimator = sklearn.model_selection.GridSearchCV(
    estimator = sklearn.neighbors.KNeighborsClassifier(),
    param_grid = {'n_neighbors': range(1, 7)},
    cv = sklearn.model_selection.KFold(10, random_state = 133),
    scoring = 'accuracy')

for train, test in cv.split(X, y):
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]
    m = estimator.fit(X_train, y_train)
    print(m.best_params_)
    y_pred = m.predict(X_test)
    print(y_pred)

Problems setting up conditional search space in hyperopt

I'll fully admit that I may be setting up the conditional space wrong here, but for some reason I just can't get this to function at all. I am attempting to use hyperopt to tune a logistic regression model, and depending on the solver there are some other parameters that need to be explored. If you choose the liblinear solver you can choose penalties, and depending on the penalty you can also choose dual. When I try to run hyperopt on this search space, though, it keeps giving me an error because it is passing the entire dictionary, as shown below. Any ideas?
The error I'm getting is
ValueError: Logistic Regression supports only liblinear, newton-cg, lbfgs and sag solvers, got {'solver': 'sag'}'
This format worked when setting up a random forest search space so I'm at a loss.
import numpy as np
import scipy as sp
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="white")
import pyodbc
import statsmodels as sm
from pandasql import sqldf
import math
from tqdm import tqdm
import pickle
from sklearn.preprocessing import RobustScaler, OneHotEncoder, MinMaxScaler
from sklearn.utils import shuffle
from sklearn.cross_validation import KFold, StratifiedKFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold as StratifiedKFoldIt
from sklearn.feature_selection import RFECV, VarianceThreshold, SelectFromModel, SelectKBest
from sklearn.decomposition import PCA, IncrementalPCA, FactorAnalysis
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, accuracy_score, classification_report, confusion_matrix, f1_score, log_loss
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, NearMiss, NeighbourhoodCleaningRule, OneSidedSelection
from xgboost.sklearn import XGBClassifier
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
space4lr = {
    'C': hp.uniform('C', .0001, 100.0),
    'solver': hp.choice('solver', [
        {'solver': 'newton-cg'},
        {'solver': 'lbfgs'},
        {'solver': 'sag'},
        {'solver': 'liblinear', 'penalty': hp.choice('penalty', [
            {'penalty': 'l1'},
            {'penalty': 'l2', 'dual': hp.choice('dual', [True, False])}])},
    ]),
    'fit_intercept': hp.choice('fit_intercept', ['True', 'False']),
    'class_weight': hp.choice('class_weight', ['balanced', None]),
    'max_iter': 50000,
    'random_state': 84,
    'n_jobs': 8
}
lab = 0
results = pd.DataFrame()
for i in feature_elims:
    target = 'Binary_over_3'
    alt_targets = ['year2_PER', 'year2_GP', 'year2_Min', 'year2_EFF', 'year2_WS/40', 'year2_Pts/Poss', 'Round', 'GRZ_Pick',
                   'GRZ_Player_Rating', 'Binary_over_2', 'Binary_over_3', 'Binary_over_4', 'Binary_5', 'Draft_Strength']
    # alt_targets.remove(target)
    nondata_columns = ['display_name', 'player_global_id', 'season', 'season_', 'team_global_id', 'birth_date', 'Draft_Day']
    nondata_columns.extend(alt_targets)

    AGG_SET_CART_PERC = sqldf("""SELECT * FROM AGG_SET_PLAYED_ADJ_SOS_Jan1 t1
        LEFT JOIN RANKINGS t2 ON t1.[player_global_id] = t2.[player_global_id]
        LEFT JOIN Phys_Training t3 ON t1.[player_global_id] = t3.[player_global_id]""")
    AGG_SET_CART_PERC['HS_RSCI'] = AGG_SET_CART_PERC['HS_RSCI'].fillna(110)
    AGG_SET_CART_PERC['HS_Avg_Rank'] = AGG_SET_CART_PERC['HS_Avg_Rank'].fillna(1)
    AGG_SET_CART_PERC['HS_years_ranked'] = AGG_SET_CART_PERC['HS_years_ranked'].fillna(0)
    AGG_SET_CART_PERC = shuffle(AGG_SET_CART_PERC, random_state=8675309)

    rus = RandomUnderSampler(random_state=8675309)
    ros = RandomOverSampler(random_state=8675309)
    rs = RobustScaler()

    X = AGG_SET_CART_PERC
    y = X[target]
    X = pd.DataFrame(X.drop(nondata_columns, axis=1))

    position = pd.get_dummies(X['position'])
    for idx, row in position.iterrows():
        if row['F/C'] == 1:
            row['F'] = 1
            row['C'] = 1
        if row['G/F'] == 1:
            row['G'] = 1
            row['F'] = 1
    position = position.drop(['F/C', 'G/F'], axis=1)
    X = pd.concat([X, position], axis=1).drop(['position'], axis=1)

    X = rs.fit_transform(X, y=None)
    X = i.transform(X)

    def hyperopt_train_test(params):
        clf = LogisticRegression(**params)
        # cvs = cross_val_score(xgbc, X, y, scoring='recall', cv=skf).mean()
        skf = StratifiedKFold(y, n_folds=6, shuffle=False, random_state=1)
        metrics = []
        tuning_met = []
        accuracy = []
        precision = []
        recall = []
        f1 = []
        log = []
        for i, (train, test) in enumerate(skf):
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            X_train, y_train = ros.fit_sample(X_train, y_train)
            X_train, y_train = rus.fit_sample(X_train, y_train)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            tuning_met.append((((precision_score(y_test, y_pred)) * 4) + recall_score(y_test, y_pred)) / 5)
            accuracy.append(accuracy_score(y_test, y_pred))
            precision.append(precision_score(y_test, y_pred))
            recall.append(recall_score(y_test, y_pred))
            f1.append(f1_score(y_test, y_pred))
            log.append(log_loss(y_test, y_pred))
        metrics.append(sum(tuning_met) / len(tuning_met))
        metrics.append(sum(accuracy) / len(accuracy))
        metrics.append(sum(precision) / len(precision))
        metrics.append(sum(recall) / len(recall))
        metrics.append(sum(f1) / len(f1))
        metrics.append(sum(log) / len(log))
        return metrics

    best = 0
    count = 0

    def f(params):
        global best, count, results, lab, met
        met = hyperopt_train_test(params.copy())
        met.append(params)
        met.append(featureset_labels[lab])
        acc = met[0]
        results = results.append([met])
        if acc > best:
            print(featureset_labels[lab], 'new best:', acc, 'Accuracy:', met[1], 'Precision:', met[2], 'Recall:', met[3], 'using', params, """
""")
            best = acc
        else:
            print(acc, featureset_labels[lab], count)
        count = count + 1
        return {'loss': -acc, 'status': STATUS_OK}

    trials = Trials()
    best = fmin(f, space4lr, algo=tpe.suggest, max_evals=1000, trials=trials)
    print(featureset_labels[lab], ' best:')
    print(best, """
""")
    lab = lab + 1
Though I'm replying a bit late, I faced this issue just yesterday. I think the code snippet below will help you out.
space = hp.choice('classifier', [
    {
        'model': LogisticRegression,
        'param': {
            'hyper_param_groups': hp.choice('hyper_param_groups',
            [
                {
                    'penalty': hp.choice('penalty_block1', ['l2']),
                    'solver': hp.choice('solver_block1', ['newton-cg', 'sag', 'saga', 'lbfgs']),
                    'multi_class': hp.choice('multi_class', ['ovr', 'multinomial']),
                },
                {
                    'penalty': hp.choice('penalty_block2', ['l2']),
                    'solver': hp.choice('solver_block2', ['liblinear']),
                    'multi_class': hp.choice('multi_class_block2', ['ovr']),
                },
                {
                    'penalty': hp.choice('penalty_block3', ['l1']),
                    'solver': hp.choice('solver_block3', ['saga']),
                    'multi_class': hp.choice('multi_class_block3', ['ovr', 'multinomial']),
                },
            ]),
            'dual': hp.choice('dual', [False]),
            'class_weight': hp.choice('class_weight', ['balanced', None]),
            'random_state': hp.choice('random_state', [10, 267]),
            'max_iter': hp.choice('max_iter', [100, 500]),
            'verbose': hp.choice('verbose', [0])
        }
    }])
and here is how to use it in the callable (objective) method:
penalty = args['param']['hyper_param_groups']['penalty']
solver = args['param']['hyper_param_groups']['solver']
multi_class = args['param']['hyper_param_groups']['multi_class']
dual = args['param']['dual']
class_weight = args['param']['class_weight']
random_state = args['param']['random_state']
max_iter = args['param']['max_iter']
verbose = args['param']['verbose']
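From there, a minimal sketch (my wiring, not part of the original answer) of feeding the flattened values into the estimator inside the hyperopt objective; because the nested choice has been unpacked into plain keyword arguments, LogisticRegression no longer receives a whole dictionary as its solver:

# hypothetical objective body: build the model from the flattened hyperparameters
clf = LogisticRegression(penalty=penalty, solver=solver, multi_class=multi_class,
                         dual=dual, class_weight=class_weight,
                         random_state=random_state, max_iter=max_iter, verbose=verbose)
clf.fit(X_train, y_train)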
