I have data with differing weights for each sample. In my application, it is important that these weights are accounted for in estimating the model and comparing alternative models.
I'm using sklearn to estimate models and to compare alternative hyperparameter choices, but the unit test below shows that GridSearchCV does not apply sample_weight when estimating scores.
Is there a way to have sklearn use sample_weight to score the models?
Unit test:
from __future__ import division
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, RepeatedKFold
def grid_cv(X_in, y_in, w_in, cv, max_features_grid, use_weighting):
out_results = dict()
for k in max_features_grid:
clf = RandomForestClassifier(n_estimators=256,
criterion="entropy",
warm_start=False,
n_jobs=-1,
random_state=RANDOM_STATE,
max_features=k)
for train_ndx, test_ndx in cv.split(X=X_in, y=y_in):
X_train = X_in[train_ndx, :]
y_train = y_in[train_ndx]
w_train = w_in[train_ndx]
y_test = y_in[test_ndx]
clf.fit(X=X_train, y=y_train, sample_weight=w_train)
y_hat = clf.predict_proba(X=X_in[test_ndx, :])
if use_weighting:
w_test = w_in[test_ndx]
w_i_sum = w_test.sum()
score = w_i_sum / w_in.sum() * log_loss(y_true=y_test, y_pred=y_hat, sample_weight=w_test)
else:
score = log_loss(y_true=y_test, y_pred=y_hat)
results = out_results.get(k, [])
results.append(score)
out_results.update({k: results})
for k, v in out_results.items():
if use_weighting:
mean_score = sum(v)
else:
mean_score = np.mean(v)
out_results.update({k: mean_score})
best_score = min(out_results.values())
best_param = min(out_results, key=out_results.get)
return best_score, best_param
if __name__ == "__main__":
RANDOM_STATE = 1337
X, y = load_iris(return_X_y=True)
sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
# sample_weight = np.array([1 for _ in range(len(X))])
inner_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
outer_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
rfc = RandomForestClassifier(n_estimators=256,
criterion="entropy",
warm_start=False,
n_jobs=-1,
random_state=RANDOM_STATE)
search_params = {"max_features": [1, 2, 3, 4]}
fit_params = {"sample_weight": sample_weight}
my_scorer = make_scorer(log_loss,
greater_is_better=False,
needs_proba=True,
needs_threshold=False)
grid_clf = GridSearchCV(estimator=rfc,
scoring=my_scorer,
cv=inner_cv,
param_grid=search_params,
refit=True,
return_train_score=False,
iid=False) # in this usage, the results are the same for `iid=True` and `iid=False`
grid_clf.fit(X, y, **fit_params)
print("This is the best out-of-sample score using GridSearchCV: %.6f." % -grid_clf.best_score_)
msg = """This is the best out-of-sample score %s weighting using grid_cv: %.6f."""
score_with_weights, param_with_weights = grid_cv(X_in=X,
y_in=y,
w_in=sample_weight,
cv=inner_cv,
max_features_grid=search_params.get(
"max_features"),
use_weighting=True)
print(msg % ("WITH", score_with_weights))
score_without_weights, param_without_weights = grid_cv(X_in=X,
y_in=y,
w_in=sample_weight,
cv=inner_cv,
max_features_grid=search_params.get(
"max_features"),
use_weighting=False)
print(msg % ("WITHOUT", score_without_weights))
Which produces output:
This is the best out-of-sample score using GridSearchCV: 0.135692.
This is the best out-of-sample score WITH weighting using grid_cv: 0.099367.
This is the best out-of-sample score WITHOUT weighting using grid_cv: 0.135692.
Explanation: Since manually computing the loss without weighting produces the same scoring as GridSearchCV, we know that the sample weights are not being used.
GridSearchCV takes a scoring argument, which can be a callable. The scikit-learn documentation describes how to change the scoring function and how to pass your own.
EDIT: fit_params is passed only to the fit functions, not to the score functions. Parameters that are meant for the scorer have to be passed to make_scorer instead. But that still doesn't solve the issue here, because it would mean the whole sample_weight array gets passed to log_loss, whereas only the part corresponding to y_test at the time the loss is calculated should be passed.
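For illustration only, the naive attempt would be something like this (a sketch; it does not do the right thing, because the scorer would hand the full 150-element weight vector to log_loss together with only the test fold's labels, giving a length mismatch or weights applied to the wrong samples):
naive_scorer = make_scorer(log_loss,
                           greater_is_better=False,
                           needs_proba=True,
                           sample_weight=sample_weight)  # full-length weights: not what we want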
sklearn does NOT support such a thing out of the box, but you can hack your way through using a pandas.DataFrame. The good news is that sklearn understands a DataFrame and keeps it as one, which means you can exploit the index of the DataFrame, as you see in the code here:
# more code
X, y = load_iris(return_X_y=True)
index = ['r%d' % x for x in range(len(y))]
y_frame = pd.DataFrame(y, index=index)
sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
sample_weight_frame = pd.DataFrame(sample_weight, index=index)
# more code
def score_f(y_true, y_pred, sample_weight):
return log_loss(y_true.values, y_pred,
sample_weight=sample_weight.loc[y_true.index.values].values.reshape(-1),
normalize=True)
score_params = {"sample_weight": sample_weight_frame}
my_scorer = make_scorer(score_f,
greater_is_better=False,
needs_proba=True,
needs_threshold=False,
**score_params)
grid_clf = GridSearchCV(estimator=rfc,
scoring=my_scorer,
cv=inner_cv,
param_grid=search_params,
refit=True,
return_train_score=False,
iid=False) # in this usage, the results are the same for `iid=True` and `iid=False`
grid_clf.fit(X, y_frame)
# more code
As you see, the score_f uses the index of y_true to find which parts of sample_weight to use. For the sake of completeness, here's the whole code:
from __future__ import division
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.metrics import make_scorer
import pandas as pd
def grid_cv(X_in, y_in, w_in, cv, max_features_grid, use_weighting):
out_results = dict()
for k in max_features_grid:
clf = RandomForestClassifier(n_estimators=256,
criterion="entropy",
warm_start=False,
n_jobs=1,
random_state=RANDOM_STATE,
max_features=k)
for train_ndx, test_ndx in cv.split(X=X_in, y=y_in):
X_train = X_in[train_ndx, :]
y_train = y_in[train_ndx]
w_train = w_in[train_ndx]
y_test = y_in[test_ndx]
clf.fit(X=X_train, y=y_train, sample_weight=w_train)
y_hat = clf.predict_proba(X=X_in[test_ndx, :])
if use_weighting:
w_test = w_in[test_ndx]
w_i_sum = w_test.sum()
score = w_i_sum / w_in.sum() * log_loss(y_true=y_test, y_pred=y_hat, sample_weight=w_test)
else:
score = log_loss(y_true=y_test, y_pred=y_hat)
results = out_results.get(k, [])
results.append(score)
out_results.update({k: results})
for k, v in out_results.items():
if use_weighting:
mean_score = sum(v)
else:
mean_score = np.mean(v)
out_results.update({k: mean_score})
best_score = min(out_results.values())
best_param = min(out_results, key=out_results.get)
return best_score, best_param
#if __name__ == "__main__":
if True:
RANDOM_STATE = 1337
X, y = load_iris(return_X_y=True)
index = ['r%d' % x for x in range(len(y))]
y_frame = pd.DataFrame(y, index=index)
sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
sample_weight_frame = pd.DataFrame(sample_weight, index=index)
# sample_weight = np.array([1 for _ in range(len(X))])
inner_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
outer_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
rfc = RandomForestClassifier(n_estimators=256,
criterion="entropy",
warm_start=False,
n_jobs=1,
random_state=RANDOM_STATE)
search_params = {"max_features": [1, 2, 3, 4]}
def score_f(y_true, y_pred, sample_weight):
return log_loss(y_true.values, y_pred,
sample_weight=sample_weight.loc[y_true.index.values].values.reshape(-1),
normalize=True)
score_params = {"sample_weight": sample_weight_frame}
my_scorer = make_scorer(score_f,
greater_is_better=False,
needs_proba=True,
needs_threshold=False,
**score_params)
grid_clf = GridSearchCV(estimator=rfc,
scoring=my_scorer,
cv=inner_cv,
param_grid=search_params,
refit=True,
return_train_score=False,
iid=False) # in this usage, the results are the same for `iid=True` and `iid=False`
grid_clf.fit(X, y_frame)
print("This is the best out-of-sample score using GridSearchCV: %.6f." % -grid_clf.best_score_)
msg = """This is the best out-of-sample score %s weighting using grid_cv: %.6f."""
score_with_weights, param_with_weights = grid_cv(X_in=X,
y_in=y,
w_in=sample_weight,
cv=inner_cv,
max_features_grid=search_params.get(
"max_features"),
use_weighting=True)
print(msg % ("WITH", score_with_weights))
score_without_weights, param_without_weights = grid_cv(X_in=X,
y_in=y,
w_in=sample_weight,
cv=inner_cv,
max_features_grid=search_params.get(
"max_features"),
use_weighting=False)
print(msg % ("WITHOUT", score_without_weights))
The output of the code is then:
This is the best out-of-sample score using GridSearchCV: 0.095439.
This is the best out-of-sample score WITH weighting using grid_cv: 0.099367.
This is the best out-of-sample score WITHOUT weighting using grid_cv: 0.135692.
EDIT 2: As the comment below says:
the difference in my score and the sklearn score using this solution
originates in the way that I was computing a weighted average of
scores. If you omit the weighted average portion of the code, the two
outputs match to machine precision.
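Concretely, the "weighted average portion" refers to the fold aggregation inside grid_cv above. A sketch of the two aggregations (fold_scores is a stand-in name for the list of per-fold scores, not a variable from the code above):
# grid_cv with use_weighting=True: each fold's log_loss is scaled by that fold's
# share of the total weight, and the scaled scores are then summed
score = w_test.sum() / w_in.sum() * log_loss(y_true=y_test, y_pred=y_hat, sample_weight=w_test)
mean_score = sum(fold_scores)
# GridSearchCV with the DataFrame-index scorer: each fold is scored with that
# fold's weights only, and best_score_ is the plain mean over folds
score = log_loss(y_true=y_test, y_pred=y_hat, sample_weight=w_test)
mean_score = np.mean(fold_scores)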
Currently in sklearn, GridSearchCV (and any class that inherits from BaseSearchCV) only allows sample_weight in **fit_params and does not use it in scoring, which is not correct, since the CV picks the "best estimator" via an unweighted score. Note that when you call grid.fit(X, y, sample_weight=w), the sample weights are used only in fit, not in scoring.
There are two ways to solve this problem:
Handy method: add the weights as the first column of X, then write your customized scoring function and transformer, as in the code below.
from sklearn.base import BaseEstimator, TransformerMixin
# customized scorer
def weight_remover_scorer(estimator, X, y):
y_pred = estimator.predict(X)
w = X[:,0]
return your_scorer(y, y_pred, sample_weight=w)
# customized transformer
class WeightRemover(TransformerMixin, BaseEstimator):
def fit(self, X, y=None, **fit_params):
return self
def transform(self, X, y=None, **fit_params):
return X[:,1:]
# in your main function
if __name__=='__main__':
pipe = Pipeline([('remove_weight', WeightRemover()),('model',model)])
params_grid = {'model__'+k:v for k,v in params_grid.items()}
X = np.c_[train_w, X]
X_test = np.c_[test_w, X_test]
grid = GridSearchCV(pipe, params_grid, cv=5, scoring=weight_remover_scorer)
grid.fit(X, y)
Add the feature to the sklearn classes (and wait for a new release): just add a sample_weight parameter to BaseSearchCV (default None), and index it safely in the same way as fit_params = _check_fit_params(X, fit_params).
Just pointing out that there is an ongoing effort to support this important feature: https://github.com/scikit-learn/scikit-learn/pull/13432
But it seems that, because of backward-compatibility issues and the desire to tackle the more general problem of passing arbitrary sample-related information, it is taking quite a while. The latest attempt seems to be: https://github.com/scikit-learn/scikit-learn/pull/16079
Here is a good review of the issue: http://deaktator.github.io/2019/03/10/the-error-in-the-comparator/
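For completeness: on recent scikit-learn releases this effort has landed as metadata routing. The sketch below is hedged against version differences (it assumes scikit-learn 1.4 or newer and that metadata routing behaves as documented; check the metadata-routing docs for your version). With routing enabled, the same sample_weight passed to fit() is routed to both the estimator's fit and the scorer:
import numpy as np
import sklearn
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import GridSearchCV

sklearn.set_config(enable_metadata_routing=True)

X, y = load_iris(return_X_y=True)
w = np.array([1 + 100 * (i % 25) for i in range(len(X))])

# the estimator requests sample_weight for fit, the scorer requests it for scoring
rfc = RandomForestClassifier(n_estimators=256, random_state=1337).set_fit_request(sample_weight=True)
weighted_log_loss = make_scorer(log_loss,
                                greater_is_better=False,
                                response_method="predict_proba").set_score_request(sample_weight=True)

grid_clf = GridSearchCV(estimator=rfc,
                        scoring=weighted_log_loss,
                        param_grid={"max_features": [1, 2, 3, 4]},
                        cv=3)
grid_clf.fit(X, y, sample_weight=w)  # routed to both fit and the scorer
print(-grid_clf.best_score_)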
Related
I have a custom scorer function whose inputs depend on the specific train and validation fold; additionally, the estimator's .predict_survival_function output is also needed. To give a more concrete example:
I am trying to run a GridSearch for a Random Survival Forest (scikit-survival package) with the Integrated Brier Score (IBS)
as the scoring method. The challenge lies in the fact that the domain of the IBS is data- (and therefore fold-) specific, as it relies on the Kaplan-Meier estimate at some point. Moreover, the .predict_survival_function method needs to be called every time during the scoring evaluation step, not only at the end of it.
It seems that I managed to deal with the first issue by creating the following function:
def IB_time_interval(y_train, y_test):
y_times_tr = [i[2] for i in y_train]
y_times_te = [i[2] for i in y_test]
T1 = np.percentile(y_times_tr, 5, interpolation='higher')
T2 = np.percentile(y_times_tr, 95, interpolation='lower')
T3 = np.percentile(y_times_te, 5, interpolation='higher')
T4 = np.percentile(y_times_te, 95, interpolation='lower')
return np.linspace(np.maximum(T1,T3), np.minimum(T2, T4))
That is robust enough to work across all the folds. However, I am unable to retrieve the estimator's predictions during the grid-search phase, as a non-fitted copy of it seems to be passed instead every time the custom scorer function is called. The workaround I tried is to re-fit the estimator inside the scoring function, but not only is this conceptually wrong, it also raises errors.
The custom scorer function looks like the following:
def IB_scorer(y_true, y_pred, times=times_linspace, y=y, clf=rsf):
rsf.fit(X_train,y_train) #<--- = conceptually wrong
survs_test = rsf.predict_survival_function(X_test, return_array=False) #<---
T1, T2 = survs_test[0].x.min(), survs_test[0].x.max()
mask2 = np.logical_or(times >= T2, times < T1) # mask outer interval
times = times[~mask2]
#preds has shape (n_y-s, n_times)
preds_test = np.asarray([[fn(t) for t in times] for fn in survs_test])
return integrated_brier_score(y, y_true, preds_test, times)
and I create the scoring object immediately afterwards:
trial_IB_scorer = make_scorer(IB_scorer, greater_is_better=False)
Any suggestions? It would be great to be able to use GridSearch with more complex scoring functions, especially for the survival analysis case!
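One route worth trying, sketched here under assumptions (not tested; it reuses rsf, param_grid, times_linspace and y_train from the code below, and assumes scikit-survival's integrated_brier_score takes (survival_train, survival_test, estimate, times)): instead of wrapping a metric with make_scorer, pass GridSearchCV a plain callable with the signature (estimator, X, y). GridSearchCV calls such a callable with the estimator already fitted on the training part of the fold, so predict_survival_function can be evaluated on the validation fold without re-fitting:
def IB_fold_scorer(estimator, X_val, y_val):
    # the estimator arrives here already fitted on the training part of the fold
    surv_fns = estimator.predict_survival_function(X_val, return_array=False)
    # keep only evaluation times inside the domain of the fitted survival curves
    lo, hi = surv_fns[0].x.min(), surv_fns[0].x.max()
    t = times_linspace[(times_linspace >= lo) & (times_linspace < hi)]
    preds = np.asarray([[fn(ti) for ti in t] for fn in surv_fns])
    # GridSearchCV maximizes the score, so return the negative Brier score
    return -integrated_brier_score(y_train, y_val, preds, t)

clf = GridSearchCV(rsf, param_grid, refit=True, n_jobs=1, scoring=IB_fold_scorer)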
PS. I will paste the rest of the minimal working code here:
import numpy as np
from sksurv.ensemble import RandomSurvivalForest
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sksurv.metrics import integrated_brier_score
from sksurv.datasets import load_breast_cancer
X, y = load_breast_cancer()
X = X.drop(["er", "grade"], axis=1)
y_cens = np.array([i[0] for i in y]) #censoring status 1 or 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2,
shuffle=True,
random_state=0,
stratify = y_cens)
param_grid = {
'max_depth': [4, 20, None],
'max_features': ["sqrt", None]
}
rsf = RandomSurvivalForest(n_jobs=1, random_state=0)
times_linspace = IB_time_interval(y_train, y_test)
clf = GridSearchCV(rsf, param_grid, refit=True, n_jobs=1,
scoring=trial_IB_scorer)
clf.fit(X_train, y_train)
print("final score clf", clf.score(X_train, y_train))
print(clf.best_params_)
I am working on KNN without using any library. The problem is that the labels are numeric:
label = [1.5171, 1.7999, 2.4493, 2.8622, 2.9961, 3.6356, 3.7742, 5.8069, 7.1357, etc..]
and each label occurs only once.
I want to predict the label for a new data point, but how should I choose the winning label if each one occurs only once?
prediction = max(set(label_neighbors), key=label_neighbors.count)
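If the label really is a continuous quantity, a common alternative is to average the labels of the k nearest neighbours instead of taking a majority vote, i.e. KNN regression. A minimal sketch (not from the original post; distances is a hypothetical name for the list of the corresponding neighbour distances):
import numpy as np
# unweighted: plain mean of the k nearest labels
prediction = sum(label_neighbors) / len(label_neighbors)
# distance-weighted variant: closer neighbours count more
prediction = np.average(label_neighbors, weights=1.0 / (np.array(distances) + 1e-12))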
I'm guessing that you want to learn the mechanics of KNN, right? See the sample code below; it should do what you want.
import numpy as np
import scipy.spatial
from collections import Counter
# loading the Iris-Flower dataset from Sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 42, test_size = 0.2)
class KNN:
def __init__(self, k):
self.k = k
def fit(self, X, y):
self.X_train = X
self.y_train = y
def distance(self, X1, X2):
distance = scipy.spatial.distance.euclidean(X1, X2)
return distance  # return the computed distance so this helper can actually be used
def predict(self, X_test):
final_output = []
for i in range(len(X_test)):
d = []
votes = []
for j in range(len(self.X_train)):  # use the data stored by fit(), not the global X_train
dist = scipy.spatial.distance.euclidean(self.X_train[j], X_test[i])
d.append([dist, j])
d.sort()
d = d[0:self.k]
for dist, j in d:
votes.append(self.y_train[j])  # vote with the stored training labels
ans = Counter(votes).most_common(1)[0][0]
final_output.append(ans)
return final_output
def score(self, X_test, y_test):
predictions = self.predict(X_test)
return (predictions == y_test).sum() / len(y_test)
clf = KNN(3)
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
for i in prediction:
print(i, end= ' ')
prediction == y_test
clf.score(X_test, y_test)
# Result:
# 1.0
Well, look at that! We got 100%! Not bad, not bad at all!!
Reference:
https://medium.com/analytics-vidhya/implementing-k-nearest-neighbours-knn-without-using-scikit-learn-3905b4decc3c
I am using RandomForestClassifier as follows, with cross-validation, for a binary classification task (class labels are 0 and 1).
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
clf=RandomForestClassifier(random_state = 42, class_weight="balanced")
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
accuracy = cross_val_score(clf, X, y, cv=k_fold, scoring = 'accuracy')
print("Accuracy: " + str(round(100*accuracy.mean(), 2)) + "%")
f1 = cross_val_score(clf, X, y, cv=k_fold, scoring = 'f1_weighted')
print("F Measure: " + str(round(100*f1.mean(), 2)) + "%")
Now I want to order my data by the predicted probabilities of class 1, using the cross-validation results. For that I tried the following two ways.
pred = clf.predict_proba(X)[:,1]
print(pred)
probs = clf.predict_proba(X)
best_n = np.argsort(probs, axis=1)[:,-6:]
I get the following error
NotFittedError: This RandomForestClassifier instance is not fitted
yet. Call 'fit' with appropriate arguments before using this method.
in both cases.
I am just wondering where I am going wrong.
I am happy to provide more details if needed.
In case you want to use one of the CV models for unseen data points, use the following approach.
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
iris = datasets.load_iris()
X = iris.data
y = iris.target
clf = RandomForestClassifier(n_estimators=10, random_state = 42, class_weight="balanced")
cv_results = cross_validate(clf, X, y, cv=3, return_estimator=True)
clf_fold_0 = cv_results['estimator'][0]
clf_fold_0.predict_proba([iris.data[133]])
# array([[0. , 0.5, 0.5]])
Have a look at the documentation; it specifies that the probability is calculated based on the mean results from the trees.
In your case, you first need to call the fit() method to generate the trees in the model. Once you have fitted the model on the training data, you can call the predict_proba() method.
This is also specified in the error.
# Fit model
model = RandomForestClassifier(...)
model.fit(X_train, Y_train)
# Probabilty
model.predict_proba(X)[:,1]
I solved my problem using the following code:
proba = cross_val_predict(clf, X, y, cv=k_fold, method='predict_proba')
print(proba[:,1])
print(np.argsort(proba[:,1]))
As I understand it, GridSearchCV should find the best parameters for the model based on the average cross_val_score() evaluation (please see the description of the best_score_ result field here).
However, I get a max_depth that does not look optimal. Say, if I pass max_depth=2 manually, I get a slightly better result than the one returned by GridSearchCV, even though 2 is in the max_depth grid. GridSearchCV, however, finds that 8 is better than 2 for max_depth.
When I manually test the 'best' max_depth it found, I find that it is not the best.
Could you please explain why GridSearchCV can find a parameter whose 'accuracy' is worse than what I get from cross_val_score with an arbitrary first parameter?
For details, please, look to the code below.
UPDATE: I have updated the code, adding the random_state parameter to RandomForestClassifier (as Nain kindly suggested). Now the scores are very close, but not exactly the same. Why aren't they the same?
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
DATA_LEN = 500
NUM_CLASSES = 3
NUM_VARS = 20
np.random.seed(0)
# data generation (some feature rows are equal to target y, some of these rows are random)
y = np.random.choice(NUM_CLASSES, DATA_LEN)
xs = []
for j in range(DATA_LEN):
row = []
for i in range(NUM_VARS):
rand_int = np.random.choice(NUM_CLASSES)
if j < int(DATA_LEN / (i + 2)):
row.append(y[j])
else:
row.append(rand_int)
xs.append(row)
X = np.array(xs).reshape(DATA_LEN, -1)
# predict: cross_val_score with max_depth=2 and max_depth=8 (the last one is found by GridSearchCV)
np.random.seed(0)
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X, y)
np.random.seed(0)
# average accuracy is '0.579718326935'
print(np.average(cross_val_score(clf, X, y, cv=5, scoring='accuracy')))
np.random.seed(0)
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
clf.fit(X, y)
np.random.seed(0)
# average accuracy is '0.596127702566' which is not exactly the same as best score '0.59799999999999998'
print(np.average(cross_val_score(clf, X, y, cv=5, scoring='accuracy')))
grid_params = {'max_depth': range(2, 20)}
clf = RandomForestClassifier(n_estimators=100, random_state=0)
np.random.seed(0)
clf_searcher = GridSearchCV(clf, grid_params, scoring='accuracy', cv=5)
clf_searcher.fit(X, y)
# outputs best score '0.59799999999999998'
print('best_score=', clf_searcher.best_score_)
# outputs "best_params= {'max_depth': 5}"
print('best_params=', clf_searcher.best_params_)
Using Python and scikit-learn, I'd like to do a grid search. But some of my models end up being empty. How can I make the grid-search function ignore those models?
I guess I could have a scoring function that returns 0 if the model is empty, but I'm not sure how.
predictor = sklearn.svm.LinearSVC(penalty='l1', dual=False, class_weight='auto')
param_dist = {'C': pow(2.0, np.arange(-10, 11))}
learner = sklearn.grid_search.GridSearchCV(estimator=predictor,
param_grid=param_dist,
n_jobs=self.n_jobs, cv=5,
verbose=0)
learner.fit(X, y)
My data is such that this learner object will choose a C corresponding to an empty model. Any idea how I can make sure the model is not empty?
EDIT: by an "empty model" I mean a model that has selected 0 features to use. Especially with an l1-regularized model, this can easily happen: if the C of the SVM is small enough, the optimization problem will find the zero vector as the optimal solution for the coefficients, so predictor.coef_ will be a vector of 0s.
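To make the "empty model" concrete, here is a small illustration (a sketch only; the dataset and the exact value of C are arbitrary assumptions): with a sufficiently small C, the l1 penalty drives every coefficient to zero, so no features are selected.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X_demo, y_demo = make_classification(n_samples=200, n_features=20, random_state=0)
empty_clf = LinearSVC(penalty='l1', dual=False, C=1e-4).fit(X_demo, y_demo)
print(np.count_nonzero(empty_clf.coef_))  # typically 0 here -> an "empty" model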
Try to implement a custom scorer, something similar to:
import numpy as np
def scorer_(estimator, X, y):
# Your criterion here
if np.allclose(estimator.coef_, np.zeros_like(estimator.coef_)):
return 0
else:
return estimator.score(X, y)
learner = sklearn.grid_search.GridSearchCV(...
scoring=scorer_)
I don't think there is such a built-in function; it's easy, however, to make a custom gridsearcher:
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
import itertools
from sklearn import metrics
import operator
import numpy as np
from sklearn import svm
from sklearn.datasets import make_classification
def model_eval(X, y, model, cv):
scores = []
for train_idx, test_idx in cv:
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]
model.fit(X_train, y_train)
nonzero_coefs = len(np.nonzero(model.coef_)[0]) #check for nonzero coefs
if nonzero_coefs == 0: #if they're all zero, don't evaluate any further; move to next hyperparameter combo
return 0
predictions = model.predict(X_test)
score = metrics.accuracy_score(y_test, predictions)
scores.append(score)
return np.array(scores).mean()
X, y = make_classification(n_samples=1000,
n_features=10,
n_informative=3,
n_redundant=0,
n_repeated=0,
n_classes=2,
random_state=0,
shuffle=False)
C = pow(2.0, np.arange(-20, 11))
penalty = {'l1', 'l2'}
parameter_grid = itertools.product(C, penalty)
kf = KFold(X.shape[0], n_folds=5) #use the same folds to evaluate each hyperparameter combo
hyperparameter_scores = {}
for C, penalty in parameter_grid:
model = svm.LinearSVC(dual=False, C=C, penalty=penalty)
result = model_eval(X, y, model, kf)
hyperparameter_scores[(C, penalty)] = result
sorted_scores = sorted(hyperparameter_scores.items(), key=operator.itemgetter(1))
best_parameters, best_score = sorted_scores[-1]
print(best_parameters)
print(best_score)