I just learned about machine learning. I wrote a piece of code for KNN, but I can't figure out how to add weights = 1/distances to it. Can anyone help me, please?
Thank you very much!
I'm sorry about the wording; I used Google Translate.
Below is the code I'm referring to:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors, datasets
import pandas as pd
from sklearn.model_selection import train_test_split
class KNearestNeighbor():
    def __init__(self, k):
        self.k = k

    def train(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X_test):
        distances = self.euclidean_distance(X_test)
        return self.predict_labels(distances)

    def euclidean_distance(self, X_test):
        num_test = X_test.shape[0]
        num_train = self.X_train.shape[0]
        distances = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                distances[i, j] = np.sqrt(np.sum((X_test[i, :] - self.X_train[j, :]) ** 2))
        return distances

    def predict_labels(self, distances):
        num_test = distances.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            y_indices = np.argsort(distances[i, :])
            k_closest_classes = self.y_train[y_indices[:self.k]].astype(int)
            y_pred[i] = np.argmax(np.bincount(k_closest_classes))
        return y_pred
if __name__ == '__main__':
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=2)
    KNN = KNearestNeighbor(k=10)
    KNN.train(X_train, y_train)
    y_pred = KNN.predict(X_test)
    print(f'Accuracy: {sum(y_pred == y_test) / y_test.shape[0]}')
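One way to get weights = 1/distances (a sketch of the idea, not part of the referenced code) is to let np.bincount take a weights array inside predict_labels, so each neighbor votes with weight 1/distance; the small epsilon is my own addition to guard against division by zero when a test point coincides with a training point:

    def predict_labels_weighted(self, distances):
        # Same as predict_labels, but each neighbor votes with weight 1/distance.
        num_test = distances.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            y_indices = np.argsort(distances[i, :])
            k_nearest = y_indices[:self.k]
            k_classes = self.y_train[k_nearest].astype(int)
            weights = 1.0 / (distances[i, k_nearest] + 1e-8)  # epsilon avoids divide-by-zero
            y_pred[i] = np.argmax(np.bincount(k_classes, weights=weights))
        return y_pred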
I was creating a custom rbf function for the SVC class of sklearn as follows:
def rbf_kernel(x, y, gamma):
    dis = np.sqrt(((x.reshape(-1, 1)) - y.reshape(1, -1)) ** 2)
    return np.exp(-(gamma * dis) ** 2)

def eval_kernel(kernel):
    model = SVC(kernel=kernel, C=C, gamma=gamma, degree=degree, coef0=coef0)
    model.fit(X_train, y_train)
    X_test_predict = model.predict(X_test)
    acc = (X_test_predict == y_test).sum() / y_test.shape[0]
    return acc

for k1, k2 in [('rbf', lambda x, y: rbf_kernel(x, y, gamma))]:
    acc1 = eval_kernel(k1)
    acc2 = eval_kernel(k2)
    assert(abs(acc1 - acc2) < eps)
The shape of X_train is (396, 10), y_train is (396, 10) and X_test is (132, 10).
However, when I try to run it, I get an error saying:
ValueError: X.shape[1] = 3960 should be equal to 396, the number of samples at training time
It seems the error is due to the difference in dimensions between X_test and X_train, but is there any way to fix it?
Thank you in advance!
Your rbf kernel is written incorrectly. The kernel callable needs to return a Gram matrix of shape (n_samples_1, n_samples_2) for its two inputs; in your code you basically unravelled everything, hence the error. You can refer to the actual code for rbf_kernel used by sklearn, and if we insert that it will work:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

X, y = make_classification(528)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

def my_kernel(X, Y, gamma=0.1):
    K = euclidean_distances(X, Y, squared=True)
    K *= -gamma
    np.exp(K, K)  # exponentiate K in-place
    return K
def eval_kernel(kernel):
    model = SVC(kernel=kernel, gamma=0.1)
    model.fit(X_train, y_train)
    X_test_predict = model.predict(X_test)
    acc = (X_test_predict == y_test).sum() / y_test.shape[0]
    return acc
>>> eval_kernel('rbf')
0.8409090909090909
>>> eval_kernel(my_kernel)
0.8409090909090909
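As a quick sanity check (my own addition, not part of the original answer), the custom kernel can be compared directly against sklearn.metrics.pairwise.rbf_kernel; the two Gram matrices should match:

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel as sklearn_rbf

K_custom = my_kernel(X_train, X_train, gamma=0.1)
K_reference = sklearn_rbf(X_train, X_train, gamma=0.1)
print(K_custom.shape)                       # (n_train, n_train) Gram matrix
print(np.allclose(K_custom, K_reference))   # should print True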
I have data with differing weights for each sample. In my application, it is important that these weights are accounted for in estimating the model and comparing alternative models.
I'm using sklearn to estimate models and to compare alternative hyperparameter choices. But this unit test shows that GridSearchCV does not apply sample_weights to estimate scores.
Is there a way to have sklearn use sample_weight to score the models?
Unit test:
from __future__ import division
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, RepeatedKFold
def grid_cv(X_in, y_in, w_in, cv, max_features_grid, use_weighting):
    out_results = dict()

    for k in max_features_grid:
        clf = RandomForestClassifier(n_estimators=256,
                                     criterion="entropy",
                                     warm_start=False,
                                     n_jobs=-1,
                                     random_state=RANDOM_STATE,
                                     max_features=k)
        for train_ndx, test_ndx in cv.split(X=X_in, y=y_in):
            X_train = X_in[train_ndx, :]
            y_train = y_in[train_ndx]
            w_train = w_in[train_ndx]
            y_test = y_in[test_ndx]

            clf.fit(X=X_train, y=y_train, sample_weight=w_train)
            y_hat = clf.predict_proba(X=X_in[test_ndx, :])
            if use_weighting:
                w_test = w_in[test_ndx]
                w_i_sum = w_test.sum()
                score = w_i_sum / w_in.sum() * log_loss(y_true=y_test, y_pred=y_hat, sample_weight=w_test)
            else:
                score = log_loss(y_true=y_test, y_pred=y_hat)

            results = out_results.get(k, [])
            results.append(score)
            out_results.update({k: results})

    for k, v in out_results.items():
        if use_weighting:
            mean_score = sum(v)
        else:
            mean_score = np.mean(v)
        out_results.update({k: mean_score})

    best_score = min(out_results.values())
    best_param = min(out_results, key=out_results.get)
    return best_score, best_param
if __name__ == "__main__":
RANDOM_STATE = 1337
X, y = load_iris(return_X_y=True)
sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
# sample_weight = np.array([1 for _ in range(len(X))])
inner_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
outer_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
rfc = RandomForestClassifier(n_estimators=256,
criterion="entropy",
warm_start=False,
n_jobs=-1,
random_state=RANDOM_STATE)
search_params = {"max_features": [1, 2, 3, 4]}
fit_params = {"sample_weight": sample_weight}
my_scorer = make_scorer(log_loss,
greater_is_better=False,
needs_proba=True,
needs_threshold=False)
grid_clf = GridSearchCV(estimator=rfc,
scoring=my_scorer,
cv=inner_cv,
param_grid=search_params,
refit=True,
return_train_score=False,
iid=False) # in this usage, the results are the same for `iid=True` and `iid=False`
grid_clf.fit(X, y, **fit_params)
print("This is the best out-of-sample score using GridSearchCV: %.6f." % -grid_clf.best_score_)
msg = """This is the best out-of-sample score %s weighting using grid_cv: %.6f."""
score_with_weights, param_with_weights = grid_cv(X_in=X,
y_in=y,
w_in=sample_weight,
cv=inner_cv,
max_features_grid=search_params.get(
"max_features"),
use_weighting=True)
print(msg % ("WITH", score_with_weights))
score_without_weights, param_without_weights = grid_cv(X_in=X,
y_in=y,
w_in=sample_weight,
cv=inner_cv,
max_features_grid=search_params.get(
"max_features"),
use_weighting=False)
print(msg % ("WITHOUT", score_without_weights))
Which produces output:
This is the best out-of-sample score using GridSearchCV: 0.135692.
This is the best out-of-sample score WITH weighting using grid_cv: 0.099367.
This is the best out-of-sample score WITHOUT weighting using grid_cv: 0.135692.
Explanation: Since manually computing the loss without weighting produces the same scoring as GridSearchCV, we know that the sample weights are not being used.
GridSearchCV takes a scoring argument, which can be a callable. You can see the details of how to change the scoring function, and how to pass your own scoring function, here. For the sake of completeness, the pattern looks roughly like the sketch below (my paraphrase of the documented make_scorer usage, not the exact snippet from that page):
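from sklearn.metrics import make_scorer, log_loss

# greater_is_better=False flips the sign so GridSearchCV can maximize the score,
# and needs_proba=True hands predict_proba output to the metric.
my_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)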
EDIT: fit_params is passed only to the fit functions, not to the score functions. If there are parameters that are supposed to be passed to the scorer, they should be passed to make_scorer. But that still doesn't solve the issue here, since it would mean the whole sample_weight parameter is passed to log_loss, whereas only the part corresponding to y_test at the time of calculating the loss should be passed.
sklearn does NOT support such a thing, but you can hack your way through using a pandas.DataFrame. The good news is that sklearn understands a DataFrame and keeps it that way, which means you can exploit the index of the DataFrame, as you see in the code here:
# more code

X, y = load_iris(return_X_y=True)
index = ['r%d' % x for x in range(len(y))]
y_frame = pd.DataFrame(y, index=index)
sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
sample_weight_frame = pd.DataFrame(sample_weight, index=index)

# more code

def score_f(y_true, y_pred, sample_weight):
    return log_loss(y_true.values, y_pred,
                    sample_weight=sample_weight.loc[y_true.index.values].values.reshape(-1),
                    normalize=True)

score_params = {"sample_weight": sample_weight_frame}
my_scorer = make_scorer(score_f,
                        greater_is_better=False,
                        needs_proba=True,
                        needs_threshold=False,
                        **score_params)

grid_clf = GridSearchCV(estimator=rfc,
                        scoring=my_scorer,
                        cv=inner_cv,
                        param_grid=search_params,
                        refit=True,
                        return_train_score=False,
                        iid=False)  # in this usage, the results are the same for `iid=True` and `iid=False`
grid_clf.fit(X, y_frame)

# more code
As you can see, score_f uses the index of y_true to find which parts of sample_weight to use. For the sake of completeness, here's the whole code:
from __future__ import division

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.metrics import make_scorer
import pandas as pd

def grid_cv(X_in, y_in, w_in, cv, max_features_grid, use_weighting):
    out_results = dict()

    for k in max_features_grid:
        clf = RandomForestClassifier(n_estimators=256,
                                     criterion="entropy",
                                     warm_start=False,
                                     n_jobs=1,
                                     random_state=RANDOM_STATE,
                                     max_features=k)
        for train_ndx, test_ndx in cv.split(X=X_in, y=y_in):
            X_train = X_in[train_ndx, :]
            y_train = y_in[train_ndx]
            w_train = w_in[train_ndx]
            y_test = y_in[test_ndx]

            clf.fit(X=X_train, y=y_train, sample_weight=w_train)
            y_hat = clf.predict_proba(X=X_in[test_ndx, :])
            if use_weighting:
                w_test = w_in[test_ndx]
                w_i_sum = w_test.sum()
                score = w_i_sum / w_in.sum() * log_loss(y_true=y_test, y_pred=y_hat, sample_weight=w_test)
            else:
                score = log_loss(y_true=y_test, y_pred=y_hat)

            results = out_results.get(k, [])
            results.append(score)
            out_results.update({k: results})

    for k, v in out_results.items():
        if use_weighting:
            mean_score = sum(v)
        else:
            mean_score = np.mean(v)
        out_results.update({k: mean_score})

    best_score = min(out_results.values())
    best_param = min(out_results, key=out_results.get)
    return best_score, best_param

#if __name__ == "__main__":
if True:
    RANDOM_STATE = 1337
    X, y = load_iris(return_X_y=True)
    index = ['r%d' % x for x in range(len(y))]
    y_frame = pd.DataFrame(y, index=index)
    sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
    sample_weight_frame = pd.DataFrame(sample_weight, index=index)
    # sample_weight = np.array([1 for _ in range(len(X))])

    inner_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
    outer_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)

    rfc = RandomForestClassifier(n_estimators=256,
                                 criterion="entropy",
                                 warm_start=False,
                                 n_jobs=1,
                                 random_state=RANDOM_STATE)
    search_params = {"max_features": [1, 2, 3, 4]}

    def score_f(y_true, y_pred, sample_weight):
        return log_loss(y_true.values, y_pred,
                        sample_weight=sample_weight.loc[y_true.index.values].values.reshape(-1),
                        normalize=True)

    score_params = {"sample_weight": sample_weight_frame}
    my_scorer = make_scorer(score_f,
                            greater_is_better=False,
                            needs_proba=True,
                            needs_threshold=False,
                            **score_params)

    grid_clf = GridSearchCV(estimator=rfc,
                            scoring=my_scorer,
                            cv=inner_cv,
                            param_grid=search_params,
                            refit=True,
                            return_train_score=False,
                            iid=False)  # in this usage, the results are the same for `iid=True` and `iid=False`
    grid_clf.fit(X, y_frame)
    print("This is the best out-of-sample score using GridSearchCV: %.6f." % -grid_clf.best_score_)

    msg = """This is the best out-of-sample score %s weighting using grid_cv: %.6f."""

    score_with_weights, param_with_weights = grid_cv(X_in=X,
                                                     y_in=y,
                                                     w_in=sample_weight,
                                                     cv=inner_cv,
                                                     max_features_grid=search_params.get("max_features"),
                                                     use_weighting=True)
    print(msg % ("WITH", score_with_weights))

    score_without_weights, param_without_weights = grid_cv(X_in=X,
                                                           y_in=y,
                                                           w_in=sample_weight,
                                                           cv=inner_cv,
                                                           max_features_grid=search_params.get("max_features"),
                                                           use_weighting=False)
    print(msg % ("WITHOUT", score_without_weights))
The output of the code is then:
This is the best out-of-sample score using GridSearchCV: 0.095439.
This is the best out-of-sample score WITH weighting using grid_cv: 0.099367.
This is the best out-of-sample score WITHOUT weighting using grid_cv: 0.135692.
EDIT 2: as the comment below says:
the difference in my score and the sklearn score using this solution
originates in the way that I was computing a weighted average of
scores. If you omit the weighted average portion of the code, the two
outputs match to machine precision.
Currently in sklearn, GridSearchCV (and any class inheriting from BaseSearchCV) only allows sample_weight to be passed in **fit_params and does not use it in scoring. This is not correct, since cross-validation picks the "best estimator" via an unweighted score. Note that grid.fit(X, y, sample_weight=w) only uses the sample weights in fit, not in scoring.
There are two ways to solve this problem:
Handy method: add the weights as the first column of X, then write a customized scoring function and transformer around your model (a self-contained sketch of this pattern follows after the second option below).
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# customized scorer: recover the weights from the first column of X
def weight_remover_scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    w = X[:, 0]
    return your_scorer(y, y_pred, sample_weight=w)  # your_scorer: whatever weighted metric you use

# customized transformer: drop the weight column before the model sees the data
class WeightRemover(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X[:, 1:]

# in your main function (model, params_grid, train_w and test_w are your own objects)
if __name__ == '__main__':
    pipe = Pipeline([('remove_weight', WeightRemover()), ('model', model)])
    params_grid = {'model__' + k: v for k, v in params_grid.items()}
    X = np.c_[train_w, X]
    X_test = np.c_[test_w, X_test]
    grid = GridSearchCV(pipe, params_grid, cv=5, scoring=weight_remover_scorer)
    grid.fit(X, y)
Add the feature to the sklearn classes themselves (and wait for a new release): add a sample_weight parameter to BaseSearchCV (default None), and index it safely per split in the same way fit_params is handled via fit_params = _check_fit_params(X, fit_params).
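For concreteness, here is a self-contained sketch of the first (weight-in-the-first-column) approach, with placeholder choices of my own (iris data, LogisticRegression, plain accuracy as the weighted metric) purely for illustration:

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

class WeightRemover(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X[:, 1:]  # drop the weight column before the model sees the data

def weighted_accuracy_scorer(estimator, X, y):
    w = X[:, 0]                    # recover the weights from the first column
    y_pred = estimator.predict(X)  # the pipeline strips the weight column itself
    return accuracy_score(y, y_pred, sample_weight=w)

X, y = load_iris(return_X_y=True)
w = np.random.RandomState(0).uniform(0.5, 2.0, size=len(y))  # made-up weights

pipe = Pipeline([('remove_weight', WeightRemover()),
                 ('model', LogisticRegression(max_iter=1000))])
params_grid = {'model__C': [0.1, 1.0, 10.0]}

Xw = np.c_[w, X]  # prepend the weights as the first column
grid = GridSearchCV(pipe, params_grid, cv=5, scoring=weighted_accuracy_scorer)
grid.fit(Xw, y)
print(grid.best_params_, grid.best_score_)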
Just pointing out that there is an ongoing effort to support this important feature: https://github.com/scikit-learn/scikit-learn/pull/13432
But it seems that because of backward compatibility issues and the desire to tackle the more general problem of passing arbitrary sample related information it is taking a bit too long. The last attempt seems to be: https://github.com/scikit-learn/scikit-learn/pull/16079
Here is a good review of the issue: http://deaktator.github.io/2019/03/10/the-error-in-the-comparator/
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
r = pd.read_csv("vitalsign_test.csv")
clm_list = []
for column in r.columns:
    clm_list.append(column)

X = r[clm_list[1:len(clm_list)-1]].values
y = r[clm_list[len(clm_list)-1]].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

k_range = range(1, 25)
scores = []
for k in k_range:
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
scores.append(metrics.accuracy_score(y_test, y_pred))

plt.plot(k_range, scores)
plt.xlabel('value of k for clf')
plt.ylabel('testing accuracy')
The error I am getting is:
ValueError: x and y must have same first dimension
My feature and response shapes are:
y.shape
Out[60]: (500,)
X.shape
Out[61]: (500, 6)
It has nothing to do with your X and y. It is about the x and y arguments to plot: your scores list ends up with only one element, while k_range has 24. The error comes from incorrect indentation:
for k in k_range:
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
scores.append(metrics.accuracy_score(y_test, y_pred))
should be
for k in k_range:
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred))
Using Python and scikit-learn, I'd like to do a grid search. But some of my models end up being empty. How can I make the grid search function ignore those models?
I guess I could use a scoring function that returns 0 if the model is empty, but I'm not sure how.
predictor = sklearn.svm.LinearSVC(penalty='l1', dual=False, class_weight='auto')
param_dist = {'C': pow(2.0, np.arange(-10, 11))}
learner = sklearn.grid_search.GridSearchCV(estimator=predictor,
                                           param_grid=param_dist,
                                           n_jobs=self.n_jobs, cv=5,
                                           verbose=0)
learner.fit(X, y)
My data is such that this learner object will choose a C corresponding to an empty model. Any idea how I can make sure the model is not empty?
EDIT: by an "empty model" I mean a model that has selected 0 features to use. Especially with an l1-regularized model, this can easily happen: if the C in the SVM is small enough, the optimization problem will find the 0 vector as the optimal solution for the coefficients, so predictor.coef_ will be a vector of 0s.
Try implementing a custom scorer, something similar to:
import numpy as np

def scorer_(estimator, X, y):
    # Your criterion here
    if np.allclose(estimator.coef_, np.zeros_like(estimator.coef_)):
        return 0
    else:
        return estimator.score(X, y)

learner = sklearn.grid_search.GridSearchCV(...,
                                           scoring=scorer_)
I don't think there is such a built-in function; it's easy, however, to make a custom gridsearcher:
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.datasets import make_classification
from sklearn import metrics, svm
import numpy as np
import itertools
import operator

def model_eval(X, y, model, cv):
    scores = []
    for train_idx, test_idx in cv:
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        model.fit(X_train, y_train)
        nonzero_coefs = len(np.nonzero(model.coef_)[0])  # check for nonzero coefs
        if nonzero_coefs == 0:  # if they're all zero, don't evaluate any further; move to the next hyperparameter combo
            return 0
        predictions = model.predict(X_test)
        score = metrics.accuracy_score(y_test, predictions)
        scores.append(score)
    return np.array(scores).mean()

X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)

C = pow(2.0, np.arange(-20, 11))
penalty = {'l1', 'l2'}
parameter_grid = itertools.product(C, penalty)

kf = KFold(X.shape[0], n_folds=5)  # use the same folds to evaluate each hyperparameter combo

hyperparameter_scores = {}
for C, penalty in parameter_grid:
    model = svm.LinearSVC(dual=False, C=C, penalty=penalty)
    result = model_eval(X, y, model, kf)
    hyperparameter_scores[(C, penalty)] = result

sorted_scores = sorted(hyperparameter_scores.items(), key=operator.itemgetter(1))
best_parameters, best_score = sorted_scores[-1]
print(best_parameters)
print(best_score)