I want to find the best k for k-nearest-neighbors. I am using LeaveOneOut to divide my data into train and test sets. In the code below I have 150 data entries, so I get 150 different train and test sets. k should be between 1 and 40.
I want to plot the cross-validation average classification error as a function of k, to see which k is best for KNN.
Here is my code:
import scipy.io as sio
import seaborn as sn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LeaveOneOut
error = []
array = np.array(range(1, 41))

dataset = pd.read_excel('Data/iris.xls')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

loo = LeaveOneOut()
loo.get_n_splits(X)

for train_index, test_index in loo.split(X):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print(X_train, X_test, y_train, y_test)
    for i in range(1, 41):
        classifier = KNeighborsClassifier(n_neighbors=i)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        error.append(np.mean(y_pred != y_test))

plt.figure(figsize=(12, 6))
plt.plot(range(1, 41), error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
You are calculating the error at each individual prediction; that's why you have 6,000 points in your error array (150 splits × 40 values of k). You need to collect the predictions for all splits for a given n_neighbors, and only then calculate the error for that value.
You can do this:
error = []
# Loop over possible values of "n_neighbors"
for i in range(1, 41):
    # Collect the actual and predicted values across all splits for a single "n_neighbors"
    actual = []
    predicted = []
    for train_index, test_index in loo.split(X):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        classifier = KNeighborsClassifier(n_neighbors=i)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        # Each LeaveOneOut test set holds exactly one sample, so append the
        # single prediction and actual value here.
        actual.append(y_test[0])
        predicted.append(y_pred[0])
    # Outside the inner loop, calculate the error for this n_neighbors.
    error.append(np.mean(np.array(predicted) != np.array(actual)))
The rest of your code is okay.
There is a more compact way to do this if you use cross_val_predict:
from sklearn.model_selection import cross_val_predict

error = []
for i in range(1, 41):
    classifier = KNeighborsClassifier(n_neighbors=i)
    # Returns the out-of-fold prediction for every sample under leave-one-out CV
    y_pred = cross_val_predict(classifier, X, y, cv=loo)
    error.append(np.mean(y_pred != y))
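Either way, once the loop finishes you can read the best k straight off the list rather than only from the plot. A small sketch, assuming error holds one mean error per k in range(1, 41):

import numpy as np

# np.argmin returns a 0-based index, while k starts at 1
best_k = int(np.argmin(error)) + 1
print("Lowest LOO error at k =", best_k)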
Related
I am currently attempting to train a neural network that predicts a 1kHz sine wave.
While the model has an R² score of 0.89 on the training data, it does not accurately predict my test data.
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
# Generate 1 kHz sine wave
pi = np.pi
X = np.arange(0, 2*pi, 0.05)
y = np.sin(1000*X)

tscv = TimeSeriesSplit(n_splits=3, test_size=30)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

plt.plot(X_train, y_train)
plt.plot(X_test, y_test)

# Training using train samples
sc = StandardScaler()
X_train = sc.fit_transform(X_train.reshape(-1, 1))
X_test = sc.fit_transform(X_test.reshape(-1, 1))

regr = MLPRegressor(random_state=1, max_iter=1000,
                    hidden_layer_sizes=(32, 32)).fit(X_train, y_train)
regr.fit(X_train, y_train)

plt.plot(X_train, regr.predict(X_train), color='red')
plt.scatter(X_train, y_train)
regr.score(X_train, y_train)
Result of training
plt.scatter(X_test, y_test, color = 'red')
plt.plot(X_test, regr.predict(X_test))
Result of test
As you can see, the model's output on the test data is far less periodic than on the training data. Why is this the case?
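One thing worth checking, independent of the network: with a step of 0.05, np.sin(1000*X) advances 50 radians between consecutive samples, far past the Nyquist limit, so the points the model sees are an aliased scatter rather than a smooth wave. A quick sanity check, reusing np from above:

phase_per_sample = 1000 * 0.05                    # 50 radians between samples
samples_per_cycle = 2 * np.pi / phase_per_sample
print(samples_per_cycle)                          # ~0.126, far below the 2 required by Nyquist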
I want to change my code so that instead of this part:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100, test_size=0.2)

train_data = X_train.copy()
train_data.loc[:, 'target'] = y_train
test_data = X_test.copy()
test_data.loc[:, 'target'] = y_test

data_config = DataConfig(
    # target should always be a list. Multi-targets are only supported for
    # regression. Multi-Task Classification is not implemented.
    target=['target'],
    continuous_cols=train_data.columns.tolist(),
    categorical_cols=[],
    normalize_continuous_features=True
)
trainer_config = TrainerConfig(
    auto_lr_find=True,
    batch_size=64,
    max_epochs=10,
)
optimizer_config = {
    'optimizer': 'Adam',
    'optimizer_params': {'weight_decay': 0, 'amsgrad': False},
    'lr_scheduler': None,
    'lr_scheduler_params': {},
    'lr_scheduler_monitor_metric': 'valid_loss'
}
model_config = NodeConfig(
    task="classification",
    num_layers=2,
    num_trees=512,
    learning_rate=1,
    embed_categorical=True,
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
tabular_model.fit(train=train_data, test=test_data)

pred = tabular_model.predict(test_data)
pred['prediction'] = pred['prediction'].astype(int)
pred.loc[pred['prediction'] >= 1] = 1
print_metrics(test_data['target'], pred["prediction"].astype('int'), tag="Holdout")
I want to use the k-fold method with k = 5 or 10 instead.
Thank you for your advice.
The complete code example in which I used train_test_split is above.
Here is a data-splitting example from the scikit-learn cross-validation guide (note that this snippet shows a simple hold-out split; k-fold itself is sketched further below):
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
X, y = datasets.load_iris(return_X_y=True)
X.shape, y.shape
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0)
X_train.shape, y_train.shape
X_test.shape, y_test.shape
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)
result (in this example):
0.9666666666666667
The example is from here: https://scikit-learn.org/stable/modules/cross_validation.html
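To run actual k-fold on the same data, you can replace train_test_split with KFold and average the per-fold scores. A minimal sketch with k = 5, reusing the iris/SVC setup from that example (cross_val_score from the same page is an even shorter route):

import numpy as np
from sklearn import datasets, svm
from sklearn.model_selection import KFold

X, y = datasets.load_iris(return_X_y=True)

kf = KFold(n_splits=5, shuffle=True, random_state=0)
scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))

print(np.mean(scores))  # average accuracy over the 5 folds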
How can I access the train and test data for each fold in cross-validation? I would like to save these to .csv files. I tried using the split function, which generates the indices, but it returns a generator object rather than the indices themselves.
from sklearn.model_selection import StratifiedKFold, KFold
import numpy as np

X, y = np.ones((50, 1)), np.hstack(([0] * 45, [1] * 5))
skf = StratifiedKFold(n_splits=3)
x = skf.split(X, y)
x
Output:
<generator object _BaseKFold.split at 0x7ff195979580>
StratifiedKFold.split returns a generator, so you need to iterate over it to get the indices for each fold, as follows:
skf = StratifiedKFold(n_splits=3)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
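From there, writing each fold out to .csv is straightforward. A sketch using pandas, reusing skf, X, and y from above (the file names are arbitrary):

import pandas as pd

for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Combine features and labels into one frame per fold, then write to disk
    train_df = pd.DataFrame(X[train_index])
    train_df['label'] = y[train_index]
    test_df = pd.DataFrame(X[test_index])
    test_df['label'] = y[test_index]
    train_df.to_csv(f'train_fold_{fold}.csv', index=False)
    test_df.to_csv(f'test_fold_{fold}.csv', index=False)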
I am trying logistic regression classification using k-fold cross-validation in Python.
My code:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score

data = pd.read_csv('xxx.csv')
X = data[["a", "b", "c", ...]]
y = data["Class"]

def get_predictions(clf, X_train, y_train, X_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred_prob = clf.predict_proba(X_test)
    train_pred = clf.predict(X_train)
    print('train-set confusion matrix:\n', confusion_matrix(y_train, train_pred))
    return y_pred, y_pred_prob

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
pred_test_full = 0
cv_score = []
i = 1
for train_index, test_index in skf.split(X, y):
    X_train, y_train = X.loc[train_index], y.loc[train_index]
    X_test, y_test = X.loc[test_index], y.loc[test_index]
    log_cfl = LogisticRegression(C=2)
    log_cfl.fit(X_train, y_train)
    y_pred, y_pred_prob = get_predictions(LogisticRegression(C=2), X_train, y_train, X_test)
    score = roc_auc_score(y_test, log_cfl.predict(X_test))
    print('ROC AUC score: ', score)
    cv_score.append(score)
    pred_test_full = pred_test_full + y_pred_prob
    i += 1
I get an error at this line of code:
pred_test_full = pred_test_full + y_pred_prob
The loop runs twice; on the third iteration I get the error:
operands could not be broadcast together with shapes (56962,2) (5696..
I can't understand what is wrong; could you help me figure it out?
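For what it's worth, the shapes most likely clash because StratifiedKFold folds differ in size by one whenever len(X) is not divisible by n_splits, so the per-fold y_pred_prob arrays cannot be summed elementwise. A common pattern, sketched here with the imports and variables from the code above, is to write each fold's test predictions into one full-length out-of-fold array instead:

import numpy as np

oof_pred_prob = np.zeros((len(X), 2))  # one row per sample, one column per class
for train_index, test_index in skf.split(X, y):
    X_train, y_train = X.loc[train_index], y.loc[train_index]
    X_test = X.loc[test_index]
    log_cfl = LogisticRegression(C=2)
    log_cfl.fit(X_train, y_train)
    # Each sample appears in exactly one test fold, so every row is filled once
    oof_pred_prob[test_index] = log_cfl.predict_proba(X_test)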
I have time series data, so I used TimeSeriesSplit with 3 splits for XGBRegressor; see the following code.
from sklearn.model_selection import TimeSeriesSplit
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
import math

tscv = TimeSeriesSplit(n_splits=3)
print(tscv)

X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_test = X[test_index]
    X_train = X[train_index]
    y_test = y[test_index]
    y_train = y[train_index]

model = XGBRegressor()
model.fit(X_train, y_train)

y_pred_test = model.predict(X_test)
rmse = math.sqrt(mean_squared_error(y_test, y_pred_test))
print(rmse)
My questions are:
1) Is the rmse result already the mean over the 3 splits? If it is just the result of one fold, how can I get the results of the other two to calculate the average?
2) How can I track the validation rmse of each fold to plot the training/test curves and check for over-/underfitting? Without TimeSeriesSplit, I used:
model.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_test, y_test)],
          eval_metric=['rmse'], verbose=True)
So I got:
[1] validation_0-rmse:0.565858 validation_1-rmse:0.574236
[2] validation_0-rmse:0.550307 validation_1-rmse:0.567077
[3] validation_0-rmse:0.53824 validation_1-rmse:0.56323
... ...
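For reference: as written above, model.fit sits outside the loop, so the printed rmse covers only the last split. A sketch that fits inside the loop, collects one rmse per split, and keeps each fold's per-round history via the standard XGBoost evals_result() method (reusing the setup and variable names above, with the same eval_metric-in-fit style the question already uses):

import numpy as np

fold_rmse = []
histories = []
for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = XGBRegressor()
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_test, y_test)],
              eval_metric=['rmse'], verbose=False)
    y_pred_test = model.predict(X_test)
    fold_rmse.append(math.sqrt(mean_squared_error(y_test, y_pred_test)))
    # Per-boosting-round train (validation_0) and test (validation_1) RMSE,
    # usable for plotting a learning curve per fold
    histories.append(model.evals_result())

print('Per-split RMSE:', fold_rmse)
print('Mean RMSE:', np.mean(fold_rmse))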