Evaluate and track/plot of TimeSeriesSplit - python

I have got time series data. So I used TimeSeriesSplit with 3 splits for XGBRegressor, see the following code.
from sklearn.model_selection import TimeSeriesSplit
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
import math
tscv = TimeSeriesSplit(n_splits=3)
print(tscv)
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
for train_index, test_index in tscv.split(X):
print("TRAIN:", train_index, "TEST:", test_index)
X_test = X[test_index]
X_train = X[train_index]
y_test = y[test_index]
y_train = y[train_index]
model = XGBRegressor()
model.fit(X_train, y_train)
y_pred_test = model.predict(X_test)
rmse = (math.sqrt(mean_squared_error(y_test, y_pred_test)))
print(rmse)
My questions are:
1) Is the rmse result already the mean result of the 3 values? If it´s just the result of one fold, how can I show the results of the other 2 to calculate the average?
2) How can I track the validation rmse of each fold to plot the training/ test curve to check over-/ underfitting? Without TimeSeriesSplit I defined
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric=['rmse'],verbose=True)
So I got:
[1] validation_0-rmse:0.565858 validation_1-rmse:0.574236
[2] validation_0-rmse:0.550307 validation_1-rmse:0.567077
[3] validation_0-rmse:0.53824 validation_1-rmse:0.56323
... ...

Related

Accuracy score for sklearn not returning a value

I have a dataset and have one hot encoded the target column (5 different strings throughout the entire column) using pd.get_dummies. I have then used sklearn's train_test_split function to create the training, testing and validation sets. The training set (features) were then normalized with standardScalar(). I have fit the training sets of both the features and the target to a logistic regression model.
I am now trying to calculate the accuracy score for the training, validation and test sets but am having no luck. My code up to this part is below:
dataset = pd.read_csv('tabular_data/clean_tabular_data.csv')
features, label = load_airbnb(dataset, 'Category')
label_series = dataset['Category']
label_encoded = pd.get_dummies(label_series)
X_train, X_test, y_train, y_test = train_test_split(features, label_encoded, test_size=0.3)
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test, test_size=0.5)
# normalize the features
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_validation_scaled = scaler.transform(X_validation)
X_test_scaled = scaler.transform(X_test)
# get baseline classification model
model = LogisticRegression()
y_train = y_train.iloc[:, 0]
model.fit(X_train_scaled, y_train)
y_train_pred = model.predict(X_train_scaled)
y_train_pred = np.argmax(y_train_pred, axis=0)
y_validation_pred = model.predict(X_validation_scaled)
y_validation_pred = np.argmax(y_validation_pred, axis =0)
y_test_pred = model.predict(X_test_scaled)
y_test_pred = np.argmax(y_test_pred, axis = 0)
# evaluate model using accuracy
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
validation_acc = accuracy_score(y_validation, y_validation_pred)
The error I am getting is here: "File "C:\Users\lcox1\Documents\VSCode\AiCore\Data science\classification_prac.py", line 56, in
train_acc = accuracy_score(y_train, y_train_pred)
TypeError: Singleton array 16 cannot be considered a valid collection."
I am fairly new to python so have no idea what the issue is. Any help appreciated.
You are getting that error because of these lines:
y_train_pred = model.predict(X_train_scaled)
y_train_pred = np.argmax(y_train_pred, axis=0)
When you call model.predict(), it actually returns you an array of predicted labels, and not the probabilities. And if you do argmax of this array, you get 1 value, which is the index of the maximum value, hence it throws you the error, during prediction.
Most likely you mean to do:
y_train_pred = model.predict_proba(X_train_scaled)
y_train_pred = np.argmax(y_train_pred, axis=1)
y_train_pred
As #BenReiniger pointed out in the comments, if you are trying to train a model on multi class labels, you should not one-hot encode. Try something below, where I used an example dataset, and have the labels as a category:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
data = load_iris()
features = data.data
label_series = pd.Series(data.target).map({0:"setosa",1:"virginica",2:"versicolor"})
label_series = pd.Categorical(label_series)
le = LabelEncoder()
label_encoded = le.fit_transform(label_series)
Running your code with some changes:
X_train, X_test, y_train, y_test = train_test_split(features, label_encoded, test_size=0.3)
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test, test_size=0.5)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_validation_scaled = scaler.transform(X_validation)
X_test_scaled = scaler.transform(X_test)
# get baseline classification model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_train_pred = model.predict_proba(X_train_scaled)
y_train_pred = np.argmax(y_train_pred, axis=1)
y_validation_pred = model.predict_proba(X_validation_scaled)
y_validation_pred = np.argmax(y_validation_pred, axis =1)
y_test_pred = model.predict_proba(X_test_scaled)
y_test_pred = np.argmax(y_test_pred, axis = 1)
# evaluate model using accuracy
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
validation_acc = accuracy_score(y_validation, y_validation_pred
The results:
print(train_acc,test_acc,validation_acc)
0.9809523809523809 0.9090909090909091 1.0

ValueError: Expected 2D array, got 1D array instead for K-fold cross validation

I am trying to perform K-Fold Cross Validation for my dataset but an error message popped up instead
ValueError: Expected 2D array, got 1D array instead:
The data looks like this:
Fuel Consumption Distance
13.046 298.89
14.717 468.60
15.032 464.38
Below is the code that I have used:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3,shuffle=True,random_state=42)
kf
x = data.loc[:,'Fuel Consumption'].values
y = data.loc[:, 'Distance'].values
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=42, shuffle=True)
for train_index, test_index in kf.split(y):
print(train_index, test_index)
def get_score(model, x_train, x_test, y_train, y_test):
model.fit(x_train, y_train)
return model.score(x_test, y_test)
scores = []
best_svr = SVR(kernel='rbf')
cv = KFold(n_splits=3, random_state=42, shuffle=True)
for train_index, test_index in cv.split(x):
print("Train Index: ", train_index, "\n")
print("Test Index: ", test_index)
x_train, x_test, y_train, y_test = x[train_index], x[test_index], y[train_index], y[test_index]
x_train = x_train.reshape(-1, 1)
y_train = y_train.reshape(-1, 1)
best_svr.fit(x_train, y_train)
scores.append(best_svr.score(x_test, y_test))
Thanks!

Getting several splits from each fold in StratifiedKFold

I want to perform stratified 10-fold cross validation using sklearn. The train and test indices can be obtained using
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=10)
for fold, (train_index, test_index) in enumerate(kf.split(X, y), 1):
X_train = X[train_index]
y_train = y[train_index]
X_test = X[test_index]
y_test = y[test_index]
However, I would like to set not one, but two folds aside (one for tuning of hyperparameters). So, I want each iteration to consist of 8 folds for training, 1 for tuning and 1 for testing. Is this possible with sklearns StratifiedKFold? Or would I need to write a custom split method?
You could use StratifiedShuffleSplit to further split the test set in a stratified way too:
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
kf = StratifiedKFold(n_splits=10)
for fold, (train_index, test_index) in enumerate(kf.split(X, y), 1):
X_train = X[train_index]
y_train = y[train_index]
X_test = X[test_index]
y_test = y[test_index]
#stratified split on the test set
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
X_test_ix, X_tune_ix = next(sss.split(X_test, y_test))
X_test_ = X_test[X_test_ix]
y_test_ = y_test[X_test_ix]
X_tune = X_test[X_tune_ix]
y_tune = y_test[X_tune_ix]

operand could not be broadcast together with shapes <56962,2> .. error

I try logistic regression classification using "k-fold cross validation" in python.
my code:
`import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix,roc_auc_score
data = pd.read_csv('xxx.csv')
X = data[["a","b","c",...]]
y = data["Class"]
def get_predictions(clf, X_train, y_train, X_test):
clf = clf
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)
train_pred = clf.predict(X_train)
print('train-set confusion matrix:\n', confusion_matrix(y_train,train_pred))
return y_pred, y_pred_prob
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)
pred_test_full=0
cv_score=[]
i=1
for train_index, test_index in skf.split(X, y):
X_train, y_train = X.loc[train_index], y.loc[train_index]
X_test, y_test = X.loc[test_index], y.loc[test_index]
log_cfl = LogisticRegression(C=2);
log_cfl.fit(X_train, y_train)
y_pred, y_pred_prob = get_predictions(LogisticRegression(C=2), X_train, y_train, X_test)
score=roc_auc_score(y_test,log_cfl.predict(X_test))
print('ROC AUC score: ',score)
cv_score.append(score)
pred_test_full = pred_test_full + y_pred_prob
i+=1`
I get error at this line of code:
`pred_test_full = pred_test_full + y_pred_prob`
For loop runs 2 times. Then in third, I get the error.
'operands could not be broadcast together with shapes <56962,2> <5696..' error.
I couldn't understand what is wrong, could you help to figure out?

LeaveOneOut to determine k of knn

I want to know the best k for k-nearest-neighbor. I am using LeaveOneOut to divide my data into train and test sets. In the code below I have 150 data entries, so I get 150 different train and test sets. K should be in-between 1 and 40.
I want to plot the cross-validation average classification error as a function of k, too see which k is the best for KNN.
Here is my code:
import scipy.io as sio
import seaborn as sn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LeaveOneOut
error = []
array = np.array(range(1,41))
dataset = pd.read_excel('Data/iris.xls')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values
loo = LeaveOneOut()
loo.get_n_splits(X)
for train_index, test_index in loo.split(X):
#print("TRAIN:", train_index, "TEST:", test_index)
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
#print(X_train, X_test, y_train, y_test)
for i in range(1, 41):
classifier = KNeighborsClassifier(n_neighbors=i)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
error.append(np.mean(y_pred != y_test))
plt.figure(figsize=(12, 6))
plt.plot(range(1, 41), error, color='red', linestyle='dashed', marker='o', markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
You are calculating error at each prediction, thats why you have 6000 points in your error array. You need to collect the predictions of all points in the fold for a given 'n_neighbors' and then calculate the error for that value.
You can do this:
# Loop over possible values of "n_neighbors"
for i in range(1, 41):
# Collect the actual and predicted values for all splits for a single "n_neighbors"
actual = []
predicted = []
for train_index, test_index in loo.split(X):
#print("TRAIN:", train_index, "TEST:", test_index)
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
classifier = KNeighborsClassifier(n_neighbors=i)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# Append the single predictions and actual values here.
actual.append(y_test[0])
predicted.append(y_pred[0])
# Outside the loop, calculate the error.
error.append(np.mean(np.array(predicted) != np.array(actual)))
Rest of your code is okay.
There is a more compact way to do this if you use the cross_val_predict
from sklearn.model_selection import cross_val_predict
for i in range(1, 41):
classifier = KNeighborsClassifier(n_neighbors=i)
y_pred = cross_val_predict(classifier, X, y, cv=loo)
error.append(np.mean(y_pred != y))

Categories