KNN Regression results in zero MSE on training set (sklearn) - python

I am using sklearn and trying to evaluate a KNN regression function with the code below:
def cross_validate(X, y, n_neighbors, test_size=0.20):
    training_mses = []
    test_mses = []
    n = X.shape[0]
    test_n = int(np.round(test_size * n, 0))
    indices = np.arange(n)
    random.shuffle(indices)
    test_indices = indices[0:test_n]
    training_indices = indices[test_n:]
    X_test, y_test = X[test_indices], y[test_indices]
    X_train, y_train = X[training_indices], y[training_indices]
    knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights="distance",
                                        algorithm="brute")
    model = knn.fit(X_train, y_train)
    y_hat = model.predict(X_train)
    training_mse = mse(y_train - y_hat)
    model2 = knn.fit(X_test, y_test)
    y_hat = model2.predict(X_test)
    test_mse = mse(y_test - y_hat)
    return training_mse, test_mse
I did something similar with linear regression. The difference I have found is that when I run it on KNN regression, training_mse and test_mse are both 0. If I use the test data on the model fitted with the training set, it gives me a non-zero MSE value. But I just don't believe that the fitted values for the training and test sets are identical to the observed values. What am I doing wrong? The function I was trying to emulate is below, and it gives non-zero MSE values:
def cross_validate(formula, data, test_size=0.20):
    training_mses = []
    test_mses = []
    n = data.shape[0]
    test_n = int(np.round(test_size * n, 0))
    indices = deepcopy(data.index).values
    random.shuffle(indices)
    test_indices = indices[0:test_n]
    training_indices = indices[test_n:]
    test_set = data.ix[test_indices]
    training_set = data.ix[training_indices]
    y, X = patsy.dmatrices(formula, training_set, return_type="matrix")
    model = linear.LinearRegression(fit_intercept=False).fit(X, y)
    y_hat = model.predict(X)
    training_mse = mse(y - y_hat)
    y, X = patsy.dmatrices(formula, test_set, return_type="matrix")
    y_hat = model.predict(X)
    test_mse = mse(y - y_hat)
    return training_mse, test_mse
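
For comparison, here is a minimal evaluation sketch that fits the KNN model once on the training split and scores both splits with that same fitted model, using sklearn's mean_squared_error instead of the custom mse helper. Two things to keep in mind: with weights="distance", each training point is its own zero-distance nearest neighbour, so a training MSE of essentially zero is expected; and the second fit in the question's code (knn.fit(X_test, y_test)) makes the reported "test" score an in-sample score as well.

import numpy as np
from sklearn import neighbors
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def knn_train_test_mse(X, y, n_neighbors, test_size=0.20, seed=0):
    # Single random split; the model is fit on the training portion only.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed)
    knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                        weights="distance", algorithm="brute")
    model = knn.fit(X_train, y_train)
    # In-sample score: expected to be ~0 with distance weighting, because
    # each training point is its own zero-distance neighbour.
    training_mse = mean_squared_error(y_train, model.predict(X_train))
    # Out-of-sample score: predict the held-out split with the model
    # fitted on the training split.
    test_mse = mean_squared_error(y_test, model.predict(X_test))
    return training_mse, test_mse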

Related

What is the function get_error_rate doing?

The code below is an implementation of the AdaBoost algorithm, which I am using to train on my dataset. I can understand most of the code, but what I am not able to get is: what is the function get_error_rate doing here? Is it calculating the entropy or the Gini impurity?
The GitHub repository I took the code from is https://github.com/jaimeps/adaboost-implementation
""" HELPER FUNCTION: GET ERROR RATE ========================================="""
def get_error_rate(pred, Y):
return sum(pred != Y) / float(len(Y))
""" HELPER FUNCTION: GENERIC CLASSIFIER ====================================="""
def generic_clf(Y_train, X_train, Y_test, X_test, clf):
clf.fit(X_train,Y_train)
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)
return get_error_rate(pred_train, Y_train), \
get_error_rate(pred_test, Y_test)
""" ADABOOST IMPLEMENTATION ================================================="""
def adaboost_clf(Y_train, X_train, Y_test, X_test, M, clf):
n_train, n_test = len(X_train), len(X_test)
# Initialize weights
w = np.ones(n_train) / n_train
pred_train, pred_test = [np.zeros(n_train), np.zeros(n_test)]
for i in range(M):
# Fit a classifier with the specific weights
clf.fit(X_train, Y_train, sample_weight = w)
pred_train_i = clf.predict(X_train)
pred_test_i = clf.predict(X_test)
# Indicator function
miss = [int(x) for x in (pred_train_i != Y_train)]
# Equivalent with 1/-1 to update weights
miss2 = [x if x==1 else -1 for x in miss]
# Error
err_m = np.dot(w,miss) / sum(w)
# Alpha
alpha_m = 0.5 * np.log( (1 - err_m) / float(err_m))
# New weights
w = np.multiply(w, np.exp([float(x) * alpha_m for x in miss2]))
# Add to prediction
pred_train = [sum(x) for x in zip(pred_train,
[x * alpha_m for x in pred_train_i])]
pred_test = [sum(x) for x in zip(pred_test,
[x * alpha_m for x in pred_test_i])]
pred_train, pred_test = np.sign(pred_train), np.sign(pred_test)
# Return error rate in train and test set
return get_error_rate(pred_train, Y_train), \
get_error_rate(pred_test, Y_test)
""" MAIN SCRIPT ============================================================="""
if __name__ == '__main__':
# Fit a simple decision tree first
clf_tree = DecisionTreeClassifier(max_depth = 1, random_state = 1)
er_tree = generic_clf(y_train_1, X_train_1, y_test_1, X_test_1, clf_tree)
# Fit Adaboost classifier using a decision tree as base estimator
# Test with different number of iterations
er_train, er_test = [er_tree[0]], [er_tree[1]]
x_range = range(10, 410, 10)
for i in x_range:
er_i = adaboost_clf(y_train_1, X_train_1, y_test_1, X_test_1, i, clf_tree)
er_train.append(er_i[0])
er_test.append(er_i[1])
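
For reference, get_error_rate simply returns the fraction of predictions that disagree with the true labels, i.e. the misclassification rate of the classifier, not entropy or Gini impurity. A minimal standalone check with made-up ±1 labels:

import numpy as np

pred = np.array([1, -1, 1, 1])
Y = np.array([1, 1, 1, -1])
# Same computation as get_error_rate: count mismatches, divide by n.
error_rate = np.sum(pred != Y) / float(len(Y))
print(error_rate)  # 0.5 -> two of the four predictions are wrong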

Getting ValueError: DataFrame.dtypes for data must be int, float or bool

I am trying to train an XGBoost model for multi-output regression in Python, and I am getting a ValueError. Thanks for helping.
Here is a sample of my data:
Layers  Model  Technique  Accuracy-1  Accuracy-2  Latency    time
18-27   Net    1          0.96        0.99        334368.0   0.99
38-37   MNet   1          0.76        0.99        313348.0   0.99
Below is my code using XGBoost
def optimize(trial, x, y, regressor):
    max_depth = trial.suggest_int("max_depth", 3, 30)
    n_estimators = trial.suggest_int("n_estimators", 100, 3000)
    max_leaves = trial.suggest_int("max_leaves", 1, 10)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.0, 1.0)
    gamma = trial.suggest_uniform('gamma', 0.0, 0.05)
    min_child_weight = trial.suggest_uniform('min_child_weight', 1, 3)
    reg_lambda = trial.suggest_uniform('reg_lambda', 0.5, 1)
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        min_child_weight=min_child_weight,
        reg_lambda=reg_lambda,
        max_leaves=max_leaves)
    kf = model_selection.KFold(n_splits=5)
    error = []
    for idx in kf.split(X=x, y=y):
        train_idx, test_idx = idx[0], idx[1]
        xtrain = x[train_idx]
        ytrain = y[train_idx]
        xtest = x[test_idx]
        ytest = y[test_idx]
        model.fit(x, y)
        y_pred = model.predict(xtest)
        fold_err = metrics.mean_squared_error(ytest, y_pred)
        error.append(fold_err)
    return np.mean(error)

def optimize_xgb(X, y):
    list_of_y = ["Target 1", "Target 2", "Target 3", "Target 4"]
    for i, m in zip(range(y.shape[1]), list_of_y):
        print("{} optimized Parameters on MSE Error".format(m))
        optimization_function = partial(optimize, x=X, y=y[:, i], regressor="random_forest")
        study = optuna.create_study(direction="minimize")
        study.optimize(optimization_function, n_trials=1)

data["Latency"] = minmax_scale(data["Latency"])
X = data[["Layers ", "Model"]]
Y = data[['Accuracy-1', 'Accuracy-2', 'Latency', 'time ']]
encoder = OneHotEncoder(sparse=False)
onehot = encoder.fit_transform(X)
X_encoded = encoder.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    np.array(X_encoded), np.array(Y), test_size=0.3, random_state=42)

def modeling(X, y, max_depth=10, n_estimators=300, max_leaves=10,
             learning_rate=0.01, colsample_bytree=0.001, gamma=0.0001, min_child_weight=2,
             reg_lambda=0.3):
    model = xgb.XGBRegressor(objective='reg:squarederror',
                             n_estimators=n_estimators,
                             max_depth=max_depth,
                             max_leaves=max_leaves,
                             learning_rate=learning_rate,
                             gamma=gamma,
                             min_child_weight=min_child_weight,
                             colsample_bytree=colsample_bytree)
    if y.shape[1] == 1:
        print(" Apply Xgboost for one single Target....\n")
        model_xgb = model.fit(X, y)
    else:
        print(" Apply Xgboost for {} Targets....".format(y.shape[1]))
        model_xgb = MOR(model).fit(X, y)
    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
    scores = []
    for i in range(y.shape[1]):
        scores.append(np.abs(cross_val_score(model, X, y[:, i],
                                             scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)))
        print('Mean MSE of the {} target : {} ({})'.format(i, scores[i].mean(), scores[i].std()))
    return model_xgb

model_xgb = modeling(X_train, y_train, optimize="no")
y_estimated = model_xgb.predict(X_test)
mse(y_estimated, y_test)
################
y = np.random.random((1000, 1))
model_xgb = modeling(X, y, optimize="no")
The error I get is:
ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields Layers, Model
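
As context for this error, XGBoost only accepts numeric (int, float, bool) input columns, and the final call above passes the raw DataFrame X (which still contains the string-typed Layers and Model columns) rather than the one-hot encoded X_encoded. A minimal standalone sketch of the encoding step, using hypothetical values mirroring the data sample above and the same OneHotEncoder(sparse=False) call as the question's code:

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder

# Hypothetical frame mirroring the sample rows shown above.
data = pd.DataFrame({"Layers": ["18-27", "38-37"],
                     "Model": ["Net", "MNet"]})
y = np.array([0.96, 0.76])  # one target column, for brevity

# Encode the string columns to a numeric matrix before fitting.
encoder = OneHotEncoder(sparse=False)
X_encoded = encoder.fit_transform(data[["Layers", "Model"]])

model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=10)
model.fit(X_encoded, y)                     # works: numeric input
# model.fit(data[["Layers", "Model"]], y)   # raises a dtype ValueError like the one above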

On calculating the mean R-squared score of K-Fold CV in scikit-learn

I've run DecisionTreeRegression as well as RandomForestRegression on the same dataset.
For the random forest I used the 5 best random combinations, and the results were all similar, as you'd expect. I calculated the average of R^2, RMSE and MAE and got
R^2: 0.7, MAE: 145716, RMSE: 251828.
For the decision tree I used repeated K-Fold, calculated the average and got:
R^2: 0.29, MAE: 121791, RMSE: 198280.
No transformations or scaling have been done on the response variable, which is home prices.
I'm new to statistics, but I'm pretty sure R^2 should be higher when MAE and RMSE are lower on the same dataset with no scaling. That said, the dataset in question is pretty low in quality compared to the other datasets I'm using, which yield the expected proportions in error scores.
My question is: since this dataset is poor in quality, and I'm sure the decision tree model will produce R^2 values that are negative as well as above one on it, is it possible that averaging the scores after cross-validation gives arbitrary results for R^2 when some of the R^2 values are not in the 0-1 interval, or is it more likely that there's an issue with the logic of my code (or something else)?
def decisionTreeRegression(df, features):
    df = df.sample(frac=1, random_state=0)
    scaler = StandardScaler()
    X = df[features]
    y = df[['Price']]
    param_grid = {'max_depth': np.arange(1, 40, 3)}
    tree = GridSearchCV(DecisionTreeRegressor(), param_grid, return_train_score=False)
    tree.fit(X, y)
    tree_final = DecisionTreeRegressor(max_depth=tree.best_params_['max_depth'])
    cv = RepeatedKFold(n_splits=5, n_repeats=100)
    mae_scores = cross_val_score(tree_final, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    mse_scores = cross_val_score(tree_final, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
    r2_scores = cross_val_score(tree_final, X, y, scoring='r2', cv=cv, n_jobs=-1)
    return makeScoresCV(mae_scores, mse_scores, r2_scores)

def makeScoresCV(mae_scores, mse_scores, r2_scores):
    # convert scores to positive
    mae_scores = absolute(mae_scores)
    mse_scores = absolute(mse_scores)
    # summarize the result
    s_mean = mean(mae_scores)
    s_mean2 = mean(mse_scores)
    s_mean3 = mean(r2_scores)
    return s_mean, np.sqrt(s_mean2), s_mean3

mae, rmse, r2 = decisionTreeRegression(df_de, fe_de)
print("mae : " + str(mae))
print("rmse : " + str(rmse))
print("r2 : " + str(r2))
Console:
mae : 153189.34673362423
rmse : 253284.5137707182
r2 : 0.30183525616923246
Random Forest (separate notebook):
scaler = StandardScaler()
X = df.drop('Price', axis=1)
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, shuffle=True)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    rmse = np.sqrt(mean_squared_error(test_labels, predictions))
    r2 = r2_score(test_labels, predictions)  # from sklearn.metrics
    mae = np.sum(np.absolute(test_labels - predictions)) / len(predictions)
    return mae, r2, rmse

maes = []
rmses = []
r2s = []
for i in range(10):
    rf_random.fit(X_train, y_train)
    best_random = rf_random.best_estimator_
    mae, r2, rmse = evaluate(best_random, X_test, y_test)
    maes.append(mae)
    rmses.append(rmse)
    r2s.append(r2)
print("MAE")
print(math.fsum(maes) / len(maes))
print("RMSE")
print(math.fsum(rmses) / len(rmses))
print("R2")
print(math.fsum(r2s) / len(r2s))
Console:
MAE
145716.7264983288
RMSE
251828.40328030512
R2
0.7082730127977784
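
As a side note on the averaging question: R^2 has no lower bound, so a few folds where the model predicts worse than the fold mean can drag the average far down, while averaged MAE and RMSE can never go below zero. A minimal sketch with made-up fold values:

import numpy as np
from sklearn.metrics import r2_score

# Fold A: reasonable predictions, R^2 close to 1.
r2_a = r2_score([1.0, 2.0, 3.0], [1.1, 2.1, 2.8])
# Fold B: targets with tiny variance and poor predictions, R^2 is hugely negative.
r2_b = r2_score([5.0, 5.1, 4.9], [1.0, 9.0, 2.0])

print(r2_a, r2_b)             # ~0.97 and a large negative number
print(np.mean([r2_a, r2_b]))  # the mean is dominated by the bad fold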

Denormalization of output from neural network

I have used min-max normalization to normalize my dataset, both the features and the label. My question is: is it correct to also normalize the label? If so, how can I denormalize the output of the neural network (the predictions made on the normalized test set)?
I can't upload the dataset, but it is composed of 18 features and 1 label. It is a regression task, and the features and the label are physical quantities.
So the problem is that y_train_pred and y_test_pred are between 0 and 1. How can I predict the "real value"?
The code:
dataset = pd.read_csv('DataSet.csv', decimal=',', delimiter = ";")
label = dataset.iloc[:,-1]
features = dataset.drop(columns = ['Label'])
features = features[best_features]
X_train1, X_test1, y_train1, y_test1 = train_test_split(features, label, test_size = 0.25, random_state = 1, shuffle = True)
y_test2 = y_test1.to_frame()
y_train2 = y_train1.to_frame()
scaler1 = preprocessing.MinMaxScaler()
scaler2 = preprocessing.MinMaxScaler()
X_train = scaler1.fit_transform(X_train1)
X_test = scaler2.fit_transform(X_test1)
scaler3 = preprocessing.MinMaxScaler()
scaler4 = preprocessing.MinMaxScaler()
y_train = scaler3.fit_transform(y_train2)
y_test = scaler4.fit_transform(y_test2)
optimizer = tf.keras.optimizers.Adamax(lr=0.001)
model = Sequential()
model.add(Dense(80, input_shape = (X_train.shape[1],), activation = 'relu',kernel_initializer='random_normal'))
model.add(Dropout(0.15))
model.add(Dense(120, activation = 'relu',kernel_initializer='random_normal'))
model.add(Dropout(0.15))
model.add(Dense(80, activation = 'relu',kernel_initializer='random_normal'))
model.add(Dense(1,activation = 'linear'))
model.compile(loss = 'mse', optimizer = optimizer, metrics = ['mse'])
history = model.fit(X_train, y_train, epochs=300,
                    validation_split=0.1, shuffle=False, batch_size=120)
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
You should denormalize so that you get real-world predictions from your neural network, rather than a number between 0 and 1.
Min-max normalization is defined by:
z = (x - min) / (max - min)
with z being the normalized value, x being the label value, max being the maximum x value, and min being the minimum x value. So if we have z, min, and max, we can solve for x as follows:
x = z * (max - min) + min
Thus, before you normalize your data, define variables for the max and min values of the label if it is continuous. Then, after you get your predicted values, you can use the following function:
y_max_pre_normalize = max(label)
y_min_pre_normalize = min(label)

def denormalize(y):
    final_value = y * (y_max_pre_normalize - y_min_pre_normalize) + y_min_pre_normalize
    return final_value
And apply this function to your y_test/y_pred to get the corresponding value.
You can use this link here to better visualize this.
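
Since the labels in the question were scaled with sklearn's MinMaxScaler, the same result can also be obtained with the scaler's own inverse_transform. A minimal standalone sketch with made-up label values:

import numpy as np
from sklearn import preprocessing

# MinMaxScaler can undo its own scaling: inverse_transform applies
# x = z * (max - min) + min, matching the formula above.
y = np.array([[10.0], [20.0], [35.0]])
scaler = preprocessing.MinMaxScaler()
y_scaled = scaler.fit_transform(y)               # values in [0, 1]
y_restored = scaler.inverse_transform(y_scaled)  # back to 10, 20, 35
print(np.allclose(y, y_restored))                # True

In the question's code, that would correspond to something like scaler3.inverse_transform(y_train_pred) and scaler4.inverse_transform(y_test_pred).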

Doing cross validation from scratch

I found this function definition on Stack Overflow:
def fold_i_of_k(dataset, i, k):
    n = len(dataset)
    return len(dataset[n*(i-1)//k : n*i//k])

# this is my code below
# Constants
FOLD_I = 1
FOLD_K = 10

# Creating 10 folds
counter = 1
s = 0
total_ac = 0
while counter != FOLD_K + 1:
    print("Fold ", counter)
    fold = fold_i_of_k(dataset, counter, 10)
    d_fold = dataset[s:s + fold]
    #print(d_fold.index.values)
    #print(d_fold.iloc[1:3,0:2])
    d_test = d_fold
    X_test = d_test.iloc[:, 0:11]
    y_test = d_test.iloc[:, 11:12]
    d_train = dataset.drop(dataset.index[s:s + fold])
    X_train = d_train.iloc[:, 0:11]
    y_train = d_train.iloc[:, 11:12]
    ##print(dataset)
    ##print(d_fold)
    ##print(d_train)
    ##print(d_test)
    ##print(len(X_train))
    ##print(len(y_train))
    ##print(X_test)
    ##print(y_test)
    #print(fold)
    X_train = X_train.as_matrix()
    X_train = preprocessing.scale(X_train)
    y_train = y_train.as_matrix()
    X_test = X_test.as_matrix()
    X_test = preprocessing.scale(X_test)
    y_test = y_test.as_matrix()
    #l1 = len(y_train)
    #np.reshape(y_train, l1)
    #print(y_train)
    from numpy import array
    #l = len(y_test)
    #np.reshape(y_test, l)
    #print(y_test)
    data.reshape((data.shape[0], 1))
    y_train = array(y_train)
    print(y_train.shape)
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    #lr_pred = lr.predict(X_test)
    #ac = accuracy_score(y_test,lr_pred)
    #print(ac)
    ##print(classification_report(y_test,lr_pred))
    total_ac = total_ac + ac
    s = s + fold
    counter = counter + 1
total_ac = total_ac / FOLD_K
print("Cross validation accuracy is: ", total_ac)
I am getting the following error:
/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:578:
DataConversionWarning: A column-vector y was passed when a 1d array
was expected. Please change the shape of y to (n_samples, ), for
example using ravel().
y = column_or_1d(y, warn=True)
How can I fix it?
y_train.ravel() solved the problem.
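
For reference, a minimal standalone sketch of the warning and the fix: scikit-learn expects y with shape (n_samples,), so the (n_samples, 1) column vector is flattened with ravel() before fitting.

import numpy as np
from sklearn.linear_model import LogisticRegression

X_train = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
y_train = np.array([[1], [0], [1], [0]])  # shape (4, 1): triggers DataConversionWarning

lr = LogisticRegression()
lr.fit(X_train, y_train.ravel())          # shape (4,): no warning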
