What is the function get_error_rate doing?

The code below is an implementation of the AdaBoost algorithm, which I am using to train on my dataset. I can understand most of the code, but what I am not able to get is: what is the function get_error_rate doing here? Is it calculating the entropy or the Gini impurity?
The GitHub repository I took the code from is https://github.com/jaimeps/adaboost-implementation
""" HELPER FUNCTION: GET ERROR RATE ========================================="""
def get_error_rate(pred, Y):
return sum(pred != Y) / float(len(Y))
""" HELPER FUNCTION: GENERIC CLASSIFIER ====================================="""
def generic_clf(Y_train, X_train, Y_test, X_test, clf):
clf.fit(X_train,Y_train)
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)
return get_error_rate(pred_train, Y_train), \
get_error_rate(pred_test, Y_test)
""" ADABOOST IMPLEMENTATION ================================================="""
def adaboost_clf(Y_train, X_train, Y_test, X_test, M, clf):
n_train, n_test = len(X_train), len(X_test)
# Initialize weights
w = np.ones(n_train) / n_train
pred_train, pred_test = [np.zeros(n_train), np.zeros(n_test)]
for i in range(M):
# Fit a classifier with the specific weights
clf.fit(X_train, Y_train, sample_weight = w)
pred_train_i = clf.predict(X_train)
pred_test_i = clf.predict(X_test)
# Indicator function
miss = [int(x) for x in (pred_train_i != Y_train)]
# Equivalent with 1/-1 to update weights
miss2 = [x if x==1 else -1 for x in miss]
# Error
err_m = np.dot(w,miss) / sum(w)
# Alpha
alpha_m = 0.5 * np.log( (1 - err_m) / float(err_m))
# New weights
w = np.multiply(w, np.exp([float(x) * alpha_m for x in miss2]))
# Add to prediction
pred_train = [sum(x) for x in zip(pred_train,
[x * alpha_m for x in pred_train_i])]
pred_test = [sum(x) for x in zip(pred_test,
[x * alpha_m for x in pred_test_i])]
pred_train, pred_test = np.sign(pred_train), np.sign(pred_test)
# Return error rate in train and test set
return get_error_rate(pred_train, Y_train), \
get_error_rate(pred_test, Y_test)
""" MAIN SCRIPT ============================================================="""
if __name__ == '__main__':
# Fit a simple decision tree first
clf_tree = DecisionTreeClassifier(max_depth = 1, random_state = 1)
er_tree = generic_clf(y_train_1, X_train_1, y_test_1, X_test_1, clf_tree)
# Fit Adaboost classifier using a decision tree as base estimator
# Test with different number of iterations
er_train, er_test = [er_tree[0]], [er_tree[1]]
x_range = range(10, 410, 10)
for i in x_range:
er_i = adaboost_clf(y_train_1, X_train_1, y_test_1, X_test_1, i, clf_tree)
er_train.append(er_i[0])
er_test.append(er_i[1])

Related

ValueError: 'p' must be 1-dimensional

I am trying to do feature selection using Ant Colony Optimization (ACO) for a rainfall dataset. The implementation is below:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

X = x
y = df_cap['PRECTOTCORR_SUM']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define ACO feature selection function
def aco_feature_selection(X_train, X_test, y_train, y_test, num_ants=10, max_iter=50, alpha=1.0, beta=2.0, evaporation=0.5, q0=0.9):
    num_features = X_train.shape[1]
    pheromone = np.ones(num_features)
    best_solution = None
    best_accuracy = 0.0

    # Run ACO algorithm
    for i in range(max_iter):
        ant_solutions = []
        ant_accuracies = []

        # Generate ant solutions
        for ant in range(num_ants):
            features = np.random.choice([0, 1], size=num_features, p=[1-pheromone, pheromone])
            X_train_selected = X_train[:, features == 1]
            X_test_selected = X_test[:, features == 1]
            knn = KNeighborsClassifier()
            knn.fit(X_train_selected, y_train)
            y_pred = knn.predict(X_test_selected)
            accuracy = accuracy_score(y_test, y_pred)
            ant_solutions.append(features)
            ant_accuracies.append(accuracy)

            # Update best solution
            if accuracy > best_accuracy:
                best_solution = features
                best_accuracy = accuracy

        # Update pheromone levels
        pheromone *= evaporation
        for ant in range(num_ants):
            features = ant_solutions[ant]
            accuracy = ant_accuracies[ant]
            if accuracy >= np.mean(ant_accuracies):
                pheromone[features == 1] += alpha
            else:
                pheromone[features == 1] += beta

        # Apply elitism
        if best_solution is not None:
            pheromone[best_solution == 1] += q0

    return best_solution

# Run ACO feature selection
selected_features = aco_feature_selection(X_train, X_test, y_train, y_test)

# Print selected features
print("Selected features:", np.where(selected_features == 1)[0])
but I get this error
ValueError
Input In [175], in aco_feature_selection(X_train, X_test, y_train, y_test, num_ants, max_iter, alpha, beta, evaporation, q0)
     26 # Generate ant solutions
     27 for ant in range(num_ants):
---> 28     features = np.random.choice([0, 1], size=num_features, p=[1-pheromone,pheromone])
     29     X_train_selected = X_train[:, features == 1]
     30     X_test_selected = X_test[:, features == 1]
File mtrand.pyx:930, in numpy.random.mtrand.RandomState.choice()
ValueError: 'p' must be 1-dimensional
I suspect the issue comes from the list inside a list, because it makes p 2-dimensional instead of 1-dimensional. Using something like flatten() throws this error:
ValueError: 'a' and 'p' must have same size
How do I fix this?
The issue is that p expects a 1-D array of probabilities, and you are passing a list containing two whole arrays (1 - pheromone and pheromone) into that argument. Without getting into the details of the algorithm, I can suggest that you need to choose a specific pheromone value for each feature.
And if you want to generate a series of 0s and 1s with given probabilities, you need to iterate over pheromone, as sketched below.
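For example, here is a minimal sketch of that per-feature sampling. It assumes the pheromone values are first scaled into [0, 1] so they can act as probabilities; that scaling step is my assumption, not part of the original code.
import numpy as np

# Illustrative sketch only: scale pheromone so each entry is a per-feature
# probability of selecting that feature (assumption, not in the original code).
prob = pheromone / pheromone.max()

# One 1-D probability pair per feature, instead of passing whole arrays to p=
features = np.array([np.random.choice([0, 1], p=[1 - p_i, p_i]) for p_i in prob])

# Vectorised equivalent: an independent Bernoulli draw per feature
# features = (np.random.rand(len(prob)) < prob).astype(int)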

Sklearn SVM custom rbf kernel function

I was creating a custom rbf function for the SVC class of sklearn as follows:
def rbf_kernel(x, y, gamma):
    dis = np.sqrt(((x.reshape(-1, 1)) - y.reshape(1, -1)) ** 2)
    return np.exp(-(gamma*dis)**2)

def eval_kernel(kernel):
    model = SVC(kernel=kernel, C=C, gamma=gamma, degree=degree, coef0=coef0)
    model.fit(X_train, y_train)
    X_test_predict = model.predict(X_test)
    acc = (X_test_predict == y_test).sum() / y_test.shape[0]
    return acc

for k1, k2 in [('rbf', lambda x, y: rbf_kernel(x, y, gamma))]:
    acc1 = eval_kernel(k1)
    acc2 = eval_kernel(k2)
    assert(abs(acc1 - acc2) < eps)
The shape of X_train is (396, 10), y_train is (396, 10) and X_test is (132, 10).
However, when I try to run it, I get an error saying:
ValueError: X.shape[1] = 3960 should be equal to 396, the number of samples at training time
It seems the errors are due to the difference in the dimension of X_test and X_train, but is there any way to fix this error?
Thank you in advance!
Your rbf kernel is written incorrectly. A custom kernel needs to return a Gram matrix of shape (n_samples_X, n_samples_Y). In your code you basically unravelled everything, hence the error. You can refer to the actual code for rbf_kernel used by sklearn, and if we insert that it will work:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

X, y = make_classification(528)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

def my_kernel(X, Y, gamma=0.1):
    K = euclidean_distances(X, Y, squared=True)
    K *= -gamma
    np.exp(K, K)  # exponentiate K in-place
    return K

def eval_kernel(kernel):
    model = SVC(kernel=kernel, gamma=0.1)
    model.fit(X_train, y_train)
    X_test_predict = model.predict(X_test)
    acc = (X_test_predict == y_test).sum() / y_test.shape[0]
    return acc

eval_kernel('rbf')
# 0.8409090909090909
eval_kernel(my_kernel)
# 0.8409090909090909

Retrieving ValueError DataFrame.dtypes for data must be int, float or bool

I am trying to run Python code that trains XGBoost for multi-output regression, and I am getting a ValueError. Thanks for helping.
Please find my data sample below:
Layers   Model   Technique   Accuracy-1   Accuracy-2   Latency     time
18-27    Net     1           0.96         0.99         334368.0    0.99
38-37    MNet    1           0.76         0.99         313348.0    0.99
Below is my code using XGBoost
def optimize(trial, x, y, regressor):
    max_depth = trial.suggest_int("max_depth", 3, 30)
    n_estimators = trial.suggest_int("n_estimators", 100, 3000)
    max_leaves = trial.suggest_int("max_leaves", 1, 10)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.0, 1.0)
    gamma = trial.suggest_uniform('gamma', 0.0, 0.05)
    min_child_weight = trial.suggest_uniform('min_child_weight', 1, 3)
    reg_lambda = trial.suggest_uniform('reg_lambda', 0.5, 1)

    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        min_child_weight=min_child_weight,
        reg_lambda=reg_lambda,
        max_leaves=max_leaves)

    kf = model_selection.KFold(n_splits=5)
    error = []
    for idx in kf.split(X=x, y=y):
        train_idx, test_idx = idx[0], idx[1]
        xtrain = x[train_idx]
        ytrain = y[train_idx]
        xtest = x[test_idx]
        ytest = y[test_idx]
        model.fit(x, y)
        y_pred = model.predict(xtest)
        fold_err = metrics.mean_squared_error(ytest, y_pred)
        error.append(fold_err)
    return np.mean(error)

def optimize_xgb(X, y):
    list_of_y = ["Target 1", "Target 2", "Target 3", "Target 4"]
    for i, m in zip(range(y.shape[1]), list_of_y):
        print("{} optimized Parameters on MSE Error".format(m))
        optimization_function = partial(optimize, x=X, y=y[:, i], regressor="random_forest")
        study = optuna.create_study(direction="minimize")
        study.optimize(optimization_function, n_trials=1)

data["Latency"] = minmax_scale(data["Latency"])
X = data[["Layers ", "Model"]]
Y = data[['Accuracy-1', 'Accuracy-2', 'Latency', 'time ']]

encoder = OneHotEncoder(sparse=False)
onehot = encoder.fit_transform(X)
X_encoded = encoder.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    np.array(X_encoded), np.array(Y), test_size=0.3, random_state=42)

def modeling(X, y, max_depth=10, n_estimators=300, max_leaves=10,
             learning_rate=0.01, colsample_bytree=0.001, gamma=0.0001, min_child_weight=2,
             reg_lambda=0.3):
    model = xgb.XGBRegressor(objective='reg:squarederror',
                             n_estimators=n_estimators,
                             max_depth=max_depth,
                             max_leaves=max_leaves,
                             learning_rate=learning_rate,
                             gamma=gamma,
                             min_child_weight=min_child_weight,
                             colsample_bytree=colsample_bytree)
    if y.shape[1] == 1:
        print(" Apply Xgboost for one single Target....\n")
        model_xgb = model.fit(X, y)
    else:
        print(" Apply Xgboost for {} Targets....".format(y.shape[1]))
        model_xgb = MOR(model).fit(X, y)

    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
    scores = []
    for i in range(y.shape[1]):
        scores.append(np.abs(cross_val_score(model, X, y[:, i], scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)))
        print('Mean MSE of the {} target : {} ({})'.format(i, scores[i].mean(), scores[i].std()))
    return model_xgb

model_xgb = modeling(X_train, y_train, optimize="no")
y_estimated = model_xgb.predict(X_test)
mse(y_estimated, y_test)

################
y = np.random.random((1000, 1))
model_xgb = modeling(X, y, optimize="no")
The error I am retrieving is:
ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields Layers, Model

KNN without using Sklearn

I am working on KNN without using any library. The problem is that the labels are numeric:
label = [1.5171, 1.7999, 2.4493, 2.8622, 2.9961, 3.6356, 3.7742, 5.8069, 7.1357, etc.]
For each label there is only one value. I want to predict the label for new data, but how should I choose the winning label if there is only one value for each?
prediction = max(set(label_neighbors), key=label_neighbors.count)
I'm guessing that you want to learn the mechanics of KNN, right? See the sample code below. This should do what you want.
import numpy as np
import scipy.spatial
from collections import Counter

# loading the Iris-Flower dataset from Sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=42, test_size=0.2)

class KNN:
    def __init__(self, k):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def distance(self, X1, X2):
        return scipy.spatial.distance.euclidean(X1, X2)

    def predict(self, X_test):
        final_output = []
        for i in range(len(X_test)):
            d = []
            votes = []
            for j in range(len(self.X_train)):
                dist = self.distance(self.X_train[j], X_test[i])
                d.append([dist, j])
            d.sort()
            d = d[0:self.k]
            for dist, j in d:
                votes.append(self.y_train[j])
            ans = Counter(votes).most_common(1)[0][0]
            final_output.append(ans)
        return final_output

    def score(self, X_test, y_test):
        predictions = self.predict(X_test)
        return (predictions == y_test).sum() / len(y_test)

clf = KNN(3)
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
for i in prediction:
    print(i, end=' ')
prediction == y_test
clf.score(X_test, y_test)
# Result:
# 1.0
Well, look at that! We got 100%! Not bad, not bad at all!!
Reference:
https://medium.com/analytics-vidhya/implementing-k-nearest-neighbours-knn-without-using-scikit-learn-3905b4decc3c
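Since the labels in the question are continuous numbers rather than classes, a regression-style prediction (averaging the labels of the k nearest neighbours instead of majority voting) may be closer to what is needed. Here is a minimal sketch under that assumption, reusing the KNN class above; the subclass name KNNRegressor is purely illustrative.
class KNNRegressor(KNN):
    def predict(self, X_test):
        final_output = []
        for i in range(len(X_test)):
            # distance of every training point to this test point, keep the k closest
            d = sorted(
                (scipy.spatial.distance.euclidean(self.X_train[j], X_test[i]), j)
                for j in range(len(self.X_train))
            )[:self.k]
            # average the numeric labels of the k nearest neighbours
            final_output.append(np.mean([self.y_train[j] for _, j in d]))
        return final_output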

KNN Regression results in zero MSE on training set (sklearn)

Using sklearn and trying to evaluate a KNN regression function with the code below:
def cross_validate(X, y, n_neighbors, test_size=0.20):
    training_mses = []
    test_mses = []

    n = X.shape[0]
    test_n = int(np.round(test_size * n, 0))
    indices = np.arange(n)
    random.shuffle(indices)
    test_indices = indices[0:test_n]
    training_indices = indices[test_n:]

    X_test, y_test = X[test_indices], y[test_indices]
    X_train, y_train = X[training_indices], y[training_indices]

    knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights="distance",
                                        algorithm='brute')
    model = knn.fit(X_train, y_train)
    y_hat = model.predict(X_train)
    training_mse = mse(y_train - y_hat)

    model2 = knn.fit(X_test, y_test)
    y_hat = model2.predict(X_test)
    test_mse = mse(y_test - y_hat)

    return training_mse, test_mse
I did something similar with linear regression. The difference I have found is that when I run it on KNN regression, the training_mse and test_mse are both 0. If I use the test data on the model fitted with the training set, it gives me an mse value that is non-zero. But I just don't believe that the fitted values for the training and test set are the same as the observed values. What am I doing wrong? The function I was trying to emulate is below and gives non-zero values for mse:
def cross_validate(formula, data, test_size=0.20):
    training_mses = []
    test_mses = []

    n = data.shape[0]
    test_n = int(np.round(test_size * n, 0))
    indices = deepcopy(data.index).values
    random.shuffle(indices)
    test_indices = indices[0:test_n]
    training_indices = indices[test_n:]

    test_set = data.ix[test_indices]
    training_set = data.ix[training_indices]

    y, X = patsy.dmatrices(formula, training_set, return_type="matrix")
    model = linear.LinearRegression(fit_intercept=False).fit(X, y)
    y_hat = model.predict(X)
    training_mse = mse(y - y_hat)

    y, X = patsy.dmatrices(formula, test_set, return_type="matrix")
    y_hat = model.predict(X)
    test_mse = mse(y - y_hat)

    return training_mse, test_mse
