roc_curve - IndexError: too many indices for array - python

In a classification problem, when I input NumPy arrays holding the test labels and the test predictions, roc_curve throws the following error. My code:
dataset = read_csv('C:/.../dataset/KDDREAL.csv')
dataset = dataset.values
X = dataset[:, :-1]
Y = dataset[:, -1]

encoder = LabelEncoder().fit(Y)
encoded_Y = encoder.transform(Y)
X_train, X_test, Y_train, Y_test = train_test_split(X, encoded_Y, test_size=0.3, random_state=0)

model_svm = svm.SVC()
model_svm.fit(X_train, Y_train)
results_svm = model_svm.predict(X_test)

fpr2 = dict()
tpr2 = dict()
roc_auc2 = dict()
for i in range(num_classes):
    fpr2[i], tpr2[i], _ = roc_curve(Y_test, results_svm[:, i])
    roc_auc2[i] = auc(fpr2[i], tpr2[i])

# Compute micro-average ROC curve and ROC area
fpr2, tpr2, _ = roc_curve(y_test.ravel(), PGD20_X_test_trm.ravel())
roc_auc2 = auc(fpr2, tpr2)
The traceback points to this line:
fpr2[i], tpr2[i], _ = roc_curve(Y_test, results_svm[:, i])
IndexError: too many indices for array

For the line of code given below:
for i in range(num_classes):
    fpr2[i], tpr2[i], _ = roc_curve(Y_test, results_svm[:, i])
try replacing it with:
for i in range(num_classes):
    fpr2[i], tpr2[i], _ = roc_curve(Y_test[:, i], results_svm[:, i])
Note that this only works if both arrays are 2-D with one column per class. predict returns a 1-D array of class labels, which is why indexing results_svm[:, i] raises the IndexError: you need binarized labels and per-class scores (e.g. from decision_function or predict_proba) instead. In case you are copying this line, please check the indentation.
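A fuller sketch of the per-class approach (my own illustration, assuming a multiclass problem; label_binarize and decision_function are one way to get 2-D labels and scores):

import numpy as np
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

# Binarize the labels so Y_test_bin has one column per class
classes = np.unique(encoded_Y)
Y_test_bin = label_binarize(Y_test, classes=classes)

# Use continuous scores rather than the hard labels from predict()
scores = model_svm.decision_function(X_test)  # shape (n_samples, n_classes)

fpr2, tpr2, roc_auc2 = dict(), dict(), dict()
for i in range(len(classes)):
    fpr2[i], tpr2[i], _ = roc_curve(Y_test_bin[:, i], scores[:, i])
    roc_auc2[i] = auc(fpr2[i], tpr2[i])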

Related

Why are my selected features wrong even though I specify the threshold?

I plotted a bar graph to see the feature importances using SelectFromModel from sklearn.feature_selection. When I specify threshold=0.1, it is supposed to select mould temp and dosage time, but it selected cylinder zone 4 and mould temp instead. I have no idea what I did wrong.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold as sk  # assuming sk is StratifiedKFold

model3 = RandomForestClassifier(n_estimators=10, criterion='gini', random_state=0, n_jobs=-1)
cv = sk(n_splits=4, shuffle=True, random_state=0)
accuracy_l = []     # accuracy list
result_matrix = []  # result list
for tr_i, te_i in cv.split(df_X, df_y):
    X_tr = df_X.iloc[tr_i]
    X_v = df_X.iloc[te_i]
    y_tr = df_y.iloc[tr_i]
    y_v = df_y.iloc[te_i]
    model3.fit(X_tr, y_tr.values.ravel())
    y_p = model3.predict(X_v)

importances = model3.feature_importances_
indices = np.argsort(importances)[::-1]
names = [df_X.columns[i] for i in indices]
print("Feature names:", names)
print("Feature importance:", importances[indices])

plt.figure()
plt.title("Feature Importance")
plt.bar(range(df_X.shape[1]), importances[indices])
plt.xticks(range(df_X.shape[1]), names, rotation=90)
plt.show()

selector = SelectFromModel(model3, threshold=0.1)
features_important = selector.fit_transform(df_X, df_y.values.ravel())
print(features_important)
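One detail worth checking (my observation, not from the post): selector.fit_transform clones model3 and refits it on the full dataset, so the importances it thresholds are not the ones plotted from the last CV fold. A minimal sketch that reuses the already-fitted model via prefit=True instead:

selector = SelectFromModel(model3, threshold=0.1, prefit=True)
features_important = selector.transform(df_X)  # uses model3's existing importances
print(features_important.shape)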

What is the function get_error_rate doing?

The code below is an implementation of the AdaBoost algorithm. I am using this code to train on my dataset. I can understand most of the code, but what I am not able to get is: what is the function get_error_rate doing here? Is it calculating the entropy or the Gini impurity?
Also, the GitHub link from where I have taken the code is https://github.com/jaimeps/adaboost-implementation
""" HELPER FUNCTION: GET ERROR RATE ========================================="""
def get_error_rate(pred, Y):
return sum(pred != Y) / float(len(Y))
""" HELPER FUNCTION: GENERIC CLASSIFIER ====================================="""
def generic_clf(Y_train, X_train, Y_test, X_test, clf):
clf.fit(X_train,Y_train)
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)
return get_error_rate(pred_train, Y_train), \
get_error_rate(pred_test, Y_test)
""" ADABOOST IMPLEMENTATION ================================================="""
def adaboost_clf(Y_train, X_train, Y_test, X_test, M, clf):
n_train, n_test = len(X_train), len(X_test)
# Initialize weights
w = np.ones(n_train) / n_train
pred_train, pred_test = [np.zeros(n_train), np.zeros(n_test)]
for i in range(M):
# Fit a classifier with the specific weights
clf.fit(X_train, Y_train, sample_weight = w)
pred_train_i = clf.predict(X_train)
pred_test_i = clf.predict(X_test)
# Indicator function
miss = [int(x) for x in (pred_train_i != Y_train)]
# Equivalent with 1/-1 to update weights
miss2 = [x if x==1 else -1 for x in miss]
# Error
err_m = np.dot(w,miss) / sum(w)
# Alpha
alpha_m = 0.5 * np.log( (1 - err_m) / float(err_m))
# New weights
w = np.multiply(w, np.exp([float(x) * alpha_m for x in miss2]))
# Add to prediction
pred_train = [sum(x) for x in zip(pred_train,
[x * alpha_m for x in pred_train_i])]
pred_test = [sum(x) for x in zip(pred_test,
[x * alpha_m for x in pred_test_i])]
pred_train, pred_test = np.sign(pred_train), np.sign(pred_test)
# Return error rate in train and test set
return get_error_rate(pred_train, Y_train), \
get_error_rate(pred_test, Y_test)
""" MAIN SCRIPT ============================================================="""
if __name__ == '__main__':
# Fit a simple decision tree first
clf_tree = DecisionTreeClassifier(max_depth = 1, random_state = 1)
er_tree = generic_clf(y_train_1, X_train_1, y_test_1, X_test_1, clf_tree)
# Fit Adaboost classifier using a decision tree as base estimator
# Test with different number of iterations
er_train, er_test = [er_tree[0]], [er_tree[1]]
x_range = range(10, 410, 10)
for i in x_range:
er_i = adaboost_clf(y_train_1, X_train_1, y_test_1, X_test_1, i, clf_tree)
er_train.append(er_i[0])
er_test.append(er_i[1])
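For intuition: get_error_rate is neither entropy nor Gini impurity; it simply computes the misclassification rate, i.e. the fraction of predictions that disagree with the true labels. A tiny check:

import numpy as np
pred = np.array([1, -1, 1, 1])
Y = np.array([1, -1, -1, 1])
print(get_error_rate(pred, Y))  # 0.25: one of four predictions is wrong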

How to iterate over rows in dataset for distance calculation

I have the Iris dataset and I want to calculate the distance between all pairs of rows, i.e. 0 and 1, 0 and 2, ..., 1 and 2, 1 and 3, ..., for KNN.
My code:
import numpy as np
from sklearn import datasets
import pandas as pd

# 1 Handle the data
iris = datasets.load_iris()
x = iris.data[:, :4]
y = iris.target.reshape((150, 1))

def shuffle(x, y, percentage):
    iris_data = np.concatenate((x, y), axis=1)
    shuffling = iris_data[np.random.permutation(len(iris_data))]
    train, test = np.split(shuffling, [int(percentage * len(iris_data))])
    x_train = train[:, :4]
    y_train = train[:, -1]
    x_test = test[:, :4]
    y_test = test[:, -1]
    return [iris_data, x_train, y_train, x_test, y_test]

shuf = shuffle(x, y, 0.7)
x_train = shuf[1]; y_train = shuf[2]
x_test = shuf[3]; y_test = shuf[4]

# 2 Distance function
def distance(x, x_test, y, y_test):
    cont = 0
    dist = {}
    for i in range(x.shape[0]):
        for j in range(x.shape[0]):
            cont += (x[i] - x_test[j])**2
        dist[i] = (np.sqrt(cont), y[i])
    return dist
But I get a dictionary of NumPy arrays of shape (4,) instead of scalars.
I tried to use itertools.combinations but I ran into some errors.
One more question: how can I store my output in a DataFrame with the distances and the labels instead of a dict (dist = {})?
Thank you.
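A minimal sketch of one way to fix both issues, assuming Euclidean distance between every training row and every test row (summing over the four features is what collapses each distance to a scalar):

import numpy as np
import pandas as pd

def distance(x_train, x_test, y_train):
    rows = []
    for i in range(x_train.shape[0]):
        for j in range(x_test.shape[0]):
            # np.sum over the 4 features gives a scalar, not a (4,) array
            d = np.sqrt(np.sum((x_train[i] - x_test[j]) ** 2))
            rows.append((i, j, d, y_train[i]))
    # A DataFrame instead of a dict, with distances and labels as columns
    return pd.DataFrame(rows, columns=["train_row", "test_row", "distance", "label"])

dist_df = distance(x_train, x_test, y_train)
print(dist_df.head())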

KNN Regression results in zero MSE on training set (sklearn)

I am using sklearn and trying to evaluate a KNN regression function with the code below:
def cross_validate(X, y, n_neighbors, test_size=0.20):
    training_mses = []
    test_mses = []
    n = X.shape[0]
    test_n = int(np.round(test_size * n, 0))
    indices = np.arange(n)
    random.shuffle(indices)
    test_indices = indices[0:test_n]
    training_indices = indices[test_n:]
    X_test, y_test = X[test_indices], y[test_indices]
    X_train, y_train = X[training_indices], y[training_indices]
    knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights="distance",
                                        algorithm='brute')
    model = knn.fit(X_train, y_train)
    y_hat = model.predict(X_train)
    training_mse = mse(y_train - y_hat)
    model2 = knn.fit(X_test, y_test)
    y_hat = model2.predict(X_test)
    test_mse = mse(y_test - y_hat)
    return training_mse, test_mse
I did something similar with linear regression. The difference I have found is that when I run it on KNN regression, training_mse and test_mse are both 0. If I use the test data on the model fitted with the training set, it gives me a non-zero MSE. But I just don't believe that the fitted values for the training and test sets are the same as the observed values. What am I doing wrong? The function I was trying to emulate is below and gives non-zero values for MSE:
def cross_validate(formula, data, test_size=0.20):
    training_mses = []
    test_mses = []
    n = data.shape[0]
    test_n = int(np.round(test_size * n, 0))
    indices = deepcopy(data.index).values
    random.shuffle(indices)
    test_indices = indices[0:test_n]
    training_indices = indices[test_n:]
    test_set = data.ix[test_indices]
    training_set = data.ix[training_indices]
    y, X = patsy.dmatrices(formula, training_set, return_type="matrix")
    model = linear.LinearRegression(fit_intercept=False).fit(X, y)
    y_hat = model.predict(X)
    training_mse = mse(y - y_hat)
    y, X = patsy.dmatrices(formula, test_set, return_type="matrix")
    y_hat = model.predict(X)
    test_mse = mse(y - y_hat)
    return training_mse, test_mse
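Two things explain the zeros here. With weights="distance", every training point is its own nearest neighbor at distance zero, so predicting on the training set reproduces the targets exactly; and model2 is refit on the test data before predicting on that same data, so the test MSE is in-sample as well. A sketch of the intended evaluation, keeping the original mse and neighbors names:

knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                    weights="distance", algorithm='brute')
model = knn.fit(X_train, y_train)
# In-sample error: zero by construction with weights="distance"
training_mse = mse(y_train - model.predict(X_train))
# Held-out error: predict on X_test with the model fitted on X_train
test_mse = mse(y_test - model.predict(X_test))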

Doing cross validation from scratch

I found this function definition on Stack Overflow:
def fold_i_of_k(dataset, i, k):
    n = len(dataset)
    return len(dataset[n*(i-1)//k:n*i//k])
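Note that fold_i_of_k returns the length of the i-th fold, not the rows themselves, which is why the code below uses it as a slice size. A quick check with a stand-in dataset:

toy = list(range(105))  # stand-in; 105 rows do not divide evenly by 10
print(fold_i_of_k(toy, 1, 10))   # 10
print(fold_i_of_k(toy, 10, 10))  # 11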
# this is my code below
# Constants
FOLD_I = 1
FOLD_K = 10

# Creating 10 folds
counter = 1
s = 0
total_ac = 0
while counter != FOLD_K + 1:
    print("Fold ", counter)
    fold = fold_i_of_k(dataset, counter, 10)  # size of the current fold
    d_fold = dataset[s:s + fold]
    d_test = d_fold
    X_test = d_test.iloc[:, 0:11]
    y_test = d_test.iloc[:, 11:12]
    d_train = dataset.drop(dataset.index[s:s + fold])
    X_train = d_train.iloc[:, 0:11]
    y_train = d_train.iloc[:, 11:12]
    X_train = X_train.as_matrix()
    X_train = preprocessing.scale(X_train)
    y_train = y_train.as_matrix()
    X_test = X_test.as_matrix()
    X_test = preprocessing.scale(X_test)
    y_test = y_test.as_matrix()
    print(y_train.shape)
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    lr_pred = lr.predict(X_test)
    ac = accuracy_score(y_test, lr_pred)  # needed below for total_ac
    print(ac)
    total_ac = total_ac + ac
    s = s + fold
    counter = counter + 1
total_ac = total_ac / FOLD_K
print("Cross validation accuracy is: ", total_ac)
I am getting the following warning:
/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:578:
DataConversionWarning: A column-vector y was passed when a 1d array
was expected. Please change the shape of y to (n_samples, ), for
example using ravel().
y = column_or_1d(y, warn=True)
How can I fix it?
y_train.ravel() solved the problem.
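That is, since y_train comes from the single-column slice iloc[:, 11:12], it stays 2-D with shape (n_samples, 1); flattening it before the fit silences the warning:

lr = LogisticRegression()
lr.fit(X_train, y_train.ravel())  # (n_samples, 1) -> (n_samples,)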
