I have a time-series dataset with two lables (0 and 1). I am using Dynamic Time Warping (DTW) as a similarity measure for classification using k-nearest neighbour (kNN) as described in these two wonderful blog posts:
https://nbviewer.jupyter.org/github/markdregan/K-Nearest-Neighbors-with-Dynamic-Time-Warping/blob/master/K_Nearest_Neighbor_Dynamic_Time_Warping.ipynb
http://alexminnaar.com/2014/04/16/Time-Series-Classification-and-Clustering-with-Python.html
Arguments
---------
n_neighbors : int, optional (default = 5)
Number of neighbors to use by default for KNN
max_warping_window : int, optional (default = infinity)
Maximum warping window allowed by the DTW dynamic
programming function
subsample_step : int, optional (default = 1)
Step size for the timeseries array. By setting subsample_step = 2,
the timeseries length will be reduced by 50% because every second
item is skipped. Implemented by x[:, ::subsample_step]
"""
def __init__(self, n_neighbors=5, max_warping_window=10000, subsample_step=1):
self.n_neighbors = n_neighbors
self.max_warping_window = max_warping_window
self.subsample_step = subsample_step
def fit(self, x, l):
"""Fit the model using x as training data and l as class labels
Arguments
---------
x : array of shape [n_samples, n_timepoints]
Training data set for input into KNN classifer
l : array of shape [n_samples]
Training labels for input into KNN classifier
"""
self.x = x
self.l = l
def _dtw_distance(self, ts_a, ts_b, d = lambda x,y: abs(x-y)):
"""Returns the DTW similarity distance between two 2-D
timeseries numpy arrays.
Arguments
---------
ts_a, ts_b : array of shape [n_samples, n_timepoints]
Two arrays containing n_samples of timeseries data
whose DTW distance between each sample of A and B
will be compared
d : DistanceMetric object (default = abs(x-y))
the distance measure used for A_i - B_j in the
DTW dynamic programming function
Returns
-------
DTW distance between A and B
"""
# Create cost matrix via broadcasting with large int
ts_a, ts_b = np.array(ts_a), np.array(ts_b)
M, N = len(ts_a), len(ts_b)
cost = sys.maxint * np.ones((M, N))
# Initialize the first row and column
cost[0, 0] = d(ts_a[0], ts_b[0])
for i in xrange(1, M):
cost[i, 0] = cost[i-1, 0] + d(ts_a[i], ts_b[0])
for j in xrange(1, N):
cost[0, j] = cost[0, j-1] + d(ts_a[0], ts_b[j])
# Populate rest of cost matrix within window
for i in xrange(1, M):
for j in xrange(max(1, i - self.max_warping_window),
min(N, i + self.max_warping_window)):
choices = cost[i - 1, j - 1], cost[i, j-1], cost[i-1, j]
cost[i, j] = min(choices) + d(ts_a[i], ts_b[j])
# Return DTW distance given window
return cost[-1, -1]
def _dist_matrix(self, x, y):
"""Computes the M x N distance matrix between the training
dataset and testing dataset (y) using the DTW distance measure
Arguments
---------
x : array of shape [n_samples, n_timepoints]
y : array of shape [n_samples, n_timepoints]
Returns
-------
Distance matrix between each item of x and y with
shape [training_n_samples, testing_n_samples]
"""
# Compute the distance matrix
dm_count = 0
# Compute condensed distance matrix (upper triangle) of pairwise dtw distances
# when x and y are the same array
if(np.array_equal(x, y)):
x_s = np.shape(x)
dm = np.zeros((x_s[0] * (x_s[0] - 1)) // 2, dtype=np.double)
p = ProgressBar(shape(dm)[0])
for i in xrange(0, x_s[0] - 1):
for j in xrange(i + 1, x_s[0]):
dm[dm_count] = self._dtw_distance(x[i, ::self.subsample_step],
y[j, ::self.subsample_step])
dm_count += 1
p.animate(dm_count)
# Convert to squareform
dm = squareform(dm)
return dm
# Compute full distance matrix of dtw distnces between x and y
else:
x_s = np.shape(x)
y_s = np.shape(y)
dm = np.zeros((x_s[0], y_s[0]))
dm_size = x_s[0]*y_s[0]
p = ProgressBar(dm_size)
for i in xrange(0, x_s[0]):
for j in xrange(0, y_s[0]):
dm[i, j] = self._dtw_distance(x[i, ::self.subsample_step],
y[j, ::self.subsample_step])
# Update progress bar
dm_count += 1
p.animate(dm_count)
return dm
def predict(self, x):
"""Predict the class labels or probability estimates for
the provided data
Arguments
---------
x : array of shape [n_samples, n_timepoints]
Array containing the testing data set to be classified
Returns
-------
2 arrays representing:
(1) the predicted class labels
(2) the knn label count probability
"""
dm = self._dist_matrix(x, self.x)
# Identify the k nearest neighbors
knn_idx = dm.argsort()[:, :self.n_neighbors]
# Identify k nearest labels
knn_labels = self.l[knn_idx]
# Model Label
mode_data = mode(knn_labels, axis=1)
mode_label = mode_data[0]
mode_proba = mode_data[1]/self.n_neighbors
return mode_label.ravel(), mode_proba.ravel()
However, for classification with kNN the two posts use their own kNN algorithms.
I want to use sklearn's options such as gridsearchcv in my classification. Therefore, I would like to know how I can use Dynamic Time Warping (DTW) with sklearn kNN.
Note: I am not limited to sklearn and happy to receive answers in other libraries as well
I am happy to provide more details if needed.
You can use a custom metric for KNN.
Therefore you only need to implement DTW yourself (or use/adapt any existing DTW implementation in python) [gist of this code].
import numpy as np
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
#toy dataset
X = np.random.random((100,10))
y = np.random.randint(0,2, (100))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
#custom metric
def DTW(a, b):
an = a.size
bn = b.size
pointwise_distance = distance.cdist(a.reshape(-1,1),b.reshape(-1,1))
cumdist = np.matrix(np.ones((an+1,bn+1)) * np.inf)
cumdist[0,0] = 0
for ai in range(an):
for bi in range(bn):
minimum_cost = np.min([cumdist[ai, bi+1],
cumdist[ai+1, bi],
cumdist[ai, bi]])
cumdist[ai+1, bi+1] = pointwise_distance[ai,bi] + minimum_cost
return cumdist[an, bn]
#train
parameters = {'n_neighbors':[2, 4, 8]}
clf = GridSearchCV(KNeighborsClassifier(metric=DTW), parameters, cv=3, verbose=1)
clf.fit(X_train, y_train)
#evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
Which yields
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[Parallel(n_jobs=1)]: Done 9 out of 9 | elapsed: 29.0s finished
precision recall f1-score support
0 0.57 0.89 0.70 18
1 0.60 0.20 0.30 15
avg / total 0.58 0.58 0.52 33
Use dtaidistance. This is the simplified pipeline of what I'm using in order to find the best fit for all windows with lengths between 1 and 20:
from dtaidistance import dtw
from sklearn.metrics import f1_score
def knn(trainX,trainY,testX,testY,w):
predictions = np.zeros(len(testX))
for testSampleIndex,testSample in enumerate(testX):
minimumDistance = float('inf')
for trainingSampleIndex, trainingSample in enumerate(trainX):
distanceBetweenTestAndTrainingAScan = dtw.distance(testSample,trainingSample,use_c=True,window=w,max_dist=minimumDistance)
if (distanceBetweenTestAndTrainingAScan < minimumDistance):
minimumDistance = distanceBetweenTestAndTrainingAScan
predictions[testSampleIndex] = trainY[trainingSampleIndex]
return [testY,predictions]
def DTWForCurrentDataSet(testX,testY,trainX,trainY,testDataSetID):
testDataSetBestF1Score = -float("inf")
testDataSetBestPredictions = []
for w in range(1,21):
[testY,predictions] = knn(trainX,trainY,testX,testY,w)
microF1Score = f1_score(testY, predictions, average='micro')
if (microF1Score > testDataSetBestF1Score):
testDataSetBestF1Score = microF1Score
testDataSetBestPredictions = predictions
return testDataSetBestPredictions
def runDTW(database):
for testDataSetID in database:
[testX,testY,trainX,trainY,patientIDsForTraining] = createTestingAndTrainingSets(database,testDataSetID)
testDataSetBestPredictions = DTWForCurrentDataSet(testX,testY,trainX,trainY,testDataSetID)
The tslearn library has DTW metric and can be used with sklearn.
from tslearn.metrics import dtw
clf = KNeighborsClassifier(n_neighbors=10, metric=dtw)
Related
I got this code for spectral clustering.
https://github.com/BirdYin/scllc/blob/master/scllc.py
This is a landmark-based spectral clustering code.
What does the locality_linear_coding function do in this code?
class Scllc:
def __locality_linear_coding(self, data, neighbors):
indicator = np.ones([neighbors.shape[0], 1])
penalty = np.eye(self.n_neighbors)
# Get the weights of every neighbors
z = neighbors - indicator.dot(data.reshape(-1,1).T)
local_variance = z.dot(z.T)
local_variance = local_variance + self.lambda_val * penalty
weights = scipy.linalg.solve(local_variance, indicator)
weights = weights / np.sum(weights)
weights = weights / np.sum(np.abs(weights))
weights = np.abs(weights)
return weights.reshape(self.n_neighbors)
def fit(self, X):
[n_data, n_dim] = X.shape
# Select landmarks
if self.func_landmark == 'kmeans':
landmarks, centers, unknown = k_means(X, self.n_landmarks, n_init=1, max_iter=100)
nbrs = NearestNeighbors(metric='euclidean').fit(landmarks)
# Create properties of the sparse matrix Z
[dist, indy] = nbrs.kneighbors(X, n_neighbors = self.n_neighbors)
indx = np.ones([n_data, self.n_neighbors]) * np.asarray(range(n_data))[:, None]
valx = np.zeros([n_data, self.n_neighbors])
self.delta = np.mean(valx)
# Compute all the coded data
for index in range(n_data):
# Compute the weights of its neighbors
localmarks = landmarks[indy[index,:], :]
weights = self.__locality_linear_coding(X[index,:], localmarks)
# Compute the coded data
valx[index] = weights
# Construct sparse matrix
indx = indx.reshape(n_data * self.n_neighbors)
indy = indy.reshape(n_data * self.n_neighbors)
valx = valx.reshape(n_data * self.n_neighbors)
Z = sparse.coo_matrix((valx,(indx,indy)),shape=(n_data,self.n_landmarks))
Z = Z / np.sqrt(np.sum(Z, 0))
# Get first k eigenvectors
[U, Sigma, V] = svds(Z, k = self.n_clusters + 1)
U = U[:, 0:self.n_clusters]
embedded_data = U / np.sqrt(np.sum(U * U, 0))
You can see the documentation of numpy module to deal with n-dimensional array
.For exemple, the dot method do the product of the matrices
Than They have use the scipy module, you can also see the documentation on internet.
the first function of a class is always an initialize method. Because the user have to call it to fully use the class. It is the first function where are defined and saved all the variables that the user want
I wrote multilayer-perceptron, using three layers (0,1,2). I want to plot the decision boundary and the data-set(eight features long) that i classified, Using python.
How do i plot it on the screen, using one of the python libraries?
Weight function -> matrix[3][8]
Sample x -> vector[8]
#-- Trains the boundary decision, and test it. --#
def perceptron(x, y):
m = len(x)
d = len(x[0])
eta = 0.1
w = [[0 for k in range(d)] for j in range(3)]
T = 2500
for t in range(0, T):
i = random.randint(0, m - 1)
v = [float(j) for j in x[i]]
y_hat = np.argmax(np.dot(w, v))
if y_hat != y[i]:
w[y[i]] = np.add(w[y[i]], np.array(v) * eta)
w[y_hat] = np.subtract(w[y_hat], np.array(v) * eta)
w_perceptron = w
#-- Test the decision boundary that we trained. --#
#-- Prints the loss weight function. --#
M_perceptron = 0
for t in range(0, m):
y_hat = np.argmax(np.dot(w_perceptron, x[t]))
if y[t] != y_hat:
M_perceptron = M_perceptron + 1
return float(M_perceptron) / m
def main():
y = []
x = [[]]
x = readTrain_X(sys.argv[1], x) # Reads data trainning set.
readTrain_Y(sys.argv[2], y) # Reads right classified training set.
print(perceptron(x, y))
You cannot plot 8 features. There is no way you can visualize a 8D space. But what you can do is to perform dimensionality reduction using PCA/t-SNE to 2D for visualization. If you can reduce it to 2D then you can use create a grid of values and use the probabilities returned by the model to visualize the decision boundary.
Reference: Link
I'm trying to implement emcee MCMC sampling in Python with a predefined likelihood function to find the best boundary between two populations of data.
For emcee see: http://dfm.io/emcee/current/user/line/
The likelihood function calculates the true positive and true negative classifications, given some linear boundary line, and is used to minimise the difference between the two values whilst maximising their sum.
This way you can imagine a TP and TN rate of 1 respectively will give a likelihood value of 1 while TP and TN rates of 0 will return a likelihood value of 0.
But when I attempt to sample the parameter space for m and b, the gradient and offset (or bias), for the boundary line, I get some wildly big and/or small values for the walks.
I have put an example code below which generates some nicely divided populations and then MCMCs around the initial guesses of the parameter values. I'm unsure as to why the MCMC chains don't converge nicely to an appropriate value here so any help would be greatly appreciated.
The following code should run out-of-the-box.
import emcee
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
#generate some test x and y data
folded_xy_train = np.random.uniform(0,1,10000) #test x data
folded_z_train = np.random.uniform(0,1,10000) #test y data
#define the true gradient and offset for the boundary line
m_true, b_true = 5,-2.5
#generate labels for the test data
rounded_labels_train = np.ones(len(folded_z_train))
model = (m_true*folded_xy_train) + b_true
difference = model - folded_z_train
rounded_labels_train[difference<0] = 0
#show the test data
plt.figure()
plt.scatter(folded_xy_train,folded_z_train,c=rounded_labels_train,s=1.0)
#define a likelihood function for the boundary line
def lnlike(theta, x, y, labels):
m, b = theta
model = (m*x) + b
difference = model - y
classifications = np.ones(len(y))
classifications[difference<0]=0
cfm = confusion_matrix(labels,classifications)
cm = cfm.astype('float') / cfm.sum(axis=1)[:, np.newaxis]
tn, fp, fn, tp = cm.ravel()
likelihood_val = (0.5*(tp+tn))/(1+np.abs(tp-tn))
ln_like = -np.log(likelihood_val)
return ln_like
#define a wide flat prior
def lnprior(theta):
m, b, = theta
if 0 < m < 10 and -20 < b < 5:
return 0.0
return -np.inf
#define the posterior
def lnprob(p, x, y, labels):
lp = lnprior(p)
if not np.isfinite(lp):
return 0
return lp + lnlike(p, x, y, labels)
#setup the MCMC sampling
nwalkers = 4
ndim = 2
p0 = np.array([4.2,-2]) + [np.random.rand(ndim) for i in range(nwalkers)]
sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(folded_xy_train, folded_z_train, rounded_labels_train))
sampler.run_mcmc(p0, 500)
#extract the MCMC paramater value chains
samples = sampler.chain[:, 50:, :].reshape((-1, ndim))
#view the parameter chains
plt.figure()
plt.subplot(211)
plt.plot(samples[:,0])
plt.subplot(212)
plt.plot(samples[:,1])
The initial test data, showing an obvious boundary line for given x y data (coloured by binary class label):
The sample walks, showing strange sampling for the gradient parameter (top) and offset parameter (bottom). The x-axis denotes the MCMC walk step number and the y-axis denotes the MCMC parameter values at a given step:
I'm working through my Matlab code for the Andrew NG Coursera course and turning it into python. I am working on non-regularized logistic regression and after writing my gradient and cost functions I needed something similar to fminunc and after some googling, I found a couple options. They are both returning the same results, but they do not match what is in Andrew NG's expected results code. Others seem to be getting this to work correctly, but I'm wondering why my specific code does not seem to return the desired result when using scipy.optimize functions, but does for the cost and gradient pieces earlier in the code.
The data I'm using can be found at the link below;
ex2data1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize as op
#Machine Learning Online Class - Exercise 2: Logistic Regression
#Load Data
#The first two columns contains the exam scores and the third column contains the label.
data = pd.read_csv('ex2data1.txt', header = None)
X = np.array(data.iloc[:, 0:2]) #100 x 3
y = np.array(data.iloc[:,2]) #100 x 1
y.shape = (len(y), 1)
#Creating sub-dataframes for plotting
pos_plot = data[data[2] == 1]
neg_plot = data[data[2] == 0]
#==================== Part 1: Plotting ====================
#We start the exercise by first plotting the data to understand the
#the problem we are working with.
print('Plotting data with + indicating (y = 1) examples and o indicating (y = 0) examples.')
plt.plot(pos_plot[0], pos_plot[1], "+", label = "Admitted")
plt.plot(neg_plot[0], neg_plot[1], "o", label = "Not Admitted")
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.legend()
plt.show()
def sigmoid(z):
'''
SIGMOID Compute sigmoid function
g = SIGMOID(z) computes the sigmoid of z.
Instructions: Compute the sigmoid of each value of z (z can be a matrix,
vector or scalar).
'''
g = 1 / (1 + np.exp(-z))
return g
def costFunction(theta, X, y):
'''
COSTFUNCTION Compute cost and gradient for logistic regression
J = COSTFUNCTION(theta, X, y) computes the cost of using theta as the
parameter for logistic regression and the gradient of the cost
w.r.t. to the parameters.
'''
m = len(y) #number of training examples
h = sigmoid(X.dot(theta)) #logisitic regression hypothesis
J = (1/m) * np.sum((-y*np.log(h)) - ((1-y)*np.log(1-h)))
#h is 100x1, y is %100x1, these end up as 2 vector we subtract from each other
#then we sum the values by rows
#cost function for logisitic regression
return J
def gradient(theta, X, y):
m = len(y)
grad = np.zeros((theta.shape))
h = sigmoid(X.dot(theta))
for i in range(len(theta)): #number of rows in theta
XT = X[:,i]
XT.shape = (len(X),1)
grad[i] = (1/m) * np.sum((h-y)*XT) #updating each row of the gradient
return grad
#============ Part 2: Compute Cost and Gradient ============
#In this part of the exercise, you will implement the cost and gradient
#for logistic regression. You neeed to complete the code in costFunction.m
#Add intercept term to x and X_test
Bias = np.ones((len(X), 1))
X = np.column_stack((Bias, X))
#Initialize fitting parameters
initial_theta = np.zeros((len(X[0]), 1))
#Compute and display initial cost and gradient
(cost, grad) = costFunction(initial_theta, X, y), gradient(initial_theta, X, y)
print('Cost at initial theta (zeros): %f' % cost)
print('Expected cost (approx): 0.693\n')
print('Gradient at initial theta (zeros):')
print(grad)
print('Expected gradients (approx):\n -0.1000\n -12.0092\n -11.2628')
#Compute and display cost and gradient with non-zero theta
test_theta = np.array([[-24], [0.2], [0.2]]);
(cost, grad) = costFunction(test_theta, X, y), gradient(test_theta, X, y)
print('\nCost at test theta: %f' % cost)
print('Expected cost (approx): 0.218\n')
print('Gradient at test theta:')
print(grad)
print('Expected gradients (approx):\n 0.043\n 2.566\n 2.647\n')
result = op.fmin_tnc(func = costFunction, x0 = initial_theta, fprime = gradient, args = (X,y))
result[1]
Result = op.minimize(fun = costFunction,
x0 = initial_theta,
args = (X, y),
method = 'TNC',
jac = gradient, options={'gtol': 1e-3, 'disp': True, 'maxiter': 1000})
theta = Result.x
theta
test = np.array([[1, 45, 85]])
prob = sigmoid(test.dot(theta))
print('For a student with scores 45 and 85, we predict an admission probability of %f,' % prob)
print('Expected value: 0.775 +/- 0.002\n')
This was a very difficult problem to debug, and illustrates a poorly documented aspect of the scipy.optimize interface. The documentation vaguely indicates that theta will be passed around as a vector:
Minimization of scalar function of one or more variables.
In general, the optimization problems are of the form:
minimize f(x) subject to
g_i(x) >= 0, i = 1,...,m
h_j(x) = 0, j = 1,...,p
where x is a vector of one or more variables.
What's important is that they really mean vector in the most primitive sense, a 1-dimensional array. So you have to expect that whenever theta is passed into one of your callbacks, it will be passed in as a 1-d array. But in numpy, 1-d arrays sometimes behave differently from 2-d row arrays (and, obviously, from 2-d column arrays).
I don't know exactly why it's causing a problem in your case, but it's easily fixed regardless. You just have to add the following at the top of both your cost function and your gradient function:
theta = theta.reshape(-1, 1)
This guarantees that theta will be a 2-d column array, as expected. Once you've done this, the results are correct.
I have had similar issues with Scipy dealing with the same problem as you. As senderle points out the interface is not the easiest to deal with, especially combined with the numpy array interface... Here is my implementation which works as expected.
Defining the cost and gradient functions
Note that initial_theta is passed as a simple array of shape (3,) and converted to a column vector of shape (3,1) within the function. The gradient function then returns the grad.ravel() which has shape (3,) again. This is important as doing otherwise caused an error message with various optimization methods in Scipy.optimize.
Note that different methods have different behaviours but returning .ravel() seems to fix most issues...
import pandas as pd
import numpy as np
import scipy.optimize as opt
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def CostFunc(theta,X,y):
#Initializing variables
m = len(y)
J = 0
grad = np.zeros(theta.shape)
#Vectorized computations
z = X # theta
h = sigmoid(z)
J = (1/m) * ( (-y.T # np.log(h)) - (1 - y).T # np.log(1-h));
return J
def Gradient(theta,X,y):
#Initializing variables
m = len(y)
theta = theta[:,np.newaxis]
grad = np.zeros(theta.shape)
#Vectorized computations
z = X # theta
h = sigmoid(z)
grad = (1/m)*(X.T # ( h - y));
return grad.ravel() #<-- This is the trick
Initializing variables and parameters
Note that initial_theta.shape returns (3,)
X = data1.iloc[:,0:2].values
m,n = X.shape
X = np.concatenate((np.ones(m)[:,np.newaxis],X),1)
y = data1.iloc[:,-1].values[:,np.newaxis]
initial_theta = np.zeros((n+1))
Calling Scipy.optimize
model = opt.minimize(fun = CostFunc, x0 = initial_theta, args = (X, y), method = 'TNC', jac = Gradient)
Any comments from more knowledgeable people are welcome, this Scipy interface is a mystery to me, thanks
I am able to get a ROC curve using scikit-learn with
fpr, tpr, thresholds = metrics.roc_curve(y_true,y_pred, pos_label=1), where y_true is a list of values based on my gold standard (i.e., 0 for negative and 1 for positive cases) and y_pred is a corresponding list of scores (e.g., 0.053497243, 0.008521122, 0.022781548, 0.101885263, 0.012913795, 0.0, 0.042881547 [...])
I am trying to figure out how to add confidence intervals to that curve, but didn't find any easy way to do that with sklearn.
You can bootstrap the ROC computations (sample with replacement new versions of y_true / y_pred out of the original y_true / y_pred and recompute a new value for roc_curve each time) and the estimate a confidence interval this way.
To take the variability induced by the train test split into account, you can also use the ShuffleSplit CV iterator many times, fit a model on the train split, generate y_pred for each model and thus gather an empirical distribution of roc_curves as well and finally compute confidence intervals for those.
Edit: bootstrapping in python
Here is an example for bootstrapping the ROC AUC score out of the predictions of a single model. I chose to bootstrap the ROC AUC to make it easier to follow as a Stack Overflow answer, but it can be adapted to bootstrap the whole curve instead:
import numpy as np
from scipy.stats import sem
from sklearn.metrics import roc_auc_score
y_pred = np.array([0.21, 0.32, 0.63, 0.35, 0.92, 0.79, 0.82, 0.99, 0.04])
y_true = np.array([0, 1, 0, 0, 1, 1, 0, 1, 0 ])
print("Original ROC area: {:0.3f}".format(roc_auc_score(y_true, y_pred)))
n_bootstraps = 1000
rng_seed = 42 # control reproducibility
bootstrapped_scores = []
rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
# bootstrap by sampling with replacement on the prediction indices
indices = rng.randint(0, len(y_pred), len(y_pred))
if len(np.unique(y_true[indices])) < 2:
# We need at least one positive and one negative sample for ROC AUC
# to be defined: reject the sample
continue
score = roc_auc_score(y_true[indices], y_pred[indices])
bootstrapped_scores.append(score)
print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
You can see that we need to reject some invalid resamples. However on real data with many predictions this is a very rare event and should not impact the confidence interval significantly (you can try to vary the rng_seed to check).
Here is the histogram:
import matplotlib.pyplot as plt
plt.hist(bootstrapped_scores, bins=50)
plt.title('Histogram of the bootstrapped ROC AUC scores')
plt.show()
Note that the resampled scores are censored in the [0 - 1] range causing a high number of scores in the last bin.
To get a confidence interval one can sort the samples:
sorted_scores = np.array(bootstrapped_scores)
sorted_scores.sort()
# Computing the lower and upper bound of the 90% confidence interval
# You can change the bounds percentiles to 0.025 and 0.975 to get
# a 95% confidence interval instead.
confidence_lower = sorted_scores[int(0.05 * len(sorted_scores))]
confidence_upper = sorted_scores[int(0.95 * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
confidence_lower, confidence_upper))
which gives:
Confidence interval for the score: [0.444 - 1.0]
The confidence interval is very wide but this is probably a consequence of my choice of predictions (3 mistakes out of 9 predictions) and the total number of predictions is quite small.
Another remark on the plot: the scores are quantized (many empty histogram bins). This is a consequence of the small number of predictions. One could introduce a bit of Gaussian noise on the scores (or the y_pred values) to smooth the distribution and make the histogram look better. But then the choice of the smoothing bandwidth is tricky.
Finally as stated earlier this confidence interval is specific to you training set. To get a better estimate of the variability of the ROC induced by your model class and parameters, you should do iterated cross-validation instead. However this is often much more costly as you need to train a new model for each random train / test split.
EDIT: since I first wrote this reply, there is a bootstrap implementation in scipy directly:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html
DeLong Solution
[NO bootstrapping]
As some of here suggested, the pROC package in R comes very handy for ROC AUC confidence intervals out-of-the-box, but that packages is not found in python. According to pROC documentation, confidence intervals are calculated via DeLong:
DeLong is an asymptotically exact method to evaluate the uncertainty
of an AUC (DeLong et al. (1988)). Since version 1.9, pROC uses the
algorithm proposed by Sun and Xu (2014) which has an O(N log N)
complexity and is always faster than bootstrapping. By default, pROC
will choose the DeLong method whenever possible.
And luckily for us, Yandex Data School has a Fast DeLong implementation on their public repo:
https://github.com/yandexdataschool/roc_comparison
So all credits to them for the DeLong implementation used in this example.
So here is how you get a CI via DeLong:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 6 10:06:52 2018
#author: yandexdataschool
Original Code found in:
https://github.com/yandexdataschool/roc_comparison
updated: Raul Sanchez-Vazquez
"""
import numpy as np
import scipy.stats
from scipy import stats
# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(x):
"""Computes midranks.
Args:
x - a 1D numpy array
Returns:
array of midranks
"""
J = np.argsort(x)
Z = x[J]
N = len(x)
T = np.zeros(N, dtype=np.float)
i = 0
while i < N:
j = i
while j < N and Z[j] == Z[i]:
j += 1
T[i:j] = 0.5*(i + j - 1)
i = j
T2 = np.empty(N, dtype=np.float)
# Note(kazeevn) +1 is due to Python using 0-based indexing
# instead of 1-based in the AUC formula in the paper
T2[J] = T + 1
return T2
def compute_midrank_weight(x, sample_weight):
"""Computes midranks.
Args:
x - a 1D numpy array
Returns:
array of midranks
"""
J = np.argsort(x)
Z = x[J]
cumulative_weight = np.cumsum(sample_weight[J])
N = len(x)
T = np.zeros(N, dtype=np.float)
i = 0
while i < N:
j = i
while j < N and Z[j] == Z[i]:
j += 1
T[i:j] = cumulative_weight[i:j].mean()
i = j
T2 = np.empty(N, dtype=np.float)
T2[J] = T
return T2
def fastDeLong(predictions_sorted_transposed, label_1_count, sample_weight):
if sample_weight is None:
return fastDeLong_no_weights(predictions_sorted_transposed, label_1_count)
else:
return fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight)
def fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight):
"""
The fast version of DeLong's method for computing the covariance of
unadjusted AUC.
Args:
predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
sorted such as the examples with label "1" are first
Returns:
(AUC value, DeLong covariance)
Reference:
#article{sun2014fast,
title={Fast Implementation of DeLong's Algorithm for
Comparing the Areas Under Correlated Receiver Oerating Characteristic Curves},
author={Xu Sun and Weichao Xu},
journal={IEEE Signal Processing Letters},
volume={21},
number={11},
pages={1389--1393},
year={2014},
publisher={IEEE}
}
"""
# Short variables are named as they are in the paper
m = label_1_count
n = predictions_sorted_transposed.shape[1] - m
positive_examples = predictions_sorted_transposed[:, :m]
negative_examples = predictions_sorted_transposed[:, m:]
k = predictions_sorted_transposed.shape[0]
tx = np.empty([k, m], dtype=np.float)
ty = np.empty([k, n], dtype=np.float)
tz = np.empty([k, m + n], dtype=np.float)
for r in range(k):
tx[r, :] = compute_midrank_weight(positive_examples[r, :], sample_weight[:m])
ty[r, :] = compute_midrank_weight(negative_examples[r, :], sample_weight[m:])
tz[r, :] = compute_midrank_weight(predictions_sorted_transposed[r, :], sample_weight)
total_positive_weights = sample_weight[:m].sum()
total_negative_weights = sample_weight[m:].sum()
pair_weights = np.dot(sample_weight[:m, np.newaxis], sample_weight[np.newaxis, m:])
total_pair_weights = pair_weights.sum()
aucs = (sample_weight[:m]*(tz[:, :m] - tx)).sum(axis=1) / total_pair_weights
v01 = (tz[:, :m] - tx[:, :]) / total_negative_weights
v10 = 1. - (tz[:, m:] - ty[:, :]) / total_positive_weights
sx = np.cov(v01)
sy = np.cov(v10)
delongcov = sx / m + sy / n
return aucs, delongcov
def fastDeLong_no_weights(predictions_sorted_transposed, label_1_count):
"""
The fast version of DeLong's method for computing the covariance of
unadjusted AUC.
Args:
predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
sorted such as the examples with label "1" are first
Returns:
(AUC value, DeLong covariance)
Reference:
#article{sun2014fast,
title={Fast Implementation of DeLong's Algorithm for
Comparing the Areas Under Correlated Receiver Oerating
Characteristic Curves},
author={Xu Sun and Weichao Xu},
journal={IEEE Signal Processing Letters},
volume={21},
number={11},
pages={1389--1393},
year={2014},
publisher={IEEE}
}
"""
# Short variables are named as they are in the paper
m = label_1_count
n = predictions_sorted_transposed.shape[1] - m
positive_examples = predictions_sorted_transposed[:, :m]
negative_examples = predictions_sorted_transposed[:, m:]
k = predictions_sorted_transposed.shape[0]
tx = np.empty([k, m], dtype=np.float)
ty = np.empty([k, n], dtype=np.float)
tz = np.empty([k, m + n], dtype=np.float)
for r in range(k):
tx[r, :] = compute_midrank(positive_examples[r, :])
ty[r, :] = compute_midrank(negative_examples[r, :])
tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
v01 = (tz[:, :m] - tx[:, :]) / n
v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
sx = np.cov(v01)
sy = np.cov(v10)
delongcov = sx / m + sy / n
return aucs, delongcov
def calc_pvalue(aucs, sigma):
"""Computes log(10) of p-values.
Args:
aucs: 1D array of AUCs
sigma: AUC DeLong covariances
Returns:
log10(pvalue)
"""
l = np.array([[1, -1]])
z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)
def compute_ground_truth_statistics(ground_truth, sample_weight):
assert np.array_equal(np.unique(ground_truth), [0, 1])
order = (-ground_truth).argsort()
label_1_count = int(ground_truth.sum())
if sample_weight is None:
ordered_sample_weight = None
else:
ordered_sample_weight = sample_weight[order]
return order, label_1_count, ordered_sample_weight
def delong_roc_variance(ground_truth, predictions, sample_weight=None):
"""
Computes ROC AUC variance for a single set of predictions
Args:
ground_truth: np.array of 0 and 1
predictions: np.array of floats of the probability of being class 1
"""
order, label_1_count, ordered_sample_weight = compute_ground_truth_statistics(
ground_truth, sample_weight)
predictions_sorted_transposed = predictions[np.newaxis, order]
aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count, ordered_sample_weight)
assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
return aucs[0], delongcov
alpha = .95
y_pred = np.array([0.21, 0.32, 0.63, 0.35, 0.92, 0.79, 0.82, 0.99, 0.04])
y_true = np.array([0, 1, 0, 0, 1, 1, 0, 1, 0 ])
auc, auc_cov = delong_roc_variance(
y_true,
y_pred)
auc_std = np.sqrt(auc_cov)
lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2)
ci = stats.norm.ppf(
lower_upper_q,
loc=auc,
scale=auc_std)
ci[ci > 1] = 1
print('AUC:', auc)
print('AUC COV:', auc_cov)
print('95% AUC CI:', ci)
output:
AUC: 0.8
AUC COV: 0.028749999999999998
95% AUC CI: [0.46767194, 1.]
I've also checked that this implementation matches the pROC results obtained from R:
library(pROC)
y_true = c(0, 1, 0, 0, 1, 1, 0, 1, 0)
y_pred = c(0.21, 0.32, 0.63, 0.35, 0.92, 0.79, 0.82, 0.99, 0.04)
# Build a ROC object and compute the AUC
roc = roc(y_true, y_pred)
roc
output:
Call:
roc.default(response = y_true, predictor = y_pred)
Data: y_pred in 5 controls (y_true 0) < 4 cases (y_true 1).
Area under the curve: 0.8
Then
# Compute the Confidence Interval
ci(roc)
output
95% CI: 0.4677-1 (DeLong)