I'm working on a Quantile Regression Neural Network (QRNN) that can act as a forecaster for wind power as well as a detector for false data injection attacks.
I pretty much finished it but I am trying to quantify it's accuracy by taking the median quantile (tau = 0.5) and comparing it to the actual wind power by using an RMSE (while also trying to plot the actual wind power vs the median quantile). However, I keep running into issues with it.
Because q_hat (the predicted median quantile) is a numpy array and y_test (actual wind power test data) is a pandas dataframe, I have to convert y_test to numpy, but it's giving me this error:
"AttributeError: 'numpy.ndarray' object has no attribute 'index'"
Here is the pinball_loss.py file that is needed for this code:
import numpy as np
import tensorflow as tf
from keras import backend as K
K.set_floatx('float64')
# pinball loss function with penalty
def pinball_loss(y, q, tau, alpha = 0.001, kappa = 0, margin = 0):
"""
:param y: target
:param q: predicted quantile
:param tau: coverage level
:param alpha: smoothing parameter
:param kappa: penalty term
:param margin: margin for quantile cross-over
:return: quantile loss
"""
# calculate smooth pinball loss function
error = (y - q)
quantile_loss = K.mean(tau * error + alpha * K.softplus(-error / alpha) )
# calculate smooth cross-over penalty
diff = q[:, 1:] - q[:, :-1]
penalty = kappa * K.mean(tf.square(K.maximum(tf.Variable(tf.zeros([1], dtype=tf.float64)), margin - diff)))
return quantile_loss + penalty
Here is my code in Jupyter Notebook (I separated my cells by dashed lines for convenience):
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras import backend as K
---------------------------------------------------------------------------------------------------------
data = pd.read_csv('2018 - Lillgrund (Sweden) Offshore Wind Farm Data.csv', header = [0,1])
data = pd.DataFrame(data, columns= [['local_time','output','windspeed'],['Europe/Copenhagen','kW','m/s']])
#Combine local time with Europe/Copenhagen and output with kW as one header.
data.columns = data.columns.map('_'.join)
---------------------------------------------------------------------------------------------------------
#Use the date column as the index.
data['local_time_Europe/Copenhagen'] = pd.to_datetime(data['local_time_Europe/Copenhagen'])
#Two weeks of data from 1/1/18 to 2/1/18 indexed hourly (744 total data points).
date_rng = (data['local_time_Europe/Copenhagen'] >= '2018-1-01') & (data['local_time_Europe/Copenhagen'] <= '2018-2-1 00:00:00')
data = data.loc[date_rng]
#Sets array as dates instead of row number
data.set_index('local_time_Europe/Copenhagen', inplace=True)
#Check to see index
# data.index
---------------------------------------------------------------------------------------------------------
# create lagged time series features
lags = range(1, 6)
df = pd.DataFrame(data= data['output_kW'], index=data.index)
df = df.assign(**{'{} (t-{})'.format("Windspeed", t): data["windspeed_m/s"].shift(t) for t in lags}).dropna()
#Windspeeds are taken from hour before. i.e. original windspeed # 6am was 10.124 m/s, windspeed # t-1 (5am) is 9.596 m/s.
---------------------------------------------------------------------------------------------------------
# create training and testing data
X = df.drop(columns=["output_kW"])
Y = df['output_kW']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)
---------------------------------------------------------------------------------------------------------
#tau = np.arange(0.05, 1.0, 0.05) Original tau with multiple quantiles that I've already plotted
tau = 0.5 #Single point Quantile prediction
#N_tau = len(tau) #tau starts from 0.05 to 1, incremented by 0.05, which is 19 points.
N_tau = 1
N_features = 5 #Windspeed is the feature for X.
hidden_dim = 40 # number of hidden nodes
Lambda = 0.001 # L2 regularization
# loss function parameters (no need to modify)
loss_param={
'tau': tau,
'alpha': 0.01,
}
---------------------------------------------------------------------------------------------------------
def pinball_loss(y, q, tau, alpha):
"""
:param y: target
:param q: predicted quantile
:param tau: coverage level
:param alpha: smoothing parameter
:param kappa: penalty term
:param margin: margin for quantile cross-over
:return: quantile loss
"""
# calculate smooth pinball loss function
error = (y - q)
quantile_loss = K.mean(tau * error + alpha * K.softplus(-error / alpha) )
return quantile_loss
---------------------------------------------------------------------------------------------------------
# add first layer to a sequential model
model = Sequential()
model.add(Dense(hidden_dim,
input_dim=N_features,
kernel_regularizer=regularizers.l2(Lambda),
activation='relu'))
# add output layer
model.add(Dense(N_tau))
# compile model
model.compile(loss=lambda Y, Q: pinball_loss(y = Y, q = Q, **loss_param), optimizer='Adam')
# fit the model
history = model.fit(X_train, y_train, epochs=1000, verbose=0, batch_size=32, validation_split=0.1);
---------------------------------------------------------------------------------------------------------
plt.plot(history.history['loss']);
plt.plot(history.history['val_loss']);
plt.legend(('Loss', 'Val Loss'))
plt.ylabel('Loss');
plt.xlabel('Epoch');
plt.title("Training Loss");
---------------------------------------------------------------------------------------------------------
# estimate quantiles of testing data
q_hat = model.predict(X_test)
q_hat = pd.DataFrame(q_hat, index = X_test.index)
---------------------------------------------------------------------------------------------------------
N_PI = int(N_tau/2)
#y3 = y_test.to_numpy() #THIS ISN'T WORKING EITHER
y3 = np.array(y_test)
plt.figure(figsize=(12,6))
plt.plot(y3, color='red')
x = y3.index.values
for i in range(N_PI):
y1 = q_hat.iloc[:,i]
y2 = q_hat.iloc[:,-1-i]
plt.fill_between(x, y1, y2, color='blue', alpha=str(1/N_PI))
plt.title('RMSE: %.2f'% np.sqrt(sum((q_hat-y3)**2)/len(y3)));
plt.ylabel('Wind Power')
plt.xlabel('Time (hour)');
The final cell above is where I get the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-20-338503ab24ea> in <module>
4 plt.figure(figsize=(12,6))
5 plt.plot(y3, color='red')
----> 6 x = y3.index.values
7 for i in range(N_PI):
8 y1 = q_hat.iloc[:,i]
AttributeError: 'numpy.ndarray' object has no attribute 'index'
Here is what the QRNN looks like with multiple quantiles vs the actual wind power (red line)
Here is what happens after the "Atribute Error" above when I try to plot the Median Quantile vs the Actual Wind Power (red line) with the RMSE
I've also tried doing the following with the code where I made edits to the cell where I established tau and the last cell:
tau = np.arange(0.49, 0.51, 0.01) #Single point Quantile prediction
N_tau = len(tau) #tau starts from 0.05 to 1, incremented by 0.05, which is 19 points.
N_features = 5 #Windspeed is the feature for X.
hidden_dim = 40 # number of hidden nodes
Lambda = 0.001 # L2 regularization
# loss function parameters (no need to modify)
loss_param={
'tau': tau,
'alpha': 0.01,
}
---------------------------------------------------------------------------------------------------------
N_PI = int(N_tau/2)
plt.figure(figsize=(12,6))
plt.plot(y_test, color='red')
x = y_test.index.values
for i in range(N_PI):
y1 = q_hat.iloc[:,i]
y2 = q_hat.iloc[:,-1-i]
plt.fill_between(x, y1, y2, color='blue', alpha=str(1/N_PI))
y3 = (y2.values+y1.values)/2
#y3 is averaging the upper and lower quantiles, which is technically wrong since it isn't the median...
plt.title('RMSE: %.2f'% np.sqrt(sum((y3-y_test)**2)/len(y_test)));
plt.ylabel('Wind Power')
plt.xlabel('Time (hour)');
To which I get the following plot:
This is when I do tau = np.arange(0.49, 0.51, 0.01) and N_tau = len(tau)
Although this looks correct and is pretty much what I want, there is another problem! Every time I restart the entire Kernel, the RMSE keeps changing (gets smaller by a little bit each time, i.e. from 15,000 to 14950 to 14830, etc.).
I am so close to finishing this, I just want to be able to validate the accuracy of my QRNN by comparing the median quantile with the actual wind power, but this part (which is probably a simple fix) is giving me a lot of trouble. Please help!
Related
I am triyng to use scipy curve_fit function to fit a gaussian function to my data to estimate a theoretical power spectrum density. While doing so, the curve_fit function always return the initial parameters (p0=[1,1,1]) , thus telling me that the fitting didn't work.
I don't know where the issue is. I am using python 3.9 (spyder 5.1.5) from the anaconda distribution on windows 11.
here a Wetransfer link to the data file
https://wetransfer.com/downloads/6097ebe81ee0c29ee95a497128c1c2e420220704110130/86bf2d
Here is my code below. Can someone tell me what the issue is, and how can i solve it?
on the picture of the plot, the blue plot is my experimental PSD and the orange one is the result of the fit.
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import scipy.constants as cst
File = np.loadtxt('test5.dat')
X = File[:, 1]
Y = File[:, 2]
f_sample = 50000
time=[]
for i in range(1,len(X)+1):
t=i*(1/f_sample)
time= np.append(time,t)
N = X.shape[0] # number of observation
N1=int(N/2)
delta_t = time[2] - time[1]
T_mes = N * delta_t
freq = np.arange(1/T_mes, (N+1)/T_mes, 1/T_mes)
freq=freq[0:N1]
fNyq = f_sample/2 # Nyquist frequency
nb = 350
freq_block = []
# discrete fourier tansform
X_ft = delta_t*np.fft.fft(X, n=N)
X_ft=X_ft[0:N1]
plt.figure()
plt.plot(time, X)
plt.xlabel('t [s]')
plt.ylabel('x [micro m]')
# Experimental power spectrum on both raw and blocked data
PSD_X_exp = (np.abs(X_ft)**2/T_mes)
PSD_X_exp_b = []
STD_PSD_X_exp_b = []
for i in range(0, N1+2, nb):
freq_b = np.array(freq[i:i+nb]) # i-nb:i
psd_b = np.array(PSD_X_exp[i:i+nb])
freq_block = np.append(freq_block, (1/nb)*np.sum(freq_b))
PSD_X_exp_b = np.append(PSD_X_exp_b, (1/nb)*np.sum(psd_b))
STD_PSD_X_exp_b = np.append(STD_PSD_X_exp_b, PSD_X_exp_b/np.sqrt(nb))
plt.figure()
plt.loglog(freq, PSD_X_exp)
plt.legend(['Raw Experimental PSD'])
plt.xlabel('f [Hz]')
plt.ylabel('PSD')
plt.figure()
plt.loglog(freq_block, PSD_X_exp_b)
plt.legend(['Experimental PSD after blocking'])
plt.xlabel('f [Hz]')
plt.ylabel('PSD')
kB = cst.k # Boltzmann constant [m^2kg/s^2K]
T = 273.15 + 25 # Temperature [K]
r = (2.8 / 2) * 1e-6 # Particle radius [m]
v = 0.00002414 * 10 ** (247.8 / (-140 + T)) # Water viscosity [Pa*s]
gamma = np.pi * 6 * r * v # [m*Pa*s]
Do = kB*T/gamma # expected value for D
f3db_o = 50000 # expected value for f3db
fc_o = 300 # expected value pour fc
n = np.arange(-10,11)
def theo_spectrum_lorentzian_filter(x, D_, fc_, f3db_):
PSD_theo=[]
for i in range(0,len(x)):
# print(i)
psd_theo=np.sum((((D_*Do)/2*math.pi**2)/((fc_*fc_o)**2+(x[i]+n*f_sample)
** 2))*(1/(1+((x[i]+n*f_sample)/(f3db_*f3db_o))**2)))
PSD_theo= np.append(PSD_theo,psd_theo)
return PSD_theo
popt, pcov = curve_fit(theo_spectrum_lorentzian_filter, freq_block, PSD_X_exp_b, p0=[1, 1, 1], sigma=STD_PSD_X_exp_b, absolute_sigma=True, check_finite=True,bounds=(0.1, 10), method='trf', jac=None)
D_, fc_, f3db_ = popt
D1 = D_*Do
fc1 = fc_*fc_o
f3db1 = f3db_*f3db_o
print('Diffusion constant D = ', D1, ' Corner frequency fc= ',fc1, 'f3db(diode,eff)= ', f3db1)
I believe I've successfully fitted your data. Here's the approach I took.
First, I plotted your model (with popt=[1, 1, 1]) and the data you had. I noticed your data was significantly lower than the model. Then I started fiddling with the parameters. I wanted to push the model upwards. I did that by multiplying popt[0] by increasingly large values. I ended up with 1E13 as a ballpark value. Note that I have no idea if this is physically possible for your model. Then I jury-rigged your fitting function to multiply D_ by 1E13 and ran your code. I got this fit:
So I believe it's a problem of 1) inappropriate starting values and 2) inappropriate bounds. In your position, I would revise this model, check if there's any problems with units and so on.
Here's what I used to try to fit your model:
plt.figure()
plt.loglog(freq_block[:170], PSD_X_exp_b[:170], label='Exp')
plt.loglog(freq_block[:170],
theo_spectrum_lorentzian_filter(
freq_block[:170],
1E13*popt[0], popt[1], popt[2]),
label='model'
)
plt.xlabel('f [Hz]')
plt.ylabel('PSD')
plt.legend()
I limited the data to point 170 because there were some weird backwards values that made me uncomfortable. I would recheck them if I were you.
Here's the model code I used. I didn't change the curve_fit call (except to limit x to :170.
def theo_spectrum_lorentzian_filter(x, D_, fc_, f3db_):
PSD_theo=[]
D_ = 1E13*D_ # I only changed here
for i in range(0,len(x)):
psd_theo=np.sum((((D_*Do)/2*math.pi**2)/((fc_*fc_o)**2+(x[i]+n*f_sample)
** 2))*(1/(1+((x[i]+n*f_sample)/(f3db_*f3db_o))**2)))
PSD_theo= np.append(PSD_theo,psd_theo)
return PSD_theo
I'm trying to implement emcee MCMC sampling in Python with a predefined likelihood function to find the best boundary between two populations of data.
For emcee see: http://dfm.io/emcee/current/user/line/
The likelihood function calculates the true positive and true negative classifications, given some linear boundary line, and is used to minimise the difference between the two values whilst maximising their sum.
This way you can imagine a TP and TN rate of 1 respectively will give a likelihood value of 1 while TP and TN rates of 0 will return a likelihood value of 0.
But when I attempt to sample the parameter space for m and b, the gradient and offset (or bias), for the boundary line, I get some wildly big and/or small values for the walks.
I have put an example code below which generates some nicely divided populations and then MCMCs around the initial guesses of the parameter values. I'm unsure as to why the MCMC chains don't converge nicely to an appropriate value here so any help would be greatly appreciated.
The following code should run out-of-the-box.
import emcee
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
#generate some test x and y data
folded_xy_train = np.random.uniform(0,1,10000) #test x data
folded_z_train = np.random.uniform(0,1,10000) #test y data
#define the true gradient and offset for the boundary line
m_true, b_true = 5,-2.5
#generate labels for the test data
rounded_labels_train = np.ones(len(folded_z_train))
model = (m_true*folded_xy_train) + b_true
difference = model - folded_z_train
rounded_labels_train[difference<0] = 0
#show the test data
plt.figure()
plt.scatter(folded_xy_train,folded_z_train,c=rounded_labels_train,s=1.0)
#define a likelihood function for the boundary line
def lnlike(theta, x, y, labels):
m, b = theta
model = (m*x) + b
difference = model - y
classifications = np.ones(len(y))
classifications[difference<0]=0
cfm = confusion_matrix(labels,classifications)
cm = cfm.astype('float') / cfm.sum(axis=1)[:, np.newaxis]
tn, fp, fn, tp = cm.ravel()
likelihood_val = (0.5*(tp+tn))/(1+np.abs(tp-tn))
ln_like = -np.log(likelihood_val)
return ln_like
#define a wide flat prior
def lnprior(theta):
m, b, = theta
if 0 < m < 10 and -20 < b < 5:
return 0.0
return -np.inf
#define the posterior
def lnprob(p, x, y, labels):
lp = lnprior(p)
if not np.isfinite(lp):
return 0
return lp + lnlike(p, x, y, labels)
#setup the MCMC sampling
nwalkers = 4
ndim = 2
p0 = np.array([4.2,-2]) + [np.random.rand(ndim) for i in range(nwalkers)]
sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(folded_xy_train, folded_z_train, rounded_labels_train))
sampler.run_mcmc(p0, 500)
#extract the MCMC paramater value chains
samples = sampler.chain[:, 50:, :].reshape((-1, ndim))
#view the parameter chains
plt.figure()
plt.subplot(211)
plt.plot(samples[:,0])
plt.subplot(212)
plt.plot(samples[:,1])
The initial test data, showing an obvious boundary line for given x y data (coloured by binary class label):
The sample walks, showing strange sampling for the gradient parameter (top) and offset parameter (bottom). The x-axis denotes the MCMC walk step number and the y-axis denotes the MCMC parameter values at a given step:
I'm trying to recreate a plot from An Introduction to Statistical Learning and I'm having trouble figuring out how to calculate the confidence interval for a probability prediction. Specifically, I'm trying to recreate the right-hand panel of this figure (figure 7.1) which is predicting the probability that wage>250 based on a degree 4 polynomial of age with associated 95% confidence intervals. The wage data is here if anyone cares.
I can predict and plot the predicted probabilities fine with the following code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
wage = pd.read_csv('../../data/Wage.csv', index_col=0)
wage['wage250'] = 0
wage.loc[wage['wage'] > 250, 'wage250'] = 1
poly = Polynomialfeatures(degree=4)
age = poly.fit_transform(wage['age'].values.reshape(-1, 1))
logit = sm.Logit(wage['wage250'], age).fit()
age_range_poly = poly.fit_transform(np.arange(18, 81).reshape(-1, 1))
y_proba = logit.predict(age_range_poly)
plt.plot(age_range_poly[:, 1], y_proba)
But I'm at a loss as to how the confidence intervals of the predicted probabilities are calculated. I have thought about bootstrapping the data many times to get the distribution of probabilities for each age but I know there is an easier way which is just beyond my grasp.
I have the estimated coefficient covariance matrix and the standard errors associated with each estimated coefficient. How would I go about calculating the confidence intervals as shown in the right-hand panel of the figure above given this information?
Thanks!
You can use delta method to find approximate variance for predicted probability. Namely,
var(proba) = np.dot(np.dot(gradient.T, cov), gradient)
where gradient is the vector of derivatives of predicted probability by model coefficients, and cov is the covariance matrix of coefficients.
Delta method is proven to work asymptotically for all maximum likelihood estimates. However, if you have a small training sample, asymptotic methods may not work well, and you should consider bootstrapping.
Here is a toy example of applying delta method to logistic regression:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
# generate data
np.random.seed(1)
x = np.arange(100)
y = (x * 0.5 + np.random.normal(size=100,scale=10)>30)
# estimate the model
X = sm.add_constant(x)
model = sm.Logit(y, X).fit()
proba = model.predict(X) # predicted probability
# estimate confidence interval for predicted probabilities
cov = model.cov_params()
gradient = (proba * (1 - proba) * X.T).T # matrix of gradients for each observation
std_errors = np.array([np.sqrt(np.dot(np.dot(g, cov), g)) for g in gradient])
c = 1.96 # multiplier for confidence interval
upper = np.maximum(0, np.minimum(1, proba + std_errors * c))
lower = np.maximum(0, np.minimum(1, proba - std_errors * c))
plt.plot(x, proba)
plt.plot(x, lower, color='g')
plt.plot(x, upper, color='g')
plt.show()
It draws the following nice picture:
For your example the code would be
proba = logit.predict(age_range_poly)
cov = logit.cov_params()
gradient = (proba * (1 - proba) * age_range_poly.T).T
std_errors = np.array([np.sqrt(np.dot(np.dot(g, cov), g)) for g in gradient])
c = 1.96
upper = np.maximum(0, np.minimum(1, proba + std_errors * c))
lower = np.maximum(0, np.minimum(1, proba - std_errors * c))
plt.plot(age_range_poly[:, 1], proba)
plt.plot(age_range_poly[:, 1], lower, color='g')
plt.plot(age_range_poly[:, 1], upper, color='g')
plt.show()
and it would give the following picture
Looks pretty much like a boa-constrictor with an elephant inside.
You could compare it with the bootstrap estimates:
preds = []
for i in range(1000):
boot_idx = np.random.choice(len(age), replace=True, size=len(age))
model = sm.Logit(wage['wage250'].iloc[boot_idx], age[boot_idx]).fit(disp=0)
preds.append(model.predict(age_range_poly))
p = np.array(preds)
plt.plot(age_range_poly[:, 1], np.percentile(p, 97.5, axis=0))
plt.plot(age_range_poly[:, 1], np.percentile(p, 2.5, axis=0))
plt.show()
Results of delta method and bootstrap look pretty much the same.
Authors of the book, however, go the third way. They use the fact that
proba = np.exp(np.dot(x, params)) / (1 + np.exp(np.dot(x, params)))
and calculate confidence interval for the linear part, and then transform with the logit function
xb = np.dot(age_range_poly, logit.params)
std_errors = np.array([np.sqrt(np.dot(np.dot(g, cov), g)) for g in age_range_poly])
upper_xb = xb + c * std_errors
lower_xb = xb - c * std_errors
upper = np.exp(upper_xb) / (1 + np.exp(upper_xb))
lower = np.exp(lower_xb) / (1 + np.exp(lower_xb))
plt.plot(age_range_poly[:, 1], upper)
plt.plot(age_range_poly[:, 1], lower)
plt.show()
So they get the diverging interval:
These methods produce so different results because they assume different things (predicted probability and log-odds) being distributed normally. Namely, delta method assumes predicted probabilites are normal, and in the book, log-odds are normal. In fact, none of them are normal in finite samples, and they all converge to normal in infinite samples, but their variances converge to zero at the same time. Maximum likelihood estimates are insensitive to reparametrization, but their estimated distribution is, and that's the problem.
Here is an instructive and efficient method to calculate the standard errors ('se') of the fit ('mean_se') and single observations ('obs_se') on top of a statsmodels Logit().fit() object ('fit'), identical to the method in the book ISLR and the last method from the answer by David Dale:
fit_mean = fit.model.exog.dot(fit.params)
fit_mean_se = ((fit.model.exog*fit.model.exog.dot(fit.cov_params())).sum(axis=1))**0.5
fit_obs_se = ( ((fit.model.endog-fit_mean).std(ddof=fit.params.shape[0]))**2 + \
fit_mean_se**2 )**0.5
A figure similar to the one in the book ISLR
The shaded regions represent the 95% confidence intervals for the fit and single observations.
Ideas for improvement are most welcome.
I am able to get a ROC curve using scikit-learn with
fpr, tpr, thresholds = metrics.roc_curve(y_true,y_pred, pos_label=1), where y_true is a list of values based on my gold standard (i.e., 0 for negative and 1 for positive cases) and y_pred is a corresponding list of scores (e.g., 0.053497243, 0.008521122, 0.022781548, 0.101885263, 0.012913795, 0.0, 0.042881547 [...])
I am trying to figure out how to add confidence intervals to that curve, but didn't find any easy way to do that with sklearn.
You can bootstrap the ROC computations (sample with replacement new versions of y_true / y_pred out of the original y_true / y_pred and recompute a new value for roc_curve each time) and the estimate a confidence interval this way.
To take the variability induced by the train test split into account, you can also use the ShuffleSplit CV iterator many times, fit a model on the train split, generate y_pred for each model and thus gather an empirical distribution of roc_curves as well and finally compute confidence intervals for those.
Edit: bootstrapping in python
Here is an example for bootstrapping the ROC AUC score out of the predictions of a single model. I chose to bootstrap the ROC AUC to make it easier to follow as a Stack Overflow answer, but it can be adapted to bootstrap the whole curve instead:
import numpy as np
from scipy.stats import sem
from sklearn.metrics import roc_auc_score
y_pred = np.array([0.21, 0.32, 0.63, 0.35, 0.92, 0.79, 0.82, 0.99, 0.04])
y_true = np.array([0, 1, 0, 0, 1, 1, 0, 1, 0 ])
print("Original ROC area: {:0.3f}".format(roc_auc_score(y_true, y_pred)))
n_bootstraps = 1000
rng_seed = 42 # control reproducibility
bootstrapped_scores = []
rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
# bootstrap by sampling with replacement on the prediction indices
indices = rng.randint(0, len(y_pred), len(y_pred))
if len(np.unique(y_true[indices])) < 2:
# We need at least one positive and one negative sample for ROC AUC
# to be defined: reject the sample
continue
score = roc_auc_score(y_true[indices], y_pred[indices])
bootstrapped_scores.append(score)
print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
You can see that we need to reject some invalid resamples. However on real data with many predictions this is a very rare event and should not impact the confidence interval significantly (you can try to vary the rng_seed to check).
Here is the histogram:
import matplotlib.pyplot as plt
plt.hist(bootstrapped_scores, bins=50)
plt.title('Histogram of the bootstrapped ROC AUC scores')
plt.show()
Note that the resampled scores are censored in the [0 - 1] range causing a high number of scores in the last bin.
To get a confidence interval one can sort the samples:
sorted_scores = np.array(bootstrapped_scores)
sorted_scores.sort()
# Computing the lower and upper bound of the 90% confidence interval
# You can change the bounds percentiles to 0.025 and 0.975 to get
# a 95% confidence interval instead.
confidence_lower = sorted_scores[int(0.05 * len(sorted_scores))]
confidence_upper = sorted_scores[int(0.95 * len(sorted_scores))]
print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
confidence_lower, confidence_upper))
which gives:
Confidence interval for the score: [0.444 - 1.0]
The confidence interval is very wide but this is probably a consequence of my choice of predictions (3 mistakes out of 9 predictions) and the total number of predictions is quite small.
Another remark on the plot: the scores are quantized (many empty histogram bins). This is a consequence of the small number of predictions. One could introduce a bit of Gaussian noise on the scores (or the y_pred values) to smooth the distribution and make the histogram look better. But then the choice of the smoothing bandwidth is tricky.
Finally as stated earlier this confidence interval is specific to you training set. To get a better estimate of the variability of the ROC induced by your model class and parameters, you should do iterated cross-validation instead. However this is often much more costly as you need to train a new model for each random train / test split.
EDIT: since I first wrote this reply, there is a bootstrap implementation in scipy directly:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html
DeLong Solution
[NO bootstrapping]
As some of here suggested, the pROC package in R comes very handy for ROC AUC confidence intervals out-of-the-box, but that packages is not found in python. According to pROC documentation, confidence intervals are calculated via DeLong:
DeLong is an asymptotically exact method to evaluate the uncertainty
of an AUC (DeLong et al. (1988)). Since version 1.9, pROC uses the
algorithm proposed by Sun and Xu (2014) which has an O(N log N)
complexity and is always faster than bootstrapping. By default, pROC
will choose the DeLong method whenever possible.
And luckily for us, Yandex Data School has a Fast DeLong implementation on their public repo:
https://github.com/yandexdataschool/roc_comparison
So all credits to them for the DeLong implementation used in this example.
So here is how you get a CI via DeLong:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 6 10:06:52 2018
#author: yandexdataschool
Original Code found in:
https://github.com/yandexdataschool/roc_comparison
updated: Raul Sanchez-Vazquez
"""
import numpy as np
import scipy.stats
from scipy import stats
# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(x):
"""Computes midranks.
Args:
x - a 1D numpy array
Returns:
array of midranks
"""
J = np.argsort(x)
Z = x[J]
N = len(x)
T = np.zeros(N, dtype=np.float)
i = 0
while i < N:
j = i
while j < N and Z[j] == Z[i]:
j += 1
T[i:j] = 0.5*(i + j - 1)
i = j
T2 = np.empty(N, dtype=np.float)
# Note(kazeevn) +1 is due to Python using 0-based indexing
# instead of 1-based in the AUC formula in the paper
T2[J] = T + 1
return T2
def compute_midrank_weight(x, sample_weight):
"""Computes midranks.
Args:
x - a 1D numpy array
Returns:
array of midranks
"""
J = np.argsort(x)
Z = x[J]
cumulative_weight = np.cumsum(sample_weight[J])
N = len(x)
T = np.zeros(N, dtype=np.float)
i = 0
while i < N:
j = i
while j < N and Z[j] == Z[i]:
j += 1
T[i:j] = cumulative_weight[i:j].mean()
i = j
T2 = np.empty(N, dtype=np.float)
T2[J] = T
return T2
def fastDeLong(predictions_sorted_transposed, label_1_count, sample_weight):
if sample_weight is None:
return fastDeLong_no_weights(predictions_sorted_transposed, label_1_count)
else:
return fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight)
def fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight):
"""
The fast version of DeLong's method for computing the covariance of
unadjusted AUC.
Args:
predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
sorted such as the examples with label "1" are first
Returns:
(AUC value, DeLong covariance)
Reference:
#article{sun2014fast,
title={Fast Implementation of DeLong's Algorithm for
Comparing the Areas Under Correlated Receiver Oerating Characteristic Curves},
author={Xu Sun and Weichao Xu},
journal={IEEE Signal Processing Letters},
volume={21},
number={11},
pages={1389--1393},
year={2014},
publisher={IEEE}
}
"""
# Short variables are named as they are in the paper
m = label_1_count
n = predictions_sorted_transposed.shape[1] - m
positive_examples = predictions_sorted_transposed[:, :m]
negative_examples = predictions_sorted_transposed[:, m:]
k = predictions_sorted_transposed.shape[0]
tx = np.empty([k, m], dtype=np.float)
ty = np.empty([k, n], dtype=np.float)
tz = np.empty([k, m + n], dtype=np.float)
for r in range(k):
tx[r, :] = compute_midrank_weight(positive_examples[r, :], sample_weight[:m])
ty[r, :] = compute_midrank_weight(negative_examples[r, :], sample_weight[m:])
tz[r, :] = compute_midrank_weight(predictions_sorted_transposed[r, :], sample_weight)
total_positive_weights = sample_weight[:m].sum()
total_negative_weights = sample_weight[m:].sum()
pair_weights = np.dot(sample_weight[:m, np.newaxis], sample_weight[np.newaxis, m:])
total_pair_weights = pair_weights.sum()
aucs = (sample_weight[:m]*(tz[:, :m] - tx)).sum(axis=1) / total_pair_weights
v01 = (tz[:, :m] - tx[:, :]) / total_negative_weights
v10 = 1. - (tz[:, m:] - ty[:, :]) / total_positive_weights
sx = np.cov(v01)
sy = np.cov(v10)
delongcov = sx / m + sy / n
return aucs, delongcov
def fastDeLong_no_weights(predictions_sorted_transposed, label_1_count):
"""
The fast version of DeLong's method for computing the covariance of
unadjusted AUC.
Args:
predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
sorted such as the examples with label "1" are first
Returns:
(AUC value, DeLong covariance)
Reference:
#article{sun2014fast,
title={Fast Implementation of DeLong's Algorithm for
Comparing the Areas Under Correlated Receiver Oerating
Characteristic Curves},
author={Xu Sun and Weichao Xu},
journal={IEEE Signal Processing Letters},
volume={21},
number={11},
pages={1389--1393},
year={2014},
publisher={IEEE}
}
"""
# Short variables are named as they are in the paper
m = label_1_count
n = predictions_sorted_transposed.shape[1] - m
positive_examples = predictions_sorted_transposed[:, :m]
negative_examples = predictions_sorted_transposed[:, m:]
k = predictions_sorted_transposed.shape[0]
tx = np.empty([k, m], dtype=np.float)
ty = np.empty([k, n], dtype=np.float)
tz = np.empty([k, m + n], dtype=np.float)
for r in range(k):
tx[r, :] = compute_midrank(positive_examples[r, :])
ty[r, :] = compute_midrank(negative_examples[r, :])
tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
v01 = (tz[:, :m] - tx[:, :]) / n
v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
sx = np.cov(v01)
sy = np.cov(v10)
delongcov = sx / m + sy / n
return aucs, delongcov
def calc_pvalue(aucs, sigma):
"""Computes log(10) of p-values.
Args:
aucs: 1D array of AUCs
sigma: AUC DeLong covariances
Returns:
log10(pvalue)
"""
l = np.array([[1, -1]])
z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)
def compute_ground_truth_statistics(ground_truth, sample_weight):
assert np.array_equal(np.unique(ground_truth), [0, 1])
order = (-ground_truth).argsort()
label_1_count = int(ground_truth.sum())
if sample_weight is None:
ordered_sample_weight = None
else:
ordered_sample_weight = sample_weight[order]
return order, label_1_count, ordered_sample_weight
def delong_roc_variance(ground_truth, predictions, sample_weight=None):
"""
Computes ROC AUC variance for a single set of predictions
Args:
ground_truth: np.array of 0 and 1
predictions: np.array of floats of the probability of being class 1
"""
order, label_1_count, ordered_sample_weight = compute_ground_truth_statistics(
ground_truth, sample_weight)
predictions_sorted_transposed = predictions[np.newaxis, order]
aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count, ordered_sample_weight)
assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
return aucs[0], delongcov
alpha = .95
y_pred = np.array([0.21, 0.32, 0.63, 0.35, 0.92, 0.79, 0.82, 0.99, 0.04])
y_true = np.array([0, 1, 0, 0, 1, 1, 0, 1, 0 ])
auc, auc_cov = delong_roc_variance(
y_true,
y_pred)
auc_std = np.sqrt(auc_cov)
lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2)
ci = stats.norm.ppf(
lower_upper_q,
loc=auc,
scale=auc_std)
ci[ci > 1] = 1
print('AUC:', auc)
print('AUC COV:', auc_cov)
print('95% AUC CI:', ci)
output:
AUC: 0.8
AUC COV: 0.028749999999999998
95% AUC CI: [0.46767194, 1.]
I've also checked that this implementation matches the pROC results obtained from R:
library(pROC)
y_true = c(0, 1, 0, 0, 1, 1, 0, 1, 0)
y_pred = c(0.21, 0.32, 0.63, 0.35, 0.92, 0.79, 0.82, 0.99, 0.04)
# Build a ROC object and compute the AUC
roc = roc(y_true, y_pred)
roc
output:
Call:
roc.default(response = y_true, predictor = y_pred)
Data: y_pred in 5 controls (y_true 0) < 4 cases (y_true 1).
Area under the curve: 0.8
Then
# Compute the Confidence Interval
ci(roc)
output
95% CI: 0.4677-1 (DeLong)
I do this linear regression with StatsModels:
import numpy as np
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
n = 100
x = np.linspace(0, 10, n)
e = np.random.normal(size=n)
y = 1 + 0.5*x + 2*e
X = sm.add_constant(x)
re = sm.OLS(y, X).fit()
print(re.summary())
prstd, iv_l, iv_u = wls_prediction_std(re)
My questions are, iv_l and iv_u are the upper and lower confidence intervals or prediction intervals?
How I get others?
I need the confidence and prediction intervals for all points, to do a plot.
For test data you can try to use the following.
predictions = result.get_prediction(out_of_sample_df)
predictions.summary_frame(alpha=0.05)
I found the summary_frame() method buried here and you can find the get_prediction() method here. You can change the significance level of the confidence interval and prediction interval by modifying the "alpha" parameter.
I am posting this here because this was the first post that comes up when looking for a solution for confidence & prediction intervals – even though this concerns itself with test data rather.
Here's a function to take a model, new data, and an arbitrary quantile, using this approach:
def ols_quantile(m, X, q):
# m: OLS model.
# X: X matrix.
# q: Quantile.
#
# Set alpha based on q.
a = q * 2
if q > 0.5:
a = 2 * (1 - q)
predictions = m.get_prediction(X)
frame = predictions.summary_frame(alpha=a)
if q > 0.5:
return frame.obs_ci_upper
return frame.obs_ci_lower
update see the second answer which is more recent. Many of the models and results classes have now a get_prediction method that provides additional information including prediction intervals and/or confidence intervals for the predicted mean.
old answer:
iv_l and iv_u give you the limits of the prediction interval for each point.
Prediction interval is the confidence interval for an observation and includes the estimate of the error.
I think, confidence interval for the mean prediction is not yet available in statsmodels.
(Actually, the confidence interval for the fitted values is hiding inside the summary_table of influence_outlier, but I need to verify this.)
Proper prediction methods for statsmodels are on the TODO list.
Addition
Confidence intervals are there for OLS but the access is a bit clumsy.
To be included after running your script:
from statsmodels.stats.outliers_influence import summary_table
st, data, ss2 = summary_table(re, alpha=0.05)
fittedvalues = data[:, 2]
predict_mean_se = data[:, 3]
predict_mean_ci_low, predict_mean_ci_upp = data[:, 4:6].T
predict_ci_low, predict_ci_upp = data[:, 6:8].T
# Check we got the right things
print np.max(np.abs(re.fittedvalues - fittedvalues))
print np.max(np.abs(iv_l - predict_ci_low))
print np.max(np.abs(iv_u - predict_ci_upp))
plt.plot(x, y, 'o')
plt.plot(x, fittedvalues, '-', lw=2)
plt.plot(x, predict_ci_low, 'r--', lw=2)
plt.plot(x, predict_ci_upp, 'r--', lw=2)
plt.plot(x, predict_mean_ci_low, 'r--', lw=2)
plt.plot(x, predict_mean_ci_upp, 'r--', lw=2)
plt.show()
This should give the same results as SAS, http://jpktd.blogspot.ca/2012/01/nice-thing-about-seeing-zeros.html
With time series results, you get a much smoother plot using the get_forecast() method. An example of time series is below:
# Seasonal Arima Modeling, no exogenous variable
model = SARIMAX(train['MI'], order=(1,1,1), seasonal_order=(1,1,0,12), enforce_invertibility=True)
results = model.fit()
results.summary()
The next step is to make the predictions, this generates the confidence intervals.
# make the predictions for 11 steps ahead
predictions_int = results.get_forecast(steps=11)
predictions_int.predicted_mean
These can be put in a data frame but need some cleaning up:
# get a better view
predictions_int.conf_int()
Concatenate the data frame, but clean up the headers
conf_df = pd.concat([test['MI'],predictions_int.predicted_mean, predictions_int.conf_int()], axis = 1)
conf_df.head()
Then we rename the columns.
conf_df = conf_df.rename(columns={0: 'Predictions', 'lower MI': 'Lower CI', 'upper MI': 'Upper CI'})
conf_df.head()
Make the plot.
# make a plot of model fit
# color = 'skyblue'
fig = plt.figure(figsize = (16,8))
ax1 = fig.add_subplot(111)
x = conf_df.index.values
upper = conf_df['Upper CI']
lower = conf_df['Lower CI']
conf_df['MI'].plot(color = 'blue', label = 'Actual')
conf_df['Predictions'].plot(color = 'orange',label = 'Predicted' )
upper.plot(color = 'grey', label = 'Upper CI')
lower.plot(color = 'grey', label = 'Lower CI')
# plot the legend for the first plot
plt.legend(loc = 'lower left', fontsize = 12)
# fill between the conf intervals
plt.fill_between(x, lower, upper, color='grey', alpha='0.2')
plt.ylim(1000,3500)
plt.show()
You can get the prediction intervals by using LRPI() class from the Ipython notebook in my repo (https://github.com/shahejokarian/regression-prediction-interval).
You need to set the t value to get the desired confidence interval for the prediction values, otherwise the default is 95% conf. interval.
The LRPI class uses sklearn.linear_model's LinearRegression , numpy and pandas libraries.
There is an example shown in the notebook too.
summary_frame and summary_table work well when you need exact results for a single quantile, but don't vectorize well. This will provide a normal approximation of the prediction interval (not confidence interval) and works for a vector of quantiles:
def ols_quantile(m, X, q):
# m: Statsmodels OLS model.
# X: X matrix of data to predict.
# q: Quantile.
#
from scipy.stats import norm
mean_pred = m.predict(X)
se = np.sqrt(m.scale)
return mean_pred + norm.ppf(q) * se
To add to Max Ghenis' response here - you can use .get_prediction() to generate confidence intervals, not just prediction intervals, by using .conf_int() after.
predictions = result.get_prediction(out_of_sample_df)
predictions.conf_int(alpha = 0.05)
You can calculate them based on results given by statsmodel and the normality assumptions.
Here is an example for OLS and CI for the mean value:
import statsmodels.api as sm
import numpy as np
from scipy import stats
#Significance level:
sl = 0.05
#Evaluate mean value at a required point x0. Here, at the point (0.0,2.0) for N_model=2:
x0 = np.asarray([1.0, 0.0, 2.0])# If you have no constant in your model, remove the first 1.0. For more dimensions, add the desired values.
#Get an OLS model based on output y and the prepared vector X (as in your notation):
model = sm.OLS(endog = y, exog = X )
results = model.fit()
#Get two-tailed t-values:
(t_minus, t_plus) = stats.t.interval(alpha = (1.0 - sl), df = len(results.resid) - len(x0) )
y_value_at_x0 = np.dot(results.params, x0)
lower_bound = y_value_at_x0 + t_minus*np.sqrt(results.mse_resid*( np.dot(np.dot(x0.T,results.normalized_cov_params),x0) ))
upper_bound = y_value_at_x0 + t_plus*np.sqrt(results.mse_resid*( np.dot(np.dot(x0.T,results.normalized_cov_params),x0) ))
You can wrap a nice function around this with input results, point x0 and significance level sl.
I am unsure now if you can use this for WLS() since there are extra things happening there.
Ref: Ch3 in [D.C. Montgomery and E.A. Peck. “Introduction to Linear Regression Analysis.” 4th. Ed., Wiley, 1992].