Different results in computing KL Divergence using Pytorch Distributions vs manually

Different results in computing KL Divergence using Pytorch Distributions vs manually - python

I noticed the KL-Divergence term KL(Q(x)||P(x)) is computed differently when using
mean(Q(x)*(log Q(x) - log P(x)))
vs
torch.distributions.kl_divergence(Q, P)
where
Q = torch.distributions.Normal(some mean, some sigma)
P = torch.distributions.Normal(0, 1)
and when I plot the KL-divergence losses, I get these two similar but different plots:
here
Can anyone point out what is causing this difference?
The full code is below:
import numpy as np
import torch
import torch.distributions as dist
import matplotlib.pyplot as plt
def kl_1(log_qx, log_px):
    """Return the density-weighted mean of the log-ratio (question's version).

    inputs: [B, z_dim] torch

    Computes ``mean(q(x) * (log q(x) - log p(x)))``.

    NOTE: if the inputs are evaluated at points sampled from Q, the extra
    ``q(x)`` factor weights each term by the density a second time, so the
    result is not the sample-mean estimate of KL(Q || P).  It is kept as
    posted because this discrepancy is what the question is about.
    """
    density = log_qx.exp()
    return (density * (log_qx - log_px)).mean()
# Compare the hand-written estimate with torch's closed-form KL while the
# mean of Q sweeps across [-5, 5).
# ground-truth (target) P(x)
P = dist.Normal(0, 1)
mus = np.arange(-5, 5, 0.1)
sigma = 1
N = 100  # samples drawn from Q for each value of mu
kls = {"1": [], "2": []}
for mu in mus:
    # prediction (current) Q(x)
    Q = dist.Normal(mu, sigma)
    # sample from Q
    qx = Q.sample((N,))
    # log prob of the same samples under both distributions
    log_qx = Q.log_prob(qx)
    log_px = P.log_prob(qx)
    # kl 1: manual estimate from the samples
    kl1 = kl_1(log_qx, log_px)
    kls['1'].append(kl1.numpy())
    # kl 2: value returned by torch.distributions
    kl2 = dist.kl_divergence(Q, P)
    kls['2'].append(kl2.numpy())
plt.figure()
plt.scatter(mus, kls['1'], label="Q*(logQ-logP)")
plt.scatter(mus, kls['2'], label="kl_divergence")
plt.xlabel("mean of Q(x)")
plt.ylabel("computed KL Divergence")
plt.legend()
plt.show()

You weight the samples by the probability density only if you are computing the expected value as an integral over dx. If you are using samples drawn from the given distribution, you approximate the expected value directly as the sample mean; that corresponds to integrating with respect to d cq(x), where d cq(x) = q(x) dx, cq(x) is the cumulative distribution function, and q(x) is the probability density function of the variable Q.
import numpy as np
import torch
import torch.distributions as dist
import matplotlib.pyplot as plt
def kl_1(log_qx, log_px):
    """Return the mean log-ratio ``mean(log q(x) - log p(x))``.

    inputs: [B, z_dim] torch

    No density weight is applied: the caller is expected to evaluate both
    log-probabilities at points sampled from Q, so the plain mean already
    approximates the expectation under Q.
    """
    log_ratio = log_qx - log_px
    return log_ratio.mean()
# Same sweep as in the question, now using the sample-mean estimate.
# ground-truth (target) P(x)
P = dist.Normal(0, 1)
mus = np.arange(-5, 5, 0.1)
sigma = 1
N = 100  # samples drawn from Q for each value of mu
kls = {"1": [], "2": []}
for mu in mus:
    # prediction (current) Q(x)
    Q = dist.Normal(mu, sigma)
    # sample from Q
    qx = Q.sample((N,))
    # log prob of the same samples under both distributions
    log_qx = Q.log_prob(qx)
    log_px = P.log_prob(qx)
    # kl 1: manual estimate from the samples
    kl1 = kl_1(log_qx, log_px)
    kls['1'].append(kl1.numpy())
    # kl 2: value returned by torch.distributions
    kl2 = dist.kl_divergence(Q, P)
    kls['2'].append(kl2.numpy())
plt.figure()
# The label names the estimator actually plotted (no density weight).
plt.scatter(mus, kls['1'], label="mean(logQ-logP)")
plt.scatter(mus, kls['2'], label="kl_divergence")
plt.xlabel("mean of Q(x)")
plt.ylabel("computed KL Divergence")
plt.legend()
plt.show()

Related

Quantile residual Q-Q plot in python

I know how I can get Normal Q-Q plots in Python but how can I get quantile residual Q-Q plots?
I tried to do the three steps written here (Chapter 20.2.6.1):
First I tried to adapt this solution for use with smf.glm (I need to use smf because I have a huge dataframe with hundreds of variables I need to pass):
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
# generate some data to check
nobs = 1000
n, p = 50, 0.25
dist0 = stats.nbinom(n, p)
y = dist0.rvs(size=nobs)
x = np.ones(nobs)
df_test = pd.DataFrame({'y': y, 'x': x})
loglike_method = 'nb2'  # or use 'nb1'
#res = sm.NegativeBinomial(y, x, loglike_method=loglike_method).fit(start_params=[0.1, 0.1])
res = smf.glm(formula="y ~ x", data=df_test, family=sm.families.NegativeBinomial()).fit(start_params=[0.1, 0.1])
print(dist0.mean())
print(res.params)
mu = res.predict()  # use this for mean if not constant
mu = mu.mean()
#mu = np.exp(res.params[0]) # shortcut, we just regress on a constant
# NOTE(review): this takes the first fitted coefficient as the dispersion.
# For a formula-based GLM fit that entry is presumably a regression
# coefficient rather than an estimated alpha -- confirm, as this may be why
# the estimates below look wrong.
alpha = res.params.iloc[0]
# Q selects the variance form: 1 for 'nb1', 0 for 'nb2'.
if loglike_method == 'nb1':
    Q = 1
elif loglike_method == 'nb2':
    Q = 0
size = 1. / alpha * mu**Q
prob = size / (size + mu)
# The original format strings had no placeholders, so the values were
# silently dropped from the output.
print('data generating parameters: n={}, p={}'.format(n, p))
print('estimated params: size={}, prob={}'.format(size, prob))
#estimated distribution
dist_est = stats.nbinom(size, prob)
But the estimated parameters are totally off.
Next step would be to call stats.nbinom.cdf with those parameters to simulate values ...
Is this the right way?
And how can I get the correct values for size and prob from my fitted model?

Can one create a distribution characterizing the multiplication of two distributions in Python?

I have two distributions and I would like to know the properties of the multiplication of these distributions.
For example, if I had the distribution of properties velocity and time, I want the characteristics of the probability distribution of distance.
With reasonable estimates for the integration bounds, I can calculate the probability density function from the product of two random variables:
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
# Evaluation grids for time, speed and distance.
T, dt = np.linspace(0, 20, 201, retstep=True)
T = T[1:]  # avoid divide by zero below
V = np.linspace(0, 20, 201)
D = np.linspace(0, 120, 201)

P_t = stats.gamma(4, 1)  # probability distribution for time
P_v = stats.norm(8, 2)  # probability distribution for speed


def _distance_density(d):
    """Integrate the time and speed densities over T for one distance d."""
    integrand = P_t.pdf(T) * P_v.pdf(d / T) / T
    return np.trapz(integrand, dx=dt)


# complete integration
P_d = [_distance_density(d) for d in D]

plt.plot(T, P_t.pdf(T), label='time')
plt.plot(V, P_v.pdf(V), label='velocity')
plt.plot(D, P_d, label='distance')
plt.legend()
plt.ylabel('Probability density')
I would like to be able to compute things like P_d.sf(d), P_d.cdf(d), etc., for arbitrary values of d. Can I create a new distribution (perhaps using scipy.stats.rv_continuous) to characterize distance?

The solution took a bit of time to understand the rv_continuous. Cobbling together knowledge from a bunch of examples (I should have documented them--sorry) I think I got a working solution.
The only issue is that the domain needs to be known in advance, but I can work with that. If someone has ideas for how to fix that, please let me know.
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import scipy as sp
import scipy.integrate
import scipy.interpolate

interp1d = sp.interpolate.interp1d
# ``trapz`` was removed from recent SciPy releases in favour of
# ``trapezoid``; fall back to the old name on installs that predate it.
try:
    trapz = sp.integrate.trapezoid
except AttributeError:
    trapz = sp.integrate.trapz

# Time domain vector - needed in class
dt = 0.01
t_max = 10
T = np.arange(dt, t_max + dt, dt)
# Distance domain vector - needed in class
dd = 0.01
d_max = 30
D = np.arange(0, d_max + dd, dd)


class MultiplicativeModel(stats.rv_continuous):
    """Distribution of the product of a time and a velocity variable.

    The density is tabulated once on the module-level grid ``D`` by
    integrating ``Tmodel.pdf(t) * Vmodel.pdf(d / t) / t`` over the grid
    ``T``.  The pdf, cdf, sf and ppf are then served by linear
    interpolation of those tables, so the domain is fixed by ``T`` and ``D``.

    Parameters
    ----------
    Tmodel, Vmodel
        Objects exposing a ``pdf`` method that accepts NumPy arrays
        (frozen ``scipy.stats`` distributions in the example usage).
    *args, **kwargs
        Forwarded unchanged to ``stats.rv_continuous.__init__``.
    """

    def __init__(self, Tmodel, Vmodel, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.Tmodel = Tmodel  # The time-domain probability function
        self.Vmodel = Vmodel  # The velocity-domain probability function
        # Create vectors for interpolation of distributions.  Broadcasting
        # D against T evaluates every (distance, time) pair in one call
        # instead of one Python-level integration per distance.
        t_density = self.Tmodel.pdf(T)
        integrand = t_density * self.Vmodel.pdf(D[:, np.newaxis] / T) / T
        self.pdf_vec = trapz(integrand, dx=dt, axis=-1)
        self.cdf_vec = np.cumsum(self.pdf_vec) * dd
        self.sf_vec = 1 - self.cdf_vec
        # define key functions for rv_continuous class
        self._pdf = interp1d(D, self.pdf_vec, assume_sorted=True)
        self._sf = interp1d(D, self.sf_vec, assume_sorted=True)
        self._cdf = interp1d(D, self.cdf_vec, assume_sorted=True)
        # Extrapolation option below is necessary because sometimes rvs picks
        # a number really really close to 1 or 0 and this spits out an error
        # if it is outside of the interpolation range.
        self._ppf = interp1d(self.cdf_vec, D, assume_sorted=True,
                             fill_value='extrapolate')
        # Moments: n-th raw moment from the tabulated density.
        self._munp = lambda n, *args: trapz(self.pdf_vec * D ** n, dx=dd)
With the above defined, we get results like:
# Speed grid, used below only to draw the velocity PDF.
dv = 0.01
v_max = 10
V = np.arange(0, v_max + dv, dv)

model = MultiplicativeModel(
    stats.norm(3, 1),
    stats.uniform(loc=2, scale=2),
)

# test moments and stats functions
print(f'median: {model.median()}')
# median: 8.700970199181763
print(f'moments: {model.stats(moments = "mvsk")}')
#moments: (array(9.00872026), array(12.2315612), array(0.44131568), array(0.16819043))

plt.figure(figsize=(6, 4))
curves = [
    (T, model.Tmodel.pdf(T), 'Time PDF'),
    (V, model.Vmodel.pdf(V), 'Velocity PDF'),
    (D, model.pdf(D), 'Distance PDF'),
    (D, model.cdf(D), 'Distance CDF'),
    (D, model.sf(D), 'Distance SF'),
]
for grid, values, label in curves:
    plt.plot(grid, values, label=label)

# Overlay a histogram of random draws on the curves.
x = model.rvs(size=10**5)
plt.hist(x, bins=50, density=True, alpha=0.5, label='Sampled distribution')
plt.legend()
plt.xlim([0, 30])

How to calculate "relative error in the sum of squares" and "relative error in the approximate solution" from least squares method?

I have implemented a 3D gaussian fit using scipy.optimize.leastsq and now I would like to tweak the arguments ftol and xtol to optimize the performance. However, I don't understand the "units" of these two parameters well enough to make a proper choice. Is it possible to calculate these two parameters from the results? That would give me an understanding of how to choose them. My data is numpy arrays of np.uint8. I tried to read the FORTRAN source code of MINPACK but my FORTRAN knowledge is zero. I also checked the Levenberg-Marquardt algorithm, but I could not really get a number that was below the ftol, for example.
Here is a minimal example of what I do:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import leastsq
class gaussian_model:
    """1-D Gaussian model that records how much it changes between calls.

    Attributes
    ----------
    prev_iter_model : model values from the previous ``residuals`` call,
        or None before the first call.
    f : most recent relative-change value, or None until two calls exist.
    f_vals : every relative-change value, in call order.
    """

    def __init__(self):
        self.prev_iter_model = None
        # Defined up front so reading ``f`` after fewer than two
        # ``residuals`` calls gives None instead of raising AttributeError.
        self.f = None
        self.f_vals = []

    def gaussian_1D(self, coeffs, xx):
        """Evaluate ``A * exp(-(xx - mu)**2 / (2 * sigma**2))``.

        ``coeffs`` unpacks as ``(A, sigma, mu)``.
        """
        A, sigma, mu = coeffs
        # Shift so the peak sits at zero.
        x0 = xx - mu
        model = A*np.exp(-(x0**2)/(2*(sigma**2)))
        return model

    def residuals(self, coeffs, I_obs, xx, model_func):
        """Return ``I_obs - model_func(coeffs, xx)`` and log the model change.

        From the second call onwards, the summed squared relative difference
        between this call's model and the previous call's model is stored in
        ``self.f`` and appended to ``self.f_vals``.
        """
        model = model_func(coeffs, xx)
        residuals = I_obs - model
        if self.prev_iter_model is not None:
            self.f = np.sum(((model-self.prev_iter_model)/model)**2)
            self.f_vals.append(self.f)
        self.prev_iter_model = model
        return residuals
# Build noisy synthetic Gaussian data, fit it with leastsq, then print and
# plot the diagnostics collected by the model object.
# x data
x_start = 1
x_stop = 10
num = 100
xx, dx = np.linspace(x_start, x_stop, num, retstep=True)
# Simulated data with some noise
A, s_x, mu = 10, 0.5, 3
coeffs = [A, s_x, mu]
model = gaussian_model()
yy = model.gaussian_1D(coeffs, xx)
noise_ampl = 0.5
noise = np.random.normal(0, noise_ampl, size=num)
yy += noise
# LM Least squares
initial_guess = [1, 1, 1]
pred_coeffs, cov_x, info, mesg, ier = leastsq(model.residuals, initial_guess,
args=(yy, xx, model.gaussian_1D),
ftol=1E-6, full_output=True)
yy_fit = model.gaussian_1D(pred_coeffs, xx)
# rel_SSD compares the final fit with the noisy data, scaled by the data
# values; it is not a change between successive iterations.
rel_SSD = np.sum(((yy-yy_fit)/yy)**2)
RMS_SSD = np.sqrt(rel_SSD/num)
print(RMS_SSD)
print(model.f)
print(model.f_vals)
fig, ax = plt.subplots(1,2)
# Plot results
# Left axes: data and fitted curve. Right axes: the recorded f_vals by index.
ax[0].scatter(xx, yy)
ax[0].plot(xx, yy_fit, c='r')
ax[1].scatter(range(len(model.f_vals)), model.f_vals, c='r')
# ax[1].set_ylim(0, 1E-6)
plt.show()
rel_SSD is around 1 and definitely not something below ftol = 1E-6.
EDIT: Based on @user12750353's answer below, I updated my minimal example to try to recreate how lmdif determines termination with ftol. The problem is that my f_vals are too small, so they are not the right values. The reason I would like to recreate this is that I would like to see what kind of numbers I am getting in my main code, to decide on an ftol that would terminate the fitting process earlier.

Since you are giving a function without the gradient, the method called is lmdif. Instead of gradients it will use a forward-difference gradient estimate, f(x + delta) - f(x) ~ delta * df(x)/dx (I will write it as if there were a single parameter).
There you find the following description
c ftol is a nonnegative input variable. termination
c occurs when both the actual and predicted relative
c reductions in the sum of squares are at most ftol.
c therefore, ftol measures the relative error desired
c in the sum of squares.
c
c xtol is a nonnegative input variable. termination
c occurs when the relative error between two consecutive
c iterates is at most xtol. therefore, xtol measures the
c relative error desired in the approximate solution.
Looking in the code the actual reduction acred = 1 - (fnorm1/fnorm)**2 is what you calculated for rel_SSD, but between the two last iterations, not between the fitted function and the target points.
Example
The problem here is that we need to discover what are the values assumed by the internal variables. An attempt to do so is to save the coefficients and the residual norm every time the function is called as follows.
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import leastsq
class gaussian_model:
    """1-D Gaussian model that logs every evaluation made by the optimiser.

    Attributes
    ----------
    prev_iter_model : initialised to None; not used by the methods below.
    fnorm : Euclidean norm of the residual vector for each ``residuals`` call.
    x : copy of the coefficients passed to each ``residuals`` call.
    """

    def __init__(self):
        self.prev_iter_model = None
        self.fnorm = []
        self.x = []

    def gaussian_1D(self, coeffs, xx):
        """Return the Gaussian and its gradient with respect to ``coeffs``.

        ``coeffs`` unpacks as ``(A, sigma, mu)``.  The second return value
        has one row per point of ``xx`` and one column per coefficient, in
        that same order.
        """
        A, sigma, mu = coeffs
        # Shift so the peak sits at zero.
        x0 = xx - mu
        shape = np.exp(-(x0**2) / (2 * sigma**2))
        model = A * shape
        grad = np.array([
            # d/dA: use the unscaled exponential rather than model / A,
            # which is NaN when A == 0.
            shape,
            model * x0**2 / (sigma**3),
            model * x0 / (sigma**2),
        ]).transpose()
        return model, grad

    def residuals(self, coeffs, I_obs, xx, model_func):
        """Return ``I_obs - model`` and record the coefficients and norm."""
        model, grad = model_func(coeffs, xx)
        residuals = I_obs - model
        self.x.append(np.copy(coeffs))
        self.fnorm.append(np.sqrt(np.sum(residuals**2)))
        return residuals

    def grad(self, coeffs, I_obs, xx, model_func):
        """Return the Jacobian of the residuals with respect to ``coeffs``.

        The residual is ``I_obs - model``, so its derivative is the negated
        model gradient.  ``I_obs`` is unused but kept so the signature
        matches ``residuals``.
        """
        model, grad = model_func(coeffs, xx)
        return -grad

    def plot_progress(self):
        """Plot the relative change of x and of the residual norm per call."""
        x = np.array(self.x)
        dx = np.sqrt(np.sum(np.diff(x, axis=0)**2, axis=1))
        plt.plot(dx / np.sqrt(np.sum(x[1:, :]**2, axis=1)))
        fnorm = np.array(self.fnorm)
        plt.plot(1 - (fnorm[1:]/fnorm[:-1])**2)
        # Labels follow plotting order: the x curve first, then the f curve.
        # Raw strings keep the backslash in \Delta from being an escape.
        plt.legend([r'$||\Delta x||$', r'$||\Delta f||$'], loc='upper left')
# Build the noisy synthetic data set used by the fits below.
# x data
x_start = 1
x_stop = 10
num = 100
xx, dx = np.linspace(x_start, x_stop, num, retstep=True)
# Simulated data with some noise
A, s_x, mu = 10, 0.5, 3
coeffs = [A, s_x, mu]
model = gaussian_model()
# gaussian_1D returns (model, grad) here; only the model values are kept.
yy, _ = model.gaussian_1D(coeffs, xx)
noise_ampl = 0.5
noise = np.random.normal(0, noise_ampl, size=num)
yy += noise
Then we can see the relative variation of $x$ and $f$
# Fit without passing Dfun, then show the recorded progress next to the fit.
initial_guess = [1, 1, 1]
pred_coeffs, cov_x, info, mesg, ier = leastsq(model.residuals, initial_guess,
args=(yy, xx, model.gaussian_1D),
xtol=1e-6,
ftol=1e-6, full_output=True)
plt.figure(figsize=(14, 6))
# Left panel: recorded relative changes, on a log scale.
plt.subplot(121)
model.plot_progress()
plt.yscale('log')
plt.grid()
# Right panel: data and fitted curve.
plt.subplot(122)
yy_fit,_ = model.gaussian_1D(pred_coeffs, xx)
# Plot results
plt.scatter(xx, yy)
plt.plot(xx, yy_fit, c='r')
plt.show()
The problem with this is that the function is evaluated both to compute f and to estimate the gradient of f. To produce a cleaner plot, you can pass Dfun so that func is evaluated only once per iteration.
# Same data generation and fit as before, this time passing the model's
# grad method as Dfun.
# x data
x_start = 1
x_stop = 10
num = 100
xx, dx = np.linspace(x_start, x_stop, num, retstep=True)
# Simulated data with some noise
A, s_x, mu = 10, 0.5, 3
coeffs = [A, s_x, mu]
# A fresh model object starts with empty x / fnorm logs.
model = gaussian_model()
yy, _ = model.gaussian_1D(coeffs, xx)
noise_ampl = 0.5
noise = np.random.normal(0, noise_ampl, size=num)
yy += noise
# LM Least squares
initial_guess = [1, 1, 1]
pred_coeffs, cov_x, info, mesg, ier = leastsq(model.residuals, initial_guess,
args=(yy, xx, model.gaussian_1D),
Dfun=model.grad,
xtol=1e-6,
ftol=1e-6, full_output=True)
plt.figure(figsize=(14, 6))
# Left panel: recorded relative changes, on a log scale.
plt.subplot(121)
model.plot_progress()
plt.yscale('log')
plt.grid()
# Right panel: data and fitted curve.
plt.subplot(122)
yy_fit,_ = model.gaussian_1D(pred_coeffs, xx)
# Plot results
plt.scatter(xx, yy)
plt.plot(xx, yy_fit, c='r')
plt.show()
Well, the value I am obtaining for xtol is not exactly what is in the lmdif implementation.

Bayesian Calibration with PyMC3, Kennedy O'Hagan

I'm quite new to probabilistic programming and pymc3...
Currently, I want to implement the Kennedy-O’Hagan framework in pymc3.
The setup is according to the paper of Kennedy and O'Hagan as follows:
We have n observations zi of the form
zi = f(xi , theta) + g(xi) + ei,
where xi are known inputs, theta are unknown calibration parameters, and ei are iid error terms. We also have m model evaluations yj of the form
yj = f(x'j, thetaj), where both x'j (different than xi above) and thetaj are known. Therefore, the data consists of all zi and yj. In the paper, Kennedy-O'Hagan model f, g using gaussian processes:
f ~ GP{m1 (.,.), Sigma1[(.,.),(.,.)] }
g ~ GP{m2 (.), Sigma2[(.),(.)] }
Among other things, the goal is to get posterior samples for the unknown calibration parameters theta.
What I've done so far:
import pymc3 as pm
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import freeze_support
import sys
import theano
import theano.tensor as tt
from mpl_toolkits.mplot3d import Axes3D
import pyDOE
from scipy.stats.distributions import uniform
def physical_system(x):
    """Return the noise-free response ``0.65 * x / (1 + x / 5)``."""
    return 0.65 * x / (1 + x / 5)


def observation(x):
    """Return ``physical_system(x)`` plus Gaussian noise (mean 0, sd 0.01).

    One independent noise value is drawn per element of ``x``.
    """
    return physical_system(x[:]) + np.random.normal(0, 0.01, len(x))


def computational_system(input):
    """Return the row-wise product of the first two columns of ``input``."""
    return input[:, 0] * input[:, 1]
if __name__ == "__main__":
    freeze_support()
    # observations with noise
    x_obs = np.linspace(0, 4, 10)
    y_real = physical_system(x_obs[:])
    y_obs = observation(x_obs[:])
    # computation model: Latin-hypercube design over the two inputs
    N = 60
    design = pyDOE.lhs(2, samples=N, criterion='center')
    left = [-0.2, -0.2]
    right = [4.2, 1.2]
    # Map each design column through a uniform ppf on [left[i], right[i]].
    for i in range(2):
        design[:, i] = uniform(loc=left[i], scale=right[i] - left[i]).ppf(design[:, i])
    x_comp = design[:, 0][:, None]
    t_comp = design[:, 1][:, None]
    input_comp = np.hstack((x_comp, t_comp))
    y_comp = computational_system(input_comp)
    x_obs_shared = theano.shared(x_obs[:, None])
    with pm.Model() as model:
        noise = pm.HalfCauchy('noise', beta=5)
        ls_1 = pm.Gamma('ls_1', alpha=1, beta=1, shape=2)
        cov = pm.gp.cov.ExpQuad(2, ls=ls_1)
        f = pm.gp.Marginal(cov_func=cov)
        # train the gp f with data from computer model:
        f_0 = f.marginal_likelihood('f_0', X=input_comp, y=y_comp, noise=noise)
        trace = pm.sample(500, pm.Metropolis(), chains=4)
    # Drop the first 300 draws.
    burned_trace = trace[300:]
Until here, everything is fine. My GP f is trained according the computer model.
Now, I want to test if I could fit this trained GP to my observed data:
#gp f is now trained to data from computer model
#now I want to fit this trained gp to observed data and find posterior for theta
# This continues the script above; when both live in one file, this block
# belongs inside the same ``if __name__ == "__main__":`` guard.
with model:
    sd = pm.Gamma('eta', alpha=1, beta=1)
    theta = pm.Normal('theta', mu=0, sd=sd)
    sigma = pm.Gamma('sigma', alpha=1, beta=1)
    # Append theta, tiled once per observation, as a second input column.
    input_1 = tt.concatenate([x_obs_shared, tt.tile(theta, len(x_obs[:,None]), ndim=2).T], axis=1)
    # The GP trained above is bound to ``f``; no ``gp1`` is defined in
    # this model, so the original call raised NameError.
    f_1 = f.conditional('f_1', Xnew=input_1, shape=(10,))
    y_ = pm.Normal('y_', mu=f_1, sd=sigma, observed=y_obs)
    step = pm.Metropolis()
    trace_ = pm.sample(30000, step, start=pm.find_MAP(), chains=4)
Is this formulation correct? I get very unstable results...
The full formulation according KOH should be something like this:
# Sketch of the full formulation, as posted.
with pm.Model() as model:
    theta = pm.Normal('theta', mu=0, sd=10)
    noise = pm.HalfCauchy('noise', beta=5)
    ls_1 = pm.Gamma('ls_1', alpha=1, beta=1, shape=2)
    cov = pm.gp.cov.ExpQuad(2, ls=ls_1)
    # NOTE(review): both GPs share the same covariance object (and so the
    # same length-scales) -- confirm this is intended.
    gp1 = pm.gp.Marginal(cov_func=cov)
    gp2 = pm.gp.Marginal(cov_func=cov)
    gp = gp1 + gp2
    input_1 = tt.concatenate([x_obs_shared, tt.tile(theta, len(x_obs), ndim=2).T], axis=1)
    f_0 = gp1.marginal_likelihood('f_0', X=input_comp, y=y_comp, noise=noise)
    # NOTE(review): y_obs is passed as observed data to both 'f_1' and 'f'
    # below -- verify that it is meant to enter the likelihood twice.
    f_1 = gp1.marginal_likelihood('f_1', X=input_1, y=y_obs, noise=noise)
    f = gp.marginal_likelihood('f', X=input_1, y=y_obs, noise=noise)
Could somebody give me some advice on how to formulate the KOH properly with pymc3? I am desperate... Would appreciate any help. Thank you!

You might have found the solution but if not, that's a good one (Guidelines for the Bayesian calibration of building energy models)

Fitting peaks with Scipy curve_fit, error optimal parameters not found

I recently started with Python because I have an enormous amount of data where I want to automatically fit a Gaussian to the peaks in spectra. Below is an example of three peaks that I want to fit with three individual peaks.
I have found a question where someone is looking for something very similar, How can I fit multiple Gaussian curved to mass spectrometry data in Python?, and adopted it to my script.
I have added my code at the bottom and when I run the last section I get the error "RuntimeError: Optimal parameters not found: Number of calls to function has reached maxfev = 800." What am I missing?
The data can be downloaded at https://www.dropbox.com/s/zowawljcjco70yh/data_so.h5?dl=0
#%%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
from scipy.sparse.linalg import spsolve
from scipy.optimize import curve_fit
#%% Read data
path = 'D:/Python/data_so.h5'
f = pd.read_hdf(path, mode = 'r')
t = f.loc[:, 'Time stamp']
d = f.drop(['Time stamp', 'Name spectrum'], axis = 1)
#%% Extract desired wavenumber range
wn_st=2000
wn_ed=2500
# Column positions whose labels are closest to the requested bounds.
ix_st=np.argmin(abs(d.columns.values-wn_st))
ix_ed=np.argmin(abs(d.columns.values-wn_ed))
d = d.iloc[:, ix_st:ix_ed+1]
#%% AsLS baseline correction
spectrum = 230
y = d.iloc[spectrum]
niter = 10
lam = 200000
p = 0.005
L = len(y)
D = sparse.diags([1,-2,1],[0,-1,-2], shape=(L,L-2))
# The penalty term does not depend on the weights, so build it once
# instead of on every pass of the loop.
penalty = lam * D.dot(D.transpose())
w = np.ones(L)
for _ in range(niter):
    W = sparse.spdiags(w, 0, L, L)
    Z = W + penalty
    z = spsolve(Z, w*y)
    # Points above the current baseline get weight p, points below get 1-p.
    w = p * (y > z) + (1-p) * (y < z)
corr = d.iloc[spectrum,:] - z
#%% Plot spectrum, baseline and corrected spectrum
plt.clf()
plt.plot(d.columns, d.iloc[spectrum,:])
plt.plot(d.columns, z)
plt.plot(d.columns, corr)
plt.gca().invert_xaxis()
plt.show()
#%%
x = d.columns.values
def gauss(x, a, mu, sig):
    """Evaluate the Gaussian ``a * exp(-(x - mu)**2 / (2 * sig**2))``.

    ``x`` must provide ``astype``; it is converted to float before use.
    """
    return a*np.exp(-(x.astype(float)-mu)**2/(2*sig**2))
# Restrict the fit to the window between 2232 and 2252.
in_window = (x > 2232) * (x < 2252)
fitx = x[in_window]
fity = y[in_window]
# Starting values: intensity-weighted centroid and spread of the window.
total_intensity = np.sum(fity)
mu = np.sum(fitx * fity) / total_intensity
sig = np.sqrt(np.sum(fity * (fitx - mu) ** 2) / total_intensity)
initial_guess = [max(fity), mu, sig]
popt, pcov = curve_fit(gauss, fitx, fity, p0=initial_guess)
amplitude, center, width = popt
plt.plot(x, gauss(x, amplitude, center, width), 'r-', label='fit')

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Different results in computing KL Divergence using Pytorch Distributions vs manually - python

Related

Quantile residual Q-Q plot in python

Can one create a distribution characterizing the multiplication of two distributions in Python?

How to calculate "relative error in the sum of squares" and "relative error in the approximate solution" from least squares method?

Bayesian Calibration with PyMC3, Kennedy O'Hagan

Fitting peaks with Scipy curve_fit, error optimal parameters not found

Categories

Resources