minimizing non-convex function with Nelder-Mead - python

I am using scipy.optimize.minimize, with the default method ('Nelder-Mead').
The function I am trying to minimize is not strictly convex: it stays at the same value over some sizeable regions.
The issue I have is that the steps taken by the algorithm are too small. For example, my starting point has a first coordinate x0 = 0.2. I know that the function only returns a different value for a sufficiently large step, for example moving by 0.05. Unfortunately, I can see that the algorithm makes very small steps (moving by around 0.000001). As a result, my function returns the same value, and the algorithm does not converge. Can I change that behaviour?
For convenience, here's the scipy code:
def _minimize_neldermead(func, x0, args=(), callback=None,
xtol=1e-4, ftol=1e-4, maxiter=None, maxfev=None,
disp=False, return_all=False,
**unknown_options):
"""
Minimization of scalar function of one or more variables using the
Nelder-Mead algorithm.
Options for the Nelder-Mead algorithm are:
disp : bool
Set to True to print convergence messages.
xtol : float
Relative error in solution `xopt` acceptable for convergence.
ftol : float
Relative error in ``fun(xopt)`` acceptable for convergence.
maxiter : int
Maximum number of iterations to perform.
maxfev : int
Maximum number of function evaluations to make.
This function is called by the `minimize` function with
`method=Nelder-Mead`. It is not supposed to be called directly.
"""
_check_unknown_options(unknown_options)
maxfun = maxfev
retall = return_all
fcalls, func = wrap_function(func, args)
x0 = asfarray(x0).flatten()
N = len(x0)
rank = len(x0.shape)
if not -1 < rank < 2:
raise ValueError("Initial guess must be a scalar or rank-1 sequence.")
if maxiter is None:
maxiter = N * 200
if maxfun is None:
maxfun = N * 200
rho = 1
chi = 2
psi = 0.5
sigma = 0.5
one2np1 = list(range(1, N + 1))
if rank == 0:
sim = numpy.zeros((N + 1,), dtype=x0.dtype)
else:
sim = numpy.zeros((N + 1, N), dtype=x0.dtype)
fsim = numpy.zeros((N + 1,), float)
sim[0] = x0
if retall:
allvecs = [sim[0]]
fsim[0] = func(x0)
nonzdelt = 0.05
zdelt = 0.00025
for k in range(0, N):
y = numpy.array(x0, copy=True)
if y[k] != 0:
y[k] = (1 + nonzdelt)*y[k]
else:
y[k] = zdelt
sim[k + 1] = y
f = func(y)
fsim[k + 1] = f
ind = numpy.argsort(fsim)
fsim = numpy.take(fsim, ind, 0)
# sort so sim[0,:] has the lowest function value
sim = numpy.take(sim, ind, 0)
iterations = 1
while (fcalls[0] < maxfun and iterations < maxiter):
if (numpy.max(numpy.ravel(numpy.abs(sim[1:] - sim[0]))) <= xtol and
numpy.max(numpy.abs(fsim[0] - fsim[1:])) <= ftol):
break
xbar = numpy.add.reduce(sim[:-1], 0) / N
xr = (1 + rho) * xbar - rho * sim[-1]
fxr = func(xr)
doshrink = 0
if fxr < fsim[0]:
xe = (1 + rho * chi) * xbar - rho * chi * sim[-1]
fxe = func(xe)
if fxe < fxr:
sim[-1] = xe
fsim[-1] = fxe
else:
sim[-1] = xr
fsim[-1] = fxr
else: # fsim[0] <= fxr
if fxr < fsim[-2]:
sim[-1] = xr
fsim[-1] = fxr
else: # fxr >= fsim[-2]
# Perform contraction
if fxr < fsim[-1]:
xc = (1 + psi * rho) * xbar - psi * rho * sim[-1]
fxc = func(xc)
if fxc <= fxr:
sim[-1] = xc
fsim[-1] = fxc
else:
doshrink = 1
else:
# Perform an inside contraction
xcc = (1 - psi) * xbar + psi * sim[-1]
fxcc = func(xcc)
if fxcc < fsim[-1]:
sim[-1] = xcc
fsim[-1] = fxcc
else:
doshrink = 1
if doshrink:
for j in one2np1:
sim[j] = sim[0] + sigma * (sim[j] - sim[0])
fsim[j] = func(sim[j])
ind = numpy.argsort(fsim)
sim = numpy.take(sim, ind, 0)
fsim = numpy.take(fsim, ind, 0)
if callback is not None:
callback(sim[0])
iterations += 1
if retall:
allvecs.append(sim[0])
x = sim[0]
fval = numpy.min(fsim)
warnflag = 0
if fcalls[0] >= maxfun:
warnflag = 1
msg = _status_message['maxfev']
if disp:
print('Warning: ' + msg)
elif iterations >= maxiter:
warnflag = 2
msg = _status_message['maxiter']
if disp:
print('Warning: ' + msg)
else:
msg = _status_message['success']
if disp:
print(msg)
print(" Current function value: %f" % fval)
print(" Iterations: %d" % iterations)
print(" Function evaluations: %d" % fcalls[0])
result = OptimizeResult(fun=fval, nit=iterations, nfev=fcalls[0],
status=warnflag, success=(warnflag == 0),
message=msg, x=x)
if retall:
result['allvecs'] = allvecs
return result

I used Nelder-Mead a long time ago, but as I remember, you will find different local minima if you start from different starting points. You didn't give us your function, so we can only guess what the best strategy would be for you. You should also read this:
http://www.webpages.uidaho.edu/~fuchang/res/ANMS.pdf
Then you can try this pure Python implementation:
https://github.com/fchollet/nelder-mead/blob/master/nelder_mead.py
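Also, if your scipy version exposes the initial_simplex option for Nelder-Mead (recent releases do; the source quoted above predates it), you can hand the optimizer a starting simplex whose vertices are separated by the step size your function actually responds to. A minimal sketch, with a made-up objective f standing in for yours:

import numpy as np
from scipy.optimize import minimize

def f(x):
    # hypothetical objective that is flat in steps of 0.05 along the first coordinate
    return np.round(x[0] / 0.05) * 0.05 + x[1] ** 2

x0 = np.array([0.2, 1.0])
step = 0.05   # the smallest move the function actually reacts to
# one vertex at x0, plus one vertex per coordinate displaced by `step`
sim = np.vstack([x0, x0 + step * np.eye(len(x0))])

res = minimize(f, x0, method='Nelder-Mead',
               options={'initial_simplex': sim, 'xatol': 1e-3, 'fatol': 1e-3})
print(res.x, res.fun)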

Related

why is this Python while-loop terminating after 2 iterations only?

I am struggling to understand why this while loop (in the code below) terminates after only 2 iterations.
It is from an adaptive step-size Runge-Kutta-Fehlberg 45 method (https://www.uni-muenster.de/imperia/md/content/physik_tp/lectures/ss2017/numerische_Methoden_fuer_komplexe_Systeme_II/rkm-1.pdf), pages 10/11.
The below results in this output:
$ python3 runge_kutta_45_adaptive_optimalstepsizes.py
error at this step: 0.0
error at this step: 1.6543612251060553e-24
no of iterations of the while loop was: 2
last time t was: 0.001
import numpy as np
import os
import matplotlib
from matplotlib import pyplot as plt
# using current y_n and t_n, finds the largest possible dt_new such that the TRUNCATION ERROR
# after 1 integration step with timestep = this new dt_new (the one we want to settle upon)
# REMAINS below some given desired accuracy epsilon_0
# ADAPTIVE STEP-SIZE CONTROL
# always change dt to dt = h_new (optimal timestep change)
rhs_of_diff_Eq_str = "3 * t ** 2"
def first_derivative(t, y): # the first derivative of the function y(t)
first_derivative_value = 3 * t ** 2
return first_derivative_value
def get_RKF4_approx(t, y, dt):
k1 = first_derivative(t, y )
k2 = first_derivative(t + dt/4. , y + dt*( (1./4.)*k1 ) )
k3 = first_derivative(t + dt*(3./8.) , y + dt*( (3./32.)*k1 + (9./32.)*k2 ) )
k4 = first_derivative(t + dt*(12./13.) , y + dt*( (1932./2197.)*k1 - (7200./2197.)*k2 + (7296./2197.)*k3 ) )
k5 = first_derivative(t + dt, y + dt*( (439./216.)*k1 - 8.*k2 + (3680./513.)*k3 - (845./4104)*k4 ) )
RKF4 = y + dt * ( (25./216)*k1 + (1408/2565.)*k3 + (2197./4104.)*k4 - (1./5.)*k5 )
return np.array([RKF4, k1, k2, k3, k4, k5])
def get_RKF5_approx_efficiently(t, y, dt, ks): # efficient ! re-uses derivative evaluations from RKF4 (previous) calculation.
# ks is a numpy array
# ks[0] is K1, ks[1] is K2, ... , ks[4] is K5
k6 = first_derivative(t + dt*(1./2.), y + dt*(-(8./27.)*ks[0] + 2.*ks[1] - (3544./2565.)*ks[2] + (1859./4104.)*ks[3] - (11./40.)*ks[4]) )
RKF5 = y + dt * ( (16./135.)*ks[0] + (6656./12825.)*ks[2] + (28561./56430.)*ks[3] - (9./50.)*ks[4] +(2./55.)*k6 )
return RKF5 # a number
ts = []
ys = []
tfinal = 10.0
nmax = 10**6
epsilon_0 = 10**(-6)
contor = 0
dt = 0.001
beta = 0.9
t = 0.0 # initial condition
y = 0.0 # initial condition
while (t < tfinal and contor < nmax):
contor += 1
container_from_RKF4method = get_RKF4_approx(t, y, dt)
RKF4 = container_from_RKF4method[0] # the RKF4 method's approximation for y_{n+1}
ks = container_from_RKF4method[1:]
RKF5 = get_RKF5_approx_efficiently(t, y, dt, ks)
error_at_this_step = abs(RKF5 - RKF4)
print("error at this step: {}".format(error_at_this_step))
if (error_at_this_step < epsilon_0 and error_at_this_step != 0.0):
# yes, step accepted! need optimal timestep
dt_new = beta * dt * (epsilon_0/error_at_this_step)**(0.25)
ts.append(t)
t += dt_new
dt = dt_new
y_new = RKF5
ys.append(y_new)
y = y_new
else:
if (error_at_this_step == 0.0): # it's perfect! keep carrying on with this timestep which gives 0 error.
ts.append(t)
t += dt
y_new = RKF5
ys.append(y_new)
y = y_new
else: # means that error_at_this_step > epsilon_0 and that error_at_this_step != 0
# no, step not accepted. reiterate step using a lower timestep
dt_new = beta * dt * (epsilon_0/error_at_this_step)**(0.2)
dt = dt_new
# no changes made to time t and y
# repeat this step (reiterate step)
# HERE THE PROBLEM SHALL BE! I DON'T KNOW WHY THE ABOVE 2 instructions are bad!
print("no of iterations of the while loop was: {}".format(contor))
ts = np.array(ts)
print("last time t was: {}".format(ts[-1]))
ys = np.array(ys)
plt.figure()
plt.plot(ts, ys, label='y values', color='red')
plt.xlabel('t')
plt.ylabel('y')
plt.title("RK45 adaptive step-size (optimal step-size always chosen) integration for dy/dt = f(y,t) \n" + "f(y,t) = " + rhs_of_diff_Eq_str)
plt.savefig("RK45_adaptive_step_size_optimal_step_size_results.pdf", bbox_inches='tight')
I have tried to step through the execution with breakpoint(), pressing n and/or s.
It seems that the while-loop literally stops after the 2nd iteration.
Do you see why this is the case?
Time t doesn't reach tfinal, and contor doesn't reach nmax = 10**6!
The bit of the pdb debugging which shows the problem is:
> /mnt/c/Users/iusti/Desktop/runge_kutta_45_adaptive_optimalstepsizes.py(46)<module>()
-> while (t < tfinal and contor < nmax):
(Pdb) s
> /mnt/c/Users/iusti/Desktop/runge_kutta_45_adaptive_optimalstepsizes.py(79)<module>()
-> print("no of iterations of the while loop was: {}".format(contor))
(Pdb) s
[2]+ Stopped python3 runge_kutta_45_adaptive_optimalstepsizes.py
Thanks!
In the second iteration, try to print dt_new before the line t += dt_new in this block:
if (error_at_this_step < epsilon_0 and error_at_this_step != 0.0):
# yes, step accepted! need optimal timestep
dt_new = beta * dt * (epsilon_0/error_at_this_step)**(0.25)
ts.append(t)
t += dt_new
dt = dt_new
y_new = RKF5
ys.append(y_new)
y = y_new
I suspect the dt_new value is so large that adding it to t makes t >= tfinal, so the while condition no longer holds on the third iteration, which causes the termination.
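One way to guard against that (a sketch of my own, with dt_max as an assumed cap) is to clamp the accepted step so it can neither explode nor overshoot tfinal:

dt_max = 0.1                                   # assumed upper bound on the step size
dt_new = beta * dt * (epsilon_0 / error_at_this_step) ** 0.25
dt_new = min(dt_new, dt_max, tfinal - t)       # never explode, never step past tfinal
ts.append(t)
t += dt_new
dt = dt_new
# then update y from RKF5 as before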

Gradient Descent returns NaN values for slope and error

I'm new to machine learning and am trying to implement gradient descent. The code I have looks like this and has been resulting in NaN values for all parameters:
from numpy import array, genfromtxt  # imports needed by the code below
def compute_error_for_line_given_points(b,m,points):
totalError = 0 #sum of square error formula
for i in range (0, len(points)):
x = points[i, 0]
y = points[i, 1]
totalError += (y-(m*x + b)) ** 2
return totalError/ float(len(points))
def step_gradient(b_current, m_current, points, learning_rate):
#gradient descent
b_gradient = 0
m_gradient = 0
N = float(len(points))
for i in range(0, len(points)):
x = points[i, 0]
y = points[i, 1]
b_gradient += -(2/N) * (y - (m_current * x + b_current))
m_gradient += -(2/N) * x * (y - (m_current * x + b_current))
new_b = b_current - (learning_rate * b_gradient)
new_m = m_current - (learning_rate * m_gradient)
return [new_b,new_m]
def gradient_descent_runner(points, starting_b, starting_m, learning_rate, num_iterations):
b = starting_b
m = starting_m
for i in range(num_iterations):
b,m = step_gradient(b, m, array(points), learning_rate)
return [b,m]
def run():
#Step 1: Collect the data
points = genfromtxt("C:/Users/mishruti/Downloads/For Linear Regression.csv", delimiter = ",")
#Step 2: Define our Hyperparameters
learning_rate = 0.0000001 #how fast the data converge
#y=mx+b (Slope formule)
initial_b = 0 # initial y-intercept guess
initial_m = 0 # initial slope guess
num_iterations = 4
print("Starting gradient descent at b = {0}, m = {1}, error = {2}".format(initial_b, initial_m, compute_error_for_line_given_points(initial_b, initial_m, points)))
print("Running...")
[b, m] = gradient_descent_runner(points, initial_b, initial_m, learning_rate, num_iterations)
print("After {0} iterations b = {1}, m = {2}, error = {3}".format(num_iterations, b, m, compute_error_for_line_given_points(b, m, points)))
# main function
if __name__ == "__main__":
run()
A sample from my data set is attached. Can someone please help me figure this out? Thanks!
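One hedged suggestion: genfromtxt turns a header row or any malformed field into nan, and a single nan in points makes every gradient update nan, so it is worth checking the loaded array first (path shortened here for the sketch):

import numpy as np
from numpy import genfromtxt

points = genfromtxt("For Linear Regression.csv", delimiter=",")
if np.isnan(points).any():
    # likely a header row or a bad field; drop such rows (or reload with skip_header=1)
    points = points[~np.isnan(points).any(axis=1)]
print(points[:5])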

Monte Carlo simulation of a system of Lennard-Jones + FENE potential

I want to generate two linear chains of 20 monomers each, at some distance from each other. The following code generates a single chain. Could someone help me with how to generate the second chain?
The two chains are fixed to a surface, i.e. the first monomer of each chain is fixed and the rest of the monomers move freely in the x-y-z directions, but the z component of the monomers should stay positive.
Something like this:
import numpy as np
import numba as nb
#import pandas as pd
#nb.jit()
def gen_chain(N):
x = np.zeros(N)
y = np.zeros(N)
z = np.linspace(0, (N)*0.9, num=N)
return np.column_stack((x, y, z)), np.column_stack((x1, y1, z1))
#coordinates = np.loadtxt('2GN_50_T_10.txt', skiprows=199950)
#return coordinates
#nb.jit()
def lj(rij2):
sig_by_r6 = np.power(sigma**2 / rij2, 3)
sig_by_r12 = np.power(sigma**2 / rij2, 6)
lje = 4 * epsilon * (sig_by_r12 - sig_by_r6)
return lje
#nb.jit()
def fene(rij2):
return (-0.5 * K * np.power(R, 2) * np.log(1 - ((np.sqrt(rij2) - r0) / R)**2))
#nb.jit()
def total_energy(coord):
# Non-bonded energy.
e_nb = 0.0
for i in range(N):
for j in range(i - 1):
ri = coord[i]
rj = coord[j]
rij = ri - rj
rij2 = np.dot(rij, rij)
if (rij2 < rcutoff_sq):
e_nb += lj(rij2)
# Bonded FENE potential energy.
e_bond = 0.0
for i in range(1, N):
ri = coord[i]
rj = coord[i - 1] # Can be [i+1] ??
rij = ri - rj
rij2 = np.dot(rij, rij)
e_bond += fene(rij2)
return e_nb + e_bond
#nb.jit()
def move(coord):
trial = np.ndarray.copy(coord)
for i in range(1, N):
while True:
delta = (2 * np.random.rand(3) - 1) * max_delta
trial[i] += delta
#while True:
if trial[i,2] > 0.0:
break
trial[i] -= delta
return trial
#nb.jit()
def accept(delta_e):
beta = 1.0 / T
if delta_e < 0.0:
return True
random_number = np.random.rand(1)
p_acc = np.exp(-beta * delta_e)
if random_number < p_acc:
return True
return False
if __name__ == "__main__":
# FENE potential parameters.
K = 40.0
R = 0.3
r0 = 0.7
# L-J potential parameters
sigma = 0.5716
epsilon = 1.0
# MC parameters
N = 20 # Numbers of monomers
rcutoff = 2.5 * sigma
rcutoff_sq = rcutoff * rcutoff
max_delta = 0.01
n_steps = 100000
T = 10
# MAIN PART OF THE CODE
coord = gen_chain(N)
energy_current = total_energy(coord)
traj = open('2GN_20_T_10.xyz', 'w')
traj_txt = open('2GN_20_T_10.txt', 'w')
for step in range(n_steps):
if step % 1000 == 0:
traj.write(str(N) + '\n\n')
for i in range(N):
traj.write("C %10.5f %10.5f %10.5f\n" % (coord[i][0], coord[i][1], coord[i][2]))
traj_txt.write("%10.5f %10.5f %10.5f\n" % (coord[i][0], coord[i][1], coord[i][2]))
print(step, energy_current)
coord_trial = move(coord)
energy_trial = total_energy(coord_trial)
delta_e = energy_trial - energy_current
if accept(delta_e):
coord = coord_trial
energy_current = energy_trial
traj.close()
I expect the chain of particles to collapse into a globule.
There is a problem with the logic of the MC you are implementing.
To perform an MC step you need to ATTEMPT a move, evaluate the energy of the new state, and then accept/reject it according to a random number.
In your code there is not the slightest sign of the attempt to move a particle.
You need to move one (or more of them), evaluate the energy, and then update your coordinates.
By the way, I suppose this is not your entire code; there are several parameters that are not defined, like the "k" and the "R0" in your fene potential.
The FENE potential models bond interactions. What your code is saying is that all particles within the cutoff are bonded by FENE springs, and that the bonds are not fixed but rather defined by the cutoff. With an r_cutoff = 3.0, larger than the equilibrium distance of the LJ well, you are essentially considering each particle to be bonded to potentially many others. You are treating the FENE potential as a non-bonded one.
For the bond interactions you should ignore the cutoff and only evaluate the energy for the actual pairs that are bonded according to your topology, which means that first you need to define a topology. I suggest generating a linear molecule of N atoms in a box big enough to contain the whole stretched molecule, and consider the i-th atom as bonded to the (i-1)-th atom, with i = 2, ..., N. In this way the topology is well defined and persistent. Then consider both interactions separately, non-bonded and bond, and add them at the end.
Something like this, in pseudo-code:
e_nb = 0
for particle i = 1 to N:
for particle j = 1 to i-1:
if (dist(i, j) < rcutoff):
e_nb += lj(i, j)
e_bond = 0
for particle i = 2 to N:
e_bond += fene(i, i-1)
e_tot = e_nb + e_bond
Below you can find a modified version of your code. To make things simpler, in this version there is no box and no boundary conditions, just a chain in free space. The chain is initialized as a linear sequence of particles, each 80% of R0 away from the next, since R0 is the maximum length of a FENE bond. The code considers particle i to be bonded with particle i+1, and the bond is never broken. This code is just a proof of concept.
#!/usr/bin/python
import numpy as np
def gen_chain(N, R):
x = np.linspace(0, (N-1)*R*0.8, num=N)
y = np.zeros(N)
z = np.zeros(N)
return np.column_stack((x, y, z))
def lj(rij2):
sig_by_r6 = np.power(sigma/rij2, 3)
sig_by_r12 = np.power(sig_by_r6, 2)
lje = 4.0 * epsilon * (sig_by_r12 - sig_by_r6)
return lje
def fene(rij2):
return (-0.5 * K * R0**2 * np.log(1-(rij2/R0**2)))
def total_energy(coord):
# Non-bonded
e_nb = 0
for i in range(N):
for j in range(i-1):
ri = coord[i]
rj = coord[j]
rij = ri - rj
rij2 = np.dot(rij, rij)
if (rij2 < rcutoff):
e_nb += lj(rij2)
# Bonded
e_bond = 0
for i in range(1, N):
ri = coord[i]
rj = coord[i-1]
rij = ri - rj
rij2 = np.dot(rij, rij)
e_bond += fene(rij2)
return e_nb + e_bond
def move(coord):
trial = np.ndarray.copy(coord)
for i in range(N):
delta = (2.0 * np.random.rand(3) - 1) * max_delta
trial[i] += delta
return trial
def accept(delta_e):
beta = 1.0/T
if delta_e <= 0.0:
return True
random_number = np.random.rand(1)
p_acc = np.exp(-beta*delta_e)
if random_number < p_acc:
return True
return False
if __name__ == "__main__":
# FENE parameters
K = 40
R0 = 1.5
# LJ parameters
sigma = 1.0
epsilon = 1.0
# MC parameters
N = 50 # number of particles
rcutoff = 3.5
max_delta = 0.01
n_steps = 10000000
T = 1.5
coord = gen_chain(N, R0)
energy_current = total_energy(coord)
traj = open('traj.xyz', 'w')
for step in range(n_steps):
if step % 1000 == 0:
traj.write(str(N) + '\n\n')
for i in range(N):
traj.write("C %10.5f %10.5f %10.5f\n" % (coord[i][0], coord[i][1], coord[i][2]))
print(step, energy_current)
coord_trial = move(coord)
energy_trial = total_energy(coord_trial)
delta_e = energy_trial - energy_current
if accept(delta_e):
coord = coord_trial
energy_current = energy_trial
traj.close()
The code prints the current configuration at each step; you can just load it in VMD and see how it behaves. The bonds will not show correctly at first in VMD: you must use a bead representation for the particles and define the bonds manually or with a script within VMD. In any case, you don't need to see the bonds to notice that the chain does not collapse.
Please bear in mind that if you want to simulate a chain at a certain density, you need to be careful to generate the correct topology. I recommend the EMC package to efficiently generate polymers at the desired thermodynamic conditions. It is by no means a trivial problem, especially for larger chains.
By the way, your code had an error in the FENE energy evaluation. rij2 is already squared, you squared it again.
Below you can see how the total energy as a function of the number of steps behaves for T = 1.0, N = 20, rcutoff = 3.5, and also the last current configuration after 10 thousand steps.
And below for N = 50, T = 1.5, max_delta = 0.01, K = 40, R = 1.5, rcutoff = 3.5, and 10 million steps. This is the last current configuration.
The full "trajectory", which isn't really a trajectory since this is MC, you can find here (it's under 6 MB).

Conversion Matlab to Python code - DOSNES algorithm

I'm trying to implement the DOSNES algorithm from this publication in Python for a project. I found this Matlab implementation, which works well, but I probably mistranslated one or more steps in my code (mainly involving axes, I guess) because I clearly don't reach the same result. This is the part I'm struggling with in Matlab:
P(1:n + 1:end) = 0; % set diagonal to zero
P = 0.5 * (P + P'); % symmetrize P-values
P = max(P ./ sum(P(:)), realmin); % make sure P-values sum to one
const = sum(P(:) .* log(P(:))); % constant in KL divergence
ydata = .0001 * randn(n, no_dims);
y_incs = zeros(size(ydata));
gains = ones(size(ydata));
% Run the iterations
for iter=1:max_iter
% Compute joint probability that point i and j are neighbors
sum_ydata = sum(ydata .^ 2, 2);
    num = 1 ./ (1 + bsxfun(@plus, sum_ydata, bsxfun(@plus, sum_ydata', -2 * (ydata * ydata')))); % Student-t distribution
num(1:n+1:end) = 0; % set diagonal to zero
Q = max(num ./ sum(num(:)), realmin); % normalize to get probabilities
% Compute the gradients (faster implementation)
L = (P - Q) .* num;
y_grads = 4 * (diag(sum(L, 1)) - L) * ydata;
% Update the solution
gains = (gains + .2) .* (sign(y_grads) ~= sign(y_incs)) ... % note that the y_grads are actually -y_grads
+ (gains * .8) .* (sign(y_grads) == sign(y_incs));
gains(gains < min_gain) = min_gain;
y_incs = momentum * y_incs - epsilon * (gains .* y_grads);
ydata = ydata + y_incs;
% Spherical projection
    ydata = bsxfun(@minus, ydata, mean(ydata, 1));
    r_mean = mean(sqrt(sum(ydata.^2,2)),1);
    ydata = bsxfun(@times, ydata, r_mean./ sqrt(sum(ydata.^2,2)) );
% Update the momentum if necessary
if iter == mom_switch_iter
momentum = final_momentum;
end
% Print out progress
if ~rem(iter, 10)
cost = const - sum(P(:) .* log(Q(:)));
disp(['Iteration ' num2str(iter) ': error is ' num2str(cost)]);
end
end
and this is my Python version:
no_dims = 3
n = X.shape[0]
min_gain = 0.01
momentum = 0.5
final_momentum = 0.8
epsilon = 500
mom_switch_iter = 250
max_iter = 1000
P[np.diag_indices_from(P)] = 0.
P = ( P + P.T )/2
P = np.max(P / np.sum(P), axis=0)
const = np.sum( P * np.log(P) )
ydata = 1e-4 * np.random.random(size=(n, no_dims))
y_incs = np.zeros(shape=ydata.shape)
gains = np.ones(shape=ydata.shape)
for iter in range(max_iter):
sum_ydata = np.sum(ydata**2, axis = 1)
bsxfun_1 = sum_ydata.T + -2*np.dot(ydata, ydata.T)
bsxfun_2 = sum_ydata + bsxfun_1
num = 1. / ( 1 + bsxfun_2 )
num[np.diag_indices_from(num)] = 0.
Q = np.max(num / np.sum(num), axis=0)
L = (P - Q) * num
t = np.diag( L.sum(axis=0) ) - L
y_grads = 4 * np.dot( t , ydata )
gains = (gains + 0.2) * ( np.sign(y_grads) != np.sign(y_incs) ) \
+ (gains * 0.8) * ( np.sign(y_grads) == np.sign(y_incs) )
# gains[np.where(np.sign(y_grads) != np.sign(y_incs))] += 0.2
# gains[np.where(np.sign(y_grads) == np.sign(y_incs))] *= 0.8
gains = np.clip(gains, a_min = min_gain, a_max = None)
y_incs = momentum * y_incs - epsilon * gains * y_grads
ydata += y_incs
ydata -= ydata.mean(axis=0)
alpha = np.sqrt(np.sum(ydata ** 2, axis=1))
r_mean = np.mean(alpha)
ydata = ydata * (r_mean / alpha).reshape(-1, 1)
if iter == mom_switch_iter:
momentum = final_momentum
if iter % 10 == 0:
cost = const - np.sum( P * np.log(Q) )
print( "Iteration {} : error is {}".format(iter, cost) )
If you want to run trials, you can download the repository here, which uses the Iris dataset and an attached library. test.py is my test implementation with the Iris dataset, and visu.py reproduces the result the paper shows on the MNIST dataset, restricted to 1000k random points.
Many thanks for your support,
Nicolas
EDIT 1
This is the final code, working as expected:
from scipy.special import xlogy  # needed for the xlogy calls below
P[np.diag_indices_from(P)] = 0.
P = ( P + P.T )/2
P = P / np.sum(P)
const = np.sum(xlogy(P, P))
ydata = 1e-4 * np.random.random(size=(n, no_dims))
y_incs = np.zeros(shape=ydata.shape)
gains = np.ones(shape=ydata.shape)
for iter in range(max_iter):
sum_ydata = np.sum(ydata**2, axis = 1)
bsxfun_1 = sum_ydata.T + -2*np.dot(ydata, ydata.T)
bsxfun_2 = sum_ydata + bsxfun_1
num = 1. / ( 1 + bsxfun_2 )
num[np.diag_indices_from(num)] = 0.
Q = num / np.sum(num)
L = (P - Q) * num
t = np.diag( L.sum(axis=0) ) - L
y_grads = 4 * np.dot( t , ydata )
gains = (gains + 0.2) * ( np.sign(y_grads) != np.sign(y_incs) ) \
+ (gains * 0.8) * ( np.sign(y_grads) == np.sign(y_incs) )
gains = np.clip(gains, a_min = min_gain, a_max = None)
y_incs = momentum * y_incs - epsilon * gains * y_grads
ydata += y_incs
ydata -= ydata.mean(axis=0)
alpha = np.sqrt(np.sum(ydata ** 2, axis=1))
r_mean = np.mean(alpha)
ydata = ydata * (r_mean / alpha).reshape(-1, 1)
if iter == mom_switch_iter:
momentum = final_momentum
if iter % 10 == 0:
cost = const - np.sum( xlogy(P, Q) )
print( "Iteration {} : error is {}".format(iter, cost) )
Right at the beginning you seem to replace a non-reducing max in Matlab (it has two arguments, so it compares them element by element and returns a full-size P) with a reducing max in Python (axis=0 reduces along that axis, meaning the result has one dimension less).
My advice, however, is to leave out the max altogether, because it looks like an amateurish attempt at sidestepping the problem that p log p is defined at 0 only via the limit p -> 0 (which, using L'Hopital's rule, can be shown to be 0), whereas the computer returns NaN when asked to compute 0 * log(0).
The proper way of going about this is using scipy.special.xlogy which treats 0 correctly.
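For illustration (my own minimal example, not from the answer above), compare the naive product with xlogy:

import numpy as np
from scipy.special import xlogy

p = np.array([0.0, 0.5])
print(p * np.log(p))   # [nan, -0.347]: 0 * log(0) evaluates to nan (with a runtime warning)
print(xlogy(p, p))     # [0.0, -0.347]: xlogy treats the p = 0 entry as 0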

Python implementation of the Wilson Score Interval?

After reading How Not to Sort by Average Rating, I was curious if anyone has a Python implementation of a Lower bound of Wilson score confidence interval for a Bernoulli parameter?
Reddit uses the Wilson score interval for comment ranking; an explanation and a Python implementation can be found here:
#Rewritten code from /r2/r2/lib/db/_sorts.pyx
from math import sqrt
def confidence(ups, downs):
n = ups + downs
if n == 0:
return 0
z = 1.0 #1.44 = 85%, 1.96 = 95%
phat = float(ups) / n
return ((phat + z*z/(2*n) - z * sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n))
I think this one has the Wilson call wrong, because if you have 1 up and 0 down you get NaN, since you can't take the sqrt of a negative value.
The correct one can be found in the Ruby example from the article "How Not to Sort by Average Rating":
return ((phat + z*z/(2*n) - z * sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n))
To get the Wilson CI without continuity correction, you can use proportion_confint in statsmodels.stats.proportion. To get the Wilson CI with continuity correction, you can use the code below.
# cf.
# [1] R. G. Newcombe. Two-sided confidence intervals for the single proportion, 1998
# [2] R. G. Newcombe. Interval Estimation for the difference between independent proportions: comparison of eleven methods, 1998
import numpy as np
from statsmodels.stats.proportion import proportion_confint
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
def propci_wilson_cc(count, nobs, alpha=0.05):
# get confidence limits for proportion
# using wilson score method w/ cont correction
# i.e. Method 4 in Newcombe [1];
# verified via Table 1
from scipy import stats
n = nobs
p = count/n
q = 1.-p
z = stats.norm.isf(alpha / 2.)
z2 = z**2
denom = 2*(n+z2)
num = 2.*n*p+z2-1.-z*np.sqrt(z2-2-1./n+4*p*(n*q+1))
ci_l = num/denom
num = 2.*n*p+z2+1.+z*np.sqrt(z2+2-1./n+4*p*(n*q-1))
ci_u = num/denom
if p == 0:
ci_l = 0.
elif p == 1:
ci_u = 1.
return ci_l, ci_u
def dpropci_wilson_nocc(a,m,b,n,alpha=0.05):
# get confidence limits for difference in proportions
# a/m - b/n
# using wilson score method WITHOUT cont correction
# i.e. Method 10 in Newcombe [2]
# verified via Table II
theta = a/m - b/n
l1, u1 = proportion_confint(count=a, nobs=m, alpha=0.05, method='wilson')
l2, u2 = proportion_confint(count=b, nobs=n, alpha=0.05, method='wilson')
ci_u = theta + np.sqrt((a/m-u1)**2+(b/n-l2)**2)
ci_l = theta - np.sqrt((a/m-l1)**2+(b/n-u2)**2)
return ci_l, ci_u
def dpropci_wilson_cc(a,m,b,n,alpha=0.05):
# get confidence limits for difference in proportions
# a/m - b/n
# using wilson score method w/ cont correction
# i.e. Method 11 in Newcombe [2]
# verified via Table II
theta = a/m - b/n
l1, u1 = propci_wilson_cc(count=a, nobs=m, alpha=alpha)
l2, u2 = propci_wilson_cc(count=b, nobs=n, alpha=alpha)
ci_u = theta + np.sqrt((a/m-u1)**2+(b/n-l2)**2)
ci_l = theta - np.sqrt((a/m-l1)**2+(b/n-u2)**2)
return ci_l, ci_u
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# single proportion testing
# these come from Newcombe [1] (Table 1)
a_vec = np.array([81, 15, 0, 1])
m_vec = np.array([263, 148, 20, 29])
for (a,m) in zip(a_vec,m_vec):
l1, u1 = proportion_confint(count=a, nobs=m, alpha=0.05, method='wilson')
l2, u2 = propci_wilson_cc(count=a, nobs=m, alpha=0.05)
print(a,m,l1,u1,' ',l2,u2)
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# difference in proportions testing
# these come from Newcombe [2] (Table II)
a_vec = np.array([56,9,6,5,0,0,10,10],dtype=float)
m_vec = np.array([70,10,7,56,10,10,10,10],dtype=float)
b_vec = np.array([48,3,2,0,0,0,0,0],dtype=float)
n_vec = np.array([80,10,7,29,20,10,20,10],dtype=float)
print('\nWilson without CC')
for (a,m,b,n) in zip(a_vec,m_vec,b_vec,n_vec):
l, u = dpropci_wilson_nocc(a,m,b,n,alpha=0.05)
print('{:2.0f}/{:2.0f}-{:2.0f}/{:2.0f} ; {:6.4f} ; {:8.4f}, {:8.4f}'.format(a,m,b,n,a/m-b/n,l,u))
print('\nWilson with CC')
for (a,m,b,n) in zip(a_vec,m_vec,b_vec,n_vec):
l, u = dpropci_wilson_cc(a,m,b,n,alpha=0.05)
print('{:2.0f}/{:2.0f}-{:2.0f}/{:2.0f} ; {:6.4f} ; {:8.4f}, {:8.4f}'.format(a,m,b,n,a/m-b/n,l,u))
HTH
The accepted solution seems to use a hard-coded z-value (best for performance).
In the event that you wanted a direct python equivalent of the ruby formula from the blogpost with a dynamic z-value (based on the confidence interval):
import math
import scipy.stats as st
def ci_lower_bound(pos, n, confidence):
if n == 0:
return 0
z = st.norm.ppf(1 - (1 - confidence) / 2)
phat = 1.0 * pos / n
return (phat + z * z / (2 * n) - z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n)
If you'd like to actually calculate z directly from a confidence bound and want to avoid installing numpy/scipy, you can use the following snippet of code,
import math
def binconf(p, n, c=0.95):
'''
Calculate binomial confidence interval based on the number of positive and
negative events observed. Uses Wilson score and approximations to inverse
of normal cumulative density function.
Parameters
----------
p: int
number of positive events observed
n: int
number of negative events observed
c : optional, [0,1]
confidence percentage. e.g. 0.95 means 95% confident the probability of
success lies between the 2 returned values
Returns
-------
theta_low : float
lower bound on confidence interval
theta_high : float
upper bound on confidence interval
'''
p, n = float(p), float(n)
N = p + n
if N == 0.0: return (0.0, 1.0)
p = p / N
z = normcdfi(1 - 0.5 * (1-c))
a1 = 1.0 / (1.0 + z * z / N)
a2 = p + z * z / (2 * N)
a3 = z * math.sqrt(p * (1-p) / N + z * z / (4 * N * N))
return (a1 * (a2 - a3), a1 * (a2 + a3))
def erfi(x):
"""Approximation to inverse error function"""
a = 0.147 # MAGIC!!!
a1 = math.log(1 - x * x)
a2 = (
2.0 / (math.pi * a)
+ a1 / 2.0
)
return (
sign(x) *
math.sqrt( math.sqrt(a2 * a2 - a1 / a) - a2 )
)
def sign(x):
if x < 0: return -1
if x == 0: return 0
if x > 0: return 1
def normcdfi(p, mu=0.0, sigma2=1.0):
"""Inverse CDF of normal distribution"""
if mu == 0.0 and sigma2 == 1.0:
return math.sqrt(2) * erfi(2 * p - 1)
else:
return mu + math.sqrt(sigma2) * normcdfi(p)
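A quick illustrative call of the snippet above:

# 81 positive and 182 negative events (so p_hat = 81/263 ~ 0.308), default 95% confidence
lo, hi = binconf(81, 182)
print(lo, hi)   # the interval brackets 0.308; exact values depend on the erfi approximation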
Here is a simplified (no need for numpy) and slightly improved version (k = 0 and k = n no longer cause a math domain error) of the Wilson score confidence interval with continuity correction, based on the original code written by batesbatesbates in another answer. There is also a pure-Python, no-numpy version without continuity correction, with two equivalent ways to compute the interval (switched with the eqmode argument; both give exactly the same results):
import math
def propci_wilson_nocc(k, n, z=1.96, eqmode=0):
# Calculates the Binomial Proportion Confidence Interval using the Wilson Score method without continuation correction
# Equations eqmode == 1 from: https://en.wikipedia.org/w/index.php?title=Binomial_proportion_confidence_interval&oldid=1101942017#Wilson_score_interval
# Equations eqmode == 0 from: https://www.evanmiller.org/how-not-to-sort-by-average-rating.html
# The results should be close to:
# from statsmodels.stats.proportion import proportion_confint
# proportion_confint(k, n, alpha=0.05, method='wilson')
#z=1.44 = 85%, 1.96 = 95%
if n == 0:
return 0
p_hat = float(k) / n
z2 = z**2
if eqmode == 0:
ci_l = (p_hat + z2/(2*n) - z*math.sqrt(max(0.0, (p_hat*(1 - p_hat) + z2/(4*n))/n))) / (1 + z2 / n)
else:
ci_l = (1.0 / (1.0 + z2/n)) * (p_hat + z2/(2*n)) - (z / (1 + z2/n)) * math.sqrt(max(0.0, (p_hat*(1 - p_hat)/n + z2/(4*(n**2)))))
if eqmode == 0:
ci_u = (p_hat + z2/(2*n) + z*math.sqrt(max(0.0, (p_hat*(1 - p_hat) + z2/(4*n))/n))) / (1 + z2 / n)
else:
ci_u = (1.0 / (1.0 + z2/n)) * (p_hat + z2/(2*n)) + (z / (1 + z2/n)) * math.sqrt(max(0.0, (p_hat*(1 - p_hat)/n + z2/(4*(n**2)))))
return [ci_l, ci_u]
def propci_wilson_cc(n, k, z=1.96):
# Calculates the Binomial Proportion Confidence Interval using the Wilson Score method with continuation correction
# i.e. Method 4 in Newcombe [1]: R. G. Newcombe. Two-sided confidence intervals for the single proportion, 1998;
# verified via Table 1
# originally written by batesbatesbates https://stackoverflow.com/questions/10029588/python-implementation-of-the-wilson-score-interval/74021634#74021634
p_hat = k/n
    q = 1.0 - p_hat
z2 = z**2
denom = 2*(n+z2)
num = 2.0*n*p_hat + z2 - 1.0 - z*math.sqrt(max(0.0, z2 - 2 - 1.0/n + 4*p_hat*(n*q + 1)))
ci_l = num/denom
num2 = 2.0*n*p_hat + z2 + 1.0 + z*math.sqrt(max(0.0, z2 + 2 - 1.0/n + 4*p_hat*(n*q - 1)))
ci_u = num2/denom
if p_hat == 0:
ci_l = 0.0
elif p_hat == 1:
ci_u = 1.0
return [ci_l, ci_u]
Note that the returned values will always be bounded between [0.0, 1.0] (because p_hat is the ratio k/n); this is why it is a score and not really a confidence interval. It is easy to project it back onto a confidence interval by multiplying ci_l * n and ci_u * n; these values will be in the same domain as k and can be plotted alongside it.
Here is a much more readable version of how to compute the Wilson score interval without continuity correction, by Bartosz Mikulski:
from math import sqrt
def wilson(p, n, z = 1.96):
denominator = 1 + z**2/n
centre_adjusted_probability = p + z*z / (2*n)
adjusted_standard_deviation = sqrt((p*(1 - p) + z*z / (4*n)) / n)
lower_bound = (centre_adjusted_probability - z*adjusted_standard_deviation) / denominator
upper_bound = (centre_adjusted_probability + z*adjusted_standard_deviation) / denominator
return (lower_bound, upper_bound)
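An illustrative call (note that this variant takes the observed proportion p, not the raw count):

# 90 upvotes out of 100 votes, default z = 1.96 (95% confidence)
lower, upper = wilson(90 / 100, 100)
print(lower, upper)   # the lower bound is the value you would rank by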
