While loop outperforming for loop in Python

I'm writing a simple integration approximation function. The thing is that no matter what I do, my while implementation always beats my for implementation, which is very weird, since the for loop should be faster: it doesn't have to check a boolean expression and increment a variable on each iteration.
The code:
import math
import time

import numpy as np


def function(x):
    return math.cos(x)


def integrate_while(func, x_start, x_end, n_steps=10_000):
    step_length = (x_end - x_start) / n_steps
    area = 0
    x = x_start
    x_end -= step_length
    while x < x_end:
        area += func(x) + func(x + step_length)
        x += step_length
    area *= step_length / 2
    return area


def integrate_for(func, x_start, x_end, n_steps=10_000):
    step_length = (x_end - x_start) / n_steps
    area = 0
    ls = np.linspace(x_start, x_end, n_steps)
    for x in ls:
        area += func(x) + func(x + step_length)
    area *= step_length / 2
    return area


def integrate_for_range(func, x_start, x_end, n_steps=10_000):
    step_length = (x_end - x_start) / n_steps
    area = 0
    for i in range(n_steps):
        x = x_start + i * step_length
        area += func(x) + func(x + step_length)
    area *= step_length / 2
    return area


def test():
    integrate_funcs = [integrate_while, integrate_for, integrate_for_range]
    for integrate_func in integrate_funcs:
        t1 = time.time_ns()
        result = integrate_func(function, 0, math.pi / 2, n_steps=1_000_000)
        t2 = time.time_ns()
        print(f'Function {integrate_func.__name__}. Result: {round(result, 4)}, Elapsed ns: {t2 - t1:,}.')


test()
Results:
Function integrate_while. Result: 1.0, Elapsed ns: 569,587,400.
Function integrate_for. Result: 1.0, Elapsed ns: 638,829,800.
Function integrate_for_range. Result: 1.0, Elapsed ns: 596,499,300.
Edit: I already checked the impact of creating the np.linspace object on the total execution time of the for loop, and it is less than 1% of the total.

The original np.linspace was the problem: numpy arrays are not meant to be iterated over element by element, since there is significant overhead when you do so.
After replacing np.linspace with the native range type, I could outperform the simple while loop.
Some extra changes improved performance further:
Setting variable types before entering the for loop (10% less time).
Changing the loop logic to avoid extra function calls (50% less time).
Final function code:
def integrate_for_range_opt(func, x_start, x_end, n_steps=10_000):
    step_length = (x_end - x_start) / n_steps
    area = 0.0
    x = float(x_start) + step_length
    for i in range(n_steps - 1):
        area = area + func(x)
        x = x + step_length
    area = area * 2.0 + func(x_start) + func(x_end)
    return area * step_length / 2.0

Your question doesn't actually ask a question; to me it seemed to want an explanation, but your comments and your own answer give me the impression that what you really want is a fast solution. So here's a faster one, making Python's built-in functions do more work for you:
from itertools import accumulate, repeat

def integrate_map_accumulate(func, x_start, x_end, n_steps=10_000):
    step_length = (x_end - x_start) / n_steps
    xs = accumulate(repeat(step_length, n_steps - 2),
                    initial=x_start + step_length)
    area = sum(map(func, xs))
    area = area * 2.0 + func(x_start) + func(x_end)
    return area * step_length / 2.0
In my testing with your benchmark code, that reduced the time from ~0.21 seconds (for your answer's solution) to ~0.14 seconds.
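For comparison, here is a fully vectorized sketch (my addition, assuming func accepts numpy arrays elementwise, e.g. np.cos rather than math.cos): the whole loop happens inside numpy, which is usually faster still than any Python-level loop.
import numpy as np

def integrate_vectorized(func, x_start, x_end, n_steps=10_000):
    xs = np.linspace(x_start, x_end, n_steps + 1)   # n_steps trapezoids need n_steps + 1 sample points
    ys = func(xs)                                   # one vectorized call instead of a Python-level loop
    return np.trapz(ys, xs)                         # composite trapezoidal rule

print(integrate_vectorized(np.cos, 0, np.pi / 2, n_steps=1_000_000))  # ~1.0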

Related

Monte Carlo simulation of a system of Lennard-Jones + FENE potential

I want to generate two linear chains of 20 monomers each, at some distance from each other. The following code generates a single chain. Could someone help me with how to generate the second chain?
The two chains are fixed to a surface, i.e. the first monomer of each chain is fixed while the rest of the monomers move freely in the x-y-z directions, but the z component of the monomers should stay positive.
Something like this:
import numpy as np
import numba as nb
#import pandas as pd

#nb.jit()
def gen_chain(N):
    x = np.zeros(N)
    y = np.zeros(N)
    z = np.linspace(0, (N)*0.9, num=N)
    return np.column_stack((x, y, z)), np.column_stack((x1, y1, z1))
    #coordinates = np.loadtxt('2GN_50_T_10.txt', skiprows=199950)
    #return coordinates

#nb.jit()
def lj(rij2):
    sig_by_r6 = np.power(sigma**2 / rij2, 3)
    sig_by_r12 = np.power(sigma**2 / rij2, 6)
    lje = 4 * epsilon * (sig_by_r12 - sig_by_r6)
    return lje

#nb.jit()
def fene(rij2):
    return (-0.5 * K * np.power(R, 2) * np.log(1 - ((np.sqrt(rij2) - r0) / R)**2))

#nb.jit()
def total_energy(coord):
    # Non-bonded energy.
    e_nb = 0.0
    for i in range(N):
        for j in range(i - 1):
            ri = coord[i]
            rj = coord[j]
            rij = ri - rj
            rij2 = np.dot(rij, rij)
            if (rij2 < rcutoff_sq):
                e_nb += lj(rij2)
    # Bonded FENE potential energy.
    e_bond = 0.0
    for i in range(1, N):
        ri = coord[i]
        rj = coord[i - 1]  # Can be [i+1] ??
        rij = ri - rj
        rij2 = np.dot(rij, rij)
        e_bond += fene(rij2)
    return e_nb + e_bond

#nb.jit()
def move(coord):
    trial = np.ndarray.copy(coord)
    for i in range(1, N):
        while True:
            delta = (2 * np.random.rand(3) - 1) * max_delta
            trial[i] += delta
            #while True:
            if trial[i, 2] > 0.0:
                break
            trial[i] -= delta
    return trial

#nb.jit()
def accept(delta_e):
    beta = 1.0 / T
    if delta_e < 0.0:
        return True
    random_number = np.random.rand(1)
    p_acc = np.exp(-beta * delta_e)
    if random_number < p_acc:
        return True
    return False

if __name__ == "__main__":
    # FENE potential parameters.
    K = 40.0
    R = 0.3
    r0 = 0.7
    # L-J potential parameters
    sigma = 0.5716
    epsilon = 1.0
    # MC parameters
    N = 20  # Numbers of monomers
    rcutoff = 2.5 * sigma
    rcutoff_sq = rcutoff * rcutoff
    max_delta = 0.01
    n_steps = 100000
    T = 10

    # MAIN PART OF THE CODE
    coord = gen_chain(N)
    energy_current = total_energy(coord)
    traj = open('2GN_20_T_10.xyz', 'w')
    traj_txt = open('2GN_20_T_10.txt', 'w')

    for step in range(n_steps):
        if step % 1000 == 0:
            traj.write(str(N) + '\n\n')
            for i in range(N):
                traj.write("C %10.5f %10.5f %10.5f\n" % (coord[i][0], coord[i][1], coord[i][2]))
                traj_txt.write("%10.5f %10.5f %10.5f\n" % (coord[i][0], coord[i][1], coord[i][2]))
            print(step, energy_current)
        coord_trial = move(coord)
        energy_trial = total_energy(coord_trial)
        delta_e = energy_trial - energy_current
        if accept(delta_e):
            coord = coord_trial
            energy_current = energy_trial
    traj.close()
I expect the chain of particles to collapse into a globule.
There is a problem with the logic of the MC you are implementing.
To perform an MC step you need to ATTEMPT a move, evaluate the energy of the new state, and then accept/reject it according to a random number.
In your code there is not the slightest sign of an attempt to move a particle.
You need to move one (or more) of them, evaluate the energy, and then update your coordinates.
By the way, I suppose this is not your entire code; there are many parameters that are not defined, like the "k" and the "R0" in your FENE potential.
The FENE potential models bond interactions. What your code is saying is that all particles within the cutoff are bonded by FENE springs, and that the bonds are not fixed but rather defined by the cutoff. With r_cutoff = 3.0, larger than the equilibrium distance of the LJ well, you are essentially saying that each particle is bonded to potentially many others. You are treating the FENE potential as a non-bonded one.
For the bond interactions you should ignore the cutoff and only evaluate the energy for the actual pairs that are bonded according to your topology, which means that first you need to define a topology. I suggest generating a linear molecule of N atoms in a box big enough to contain the whole stretched molecule, and consider the i-th atom as bonded to the (i-1)-th atom, with i = 2, ..., N. In this way the topology is well defined and persistent. Then consider both interactions separately, non-bonded and bond, and add them at the end.
Something like this, in pseudo-code:
e_nb = 0
for particle i = 1 to N:
    for particle j = 1 to i-1:
        if dist(i, j) < rcutoff:
            e_nb += lj(i, j)

e_bond = 0
for particle i = 2 to N:
    e_bond += fene(i, i-1)

e_tot = e_nb + e_bond
Below you can find a modified version of your code. To make things simpler, in this version there is no box and no boundary conditions, just a chain in free space. The chain is initialized as a linear sequence of particles each distant 80% of R0 from the next, since R0 is the maximum length of the FENE bond. The code considers that particle i is bonded with i+1 and the bond is not broken. This code is just a proof of concept.
#!/usr/bin/python
import numpy as np

def gen_chain(N, R):
    x = np.linspace(0, (N-1)*R*0.8, num=N)
    y = np.zeros(N)
    z = np.zeros(N)
    return np.column_stack((x, y, z))

def lj(rij2):
    sig_by_r6 = np.power(sigma/rij2, 3)
    sig_by_r12 = np.power(sig_by_r6, 2)
    lje = 4.0 * epsilon * (sig_by_r12 - sig_by_r6)
    return lje

def fene(rij2):
    return (-0.5 * K * R0**2 * np.log(1-(rij2/R0**2)))

def total_energy(coord):
    # Non-bonded
    e_nb = 0
    for i in range(N):
        for j in range(i-1):
            ri = coord[i]
            rj = coord[j]
            rij = ri - rj
            rij2 = np.dot(rij, rij)
            if (rij2 < rcutoff):
                e_nb += lj(rij2)
    # Bonded
    e_bond = 0
    for i in range(1, N):
        ri = coord[i]
        rj = coord[i-1]
        rij = ri - rj
        rij2 = np.dot(rij, rij)
        e_bond += fene(rij2)
    return e_nb + e_bond

def move(coord):
    trial = np.ndarray.copy(coord)
    for i in range(N):
        delta = (2.0 * np.random.rand(3) - 1) * max_delta
        trial[i] += delta
    return trial

def accept(delta_e):
    beta = 1.0/T
    if delta_e <= 0.0:
        return True
    random_number = np.random.rand(1)
    p_acc = np.exp(-beta*delta_e)
    if random_number < p_acc:
        return True
    return False

if __name__ == "__main__":
    # FENE parameters
    K = 40
    R0 = 1.5
    # LJ parameters
    sigma = 1.0
    epsilon = 1.0
    # MC parameters
    N = 50  # number of particles
    rcutoff = 3.5
    max_delta = 0.01
    n_steps = 10000000
    T = 1.5

    coord = gen_chain(N, R0)
    energy_current = total_energy(coord)
    traj = open('traj.xyz', 'w')

    for step in range(n_steps):
        if step % 1000 == 0:
            traj.write(str(N) + '\n\n')
            for i in range(N):
                traj.write("C %10.5f %10.5f %10.5f\n" % (coord[i][0], coord[i][1], coord[i][2]))
            print(step, energy_current)
        coord_trial = move(coord)
        energy_trial = total_energy(coord_trial)
        delta_e = energy_trial - energy_current
        if accept(delta_e):
            coord = coord_trial
            energy_current = energy_trial
    traj.close()
The code writes out the current configuration as it goes; you can just load it up in VMD and see how it behaves. The bonds will not show correctly at first in VMD: you must use a bead representation for the particles and define the bonds manually or with a script within VMD. In any case, you don't need to see the bonds to notice that the chain does not collapse.
Please bear in mind that if you want to simulate a chain at a certain density, you need to be careful to generate the correct topology. I recommend the EMC package to efficiently generate polymers at the desired thermodynamic conditions. It is by no means a trivial problem, especially for larger chains.
By the way, your code had an error in the FENE energy evaluation: rij2 is already the squared distance, but you squared it again.
Below you can see how the total energy as a function of the number of steps behaves for T = 1.0, N = 20, rcutoff = 3.5, and also the last current configuration after 10 thousand steps.
And below for N = 50, T = 1.5, max_delta = 0.01, K = 40, R = 1.5, rcutoff = 3.5, and 10 million steps. This is the last current configuration.
The full "trajectory", which isn't really a trajectory since this is MC, you can find here (it's under 6 MB).

Making a Poisson sphere distribution in Python but cannot figure out where the bug is

I am new to programming, so I hope my stupid questions do not bug you.
I am now trying to calculate the Poisson sphere distribution (a 3D version of the Poisson disk) using Python, and then plug the result into POV-Ray so that I can generate some randomly distributed packing rocks.
I am following these two links:
[https://github.com/CodingTrain/Rainbow-Code/blob/master/CodingChallenges/CC_33_poisson_disc/sketch.js#L13]
[https://www.cs.ubc.ca/~rbridson/docs/bridson-siggraph07-poissondisk.pdf]
tl;dr
0. Create an n-dimensional grid array with cell size = r/sqrt(n), where r is the minimum distance between spheres. All cells default to -1, which stands for 'no point'.
1. Create an initial sample. (It should be placed randomly, but I chose to put it in the middle.) Put it in the grid array. Also, initialize an active list and put the initial sample in it.
2. While the active list is not empty, pick a random index. Generate points near it and make sure the points do not overlap with nearby points (only test against the nearby cells). If no sample can be created near the 'random index', kick the 'random index' out. Loop the process.
And here is my code:
import math
from random import uniform
import numpy
import random

radius = 1      # you can change the size of each sphere
mindis = 2 * radius
maxx = 10       # you can change the size of the container
maxy = 10
maxz = 10
k = 30
cellsize = mindis / math.sqrt(3)
nrofx = math.floor(maxx / cellsize)
nrofy = math.floor(maxy / cellsize)
nrofz = math.floor(maxz / cellsize)

grid = []
active = []
default = numpy.array((-1, -1, -1))

for fillindex in range(nrofx * nrofy * nrofz):
    grid.append(default)

x = uniform(0, maxx)
y = uniform(0, maxy)
z = uniform(0, maxz)
firstpos = numpy.array((x, y, z))
firsti = maxx // 2
firstj = maxy // 2
firstk = maxz // 2
grid[firsti + nrofx * (firstj + nrofy * firstk)] = firstpos
active.append(firstpos)

while (len(active) > 0):
    randindex = math.floor(uniform(0, len(active)))
    pos = active[randindex]
    found = False
    for attempt in range(k):
        offsetx = uniform(mindis, 2 * mindis)
        offsety = uniform(mindis, 2 * mindis)
        offsetz = uniform(mindis, 2 * mindis)
        samplex = offsetx * random.choice([1, -1])
        sampley = offsety * random.choice([1, -1])
        samplez = offsetz * random.choice([1, -1])
        sample = numpy.array((samplex, sampley, samplez))
        sample = numpy.add(sample, pos)
        xcoor = math.floor(sample.item(0) / cellsize)
        ycoor = math.floor(sample.item(1) / cellsize)
        zcoor = math.floor(sample.item(2) / cellsize)
        attemptindex = xcoor + nrofx * (ycoor + nrofy * zcoor)
        if attemptindex >= 0 and attemptindex < nrofx * nrofy * nrofz and numpy.all([sample, default]) == True and xcoor > 0 and ycoor > 0 and zcoor > 0:
            test = True
            for testx in range(-1, 2):
                for testy in range(-1, 2):
                    for testz in range(-1, 2):
                        testindex = (xcoor + testx) + nrofx * ((ycoor + testy) + nrofy * (zcoor + testz))
                        if testindex >= 0 and testindex < nrofx * nrofy * nrofz:
                            neighbour = grid[testindex]
                            if numpy.all([neighbour, sample]) == False:
                                if numpy.all([neighbour, default]) == False:
                                    distance = numpy.linalg.norm(sample - neighbour)
                                    if distance > mindis:
                                        test = False
            if test == True and len(active) < len(grid):
                found = True
                grid[attemptindex] = sample
                active.append(sample)
    if found == False:
        del active[randindex]

for printout in range(len(grid)):
    print("<" + str(active[printout][0]) + "," + str(active[printout][1]) + "," + str(active[printout][2]) + ">")

print(len(grid))
My code seems to run forever.
Therefore I tried adding a print(len(active)) at the end of the while loop.
Surprisingly, I think I discovered the bug: the length of the active list just keeps increasing! (It is supposed to end up the same length as the grid.) I think the problem is caused by the active.append(), but I can't figure out where the problem is, as the code is literally 90% the same as the one made by Mr. Shiffman.
I don't want to free-ride on this, but I have already checked this code again and again while correcting it again and again :(. Still, I don't know where the bug is. (Why does active[] keep appending!?)
Thank you for the precious time.

IndexError: index 10000 is out of bounds for axis 0 with size 10000

For my physics degree, I have to take some Python lessons. I'm an absolute beginner and as such, I can't understand other answers. The code is to plot an object's trajectory with air resistance. I would really appreciate a quick fix - I think it has something to do with the time variable being too small but increasing it doesn't help.
import matplotlib.pyplot as plt
import numpy as np
import math  # need math module for trigonometric functions

g = 9.81            # gravitational constant
dt = 1e-3           # integration time step (delta t)
v0 = 40             # initial speed at t = 0
angle = math.pi/4   # math.pi = 3.14, launch angle in radians

time = np.arange(0, 10, dt)     # time axis
vx0 = math.cos(angle)*v0        # starting velocity along x axis
vy0 = math.sin(angle)*v0        # starting velocity along y axis
xa = vx0*time                   # compute x coordinates
ya = -0.5*g*time**2 + vy0*time  # compute y coordinates

def traj_fric(angle, v0):           # function for trajectory
    vx0 = math.cos(angle) * v0      # for some launch angle and starting velocity
    vy0 = math.sin(angle) * v0      # compute x and y component of starting velocity

    x = np.zeros(len(time))         # initialise x and y arrays
    y = np.zeros(len(time))

    x[0], y[0] = 0, 0                               # projectile starts at 0,0
    x[1], y[1] = x[0] + vx0 * dt, y[0] + vy0 * dt   # second elements of x and
                                                    # y are determined by initial
                                                    # velocity
    i = 1
    while y[i] >= 0:        # conditional loop continues until
                            # projectile hits ground
        gamma = 0.005       # constant of friction
        height = 100        # height at which air friction disappears
        f = 0.5 * gamma * (height - y[i]) * dt
        x[i + 1] = (2 * x[i] - x[i - 1] + f * x[i - 1])/1 + f                 # numerical integration to find x[i + 1]
        y[i + 1] = (2 * y[i] - y[i - 1] + f * y[i - 1] - g * dt ** 2)/ 1 + f  # and y[i + 1]
        i = i + 1           # increment i for next loop

    x = x[0:i+1]    # truncate x and y arrays
    y = y[0:i+1]
    return x, y, (dt*i), x[i]   # return x, y, flight time, range of projectile

x, y, duration, distance = traj_fric(angle, v0)

fig1 = plt.figure()
plt.plot(xa, ya)    # plot y versus x
plt.xlabel("x")
plt.ylabel("y")
plt.ylim(0, max(ya) + max(ya)*0.2)
plt.xlim(0, distance + distance*0.1)
plt.show()

print "Distance:", distance
print "Duration:", duration

n = 5
angles = np.linspace(0, math.pi/2, n)
maxrange = np.zeros(n)

for i in range(n):
    x, y, duration, maxrange[i] = traj_fric(angles[i], v0)

angles = angles/2/math.pi*360   # convert rad to degrees
print "Optimum angle:", angles[np.where(maxrange==np.max(maxrange))]
The error is:
File "C:/Python27/Lib/site-packages/xy/projectile_fric.py", line 43, in traj_fric
x[i + 1] = (2 * x[i] - x[i - 1] + f * x[i - 1])/1 + f # numerical integration to find x[i + 1]
IndexError: index 10000 is out of bounds for axis 0 with size 10000
This is pretty straightforward. When you have a size of 10000, element index 10000 is out of bounds because indexing begins with 0, not 1. Therefore, the 10,000th element is index 9999, and anything larger than that is out of bounds.
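For instance, a minimal reproduction of the error:
import numpy as np

a = np.zeros(10000)
print(a[9999])    # fine: the last valid index
print(a[10000])   # IndexError: index 10000 is out of bounds for axis 0 with size 10000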
Mason Wheeler's answer told you what Python was telling you. The problem occurs in this loop:
while y[i] >= 0:    # conditional loop continues until
                    # projectile hits ground
    gamma = 0.005   # constant of friction
    height = 100    # height at which air friction disappears
    f = 0.5 * gamma * (height - y[i]) * dt
    x[i + 1] = (2 * x[i] - x[i - 1] + f * x[i - 1])/1 + f                 # numerical integration to find x[i + 1]
    y[i + 1] = (2 * y[i] - y[i - 1] + f * y[i - 1] - g * dt ** 2)/ 1 + f  # and y[i + 1]
    i = i + 1       # increment i for next loop
The simple fix is to change the loop to something like (I don't know Python syntax, so bear with me):
while (y[i] >= 0) and (i < len(time)):
That will stop the sim when you run out of array, but it will (potentially) also stop the sim with the projectile hanging in mid-air.
What you have here is a very simple ballistic projectile simulation, modeling atmospheric friction as a linear function of altitude. QUALITATIVELY, what is happening is that your projectile is not hitting the ground in the time you allowed, and you are attempting to overrun your tracking arrays. This is caused by failure to allow sufficient time-of-flight. Observe that the greatest possible time-of-flight occurs when atmospheric friction is zero, and it is then trivial to compute a closed-form upper bound for time-of-flight. You then use that upper bound as your time, and you will allocate sufficient array space to simulate the projectile all the way to impact.
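As a concrete sketch of that suggestion (my addition, reusing the question's parameters): with zero friction the flight time is 2*v0*sin(angle)/g, so sizing the time axis from that bound guarantees the arrays cannot be overrun.
import math
import numpy as np

g, dt, v0, angle = 9.81, 1e-3, 40, math.pi / 4
t_max = 2 * v0 * math.sin(angle) / g    # frictionless flight time is the longest possible
time = np.arange(0, t_max + dt, dt)     # arrays allocated with len(time) are now long enough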

Python double integral taking too long to compute

I am trying to compute the Fresnel integral over a grid of coordinates using dblquad, but it's taking very long and in the end it doesn't give any result.
Below is my code. In this code I integrated only over a 10 x 10 grid but I need to integrate at least over a 500 x 500 grid.
import time
st = time.time()

import pylab
import scipy.integrate as inte
import numpy as np

print 'imhere 0'

def sinIntegrand(y, x, X, Y):
    a = 0.0001
    R = 2e-3
    z = 10e-3
    Lambda = 0.5e-6
    alpha = 0.01
    k = np.pi * 2 / Lambda
    return np.cos(k * (((x-R)**2)*a + (R-(x**2 + y**2)) * np.tan(np.radians(alpha)) + ((x - X)**2 + (y - Y)**2) / (2 * z)))

print 'im here 1'

def cosIntegrand(y, x, X, Y):
    a = 0.0001
    R = 2e-3
    z = 10e-3
    Lambda = 0.5e-6
    alpha = 0.01
    k = np.pi * 2 / Lambda
    return np.sin(k * (((x-R)**2)*a + (R-(x**2 + y**2)) * np.tan(np.radians(alpha)) + ((x - X)**2 + (y - Y)**2) / (2 * z)))

def y1(x, R=2e-3):
    return (R**2 - x**2)**0.5

def y2(x, R=2e-3):
    return -1*(R**2 - x**2)**0.5

points = np.linspace(-1e-3, 1e-3, 10)
points2 = np.linspace(1e-3, -1e-3, 10)
yv, xv = np.meshgrid(points, points2)

#def integrate_on_grid(func, lo, hi, y1, y2):
#    """Returns a callable that can be evaluated on a grid."""
#    return np.vectorize(lambda n, m: dblquad(func, lo, hi, y1, y2, (n, m))[0])
#
#intensity = abs(integrate_on_grid(sinIntegrand, -1e-3, 1e-3, y1, y2)(yv, xv))**2 + abs(integrate_on_grid(cosIntegrand, -1e-3, 1e-3, y1, y2)(yv, xv))**2

Intensity = []
print 'im here2'
for i in points:
    row = []
    for j in points2:
        print 'im here'
        intensity = abs(inte.dblquad(sinIntegrand, -1e-3, 1e-3, y1, y2, (i, j))[0])**2 + abs(inte.dblquad(cosIntegrand, -1e-3, 1e-3, y1, y2, (i, j))[0])**2
        row.append(intensity)
    Intensity.append(row)

Intensity = np.asarray(Intensity)
pylab.imshow(Intensity, cmap='gray')
pylab.show()
print str(time.time() - st)
I would really appreciate it if you could suggest a better way of doing this.
Using scipy.integrate.dblquad to calculate every pixel of your image is going to be slow in any case.
You should try rewriting your mathematical problem so that you can use some classical function from scipy.special instead. For instance, scipy.special.fresnel might work, although it is 1D and your problem seems to be 2D. Otherwise, note that there is a relationship between the Fresnel integral and the incomplete Gamma function (scipy.special.gammainc), if that helps.
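For example, a minimal sketch of calling scipy.special.fresnel (it evaluates the 1D Fresnel sine and cosine integrals S(x) and C(x) over a whole array at once; whether the 2D integrand above can be rewritten in terms of it is a separate exercise):
import numpy as np
from scipy.special import fresnel

x = np.linspace(0, 5, 11)
S, C = fresnel(x)        # vectorized and effectively instantaneous
print(S[-1], C[-1])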
If none of this works, as a last resort you can spend time optimizing your code and porting it to Cython. This will probably give a speed-up of a factor of 10 to 100 (see this answer), though that wouldn't be enough to go from a 10 x 10 grid to a 500 x 500 grid.

Python implementation of the Wilson Score Interval?

After reading How Not to Sort by Average Rating, I was curious if anyone has a Python implementation of a Lower bound of Wilson score confidence interval for a Bernoulli parameter?
Reddit uses the Wilson score interval for comment ranking; an explanation and Python implementation can be found here:
#Rewritten code from /r2/r2/lib/db/_sorts.pyx
from math import sqrt

def confidence(ups, downs):
    n = ups + downs
    if n == 0:
        return 0
    z = 1.0  # 1.44 = 85%, 1.96 = 95%
    phat = float(ups) / n
    return ((phat + z*z/(2*n) - z * sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n))
I think this one uses the Wilson formula incorrectly: if you have 1 up and 0 down you get NaN, because you can't take the sqrt of a negative value.
The correct formula can be found by looking at the Ruby example from the article How Not to Sort by Average Rating:
return ((phat + z*z/(2*n) - z * sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n))
To get the Wilson CI without continuity correction, you can use proportion_confint in statsmodels.stats.proportion. To get the Wilson CI with continuity correction, you can use the code below.
# cf.
# [1] R. G. Newcombe. Two-sided confidence intervals for the single proportion, 1998
# [2] R. G. Newcombe. Interval Estimation for the difference between independent proportions: comparison of eleven methods, 1998
import numpy as np
from statsmodels.stats.proportion import proportion_confint

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def propci_wilson_cc(count, nobs, alpha=0.05):
    # get confidence limits for proportion
    # using wilson score method w/ cont correction
    # i.e. Method 4 in Newcombe [1];
    # verified via Table 1
    from scipy import stats
    n = nobs
    p = count/n
    q = 1.-p
    z = stats.norm.isf(alpha / 2.)
    z2 = z**2
    denom = 2*(n+z2)
    num = 2.*n*p+z2-1.-z*np.sqrt(z2-2-1./n+4*p*(n*q+1))
    ci_l = num/denom
    num = 2.*n*p+z2+1.+z*np.sqrt(z2+2-1./n+4*p*(n*q-1))
    ci_u = num/denom
    if p == 0:
        ci_l = 0.
    elif p == 1:
        ci_u = 1.
    return ci_l, ci_u


def dpropci_wilson_nocc(a, m, b, n, alpha=0.05):
    # get confidence limits for difference in proportions
    #   a/m - b/n
    # using wilson score method WITHOUT cont correction
    # i.e. Method 10 in Newcombe [2]
    # verified via Table II
    theta = a/m - b/n
    l1, u1 = proportion_confint(count=a, nobs=m, alpha=0.05, method='wilson')
    l2, u2 = proportion_confint(count=b, nobs=n, alpha=0.05, method='wilson')
    ci_u = theta + np.sqrt((a/m-u1)**2+(b/n-l2)**2)
    ci_l = theta - np.sqrt((a/m-l1)**2+(b/n-u2)**2)
    return ci_l, ci_u


def dpropci_wilson_cc(a, m, b, n, alpha=0.05):
    # get confidence limits for difference in proportions
    #   a/m - b/n
    # using wilson score method w/ cont correction
    # i.e. Method 11 in Newcombe [2]
    # verified via Table II
    theta = a/m - b/n
    l1, u1 = propci_wilson_cc(count=a, nobs=m, alpha=alpha)
    l2, u2 = propci_wilson_cc(count=b, nobs=n, alpha=alpha)
    ci_u = theta + np.sqrt((a/m-u1)**2+(b/n-l2)**2)
    ci_l = theta - np.sqrt((a/m-l1)**2+(b/n-u2)**2)
    return ci_l, ci_u


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# single proportion testing
# these come from Newcombe [1] (Table 1)
a_vec = np.array([81, 15, 0, 1])
m_vec = np.array([263, 148, 20, 29])
for (a, m) in zip(a_vec, m_vec):
    l1, u1 = proportion_confint(count=a, nobs=m, alpha=0.05, method='wilson')
    l2, u2 = propci_wilson_cc(count=a, nobs=m, alpha=0.05)
    print(a, m, l1, u1, ' ', l2, u2)


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# difference in proportions testing
# these come from Newcombe [2] (Table II)
a_vec = np.array([56, 9, 6, 5, 0, 0, 10, 10], dtype=float)
m_vec = np.array([70, 10, 7, 56, 10, 10, 10, 10], dtype=float)
b_vec = np.array([48, 3, 2, 0, 0, 0, 0, 0], dtype=float)
n_vec = np.array([80, 10, 7, 29, 20, 10, 20, 10], dtype=float)

print('\nWilson without CC')
for (a, m, b, n) in zip(a_vec, m_vec, b_vec, n_vec):
    l, u = dpropci_wilson_nocc(a, m, b, n, alpha=0.05)
    print('{:2.0f}/{:2.0f}-{:2.0f}/{:2.0f} ; {:6.4f} ; {:8.4f}, {:8.4f}'.format(a, m, b, n, a/m-b/n, l, u))

print('\nWilson with CC')
for (a, m, b, n) in zip(a_vec, m_vec, b_vec, n_vec):
    l, u = dpropci_wilson_cc(a, m, b, n, alpha=0.05)
    print('{:2.0f}/{:2.0f}-{:2.0f}/{:2.0f} ; {:6.4f} ; {:8.4f}, {:8.4f}'.format(a, m, b, n, a/m-b/n, l, u))
HTH
The accepted solution seems to use a hard-coded z-value (best for performance).
In the event that you wanted a direct python equivalent of the ruby formula from the blogpost with a dynamic z-value (based on the confidence interval):
import math
import scipy.stats as st

def ci_lower_bound(pos, n, confidence):
    if n == 0:
        return 0
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    phat = 1.0 * pos / n
    return (phat + z * z / (2 * n) - z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n)
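A quick usage sketch with made-up numbers; confidence=0.95 gives z of about 1.96, matching the hard-coded value in the accepted answer:
print(ci_lower_bound(85, 100, 0.95))   # lower Wilson bound for 85 successes out of 100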
If you'd like to actually calculate z directly from a confidence bound and want to avoid installing numpy/scipy, you can use the following snippet of code,
import math


def binconf(p, n, c=0.95):
    '''
    Calculate binomial confidence interval based on the number of positive and
    negative events observed. Uses Wilson score and approximations to inverse
    of normal cumulative density function.

    Parameters
    ----------
    p : int
        number of positive events observed
    n : int
        number of negative events observed
    c : optional, [0,1]
        confidence percentage. e.g. 0.95 means 95% confident the probability of
        success lies between the 2 returned values

    Returns
    -------
    theta_low : float
        lower bound on confidence interval
    theta_high : float
        upper bound on confidence interval
    '''
    p, n = float(p), float(n)
    N = p + n
    if N == 0.0:
        return (0.0, 1.0)
    p = p / N
    z = normcdfi(1 - 0.5 * (1 - c))
    a1 = 1.0 / (1.0 + z * z / N)
    a2 = p + z * z / (2 * N)
    a3 = z * math.sqrt(p * (1 - p) / N + z * z / (4 * N * N))
    return (a1 * (a2 - a3), a1 * (a2 + a3))


def erfi(x):
    """Approximation to inverse error function"""
    a = 0.147  # MAGIC!!!
    a1 = math.log(1 - x * x)
    a2 = (
        2.0 / (math.pi * a)
        + a1 / 2.0
    )
    return (
        sign(x) *
        math.sqrt(math.sqrt(a2 * a2 - a1 / a) - a2)
    )


def sign(x):
    if x < 0: return -1
    if x == 0: return 0
    if x > 0: return 1


def normcdfi(p, mu=0.0, sigma2=1.0):
    """Inverse CDF of normal distribution"""
    if mu == 0.0 and sigma2 == 1.0:
        return math.sqrt(2) * erfi(2 * p - 1)
    else:
        return mu + math.sqrt(sigma2) * normcdfi(p)
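A quick usage sketch with made-up counts; note that p and n here are the numbers of positive and negative events, not a proportion and a total:
print(binconf(85, 15, 0.95))   # CI for 85 positives vs 15 negatives, roughly (0.77, 0.91)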
Here is a simplified (no need for numpy) and slightly improved (k = 0 and k = n no longer cause a math domain error) version of the Wilson score confidence interval with continuity correction, based on the source code written by batesbatesbates in another answer, plus a pure-Python, no-numpy version without continuity correction, with two equivalent ways to calculate it (switchable via the eqmode argument; both ways give exactly the same results):
import math

def propci_wilson_nocc(k, n, z=1.96, eqmode=0):
    # Calculates the Binomial Proportion Confidence Interval using the Wilson Score method without continuation correction
    # Equations eqmode == 1 from: https://en.wikipedia.org/w/index.php?title=Binomial_proportion_confidence_interval&oldid=1101942017#Wilson_score_interval
    # Equations eqmode == 0 from: https://www.evanmiller.org/how-not-to-sort-by-average-rating.html
    # The results should be close to:
    #     from statsmodels.stats.proportion import proportion_confint
    #     proportion_confint(k, n, alpha=0.05, method='wilson')
    # z=1.44 = 85%, 1.96 = 95%
    if n == 0:
        return 0
    p_hat = float(k) / n
    z2 = z**2
    if eqmode == 0:
        ci_l = (p_hat + z2/(2*n) - z*math.sqrt(max(0.0, (p_hat*(1 - p_hat) + z2/(4*n))/n))) / (1 + z2 / n)
    else:
        ci_l = (1.0 / (1.0 + z2/n)) * (p_hat + z2/(2*n)) - (z / (1 + z2/n)) * math.sqrt(max(0.0, (p_hat*(1 - p_hat)/n + z2/(4*(n**2)))))
    if eqmode == 0:
        ci_u = (p_hat + z2/(2*n) + z*math.sqrt(max(0.0, (p_hat*(1 - p_hat) + z2/(4*n))/n))) / (1 + z2 / n)
    else:
        ci_u = (1.0 / (1.0 + z2/n)) * (p_hat + z2/(2*n)) + (z / (1 + z2/n)) * math.sqrt(max(0.0, (p_hat*(1 - p_hat)/n + z2/(4*(n**2)))))
    return [ci_l, ci_u]

def propci_wilson_cc(n, k, z=1.96):
    # Calculates the Binomial Proportion Confidence Interval using the Wilson Score method with continuation correction
    # i.e. Method 4 in Newcombe [1]: R. G. Newcombe. Two-sided confidence intervals for the single proportion, 1998;
    # verified via Table 1
    # originally written by batesbatesbates https://stackoverflow.com/questions/10029588/python-implementation-of-the-wilson-score-interval/74021634#74021634
    p_hat = k/n
    q = 1.0 - p_hat  # note: the original post had `q = 1.0 - p`, with `p` undefined
    z2 = z**2
    denom = 2*(n+z2)
    num = 2.0*n*p_hat + z2 - 1.0 - z*math.sqrt(max(0.0, z2 - 2 - 1.0/n + 4*p_hat*(n*q + 1)))
    ci_l = num/denom
    num2 = 2.0*n*p_hat + z2 + 1.0 + z*math.sqrt(max(0.0, z2 + 2 - 1.0/n + 4*p_hat*(n*q - 1)))
    ci_u = num2/denom
    if p_hat == 0:
        ci_l = 0.0
    elif p_hat == 1:
        ci_u = 1.0
    return [ci_l, ci_u]
Note that the returned values will always be bounded between [0.0, 1.0] (because p_hat is the ratio k/n); this is why it's a score rather than a raw confidence interval. It's easy to project back onto the scale of k by multiplying ci_l * n and ci_u * n; those values are in the same domain as k and can be plotted alongside it.
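For example (made-up values):
ci_l, ci_u = propci_wilson_nocc(k=8, n=10)
print(ci_l, ci_u)              # bounds as proportions in [0, 1]
print(ci_l * 10, ci_u * 10)    # projected back onto the same domain as k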
Here is a much more readable version of how to compute the Wilson score interval without continuity correction, by Bartosz Mikulski:
from math import sqrt

def wilson(p, n, z=1.96):
    denominator = 1 + z**2/n
    centre_adjusted_probability = p + z*z / (2*n)
    adjusted_standard_deviation = sqrt((p*(1 - p) + z*z / (4*n)) / n)

    lower_bound = (centre_adjusted_probability - z*adjusted_standard_deviation) / denominator
    upper_bound = (centre_adjusted_probability + z*adjusted_standard_deviation) / denominator
    return (lower_bound, upper_bound)
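Usage sketch (made-up values); note that p here is the observed proportion, not the raw count of successes:
print(wilson(0.85, 100))   # approximately (0.767, 0.907), same as proportion_confint(85, 100, method='wilson')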
