Need help vectorizing a mathematical model written in Python
I've recently written a Python implementation of the Beddington-DeAngelis model, which is used for modeling predator and prey populations.
My issue is that the code is extremely slow: 10,000 iterations take 230 seconds, while this program has to be able to iterate one million times in a reasonable time frame.
I understand that I could just rewrite it in C, since this is mostly just math, but I really want to learn how to properly vectorize a program like this in Python.
To simplify the situation: I have two arrays of shape 200x200, and I need to update every element of each array using the element at the same index in the other array plus some surrounding elements of the same array. So, for example, if I'm working on a[1][1], I will also need:
b[1][1]
a[0][1]
a[2][1]
a[1][0]
a[1][2]
The entire operation should be fully vectorizable, because I am changing all 200x200x2 elements in a single time step.
So how can I fetch these indices for every element at once, so that the whole update is done with array operations instead of per-element calls?
Any advice would be greatly appreciated.
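To make the access pattern concrete, here is a rough sketch (the array names are just placeholders) of what one update step needs for every interior cell, written with shifted slices rather than per-element indexing; this is roughly the kind of whole-array formulation I am trying to reach:

import numpy as np

a = np.random.rand(200, 200)   # e.g. prey grid
b = np.random.rand(200, 200)   # e.g. predator grid

# For every interior cell a[i, j] (1 <= i, j <= 198), grab the same-index
# element of b and the four orthogonal neighbors of a in one shot:
center = a[1:-1, 1:-1]
other  = b[1:-1, 1:-1]   # same-index element of the other array
up     = a[:-2, 1:-1]    # a[i-1, j]
down   = a[2:,  1:-1]    # a[i+1, j]
left   = a[1:-1, :-2]    # a[i, j-1]
right  = a[1:-1, 2:]     # a[i, j+1]

# Any elementwise expression over these views updates all cells at once,
# e.g. a 5-point Laplacian that can then be combined with `other`:
lap = (up + down + left + right - 4 * center) / 0.25**2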
Full code for context: (it looks intimidating, but is actually really straightforward)
import numpy as np
import copy
import time

def get_cell_zero_flux(m,i,j,x,y,prey):
    """
    Fetch an array element that could be outside of the border
    """
    if i >= x or i < 0 or j >= y or j < 0:
        if prey: return 0.43058
        else: return 0.718555
    return m[i][j]

def get_laplacian(n,i,j,x,y,neighbors,h,prey):
    """
    Generate the laplacian value for the given element
    """
    total = 0
    for ng in neighbors:
        cell = get_cell_zero_flux(n,i+ng[0],j+ng[1],x,y,prey)
        total += cell
    return (total - 4*n[i][j]) / (h**2)

def next_n(n,p,nl,pl,d11,d12,d21,d22,t,r,e,beta,k,ni,w,b):
    """
    Integrate prey population function
    """
    return n + t * (r * ( 1 - n / k ) * n
                    - beta * n / ( b + n + w * p ) * p + d11 * nl + d12 * pl)

def next_p(n,p,nl,pl,d11,d12,d21,d22,t,r,e,beta,k,ni,w,b):
    """
    Integrate predator population function
    """
    return p + t * (e * beta * n / ( b + n + w * p )
                    * p - ni * p + d21 * nl + d22 * pl)

def generate_preys(x,y,epsilon,n_start):
    """
    Generate the initial population of preys
    """
    n = np.random.rand(x, y)
    n = np.interp(n,(n.min(),n.max()),(-epsilon/2,epsilon/2))
    n = n + n_start
    return n

def generate_predators(x,y,p_start):
    """
    Generate the initial population of predators
    """
    p = np.ones((x,y))
    p.fill(p_start)
    return p

def generate_n(n0,n,p,x,y,neighbors,h,d11,d12,t,r,e,beta,k,ni,w,b):
    """
    Vectorized element iteration attempt for preys
    """
    i,j = np.where(n==n0) # this wouldn't work, need the current element
    n_laplacian = get_laplacian(n,i,j,x,y,neighbors,h,True)
    p_laplacian = get_laplacian(p,i,j,x,y,neighbors,h,False)
    p0 = p[i,j]
    return next_n(n0,p0,laplacian,d11,d12,t,r,e,beta,k,ni,w,b)

def generate_p(p0,p,n,x,y,neighbors,h,d21,d22,t,r,e,beta,k,ni,w,b):
    """
    Vectorized element iteration attempt for predators
    """
    i,j = np.where(p==p0) # this wouldn't work, need the current element
    n_laplacian = get_laplacian(n,i,j,x,y,neighbors,h,True)
    p_laplacian = get_laplacian(p,i,j,x,y,neighbors,h,False)
    n0 = n[i,j]
    return next_p(n0,p0,n_laplacian,
                  p_laplacian,d11,d12,d21,d22,t,r,e,beta,k,ni,w,b)

def generate_system(x,y,h,d11,d12,d21,d22,t,r,e,
                    beta,k,ni,w,b,ite,n_start,p_start,epsilon):
    """
    System generation
    """
    # Initial distribution
    n = generate_preys(x,y,epsilon,n_start)
    p = generate_predators(x,y,p_start)
    #n = n.tolist()
    #p = p.tolist()
    ps = []
    ns = []
    # neighbor list for easy laplacian neighbor fetch
    neighbors = [[-1,0],[1,0],[0,1],[0,-1]]
    t1 = time.time()
    for it in range(ite):
        # record each iteration
        old_n = copy.copy(n)
        old_p = copy.copy(p)
        ns.append(old_n)
        ps.append(old_p)
        # main array element iteration for prey and predator arrays
        for i in range(x):
            for j in range(y):
                n_laplacian = get_laplacian(old_n,i,j,x,y,neighbors,h,True)
                p_laplacian = get_laplacian(old_p,i,j,x,y,neighbors,h,False)
                n0 = old_n[i][j]
                p0 = old_p[i][j]
                n[i][j] = next_n(n0,p0,n_laplacian,p_laplacian,
                                 d11,d12,d21,d22,t,r,e,beta,k,ni,w,b)
                p[i][j] = next_p(n0,p0,n_laplacian,p_laplacian,
                                 d11,d12,d21,d22,t,r,e,beta,k,ni,w,b)
        """
        n = generate_n(old_n,old_n,old_p,x,y,neighbors,
                       h,d11,d12,t,r,e,beta,k,ni,w,b)
        p = generate_p(old_p,old_p,old_n,x,y,neighbors,
                       h,d21,d22,t,r,e,beta,k,ni,w,b)
        """
    t2 = time.time()
    print(t2-t1)
    return ns,ps

ns,ps = generate_system(x=50,y=50,h=0.25,d11=0.01,d12=0.0115,d21=0.01,d22=1,
                        t=0.01,r=0.5,e=1,beta=0.6,k=2.6,ni=0.25,w=0.4,b=0.3154,
                        ite=10,n_start=0.43058,p_start=0.718555,epsilon=0.001)
The goal is to run one million iterations on a 200x200 grid in a few minutes, but the current code takes 230 seconds for just 10,000 iterations on a 40x40 grid.
EDIT
I managed to vectorize the whole program. The performance boost was roughly 400x. WOW.
Here is the new code:
import numpy as np
import copy
import time

def next_n(n,p,nl,pl,d11,d12,d21,d22,t,r,e,beta,k,ni,w,b):
    """
    Integrate prey population function
    """
    return n + t * (r * ( 1 - n / k ) * n
                    - beta * n / ( b + n + w * p ) * p + d11 * nl + d12 * pl)

def next_p(n,p,nl,pl,d11,d12,d21,d22,t,r,e,beta,k,ni,w,b):
    """
    Integrate predator population function
    """
    return p + t * (e * beta * n / ( b + n + w * p )
                    * p - ni * p + d21 * nl + d22 * pl)

def generate_preys(x,y,epsilon,n_start):
    """
    Generate the initial population of preys
    """
    n = np.random.rand(x, y)
    n = np.interp(n,(n.min(),n.max()),(-epsilon/2,epsilon/2))
    n = n + n_start
    n[0,:] = n_start
    n[-1:,:] = n_start
    n[:,0] = n_start
    n[:,-1:] = n_start
    return n

def generate_predators(x,y,p_start):
    """
    Generate the initial population of predators
    """
    p = np.ones((x,y))
    p.fill(p_start)
    return p
def get_laps(a,x,y,h):
    """
    Compute the 5-point Laplacian of the interior of `a` in one shot,
    using shifted slices instead of per-element indexing.
    """
    center = a[1:-1,1:-1]
    left = a[1:-1,0:-2]
    right = a[1:-1,2:]
    top = a[0:-2,1:-1]
    bottom = a[2:,1:-1]
    return (left+right+top+bottom - 4*center) / (h**2)
def generate_system(x,y,h,d11,d12,d21,d22,t,r,e,
                    beta,k,ni,w,b,ite,n_start,p_start,epsilon):
    """
    System generation
    """
    # Initial distribution (with a one-cell border around the working grid)
    n = generate_preys(x+2,y+2,epsilon,n_start)
    p = generate_predators(x+2,y+2,p_start)
    ps = []
    ns = []
    t1 = time.time()
    for it in range(ite):
        if it % 10000 == 0:
            print(f"iterations passed: {it}")
        # record each iteration
        ns.append(copy.copy(n))
        ps.append(copy.copy(p))
        nl = get_laps(n,x,y,h)
        pl = get_laps(p,x,y,h)
        nc = n[1:-1,1:-1]
        pc = p[1:-1,1:-1]
        n[1:-1,1:-1] = next_n(nc,pc,nl,pl,d11,d12,d21,d22,t,r,e,beta,k,ni,w,b)
        p[1:-1,1:-1] = next_p(nc,pc,nl,pl,d11,d12,d21,d22,t,r,e,beta,k,ni,w,b)
    t2 = time.time()
    print(f"Time taken: {t2-t1}")
    return ns,ps

ns,ps = generate_system(x=200,y=200,h=0.25,d11=0.01,d12=0.0115,d21=0.01,d22=1,
                        t=0.01,r=0.5,e=1,beta=0.6,k=2.6,ni=0.25,w=0.4,b=0.3154,
                        ite=100,n_start=0.43058,p_start=0.718555,epsilon=0.001)
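As a quick sanity check (not part of the original post), the vectorized Laplacian can be compared against a plain double loop on a small array; assuming get_laps above is already defined, this should print True:

import numpy as np

def loop_laplacian(a, h):
    """Reference 5-point Laplacian of the interior, computed element by element."""
    out = np.zeros((a.shape[0] - 2, a.shape[1] - 2))
    for i in range(1, a.shape[0] - 1):
        for j in range(1, a.shape[1] - 1):
            out[i - 1, j - 1] = (a[i - 1, j] + a[i + 1, j] +
                                 a[i, j - 1] + a[i, j + 1] - 4 * a[i, j]) / h**2
    return out

a = np.random.rand(6, 6)
h = 0.25
print(np.allclose(get_laps(a, 4, 4, h), loop_laplacian(a, h)))  # expect True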
Related
How to use argmin() and find minimum value from array
I'm new to python so the code may not be the best. I'm trying to find the minimum Total Cost (TotalC) and the corresponding m,k and xM values that go with this minimum cost. I'm not sure how to do this. I have tried using min(TotalC) however this gives an error within the loop or outside the loop only returns the value of TotalC and not the corresponding m, k, and xM values. Any help would be appreciated. This section is at the end of the code, I have included my entire code. I have tried using minIndex = TotalC.argmin() but I'm not sure how to use it and it only returns 0 each time. import numpy as np import matplotlib.pyplot as plt def Load(x): Fpeak = (1000 + (9*(x**2) - (183*x))) *1000 #Fpeak in N td = (20 - ((0.12)*(x**2)) + (4.2*(x))) / 1000 #td in s return Fpeak, td ##################################################################################################### ####################### Part 2 ######################## def displacement(m,k,x,dt): #Displacement function Fpeak, td = Load(x) #Load Function from step 1 w = np.sqrt(k/m) # Natural circular frequency T = 2 * np.pi /w #Natural period of blast (s) time = np.arange(0,2*T,0.001) #Time array with range (0 - 2*T) with steps of 2*T/100 zt = [] #Create a lsit to store displacement values for t in time: if (t <= td): zt.append((Fpeak/k) * (1 - np.cos(w*t)) + (Fpeak/(k*td)) * ((np.sin(w*t)/w) - t)) else: zt.append((Fpeak/(k*w*td)) * (np.sin(w*t) - np.sin(w*(t-td))) - ((Fpeak/k) * np.cos(w*t))) zmax=max(zt) #Find the max displacement from the list of zt values return zmax #Return max displacement k = 1E6 m = 200 dt = 0.0001 x = 0 z = displacement(m,k,x,dt) ################################################################################### ############### Part 3 ####################### # k = 1E6 , m = 200kg , Deflection = 0.1m k_values = np.arange(1E6, 7E6, ((7E6-1E6)/10)) #List of k values between min and max (1E6 and 7E6). m_values = np.arange(200,1200,((1200-200)/10)) #List of m values between min and max 200kg and 1200kg xM = [] for k in k_values: # values of k for m in m_values: # values of m within k for loop def bisector(m,k,dpoint,dt): #dpoint = decimal point accuracy xL = 0 xR = 10 xM = (xL + xR)/2 zmax = 99 while round(zmax, dpoint) !=0.1: zmax = displacement(m,k,xM,dt) if zmax > 0.1: xL = xM xM = (xL + xR)/2 else: xR = xM xM = (xL + xR)/2 return xM xM = bisector(m, k, 4, 0.001) print('xM value =',xM) ##################################################### #######Step 4 def cost (m,k,xM): Ck = 900 + 825*((k/1E6)**2) - (1725*(k/1E6)) Cm = 10*m - 2000 Cx = 2400*((xM**2)/4) TotalC = Ck + Cm + Cx minIndex = TotalC.argmin(0) print(minIndex) return TotalC TotalC = cost(m, k, xM) minIndex = TotalC.argmin() print(minIndex) print([xM, m, k, TotalC])
argmin() returns the index of the minimum value. If you are looking for the minimum itself, try using .min(). There is also a possibility that 0 really is the lowest value in your array, so bear that in mind.
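For instance, a minimal illustration of the difference, and of how the index from argmin() can be used to recover the corresponding parameters (the arrays here are made up for the example):

import numpy as np

total_cost = np.array([14.2, 9.7, 11.3, 9.1])    # hypothetical costs
m_values   = np.array([200., 300., 400., 500.])  # parameters tried alongside them

print(total_cost.min())     # 9.1  -> the minimum value itself
idx = total_cost.argmin()
print(idx)                  # 3    -> the index of that minimum
print(m_values[idx])        # 500.0 -> the parameter that produced it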
NIST Suite Test for Nonlinear dynamical system
In my following code i m running a lorentz chaotic equation from which i will get random numbers in terms of xs , ys and zs import numpy as np def lorenz(x, y, z, a=10,b=8/3,c=28 ): x_dot = a*(y -x) y_dot = - y +c*x - x*z z_dot = -b*z + x*y return x_dot, y_dot, z_dot dt = 0.01 num_steps = 10000 # Need one more for the initial values xs = np.empty(num_steps + 1) ys = np.empty(num_steps + 1) zs = np.empty(num_steps + 1) # Set initial values xs[0], ys[0], zs[0]= (1,1,1) # Step through "time", calculating the partial derivatives at the current point # and using them to estimate the next point for i in range(num_steps): x_dot, y_dot, z_dot= lorenz(xs[i], ys[i], zs[i]) xs[i + 1] = xs[i] + (x_dot * dt) ys[i + 1] = ys[i] + (y_dot * dt) zs[i + 1] = zs[i] + (z_dot * dt) I am actually trying to test the xs, ys and zs value for random number generating test via NIST 800 by using the code below from __future__ import print_function import math from fractions import Fraction from scipy.special import gamma, gammainc, gammaincc # from gamma_functions import * import numpy import cmath import random #ones_table = [bin(i)[2:].count('1') for i in range(256)] def count_ones_zeroes(bits): ones = 0 zeroes = 0 for bit in bits: if (bit == 1): ones += 1 else: zeroes += 1 return (zeroes,ones) def runs_test(bits): n = len(bits) zeroes,ones = count_ones_zeroes(bits) prop = float(ones)/float(n) print(" prop ",prop) tau = 2.0/math.sqrt(n) print(" tau ",tau) if abs(prop-0.5) > tau: return (False,0.0,None) vobs = 1.0 for i in range(n-1): if bits[i] != bits[i+1]: vobs += 1.0 print(" vobs ",vobs) p = math.erfc(abs(vobs - (2.0*n*prop*(1.0-prop)))/(2.0*math.sqrt(2.0*n)*prop*(1-prop) )) success = (p >= 0.01) return (success,p,None) print(runs_test(xs)) #%% from __future__ import print_function import math def count_ones_zeroes(bits): ones = 0 zeroes = 0 for bit in bits: if (bit == 1): ones += 1 else: zeroes += 1 return (zeroes,ones) def monobit_test(bits): n = len(bits) zeroes,ones = count_ones_zeroes(bits) s = abs(ones-zeroes) print(" Ones count = %d" % ones) print(" Zeroes count = %d" % zeroes) p = math.erfc(float(s)/(math.sqrt(float(n)) * math.sqrt(2.0))) success = (p >= 0.01) return (success,p,None) print(runs_test(xs)) the output which i m getting is false i.e output: prop 0.00019998000199980003 tau 0.01999900007499375 (False, 0.0, None) what should i do now?
The Lorenz system is chaotic, not random. You implemented the differential equation solver well, but it seems that count_ones_zeroes doesn't do what its name implies, at least not on the data you provide: on xs it returns (zeroes, ones) = (9999, 2), which is not what you want. The code compares each value in the xs array, i.e. an x value (e.g. 8.2), against 1, but x is a float roughly between -20 and 20, so it will usually not equal 1 and will be counted as a zero; only x == 1 is counted as a one. Also, in Python, int/int results in a float, so there is no need to cast to float, in contrast to e.g. C or C++: instead of prop = float(ones)/float(n) you can write prop = ones/n. Similar statements hold for +, - and *.
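The NIST-style tests expect a sequence of bits, so the float trajectory has to be binarized first. As one possible (hypothetical) choice, thresholding each sample against the median of the series gives a roughly balanced bit stream that the existing test functions can consume:

import numpy as np

# xs is the Lorenz trajectory computed above; this is just one way to
# turn it into bits, not the only (or an officially NIST-sanctioned) one.
threshold = np.median(xs)
bits = [1 if x > threshold else 0 for x in xs]

print(runs_test(bits))     # now operates on 0/1 values as intended
print(monobit_test(bits))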
Monte Carlo simulation of a system of Lennard-Jones + FENE potential
I want to generate two linear chains of 20 monomers each at some distance to each other. The following code generates a single chain. Could someone help me with how to generate the second chain? The two chains are fixed to a surface i.e the first monomer of the chain is fixed and the rest of the monomers move freely in x-y-z directions but the z component of the monomers should be positive. Something like this: import numpy as np import numba as nb #import pandas as pd #nb.jit() def gen_chain(N): x = np.zeros(N) y = np.zeros(N) z = np.linspace(0, (N)*0.9, num=N) return np.column_stack((x, y, z)), np.column_stack((x1, y1, z1)) #coordinates = np.loadtxt('2GN_50_T_10.txt', skiprows=199950) #return coordinates #nb.jit() def lj(rij2): sig_by_r6 = np.power(sigma**2 / rij2, 3) sig_by_r12 = np.power(sigma**2 / rij2, 6) lje = 4 * epsilon * (sig_by_r12 - sig_by_r6) return lje #nb.jit() def fene(rij2): return (-0.5 * K * np.power(R, 2) * np.log(1 - ((np.sqrt(rij2) - r0) / R)**2)) #nb.jit() def total_energy(coord): # Non-bonded energy. e_nb = 0.0 for i in range(N): for j in range(i - 1): ri = coord[i] rj = coord[j] rij = ri - rj rij2 = np.dot(rij, rij) if (rij2 < rcutoff_sq): e_nb += lj(rij2) # Bonded FENE potential energy. e_bond = 0.0 for i in range(1, N): ri = coord[i] rj = coord[i - 1] # Can be [i+1] ?? rij = ri - rj rij2 = np.dot(rij, rij) e_bond += fene(rij2) return e_nb + e_bond #nb.jit() def move(coord): trial = np.ndarray.copy(coord) for i in range(1, N): while True: delta = (2 * np.random.rand(3) - 1) * max_delta trial[i] += delta #while True: if trial[i,2] > 0.0: break trial[i] -= delta return trial #nb.jit() def accept(delta_e): beta = 1.0 / T if delta_e < 0.0: return True random_number = np.random.rand(1) p_acc = np.exp(-beta * delta_e) if random_number < p_acc: return True return False if __name__ == "__main__": # FENE potential parameters. K = 40.0 R = 0.3 r0 = 0.7 # L-J potential parameters sigma = 0.5716 epsilon = 1.0 # MC parameters N = 20 # Numbers of monomers rcutoff = 2.5 * sigma rcutoff_sq = rcutoff * rcutoff max_delta = 0.01 n_steps = 100000 T = 10 # MAIN PART OF THE CODE coord = gen_chain(N) energy_current = total_energy(coord) traj = open('2GN_20_T_10.xyz', 'w') traj_txt = open('2GN_20_T_10.txt', 'w') for step in range(n_steps): if step % 1000 == 0: traj.write(str(N) + '\n\n') for i in range(N): traj.write("C %10.5f %10.5f %10.5f\n" % (coord[i][0], coord[i][1], coord[i][2])) traj_txt.write("%10.5f %10.5f %10.5f\n" % (coord[i][0], coord[i][1], coord[i][2])) print(step, energy_current) coord_trial = move(coord) energy_trial = total_energy(coord_trial) delta_e = energy_trial - energy_current if accept(delta_e): coord = coord_trial energy_current = energy_trial traj.close() I except the chain of particles to collapse into a globule.
There is a problem with the logic of the MC you are implementing. To perform an MC step you need to ATTEMPT a move, evaluate the energy of the new state, and then accept or reject it according to a random number. In your code there is not the slightest sign of an attempt to move a particle. You need to move one (or more) of them, evaluate the energy, and then update your coordinates. By the way, I suppose this is not your entire code; there are many parameters that are not defined, like the "k" and the "R0" in your fene potential.
The FENE potential models bond interactions. What your code is saying is that all particles within the cutoff are bonded by FENE springs, and that the bonds are not fixed but rather defined by the cutoff. With r_cutoff = 3.0, larger than the equilibrium distance of the LJ well, you are essentially considering each particle as bonded to potentially many others; you are treating the FENE potential as a non-bonded one. For the bond interactions you should ignore the cutoff and only evaluate the energy for the pairs that are actually bonded according to your topology, which means that first you need to define a topology. I suggest generating a linear molecule of N atoms in a box big enough to contain the whole stretched molecule, and considering the i-th atom as bonded to the (i-1)-th atom, with i = 2, ..., N. In this way the topology is well defined and persistent. Then consider both interactions separately, non-bonded and bond, and add them at the end. Something like this, in pseudo-code:

e_nb = 0
for particle i = 1 to N:
    for particle j = 1 to i-1:
        if (dist(i, j) < rcutoff):
            e_nb += lj(i, j)

e_bond = 0
for particle i = 2 to N:
    e_bond += fene(i, i-1)

e_tot = e_nb + e_bond
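A direct Python rendering of that pseudo-code, reusing the lj(rij2) and fene(rij2) helpers from the question (so this is a sketch under those assumptions, with coord an (N, 3) array of positions):

import numpy as np

def total_energy(coord, rcutoff_sq):
    """Split the energy into non-bonded LJ pairs (cutoff) and bonded FENE pairs (topology)."""
    N = len(coord)

    # Non-bonded: every pair within the cutoff.
    e_nb = 0.0
    for i in range(N):
        for j in range(i - 1):   # skips the directly bonded neighbor (i, i-1)
            rij = coord[i] - coord[j]
            rij2 = np.dot(rij, rij)
            if rij2 < rcutoff_sq:
                e_nb += lj(rij2)

    # Bonded: only consecutive particles along the chain, no cutoff.
    e_bond = 0.0
    for i in range(1, N):
        rij = coord[i] - coord[i - 1]
        e_bond += fene(np.dot(rij, rij))

    return e_nb + e_bond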
Below you can find a modified version of your code. To make things simpler, in this version there is no box and no boundary conditions, just a chain in free space. The chain is initialized as a linear sequence of particles each distant 80% of R0 from the next, since R0 is the maximum length of the FENE bond. The code considers that particle i is bonded with i+1 and the bond is not broken. This code is just a proof of concept. #!/usr/bin/python import numpy as np def gen_chain(N, R): x = np.linspace(0, (N-1)*R*0.8, num=N) y = np.zeros(N) z = np.zeros(N) return np.column_stack((x, y, z)) def lj(rij2): sig_by_r6 = np.power(sigma/rij2, 3) sig_by_r12 = np.power(sig_by_r6, 2) lje = 4.0 * epsilon * (sig_by_r12 - sig_by_r6) return lje def fene(rij2): return (-0.5 * K * R0**2 * np.log(1-(rij2/R0**2))) def total_energy(coord): # Non-bonded e_nb = 0 for i in range(N): for j in range(i-1): ri = coord[i] rj = coord[j] rij = ri - rj rij2 = np.dot(rij, rij) if (rij2 < rcutoff): e_nb += lj(rij2) # Bonded e_bond = 0 for i in range(1, N): ri = coord[i] rj = coord[i-1] rij = ri - rj rij2 = np.dot(rij, rij) e_bond += fene(rij2) return e_nb + e_bond def move(coord): trial = np.ndarray.copy(coord) for i in range(N): delta = (2.0 * np.random.rand(3) - 1) * max_delta trial[i] += delta return trial def accept(delta_e): beta = 1.0/T if delta_e <= 0.0: return True random_number = np.random.rand(1) p_acc = np.exp(-beta*delta_e) if random_number < p_acc: return True return False if __name__ == "__main__": # FENE parameters K = 40 R0 = 1.5 # LJ parameters sigma = 1.0 epsilon = 1.0 # MC parameters N = 50 # number of particles rcutoff = 3.5 max_delta = 0.01 n_steps = 10000000 T = 1.5 coord = gen_chain(N, R0) energy_current = total_energy(coord) traj = open('traj.xyz', 'w') for step in range(n_steps): if step % 1000 == 0: traj.write(str(N) + '\n\n') for i in range(N): traj.write("C %10.5f %10.5f %10.5f\n" % (coord[i][0], coord[i][1], coord[i][2])) print(step, energy_current) coord_trial = move(coord) energy_trial = total_energy(coord_trial) delta_e = energy_trial - energy_current if accept(delta_e): coord = coord_trial energy_current = energy_trial traj.close() The code prints the current configuration at each step, you can just load it up on VMD and see how it behaves. The bonds will not show correctly at first on VMD, you must use a bead representation for the particles and define the bonds manually or with a script within VMD. In any case, you don't need to see the bonds to notice that the chain does not collapse. Please bear in mind that if you want to simulate a chain at a certain density, you need to be careful to generate the correct topology. I recommend the EMC package to efficiently generate polymers at the desired thermodynamic conditions. It is by no means a trivial problem, especially for larger chains. By the way, your code had an error in the FENE energy evaluation. rij2 is already squared, you squared it again. Below you can see how the total energy as a function of the number of steps behaves for T = 1.0, N = 20, rcutoff = 3.5, and also the last current configuration after 10 thousand steps. And below for N = 50, T = 1.5, max_delta = 0.01, K = 40, R = 1.5, rcutoff = 3.5, and 10 million steps. This is the last current configuration. The full "trajectory", which isn't really a trajectory since this is MC, you can find here (it's under 6 MB).
Approximation by sin waves using DFT on python. What's wrong?
I'm writing the prorgram on python that can approximate time series by sin waves. The program uses DFT to find sin waves, after that it chooses sin waves with biggest amplitudes. Here's my code: __author__ = 'FATVVS' import math # Wave - (amplitude,frequency,phase) # This class was created to sort sin waves: # - by anplitude( set freq_sort=False) # - by frequency (set freq_sort=True) class Wave: #flag for choosing sort mode: # False-sort by amplitude # True-by frequency freq_sort = False def __init__(self, amp, freq, phase): self.freq = freq #frequency self.amp = amp #amplitude self.phase = phase def __lt__(self, other): if self.freq_sort: return self.freq < other.freq else: return self.amp < other.amp def __gt__(self, other): if self.freq_sort: return self.freq > other.freq else: return self.amp > other.amp def __le__(self, other): if self.freq_sort: return self.freq <= other.freq else: return self.amp <= other.amp def __ge__(self, other): if self.freq_sort: return self.freq >= other.freq else: return self.amp >= other.amp def __str__(self): s = "(amp=" + str(self.amp) + ",frq=" + str(self.freq) + ",phase=" + str(self.phase) + ")" return s def __repr__(self): return self.__str__() #Discrete Fourier Transform def dft(series: list): n = len(series) m = int(n / 2) real = [0 for _ in range(n)] imag = [0 for _ in range(n)] amplitude = [] phase = [] angle_const = 2 * math.pi / n for w in range(m): a = w * angle_const for t in range(n): real[w] += series[t] * math.cos(a * t) imag[w] += series[t] * math.sin(a * t) amplitude.append(math.sqrt(real[w] * real[w] + imag[w] * imag[w]) / n) phase.append(math.atan(imag[w] / real[w])) return amplitude, phase #extract waves from time series # series - time series # num - number of waves def get_waves(series: list, num): amp, phase = dft(series) m = len(amp) waves = [] for i in range(m): waves.append(Wave(amp[i], 2 * math.pi * i / m, phase[i])) waves.sort() waves.reverse() waves = waves[0:num]#extract best waves print("the program found the next %s sin waves:"%(num)) print(waves)#print best waves return waves #approximation by sin waves #series - time series #num- number of sin waves def sin_waves_appr(series: list, num): n = len(series) freq = get_waves(series, num) m = len(freq) model = [] for i in range(n): summ = 0 for j in range(m): #sum by sin waves summ += freq[j].amp * math.sin(freq[j].freq * i + freq[j].phase) model.append(summ) return model if __name__ == '__main__': import matplotlib.pyplot as plt N = 500 # length of time series num = 2 # number of sin wawes, that we want to find #y - generate time series y = [2 * math.sin(0.05 * t + 0.5) + 0.5 * math.sin(0.2 * t + 1.5) for t in range(N)] model = sin_waves_appr(y, num) #generate approximation model ## ------------------plotting----------------- plt.figure(1) # plotting of time series and his approximation model plt.subplot(211) h_signal, = plt.plot(y, label='source timeseries') h_model, = plt.plot(model, label='model', linestyle='--') plt.legend(handles=[h_signal, h_model]) plt.grid() # plotting of spectre amp, _ = dft(y) xaxis = [2*math.pi*i / N for i in range(len(amp))] plt.subplot(212) h_freq, = plt.plot(xaxis, amp, label='spectre') plt.legend(handles=[h_freq]) plt.grid() plt.show() But I've got a strange result: In the program I've created a time series from two sin waves: y = [2 * math.sin(0.05 * t + 0.5) + 0.5 * math.sin(0.2 * t + 1.5) for t in range(N)] And my program found wrong parameters of the sin waves: the program found the next 2 sin waves: 
[(amp=0.9998029885151699,frq=0.10053096491487339,phase=1.1411803525843616), (amp=0.24800925225626422,frq=0.40212385965949354,phase=0.346757128184013)]
I suppose my problem is wrong scaling of the wave parameters, but I'm not sure. There are two places where the program does scaling. The first is the creation of the waves:

for i in range(m):
    waves.append(Wave(amp[i], 2 * math.pi * i / m, phase[i]))

And the second is the scaling of the x-axis:

xaxis = [2*math.pi*i / N for i in range(len(amp))]

But my guess may be wrong. I've tried changing the scaling many times and it hasn't solved my problem. What may be wrong with the code?
So, these lines I believe are wrong:

for t in range(n):
    real[w] += series[t] * math.cos(a * t)
    imag[w] += series[t] * math.sin(a * t)
amplitude.append(math.sqrt(real[w] * real[w] + imag[w] * imag[w]) / n)
phase.append(math.atan(imag[w] / real[w]))

I believe it should be dividing by m instead of n, since you are only computing half the points; that will fix the amplitude problem. Also, the computation of imag[w] is missing a negative sign. Taking into account the atan2 fix, it would look like:

for t in range(n):
    real[w] += series[t] * math.cos(a * t)
    imag[w] += -1 * series[t] * math.sin(a * t)
amplitude.append(math.sqrt(real[w] * real[w] + imag[w] * imag[w]) / m)
phase.append(math.atan2(imag[w], real[w]))

The next one is here:

for i in range(m):
    waves.append(Wave(amp[i], 2 * math.pi * i / m, phase[i]))

The divide by m is not right: amp has only half the points it should, so using the length of amp isn't right here. It should be:

for i in range(m):
    waves.append(Wave(amp[i], 2 * math.pi * i / (m * 2), phase[i]))

Finally, your model reconstruction has a problem:

for j in range(m):  # sum by sin waves
    summ += freq[j].amp * math.sin(freq[j].freq * i + freq[j].phase)

It should use cosine instead (sine introduces a phase shift):

for j in range(m):  # sum by cos waves
    summ += freq[j].amp * math.cos(freq[j].freq * i + freq[j].phase)

When I fix all of that, the model and the DFT both make sense.
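Pulling those fixes together, a corrected dft would look roughly like this (same structure as the original, with only the sign, normalization, and atan2 changes applied):

import math

def dft(series: list):
    n = len(series)
    m = int(n / 2)
    real = [0 for _ in range(n)]
    imag = [0 for _ in range(n)]
    amplitude = []
    phase = []
    angle_const = 2 * math.pi / n
    for w in range(m):
        a = w * angle_const
        for t in range(n):
            real[w] += series[t] * math.cos(a * t)
            imag[w] += -1 * series[t] * math.sin(a * t)   # negative sign added
        amplitude.append(math.sqrt(real[w] * real[w] + imag[w] * imag[w]) / m)  # divide by m
        phase.append(math.atan2(imag[w], real[w]))        # atan2 for a robust phase
    return amplitude, phase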
Python implementation of the Wilson Score Interval?
After reading How Not to Sort by Average Rating, I was curious whether anyone has a Python implementation of the lower bound of the Wilson score confidence interval for a Bernoulli parameter?
Reddit uses the Wilson score interval for comment ranking, an explanation and python implementation can be found here

#Rewritten code from /r2/r2/lib/db/_sorts.pyx
from math import sqrt

def confidence(ups, downs):
    n = ups + downs
    if n == 0:
        return 0
    z = 1.0  # 1.44 = 85%, 1.96 = 95%
    phat = float(ups) / n
    return ((phat + z*z/(2*n) - z * sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n))
I think this one has a bug in the Wilson call, because if you have 1 up and 0 down you get NaN, since you can't take the sqrt of a negative value. The correct formula can be found by looking at the Ruby example from the article How Not to Sort by Average Rating:

return ((phat + z*z/(2*n) - z * sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n))
To get the Wilson CI without continuity correction, you can use proportion_confint in statsmodels.stats.proportion. To get the Wilson CI with continuity correction, you can use the code below. # cf. # [1] R. G. Newcombe. Two-sided confidence intervals for the single proportion, 1998 # [2] R. G. Newcombe. Interval Estimation for the difference between independent proportions: comparison of eleven methods, 1998 import numpy as np from statsmodels.stats.proportion import proportion_confint # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # def propci_wilson_cc(count, nobs, alpha=0.05): # get confidence limits for proportion # using wilson score method w/ cont correction # i.e. Method 4 in Newcombe [1]; # verified via Table 1 from scipy import stats n = nobs p = count/n q = 1.-p z = stats.norm.isf(alpha / 2.) z2 = z**2 denom = 2*(n+z2) num = 2.*n*p+z2-1.-z*np.sqrt(z2-2-1./n+4*p*(n*q+1)) ci_l = num/denom num = 2.*n*p+z2+1.+z*np.sqrt(z2+2-1./n+4*p*(n*q-1)) ci_u = num/denom if p == 0: ci_l = 0. elif p == 1: ci_u = 1. return ci_l, ci_u def dpropci_wilson_nocc(a,m,b,n,alpha=0.05): # get confidence limits for difference in proportions # a/m - b/n # using wilson score method WITHOUT cont correction # i.e. Method 10 in Newcombe [2] # verified via Table II theta = a/m - b/n l1, u1 = proportion_confint(count=a, nobs=m, alpha=0.05, method='wilson') l2, u2 = proportion_confint(count=b, nobs=n, alpha=0.05, method='wilson') ci_u = theta + np.sqrt((a/m-u1)**2+(b/n-l2)**2) ci_l = theta - np.sqrt((a/m-l1)**2+(b/n-u2)**2) return ci_l, ci_u def dpropci_wilson_cc(a,m,b,n,alpha=0.05): # get confidence limits for difference in proportions # a/m - b/n # using wilson score method w/ cont correction # i.e. Method 11 in Newcombe [2] # verified via Table II theta = a/m - b/n l1, u1 = propci_wilson_cc(count=a, nobs=m, alpha=alpha) l2, u2 = propci_wilson_cc(count=b, nobs=n, alpha=alpha) ci_u = theta + np.sqrt((a/m-u1)**2+(b/n-l2)**2) ci_l = theta - np.sqrt((a/m-l1)**2+(b/n-u2)**2) return ci_l, ci_u # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # single proportion testing # these come from Newcombe [1] (Table 1) a_vec = np.array([81, 15, 0, 1]) m_vec = np.array([263, 148, 20, 29]) for (a,m) in zip(a_vec,m_vec): l1, u1 = proportion_confint(count=a, nobs=m, alpha=0.05, method='wilson') l2, u2 = propci_wilson_cc(count=a, nobs=m, alpha=0.05) print(a,m,l1,u1,' ',l2,u2) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # difference in proportions testing # these come from Newcombe [2] (Table II) a_vec = np.array([56,9,6,5,0,0,10,10],dtype=float) m_vec = np.array([70,10,7,56,10,10,10,10],dtype=float) b_vec = np.array([48,3,2,0,0,0,0,0],dtype=float) n_vec = np.array([80,10,7,29,20,10,20,10],dtype=float) print('\nWilson without CC') for (a,m,b,n) in zip(a_vec,m_vec,b_vec,n_vec): l, u = dpropci_wilson_nocc(a,m,b,n,alpha=0.05) print('{:2.0f}/{:2.0f}-{:2.0f}/{:2.0f} ; {:6.4f} ; {:8.4f}, {:8.4f}'.format(a,m,b,n,a/m-b/n,l,u)) print('\nWilson with CC') for (a,m,b,n) in zip(a_vec,m_vec,b_vec,n_vec): l, u = dpropci_wilson_cc(a,m,b,n,alpha=0.05) print('{:2.0f}/{:2.0f}-{:2.0f}/{:2.0f} ; {:6.4f} ; {:8.4f}, {:8.4f}'.format(a,m,b,n,a/m-b/n,l,u)) HTH
The accepted solution seems to use a hard-coded z-value (best for performance). In the event that you want a direct Python equivalent of the Ruby formula from the blog post with a dynamic z-value (based on the confidence level):

import math
import scipy.stats as st

def ci_lower_bound(pos, n, confidence):
    if n == 0:
        return 0
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    phat = 1.0 * pos / n
    return (phat + z * z / (2 * n) - z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n)
If you'd like to actually calculate z directly from a confidence bound and want to avoid installing numpy/scipy, you can use the following snippet of code, import math def binconf(p, n, c=0.95): ''' Calculate binomial confidence interval based on the number of positive and negative events observed. Uses Wilson score and approximations to inverse of normal cumulative density function. Parameters ---------- p: int number of positive events observed n: int number of negative events observed c : optional, [0,1] confidence percentage. e.g. 0.95 means 95% confident the probability of success lies between the 2 returned values Returns ------- theta_low : float lower bound on confidence interval theta_high : float upper bound on confidence interval ''' p, n = float(p), float(n) N = p + n if N == 0.0: return (0.0, 1.0) p = p / N z = normcdfi(1 - 0.5 * (1-c)) a1 = 1.0 / (1.0 + z * z / N) a2 = p + z * z / (2 * N) a3 = z * math.sqrt(p * (1-p) / N + z * z / (4 * N * N)) return (a1 * (a2 - a3), a1 * (a2 + a3)) def erfi(x): """Approximation to inverse error function""" a = 0.147 # MAGIC!!! a1 = math.log(1 - x * x) a2 = ( 2.0 / (math.pi * a) + a1 / 2.0 ) return ( sign(x) * math.sqrt( math.sqrt(a2 * a2 - a1 / a) - a2 ) ) def sign(x): if x < 0: return -1 if x == 0: return 0 if x > 0: return 1 def normcdfi(p, mu=0.0, sigma2=1.0): """Inverse CDF of normal distribution""" if mu == 0.0 and sigma2 == 1.0: return math.sqrt(2) * erfi(2 * p - 1) else: return mu + math.sqrt(sigma2) * normcdfi(p)
Here is a simplified (no need for numpy) and slightly improved (0 and n values for k do not cause a math domain error) version of the Wilson score confidence interval with continuity correction, from the original sourcecode written by batesbatesbates in another answer, and also a pure python no-numpy non-continuity correction version, with 2 equivalent ways to calculate (can be switched with eqmode argument, but both ways give the exact same non-continuity correction results): import math def propci_wilson_nocc(k, n, z=1.96, eqmode=0): # Calculates the Binomial Proportion Confidence Interval using the Wilson Score method without continuation correction # Equations eqmode == 1 from: https://en.wikipedia.org/w/index.php?title=Binomial_proportion_confidence_interval&oldid=1101942017#Wilson_score_interval # Equations eqmode == 0 from: https://www.evanmiller.org/how-not-to-sort-by-average-rating.html # The results should be close to: # from statsmodels.stats.proportion import proportion_confint # proportion_confint(k, n, alpha=0.05, method='wilson') #z=1.44 = 85%, 1.96 = 95% if n == 0: return 0 p_hat = float(k) / n z2 = z**2 if eqmode == 0: ci_l = (p_hat + z2/(2*n) - z*math.sqrt(max(0.0, (p_hat*(1 - p_hat) + z2/(4*n))/n))) / (1 + z2 / n) else: ci_l = (1.0 / (1.0 + z2/n)) * (p_hat + z2/(2*n)) - (z / (1 + z2/n)) * math.sqrt(max(0.0, (p_hat*(1 - p_hat)/n + z2/(4*(n**2))))) if eqmode == 0: ci_u = (p_hat + z2/(2*n) + z*math.sqrt(max(0.0, (p_hat*(1 - p_hat) + z2/(4*n))/n))) / (1 + z2 / n) else: ci_u = (1.0 / (1.0 + z2/n)) * (p_hat + z2/(2*n)) + (z / (1 + z2/n)) * math.sqrt(max(0.0, (p_hat*(1 - p_hat)/n + z2/(4*(n**2))))) return [ci_l, ci_u] def propci_wilson_cc(n, k, z=1.96): # Calculates the Binomial Proportion Confidence Interval using the Wilson Score method with continuation correction # i.e. Method 4 in Newcombe [1]: R. G. Newcombe. Two-sided confidence intervals for the single proportion, 1998; # verified via Table 1 # originally written by batesbatesbates https://stackoverflow.com/questions/10029588/python-implementation-of-the-wilson-score-interval/74021634#74021634 p_hat = k/n q = 1.0-p z2 = z**2 denom = 2*(n+z2) num = 2.0*n*p_hat + z2 - 1.0 - z*math.sqrt(max(0.0, z2 - 2 - 1.0/n + 4*p_hat*(n*q + 1))) ci_l = num/denom num2 = 2.0*n*p_hat + z2 + 1.0 + z*math.sqrt(max(0.0, z2 + 2 - 1.0/n + 4*p_hat*(n*q - 1))) ci_u = num2/denom if p_hat == 0: ci_l = 0.0 elif p_hat == 1: ci_u = 1.0 return [ci_l, ci_u] Note that the returned value will always be bounded between [0.0, 1.0] (due to how p_hat is a ratio of k/n), this is why it's a score and not really a confidence interval, but it's easy to project back to a confidence interval by multiplying ci_l * n and ci_u * n, these values will be in the same domain as k and can be plotted alongside.
Here is a much more readable version of how to compute the Wilson score interval without continuity correction, by Bartosz Mikulski:

from math import sqrt

def wilson(p, n, z = 1.96):
    denominator = 1 + z**2/n
    centre_adjusted_probability = p + z*z / (2*n)
    adjusted_standard_deviation = sqrt((p*(1 - p) + z*z / (4*n)) / n)
    lower_bound = (centre_adjusted_probability - z*adjusted_standard_deviation) / denominator
    upper_bound = (centre_adjusted_probability + z*adjusted_standard_deviation) / denominator
    return (lower_bound, upper_bound)
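For example (note that p here is the observed proportion of successes, not a count; the printed bounds are rounded):

# 9 successes out of 10 trials -> p = 0.9
lower, upper = wilson(0.9, 10)
print(round(lower, 2), round(upper, 2))  # approximately 0.6 0.98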