I'm working on programming a MLE for the Polya distribution using scipy. The Nelder-Mead method is working, however I get a "Desired error not necessarily achieved due to precision loss." error when running BFGS. The Nelder-Mead method seems like it's too slow for my needs (I have a lot of fairly big data, say 1000 tables in some cases as big as 10x10000). I've tried using the check_grad function and the result is smallish on the example below (order 10^-2), so I'm not sure if that means there's a bug in the gradient of the log likelihood or the likelihood is just very strongly peaked. For what it's worth, I've stared quite hard at my code and I can't see the issue. Here's some example code to recreate the problem
#setup some data
from numpy.random import dirichlet, multinomial
from scipy.optimize import check_grad
alpha = [10,30,50]
p = pd.DataFrame(dirichlet(alpha,200))
data = p.apply(lambda x: multinomial(500,x),1)
a = np.array(data.mean(0))
#optimize
result = minimize(lambda a: -1*llike(data,exp(a)),
x0=np.log(a),
method='Nelder-Mead')
x0=result.x
result = minimize(lambda a: -1*llike(data,exp(a)),
x0=x0,
jac=lambda a: -1*gradient_llike(data,np.exp(a)),
method='BFGS')
exp(result.x) #should be close to alpha
#uhoh, let's check that this is right.
check_grad(func=lambda a: -1*llike(data,a),grad=lambda a: -1*gradient_llike(data,a),x0=alpha)
Here's the code for my functions
def log_polya(Z,alpha):
"""
Z is a vector of counts
https://en.wikipedia.org/wiki/Dirichlet-multinomial_distribution
http://mimno.infosci.cornell.edu/info6150/exercises/polya.pdf
"""
if not isinstance(alpha,np.ndarray):
alpha = np.array(alpha)
if not isinstance(Z,np.ndarray):
Z = np.array(Z)
#Concentration Parameter
A = sum(alpha)
#Number of Datapoints
N = sum(Z)
return gammaln(A) - gammaln(N+A) + sum(gammaln(Z+alpha) - gammaln(alpha))
def llike(data,alpha):
return sum(data.apply(log_polya,1,alpha=alpha))
def log_polya_derivative(Z,alpha):
if not isinstance(alpha,np.ndarray):
alpha = np.array(alpha)
if not isinstance(Z,np.ndarray):
Z = np.array(Z)
if 0. in Z+alpha:
Warning("invalid prior parameter,nans should be produced")
#Concentration Parameter
A = sum(alpha)
#Number of Datapoints
N = sum(Z)
K = len(Z)
return np.array([psi(A) - psi(N+A) + psi(Z[i]+alpha[i]) - psi(alpha[i]) for i in xrange(K)])
def gradient_llike(data,alpha):
return np.array(data.apply(log_polya_derivative,1,alpha=alpha).sum(0))
UPDATE: Still curious about this, but for those interested in a working implementation for this problem, the following code for implementing the Minka Fixed Point Algorithm seems to work well (i.e. recovers quickly values that are close to the true dirichlet parameter).
def minka_mle_polya(data):
"""
http://research.microsoft.com/en-us/um/people/minka/papers/dirichlet/minka-dirichlet.pdf
"""
data = np.array(data)
K = np.shape(data)[1]
alpha = np.array(data.mean(0))
alpha_new = np.ndarray((K))
precision = 10
while precision > 10**-5:
for k in range(K):
A = sum(alpha)
N = data.sum(1)
numerator = sum(
psi(data[:,k]+alpha[k])-psi(alpha[k])
)
denominator = sum(
psi(N+A)-psi(A)
)
alpha_new[k] = alpha[k]*numerator/denominator
precision = sum(abs(alpha_new - alpha))
alpha_old = np.array(alpha)
alpha = np.array(alpha_new)
print "Gap", precision
Related
Now the full code / questions
I would like to estimate the random fluctuations of the function v - therefore I would like to calculate the RMS value of it:
import numpy as np
import matplotlib.pyplot as plt
def HHmodel(I,length, area):
v = []
m = []
h = []
z = []
n = []
squares = []
vsquare = (-60)*(-60)
sumsquares = 0
rms = []
a= []
dt = 0.05
t = np.linspace(0,100,length)
#constants
Cm = area#microFarad
ENa=50 #miliVolt
EK=-77 #miliVolt
El=-54 #miliVolt
g_Na=120*area #mScm-2
g_K=36*area #mScm-2
g_l=0.03*area #mScm-2
def alphaN(v):
return 0.01*(v+50)/(1-np.exp(-(v+50)/10))
def betaN(v):
return 0.125*np.exp(-(v+60)/80)
def alphaM(v):
return 0.1*(v+35)/(1-np.exp(-(v+35)/10))
def betaM(v):
return 4.0*np.exp(-0.0556*(v+60))
def alphaH(v):
return 0.07*np.exp(-0.05*(v+60))
def betaH(v):
return 1/(1+np.exp(-(0.1)*(v+30)))
#Initialize the voltage and the channels :
v.append(-60)
rms.append(1)
m0 = alphaM(v[0])/(alphaM(v[0])+betaM(v[0]))
n0 = alphaN(v[0])/(alphaN(v[0])+betaN(v[0]))
h0 = alphaH(v[0])/(alphaH(v[0])+betaH(v[0]))
#t.append(0)
m.append(m0)
n.append(n0)
h.append(h0)
#solving ODE using Euler's method:
for i in range(1,len(t)):
m.append(m[i-1] + dt*((alphaM(v[i-1])*(1-m[i-1]))-betaM(v[i-1])*m[i-1]))
n.append(n[i-1] + dt*((alphaN(v[i-1])*(1-n[i-1]))-betaN(v[i-1])*n[i-1]))
h.append(h[i-1] + dt*((alphaH(v[i-1])*(1-h[i-1]))-betaH(v[i-1])*h[i-1]))
gNa = g_Na * h[i-1]*(m[i-1])**3
gK=g_K*n[i-1]**4
gl=g_l
INa = gNa*(v[i-1]-ENa)
IK = gK*(v[i-1]-EK)
Il=gl*(v[i-1]-El)
v.append(v[i-1]+(dt)*((1/Cm)*(I[i-1]-(INa+IK+Il))))
#v.append(v[i-1]+(dt)*((1/Cm)*(I-(INa+IK+Il))))
meansquare = np.sqrt((np.square(v).sum()))
return v,area,meansquare
spikeEvents = [] #timing each spike
length = 1000*5 #the time period
fluctuations = []
output = []
for j in range(1, 10):
barcode = np.zeros(length)
noisyI = np.random.normal(0,9,length)
area = 1.0+0.1*j
res = HHmodel(noisyI,length,area)
output.append(res[2])
print('Done.')
The goal should be that the fluctuations of v increase in some way with the size of the are a - I was thinking here of the rms amplitude as a reasonable measure
BR
edit:
for i in range(1,len(t)):
m.append(m[i-1] + dt*((alphaM(v[i-1])*(1-m[i-1]))-betaM(v[i-1])*m[i-1]))
n.append(n[i-1] + dt*((alphaN(v[i-1])*(1-n[i-1]))-betaN(v[i-1])*n[i-1]))
h.append(h[i-1] + dt*((alphaH(v[i-1])*(1-h[i-1]))-betaH(v[i-1])*h[i-1]))
gNa = g_Na * h[i-1]*(m[i-1])**3
gK=g_K*n[i-1]**4
gl=g_l
INa = gNa*(v[i-1]-ENa)
IK = gK*(v[i-1]-EK)
Il=gl*(v[i-1]-El)
v.append(v[i-1]+(dt)*((1/Cm)*(I[i-1]-(INa+IK+Il))))
z.append(v[i-1]-np.mean(v))
#v.append(v[i-1]+(dt)*((1/Cm)*(I-(INa+IK+Il))))
mean = sum(np.square(v))/len(v)
squared_diffs =[(item-mean)**2 for item in v]
ms_diff = sum(squared_diffs)/len(squared_diffs)
rms_diff =np.sqrt(ms_diff)
return v,area,rms_diff
edit2:
Plot for j in range(1,10) - blue: rmsvalue as calculated in edit 1, yellow 1/sqrt(j)
edit3:
Plot for j in range(1,100) - but the "size" of fluctuations should increase, and not decrease and center somewhere
A few minor notes:
So, basically your "function" v is a one-timestep discrete evaluation of some function rather than a true function, but that's not really relevant here.
As indicated by comments above, you should calculate v for all timesteps and aggregate the squared values, then sum them outside of the loop and normalize by dividing by len(v).
It is also unclear why in iteration i you calculate v[i] but the corresponding squared value you calculate is v[i-1] squared. Should use same index on same loop iteration or you'll likely end up missing an element.
I would say that the reason that the result is not useful is that root-mean square is not really ever used for a function's outputs (RMS in this case is just some sort of less useful mean that gives extra weight to outliers); rather RMS is generally used on the error or variance of that function's outputs. RMS error or variance tells you how far, in the function's original units, does the average function value differ from the average value?). Note that this is only really an imporant metric if you expect the value of v to be constant.
Given all this, it's hard to say from your question what your intention is and what you're actually trying to do with this info so I will guess that what you really care about is how much the value of v is varying from the mean. In this case, you can use RMS difference from mean value of v calculated as such:
for i in range(1,len(t)):
#calculate v[i] here, omitted for simplicity
# get mean value
mean = sum(squares)/len(squares)
# you want to get the squared value of the difference, not the value itself
squared_diffs = [(item - mean)**2 for item in v)]
# get mean squared diff
ms_diff = sum(squared_diffs) / len(squared_diffs)
# return root of mean squared diff
rms_diff = np.sqrt(ms_diff)
return v,area,rms_diff
Again, this is only useful if you expect the outputs of v to be a constant. If not, you would try to fit a different model (linear, quadratic, etc.) to the function and then calculate the RMS error. Question would be much clearer if you indicated goal of this calculation.
I am aware that SGD has been asked before on SO but I wanted to have an opinion on my code as below:
import numpy as np
import matplotlib.pyplot as plt
# Generating data
m,n = 10000,4
x = np.random.normal(loc=0,scale=1,size=(m,4))
theta_0 = 2
theta = np.append([],[1,0.5,0.25,0.125]).reshape(n,1)
y = np.matmul(x,theta) + theta_0*np.ones(m).reshape((m,1)) + np.random.normal(loc=0,scale=0.25,size=(m,1))
# input features
x0 = np.ones([m,1])
X = np.append(x0,x,axis=1)
# defining the cost function
def compute_cost(X,y,theta_GD):
return np.sum(np.power(y-np.matmul(np.transpose(theta_GD),X),2))/2
# initializations
theta_GD = np.append([theta_0],[theta]).reshape(n+1,1)
alp = 1e-5
num_iterations = 10000
# Batch Sum
def batch(i,j,theta_GD):
batch_sum = 0
for k in range(i,i+9):
batch_sum += float((y[k]-np.transpose(theta_GD).dot(X[k]))*X[k][j])
return batch_sum
# Gradient Step
def gradient_step(theta_current, X, y, alp,i):
for j in range(0,n):
theta_current[j]-= alp*batch(i,j,theta_current)/10
theta_updated = theta_current
return theta_updated
# gradient descent
cost_vec = []
for i in range(num_iterations):
cost_vec.append(compute_cost(X[i], y[i], theta_GD))
theta_GD = gradient_step(theta_GD, X, y, alp,i)
plt.plot(cost_vec)
plt.xlabel('iterations')
plt.ylabel('cost')
I was trying a mini-batch GD with a batch size of 10. I am getting extremely oscillatory behavior for the MSE. Where's the issue? Thanks.
P.S. I was following NG's https://www.coursera.org/learn/machine-learning/lecture/9zJUs/mini-batch-gradient-descent
This is a description of the underlying mathematical principle, not a code based solution...
The cost function is highly nonlinear (np.power()) and recursive and recursive and nonlinear systems can oscillate ( self-oscillation https://en.wikipedia.org/wiki/Self-oscillation ). In mathematics this is subject to chaos theory / theory of nonlinear dynamical systems ( https://pdfs.semanticscholar.org/8e0d/ee3c433b1806bfa0d98286836096f8c2681d.pdf ), cf the Logistic Map
( https://en.wikipedia.org/wiki/Logistic_map ). The logistic map oscillates if the growth factor r exceeds a threshold. The growth factor is a measure for how much energy is in the system.
In your code the critical parts are the cost function, the cost vector, that is the history of the system and the time steps :
def compute_cost(X,y,theta_GD):
return np.sum(np.power(y-np.matmul(np.transpose(theta_GD),X),2))/2
cost_vec = []
for i in range(num_iterations):
cost_vec.append(compute_cost(X[i], y[i], theta_GD))
theta_GD = gradient_step(theta_GD, X, y, alp,i)
# Gradient Step
def gradient_step(theta_current, X, y, alp,i):
for j in range(0,n):
theta_current[j]-= alp*batch(i,j,theta_current)/10
theta_updated = theta_current
return theta_updated
If you compare this to an implementation of the logistic map you see the similarities
from pylab import show, scatter, xlim, ylim
from random import randint
iter = 1000 # Number of iterations per point
seed = 0.5 # Seed value for x in (0, 1)
spacing = .0001 # Spacing between points on domain (r-axis)
res = 8 # Largest n-cycle visible
# Initialize r and x lists
rlist = []
xlist = []
def logisticmap(x, r): <------------------ nonlinear function
return x * r * (1 - x)
# Return nth iteration of logisticmap(x. r)
def iterate(n, x, r):
for i in range(1,n):
x = logisticmap(x, r)
return x
# Generate list values -- iterate for each value of r
for r in [i * spacing for i in range(int(1/spacing),int(4/spacing))]:
rlist.append(r)
xlist.append(iterate(randint(iter-res/2,iter+res/2), seed, r)) <--------- similar to cost_vector, the history of the system
scatter(rlist, xlist, s = .01)
xlim(0.9, 4.1)
ylim(-0.1,1.1)
show()
source of code : https://www.reddit.com/r/learnpython/comments/zzh28/a_simple_python_implementation_of_the_logistic_map/
Basing on this you can try to modify your cost function by introducing a factor similar to the growth factor in the logistic map to reduce the intensity of oscillation of the system
def gradient_step(theta_current, X, y, alp,i):
for j in range(0,n):
theta_current[j]-= alp*batch(i,j,theta_current)/10 <--- introduce a factor somewhere to keep the system under the oscillation threshold
theta_updated = theta_current
return theta_updated
or
def compute_cost(X,y,theta_GD):
return np.sum(np.power(y-np.matmul(np.transpose(theta_GD),X),2))/2 <--- introduce a factor somewhere to keep the system under the oscillation threshold
If this is not working maybe follow the suggestions in https://www.reddit.com/r/MachineLearning/comments/3y9gkj/how_can_i_avoid_oscillations_in_gradient_descent/ ( timesteps,... )
My implementation of steepest descent for solving Ax = b is showing some weird behavior: for any matrix large enough (~10 x 10, have only tested square matrices so far), the returned x contains all huge values (on the order of 1x10^10).
def steepestDescent(A, b, numIter=100, x=None):
"""Solves Ax = b using steepest descent method"""
warnings.filterwarnings(action="error",category=RuntimeWarning)
# Reshape b in case it has shape (nL,)
b = b.reshape(len(b), 1)
exes = []
res = []
# Make a guess for x if none is provided
if x==None:
x = np.zeros((len(A[0]), 1))
exes.append(x)
for i in range(numIter):
# Re-calculate r(i) using r(i) = b - Ax(i) every five iterations
# to prevent roundoff error. Also calculates initial direction
# of steepest descent.
if (numIter % 5)==0:
r = b - np.dot(A, x)
# Otherwise use r(i+1) = r(i) - step * Ar(i)
else:
r = r - step * np.dot(A, r)
res.append(r)
# Calculate step size. Catching the runtime warning allows the function
# to stop and return before all iterations are completed. This is
# necessary because once the solution x has been found, r = 0, so the
# calculation below divides by 0, turning step into "nan", which then
# goes on to overwrite the correct answer in x with "nan"s
try:
step = np.dot(r.T, r) / np.dot( np.dot(r.T, A), r )
except RuntimeWarning:
warnings.resetwarnings()
return x
# Update x
x = x + step * r
exes.append(x)
warnings.resetwarnings()
return x, exes, res
(exes and res are returned for debugging)
I assume the problem must be with calculating r or step (or some deeper issue) but I can't make out what it is.
The code seems correct. For example, the following test work for me (both linalg.solve and steepestDescent give the close answer, most of the time):
import numpy as np
n = 100
A = np.random.random(size=(n,n)) + 10 * np.eye(n)
print(np.linalg.eig(A)[0])
b = np.random.random(size=(n,1))
x, xs, r = steepestDescent(A,b, numIter=50)
print(x - np.linalg.solve(A,b))
The problem is in the math. This algorithm is guaranteed to converge to the correct solution if A is positive definite matrix. By adding the 10 * identity matrix to a random matrix, we increase the probability that all the eigen-values are positive
If you test with large random matrices (for example A = random.random(size=(n,n)), you are almost certain to have a negative eigenvalue, and the algorithm will not converge.
lately i am been working fitting a fourier series function to a periodic signal for retrieve the amplitude and the phase of each component via least squares, so i modified the code of this file for it:
import math
import numpy as np
#period of the signal
per=1.0
w = 2.0*np.pi/per
#number of fourier components.
nf = 5
fp = open("file.cat","r")
# m1 is the number of unknown coefficients.
m1 = 2*nf + 1
# Create empty matrices.
x = np.zeros((m1,m1))
y = np.zeros((m1,1))
xi = [0.0]*m1
# Read (time, value) from each line of the file.
for line in fp:
t = float(line.split()[0])
yi = float(line.split()[1])
xi[0] = 1.0
for k in range(1,nf+1):
xi[2*k-1] = np.sin(k*w*t)
xi[2*k] = np.cos(k*w*t)
for j in range(m1):
for k in range(m1):
x[j,k] += xi[j]*xi[k]
y[j] += yi*xi[j]
fp.close()
# Copy to big matrices.
X = np.mat( x.copy() )
Y = np.mat( y.copy() )
# Invert X and multiply by Y to get coefficients.
A = X.I*Y
A0 = A[0]
# Solution is A0 + Sum[ Amp*sin(k*wt + phi) ]
print "a[0] = %f" % A[0]
for k in range(1,nf+1):
amp = math.sqrt(A[2*k-1]**2 + A[2*k]**2)
phs = math.atan2(A[2*k],A[2*k-1])
print "amp[%d] = %f phi = %f" % (k, amp, phs)
but the plot show this (without the points, of course):
and it should show something like this:
somebody can tell me how can i compute the phase and the amplitude in another simpler way? a guide maybe, i will be very grateful.
cheers!
PD. I will attach the FILE that i used, just because :)
EDITED
The error was with a index :(
First, I defined the vector with the values:
amp = np.array([np.sqrt((A[2*k-1])**2 + (A[2*k])**2) for k in range(1,nf+1)])
phs = np.array([math.atan2(A[2*k],A[2*k-1]) for k in range(1,nf+1)])
and then, to build the signal, I defined:
def term(t): return np.array([amp[k]*np.sin(k*w*t + phs[k]) for k in range(len(amp))])
Signal = np.array([A0+sum(term(phase[i])) for i in range(len(mag))])
but within the np.sin(), k should be k+1, because the index start in 0 ·__·
def term(t): return np.array([amp[k]*np.sin((k+1)*w*t + phs[k]) for k in range(len(amp))])
plt.plot(phase,Signal,'r-',lw=3)
and that is all.
Thanks Marco Tompitak for the help!!
You're specifying the wrong period for the signal:
#period of the signal
per=0.178556
This gives you the resulting Fourier fit, indeed with a maximum period of ~0.17. The problem is that this number specifies the longest period that is present in your Fourier series. The function only has components with perior 0.17 or shorter. Apparently you are expecting a fit with period ~1, so it can never approximate that properly. You should specify per=1.0. There's nothing wrong with the algorithm; a quick writeup of a similar algorithm in Mathematica gives the same output and plausible results:
I'm trying to implement Bayesian PCA using PyMC library for python. But, I'm stuck where I define lower dimensional coordinates...
Model is
x = Wz + e
where x is observation vector, W is the transformation matrix, and z is lower dimensional coordinate vector.
First I define a distribution for the transformation matrix W. Each column is drawn from a normal distribution (zero mean, and identity covariance for simplicity)
def W_logp(value):
logLikes = np.array([multivariate_normal.logpdf(value[:,i], mean=np.zeros(dimX), cov=1) for i in range(0, dimZ)])
return logLikes.sum()
def W_random():
W = np.zeros([dimX, dimZ])
for i in range(0, dimZ):
W[:,i] = multivariate_normal.rvs(mean=np.zeros(dimX), cov=1)
return W
w0 = np.random.randn(dimX, dimZ)
W = pymc.Stochastic(
logp = W_logp,
doc = 'Transformation',
name = 'W',
parents = {},
random = W_random,
trace = True,
value = w0,
dtype = float,
rseed = 116.,
observed = False,
cache_depth = 2,
plot = False,
verbose = 0)
Then, I want to define distribution for z that is again a multivariate normal (zero mean, and identity covariance). However, I need to draw a z for each observation separately while W is common for all of them. So, I tried
z = pymc.MvNormal('z', np.zeros(dimZ), np.eye(dimZ), size=N)
However, pymc.MvNormal does not have a size parameter. So it raises an error. Next step would be
m = Data.mean(axis=0) + np.dot(W, z)
obs = pymc.MvNormal('Obs', m, C, value=Data, observed=True)
I did not give the specification for C above since it is irrelevant for now. Any ideas how to implement?
Thanks
EDIT
After Chris Fonnesbeck's answer I changed my code as follows
numD, dimX = Data.shape
dimZ = 3
mm = Data.mean(axis=0)
tau = pymc.Gamma('tau', alpha=10, beta=2)
tauW = pymc.Gamma('tauW', alpha=20, beta=2, size=dimZ)
#pymc.deterministic(dtype=float)
def C(tau=tau):
return (tau)*np.eye(dimX)
#pymc.deterministic(dtype=float)
def CW(tau=tauW):
return np.diag(tau)
W = [pymc.MvNormal('W%i'%i, np.zeros(dimZ), CW) for i in range(dimX)]
z = [pymc.MvNormal('z%i'%i, np.zeros(dimZ), np.eye(dimZ)) for i in range(numD)]
mu = [pymc.Lambda('mu%i'%i, lambda W=W, z=z: mm + np.dot(np.array(W), np.array(z[i]))) for i in range(numD)]
obs = [pymc.MvNormal('Obs%i'%i, mu[i], C, value=Data[i,:], observed=True) for i in range(numD)]
model = pymc.Model([tau, tauW] + obs + W + z)
mcmc = pymc.MCMC(model)
But this time, it tries to allocate a huge amount of memory (more than 8GB) when running pymc.MCMC(model), with numD=45 and dimX=504. Even when I try it with only numD=1 (so creating only 1 z, mu, and obs), it does the same. Any idea why?
Unfortunately, PyMC does not easily let you define vectors of multivariate stochastics. Hopefully we can make this happen in PyMC 3. For now, you would have to specify this using a container. For example:
z = [pymc.MvNormal('z_%i' % i, np.zeros(dimZ), np.eye(dimZ)) for i in range(N)]
Regarding the memory issue, try using a different backend for the traces. The default ("ram") keeps everything in RAM. You can try something like "pickle" or "sqlite" instead.
Regarding the plate notation, it might be something we could pursue for PyMC 3. Feel free to create an issue suggesting this in our issue tracker.