Scipy minimize result is higher than with initial guess values - python

I am trying to estimate the parameters of the models jointly using nonlinear least squares, minimizing the sum of squared differences between the actual and model based estimates. However the resulting value is higher than the SSE with my guessed values. Guessed values SSE is 2,951,687, optimized parameters SSE is 4,281,096.
versions: python 3.7.6, numpy 1.19.2, scipy 1.5.2
import numpy as np
import pandas as pd
from scipy.optimize import minimize
###################### importing the excel file ######################
df = pd.read_csv('data2.csv')
###################### Setting up variables and arrays ######################
a = df.loc[:,'C(ADD)'].values #measured added customers
l = df.loc[:,'C(Loss)'].values #measured lost customers
m = df.loc[:,'m'].values #number of months
mkt = df.loc[:,'Marketing Expense'].values #maketing dollars in each month
e = 5596 #end measured value, Calculated from the cac/total marketing spend over the time period
n = len(df) #creates a variable of the length of the dataframe
###################### Defining equations ######################
g0 = np.zeros(n) #guess values
g0[0] = 0.0001
g0[1] = 0.006
g0[2] = 96755.00
g0[3] = 1.7
g0[4] = 0.6
g0[5] = 0.1
g0[6] = 0.006
g0[7] = 1.7
g0[8] = 0.6
def addhat(g): #Add predict values
pNT = g[0]
r = g[1]
alpha = g[2]
c = g[3]
Bm = g[4]
ah = np.empty(len(df)) #an empty array for the add hat values
b = np.empty(n) #an empty array for the B(m,m') values
b[0] = np.exp(np.log(mkt[0])*Bm)
ah[0] = 400000*((1-pNT) * (1 - (alpha/(alpha + b[0]))**r))
for i in range(1, n):
b[i] = b[i-1] + (m[i]**c - m[i-1]**c)*np.exp(np.log(mkt[i])*Bm)
ah[i] = 400000*((1-pNT) * (1 - (alpha/(alpha + (b[i])))**r))
return ah
print('add pred values: ' + str(addhat(g0)))
def rethat(g): #Retention percentage
rr = g[5]
alphar = g[6]
cr = g[7]
Bmr = g[8]
k = np.empty(n) #an empty array for exponent section of the formula
w = np.empty(n) #an empty array for the retention values
#The value of b(t)r when i = 0
k[0] = np.exp(np.log(mkt[0])*Bmr)
w[0] = 1 - (alphar/(alphar + k[0]))**rr
# the value of B(t) for all other values of q
for i in range(1, n):
k[i] = k[i-1] + (m[i]**cr - m[i-1]**cr)*np.exp(np.log(mkt[i])*Bmr)
w[i] = 1 - (alphar/(alphar + (k[i])))**rr
return w
def endpred(g): #predicting the end hat values
eh = np.empty(n) #an empty array for the end hat values
eh[0] = 213
for i in range(1, n):
eh[i] = (eh[i-1] * rethat(g)[i]) + addhat(g)[i]
return eh
endhat = sum(endpred(g0))
def losshat(g):
lh = np.empty(n) #an empty array for the loss hat values
lh[0] = 0
for i in range(1, n):
lh[i] = endpred(g)[i-1] - (endpred(g)[i] - addhat(g)[i])
return lh
###################### Sum of square errors ######################
def objective(g):
sse = sum((addhat(g)-a)**2 + (losshat(g)-l)**2) + (endhat-e)**2
return sse
print("SSE Initial: " + str(objective(g0)))
###################### Constraints ######################
def constraint1(g): #c is greater than 1
return g[3] - 1
def constraint2(g): #cr is greater than 1
return g[7] - 1
def constraint3(g): #pNT is greater than 0
return g[0]
con1 = {'type': 'ineq', 'fun': constraint1}
con2 = {'type': 'ineq', 'fun': constraint2}
con3 = {'type': 'ineq', 'fun': constraint3}
cons = [con1, con2, con3]
###################### Optimize ######################
s = minimize(objective, g0, method='SLSQP', constraints = cons)
g = s.x
print("SSE Final: " + str(objective(g)))
The resulting SSE value is 4,281,096.9 with the values being:
3.48133574e+02, 6.84452015e+02, 9.67550032e+04, 2.22008198e+00, -3.28153006e+03, -1.91454144e+02, 2.20947909e+02, 1.70207912e+00, -1.24649708e+01
The initial guess values I have used are quite close to the actual result values (I'm checking my code with a problem I know the result to). The results should be 0.0001001361, 0.006035783, 96,755.64542, 1.78204741, 0.636357403, 0.152, 0.0065432195, 1.73490796, 0.62625507 which have a SSE of 912,278.
Link to the data2.csv.
Thanks again for your help

It looks like you are using the same variable l for 2 different purposes.
It is first initialized with a fixed value from the CSV file, but also used as an internal variable in function rethat. And is also used in the objective function - which means that every time you optimize, you also change the objective function.
That does not look good...


how to implement least square polynomial with no built in methods using python?

currently running into a problem solving this.
The objective of the exercise given is to find a polynom of certian degree (the degree is given) from a dataset of points (that can be noist) and to best fit it using least sqaure method.
I don't understand the steps that lead to solving the linear equations?
what are the steps or should anyone provide such a python program that lead to the matrix that I put as an argument in my decomposition program?
Note:I have a python program for cubic splines ,LU decomposition/Guassian decomposition.
I tried to apply guassin / LU decomposition straight away on the dataset but I understand there are more steps to the solution...
I donwt understand how cubic splines add to the mix either..
guassian elimintaion :
import numpy as np
import math
def swapRows(v,i,j):
if len(v.shape) == 1:
v[i],v[j] = v[j],v[i]
v[[i,j],:] = v[[j,i],:]
def swapCols(v,i,j):
v[:,[i,j]] = v[:,[j,i]]
def gaussPivot(a,b,tol=1.0e-12):
n = len(b)
# Set up scale factors
s = np.zeros(n)
for i in range(n):
s[i] = max(np.abs(a[i,:]))
for k in range(0,n-1):
# Row interchange, if needed
p = np.argmax(np.abs(a[k:n,k])/s[k:n]) + k
if abs(a[p,k]) < tol: error.err('Matrix is singular')
if p != k:
# Elimination
for i in range(k+1,n):
if a[i,k] != 0.0:
lam = a[i,k]/a[k,k]
a[i,k+1:n] = a[i,k+1:n] - lam*a[k,k+1:n]
b[i] = b[i] - lam*b[k]
if abs(a[n-1,n-1]) < tol: error.err('Matrix is singular')
# Back substitution
b[n-1] = b[n-1]/a[n-1,n-1]
for k in range(n-2,-1,-1):
b[k] = (b[k] -[k,k+1:n],b[k+1:n]))/a[k,k]
return b
def polyFit(xData,yData,m):
a = np.zeros((m+1,m+1))
b = np.zeros(m+1)
s = np.zeros(2*m+1)
for i in range(len(xData)):
temp = yData[i]
for j in range(m+1):
b[j] = b[j] + temp
temp = temp*xData[i]
temp = 1.0
for j in range(2*m+1):
s[j] = s[j] + temp
temp = temp*xData[i]
for i in range(m+1):
for j in range(m+1):
a[i,j] = s[i+j]
return gaussPivot(a,b)
degree = 10 # can be any degree
I was under the impression the code above gets a dataset of points and a degree. The output should be coeefients of a polynom that fits those points but I have a grader that was provided by my proffesor , and after checking the grading the polynom that returns has a lrage error.
After that I tried the following LU decomposition instead:
import numpy as np
def swapRows(v,i,j):
if len(v.shape) == 1:
v[i],v[j] = v[j],v[i]
v[[i,j],:] = v[[j,i],:]
def swapCols(v,i,j):
v[:,[i,j]] = v[:,[j,i]]
def LUdecomp(a,tol=1.0e-9):
n = len(a)
seq = np.array(range(n))
# Set up scale factors
s = np.zeros((n))
for i in range(n):
s[i] = max(abs(a[i,:]))
for k in range(0,n-1):
# Row interchange, if needed
p = np.argmax(np.abs(a[k:n,k])/s[k:n]) + k
if abs(a[p,k]) < tol: error.err('Matrix is singular')
if p != k:
# Elimination
for i in range(k+1,n):
if a[i,k] != 0.0:
lam = a[i,k]/a[k,k]
a[i,k+1:n] = a[i,k+1:n] - lam*a[k,k+1:n]
a[i,k] = lam
return a,seq
def LUsolve(a,b,seq):
n = len(a)
# Rearrange constant vector; store it in [x]
x = b.copy()
for i in range(n):
x[i] = b[seq[i]]
# Solution
for k in range(1,n):
x[k] = x[k] -[k,0:k],x[0:k])
x[n-1] = x[n-1]/a[n-1,n-1]
for k in range(n-2,-1,-1):
x[k] = (x[k] -[k,k+1:n],x[k+1:n]))/a[k,k]
return x
the results were a bit better but nowhere near what it should be
Edit 2:
I tried the chebyshev method suggested in the comments and came up with:
import numpy as np
def chebyshev_transform(x, n):
Transforms x-coordinates to Chebyshev coordinates
return np.cos(n * np.arccos(x))
def chebyshev_design_matrix(x, n):
Constructs the Chebyshev design matrix
x_cheb = chebyshev_transform(x, n)
T = np.zeros((len(x), n+1))
T[:,0] = 1
T[:,1] = x_cheb
for i in range(2, n+1):
T[:,i] = 2 * x_cheb * T[:,i-1] - T[:,i-2]
return T
degree =10
f = lambda x: np.cos(X)
xdata = np.linspace(-1,1,num=100)
ydata = np.array([f(i) for i in xdata])
M = chebyshev_design_matrix(xdata,degree)
D_x ,D_y = np.linalg.qr(M)
D_x, seq = LUdecomp(D_x)
A = LUsolve(D_x,D_y,seq)
I can't use linalg.qr in my program , it was just for checking how it works.In addition , I didn't get the 'slow way' of the formula that were in the comment.
The program cant get an x point that is not between -1 and 1 , is there any way around it , any normalizition?
Thanks a lot.
You are probably asked for an unsophisticated method. If the degree of the polynomial remains low, you can use the straightforward approach below. For the sake of the explanation, I'll use a cubic model.
Assume that you want to fit your data to this polynomial, by observing that it seems to follow a cubic behavior:
ax³ + bx² + cx + d ~ y
[All x and y should be understood with an index i which is omitted for notational convenience.]
If there are more than four data points, you get an overdetermined system of equations, usually with no solution. The trick is to consider the error on the individual equations, e = ax³ + bx² + cx + d - y, and to minimize the total error. As the error is a signed number, negative errors would make minimization impossible. Instead, we minimize the sum of squared errors. (The sum of absolute errors is another option but it unfortunately leads to a much harder problem.)
Min(a, b, c, d) Σ(ax³ + bx² + cx + d - y)²
As the unknown parameters are unconstrained, it suffices to look for a stationary point, i.e. cancel the gradient of the total error. By differentiation on the unknowns a, b, c and d, we obtain
2Σ(ax³x³ + bx²x³ + cxx³ + dx³ - yx³) = 0
2Σ(ax³x² + bx²x² + cxx² + dx² - yx²) = 0
2Σ(ax³x + bx²x + cxx + dx - yx ) = 0
2Σ(ax³ + bx² + cx + d - y ) = 0
As you can recognize, this is a square linear system of equations.

How to assign to a variable an infinite value in gekko?

I am trying to assign an infinite value to a variable of gekko. I have tried with the numpy's infinite value and python's own infinite but it is still not working due to a problem of recognition of gekko.
The main objective of this idea is to force a variable to be strictly equal to 0, at least in the first iteration of the solver.
from gekko import GEKKO
from numpy import Inf
And the error I am getting:
Exception: #error: Model Expression
*** Error in syntax of function string: Invalid element: inf
Moreover, sometimes other variables are also required to be infinite, again, variables that are located in the denominator of a model equation. This is quite useful in order to try different scenarios of the simulation I am working with and check the systems behavior.
Hope you can help me, thank you.
The large-scale NLP and MINLP solvers don't know how to compute gradients with a np.nan value so initializing with NaN generally doesn't help. Please post example code that demonstrates the issue that you are observing with improved performance from NaN initialization.
Below are four unconstrained optimization methods compared on the same sample problem. The algorithms do not benefit from NaN for initialization. Some solvers substitute NaN with 0 or a high or low number. I suggest that you try giving np.nan as an initial condition to these solution methods to see how it affects the search for the minimum.
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
# define objective function
def f(x):
x1 = x[0]
x2 = x[1]
obj = x1**2 - 2.0 * x1 * x2 + 4 * x2**2
return obj
# define objective gradient
def dfdx(x):
x1 = x[0]
x2 = x[1]
grad = []
grad.append(2.0 * x1 - 2.0 * x2)
grad.append(-2.0 * x1 + 8.0 * x2)
return grad
# Exact 2nd derivatives (hessian)
H = [[2.0, -2.0],[-2.0, 8.0]]
# Start location
x_start = [-3.0, 2.0]
# Design variables at mesh points
i1 = np.arange(-4.0, 4.0, 0.1)
i2 = np.arange(-4.0, 4.0, 0.1)
x1_mesh, x2_mesh = np.meshgrid(i1, i2)
f_mesh = x1_mesh**2 - 2.0 * x1_mesh * x2_mesh + 4 * x2_mesh**2
# Create a contour plot
# Specify contour lines
lines = range(2,52,2)
# Plot contours
CS = plt.contour(x1_mesh, x2_mesh, f_mesh,lines)
# Label contours
plt.clabel(CS, inline=1, fontsize=10)
# Add some text to the plot
plt.title(r'$f(x)=x_1^2 - 2x_1x_2 + 4x_2^2$')
# Newton's method
xn = np.zeros((2,2))
xn[0] = x_start
# Get gradient at start location (df/dx or grad(f))
gn = dfdx(xn[0])
# Compute search direction and magnitude (dx)
# with dx = -inv(H) * grad
delta_xn = np.empty((1,2))
delta_xn = -np.linalg.solve(H,gn)
xn[1] = xn[0]+delta_xn
# Steepest descent method
# Number of iterations
n = 8
# Use this alpha for every line search
alpha = 0.15
# Initialize xs
xs = np.zeros((n+1,2))
xs[0] = x_start
# Get gradient at start location (df/dx or grad(f))
for i in range(n):
gs = dfdx(xs[i])
# Compute search direction and magnitude (dx)
# with dx = - grad but no line searching
xs[i+1] = xs[i] -,dfdx(xs[i]))
# Conjugate gradient method
# Number of iterations
n = 8
# Use this alpha for the first line search
alpha = 0.15
neg = [[-1.0,0.0],[0.0,-1.0]]
# Initialize xc
xc = np.zeros((n+1,2))
xc[0] = x_start
# Initialize delta_gc
delta_cg = np.zeros((n+1,2))
# Initialize gc
gc = np.zeros((n+1,2))
# Get gradient at start location (df/dx or grad(f))
for i in range(n):
gc[i] = dfdx(xc[i])
# Compute search direction and magnitude (dx)
# with dx = - grad but no line searching
if i==0:
beta = 0
delta_cg[i] = -,dfdx(xc[i]))
beta =[i],gc[i]) /[i-1],gc[i-1])
delta_cg[i] = alpha *,dfdx(xc[i])) + beta * delta_cg[i-1]
xc[i+1] = xc[i] + delta_cg[i]
# Quasi-Newton method
# Number of iterations
n = 8
# Use this alpha for every line search
alpha = np.linspace(0.1,1.0,n)
# Initialize delta_xq and gamma
delta_xq = np.zeros((2,1))
gamma = np.zeros((2,1))
part1 = np.zeros((2,2))
part2 = np.zeros((2,2))
part3 = np.zeros((2,2))
part4 = np.zeros((2,2))
part5 = np.zeros((2,2))
part6 = np.zeros((2,1))
part7 = np.zeros((1,1))
part8 = np.zeros((2,2))
part9 = np.zeros((2,2))
# Initialize xq
xq = np.zeros((n+1,2))
xq[0] = x_start
# Initialize gradient storage
g = np.zeros((n+1,2))
g[0] = dfdx(xq[0])
# Initialize hessian storage
h = np.zeros((n+1,2,2))
h[0] = [[1, 0.0],[0.0, 1]]
for i in range(n):
# Compute search direction and magnitude (dx)
# with dx = -alpha * inv(h) * grad
delta_xq =[i],np.linalg.solve(h[i],g[i]))
xq[i+1] = xq[i] + delta_xq
# Get gradient update for next step
g[i+1] = dfdx(xq[i+1])
# Get hessian update for next step
gamma = g[i+1]-g[i]
part1 = np.outer(gamma,gamma)
part2 = np.outer(gamma,delta_xq)
part3 =,part1)
part4 = np.outer(delta_xq,delta_xq)
part5 =[i],part4)
part6 =,h[i])
part7 =,h[i])
part8 =,delta_xq)
part9 =,1/part8)
h[i+1] = h[i] + part3 - part9
More information is available in the design optimization course.
Response to Edit
Thanks for clarifying the question and for including a source code example. While it isn't possible to include Inf as a guess, an equivalent form with an additional variable x may be able to accomplish the desired behavior. This sets the term (T[1]-T[0])/R initially equal to zero at the beginning iteration.
from gekko import GEKKO
from numpy import Inf

Plotting a System of Two Differential Eqns Python

I am having some trouble with a model I want to analyze. I am trying to plot two differential equations however I am very new to doing this and am not getting it to work. Any help is appreciated
#Polyaneuploid cell development during cancer
#two eqns
#Fixed Points:
import numpy as np
from scipy.integrate import odeint
import matplotlib.pyplot as plt
def modelC(C,t):
λc = 0.0601
K = 2000
α = 1 * (10**-4)
ν = 1 * (10**-6)
λp = 0.1
γ = 2
def modelP(P,t):
λc = 0.0601
K = 2000
α = 1 * (10**-4)
ν = 1 * (10**-6)
λp = 0.1
γ = 2
#returning odes
dPdt = ((λp))*P(1-(C+(γ*P))/K)+ (α*C)
dCdt = ((λc)*C)(1-(C+(γ*P))/K)-(α*C) + (ν*P)
return dPdt, dCdt
#initial conditions
C0= 256
P0 = 0
#time points
t = np.linspace(0,30)
#solve odes
P = odeint(modelP,t,P0, args = (C0,))
C = odeint(modelC,t,C0, args= (P0,))
#P = odeint(modelP, P0 , t)
#P = P[:, 2]
#C = odeint(modelC, C0 , t)
#C = C[:, 2]
#plot results
plt.xlabel('time in days')
This is just what I have so far, and currently I am getting this error: ValueError: diff requires input that is at least one dimensional
Any tips on how to get the graphs to show?
You need to put your initial conditions in a list like so:
initial_conditions = [C0, P0]
P = odeint(modelP,t,initial_conditions)
you still have some error in your P function where try to access C which is not defined in the local scope of your function neither passed as an argument.
def modelP(P,t,C):
λc = 0.0601
K = 2000
α = 1 * (10**-4)
ν = 1 * (10**-6)
λp = 0.1
γ = 2
#returning odes
dPdt = ((λp))*P(1-(C+(γ*P))/K)+ (α*C)
dCdt = ((λc)*C)(1-(C+(γ*P))/K)-(α*C) + (ν*P)
return dPdt, dCdt
#initial conditions
C0= 256
P0 = 0
Pconds = [P0]
#time points
t = np.linspace(0,30)
#solve odes
P = odeint(modelP,t, Pconds, args=(C0,))
The solver deals with flat arrays with no inherent meaning in the components. You need to add that meaning, unpack the input vector into the state object, at the start of the model function, and remove that meaning, reduce the state to a flat array or list, at the end of the model function.
Here this is simple, the state consists of 2 scalars. Thus a structure for the model function is
def model(X,t):
P, C = X
return dPdt, dCdt
Then integrate as
X = odeint(model,(P0,C0),t)
P,C = X.T

Can't manage to use correctly a library - Errors on import or on function not recognized

From pybss, I can't execute a simple using of the function ffdiag, which is by the way defined in my script :
from numpy import dot,diag,eye,zeros
from numpy.linalg import svd,pinv,multi_dot,norm,inv,cholesky
from scipy.linalg import expm
# remove later
import numpy as np
from . import linalg
# dimension
# number of matrices
# Load spectro and WL+GCph+XC
FISH_GCsp = np.loadtxt('Fisher_GCsp_flat.txt')
FISH_XC = np.loadtxt('Fisher_XC_GCph_WL_flat.txt')
# Marginalizing over uncommon parameters between the two matrices
COV_GCsp_first = np.linalg.inv(FISH_GCsp)
COV_XC_first = np.linalg.inv(FISH_XC)
COV_GCsp = COV_GCsp_first[0:m,0:m]
COV_XC = COV_XC_first[0:m,0:m]
# Invert to get Fisher matrix
FISH_sp = np.linalg.inv(COV_GCsp)
FISH_xc = np.linalg.inv(COV_XC)
# Drawing a random set of commuting matrices
C[0] = np.array(FISH_sp)
C[1] = np.array(FISH_xc)
# Perform operation of diagonalisation
invV, B, ut = ffdiag(C, 10, 1.0e-10, 100)
# Print diagonal matrices
M0 =,C[0]),B.T)
M1 =,C[1]),B.T)
FISH_final = M0 + M1
def ffdiag_update(R_tau,ortho):
Single update for the non-orthogonal FFDIAG algorithm. Set ortho = True to
do the proper update for the orthogonal version of the algorithm.
Dk = {}
Ek = {}
dim = len(R_tau[0])
n_lags = len(R_tau.keys())
for tau in R_tau.keys():
Dk[tau] = diag(diag(R_tau[tau]))
Ek[tau] = R_tau[tau] - Dk[tau]
W = zeros((dim,dim))
if ortho is False:
z = zeros(W.shape)
y = zeros(W.shape)
for i in range(0,dim):
for j in range(0,dim):
for tau in range(0,n_lags):
z[i,j] += Dk[tau][i,i]*Dk[tau][j,j]
y[i,j] += Dk[tau][j,j]*Ek[tau][i,j]
# compute W
for i in range(0,dim):
for j in range(i+1,dim):
W[i][j] = (z[i,j]*y[j,i]-z[i,i]*y[i,j])/(z[j,j]*z[i,i]-z[i,j]*z[i,j])
W[j][i] = (z[i,j]*y[i,j]-z[j,j]*y[j,i])/(z[j,j]*z[i,i]-z[i,j]*z[i,j])
num = zeros((dim,dim))
den = zeros((dim,dim))
for i in range(0,dim):
for j in range(i+1,dim):
for tau in range(0,n_lags):
num[i,j] += Ek[tau][i,j]*(Dk[tau][i,i] - Dk[tau][j,j])
den[i,j] += (Dk[tau][i,i]-Dk[tau][j,j])**2
if i != j:
W[i,j] = num[i,j]/den[i,j]
# W must be skew-symmetric (W = -W^T)
W[j,i] = -W[i,j]
return W
def amuse(X, tau = 1):
Runs the AMUSE algorithm on the signal matrix X; extracts a full
set of X.shape[0] sources
X : array, required
N_sig x t matrix of signal mixtures
tau : integer, optional
sets the lag used for cross-correlation matrix
A : array
n_sig x n_sig mixing matrix
W : array
n_sig x n_sig unmixing matrix
S : array
n_sig x t array of extracted sources
Rx = dot(X,X.T)
ux,sx,vx = svd(Rx, full_matrices = False)
psi = sx**0.5
C = diag(1/psi)
Y = dot(C,X)
t = X.shape[1]
Ry = linalg.lagged_covariance(Y,tau)[tau]
uy,sy,vy = svd((Ry + Ry.T)/2, full_matrices=False)
S = dot(vy.T,Y)
A = dot(dot(uy, diag(psi)), vy)
W = pinv(A)
return A,W,S
def sobi(X, max_lag = 15):
Blind source separation using SOBI (second-order blind identification)
X : array, required
N_sig x t matrix of signal mixtures
max_lag : int, optional
maximum lag (in samples) for covariance matrix calculation
Kw,Kd = linalg.whitening_matrix(X, len(X[:,0]))
Z = dot(Kw,X)
R_tau = linalg.lagged_covariance(Z,max_lag)
D = linalg.joint_diagonalizer(R_tau)
S = dot(D.T,Z)
A = dot(Kd,D)
W = dot(D.T,Kw)
return A,W,S
def ffdiag(X, max_lag = 10, eps = 1.0e-10, max_iter = 100):
Blind source separation using FFDIAG. This version does not require that
the estimated mixing matrix be orthogonal.
X : array, required
N_sig x t matrix of signal mixtures
max_lag : int, optional
maximum lag (in samples) for covariance matrix calculation
eps : double, optional
convergence criterion for matrix updates
max_iter : int, optional
maximum number of iterations/updates
R_tau = linalg.lagged_covariance(X,max_lag)
dim = len(R_tau[0])
n_lags = len(R_tau.keys())
W = zeros((dim,dim))
V = eye(dim)
C = R_tau
niter = 0
theta = 0.9
iter_eps = 1.0
while iter_eps > eps and niter < max_iter:
niter += 1
Vn1 = V
for tau in range(0,n_lags):
C[tau] = multi_dot([eye(dim) + W,C[tau],(eye(dim)+W).T])
# update term
W = ffdiag_update(C,False)
if norm(W) > theta:
W = (W*theta)/norm(W)
# update V
V = dot(eye(dim) + W,V)
delta = 0
for i in range(0,dim):
for j in range(0,dim):
if i == j:
delta += (V[i][j]-Vn1[i][j])**2
iter_eps = (delta/(dim*(dim-1)))
ut = dot(V,X)
return inv(V),V, ut
def ortho_ffdiag(X, max_lag = 10, eps = 1.0e-08, max_iter = 100):
Blind source separation using FFDIAG. This version (like SOBI, AMUSE, etc.)
finds an orthogonal mixing matrix.
X : array, required
N_sig x t matrix of signal mixtures
max_lag : int, optional
maximum lag (in samples) for covariance matrix calculation
eps : double, optional
convergence criterion for matrix updates
max_iter : int, optional
maximum number of iterations/updates
R_tau = linalg.lagged_covariance(X,max_lag)
dim = len(R_tau[0]) # formerly N
n_lags = len(R_tau.keys()) # formerly K
W = zeros((dim,dim))
V = eye(dim)
C = R_tau
n_iter = 0
theta = 0.9
iter_eps = 1.0
while iter_eps > eps and n_iter < max_iter:
n_iter += 1
Vn1 = V
for tau in range(0,n_lags):
C[tau] = multi_dot([eye(dim) + W,C[tau],(eye(dim)+W).T])
W = ffdiag_update(C,True)
if norm(W) > theta:
W = (W*theta)/norm(W)
# update V
V = dot(expm(W),V)
delta = 0
for i in range(0,dim):
for j in range(0,dim):
if i != j:
delta += (V[i][j]-Vn1[i][j])**2
eps = (delta/(dim*(dim-1)))
ut = dot(V,X)
return inv(V),V, ut
def fobi(X):
Blind source separation via the FOBI (fourth order blind identification)
R_x = linalg.lagged_covariance(X,0)
C = cholesky(R_x[0])
Y = dot(inv(C),X)
R_y = (norm(Y)**2)*(dot(Y,Y.T))
u,s,Y_i = svd(R_y)
alpha = dot(Y_i,Y) # messages (extracted sources)
X_i = dot(C,Y_i) # signatures (mixing matrix)
W_i = pinv(X_i) # unmixing matrix
# return things in A,W,S order to conform to other bss methods
return X_i,W_i,alpha
But I get a common error that I can't manage to fix :
Traceback (most recent call last):
File "", line 8, in
from . import linalg
ImportError: attempted relative import with no known parent package
You can see the browsing on this image :
I have also to put :
from linalg import linalg instead of but problem remains.
Another thing to do is to put :
from linalg import *
In this case, I get the following error :
Traceback (most recent call last):
File "", line 36, in <module>
invV, B, ut = ffdiag(C, 10, 1.0e-10, 100)
NameError: name 'ffdiag' is not defined
Is there a link between this error message and the line "from linalg import *"

Trying to build neural net for digit recognition in Python. Unable to get theta2 and predictions correct

I am following Andrew's Coursera course on machine learning. I am trying to build a 3 layers neural net for digit recognition in Python (784 input, 25 hidden, 10 output). However, I am unable to get the predictions (of the training data) correct (accuracy < 5% at 100 iter, accuracy not increasing with iteration).
J (the cost function) seems to be going down (see photo 1) and I have done gradient checking (before minimizing) and it seems to match to around 1e-11 (see photo 2).
I have compared the theta1 and theta2 after 100 iterations to my working matlab code (see code snippet 1 for octave and code snippet 2 for python). It seems theta1 is reasonably similar but theta2 is very different -- see code snippet 2. (I know they should differ because of the different optimisation routines. However, firstly, I have place the same initial thetas into both codes. Secondly, my reasoning is that they should start to converge, or at least get close, after 100 iterations)
The only error I see is:
-c:32: RuntimeWarning: overflow encountered in exp
when running the sigmoid during the optimising. However, I was told that this is not essential and it is normal to encounter this error during optimising? Furthermore, because it is a sigmoid, anytime the input is large, it will tend towards 1 anyways.
I have also attached my code in snippet 3. I have cut out all the other non-essential bits (like gradient checking) to make it as short as possible.
I would appreciate any help into this as I cannot even find where it is going wrong, let alone fix it. Thank you.
J (cost function) decreasing to 1.8 after 12 iterations
Gradient checking before optimizing, they look very similar
Code snippet:
Initializing Neural Network Parameters ...
Doing fminunc
Training Neural Network...
Iteration 100 | Cost: 6.219605e-01
[-0.01001 -0.07714 -0.11138 -0.02301 0.05478 -0.05055 -0.07312 -0.09887
0.0128 -0.08554 -0.10025 -0.11372 -0.06693 -0.09999 0.00845 -0.03632
-0.05886 -0.04311 -0.11337 -0.03263 0.02828 0.00524 -0.11346 -0.06177
[ 0.02737 0.1026 -0.05021 -0.06991 0.01906 0.1004 0.07846 -0.00759
-0.03621 0.02862]
Doing fminunc
-c:32: RuntimeWarning: overflow encountered in exp
[-0.00997202 -0.07680716 -0.11086841 -0.02292044 0.05455335 -0.05034252
-0.07280686 -0.09842603 0.01275117 -0.08516515 -0.0997987 -0.11319546
-0.06664666 -0.09954009 0.00841804 -0.03617494 -0.05861458 -0.04293555
-0.1128474 -0.0325006 0.02816879 0.00522031 -0.1129369 -0.06151103
[ 0.27954826 -0.08007496 -0.36449273 -0.22988024 0.06849659 -0.47803973
1.09023041 -0.25570559 -0.24537494 -0.40341995]
#-----------------BEGIN HEADERS-----------------
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import csv
import scipy
#-----------------END HEADERS-----------------
#-----------------BEGIN FUNCTION 1-----------------
def randinitialize(L_in, L_out):
w = np.zeros((L_out, 1 + L_in))
epsilon_init = 0.12
w = np.random.rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init
return w
#-----------------END FUNCTION 1-----------------
#-----------------BEGIN FUNCTION 2-----------------
def sigmoid(lz):
g = 1.0/(1.0+np.exp(-lz))
return g
#-----------------END FUNCTION 2-----------------
#-----------------BEGIN FUNCTION 3-----------------
def sigmoidgradient(lz):
g = np.multiply(sigmoid(lz),(1-sigmoid(lz)))
return g
#-----------------END FUNCTION 3-----------------
#-----------------BEGIN FUNCTION 4-----------------
def nncostfunction(ltheta_ravel, linput_layer_size, lhidden_layer_size, lnum_labels, lx, ly, llambda_reg):
ltheta1 = np.array(np.reshape(ltheta_ravel[:lhidden_layer_size * (linput_layer_size + 1)], (lhidden_layer_size, (linput_layer_size + 1))))
ltheta2 = np.array(np.reshape(ltheta_ravel[lhidden_layer_size * (linput_layer_size + 1):], (lnum_labels, (lhidden_layer_size + 1))))
ltheta1_grad = np.zeros((np.shape(ltheta1)))
ltheta2_grad = np.zeros((np.shape(ltheta2)))
y_matrix = []
lm = np.shape(lx)[0]
eye_matrix = np.eye(lnum_labels)
for i in range(len(ly)):
y_matrix.append(eye_matrix[int(ly[i])-1,:]) #The minus one as python is zero based
y_matrix = np.array(y_matrix)
a1 = np.hstack((np.ones((lm,1)), lx)).astype(float)
z2 = sigmoid(
a2 = (np.concatenate((np.ones((np.shape(z2)[1], 1)), z2.T), axis=1)).astype(float)
a3 = sigmoid(
h = a3
J_unreg = 0
J = 0
J_unreg = (1/float(lm))*np.sum(\
J = J_unreg + (llambda_reg/(2*float(lm)))*\
delta3 = a3.T - y_matrix
delta2 = np.multiply(([:,1:])), (sigmoidgradient(
cdelta2 = ((a2.T).dot(delta3)).T
cdelta1 = ((a1.T).dot(delta2)).T
ltheta1_grad = (1/float(lm))*cdelta1
ltheta2_grad = (1/float(lm))*cdelta2
theta1_hold = ltheta1
theta2_hold = ltheta2
theta1_hold[:,0] = 0;
theta2_hold[:,0] = 0;
ltheta1_grad = ltheta1_grad + (llambda_reg/float(lm))*theta1_hold;
ltheta2_grad = ltheta2_grad + (llambda_reg/float(lm))*theta2_hold;
thetagrad_ravel = np.concatenate((np.ravel(ltheta1_grad), np.ravel(ltheta2_grad)))
return (J, thetagrad_ravel)
#-----------------END FUNCTION 4-----------------
#-----------------BEGIN FUNCTION 5-----------------
def predict(ltheta1, ltheta2, x):
m, n = np.shape(x)
p = np.zeros(m)
h1 = sigmoid((np.hstack((np.ones((m,1)),x.astype(float)))).dot(ltheta1.T))
h2 = sigmoid((np.hstack((np.ones((m,1)),h1))).dot(ltheta2.T))
for i in range(0,np.shape(h2)[0]):
p[i] = np.argmax(h2[i,:])
return p
#-----------------END FUNCTION 5-----------------
## Setup the parameters you will use for this exercise
input_layer_size = 784; # 28x28 Input Images of Digits
hidden_layer_size = 25; # 25 hidden units
num_labels = 10; # 10 labels, from 0 to 9
data = []
#Reading in data, split into X and y, rewrite label 0 to 10 (for easy comparison to course)
with open('train.csv', 'rb') as csvfile:
has_header = csv.Sniffer().has_header( # rewind
data_csv = csv.reader(csvfile, delimiter=',')
if has_header:
for row in data_csv:
data = np.array(data)
x = data[:,1:]
y = data[:,0]
y = y.astype(int)
for i in range(len(y)):
if y[i] == 0:
y[i] = 10
#Set basic parameters
m, n = np.shape(x)
lambda_reg = 1.0
#Randomly initalize weights for Theta_initial
#theta1_initial = np.genfromtxt('tt1.csv', delimiter=',')
#theta2_initial = np.genfromtxt('tt2.csv', delimiter=',')
theta1_initial = randinitialize(input_layer_size, hidden_layer_size);
theta2_initial = randinitialize(hidden_layer_size, num_labels);
theta_initial_ravel = np.concatenate((np.ravel(theta1_initial), np.ravel(theta2_initial)))
#Doing optimize
fmin = scipy.optimize.minimize(fun=nncostfunction, x0=theta_initial_ravel, args=(input_layer_size, hidden_layer_size, num_labels, x, y, lambda_reg), method='L-BFGS-B', jac=True, options={'maxiter': 10, 'disp': True})
theta1 = np.array(np.reshape(fmin.x[:hidden_layer_size * (input_layer_size + 1)], (hidden_layer_size, (input_layer_size + 1))))
theta2 = np.array(np.reshape(fmin.x[hidden_layer_size * (input_layer_size + 1):], (num_labels, (hidden_layer_size + 1))))
p = predict(theta1, theta2, x);
for i in range(len(y)):
if y[i] == 10:
y[i] = 0
correct = [1 if a == b else 0 for (a, b) in zip(p,y)]
accuracy = (sum(map(int, correct)) / float(len(correct)))
print 'accuracy = {0}%'.format(accuracy * 100)
I think I have fixed the problem: it seems I messed up the index
should be:
instead of:
