SIR Estimation Finding Cause of Error While Estimating Parameters - python

I'm trying to fit SIR Epidemics Spread Model to the current new case data of the countries. In order to do that I used the work here: https://github.com/epimath/param-estimation-SIR . Main Idea was to fit best possible SIR's Infected curve to the new case data for that specific country, and calculate total predicted case number and the days that belong to %98 and %95 of total cases. The problem is, when I select Brazil, Mexico or United States. It shows that it will never end. I am curious about the reason. Any help about what can be done to deal with this non converging cases would be appreciated.
Please change the selected_country variable from "Spain" to one of those three countries(Brazil, Mexico or United States) to reproduce the result that leads me to ask here.
P.S. I know the limitations of the work. For example, new case number is bound to the number of tests etc. Please ignore those limitations. I'd like to see what is needed to produce a result out of the following code.
Here are some outputs:
Spain (Expected Output Example)
Turkey (Expected Output Example)
France (Expected Output Example)
USA (Unexpected Output Example)
Brazil (Unexpected Output Example)
I suspect something that cause gamma(the rate of recovering) parameter too small which leads the same amount of cases for each day. But I couldn't go further and found out what causing that. (I understood that by checking paramests variable by printing and examining it's values.)
Here you can find my code below.
import scipy.optimize as optimize
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import poisson
from scipy.stats import norm
import json
from scipy.integrate import odeint as ode
import pandas as pd
from datetime import datetime
time_start = datetime.timestamp(datetime.now())
output = {"result": "error"}
error = False
def model(ini, time_step, params):
Y = np.zeros(3) # column vector for the state variables
X = ini
mu = 0
beta = params[0]
gamma = params[1]
Y[0] = mu - beta * X[0] * X[1] - mu * X[0] # S
Y[1] = beta * X[0] * X[1] - gamma * X[1] - mu * X[1] # I
Y[2] = gamma * X[1] - mu * X[2] # R
return Y
def x0fcn(params, data):
S0 = 1.0 - (data[0] / params[2])
I0 = data[0] / params[2]
R0 = 0.0
X0 = [S0, I0, R0]
return X0
def yfcn(res, params):
return res[:, 1] * params[2]
# cost function for the SIR model for python 2.7
# Marisa Eisenberg (marisae#umich.edu)
# Yu-Han Kao (kaoyh#umich.edu) -7-9-17
def NLL(params, data, times): # negative log likelihood
params = np.abs(params)
data = np.array(data)
res = ode(model, x0fcn(params, data), times, args=(params,))
y = yfcn(res, params)
nll = sum(y) - sum(data * np.log(y))
# note this is a slightly shortened version--there's an additive constant term missing but it
# makes calculation faster and won't alter the threshold. Alternatively, can do:
# nll = -sum(np.log(poisson.pmf(np.round(data),np.round(y)))) # the round is b/c Poisson is for (integer) count data
# this can also barf if data and y are too far apart because the dpois will be ~0, which makes the log angry
# ML using normally distributed measurement error (least squares)
# nll = -sum(np.log(norm.pdf(data,y,0.1*np.mean(data)))) # example WLS assuming sigma = 0.1*mean(data)
# nll = sum((y - data)**2) # alternatively can do OLS but note this will mess with the thresholds
# for the profile! This version of OLS is off by a scaling factor from
# actual LL units.
return nll
df = pd.read_csv('https://github.com/owid/covid-19-data/raw/master/public/data/owid-covid-data.csv')
selected_location = 'Spain'
selected_df = df[df.location == selected_location].reset_index()
selected_df.date = pd.to_datetime(selected_df.date)
print(selected_df.head())
selected_df.date = pd.to_datetime(selected_df.date)
selected_df = selected_df[['date', 'new_cases']]
print(selected_df)
df = selected_df
optimizer = optimize.minimize(NLL, params, args=(data, times), method='Nelder-Mead',
options={'disp': False, 'return_all': False, 'xatol': 3.1201, 'fatol': 0.0001,
'adaptive': False})
paramests = np.abs(optimizer.x)
iniests = x0fcn(paramests, data)
print('Paramests:')
print(paramests)
times_long = range(0, int(len(times) * 10))
start_day = df['date'][0]
dates_long = []
for i in range(0, int(len(times) * 10)):
dates_long.append(start_day + (np.timedelta64(1, 'D') * i))
# print(df)
# print(dates_long)
# sys.exit()
#### Re-simulate and plot the model with the final parameter estimates ####
xest = ode(model, iniests, times_long, args=(paramests,))
# print(xest)
est_measure = yfcn(xest, paramests)
# plt.plot(times, data, 'k-o', linewidth=1, label='Data')
json_dict = {}
time_end = datetime.timestamp(datetime.now())
json_dict['duration'] = time_end - time_start
json_df = pd.DataFrame()
json_df['dates'] = dates_long
json_df['new_cases'] = df['new_cases']
json_df['prediction'] = est_measure
json_df = json_df.fillna("")
json_df['cumulative'] = json_df['prediction'].cumsum()
json_df = json_df[json_df['prediction'] >= 1]
if error == True:
json_dict['result'] = 'error'
json_dict['message'] = error_message
json_dict['timestamp'] = datetime.timestamp(datetime.now())
json_dict['chart_data'] = json_df.drop(columns=['prediction'], axis=1)
else:
json_dict['result'] = 'success'
json_dict['day_for_95_percent_predicted_cases'] = \
json_df[json_df['cumulative'] > (json_df['cumulative'].iloc[-1] * 0.95)]['dates'].reset_index(drop=True)[0]
json_dict['day_for_98_percent_predicted_cases'] = \
json_df[json_df['cumulative'] > (json_df['cumulative'].iloc[-1] * 0.98)]['dates'].reset_index(drop=True)[0]
# json_dict['timestamp'] = str(f"{datetime.now():%Y-%m-%d %H:%M:%S}")
json_dict['timestamp'] = datetime.timestamp(datetime.now())
json_dict['chart_data'] = json_df.to_dict()
json_string = json.dumps(json_dict, default=str)
print(json_string)
output = json_string # json string
plt.plot(json_df['dates'], json_df['prediction'], 'r-', linewidth=3, label='Predicted New Cases')
plt.bar(df['date'], data)
plt.axvline(x=json_dict['day_for_95_percent_predicted_cases'], label='(95%) '+str(json_dict['day_for_95_percent_predicted_cases'].date()),color='red')
plt.axvline(x=json_dict['day_for_98_percent_predicted_cases'], label='(98%) '+str(json_dict['day_for_98_percent_predicted_cases'].date()),color='green')
plt.xlabel('Time')
plt.ylabel('Individuals')
plt.legend()
plt.show()

Related

linear regression: my plotting doesn't show the line

I am working on implementing from scratch a linear regression model means without using Sklearn package.
all was working just fine , until i tried ploting the result.
my fit line isn't showing:
i looked at a bunch of solution but neither of them was for myy problem
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv(r'C:\Salary.csv')
x=data['Salary']
y=data['YearsExperience']
#y= mx+b
m = 0
b = 0
Learning_Rate = .01
epochs = 5000
n = np.float(x.shape[0])
error = []
for i in range(epochs):
Y_hat = m*x+b
#error
mse= (1/n)*np.sum((y-Y_hat)**2)
error.append(mse)
#gradient descend
db = (-2/n) * np.sum(x*(y-Y_hat))
dm = (-2/n) * np.sum((y-Y_hat))
m = m - Learning_Rate * dm
b = b - Learning_Rate * db
#tracing x and y line
x_line = np.linspace(0, 15, 100)
y_line = (m*x_line)+ b
#ploting result
plt.figure(figsize=(8,6))
plt.title('LR result')
**plt.plot(x_line,y_line) #the problem is apparently here
# i just don't know what to do**
plt.scatter(x,y)
plt.show()
appart from that, there is no problem with the code .
Your code has multiple problems:
you are plotting the line from 0 and 15, while data range from about 40000 to 140000. Even if you are correctly computing the line, you are going to plot it in a region far away from your data
in the loop there is a mistake in the computation of dm and db, they are swapped. The corrected expressions are:
dm = (-2/n)*np.sum(x*(y - Y_hat))
db = (-2/n)*np.sum((y - Y_hat))
your x and y data are on very different scales: x is ~10⁴ magnitude, while y is ~10¹. For this reason, also m and b will likely be very different from each other (different orders of magnitude). This is the reason why you should use two different learning rate for the different quantities you are optimizing: Learning_Rate_m for m and Learning_Rate_b for b
finally, the gradient descent method is strongly affected by the initial guess: it may lead to find local minima (fake solutions) in place of the global minima (true solution). For this reason, you should try with different initial guesses for m and b, possibly close to their estimated value:
m = 0
b = -2
Complete Code
import numpy as np
import matplotlib.pyplot as plt
N = 40
np.random.seed(42)
x = np.random.randint(low = 38000, high = 145000, size = N)
y = (13 - 1)/(140000 - 40000)*(x - 40000) + 1 + 0.5*np.random.randn(N)
# initial guess
m = 0
b = -2
Learning_Rate_m = 1e-10
Learning_Rate_b = 1e-2
epochs = 5000
n = np.float(x.shape[0])
error = []
for i in range(epochs):
Y_hat = m*x + b
mse = 1/n*np.sum((y - Y_hat)**2)
error.append(mse)
dm = -2/n*np.sum(x*(y - Y_hat))
db = -2/n*np.sum((y - Y_hat))
m = m - Learning_Rate_m*dm
b = b - Learning_Rate_b*db
x_line = np.linspace(x.min(), x.max(), 100)
y_line = (m*x_line) + b
plt.figure(figsize=(8,6))
plt.title('LR result')
plt.plot(x_line,y_line, 'red')
plt.scatter(x,y)
plt.show()
Plot
The problem is not happening while plotting, the problem is with the parameters in plt.plot(x_line,y_line), I tested your code and found that y_line is all NaN values, double check the calculations (y_line, m, dm).

solve_ivp discards imaginary part of complex solution

I am computing a solution to the free basis expansion of the dirac equation for electron-positron pairproduction. For this i need to solve a system of equations that looks like this:
Equation for pairproduction, from Mocken at al.
EDIT: This has been solved by passing y0 as complex type into the solver. As is stated in this issue: https://github.com/scipy/scipy/issues/8453 I would definitely consider this a bug but it seems like it has gone under the rock for at least 4 years
for this i am using SciPy's solve_ivp integrator in the following way:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from scipy.integrate import solve_ivp
import scipy.constants as constants
#Impulse
px, py = 0 , 0
#physics constants
e = constants.e
m = constants.m_e # electronmass
c = constants.c
hbar = constants.hbar
#relativistic energy
E = np.sqrt(m**2 *c**4 + (px**2+py**2) * c**2) # E_p
#adiabatic parameter
xi = 1
#Parameter of the system
w = 0.840 #frequency in 1/m_e
N = 8 # amount of amplitudes in window
T = 2* np.pi/w
#unit system
c = 1
hbar = 1
m = 1
#strength of electric field
E_0 = xi*m*c*w/e
print(E_0)
#vectorpotential
A = lambda t,F: -E_0/w *np.sin(t)*F
def linearFenster2(t):
conditions = [t <=0, (t/w>=0) and (t/w <= T/2), (t/w >= T/2) and (t/w<=T*(N+1/2)), (t/w>=T*(N+1/2)) and (t/w<=T*(N+1)), t/w>=T*(N+1)]
funcs = [lambda t: 0, lambda t: 1/np.pi *t, lambda t: 1, lambda t: 1-w/np.pi * (t/w-T*(N+1/2)), lambda t: 0]
return np.piecewise(t,conditions,funcs)
#Coefficient functions
nu = lambda t: -1j/hbar *e*A(w*t,linearFenster2(w*t)) *np.exp(2*1j/hbar * E*t) *(px*py*c**2 /(E*(E+m*c**2)) + 1j*(1- c**2 *py**2/(E*(E+m*c**2))))
kappa = lambda t: 1j*e*A(t,linearFenster2(w*t))* c*py/(E * hbar)
#System to solve
def System(t, y, nu, kappa):
df = kappa(t) *y[0] + nu(t) * y[1]
dg = -np.conjugate(nu(t)) * y[0] + np.conjugate(kappa(t))*y[1]
return np.array([df,dg], dtype=np.cdouble)
def solver(tmin, tmax,teval=None,f0=0,g0=1):
'''solves the system.
#tmin: starttime
#tmax: endtime
#f0: starting percentage of already present electrons of positive energy usually 0
#g0: starting percentage of already present electrons of negative energy, usually 1, therefore full vaccuum
'''
y0=[f0,g0]
tspan = np.array([tmin, tmax])
koeff = np.array([nu,kappa])
sol = solve_ivp(System,tspan,y0,t_eval= teval,args=koeff)
return sol
#Plotting of windowfunction
amount = 10**2
t = np.arange(0, T*(N+1), 1/amount)
vlinearFenster2 = np.array([linearFenster2(w*a) for a in t ], dtype = float)
fig3, ax3 = plt.subplots(1,1,figsize=[24,8])
ax3.plot(t,E_0/w * vlinearFenster2)
ax3.plot(t,A(w*t,vlinearFenster2))
ax3.plot(t,-E_0 /w * vlinearFenster2)
ax3.xaxis.set_minor_locator(ticker.AutoMinorLocator())
ax3.set_xlabel("t in s")
ax3.grid(which = 'both')
plt.show()
sol = solver(0, 70,teval = t)
ts= sol.t
f=sol.y[0]
fsquared = 2* np.absolute(f)**2
plt.plot(ts,fsquared)
plt.show()
The plot for the window function looks like this (and is correct)
window function
however the plot for the solution looks like this:
Plot of pairproduction probability
This is not correct based on the papers graphs (and further testing using mathematica instead).
When running the line 'sol = solver(..)' it says:
\numpy\core\_asarray.py:102: ComplexWarning: Casting complex values to real discards the imaginary part
return array(a, dtype, copy=False, order=order)
I simply do not know why solve_ivp discard the imaginary part. Its absolutely necessary.
Can someone enlighten me who knows more or sees the mistake?
According to the documentation, the y0 passed to solve_ivp must be of type complex in order for the integration to be over the complex domain. A robust way of ensuring this is to add the following to your code:
def solver(tmin, tmax,teval=None,f0=0,g0=1):
'''solves the system.
#tmin: starttime
#tmax: endtime
#f0: starting percentage of already present electrons of positive energy usually 0
#g0: starting percentage of already present electrons of negative energy, usually 1, therefore full vaccuum
'''
f0 = complex(f0) # <-- added
g0 = complex(g0) # <-- added
y0=[f0,g0]
tspan = np.array([tmin, tmax])
koeff = np.array([nu,kappa])
sol = solve_ivp(System,tspan,y0,t_eval= teval,args=koeff)
return sol
I tried the above, and it indeed made the warning disappear. However, the result of the integration seems to be the same regardless.

scipy curve_fi returns initial parameters estimates

I am triyng to use scipy curve_fit function to fit a gaussian function to my data to estimate a theoretical power spectrum density. While doing so, the curve_fit function always return the initial parameters (p0=[1,1,1]) , thus telling me that the fitting didn't work.
I don't know where the issue is. I am using python 3.9 (spyder 5.1.5) from the anaconda distribution on windows 11.
here a Wetransfer link to the data file
https://wetransfer.com/downloads/6097ebe81ee0c29ee95a497128c1c2e420220704110130/86bf2d
Here is my code below. Can someone tell me what the issue is, and how can i solve it?
on the picture of the plot, the blue plot is my experimental PSD and the orange one is the result of the fit.
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import scipy.constants as cst
File = np.loadtxt('test5.dat')
X = File[:, 1]
Y = File[:, 2]
f_sample = 50000
time=[]
for i in range(1,len(X)+1):
t=i*(1/f_sample)
time= np.append(time,t)
N = X.shape[0] # number of observation
N1=int(N/2)
delta_t = time[2] - time[1]
T_mes = N * delta_t
freq = np.arange(1/T_mes, (N+1)/T_mes, 1/T_mes)
freq=freq[0:N1]
fNyq = f_sample/2 # Nyquist frequency
nb = 350
freq_block = []
# discrete fourier tansform
X_ft = delta_t*np.fft.fft(X, n=N)
X_ft=X_ft[0:N1]
plt.figure()
plt.plot(time, X)
plt.xlabel('t [s]')
plt.ylabel('x [micro m]')
# Experimental power spectrum on both raw and blocked data
PSD_X_exp = (np.abs(X_ft)**2/T_mes)
PSD_X_exp_b = []
STD_PSD_X_exp_b = []
for i in range(0, N1+2, nb):
freq_b = np.array(freq[i:i+nb]) # i-nb:i
psd_b = np.array(PSD_X_exp[i:i+nb])
freq_block = np.append(freq_block, (1/nb)*np.sum(freq_b))
PSD_X_exp_b = np.append(PSD_X_exp_b, (1/nb)*np.sum(psd_b))
STD_PSD_X_exp_b = np.append(STD_PSD_X_exp_b, PSD_X_exp_b/np.sqrt(nb))
plt.figure()
plt.loglog(freq, PSD_X_exp)
plt.legend(['Raw Experimental PSD'])
plt.xlabel('f [Hz]')
plt.ylabel('PSD')
plt.figure()
plt.loglog(freq_block, PSD_X_exp_b)
plt.legend(['Experimental PSD after blocking'])
plt.xlabel('f [Hz]')
plt.ylabel('PSD')
kB = cst.k # Boltzmann constant [m^2kg/s^2K]
T = 273.15 + 25 # Temperature [K]
r = (2.8 / 2) * 1e-6 # Particle radius [m]
v = 0.00002414 * 10 ** (247.8 / (-140 + T)) # Water viscosity [Pa*s]
gamma = np.pi * 6 * r * v # [m*Pa*s]
Do = kB*T/gamma # expected value for D
f3db_o = 50000 # expected value for f3db
fc_o = 300 # expected value pour fc
n = np.arange(-10,11)
def theo_spectrum_lorentzian_filter(x, D_, fc_, f3db_):
PSD_theo=[]
for i in range(0,len(x)):
# print(i)
psd_theo=np.sum((((D_*Do)/2*math.pi**2)/((fc_*fc_o)**2+(x[i]+n*f_sample)
** 2))*(1/(1+((x[i]+n*f_sample)/(f3db_*f3db_o))**2)))
PSD_theo= np.append(PSD_theo,psd_theo)
return PSD_theo
popt, pcov = curve_fit(theo_spectrum_lorentzian_filter, freq_block, PSD_X_exp_b, p0=[1, 1, 1], sigma=STD_PSD_X_exp_b, absolute_sigma=True, check_finite=True,bounds=(0.1, 10), method='trf', jac=None)
D_, fc_, f3db_ = popt
D1 = D_*Do
fc1 = fc_*fc_o
f3db1 = f3db_*f3db_o
print('Diffusion constant D = ', D1, ' Corner frequency fc= ',fc1, 'f3db(diode,eff)= ', f3db1)
I believe I've successfully fitted your data. Here's the approach I took.
First, I plotted your model (with popt=[1, 1, 1]) and the data you had. I noticed your data was significantly lower than the model. Then I started fiddling with the parameters. I wanted to push the model upwards. I did that by multiplying popt[0] by increasingly large values. I ended up with 1E13 as a ballpark value. Note that I have no idea if this is physically possible for your model. Then I jury-rigged your fitting function to multiply D_ by 1E13 and ran your code. I got this fit:
So I believe it's a problem of 1) inappropriate starting values and 2) inappropriate bounds. In your position, I would revise this model, check if there's any problems with units and so on.
Here's what I used to try to fit your model:
plt.figure()
plt.loglog(freq_block[:170], PSD_X_exp_b[:170], label='Exp')
plt.loglog(freq_block[:170],
theo_spectrum_lorentzian_filter(
freq_block[:170],
1E13*popt[0], popt[1], popt[2]),
label='model'
)
plt.xlabel('f [Hz]')
plt.ylabel('PSD')
plt.legend()
I limited the data to point 170 because there were some weird backwards values that made me uncomfortable. I would recheck them if I were you.
Here's the model code I used. I didn't change the curve_fit call (except to limit x to :170.
def theo_spectrum_lorentzian_filter(x, D_, fc_, f3db_):
PSD_theo=[]
D_ = 1E13*D_ # I only changed here
for i in range(0,len(x)):
psd_theo=np.sum((((D_*Do)/2*math.pi**2)/((fc_*fc_o)**2+(x[i]+n*f_sample)
** 2))*(1/(1+((x[i]+n*f_sample)/(f3db_*f3db_o))**2)))
PSD_theo= np.append(PSD_theo,psd_theo)
return PSD_theo

how to create a proper sigmoid curve?

I'm trying to use logistic regression on the popularity of hits songs on Spotify from 2010-2019 based on their durations and durability, whose data are collected from an .csv file. Basically, since the popularity values of each song is numerical, I have converted each of them to binary numbers "0" to "1". If the popularity value of a hit song is less than 70, I will replace its current value to 0, and vice versa if its value is more than 70.
The current sigmoid curve is being "log" right now, hence it is showing a straight line. However, in the context of this code, I am still not sure how to add in a proper sigmoid curve, instead of just the straight line. Is there anything i need to add to my code in order to show both a solid sigmoid curve and the log of the curve in the same graph? It would be deeply appreciated if someone can help me with the final step.
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('top10s [SubtitleTools.com] (2).csv')
BPM = df.bpm
BPM = np.array(BPM)
Energy = df.nrgy
Energy = np.array(Energy)
Dance = df.dnce
Dance = np.array(Dance)
dB = df.dB
dB = np.array(dB)
Live = df.live
Live = np.array(Live)
Valence = df.val
Valence = np.array(Valence)
Acous = df.acous
Acous = np.array(Acous)
Speech = df.spch
Speech = np.array(Speech)
df.loc[df['popu'] <= 70, 'popu'] = 0
df.loc[df['popu'] > 70, 'popu'] = 1
def Logistic_Regression(X, y, iterations, alpha):
ones = np.ones((X.shape[0], ))
X = np.vstack((ones, X))
X = X.T
b = np.zeros(X.shape[1])
for i in range(iterations):
z = np.dot(X, b)
p_hat = sigmoid(z)
gradient = np.dot(X.T, (y - p_hat))/y.size
b = b + alpha * gradient
if (i % 1000 == 0):
print('LL, i ', log_likelihood(X, y, b), i)
return b
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def log_likelihood(X, y, b):
z = np.dot(X, b)
LL = np.sum(y*z - np.log(1 + np.exp(z)))
return LL
def LR1():
Dur = df.dur
Dur = np.array(Dur)
Pop = df.popu
Pop = [int(i) for i in Pop]; Pop = np.array(Pop)
plt.figure(figsize=(10,8))
colormap = np.array(['r', 'b'])
plt.scatter(Dur, Pop, c = colormap[Pop], alpha = .4)
b = Logistic_Regression(Dur, Pop, iterations = 8000, alpha = 0.00005)
print('Done')
p_hat = sigmoid(np.dot(Dur, b[1]) + b[0])
idxDur = np.argsort(Dur)
plt.plot(Dur[idxDur], p_hat[idxDur])
plt.show()
LR1()
My dataset:
CSV File
My Current Graph
What i want to have:
Shape of sigmoid i want
at first glance, your Logistic_Regression initialization seems very wrong.
I think you packed X with [X, 1] then tries to learn W = [Weight, bias], which should be [1, 0] to start with.
Note the 1 is vector [1, 1, 1...] with length = feature vector length.
try something like this:
x_range = np.linspace(Dur.min(), Dur.max(), 100)
p_hat = sigmoid(np.dot(x_range, b[1]), b[0])
plt.plot(x_range, p_hat)
plt.show()

Trying to build neural net for digit recognition in Python. Unable to get theta2 and predictions correct

I am following Andrew's Coursera course on machine learning. I am trying to build a 3 layers neural net for digit recognition in Python (784 input, 25 hidden, 10 output). However, I am unable to get the predictions (of the training data) correct (accuracy < 5% at 100 iter, accuracy not increasing with iteration).
J (the cost function) seems to be going down (see photo 1) and I have done gradient checking (before minimizing) and it seems to match to around 1e-11 (see photo 2).
I have compared the theta1 and theta2 after 100 iterations to my working matlab code (see code snippet 1 for octave and code snippet 2 for python). It seems theta1 is reasonably similar but theta2 is very different -- see code snippet 2. (I know they should differ because of the different optimisation routines. However, firstly, I have place the same initial thetas into both codes. Secondly, my reasoning is that they should start to converge, or at least get close, after 100 iterations)
The only error I see is:
-c:32: RuntimeWarning: overflow encountered in exp
when running the sigmoid during the optimising. However, I was told that this is not essential and it is normal to encounter this error during optimising? Furthermore, because it is a sigmoid, anytime the input is large, it will tend towards 1 anyways.
I have also attached my code in snippet 3. I have cut out all the other non-essential bits (like gradient checking) to make it as short as possible.
I would appreciate any help into this as I cannot even find where it is going wrong, let alone fix it. Thank you.
Photos:
J (cost function) decreasing to 1.8 after 12 iterations
Gradient checking before optimizing, they look very similar
Code snippet:
Initializing Neural Network Parameters ...
initial1
-0.0100100
-0.0771400
-0.1113800
-0.0230100
0.0547800
-0.0505500
-0.0731200
-0.0988700
0.0128000
-0.0855400
-0.1002500
-0.1137200
-0.0669300
-0.0999900
0.0084500
-0.0363200
-0.0588600
-0.0431100
-0.1133700
-0.0326300
0.0282800
0.0052400
-0.1134600
-0.0617700
0.0267600
initial2
0.0273700
0.1026000
-0.0502100
-0.0699100
0.0190600
0.1004000
0.0784600
-0.0075900
-0.0362100
0.0286200
Doing fminunc
Training Neural Network...
Iteration 100 | Cost: 6.219605e-01
theta1
-0.0099719
-0.0768462
-0.1109559
-0.0229224
0.0545714
-0.0503575
-0.0728415
-0.0984935
0.0127513
-0.0852143
-0.0998682
-0.1132869
-0.0666751
-0.0996092
0.0084178
-0.0361817
-0.0586359
-0.0429458
-0.1129383
-0.0325057
0.0281723
0.0052200
-0.1130279
-0.0615348
0.0266581
theta2
1.124918
1.603780
-1.266390
-0.848874
0.037956
-1.360841
2.145562
-1.448657
-1.262285
-1.357635
theta1_initial
[-0.01001 -0.07714 -0.11138 -0.02301 0.05478 -0.05055 -0.07312 -0.09887
0.0128 -0.08554 -0.10025 -0.11372 -0.06693 -0.09999 0.00845 -0.03632
-0.05886 -0.04311 -0.11337 -0.03263 0.02828 0.00524 -0.11346 -0.06177
0.02676]
theta2_initial
[ 0.02737 0.1026 -0.05021 -0.06991 0.01906 0.1004 0.07846 -0.00759
-0.03621 0.02862]
Doing fminunc
-c:32: RuntimeWarning: overflow encountered in exp
theta1
[-0.00997202 -0.07680716 -0.11086841 -0.02292044 0.05455335 -0.05034252
-0.07280686 -0.09842603 0.01275117 -0.08516515 -0.0997987 -0.11319546
-0.06664666 -0.09954009 0.00841804 -0.03617494 -0.05861458 -0.04293555
-0.1128474 -0.0325006 0.02816879 0.00522031 -0.1129369 -0.06151103
0.02665508]
theta2
[ 0.27954826 -0.08007496 -0.36449273 -0.22988024 0.06849659 -0.47803973
1.09023041 -0.25570559 -0.24537494 -0.40341995]
#-----------------BEGIN HEADERS-----------------
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import csv
import scipy
#-----------------END HEADERS-----------------
#-----------------BEGIN FUNCTION 1-----------------
def randinitialize(L_in, L_out):
w = np.zeros((L_out, 1 + L_in))
epsilon_init = 0.12
w = np.random.rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init
return w
#-----------------END FUNCTION 1-----------------
#-----------------BEGIN FUNCTION 2-----------------
def sigmoid(lz):
g = 1.0/(1.0+np.exp(-lz))
return g
#-----------------END FUNCTION 2-----------------
#-----------------BEGIN FUNCTION 3-----------------
def sigmoidgradient(lz):
g = np.multiply(sigmoid(lz),(1-sigmoid(lz)))
return g
#-----------------END FUNCTION 3-----------------
#-----------------BEGIN FUNCTION 4-----------------
def nncostfunction(ltheta_ravel, linput_layer_size, lhidden_layer_size, lnum_labels, lx, ly, llambda_reg):
ltheta1 = np.array(np.reshape(ltheta_ravel[:lhidden_layer_size * (linput_layer_size + 1)], (lhidden_layer_size, (linput_layer_size + 1))))
ltheta2 = np.array(np.reshape(ltheta_ravel[lhidden_layer_size * (linput_layer_size + 1):], (lnum_labels, (lhidden_layer_size + 1))))
ltheta1_grad = np.zeros((np.shape(ltheta1)))
ltheta2_grad = np.zeros((np.shape(ltheta2)))
y_matrix = []
lm = np.shape(lx)[0]
eye_matrix = np.eye(lnum_labels)
for i in range(len(ly)):
y_matrix.append(eye_matrix[int(ly[i])-1,:]) #The minus one as python is zero based
y_matrix = np.array(y_matrix)
a1 = np.hstack((np.ones((lm,1)), lx)).astype(float)
z2 = sigmoid(ltheta1.dot(a1.T))
a2 = (np.concatenate((np.ones((np.shape(z2)[1], 1)), z2.T), axis=1)).astype(float)
a3 = sigmoid(ltheta2.dot(a2.T))
h = a3
J_unreg = 0
J = 0
J_unreg = (1/float(lm))*np.sum(\
-np.multiply(y_matrix,np.log(h.T))\
-np.multiply((1-y_matrix),np.log(1-h.T))\
,axis=None)
J = J_unreg + (llambda_reg/(2*float(lm)))*\
(np.sum(\
np.multiply(ltheta1[:,1:],ltheta1[:,1:])\
,axis=None)+np.sum(\
np.multiply(ltheta2[:,1:],ltheta2[:,1:])\
,axis=None))
delta3 = a3.T - y_matrix
delta2 = np.multiply((delta3.dot(ltheta2[:,1:])), (sigmoidgradient(ltheta1.dot(a1.T))).T)
cdelta2 = ((a2.T).dot(delta3)).T
cdelta1 = ((a1.T).dot(delta2)).T
ltheta1_grad = (1/float(lm))*cdelta1
ltheta2_grad = (1/float(lm))*cdelta2
theta1_hold = ltheta1
theta2_hold = ltheta2
theta1_hold[:,0] = 0;
theta2_hold[:,0] = 0;
ltheta1_grad = ltheta1_grad + (llambda_reg/float(lm))*theta1_hold;
ltheta2_grad = ltheta2_grad + (llambda_reg/float(lm))*theta2_hold;
thetagrad_ravel = np.concatenate((np.ravel(ltheta1_grad), np.ravel(ltheta2_grad)))
return (J, thetagrad_ravel)
#-----------------END FUNCTION 4-----------------
#-----------------BEGIN FUNCTION 5-----------------
def predict(ltheta1, ltheta2, x):
m, n = np.shape(x)
p = np.zeros(m)
h1 = sigmoid((np.hstack((np.ones((m,1)),x.astype(float)))).dot(ltheta1.T))
h2 = sigmoid((np.hstack((np.ones((m,1)),h1))).dot(ltheta2.T))
for i in range(0,np.shape(h2)[0]):
p[i] = np.argmax(h2[i,:])
return p
#-----------------END FUNCTION 5-----------------
## Setup the parameters you will use for this exercise
input_layer_size = 784; # 28x28 Input Images of Digits
hidden_layer_size = 25; # 25 hidden units
num_labels = 10; # 10 labels, from 0 to 9
data = []
#Reading in data, split into X and y, rewrite label 0 to 10 (for easy comparison to course)
with open('train.csv', 'rb') as csvfile:
has_header = csv.Sniffer().has_header(csvfile.read(1024))
csvfile.seek(0) # rewind
data_csv = csv.reader(csvfile, delimiter=',')
if has_header:
next(data_csv)
for row in data_csv:
data.append(row)
data = np.array(data)
x = data[:,1:]
y = data[:,0]
y = y.astype(int)
for i in range(len(y)):
if y[i] == 0:
y[i] = 10
#Set basic parameters
m, n = np.shape(x)
lambda_reg = 1.0
#Randomly initalize weights for Theta_initial
#theta1_initial = np.genfromtxt('tt1.csv', delimiter=',')
#theta2_initial = np.genfromtxt('tt2.csv', delimiter=',')
theta1_initial = randinitialize(input_layer_size, hidden_layer_size);
theta2_initial = randinitialize(hidden_layer_size, num_labels);
theta_initial_ravel = np.concatenate((np.ravel(theta1_initial), np.ravel(theta2_initial)))
#Doing optimize
fmin = scipy.optimize.minimize(fun=nncostfunction, x0=theta_initial_ravel, args=(input_layer_size, hidden_layer_size, num_labels, x, y, lambda_reg), method='L-BFGS-B', jac=True, options={'maxiter': 10, 'disp': True})
fmin
theta1 = np.array(np.reshape(fmin.x[:hidden_layer_size * (input_layer_size + 1)], (hidden_layer_size, (input_layer_size + 1))))
theta2 = np.array(np.reshape(fmin.x[hidden_layer_size * (input_layer_size + 1):], (num_labels, (hidden_layer_size + 1))))
p = predict(theta1, theta2, x);
for i in range(len(y)):
if y[i] == 10:
y[i] = 0
correct = [1 if a == b else 0 for (a, b) in zip(p,y)]
accuracy = (sum(map(int, correct)) / float(len(correct)))
print 'accuracy = {0}%'.format(accuracy * 100)
I think I have fixed the problem: it seems I messed up the index
should be:
y_matrix.append(eye_matrix[int(ly[i]),:])
instead of:
y_matrix.append(eye_matrix[int(ly[i])-1,:])

Categories