Theano GRU RNN with Adam optimizer - Python
Technical information:
OS: Mac OS X 10.9.5
IDE: Eclipse Mars.1 Release (4.5.1), with PyDev and Anaconda interpreter (grammar version 3.4)
GPU: NVIDIA GeForce GT 650M
Libs: numpy, aeosa, Sphinx-1.3.1, Theano 0.7, nltk-3.1
My background: I am very new to theano and numpy and haven't taken a formal course in machine learning or discrete math.
The recurrent neural network for natural language processing I currently use is taken from here:
https://github.com/dennybritz/rnn-tutorial-gru-lstm/blob/master/gru_theano.py
The only change made to this file is replacing references to theano.config.floatX with the string 'float32'.
I also use the utils.py and train.py modules included in the repository, with only minor changes.
The adam optimizer I plan to incorporate in place of the sgd/rms code implemented in the example repository is found here: https://gist.github.com/skaae/ae7225263ca8806868cb
Reproduced here (again with references to the .config.floatX replaced with the hard-coded 'float32'):
import numpy as np
import theano as th
from theano import shared as thsh
import theano.tensor as T


def adam(loss, all_params, learning_rate=0.001, b1=0.9, b2=0.999, e=1e-8, gamma=1-1e-8):
    """
    ADAM update rules
    Default values are taken from [Kingma2014]

    References:
    [Kingma2014] Kingma, Diederik, and Jimmy Ba.
    "Adam: A Method for Stochastic Optimization."
    arXiv preprint arXiv:1412.6980 (2014).
    http://arxiv.org/pdf/1412.6980v4.pdf
    """
    updates = []
    all_grads = th.grad(loss, all_params)
    alpha = learning_rate
    t = thsh(np.float32(1))
    b1_t = b1 * gamma ** (t - 1)  # (Decay the first moment running average coefficient)

    for theta_previous, g in zip(all_params, all_grads):
        m_previous = thsh(np.zeros(theta_previous.get_value().shape, dtype='float32'))
        v_previous = thsh(np.zeros(theta_previous.get_value().shape, dtype='float32'))

        m = b1_t * m_previous + (1 - b1_t) * g   # (Update biased first moment estimate)
        v = b2 * v_previous + (1 - b2) * g ** 2  # (Update biased second raw moment estimate)
        m_hat = m / (1 - b1 ** t)                # (Compute bias-corrected first moment estimate)
        v_hat = v / (1 - b2 ** t)                # (Compute bias-corrected second raw moment estimate)
        theta = theta_previous - (alpha * m_hat) / (T.sqrt(v_hat) + e)  # (Update parameters)

        updates.append((m_previous, m))
        updates.append((v_previous, v))
        updates.append((theta_previous, theta))
    updates.append((t, t + 1.))
    return updates
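For reference, the gist is used by handing it a scalar cost and the list of shared parameters; the (parameter, new_value) pairs it returns go straight into the updates argument of theano.function. A minimal, untested sketch of that usage (the toy cost and variable names below are mine, not from either repository, and it assumes Theano runs with floatX=float32, matching the hard-coded 'float32' above):

# Toy illustration only: fit a single shared vector w to an input x
# by minimising a squared-error cost with the adam() updates above.
# Assumes THEANO_FLAGS includes floatX=float32.
w = thsh(np.zeros(3, dtype='float32'), name='w')
x = T.fvector('x')
cost = T.sum((w - x) ** 2)
train_step = th.function([x], cost, updates=adam(cost, [w]))

for _ in range(100):
    print(train_step(np.array([1.0, 2.0, 3.0], dtype='float32')))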
My question is this:
How would you modify the GRUTheano module to use the Adam method above in place of the built-in sgd/rmsprop function?
It looks like the key changes would be to lines 99-126 of GRUTheano:
# SGD parameters
learning_rate = T.scalar('learning_rate')
decay = T.scalar('decay')

# rmsprop cache updates
mE = decay * self.mE + (1 - decay) * dE ** 2
mU = decay * self.mU + (1 - decay) * dU ** 2
mW = decay * self.mW + (1 - decay) * dW ** 2
mV = decay * self.mV + (1 - decay) * dV ** 2
mb = decay * self.mb + (1 - decay) * db ** 2
mc = decay * self.mc + (1 - decay) * dc ** 2

self.sgd_step = theano.function(
    [x, y, learning_rate, theano.Param(decay, default=0.9)],
    [],
    updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
             (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
             (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
             (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
             (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
             (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
             (self.mE, mE),
             (self.mU, mU),
             (self.mW, mW),
             (self.mV, mV),
             (self.mb, mb),
             (self.mc, mc)
            ])
I haven't tested this code, but the only thing you really need to change is to pass the updates returned by adam(...) to theano.function instead of the rmsprop updates used in the repository. The complete module then looks like this (note that the rmsprop cache variables and the learning_rate/decay inputs are gone):
import numpy as np
import theano as theano
import theano.tensor as T
from theano.gradient import grad_clip
import time
import operator


class GRUTheano(object):

    def __init__(self, word_dim, hidden_dim=128, bptt_truncate=-1):
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Initialize the network parameters
        E = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        U = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (6, hidden_dim, hidden_dim))
        W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (6, hidden_dim, hidden_dim))
        V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        b = np.zeros((6, hidden_dim))
        c = np.zeros(word_dim)
        # Theano: Created shared variables
        self.E = theano.shared(name='E', value=E.astype(theano.config.floatX))
        self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
        self.W = theano.shared(name='W', value=W.astype(theano.config.floatX))
        self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
        self.b = theano.shared(name='b', value=b.astype(theano.config.floatX))
        self.c = theano.shared(name='c', value=c.astype(theano.config.floatX))
        # We store the Theano graph here
        self.theano = {}
        self.__theano_build__()

    def __theano_build__(self):
        E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c

        x = T.ivector('x')
        y = T.ivector('y')

        def forward_prop_step(x_t, s_t1_prev, s_t2_prev):
            # This is how we calculated the hidden state in a simple RNN. No longer!
            # s_t = T.tanh(U[:,x_t] + W.dot(s_t1_prev))

            # Word embedding layer
            x_e = E[:, x_t]

            # GRU Layer 1
            z_t1 = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t1_prev) + b[0])
            r_t1 = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t1_prev) + b[1])
            c_t1 = T.tanh(U[2].dot(x_e) + W[2].dot(s_t1_prev * r_t1) + b[2])
            s_t1 = (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev

            # GRU Layer 2
            z_t2 = T.nnet.hard_sigmoid(U[3].dot(s_t1) + W[3].dot(s_t2_prev) + b[3])
            r_t2 = T.nnet.hard_sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) + b[4])
            c_t2 = T.tanh(U[5].dot(s_t1) + W[5].dot(s_t2_prev * r_t2) + b[5])
            s_t2 = (T.ones_like(z_t2) - z_t2) * c_t2 + z_t2 * s_t2_prev

            # Final output calculation
            # Theano's softmax returns a matrix with one row, we only need the row
            o_t = T.nnet.softmax(V.dot(s_t2) + c)[0]

            return [o_t, s_t1, s_t2]

        [o, s, s2], updates = theano.scan(
            forward_prop_step,
            sequences=x,
            truncate_gradient=self.bptt_truncate,
            outputs_info=[None,
                          dict(initial=T.zeros(self.hidden_dim)),
                          dict(initial=T.zeros(self.hidden_dim))])

        prediction = T.argmax(o, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(o, y))

        # Total cost (could add regularization here)
        cost = o_error

        # Gradients
        dE = T.grad(cost, E)
        dU = T.grad(cost, U)
        dW = T.grad(cost, W)
        db = T.grad(cost, b)
        dV = T.grad(cost, V)
        dc = T.grad(cost, c)

        # Assign functions
        self.predict = theano.function([x], o)
        self.predict_class = theano.function([x], prediction)
        self.ce_error = theano.function([x, y], cost)
        self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc])

        self.params = [self.E, self.U, self.W, self.V, self.b, self.c]
        updates = adam(cost, self.params)
        self.sgd_step = theano.function(
            inputs=[x, y],
            outputs=[],
            updates=updates
        )

    def calculate_total_loss(self, X, Y):
        return np.sum([self.ce_error(x, y) for x, y in zip(X, Y)])

    def calculate_loss(self, X, Y):
        # Divide calculate_loss by the number of words
        num_words = np.sum([len(y) for y in Y])
        return self.calculate_total_loss(X, Y) / float(num_words)


def adam(loss, all_params, learning_rate=0.001, b1=0.9, b2=0.999, e=1e-8,
         gamma=1-1e-8):
    """
    ADAM update rules
    Default values are taken from [Kingma2014]

    References:
    [Kingma2014] Kingma, Diederik, and Jimmy Ba.
    "Adam: A Method for Stochastic Optimization."
    arXiv preprint arXiv:1412.6980 (2014).
    http://arxiv.org/pdf/1412.6980v4.pdf
    """
    updates = []
    all_grads = theano.grad(loss, all_params)
    alpha = learning_rate
    t = theano.shared(np.float32(1))
    b1_t = b1 * gamma ** (t - 1)  # (Decay the first moment running average coefficient)

    for theta_previous, g in zip(all_params, all_grads):
        m_previous = theano.shared(np.zeros(theta_previous.get_value().shape,
                                            dtype=theano.config.floatX))
        v_previous = theano.shared(np.zeros(theta_previous.get_value().shape,
                                            dtype=theano.config.floatX))

        m = b1_t * m_previous + (1 - b1_t) * g   # (Update biased first moment estimate)
        v = b2 * v_previous + (1 - b2) * g ** 2  # (Update biased second raw moment estimate)
        m_hat = m / (1 - b1 ** t)                # (Compute bias-corrected first moment estimate)
        v_hat = v / (1 - b2 ** t)                # (Compute bias-corrected second raw moment estimate)
        theta = theta_previous - (alpha * m_hat) / (T.sqrt(v_hat) + e)  # (Update parameters)

        updates.append((m_previous, m))
        updates.append((v_previous, v))
        updates.append((theta_previous, theta))
    updates.append((t, t + 1.))
    return updates
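One practical consequence: the compiled sgd_step now takes only [x, y]. The learning rate (and the rmsprop decay) are no longer inputs to the compiled function, because they are fixed inside adam() when the updates are built, so wherever train.py calls the step function with a learning rate, that argument has to be dropped. A quick, untested smoke test of the rewritten class (the tiny dimensions and example arrays are made up for illustration, and it assumes Theano is configured with floatX=float32 as in the question):

# Hypothetical smoke test for the Adam-based GRUTheano above.
model = GRUTheano(word_dim=10, hidden_dim=8)

x_example = np.array([0, 1, 2, 3], dtype='int32')   # input word indices
y_example = np.array([1, 2, 3, 4], dtype='int32')   # next-word targets

loss_before = model.calculate_loss([x_example], [y_example])
model.sgd_step(x_example, y_example)   # one Adam update; no learning_rate/decay arguments
loss_after = model.calculate_loss([x_example], [y_example])
print(loss_before, loss_after)

# To train with a different learning rate, change the call inside __theano_build__,
# e.g. updates = adam(cost, self.params, learning_rate=0.0005)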