Backpropagation on Neural Network in Python - python

I've implemented a neural network (deep autoencoder), on which I'm trying to perform backpropagation. The network consists of sigmoid activation functions and a softmax activation function at the output layer. To calculate the error, I use the cross-entropy error function. The input data are bag-of-words matrices, where the word counts are divided by the length of the document to normalize the data.
I'm using the conjugate gradient method to find a local minimum. My problem is that the error rises during backpropagation. I believe it has something to do with my calculating the gradient incorrectly.
The code to calculate the error and gradient is given below:
def get_grad_and_error(self, weights, weight_sizes, x):
    """Return the cross-entropy error and the flattened weight gradient.

    Parameters
    ----------
    weights : flat weight vector; unpacked into one matrix per layer by
        ``self.__convert__``.
    weight_sizes : passed through to ``self.__convert__``.
    x : 2-D input batch, one example per row.  The network is an
        autoencoder, so ``x`` is also the target.

    Returns
    -------
    (f, gradients_formatted) : the scalar error and a 1-D gradient vector.
    """
    weights = self.__convert__(weights, weight_sizes)
    # A trailing column of ones makes the last row of each weight matrix
    # act as the bias.
    x = append(x, ones((len(x), 1), dtype=float64), axis=1)
    xout, z_values = self.__generate_output_data__(x, weights)
    f = -sum(x[:, :-1] * log(xout))  # Cross-entropy error function

    # Gradient
    number_of_weights = len(weights)
    # Indexed by layer so that the flattened gradient follows the same
    # first-to-last order as ``weights``, even though the loop runs backwards.
    # NOTE(review): assumes __convert__ unpacks layers first to last -- confirm.
    gradients = [None] * number_of_weights
    delta_k = None
    for i in range(number_of_weights - 1, -1, -1):
        if i == number_of_weights - 1:
            # Softmax output with cross-entropy: delta is prediction - target.
            delta = xout - x[:, :-1]
        else:
            delta = dot(delta_k, weights[i + 1].T) * z_values[i] * (1 - z_values[i])
            # z_values[i] carries an appended bias column, which has no
            # incoming weights; drop its delta.
            delta = delta[:, :-1]
        # The first layer is fed by the (bias-augmented) input itself.
        layer_input = x if i == 0 else z_values[i - 1]
        gradients[i] = dot(layer_input.T, delta)
        delta_k = delta

    gradients_formatted = []
    for g in gradients:
        gradients_formatted = append(gradients_formatted, reshape(g, (1, len(g) * len(g[0])))[0])
    return f, gradients_formatted
To calculate the output of the network I use following method:
def __generate_output_data__(self, x, weight_matrices_added_biases):
    """Run a forward pass and return ``(xout, z_values)``.

    ``x`` must already carry its trailing bias column of ones.  Every layer
    except the last uses a sigmoid and gets a bias column appended to its
    output; the last layer uses a softmax.  ``z_values`` holds the
    bias-augmented activations of the sigmoid layers, in order.
    """
    z_values = []
    for i in range(len(weight_matrices_added_biases) - 1):
        # The first layer reads the input; later layers read the previous
        # layer's activations.
        layer_input = x if i == 0 else z_values[i - 1]
        z = dbn.sigmoid(dot(layer_input, weight_matrices_added_biases[i]))
        z = append(z, ones((len(x), 1), dtype=float64), axis=1)
        z_values.append(z)
    # With a single weight matrix there are no sigmoid layers, so the softmax
    # layer is fed by the input directly.
    last_input = z_values[-1] if z_values else x
    xout = dbn.softmax(dot(last_input, weight_matrices_added_biases[-1]))
    return xout, z_values
I calculate the sigmoid and softmax values as follows:
def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``."""
    return 1. / (1 + exp(-x))
def softmax(x):
    """Return the row-wise softmax of the 2-D array *x*.

    Each row of the result is non-negative and sums to one.
    """
    # Softmax is unchanged by subtracting a constant from a row; subtracting
    # the row maximum keeps exp() from overflowing to inf (and the result
    # from becoming NaN) for large inputs.
    shifted = x - x.max(axis=1).reshape((x.shape[0], 1))
    numerator = exp(shifted)
    denominator = numerator.sum(axis=1)
    denominator = denominator.reshape((x.shape[0], 1))
    return numerator / denominator
I would really appreciate if anyone could be of assistance? Please let me know if you need me to elaborate on any of the above info? Thanks.


Scipy fails to minimize cost function

Currently I'm learning from Andrew Ng course on Coursera called "Machine Learning". In exercise 5, we built a model that can predict digits, trained by the MNIST dataset. This task was completed successfully in Matlab by me, but I wanted to migrate that code to Python, just to see how different things are and maybe continue to play around with the model.
I managed to implement the cost function and the back propagation algorithm correctly. I know that because I compared the metrics with my working model in Matlab and it emits the same numbers.
Now, because in the course we train the model using fmincg, I tried to do the same using SciPy's fmin_cg.
My problem is that the optimizer takes extremely small steps and the cost fails to converge.
Here is my code for the network:
import numpy as np
import utils
import scipy.optimize as op
class Network:
    """Feed-forward sigmoid network trained with conjugate gradient.

    ``layers`` holds the number of units per layer (input first, output
    last).  ``weights[i]`` maps layer ``i`` to layer ``i + 1`` and has shape
    ``(layers[i + 1], layers[i] + 1)``; column 0 multiplies the bias unit.
    """

    def __init__(self, layers):
        self.layers = layers
        self.weights = self.generate_params()

    # Function for generating theta multidimensional matrix
    def generate_params(self):
        """Return one randomly initialised weight matrix per layer transition."""
        theta = []
        epsilon = 0.12
        for i in range(len(self.layers) - 1):
            current_layer_units = self.layers[i]
            next_layer_units = self.layers[i + 1]
            # rand() is uniform on [0, 1): scale and shift it to
            # [-epsilon, epsilon) so the weights are centred on zero.
            theta_i = (
                np.random.rand(next_layer_units, current_layer_units + 1)
                * 2 * epsilon - epsilon
            )
            # Appending the params to the theta matrix
            theta.append(theta_i)
        return theta

    # Function to append bias row/column to matrix X
    def append_bias(self, X, d):
        """Return a copy of *X* with a leading row or column of ones.

        ``d`` is ``'row'`` or ``'column'``.  A 1-D *X* is treated as a
        single column.
        """
        m = X.shape[0]
        n = 1 if len(X.shape) == 1 else X.shape[1]
        if d == 'column':
            ones = np.ones((m, n + 1))
            ones[:, 1:] = X.reshape((m, n))
        elif d == 'row':
            ones = np.ones((m + 1, n))
            ones[1:, :] = X.reshape((m, n))
        else:
            raise ValueError("d must be 'row' or 'column'")
        return ones

    # Function for computing the gradient for 1 training example
    def back_prop(self, y, feed, theta):
        """Return the per-layer weight gradients for one training example.

        ``feed`` is the dictionary produced by ``feed_forward`` for that
        example and ``y`` its target output vector.
        """
        activations = feed["activations"]
        weighted_layers = feed["weighted_layers"]
        current_delta = activations[-1] - y.reshape(len(y), 1)
        gradients = [None] * len(theta)
        # Performing delta calculations.
        # Here, we continue to propagate the delta values backwards
        # until we arrive at the second layer.
        for i in reversed(range(len(theta))):
            # (units_out, 1) * (1, units_in + 1) broadcasts to the outer product.
            gradients[i] = current_delta * np.transpose(activations[i])
            if i > 0:
                i_weighted_inputs = self.append_bias(weighted_layers[i - 1], 'row')
                delta_i = np.multiply(
                    np.dot(np.transpose(theta[i]), current_delta),
                    utils.sigmoidGradient(i_weighted_inputs),
                )
                # Drop the bias entry and set the delta for the next layer.
                current_delta = delta_i[1:]
        return gradients

    # Function for computing the cost and the derivatives
    def compute_cost(self, theta, X, y, r12n = 0):
        """Return ``(cost, gradients)`` of the network for weights *theta*.

        ``r12n`` is the regularisation strength; bias weights (column 0)
        are not regularised.
        """
        m = len(X)
        num_labels = self.layers[-1]
        costs = np.zeros(m)
        # Initializing gradients
        gradients = [np.zeros(theta_i.shape) for theta_i in theta]
        # Iterating over the training set
        for i in range(m):
            inputs = X[i]
            observed = utils.create_output_vector(y[i], num_labels)
            # Evaluate with the weights under optimisation, not self.weights;
            # otherwise the data term of the cost never changes with theta.
            feed = self.feed_forward(inputs, theta)
            predicted = feed["activations"][-1]
            total_cost = 0
            for k, o in enumerate(observed):
                if o == 1:
                    total_cost += np.log(predicted[k])
                else:
                    total_cost += np.log(1 - predicted[k])
            # Storing the cost for the i-th training example
            costs[i] = -1 * np.sum(total_cost)
            # Calculating the gradient for this training example
            # using back propagation algorithm
            gradients_i = self.back_prop(observed, feed, theta)
            for layer, gradient in enumerate(gradients_i):
                gradients[layer] += gradient
        # Calculating the avg regularization term for the cost
        sum_of_theta = 0
        for theta_i in theta:
            sum_of_theta += np.sum(np.power(theta_i[:, 1:], 2))
        r12n_avg = r12n * sum_of_theta / (2 * m)
        total_cost = np.sum(costs) / m + r12n_avg
        # Applying regularization terms to the gradients
        for i, theta_i in enumerate(theta):
            lambda_i = np.copy(theta_i)
            lambda_i[:, 0] = 0
            lambda_i = np.multiply((r12n / m), lambda_i)
            # Adding the r12n matrix to the gradient
            gradients[i] = gradients[i] / m + lambda_i
        return total_cost, gradients

    # Function for training the neural network using conjugate gradient algorithm
    def train_cg(self, X, y, r12n = 0, iterations = 50):
        """Optimise the weights with ``scipy.optimize.fmin_cg`` and store them."""
        def Cost(theta, X, y):
            theta = utils.roll_theta(theta, self.layers)
            cost, _ = self.compute_cost(theta, X, y, r12n)
            return cost

        def Gradient(theta, X, y):
            theta = utils.roll_theta(theta, self.layers)
            _, gradient = self.compute_cost(theta, X, y, r12n)
            return utils.unroll_theta(gradient)

        unrolled_theta = utils.unroll_theta(self.weights)
        # Pass the analytic gradient; without fprime, fmin_cg falls back to a
        # finite-difference approximation of it.
        result = op.fmin_cg(f = Cost,
                            x0 = unrolled_theta,
                            fprime = Gradient,
                            args=(X, y),
                            maxiter = iterations)
        self.weights = utils.roll_theta(result, self.layers)

    # Function for feeding forward the network
    def feed_forward(self, X, theta=None):
        """Propagate one example through the network.

        ``theta`` defaults to the stored ``self.weights``.  Returns a dict
        with ``"activations"`` (bias-augmented input, bias-augmented hidden
        layers, then the output layer) and ``"weighted_layers"`` (the
        pre-activation values of every non-input layer).
        """
        weights = self.weights if theta is None else theta
        currentActivations = self.append_bias(X, 'row')
        # back_prop indexes activations[i] as the input to weights[i], so the
        # input layer itself must be the first entry.
        activations = [currentActivations]
        weighted_layers = []
        for i in range(len(self.layers) - 1):
            layer_weights = weights[i]
            weighted_inputs = np.dot(layer_weights, currentActivations)
            # Storing the weighted inputs
            weighted_layers.append(weighted_inputs)
            # If the next layer is not the output layer, we'd like to add a bias unit to it
            # (Excluding the input and the output layer)
            if i < len(self.layers) - 2:
                activation_nodes = self.append_bias(utils.sigmoid(weighted_inputs), 'row')
            else:
                activation_nodes = utils.sigmoid(weighted_inputs)
            # Appending the layer of nodes to the activations array
            activations.append(activation_nodes)
            currentActivations = activation_nodes
        data = {
            "activations": activations,
            "weighted_layers": weighted_layers
        }
        return data

    def predict(self, X):
        """Return the index of the strongest output unit for example *X*."""
        data = self.feed_forward(X)
        output = data["activations"][-1]
        # Finding the max index in the output layer
        return np.argmax(output, axis=0)
Here is the invocation of the code:
import numpy as np
from network import Network
# %% Load data
# Both files are read as comma-separated text; the labels are cast to int.
X = np.genfromtxt('data/mnist_data.csv', delimiter=',')
y = np.genfromtxt('data/mnist_outputs.csv', delimiter=',').astype(int)
# %% Create network
# Layer sizes: one output unit per label.
# NOTE(review): 400 inputs presumably means flattened 20x20 images -- confirm.
num_labels = 10
input_layer = 400
hidden_layer = 25
output_layer = num_labels
layers = [input_layer, hidden_layer, output_layer]
# Create a new neural network
network = Network(layers)
# %% Train the network and save the weights
# train_cg stores the optimised weights on the network object; r12n is the
# regularisation strength and iterations caps the optimiser's iterations.
network.train_cg(X, y, r12n = 1, iterations = 20)
This is what the code emits after each iteration:
As you can see, the changes to the cost are very small.
I checked for the shapes of the vectors and gradient and they both seem fine, just like in my Matlab implementation. I'm not sure what I do wrong here.
If you guys could help me, that'd be great :)

PyTorch does not converge when approximating square function with linear model

I'm trying to learn some PyTorch and am referencing this discussion here
The author provides a minimum working piece of code that illustrates how you can use PyTorch to solve for an unknown linear function that has been polluted with random noise.
This code runs fine for me.
However, when I change the function such that I want t = X^2, the parameter does not seem to converge.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
# Let's make some data for a linear regression.
A = 3.1415926
b = 2.7189351
error = 0.1
N = 100  # number of data points

# Data
X = Variable(torch.randn(N, 1))

# (noisy) Target values that we want to learn.
t = X * X + Variable(torch.randn(N, 1) * error)

# Creating a model, making the optimizer, defining loss
model = nn.Linear(1, 1)
optimizer = optim.SGD(model.parameters(), lr=0.05)
loss_fn = nn.MSELoss()

# Run training
niter = 50
for _ in range(0, niter):
    # Gradients accumulate across backward() calls, so clear them first.
    optimizer.zero_grad()
    predictions = model(X)
    loss = loss_fn(predictions, t)
    # Without these two calls the parameters are never updated.
    loss.backward()
    optimizer.step()

    print("-" * 50)
    print("error = {}".format(loss.item()))
    print("learned A = {}".format(list(model.parameters())[0].data[0, 0].item()))
    print("learned b = {}".format(list(model.parameters())[1].data[0].item()))
When I execute this code, the new A and b parameters are seemingly random, so it does not converge. I think this should converge because you can approximate any function with a slope and offset function. My theory is that I'm using PyTorch incorrectly.
Can anyone identify a problem with my t = X * X + Variable(torch.randn(N, 1) * error) line of code?
You cannot fit a 2nd degree polynomial with a linear function. You cannot expect more than random (since you have random samples from the polynomial).
What you can do is try and have two inputs, x and x^2 and fit from them:
model = nn.Linear(2, 1)  # you have 2 inputs now
# Stack x and x^2 side by side: shape (N, 2), one feature per column.
X_input = torch.cat((X, X**2), dim=1)  # have 2 inputs per entry
# ...
predictions = model(X_input)  # 2 inputs -> 1 output
loss = loss_fn(predictions, t)
# ...
# learning t = c*x^2 + a*x + b
print("learned a = {}".format(list(model.parameters())[0].data[0, 0]))
print("learned c = {}".format(list(model.parameters())[0].data[0, 1]))
print("learned b = {}".format(list(model.parameters())[1].data[0]))

Neural network backprop not fully training

I have trained the neural network shown below. It works, or at least appears to work, but the problem is with the training. I'm trying to train it to act as an OR gate, but it never seems to get there; the output tends to look like this:
prior to training:
post training:
[0.754652 ]
expected output:
I have this loss graph:
It's strange: it appears to be training, but at the same time it is not quite getting to the expected output. I know that it would never exactly reach the 0s and 1s, but I expected it to get at least a little closer to the expected output.
I had some issues trying to figure out how to back prop the error as I wanted to make this network have any number of hidden layers, so I stored the local gradient in a layer, along side the weights, and sent the error from the end back.
The main functions I suspect are the culprits are NeuralNetwork.train and both forward methods.
import sys
import math
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
class NeuralNetwork:
class __Layer:
def __init__(self,args):
self.__epsilon = 1e-6
self.localGrad = 0
self.__weights = np.random.randn(
self.__biases = np.zeros(
def __str__(self):
return str(self.__weights)
def forward(self,X):
a =, self.__weights) + self.__biases
self.localGrad =,self.__sigmoidPrime(a))
return self.__sigmoid(a)
def adjustWeights(self, err):
self.__weights -= (err * self.__epsilon)
def __sigmoid(self, z):
return 1/(1 + np.exp(-z))
def __sigmoidPrime(self, a):
return self.__sigmoid(a)*(1 - self.__sigmoid(a))
def __init__(self,args):
self.__inputDimensions = args["inputDimensions"]
self.__outputDimensions = args["outputDimensions"]
self.__hiddenDimensions = args["hiddenDimensions"]
self.__layers = []
def __constructLayers(self):
"biasHeight": self.__inputDimensions[0],
"previousLayerHeight": self.__inputDimensions[1],
"height": self.__hiddenDimensions[0][0]
if len(self.__hiddenDimensions) > 0
else self.__outputDimensions[0]
for i in range(len(self.__hiddenDimensions)):
"biasHeight": self.__hiddenDimensions[i + 1][0]
if i + 1 < len(self.__hiddenDimensions)
else self.__outputDimensions[0],
"previousLayerHeight": self.__hiddenDimensions[i][0],
"height": self.__hiddenDimensions[i + 1][0]
if i + 1 < len(self.__hiddenDimensions)
else self.__outputDimensions[0]
def forward(self,X):
out = self.__layers[0].forward(X)
for i in range(len(self.__layers) - 1):
out = self.__layers[i+1].forward(out)
return out
def train(self,X,Y,loss,epoch=5000000):
for i in range(epoch):
YHat = self.forward(X)
delta = -(Y-YHat)
err = np.sum([-1].localGrad,delta.T), axis=1)
err.shape = (self.__hiddenDimensions[-1][0],1)
for l in reversed(self.__layers[:-1]):
err =, err)
i += 1
def printLayers(self):
for l in self.__layers:
def main(args):
X = np.array([[x,y] for x,y in product([0,1],repeat=2)])
Y = np.array([[0],[1],[1],[1]])
nn = NeuralNetwork(
"inputDimensions": (4,2),
"outputDimensions": (1,1),
print("expected output:\n\n",Y,"\n")
print("prior to training:\n\n",nn.forward(X), "\n")
loss = []
print("post training:\n\n",nn.forward(X), "\n")
fig,ax = plt.subplots()
x = np.array([x for x in range(5000000)])
loss = np.array(loss)
ax.set(xlabel="epoch",ylabel="loss",title="logic gate training")
Could someone please point out what I'm doing wrong here, I strongly suspect it has to do with the way I'm dealing with matrices but at the same time I don't have the slightest idea what's going on.
Thanks for taking the time to read my question, and taking the time to respond (if relevant).
Actually, quite a lot is wrong with this, but I'm still a bit confused about how to fix it. Although the loss graph looks like it's training, and it kind of is, the math I've done above is wrong.
Look at the training function.
def train(self,X,Y,loss,epoch=5000000):
for i in range(epoch):
YHat = self.forward(X)
delta = -(Y-YHat)
err = np.sum([-1].localGrad,delta.T), axis=1)
err.shape = (self.__hiddenDimensions[-1][0],1)
for l in reversed(self.__layers[:-1]):
err =, err)
i += 1
Note how I get delta = -(Y-Yhat) and then dot product it with the "local gradient" of the last layer. The "local gradient" is the local W gradient.
def forward(self,X):
a =, self.__weights) + self.__biases
self.localGrad =,self.__sigmoidPrime(a))
return self.__sigmoid(a)
I'm skipping a step in the chain rule. I should really be multiplying by W* sigprime(XW + b) first as that's the local gradient of X, then by the local W gradient. I tried that, but I'm still getting issues, here is the new forward method (note the __init__ for layers needs to be initialised for the new vars, and I changed the activation function to tanh)
def forward(self, X):
a =, self.__weights) + self.__biases
self.localPartialGrad = self.__tanhPrime(a)
self.localWGrad =, self.localPartialGrad)
self.localXGrad =,self.__weights.T)
return self.__tanh(a)
and updated the training method to look something like this:
def train(self, X, Y, loss, epoch=5000):
for e in range(epoch):
Yhat = self.forward(X)
err = -(Y-Yhat)
for l in self.__layers[::-1]:
if(l != self.__layers[0]):
err = np.multiply(err,l.localPartialGrad)
err = np.multiply(err,l.localXGrad)
The new graphs I'm getting are all over the place, I have no idea what's going on. Here is the final bit of code I changed:
def adjustWeights(self, err):
perr = np.multiply(err, self.localPartialGrad)
werr = np.sum(,perr.T),axis=1)
werr = werr * self.__epsilon
werr.shape = (self.__weights.shape[0],1)
self.__weights = self.__weights - werr
Your network is learning, as can be seen from the loss chart, so backprop implementation is correct (congrats!). The main problem with this particular architecture is the choice of the activation function: sigmoid. I have replaced sigmoid with tanh and it works much better instantly.
From this discussion on CV.SE:
There are two reasons for that choice (assuming you have normalized
your data, and this is very important):
Having stronger gradients: since data is centered around 0, the
derivatives are higher. To see this, calculate the derivative of the
tanh function and notice that input values are in the range [0,1]. The
range of the tanh function is [-1,1] and that of the sigmoid function
is [0,1]
Avoiding bias in the gradients. This is explained very well in the
paper, and it is worth reading it to understand these issues.
Though I'm sure sigmoid-based NN can be trained as well, looks like it's much more sensitive to input values (note that they are not zero-centered), because the activation itself is not zero-centered. tanh is better than sigmoid by all means, so a simpler approach is just use that activation function.
The key change is this:
def __tanh(self, z):
    """Hyperbolic-tangent activation, applied element-wise to *z*."""
    return np.tanh(z)
def __tanhPrime(self, a):
    """Derivative of tanh evaluated at *a*: ``1 - tanh(a) ** 2``."""
    return 1 - self.__tanh(a) ** 2
... instead of __sigmoid and __sigmoidPrime.
I have also tuned hyperparameters a little bit, so that the network now learns in 100k epochs, instead of 5m:
prior to training:
[[ 0. ]
post training:
[[0. ]
A complete code is in this gist.
Well I'm an idiot. I was right about being wrong but I was wrong about how wrong I was. Let me explain.
Within the backward training method I got the last layer trained correctly, but the layers processed after it in the backward pass weren't trained correctly. That is why the above network was coming up with a result: it was indeed training, but only one layer.
So what did I do wrong? I was only multiplying by the local gradient of the output with respect to the weights, so the chain rule was only partially applied.
Let's say the loss function was this:
t = Y-X2
loss = 1/2*(t)^2
a2 = X1W2 + b
X2 = activation(a2)
a1 = X0W1 + b
X1 = activation(a1)
We know that the derivative of loss with respect to W2 would be -(Y-X2)*X1. This was done in the first part of my training function:
def train(self,X,Y,loss,epoch=5000000):
for i in range(epoch):
#First part
YHat = self.forward(X)
delta = -(Y-YHat)
err = np.sum([-1].localGrad,delta.T), axis=1)
err.shape = (self.__hiddenDimensions[-1][0],1)
#Second part
for l in reversed(self.__layers[:-1]):
err =, err)
i += 1
However the second part is where I screwed up. In order to calculate the loss with respect to W1, I must multiply the original error -(Y-X2) by W2 as W2 is the local X Gradient of the last layer, and due to the chain rule this must be done first. Then I could multiply by the local W gradient (X1) to get the loss with respect to W1. I failed to do the multiplication of the local X gradient first, so the last layer was indeed training, but all layers after that had an error that magnified as the layer increased.
To solve this I updated the train method:
def train(self,X,Y,loss,epoch=10000):
for i in range(epoch):
YHat = self.forward(X)
err = -(Y-YHat)
werr = np.sum([-1].localWGrad,err.T), axis=1)
werr.shape = (self.__hiddenDimensions[-1][0],1)
for l in reversed(self.__layers[:-1]):
err = np.multiply(err, l.localXGrad)
werr = np.sum(,err.T),axis=1)
Now the loss graph I got looks like this:

Inverting Gradients in Keras

I'm trying to port the BoundingLayer function from this file to the agent in keras-rl but I'm having some trouble with the implementation.
I modified the get_gradients(loss, params) method in to add this:
# [lower, upper] bound applied to every parameter value.
action_bounds = [-30, 50]
inverted_grads = []
for g, p in zip(modified_grads, params):
    # Element-wise masks: where the parameter is out of bounds and which
    # sign the gradient has.
    is_above_upper_bound = K.greater(p, K.constant(action_bounds[1], dtype='float32'))
    is_under_lower_bound = K.less(p, K.constant(action_bounds[0], dtype='float32'))
    is_gradient_positive = K.greater(g, K.constant(0, dtype='float32'))
    is_gradient_negative = K.less(g, K.constant(0, dtype='float32'))
    invert_gradient = tf.logical_or(
        tf.logical_and(is_above_upper_bound, is_gradient_negative),
        tf.logical_and(is_under_lower_bound, is_gradient_positive)
    )
    # NOTE: invert_gradient has the same shape as g, not a scalar; this
    # K.switch call is the one that raises the 'cond/Switch' rank error
    # quoted below.
    inverted_grads.extend(K.switch(invert_gradient, -g, g))
modified_grads = inverted_grads[:]
But I get an error about the shape:
ValueError: Shape must be rank 0 but is rank 2 for 'cond/Switch' (op: 'Switch') with input shapes: [2,400], [2,400].
keras-rl "get_gradients" function uses gradients calculated with a combined actor-critic model, but you need the gradient of the critic output wrt the action input to apply the inverting gradients feature.
I've recently implemented it on a RDPG prototype I'm working on, using keras-rl. Still testing, the code can be optimized and is not bug free for sure, but I've put the inverting gradient to work by modifying some keras-rl lines of code. In order to modify the gradient of the critic output wrt the action input, I've followed the original formula to compute the actor gradient, with the help of this great post from Patrick Emami:
I'm putting here the entire "compile" function, redefined in a class that inherits from "DDPAgent", where the inverting gradient feature is implemented.
def compile(self, optimizer, metrics=[]):
    """Compile actor, critic and target networks and build the actor update.

    The actor is trained with the gradient of the critic output w.r.t. its
    action input, rescaled by the "inverting gradients" rule so that updates
    shrink as an action approaches the bound it is being pushed towards.

    optimizer: a single optimizer (cloned for the critic) or a list/tuple of
        exactly two optimizers, actor first and critic second.
    metrics: a list of metrics; ``mean_q`` is always added.

    Raises ValueError if a list/tuple of optimizers does not have length 2.
    """
    # Build a new list: ``metrics += [mean_q]`` would grow the caller's list
    # and the shared default argument on every call.
    metrics = list(metrics) + [mean_q]

    if type(optimizer) in (list, tuple):
        if len(optimizer) != 2:
            raise ValueError('More than two optimizers provided. Please only provide a maximum of two optimizers, the first one for the actor and the second one for the critic.')
        actor_optimizer, critic_optimizer = optimizer
    else:
        actor_optimizer = optimizer
        critic_optimizer = clone_optimizer(optimizer)
    if type(actor_optimizer) is str:
        actor_optimizer = optimizers.get(actor_optimizer)
    if type(critic_optimizer) is str:
        critic_optimizer = optimizers.get(critic_optimizer)
    assert actor_optimizer != critic_optimizer

    if len(metrics) == 2 and hasattr(metrics[0], '__len__') and hasattr(metrics[1], '__len__'):
        actor_metrics, critic_metrics = metrics
    else:
        actor_metrics = critic_metrics = metrics

    def clipped_error(y_true, y_pred):
        return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

    # Compile target networks. We only use them in feed-forward mode, hence we can pass any
    # optimizer and loss since we never use it anyway.
    self.target_actor = clone_model(self.actor, self.custom_model_objects)
    self.target_actor.compile(optimizer='sgd', loss='mse')
    self.target_critic = clone_model(self.critic, self.custom_model_objects)
    self.target_critic.compile(optimizer='sgd', loss='mse')

    # We also compile the actor. We never optimize the actor using Keras but instead compute
    # the policy gradient ourselves. However, we need the actor in feed-forward mode, hence
    # we also compile it with any optimizer and loss.
    self.actor.compile(optimizer='sgd', loss='mse')

    # Compile the critic.
    if self.target_model_update < 1.:
        # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
        critic_updates = get_soft_target_model_updates(self.target_critic, self.critic, self.target_model_update)
        critic_optimizer = AdditionalUpdatesOptimizer(critic_optimizer, critic_updates)
    self.critic.compile(optimizer=critic_optimizer, loss=clipped_error, metrics=critic_metrics)

    clipnorm = getattr(actor_optimizer, 'clipnorm', 0.)
    clipvalue = getattr(actor_optimizer, 'clipvalue', 0.)

    critic_gradients_wrt_action_input = tf.gradients(self.critic.output, self.critic_action_input)
    critic_gradients_wrt_action_input = [g / float(self.batch_size) for g in critic_gradients_wrt_action_input]  # since TF sums over the batch

    # One (lower, upper) pair per action dimension.
    action_bounds = [(-1., 1.) for i in range(self.nb_actions)]

    def calculate_inverted_gradient():
        """
        Applies "inverting gradient" feature to the action-value gradients.
        """
        # Negated because the optimizer minimises while the actor should
        # maximise the critic output.
        gradient_wrt_action = -critic_gradients_wrt_action_input[0]
        inverted_gradients = []
        for n in range(self.batch_size):
            inverted_gradient = []
            for i in range(gradient_wrt_action[n].shape[0].value):
                action = self.critic_action_input[n][i]
                is_gradient_negative = K.less(gradient_wrt_action[n][i], K.constant(0, dtype='float32'))
                # Scale by the remaining distance to the bound the update
                # moves towards, as a fraction of the full action range.
                adjust_for_upper_bound = gradient_wrt_action[n][i] * ((action_bounds[i][1] - action) / (action_bounds[i][1] - action_bounds[i][0]))
                adjust_for_lower_bound = gradient_wrt_action[n][i] * ((action - action_bounds[i][0]) / (action_bounds[i][1] - action_bounds[i][0]))
                modified_gradient = K.switch(is_gradient_negative, adjust_for_upper_bound, adjust_for_lower_bound)
                inverted_gradient.append(modified_gradient)
            inverted_gradients.append(inverted_gradient)
        gradient_wrt_action = tf.stack(inverted_gradients)
        return gradient_wrt_action

    actor_gradients_wrt_weights = tf.gradients(self.actor.output, self.actor.trainable_weights, grad_ys=calculate_inverted_gradient())
    actor_gradients_wrt_weights = [g / float(self.batch_size) for g in actor_gradients_wrt_weights]  # since TF sums over the batch

    def get_gradients(loss, params):
        """ Used by the actor optimizer.
            Returns the gradients to train the actor.
            These gradients are obtained by multiplying the gradients of the actor output w.r.t. its weights
            with the gradients of the critic output w.r.t. its action input. """

        # Apply clipping if defined
        modified_grads = [g for g in actor_gradients_wrt_weights]

        if clipnorm > 0.:
            norm = K.sqrt(sum([K.sum(K.square(g)) for g in modified_grads]))
            modified_grads = [optimizers.clip_norm(g, clipnorm, norm) for g in modified_grads]
        if clipvalue > 0.:
            modified_grads = [K.clip(g, -clipvalue, clipvalue) for g in modified_grads]

        return modified_grads

    # Replace the optimizer's gradient computation with the precomputed
    # actor gradients; ``loss`` and ``params`` are ignored by it.
    actor_optimizer.get_gradients = get_gradients

    # get_updates is the optimizer function that changes the weights of the network
    updates = actor_optimizer.get_updates(self.actor.trainable_weights, self.actor.constraints, None)
    if self.target_model_update < 1.:
        # Include soft target model updates.
        updates += get_soft_target_model_updates(self.target_actor, self.actor, self.target_model_update)
    updates += self.actor.updates  # include other updates of the actor, e.g. for BN

    # Finally, combine it all into a callable function.
    # The inputs will be all the necessary placeholders to compute the gradients (actor and critic inputs)
    inputs = self.actor.inputs[:] + [self.critic_action_input, self.critic_history_input]
    self.actor_train_fn = K.function(inputs, [self.actor.output], updates=updates)

    self.actor_optimizer = actor_optimizer

    self.compiled = True
When training the actor, you should now pass 3 inputs instead of 2: the observation inputs + the action input (with a prediction from the actor network), so you must also modify the "backward" function. In my case:
if self.episode > self.nb_steps_warmup_actor:
    # The action placeholder is fed with the actor's own prediction.
    # NOTE(review): the right-hand side was lost in extraction;
    # predict_on_batch on the history batch is the assumed call -- confirm.
    action = self.actor.predict_on_batch(history_batch)
    # Order matches the placeholders given to K.function in compile():
    # actor inputs, critic action input, critic history input.
    inputs = [history_batch, action, history_batch]
    actor_train_result = self.actor_train_fn(inputs)
    action_values = actor_train_result[0]
    assert action_values.shape == (self.batch_size, self.nb_actions)
After that you can have your actor with a linear activation in the output.

Trying to write my own Neural Network in Python

Last semester I took an online machine learning course from Stanford taught by Professor Ng. I thought it was pretty informative. To brush up on and better understand neural networks, I tried to write my own in Python. Here it is:
import numpy
class NN:
def __init__(self, sl):
#sl = number of units (not counting bias unit) in layer l = sl
self.layers = len(sl)
#Create weights
self.weights = []
for idx in range(1, self.layers):
self.cost = []
def update(self, input):
if input.shape[1] !=[0]:
raise ValueError, 'The first layer must have a node for every feature'
self.z = []
self.a = []
#Input activations. I'm expecting inputs as numpy matrix (Examples x Featrues)
self.a.append(numpy.hstack((numpy.ones((input.shape[0], 1)), input)))#Set inputs ai + bias unit
#Hidden activations
for weight in self.weights:
self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)), numpy.tanh(self.z[-1])))) #tanh is a fancy sigmoid
#Output activation
self.a[-1] = self.z[-1] #Not logistic regression thus no sigmoid function
del self.z[-1]
def backPropagate(self, targets, lamda):
m = float(targets.shape[0]) #m is number of examples
#Calculate cost
Cost = -1/m*sum(numpy.power(self.a[-1] - targets, 2))
for weight in self.weights:
Cost = Cost + lamda/(2*m)*numpy.power(weight[1:, :], 2).sum()
#Calculate error for each layer
delta = []
delta.append(self.a[-1] - targets)
for idx in range(1, self.layers-1): #No delta for the input layer because it is the input
weight = self.weights[-idx][1:, :] #Ignore bias unit
dsigmoid = numpy.multiply(self.a[-(idx+1)][:,1:], 1-self.a[-(idx+1)][:,1:]) #dsigmoid is a(l).*(1-a(l))
delta.append(numpy.multiply(delta[-1]*weight.T, dsigmoid)) #Ignore Regularization
Delta = []
for idx in range(self.layers-1):
self.weight_gradient = []
for idx in range(len(Delta)):
self.weight_gradient.append(numpy.nan_to_num(1/m*Delta[idx] + numpy.vstack((numpy.zeros((1, self.weights[idx].shape[1])), lamda/m*self.weights[idx][1:, :]))))
def train(self, input, targets, alpha, lamda, iterations = 1000):
#alpha: learning rate
#lamda: regularization term
for i in range(iterations):
self.backPropagate(targets, lamda)
self.weights = [self.weights[idx] - alpha*self.weight_gradient[idx] for idx in range(len(self.weights))]
def predict(self, input):
return self.a[-1]
But it doesn't work =(. Inspecting the cost vs. iteration I can see a blip in the cost and the prediction for A is all the same. Can someone help me understand why my neural network is not converging?
Sorry about the amount of code (maybe someone will find it useful).
Instead of using random data I've got some structured data from the UCI Machine Learning Repository. The particular data set is the burned area of forest fires, in the northeast region of Portugal, using meteorological and other data: I modified the data so that days and months were numbers:
data = numpy.loadtxt(open('FF-data.csv', 'rb'), delimiter = ',', skiprows = 1)
features = data[:,0:11]
targets = numpy.matrix(data[:,12]).T
nfeatures = (features-features.mean(axis=0))/features.std(axis=0)
n = NN([11, 10, 1]) #The class takes the list of how many nodes in each layer
n.train(nfeatures, targets, 0.003, 0.0)
import matplotlib.pyplot
matplotlib.pyplot.title('Cost vs. Iteration')
matplotlib.pyplot.scatter(n.predict(nfeatures), targets)
matplotlib.pyplot.title('Data vs. Predicted')
matplotlib.pyplot.savefig('Report.png', format = 'png')
Why does the cost bottom out around 4000 and why does the Data Vs. Predicted not have any trend? You can see the graphs here:
(Sorry, I don't have enough rep to add comments, so I'll just keep posting answers instead.)
Yes, it does seem strange. If, however, after training you generate a new matrix B:
B = numpy.random.rand(5, 4)/5
Targets = B*X
print n.predict(B)
print B*X
it will work fine (most of the times - sometimes it will still give the average(Targets) as the answer).
Note: I switched from using 100 features to using just 4 in my example.
Also, I don't think that running 5000 iterations on 50 elements of the data set will do you any good. You should generally try to use as much training data as you can - and here you can use as much as you want, but you use even fewer examples than you have features.
This is fun, I'll think about it some more :) I was using your network for a simpler example - as Input I provided two numbers, and expected their sum as Output. It worked more or less okay.
The neural network was unable to train on the Forest Fire data for a few reasons.
First the numpy.tanh() sigmoid function is not behaving as expected. The code should be changed from:
self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)),numpy.tanh(self.z[-1])))) #tanh is a fancy sigmoid
self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)), 1/(1+numpy.exp(-self.z[-1])))))
Second numpy and matplotlib are not playing nice. The numpy matrices seem to be plotted backwards. This can be fixed by using matrix.tolist(). Code changed from:
matplotlib.pyplot.scatter(n.predict(nfeatures), targets)
matplotlib.pyplot.scatter(n.predict(nfeatures).tolist(), targets.tolist())
Finally the number of nodes should be approximately 10% of the example size. Instead of 10 it is better to use 50 nodes.
The working neural network code is posted below with a new function autoparam which tries to find the best learning rate and regularization constant. You can see the graphs for the Forest Fire cost vs iteration and data vs predicted here:
Thanks for reading! I hope my neural network can help people.
import numpy
class NN:
    """Fully connected feed-forward network for regression.

    Hidden layers use the logistic sigmoid and the output layer is linear.
    Training is plain batch gradient descent on a regularized squared-error
    cost. Every weight matrix has one extra leading row for the bias unit.

    Attributes set by the methods below:
        sl, layers, weights, cost      -- set in ``__init__``
        z, a                           -- set by ``update``
        weight_gradient                -- set by ``backPropagate``
        learning_rate, regularization  -- set by ``autoparam``
    """

    def __init__(self, sl):
        """Create randomly initialised weights.

        sl: sequence with the number of units (not counting the bias unit)
            in each layer, input layer first.
        """
        self.sl = sl
        self.layers = len(sl)

        # One (units_in + 1) x units_out matrix per pair of adjacent layers;
        # the "+ 1" row holds the bias weights.
        self.weights = []
        for idx in range(1, self.layers):
            rows = sl[idx - 1] + 1
            cols = sl[idx]
            # Small positive random values; the 1/5 scale is a heuristic.
            self.weights.append(numpy.matrix(numpy.random.rand(rows, cols) / 5))

        # Cost recorded at every call to backPropagate.
        self.cost = []

    @staticmethod
    def _with_bias(values):
        """Return ``values`` with a leading column of ones (the bias unit)."""
        return numpy.hstack((numpy.ones((values.shape[0], 1)), values))

    @staticmethod
    def _standardize(values):
        """Return ``values`` with each column shifted to zero mean, unit std."""
        return (values - values.mean(axis=0)) / values.std(axis=0)

    def update(self, input):
        """Run the forward pass and store the result in ``self.z``/``self.a``.

        input: 2-D numpy array or matrix of shape (examples, features).

        After the call ``self.a[0]`` is the input with a bias column,
        ``self.a[1:-1]`` are the hidden activations (each with a bias
        column), ``self.a[-1]`` is the linear output, and ``self.z`` holds
        the pre-activation values of the hidden layers only.

        Raises ValueError if the feature count differs from ``sl[0]``.
        """
        if input.shape[1] != self.sl[0]:
            raise ValueError('The first layer must have a node for every feature')

        self.z = []
        self.a = [self._with_bias(input)]

        last = len(self.weights) - 1
        for idx, weight in enumerate(self.weights):
            z = numpy.dot(self.a[-1], weight)
            if idx == last:
                # Regression output: no squashing and no bias column.
                self.a.append(z)
            else:
                self.z.append(z)
                self.a.append(self._with_bias(1 / (1 + numpy.exp(-z))))  # sigmoid

    def backPropagate(self, targets, lamda):
        """Compute the cost and weight gradients for the last forward pass.

        ``update`` must have been called first.

        targets: column matrix of shape (examples, outputs).
        lamda:   regularization strength; bias weights are not regularized.

        Appends the cost to ``self.cost`` and stores one gradient per weight
        matrix in ``self.weight_gradient``.
        """
        m = float(targets.shape[0])  # number of examples

        # Squared-error cost whose derivative w.r.t. the output is
        # (a - targets) / m, which is what the deltas below assume.
        error = self.a[-1] - targets
        cost = numpy.power(error, 2).sum() / (2 * m)
        for weight in self.weights:
            cost += lamda / (2 * m) * numpy.power(weight[1:, :], 2).sum()
        self.cost.append(float(cost))

        # Layer errors, stored output layer first. There is no delta for the
        # input layer.
        delta = [error]
        for idx in range(1, self.layers - 1):
            weight = self.weights[-idx][1:, :]  # ignore the bias row
            hidden = self.a[-(idx + 1)][:, 1:]  # ignore the bias column
            dsigmoid = numpy.multiply(hidden, 1 - hidden)
            delta.append(numpy.multiply(numpy.dot(delta[-1], weight.T), dsigmoid))

        # Accumulated gradient per weight matrix, input side first; the
        # delta list is indexed from the end because it is in reverse order.
        Delta = [numpy.dot(self.a[idx].T, delta[-(idx + 1)])
                 for idx in range(self.layers - 1)]

        self.weight_gradient = []
        for idx, accumulated in enumerate(Delta):
            weight = self.weights[idx]
            # Zero row on top so the bias weights get no regularization.
            penalty = numpy.vstack((numpy.zeros((1, weight.shape[1])),
                                    lamda / m * weight[1:, :]))
            self.weight_gradient.append(numpy.nan_to_num(1 / m * accumulated + penalty))

    def train(self, input, targets, alpha, lamda, iterations = 1000):
        """Run ``iterations`` steps of batch gradient descent.

        alpha: learning rate
        lamda: regularization term
        """
        for _ in range(iterations):
            self.update(input)
            self.backPropagate(targets, lamda)
            self.weights = [weight - alpha * gradient
                            for weight, gradient in zip(self.weights, self.weight_gradient)]

    def autoparam(self, data, alpha = (0.001, 0.003, 0.01, 0.03, 0.1, 0.3), lamda = (0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10)):
        """Grid-search the learning rate and regularization constant.

        data:  numpy matrix with targets in last column; shuffled in place
        alpha: candidate learning rates
        lamda: candidate regularization terms

        The rows are split 60/20/20 into training, cross-validation and test
        sets, each standardized with its own column statistics. The pair
        with the lowest cross-validation cost is stored in
        ``self.learning_rate`` and ``self.regularization``.

        NOTE: the weights are not re-initialised between candidates, so each
        training run continues from the previous one.

        Raises ValueError if ``data`` has fewer than 10 rows or if no shuffle
        yields subsets without a constant feature column.
        """
        tenth = data.shape[0] // 10
        if tenth == 0:
            raise ValueError('autoparam needs at least 10 rows of data')
        train_end = tenth * 6
        cv_end = tenth * 8

        # A constant column has zero std, so standardizing it is 0/0. Turn
        # that into an exception and reshuffle until every subset is usable.
        max_attempts = 100
        for _ in range(max_attempts):
            numpy.random.shuffle(data)
            training_set = data[0:train_end, 0:-1]
            cv_set = data[train_end:cv_end, 0:-1]
            test_set = data[cv_end:, 0:-1]
            try:
                with numpy.errstate(invalid = 'raise'):
                    ntraining_set = self._standardize(training_set)
                    ncv_set = self._standardize(cv_set)
                    ntest_set = self._standardize(test_set)
            except FloatingPointError:
                continue
            break
        else:
            raise ValueError('could not split data without a constant feature column')

        self.ntraining_set = ntraining_set
        self.training_tgt = numpy.matrix(data[0:train_end, -1]).T
        self.ncv_set = ncv_set
        self.cv_tgt = numpy.matrix(data[train_end:cv_end, -1]).T
        self.ntest_set = ntest_set
        self.test_tgt = numpy.matrix(data[cv_end:, -1]).T

        cost = 999999
        for i in alpha:
            for j in lamda:
                self.train(self.ntraining_set, self.training_tgt, i, j, 2000)
                residual = self.predict(self.ncv_set) - self.cv_tgt
                current_cost = float(numpy.square(residual).sum()) / float(cv_set.shape[0])
                print(current_cost)
                if current_cost < cost:
                    cost = current_cost
                    self.learning_rate = i
                    self.regularization = j

    def predict(self, input):
        """Return the network output for ``input`` (examples x features)."""
        self.update(input)
        return self.a[-1]
Loading data, Plotting, etc...
import matplotlib.pyplot

# Load the Forest Fire data. Passing the path lets loadtxt open and close
# the file itself instead of leaving an open handle behind.
data = numpy.loadtxt('FF-data.csv', delimiter = ',', skiprows = 1)
features = data[:, 0:11]
# Standardize every feature column to zero mean and unit variance.
nfeatures = (features - features.mean(axis=0)) / features.std(axis=0)
targets = numpy.matrix(data[:, 12]).T

# 11 inputs, one hidden layer of 50 units, 1 output.
n = NN([11, 50, 1])
n.train(nfeatures, targets, 0.07, 0.0, 2000)

# Left panel: training cost per iteration.
matplotlib.pyplot.subplot(1, 2, 1)
matplotlib.pyplot.title('Cost vs. Iteration')
matplotlib.pyplot.plot(n.cost)

# Right panel: predictions against targets; the red line marks a perfect fit.
# tolist() avoids handing numpy matrices to matplotlib directly.
matplotlib.pyplot.subplot(1, 2, 2)
matplotlib.pyplot.title('Data vs. Predicted')
matplotlib.pyplot.scatter(n.predict(nfeatures).tolist(), targets.tolist())
matplotlib.pyplot.plot(targets.tolist(), targets.tolist(), c = 'r')

matplotlib.pyplot.savefig('Report.png', format = 'png')
I think that your bias should be subtracted somewhere from the weighted inputs (or set to -1). From what I see in your code, the neurons add all the inputs, including the bias (which is set to +1).
