Related
Howdy!
Recently I have built my own library for neural networks.
Without convolutional layers it worked fine. However, now, that I have implemented convolutional layers, it doesn't improve (in comparison to the dense nn) at all, which is inacceptable for more complex tasks like for instance Pneumonia Detection.
For backprop, every layer updates it's own values and passes it's input gradient to the layer behind it. For forwardprop every layer just gives its output as input to the next layer.
In the example below the model is set up for pneumonia detection with 128px * 128px images.
The accuracy allways stays under 60% no matter how much it's trained.
Here is the relevant code:
model = nG.NeuralNetwork([
nG.Convolutional(1, (128, 128), 3, 8),
nG.Pooling(2),
nG.ReLU(),
nG.Convolutional(8, (63, 63), 3, 16),
nG.Pooling(2),
nG.ReLU(),
nG.Convolutional(16, (31, 31), 3, 16),
nG.Pooling(2),
nG.ReLU(),
nG.Convolutional(16, (15, 15), 3, 32),
nG.Pooling(2),
nG.ReLU(),
nG.Convolutional(32, (7, 7), 3, 64),
nG.ReLU(),
nG.Reshape((64, 5, 5), (1600, 1)),
nG.Dense(1600, 128),
nG.Tanh(),
nG.Dense(128, 2),
nG.Tanh()],
nG.MSE()
)
model.train(images, labels, epochs=3, lr=0.01)
class NeuralNetwork:
def __init__(self, layers, loss):
self.layers = layers
self.loss = loss
def forwardProp(self, input):
output = input
for layer in self.layers:
output = layer.forwardProp(output)
return output
def backwardProp(self, errorDeriv, lr):
deltaOutput = errorDeriv
for layer in reversed(self.layers):
deltaOutput = layer.backwardProp(deltaOutput, lr)
def train(self, xTrain, yTrain, epochs=1, lr=1, interimResult=False):
corrects = 0
print("Precompiling ... This might take a few seconds", end="\n\n")
for epoch in range(epochs):
print(f"{epoch+1}th epoch:")
round_start = time.time()
i = -1
for X, Y in zip(xTrain, yTrain):
i += 1
start = time.time()
output = self.forwardProp(X)
errorDeriv = self.loss.errorDerivative(output, Y)
self.backwardProp(errorDeriv, lr)
--
class Convolutional():
def __init__(self, input_depth, input_size, kernel_size, depth):
self.input_depth = input_depth
self.input_size = input_size
self.kernel_size = kernel_size
self.depth = depth
self.kernels = np.random.uniform(-0.5, 0.5, (depth, input_depth, kernel_size, kernel_size))
self.bias = [np.random.uniform(-0.5, 0.5, (input_size[0] - kernel_size + 1, input_size[1] - kernel_size + 1)) for i in range(depth)]
self.input = None
def forwardProp(self, input):
self.input = input
output = get_output(input, self.depth, self.input_size, self.kernel_size, self.input_depth, self.kernels, self.bias)
return output
def backwardProp(self, output_delta, lr):
kernels_gradient, input_delta = get_gradients(self.kernels, self.input, self.depth, self.input_depth, output_delta)
self.kernels -= lr * kernels_gradient
self.bias -= lr * output_delta
return input_delta
#numba.njit
def get_gradients(kernels, input, depth, input_depth, output_delta):
kernels_gradient = np.zeros(kernels.shape)
input_delta = np.zeros(input.shape)
for i in range(depth):
for j in range(input_depth):
kernels_gradient[i, j] = valid_correlate(input[j], output_delta[i])
input_delta[j] += full_convolve(output_delta[i], kernels[i, j])
return kernels_gradient, input_delta
#numba.njit(fastmath=True, nogil=True)
def get_output(input, depth, input_size, kernel_size, input_depth, kernels, bias):
out = np.zeros((depth, input_size[0] - kernel_size + 1, input_size[0] - kernel_size + 1))
for k in range(depth):
for i in range(input_depth):
out[k] += valid_correlate(input[i], kernels[k][i])
out[k] += bias[k]
return out
class Pooling:
def __init__(self, size):
self.size = size
self.input = None
def forwardProp(self, input):
self.input = input
output = []
for i in range(input.shape[0]):
output.append(pool(input[i], self.size))
output = np.asarray(output)
return output
def backwardProp(self, output_delta, lr):
input_delta = anti_pool(output_delta, self.input.shape, self.size, self.input)
return input_delta
def anti_pool(output_delta, input_shape, size, input):
input_delta = np.zeros(input_shape)
for l in range(input_delta.shape[0]):
for x in range(output_delta.shape[1]):
for y in range(output_delta.shape[2]):
area_start = (x * size, y * size)
area_end = (min((x + 1) * size, input_delta.shape[1]),
min((y + 1) * size, input_delta.shape[2]))
area = (input[l, area_start[0]:area_end[0], area_start[1]:area_end[1]])
highest_pos = np.unravel_index(area.argmax(), area.shape)
highest_pos = [x * size + highest_pos[0],
y * size + highest_pos[1]]
input_delta[l, highest_pos[0], highest_pos[1]] = output_delta[l, x, y]
return input_delta
#numba.njit("float64[:,:](float64[:,:], int32)")
def pool(mat, size):
def pool_at_position(mat, pos):
end_pos = (min(mat.shape[0], pos[0] + size),
min(mat.shape[1], pos[1] + size))
area = mat[pos[0]:end_pos[0], pos[1]:end_pos[1]]
result = np.max(area)
return result
output_size = (int(np.ceil(mat.shape[0] / size)), int(np.ceil(mat.shape[1] / size)))
output = np.zeros(output_size)
for x in range(output_size[0]):
for y in range(output_size[1]):
output[x, y] = pool_at_position(mat, (x * size, y * size))
return output
class Dense:
def __init__(self, inputSize, outputSize):
self.weights = np.random.randn(outputSize, inputSize)
self.bias = np.random.randn(outputSize, 1)
def forwardProp(self, input):
self.input = input
return np.dot(self.weights, self.input) + self.bias
def backwardProp(self, output_gradient, lr):
weights_gradient = np.dot(output_gradient, self.input.T)
input_gradient = np.dot(self.weights.T, output_gradient)
self.weights -= lr * weights_gradient
self.bias -= lr * output_gradient
return input_gradient
class Tanh:
def __init__(self):
self.input = None
self.output = None
def forwardProp(self, input):
self.input = input
self.output = tanh(input)
return self.output
def backwardProp(self, outputDelta, lr):
inputDelta = 1 - (np.tanh(self.input) ** 2)
inputDelta *= outputDelta
return inputDelta
#numba.vectorize
def tanh(x):
return np.tanh(x)
class ReLU:
def __init__(self):
self.input = None
self.output = None
def forwardProp(self, input):
self.input = input
self.output = np.maximum(input, 0)
return self.output
def backwardProp(self, outputDelta, lr):
inputDelta = np.multiply(outputDelta, np.vectorize(self.anti_relu)(self.input))
return inputDelta
def anti_relu(self, x):
if x < 0:
return 0
else:
return 1
class MSE :
def __init__(self):
pass
def errorFunction(self, output, Y):
error = (output - Y) ** 2
return error
def errorDerivative(self, output, Y):
error_deriv = 2 * (output - Y)
return error_deriv
For the functions/Classes that I've not included I'm dead sure that they work.
I spent the last couple of days reading over the code and still haven't found the problem yet.
I would be extremely thankful for any kind of response.
Kind Regards
Eirik
this is fragment of my code
def train(self, features, targets):
for X, y in zip(features, targets):
X = X.reshape(1, X.shape[0])
outputs = self.feed_forward(X)
when I try to use the method with data:
train(np.array([gameDataList[n].ball_position, gameDataList[n].wall_position]), np.array(gameDataList[n].upOrDown))
where gameDataList[n].upOrDown is an array e.g. [0.1, 0.9], and gameDataList[n].ball_position and gameDataList[n].wall_position are floats, I get this error.
Full code:
#### Imports ####
import numpy as np
#### Neural Network Class ####
class MLP:
##### Constructor ####
def __init__(self, n_input_nodes, hidden_nodes, n_output_nodes, lr):
## Network ##
self.n_input_nodes = n_input_nodes
self.n_output_nodes = n_output_nodes
self.nodes = hidden_nodes
self.nodes.insert(0, n_input_nodes)
self.nodes.append(n_output_nodes)
## Weights and Biases##
self.weights = []
self.biases = []
for i in range(1, len(self.nodes)):
self.weights.append(np.random.uniform(-1.0, 1.0, (self.nodes[i - 1], self.nodes[i])))
self.biases.append(np.random.uniform(-1.0, 1.0, (1, self.nodes[i])))
## Learning Rate ##
self.lr = lr
## Activation Functions ##
# Linear Activation
self.linear = lambda x: x
self.d_linear = lambda x: np.ones(x.shape)
# Relu Activation
def relu(x):
x[x < 0] = 0
return x
def d_relu(out):
out: x[x > 0] = 1
return out
self.relu = relu
self.d_relu = d_relu
# Sigmoid Activation
self.sigmoid = lambda x: 1 / (1 + np.exp(-x))
self.d_sigmoid = lambda out: out * (1 - out) # assumes out is tanh(x)
# Hyperbolic Tangent Activation
self.tanh = lambda x: np.tanh(x)
self.d_tanh = lambda out: 1 - out ** 2 # assumes out is tanh(x)
def getWeights(self):
return self.weights.copy()
def getBiases(self):
return self.biases.copy()
def setWeights(self, weights):
self.weights = weights.copy()
def setBiases(self, biases):
self.biases = biases.copy()
#### Feed Forward ####
def feed_forward(self, X):
outputs = [X]
logits = np.dot(X, self.weights[0]) + self.biases[0]
for i in range(1, len(self.nodes) - 1):
out = self.sigmoid(logits)
outputs.append(out)
logits = np.dot(out, self.weights[i]) + self.biases[i]
out = self.sigmoid(logits)
outputs.append(out)
return outputs
#### Backpropagation ####
def backpropagation(self, X, y, outputs):
weights_gradients = []
biases_gradients = []
d1 = y - outputs[-1]
d2 = self.d_sigmoid(outputs[-1])
error = d1 * d2
grad = outputs[-2].T * error
weights_gradients.append(grad)
biases_gradients.append(error)
for i in range(len(self.weights) - 2, 1, -1):
d = self.d_sigmoid(outputs[i])
error = np.dot(error, self.weights[i + 1].T) * d
grad = outputs[i - 1].T * error
weights_gradients.append(grad)
biases_gradients.append(error)
return weights_gradients, biases_gradients
#### Training ####
def train(self, features, targets):
# Batch Size for weight update step
batch_size = features.shape[0]
# Delta Weights Variables
delta_weights = [np.zeros(weight.shape) for weight in self.weights]
delta_biases = [np.zeros(bias.shape) for bias in self.biases]
# For every data point, forward pass, backpropogation, store weights change
for X, y in zip(features, targets):
# Forward pass
X = X.reshape(1, X.shape[0])
outputs = self.feed_forward(X)
# Back propogation
weights_gradients, biases_gradients = self.backpropagation(X, y, outputs)
for i in range(len(weights_gradients)):
delta_weights[-(i + 1)] += weights_gradients[i]
delta_biases[-(i + 1)] += biases_gradients[i]
for i in range(len(delta_weights)):
self.weights[i] += (self.lr * delta_weights[i]) / batch_size
self.biases[i] += (self.lr * delta_biases[i]) / batch_size
#### Testing Methods ####
def predict(self, X):
# Gives prediction
return self.feed_forward(X)[-1]
def test(self, features, targets):
predictions = self.predict(features)
n_correct = 0
for i in range(len(predictions)):
prediction = np.argmax(predictions[i])
correct = np.argmax(targets[i])
if prediction == correct:
n_correct += 1
return n_correct / len(targets)
class GameData:
def __init__(self, ball_position, wall_position, upOrDown):
self.wall_position = wall_position
self.ball_position = ball_position
self.upOrDown = upOrDown
I collect data, and train my network, in this way:
gameDataList.append(GameData(ball.trt.ycor(), b.trt.ycor(), [0.1, 0.9]))
mlp = MLP(2, [32, 32], 2, 0.0001)
n = random.randint(0, 999)
mlp.train(np.array([gameDataList[n].ball_position, gameDataList[n].wall_position]), np.array(gameDataList[n].upOrDown))
Problem solved. It was needed to write two square brackets instead of one.
wrong example:
np.array([gameDataList[n].ball_position, gameDataList[n].wall_position])
correct example:
np.array([[gameDataList[n].ball_position, gameDataList[n].wall_position]])
I am training simple variational autoencoder with negative binomial likelihood for decoder. I used python 3.7.1 and tensorflow 2.0.0.
The model was trained well without any problems for tens of epochs, but all weights, loss, and gradients suddenly became NaN during training. I modified the code to find which variables become NaN first (among weights, loss, and gradients) and found that gradients first became nan and this affected other variables.
I have googled similar issues but most of case nan appeared in the loss, which is different from this case. I tried: smaller learning rate, clipping loss... but nothing could resolve the problem.
Here is the model class for autoencoder model:
class Encoder(tf.keras.layers.Layer):
def __init__(self, hidden_dim, latent_dim):
super(Encoder, self).__init__()
self.encoder_fc_1 = tf.keras.layers.Dense(hidden_dim, activation=tf.nn.leaky_relu)
self.encoder_fc_2 = tf.keras.layers.Dense(hidden_dim, activation=tf.nn.leaky_relu)
self.encoder_latent = tf.keras.layers.Dense(latent_dim + latent_dim)
def call(self, input):
h = tf.math.l2_normalize(input, 1)
h = self.encoder_fc_1(h)
h = self.encoder_fc_2(h)
return tf.split(self.encoder_latent(h), num_or_size_splits=2, axis=1)
class Decoder(tf.keras.layers.Layer):
def __init__(self, hidden_dim, vocab_size):
super(Decoder, self).__init__()
self.decoder_fc_1 = tf.keras.layers.Dense(hidden_dim, activation=tf.nn.leaky_relu)
self.decoder_fc_2 = tf.keras.layers.Dense(hidden_dim, activation=tf.nn.leaky_relu)
self.decoder_fc_3 = tf.keras.layers.Dense(vocab_size + vocab_size)
def call(self, z):
h = self.decoder_fc_1(z)
h = self.decoder_fc_2(h)
return tf.split(self.decoder_fc_3(h), num_or_size_splits=2, axis=1)
class NBVAE(tf.keras.Model):
def __init__(self, config):
super(NBVAE, self).__init__()
self.optimizer = tf.keras.optimizers.Adam(config["learning_rate"])
self.encoder = Encoder(config["hidden_dim"], config["latent_dim"])
self.decoder = Decoder(config["hidden_dim"], config["vocab_size"])
def call(self, input):
mean, logvar = self.encoder(input)
z = reparameterize(mean, logvar)
h_r, h_p = self.decoder(z)
return mean, logvar, z, h_r, h_p
def reparameterize(mean, logvar):
eps = tf.random.normal(shape=mean.shape)
return tf.add(tf.multiply(eps, tf.math.exp( tf.math.divide(logvar, 2))), mean)
def log_normal_pdf(sample, mean, logvar, raxis=1):
log2pi = tf.math.log(2. * np.pi)
return tf.reduce_sum(-.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi), axis=raxis)
def compute_logpx_z(input, h_r, h_p):
temp = tf.exp(-tf.multiply(tf.exp(h_r), tf.math.log(tf.exp(h_p) + 1)))
temp_cliped = tf.clip_by_value(temp, 1e-5, 1 - 1e-5)
ll = tf.multiply(input, tf.math.log(1 - temp_cliped)) + tf.multiply(1 - input, tf.math.log(temp_cliped))
#print("logpx_z: {}".format(tf.reduce_sum(ll, axis=-1)))
return tf.reduce_sum(ll, axis=-1), temp
def compute_loss(model, input):
mean, logvar, z, h_r, h_p = model(input)
logpx_z, temp = compute_logpx_z(input, h_r, h_p)
logpz = log_normal_pdf(z, 0., 0.)
logqz_x = log_normal_pdf(z, mean, logvar)
return tf.negative(tf.reduce_mean(logpx_z + logpz - logqz_x)), temp
and here is the code snippet for training the model.
I put some if statements in the middle of the code to check which variable become NaN first.
print("start training...")
num_batches = int(np.ceil(len(training_data) / batch_size))
epoch_loss = []
for epoch in range(epochs):
print("epoch: {}".format(epoch+1))
progbar = tf.keras.utils.Progbar(num_batches)
loss_record = []
for i in range(num_batches):
x_batch = training_data[i*batch_size:(i+1)*batch_size]
x_batch = one_hot(x_batch, depth=len(concept2id))
with tf.GradientTape() as tape:
loss, temp = compute_loss(nbvae, x_batch)
print("step{s} loss: {l}".format(s=i, l=loss.numpy()))
# checking the loss
if np.isnan(loss.numpy()):
print("nan loss is detected")
detect_nan = True
break
loss_record.append(loss.numpy())
gradients = tape.gradient(loss, nbvae.trainable_variables)
#gradients, global_norm = tf.clip_by_global_norm(tape.gradient(loss, nbvae.trainable_variables), 10)
print("checking gradients...")
gradient_nancount = 0
for _, grad in enumerate(gradients):
gradient_nancount += np.sum(np.isnan(grad))
if gradient_nancount != 0:
print("nan is detected in gradients")
print("saving the current gradients and weights...")
save_data(os.path.join(output_path, "error_gradients.pkl"), gradients)
save_data(os.path.join(output_path, "error_tvariables.pkl"), nbvae.trainable_variables)
detect_nan = True
break
nbvae.optimizer.apply_gradients(zip(gradients, nbvae.trainable_variables))
print("checking the updated weights...")
weight_nancount = 0
for _, weight in enumerate(nbvae.weights):
weight_nancount += np.sum(np.isnan(weight))
if weight_nancount != 0:
print("nan is detected in weights")
print("saving the current gradients and weights...")
save_data(os.path.join(output_path, "error_gradients.pkl"), gradients)
save_data(os.path.join(output_path, "error_tvariables.pkl"), nbvae.trainable_variables)
detect_nan = True
break
progbar.add(1)
if detect_nan:
epoch_loss.append(np.nan)
nbvae.save_weights(os.path.join(output_path, "nbvae_error{}".format(epoch+1)))
break
print("average epoch loss: {}".format(np.mean(loss_record)))
epoch_loss.append(np.mean(loss_record))
Anyone knows the way to resolve this problem or possible reasons? Thank you for your time in advance.
I'm trying to implement a Neural Net in python without the use of libraries like Keras or Tensorflow. I still have to test the net, right now I just tried to train it on Iris dataset and check afterwards the correctness of the backpropagation algorithm.
To do so, I wrote the gradient checking procedure, calculating the analytical gradients and comparing them with the gradients from backpropagation.
The point is that, even if the backpropagation algorithm seems correct to me, the difference between the gradients is always high (around 0.8, instead of the classic 1e-7).
Layer class
class Dense(Layer):
def __init__(self, input_shape, name=None, activation='relu', regularization='l2'):
self.name = name
self.is_output = False
self.weights = np.random.uniform(low=0.01, high=0.10, size=input_shape)
self.biases = np.ones((1,input_shape[1]))
if activation == 'sigmoid':
self.activation = Activation_Sigmoid()
else: #activation == 'relu':
self.activation = Activation_ReLU()
self.cost = Categorical_CrossEntropyLoss()
def set_as_output(self, is_output=True):
self.is_output = is_output
def forward(self, inputs, debug=False, epsilon=None):
self.net_input = inputs
if debug:
augmented_parameters = np.zeros(epsilon.shape)
weights_column_vector = np.reshape(self.weights,(-1,1))
biases_column_vector = np.reshape(self.biases,(-1,1))
concatenated_parameters = np.concatenate((weights_column_vector, biases_column_vector))
for i in range(concatenated_parameters.shape[0]):
augmented_parameters[i] = concatenated_parameters[i]
# make the augmented parameter long as theta in order to sum them
# this because epsilon is a standard basis vector
augmented_parameters += epsilon
# rebuild the weights matrix and biases vector to apply forward propagation
weights_end = self.weights.shape[0] * self.weights.shape[1]
biases_end = self.biases.shape[0] * self.biases.shape[1] + weights_end
weights = np.reshape(augmented_parameters[0:weights_end],self.weights.shape)
biases = np.reshape(augmented_parameters[weights_end:biases_end], self.biases.shape)
output = np.dot(inputs, weights) + biases
activated_output = self.activation.forward(output)
return activated_output
self.output = np.dot(inputs, self.weights) + self.biases
self.activated_output = self.activation.forward(self.output)
return self.activated_output
def backward(self, X, y, output, step, l2=0.5): #backpropagation
m = X.shape[0] # number of examples
if self.is_output:
error = self.cost.backward(output, y) #(a_k - y_hat_k)
delta_k = self.activation.backward(self.output)* error
# net input for neuron k is a_j^(l-1)
grad = np.dot(self.net_input.T, delta_k)
#update weights with l2 regularization
self.grad_w = grad + (l2 / m)*self.weights
self.grad_b = np.sum(delta_k * 1,axis=0)
self.weights -= step * self.grad_w
self.biases -= step * self.grad_b
return np.dot(delta_k ,self.weights.T)
else:
delta_j = self.activation.backward(self.output) * output
grad = np.dot(self.net_input.T, delta_j)
self.grad_w = grad + (l2 / m) * self.weights
self.grad_b = np.sum(delta_j * 1, axis=0)
self.weights -= step * self.grad_w
self.biases -= step * self.grad_b
return np.dot(delta_j, self.weights.T)
def get_parameters(self):
return self.weights, self.biases
def get_gradients(self):
return self.grad_w, self.grad_b
Neural Net class
class NeuralNet():
def __init__(self):
self.layers = []
self.layers_output = []
self.cost = None
self.regularization = L2_Regularization()
def add(self,layer):
self.layers.append(layer)
def forward(self, inputs, debug=False, epsilon=None):
input = np.copy(inputs)
for layer in self.layers:
output = layer.forward(input, debug=debug, epsilon=epsilon)
input = output
return input
def backward(self, X, y, output, step):
prev_delta = None
out = output
for layer in self.layers[::-1]:
prev_delta = layer.backward(X, y, out, step)
out = prev_delta
def fit(self, X, y, batch_size=1, epochs=10, step=0.05, shuffle=True):
self.layers[-1].set_as_output()
self.error = []
i = 0.005 * epochs
for epoch in range(epochs):
if shuffle:
X = np.random.permutation(X)
batches = int(np.ceil(X.shape[0]/batch_size))
batches_error = []
for t in range(batches):
batch_X = X[t*batch_size:np.min([X.shape[0],(t+1)*batch_size]),:]
batch_y = y[t*batch_size:np.min([y.shape[0],(t+1)*batch_size]),:]
output = self.forward(batch_X)
cost = self.cost.forward(output,batch_y)
cost += self.regularization.forward(X, self.layers)
batches_error.append(cost)
self.backward(batch_X, batch_y, output, step)
self.error.append(np.mean(batches_error))
if epoch % i == 0:
print('epoch:', epoch, 'error:', np.mean(self.error))
return self
def parameters_to_theta(self):
theta = []
for layer in self.layers:
w, b = layer.get_parameters()
#flatten parameter w
new_vector = np.reshape(w, (-1,1))
theta.append(new_vector)
#flatten parameter b
new_vector = np.reshape(b, (-1,1))
theta.append(new_vector)
return np.vstack(theta)
def gradients_to_theta(self):
theta = []
for layer in self.layers:
grad_w, grad_b = layer.get_gradients()
new_vector = np.reshape(grad_w, (-1,1))
theta.append(new_vector)
new_vector = np.reshape(grad_b, (-1,1))
theta.append(new_vector)
return np.vstack(theta)
def gradient_check(self, X, y, epsilon=1e-7):
theta = self.parameters_to_theta()
dtheta = self.gradients_to_theta()
num_parameters = theta.shape[0]
J_plus = np.zeros((num_parameters, 1))
J_minus = np.zeros((num_parameters, 1))
dtheta_approx = np.zeros((num_parameters, 1))
for i in range(num_parameters):
theta_plus = np.zeros((num_parameters,1))
theta_plus[i] = epsilon
J_plus[i] = self.cost.forward(self.forward(X, debug=True, epsilon=theta_plus),y)
theta_minus = np.zeros((num_parameters,1))
theta_minus[i] = - epsilon
J_minus[i] = self.cost.forward(self.forward(X, debug=True, epsilon=theta_minus),y)
dtheta_approx[i] = (J_plus[i] - J_minus[i])/ (2 * epsilon)
numerator = np.linalg.norm(dtheta - dtheta_approx)
denominator = np.linalg.norm(dtheta_approx) + np.linalg.norm(dtheta)
difference = numerator / denominator
return difference
I'm using ReLU and Sigmoid as activation functions, and Categorical Cross Entropy for the cost
import numpy as np
from scipy.special import expit as sigmoid
class Activation_ReLU:
def forward(self, inputs):
return np.maximum(0, inputs)
def backward(self, inputs):
return np.greater(inputs,0).astype(int)
class Activation_Sigmoid:
def forward(self, inputs):
return sigmoid(inputs)
def backward(self, inputs):
return sigmoid(inputs) * (1 - sigmoid(inputs))
class Categorical_CrossEntropyLoss():
def forward(self, y_pred, y_real):
predictions = np.copy(y_pred)
predictions = np.clip(predictions, 1e-12, 1 - 1e-12) # avoid zero values for log
n = y_real.shape[0]
return - (1 / n) * np.sum(y_real * np.log(y_pred))
def backward(self, y_pred, y_real):
return y_real - y_pred
These are the main classes that define the net. The model that I create to train on Iris dataset is a NN with 1 hidden layer.
# random seed is 1
X, y = load_iris(return_X_y=True)
X = (X - np.mean(X)) / np.std(X) # standardize data to improve network convergence
y = y.reshape((-1,1))
encoder = OneHotEncoder(sparse=False)
y = encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8)
model = NeuralNet()
model.add(Dense((4,10),name='input_layer',activation='relu'))
model.add(Dense((10,10),name='hidden_layer',activation='relu'))
model.add(Dense((10,3),name='output_layer',activation='sigmoid'))
model.fit(X_train,y_train, batch_size=5, epochs=200, step=1e-3)
difference = model.gradient_check(X_train, y_train)
And then, the result of print(difference) is
0.7992920544491866
So there is something wrong with my implementation. What things I have to check to determine the causes of this high difference between gradients?
I'm working on a NeuralNetwork class using the back-propogation algorithm from Artificial Intelligence: A Modern Approach. I stepped through a run of the train function with a debugger and everything seems to be working properly, but the error isn't going down when I run it. Can anybody spot what I'm doing wrong?
import math, random
import numpy as np
CLOSE = 0.2
class Perceptron:
'''A single perceptron using sigmoid activation'''
def __init__(self, inputs):
'''Set up the perceptron with the given number of inputs'''
self.weights = np.empty(inputs)
for i in range(inputs):
self.weights[i] = random.random()
self.bias = random.random()
def getOutput(self, inputs):
'''Calculates, stores, and returns the output'''
assert len(inputs) == len(self.weights)
inj = np.sum(inputs * self.weights) + self.bias # Sum inputs
g = 1.0 / (1.0 + math.exp(-inj)) # Sigmoid activation
self.aj = g
return g
def adjust(self, delta):
'''Adjusts the weights and bias'''
self.bias += self.aj * delta
for i in range(len(self.weights)):
self.weights[i] += self.aj * delta
class Layer:
'''Creates a single layer in a single feed-forward neural network'''
def __init__(self, width, inputSize, prevLayer=False):
'''Create a new layer'''
self.prevLayer = prevLayer
self.nextLayer = False
self.nodes = []
for _ in range(width):
self.nodes.append(Perceptron(inputSize))
def setNext(self, nextLayer):
'''Set the next layer in the network'''
self.nextLayer = nextLayer
def getOutput(self, inputs):
'''Get an array of the output of the network'''
output = np.empty(len(self.nodes))
for i in range(len(self.nodes)):
output[i] = self.nodes[i].getOutput(inputs)
if isinstance(self.nextLayer, Layer):
# If this isn't the output layer, recurse to the next layer down
return self.nextLayer.getOutput(output)
return output
def backProp(self, deltas):
'''Back-propogate error through all the layers'''
if isinstance(self.prevLayer, Layer):
# If this isn't the input layer, calculate deltas for the next layer up
crossprod = np.empty((len(deltas), len(self.nodes[0].weights)))
for j in range(len(deltas)):
crossprod[j][:] = self.nodes[j].weights * deltas[j]
nextDeltas = crossprod.sum(axis=0)
for i in range(len(nextDeltas)):
# multiply by g'
nextDeltas[i] *= self.prevLayer.nodes[i].aj * (1.0 - self.prevLayer.nodes[i].aj)
# Recurse upwards
self.prevLayer.backProp(nextDeltas)
# Adjust the weights of neurons in this layer
for i in range(len(self.nodes)):
self.nodes[i].adjust(deltas[i])
class NeuralNetwork:
def __init__(self, layerSizes=np.array(0), filename=""):
'''Creates a neural network with the given layer sizes.'''
prev = False
inputLayer = False
for i in range(len(layerSizes)-1):
inputSize = layerSizes[i]
outputSize = layerSizes[i+1]
layer = Layer(outputSize, inputSize, prev)
if isinstance(prev, Layer):
prev.setNext(layer)
if not isinstance(inputLayer, Layer):
inputLayer = layer
prev = layer
self.inputLayer = inputLayer
self.outputLayer = prev
def train(self, inputs, outputs):
'''Train the network on the given sample'''
pred = self.inputLayer.getOutput(inputs)
# calculate error of output layer
error = outputs - pred
deltas = error * pred * (1.0 - pred)
# back-propogate the error
self.outputLayer.backProp(deltas)
# return error
return np.max(abs(error))
def test(self, inputs, outputs):
'''Test the network on the given sample'''
pred = self.inputLayer.getOutput(inputs)
correct = True
for i in range(len(pred)):
if abs(pred[i] - outputs[i]) > CLOSE:
correct = False
return correct
You can try any of these measures :
Shuffle your data well.
Use a smaller learning rate like 0.001
Use ReLU instead of sigmoid.
Initialize your biases as 1 and not random.
Use softmax at output layer if you are using ReLU.