I am trying to implement a one hidden layer neural net. The weights are getting updated but the predictions are incorrect. I think the values of weights and biases are incorrect but cannot find a solution to the issue.
class ShallowNeuralNetwork:
def sigmoid(self, val):
return 1 / (1 + np.exp(-val))
def sigmoid_derivative(self, val):
return val *(1 - val)
def __init__(self, hidden_nodes, alpha, epochs):
#Declaring variables for the constructor call of the shallow neural net class
#for shallow neural net we only need one hidden layer so we are creating a 2d array to store weights of the hidden layer
#and output_weights for weights to the single output node
self.hidden_nodes = hidden_nodes
self.alpha = alpha
self.epochs = epochs
self.hidden_weights = None
self.hidden_bias = None
self.output_weights = None
self.output_bias = None
def fit(self, X, y):
#MxN weights where M is the number of input nodes and N is the number of weights each node in N gets m weights
self.hidden_weights = np.random.rand(X.shape[1], self.hidden_nodes)
#M biases for the hidden layer
self.hidden_bias = np.random.rand(self.hidden_nodes)
#N weights for calculating output
self.output_weights = np.random.rand(self.hidden_nodes,1)
#bias value for output
self.output_bias = np.random.rand(1)
for _ in range(self.epochs):
for x_one, y_one in zip(X, y):
Z1 = np.dot(x_one, self.hidden_weights) + self.hidden_bias
A1 = self.sigmoid(Z1)
Z2 = np.dot(Z1, self.output_weights) + self.output_bias
A2 = self.sigmoid(Z2)
error = A2 - y_one
delta_output_layer = error * self.sigmoid_derivative(A2)
error_hidden_layer = np.dot(delta_output_layer, self.output_weights.T)
delta_hidden_layer = error_hidden_layer * self.sigmoid_derivative(A1)
self.hidden_weights -= self.alpha*delta_hidden_layer
self.hidden_bias -= self.alpha*np.sum(error_hidden_layer)
self.output_weights -= self.alpha*delta_output_layer
self.output_bias -= self.alpha*np.sum(error)
def predict(self, X):
Z1 = np.dot(X, self.hidden_weights) + self.hidden_bias
A1 = self.sigmoid(Z1)
Z2 = np.dot(Z1, self.output_weights) + self.output_bias
pred = self.sigmoid(Z2)
return pred
I think I am updating the weights incorrectly , any ideas
Do you have any idea why this network doesn't want to learn? The idea is that it uses ReLU as an activation function in earlier layers and sigmoid as an activation function in the last layer. The network learned fine when I used only sigmoid. To verify the network I used MNIST.
def sigmoid( z ):
return 1.0 / (1.0 + np.exp(-z))
def sigmoid_prime(z):
return sigmoid(z)*(1-sigmoid(z))
def RELU(z):
return z*(z>0)
def RELU_Prime(z):
return (z>0)
# x - training data in mnist for example (1,784) vector
# y - training label in mnist for example (1,10) vector
# nabla is gradient for the current x and y
def backprop(self, x, y):
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
# feedforward
activation = x
activations = [x] # list to store all the activations, layer by layer
zs = [] # list to store all the z vectors, layer by layer
index =0
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation)+b
if index == len(self.weights)-1:
activation = sigmoid(z)
#previous layers are RELU
activation = RELU(z)
index +=1
# backward pass
delta = self.cost_derivative(activations[-1], y) *\
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
for l in range(2, self.num_layers):
z = zs[-l]
sp = RELU_Prime(z)
delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
return (nabla_b, nabla_w)
--------------- Edit -----------------------------
def cost_derivative(self, output_activations, y):
return (output_activations-y)
--------------- Edit 2 -----------------------------
self.weights = [w-(eta/len(mini_batch))*nw
for w, nw in zip(self.weights, nabla_w)]
self.biases = [b-(eta/len(mini_batch))*nb
for b, nb in zip(self.biases, nabla_b)]
eta > 0
For those in future the answer for this problem is simple but hidden :). It turns out the weight initialization was wrong. To make it work you have to use Xavier initialization and multiply it by 2.
Currently I'm learning from Andrew Ng course on Coursera called "Machine Learning". In exercise 5, we built a model that can predict digits, trained by the MNIST dataset. This task was completed successfully in Matlab by me, but I wanted to migrate that code to Python, just to see how different things are and maybe continue to play around with the model.
I managed to implement the cost function and the back propagation algorithm correctly. I know that because I compared the metrics with my working model in Matlab and it emits the same numbers.
Now, because in the course we train the model using fmincg, I tried to do the same using Scipy fmin_cg
My problem is, the cost function takes extra small steps and fails to converge.
Here is my code for the network:
import numpy as np
import utils
import scipy.optimize as op
class Network:
def __init__(self, layers):
self.layers = layers
self.weights = self.generate_params()
# Function for generating theta multidimensional matrix
def generate_params(self):
theta = []
epsilon = 0.12
for i in range(len(self.layers) - 1):
current_layer_units = self.layers[i]
next_layer_units = self.layers[i + 1]
theta_i = np.multiply(
np.random.rand(next_layer_units, current_layer_units + 1),
2 * epsilon - epsilon
# Appending the params to the theta matrix
return theta
# Function to append bias row/column to matrix X
def append_bias(self, X, d):
m = X.shape[0]
n = 1 if len(X.shape) == 1 else X.shape[1]
if (d == 'column'):
ones = np.ones((m, n + 1))
ones[:, 1:] = X.reshape((m, n))
elif (d == 'row'):
ones = np.ones((m + 1, n))
ones[1:, :] = X.reshape((m, n))
return ones
# Function for computing the gradient for 1 training example
def back_prop(self, y, feed, theta):
activations = feed["activations"]
weighted_layers = feed["weighted_layers"]
delta_output = activations[-1] - y.reshape(len(y), 1)
current_delta = delta_output
# Initializing gradients
gradients = []
for i, theta_i in enumerate(theta):
# Peforming delta calculations.
# Here, we continue to propagate the delta values backwards
# until we arrive to the second layer.
for i in reversed(range(len(theta))):
theta_i = theta[i]
if (i > 0):
i_weighted_inputs = self.append_bias(weighted_layers[i - 1], 'row')
t_theta_i = np.transpose(theta_i)
delta_i = np.multiply(np.dot(t_theta_i, current_delta), utils.sigmoidGradient(i_weighted_inputs))
delta_i = delta_i[1:]
gradients[i] = current_delta * np.transpose(activations[i])
# Setting current delta for the next layer
current_delta = delta_i
gradients[i] = current_delta * np.transpose(activations[i])
return gradients
# Function for computing the cost and the derivatives
def compute_cost(self, theta, X, y, r12n = 0):
m = len(X)
num_labels = self.layers[-1]
costs = np.zeros(m)
# Initializing gradients
gradients = []
for i, theta_i in enumerate(theta):
# Iterating over the training set
for i in range(m):
inputs = X[i]
observed = utils.create_output_vector(y[i], num_labels)
feed = self.feed_forward(inputs)
predicted = feed["activations"][-1]
total_cost = 0
for k, o in enumerate(observed):
if (o == 1):
total_cost += np.log(predicted[k])
total_cost += np.log(1 - predicted[k])
cost = -1 * total_cost
# Storing the cost for the i-th training example
costs[i] = cost
# Calculating the gradient for this training example
# using back propagation algorithm
gradients_i = self.back_prop(observed, feed, theta)
for i, gradient in enumerate(gradients_i):
gradients[i] += gradient
# Calculating the avg regularization term for the cost
sum_of_theta = 0
for i, theta_i in enumerate(theta):
squared_theta = np.power(theta_i[:, 1:], 2)
sum_of_theta += np.sum(squared_theta)
r12n_avg = r12n * sum_of_theta / (2 * m)
total_cost = np.sum(costs) / m + r12n_avg
# Applying regularization terms to the gradients
for i, theta_i in enumerate(theta):
lambda_i = np.copy(theta_i)
lambda_i[:, 0] = 0
lambda_i = np.multiply((r12n / m), lambda_i)
# Adding the r12n matrix to the gradient
gradients[i] = gradients[i] / m + lambda_i
return total_cost, gradients
# Function for training the neural network using conjugate gradient algorithm
def train_cg(self, X, y, r12n = 0, iterations = 50):
weights = self.weights
def Cost(theta, X, y):
theta = utils.roll_theta(theta, self.layers)
cost, _ = self.compute_cost(theta, X, y, r12n)
return cost
def Gradient(theta, X, y):
theta = utils.roll_theta(theta, self.layers)
_, gradient = self.compute_cost(theta, X, y, r12n)
return utils.unroll_theta(gradient)
unrolled_theta = utils.unroll_theta(weights)
result = op.fmin_cg(f = Cost,
x0 = unrolled_theta,
args=(X, y),
maxiter = iterations)
self.weights = utils.roll_theta(result, self.layers)
# Function for feeding forward the network
def feed_forward(self, X):
# Useful variables
activations = []
weighted_layers = []
weights = self.weights
currentActivations = self.append_bias(X, 'row')
for i in range(len(self.layers) - 1):
layer_weights = weights[i]
weighted_inputs = np.dot(layer_weights, currentActivations)
# Storing the weighted inputs
activation_nodes = []
# If the next layer is not the output layer, we'd like to add a bias unit to it
# (Excluding the input and the output layer)
if (i < len(self.layers) - 2):
activation_nodes = self.append_bias(utils.sigmoid(weighted_inputs), 'row')
activation_nodes = utils.sigmoid(weighted_inputs)
# Appending the layer of nodes to the activations array
currentActivations = activation_nodes
data = {
"activations": activations,
"weighted_layers": weighted_layers
return data
def predict(self, X):
data = self.feed_forward(X)
output = data["activations"][-1]
# Finding the max index in the output layer
return np.argmax(output, axis=0)
Here is the invocation of the code:
import numpy as np
from network import Network
# %% Load data
X = np.genfromtxt('data/mnist_data.csv', delimiter=',')
y = np.genfromtxt('data/mnist_outputs.csv', delimiter=',').astype(int)
# %% Create network
num_labels = 10
input_layer = 400
hidden_layer = 25
output_layer = num_labels
layers = [input_layer, hidden_layer, output_layer]
# Create a new neural network
network = Network(layers)
# %% Train the network and save the weights
network.train_cg(X, y, r12n = 1, iterations = 20)
This is what the code emits after each iteration:
As you can see, the changes to the cost are very small.
I checked for the shapes of the vectors and gradient and they both seem fine, just like in my Matlab implementation. I'm not sure what I do wrong here.
If you guys could help me, that'd be great :)
I'm trying to make my first Neural Network, and I've tried to make one with 2 hidden layers, but it's not learning.
I'm pretty sure the feedforward part is working fine, but no matter how much I train it(with a XOR table), the error doesn't decrease, it just oscillates around 0.5. I guess there's something wrong with the backpropagation part, but I've revised it many times and even calculated what it should be to no avail.
Here's the code:
class NeuralNetwork:
def __init__(self, input_size, output_size):
# Sets the input and output sizes
self.inputs = input_size
self.outputs = output_size
# Sets size of hidden layers
self.L1 = 2
self.L2 = 2
# Initializes weights
self.W1 = np.random.rand(self.inputs.shape[1], self.L1)
self.W2 = np.random.rand(self.L1, self.L2)
self.W3 = np.random.rand(self.L2, self.outputs.shape[1])
# The network evaluates the inputs
def feedforward(self, inp):
self.inputs = inp
# Evaluates layer 1
Z1 = sig(np.dot(self.inputs, self.W1))
# Evaluates layer 2
Z2 = sig(np.dot(Z1, self.W2))
# Evaluates end result
Z3 = sig(np.dot(Z2, self.W3))
self.outputs = Z3
# Backpropagation
def train(self, inp, outp):
# Evaluate the input
self.inputs = inp
# Evaluates layer 1
Z1 = sig(np.dot(self.inputs, self.W1))
# Evaluates layer 2
Z2 = sig(np.dot(Z1, self.W2))
# Evaluates end result
Z3 = sig(np.dot(Z2, self.W3))
# Evaluate the error
error = Z3 - outp
print("Error: " + str(abs(error.sum())))
# Backpropagates
dcost_dpred = error
dpred_dz = dsig(np.dot(Z2, self.W3))
z_delta = dcost_dpred * dpred_dz
self.W3 -= lr * np.dot(Z2.T, z_delta)
z_delta = np.dot(z_delta, self.W3.T)
self.W2 -= lr * (np.dot(Z1.T, z_delta) * dsig(np.dot(Z1, self.W2)))
z_delta = np.dot(z_delta, self.W2.T)
self.W1 -= lr * (np.dot(self.inputs.T, z_delta) * dsig(np.dot(self.inputs, self.W1)))
Is it really wrong? If it is, how should I fix it?
I am very new to machine learning and am trying to implement an MLP however the cost function seems to be reaching a local minimum before reaching the global minimum. I plotted the cost as a function of iteration (including a 0 value as to not be fooled by where the y-axis starts). Here is the code that I am using at my attempt:
import numpy as np
class NNet(object):
def __init__(self, n_in, n_hidden, n_out):
self.n_in = n_in
self.n_hidden = n_hidden
self.n_out = n_out
self.W1 = np.random.randn(n_in, n_hidden)
self.W2 = np.random.randn(n_hidden, n_out)
self.b1 = np.random.randn(n_hidden,)
self.b2 = np.random.randn(n_out,)
def sigmoid(self, z):
return 1/(1 + np.exp(-z))
def sig_prime(self, z):
return (np.exp(-z))/((1+np.exp(-z))**2)
def propagate_forward(self, X):
self.z1 = np.dot(self.W1.T, X) + self.b1
self.a1 = self.sigmoid(self.z1)
self.z2 = np.dot(self.W2.T, self.a1) + self.b2
self.a2 = self.sigmoid(self.z2)
return self.a2
def cost(self, y, y_hat):
return np.mean([np.sum((y[i] - y_hat[i])**2) for i in range(y.shape[0])])/2
def cost_grad(self, X, y):
y_hat = self.propagate_forward(X)
d2 = np.multiply(self.sig_prime(self.z2), -(y - y_hat))
gJ_W2 = np.matrix(np.multiply(self.a1.T, d2))
d1 = np.dot(self.W2, d2)*self.sig_prime(self.z1)
gJ_W1 = np.dot(np.matrix(X).T, np.matrix(d1))
return [gJ_W1, d1, gJ_W2, d2]
m = 1000
n = 1
X = np.zeros((m, n))
y = np.zeros((m,1))
import random
import math
i = 0
for r, theta in zip(np.linspace(0, 5, num=m), np.linspace(0, 8 * math.pi, num=m)):
r += random.random()
X[i] = [r * math.cos(theta), r * math.sin(theta)]
if i < 333:
y[i] = 0
elif i < 666:
y[i] = 1
y[i] = 2
i += 1
nnet = NNet(n, 5, 1)
learning_rate = 0.2
improvement_threshold = 0.995
cost = np.inf
xs = []
ys = []
iter = 0
while cost > 0.2:
cost = nnet.cost(y, [nnet.propagate_forward(x_train) for x_train
if iter % 100 == 0:
print("Cost", cost)
if iter >= 1000:
print("Gradient descent is taking too long, giving up.")
cost_grads = [nnet.cost_grad(x_train, y_train) for x_train, y_train in zip(X, y)]
gW1 = [grad[0] for grad in cost_grads]
gb1 = [grad[1] for grad in cost_grads]
gW2 = [grad[2] for grad in cost_grads]
gb2 = [grad[3] for grad in cost_grads]
nnet.W1 -= np.mean(gW1, axis=0)/2 * learning_rate
nnet.b1 -= np.mean(gb1, axis=0)/2 * learning_rate
nnet.W2 -= np.mean(gW2, axis=0).T/2 * learning_rate
nnet.b2 -= np.mean(gb2, axis=0)/2 * learning_rate
iter += 1
Why is the cost not improving after a certain point? Also any other tips are highly appreciated.
The generated toy dataset looks like this
Your goal seems to be to predict to which class {0,1,2} belongs the data.
The output of your net is a sigmoid (sigm(x) in [0,1]) and you're
training using mean squared error (MSE), it's impossible for the model to predict a value above 1. So it's always wrong when the class to predict is 2.
The cost probably flattens because your sigmoid unit saturate (when trying to predict 2) and the gradient for saturating sigmoid is 0
For classification neural net normally end with a softmax layer and
are trained using cross-entropy.
If you want to keep using MSE and sigmoids unit for classification, you should consider predicting only two classes at a time in a One-vs-(One/All) kinda way.
Anyway, if you only do bi-class classification by rounding output to 0 or 1,it seems to work. Cost is decreasing and accuracy rising (quickly modified code):
I got the idea of implementing my version of deep feature selection is from the paper here,http://link.springer.com/chapter/10.1007%2F978-3-319-16706-0_20
The basic idea of deep feature selection according to this paper is to add a one to one mapping layer before any full connected hidden layer, then by adding a regularization term (whether lasso or elastic net) to produce zeros in the input layer weights.
My question is, even though it seems I have implemented the deep feature selection framework well, while testing on the random data generated by numpy.rand.random(1000,50) fails to give me any zeros on the initial weight. Is is a common thing for lasso like regularization? Am I going to adjust the parameters I used for this framework (even larger epochs)? Or did I do something wrong with my code.
class DeepFeatureSelectionMLP:
def __init__(self, X, Y, hidden_dims=[100], epochs=1000,
lambda1=0.001, lambda2=1.0, alpha1=0.001, alpha2=0.0, learning_rate=0.1):
# Initiate the input layer
# Get the dimension of the input X
n_sample, n_feat = X.shape
n_classes = len(np.unique(Y))
# One hot Y
one_hot_Y = np.zeros((len(Y), n_classes))
for i,j in enumerate(Y):
one_hot_Y[i][j] = 1
self.epochs = epochs
Y = one_hot_Y
# Store up original value
self.X = X
self.Y = Y
# Two variables with undetermined length is created
self.var_X = tf.placeholder(dtype=tf.float32, shape=[None, n_feat], name='x')
self.var_Y = tf.placeholder(dtype=tf.float32, shape=[None, n_classes], name='y')
self.input_layer = One2OneInputLayer(self.var_X)
self.hidden_layers = []
layer_input = self.input_layer.output
# Create hidden layers
for dim in hidden_dims:
self.hidden_layers.append(DenseLayer(layer_input, dim))
layer_input = self.hidden_layers[-1].output
# Final classification layer, variable Y is passed
self.softmax_layer = SoftmaxLayer(self.hidden_layers[-1].output, n_classes, self.var_Y)
n_hidden = len(hidden_dims)
# regularization terms on coefficients of input layer
self.L1_input = tf.reduce_sum(tf.abs(self.input_layer.w))
self.L2_input = tf.nn.l2_loss(self.input_layer.w)
# regularization terms on weights of hidden layers
L1s = []
L2_sqrs = []
for i in xrange(n_hidden):
self.L1 = tf.add_n(L1s)
self.L2_sqr = tf.add_n(L2_sqrs)
# Cost with two regularization terms
self.cost = self.softmax_layer.cost \
+ lambda1*(1.0-lambda2)*0.5*self.L2_input + lambda1*lambda2*self.L1_input \
+ alpha1*(1.0-alpha2)*0.5 * self.L2_sqr + alpha1*alpha2*self.L1
self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
self.y = self.softmax_layer.y
def train(self, batch_size=100):
sess = tf.Session()
for i in xrange(self.epochs):
x_batch, y_batch = get_batch(self.X, self.Y, batch_size)
sess.run(self.optimizer, feed_dict={self.var_X: x_batch, self.var_Y: y_batch})
if (i + 1) % 50 == 0:
l = sess.run(self.cost, feed_dict={self.var_X: x_batch, self.var_Y: y_batch})
print('epoch {0}: global loss = {1}'.format(i, l))
self.selected_w = sess.run(self.input_layer.w)
class One2OneInputLayer(object):
# One to One Mapping!
def __init__(self, input):
The second dimension of the input,
for each input, each row is a sample
and each column is a feature, since
this is one to one mapping, n_in equals
the number of features
n_in = input.get_shape()[1].value
self.input = input
# Initiate the weight for the input layer
w = tf.Variable(tf.zeros([n_in,]), name='w')
self.w = w
self.output = self.w * self.input
self.params = [w]
class DenseLayer(object):
# Canonical dense layer
def __init__(self, input, n_out, activation='sigmoid'):
The second dimension of the input,
for each input, each row is a sample
and each column is a feature, since
this is one to one mapping, n_in equals
the number of features
n_out defines how many nodes are there in the
hidden layer
n_in = input.get_shape()[1].value
self.input = input
# Initiate the weight for the input layer
w = tf.Variable(tf.ones([n_in, n_out]), name='w')
b = tf.Variable(tf.ones([n_out]), name='b')
output = tf.add(tf.matmul(input, w), b)
output = activate(output, activation)
self.w = w
self.b = b
self.output = output
self.params = [w]
class SoftmaxLayer(object):
def __init__(self, input, n_out, y):
The second dimension of the input,
for each input, each row is a sample
and each column is a feature, since
this is one to one mapping, n_in equals
the number of features
n_out defines how many nodes are there in the
hidden layer
n_in = input.get_shape()[1].value
self.input = input
# Initiate the weight and biases for this layer
w = tf.Variable(tf.random_normal([n_in, n_out]), name='w')
b = tf.Variable(tf.random_normal([n_out]), name='b')
pred = tf.add(tf.matmul(input, w), b)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
self.y = y
self.w = w
self.b = b
self.cost = cost
self.params= [w]
Gradient descent algorithms such as Adam do not give exact zeros when using l1 regularization. Instead, something like ftrl or proximal adagrad can give you exact zeros.