Backpropagation on a 2-hidden-layer neural network - Python

I'm trying to make my first neural network, one with 2 hidden layers, but it's not learning.
I'm pretty sure the feedforward part is working fine, but no matter how much I train it (on an XOR table), the error doesn't decrease; it just oscillates around 0.5. I guess there's something wrong with the backpropagation part, but I've gone over it many times and even worked out by hand what it should be, to no avail.
Here's the code:
class NeuralNetwork:
    def __init__(self, input_size, output_size):
        # Sets the input and output sizes
        self.inputs = input_size
        self.outputs = output_size
        # Sets size of hidden layers
        self.L1 = 2
        self.L2 = 2
        # Initializes weights
        self.W1 = np.random.rand(self.inputs.shape[1], self.L1)
        self.W2 = np.random.rand(self.L1, self.L2)
        self.W3 = np.random.rand(self.L2, self.outputs.shape[1])

    # The network evaluates the inputs
    def feedforward(self, inp):
        self.inputs = inp
        # Evaluates layer 1
        Z1 = sig(np.dot(self.inputs, self.W1))
        # Evaluates layer 2
        Z2 = sig(np.dot(Z1, self.W2))
        # Evaluates end result
        Z3 = sig(np.dot(Z2, self.W3))
        self.outputs = Z3

    # Backpropagation
    def train(self, inp, outp):
        # Evaluate the input
        self.inputs = inp
        # Evaluates layer 1
        Z1 = sig(np.dot(self.inputs, self.W1))
        # Evaluates layer 2
        Z2 = sig(np.dot(Z1, self.W2))
        # Evaluates end result
        Z3 = sig(np.dot(Z2, self.W3))
        # Evaluate the error
        error = Z3 - outp
        print("Error: " + str(abs(error.sum())))
        # Backpropagates
        dcost_dpred = error
        dpred_dz = dsig(np.dot(Z2, self.W3))
        z_delta = dcost_dpred * dpred_dz
        self.W3 -= lr * np.dot(Z2.T, z_delta)
        z_delta = np.dot(z_delta, self.W3.T)
        self.W2 -= lr * (np.dot(Z1.T, z_delta) * dsig(np.dot(Z1, self.W2)))
        z_delta = np.dot(z_delta, self.W2.T)
        self.W1 -= lr * (np.dot(self.inputs.T, z_delta) * dsig(np.dot(self.inputs, self.W1)))
Is it really wrong? If it is, how should I fix it?
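For reference, a sketch of the textbook delta chain for this architecture, reusing the names above (sig, dsig and lr as assumed elsewhere in the post; this is a drop-in for the tail of train(), not verified against the rest of the code). Each delta is multiplied by the activation derivative before being pushed back through the weights, and the weights are used before they are updated:

    # Textbook delta recursion with the same names as train() above.
    d3 = (Z3 - outp) * dsig(np.dot(Z2, self.W3))             # output-layer delta
    d2 = np.dot(d3, self.W3.T) * dsig(np.dot(Z1, self.W2))   # uses W3 before it is updated
    d1 = np.dot(d2, self.W2.T) * dsig(np.dot(self.inputs, self.W1))
    self.W3 -= lr * np.dot(Z2.T, d3)
    self.W2 -= lr * np.dot(Z1.T, d2)
    self.W1 -= lr * np.dot(self.inputs.T, d1)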

Related

Backpropagation algorithm not converging

I am trying to implement a one-hidden-layer neural net. The weights are getting updated, but the predictions are incorrect. I think the values of the weights and biases are wrong, but I cannot find the source of the issue.
class ShallowNeuralNetwork:
    def sigmoid(self, val):
        return 1 / (1 + np.exp(-val))

    def sigmoid_derivative(self, val):
        return val * (1 - val)

    def __init__(self, hidden_nodes, alpha, epochs):
        # Declaring variables for the constructor call of the shallow neural net class.
        # For a shallow neural net we only need one hidden layer, so we create a 2d array
        # to store the weights of the hidden layer, and output_weights for the weights
        # to the single output node.
        self.hidden_nodes = hidden_nodes
        self.alpha = alpha
        self.epochs = epochs
        self.hidden_weights = None
        self.hidden_bias = None
        self.output_weights = None
        self.output_bias = None

    def fit(self, X, y):
        # MxN weights, where M is the number of input nodes and N the number of hidden nodes
        self.hidden_weights = np.random.rand(X.shape[1], self.hidden_nodes)
        # N biases for the hidden layer
        self.hidden_bias = np.random.rand(self.hidden_nodes)
        # N weights for calculating the output
        self.output_weights = np.random.rand(self.hidden_nodes, 1)
        # bias value for the output
        self.output_bias = np.random.rand(1)
        for _ in range(self.epochs):
            for x_one, y_one in zip(X, y):
                Z1 = np.dot(x_one, self.hidden_weights) + self.hidden_bias
                A1 = self.sigmoid(Z1)
                Z2 = np.dot(Z1, self.output_weights) + self.output_bias
                A2 = self.sigmoid(Z2)
                error = A2 - y_one
                delta_output_layer = error * self.sigmoid_derivative(A2)
                error_hidden_layer = np.dot(delta_output_layer, self.output_weights.T)
                delta_hidden_layer = error_hidden_layer * self.sigmoid_derivative(A1)
                self.hidden_weights -= self.alpha * delta_hidden_layer
                self.hidden_bias -= self.alpha * np.sum(error_hidden_layer)
                self.output_weights -= self.alpha * delta_output_layer
                self.output_bias -= self.alpha * np.sum(error)

    def predict(self, X):
        Z1 = np.dot(X, self.hidden_weights) + self.hidden_bias
        A1 = self.sigmoid(Z1)
        Z2 = np.dot(Z1, self.output_weights) + self.output_bias
        pred = self.sigmoid(Z2)
        return pred
I think I am updating the weights incorrectly. Any ideas?
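For comparison, a textbook per-sample update for this shape of network takes the outer product of each layer's input with that layer's delta, and uses the deltas (not the raw errors) for the biases. This is only a sketch reusing the names from fit() above, not a verified drop-in:

    # Textbook SGD step for an (F -> H -> 1) sigmoid net.
    # Shapes here: x_one is (F,), A1 is (H,), delta_hidden_layer is (H,),
    # delta_output_layer is (1,).
    self.output_weights -= self.alpha * np.outer(A1, delta_output_layer)     # (H, 1)
    self.hidden_weights -= self.alpha * np.outer(x_one, delta_hidden_layer)  # (F, H)
    self.output_bias -= self.alpha * delta_output_layer
    self.hidden_bias -= self.alpha * delta_hidden_layer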

Strange result from a neural network in Python

I followed an article here: TowardsDataScience.
I wrote out the math for the network, and everything made sense.
However, after writing the code, the results are pretty strange: it always predicts the same class...
I spent a lot of time on it and changed many things, but I still cannot understand what I did wrong.
Here is the code:
# coding: utf-8
from mnist import MNIST
import numpy as np
import math
import os
import pdb

DATASETS_PREFIX = '../Datasets/MNIST'
mndata = MNIST(DATASETS_PREFIX)
TRAINING_IMAGES, TRAINING_LABELS = mndata.load_training()
TESTING_IMAGES, TESTING_LABELS = mndata.load_testing()

### UTILS
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def d_sigmoid(x):
    return x.T * (1 - x)
    #return np.dot(x.T, 1.0 - x)

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

def d_softmax(x):
    # This function has not yet been tested.
    return x.T * (1 - x)

def tanh(x):
    return np.tanh(x)

def d_tanh(x):
    return 1 - x.T * x

def normalize(image):
    return image / (255.0 * 0.99 + 0.01)
### !UTILS

class NeuralNetwork(object):
    """
    This is a 3-layer neural network (1 hidden layer).
    #_input   : input layer
    #_weights1: weights between input layer and hidden layer (matrix shape (input.shape[1], 4))
    #_weights2: weights between hidden layer and output layer (matrix shape (4, 1))
    #_y       : output
    #_output  : computed output
    #_alpha   : learning rate
    """
    def __init__(self, xshape, yshape):
        self._neurones_nb = 20
        self._input = None
        self._weights1 = np.random.randn(xshape, self._neurones_nb)
        self._weights2 = np.random.randn(self._neurones_nb, yshape)
        self._y = np.mat(np.zeros(yshape))
        self._output = np.mat(np.zeros(yshape))
        self._alpha1 = 0.1
        self._alpha2 = 0.1
        self._function = sigmoid
        self._derivative = d_sigmoid
        self._epoch = 1

    def Train(self, xs, ys):
        for j in range(self._epoch):
            for i in range(len(xs)):
                self._input = normalize(np.mat(xs[i]))
                self._y[0, ys[i]] = 1
                self.feedforward()
                self.backpropagation()
                self._y[0, ys[i]] = 0

    def Predict(self, image):
        self._input = normalize(image)
        out = self.feedforward()
        return out

    def feedforward(self):
        self._layer1 = self._function(np.dot(self._input, self._weights1))
        self._output = self._function(np.dot(self._layer1, self._weights2))
        return self._output

    def backpropagation(self):
        d_weights2 = np.dot(
            self._layer1.T,
            2 * (self._y - self._output) * self._derivative(self._output)
        )
        d_weights1 = np.dot(
            self._input.T,
            np.dot(
                2 * (self._y - self._output) * self._derivative(self._output),
                self._weights2.T
            ) * self._derivative(self._layer1)
        )
        self._weights1 += self._alpha1 * d_weights1
        self._weights2 += self._alpha2 * d_weights2

if __name__ == '__main__':
    neural_network = NeuralNetwork(len(TRAINING_IMAGES[0]), 10)
    print('* training neural network')
    neural_network.Train(TRAINING_IMAGES, TRAINING_LABELS)
    print('* testing neural network')
    count = 0
    for i in range(len(TESTING_IMAGES)):
        image = np.mat(TESTING_IMAGES[i])
        expected = TESTING_LABELS[i]
        prediction = neural_network.Predict(image)
        if i % 100 == 0: print(expected, prediction)
    #print(f'* results: {count} / {len(TESTING_IMAGES)}')
Thank you for your help, really appreciated.
Julien
Well, I don't see any error in the implementation, so considering your network, this could be improved by doing two things:
One epoch is not enough. Not at all! You need to pass over your data multiple times (a bare minimum is 10 times, an average might be around 100 epochs, and this can go up to 5000 or more).
Your network is a shallow network, i.e. really simple. To detect difficult things (like images), you could implement a CNN (Convolutional Neural Network), or first try deepening your network and making it more complex.
=> Try adding layers (3, 4, 5, etc.) and then adding neurons to each layer (50, 60, ...) depending on the size of your input. You can still go up to 800, 900 or more; a sketch follows below.
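For the second suggestion, a minimal sketch of what a two-hidden-layer variant might look like. The helper functions, sizes and names here are assumptions, not taken from the original code:

    import numpy as np

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def d_sigmoid(a):
        # derivative of sigmoid expressed via the activation a = sigmoid(z)
        return a * (1 - a)

    class DeeperNetwork:
        """Hypothetical two-hidden-layer variant of the class above."""
        def __init__(self, xshape, yshape, h1=50, h2=50, alpha=0.1):
            self._w1 = np.random.randn(xshape, h1) * 0.1
            self._w2 = np.random.randn(h1, h2) * 0.1
            self._w3 = np.random.randn(h2, yshape) * 0.1
            self._alpha = alpha

        def feedforward(self, x):
            # x is a 2-D row vector, shape (1, xshape)
            self._a1 = sigmoid(x @ self._w1)
            self._a2 = sigmoid(self._a1 @ self._w2)
            self._out = sigmoid(self._a2 @ self._w3)
            return self._out

        def backpropagation(self, x, y):
            # delta at the output, then propagated back one layer at a time
            d3 = (self._out - y) * d_sigmoid(self._out)
            d2 = (d3 @ self._w3.T) * d_sigmoid(self._a2)
            d1 = (d2 @ self._w2.T) * d_sigmoid(self._a1)
            self._w3 -= self._alpha * self._a2.T @ d3
            self._w2 -= self._alpha * self._a1.T @ d2
            self._w1 -= self._alpha * x.T @ d1

It would be instantiated the same way as the original, e.g. DeeperNetwork(len(TRAINING_IMAGES[0]), 10), with feedforward and backpropagation called per sample inside the training loop.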

I can't find the bug in this implementation of backpropagation?

My data is 4123 rows of inputs and outputs of an XOR gate.
I want to write a neural network with three input-layer neurons (the third one is the bias), a hidden layer, and an output layer.
Here's my implementation:
import numpy as np

class TwoLayerNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        """
        input_size: the number of neurons in the input layer
        hidden_size: the number of neurons in the hidden layer
        output_size: the number of neurons in the output layer
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.params = {}
        self.params['W1'] = 0.01 * np.random.randn(input_size, hidden_size)  # FxH
        self.params['b1'] = np.zeros((hidden_size, 1))  # Hx1
        self.params['W2'] = 0.01 * np.random.randn(hidden_size, output_size)  # HxO
        self.params['b2'] = np.zeros((output_size, 1))  # Ox1
        self.optimal_weights = []
        self.errors = {}

    def train(self, X, y, epochs):
        """
        X: input data matrix, NxF
        y: output vector, Nx1
        returns:
        the optimal set of parameters that best minimize the loss function
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        for iteration in range(epochs):
            forward_to_hidden = X.dot(W1)  # NxH
            activate_hidden = sigmoid(forward_to_hidden)  # NxH
            forward_to_output = activate_hidden.dot(W2)  # NxO
            output = sigmoid(forward_to_output)  # NxO
            self.errors[iteration] = np.mean(0.5 * (y**2 - output**2))
            output_error = y - output  # NxO
            output_layer_delta = output_error * sigmoidPrime(output)  # NxO
            hidden_layer_error = output_layer_delta.dot(W2.T)  # NxO . OxH = NxH
            hidden_layer_delta = hidden_layer_error * sigmoidPrime(activate_hidden)  # NxH
            W1_update = X.T.dot(hidden_layer_delta)  # FxN . NxH = FxH
            W2_update = activate_hidden.T.dot(output_layer_delta)  # HxN . NxO = HxO
            W1 += W1_update
            W2 += W2_update
        self.optimal_weights.append(W1)
        self.optimal_weights.append(W2)

    def predict(self, X):
        W1, W2 = self.optimal_weights[0], self.optimal_weights[1]
        forward = sigmoid(X.dot(W1))  # NxH
        forward = forward.dot(W2)  # NxO
        forward = sigmoid(forward)  # NxO
        return forward

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoidPrime(x):
    return sigmoid(x) * (1 - sigmoid(x))
I realize that's very vanilla, but that's intentional. I want to understand the most basic form of NN architecture first.
Now, my problem is that my error plot is confusing.
The neural network just stops learning.
My second problem is that my weights are blowing up, reaching up to -10000, which causes overflow because of the exp in the sigmoid function.
My third problem is that my output vector only ever outputs 0.5 instead of 1 or 0.
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('xor.csv').sample(frac=1)
X = data.iloc[:, [0, 1]]  # 1st and 2nd cols are the input
X = np.hstack((X, np.ones((data.shape[0], 1))))  # adding the bias 1's
y = data.iloc[:, 2][:, np.newaxis]  # 3rd col is the output

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

nn.train(X_train, y_train, 100)
plt.plot(range(100), [i for i in nn.errors.values()])
plt.show()
The link for the dataset
So, if I read your code right, your network is specified correctly, but it is missing a few key points it needs in order to learn XOR by backpropagation.
The fun part is, your error specification is weird. I made it into
self.errors[iteration] = np.mean(0.5 * (y - output)**2)
for visualization.
Plotting the error against the epoch shows what happens: the backpropagation hits a plateau, then rapidly blows up the weights. To slow down the blow-up and give the network some time to re-evaluate its mistakes, you can add a so-called "learning rate" != 1. This addresses one of the pitfalls.
Another one is oscillatory behaviour in the updates, where the program never reaches its optimum state. To address this, you can deliberately introduce an imperfection in the form of a "momentum".
Additionally, the initial conditions matter for the speed at which you converge, so you need enough epochs to overcome the local plateaux.
Last, but certainly not least, I did find an error in your specification, but all of the above still applies.
In your layer deltas you effectively compute sigmoidPrime(sigmoid(forward)), which is one call to sigmoid too many.
# learning rate and momentum terms
last_update = np.zeros((X.shape[1], W1.shape[1]))
last_update2 = np.zeros((W1.shape[1], W2.shape[1]))

# derivatives taken at the pre-activations, not at the activations
output_layer_delta = output_error * sigmoidPrime(forward_to_output)  # NxO
hidden_layer_delta = hidden_layer_error * sigmoidPrime(forward_to_hidden)  # NxH

W1 += 0.001 * (W1_update + last_update * 0.5)
W2 += 0.001 * (W2_update + last_update2 * 0.5)
# W1 = 0.001*W1_update
# W2 = 0.001*W2_update
last_update = W1_update.copy()
last_update2 = W2_update.copy()
That did the final trick for me. Now please verify, and appease this grumbling man who spent the better part of a night and a day figuring it out. ;) A sketch of how the pieces fit together follows below.
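For reference, here is how those pieces might fit together inside train(). This is a sketch; 0.001 and 0.5 are the illustrative learning-rate and momentum values from the snippets above, not tuned:

    # The answer's fixes folded back into the training loop.
    last_update = np.zeros_like(W1)
    last_update2 = np.zeros_like(W2)
    for iteration in range(epochs):
        forward_to_hidden = X.dot(W1)
        activate_hidden = sigmoid(forward_to_hidden)
        forward_to_output = activate_hidden.dot(W2)
        output = sigmoid(forward_to_output)
        self.errors[iteration] = np.mean(0.5 * (y - output)**2)
        output_error = y - output
        # derivatives evaluated at the pre-activations
        output_layer_delta = output_error * sigmoidPrime(forward_to_output)
        hidden_layer_error = output_layer_delta.dot(W2.T)
        hidden_layer_delta = hidden_layer_error * sigmoidPrime(forward_to_hidden)
        W1_update = X.T.dot(hidden_layer_delta)
        W2_update = activate_hidden.T.dot(output_layer_delta)
        # learning rate 0.001 plus momentum 0.5 on the previous update
        W1 += 0.001 * (W1_update + 0.5 * last_update)
        W2 += 0.001 * (W2_update + 0.5 * last_update2)
        last_update = W1_update.copy()
        last_update2 = W2_update.copy()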

Neural Network loss starts increasing while acc is increasing on both train/val data sets

For the past few days I have been debugging my NN, but I can't find the issue.
I've created a completely raw implementation of a multi-layer perceptron for identifying MNIST dataset images.
The network seems to learn, because after the training cycle the accuracy on the test data is above 94%. But I have a problem with the loss function: it starts increasing after a while, when the test/val accuracy reaches ~76%.
Can someone please check my forward/backprop math and tell me if my loss function is properly implemented, or suggest what might be wrong?
NN structure:
input layer: 758 nodes (1 node per pixel)
hidden layer 1: 300 nodes
hidden layer 2: 75 nodes
output layer: 10 nodes
NN activation functions:
input layer -> hidden layer 1: ReLU
hidden layer 1 -> hidden layer 2: ReLU
hidden layer 2 -> output layer: Softmax
NN loss function:
Categorical Cross-Entropy
Full CLEAN code available here as a Jupyter Notebook.
Neural Network forward/backward pass:
def train(self, features, targets):
    n_records = features.shape[0]
    # placeholders for weight and bias change values
    delta_weights_i_h1 = np.zeros(self.weights_i_to_h1.shape)
    delta_weights_h1_h2 = np.zeros(self.weights_h1_to_h2.shape)
    delta_weights_h2_o = np.zeros(self.weights_h2_to_o.shape)
    delta_bias_i_h1 = np.zeros(self.bias_i_to_h1.shape)
    delta_bias_h1_h2 = np.zeros(self.bias_h1_to_h2.shape)
    delta_bias_h2_o = np.zeros(self.bias_h2_to_o.shape)
    for X, y in zip(features, targets):
        ### forward pass
        # input to hidden 1
        inputs_to_h1_layer = np.dot(X, self.weights_i_to_h1) + self.bias_i_to_h1
        inputs_to_h1_layer_activated = self.activation_ReLU(inputs_to_h1_layer)
        # hidden 1 to hidden 2
        h1_to_h2_layer = np.dot(inputs_to_h1_layer_activated, self.weights_h1_to_h2) + self.bias_h1_to_h2
        h1_to_h2_layer_activated = self.activation_ReLU(h1_to_h2_layer)
        # hidden 2 to output
        h2_to_output_layer = np.dot(h1_to_h2_layer_activated, self.weights_h2_to_o) + self.bias_h2_to_o
        h2_to_output_layer_activated = self.softmax(h2_to_output_layer)
        # output
        final_outputs = h2_to_output_layer_activated
        ### backpropagation
        # output to hidden 2
        error = y - final_outputs
        output_error_term = error.dot(self.dsoftmax(h2_to_output_layer_activated))
        h2_error = np.dot(output_error_term, self.weights_h2_to_o.T)
        h2_error_term = h2_error * self.activation_dReLU(h1_to_h2_layer_activated)
        # hidden 2 to hidden 1
        h1_error = np.dot(h2_error_term, self.weights_h1_to_h2.T)
        h1_error_term = h1_error * self.activation_dReLU(inputs_to_h1_layer_activated)
        # weight & bias step (input to hidden)
        delta_weights_i_h1 += h1_error_term * X[:, None]
        delta_bias_i_h1 = np.sum(h1_error_term, axis=0)
        # weight & bias step (hidden 1 to hidden 2)
        delta_weights_h1_h2 += h2_error_term * inputs_to_h1_layer_activated[:, None]
        delta_bias_h1_h2 = np.sum(h2_error_term, axis=0)
        # weight & bias step (hidden 2 to output)
        delta_weights_h2_o += output_error_term * h1_to_h2_layer_activated[:, None]
        delta_bias_h2_o = np.sum(output_error_term, axis=0)
    # update the weights and biases
    self.weights_i_to_h1 += self.lr * delta_weights_i_h1 / n_records
    self.weights_h1_to_h2 += self.lr * delta_weights_h1_h2 / n_records
    self.weights_h2_to_o += self.lr * delta_weights_h2_o / n_records
    self.bias_i_to_h1 += self.lr * delta_bias_i_h1 / n_records
    self.bias_h1_to_h2 += self.lr * delta_bias_h1_h2 / n_records
    self.bias_h2_to_o += self.lr * delta_bias_h2_o / n_records
Activation function implementation:
def activation_ReLU(self, x):
    return x * (x > 0)

def activation_dReLU(self, x):
    return 1. * (x > 0)

def softmax(self, x):
    z = x - np.max(x)
    return np.exp(z) / np.sum(np.exp(z))

def dsoftmax(self, x):
    # TODO: vectorise math
    vec_len = len(x)
    J = np.zeros((vec_len, vec_len))
    for i in range(vec_len):
        for j in range(vec_len):
            if i == j:
                J[i][j] = x[i] * (1 - x[j])
            else:
                J[i][j] = -x[i] * x[j]
    return J
Loss function implementation:
def categorical_cross_entropy(pred, target):
    return (1 / len(pred)) * -np.sum(target * np.log(pred))
I managed to find the problem.
The neural network is large, so I couldn't paste everything into this question. But if you check my Jupyter Notebook you can see the implementation of my Softmax activation function and how I use it in the train cycle.
The problem with the loss miscalculation was caused by the fact that my Softmax implementation only worked for ndarrays with dim == 1.
During the training step I only ever passed ndarrays with dim 1 to the activation function, so the NN learned well, but my run() function was returning wrong predictions, because I fed the whole test set into it instead of a single row at a time in a for loop. Because of that, it calculated the Softmax "matrix-wise" rather than "row-wise".
This is a very quick fix for it:
def softmax(self, x):
    # TODO: vectorise math to speed up computation
    softmax_result = None
    if x.ndim == 1:
        z = x - np.max(x)
        softmax_result = np.exp(z) / np.sum(np.exp(z))
        return softmax_result
    else:
        softmax_result = []
        for row in x:
            z = row - np.max(row)
            row_softmax_result = np.exp(z) / np.sum(np.exp(z))
            softmax_result.append(row_softmax_result)
        return np.array(softmax_result)
This code should still be vectorised to avoid the for loop and the if where possible, because currently it's ugly and takes too many PC resources. A vectorised sketch follows below.
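For what it's worth, here is a minimal vectorised sketch; it assumes (matching the fix above) that a 2-D input carries one sample per row:

    def softmax(self, x):
        # Subtract the per-row max for numerical stability, then normalise per row.
        # Works for both 1-D vectors and 2-D batches with one sample per row.
        z = x - np.max(x, axis=-1, keepdims=True)
        e = np.exp(z)
        return e / np.sum(e, axis=-1, keepdims=True)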

Why didn't the lasso here give me zero coefficients?

I got the idea of implementing my version of deep feature selection from this paper: http://link.springer.com/chapter/10.1007%2F978-3-319-16706-0_20
The basic idea of deep feature selection, according to this paper, is to add a one-to-one mapping layer before any fully connected hidden layer, and then to add a regularization term (whether lasso or elastic net) that produces zeros in the input-layer weights.
My question is: even though it seems I have implemented the deep feature selection framework well, testing on random data generated by numpy.random.rand(1000, 50) fails to give me any zeros in the input-layer weights. Is this a common thing for lasso-like regularization? Should I adjust the parameters I used for this framework (even larger epochs)? Or did I do something wrong in my code?
class DeepFeatureSelectionMLP:
    def __init__(self, X, Y, hidden_dims=[100], epochs=1000,
                 lambda1=0.001, lambda2=1.0, alpha1=0.001, alpha2=0.0, learning_rate=0.1):
        # Initiate the input layer
        # Get the dimension of the input X
        n_sample, n_feat = X.shape
        n_classes = len(np.unique(Y))
        # One-hot Y
        one_hot_Y = np.zeros((len(Y), n_classes))
        for i, j in enumerate(Y):
            one_hot_Y[i][j] = 1
        self.epochs = epochs
        Y = one_hot_Y
        # Store the original values
        self.X = X
        self.Y = Y
        # Two placeholders with undetermined batch length are created
        self.var_X = tf.placeholder(dtype=tf.float32, shape=[None, n_feat], name='x')
        self.var_Y = tf.placeholder(dtype=tf.float32, shape=[None, n_classes], name='y')
        self.input_layer = One2OneInputLayer(self.var_X)
        self.hidden_layers = []
        layer_input = self.input_layer.output
        # Create hidden layers
        for dim in hidden_dims:
            self.hidden_layers.append(DenseLayer(layer_input, dim))
            layer_input = self.hidden_layers[-1].output
        # Final classification layer, variable Y is passed
        self.softmax_layer = SoftmaxLayer(self.hidden_layers[-1].output, n_classes, self.var_Y)
        n_hidden = len(hidden_dims)
        # regularization terms on coefficients of the input layer
        self.L1_input = tf.reduce_sum(tf.abs(self.input_layer.w))
        self.L2_input = tf.nn.l2_loss(self.input_layer.w)
        # regularization terms on weights of the hidden layers
        L1s = []
        L2_sqrs = []
        for i in xrange(n_hidden):
            L1s.append(tf.reduce_sum(tf.abs(self.hidden_layers[i].w)))
            L2_sqrs.append(tf.nn.l2_loss(self.hidden_layers[i].w))
        L1s.append(tf.reduce_sum(tf.abs(self.softmax_layer.w)))
        L2_sqrs.append(tf.nn.l2_loss(self.softmax_layer.w))
        self.L1 = tf.add_n(L1s)
        self.L2_sqr = tf.add_n(L2_sqrs)
        # Cost with the two regularization terms
        self.cost = self.softmax_layer.cost \
            + lambda1 * (1.0 - lambda2) * 0.5 * self.L2_input + lambda1 * lambda2 * self.L1_input \
            + alpha1 * (1.0 - alpha2) * 0.5 * self.L2_sqr + alpha1 * alpha2 * self.L1
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
        self.y = self.softmax_layer.y

    def train(self, batch_size=100):
        sess = tf.Session()
        sess.run(tf.initialize_all_variables())
        for i in xrange(self.epochs):
            x_batch, y_batch = get_batch(self.X, self.Y, batch_size)
            sess.run(self.optimizer, feed_dict={self.var_X: x_batch, self.var_Y: y_batch})
            if (i + 1) % 50 == 0:
                l = sess.run(self.cost, feed_dict={self.var_X: x_batch, self.var_Y: y_batch})
                print('epoch {0}: global loss = {1}'.format(i, l))
        self.selected_w = sess.run(self.input_layer.w)
        print(self.selected_w)

class One2OneInputLayer(object):
    # One-to-one mapping!
    def __init__(self, input):
        """
        The second dimension of the input:
        for each input, each row is a sample
        and each column is a feature. Since
        this is a one-to-one mapping, n_in equals
        the number of features.
        """
        n_in = input.get_shape()[1].value
        self.input = input
        # Initiate the weight for the input layer
        w = tf.Variable(tf.zeros([n_in, ]), name='w')
        self.w = w
        self.output = self.w * self.input
        self.params = [w]

class DenseLayer(object):
    # Canonical dense layer
    def __init__(self, input, n_out, activation='sigmoid'):
        """
        The second dimension of the input:
        for each input, each row is a sample
        and each column is a feature.
        n_out defines how many nodes there are in the
        hidden layer.
        """
        n_in = input.get_shape()[1].value
        self.input = input
        # Initiate the weights and bias for this layer
        w = tf.Variable(tf.ones([n_in, n_out]), name='w')
        b = tf.Variable(tf.ones([n_out]), name='b')
        output = tf.add(tf.matmul(input, w), b)
        output = activate(output, activation)
        self.w = w
        self.b = b
        self.output = output
        self.params = [w]

class SoftmaxLayer(object):
    def __init__(self, input, n_out, y):
        """
        The second dimension of the input:
        for each input, each row is a sample
        and each column is a feature.
        n_out defines how many nodes there are in the
        output layer.
        """
        n_in = input.get_shape()[1].value
        self.input = input
        # Initiate the weights and biases for this layer
        w = tf.Variable(tf.random_normal([n_in, n_out]), name='w')
        b = tf.Variable(tf.random_normal([n_out]), name='b')
        pred = tf.add(tf.matmul(input, w), b)
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
        self.y = y
        self.w = w
        self.b = b
        self.cost = cost
        self.params = [w]
Gradient descent algorithms such as Adam do not give exact zeros when using L1 regularization, because the gradient step almost never lands exactly on zero. Instead, something like FTRL or proximal Adagrad can give you exact zeros: their proximal step applies soft-thresholding, sign(w) * max(|w| - lambda, 0), which maps small weights to exactly zero. A sketch of the swap follows below.
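As a hedged sketch (not from the original post), the swap in the constructor above might look like the following, using the TF 1.x optimizers. Mapping lambda1*lambda2 onto l1_regularization_strength is an illustrative assumption; you would then drop the explicit L1 terms from the cost, since the optimizer handles the L1 penalty proximally. Note also that the built-in strength applies to every variable the optimizer updates, not just the input layer, so this simplifies the paper's setup:

    # Hypothetical swap: let the optimizer handle L1 proximally so that small
    # input-layer weights land on exactly zero. Values are illustrative only.
    self.optimizer = tf.train.FtrlOptimizer(
        learning_rate=learning_rate,
        l1_regularization_strength=lambda1 * lambda2,
    ).minimize(self.softmax_layer.cost)

    # An alternative with the same property:
    # self.optimizer = tf.train.ProximalAdagradOptimizer(
    #     learning_rate=learning_rate,
    #     l1_regularization_strength=lambda1 * lambda2,
    # ).minimize(self.softmax_layer.cost)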
