Softmax Regression - validation and test predictions shows no improvement - python

I'm currently learning how to use Tensorflow and I'm having some issues to implement this Softmax Regression aplication.
There's no error when compiling but, for some reasson text validation and test predictions shows no improvement, only the train prediction is showing improvement.
I'm using Stocastic Gradient Descent(SGD) with minibatches in order to converge faster, but don't know if this could be causing a trouble somehow.
I'll be thankful if you could share some ideas, here's the full code:
import input_data
import numpy as np
import random as ran
import tensorflow as tf
import matplotlib.pyplot as plt
mnist = input_data.read_data_sets('MNIST_Data/', one_hot=True)
#Features & Data
num_features = 784
num_labels = 10
learning_rate = 0.05
batch_size = 128
num_steps = 5001
train_dataset = mnist.train.images
train_labels = mnist.train.labels
test_dataset = mnist.test.images
test_labels = mnist.test.labels
valid_dataset = mnist.validation.images
valid_labels = mnist.validation.labels
graph = tf.Graph()
with graph.as_default():
tf_train_data = tf.placeholder(tf.float32, shape=(batch_size, num_features))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_data = tf.constant(valid_dataset)
tf_test_data = tf.constant(test_dataset)
W = tf.Variable(tf.truncated_normal([num_features, num_labels]))
b = tf.Variable(tf.zeros([num_labels]))
score_vector = tf.matmul(tf_train_data, W) + b
cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
labels=tf_train_labels, logits=score_vector))
score_valid = tf.matmul(tf_test_data, W) + b
score_test = tf.matmul(tf_valid_data, W) + b
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_func)
train_pred = tf.nn.softmax(score_vector)
valid_pred = tf.nn.softmax(score_valid)
test_pred = tf.nn.softmax(score_test)
def accuracy(predictions, labels):
correct_pred = np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
accu = (100.0 * correct_pred) / predictions.shape[0]
return accu
with tf.Session(graph=graph) as sess:
sess.run(tf.global_variables_initializer())
print("Initialized")
for step in range(num_steps):
offset = np.random.randint(0, train_labels.shape[0] - batch_size - 1)
batch_data = train_dataset[offset:(offset+batch_size), :]
batch_labels = train_labels[offset:(offset+batch_size), :]
feed_dict = {tf_train_data : batch_data,
tf_train_labels : batch_labels
}
_, l, predictions = sess.run([optimizer, cost_func, train_pred],
feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step {0}: {1}".format(step, l))
print("Minibatch accuracy: {:.1f}%".format(
accuracy(predictions, batch_labels)))
print("Validation accuracy: {:.1f}%".format(
accuracy(valid_pred.eval(), valid_labels)))
print("\nTest accuracy: {:.1f}%".format(
accuracy(test_pred.eval(), test_labels)))

It sounds like overfitting, which isn't surprising since this model is basically a linear regression model.
There are few options you can try:
1. add hidden layers + activation functions(https://arxiv.org/abs/1511.07289: elu paper works on mnist data set with vanilla DNN).
2. Use either CNN or RNN, although CNN is more apt for image problems.
3. Use a better optimizer. If you are new, try ADAM optimizer (https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer), and then move onto using momentum with nestrov(https://www.tensorflow.org/api_docs/python/tf/train/MomentumOptimizer)
Without feature engineering, it'll be hard to pull off image classification using just linear regression. Also, you do not need to run softmax on your outcomes since softmax is designed to smooth argmax. Lastly, you should input (None,num_features) into shape of placeholders instead to have variational batch size. This will allow you to directly feed your valid and test datasets into feed_dict without having to create additional tensors.

Related

Low accuracy in Deep Neural Network with Tensorflow

I am following the third Jupyter notebook on Tensorflow examples.
Running problem 4, I tried to implement a function which builds automatically a number of hidden layers, without manually coding the configuration of each layer.
However, the model runs providing very low accuracy (10%) so I thought that maybe such function could not be compatible with the graph builder of Tensorflow.
My code is the following:
def hlayers(n_layers, n_nodes, i_size, a, r=0, keep_p=1):
for i in range(n_layers):
if i > 0:
i_size = n_nodes
w = tf.Variable(tf.truncated_normal([i_size, n_nodes]), name=f'W{i}')
b = tf.Variable(tf.zeros([n_nodes]), name=f'b{i}')
pa = tf.nn.relu(tf.add(tf.matmul(a, w), b))
a = tf.nn.dropout(pa, keep_prob=keep_p, name=f'a{i}')
r += tf.nn.l2_loss(w, name=f'r{i}')
return a, r
batch_size = 128
num_nodes = 1024
beta = 0.01
graph = tf.Graph()
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(
tf.float32,
shape=(batch_size, image_size * image_size),
name='Dataset')
tf_train_labels = tf.placeholder(
tf.float32,
shape=(batch_size, num_labels),
name='Labels')
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
keep_p = tf.placeholder(tf.float32, name='KeepProb')
# Hidden layers.
a, r = hlayers(
n_layers=3,
n_nodes=num_nodes,
i_size=image_size * image_size,
a=tf_train_dataset,
keep_p=keep_p)
# Output layer.
wo = tf.Variable(tf.truncated_normal([num_nodes, num_labels]), name='Wo')
bo = tf.Variable(tf.zeros([num_labels]), name='bo')
logits = tf.add(tf.matmul(a, wo), bo, name='Logits')
loss = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(
labels=tf_train_labels, logits=logits))
# Regularizer.
regularizers = tf.add(r, tf.nn.l2_loss(wo))
loss = tf.reduce_mean(loss + beta * regularizers, name='Loss')
# Optimizer.
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(logits)
a, _ = hlayers(
n_layers=3,
n_nodes=num_nodes,
i_size=image_size * image_size,
a=tf_valid_dataset)
valid_prediction = tf.nn.softmax(tf.add(tf.matmul(a, wo), bo))
a, _ = hlayers(
n_layers=3,
n_nodes=num_nodes,
i_size=image_size * image_size,
a=tf_test_dataset)
test_prediction = tf.nn.softmax(tf.add(tf.matmul(a, wo), bo))
num_steps = 3001
with tf.Session(graph=graph) as session:
tf.global_variables_initializer().run()
print("Initialized")
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {
tf_train_dataset : batch_data,
tf_train_labels : batch_labels,
keep_p : 0.5}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
Weight regularization is stronger with more layers. Therefore you could try to reduce the regularization and see if the accuracy increases.
The problem was caused by nan in loss function and weights, as described in this question.
By introducing a different standard deviation for each weights tensor based on its dimensions (as described in this answer and originally in He et al. [1]) I was able to train successfully the network.
[1]: He et al. (2015) Delving Deep into Rectifiers:
Surpassing Human-Level Performance on ImageNet Classification

AdadeltaOptimizer Example code

Does anyone have an example for code that uses tf.train.AdadeltaOptimizer with good results?
I have a TF graph, that was originally set with tf.train.AdamOptimizer, and is working well. When I replace it with AdadeltaOptimizer, with the default params, it gives lousy results.
I used Cuda 7.5.
The below is example code which works with 'AdadeltaOptimizer' optimizer. It works with 'Adam'. The only difference between them that Adam is insensitive to "learning rate" and 'Adadelta' is sensitive.
I advice you to read more about optimization algorithm (like here).
In your own example, just try to change 'learning rate' to be smaller or bigger (it is named 'hyperparameter optimization').
Note:
From my experience, 'Adam' is a very good optimizer for RNN, better than 'AdaDelta' (using example code, 'Adam' achieve better score much faster). On the other hand, for CNN, SGD+Momentum works best.
Code, which learn MNIST classification using Bi-LSTM:
# Mnist classification using Bi-LSTM
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import numpy as np
mnist = input_data.read_data_sets("MNIST_data", one_hot=True)
learning_rate = 0.01
training_epochs = 100
batch_size = 64
seq_length = 28
heigh_image = 28
hidden_size = 128
class_numer = 10
input = tf.placeholder(tf.float32, [None, None, heigh_image])
target = tf.placeholder(tf.float32, [None, class_numer])
seq_len = tf.placeholder(tf.int32, [None])
def fulconn_layer(input_data, output_dim, activation_func=None):
input_dim = int(input_data.get_shape()[1])
W = tf.Variable(tf.random_normal([input_dim, output_dim]))
b = tf.Variable(tf.random_normal([output_dim]))
if activation_func:
return activation_func(tf.matmul(input_data, W) + b)
else:
return tf.matmul(input_data, W) + b
with tf.name_scope("BiLSTM"):
with tf.variable_scope('forward'):
lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, forget_bias=1.0, state_is_tuple=True)
with tf.variable_scope('backward'):
lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, forget_bias=1.0, state_is_tuple=True)
outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fw_cell, cell_bw=lstm_bw_cell, inputs=input,sequence_length=seq_len, dtype=tf.float32, scope="BiLSTM")
# As we have Bi-LSTM, we have two output, which are not connected. So merge them
outputs = tf.concat(2, outputs)
# As we want do classification, we only need the last output from LSTM.
last_output = outputs[:,0,:]
# Create the final classification layer
yhat = fulconn_layer(last_output, class_numer)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(yhat, target))
optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate).minimize(cost) # AdamOptimizer
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(target, 1), tf.argmax(yhat, 1)), tf.float32))
gpu_opts = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_opts)) as session:
session.run(tf.initialize_all_variables())
print ("Start Learing")
for epoch in range(training_epochs):
for i in range(int(mnist.train.num_examples/batch_size)):
x_batch, y_batch = mnist.train.next_batch(batch_size)
x_batch = x_batch.reshape([batch_size, seq_length, heigh_image])
train_seq_len = np.ones(batch_size) * seq_length
session.run([optimizer], feed_dict={input: x_batch, target: y_batch, seq_len: train_seq_len})
train_accuracy = session.run(accuracy, feed_dict={input: x_batch, target: y_batch, seq_len: train_seq_len})
x_test = mnist.test.images.reshape([-1, seq_length, heigh_image])
y_test = mnist.test.labels
test_seq_len = np.ones(x_test.shape[0]) * seq_length
test_accuracy = session.run(accuracy, feed_dict={input: x_test, target: y_test, seq_len: test_seq_len})
print("epoch: %d, train_accuracy: %3f, test_accuracy: %3f" % (epoch, train_accuracy, test_accuracy))

Strange NaN values for loss function (MLP) in TensorFlow

I hope you can help me. I'm implementing a small multilayer perceptron using TensorFlow and a few tutorials I found on the internet. The problem is that the net is able to learn something, and by this I mean that I am able to somehow optimize the value of the training error and get a decent accuracy, and that's what I was aiming for. However, I am recording with Tensorboard some strange NaN values for the loss function. Quite a lot actually. Here you can see my latest Tensorboard recording of the loss function output. Please all those triangles followed by discontinuities - those are the NaN values, note also that the general trend of the function is what you would expect it to be.
Tensorboard report
I thought that a high learning rate could be the problem, or maybe a net that's too deep, causing the gradients to explode, so I lowered the learning rate and used a single hidden layer (this is the configuration of the image above, and the code below). Nothing changed, I just caused the learning process to be slower.
Tensorflow Code
import tensorflow as tf
import numpy as np
import scipy.io, sys, time
from numpy import genfromtxt
from random import shuffle
#shuffles two related lists #TODO check that the two lists have same size
def shuffle_examples(examples, labels):
examples_shuffled = []
labels_shuffled = []
indexes = list(range(len(examples)))
shuffle(indexes)
for i in indexes:
examples_shuffled.append(examples[i])
labels_shuffled.append(labels[i])
examples_shuffled = np.asarray(examples_shuffled)
labels_shuffled = np.asarray(labels_shuffled)
return examples_shuffled, labels_shuffled
# Import and transform dataset
dataset = scipy.io.mmread(sys.argv[1])
dataset = dataset.astype(np.float32)
all_labels = genfromtxt('oh_labels.csv', delimiter=',')
num_examples = all_labels.shape[0]
dataset, all_labels = shuffle_examples(dataset, all_labels)
# Split dataset into training (66%) and test (33%) set
training_set_size = 2000
training_set = dataset[0:training_set_size]
training_labels = all_labels[0:training_set_size]
test_set = dataset[training_set_size:num_examples]
test_labels = all_labels[training_set_size:num_examples]
test_set, test_labels = shuffle_examples(test_set, test_labels)
# Parameters
learning_rate = 0.0001
training_epochs = 150
mini_batch_size = 100
total_batch = int(num_examples/mini_batch_size)
# Network Parameters
n_hidden_1 = 50 # 1st hidden layer of neurons
#n_hidden_2 = 16 # 2nd hidden layer of neurons
n_input = int(sys.argv[2]) # number of features after LSA
n_classes = 2;
# Tensorflow Graph input
with tf.name_scope("input"):
x = tf.placeholder(np.float32, shape=[None, n_input], name="x-data")
y = tf.placeholder(np.float32, shape=[None, n_classes], name="y-labels")
print("Creating model.")
# Create model
def multilayer_perceptron(x, weights, biases):
with tf.name_scope("h_layer_1"):
# First hidden layer with SIGMOID activation
layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
layer_1 = tf.nn.sigmoid(layer_1)
#with tf.name_scope("h_layer_2"):
# Second hidden layer with SIGMOID activation
#layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
#layer_2 = tf.nn.sigmoid(layer_2)
with tf.name_scope("out_layer"):
# Output layer with SIGMOID activation
out_layer = tf.add(tf.matmul(layer_1, weights['out']), biases['bout'])
out_layer = tf.nn.sigmoid(out_layer)
return out_layer
# Layer weights
with tf.name_scope("weights"):
weights = {
'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1], stddev=0.01, dtype=np.float32)),
#'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2], stddev=0.05, dtype=np.float32)),
'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes], stddev=0.01, dtype=np.float32))
}
# Layer biases
with tf.name_scope("biases"):
biases = {
'b1': tf.Variable(tf.random_normal([n_hidden_1], dtype=np.float32)),
#'b2': tf.Variable(tf.random_normal([n_hidden_2], dtype=np.float32)),
'bout': tf.Variable(tf.random_normal([n_classes], dtype=np.float32))
}
# Construct model
pred = multilayer_perceptron(x, weights, biases)
# Define loss and optimizer
with tf.name_scope("loss"):
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
with tf.name_scope("adam"):
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Initializing the variables
init = tf.initialize_all_variables()
# Define summaries
tf.scalar_summary("loss", cost)
summary_op = tf.merge_all_summaries()
print("Model ready.")
# Launch the graph
with tf.Session() as sess:
sess.run(init)
board_path = sys.argv[3]+time.strftime("%Y%m%d%H%M%S")+"/"
writer = tf.train.SummaryWriter(board_path, graph=tf.get_default_graph())
print("Starting Training.")
for epoch in range(training_epochs):
training_set, training_labels = shuffle_examples(training_set, training_labels)
for i in range(total_batch):
# example loading
minibatch_x = training_set[i*mini_batch_size:(i+1)*mini_batch_size]
minibatch_y = training_labels[i*mini_batch_size:(i+1)*mini_batch_size]
# Run optimization op (backprop) and cost op
_, summary = sess.run([optimizer, summary_op], feed_dict={x: minibatch_x, y: minibatch_y})
# Write log
writer.add_summary(summary, epoch*total_batch+i)
print("Optimization Finished!")
# Test model
test_error = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
accuracy = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(accuracy, np.float32))
test_error, accuracy = sess.run([test_error, accuracy], feed_dict={x: test_set, y: test_labels})
print("Test Error: " + test_error.__str__() + "; Accuracy: " + accuracy.__str__())
print("Tensorboard path: " + board_path)
I'll post the solution here just in case someone gets stuck in a similar way. If you see that plot very carefully, all of the NaN values (the triangles) come on a regular basis, like if at the end of every loop something causes the output of the loss function to just go NaN.
The problem is that, at every loop, I was giving a mini batch of "empty" examples. The problem lies in how I declared my inner training loop:
for i in range(total_batch):
Now what we'd like here is to have Tensorflow go through the entire training set, one minibatch at a time. So let's look at how total_batch was declared:
total_batch = int(num_examples / mini_batch_size)
That is not quite what we'd want to do - as we want to consider the training set only. So changing this line to:
total_batch = int(training_set_size / mini_batch_size)
Fixed the problem.
It is to be noted that Tensorflow seemed to ignore those "empty" batches, computing NaN for the loss but not updating the gradients - that's why the trend of the loss was one of a net that's learning something.

MLP on TensorFlow is giving the same prediction for all observations after the training

I am trying to train a sparse data with an MLP to predict a forecast. However, the forecast on the test data is giving the same value for all observations. Once I omit the activation function from each layer, the outcome starts being different.
my code is below:
# imports
import numpy as np
import tensorflow as tf
import random
import json
from scipy.sparse import rand
# Parameters
learning_rate= 0.1
training_epochs = 50
batch_size = 100
# Network Parameters
m= 1000 #number of features
n= 5000 # number of observations
hidden_layers = [5,2,4,1,6]
n_layers = len(hidden_layers)
n_input = m
n_classes = 1 # it's a regression problem
X_train = rand(n, m, density=0.2,format = 'csr').todense().astype(np.float32)
Y_train = np.random.randint(4, size=n)
X_test = rand(200, m, density=0.2,format = 'csr').todense().astype(np.float32)
Y_test = np.random.randint(4, size=200)
# tf Graph input
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None])
# Store layers weight & bias
weights = {}
biases = {}
weights['h1']=tf.Variable(tf.random_normal([n_input, hidden_layers[0]])) #first matrice
biases['b1'] = tf.Variable(tf.random_normal([hidden_layers[0]]))
for i in xrange(2,n_layers+1):
weights['h'+str(i)]= tf.Variable(tf.random_normal([hidden_layers[i-2], hidden_layers[i-1]]))
biases['b'+str(i)] = tf.Variable(tf.random_normal([hidden_layers[i-1]]))
weights['out']=tf.Variable(tf.random_normal([hidden_layers[-1], 1])) #matrice between last layer and output
biases['out']= tf.Variable(tf.random_normal([1]))
# Create model
def multilayer_perceptron(_X, _weights, _biases):
layer_begin = tf.nn.relu(tf.add(tf.matmul(_X, _weights['h1'],a_is_sparse=True), _biases['b1']))
for layer in xrange(2,n_layers+1):
layer_begin = tf.nn.relu(tf.add(tf.matmul(layer_begin, _weights['h'+str(layer)]), _biases['b'+str(layer)]))
#layer_end = tf.nn.dropout(layer_begin, 0.3)
return tf.matmul(layer_begin, _weights['out'])+ _biases['out']
# Construct model
pred = multilayer_perceptron(x, weights, biases)
# Define loss and optimizer
rmse = tf.reduce_sum(tf.abs(y-pred))/tf.reduce_sum(tf.abs(y)) # rmse loss
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(rmse) # Adam Optimizer
# Initializing the variables
init = tf.initialize_all_variables()
with tf.Session() as sess:
sess.run(init)
#training
for step in xrange(training_epochs):
# Generate a minibatch.
start = random.randrange(1, n - batch_size)
#print start
batch_xs=X_train[start:start+batch_size,:]
batch_ys =Y_train[start:start+batch_size]
#printing
_,rmseRes = sess.run([optimizer, rmse] , feed_dict={x: batch_xs, y: batch_ys} )
if step % 20 == 0:
print "rmse [%s] = %s" % (step, rmseRes)
#testing
pred_test = multilayer_perceptron(X_test, weights, biases)
print "prediction", pred_test.eval()[:20]
print "actual = ", Y_test[:20]
PS: I am generating randomly my data just to reproduce the error. My data is sparse in fact, pretty similar to the one generated randomly. The problem I want to solve is: MLP is giving the same prediction for all observations in the test data.
That's a sign that your training failed. With GoogeLeNet Imagenet training I've seen it label everything as "nematode" when started with a bad choice of hyper-parameters. Things to check -- does your training loss decrease? If it doesn't decrease, try different learning rates/architectures. If it decreases to zero maybe your loss is wrong like was case here

How could I use batch normalization in TensorFlow?

I would like to use batch normalization in TensorFlow. I found the related C++ source code in core/ops/nn_ops.cc. However, I did not find it documented on tensorflow.org.
BN has different semantics in MLP and CNN, so I am not sure what exactly this BN does.
I did not find a method called MovingMoments either.
Update July 2016 The easiest way to use batch normalization in TensorFlow is through the higher-level interfaces provided in either contrib/layers, tflearn, or slim.
Previous answer if you want to DIY:
The documentation string for this has improved since the release - see the docs comment in the master branch instead of the one you found. It clarifies, in particular, that it's the output from tf.nn.moments.
You can see a very simple example of its use in the batch_norm test code. For a more real-world use example, I've included below the helper class and use notes that I scribbled up for my own use (no warranty provided!):
"""A helper class for managing batch normalization state.
This class is designed to simplify adding batch normalization
(http://arxiv.org/pdf/1502.03167v3.pdf) to your model by
managing the state variables associated with it.
Important use note: The function get_assigner() returns
an op that must be executed to save the updated state.
A suggested way to do this is to make execution of the
model optimizer force it, e.g., by:
update_assignments = tf.group(bn1.get_assigner(),
bn2.get_assigner())
with tf.control_dependencies([optimizer]):
optimizer = tf.group(update_assignments)
"""
import tensorflow as tf
class ConvolutionalBatchNormalizer(object):
"""Helper class that groups the normalization logic and variables.
Use:
ewma = tf.train.ExponentialMovingAverage(decay=0.99)
bn = ConvolutionalBatchNormalizer(depth, 0.001, ewma, True)
update_assignments = bn.get_assigner()
x = bn.normalize(y, train=training?)
(the output x will be batch-normalized).
"""
def __init__(self, depth, epsilon, ewma_trainer, scale_after_norm):
self.mean = tf.Variable(tf.constant(0.0, shape=[depth]),
trainable=False)
self.variance = tf.Variable(tf.constant(1.0, shape=[depth]),
trainable=False)
self.beta = tf.Variable(tf.constant(0.0, shape=[depth]))
self.gamma = tf.Variable(tf.constant(1.0, shape=[depth]))
self.ewma_trainer = ewma_trainer
self.epsilon = epsilon
self.scale_after_norm = scale_after_norm
def get_assigner(self):
"""Returns an EWMA apply op that must be invoked after optimization."""
return self.ewma_trainer.apply([self.mean, self.variance])
def normalize(self, x, train=True):
"""Returns a batch-normalized version of x."""
if train:
mean, variance = tf.nn.moments(x, [0, 1, 2])
assign_mean = self.mean.assign(mean)
assign_variance = self.variance.assign(variance)
with tf.control_dependencies([assign_mean, assign_variance]):
return tf.nn.batch_norm_with_global_normalization(
x, mean, variance, self.beta, self.gamma,
self.epsilon, self.scale_after_norm)
else:
mean = self.ewma_trainer.average(self.mean)
variance = self.ewma_trainer.average(self.variance)
local_beta = tf.identity(self.beta)
local_gamma = tf.identity(self.gamma)
return tf.nn.batch_norm_with_global_normalization(
x, mean, variance, local_beta, local_gamma,
self.epsilon, self.scale_after_norm)
Note that I called it a ConvolutionalBatchNormalizer because it pins the use of tf.nn.moments to sum across axes 0, 1, and 2, whereas for non-convolutional use you might only want axis 0.
Feedback appreciated if you use it.
As of TensorFlow 1.0 (February 2017) there's also the high-level tf.layers.batch_normalization API included in TensorFlow itself.
It's super simple to use:
# Set this to True for training and False for testing
training = tf.placeholder(tf.bool)
x = tf.layers.dense(input_x, units=100)
x = tf.layers.batch_normalization(x, training=training)
x = tf.nn.relu(x)
...except that it adds extra ops to the graph (for updating its mean and variance variables) in such a way that they won't be dependencies of your training op. You can either just run the ops separately:
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
sess.run([train_op, extra_update_ops], ...)
or add the update ops as dependencies of your training op manually, then just run your training op as normal:
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_update_ops):
train_op = optimizer.minimize(loss)
...
sess.run([train_op], ...)
The following works fine for me, it does not require invoking EMA-apply outside.
import numpy as np
import tensorflow as tf
from tensorflow.python import control_flow_ops
def batch_norm(x, n_out, phase_train, scope='bn'):
"""
Batch normalization on convolutional maps.
Args:
x: Tensor, 4D BHWD input maps
n_out: integer, depth of input maps
phase_train: boolean tf.Varialbe, true indicates training phase
scope: string, variable scope
Return:
normed: batch-normalized maps
"""
with tf.variable_scope(scope):
beta = tf.Variable(tf.constant(0.0, shape=[n_out]),
name='beta', trainable=True)
gamma = tf.Variable(tf.constant(1.0, shape=[n_out]),
name='gamma', trainable=True)
batch_mean, batch_var = tf.nn.moments(x, [0,1,2], name='moments')
ema = tf.train.ExponentialMovingAverage(decay=0.5)
def mean_var_with_update():
ema_apply_op = ema.apply([batch_mean, batch_var])
with tf.control_dependencies([ema_apply_op]):
return tf.identity(batch_mean), tf.identity(batch_var)
mean, var = tf.cond(phase_train,
mean_var_with_update,
lambda: (ema.average(batch_mean), ema.average(batch_var)))
normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
return normed
Example:
import math
n_in, n_out = 3, 16
ksize = 3
stride = 1
phase_train = tf.placeholder(tf.bool, name='phase_train')
input_image = tf.placeholder(tf.float32, name='input_image')
kernel = tf.Variable(tf.truncated_normal([ksize, ksize, n_in, n_out],
stddev=math.sqrt(2.0/(ksize*ksize*n_out))),
name='kernel')
conv = tf.nn.conv2d(input_image, kernel, [1,stride,stride,1], padding='SAME')
conv_bn = batch_norm(conv, n_out, phase_train)
relu = tf.nn.relu(conv_bn)
with tf.Session() as session:
session.run(tf.initialize_all_variables())
for i in range(20):
test_image = np.random.rand(4,32,32,3)
sess_outputs = session.run([relu],
{input_image.name: test_image, phase_train.name: True})
There is also an "official" batch normalization layer coded by the developers. They don't have very good docs on how to use it but here is how to use it (according to me):
from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm
def batch_norm_layer(x,train_phase,scope_bn):
bn_train = batch_norm(x, decay=0.999, center=True, scale=True,
updates_collections=None,
is_training=True,
reuse=None, # is this right?
trainable=True,
scope=scope_bn)
bn_inference = batch_norm(x, decay=0.999, center=True, scale=True,
updates_collections=None,
is_training=False,
reuse=True, # is this right?
trainable=True,
scope=scope_bn)
z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference)
return z
to actually use it you need to create a placeholder for train_phase that indicates if you are in training or inference phase (as in train_phase = tf.placeholder(tf.bool, name='phase_train')). Its value can be filled during inference or training with a tf.session as in:
test_error = sess.run(fetches=cross_entropy, feed_dict={x: batch_xtest, y_:batch_ytest, train_phase: False})
or during training:
sess.run(fetches=train_step, feed_dict={x: batch_xs, y_:batch_ys, train_phase: True})
I'm pretty sure this is correct according to the discussion in github.
Seems there is another useful link:
http://r2rt.com/implementing-batch-normalization-in-tensorflow.html
You can simply use the build-in batch_norm layer:
batch_norm = tf.cond(is_train,
lambda: tf.contrib.layers.batch_norm(prev, activation_fn=tf.nn.relu, is_training=True, reuse=None),
lambda: tf.contrib.layers.batch_norm(prev, activation_fn =tf.nn.relu, is_training=False, reuse=True))
where prev is the output of your previous layer (can be both fully-connected or a convolutional layer) and is_train is a boolean placeholder. Just use batch_norm as the input to the next layer, then.
Since someone recently edited this, I'd like to clarify that this is no longer an issue.
This answer does not seem correct When phase_train is set to false, it still updates the ema mean and variance. This can be verified with the following code snippet.
x = tf.placeholder(tf.float32, [None, 20, 20, 10], name='input')
phase_train = tf.placeholder(tf.bool, name='phase_train')
# generate random noise to pass into batch norm
x_gen = tf.random_normal([50,20,20,10])
pt_false = tf.Variable(tf.constant(True))
#generate a constant variable to pass into batch norm
y = x_gen.eval()
[bn, bn_vars] = batch_norm(x, 10, phase_train)
tf.initialize_all_variables().run()
train_step = lambda: bn.eval({x:x_gen.eval(), phase_train:True})
test_step = lambda: bn.eval({x:y, phase_train:False})
test_step_c = lambda: bn.eval({x:y, phase_train:True})
# Verify that this is different as expected, two different x's have different norms
print(train_step()[0][0][0])
print(train_step()[0][0][0])
# Verify that this is same as expected, same x's (y) have same norm
print(train_step_c()[0][0][0])
print(train_step_c()[0][0][0])
# THIS IS DIFFERENT but should be they same, should only be reading from the ema.
print(test_step()[0][0][0])
print(test_step()[0][0][0])
Using TensorFlow built-in batch_norm layer, below is the code to load data, build a network with one hidden ReLU layer and L2 normalization and introduce batch normalization for both hidden and out layer. This runs fine and trains fine. Just FYI this example is mostly built upon the data and code from Udacity DeepLearning course.
P.S. Yes, parts of it were discussed one way or another in answers earlier but I decided to gather in one code snippet everything so that you have example of whole network training process with Batch Normalization and its evaluation
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
pickle_file = '/home/maxkhk/Documents/Udacity/DeepLearningCourse/SourceCode/tensorflow/examples/udacity/notMNIST.pickle'
with open(pickle_file, 'rb') as f:
save = pickle.load(f)
train_dataset = save['train_dataset']
train_labels = save['train_labels']
valid_dataset = save['valid_dataset']
valid_labels = save['valid_labels']
test_dataset = save['test_dataset']
test_labels = save['test_labels']
del save # hint to help gc free up memory
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
image_size = 28
num_labels = 10
def reformat(dataset, labels):
dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
# Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
#for NeuralNetwork model code is below
#We will use SGD for training to save our time. Code is from Assignment 2
#beta is the new parameter - controls level of regularization.
#Feel free to play with it - the best one I found is 0.001
#notice, we introduce L2 for both biases and weights of all layers
batch_size = 128
beta = 0.001
#building tensorflow graph
graph = tf.Graph()
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
#introduce batchnorm
tf_train_dataset_bn = tf.contrib.layers.batch_norm(tf_train_dataset)
#now let's build our new hidden layer
#that's how many hidden neurons we want
num_hidden_neurons = 1024
#its weights
hidden_weights = tf.Variable(
tf.truncated_normal([image_size * image_size, num_hidden_neurons]))
hidden_biases = tf.Variable(tf.zeros([num_hidden_neurons]))
#now the layer itself. It multiplies data by weights, adds biases
#and takes ReLU over result
hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset_bn, hidden_weights) + hidden_biases)
#adding the batch normalization layerhi()
hidden_layer_bn = tf.contrib.layers.batch_norm(hidden_layer)
#time to go for output linear layer
#out weights connect hidden neurons to output labels
#biases are added to output labels
out_weights = tf.Variable(
tf.truncated_normal([num_hidden_neurons, num_labels]))
out_biases = tf.Variable(tf.zeros([num_labels]))
#compute output
out_layer = tf.matmul(hidden_layer_bn,out_weights) + out_biases
#our real output is a softmax of prior result
#and we also compute its cross-entropy to get our loss
#Notice - we introduce our L2 here
loss = (tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
out_layer, tf_train_labels) +
beta*tf.nn.l2_loss(hidden_weights) +
beta*tf.nn.l2_loss(hidden_biases) +
beta*tf.nn.l2_loss(out_weights) +
beta*tf.nn.l2_loss(out_biases)))
#now we just minimize this loss to actually train the network
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
#nice, now let's calculate the predictions on each dataset for evaluating the
#performance so far
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(out_layer)
valid_relu = tf.nn.relu( tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
valid_prediction = tf.nn.softmax( tf.matmul(valid_relu, out_weights) + out_biases)
test_relu = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
test_prediction = tf.nn.softmax(tf.matmul(test_relu, out_weights) + out_biases)
#now is the actual training on the ANN we built
#we will run it for some number of steps and evaluate the progress after
#every 500 steps
#number of steps we will train our ANN
num_steps = 3001
#actual training
with tf.Session(graph=graph) as session:
tf.initialize_all_variables().run()
print("Initialized")
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
So a simple example of the use of this batchnorm class:
from bn_class import *
with tf.name_scope('Batch_norm_conv1') as scope:
ewma = tf.train.ExponentialMovingAverage(decay=0.99)
bn_conv1 = ConvolutionalBatchNormalizer(num_filt_1, 0.001, ewma, True)
update_assignments = bn_conv1.get_assigner()
a_conv1 = bn_conv1.normalize(a_conv1, train=bn_train)
h_conv1 = tf.nn.relu(a_conv1)

Categories