Related
I'm currently learning how to use Tensorflow and I'm having some issues to implement this Softmax Regression aplication.
There's no error when compiling but, for some reasson text validation and test predictions shows no improvement, only the train prediction is showing improvement.
I'm using Stocastic Gradient Descent(SGD) with minibatches in order to converge faster, but don't know if this could be causing a trouble somehow.
I'll be thankful if you could share some ideas, here's the full code:
import input_data
import numpy as np
import random as ran
import tensorflow as tf
import matplotlib.pyplot as plt
mnist = input_data.read_data_sets('MNIST_Data/', one_hot=True)
#Features & Data
num_features = 784
num_labels = 10
learning_rate = 0.05
batch_size = 128
num_steps = 5001
train_dataset = mnist.train.images
train_labels = mnist.train.labels
test_dataset = mnist.test.images
test_labels = mnist.test.labels
valid_dataset = mnist.validation.images
valid_labels = mnist.validation.labels
graph = tf.Graph()
with graph.as_default():
tf_train_data = tf.placeholder(tf.float32, shape=(batch_size, num_features))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_data = tf.constant(valid_dataset)
tf_test_data = tf.constant(test_dataset)
W = tf.Variable(tf.truncated_normal([num_features, num_labels]))
b = tf.Variable(tf.zeros([num_labels]))
score_vector = tf.matmul(tf_train_data, W) + b
cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
labels=tf_train_labels, logits=score_vector))
score_valid = tf.matmul(tf_test_data, W) + b
score_test = tf.matmul(tf_valid_data, W) + b
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_func)
train_pred = tf.nn.softmax(score_vector)
valid_pred = tf.nn.softmax(score_valid)
test_pred = tf.nn.softmax(score_test)
def accuracy(predictions, labels):
correct_pred = np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
accu = (100.0 * correct_pred) / predictions.shape[0]
return accu
with tf.Session(graph=graph) as sess:
sess.run(tf.global_variables_initializer())
print("Initialized")
for step in range(num_steps):
offset = np.random.randint(0, train_labels.shape[0] - batch_size - 1)
batch_data = train_dataset[offset:(offset+batch_size), :]
batch_labels = train_labels[offset:(offset+batch_size), :]
feed_dict = {tf_train_data : batch_data,
tf_train_labels : batch_labels
}
_, l, predictions = sess.run([optimizer, cost_func, train_pred],
feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step {0}: {1}".format(step, l))
print("Minibatch accuracy: {:.1f}%".format(
accuracy(predictions, batch_labels)))
print("Validation accuracy: {:.1f}%".format(
accuracy(valid_pred.eval(), valid_labels)))
print("\nTest accuracy: {:.1f}%".format(
accuracy(test_pred.eval(), test_labels)))
It sounds like overfitting, which isn't surprising since this model is basically a linear regression model.
There are few options you can try:
1. add hidden layers + activation functions(https://arxiv.org/abs/1511.07289: elu paper works on mnist data set with vanilla DNN).
2. Use either CNN or RNN, although CNN is more apt for image problems.
3. Use a better optimizer. If you are new, try ADAM optimizer (https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer), and then move onto using momentum with nestrov(https://www.tensorflow.org/api_docs/python/tf/train/MomentumOptimizer)
Without feature engineering, it'll be hard to pull off image classification using just linear regression. Also, you do not need to run softmax on your outcomes since softmax is designed to smooth argmax. Lastly, you should input (None,num_features) into shape of placeholders instead to have variational batch size. This will allow you to directly feed your valid and test datasets into feed_dict without having to create additional tensors.
I am new to Tensorflow and I am working for training with LSTM-RNN in Tensorflow.
I need to save the model so that I can restore and run with Test data again.
I am not sure what to save.
I need to save sess or I need to save pred
When I save sess, restore and test the Test data as
one_hot_predictions, accuracy, final_loss = sess.run(
[pred, accuracy, cost],
feed_dict={
x: X_test,
y: one_hot(y_test)
}
)
Then the error is unknown for pred.
Since I am new to Tensorflow, I am not sure what to save and what to restore to test with new data?
X_train = load_X(X_train_path)
X_test = load_X(X_test_path)
y_train = load_y(y_train_path)
y_test = load_y(y_test_path)
# proof that it actually works for the skeptical: replace labelled classes with random classes to train on
#for i in range(len(y_train)):
# y_train[i] = randint(0, 5)
# Input Data
training_data_count = len(X_train) # 4519 training series (with 50% overlap between each serie)
test_data_count = len(X_test) # 1197 test series
n_input = len(X_train[0][0]) # num input parameters per timestep
n_hidden = 34 # Hidden layer num of features
n_classes = 6
#updated for learning-rate decay
# calculated as: decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
decaying_learning_rate = True
learning_rate = 0.0025 #used if decaying_learning_rate set to False
init_learning_rate = 0.005
decay_rate = 0.96 #the base of the exponential in the decay
decay_steps = 100000 #used in decay every 60000 steps with a base of 0.96
global_step = tf.Variable(0, trainable=False)
lambda_loss_amount = 0.0015
training_iters = training_data_count *300 # Loop 300 times on the dataset, ie 300 epochs
batch_size = 512
display_iter = batch_size*8 # To show test set accuracy during training
#Utility functions for training:
def LSTM_RNN(_X, _weights, _biases):
# model architecture based on "guillaume-chevalier" and "aymericdamien" under the MIT license.
_X = tf.transpose(_X, [1, 0, 2]) # permute n_steps and batch_size
_X = tf.reshape(_X, [-1, n_input])
# Rectifies Linear Unit activation function used
_X = tf.nn.relu(tf.matmul(_X, _weights['hidden']) + _biases['hidden'])
# Split data because rnn cell needs a list of inputs for the RNN inner loop
_X = tf.split(_X, n_steps, 0)
# Define two stacked LSTM cells (two recurrent layers deep) with tensorflow
lstm_cell_1 = tf.contrib.rnn.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
lstm_cell_2 = tf.contrib.rnn.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
lstm_cells = tf.contrib.rnn.MultiRNNCell([lstm_cell_1, lstm_cell_2], state_is_tuple=True)
outputs, states = tf.contrib.rnn.static_rnn(lstm_cells, _X, dtype=tf.float32)
# A single output is produced, in style of "many to one" classifier, refer to http://karpathy.github.io/2015/05/21/rnn-effectiveness/ for details
lstm_last_output = outputs[-1]
# Linear activation
return tf.matmul(lstm_last_output, _weights['out']) + _biases['out']
def extract_batch_size(_train, _labels, _unsampled, batch_size):
# Fetch a "batch_size" amount of data and labels from "(X|y)_train" data.
# Elements of each batch are chosen randomly, without replacement, from X_train with corresponding label from Y_train
# unsampled_indices keeps track of sampled data ensuring non-replacement. Resets when remaining datapoints < batch_size
shape = list(_train.shape)
shape[0] = batch_size
batch_s = np.empty(shape)
batch_labels = np.empty((batch_size,1))
for i in range(batch_size):
# Loop index
# index = random sample from _unsampled (indices)
index = random.choice(_unsampled)
batch_s[i] = _train[index]
batch_labels[i] = _labels[index]
_unsampled.remove(index)
return batch_s, batch_labels, _unsampled
def one_hot(y_):
# One hot encoding of the network outputs
# e.g.: [[5], [0], [3]] --> [[0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0]]
y_ = y_.reshape(len(y_))
n_values = int(np.max(y_)) + 1
return np.eye(n_values)[np.array(y_, dtype=np.int32)] # Returns FLOATS
# Graph input/output
x = tf.placeholder(tf.float32, [None, n_steps, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])
# Graph weights
weights = {
'hidden': tf.Variable(tf.random_normal([n_input, n_hidden])), # Hidden layer weights
'out': tf.Variable(tf.random_normal([n_hidden, n_classes], mean=1.0))
}
biases = {
'hidden': tf.Variable(tf.random_normal([n_hidden])),
'out': tf.Variable(tf.random_normal([n_classes]))
}
pred = LSTM_RNN(x, weights, biases)
# Loss, optimizer and evaluation
l2 = lambda_loss_amount * sum(
tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables()
) # L2 loss prevents this overkill neural network to overfit the data
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=pred)) + l2 # Softmax loss
if decaying_learning_rate:
learning_rate = tf.train.exponential_decay(init_learning_rate, global_step*batch_size, decay_steps, decay_rate, staircase=True)
#decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps) #exponentially decayed learning rate
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost,global_step=global_step) # Adam Optimizer
correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
#Train the network:
test_losses = []
test_accuracies = []
train_losses = []
train_accuracies = []
sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True))
init = tf.global_variables_initializer()
# Add ops to save and restore all the variables.
saver = tf.train.Saver()
sess.run(init)
# Perform Training steps with "batch_size" amount of data at each loop.
# Elements of each batch are chosen randomly, without replacement, from X_train,
# restarting when remaining datapoints < batch_size
step = 1
time_start = time.time()
unsampled_indices = range(0,len(X_train))
while step * batch_size <= training_iters:
#print (sess.run(learning_rate)) #decaying learning rate
#print (sess.run(global_step)) # global number of iterations
if len(unsampled_indices) < batch_size:
unsampled_indices = range(0,len(X_train))
batch_xs, raw_labels, unsampled_indicies = extract_batch_size(X_train, y_train, unsampled_indices, batch_size)
batch_ys = one_hot(raw_labels)
# check that encoded output is same length as num_classes, if not, pad it
if len(batch_ys[0]) < n_classes:
temp_ys = np.zeros((batch_size, n_classes))
temp_ys[:batch_ys.shape[0],:batch_ys.shape[1]] = batch_ys
batch_ys = temp_ys
# Fit training using batch data
_, loss, acc = sess.run(
[optimizer, cost, accuracy],
feed_dict={
x: batch_xs,
y: batch_ys
}
)
train_losses.append(loss)
train_accuracies.append(acc)
# Evaluate network only at some steps for faster training:
if (step*batch_size % display_iter == 0) or (step == 1) or (step * batch_size > training_iters):
# To not spam console, show training accuracy/loss in this "if"
print("Iter #" + str(step*batch_size) + \
": Learning rate = " + "{:.6f}".format(sess.run(learning_rate)) + \
": Batch Loss = " + "{:.6f}".format(loss) + \
", Accuracy = {}".format(acc))
# Evaluation on the test set (no learning made here - just evaluation for diagnosis)
loss, acc = sess.run(
[cost, accuracy],
feed_dict={
x: X_test,
y: one_hot(y_test)
}
)
test_losses.append(loss)
test_accuracies.append(acc)
print("PERFORMANCE ON TEST SET: " + \
"Batch Loss = {}".format(loss) + \
", Accuracy = {}".format(acc))
step += 1
print("Optimization Finished!")
EDIT:
I can save the model as
print("Optimization Finished!")
save_path = saver.save(sess, "/home/test/venv/TFCodes/HumanActivityRecognition/model.ckpt")
Then I tried to restore and ok, I can restore. But I don't know how to test with test data.
My restore code is
X_test = load_X(X_test_path)
with tf.Session() as sess:
saver = tf.train.import_meta_graph('/home/nyan/venv/TFCodes/HumanActivityRecognition/model.ckpt.meta')
saver.restore(sess, tf.train.latest_checkpoint('./'))
print("Model restored.")
all_vars = tf.trainable_variables()
for i in range(len(all_vars)):
name = all_vars[i].name
values = sess.run(name)
print('name', name)
#print('value', values)
print('shape',values.shape)
result = sess.run(prediction, feed_dict={X: X_test})
print("loss:", l, "prediction:", result, "true Y:", y_data)
# print char using dic
result_str = [idx2char[c] for c in np.squeeze(res
ult)]
print("\tPrediction str:", ''.join(result_str))
The output is
Model restored.
('name', u'Variable_1:0')
('shape', (36, 34))
('name', u'Variable_2:0')
('shape', (34, 6))
('name', u'Variable_3:0')
('shape', (34,))
('name', u'Variable_4:0')
('shape', (6,))
('name', u'rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0')
('shape', (68, 136))
('name', u'rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0')
('shape', (136,))
('name', u'rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0')
('shape', (68, 136))
('name', u'rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0')
('shape', (136,))
Traceback (most recent call last):
File "restore.py", line 74, in <module>
result = sess.run(prediction, feed_dict={X: X_test})
NameError: name 'prediction' is not defined
How to test the model restored?
What I find the easiest is the tf.saved_model.simple_save() function. It saves the computation graph you use, the weights, the input and the output in a .pb model and the weight variables.
You can later restore this model or even put it on ml-engine or use tf serving.
An example code snippit with a keras model and applied on YOLO:
inputs = {"image_bytes": model.input,
"shape": image_shape}
outputs = {"boxes": boxes,
"scores": scores,
"classes": classes}
tf.saved_model.simple_save(sess, "saved_model/", inputs, outputs)
I'm working on an RBF network using Tensorflow, but there's this error that comes up at line 112 that says this: ValueError: Cannot feed value of shape (40, 13) for Tensor 'Placeholder:0', which has shape '(?, 12)'
Here's my code below. I created my own activation function for my RBF network by following this tutorial. Also, if there is anything else you notice that needs to be fixed, please point it out to me, because I am very new to Tensorflow so it would be helpful to get any feedback I can get.
import tensorflow as tf
import numpy as np
import math
from sklearn import datasets
from sklearn.model_selection import train_test_split
from tensorflow.python.framework import ops
ops.reset_default_graph()
RANDOM_SEED = 42
tf.set_random_seed(RANDOM_SEED)
boston = datasets.load_boston()
data = boston["data"]
target = boston["target"]
N_INSTANCES = data.shape[0]
N_INPUT = data.shape[1] - 1
N_CLASSES = 3
TEST_SIZE = 0.1
TRAIN_SIZE = int(N_INSTANCES * (1 - TEST_SIZE))
batch_size = 40
training_epochs = 400
learning_rate = 0.001
display_step = 20
hidden_size = 200
target_ = np.zeros((N_INSTANCES, N_CLASSES))
data_train, data_test, target_train, target_test = train_test_split(data, target_, test_size=0.1, random_state=100)
x_data = tf.placeholder(shape=[None, N_INPUT], dtype=tf.float32)
y_target = tf.placeholder(shape=[None, N_CLASSES], dtype=tf.float32)
# creates activation function
def gaussian_function(input_layer):
initial = math.exp(-2*math.pow(input_layer, 2))
return initial
np_gaussian_function = np.vectorize(gaussian_function)
def d_gaussian_function(input_layer):
initial = -4 * input_layer * math.exp(-2*math.pow(input_layer, 2))
return initial
np_d_gaussian_function = np.vectorize(d_gaussian_function)
np_d_gaussian_function_32 = lambda input_layer: np_d_gaussian_function(input_layer).astype(np.float32)
def tf_d_gaussian_function(input_layer, name=None):
with ops.name_scope(name, "d_gaussian_function", [input_layer]) as name:
y = tf.py_func(np_d_gaussian_function_32, [input_layer],[tf.float32], name=name, stateful=False)
return y[0]
def py_func(func, inp, Tout, stateful=True, name=None, grad=None):
rnd_name = 'PyFunGrad' + str(np.random.randint(0, 1E+8))
tf.RegisterGradient(rnd_name)(grad)
g = tf.get_default_graph()
with g.gradient_override_map({"PyFunc": rnd_name}):
return tf.py_func(func, inp, Tout, stateful=stateful, name=name)
def gaussian_function_grad(op, grad):
input_variable = op.inputs[0]
n_gr = tf_d_gaussian_function(input_variable)
return grad * n_gr
np_gaussian_function_32 = lambda input_layer: np_gaussian_function(input_layer).astype(np.float32)
def tf_gaussian_function(input_layer, name=None):
with ops.name_scope(name, "gaussian_function", [input_layer]) as name:
y = py_func(np_gaussian_function_32, [input_layer], [tf.float32], name=name, grad=gaussian_function_grad)
return y[0]
# end of defining activation function
def rbf_network(input_layer, weights):
layer1 = tf.matmul(tf_gaussian_function(input_layer), weights['h1'])
layer2 = tf.matmul(tf_gaussian_function(layer1), weights['h2'])
output = tf.matmul(tf_gaussian_function(layer2), weights['output'])
return output
weights = {
'h1': tf.Variable(tf.random_normal([N_INPUT, hidden_size], stddev=0.1)),
'h2': tf.Variable(tf.random_normal([hidden_size, hidden_size], stddev=0.1)),
'output': tf.Variable(tf.random_normal([hidden_size, N_CLASSES], stddev=0.1))
}
pred = rbf_network(x_data, weights)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y_target))
my_opt = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y_target, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
init = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init)
# Training loop
for epoch in range(training_epochs):
avg_cost = 0.
total_batch = int(data_train.shape[0] / batch_size)
for i in range(total_batch):
randidx = np.random.randint(int(TRAIN_SIZE), size=batch_size)
batch_xs = data_train[randidx, :]
batch_ys = target_train[randidx, :]
sess.run(my_opt, feed_dict={x_data: batch_xs, y_target: batch_ys})
avg_cost += sess.run(cost, feed_dict={x_data: batch_xs, y_target: batch_ys})/total_batch
if epoch % display_step == 0:
print("Epoch: %03d/%03d cost: %.9f" % (epoch, training_epochs, avg_cost))
train_accuracy = sess.run(accuracy, feed_dict={x_data: batch_xs, y_target: batch_ys})
print("Training accuracy: %.3f" % train_accuracy)
test_acc = sess.run(accuracy, feed_dict={x_data: data_test, y_target: target_test})
print("Test accuracy: %.3f" % (test_acc))
sess.close()
As it has been said, you should have N_Input = data.shape[1].
Actually data.shape[0] relates the number of realisations you have in your data-set and data.shape[1] tells us how many features the network should consider.
The number of features is by definition the size of the input layer regardless how many data you will propose (via feed_dict) to your network.
Plus boston dataset is a regression problem while softmax_cross_entropy is a cost function for classification problem. You can try tf.square to evaluate the euclidean distance between what you are predicting and what you want :
cost = tf.reduce_mean(tf.square(pred - y_target))
You will see that your network is learning, even though the accuracy is not very high.
Edit :
Your code is actually learning well but you used the wrong tool to measure it.
Mainly, your errors still reside in the fact that you are dealing with regression problem not with a classification problem.
In classification problem you can evaluate the accuracy of your on-going learning process using
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y_target, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
It consists in checking if the predicted class is the same as the expected class, for an input among x_test.
In regression problem, doing so is senseless since you are looking for a real number i.e. an infinity of possibility from the classification point of view.
In regression problem you can estimate the error (mean or whatever) between predicted values and expected values. We can use what I suggested below :
cost = tf.reduce_mean(tf.square(pred - y_target))
I modified your code consequently here it is
pred = rbf_network(x_data, weights)
cost = tf.reduce_mean(tf.square(pred - y_target))
my_opt = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
#correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y_target, 1))
#accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
init = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init)
plt.figure("Error evolution")
plt.xlabel("N_epoch")
plt.ylabel("Error evolution")
tol = 5e-4
epoch, err=0, 1
# Training loop
while epoch <= training_epochs and err >= tol:
avg_cost = 0.
total_batch = int(data_train.shape[0] / batch_size)
for i in range(total_batch):
randidx = np.random.randint(int(TRAIN_SIZE), size=batch_size)
batch_xs = data_train[randidx, :]
batch_ys = target_train[randidx, :]
sess.run(my_opt, feed_dict={x_data: batch_xs, y_target: batch_ys})
avg_cost += sess.run(cost, feed_dict={x_data: batch_xs, y_target: batch_ys})/total_batch
plt.plot(epoch, avg_cost, marker='o', linestyle="none", c='k')
plt.pause(0.05)
err = avg_cost
if epoch % 10 == 0:
print("Epoch: {}/{} err = {}".format(epoch, training_epochs, avg_cost))
epoch +=1
print ("End of learning process")
print ("Final epoch = {}/{} ".format(epoch, training_epochs))
print ("Final error = {}".format(err) )
sess.close()
The output is
Epoch: 0/400 err = 0.107879924503
Epoch: 10/400 err = 0.00520248359747
Epoch: 20/400 err = 0.000651647908274
End of learning process
Final epoch = 26/400
Final error = 0.000474644409471
We plot the evolution of the error in the training through the different epochs
I'm also new to Tensorflow and this is my first answer in stackoverflow. I tried your code and I got the same error.
You can see in the error code ValueError: Cannot feed value of shape (40, 13) for Tensor 'Placeholder:0', which has shape '(?, 12), that there is a mismatch in the shapes of the first placeholder:
x_data = tf.placeholder(shape=[None, N_INPUT], dtype=tf.float32)
so I'm not sure why the N_INPUT has a -1 in this line
N_INPUT = data.shape[1] - 1
I tried removing it and the code runs. Though it looks like the network isn't learning.
While this implementation will do the job, I don't think its the most optimal RBF implementation. You are using a fixed size of 200 centroids (hidden units) in your RBF. This causes the centroids to not be optimally placed and the width of your Gaussian basis function to not be optimally sized. Typically the centroids should be learned in an unsupervised pre-stage by using K Means or any other kind of clustering algorithm.
So your 1st training stage would involve finding the centroids/centers of the RBFs, and the 2nd stage would be the actual classification/regression using the RBF Network
Does anyone have an example for code that uses tf.train.AdadeltaOptimizer with good results?
I have a TF graph, that was originally set with tf.train.AdamOptimizer, and is working well. When I replace it with AdadeltaOptimizer, with the default params, it gives lousy results.
I used Cuda 7.5.
The below is example code which works with 'AdadeltaOptimizer' optimizer. It works with 'Adam'. The only difference between them that Adam is insensitive to "learning rate" and 'Adadelta' is sensitive.
I advice you to read more about optimization algorithm (like here).
In your own example, just try to change 'learning rate' to be smaller or bigger (it is named 'hyperparameter optimization').
Note:
From my experience, 'Adam' is a very good optimizer for RNN, better than 'AdaDelta' (using example code, 'Adam' achieve better score much faster). On the other hand, for CNN, SGD+Momentum works best.
Code, which learn MNIST classification using Bi-LSTM:
# Mnist classification using Bi-LSTM
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import numpy as np
mnist = input_data.read_data_sets("MNIST_data", one_hot=True)
learning_rate = 0.01
training_epochs = 100
batch_size = 64
seq_length = 28
heigh_image = 28
hidden_size = 128
class_numer = 10
input = tf.placeholder(tf.float32, [None, None, heigh_image])
target = tf.placeholder(tf.float32, [None, class_numer])
seq_len = tf.placeholder(tf.int32, [None])
def fulconn_layer(input_data, output_dim, activation_func=None):
input_dim = int(input_data.get_shape()[1])
W = tf.Variable(tf.random_normal([input_dim, output_dim]))
b = tf.Variable(tf.random_normal([output_dim]))
if activation_func:
return activation_func(tf.matmul(input_data, W) + b)
else:
return tf.matmul(input_data, W) + b
with tf.name_scope("BiLSTM"):
with tf.variable_scope('forward'):
lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, forget_bias=1.0, state_is_tuple=True)
with tf.variable_scope('backward'):
lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, forget_bias=1.0, state_is_tuple=True)
outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fw_cell, cell_bw=lstm_bw_cell, inputs=input,sequence_length=seq_len, dtype=tf.float32, scope="BiLSTM")
# As we have Bi-LSTM, we have two output, which are not connected. So merge them
outputs = tf.concat(2, outputs)
# As we want do classification, we only need the last output from LSTM.
last_output = outputs[:,0,:]
# Create the final classification layer
yhat = fulconn_layer(last_output, class_numer)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(yhat, target))
optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate).minimize(cost) # AdamOptimizer
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(target, 1), tf.argmax(yhat, 1)), tf.float32))
gpu_opts = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_opts)) as session:
session.run(tf.initialize_all_variables())
print ("Start Learing")
for epoch in range(training_epochs):
for i in range(int(mnist.train.num_examples/batch_size)):
x_batch, y_batch = mnist.train.next_batch(batch_size)
x_batch = x_batch.reshape([batch_size, seq_length, heigh_image])
train_seq_len = np.ones(batch_size) * seq_length
session.run([optimizer], feed_dict={input: x_batch, target: y_batch, seq_len: train_seq_len})
train_accuracy = session.run(accuracy, feed_dict={input: x_batch, target: y_batch, seq_len: train_seq_len})
x_test = mnist.test.images.reshape([-1, seq_length, heigh_image])
y_test = mnist.test.labels
test_seq_len = np.ones(x_test.shape[0]) * seq_length
test_accuracy = session.run(accuracy, feed_dict={input: x_test, target: y_test, seq_len: test_seq_len})
print("epoch: %d, train_accuracy: %3f, test_accuracy: %3f" % (epoch, train_accuracy, test_accuracy))
I would like to use batch normalization in TensorFlow. I found the related C++ source code in core/ops/nn_ops.cc. However, I did not find it documented on tensorflow.org.
BN has different semantics in MLP and CNN, so I am not sure what exactly this BN does.
I did not find a method called MovingMoments either.
Update July 2016 The easiest way to use batch normalization in TensorFlow is through the higher-level interfaces provided in either contrib/layers, tflearn, or slim.
Previous answer if you want to DIY:
The documentation string for this has improved since the release - see the docs comment in the master branch instead of the one you found. It clarifies, in particular, that it's the output from tf.nn.moments.
You can see a very simple example of its use in the batch_norm test code. For a more real-world use example, I've included below the helper class and use notes that I scribbled up for my own use (no warranty provided!):
"""A helper class for managing batch normalization state.
This class is designed to simplify adding batch normalization
(http://arxiv.org/pdf/1502.03167v3.pdf) to your model by
managing the state variables associated with it.
Important use note: The function get_assigner() returns
an op that must be executed to save the updated state.
A suggested way to do this is to make execution of the
model optimizer force it, e.g., by:
update_assignments = tf.group(bn1.get_assigner(),
bn2.get_assigner())
with tf.control_dependencies([optimizer]):
optimizer = tf.group(update_assignments)
"""
import tensorflow as tf
class ConvolutionalBatchNormalizer(object):
"""Helper class that groups the normalization logic and variables.
Use:
ewma = tf.train.ExponentialMovingAverage(decay=0.99)
bn = ConvolutionalBatchNormalizer(depth, 0.001, ewma, True)
update_assignments = bn.get_assigner()
x = bn.normalize(y, train=training?)
(the output x will be batch-normalized).
"""
def __init__(self, depth, epsilon, ewma_trainer, scale_after_norm):
self.mean = tf.Variable(tf.constant(0.0, shape=[depth]),
trainable=False)
self.variance = tf.Variable(tf.constant(1.0, shape=[depth]),
trainable=False)
self.beta = tf.Variable(tf.constant(0.0, shape=[depth]))
self.gamma = tf.Variable(tf.constant(1.0, shape=[depth]))
self.ewma_trainer = ewma_trainer
self.epsilon = epsilon
self.scale_after_norm = scale_after_norm
def get_assigner(self):
"""Returns an EWMA apply op that must be invoked after optimization."""
return self.ewma_trainer.apply([self.mean, self.variance])
def normalize(self, x, train=True):
"""Returns a batch-normalized version of x."""
if train:
mean, variance = tf.nn.moments(x, [0, 1, 2])
assign_mean = self.mean.assign(mean)
assign_variance = self.variance.assign(variance)
with tf.control_dependencies([assign_mean, assign_variance]):
return tf.nn.batch_norm_with_global_normalization(
x, mean, variance, self.beta, self.gamma,
self.epsilon, self.scale_after_norm)
else:
mean = self.ewma_trainer.average(self.mean)
variance = self.ewma_trainer.average(self.variance)
local_beta = tf.identity(self.beta)
local_gamma = tf.identity(self.gamma)
return tf.nn.batch_norm_with_global_normalization(
x, mean, variance, local_beta, local_gamma,
self.epsilon, self.scale_after_norm)
Note that I called it a ConvolutionalBatchNormalizer because it pins the use of tf.nn.moments to sum across axes 0, 1, and 2, whereas for non-convolutional use you might only want axis 0.
Feedback appreciated if you use it.
As of TensorFlow 1.0 (February 2017) there's also the high-level tf.layers.batch_normalization API included in TensorFlow itself.
It's super simple to use:
# Set this to True for training and False for testing
training = tf.placeholder(tf.bool)
x = tf.layers.dense(input_x, units=100)
x = tf.layers.batch_normalization(x, training=training)
x = tf.nn.relu(x)
...except that it adds extra ops to the graph (for updating its mean and variance variables) in such a way that they won't be dependencies of your training op. You can either just run the ops separately:
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
sess.run([train_op, extra_update_ops], ...)
or add the update ops as dependencies of your training op manually, then just run your training op as normal:
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_update_ops):
train_op = optimizer.minimize(loss)
...
sess.run([train_op], ...)
The following works fine for me, it does not require invoking EMA-apply outside.
import numpy as np
import tensorflow as tf
from tensorflow.python import control_flow_ops
def batch_norm(x, n_out, phase_train, scope='bn'):
"""
Batch normalization on convolutional maps.
Args:
x: Tensor, 4D BHWD input maps
n_out: integer, depth of input maps
phase_train: boolean tf.Varialbe, true indicates training phase
scope: string, variable scope
Return:
normed: batch-normalized maps
"""
with tf.variable_scope(scope):
beta = tf.Variable(tf.constant(0.0, shape=[n_out]),
name='beta', trainable=True)
gamma = tf.Variable(tf.constant(1.0, shape=[n_out]),
name='gamma', trainable=True)
batch_mean, batch_var = tf.nn.moments(x, [0,1,2], name='moments')
ema = tf.train.ExponentialMovingAverage(decay=0.5)
def mean_var_with_update():
ema_apply_op = ema.apply([batch_mean, batch_var])
with tf.control_dependencies([ema_apply_op]):
return tf.identity(batch_mean), tf.identity(batch_var)
mean, var = tf.cond(phase_train,
mean_var_with_update,
lambda: (ema.average(batch_mean), ema.average(batch_var)))
normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
return normed
Example:
import math
n_in, n_out = 3, 16
ksize = 3
stride = 1
phase_train = tf.placeholder(tf.bool, name='phase_train')
input_image = tf.placeholder(tf.float32, name='input_image')
kernel = tf.Variable(tf.truncated_normal([ksize, ksize, n_in, n_out],
stddev=math.sqrt(2.0/(ksize*ksize*n_out))),
name='kernel')
conv = tf.nn.conv2d(input_image, kernel, [1,stride,stride,1], padding='SAME')
conv_bn = batch_norm(conv, n_out, phase_train)
relu = tf.nn.relu(conv_bn)
with tf.Session() as session:
session.run(tf.initialize_all_variables())
for i in range(20):
test_image = np.random.rand(4,32,32,3)
sess_outputs = session.run([relu],
{input_image.name: test_image, phase_train.name: True})
There is also an "official" batch normalization layer coded by the developers. They don't have very good docs on how to use it but here is how to use it (according to me):
from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm
def batch_norm_layer(x,train_phase,scope_bn):
bn_train = batch_norm(x, decay=0.999, center=True, scale=True,
updates_collections=None,
is_training=True,
reuse=None, # is this right?
trainable=True,
scope=scope_bn)
bn_inference = batch_norm(x, decay=0.999, center=True, scale=True,
updates_collections=None,
is_training=False,
reuse=True, # is this right?
trainable=True,
scope=scope_bn)
z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference)
return z
to actually use it you need to create a placeholder for train_phase that indicates if you are in training or inference phase (as in train_phase = tf.placeholder(tf.bool, name='phase_train')). Its value can be filled during inference or training with a tf.session as in:
test_error = sess.run(fetches=cross_entropy, feed_dict={x: batch_xtest, y_:batch_ytest, train_phase: False})
or during training:
sess.run(fetches=train_step, feed_dict={x: batch_xs, y_:batch_ys, train_phase: True})
I'm pretty sure this is correct according to the discussion in github.
Seems there is another useful link:
http://r2rt.com/implementing-batch-normalization-in-tensorflow.html
You can simply use the build-in batch_norm layer:
batch_norm = tf.cond(is_train,
lambda: tf.contrib.layers.batch_norm(prev, activation_fn=tf.nn.relu, is_training=True, reuse=None),
lambda: tf.contrib.layers.batch_norm(prev, activation_fn =tf.nn.relu, is_training=False, reuse=True))
where prev is the output of your previous layer (can be both fully-connected or a convolutional layer) and is_train is a boolean placeholder. Just use batch_norm as the input to the next layer, then.
Since someone recently edited this, I'd like to clarify that this is no longer an issue.
This answer does not seem correct When phase_train is set to false, it still updates the ema mean and variance. This can be verified with the following code snippet.
x = tf.placeholder(tf.float32, [None, 20, 20, 10], name='input')
phase_train = tf.placeholder(tf.bool, name='phase_train')
# generate random noise to pass into batch norm
x_gen = tf.random_normal([50,20,20,10])
pt_false = tf.Variable(tf.constant(True))
#generate a constant variable to pass into batch norm
y = x_gen.eval()
[bn, bn_vars] = batch_norm(x, 10, phase_train)
tf.initialize_all_variables().run()
train_step = lambda: bn.eval({x:x_gen.eval(), phase_train:True})
test_step = lambda: bn.eval({x:y, phase_train:False})
test_step_c = lambda: bn.eval({x:y, phase_train:True})
# Verify that this is different as expected, two different x's have different norms
print(train_step()[0][0][0])
print(train_step()[0][0][0])
# Verify that this is same as expected, same x's (y) have same norm
print(train_step_c()[0][0][0])
print(train_step_c()[0][0][0])
# THIS IS DIFFERENT but should be they same, should only be reading from the ema.
print(test_step()[0][0][0])
print(test_step()[0][0][0])
Using TensorFlow built-in batch_norm layer, below is the code to load data, build a network with one hidden ReLU layer and L2 normalization and introduce batch normalization for both hidden and out layer. This runs fine and trains fine. Just FYI this example is mostly built upon the data and code from Udacity DeepLearning course.
P.S. Yes, parts of it were discussed one way or another in answers earlier but I decided to gather in one code snippet everything so that you have example of whole network training process with Batch Normalization and its evaluation
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
pickle_file = '/home/maxkhk/Documents/Udacity/DeepLearningCourse/SourceCode/tensorflow/examples/udacity/notMNIST.pickle'
with open(pickle_file, 'rb') as f:
save = pickle.load(f)
train_dataset = save['train_dataset']
train_labels = save['train_labels']
valid_dataset = save['valid_dataset']
valid_labels = save['valid_labels']
test_dataset = save['test_dataset']
test_labels = save['test_labels']
del save # hint to help gc free up memory
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
image_size = 28
num_labels = 10
def reformat(dataset, labels):
dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
# Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
/ predictions.shape[0])
#for NeuralNetwork model code is below
#We will use SGD for training to save our time. Code is from Assignment 2
#beta is the new parameter - controls level of regularization.
#Feel free to play with it - the best one I found is 0.001
#notice, we introduce L2 for both biases and weights of all layers
batch_size = 128
beta = 0.001
#building tensorflow graph
graph = tf.Graph()
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
#introduce batchnorm
tf_train_dataset_bn = tf.contrib.layers.batch_norm(tf_train_dataset)
#now let's build our new hidden layer
#that's how many hidden neurons we want
num_hidden_neurons = 1024
#its weights
hidden_weights = tf.Variable(
tf.truncated_normal([image_size * image_size, num_hidden_neurons]))
hidden_biases = tf.Variable(tf.zeros([num_hidden_neurons]))
#now the layer itself. It multiplies data by weights, adds biases
#and takes ReLU over result
hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset_bn, hidden_weights) + hidden_biases)
#adding the batch normalization layerhi()
hidden_layer_bn = tf.contrib.layers.batch_norm(hidden_layer)
#time to go for output linear layer
#out weights connect hidden neurons to output labels
#biases are added to output labels
out_weights = tf.Variable(
tf.truncated_normal([num_hidden_neurons, num_labels]))
out_biases = tf.Variable(tf.zeros([num_labels]))
#compute output
out_layer = tf.matmul(hidden_layer_bn,out_weights) + out_biases
#our real output is a softmax of prior result
#and we also compute its cross-entropy to get our loss
#Notice - we introduce our L2 here
loss = (tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
out_layer, tf_train_labels) +
beta*tf.nn.l2_loss(hidden_weights) +
beta*tf.nn.l2_loss(hidden_biases) +
beta*tf.nn.l2_loss(out_weights) +
beta*tf.nn.l2_loss(out_biases)))
#now we just minimize this loss to actually train the network
optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
#nice, now let's calculate the predictions on each dataset for evaluating the
#performance so far
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(out_layer)
valid_relu = tf.nn.relu( tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
valid_prediction = tf.nn.softmax( tf.matmul(valid_relu, out_weights) + out_biases)
test_relu = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
test_prediction = tf.nn.softmax(tf.matmul(test_relu, out_weights) + out_biases)
#now is the actual training on the ANN we built
#we will run it for some number of steps and evaluate the progress after
#every 500 steps
#number of steps we will train our ANN
num_steps = 3001
#actual training
with tf.Session(graph=graph) as session:
tf.initialize_all_variables().run()
print("Initialized")
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
So a simple example of the use of this batchnorm class:
from bn_class import *
with tf.name_scope('Batch_norm_conv1') as scope:
ewma = tf.train.ExponentialMovingAverage(decay=0.99)
bn_conv1 = ConvolutionalBatchNormalizer(num_filt_1, 0.001, ewma, True)
update_assignments = bn_conv1.get_assigner()
a_conv1 = bn_conv1.normalize(a_conv1, train=bn_train)
h_conv1 = tf.nn.relu(a_conv1)