GAN not converging. Discriminator loss keeps increasing - python

I am making a simple generative adversarial network on the MNIST dataset.
This is my implementation:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/",one_hot=True)
def noise(batch_size):
    return np.random.uniform(-1, 1, (batch_size, 100))
learning_rate = 0.001
batch_size = 128
input = tf.placeholder('float', [None, 100])
real_data = tf.placeholder('float', [None, 784])
def generator(x):
    weights = {
        'hl1' : tf.Variable(tf.random_normal([100, 200])),
        'ol' : tf.Variable(tf.random_normal([200, 784]))
    }
    biases = {
        'hl1' : tf.Variable(tf.random_normal([200])),
        'ol' : tf.Variable(tf.random_normal([784]))
    }
    hl1 = tf.add(tf.matmul(x, weights['hl1']), biases['hl1'])
    ol = tf.nn.sigmoid(tf.add(tf.matmul(hl1, weights['ol']), biases['ol']))
    return ol
def discriminator(x):
    weights = {
        'hl1' : tf.Variable(tf.random_normal([784, 200])),
        'ol' : tf.Variable(tf.random_normal([200, 1]))
    }
    biases = {
        'hl1' : tf.Variable(tf.random_normal([200])),
        'ol' : tf.Variable(tf.random_normal([1]))
    }
    hl1 = tf.add(tf.matmul(x, weights['hl1']), biases['hl1'])
    ol = tf.nn.sigmoid(tf.add(tf.matmul(hl1, weights['ol']), biases['ol']))
    return ol
with tf.variable_scope("G"):
    G = generator(input)
with tf.variable_scope("D"):
    D_real = discriminator(real_data)
with tf.variable_scope("D", reuse = True):
    D_gen = discriminator(G)
generator_parameters = [x for x in tf.trainable_variables() if x.name.startswith('G/')]
discriminator_parameters = [x for x in tf.trainable_variables() if x.name.startswith('D/')]
G_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_gen, labels=tf.ones_like(D_gen)))
D_real_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_real, labels=tf.ones_like(D_real)))
D_fake_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_gen, labels=tf.zeros_like(D_gen)))
D_total_loss = tf.add(D_fake_loss, D_real_loss)
G_train = tf.train.AdamOptimizer(learning_rate).minimize(G_loss,var_list=generator_parameters)
D_train = tf.train.AdamOptimizer(learning_rate).minimize(D_total_loss,var_list=discriminator_parameters)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
loss_g_function = []
loss_d_function = []
for epoch in range(200):
    for iteration in range(int(len(mnist.train.images)/batch_size)):
        real_batch, _ = mnist.train.next_batch(batch_size)
        _, d_err = sess.run([D_train, D_total_loss], feed_dict = {real_data : real_batch, input : noise(batch_size)})
        _, g_err = sess.run([G_train, G_loss], feed_dict = {input : noise(batch_size)})
    print("Epoch = ", epoch)
    print("D_loss = ", d_err)
    print("G_loss = ", g_err)
    loss_g_function.append(g_err)
    loss_d_function.append(d_err)
# Visualizing
import matplotlib.pyplot as plt
test_noise = noise(1)
plt.subplot(2, 2, 1)
plt.plot(test_noise[0])
plt.title("Noise")
plt.subplot(2, 2, 2)
plt.imshow(np.reshape(sess.run(G, feed_dict = {input : test_noise})[0], [28, 28]))
plt.title("Generated Image")
plt.subplot(2, 2, 3)
plt.plot(loss_d_function, 'r')
plt.xlabel("Epochs")
plt.ylabel("Discriminator Loss")
plt.title("D-Loss")
plt.subplot(2, 2, 4)
plt.plot(loss_g_function, 'b')
plt.xlabel("Epochs")
plt.ylabel("Generator Loss")
plt.title("G_Loss")
plt.show()
I have tried lr = 0.001, lr = 0.0001, and lr = 0.00003.
These are my results : https://imgur.com/a/6KUnO1H
What could be the reason? My weights are initialized randomly from a normal distribution. Also, please check the loss functions, are they correct?

Issues:
It has just a single layer:
hl1 = tf.add(tf.matmul(x, weights['hl1']), biases['hl1'])
ol = tf.nn.sigmoid(tf.add(tf.matmul(hl1, weights['ol']), biases['ol']))
The network above, used for both the discriminator and the generator, has no activation on its first layer. That effectively collapses it into a single layer: y = act(w2(w1*x + b1) + b2) = act(w*x + b).
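For example, a minimal sketch of adding a nonlinearity to the hidden layer (leaky ReLU is just one common choice, not necessarily the only fix):
hl1 = tf.nn.leaky_relu(tf.add(tf.matmul(x, weights['hl1']), biases['hl1']))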
Sigmoid applied twice:
ol = tf.nn.sigmoid(tf.add(tf.matmul(hl1, weights['ol']) ...
D_real_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(...)
As mentioned in the comments, the sigmoid is applied twice: once inside the discriminator and again inside sigmoid_cross_entropy_with_logits, which already expects raw logits.
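A sketch of the fix: have the discriminator return logits and let the loss apply the sigmoid exactly once:
# inside discriminator(): return logits, no sigmoid here
ol = tf.add(tf.matmul(hl1, weights['ol']), biases['ol'])
# the loss applies the sigmoid internally
D_real_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(logits=D_real, labels=tf.ones_like(D_real)))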
Weight initializations:
tf.Variable(tf.random_normal([784, 200]))
With a sigmoid activation, large weights push the unit into saturation, so the gradients become very small and the weights effectively stop changing (big w plus a very small delta(w)). This may be why, when I run the above code, the loss barely changes. It is better to follow standard practice and use something like xavier_initializer().
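For example, a sketch using the TF 1.x contrib initializer (zero biases are just a common default, not required):
init = tf.contrib.layers.xavier_initializer()
weights = {
    'hl1' : tf.Variable(init([784, 200])),
    'ol' : tf.Variable(init([200, 1]))
}
biases = {
    'hl1' : tf.Variable(tf.zeros([200])),
    'ol' : tf.Variable(tf.zeros([1]))
}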
Dynamic range inconsistencies:
The input to the generator is in the dynamic range [-1, 1], it gets multiplied by weights in roughly [-1, 1], but it is output into a [0, 1] range. There is nothing wrong with this; a bias can learn to map the output range. But it is better to use an output activation that produces [-1, 1], like tanh, so the network can learn faster. If tanh is used as the generator's output activation, then the real images fed to the discriminator need to be scaled to [-1, 1] for training consistency.
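A sketch of that change, assuming MNIST pixels in [0, 1]:
# generator output in [-1, 1]
ol = tf.nn.tanh(tf.add(tf.matmul(hl1, weights['ol']), biases['ol']))
# scale the real images to the same range before feeding the discriminator
real_batch = real_batch * 2.0 - 1.0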
With the above changes, you can get something similar to:
The above network is a really simple one and the output quality is not great. I deliberately kept the complexity unchanged to see what kind of output one can get out of such a simple network.
You can build a bigger network (that includes CNN) and as well try out recent GAN models to obtain better quality results.
Code for reproducing the above can be obtained from here.

Related

Variables not updated after training in TensorFlow even when initialized with uniform random for a simple logistic regression

I am learning TensorFlow by implementing a simple logistic regression classifier that outputs whether a digit is a 7 or not when fed an MNIST image. I am using stochastic gradient descent. The crux of the TensorFlow code is:
# Maximum number of epochs
MaxEpochs = 1
# Learning rate
eta = 1e-2
ops.reset_default_graph()
n_x = 784
n_y = 1
x_tf = tf.placeholder(tf.float32, shape = [n_x, 1], name = 'x_tf')
y_tf = tf.placeholder(tf.float32, shape = [n_y, 1], name = 'y_tf')
w_tf = tf.get_variable(name = "w_tf", shape = [n_x, 1], initializer = tf.initializers.random_uniform());
b_tf = tf.get_variable(name = "b_tf", shape = [n_y, 1], initializer = tf.initializers.random_uniform());
z_tf = tf.add(tf.matmul(w_tf, x_tf, transpose_a = True), b_tf, name = 'z_tf')
yPred_tf = tf.sigmoid(z_tf, name = 'yPred_tf')
Loss_tf = tf.nn.sigmoid_cross_entropy_with_logits(logits = yPred_tf, labels = y_tf, name = 'Loss_tf')
with tf.name_scope('Training'):
    optimizer_tf = tf.train.GradientDescentOptimizer(learning_rate = eta)
    train_step = optimizer_tf.minimize(Loss_tf)
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for Epoch in range(MaxEpochs):
        for Sample in range(len(XTrain)):
            x = XTrain[Sample]
            y = YTrain[Sample].reshape([-1,1])
            Train_sample = {x_tf: x, y_tf: y}
            sess.run(train_step, feed_dict = Train_sample)
toc = time.time()
print('\nElapsed time is: ', toc-tic,'s');
It builds the following graph (tensorboard related code has been removed for convenience):
The problem is that even though the weights and biases are initialised randomly (non-zero), the neuron isn't being trained. The weight histogram is as follows.
I didn't want to post something so trivial, but I am at my wit's end. Sorry for the long post. Thank you very much in advance for any guidance. A little side note: it takes 93.35 s to run, while it only took about 10 seconds when I did this with numpy (same stochastic implementation); why would this be so?
EDIT:
The bias plot over the course of the training is as follows.
EDIT: The entire code, in case the issue is coming from something outside what I previously suspected.
import tensorflow as tf
import numpy as np
import h5py
from tensorflow.python.framework import ops
import time
mnist = tf.keras.datasets.mnist
(x_train, y_train),(x_test, y_test) = mnist.load_data()
def Flatten(Im):
    FlatImArray = Im.reshape([Im.shape[0],-1,1])
    return FlatImArray
DigitTested = 7
# Sperating the images with 7s from the rest
TrainIdxs = [];
for i in range(len(y_train)):
    if(y_train[i] == DigitTested):
        TrainIdxs.append(i)
TestIdxs = [];
for i in range(len(y_test)):
    if(y_test[i] == DigitTested):
        TestIdxs.append(i)
# Preparing the Datasets for training and testing
XTrain = Flatten(x_train);
YTrain = np.zeros([len(x_train),1]);
YTrain[TrainIdxs] = 1;
XTest = Flatten(x_test);
YTest = np.zeros([len(x_test),1]);
YTest[TestIdxs] = 1;
tic = time.time()
# Maximum number of epochs
MaxEpochs = 1
# Learning rate
eta = 1e-2
# Number of Epochs after which the neuron is validated
ValidationInterval = 1
ops.reset_default_graph() # to be able to rerun the model without overwriting tf variables
n_x = 784
n_y = 1
x_tf = tf.placeholder(tf.float32, shape = [n_x, 1], name = 'x_tf')
y_tf = tf.placeholder(tf.float32, shape = [n_y, 1], name = 'y_tf')
w_tf = tf.get_variable(name = "w_tf", shape = [n_x, 1], initializer = tf.initializers.random_uniform());
b_tf = tf.get_variable(name = "b_tf", shape = [n_y, 1], initializer = tf.initializers.random_uniform());
z_tf = tf.add(tf.matmul(w_tf, x_tf, transpose_a = True), b_tf, name = 'z_tf')
yPred_tf = tf.sigmoid(z_tf, name = 'yPred_tf')
Loss_tf = tf.nn.sigmoid_cross_entropy_with_logits(logits = yPred_tf, labels = y_tf, name = 'Loss_tf')
with tf.name_scope('Training'):
    optimizer_tf = tf.train.GradientDescentOptimizer(learning_rate = eta)
    train_step = optimizer_tf.minimize(Loss_tf)
writer = tf.summary.FileWriter(r"C:\Users\braja\Documents\TBSummaries\MNIST1NTF\2")
tf.summary.histogram('Weights', w_tf)
tf.summary.scalar('Loss', tf.reshape(Loss_tf, []))
tf.summary.scalar('Bias', tf.reshape(b_tf, []))
merged_summary = tf.summary.merge_all()
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for Epoch in range(MaxEpochs):
        for Sample in range(len(XTrain)):
            x = XTrain[Sample]
            y = YTrain[Sample].reshape([-1,1])
            Train_sample = {x_tf: x, y_tf: y}
            MergedSumm, _ = sess.run([merged_summary, train_step], feed_dict = Train_sample)
            writer.add_summary(summary = MergedSumm, global_step = Sample)
        if((Epoch+1) % ValidationInterval == 0):
            ValidationError = 0
            for Sample in range(len(XTest)):
                x = XTest[Sample]
                y = YTest[Sample].reshape([-1,1])
                Test_sample = {x_tf: x, y_tf: y}
                yPred = sess.run(yPred_tf, feed_dict = Test_sample)
                ValidationError += abs(yPred - YTest[Sample])
            print('Validation Error at', Epoch+1,'Epoch:', ValidationError);
    writer.add_graph(tf.Session().graph)
    writer.close()
toc = time.time()
print('\nElapsed time is: ', toc-tic,'s');
Looking at the bias value it looks like you are seeing saturation of the sigmoid function.
This happens when you push the sigmoid input (z_tf) to the extreme ends of the sigmoid function. When that happens, the gradient is so small that training stagnates. The probable cause is that you have doubled up on sigmoid functions: sigmoid_cross_entropy_with_logits applies a sigmoid to its input, but you have already applied one yourself. Try removing one of them.
In addition, by default tf.initializers.random_uniform() produces random values between 0 and 1. You probably want to initialise your weights and biases symmetrically about 0 and with really small values to start with. This can be done by passing the minval and maxval arguments to tf.initializers.random_uniform(). The weights can then grow during training, and again this helps prevent sigmoid saturation.
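A minimal sketch of both changes, replacing the corresponding lines above (the exact minval/maxval values are just illustrative):
w_tf = tf.get_variable("w_tf", shape=[n_x, 1],
                       initializer=tf.initializers.random_uniform(minval=-0.01, maxval=0.01))
b_tf = tf.get_variable("b_tf", shape=[n_y, 1],
                       initializer=tf.initializers.random_uniform(minval=-0.01, maxval=0.01))
z_tf = tf.add(tf.matmul(w_tf, x_tf, transpose_a=True), b_tf, name='z_tf')
# pass the raw logits; the loss applies the sigmoid internally
Loss_tf = tf.nn.sigmoid_cross_entropy_with_logits(logits=z_tf, labels=y_tf, name='Loss_tf')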

TensorFlow Adam optimizer in a .pb model

I have developed TensorFlow code that uses the Adam optimizer, then saved the graph, exported the .pb model, and loaded it correctly. My problem is that when I feed it a new input image I don't get the same result as the one given by this code:
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img, array_to_img
import cv2
import tensorflow as tf
import numpy
import numpy as np
def get_image2(imgSrc):
    img = load_img(imgSrc, True)  # this is a PIL image
    x = img_to_array(img)  # this is a Numpy array with shape (3, 150, 150)
    #x = x.reshape((1,) + x.shape)
    x = x.astype(float)
    x *= 1./255.
    #x = cv2.resize(x,(512,512))
    return x
def sobel2(image):
    # Shape = height x width.
    #image = tf.placeholder(tf.float32, shape=[None, None])
    # Shape = 1 x height x width x 1.
    image_resized = image #tf.expand_dims(image, 0)
    Gx = tf.nn.conv2d(image_resized, sobel_x_filter, strides=[1, 1, 1, 1], padding='SAME')
    Gy = tf.nn.conv2d(image_resized, sobel_y_filter, strides=[1, 1, 1, 1], padding='SAME')
    #grad = tf.sqrt(tf.add(tf.pow(Gx,2),tf.pow(Gy,2)))
    #grad = tf.pow(Gx,2) + tf.pow(Gy,2)
    #grad = tf.truediv(grad,3.)
    #grad = tf.reshape(grad, img_shape)
    return Gx, Gy
image = get_image2('1.jpg')
img_shape = image.shape
print img_shape
img_h, img_w,_= img_shape
sobel_x = tf.constant([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], tf.float32)
sobel_x_filter = tf.reshape(sobel_x, [3, 3, 1, 1])
sobel_y_filter = tf.transpose(sobel_x_filter, [1, 0, 2, 3])
input_img = tf.placeholder(tf.float32, shape=[1,img_shape[0],img_shape[1],img_shape[2]], name="input_img")
#input_img = tf.placeholder(tf.float32, [1, 512, 512, 1], name="input_img")
gain = tf.Variable(tf.constant(1, dtype=tf.float32, shape=[1,img_shape[0],img_shape[1],img_shape[2]]), name="gain")
offset = tf.Variable(tf.constant(0, dtype=tf.float32, shape=[1,img_shape[0],img_shape[1],img_shape[2]]), name="offset")
enhanced_img = tf.add(tf.multiply(input_img, gain), offset, name = "enahnced")
#----------------------------------------------------------
# COST
#----------------------------------------------------------
input_img_deriv_x, input_img_deriv_y = sobel2(input_img)
enhanced_img_deriv_x, enhanced_img_deriv_y = sobel2(enhanced_img)
white_img = tf.constant(1, dtype=tf.float32, shape=[1,img_shape[0],img_shape[1],img_shape[2]])
image_pixels_count = img_h * img_w
white_cost = tf.reduce_sum(tf.pow(enhanced_img - white_img, 2))
sobel_cost = tf.reduce_sum(tf.pow(enhanced_img_deriv_x - input_img_deriv_x, 2) +
                           tf.pow(enhanced_img_deriv_y - input_img_deriv_y, 2))
cost = tf.add(white_cost, tf.multiply(0.2, sobel_cost), name = "cost") # + tf.reduce_sum(gain - 1) + tf.reduce_sum(offset)
#----------------------------------------------------------
# TRAIN
#----------------------------------------------------------
# Parameters
learning_rate = 0.0001
training_epochs = 100
display_step = 5
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
image = image.reshape([-1,img_shape[0],img_shape[1],img_shape[2]])
#print image.shape
#print image.shape
feed = {input_img: image }
# Start training
with tf.Session() as sess:
    #Run the initializer
    print(sess.run(init))
    # Fit all training data
    for epoch in range(training_epochs):
        sess.run([optimizer, cost], feed_dict = feed)
        print(tf.reduce_sum(offset.eval()).eval())
        if (epoch+1) % display_step == 0:
            gen_img = sess.run(enhanced_img, feed_dict = feed)
            gen_img = np.squeeze(gen_img, axis=0)
            print(gen_img.shape)
            gen_img *= 255
            cv2.imwrite("result/output_2_{0}.png".format(epoch), gen_img)
I noticed that when I save the graph the optimizer state is also saved, so when I load the model and feed it a new image it produces a wrong result, since it uses the saved values related to the image I used when I saved it.
How can I make the model run the optimizer for new images without using the saved parameters from the previous input?
Do you understand how the optimizer actually works?
The goal of the optimizer is to update the model weights using the gradients. Adam in particular keeps two inner variables per weight that are updated during training; that is part of the Adam algorithm, so this behavior is perfectly normal. If you want to "reset" Adam's variables, it is perfectly doable, but I highly doubt that this is what you want to do; very few situations require it. By the way, if you reset Adam's state, you break the whole logic of the optimizer.
If you try to evaluate a new image at inference time, the optimizer should not be run, and thus your model output should not be impacted by Adam or any other optimizer.
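For the code above, that simply means fetching only the output tensor at inference time, not the training op (a sketch reusing the names from the question):
# inference: run only the graph up to the output, do not run the optimizer op
output = sess.run(enhanced_img, feed_dict={input_img: image})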
If you try to continue training from a previously saved checkpoint, I would recommend keeping the Adam state if the dataset is the same (not a transfer-learning approach), so you should not reset Adam's variables.
By the way, if you really do want to reset Adam, this is how you would do it:
optimizer_reset_op = tf.variables_initializer(optimizer.variables())
sess.run(optimizer_reset_op)

TensorFlow neural net outputs linear function

I implemented a basic MLP and I want it to predict a user-generated set of data, but the prediction looks as follows:
I am not sure why... I have nonlinearities in the hidden layers, and I tried multiple activations (ReLU, tanh, sigmoid), tried different optimisers, different learning rates, various architectures (more layers, fewer layers, dropout), but I never got this right.
Please note that I do believe it may be because of how I compute the predictions at the end (pred = sess.run(out, feed_dict={inputs:X.reshape(n_input, 1)})) as it may be incorrect, but I wouldn't know why. I also tried other methods like extracting the weights with w = sess.run(weights) and then feeding them to the model() function along with the input, but nothing worked.
Also, when monitoring the error, the error decreases between epochs.
Any ideas?
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# Architecture
input_size = 1
output_size = 1
h1_size = 20
h2_size = 50
# 2 hidden layers network
def model(inputs, weights):
    out1 = tf.nn.relu(tf.matmul(inputs, weights['h1']))
    out2 = tf.nn.relu(tf.matmul(out1, weights['h2']))
    return tf.matmul(out2, weights['h3'])
# Inputs/label placeholders
inputs = tf.placeholder('float', shape=(None, input_size))
labels = tf.placeholder('float', shape=(None, output_size))
# Learnable weights
weights = {
    'h1': tf.Variable(tf.random_normal(shape=(input_size, h1_size))),
    'h2': tf.Variable(tf.random_normal(shape=(h1_size, h2_size))),
    'h3': tf.Variable(tf.random_normal(shape=(h2_size, output_size))),
}
# Stores the result from the net
out = model(inputs, weights)
# Cost and optimisation
cost = tf.reduce_mean(tf.square(out - labels))
opt = tf.train.AdadeltaOptimizer()
opt_operation = opt.minimize(cost)
# Generate some data
n_input = 1000
X = np.linspace(0, 1, n_input).astype('f')
y = X + 5 * np.sin(X * 10)
y /= max(y)
# Train
epochs = 2000
lr = 0.0000001
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(epochs):
        _, c = sess.run([opt_operation, cost], feed_dict={
            inputs: X.reshape(n_input, 1),
            labels: y.reshape(n_input, 1),
        })
        if not epoch % int(epochs/20):
            print(c)
    pred = sess.run(out, feed_dict={inputs: X.reshape(n_input, 1)})
    plt.scatter(X, pred, color='red', label='prediction')
    plt.scatter(X, y, label='data')
    plt.legend()
    plt.show()
Forgot bias terms: new graph
It works now, but I am not sure if this is what fixed it?
New code uses:
weights = {
    'h1': tf.Variable(tf.random_normal(shape=(input_size, h1_size))),
    'h2': tf.Variable(tf.random_normal(shape=(h1_size, h2_size))),
    'h3': tf.Variable(tf.random_normal(shape=(h2_size, output_size))),
    'b1': tf.Variable(tf.zeros(shape=[1])),
    'b2': tf.Variable(tf.zeros(shape=[1])),
    'b3': tf.Variable(tf.zeros(shape=[1])),
}
and
def model(inputs, weights):
    out1 = tf.nn.relu(tf.matmul(inputs, weights['h1']) + weights['b1'])
    out2 = tf.nn.relu(tf.matmul(out1, weights['h2']) + weights['b2'])
    return tf.matmul(out2, weights['h3']) + weights['b3']
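A small aside (my own suggestion, not part of the original post): biases are usually given one value per unit of the layer they feed into, rather than a single shared scalar, e.g.:
'b1': tf.Variable(tf.zeros(shape=[h1_size])),
'b2': tf.Variable(tf.zeros(shape=[h2_size])),
'b3': tf.Variable(tf.zeros(shape=[output_size])),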

New to TensorFlow - trying to repurpose an MNIST multilayer network as a calculator

Could someone help or guide me through what I should do better in order for this to work?
I changed the number of inputs to 2 and generated some random data, "x1" and "x2" (one number to be added to another). The idea is to use variables "add" and "mul" as the real output and base the cost (variable "Y") off of that, but I'm having trouble manipulating the data so it inputs properly.
I tried to make another variable with
x = tf.Variable([100 * np.random.random_sample([100]), 100 * np.random.random_sample([100]))
and a few other alternative ways, but that caused errors. Also, if there's anything else wrong in my code, please critique it! Anything helps.
Thank you.
'''
A Recurrent Neural Network implementation example using TensorFlow Library.
Author: *********
'''
import numpy as np
import tensorflow as tf
from tensorflow.models.rnn import rnn, rnn_cell
# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D
# Parameters
training_iters = 1000
n_epochs = 1000
batch_size = 128
display_step = 100
learning_rate = 0.001
n_observations = 100
n_input = 2 # Input data (Num + Num)
n_steps = 28 # timesteps
n_hidden_1 = 256 # 1st layer number of features
n_hidden_2 = 256 # 2nd layer number of features
n_classes = 1 # Output
X = tf.placeholder("float", [None, n_input])
X1 = tf.placeholder(tf.float32)
X2 = tf.placeholder(tf.float32)
Y = tf.placeholder(tf.float32)
# Random input data
x1 = 100 * np.random.random_sample([100,])
x2 = 100 * np.random.random_sample([100,])
add = tf.add(x1, x2)
mul = tf.mul(X1, X2)
weights = {
    'hidden1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    #'hidden2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes]))
}
biases = {
    'hidden1': tf.Variable(tf.random_normal([n_hidden_1])),
    #'hidden2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}
def RNN(_X1, _weights, _biases):
    # Layer 1.1
    layer_1 = tf.add(tf.matmul(_X1, weights['hidden1']), biases['hidden1'])
    layer_1 = tf.nn.relu(layer_1)
    # Layer 1.2
    # layer_1_2 = tf.add(tf.matmul(_X2, weights['hidden2']), biases['hidden2'])
    # layer_1_2 = tf.nn.relu(layer_1_2)
    # Hidden layer with RELU activation
    layer_2 = tf.add(tf.matmul(layer_1, weights['out']), biases['out'])
    output = tf.nn.relu(layer_2)
    return output
pred = RNN(X1, weights, biases)
cost = tf.reduce_sum(tf.pow(pred - Y, 2)) / (n_observations - 1)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) # Adam Optimizer
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(Y,1))
init = tf.initialize_all_variables()
# initData = tf.initialize_variables(x1.all(), x2.all())
with tf.Session() as sess:
    # Here we tell tensorflow that we want to initialize all
    # the variables in the graph so we can use them
    sess.run(init)
    # Fit all training data
    prev_training_cost = 0.0
    for epoch_i in range(n_epochs):
        for (_x1) in x1:
            for (_x2) in x2:
                print("Input 1:")
                print(_x1)
                print("Input 2:")
                print(_x2)
                print("Add function: ")
                print(sess.run(add, feed_dict={X1: x1, X2: x2}))
                y = sess.run(add, feed_dict={X1: x1, X2: x2})
                print(y)
                sess.run(optimizer, feed_dict={X: x, Y: y})
        training_cost = sess.run(
            cost, feed_dict={X: xs, Y: ys})
        print(training_cost)
        if epoch_i % 20 == 0:
            ax.plot(X1, X2, pred.eval(
                feed_dict={X1: x1, X2: x2}, session=sess),
                'k', alpha=epoch_i / n_epochs)
            fig.show()
            plt.draw()
        # Allow the training to quit if we've reached a minimum
        if np.abs(prev_training_cost - training_cost) < 0.000001:
            break
        prev_training_cost = training_cost
So are you training a feed-forward network or a recurrent neural network?
The code you wrote within RNN() reminds me of a simple feed-forward network, yet your title says you are working on RNNs.
You might find this implementation interesting. Like you, it generates vectors of integers and uses an RNN to do the addition.
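For the addition task itself, a plain feed-forward regression is enough. Here is a minimal sketch of my own (not the linked implementation), assuming pairs [x1, x2] as input and x1 + x2 as the target:
import numpy as np
import tensorflow as tf

X = tf.placeholder(tf.float32, [None, 2])
Y = tf.placeholder(tf.float32, [None, 1])

W = tf.Variable(tf.random_normal([2, 1]))
b = tf.Variable(tf.zeros([1]))
pred = tf.matmul(X, W) + b  # a single linear layer can represent addition exactly

cost = tf.reduce_mean(tf.square(pred - Y))
train = tf.train.AdamOptimizer(0.01).minimize(cost)

x_data = 100 * np.random.random_sample([1000, 2])
y_data = x_data.sum(axis=1, keepdims=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(2000):
        sess.run(train, feed_dict={X: x_data, Y: y_data})
    print(sess.run(pred, feed_dict={X: [[3.0, 4.0]]}))  # should approach 7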

How could I use batch normalization in TensorFlow?

I would like to use batch normalization in TensorFlow. I found the related C++ source code in core/ops/nn_ops.cc. However, I did not find it documented on tensorflow.org.
BN has different semantics in MLP and CNN, so I am not sure what exactly this BN does.
I did not find a method called MovingMoments either.
Update (July 2016): The easiest way to use batch normalization in TensorFlow is through the higher-level interfaces provided in contrib/layers, tflearn, or slim.
Previous answer if you want to DIY:
The documentation string for this has improved since the release - see the docs comment in the master branch instead of the one you found. It clarifies, in particular, that it's the output from tf.nn.moments.
You can see a very simple example of its use in the batch_norm test code. For a more real-world use example, I've included below the helper class and use notes that I scribbled up for my own use (no warranty provided!):
"""A helper class for managing batch normalization state.
This class is designed to simplify adding batch normalization
(http://arxiv.org/pdf/1502.03167v3.pdf) to your model by
managing the state variables associated with it.
Important use note: The function get_assigner() returns
an op that must be executed to save the updated state.
A suggested way to do this is to make execution of the
model optimizer force it, e.g., by:
update_assignments = tf.group(bn1.get_assigner(),
                              bn2.get_assigner())
with tf.control_dependencies([optimizer]):
    optimizer = tf.group(update_assignments)
"""
import tensorflow as tf
class ConvolutionalBatchNormalizer(object):
    """Helper class that groups the normalization logic and variables.

    Use:
        ewma = tf.train.ExponentialMovingAverage(decay=0.99)
        bn = ConvolutionalBatchNormalizer(depth, 0.001, ewma, True)
        update_assignments = bn.get_assigner()
        x = bn.normalize(y, train=training?)
    (the output x will be batch-normalized).
    """

    def __init__(self, depth, epsilon, ewma_trainer, scale_after_norm):
        self.mean = tf.Variable(tf.constant(0.0, shape=[depth]),
                                trainable=False)
        self.variance = tf.Variable(tf.constant(1.0, shape=[depth]),
                                    trainable=False)
        self.beta = tf.Variable(tf.constant(0.0, shape=[depth]))
        self.gamma = tf.Variable(tf.constant(1.0, shape=[depth]))
        self.ewma_trainer = ewma_trainer
        self.epsilon = epsilon
        self.scale_after_norm = scale_after_norm

    def get_assigner(self):
        """Returns an EWMA apply op that must be invoked after optimization."""
        return self.ewma_trainer.apply([self.mean, self.variance])

    def normalize(self, x, train=True):
        """Returns a batch-normalized version of x."""
        if train:
            mean, variance = tf.nn.moments(x, [0, 1, 2])
            assign_mean = self.mean.assign(mean)
            assign_variance = self.variance.assign(variance)
            with tf.control_dependencies([assign_mean, assign_variance]):
                return tf.nn.batch_norm_with_global_normalization(
                    x, mean, variance, self.beta, self.gamma,
                    self.epsilon, self.scale_after_norm)
        else:
            mean = self.ewma_trainer.average(self.mean)
            variance = self.ewma_trainer.average(self.variance)
            local_beta = tf.identity(self.beta)
            local_gamma = tf.identity(self.gamma)
            return tf.nn.batch_norm_with_global_normalization(
                x, mean, variance, local_beta, local_gamma,
                self.epsilon, self.scale_after_norm)
Note that I called it a ConvolutionalBatchNormalizer because it pins tf.nn.moments to compute moments across axes 0, 1, and 2, whereas for non-convolutional use you might only want axis 0.
Feedback appreciated if you use it.
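For a fully connected layer, a sketch of that difference would just be:
# convolutional input (NHWC): normalize over batch, height and width
mean, variance = tf.nn.moments(x, [0, 1, 2])
# fully connected input (batch x features): normalize over the batch axis only
mean, variance = tf.nn.moments(x, [0])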
As of TensorFlow 1.0 (February 2017) there's also the high-level tf.layers.batch_normalization API included in TensorFlow itself.
It's super simple to use:
# Set this to True for training and False for testing
training = tf.placeholder(tf.bool)
x = tf.layers.dense(input_x, units=100)
x = tf.layers.batch_normalization(x, training=training)
x = tf.nn.relu(x)
...except that it adds extra ops to the graph (for updating its mean and variance variables) in such a way that they won't be dependencies of your training op. You can either just run the ops separately:
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
sess.run([train_op, extra_update_ops], ...)
or add the update ops as dependencies of your training op manually, then just run your training op as normal:
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_update_ops):
    train_op = optimizer.minimize(loss)
...
sess.run([train_op], ...)
The following works fine for me; it does not require invoking the EMA apply op externally.
import numpy as np
import tensorflow as tf
from tensorflow.python import control_flow_ops
def batch_norm(x, n_out, phase_train, scope='bn'):
    """
    Batch normalization on convolutional maps.
    Args:
        x:           Tensor, 4D BHWD input maps
        n_out:       integer, depth of input maps
        phase_train: boolean tf.Variable, true indicates training phase
        scope:       string, variable scope
    Return:
        normed:      batch-normalized maps
    """
    with tf.variable_scope(scope):
        beta = tf.Variable(tf.constant(0.0, shape=[n_out]),
                           name='beta', trainable=True)
        gamma = tf.Variable(tf.constant(1.0, shape=[n_out]),
                            name='gamma', trainable=True)
        batch_mean, batch_var = tf.nn.moments(x, [0,1,2], name='moments')
        ema = tf.train.ExponentialMovingAverage(decay=0.5)

        def mean_var_with_update():
            ema_apply_op = ema.apply([batch_mean, batch_var])
            with tf.control_dependencies([ema_apply_op]):
                return tf.identity(batch_mean), tf.identity(batch_var)

        mean, var = tf.cond(phase_train,
                            mean_var_with_update,
                            lambda: (ema.average(batch_mean), ema.average(batch_var)))
        normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
    return normed
Example:
import math
n_in, n_out = 3, 16
ksize = 3
stride = 1
phase_train = tf.placeholder(tf.bool, name='phase_train')
input_image = tf.placeholder(tf.float32, name='input_image')
kernel = tf.Variable(tf.truncated_normal([ksize, ksize, n_in, n_out],
                                         stddev=math.sqrt(2.0/(ksize*ksize*n_out))),
                     name='kernel')
conv = tf.nn.conv2d(input_image, kernel, [1,stride,stride,1], padding='SAME')
conv_bn = batch_norm(conv, n_out, phase_train)
relu = tf.nn.relu(conv_bn)
with tf.Session() as session:
    session.run(tf.initialize_all_variables())
    for i in range(20):
        test_image = np.random.rand(4,32,32,3)
        sess_outputs = session.run([relu],
                                   {input_image.name: test_image, phase_train.name: True})
There is also an "official" batch normalization layer coded by the developers. They don't have very good documentation on how to use it, but here is how to use it (as far as I can tell):
from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm
def batch_norm_layer(x, train_phase, scope_bn):
    bn_train = batch_norm(x, decay=0.999, center=True, scale=True,
                          updates_collections=None,
                          is_training=True,
                          reuse=None,  # is this right?
                          trainable=True,
                          scope=scope_bn)
    bn_inference = batch_norm(x, decay=0.999, center=True, scale=True,
                              updates_collections=None,
                              is_training=False,
                              reuse=True,  # is this right?
                              trainable=True,
                              scope=scope_bn)
    z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference)
    return z
To actually use it you need to create a placeholder for train_phase that indicates whether you are in the training or the inference phase (as in train_phase = tf.placeholder(tf.bool, name='phase_train')). Its value can be fed during inference or training with a tf.Session, as in:
test_error = sess.run(fetches=cross_entropy, feed_dict={x: batch_xtest, y_:batch_ytest, train_phase: False})
or during training:
sess.run(fetches=train_step, feed_dict={x: batch_xs, y_:batch_ys, train_phase: True})
I'm pretty sure this is correct according to the discussion on GitHub.
Seems there is another useful link:
http://r2rt.com/implementing-batch-normalization-in-tensorflow.html
You can simply use the built-in batch_norm layer:
batch_norm = tf.cond(is_train,
                     lambda: tf.contrib.layers.batch_norm(prev, activation_fn=tf.nn.relu, is_training=True, reuse=None),
                     lambda: tf.contrib.layers.batch_norm(prev, activation_fn=tf.nn.relu, is_training=False, reuse=True))
where prev is the output of your previous layer (can be both fully-connected or a convolutional layer) and is_train is a boolean placeholder. Just use batch_norm as the input to the next layer, then.
Since someone recently edited this, I'd like to clarify that this is no longer an issue.
This answer does not seem correct. When phase_train is set to false, it still updates the EMA mean and variance. This can be verified with the following code snippet.
x = tf.placeholder(tf.float32, [None, 20, 20, 10], name='input')
phase_train = tf.placeholder(tf.bool, name='phase_train')
# generate random noise to pass into batch norm
x_gen = tf.random_normal([50,20,20,10])
pt_false = tf.Variable(tf.constant(True))
#generate a constant variable to pass into batch norm
y = x_gen.eval()
[bn, bn_vars] = batch_norm(x, 10, phase_train)
tf.initialize_all_variables().run()
train_step = lambda: bn.eval({x:x_gen.eval(), phase_train:True})
test_step = lambda: bn.eval({x:y, phase_train:False})
test_step_c = lambda: bn.eval({x:y, phase_train:True})
# Verify that this is different as expected, two different x's have different norms
print(train_step()[0][0][0])
print(train_step()[0][0][0])
# Verify that this is same as expected, same x's (y) have same norm
print(test_step_c()[0][0][0])
print(test_step_c()[0][0][0])
# THIS IS DIFFERENT but should be they same, should only be reading from the ema.
print(test_step()[0][0][0])
print(test_step()[0][0][0])
Using TensorFlow's built-in batch_norm layer, below is code that loads data, builds a network with one hidden ReLU layer and L2 regularization, and introduces batch normalization for both the hidden and the output layer. It runs and trains fine. Just FYI, this example is mostly built upon the data and code from the Udacity Deep Learning course.
P.S. Yes, parts of this were discussed one way or another in earlier answers, but I decided to gather everything into one code snippet so that you have an example of the whole network training process with batch normalization and its evaluation.
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
pickle_file = '/home/maxkhk/Documents/Udacity/DeepLearningCourse/SourceCode/tensorflow/examples/udacity/notMNIST.pickle'
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)
image_size = 28
num_labels = 10
def reformat(dataset, labels):
    dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
    # Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
    labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
    return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])
#for NeuralNetwork model code is below
#We will use SGD for training to save our time. Code is from Assignment 2
#beta is the new parameter - controls level of regularization.
#Feel free to play with it - the best one I found is 0.001
#notice, we introduce L2 for both biases and weights of all layers
batch_size = 128
beta = 0.001
#building tensorflow graph
graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    #introduce batchnorm
    tf_train_dataset_bn = tf.contrib.layers.batch_norm(tf_train_dataset)

    #now let's build our new hidden layer
    #that's how many hidden neurons we want
    num_hidden_neurons = 1024
    #its weights
    hidden_weights = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_hidden_neurons]))
    hidden_biases = tf.Variable(tf.zeros([num_hidden_neurons]))

    #now the layer itself. It multiplies data by weights, adds biases
    #and takes ReLU over result
    hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset_bn, hidden_weights) + hidden_biases)

    #adding the batch normalization layer
    hidden_layer_bn = tf.contrib.layers.batch_norm(hidden_layer)

    #time to go for output linear layer
    #out weights connect hidden neurons to output labels
    #biases are added to output labels
    out_weights = tf.Variable(
        tf.truncated_normal([num_hidden_neurons, num_labels]))
    out_biases = tf.Variable(tf.zeros([num_labels]))

    #compute output
    out_layer = tf.matmul(hidden_layer_bn, out_weights) + out_biases

    #our real output is a softmax of prior result
    #and we also compute its cross-entropy to get our loss
    #Notice - we introduce our L2 here
    loss = (tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        out_layer, tf_train_labels) +
        beta*tf.nn.l2_loss(hidden_weights) +
        beta*tf.nn.l2_loss(hidden_biases) +
        beta*tf.nn.l2_loss(out_weights) +
        beta*tf.nn.l2_loss(out_biases)))

    #now we just minimize this loss to actually train the network
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    #nice, now let's calculate the predictions on each dataset for evaluating the
    #performance so far
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(out_layer)
    valid_relu = tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
    valid_prediction = tf.nn.softmax(tf.matmul(valid_relu, out_weights) + out_biases)
    test_relu = tf.nn.relu(tf.matmul(tf_test_dataset, hidden_weights) + hidden_biases)
    test_prediction = tf.nn.softmax(tf.matmul(test_relu, out_weights) + out_biases)
#now is the actual training on the ANN we built
#we will run it for some number of steps and evaluate the progress after
#every 500 steps
#number of steps we will train our ANN
num_steps = 3001
#actual training
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
So a simple example of the use of this batchnorm class:
from bn_class import *
with tf.name_scope('Batch_norm_conv1') as scope:
    ewma = tf.train.ExponentialMovingAverage(decay=0.99)
    bn_conv1 = ConvolutionalBatchNormalizer(num_filt_1, 0.001, ewma, True)
    update_assignments = bn_conv1.get_assigner()
    a_conv1 = bn_conv1.normalize(a_conv1, train=bn_train)
    h_conv1 = tf.nn.relu(a_conv1)
