Tensorflow 2: GradientTape returns None - python

I have the following code, written in tf.keras with TensorFlow 2. Basically, I need the gradient of the cross-entropy term with respect to the variable temperature. dce1_dx correctly calculates the derivative. But when I wrap the same cross-entropy calculation into a tf.keras.Model object, the second gradient calculation, dce2_dx, returns None. What is the difference between these two tf.GradientTape calculations? I am experienced in TF1 but new to TF2 and eager execution, so I am stuck at this point.
import numpy as np
import tensorflow as tf

logits = np.random.uniform(low=-10.0, high=10.0, size=(10000, 5))
labels = np.random.randint(low=0, high=5, size=(10000,))

logits_tf = tf.keras.Input(name="logits_tf", shape=(logits.shape[1]), dtype=tf.float32)
labels_tf = tf.keras.Input(name="labels_tf", shape=(), dtype=tf.int32)

dataset = tf.data.Dataset.from_tensor_slices((logits, labels))
dataset = dataset.batch(batch_size=logits.shape[0])

for lgts, idx in dataset:
    temperature = tf.Variable(name="temperature", dtype=tf.float32,
                              initial_value=tf.constant(2.0), trainable=True)
    scaled_logits = logits_tf / temperature
    ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    ce_loss = ce_loss(labels_tf, scaled_logits)
    model = tf.keras.Model(inputs=[logits_tf, labels_tf], outputs=[ce_loss],
                           name="calibration_model")

    with tf.GradientTape() as tape0:
        tape0.watch(temperature)
        scaled_lgts = tf.cast(lgts, tf.float32) / temperature
        ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        ce = ce(idx, scaled_lgts)
    dce1_dx = tape0.gradient(ce, temperature)

    with tf.GradientTape() as tape1:
        # Compute the derivative: d{CrossEntropy}/d{Temperature}
        tape1.watch(temperature)
        ce2 = model([lgts, idx])
    # !!!Returns None!!!
    dce2_dx = tape1.gradient(ce2, temperature)
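For comparison, here is a minimal sketch (the class name TemperatureScaler is just for illustration) in which the temperature variable is created inside a subclassed tf.keras.Model and the cross entropy is computed eagerly in call(); set up this way, the tape should be able to trace every op that touches the variable and return a gradient:

import numpy as np
import tensorflow as tf

class TemperatureScaler(tf.keras.Model):
    def __init__(self):
        super().__init__(name="calibration_model")
        # The variable lives inside the model, so it is tracked as a trainable weight.
        self.temperature = tf.Variable(2.0, dtype=tf.float32, trainable=True, name="temperature")
        self.ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    def call(self, inputs):
        logits, labels = inputs
        scaled = tf.cast(logits, tf.float32) / self.temperature
        return self.ce(labels, scaled)

logits = np.random.uniform(low=-10.0, high=10.0, size=(10000, 5))
labels = np.random.randint(low=0, high=5, size=(10000,))

scaler = TemperatureScaler()
with tf.GradientTape() as tape:
    ce = scaler([logits, labels])
dce_dtemp = tape.gradient(ce, scaler.temperature)  # a concrete tensor here, not None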

Related

Custom loss function for out of distribution detection using CNN in Tensorflow 2.0+

My question is in reference to the paper Learning Confidence for Out-of-Distribution Detection in Neural Networks.
I need help creating a custom loss function in TensorFlow 2.0+ as per the paper, to get a confident prediction from the CNN on an in-distribution image (one that belongs to the training categories) and a low prediction for an out-of-distribution (any random) image. The paper suggests adding a confidence estimation branch to any conventional feedforward architecture, in parallel with the original class prediction branch.
In order to define the loss function, the softmax prediction probabilities are adjusted by interpolating between the original predictions (pi) and the target probability distribution y, where the degree of interpolation is indicated by the network's confidence (c):
pi' = c · pi + (1 − c) · yi
and the final loss is the cross-entropy on the adjusted probabilities plus a confidence penalty, L = −Σi yi · log(pi') − λ · log(c).
I need help implementing this, along with the loss function, in TensorFlow 2.0+. Below is what I could come up with:
import tensorflow.keras.backend as k
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.applications import ResNet50

# Defining custom loss function
def custom_loss(c):
    def loss(y_true, y_pred):
        interpolated_p = c * y_pred + (1 - c) * y_true
        return -k.reduce_sum((k.log(interpolated_p) * y_true), axis=-1) - k.log(c)
    return loss

# Defining model structure using ResNet50
basemodel = ResNet50(weights="imagenet", include_top=False)
headmodel = basemodel.output
headmodel = layers.AveragePooling2D(pool_size=(7, 7))(headmodel)

# Add a sigmoid layer to the pooling output
conf_branch = layers.Dense(1, activation="sigmoid", name="confidence_branch")(headmodel)

# Add a softmax layer after the pooling output
softmax_branch = layers.Dense(10, activation="softmax", name="softmax_branch")(headmodel)

# Instantiate an end-to-end model predicting both confidence and class prediction
model = keras.Model(
    inputs=basemodel.input,
    outputs=[softmax_branch, conf_branch],
)

model.compile(loss=custom_loss(c=conf_branch.output), optimizer='rmsprop')

Appreciate any help on this! Thanks!
The following is the code I wrote for the keras implementation:
# assumes the imports from the question above (tf, np, keras, layers, ResNet50)
from tensorflow.keras import Model
from tensorflow.keras.layers import Concatenate

num_classes = 10

basemodel = ResNet50(weights="imagenet", include_top=False)
headmodel = basemodel.output
headmodel = layers.AveragePooling2D(pool_size=(7, 7))(headmodel)
conf_branch = layers.Dense(1, activation="sigmoid", name="confidence_branch")(headmodel)
softmax_branch = layers.Dense(num_classes, activation="softmax", name="softmax_branch")(headmodel)
output = Concatenate(axis=-1)([softmax_branch, conf_branch])

def custom_loss(y_true, y_pred, budget=0.3):
    with tf.compat.v1.variable_scope("LAMBDA", reuse=tf.compat.v1.AUTO_REUSE):
        LAMBDA = tf.compat.v1.get_variable("LAMBDA", dtype=tf.float32, initializer=tf.constant(0.1))
    pred_original = y_pred[:, 0:num_classes]
    confidence = y_pred[:, num_classes]
    eps = 1e-12
    pred_original = tf.clip_by_value(pred_original, 0. + eps, 1. - eps)
    confidence = tf.clip_by_value(confidence, 0. + eps, 1. - eps)

    b = np.random.uniform(size=y_true.shape[0], low=0.0, high=1.0)
    conf = confidence * b + (1 - b)
    conf = tf.expand_dims(conf, axis=-1)

    pred_new = pred_original * conf + y_true * (1 - conf)
    xentropy_loss = tf.reduce_mean(-tf.reduce_sum(y_true * tf.math.log(pred_new), axis=-1))
    confidence_loss = tf.reduce_mean(-tf.math.log(confidence))
    total_loss = xentropy_loss + LAMBDA * confidence_loss

    def true_func():
        return LAMBDA / 1.01

    def false_func():
        return LAMBDA / 0.99

    LAMBDA_NEW = tf.cond(budget > confidence_loss, true_func, false_func)
    LAMBDA.assign(LAMBDA_NEW)
    # tf.print(LAMBDA)
    return total_loss

def accuracy(y_true, y_pred):
    y_pred = y_pred[:, :num_classes]
    correct_pred = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y_true, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    return accuracy

model = Model(inputs=basemodel.input, outputs=output)
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss=custom_loss, optimizer=optimizer, metrics=[accuracy])

No gradients provided for any variable - Custom loss function with random weights depending on the Softmax output

I have difficulties writing a custom loss function that makes use of some random weights generated according to the class/state predicted by the Softmax output. The desired property is:
The model is a simple feedforward neural network with input-dimension as 1 and the output dimension as 6.
The activation function of the output layer is Softmax, which intends to estimate the actual number of classes or states using Argmax.
Note that the training data only consists of X (there is no Y).
The loss function is defined according to random weights (i.e., Weibull distribution) sampled based on the predicted state number for each input sample X.
Below I provide a minimal example for illustration. For simplification purposes, I only define the loss function based on the random weights for state/class-1. I get: "ValueError: No gradients provided for any variable: ['dense_41/kernel:0', 'dense_41/bias:0', 'dense_42/kernel:0', 'dense_42/bias:0']."
As indicated in the post below, I found out that argmax is not differentiable, and a softargmax function would help (as I implemented in the following code). However, I still get the same error.
Getting around tf.argmax which is not differentiable
import sys
import time
from tqdm import tqdm
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers
from scipy.stats import weibull_min

###############################################################################################
# Generate Dataset
lb = np.array([2.0])    # Left boundary
ub = np.array([100.0])  # Right boundary

# Data Points - uniformly distributed
N_r = 50
X_r = np.linspace(lb, ub, N_r)

###############################################################################################
# Define Model
class DGM:
    # Initialize the class
    def __init__(self, X_r):
        # Normalize training input data
        self.Xmean, self.Xstd = np.mean(X_r), np.std(X_r)
        X_r = (X_r - self.Xmean) / self.Xstd
        self.X_r = X_r

        # Input and output variable dimensions
        self.X_dim = 1; self.Y_dim = 6

        # Define tensors
        self.X_r_tf = tf.convert_to_tensor(X_r, dtype=tf.float32)

        # Learning rate
        self.LEARNING_RATE = 1e-4

        # Feedforward neural network model
        self.modelTest = self.test_model()

    ###############################################
    # Initialize network weights and biases
    def test_model(self):
        input_shape = self.X_dim
        dimensionality = self.Y_dim
        model = tf.keras.Sequential()
        model.add(layers.Input(shape=input_shape))
        model.add(layers.Dense(64, kernel_initializer='glorot_uniform', bias_initializer='zeros'))
        model.add(layers.Activation('tanh'))
        model.add(layers.Dense(dimensionality))
        model.add(layers.Activation('softmax'))
        return model

    ##############################################
    def compute_loss(self):
        # Define optimizer
        gen_opt = tf.keras.optimizers.Adam(lr=self.LEARNING_RATE, beta_1=0.0, beta_2=0.9)
        with tf.GradientTape() as test_tape:
            ###### calculate loss
            generated_u = self.modelTest(self.X_r_tf, training=True)

            # number of data
            n_data = generated_u.shape[0]

            # initialize random weights assuming state-1 at all input samples
            wt1 = np.zeros((n_data, 1), dtype=np.float32)  # initialize weights
            for b in range(n_data):
                wt1[b] = weibull_min.rvs(c=2, loc=0, scale=4, size=1)
            wt1 = tf.reshape(tf.convert_to_tensor(wt1, dtype=tf.float32), shape=(n_data, 1))
            # print('-----------sampling done-----------')

            # determine the actual state using softargmax
            idst = self.softargmax(generated_u)
            idst = tf.reshape(tf.cast(idst, tf.float32), shape=(n_data, 1))

            # index state-1
            id1 = tf.constant(0., dtype=tf.float32)

            # assign weights if predicted state is state-1
            wt1_final = tf.cast(tf.equal(idst, id1), dtype=tf.float32) * wt1

            # final loss
            test_loss = tf.reduce_mean(tf.square(wt1_final))
            # print('-----------test loss calculated-----------')

        gradients_of_modelTest = test_tape.gradient(test_loss,
                                                    [self.modelTest.trainable_variables])
        gen_opt.apply_gradients(zip(gradients_of_modelTest[0], self.modelTest.trainable_variables))
        return test_loss

    # reference: Getting around tf.argmax which is not differentiable
    # https://stackoverflow.com/questions/46926809/getting-around-tf-argmax-which-is-not-differentiable
    def softargmax(self, x, beta=1e10):
        x = tf.convert_to_tensor(x)
        x_range = tf.range(x.shape.as_list()[-1], dtype=x.dtype)
        return tf.reduce_sum(tf.nn.softmax(x * beta, axis=1) * x_range, axis=-1)

    ##############################################
    def train(self, training_steps=100):
        train_start_time = time.time()
        for step in tqdm(range(training_steps), desc='Training'):
            start = time.time()
            test_loss = self.compute_loss()
            if (step + 1) % 10 == 0:
                elapsed_time = time.time() - train_start_time
                sec_per_step = elapsed_time / step
                mins_left = ((training_steps - step) * sec_per_step)
                tf.print("\nStep # ", step, "/", training_steps,
                         output_stream=sys.stdout)
                tf.print("Current time:", elapsed_time, " time left:",
                         mins_left, output_stream=sys.stdout)
                tf.print("Test Loss: ", test_loss, output_stream=sys.stdout)

###############################################################################################
# Define and train the model
model = DGM(X_r)
model.train(training_steps=100)

Why does sigmoid & crossentropy of Keras/tensorflow have low precision?

I have the following simple neural network (with 1 neuron only) to test the computation precision of sigmoid activation & binary_crossentropy of Keras:
model = Sequential()
model.add(Dense(1, input_dim=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
To simplify the test, I manually set the only weight to 1 and bias to 0, and then evaluate the model with 2-point training set {(-a, 0), (a, 1)}, i.e.
y = numpy.array([0, 1])
for a in range(40):
    x = numpy.array([-a, a])
    keras_ce[a] = model.evaluate(x, y)[0]  # cross-entropy computed by keras/tensorflow
    my_ce[a] = np.log(1+exp(-a))           # My own computation
My Question: I found that the binary cross-entropy (keras_ce) computed by Keras/TensorFlow reaches a floor of 1.09e-7 when a is approximately 16, as illustrated below (blue line). It doesn't decrease further as 'a' keeps growing. Why is that?
This neural network has 1 neuron only whose weight is set to 1 and bias is 0. With the 2-point training set {(-a, 0), (a, 1)}, the binary_crossentropy is just
-1/2 [ log(1 - 1/(1+exp(a)) ) + log( 1/(1+exp(-a)) ) ] = log(1+exp(-a))
So the cross-entropy should decrease as a increases, as illustrated in orange ('my') above. Is there some Keras/Tensorflow/Python setup I can change to increase its precision? Or am I mistaken somewhere? I'd appreciate any suggestions/comments/answers.
TL;DR version: the probability values (i.e. the outputs of sigmoid function) are clipped due to numerical stability when computing the loss function.
If you inspect the source code, you would find that using binary_crossentropy as the loss would result in a call to binary_crossentropy function in losses.py file:
def binary_crossentropy(y_true, y_pred):
    return K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1)
which in turn, as you can see, calls the equivalent backend function. In case of using Tensorflow as the backend, that would result in a call to binary_crossentropy function in tensorflow_backend.py file:
def binary_crossentropy(target, output, from_logits=False):
    """ Docstring ..."""
    # Note: tf.nn.sigmoid_cross_entropy_with_logits
    # expects logits, Keras expects probabilities.
    if not from_logits:
        # transform back to logits
        _epsilon = _to_tensor(epsilon(), output.dtype.base_dtype)
        output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
        output = tf.log(output / (1 - output))
    return tf.nn.sigmoid_cross_entropy_with_logits(labels=target,
                                                   logits=output)
As you can see, the from_logits argument is set to False by default. Therefore, the if condition evaluates to true and as a result the values in the output are clipped to the range [epsilon, 1-epsilon]. That's why no matter how small or large a probability is, it can never be smaller than epsilon nor greater than 1-epsilon. And that explains why the output of the binary_crossentropy loss is also bounded.
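A quick sanity check of that bound, as a small sketch: once the probabilities are clipped to [epsilon, 1 - epsilon], the per-sample loss can never fall below -log(1 - epsilon), which for the default epsilon of 1e-7 is on the order of 1e-7, the same order as the floor observed in the question:

import numpy as np

eps = 1e-7                   # Keras default fuzz factor
floor = -np.log(1.0 - eps)   # smallest achievable per-sample loss after clipping
print(floor)                 # ~1.0000000494e-07; float32 arithmetic inside TF shifts the observed value slightly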
Now, what is this epsilon here? It is a very small constant which is used for numerical stability (e.g. prevent division by zero or undefined behaviors, etc.). To find out its value you can further inspect the source code and you would find it in the common.py file:
_EPSILON = 1e-7

def epsilon():
    """Returns the value of the fuzz factor used in numeric expressions.

    # Returns
        A float.

    # Example
    ```python
    >>> keras.backend.epsilon()
    1e-07
    ```
    """
    return _EPSILON
If for any reason, you would like more precision you can alternatively set the epsilon value to a smaller constant using set_epsilon function from the backend:
def set_epsilon(e):
    """Sets the value of the fuzz factor used in numeric expressions.

    # Arguments
        e: float. New value of epsilon.

    # Example
    ```python
    >>> from keras import backend as K
    >>> K.epsilon()
    1e-07
    >>> K.set_epsilon(1e-05)
    >>> K.epsilon()
    1e-05
    ```
    """
    global _EPSILON
    _EPSILON = e
However, be aware that setting epsilon to an extremely low positive value or zero may disrupt the stability of computations all over Keras.
I think that Keras takes numerical stability into account.
Let's track how Keras calculates it.
First,
def binary_crossentropy(y_true, y_pred):
    return K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1)
Then,
def binary_crossentropy(target, output, from_logits=False):
    """Binary crossentropy between an output tensor and a target tensor.

    # Arguments
        target: A tensor with the same shape as `output`.
        output: A tensor.
        from_logits: Whether `output` is expected to be a logits tensor.
            By default, we consider that `output`
            encodes a probability distribution.

    # Returns
        A tensor.
    """
    # Note: tf.nn.sigmoid_cross_entropy_with_logits
    # expects logits, Keras expects probabilities.
    if not from_logits:
        # transform back to logits
        _epsilon = _to_tensor(epsilon(), output.dtype.base_dtype)
        output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
        output = tf.log(output / (1 - output))
    return tf.nn.sigmoid_cross_entropy_with_logits(labels=target,
                                                   logits=output)
Notice that tf.clip_by_value is used for numerical stability.
Let's compare the Keras binary_crossentropy, TensorFlow's tf.nn.sigmoid_cross_entropy_with_logits, and a custom loss function (with the value clipping removed):
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
import keras

# keras
model = Sequential()
model.add(Dense(units=1, activation='sigmoid', input_shape=(
    1,), weights=[np.ones((1, 1)), np.zeros(1)]))
# print(model.get_weights())
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])

# tensorflow
G = tf.Graph()
with G.as_default():
    x_holder = tf.placeholder(dtype=tf.float32, shape=(2,))
    y_holder = tf.placeholder(dtype=tf.float32, shape=(2,))
    entropy = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=x_holder, labels=y_holder))
sess = tf.Session(graph=G)

# keras with custom loss function
def customLoss(target, output):
    # if not from_logits:
    #     # transform back to logits
    #     _epsilon = _to_tensor(epsilon(), output.dtype.base_dtype)
    #     output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
    #     output = tf.log(output / (1 - output))
    output = tf.log(output / (1 - output))
    return tf.nn.sigmoid_cross_entropy_with_logits(labels=target,
                                                   logits=output)

model_m = Sequential()
model_m.add(Dense(units=1, activation='sigmoid', input_shape=(
    1,), weights=[np.ones((1, 1)), np.zeros(1)]))
# print(model.get_weights())
model_m.compile(loss=customLoss,
                optimizer='adam', metrics=['accuracy'])

N = 100
xaxis = np.linspace(10, 20, N)
keras_ce = np.zeros(N)
tf_ce = np.zeros(N)
my_ce = np.zeros(N)
keras_custom = np.zeros(N)

y = np.array([0, 1])
for i, a in enumerate(xaxis):
    x = np.array([-a, a])
    # cross-entropy computed by keras/tensorflow
    keras_ce[i] = model.evaluate(x, y)[0]
    my_ce[i] = np.log(1 + np.exp(-a))  # My own computation
    tf_ce[i] = sess.run(entropy, feed_dict={x_holder: x, y_holder: y})
    keras_custom[i] = model_m.evaluate(x, y)[0]
# print(model.get_weights())

plt.plot(xaxis, keras_ce, label='keras')
plt.plot(xaxis, my_ce, 'b', label='my_ce')
plt.plot(xaxis, tf_ce, 'r:', linewidth=5, label='tensorflow')
plt.plot(xaxis, keras_custom, '--', label='custom loss')
plt.xlabel('a')
plt.ylabel('xentropy')
plt.yscale('log')
plt.legend()
plt.savefig('compare.jpg')
plt.show()
We can see that the TensorFlow result matches the manual computation, while Keras with the custom loss (no clipping) runs into numerical overflow, as expected.

Getting InvalidArgumentError in softmax_cross_entropy_with_logits

I'm pretty new to TensorFlow and trying to do some experiments with the Iris dataset. I created the following model function (MWE):
def model_fn(features, labels, mode):
    net = tf.feature_column.input_layer(
        features, [tf.feature_column.numeric_column(key=key) for key in FEATURE_NAMES])
    logits = tf.layers.dense(inputs=net, units=3)
    loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    train_op = optimizer.minimize(
        loss=loss,
        global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
Unfortunately I get the following error:
InvalidArgumentError: Input to reshape is a tensor with 256 values, but the requested shape has 1
[[Node: Reshape = Reshape[T=DT_FLOAT, Tshape=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](softmax_cross_entropy_with_logits_sg, Reshape/shape)]]
There seems to be some problem with the tensor shapes. However, both logits and labels have the same shape of (256, 3), as required by the documentation. Both tensors also have type float32.
Just for the sake of completeness, here is the input function for the estimator:
import pandas as pd
import tensorflow as tf
import numpy as np

IRIS_DATA = "data/iris.csv"
FEATURE_NAMES = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
CLASS_NAME = ["class"]
COLUMNS = FEATURE_NAMES + CLASS_NAME

# read dataset
iris = pd.read_csv(IRIS_DATA, header=None, names=COLUMNS)

# encode classes
iris["class"] = iris["class"].astype('category').cat.codes

# train test split
np.random.seed(1)
msk = np.random.rand(len(iris)) < 0.8
train = iris[msk]
test = iris[~msk]

def iris_input_fn(batch_size=256, mode="TRAIN"):
    def prepare_input(data=None):
        # do mean normalization across all samples
        mu = np.mean(data)
        sigma = np.std(data)
        data = data - mu
        data = data / sigma
        is_nan = np.isnan(data)
        is_inf = np.isinf(data)
        if np.any(is_nan) or np.any(is_inf):
            print('data is not well-formed : is_nan {n}, is_inf: {i}'.format(n=np.any(is_nan), i=np.any(is_inf)))
        data = transform_data(data)
        return data

    def transform_data(data):
        data = data.astype(np.float32)
        return data

    def load_data():
        global train
        trn_all_data = train.iloc[:, :-1]
        trn_all_labels = train.iloc[:, -1]
        return (trn_all_data.astype(np.float32),
                trn_all_labels.astype(np.int32))

    data, labels = load_data()
    data = prepare_input(data)
    labels = tf.one_hot(labels, depth=3)
    labels = tf.cast(labels, tf.float32)
    dataset = tf.data.Dataset.from_tensor_slices((data.to_dict(orient="list"), labels))
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
    return dataset.make_one_shot_iterator().get_next()
Dataset from UCI repo
I solved the problem by replacing the loss function from the nn module:
loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)
with the loss function from the losses module:
loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)
or with:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits))
The loss fed to the minimize method of GradientDescentOptimizer needs to be a scalar, i.e. a single value for the whole batch. The problem was that I computed the softmax cross entropy for each element in the batch, which resulted in a tensor containing 256 (batch size) cross-entropy values, and tried to feed that into the minimize method. Hence the error message:
Input to reshape is a tensor with 256 values, but the requested shape has 1
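To make the shape issue concrete, here is a small sketch with made-up logits and one-hot labels (runs eagerly in TF2; the shapes are the same under TF1):

import tensorflow as tf

logits = tf.random.normal([256, 3])                                               # made-up batch of logits
labels = tf.one_hot(tf.random.uniform([256], maxval=3, dtype=tf.int32), depth=3)  # made-up one-hot labels

per_example = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)
print(per_example.shape)   # (256,) -- one cross-entropy value per batch element
scalar_loss = tf.reduce_mean(per_example)
print(scalar_loss.shape)   # ()     -- the single scalar that minimize() expects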

How could I use batch normalization in TensorFlow?

I would like to use batch normalization in TensorFlow. I found the related C++ source code in core/ops/nn_ops.cc. However, I did not find it documented on tensorflow.org.
BN has different semantics in MLP and CNN, so I am not sure what exactly this BN does.
I did not find a method called MovingMoments either.
Update July 2016: The easiest way to use batch normalization in TensorFlow is through the higher-level interfaces provided in either contrib/layers, tflearn, or slim.
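For example, a minimal sketch of the contrib/layers route (TF 1.x assumed; the same wrapper shows up again in the answers further down):

import tensorflow as tf

is_training = tf.placeholder(tf.bool, name='is_training')
x = tf.placeholder(tf.float32, [None, 64], name='x')

h = tf.contrib.layers.fully_connected(x, 128, activation_fn=None)
h = tf.contrib.layers.batch_norm(h, is_training=is_training,
                                 center=True, scale=True,
                                 updates_collections=None)  # run the moving-average updates in place
h = tf.nn.relu(h)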
Previous answer if you want to DIY:
The documentation string for this has improved since the release - see the docs comment in the master branch instead of the one you found. It clarifies, in particular, that it's the output from tf.nn.moments.
You can see a very simple example of its use in the batch_norm test code. For a more real-world use example, I've included below the helper class and use notes that I scribbled up for my own use (no warranty provided!):
"""A helper class for managing batch normalization state.
This class is designed to simplify adding batch normalization
(http://arxiv.org/pdf/1502.03167v3.pdf) to your model by
managing the state variables associated with it.
Important use note: The function get_assigner() returns
an op that must be executed to save the updated state.
A suggested way to do this is to make execution of the
model optimizer force it, e.g., by:
update_assignments = tf.group(bn1.get_assigner(),
bn2.get_assigner())
with tf.control_dependencies([optimizer]):
optimizer = tf.group(update_assignments)
"""
import tensorflow as tf
class ConvolutionalBatchNormalizer(object):
"""Helper class that groups the normalization logic and variables.
Use:
ewma = tf.train.ExponentialMovingAverage(decay=0.99)
bn = ConvolutionalBatchNormalizer(depth, 0.001, ewma, True)
update_assignments = bn.get_assigner()
x = bn.normalize(y, train=training?)
(the output x will be batch-normalized).
"""
def __init__(self, depth, epsilon, ewma_trainer, scale_after_norm):
self.mean = tf.Variable(tf.constant(0.0, shape=[depth]),
trainable=False)
self.variance = tf.Variable(tf.constant(1.0, shape=[depth]),
trainable=False)
self.beta = tf.Variable(tf.constant(0.0, shape=[depth]))
self.gamma = tf.Variable(tf.constant(1.0, shape=[depth]))
self.ewma_trainer = ewma_trainer
self.epsilon = epsilon
self.scale_after_norm = scale_after_norm
def get_assigner(self):
"""Returns an EWMA apply op that must be invoked after optimization."""
return self.ewma_trainer.apply([self.mean, self.variance])
def normalize(self, x, train=True):
"""Returns a batch-normalized version of x."""
if train:
mean, variance = tf.nn.moments(x, [0, 1, 2])
assign_mean = self.mean.assign(mean)
assign_variance = self.variance.assign(variance)
with tf.control_dependencies([assign_mean, assign_variance]):
return tf.nn.batch_norm_with_global_normalization(
x, mean, variance, self.beta, self.gamma,
self.epsilon, self.scale_after_norm)
else:
mean = self.ewma_trainer.average(self.mean)
variance = self.ewma_trainer.average(self.variance)
local_beta = tf.identity(self.beta)
local_gamma = tf.identity(self.gamma)
return tf.nn.batch_norm_with_global_normalization(
x, mean, variance, local_beta, local_gamma,
self.epsilon, self.scale_after_norm)
Note that I called it a ConvolutionalBatchNormalizer because it pins the use of tf.nn.moments to sum across axes 0, 1, and 2, whereas for non-convolutional use you might only want axis 0.
Feedback appreciated if you use it.
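A tiny sketch of that axis difference (made-up shapes, NHWC layout assumed): convolutional activations are reduced over batch, height and width, giving one mean/variance per channel, while dense activations are reduced over the batch axis only:

import numpy as np
import tensorflow as tf

conv_act = tf.constant(np.random.randn(32, 28, 28, 64), dtype=tf.float32)   # NHWC
dense_act = tf.constant(np.random.randn(32, 256), dtype=tf.float32)

conv_mean, conv_var = tf.nn.moments(conv_act, [0, 1, 2])    # per-channel stats, shape [64]
dense_mean, dense_var = tf.nn.moments(dense_act, [0])       # per-feature stats, shape [256]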
As of TensorFlow 1.0 (February 2017) there's also the high-level tf.layers.batch_normalization API included in TensorFlow itself.
It's super simple to use:
# Set this to True for training and False for testing
training = tf.placeholder(tf.bool)
x = tf.layers.dense(input_x, units=100)
x = tf.layers.batch_normalization(x, training=training)
x = tf.nn.relu(x)
...except that it adds extra ops to the graph (for updating its mean and variance variables) in such a way that they won't be dependencies of your training op. You can either just run the ops separately:
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
sess.run([train_op, extra_update_ops], ...)
or add the update ops as dependencies of your training op manually, then just run your training op as normal:
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_update_ops):
    train_op = optimizer.minimize(loss)
...
sess.run([train_op], ...)
The following works fine for me; it does not require invoking the EMA apply op externally.
import numpy as np
import tensorflow as tf
from tensorflow.python import control_flow_ops

def batch_norm(x, n_out, phase_train, scope='bn'):
    """
    Batch normalization on convolutional maps.
    Args:
        x:           Tensor, 4D BHWD input maps
        n_out:       integer, depth of input maps
        phase_train: boolean tf.Variable, true indicates training phase
        scope:       string, variable scope
    Return:
        normed:      batch-normalized maps
    """
    with tf.variable_scope(scope):
        beta = tf.Variable(tf.constant(0.0, shape=[n_out]),
                           name='beta', trainable=True)
        gamma = tf.Variable(tf.constant(1.0, shape=[n_out]),
                            name='gamma', trainable=True)
        batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], name='moments')
        ema = tf.train.ExponentialMovingAverage(decay=0.5)

        def mean_var_with_update():
            ema_apply_op = ema.apply([batch_mean, batch_var])
            with tf.control_dependencies([ema_apply_op]):
                return tf.identity(batch_mean), tf.identity(batch_var)

        mean, var = tf.cond(phase_train,
                            mean_var_with_update,
                            lambda: (ema.average(batch_mean), ema.average(batch_var)))
        normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
    return normed
Example:
import math

n_in, n_out = 3, 16
ksize = 3
stride = 1
phase_train = tf.placeholder(tf.bool, name='phase_train')
input_image = tf.placeholder(tf.float32, name='input_image')
kernel = tf.Variable(tf.truncated_normal([ksize, ksize, n_in, n_out],
                                         stddev=math.sqrt(2.0/(ksize*ksize*n_out))),
                     name='kernel')
conv = tf.nn.conv2d(input_image, kernel, [1, stride, stride, 1], padding='SAME')
conv_bn = batch_norm(conv, n_out, phase_train)
relu = tf.nn.relu(conv_bn)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())
    for i in range(20):
        test_image = np.random.rand(4, 32, 32, 3)
        sess_outputs = session.run([relu],
                                   {input_image.name: test_image, phase_train.name: True})
There is also an "official" batch normalization layer coded by the developers. They don't have very good docs on how to use it but here is how to use it (according to me):
from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm

def batch_norm_layer(x, train_phase, scope_bn):
    bn_train = batch_norm(x, decay=0.999, center=True, scale=True,
                          updates_collections=None,
                          is_training=True,
                          reuse=None,  # is this right?
                          trainable=True,
                          scope=scope_bn)
    bn_inference = batch_norm(x, decay=0.999, center=True, scale=True,
                              updates_collections=None,
                              is_training=False,
                              reuse=True,  # is this right?
                              trainable=True,
                              scope=scope_bn)
    z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference)
    return z
To actually use it you need to create a placeholder for train_phase that indicates whether you are in the training or inference phase (as in train_phase = tf.placeholder(tf.bool, name='phase_train')). Its value can be filled during inference or training with a tf.Session as in:
test_error = sess.run(fetches=cross_entropy, feed_dict={x: batch_xtest, y_:batch_ytest, train_phase: False})
or during training:
sess.run(fetches=train_step, feed_dict={x: batch_xs, y_:batch_ys, train_phase: True})
I'm pretty sure this is correct according to the discussion in github.
Seems there is another useful link:
http://r2rt.com/implementing-batch-normalization-in-tensorflow.html
You can simply use the built-in batch_norm layer:
batch_norm = tf.cond(is_train,
                     lambda: tf.contrib.layers.batch_norm(prev, activation_fn=tf.nn.relu, is_training=True, reuse=None),
                     lambda: tf.contrib.layers.batch_norm(prev, activation_fn=tf.nn.relu, is_training=False, reuse=True))
where prev is the output of your previous layer (can be both fully-connected or a convolutional layer) and is_train is a boolean placeholder. Just use batch_norm as the input to the next layer, then.
Since someone recently edited this, I'd like to clarify that this is no longer an issue.
This answer does not seem correct. When phase_train is set to false, it still updates the EMA mean and variance. This can be verified with the following code snippet.
x = tf.placeholder(tf.float32, [None, 20, 20, 10], name='input')
phase_train = tf.placeholder(tf.bool, name='phase_train')

# generate random noise to pass into batch norm
x_gen = tf.random_normal([50, 20, 20, 10])
pt_false = tf.Variable(tf.constant(True))

# generate a constant variable to pass into batch norm
y = x_gen.eval()

bn = batch_norm(x, 10, phase_train)
tf.initialize_all_variables().run()

train_step = lambda: bn.eval({x: x_gen.eval(), phase_train: True})
test_step = lambda: bn.eval({x: y, phase_train: False})
test_step_c = lambda: bn.eval({x: y, phase_train: True})

# Verify that this is different as expected, two different x's have different norms
print(train_step()[0][0][0])
print(train_step()[0][0][0])

# Verify that this is the same as expected, the same x (y) has the same norm
print(test_step_c()[0][0][0])
print(test_step_c()[0][0][0])

# THIS IS DIFFERENT but should be the same, it should only be reading from the ema.
print(test_step()[0][0][0])
print(test_step()[0][0][0])
Using the TensorFlow built-in batch_norm layer, below is code to load data, build a network with one hidden ReLU layer and L2 regularization, and introduce batch normalization for both the hidden and the output layer. This runs fine and trains fine. Just FYI, this example is mostly built upon the data and code from the Udacity DeepLearning course.
P.S. Yes, parts of it were discussed one way or another in earlier answers, but I decided to gather everything into one code snippet so that you have an example of the whole network training process with batch normalization and its evaluation.
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

pickle_file = '/home/maxkhk/Documents/Udacity/DeepLearningCourse/SourceCode/tensorflow/examples/udacity/notMNIST.pickle'

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

image_size = 28
num_labels = 10

def reformat(dataset, labels):
    dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
    # Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
    labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32)
    return dataset, labels

train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])

# for NeuralNetwork model code is below
# We will use SGD for training to save our time. Code is from Assignment 2
# beta is the new parameter - controls level of regularization.
# Feel free to play with it - the best one I found is 0.001
# notice, we introduce L2 for both biases and weights of all layers

batch_size = 128
beta = 0.001

# building tensorflow graph
graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # introduce batchnorm
    tf_train_dataset_bn = tf.contrib.layers.batch_norm(tf_train_dataset)

    # now let's build our new hidden layer
    # that's how many hidden neurons we want
    num_hidden_neurons = 1024

    # its weights
    hidden_weights = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_hidden_neurons]))
    hidden_biases = tf.Variable(tf.zeros([num_hidden_neurons]))

    # now the layer itself. It multiplies data by weights, adds biases
    # and takes ReLU over result
    hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset_bn, hidden_weights) + hidden_biases)

    # adding the batch normalization layer
    hidden_layer_bn = tf.contrib.layers.batch_norm(hidden_layer)

    # time to go for output linear layer
    # out weights connect hidden neurons to output labels
    # biases are added to output labels
    out_weights = tf.Variable(
        tf.truncated_normal([num_hidden_neurons, num_labels]))
    out_biases = tf.Variable(tf.zeros([num_labels]))

    # compute output
    out_layer = tf.matmul(hidden_layer_bn, out_weights) + out_biases

    # our real output is a softmax of prior result
    # and we also compute its cross-entropy to get our loss
    # Notice - we introduce our L2 here
    loss = (tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        out_layer, tf_train_labels) +
        beta * tf.nn.l2_loss(hidden_weights) +
        beta * tf.nn.l2_loss(hidden_biases) +
        beta * tf.nn.l2_loss(out_weights) +
        beta * tf.nn.l2_loss(out_biases)))

    # now we just minimize this loss to actually train the network
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # nice, now let's calculate the predictions on each dataset for evaluating the
    # performance so far
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(out_layer)
    valid_relu = tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
    valid_prediction = tf.nn.softmax(tf.matmul(valid_relu, out_weights) + out_biases)
    test_relu = tf.nn.relu(tf.matmul(tf_test_dataset, hidden_weights) + hidden_biases)
    test_prediction = tf.nn.softmax(tf.matmul(test_relu, out_weights) + out_biases)

# now is the actual training on the ANN we built
# we will run it for some number of steps and evaluate the progress after
# every 500 steps

# number of steps we will train our ANN
num_steps = 3001

# actual training
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
So a simple example of the use of this batchnorm class:
from bn_class import *

with tf.name_scope('Batch_norm_conv1') as scope:
    ewma = tf.train.ExponentialMovingAverage(decay=0.99)
    bn_conv1 = ConvolutionalBatchNormalizer(num_filt_1, 0.001, ewma, True)
    update_assignments = bn_conv1.get_assigner()
    a_conv1 = bn_conv1.normalize(a_conv1, train=bn_train)
    h_conv1 = tf.nn.relu(a_conv1)
