Changing BatchNormalization momentum while training in TensorFlow 2 - python

I want the batch normalization running statistics (mean and variance) to converge at the end of training, which requires increasing the batch norm momentum from some initial value to 1.0. I managed to change the momentum using a custom Callback, but it works only if my model is compiled in eager mode. Toy example (it sets momentum=1.0 after epoch zero, after which moving_mean should stop updating):
import tensorflow as tf  # version 2.3.1
import tensorflow_datasets as tfds

ds_train, ds_test = tfds.load("mnist", split=["train", "test"], shuffle_files=True, as_supervised=True)
ds_train = ds_train.batch(128)
ds_test = ds_test.batch(128)

model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
        tf.keras.layers.Dense(10),
    ]
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    # run_eagerly=True,
)

class BatchNormMomentumCallback(tf.keras.callbacks.Callback):
    def on_epoch_begin(self, epoch, logs=None):
        last_bn_layer = None
        for layer in self.model.layers:
            if isinstance(layer, tf.keras.layers.BatchNormalization):
                if epoch == 0:
                    layer.momentum = 0.99
                else:
                    layer.momentum = 1.0
                last_bn_layer = layer
        if last_bn_layer:
            tf.print("Momentum=" + str(last_bn_layer.moving_mean[-1].numpy()))  # Should not change after epoch 1

batchnorm_decay = BatchNormMomentumCallback()
model.fit(ds_train, epochs=6, validation_data=ds_test, callbacks=[batchnorm_decay], verbose=0)
Output (what I get with run_eagerly=False):
Momentum=0.0
Momentum=-102.20184
Momentum=-106.04614
Momentum=-116.36204
Momentum=-129.995
Momentum=-123.70443
Expected output (what I get with run_eagerly=True):
Momentum=0.0
Momentum=-5.9038606
Momentum=-5.9038606
Momentum=-5.9038606
Momentum=-5.9038606
Momentum=-5.9038606
I guess this happens because in graph mode TF compiles the model into a graph with the momentum fixed at 0.99, and then keeps using that value inside the graph (so the momentum is never updated by BatchNormMomentumCallback).
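To illustrate the suspected capture behaviour (a toy sketch of my own, separate from the model above): a plain Python value read inside a tf.function is baked in when the function is first traced, so later reassignment has no effect unless the function is retraced.
import tensorflow as tf

m = 0.99

@tf.function
def f():
    return tf.constant(m)  # the Python float is captured at trace time

print(f().numpy())  # 0.99 -- f gets traced on this first call
m = 1.0
print(f().numpy())  # still 0.99; the existing trace is reused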
Question:
Is there a way to update that compiled momentum value inside the graph while training? I want to avoid eager mode (i.e. keep run_eagerly=False) because training efficiency is important.

I would recommend simply using a custom training loop for your use case. You will have all the flexibility you need:
import tensorflow as tf  # version 2.3.1
import tensorflow_datasets as tfds

ds_train, ds_test = tfds.load("mnist", split=["train", "test"], shuffle_files=True, as_supervised=True)
ds_train = ds_train.batch(128)
ds_test = ds_test.batch(128)

model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
        tf.keras.layers.Dense(10),
    ]
)

optimizer = tf.keras.optimizers.Adam(0.001)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
batch_norm_layer = model.layers[2]

@tf.function
def train_step(epoch, model, batch):
    if epoch == 0:
        batch_norm_layer.momentum = 0.99
    else:
        batch_norm_layer.momentum = 1.0
    with tf.GradientTape() as tape:
        x_batch_train, y_batch_train = batch
        logits = model(x_batch_train, training=True)
        loss_value = loss_fn(y_batch_train, logits)
    train_acc_metric.update_state(y_batch_train, logits)
    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))

epochs = 6
for epoch in range(epochs):
    tf.print("\nStart of epoch %d" % (epoch,))
    tf.print("Momentum = ", batch_norm_layer.moving_mean[-1], summarize=-1)
    for batch in ds_train:
        train_step(epoch, model, batch)
    train_acc = train_acc_metric.result()
    tf.print("Training acc over epoch: %.4f" % (float(train_acc),))
    train_acc_metric.reset_states()
Start of epoch 0
Momentum = 0
Training acc over epoch: 0.9158
Start of epoch 1
Momentum = -20.2749767
Training acc over epoch: 0.9634
Start of epoch 2
Momentum = -20.2749767
Training acc over epoch: 0.9755
Start of epoch 3
Momentum = -20.2749767
Training acc over epoch: 0.9826
Start of epoch 4
Momentum = -20.2749767
Training acc over epoch: 0.9876
Start of epoch 5
Momentum = -20.2749767
Training acc over epoch: 0.9915
A simple test shows that the function with the @tf.function decorator performs much better:
import tensorflow as tf  # version 2.3.1
import tensorflow_datasets as tfds
import timeit

ds_train, ds_test = tfds.load("mnist", split=["train", "test"], shuffle_files=True, as_supervised=True)
ds_train = ds_train.batch(128)
ds_test = ds_test.batch(128)

model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
        tf.keras.layers.Dense(10),
    ]
)

optimizer = tf.keras.optimizers.Adam(0.001)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
batch_norm_layer = model.layers[2]

@tf.function
def train_step(epoch, model, batch):
    if epoch == 0:
        batch_norm_layer.momentum = 0.99
    else:
        batch_norm_layer.momentum = 1.0
    with tf.GradientTape() as tape:
        x_batch_train, y_batch_train = batch
        logits = model(x_batch_train, training=True)
        loss_value = loss_fn(y_batch_train, logits)
    train_acc_metric.update_state(y_batch_train, logits)
    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))

def train_step_without_tffunction(epoch, model, batch):
    if epoch == 0:
        batch_norm_layer.momentum = 0.99
    else:
        batch_norm_layer.momentum = 1.0
    with tf.GradientTape() as tape:
        x_batch_train, y_batch_train = batch
        logits = model(x_batch_train, training=True)
        loss_value = loss_fn(y_batch_train, logits)
    train_acc_metric.update_state(y_batch_train, logits)
    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))

epochs = 6
for epoch in range(epochs):
    tf.print("\nStart of epoch %d" % (epoch,))
    tf.print("Momentum = ", batch_norm_layer.moving_mean[-1], summarize=-1)
    test = True
    for batch in ds_train:
        train_step(epoch, model, batch)
        if test:
            tf.print("TF function:", timeit.timeit(lambda: train_step(epoch, model, batch), number=10))
            tf.print("Eager function:", timeit.timeit(lambda: train_step_without_tffunction(epoch, model, batch), number=10))
            test = False
    train_acc = train_acc_metric.result()
    tf.print("Training acc over epoch: %.4f" % (float(train_acc),))
    train_acc_metric.reset_states()
Start of epoch 0
Momentum = 0
TF function: 0.02285163299893611
Eager function: 0.11109527599910507
Training acc over epoch: 0.9229
Start of epoch 1
Momentum = -88.1852188
TF function: 0.024091466999379918
Eager function: 0.1109461480009486
Training acc over epoch: 0.9639
Start of epoch 2
Momentum = -88.1852188
TF function: 0.02331122400210006
Eager function: 0.11751473100230214
Training acc over epoch: 0.9756
Start of epoch 3
Momentum = -88.1852188
TF function: 0.02656845700039412
Eager function: 0.1121610670015798
Training acc over epoch: 0.9830
Start of epoch 4
Momentum = -88.1852188
TF function: 0.02821972700257902
Eager function: 0.15709391699783737
Training acc over epoch: 0.9877
Start of epoch 5
Momentum = -88.1852188
TF function: 0.02441513300072984
Eager function: 0.10921925399816246
Training acc over epoch: 0.9917

Another option is to declare the momentum as a tf.Variable:
momentum = tf.Variable(0.99, trainable=False)
# pass it into the BN layer
tf.keras.layers.BatchNormalization(momentum=momentum)
Then you can have a callback that updates the momentum:
class BNMomentumUpdate(tf.keras.callbacks.Callback):
    def __init__(self, momentum):
        super().__init__()
        self.momentum = momentum

    def on_epoch_end(self, epoch, logs=None):
        if epoch > 0:
            self.momentum.assign(1.)
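A minimal wiring sketch for this option (my own, assuming your TF version accepts a tf.Variable for the momentum argument):
momentum = tf.Variable(0.99, trainable=False)
model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128),
        tf.keras.layers.BatchNormalization(momentum=momentum),
        tf.keras.layers.ReLU(),
        tf.keras.layers.Dense(10),
    ]
)
# compile as in the question, then:
model.fit(ds_train, epochs=6, validation_data=ds_test, callbacks=[BNMomentumUpdate(momentum)], verbose=0)
Because the graph reads the momentum from the variable on each update, momentum.assign(1.) should take effect without retracing.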

Related

Why is my BERT model with a native PyTorch approach not learning?

My custom BERT model's architecture:
import torch
from torch import nn
from transformers import AutoModel  # imports added for completeness

class BertArticleClassifier(nn.Module):
    def __init__(self, n_classes, freeze_bert_weights=False):
        super(BertArticleClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        if freeze_bert_weights:
            for param in self.bert.parameters():
                param.requires_grad = False
        self.dropout = nn.Dropout(0.1)
        self.fc_1 = nn.Linear(768, 256)
        self.leaky_relu = nn.LeakyReLU()
        self.fc_out = nn.Linear(256, n_classes)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids, attention_mask)
        return self.fc_out(self.leaky_relu(self.fc_1(self.dropout(output['pooler_output']))))
self.bert is a model from the transformers library.
Training script:
def train_my_model(model, optimizer, criterion, scheduler, epochs, dataloader_train, dataloader_validation, device, pretrained_weights=None):
    if pretrained_weights:
        torch.save(model.state_dict(), pretrained_weights)
    for epoch in tqdm(range(1, epochs + 1)):
        model.train()
        loss_train_total = 0
        progress_bar = tqdm(dataloader_train, desc=f'Epoch {epoch :1d}', leave=False, disable=False)
        for batch in progress_bar:
            optimizer.zero_grad()
            batch = tuple(batch[b].to(device) for b in batch)
            input_ids, mask, labels = batch
            predictions = model(input_ids, mask)
            loss = criterion(predictions, labels)
            loss.backward()
            loss_train_total += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})
        torch.save(model.state_dict(), f'models_data/bert_my_model/finetuned_BERT_epoch_{epoch}.model')
        tqdm.write(f'\nEpoch {epoch}')
        loss_train_avg = loss_train_total / len(dataloader_train)
        tqdm.write(f'Training loss: {loss_train_avg}')
        val_loss, predictions, true_vals = evaluate(model, dataloader_validation, criterion, device)
        val_f1 = f1_score_func(predictions, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (Weighted): {val_f1}')
Optimizer and Criterion:
optimizer = AdamW(model.parameters(),
                  lr=1e-4,
                  eps=1e-6)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights).to(device)
After 5 epochs I get the same validation loss, ~3.1. I know my data is preprocessed correctly, because if I train the transformers BertForSequenceClassification model instead, the model learns; the problem with that approach is that I cannot tweak its loss function to accept class weights, which is the reason I created my own custom model.
As you can see in the model's forward method, I extract the output['pooler_output'] element and disregard the loss (which is returned alongside the output['pooler_output'] element). The problem, as far as I can deduce, is that when I call loss.backward() in the training loop, maybe the model's weights aren't updating, because transformers BERT models return their own loss as an output.
What am I doing wrong?
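One way to test that hypothesis is to check whether a weight actually changes after an optimizer step. A hedged diagnostic sketch (my own; input_ids, mask, and labels are assumed to be one prepared batch on the right device):
# Hypothetical diagnostic: does one training step actually change the weights?
param = next(model.fc_1.parameters())
before = param.detach().clone()  # snapshot the weight before the step

optimizer.zero_grad()
loss = criterion(model(input_ids, mask), labels)
loss.backward()
optimizer.step()

print(torch.equal(before, param.detach()))  # False means the weights are updating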

How to find train_losses and val_losses in TensorFlow, Neural machine translation with attention

I am learning neural machine translation from this tutorial:
https://www.tensorflow.org/tutorials/text/nmt_with_attention#restore_the_latest_checkpoint_and_test
But it seems there are no train_losses and val_losses in the tutorial (only batch_loss).
Is there any way to get a loss-value history like we do with other models, e.g.:
train_loss = seqModel.history['loss']
val_loss = seqModel.history['val_loss']
train_acc = seqModel.history['acc']
val_acc = seqModel.history['val_acc']
Actually, the tutorial does compute it. When they use
for epoch in range(EPOCHS):
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
they're computing the training loss that comes out of the train_step method. But there is no validation set, so no validation loss is shown.
Based on your comment, you need to write a test_step function and use it in the training loop. Here is a minimal version to get the validation loss:
@tf.function
def test_step(inp, targ, enc_hidden):
    loss = 0
    enc_output, enc_hidden = encoder(inp, enc_hidden, training=False)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
    for t in range(1, targ.shape[1]):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden,
                                             enc_output, training=False)
        loss += loss_function(targ[:, t], predictions)
        dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    return batch_loss
To use it in the custom training loop, do as follows. Note that I'm using the same dataset here, but in practice you need a separate validation dataset.
EPOCHS = 5
history = {'loss': [], 'val_loss': []}

for epoch in range(EPOCHS):
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()

    total_loss = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
    history['loss'].append(total_loss.numpy() / steps_per_epoch)
    print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')

    total_loss = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = test_step(inp, targ, enc_hidden)
        total_loss += batch_loss
    history['val_loss'].append(total_loss.numpy() / steps_per_epoch)
    print(f'Epoch {epoch+1} Val Loss {total_loss/steps_per_epoch:.4f}')

    print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')
Afterwards, you can access the logged values:
history['loss']
history['val_loss']
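And since the loop above reuses the training data for validation, here is a hedged sketch of carving out a real held-out split with tf.data (assuming dataset and steps_per_epoch come from the tutorial; the 20% proportion is arbitrary):
# Hold out roughly 20% of the batches for validation.
val_steps = steps_per_epoch // 5
val_dataset = dataset.take(val_steps)
train_dataset = dataset.skip(val_steps)
# Then iterate train_dataset in the training loop and val_dataset in the
# validation loop, adjusting the step counts accordingly.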

TensorFlow random segmentation faults

I am trying to run the demo code from the official TensorFlow website.
I am attaching the full code (copied and rearranged) here for convenience:
import tensorflow as tf
# print("1")
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import time
import os
# print("2")

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# @tf.function
def train_step(x, y):
    with tf.GradientTape() as tape:
        logits = model(x, training=True)
        loss_value = loss_fn(y, logits)
    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    train_acc_metric.update_state(y, logits)
    return loss_value

# @tf.function
def test_step(x, y):
    val_logits = model(x, training=False)
    val_acc_metric.update_state(y, val_logits)

inputs = keras.Input(shape=(784,), name="digits")
x1 = layers.Dense(64, activation="relu")(inputs)
x2 = layers.Dense(64, activation="relu")(x1)
outputs = layers.Dense(10, name="predictions")(x2)
model = keras.Model(inputs=inputs, outputs=outputs)

# Instantiate an optimizer.
optimizer = keras.optimizers.SGD(learning_rate=1e-3)
# Instantiate a loss function.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

train_acc_metric = keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = keras.metrics.SparseCategoricalAccuracy()

# Prepare the training dataset.
batch_size = 64
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
x_train = np.reshape(x_train, (-1, 784))
x_test = np.reshape(x_test, (-1, 784))

# Reserve 10,000 samples for validation.
x_val = x_train[-10000:]
y_val = y_train[-10000:]
x_train = x_train[:-10000]
y_train = y_train[:-10000]

# Prepare the training dataset.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

# Prepare the validation dataset.
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(batch_size)

epochs = 2
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()
    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        loss_value = train_step(x_batch_train, y_batch_train)
        # Log every 200 batches.
        if step % 200 == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )
            print("Seen so far: %d samples" % ((step + 1) * 64))
    # Display metrics at the end of each epoch.
    train_acc = train_acc_metric.result()
    print("Training acc over epoch: %.4f" % (float(train_acc),))
    # Reset training metrics at the end of each epoch
    train_acc_metric.reset_states()
    # Run a validation loop at the end of each epoch.
    for x_batch_val, y_batch_val in val_dataset:
        test_step(x_batch_val, y_batch_val)
    val_acc = val_acc_metric.result()
    val_acc_metric.reset_states()
    print("Validation acc: %.4f" % (float(val_acc),))
    print("Time taken: %.2fs" % (time.time() - start_time))
print("end")
For no apparent reason, this code hits a segmentation fault in TensorFlow 2.3.1 right at the beginning:
>python dummy.py
2021-03-11 17:45:52.231509: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
Segmentation fault (core dumped)
Interestingly, if I put some random print statements at the very start (those print("1") etc. statements), the code executes to the end and only then hits the segmentation fault (redundant output not shown):
Start of epoch 1
Training loss (for one batch) at step 0: 1.0215
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.9116
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.4894
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.5636
Seen so far: 38464 samples
Training acc over epoch: 0.8416
Validation acc: 0.8296
Time taken: 3.16s
end
Segmentation fault (core dumped)
Another observation: if I uncomment the @tf.function decorators on top of my train_step and test_step functions, the code segfaults again, but only after it prints
Start of epoch 0
Can someone explain what is going wrong with my TensorFlow package?
It was due to an older version of Ubuntu. I was using 14; after upgrading to 18, the issue was resolved.

TensorFlow Eager execution won't work

I've been trying to replicate the Eager Execution tutorial using the MNIST dataset, but it doesn't work. I get no error, but it looks like the code just stalls in the first round; the output is
Epoch 000: Loss: nan, Accuracy: 9.898%
I've tested other MNIST examples and they progressed much faster... (I waited for about 30 min)
import numpy as np
import tensorflow as tf

tf.enable_eager_execution()

# Load dataset
mnist = tf.contrib.learn.datasets.load_dataset('mnist')
train_data = mnist.train.images
train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
test_data = mnist.test.images
test_labels = np.asarray(mnist.test.labels, dtype=np.int32)

train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
train_dataset = train_dataset.shuffle(buffer_size=100000)
train_dataset = train_dataset.batch(10)

features, label = iter(train_dataset).next()
print("example features:", features[0])
print("example label:", label[0])

model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation="relu", input_shape=(784,)),  # input shape required
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(3)
])

def loss(model, x, y):
    y_ = model(x)
    return tf.losses.sparse_softmax_cross_entropy(labels=y, logits=y_)

def grad(model, inputs, targets):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    return tape.gradient(loss_value, model.variables)

optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)

train_loss_results = []
train_accuracy_results = []
num_epochs = 200
for epoch in range(num_epochs):
    epoch_loss_avg = tf.contrib.eager.metrics.Mean()
    epoch_accuracy = tf.contrib.eager.metrics.Accuracy()
    for x, y in train_dataset:
        grads = grad(model, x, y)
        optimizer.apply_gradients(zip(grads, model.variables),
                                  global_step=tf.train.get_or_create_global_step())
        epoch_loss_avg(loss(model, x, y))
        epoch_accuracy(tf.argmax(model(x), axis=1, output_type=tf.int32), y)
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())
    if epoch % 50 == 0:
        print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(epoch,
                                                                    epoch_loss_avg.result(),
                                                                    epoch_accuracy.result()))
PS: I know the model probably won't be efficient; it's just a test run.

How to tell Keras to stop training based on loss value?

Currently I use the following code:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2, verbose=0),
    ModelCheckpoint(kfold_weights_path, monitor='val_loss', save_best_only=True, verbose=0),
]
model.fit(X_train.astype('float32'), Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
          shuffle=True, verbose=1, validation_data=(X_valid, Y_valid),
          callbacks=callbacks)
It tells Keras to stop training when the loss hasn't improved for 2 epochs. But I want to stop training once the loss becomes smaller than some constant THR:
if val_loss < THR:
    break
I've seen in the documentation that it is possible to make your own callback:
http://keras.io/callbacks/
But I found nothing on how to stop the training process. I need advice.
I found the answer. I looked into the Keras sources and found the code for EarlyStopping. Based on it, I made my own callback:
class EarlyStoppingByLossVal(Callback):
    def __init__(self, monitor='val_loss', value=0.00001, verbose=0):
        super(Callback, self).__init__()
        self.monitor = monitor
        self.value = value
        self.verbose = verbose

    def on_epoch_end(self, epoch, logs={}):
        current = logs.get(self.monitor)
        if current is None:
            warnings.warn("Early stopping requires %s available!" % self.monitor, RuntimeWarning)
            return  # avoid comparing None against the threshold below
        if current < self.value:
            if self.verbose > 0:
                print("Epoch %05d: early stopping THR" % epoch)
            self.model.stop_training = True
And usage:
callbacks = [
    EarlyStoppingByLossVal(monitor='val_loss', value=0.00001, verbose=1),
    # EarlyStopping(monitor='val_loss', patience=2, verbose=0),
    ModelCheckpoint(kfold_weights_path, monitor='val_loss', save_best_only=True, verbose=0),
]
model.fit(X_train.astype('float32'), Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
          shuffle=True, verbose=1, validation_data=(X_valid, Y_valid),
          callbacks=callbacks)
The keras.callbacks.EarlyStopping callback does have a min_delta argument. From the Keras documentation:
min_delta: minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta, will count as no improvement.
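For example (a hedged usage sketch; the values are illustrative):
# Stop when val_loss fails to improve by at least 0.001 for 2 consecutive epochs.
EarlyStopping(monitor='val_loss', min_delta=0.001, patience=2, verbose=0)
This is close to, but not quite the same as, stopping at an absolute threshold.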
One solution is to call model.fit(nb_epoch=1, ...) inside a for loop; then you can put a break statement inside the loop and do whatever other custom control flow you want, as in the sketch below.
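A minimal sketch of that idea (my own, reusing the names from the question; THR is the stopping threshold):
for epoch in range(nb_epoch):
    hist = model.fit(X_train.astype('float32'), Y_train, batch_size=batch_size, nb_epoch=1,
                     shuffle=True, verbose=1, validation_data=(X_valid, Y_valid))
    if hist.history['val_loss'][-1] < THR:
        break  # stop as soon as the validation loss drops below the threshold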
I solved the same problem using a custom callback.
In the following custom callback code, assign THR the value at which you want to stop training, and add the callback to your model.
from keras.callbacks import Callback

class stopAtLossValue(Callback):
    def on_batch_end(self, batch, logs={}):
        THR = 0.03  # Assign THR the value at which you want to stop training.
        if logs.get('loss') <= THR:
            self.model.stop_training = True
While I was taking the TensorFlow in Practice specialization, I learned a very elegant technique, just slightly modified from the accepted answer.
Let's set up the example with our favorite MNIST data.
import tensorflow as tf

class new_callback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):  # note: the hook must be named on_epoch_end
        if logs.get('accuracy') > 0.90:  # select the accuracy
            print("\n !!! 90% accuracy, no further training !!!")
            self.model.stop_training = True

mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # normalize

callbacks = new_callback()
# model = tf.keras.models.Sequential([...])  # define your model here
model.compile(optimizer=tf.optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10, callbacks=[callbacks])
So, here I set metrics=['accuracy'], and accordingly in the callback class the condition is set on 'accuracy' > 0.90.
You can choose any metric and monitor the training like in this example. Most importantly, you can set different conditions for different metrics and use them simultaneously, as in the sketch below.
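For instance (a hedged sketch of my own; the thresholds are illustrative, and both metrics are assumed to be present in logs):
class multi_condition_callback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        # Stop only once accuracy is high enough AND the loss is low enough.
        if logs.get('accuracy', 0) > 0.90 and logs.get('loss', float('inf')) < 0.30:
            self.model.stop_training = True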
Hopefully this helps!
For me, the model would only stop training if I added a return statement after setting the stop_training parameter to True, because I was calling self.model.evaluate afterwards. So either make sure stop_training = True is at the end of the function, or add a return statement.
def on_epoch_end(self, batch, logs):
    self.epoch += 1
    self.stoppingCounter += 1
    print('\nstopping counter \n', self.stoppingCounter)
    # Stop training if there hasn't been any improvement in 'patience' epochs
    if self.stoppingCounter >= self.patience:
        self.model.stop_training = True
        return
    # Test on an additional set if there is one
    if self.testingOnAdditionalSet:
        evaluation = self.model.evaluate(self.val2X, self.val2Y, verbose=0)
        self.validationLoss2.append(evaluation[0])
        self.validationAcc2.append(evaluation[1])
If you're using a custom training loop, you can use a collections.deque, which is a "rolling" list that can be appended to, with the left-hand items popped out once the list gets longer than maxlen. Here's the key part:
loss_history = deque(maxlen=early_stopping + 1)

for epoch in range(epochs):
    fit(epoch)
    loss_history.append(test_loss.result().numpy())
    if len(loss_history) > early_stopping and loss_history.popleft() < min(loss_history):
        break
Here's a full example:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.keras.layers import Dense
from collections import deque

data, info = tfds.load('iris', split='train', as_supervised=True, with_info=True)
data = data.map(lambda x, y: (tf.cast(x, tf.int32), y))
train_dataset = data.take(120).batch(4)
test_dataset = data.skip(120).take(30).batch(4)

model = tf.keras.models.Sequential([
    Dense(8, activation='relu'),
    Dense(16, activation='relu'),
    Dense(info.features['label'].num_classes)])

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

train_loss = tf.keras.metrics.Mean()
test_loss = tf.keras.metrics.Mean()

train_acc = tf.keras.metrics.SparseCategoricalAccuracy()
test_acc = tf.keras.metrics.SparseCategoricalAccuracy()

opt = tf.keras.optimizers.Adam(learning_rate=1e-3)

@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        logits = model(inputs, training=True)
        loss = loss_object(labels, logits)
    gradients = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_acc(labels, logits)

@tf.function
def test_step(inputs, labels):
    logits = model(inputs, training=False)
    loss = loss_object(labels, logits)
    test_loss(loss)
    test_acc(labels, logits)

def fit(epoch):
    template = 'Epoch {:>2} Train Loss {:.3f} Test Loss {:.3f} ' \
               'Train Acc {:.2f} Test Acc {:.2f}'

    train_loss.reset_states()
    test_loss.reset_states()
    train_acc.reset_states()
    test_acc.reset_states()

    for X_train, y_train in train_dataset:
        train_step(X_train, y_train)

    for X_test, y_test in test_dataset:
        test_step(X_test, y_test)

    print(template.format(
        epoch + 1,
        train_loss.result(),
        test_loss.result(),
        train_acc.result(),
        test_acc.result()
    ))

def main(epochs=50, early_stopping=10):
    loss_history = deque(maxlen=early_stopping + 1)
    for epoch in range(epochs):
        fit(epoch)
        loss_history.append(test_loss.result().numpy())
        if len(loss_history) > early_stopping and loss_history.popleft() < min(loss_history):
            print(f'\nEarly stopping. No validation loss '
                  f'improvement in {early_stopping} epochs.')
            break

if __name__ == '__main__':
    main(epochs=250, early_stopping=10)
Epoch 1 Train Loss 1.730 Test Loss 1.449 Train Acc 0.33 Test Acc 0.33
Epoch 2 Train Loss 1.405 Test Loss 1.220 Train Acc 0.33 Test Acc 0.33
Epoch 3 Train Loss 1.173 Test Loss 1.054 Train Acc 0.33 Test Acc 0.33
Epoch 4 Train Loss 1.006 Test Loss 0.935 Train Acc 0.33 Test Acc 0.33
Epoch 5 Train Loss 0.885 Test Loss 0.846 Train Acc 0.33 Test Acc 0.33
...
Epoch 89 Train Loss 0.196 Test Loss 0.240 Train Acc 0.89 Test Acc 0.87
Epoch 90 Train Loss 0.195 Test Loss 0.239 Train Acc 0.89 Test Acc 0.87
Epoch 91 Train Loss 0.195 Test Loss 0.239 Train Acc 0.89 Test Acc 0.87
Epoch 92 Train Loss 0.194 Test Loss 0.239 Train Acc 0.90 Test Acc 0.87
Early stopping. No validation loss improvement in 10 epochs.
