TensorFlow Callback as Custom Metric for CTC (Python)

In an attempt to yield more metrics during the training of my model (written in TensorFlow version 2.1.0), like the Character Error Rate (CER) and Word Error Rate (WER), I created a callback to pass to the fit function of my model. It is able to generate the CER and WER at the end of an epoch.
This is my second choice, as I originally wanted to create a custom metric for this, but you can only use Keras backend functionality for custom metrics. Does anyone have any advice on how to convert the callback below into a custom metric (which can then be calculated during training on the validation and/or training data)?
Some roadblocks I encountered are:
Failure to convert the K.ctc_decode result to a sparse tensor
How can you calculate a distance like edit-distance using the Keras backend?
import editdistance
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K

class Metrics(tf.keras.callbacks.Callback):
    def __init__(self, valid_data, steps):
        """
        valid_data is a TFRecordDataset with batches of 100 elements per batch,
        shuffled and repeated infinitely.
        steps defines the number of batches per epoch.
        """
        super(Metrics, self).__init__()
        self.valid_data = valid_data
        self.steps = steps

    def on_train_begin(self, logs={}):
        self.cer = []
        self.wer = []

    def on_epoch_end(self, epoch, logs={}):
        imgs = []
        labels = []
        for idx, (img, label) in enumerate(self.valid_data.as_numpy_iterator()):
            if idx >= self.steps:
                break
            imgs.append(img)
            labels.extend(label)

        imgs = np.array(imgs)
        labels = np.array(labels)

        out = self.model.predict((batch for batch in imgs))
        input_length = len(max(out, key=len))

        out = np.asarray(out)
        out_len = np.asarray([input_length for _ in range(len(out))])

        decode, log = K.ctc_decode(out,
                                   out_len,
                                   greedy=True)

        decode = [[[int(p) for p in x if p != -1] for x in y] for y in decode][0]

        for (pred, lab) in zip(decode, labels):
            dist = editdistance.eval(pred, lab)
            self.cer.append(dist / (max(len(pred), len(lab))))
            self.wer.append(not np.array_equal(pred, lab))

        print("Mean CER: {}".format(np.mean([self.cer], axis=1)[0]))
        print("Mean WER: {}".format(np.mean([self.wer], axis=1)[0]))

Solved in TF 2.3.1, but should apply for previous versions of 2.x as well.
Some remarks:
Information on how to properly implement a TensorFlow custom metric is scarce. The question implied the use of a callback to implement the metric. This has longer epochs as a consequence (due to the explicit extra calculation of the metric in on_epoch_end), or so I believe. Implementing it as a subclass of tensorflow.keras.metrics.Metric seems the right way, and yields results while the epoch is ongoing (if verbose is set correctly).
Calculating the edit distance for the CER is quite easily performed using tf.edit_distance (on sparse tensors); this can subsequently be used to calculate the WER with some tf logic.
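For illustration, here is a minimal standalone sketch of that idea on toy tensors (not the full metric; label indices are assumed to start at 1 because tf.sparse.from_dense drops zero entries — the metric below uses K.ctc_label_dense_to_sparse instead, which avoids that caveat):

import tensorflow as tf

# Two decoded label sequences vs. ground truth, padded with -1.
decoded = tf.constant([[1, 2, 3, -1], [4, 5, -1, -1]], dtype=tf.int64)
truth = tf.constant([[1, 2, 4, -1], [4, 5, 6, -1]], dtype=tf.int64)

def to_sparse(dense):
    sparse = tf.sparse.from_dense(dense)
    # strip the -1 padding so it does not count as characters
    return tf.sparse.retain(sparse, tf.not_equal(sparse.values, -1))

# normalize=True divides each distance by the truth length, i.e. the per-sample CER
cer_per_sample = tf.edit_distance(to_sparse(decoded), to_sparse(truth), normalize=True)
print(cer_per_sample.numpy())  # roughly [0.33, 0.33] for this toy example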
Alas, I have yet to find out how to implement both the CER and WER in one metric (as it would have quite some duplicate code); if anyone knows how to do so, please contact me.
Custom metrics can simply be added into the compilation of your TF model:
self.model.compile(optimizer=opt, loss=loss, metrics=[CERMetric(), WERMetric()])
class CERMetric(tf.keras.metrics.Metric):
    """
    A custom Keras metric to compute the Character Error Rate
    """
    def __init__(self, name='CER_metric', **kwargs):
        super(CERMetric, self).__init__(name=name, **kwargs)
        self.cer_accumulator = self.add_weight(name="total_cer", initializer="zeros")
        self.counter = self.add_weight(name="cer_count", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        input_shape = K.shape(y_pred)
        input_length = tf.ones(shape=input_shape[0]) * K.cast(input_shape[1], 'float32')

        decode, log = K.ctc_decode(y_pred,
                                   input_length,
                                   greedy=True)

        decode = K.ctc_label_dense_to_sparse(decode[0], K.cast(input_length, 'int32'))
        y_true_sparse = K.ctc_label_dense_to_sparse(y_true, K.cast(input_length, 'int32'))

        decode = tf.sparse.retain(decode, tf.not_equal(decode.values, -1))
        distance = tf.edit_distance(decode, y_true_sparse, normalize=True)

        self.cer_accumulator.assign_add(tf.reduce_sum(distance))
        self.counter.assign_add(len(y_true))

    def result(self):
        return tf.math.divide_no_nan(self.cer_accumulator, self.counter)

    def reset_states(self):
        self.cer_accumulator.assign(0.0)
        self.counter.assign(0.0)
class WERMetric(tf.keras.metrics.Metric):
    """
    A custom Keras metric to compute the Word Error Rate
    """
    def __init__(self, name='WER_metric', **kwargs):
        super(WERMetric, self).__init__(name=name, **kwargs)
        self.wer_accumulator = self.add_weight(name="total_wer", initializer="zeros")
        self.counter = self.add_weight(name="wer_count", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        input_shape = K.shape(y_pred)
        input_length = tf.ones(shape=input_shape[0]) * K.cast(input_shape[1], 'float32')

        decode, log = K.ctc_decode(y_pred,
                                   input_length,
                                   greedy=True)

        decode = K.ctc_label_dense_to_sparse(decode[0], K.cast(input_length, 'int32'))
        y_true_sparse = K.ctc_label_dense_to_sparse(y_true, K.cast(input_length, 'int32'))

        decode = tf.sparse.retain(decode, tf.not_equal(decode.values, -1))
        distance = tf.edit_distance(decode, y_true_sparse, normalize=True)

        # Despite its name, this counts the sequences with at least one error
        # (non-zero edit distance), which is what the WER needs.
        correct_words_amount = tf.reduce_sum(tf.cast(tf.not_equal(distance, 0), tf.float32))

        self.wer_accumulator.assign_add(correct_words_amount)
        self.counter.assign_add(len(y_true))

    def result(self):
        return tf.math.divide_no_nan(self.wer_accumulator, self.counter)

    def reset_states(self):
        self.wer_accumulator.assign(0.0)
        self.counter.assign(0.0)

"Alas, I have yet to find out how to implement both the CER and WER in one metric (as it has quite some duplicate code); if anyone knows how to do so, please contact me."
Hey, this solution really helped me a lot. TensorFlow 2.10 is out now, so for that version I wrote a combined WER and CER metric; here is the final working code:
import tensorflow as tf

class CWERMetric(tf.keras.metrics.Metric):
    """A custom TensorFlow metric to compute both the Character Error Rate and the Word Error Rate."""
    def __init__(self, name='CWER', **kwargs):
        super(CWERMetric, self).__init__(name=name, **kwargs)
        self.cer_accumulator = tf.Variable(0.0, name="cer_accumulator", dtype=tf.float32)
        self.wer_accumulator = tf.Variable(0.0, name="wer_accumulator", dtype=tf.float32)
        self.counter = tf.Variable(0, name="counter", dtype=tf.int32)

    def update_state(self, y_true, y_pred, sample_weight=None):
        input_shape = tf.keras.backend.shape(y_pred)
        input_length = tf.ones(shape=input_shape[0], dtype='int32') * tf.cast(input_shape[1], 'int32')

        decode, log = tf.keras.backend.ctc_decode(y_pred, input_length, greedy=True)

        decode = tf.keras.backend.ctc_label_dense_to_sparse(decode[0], input_length)
        y_true_sparse = tf.cast(tf.keras.backend.ctc_label_dense_to_sparse(y_true, input_length), "int64")

        decode = tf.sparse.retain(decode, tf.not_equal(decode.values, -1))
        distance = tf.edit_distance(decode, y_true_sparse, normalize=True)

        correct_words_amount = tf.reduce_sum(tf.cast(tf.not_equal(distance, 0), tf.float32))

        self.wer_accumulator.assign_add(correct_words_amount)
        self.cer_accumulator.assign_add(tf.reduce_sum(distance))
        self.counter.assign_add(len(y_true))

    def result(self):
        return {
            "CER": tf.math.divide_no_nan(self.cer_accumulator, tf.cast(self.counter, tf.float32)),
            "WER": tf.math.divide_no_nan(self.wer_accumulator, tf.cast(self.counter, tf.float32)),
        }
I still need to check whether it calculates CER and WER correctly; if I find that something is missing, I'll update this.
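For reference, a minimal sketch of wiring this combined metric into compilation (the model, optimizer, loss, and datasets here are placeholders, not from the original post); recent TF 2.x releases report the dict returned by result() as separate CER and WER entries in the training log:

# Hypothetical usage; `model`, `ctc_loss_fn`, and the datasets stand in for your own CTC setup.
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=ctc_loss_fn,
              metrics=[CWERMetric()])
model.fit(train_dataset, validation_data=valid_dataset, epochs=10)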

Related

Keras model on GPU: using Pandas in a custom loss function

I'm trying to define the following (toy) custom loss function in Keras:
import pandas as pd
from tensorflow.keras.losses import categorical_crossentropy

def flexed_distance_loss(y_true, y_pred):
    y_true_df = pd.DataFrame(y_true, columns=my_columns)

    # do something with y_true_df

    return categorical_crossentropy(y_true_df.values, y_pred)
I'm running this model on GPU with tf.distribute.MirroredStrategy().
Compiling the model generates no error, but when running model.fit(), the following error happens:
>>> y_true_df = pd.DataFrame(y_true, columns=my_columns)
OperatorNotAllowedInGraphError: iterating over `tf.Tensor` is not allowed:
AutoGraph did convert this function. This might indicate you are trying to use an unsupported feature.
It seems that Pandas is trying to iterate over the tensor y_true, which is forbidden in graph mode (the preferred mode when training on GPU).
Must I understand that this is not possible to use Pandas within a loss function when training on GPU?
What would be some plausible alternatives, other than doing all the manipulations directly in TensorFlow itself? I'm doing quite some heavy re-indexing and merging and I can't begin to imagine the pain of doing all this in native TensorFlow code.
Note:
For reference, this is the kind of manipulation I'm trying to make:
def flexed_distance_loss(y_true, y_pred):
    y_true_df = pd.DataFrame(y_true, columns=my_columns)
    y_true_custom = y_true_df.idxmax(axis=1).to_frame(name='my_name')
    y_true_df = pd.concat([y_true_custom, y_true_df], axis=1)
    y_true_df = y_true_df.where(y_true_df != 0, np.NaN)
    y_true_df = y_true_df.reset_index().set_index('my_name')

    nearby = y_true_df.fillna(pivoted_df.reindex(y_true_df.index)) \
                      .fillna(0) \
                      .set_index('index').sort_index()

    nearby = np.expm1(nearby).div(np.sum(np.expm1(nearby), axis=1), axis=0)
    y_true_flexed = nearby.values

    return categorical_crossentropy(y_true_flexed, y_pred)
Actually I realised that all I'm doing within the custom loss function is transforming y_true. In the real case, I'm transforming it based on some random number (if random.random() > 0.1 then apply the transformation).
The most appropriate place to do this is not in a loss function, but in the batch generator instead.
import math
import random
import tensorflow as tf

class BatchGenerator(tf.keras.utils.Sequence):
    def __init__(self, indices, batch_size, mode):
        self.indices = indices
        self.batch_size = batch_size
        self.mode = mode

    def __len__(self):
        return math.ceil(len(self.indices) / self.batch_size)

    def __getitem__(self, idx):
        batch = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        X_batch = X[batch, :]
        y_batch = y[batch, :]

        if self.mode == 'train' and random.random() > 0.3:
            # pick y from regular batch
            return X_batch, y_batch
        else:
            # apply flex-distancing to y
            return X_batch, flex_distance_batch(y_batch)

batch_size = 512 * 4

train_generator = BatchGenerator(range(0, test_cutoff), batch_size, 'train')
test_generator = BatchGenerator(range(test_cutoff, len(y_df)), batch_size, 'test')
This way the transformations are applied directly in the batch generator, and Pandas is perfectly allowed here, as we're dealing only with NumPy arrays on the CPU.
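For completeness, a hedged sketch of how such a Sequence is typically consumed (the compiled `model` is a placeholder, not from the original answer):

# Keras pulls batches from the Sequence on the CPU, so the Pandas/NumPy work in
# flex_distance_batch never has to run inside the TF graph.
model.fit(train_generator,
          validation_data=test_generator,
          epochs=50)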

Loss on dev set is always increasing unlike training set loss

I designed a network for a text classification problem. To do this, I'm using Hugging Face Transformers' BERT model with a linear layer on top for fine-tuning. My problem is that the loss on the training set is decreasing, which is fine, but when it comes to the evaluation after each epoch on the development set, the loss is increasing with epochs. I'm posting my code so you can investigate whether there's something wrong with it.
for epoch in range(1, args.epochs + 1):
    total_train_loss = 0
    trainer.set_train()
    for step, batch in enumerate(train_dataloader):
        loss = trainer.step(batch)
        total_train_loss += loss
    avg_train_loss = total_train_loss / len(train_dataloader)
    logger.info(('Training loss for epoch %d/%d: %4.2f') % (epoch, args.epochs, avg_train_loss))
    print("\n-------------------------------")
    logger.info('Start validation ...')
    trainer.set_eval()
    y_hat = list()
    y = list()
    total_dev_loss = 0
    for step, batch_val in enumerate(dev_dataloader):
        true_labels_ids, predicted_labels_ids, loss = trainer.validate(batch_val)
        total_dev_loss += loss
        y.extend(true_labels_ids)
        y_hat.extend(predicted_labels_ids)
    avg_dev_loss = total_dev_loss / len(dev_dataloader)
    print(("\n-Total dev loss: %4.2f on epoch %d/%d\n") % (avg_dev_loss, epoch, args.epochs))
print("Training terminated!")
Following is the trainer file, which I use for doing a forward pass on a given batch and then backpropagate accordingly.
class Trainer(object):
    def __init__(self, args, model, device, data_points, is_test=False, train_stats=None):
        self.args = args
        self.model = model
        self.device = device
        self.loss = nn.CrossEntropyLoss(reduction='none')
        if is_test:
            # Should load the model from checkpoint
            self.model.eval()
            self.model.load_state_dict(torch.load(args.saved_model))
            logger.info('Loaded saved model from %s' % args.saved_model)
        else:
            self.model.train()
            self.optim = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
            total_steps = data_points * self.args.epochs
            self.scheduler = get_linear_schedule_with_warmup(self.optim, num_warmup_steps=0,
                                                             num_training_steps=total_steps)

    def step(self, batch):
        batch = tuple(t.to(self.device) for t in batch)
        batch_input_ids, batch_input_masks, batch_labels = batch
        self.model.zero_grad()
        outputs = self.model(batch_input_ids,
                             attention_mask=batch_input_masks,
                             labels=batch_labels)
        loss = self.loss(outputs, batch_labels)
        loss = loss.sum()
        (loss / loss.numel()).backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.optim.step()
        self.scheduler.step()
        return loss

    def validate(self, batch):
        batch = tuple(t.to(self.device) for t in batch)
        batch_input_ids, batch_input_masks, batch_labels = batch
        with torch.no_grad():
            model_output = self.model(batch_input_ids,
                                      attention_mask=batch_input_masks,
                                      labels=batch_labels)
            predicted_label_ids = self._predict(model_output)
            label_ids = batch_labels.to('cpu').numpy()
            loss = self.loss(model_output, batch_labels)
            loss = loss.sum()
        return label_ids, predicted_label_ids, loss

    def _predict(self, logits):
        return np.argmax(logits.to('cpu').numpy(), axis=1)
Finally, the following is my model (i.e., Classifier) class:
import torch.nn as nn
from transformers import BertModel

class Classifier(nn.Module):
    def __init__(self, args, is_eval=False):
        super(Classifier, self).__init__()
        self.bert_model = BertModel.from_pretrained(
            args.init_checkpoint,
            output_attentions=False,
            output_hidden_states=True,
        )
        self.is_eval_mode = is_eval
        self.linear = nn.Linear(768, 2)  # binary classification

    def switch_state(self):
        self.is_eval_mode = not self.is_eval_mode

    def forward(self, input_ids, attention_mask=None, labels=None):
        bert_outputs = self.bert_model(input_ids,
                                       token_type_ids=None,
                                       attention_mask=attention_mask)
        # Should give the pooled output to the linear layer
        model_output = self.linear(bert_outputs[1])
        return model_output
For visualization, here is the loss throughout the epochs (plot not reproduced here).
When I've used BERT for text classification, my model has generally behaved as you describe. In part this is expected, because pre-trained models tend to require few epochs to fine-tune; in fact, if you check BERT's paper, the number of epochs recommended for fine-tuning is between 2 and 4.
On the other hand, I've usually found the optimum at just 1 or 2 epochs, which coincides with your case as well. My guess is: there is a trade-off when fine-tuning pre-trained models between fitting to your downstream task and forgetting the weights learned at pre-training. Depending on the data you have, the equilibrium point may happen sooner or later, and overfitting starts after that. But this paragraph is speculation based on my experience.
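One way to act on this (a sketch of my own, not part of the original answer) is to early-stop on the dev loss. A minimal helper, assuming you collect avg_dev_loss per epoch into a list:

def should_stop(dev_losses, patience=2):
    """Return True once the dev loss has not improved for `patience` epochs."""
    if len(dev_losses) <= patience:
        return False
    best_before = min(dev_losses[:-patience])
    return min(dev_losses[-patience:]) >= best_before

# Example shaped like the curves described above: training loss keeps falling
# while the dev loss bottoms out early and then rises.
print(should_stop([0.52, 0.41, 0.45, 0.49]))  # True -> stop and keep the epoch-2 checkpoint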
When validation loss increases, it means your model is overfitting.

Custom loss function on Keras

I have a dataset containing a matrix of features X and a matrix of labels y of size N where each element y_i belongs to [0,1]. I have the following loss function
where g(.) is a function that depends on the input matrix X.
I know that a Keras custom loss function has to be of the form customLoss(y_true, y_predicted); however, I'm having difficulties incorporating the term g(X) into the loss function, since it depends on the input matrix.
For each data point in my dataset, my input is of the form X_i = (H, P) where these two parameters are matrices and the function g is defined for each data point as g(X_i) = H x P. Can I pass a = (H, P) in the loss function since this depends on each example or do I need to pass all the matrices at once by concatenating them?
Edit (based on Daniel's answer):
original_model_inputs = keras.layers.Input(shape=X_train.shape[1])
y_true_inputs = keras.layers.Input(shape=y_train.shape[1])

hidden1 = keras.layers.Dense(256, activation="relu")(original_model_inputs)
hidden2 = keras.layers.Dense(128, activation="relu")(hidden1)
output = keras.layers.Dense(K)(hidden2)

def lambdaLoss(x):
    yTrue, yPred, alpha = x
    return (K.log(yTrue) - K.log(yPred))**2 + alpha * yPred

loss = Lambda(lambdaLoss)(y_true_inputs, output, a)

model = Keras.Model(inputs=[original_model_inputs, y_true_inputs], outputs=[output], loss)

def dummyLoss(true, pred):
    return pred

model.compile(loss=dummyLoss, optimizer=Adam())

train_model = model.fit([X_train, y_train], None, batch_size=32,
                        epochs=50,
                        validation_data=([X_valid, y_valid], None),
                        callbacks=callbacks)
Fixing the understanding of my answer:
original_model_inputs = keras.layers.Input(shape=X_train.shape[1:])  # must be a tuple, not an int
y_true_inputs = keras.layers.Input(shape=y_train.shape[1:])  # must be a tuple, not an int

hidden1 = keras.layers.Dense(256, activation="relu")(original_model_inputs)
hidden2 = keras.layers.Dense(128, activation="relu")(hidden1)
output = keras.layers.Dense(K)(hidden2)
You need something to compute g(X); I have no idea what it is, but you need to do it somewhere.
And yes, you need to pass the whole tensor at once; you cannot work with x_i and everything else individually.
def g(x):
    return something

gResults = Lambda(g)(original_model_inputs)
Continuing my answer:
def lambdaLoss(x):
    yTrue, yPred, G = x
    ....  # wait.... where is Y_true in your loss formula?

loss = Lambda(lambdaLoss)([y_true_inputs, output, gResults])  # must be a list of inputs including G
You need one model for training and another to get the outputs, because we're building a Frankenstein model to accommodate the different loss.
training_model = keras.Model(inputs=[original_model_inputs, y_true_inputs], outputs=loss)
prediction_model = keras.Model(original_model_inputs, output)
Only the training model must be compiled:
def dummyLoss(true, pred):
    return pred

training_model.compile(loss=dummyLoss, optimizer=Adam())

training_model.fit([X_train, y_train], None, batch_size=32,
                   epochs=50,
                   validation_data=([X_valid, y_valid], None),
                   callbacks=callbacks)
Use the other model to get result data:
results = prediction_model.predict(some_x)
Looks like a GAN of some sort. I will refer to (x) as "x_input". Two methods:
Method 1) Inherit from the tf.keras.Model class and write your own (not recommended, not shown).
Method 2) Inherit from the tf.keras.losses.Loss class, and return a tuple of the (custom) tf.keras.losses.Loss instance and a tf.keras.layers.Layer that does nothing more than act as a shell to grab and save a copy of the x_input. This layer instance can then be added as the top layer in the model. The (custom) tf.keras.losses.Loss instance can then access the input on demand. This method also has the best future support throughout the life of TensorFlow.
First, create a custom layer and custom loss class:
class Acrylic_Layer(tf.keras.layers.Layer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.x_input = None

    def build(self, *args, **kwargs):
        pass

    def call(self, input):
        self.x_input = input
        return input  # Pass input directly through to next layer

class Custom_Loss(tf.keras.losses.Loss):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.input_thief = Acrylic_Layer()  # <<< Magic, python is pass by reference!

    def __call__(self, y_true, y_pred, sample_weight=None):
        x_input = self.input_thief.x_input  # <<< x_input pulled from model
Second, add layer and loss function to model
loss_fn = Custom_Loss(*args, **kwargs)
input_thief = loss_fn.input_thief

model = tf.keras.models.Sequential([
    input_thief,  # <<< transparent layer
    Other_layers,
])

model.compile(loss=loss_fn)  # <<< register the loss function, then call model.fit as usual

CTC loss goes down and stops

I'm trying to train a captcha recognition model. Model details: ResNet-pretrained CNN layers + bidirectional LSTM + fully connected layer. It reached 90% sequence accuracy on captchas generated by the Python library captcha. The problem is that these generated captchas seem to have similar locations for each character. When I randomly add spaces between characters, the model does not work any more. So I wonder: is the LSTM learning segmentation during training? Then I tried to use CTC loss. At first, the loss goes down pretty quickly, but later it stays at about 16 without a significant drop. I tried different numbers of LSTM layers and different numbers of units. 2 layers of LSTM reach a lower loss, but still do not converge. 3 layers behave just like 2 layers. The loss curve:
#encoding:utf8
import os
import sys
import torch
import warpctc_pytorch
import traceback
import torchvision
from torch import nn, autograd, FloatTensor, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import MultiStepLR
from tensorboard import SummaryWriter
from pprint import pprint
from net.utils import decoder
from logging import getLogger, StreamHandler
logger = getLogger(__name__)
handler = StreamHandler(sys.stdout)
logger.addHandler(handler)
from dataset_util.utils import id_to_character
from dataset_util.transform import rescale, normalizer
from config.config import MAX_CAPTCHA_LENGTH, TENSORBOARD_LOG_PATH, MODEL_PATH
class CNN_RNN(nn.Module):
    def __init__(self, lstm_bidirectional=True, use_ctc=True, *args, **kwargs):
        super(CNN_RNN, self).__init__(*args, **kwargs)
        model_conv = torchvision.models.resnet18(pretrained=True)
        for param in model_conv.parameters():
            param.requires_grad = False
        modules = list(model_conv.children())[:-1]  # delete the last fc layer.
        for param in modules[8].parameters():
            param.requires_grad = True
        self.resnet = nn.Sequential(*modules)  # CNN with fixed parameters from resnet as feature extractor
        self.lstm_input_size = 512 * 2 * 2
        self.lstm_hidden_state_size = 512
        self.lstm_num_layers = 2
        self.chracter_space_length = 64
        self._lstm_bidirectional = lstm_bidirectional
        self._use_ctc = use_ctc
        if use_ctc:
            self._max_captcha_length = int(MAX_CAPTCHA_LENGTH * 2)
        else:
            self._max_captcha_length = MAX_CAPTCHA_LENGTH
        if lstm_bidirectional:
            self.lstm_hidden_state_size = self.lstm_hidden_state_size * 2  # so that hidden size for one direction in bidirectional lstm is the same as vanilla lstm
            self.lstm = nn.LSTM(self.lstm_input_size, self.lstm_hidden_state_size // 2, dropout=0.5, bidirectional=True, num_layers=self.lstm_num_layers)
        else:
            self.lstm = nn.LSTM(self.lstm_input_size, self.lstm_hidden_state_size, dropout=0.5, bidirectional=False, num_layers=self.lstm_num_layers)  # dropout doesn't work for one layer lstm
        self.ouput_to_tag = nn.Linear(self.lstm_hidden_state_size, self.chracter_space_length)
        self.tensorboard_writer = SummaryWriter(TENSORBOARD_LOG_PATH)
        # self.dropout_lstm = nn.Dropout()

    def init_hidden_status(self, batch_size):
        if self._lstm_bidirectional:
            self.hidden = (autograd.Variable(torch.zeros((self.lstm_num_layers * 2, batch_size, self.lstm_hidden_state_size // 2))),
                           autograd.Variable(torch.zeros((self.lstm_num_layers * 2, batch_size, self.lstm_hidden_state_size // 2))))  # number of layers, batch size, hidden dimension
        else:
            self.hidden = (autograd.Variable(torch.zeros((self.lstm_num_layers, batch_size, self.lstm_hidden_state_size))),
                           autograd.Variable(torch.zeros((self.lstm_num_layers, batch_size, self.lstm_hidden_state_size))))  # number of layers, batch size, hidden dimension

    def forward(self, image):
        '''
        :param image: # batch_size, CHANNEL, HEIGHT, WIDTH
        :return:
        '''
        features = self.resnet(image)  # [batch_size, 512, 2, 2]
        batch_size = image.shape[0]
        features = [features.view(batch_size, -1) for i in range(self._max_captcha_length)]
        features = torch.stack(features)
        self.init_hidden_status(batch_size)
        output, hidden = self.lstm(features, self.hidden)
        # output = self.dropout_lstm(output)
        tag_space = self.ouput_to_tag(output.view(-1, output.size(2)))  # [MAX_CAPTCHA_LENGTH * BATCH_SIZE, CHARACTER_SPACE_LENGTH]
        tag_space = tag_space.view(self._max_captcha_length, batch_size, -1)
        if not self._use_ctc:
            tag_score = F.log_softmax(tag_space, dim=2)  # [MAX_CAPTCHA_LENGTH, BATCH_SIZE, CHARACTER_SPACE_LENGTH]
        else:
            tag_score = tag_space
        return tag_score
    def train_net(self, data_loader, eval_data_loader=None, learning_rate=0.008, epoch_num=400):
        try:
            if self._use_ctc:
                loss_function = warpctc_pytorch.warp_ctc.CTCLoss()
            else:
                loss_function = nn.NLLLoss()
            # optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.parameters()), momentum=0.9, lr=learning_rate)
            # optimizer = MultiStepLR(optimizer, milestones=[10,15], gamma=0.5)
            # optimizer = optim.Adadelta(filter(lambda p: p.requires_grad, self.parameters()))
            optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.parameters()))
            self.tensorboard_writer.add_scalar("learning_rate", learning_rate)
            tensorbard_global_step = 0
            if os.path.exists(os.path.join(TENSORBOARD_LOG_PATH, "resume_step")):
                with open(os.path.join(TENSORBOARD_LOG_PATH, "resume_step"), "r") as file_handler:
                    tensorbard_global_step = int(file_handler.read()) + 1
            for epoch_index, epoch in enumerate(range(epoch_num)):
                for index, sample in enumerate(data_loader):
                    optimizer.zero_grad()
                    input_image = autograd.Variable(sample["image"])  # batch_size, 3, 255, 255
                    tag_score = self.forward(input_image)
                    if self._use_ctc:
                        tag_score, target, tag_score_sizes, target_sizes = self._loss_preprocess_ctc(tag_score, sample)
                        loss = loss_function(tag_score, target, tag_score_sizes, target_sizes)
                        loss = loss / tag_score.size(1)
                    else:
                        target = sample["padded_label_idx"]
                        tag_score, target = self._loss_preprocess(tag_score, target)
                        loss = loss_function(tag_score, target)
                    print("Training loss: {}".format(float(loss)))
                    self.tensorboard_writer.add_scalar("training_loss", float(loss), tensorbard_global_step)
                    loss.backward()
                    optimizer.step()
                    if index % 250 == 0:
                        print(u"Processing batch: {} of {}, epoch: {}".format(index, len(data_loader), epoch_index))
                        self.evaluate(eval_data_loader, loss_function, tensorbard_global_step)
                    tensorbard_global_step += 1
                self.save_model(MODEL_PATH + "_epoch_{}".format(epoch_index))
        except KeyboardInterrupt:
            print("Exit for KeyboardInterrupt, save model")
            self.save_model(MODEL_PATH)
            with open(os.path.join(TENSORBOARD_LOG_PATH, "resume_step"), "w") as file_handler:
                file_handler.write(str(tensorbard_global_step))
        except Exception as excp:
            logger.error(str(excp))
            logger.error(traceback.format_exc())
    def predict(self, image):
        # TODO ctc version
        '''
        :param image: [batch_size, channel, height, width]
        :return:
        '''
        tag_score = self.forward(image)
        # TODO ctc
        # if self._use_ctc:
        #     tag_score = F.softmax(tag_score, dim=-1)
        #     decoder.decode(tag_score)
        confidence_log_probability, indexes = tag_score.max(2)
        predicted_labels = []
        for batch_index in range(indexes.size(1)):
            label = ""
            for character_index in range(self._max_captcha_length):
                if int(indexes[character_index, batch_index]) != 1:
                    label += id_to_character[int(indexes[character_index, batch_index])]
            predicted_labels.append(label)
        return predicted_labels, tag_score

    def predict_pil_image(self, pil_image):
        try:
            self.eval()
            processed_image = normalizer(rescale({"image": pil_image}))["image"].view(1, 3, 255, 255)
            result, tag_score = self.predict(processed_image)
            self.train()
        except Exception as excp:
            logger.error(str(excp))
            logger.error(traceback.format_exc())
            return [""], None
        return result, tag_score
    def evaluate(self, eval_dataloader, loss_function, step=0):
        total = 0
        sequence_correct = 0
        character_correct = 0
        character_total = 0
        loss_total = 0
        batch_size = eval_dataloader.batch_size  # use the parameter, not the module-level eval_data_loader
        true_predicted = {}
        self.eval()
        for sample in eval_dataloader:
            total += batch_size
            input_images = sample["image"]
            predicted_labels, tag_score = self.predict(input_images)
            for predicted, true_label in zip(predicted_labels, sample["label"]):
                if predicted == true_label:  # dataloader is making label a list, use batch_size=1
                    sequence_correct += 1
                for index, true_character in enumerate(true_label):
                    character_total += 1
                    if index < len(predicted) and predicted[index] == true_character:
                        character_correct += 1
                true_predicted[true_label] = predicted
            if self._use_ctc:
                tag_score, target, tag_score_sizes, target_sizes = self._loss_preprocess_ctc(tag_score, sample)
                loss_total += float(loss_function(tag_score, target, tag_score_sizes, target_sizes) / batch_size)
            else:
                tag_score, target = self._loss_preprocess(tag_score, sample["padded_label_idx"])
                loss_total += float(loss_function(tag_score, target))  # averaged over batch index
        print("True captcha to predicted captcha: ")
        pprint(true_predicted)
        self.tensorboard_writer.add_text("eval_ture_to_predicted", str(true_predicted), global_step=step)
        accuracy = float(sequence_correct) / total
        avg_loss = float(loss_total) / (total / batch_size)
        character_accuracy = float(character_correct) / character_total
        self.tensorboard_writer.add_scalar("eval_sequence_accuracy", accuracy, global_step=step)
        self.tensorboard_writer.add_scalar("eval_character_accuracy", character_accuracy, global_step=step)
        self.tensorboard_writer.add_scalar("eval_loss", avg_loss, global_step=step)
        self.zero_grad()
        self.train()
    def _loss_preprocess(self, tag_score, target):
        '''
        :param tag_score: value returned by self.forward
        :param target: sample["padded_label_idx"]
        :return: (processed_tag_score, processed_target) ready for the NLLLoss function
        '''
        target = target.transpose(0, 1)
        target = target.contiguous()
        target = target.view(target.size(0) * target.size(1))
        tag_score = tag_score.view(-1, self.chracter_space_length)
        return tag_score, target

    def _loss_preprocess_ctc(self, tag_score, sample):
        target_2d = [
            [int(ele) for ele in sample["padded_label_idx"][row, :] if int(ele) != 0 and int(ele) != 1]
            for row in range(sample["padded_label_idx"].size(0))]
        target = []
        for ele in target_2d:
            target.extend(ele)
        target = autograd.Variable(torch.IntTensor(target))

        # tag_score = F.softmax(F.sigmoid(tag_score), dim=-1)
        tag_score_sizes = autograd.Variable(torch.IntTensor([self._max_captcha_length] * tag_score.size(1)))
        target_sizes = autograd.Variable(sample["captcha_length"].int())
        return tag_score, target, tag_score_sizes, target_sizes

    # def visualize_graph(self, dataset):
    #     '''Since pytorch use dynamic graph, an input is required to visualize graph in tensorboard'''
    #     # warning: Do not run this, the graph is too large to visualize...
    #     sample = dataset[0]
    #     input_image = autograd.Variable(sample["image"].view(1, 3, 255, 255))
    #     tag_score = self.forward(input_image)
    #     self.tensorboard_writer.add_graph(self, tag_score)

    def save_model(self, model_path):
        self.tensorboard_writer.close()
        self.tensorboard_writer = None  # can't be pickled
        torch.save(self, model_path)
        self.tensorboard_writer = SummaryWriter(TENSORBOARD_LOG_PATH)

    @classmethod
    def load_model(cls, model_path=MODEL_PATH, *args, **kwargs):
        net = cls(*args, **kwargs)
        if os.path.exists(model_path):
            model = torch.load(model_path)
            if model:
                model.tensorboard_writer = SummaryWriter(TENSORBOARD_LOG_PATH)
                net = model
        return net

    def __del__(self):
        if self.tensorboard_writer:
            self.tensorboard_writer.close()
if __name__ == "__main__":
    from dataset_util.dataset import dataset, eval_dataset
    data_loader = DataLoader(dataset, batch_size=2, shuffle=True)
    eval_data_loader = DataLoader(eval_dataset, batch_size=2, shuffle=True)

    net = CNN_RNN.load_model()
    net.train_net(data_loader, eval_data_loader=eval_data_loader)
    # net.predict(dataset[0]["image"].view(1, 3, 255, 255))

    # predict_pil_image test code
    # from config.config import IMAGE_PATHS
    # import glob
    # from PIL import Image
    #
    # image_paths = glob.glob(os.path.join(IMAGE_PATHS.get("EVAL"), "*.png"))
    # for image_path in image_paths:
    #     pil_image = Image.open(image_path)
    #     predicted, score = net.predict_pil_image(pil_image)
    #     print("True value: {}, predicted: {}".format(os.path.split(image_path)[1], predicted))

    print("Done")
The code above is the main part. If you need other components to make it run, leave a comment. I've been stuck here for quite a while; any advice on training a CRNN + CTC model is appreciated.
I've been training with CTC loss and encountered the same problem. I know this is a rather late answer, but hopefully it'll help someone else researching this. After trial and error and a lot of research, there are a few things worth knowing when it comes to training with CTC (if your model is set up correctly):
The quickest way for the model to lower cost is to predict only blanks. This is noted in a few papers and blogs: see http://www.tbluche.com/ctc_and_blank.html
The model learns to predict only blanks first, then it starts picking up on the error signal in regards to the correct underlying labels. This is also explained in the above link. In practice, I noticed that my model starts to learn the real underlying labels/targets after a couple hundred epochs and the loss starts decreasing dramatically again. Similar to what is shown for the toy example here: https://thomasmesnard.github.io/files/CTC_Poster_Mesnard_Auvolat.pdf
These parameters have a great impact on whether your model converges or not - learning rate, batch size and epoch number.
You have a few questions, so I will try to answer them one by one.
First, why does adding spaces to the captcha break the model?
A neural network learns to deal with the data it is trained on. If you change the distribution of the data (for example by adding spaces between characters), there is no guarantee that the network will generalize, as you hint at in your question. It is possible that the captchas you train on always have the characters in the same positions, or at the same distance from one another; your model learns this and learns to exploit it by looking in those positions. If you want your network to generalize to a specific scenario, you should explicitly train on that scenario. So in your case, you should also add random spaces during training, along the lines of the sketch below.
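A rough sketch of what that could look like with the captcha library mentioned in the question. The helper name, spacing probability, and image size are assumptions, and it assumes ImageCaptcha renders space characters as gaps:

import random
from captcha.image import ImageCaptcha

def render_with_random_spaces(text, p=0.3, width=280, height=90):
    """Render `text` with random gaps between characters so their positions vary,
    while the training label stays the space-free text."""
    spaced = "".join(c + (" " if random.random() < p else "") for c in text)
    image = ImageCaptcha(width=width, height=height).generate_image(spaced)
    return image, text  # train on (image, original label)

img, label = render_with_random_spaces("W4K3UP")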
Second, why does the loss not go below 16?
Clearly, from the fact that your training loss is also stalled at 16 (like your validation loss), the problem is that your model simply doesn't have the capacity to deal with the complexity of the problem. In other words, your model is underfitting. You had the correct reflex to try to increase the capacity of your network. You tried to increase the capacity of the LSTM and it didn't help. Thus, the next logical step is that the convolution part of your network is not powerful enough. So here are a few things that you might want to try, from most likely to succeed in my opinion to least likely:
Make convnet trainable: I notice that you are using a pretrained convnet and that you are not fine-tuning its weights. That could be a problem. Whatever your convnet was trained on, it might not develop the features required to deal with captchas. You should try learning the weights of the convnet too, in order to develop useful features for captchas (see the sketch after these two suggestions).
Use deeper convnet: This is the naive thing to do. Your convnet doesn't have good enough features, try a more powerful deeper one. (But you should definitely use this only after you've made the convnet trainable).
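A brief sketch of the first suggestion against the question's code (names follow the posted CNN_RNN class; `net` is assumed to be the model loaded in the __main__ block, and the learning rates are illustrative):

# Unfreeze the pretrained ResNet feature extractor so it can adapt to captchas.
for param in net.resnet.parameters():
    param.requires_grad = True

# A smaller learning rate for the pretrained backbone is a common choice here.
optimizer = optim.Adam([
    {"params": net.resnet.parameters(), "lr": 1e-4},
    {"params": net.lstm.parameters()},
    {"params": net.ouput_to_tag.parameters()},
], lr=1e-3)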
From my experience, training an RNN model with CTC loss is not an easy task. The model may not converge at all if the training is not carefully set up. Here are my suggestions:
Check the CTC loss output during training. For a model that will converge, the CTC loss at each batch fluctuates notably. If you observe that the CTC loss shrinks almost monotonically to a stable value, the model is most likely stuck at a local minimum.
Use short samples to pretrain your model. Though we have advanced RNN structures like LSTM and GRU, it's still hard to back-propagate through the RNN over long time steps.
Enlarge sample variety. You can even add artificial samples to help your model escape from local minima.
FYI, we've just open-sourced a new deep learning framework, Dandelion, which has a built-in CTC objective and an interface pretty much like PyTorch's. You can try your model with Dandelion and compare it with your current implementation.

Python - features should be a dictionary of `Tensor`s with high level tf APIs

I want to train, evaluate the accuracy and eventually predict with my model. This is my first time using high level APIs such as tf.estimator.
I'm getting a value error from estimator.train(train_input_fn):
'ValueError: features should be a dictionary of `Tensor's. Given type: '
I'm not sure what is going on here. My model is taking 3 inputs and producing a binary output from one neuron.
Before this error I was getting an error about the requested shape not equal to the actual shape, or something along those lines. I fixed it by reducing the batchSize down to 1, instead of 100. I'm sure this isn't going to do so well when it comes to training though.
Any ideas? Here's my code:
import tensorflow as tf
import numpy as np
import sys

sys.path.insert(0, '/Users/blairburns/Documents/DeepLearning/BackgroundColourPredictor/Dataset/Testing/')
sys.path.insert(0, '/Users/blairburns/Documents/DeepLearning/BackgroundColourPredictor/Dataset/Training/')
#other files
from TestDataNormaliser import *
from TrainDataNormaliser import *

learning_rate = 0.01
trainingIteration = 15
batchSize = 1
displayStep = 2

#Layers using tf.layers
def get_logits(features):
    l1 = tf.layers.dense(features, 3, activation=tf.nn.relu)
    l2 = tf.layers.dense(l1, 4, activation=tf.nn.relu)
    l3 = tf.layers.dense(l2, 1, activation=None)
    a = l3
    return a

#cost function
def get_loss(a, labels):
    #cross_entropy = tf.reduce_mean(-tf.reduce_sum(y * tf.log(a)))
    return tf.nn.sigmoid_cross_entropy_with_logits(logits=a, labels=labels)
    #cross_entropy = tf.reduce_mean((l3 - y)**2)
    #cross_entropy = -tf.reduce_sum(y*tf.log(a))-tf.reduce_sum((1-y)*tf.log(1-a))

#optimizer
def get_train_op(loss):
    learning_rate = 1e-3
    optimizer = tf.train.RMSPropOptimizer(learning_rate)
    return optimizer.minimize(loss, global_step=tf.train.get_global_step())

#training
####
def get_inputs(feature_data, label_data, batch_size, n_epochs=None, shuffle=True):
    dataset = tf.data.Dataset.from_tensor_slices(
        (feature_data, label_data))
    dataset = dataset.repeat(n_epochs)
    if shuffle:
        dataset = dataset.shuffle(len(feature_data))
    dataset = dataset.batch(batch_size)
    features, labels = dataset.make_one_shot_iterator().get_next()
    return features, labels

def model_fn(features, labels, mode):
    a = get_logits(features)
    loss = get_loss(a, labels)
    train_op = get_train_op(loss)
    predictions = tf.greater(a, 0)
    accuracy = tf.metrics.accuracy(labels, predictions)
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        eval_metric_ops={'Accuracy': accuracy},
        predictions=predictions
    )

def train_input_fn():
    return get_inputs(
        trainArrayValues,
        trainArrayLabels,
        batchSize
    )

def eval_input_fn():
    return get_inputs(
        testArrayValues,
        testArrayLabels,
        batchSize,
        n_epochs=1,
        shuffle=False
    )

model_dir = './savedModel'
estimator = tf.estimator.LinearRegressor(feature_columns=[model_fn, model_dir])
#estimator.train(train_input_fn, max_steps=1)
estimator.train(train_input_fn)
estimator.evaluate(eval_input_fn)
Your problem is this line:
estimator = tf.estimator.LinearRegressor(feature_columns=[model_fn, model_dir])
You need to set the feature_columns argument to an array of feature columns. A feature column tells the estimator about the data you're feeding it.
It looks like all your input data is numeric, so I'd call tf.feature_column.numeric_column to create your feature column(s). The documentation is here. For example, the following code creates a numeric feature column containing x-coordinates:
xcol = tf.feature_column.numeric_column('x')
If all your estimator needs are x-coordinates, then you could create the estimator with the following code:
estimator = tf.estimator.LinearRegressor(feature_columns=[xcol])
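Beyond the feature columns, the "features should be a dictionary of `Tensor`s" message also means the input_fn has to yield features as a dict keyed by those column names. A minimal, self-contained sketch (the data and the 'x' key are made up for illustration, using the same TF 1.x-style estimator APIs as the question):

import numpy as np
import tensorflow as tf

xs = np.array([[0.1], [0.4], [0.9]], dtype=np.float32)
ys = np.array([[0.0], [1.0], [1.0]], dtype=np.float32)

def train_input_fn():
    # The features element is a dict {'x': tensor}, matching numeric_column('x').
    dataset = tf.data.Dataset.from_tensor_slices(({'x': xs}, ys))
    return dataset.shuffle(len(xs)).repeat().batch(2)

xcol = tf.feature_column.numeric_column('x')
estimator = tf.estimator.LinearRegressor(feature_columns=[xcol])
estimator.train(train_input_fn, max_steps=10)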
