Related
Dear stackoverflow members,
I am currently trying to implement my own keras tuner training loop. In this loop I want to pass the input variable multiple times through the model in example:
Y = Startvalue
for i in range(x):
Y = model(Y)
I want to see if this method creates more stable simulations for my self feedback problem.
When I implement it I get an OOM error even when I do not loop. This error does not occur when I just do it normally.
My Class example (the OOM error occurs when i switch logits for logits2:
class MyTuner(kt.Tuner):
def run_trial(self, trial, train_ds, validation_data):
model = self.hypermodel.build(trial.hyperparameters)
optimizer = tf.keras.optimizers.Adam()
epoch_loss_metric = tf.keras.metrics.MeanSquaredError()
def microbatch(T_IN, A_IN, D_IN):
OUT_T = []
OUT_A = []
for i in range(len(T_IN)):
A_IN_R = tf.expand_dims(tf.squeeze(A_IN[i]), 0)
T_IN_R = tf.expand_dims(tf.squeeze(T_IN[i]), 0)
D_IN_R = tf.expand_dims(tf.squeeze(D_IN[i]), 0)
(OUT_T_R, OUT_A_R) = model((A_IN_R, T_IN_R, D_IN_R))
OUT_T.append(tf.squeeze(OUT_T_R))
OUT_A.append(tf.squeeze(OUT_A_R))
return(tf.squeeze(tf.stack(OUT_T)), tf.squeeze(tf.stack(OUT_A)))
def run_train_step(data):
T_IN = tf.dtypes.cast(data[0][0], 'float32')
A_IN = tf.dtypes.cast(data[0][1], 'float32')
D_IN = tf.dtypes.cast(data[0][2], 'float32')
A_Ta = tf.dtypes.cast(data[1][0], 'float32')
T_Ta = tf.dtypes.cast(data[1][1], 'float32')
mse = tf.keras.losses.MeanSquaredError()
with tf.GradientTape() as tape:
logits2 = microbatch(T_IN, A_IN, D_IN)
logits = model([A_IN, T_IN, D_IN])
loss = mse((T_Ta, A_Ta), logits2)
# Add any regularization losses.
if model.losses:
loss += tf.math.add_n(model.losses)
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
epoch_loss_metric.update_state((T_Ta, A_Ta), logits2)
return loss
for epoch in range(1000):
print('Epoch: {}'.format(epoch))
self.on_epoch_begin(trial, model, epoch, logs={})
for batch, data in enumerate(train_ds):
self.on_batch_begin(trial, model, batch, logs={})
batch_loss = float(run_train_step(data))
self.on_batch_end(trial, model, batch, logs={'loss': batch_loss})
if batch % 100 == 0:
loss = epoch_loss_metric.result().numpy()
print('Batch: {}, Average Loss: {}'.format(batch, loss))
epoch_loss = epoch_loss_metric.result().numpy()
self.on_epoch_end(trial, model, epoch, logs={'loss': epoch_loss})
epoch_loss_metric.reset_states()
````
In my understanding, the micro-batch function is not implementing a self-feedback loop (though it does not affect the OOM)
I guess what's happening is that because you are computing the output of the network k times, the amount of memory consumption by the network is increasing by k times (because it needs to store intermediate tensors for backprop).
What you can do is, at each self-feedback instance, you backprop the gradients so that all the intermediate tensors do not increase beyond the limit.
lemme know if you have any doubt,
I designed a network for a text classification problem. To do this, I'm using huggingface transformet's BERT model with a linear layer above that for fine-tuning. My problem is that the loss on the training set is decreasing which is fine, but when it comes to do the evaluation after each epoch on the development set, the loss is increasing with epochs. I'm posting my code to investigate if there's something wrong with it.
for epoch in range(1, args.epochs + 1):
total_train_loss = 0
trainer.set_train()
for step, batch in enumerate(train_dataloader):
loss = trainer.step(batch)
total_train_loss += loss
avg_train_loss = total_train_loss / len(train_dataloader)
logger.info(('Training loss for epoch %d/%d: %4.2f') % (epoch, args.epochs, avg_train_loss))
print("\n-------------------------------")
logger.info('Start validation ...')
trainer.set_eval()
y_hat = list()
y = list()
total_dev_loss = 0
for step, batch_val in enumerate(dev_dataloader):
true_labels_ids, predicted_labels_ids, loss = trainer.validate(batch_val)
total_dev_loss += loss
y.extend(true_labels_ids)
y_hat.extend(predicted_labels_ids)
avg_dev_loss = total_dev_loss / len(dev_dataloader)
print(("\n-Total dev loss: %4.2f on epoch %d/%d\n") % (avg_dev_loss, epoch, args.epochs))
print("Training terminated!")
Following is the trainer file, which I use for doing a forward pass on a given batch and then backpropagate accordingly.
class Trainer(object):
def __init__(self, args, model, device, data_points, is_test=False, train_stats=None):
self.args = args
self.model = model
self.device = device
self.loss = nn.CrossEntropyLoss(reduction='none')
if is_test:
# Should load the model from checkpoint
self.model.eval()
self.model.load_state_dict(torch.load(args.saved_model))
logger.info('Loaded saved model from %s' % args.saved_model)
else:
self.model.train()
self.optim = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = data_points * self.args.epochs
self.scheduler = get_linear_schedule_with_warmup(self.optim, num_warmup_steps=0,
num_training_steps=total_steps)
def step(self, batch):
batch = tuple(t.to(self.device) for t in batch)
batch_input_ids, batch_input_masks, batch_labels = batch
self.model.zero_grad()
outputs = self.model(batch_input_ids,
attention_mask=batch_input_masks,
labels=batch_labels)
loss = self.loss(outputs, batch_labels)
loss = loss.sum()
(loss / loss.numel()).backward()
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
self.optim.step()
self.scheduler.step()
return loss
def validate(self, batch):
batch = tuple(t.to(self.device) for t in batch)
batch_input_ids, batch_input_masks, batch_labels = batch
with torch.no_grad():
model_output = self.model(batch_input_ids,
attention_mask=batch_input_masks,
labels=batch_labels)
predicted_label_ids = self._predict(model_output)
label_ids = batch_labels.to('cpu').numpy()
loss = self.loss(model_output, batch_labels)
loss = loss.sum()
return label_ids, predicted_label_ids, loss
def _predict(self, logits):
return np.argmax(logits.to('cpu').numpy(), axis=1)
Finally, the following is my model (i.e., Classifier) class:
import torch.nn as nn
from transformers import BertModel
class Classifier(nn.Module):
def __init__(self, args, is_eval=False):
super(Classifier, self).__init__()
self.bert_model = BertModel.from_pretrained(
args.init_checkpoint,
output_attentions=False,
output_hidden_states=True,
)
self.is_eval_mode = is_eval
self.linear = nn.Linear(768, 2) # binary classification
def switch_state(self):
self.is_eval_mode = not self.is_eval_mode
def forward(self, input_ids, attention_mask=None, labels=None):
bert_outputs = self.bert_model(input_ids,
token_type_ids=None,
attention_mask=attention_mask)
# Should give the logits to the the linear layer
model_output = self.linear(bert_outputs[1])
return model_output
For visualization the loss throughout the epochs:
When I've used Bert for text classification my model has generally behaved as you tell. In part this is expected because pre-trained models tend to require few epochs to fine-tune, actually if you check Bert's paper the number of epochs recommended for fine-tuning is between 2 and 4.
On the other hand, I've usually found the optimum at just 1 or 2 epochs, which coincides with your case also. My guess is: there is a trade-off when fine-tuning pre-trained models between fitting to your downstream task and forgetting the weights learned at pre-training. Depending on the data you have, the equilibrium point may happen sooner or later and overfitting starts after that. But this paragraph is speculation based on my experience.
When validation loss increases it means your model is overfitting
I am trying to understand deep-learning with pytorch. I read the pytorch tutorial: https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html, and its written:
''torch.nn only supports mini-batches. The entire torch.nn package only supports inputs that are a mini-batch of samples, and not a single sample.
For example, nn.Conv2d will take in a 4D Tensor of nSamples x nChannels x Height x Width.
If you have a single sample, just use input.unsqueeze(0) to add a fake batch dimension.''
I am not sure to understand what it means.
Indeed, I have made a simple feed-forward neural network (cf below code), on which I used a really small dataset (the idea is to learn how it works without mini-batches first, not really to have anything useful), and hence don't need to use mini-batches. Hence, I introduce directly all the sample at each epochs. If I understand it correctly, I should add 'train_data = train_data.unsqueeze(0) '. But I am not sure where, as it seems to change the data size to 1. Also, it works without adding this line so why should I really use it?
Any help would be highly appreciated!
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import os
import numpy as np
# Download data
#...
# Construct network
len_input = len(data[0])
len_output = nbr_class
print('There is %d classes used for classification'%nbr_class)
#defining a new class: Net, that extended nn.Module
#setup the “skeleton” of our network architecture
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
#creation of fully connected. A fully connected neural network layer is represented by the nn.Linear object,
#with the first argument in the definition being the number of nodes in layer l and the next argument being
#the number of nodes in layer l+1
self.fc1 = nn.Linear(len_input, 200)
self.fc2 = nn.Linear(200, 200)
self.fc3 = nn.Linear(200, len_output)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return F.log_softmax(x)
# Create initial network
epochs = 3000
model = Net()
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# Train & test network
def train(data, target, epoch):
model.train() # set the model in "training mode"
# run the main training loop
#no need batch size as small number of sample! like this the process is more exact
#zero our gradients before calling .backward() which is necessary for new sum of gradients
optimizer.zero_grad()
target_pred = model(data)
loss = criterion(target_pred, target)
#Propagate the gradients back through the network
loss.backward()
#update the weights
#tell our optimizer to “step”, meaning that the weights will be updated using the calculated gradients
#according to our rule. perform model parameter update (update weights)
optimizer.step()
# for graphing puposes
loss_array.append(loss.data[0])
if epoch in list(range(0,20000,100)):
print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, loss.data[0]))
def test(epoch, test_data, test_target):
#eval mode to turn Dropout and BatchNorm off
model.eval()
test_loss = 0
correct = 0
test_target_pred = model(test_data)
criterion = nn.NLLLoss()
# sum up batch loss
test_loss += criterion(test_target_pred, test_target).data[0]
pred = test_target_pred.data.max(1)[1] # get the index of the max log-probability
correct += pred.eq(test_target.data).sum()
if epoch in list(range(0,20000,100)):
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_target), 100. * correct / len(test_target)))
if __name__ == '__main__':
for epoch in range(epochs):
train(data = train_data, target=train_target, epoch=epoch)
test(epoch, test_data, test_target)
I’m trying to train a captcha recognition model. Model details are resnet pretrained CNN layers + Bidirectional LSTM + Fully Connected. It reached 90% sequence accuracy on captcha generated by python library captcha. The problem is that these generated captcha seems to have similary location of each character. When I randomly add spaces between characters, the model does not work any more. So I wonder is LSTM learning segmentation during learning? Then I try to use CTC loss. At first, loss goes down pretty quick. But it stays at about 16 without significant drop later. I tried different layers of LSTM, different number of units. 2 Layers of LSTM reach lower loss, but still not converging. 3 layers are just like 2 layers. The loss curve:
#encoding:utf8
import os
import sys
import torch
import warpctc_pytorch
import traceback
import torchvision
from torch import nn, autograd, FloatTensor, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import MultiStepLR
from tensorboard import SummaryWriter
from pprint import pprint
from net.utils import decoder
from logging import getLogger, StreamHandler
logger = getLogger(__name__)
handler = StreamHandler(sys.stdout)
logger.addHandler(handler)
from dataset_util.utils import id_to_character
from dataset_util.transform import rescale, normalizer
from config.config import MAX_CAPTCHA_LENGTH, TENSORBOARD_LOG_PATH, MODEL_PATH
class CNN_RNN(nn.Module):
def __init__(self, lstm_bidirectional=True, use_ctc=True, *args, **kwargs):
super(CNN_RNN, self).__init__(*args, **kwargs)
model_conv = torchvision.models.resnet18(pretrained=True)
for param in model_conv.parameters():
param.requires_grad = False
modules = list(model_conv.children())[:-1] # delete the last fc layer.
for param in modules[8].parameters():
param.requires_grad = True
self.resnet = nn.Sequential(*modules) # CNN with fixed parameters from resnet as feature extractor
self.lstm_input_size = 512 * 2 * 2
self.lstm_hidden_state_size = 512
self.lstm_num_layers = 2
self.chracter_space_length = 64
self._lstm_bidirectional = lstm_bidirectional
self._use_ctc = use_ctc
if use_ctc:
self._max_captcha_length = int(MAX_CAPTCHA_LENGTH * 2)
else:
self._max_captcha_length = MAX_CAPTCHA_LENGTH
if lstm_bidirectional:
self.lstm_hidden_state_size = self.lstm_hidden_state_size * 2 # so that hidden size for one direction in bidirection lstm is the same as vanilla lstm
self.lstm = self.lstm = nn.LSTM(self.lstm_input_size, self.lstm_hidden_state_size // 2, dropout=0.5, bidirectional=True, num_layers=self.lstm_num_layers)
else:
self.lstm = nn.LSTM(self.lstm_input_size, self.lstm_hidden_state_size, dropout=0.5, bidirectional=False, num_layers=self.lstm_num_layers) # dropout doen't work for one layer lstm
self.ouput_to_tag = nn.Linear(self.lstm_hidden_state_size, self.chracter_space_length)
self.tensorboard_writer = SummaryWriter(TENSORBOARD_LOG_PATH)
# self.dropout_lstm = nn.Dropout()
def init_hidden_status(self, batch_size):
if self._lstm_bidirectional:
self.hidden = (autograd.Variable(torch.zeros((self.lstm_num_layers * 2, batch_size, self.lstm_hidden_state_size // 2))),
autograd.Variable(torch.zeros((self.lstm_num_layers * 2, batch_size, self.lstm_hidden_state_size // 2)))) # number of layers, batch size, hidden dimention
else:
self.hidden = (autograd.Variable(torch.zeros((self.lstm_num_layers, batch_size, self.lstm_hidden_state_size))),
autograd.Variable(torch.zeros((self.lstm_num_layers, batch_size, self.lstm_hidden_state_size)))) # number of layers, batch size, hidden dimention
def forward(self, image):
'''
:param image: # batch_size, CHANNEL, HEIGHT, WIDTH
:return:
'''
features = self.resnet(image) # [batch_size, 512, 2, 2]
batch_size = image.shape[0]
features = [features.view(batch_size, -1) for i in range(self._max_captcha_length)]
features = torch.stack(features)
self.init_hidden_status(batch_size)
output, hidden = self.lstm(features, self.hidden)
# output = self.dropout_lstm(output)
tag_space = self.ouput_to_tag(output.view(-1, output.size(2))) # [MAX_CAPTCHA_LENGTH * BATCH_SIZE, CHARACTER_SPACE_LENGTH]
tag_space = tag_space.view(self._max_captcha_length, batch_size, -1)
if not self._use_ctc:
tag_score = F.log_softmax(tag_space, dim=2) # [MAX_CAPTCHA_LENGTH, BATCH_SIZE, CHARACTER_SPACE_LENGTH]
else:
tag_score = tag_space
return tag_score
def train_net(self, data_loader, eval_data_loader=None, learning_rate=0.008, epoch_num=400):
try:
if self._use_ctc:
loss_function = warpctc_pytorch.warp_ctc.CTCLoss()
else:
loss_function = nn.NLLLoss()
# optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.parameters()), momentum=0.9, lr=learning_rate)
# optimizer = MultiStepLR(optimizer, milestones=[10,15], gamma=0.5)
# optimizer = optim.Adadelta(filter(lambda p: p.requires_grad, self.parameters()))
optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.parameters()))
self.tensorboard_writer.add_scalar("learning_rate", learning_rate)
tensorbard_global_step=0
if os.path.exists(os.path.join(TENSORBOARD_LOG_PATH, "resume_step")):
with open(os.path.join(TENSORBOARD_LOG_PATH, "resume_step"), "r") as file_handler:
tensorbard_global_step = int(file_handler.read()) + 1
for epoch_index, epoch in enumerate(range(epoch_num)):
for index, sample in enumerate(data_loader):
optimizer.zero_grad()
input_image = autograd.Variable(sample["image"]) # batch_size, 3, 255, 255
tag_score = self.forward(input_image)
if self._use_ctc:
tag_score, target, tag_score_sizes, target_sizes = self._loss_preprocess_ctc(tag_score, sample)
loss = loss_function(tag_score, target, tag_score_sizes, target_sizes)
loss = loss / tag_score.size(1)
else:
target = sample["padded_label_idx"]
tag_score, target = self._loss_preprocess(tag_score, target)
loss = loss_function(tag_score, target)
print("Training loss: {}".format(float(loss)))
self.tensorboard_writer.add_scalar("training_loss", float(loss), tensorbard_global_step)
loss.backward()
optimizer.step()
if index % 250 == 0:
print(u"Processing batch: {} of {}, epoch: {}".format(index, len(data_loader), epoch_index))
self.evaluate(eval_data_loader, loss_function, tensorbard_global_step)
tensorbard_global_step += 1
self.save_model(MODEL_PATH + "_epoch_{}".format(epoch_index))
except KeyboardInterrupt:
print("Exit for KeyboardInterrupt, save model")
self.save_model(MODEL_PATH)
with open(os.path.join(TENSORBOARD_LOG_PATH, "resume_step"), "w") as file_handler:
file_handler.write(str(tensorbard_global_step))
except Exception as excp:
logger.error(str(excp))
logger.error(traceback.format_exc())
def predict(self, image):
# TODO ctc version
'''
:param image: [batch_size, channel, height, width]
:return:
'''
tag_score = self.forward(image)
# TODO ctc
# if self._use_ctc:
# tag_score = F.softmax(tag_score, dim=-1)
# decoder.decode(tag_score)
confidence_log_probability, indexes = tag_score.max(2)
predicted_labels = []
for batch_index in range(indexes.size(1)):
label = ""
for character_index in range(self._max_captcha_length):
if int(indexes[character_index, batch_index]) != 1:
label += id_to_character[int(indexes[character_index, batch_index])]
predicted_labels.append(label)
return predicted_labels, tag_score
def predict_pil_image(self, pil_image):
try:
self.eval()
processed_image = normalizer(rescale({"image": pil_image}))["image"].view(1, 3, 255, 255)
result, tag_score = self.predict(processed_image)
self.train()
except Exception as excp:
logger.error(str(excp))
logger.error(traceback.format_exc())
return [""], None
return result, tag_score
def evaluate(self, eval_dataloader, loss_function, step=0):
total = 0
sequence_correct = 0
character_correct = 0
character_total = 0
loss_total = 0
batch_size = eval_data_loader.batch_size
true_predicted = {}
self.eval()
for sample in eval_dataloader:
total += batch_size
input_images = sample["image"]
predicted_labels, tag_score = self.predict(input_images)
for predicted, true_label in zip(predicted_labels, sample["label"]):
if predicted == true_label: # dataloader is making label a list, use batch_size=1
sequence_correct += 1
for index, true_character in enumerate(true_label):
character_total += 1
if index < len(predicted) and predicted[index] == true_character:
character_correct += 1
true_predicted[true_label] = predicted
if self._use_ctc:
tag_score, target, tag_score_sizes, target_sizes = self._loss_preprocess_ctc(tag_score, sample)
loss_total += float(loss_function(tag_score, target, tag_score_sizes, target_sizes) / batch_size)
else:
tag_score, target = self._loss_preprocess(tag_score, sample["padded_label_idx"])
loss_total += float(loss_function(tag_score, target)) # averaged over batch index
print("True captcha to predicted captcha: ")
pprint(true_predicted)
self.tensorboard_writer.add_text("eval_ture_to_predicted", str(true_predicted), global_step=step)
accuracy = float(sequence_correct) / total
avg_loss = float(loss_total) / (total / batch_size)
character_accuracy = float(character_correct) / character_total
self.tensorboard_writer.add_scalar("eval_sequence_accuracy", accuracy, global_step=step)
self.tensorboard_writer.add_scalar("eval_character_accuracy", character_accuracy, global_step=step)
self.tensorboard_writer.add_scalar("eval_loss", avg_loss, global_step=step)
self.zero_grad()
self.train()
def _loss_preprocess(self, tag_score, target):
'''
:param tag_score: value return by self.forward
:param target: sample["padded_label_idx"]
:return: (processed_tag_score, processed_target) ready for NLLoss function
'''
target = target.transpose(0, 1)
target = target.contiguous()
target = target.view(target.size(0) * target.size(1))
tag_score = tag_score.view(-1, self.chracter_space_length)
return tag_score, target
def _loss_preprocess_ctc(self, tag_score, sample):
target_2d = [
[int(ele) for ele in sample["padded_label_idx"][row, :] if int(ele) != 0 and int(ele) != 1]
for row in range(sample["padded_label_idx"].size(0))]
target = []
for ele in target_2d:
target.extend(ele)
target = autograd.Variable(torch.IntTensor(target))
# tag_score = F.softmax(F.sigmoid(tag_score), dim=-1)
tag_score_sizes = autograd.Variable(torch.IntTensor([self._max_captcha_length] * tag_score.size(1)))
target_sizes = autograd.Variable(sample["captcha_length"].int())
return tag_score, target, tag_score_sizes, target_sizes
# def visualize_graph(self, dataset):
# '''Since pytorch use dynamic graph, an input is required to visualize graph in tensorboard'''
# # warning: Do not run this, the graph is too large to visualize...
# sample = dataset[0]
# input_image = autograd.Variable(sample["image"].view(1, 3, 255, 255))
# tag_score = self.forward(input_image)
# self.tensorboard_writer.add_graph(self, tag_score)
def save_model(self, model_path):
self.tensorboard_writer.close()
self.tensorboard_writer = None # can't be pickled
torch.save(self, model_path)
self.tensorboard_writer = SummaryWriter(TENSORBOARD_LOG_PATH)
#classmethod
def load_model(cls, model_path=MODEL_PATH, *args, **kwargs):
net = cls(*args, **kwargs)
if os.path.exists(model_path):
model = torch.load(model_path)
if model:
model.tensorboard_writer = SummaryWriter(TENSORBOARD_LOG_PATH)
net = model
return net
def __del__(self):
if self.tensorboard_writer:
self.tensorboard_writer.close()
if __name__ == "__main__":
from dataset_util.dataset import dataset, eval_dataset
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)
eval_data_loader = DataLoader(eval_dataset, batch_size=2, shuffle=True)
net = CNN_RNN.load_model()
net.train_net(data_loader, eval_data_loader=eval_data_loader)
# net.predict(dataset[0]["image"].view(1, 3, 255, 255))
# predict_pil_image test code
# from config.config import IMAGE_PATHS
# import glob
# from PIL import Image
#
# image_paths = glob.glob(os.path.join(IMAGE_PATHS.get("EVAL"), "*.png"))
# for image_path in image_paths:
# pil_image = Image.open(image_path)
# predicted, score = net.predict_pil_image(pil_image)
# print("True value: {}, predicted: {}".format(os.path.split(image_path)[1], predicted))
print("Done")
The above codes are main part. If you need other components that makes it running, leave a comment. Got stuck here for quite long. Any advice for training crnn + ctc is appreciated.
I've been training with ctc loss and encountered the same problem. I know this is a rather late answer but hopefully it'll help someone else who's researching on this. After trial and error and a lot of research there are a few things that's worth knowing when it comes to training with ctc (if your model is set up correctly):
The quickest way for the model to lower cost is to predict only blanks. This is noted in a few papers and blogs: see http://www.tbluche.com/ctc_and_blank.html
The model learns to predict only blanks first, then it starts picking up on the error signal in regards to the correct underlying labels. This is also explained in the above link. In practice, I noticed that my model starts to learn the real underlying labels/targets after a couple hundred epochs and the loss starts decreasing dramatically again. Similar to what is shown for the toy example here: https://thomasmesnard.github.io/files/CTC_Poster_Mesnard_Auvolat.pdf
These parameters have a great impact on whether your model converges or not - learning rate, batch size and epoch number.
You have a few questions, so I will try to answer them one by one.
First, why does adding spaces to the captcha break the model?
A neural network learns to deal with the data it is trained on. If you change the distribution of the data (by for example adding spaces between characters) there is no guarantee that the network will generalize. As you hint at in your question. It is possible that the captchas you train on always have the characters in the same positions, or at the same distance from one another, thus your model learns that and learns to exploit this by looking in those positions. If you want your network to generalize a specific scenario, you should explicitly train on that scenario. So in your case, you should add random spaces also during training.
Second, why does the loss not go below 16?
Clearly, from the fact that your training loss is also stalled at 16 (like your validation loss), the problem is that your model simply doesn't have the capacity to deal with the complexity of the problem. In other words, your model is underfitting. You had the correct reflex to try to increase the capacity of your network. You tried to increase the capacity of the LSTM and it didn't help. Thus, the next logical step is that the convolution part of your network is not powerful enough. So here are a few things that you might want to try, from most likely to succeed in my opinion to least likely:
Make convnet trainable: I notice that you are using a pretrained convnet and that you are not fine-tuning the weights of that convnet. That could be a problem. Whatever your convnet was trained on, it might not develop the required features to deal with captchas. You should try learning the weights of the convnet too, in order to develop useful features for captchas.
Use deeper convnet: This is the naive thing to do. Your convnet doesn't have good enough features, try a more powerful deeper one. (But you should definitely use this only after you've made the convnet trainable).
From my experience, training RNN model with CTC loss is not an EASY task. The model may not converge at all if the training is not carefully setup-ed. Here are my suggestions:
Check the CTC loss output along training. For a model would converge, the CTC loss at each batch fluctuates notably. If you observed that the CTC loss shrinks almost monotonically to a stable value, then the model is most likely stuck at a local minima
Use short samples to pretrain your model. Though we have advanced RNN strucures like LSTM and GRU, it's still hard to back-propagate the RNN for long steps.
Enlarge sample variety. You can even add artificial samples to help your model escape from local minima.
F.Y.I., we've just open-sourced a new deep learning framework Dandelion which has built-in CTC objective, and interface pretty much like pytorch. You can try your model with Dandelion and compare it with your current implementation.
Problem statement
I am trying to train a dynamic RNN in TensorFlow v1.0.1 on Linux RedHat 7.3 (problem also manifests on Windows 7), and no matter what I try, I get the exact same training and validation error at every epoch, i.e. my weights are not updating.
I appreciate any help you can offer.
Example
I tried to reduce this to a minimum example that shows my issue, but the minimum example is still pretty large. I based the network structure largely on this gist.
Network definition
import functools
import numpy as np
import tensorflow as tf
def lazy_property(function):
attribute = '_' + function.__name__
#property
#functools.wraps(function)
def wrapper(self):
if not hasattr(self, attribute):
setattr(self, attribute, function(self))
return getattr(self, attribute)
return wrapper
class MyNetwork:
"""
Class defining an RNN for labeling a time series.
"""
def __init__(self, data, target, num_hidden=64):
self.data = data
self.target = target
self._num_hidden = num_hidden
self._num_steps = int(self.target.get_shape()[1])
self._num_classes = int(self.target.get_shape()[2])
self._weight_and_bias() # create weight and bias tensors
self.prediction
self.error
self.optimize
#lazy_property
def prediction(self):
"""Defines the recurrent neural network prediction scheme."""
# Dynamic LSTM.
network = tf.contrib.rnn.BasicLSTMCell(self._num_hidden)
output, _ = tf.nn.dynamic_rnn(network, data, dtype=tf.float32)
# Flatten and apply same weights to all time steps.
output = tf.reshape(output, [-1, self._num_hidden])
prediction = tf.nn.softmax(tf.matmul(output, self.weight) + self.bias)
prediction = tf.reshape(prediction,
[-1, self._num_steps, self._num_classes])
return prediction
#lazy_property
def cost(self):
"""Defines the cost function for the network."""
cross_entropy = -tf.reduce_sum(self.target * tf.log(self.prediction),
axis=[1, 2])
cross_entropy = tf.reduce_mean(cross_entropy)
return cross_entropy
#lazy_property
def optimize(self):
"""Defines the optimization scheme."""
learning_rate = 0.003
optimizer = tf.train.RMSPropOptimizer(learning_rate)
return optimizer.minimize(self.cost)
#lazy_property
def error(self):
"""Defines a measure of prediction error."""
mistakes = tf.not_equal(tf.argmax(self.target, 2),
tf.argmax(self.prediction, 2))
return tf.reduce_mean(tf.cast(mistakes, tf.float32))
def _weight_and_bias(self):
"""Returns appropriately sized weight and bias tensors for the output layer."""
self.weight = tf.Variable(tf.truncated_normal(
[self._num_hidden, self._num_classes],
mean=0.0,
stddev=0.01,
dtype=tf.float32))
self.bias = tf.Variable(tf.constant(0.1, shape=[self._num_classes]))
Training
Here is my training process. The all_data class just holds my data and labels, and uses a batch generator class to spit out batches for training when I call all_data.train.next() and all_data.train_labels.next(). You can reproduce with any batch generation scheme you like, and I can add the code if you think it is relevant; I felt like this was getting too long as it is.
tf.reset_default_graph()
data = tf.placeholder(tf.float32,
[None, all_data.num_steps, all_data.num_features])
target = tf.placeholder(tf.float32,
[None, all_data.num_steps, all_data.num_outputs])
model = MyNetwork(data, target, NUM_HIDDEN)
print('Training the model...')
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
print('Initialized.')
for epoch in range(3):
print('Epoch {} |'.format(epoch), end='', flush=True)
for step in range(all_data.train_size // BATCH_SIZE):
# Generate the next training batch and train.
d = all_data.train.next()
t = all_data.train_labels.next()
sess.run(model.optimize,
feed_dict={data: d, target: t})
# Update the user periodically.
if step % summary_frequency == 0:
print('.', end='', flush=True)
# Show training and validation error at the end of each epoch.
print('|', flush=True)
train_error = sess.run(model.error,
feed_dict={data: d, target: t})
valid_error = sess.run(model.error,
feed_dict={
data: all_data.valid,
target: all_data.valid_labels
})
print('Training error: {}%'.format(100 * train_error))
print('Validation error: {}%'.format(100 * valid_error))
# Check testing error after everything.
test_error = sess.run(model.error,
feed_dict={
data: all_data.test,
target: all_data.test_labels
})
print('Testing error after {} epochs: {}%'.format(epoch + 1, 100 * test_error))
For a simple example, I generated random data and labels, where data has shape [num_samples, num_steps, num_features], and each sample has a single label associated with the whole thing:
data = np.random.rand(5000, 1000, 2)
labels = np.random.randint(low=0, high=2, size=[5000])
I then converted my labels to one-hot vectors and tiled them so that the resulting labels tensor was the same size as the data tensor.
Results
No matter what I do, I get results like this:
Training the model...
Initialized.
Epoch 0 |.......................................................|
Training error: 56.25%
Validation error: 53.39999794960022%
Epoch 1 |.......................................................|
Training error: 56.25%
Validation error: 53.39999794960022%
Epoch 2 |.......................................................|
Training error: 56.25%
Validation error: 53.39999794960022%
Testing error after 3 epochs: 49.000000953674316%
Where I have exactly the same error at every epoch. Even if my weights were randomly walking around this should change. For the example shown here, I used random data with random labels, so I do not expect much improvement, but I do expect some change, and I am getting the exact same results every epoch. When I do this with my actual data set, I get the same behavior.
Insight
I hesitate to include this in case it proves to be a red herring, but I believe that my optimizer is calculating cost function gradients of None. When I tried a different optimizer and attempted to clip the gradients, I went ahead and used tf.Print to output the gradients as well. The network crashed with an error that tf.Print could not handle None-type values.
Attempted fixes
I have tried the following things, and the problem persists in all cases:
Using different optimizers, e.g. AdamOptimizer with and without modifications to the gradients (clipping).
Adjusting batch sizes.
Using many more and many fewer hidden nodes.
Running for more epochs.
Initializing my weights with different values assigned to stddev.
Initializing my biases to zeros (using tf.zeros) and to different constants.
Using weights and biases that are defined within the prediction method and are not member variables of the class, and a _weight_and_bias method that is defined as a #staticmethod like in this gist.
Determining logits in the prediction function instead of softmax predictions, i.e. predictions = tf.matmul(output, self.weights) + self.bias, and then using tf.nn.softmax_cross_entropy_with_logits. This requires some reshaping because the method wants its labels and targets given with shape [batch_size, num_classes], so the cost method becomes:
(line added to get code to format...)
#lazy_property
def cost(self):
"""Defines the cost function for the network."""
targs = tf.reshape(self.target, [-1, self._num_classes])
logits = tf.reshape(self.predictions, [-1, self._num_classes])
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=targs, logits=logits)
cross_entropy = tf.reduce_mean(cross_entropy)
return cross_entropy
Changing which size dimension I leave as None when I create my placeholders as suggested in this answer, which requires a bit of rewriting in the network definition. Basically setting size = [all_data.batch_size, -1, all_data.num_features] and size = [all_data.batch_size, -1, all_data.num_classes].
Using tf.contrib.rnn.DropoutWrapper in my network definition and passing a dropout value set to 0.5 in training and 1.0 in validation and testing.
The problem went away when I used
output = tf.contrib.layers.flatten(output)
logits = tf.contrib.layers.fully_connected(output, some_size, activation_fn=None)
instead of flattening my network output, defining weights, and performing the tf.matmul(output, weight) + bias manually. I then used logits (instead of predictions in the question) in my cost function with
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=target,
logits=logits)
If you want to get the network prediction, you will still need to do prediction = tf.nn.softmax(logits).
I have no idea why this helped, but the network would not train even on random made-up data until I made these changes.