PyTorch loading GradScaler from checkpoint - python

I am saving my model, optimizer, scheduler, and scaler in a general checkpoint.
Now when I load them, they load properly but after the first iteration the scaler.step(optimizer) throws this error:
Traceback (most recent call last):
File "HistNet/trainloop.py", line 92, in <module>
scaler.step(optimizer)
File "/opt/conda/lib/python3.8/site-packages/torch/cuda/amp/grad_scaler.py", line 333, in step
retval = optimizer.step(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/optim/lr_scheduler.py", line 65, in wrapper
return wrapped(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/optim/optimizer.py", line 89, in wrapper
return func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/optim/adam.py", line 108, in step
F.adam(params_with_grad,
File "/opt/conda/lib/python3.8/site-packages/torch/optim/functional.py", line 86, in adam
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
RuntimeError: The size of tensor a (32) must match the size of tensor b (64) at non-singleton dimension 0
I don't understand why a shape mismatch, of all things, would occur. I'm following the official docs closely; here is a shortened version of my code:
# Shortened training script: build the data loader, model, optimizer, LR
# scheduler and AMP gradient scaler, optionally resume from a checkpoint,
# then train and write a full checkpoint after every epoch.
dataloader = DataLoader(Dataset)
model1 = model1()
optimizer = optim.Adam(parameters, lr, betas)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: decay_rate**epoch)
scaler = amp.GradScaler()
# The loop below runs up to and including config['epochs'], so epochs are
# presumably numbered from 1; a fresh run therefore starts at 1 -- TODO confirm.
epoch_resume = 1
if resume:
    epoch_resume = load_checkpoint(path, model1, optimizer, scheduler, scaler)
for epoch in trange(epoch_resume, config['epochs']+1, desc='Epochs'):
    for content_image, style_image in tqdm(dataloader, desc='Dataloader'):
        content_image, style_image = content_image.to(device), style_image.to(device)
        with amp.autocast():
            content_image = TF.rgb_to_grayscale(content_image)
            s = TF.rgb_to_grayscale(style_image)
            deformation_field = model1(s, content_image)
            output_image = F.grid_sample(content_image, deformation_field.float(), align_corners=False)
            loss_after = cost_function(output_image, s, device=device)
        # Keep only the value: storing the attached tensor would keep every
        # iteration's autograd graph alive.
        loss_list += [loss_after.detach()]
        scaler.scale(loss_after).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
    scheduler.step()
    torch.save({
        'epoch': epoch,
        'model1_state_dict': model1.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'scaler_state_dict': scaler.state_dict(),
    }, path)
def load_checkpoint(checkpoint_path, model1, optimizer, scheduler, scaler, map_location=None):
    """Restore training state from a checkpoint and return the epoch to resume at.

    The checkpoint is expected to be a dict with the keys ``'epoch'``,
    ``'model1_state_dict'``, ``'optimizer_state_dict'``,
    ``'scheduler_state_dict'`` and ``'scaler_state_dict'``.

    The optimizer must have been constructed over the same parameters, in the
    same order, as the optimizer that produced the checkpoint; otherwise its
    per-parameter state is attached to the wrong tensors.

    Args:
        checkpoint_path: Path of the file written by ``torch.save``.
        model1: Model whose weights are restored; it is left in training mode.
        optimizer: Optimizer whose state is restored.
        scheduler: LR scheduler whose state is restored.
        scaler: Gradient scaler whose state is restored.
        map_location: Optional device remapping forwarded to ``torch.load``
            (e.g. ``'cpu'``); ``None`` keeps the devices stored in the file.

    Returns:
        The stored epoch plus one, i.e. the first epoch still to be run.
    """
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model1.load_state_dict(checkpoint['model1_state_dict'])
    model1.train()
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    scaler.load_state_dict(checkpoint['scaler_state_dict'])
    return checkpoint['epoch'] + 1

For anyone with similar issue:
It boiled down to my use of 2 models and 1 optimizer. I did:
parameters = set()
for net in nets:
    # NOTE: a set has no stable iteration order, so the order in which the
    # optimizer sees the parameters can change from one run to the next.
    parameters |= set(net.parameters())
which resulted in an unordered collection of parameters whose order, unsurprisingly, differed on each resume, so the loaded optimizer state no longer lined up with the parameters.
I currently changed it to:
# A list keeps the parameters in a deterministic order (model by model), so
# the optimizer state saved in a checkpoint lines up again after a resume.
parameters = []
for net in nets:
    parameters.extend(net.parameters())
which works, but I haven't seen a list used for this in any other code so far, whereas I have seen a set used, so be wary of potential unwanted behavior. As far as I understand, the only thing you lose is deduplication: a list can contain the same tensor more than once. With two different models I don't see how that could affect the optimizer. If you know more than I do, please correct me.

Related

Problem completing BERT model for sentiment classification

I am trying to figure out sentiment classification on movie reviews using BERT, transformers and tensorflow. This is the code I currently have:
def read_dataset(filename, model_name="bert-base-uncased"):
    """Read a tab-separated dataset and return tokenized sentences and labels.

    Every line must hold exactly three tab-separated fields: the text, a label
    field of the form ``<name>=<value>``, and a third field that is ignored.
    A label value of ``POS`` is encoded as 1, anything else as 0.

    Args:
        filename: Path of the UTF-8 encoded dataset file.
        model_name: Name of the pretrained tokenizer to load.

    Returns:
        A tuple ``(encodings, labels)`` where ``encodings`` is the tokenizer
        output converted to a plain dict and ``labels`` is an integer array of
        shape ``(number_of_lines, 1)``.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    with open(filename, "r", encoding="utf-8") as f:
        lines = f.readlines()
    # preallocate memory for the labels
    sents, labels = [], np.empty((len(lines), 1), dtype=int)
    for i, line in enumerate(lines):
        text, str_label, _ = line.split("\t")
        labels[i] = int(str_label.split("=")[1] == "POS")
        sents.append(text)
    return dict(tokenizer(sents, padding=True, truncation=True, return_tensors="tf")), labels
class BertMLP(tf.keras.Model):
    """Keras model wrapping a pretrained BERT encoder plus an MLP head.

    The classification head (three tanh layers followed by a single sigmoid
    unit) is built in ``__init__``; ``call`` currently returns the raw BERT
    outputs and does not apply the head.
    """

    def __init__(self, embed_batch_size=100, model_name="bert-base-cased"):
        """Load the pretrained encoder and build the classification head.

        Args:
            embed_batch_size: Stored as ``self.bs``; not used elsewhere in
                this class.
            model_name: Name of the pretrained BERT weights to load.
        """
        super().__init__()
        self.bs = embed_batch_size
        self.model = TFBertModel.from_pretrained(model_name)
        self.classification_head = tf.keras.models.Sequential(
            layers=[
                tf.keras.Input(shape=(self.model.config.hidden_size,)),
                tf.keras.layers.Dense(350, activation="tanh"),
                tf.keras.layers.Dense(200, activation="tanh"),
                tf.keras.layers.Dense(50, activation="tanh"),
                tf.keras.layers.Dense(1, activation="sigmoid", use_bias=False),
            ]
        )

    def call(self, inputs):
        """Run the BERT encoder on ``inputs`` and return its outputs unchanged."""
        outputs = self.model(inputs)
        return outputs
def evaluate(model, inputs, labels, loss_func):
    """Evaluate ``model`` on one batch and return ``(loss, accuracy_percent)``.

    Args:
        model: Callable model producing predictions for ``inputs``.
        inputs: Model inputs, passed to ``model`` as a single argument.
        labels: Ground-truth labels matching the predictions.
        loss_func: Callable ``loss_func(labels, predictions)``.

    Returns:
        The mean loss and the binary accuracy scaled to a percentage.
    """
    mean_loss = tf.keras.metrics.Mean(name="train_loss")
    accuracy = tf.keras.metrics.BinaryAccuracy(name="train_accuracy")
    predictions = model(inputs)
    mean_loss(loss_func(labels, predictions))
    accuracy(labels, predictions)
    return mean_loss.result(), accuracy.result() * 100
if __name__ == "__main__":
    # Load the three splits; each is an (encodings, labels) pair.
    train = read_dataset("datasets/rt-polarity.train.vecs")
    dev = read_dataset("datasets/rt-polarity.dev.vecs")
    test = read_dataset("datasets/rt-polarity.test.vecs")
    mlp = BertMLP()
    mlp.compile(tf.keras.optimizers.SGD(learning_rate=0.01), loss='mse')
    # Report dev-set metrics before and after training for comparison.
    dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())
    print("Before training:", f"Dev Loss: {dev_loss}, Dev Acc: {dev_acc}")
    mlp.fit(*train, epochs=10, batch_size=10)
    dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())
    print("After training:", f"Dev Loss: {dev_loss}, Dev Acc: {dev_acc}")
However, when I run this code, I get an error:
Traceback (most recent call last):
File "C:\Users\home\anaconda3\lib\site-packages\spyder_kernels\py3compat.py", line 356, in compat_exec
exec(code, globals, locals)
File "c:\users\home\downloads\mlp.py", line 60, in <module>
dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())
File "c:\users\home\downloads\mlp.py", line 46, in evaluate
predictions = model(inputs)
File "C:\Users\home\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "c:\users\home\downloads\mlp.py", line 39, in call
outputs = self.model(inputs)
File "C:\Users\home\anaconda3\lib\site-packages\transformers\modeling_tf_utils.py", line 409, in run_call_with_unpacked_inputs
return func(self, **unpacked_inputs)
File "C:\Users\home\anaconda3\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 1108, in call
outputs = self.bert(
File "C:\Users\home\anaconda3\lib\site-packages\transformers\modeling_tf_utils.py", line 409, in run_call_with_unpacked_inputs
return func(self, **unpacked_inputs)
File "C:\Users\home\anaconda3\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 781, in call
embedding_output = self.embeddings(
File "C:\Users\home\anaconda3\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 203, in call
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
InvalidArgumentError: Exception encountered when calling layer "embeddings" (type TFBertEmbeddings).
indices[1174,8] = 29550 is not in [0, 28996) [Op:ResourceGather]
Call arguments received:
• input_ids=tf.Tensor(shape=(1599, 73), dtype=int32)
• position_ids=None
• token_type_ids=tf.Tensor(shape=(1599, 73), dtype=int32)
• inputs_embeds=None
• past_key_values_length=0
• training=False
I googled for a while, and I can't find anything conclusive. I am pretty sure it has something to do with this part:
def call(self, inputs):
    """Run the wrapped model on ``inputs`` and return its outputs unchanged."""
    outputs = self.model(inputs)
    return outputs
But again, I have tried a lot of different things, including limiting dataset size and installing different versions of transformers and tensorflow, but to no avail. Please let me know what I'm doing wrong. Thank you!
OP was using bert-base-cased for their model, and bert-base-uncased for their tokenizer, causing issues during training when the vocab size of the model and the tokenized data differed.

How to train Pytorch model on custom data

I am new to porting my code from Keras/TensorFlow to PyTorch and I am trying to retrain my TF model in PyTorch; however, my dataset has some particularities that make it difficult for me to get it running in PyTorch.
To understand my issues, recall that I have a custom dataset initialized this way:
class MyDataSet(torch.utils.data.Dataset):
    """Dataset backed by two ``.npy`` files holding features and labels.

    The feature array is loaded channels-last and rearranged to
    channels-first, i.e. axes ``(0, 1, 2, 3)`` become ``(0, 3, 1, 2)``.
    """

    def __init__(self, x, y, transform=None):
        """Load both arrays into memory.

        Args:
            x: Path of the ``.npy`` file with the features.
            y: Path of the ``.npy`` file with the labels.
            transform: Stored on the instance; it is not applied in
                ``__getitem__``.
        """
        super().__init__()
        # store the raw arrays; a single transpose replaces the two
        # successive axis swaps (3<->2, then 2<->1) and yields the same view
        self._x = np.transpose(np.load(x), (0, 3, 1, 2))
        self._y = np.load(y)
        self.transform = transform

    def __len__(self):
        """Return the number of samples (size of the first feature axis)."""
        return self._x.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        x = self._x[index, :]
        y = self._y[index]
        return x, y
The shape of self._x is (12000, 3, 224, 224) and the shape of self._y is (12000,).
I am fine-tuning a pre-trained RESNET-50 in this data, and the training happens the following way:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet50
import time
import copy
def set_parameter_requires_grad(model, feature_extracting):
    """Freeze every parameter of ``model`` when ``feature_extracting`` is true.

    When ``feature_extracting`` is false the model is left untouched.
    """
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False
# Transform dataset
print("Loading Data")
transform = transforms.Compose([transforms.ToTensor()])
dataset = MyDataSet("me/train1-features.npy","/me/train1-classes.npy",transform=transform)
dataloader = DataLoader(dataset, batch_size=4)
print("Configuring network")
feature_extract = True
num_epochs = 15
num_classes=12
model_ft = resnet50(pretrained=True)
# Freeze the pretrained backbone, then replace the final layer; the new
# layer's parameters are created afterwards and so remain trainable.
set_parameter_requires_grad(model_ft, feature_extract)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, num_classes)
if torch.cuda.is_available():
    model_ft.cuda()
params_to_update = model_ft.parameters()
print("Params to learn:")
if feature_extract:
    # Collect only the parameters that still require gradients.
    params_to_update = []
    for name, param in model_ft.named_parameters():
        if param.requires_grad:
            params_to_update.append(param)
            print("\t",name)
else:
    for name, param in model_ft.named_parameters():
        if param.requires_grad:
            print("\t",name)
# Only the parameters gathered in params_to_update are optimized
optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)
# Setup the loss fxn
criterion = nn.CrossEntropyLoss()
#Train (how to validate?)
for epoch in range(num_epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(dataloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        #transfer labels and inputs to cuda()
        inputs,labels=inputs.cuda(), labels.cuda()
        # zero the parameter gradients
        optimizer_ft.zero_grad()
        # forward + backward + optimize
        outputs = model_ft(inputs)
        # use the names defined above (criterion / optimizer_ft); the
        # previous loss_func / optimizer were never defined
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_ft.step()
        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:  # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0
However, whenever I run this code, I receive the following error
Traceback (most recent call last):
File "train_my_data_example.py", line 114, in <module>
outputs = model_ft(inputs)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/torchvision/models/resnet.py", line 249, in forward
return self._forward_impl(x)
File "/usr/local/lib/python3.8/dist-packages/torchvision/models/resnet.py", line 232, in _forward_impl
x = self.conv1(x)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py", line 399, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py", line 395, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Input type (torch.cuda.ByteTensor) and weight type (torch.cuda.FloatTensor) should be the same
I also can do the train and validation procedures normally on TF/Keras, but I don't know how to do that in my custom Dataset with Pytorch.
How can I solve my problem and also run train/val loop with Pytorch in my custom data?
It seems that np.load is loading the features as unsigned 8-bit integers (uint8), and that dtype is preserved when the arrays are turned into tensors, so the model receives a ByteTensor while its weights are FloatTensors. You can fix this by making a small change in __getitem__:
def __getitem__(self, index):
    """Return the ``(features, label)`` pair at ``index`` with float32 features."""
    x = self._x[index, :]
    y = self._y[index]
    # Cast here so the batches match the model's float weights.
    return x.astype(np.float32), y

Pytorch Tensor for 'out' is on CPU, Tensor for argument #1 'self' is on CPU, but expected them to be on GPU (while checking arguments for addmm)

I am a beginner in machine learning and am trying to train a model to count how many numbers in a 1D vector of length 10 are below 0.5. The input vectors contain numbers between 0 and 1. I generate the input data and the labels in my script instead of keeping them in a separate file, because the data is so simple.
This is the Code:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class MyNet(nn.Module):
    """Two-layer perceptron mapping a length-10 vector to a single value."""

    def __init__(self):
        """Create the 10->10 hidden layer and the 10->1 output layer."""
        super().__init__()
        self.lin1 = nn.Linear(10, 10)
        self.lin2 = nn.Linear(10, 1)

    def forward(self, x):
        """Apply ``lin1``, a ReLU, then ``lin2`` and return the result."""
        x = self.lin1(x)
        x = F.relu(x)
        x = self.lin2(x)
        return x
# Build the network and move its parameters to the selected device.
net = MyNet().to(device)
def train():
    """Train the module-level ``net`` for 100 steps on random samples.

    Each step draws a random length-10 vector, counts its entries below 0.5
    as the regression target, and takes one SGD step on the MSE loss. The
    loss is printed at every step.
    """
    criterion = nn.MSELoss()
    optimizer = optim.SGD(net.parameters(), lr=0.1)
    for epochs in range(100):
        target = 0
        data = torch.rand(10)
        for entry in data:
            if entry < 0.5:
                target += 1
        data = data.to(device)
        out = net(data)
        # torch.Tensor(n) with an int allocates an *uninitialised* tensor of
        # length n; build a one-element float tensor holding the count instead
        # so it matches the shape of the network output.
        target = torch.tensor([float(target)], device=device)
        loss = criterion(out, target)
        print(loss)
        net.zero_grad()
        loss.backward()
        optimizer.step()
def test():
    """Print the mean absolute counting error of ``net`` over 100 random samples."""
    acc_error = 0
    for i in range(100):
        test_data = torch.rand(10)
        test_data.to(device)
        test_target = 0
        for entry in test_data:
            if entry < 0.5:
                test_target += 1
        out = net(test_data)
        error = test_target - out
        if error < 0:
            error *= -1
        acc_error += error
    overall_error = acc_error / 100
    print(overall_error)
# Train first, then print the mean error measured by test().
train()
test()
This is the error:
Traceback (most recent call last):
File "test1.py", line 70, in <module>
test()
File "test1.py", line 59, in test
out = net(test_data)
File "/vol/fob-vol7/mi18/radtklau/SP/sem_project/lib64/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "test1.py", line 15, in forward
x = self.lin1(x)
File "/vol/fob-vol7/mi18/radtklau/SP/sem_project/lib64/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/vol/fob-vol7/mi18/radtklau/SP/sem_project/lib64/python3.6/site-packages/torch/nn/modules/linear.py", line 94, in forward
return F.linear(input, self.weight, self.bias)
File "/vol/fob-vol7/mi18/radtklau/SP/sem_project/lib64/python3.6/site-packages/torch/nn/functional.py", line 1753, in linear
return torch._C._nn.linear(input, weight, bias)
RuntimeError: Tensor for 'out' is on CPU, Tensor for argument #1 'self' is on CPU, but expected them to be on GPU (while checking arguments for addmm)
The other posts regarding the topic have not solved my problem. Maybe somebody can help. Thanks!
Notice how your error message traces back to test, while train works fine.
You've transferred your data correctly in train:
data = data.to(device)
But not in test:
test_data.to(device)
Instead, the result should be assigned back to test_data, since torch.Tensor.to returns a new tensor rather than moving the existing one in place:
test_data = test_data.to(device)

Resuming pytorch model training raises error “CUDA out of memory”

My goal is to save the model at every epoch as I have to stop the training during the night and I don't want to lose progress.
After I trained my model for 1 epoch I interrupted the process via terminal with CTRL+Z.
When I tried to resume the training I got this error
THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1525909934016/work/aten/src/THC/generic/THCStorage.cu line=58 error=2 : out of memory
Traceback (most recent call last):
File "train.py", line 174, in <module>
train(train_loader, model, optimizer, epoch)
File "train.py", line 97, in train
loss1 = CE(atts, gts)
File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/module.py", line 491, in __call__
result = self.forward(*input, **kwargs)
File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/loss.py", line 500, in forward
reduce=self.reduce)
File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/functional.py", line 1516, in binary_cross_entropy_with_logits
max_val = (-input).clamp(min=0)
RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1525909934016/work/aten/src/THC/generic/THCStorage.cu:58
The code that manages everything is this one
import wandb
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import pdb, os, argparse
from datetime import datetime
from model.CPD_models import CPD_VGG
from model.CPD_ResNet_models import CPD_ResNet
from data import get_loader
from utils import clip_gradient, adjust_lr
# Command-line options for the training run.
# NOTE: argparse's type=bool converts any non-empty string (including
# "False") to True, so --is_ResNet / --wandb are only off when omitted.
parser = argparse.ArgumentParser()
parser.add_argument('--epoch', type=int, default=10, help='epoch number')
parser.add_argument('--lr', type=float, default=1e-4, help='learning rate')
parser.add_argument('--batchsize', type=int, default=1, help='training batch size')
parser.add_argument('--trainsize', type=int, default=352, help='training dataset size')
parser.add_argument('--clip', type=float, default=0.5, help='gradient clipping margin')
parser.add_argument('--is_ResNet', type=bool, default=False, help='VGG or ResNet backbone')
parser.add_argument('--decay_rate', type=float, default=0.1, help='decay rate of learning rate')
parser.add_argument('--decay_epoch', type=int, default=50, help='every n epochs decay learning rate')
parser.add_argument('--model_id', type=str, required=True, help='required unique id for trained model name')
parser.add_argument('--resume', type=str, default='', help='path to resume model training from checkpoint')
parser.add_argument('--wandb', type=bool, default=False, help='enable wandb tracking model training')
opt = parser.parse_args()
model_id = opt.model_id
WANDB_EN = opt.wandb
if WANDB_EN:
    wandb.init(entity="albytree", project="cpd-train")
    # Add all parsed config in one line
    wandb.config.update(opt)
tot_epochs = opt.epoch
print("Training Info")
print("EPOCHS: {}".format(opt.epoch))
print("LEARNING RATE: {}".format(opt.lr))
print("BATCH SIZE: {}".format(opt.batchsize))
print("TRAIN SIZE: {}".format(opt.trainsize))
print("CLIP: {}".format(opt.clip))
print("USING ResNet backbone: {}".format(opt.is_ResNet))
print("DECAY RATE: {}".format(opt.decay_rate))
print("DECAY EPOCH: {}".format(opt.decay_epoch))
print("MODEL ID: {}".format(opt.model_id))
# build models
if opt.is_ResNet:
    model = CPD_ResNet()
else:
    model = CPD_VGG()
model.cuda()
params = model.parameters()
optimizer = torch.optim.Adam(params, opt.lr)
# If no previous training, 0 epochs passed
last_epoch = 0
resume_model_path = opt.resume
if resume_model_path:
    # Restore model weights, optimizer state and the last finished epoch.
    print("Loading previous trained model:"+resume_model_path)
    checkpoint = torch.load(resume_model_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    last_epoch = checkpoint['epoch']
    last_loss = checkpoint['loss']
dataset_name = 'ECSSD'
image_root = '../../DATASETS/TEST/'+dataset_name+'/im/'
gt_root = '../../DATASETS/TEST/'+dataset_name+'/gt/'
train_loader = get_loader(image_root, gt_root, batchsize=opt.batchsize, trainsize=opt.trainsize)
total_step = len(train_loader)
print("Total step per epoch: {}".format(total_step))
CE = torch.nn.BCEWithLogitsLoss()
####################################################################################################
def train(train_loader, model, optimizer, epoch):
    """Run one training epoch, then save a checkpoint for that epoch.

    Args:
        train_loader: Iterable yielding ``(images, gts)`` batches.
        model: Network returning ``(atts, dets)`` for a batch of images.
        optimizer: Optimizer stepping the model parameters.
        epoch: Current epoch number, used for logging and the file name.

    The checkpoint holds the model and optimizer state dicts, the epoch and
    the last loss. An existing file of the same name is only overwritten
    after interactive confirmation.
    """
    model.train()
    for i, pack in enumerate(train_loader, start=1):
        optimizer.zero_grad()
        images, gts = pack
        images = Variable(images)
        gts = Variable(gts)
        images = images.cuda()
        gts = gts.cuda()
        atts, dets = model(images)
        # Both outputs are compared against the same ground truth.
        loss1 = CE(atts, gts)
        loss2 = CE(dets, gts)
        loss = loss1 + loss2
        loss.backward()
        clip_gradient(optimizer, opt.clip)
        optimizer.step()
        if WANDB_EN:
            wandb.log({'Loss': loss})
        if i % 100 == 0 or i == total_step:
            print('{} Epoch [{:03d}/{:03d}], Step [{:04d}/{:04d}], Loss1: {:.4f} Loss2: {:0.4f}'.
                  format(datetime.now(), epoch, opt.epoch, i, total_step, loss1.data, loss2.data))
    # Save model and optimizer training data
    trained_model_data = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
        'loss': loss
    }
    if opt.is_ResNet:
        save_path = 'models/CPD_Resnet/'
    else:
        save_path = 'models/CPD_VGG/'
    if not os.path.exists(save_path):
        print("Making trained model folder [{}]".format(save_path))
        os.makedirs(save_path)
    torch_model_ext = '.pth'
    wandb_model_ext = '.h5'
    model_unique_id = model_id+'_'+'ep'+'_'+'%d' % epoch
    trained_model_name = 'CPD_train'
    save_full_path_torch = save_path + trained_model_name + '_' + model_unique_id + torch_model_ext
    save_full_path_wandb = save_path + trained_model_name + '_' + model_unique_id + wandb_model_ext
    if os.path.exists(save_full_path_torch):
        print("Torch model with name ["+save_full_path_torch+"] already exists!")
        answ = raw_input("Do you want to replace it? [y/n] ")
        if("y" in answ):
            torch.save(trained_model_data, save_full_path_torch)
            print("Saved torch model in "+save_full_path_torch)
    else:
        torch.save(trained_model_data, save_full_path_torch)
        print("Saved torch model in "+save_full_path_torch)
    if WANDB_EN:
        if os.path.exists(save_full_path_wandb):
            print("Wandb model with name ["+save_full_path_wandb+"] already exists!")
            answ = raw_input("Do you want to replace it? [y/n] ")
            if("y" in answ):
                wandb.save(save_full_path_wandb)
                print("Saved wandb model in "+save_full_path_wandb)
        else:
            wandb.save(save_full_path_wandb)
            print("Saved wandb model in "+save_full_path_wandb)
####################################################################################################
print("Training on dataset: "+dataset_name)
print("Train images path: "+image_root)
print("Train gt path: "+gt_root)
print("Let's go!")
if WANDB_EN:
    wandb.watch(model, log="all")
# Continue with the epoch after the last finished one (1 on a fresh run).
for epoch in range(last_epoch+1, tot_epochs+1):
    adjust_lr(optimizer, opt.lr, epoch, opt.decay_rate, opt.decay_epoch)
    train(train_loader, model, optimizer, epoch)
print("TRAINING DONE!")
It seems that there's something wrong with the loss but I cannot understand what's the problem.
EDIT 1:
I trained the model for 2 epochs without errors and then I interrupted the process.
I also killed the process that was left in GPU memory.
After I tried to resume the model saved at epoch 1 and epoch 2 I got the same cuda error but in a different part of the code
THCudaCheck FAIL file=/opt/conda/conda-bld/pytorch_1525909934016/work/aten/src/THC/generic/THCStorage.cu line=58 error=2 : out of memory
Traceback (most recent call last):
File "train.py", line 191, in <module>
train(train_loader, model, optimizer, epoch)
File "train.py", line 112, in train
atts, dets = model(images)
File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/module.py", line 491, in __call__
result = self.forward(*input, **kwargs)
File "/home/albytree/TESI/CODICE/Workspace/ALGS/CPD/model/CPD_models.py", line 131, in forward
detection = self.agg2(x5_2, x4_2, x3_2)
File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/module.py", line 491, in __call__
result = self.forward(*input, **kwargs)
File "/home/albytree/TESI/CODICE/Workspace/ALGS/CPD/model/CPD_models.py", line 86, in forward
x3_2 = torch.cat((x3_1, self.conv_upsample5(self.upsample(x2_2))), 1)
RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1525909934016/work/aten/src/THC/generic/THCStorage.cu:58
Moreover I tried to test the saved model at epoch 1 and epoch 2 and got this error
Traceback (most recent call last):
File "test.py", line 45, in <module>
model.load_state_dict(torch.load(opt.model_path))
File "/home/albytree/miniconda3/envs/cpd-wandb/lib/python2.7/site-packages/torch/nn/modules/module.py", line 721, in load_state_dict
self.__class__.__name__, "\n\t".join(error_msgs)))
RuntimeError: Error(s) in loading state_dict for CPD_VGG:
Missing key(s) in state_dict: "vgg.conv1.conv1_1.bias", "vgg.conv1.conv1_1.weight", "vgg.conv1.conv1_2.bias", "vgg.conv1.conv1_2.weight", "vgg.conv2.conv2_1.bias", "vgg.conv2.conv2_1.weight", "vgg.conv2.conv2_2.bias", "vgg.conv2.conv2_2.weight", "vgg.conv3.conv3_1.bias", "vgg.conv3.conv3_1.weight", "vgg.conv3.conv3_2.bias", "vgg.conv3.conv3_2.weight", "vgg.conv3.conv3_3.bias", "vgg.conv3.conv3_3.weight", "vgg.conv4_1.conv4_1_1.bias", "vgg.conv4_1.conv4_1_1.weight", "vgg.conv4_1.conv4_2_1.bias", "vgg.conv4_1.conv4_2_1.weight", "vgg.conv4_1.conv4_3_1.bias", "vgg.conv4_1.conv4_3_1.weight", "vgg.conv5_1.conv5_1_1.bias", "vgg.conv5_1.conv5_1_1.weight", "vgg.conv5_1.conv5_2_1.bias", "vgg.conv5_1.conv5_2_1.weight", "vgg.conv5_1.conv5_3_1.bias", "vgg.conv5_1.conv5_3_1.weight", "vgg.conv4_2.conv4_1_2.bias", "vgg.conv4_2.conv4_1_2.weight", "vgg.conv4_2.conv4_2_2.bias", "vgg.conv4_2.conv4_2_2.weight", "vgg.conv4_2.conv4_3_2.bias", "vgg.conv4_2.conv4_3_2.weight", "vgg.conv5_2.conv5_1_2.bias", "vgg.conv5_2.conv5_1_2.weight", "vgg.conv5_2.conv5_2_2.bias", "vgg.conv5_2.conv5_2_2.weight", "vgg.conv5_2.conv5_3_2.bias", "vgg.conv5_2.conv5_3_2.weight", "rfb3_1.branch0.0.bias", "rfb3_1.branch0.0.weight", "rfb3_1.branch1.0.bias", "rfb3_1.branch1.0.weight", "rfb3_1.branch1.1.bias", "rfb3_1.branch1.1.weight", "rfb3_1.branch1.2.bias", "rfb3_1.branch1.2.weight", "rfb3_1.branch1.3.bias", "rfb3_1.branch1.3.weight", "rfb3_1.branch2.0.bias", "rfb3_1.branch2.0.weight", "rfb3_1.branch2.1.bias", "rfb3_1.branch2.1.weight", "rfb3_1.branch2.2.bias", "rfb3_1.branch2.2.weight", "rfb3_1.branch2.3.bias", "rfb3_1.branch2.3.weight", "rfb3_1.branch3.0.bias", "rfb3_1.branch3.0.weight", "rfb3_1.branch3.1.bias", "rfb3_1.branch3.1.weight", "rfb3_1.branch3.2.bias", "rfb3_1.branch3.2.weight", "rfb3_1.branch3.3.bias", "rfb3_1.branch3.3.weight", "rfb3_1.conv_cat.bias", "rfb3_1.conv_cat.weight", "rfb3_1.conv_res.bias", "rfb3_1.conv_res.weight", "rfb4_1.branch0.0.bias", "rfb4_1.branch0.0.weight", 
"rfb4_1.branch1.0.bias", "rfb4_1.branch1.0.weight", "rfb4_1.branch1.1.bias", "rfb4_1.branch1.1.weight", "rfb4_1.branch1.2.bias", "rfb4_1.branch1.2.weight", "rfb4_1.branch1.3.bias", "rfb4_1.branch1.3.weight", "rfb4_1.branch2.0.bias", "rfb4_1.branch2.0.weight", "rfb4_1.branch2.1.bias", "rfb4_1.branch2.1.weight", "rfb4_1.branch2.2.bias", "rfb4_1.branch2.2.weight", "rfb4_1.branch2.3.bias", "rfb4_1.branch2.3.weight", "rfb4_1.branch3.0.bias", "rfb4_1.branch3.0.weight", "rfb4_1.branch3.1.bias", "rfb4_1.branch3.1.weight", "rfb4_1.branch3.2.bias", "rfb4_1.branch3.2.weight", "rfb4_1.branch3.3.bias", "rfb4_1.branch3.3.weight", "rfb4_1.conv_cat.bias", "rfb4_1.conv_cat.weight", "rfb4_1.conv_res.bias", "rfb4_1.conv_res.weight", "rfb5_1.branch0.0.bias", "rfb5_1.branch0.0.weight", "rfb5_1.branch1.0.bias", "rfb5_1.branch1.0.weight", "rfb5_1.branch1.1.bias", "rfb5_1.branch1.1.weight", "rfb5_1.branch1.2.bias", "rfb5_1.branch1.2.weight", "rfb5_1.branch1.3.bias", "rfb5_1.branch1.3.weight", "rfb5_1.branch2.0.bias", "rfb5_1.branch2.0.weight", "rfb5_1.branch2.1.bias", "rfb5_1.branch2.1.weight", "rfb5_1.branch2.2.bias", "rfb5_1.branch2.2.weight", "rfb5_1.branch2.3.bias", "rfb5_1.branch2.3.weight", "rfb5_1.branch3.0.bias", "rfb5_1.branch3.0.weight", "rfb5_1.branch3.1.bias", "rfb5_1.branch3.1.weight", "rfb5_1.branch3.2.bias", "rfb5_1.branch3.2.weight", "rfb5_1.branch3.3.bias", "rfb5_1.branch3.3.weight", "rfb5_1.conv_cat.bias", "rfb5_1.conv_cat.weight", "rfb5_1.conv_res.bias", "rfb5_1.conv_res.weight", "agg1.conv_upsample1.bias", "agg1.conv_upsample1.weight", "agg1.conv_upsample2.bias", "agg1.conv_upsample2.weight", "agg1.conv_upsample3.bias", "agg1.conv_upsample3.weight", "agg1.conv_upsample4.bias", "agg1.conv_upsample4.weight", "agg1.conv_upsample5.bias", "agg1.conv_upsample5.weight", "agg1.conv_concat2.bias", "agg1.conv_concat2.weight", "agg1.conv_concat3.bias", "agg1.conv_concat3.weight", "agg1.conv4.bias", "agg1.conv4.weight", "agg1.conv5.bias", "agg1.conv5.weight", 
"rfb3_2.branch0.0.bias", "rfb3_2.branch0.0.weight", "rfb3_2.branch1.0.bias", "rfb3_2.branch1.0.weight", "rfb3_2.branch1.1.bias", "rfb3_2.branch1.1.weight", "rfb3_2.branch1.2.bias", "rfb3_2.branch1.2.weight", "rfb3_2.branch1.3.bias", "rfb3_2.branch1.3.weight", "rfb3_2.branch2.0.bias", "rfb3_2.branch2.0.weight", "rfb3_2.branch2.1.bias", "rfb3_2.branch2.1.weight", "rfb3_2.branch2.2.bias", "rfb3_2.branch2.2.weight", "rfb3_2.branch2.3.bias", "rfb3_2.branch2.3.weight", "rfb3_2.branch3.0.bias", "rfb3_2.branch3.0.weight", "rfb3_2.branch3.1.bias", "rfb3_2.branch3.1.weight", "rfb3_2.branch3.2.bias", "rfb3_2.branch3.2.weight", "rfb3_2.branch3.3.bias", "rfb3_2.branch3.3.weight", "rfb3_2.conv_cat.bias", "rfb3_2.conv_cat.weight", "rfb3_2.conv_res.bias", "rfb3_2.conv_res.weight", "rfb4_2.branch0.0.bias", "rfb4_2.branch0.0.weight", "rfb4_2.branch1.0.bias", "rfb4_2.branch1.0.weight", "rfb4_2.branch1.1.bias", "rfb4_2.branch1.1.weight", "rfb4_2.branch1.2.bias", "rfb4_2.branch1.2.weight", "rfb4_2.branch1.3.bias", "rfb4_2.branch1.3.weight", "rfb4_2.branch2.0.bias", "rfb4_2.branch2.0.weight", "rfb4_2.branch2.1.bias", "rfb4_2.branch2.1.weight", "rfb4_2.branch2.2.bias", "rfb4_2.branch2.2.weight", "rfb4_2.branch2.3.bias", "rfb4_2.branch2.3.weight", "rfb4_2.branch3.0.bias", "rfb4_2.branch3.0.weight", "rfb4_2.branch3.1.bias", "rfb4_2.branch3.1.weight", "rfb4_2.branch3.2.bias", "rfb4_2.branch3.2.weight", "rfb4_2.branch3.3.bias", "rfb4_2.branch3.3.weight", "rfb4_2.conv_cat.bias", "rfb4_2.conv_cat.weight", "rfb4_2.conv_res.bias", "rfb4_2.conv_res.weight", "rfb5_2.branch0.0.bias", "rfb5_2.branch0.0.weight", "rfb5_2.branch1.0.bias", "rfb5_2.branch1.0.weight", "rfb5_2.branch1.1.bias", "rfb5_2.branch1.1.weight", "rfb5_2.branch1.2.bias", "rfb5_2.branch1.2.weight", "rfb5_2.branch1.3.bias", "rfb5_2.branch1.3.weight", "rfb5_2.branch2.0.bias", "rfb5_2.branch2.0.weight", "rfb5_2.branch2.1.bias", "rfb5_2.branch2.1.weight", "rfb5_2.branch2.2.bias", "rfb5_2.branch2.2.weight", "rfb5_2.branch2.3.bias", 
"rfb5_2.branch2.3.weight", "rfb5_2.branch3.0.bias", "rfb5_2.branch3.0.weight", "rfb5_2.branch3.1.bias", "rfb5_2.branch3.1.weight", "rfb5_2.branch3.2.bias", "rfb5_2.branch3.2.weight", "rfb5_2.branch3.3.bias", "rfb5_2.branch3.3.weight", "rfb5_2.conv_cat.bias", "rfb5_2.conv_cat.weight", "rfb5_2.conv_res.bias", "rfb5_2.conv_res.weight", "agg2.conv_upsample1.bias", "agg2.conv_upsample1.weight", "agg2.conv_upsample2.bias", "agg2.conv_upsample2.weight", "agg2.conv_upsample3.bias", "agg2.conv_upsample3.weight", "agg2.conv_upsample4.bias", "agg2.conv_upsample4.weight", "agg2.conv_upsample5.bias", "agg2.conv_upsample5.weight", "agg2.conv_concat2.bias", "agg2.conv_concat2.weight", "agg2.conv_concat3.bias", "agg2.conv_concat3.weight", "agg2.conv4.bias", "agg2.conv4.weight", "agg2.conv5.bias", "agg2.conv5.weight", "HA.gaussian_kernel".
Unexpected key(s) in state_dict: "loss", "optimizer_state_dict", "model_state_dict", "epoch".
Maybe I'm not saving the states as intended ?
The weird thing is that before adding the resume training code I was just saving the model at every epoch only with torch.save(model.state_dict(), save_full_path_torch) : I managed to train the model in 10 epochs and it still works during testing.
Although this question was posted 5 months ago, here is a simple solution in case anyone else comes across a similar issue.
As explained in the PyTorch FAQ, the tensors defining the loss accumulate history across the training loop, because loss is a differentiable variable here.
One simple solution is to convert the loss to a plain Python float (e.g. float(loss) or loss.item()) before storing it.
Secondly, make sure that you use only loss.item(), never the loss tensor itself, when printing the loss or logging it to wandb.

Unable to use MSE of VGG features in loss function

I'm using keras (tf.keras) in tensorflow 2.0.0
I've a network, whose input is an image and output is also an image. I want to use a combination of MSE, MSE in VGG feature space and some other losses, which depend on intermediate layer output. I'm defining a custom loss function. I'm able to build the model, compile with the custom loss. But when I train using fit_generator, I'm getting a SymbolicException saying Inputs to eager execution function cannot be Keras symbolic tensors
Full Code:
Train File:
def __init__(self, gray_images: bool, verbose: bool = True):
    """Initialise the instance and create the VGG16 feature extractor.

    :param gray_images: forwarded unchanged to the parent initialiser.
    :param verbose: forwarded unchanged to the parent initialiser.
    """
    super().__init__(gray_images, verbose)
    # No network is attached yet; ``self.model`` starts out empty.
    self.model = None
    # Indices into the VGG16 layer list whose outputs are used as features.
    vgg_layer_indices = [3, 6, 10]
    self.vgg_feature_extractor = VggFeaturesExtractor(
        model_name='vgg16',
        layers=vgg_layer_indices,
    )
# Builds the image-to-image Keras model, stores it on self.model, attaches an
# auxiliary loss term and compiles the model with the VGG feature loss.
# NOTE(review): `self` and the hyper-parameters used below (num_input_channels,
# num_filters, depth_t, depth_n, use_bnorm, loss_weights) are not parameters of
# this function; presumably they come from an enclosing scope that this
# shortened excerpt omits -- confirm against the full file.
def build_model():
# Input tensor: first two dimensions left as None, last one is num_input_channels.
image_input = Input(shape=(None, None, num_input_channels))
# Two sub-networks are built from the same input.
out1 = self.build_out1_model(image_input, num_filters, depth_t)
out2 = self.build_out2_model(image_input, num_filters, depth_n, use_bnorm)
# Placeholder in the excerpt: the real combination of the tensors is elided.
enhanced_image = ... # Some function of image_input, out1 and out2
self.model = Model(inputs=image_input, outputs=enhanced_image)
# Extra loss term computed from the intermediate output out2, scaled by loss_weights[1].
self.model.add_loss(loss_weights[1] * self.loss2(out2))
# self.vgg_loss is passed as the compile-time loss, alongside the add_loss term above.
self.model.compile(optimizer='adam', loss=self.vgg_loss)
def vgg_loss(self, gt_image, est_image):
    """Return the summed mean-squared error between VGG features of two images.

    Both images are passed through ``self.vgg_feature_extractor``; the mean
    squared difference is computed for each of the first three feature
    outputs and the three values are added together.

    :param gt_image: ground-truth image batch.
    :param est_image: estimated image batch produced by the model.
    :return: the scalar loss tensor.
    """
    extractor = self.vgg_feature_extractor
    target_feats = extractor.extract_features(gt_image)
    predicted_feats = extractor.extract_features(est_image)
    # One MSE term per tapped feature map (indices 0, 1 and 2).
    per_layer_mse = [
        tf.reduce_mean(tf.square(target_feats[idx] - predicted_feats[idx]))
        for idx in range(3)
    ]
    return per_layer_mse[0] + per_layer_mse[1] + per_layer_mse[2]
VggFeatures.py:
class VggFeaturesExtractor:
    """Expose the outputs of selected VGG16/VGG19 layers as a Keras model."""

    def __init__(self, model_name: str, layers: List[int]):
        """Build the feature-extraction model.

        :param model_name: ``'vgg16'`` or ``'vgg19'``.
        :param layers: indices into the VGG model's ``layers`` list whose
            outputs become the extractor's outputs, in the given order.
        :raises RuntimeError: if ``model_name`` is neither of the two
            supported names.
        """
        self.model_name = model_name
        self.layers = layers

        # The imports stay local so that only the requested backbone module
        # is loaded, and nothing is imported for an unknown name.
        if model_name == 'vgg16':
            from tensorflow.keras.applications.vgg16 import VGG16 as backbone_cls, preprocess_input
        elif model_name == 'vgg19':
            from tensorflow.keras.applications.vgg19 import VGG19 as backbone_cls, preprocess_input
        else:
            raise RuntimeError(f'Unknown Model: {model_name}')

        backbone = backbone_cls(include_top=False)
        self.preprocess_input = preprocess_input

        tapped_outputs = [backbone.layers[idx].output for idx in layers]
        self.feature_extractor = keras.Model(inputs=backbone.input, outputs=tapped_outputs)

    def extract_features(self, images: numpy.ndarray):
        """Preprocess ``images`` for the chosen VGG variant and return the tapped features."""
        return self.feature_extractor(self.preprocess_input(images))
Stack trace:
Epoch 1/1000
Traceback (most recent call last):
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/eager/execute.py", line 61, in quick_execute
num_outputs)
TypeError: An op outside of the function building code is being passed
a "Graph" tensor. It is possible to have Graph tensors
leak out of the function building context by including a
tf.init_scope in your function building code.
For example, the following function will fail:
@tf.function
def has_init_scope():
my_constant = tf.constant(1.)
with tf.init_scope():
added = my_constant * 2
The graph tensor has name: StridedSliceGrad:0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/media/nagabhushan/Data02/SNB/IISc/Research/.../Workspace/Ideas/01_Supervised/src/N09.py", line 363, in <module>
main()
File "/media/nagabhushan/Data02/SNB/IISc/Research/.../Workspace/Ideas/01_Supervised/src/N09.py", line 343, in main
args.save_interval)
File "/media/nagabhushan/Data02/SNB/IISc/Research/.../Workspace/Ideas/01_Supervised/src/N09.py", line 92, in train_model
verbose=self.verbose)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 1297, in fit_generator
steps_name='steps_per_epoch')
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_generator.py", line 265, in model_iteration
batch_outs = batch_function(*batch_data)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 973, in train_on_batch
class_weight=class_weight, reset_metrics=reset_metrics)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py", line 264, in train_on_batch
output_loss_metrics=model._output_loss_metrics)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_eager.py", line 311, in train_on_batch
output_loss_metrics=output_loss_metrics))
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_eager.py", line 268, in _process_single_batch
grads = tape.gradient(scaled_total_loss, trainable_weights)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/eager/backprop.py", line 1014, in gradient
unconnected_gradients=unconnected_gradients)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/eager/imperative_grad.py", line 76, in imperative_grad
compat.as_str(unconnected_gradients.value))
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 911, in _backward_function_wrapper
processed_args, remapped_captures)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 1224, in _call_flat
ctx, args, cancellation_manager=cancellation_manager)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 511, in call
ctx=ctx)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/eager/execute.py", line 75, in quick_execute
"tensors, but found {}".format(keras_symbolic_tensors))
tensorflow.python.eager.core._SymbolicException: Inputs to eager execution function cannot be Keras symbolic tensors, but found [<tf.Tensor 'StridedSliceGrad:0' shape=(16, 64, 64, 3) dtype=float32>]
Process finished with exit code 1
Note:
1. If I replace self.model.compile(optimizer='adam', loss=self.vgg_loss) with self.model.compile(optimizer='adam', loss='mse'), code works fine, which implies the other part of code is working correctly.
2. Almost every question I found on SO regarding VGG loss advises to append VGG network to the main network, set trainable=False for VGG network and then train with MSE loss. But I can't do that, since I have many components in my loss function.
I was able to fix this issue by disabling eager execution. In tensorflow 2.0, eager execution is enabled by default.
tf.compat.v1.disable_eager_execution()
I didn't understand how this was able to fix the issue though. If anybody stumbles on a similar problem, you can try disabling eager execution.

Categories