The original batch_size was 16, but I wanted to use accumulation = 2 so that I get an effect similar to batch_size = 32.
The original training took an hour, so I expected a two-hour training time with gradient accumulation.
But training ends at 50%, still lasting an hour, even with gradient accumulation.
I don't know why it's stopping. Below is my training code:
def train_runner(model, train_dataset, valid_dataset, batch_size, num_train_epochs, learning_rate):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.train()
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size)
    valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=batch_size)
    lowest_total_valid_loss = 9999.
    step = 0
    global_total_step = len(train_dataloader) * num_train_epochs
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0)
    print("TRAIN START")
    with tqdm(total=global_total_step, unit='step') as t:
        total = 0
        total_loss = 0
        for epoch in range(num_train_epochs):
            for iteration, batch in enumerate(train_dataloader):
                # optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                start_positions = batch['start_positions'].to(device)
                end_positions = batch['end_positions'].to(device)
                outputs = model(input_ids,
                                attention_mask=attention_mask,
                                start_positions=start_positions,
                                end_positions=end_positions)
                loss = outputs.loss
                (loss / ACCUMULATION).backward()
                step += 1
                if step % ACCUMULATION:
                    continue
                clip_grad_norm_(model.parameters(), max_norm=1.)
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)
                batch_loss = loss.item() * len(input_ids)
                total += len(input_ids)
                total_loss += batch_loss / ACCUMULATION
                global_total_step += 1
                t.set_postfix(loss="{:.6f}".format(total_loss / total), batch_loss="{:.6f}".format(batch_loss))
                t.update(1)
                del input_ids
                del attention_mask
                del start_positions
                del end_positions
                del outputs
                del loss
                ## validation ##
                if iteration != 0 and iteration % int(len(train_dataloader) / 10) == 0:
                    total_valid_loss = 0
                    for batch_val in valid_dataloader:
                        model.eval()
                        optimizer.zero_grad()
                        input_ids = batch_val['input_ids'].to(device)
                        attention_mask = batch_val['attention_mask'].to(device)
                        start_positions = batch_val['start_positions'].to(device)
                        end_positions = batch_val['end_positions'].to(device)
                        with torch.no_grad():
                            outputs = model(input_ids,
                                            attention_mask=attention_mask,
                                            start_positions=start_positions,
                                            end_positions=end_positions)
                            loss = outputs.loss
                            total_valid_loss += loss.item()
                    if total_valid_loss < lowest_total_valid_loss:
                        print(f"lowest_total_valid_loss: {total_valid_loss} epoch : {epoch} iteration : {iteration}")
                        torch.save(model.state_dict(), './output_model_best')
                        lowest_total_valid_loss = total_valid_loss
                ## validation ##
    # model.save_pretrained("./klue_output_model")
    print("TRAIN END")
for iteration, batch in enumerate(train_dataloader):
    if step % ACCUMULATION:
        t.update(1)  # add one update here as well
        continue
    ...
    t.update(1)
Half of the time you do not update the tqdm counter (equivalently, you set its total twice as high as the number of updates you actually perform during initialization), so the bar can never go higher than 50%.
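For clarity, here is a minimal sketch of the accumulation loop with the counter updated on every micro-batch (same names as in the question; ACCUMULATION is assumed to be 2):

for iteration, batch in enumerate(train_dataloader):
    outputs = model(batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device),
                    start_positions=batch['start_positions'].to(device),
                    end_positions=batch['end_positions'].to(device))
    (outputs.loss / ACCUMULATION).backward()
    step += 1
    t.update(1)              # count every micro-batch, so the bar matches its total
    if step % ACCUMULATION:  # only step the optimizer every ACCUMULATION batches
        continue
    clip_grad_norm_(model.parameters(), max_norm=1.)
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)

Alternatively, keep t.update(1) where it was and initialize tqdm with total=global_total_step // ACCUMULATION instead.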
My data is divided into two parts, training and validation. I used the load_dataset and DataLoader functions, and I converted the dataset to torch format using traindataset.set_format. When starting training I get this error:

new(): invalid data type 'numpy.str_'

at this line:

for step, batch in enumerate(train_dataloader):

So how can I fix this error?
model = MixModel()
# model.load_state_dict(torch.load(r"/media/sh/saved_weightscnnbert.pt"))
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

traindataset = load_dataset('csv', data_files='/content/drive//My Drive/Colab Notebooks/newdataset/newdata_train2', split='train')
testdataset = load_dataset('csv', data_files='/content/drive//My Drive/Colab Notebooks/newdataset/newdata_valid2', split='train')

traindataset = traindataset.map(encode)
testdataset1 = testdataset.map(encode)

traindataset = traindataset.map(lambda examples: {'labels': examples['symptoms']}, batched=True)
testdataset = testdataset1.map(lambda examples: {'labels': examples['symptoms']}, batched=True)

traindataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
testdataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

train_dataloader = torch.utils.data.DataLoader(traindataset, batch_size=64)
test_dataloader = torch.utils.data.DataLoader(testdataset, batch_size=64)
# function to train the model
def train():
    model.train()
    total_loss, total_accuracy = 0, 0
    # empty lists to save model predictions and labels
    total_preds = []
    Labels = []
    # iterate over batches
    for step, batch in enumerate(train_dataloader):
        # progress update after every 100 batches
        if step % 100 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        sent_id, mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        # clear previously calculated gradients
        model.zero_grad()
        # get model predictions for the current batch
        preds = model(sent_id, mask, labels)
        # compute the loss between actual and predicted values
        alpha = 0.25
        gamma = 2
        ce_loss = loss_fn(preds, labels)
        # pt = torch.exp(-ce_loss)
        # focal_loss = (alpha * (1 - pt)**gamma * ce_loss).mean()  # mean over the batch
        # add on to the total loss
        total_loss = total_loss + ce_loss.item()
        # backward pass to calculate the gradients
        ce_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # update parameters
        optimizer.step()
        preds = torch.argmax(preds, dim=1)
        total_preds.append(preds)
        total_accuracy += (preds == labels).float().sum()
    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)
    avg_accuracy = total_accuracy / len(traindataset)
    # predictions are in the form (no. of batches, batch size, no. of classes);
    # reshape them to (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)
    # return the loss, predictions and accuracy
    return avg_loss, total_preds, avg_accuracy
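For what it's worth, a common cause of this exact error is that set_format(type='torch') tries to tensorize every listed column and fails on string columns, and here 'labels' is copied straight from the 'symptoms' column. If 'symptoms' contains string class names, one possible fix is to map them to integer ids before set_format. A minimal sketch (the label names below are made up):

# hypothetical label vocabulary; replace with the actual classes in 'symptoms'
label2id = {'cough': 0, 'fever': 1, 'headache': 2}

traindataset = traindataset.map(
    lambda examples: {'labels': [label2id[s] for s in examples['symptoms']]},
    batched=True)
traindataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])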
I have a custom-written model using PyTorch. However, when I try to use .to(device) to train on the GPU, I come across this error. The model does run on the CPU, but it is slow. Can anyone give me a little insight into how to solve this?
I moved the inputs to the GPU as well and still receive the error. The second picture shows the initial part of the error; the message I put in the title is on the last line of the error output.
(Screenshot: Model Train Loop)
(Screenshot: Error Initial Part)
def build_network(dim_val, dim_attn, input_size, dec_seq_len, output_sequence_length, n_decoder_layers, n_encoder_layers, n_heads):
    # input_size = 1
    t = Transformer(dim_val, dim_attn, input_size, dec_seq_len, output_sequence_length, n_decoder_layers, n_encoder_layers, n_heads)
    return t

def build_optimizer(model, learning_rate):
    optimizer = torch.optim.Adam(model.parameters(), learning_rate)
    return optimizer

# initialize network
t = build_network(2, 2, 1, 2, 2, 2, 2, 2).to(device)

# initialize optimizer
optimizer = build_optimizer(t, 0.001)

# MAPE formula
def mape(actual, pred):
    actual, pred = np.array(actual), np.array(pred)
    return round(np.mean(np.abs((actual - pred) / actual)) * 100, 2)
for e in range(2):
    start_time = time.time()
    # batch train loss
    batch_losses = []
    batch_mape = []
    # batch test loss
    val_batch_loss = []
    val_batch_mape = []

    ############################### Train Batch Loop ###############################
    for test_x, test_y in train_loader:
        X = test_x.to(device)
        Y = test_y.to(device)
        # zero grad at the start of the loop
        optimizer.zero_grad()
        # forward pass and loss calculation
        net_out = t(X)
        # loss: MSE
        loss = torch.mean((net_out - Y) ** 2)
        # train MAPE
        train_mape = mape(np.abs(Y.detach().numpy()),
                          net_out.detach().numpy())
        # backward pass
        loss.backward()
        optimizer.step()
        # append the batch loss
        batch_losses.append(loss.item())
        # append the batch MAPE
        batch_mape.append(train_mape)

    ############################### Validation Batch Loop ###############################
    for test_x, test_y in val_loader:
        X_Test = test_x.to(device)
        Y_Test = test_y.to(device)
        # test_values = torch.from_numpy(Transformer_Xtest)
        val = t(X_Test)
        # val loss
        validation_loss = torch.mean((val - Y_Test) ** 2)
        # append the batch loss
        val_batch_loss.append(validation_loss.item())
        # val MAPE
        validation_mape = mape(np.abs(Y_Test.detach().numpy()),
                               val.detach().numpy())
        # append the batch MAPE
        val_batch_mape.append(validation_mape)

    ######## Train epoch summary
    epoch_train_loss = round(np.mean(batch_losses), 5)
    epoch_train_mape = round(np.mean(batch_mape), 5)
    ######## Val epoch summary
    epoch_val_loss = round(np.mean(val_batch_loss), 5)
    epoch_val_mape = round(np.mean(val_batch_mape), 5)

    print("Epoch:", e + 1,
          "epoch_train_loss:", epoch_train_loss,
          "epoch_train_mape:", epoch_train_mape,
          "epoch_val_loss:", epoch_val_loss,
          "epoch_val_mape:", epoch_val_mape,
          "--- Elapsed Time Seconds ---", round((time.time() - start_time), 3))
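The error text is only in the screenshots, but one visible problem in this loop is that mape() calls .numpy() on CUDA tensors, which raises TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu(). Assuming that is the error shown, a minimal fix is to copy the tensors back to the CPU first:

# move tensors back to the CPU before handing them to the numpy-based metric
train_mape = mape(np.abs(Y.detach().cpu().numpy()),
                  net_out.detach().cpu().numpy())

and likewise for validation_mape in the validation loop.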
I have this very simple resnet18 network that I am trying to train from scratch for the task of landmark estimation (I have 4 landmarks):
num_classes = 4 * 2  # 4 coordinates (x and y) flattened --> 4 2D keypoints or landmarks

class Network(nn.Module):
    def __init__(self, num_classes=8):
        super().__init__()
        self.model_name = 'resnet18'
        self.model = models.resnet18()
        self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)

    def forward(self, x):
        x = x.float()
        out = self.model(x)
        return out
For the following piece of code:
network = Network()
network.cuda()

criterion = nn.MSELoss()
optimizer = optim.Adam(network.parameters(), lr=0.0001)

loss_min = np.inf
num_epochs = 1

start_time = time.time()
for epoch in range(1, num_epochs + 1):
    loss_train = 0
    loss_test = 0
    running_loss = 0

    network.train()
    print('size of train loader is: ', len(train_loader))

    for step in range(1, len(train_loader) + 1):
        batch = next(iter(train_loader))
        images, landmarks = batch['image'], batch['landmarks']
        # RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[64, 600, 800, 3] to have 3 channels, but got 600 channels instead
        # using permute to fix the above error
        images = images.permute(0, 3, 1, 2)
        images = images.cuda()
        landmarks = landmarks.view(landmarks.size(0), -1).cuda()
        ## images = torchvision.transforms.Normalize(images)  # find the args later
        ## landmarks = torchvision.transforms.Normalize(landmarks)  # find the args later

        predictions = network(images)

        # clear all the gradients before calculating them
        optimizer.zero_grad()

        print('predictions are: ', predictions.float())
        print('landmarks are: ', landmarks.float())

        # find the loss for the current step
        loss_train_step = criterion(predictions.float(), landmarks.float())
        loss_train_step = loss_train_step.to(torch.float32)
        print("loss_train_step before backward: ", loss_train_step)

        # calculate the gradients
        loss_train_step.backward()

        # update the parameters
        optimizer.step()
        print("loss_train_step after backward: ", loss_train_step)

        loss_train += loss_train_step.item()
        print("loss_train: ", loss_train)
        running_loss = loss_train / step
        print('step: ', step)
        print('running loss: ', running_loss)

        print_overwrite(step, len(train_loader), running_loss, 'train')

    network.eval()
    with torch.no_grad():
        for step in range(1, len(test_loader) + 1):
            batch = next(iter(train_loader))
            images, landmarks = batch['image'], batch['landmarks']
            images = images.permute(0, 3, 1, 2)
            images = images.cuda()
            landmarks = landmarks.view(landmarks.size(0), -1).cuda()

            predictions = network(images)

            # find the loss for the current step
            loss_test_step = criterion(predictions, landmarks)

            loss_test += loss_test_step.item()
            running_loss = loss_test / step

            print_overwrite(step, len(test_loader), running_loss, 'Validation')

    loss_train /= len(train_loader)
    loss_test /= len(test_loader)

    print('\n--------------------------------------------------')
    print('Epoch: {}  Train Loss: {:.4f}  Valid Loss: {:.4f}'.format(epoch, loss_train, loss_test))
    print('--------------------------------------------------')

    if loss_test < loss_min:
        loss_min = loss_test
        torch.save(network.state_dict(), '../moth_landmarks.pth')
        print("\nMinimum Valid Loss of {:.4f} at epoch {}/{}".format(loss_min, epoch, num_epochs))
        print('Model Saved\n')

print('Training Complete')
print("Total Elapsed Time : {} s".format(time.time() - start_time))
I get NaN as well as very large MSE losses (showing only one step):
size of train loader is: 90
predictions are: tensor([[-0.0380, -0.1871, 0.0729, -0.3570, -0.2153, 0.3066, 1.1273, -0.0558],
[-0.0316, -0.1876, 0.0317, -0.3613, -0.2333, 0.3023, 1.0940, -0.0665],
[-0.0700, -0.1882, 0.0068, -0.3201, -0.1884, 0.2953, 1.0516, -0.0567],
[-0.0844, -0.2009, 0.0573, -0.3166, -0.2597, 0.3127, 1.0343, -0.0573],
[-0.0486, -0.2333, 0.0535, -0.3245, -0.2310, 0.2818, 1.0590, -0.0716],
[-0.0240, -0.1989, 0.0572, -0.3135, -0.2435, 0.2912, 1.0612, -0.0560],
[-0.0942, -0.2439, 0.0277, -0.3147, -0.2368, 0.2978, 1.0110, -0.0874],
[-0.0356, -0.2285, 0.0064, -0.3179, -0.2432, 0.3083, 1.0300, -0.0756]],
device='cuda:0', grad_fn=<AddmmBackward>)
landmarks are: tensor([[501.9200, 240.1600, 691.0000, 358.0000, 295.0000, 294.0000, 488.6482,
279.6466],
[495.6300, 246.0600, 692.0000, 235.0000, 286.0000, 242.0000, 464.0000,
339.0000],
[488.7100, 240.8900, 613.4007, 218.3425, 281.0000, 220.0000, 415.9966,
338.4796],
[502.5721, 245.4983, 640.0000, 131.0000, 360.0000, 143.0000, 542.9840,
321.8463],
[505.1393, 246.4364, 700.0000, 306.0000, 303.0000, 294.0000, 569.6925,
351.8367],
[501.0900, 244.0100, 724.0000, 251.0000, 302.0000, 276.0000, 504.6415,
291.7443],
[495.9500, 244.2800, 608.0000, 127.0000, 323.0000, 166.0000, 491.0000,
333.0000],
[490.2500, 241.3400, 699.0000, 304.0000, 398.6197, 313.8339, 429.1374,
303.8483]], device='cuda:0')
loss_train_step before backward: tensor(166475.6875, device='cuda:0', grad_fn=<MseLossBackward>)
loss_train_step after backward: tensor(166475.6875, device='cuda:0', grad_fn=<MseLossBackward>)
loss_train: 166475.6875
step: 1
running loss: 166475.6875
The other thing I am suspicious of, besides the Network, is the transforms. Here are the transforms I am using:
transformed_dataset = MothLandmarksDataset(csv_file='moth_gt.csv',
                                           root_dir='.',
                                           transform=transforms.Compose([
                                               Rescale(256),
                                               RandomCrop(224),
                                               ToTensor()#,
                                               ## transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                               ##                      std=[0.229, 0.224, 0.225])
                                           ]))
How can I fix it? Even with Normalize I still get the same problem (NaN and very large loss values, since the predictions are far off).
How can I debug why the predictions are so far off?
I think there is a problem with the way you are using DataLoader. iter(train_loader) creates an iterator out of the data loader, and calling next on it gives you the next example from the dataset. But you are calling next(iter(train_loader)) in each iteration, which creates a new iterator from train_loader every time and returns its next example, which is essentially the first example in your dataset. So I think this way you end up training (and validating) on the same example in each iteration. Even if your data loader shuffles the dataset each time, you will end up training on some random samples from the dataset (each time only the first random batch is used) and not on the complete dataset. Try changing your code so that you call iter(train_loader) only once and call next in each iteration:
train_iter = iter(train_loader)
for step in range(1, len(train_loader) + 1):
    batch = next(train_iter)
    # now batch contains the (step)th batch (assuming 1-based indexing)
Or, even better, change your for loop to iterate over train_loader itself:
for step, batch in enumerate(train_loader):
    # now batch contains the (step)th batch (assuming 0-based indexing)
A similar change is also required for validation/evaluation. Let me know if this resolves your problem or helps in any way.
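A small self-contained demonstration of the difference (toy data, no real model needed):

import torch
from torch.utils.data import DataLoader, TensorDataset

ds = TensorDataset(torch.arange(8))
loader = DataLoader(ds, batch_size=2, shuffle=False)

print(next(iter(loader)))  # [tensor([0, 1])]
print(next(iter(loader)))  # [tensor([0, 1])] -- a fresh iterator, same first batch again

it = iter(loader)
print(next(it))            # [tensor([0, 1])]
print(next(it))            # [tensor([2, 3])] -- the iterator advances as intended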
I am very new to PyTorch and am implementing my own image-classifier network. For each epoch the training accuracy is very good, but the validation accuracy is 0 (I checked up to the 5th epoch). I am using the Adam optimizer with learning rate .001, and I resample the whole dataset into training and validation sets after each epoch. Please help me find where I am going wrong.
Here is my code:
### where is data?
data_dir_train = '/home/sup/PycharmProjects/deep_learning/CNN_Data/training_set'
data_dir_test = '/home/sup/PycharmProjects/deep_learning/CNN_Data/test_set'

# define your batch_size
batch_size = 64

allData = datasets.ImageFolder(root=data_dir_train, transform=transformArr)

# We need to further split our training dataset into training and validation sets.
def split_train_validation():
    # define the indices
    num_train = len(allData)
    indices = list(range(num_train))  # start with all the indices in the training set
    split = int(np.floor(0.2 * num_train))  # define the split size
    # train_idx, valid_idx = indices[split:], indices[:split]

    # random, non-contiguous split
    validation_idx = np.random.choice(indices, size=split, replace=False)
    train_idx = list(set(indices) - set(validation_idx))

    # define our samplers -- we use a SubsetRandomSampler because it will return
    # a random subset of the split defined by the given indices without replacement
    train_sampler = SubsetRandomSampler(train_idx)
    validation_sampler = SubsetRandomSampler(validation_idx)

    # train_loader = DataLoader(allData, batch_size=batch_size, sampler=train_sampler, shuffle=False, num_workers=4)
    # validation_loader = DataLoader(dataset=allData, batch_size=1, sampler=validation_sampler)

    return (train_sampler, validation_sampler)
Training
from torch.optim import Adam
import torch
import createNN
import torch.nn as nn
import loadData as ld
from torch.autograd import Variable
from torch.utils.data import DataLoader

# check if CUDA (GPU support) is available
cuda = torch.cuda.is_available()

# create model, optimizer and loss function
model = createNN.ConvNet(class_num=2)
optimizer = Adam(model.parameters(), lr=.001, weight_decay=.0001)
loss_func = nn.CrossEntropyLoss()
if cuda:
    model.cuda()

# function to save the model
def save_model(epoch):
    torch.save(model.load_state_dict(), 'imageClassifier_{}.model'.format(epoch))
    print('saved model at epoch', epoch)

def exp_lr_scheduler(epoch, init_lr=args.lr, weight_decay=args.weight_decay, lr_decay_epoch=cf.lr_decay_epoch):
    lr = init_lr * (0.5 ** (epoch // lr_decay_epoch))

def train(num_epochs):
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('\n\nEpoch {}'.format(epoch))
        train_sampler, validation_sampler = ld.split_train_validation()
        train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
        validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)
        model.train()
        acc = 0.0
        loss = 0.0
        total = 0
        # train the model with the training data
        for i, (images, labels) in enumerate(train_loader):
            # if cuda then move to GPU
            if cuda:
                images = images.cuda()
                labels = labels.cuda()
            # the Variable class wraps a tensor so we can compute gradients
            images = Variable(images)
            labels = Variable(labels)
            # reset accumulated gradients for each batch
            optimizer.zero_grad()
            # pass images to the model, which returns a prediction
            output = model(images)
            # calculate the loss based on prediction and ground truth
            loss = loss_func(output, labels)
            # backpropagate the loss and compute gradients
            loss.backward()
            # update weights as per the computed gradients
            optimizer.step()
            # predicted class
            predVal, predClass = torch.max(output.data, 1)
            acc += torch.sum(predClass == labels.data)
            loss += loss.cpu().data[0]
            total += labels.size(0)
        # print the statistics
        train_acc = acc / total
        train_loss = loss / total
        print('Mean train acc = {} over epoch = {}'.format(epoch, acc))
        print('Mean train loss = {} over epoch = {}'.format(epoch, loss))

        # validate the model with the validation data
        model.eval()
        acc = 0.0
        loss = 0.0
        total = 0
        for i, (images, labels) in enumerate(validation_loader):
            # if cuda then move to GPU
            if cuda:
                images = images.cuda()
                labels = labels.cuda()
            # the Variable class wraps a tensor so we can compute gradients
            images = Variable(images)
            labels = Variable(labels)
            # reset accumulated gradients for each batch
            optimizer.zero_grad()
            # pass images to the model, which returns a prediction
            output = model(images)
            # calculate the loss based on prediction and ground truth
            loss = loss_func(output, labels)
            # backpropagate the loss and compute gradients
            loss.backward()
            # update weights as per the computed gradients
            optimizer.step()
            # predicted class
            predVal, predClass = torch.max(output.data, 1)
            acc += torch.sum(predClass == labels.data)
            loss += loss.cpu().data[0]
            total += labels.size(0)
        # print the statistics
        valid_acc = acc / total
        valid_loss = loss / total
        print('Mean train acc = {} over epoch = {}'.format(epoch, valid_acc))
        print('Mean train loss = {} over epoch = {}'.format(epoch, valid_loss))
        if best_acc < valid_acc:
            best_acc = valid_acc
            save_model(epoch)
        # at the 30th epoch we save the model
        if epoch == 30:
            save_model(epoch)

train(20)
I think you did not take into account that acc += torch.sum(predClass == labels.data) returns a tensor instead of a float value. Depending on the version of PyTorch you are using, you should change it to:
acc += torch.sum(predClass == labels.data).cpu().data[0]  # pytorch 0.3
acc += torch.sum(predClass == labels.data).item()         # pytorch 0.4
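As a quick illustration (the numbers are hypothetical), the comparison sum is a 0-dim tensor, and .item() unwraps it into a plain Python number before it is accumulated:

correct = torch.sum(predClass == labels.data)  # 0-dim tensor, e.g. tensor(27)
acc += correct.item()                          # plain Python int, e.g. 27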
Although your code seems to work with the old PyTorch version, I would recommend you upgrade to 0.4.
I also noticed some other problems/typos in your code.
You are loading the dataset for every epoch:
for epoch in range(num_epochs):
    print('\n\nEpoch {}'.format(epoch))
    train_sampler, validation_sampler = ld.split_train_validation()
    train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
    validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)
    ...
That should not happen; loading it once is enough:
train_sampler, validation_sampler = ld.split_train_validation()
train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)

for epoch in range(num_epochs):
    print('\n\nEpoch {}'.format(epoch))
    ...
In the training part you have (this does not happen in the validation):
train_acc = acc / total
train_loss = loss / total
print('Mean train acc = {} over epoch = {}'.format(epoch, acc))
print('Mean train loss = {} over epoch = {}'.format(epoch, loss))
where you are printing acc instead of train_acc.
Also, in the validation part you print 'Mean train acc = ...' when it should be something like 'Mean val acc = ...'.
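Putting those print fixes together, the corrected lines would look something like this (note that the original format arguments also pass epoch where the accuracy should go):

print('Mean train acc = {} over epoch = {}'.format(train_acc, epoch))
print('Mean train loss = {} over epoch = {}'.format(train_loss, epoch))
print('Mean val acc = {} over epoch = {}'.format(valid_acc, epoch))
print('Mean val loss = {} over epoch = {}'.format(valid_loss, epoch))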
After changing these lines of code, and using a standard model I created with the CIFAR dataset, training seems to converge: accuracy increases every epoch while the mean loss value decreases.
I hope I could help you!