I'm trying to make a CNN and am using a training module to train it. I'd like to specify the number of iterations (epochs) it makes, but I'm finding that it runs continuously.
Is anyone able to help me with this?
def train(model, epochs=10):
    optimiser = torch.optim.SGD(model.parameters(), lr=0.001)
    writer = SummaryWriter()
    batch_idx = 0
    loss_total = 0
    epoch = 0
    for epoch in range(epochs):
        print('range:', range(epochs))
        for batch in train_loader:
            features, labels = batch
            prediction = model(features)
            # cf = confusion_matrix(labels, prediction)
            loss = F.cross_entropy(prediction, labels)  # Loss model changes label size
            loss_total += loss.item()
            loss.backward()
            print('loss:', loss.item())
            optimiser.step()
            optimiser.zero_grad()
            writer.add_scalar('Loss', loss.item(), batch_idx)
            batch_idx += 1
            print('epoch', epoch)
            epoch += 1  # why does this not stop???
    print('Total loss:', loss_total/batch_idx)
If it helps you can also find this on my GitHub: https://github.com/amosmike/facebook-market-search/blob/master/CNN.py
Thank you for any help you can provide
You shouldn't be incrementing your epoch variable at all; epoch is already being supplied by range(epochs). Secondly, that increment sits inside the batch loop, and you probably don't want to bail out of an epoch after N batches.
The problem is that you are incrementing epoch inside the batch loop:
epoch += 1 # why does this not stop???
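A minimal sketch of the fixed loop (assuming train_loader, F, and SummaryWriter are defined exactly as in the question) is below; the only changes are removing the manual increment and moving the per-epoch print out of the batch loop:
def train(model, epochs=10):
    optimiser = torch.optim.SGD(model.parameters(), lr=0.001)
    writer = SummaryWriter()
    batch_idx = 0
    loss_total = 0
    for epoch in range(epochs):          # range(epochs) supplies epoch; no manual counter needed
        for batch in train_loader:
            features, labels = batch
            prediction = model(features)
            loss = F.cross_entropy(prediction, labels)
            loss_total += loss.item()
            loss.backward()
            optimiser.step()
            optimiser.zero_grad()
            writer.add_scalar('Loss', loss.item(), batch_idx)
            batch_idx += 1
        print('epoch', epoch)            # printed once per epoch, not once per batch
    print('Total loss:', loss_total / batch_idx)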
I'm currently building an LSTM model for predicting stock prices in PyTorch. I now want to implement a walk-forward validation method, but I couldn't find any resources on how to do that.
This is my current training loop:
#%%
lstm1 = LSTM1(num_classes, input_size, hidden_dim, num_layers, X_train_tensors_final.shape[1])
criterion = torch.nn.L1Loss()
optimizer = torch.optim.Adam(lstm1.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    outputs = lstm1.forward(X_train_tensors_final)
    optimizer.zero_grad()  # clear gradients
    loss = criterion(outputs, y_train_tensors)
    loss.backward()        # compute gradients of the loss w.r.t. the parameters
    optimizer.step()       # update the parameters
    if epoch % 100 == 0:
        print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))

df_X_ss = ss.transform(df.iloc[:, 0:-1])
df_y_mm = ss.transform(df.iloc[:, 0:1])
df_X_ss = Variable(torch.Tensor(df_X_ss))
df_y_mm = Variable(torch.Tensor(df_y_mm))
df_X_ss = torch.reshape(df_X_ss, (df_X_ss.shape[0], 1, df_X_ss.shape[1]))
train_predict = lstm1(df_X_ss)
data_predict = train_predict.data.numpy()
The model should now predict one step into the future, then calculate the absolute percentage error. For the next step, the model should use the actual y value instead of the predicted yhat to make its next prediction. What would be the best way of implementing this? Or is there some built-in function in PyTorch that would do this?
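As far as I know there is no built-in walk-forward helper in PyTorch, so one option is to loop over the held-out period one step at a time, always feeding the model the actual history rather than its own predictions. A minimal sketch, assuming X_test and y_test are time-ordered tensors shaped like the training data (these names are illustrative, not from the code above):
lstm1.eval()
abs_pct_errors = []
with torch.no_grad():
    for t in range(len(X_test)):
        x_t = X_test[t:t+1]          # one step of actual (not predicted) history
        y_hat = lstm1(x_t)           # one-step-ahead prediction
        y_true = y_test[t:t+1]
        ape = torch.abs((y_true - y_hat) / y_true)
        abs_pct_errors.append(ape.item())

print("MAPE: %1.5f" % (sum(abs_pct_errors) / len(abs_pct_errors)))
If the model is supposed to be refit as the window moves forward, you would retrain (or fine-tune) on the data up to step t before each prediction, which is just this loop with a short training pass inserted at the top.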
I am working on very imbalanced data (15% labeled as 1 and the rest as 0) using BERT.
The code I wrote takes the argmax of the outputs, which gives me predictions of 0 for everything.
How do I include a threshold in my code to improve my predictions of the 1 class?
nsteps = 215
nepoch = 3
best_val_acc = 0
for epoch in range(nepoch):
    model.train()
    print(f"epoch n°{epoch+1}:")
    av_epoch_loss = 0
    progress_bar = tqdm(range(nsteps))
    for batch in trainloader:
        batch = {k: v.cuda() for k, v in batch.items()}
        outputs = model(**batch)
        loss = criterion(outputs, *batch)
        av_epoch_loss += loss
        loss.backward()
        optim.step()
        optim.zero_grad()
        predictions = torch.argmax(outputs.logits, dim=-1)
        f1.add_batch(predictions=predictions, references=batch["labels"])
        acc.add_batch(predictions=predictions, references=batch["labels"])
        progress_bar.update(1)
    av_epoch_loss /= nsteps
    print(f"Training Loss: {av_epoch_loss: .2f}")
    acc_res = acc.compute()["accuracy"]
    print(f"Training Accuracy: {acc_res:.2f}")
    f_res = f1.compute()["f1"]
    print(f"Training F1-score: {f_res:.2f}")
    model.eval()
    val_acc = validate(model)
    if val_acc > best_val_acc:
        print("Achieved best validation accuracy so far. Saving model.")
        best_val_acc = val_acc
        best_model_state = deepcopy(model.state_dict())
    print("\n\n")
I looked in the PyTorch documentation but I couldn't figure it out.
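One common approach is to convert the logits to probabilities and compare the probability of class 1 against a chosen threshold instead of taking the argmax. A minimal sketch, assuming a two-logit binary head; the 0.15 value is only illustrative and should be tuned on validation data:
probs = torch.softmax(outputs.logits, dim=-1)     # [batch_size, 2] class probabilities
threshold = 0.15                                  # tune this on a validation set
predictions = (probs[:, 1] > threshold).long()    # predict 1 whenever P(class 1) > threshold
f1.add_batch(predictions=predictions, references=batch["labels"])
Lowering the threshold below 0.5 trades precision for recall on the minority class, so sweeping it against the validation F1-score is a reasonable way to pick it.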
I am training my conditional GAN using code from a TensorFlow tutorial that uses a training loop written from scratch:
def fit(train_ds, epochs, test_ds):
    for epoch in range(epochs):
        start = time.time()
        display.clear_output(wait=True)
        for example_input, example_target in test_ds.take(1):
            generate_images(generator, example_input, example_target)
        print("Epoch: ", epoch)
        # Train
        for n, (input_image, target) in train_ds.enumerate():
            print('.', end='')
            if (n + 1) % 100 == 0:
                print()
            train_step(input_image, target, epoch)
        print()
        # saving (checkpoint) the model every 20 epochs
        if (epoch + 1) % 20 == 0:
            checkpoint.save(file_prefix=checkpoint_prefix)
        print('Time taken for epoch {} is {} sec\n'.format(epoch + 1,
                                                           time.time() - start))
    checkpoint.save(file_prefix=checkpoint_prefix)
and the train step is defined like this:
@tf.function
def train_step(input_image, target, epoch):
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        gen_output = generator(input_image, training=True)
        disc_real_output = discriminator([input_image, target], training=True)
        disc_generated_output = discriminator([input_image, gen_output], training=True)
        gen_total_loss, gen_gan_loss, gen_l1_loss = generator_loss(disc_generated_output, gen_output, target)
        disc_loss = discriminator_loss(disc_real_output, disc_generated_output)
    generator_gradients = gen_tape.gradient(gen_total_loss,
                                            generator.trainable_variables)
    discriminator_gradients = disc_tape.gradient(disc_loss,
                                                 discriminator.trainable_variables)
    generator_optimizer.apply_gradients(zip(generator_gradients,
                                            generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(discriminator_gradients,
                                                discriminator.trainable_variables))
    with summary_writer.as_default():
        tf.summary.scalar('gen_total_loss', gen_total_loss, step=epoch)
        tf.summary.scalar('gen_gan_loss', gen_gan_loss, step=epoch)
        tf.summary.scalar('gen_l1_loss', gen_l1_loss, step=epoch)
        tf.summary.scalar('disc_loss', disc_loss, step=epoch)
Now my question is about the summary writer: is it saving the loss of just one batch, or the average across the whole dataset? If it is per batch, which batch's loss is it saving, and how can I get the average across the whole dataset if the batches are not all the same size?
I assumed it was the average because I got the code from a TensorFlow tutorial and trusted it, but when I think about it I am not sure that is the case.
If you want Tensorboard to get only the loss per epoch, you need to save the values to Tensorboard at the end of every epoch, not at every batch.
First, create variables for mean values for every epoch:
for epoch in range(epochs):
    mean_epoch_loss = tf.metrics.Mean()
    # etc...
And in train_step, update this value with the corresponding loss:
@tf.function
def train_step(input_image, target, epoch):
    # etc...
    mean_epoch_loss.update_state(epoch_loss)
And at the end of every epoch, save this value to Tensorboard:
for epoch in range(epochs):
    # etc...
    with summary_writer.as_default():
        tf.summary.scalar('mean_epoch_loss', mean_epoch_loss.result(), step=epoch)
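Put together, a minimal sketch might look like the following; it assumes train_step is modified to return the loss you want to average (the original train_step returns nothing), and only gen_total_loss is shown for brevity:
for epoch in range(epochs):
    mean_epoch_loss = tf.metrics.Mean()                          # fresh running mean each epoch
    for input_image, target in train_ds:
        gen_total_loss = train_step(input_image, target, epoch)  # train_step now returns the loss
        mean_epoch_loss.update_state(gen_total_loss)
    with summary_writer.as_default():
        tf.summary.scalar('mean_gen_total_loss', mean_epoch_loss.result(), step=epoch)
Regarding unequal batch sizes: the per-batch loss is already a mean over the examples in that batch, so averaging batch losses slightly over-weights a smaller final batch. If that matters, pass the batch size as a weight, e.g. mean_epoch_loss.update_state(gen_total_loss, sample_weight=batch_size), so the running mean weights each batch by its number of examples.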
I am trying to use my own dataset to classify text according to https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/5%20-%20Multi-class%20Sentiment%20Analysis.ipynb. My dataset is a CSV of sentences with a class associated with each. There are 6 different classes:
sent                        class
'the fox is brown'          animal
'the house is big'          object
'one water is drinkable'    water
...
When running:
N_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    print(start_time)
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    print(train_loss.type())
    print(train_acc.type())
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut5-model.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
I receive the following error:
RuntimeError: "log_softmax_lastdim_kernel_impl" not implemented for 'torch.LongTensor'
pointing to:
<ipython-input-38-9c6cff70d2aa> in train(model, iterator, optimizer, criterion)
14 print('pred'+ predictions.type())
15 #batch.label = batch.label.type(torch.LongTensor)
---> 16 loss = criterion(predictions.long(), batch.label)**
The solution posted here https://github.com/pytorch/pytorch/issues/14224 suggests I need to use long/int.
I had to add .long() at line ** in order to fix this earlier error:
RuntimeError: Expected object of scalar type Long but got scalar type Float for argument #2 'target'
The specific lines of code are:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text)
        print('pred' + predictions.type())
        # batch.label = batch.label.type(torch.LongTensor)
        loss = criterion(predictions.long(), batch.label)**
        acc = categorical_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
Note, the ** was originally loss = criterion(predictions, batch.label)
Any other suggestions to fix this issue?
criterion is defined as torch.nn.CrossEntropyLoss() in your notebook. As mentioned in the documentation of CrossEntropyLoss, it expects the scores returned by the model for each of the K classes, together with the corresponding ground-truth label, as input. The scores are float tensors, while the ground-truth label should be a long tensor representing a class index (a class cannot be a float, e.g. 2.3 cannot represent a class). Hence:
loss = criterion(predictions, batch.label.long())
should work.
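As a quick standalone illustration of the expected dtypes (not taken from the notebook):
import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
predictions = torch.randn(4, 6)         # float scores: 4 samples, 6 classes
labels = torch.tensor([0, 3, 5, 1])     # long class indices
loss = criterion(predictions, labels)   # float inputs + long targets: no error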
If using a GPU, instead of casting to long inside the loss criterion call, you should probably cast the target before moving it to CUDA. I faced the same error and resolved it with the code below:
# move data to GPU, if available
if train_on_gpu:
    inp = inp.cuda()
    target = target.long()
    target = target.cuda()
h = tuple([each.data for each in hidden])
# perform backpropagation and optimization
# zero accumulated gradient
rnn.zero_grad()
# getting output from model
output, h = rnn(inp, h)
# calculating loss and performing back_propagation
loss = criterion(output.squeeze(), target)
I took a tutorial available here and tried to run it on my dataset. It compiles and begins training, but here is what I got:
The model doesn't seem to be saved at each iteration.
I also tried with 100 epochs and it didn't change anything; it gives the output of the first iteration.
Do you have any idea what the problem might be? (I know the code is long, sorry.)
def train(model, epochs, log_string):
    '''Train the RNN'''
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Used to determine when to stop the training early
        valid_loss_summary = []
        # Keep track of which batch iteration is being trained
        iteration = 0
        print()
        print("Training Model: {}".format(log_string))
        train_writer = tf.summary.FileWriter('./logs/3/train/{}'.format(log_string), sess.graph)
        valid_writer = tf.summary.FileWriter('./logs/3/valid/{}'.format(log_string))
        for e in range(epochs):
            state = sess.run(model.initial_state)
            # Record progress with each epoch
            train_loss = []
            train_acc = []
            val_acc = []
            val_loss = []
            with tqdm(total=len(x_train)) as pbar:
                for _, (x, y) in enumerate(get_batches(x_train, y_train, batch_size), 1):
                    feed = {model.inputs: x,
                            model.labels: y[:, None],
                            model.keep_prob: dropout,
                            model.initial_state: state}
                    summary, loss, acc, state, _ = sess.run([model.merged,
                                                             model.cost,
                                                             model.accuracy,
                                                             model.final_state,
                                                             model.optimizer],
                                                            feed_dict=feed)
                    # Record the loss and accuracy of each training batch
                    train_loss.append(loss)
                    train_acc.append(acc)
                    # Record the progress of training
                    train_writer.add_summary(summary, iteration)
                    iteration += 1
                    pbar.update(batch_size)
            avg_train_loss = np.mean(train_loss)
            avg_train_acc = np.mean(train_acc)
            val_state = sess.run(model.initial_state)
            with tqdm(total=len(x_valid)) as pbar:
                for x, y in get_batches(x_valid, y_valid, batch_size):
                    feed = {model.inputs: x,
                            model.labels: y[:, None],
                            model.keep_prob: 1,
                            model.initial_state: val_state}
                    summary, batch_loss, batch_acc, val_state = sess.run([model.merged,
                                                                          model.cost,
                                                                          model.accuracy,
                                                                          model.final_state],
                                                                         feed_dict=feed)
                    # Record the validation loss and accuracy of each epoch
                    val_loss.append(batch_loss)
                    val_acc.append(batch_acc)
                    pbar.update(batch_size)
            # Average the validation loss and accuracy of each epoch
            avg_valid_loss = np.mean(val_loss)
            avg_valid_acc = np.mean(val_acc)
            valid_loss_summary.append(avg_valid_loss)
            # Record the validation data's progress
            valid_writer.add_summary(summary, iteration)
            # Print the progress of each epoch
            print("Epoch: {}/{}".format(e, epochs),
                  "Train Loss: {:.3f}".format(avg_train_loss),
                  "Train Acc: {:.3f}".format(avg_train_acc),
                  "Valid Loss: {:.3f}".format(avg_valid_loss),
                  "Valid Acc: {:.3f}".format(avg_valid_acc))
            # Stop training if the validation loss does not decrease after 3 epochs
            if avg_valid_loss > min(valid_loss_summary):
                print("No Improvement.")
                stop_early += 1
                if stop_early == 3:
                    break
            # Reset stop_early if the validation loss finds a new low
            # Save a checkpoint of the model
            else:
                print("New Record!")
                stop_early = 0
                checkpoint = "sauvegarde/controverse_{}.ckpt".format(log_string)
                saver.save(sess, checkpoint)
Thank you very much for your answers :)