Dear stackoverflow members,
I am currently trying to implement my own keras tuner training loop. In this loop I want to pass the input variable multiple times through the model in example:
Y = Startvalue
for i in range(x):
Y = model(Y)
I want to see if this method creates more stable simulations for my self feedback problem.
When I implement it I get an OOM error even when I do not loop. This error does not occur when I just do it normally.
My Class example (the OOM error occurs when i switch logits for logits2:
class MyTuner(kt.Tuner):
def run_trial(self, trial, train_ds, validation_data):
model = self.hypermodel.build(trial.hyperparameters)
optimizer = tf.keras.optimizers.Adam()
epoch_loss_metric = tf.keras.metrics.MeanSquaredError()
def microbatch(T_IN, A_IN, D_IN):
OUT_T = []
OUT_A = []
for i in range(len(T_IN)):
A_IN_R = tf.expand_dims(tf.squeeze(A_IN[i]), 0)
T_IN_R = tf.expand_dims(tf.squeeze(T_IN[i]), 0)
D_IN_R = tf.expand_dims(tf.squeeze(D_IN[i]), 0)
(OUT_T_R, OUT_A_R) = model((A_IN_R, T_IN_R, D_IN_R))
OUT_T.append(tf.squeeze(OUT_T_R))
OUT_A.append(tf.squeeze(OUT_A_R))
return(tf.squeeze(tf.stack(OUT_T)), tf.squeeze(tf.stack(OUT_A)))
def run_train_step(data):
T_IN = tf.dtypes.cast(data[0][0], 'float32')
A_IN = tf.dtypes.cast(data[0][1], 'float32')
D_IN = tf.dtypes.cast(data[0][2], 'float32')
A_Ta = tf.dtypes.cast(data[1][0], 'float32')
T_Ta = tf.dtypes.cast(data[1][1], 'float32')
mse = tf.keras.losses.MeanSquaredError()
with tf.GradientTape() as tape:
logits2 = microbatch(T_IN, A_IN, D_IN)
logits = model([A_IN, T_IN, D_IN])
loss = mse((T_Ta, A_Ta), logits2)
# Add any regularization losses.
if model.losses:
loss += tf.math.add_n(model.losses)
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
epoch_loss_metric.update_state((T_Ta, A_Ta), logits2)
return loss
for epoch in range(1000):
print('Epoch: {}'.format(epoch))
self.on_epoch_begin(trial, model, epoch, logs={})
for batch, data in enumerate(train_ds):
self.on_batch_begin(trial, model, batch, logs={})
batch_loss = float(run_train_step(data))
self.on_batch_end(trial, model, batch, logs={'loss': batch_loss})
if batch % 100 == 0:
loss = epoch_loss_metric.result().numpy()
print('Batch: {}, Average Loss: {}'.format(batch, loss))
epoch_loss = epoch_loss_metric.result().numpy()
self.on_epoch_end(trial, model, epoch, logs={'loss': epoch_loss})
epoch_loss_metric.reset_states()
````
In my understanding, the micro-batch function is not implementing a self-feedback loop (though it does not affect the OOM)
I guess what's happening is that because you are computing the output of the network k times, the amount of memory consumption by the network is increasing by k times (because it needs to store intermediate tensors for backprop).
What you can do is, at each self-feedback instance, you backprop the gradients so that all the intermediate tensors do not increase beyond the limit.
lemme know if you have any doubt,
Related
I’m trying to constrain the weight of my model by explicitly applying the gradients; shower, this is not working and I can’t figure out why.
I’m defining the model with the following function:
def init_model(num_hidden_layers=2, num_neurons_per_layer=64):
model = tf.keras.Sequential()
model.add(tf.keras.Input(shape=(2,)) )
for _ in range(num_hidden_layers):
model.add(tf.keras.layers.Dense(num_neurons_per_layer, activation=tf.keras.layers.LeakyReLU( ),kernel_initializer="glorot_uniform") )
model.add(tf.keras.layers.Dense(1,kernel_initializer="glorot_uniform"))
return model
When using the fit method, the loss function decreases and the model fits the data:
Nepochs = 1500
lr = 0.001
def my_loss(u_true, u_pred):
return tf.math.reduce_mean(tf.math.square(u_true - u_pred))
model_0 = init_model(num_hidden_layers=2, num_neurons_per_layer=64)
optim_0 = tf.keras.optimizers.Adam(learning_rate=lr)
model_0.compile(loss=my_loss, optimizer=optim_0)
model_0.summary()
history_0 = model_0.fit(X_train,u_train,validation_data=(X_test.numpy(),u_test.numpy()),epochs=Nepochs, batch_size=X_train.shape[0])
When I explicitly specify and apply the gradient, the loss function stagnates and the output does not fit the data (it is uniform everywhere):
Nepochs = 1500
lr = 0.001
def compute_loss(model, X_data, u_data):
u_pred = model(X_data)
loss = tf.math.reduce_mean(tf.math.square(u_data - u_pred))
return loss
#tf.function
def training(model, optim, X_train, u_train, X_test=None, u_test=None):
if X_test is not None:
validation_loss = compute_loss(model, X_test, u_test )
else:
validation_loss = None
with tf.GradientTape(persistent=True) as tape:
tape.watch(model.trainable_variables)
loss = compute_loss(model, X_train, u_train )
grad_theta = tape.gradient(loss, model.trainable_variables)
optim.apply_gradients(zip(grad_theta, model.trainable_variables))
return loss,validation_loss
model_G = init_model(num_hidden_layers=2, num_neurons_per_layer=64)
optim_G = tf.keras.optimizers.Adam(learning_rate=lr)
model_G.summary()
hist = {'val_loss':[],'loss':[]}
for i in range(Nepochs+1):
loss, val_loss = training(model_G,optim_G,X_train,u_train,X_test,u_test)
hist['loss'].append(loss.numpy())
hist['val_loss'].append(val_loss.numpy())
if val_loss is not None:
print('It {:05d}: loss = {:10.8e}, validation loss = {:10.8e} '.format(i,loss,val_loss))
else:
print('It {:05d}: loss = {:10.8e}'.format(i,loss))
Why do the two versions provide different results?
Thanks for the help.
Cesare
Finally, I found that expanding the dimension of the targets as follows:
u_train = tf.expand_dims(u_train,axis=-1)
u_test = tf.expand_dims(u_test,axis=-1)
the model training properly and the loss functions are correctly evaluated.
u_train and u_test previously had shapes equal to the number of entries N only; by expanding the dimension, the shape now is (N,1).
using fit the code works with both; when explicitly using the gradient, only with targets of shape (N,1).
I am using PyTorch I am trying to do the validation on my dataset to obtain optimal number of channels in my neural network. I have the following code:
def train_during_validation():
for epoch in range (1, 201):
model.train()
optimizer.zero_grad()
out = model(data.x, data.edge_index)
loss = criterion(out[data.val_mask], data.y[data.val_mask])
loss.backward()
optimizer.step()
return loss
def validation():
loss_val = np.zeros(50, dtype = float)
model = GCN(hidden_channels = 1)
loss_val = train_during_validation()
print(loss_val)
validation()
In the code above I train previously defined model with 16 channels and I obtain a loss of 0.33. But as soon as I start doing validation on hidden_channel (see code below), my loss does not go down (it remains on 1.95). I do not understand why. Can somebody explain?
def train_during_validation(model):
print(f'Model:{model}')
for epoch in range (1, 201):
model.train()
optimizer.zero_grad()
out = model(data.x, data.edge_index)
loss = criterion(out[data.val_mask], data.y[data.val_mask])
loss.backward()
optimizer.step()
return loss
def validation():
loss_val = np.zeros(50, dtype = float)
model = GCN(hidden_channels = 1)
for i in range (50):
model = GCN(hidden_channels = i)
#print(model)
loss_val[i] = train_during_validation(model)
print(loss_val[i])
validation()
Eventually I found an error:
optimizer and criterion must be defined inside the def train_during_validation(model)
Sorry if this sounds like a repeat. I have been through all of the related questions and found no suitable solutions to my problem's context.
I am trying to build a generative model that outputs probabilities for each tracked day of COVID to input into an SEIR-based epidemiology model.
The generation works. However, I cannot figure out how to train the model. I have to write a custom loss function that runs the day-by-day parameters through a step function for the epidemiology model with will populate a dataset of "confirmed" and "removed" for each day. I then compare that data to the recorded "confirmed" and "removed" from John Hopkin's COVID dataset on GitHub.
I use Mean Absolute Error to calculate a loss between the "confirmed" and "removed" based on the generated probabilities and the actual values from the JHU dataset. The issue I am running into is when I call the tf.gradient() function it returns a list of Nones. I am stuck here and any assistance would be greatly appreciated.
Here is the code I am using:
Training Step
# Define function to train the model based on one input
loss_fn = MeanAbsoluteError()
optimizer = Adam(learning_rate=0.005)
#tf.function
def train_step(x, y):
y_pred = np.zeros((3, latent_dim))
N = tf.constant(int(7_000_000_000), dtype=tf.float64)
E0 = tf.Variable(int(1000), trainable=False, dtype=tf.float64)
I0 = tf.Variable(covid_df.iloc[0]["Confirmed"], trainable=False, dtype=tf.float64)
R0 = tf.Variable(covid_df.iloc[0]["Removed"], trainable=False, dtype=tf.float64)
S0 = tf.Variable(N - E0 - I0 - R0, trainable=False, dtype=tf.float64)
u0 = tf.Variable(0, trainable=False, dtype=tf.float64)
SuEIRs = tf.stack([S0,u0,E0,I0,R0])
with tf.GradientTape() as tape:
logits = generator(tf.reshape(x, (batch_size, 4, latent_dim)), training=True)
betas = logits[0][0]
sigmas = logits[0][1]
mus = logits[0][2]
gammas = logits[0][3]
for t in range(latent_dim):
SuEIR_diffs = SuEIR_step(SuEIRs, t, N, betas, sigmas, mus, gammas)
SuEIRs = SuEIRs + SuEIR_diffs
confirmed = SuEIRs[3]
removed = SuEIRs[4]
# update y_pred
y_pred[0,t] = float(t+1)
y_pred[1,t] = confirmed.numpy()
y_pred[2,t] = removed.numpy()
# Convert predictions
y_pred = tf.convert_to_tensor(y_pred)
# Calculate loss
loss_value = loss_fn(y[1], y_pred[1]) + loss_fn(y[2], y_pred[2])
# Calculate the gradient
grads = tape.gradient(loss_value, generator.trainable_weights)
print(grads) ##==>> outputs [None, None, None, None]
# Apply gradients to model
optimizer.apply_gradients(zip(grads, generator.trainable_weights))
return loss_value
Training Loop
import time
epochs = 2
for epoch in range(epochs):
print("\nStart of epoch %d" % (epoch,))
start_time = time.time()
# Iterate over the batches of the dataset.
for step in range(sample_size):
loss_value = train_step(x_input[step], y_true)
# Log every 5 batches.
if step % 5 == 0:
print(
"Training loss (for one batch) at step %d: %.4f"
% (step, float(loss_value))
)
print("Time taken: %.2fs" % (time.time() - start_time))
Error output
ValueError: No gradients provided for any variable: ['dense/kernel:0', 'dense/bias:0', 'dense_1/kernel:0', 'dense_1/bias:0'].
loss_value and generator.trainable_weights are populated as expected.
EDIT: Updated code to reflect the suggestions of Myrl Marmarelis and the architecture of TensorFlow's custom training loop guide. Still having the same issue of gradients being a list of None's.
Try changing your calls to np.array(...) before calculating the loss (especially on y_pred) to tf.convert_to_tensor(...). You need to build a proper symbolic graph by keeping everything as tf.Tensors. In fact, make sure you are not converting anything to a non-Tensor anywhere along the chain of computation between the model parameters and the loss.
I would also suggest wrapping your training procedure in a #tf.function so that Tensorflow may compile it into a static graph.
I designed a network for a text classification problem. To do this, I'm using huggingface transformet's BERT model with a linear layer above that for fine-tuning. My problem is that the loss on the training set is decreasing which is fine, but when it comes to do the evaluation after each epoch on the development set, the loss is increasing with epochs. I'm posting my code to investigate if there's something wrong with it.
for epoch in range(1, args.epochs + 1):
total_train_loss = 0
trainer.set_train()
for step, batch in enumerate(train_dataloader):
loss = trainer.step(batch)
total_train_loss += loss
avg_train_loss = total_train_loss / len(train_dataloader)
logger.info(('Training loss for epoch %d/%d: %4.2f') % (epoch, args.epochs, avg_train_loss))
print("\n-------------------------------")
logger.info('Start validation ...')
trainer.set_eval()
y_hat = list()
y = list()
total_dev_loss = 0
for step, batch_val in enumerate(dev_dataloader):
true_labels_ids, predicted_labels_ids, loss = trainer.validate(batch_val)
total_dev_loss += loss
y.extend(true_labels_ids)
y_hat.extend(predicted_labels_ids)
avg_dev_loss = total_dev_loss / len(dev_dataloader)
print(("\n-Total dev loss: %4.2f on epoch %d/%d\n") % (avg_dev_loss, epoch, args.epochs))
print("Training terminated!")
Following is the trainer file, which I use for doing a forward pass on a given batch and then backpropagate accordingly.
class Trainer(object):
def __init__(self, args, model, device, data_points, is_test=False, train_stats=None):
self.args = args
self.model = model
self.device = device
self.loss = nn.CrossEntropyLoss(reduction='none')
if is_test:
# Should load the model from checkpoint
self.model.eval()
self.model.load_state_dict(torch.load(args.saved_model))
logger.info('Loaded saved model from %s' % args.saved_model)
else:
self.model.train()
self.optim = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = data_points * self.args.epochs
self.scheduler = get_linear_schedule_with_warmup(self.optim, num_warmup_steps=0,
num_training_steps=total_steps)
def step(self, batch):
batch = tuple(t.to(self.device) for t in batch)
batch_input_ids, batch_input_masks, batch_labels = batch
self.model.zero_grad()
outputs = self.model(batch_input_ids,
attention_mask=batch_input_masks,
labels=batch_labels)
loss = self.loss(outputs, batch_labels)
loss = loss.sum()
(loss / loss.numel()).backward()
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
self.optim.step()
self.scheduler.step()
return loss
def validate(self, batch):
batch = tuple(t.to(self.device) for t in batch)
batch_input_ids, batch_input_masks, batch_labels = batch
with torch.no_grad():
model_output = self.model(batch_input_ids,
attention_mask=batch_input_masks,
labels=batch_labels)
predicted_label_ids = self._predict(model_output)
label_ids = batch_labels.to('cpu').numpy()
loss = self.loss(model_output, batch_labels)
loss = loss.sum()
return label_ids, predicted_label_ids, loss
def _predict(self, logits):
return np.argmax(logits.to('cpu').numpy(), axis=1)
Finally, the following is my model (i.e., Classifier) class:
import torch.nn as nn
from transformers import BertModel
class Classifier(nn.Module):
def __init__(self, args, is_eval=False):
super(Classifier, self).__init__()
self.bert_model = BertModel.from_pretrained(
args.init_checkpoint,
output_attentions=False,
output_hidden_states=True,
)
self.is_eval_mode = is_eval
self.linear = nn.Linear(768, 2) # binary classification
def switch_state(self):
self.is_eval_mode = not self.is_eval_mode
def forward(self, input_ids, attention_mask=None, labels=None):
bert_outputs = self.bert_model(input_ids,
token_type_ids=None,
attention_mask=attention_mask)
# Should give the logits to the the linear layer
model_output = self.linear(bert_outputs[1])
return model_output
For visualization the loss throughout the epochs:
When I've used Bert for text classification my model has generally behaved as you tell. In part this is expected because pre-trained models tend to require few epochs to fine-tune, actually if you check Bert's paper the number of epochs recommended for fine-tuning is between 2 and 4.
On the other hand, I've usually found the optimum at just 1 or 2 epochs, which coincides with your case also. My guess is: there is a trade-off when fine-tuning pre-trained models between fitting to your downstream task and forgetting the weights learned at pre-training. Depending on the data you have, the equilibrium point may happen sooner or later and overfitting starts after that. But this paragraph is speculation based on my experience.
When validation loss increases it means your model is overfitting
Being new to DL and Tensorflow, I am trying to refer to https://machinetalk.org/2019/03/29/neural-machine-translation-with-attention-mechanism/?unapproved=67&moderation-hash=ea8e5dcb97c8236f68291788fbd746a7#comment-67 for implementing a Sequence to Sequence Model with Attention. However, I am quite confused as to how I can implement early stopping to avoid overfitting as the convention model.fit() is not being used in the code I am following.
if MODE == 'train':
for e in range(NUM_EPOCHS):
en_initial_states = encoder.init_states(BATCH_SIZE)
encoder.save_weights(
'checkpoints_luong/encoder/encoder_{}.h5'.format(e + 1))
decoder.save_weights(
'checkpoints_luong/decoder/decoder_{}.h5'.format(e + 1))
for batch, (source_seq, target_seq_in, target_seq_out) in enumerate(dataset.take(-1)):
loss = train_step(source_seq, target_seq_in,
target_seq_out, en_initial_states)
if batch % 100 == 0:
print('Epoch {} Batch {} Loss {:.4f}'.format(
e + 1, batch, loss.numpy()))
try:
predict()
except Exception:
continue
Below is the Syntax for train_step() function:-
#tf.function
def train_step(source_seq, target_seq_in, target_seq_out, en_initial_states):
loss = 0
with tf.GradientTape() as tape:
en_outputs = encoder(source_seq, en_initial_states)
en_states = en_outputs[1:]
de_state_h, de_state_c = en_states
# We need to create a loop to iterate through the target sequences
for i in range(target_seq_out.shape[1]):
# Input to the decoder must have shape of (batch_size, length)
# so we need to expand one dimension
decoder_in = tf.expand_dims(target_seq_in[:, i], 1)
logit, de_state_h, de_state_c, _ = decoder(
decoder_in, (de_state_h, de_state_c), en_outputs[0])
# The loss is now accumulated through the whole batch
loss += loss_func(target_seq_out[:, i], logit)
variables = encoder.trainable_variables + decoder.trainable_variables
gradients = tape.gradient(loss, variables)
optimizer.apply_gradients(zip(gradients, variables))
return loss / target_seq_out.shape[1]
To implement early stopping in this case, is it the same process to use callback etc., or is the process different? An example for this case is highly appreciated.