@tf.function is slowing down training step - python

I am using the following tf.function decorated training step:
@tf.function
def train_step(inputs, labels):
    with tf.GradientTape(persistent=True) as tape:
        predictions = model([X, F], training=True)
        losses = [l_f(tf.expand_dims(labels[:, i], axis=-1), predictions[i]) for i, l_f in enumerate(loss_functions)]
    gradients = [tape.gradient(l, model.trainable_variables) for l in losses]
    for g in gradients:
        grads = [gg if gg is not None else tf.zeros_like(model.trainable_variables[i], dtype=tf.float32) for i, gg in enumerate(g)]
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
    del tape
    return losses
def weighted_loss(weights):
    @tf.function
    def loss_func(labels, predictions):
        min_class_filter = tfk.backend.greater(labels, 0.5)
        y_min = tf.boolean_mask(labels, min_class_filter)
        y_max = tf.boolean_mask(labels, tf.math.logical_not(min_class_filter))
        y_pred_min = tf.boolean_mask(predictions, min_class_filter)
        y_pred_max = tf.boolean_mask(predictions, tf.math.logical_not(min_class_filter))
        loss_min_class = tfk.backend.mean(tfk.backend.binary_crossentropy(y_min, y_pred_min))
        loss_max_class = tfk.backend.mean(tfk.backend.binary_crossentropy(y_max, y_pred_max))
        loss_all = tfk.backend.mean(tfk.backend.binary_crossentropy(labels, predictions))
        return weights[0]*loss_min_class + weights[1]*loss_max_class + weights[2]*loss_all
    return loss_func

loss_functions = [weighted_loss(w) for w in target_weights]
It's a little quirky, but basically my network has multiple outputs, which means there are cases where a gradient of None for certain weights is correct, so I replace those gradients with zeros; I compute the loss at each of these outputs separately and then propagate each of them at every step.
When I run this as written, it takes an extremely long time (10min+) to run a single training step, and I see the following message in the logs:
E tensorflow/core/grappler/optimizers/meta_optimizer.cc:502] function_operator failed: Invalid argument: Input 0 of node model/LSTM_forward_0/zeros_like was passed int32 from model/LSTM_forward_0/StatefulPartitionedCall:9 incompatible with expected variant.
When I remove the @tf.function decorator, it runs in about 10% of the time, and I do not see this log warning. Is this warning a red herring, or does it legitimately point to an issue created by adding @tf.function?
Additional Details:
TF 2.0
GPU enabled and available
CUDA 10.1
GPU utilization is 0% in both cases, but that isn't caused by the data feed maxing out CPU throughput: when I generate training data outside of the training loop, it streams all but instantaneously from TFRecords with sufficient prefetch and limited augmentation
the dtypes of inputs, labels, gradients and all model.trainable_variables are tf.float32

From what I have read, a tf.function should not contain assignments to graph variables if it is to run smoothly.
In a training step you are changing the weights of the model, which violates this.
I'm not sure this is the reason, but you can try keeping tf.function only on the loss function and not on the training step.

I have figured out how to fix it. The issue was with overwriting None gradients, not with the persistent gradient tape.
@tf.function
def train_step(inputs, labels):
    with tf.GradientTape(persistent=True) as tape:
        predictions = model([X, F], training=True)
        losses = [l_f(labels, predictions, i) for i, l_f in enumerate(loss_functions)]
    gradients = [tape.gradient(l, model.trainable_variables) for l in losses]
    for g in gradients:
        optimizer.apply_gradients(zip(g, model.trainable_variables))
    del tape
    return losses
def weighted_loss(weights):
    @tf.function
    def loss_func(labs, preds, i):
        labels = tf.expand_dims(labs[:, i], axis=-1)
        predictions = preds[i]
        min_class_filter = tfk.backend.greater(labels, 0.5)
        y_min = tf.boolean_mask(labels, min_class_filter)
        y_max = tf.boolean_mask(labels, tf.math.logical_not(min_class_filter))
        y_pred_min = tf.boolean_mask(predictions, min_class_filter)
        y_pred_max = tf.boolean_mask(predictions, tf.math.logical_not(min_class_filter))
        loss_min_class = tfk.backend.mean(tfk.backend.binary_crossentropy(y_min, y_pred_min))
        loss_max_class = tfk.backend.mean(tfk.backend.binary_crossentropy(y_max, y_pred_max))
        loss_all = tfk.backend.mean(tfk.backend.binary_crossentropy(labels, predictions))
        return weights[0]*loss_min_class + weights[1]*loss_max_class + weights[2]*loss_all
    return loss_func

loss_functions = [weighted_loss(w) for w in target_weights]
By passing all outputs and all labels into the loss function (even though each loss ignores most of them), the tape returns an appropriate gradient (zero) for every branch, not just the ones in focus for that particular loss.
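As a side note (my addition, not part of the original answer): tf.GradientTape.gradient also accepts an unconnected_gradients argument, so the earlier None-replacement can be expressed more directly by asking the tape for zeros instead of None for unconnected variables. A minimal sketch, assuming the same tape, losses, and model as above:

# Return zeros instead of None for variables a given loss does not reach.
gradients = [
    tape.gradient(l, model.trainable_variables,
                  unconnected_gradients=tf.UnconnectedGradients.ZERO)
    for l in losses
]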

Related

Pytorch backward does not compute the gradients for requested variables

I'm trying to train a resnet18 model in PyTorch (+ pytorch-lightning) using Virtual Adversarial Training. During the computations required for this type of training I need to obtain the gradient of D (i.e. the cross-entropy loss of the model) with respect to the tensor r.
This should, in theory, happen in the following code snippet:
def generic_step(self, train_batch, batch_idx, step_type):
    x, y = train_batch
    unlabeled_idx = y is None
    d = torch.rand(x.shape).to(x.device)
    d = d / (torch.norm(d) + 1e-8)
    pred_y = self.classifier(x)
    y[unlabeled_idx] = pred_y[unlabeled_idx]
    l = self.criterion(pred_y, y)
    R_adv = torch.zeros_like(x)
    for _ in range(self.ip):
        r = self.xi * d
        r.requires_grad = True
        pred_hat = self.classifier(x + r)
        # pred_hat = F.log_softmax(pred_hat, dim=1)
        D = self.criterion(pred_hat, pred_y)
        self.classifier.zero_grad()
        D.requires_grad = True
        D.backward()
        R_adv += self.eps * r.grad / (torch.norm(r.grad) + 1e-8)
    R_adv /= 32
    loss = l + R_adv * self.a
    loss.backward()
    self.accuracy[step_type] = self.acc_metric(torch.argmax(pred_y, 1), y)
    return loss
Here, to my understanding, r.grad should in theory be the gradient of D with respect to r. However, the code throws this at D.backward():
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
(full traceback excluded because this error is not helpful and technically "solved" as I know the cause for it, explained just below)
After some research and debugging it seems that in this situation D.backward() attempts to calculate dD/dD disregarding any previous mention of requires_grad=True. This is confirmed when I add D.requires_grad=True and I get D.grad=Tensor(1.,device='cuda:0') but r.grad=None.
Does anyone know why this may be happening?
In Lightning, .backward() and the optimizer step are handled under the hood. If you do it yourself like in the code above, it will interfere with Lightning, because it doesn't know you called backward yourself.
You can enable manual optimization in the LightningModule:
def __init__(self):
    super().__init__()
    # put this in your init
    self.automatic_optimization = False
This tells Lightning that you are taking over calling backward and handling optimizer step + zero grad yourself. Don't forget to add that in your code above. You can access the optimizer and scheduler like so in your training step:
def training_step(self, batch, batch_idx):
    optimizer = self.optimizers()
    scheduler = self.lr_schedulers()
    # do your training step
    # don't forget to call:
    # 1) backward 2) optimizer step 3) zero grad
Read more about manual optimization here.
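To make those three steps concrete, here is a minimal sketch of a manually optimized training_step (the classifier, criterion, and batch layout are placeholders, not the asker's code):

def training_step(self, batch, batch_idx):
    opt = self.optimizers()
    x, y = batch
    loss = self.criterion(self.classifier(x), y)
    opt.zero_grad()
    self.manual_backward(loss)  # use manual_backward instead of loss.backward() so
                                # Lightning can still handle precision/accelerator details
    opt.step()
    return loss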

Model does not train properly when explicitly applying the gradients

I’m trying to constrain the weights of my model by explicitly applying the gradients; however, this is not working and I can’t figure out why.
I’m defining the model with the following function:
def init_model(num_hidden_layers=2, num_neurons_per_layer=64):
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(2,)))
    for _ in range(num_hidden_layers):
        model.add(tf.keras.layers.Dense(num_neurons_per_layer, activation=tf.keras.layers.LeakyReLU(), kernel_initializer="glorot_uniform"))
    model.add(tf.keras.layers.Dense(1, kernel_initializer="glorot_uniform"))
    return model
When using the fit method, the loss function decreases and the model fits the data:
Nepochs = 1500
lr = 0.001

def my_loss(u_true, u_pred):
    return tf.math.reduce_mean(tf.math.square(u_true - u_pred))

model_0 = init_model(num_hidden_layers=2, num_neurons_per_layer=64)
optim_0 = tf.keras.optimizers.Adam(learning_rate=lr)
model_0.compile(loss=my_loss, optimizer=optim_0)
model_0.summary()
history_0 = model_0.fit(X_train, u_train, validation_data=(X_test.numpy(), u_test.numpy()), epochs=Nepochs, batch_size=X_train.shape[0])
When I explicitly specify and apply the gradient, the loss function stagnates and the output does not fit the data (it is uniform everywhere):
Nepochs = 1500
lr = 0.001

def compute_loss(model, X_data, u_data):
    u_pred = model(X_data)
    loss = tf.math.reduce_mean(tf.math.square(u_data - u_pred))
    return loss

@tf.function
def training(model, optim, X_train, u_train, X_test=None, u_test=None):
    if X_test is not None:
        validation_loss = compute_loss(model, X_test, u_test)
    else:
        validation_loss = None
    with tf.GradientTape(persistent=True) as tape:
        tape.watch(model.trainable_variables)
        loss = compute_loss(model, X_train, u_train)
    grad_theta = tape.gradient(loss, model.trainable_variables)
    optim.apply_gradients(zip(grad_theta, model.trainable_variables))
    return loss, validation_loss

model_G = init_model(num_hidden_layers=2, num_neurons_per_layer=64)
optim_G = tf.keras.optimizers.Adam(learning_rate=lr)
model_G.summary()

hist = {'val_loss': [], 'loss': []}
for i in range(Nepochs + 1):
    loss, val_loss = training(model_G, optim_G, X_train, u_train, X_test, u_test)
    hist['loss'].append(loss.numpy())
    hist['val_loss'].append(val_loss.numpy())
    if val_loss is not None:
        print('It {:05d}: loss = {:10.8e}, validation loss = {:10.8e} '.format(i, loss, val_loss))
    else:
        print('It {:05d}: loss = {:10.8e}'.format(i, loss))
Why do the two versions provide different results?
Thanks for the help.
Cesare
Finally, I found that by expanding the dimension of the targets as follows:
u_train = tf.expand_dims(u_train,axis=-1)
u_test = tf.expand_dims(u_test,axis=-1)
the model trains properly and the loss functions are correctly evaluated.
u_train and u_test previously had shapes equal to the number of entries N only; after expanding the dimension, the shape is (N, 1).
Using fit, the code works with both shapes; when explicitly applying the gradients, it only works with targets of shape (N, 1).
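The underlying reason is most likely broadcasting (my note, not part of the original answer): the model outputs shape (N, 1), so with targets of shape (N,) the difference u_data - u_pred broadcasts to an (N, N) matrix and the mean is taken over every target/prediction pair, whereas Keras's fit path reconciles the two shapes before computing the loss. A quick way to see it:

# Hypothetical shapes, just to illustrate the broadcasting pitfall.
u_true = tf.zeros((4,))         # targets with shape (N,)
u_pred = tf.zeros((4, 1))       # model output with shape (N, 1)
print((u_true - u_pred).shape)  # (4, 4): every target is paired with every prediction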

tf.gradients() returns a list of [None]

Sorry if this sounds like a repeat. I have been through all of the related questions and found no suitable solutions to my problem's context.
I am trying to build a generative model that outputs probabilities for each tracked day of COVID to input into an SEIR-based epidemiology model.
The generation works. However, I cannot figure out how to train the model. I have to write a custom loss function that runs the day-by-day parameters through a step function for the epidemiology model, which populates a dataset of "confirmed" and "removed" for each day. I then compare that data to the recorded "confirmed" and "removed" from the Johns Hopkins COVID dataset on GitHub.
I use Mean Absolute Error to calculate a loss between the "confirmed" and "removed" values based on the generated probabilities and the actual values from the JHU dataset. The issue I am running into is that when I call tape.gradient() it returns a list of Nones. I am stuck here and any assistance would be greatly appreciated.
Here is the code I am using:
Training Step
# Define function to train the model based on one input
loss_fn = MeanAbsoluteError()
optimizer = Adam(learning_rate=0.005)

@tf.function
def train_step(x, y):
    y_pred = np.zeros((3, latent_dim))
    N = tf.constant(int(7_000_000_000), dtype=tf.float64)
    E0 = tf.Variable(int(1000), trainable=False, dtype=tf.float64)
    I0 = tf.Variable(covid_df.iloc[0]["Confirmed"], trainable=False, dtype=tf.float64)
    R0 = tf.Variable(covid_df.iloc[0]["Removed"], trainable=False, dtype=tf.float64)
    S0 = tf.Variable(N - E0 - I0 - R0, trainable=False, dtype=tf.float64)
    u0 = tf.Variable(0, trainable=False, dtype=tf.float64)
    SuEIRs = tf.stack([S0, u0, E0, I0, R0])
    with tf.GradientTape() as tape:
        logits = generator(tf.reshape(x, (batch_size, 4, latent_dim)), training=True)
        betas = logits[0][0]
        sigmas = logits[0][1]
        mus = logits[0][2]
        gammas = logits[0][3]
        for t in range(latent_dim):
            SuEIR_diffs = SuEIR_step(SuEIRs, t, N, betas, sigmas, mus, gammas)
            SuEIRs = SuEIRs + SuEIR_diffs
            confirmed = SuEIRs[3]
            removed = SuEIRs[4]
            # update y_pred
            y_pred[0, t] = float(t + 1)
            y_pred[1, t] = confirmed.numpy()
            y_pred[2, t] = removed.numpy()
        # Convert predictions
        y_pred = tf.convert_to_tensor(y_pred)
        # Calculate loss
        loss_value = loss_fn(y[1], y_pred[1]) + loss_fn(y[2], y_pred[2])
    # Calculate the gradient
    grads = tape.gradient(loss_value, generator.trainable_weights)
    print(grads)  # ==>> outputs [None, None, None, None]
    # Apply gradients to model
    optimizer.apply_gradients(zip(grads, generator.trainable_weights))
    return loss_value
Training Loop
import time

epochs = 2
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()
    # Iterate over the batches of the dataset.
    for step in range(sample_size):
        loss_value = train_step(x_input[step], y_true)
        # Log every 5 batches.
        if step % 5 == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )
    print("Time taken: %.2fs" % (time.time() - start_time))
Error output
ValueError: No gradients provided for any variable: ['dense/kernel:0', 'dense/bias:0', 'dense_1/kernel:0', 'dense_1/bias:0'].
loss_value and generator.trainable_weights are populated as expected.
EDIT: Updated code to reflect the suggestions of Myrl Marmarelis and the architecture of TensorFlow's custom training loop guide. Still having the same issue of the gradients being a list of Nones.
Try changing your calls to np.array(...) before calculating the loss (especially on y_pred) to tf.convert_to_tensor(...). You need to build a proper symbolic graph by keeping everything as tf.Tensors. In fact, make sure you are not converting anything to a non-Tensor anywhere along the chain of computation between the model parameters and the loss.
I would also suggest wrapping your training procedure in a @tf.function so that TensorFlow may compile it into a static graph.
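For reference, here is one hedged sketch of how the inner loop can stay symbolic (generator, SuEIR_step, latent_dim, batch_size, N, and SuEIRs are taken from the question; since the full model isn't shown, treat this as an illustration of the advice above rather than a drop-in fix): accumulate the per-day outputs in a tf.TensorArray instead of writing .numpy() values into a NumPy array, so the tape can trace from the loss back to the generator weights.

@tf.function
def train_step(x, y):
    # ... build N and the initial SuEIRs state as in the question ...
    with tf.GradientTape() as tape:
        logits = generator(tf.reshape(x, (batch_size, 4, latent_dim)), training=True)
        betas, sigmas, mus, gammas = logits[0][0], logits[0][1], logits[0][2], logits[0][3]
        confirmed = tf.TensorArray(tf.float64, size=latent_dim)
        removed = tf.TensorArray(tf.float64, size=latent_dim)
        state = SuEIRs
        for t in range(latent_dim):
            state = state + SuEIR_step(state, t, N, betas, sigmas, mus, gammas)
            confirmed = confirmed.write(t, state[3])  # stays a tf.Tensor, no .numpy()
            removed = removed.write(t, state[4])
        loss_value = loss_fn(y[1], confirmed.stack()) + loss_fn(y[2], removed.stack())
    grads = tape.gradient(loss_value, generator.trainable_weights)
    optimizer.apply_gradients(zip(grads, generator.trainable_weights))
    return loss_value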

Custom Training Loop with multiple model pass-through

Dear stackoverflow members,
I am currently trying to implement my own Keras Tuner training loop. In this loop I want to pass the input variable through the model multiple times, for example:
Y = Startvalue
for i in range(x):
    Y = model(Y)
I want to see if this method creates more stable simulations for my self-feedback problem.
When I implement it I get an OOM error, even when I do not loop. This error does not occur when I just do it normally.
My class example (the OOM error occurs when I switch logits for logits2):
class MyTuner(kt.Tuner):
    def run_trial(self, trial, train_ds, validation_data):
        model = self.hypermodel.build(trial.hyperparameters)
        optimizer = tf.keras.optimizers.Adam()
        epoch_loss_metric = tf.keras.metrics.MeanSquaredError()

        def microbatch(T_IN, A_IN, D_IN):
            OUT_T = []
            OUT_A = []
            for i in range(len(T_IN)):
                A_IN_R = tf.expand_dims(tf.squeeze(A_IN[i]), 0)
                T_IN_R = tf.expand_dims(tf.squeeze(T_IN[i]), 0)
                D_IN_R = tf.expand_dims(tf.squeeze(D_IN[i]), 0)
                (OUT_T_R, OUT_A_R) = model((A_IN_R, T_IN_R, D_IN_R))
                OUT_T.append(tf.squeeze(OUT_T_R))
                OUT_A.append(tf.squeeze(OUT_A_R))
            return (tf.squeeze(tf.stack(OUT_T)), tf.squeeze(tf.stack(OUT_A)))

        def run_train_step(data):
            T_IN = tf.dtypes.cast(data[0][0], 'float32')
            A_IN = tf.dtypes.cast(data[0][1], 'float32')
            D_IN = tf.dtypes.cast(data[0][2], 'float32')
            A_Ta = tf.dtypes.cast(data[1][0], 'float32')
            T_Ta = tf.dtypes.cast(data[1][1], 'float32')
            mse = tf.keras.losses.MeanSquaredError()
            with tf.GradientTape() as tape:
                logits2 = microbatch(T_IN, A_IN, D_IN)
                logits = model([A_IN, T_IN, D_IN])
                loss = mse((T_Ta, A_Ta), logits2)
                # Add any regularization losses.
                if model.losses:
                    loss += tf.math.add_n(model.losses)
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            epoch_loss_metric.update_state((T_Ta, A_Ta), logits2)
            return loss

        for epoch in range(1000):
            print('Epoch: {}'.format(epoch))
            self.on_epoch_begin(trial, model, epoch, logs={})
            for batch, data in enumerate(train_ds):
                self.on_batch_begin(trial, model, batch, logs={})
                batch_loss = float(run_train_step(data))
                self.on_batch_end(trial, model, batch, logs={'loss': batch_loss})
                if batch % 100 == 0:
                    loss = epoch_loss_metric.result().numpy()
                    print('Batch: {}, Average Loss: {}'.format(batch, loss))
            epoch_loss = epoch_loss_metric.result().numpy()
            self.on_epoch_end(trial, model, epoch, logs={'loss': epoch_loss})
            epoch_loss_metric.reset_states()
In my understanding, the microbatch function is not implementing a self-feedback loop (though that does not affect the OOM).
I guess what's happening is that because you compute the output of the network k times, the memory consumed by the network increases by a factor of k (it needs to keep the intermediate tensors of every pass for backprop).
What you can do is backpropagate the gradients at each self-feedback instance, so that the intermediate tensors do not accumulate beyond the limit.
Let me know if you have any doubts.
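A rough sketch of that idea (my own illustration; k, start_value, target, mse, model, and optimizer stand in for the asker's names): take a gradient after every pass and accumulate it, detaching the fed-back output so the tape never has to hold more than one unrolled pass.

accum = [tf.zeros_like(v) for v in model.trainable_variables]
y = start_value
for _ in range(k):
    with tf.GradientTape() as tape:
        y_next = model(y, training=True)
        loss = mse(target, y_next)
    grads = tape.gradient(loss, model.trainable_variables)
    accum = [a + g for a, g in zip(accum, grads)]
    y = tf.stop_gradient(y_next)  # detach so the next pass starts a fresh graph
optimizer.apply_gradients(zip(accum, model.trainable_variables))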

Eagerly update a keras model's weights directly using the gradient

I am writing a custom optimizer with Eager Execution in TensorFlow 1.15 but can't figure out how to update the weights.
Taking gradient descent as an example, I have the weights, the gradient and a scalar learning rate but can't figure out how to combine them.
This is an implementation of gradient descent where model is a keras.Model e.g. a multilayer CNN:
lr = tf.constant(0.01)

def minimize(model, inputs, targets):
    with tf.GradientTape() as tape:
        logits = model(inputs)
        loss_value = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=targets)
    grad = tape.gradient(loss_value, model.trainable_variables)
    step = tf.multiply(lr, grad)
    model.trainable_variables.assign_sub(step)
but it fails on the tf.multiply saying
tensorflow.python.framework.errors_impl.InvalidArgumentError: Shapes of all inputs must match: values[0].shape = [5,5,1,6] != values[1].shape = [6] [Op:Pack] name: packed
I also know the last line will fail as trainable_variables is a list and doesn't have the method assign_sub.
How can I rewrite the last two lines of my code to do:
model.trainable_variables -= lr * grad
Figured it out. Since both are lists, we need to iterate through the gradient/variable pairs for each layer together and update each pair separately.
lr = tf.constant(0.01)

def minimize(model, inputs, targets):
    with tf.GradientTape() as tape:
        logits = model(inputs)
        loss_value = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=targets)
    grad = tape.gradient(loss_value, model.trainable_variables)
    for v, g in zip(model.trainable_variables, grad):
        v.assign_sub(lr * g)
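For comparison (my addition, not part of the original answer): this per-variable update is essentially what a built-in optimizer's apply_gradients does, so the same step could also be written with the Keras optimizer API that ships with TF 1.15:

# Plain SGD via the Keras optimizer API, applied to the gradients computed above.
optimizer = tf.keras.optimizers.SGD(learning_rate=lr)
optimizer.apply_gradients(zip(grad, model.trainable_variables))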
