I have a big issue with memory. I am developing a large GUI application for testing and optimizing neural networks. The main program shows the GUI, and training is done in a separate thread. In my app I need to train many models with different parameters, one after another. To do this I need to create a new model for each attempt; once one is trained I want to delete it and train the next, but I cannot free the old model. I am trying to do something like this:
del model
torch.cuda.empty_cache()
but the GPU memory doesn't change.
Then I tried this:
model.cpu()
del model
When I move the model to the CPU, the GPU memory is freed, but CPU memory increases.
With each training attempt, memory keeps growing; only when I close my app and run it again is all of the memory freed.
Is there a way to delete a model permanently from the GPU or CPU?
Edit:
Code:
The thread where the training process takes place:
class uczeniegridsearcch(QObject):
endofoneloop = pyqtSignal()
endofonesample = pyqtSignal()
finished = pyqtSignal()
def __init__(self, train_loader, test_loader, epoch, optimizer, lenoftd, lossfun, numberofsamples, optimparams, listoflabels, model_name, num_of_class, pret):
super(uczeniegridsearcch, self).__init__()
self.train_loaderup = train_loader
self.test_loaderup = test_loader
self.epochup = epoch
self.optimizername = optimizer
self.lenofdt = lenoftd
self.lossfun = lossfun
self.numberofsamples = numberofsamples
self.acc = 0
self.train_loss = 0
self.sendloss = 0
self.optimparams = optimparams
self.listoflabels = listoflabels
self.sel_Net = model_name
self.num_of_class = num_of_class
self.sel_Pret = pret
self.modelforsend = []
def setuptrainmodel(self):
if self.sel_Net == "AlexNet":
model = models.alexnet(pretrained=self.sel_Pret)
model.classifier[6] = torch.nn.Linear(4096, self.num_of_class)
elif self.sel_Net == "ResNet50":
model = models.resnet50(pretrained=self.sel_Pret)
model.fc = torch.nn.Linear(model.fc.in_features, self.num_of_class)
elif self.sel_Net == "VGG13":
model = models.vgg13(pretrained=self.sel_Pret)
model.classifier[6] = torch.nn.Linear(model.classifier[6].in_features, self.num_of_class)
elif self.sel_Net == "DenseNet201":
model = models.densenet201(pretrained=self.sel_Pret)
model.classifier = torch.nn.Linear(model.classifier.in_features, self.num_of_class)
elif self.sel_Net == "MNASnet":
model = models.mnasnet1_0(pretrained=self.sel_Pret)
model.classifier[1] = torch.nn.Linear(model.classifier[1].in_features, self.num_of_class)
elif self.sel_Net == "ShuffleNet v2":
model = models.shufflenet_v2_x1_0(pretrained=self.sel_Pret)
model.fc = torch.nn.Linear(model.fc.in_features, self.num_of_class)
elif self.sel_Net == "SqueezeNet":
model = models.squeezenet1_0(pretrained=self.sel_Pret)
model.classifier[1] = torch.nn.Conv2d(512, self.num_of_class, kernel_size=(1, 1), stride=(1, 1))
model.num_classes = self.num_of_class
elif self.sel_Net == "GoogleNet":
model = models.googlenet(pretrained=self.sel_Pret)
model.fc = torch.nn.Linear(model.fc.in_features, self.num_of_class)
return model
def train(self):
for x in range(self.numberofsamples):
torch.cuda.empty_cache()
modelup = self.setuptrainmodel()
device = torch.device('cuda')
optimizerup = TableWidget.setupotimfun(self, modelup, self.optimizername, self.optimparams[(x, 0)],
self.optimparams[(x, 1)], self.optimparams[(x, 2)],
self.optimparams[(x, 3)],
self.optimparams[(x, 4)], self.optimparams[(x, 5)])
modelup = modelup.to(device)
best_accuracy = 0.0
train_error_count = 0
for epoch in range(self.epochup):
for images, labels in iter(self.train_loaderup):
images = images.to(device)
labels = labels.to(device)
optimizerup.zero_grad()
outputs = modelup(images)
loss = TableWidget.setuplossfun(self, lossfun=self.lossfun, outputs=outputs, labels=labels)
self.train_loss += loss
loss.backward()
optimizerup.step()
train_error_count += float(torch.sum(torch.abs(labels - outputs.argmax(1))))
self.train_loss /= len(self.train_loaderup)
test_error_count = 0.0
for images, labels in iter(self.test_loaderup):
images = images.to(device)
labels = labels.to(device)
outputs = modelup(images)
test_error_count += float(torch.sum(torch.abs(labels - outputs.argmax(1))))
test_accuracy = 1.0 - float(test_error_count) / float(self.lenofdt)
print('%s, %d,%d: %f %f' % ("Próba nr:", x+1, epoch, test_accuracy, self.train_loss), "Parametry: ", self.optimparams[x,:])
self.acc = test_accuracy
self.sendloss = self.train_loss.item()
self.endofoneloop.emit()
self.endofonesample.emit()
modelup.cpu()
del modelup,optimizerup,device,test_accuracy,test_error_count,train_error_count,loss,labels,images,outputs
torch.cuda.empty_cache()
self.finished.emit()
How I call the thread in the main block:
self.qtest = uczeniegridsearcch(self.train_loader,self.test_loader, int(self.InputEpoch.text()),
self.sel_Optim,len(self.test_dataset), self.sel_Loss,
int(self.numberofsamples.text()), self.params, self.listoflabels,
self.sel_Net,len(self.sel_ImgClasses),self.sel_Pret)
self.qtest.endofoneloop.connect(self.inkofprogress)
self.qtest.endofonesample.connect(self.inksamples)
self.qtest.finished.connect(self.prints)
testtret = threading.Thread(target=self.qtest.train)
testtret.start()
Assuming that the model creation code runs iteratively inside a loop, I suggest the following:
1. Put the code for model creation, training, evaluation and model deletion inside a separate function and call that function from the loop body (a minimal sketch is shown below).
2. Call gc.collect() after the function call.
The rationale for the first point is that model creation, deletion and cache clearing then happen in a separate stack frame, so the GPU memory can be released when the function returns.
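A minimal sketch of that structure; build_model, run_training, evaluate and search_space are placeholders for your own code, not functions from your project:
import gc
import torch

def train_one_config(params):
    # Everything that touches the model lives in this stack frame.
    model = build_model(params).cuda()   # placeholder: your model creation
    run_training(model, params)          # placeholder: your training loop
    score = evaluate(model, params)      # placeholder: your evaluation
    del model                            # drop the last reference before returning
    torch.cuda.empty_cache()
    return score

for params in search_space:              # placeholder: your parameter grid
    result = train_one_config(params)
    gc.collect()                         # collect whatever is still lingering after the call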
I am getting:
tensorflow.python.framework.errors_impl.OperatorNotAllowedInGraphError: Using a symbolic `tf.Tensor` as a Python `bool` is not allowed in Graph execution. Use Eager execution or decorate this function with @tf.function.
while I'm trying to fit a DDPG agent on a custom environment.
Here is the CustomEnv()
class CustomEnv(Env):
def __init__(self):
print("Test_3 : Init")
"""NOTE: Bool array element definition for Box action space needs to be determined !!!!"""
self.action_space = Tuple((Box(low=4, high=20, shape=(1, 1)),
Box(low=0, high=1, shape=(1, 1)),
MultiBinary(1),
MultiBinary(1),
Box(low=4, high=20, shape=(1, 1)),
Box(low=0, high=1, shape=(1, 1)),
MultiBinary(1),
MultiBinary(1),
Box(low=0, high=100, shape=(1, 1)),
Box(low=0, high=100, shape=(1, 1))))
"""Accuracy array"""
self.observation_space = Box(low=np.asarray([0]), high=np.asarray([100]))
"""Initial Space"""
self.state = return_Acc(directory=source_dir, input_array=self.action_space.sample())
self.episode_length = 20
print(f"Action Space sample = {self.action_space.sample()}")
print("Test_3 : End Init")
def step(self, action):
print(f"Model Action Space Output = {action}")
print("Test_2 : Step")
accuracy_of_model = random.randint(0,100)#return_Acc(directory=source_dir, input_array=action)
self.state = accuracy_of_model#round(100*abs(accuracy_of_model))
self.episode_length -= 1
# Calculating the reward
print(f"self.state = {self.state}, accuracy_of_model = {accuracy_of_model}")
if (self.state > 60):
reward = self.state
else:
reward = -(60-self.state)*10
if self.episode_length <= 0:
done = True
else:
done = False
# Setting the placeholder for info
info = {}
# Returning the step information
print("Test_2 : End Step")
return self.state, reward, done, info
def reset(self):
print("Test_1 : Reset")
self.state = 50
print(f"Self state = {self.state}")
self.episode_length = 20
print("Test_1 : End Reset")
return self.state
The return_Acc function runs a random decision forest model and returns its accuracy to the DDPG model, which uses it to determine the next step's parameters. Finally, my DDPG model is given below:
states = env.observation_space.shape
actions = np.asarray(env.action_space.sample()).size
print(f"states = {states}, actions = {actions}")
def model_creation(states, actions):
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(32, activation='relu', input_shape=states))
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(actions, activation='linear'))
model.build()
return model
model = model_creation(states, actions)
model.summary()
def build_agent(model, actions, critic):
policy = BoltzmannQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
nafa = DDPGAgent(nb_actions=actions, actor=model, memory=memory, critic=critic, critic_action_input=action_input)
#dqn = DQNAgent(model=model, memory=memory, policy=policy,
# nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
return nafa
action_input = Input(shape=(actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())
dqn = build_agent(model, actions, critic)
dqn.compile(tf.keras.optimizers.Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=200, visualize=False, verbose=1)
results = dqn.test(env, nb_episodes=500, visualize=False)
print(f"episode_reward = {np.mean(results.history['episode_reward'])}")
I tried most of the solutions that I found here, such as
tf.compat.v1.enable_eager_execution()
and combinations of this with other functions (such as enable_v2_behavior()), but I wasn't able to make it work. If I don't run the RDF model inside the DDPG step, no problem occurs. If it's possible, how can I connect the RDF model's accuracy output to self.state as an input?
keras-rl2 1.0.5
tensorflow-macos 2.10.0
And I'm using an M1-based Mac, if that matters.
For anyone interested in the solution: I came up with a slower but at least working approach, and it's actually simpler than expected. Insert a command that runs the model script from the terminal and writes its output to a text file, then read that text file from the RL agent script; likewise, write the action space values to a text file which the model can then read to create the observation.
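A rough sketch of that hand-off; the file and script names are just placeholders:
import subprocess

def get_accuracy_from_rdf(action_values):
    # Write the action space values where the model script expects them.
    with open("action_space.txt", "w") as f:
        f.write(",".join(str(v) for v in action_values))
    # Run the RDF model script in its own interpreter, outside the TF graph.
    subprocess.run(["python", "rdf_model.py"], check=True)
    # Read back the accuracy the script wrote to disk.
    with open("accuracy.txt") as f:
        return float(f.read().strip())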
I am working on the Point Cloud Registration Network (PCRNet) and I have an issue with the training process. For that I wrote a PyTorch model that consists of 5 convolutional layers and 5 fully connected layers. My custom loss output changes with each new initialization of the network, but then for every epoch I obtain the same values for each batch, so no training is happening. I narrowed the error down to the fact that no gradients are being computed.
Here is my network and forward pass
class pcrnetwork(nn.Module):
def __init__(self,):
# This is the network that gets initialized with every new instance
super().__init__()
self.conv1 = nn.Conv1d(3,64,1, padding="valid")
self.conv2 = nn.Conv1d(64,64,1,padding="valid")
self.conv3 = nn.Conv1d(64,64,1,padding="valid")
self.conv4 = nn.Conv1d(64,128,1,padding="valid")
self.conv5 = nn.Conv1d(128,1024,1,padding="valid")
self.fc1 = nn.Linear(2048,1024)
self.fc2 = nn.Linear(1024,512)
self.fc3 = nn.Linear(512,512)
self.fc4 = nn.Linear(512,256)
self.fc5 = nn.Linear(256,6)
self.bn1 = nn.BatchNorm1d(64)
self.bn2 = nn.BatchNorm1d(64)
self.bn3 = nn.BatchNorm1d(64)
self.bn4 = nn.BatchNorm1d(128)
self.bn6 = nn.BatchNorm1d(1024)
self.bn7 = nn.BatchNorm1d(512)
self.bn8 = nn.BatchNorm1d(512)
self.bn9 = nn.BatchNorm1d(256)
def forward1(self,input,input1,points):
point_cloud = torch.cat((input,input1),dim=2)
net = Func.relu(self.bn1(self.conv1(point_cloud)))
net = Func.relu(self.bn2(self.conv2(net)))
net = Func.relu(self.bn3(self.conv3(net)))
net = Func.relu(self.bn4(self.conv4(net)))
net = Func.relu(self.conv5(net))
net_s = net[:,:,0:points]
net_t = net[:,:,points:points*2]
pool = nn.MaxPool1d(net_s.size(-1))(net_s)
pool2 = nn.MaxPool1d(net_t.size(-1))(net_t)
global_feature = torch.cat((pool,pool2),1)
#global_feature = torch.squeeze(global_feature,dim=2)
global_feature = torch.flatten(global_feature,start_dim=1)
# fully connected part
net = Func.relu(self.bn6(self.fc1(global_feature)))
net = Func.relu(self.bn7(self.fc2(net)))
net = Func.relu(self.bn8(self.fc3(net)))
net = Func.relu(self.bn9(self.fc4(net)))
net = Func.relu(self.fc5(net))
pose = net
output = appply_transformation(torch.transpose(input,1,2),pose)
return output
my training loop looks like this:
def train1():
losss = []
for epoch in range(1):
model.train()
total_loss = 0.0
#poses = []
for idx, data in enumerate(train_loader,0):
x = data["source"] # shape= [32,2048,3]
y = data["target"]
x = torch.transpose(x,1,2)
x = x.to(device)
y = torch.transpose(y,1,2)
y = y.to(device)
optimizer.zero_grad()
output = model.forward1(x,y,2048)
y = torch.transpose(y,1,2)
loss = og_chamfer1(y,output)
loss.backward()
optimizer.step()
print(loss.item())
And finally, here is the code for my loss function. The idea is to let the network calculate 6 parameters (3 rotational, 3 translational) that get fed into my apply transformation function. The actual loss (the Chamfer distance) is then calculated between the transformed source point cloud and the target point cloud.
def dist_vec(source, targ):
#AB = torch.matmul(targ,torch.transpose(source,1,2))
AB = torch.matmul(targ,torch.transpose(source,0,1))
#print("ab hat die shape",AB.shape)
AA = torch.sum(torch.square(targ),1)
#AA = AA[:,:,None]
#print("AA hat die shape", AA.shape)
BB = torch.sum(torch.square(source),1)
#BB = BB[:,:,None]
dist_matrix = torch.transpose((BB - 2 * AB), 0,1) + AA
return dist_matrix
def og_chamfer1(sourc,targ): # source =[32,2048,3]
batch_loss1 = torch.zeros(size=(len(sourc),))
batch_loss = []
#print(len(source))
for i in range(len(sourc)):
dist = dist_vec(sourc[i],targ[i])
#print("dist hat die shape", dist.shape)
min_x_val, min_x_idx = torch.min(dist, axis=0)
#print("this is minx", min_x_val)
#min_x = torch.tensor(min_x[0])
min_y_val, min_y_idx = torch.min(dist,axis=1)
#print("this is min y", min_y_val)
mean = torch.mean(min_x_val) + torch.mean(min_y_val)
batch_loss1[i] = mean
#batch_loss.append(mean)
#print(batch_loss)
#print(len(batch_loss))
#batch_loss_total = sum(batch_loss)/len(sourc)
#print(mean.shape)
batch_loss1 = torch.mean(batch_loss1)
return batch_loss1
All of these functions should work; I just post them for reference. I think the problem causing para.grad = None lies somewhere in my apply transformation function:
def rotate_cloud_by_angle_z(input, rotation_angle):
# the input here should have shape=(num.of points x 3)
# dtype for the rotation matrix needs to be set to float64
cosval = torch.cos(rotation_angle) # DONT USE TF.MATH.COS BECAUSE U GET A TENSOR NOT A NUMBER
sinval = torch.sin(rotation_angle)
#print("sinval hat shape:",sinval.shape)
#cosval = torch.from_numpy(cosval)
#sinval = torch.from_numpy(sinval)
rotation_matrix =torch.tensor([[cosval.item(),-sinval.item(),0],[sinval.item(),cosval.item(),0],[0,0,1]],dtype=torch.float32, requires_grad=False)
rotation_matrix = rotation_matrix.to(device)
product = torch.matmul(input, rotation_matrix)
return product
def appply_transformation(datas,poses):
transformed_data = datas
#print("poses hat die shape", poses.shape)
for i in range(datas.shape[0]):
#print("poses[i,5] hat shape:", poses[i,5])
#print("poses hat shape:", poses.shape)
transformed_data[i,:,:] = rotate_cloud_by_angle_z(transformed_data[i,:,:].clone(),poses[i,5])
#print(poses.shape)
#print("poses[i,5] hat shape:", poses[i,5])
transformed_data[i,:,:] = rotate_cloud_by_angle_y(transformed_data[i,:,:].clone(),poses[i,4])
transformed_data[i,:,:] = rotate_cloud_by_angle_x(transformed_data[i,:,:].clone(),poses[i,3])
transformed_data[i,:,:] = translation(transformed_data[i,:,:].clone(),torch.tensor([poses[i,0],poses[i,1],poses[i,2]],requires_grad=False).to(device))
return transformed_data
From https://discuss.pytorch.org/t/model-param-grad-is-none-how-to-debug/52634/3 I learned that one shouldn't use .item() or re-wrap tensors with something like x = torch.tensor(x), but essentially I don't know how to change my apply transformation function in such a way that the gradient calculation works.
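For what it's worth, here is roughly how I imagine a z-rotation without .item() could look; this is only a sketch on my side and I have not verified that it fixes the gradient flow:
def rotate_cloud_by_angle_z_diff(points, angle):
    # angle is a 0-dim tensor coming from the network, e.g. poses[i, 5]
    cosval, sinval = torch.cos(angle), torch.sin(angle)
    zero, one = torch.zeros_like(angle), torch.ones_like(angle)
    # torch.stack keeps the matrix connected to the autograd graph,
    # unlike torch.tensor([[cosval.item(), ...]])
    rotation_matrix = torch.stack([
        torch.stack([cosval, -sinval, zero]),
        torch.stack([sinval,  cosval, zero]),
        torch.stack([zero,    zero,   one]),
    ])
    return points @ rotation_matrix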
If anyone has any tips on that I would be super grateful!
I am training a model whose output and ground truth should be binary. It's an Inception-based two-stream model: the Inception architecture is used as the encoder, and the decoder is a custom model consisting of conv layers, batch normalization, upsampling and tanh as the nonlinearity. I have also tried ReLU, but still no result.
The model initializes at different values but is not updating. My model's forward function is:
def forward(self, inp):
# Preprocessing
out = self.conv3d_1a_7x7(inp)
skip1 = out
out = self.maxPool3d_2a_3x3(out)
out = self.dropout(out)
out = self.conv3d_2b_1x1(out)
out = self.conv3d_2c_3x3(out)
out = self.maxPool3d_3a_3x3(out)
out = self.dropout(out)
out = self.mixed_3b(out)
skip2 = out
out = self.mixed_3c(out)
out = self.maxPool3d_4a_3x3(out)
out = self.dropout(out)
out = self.mixed_4b(out)
out = self.mixed_4c(out)
out = self.dropout(out)
out = self.mixed_4d(out)
skip3 = out
out = self.dropout(out)
out = self.mixed_4e(out)
out = self.mixed_4f(out)
out = self.maxPool3d_5a_2x2(out)
out = self.dropout(out)
out = self.mixed_5b(out)
out = self.mixed_5c(out)
out = self.dropout(out)
out = self.tconv6(out, skip1,skip2,skip3)
out = self.sigmoid(out)
print("Before permutation", out.shape)
out = out.permute(0,1,3,4,2)
out_logits = out
return out, out_logits
My train function is:
misc,out_logits[stream] = models[stream](data[stream])
out_softmax = torch.nn.functional.softmax(out_logits[stream], 1).requires_grad_()
val, preds = torch.max(out_logits[stream].data, 1)
preds = preds.to(device, dtype=torch.float)
gt = torch.round(gt)
gt_avg = torch.mean(gt)
gt[gt>gt_avg] = 1
gt[gt<=gt_avg] = 0
out_logits[stream] = out_logits[stream].squeeze(1)
losses[stream] = criterion(preds.cpu(), gt.cpu()).requires_grad_()
if phase == 'train':
optimizers[stream].zero_grad()
losses[stream].backward(retain_graph=True)
optimizers[stream].step()
running_losses[stream] += losses[stream].item() * data[stream].shape[0]
running_corrects[stream] += torch.sum(val.cpu() == gt_c.data.cpu()).item()
correct_t = torch.sum(preds==gt_c).item()
total_t = gt_c.shape[0]*gt_c.shape[1]*gt_c.shape[2]*gt_c.shape[3]
acc_epc = 100*correct_t/total_t
for scheduler in schedulers.values():
scheduler.step()
My loss and accuracy are always constant, as shown here.
I have tried different optimizers like SGD, Adam and RMSprop. Furthermore, I have tried tuning the hyperparameters, but the model is not converging. What am I missing?
You are sending the wrong variable into the loss function if you are doing cross-entropy. Change preds to out_logits[stream]; there is also no need for .cpu() or .requires_grad_():
losses[stream] = criterion(out_logits[stream], gt)
Also, you computed preds with argmax, which is not differentiable regardless of the loss function you use.
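A tiny standalone illustration of that point (not your training code): gradients flow through the logits, while argmax produces an integer tensor that is detached from the graph.
import torch

logits = torch.randn(4, 3, requires_grad=True)   # raw model outputs
target = torch.tensor([0, 2, 1, 0])

loss = torch.nn.functional.cross_entropy(logits, target)
loss.backward()
print(logits.grad is not None)   # True: the loss is differentiable w.r.t. the logits

preds = logits.argmax(dim=1)     # integer tensor, no grad_fn
print(preds.requires_grad)       # False: argmax breaks the graph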
I am trying to use TensorFlow Addons' MultiOptimizer for discriminative layer training (different learning rates for different layers), but it does not work with the ReduceLROnPlateau callback.
from tensorflow.keras.callbacks import ReduceLROnPlateau
reduce_lr = ReduceLROnPlateau(patience=5, min_delta=1e-4, min_lr=1e-7, verbose=0)
with tpu_strategy.scope():
roberta_model = create_model(512)
optimizers = [
AdamWeightDecay(learning_rate=0.00001, weight_decay_rate=0.00001),
AdamWeightDecay(learning_rate=0.0001, weight_decay_rate=0.0001)
]
# specifying the optimizers and layers in which it will operate
optimizers_and_layers = [
(optimizers[0], roberta_model.layers[:3]),
(optimizers[1], roberta_model.layers[3:])
]
# Using Multi Optimizer from Tensorflow Addons
opt = tfa.optimizers.MultiOptimizer(optimizers_and_layers)
roberta_model.compile(optimizer=opt,
loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1), metrics=["accuracy"])
history=roberta_model.fit(train, epochs=50, validation_data=val, callbacks=[reduce_lr])
At the end of the first epoch it produces this error:
AttributeError: 'MultiOptimizer' object has no attribute 'lr'
It works fine without the ReduceLROnPlateau callback.
I tried several things to solve this; my last attempt was to modify the callback and write my own reduce-learning-rate-on-plateau callback, but this is far beyond my coding skills. I have commented where I made a couple of changes to the original callback. I tried it like this:
class My_ReduceLROnPlateau(tf.keras.callbacks.Callback):
def __init__(self,
monitor='val_loss',
factor=0.1,
patience=10,
verbose=0,
mode='auto',
min_delta=1e-4,
cooldown=0,
min_lr=0,
**kwargs):
super(My_ReduceLROnPlateau, self).__init__()
self.monitor = monitor
if factor >= 1.0:
raise ValueError(
f'ReduceLROnPlateau does not support a factor >= 1.0. Got {factor}')
if 'epsilon' in kwargs:
min_delta = kwargs.pop('epsilon')
logging.warning('`epsilon` argument is deprecated and '
'will be removed, use `min_delta` instead.')
self.factor = factor
self.min_lr = min_lr
self.min_delta = min_delta
self.patience = patience
self.verbose = verbose
self.cooldown = cooldown
self.cooldown_counter = 0 # Cooldown counter.
self.wait = 0
self.best = 0
self.mode = mode
self.monitor_op = None
self._reset()
def _reset(self):
"""Resets wait counter and cooldown counter.
"""
if self.mode not in ['auto', 'min', 'max']:
logging.warning('Learning rate reduction mode %s is unknown, '
'fallback to auto mode.', self.mode)
self.mode = 'auto'
if (self.mode == 'min' or
(self.mode == 'auto' and 'acc' not in self.monitor)):
self.monitor_op = lambda a, b: np.less(a, b - self.min_delta)
self.best = np.Inf
else:
self.monitor_op = lambda a, b: np.greater(a, b + self.min_delta)
self.best = -np.Inf
self.cooldown_counter = 0
self.wait = 0
def on_train_begin(self, logs=None):
self._reset()
def on_epoch_end(self, epoch, logs=None):
logs = logs or {}
logs['lr'] = backend.get_value(self.model.optimizer[1].lr)
current = logs.get(self.monitor)
if current is None:
logging.warning('Learning rate reduction is conditioned on metric `%s` '
'which is not available. Available metrics are: %s',
self.monitor, ','.join(list(logs.keys())))
else:
if self.in_cooldown():
self.cooldown_counter -= 1
self.wait = 0
if self.monitor_op(current, self.best):
self.best = current
self.wait = 0
elif not self.in_cooldown():
self.wait += 1
if self.wait >= self.patience:
# Here below i tried to subscript the self.model.optimizer
#, guessing that each pointed to one of the optimzers.
# And using the same code as in the original ReduceLROnPlateau to
# update the optimizers.
old_lr1 = backend.get_value(self.model.optimizer[1].lr)
old_lr0 = backend.get_value(self.model.optimizer[0].lr)
if old_lr1 > np.float32(self.min_lr):
new_lr1 = old_lr1 * self.factor
new_lr1 = max(new_lr1, self.min_lr)
backend.set_value(self.model.optimizer[1].lr, new_lr1)
new_lr0 = old_lr0 * self.factor
new_lr0 = max(new_lr0, self.min_lr)
backend.set_value(self.model.optimizer[0].lr, new_lr0)
if self.verbose > 0:
io_utils.print_msg(
f'\nEpoch {epoch +1}: '
f'ReduceLROnPlateau reducing learning rate to {new_lr0} and {new_lr1}.')
self.cooldown_counter = self.cooldown
self.wait = 0
def in_cooldown(self):
return self.cooldown_counter > 0
Then I created the callback
reduce_lr = My_ReduceLROnPlateau(patience=5, min_delta=1e-4, min_lr=1e-7, verbose=0)
and started training again. At the end of the first epoch I got the following error:
TypeError: 'MultiOptimizer' object is not subscriptable
i.e. you can't do self.model.optimizer[1] or self.model.optimizer[0].
So my question is how to solve this, i.e. how to use discriminative layer training together with ReduceLROnPlateau,
either via some other method or by modifying my attempt at creating a new callback class.
Here is a link to the original ReduceLROnPlateau callback, i.e. without the few changes I made above in my custom callback.
A solution might be possible using this note from the documentation:
Note: Currently, tfa.optimizers.MultiOptimizer does not support callbacks that modify optimizers. However, you can instantiate optimizer layer pairs with tf.keras.optimizers.schedules.LearningRateSchedule instead of a static learning rate
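If I understand that note correctly, the optimizer/layer pairs would be built with a schedule instead of a fixed rate; the following is only my guess at what that looks like (ExponentialDecay chosen arbitrarily, plain Adam instead of AdamWeightDecay), not something I have tested:
lr_slow = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-5, decay_steps=1000, decay_rate=0.9)
lr_fast = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-4, decay_steps=1000, decay_rate=0.9)

optimizers_and_layers = [
    (tf.keras.optimizers.Adam(learning_rate=lr_slow), roberta_model.layers[:3]),
    (tf.keras.optimizers.Adam(learning_rate=lr_fast), roberta_model.layers[3:]),
]
opt = tfa.optimizers.MultiOptimizer(optimizers_and_layers)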
Looking at the code of tfa.optimizers.MultiOptimizer (in the method create_optimizer_spec), it seems that the optimizers can be accessed via
self.model.optimizer.optimizer_specs[0]["optimizer"] and self.model.optimizer.optimizer_specs[1]["optimizer"] to change the learning rate (which is why self.model.optimizer[1] raises an error).
With that change, your custom callback seems to work.
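For example, inside on_epoch_end of your custom callback the learning-rate update would look roughly like this; it is a sketch that reuses your existing names (backend, self.factor, self.min_lr) rather than a tested drop-in:
old_lr0 = backend.get_value(self.model.optimizer.optimizer_specs[0]["optimizer"].lr)
old_lr1 = backend.get_value(self.model.optimizer.optimizer_specs[1]["optimizer"].lr)
if old_lr1 > np.float32(self.min_lr):
    backend.set_value(self.model.optimizer.optimizer_specs[0]["optimizer"].lr,
                      max(old_lr0 * self.factor, self.min_lr))
    backend.set_value(self.model.optimizer.optimizer_specs[1]["optimizer"].lr,
                      max(old_lr1 * self.factor, self.min_lr))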
I am trying to log the loss of the training set and validation set every X batches. I wrote a Keras callback based on TensorFlow 2 that calculates the values, but I cannot figure out how to put them on the same graph. Here is what I have done:
Used two different summary writers
Used tf.summary.scalar
Used the same tag name 'loss'
def __init__(self, log_dir, train_data, validation_data, calculation_freq, num_train_batches, num_validation_batches):
self.batch = 0
self.train_data = train_data
self.validation_data = validation_data
self.calc_freq = calculation_freq
self.num_train_batches = num_train_batches
self.num_validation_batches = num_validation_batches
self.log_dir = log_dir
self.model = model
self.eval_validation = validation_data is not None
train_tensor_board_path = self.log_dir + '_train'
if not os.path.exists(train_tensor_board_path):
os.makedirs(train_tensor_board_path)
self.train_writer = tf.summary.create_file_writer(train_tensor_board_path)
self.train_writer.set_as_default()
if self.eval_validation:
validation_tensor_board_path = self.log_dir + '_validation'
if not os.path.exists(validation_tensor_board_path):
os.makedirs(validation_tensor_board_path)
self.validation_writer = tf.summary.create_file_writer(validation_tensor_board_path)
self.validation_writer.set_as_default()
def on_batch_end(self, batch, logs={}):
print('Batch number:', self.batch)
if self.batch % self.calc_freq == 0:
train_loss = self.model.evaluate(train_data, steps=num_train_batches)
tf.summary.scalar('loss', float(train_loss), step=self.batch)
self.train_writer.flush()
if self.eval_validation:
validation_loss = self.model.evaluate(validation_data, steps=num_validation_batches)
tf.summary.scalar('loss', float(validation_loss), step=self.batch)
self.validation_writer.flush()
self.batch += 1
print('Write to tensorboard')
def on_train_end(self, _):
self.train_writer.close()
if self.eval_validation:
self.validation_writer.close()
Since writes go to the current default summary writer [1] and each summary point is associated with an integer step value, try switching the default summary writer for each phase (train or validation) using with ... .as_default():
def on_batch_end(self, batch, logs={}):
print('Batch number:', self.batch)
if self.batch % self.calc_freq == 0:
train_loss = self.model.evaluate(train_data, steps=num_train_batches)
with self.train_writer.as_default(): # current default summary
tf.summary.scalar('loss', float(train_loss), step=self.batch)
self.train_writer.flush()
if self.eval_validation:
validation_loss = self.model.evaluate(validation_data, steps=num_validation_batches)
with self.validation_writer.as_default(): # current default summary
tf.summary.scalar('loss', float(validation_loss), step=self.batch)
self.validation_writer.flush()
self.batch += 1
print('Write to tensorboard')