TensorFlow 2: loss using hidden layers' output - Python

I am trying to implement the OSME MAMC model described in the article https://arxiv.org/abs/1806.05372.
I'm stuck where I have to add a cost that doesn't depend on y_true and y_pred, but on the hidden layers and y_true.
It can't be written as a standard TensorFlow custom loss, since those only receive y_true and y_pred.
I wrote the model as a class, then tried to use a gradient tape to add the NPairLoss to the softmax output loss, but the gradient is NaN during training.
I think my approach isn't good, but I have no idea how to design / write it.
Here is my model:
class OSME_network(tf.keras.Model):
    def __init__(self, nbrclass=10, weight="imagenet", input_tensor=(32, 32, 3)):
        super(OSME_network, self).__init__()
        self.nbrclass = nbrclass
        self.weight = weight
        self.input_tensor = input_tensor
        self.Resnet_50 = ResNet50(include_top=False, weights=self.weight, input_shape=self.input_tensor)
        self.Resnet_50.trainable = False
        self.split = Lambda(lambda x: tf.split(x, num_or_size_splits=2, axis=-1))
        self.s_1 = OSME_Layer(ch=1024, ratio=16)
        self.s_2 = OSME_Layer(ch=1024, ratio=16)
        self.fl1 = tf.keras.layers.Flatten()
        self.fl2 = tf.keras.layers.Flatten()
        self.d1 = tf.keras.layers.Dense(1024, name='fc1')
        self.d2 = tf.keras.layers.Dense(1024, name='fc2')
        self.fc = Concatenate()
        self.preds = tf.keras.layers.Dense(self.nbrclass, activation='softmax')

    #tf.function
    def call(self, x):  # builds the model sequentially
        x = self.Resnet_50(x)
        x_1, x_2 = self.split(x)
        xx_1 = self.s_1(x_1)
        xx_2 = self.s_2(x_2)
        xxx_1 = self.d1(xx_1)
        xxx_2 = self.d2(xx_2)
        xxxx_1 = self.fl1(xxx_1)
        xxxx_2 = self.fl2(xxx_2)
        fc = self.fc([xxxx_1, xxxx_2])  # fc1 + fc2
        ret = self.preds(fc)
        return xxxx_1, xxxx_2, ret
class OSME_Layer(tf.keras.layers.Layer):
    def __init__(self, ch, ratio):
        super(OSME_Layer, self).__init__()
        self.GloAvePool2D = GlobalAveragePooling2D()
        self.Dense1 = Dense(ch // ratio, activation='relu')
        self.Dense2 = Dense(ch, activation='sigmoid')
        self.Mult = Multiply()
        self.ch = ch

    def call(self, inputs):
        squeeze = self.GloAvePool2D(inputs)
        se_shape = (1, 1, self.ch)
        se = Reshape(se_shape)(squeeze)
        excitation = self.Dense1(se)
        excitation = self.Dense2(excitation)
        scale = self.Mult([inputs, excitation])
        return scale
class NPairLoss():
    def __init__(self):
        self._inputs = None
        self._y = None

    #tf.function
    def __call__(self, inputs, y):
        targets = tf.argmax(y, axis=1)
        b, p, _ = inputs.shape
        n = b * p
        inputs = tf.reshape(inputs, [n, -1])
        targets = tf.repeat(targets, repeats=p)
        parts = tf.tile(tf.range(p), [b])
        prod = tf.linalg.matmul(inputs, inputs, transpose_a=False, transpose_b=True)
        same_class_mask = tf.math.equal(tf.broadcast_to(targets, [n, n]), tf.transpose(tf.broadcast_to(targets, (n, n))))
        same_atten_mask = tf.math.equal(tf.broadcast_to(parts, [n, n]), tf.transpose(tf.broadcast_to(parts, (n, n))))
        s_sasc = same_class_mask & same_atten_mask
        s_sadc = (~same_class_mask) & same_atten_mask
        s_dasc = same_class_mask & (~same_atten_mask)
        s_dadc = (~same_class_mask) & (~same_atten_mask)
        loss_sasc = 0
        loss_sadc = 0
        loss_dasc = 0
        for i in range(n):
            # loss_sasc
            pos = prod[i][s_sasc[i]]
            neg = prod[i][s_sadc[i] | s_dasc[i] | s_dadc[i]]
            n_pos = tf.shape(pos)[0]
            n_neg = tf.shape(neg)[0]
            pos = tf.transpose(tf.broadcast_to(pos, [n_neg, n_pos]))
            neg = tf.broadcast_to(neg, [n_pos, n_neg])
            exp = tf.clip_by_value(tf.math.exp(neg - pos), clip_value_min=0, clip_value_max=9e6)  # need to clip the value, else inf
            loss_sasc += tf.reduce_sum(tf.math.log(1 + tf.reduce_sum(exp, axis=1)))
            # loss_sadc
            pos = prod[i][s_sadc[i]]
            neg = prod[i][s_dadc[i]]
            n_pos = tf.shape(pos)[0]
            n_neg = tf.shape(neg)[0]
            pos = tf.transpose(tf.broadcast_to(pos, [n_neg, n_pos]))
            neg = tf.broadcast_to(neg, [n_pos, n_neg])
            exp = tf.clip_by_value(tf.math.exp(neg - pos), clip_value_min=0, clip_value_max=9e6)
            loss_sadc += tf.reduce_sum(tf.math.log(1 + tf.reduce_sum(exp, axis=1)))
            # loss_dasc
            pos = prod[i][s_dasc[i]]
            neg = prod[i][s_dadc[i]]
            n_pos = tf.shape(pos)[0]
            n_neg = tf.shape(neg)[0]
            pos = tf.transpose(tf.broadcast_to(pos, [n_neg, n_pos]))
            neg = tf.broadcast_to(neg, [n_pos, n_neg])
            exp = tf.clip_by_value(tf.math.exp(neg - pos), clip_value_min=0, clip_value_max=9e6)
            loss_dasc += tf.reduce_sum(tf.math.log(1 + tf.reduce_sum(exp, axis=1)))
        return (loss_sasc + loss_sadc + loss_dasc) / n
Then, for training:
#tf.function
def train_step(x, y):
    with tf.GradientTape() as tape:
        fc1, fc2, y_pred = model(x, training=True)
        stacked = tf.stack([fc1, fc2], axis=1)
        layerLoss = npair(stacked, y)
        loss = cce(y, y_pred) + 0.001 * layerLoss
    grads = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    return loss
model = OSME_network(weight="imagenet", nbrclass=10, input_tensor=(32, 32, 3))
model.compile(optimizer=opt, loss=categorical_crossentropy, metrics=["acc"])
model.build(input_shape=(None, 32, 32, 3))
cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True, name='categorical_crossentropy')
npair = NPairLoss()
Then, for each batch:
x = tf.Variable(x_train[start:end])
y = tf.Variable(y_train[start:end])
train_loss = train_step(x, y)
Thanks for any help :)

You can use TensorFlow's add_loss.
Loss functions passed to model.compile() in TensorFlow always take two parameters, y_true and y_pred. model.add_loss() has no such restriction and allows you to write much more complex losses that depend on many other tensors, but it has the inconvenience of being more dependent on the model, whereas the standard loss functions work with just any model.
You can find the official documentation of add_loss here. It adds loss tensor(s), potentially dependent on layer inputs. This method can be used inside a subclassed layer or model's call function, in which case losses should be a Tensor or a list of Tensors. There are a few examples in the documentation that explain add_loss.
This method can also be called directly on a Functional Model during construction. In this case, any loss Tensors passed to this Model must be symbolic and be able to be traced back to the model's Inputs. These losses become part of the model's topology and are tracked in get_config.
Example:
inputs = tf.keras.Input(shape=(10,))
x = tf.keras.layers.Dense(10)(inputs)
outputs = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(inputs, outputs)
# Activity regularization.
model.add_loss(tf.abs(tf.reduce_mean(x)))
You can call self.add_loss(loss_value) from inside the call method of a custom layer. Here's a simple example that adds activity regularization.
Example:
class ActivityRegularizationLayer(layers.Layer):
    def call(self, inputs):
        self.add_loss(tf.reduce_sum(inputs) * 0.1)
        return inputs  # Pass-through layer.

inputs = keras.Input(shape=(784,), name='digits')
x = layers.Dense(64, activation='relu', name='dense_1')(inputs)
# Insert activity regularization as a layer
x = ActivityRegularizationLayer()(x)
x = layers.Dense(64, activation='relu', name='dense_2')(x)
outputs = layers.Dense(10, name='predictions')(x)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=1e-3),
              loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True))
# The displayed loss will be much higher than before
# due to the regularization component.
model.fit(x_train, y_train,
          batch_size=64,
          epochs=1)
You can find good examples using add_loss here and here, with explanations.
Hope this answers your question. Happy Learning.
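For reference, here is a hedged, untested sketch of how add_loss could be wired into the OSME model from the question: the labels are passed in as a second input so that the N-pair term, which depends on hidden activations and y_true, can be registered inside call. OSMEWithNPair is a hypothetical wrapper name, not part of the question's code.
class OSMEWithNPair(OSME_network):
    """Hypothetical wrapper (untested sketch): labels are fed as a second
    input so the N-pair term, which needs hidden activations and y_true,
    can be registered with add_loss instead of a custom training loop."""

    def __init__(self, npair, npair_weight=0.001, **kwargs):
        super().__init__(**kwargs)
        self.npair = npair
        self.npair_weight = npair_weight

    def call(self, inputs):
        x, y = inputs  # images and their one-hot labels
        fc1, fc2, preds = super().call(x)
        stacked = tf.stack([fc1, fc2], axis=1)
        # The extra term becomes part of the model's total loss, so a plain
        # categorical cross-entropy can be passed to model.compile().
        self.add_loss(self.npair_weight * self.npair(stacked, y))
        return preds
Training would then use model.fit([x_train, y_train], y_train, ...) so the labels reach both the compiled loss and the add_loss term.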

Related

How to apply a loss metric that will penalize predicting all zeros in a multilabel classification problem?

Say I have a classification problem that has 30 potential binary labels. These labels are not mutually exclusive. The labels tend to be sparse: there is, on average, about 1 positive label out of all 30, but sometimes more than 1. In the following code, how can I penalize the model for predicting all zeros? The accuracy will be high, but recall will be awful!
import numpy as np
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

OUTPUT_NODES = 30
np.random.seed(0)

def get_dataset():
    """
    Get a dataset of X and y. This is a learnable problem as there is some signal in the features. 10% of the time, a
    positive-output's index will also have a positive feature for that index
    :return: X and y data for training
    """
    n_observations = 30000
    y = np.random.rand(n_observations, OUTPUT_NODES)
    y = (y <= (1 / OUTPUT_NODES)).astype(int)  # Makes a sparse output where there is roughly 1 positive label: ((1 / OUTPUT_NODES) * OUTPUT_NODES ≈ 1)
    X = np.zeros((n_observations, OUTPUT_NODES))
    for i in range(len(y)):
        for j, feature in enumerate(y[i]):
            if feature == 1:
                X[i][j] = 1 if np.random.rand(1) > 0.9 else 0  # Makes the input features more noisy
                # X[i][j] = 1  # Using this instead will make the model perform very well
    return X, y

def create_model():
    input_layer = Input(shape=(OUTPUT_NODES, ))
    dense1 = Dense(100, activation='relu')(input_layer)
    dense2 = Dense(100, activation='relu')(dense1)
    output_layer = Dense(30, activation='sigmoid')(dense2)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['Recall'])
    return model

def main():
    X, y = get_dataset()
    model = create_model()
    model.fit(X, y, epochs=10, batch_size=10)
    X_pred = np.random.randint(0, 2, (100, OUTPUT_NODES))
    y_pred = model.predict(X_pred)
    print(X_pred)
    print(y_pred.round(1))

if __name__ == '__main__':
    main()
I believe I read here that I could use weighted_cross_entropy_with_logits
to address this issue. How would that affect my final output layer's activation functions? Would I have to have an activation function? How do I specify a penalty to misclassifications of a true positive class?
Ok, it is an interesting problem.
First you need to define a weighted cross entropy loss wrapper:
def wce_logits(positive_class_weight=1.):
    def mylossw(y_true, logits):
        cross_entropy = tf.reduce_mean(
            tf.nn.weighted_cross_entropy_with_logits(
                logits=logits,
                labels=tf.cast(y_true, dtype=tf.float32),
                pos_weight=positive_class_weight))
        return cross_entropy
    return mylossw
The positive_class_weight is applied to the positive-class data. You need this wrapper around tf.nn.weighted_cross_entropy_with_logits to get a loss function that takes y_true and y_pred (only) as inputs.
Note that you must cast y_true to float32.
Second, you cannot use the predefined Recall metric, because it does not work with logits. I found a workaround in this discussion:
class Recall(tf.keras.metrics.Recall):
    def __init__(self, from_logits=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._from_logits = from_logits

    def update_state(self, y_true, y_pred, sample_weight=None):
        if self._from_logits:
            super(Recall, self).update_state(y_true, tf.nn.sigmoid(y_pred), sample_weight)
        else:
            super(Recall, self).update_state(y_true, y_pred, sample_weight)
Finally, you need to remove the sigmoid activation from the last layer, since you are now working with logits:
def create_model():
    input_layer = Input(shape=(OUTPUT_NODES, ))
    dense1 = Dense(100, activation='relu')(input_layer)
    dense2 = Dense(100, activation='relu')(dense1)
    output_layer = Dense(30)(dense2)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss=wce_logits(positive_class_weight=27.), metrics=[Recall(from_logits=True)])
    return model
Note that the positive weight is set to 27 here. You can read a discussion on how to correctly calculate the weight.
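For reference, a common heuristic for choosing pos_weight (my own sketch, not from the discussion linked above) is the ratio of negative to positive labels in the training set:
import numpy as np

def estimate_pos_weight(y):
    """Heuristic: ratio of negative to positive labels in the dataset.
    With roughly 1 positive out of 30 labels this gives about
    (30 - 1) / 1 = 29, close to the 27 used above."""
    n_pos = y.sum()
    n_neg = y.size - n_pos
    return n_neg / max(n_pos, 1)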

Why might a PyTorch optimizer fail to update its parameters?

I am trying to do a simple loss-minimization for a specific variable coeff using PyTorch optimizers. This variable is supposed to be used as an interpolation coefficient for two vectors w_foo and w_bar to find a third vector, w_target.
w_target = w_foo + coeff * (w_bar - w_foo)
With w_foo and w_bar set as constant, at each optimization step I calculate w_target for the given coeff. Loss is determined from w_target using a fairly complex process beyond the scope of this question.
# w_foo.shape = [1, 16, 512]
# w_bar.shape = [1, 16, 512]
# num_layers = 16
# num_steps = 10000
vgg_loss = VGGLoss()
coeff = torch.randn([num_layers, ])
optimizer = torch.optim.Adam([coeff], lr=initial_learning_rate)

for step in range(num_steps):
    w_target = w_foo + torch.matmul(coeff, (w_bar - w_foo))
    optimizer.zero_grad()
    target_image = generator.synthesis(w_target)
    processed_target_image = process(target_image)
    loss = vgg_loss(processed_target_image, source_image)
    loss.backward()
    optimizer.step()
However, when running this, coeff does not change from one step to another, making the optimizer essentially useless. I would like to ask for some advice on what I am doing wrong here.
Edit:
As suggested, I will try to elaborate on the loss function. Essentially, w_target is used to generate an image, and VGGLoss uses VGG feature extractor to compare this synthetic image with a certain exemplar source image.
class VGGLoss(torch.nn.Module):
    def __init__(self, device, vgg):
        super().__init__()
        for param in self.parameters():
            param.requires_grad = True
        self.vgg = vgg  # VGG16 in eval mode

    def forward(self, source, target):
        loss = 0
        source_features = self.vgg(source, resize_images=False, return_lpips=True)
        target_features = self.vgg(target, resize_images=False, return_lpips=True)
        loss += (source_features - target_features).square().sum()
        return loss

GRU loss decreased to 0.9 but not further, PyTorch

Here is the code that I am using for experimenting with a GRU.
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import *

class N(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(5, 2)
        self.layers = 4
        self.gru = nn.GRU(2, 512, self.layers, batch_first=True)
        self.bat = nn.BatchNorm1d(4)
        self.bat1 = nn.BatchNorm1d(4)
        self.bat2 = nn.BatchNorm1d(4)
        self.fc = nn.Linear(512, 100)
        self.fc1 = nn.Linear(100, 100)
        self.fc2 = nn.Linear(100, 5)
        self.s = nn.Softmax(dim=-1)

    def forward(self, x):
        h0 = torch.zeros(self.layers, x.size(0), 512).requires_grad_()
        x = self.embed(x)
        x, hn = self.gru(x, h0)
        x = self.bat(x)
        x = self.fc(x)
        x = nn.functional.relu(x)
        x = self.bat1(x)
        x = self.fc1(x)
        x = nn.functional.relu(x)
        x = self.bat2(x)
        x = self.fc2(x)
        softmaxed = self.s(x)
        return softmaxed

inp = torch.tensor([[4, 3, 2, 1], [2, 3, 4, 1], [4, 1, 2, 3], [1, 2, 3, 4]])
out = torch.tensor([[3, 2, 1, 4], [3, 2, 4, 1], [1, 2, 3, 4], [2, 3, 4, 1]])
k = 0
n = N()
opt = torch.optim.Adam(n.parameters(), lr=0.0001)

while k < 10000:
    print(inp.shape)
    o = n(inp)
    o = o.view(-1, o.size(-1))
    out = out.view(-1)
    loss = nn.functional.cross_entropy(o.view(-1, o.size(-1)), out.view(-1) - 1)
    acc = ((torch.argmax(o, dim=1) == (out - 1)).sum().item() / out.size(0))
    if k == 10000:
        print(torch.argmax(o, dim=1))
        print(out - 1)
        exit()
    print(loss, acc)
    loss.backward()
    opt.step()
    opt.zero_grad()
    k += 1
print(o[0])
Shortened output:
torch.Size([4, 4])
tensor(0.9593, grad_fn=<NllLossBackward>) 0.9375
torch.Size([4, 4])
tensor(0.9593, grad_fn=<NllLossBackward>) 0.9375
tensor([4.8500e-01, 9.7813e-06, 5.1498e-01, 6.2428e-06, 7.5929e-06],
grad_fn=<SelectBackward>)
The loss is 0.9593 and accuracy reached 0.9375. Why is the GRU loss this large for such simple input data? Is there anything wrong in this code? I used cross_entropy as the loss function and Adam as the optimizer. The learning rate is 0.001; I tried multiple learning rates, but all gave the same final result. I added batch normalization, which sped up the training, but the loss and accuracy stayed the same. Why does the loss not decrease to 0.2 or so?
I think it's because you are using the cross-entropy loss function, which in PyTorch combines log-softmax and negative log-likelihood. Since your model already performs a softmax before returning the output, you actually end up calculating the negative log-likelihood for a softmax of a softmax. Try removing the final softmax from your model.
PyTorch documentation for cross entropy loss: https://pytorch.org/docs/stable/nn.functional.html#cross-entropy
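A minimal sketch of that fix for the N module above (my own illustration, untested): forward returns the raw logits, which is what nn.functional.cross_entropy expects, and softmax is applied separately only when probabilities are needed.
    def forward(self, x):
        h0 = torch.zeros(self.layers, x.size(0), 512).requires_grad_()
        x = self.embed(x)
        x, hn = self.gru(x, h0)
        x = self.bat(x)
        x = nn.functional.relu(self.fc(x))
        x = self.bat1(x)
        x = nn.functional.relu(self.fc1(x))
        x = self.bat2(x)
        return self.fc2(x)  # raw logits, no softmax

# At inference time, when probabilities are actually needed:
# probs = torch.softmax(n(inp), dim=-1)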

Inverting Gradients in Keras

I'm trying to port the BoundingLayer function from this file to the DDPG.py agent in keras-rl but I'm having some trouble with the implementation.
I modified the get_gradients(loss, params) method in DDPG.py to add this:
action_bounds = [-30, 50]
inverted_grads = []
for g, p in zip(modified_grads, params):
    is_above_upper_bound = K.greater(p, K.constant(action_bounds[1], dtype='float32'))
    is_under_lower_bound = K.less(p, K.constant(action_bounds[0], dtype='float32'))
    is_gradient_positive = K.greater(g, K.constant(0, dtype='float32'))
    is_gradient_negative = K.less(g, K.constant(0, dtype='float32'))
    invert_gradient = tf.logical_or(
        tf.logical_and(is_above_upper_bound, is_gradient_negative),
        tf.logical_and(is_under_lower_bound, is_gradient_positive)
    )
    inverted_grads.extend(K.switch(invert_gradient, -g, g))
modified_grads = inverted_grads[:]
But I get an error about the shape:
ValueError: Shape must be rank 0 but is rank 2 for 'cond/Switch' (op: 'Switch') with input shapes: [2,400], [2,400].
keras-rl "get_gradients" function uses gradients calculated with a combined actor-critic model, but you need the gradient of the critic output wrt the action input to apply the inverting gradients feature.
I've recently implemented it on a RDPG prototype I'm working on, using keras-rl. Still testing, the code can be optimized and is not bug free for sure, but I've put the inverting gradient to work by modifying some keras-rl lines of code. In order to modify the gradient of the critic output wrt the action input, I've followed the original formula to compute the actor gradient, with the help of this great post from Patrick Emami: http://pemami4911.github.io/blog/2016/08/21/ddpg-rl.html.
I'm putting here the entire "compile" function, redefined in a class that inherits from "DDPAgent", where the inverting gradient feature is implemented.
def compile(self, optimizer, metrics=[]):
    metrics += [mean_q]

    if type(optimizer) in (list, tuple):
        if len(optimizer) != 2:
            raise ValueError('More than two optimizers provided. Please only provide a maximum of two optimizers, the first one for the actor and the second one for the critic.')
        actor_optimizer, critic_optimizer = optimizer
    else:
        actor_optimizer = optimizer
        critic_optimizer = clone_optimizer(optimizer)
    if type(actor_optimizer) is str:
        actor_optimizer = optimizers.get(actor_optimizer)
    if type(critic_optimizer) is str:
        critic_optimizer = optimizers.get(critic_optimizer)
    assert actor_optimizer != critic_optimizer

    if len(metrics) == 2 and hasattr(metrics[0], '__len__') and hasattr(metrics[1], '__len__'):
        actor_metrics, critic_metrics = metrics
    else:
        actor_metrics = critic_metrics = metrics

    def clipped_error(y_true, y_pred):
        return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

    # Compile target networks. We only use them in feed-forward mode, hence we can pass any
    # optimizer and loss since we never use it anyway.
    self.target_actor = clone_model(self.actor, self.custom_model_objects)
    self.target_actor.compile(optimizer='sgd', loss='mse')
    self.target_critic = clone_model(self.critic, self.custom_model_objects)
    self.target_critic.compile(optimizer='sgd', loss='mse')

    # We also compile the actor. We never optimize the actor using Keras but instead compute
    # the policy gradient ourselves. However, we need the actor in feed-forward mode, hence
    # we also compile it with any optimizer and loss.
    self.actor.compile(optimizer='sgd', loss='mse')

    # Compile the critic.
    if self.target_model_update < 1.:
        # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
        critic_updates = get_soft_target_model_updates(self.target_critic, self.critic, self.target_model_update)
        critic_optimizer = AdditionalUpdatesOptimizer(critic_optimizer, critic_updates)
    self.critic.compile(optimizer=critic_optimizer, loss=clipped_error, metrics=critic_metrics)

    clipnorm = getattr(actor_optimizer, 'clipnorm', 0.)
    clipvalue = getattr(actor_optimizer, 'clipvalue', 0.)

    critic_gradients_wrt_action_input = tf.gradients(self.critic.output, self.critic_action_input)
    critic_gradients_wrt_action_input = [g / float(self.batch_size) for g in critic_gradients_wrt_action_input]  # since TF sums over the batch

    action_bounds = [(-1., 1.) for i in range(self.nb_actions)]

    def calculate_inverted_gradient():
        """
        Applies the "inverting gradient" feature to the action-value gradients.
        """
        gradient_wrt_action = -critic_gradients_wrt_action_input[0]
        inverted_gradients = []
        for n in range(self.batch_size):
            inverted_gradient = []
            for i in range(gradient_wrt_action[n].shape[0].value):
                action = self.critic_action_input[n][i]
                is_gradient_negative = K.less(gradient_wrt_action[n][i], K.constant(0, dtype='float32'))
                adjust_for_upper_bound = gradient_wrt_action[n][i] * ((action_bounds[i][1] - action) / (action_bounds[i][1] - action_bounds[i][0]))
                adjust_for_lower_bound = gradient_wrt_action[n][i] * ((action - action_bounds[i][0]) / (action_bounds[i][1] - action_bounds[i][0]))
                modified_gradient = K.switch(is_gradient_negative, adjust_for_upper_bound, adjust_for_lower_bound)
                inverted_gradient.append(modified_gradient)
            inverted_gradients.append(inverted_gradient)
        gradient_wrt_action = tf.stack(inverted_gradients)
        return gradient_wrt_action

    actor_gradients_wrt_weights = tf.gradients(self.actor.output, self.actor.trainable_weights, grad_ys=calculate_inverted_gradient())
    actor_gradients_wrt_weights = [g / float(self.batch_size) for g in actor_gradients_wrt_weights]  # since TF sums over the batch

    def get_gradients(loss, params):
        """ Used by the actor optimizer.
        Returns the gradients to train the actor.
        These gradients are obtained by multiplying the gradients of the actor output w.r.t. its weights
        with the gradients of the critic output w.r.t. its action input. """
        # Apply clipping if defined
        modified_grads = [g for g in actor_gradients_wrt_weights]
        if clipnorm > 0.:
            norm = K.sqrt(sum([K.sum(K.square(g)) for g in modified_grads]))
            modified_grads = [optimizers.clip_norm(g, clipnorm, norm) for g in modified_grads]
        if clipvalue > 0.:
            modified_grads = [K.clip(g, -clipvalue, clipvalue) for g in modified_grads]
        return modified_grads

    actor_optimizer.get_gradients = get_gradients

    # get_updates is the optimizer function that changes the weights of the network
    updates = actor_optimizer.get_updates(self.actor.trainable_weights, self.actor.constraints, None)

    if self.target_model_update < 1.:
        # Include soft target model updates.
        updates += get_soft_target_model_updates(self.target_actor, self.actor, self.target_model_update)
    updates += self.actor.updates  # include other updates of the actor, e.g. for BN

    # Finally, combine it all into a callable function.
    # The inputs will be all the necessary placeholders to compute the gradients (actor and critic inputs)
    inputs = self.actor.inputs[:] + [self.critic_action_input, self.critic_history_input]
    self.actor_train_fn = K.function(inputs, [self.actor.output], updates=updates)
    self.actor_optimizer = actor_optimizer
    self.compiled = True
When training the actor, you should now pass 3 inputs instead of 2: the observation inputs + the action input (with a prediction from the actor network), so you must also modify the "backward" function. In my case:
...
if self.episode > self.nb_steps_warmup_actor:
    action = self.actor.predict_on_batch(history_batch)
    inputs = [history_batch, action, history_batch]
    actor_train_result = self.actor_train_fn(inputs)
    action_values = actor_train_result[0]
    assert action_values.shape == (self.batch_size, self.nb_actions)
...
After that you can have your actor with a linear activation in the output.
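For reference, the inverting-gradients rule itself is small. Here is a minimal NumPy sketch (my own illustration, separate from the keras-rl code above), written with the gradient-ascent convention where a positive gradient pushes the action up:
import numpy as np

def invert_gradients(grads, actions, low, high):
    """Scale each gradient component by the remaining headroom toward the
    bound it is pushing the action into, so actions drifting outside
    [low, high] get pushed back. grads/actions: shape (batch, n_actions)."""
    width = high - low
    up_room = (high - actions) / width    # room left below the upper bound
    down_room = (actions - low) / width   # room left above the lower bound
    return np.where(grads >= 0, grads * up_room, grads * down_room)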

How do I correctly implement a custom activity regularizer in Keras?

I am trying to implement sparse autoencoders according to Andrew Ng's lecture notes as shown here.
It requires that a sparsity constraint be applied on an autoencoder layer by introducing a penalty term (the K-L divergence). I tried to implement this using the directions provided here, after some minor changes.
Here are the K-L divergence and the sparsity penalty term, implemented by the SparseActivityRegularizer class as shown below.
def kl_divergence(p, p_hat):
    return (p * K.log(p / p_hat)) + ((1-p) * K.log((1-p) / (1-p_hat)))

class SparseActivityRegularizer(Regularizer):
    sparsityBeta = None

    def __init__(self, l1=0., l2=0., p=-0.9, sparsityBeta=0.1):
        self.p = p
        self.sparsityBeta = sparsityBeta

    def set_layer(self, layer):
        self.layer = layer

    def __call__(self, loss):
        # p_hat needs to be the average activation of the units in the hidden layer.
        p_hat = T.sum(T.mean(self.layer.get_output(True), axis=0))
        loss += self.sparsityBeta * kl_divergence(self.p, p_hat)
        return loss

    def get_config(self):
        return {"name": self.__class__.__name__,
                "p": self.l1}
The model was built like so:
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')

autoencoder = Sequential()
encoder = containers.Sequential([Dense(250, input_dim=576, init='glorot_uniform', activation='tanh',
    activity_regularizer=SparseActivityRegularizer(p=-0.9, sparsityBeta=0.1))])
decoder = containers.Sequential([Dense(576, input_dim=250)])
autoencoder.add(AutoEncoder(encoder=encoder, decoder=decoder, output_reconstruction=True))
autoencoder.layers[0].build()
autoencoder.compile(loss='mse', optimizer=SGD(lr=0.001, momentum=0.9, nesterov=True))
loss = autoencoder.fit(X_train_tmp, X_train_tmp, nb_epoch=200, batch_size=800, verbose=True, show_accuracy=True, validation_split=0.3)
autoencoder.save_weights('SparseAutoEncoder.h5', overwrite=True)
result = autoencoder.predict(X_test)
When I call the fit() function I get negative loss values, and the output does not resemble the input at all. I want to know where I am going wrong. What is the correct way to calculate the average activation of a layer and use it in a custom sparsity regularizer? Any sort of help will be greatly appreciated. Thanks!
I am using Keras 0.3.1 with Python 2.7, as the latest Keras (1.0.1) build does not have the Autoencoder layer.
You have defined self.p = -0.9 instead of the 0.05 value that both the original poster and the lecture notes you referred to are using.
I corrected some errors:
class SparseRegularizer(keras.regularizers.Regularizer):

    def __init__(self, rho=0.01, beta=1):
        """
        rho  : Desired average activation of the hidden units
        beta : Weight of the sparsity penalty term
        """
        self.rho = rho
        self.beta = beta

    def __call__(self, activation):
        rho = self.rho
        beta = self.beta
        # sigmoid because we need the probability distributions
        activation = tf.nn.sigmoid(activation)
        # average over the batch samples
        rho_bar = K.mean(activation, axis=0)
        # Avoid division by 0
        rho_bar = K.maximum(rho_bar, 1e-10)
        KLs = rho * K.log(rho / rho_bar) + (1 - rho) * K.log((1 - rho) / (1 - rho_bar))
        return beta * K.sum(KLs)  # sum over the layer units

    def get_config(self):
        return {
            'rho': self.rho,
            'beta': self.beta
        }
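A hedged usage sketch (my own, untested): since the regularizer applies a sigmoid internally, the encoder layer below is left without an activation, and the sizes simply mirror the autoencoder in the question.
import tensorflow as tf
from tensorflow import keras

# Hypothetical usage of the SparseRegularizer above. The encoder layer has
# no activation because the regularizer applies a sigmoid to its input
# itself; the sizes (576 -> 250 -> 576) mirror the question's model.
inputs = keras.Input(shape=(576,))
encoded = keras.layers.Dense(
    250,
    activity_regularizer=SparseRegularizer(rho=0.05, beta=3))(inputs)
decoded = keras.layers.Dense(576)(encoded)
autoencoder = keras.Model(inputs, decoded)
autoencoder.compile(optimizer='adam', loss='mse')
# autoencoder.fit(X_train, X_train, epochs=200, batch_size=800)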
