keras custom activity regularizer - python

def kl_divergence(p, p_hat):
    return (p * K.log(p / p_hat)) + ((1 - p) * K.log((1 - p) / (1 - p_hat)))

class SparseActivityRegularizer(Regularizer):
    sparsityBeta = None

    def __init__(self, l1=0., l2=0., p=0.01, sparsityBeta=0.1):
        self.p = p
        self.sparsityBeta = sparsityBeta

    def set_layer(self, layer):
        self.layer = layer

    def __call__(self, loss):
        # p_hat needs to be the average activation of the units in the hidden layer.
        p_hat = T.sum(T.mean(self.layer.get_output(True), axis=0))
        loss += self.sparsityBeta * kl_divergence(self.p, p_hat)
        return loss

    def get_config(self):
        return {"name": self.__class__.__name__,
                "p": self.l1}
When I call this custom regularizer in the model as shown below,
dr = 0.5
inputs = Input(shape=(392,))
x = Dense(1000, activation='relu', activity_regularizer=SparseActivityRegularizer())(inputs)
x = Dropout(dr)(x)
out = Dense(392, activation='sigmoid')(x)
model = Model(inputs=inputs, outputs=out)
model.compile(loss=euc_dist_keras,
              optimizer='adadelta', metrics=["accuracy"])
model.summary()

filepath = "weightdae.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
callbacks_list = [checkpoint, TensorBoard(log_dir='/tmp/autoencoder')]
hist = model.fit(ptilde, p,
                 nb_epoch=40,
                 shuffle=True,
                 validation_data=(ptilde_val, p_val),
                 batch_size=32,
                 callbacks=callbacks_list)
I get the following error:
AttributeError: 'SparseActivityRegularizer' object has no attribute 'layer'
Can someone please help me solve this error?
I have checked the implementation of the regularizer, and activity regularizers in Keras have been implemented in the same way. But here it somehow cannot find the attribute 'layer' and throws this error.

This type of regularizer declaration was deprecated here. Since Keras 1.2.0, you must implement regularization as a function (like here) or as a callable class (like here).

try this:
class SparseRegularizer(keras.regularizers.Regularizer):

    def __init__(self, rho=0.01, beta=1):
        """
        rho  : Desired average activation of the hidden units
        beta : Weight of sparsity penalty term
        """
        self.rho = rho
        self.beta = beta

    def __call__(self, activation):
        rho = self.rho
        beta = self.beta
        # sigmoid because we need the probability distributions
        activation = tf.nn.sigmoid(activation)
        # average over the batch samples
        rho_bar = K.mean(activation, axis=0)
        # avoid division by 0
        rho_bar = K.maximum(rho_bar, 1e-10)
        KLs = rho * K.log(rho / rho_bar) + (1 - rho) * K.log((1 - rho) / (1 - rho_bar))
        return beta * K.sum(KLs)  # sum over the layer units

    def get_config(self):
        return {
            'rho': self.rho,
            'beta': self.beta
        }
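A minimal usage sketch (my own example, assuming the usual tf.keras imports; 'mse' just stands in for the question's custom euc_dist_keras loss):

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model

inputs = Input(shape=(392,))
# the regularizer is passed to the layer the same way as before
x = Dense(1000, activation='relu',
          activity_regularizer=SparseRegularizer(rho=0.01, beta=1))(inputs)
x = Dropout(0.5)(x)
out = Dense(392, activation='sigmoid')(x)
model = Model(inputs=inputs, outputs=out)
model.compile(loss='mse', optimizer='adadelta')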

Related

GradNorm implementation in Tensorflow 2.x / Keras

I have been trying to implement Gradnorm from the paper, GradNorm: Gradient Normalization for Adaptive Loss Balancing in Deep Multitask Networks. I have been having some trouble figuring out how to do this since I have a two-head model and need to balance the gradients accordingly.
I am using Keras for this work and using the model.fit() function to train my model.
So far, I have tried to use a callback to implement this like:
class GradNorm(tf.keras.callbacks.Callback):
    def __init__(self, model, optimizer, alpha=0.2):
        super(GradNorm, self).__init__()
        self.alpha = alpha
        self.model = model
        self.optimizer = optimizer
        self.l_hat_1 = self.l_hat_2 = self.l_hat_avg = 1.0
        self.inv_rate_1 = self.inv_rate_2 = 1.0
        self.weights_shared = self.model.weights[:20]  # all shared layer weights
        self.w1 = self.w2 = 1.0
        self.l01 = self.l02 = -1.0
        self.task_losses = []

    def on_batch_end(self, batch, logs=None):
        loss_t1 = logs.get('loss1')
        loss_t2 = logs.get('loss2')
        self.task_losses.append((loss_t1, loss_t2))

        # Weight losses
        l1 = tf.multiply(loss_t1, self.w1)
        l2 = tf.multiply(loss_t2, self.w2)

        with tf.GradientTape(persistent=True) as tape:
            G1R = tape.gradient(tf.constant(loss_t1, dtype=tf.float32), self.weights_shared)
            G2R = tape.gradient(tf.constant(loss_t2, dtype=tf.float32), self.weights_shared)
        G1 = tf.norm(G1R, ord=2)
        G2 = tf.norm(G2R, ord=2)
However, this fails even at this point because G1R comes back as a list of None values.
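As a side check (my own minimal illustration, not the GradNorm logic itself), tape.gradient returns None whenever the target was not computed from the watched variables inside the tape, which is what happens when the scalar losses pulled from logs are wrapped in tf.constant:

import tensorflow as tf

w = tf.Variable(3.0)

with tf.GradientTape() as tape:
    loss_from_graph = w * w               # computed from w inside the tape
print(tape.gradient(loss_from_graph, w))  # tf.Tensor(6.0, ...)

with tf.GradientTape() as tape:
    loss_from_logs = tf.constant(9.0)     # just a number, not a function of w
print(tape.gradient(loss_from_logs, w))   # None, like the entries of G1R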
Would really appreciate any help on this
Thank You

nn.LSTM not working together with functional_call for calculating the gradient

I have the following code:
import torch
from torch.nn.utils.stateless import functional_call
import torch.autograd as autograd
import torch.nn as nn

# This is the model
class Encoder(nn.Module):
    def __init__(self, action_dim, z_dim, skill_length):
        super().__init__()
        print(action_dim)
        self.lin1 = nn.Linear(action_dim, action_dim)
        self.lstm = nn.LSTM(input_size=action_dim, hidden_size=z_dim, batch_first=True)
        self.lin2 = nn.Linear(z_dim, z_dim)

    def forward(self, skill):
        a, b, c = skill.shape
        skill = skill.reshape(-1, skill.shape[-1])
        embed = self.lin1(skill)
        embed = embed.reshape(a, b, c)
        mean, _ = self.lstm(embed)
        mean = mean[:, -1, :]
        mean = self.lin2(mean)
        return mean

# This is the initialization function
def pars(model):
    params = {}
    for name, param in model.named_parameters():
        if len(param.shape) == 1:
            init = torch.nn.init.constant_(param, 0)
        else:
            init = torch.nn.init.orthogonal_(param)
        params[name] = nn.Parameter(init)
    return params

# Initializing the model
model = Encoder(4, 2, 5)
x = torch.rand(3, 5, 4)
params = pars(model)

# Running the model with functional_call and calculating the gradient.
samp = functional_call(model, params, x)
grad_f = autograd.grad(torch.mean(samp), params.values(),
                       retain_graph=True, allow_unused=True)
print(grad_f)
# grad_f has a gradient for the linear layer, but None for the LSTM layer.

# Running the model without functional_call and calculating the gradient.
samp = model(x)
grad = autograd.grad(torch.mean(samp), model.parameters(), retain_graph=True)
print(grad)
# grad has gradients for all layers, e.g., the linears and the lstm.
I know the problem is with the LSTM layer, because when I use a linear layer with nn.Linear the gradient depends on std as well as the linear layer (a minimal linear-only check is also sketched at the end of this question). Unfortunately, I do not know how to resolve this problem. I'd appreciate any help.
*Edit: I heavily edited the code provided just to further simplify the example. This code can be copied and run.
Update Dec 11, 2022
class Encoder(nn.Module):
    def __init__(self, action_dim, z_dim, skill_length):
        super().__init__()
        print(action_dim)
        self.lin1 = nn.Linear(action_dim, action_dim)
        self.lstm = nn.LSTM(input_size=action_dim, hidden_size=z_dim, batch_first=True)
        self.lin2 = nn.Linear(z_dim, z_dim)

    def forward(self, skill):
        a, b, c = skill.shape
        skill = skill.reshape(-1, skill.shape[-1])
        embed = self.lin1(skill)
        embed = embed.reshape(a, b, c)
        mean, _ = self.lstm(embed)
        pdb.set_trace()
        grad1 = autograd.grad(mean.mean(), params.values(),
                              retain_graph=True, allow_unused=True)
        # This gives gradient for the self.lin1 layer, and None for the LSTM
        grad2 = autograd.grad(mean.mean(), self.parameters(),
                              retain_graph=True, allow_unused=True)
        # This gives gradient for the LSTM, but None for the self.lin1 layer
        mean = mean[:, -1, :]
        mean = self.lin2(mean)
        return mean
When I run it the regular way, without functional_call and calling the model directly, autograd.grad(mean.mean(), self.parameters(), allow_unused=True, retain_graph=True) has gradients for both the self.lin1 and the LSTM layer.
I don't know if this information is useful, but I'm putting it out there just in case.
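For reference, here is the minimal linear-only check mentioned above (my own sketch, using the same functional_call + autograd.grad pattern), which confirms the pattern does return gradients when no LSTM is involved:

import torch
import torch.nn as nn
import torch.autograd as autograd
from torch.nn.utils.stateless import functional_call

lin = nn.Linear(4, 2)
# replacement parameters, as in pars() above
params = {name: nn.Parameter(p.clone()) for name, p in lin.named_parameters()}
x = torch.rand(3, 4)

out = functional_call(lin, params, x)
grads = autograd.grad(out.mean(), params.values())
print([g.shape for g in grads])  # gradients for both weight and bias, no None entries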

Tensorflow 2.0: flat_map() to flatten Dataset of Dataset returns cardinality -2

I am trying to run the following code (as given in the TensorFlow documentation) to create windows of my data and then flatten the dataset of datasets.
window_size = 5
windows = range_ds.window(window_size, shift=1)

for sub_ds in windows.take(5):
    print(sub_ds)

flat_windows = windows.flat_map(lambda x: x)
The problem is that flat_windows.cardinality().numpy() returns a cardinality of -2, which is creating problems for me during training. I tried looking for ways to set the cardinality of a dataset but couldn't find anything. I also tried other ways of flattening a dataset of datasets, but again without success.
Edit-1: The problem with training is that the shape is unknown (at the Linear and Dense layers) when I am training a subclassed model (given below). The model trains well when I run it eagerly (through tf.config.run_functions_eagerly(True)), but that is slow. Therefore I want the input data to be known for the model training.
Neural Network
class NeuralNetworkModel(tf.keras.Model):
    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.encoder = Encoder()

    def train_step(self, inputs):
        X = inputs[0]
        Y = inputs[1]

        with tf.GradientTape() as tape:
            enc_X = self.encoder(X)
            enc_Y = self.encoder(Y)
            # loss:
            loss = tf.norm(enc_Y - enc_X, axis=[0, 1], ord='fro')

        # Compute gradients
        trainable_vars = self.encoder.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Compute our own metrics
        loss_tracker.update_state(loss)

        # Return a dict mapping metric names to current value.
        # Note that it will include the loss (tracked in self.metrics).
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker]

    def test_step(self, inputs):
        X = inputs[0]
        Y = inputs[1]
        Psi_X = self.encoder(X)
        Psi_Y = self.encoder(Y)

        # loss:
        loss = tf.norm(Psi_Y - Psi_X, axis=[0, 1], ord='fro')

        # Compute our own metrics
        loss_tracker.update_state(loss)

        # Return a dict mapping metric names to current value.
        # Note that it will include the loss (tracked in self.metrics).
        return {"loss": loss_tracker.result()}


class Encoder(tf.keras.Model):
    def __init__(self):
        super(Encoder, self).__init__(dtype='float64', name='Encoder')
        self.input_layer = DenseLayer(128)
        self.hidden_layer1 = DenseLayer(128)
        self.hidden_layer2 = DenseLayer(64)
        self.hidden_layer3 = DenseLayer(64)
        self.output_layer = LinearLayer(64)

    def call(self, input_data, training):
        fx = self.input_layer(input_data)
        fx = self.hidden_layer1(fx)
        fx = self.hidden_layer2(fx)
        fx = self.hidden_layer3(fx)
        return self.output_layer(fx)


class LinearLayer(tf.keras.layers.Layer):
    def __init__(self, units):
        super(LinearLayer, self).__init__(dtype='float64')
        self.units = units

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.w = self.add_weight(shape=(input_dim, self.units),
                                 initializer="random_normal",
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer=tf.zeros_initializer(),
                                 trainable=True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b


class DenseLayer(tf.keras.layers.Layer):
    def __init__(self, units):
        super(DenseLayer, self).__init__(dtype='float64')
        self.units = units

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.w = self.add_weight(shape=(input_dim, self.units),
                                 initializer="random_normal",
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer=tf.zeros_initializer(),
                                 trainable=True)

    def call(self, inputs):
        x = tf.matmul(inputs, self.w) + self.b
        return tf.nn.elu(x)
I was wondering about this as well. It turns out that -2 is tf.data.UNKNOWN_CARDINALITY (https://www.tensorflow.org/api_docs/python/tf/data#UNKNOWN_CARDINALITY), which means that TF doesn't know how many elements the flat_map returns per item.
I just asked Windowing a TensorFlow dataset without losing cardinality information? to see if anyone knows a way to window datasets without losing cardinality.
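In case it helps, here is a sketch (my own, assuming TF 2.3 or later) that combines the batched flatten from the TF time-series tutorial with tf.data.experimental.assert_cardinality to put a known cardinality back on the flattened dataset:

import tensorflow as tf

range_ds = tf.data.Dataset.range(20)
window_size = 5

windows = range_ds.window(window_size, shift=1)
# flatten each window into a fixed-length vector instead of a nested dataset
flat_windows = windows.flat_map(lambda w: w.batch(window_size, drop_remainder=True))
print(flat_windows.cardinality().numpy())  # -2, i.e. tf.data.UNKNOWN_CARDINALITY

# if the number of full windows is known from the windowing arithmetic,
# it can be re-asserted so downstream code sees a finite cardinality
n_windows = 20 - window_size + 1
flat_windows = flat_windows.apply(tf.data.experimental.assert_cardinality(n_windows))
print(flat_windows.cardinality().numpy())  # 16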

Why gradient check gives high difference (almost 1)?

I'm trying to implement a neural net in Python without the use of libraries like Keras or TensorFlow. I still have to test the net; for now I have just tried to train it on the Iris dataset and then check the correctness of the backpropagation algorithm.
To do so, I wrote the gradient checking procedure, calculating the analytical gradients and comparing them with the gradients from backpropagation.
The point is that, even if the backpropagation algorithm seems correct to me, the difference between the gradients is always high (around 0.8, instead of the classic 1e-7).
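For reference, the quantities being compared are the two-sided finite-difference estimate of each partial derivative and the usual relative difference (this is exactly what gradient_check below implements):

\[
\frac{\partial J}{\partial \theta_i} \approx \frac{J(\theta + \varepsilon e_i) - J(\theta - \varepsilon e_i)}{2\varepsilon},
\qquad
\mathrm{difference} = \frac{\lVert d\theta - d\theta_{\mathrm{approx}} \rVert_2}{\lVert d\theta \rVert_2 + \lVert d\theta_{\mathrm{approx}} \rVert_2}
\]

where \(e_i\) is the i-th standard basis vector and \(\varepsilon = 10^{-7}\).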
Layer class
class Dense(Layer):
    def __init__(self, input_shape, name=None, activation='relu', regularization='l2'):
        self.name = name
        self.is_output = False
        self.weights = np.random.uniform(low=0.01, high=0.10, size=input_shape)
        self.biases = np.ones((1, input_shape[1]))
        if activation == 'sigmoid':
            self.activation = Activation_Sigmoid()
        else:  # activation == 'relu':
            self.activation = Activation_ReLU()
        self.cost = Categorical_CrossEntropyLoss()

    def set_as_output(self, is_output=True):
        self.is_output = is_output

    def forward(self, inputs, debug=False, epsilon=None):
        self.net_input = inputs
        if debug:
            augmented_parameters = np.zeros(epsilon.shape)
            weights_column_vector = np.reshape(self.weights, (-1, 1))
            biases_column_vector = np.reshape(self.biases, (-1, 1))
            concatenated_parameters = np.concatenate((weights_column_vector, biases_column_vector))
            for i in range(concatenated_parameters.shape[0]):
                augmented_parameters[i] = concatenated_parameters[i]
            # make the augmented parameters as long as theta in order to sum them,
            # because epsilon is a standard basis vector
            augmented_parameters += epsilon
            # rebuild the weights matrix and biases vector to apply forward propagation
            weights_end = self.weights.shape[0] * self.weights.shape[1]
            biases_end = self.biases.shape[0] * self.biases.shape[1] + weights_end
            weights = np.reshape(augmented_parameters[0:weights_end], self.weights.shape)
            biases = np.reshape(augmented_parameters[weights_end:biases_end], self.biases.shape)
            output = np.dot(inputs, weights) + biases
            activated_output = self.activation.forward(output)
            return activated_output
        self.output = np.dot(inputs, self.weights) + self.biases
        self.activated_output = self.activation.forward(self.output)
        return self.activated_output

    def backward(self, X, y, output, step, l2=0.5):  # backpropagation
        m = X.shape[0]  # number of examples
        if self.is_output:
            error = self.cost.backward(output, y)  # (a_k - y_hat_k)
            delta_k = self.activation.backward(self.output) * error
            # net input for neuron k is a_j^(l-1)
            grad = np.dot(self.net_input.T, delta_k)
            # update weights with l2 regularization
            self.grad_w = grad + (l2 / m) * self.weights
            self.grad_b = np.sum(delta_k * 1, axis=0)
            self.weights -= step * self.grad_w
            self.biases -= step * self.grad_b
            return np.dot(delta_k, self.weights.T)
        else:
            delta_j = self.activation.backward(self.output) * output
            grad = np.dot(self.net_input.T, delta_j)
            self.grad_w = grad + (l2 / m) * self.weights
            self.grad_b = np.sum(delta_j * 1, axis=0)
            self.weights -= step * self.grad_w
            self.biases -= step * self.grad_b
            return np.dot(delta_j, self.weights.T)

    def get_parameters(self):
        return self.weights, self.biases

    def get_gradients(self):
        return self.grad_w, self.grad_b
Neural Net class
class NeuralNet():
    def __init__(self):
        self.layers = []
        self.layers_output = []
        self.cost = None
        self.regularization = L2_Regularization()

    def add(self, layer):
        self.layers.append(layer)

    def forward(self, inputs, debug=False, epsilon=None):
        input = np.copy(inputs)
        for layer in self.layers:
            output = layer.forward(input, debug=debug, epsilon=epsilon)
            input = output
        return input

    def backward(self, X, y, output, step):
        prev_delta = None
        out = output
        for layer in self.layers[::-1]:
            prev_delta = layer.backward(X, y, out, step)
            out = prev_delta

    def fit(self, X, y, batch_size=1, epochs=10, step=0.05, shuffle=True):
        self.layers[-1].set_as_output()
        self.error = []
        i = 0.005 * epochs
        for epoch in range(epochs):
            if shuffle:
                X = np.random.permutation(X)
            batches = int(np.ceil(X.shape[0] / batch_size))
            batches_error = []
            for t in range(batches):
                batch_X = X[t*batch_size:np.min([X.shape[0], (t+1)*batch_size]), :]
                batch_y = y[t*batch_size:np.min([y.shape[0], (t+1)*batch_size]), :]
                output = self.forward(batch_X)
                cost = self.cost.forward(output, batch_y)
                cost += self.regularization.forward(X, self.layers)
                batches_error.append(cost)
                self.backward(batch_X, batch_y, output, step)
            self.error.append(np.mean(batches_error))
            if epoch % i == 0:
                print('epoch:', epoch, 'error:', np.mean(self.error))
        return self

    def parameters_to_theta(self):
        theta = []
        for layer in self.layers:
            w, b = layer.get_parameters()
            # flatten parameter w
            new_vector = np.reshape(w, (-1, 1))
            theta.append(new_vector)
            # flatten parameter b
            new_vector = np.reshape(b, (-1, 1))
            theta.append(new_vector)
        return np.vstack(theta)

    def gradients_to_theta(self):
        theta = []
        for layer in self.layers:
            grad_w, grad_b = layer.get_gradients()
            new_vector = np.reshape(grad_w, (-1, 1))
            theta.append(new_vector)
            new_vector = np.reshape(grad_b, (-1, 1))
            theta.append(new_vector)
        return np.vstack(theta)

    def gradient_check(self, X, y, epsilon=1e-7):
        theta = self.parameters_to_theta()
        dtheta = self.gradients_to_theta()
        num_parameters = theta.shape[0]
        J_plus = np.zeros((num_parameters, 1))
        J_minus = np.zeros((num_parameters, 1))
        dtheta_approx = np.zeros((num_parameters, 1))
        for i in range(num_parameters):
            theta_plus = np.zeros((num_parameters, 1))
            theta_plus[i] = epsilon
            J_plus[i] = self.cost.forward(self.forward(X, debug=True, epsilon=theta_plus), y)
            theta_minus = np.zeros((num_parameters, 1))
            theta_minus[i] = -epsilon
            J_minus[i] = self.cost.forward(self.forward(X, debug=True, epsilon=theta_minus), y)
            dtheta_approx[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)
        numerator = np.linalg.norm(dtheta - dtheta_approx)
        denominator = np.linalg.norm(dtheta_approx) + np.linalg.norm(dtheta)
        difference = numerator / denominator
        return difference
I'm using ReLU and Sigmoid as activation functions, and Categorical Cross Entropy for the cost
import numpy as np
from scipy.special import expit as sigmoid

class Activation_ReLU:
    def forward(self, inputs):
        return np.maximum(0, inputs)

    def backward(self, inputs):
        return np.greater(inputs, 0).astype(int)

class Activation_Sigmoid:
    def forward(self, inputs):
        return sigmoid(inputs)

    def backward(self, inputs):
        return sigmoid(inputs) * (1 - sigmoid(inputs))

class Categorical_CrossEntropyLoss():
    def forward(self, y_pred, y_real):
        predictions = np.copy(y_pred)
        predictions = np.clip(predictions, 1e-12, 1 - 1e-12)  # avoid zero values for log
        n = y_real.shape[0]
        return - (1 / n) * np.sum(y_real * np.log(y_pred))

    def backward(self, y_pred, y_real):
        return y_real - y_pred
These are the main classes that define the net. The model that I create to train on the Iris dataset is an NN with 1 hidden layer.
# random seed is 1
X, y = load_iris(return_X_y=True)
X = (X - np.mean(X)) / np.std(X) # standardize data to improve network convergence
y = y.reshape((-1,1))
encoder = OneHotEncoder(sparse=False)
y = encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8)
model = NeuralNet()
model.add(Dense((4,10),name='input_layer',activation='relu'))
model.add(Dense((10,10),name='hidden_layer',activation='relu'))
model.add(Dense((10,3),name='output_layer',activation='sigmoid'))
model.fit(X_train,y_train, batch_size=5, epochs=200, step=1e-3)
difference = model.gradient_check(X_train, y_train)
And then, the result of print(difference) is
0.7992920544491866
So there is something wrong with my implementation. What should I check to determine the cause of this high difference between the gradients?

multi-class weighted loss function in pytorch

I am training a UNet-based model for a multi-class segmentation task in the PyTorch framework, optimizing the model with the following loss function:
class MulticlassJaccardLoss(_Loss):
    """Implementation of Jaccard loss for multiclass (semantic) image segmentation task
    """
    __name__ = 'mc_jaccard_loss'

    def __init__(self, classes: List[int] = None, from_logits=True, weight=None, reduction='elementwise_mean'):
        super(MulticlassJaccardLoss, self).__init__(reduction=reduction)
        self.classes = classes
        self.from_logits = from_logits
        self.weight = weight

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """
        :param y_pred: NxCxHxW
        :param y_true: NxHxW
        :return: scalar
        """
        if self.from_logits:
            y_pred = y_pred.softmax(dim=1)

        n_classes = y_pred.size(1)
        smooth = 1e-3

        if self.classes is None:
            classes = range(n_classes)
        else:
            classes = self.classes
            n_classes = len(classes)

        loss = torch.zeros(n_classes, dtype=torch.float, device=y_pred.device)

        if self.weight is None:
            weights = [1] * n_classes
        else:
            weights = self.weight

        for class_index, weight in zip(classes, weights):
            jaccard_target = (y_true == class_index).float()
            jaccard_output = y_pred[:, class_index, ...]
            num_preds = jaccard_target.long().sum()
            if num_preds == 0:
                loss[class_index - 1] = 0  # custom
            else:
                iou = soft_jaccard_score(jaccard_output, jaccard_target, from_logits=False, smooth=smooth)
                loss[class_index - 1] = (1.0 - iou) * weight  # custom

        if self.reduction == 'elementwise_mean':
            return loss.mean()
        if self.reduction == 'sum':
            return loss.sum()
        return loss
I am calculating the loss for only two classes (classes 1 and 2, not the background).
MulticlassJaccardLoss(weight=[0.5,10], classes=[1,2], from_logits=False)
When I train the model, it trains for the first few iterations and then I get the following error:
element 0 of tensors does not require grad and does not have a grad_fn
What is the mistake in the code?
Thanks!
Try setting:
torch.zeros(..., requires_grad=True)
I believe requires_grad=False is the default for torch.zeros, so this may help here.
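A quick way to see what this is pointing at (my own check, not from the original code): torch.zeros defaults to requires_grad=False, so if every class falls into the num_preds == 0 branch the returned loss carries no autograd history, and backward() then fails with exactly the error above.

import torch

loss = torch.zeros(2, dtype=torch.float)
print(loss.requires_grad)   # False (the default)
print(loss.mean().grad_fn)  # None -> nothing for backward() to differentiate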
