Tensorflow gradients do not exist for bias in custom layer - python

I've built an input convex neural network in Tensorflow following this ArXiv paper that is a scalar output feed-forward model. The first hidden layer is dense and subsequent layers are custom that takes two inputs: the output from the previous layer (kernel) and the model input (passthrough). Separate weights are applied to each. This allows a positive weights regularizer to be applied to kernel weights but not the passthrough. I calculate the regularizer and add it using self.add_loss in the call method of the custom layer. I'm also using custom activation functions that are squared leaky ReLU and leaky ReLU.
When I am training this network I am able to calculate a gradient for the bias in the first dense layer but I get a warning that no gradient exists for the bias in the custom layer. When I add @tf.function to my activation functions the warning goes away but the gradient is 0. Furthermore, loss.numpy() throws an error when I use @tf.function and run in a local Jupyter notebook (but not in Colab).
Any ideas why the bias gradient exists for the dense but not the custom layer and how to calculate the bias gradient for all layers? A minimal working example is provided in this Colab notebook. Much appreciated!
Below is my custom layer. It's very similar to the standard dense layer.
class DensePartiallyConstrained(Layer):
    '''
    Fully-connected layer fed by two inputs, each with its own weights.

    The layer is called on a list ``[kernel_input, passthrough_input]`` and
    computes ``activation(kernel_input @ kernel +
    passthrough_input @ passthrough + bias)``.

    Keeping the two weight matrices separate lets each one have its own
    initializer and constraint. `call` additionally registers a loss that
    penalises negative entries of `kernel` only, scaled by
    `regularizer_constant`; `passthrough` is left unpenalised.

    Most of this code and documentation was borrowed from the
    `tf.keras.layers.Dense` implementation on Github (thanks!).
    '''

    def __init__(self,
                 units,
                 activation=None,
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 passthrough_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 kernel_constraint=None,
                 passthrough_constraint=None,
                 bias_constraint=None,
                 activity_regularizer=None,
                 regularizer_constant=1.0,
                 **kwargs):
        # Accept the legacy `input_dim` keyword as shorthand for `input_shape`.
        if 'input_dim' in kwargs and 'input_shape' not in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super().__init__(
            activity_regularizer=regularizers.get(activity_regularizer),
            **kwargs)

        self.units = int(units)
        self.use_bias = use_bias
        self.activation = activations.get(activation)

        # Resolve string identifiers into initializer / constraint objects.
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.passthrough_initializer = initializers.get(passthrough_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.passthrough_constraint = constraints.get(passthrough_constraint)
        self.bias_constraint = constraints.get(bias_constraint)

        # Scale factor of the kernel penalty registered in `call`.
        self.regularizer_constant = regularizer_constant

        # Keras flag; presumably declares that masks may pass through this
        # layer unchanged -- TODO confirm against the Keras masking docs.
        self.supports_masking = True

        # Placeholder specs; `build` refines them once the shapes are known.
        self.kernel_input_spec = InputSpec(min_ndim=2)
        self.passthrough_input_spec = InputSpec(min_ndim=2)

    @staticmethod
    def _require_last_dim(shape):
        '''Return the static last dimension of `shape`; raise if undefined.'''
        last_dim = tensor_shape.dimension_value(
            tensor_shape.TensorShape(shape)[-1])
        if last_dim is None:
            raise ValueError('The last dimension of the inputs to `DensePartiallyConstrained` '
                             'should be defined. Found `None`.')
        return last_dim

    def build(self, input_shape):
        '''
        Create the `kernel`, `passthrough` and (optional) `bias` weights.

        `input_shape` is a pair ``[kernel_shape, passthrough_shape]``.
        Raises `TypeError` for a non-floating, non-complex layer dtype and
        `ValueError` when either input's last dimension is undefined.
        '''
        kernel_shape, passthrough_shape = input_shape

        layer_dtype = dtypes.as_dtype(self.dtype or K.floatx())
        if not layer_dtype.is_floating and not layer_dtype.is_complex:
            raise TypeError('Unable to build `DensePartiallyConstrained` layer with non-floating point '
                            'dtype %s' % (layer_dtype,))

        # Input coming from the previous layer.
        kernel_last_dim = self._require_last_dim(kernel_shape)
        self.kernel_input_spec = InputSpec(min_ndim=2,
                                           axes={-1: kernel_last_dim})

        # Model input that is passed through to this layer.
        passthrough_last_dim = self._require_last_dim(passthrough_shape)
        self.passthrough_input_spec = InputSpec(min_ndim=2,
                                                axes={-1: passthrough_last_dim})

        self.kernel = self.add_weight(
            name='kernel',
            shape=[kernel_last_dim, self.units],
            initializer=self.kernel_initializer,
            constraint=self.kernel_constraint,
            dtype=self.dtype,
            trainable=True)
        self.passthrough = self.add_weight(
            name='passthrough',
            shape=[passthrough_last_dim, self.units],
            initializer=self.passthrough_initializer,
            constraint=self.passthrough_constraint,
            dtype=self.dtype,
            trainable=True)
        self.bias = (
            self.add_weight(
                name='bias',
                shape=[self.units,],
                initializer=self.bias_initializer,
                constraint=self.bias_constraint,
                dtype=self.dtype,
                trainable=True)
            if self.use_bias else None)

        self.built = True
        super().build(input_shape)

    def call(self, inputs):
        '''
        Apply the layer to ``[kernel_input, passthrough_input]``.

        Also registers, via `add_loss`, the penalty
        ``regularizer_constant * sum(max(-kernel, 0) ** 2)``.
        '''
        kernel_input, passthrough_input = inputs

        # Only the negative part of the kernel contributes to the penalty.
        negative_part = tf.math.maximum(tf.negative(self.kernel), 0.0)
        self.add_loss(
            self.regularizer_constant * tf.reduce_sum(tf.square(negative_part)))

        kernel_term = tf.matmul(kernel_input, self.kernel)
        passthrough_term = tf.matmul(passthrough_input, self.passthrough)
        outputs = tf.add(kernel_term, passthrough_term)
        if self.use_bias:
            outputs = tf.add(outputs, self.bias)

        if self.activation is None:
            return outputs
        return self.activation(outputs)
And my activation functions:
# @tf.function
def squared_leaky_ReLU(x, alpha = 0.2):
    """Return the element-wise square of ``max(x, alpha * x)``.

    For ``0 <= alpha <= 1`` the inner maximum is the leaky ReLU of `x`.
    """
    leaky = tf.maximum(x, alpha * x)
    return tf.square(leaky)
# @tf.function
def leaky_ReLU(x, alpha = 0.2):
    """Return the element-wise ``max(x, alpha * x)``.

    For ``0 <= alpha <= 1`` this is the leaky ReLU with negative slope `alpha`.
    """
    scaled = alpha * x
    return tf.maximum(x, scaled)
Edit:
With a TensorFlow update I can now access loss.numpy() when using @tf.function with my activation functions. This returns 0 gradients for the bias in all of my custom layers.
I'm beginning to think that the lack of gradient for the bias terms in the custom layer might have something to do with my loss function:
minimax loss
where
regularizer
is regularization for the weights in the custom layer kernel only. The loss for g(x) is based on the gradient with respect to the inputs, so it doesn't contain any information about the bias (the bias in f(x) update normally). Still though, if this is the case I don't understand why the bias in the first hidden dense layer of g(y) is updated? The networks are identical other than f(x) has a positive constraint on the kernel weights.

Related

How to override gradient for the nonlinearity functions in lasagne?

I have a model for which I need to compute the gradients of the output w.r.t. the model's input, but I want to apply custom gradients to some of the nonlinearity functions used in some of the model's layers. So I tried the idea explained here, which computes the rectifier nonlinearity (ReLU) in the forward pass but modifies the gradient of the ReLU in the backward pass. I added the following two classes:
The helper class that allows us to replace a nonlinearity with an Op
that has the same output, but a custom gradient
class ModifiedBackprop(object):
    """Callable wrapper giving a nonlinearity a replaceable gradient.

    Calling an instance applies the wrapped nonlinearity through a
    `theano.OpFromGraph` whose `grad` attribute is set to `self.grad`.
    Subclasses are expected to define `grad(inputs, out_grads)`.
    One Op is built per input tensor type and cached in `self.ops`.
    """

    def __init__(self, nonlinearity):
        self.nonlinearity = nonlinearity
        # Cache: tensor type -> OpFromGraph instance built for that type.
        self.ops = {}

    def _make_op(self, tensor_type, to_device):
        """Build the Op for `tensor_type` and attach the custom gradient."""
        # Symbolic input of the right type, pushed through the nonlinearity
        # (and moved to the GPU if needed) to fix the forward expression.
        inp = tensor_type()
        outp = to_device(self.nonlinearity(inp))
        op = theano.OpFromGraph([inp], [outp])
        # NOTE(review): whether assigning `op.grad` is honoured depends on
        # how the installed Theano version dispatches gradients -- confirm.
        op.grad = self.grad
        return op

    def __call__(self, x):
        # OpFromGraph is opaque to Theano optimizations, so the transfer to
        # the GPU has to be done by hand when CUDA is enabled.
        cuda = theano.sandbox.cuda
        if cuda.cuda_enabled:
            to_device = cuda.as_cuda_ndarray_variable
        else:
            def to_device(value):
                return value

        x = to_device(x)
        # The tensor type (mainly dimensionality and dtype) selects the Op.
        tensor_type = x.type
        try:
            op = self.ops[tensor_type]
        except KeyError:
            op = self.ops[tensor_type] = self._make_op(tensor_type, to_device)
        return op(x)
The subclass that does guided backpropagation through a nonlinearity:
class GuidedBackprop(ModifiedBackprop):
    """Gradient rule that keeps only positive gradients at positive inputs."""

    def grad(self, inputs, out_grads):
        """Return the incoming gradient masked by ``inp > 0`` and ``grd > 0``.

        Both `inputs` and `out_grads` must hold exactly one element; the
        result is a one-element tuple.
        """
        [inp] = inputs
        [grd] = out_grads
        dtype = inp.dtype
        # Trace message showing that this override was actually reached.
        print('It works')
        positive_input = (inp > 0).astype(dtype)
        positive_grad = (grd > 0).astype(dtype)
        return (grd * positive_input * positive_grad,)
Then i used them in my code as follows:
import lasagne as nn

model_in = T.tensor3()
# model_in = net['input'].input_var
nn.layers.set_all_param_values(net['l_out'], model['param_values'])

# Collect every layer whose nonlinearity is the stock rectifier.
relu = nn.nonlinearities.rectify
relu_layers = []
for layer in nn.layers.get_all_layers(net['l_out']):
    if getattr(layer, 'nonlinearity', None) is relu:
        relu_layers.append(layer)

# Swap in the rectifier with the guided-backprop gradient.
modded_relu = GuidedBackprop(relu)
for layer in relu_layers:
    layer.nonlinearity = modded_relu

prop = nn.layers.get_output(net['l_out'], model_in, deterministic=True)

# NOTE(review): the original indentation was lost; everything below is
# assumed to be the loop body because `grads[sample]` needs `sample`.
for sample in range(ini, batch_len):
    model_out = prop[sample, 'z']  # get prop for label 'z'
    gradients = theano.gradient.jacobian(model_out, wrt=model_in)
    # gradients = theano.grad(model_out, wrt=model_in)
    get_gradients = theano.function(
        inputs=[model_in],
        outputs=gradients,
    )
    # gradient dimension: X_batch == model_in(64, 20, 32)
    grads = np.array(get_gradients(X_batch))
    grads = grads[sample]
Now when I run the code, it works without any error, and the shape of the output is also correct. But that's because it executes the default theano.grad function and not the one that is supposed to override it. In other words, the grad() function in the GuidedBackprop class is never invoked.
I can't understand what the issue is.
Is there a solution?
If this is an unresolved issue, is there an implementation of a Theano Op that can achieve such functionality, or some other way to override the gradient for specific nonlinearity functions applied on some of the model's layers?
Are you try to set it back the value of model output into model layer input, all gradients calculation
# Constant float32 input of shape (1, 1, 48).
group_1_ShoryuKen_Left = tf.constant([ 0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,1,0,1,0,0,0,0, 0,0,0,0,0,0,0,1,0,0,0,0, 0,0,0,0,0,0,0,0,0,1,0,0 ], shape=(1, 1, 48), dtype=tf.float32)
## layer_2 = tf.keras.layers.Dense(256, kernel_initializer=tf.constant_initializer(1.))
# LSTM with 32 units whose kernel is initialised to all ones.
layer_2 = tf.keras.layers.LSTM(32, kernel_initializer=tf.constant_initializer(1.))
# Calling the layer once creates its weights.
b_out = layer_2(group_1_ShoryuKen_Left)
# NOTE(review): `layer_1` is not defined in this snippet -- presumably an
# already-built layer with identically shaped weights; confirm.
layer_2.set_weights(layer_1.get_weights())

Fine-tune BERT model by removing unused layers

I came across this code for BERT sentiment analysis where the unused layers are removed, Update trainable vars/trainable weights are added and I am looking for documentation which shows what are the different layers in bert, how can we remove the unused layers, add weights, etc. However, I am unable to find any documentation for this.
# TF-Hub handle of the BERT module loaded by `BertLayer`.
BERT_PATH = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
# Sequence length passed to `build_model` below.
MAX_SEQ_LENGTH = 512
class BertLayer(tf.keras.layers.Layer):
    """Keras layer wrapping a TF-Hub BERT module and returning `pooled_output`.

    `build` loads the module from `bert_path` and registers a subset of its
    variables as trainable: the embeddings, the pooler dense layer and the
    encoder layers selected through `n_fine_tune_encoders`. Every other
    module variable is registered as non-trainable.
    """

    def __init__(self, bert_path, n_fine_tune_encoders=10, **kwargs,):
        self.bert_path = bert_path
        self.n_fine_tune_encoders = n_fine_tune_encoders
        self.output_size = 768
        self.trainable = True
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Load the hub module and split its variables into trainable / frozen."""
        self.bert = tf_hub.Module(
            self.bert_path,
            trainable=self.trainable,
            name=f"{self.name}_module",
        )

        # Name fragments identifying the variables to fine-tune.
        # NOTE(review): the encoder fragments run from layer_10 downwards for
        # n_fine_tune_encoders + 1 steps, and matching is by substring, so
        # "encoder/layer_1" also matches layer_10 and layer_11 -- confirm
        # that this selection is the intended one.
        wanted_scopes = ["embeddings", "pooler/dense"]
        wanted_scopes.extend(
            f"encoder/layer_{10 - i}"
            for i in range(self.n_fine_tune_encoders + 1)
        )

        # Drop the "/cls/" variables, then keep only the wanted scopes.
        candidates = [v for v in self.bert.variables if "/cls/" not in v.name]
        selected = [
            v for v in candidates
            if any(scope in v.name for scope in wanted_scopes)
        ]
        self._trainable_weights.extend(selected)

        # Everything that was not selected is tracked as non-trainable.
        for var in self.bert.variables:
            if var not in self._trainable_weights:  # and 'encoder/layer' not in var.name:
                self._non_trainable_weights.append(var)

        print('Trainable layers:', len(self._trainable_weights))
        print('Non Trainable layers:', len(self._non_trainable_weights))
        super().build(input_shape)

    def call(self, inputs):
        """Run BERT on ``[input_ids, input_mask, segment_ids]``.

        The three inputs are cast to int32 and passed to the module's
        "tokens" signature; the "pooled_output" entry is returned.
        """
        input_ids, input_mask, segment_ids = [
            K.cast(tensor, dtype="int32") for tensor in inputs
        ]
        bert_outputs = self.bert(
            inputs=dict(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
            ),
            signature="tokens",
            as_dict=True,
        )
        return bert_outputs["pooled_output"]

    def compute_output_shape(self, input_shape):
        """Return ``(input_shape[0], output_size)``."""
        leading_dim = input_shape[0]
        return (leading_dim, self.output_size)
# `build_model` is not shown in this snippet; presumably it assembles a Keras
# model around `BertLayer` -- confirm against its definition.
model = build_model(bert_path=BERT_PATH, max_seq_length=MAX_SEQ_LENGTH, n_fine_tune_encoders=10)
Can anyone please point me to where I can find resources to learn the different layers in bert, how to remove some layers, add weights, how many layers to fine-tune, etc.?
As mentioned in the comments, you can't actually delete layers from the model architecture. However, you can freeze layers that you do not want to be trained. So the layer you freeze is not trained and the parameters on that layer are not updated
You can see the layers with this;
bert_model = AutoModel.from_pretrained("bert-base-uncased")
# Printing the module shows its nested layer structure.
print(bert_model)
# Alternatively, list the name of every parameter. The original loop
# iterated over `model`, which this snippet never defines.
for name, _ in bert_model.named_parameters():
    print(name)
You can also freeze a layer or more than one layer like this:
# Freeze every parameter whose name does not contain 'classifier'.
for name, param in self.model.named_parameters():
    if 'classifier' in name:
        continue
    param.requires_grad = False
For example, the script above freezes every parameter whose name does not contain "classifier"; since none of the base BERT model's parameter names contain "classifier", all of its layers are frozen and you simply get the embedding vectors from BERT's output. Apart from this, there is no need to mark any layer as trainable explicitly, because the layers that you have not frozen will continue to train.
You can also check out all of bert's heads and layer structures from this document

Trainable Matrix multiplication Layer

I'm trying to build a (custom) trainable matrix-multiplication layer in TensorFlow, but things aren't working out... More precisely, my model should look like this:
x -> A(x) x
where A(x) is a feed-forward network taking values in the n x n matrices (and thus depending on the input x) and A(x) x is a matrix-by-vector multiplication.
Here's what I've coded-up:
class custom_layer(tf.keras.layers.Layer):
    """Trainable layer computing ``x -> A(x) x`` for each sample of a batch.

    ``A(x)`` is a small feed-forward network whose output is reshaped into a
    ``units x units`` matrix, which is then multiplied with the input vector
    ``x``. The input's last dimension must therefore equal `units`.

    Fix over the original: the network output has shape
    ``(batch, units**2)``, so it is reshaped to ``(batch, units, units)``
    rather than ``(units, units)``, which only worked for a batch of one.
    The product is taken per sample.
    """

    def __init__(self, units=16, input_dim=32):
        # `input_dim` is kept for backward compatibility; it is not used.
        super(custom_layer, self).__init__()
        self.units = units

    def build(self, input_shape):
        """Create the weights of the feed-forward network ``A``.

        Raises:
            ValueError: if the input's last dimension differs from `units`,
                since ``A(x) x`` needs a square matrix matching ``x``.
        """
        feature_dim = input_shape[-1]
        if feature_dim != self.units:
            raise ValueError(
                'custom_layer needs the last input dimension to equal '
                '`units`; got input dimension %s and units %s'
                % (feature_dim, self.units))

        self.Tw1 = self.add_weight(name='Weights_1 ',
                                   shape=(feature_dim, feature_dim),
                                   initializer='GlorotUniform',
                                   trainable=True)
        self.Tw2 = self.add_weight(name='Weights_2 ',
                                   shape=(feature_dim, (self.units)**2),
                                   initializer='GlorotUniform',
                                   trainable=True)
        self.Tb = self.add_weight(name='basies',
                                  shape=(feature_dim,),
                                  initializer='GlorotUniform',  # Previously 'ones'
                                  trainable=True)

    def call(self, input):
        """Return ``A(x) x`` for every row ``x`` of `input`.

        `input` has shape ``(batch, units)``; so does the result.
        """
        # Vector-valued feed-forward network: (batch, units**2).
        hidden = tf.nn.relu(tf.matmul(input, self.Tw1) + self.Tb)
        flat_matrices = tf.matmul(hidden, self.Tw2)
        # One matrix per sample: (batch, units, units). The -1 keeps the
        # batch axis, whose size is only known at run time.
        matrices = tf.reshape(flat_matrices, [-1, self.units, self.units])
        # Batched matrix-vector product through a trailing singleton axis.
        x_column = tf.expand_dims(input, axis=-1)
        return tf.squeeze(tf.matmul(matrices, x_column), axis=-1)
Now I build the model:
# Two-feature input fed straight into the custom layer.
input_layer = tf.keras.Input(shape=[2])
output_layer = custom_layer(2)(input_layer)
model = tf.keras.Model(inputs=[input_layer], outputs=[output_layer])

# Plain SGD with a learning rate of 0.1.
optimizer_on = tf.keras.optimizers.SGD(learning_rate=0.1)

# Mean squared error is used both as the loss and as the reported metric.
model.compile(
    loss='mse',
    optimizer=optimizer_on,
    metrics=['mse'],
)

# Train silently for ten epochs.
model.fit(data_x, data_y, epochs=10, verbose=0)
and then I get this error message:
InvalidArgumentError: Input to reshape is a tensor with 128 values, but the requested shape has 4
[[node model_62/reconfiguration_unit_70/Reshape (defined at <ipython-input-176-0b494fa3fc75>:46) ]] [Op:__inference_distributed_function_175181]
Errors may have originated from an input operation.
Input Source operations connected to node model_62/reconfiguration_unit_70/Reshape:
model_62/reconfiguration_unit_70/MatMul_1 (defined at <ipython-input-176-0b494fa3fc75>:41)
Function call stack:
distributed_function
Thoughts:
It seems like something is wrong with the network dimensions but I can't figure what/how to repair it...

Save and restore for a CNN based Denoising Network Tensorflow

My question is about restoring the Denoised Trained Model.
I have my network defined in the following way.
Conv1->relu1->Conv2->relu2->Conv3->relu3->Deconv1
The tf.variable_scope(name) is same as above.
Now I have my loss, optimizer and accuracy defined with tf.name_scope.
When I try to restore loss function, It will ask even for labels (which I don't have).
# Feed both the input and the labels; `x` and `y` are presumably the graph's
# placeholders -- confirm against the model definition.
feed_dict={x:input, y:labels}
# Evaluate the graph element named 'loss' with those feeds.
sess.run('loss',feed_dict)
Can anyone please help me understand how to test this? Which operation should I restore ?
Should I have to call all layers, pass the input and check the loss(MSE)?
I checked many examples but it seems to be all Classification problem and defining softmax with logits at last works.
Edit:
Below is my code and now it is easily visible how tf.name_scope and tf.variable_scope is defined. I feel I may have to bring whole layer to test new Image. Is that right?
def new_conv_layer(input, num_input_channels, filter_size, num_filters, name):
    """Create a stride-1, 'SAME'-padded convolution with bias under scope `name`.

    Fix over the original: it created two independent variables, convolved
    with `filters` and returned the unused `weights`, so the returned tensor
    was never the kernel actually applied. A single variable is now used for
    both.

    Args:
        input: input tensor of the convolution.
        num_input_channels: number of channels of `input`.
        filter_size: height and width of the square filters.
        num_filters: number of output channels.
        name: variable scope name for the layer.

    Returns:
        ``(layer, weights)``: the biased convolution output and the filter
        variable that produced it.
    """
    with tf.variable_scope(name):
        # Shape of the filter weights for the convolution.
        shape = [filter_size, filter_size, num_input_channels, num_filters]
        # The one and only filter variable.
        weights = tf.Variable(tf.truncated_normal(shape, stddev=0.5))
        # One bias per filter.
        biases = tf.Variable(tf.constant(0.05, shape=[num_filters]))
        layer = tf.nn.conv2d(input=input, filter=weights,
                             strides=[1, 1, 1, 1], padding='SAME')
        # Add the biases to the result of the convolution.
        layer += biases
    return layer, weights
def new_relu_layer(input, name):
    """Apply `tf.nn.relu` to `input` inside variable scope `name`."""
    with tf.variable_scope(name):
        # TensorFlow operation for the rectified linear unit.
        activated = tf.nn.relu(input)
    return activated
def new_pool_layer(input, name):
    """Apply `tf.nn.max_pool` with a 1x1 window and stride 1 under scope `name`."""
    unit_window = [1, 1, 1, 1]
    unit_strides = [1, 1, 1, 1]
    with tf.variable_scope(name):
        # TensorFlow operation for max pooling.
        pooled = tf.nn.max_pool(value=input, ksize=unit_window,
                                strides=unit_strides, padding='SAME')
    return pooled
def new_layer(inputs, filters,kernel_size,strides,padding, name):
    """Create a channels-last transposed convolution under scope `name`.

    All arguments except `name` are forwarded unchanged to
    `tf.layers.conv2d_transpose`.
    """
    with tf.variable_scope(name):
        deconv = tf.layers.conv2d_transpose(
            inputs=inputs,
            filters=filters,
            kernel_size=kernel_size,
            strides=strides,
            padding=padding,
            data_format='channels_last',
        )
    return deconv
# Three 5x5 convolutions widening the channels 1 -> 32 -> 64 -> 128,
# each followed by a ReLU.
layer_conv1, weights_conv1 = new_conv_layer(
    input=yTraininginput, num_input_channels=1,
    filter_size=5, num_filters=32, name="conv1")
layer_relu1 = new_relu_layer(layer_conv1, name="relu1")

layer_conv2, weights_conv2 = new_conv_layer(
    input=layer_relu1, num_input_channels=32,
    filter_size=5, num_filters=64, name="conv2")
layer_relu2 = new_relu_layer(layer_conv2, name="relu2")

layer_conv3, weights_conv3 = new_conv_layer(
    input=layer_relu2, num_input_channels=64,
    filter_size=5, num_filters=128, name="conv3")
layer_relu3 = new_relu_layer(layer_conv3, name="relu3")

# Transposed convolution back to a single channel.
layer_deconv1 = new_layer(
    inputs=layer_relu3, filters=1, kernel_size=[5, 5],
    strides=[1, 1], padding='same', name='deconv1')
layer_relu4 = new_relu_layer(layer_deconv1, name="relu4")

# Second stage: widen to 128 channels again, then back to one.
layer_conv4, weights_conv4 = new_conv_layer(
    input=layer_relu4, num_input_channels=1,
    filter_size=5, num_filters=128, name="conv4")
layer_relu5 = new_relu_layer(layer_conv4, name="relu5")

layer_deconv2 = new_layer(
    inputs=layer_relu5, filters=1, kernel_size=[5, 5],
    strides=[1, 1], padding='same', name='deconv2')
layer_relu6 = new_relu_layer(layer_deconv2, name="relu6")

# Mean-squared-error loss between the network output and the labels
# (the variable keeps its historical name `cross_entropy`).
with tf.name_scope("loss"):
    cross_entropy = tf.losses.mean_squared_error(
        labels=xTraininglabel, predictions=layer_relu6)

# Adam optimizer minimising that loss.
with tf.name_scope("optimizer"):
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-6).minimize(
        loss=cross_entropy)

# PSNR between output and labels, reported under the "accuracy" scope.
with tf.name_scope("accuracy"):
    accuracy = tf.image.psnr(a=layer_relu6, b=xTraininglabel, max_val=1.0)
Try to view the graph of your code on tensorboard, get the operation name from the last layer(in your case deconv4). Something like below image.
Try loading the tensor, using below code:
# Replace the placeholder string with the real tensor name, e.g. as read
# off TensorBoard.
operation = graph.get_tensor_by_name("<operationname:0>")
This should work, as your layers are interconnected.
Let me know if this worked!
Operation Image

Tying Autoencoder Weights in a Dense Keras Layer

I am attempting to create a custom, Dense layer in Keras to tie weights in an Autoencoder. I have tried following an example for doing this in convolutional layers here, but it seemed like some of the steps did not apply for the Dense layer (also, the code is from over two years ago).
By tying weights, I want the decode layer to use the transposed weight matrix of the encode layer. This approach is also taken in this article (page 5). Below is the relevant quote from the article:
Here, we choose both the encoding and decoding activation function to be sigmoid function and only consider the tied weights case, in which $W' = W^T$ (where $W^T$ is the transpose of $W$) as most existing deep learning methods do.
In the quote above, W is the weight matrix in the encode layer and W' (equal to the transpose of W) is the weight matrix in the decode layer.
I did not change too much in the dense layer. I added a tied_to parameter to the constructor, which allows you to pass the layer you want to tie it to. The only other change was to the build function, the snippet for this is below:
def build(self, input_shape):
    """Create (or borrow) the kernel and create the optional bias.

    When `tied_to` is set, the kernel is the transpose of the tied layer's
    kernel and is appended to the non-trainable weights; otherwise a fresh
    ``(input_dim, units)`` kernel is created.
    """
    assert len(input_shape) >= 2
    input_dim = input_shape[-1]

    if self.tied_to is None:
        self.kernel = self.add_weight(
            shape=(input_dim, self.units),
            initializer=self.kernel_initializer,
            name='kernel',
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint)
    else:
        # Reuse the encoder's kernel, transposed, instead of a new weight.
        self.kernel = K.transpose(self.tied_to.kernel)
        self._non_trainable_weights.append(self.kernel)

    self.bias = (
        self.add_weight(
            shape=(self.units,),
            initializer=self.bias_initializer,
            name='bias',
            regularizer=self.bias_regularizer,
            constraint=self.bias_constraint)
        if self.use_bias else None)

    self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
    self.built = True
Below is the __init__ method, the only change here was the addition of the tied_to parameter.
def __init__(self, units,
             activation=None,
             use_bias=True,
             kernel_initializer='glorot_uniform',
             bias_initializer='zeros',
             kernel_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             kernel_constraint=None,
             bias_constraint=None,
             tied_to=None,
             **kwargs):
    """Configure the layer; `tied_to` is the layer whose kernel is reused.

    All other arguments are stored after resolving string identifiers
    through the `activations`, `initializers`, `regularizers` and
    `constraints` lookups.
    """
    # Accept the legacy `input_dim` keyword as shorthand for `input_shape`.
    if 'input_dim' in kwargs and 'input_shape' not in kwargs:
        kwargs['input_shape'] = (kwargs.pop('input_dim'),)
    super(Dense, self).__init__(**kwargs)

    self.units = units
    self.use_bias = use_bias
    self.tied_to = tied_to
    self.activation = activations.get(activation)

    self.kernel_initializer = initializers.get(kernel_initializer)
    self.bias_initializer = initializers.get(bias_initializer)

    self.kernel_regularizer = regularizers.get(kernel_regularizer)
    self.bias_regularizer = regularizers.get(bias_regularizer)
    self.activity_regularizer = regularizers.get(activity_regularizer)

    self.kernel_constraint = constraints.get(kernel_constraint)
    self.bias_constraint = constraints.get(bias_constraint)

    self.input_spec = InputSpec(min_ndim=2)
    self.supports_masking = True
The call function was not edited, but it is below for reference.
def call(self, inputs):
    """Return ``activation(inputs @ kernel + bias)``.

    When the layer is tied, the kernel is the transpose of the tied layer's
    current kernel, computed on every call. Reading the value stored at
    build time can be a stale snapshot when tensors are evaluated eagerly,
    which silently unties the weights.
    """
    tied_to = getattr(self, 'tied_to', None)
    kernel = self.kernel if tied_to is None else K.transpose(tied_to.kernel)
    output = K.dot(inputs, kernel)
    if self.use_bias:
        output = K.bias_add(output, self.bias, data_format='channels_last')
    if self.activation is not None:
        output = self.activation(output)
    return output
Above, I added a conditional to check if the tied_to parameter was set, and if so, set the layer's kernel to the transpose of the tied_to layer's kernel.
Below is the code used to instantiate the model. It is done using Keras's sequential API and DenseTied is my custom layer.
# Encoder layer, and a decoder layer tied to the encoder's kernel.
encoded1 = Dense(2, activation="sigmoid")
decoded1 = DenseTied(4, activation="sigmoid", tied_to=encoded1)

# Autoencoder: the encoder followed by the tied decoder.
autoencoder = Sequential([encoded1, decoded1])
After training the model, below is the model summary and weights.
autoencoder.summary()
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_7 (Dense) (None, 2) 10
_________________________________________________________________
dense_tied_7 (DenseTied) (None, 4) 12
=================================================================
Total params: 22
Trainable params: 14
Non-trainable params: 8
________________________________________________________________
autoencoder.layers[0].get_weights()[0]
array([[-2.122982 , 0.43029135],
[-2.1772149 , 0.16689162],
[-1.0465667 , 0.9828905 ],
[-0.6830663 , 0.0512633 ]], dtype=float32)
autoencoder.layers[-1].get_weights()[1]
array([[-0.6521988 , -0.7131109 , 0.14814234, 0.26533198],
[ 0.04387903, -0.22077179, 0.517225 , -0.21583867]],
dtype=float32)
As you can see, the weights reported by autoencoder.get_weights() do not seem to be tied.
So after showing my approach, my question is, is this a valid way to tie weights in a Dense Keras layer? I was able to run the code, and it is currently training. It seems that the loss function is decreasing reasonably as well. My fear is that this will only set them equal when the model is build, but not actually tie them. My hope is that the backend transpose function is tying them through references under the hood, but I am sure that I am missing something.
Thanks Mikhail Berlinkov,
One important remark: this code works under Keras in graph mode, but not in eager mode in TF 2.0. There it still runs, but it trains badly.
The critical point is, how the object stores the transposed weight.
self.kernel = K.transpose(self.tied_to.kernel)
In non eager mode this creates a graph the right way. In eager mode this fails, probably because the value of a transposed variable is stored at build (== the first call), and then used at subsequent calls.
However: the solution is to store the variable unaltered at build,
and put the transpose operation into the call method.
I spent several days to figure this out, and I am happy if this helps anyone.
So after showing my approach, my question is, is this a valid way to tie weights in a Dense Keras layer?
Yes, it's valid.
My fear is that this will only set them equal when the model is build, but not actually tie them. My hope is that the backend transpose function is tying them through references under the hood, but I am sure that I am missing something.
It actually ties them in the computation graph; you can check by printing model.summary() that there is just one copy of these trainable weights. Also, after training your model you can check the weights of the corresponding layers with model.get_weights(). When the model is built there are no actual weight values yet, just placeholders for them.
# NOTE(review): if `random` is the standard-library module, this does not
# seed the `np.random` calls that generate the training data below --
# confirm which module `random` refers to.
random.seed(1)
class DenseTied(Layer):
    """Dense layer whose kernel can be tied to another layer's kernel.

    With ``tied_to=None`` this behaves like a regular dense layer with its
    own ``(input_dim, units)`` kernel. With `tied_to` set, the forward pass
    uses the transpose of ``tied_to.kernel``.

    Fixes over the original:
      * `call` transposes the tied layer's live kernel on every call. The
        tensor stored at build time can be a stale snapshot when tensors
        are evaluated eagerly.
      * `compute_output_shape` no longer requires the input's last
        dimension to equal `units`, which rejected any non-square layer.
    """

    def __init__(self, units,
                 activation=None,
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 tied_to=None,
                 **kwargs):
        self.tied_to = tied_to
        # Accept the legacy `input_dim` keyword as shorthand for `input_shape`.
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super().__init__(**kwargs)
        self.units = units
        self.activation = activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        self.input_spec = InputSpec(min_ndim=2)
        self.supports_masking = True

    def build(self, input_shape):
        """Create the layer's own kernel (untied case) and optional bias."""
        assert len(input_shape) >= 2
        input_dim = input_shape[-1]

        if self.tied_to is not None:
            # Kept so that `self.kernel` and the layer's weight list still
            # expose the tied kernel as before. The forward pass does not
            # read this attribute; see `call`.
            self.kernel = K.transpose(self.tied_to.kernel)
            self._non_trainable_weights.append(self.kernel)
        else:
            self.kernel = self.add_weight(shape=(input_dim, self.units),
                                          initializer=self.kernel_initializer,
                                          name='kernel',
                                          regularizer=self.kernel_regularizer,
                                          constraint=self.kernel_constraint)
        if self.use_bias:
            self.bias = self.add_weight(shape=(self.units,),
                                        initializer=self.bias_initializer,
                                        name='bias',
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
        else:
            self.bias = None
        self.built = True

    def compute_output_shape(self, input_shape):
        """Return `input_shape` with its last dimension replaced by `units`."""
        assert input_shape and len(input_shape) >= 2
        # The last dimension only has to be defined; it is the tied layer's
        # output size, which generally differs from `units`.
        assert input_shape[-1]
        output_shape = list(input_shape)
        output_shape[-1] = self.units
        return tuple(output_shape)

    def call(self, inputs):
        """Return ``activation(inputs @ kernel + bias)``."""
        if self.tied_to is not None:
            # Transpose the live tied kernel so updates to it are followed.
            kernel = K.transpose(self.tied_to.kernel)
        else:
            kernel = self.kernel
        output = K.dot(inputs, kernel)
        if self.use_bias:
            output = K.bias_add(output, self.bias, data_format='channels_last')
        if self.activation is not None:
            output = self.activation(output)
        return output
# input_ = Input(shape=(16,), dtype=np.float32)
# Encoder and a decoder tied to the encoder's kernel (both 4 -> 4).
encoded1 = Dense(4, activation="sigmoid", input_shape=(4,), use_bias=True)
decoded1 = DenseTied(4, activation="sigmoid", tied_to=encoded1, use_bias=False)

# Autoencoder: the encoder followed by the tied decoder.
autoencoder = Sequential()
# autoencoder.add(input_)
autoencoder.add(encoded1)
autoencoder.add(decoded1)
autoencoder.compile(optimizer="adam", loss="binary_crossentropy")

# `summary()` prints the table itself and returns None, so it is not
# wrapped in `print`.
autoencoder.summary()

# `randint`'s upper bound is exclusive: (0, 1) only ever produced zeros.
# (0, 2) yields genuine binary targets for the binary cross-entropy loss.
autoencoder.fit(x=np.random.rand(100, 4),
                y=np.random.randint(0, 2, size=(100, 4)))

# The two printed matrices should be transposes of each other.
print(autoencoder.layers[0].get_weights()[0])
print(autoencoder.layers[1].get_weights()[0])

Categories