I build my model using tf.keras.layers.Dense. In the first layer of my model I want some weights to be constant zero. In the gradient calculation these weights should then get a gradient of zero, since the last term in the chain rule corresponds to the weight, which is 0 for a constant.
This is my approach so far:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
tf.enable_eager_execution()
model = tf.keras.Sequential([
    tf.keras.layers.Dense(2, activation=tf.sigmoid, input_shape=(2,)),
    tf.keras.layers.Dense(2, activation=tf.sigmoid)
])

weights = [np.array([[tf.constant(0), 0.25], [0.2, 0.3]]),
           np.array([0.35, 0.35]),
           np.array([[0.4, 0.5], [0.45, 0.55]]),
           np.array([0.6, 0.6])]
model.set_weights(weights)

def loss(model, x, y):
    y_ = model(x)
    return tf.losses.mean_squared_error(labels=y, predictions=y_)

def grad(model, inputs, targets):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    return loss_value, tape.gradient(loss_value, model.trainable_variables)
But in the gradient calculation, the weight set with tf.constant(0) has a gradient that is not equal to zero.
Am I misunderstanding something?
How can I set a weight (or some weights) of a layer (not all weights of the layer) to a constant value that should not change during training?
My answer is based on the CustomConnected layer from this answer. As I said in a comment, when you multiply a weight w_ij by c_ij=0 via the connections matrix, the gradient of the loss with respect to that weight becomes zero as well (since the last factor in the chain rule corresponds to c_ij=0).
Here is a minimal example in Keras:
# Using CustomConnected from:
# https://stackoverflow.com/questions/50290769/specify-connections-in-nn-in-keras
import tensorflow as tf
import numpy as np
tf.enable_eager_execution()
# Define model
inp = tf.keras.layers.Input(shape=(2,))
c = np.array([[1., 1.], [1., 0.]], dtype=np.float32)
h = CustomConnected(2, c)(inp)
model = tf.keras.models.Model(inp, h)
# Set initial weights and compile
w = [np.random.rand(2, 2) * c]
model.set_weights(w)
model.compile(tf.train.AdamOptimizer(), 'mse')
# Check gradients
x = tf.constant(np.random.rand(10, 2), dtype=tf.float32)
y = np.random.rand(10, 2)
with tf.GradientTape() as tape:
    loss_value = tf.losses.mean_squared_error(labels=y, predictions=model(x))
grad = tape.gradient(loss_value, model.trainable_variables)
print('Gradients: ', grad[0])
Note that I set c[1,1]=0 so the gradient corresponding to weight w[1,1] is 0 regardless of the input.
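For completeness, here is a minimal sketch of what such a masked layer could look like. This is a paraphrase of the idea in the linked answer, not a verbatim copy: a Dense subclass whose kernel is multiplied element-wise by a fixed connections matrix.
class CustomConnected(tf.keras.layers.Dense):
    def __init__(self, units, connections, **kwargs):
        # connections: constant 0/1 mask of shape (input_dim, units)
        super(CustomConnected, self).__init__(units, **kwargs)
        self.connections = connections

    def call(self, inputs):
        # Masking the kernel zeroes both the weight's effect on the output
        # and, by the chain rule, its gradient
        output = tf.matmul(inputs, self.kernel * self.connections)
        if self.use_bias:
            output = tf.nn.bias_add(output, self.bias)
        return output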
I am trying to train a mixture model but I am unclear how to specify a trainable array argument in order to allow the weights to be updated. So if I have the following with weights hard coded
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
weights = [0.2, 0.8]
dist = tfd.Mixture(cat=tfd.Categorical(probs=weights),
                   components=[tfd.Normal(loc=tf.Variable(0., name='loc1'), scale=tf.Variable(1., name='scale1')),
                               tfd.Normal(loc=tf.Variable(0., name='loc2'), scale=tf.Variable(1., name='scale2'))])
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
@tf.function
def train_step(X):
    with tf.GradientTape() as tape:
        loss = -tf.reduce_mean(dist.log_prob(X))
    gradients = tape.gradient(loss, dist.trainable_variables)
    optimizer.apply_gradients(zip(gradients, dist.trainable_variables))
    return loss

for i in range(20000):
    loss = train_step(X)
where X is a Numpy array with shape (272, 1).
Now let's say I want to learn the weights. If I try, in the Categorical distribution constructor,
probs=[tf.Variable(0.2, name='weight1'), tf.Variable(0.8, name='weight2')]
then I get the error "No gradients provided for any variable".
If I try
probs=tf.Variable([tf.Variable(0.2, name='weight1'), tf.Variable(0.8, name='weight2')], trainable=True, name='weights')
then weight1 and weight2 do not appear in the list of trainable_variables; weights is listed but does not update.
What is the correct way to specify the weights for the probs argument so they are updated during training?
Maybe try the following:
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
dist = tfd.Mixture(cat=tfd.Categorical(probs=tf.Variable([0.2, 0.8])),
                   components=[tfd.Normal(loc=tf.Variable(0., name='loc1'), scale=tf.Variable(1., name='scale1')),
                               tfd.Normal(loc=tf.Variable(0., name='loc2'), scale=tf.Variable(1., name='scale2'))])
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

@tf.function
def train_step(X):
    with tf.GradientTape() as tape:
        loss = -tf.reduce_mean(dist.log_prob(X))
        tf.print(dist.trainable_variables)
    gradients = tape.gradient(loss, dist.trainable_variables)
    optimizer.apply_gradients(zip(gradients, dist.trainable_variables))
    return loss

for i in range(10):
    loss = train_step(tf.random.normal((272, 1)))
This prints the trainable variables at each step:
([0.2 0.8], 0, 1, 0, 1)
([0.2 0.8], -0.00999249145, 1.00999844, -0.0099981213, 1.00999963)
([0.200921655 0.799828708], -0.00638755737, 1.00682414, -0.00639217719, 1.00682521)
([0.20176363 0.799696386], -0.000149463303, 1.00765562, -0.000160227064, 1.00764322)
([0.200775564 0.800094664], 0.000889031217, 1.00637043, 0.000898908474, 1.00636196)
([0.199177444 0.800768435], -0.00115872873, 1.0025779, -0.00113528164, 1.0025754)
([0.19703567 0.801662683], -0.000830670586, 0.998396218, -0.000778611051, 0.998392522)
([0.193336055 0.80336237], 0.00244163908, 0.993740082, 0.00255049323, 0.993718445)
([0.192727238 0.803925216], 0.00376213156, 0.989788294, 0.00386576797, 0.989756942)
([0.194845349 0.802922785], 0.0022987891, 0.986021399, 0.00232516858, 0.985970497)
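One caveat (my addition, not from the original answer): optimizing probs directly does not constrain the weights to stay non-negative and sum to one. A common alternative is to parameterize the Categorical with unconstrained logits instead, for example:
# Sketch under that assumption: learn unconstrained mixture logits
mixture_logits = tf.Variable([0., 0.], name='mixture_logits')
dist = tfd.Mixture(cat=tfd.Categorical(logits=mixture_logits),
                   components=[tfd.Normal(loc=tf.Variable(0., name='loc1'), scale=tf.Variable(1., name='scale1')),
                               tfd.Normal(loc=tf.Variable(0., name='loc2'), scale=tf.Variable(1., name='scale2'))])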
Let's suppose we have a neural net with three layers: Inputs > Hidden > Outputs, and consider the weights between the Hidden and Outputs layers: W, b, where W is a matrix of shape (N, M). By default, all components of W and b are set as trainable in Keras. I know how to set the entire W or b as non-trainable, as in the link below:
How to set parameters in keras to be non-trainable?
What I want is to be able to set only a specific component of W (for example) to be non-trainable. For instance, if:
W = [[W11, W12],
     [W21, W22]]
which can be rewritten as
W = [W1, W2] with W1 = [W11, W12] and W2 = [W21, W22],
and both W1 and W2 are of type tf.Variable,
how do I set, for instance, W1 as non-trainable?
I looked through some other topics, but none of them helped me get what I want. Some example links are below:
Link 1 : https://keras.io/guides/transfer_learning/
Link 2 : https://github.com/tensorflow/tensorflow/issues/47597
Can anyone help me to solve this?
Thank you in advance
The tensor W is stored as a single tf.Variable (not four variables w11, w12, w21, w22), and tf.Variable.trainable controls entire tensors, not sub-tensors. Worse yet, inside a Keras layer, all variables have the same trainable attribute, because they are controlled by the tf.keras.layers.Layer.trainable attribute.
To do what you want, you'd want two variables W1 and W2, each wrapped in a different instance of a layer. You'd apply each layer to the input, resulting in half the answer. Then you can concat to get the complete answer.
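A minimal sketch of that idea (my illustration; the layer names and sizes are arbitrary), where the kernels of the two sub-layers play the roles of W1 and W2:
inp = tf.keras.layers.Input(shape=(2,))

frozen = tf.keras.layers.Dense(1, use_bias=False, name='holds_W1')
frozen.trainable = False  # this half of W is never updated

adaptable = tf.keras.layers.Dense(1, use_bias=False, name='holds_W2')

# each sub-layer computes half of the output; concatenate for the full result
out = tf.keras.layers.Concatenate()([frozen(inp), adaptable(inp)])
model = tf.keras.models.Model(inp, out)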
You can create your own layers in Keras. This lets you customize the weights within a layer, e.g., whether or not they are trainable.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # suppress TensorFlow messages
import tensorflow as tf
from keras.layers import *
from keras.models import *

# Your custom layer
class Linear(Layer):
    def __init__(self, units=32, **kwargs):
        super(Linear, self).__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        # trainable kernel
        self.w = self.add_weight(
            shape=(input_shape[-1], self.units),
            initializer="random_normal",
            trainable=True,
        )
        # non-trainable bias
        self.b = self.add_weight(
            shape=(self.units,), initializer="random_normal", trainable=False
        )

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b
In Linear, the weights w are trainable and the bias b is not. Below, I create a training loop on dummy data to visualize the weight updates.
batch_size = 10
input_shape = (batch_size, 5, 5)

## model
model = Sequential()
model.add(Input(shape=input_shape))
model.add(Linear(units=4, name='my_linear_layer'))
model.add(Dense(1))

## dummy dataset
x = tf.random.normal(input_shape)  # dummy input
y = tf.ones((batch_size, 1))       # dummy output

## loss function and optimizer
loss_fn = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)

### training loop
epochs = 3
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    tf.print(model.get_layer('my_linear_layer').get_weights())

    # Open a GradientTape to record the operations run
    # during the forward pass, which enables auto-differentiation.
    with tf.GradientTape() as tape:
        # Run the forward pass of the layer.
        # The operations that the layer applies
        # to its inputs are going to be recorded
        # on the GradientTape.
        logits = model(x, training=True)  # Logits for this minibatch

        # Compute the loss value for this minibatch.
        loss_value = loss_fn(y, logits)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss.
    grads = tape.gradient(loss_value, model.trainable_weights)

    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
This loop prints the following result:
Start of epoch 0
[array([[ 0.08920084, -0.04294993, 0.06111819, 0.08334437],
[-0.0369432 , -0.05014499, 0.0305218 , -0.07486793],
[-0.01227043, 0.09460627, -0.0560123 , 0.01324316],
[-0.00255878, 0.00214959, -0.02924518, 0.04721532],
[-0.05532415, -0.02014978, -0.06785563, -0.07330619]],
dtype=float32),
array([ 0.02154647, 0.05153348, -0.00128291, -0.06794706], dtype=float32)]
Start of epoch 1
[array([[ 0.08961578, -0.04327399, 0.06152926, 0.08325274],
[-0.03829437, -0.04908974, 0.02918325, -0.07456956],
[-0.01417133, 0.09609085, -0.05789544, 0.01366292],
[-0.00236284, 0.00199657, -0.02905108, 0.04717206],
[-0.05536905, -0.02011472, -0.06790011, -0.07329627]],
dtype=float32),
array([ 0.02154647, 0.05153348, -0.00128291, -0.06794706], dtype=float32)]
Start of epoch 2
[array([[ 0.09001605, -0.04358549, 0.06192534, 0.08316355],
[-0.03960795, -0.04806747, 0.02788337, -0.07427685],
[-0.01599812, 0.09751251, -0.05970317, 0.01406999],
[-0.00217021, 0.00184666, -0.02886046, 0.04712913],
[-0.05540781, -0.02008455, -0.06793848, -0.07328764]],
dtype=float32),
array([ 0.02154647, 0.05153348, -0.00128291, -0.06794706], dtype=float32)]
As you can see, while the weights w are updated, the bias b stays constant.
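The same would hold with compile()/fit(), since Keras only applies gradients to trainable weights. A quick way to confirm which weights the optimizer will touch (my addition, using the model defined above):
# w appears here ...
print(model.get_layer('my_linear_layer').trainable_weights)
# ... while b appears here and is skipped by tape.gradient / apply_gradients
print(model.get_layer('my_linear_layer').non_trainable_weights)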
I'm trying to solve a similar problem at the moment. What you need to do is first use the functional API of Keras. Then put all the weights that you want to be trainable into one layer and all the weights you want to be non-trainable into another layer. Have the previous layer feed into both of these layers. Then you can use a Concatenate layer to combine them again. So say you had a hidden layer with 5 neurons, 3 of which should be trainable and 2 of which should be non-trainable:
X = Dense(5, activation='relu')(X)  # previous layer
Y = Dense(3, activation='relu', name='trainable_layer')(X)
frozen = Dense(2, activation='relu', name='non_trainable_layer')
frozen.trainable = False  # set trainable on the layer, not on its output tensor
Z = frozen(X)
X = Concatenate()([Y, Z])
X = Dense(5, activation='relu')(X)  # next layer, following the mixed trainable weights
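Note that trainable must be set before the model is compiled; if you change it afterwards, call compile() again for the change to take effect.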
I have the following code, written in tf.keras with TensorFlow 2. Basically, I need the cross entropy term's gradient with respect to the variable temperature. dce1_dx correctly calculates the derivative. But when I wrap the same cross entropy calculation into a tf.keras.Model object, the second gradient calculation, dce2_dx, returns None. What is the difference between these two tf.GradientTape calculations? I am experienced in TF1 but new to TF2 and eager execution, so I am stuck at that point.
import numpy as np
import tensorflow as tf
logits = np.random.uniform(low=-10.0, high=10.0, size=(10000, 5))
labels = np.random.randint(low=0, high=5, size=(10000,))

logits_tf = tf.keras.Input(name="logits_tf", shape=(logits.shape[1]), dtype=tf.float32)
labels_tf = tf.keras.Input(name="labels_tf", shape=(), dtype=tf.int32)

dataset = tf.data.Dataset.from_tensor_slices((logits, labels))
dataset = dataset.batch(batch_size=logits.shape[0])

for lgts, idx in dataset:
    temperature = tf.Variable(name="temperature", dtype=tf.float32,
                              initial_value=tf.constant(2.0), trainable=True)
    scaled_logits = logits_tf / temperature
    ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    ce_loss = ce_loss(labels_tf, scaled_logits)
    model = tf.keras.Model(inputs=[logits_tf, labels_tf], outputs=[ce_loss],
                           name="calibration_model")

    with tf.GradientTape() as tape0:
        tape0.watch(temperature)
        scaled_lgts = tf.cast(lgts, tf.float32) / temperature
        ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        ce = ce(idx, scaled_lgts)
    dce1_dx = tape0.gradient(ce, temperature)

    with tf.GradientTape() as tape1:
        # Compute the derivative: d{CrossEntropy}/d{Temperature}
        tape1.watch(temperature)
        ce2 = model([lgts, idx])
    # !!! Returns None !!!
    dce2_dx = tape1.gradient(ce2, temperature)
I am writing a custom optimizer with eager execution in TensorFlow 1.15 but can't figure out how to update the weights.
Taking gradient descent as an example, I have the weights, the gradient and a scalar learning rate but can't figure out how to combine them.
This is an implementation of gradient descent where model is a keras.Model e.g. a multilayer CNN:
lr = tf.constant(0.01)

def minimize(model, inputs, targets):
    with tf.GradientTape() as tape:
        logits = model(inputs)
        loss_value = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=targets)
    grad = tape.gradient(loss_value, model.trainable_variables)
    step = tf.multiply(lr, grad)
    model.trainable_variables.assign_sub(step)
but it fails on the tf.multiply saying
tensorflow.python.framework.errors_impl.InvalidArgumentError: Shapes of all inputs must match: values[0].shape = [5,5,1,6] != values[1].shape = [6] [Op:Pack] name: packed
I also know the last line will fail as trainable_variables is a list and doesn't have the method assign_sub.
How can I rewrite the last two lines of my code to do:
model.trainable_variables -= lr * grad
Figured it out. As both are lists, we need to iterate through the pairs of gradients and variables for each layer together and update each pair separately.
lr = tf.constant(0.01)

def minimize(model, inputs, targets):
    with tf.GradientTape() as tape:
        logits = model(inputs)
        loss_value = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=targets)
    grad = tape.gradient(loss_value, model.trainable_variables)
    for v, g in zip(model.trainable_variables, grad):
        v.assign_sub(lr * g)
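As an aside (my addition, not part of the original answer), the same update can be expressed with a stock TF 1.x optimizer instead of the manual loop:
# Equivalent update using a built-in optimizer (TF 1.x eager)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
optimizer.apply_gradients(zip(grad, model.trainable_variables))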
I am trying to implement MAML. Therefore I need a copy of my model (model_copy) to be trained for one step, then I need my meta_model to be trained with the loss of model_copy.
I would like to do the training of model_copy in a function.
If I copy my code into that function I don't get proper gradients_meta (they will all be None).
It seems that the graphs are unconnected. How can I connect the graphs?
Any idea of what I am doing wrong? I watch a lot of variables, but that doesn't seem to make a difference.
Here is the code to reproduce this issue:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as keras_backend
def copy_model(model):
    copied_model = keras.Sequential()
    copied_model.add(keras.layers.Dense(5, input_shape=(1,)))
    copied_model.add(keras.layers.Dense(1))
    copied_model.set_weights(model.get_weights())
    return copied_model

def compute_loss(model, x, y):
    logits = model(x)  # prediction of my model
    mse = keras_backend.mean(keras.losses.mean_squared_error(y, logits))  # loss between prediction and label/truth
    return mse, logits

# meta_model to learn in outer gradient tape
meta_model = keras.Sequential()
meta_model.add(keras.layers.Dense(5, input_shape=(1,)))
meta_model.add(keras.layers.Dense(1))

# optimizer for training
optimizer = keras.optimizers.Adam()

# function to calculate model_copy's params
def do_calc(x, y, meta_model):
    with tf.GradientTape() as gg:
        model_copy = copy_model(meta_model)
        gg.watch(x)
        gg.watch(meta_model.trainable_variables)
        gg.watch(model_copy.trainable_variables)
        loss, _ = compute_loss(model_copy, x, y)
    gradient = gg.gradient(loss, model_copy.trainable_variables)
    optimizer.apply_gradients(zip(gradient, model_copy.trainable_variables))
    return model_copy

# inputs for training
x = tf.constant(3.0, shape=(1, 1, 1))
y = tf.constant(3.0, shape=(1, 1, 1))

with tf.GradientTape() as g:
    g.watch(x)
    g.watch(y)
    model_copy = do_calc(x, y, meta_model)
    g.watch(model_copy.trainable_variables)
    # calculate loss of model_copy
    test_loss, _ = compute_loss(model_copy, x, y)

# build gradients for meta_model update
gradients_meta = g.gradient(test_loss, meta_model.trainable_variables)
# the gradients are always None!?
optimizer.apply_gradients(zip(gradients_meta, meta_model.trainable_variables))
Thank you in advance for any help.
I found a solution: I needed to "connect" the meta-model and the model-copy somehow.
Can anybody explain why this works, and how I would achieve the same using a "proper" optimizer?
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as keras_backend
def copy_model(model):
    copied_model = keras.Sequential()
    copied_model.add(keras.layers.Dense(5, input_shape=(1,)))
    copied_model.add(keras.layers.Dense(1))
    copied_model.set_weights(model.get_weights())
    return copied_model

def compute_loss(model, x, y):
    logits = model(x)  # prediction of my model
    mse = keras_backend.mean(keras.losses.mean_squared_error(y, logits))  # loss between prediction and label/truth
    return mse, logits

# meta_model to learn in outer gradient tape
meta_model = keras.Sequential()
meta_model.add(keras.layers.Dense(5, input_shape=(1,)))
meta_model.add(keras.layers.Dense(1))

# optimizer for training
optimizer = keras.optimizers.Adam()

# function to calculate model_copy's params
def do_calc(meta_model, x, y, gg, alpha=0.01):
    model_copy = copy_model(meta_model)
    loss, _ = compute_loss(model_copy, x, y)
    gradients = gg.gradient(loss, model_copy.trainable_variables)
    k = 0
    for layer in range(len(model_copy.layers)):
        # calculate adapted parameters with gradient descent:
        # \theta_i' = \theta - \alpha * gradients
        model_copy.layers[layer].kernel = tf.subtract(meta_model.layers[layer].kernel,
                                                      tf.multiply(alpha, gradients[k]))
        model_copy.layers[layer].bias = tf.subtract(meta_model.layers[layer].bias,
                                                    tf.multiply(alpha, gradients[k + 1]))
        k += 2
    return model_copy

with tf.GradientTape() as g:
    # inputs for training
    x = tf.constant(3.0, shape=(1, 1, 1))
    y = tf.constant(3.0, shape=(1, 1, 1))
    adapted_models = []

    # model_copy = meta_model
    with tf.GradientTape() as gg:
        model_copy = do_calc(meta_model, x, y, gg)

    # calculate loss of model_copy
    test_loss, _ = compute_loss(model_copy, x, y)

# build gradients for meta_model update
gradients_meta = g.gradient(test_loss, meta_model.trainable_variables)
# gradients work. Why???
optimizer.apply_gradients(zip(gradients_meta, meta_model.trainable_variables))
Converting the tensors to numpy and using set_weights() only copies the updated parameter values; the copied weights become new nodes in the TF2 graph, so it is not possible to trace the loss of the copied model back to the meta model's variables. The assignment above instead makes model_copy's kernels and biases tensors computed from meta_model's variables, which keeps the two models connected for differentiation.
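To make the distinction concrete (a sketch using the names from the code above):
# Breaks the graph: get_weights()/set_weights() moves numpy values into
# fresh variables, so there is no differentiable path back to meta_model
model_copy.set_weights(meta_model.get_weights())

# Keeps the graph: the kernel is now a tensor computed from meta_model's
# kernel, so the outer tape can differentiate through it
model_copy.layers[0].kernel = tf.subtract(meta_model.layers[0].kernel,
                                          tf.multiply(alpha, gradients[0]))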