I am trying to train a mixture model but I am unclear how to specify a trainable array argument in order to allow the weights to be updated. So if I have the following, with the weights hard-coded:
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
weights = [0.2, 0.8]
dist = tfd.Mixture(cat=tfd.Categorical(probs=weights),
                   components=[tfd.Normal(loc=tf.Variable(0., name='loc1'), scale=tf.Variable(1., name='scale1')),
                               tfd.Normal(loc=tf.Variable(0., name='loc2'), scale=tf.Variable(1., name='scale2'))])
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
#tf.function
def train_step(X):
    with tf.GradientTape() as tape:
        loss = -tf.reduce_mean(dist.log_prob(X))
    gradients = tape.gradient(loss, dist.trainable_variables)
    optimizer.apply_gradients(zip(gradients, dist.trainable_variables))
    return loss

for i in range(20000):
    loss = train_step(X)
where X is a NumPy array with shape (272, 1).
Now let's say I want to learn the weights. If I try in the Categorical distribution constructor
probs=[tf.Variable(0.2, name='weight1'),tf.Variable(0.8, name='weight2')]
then I get the error "No gradients provided for any variable".
If I try
probs=tf.Variable([tf.Variable(0.2, name='weight1'),tf.Variable(0.8, name='weight2')], trainable=True, name='weights')
then weight1 and weight2 do not appear in the list of trainable_variables. weights is listed but does not update.
What is the correct way to specify the weights to the probs argument so they will be updated during training?
Maybe try the following:
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
dist = tfd.Mixture(cat=tfd.Categorical(probs=tf.Variable([0.2, 0.8])),
                   components=[tfd.Normal(loc=tf.Variable(0., name='loc1'), scale=tf.Variable(1., name='scale1')),
                               tfd.Normal(loc=tf.Variable(0., name='loc2'), scale=tf.Variable(1., name='scale2'))])
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
#tf.function
def train_step(X):
    with tf.GradientTape() as tape:
        loss = -tf.reduce_mean(dist.log_prob(X))
    tf.print(dist.trainable_variables)
    gradients = tape.gradient(loss, dist.trainable_variables)
    optimizer.apply_gradients(zip(gradients, dist.trainable_variables))
    return loss

for i in range(10):
    loss = train_step(tf.random.normal((272, 1)))
([0.2 0.8], 0, 1, 0, 1)
([0.2 0.8], -0.00999249145, 1.00999844, -0.0099981213, 1.00999963)
([0.200921655 0.799828708], -0.00638755737, 1.00682414, -0.00639217719, 1.00682521)
([0.20176363 0.799696386], -0.000149463303, 1.00765562, -0.000160227064, 1.00764322)
([0.200775564 0.800094664], 0.000889031217, 1.00637043, 0.000898908474, 1.00636196)
([0.199177444 0.800768435], -0.00115872873, 1.0025779, -0.00113528164, 1.0025754)
([0.19703567 0.801662683], -0.000830670586, 0.998396218, -0.000778611051, 0.998392522)
([0.193336055 0.80336237], 0.00244163908, 0.993740082, 0.00255049323, 0.993718445)
([0.192727238 0.803925216], 0.00376213156, 0.989788294, 0.00386576797, 0.989756942)
([0.194845349 0.802922785], 0.0022987891, 0.986021399, 0.00232516858, 0.985970497)
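As an aside (my own note, not part of the answer above): plain gradient steps on probs do not keep the weights on the probability simplex, so a common alternative is to parameterize the Categorical with unconstrained logits and recover the weights with a softmax. A minimal sketch of that variant:
# Hypothetical variant, not from the answer above: learn unconstrained
# mixture logits instead of raw probabilities.
mixture_logits = tf.Variable([0., 0.], name='mixture_logits')
dist = tfd.Mixture(cat=tfd.Categorical(logits=mixture_logits),
                   components=[tfd.Normal(loc=tf.Variable(0., name='loc1'), scale=tf.Variable(1., name='scale1')),
                               tfd.Normal(loc=tf.Variable(0., name='loc2'), scale=tf.Variable(1., name='scale2'))])
# after training, the learned mixture weights are tf.nn.softmax(mixture_logits)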
Related
I have the following code, written in tf.keras with TensorFlow 2. Basically, I need the cross entropy term's gradient with respect to the variable temperature. dce1_dx correctly calculates the derivative. But when I wrap the same cross entropy calculation into a tf.keras.Model object, the second gradient calculation, dce2_dx, returns None. What is the difference between these two tf.GradientTape calculations? I am experienced in TF1 but new to TF2 and eager execution, so I am stuck at this point.
import numpy as np
import tensorflow as tf
logits = np.random.uniform(low=-10.0, high=10.0, size=(10000, 5))
labels = np.random.randint(low=0, high=5, size=(10000, ))
logits_tf = tf.keras.Input(name="logits_tf", shape=(logits.shape[1]), dtype=tf.float32)
labels_tf = tf.keras.Input(name="labels_tf", shape=(), dtype=tf.int32)
dataset = tf.data.Dataset.from_tensor_slices((logits, labels))
dataset = dataset.batch(batch_size=logits.shape[0])
for lgts, idx in dataset:
    temperature = tf.Variable(name="temperature", dtype=tf.float32,
                              initial_value=tf.constant(2.0), trainable=True)
    scaled_logits = logits_tf / temperature
    ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    ce_loss = ce_loss(labels_tf, scaled_logits)
    model = tf.keras.Model(inputs=[logits_tf, labels_tf], outputs=[ce_loss], name="calibration_model")

    with tf.GradientTape() as tape0:
        tape0.watch(temperature)
        scaled_lgts = tf.cast(lgts, tf.float32) / temperature
        ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        ce = ce(idx, scaled_lgts)
    dce1_dx = tape0.gradient(ce, temperature)

    with tf.GradientTape() as tape1:
        # Compute the derivative: d{CrossEntropy}/d{Temperature}
        tape1.watch(temperature)
        ce2 = model([lgts, idx])
    # !!!Returns None!!!
    dce2_dx = tape1.gradient(ce2, temperature)
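One possible workaround (a sketch under my own assumptions, not from the original post) is to make the temperature a weight owned by a custom Keras layer, so the model tracks it as a trainable variable, and to compute the cross entropy outside the model:
# Sketch only: TemperatureScaling is a hypothetical layer, not from the post.
class TemperatureScaling(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.temperature = self.add_weight(name="temperature", shape=(), trainable=True,
                                           initializer=tf.keras.initializers.Constant(2.0))

    def call(self, logits):
        return logits / self.temperature

logits_in = tf.keras.Input(shape=(5,), dtype=tf.float32)
scaling_model = tf.keras.Model(logits_in, TemperatureScaling()(logits_in))
ce_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# lgts and idx are the batch from the dataset loop above
with tf.GradientTape() as tape:
    ce3 = ce_fn(idx, scaling_model(tf.cast(lgts, tf.float32)))
dce3_dtemp = tape.gradient(ce3, scaling_model.trainable_variables)  # gradient w.r.t. the layer-owned temperature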
I am using a TF2 (2.3.0) neural network to approximate the function y which solves the ODE y' + 3y = 0.
I have defined a custom loss class and function in which I try to differentiate the single output with respect to the single input so that the equation holds, provided that y_true is zero:
from tensorflow.keras.losses import Loss
import tensorflow as tf
class CustomLossOde(Loss):
    def __init__(self, x, model, name='ode_loss'):
        super().__init__(name=name)
        self.x = x
        self.model = model

    def call(self, y_true, y_pred):
        with tf.GradientTape() as tape:
            tape.watch(self.x)
            y_p = self.model(self.x)
        dy_dx = tape.gradient(y_p, self.x)
        loss = tf.math.reduce_mean(tf.square(dy_dx + 3 * y_pred - y_true))
        return loss
but running the following NN:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input
from custom_loss_ode import CustomLossOde
num_samples = 1024
x_train = 4 * (tf.random.uniform((num_samples, )) - 0.5)
y_train = tf.zeros((num_samples, ))
inputs = Input(shape=(1,))
x = Dense(16, 'tanh')(inputs)
x = Dense(8, 'tanh')(x)
x = Dense(4)(x)
y = Dense(1)(x)
model = Model(inputs=inputs, outputs=y)
loss = CustomLossOde(model.input, model)
model.compile(optimizer=Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.99),loss=loss)
model.run_eagerly = True
model.fit(x_train, y_train, batch_size=16, epochs=30)
For now I am getting 0 loss from the first epoch, which doesn't make any sense.
I have printed both y_true and y_pred from within the function and they seem OK, so I suspect that the problem is in the gradient, which I didn't succeed in printing.
Appreciate any help.
Defining a custom loss with the high-level Keras API is a bit difficult in this case. I would instead write the training loop from scratch, as it allows finer-grained control over what you can do.
I took inspiration from these two guides:
Advanced Automatic Differentiation
Writing a training loop from scratch
Basically, I used the fact that multiple tapes can interact seamlessly. I use one to compute the loss function and the other to calculate the gradients to be propagated by the optimizer.
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input
num_samples = 1024
x_train = 4 * (tf.random.uniform((num_samples, )) - 0.5)
y_train = tf.zeros((num_samples, ))
inputs = Input(shape=(1,))
x = Dense(16, 'tanh')(inputs)
x = Dense(8, 'tanh')(x)
x = Dense(4)(x)
y = Dense(1)(x)
model = Model(inputs=inputs, outputs=y)
# using the high level tf.data API for data handling
x_train = tf.reshape(x_train,(-1,1))
dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train)).batch(1)
opt = Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.99)
for step, (x, y_true) in enumerate(dataset):
    # we need to convert x to a variable if we want the tape to be
    # able to compute the gradient according to x
    x_variable = tf.Variable(x)
    with tf.GradientTape() as model_tape:
        with tf.GradientTape() as loss_tape:
            loss_tape.watch(x_variable)
            y_pred = model(x_variable)
        dy_dx = loss_tape.gradient(y_pred, x_variable)
        loss = tf.math.reduce_mean(tf.square(dy_dx + 3 * y_pred - y_true))
    grad = model_tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grad, model.trainable_variables))
    if step % 20 == 0:
        print(f"Step {step}: loss={loss.numpy()}")
I'm building a loss function, and I need to use the y_true and y_pred as indexes for a matrix which I'm using to calculate the loss. The problem is, both of these come as float tensors, and functions like cast() and round() are not differentiable, so I can't use them inside the loss functions.
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from sklearn.utils import shuffle
from keras.utils import to_categorical
K = tf.keras.backend
A = np.random.randint(12, size=(12,12))
def score_loss(y_true, y_pred):
    y_pred = tf.nn.softmax(y_pred)
    y_true = tf.nn.softmax(y_true)
    y_pred = K.cast(y_pred, "int32")
    y_true = K.cast(y_true, "int32")
    loss = -K.sum(tf.gather_nd(A, tf.stack((y_true, y_pred), axis=-1)))
    return loss
data = np.random.rand(1000,10)
data_y = np.array(range(0,10))
X = data[:, 0:8]
y = data[:, 9]
for i in range(0, len(y)):
    y[i] = data_y[i % 9]
y = shuffle(y)
y = to_categorical(y, 9)
model = Sequential()
model.add(Dense(200, input_shape = (8,), activation = "relu"))
model.add(Dense(9, activation = "softmax"))
model.compile(loss = score_loss, optimizer= Adam())
Getting the following error during training:
ValueError: An operation has `None` for gradient. Please make sure that all of your ops have a gradient defined (i.e. are differentiable). Common ops without gradient: K.argmax, K.round, K.eval.
I need another way of making the conversion to integers or an alternative way of doing the whole thing.
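One differentiable direction worth sketching (an assumption on my part, not a verified solution): rather than casting to integers and indexing A with them, treat the softmax outputs as probabilities and take the expected value of the score matrix under them, which keeps the gradient defined:
# Sketch of a differentiable variant; assumes A is truncated to the 9 classes
# the model actually predicts.
A_soft = tf.constant(A[:9, :9], dtype=tf.float32)

def soft_score_loss(y_true, y_pred):
    y_pred = tf.nn.softmax(y_pred)
    # y_true is already one-hot; the expected score per sample is y_true^T A y_pred
    expected_score = tf.einsum('bi,ij,bj->b', y_true, A_soft, y_pred)
    return -K.sum(expected_score)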
Let's say I have the following constraints and the network:
The architecture is fixed (see this image) (note that there are no biases)
Activation function for the hidden layer is ReLU
There's no activation function for the output layer (it should just return the sum of the inputs it receives).
I tried to implement this in PyTorch with various initialization schemes and different data sets, but I failed (the code is at the bottom).
My questions are:
Is there anything wrong with my NN training process?
Is this a feasible problem? If yes, how?
If this is doable, can we still achieve it by constraining the weights to be in the set {-1, 0, 1}?
Code:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
import numpy as np
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.fc1 = nn.Linear(2, 2, bias=False)
        self.fc2 = nn.Linear(2, 1, bias=False)
        self.rl = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.rl(x)
        x = self.fc2(x)
        return x
#create an XOR data set to train
rng = np.random.RandomState(0)
X = rng.randn(200, 2)
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype('int32')
# test data set
X_test = np.array([[0,0],[0,1], [1,0], [1,1]])
train = data_utils.TensorDataset(torch.from_numpy(X).float(), \
torch.from_numpy(y).float())
train_loader = data_utils.DataLoader(train, batch_size=50, shuffle=True)
test = torch.from_numpy(X_test).float()
# training the network
num_epoch = 10000
net = Network()
net.fc1.weight.data.clamp_(min=-1, max=1)
net.fc2.weight.data.clamp_(min=-1, max=1)
# define loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters())
for epoch in range(num_epoch):
    running_loss = 0  # loss per epoch
    for (X, y) in train_loader:
        # make the grads zero
        optimizer.zero_grad()
        # forward propagate
        out = net(X)
        # calculate loss and update
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.data
    if epoch % 500 == 0:
        print("Epoch: {0} Loss: {1}".format(epoch, running_loss))
The loss doesn't improve. It gets stuck at some value after a few epochs (I'm not sure how to make this reproducible, as I'm getting different values every time).
net(test) returns a set of predictions that are nowhere close to the XOR output.
You need to use a nonlinear activation function such as sigmoid in your hidden and output layers, because XOR is not linearly separable. Biases are also required.
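A minimal sketch of the kind of change suggested above (my own example, untested on the original data): biases enabled, sigmoid activations, BCE loss, and the target reshaped to match the (batch, 1) output:
import torch
import torch.nn as nn

class XorNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 2, bias=True)
        self.fc2 = nn.Linear(2, 1, bias=True)

    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))
        return torch.sigmoid(self.fc2(x))

net = XorNet()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
# inside the training loop: loss = criterion(net(X), y.view(-1, 1))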
I build my model using tf.keras.layers.Dense. In the first layer of my model I want some weights to be constant zero. In the gradient calculation these weights should then get a gradient of zero (as the last term in the chain rule corresponds to the weight, which is 0 for a constant).
This is my approach so far:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
tf.enable_eager_execution()
model = tf.keras.Sequential([
    tf.keras.layers.Dense(2, activation=tf.sigmoid, input_shape=(2,)),
    tf.keras.layers.Dense(2, activation=tf.sigmoid)
])
weights=[np.array([[tf.constant(0), 0.25],[0.2,0.3]]),np.array([0.35,0.35]),np.array([[0.4,0.5],[0.45, 0.55]]),np.array([0.6,0.6])]
model.set_weights(weights)
def loss(model, x, y):
    y_ = model(x)
    return tf.losses.mean_squared_error(labels=y, predictions=y_)

def grad(model, inputs, targets):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    return loss_value, tape.gradient(loss_value, model.trainable_variables)
But in the gradient calculation the weight tf.constant(0) gets a gradient that is not equal to zero.
Am I misunderstanding something?
How can I set a weight (or some weights) in a layer (not all weights in a layer) to a constant value that should not change during training?
My answer is based on the CustomConnected layer from this answer. As I said in a comment, when you multiply a weight w_ij by c_ij=0 via the connections matrix, the gradient of the loss with respect to that weight becomes zero as well (since the last factor in the chain rule corresponds to c_ij=0).
Here is a minimal example in Keras:
# Using CustomConnected from:
# https://stackoverflow.com/questions/50290769/specify-connections-in-nn-in-keras
import tensorflow as tf
import numpy as np
tf.enable_eager_execution()
# Define model
inp = tf.keras.layers.Input(shape=(2,))
c = np.array([[1., 1.], [1., 0.]], dtype=np.float32)
h = CustomConnected(2, c)(inp)
model = tf.keras.models.Model(inp, h)
# Set initial weights and compile
w = [np.random.rand(2, 2) * c]
model.set_weights(w)
model.compile(tf.train.AdamOptimizer(), 'mse')
# Check gradients
x = tf.constant(np.random.rand(10, 2), dtype=tf.float32)
y = np.random.rand(10, 2)
with tf.GradientTape() as tape:
    loss_value = tf.losses.mean_squared_error(labels=y, predictions=model(x))
grad = tape.gradient(loss_value, model.trainable_variables)
print('Gradients: ', grad[0])
Note that I set c[1,1]=0 so the gradient corresponding to weight w[1,1] is 0 regardless of the input.
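For completeness, here is a minimal sketch of what a CustomConnected-style layer could look like (my own approximation of the layer described in the linked answer, not its exact code): a Dense subclass whose kernel is multiplied element-wise by the fixed connections matrix, so masked entries contribute nothing and receive zero gradient.
class CustomConnected(tf.keras.layers.Dense):
    def __init__(self, units, connections, **kwargs):
        super(CustomConnected, self).__init__(units, **kwargs)
        # constant 0/1 mask with the same shape as the kernel
        self.connections = connections

    def call(self, inputs):
        output = tf.matmul(inputs, self.kernel * self.connections)
        if self.use_bias:
            output = tf.nn.bias_add(output, self.bias)
        if self.activation is not None:
            output = self.activation(output)
        return output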