No gradients provided for any variable for custom loss function

No gradients provided for any variable for custom loss function - python

I have created a custom loss function in Keras as follows:
import tensorflow as tf
import numpy as np
def custom_loss(y_true, y_pred):
cce = tf.keras.losses.CategoricalCrossentropy()
loss = cce(y_true, y_pred).numpy()
epsilon = np.finfo(np.float32).eps
confidence = np.clip(y_true.numpy(), epsilon, 1.-epsilon)
sample_entropy = -1. * np.sum(np.multiply(confidence, np.log(confidence) / np.log(np.e)), axis=-1)
entropy = np.mean(sample_entropy)
penalty = 0.1 * -entropy
return loss + penalty
When I use this custom loss function I'm getting the error message
ValueError: No gradients provided for any variable
Somehow now gradients can be calculated. How needs the loss function be changed so that gradients can be calculated?

Tensorflow need tensor to store the dependency info to let gradients flow backwards, if you convert tensor to numpy array in loss function then you break this dependency thus no gradients provided for any variable, so you need change every np operation in loss function to corresponding tf or backend operation, e.g:
import tensorflow as tf
import numpy as np
cce = tf.keras.losses.CategoricalCrossentropy()
epsilon = np.finfo(np.float32).eps
def custom_loss(y_true, y_pred):
loss = cce(y_true, y_pred)
confidence = tf.clip_by_value(y_true, epsilon, 1.-epsilon)
sample_entropy = -1. * tf.reduce_sum(tf.math.multiply(confidence, tf.math.log(confidence) / tf.math.log(np.e)), axis=-1)
entropy = tf.reduce_mean(sample_entropy)
penalty = 0.1 * -entropy
return loss + penalty

Related

Trainable array in Tensorflow probability

I am trying to train a mixture model but I am unclear how to specify a trainable array argument in order to allow the weights to be updated. So if I have the following with weights hard coded
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
weights = [0.2, 0.8]
dist = tfd.Mixture(cat=tfd.Categorical(probs=weights),
components=[tfd.Normal(loc=tf.Variable(0., name='loc1'), scale=tf.Variable(1., name='scale1')),
tfd.Normal(loc=tf.Variable(0., name='loc2'), scale=tf.Variable(1., name='scale2'))])
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
#tf.function
def train_step(X):
with tf.GradientTape() as tape:
loss = -tf.reduce_mean(dist.log_prob(X))
gradients = tape.gradient(loss,dist.trainable_variables)
optimizer.apply_gradients(zip(gradients, dist.trainable_variables))
return loss
for i in range(20000):
loss = train_step(X)
where X is a 1D Numpy array with shape (272, 1)
Now let's say I want to learn the weights. If I try in the Categorical distribution constructor
probs=[tf.Variable(0.2, name='weight1'),tf.Variable(0.8, name='weight2')]
then I get an error "No gradients provided for any variable"
if I try
probs=tf.Variable([tf.Variable(0.2, name='weight1'),tf.Variable(0.8, name='weight2')], trainable=True, name='weights')
then weight1 and weight2 do not appear in the list of trainablevariables. weights is listed but does not update.
What is the correct way to specify the weights to the probs argument so they will be updated during training?

Maybe try the following:
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd
dist = tfd.Mixture(cat=tfd.Categorical(probs=tf.Variable([0.2, 0.8])),
components=[tfd.Normal(loc=tf.Variable(0., name='loc1'), scale=tf.Variable(1., name='scale1')),
tfd.Normal(loc=tf.Variable(0., name='loc2'), scale=tf.Variable(1., name='scale2'))])
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
#tf.function
def train_step(X):
with tf.GradientTape() as tape:
loss = -tf.reduce_mean(dist.log_prob(X))
tf.print(dist.trainable_variables)
gradients = tape.gradient(loss, dist.trainable_variables)
optimizer.apply_gradients(zip(gradients, dist.trainable_variables)) #E
return loss
for i in range(10):
loss = train_step(tf.random.normal((272, 1)))
([0.2 0.8], 0, 1, 0, 1)
([0.2 0.8], -0.00999249145, 1.00999844, -0.0099981213, 1.00999963)
([0.200921655 0.799828708], -0.00638755737, 1.00682414, -0.00639217719, 1.00682521)
([0.20176363 0.799696386], -0.000149463303, 1.00765562, -0.000160227064, 1.00764322)
([0.200775564 0.800094664], 0.000889031217, 1.00637043, 0.000898908474, 1.00636196)
([0.199177444 0.800768435], -0.00115872873, 1.0025779, -0.00113528164, 1.0025754)
([0.19703567 0.801662683], -0.000830670586, 0.998396218, -0.000778611051, 0.998392522)
([0.193336055 0.80336237], 0.00244163908, 0.993740082, 0.00255049323, 0.993718445)
([0.192727238 0.803925216], 0.00376213156, 0.989788294, 0.00386576797, 0.989756942)
([0.194845349 0.802922785], 0.0022987891, 0.986021399, 0.00232516858, 0.985970497)

How to set a constant as a weight in a keras model?

I build my model using tf.keras.layers.Dense. In the first layer of my model I want some weights to be constant Zero. As in the gradient calculation these weights should be get a gradient = zero (as the last term in the chain rule corresponds to the weight, which is 0 for a constant).
This is my approach so far:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
tf.enable_eager_execution()
model = tf.keras.Sequential([
tf.keras.layers.Dense(2, activation=tf.sigmoid, input_shape=(2,)),
tf.keras.layers.Dense(2, activation=tf.sigmoid)
])
weights=[np.array([[tf.constant(0), 0.25],[0.2,0.3]]),np.array([0.35,0.35]),np.array([[0.4,0.5],[0.45, 0.55]]),np.array([0.6,0.6])]
model.set_weights(weights)
def loss(model, x, y):
y_ = model(x)
return tf.losses.mean_squared_error(labels=y, predictions=y_)
def grad(model, inputs, targets):
with tf.GradientTape() as tape:
loss_value = loss(model, inputs, targets)
return loss_value, tape.gradient(loss_value, model.trainable_variables)
But in the gradient calculation the weight tf.constant(0) has a gradient not equal zero.
Do I have an understanding problem?
How can I set a weight(or some weights) in a layer(not all weights in one layer) to a constant value (which should not change during training)?

My answer is based on the CustomConnected layer from this answer. As I said in a comment, when you multiply a weight w_ij by c_ij=0 via the connections matrix, the gradient of the loss with respect to that weight becomes zero as well (since the last factor in the chain rule corresponds to c_ij=0).
Here is a minimal example in Keras:
# Using CustomConnected from:
# https://stackoverflow.com/questions/50290769/specify-connections-in-nn-in-keras
import tensorflow as tf
import numpy as np
tf.enable_eager_execution()
# Define model
inp = tf.keras.layers.Input(shape=(2,))
c = np.array([[1., 1.], [1., 0.]], dtype=np.float32)
h = CustomConnected(2, c)(inp)
model = tf.keras.models.Model(inp, h)
# Set initial weights and compile
w = [np.random.rand(2, 2) * c]
model.set_weights(w)
model.compile(tf.train.AdamOptimizer(), 'mse')
# Check gradients
x = tf.constant(np.random.rand(10, 2), dtype=tf.float32)
y = np.random.rand(10, 2)
with tf.GradientTape() as tape:
loss_value = tf.losses.mean_squared_error(labels=y, predictions=model(x))
grad = tape.gradient(loss_value, model.trainable_variables)
print('Gradients: ', grad[0])
Note that I set c[1,1]=0 so the gradient corresponding to weight w[1,1] is 0 regardless of the input.

Custom Keras loss function that conditionally creates a zero gradient

My problem is I don't want the weights to be adjusted if y_true takes certain values. I do not want to simply remove those examples from training data because of the nature of the RNN I am trying to use.
Is there a way to write a conditional loss function in Keras with this behavior?
For example: if y_true is negative then apply zero gradient so that parameters in the model do not change, if y_true is positive loss = losses.mean_squared_error(y_true, y_pred).

You can define a custom loss function and simply use K.switch to conditionally get zero loss:
from keras import backend as K
from keras import losses
def custom_loss(y_true, y_pred):
loss = losses.mean_squared_error(y_true, y_pred)
return K.switch(K.flatten(K.equal(y_true, 0.)), K.zeros_like(loss), loss)
Test:
from keras import models
from keras import layers
model = models.Sequential()
model.add(layers.Dense(1, input_shape=(1,)))
model.compile(loss=custom_loss, optimizer='adam')
weights, bias = model.layers[0].get_weights()
x = np.array([1, 2, 3])
y = np.array([0, 0, 0])
model.train_on_batch(x, y)
# check if the parameters has not changed after training on the batch
>>> (weights == model.layers[0].get_weights()[0]).all()
True
>>> (bias == model.layers[0].get_weights()[1]).all()
True

Since the y's are in batches, you need to select those from the batch which are non-zero in the custom loss function
def myloss(y_true, y_pred):
idx = tf.not_equal(y_true, 0)
y_true = tf.boolean_mask(y_true, idx)
y_pred = tf.boolean_mask(y_pred, idx)
return losses.mean_squared_error(y_true, y_pred)
Then it can be used as such:
model = keras.Sequential([Dense(32, input_shape=(2,)), Dense(1)])
model.compile('adam', loss=myloss)
x = np.random.randn(2, 2)
y = np.array([1, 0])
model.fit(x, y)
But you might need extra logic in the loss function in case all y_true in the batch were zero, in this case, the loss function can be modified as such:
def myloss2(y_true, y_pred):
idx = tf.not_equal(y_true, 0)
y_true = tf.boolean_mask(y_true, idx)
y_pred = tf.boolean_mask(y_pred, idx)
loss = tf.cond(tf.equal(tf.shape(y_pred)[0], 0), lambda: tf.constant(0, dtype=tf.float32), lambda: losses.mean_squared_error(y_true, y_pred))
return loss

Why does sigmoid & crossentropy of Keras/tensorflow have low precision?

I have the following simple neural network (with 1 neuron only) to test the computation precision of sigmoid activation & binary_crossentropy of Keras:
model = Sequential()
model.add(Dense(1, input_dim=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
To simplify the test, I manually set the only weight to 1 and bias to 0, and then evaluate the model with 2-point training set {(-a, 0), (a, 1)}, i.e.
y = numpy.array([0, 1])
for a in range(40):
x = numpy.array([-a, a])
keras_ce[a] = model.evaluate(x, y)[0] # cross-entropy computed by keras/tensorflow
my_ce[a] = np.log(1+exp(-a)) # My own computation
My Question: I found the binary crossentropy (keras_ce) computed by Keras/Tensorflow reach a floor of 1.09e-7 when a is approx. 16, as illustrated below (blue line). It doesn't decrease further as 'a' keeps growing. Why is that?
This neural network has 1 neuron only whose weight is set to 1 and bias is 0. With the 2-point training set {(-a, 0), (a, 1)}, the binary_crossentropy is just
-1/2 [ log(1 - 1/(1+exp(a)) ) + log( 1/(1+exp(-a)) ) ] = log(1+exp(-a))
So the cross-entropy should decrease as a increases, as illustrated in orange ('my') above. Is there some Keras/Tensorflow/Python setup I can change to increase its precision? Or am I mistaken somewhere? I'd appreciate any suggestions/comments/answers.

TL;DR version: the probability values (i.e. the outputs of sigmoid function) are clipped due to numerical stability when computing the loss function.
If you inspect the source code, you would find that using binary_crossentropy as the loss would result in a call to binary_crossentropy function in losses.py file:
def binary_crossentropy(y_true, y_pred):
return K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1)
which in turn, as you can see, calls the equivalent backend function. In case of using Tensorflow as the backend, that would result in a call to binary_crossentropy function in tensorflow_backend.py file:
def binary_crossentropy(target, output, from_logits=False):
""" Docstring ..."""
# Note: tf.nn.sigmoid_cross_entropy_with_logits
# expects logits, Keras expects probabilities.
if not from_logits:
# transform back to logits
_epsilon = _to_tensor(epsilon(), output.dtype.base_dtype)
output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
output = tf.log(output / (1 - output))
return tf.nn.sigmoid_cross_entropy_with_logits(labels=target,
logits=output)
As you can see from_logits argument is set to False by default. Therefore, the if condition evaluates to true and as a result the values in the output are clipped to the range [epsilon, 1-epislon]. That's why no matter how small or large a probability is, it could not be smaller than epsilon and greater than 1-epsilon. And that explains why the output of binary_crossentropy loss is also bounded.
Now, what is this epsilon here? It is a very small constant which is used for numerical stability (e.g. prevent division by zero or undefined behaviors, etc.). To find out its value you can further inspect the source code and you would find it in the common.py file:
_EPSILON = 1e-7
def epsilon():
"""Returns the value of the fuzz factor used in numeric expressions.
# Returns
A float.
# Example
```python
>>> keras.backend.epsilon()
1e-07
```
"""
return _EPSILON
If for any reason, you would like more precision you can alternatively set the epsilon value to a smaller constant using set_epsilon function from the backend:
def set_epsilon(e):
"""Sets the value of the fuzz factor used in numeric expressions.
# Arguments
e: float. New value of epsilon.
# Example
```python
>>> from keras import backend as K
>>> K.epsilon()
1e-07
>>> K.set_epsilon(1e-05)
>>> K.epsilon()
1e-05
```
"""
global _EPSILON
_EPSILON = e
However, be aware that setting epsilon to an extremely low positive value or zero, may disrupt the stability of computations all over the Keras.

I think that keras take into account numerical stability,
Let's track how keras caculate
First,
def binary_crossentropy(y_true, y_pred):
return K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1)
Then,
def binary_crossentropy(target, output, from_logits=False):
"""Binary crossentropy between an output tensor and a target tensor.
# Arguments
target: A tensor with the same shape as `output`.
output: A tensor.
from_logits: Whether `output` is expected to be a logits tensor.
By default, we consider that `output`
encodes a probability distribution.
# Returns
A tensor.
"""
# Note: tf.nn.sigmoid_cross_entropy_with_logits
# expects logits, Keras expects probabilities.
if not from_logits:
# transform back to logits
_epsilon = _to_tensor(epsilon(), output.dtype.base_dtype)
output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
output = tf.log(output / (1 - output))
return tf.nn.sigmoid_cross_entropy_with_logits(labels=target,
logits=output)
Notice tf.clip_by_value is used for numerical stability
Let's compare keras binary_crossentropy, tensorflow tf.nn.sigmoid_cross_entropy_with_logits and custom loss function(eleminate vale clipping)
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
import keras
# keras
model = Sequential()
model.add(Dense(units=1, activation='sigmoid', input_shape=(
1,), weights=[np.ones((1, 1)), np.zeros(1)]))
# print(model.get_weights())
model.compile(loss='binary_crossentropy',
optimizer='adam', metrics=['accuracy'])
# tensorflow
G = tf.Graph()
with G.as_default():
x_holder = tf.placeholder(dtype=tf.float32, shape=(2,))
y_holder = tf.placeholder(dtype=tf.float32, shape=(2,))
entropy = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
logits=x_holder, labels=y_holder))
sess = tf.Session(graph=G)
# keras with custom loss function
def customLoss(target, output):
# if not from_logits:
# # transform back to logits
# _epsilon = _to_tensor(epsilon(), output.dtype.base_dtype)
# output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
# output = tf.log(output / (1 - output))
output = tf.log(output / (1 - output))
return tf.nn.sigmoid_cross_entropy_with_logits(labels=target,
logits=output)
model_m = Sequential()
model_m.add(Dense(units=1, activation='sigmoid', input_shape=(
1,), weights=[np.ones((1, 1)), np.zeros(1)]))
# print(model.get_weights())
model_m.compile(loss=customLoss,
optimizer='adam', metrics=['accuracy'])
N = 100
xaxis = np.linspace(10, 20, N)
keras_ce = np.zeros(N)
tf_ce = np.zeros(N)
my_ce = np.zeros(N)
keras_custom = np.zeros(N)
y = np.array([0, 1])
for i, a in enumerate(xaxis):
x = np.array([-a, a])
# cross-entropy computed by keras/tensorflow
keras_ce[i] = model.evaluate(x, y)[0]
my_ce[i] = np.log(1+np.exp(-a)) # My own computation
tf_ce[i] = sess.run(entropy, feed_dict={x_holder: x, y_holder: y})
keras_custom[i] = model_m.evaluate(x, y)[0]
# print(model.get_weights())
plt.plot(xaxis, keras_ce, label='keras')
plt.plot(xaxis, my_ce, 'b', label='my_ce')
plt.plot(xaxis, tf_ce, 'r:', linewidth=5, label='tensorflow')
plt.plot(xaxis, keras_custom, '--', label='custom loss')
plt.xlabel('a')
plt.ylabel('xentropy')
plt.yscale('log')
plt.legend()
plt.savefig('compare.jpg')
plt.show()
we can see that tensorflow is same with manual computing, but keras with custom loss encounter numeric overflow as expected.

Custom TensorFlow metric: true positive rate at given false positive rate

I have a binary classification problem with categories background (bg) = 0, signal (sig) = 1, for which I am training NNs. For monitoring purposes, I am trying to implement a custom metric in Keras with TensorFlow backend that does the following:
1) Calculate the threshold on my NN output which would result in a false positive rate (classifying bg as signal) of X (in this case X = 0.02, but it could be anything).
2) Calculate the true positive rate at this threshold.
Given numpy arrays y_true, y_pred, I would write a function like:
def eff_at_2percent_metric(y_true, y_pred):
#Find list of bg events
bg_list = np.argwhere(y_true < 0.5)
#Order by the NN output
ordered_bg_predictions = np.flip(np.sort(y_pred[bg_list]),axis=0)
#Find the threshold with 2% false positive rate
threshold = ordered_bg_predictions[0.02*round(len(ordered_bg_list))]
#Find list of signal events
sig_list = np.argwhere(y_true > 0.5)
#Order these by NN output
ordered_sig_predictions = np.sort(y_pred[sig_list])
#Find true positive rate with this threshold
sig_eff = 1 - np.searchsorted(ordered_sig_predictions,threshold)/len(ordered_sig_predictions)
return sig_eff
Of course, this does not work because to implement a custom metric, y_true and y_pred are supposed to be TensorFlow tensors rather than numpy arrays. Is there any way I can make this work correctly?

There's a metric for sensitivity at specificity, which I believe is equivalent (specificity is one minus FPR).

You can implement your own metric, and here is an example for the false positive rate:
from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops.metrics_impl import _aggregate_across_towers
from tensorflow.python.ops.metrics_impl import true_negatives
from tensorflow.python.ops.metrics_impl import false_positives
from tensorflow.python.ops.metrics_impl import _remove_squeezable_dimensions
def false_positive_rate(labels,
predictions,
weights=None,
metrics_collections=None,
updates_collections=None,
name=None):
if context.executing_eagerly():
raise RuntimeError('tf.metrics.recall is not supported is not '
'supported when eager execution is enabled.')
with variable_scope.variable_scope(name, 'false_alarm',
(predictions, labels, weights)):
predictions, labels, weights = _remove_squeezable_dimensions(
predictions=math_ops.cast(predictions, dtype=dtypes.bool),
labels=math_ops.cast(labels, dtype=dtypes.bool),
weights=weights)
false_p, false_positives_update_op = false_positives(
labels,
predictions,
weights,
metrics_collections=None,
updates_collections=None,
name=None)
true_n, true_negatives_update_op = true_negatives(
labels,
predictions,
weights,
metrics_collections=None,
updates_collections=None,
name=None)
def compute_false_positive_rate(true_n, false_p, name):
return array_ops.where(
math_ops.greater(true_n + false_p, 0),
math_ops.div(false_p, true_n + false_p), 0, name)
def once_across_towers(_, true_n, false_p):
return compute_false_positive_rate(true_n, false_p, 'value')
false_positive_rate = _aggregate_across_towers(
metrics_collections, once_across_towers, true_n, false_p)
update_op = compute_false_positive_rate(true_negatives_update_op,
false_positives_update_op, 'update_op')
if updates_collections:
ops.add_to_collections(updates_collections, update_op)
return false_positive_rate, update_op
You can adapt the code to the true positive rate.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

No gradients provided for any variable for custom loss function - python

Related

Trainable array in Tensorflow probability

How to set a constant as a weight in a keras model?

Custom Keras loss function that conditionally creates a zero gradient

Why does sigmoid & crossentropy of Keras/tensorflow have low precision?

Custom TensorFlow metric: true positive rate at given false positive rate

Categories

Resources