Context
Suppose we have some 1D data (e.g. time series), where all series have fixed length l:
# [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] index
example = [ 0, 1, 1, 0, 23, 22, 20, 14, 9, 2, 0, 0] # l = 12
and we want to perform semantic segmentation, with n classes:
# [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] index
labeled = [
[ 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], # class 1
[ 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0], # class 2
[ 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0], # class 3
#[ ... ],
[ 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1], # class n
]
then the output for a single example has shape [n, l] (i.e. the data_format is not "channels_last") and the batched output has shape [b, n, l], where b is the number of examples in the batch.
These classes are independent, so it is my understanding that sigmoid cross entropy, rather than softmax cross entropy, is the applicable loss here.
Question
I have a few small related questions in regards to the expected format for and use of tf.nn.sigmoid_cross_entropy_with_logits:
since the network outputs a tensor in the same shape as the batched labels, should I train the network under the assumption that it outputs logits, or take the keras approach (see keras's binary_crossentropy) and assume it outputs probabilities?
given the 1d segmentation problem, should I call tf.nn.sigmoid_cross_entropy_with_logits on:
data_format='channels_first' (as shown above), or
data_format='channels_last' (example.T)
if I want the labels to be assigned individually per channel?
should the loss operation passed to the optimizer be:
tf.nn.sigmoid_cross_entropy_with_logits(labels, logits),
tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels, logits)), or
tf.losses.sigmoid_cross_entropy?
Code
This Colab highlights my confusion and appears to demonstrate that the data_format does in fact matter, but the documentation does not explicitly state which format is expected.
Dummy data
# Problem dimensions.
c = 5   # number of channels (label classes)
p = 10  # number of positions ('pixels')


def _uniform_grid():
    """Return a [c, p] float array filled with random.random() draws."""
    rows = []
    for _ in range(c):
        rows.append([random.random() for _ in range(p)])
    return np.array(rows).astype(float)


def _sparse_labels():
    """Return a [c, p] float array of 0/1 labels; a draw below 0.8 gives 0."""
    rows = []
    for _ in range(c):
        rows.append([1 if random.random() >= 0.8 else 0 for _ in range(p)])
    return np.array(rows).astype(float)


# data_format = 'channels_first', shape = [classes, pixels]
# stand-in 'logits' for 2 examples
pred_1 = _uniform_grid()
pred_2 = _uniform_grid()
# 'ground truth' for the same 2 examples
targ_1 = _sparse_labels()
targ_2 = _sparse_labels()
# batched form, shape = [batch, classes, pixels]
preds = np.stack([pred_1, pred_2])
targs = np.stack([targ_1, targ_2])

# data_format = 'channels_last', shape = [pixels, classes]
t_pred_1, t_pred_2 = pred_1.T, pred_2.T
t_targ_1, t_targ_2 = targ_1.T, targ_2.T
# batched form, shape = [batch, pixels, classes]
t_preds = np.stack([t_pred_1, t_pred_2])
t_targs = np.stack([t_targ_1, t_targ_2])
losses
tf.nn
# Build the element-wise sigmoid cross-entropy ops; no reduction is applied
# here, so each result keeps one loss value per input element.
_xent = tf.nn.sigmoid_cross_entropy_with_logits

# 'channels_first': one op per example, then one op for the whole batch
loss_1 = _xent(labels=targ_1, logits=pred_1)
loss_2 = _xent(labels=targ_2, logits=pred_2)
b_loss = _xent(labels=targs, logits=preds)

# 'channels_last': one op per example, then one op for the whole batch
t_loss_1 = _xent(labels=t_targ_1, logits=t_pred_1)
t_loss_2 = _xent(labels=t_targ_2, logits=t_pred_2)
t_b_loss = _xent(labels=t_targs, logits=t_preds)

# Evaluate the ops to get the actual arrays.
with tf.Session() as sess:
    # 'channels_first': per-example losses and batch loss
    l1, l2, bl = sess.run([loss_1, loss_2, b_loss])
    # 'channels_last': per-example losses and batch loss
    t_l1, t_l2, t_bl = sess.run([t_loss_1, t_loss_2, t_b_loss])
tf.reduce_mean(tf.nn)
# Collapse each element-wise loss tensor to a single scalar with its mean.

# 'channels_first': per-example means, then the batch mean
rm_loss_1 = tf.reduce_mean(loss_1)
rm_loss_2 = tf.reduce_mean(loss_2)
rm_b_loss = tf.reduce_mean(b_loss)

# 'channels_last': per-example means, then the batch mean
rm_t_loss_1 = tf.reduce_mean(t_loss_1)
rm_t_loss_2 = tf.reduce_mean(t_loss_2)
rm_t_b_loss = tf.reduce_mean(t_b_loss)

# Evaluate the ops to get the actual scalar values.
with tf.Session() as sess:
    # 'channels_first': per-example losses and batch loss
    rm_l1, rm_l2, rm_bl = sess.run([rm_loss_1, rm_loss_2, rm_b_loss])
    # 'channels_last': per-example losses and batch loss
    rm_t_l1, rm_t_l2, rm_t_bl = sess.run(
        [rm_t_loss_1, rm_t_loss_2, rm_t_b_loss]
    )
tf.losses
# Same comparison again, this time through tf.losses.sigmoid_cross_entropy
# with its default arguments.
_losses_xent = tf.losses.sigmoid_cross_entropy

# 'channels_first': one op per example, then one op for the whole batch
tf_loss_1 = _losses_xent(multi_class_labels=targ_1, logits=pred_1)
tf_loss_2 = _losses_xent(multi_class_labels=targ_2, logits=pred_2)
tf_b_loss = _losses_xent(multi_class_labels=targs, logits=preds)

# 'channels_last': one op per example, then one op for the whole batch
tf_t_loss_1 = _losses_xent(multi_class_labels=t_targ_1, logits=t_pred_1)
tf_t_loss_2 = _losses_xent(multi_class_labels=t_targ_2, logits=t_pred_2)
tf_t_b_loss = _losses_xent(multi_class_labels=t_targs, logits=t_preds)

# Evaluate the ops to get the actual values.
with tf.Session() as sess:
    # 'channels_first': per-example losses and batch loss
    tf_l1, tf_l2, tf_bl = sess.run([tf_loss_1, tf_loss_2, tf_b_loss])
    # 'channels_last': per-example losses and batch loss
    tf_t_l1, tf_t_l2, tf_t_bl = sess.run(
        [tf_t_loss_1, tf_t_loss_2, tf_t_b_loss]
    )
Test equivalency
data_format equivalency
# loss _should_(?) be the same for 'channels_first' and 'channels_last' data_format
# test example_1
e1 = (l1 == t_l1.T).all()
# test example 2
e2 = (l2 == t_l2.T).all()
# loss calculated for each example and then batched together should be the same
# as the loss calculated on the batched examples
ea = (np.array([l1, l2]) == bl).all()
t_ea = (np.array([t_l1, t_l2]) == t_bl).all()
# loss calculated on the batched examples for 'channels_first' should be the same
# as loss calculated on the batched examples for 'channels_last'
eb = (bl == np.transpose(t_bl, (0, 2, 1))).all()
e1, e2, ea, t_ea, eb
# (True, False, False, False, True) <- changes every time, so True is happenstance
equivalency between tf.reduce_mean and tf.losses
l_e1 = tf_l1 == rm_l1
l_e2 = tf_l2 == rm_l2
l_eb = tf_bl == rm_bl
l_t_e1 = tf_t_l1 == rm_t_l1
l_t_e2 = tf_t_l2 == rm_t_l2
l_t_eb = tf_t_bl == rm_t_bl
l_e1, l_e2, l_eb, l_t_e1, l_t_e2, l_t_eb
# (False, False, False, False, False, False)
Both tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(...)) and tf.losses.sigmoid_cross_entropy(...) (with default arguments) compute the same thing. The problem is in your tests, where you use == to compare floating-point numbers. Instead, use np.isclose to check whether two floating-point numbers are equal within a tolerance:
# loss _should_(?) be the same for 'channels_first' and 'channels_last' data_format
# test example_1
e1 = np.isclose(l1, t_l1.T).all()
# test example 2
e2 = np.isclose(l2, t_l2.T).all()
# loss calculated for each example and then batched together should be the same
# as the loss calculated on the batched examples
ea = np.isclose(np.array([l1, l2]), bl).all()
t_ea = np.isclose(np.array([t_l1, t_l2]), t_bl).all()
# loss calculated on the batched examples for 'channels_first' should be the same
# as loss calculated on the batched examples for 'channels_last'
eb = np.isclose(bl, np.transpose(t_bl, (0, 2, 1))).all()
e1, e2, ea, t_ea, eb
# (True, True, True, True, True)
And:
l_e1 = np.isclose(tf_l1, rm_l1)
l_e2 = np.isclose(tf_l2, rm_l2)
l_eb = np.isclose(tf_bl, rm_bl)
l_t_e1 = np.isclose(tf_t_l1, rm_t_l1)
l_t_e2 = np.isclose(tf_t_l2, rm_t_l2)
l_t_eb = np.isclose(tf_t_bl, rm_t_bl)
l_e1, l_e2, l_eb, l_t_e1, l_t_e2, l_t_eb
# (True, True, True, True, True, True)
Related
I am pretty new to keras and trying to build a model which takes in a list as input and returns a number between 1 and 16 (or 0 and 15, i have 16 classes)
this is my code so far:
import numpy as np
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf
vectorizer = TextVectorization(output_mode = "int")
'''
vocab_data = np.array(["In animus fert nova"])
vectorizer.adapt(vocab_data) #wichtig
print(vectorizer(vocab_data))
'''
# 2 Möglichkeiten für die ersten 4 Metren --> 2^4 = 16 verschiedene Hexameter
num_Hexameter = 16
inputs = keras.Input(shape = (None,1), dtype=tf.int64)
x = layers.Dense(30)(inputs)
x = layers.Dense(20)(x)
x = layers.Dense(15)(x)
x = layers.Dense(10)(x)
x = layers.Dense(5)(x)
outputs = layers.Dense(num_Hexameter, activation = "softmax")(x)
model = keras.Model(inputs = inputs, outputs = outputs)
model.summary()
model.compile(optimizer = keras.optimizers.RMSprop(learning_rate=1e-3),
loss=keras.losses.SparseCategoricalCrossentropy(), metrics=["accuracy"])
test_train_data_raw = np.array([["In nova fert animus mutatas dicere formas"],
["corpora di coeptis nam vos mutastis et illas"],
["adspirate meis primaque ab origine mundi"],
["ad mea perpetuum deducite tempora carmen"]])
vectorizer.adapt(test_train_data_raw)
test_train_data = vectorizer(test_train_data_raw)
test_train_data_lbl = np.array([[0, 0, 0, 0, 0, 0, 0, 3],[0, 0, 0, 0, 0, 0, 0, 7], [0, 0, 0, 0, 0, 0, 0, 10], [0, 0, 0, 0, 0, 0, 0, 2]])
test_train_data_lbl = np.array([3, 7, 10, 2])
print(test_train_data_lbl.shape)
history = model.fit(test_train_data, test_train_data_lbl, batch_size = 2, epochs = 10)
print(history.history)
test = vectorizer(np.array([["In nova fert animus mutatas dicere formas"]]))
print(test)
print(model.predict(test).shape)
the problem is if i leave the line test_train_data_lbl = np.array([3, 7, 10, 2]) uncommented it will give me the error: ValueError: Shape mismatch: The shape of labels (received (2, 1)) should equal the shape of logits except for the last dimension (received (2, 8, 16))
commenting the line leads to no error, but the result of model.predict(test) will be an array with shape (1, 7, 16). I understand that the 7 comes from the 7 words i have and 16 from the number of classes but i need this to be of the shape (1, 16) so that i can predict which class the line will be in.
I also know that I have too little training data, but I first wanted to make the model work without errors before generating the training data.
See EDIT below, the initial post almost has no meaning now but the question still remains.
I am developing a neural network to semantically segment imagery. I have worked through various loss functions (categorical cross entropy (CCE), weighted CCE, focal loss, Tversky loss, Jaccard loss, focal Tversky loss, etc.) which attempt to handle highly skewed class representation, though none are producing the desired effect. My advisor suggested creating a custom loss function which ignores false negatives for a specific class (but still penalizes false positives).
I have a 6 class problem and my network is setup to work in/with one-hot encoded truth data. As a result my loss function will accept two tensors, y_true, y_pred, of shape (batch, row, col, class) (which is currently (8, 128, 128, 6)). To be able to utilize the losses I have already explored I would like to alter y_pred to set the predicted value for the specific class (the 0th class) to always be correct. That is where y_true == class 0 set y_pred == class 0, otherwise do nothing.
I have spent way too much time attempting to create this loss function as a result of tensorflow tensors being immutable. My first attempt (which I was led to through my experience with numpy)
# First attempt: build a weighted categorical cross-entropy whose inner loss
# overwrites y_pred with the class-0 one-hot vector wherever y_true is class 0.
# `weights` is wrapped in a backend variable and multiplied into the per-class
# log terms below.
def weighted_categorical_crossentropy_ignore(weights):
weights = K.variable(weights)
def loss(y_true, y_pred):
# NOTE: this NumPy-style item assignment is the line that fails; the
# surrounding text reports that tensors are immutable, so y_pred cannot be
# altered this way.
y_pred[tf.where(y_true == [1, 0, 0, 0, 0, 0])] = [1, 0, 0, 0, 0, 0]
# Scale predictions so that the class probs of each sample sum to 1
y_pred /= K.sum(y_pred, axis=-1, keepdims=True)
# Clip to prevent NaN's and Inf's
y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
# Weighted cross-entropy: sum of -y_true * log(y_pred) * weights over the
# last (class) axis.
loss = y_true * K.log(y_pred) * weights
loss = -K.sum(loss, -1)
return loss
return loss
Though obviously I cannot alter y_pred so this attempt failed. I ended up creating a few monstrosities attempting to "build" a tensor by iterating over [batch, row, col] and performing comparisons. While this(ese) attempts did not technically fail, they never actually began training. I assume it was taking on the order of minutes to compute the loss.
After many more failed efforts I started attempting to perform the requisite computation in pure numpy in a SSCCE. But keeping cognizant I was essentially limited to instantiating "simple" tensors (ie ones, zeros) and only performing "simple" operations like element-wise multiply, addition, and reshaping. Thus I arrived at this SSCCE
import numpy as np
from tensorflow.keras.utils import to_categorical
# Generate the "images" at random
true_flat = np.argmax(np.random.rand(1, 2, 2, 4), axis=3).astype('int')
true = to_categorical(true_flat, num_classes=4).astype('int')
pred_flat = np.argmax(np.random.rand(1, 2, 2, 4), axis=3).astype('int')
pred = to_categorical(pred_flat, num_classes=4).astype('int')
print('True:\n', true_flat)
print('Pred:\n', pred_flat)
# Create a mask representing an all "class 0" image
class_zero_label = np.array([1, 0, 0, 0])
czl_all = class_zero_label * np.ones(true.shape).astype('int')
# Mask both the truth and pred to locate class 0 pixels
czl_true_locs = czl_all * true
czl_pred_locs = czl_all * pred
# Subtract to create "addition" matrix
a = (czl_true_locs - czl_pred_locs) * czl_true_locs
print('a:\n', a)
# Do this
m = ((a + 1) - (a * 2))
print('m - ', m.shape, ':\n', m)
# Pull the front entry from 'm' and "expand" its value
#x = (m[:, :, :, 0].flatten() * np.ones(pred.shape).astype('int')).T.reshape(pred.shape)
m_front = m[:, :, :, 0]
print('m_front - ', m_front.shape, ':\n', m_front)
#m_flat = m_front.flatten()
m_flat = m_front.reshape(m_front.shape[0], m_front.shape[1]*m_front.shape[2])
print('m_flat - ', m_flat.shape, ':\n', m_flat)
m_expand = m_flat * np.ones(pred.shape).astype('int')
print('m_expand - ', m_expand.shape, ':\n', m_expand)
m_trans = m_expand.T
m_fixT = m_trans.reshape(pred.shape)
print('m_fixT - ', m_fixT.shape, ':\n', m_fixT)
m = m_fixT
print('m:\n', m.shape)
# Perform the math as described
pred = (pred * m) + a
print('Pred:\n', np.argmax(pred, axis=3))
This SSCCE, is well, terrible and complex. Essentially my goal here was to create two matrices, the "addition" and "multiplication" matrices. The multiplication matrix is meant to "zero out" every pixel in the predicted values where the truth value was equal to class 0. That is no matter the pixel value (ie a one-hot encoded vector) zero it out to be equal to [0, 0, 0, 0, 0, 0]. The addition matrix is then meant to add the vector [1, 0, 0, 0, 0, 0] to each of the zero'ed out locations. In the end this would achieve the goal of setting the predicted value of every truly class 0 pixel to correct.
The issue is that this SSCCE does not translate fully to tensorflow operations. The first issue is with the generation of the multiplication matrix: it is not defined correctly when batch_size > 1. I thought, no matter; just to see if it works, I will break down and tf.unstack the y_true and y_pred tensors and iterate over them. This has led me to the current instantiation of my loss function
def weighted_categorical_crossentropy_ignore(weights):
weights = K.variable(weights)
def loss(y_true, y_pred):
y_true_un = tf.unstack(y_true)
y_pred_un = tf.unstack(y_pred)
y_pred_new = []
for i in range(0, y_true.shape[0]):
yt = y_true_un[i]
yp = y_pred_un[i]
# Pred:
# [[[0 3] * [[[1 0] + [[[0 1] = [[[0 0]
# [3 1]]] [[1 1]]] [[0 0]]] [[3 1]]]
# If we multiple pred by a tensor which zeros out only incorrect class 0 labelleling
# Then add class zero to those zero'd out locations
# We can negate the effect of mis-classified class 0 pixels but still punish for
# incorrectly predicted class 0 labels for other classes.
# Create a mask respresenting an all "class 0" image
class_zero_label = K.variable([1.0, 0.0, 0.0, 0.0, 0.0, 0.0])
czl_all = class_zero_label * K.ones(yt.shape)
# Mask both true and pred to locate class 0 pixels
czl_true = czl_all * yt
czl_pred = czl_all * yp
# Subtract to create "addition matrix"
a = czl_true - czl_pred
# Do this.
m = ((a + 1) - (a * 2.))
# And this.
x = K.flatten(m[:, :, 0])
x = x * K.ones(yp.shape)
x = K.transpose(x)
x = K.reshape(x, yp.shape)
# Voila.
ypnew = (yp * x) + a
y_pred_new.append(ypnew)
y_pred_new = tf.concat(y_pred_new, 0)
# Continue calculating weighted categorical crossentropy
# -------------------------------------------------------
# Scale predictions so that the class probs of each sample sum to 1
y_pred_new /= K.sum(y_pred_new, axis=-1, keepdims=True)
# Clip to prevent NaN's and Inf's
y_pred_new = K.clip(y_pred_new, K.epsilon(), 1 - K.epsilon())
loss = y_true * K.log(y_pred_new) * weights
loss = -K.sum(loss, -1)
return loss
return loss
The current issue with this loss function lies in the apparent difference in the behavior between numpy and tensorflow when performing the operation
x = K.flatten(m[:, :, 0])
x = x * K.ones(yp.shape)
Which is meant to represent the behavior
m_flat = m_front.flatten()
m_expand = m_flat * np.ones(pred.shape).astype('int')
from the SSCCE.
So at this point I feel like I have delved so far into caveman coding that I can't get out of it. I have to imagine there is some simple way, akin to my initial attempt, to perform the described behavior.
So, I guess my direct question is How do I implement
y_pred[tf.where(y_true == [1, 0, 0, 0, 0, 0])] = [1, 0, 0, 0, 0, 0]
in a custom tensorflow loss function?
EDIT: After fumbling around quite a bit more I have finally determined how to call .numpy() on the y_true, y_pred tensors to utilize numpy operations (Apparently setting tf.compat.v1.enable_eager_execution at the start of the program "doesn't work". I had to pass run_eagerly=True to Model().compile(...)).
This has allowed me to implement essentially the first attempt outlined
# Eager-mode attempt: convert both tensors to NumPy arrays, patch the
# prediction array in place, then compute the weighted cross-entropy from the
# patched array.
def weighted_categorical_crossentropy_ignore(weights):
weights = K.variable(weights)
def loss(y_true, y_pred):
# NOTE: per the surrounding text, leaving TensorFlow via .numpy() breaks the
# path back through the network, and fitting fails with "No gradients
# provided for any variable".
yp = y_pred.numpy()
yt = y_true.numpy()
# Pixels whose full one-hot vector (axis 3) equals class 0 get their
# prediction replaced by the class-0 one-hot vector.
yp[np.nonzero(np.all(yt == [1, 0, 0, 0, 0, 0], axis=3))] = [1, 0, 0, 0, 0, 0]
# Continue calculating weighted categorical crossentropy
# -------------------------------------------------------
# Scale predictions so that the class probs of each sample sum to 1
yp /= K.sum(yp, axis=-1, keepdims=True)
# Clip to prevent NaN's and Inf's
yp = K.clip(yp, K.epsilon(), 1 - K.epsilon())
loss = y_true * K.log(yp) * weights
loss = -K.sum(loss, -1)
return loss
return loss
Though it seems by calling y_pred.numpy() (or the use of it thereafter) I have apparently "destroyed" the path/flow through the network. Based on the error when attempting to .fit
ValueError: No gradients provided for any variable: ['conv3d/kernel:0', <....>
I assume I somehow need to "remarshall" the tensor back to GPU memory? I have tried
yp = tf.convert_to_tensor(yp)
to no avail; same error. So I guess the same question still lies, but from a different motivation..
EDIT2: Well it seems from this SO Answer that I can't actually use numpy() to marshall the y_true, y_pred to use vanilla numpy operations. This necessarily "destroys" the network path and thus gradients cannot be calculated.
As a result, I realized that with run_eagerly=True I can wrap my y_true/y_pred in a tf.Variable and perform assignment. So in pure tensorflow I attempted to recreate the same code again
def weighted_categorical_crossentropy_ignore(weights):
weights = K.variable(weights)
def loss(y_true, y_pred):
# yp = y_pred.numpy().copy()
# yt = y_true.numpy().copy()
# yp[np.nonzero(np.all(yt == [1, 0, 0, 0, 0, 0], axis=3))] = [1, 0, 0, 0, 0, 0]
yp = K.variable(y_pred)
yt = K.variable(y_true)
#np.all
x = K.all(yt == [1, 0, 0, 0, 0, 0], axis=3)
#np.nonzero
ne = tf.not_equal(x, tf.constant(False))
y = tf.where(ne)
# Perform the desired operation
yp[y] = [1, 0, 0, 0, 0, 0]
# Continue calculating weighted categorical crossentropy
# -------------------------------------------------------
# Scale predictions so that the class probs of each sample sum to 1
#yp /= K.sum(yp, axis=-1, keepdims=True) # Cannot use \= on tf.var, must use var = var /
yp = yp / K.sum(yp, axis=-1, keepdims=True)
# Clip to prevent NaN's and Inf's
yp = K.clip(yp, K.epsilon(), 1 - K.epsilon())
loss = y_true * K.log(yp) * weights
loss = -K.sum(loss, -1)
return loss
return loss
But alas, this apparently creates the same issue as when calling .numpy(); no gradients can be computed. So I am again seemingly back at square 1.
EDIT3: Using the solution proposed by gobrewers14 in the answer posted below but modified based on my knowledge of the problem I have produced this loss function
# Scatter-update attempt: locate the class-0 entries of y_true, build one
# class-0 one-hot row per located index, and write those rows into y_pred with
# tf.tensor_scatter_nd_update before computing the weighted cross-entropy.
def weighted_categorical_crossentropy_ignore(weights):
weights = K.variable(weights)
def loss(y_true, y_pred):
print('y_true.shape: ', y_true.shape)
print('y_pred.shape: ', y_pred.shape)
# Generate modified y_pred where all truly class0 pixels are correct
# NOTE(review): tf.math.equal appears to broadcast the 6-vector and compare
# element-wise, so tf.where would return one 4-component index per matching
# scalar rather than one index per pixel. That would be consistent with the
# reported error, where the updates have shape [684584, 6] but each index
# addresses a single element. A per-pixel match probably needs a reduction
# over the class axis first (or the mask approach) - TODO confirm.
y_true_class0_indicies = tf.where(tf.math.equal(y_true, [1., 0., 0., 0., 0., 0.]))
y_pred_updates = tf.repeat([
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
repeats=y_true_class0_indicies.shape[0],
axis=0)
yp = tf.tensor_scatter_nd_update(y_pred, y_true_class0_indicies, y_pred_updates)
# Continue calculating weighted categorical crossentropy
# -------------------------------------------------------
# Scale predictions so that the class probs of each sample sum to 1
yp /= K.sum(yp, axis=-1, keepdims=True)
# Clip to prevent NaN's and Inf's
yp = K.clip(yp, K.epsilon(), 1 - K.epsilon())
loss = y_true * K.log(yp) * weights
loss = -K.sum(loss, -1)
return loss
return loss
Provided the original answer assumed y_true to be of shape [8, 128, 128] (ie a "flat" class representation, versus a one-hot encoded representation [8, 128, 128, 6]) I first print the shapes of the y_true and y_pred input tensors for sanity
y_true.shape: (8, 128, 128, 6)
y_pred.shape: (8, 128, 128, 6)
For further sanity, the output shape of the network, provided by the tail of model.summary is
conv2d_18 (Conv2D) (None, 128, 128, 6) 1542 dropout_5[0][0]
__________________________________________________________________________________________________
activation_9 (Activation) (None, 128, 128, 6) 0 conv2d_18[0][0]
==================================================================================================
Total params: 535,551,494
Trainable params: 535,529,478
Non-trainable params: 22,016
__________________________________________________________________________________________________
I then follow "the pattern" in the proposed solution and replace the original tf.math.equal(y_true, 0) with tf.math.equal(y_true, [1., 0., 0., 0., 0., 0.]) to handle the one-hot encoded case. From my understanding of the proposed solution currently (after ~10min of inspecting it) I assumed this should work. Though when attempting to train a model the following exception is thrown
InvalidArgumentError: Inner dimensions of output shape must match inner dimensions of updates shape. Output: [8,128,128,6] updates: [684584,6] [Op:TensorScatterUpdate]
Thus it seems as if the production of the (as I have named them) y_pred_updates produces a "collapsed" tensor with "too many" elements. I understand the motivation of the use of tf.repeat but its specific use seems to be incorrect. I assume it should produce a tensor with shape (8, 128, 128, 6) based on what I understand tf.tensor_scatter_nd_update to do. I assume this most likely is just based on the selection of the repeats and axis during the call to tf.repeat.
If I understand your question correctly, you are looking for something like this:
import tensorflow as tf
# batch of true labels
y_true = tf.constant([5, 0, 1, 3, 4, 0, 2, 0], dtype=tf.int64)
# batch of class probabilities
y_pred = tf.constant(
[
[0.34670502, 0.04551039, 0.14020428, 0.14341979, 0.21430719, 0.10985339],
[0.25681055, 0.14013883, 0.19890164, 0.11124421, 0.14526634, 0.14763844],
[0.09199252, 0.21889475, 0.1170236 , 0.1929019 , 0.20311192, 0.17607528],
[0.3246354 , 0.23257554, 0.15549366, 0.17282239, 0.00000001, 0.11447308],
[0.16502093, 0.13163856, 0.14371352, 0.19880624, 0.23360236, 0.12721846],
[0.27362782, 0.21408406, 0.10917682, 0.13135742, 0.10814326, 0.16361059],
[0.20697299, 0.23721898, 0.06455399, 0.11071447, 0.18990229, 0.19063729],
[0.10320242, 0.22173141, 0.2547973 , 0.2314068 , 0.07063974, 0.11822232]
], dtype=tf.float32)
# find the indices in the batch where the true label is the class 0
indices = tf.where(tf.math.equal(y_true, 0))
# create a tensor with the number of updates you want to replace in `y_pred`
updates = tf.repeat(
[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
repeats=indices.shape[0],
axis=0)
# insert the updates into `y_pred` at the specified indices
modified_y_pred = tf.tensor_scatter_nd_update(y_pred, indices, updates)
print(modified_y_pred)
# tf.Tensor(
# [[0.34670502, 0.04551039, 0.14020428, 0.14341979, 0.21430719, 0.10985339],
# [1.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000],
# [0.09199252, 0.21889475, 0.1170236 , 0.1929019 , 0.20311192, 0.17607528],
# [0.3246354 , 0.23257554, 0.15549366, 0.17282239, 0.00000001, 0.11447308],
# [0.16502093, 0.13163856, 0.14371352, 0.19880624, 0.23360236, 0.12721846],
# [1.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000],
# [0.20697299, 0.23721898, 0.06455399, 0.11071447, 0.18990229, 0.19063729],
# [1.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000]],
# shape=(8, 6), dtype=tf.float32)
This final tensor, modified_y_pred, can be used in differentiation.
EDIT:
It might be easier to do this with masks.
Example:
# these arent normalized to 1 but you get the point
probs = tf.random.normal([2, 4, 4, 6])
# raw labels per pixel
labels = tf.random.uniform(
shape=[2, 4, 4],
minval=0,
maxval=6,
dtype=tf.int64)
# your labels are already one-hot encoded
labels = tf.one_hot(labels, 6)
# boolean mask where classes are `0`
# converting back to int labels with argmax for purposes of
# using `tf.math.equal`. Matching on `[1, 0, 0, 0, 0, 0]` is
# potentially buggy; matching on an integer is a lot more
# explicit.
mask = tf.math.equal(tf.math.argmax(labels, -1), 0)[..., None]
# flip the mask to zero out the pixels across channels where
# labels are zero
probs *= tf.cast(tf.math.logical_not(mask), tf.float32)
# multiply the mask by the one-hot labels, and add back
# to the already masked probabilities.
probs += labels * tf.cast(mask, tf.float32)
When I calculate Binary Crossentropy by hand I apply sigmoid to get probabilities, then use Cross-Entropy formula and mean the result:
logits = tf.constant([-1, -1, 0, 1, 2.])
labels = tf.constant([0, 0, 1, 1, 1.])
probs = tf.nn.sigmoid(logits)
loss = labels * (-tf.math.log(probs)) + (1 - labels) * (-tf.math.log(1 - probs))
print(tf.reduce_mean(loss).numpy()) # 0.35197204
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
loss = cross_entropy(labels, logits)
print(loss.numpy()) # 0.35197204
How to calculate Categorical Cross-Entropy when logits and labels have different sizes?
logits = tf.constant([[-3.27133679, -22.6687183, -4.15501118, -5.14916372, -5.94609261,
-6.93373299, -5.72364092, -9.75725174, -3.15748906, -4.84012318],
[-11.7642536, -45.3370094, -3.17252636, 4.34527206, -17.7164974,
-0.595088899, -17.6322937, -2.36941719, -6.82157373, -3.47369862],
[-4.55468369, -1.07379043, -3.73261762, -7.08982277, -0.0288562477,
-5.46847963, -0.979336262, -3.03667569, -3.29502845, -2.25880361]])
labels = tf.constant([2, 3, 4])
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
reduction='none')
loss = loss_object(labels, logits)
print(loss.numpy()) # [2.0077195 0.00928135 0.6800677 ]
print(tf.reduce_mean(loss).numpy()) # 0.8990229
I mean how can I get the same result ([2.0077195 0.00928135 0.6800677 ]) by hand?
@OverLordGoldDragon's answer is correct. In TF 2.0 it looks like this:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction='none')
loss = loss_object(labels, logits)
print(f'{loss.numpy()}\n{tf.math.reduce_sum(loss).numpy()}')
one_hot_labels = tf.one_hot(labels, 10)
preds = tf.nn.softmax(logits)
preds /= tf.math.reduce_sum(preds, axis=-1, keepdims=True)
loss = tf.math.reduce_sum(tf.math.multiply(one_hot_labels, -tf.math.log(preds)), axis=-1)
print(f'{loss.numpy()}\n{tf.math.reduce_sum(loss).numpy()}')
# [2.0077195 0.00928135 0.6800677 ]
# 2.697068691253662
# [2.0077198 0.00928142 0.6800677 ]
# 2.697068929672241
For language models:
# Language-model-shaped demo: integer labels of shape (batch, seq) against
# logits of shape (batch, seq, vocab), comparing the Keras sparse loss with a
# manual one-hot computation.
vocab_size = 9
seq_len = 6
batch_size = 2
# NOTE: the labels run 0..11 but vocab_size is 9, so the last three labels
# (9, 10, 11) are outside the valid class range. In the recorded output below
# those positions are nan for the Keras loss and 0 for the manual version.
# NOTE(review): the zeros presumably come from tf.one_hot emitting an all-zero
# row for an out-of-range index - confirm against the tf.one_hot documentation.
labels = tf.reshape(tf.range(batch_size*seq_len), (batch_size,seq_len)) # (2, 6)
logits = tf.random.normal((batch_size,seq_len,vocab_size)) # (2, 6, 9)
# reduction='none' keeps one loss value per (batch, position).
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction='none')
loss = loss_object(labels, logits)
print(f'{loss.numpy()}\n{tf.math.reduce_sum(loss).numpy()}')
# Manual version: one-hot the labels, softmax the logits, then sum
# -log(p) over the class axis where the one-hot label is 1.
one_hot_labels = tf.one_hot(labels, vocab_size)
preds = tf.nn.softmax(logits)
preds /= tf.math.reduce_sum(preds, axis=-1, keepdims=True)
loss = tf.math.reduce_sum(tf.math.multiply(one_hot_labels, -tf.math.log(preds)), axis=-1)
print(f'{loss.numpy()}\n{tf.math.reduce_sum(loss).numpy()}')
# [[1.341706 3.2518263 2.6482694 3.039099 1.5835983 4.3498387]
# [2.67237 3.3978183 2.8657475 nan nan nan]]
# nan
# [[1.341706 3.2518263 2.6482694 3.039099 1.5835984 4.3498387]
# [2.67237 3.3978183 2.8657475 0. 0. 0. ]]
# 25.1502742767334
SparseCategoricalCrossentropy is CategoricalCrossentropy that takes integer labels as opposed to one-hot. Example from source code, the two below are equivalent:
scce = tf.keras.losses.SparseCategoricalCrossentropy()
cce = tf.keras.losses.CategoricalCrossentropy()
labels_scce = K.variable([[0, 1, 2]])
labels_cce = K.variable([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
preds = K.variable([[.90,.05,.05], [.50,.89,.60], [.05,.01,.94]])
loss_cce = cce(labels_cce, preds, from_logits=False)
loss_scce = scce(labels_scce, preds, from_logits=False)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
sess.run([loss_cce, loss_scce])
print(K.get_value(loss_cce))
print(K.get_value(loss_scce))
# [0.10536055 0.8046684 0.0618754]
# [0.10536055 0.8046684 0.0618754]
As to how to do it 'by hand', we can refer to the Numpy backend:
np_labels = K.get_value(labels_cce)
np_preds = K.get_value(preds)
losses = []
for label, pred in zip(np_labels, np_preds):
pred /= pred.sum(axis=-1, keepdims=True)
losses.append(np.sum(label * -np.log(pred), axis=-1, keepdims=False))
print(losses)
# [0.10536055 0.8046684 0.0618754]
from_logits = True: preds is model output before passing it into softmax (so we pass it into softmax)
from_logits = False: preds is model output after passing it into softmax (so we skip this step)
So in summary, to compute it by hand:
Convert integer labels to one-hot labels
If preds are model outputs before softmax, we compute their softmax
pred /= ... normalizes predictions before computing logs; this way, high-probability preds on zero-labels penalize correct predictions on one-labels. If from_logits = True, this step is skipped, since softmax already does the normalization. See this snippet. Further reading
For each observation / sample, compute element-wise negative log (base e) only where label==1
Take mean of losses for all the observations
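Lastly, the mathematical formula for categorical crossentropy is:
$$J = -\frac{1}{N}\sum_{i=1}^{N}\sum_{c=1}^{C} \mathbb{1}_{y_i \in C_c}\,\log p_{model}\left[y_i \in C_c\right]$$
where: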
i iterates over N observations
c iterates over C classes
1 is the indicator function - here, like binary crossentropy, except operates on length-C vectors
p_model [y_i \in C_c] - predicted probability of observation i belonging to class c
This is a very newbie question but I'm trying to wrap my head around cross_entropy loss in Torch so I created the following code:
# Three samples x three classes; every row looks like a one-hot vector.
x = torch.FloatTensor(
    [
        [1.0, 0.0, 0.0],
        [0.0, 1.0, 0.0],
        [0.0, 0.0, 1.0],
    ]
)
# Integer class targets, one per row of x.
y = torch.LongTensor([0, 1, 2])

# The arg-max of each row matches its target ...
print(x.argmax(dim=1))
# ... yet the cross-entropy computed from x is not zero.
loss = torch.nn.functional.cross_entropy(x, y)
print(loss)
which outputs the following:
tensor([0, 1, 2])
tensor(0.5514)
What I don't understand is given my input matches the expected output why is the loss not 0?
That is because the input you give to the cross entropy function is not interpreted as probabilities, as you assumed, but as logits, which are transformed into probabilities with this formula:
# Row-wise softmax: keepdims=True keeps the per-row sums as a column so that
# each row is divided by its own sum. Without it the sums broadcast across
# columns, which only gives the right answer by coincidence when every row
# has the same sum (as in the symmetric example here).
probas = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
So here the matrix of probabilities pytorch will use in your case is:
[0.5761168847658291, 0.21194155761708547, 0.21194155761708547]
[0.21194155761708547, 0.5761168847658291, 0.21194155761708547]
[0.21194155761708547, 0.21194155761708547, 0.5761168847658291]
torch.nn.functional.cross_entropy function combines log_softmax(softmax followed by a logarithm) and nll_loss(negative log likelihood loss) in a single
function, i.e. it is equivalent to F.nll_loss(F.log_softmax(x, 1), y).
Code:
# Same data as in the question: 3 samples x 3 classes and integer targets.
x = torch.FloatTensor(
    [
        [1.0, 0.0, 0.0],
        [0.0, 1.0, 0.0],
        [0.0, 0.0, 1.0],
    ]
)
y = torch.LongTensor([0, 1, 2])

# 1) the combined loss
print(torch.nn.functional.cross_entropy(x, y))
# 2) softmax followed by a logarithm ...
_softmax_then_log = F.softmax(x, 1).log()
print(_softmax_then_log)
# 3) ... prints the same values as log_softmax
_log_probs = F.log_softmax(x, 1)
print(_log_probs)
# 4) negative log likelihood on the log-probabilities reproduces 1)
print(F.nll_loss(_log_probs, y))
output:
tensor(0.5514)
tensor([[-0.5514, -1.5514, -1.5514],
[-1.5514, -0.5514, -1.5514],
[-1.5514, -1.5514, -0.5514]])
tensor([[-0.5514, -1.5514, -1.5514],
[-1.5514, -0.5514, -1.5514],
[-1.5514, -1.5514, -0.5514]])
tensor(0.5514)
Read more about torch.nn.functional.cross_entropy loss function from here.
Complete, copy/paste runnable example showing an example categorical cross-entropy loss calculation via:
-paper+pencil+calculator
-NumPy
-PyTorch
Other than minor rounding differences all 3 come out to be the same:
import torch
import torch.nn.functional as F
import numpy as np
def main():
    ### paper + pencil + calculator calculation #################
    """
    Hand calculation of the categorical cross-entropy loss that the NumPy
    and PyTorch code below reproduces.

    predictions before softmax (rows = 3 samples, columns = 4 categories):
        1, 4, 1, 1
        5, 1, 2, 1
        1, 2, 5, 1

    ground truths (NOT one hot encoded):
        1, 0, 2

    preds softmax calculation:
        (e^1/(e^1+e^4+e^1+e^1)), (e^4/(e^1+e^4+e^1+e^1)), (e^1/(e^1+e^4+e^1+e^1)), (e^1/(e^1+e^4+e^1+e^1))
        (e^5/(e^5+e^1+e^2+e^1)), (e^1/(e^5+e^1+e^2+e^1)), (e^2/(e^5+e^1+e^2+e^1)), (e^1/(e^5+e^1+e^2+e^1))
        (e^1/(e^1+e^2+e^5+e^1)), (e^2/(e^1+e^2+e^5+e^1)), (e^5/(e^1+e^2+e^5+e^1)), (e^1/(e^1+e^2+e^5+e^1))

    preds after softmax:
        0.04332, 0.87005, 0.04332, 0.04332
        0.92046, 0.01686, 0.04583, 0.01686
        0.01686, 0.04583, 0.92046, 0.01686

    categorical cross-entropy loss calculation:
        (-ln(0.87005) + -ln(0.92046) + -ln(0.92046)) / 3 = 0.10166

    Note the loss ends up relatively low because all 3 predictions are correct
    """
    # Made-up model outputs shared by both calculations:
    # rows = 3 samples, cols = 4 categories.
    raw_scores = [[1, 4, 1, 1],
                  [5, 1, 2, 1],
                  [1, 2, 5, 1]]
    # Ground-truth class index per sample, NOT one hot encoded.
    class_ids = [1, 0, 2]

    ### calculation via NumPy ###################################
    np_scores = np.array(raw_scores, dtype=np.float32)
    np_targets = np.array(class_ids, dtype=np.int64)
    np_probs = softmax(np_scores)
    np_loss = calcCrossEntropyLoss(np_probs, np_targets)
    # "!s" forces str(), matching the original string concatenation.
    print(f"\nNumPy loss = {np_loss!s}\n")

    ### calculation via PyTorch #################################
    # F.cross_entropy is given the raw scores and the integer class indices.
    torch_scores = torch.tensor(raw_scores, dtype=torch.float32)
    torch_targets = torch.tensor(class_ids, dtype=torch.int64)
    torch_loss = F.cross_entropy(torch_scores, torch_targets)
    print(f"PyTorch loss = {torch_loss!s}\n")
# end function
def softmax(x: np.ndarray) -> np.ndarray:
    """Return the per-sample softmax of ``x`` without modifying ``x``.

    Each entry along the first axis is one sample and is normalised
    independently, so for a 2-D ``(samples, categories)`` array every row of
    the result sums to 1.

    Unlike a row-by-row in-place update, this version:
      * leaves the caller's array untouched,
      * converts integer input to float64 so results are not truncated when
        stored, and
      * subtracts each sample's maximum before exponentiating, which leaves
        the result unchanged mathematically but avoids overflow for large
        scores.

    Args:
        x: Array of raw scores; the first axis indexes samples.

    Returns:
        A new array with the same shape as ``x`` holding the softmax values.
    """
    scores = np.asarray(x)
    if not np.issubdtype(scores.dtype, np.floating):
        scores = scores.astype(np.float64)
    if scores.size == 0:
        # Nothing to normalise; max() would fail on an empty reduction axis.
        return scores.copy()

    # Normalise over every axis except the first (the sample axis).
    sample_axes = tuple(range(1, scores.ndim))
    shifted = scores - scores.max(axis=sample_axes, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=sample_axes, keepdims=True)
# end function
def calcCrossEntropyLoss(preds: np.ndarray, gndTrs: np.ndarray) -> np.ndarray:
    """Return the mean categorical cross-entropy of ``preds`` against ``gndTrs``.

    Args:
        preds: 2-D array, one row per sample, holding the predicted value for
            each category (the caller in this file passes softmax outputs).
        gndTrs: 1-D array with one category index per sample (not one-hot).

    Returns:
        The average over samples of ``-ln(preds[i, gndTrs[i]])``.
    """
    assert preds.ndim == 2
    assert gndTrs.ndim == 1
    sampleCount = preds.shape[0]
    assert sampleCount == gndTrs.shape[0]

    negLogSum = 0.0
    for sampleRow, trueIdx in zip(preds, gndTrs):
        # np.log is the natural logarithm (ln).
        negLogSum -= np.log(sampleRow[trueIdx])
    return negLogSum / sampleCount
# end function
if __name__ == '__main__':
main()
program output:
NumPy loss = 0.10165966302156448
PyTorch loss = tensor(0.1017)
I am trying to create an autoencoder from scratch for my dataset. It is a variational autoencoder for feature extraction. I am pretty new to machine learning and I would like to know how to feed my input data to the autoencoder.
My data is time series data. It looks like this:
array([[[ 10, 0, 10, ..., 10, 0, 0],
...,
[ 0, 12, 32, ..., 2, 2, 2]],
[[ 0, 3, 7, ..., 7, 3, 0],
.....
[ 0, 2, 3, ..., 3, 4, 6]],
[[1, 3, 1, ..., 0, 10, 2],
...,
[2, 11, 12, ..., 1, 1, 8]]], dtype=int64)
It is a stack of arrays and the shape is (3, 1212, 700).
And where do I pass the label?
The examples online are simple and there is no detailed description as to how to feed the data in reality. Any examples or explanations will be highly helpful.
This can be solved using a generator. The generator takes your time series data of 700 data points each with 3 channels and 1212 time steps and it outputs a batch.
In the example I've written the batches are each the same time period, for example batch 0 is the first 10 time steps for each of your 700 samples, batch 1 is the time steps 1:11 for each of your 700 samples. If you want to mix this up in some way then you should edit the generator. The epoch ends when each batch has been tested and trained on. For the neural network a very simple encoder, decoder model can be enough to prove the concept - but you will probably want to replace with your own model. The variable n is used to determine how many time steps are used for the autoencoder.
import numpy as np
import pandas as pd
import keras
from keras.layers import Dense, Flatten, Input, Reshape
from keras.models import Model
from tensorflow.python.client import device_lib
# List the devices TensorFlow can see (used here to check that the GPU shows up).
print(device_lib.list_local_devices())
# Random stand-in for the real data, with the same shape as in the question:
# (3, 1212, 700) -- described above as 3 channels, 1212 time steps, 700 samples.
data = np.random.random((3, 1212, 700))
# this is a generator
# this is a generator
def image_generator(data, n):
    """Yield sliding windows of ``n`` time steps forever, as (input, target) pairs.

    ``data`` is sliced along its second axis and the slice is transposed with
    ``.T`` (which reverses the axis order). For the (3, 1212, 700) array used
    in this file each batch therefore has shape (700, n, 3).

    Every window is yielded, including the last one that ends exactly at
    ``data.shape[1]``, and the sequence then restarts from the first window so
    that the generator never runs out.

    Args:
        data: Array whose second axis is the time axis.
        n: Number of consecutive time steps per window.

    Yields:
        ``(window, window)`` -- the same array is used as both input and
        target, as needed for an autoencoder.

    Raises:
        ValueError: If ``n`` is not between 1 and ``data.shape[1]``; no window
            of that length exists, so the generator could not produce a batch.
    """
    num_steps = data.shape[1]
    if not 0 < n <= num_steps:
        raise ValueError(
            f"window length n={n} must be between 1 and {num_steps}"
        )
    num_windows = num_steps - n + 1
    # the generator MUST loop
    while True:
        for start in range(num_windows):
            last_n_steps = data[:, start:start + n].T
            yield (last_n_steps, last_n_steps)
n = 10  # number of time steps in each window fed to the autoencoder
n_channels = data.shape[0]  # first axis of the data array
latent_dim = 5  # size of the encoded representation

# basic model - replace with your own
# Encoder: (n, n_channels) window -> latent_dim values.
encoder_input = Input(shape=(n, n_channels), name="encoder_input")
fc = Flatten()(encoder_input)
fc = Dense(100, activation='relu', name="fc1")(fc)
encoder_output = Dense(latent_dim, activation='sigmoid', name="encoder_output")(fc)
encoder = Model(encoder_input, encoder_output)

# Decoder: latent_dim values -> (n, n_channels) window. The generator uses the
# input window as the training target, so the output must have that shape.
decoder_input = Input(shape=(latent_dim,), name="decoder_input")
fc = Dense(100, activation='relu', name="fc2")(decoder_input)
fc = Dense(n * n_channels, activation='sigmoid', name="output")(fc)
output = Reshape((n, n_channels), name="output_window")(fc)
decoder = Model(decoder_input, output)

# Chain encoder and decoder into the model that is actually trained.
combined_model_input = Input(shape=(n, n_channels), name="combined_model_input")
autoencoder = Model(combined_model_input, decoder(encoder(combined_model_input)))
model = autoencoder
model.compile(optimizer="adam", loss='mean_squared_error')
# summary() prints the table itself; wrapping it in print() only adds "None".
model.summary()

# and training
# The generator loops forever, so steps_per_epoch decides where an epoch ends.
# It is derived from the time axis (data.shape[1]), i.e. roughly one pass over
# the sliding windows per epoch.
# NOTE(review): fit_generator is kept to match the Keras version this script
# appears to target; newer Keras releases take generators in model.fit -- confirm.
training_history = model.fit_generator(
    image_generator(data, n),
    epochs=5,
    initial_epoch=0,
    steps_per_epoch=data.shape[1] - n,
    verbose=1,
)