Keras Neural Net Loss Function - python

I've encountered a problem while writing a Siamese net. The net takes 2 vectors as input, representing 2 pieces of text. The vectors are padded, and their length differs from batch to batch (in batch 1 the vector length is 32, in batch 2 it is 64, and so on).
# model definition
def create_model(vocab_size=512, d_model=128):
    def normalize(x):
        norm = tf.norm(x, axis=-1, keepdims=True)
        return tf.divide(x, norm)

    component = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, d_model),
        tf.keras.layers.LSTM(d_model),
        tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1)),
        tf.keras.layers.Lambda(normalize),
    ])

    # due to the variability in text, input shape differs with respect to batch
    inputs = [tf.keras.Input(shape=(None,)) for _ in range(2)]
    outputs = tf.tuple([component(ins) for ins in inputs])
    return tf.keras.Model(inputs=inputs, outputs=outputs)
# loss function
class MyLoss(tf.keras.losses.Loss):
    def __init__(self):
        super().__init__(name='TripletLoss')

    def call(self, y_true, y_pred):
        # >>> HERE IS THE PROBLEM: y_pred has a different shape than I'd expect,
        # its shape is (batch_size,) instead of (2, batch_size)
        l, r = y_pred
        # compute and return loss
        return loss
When calling Model#fit(loss=MyLoss(), ...), the parameter passed to MyLoss#call is a projection onto the first coordinate of the model prediction. That is, model.predict(z) returns [x, y], where x and y are vectors whose length equals the batch size. I expected y_pred passed to Loss#call to have exactly that value, [x, y], but it equals only the first vector of the list, x. Furthermore, I've looked at the call stack and spotted that before y_pred is passed to MyLoss#call it has the expected value ([x, y]), which changes to x inside Keras' Loss.__call__ body.
I tried reshaping the input, but other problems arose.
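One workaround, sketched below as an illustration rather than the original triplet objective: merge the two embeddings into a single output tensor so that Keras no longer splits y_pred per output, and unpack them inside the loss. The component is simplified here and the final loss term is only a placeholder cosine distance.

import tensorflow as tf

def create_model(vocab_size=512, d_model=128):
    def normalize(x):
        return x / tf.norm(x, axis=-1, keepdims=True)

    component = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, d_model),
        tf.keras.layers.LSTM(d_model),
        tf.keras.layers.Lambda(normalize),
    ])

    inputs = [tf.keras.Input(shape=(None,)) for _ in range(2)]
    # pack both embeddings into one output of shape (batch, 2 * d_model)
    outputs = tf.keras.layers.Concatenate(axis=-1)([component(ins) for ins in inputs])
    return tf.keras.Model(inputs=inputs, outputs=outputs)

class MyLoss(tf.keras.losses.Loss):
    def __init__(self, d_model=128):
        super().__init__(name='TripletLoss')
        self.d_model = d_model

    def call(self, y_true, y_pred):
        # unpack the two embeddings from the packed output
        l, r = y_pred[:, :self.d_model], y_pred[:, self.d_model:]
        # placeholder objective: mean cosine distance between the pair
        return tf.reduce_mean(1.0 - tf.reduce_sum(l * r, axis=-1))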

Related

How to use a batch_size of Keras tensor at the model building time?

I want to use an external program as a custom operation.
Because automatic gradients would not be available, I wrote code that provides gradients using numerical methods. However, because it has to compute batch_size derivatives,
I wrote it to get batch_size from the shape of x.
The following is an example using a numpy function as the external program:
f(x) = np.sum(x**2)
(In fact, for this simple numpy function, no loop over batch_size is necessary. But it is written this way for a general external function.)
@tf.custom_gradient
def custom_op(x):
    # without using numpy, use external function
    # assume x shape = (batch_size, 3)
    batch_size = x.shape[0]
    input_length = x.shape[1]
    # assert input_length == 3
    yout = []  # final shape should be (batch_size, 1)
    gout = []  # final shape should be (batch_size, 3)
    for i in range(batch_size):
        inputs = x[i, :]                         # shape (3,)
        y = np.sum(inputs**2)                    # scalar
        yout.append(y)

        # compute finite differences
        dy = []
        for j in range(len(inputs)):
            delta = np.zeros_like(inputs)
            delta[j] = np.abs(inputs[j]) * 0.001
            yplus = np.sum((inputs + delta)**2)  # change only the j-th input
            grad = (yplus - y) / delta[j]        # scalar
            dy.append(grad)
        gout.append(dy)

    yout = tf.convert_to_tensor(yout, dtype='float32')           # (batch_size,)
    yout = tf.reshape(yout, shape=(batch_size, 1))               # (batch_size, 1)
    gout = tf.convert_to_tensor(gout, dtype='float32')           # (batch_size, 3)
    gout = tf.reshape(gout, shape=(batch_size, input_length))    # (batch_size, input_length)

    def grad(upstream):
        return upstream * gout

    return yout, grad
x = tf.Variable([[1., 2., 3.], [2., 3., 4.]], dtype='float32')
with tf.GradientTape() as tape:
    y = custom_op(x)
tape.gradient(y, x)
and found it works.
However, when I tried to use it in a Keras model, for example,
def construct_model():
    inputs = tf.keras.Input(shape=(3,))  # input array
    x = tf.keras.layers.Dense(1)(inputs)
    outputs = custom_op(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    optimizer = 'adam'
    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=['mean_absolute_error', 'mean_squared_error'])
    return model

model = construct_model()
it gives errors because the KerasTensor "inputs" does not have a specified batch_size.
I tried to specify the batch size as tf.keras.Input(shape=(3,), batch_size=2).
However, it also raises errors because of the use of KerasTensor.
How should I change the custom_op to be compatible with keras?
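One direction that might help, sketched below: avoid the Python loop over batch_size entirely and loop over the (statically known) feature axis instead, computing the finite differences with batched tensor ops. The quadratic function here is still only a stand-in for the external call, so this is an assumption about how the external program could be evaluated batch-wise, not the asker's actual setup.

import tensorflow as tf

@tf.custom_gradient
def custom_op(x):
    # x has shape (None, 3): the batch size is unknown at graph-build time,
    # but the feature dimension is static, so loop over features instead
    input_length = x.shape[1]
    y = tf.reduce_sum(x ** 2, axis=1, keepdims=True)  # stand-in for the external program

    grads = []
    for j in range(input_length):
        delta = tf.abs(x[:, j]) * 0.001 + 1e-8                                    # (batch,)
        perturbed = x + tf.one_hot(j, input_length, dtype=x.dtype) * delta[:, None]
        y_plus = tf.reduce_sum(perturbed ** 2, axis=1)                            # (batch,)
        grads.append((y_plus - y[:, 0]) / delta)                                  # (batch,)
    gout = tf.stack(grads, axis=1)                                                # (batch, 3)

    def grad(upstream):
        return upstream * gout

    return y, grad

Wrapping the op in a Lambda layer inside the model (outputs = tf.keras.layers.Lambda(custom_op)(x)) may also sidestep the KerasTensor issue, since the op then only sees concrete tensors at run time.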

LSTM always predicts the same class

I'm trying to solve an NLP classification problem with an LSTM. The code for the model is defined here:
class LSTM(nn.Module):
    def __init__(self, hidden_size, embedding_size=66):
        super().__init__()
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2 * hidden_size, 2)

    def forward(self, input_seq):
        output, (hidden_state, cell_state) = self.lstm(input_seq)
        hidden_state = torch.cat((hidden_state[-1, :], hidden_state[-2, :]), -1)
        logits = self.fc(hidden_state)
        return nn.LogSoftmax(dim=1)(logits)
And the function I’m using to train this model is here:
def train_loop(dataloader, model, loss_fn, optimizer):
    loss_fn = loss_fn
    size = len(dataloader.dataset)
    model.train()
    zeros = 0
    for batch, (X, y) in enumerate(dataloader):
        # Transform string into tensor
        tensor = torch.zeros(1, len(X[0]), 66)
        for i in range(len(X[0])):
            tensor[0][i][ctoi[X[0][i]]] = 1

        pred = model(tensor)
        target = torch.zeros(2, dtype=torch.long)
        target[y] = 1
        if batch % 100 == 0:
            print(pred.squeeze(), target)
        loss = loss_fn(pred.squeeze(), target)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if pred.squeeze().argmax() == 0:
            zeros += 1

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

    print(f'In training predicted {zeros} zeroes out of {size} samples')
The X's are still strings, which is why I need to convert them to tensors before running them through the model. The y's are either 0 or 1 (since it's a binary classification problem), which I need to convert to a tensor of shape (2,) to run through the loss function.
For some reason I keep getting the same class predicted for every input. The classes are not even that unbalanced (~45% to 55%), and I've tried changing the weights of the classes in the loss function with no improvement; it either converges to always predicting 0 or always predicting 1. Most of the time it converges to always predicting 0, which makes even less sense, because class 0 usually has fewer samples than class 1.
Since you're training a binary classification model, your output dim should be 1 (corresponding to a single probability P(y|x)). This means that the y you're retrieving from your dataloader should be the y used in your loss function (assuming a cross-entropy loss). The predicted class is therefore y_hat = round(pred) (i.e., is the prediction >= 0.5).
As a point of clarity, it would be much easier to follow your logic if the one-hot encoding happened within your dataset (either in __getitem__ or __iter__). It's also worth noting that you don't use embeddings, so the code of your classifier is a bit misleading.
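A minimal sketch of that suggestion (keeping the rest of the question's training loop, so ctoi and the dataloader are assumed unchanged): shrink the head to a single logit and train with BCEWithLogitsLoss, so the raw label y can be used directly as the target.

import torch
import torch.nn as nn

class BinaryLSTM(nn.Module):
    def __init__(self, hidden_size, embedding_size=66):
        super().__init__()
        self.lstm = nn.LSTM(embedding_size, hidden_size,
                            batch_first=True, bidirectional=True)
        self.fc = nn.Linear(2 * hidden_size, 1)  # single logit instead of 2 classes

    def forward(self, input_seq):
        _, (hidden_state, _) = self.lstm(input_seq)
        hidden_state = torch.cat((hidden_state[-1, :], hidden_state[-2, :]), -1)
        return self.fc(hidden_state)  # raw logit, no softmax

# inside the training loop (sketch):
# loss_fn = nn.BCEWithLogitsLoss()
# logit = model(tensor).squeeze()
# loss = loss_fn(logit, y.float().squeeze())
# predicted_class = (torch.sigmoid(logit) >= 0.5).long()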

Tensorflow AutoGraph Polynomial Model With Multiple Outputs

I have a tensorflow model whose outputs correspond to coefficients of multiple polynomials. Note that my model actually has another set of outputs (multi-output), but I've mocked this below just by returning the input in addition to the polynomial coefficients.
I'm having a lot of trouble during the training of the model, related to tensor shapes. I've verified that the model is able to predict on sample inputs, and that the loss function works on sample outputs. But, during training, it immediately throws an error (see below)
For every input, the model takes in a fixed embedding-size input, and outputs coefficients for 2 polynomials of degree 2. For example, the output on a single input can look like:
[array([[[1, 2, 3],
         [4, 5, 6]]]),
 [...]]
corresponding to polynomials [1*x^2+2*x+3, 4*x^2+5*x+6]. Note that I've hidden the second output.
I noticed that tf.math.polyval requires a list of coefficients, making it wonky with AutoGraph. So, I implemented my own version of Horner's algorithm with pure tensors.
import numpy as np
import tensorflow as tf
import logging
import tensorflow.keras as K

@tf.function
def tensor_polyval(coeffs, x):
    """
    Calculates polynomial scalars from a tensor of polynomial coefficients.
    Tensorflow tf.math.polyval requires a list of coeffs, which isn't compatible with AutoGraph.

    # Inputs:
      - coeffs (NxD Tensor): each row of coeffs corresponds to r[0]*x^(D-1)+r[1]*x^(D-2)+...+r[D]
      - x: Scalar!

    # Output:
      - r[0]*x^(D-1)+r[1]*x^(D-2)+...+r[D] for each row in coeffs
    """
    p = coeffs[:, 0]
    for i in range(1, coeffs.shape[1]):
        tf.autograph.experimental.set_loop_options(
            shape_invariants=[(p, tf.TensorShape([None]))])
        c = coeffs[:, i]
        p = tf.add(c, tf.multiply(x, p))
    return p

@tf.function
def coeffs_to_poly(coeffs, n):
    # Converts an NxD array of coefficients to N evaluated polynomials at x=n
    return tensor_polyval(coeffs, tf.convert_to_tensor(n))
Now here's a super-simplified example of my model, loss function and training routine:
embedDim = 8
polyDim = 2
terms = 2
dataType = 'float64'
tf.keras.backend.set_floatx(dataType)

def model_init(embedDim=8, polyDim=2, terms=2):
    input = K.Input(shape=(embedDim,))
    x = K.layers.Reshape((embedDim,))(input)
    aCoeffs = K.layers.Dense((polyDim + 1) * terms, activation='tanh')(x)
    aCoeffs = K.layers.Reshape((terms, polyDim + 1))(aCoeffs)
    model = K.Model(inputs=input, outputs=[aCoeffs, input])
    return model

def get_random_batch(batch, embedDim, dtype='float64'):
    x = np.random.randn(batch, embedDim).astype(dtype)
    y = np.array([1. for i in range(batch)]).astype(dtype)
    return [x, y]

@tf.function
def test_loss(y_true, y_pred, dtype=dataType):
    an = tf.vectorized_map(lambda y_p: coeffs_to_poly(y_p[0],
                                                      tf.constant(5, dtype=dataType)),
                           y_pred)
    return tf.reduce_mean(tf.reduce_mean(an, axis=-1))

model = model_init(embedDim, polyDim, terms)

XTrain, yTrain = get_random_batch(batch=128, embedDim=embedDim)

# Init Model
LR = 0.001
loss = test_loss
epochs = 5

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LR), loss=loss)
hist = model.fit(XTrain,
                 yTrain,
                 batch_size=4,
                 epochs=epochs,
                 max_queue_size=10, workers=2, use_multiprocessing=True)
The error I get is related to the tensor_polyval function:
<ipython-input-15-f96bd099fe08>:3 test_loss *
an = tf.vectorized_map(lambda y_p: coeffs_to_poly(y_p[0],
<ipython-input-5-7205207d12fd>:23 coeffs_to_poly *
return tensor_polyval(coeffs, tf.convert_to_tensor(n))
<ipython-input-5-7205207d12fd>:13 tensor_polyval *
p = coeffs[:, 0]
...
ValueError: Index out of range using input dim 1; input has only 1 dims for '{{node strided_slice}} = StridedSlice[Index=DT_INT32, T=DT_DOUBLE, begin_mask=1, ellipsis_mask=0, end_mask=1, new_axis_mask=0, shrink_axis_mask=2](coeffs, strided_slice/stack, strided_slice/stack_1, strided_slice/stack_2)' with input shapes: [3], [2], [2], [2] and with computed input tensors: input[3] = <1 1>.
What's frustrating is that I'm perfectly able to predict with the model on sample inputs and also calculate a sample loss:
test_loss(yTrain[0:5],
          model.predict(XTrain[0:5]),
          dtype=dataType)
which runs just fine.
In the test_loss function, I'm just referring to the first output via y_p[0]. It calculates the value of the polynomials at n=5 and then outputs an average over everything (again, this is just mocked code). As I understand it, y_p[1] would refer to the second output (in this case, a copy of the input). I would think tf.vectorized_map should operate across all outputs of the model batch, but it seems to be slicing one extra dimension?
I noticed that the code does train if I remove the second output, input, from the model (making it single-output) and change y_p[0] to y_p in test_loss. I have no idea why it breaks when the extra output is added, as my understanding of tf.vectorized_map implies that it acts separately on each element of the list y_pred.
If we need the single loss function to receive multiple outputs altogether, perhaps we can concatenate them together to form one output.
In this case:
Changes to the model structure, here we pack the outputs:
def model_init(embedDim=8, polyDim=2, terms=2):
    input = K.Input(shape=(embedDim, ))
    x = K.layers.Reshape((embedDim, ))(input)
    aCoeffs = K.layers.Dense((polyDim + 1) * terms, activation='tanh')(x)

    # pack the two outputs, add flatten layers if their shapes are not batch*K
    outputs = K.layers.Concatenate()([aCoeffs, input])

    model = K.Model(inputs=input, outputs=outputs)
    model.summary()
    return model
Changes to the loss function, here we unpack the outputs:
# the loss function needs to know these
polyDim = 2
terms = 2

@tf.function
def test_loss(y_true, y_pred, dtype=dataType):
    """Loss function for flattened outputs."""
    # unpack multiple outputs
    offset = (polyDim + 1) * terms
    aCoeffs = tf.reshape(y_pred[:, :offset], [-1, terms, polyDim + 1])
    inputs = y_pred[:, offset:]
    print(aCoeffs, inputs)

    # do something with the two unpacked outputs, like below
    an = tf.vectorized_map(
        lambda y_p: coeffs_to_poly(y_p, tf.constant(5, dtype=dataType)),
        aCoeffs)
    return tf.reduce_mean(tf.reduce_mean(an, axis=-1))
Notice that the loss function relies on the knowledge of the original shapes of the outputs in order to restore them. Consider sub-classing tf.keras.losses.Loss.
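A rough sketch of that sub-classing idea, reusing the coeffs_to_poly helper from above, so that the shape constants travel with the loss object instead of living as globals (the class name is illustrative):

class PackedPolyLoss(tf.keras.losses.Loss):
    def __init__(self, poly_dim=2, terms=2, name='packed_poly_loss'):
        super().__init__(name=name)
        self.poly_dim = poly_dim
        self.terms = terms

    def call(self, y_true, y_pred):
        # restore the coefficient block from the flattened, packed output
        offset = (self.poly_dim + 1) * self.terms
        aCoeffs = tf.reshape(y_pred[:, :offset], [-1, self.terms, self.poly_dim + 1])
        an = tf.vectorized_map(
            lambda y_p: coeffs_to_poly(y_p, tf.constant(5, dtype=y_pred.dtype)),
            aCoeffs)
        return tf.reduce_mean(an, axis=-1)

# model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
#               loss=PackedPolyLoss(polyDim, terms))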
P.S. For anyone who simply needs different losses for the multiple outputs:
Define loss functions for the two outputs.
@tf.function
def test_loss(y_true, y_pred, dtype=dataType):
    """Loss function for output 1
    (only changed y_p[0] to y_p)."""
    an = tf.vectorized_map(
        lambda y_p: coeffs_to_poly(y_p, tf.constant(5, dtype=dataType)),
        y_pred)
    return tf.reduce_mean(tf.reduce_mean(an, axis=-1))

@tf.function
def dummy_loss(y_true, y_pred, dtype=dataType):
    """Loss function for output 2, i.e. the input, for debugging.
    Better to use 0 instead of 1.2345."""
    return tf.constant(1.2345, dataType)
Change to model.compile:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LR), loss=[test_loss, dummy_loss])
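With one loss per output, fit then expects one target per output as well; a minimal sketch (the second target is only a filler here, since dummy_loss ignores it):

hist = model.fit(XTrain,
                 [yTrain, XTrain],   # one target per model output
                 batch_size=4,
                 epochs=epochs)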

Class Weight not supported for 3+ dimensional targets - Python Tensorflow [duplicate]

Here's the code I'm working with (pulled from Kaggle mostly):
inputs = Input((IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS))
...
outputs = Conv2D(4, (1, 1), activation='sigmoid')(c9)

model = Model(inputs=[inputs], outputs=[outputs])
model.compile(optimizer='adam', loss='dice', metrics=[mean_iou])

results = model.fit(X_train, Y_train, validation_split=0.1, batch_size=8, epochs=30, class_weight=class_weights)
I have 4 classes that are very imbalanced. Class A equals 70%, class B = 15%, class C = 10%, and class D = 5%. However, I care most about class D. So I did the following type of calculations: D_weight = A/D = 70/5 = 14 and so on for the weight for class B and A. (if there are better methods to select these weights, then feel free)
In the last line, I'm trying to properly set class_weights and I'm doing it as so: class_weights = {0: 1.0, 1: 6, 2: 7, 3: 14}.
However, when I do this, I get the following error.
class_weight not supported for 3+ dimensional targets.
Is it possible that I add a dense layer after the last layer and just use it as a dummy layer so I can pass the class_weights and then only use the output of the last conv2d layer to do the prediction?
If this is not possible, how would I modify the loss function? (I'm aware of this post; however, just passing the weights into the loss function won't cut it, because the loss function is called separately for each class.) Currently, I'm using the following loss function:
def dice_coef(y_true, y_pred):
    smooth = 1.
    y_true_f = K.flatten(y_true)
    y_pred_f = K.flatten(y_pred)
    intersection = K.sum(y_true_f * y_pred_f)
    return (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)

def bce_dice_loss(y_true, y_pred):
    return 0.5 * binary_crossentropy(y_true, y_pred) - dice_coef(y_true, y_pred)
But I don't see any way in which I can input class weights. If someone wants the full working code see this post. But remember to change the final conv2d layer's num classes to 4 instead of 1.
You can always apply the weights yourself.
The originalLossFunc below you can import from keras.losses.
The weightsList is your list with the weights ordered by class.
def weightedLoss(originalLossFunc, weightsList):

    def lossFunc(true, pred):

        axis = -1   # if channels last
        # axis = 1  # if channels first

        # argmax returns the index of the element with the greatest value
        # done in the class axis, it returns the class index
        classSelectors = K.argmax(true, axis=axis)
        # if your loss is sparse, use only true as classSelectors

        # considering weights are ordered by class, for each class
        # true(1) if the class index is equal to the weight index
        classSelectors = [K.equal(i, classSelectors) for i in range(len(weightsList))]

        # casting boolean to float for calculations
        # each tensor in the list contains 1 where the ground truth class equals its index
        # if you sum all these, you will get a tensor full of ones
        classSelectors = [K.cast(x, K.floatx()) for x in classSelectors]

        # for each of the selections above, multiply by its respective weight
        weights = [sel * w for sel, w in zip(classSelectors, weightsList)]

        # sum all the selections
        # result is a tensor with the respective weight for each element in predictions
        weightMultiplier = weights[0]
        for i in range(1, len(weights)):
            weightMultiplier = weightMultiplier + weights[i]

        # make sure your originalLossFunc only collapses the class axis
        # you need the other axes intact to multiply the weights tensor
        loss = originalLossFunc(true, pred)
        loss = loss * weightMultiplier

        return loss
    return lossFunc
For using this in compile:
model.compile(loss=weightedLoss(keras.losses.categorical_crossentropy, weights),
              optimizer=..., ...)
Changing the class balance directly on the input data
You can change the balance of the input samples too.
For instance, if you have 5 samples from class 1 and 10 samples from class 2, pass the samples from class 1 twice in the input arrays.
Using the sample_weight argument.
Instead of working "by class", you can also work "by sample".
Create an array of weights for each sample in your input array: len(x_train) == len(weights)
And fit passing this array to the sample_weight argument.
(If it's fit_generator, the generator will have to return the weights along with the train/true pairs: return/yield inputs, targets, weights)
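For the per-sample route with ordinary 1D class targets, a minimal sketch might look like the following (variable names are illustrative; for 3+ dimensional segmentation targets like the one in the question, the weighted-loss wrapper above remains the practical option):

import numpy as np

class_weight_map = {0: 1.0, 1: 6.0, 2: 7.0, 3: 14.0}

# y_train holds one integer class label per sample
weights = np.array([class_weight_map[int(c)] for c in y_train])
assert len(x_train) == len(weights)

model.fit(x_train, y_train, sample_weight=weights, batch_size=8, epochs=30)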

Implementing a batch dependent loss in Keras

I have an autoencoder set up in Keras. I want to be able to weight the features of the input vector according to a predetermined 'precision' vector. This continuous valued vector has the same length as the input, and each element lies in the range [0, 1], corresponding to the confidence in the corresponding input element, where 1 is completely confident and 0 is no confidence.
I have a precision vector for every example.
I have defined a loss that takes into account this precision vector. Here, reconstructions of low-confidence features are down-weighted.
def MAEpw_wrapper(y_prec):
    def MAEpw(y_true, y_pred):
        return K.mean(K.square(y_prec * (y_pred - y_true)))
    return MAEpw
My issue is that the precision tensor y_prec depends on the batch. I want to be able to update y_prec according to the current batch so that each precision vector is correctly associated with its observation.
I have done the following:
global y_prec
y_prec = K.variable(P[:32])
Here P is a numpy array containing all precision vectors with the indices corresponding to the examples. I initialize y_prec to have the correct shape for a batch size of 32. I then define the following DataGenerator:
class DataGenerator(Sequence):
    def __init__(self, batch_size, y, shuffle=True):
        self.batch_size = batch_size
        self.y = y
        self.shuffle = shuffle
        self.on_epoch_end()

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.y))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index * self.batch_size: (index + 1) * self.batch_size]

        # Set precision vector.
        global y_prec
        new_y_prec = K.variable(P[indexes])
        y_prec = K.update(y_prec, new_y_prec)

        # Get training examples.
        y = self.y[indexes]

        return y, y
Here I am aiming to update y_prec in the same function that generates the batch. This seems to be updating y_prec as expected. I then define my model architecture:
dims = [40, 20, 2]
model2 = Sequential()
model2.add(Dense(dims[0], input_dim=64, activation='relu'))
model2.add(Dense(dims[1], input_dim=dims[0], activation='relu'))
model2.add(Dense(dims[2], input_dim=dims[1], activation='relu', name='bottleneck'))
model2.add(Dense(dims[1], input_dim=dims[2], activation='relu'))
model2.add(Dense(dims[0], input_dim=dims[1], activation='relu'))
model2.add(Dense(64, input_dim=dims[0], activation='linear'))
And finally, I compile and run:
model2.compile(optimizer='adam', loss=MAEpw_wrapper(y_prec))
model2.fit_generator(DataGenerator(32, digits.data), epochs=100)
Where digits.data is a numpy array of observations.
However, this ends up defining separate graphs:
StopIteration: Tensor("Variable:0", shape=(32, 64), dtype=float32_ref) must be from the same graph as Tensor("Variable_4:0", shape=(32, 64), dtype=float32_ref).
I've scoured SO for a solution to my problem but nothing I've found works. Any help on how to do this properly is appreciated.
This autoencoder can be easily implemented using the Keras functional API. This will allow you to have an additional input placeholder y_prec_input, which will be fed with the "precision" vector. The full source code can be found here.
Data generator
First, let's reimplement your data generator as follows:
class DataGenerator(Sequence):
    def __init__(self, batch_size, y, prec, shuffle=True):
        self.batch_size = batch_size
        self.y = y
        self.shuffle = shuffle
        self.prec = prec
        self.on_epoch_end()

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.y))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index * self.batch_size: (index + 1) * self.batch_size]
        y = self.y[indexes]
        y_prec = self.prec[indexes]
        return [y, y_prec], y
Note that I got rid of the global variable. Now, instead, the precision vector P is provided as input argument (prec), and the generator yields an additional input that will be fed to the precision placeholder y_prec_input (see model definition).
Model
Finally, your model can be defined and trained as follows:
y_input = Input(shape=(input_dim,))
y_prec_input = Input(shape=(input_dim,))  # precision vector has the same length as the input
h_enc = Dense(dims[0], activation='relu')(y_input)
h_enc = Dense(dims[1], activation='relu')(h_enc)
h_enc = Dense(dims[2], activation='relu', name='bottleneck')(h_enc)
h_dec = Dense(dims[1], activation='relu')(h_enc)
h_dec = Dense(input_dim, activation='relu')(h_dec)
model2 = Model(inputs=[y_input, y_prec_input], outputs=h_dec)
model2.compile(optimizer='adam', loss=MAEpw_wrapper(y_prec_input))

# Train model
model2.fit_generator(DataGenerator(32, digits.data, P), epochs=100)
where input_dim = digits.data.shape[1]. Note that I also changed the output dimension of the decoder to input_dim, since it must match the input dimension.
Try testing your code with workers=0 when you call fit_generator; if it works normally, then threading is your problem.
If threading is the cause, try this:
# In the code that executes on the main thread
graph = tf.get_default_graph()

# In code that executes in other threads (e.g. your generator)
with graph.as_default():
    ...
    ...
    new_y_prec = K.variable(P[indexes])
    y_prec = K.update(y_prec, new_y_prec)
