Keras optimizing two outputs with a custom loss - python

I've recently been trying to implement a model that can be described as follows: given an input matrix and a set of targets, let the model simultaneously learn the matrix representation as well as the targets, via a custom loss function.
The architecture (simplified):
from keras.layers import Input, Dense
from keras.models import Model
from keras import backend as K

input_matrix = Input(shape=(i_shape,))
layer1 = Dense(100)(input_matrix)
output = Dense(3)(layer1)
autoencoder_mid = Dense(100)(input_matrix)
autoencoder_output = Dense(i_shape)(autoencoder_mid)
My idea of a loss function:
def customLoss(true_matrix, pred_matrix):
    def combined_loss(y_true, y_pred):
        return K.abs(y_true - y_pred)
        a = K.mean(K.square(y_pred - y_true) * K.exp(-K.log(1.7) * (K.log(1. + K.exp((y_true - 3) / 5)))), axis=-1)
        b = K.mean(K.square(pred_matrix - true_matrix) * K.exp(-K.log(1.7) * (K.log(1. + K.exp((true_matrix - 3) / 5)))), axis=-1)
        return a + b
    return combined_loss
I compile the model as:
net = Model(input_matrix, [output,autoencoder_output])
net = net.compile(optimizer='adam', loss=customLoss(true_matrix=X,pred_matrix=autoencoder_output))
I then try to fit the network with a standard call:
net.fit(X,
        target,
        epochs=10,
        batch_size=10)
The error I get is:
ValueError: Tensor conversion requested dtype float32 for Tensor with dtype float64: 'Tensor("loss/dense_4_loss/Log_3:0", shape=(389, 3890), dtype=float64, device=/device:GPU:0)'
My question is: is there any other way of doing this? If so, could you please point me towards a possible solution? Thank you very much.

You can try this:
def customLoss(true_matrix):
    def combined_loss(y_true, y_pred):
        y_pred, pred_matrix = y_pred
        ...
    return combined_loss
net = Model(input_matrix, [output,autoencoder_output])
net.compile(optimizer='adam', loss=customLoss(X))
since the original y_pred will be a tuple of (output, autoencoder_output).
Concerning the double return: the function will only ever execute the first return, so I'd remove one of the two return lines, or combine the two losses, for example:
alpha = 0.5
beta = 0.5
...
loss1, loss2 = K.abs(y_true-y_pred), a+b
return alpha*loss1 + beta*loss2
adjusting alpha and beta as needed.
Thus, the whole thing could be:
def customLoss(true_matrix, alpha=0.5, beta=0.5):
    def combined_loss(y_true, y_pred):
        y_pred, pred_matrix = y_pred
        a = K.mean(K.square(y_pred - y_true) * K.exp(-K.log(1.7) * (K.log(1. + K.exp((y_true - 3) / 5)))), axis=-1)
        b = K.mean(K.square(pred_matrix - true_matrix) * K.exp(-K.log(1.7) * (K.log(1. + K.exp((true_matrix - 3) / 5)))), axis=-1)
        loss1, loss2 = K.abs(y_true - y_pred), a + b
        return alpha * loss1 + beta * loss2
    return combined_loss

net = Model(input_matrix, [output, autoencoder_output])
net.compile(optimizer='adam', loss=customLoss(X))
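An alternative worth noting (my addition, not part of the answer above): Keras natively supports one loss per output plus loss_weights, which avoids smuggling the second output into a single loss; the float64/float32 error in the question also typically disappears once the arrays are supplied as float32. A minimal sketch, reusing the model defined above:

# one loss per output head; Keras combines them via loss_weights
def target_loss(y_true, y_pred):
    return K.mean(K.square(y_pred - y_true), axis=-1)

def reconstruction_loss(y_true, y_pred):
    return K.mean(K.square(y_pred - y_true), axis=-1)

net = Model(input_matrix, [output, autoencoder_output])
net.compile(optimizer='adam',
            loss=[target_loss, reconstruction_loss],
            loss_weights=[0.5, 0.5])
# fit takes one target array per output: the labels and the input itself
net.fit(X.astype('float32'), [target, X.astype('float32')], epochs=10, batch_size=10)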

Related

Neural network built from scratch in Python to classify digits stuck at 11.35 percent accuracy. I am using the MNIST dataset

My neural network is stuck at 11.35 percent accuracy and I am unable to trace the error.
[screenshot: low accuracy at 11.35 percent]
I am following this code https://github.com/MLForNerds/DL_Projects/blob/main/mnist_ann.ipynb, which I found in a YouTube video.
Here is my code for the neural network (I have defined Xavier weight initialization in a module called nn):
"""1. 784 neurons in input layer
2. 128 neurons in hidden layer 1
3. 64 neurons in hidden layer 2
4. 10 neurons in output layer"""
def softmax(input):
y = np.exp(input - input.max())
activated = y/ np.sum(y, axis=0)
return activated
def softmax_grad(x):
exps = np.exp(x-x.max())
return exps / np.sum(exps,axis = 0) * (1 - exps /np.sum(exps,axis = 0))
def sigmoid(input):
activated = 1/(1 + np.exp(-input))
return activated
def sigmoid_grad(input):
grad = input*(1-input)
return grad
class DenseNN:
def __init__(self,d0,d1,d2,d3):
self.params = {'w1': nn.Xavier.initialize(d0, d1),
'w2': nn.Xavier.initialize(d1, d2),
'w3': nn.Xavier.initialize(d2, d3)}
def forward(self,a0):
params = self.params
params['a0'] = a0
params['z1'] = np.dot(params['w1'],params['a0'])
params['a1'] = sigmoid(params['z1'])
params['z2'] = np.dot(params['w2'],params['a1'])
params['a2'] = sigmoid(params['z2'])
params['z3'] = np.dot(params['w3'],params['a2'])
params['a3'] = softmax(params['z3'])
return params['a3']
def backprop(self,y_true,y_pred):
params = self.params
w_change = {}
error = softmax_grad(params['z3'])*((y_pred - y_true)/y_true.shape[0])
w_change['w3'] = np.outer(error,params['a2'])
error = np.dot(params['w3'].T,error)*sigmoid_grad(params['a2'])
w_change['w2'] = np.outer(error,params['a1'])
error = np.dot(params['w2'].T,error)*sigmoid_grad(params['a1'])
w_change['w1'] = np.outer(error,params['a0'])
return w_change
def update_weights(self,learning_rate,w_change):
self.params['w1'] -= learning_rate*w_change['w1']
self.params['w2'] -= learning_rate*w_change['w2']
self.params['w3'] -= learning_rate*w_change['w3']
def train(self,epochs,lr):
for epoch in range(epochs):
for i in range(60000):
a0 = np.array([x_train[i]]).T
o = np.array([y_train[i]]).T
y_pred = self.forward(a0)
w_change = self.backprop(o,y_pred)
self.update_weights(lr,w_change)
# print(self.compute_accuracy()*100)
# print(calc_mse(a3, o))
print((self.compute_accuracy())*100)
def compute_accuracy(self):
'''
This function does a forward pass of x, then checks if the indices
of the maximum value in the output equals the indices in the label
y. Then it sums over each prediction and calculates the accuracy.
'''
predictions = []
for i in range(10000):
idx = i
a0 = x_test[idx]
a0 = np.array([a0]).T
#print("acc a1",np.shape(a1))
o = y_test[idx]
o = np.array([o]).T
#print("acc o",np.shape(o))
output = self.forward(a0)
pred = np.argmax(output)
predictions.append(pred == np.argmax(o))
return np.mean(predictions)
Here is the code for loading the data:
#load dataset csv
train_data = pd.read_csv('../Datasets/MNIST/mnist_train.csv')
test_data = pd.read_csv('../Datasets/MNIST/mnist_test.csv')
#train data
x_train = train_data.drop('label',axis=1).to_numpy()
y_train = pd.get_dummies(train_data['label']).values
#test data
x_test = test_data.drop('label',axis=1).to_numpy()
y_test = pd.get_dummies(test_data['label']).values
fac = 0.99 / 255
x_train = np.asfarray(x_train) * fac + 0.01
x_test = np.asfarray(x_test) * fac + 0.01
# train_labels = np.asfarray(train_data[:, :1])
# test_labels = np.asfarray(test_data[:, :1])
#printing dimensions
print(np.shape(x_train)) #(60000,784)
print(np.shape(y_train)) #(60000,10)
print(np.shape(x_test)) #(10000,784)
print(np.shape(y_test)) #(10000,10)
print((x_train))
Kindly help. I am a newbie in machine learning, so any help would be appreciated. I am unable to figure out where I am going wrong. Most of the code is almost identical to https://github.com/MLForNerds/DL_Projects/blob/main/mnist_ann.ipynb, but that code manages to get 60 percent accuracy.
EDIT
I found the mistake, thanks to Bartosz Mikulski.
The problem was with how the weights were initialized in my Xavier weight initialization algorithm. I changed the weight initialization code to this:
self.params = {
    'w1': np.random.randn(d1, d0) * np.sqrt(1. / d1),
    'w2': np.random.randn(d2, d1) * np.sqrt(1. / d2),
    'w3': np.random.randn(d3, d2) * np.sqrt(1. / d3),
    'b1': np.random.randn(d1, 1) * np.sqrt(1. / d1),
    'b2': np.random.randn(d2, 1) * np.sqrt(1. / d2),
    'b3': np.random.randn(d3, 1) * np.sqrt(1. / d3),
}
Then I got the output:
[screenshot: accuracy after changing weight initialization]
After adding the bias parameters I got the output:
[screenshot: accuracy after changing weight initialization and adding bias]
The one problem that I can see is that you are using only weights and no biases. Biases are very important because they allow your model to shift the position of the decision plane (boundary) in the solution space; with weights alone you can only rotate the boundary.
I guess that, basically, this is the best fit you can get without biases. A dense layer is essentially a linear function, w*x + b, and you are missing the b. See the PyTorch documentation for an example: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html#linear.
Also, can you show your Xavier initialization? In your case, even simple normally distributed values would be enough as initialization; no need to rush into more advanced topics.
I would also suggest you start with a smaller problem (for example, the Iris dataset) and no hidden layers (just a simple linear regression that learns by gradient descent). Then you can expand it by adding hidden layers, and then by trying harder problems with the code you already have.
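To illustrate the bias point concretely, here is a minimal sketch (my addition) of how the forward pass changes once the b1/b2/b3 parameters from the EDIT above are in place:

def forward(self, a0):
    params = self.params
    params['a0'] = a0
    # each layer is now affine (w*x + b) instead of purely linear (w*x)
    params['z1'] = np.dot(params['w1'], params['a0']) + params['b1']
    params['a1'] = sigmoid(params['z1'])
    params['z2'] = np.dot(params['w2'], params['a1']) + params['b2']
    params['a2'] = sigmoid(params['z2'])
    params['z3'] = np.dot(params['w3'], params['a2']) + params['b3']
    params['a3'] = softmax(params['z3'])
    return params['a3']

In backprop, the bias gradients are simply the corresponding error vectors (e.g. b_change['b3'] = error), since dz/db = 1, and update_weights needs matching subtraction steps for b1/b2/b3.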

How to write this custom loss function so it produces a loss for each sample?

I'm using this custom loss function for CCC (concordance correlation coefficient):
def ccc(y_true, y_pred):
    ccc = (ccc_v(y_true, y_pred) + ccc_a(y_true, y_pred)) / 2
    return 1 - ccc

def ccc_v(y_true, y_pred):
    x = y_true[:, 0]
    y = y_pred[:, 0]
    x_mean = K.mean(x, axis=0)
    y_mean = K.mean(y, axis=0)
    covar = K.mean((x - x_mean) * (y - y_mean))
    x_var = K.var(x)
    y_var = K.var(y)
    # note: the standard CCC denominator uses the squared mean *difference*
    ccc = (2.0 * covar) / (x_var + y_var + (x_mean - y_mean) ** 2)
    return ccc

def ccc_a(y_true, y_pred):
    x = y_true[:, 1]
    y = y_pred[:, 1]
    x_mean = K.mean(x, axis=0)
    y_mean = K.mean(y, axis=0)
    covar = K.mean((x - x_mean) * (y - y_mean))
    x_var = K.var(x)
    y_var = K.var(y)
    ccc = (2.0 * covar) / (x_var + y_var + (x_mean - y_mean) ** 2)
    return ccc
Currently the loss function ccc returns a scalar. The loss function is split into two functions (ccc_v and ccc_a) because I also use them as metrics.
I've read in the Keras docs and in this question that a custom loss function should return a list of losses, one for each sample.
First question: my model trains even though the loss function returns a scalar. Is that bad? How does training differ when the loss function outputs a scalar instead of a list of per-sample scalars?
Second question: how can I rewrite my loss function so it returns a list of losses? I know I should avoid means and sums, but in my case I don't think that's possible, because there is no single global mean but several different ones: one in the numerator for the covariance and a couple in the denominator for the variances.
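One possible workaround (my sketch, not from the thread): keep the batch-level CCC but broadcast it to one value per sample, so the loss tensor has the (batch_size,) shape Keras expects; Keras then just averages the identical values back to the same scalar:

from keras import backend as K

def ccc_per_sample(y_true, y_pred):
    # batch-level loss, exactly as in ccc() above
    batch_loss = 1 - (ccc_v(y_true, y_pred) + ccc_a(y_true, y_pred)) / 2
    # broadcast the scalar across the batch dimension
    return batch_loss * K.ones_like(y_true[:, 0])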
If you're using TensorFlow, there are built-in APIs for calculating losses:
tf.keras.losses.mse()
tf.keras.losses.mae()
tf.keras.losses.Huber()
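These built-ins already return one loss value per sample, which is exactly the shape the question asks about; a minimal sketch:

import tensorflow as tf

y_true = tf.constant([[1.0, 2.0], [3.0, 4.0]])
y_pred = tf.constant([[1.5, 2.5], [2.0, 5.0]])

# mse reduces over the last axis only, so the result is one loss per sample
per_sample = tf.keras.losses.mse(y_true, y_pred)
print(per_sample.shape)  # (2,)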
# Define the loss function
def loss_function(w1, b1, w2, b2, features=borrower_features, targets=default):
    predictions = model(w1, b1, w2, b2)
    # Pass targets and predictions to the cross entropy loss
    return keras.losses.binary_crossentropy(targets, predictions)
# if you're using categorical_crossentropy, then return the losses for it
# convert your image into a single np.array for input
# build your softmax model
# Define a sequential model
model=keras.Sequential()
# Define a hidden layer
model.add(keras.layers.Dense(16, activation='relu', input_shape=(784,)))
# Define the output layer
model.add(keras.layers.Dense(4,activation='softmax'))
# Compile the model
model.compile('SGD', loss='categorical_crossentropy',metrics=['accuracy'])
# Complete the fitting operation
train_data=train_data.reshape((50,784))
# Fit the model
model.fit(train_data, train_labels, validation_split=0.2, epochs=3)
# Reshape test data
test_data = test_data.reshape(10, 784)
# Evaluate the model
model.evaluate(test_data, test_labels)

Modified PyTorch loss function BCEWithLogitsLoss returns NaNs

I'm trying to solve a binary classification problem (target=0 and target=1) with one exception:
some of my labels are deliberately set to target=0.5, and I wish to have zero loss for classifying them as either 0 or 1 (i.e. both classes are "correct").
I tried to implement a custom loss from scratch, based on PyTorch's BCEWithLogitsLoss:
import torch
import torch.nn.functional as F

class myLoss(torch.nn.Module):
    def __init__(self, pos_weight=1):
        super().__init__()
        self.pos_weight = pos_weight

    def forward(self, input, target):
        epsilon = 10 ** -44
        my_bce_loss = -1 * (self.pos_weight * target * F.logsigmoid(input + epsilon)
                            + (1 - target) * torch.log(1 - torch.sigmoid(input) + epsilon))
        add_loss = (target - 0.5) ** 2 * 4
        mean_loss = (my_bce_loss * add_loss).mean()
        return mean_loss
epsilon was chosen so the log is bounded below by -100, as suggested in the BCELoss docs.
However I'm still getting NaN errors, after several epochs:
Function 'LogBackward' returned nan values in its 0th output.
or
Function 'SigmoidBackward' returned nan values in its 0th output.
Any suggestions for how I can correct my loss function? Maybe by somehow inheriting from and modifying the forward function?
Update:
The way I call my custom loss function:
y = batch[:, -1, :].to(self.device, dtype=torch.float32)
y_pred_batch = self.model(x)
LossFun = myLoss(self.pos_weight)
batch_result.loss = LossFun.forward(y_pred_batch, y)
I use a Temporal Convolutional Network model, implemented as follows:
out = self.conv1(x)
out = self.chomp1(out)
out = self.elu(out)
out = self.dropout1(out)
res = x if self.downsample is None else self.downsample(x)
return self.tanh(out + res)
Try it this way:
class myLoss(torch.nn.Module):
    def __init__(self, pos_weight=1):
        super().__init__()
        self.pos_weight = pos_weight

    def forward(self, input, target):
        epsilon = 10 ** -44
        input = input.sigmoid().clamp(epsilon, 1 - epsilon)
        my_bce_loss = -1 * (self.pos_weight * target * torch.log(input)
                            + (1 - target) * torch.log(1 - input))
        add_loss = (target - 0.5) ** 2 * 4
        mean_loss = (my_bce_loss * add_loss).mean()
        return mean_loss
To test, I run 1000 backward passes:
target = torch.randint(high=2, size=(32,))
loss_fn = myLoss()

for i in range(1000):
    inp = torch.rand(1, 32, requires_grad=True)
    loss = loss_fn(inp, target)
    loss.backward()
    if torch.isnan(loss):
        print('Loss NaN')
    if torch.isnan(inp.grad).any():
        print('NaN')
Everything works fine.
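A further option (my addition, not from the answer): reuse PyTorch's numerically stable built-in BCEWithLogitsLoss with reduction='none' and apply the (target - 0.5)**2 * 4 weighting afterwards; note the target must then be a float tensor:

import torch

bce = torch.nn.BCEWithLogitsLoss(reduction='none')  # stable log-sum-exp internally

def my_loss(input, target):
    per_element = bce(input, target)
    weight = (target - 0.5) ** 2 * 4  # 0 at target == 0.5, 1 at target in {0, 1}
    return (per_element * weight).mean()

# usage: target as float, so 0.5 labels are representable
target = torch.randint(high=2, size=(32,)).float()
inp = torch.randn(32, requires_grad=True)
my_loss(inp, target).backward()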

Is it possible to convert tensorflow code to theano code?

I have a function that uses some TensorFlow functions. I need this function in Theano, because the platform where I want to run this code only has Theano installed, not TensorFlow. I work mainly with Keras, so TensorFlow is quite cryptic for me.
The function looks like this:
class WeightedBinaryCrossEntropy(object):
    def __init__(self, pos_ratio):
        neg_ratio = 1. - pos_ratio
        self.pos_ratio = tf.constant(pos_ratio, tf.float32)
        self.weights = tf.constant(neg_ratio / pos_ratio, tf.float32)
        self.__name__ = "weighted_binary_crossentropy({0})".format(pos_ratio)

    def __call__(self, y_true, y_pred):
        return self.weighted_binary_crossentropy(y_true, y_pred)

    def weighted_binary_crossentropy(self, y_true, y_pred):
        # Transform to logits
        epsilon = tf.convert_to_tensor(K.common._EPSILON, y_pred.dtype.base_dtype)
        y_pred = tf.clip_by_value(y_pred, epsilon, 1 - epsilon)
        y_pred = tf.log(y_pred / (1 - y_pred))
        cost = tf.nn.weighted_cross_entropy_with_logits(y_true, y_pred, self.weights)
        return K.mean(cost * self.pos_ratio, axis=-1)

model.compile(loss=WeightedBinaryCrossEntropy(0.05), optimizer=optimizer, metrics=['accuracy'])
Installing TensorFlow on the platform is not possible.
I got the code from here: https://github.com/fchollet/keras/issues/2115
So, are there functions in Theano that work like these TensorFlow functions?
Maybe you should use only Keras backend functions and get a portable model (Keras backend docs: https://keras.io/backend/):
class WeightedBinaryCrossEntropy(object):
    def __init__(self, pos_ratio):
        neg_ratio = 1. - pos_ratio
        self.pos_ratio = K.constant([pos_ratio])
        self.weights = K.constant([neg_ratio / pos_ratio])
        self.__name__ = "weighted_binary_crossentropy({0})".format(pos_ratio)

    def __call__(self, y_true, y_pred):
        return self.weighted_binary_crossentropy(y_true, y_pred)

    def weighted_binary_crossentropy(self, y_true, y_pred):
        # Transform to logits
        epsilon = K.epsilon()
        y_pred = K.clip(y_pred, epsilon, 1 - epsilon)
        y_pred = K.log(y_pred / (1 - y_pred))

        # for the crossentropy, you can maybe (make sure, please)
        # use K.binary_crossentropy and multiply the weights later:
        cost = self.approach1(y_true, y_pred)
        # or you could simulate the same formula as in tensorflow:
        # https://www.tensorflow.org/api_docs/python/tf/nn/weighted_cross_entropy_with_logits
        cost = self.approach2(y_true, y_pred)

        return K.mean(cost * self.pos_ratio, axis=-1)

    # I use a similar thing in my codes, but I'm not sure my weights
    # are calculated the same way you do
    def approach1(self, y_true, y_pred):
        weights = (y_true * self.weights) + 1  # weights applied only to positive values
        return K.binary_crossentropy(y_true, y_pred, from_logits=True) * weights

    # seems more trustworthy, since it's exactly the tensorflow formula
    def approach2(self, y_true, y_pred):
        posPart = y_true * (-K.log(K.sigmoid(y_pred))) * self.weights
        negPart = (1 - y_true) * (-K.log(1 - K.sigmoid(y_pred)))
        return posPart + negPart

model.compile(loss=WeightedBinaryCrossEntropy(0.05), optimizer=optimizer, metrics=['accuracy'])
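As a quick sanity check (my sketch, with illustrative numbers), the formula in approach2 can be reproduced in plain numpy; for pos_ratio = 0.05 the positive-class weight is neg_ratio / pos_ratio = 0.95 / 0.05 = 19:

import numpy as np

def weighted_bce_np(y_true, logits, w):
    # w * y * (-log sigmoid(z)) + (1 - y) * (-log(1 - sigmoid(z))), as in approach2
    sig = 1.0 / (1.0 + np.exp(-logits))
    return y_true * (-np.log(sig)) * w + (1 - y_true) * (-np.log(1 - sig))

y_true = np.array([0.0, 1.0, 1.0])
logits = np.array([-2.0, 0.5, 3.0])
print(weighted_bce_np(y_true, logits, w=19.0))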

About mechanism/behavior of 'softmax_cross_entropy_with_logits' function on Tensorflow

Hi, I'm working on multi-hot classification using TensorFlow.
When softmax_cross_entropy_with_logits is used, the loss diverges (e.g. loss: 50 -> loss: 190000 -> loss: 2138712811 -> ...).
Therefore, I want to clarify the mechanism of softmax_cross_entropy_with_logits.
I thought that the Python code below could imitate softmax_cross_entropy_with_logits:
def imitation_SCEwL(logits, labels):
    _y = tf.nn.softmax(logits)
    _y = tf.clip_by_value(_y, 1e-10, 1.0)
    cross_entropy = -(labels * tf.log(_y))
    cross_entropy = tf.reduce_sum(cross_entropy, 1, keep_dims=True)
    cross_entropy = tf.reduce_mean(cross_entropy)
    return cross_entropy
However, the behavior differs between the genuine function (implemented in TensorFlow) and the code above: with the code above, the loss converges. Why? I'm changing only the code below.
def loss_softmax_cross_entropy_with_logits(logits, labels):
    _y = tf.nn.softmax_cross_entropy_with_logits(logits, labels)
    cross_entropy = tf.reduce_mean(_y)
    return cross_entropy
The basic idea of your logic is correct, but you should specify "dim" when you call softmax inside your imitation_SCEwL function.
See the following code for calculating the loss:
1. numpy implementation
def xent(features, labels, dim=-1):
if dim is -1:
dim = len(features.shape) - 1
one_only_on_dim = list(features.shape)
one_only_on_dim[dim] = 1
e = np.exp(features - np.reshape(
np.amax(
features, axis=dim), one_only_on_dim))
probs = e / np.reshape(np.sum(e, axis=dim), one_only_on_dim)
l = -np.sum(labels * np.log(probs + 1.0e-20), axis=dim)
return l
2. tensorflow native implementation
# assumes `from tensorflow.python.ops import nn_ops` and an active session `sess`
def loss_softmax_cross_entropy_with_logits(np_features, np_labels, dim=-1):
    loss = nn_ops.softmax_cross_entropy_with_logits(
        labels=np_labels, logits=np_features, dim=dim)
    tf_loss = sess.run(loss)
    return tf_loss
3. revised version of your function
def imitation_SCEwL(logits, labels, dim=-1):
    _y = tf.nn.softmax(logits, dim=dim)
    cross_entropy = labels * (-tf.log(_y))
    cross_entropy = tf.reduce_sum(cross_entropy, 1, keep_dims=True)
    retval = sess.run(cross_entropy)
    return retval
I tested that all of the loss values from these functions are equivalent. Let me know if they don't work for you.
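A hypothetical driver (my sketch, TF1-style) comparing the three implementations above; it assumes they are defined at module scope together with `import numpy as np`, `import tensorflow as tf`, and `from tensorflow.python.ops import nn_ops`:

np_features = np.random.randn(4, 5).astype(np.float32)
# one-hot labels: 4 samples over 5 classes
np_labels = np.eye(5, dtype=np.float32)[np.random.randint(0, 5, size=4)]

with tf.Session() as sess:  # `sess` is the global the functions above rely on
    print(xent(np_features, np_labels))                                    # numpy
    print(loss_softmax_cross_entropy_with_logits(np_features, np_labels))  # native
    print(imitation_SCEwL(tf.constant(np_features), tf.constant(np_labels)).ravel())

All three should print approximately the same four per-sample loss values.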
