Problem with custom layer in TensorFlow (not being called) - python

I'm trying to implement the Large Margin Cosine Loss in TensorFlow. I've found the following class that does it:
import math
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Layer
from tensorflow.keras.initializers import Constant
from tensorflow.python.keras.utils import tf_utils
def _resolve_training(layer, training):
    if training is None:
        training = K.learning_phase()
    if isinstance(training, int):
        training = bool(training)
    if not layer.trainable:
        # When the layer is not trainable, override the value
        training = False
    return tf_utils.constant_value(training)
class CosFace(keras.layers.Layer):
    """
    Implementation of CosFace layer. Reference: https://arxiv.org/abs/1801.09414

    Arguments:
      num_classes: number of classes to classify
      s: scale factor
      m: margin
      regularizer: weights regularizer
    """
    def __init__(self,
                 num_classes,
                 s=30.0,
                 m=0.35,
                 regularizer=None,
                 name='cosface',
                 **kwargs):
        super().__init__(name=name, **kwargs)
        self._n_classes = num_classes
        self._s = float(s)
        self._m = float(m)
        self._regularizer = regularizer

    def build(self, input_shape):
        embedding_shape, label_shape = input_shape
        self._w = self.add_weight(shape=(embedding_shape[-1], self._n_classes),
                                  initializer='glorot_uniform',
                                  trainable=True,
                                  regularizer=self._regularizer)
    def call(self, inputs, training=None):
        """
        During training, requires 2 inputs: embedding (after backbone+pool+dense),
        and ground truth labels. The labels should be sparse (and use
        sparse_categorical_crossentropy as loss).
        """
        print('calling CosFace Layer...')
        embedding, label = inputs

        # Squeezing is necessary for Keras. It expands the dimension to (n, 1)
        label = tf.reshape(tf.cast(label, tf.int32), [-1], name='label_shape_correction')

        # Normalize features and weights and compute dot product
        x = tf.nn.l2_normalize(embedding, axis=1, name='normalize_prelogits')
        w = tf.nn.l2_normalize(self._w, axis=0, name='normalize_weights')
        cosine_sim = tf.matmul(x, w, name='cosine_similarity')

        training = _resolve_training(self, training)
        if not training:
            # We don't have labels if we're not in training mode
            return self._s * cosine_sim
        else:
            one_hot_labels = tf.one_hot(label,
                                        depth=self._n_classes,
                                        name='one_hot_labels')
            theta = tf.math.acos(K.clip(
                cosine_sim, -1.0 + K.epsilon(), 1.0 - K.epsilon()))
            final_theta = tf.where(tf.cast(one_hot_labels, dtype=tf.bool),
                                   tf.math.cos(theta) - self._m,
                                   tf.math.cos(theta),
                                   name='final_theta')
            print(final_theta)
            output = tf.math.cos(final_theta, name='cosine_sim_with_margin')
            return self._s * output
I'm testing it on a simple CNN trained on the MNIST dataset. However, training doesn't work as expected. Here is the network architecture:
label = keras.layers.Input((), name="input/labels")
input = keras.layers.Input(shape=[28,28,1], name="input/image")
margin = CosFace(num_classes=10, dtype='float32')
x = keras.layers.Conv2D(64, (3,3), padding="same")(input)
x = keras.layers.Activation("relu")(x)
x = keras.layers.MaxPooling2D((2,2))(x)
x = keras.layers.Conv2D(32, (3,3), padding="same")(x)
x = keras.layers.Activation("relu")(x)
x = keras.layers.MaxPooling2D(pool_size=(2,2))(x)
x = keras.layers.Conv2D(16, (3,3), padding="same")(x)
x = keras.layers.Activation("relu")(x)
x = keras.layers.MaxPooling2D(pool_size=(2,2))(x)
x = keras.layers.Dropout(0.25)(x)
x = keras.layers.Flatten()(x)
x = keras.layers.Dense(128)(x)
x = keras.layers.Activation("relu", name="dense")(x)
x = keras.layers.Dropout(0.25)(x)
x = margin([x, label])
output = keras.layers.Activation("softmax")(x)
model_cos = keras.Model(inputs=[input, label], outputs=output)
model_cos.compile(loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
H_cos = model_cos.fit((X_train, y_train), y_train, batch_size=64, epochs=3, verbose=1)
And this is the output:
Epoch 1/3
calling CosFace Layer...
Tensor("functional_11/cosface/final_theta:0", shape=(None, 10), dtype=float32)
calling CosFace Layer...
Tensor("functional_11/cosface/final_theta:0", shape=(None, 10), dtype=float32)
860/860 [==============================] - 7s 8ms/step - loss: 0.3194 - accuracy: 0.9751
Epoch 2/3
860/860 [==============================] - 6s 7ms/step - loss: 0.0545 - accuracy: 1.0000
Epoch 3/3
860/860 [==============================] - 6s 7ms/step - loss: 0.0368 - accuracy: 1.0000
I don't understand what's going on: first of all, the real accuracy isn't 1; second, it looks like the CosFace layer stops being called after the first epoch.
Do you have any idea how to fix this?
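As a side note, a Python print inside a layer's call only runs while TensorFlow traces the graph, whereas tf.print runs on every executed step, which may be why the message shows up only at the very start of training. A minimal sketch (my own toy example, not the CosFace layer) illustrating the difference:

import tensorflow as tf

class NoisyLayer(tf.keras.layers.Layer):
    def call(self, x):
        print('tracing NoisyLayer...')       # Python side effect: runs only while tracing
        tf.print('executing NoisyLayer...')  # graph op: runs on every step
        return x

layer = NoisyLayer()

@tf.function
def step(x):
    return layer(x)

for _ in range(3):
    step(tf.zeros([1, 4]))
# 'tracing NoisyLayer...' is printed once; 'executing NoisyLayer...' three times.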

Related

"CSVLogger" is returning different loss values than "on_epoch_end"

I have a Wasserstein Generative Adversarial Network model with a custom loss function and two Keras callbacks. The first callback plots generated data alongside real data in its on_epoch_end() method, and the second is the CSVLogger() class. When training my GAN, the generator loss and critic loss are displayed in the logs at the end of each epoch. I wanted to save these to a text file so I can plot them, but with CSVLogger() I get different loss values for both the generator and critic. I was reading Different loss values computed by train_on_batch and evaluate and I suspect it might be related. I am using the train_step() method and not test_step(). Since I am using a custom WGAN loss function, is the reason the values differ that CSVLogger() returns the normal built-in loss metrics and doesn't see the custom loss function? I know I can log my loss values by extracting them inside the on_epoch_end() method using keys = list(logs.keys()) and appending them to a NumPy array, but I wanted to take advantage of a built-in function if possible, for speed.
The code:
import time
from tqdm.notebook import tqdm
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import CSVLogger
import numpy as np
import matplotlib.pyplot as plt
def define_generator(latent_dim):
    # This function creates the generator model using the functional API.
    # Layers...
    # Input Layer
    inputs = Input(shape=latent_dim, name='INPUT_LAYER')
    # 1st hidden layer
    x = Dense(50, activation='relu', name='HIDDEN_LAYER_1')(inputs)
    # 2nd hidden layer
    x = Dense(150, activation='relu', name='HIDDEN_LAYER_2')(x)
    # 3rd hidden layer
    x = Dense(300, activation='relu', name='HIDDEN_LAYER_3')(x)
    # 4th hidden layer
    x = Dense(150, activation='relu', name='HIDDEN_LAYER_4')(x)
    # 5th hidden layer
    x = Dense(50, activation='relu', name='HIDDEN_LAYER_5')(x)
    # Output layer
    outputs = Dense(2, activation='linear', name='OUPUT_LAYER')(x)
    # Instantiating the generator model
    model = Model(inputs=inputs, outputs=outputs, name='GENERATOR')
    return model

def generator_loss(fake_logits):
    # This function calculates and returns the WGAN-GP generator loss.
    # Expected value of critic output from fake images
    expectation_fake = tf.reduce_mean(fake_logits)
    # Loss to minimize
    loss = -expectation_fake
    return loss

def define_critic():
    # This function creates the critic model using the functional API.
    # Layers...
    # Input Layer
    inputs = Input(shape=2, name='INPUT_LAYER')
    # 1st hidden layer
    x = Dense(50, activation='relu', name='HIDDEN_LAYER_1')(inputs)
    # 2nd hidden layer
    x = Dense(150, activation='relu', name='HIDDEN_LAYER_2')(x)
    # 3rd hidden layer
    x = Dense(300, activation='relu', name='HIDDEN_LAYER_3')(x)
    # 4th hidden layer
    x = Dense(150, activation='relu', name='HIDDEN_LAYER_4')(x)
    # 5th hidden layer
    x = Dense(50, activation='relu', name='HIDDEN_LAYER_5')(x)
    # Output layer
    outputs = Dense(1, activation='linear', name='OUPUT_LAYER')(x)
    # Instantiating the critic model
    model = Model(inputs=inputs, outputs=outputs, name='CRITIC')
    return model

def critic_loss(real_logits, fake_logits):
    # This function calculates and returns the WGAN-GP critic loss.
    # Expected value of critic output from real images
    expectation_real = tf.reduce_mean(real_logits)
    # Expected value of critic output from fake images
    expectation_fake = tf.reduce_mean(fake_logits)
    # Loss to minimize
    loss = expectation_fake - expectation_real
    return loss
class define_wgan(keras.Model):
    # This class creates the WGAN-GP object.
    # Attributes:
    #   critic = the critic model.
    #   generator = the generator model.
    #   latent_dim = defines generator input dimension.
    #   critic_steps = defines how many times the critic gets trained for each training cycle.
    #   gp_weight = defines and returns the critic gradient for the gradient penalty term.
    # Methods:
    #   compile() = defines the optimizer and loss function of both the critic and generator.
    #   gradient_penalty() = calculates and returns the gradient penalty term in the WGAN-GP loss function.
    #   train_step() = performs the WGAN-GP training by updating the critic and generator weights
    #                  and returns the loss for both. Called by fit().
    def __init__(self, gen, critic, latent_dim, n_critic_train, gp_weight):
        super().__init__()
        self.critic = critic
        self.generator = gen
        self.latent_dim = latent_dim
        self.critic_steps = n_critic_train
        self.gp_weight = gp_weight

    def compile(self, generator_loss, critic_loss):
        super().compile()
        self.generator_optimizer = keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5, beta_2=0.9)
        self.critic_optimizer = keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5, beta_2=0.9)
        self.generator_loss_function = generator_loss
        self.critic_loss_function = critic_loss

    def gradient_penalty(self, batch_size, x_real, x_fake):
        # Random uniform samples of points between distribution.
        # "alpha" must be a tensor so that "x_interp" will also be a tensor.
        alpha = tf.random.uniform([batch_size, 2])
        # Data interpolated between real and fake distributions
        x_interp = alpha*x_real + (1-alpha)*x_fake
        # Calculating critic output gradient wrt interpolated data
        with tf.GradientTape() as gp_tape:
            gp_tape.watch(x_interp)
            critic_output = self.critic(x_interp, training=True)
        grad = gp_tape.gradient(critic_output, x_interp)[0]
        # Calculating norm of gradient
        grad_norm = tf.sqrt(tf.reduce_sum(tf.square(grad)))
        # Calculating gradient penalty
        gp = tf.reduce_mean((grad_norm - 1.0)**2)
        return gp

    def train_step(self, x_real):
        # Critic training
        # Getting batch size for creating latent vectors
        batch_size = tf.shape(x_real)[0]
        # Critic training loop
        for i in range(self.critic_steps):
            # Generating latent vectors
            latent = tf.random.normal(shape=(batch_size, self.latent_dim))
            with tf.GradientTape() as tape:
                # Obtaining fake data from generator
                x_fake = self.generator(latent, training=True)
                # Critic output from fake data
                fake_logits = self.critic(x_fake, training=True)
                # Critic output from real data
                real_logits = self.critic(x_real, training=True)
                # Calculating critic loss
                c_loss = self.critic_loss_function(real_logits, fake_logits)
                # Calculating gradient penalty
                gp = self.gradient_penalty(batch_size, x_real, x_fake)
                # Adjusting critic loss with gradient penalty
                c_loss = c_loss + gp_weight*gp
            # Calculating gradient of critic loss wrt critic weights
            critic_gradient = tape.gradient(c_loss, self.critic.trainable_variables)
            # Updating critic weights
            self.critic_optimizer.apply_gradients(zip(critic_gradient, self.critic.trainable_variables))

        # Generator training
        # Generating latent vectors
        latent = tf.random.normal(shape=(batch_size, self.latent_dim))
        with tf.GradientTape() as tape:
            # Obtaining fake data from generator
            x_fake = self.generator(latent, training=True)
            # Critic output from fake data
            fake_logits = self.critic(x_fake, training=True)
            # Calculating generator loss
            g_loss = self.generator_loss_function(fake_logits)
        # Calculating gradient of generator loss wrt generator weights
        generator_gradient = tape.gradient(g_loss, self.generator.trainable_variables)
        # Updating generator weights
        self.generator_optimizer.apply_gradients(zip(generator_gradient, self.generator.trainable_variables))
        return {"g_loss": g_loss, "c_loss": c_loss}
class GAN_monitor(keras.callbacks.Callback):
    def __init__(self, n_samples, latent_dim):
        self.n_samples = n_samples
        self.latent_dim = latent_dim

    def on_epoch_end(self, epoch, logs=None):
        # keys = list(logs.keys())
        if (epoch+1) % 1000 == 0:
            latent = tf.random.normal(shape=(self.n_samples, self.latent_dim))
            generated_data = self.model.generator(latent)
            plt.plot(generated_data[:, 0], generated_data[:, 1], ls='None', marker='*', label='Generated Data')
            plt.plot(dataset[:, 0], dataset[:, 1], ls='None', marker='.', markersize=1, label='Real Data')
            plt.legend()
            plt.savefig('Epoch _'+str(epoch+1)+'.png', dpi=300)
            plt.cla()
        if epoch == n_epochs-1:
            plt.close()

def time_elapsed(runtime):
    # This function clocks the runtime of training the GAN.
    if runtime//3600:
        hours = runtime//3600
        minutes = (runtime%3600)//60
        seconds = (runtime%3600)%60
        print('Runtime: '+str(int(hours))+' hours, '+str(int(minutes))+' minutes, and '+str(round(seconds,3))+' seconds.')
    elif runtime//60:
        minutes = runtime//60
        seconds = (runtime%60)
        print('Runtime: '+str(int(minutes))+' minutes, and '+str(round(seconds,3))+' seconds.')
    else:
        print('Runtime: '+str(round(runtime,3))+' seconds.')
data = np.genfromtxt('Flight_1.dat', dtype='float', encoding=None, delimiter=',')[0:1001,0]
time_span = np.linspace(0,20,1001)
dataset = np.concatenate((time_span[:,np.newaxis], data[:,np.newaxis]), axis=1)
dataset.shape
# Training Parameters
latent_dim = 100
n_epochs = 10
n_critic_train = 5
gp_weight = 10
batch_Size = 100
# Instantiating the generator and critic models
gen = define_generator(latent_dim)
critic = define_critic()
# Instantiating the WGAN-GP object
WGAN = define_wgan(gen, critic, latent_dim, n_critic_train, gp_weight)
# Compling the WGAN-GP model
WGAN.compile(generator_loss, critic_loss)
# Instantiating custom Keras callbacks
monitor = GAN_monitor(n_samples=100, latent_dim=latent_dim)
logger = CSVLogger('training.log')
cbk = [monitor, logger]
# Training the WGAN-GP model
tic = time.perf_counter()
WGAN.fit(dataset, batch_size=batch_Size, epochs=n_epochs, callbacks=cbk)
toc = time.perf_counter()
time_elapsed(toc-tic)
Here is sample output from running 10 epochs:
Epoch 1/10
11/11 [==============================] - 3s 11ms/step - g_loss: -0.4902 - c_loss: -28.7432
Epoch 2/10
11/11 [==============================] - 0s 11ms/step - g_loss: -2.9594 - c_loss: -44.4573
Epoch 3/10
11/11 [==============================] - 0s 11ms/step - g_loss: -6.2499 - c_loss: -37.8952
Epoch 4/10
11/11 [==============================] - 0s 11ms/step - g_loss: -10.8878 - c_loss: -30.8383
Epoch 5/10
11/11 [==============================] - 0s 13ms/step - g_loss: -14.9338 - c_loss: -18.9380
Epoch 6/10
11/11 [==============================] - 0s 12ms/step - g_loss: -14.7492 - c_loss: -11.6413
Epoch 7/10
11/11 [==============================] - 0s 12ms/step - g_loss: 2.7521 - c_loss: -11.9163
Epoch 8/10
11/11 [==============================] - 0s 11ms/step - g_loss: 6.5407 - c_loss: -9.2661
Epoch 9/10
11/11 [==============================] - 0s 13ms/step - g_loss: -6.1613 - c_loss: -11.7921
Epoch 10/10
11/11 [==============================] - 0s 14ms/step - g_loss: 7.4347 - c_loss: -7.9312
Runtime: 4.183 seconds.
And here is what CSVLogger() printed:
epoch,c_loss,g_loss
0,-92.63285827636719,-1.7106478214263916
1,-55.53614044189453,-4.0739545822143555
2,-32.362613677978516,-5.798206329345703
3,-18.02860450744629,-9.20343017578125
4,-6.916697025299072,-5.970229625701904
5,3.580303192138672,-3.7867188453674316
6,-9.844627380371094,30.166370391845703
7,-4.9332594871521,14.637486457824707
8,-22.801029205322266,-21.10728645324707
9,-9.242209434509277,3.407137870788574
Should I just stick with the loss values that are printed out in the on_epoch_end() method? Thanks.
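One pattern that might make the progress bar and CSVLogger() agree (a sketch of the standard metric-tracker approach, not tested against the WGAN code above) is to accumulate the losses in keras.metrics.Mean objects inside train_step() and expose them through the metrics property, so every logger reads the same epoch-averaged values:

import numpy as np
import tensorflow as tf
from tensorflow import keras

class TrackedModel(keras.Model):
    # Toy custom-train-step model; the loss is accumulated in a Mean tracker,
    # so the progress bar and CSVLogger report identical epoch averages.
    def __init__(self):
        super().__init__()
        self.net = keras.layers.Dense(1)
        self.loss_tracker = keras.metrics.Mean(name="my_loss")

    @property
    def metrics(self):
        # Keras resets these at the start of each epoch and averages over batches.
        return [self.loss_tracker]

    def train_step(self, data):
        x, y = data
        with tf.GradientTape() as tape:
            loss = tf.reduce_mean(tf.square(self.net(x) - y))
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        self.loss_tracker.update_state(loss)
        return {m.name: m.result() for m in self.metrics}

model = TrackedModel()
model.compile(optimizer="adam")
x = np.random.rand(256, 4).astype("float32")
y = np.random.rand(256, 1).astype("float32")
model.fit(x, y, epochs=3, batch_size=32,
          callbacks=[keras.callbacks.CSVLogger("toy_training.log")])

The same idea would apply to g_loss and c_loss in the define_wgan class: update one tracker per loss inside train_step() and return tracker.result() instead of the raw per-batch tensors.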

TypeError: Cannot convert a symbolic Keras input/output to numpy array

Trying to upgrade this awesome implementation of gumbel-softmax-vae found here. However, I keep getting
TypeError: Cannot convert a symbolic Keras input/output to a numpy array.
I am stumped - I've tried many, many things. Interestingly, some searches turn up other implementations of VAEs. I believe the error is somewhere in the "KL" term calculation of the loss.
Here is the almost working code:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
batch_size = 10
data_dim = 784
M = 10 # classes
N = 30 # how many distributions
nb_epoch = 100
epsilon_std = 0.01
anneal_rate = 0.0003
min_temperature = 0.5
tau = tf.Variable(5.0, dtype=tf.float32)
class Sampling(keras.layers.Layer):
    def call(self, logits_y):
        u = tf.random.uniform(tf.shape(logits_y), 0, 1)
        y = logits_y - tf.math.log(
            -tf.math.log(u + 1e-20) + 1e-20
        )  # logits + gumbel noise
        y = tf.nn.softmax(tf.reshape(y, (-1, N, M)) / tau)
        y = tf.reshape(y, (-1, N * M))
        return y
encoder_inputs = keras.Input(shape=(data_dim))
x = keras.layers.Dense(512, activation="relu")(encoder_inputs)
x = keras.layers.Dense(256, activation="relu")(x)
logits_y = keras.layers.Dense(M * N, name="logits_y")(x)
z = Sampling()(logits_y)
encoder = keras.Model(encoder_inputs, z, name="encoder")
encoder.build(encoder_inputs)
print(encoder.summary())
decoder_inputs = keras.Input(shape=(N * M))
x = keras.layers.Dense(256, activation="relu")(decoder_inputs)
x = keras.layers.Dense(512, activation="relu")(x)
decoder_outputs = keras.layers.Dense(data_dim, activation="sigmoid")(x)
decoder = keras.Model(decoder_inputs, decoder_outputs, name="decoder")
decoder.build(decoder_inputs)
print(decoder.summary())
class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.bce = tf.keras.losses.BinaryCrossentropy()
        self.loss_tracker = keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        return [self.loss_tracker]

    def call(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat

    @tf.function
    def gumbel_loss(self, y_true, y_pred, logits_y):
        q_y = tf.reshape(logits_y, (-1, N, M))
        q_y = tf.nn.softmax(q_y)
        log_q_y = tf.math.log(q_y + 1e-20)
        kl_tmp = q_y * (log_q_y - tf.math.log(1.0 / M))
        kl = tf.math.reduce_sum(kl_tmp, axis=(1, 2))
        kl = tf.squeeze(kl, axis=0)
        elbo = data_dim * self.bce(y_true, y_pred) - kl
        return elbo

    def train_step(self, data):
        x = data
        with tf.GradientTape(persistent=True) as tape:
            z = self.encoder(x, training=True)
            x_hat = self.decoder(z, training=True)
            x = tf.cast(x, dtype=tf.float32)
            x_hat = tf.cast(x_hat, dtype=tf.float32)
            logits_y = self.encoder.get_layer('logits_y').output
            loss = self.gumbel_loss(x, x_hat, logits_y)
        grads = tape.gradient(loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}
def main():
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data(
        path="mnist.npz"
    )
    x_train = x_train.astype("float32") / 255.0
    x_test = x_test.astype("float32") / 255.0
    x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
    x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))

    vae = VAE(encoder, decoder, name="vae-model")
    vae_inputs = (None, data_dim)
    vae.build(vae_inputs)
    vae.compile(optimizer="adam", loss=None)
    vae.fit(
        x_train,
        shuffle=True,
        epochs=1,
        batch_size=batch_size
    )

if __name__ == "__main__":
    main()
I think the main issue occurs when you try to get the output of the logits_y layer inside train_step; AFAIK you can't do that. Instead, you need to build your encoder model with two outputs, something like this:
class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        # self.encoder = encoder
        self.encoder = tf.keras.Model(inputs=encoder.input,
                                      outputs=[encoder.get_layer(name='logits_y').output,
                                               encoder.output])
        whatever...
So, in the training loop, this self.encoder will produce two outputs; one of them is the output of the logits_y layer, which you need for the loss function. Lastly, change a few things in other places accordingly, as follows:
    def call(self, x):
        _, z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat

    @tf.function
    def gumbel_loss(self, y_true, y_pred, logits_y):
        q_y = tf.reshape(logits_y, (-1, N, M))
        q_y = tf.nn.softmax(q_y)
        log_q_y = tf.math.log(q_y + 1e-20)
        kl_tmp = q_y * (log_q_y - tf.math.log(1.0 / M))
        kl = tf.math.reduce_sum(kl_tmp, axis=(1, 2))
        elbo = data_dim * self.bce(y_true, y_pred) - kl
        return elbo
And lastly, the train_step function. Note that the corresponding variables are already tf.float32, so there is no need to convert them.
    def train_step(self, data):
        x = data
        with tf.GradientTape(persistent=True) as tape:
            logits_y, z = self.encoder(x, training=True)
            x_hat = self.decoder(z, training=True)
            loss = self.gumbel_loss(x, x_hat, logits_y)
        grads = tape.gradient(loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}
You don't need to change anything else in the code above. Here are some training logs (running on CPU, TF 2.5):
Epoch 1/5
6000/6000 [==============================] - 60s 10ms/step - loss: 54.4604
Epoch 2/5
6000/6000 [==============================] - 60s 10ms/step - loss: 18.8960
Epoch 3/5
6000/6000 [==============================] - 59s 10ms/step - loss: 12.1036
Epoch 4/5
6000/6000 [==============================] - 59s 10ms/step - loss: 8.5804
Epoch 5/5
6000/6000 [==============================] - 59s 10ms/step - loss: 6.3916

PyTorch CNN loss is not changing

I'm making a CNN for a binary classification problem between images of bees and ants.
The images are 500x500 with 3 channels.
Here is my code.
Dataloader:
def load_data(path):
    data = []
    ant = 0
    bee = 0
    for folder in os.listdir(path):
        print(folder)
        curfolder = os.path.join(path, folder)
        for file in os.listdir(curfolder):
            image = plt.imread(curfolder+'/'+file)
            image = cv2.resize(image, (500,500))
            if folder == 'ants':
                ant += 1
                data.append([np.array(image), np.eye(2)[0]])
            elif folder == 'bees':
                bee += 1
                data.append([np.array(image), np.eye(2)[1]])
    np.random.shuffle(data)
    np.save('train.npy', data)
    print('ants : ', ant)
    print('bees : ', bee)

training_data = np.load("train.npy", allow_pickle=True)
print(len(training_data))
CNN class
class Net(nn.Module):
    def __init__(self):
        super().__init__()  # just run the init of parent class (nn.Module)
        self.conv1 = nn.Conv2d(3, 32, 5)  # input is 1 image, 32 output channels, 5x5 kernel / window
        self.conv2 = nn.Conv2d(32, 64, 5)  # input is 32, bc the first layer output 32. Then we say the output will be 64 channels, 5x5 kernel / window
        self.conv3 = nn.Conv2d(64, 128, 5)

        x = torch.randn(3,500,500).view(-1,3,500,500)
        self._to_linear = None
        self.convs(x)
        print(self._to_linear)

        self.fc1 = nn.Linear(self._to_linear, 512)  # flattening.
        self.fc2 = nn.Linear(512, 2)  # 512 in, 2 out bc we're doing 2 classes (dog vs cat).

    def convs(self, x):
        # max pooling over 2x2
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))
        if self._to_linear is None:
            self._to_linear = x[0].shape[0]*x[0].shape[1]*x[0].shape[2]
        return x

    def forward(self, x):
        x = self.convs(x)
        x = x.view(-1, self._to_linear)  # .view is reshape ... this flattens X before
        x = F.relu(self.fc1(x))
        x = self.fc2(x)  # bc this is our output layer. No activation here.
        return F.softmax(x, dim=1)

net = Net()
print(net)
loss and optimizer
import torch.optim as optim
optimizer = optim.Adam(net.parameters(), lr=0.001)
loss_function = nn.MSELoss()
Data Flat
train_X = torch.Tensor([i[0] for i in training_data]).view(-1,3,500,500)
train_X = train_X/255.0
train_y = torch.Tensor([i[1] for i in training_data])
training the model
device = torch.device("cuda:0")
net = Net().to(device)
print(len(train_X))

epochs = 10
BATCH_SIZE = 1

for epoch in range(epochs):
    for i in range(0, len(train_X), BATCH_SIZE):  # from 0, to the len of x, stepping BATCH_SIZE at a time. [:50] ..for now just to dev
        #print(f"{i}:{i+BATCH_SIZE}")
        batch_X = train_X[i:i+BATCH_SIZE]
        batch_y = train_y[i:i+BATCH_SIZE]
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        net.zero_grad()
        outputs = net(batch_X)
        loss = loss_function(outputs, batch_y)
        loss.backward()
        optimizer.step()  # Does the update
    print(f"Epoch : {epoch}. Loss: {loss}")
The loss does not change from one epoch to the next. I have tried changing the learning rate, but the problem remains.
Epoch : 0. Loss: 0.23345321416854858
Epoch : 1. Loss: 0.23345321416854858
Epoch : 2. Loss: 0.23345321416854858
Epoch : 3. Loss: 0.23345321416854858
Epoch : 4. Loss: 0.23345321416854858
Epoch : 5. Loss: 0.23345321416854858
Epoch : 6. Loss: 0.23345321416854858
Epoch : 7. Loss: 0.23345321416854858
Epoch : 8. Loss: 0.23345321416854858
Epoch : 9. Loss: 0.23345321416854858
Thank you in advance.
In your training loop, you should call optimizer.zero_grad() instead of net.zero_grad(). Also, you are using MSELoss() for a classification problem; you need something like nn.BCELoss(), nn.CrossEntropyLoss(), or nn.NLLLoss().
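A minimal sketch of what that change could look like for the code above (my own illustration; it assumes the one-hot np.eye(2) targets are first converted to class indices, since nn.CrossEntropyLoss expects raw logits and integer labels):

import torch.nn as nn
import torch.optim as optim

# Convert the one-hot targets from the question into class indices (0 = ant, 1 = bee)
train_y_idx = train_y.argmax(dim=1).long()

loss_function = nn.CrossEntropyLoss()   # expects raw logits + integer class labels
optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(epochs):
    for i in range(0, len(train_X), BATCH_SIZE):
        batch_X = train_X[i:i+BATCH_SIZE].to(device)
        batch_y = train_y_idx[i:i+BATCH_SIZE].to(device)

        optimizer.zero_grad()            # clear gradients tracked by the optimizer
        logits = net(batch_X)            # note: the final F.softmax in forward() should then be dropped
        loss = loss_function(logits, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch : {epoch}. Loss: {loss.item()}")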

Same function in Keras Loss and Metric give different values even without regularization

I'm building a custom U-Net for a semantic segmentation problem, but I'm seeing weird behavior in the way the loss and the metric are calculated during training, with very significant differences.
Update: there is a minimal reproducible example at the bottom.
I've read this one (1), and this one (2), another one (3) and yet another one (4), but haven't found a suitable answer.
When training the model, I'm using the same function for the loss and for the metric, and the results vary wildly.
First example with categorical_crossentropy (I'm using a very small toy set just to show it):
from tensorflow.python.keras import losses
model.compile(optimizer='adam', loss=losses.categorical_crossentropy,
              metrics=[losses.categorical_crossentropy])
And the output i get is:
4/4 [===] - 3s 677ms/step - loss: 4.1023 - categorical_crossentropy: 1.0256
- val_loss: 1.3864 - val_categorical_crossentropy: 1.3864
As you can see, the loss is about 4x the categorical_crossentropy metric.
If I use a custom metric, the difference is orders of magnitude:
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.losses import categorical_crossentropy
def dice_cross_loss(y_true, y_pred, epsilon=1e-6, smooth=1):
    ce_loss = categorical_crossentropy(y_true, y_pred)
    y_true_f = K.flatten(y_true)
    y_pred_f = K.flatten(y_pred)
    intersection = K.sum(y_true_f * y_pred_f)
    dice_coef = (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + epsilon)
    return ce_loss - K.log(dice_coef + epsilon)

model.compile(optimizer='adam', loss=dice_cross_loss,
              metrics=[dice_cross_loss])
When I run it, it's even worse:
4/4 [===] - 3s 682ms/step - loss: 20.9706 - dice_cross_loss: 5.2428
- val_loss: 4.3681 - val_dice_cross_loss: 4.3681
When using larger examples, the difference between the loss and the loss as metric can be more than tenfold.
After reading (1), I removed from the model ALL the regularization layers that can behave differently at evaluation time: no dropout, no batchnorm. There is pooling, but that shouldn't be the cause of it.
The fitting code is unremarkable:
model.fit(x=data_x, y=data_y, batch_size=batch_size, epochs=epochs,
          verbose=1, validation_split=0.2, shuffle=True, workers=4)
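As a quick sanity check (my own suggestion, not part of the original question), model.evaluate() computes the loss and the metric with the same fixed weights, so if they still disagree there, the gap cannot be explained by the weights changing over the course of an epoch:

# Evaluate loss and metric with identical, frozen weights
results = model.evaluate(x=data_x, y=data_y, batch_size=batch_size, verbose=0)
print(dict(zip(model.metrics_names, results)))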
This is the code of the network:
class CustomUnet(object):
    def __init__(self, image_shape=(20, 30, 3), n_class=2, **params):
        # read parameters
        initial_filters = params.get("initial_filters", 64)
        conv_activations = params.get("conv_activations", ReLU())
        final_activation = params.get("final_activation", "softmax")
        self.name = "CustomUnet"

        input_layer = Input(shape=image_shape, name='image_input')
        conv1 = self.conv_block(input_layer, nfilters=initial_filters, activation=conv_activations, name="con1")
        conv1_out = MaxPooling2D(pool_size=(2, 2))(conv1)
        conv2 = self.conv_block(conv1_out, nfilters=initial_filters*2, activation=conv_activations, name="con2")
        conv2_out = MaxPooling2D(pool_size=(2, 2))(conv2)
        conv3 = self.conv_block(conv2_out, nfilters=initial_filters*4, activation=conv_activations, name="con3")
        conv3_out = MaxPooling2D(pool_size=(2, 2))(conv3)
        conv4 = self.conv_block(conv3_out, nfilters=initial_filters*8, activation=conv_activations, name="con4")

        # number jumps from 4 to 7 because it used to have an extra layer and haven't got to refactor properly.
        deconv7 = self.deconv_block(conv4, residual=conv3, nfilters=initial_filters*4, name="decon7",
                                    conv_activations=conv_activations)
        deconv8 = self.deconv_block(deconv7, residual=conv2, nfilters=initial_filters*2, name="decon8",
                                    conv_activations=conv_activations)
        deconv9 = self.deconv_block(deconv8, residual=conv1, nfilters=initial_filters, name="decon9",
                                    conv_activations=conv_activations)

        output_layer = Conv2D(filters=n_class, kernel_size=(1, 1))(deconv9)
        model = Model(inputs=input_layer, outputs=output_layer, name='Unet')
        self.model = model

    def conv_block(self, input_layer, nfilters, size=3, padding='same', initializer="he_normal", name="none",
                   activation=ReLU()):
        x = Conv2D(filters=nfilters, kernel_size=(size, size), padding=padding, kernel_initializer=initializer)(input_layer)
        x = Activation(activation)(x)
        x = Conv2D(filters=nfilters, kernel_size=(size, size), padding=padding, kernel_initializer=initializer)(x)
        x = Activation(activation)(x)
        return x

    def deconv_block(self, input_layer, residual, nfilters, size=3, padding='same', strides=(2, 2), name="none",
                     conv_activations=ReLU()):
        y = Conv2DTranspose(nfilters, kernel_size=(size, size), strides=strides, padding=padding)(input_layer)
        y = concatenate([y, residual])  # , axis=3)
        y = self.conv_block(y, nfilters, activation=conv_activations)
        return y
Is this expected behavior? What am I not understanding about how the loss and the metric are calculated? Have I messed something up in the code?
Thanks!!
Minimal reproducible example:
from tensorflow.python.keras.layers import Input, Conv2D, Activation
from tensorflow.python.keras.models import Model
import numpy as np
input_data = np.random.rand(100, 300, 300, 3) # 300x300 images
out_data = np.random.randint(0, 2, size=(100, 300, 300, 4)) # 4 classes
def simple_model(image_shape, n_class):
    input_layer = Input(shape=image_shape, name='image_input')
    x = Conv2D(filters=3, kernel_size=(3, 3), padding="same", kernel_initializer="he_normal")(input_layer)
    x = Activation("relu")(x)
    x = Conv2D(filters=3, kernel_size=(3, 3), padding="same", kernel_initializer="he_normal")(x)
    x = Activation("relu")(x)
    x = Conv2D(filters=n_class, kernel_size=(1, 1))(x)
    output_layer = Activation("softmax")(x)
    model = Model(inputs=input_layer, outputs=output_layer, name='Sample')
    return model
sample_model = simple_model(input_data[0].shape, out_data.shape[-1])
sample_model.compile(optimizer='adam', loss="categorical_crossentropy", metrics=["categorical_crossentropy"])
batch_size = 5
steps = input_data.shape[0] // batch_size
epochs = 20
history = sample_model.fit(x=input_data, y=out_data, batch_size=batch_size, epochs=epochs,  # , callbacks=callbacks,
                           verbose=1, validation_split=0.2, workers=1)
And the results I get still show the weirdness:
80/80 [===] - 9s 108ms/step - loss: 14.0259 - categorical_crossentropy: 2.8051 - val_loss: 13.9439 - val_categorical_crossentropy: 2.7885
So loss: 14.0259 vs. categorical_crossentropy: 2.8051. Now I'm lost...
I got a solution working.
It seems to be an issue with the imported TF libraries.
If I do
from tensorflow.python.keras.layers import Input, Conv2D, Activation
from tensorflow.python.keras.models import Model
I get the weird behavior from above.
But if I replace that with
from keras.layers import Input, Conv2D, Activation
from keras.models import Model
I get much more consistent numbers:
5/80 [>.....] - ETA: 20s - loss: 2.7886 - categorical_crossentropy: 2.7879
10/80 [==>...] - ETA: 12s - loss: 2.7904 - categorical_crossentropy: 2.7899
15/80 [====>.] - ETA: 9s - loss: 2.7900 - categorical_crossentropy: 2.7896
There are still some differences, but they seem much more reasonable.
Still, if you know why, please let me know!
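For what it's worth (a note of mine, not part of the original answer): mixing the private tensorflow.python.keras namespace with the standalone keras package is a known source of inconsistencies, and a commonly recommended alternative is to use the public tf.keras API consistently everywhere:

# One consistent import style (a suggestion, not the poster's original setup)
from tensorflow.keras.layers import Input, Conv2D, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.losses import categorical_crossentropy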
Keras gets its source of randomness from the NumPy random number generator, so this must be seeded regardless of whether you are using a Theano or TensorFlow backend.
We have to use the seed() function at the top of the file, before any other imports or other code:
from numpy.random import seed
seed(1)
In addition, TensorFlow has its own random number generator that must also be seeded by calling the set_random_seed() function immediately after the NumPy random number generator, as follows:
from tensorflow import set_random_seed
set_random_seed(2)
Thanks,
Rajeswari Ponnuru.
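Note that from tensorflow import set_random_seed only exists in TensorFlow 1.x; in TensorFlow 2.x the equivalent call is tf.random.set_seed. A minimal sketch for a TF 2.x environment (an assumption on my part):

from numpy.random import seed
import tensorflow as tf

seed(1)                # seed NumPy, which Keras also draws from
tf.random.set_seed(2)  # TF 2.x replacement for set_random_seed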

Keras metric does not provide same result as metric calculated in callback

I'm trying to do regression using a pretrained VGG16 network. As both loss and metric I have chosen the mean absolute error. I wanted to check whether this score is actually correct, so I implemented the mean absolute error myself in a callback. However, the results are not the same, as can be seen in the output:
Training MAE:126.649451276
Epoch 1/100
638/638 [==============================] - 406s - loss: 38.9601 - mean_absolute_error: 38.9601
Training MAE:40.7683742351
Epoch 2/100
638/638 [==============================] - 362s - loss: 19.8719 - mean_absolute_error: 19.8719
Training MAE:43.2516028945
The Training MAE should be the same (or at least almost the same) as the loss or the mean_absolute_error shown in the epoch above it. For the first epoch this is OK. For the second epoch it is not: there the MAE is 43.25, while the loss and the mean_absolute_error reported by Keras are both 19.87.
I've cleaned up my code and tried to find the reason, but I can't. Why is this happening?
My code:
from keras.layers.core import Flatten, Dense, Dropout
import numpy as np
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras import optimizers
from keras.models import Model
import os
from keras.layers.core import *
from keras.callbacks import Callback, ModelCheckpoint
os.environ["CUDA_VISIBLE_DEVICES"]="2"
model_checkpoints = "/home/usr/PycharmProjects/RSNA/model_checkpoints/model2.hdf5"
data_dir = "/home/usr/PycharmProjects/RSNA/data/"
data_training = "dataset/training"
training_images = "boneage-training-dataset/"
training_gt = "training_gt/"
n_batch = 16
n_training_samples = 10213
n_validation_samples = 1136
n_testing_samples = 1262
def mae(X, y, mdl):
    pred = mdl.predict(X)
    gt = y
    return str(np.mean(np.abs(np.array(gt)-np.array(pred))))

class LossHistory(Callback):
    def on_epoch_begin(self, epoch, logs={}):
        mae_score = mae(X_train, y_train, self.model)
        print "Training MAE:" + mae_score

def regression_flow_from_directory(flow_from_directory_gen, rev_indices):
    for x, y in flow_from_directory_gen:
        yield x, [float(rev_indices[val]) for val in y]
if __name__ == '__main__':
    width = 224
    height = 224
    X_train = []
    y_train = []

    train_datagen = image.ImageDataGenerator(
        rescale=1./255,
        width_shift_range=0.2,
        height_shift_range=0.2,
    )
    train_generator = train_datagen.flow_from_directory(
        data_dir+data_training,
        target_size=(width, height),
        batch_size=n_batch,
        color_mode='rgb',
        class_mode='sparse',
        seed=42)
    indices = train_generator.class_indices
    rev_indices = dict((v, k) for k, v in indices.iteritems())
    train_generator = regression_flow_from_directory(train_generator, rev_indices)

    i = 0
    print "Epochs: " + str(n_training_samples//n_batch)
    for x, y in train_generator:
        if i <= n_training_samples//n_batch:
            X_train.extend(x)
            y_train.extend(y)
            i += 1
        else:
            break
    print "Maximum: " + str(np.max(y_train))
    X_train = np.array(X_train)
    print X_train.shape

    model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    last = model.output
    x = Flatten(name='flatten')(last)
    x = Dense(4096, activation='relu', name='fc1')(x)
    x = Dropout(0.5, noise_shape=None, seed=None)(x)
    x = Dense(4096, activation='relu', name='fc2')(x)
    x = Dense(1, activation='relu', name='predictions')(x)
    my_model = Model(input=model.input, output=x)
    my_model.compile(loss="mae", optimizer=optimizers.SGD(lr=0.00001, momentum=0.9),
                     metrics=["mae"])

    history = LossHistory()
    print my_model.summary()
    print n_validation_samples//n_batch
    my_model.fit_generator(
        train_generator,
        steps_per_epoch=n_training_samples//n_batch,
        epochs=100,
        callbacks=[history],
    )
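One way to make the comparison more direct (a sketch of mine, assuming X_train and y_train fit in memory exactly as in the script above) is to also evaluate the hand-rolled MAE at the end of each epoch, so the callback uses the weights the epoch just finished with, whereas Keras' reported mean_absolute_error is an average over the epoch's batches computed with weights that keep changing:

class LossHistoryBothEnds(Callback):
    # Hypothetical variant of LossHistory that evaluates the hand-rolled MAE
    # both before and after each epoch.
    def on_epoch_begin(self, epoch, logs={}):
        print("Training MAE (epoch start): " + mae(X_train, y_train, self.model))

    def on_epoch_end(self, epoch, logs={}):
        print("Training MAE (epoch end): " + mae(X_train, y_train, self.model))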
