Varational autoencoder using tf.GradientTape

Varational autoencoder using tf.GradientTape - python

Here is an example of tensorflow GradientTape provided by keras for a typical variotional autoencoder:
VAE-keras-example
The train_step function is implemented inside the model and it is trained with the "model.fit()". The example performs great and no problem at all.
However, for another application, I need to implement the train_step function outside of the model definition. In the beginning, I started with above mentioned example as the target application is also a kind of VAE. Accordingly, I applied some modifications and tried to train the same model structure; please find the whole code in the next; however, I get very weird numbers for the loss values comparing to the original code; even after a couple of iterations it gets nan values for losses.
Could you please let me know what's the mistake and why this happens?
Thanks in advance
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow import keras
import numpy as np
print(tf.test.is_gpu_available()) # prints True
print(tf.__version__) # prints '2.0.0-beta1'
class Sampling(layers.Layer):
"""Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
def call(self, inputs):
z_mean, z_log_var = inputs
batch = tf.shape(z_mean)[0]
dim = tf.shape(z_mean)[1]
epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
return z_mean + tf.exp(0.5 * z_log_var) * epsilon
latent_dim = 2
encoder_inputs = keras.Input(shape=(28, 28, 1))
x = layers.Conv2D(32, 3, activation="relu", strides=2, padding="same")(encoder_inputs)
x = layers.Conv2D(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu")(x)
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])
x = layers.Dense(7 * 7 * 64, activation="relu")(z)
x = layers.Reshape((7, 7, 64))(x)
x = layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same")(x)
decoder_outputs = layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same")(x)
model = keras.Model(encoder_inputs, [decoder_outputs, z_mean, z_log_var] , name="decoder")
model.summary()
optimizer = tf.keras.optimizers.Adam(lr=0.001)
objective = tf.keras.losses.SparseCategoricalCrossentropy()
(x_train, _), (x_test, _) = keras.datasets.mnist.load_data()
num_samples = x_train.shape[0]
epochs=1
batch_size=128
#tf.function
def train_step(data):
with tf.GradientTape() as tape:
reconstruction, z_mean, z_log_var = model(data, training=True)
data = tf.expand_dims(data, axis=-1)
reconstruction_loss = tf.reduce_mean(
tf.reduce_sum(keras.losses.binary_crossentropy(data, reconstruction), axis=(1, 2)))
kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
total_loss = (reconstruction_loss + kl_loss)
grads = tape.gradient(total_loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
return total_loss, reconstruction_loss, kl_loss
with tf.device('gpu:0'):
for epoch in range (epochs):
for step in range(num_samples//batch_size):
s = step*batch_size
e = s+batch_size
x_batch = x_train[s:e,:,:]
total_loss, reconstruction_loss, kl_loss = train_step(x_batch)
print("-----------------")
print(f"epoch: {epoch} step: {step}")
print(f"reconstruction_loss: {reconstruction_loss} ")
print(f"kl_loss: {kl_loss} ")
print(f"total_loss: {total_loss}")

I think you forgot to normalize your data as shown in the tutorial you are referring to:
(x_train, _), (x_test, _) = keras.datasets.mnist.load_data()
x_train = x_train.astype("float32") / 255
Otherwise, your code seems to be running fine and the loss in not nan. Here is the code for reference:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow import keras
import numpy as np
print(tf.test.is_gpu_available()) # prints True
print(tf.__version__) # prints '2.0.0-beta1'
class Sampling(layers.Layer):
"""Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
def call(self, inputs):
z_mean, z_log_var = inputs
batch = tf.shape(z_mean)[0]
dim = tf.shape(z_mean)[1]
epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
return z_mean + tf.exp(0.5 * z_log_var) * epsilon
latent_dim = 2
encoder_inputs = keras.Input(shape=(28, 28, 1))
x = layers.Conv2D(32, 3, activation="relu", strides=2, padding="same")(encoder_inputs)
x = layers.Conv2D(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu")(x)
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])
x = layers.Dense(7 * 7 * 64, activation="relu")(z)
x = layers.Reshape((7, 7, 64))(x)
x = layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same")(x)
decoder_outputs = layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same")(x)
model = keras.Model(encoder_inputs, [decoder_outputs, z_mean, z_log_var] , name="decoder")
model.summary()
optimizer = tf.keras.optimizers.Adam(lr=0.001)
objective = tf.keras.losses.SparseCategoricalCrossentropy()
(x_train, _), (x_test, _) = keras.datasets.mnist.load_data()
x_train = x_train.astype("float32") / 255
num_samples = x_train.shape[0]
epochs=4
batch_size=128
#tf.function
def train_step(data):
with tf.GradientTape() as tape:
reconstruction, z_mean, z_log_var = model(data, training=True)
reconstruction_loss = tf.reduce_mean(
tf.reduce_sum(keras.losses.binary_crossentropy(data, reconstruction), axis=(1, 2)))
kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))/batch_size
kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))/batch_size
total_loss = (reconstruction_loss + kl_loss)
grads = tape.gradient(total_loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
return total_loss, reconstruction_loss, kl_loss
with tf.device('gpu:0'):
for epoch in range (epochs):
for step in range(num_samples//batch_size):
s = step*batch_size
e = s+batch_size
x_batch = x_train[s:e,:,:, tf.newaxis]
print(x_batch.shape)
total_loss, reconstruction_loss, kl_loss = train_step(x_batch)
print("-----------------")
print(f"epoch: {epoch} step: {step}")
print(f"reconstruction_loss: {reconstruction_loss} ")
print(f"kl_loss: {kl_loss} ")
print(f"total_loss: {total_loss}")

Related

3D - Variational Autoencoder implementation Error

I'm trying to implement a 3D variational autoencoder following instructions in the book Generative Deep Learning and taking also some things from here link, those examples are in 2D so I have adapted it.
This is the code i'm using:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Input, BatchNormalization, Conv3D, Dense, Flatten, Lambda, Reshape, UpSampling3D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
## Functional API
input_shape = (32, 32, 32, 1)
z_dim = 3000
########### Encoder ############
enc_in = Input(shape = input_shape, name="encoder_input")
enc_conv1 = tf.keras.layers.Conv3D(filters=8, kernel_size=5, strides=(1,1,1), padding='same', activation="ReLU", kernel_initializer="he_uniform", name="conv_1")(enc_in)
max1 = tf.keras.layers.MaxPool3D(pool_size=(2,2,2), strides=(2,2,2), padding='valid', name="max_pool_1")(enc_conv1)
enc_conv2 = tf.keras.layers.Conv3D(filters=16, kernel_size=3, padding='same', activation="ReLU", kernel_initializer="he_uniform", name="conv_2")(max1)
max2 = tf.keras.layers.MaxPool3D(pool_size=2, name="max_pool_2")(enc_conv2)
enc_fc1 = Dense(units = 512, kernel_initializer = 'he_uniform', activation = 'ReLU')(Flatten()(max2))
mu = Dense(units = z_dim, kernel_initializer = 'he_uniform', activation = None, name="mu")(enc_fc1)
log_variance = Dense(units = z_dim, kernel_initializer = 'he_uniform', activation = None, name="log_variance")(enc_fc1)
def sampling(args):
mu, log_variance = args
epsilon = K.random_normal(shape=K.shape(mu), mean= 0., stddev= 1.)
return mu + K.exp(0.5 * log_variance) * epsilon
z = Lambda(sampling, output_shape = (z_dim, ), name="z")([mu, log_variance])
encoder = Model(enc_in, [mu, log_variance, z], name="encoder")
########## Decoder ############
dec_in = Input(shape = (z_dim, ), name="decoder_input")
dec_fc1 = Dense(units = 512, kernel_initializer = 'he_uniform', activation = 'ReLU')(dec_in)
dec_unflatten = Reshape(target_shape = (8,8,8,1))(dec_fc1)
dec_conv1 = tf.keras.layers.Conv3D(filters=32, kernel_size=3, padding='same', activation="ReLU", kernel_initializer="he_uniform", name="deconv_1")(dec_unflatten)
ups1 = tf.keras.layers.UpSampling3D(size=(2, 2, 2), name="ups_1")(dec_conv1)
dec_conv2 = tf.keras.layers.Conv3D(filters=16, kernel_size=3, padding='same', activation="ReLU", kernel_initializer="he_uniform", name="deconv_2")(ups1)
ups2 = tf.keras.layers.UpSampling3D(size=(2, 2, 2), name="ups_2")(dec_conv2)
dec_conv4 = Conv3D(filters = 1, kernel_size = (3, 3, 3), padding = 'same', activation="ReLU", kernel_initializer = 'he_uniform', name='decorder_output')(ups2)
decoder = Model(dec_in, dec_conv4, name="decoder")
model_input = enc_in
model_output = decoder(z)
v_autoencoder = Model(model_input, model_output)
######################
def vae_r_loss(T1_input, model_output):
r_loss = K.mean(K.square(T1_input - model_output), axis = [1,2,3])
return r_loss_factor * r_loss
def vae_kl_loss(T1_input, model_output):
kl_loss = -0.5 * K.sum(1 + log_variance - K.square(mu) - K.exp(log_variance), axis = 1)
return kl_loss
def vae_loss(T1_input, model_output):
r_loss = vae_r_loss(T1_input, model_output)
kl_loss = vae_kl_loss(T1_input, model_output)
return r_loss + kl_loss
learning_rate=0.001
r_loss_factor = 500
optimizer = keras.optimizers.Adam(lr=learning_rate)
v_autoencoder.compile(optimizer=optimizer, loss = vae_loss, metrics = [vae_r_loss, vae_kl_loss])
v_autoencoder.summary()
But when I try to train the model:
history = v_autoencoder.fit(T1_input, T1_input, epochs=10, validation_split=0.2, shuffle=True)
I get this error:
TypeError: You are passing KerasTensor(type_spec=TensorSpec(shape=(), dtype=tf.float32, name=None), name='Placeholder:0', description="created by layer 'tf.cast_4'"), an intermediate Keras symbolic input/output, to a TF API that does not allow registering custom dispatchers, such as `tf.cond`, `tf.function`, gradient tapes, or `tf.map_fn`. Keras Functional model construction only supports TF API calls that *do* support dispatching, such as `tf.math.add` or `tf.reshape`. Other APIs cannot be called directly on symbolic Kerasinputs/outputs. You can work around this limitation by putting the operation in a custom Keras layer `call` and calling that layer on this symbolic input/output.

The encoder and the decoder model seem to disconnected and the KerasTensor are not able to flow correctly. Also, the encoder returns three tensors [ mu , variance , z] whereas the decoder only requires z as an input.
Replace these lines,
model_input = enc_in
model_output = decoder(z)
v_autoencoder = Model(model_input, model_output)
with,
vae_input = Input(shape = input_shape, name="encoder_input")
mu_ , variance_ , z_ = encoder( vae_input )
vae_output = decoder( z_ )
v_autoencoder = Model( vae_input , [vae_output , mu_ , variance_] )
So, basically, the model will now output mu_ and variance_ which are the outputs of the encoder, and are required for vae_kl_loss. Earlier, you were using these KerasTensor directly in vae_kl_loss and hence you got the error.
Remove the axis argument from vae_kl_loss,
def vae_kl_loss( mu , log_variance ):
kl_loss = -0.5 * K.sum(1 + log_variance - K.square(mu) - K.exp(log_variance))
return kl_loss
Pass mu and variance to vae_kl_loss,
def vae_loss(T1_input, model_output):
mu = model_output[ 1 ]
variance = model_output[ 2 ]
vae_output = model_output[ 0 ]
r_loss = vae_r_loss(T1_input, vae_output )
kl_loss = vae_kl_loss( mu , variance )
return r_loss + kl_loss
Modified code:
import tensorflow as tf
from tensorflow.keras.layers import Input, BatchNormalization, Conv3D, Dense, Flatten, Lambda, Reshape, UpSampling3D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
## Functional API
input_shape = (32, 32, 32, 1)
z_dim = 3000
########### Encoder ############
enc_in = Input(shape = input_shape, name="encoder_input")
enc_conv1 = tf.keras.layers.Conv3D(filters=8, kernel_size=5, strides=(1,1,1), padding='same', activation="ReLU", kernel_initializer="he_uniform", name="conv_1")(enc_in)
max1 = tf.keras.layers.MaxPool3D(pool_size=(2,2,2), strides=(2,2,2), padding='valid', name="max_pool_1")(enc_conv1)
enc_conv2 = tf.keras.layers.Conv3D(filters=16, kernel_size=3, padding='same', activation="ReLU", kernel_initializer="he_uniform", name="conv_2")(max1)
max2 = tf.keras.layers.MaxPool3D(pool_size=2, name="max_pool_2")(enc_conv2)
enc_fc1 = Dense(units = 512, kernel_initializer = 'he_uniform', activation = 'ReLU')(Flatten()(max2))
mu = Dense(units = z_dim, kernel_initializer = 'he_uniform', activation = None, name="mu")(enc_fc1)
log_variance = Dense(units = z_dim, kernel_initializer = 'he_uniform', activation = None, name="log_variance")(enc_fc1)
def sampling(args):
mu, log_variance = args
epsilon = K.random_normal(shape=K.shape(mu), mean= 0., stddev= 1.)
return mu + K.exp(0.5 * log_variance) * epsilon
z = Lambda(sampling, output_shape = (z_dim, ), name="z")([mu, log_variance])
encoder = Model(enc_in, [mu, log_variance, z], name="encoder")
########## Decoder ############
dec_in = Input(shape = (z_dim, ), name="decoder_input")
dec_fc1 = Dense(units = 512, kernel_initializer = 'he_uniform', activation = 'ReLU')(dec_in)
dec_unflatten = Reshape(target_shape = (8,8,8,1))(dec_fc1)
dec_conv1 = tf.keras.layers.Conv3D(filters=32, kernel_size=3, padding='same', activation="ReLU", kernel_initializer="he_uniform", name="deconv_1")(dec_unflatten)
ups1 = tf.keras.layers.UpSampling3D(size=(2, 2, 2), name="ups_1")(dec_conv1)
dec_conv2 = tf.keras.layers.Conv3D(filters=16, kernel_size=3, padding='same', activation="ReLU", kernel_initializer="he_uniform", name="deconv_2")(ups1)
ups2 = tf.keras.layers.UpSampling3D(size=(2, 2, 2), name="ups_2")(dec_conv2)
dec_conv4 = Conv3D(filters = 1, kernel_size = (3, 3, 3), padding = 'same', activation="ReLU", kernel_initializer = 'he_uniform', name='decorder_output')(ups2)
decoder = Model(dec_in, dec_conv4, name="decoder")
vae_input = Input(shape = input_shape, name="encoder_input")
mu_ , variance_ , z_ = encoder( vae_input )
vae_output = decoder( z_ )
v_autoencoder = Model( vae_input , [vae_output , mu_ , variance_] )
######################
def vae_r_loss(T1_input, model_output):
r_loss = K.mean(K.square(T1_input - model_output), axis = [1,2,3])
return r_loss_factor * r_loss
def vae_kl_loss( mu , log_variance ):
kl_loss = -0.5 * K.sum(1 + log_variance - K.square(mu) - K.exp(log_variance))
return kl_loss
def vae_loss(T1_input, model_output):
mu = model_output[ 1 ]
variance = model_output[ 2 ]
vae_output = model_output[ 0 ]
r_loss = vae_r_loss(T1_input, vae_output )
kl_loss = vae_kl_loss( mu , variance )
return r_loss + kl_loss
learning_rate=0.001
r_loss_factor = 500
optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
v_autoencoder.compile(optimizer=optimizer, loss = vae_loss, metrics = [vae_r_loss, vae_kl_loss])
v_autoencoder.summary()

Adversarial Autoencoder is not working and not learning properly

I am trying to get an Adversarial AutoEncoder going using keras Fit method on a keras.model class
but for some reason it is not working.
Keep in mind that I tried updating encoder and decoder at the same time.
I tried giving the disc loss to the encoder with and without the reconstruction loss
The reconstruction loss stayed the same while encoder disc loss kept increasing as the discriminator's own loss kept dropping.
discriminator = keras.Sequential(
[
keras.Input(shape=(4, 4, 128)),
layers.Flatten(),
layers.Dense(128, activation="relu"),
layers.Dense(128, activation="relu"),
layers.Dense(128, activation="relu"),
layers.Dense(1, activation="sigmoid"),
],
name="discriminator",
)
discriminator.summary()
encoder = keras.Sequential(
[
keras.Input(shape=(28, 28, 1)),
layers.Conv2D(24, 3, activation="relu", strides=2, padding="same"),
layers.Conv2D(48, 3, activation="relu", strides=2, padding="same"),
layers.Conv2D(96, 3, activation="relu", strides=2, padding="same"),
layers.Flatten(),
layers.Dense(4 * 4 * 128, activation="linear"),
layers.Reshape((4, 4, 128)),
],
name="encoder",
)
encoder.summary()
decoder = keras.Sequential(
[
keras.Input(shape=(4, 4, 128)),
layers.Flatten(),
layers.Dense(7 * 7 * 64, activation="relu"),
layers.Reshape((7, 7, 64)),
layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same"),
layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same"),
layers.Conv2DTranspose(1, 3, activation="sigmoid", strides=1, padding="same"),
],
name="decoder",
)
I am not sure If it is in the model itself of not. I am using MNIST Dataset for this
class AAE(keras.Model):
def __init__(self, encoder, decoder, discriminator):
super(AAE, self).__init__()
self.encoder = encoder
self.decoder = decoder
self.discriminator = discriminator
self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
self.disc_tracker = keras.metrics.Mean(name="disc_loss")
self.discEnc_tracker = keras.metrics.Mean(name="discEnc_loss")
#property
def metrics(self):
return [
self.total_loss_tracker,
self.reconstruction_loss_tracker,
self.disc_tracker,
self.discEnc_tracker,
]
def compile(self, di_optimizer, e_optimizer,de_optimizer, loss_fn):
super(AAE, self).compile()
self.dis_optimizer = di_optimizer
self.e_optimizer = e_optimizer
self.de_optimizer = de_optimizer
self.lossBCE = loss_fn[0]
self.lossMAE = loss_fn[1]
def train_step(self, data):
latent = self.encoder(data)
batch_size = 200
dists = tf.random.normal((batch_size,4,4,128))
y_real = tf.ones((batch_size, 1))
y_fake = tf.zeros((batch_size, 1))
real_dist_mix = tf.concat((dists, latent),axis=0)
y_real_fake_mix = tf.concat((y_real, y_fake),axis=0)
with tf.GradientTape() as tape:
predictions = self.discriminator(real_dist_mix)
d_loss = self.lossBCE(y_real_fake_mix, predictions)
grads = tape.gradient(d_loss, self.discriminator.trainable_weights)
self.dis_optimizer.apply_gradients(zip(grads, self.discriminator.trainable_weights))
with tf.GradientTape() as Etape, tf.GradientTape() as Dtape:
latent = self.encoder(data)
reconstruction = self.decoder(latent)
reconstruction_loss = self.lossMAE(data, reconstruction)
total_loss = reconstruction_loss
Egrads = Etape.gradient(total_loss, self.encoder.trainable_weights)
self.e_optimizer.apply_gradients(zip(Egrads, self.encoder.trainable_weights))
Dgrads = Dtape.gradient(total_loss, self.decoder.trainable_weights)
self.de_optimizer.apply_gradients(zip(Dgrads, self.decoder.trainable_weights))
with tf.GradientTape() as tape:
latent = self.encoder(data)
predictions = self.discriminator(latent)
e_loss = self.lossBCE(y_fake, predictions)
grads = tape.gradient(e_loss, self.encoder.trainable_weights)
self.e_optimizer.apply_gradients(zip(grads, self.encoder.trainable_weights))
self.total_loss_tracker.update_state(total_loss)
self.reconstruction_loss_tracker.update_state(reconstruction_loss)
self.disc_tracker.update_state(d_loss)
self.discEnc_tracker.update_state(e_loss)
return {
"loss": self.total_loss_tracker.result(),
"reconstruction_loss": self.reconstruction_loss_tracker.result(),
"disc_loss": self.disc_tracker.result(),
"discEnc_loss": self.discEnc_tracker.result(),
}
(x_train, _), (x_test, _) = keras.datasets.mnist.load_data()
mnist_digits = np.concatenate([x_train, x_test], axis=0)
mnist_digits = np.expand_dims(mnist_digits, -1).astype("float32") / 255
Aae = AAE(encoder, decoder, discriminator)
#vae.compile(optimizer=keras.optimizers.Adam())
Aae.compile(
di_optimizer=keras.optimizers.Adam(learning_rate=0.00001),
e_optimizer=keras.optimizers.Adam(learning_rate=0.0001),
de_optimizer=keras.optimizers.Adam(learning_rate=0.0001),
loss_fn=[tf.keras.losses.BinaryCrossentropy(),tf.keras.losses.MeanAbsoluteError()]
)
h=Aae.fit(mnist_digits, epochs=15, batch_size=200)

I think that the error is here:
with tf.GradientTape() as tape:
latent = self.encoder(data)
predictions = self.discriminator(latent)
e_loss = self.lossBCE(y_fake, predictions)
grads = tape.gradient(e_loss, self.encoder.trainable_weights)
self.e_optimizer.apply_gradients(zip(grads, self.encoder.trainable_weights))
I would put e_loss = self.lossBCE(y_real, predictions), because the encoder tries to fool the discriminator.

Differences between model results

I've been experimenting with VAEs for weeks now, trying to get similar results as in a tutorial code, but being unsuccessful in it. The only difference between mine and the tutorial's code is that it compiles the model (demonstration will be below), and runs fit on it, but I run each layer manually. What I've done is set up weight initialization with numpy arrays, just as with epsilons, saved into .npy files, and ran the two models. The results were different (the tutorial's model gives way more smooth results)
As of my understanding of neural networks so far, given the same training data in the same order, with the same layers, with same initial weights, same optimizer, and hyperparameters, the results should be the same all the time.
I've made 3 simple examples. MNIST data with two dense layers with sizes 16, and 10. Weight initialization:
import numpy as np
w1 = np.random.uniform(-1, 1, size=(784, 16))
w2 = np.random.uniform(-1, 1, size=(16, 10))
np.save('w1', w1)
np.save('w2', w2)
First version (manually executing layers):
import numpy as np
import tensorflow as tf
import math
w1 = np.load('w1.npy')
w2 = np.load('w2.npy')
class Model(tf.keras.Model):
def __init__(self):
super().__init__()
self.flat = tf.keras.layers.Flatten()
self.w1 = tf.keras.layers.Dense(
units=16,
activation='relu',
use_bias=False,
kernel_initializer=tf.keras.initializers.constant(w1)
)
self.w2 = tf.keras.layers.Dense(
units=10,
activation='sigmoid',
use_bias=False,
kernel_initializer=tf.keras.initializers.constant(w2)
)
self.optimizer = tf.keras.optimizers.Adam()
#tf.function
def call(self, x, y):
with tf.GradientTape() as tape:
x = self.flat(x)
a = self.w1(x)
y_hat = self.w2(a)
loss = tf.keras.losses.sparse_categorical_crossentropy(y, y_hat)
gradients = tape.gradient(loss, self.trainable_variables)
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
return loss
def train(x, y, epochs, batch_size):
batch_quantity = math.floor(x.shape[0] / batch_size)
for e in range(epochs):
for b in range(batch_quantity):
train_x = x[b:b + batch_size]
train_y = y[b:b + batch_size]
loss = model(train_x, train_y)
print(
'Epoch:', e,
'Batch:', b,
'Loss:', loss.numpy().sum(),
)
model = Model()
(x, y), _ = tf.keras.datasets.mnist.load_data()
epochs = 30
batch_size = 100
train(x, y, epochs, batch_size)
This one gives the exact same results no matter how many times you run it. As expected.
Second version (saving layers into a Model - manually calculating gradients, optimization - compile, and fit):
import numpy as np
import tensorflow as tf
w1 = np.load('w1.npy')
w2 = np.load('w2.npy')
input = tf.keras.Input(shape=(28, 28))
x = tf.keras.layers.Flatten()(input)
a = tf.keras.layers.Dense(
units=16,
activation='relu',
use_bias=False,
kernel_initializer=tf.keras.initializers.constant(w1)
)(x)
y_hat = tf.keras.layers.Dense(
units=10,
activation='sigmoid',
use_bias=False,
kernel_initializer=tf.keras.initializers.constant(w2)
)(a)
model = tf.keras.Model(input, y_hat, name='model')
model.summary()
class Model(tf.keras.Model):
def __init__(self, model):
super().__init__()
self.model = model
def train_step(self, data):
x, y = data[0]
with tf.GradientTape() as tape:
y_hat = self.model(x)
loss = tf.keras.losses.sparse_categorical_crossentropy(y, y_hat)
gradients = tape.gradient(loss, self.trainable_variables)
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
return {
'loss': tf.reduce_sum(loss)
}
model = Model(model)
train_data, _ = tf.keras.datasets.mnist.load_data()
epochs = 30
batch_size = 100
model.compile(optimizer=tf.keras.optimizers.Adam())
model.fit(train_data, epochs=epochs, batch_size=batch_size shuffle=False)
Third version (saving layers into a Model compile, and fit):
import numpy as np
import tensorflow as tf
w1 = np.load('w1.npy')
w2 = np.load('w2.npy')
model = tf.keras.models.Sequential([
tf.keras.layers.Flatten(input_shape=(28, 28)),
tf.keras.layers.Dense(
units=16,
activation='relu',
use_bias=False,
kernel_initializer=tf.keras.initializers.constant(w1)
),
tf.keras.layers.Dense(
units=10,
activation='sigmoid',
use_bias=False,
kernel_initializer=tf.keras.initializers.constant(w2)
)
])
(x, y), _ = tf.keras.datasets.mnist.load_data()
batch_size = 100
epochs = 30
model.compile(
loss=tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.SUM),
optimizer=tf.keras.optimizers.Adam()
)
model.fit(
x, y, batch_size=batch_size, epochs=epochs, shuffle=False
)
The second, and third versions are giving almost the exact same results, but different than the first. That I can't understand, as the optimizer, the loss function, the loss reduction, initial weights, training data, everything is exactly the same.
If you're curious, the VAE examples, and their differences can be tested.
Initialize weights:
import numpy as np
epsilon = np.random.normal(size=(100, 2))
wc1 = np.random.uniform(-0.022, 0.022, size=(3,3,1,32))
wc2 = np.random.uniform(-0.022, 0.022, size=(3,3,32,64))
wd1 = np.random.uniform(-0.022, 0.022, size=(3136,16))
wm = np.random.uniform(-0.022, 0.022, size=(16,2))
ws = np.random.uniform(-0.022, 0.022, size=(16,2))
wd2 = np.random.uniform(-0.022, 0.022, size=(2,3136))
wct1 = np.random.uniform(-0.022, 0.022, size=(3,3,64,64))
wct2 = np.random.uniform(-0.022, 0.022, size=(3,3,32,64))
wct3 = np.random.uniform(-0.022, 0.022, size=(3,3,1,32))
np.save('epsilon', epsilon)
np.save('wc1', wc1)
np.save('wc2', wc2)
np.save('wd1', wd1)
np.save('wm', wm)
np.save('ws', ws)
np.save('wd2', wd2)
np.save('wct1', wct1)
np.save('wct2', wct2)
np.save('wct3', wct3)
Tutorial's VAE:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
wc1 = np.load('wc1.npy')
wc2 = np.load('wc2.npy')
wd1 = np.load('wd1.npy')
wm = np.load('wm.npy')
ws = np.load('ws.npy')
wd2 = np.load('wd2.npy')
wct1 = np.load('wct1.npy')
wct2 = np.load('wct2.npy')
wct3 = np.load('wct3.npy')
epsilon = np.load('epsilon.npy')
class Sampling(layers.Layer):
"""Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
def call(self, inputs):
z_mean, z_log_var = inputs
# batch = tf.shape(z_mean)[0]
# dim = tf.shape(z_mean)[1]
# epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
return z_mean + tf.exp(0.5 * z_log_var) * epsilon
latent_dim = 2
encoder_inputs = keras.Input(shape=(28, 28, 1))
x = layers.Conv2D(32, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wc1))(encoder_inputs)
x = layers.Conv2D(64, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wc2))(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wd1))(x)
z_mean = layers.Dense(latent_dim, name="z_mean", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wm))(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var", use_bias=False, kernel_initializer=tf.keras.initializers.constant(ws))(x)
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
encoder.summary()
latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(7 * 7 * 64, activation="relu", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wd2))(latent_inputs)
x = layers.Reshape((7, 7, 64))(x)
x = layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wct1))(x)
x = layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wct2))(x)
decoder_outputs = layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wct3))(x)
decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
decoder.summary()
class VAE(keras.Model):
def __init__(self, encoder, decoder, **kwargs):
super(VAE, self).__init__(**kwargs)
self.encoder = encoder
self.decoder = decoder
def train_step(self, data):
if isinstance(data, tuple):
data = data[0]
with tf.GradientTape() as tape:
z_mean, z_log_var, z = encoder(data)
reconstruction = decoder(z)
reconstruction_loss = tf.reduce_mean(
keras.losses.binary_crossentropy(data, reconstruction)
)
reconstruction_loss *= 28 * 28
kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
kl_loss = tf.reduce_mean(kl_loss)
kl_loss *= -0.5
total_loss = reconstruction_loss + kl_loss
grads = tape.gradient(total_loss, self.trainable_weights)
self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
return {
'mean': z_mean,
"loss": total_loss,
"reconstruction_loss": reconstruction_loss,
"kl_loss": kl_loss,
}
(x_train, _), (x_test, _) = keras.datasets.mnist.load_data()
mnist_digits = x_train
mnist_digits = np.expand_dims(mnist_digits, -1).astype("float32") / 255
epochs=30
batch_size=100
vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())
vae.fit(mnist_digits, epochs=epochs, batch_size=batch_size)
import matplotlib.pyplot as plt
def plot_latent(encoder, decoder):
# display a n*n 2D manifold of digits
n = 30
digit_size = 28
scale = 2.0
figsize = 15
figure = np.zeros((digit_size * n, digit_size * n))
# linearly spaced coordinates corresponding to the 2D plot
# of digit classes in the latent space
grid_x = np.linspace(-scale, scale, n)
grid_y = np.linspace(-scale, scale, n)[::-1]
for i, yi in enumerate(grid_y):
for j, xi in enumerate(grid_x):
z_sample = np.array([[xi, yi]])
x_decoded = decoder.predict(z_sample)
digit = x_decoded[0].reshape(digit_size, digit_size)
figure[
i * digit_size : (i + 1) * digit_size,
j * digit_size : (j + 1) * digit_size,
] = digit
plt.figure(figsize=(figsize, figsize))
start_range = digit_size // 2
end_range = n * digit_size + start_range + 1
pixel_range = np.arange(start_range, end_range, digit_size)
sample_range_x = np.round(grid_x, 1)
sample_range_y = np.round(grid_y, 1)
plt.xticks(pixel_range, sample_range_x)
plt.yticks(pixel_range, sample_range_y)
plt.xlabel("z[0]")
plt.ylabel("z[1]")
plt.imshow(figure, cmap="Greys_r")
plt.savefig('trg.png')
plt.close()
plot_latent(encoder, decoder)
def plot_label_clusters(encoder, decoder, data, labels):
# display a 2D plot of the digit classes in the latent space
z_mean, _, _ = encoder.predict(data)
plt.figure(figsize=(12, 10))
plt.scatter(z_mean[:, 0], z_mean[:, 1], c=labels)
plt.colorbar()
plt.xlabel("z[0]")
plt.ylabel("z[1]")
plt.savefig('wer.png')
plt.close()
(x_train, y_train), _ = keras.datasets.mnist.load_data()
x_train = np.expand_dims(x_train, -1).astype("float32") / 255
plot_label_clusters(encoder, decoder, x_train, y_train)
My VAE:
import math
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
batch_size = 100
epochs = 30
(x, y), _ = tf.keras.datasets.mnist.load_data()
x = tf.expand_dims(x.astype("float32") / 255, 3)
wc1 = np.load('wc1.npy')
wc2 = np.load('wc2.npy')
wd1 = np.load('wd1.npy')
wm = np.load('wm.npy')
ws = np.load('ws.npy')
wd2 = np.load('wd2.npy')
wct1 = np.load('wct1.npy')
wct2 = np.load('wct2.npy')
wct3 = np.load('wct3.npy')
epsilon = np.load('epsilon.npy')
class VAE(tf.keras.Model):
def __init__(self):
super().__init__()
self.encoder = [
tf.keras.layers.Conv2D(32, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wc1)),
tf.keras.layers.Conv2D(64, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wc2)),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(16, activation="relu", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wd1)),
]
self.wm = tf.keras.layers.Dense(2, use_bias=False, kernel_initializer=tf.keras.initializers.constant(wm))
self.wv = tf.keras.layers.Dense(2, use_bias=False, kernel_initializer=tf.keras.initializers.constant(ws))
self.decoder = [
tf.keras.layers.Dense(7 * 7 * 64, activation="relu", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wd2)),
tf.keras.layers.Reshape((7, 7, 64)),
tf.keras.layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wct1)),
tf.keras.layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wct2)),
tf.keras.layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wct3)),
]
self.optimizer = tf.keras.optimizers.Adam()
#tf.function
def call(self, x):
with tf.GradientTape() as tape:
z = x
for layer in self.encoder:
z = layer(z)
mean = self.wm(z)
stdev = self.wv(z)
# epsilon = tf.random.normal(mean.shape)
z = mean + tf.exp(0.5 * stdev) * epsilon
y_pred = z
for layer in self.decoder:
y_pred = layer(y_pred)
reconstruction_loss = tf.reduce_mean(
tf.keras.losses.binary_crossentropy(x, y_pred)
)
reconstruction_loss *= 28 * 28
kl_loss = 1 + stdev - tf.square(mean) - tf.exp(stdev)
kl_loss = tf.reduce_mean(kl_loss)
kl_loss *= -0.5
loss = reconstruction_loss + kl_loss
gradients = tape.gradient(loss, self.trainable_weights)
self.optimizer.apply_gradients(zip(gradients, self.trainable_weights))
return loss, reconstruction_loss, kl_loss
model = VAE()
def train(x):
batch_quantity = math.floor(x.shape[0] / batch_size)
for i in range(epochs):
for bi in range(batch_quantity):
train_x = x[bi:bi + batch_size]
loss, reconstruction_loss, kl_loss = model(train_x)
print(
'Epoch:', i,
'Batch:', bi,
'Loss:', loss.numpy(),
'Reconstruction:', reconstruction_loss.numpy(),
'KL:', kl_loss.numpy()
)
train(x)
# display a 2D plot of the digit classes in the latent space
z = x
for layer in model.encoder:
z = layer(z)
mean = model.wm(z)
plt.figure(figsize=(12, 10))
plt.scatter(mean[:, 0], mean[:, 1], c=y)
plt.colorbar()
plt.xlabel("z[0]")
plt.ylabel("z[1]")
plt.savefig('def.png')
plt.close()
n = 30
d = 2.0
s = 28
grid_x = np.linspace(-d, d, n)
grid_y = np.linspace(d, -d, n)
image_width = s*n
image_height = image_width
image = np.zeros((image_height, image_width))
for row, x in enumerate(grid_x):
for col, y in enumerate(grid_y):
z = np.array([[x, y]])
for layer in model.decoder:
z = layer(z)
digit = tf.squeeze(z)
image[row * s: (row + 1) * s,
col * s: (col + 1) * s] = digit.numpy()
plt.figure(figsize=(15, 15))
plt.imshow(image, cmap='Greys_r')
plt.axis('Off')
plt.savefig('asd.png')
plt.close()

Keras, using a generator for the data (VAE)

I'm currently trying to implement a variational autoencoder but I'm quite stuck, I cannot understand how to use a datagenerator in Keras. What I have so far is:
import keras
import tensorflow as tf
from tensorflow.keras import layers
class Sampling(layers.Layer):
def call(self, inputs):
z_mean, z_log_var = inputs
batch = tf.shape(z_mean)[0]
dim = tf.shape(z_mean)[1]
epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
return z_mean + tf.exp(z_log_var / 2) * epsilon
class factor_vae(keras.Model):
def __init__(self):
super(factor_vae, self).__init__()
self.encoder = self.encoder_factor_vae()
self.decoder = self.decoder_factor_vae()
self.classifier = self.MLP_classifier()
def train_step(self, data):
data = data[0]
with tf.GradientTape() as tape:
z, z_mean, z_log_var = self.encoder(data)
reconstruction = self.decoder(z)
reconstruction_loss = tf.reduce_mean(
keras.losses.mse(data, reconstruction))
reconstruction_loss *= 4096 #denna kan ändras
kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
kl_loss = tf.reduce_mean(kl_loss)
kl_loss *= -0.5
total_loss = reconstruction_loss + (kl_loss)
grads = tape.gradient(total_loss, self.trainable_weights)
self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
return {
"loss": total_loss,
"reconstruction_loss": reconstruction_loss,
"kl_loss": kl_loss,
}
def encoder_factor_vae(self):
x_inp = Input(shape=(64, 64, 1))
z = layers.Conv2D(filters=32, kernel_size=(4, 4), activation="relu", strides=2, padding="same")(x_inp)
z = BatchNormalization()(z)
z = layers.Conv2D(filters=32, kernel_size=(4, 4), activation="relu", strides=2, padding="same")(z)
z = BatchNormalization()(z)
z = layers.Conv2D(filters=64, kernel_size=(4, 4), activation="relu", strides=2, padding="same")(z)
z = BatchNormalization()(z)
z = layers.Conv2D(filters=64, kernel_size=(4, 4), activation="relu", strides=2, padding="same")(z)
z = BatchNormalization()(z)
z = layers.Flatten()(z)
z = Dense(units=128, activation='relu')(z)
z = BatchNormalization()(z)
z_mean = Dense(units=10, activation='relu')(z) # här tror jag samplingen ska ske
z_log_var = Dense(units=10, activation='sigmoid')(z) # bör vara sampling från reparameterizationen
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(x_inp, [z, z_mean, z_log_var], name="encoder")
encoder.summary()
return encoder
def decoder_factor_vae(self):
z_inp = Input(shape=(10,))
x_rec = Dense(units=128, activation='relu')(z_inp)
x_rec = BatchNormalization()(x_rec)
x_rec = Dense(units=1024, activation='relu')(x_rec) #hit fungerar
x_rec = BatchNormalization()(x_rec)
x_rec = layers.Reshape((4, 4, 64))(x_rec)
x_rec = layers.Conv2DTranspose(filters=64, kernel_size=(4, 4), activation='relu', strides=2, padding='same')(
x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = layers.Conv2DTranspose(filters=32, kernel_size=(4, 4), activation='relu', strides=2, padding='same')(
x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = layers.Conv2DTranspose(filters=32, kernel_size=(4, 4), activation='relu', strides=2, padding='same')(
x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = layers.Conv2DTranspose(filters=1, kernel_size=(4, 4), strides=2, padding='same')(
x_rec)
decoder = keras.Model(z_inp, x_rec, name="decoder") # går att skicka in vilken batchsize som helst
decoder.summary()
return decoder
def MLP_classifier(self):
z_inp = Input(shape=(10,))
x_rec = Dense(units=1000)(z_inp) #1
x_rec = LeakyReLU(alpha=0.3)(x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = Dense(units=1000)(x_rec) #2
x_rec = LeakyReLU(alpha=0.3)(x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = Dense(units=1000)(x_rec) # 3
x_rec = LeakyReLU(alpha=0.3)(x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = Dense(units=1000)(x_rec) # 4
x_rec = LeakyReLU(alpha=0.3)(x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = Dense(units=1000)(x_rec) # 5
x_rec = LeakyReLU(alpha=0.3)(x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = Dense(units=2)(x_rec) # 6
classifier = keras.Model(z_inp, x_rec, name="clasifier")
return classifier
def generate_batches(data):
L = 50
start = 0
end = start + L
y_L_real = np.zeros((L, 2))
y_L_fake = np.zeros((L, 2))
y_L_real[:, 0] = 1
y_L_fake[:, 1] = 1
#total_y = np.vstack((y_L_real, y_L_fake))
while True:
x_L_real = data[start:end] #antalet värden är 2xL
x_L_fake = np.roll(x_L_real, shift=2, axis=0)
total_x = np.vstack((x_L_real, x_L_fake))
start += L
end += L
if start >= data.shape[0]:
start = 0
end = L
yield total_x, total_x
data = dsprite()
factor = factor_vae()
xyz = np.load("C:\\Users\\joaki\\OneDrive\\Skrivbord\\images\\dsprites_ndarray_"
"co1sh3sc6or40x32y32_64x64.npz")
test_data = xyz['imgs']
train_steps = 3000
steps_epoch = 300
factor.compile(optimizer=keras.optimizers.Adam(0.001))
train_generator = generate_batches(test_data)
factor.fit_generator(train_generator, steps_per_epoch=steps_epoch, epochs=50)
There is a lot of code, but it does work fine as long as I used my entire dataset, but as soon as I try to use my implemented "train_generator" it breaks down and I get the error message:
NotImplementedError: When subclassing the Model class, you should implement a call method. So I know there is something wrong with my implementation of the train_generator, but I dont understand what I've missed, can someone provide me more information?

Try reading this forum page, seems that you should call method in your class when subclassing:
https://github.com/tensorflow/tensorflow/issues/43173

Although all subclasses of keras.Model must implement call, it is missing in several examples of Keras (see here or here). Under certain conditions, the error 'When subclassing the Model class, you should implement a call method.' is thrown.
I encounter this problem when including a DataGenerator (subclassed from keras.utils.Sequence) and solved it by implementing call() like this:
Autoencoder
...
def call(self, inputs, training=None, mask=None):
z = self.encoder(inputs=inputs, training=training, mask=mask)
return self.decoder(z)
...
GAN
...
def call(self, inputs, training=None, mask=None):
batch_size = tf.shape(inputs)[0]
random_latent_vector = tf.random.normal(shape=(batch_size, self.latent_dim))
x = self.generator(inputs=random_latent_vector, training=training, mask=mask)
if len(x.shape) != len(inputs.shape):
raise Exception(f'Fake signal ({x.shape}) and real signal ({inputs.shape}) do not have same shape dimension')
return self.critic(inputs=x, training=training, mask=mask)
...
It seems to be a known problem (see here)

Why does my keras custom layer fit well on training data but gives bad results on validation?

I am trying to understand how Keras custom layers works, but I am facing a problem with the validation accuracy of my model.
I tried to reproduce a simple convolutional network on MNIST dataset but with a custom layer combining the Conv2D operator and the BatchNormalisation.
First, the data I used :
from keras.datasets import mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = np.array([x.reshape(28, 28, 1) for x in X_train])
X_test = np.array([x.reshape(28, 28, 1) for x in X_test])
y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test)
Here is the original implementation which works well :
def get_model():
input_ = Input(shape=(28, 28, 1))
x = Conv2D(filters=64, kernel_size=3, activation="relu", input_shape=(28,28,1))(input_)
x = BatchNormalization()(x)
x = MaxPool2D(pool_size=(2,2))(x)
x = Conv2D(filters=128, kernel_size=3, activation="relu")(input_)
x = BatchNormalization()(x)
x = MaxPool2D(pool_size=(2,2))(x)
x = Conv2D(filters=256, kernel_size=3, activation="relu")(input_)
x = BatchNormalization()(x)
x = MaxPool2D(pool_size=(2,2))(x)
x = Flatten()(x)
x = Dense(128, activation="relu")(x)
x = Dense(64, activation="relu")(x)
x = Dense(10, activation="softmax")(x)
mod = Model(inputs=input_, outputs=x)
return mod
optim = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, clipvalue=K.epsilon())
model = get_model()
model.compile(optimizer=optim, loss='categorical_crossentropy', metrics=["accuracy"])
model.fit(X_train, y_train, batch_size=128, epochs=3, validation_data=(X_test, y_test))
With this initial model, after 3 epochs, I get a train accuracy of 97% and validation 97%
And here is my custom layer :
class Conv2DLayer(Layer):
def __init__(self, filters, kernel_size, dropout_ratio=None, strides=(1, 1), activation="relu", use_bn=True, *args, **kwargs):
self._filters = filters
self._kernel_size = kernel_size
self._dropout_ratio = dropout_ratio
self._strides = strides
self.use_bn = use_bn
self._activation = activation
self._args = args
self._kwargs = kwargs
super(Conv2DLayer, self).__init__(*args, **kwargs)
def build(self, input_shape):
self.conv = Conv2D(self._filters,
kernel_size=self._kernel_size,
activation=self._activation,
strides=self._strides,
input_shape=input_shape,
*self._args,
**self._kwargs)
self.conv.build(input_shape)
self.out_conv_shape = self.conv.compute_output_shape(input_shape)
self._trainable_weights = self.conv._trainable_weights
self._non_trainable_weights = self.conv._non_trainable_weights
if self.use_bn:
self.bn = BatchNormalization()
self.bn.build(self.out_conv_shape)
self._trainable_weights.extend(self.bn._trainable_weights)
self._non_trainable_weights.extend(self.bn._non_trainable_weights)
if self._dropout_ratio is not None:
self.dropout = Dropout(rate=self._dropout_ratio)
self.dropout.build(self.out_conv_shape)
self._trainable_weights.extend(self.dropout._trainable_weights)
self._non_trainable_weights.extend(self.dropout._non_trainable_weights)
super(Conv2DLayer, self).build(input_shape)
def call(self, inputs):
x = self.conv(inputs)
if self.use_bn:
x = self.bn(x)
if self._dropout_ratio is not None:
x = self.dropout(x)
return x
def compute_output_shape(self, input_shape):
return self.out_conv_shape
Finally, here is the modified model :
def get_model():
input_ = Input(shape=(28, 28, 1))
x = Conv2DLayer(filters=64, kernel_size=3, activation="relu")(input_)
x = MaxPool2D(pool_size=(2,2))(x)
x = Conv2DLayer(filters=128, kernel_size=3, activation="relu")(input_)
x = MaxPool2D(pool_size=(2,2))(x)
x = Conv2DLayer(filters=256, kernel_size=3, activation="relu")(input_)
x = MaxPool2D(pool_size=(2,2))(x)
x = Flatten()(x)
x = Dense(128, activation="relu")(x)
x = Dense(64, activation="relu")(x)
x = Dense(10, activation="softmax")(x)
mod = Model(inputs=input_, outputs=x)
return mod
For this model with custom layer, I managed to get the same train accuracy (97%), but the validation accuracy get stuck around 50%.
EDIT
Thanks to Matias Valdenegro, I achieved to solve the problem by modifying the call method :
def call(self, inputs):
training = K.learning_phase()
x = self.conv(inputs)
if self.use_bn:
x = self.bn(x, training=training)
if self._dropout_ratio is not None:
x = self.dropout(x, training=training)
return x
With K the keras.backend module.

Both Dropout and Batch Normalization behave differently during training and testing/inference, and your layer does not have any of that behavior, so its using those inner layers as training mode during inference, producing incorrect results.
I am not sure but I think you can fix this by passing the training parameter in the call function call through the layers, something like:
def call(self, inputs, training=None):
x = self.conv(inputs)
if self.use_bn:
x = self.bn(x, training=training)
if self._dropout_ratio is not None:
x = self.dropout(x, training=training)
return x
This should make the inner layers work differently during train and testing/inference phases.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Varational autoencoder using tf.GradientTape - python

Related

3D - Variational Autoencoder implementation Error

Adversarial Autoencoder is not working and not learning properly

Differences between model results

Keras, using a generator for the data (VAE)

Why does my keras custom layer fit well on training data but gives bad results on validation?

Categories

Resources