Variational Auto Encoder produces the same picture as the input - python

I'm trying to train a CVAE, convolutional variational auto encoder, to generate new pictures of human faces.
I'm using the same loss function, training step function, generating function etc. as in the official TensorFlow CVAE Tutorial. I'll post them below.
The only thing I've changed is the encoder and decoder network and the input and output shapes (because I want 128x128 RGB pictures, while the tutorial deals with 28x28 MNIST pictures).
The problem I encounter is, that every picture generated (by generate_and_save_images()) is not a new one (variational), but just a picture that exists 1 by 1 in the dataset. (My training dataset has 518 pictures of faces)
So the generating of new pictures doesn't work, we just have the same pictures 1 by 1 recreated.
Why? And how to fix it?
#tf.function
def sample(self, eps=None):
if eps is None:
eps = tf.random.normal(shape=(100, self.latent_dim))
return self.decode(eps, apply_sigmoid=True)
def encode(self, x):
mean, logvar = tf.split(self.encoder(x), num_or_size_splits=2, axis=1)
return mean, logvar
def reparameterize(self, mean, logvar):
eps = tf.random.normal(shape=mean.shape)
return eps * tf.exp(logvar * .5) + mean
def decode(self, z, apply_sigmoid=False):
logits = self.decoder(z)
if apply_sigmoid:
probs = tf.sigmoid(logits)
return probs
return logits
def log_normal_pdf(sample, mean, logvar, raxis=1):
log2pi = tf.math.log(2. * np.pi)
return tf.reduce_sum(
-.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi),
axis=raxis)
def compute_loss(model, x):
mean, logvar = model.encode(x)
z = model.reparameterize(mean, logvar)
x_logit = model.decode(z)
cross_ent = tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=x)
logpx_z = -tf.reduce_sum(cross_ent, axis=[1, 2, 3])
logpz = log_normal_pdf(z, 0., 0.)
logqz_x = log_normal_pdf(z, mean, logvar)
return -tf.reduce_mean(logpx_z + logpz - logqz_x)
#tf.function
def train_step(model, x, optimizer):
"""Executes one training step and returns the loss.
This function computes the loss and gradients, and uses the latter to
update the model's parameters.
"""
with tf.GradientTape() as tape:
loss = compute_loss(model, x)
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
# keeping the random vector constant for generation (prediction) so
# it will be easier to see the improvement.
random_vector_for_generation = tf.random.normal(
shape=[num_examples_to_generate, latent_dim])
def generate_and_save_images(model, epoch, test_sample):
mean, logvar = model.encode(test_sample)
z = model.reparameterize(mean, logvar)
predictions = model.sample(z)
_ = plt.figure(figsize=(20, 20))
for i in range(predictions.shape[0]):
plt.subplot(4, 4, i + 1)
plt.imshow(predictions[i, :, :, :])
plt.axis('off')
# tight_layout minimizes the overlap between 2 sub-plots
plt.savefig(r'D:\...\image_at_epoch_{:04d}.png'.format(epoch))
plt.show()

Related

The parameters of the model with custom loss function doesn’t upgraded thorough its learning over epochs

Thank you for reading my post.
I’m currently developing the peak detection algorithm using CNN to determine the ideal convolution kernel which is representable as the ideal mother wavelet function that will maximize the peak detection accuracy.
To begin with, I created my own IoU loss function and the simple model and tried to run the learning. The execution itself worked without any errors, but somehow it failed.
The parameters of the model with custom loss function doesn't upgraded thorough its learning over epochs
My own loss function is described as below.
def IoU(inputs: torch.Tensor, labels: torch.Tensor,
smooth: float=0.1, threshold: float = 0.5, alpha: float = 1.0):
'''
- alpha: a parameter that sharpen the thresholding.
if alpha = 1 -> thresholded input is the same as raw input.
'''
thresholded_inputs = inputs**alpha / (inputs**alpha + (1 - inputs)**alpha)
inputs = torch.where(thresholded_inputs < threshold, 0, 1)
batch_size = inputs.shape[0]
intersect_tensor = (inputs * labels).view(batch_size, -1)
intersect = intersect_tensor.sum(-1)
union_tensor = torch.max(inputs, labels).view(batch_size, -1)
union = union_tensor.sum(-1)
iou = (intersect + smooth) / (union + smooth) # We smooth our devision to avoid 0/0
iou_score = iou.mean()
return 1- iou_score
and my training model is,
class MLP(nn.Module):
def __init__(self):
super().__init__()
self.net = nn.Sequential(
nn.Conv1d(1, 1, kernel_size=32, stride=1, padding=16),
nn.Linear(257, 256),
nn.LogSoftmax(1)
)
def forward(self, x):
return self.net(x)
model = MLP()
opt = optim.Adadelta(model.parameters())
# initialization of the kernel of Conv1d
def init_kernel(m):
if type(m) == nn.Conv1d:
nn.init.kaiming_normal_(m.weight)
print(m.weight)
plt.plot(m.weight[0][0].detach().numpy())
model.apply(init_kernel)
def step(x, y, is_train=True):
opt.zero_grad()
y_pred = model(x)
y_pred = y_pred.reshape(-1, 256)
loss = IoU(y_pred, y)
loss.requires_grad = True
loss.retain_grad = True
if is_train:
loss.backward()
opt.step()
return loss, y_pred
and lastly, the execution code is,
from torch.autograd.grad_mode import F
train_loss_arr, val_loss_arr = [], []
valbose = 10
epochs = 200
for e in range(epochs):
train_loss, val_loss, acc = 0., 0., 0.,
for x, y in train_set.as_numpy_iterator():
x = torch.from_numpy(x)
y = torch.from_numpy(y)
model.train()
loss, y_pred = step(x, y, is_train=True)
train_loss += loss.item()
train_loss /= len(train_set)
for x, y ,in val_set.as_numpy_iterator():
x = torch.from_numpy(x)
y = torch.from_numpy(y)
model.eval()
with torch.no_grad():
loss, y_pred = step(x, y, is_train=False)
val_loss += loss.item()
val_loss /= len(val_set)
train_loss_arr.append(train_loss)
val_loss_arr.append(val_loss)
# visualize current kernel to check whether the learning is on progress safely.
if e % valbose == 0:
print(f"Epoch[{e}]({(e*100/epochs):0.2f}%): train_loss: {train_loss:0.4f}, val_loss: {val_loss:0.4f}")
fig, axs = plt.subplots(1, 4, figsize=(12, 4))
print(y_pred[0], y_pred[0].shape)
axs[0].plot(x[0][0])
axs[0].set_title("spectra")
axs[1].plot(y_pred[0])
axs[1].set_title("y pred")
axs[2].plot(y[0])
axs[2].set_title("y true")
axs[3].plot(model.state_dict()["net.0.weight"][0][0].numpy())
axs[3].set_title("kernel1")
plt.show()
with these programs, I tried to evaluate this simple model, however, model parameters didn't change at all over epochs.
Visualization of the results at epoch 0 and 30.
epoch 0:
prediction and kernel at epoch0
epoch 30:
prediction and kernel at epoch30
As you can see, the kernel has not be modified through its learning over epochs.
I took a survey to figure out what causes this problem for hours but I'm still not sure how to fix my loss function and model into trainable ones.
Thank you.
Try printing the gradient after loss.backward() with:
y_pred.grad()
I suspect what you'll find is that after a backward pass, the gradient of y_pred is zero. This means that either a.) gradient is not enabled for one or more of the variables at which the computation graph has a node, or b.) (more likely) you are using an operation which is not differentiable.
In your case, at a minimum torch.where is non-differentiable, so you'll need to replace that. Thersholding operations are non-differentiable and are generally replaced with "soft" thresholding operations (see Softmax instead of max function for classification) so that gradient computation still works. Try replacing this with a soft threshold or no threshold at all.

why I got Wrong training result with changing forward function in VAE?

I have a VAE model with the below forward function and loss function:
def forward(self, x):
mu, logvar = self.encode(x.view(-1, 2))
z = self.reparameterize(mu, logvar)
return self.decode(z), mu, logvar
def loss_function(recon_x, x, mu, logvar):
L2 = torch.mean((recon_x-x)**2)
KLD = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
return L2 + KLD
the result of training for the multivariate gaussian dataset is similar to the photo below:
To do something, I changed the forward and loss function as follows
def forward(self, x):
mu, logvar = self.encode(x.view(-1, 2))
z = self.reparameterize(mu, logvar)
q0 = torch.distributions.normal.Normal(mu, (0.5 * logvar).exp())
prior = torch.distributions.normal.Normal(0., 1.)
log_prior_z = prior.log_prob(z).sum(-1)
log_q_z = q0.log_prob(z).sum(-1)
return self.decode(z), log_q_z - log_prior_z
def loss_function(recon_x, x, KL):
L2 = torch.mean((recon_x-x)**2)
return L2 + KL
but the training result is as follows
How do I make these changes to get the same training result?

Error when using run_eagerly=False in model.compile custom Keras Model in Tensorflow

I am developing a custom model in Tensorflow. I am trying to implement a Virtual Adversarial Training (VAT) model from https://arxiv.org/abs/1704.03976. The model makes use of both labeled and unlabeled data in its classification task. Therefore, in the train_step of the model, I need to divide the data of the batch into labeled (0, or 1), or unlabeled (-1). It seems to work as expected when compiling the model using run_eagerly=True, but when I use run_eagerly=False, it gives me the following error:
ValueError: Number of mask dimensions must be specified, even if some dimensions are None. E.g. shape=[None] is ok, but shape=None is not.
which seems to be produced in:
X_l, y_l = tf.boolean_mask(X, tf.logical_not(missing)), tf.boolean_mask(y, tf.logical_not(missing))
I am not sure what is causing the error, but it seems to have something to do with a weird tensor shape issues that only occur during run_eagerly=False. I need the boolean_mask functionality in order to distinguish the labeled and unlabeled data. I hope someone can help me out. In order to reproduce the errors, I added the model, and a small simulation example. The simulation will produce the error I have, when run_eagerly=False is set.
Thanks in advance.
Model defintion:
from tensorflow import keras
import tensorflow as tf
metric_acc = keras.metrics.BinaryAccuracy()
metric_loss = keras.metrics.Mean('loss')
class VAT(keras.Model):
def __init__(self, units_1=16, units_2=16, dropout=0.3, xi=1e-6, epsilon=2.0, alpha=1.0):
super(VAT, self).__init__()
# Set model parameters
self.units_1 = units_1
self.units_2 = units_2
self.dropout = dropout
self.xi = xi
self.epsilon = epsilon
self.alpha = alpha
# First hidden
self.dense1 = keras.layers.Dense(self.units_1)
self.activation1 = keras.layers.Activation(tf.nn.leaky_relu)
self.dropout1 = keras.layers.Dropout(self.dropout)
# Second hidden
self.dense2 = keras.layers.Dense(self.units_2)
self.activation2 = keras.layers.Activation(tf.nn.leaky_relu)
self.dropout2 = keras.layers.Dropout(self.dropout)
# Output layer
self.dense3 = keras.layers.Dense(1)
self.activation3 = keras.layers.Activation("sigmoid")
def call(self, inputs, training=None, mask=None):
x1 = self.dense1(inputs)
x2 = self.activation1(x1)
x3 = self.dropout1(x2, training=True)
x4 = self.dense2(x3)
x5 = self.activation2(x4)
x6 = self.dropout2(x5, training=True)
x7 = self.dense3(x6)
x8 = self.activation3(x7)
return x8
def generate_perturbation(self, inputs):
# Generate normal vectors
d = tf.random.normal(shape=tf.shape(inputs))
# Normalize vectors
d = tf.math.l2_normalize(d, axis=1)
# Calculate r
r = self.xi * d
# Make predictions
p = self(inputs, training=True)
# Tape gradient
with tf.GradientTape() as tape:
tape.watch(r)
# Perturbed predictions
p_perturbed = self(inputs + r, training=True)
# Calculate divergence
D = keras.losses.KLD(p, p_perturbed) + keras.losses.KLD(1 - p, 1 - p_perturbed)
# Calculate gradient
gradient = tape.gradient(D, r)
# Calculate r_vadv
r_vadv = tf.math.l2_normalize(gradient, axis=1)
# Return virtual adversarial perturbation
return r_vadv
#tf.function
def train_step(self, data):
# Unpack data
X, y = data
# Missing label boolean indices
missing = tf.squeeze(tf.equal(y, -1))
# Split data into labeled and unlabeled data
X_l, y_l = tf.boolean_mask(X, tf.logical_not(missing)), tf.boolean_mask(y, tf.logical_not(missing))
X_u = tf.boolean_mask(X, missing)
# Calculate virtual perturbations for labeled and unlabeled
r_l = self.generate_perturbation(X_l)
r_u = self.generate_perturbation(X_u)
# Tape gradient
with tf.GradientTape() as model_tape:
model_tape.watch(self.trainable_variables)
# Calculate probabilities real data
prob_l, prob_u = self(X_l, training=True), self(X_u, training=True)
# Calculate probabilities perturbed data
prob_r_l, prob_r_u = self(X_l + self.epsilon * r_l, training=True), self(X_u + self.epsilon * r_u, training=True)
# Calculate loss
loss = vat_loss(y_l, prob_l, prob_u, prob_r_l, prob_r_u, self.alpha)
# Calculate gradient
model_gradient = model_tape.gradient(loss, self.trainable_variables)
# Update weights
self.optimizer.apply_gradients(zip(model_gradient, self.trainable_variables))
# Compute metrics
metric_acc.update_state(y_l, prob_l)
metric_loss.update_state(loss)
return {'loss': metric_loss.result(), 'accuracy': metric_acc.result()}
#property
def metrics(self):
return [metric_loss, metric_acc]
def vat_loss(y_l, prob_l, prob_u, prob_r_l, prob_r_u, alpha):
N_l = tf.cast(tf.size(prob_l), dtype=tf.dtypes.float32)
N_u = tf.cast(tf.size(prob_u), dtype=tf.dtypes.float32)
if tf.equal(N_l, 0):
# No labeled examples: get contribution from unlabeled data using perturbations
R_vadv = tf.reduce_sum(
keras.losses.KLD(prob_u, prob_r_u)
+ keras.losses.KLD(1 - prob_u, 1 - prob_r_u)
)
return alpha * R_vadv / N_u
elif tf.equal(N_u, 0):
# No unlabeled examples: get contribution from labeled data
R = tf.reduce_sum(keras.losses.binary_crossentropy(y_l, prob_l))
R_vadv = tf.reduce_sum(
keras.losses.KLD(prob_l, prob_r_l)
+ keras.losses.KLD(1 - prob_l, 1 - prob_r_l)
)
return R / N_l + alpha * R_vadv / N_l
else:
# Get contribution from labeled data
R = tf.reduce_sum(keras.losses.binary_crossentropy(y_l, prob_l))
# Get contribution from labeled and unlabeled data using perturbations
R_vadv = tf.reduce_sum(
keras.losses.KLD(prob_l, prob_r_l)
+ keras.losses.KLD(1 - prob_l, 1 - prob_r_l)
) + tf.reduce_sum(
keras.losses.KLD(prob_u, prob_r_u)
+ keras.losses.KLD(1 - prob_u, 1 - prob_r_u)
)
return R / N_l + alpha * R_vadv / (N_l + N_u)
Simulation example:
To show that the model/code works as desired (when using run_eagerly=True, I made a simulation example. In this example, I bias when observations are labeled/unlabeled. The figure below illustrates the labeled observations used by the model (yellow or purple), and the unlabeled observations (blue).
The VAT produces an accuracy of around ~0.75, whereas the reference model produces an accuracy of around ~0.58. These accuracies are produced without hyperparameter tuning.
from modules.vat import VAT
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
def create_biased_sample(x, proportion_labeled):
labeled = np.random.choice([True, False], p=[proportion_labeled, 1-proportion_labeled])
if x[0] < 0.0:
return False
elif x[0] > 1.0:
return False
else:
return labeled
# Simulation parameters
N = 2000
proportion_labeled = 0.15
# Model training parameters
BATCH_SIZE = 128
BUFFER_SIZE = 60000
EPOCHS = 100
# Generate a dataset
X, y = datasets.make_moons(n_samples=N, noise=.05, random_state=3)
X, y = X.astype('float32'), y.astype('float32')
y = y.reshape(-1, 1)
# Split in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
# Simulate missing labels
sample_biased = lambda x: create_biased_sample(x, proportion_labeled)
labeled = np.array([sample_biased(k) for k in X_train])
y_train[~ labeled] = -1
# Estimate VAT model
vat = VAT(dropout=0.2, units_1=16, units_2=16, epsilon=0.5)
vat.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), run_eagerly=True)
vat.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, shuffle=True)
# Estimate a reference model
reference = keras.models.Sequential([
keras.layers.Input(shape=(2,)),
keras.layers.Dense(16),
keras.layers.Activation(tf.nn.leaky_relu),
keras.layers.Dropout(0.2),
keras.layers.Dense(16),
keras.layers.Activation(tf.nn.leaky_relu),
keras.layers.Dropout(0.2),
keras.layers.Dense(1),
keras.layers.Activation("sigmoid")
])
reference.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01), loss=keras.losses.binary_crossentropy, run_eagerly=False)
reference.fit(X_train[y_train.flatten() != -1, :], y_train[y_train.flatten() != -1], batch_size=BATCH_SIZE, epochs=EPOCHS, shuffle=True)
# Calculate out-of-sample accuracies
test_acc_vat = tf.reduce_mean(keras.metrics.binary_accuracy(y_test, vat(X_test, training=False)))
test_acc_reference = tf.reduce_mean(keras.metrics.binary_accuracy(y_test, reference(X_test, training=False)))
# Print results
print('Test accuracy of VAT: {}'.format(test_acc_vat))
print('Test accuracy of reference model: {}'.format(test_acc_reference))
# Plot scatter
plt.scatter(X_test[:, 0], X_test[:, 1])
plt.scatter(X_train[y_train.flatten() != -1, 0], X_train[y_train.flatten() != -1, 1], c=y_train.flatten()[y_train.flatten() != -1])
For anyone who is interested, I solved the issue by adding the following in the train_step() method:
missing.set_shape([None])
It should be just after declaring the tensor missing. I solved this using this thread: Tensorflow boolean_mask with dynamic mask.

Beta-Variational AutoEncoder can't disentangle

I am working on a dummy example with generated heartbeats, and want to first use a VAE to encode the heartbeats and afterwards a simple classifier.
Problem is when i increase the beta above 0.01, the reconstructions become nonsense (see the first image).
And when the beta is low i get a normal autoencoder output with no disentanglement (second image).
I believe the problem may be in my KL divergence or VAE loss function, but i can't seem to find it.
In my encoder i do the reparameterization as such:
enc = self.encoder(x,batch_size, x_lenghts)
mu = self.enc2mean(enc)
logv = self.enc2logv(enc)
std = torch.exp(0.5*logv)
z = torch.randn([batch_size,1, self.encoder_hidden_sizes[-1] * (int(self.bidirectional)+1)]).to(self.device)
z = z * std + mu
And i define the VAE loss as:
def VAE_loss(x, reconstruction, mu, logvar, batch_size, latent_dim, beta=0):
mse = F.mse_loss(x, reconstruction)
KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
KLD /= (batch_size * latent_dim)
return mse + beta*KLD
Full standalone code to reproduce the results is here.
Any insights are appreciated!

Implementing back propagation using numpy and python for cleveland dataset

I wanted to predict heart disease using backpropagation algorithm for neural networks. For this I used UCI heart disease data set linked here: processed cleveland. To do this, I used the cde found on the following blog: Build a flexible Neural Network with Backpropagation in Python and changed it little bit according to my own dataset. My code is as follows:
import numpy as np
import csv
reader = csv.reader(open("cleveland_data.csv"), delimiter=",")
x = list(reader)
result = np.array(x).astype("float")
X = result[:, :13]
y0 = result[:, 13]
y1 = np.array([y0])
y = y1.T
# scale units
X = X / np.amax(X, axis=0) # maximum of X array
class Neural_Network(object):
def __init__(self):
# parameters
self.inputSize = 13
self.outputSize = 1
self.hiddenSize = 13
# weights
self.W1 = np.random.randn(self.inputSize, self.hiddenSize)
self.W2 = np.random.randn(self.hiddenSize, self.outputSize)
def forward(self, X):
# forward propagation through our network
self.z = np.dot(X, self.W1)
self.z2 = self.sigmoid(self.z) # activation function
self.z3 = np.dot(self.z2, self.W2)
o = self.sigmoid(self.z3) # final activation function
return o
def sigmoid(self, s):
# activation function
return 1 / (1 + np.exp(-s))
def sigmoidPrime(self, s):
# derivative of sigmoid
return s * (1 - s)
def backward(self, X, y, o):
# backward propgate through the network
self.o_error = y - o # error in output
self.o_delta = self.o_error * self.sigmoidPrime(o) # applying derivative of sigmoid to error
self.z2_error = self.o_delta.dot(
self.W2.T) # z2 error: how much our hidden layer weights contributed to output error
self.z2_delta = self.z2_error * self.sigmoidPrime(self.z2) # applying derivative of sigmoid to z2 error
self.W1 += X.T.dot(self.z2_delta) # adjusting first set (input --> hidden) weights
self.W2 += self.z2.T.dot(self.o_delta) # adjusting second set (hidden --> output) weights
def train(self, X, y):
o = self.forward(X)
self.backward(X, y, o)
NN = Neural_Network()
for i in range(100): # trains the NN 100 times
print("Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n" + str(NN.forward(X)))
print("Loss: \n" + str(np.mean(np.square(y - NN.forward(X))))) # mean sum squared loss
print("\n")
NN.train(X, y)
But when I run this code, my all predicted outputs become = 1 after few iterations and then stays the same for up to all 100 iterations. what is the problem in the code?
Few mistakes that I've noticed:
The output of your network is a sigmoid, i.e. a value between [0, 1] -- suits for predicting probabilities. But the target seems to be a value between [0, 4]. This explains the desire of the network to maximize the output to get as close as possible to large labels. But it can't go more than 1.0 and gets stuck.
You should either get rid of the final sigmoid or pre-process the label and scale it to [0, 1]. Both options will make it learn better.
You don't use the learning rate (effectively setting it to 1.0), which is probably a bit high, so it's possible for the NN to diverge. My experiments showed that 0.01 is a good learning rate, but you can play around with that.
Other than this, your backprop seems working right.

Categories