Differences between model results - python

I've been experimenting with VAEs for weeks now, trying to reproduce the results of a tutorial's code, without success. The only difference between my code and the tutorial's is that the tutorial compiles the model (demonstrated below) and runs fit on it, whereas I run each layer manually. To make the comparison fair, I fixed the weight initialization (and the epsilons) with NumPy arrays saved to .npy files and ran both models. The results were different (the tutorial's model gives much smoother results).
As far as I understand neural networks, given the same training data in the same order, the same layers, the same initial weights, the same optimizer, and the same hyperparameters, the results should be identical every time.
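Even with fixed initial weights, any remaining randomness (shuffling, dropout, certain GPU kernels) has to be pinned down before run-to-run comparisons mean much. A minimal sketch, assuming TF 2.7+ for set_random_seed and TF 2.8+ for enable_op_determinism:
import tensorflow as tf

# Seed the Python, NumPy and TensorFlow RNGs in one call (TF >= 2.7).
tf.keras.utils.set_random_seed(42)
# Force deterministic (often slower) GPU kernels (TF >= 2.8).
tf.config.experimental.enable_op_determinism()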
I've made three simple examples: MNIST data with two dense layers of sizes 16 and 10. Weight initialization:
import numpy as np
w1 = np.random.uniform(-1, 1, size=(784, 16))
w2 = np.random.uniform(-1, 1, size=(16, 10))
np.save('w1', w1)
np.save('w2', w2)
First version (manually executing layers):
import numpy as np
import tensorflow as tf
import math

w1 = np.load('w1.npy')
w2 = np.load('w2.npy')

class Model(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.flat = tf.keras.layers.Flatten()
        self.w1 = tf.keras.layers.Dense(
            units=16,
            activation='relu',
            use_bias=False,
            kernel_initializer=tf.keras.initializers.constant(w1)
        )
        self.w2 = tf.keras.layers.Dense(
            units=10,
            activation='sigmoid',
            use_bias=False,
            kernel_initializer=tf.keras.initializers.constant(w2)
        )
        self.optimizer = tf.keras.optimizers.Adam()

    # tf.function
    def call(self, x, y):
        with tf.GradientTape() as tape:
            x = self.flat(x)
            a = self.w1(x)
            y_hat = self.w2(a)
            loss = tf.keras.losses.sparse_categorical_crossentropy(y, y_hat)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return loss

def train(x, y, epochs, batch_size):
    batch_quantity = math.floor(x.shape[0] / batch_size)
    for e in range(epochs):
        for b in range(batch_quantity):
            train_x = x[b:b + batch_size]
            train_y = y[b:b + batch_size]
            loss = model(train_x, train_y)
            print(
                'Epoch:', e,
                'Batch:', b,
                'Loss:', loss.numpy().sum(),
            )

model = Model()
(x, y), _ = tf.keras.datasets.mnist.load_data()
epochs = 30
batch_size = 100
train(x, y, epochs, batch_size)
This one gives the exact same results no matter how many times you run it. As expected.
Second version (layers wrapped in a Model; gradients and the optimizer step handled manually in train_step; trained with compile and fit):
import numpy as np
import tensorflow as tf

w1 = np.load('w1.npy')
w2 = np.load('w2.npy')

input = tf.keras.Input(shape=(28, 28))
x = tf.keras.layers.Flatten()(input)
a = tf.keras.layers.Dense(
    units=16,
    activation='relu',
    use_bias=False,
    kernel_initializer=tf.keras.initializers.constant(w1)
)(x)
y_hat = tf.keras.layers.Dense(
    units=10,
    activation='sigmoid',
    use_bias=False,
    kernel_initializer=tf.keras.initializers.constant(w2)
)(a)
model = tf.keras.Model(input, y_hat, name='model')
model.summary()

class Model(tf.keras.Model):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def train_step(self, data):
        x, y = data[0]
        with tf.GradientTape() as tape:
            y_hat = self.model(x)
            loss = tf.keras.losses.sparse_categorical_crossentropy(y, y_hat)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return {
            'loss': tf.reduce_sum(loss)
        }

model = Model(model)
train_data, _ = tf.keras.datasets.mnist.load_data()
epochs = 30
batch_size = 100
model.compile(optimizer=tf.keras.optimizers.Adam())
model.fit(train_data, epochs=epochs, batch_size=batch_size, shuffle=False)
Third version (layers in a Sequential model, trained with compile and fit):
import numpy as np
import tensorflow as tf

w1 = np.load('w1.npy')
w2 = np.load('w2.npy')

model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(
        units=16,
        activation='relu',
        use_bias=False,
        kernel_initializer=tf.keras.initializers.constant(w1)
    ),
    tf.keras.layers.Dense(
        units=10,
        activation='sigmoid',
        use_bias=False,
        kernel_initializer=tf.keras.initializers.constant(w2)
    )
])

(x, y), _ = tf.keras.datasets.mnist.load_data()
batch_size = 100
epochs = 30
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.SUM),
    optimizer=tf.keras.optimizers.Adam()
)
model.fit(
    x, y, batch_size=batch_size, epochs=epochs, shuffle=False
)
The second and third versions give almost exactly the same results, but different from the first. I can't understand that, since the optimizer, the loss function, the loss reduction, the initial weights, and the training data are all exactly the same.
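A minimal debugging sketch (illustrative code, with the layer setup assumed from the examples above): build the same two-layer model twice from the saved weights, push one identical batch through both loss variants, and compare the gradients before any optimizer state accumulates. It is also worth checking that both paths really see the same batches; as written, the manual loop slices x[b:b + batch_size], which advances by one sample per step rather than by batch_size.
import numpy as np
import tensorflow as tf

w1 = np.load('w1.npy')
w2 = np.load('w2.npy')
(x, y), _ = tf.keras.datasets.mnist.load_data()
xb, yb = x[:100].astype('float32'), y[:100]   # one batch, cast the way fit() would

def build():
    return tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(16, activation='relu', use_bias=False,
                              kernel_initializer=tf.keras.initializers.constant(w1)),
        tf.keras.layers.Dense(10, activation='sigmoid', use_bias=False,
                              kernel_initializer=tf.keras.initializers.constant(w2)),
    ])

def first_step_gradients(model, reduce_sum):
    with tf.GradientTape() as tape:
        loss = tf.keras.losses.sparse_categorical_crossentropy(yb, model(xb))
        if reduce_sum:
            loss = tf.reduce_sum(loss)            # mimic the Reduction.SUM path
    return tape.gradient(loss, model.trainable_variables)

grads_a = first_step_gradients(build(), reduce_sum=False)  # per-sample loss, as in version 1
grads_b = first_step_gradients(build(), reduce_sum=True)   # summed loss, as in version 3
for ga, gb in zip(grads_a, grads_b):
    print(np.abs(ga.numpy() - gb.numpy()).max())           # ~0 if the loss paths match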
If you're curious, the VAE examples and their differences can be tested as well.
Initialize weights:
import numpy as np
epsilon = np.random.normal(size=(100, 2))
wc1 = np.random.uniform(-0.022, 0.022, size=(3,3,1,32))
wc2 = np.random.uniform(-0.022, 0.022, size=(3,3,32,64))
wd1 = np.random.uniform(-0.022, 0.022, size=(3136,16))
wm = np.random.uniform(-0.022, 0.022, size=(16,2))
ws = np.random.uniform(-0.022, 0.022, size=(16,2))
wd2 = np.random.uniform(-0.022, 0.022, size=(2,3136))
wct1 = np.random.uniform(-0.022, 0.022, size=(3,3,64,64))
wct2 = np.random.uniform(-0.022, 0.022, size=(3,3,32,64))
wct3 = np.random.uniform(-0.022, 0.022, size=(3,3,1,32))
np.save('epsilon', epsilon)
np.save('wc1', wc1)
np.save('wc2', wc2)
np.save('wd1', wd1)
np.save('wm', wm)
np.save('ws', ws)
np.save('wd2', wd2)
np.save('wct1', wct1)
np.save('wct2', wct2)
np.save('wct3', wct3)
Tutorial's VAE:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
wc1 = np.load('wc1.npy')
wc2 = np.load('wc2.npy')
wd1 = np.load('wd1.npy')
wm = np.load('wm.npy')
ws = np.load('ws.npy')
wd2 = np.load('wd2.npy')
wct1 = np.load('wct1.npy')
wct2 = np.load('wct2.npy')
wct3 = np.load('wct3.npy')
epsilon = np.load('epsilon.npy')
class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""

    def call(self, inputs):
        z_mean, z_log_var = inputs
        # batch = tf.shape(z_mean)[0]
        # dim = tf.shape(z_mean)[1]
        # epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon
latent_dim = 2
encoder_inputs = keras.Input(shape=(28, 28, 1))
x = layers.Conv2D(32, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wc1))(encoder_inputs)
x = layers.Conv2D(64, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wc2))(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wd1))(x)
z_mean = layers.Dense(latent_dim, name="z_mean", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wm))(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var", use_bias=False, kernel_initializer=tf.keras.initializers.constant(ws))(x)
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
encoder.summary()
latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(7 * 7 * 64, activation="relu", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wd2))(latent_inputs)
x = layers.Reshape((7, 7, 64))(x)
x = layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wct1))(x)
x = layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wct2))(x)
decoder_outputs = layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wct3))(x)
decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
decoder.summary()
class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def train_step(self, data):
        if isinstance(data, tuple):
            data = data[0]
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = encoder(data)
            reconstruction = decoder(z)
            reconstruction_loss = tf.reduce_mean(
                keras.losses.binary_crossentropy(data, reconstruction)
            )
            reconstruction_loss *= 28 * 28
            kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
            kl_loss = tf.reduce_mean(kl_loss)
            kl_loss *= -0.5
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {
            'mean': z_mean,
            "loss": total_loss,
            "reconstruction_loss": reconstruction_loss,
            "kl_loss": kl_loss,
        }
(x_train, _), (x_test, _) = keras.datasets.mnist.load_data()
mnist_digits = x_train
mnist_digits = np.expand_dims(mnist_digits, -1).astype("float32") / 255
epochs=30
batch_size=100
vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())
vae.fit(mnist_digits, epochs=epochs, batch_size=batch_size)
import matplotlib.pyplot as plt
def plot_latent(encoder, decoder):
    # display a n*n 2D manifold of digits
    n = 30
    digit_size = 28
    scale = 2.0
    figsize = 15
    figure = np.zeros((digit_size * n, digit_size * n))
    # linearly spaced coordinates corresponding to the 2D plot
    # of digit classes in the latent space
    grid_x = np.linspace(-scale, scale, n)
    grid_y = np.linspace(-scale, scale, n)[::-1]
    for i, yi in enumerate(grid_y):
        for j, xi in enumerate(grid_x):
            z_sample = np.array([[xi, yi]])
            x_decoded = decoder.predict(z_sample)
            digit = x_decoded[0].reshape(digit_size, digit_size)
            figure[
                i * digit_size : (i + 1) * digit_size,
                j * digit_size : (j + 1) * digit_size,
            ] = digit
    plt.figure(figsize=(figsize, figsize))
    start_range = digit_size // 2
    end_range = n * digit_size + start_range + 1
    pixel_range = np.arange(start_range, end_range, digit_size)
    sample_range_x = np.round(grid_x, 1)
    sample_range_y = np.round(grid_y, 1)
    plt.xticks(pixel_range, sample_range_x)
    plt.yticks(pixel_range, sample_range_y)
    plt.xlabel("z[0]")
    plt.ylabel("z[1]")
    plt.imshow(figure, cmap="Greys_r")
    plt.savefig('trg.png')
    plt.close()
plot_latent(encoder, decoder)
def plot_label_clusters(encoder, decoder, data, labels):
    # display a 2D plot of the digit classes in the latent space
    z_mean, _, _ = encoder.predict(data)
    plt.figure(figsize=(12, 10))
    plt.scatter(z_mean[:, 0], z_mean[:, 1], c=labels)
    plt.colorbar()
    plt.xlabel("z[0]")
    plt.ylabel("z[1]")
    plt.savefig('wer.png')
    plt.close()
(x_train, y_train), _ = keras.datasets.mnist.load_data()
x_train = np.expand_dims(x_train, -1).astype("float32") / 255
plot_label_clusters(encoder, decoder, x_train, y_train)
My VAE:
import math
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
batch_size = 100
epochs = 30
(x, y), _ = tf.keras.datasets.mnist.load_data()
x = tf.expand_dims(x.astype("float32") / 255, 3)
wc1 = np.load('wc1.npy')
wc2 = np.load('wc2.npy')
wd1 = np.load('wd1.npy')
wm = np.load('wm.npy')
ws = np.load('ws.npy')
wd2 = np.load('wd2.npy')
wct1 = np.load('wct1.npy')
wct2 = np.load('wct2.npy')
wct3 = np.load('wct3.npy')
epsilon = np.load('epsilon.npy')
class VAE(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.encoder = [
            tf.keras.layers.Conv2D(32, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wc1)),
            tf.keras.layers.Conv2D(64, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wc2)),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(16, activation="relu", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wd1)),
        ]
        self.wm = tf.keras.layers.Dense(2, use_bias=False, kernel_initializer=tf.keras.initializers.constant(wm))
        self.wv = tf.keras.layers.Dense(2, use_bias=False, kernel_initializer=tf.keras.initializers.constant(ws))
        self.decoder = [
            tf.keras.layers.Dense(7 * 7 * 64, activation="relu", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wd2)),
            tf.keras.layers.Reshape((7, 7, 64)),
            tf.keras.layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wct1)),
            tf.keras.layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wct2)),
            tf.keras.layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same", use_bias=False, kernel_initializer=tf.keras.initializers.constant(wct3)),
        ]
        self.optimizer = tf.keras.optimizers.Adam()

    # tf.function
    def call(self, x):
        with tf.GradientTape() as tape:
            z = x
            for layer in self.encoder:
                z = layer(z)
            mean = self.wm(z)
            stdev = self.wv(z)
            # epsilon = tf.random.normal(mean.shape)
            z = mean + tf.exp(0.5 * stdev) * epsilon
            y_pred = z
            for layer in self.decoder:
                y_pred = layer(y_pred)
            reconstruction_loss = tf.reduce_mean(
                tf.keras.losses.binary_crossentropy(x, y_pred)
            )
            reconstruction_loss *= 28 * 28
            kl_loss = 1 + stdev - tf.square(mean) - tf.exp(stdev)
            kl_loss = tf.reduce_mean(kl_loss)
            kl_loss *= -0.5
            loss = reconstruction_loss + kl_loss
        gradients = tape.gradient(loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_weights))
        return loss, reconstruction_loss, kl_loss
model = VAE()
def train(x):
    batch_quantity = math.floor(x.shape[0] / batch_size)
    for i in range(epochs):
        for bi in range(batch_quantity):
            train_x = x[bi:bi + batch_size]
            loss, reconstruction_loss, kl_loss = model(train_x)
            print(
                'Epoch:', i,
                'Batch:', bi,
                'Loss:', loss.numpy(),
                'Reconstruction:', reconstruction_loss.numpy(),
                'KL:', kl_loss.numpy()
            )
train(x)
# display a 2D plot of the digit classes in the latent space
z = x
for layer in model.encoder:
z = layer(z)
mean = model.wm(z)
plt.figure(figsize=(12, 10))
plt.scatter(mean[:, 0], mean[:, 1], c=y)
plt.colorbar()
plt.xlabel("z[0]")
plt.ylabel("z[1]")
plt.savefig('def.png')
plt.close()
n = 30
d = 2.0
s = 28
grid_x = np.linspace(-d, d, n)
grid_y = np.linspace(d, -d, n)
image_width = s*n
image_height = image_width
image = np.zeros((image_height, image_width))
for row, x in enumerate(grid_x):
    for col, y in enumerate(grid_y):
        z = np.array([[x, y]])
        for layer in model.decoder:
            z = layer(z)
        digit = tf.squeeze(z)
        image[row * s: (row + 1) * s,
              col * s: (col + 1) * s] = digit.numpy()
plt.figure(figsize=(15, 15))
plt.imshow(image, cmap='Greys_r')
plt.axis('Off')
plt.savefig('asd.png')
plt.close()

Related

Variational autoencoder using tf.GradientTape

Here is an example of TensorFlow GradientTape provided by Keras for a typical variational autoencoder:
VAE-keras-example
The train_step function is implemented inside the model, and the model is trained with model.fit(). The example performs well, with no problems at all.
However, for another application I need to implement the train_step function outside of the model definition. I started from the example above, since my target application is also a kind of VAE, applied some modifications, and tried to train the same model structure; the whole code is below. However, I get very strange loss values compared to the original code, and after a couple of iterations the losses even become nan.
Could you please tell me what the mistake is and why this happens?
Thanks in advance
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow import keras
import numpy as np
print(tf.test.is_gpu_available()) # prints True
print(tf.__version__) # prints '2.0.0-beta1'
class Sampling(layers.Layer):
"""Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
def call(self, inputs):
z_mean, z_log_var = inputs
batch = tf.shape(z_mean)[0]
dim = tf.shape(z_mean)[1]
epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
return z_mean + tf.exp(0.5 * z_log_var) * epsilon
latent_dim = 2
encoder_inputs = keras.Input(shape=(28, 28, 1))
x = layers.Conv2D(32, 3, activation="relu", strides=2, padding="same")(encoder_inputs)
x = layers.Conv2D(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu")(x)
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])
x = layers.Dense(7 * 7 * 64, activation="relu")(z)
x = layers.Reshape((7, 7, 64))(x)
x = layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same")(x)
decoder_outputs = layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same")(x)
model = keras.Model(encoder_inputs, [decoder_outputs, z_mean, z_log_var] , name="decoder")
model.summary()
optimizer = tf.keras.optimizers.Adam(lr=0.001)
objective = tf.keras.losses.SparseCategoricalCrossentropy()
(x_train, _), (x_test, _) = keras.datasets.mnist.load_data()
num_samples = x_train.shape[0]
epochs=1
batch_size=128
#tf.function
def train_step(data):
with tf.GradientTape() as tape:
reconstruction, z_mean, z_log_var = model(data, training=True)
data = tf.expand_dims(data, axis=-1)
reconstruction_loss = tf.reduce_mean(
tf.reduce_sum(keras.losses.binary_crossentropy(data, reconstruction), axis=(1, 2)))
kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
total_loss = (reconstruction_loss + kl_loss)
grads = tape.gradient(total_loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
return total_loss, reconstruction_loss, kl_loss
with tf.device('gpu:0'):
for epoch in range (epochs):
for step in range(num_samples//batch_size):
s = step*batch_size
e = s+batch_size
x_batch = x_train[s:e,:,:]
total_loss, reconstruction_loss, kl_loss = train_step(x_batch)
print("-----------------")
print(f"epoch: {epoch} step: {step}")
print(f"reconstruction_loss: {reconstruction_loss} ")
print(f"kl_loss: {kl_loss} ")
print(f"total_loss: {total_loss}")
I think you forgot to normalize your data as shown in the tutorial you are referring to:
(x_train, _), (x_test, _) = keras.datasets.mnist.load_data()
x_train = x_train.astype("float32") / 255
Otherwise, your code seems to be running fine and the loss is not nan. Here is the code for reference:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow import keras
import numpy as np
print(tf.test.is_gpu_available()) # prints True
print(tf.__version__) # prints '2.0.0-beta1'
class Sampling(layers.Layer):
"""Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""
def call(self, inputs):
z_mean, z_log_var = inputs
batch = tf.shape(z_mean)[0]
dim = tf.shape(z_mean)[1]
epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
return z_mean + tf.exp(0.5 * z_log_var) * epsilon
latent_dim = 2
encoder_inputs = keras.Input(shape=(28, 28, 1))
x = layers.Conv2D(32, 3, activation="relu", strides=2, padding="same")(encoder_inputs)
x = layers.Conv2D(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu")(x)
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])
x = layers.Dense(7 * 7 * 64, activation="relu")(z)
x = layers.Reshape((7, 7, 64))(x)
x = layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same")(x)
decoder_outputs = layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same")(x)
model = keras.Model(encoder_inputs, [decoder_outputs, z_mean, z_log_var] , name="decoder")
model.summary()
optimizer = tf.keras.optimizers.Adam(lr=0.001)
objective = tf.keras.losses.SparseCategoricalCrossentropy()
(x_train, _), (x_test, _) = keras.datasets.mnist.load_data()
x_train = x_train.astype("float32") / 255
num_samples = x_train.shape[0]
epochs=4
batch_size=128
#tf.function
def train_step(data):
with tf.GradientTape() as tape:
reconstruction, z_mean, z_log_var = model(data, training=True)
reconstruction_loss = tf.reduce_mean(
tf.reduce_sum(keras.losses.binary_crossentropy(data, reconstruction), axis=(1, 2)))
kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))/batch_size
kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))/batch_size
total_loss = (reconstruction_loss + kl_loss)
grads = tape.gradient(total_loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
return total_loss, reconstruction_loss, kl_loss
with tf.device('gpu:0'):
for epoch in range (epochs):
for step in range(num_samples//batch_size):
s = step*batch_size
e = s+batch_size
x_batch = x_train[s:e,:,:, tf.newaxis]
print(x_batch.shape)
total_loss, reconstruction_loss, kl_loss = train_step(x_batch)
print("-----------------")
print(f"epoch: {epoch} step: {step}")
print(f"reconstruction_loss: {reconstruction_loss} ")
print(f"kl_loss: {kl_loss} ")
print(f"total_loss: {total_loss}")

How to plot Receptive Fields for a CNN/FashionMNIST?

I created my CNN with PyTorch Lightning, and I am now looking to plot its receptive fields.
Do you have any suggestions?
I have looked at various solutions here and there, but I can't make them work together with PyTorch Lightning.
Is it possible to visualize the receptive fields directly inside TensorBoard?
I'll share with you my Dataset:
train_dataset = torchvision.datasets.FashionMNIST('classifier_data', train=True, download=True, transform=transforms.ToTensor())
train, val = train_test_split(train_dataset, test_size = .2)
train_loader = DataLoader(train, batch_size = 32)
val_loader = DataLoader(train, batch_size = 32)
test_dataset = torchvision.datasets.FashionMNIST('classifier_data', train=False, download=True, transform=transforms.ToTensor())
test_loader = DataLoader(test_dataset, batch_size = 32)
and CNN:
def __init__(self, dropout, learn_rate, momentum, weight_decay, optimizer):
#def __init__(self, dropout, learn_rate, weight_decay, optimizer):
super().__init__()
self.conv1 = nn.Conv2d(in_channels = 1, out_channels = 6, kernel_size = 5)
self.conv2 = nn.Conv2d(in_channels = 6, out_channels = 12 , kernel_size = 5)
self.fc1 = nn.Linear(in_features = 12*4*4, out_features = 120)
self.fc2 = nn.Linear(in_features = 120, out_features = 60)
self.out = nn.Linear(in_features = 60, out_features = 10)
self.do = nn.Dropout(dropout) #for overfitting issues
self.loss = nn.CrossEntropyLoss()
self.accuracy = torchmetrics.Accuracy()
self.learn_rate = learn_rate
self.momentum = momentum #with Adam we don't have momentum. To Check best Optimizer with Optune, please comment this line.
self.weight_decay = weight_decay
self.optimizer = optimizer
self.train_loss = []
self.val_loss = []
self.train_acc = []
self.test_acc = []
#plot into tensorboard
log_dir = pathlib.Path.cwd() / "lightning_logs"
self.writer = SummaryWriter(log_dir)
#forward step
#I add each layer to the histogram. It's plotted into tensorboard
def forward(self, x, additional_out=False):
#conv1
x = self.conv1(x)
self.writer.add_histogram("First convolutional layer CNN", x)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size = 2, stride = 2)
#conv2
x = self.conv2(x)
self.writer.add_histogram("Second convolutional layer CNN", x)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size = 2, stride = 2)
#fuly connected 1
x = x.reshape(-1, 12*4*4)
x = self.fc1(x)
self.writer.add_histogram("First linear layer CNN", x)
x = F.relu(x)
x = self.do(x)
#fully connected 2
x=self.fc2(x)
self.writer.add_histogram("Second linear layer CNN", x)
x = F.relu(x)
x = self.do(x)
#output
x = self.out(x)
self.writer.add_histogram("Output layer CNN", x)
return x
#optimizer
def configure_optimizers(self):
#optimizer = self.optimizer(self.parameters(), lr = self.learn_rate, momentum = self.momentum, weight_decay = self.weight_decay)
optimizer = self.optimizer(self.parameters(), lr = self.learn_rate, weight_decay = self.weight_decay)
return optimizer
#training step
def training_step(self, batch, batch_idx):
x, y = batch
b = x.size(0)
x = x.view(b, -1, 28, 28)
logit = self(x)
J = self.loss(logit, y) #loss
#self.train_loss.append(J) #no need to append
acc = self.accuracy(logit, y) #accuracy
#self.train_acc.append(acc) #no need to append
self.log("train_loss_cnn", J.item())
self.log("train_acc_cnn", acc.item())
return {'loss': J}
#Since I used Tensorboard, it don't have to append to loss
def test_step(self, batch, batch_idx):
p, q = batch
b = p.size(0)
p = p.view(b, -1, 28, 28)
logit = self(p)
J = self.loss(logit, q) #loss
acc_test = self.accuracy(logit, q) #accuracy
#self.train_acc.append(acc_test) #no need to append
#self.train_loss.append(J) #no need to append
self.log("test_acc_cnn", acc_test.item())
self.log("test_loss_cnn", J.item())
def validation_step(self, batch, batch_idx=None):
u, v = batch
b = u.size(0)
u = u.view(b, -1, 28, 28)
logit = self(u)
J = self.loss(logit, v) #loss
#self.val_loss.append(J) #no need to append
acc_val = self.accuracy(logit, v) #accuracy
#self.train_acc.append(acc_val) #no need to append
self.log("val_loss_cnn", J.item())
self.log("val_acc_cnn", acc_val.item())
return {"loss": J, "pred": logit, "target": v}
#Once saves from validation step, I take with me the returned elements, and I can plot the Confusion Matrix inside Tensorboard
def validation_epoch_end(self, outputs):
preds = torch.cat([tmp['pred'] for tmp in outputs])
targets = torch.cat([tmp['target'] for tmp in outputs])
conf_mat = confusion_matrix(preds, targets, num_classes=10)
df_cm = pd.DataFrame(conf_mat.numpy(), index = range(10), columns=range(10))
plt.figure(figsize = (10,7))
fig_ = sns.heatmap(df_cm, annot=True, cmap='Spectral').get_figure()
plt.close(fig_)
self.logger.experiment.add_figure("Confusion matrix CNN", fig_, self.current_epoch)
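One hedged starting point, before trying to visualize anything in TensorBoard, is to compute the theoretical receptive-field size of the conv/pool stack analytically; the helper below is a generic sketch (not tied to PyTorch Lightning or TensorBoard), fed with the kernel and stride values of the two conv/max-pool pairs above.
def receptive_field(layers_ks):
    """layers_ks: list of (kernel_size, stride) pairs, in input-to-output order."""
    rf, jump = 1, 1
    for k, s in layers_ks:
        rf += (k - 1) * jump   # growth contributed by this layer
        jump *= s              # effective stride accumulated so far
    return rf

# conv1 (5, 1) -> max_pool (2, 2) -> conv2 (5, 1) -> max_pool (2, 2)
print(receptive_field([(5, 1), (2, 2), (5, 1), (2, 2)]))  # -> 16 pixels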

ValueError in model subclassing with tensorflow 2

I'm trying to implement a WideResNet using Model subclassing in Keras. I cannot understand what's wrong with my code:
class ResidualBlock(layers.Layer):
    def __init__(self, filters, kernel_size, dropout, dropout_percentage, strides=1, **kwargs):
        super(ResidualBlock, self).__init__(**kwargs)
        self.conv_1 = layers.Conv2D(filters, (1, 1), strides=strides)
        self.bn_1 = layers.BatchNormalization()
        self.rel_1 = layers.ReLU()
        self.conv_2 = layers.Conv2D(filters, kernel_size, padding="same", strides=strides)
        self.dropout = layers.Dropout(dropout_percentage)
        self.bn_2 = layers.BatchNormalization()
        self.rel_2 = layers.ReLU()
        self.conv_3 = layers.Conv2D(filters, kernel_size, padding="same")
        self.add = layers.Add()
        self.dropout = dropout
        self.strides = strides

    def call(self, inputs):
        x = inputs
        if self.strides > 1:
            x = self.conv_1(x)
        res_x = self.bn_1(x)
        res_x = self.rel_1(x)
        res_x = self.conv_2(x)
        if self.dropout:
            res_x = self.dropout(x)
        res_x = self.bn_2(x)
        res_x = self.rel_2(x)
        res_x = self.conv_3(x)
        inputs = self.add([x, res_x])
        return inputs

class WideResidualNetwork(models.Model):
    def __init__(self, input_shape, n_classes, d, k, kernel_size=(3, 3), dropout=False, dropout_percentage=0.3, strides=1, **kwargs):
        super(WideResidualNetwork, self).__init__(**kwargs)
        if (d-4)%6 != 0:
            raise ValueError('Please choose a correct depth!')
        self.rel_1 = layers.ReLU()
        self.conv_1 = layers.Conv2D(16, (3, 3), padding='same')
        self.conv_2 = layers.Conv2D(16*k, (1, 1))
        self.dense = layers.Dense(n_classes)
        self.dropout = dropout
        self.dropout_percentage = dropout_percentage
        self.N = int((d - 4) / 6)
        self.k = k
        self.d = d
        self.kernel_size = kernel_size

    def build(self, input_shape):
        self.bn_1 = layers.BatchNormalization(input_shape=input_shape)

    def call(self, inputs):
        x = self.bn_1(inputs)
        x = self.rel_1(x)
        x = self.conv_1(x)
        x = self.conv_2(x)
        for _ in range(self.N):
            x = ResidualBlock(16*self.k, self.kernel_size, self.dropout, self.dropout_percentage)(x)
        x = ResidualBlock(32*self.k, self.kernel_size, self.dropout, self.dropout_percentage, strides=2)(x)
        for _ in range(self.N-1):
            x = ResidualBlock(32*self.k, self.kernel_size, self.dropout, self.dropout_percentage)(x)
        x = ResidualBlock(64*self.k, self.kernel_size, self.dropout, self.dropout_percentage, strides=2)(x)
        for _ in range(self.N-1):
            x = ResidualBlock(64*self.k, self.kernel_size, self.dropout, self.dropout_percentage)(x)
        x = layers.GlobalAveragePooling2D()(x)
        x = self.dense(x)
        x = layers.Activation("softmax")(x)
        return x
When I try to fit the model this way:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
model = WideResidualNetwork(x_train[0].shape, 10, 28, 1)
x_train, x_test = x_train/255. , x_test/255.
model = WideResidualNetwork(x_train[0].shape, 10, 28, 1)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
epochs = 40
batch_size = 64
validation_split = 0.2
h = model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)
I got the following error:
...
<ipython-input-26-61c1bdb3546c>:31 call *
x = ResidualBlock(16*self.k, self.kernel_size, self.dropout, self.dropout_percentage)(x)
<ipython-input-9-3fea1e77cb6e>:23 call *
res_x = self.bn_1(x)
...
ValueError: tf.function-decorated function tried to create variables on non-first call.
So I don't understand where the problem is. I also tried moving the initialization into build, but without success; the error persists. I probably have some gaps in my knowledge.
Thank you in advance.
You are creating the ResidualBlock, GlobalAveragePooling2D, and Activation layers inside the call method. Try moving them into __init__, as you did for the other layers, and it shouldn't give you that error.
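A minimal sketch of that fix (a simplified, hypothetical model rather than the full WideResNet above; it reuses the ResidualBlock class from the question): every layer, including the residual blocks, the pooling layer, and the final Activation, is created once in __init__ and only called in call(), so no variables get created after the first call.
import tensorflow as tf
from tensorflow.keras import layers, models

class TinyWRN(models.Model):                      # hypothetical, simplified model
    def __init__(self, n_classes, n_blocks=2, k=1, **kwargs):
        super().__init__(**kwargs)
        self.bn_1 = layers.BatchNormalization()
        self.rel_1 = layers.ReLU()
        self.conv_1 = layers.Conv2D(16, (3, 3), padding='same')
        # Build the residual blocks here, not inside call().
        # ResidualBlock is the class defined in the question above.
        self.blocks = [ResidualBlock(16 * k, (3, 3), False, 0.3)
                       for _ in range(n_blocks)]
        self.pool = layers.GlobalAveragePooling2D()
        self.dense = layers.Dense(n_classes)
        self.softmax = layers.Activation('softmax')

    def call(self, inputs):
        x = self.conv_1(self.rel_1(self.bn_1(inputs)))
        for block in self.blocks:                 # only *call* the existing layers here
            x = block(x)
        x = self.dense(self.pool(x))
        return self.softmax(x)
With all layers created up front, fit() can wrap train_step in a tf.function and retrace it without trying to create new variables on a later call, which is exactly what the ValueError complains about.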

Keras, using a generator for the data (VAE)

I'm currently trying to implement a variational autoencoder, but I'm quite stuck: I cannot understand how to use a data generator in Keras. What I have so far is:
import keras
import tensorflow as tf
from tensorflow.keras import layers
class Sampling(layers.Layer):
def call(self, inputs):
z_mean, z_log_var = inputs
batch = tf.shape(z_mean)[0]
dim = tf.shape(z_mean)[1]
epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
return z_mean + tf.exp(z_log_var / 2) * epsilon
class factor_vae(keras.Model):
def __init__(self):
super(factor_vae, self).__init__()
self.encoder = self.encoder_factor_vae()
self.decoder = self.decoder_factor_vae()
self.classifier = self.MLP_classifier()
def train_step(self, data):
data = data[0]
with tf.GradientTape() as tape:
z, z_mean, z_log_var = self.encoder(data)
reconstruction = self.decoder(z)
reconstruction_loss = tf.reduce_mean(
keras.losses.mse(data, reconstruction))
reconstruction_loss *= 4096  # this can be changed
kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
kl_loss = tf.reduce_mean(kl_loss)
kl_loss *= -0.5
total_loss = reconstruction_loss + (kl_loss)
grads = tape.gradient(total_loss, self.trainable_weights)
self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
return {
"loss": total_loss,
"reconstruction_loss": reconstruction_loss,
"kl_loss": kl_loss,
}
def encoder_factor_vae(self):
x_inp = Input(shape=(64, 64, 1))
z = layers.Conv2D(filters=32, kernel_size=(4, 4), activation="relu", strides=2, padding="same")(x_inp)
z = BatchNormalization()(z)
z = layers.Conv2D(filters=32, kernel_size=(4, 4), activation="relu", strides=2, padding="same")(z)
z = BatchNormalization()(z)
z = layers.Conv2D(filters=64, kernel_size=(4, 4), activation="relu", strides=2, padding="same")(z)
z = BatchNormalization()(z)
z = layers.Conv2D(filters=64, kernel_size=(4, 4), activation="relu", strides=2, padding="same")(z)
z = BatchNormalization()(z)
z = layers.Flatten()(z)
z = Dense(units=128, activation='relu')(z)
z = BatchNormalization()(z)
z_mean = Dense(units=10, activation='relu')(z)  # I think the sampling should happen here
z_log_var = Dense(units=10, activation='sigmoid')(z)  # should be the sampling from the reparameterization
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(x_inp, [z, z_mean, z_log_var], name="encoder")
encoder.summary()
return encoder
def decoder_factor_vae(self):
z_inp = Input(shape=(10,))
x_rec = Dense(units=128, activation='relu')(z_inp)
x_rec = BatchNormalization()(x_rec)
x_rec = Dense(units=1024, activation='relu')(x_rec)  # works up to this point
x_rec = BatchNormalization()(x_rec)
x_rec = layers.Reshape((4, 4, 64))(x_rec)
x_rec = layers.Conv2DTranspose(filters=64, kernel_size=(4, 4), activation='relu', strides=2, padding='same')(
x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = layers.Conv2DTranspose(filters=32, kernel_size=(4, 4), activation='relu', strides=2, padding='same')(
x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = layers.Conv2DTranspose(filters=32, kernel_size=(4, 4), activation='relu', strides=2, padding='same')(
x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = layers.Conv2DTranspose(filters=1, kernel_size=(4, 4), strides=2, padding='same')(
x_rec)
decoder = keras.Model(z_inp, x_rec, name="decoder")  # any batch size can be passed in
decoder.summary()
return decoder
def MLP_classifier(self):
z_inp = Input(shape=(10,))
x_rec = Dense(units=1000)(z_inp) #1
x_rec = LeakyReLU(alpha=0.3)(x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = Dense(units=1000)(x_rec) #2
x_rec = LeakyReLU(alpha=0.3)(x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = Dense(units=1000)(x_rec) # 3
x_rec = LeakyReLU(alpha=0.3)(x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = Dense(units=1000)(x_rec) # 4
x_rec = LeakyReLU(alpha=0.3)(x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = Dense(units=1000)(x_rec) # 5
x_rec = LeakyReLU(alpha=0.3)(x_rec)
x_rec = BatchNormalization()(x_rec)
x_rec = Dense(units=2)(x_rec) # 6
classifier = keras.Model(z_inp, x_rec, name="clasifier")
return classifier
def generate_batches(data):
L = 50
start = 0
end = start + L
y_L_real = np.zeros((L, 2))
y_L_fake = np.zeros((L, 2))
y_L_real[:, 0] = 1
y_L_fake[:, 1] = 1
#total_y = np.vstack((y_L_real, y_L_fake))
while True:
x_L_real = data[start:end]  # the number of values is 2 x L
x_L_fake = np.roll(x_L_real, shift=2, axis=0)
total_x = np.vstack((x_L_real, x_L_fake))
start += L
end += L
if start >= data.shape[0]:
start = 0
end = L
yield total_x, total_x
data = dsprite()
factor = factor_vae()
xyz = np.load("C:\\Users\\joaki\\OneDrive\\Skrivbord\\images\\dsprites_ndarray_"
"co1sh3sc6or40x32y32_64x64.npz")
test_data = xyz['imgs']
train_steps = 3000
steps_epoch = 300
factor.compile(optimizer=keras.optimizers.Adam(0.001))
train_generator = generate_batches(test_data)
factor.fit_generator(train_generator, steps_per_epoch=steps_epoch, epochs=50)
There is a lot of code, but it works fine as long as I use my entire dataset. As soon as I try to use my implemented "train_generator", however, it breaks down and I get the error message:
NotImplementedError: When subclassing the Model class, you should implement a call method. So I know there is something wrong with my implementation of the train_generator, but I don't understand what I've missed. Can someone provide me with more information?
Try reading this forum page; it seems that you should implement a call method in your class when subclassing:
https://github.com/tensorflow/tensorflow/issues/43173
Although all subclasses of keras.Model must implement call, it is missing in several examples of Keras (see here or here). Under certain conditions, the error 'When subclassing the Model class, you should implement a call method.' is thrown.
I encountered this problem when including a DataGenerator (subclassed from keras.utils.Sequence) and solved it by implementing call() like this:
Autoencoder
...
    def call(self, inputs, training=None, mask=None):
        z = self.encoder(inputs=inputs, training=training, mask=mask)
        return self.decoder(z)
...
GAN
...
    def call(self, inputs, training=None, mask=None):
        batch_size = tf.shape(inputs)[0]
        random_latent_vector = tf.random.normal(shape=(batch_size, self.latent_dim))
        x = self.generator(inputs=random_latent_vector, training=training, mask=mask)
        if len(x.shape) != len(inputs.shape):
            raise Exception(f'Fake signal ({x.shape}) and real signal ({inputs.shape}) do not have same shape dimension')
        return self.critic(inputs=x, training=training, mask=mask)
...
It seems to be a known problem (see here)
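For reference, a minimal self-contained sketch of that pattern (a hypothetical toy model and random data, not the factor VAE above): a subclassed Model with call() implemented, trained through fit() with a keras.utils.Sequence generator.
import numpy as np
import tensorflow as tf
from tensorflow import keras

class ToyAutoencoder(keras.Model):
    def __init__(self):
        super().__init__()
        self.encoder = keras.layers.Dense(8, activation='relu')
        self.decoder = keras.layers.Dense(64, activation='sigmoid')

    def call(self, inputs, training=None, mask=None):
        # Implementing call() is what fit() needs when it gets a Sequence.
        return self.decoder(self.encoder(inputs))

class ToySequence(keras.utils.Sequence):
    def __init__(self, data, batch_size=32):
        self.data, self.batch_size = data, batch_size

    def __len__(self):
        return len(self.data) // self.batch_size

    def __getitem__(self, idx):
        batch = self.data[idx * self.batch_size:(idx + 1) * self.batch_size]
        return batch, batch              # autoencoder: the input is also the target

data = np.random.rand(320, 64).astype('float32')
model = ToyAutoencoder()
model.compile(optimizer='adam', loss='mse')
model.fit(ToySequence(data), epochs=1)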

Negative labels with Keras OCR example

I am trying to implement a handwriting OCR based on the Keras OCR example: link.
However, I get the following error:
InvalidArgumentError: All labels must be nonnegative integers, batch: 0 labels: 1,0,11,9,45,0,25,17,27,41,39,9,37,0,23,1,39,9,35,0,11,35,29,25,0,1,0,27,9,1,35,3,49,0,43,17,23,23,1,13,9,0,69,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
[[{{node ctc_6/CTCLoss}}]]
[[{{node training_5/SGD/gradients/ctc_6/CTCLoss_grad/mul}}]]
Here are the generator, the ctc and the train function:
def ctc_lambda_func(args):
y_pred, labels, input_length, label_length = args
# the 2 is critical here since the first couple outputs of the RNN
# tend to be garbage:
y_pred = y_pred[:, 2:, :]
return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
#Generation of data: load the images, resize, gray, normalize them
class DataGenerator(keras.utils.Sequence):
def __init__(self, list_Files, labels,downsample_factor, max_string_length=80, batch_size=32, dim=(512,64), shuffle=True):
self.dim = dim
self.batch_size = batch_size
self.labels = labels
self.list_Files = list_Files
self.shuffle = shuffle
self.on_epoch_end()
self.max_string_length = max_string_length
self.downsample_factor = downsample_factor
#TODO: Add weight save
def on_epoch_end(self):
self.indexes = np.arange(len(self.list_Files))
if self.shuffle==True:
np.random.shuffle(self.indexes)
def __data_generation(self, list_Files_temp):
#*[2,2] --> 2,2 (unpack values)
X = np.ones([self.batch_size, *self.dim,1])
y = np.ones([self.batch_size, self.max_string_length])*-1 #As in the keras_ocr example why -1?
X_length = np.zeros([self.batch_size,1])
y_length = np.zeros([self.batch_size,1])
#TODO: add mix with blank inputs as it is said to be important for transitional invariance
for i, file in enumerate(list_Files_temp):
im = cv2.imread(file)# load the file as numpy array
im = cv2.cvtColor(im, cv2.COLOR_RGB2GRAY) #Transform the file into a Gray image
im = cv2.resize(im, self.dim[::-1]) #Resize it (cv2 takes width first)
im = im / 255 #Normalization
X[i,0:self.dim[0],:,0] = im
X_length[i] = self.dim[0] // self.downsample_factor -2 #?????
seq = text_to_labels(self.labels[file])
y[i,0:len(seq)] = text_to_labels(self.labels[file]) #Transform the text into a list of integers
y_length[i] = len(y[i])
print("LEN={0}".format(y_length[i]))
inputs={'the_input': X,
'the_labels': y,
'input_length':X_length,
'label_length':y_length
}
outputs = {'ctc': np.zeros([self.batch_size])}
print(y)
return (inputs, outputs)
def __len__(self):
'Number of batches per epoch'
return int(np.floor(len(self.list_Files) / self.batch_size))
def __getitem__(self, index):
indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
list_Files_temp = [self.list_Files[k] for k in indexes]
#print(list_Files_temp[0])
(inputs, outputs) = self.__data_generation(list_Files_temp)
return (inputs, outputs)
def train(dim_images,partition,labels):
#Misc parameters
absolute_max_string_length = 80
output_size = len(alphabet) + 1 #+1 for the CTC blank symbol
#Network parameters
img_h = dim_images[0]
img_w = dim_images[1]
conv_filters = 16
kernel_size = (3,3)
pool_size = 2
time_dense_size = 32
rnn_size = 512
act = 'relu'
input_shape = (*DIM_IMAGES,1)
downsample_factor = pool_size**2
#Convolutional layer
input_data = Input(name='the_input', shape=input_shape)
inner = Conv2D(conv_filters, kernel_size, padding='same',
activation=act, kernel_initializer='he_normal', name='conv1')(input_data)
inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max1')(inner)
inner = Conv2D(conv_filters, kernel_size, padding='same',
activation=act, kernel_initializer='he_normal',
name='conv2')(inner)
inner = MaxPooling2D(pool_size=(pool_size, pool_size), name='max2')(inner)
conv_to_rnn_dims = (img_w // (pool_size ** 2), (img_h // (pool_size ** 2)) * conv_filters)
inner = Reshape(target_shape=conv_to_rnn_dims, name='reshape')(inner)
#Recurrent layer
gru_1 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru1')(inner)
gru_1b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru1_b')(inner)
gru1_merged = add([gru_1, gru_1b])
gru_2 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru2')(gru1_merged)
gru_2b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru2_b')(gru1_merged)
# transforms RNN output to character activations:
inner = Dense(output_size, kernel_initializer='he_normal',
name='dense2')(concatenate([gru_2, gru_2b]))
#Prediction (need to be decoded)
y_pred = Activation('softmax', name='softmax')(inner)
Model(inputs=input_data, outputs=y_pred).summary()
labelsI = Input(name='the_labels',
shape =[absolute_max_string_length], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')
# Keras doesn't currently support loss funcs with extra parameters
# so CTC loss is implemented in a lambda layer
loss_out = Lambda(
ctc_lambda_func, output_shape=(1,),
name='ctc')([y_pred, labelsI, input_length, label_length])
#Genrators
training_generator = DataGenerator(partition['train'],labels,downsample_factor, batch_size=BATCH_SIZE, dim=DIM_IMAGES, shuffle=True)
valid_generator = DataGenerator(partition['valid'], labels,downsample_factor, batch_size=BATCH_SIZE, dim=DIM_IMAGES, shuffle=False)
# clipnorm seems to speeds up convergence
sgd = SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
model = Model(inputs=[input_data, labelsI, input_length, label_length],
outputs=loss_out)
# the loss calc occurs elsewhere, so use a dummy lambda func for the loss
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd)
# captures output of softmax so we can decode the output during visualization
test_func = K.function([input_data], [y_pred])
model.fit_generator(
generator=training_generator,
steps_per_epoch=(len(partition['train'])-len(partition['valid'])) // BATCH_SIZE,
epochs=20,
validation_data=valid_generator,
validation_steps=len(partition['valid'])//BATCH_SIZE)
I guess the '-1' labels come from this line:
y = np.ones([self.batch_size, self.max_string_length])*-1
In the original code there was a similar line (line 220), but it runs well:
self.Y_data = np.ones([self.num_words, self.absolute_max_string_len]) * -1
I thought the '-1' values were a way of padding the sequence, but this value seems to be forbidden by the CTC function. Is there something I am missing here?
It seems I just mixed up my image length and image width. Also, the "label_length" should be equal to the real length of the sentence (before padding with -1). Therefore the line:
y_length[i] = len(y[i])
Should be replaced by:
y_length[i] = len(seq)
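As a sanity check, here is a small sketch (with made-up shapes and labels) showing that K.ctc_batch_cost accepts -1 padding as long as label_length holds the true, unpadded lengths:
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K

# Labels padded with -1; label_length holds the true length (3), so the
# padding entries are never read by the CTC loss.
labels = np.array([[3, 1, 2, -1, -1]], dtype=np.float32)
label_length = np.array([[3]])                        # unpadded label length
input_length = np.array([[10]])                       # time steps fed to CTC
y_pred = tf.nn.softmax(tf.random.normal((1, 10, 6)))  # (batch, time, classes)
print(K.ctc_batch_cost(labels, y_pred, input_length, label_length))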
