Model loss on a padded and masked sequence in tensorflow - python

The data I'm working on is a collection of sequences of different length. I have padded all the sequences to the same length and written an LSTM model that uses masks to ignore the padded part of the data.
However, I would expect the loss of the model to be given by the sum of the losses at every time step divided by the number of valid time steps (loss_masked_1 below), while in truth the denominator is the total number of time steps, valid or not (loss_masked_2).
Is this the intended behavior? And are the two fundamentally equivalent, from the point of view of the backprop algorithm?
Here is an MWE.
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import LSTM
# Config
N = 32
T = 10
n = 2
mask_value = -1.
tf.random.set_seed(1)
np.random.seed(1)
# Data creation
X = np.ones((N, T, n)) * mask_value
Y = np.ones((N, T, 1)) * mask_value
for i in range(N):
    l = np.random.randint(1, T)
    value = np.random.random([l, n])
    X[i, :l] = value
    Y[i, :l] = np.array([sum(v) > 0.5 * n for v in value])[:, None]

class MyModel(Model):
    def __init__(self, n, mask_value, *args, **kwargs):
        super().__init__(name='MyModel', *args, **kwargs)
        self.mask_value = mask_value
        self.n = n
        self.LSTM = LSTM(self.n, return_sequences=True, activation='linear')

    def call(self, inputs, training=None, mask=None):
        mask = tf.cast(tf.reduce_sum(inputs - self.mask_value, axis=-1), tf.bool)
        x = self.LSTM(inputs, mask=mask)
        return x
model = MyModel(n, mask_value)
model.build(input_shape=(N, T, n))
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()
mask = 1 - tf.cast(tf.reduce_all(tf.equal(X, mask_value), axis=-1), tf.float32)
loss_unmasked = tf.reduce_mean(tf.keras.losses.binary_crossentropy(Y, model.predict(X)))
loss_masked_1 = tf.reduce_sum(tf.keras.losses.binary_crossentropy(Y, model.predict(X)) * mask) / tf.reduce_sum(mask)
loss_masked_2 = tf.reduce_sum(tf.keras.losses.binary_crossentropy(Y, model.predict(X)) * mask) / (N * T)
print(f"model.evaluate(X, Y): {model.evaluate(X, Y)[0]:.2f}\n"
      f"loss_unmasked : {loss_unmasked:.2f}\n"
      f"loss_masked_1 : {loss_masked_1:.2f}\n"
      f"loss_masked_2 : {loss_masked_2:.2f}"
      )
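For reference, if the intent is the loss_masked_1 convention (average only over valid time steps), one way to make it explicit is a custom loss that divides by the number of unmasked steps. This is only a sketch built on the mask_value convention of the MWE above, not a statement about how Keras reduces masked losses internally:

# Sketch: binary cross-entropy averaged over valid (unmasked) time steps only,
# i.e. the loss_masked_1 convention. Assumes padded targets equal mask_value,
# as in the MWE above.
def masked_bce(mask_value):
    def loss(y_true, y_pred):
        # 1.0 where the target is real data, 0.0 where it is padding
        valid = tf.cast(tf.reduce_any(tf.not_equal(y_true, mask_value), axis=-1), tf.float32)
        per_step = tf.keras.losses.binary_crossentropy(y_true, y_pred)  # shape (batch, T)
        return tf.reduce_sum(per_step * valid) / tf.maximum(tf.reduce_sum(valid), 1.0)
    return loss

# model.compile(optimizer='adam', loss=masked_bce(mask_value), metrics=['accuracy'])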

Related

Keras tensor operation dimensionality problem when implementing inductive bias

I am trying to implement the following inductive bias (described for a batch/sample size of one, the first array dimension):
A function g takes five variables/features, 100 times (the second array dimension in X below). I then average these hundred function evaluations and feed the result into a second function f. Using the Keras-only API, I tried implementing this inductive bias in the following way:
def call(self, x):
    y_i = self.g(x)[:, :, 0]
    y = keras.backend.sum(y_i, axis=1, keepdims=True) / y_i.shape[1]
    z = self.f(y)
    return z[:, 0]
Sadly, I get the following error.
NotImplementedError: Unable to build a Value from a 'tuple' instance
which is a Keras problem when dimensionality does not match. If I write instead:
def call(self, x):
    y_i = self.g(x)[:, :, 0]
    return y_i
skipping f entirely, the whole model runs fine. I therefore suspect that my summing y = keras.backend.sum(y_i, axis=1, keepdims=True) / y_i.shape[1] is wrong.
For reproduction, here is the entire runnable code. You can skip import os and os.environ... if you are not on a Mac with PlaidML and AMD.
Code
# %%
import os
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
import numpy as np
import keras
def mlp2(size_in, size_out):
    hidden = 128
    inputs = keras.Input(shape=(size_in,))
    x = keras.layers.Dense(hidden, name='layer1', activation='relu')(inputs)
    x = keras.layers.Dense(hidden, name='layer2', activation='relu')(x)
    x = keras.layers.Dense(hidden, name='layer3', activation='relu')(x)
    outputs = keras.layers.Dense(size_out, name='layer4', activation='relu')(x)
    m = keras.Model(inputs, outputs)
    return m

class SumNet(keras.models.Model):
    def __init__(self):
        super(SumNet, self).__init__()
        ########################################################
        # The same inductive bias as above!
        self.g = mlp2(5, 1)
        self.f = mlp2(1, 1)

    def call(self, x):
        y_i = self.g(x)[:, :, 0]
        y = keras.backend.sum(y_i, axis=1, keepdims=True) / y_i.shape[1]
        z = self.f(y)
        return z[:,]
# %%
###### np.random.seed(0)
factoring = SumNet()
# check if there is an argument for a maximum learning rate, set to default 10^-3
# check difference of epochs vs total steps in pytorch "scheduler" object
optimizer = keras.optimizers.Adam(lr=1e-3)
factoring.compile(optimizer, loss=keras.losses.mean_squared_error)
# %%
N = 100000
Nt = 100
X = 6 * np.random.rand(N, Nt, 5) - 3
y_i = X[..., 0] ** 2 + 6 * np.cos(2 * X[..., 2])
y = np.sum(y_i, axis=1) / y_i.shape[1]
z = y ** 2
X.shape, y.shape
# create array along first dim of X
f_dim = np.arange(len(X))
training_indeces = np.random.choice(f_dim, int(.8*f_dim.shape[0]), replace=False)
# include_idx = set(training_indeces)  # A set is more efficient, but doesn't reorder your elements if that is desirable
mask = np.array([(i in training_indeces) for i in np.arange(len(X))])
Xtrain = X[mask]
ztrain = z[mask]
Xtest = X[~mask]
ztest = z[~mask]
# %%
factoring.fit(Xtrain, ztrain, batch_size=64, epochs=3, validation_split=.05)
results = factoring.evaluate(Xtest, ztest, batch_size=64)
print("test loss, test acc:", results)

Trying to use Dice Loss with UNET

I'm trying to implement the UNET at the keras website:
Image segmentation with a U-Net-like architecture
With only one change: use Dice loss instead of "sparse_categorical_crossentropy". However, every time I try something, I get a different error. I'm coding on Google Colab using TensorFlow 2.7.
For example, I tried using
def DiceLoss(targets, inputs, smooth=1e-6):
    # flatten label and prediction tensors
    inputs = K.flatten(inputs)
    targets = K.flatten(targets)
    intersection = K.sum(K.dot(targets, inputs))
    dice = (2 * intersection + smooth) / (K.sum(targets) + K.sum(inputs) + smooth)
    return 1 - dice
The error I got:
ValueError: Shape must be rank 2 but is rank 1 for '{{node DiceLoss99/MatMul}} = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false](DiceLoss99/Reshape_1, DiceLoss99/Reshape)' with input shapes: [?], [?].
The problem is on this line:
intersection = K.sum(K.dot(targets, inputs))
I also tried this library:
!pip install git+https://github.com/qubvel/segmentation_models
# define optimizer
n_classes=3
LR = 0.0001
optim = keras.optimizers.Adam(LR)
dice_loss_sm = sm.losses.DiceLoss(class_weights=K.ones_like(n_classes))
However, I got the following error:
TypeError: Input 'y' of 'Mul' Op has type int32 that does not match type float32 of argument 'x'.
The remaining code is the same as on keras.io, but I list it below for completeness:
from tensorflow.keras import layers
def get_model(img_size, num_classes):
    inputs = keras.Input(shape=img_size + (3,))

    ### [First half of the network: downsampling inputs] ###

    # Entry block
    x = layers.Conv2D(32, 3, strides=2, padding="same")(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    previous_block_activation = x  # Set aside residual

    # Blocks 1, 2, 3 are identical apart from the feature depth.
    for filters in [64, 128, 256]:
        x = layers.Activation("relu")(x)
        x = layers.SeparableConv2D(filters, 3, padding="same")(x)
        x = layers.BatchNormalization()(x)

        x = layers.Activation("relu")(x)
        x = layers.SeparableConv2D(filters, 3, padding="same")(x)
        x = layers.BatchNormalization()(x)

        x = layers.MaxPooling2D(3, strides=2, padding="same")(x)

        # Project residual
        residual = layers.Conv2D(filters, 1, strides=2, padding="same")(
            previous_block_activation
        )
        x = layers.add([x, residual])  # Add back residual
        previous_block_activation = x  # Set aside next residual

    ### [Second half of the network: upsampling inputs] ###

    for filters in [256, 128, 64, 32]:
        x = layers.Activation("relu")(x)
        x = layers.Conv2DTranspose(filters, 3, padding="same")(x)
        x = layers.BatchNormalization()(x)

        x = layers.Activation("relu")(x)
        x = layers.Conv2DTranspose(filters, 3, padding="same")(x)
        x = layers.BatchNormalization()(x)

        x = layers.UpSampling2D(2)(x)

        # Project residual
        residual = layers.UpSampling2D(2)(previous_block_activation)
        residual = layers.Conv2D(filters, 1, padding="same")(residual)
        x = layers.add([x, residual])  # Add back residual
        previous_block_activation = x  # Set aside next residual

    # Add a per-pixel classification layer
    outputs = layers.Conv2D(num_classes, 3, activation="softmax", padding="same")(x)

    # Define the model
    model = keras.Model(inputs, outputs)
    return model
# Free up RAM in case the model definition cells were run multiple times
keras.backend.clear_session()
# Build model
model = get_model(img_size, num_classes)
model.summary()
# Configure the model for training.
# We use the "sparse" version of categorical_crossentropy
# because our target data is integers.
# notice: I changed the loss to the dice loss instead of sparse_categorical_crossentropy
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy")
callbacks = [
    keras.callbacks.ModelCheckpoint("oxford_segmentation.h5", save_best_only=True)
]
# Train the model, doing validation at the end of each epoch.
epochs = 15
model.fit(train_gen, epochs=epochs, validation_data=val_gen, callbacks=callbacks)
EDIT
This is the detailed error location when trying the loss from the segmentation_models library.
The issue is in this code:
backend = kwargs['backend']
"""
Args:
    gt: ground truth 4D keras tensor (B, H, W, C) or (B, C, H, W)
    pr: prediction 4D keras tensor (B, H, W, C) or (B, C, H, W)
    class_weights: 1. or list of class weights, len(weights) = C
    class_indexes: Optional integer or list of integers, classes to consider, if ``None`` all classes are used.
    beta: f-score coefficient
    smooth: value to avoid division by zero
    per_image: if ``True``, metric is calculated as mean over images in batch (B),
        else over whole batch
    threshold: value to round predictions (use ``>`` comparison), if ``None`` prediction will not be round

Returns:
    F-score in range [0, 1]
"""
gt, pr = gather_channels(gt, pr, indexes=class_indexes, **kwargs)
pr = round_if_needed(pr, threshold, **kwargs)
axes = get_reduce_axes(per_image, **kwargs)

# calculate score
tp = backend.sum(gt * pr, axis=axes)  # the issue is here
fp = backend.sum(pr, axis=axes) - tp
fn = backend.sum(gt, axis=axes) - tp

score = ((1 + beta ** 2) * tp + smooth) \
        / ((1 + beta ** 2) * tp + beta ** 2 * fn + fp + smooth)
score = average(score, per_image, class_weights, **kwargs)

return score
The code for gt, pr, and axes is here:
def get_reduce_axes(per_image, **kwargs):
    backend = kwargs['backend']
    axes = [1, 2] if backend.image_data_format() == 'channels_last' else [2, 3]
    if not per_image:
        axes.insert(0, 0)
    return axes

def gather_channels(*xs, indexes=None, **kwargs):
    """Slice tensors along channels axis by given indexes"""
    if indexes is None:
        return xs
    elif isinstance(indexes, (int)):
        indexes = [indexes]
    xs = [_gather_channels(x, indexes=indexes, **kwargs) for x in xs]
    return xs

def round_if_needed(x, threshold, **kwargs):
    backend = kwargs['backend']
    if threshold is not None:
        x = backend.greater(x, threshold)
        x = backend.cast(x, backend.floatx())
    return x
You are passing 1-dimensional vectors to K.dot, while the ValueError says that K.dot requires arrays with 2 dimensions.
You can replace it with element-wise multiplication, i.e. intersection = K.sum(targets * inputs).
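With that fix applied, the loss could look like the sketch below, assuming K is tensorflow.keras.backend; note that if the targets are sparse integer class labels while the model outputs a softmax over num_classes channels, they may still need to be one-hot encoded before a soft Dice is meaningful:

from tensorflow.keras import backend as K

def DiceLoss(targets, inputs, smooth=1e-6):
    # flatten label and prediction tensors
    inputs = K.flatten(inputs)
    targets = K.flatten(K.cast(targets, inputs.dtype))  # cast in case the labels are integers
    # element-wise product instead of K.dot (which expects rank-2 inputs)
    intersection = K.sum(targets * inputs)
    dice = (2 * intersection + smooth) / (K.sum(targets) + K.sum(inputs) + smooth)
    return 1 - dice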

Using a multiple matrices to predict a matrix in Python with Keras

I modified the code from here. What I'm trying to do is combine the two matrices to predict the output matrix. The output matrix is built from the two input matrices. The problem seems to be associated with:
self.Combined_dense_1 = tf.keras.layers.Dense(units=32, activation="relu")
self.Combined_dense_2 = tf.keras.layers.Dense(units=16, activation="softmax")
The linked Medium tutorial only predicts a single number based on the combined mixed input. I, however, am trying to predict a whole matrix but don't know how to structure the combined layer (if this is even the problem).
The error: "ValueError: Shape mismatch: The shape of labels (received (40,)) should equal the shape of logits except for the last dimension (received (10, 16))."
The code:
import warnings
import sys
if not sys.warnoptions:
warnings.simplefilter("ignore")
import numpy as np
import os
import random
import tensorflow as tf
from tensorflow import keras
from IPython.display import clear_output
class model(keras.Model):
    def __init__(self):
        super().__init__()
        # The layers to process our image
        self.Conv2D_1 = tf.keras.layers.Conv2D(filters=32,
                                               kernel_size=(1, 1),
                                               strides=(1, 1))
        self.Conv2D_2 = tf.keras.layers.Conv2D(filters=32,
                                               kernel_size=(3, 3),
                                               strides=(1, 1))
        # our combined layers
        self.Combined_dense_1 = tf.keras.layers.Dense(units=32, activation="relu")
        self.Combined_dense_2 = tf.keras.layers.Dense(units=16, activation="softmax")

    def call(self, input_image_one, input_image_two):
        # Image model
        I = self.Conv2D_1(input_image_one)
        I = self.Conv2D_2(I)
        # Flatten I so we can merge our data.
        I = tf.keras.layers.Flatten()(I)

        N = self.Conv2D_1(input_image_two)
        N = self.Conv2D_2(N)
        N = tf.keras.layers.Flatten()(N)

        # Combined model
        x = tf.concat([N, I], 1)  # Concatenate through axis #1
        x = self.Combined_dense_1(x)
        x = self.Combined_dense_2(x)
        return x

network = model()
optimizer = tf.keras.optimizers.Adam()
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def train_step(model, optimizer, loss_function,
               images_one_batch, images_two_batch,
               labels):
    with tf.GradientTape() as tape:
        model_output = model(images_one_batch, images_two_batch)
        print(model_output)
        loss = loss_function(labels, model_output)  # our labels vs our predictions
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss

def train(model, optimizer, loss_function, epochs,
          images_one_batch, images_two_batch,
          labels):
    loss_array = []
    for epoch in range(epochs):
        loss = train_step(model, optimizer, loss_function, images_one_batch, images_two_batch, labels)
        loss_array.append(loss)
        if (epoch + 1) % 20 == 0:
            # Calculating accuracy
            network_output = network(images_one_batch, images_two_batch)
            preds = np.argmax(network_output, axis=1)
            acc = 0
            for i in range(len(images_one_batch)):
                if preds[i] == labels[i]:
                    acc += 1
            print(" loss:", loss, " Accuracy: ", acc / len(images_one_batch) * 100, "%")
            clear_output(wait=True)
NumberofVars = 2;
width= NumberofVars; height = NumberofVars
NumberOfComputationSets = 10
CM_MatrixArr1 = []
CM_MatrixArr2 = []
for j in range(NumberOfComputationSets):
    Theta1 = list(np.reshape(np.random.randint(2, size=4), (1, 4))[0])
    Theta1 = list(np.float_(Theta1))
    CM_MatrixArr1.append(Theta1)
    Theta2 = list(np.reshape(np.random.randint(2, size=4), (1, 4))[0])
    Theta2 = list(np.float_(Theta2))
    CM_MatrixArr2.append(Theta2)

combinedCM_MatrixArr = []
combinedCM_toIntArr = []
for x, y in zip(CM_MatrixArr1, CM_MatrixArr2):
    combinedCM = []
    combinedCM_toInt = 0
    for a, b in zip(x, y):
        LogVal = (a == b)
        combinedCM.append(float(LogVal == True))
    combinedCM_MatrixArr.append(combinedCM)
combinedCM_MatrixArr = np.array(combinedCM_MatrixArr)
combinedCM_MatrixArr = combinedCM_MatrixArr.reshape(NumberOfComputationSets,2,2)
CM_MatrixArr1 = np.array(CM_MatrixArr1)
CM_MatrixArr1 = CM_MatrixArr1.reshape(NumberOfComputationSets,2,2)
CM_MatrixArr1 = CM_MatrixArr1.reshape(NumberOfComputationSets, 2,2,1)
CM_MatrixArr2 = np.array(CM_MatrixArr2)
CM_MatrixArr2 = CM_MatrixArr2.reshape(NumberOfComputationSets,2,2)
CM_MatrixArr2 = CM_MatrixArr2.reshape(NumberOfComputationSets, 2,2,1)
train(network,optimizer,loss_function,300,CM_MatrixArr1,CM_MatrixArr2,combinedCM_MatrixArr)
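The error message matches the mismatch between the targets and the head of the network: the labels are ten 2x2 matrices (40 values in total), while SparseCategoricalCrossentropy expects one integer class index per sample to pair with the (10, 16) logits. One possible way to predict the whole matrix, assuming the target entries are 0/1 as generated above, is to output one value per matrix entry and score it with an element-wise binary cross-entropy; a sketch of just the changed pieces (layer names are illustrative, not from the original code):

import tensorflow as tf

# Sketch: a head sized to the 2x2 target matrix instead of 16 softmax classes.
combined_dense_1 = tf.keras.layers.Dense(units=32, activation="relu")
matrix_head = tf.keras.layers.Dense(units=4, activation="sigmoid")  # one unit per matrix entry
reshape_to_matrix = tf.keras.layers.Reshape((2, 2))

loss_function = tf.keras.losses.BinaryCrossentropy()  # element-wise, replaces SparseCategoricalCrossentropy

# Inside call():  x = reshape_to_matrix(matrix_head(combined_dense_1(x)))
# The loss then compares labels of shape (batch, 2, 2) with outputs of shape (batch, 2, 2).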

Loss going to NaN after few iterations

In my model, the input is graph data in the form of an edge index and node features. After a few iterations of training on graph data, the loss (EDIT: which is a combination of an MSE loss function and a negative loss function, i.e., L1 + (-L2)) becomes NaN. Both L1 and -L2 become NaN after around 40 iterations.
Learning rate = 0.00001. I also checked for invalid input data, but found none.
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
import networkx as nx
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
class Model(nn.Module):
    def __init__(self, nin, nhid1, nout, inp_l, hid_l, out_l=1):
        super(Model, self).__init__()
        self.g1 = GCNConv(in_channels=nin, out_channels=nhid1)
        self.g2 = GCNConv(in_channels=nhid1, out_channels=nout)
        self.dropout = 0.5
        self.lay1 = nn.Linear(inp_l, hid_l)
        self.lay2 = nn.Linear(hid_l, out_l)

    def forward(self, x, adj):
        x = F.relu(self.g1(x, adj))
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.g2(x, adj)
        x = self.lay1(x)
        x = F.relu(x)
        x = self.lay2(x)
        x = F.relu(x)
        return x
The inputs to the model:
x (Tensor , optional ) – Node feature matrix with shape [num_nodes, num_node_features].
edge_index (LongTensor , optional ) – Graph connectivity in COO format with shape [2, num_edges]
Here num_nodes=1000 ; num_node_features=1 ; num_edges = 5000
GCNConv is a graph embedder that returns a [num_nodes, dim] matrix. It takes in the edge list and the features and returns that matrix.
EDIT 2: Added how the loss is calculated
def train_model(epoch):
    model = Model(nin=1, nhid1=128, nout=128, inp_l=128, hid_l=64, out_l=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.00001)

    model.train()
    t = time.time()
    optimizer.zero_grad()

    Y = model(features, adjacency_list)
    Y1 = func(Y)  # Y1 values are calculated from Y by passing it through a function func to obtain a vector of the same size as Y

    loss1 = ((Y1 - Y) ** 2).mean()  # MSE loss function
    loss2 = -Y.abs().mean()  # This loss is implemented to prevent Y values going to 0. Notice the "-" sign
    loss_train = loss1 + loss2

    loss_train.backward(retain_graph=True)
    nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()

    if epoch % 20 == 0:
        print("MSE loss = ", loss1, "\t", "Mean Loss = ", loss2)
        print('Epoch: {:04d}'.format(epoch + 1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'time: {:.4f}s'.format(time.time() - t))
        print("\n\n")

    return Y
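Since L2 = Y.abs().mean() enters the objective with a minus sign, the combined loss is unbounded below, so it is worth checking whether it simply diverges before becoming NaN. One way to localize where the NaNs first appear, without changing the model, is PyTorch's anomaly detection plus an explicit finiteness check; a debugging sketch (names follow the training loop above):

import torch

# Debugging sketch: report the first op producing NaN/inf in the backward pass,
# and skip the update when the loss is already non-finite.
torch.autograd.set_detect_anomaly(True)

def safe_step(loss1, loss2, loss_train, model, optimizer):
    if not torch.isfinite(loss_train):
        print("non-finite loss:", loss1.item(), loss2.item())
        optimizer.zero_grad()
        return False
    loss_train.backward(retain_graph=True)
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
    return True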

Tensorflow 2 LSTM model doesn't learn using a Sequence

I'm currently using an LSTM model to make time-series predictions with TensorFlow 2.2.0.
I've been using a large dataset and everything works nicely.
However, the dataset creation takes a lot of RAM and I wanted to use a tensorflow.keras.utils.Sequence to solve the issue. My problem is the following:
When using a Sequence, my model doesn't learn anymore (it predicts the average of the real signal over the whole dataset).
My dataset is created from two python lists x_train_flights and y_train_flights, each containing pandas DataFrames. For each (x_train_flight, y_train_flight) of these lists:
x_train_flight of shape (-1, features) containing features signals
y_train_flight of shape (-1, 1) containing one signal being aligned in time with the ones from x_train_flights
The system looks as follows (I am not allowed to share the real data, so I've recreated the graph using pseudo-random signals instead):
Here, features=2 (the blue and orange lines), and look_back=5. That is to say, the 10 points (from x_train_flights) in the rectangle are used to predict the golden point (which is compared to the corresponding point in y_train_flights during the training phase). The gray points are previous predictions.
To create my dataset, I've been using these functions:
def lstm_shapify(sequence, look_back, features):
    res = np.empty((look_back, len(sequence), features), dtype=np.float32)
    for i in range(look_back):
        res[i] = np.roll(sequence, -i * features)
    return np.transpose(res, axes=(1, 0, 2))[:-look_back + 1]

def make_dataset(x_flights, y_flights, look_back, features):
    x = np.empty((0, look_back, features), dtype=np.float32)
    y = np.empty((0, 1), dtype=np.float32)
    for i in range(len(x_flights)):
        x_sample = x_flights[i].values
        y_sample = y_flights[i].values[look_back - 1:]
        x = np.concatenate([x, lstm_shapify(x_sample, look_back, features)])
        y = np.concatenate([y, y_sample])
    return x, y
And I fit my network with the following:
model.fit(
    x_train,
    y_train,
    epochs=7,
    batch_size=batch_size
)
So, I've created this custom Sequence:
class LSTMGenerator(Sequence):
    def __init__(
        self,
        x_flights: List[DataFrame],
        y_flights: List[DataFrame],
        look_back: int,
        batch_size: int,
        features: int
    ):
        self.x_flights = x_flights
        self.y_flights = []
        self.look_back = look_back
        self.batch_size = batch_size
        self.features = features
        self.length = 0
        for y_flight in y_flights:
            y = y_flight.iloc[look_back - 1:].to_numpy()
            self.y_flights.append(y)
            self.length += len(y) // batch_size

    def __getitem__(self, index):
        flight_index = 0
        while True:
            n = len(self.y_flights[flight_index]) // self.batch_size
            if index < n:
                break
            flight_index += 1
            index = index - n
        start_index = index * self.batch_size
        x_batch = lstm_shapify(
            self.x_flights[flight_index]
            .iloc[start_index:start_index + self.batch_size + self.look_back - 1]
            .to_numpy(),
            self.look_back,
            self.features
        )
        y_batch = self.y_flights[flight_index][start_index:start_index + self.batch_size]
        return x_batch, y_batch

    def __len__(self):
        return self.length
Each tuple (x, y) it returns consists of two numpy arrays of shape (batch_size, look_back, features) and (batch_size, 1) respectively.
And now I'm trying to fit it with:
model.fit(
    LSTMGenerator(x_train_flights, y_train_flights, look_back, batch_size, features),
    epochs=epochs
)
Here is my model:
model = Sequential()
model.add(LSTM(
    100,
    input_shape=(look_back, features),
    kernel_regularizer=regularizers.l2(1e-3),
    bias_regularizer=regularizers.l2(1e-4)
))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Dense(1, activation='tanh'))
model.compile(optimizer='adam', loss='mse')
Hope you can help me
EDIT: more details about the datasets
I solved it by taking a break and looking at the code once again (and I realized it was a silly mistake): the issue with my Sequence was that the samples in each batch were consecutive samples in time, whereas the batches of my compute-everything dataset were nicely shuffled.
My Sequence was problematic because the batches were selected at a random index from a random dataset. Now I select each sample at a random index from a random dataset to create a single batch.
Here is a working example:
from tensorflow.keras import *
from tensorflow.keras.layers import *
from tensorflow.keras.utils import *
import numpy as np
import tensorflow as tf
np.random.seed(1234)
tf.random.set_seed(1234)
features = 3
lookback = 7
model = Sequential()
model.add(LSTM(500, input_shape = (lookback, features)))
model.add(Dense(1, activation='tanh'))
XS = np.random.randn(200, features)
YS = np.random.randn(200)
class LookbackSeq(Sequence):
    def __init__(self, XS, YS, batch_size, lookback):
        self.XS = XS
        self.YS = YS
        self.batch_size = batch_size
        self.lookback = lookback

    def __len__(self):
        n_windows = self.XS.shape[0] - self.lookback
        return int(np.ceil(n_windows / self.batch_size))

    def __getitem__(self, i):
        base = i * self.batch_size
        n_windows = self.XS.shape[0] - self.lookback
        batch_size = min(n_windows - base, self.batch_size)
        X = np.zeros((batch_size, self.lookback, self.XS.shape[1]))
        Y = np.zeros((batch_size, 1))
        for i in range(batch_size):
            for j in range(self.lookback):
                X[i, j] = self.XS[base + i + j]
            Y[i] = self.YS[base + i + self.lookback]
        return X, Y
model.compile(optimizer='adam', loss='mse')
# ALL SAMPLES IN MEMORY
X, Y = [], []
for i in range(len(XS) - lookback):
    X.append(XS[i:i+lookback])
    Y.append(YS[i+lookback])
X, Y = np.array(X), np.array(Y)
model.fit(X, Y, epochs = 10, batch_size = 4, shuffle = False)
# GENERATED ON THE FLY
# gen = LookbackSeq(XS, YS, 4, lookback)
# model.fit(x = gen,
# steps_per_epoch = len(gen),
# shuffle = False,
# epochs = 10)
I'm assuming your input data has the shape X = (n_points, n_features) and Y = (n_points,). LookbackSeq does the batching and windowing (lookback) for you.
You can comment and uncomment the relevant lines to either train with samples generated on the fly or with them all stored in memory. You should get identical results.
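Following the same idea as the fix described above (building each batch from randomly chosen windows instead of consecutive ones), a shuffling variant of LookbackSeq could look like this sketch; it keeps a permutation of window start indices and reshuffles it at the end of every epoch:

import numpy as np
from tensorflow.keras.utils import Sequence

class ShuffledLookbackSeq(Sequence):
    """Like LookbackSeq, but every batch is drawn from randomly chosen windows."""
    def __init__(self, XS, YS, batch_size, lookback):
        self.XS, self.YS = XS, YS
        self.batch_size = batch_size
        self.lookback = lookback
        self.starts = np.random.permutation(XS.shape[0] - lookback)

    def __len__(self):
        return int(np.ceil(len(self.starts) / self.batch_size))

    def __getitem__(self, i):
        idx = self.starts[i * self.batch_size:(i + 1) * self.batch_size]
        X = np.stack([self.XS[s:s + self.lookback] for s in idx])
        Y = self.YS[idx + self.lookback].reshape(-1, 1)
        return X, Y

    def on_epoch_end(self):
        np.random.shuffle(self.starts)  # new sample order every epoch

# gen = ShuffledLookbackSeq(XS, YS, 4, lookback)
# model.fit(x=gen, epochs=10)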
