I would like to do a neural network for regression analysis using optuna based on this site.
I would like to create a model with two 1D data as input and one 1D data as output in batch learning.
x is the training data and y is the teacher data.
class Model(nn.Module):
# コンストラクタ(インスタンス生成時の初期化)
def __init__(self,trial, mid_units1, mid_units2):
super(Model, self).__init__()
self.linear1 = nn.Linear(2, mid_units1)
self.bn1 = nn.BatchNorm1d(mid_units1)
self.linear2 = nn.Linear(mid_units1, mid_units2)
self.bn2 = nn.BatchNorm1d(mid_units2)
self.linear3 = nn.Linear(mid_units2, 1)
self.activation = trial_activation(trial)
def forward(self, x):
x = self.linear1(x)
x = self.bn1(x)
x = self.activation(x)
x = self.linear2(x)
device = "cuda" if torch.cuda.is_available() else "cpu"
EPOCH = 100
x = torch.from_numpy(a[0].astype(np.float32)).to(device)
y = torch.from_numpy(a[1].astype(np.float32)).to(device)
def train_epoch(model, optimizer, criterion):
model.train()
optimizer.zero_grad() # 勾配情報を0に初期化
y_pred = model(x) # 予測
loss = criterion(y_pred.reshape(y.shape), y) # 損失を計算(shapeを揃える)
loss.backward() # 勾配の計算
optimizer.step() # 勾配の更新
return loss.item()
def trial_activation(trial):
activation_names = ['ReLU','logsigmoid']
activation_name = trial.suggest_categorical('activation', activation_names)
if activation_name == activation_names[0]:
activation = F.relu
else:
activation = F.logsigmoid
return activation
def objective(trial):
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# 中間層のユニット数の試行
mid_units1 = int(trial.suggest_discrete_uniform("mid_units1", 1024*2,1024*4, 64*2))
mid_units2 = int(trial.suggest_discrete_uniform("mid_units2", 1024, 1024*2, 64*2))
net = Model(trial, mid_units1, mid_units2).to(device)
criterion = nn.MSELoss()
# 最適化手法の試行
optimizer = trial_optimizer(trial, net)
train_loss = 0
for epoch in range(EPOCH):
train_loss = train_epoch(net, optimizer, criterion, device)
torch.save(net.state_dict(), str(trial.number) + "new1.pth")
return train_loss
strage_name = "a.sql"
study_name = 'a'
study = optuna.create_study(
study_name = study_name,
storage='sqlite:///' + strage_name,
load_if_exists=True,
direction='minimize')
TRIAL_SIZE = 100
study.optimize(objective, n_trials=TRIAL_SIZE)
error message
---> 28 loss = criterion(y_pred.reshape(y.shape), y) # 損失を計算(shapeを揃える)
29 loss.backward() # 勾配の計算
30 optimizer.step() # 勾配の更新
AttributeError: 'NoneType' object has no attribute 'reshape'
Because of the above error, I checked the value of y_pred and found it to be None.
model.train()
optimizer.zero_grad()
I am thinking that these two lines may be wrong, but I don't know how to solve this problem.
With PyTorch, when you call y_pred = model(x) that will call the forward function which is defined in the Model class.
So, y_pred will get the result of the forward function, in your case, it returns nothing, that's why you get a None value. You can change the forward function as below:
def forward(self, x):
x = self.linear1(x)
x = self.bn1(x)
x = self.activation(x)
x = self.linear2(x)
return x
Related
I am building a model to classify news (AG news dataset). The vocab size ~33k with custom embedding layer. I have run this for 20 epochs but the loss and accuracy (1.3 and 26% respec.) is almost constant even at the end of 20th epoch. Can someone please help me with this? Also, am I feeding the correct input to the fc layer? I am using CrossEntropyLoss as the loss function.
Here is my model class:
class NewsClassifier(nn.Module):
def __init__(self, vocab_weights = None, rnn_type = 'LSTM', vocab_size = len(vocab.vocab), n_classes = 4, embed_size = 300, rnn_units = 512, \
n_layers = 2, bi_dir = True, rnn_drop = 0.0, padding_index = vocab['<unk>']):
super().__init__()
self.rnn_units = rnn_units
self.n_classes = n_classes
self.rnn_type = rnn_type
if vocab_weights:
self.embedding = nn.Embedding.from_pretrained(torch.as_tensor(vocab_weights))
else:
self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx = padding_index)
if rnn_type == 'LSTM':
self.rnn = nn.LSTM(embed_size, rnn_units, num_layers = n_layers, bidirectional = bi_dir, dropout = rnn_drop)
elif rnn_type == 'GRU':
self.rnn = nn.GRU(embed_size, rnn_units, num_layers = n_layers, bidirectional = bi_dir, dropout = rnn_drop)
else:
raise NotImplementError
self.fc = nn.Linear(2 * rnn_units if bi_dir else rnn_units, self.n_classes)
def forward(self, data, lens):
x_embed = self.embedding(data) # (padded_lens, batch_size, embed_dim)
x_packed = pack_padded_sequence(x_embed, lens.cpu(), enforce_sorted = False) #packing sequences and passing to RNN unit
if self.rnn_type == 'LSTM':
output_packed, (hidden,cell) = self.rnn(x_packed) #output is packed and cannot be fed to linear layers
else:
output_packed, hidden = self.rnn(x_packed) #For GRU there is only hidden state
#Though n number of layers are stacked the output is always 1
output_padded, _ = pad_packed_sequence(output_packed) #output is padded to be fed to linear layer (padded_lens, batch size, hidden_units)
#Picking only the last output --> equivalent to reutrn_sequences = False in Keras
out_reduced = torch.cat((output_padded[-1, :, : self.rnn_units], output_padded[-1, :, self.rnn_units :]), 1)
return self.fc(out_reduced)
model = NewsClassifier()
print(f'The total number of trainable parameters are : {sum(p.numel() for p in model.parameters() if p.requires_grad)}')
My training function is:
def train(model, iterator = trainDataloader, optimizer = optimizer, loss_fn = criterion):
e_loss = e_acc = i = 0
model.train()
for inputs, leng, labels in iterator:
inputs, leng, labels = inputs.to(device), leng.to(device), labels.to(device)
optimizer.zero_grad()
preds = model(inputs, leng).squeeze(1)
loss = loss_fn(preds, labels.long())
acc = accuracy(preds, labels)
loss.backward()
optimizer.step()
e_loss += loss.item()
e_acc += acc.item()
i += 1
return e_loss/i, e_acc/i
def predict(model, iterator = testDataloader, loss_fn = criterion):
e_loss = e_acc = i = 0
model.eval()
with torch.no_grad():
for inputs, leng, labels in iterator:
inputs, leng, labels = inputs.to(device), leng.to(device), labels.to(device)
preds = model(inputs, leng).squeeze(1)
loss = loss_fn(preds, labels.long())
acc = accuracy(preds, labels)
e_loss += loss.item()
e_acc += acc.item()
i += 1
return e_loss/i, e_acc/i
N_EPOCHS = 20
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
start_time = time.time()
train_loss, train_acc = train(model)
valid_loss, valid_acc = predict(model)
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
if valid_loss < best_valid_loss:
best_valid_loss = valid_loss
torch.save(model.state_dict(), 'tut1-model.pt')
print(f'Epoch: {epoch+1:02} / {N_EPOCHS} | Epoch Time: {epoch_mins}m {epoch_secs}s')
print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
I am trying to train an activity recognition system using PyTorch, but the network is not training and loss is not dropping, even though I have a similar model working perfectly on keras. I have provided code for the training loop, model class, and dataset class here. Can you help me why the loss is not dropping (accuracy is not increasing)
main training loop
# create dataset
dataset = IMU_dataset()
train_loader = DataLoader(dataset=dataset,
batch_size=40,
shuffle=True,
num_workers=2)
num_epochs = 100
total_samples = len(dataset)
n_iterations = math.ceil(total_samples/4)
print(total_samples, n_iterations)
input_shape = 3
output_index = 6
device = torch.device('cpu')
model = HARmodel(input_shape, output_index).to(device)
model.float()
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
for epoch in range(num_epochs):
for i, (inputs, labels) in enumerate(train_loader):
# origin shape: [40, 3, 400]
labels = labels.to(device)
# Forward pass
outputs = model(inputs.to(device).float())
loss = criterion(outputs, labels.long())
# Backward and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
# if (i+1) % 5 == 0:
# print(f'loss: {loss.item()}')
print(model.calculate_accuracy(dataset.x_data, dataset.y_data), model.calculate_loss(dataset.x_data, dataset.y_data, criterion))
Here is the model class.
class HARmodel(nn.Module):
"""Model for human-activity-recognition."""
def __init__(self, input_size, num_classes):
super().__init__()
# Extract features, 1D conv layers
self.layer_1 = nn.Conv1d(input_size, 100, 10, stride=1)
self.activation_relu = nn.ReLU()
self.layer_2 = nn.Conv1d(100, 100, 10, stride=1)
self.layer_3 = nn.Conv1d(100, 100, 10, stride=1)
self.layer_4 = nn.MaxPool1d(2, stride=3)
self.layer_5 = nn.Dropout(p=0.2)
self.layer_6 = nn.Conv1d(100, 160, 10, stride=1)
self.layer_7 = nn.Conv1d(160, 160, 10, stride=1)
self.layer_8 = nn.Conv1d(160, 160, 10, stride=1)
# self.layer_9 = nn.AvgPool1d(97)
self.layer_10 = nn.Dropout(p=0.5)
self.layer_11 = nn.Linear(160, 6)
self.activation_softmax = nn.Softmax()
def forward(self, x):
x = self.layer_1(x)
x = self.activation_relu(x)
x = self.layer_2(x)
x = self.activation_relu(x)
x = self.layer_3(x)
x = self.activation_relu(x)
x = self.layer_4(x)
x = self.layer_5(x)
x = self.layer_6(x)
x = self.activation_relu(x)
x = self.layer_7(x)
x = self.activation_relu(x)
x = self.layer_8(x)
x = self.activation_relu(x)
self.layer_9 = nn.AvgPool1d(x.shape[2])
x = self.layer_9(x)
x = self.layer_10(x)
y = self.layer_11(x.view(x.shape[0],x.shape[1]))
# y = self.activation_softmax(y)
return y
def calculate_accuracy(self, X,y):
with torch.no_grad():
output = model.forward(X.float())
max_index = output.max(dim = 1)[1]
true_output = y.type(torch.LongTensor)
result = (max_index == true_output).sum()/y.shape[0]
return result.detach().numpy()
def calculate_loss(self, X,y, crit):
with torch.no_grad():
output = model.forward(X.float())
max_index = output.max(dim = 1)[1]
true_output = y.type(torch.LongTensor)
return crit(output, true_output).item()
Here is the dataset class:
class IMU_dataset(Dataset):
def __init__(self):
self.n = X.shape[0]
self.x_data = torch.from_numpy(X.reshape(-1,3,400))
self.y_data = torch.from_numpy(y)
def __getitem__(self, index):
return self.x_data[index], self.y_data[index]
def __len__(self):
return self.n
EDIT 1:
I got to know that I need to remove the softmax layer
I have tried with a lower learning rate and still have the same problem.
As a Pytorch newbie (coming from tensorflow), I am unsure of how to implement Early Stopping. My research has led me discover that pytorch does not have a native way to so this. I have also discovered torchsample, but am unable to install it in my conda environment for whatever reason. Is there a simple way to go about applying early stopping without it? Here is my current setup:
class RegressionDataset(Dataset):
def __init__(self, X_data, y_data):
self.X_data = X_data
self.y_data = y_data
def __getitem__(self, index):
return self.X_data[index], self.y_data[index]
def __len__(self):
return len(self.X_data)
train_dataset = RegressionDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
val_dataset = RegressionDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).float())
test_dataset = RegressionDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())
# Model Params
EPOCHS = 100
BATCH_SIZE = 1000
LEARNING_RATE = 0.001
NUM_FEATURES = np.shape(X_test)[1]
# Initialize Dataloader
train_loader = DataLoader(dataset = train_dataset, batch_size=BATCH_SIZE, shuffle = True)
val_loader = DataLoader(dataset = val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(dataset = test_dataset, batch_size=BATCH_SIZE)
# Define Neural Network Architecture
class MultipleRegression(nn.Module):
def __init__(self, num_features):
super(MultipleRegression, self).__init__()
# Define architecture
self.layer_1 = nn.Linear(num_features, 16)
self.layer_2 = nn.Linear(16, 32)
self.layer_3 = nn.Linear(32, 25)
self.layer_4 = nn.Linear(25, 20)
self.layer_5 = nn.Linear(20, 16)
self.layer_out = nn.Linear(16, 1)
self.relu = nn.ReLU() # ReLU applied to all layers
# Initialize weights and biases
nn.init.xavier_uniform_(self.layer_1.weight)
nn.init.zeros_(self.layer_1.bias)
nn.init.xavier_uniform_(self.layer_2.weight)
nn.init.zeros_(self.layer_2.bias)
nn.init.xavier_uniform_(self.layer_3.weight)
nn.init.zeros_(self.layer_3.bias)
nn.init.xavier_uniform_(self.layer_4.weight)
nn.init.zeros_(self.layer_4.bias)
nn.init.xavier_uniform_(self.layer_5.weight)
nn.init.zeros_(self.layer_5.bias)
nn.init.xavier_uniform_(self.layer_out.weight)
nn.init.zeros_(self.layer_out.bias)
def forward(self, inputs):
x = self.relu(self.layer_1(inputs))
x = self.relu(self.layer_2(x))
x = self.relu(self.layer_3(x))
x = self.relu(self.layer_4(x))
x = self.relu(self.layer_5(x))
x = self.layer_out(x)
return(x)
def predict(self, test_inputs):
x = self.relu(self.layer_1(test_inputs))
x = self.relu(self.layer_2(x))
x = self.relu(self.layer_3(x))
x = self.relu(self.layer_4(x))
x = self.relu(self.layer_5(x))
x = self.layer_out(x)
return(x)
# Check for GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model = MultipleRegression(NUM_FEATURES)
model.to(device)
print(model)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)
# define dictionary to store loss/epochs for training and validation
loss_stats = {
"train": [],
"val": []
}
# begin training
print("Begin Training")
for e in tqdm(range(1, EPOCHS+1)):
# Training
train_epoch_loss = 0
model.train()
for X_train_batch, y_train_batch in train_loader:
X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
optimizer.zero_grad()
y_train_pred = model(X_train_batch)
train_loss = criterion(y_train_pred, y_train_batch.unsqueeze(1))
train_loss.backward()
optimizer.step()
train_epoch_loss += train_loss.item()
# validation
with torch.no_grad():
val_epoch_loss = 0
model.eval()
for X_val_batch, y_val_batch in val_loader:
X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
y_val_pred = model(X_val_batch)
val_loss = criterion(y_val_pred, y_val_batch.unsqueeze(1))
val_epoch_loss += val_loss.item()
loss_stats["train"].append(train_epoch_loss/len(train_loader))
loss_stats["val"].append(val_epoch_loss/len(val_loader))
print(f"Epoch {e}: \ Train loss: {train_epoch_loss/len(train_loader):.5f} \ Val loss: {val_epoch_loss/len(val_loader):.5f}")
# Visualize loss and accuracy
train_val_loss_df = pd.DataFrame.from_dict(loss_stats).reset_index().melt(id_vars=["index"]).rename(columns = {"index":"epochs"})
plt.figure()
sns.lineplot(data = train_val_loss_df, x = "epochs", y = "value", hue = "variable").set_title("Train-Val Loss/Epoch")
# Test model
y_pred_list = []
with torch.no_grad():
model.eval()
for X_batch, _ in test_loader:
X_batch = X_batch.to(device)
y_test_pred = model(X_batch)
y_pred_list.append(y_test_pred.cpu().numpy())
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]
y_pred_list = [item for sublist in y_pred_list for item in sublist]
y_pred_list = np.array(y_pred_list)
mse = mean_squared_error(y_test, y_pred_list)
r_square = r2_score(y_test, y_pred_list)
print("Mean Squared Error :", mse)
print("R^2 :", r_square)
A basic way to do this is to keep track of the best validation loss obtained so far.
You can have a variable best_loss = 0 initialized before your loop over epochs (or you could do other things like best loss per epoch, etc.).
After each validation pass then do:
if val_loss > best_loss:
best_loss = val_loss
# At this point also save a snapshot of the current model
torch.save(model, 'my_model_best_loss.pth')
Then, if the best_loss does not improve significantly after some number of training steps, or by the end of the epoch, or if it val_loss gets worse, break out of the loop and terminate the training there.
For implementing algorithms like early stopping (and your training loop in general) you may find it easier to give PyTorch Lightning a try (no affiliation, but it's much easier than trying to roll everything by hand).
I am a beginner looking to code an ANN in PyTorch for the task of prediction for a dynamic engineering system of a Free Piston Sterling Engine. The dataset consists of 6 inputs and 3 outputs, as shown below:
Dataset
I have a basic code which I believe should be able to accommodate for this task, however I believe there may be an issue with the labelling of the dataset, and the datatype used. I have tried converting to longtensor datatype but it has not helped.
I receive the following error when changing the output datatype to float32:
"expected scalar type Long but found Float."
and when I put it as int64, I receive:
"Target 85 is out of bounds."
Please take a look, and any advice would be very appreciated. I have included the code below:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
SEED = 4096
torch.manual_seed(SEED)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
file_path = "./Dynamics of Sterling Engine Data(1).csv"
df = pd.read_csv(
file_path,
header=None,
names=[
"Kdp(N/m)",
"Kpp(N/m)",
"Cdp(Ns/m)",
"Cl(Ns/m)",
"mdp(kg)",
"mpp(kg)",
"f(Hz)",
"γ(DP/PP)",
"α(°)",
],
)
n = len(df.index) # 55
shuffle_indices = np.random.permutation(n)
df = df.iloc[shuffle_indices]
x = df.iloc[:, :6].values.astype(np.float32)
y = df.iloc[:, -3].values.astype(np.float32)
mu = x.mean(axis=0)
span = x.max(axis=0) - x.min(axis=0)
def rescale(inputs):
return (inputs - mu) / span
x = rescale(x)
num_train = int(n * 0.82)
num_test = n - num_train
x_train = x[:num_train]
y_train = y[:num_train]
x_test = x[-num_test:]
y_test = y[-num_test:]
class NpDataset(Dataset):
def __init__(self, data, label):
assert len(data) == len(label)
self.data = torch.from_numpy(data)
self.label = torch.from_numpy(label)
def __getitem__(self, index):
return self.data[index], self.label[index]
def __len__(self):
return len(self.label)
train_dataset = NpDataset(x_train, y_train)
test_dataset = NpDataset(x_test, y_test)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False)
device = torch.device("cpu")
print(device)
class SterlingNN(nn.Module):
def __init__(self):
super(SterlingNN, self).__init__()
# 6 input feautures per data point
self.fn1 = nn.Linear(6, 6) # 6 features, 6 nodes in hidden layer
self.fn2 = nn.Linear(6, 3) # 6 nodes in hidden layer, 3 outputs
def forward(self, x):
x = torch.sigmoid(self.fn1(x)) # sigmoid activation function
x = self.fn2(x)
return x
model = SterlingNN()
print(model.to(device))
loss_fn = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(
model.parameters(), lr=0.01, weight_decay=0.01
)
x, y = next(iter(train_dataloader))
x = x[:5].to(device)
score = model(x)
print(score)
def train():
model.train() # model into training mode and iteratate through data loader
for x, y in train_dataloader:
x = x.to(device)
y = y.to(device)
n = x.size(0)
optimiser.zero_grad()
score = model(x)
loss = loss_fn(score, y)
loss.backward()
optimiser.step()
predictions = score.max(1, keepdim=True)[1]
num_correct = predictions.eq(y.view_as(predictions)).sum().item()
acc = num_correct / n
return loss, acc
def evaluate():
model.eval()
with torch.no_grad():
for x, y in test_dataloader:
x = x.to(device)
y = y.to(device)
n = x.size(0)
score = model(x)
loss = loss_fn(score, y)
predictions = score.max(1, keepdim=True)[1]
num_correct = predictions.eq(y.view_as(predictions)).sum().item()
acc = num_correct / n
return loss, acc
max_epochs = 128
for epoch in range(max_epochs):
tr_loss, tr_acc = train()
eva_loss, eva_acc = evaluate()
print(
"[{epoch}/{max_epochs}] Train loss:{tr_loss:.4f} acc:{tr_acc*100:.2f}% - Test loss:{eva_loss:.4f} acc:{eva_acc*100:.2f}%".format()
)
am using an Seq2Seq project from Google that use Encoder/Decoder, there is the 2 encoder and decoder class :
#ENCODER
class EncoderNetwork(tf.keras.Model):
def __getstate__(self):
d = self.__dict__.copy()
d.pop('_parents', None)
return d
def __init__(self,input_vocab_size,embedding_dims, rnn_units ):
super().__init__()
self.encoder_embedding = tf.keras.layers.Embedding(input_dim=input_vocab_size,
output_dim=embedding_dims)
self.encoder_rnnlayer = tf.keras.layers.LSTM(rnn_units,return_sequences=True,
return_state=True )
encoder_embedding = self.encoder_embedding
encoder_rnnlayer = self.encoder_rnnlayer
#DECODER
class DecoderNetwork(tf.keras.Model):
def __getstate__(self):
d = self.__dict__.copy()
d.pop('_parents', None)
return d
def __init__(self,output_vocab_size, embedding_dims, rnn_units):
super().__init__()
self.decoder_embedding = tf.keras.layers.Embedding(input_dim=output_vocab_size,
output_dim=embedding_dims)
self.dense_layer = tf.keras.layers.Dense(output_vocab_size)
self.decoder_rnncell = tf.keras.layers.LSTMCell(rnn_units)
# Sampler
self.sampler = tfa.seq2seq.sampler.TrainingSampler()
# Create attention mechanism with memory = None
self.attention_mechanism = self.build_attention_mechanism(dense_units,None,BATCH_SIZE*[Tx])
self.rnn_cell = self.build_rnn_cell(BATCH_SIZE)
self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler= self.sampler,
output_layer=self.dense_layer)
def build_attention_mechanism(self, units,memory, memory_sequence_length):
return tfa.seq2seq.LuongAttention(units, memory = memory,
memory_sequence_length=memory_sequence_length)
#return tfa.seq2seq.BahdanauAttention(units, memory = memory, memory_sequence_length=memory_sequence_length)
# wrap decodernn cell
def build_rnn_cell(self, batch_size ):
rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnncell, self.attention_mechanism,
attention_layer_size=dense_units)
return rnn_cell
def build_decoder_initial_state(self, batch_size, encoder_state,Dtype):
decoder_initial_state = self.rnn_cell.get_initial_state(batch_size = batch_size,
dtype = Dtype)
decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
return decoder_initial_state
i create an instance of EncoderNetwork and DecoderNetwork with my argument and use the loss_function and train_step already defined to train my model
def loss_function(y_pred, y):
#shape of y [batch_size, ty]
#shape of y_pred [batch_size, Ty, output_vocab_size]
sparsecategoricalcrossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
reduction='none')
loss = sparsecategoricalcrossentropy(y_true=y, y_pred=y_pred)
mask = tf.logical_not(tf.math.equal(y,0)) #output 0 for y=0 else output 1
mask = tf.cast(mask, dtype=loss.dtype)
loss = mask* loss
loss = tf.reduce_mean(loss)
return loss
def train_step(input_batch, output_batch,encoder_initial_cell_state):
#initialize loss = 0
loss = 0
with tf.GradientTape() as tape:
encoder_emb_inp = encoderNetwork.encoder_embedding(input_batch)
a, a_tx, c_tx = encoderNetwork.encoder_rnnlayer(encoder_emb_inp,
initial_state =encoder_initial_cell_state)
#[last step activations,last memory_state] of encoder passed as input to decoder Network
# Prepare correct Decoder input & output sequence data
decoder_input = output_batch[:,:-1] # ignore <end>
#compare logits with timestepped +1 version of decoder_input
decoder_output = output_batch[:,1:] #ignore <start>
# Decoder Embeddings
decoder_emb_inp = decoderNetwork.decoder_embedding(decoder_input)
#Setting up decoder memory from encoder output and Zero State for AttentionWrapperState
decoderNetwork.attention_mechanism.setup_memory(a)
decoder_initial_state = decoderNetwork.build_decoder_initial_state(BATCH_SIZE,
encoder_state=[a_tx, c_tx],
Dtype=tf.float32)
#BasicDecoderOutput
outputs, _, _ = decoderNetwork.decoder(decoder_emb_inp,initial_state=decoder_initial_state,
sequence_length=BATCH_SIZE*[Ty-1])
logits = outputs.rnn_output
#Calculate loss
loss = loss_function(logits, decoder_output)
#Returns the list of all layer variables / weights.
variables = encoderNetwork.trainable_variables + decoderNetwork.trainable_variables
# differentiate loss wrt variables
gradients = tape.gradient(loss, variables)
#grads_and_vars – List of(gradient, variable) pairs.
grads_and_vars = zip(gradients,variables)
optimizer.apply_gradients(grads_and_vars)
return loss
the training does not use fit() methode but like this :
epochs = 20
for i in range(1, epochs+1):
encoder_initial_cell_state = initialize_initial_state()
total_loss = 0.0
for ( batch , (input_batch, output_batch)) in enumerate(dataset.take(steps_per_epoch)):
batch_loss = train_step(input_batch, output_batch, encoder_initial_cell_state)
total_loss += batch_loss
if (batch+1)%5 == 0:
print("total loss: {} epoch {} batch {} ".format(batch_loss.numpy(), i, batch+1))
the result are fine and the predict fonction work perfectly (custom predict function), but how can i save the model ? i tried pickel and keras.save() but it doesn't work any idea ?