I'm trying to implement gradient accumulation in TF 2.x. All implementations I've found are either for TF 1.x or for the old Keras interface. I don't think there is an implementation out there (though I'd be very happy to be proven wrong on this).
Here's what I'm working with:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense
from tqdm import tqdm
import matplotlib.pyplot as plt
class SimpleTrainStepModel(Model):
def train_step(self, data):
# Unpack the data. Its structure depends on your model and
# on what you pass to `fit()`.
if len(data) == 3:
x, y, sample_weight = data
else:
(x, y), sample_weight = data, None
# FIRST GRADIENT
with tf.GradientTape() as tape:
y_pred = self(x, training = True) # Forward pass
loss = self.compiled_loss(y, y_pred, sample_weight = sample_weight, regularization_losses = self.losses)
gradients = tape.gradient(loss, self.trainable_variables)
self.compiled_metrics.update_state(y, y_pred)
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
return {m.name: m.result() for m in self.metrics}
class GradAccumModel(Model):
def fit(self, *args, batch_size = 32, grad_accum = 1, **kwargs):
self.train_function = None
if batch_size % grad_accum != 0:
raise ValueError('Batch size must be divisible by the Gradient accumulation steps, dummy!')
self.grad_accum = grad_accum
self.batch_size = batch_size
return super(GradAccumModel, self).fit(*args,
batch_size = self.batch_size,
#validation_batch_size = validation_batch_size,#self.batch_size//grad_accum if validation_batch_size is None else validation_batch_size,
**kwargs)
def train_step(self, data):
# Unpack the data. Its structure depends on your model and
# on what you pass to `fit()`.
if len(data) == 3:
x, y, sample_weight = data
else:
(x, y), sample_weight = data, None
step = self.batch_size // self.grad_accum
# def _slice_nested(obj, i, j):
# if type(obj) is list:
# return [o[i:j] for o in obj]
# else:
# return obj[i:j]
# FIRST GRADIENT
with tf.GradientTape() as tape:
y_pred = self(x[:step], training = True) # Forward pass
loss = self.compiled_loss(y[:step], y_pred, sample_weight = sample_weight, regularization_losses = self.losses)
gradients = tape.gradient(loss, self.trainable_variables)
self.compiled_metrics.update_state(y[:step], y_pred)
i = tf.constant(step)
# tf.print('TF - HERE!')
def cond(i, *args):
return i < self.batch_size
def body(i, grad):
# tf.print('\tTF - HERE!')
with tf.GradientTape() as tape:
y_pred = self(x[i:i + step], training = True) # Forward pass
loss = self.compiled_loss(y[i:i + step], y_pred, sample_weight = sample_weight, regularization_losses = self.losses)
_grad = tape.gradient(loss, self.trainable_variables)
for g,_g in zip(grad, _grad):
g += _g
self.compiled_metrics.update_state(y[i:i + step], y_pred)
return [i + step, grad]
i, gradients = tf.while_loop(cond, body, [i, gradients], parallel_iterations = 1)
        # for g in gradients: # I tried with and without division to calculate the mean
        #     g *= 1/self.grad_accum #
# Update weights
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
# Update metrics (includes the metric that tracks the loss)
# Return a dict mapping metric names to current value
return {m.name: m.result() for m in self.metrics}
if __name__ == '__main__':
(x_train, y_train), (x_valid, y_valid) = tf.keras.datasets.mnist.load_data()
for MODEL, ga_kwarg, colour in list(zip([Model, SimpleTrainStepModel, GradAccumModel, GradAccumModel],
[{}, {}, {'grad_accum': 1}, {'grad_accum': 6}],
['blue', 'green', 'yellow', 'red'])):
for _ in tqdm(range(10)):
# tf.random.set_seed(0)
x = Input((28, 28))
y = x
y = Flatten()(y)
y = Dense(128, activation = 'sigmoid')(y)
y = Dense(10, activation = 'softmax')(y)
model = MODEL(x, y)
model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(),
optimizer = tf.keras.optimizers.Adam(1e-4),
metrics = ['acc'])
hist = model.fit(x_train, y_train, validation_data = (x_valid, y_valid), verbose = 0, batch_size = 6000, epochs = 100, **ga_kwarg)
plt.plot(hist.history['val_acc'], color = colour, alpha = .25)
plt.title('')
plt.xscale('symlog')
plt.yscale('logit')
plt.show()
I've been able to verify that it does actually save GPU memory. However, the end result is not the same as with the normal Model.fit.
As you can see, the first three Model.fit runs are well clustered and give the same results, but when the while loop comes into play the training is quite different.
Anyone have any idea why this is happening?
After a lot more attempts I found the solution. It seems that the main problem was the compound assignment of the gradients, which does not work quite as I was expecting. Here is my final solution for anyone who might be interested. It includes the extra pieces for distributed training, mixed-precision training, and nested inputs/outputs.
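Before the full code, a minimal sketch of the pitfall (illustration only, not part of the solution): inside the tf.while_loop body, an augmented assignment on a tensor just rebinds the local Python name, so the gradient list handed back to the loop never receives the accumulated values; building a new list does.

# This silently drops the accumulation: `g += _g` creates a new tensor and rebinds
# the local name `g`, while the `grad` list returned to tf.while_loop keeps the old tensors.
for g, _g in zip(grad, _grad):
    g += _g

# This keeps it: the freshly built list becomes the loop variable for the next iteration.
grad = [g + _g for g, _g in zip(grad, _grad)]

The full solution follows.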
import tensorflow as tf
from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer as lso
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.distribute import distribution_strategy_context as ds_context
from tensorflow.python.util import nest
from tensorflow.keras.models import Model as _Model
class Model(_Model):
def fit(self, *args, batch_size: int = 32, grad_accum_steps: int = 1, **kwargs):
"""
        Shallow wrapper of Model.fit that captures batch_size and the additional kwarg grad_accum_steps.
Parameters
----------
batch_size : int
same as in Model.fit
grad_accum_steps : int
            Number of steps to split `batch_size` into. `batch_size` must be divisible by `grad_accum_steps` (defaults to 1).
"""
        if grad_accum_steps == 1:
            return super().fit(*args, batch_size = batch_size, **kwargs)
self.train_function = None
num_workers = ds_context.get_strategy().num_replicas_in_sync
if batch_size % (grad_accum_steps * num_workers) != 0:
raise ValueError(f'Batch size ({batch_size}) must be divisible by the Gradient accumulation steps ({grad_accum_steps}), and the number of replicas ({num_workers}), dummy!')
self._grad_accum_ = grad_accum_steps
self._batch_size_ = batch_size
self._num_workers_ = num_workers
train_step_backup = self.train_step
self.train_step = self._train_step_
        out = super().fit(*args,
batch_size = self._batch_size_, # TODO maybe consider validation batch size
**kwargs)
del self._grad_accum_
del self._batch_size_
del self._num_workers_
self.train_step = train_step_backup
return out
def _train_step_(self, data):
"""
Custom training step taking into account gradient accumulation for low memory training
"""
if len(data) == 3:
x, y, sample_weight = data
else:
(x, y), sample_weight = data, None
def slice_map(struct, start, stop): # dealing with nasty nested structures
if struct is None:
return None # special case for sample_weight
return nest.map_structure(lambda x: x[start:stop], struct)
# ---------- GRAD ACCUM STUFF ----------------------------------------------------------------------------------
step = self._batch_size_ // self._num_workers_ // self._grad_accum_
x_ = slice_map(x, 0, step)
y_ = slice_map(y, 0, step)
w_ = slice_map(sample_weight, 0, step)
with tf.GradientTape() as tape:
y_pred = self(x_, training = True) # Forward pass
loss = self.compiled_loss(y_, y_pred, sample_weight = w_, regularization_losses = self.losses)
if isinstance(self.optimizer, lso.LossScaleOptimizer):
loss = self.optimizer.get_scaled_loss(loss)
gradients = tape.gradient(loss, self.trainable_variables)
gradients = [gradient * (1./self._grad_accum_) for gradient in gradients]
self.compiled_metrics.update_state(y_, y_pred)
i = tf.constant(step)
def cond(i, *args):
return i < self._batch_size_
def body(i, grad):
x_ = slice_map(x, i, i + step)
y_ = slice_map(y, i, i + step)
w_ = slice_map(sample_weight, i, i + step)
with tf.GradientTape() as tape:
y_pred = self(x_, training = True) # Forward pass
loss = self.compiled_loss(y_, y_pred, sample_weight = w_, regularization_losses = self.losses)
if isinstance(self.optimizer, lso.LossScaleOptimizer):
loss = self.optimizer.get_scaled_loss(loss)
_grad = tape.gradient(loss, self.trainable_variables)
_grad = [_g * (1./self._grad_accum_) for _g in _grad]
grad = [g + _g for g,_g in zip(grad, _grad)]
self.compiled_metrics.update_state(y_, y_pred)
return [i + step, grad]
i, gradients = tf.while_loop(cond, body, [i, gradients], parallel_iterations = 1)
# --------------------------------------------------------------------------------------------------------------
# ---------- STUFF FROM Model._minimize ------------------------------------------------------------------------
aggregate_grads_outside_optimizer = (self.optimizer._HAS_AGGREGATE_GRAD and not isinstance(self.distribute_strategy.extended, parameter_server_strategy.ParameterServerStrategyExtended))
if aggregate_grads_outside_optimizer: # TODO there might be some issues with the scaling, due to the extra accumulation steps
gradients = self.optimizer._aggregate_gradients(zip(gradients, self.trainable_variables))
if isinstance(self.optimizer, lso.LossScaleOptimizer):
gradients = self.optimizer.get_unscaled_gradients(gradients)
gradients = self.optimizer._clip_gradients(gradients)
if self.trainable_variables:
if aggregate_grads_outside_optimizer:
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables), experimental_aggregate_gradients = False)
else:
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
# --------------------------------------------------------------------------------------------------------------
return {m.name: m.result() for m in self.metrics}
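As a usage note (a sketch, assuming the class above is imported as Model and the MNIST arrays from the question are in scope), gradient accumulation is requested through the extra fit kwarg; a batch of 6000 with grad_accum_steps = 6 runs six sub-batches of 1000 samples per optimizer update:

model = Model(x, y)  # functional inputs/outputs built as in the question
model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer = tf.keras.optimizers.Adam(1e-4),
              metrics = ['acc'])
model.fit(x_train, y_train,
          validation_data = (x_valid, y_valid),
          batch_size = 6000,        # full batch handed to train_step
          grad_accum_steps = 6,     # split into 6 sub-batches of 1000
          epochs = 100)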
Related
I would like to build a neural network for regression analysis using Optuna, based on this site.
I would like to create a model with two 1-D inputs and one 1-D output, trained in batches.
x is the training data and y is the target data.
class Model(nn.Module):
    # constructor (initialization at instance creation)
def __init__(self,trial, mid_units1, mid_units2):
super(Model, self).__init__()
self.linear1 = nn.Linear(2, mid_units1)
self.bn1 = nn.BatchNorm1d(mid_units1)
self.linear2 = nn.Linear(mid_units1, mid_units2)
self.bn2 = nn.BatchNorm1d(mid_units2)
self.linear3 = nn.Linear(mid_units2, 1)
self.activation = trial_activation(trial)
def forward(self, x):
x = self.linear1(x)
x = self.bn1(x)
x = self.activation(x)
x = self.linear2(x)
device = "cuda" if torch.cuda.is_available() else "cpu"
EPOCH = 100
x = torch.from_numpy(a[0].astype(np.float32)).to(device)
y = torch.from_numpy(a[1].astype(np.float32)).to(device)
def train_epoch(model, optimizer, criterion):
model.train()
    optimizer.zero_grad() # reset gradients to zero
    y_pred = model(x) # prediction
    loss = criterion(y_pred.reshape(y.shape), y) # compute loss (align shapes)
    loss.backward() # compute gradients
    optimizer.step() # update parameters
return loss.item()
def trial_activation(trial):
activation_names = ['ReLU','logsigmoid']
activation_name = trial.suggest_categorical('activation', activation_names)
if activation_name == activation_names[0]:
activation = F.relu
else:
activation = F.logsigmoid
return activation
def objective(trial):
device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # trial values for the number of hidden-layer units
mid_units1 = int(trial.suggest_discrete_uniform("mid_units1", 1024*2,1024*4, 64*2))
mid_units2 = int(trial.suggest_discrete_uniform("mid_units2", 1024, 1024*2, 64*2))
net = Model(trial, mid_units1, mid_units2).to(device)
criterion = nn.MSELoss()
    # trial for the optimization method
optimizer = trial_optimizer(trial, net)
train_loss = 0
for epoch in range(EPOCH):
train_loss = train_epoch(net, optimizer, criterion, device)
torch.save(net.state_dict(), str(trial.number) + "new1.pth")
return train_loss
strage_name = "a.sql"
study_name = 'a'
study = optuna.create_study(
study_name = study_name,
storage='sqlite:///' + strage_name,
load_if_exists=True,
direction='minimize')
TRIAL_SIZE = 100
study.optimize(objective, n_trials=TRIAL_SIZE)
error message
---> 28 loss = criterion(y_pred.reshape(y.shape), y) # compute loss (align shapes)
     29 loss.backward() # compute gradients
     30 optimizer.step() # update parameters
AttributeError: 'NoneType' object has no attribute 'reshape'
Because of the above error, I checked the value of y_pred and found it to be None.
model.train()
optimizer.zero_grad()
I am thinking that these two lines may be wrong, but I don't know how to solve this problem.
In PyTorch, when you call y_pred = model(x), that calls the forward function defined in the Model class.
So y_pred receives the result of the forward function; in your case forward returns nothing, which is why you get a None value. You can change the forward function as below:
def forward(self, x):
x = self.linear1(x)
x = self.bn1(x)
x = self.activation(x)
x = self.linear2(x)
return x
I am trying to write code for regression using neural networks (for learning purposes).
Here is my code:
#fixme: k-fold cross validation
n_crossVal = 10
kf = KFold(n_splits = n_crossVal) #, random_state=1, shuffle=True fixme
for p_t in key_set_1:
cur_ds = []
for i, roi in enumerate(key_set_2):
if(i==0):
cur_ds = brain_ds[p_t + '_' + roi]
else:
cur_ds = np.hstack((cur_ds, brain_ds[p_t + '_' + roi]))
print(cur_ds.shape)
print(n_train)
size_input = cur_ds.shape[1]
preds_case = np.zeros(glove_ds.shape)
k_no = 0
for k_train_index, k_test_index in kf.split(cur_ds):
train_X_ds = torch.from_numpy(cur_ds[k_train_index, :])
train_y_ds = torch.from_numpy(glove_ds[k_train_index, :])
train_ds = TensorDataset(train_X_ds, train_y_ds)
test_X_ds = torch.from_numpy(cur_ds[k_test_index, :])
test_y_ds = torch.from_numpy(glove_ds[k_test_index, :])
test_ds = TensorDataset(test_X_ds, test_y_ds)
preds = fit_reg(train_ds, train_X_ds, train_y_ds, test_X_ds, test_y_ds, which_case, k_no, p_t)
k_no += 1
preds_case[k_test_index, :] = preds.detach().numpy()
and my model:
class RegressionNet(nn.Module):
def __init__(self):
super(RegressionNet, self).__init__()
self.linear1 = nn.Linear(size_input, size_hidden)
self.act1 = nn.ReLU()
self.linear2 = nn.Linear(size_hidden, size_output)
def forward(self, input_X):
X = self.linear1(input_X)
X = self.act1(X)
X = self.linear2(X)
return X
def fit_reg(train_ds, train_X_torch, train_y_torch, test_X_torch, case_type, fold_no, p_t):
num_epochs = 1
loss_fn = F.mse_loss
model = RegressionNet()
opt = torch.optim.SGD(model.parameters(), lr=1e-5)
for epoch in range(num_epochs):
print("num epoch: ", epoch)
for xb, yb in train_ds:
#not batch? fixme
#print(xb.shape, yb.shape, type(xb), type(yb))
pred = model(xb.float())
loss = loss_fn(pred, yb.float())
loss.backward()
opt.step()
opt.zero_grad()
print('Training loss: ', loss_fn(model(train_X_torch.float()), train_y_torch.float()))
pred_test_here = model(test_X_torch.float())
torch.save(model.state_dict(), './weights_' + case_type + '_' + str(fold_no) + '_' + p_t)
return pred_test_here
So I am using 10-fold cross validation. Each time, I pass nine tenths of my data into the network and test it on the rest.
My questions:
Is this the correct way to perform regression?
How can I send batches of data instead of one sample at a time for training?
After training finishes for some number of epochs, I report the training loss as the loss over all samples; is that correct?
Thanks in advance.
This question needs serious editing; no data is included, and the second block of code is used for data formatting, which is not easy to follow. But here's my example, in case it helps. I took the suggested model, made small changes, and wrote a 1D regression example:
import torch
from sklearn.model_selection import KFold
from torch import nn
import math
import torch.nn.functional as F
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import numpy as np
predictionFull = []
lossFull = []
# data
x = torch.unsqueeze(torch.linspace(-math.pi, math.pi, 1000), dim=1)
y = torch.sin(x**2) + 0.3*torch.rand(x.size())
fig, ax = plt.subplots(figsize=(12,7))
curve, = ax.plot(x, x, 'r-', linewidth=2)
time_text = ax.text(.5, .5, '', fontsize=15)
def update(i):
#label = 'timestep {0}'.format(i)
curve.set_ydata(predictionFull[i].data.numpy())
time_text.set_text('Loss = %.4f' % lossFull[i].data.numpy())
time_text.set_x(1.0)
time_text.set_y(-3.0)
time_text.set_color('red')
return curve
class RegressionNet(nn.Module):
def __init__(self, size_input, size_hidden, size_output):
super(RegressionNet, self).__init__()
self.linear1 = nn.Linear(size_input, size_hidden)
self.linear2 = nn.Linear(size_hidden, size_output)
def forward(self, input_X):
X = F.relu(self.linear1(input_X))
X = self.linear2(X)
return X
def fit_reg(x, y):
num_epochs = 2000
loss_fn = torch.nn.MSELoss()
model = RegressionNet(1, 500, 1)
opt = torch.optim.Adam(model.parameters(), lr=0.002)
for epoch in range(num_epochs):
pred = model(x) # input x and predict based on x
loss = loss_fn(pred, y) # must be (1. nn output, 2. target)
opt.zero_grad() # clear gradients for next train
loss.backward() # backpropagation, compute gradients
opt.step() # apply gradients
predictionFull.append(pred)
lossFull.append(loss)
fit_reg(x, y)
ax.scatter(x.data.numpy(), y.data.numpy(), color = "orange")
ax.set_xlim(-math.pi, math.pi)
ax.set_ylim(-math.pi, math.pi)
if __name__ == '__main__':
# FuncAnimation will call the 'update' function for each frame; here
# animating over 10 frames, with an interval of 200ms between frames.
anim = FuncAnimation(fig, update, frames=np.arange(0, 2000, 20), interval=2)
anim.save('./an.gif', writer='imagemagick', fps=500)
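For the batching part of the question, a minimal sketch (reusing train_ds, model, loss_fn and opt as named in the question's fit_reg): wrapping the TensorDataset in a DataLoader yields mini-batches instead of single rows.

from torch.utils.data import DataLoader

train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)  # 64 is just an example
for xb, yb in train_dl:            # xb, yb now carry 64 rows at a time
    pred = model(xb.float())
    loss = loss_fn(pred, yb.float())
    loss.backward()
    opt.step()
    opt.zero_grad()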
I am a beginner looking to code an ANN in PyTorch to predict the behaviour of a dynamic engineering system, a free-piston Stirling engine. The dataset consists of 6 inputs and 3 outputs, as shown below:
Dataset
I have basic code which I believe should be able to accommodate this task; however, I think there may be an issue with how the dataset is labelled and with the datatype used. I have tried converting to the LongTensor datatype, but it has not helped.
I receive the following error when changing the output datatype to float32:
"expected scalar type Long but found Float."
and when I put it as int64, I receive:
"Target 85 is out of bounds."
Please take a look, and any advice would be very appreciated. I have included the code below:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
SEED = 4096
torch.manual_seed(SEED)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
file_path = "./Dynamics of Sterling Engine Data(1).csv"
df = pd.read_csv(
file_path,
header=None,
names=[
"Kdp(N/m)",
"Kpp(N/m)",
"Cdp(Ns/m)",
"Cl(Ns/m)",
"mdp(kg)",
"mpp(kg)",
"f(Hz)",
"γ(DP/PP)",
"α(°)",
],
)
n = len(df.index) # 55
shuffle_indices = np.random.permutation(n)
df = df.iloc[shuffle_indices]
x = df.iloc[:, :6].values.astype(np.float32)
y = df.iloc[:, -3].values.astype(np.float32)
mu = x.mean(axis=0)
span = x.max(axis=0) - x.min(axis=0)
def rescale(inputs):
return (inputs - mu) / span
x = rescale(x)
num_train = int(n * 0.82)
num_test = n - num_train
x_train = x[:num_train]
y_train = y[:num_train]
x_test = x[-num_test:]
y_test = y[-num_test:]
class NpDataset(Dataset):
def __init__(self, data, label):
assert len(data) == len(label)
self.data = torch.from_numpy(data)
self.label = torch.from_numpy(label)
def __getitem__(self, index):
return self.data[index], self.label[index]
def __len__(self):
return len(self.label)
train_dataset = NpDataset(x_train, y_train)
test_dataset = NpDataset(x_test, y_test)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False)
device = torch.device("cpu")
print(device)
class SterlingNN(nn.Module):
def __init__(self):
super(SterlingNN, self).__init__()
        # 6 input features per data point
self.fn1 = nn.Linear(6, 6) # 6 features, 6 nodes in hidden layer
self.fn2 = nn.Linear(6, 3) # 6 nodes in hidden layer, 3 outputs
def forward(self, x):
x = torch.sigmoid(self.fn1(x)) # sigmoid activation function
x = self.fn2(x)
return x
model = SterlingNN()
print(model.to(device))
loss_fn = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(
model.parameters(), lr=0.01, weight_decay=0.01
)
x, y = next(iter(train_dataloader))
x = x[:5].to(device)
score = model(x)
print(score)
def train():
model.train() # model into training mode and iteratate through data loader
for x, y in train_dataloader:
x = x.to(device)
y = y.to(device)
n = x.size(0)
optimiser.zero_grad()
score = model(x)
loss = loss_fn(score, y)
loss.backward()
optimiser.step()
predictions = score.max(1, keepdim=True)[1]
num_correct = predictions.eq(y.view_as(predictions)).sum().item()
acc = num_correct / n
return loss, acc
def evaluate():
model.eval()
with torch.no_grad():
for x, y in test_dataloader:
x = x.to(device)
y = y.to(device)
n = x.size(0)
score = model(x)
loss = loss_fn(score, y)
predictions = score.max(1, keepdim=True)[1]
num_correct = predictions.eq(y.view_as(predictions)).sum().item()
acc = num_correct / n
return loss, acc
max_epochs = 128
for epoch in range(max_epochs):
tr_loss, tr_acc = train()
eva_loss, eva_acc = evaluate()
    print(
        f"[{epoch}/{max_epochs}] Train loss:{tr_loss:.4f} acc:{tr_acc*100:.2f}% - Test loss:{eva_loss:.4f} acc:{eva_acc*100:.2f}%"
    )
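For context on the two errors: nn.CrossEntropyLoss expects integer class indices in [0, num_classes) as targets, which is why float targets raise "expected scalar type Long but found Float" and an int64 target such as 85 falls outside the three output units ("Target 85 is out of bounds"). If the three outputs are continuous quantities, a regression loss is the usual choice; a minimal sketch under that assumption (the argmax-based accuracy would likewise need a regression metric such as MAE):

# sketch only: treat the task as regression over the last three columns
y = df.iloc[:, -3:].values.astype(np.float32)  # all three target columns, shape (n, 3)
loss_fn = nn.MSELoss()                         # regression loss instead of CrossEntropyLoss

def train():
    model.train()
    for x, y in train_dataloader:
        optimiser.zero_grad()
        score = model(x)            # (batch, 3) continuous predictions
        loss = loss_fn(score, y)    # floats compared to floats, no class indices
        loss.backward()
        optimiser.step()
    return loss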
My dataset (a network traffic dataset on which we do binary classification):
The number of features is 25, and I have normalized the dataset.
My ELM model:
class ELM:
def __init__(self, num_input_nodes, num_hidden_units, num_out_units, activation='sigmoid',
loss='bce', beta_init=None, w_init=None, bias_init=None):
self._num_input_nodes = num_input_nodes
self._num_hidden_units = num_hidden_units
self._num_out_units = num_out_units
self._activation = getActivation(activation)
self._loss = getLoss(loss)
if isinstance(beta_init, np.ndarray):
self._beta = beta_init
else:
self._beta = np.random.uniform(-1., 1., size=(self._num_hidden_units, self._num_out_units))
if isinstance(w_init, np.ndarray):
self._w = w_init
else:
self._w = np.random.uniform(-1, 1, size=(self._num_input_nodes, self._num_hidden_units))
if isinstance(bias_init, np.ndarray):
self._bias = bias_init
else:
self._bias = np.zeros(shape=(self._num_hidden_units,))
print('Bias shape:', self._bias.shape)
print('W shape:', self._w.shape)
print('Beta shape:', self._beta.shape)
def fit(self, X, Y, display_time=False):
H = self._activation(X.dot(self._w) + self._bias)
# Moore–Penrose pseudo inverse
if display_time:
start = time.time()
H_pinv = np.linalg.pinv(H)
if display_time:
stop = time.time()
print(f'Train time: {stop-start}')
self._beta = H_pinv.dot(Y)
# print('Fit Beta shape:', self._beta.shape)
def __call__(self, X):
H = self._activation(X.dot(self._w) + self._bias)
return H.dot(self._beta)
def evaluate(self, X, Y):
pred = self(X)
# Loss (base on model setting)
loss = self._loss(Y, pred)
# Accuracy
acc = np.sum(np.argmax(pred, axis=-1) == np.argmax(Y, axis=-1)) / len(Y)
# Unweighted Average Recall
# TODO
return loss, acc
# Network Settings
num_classes = 1
num_hidden_layers = 512
input_length = 25
When I run this on my dataset, the accuracy comes out as zero. I have taken sigmoid as the activation function and binary cross-entropy as the loss for my binary classification task.
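One likely reason (an assumption, since the labels are not shown): with num_out_units = 1, np.argmax over the single prediction column is always 0, while np.argmax over a 1-D label vector collapses to a single index, so the comparison in evaluate does not measure binary accuracy at all. For one continuous output fitted against 0/1 labels, thresholding is the usual way to score it; a minimal sketch:

# sketch only: binary accuracy for a single-output ELM with 0/1 labels
def binary_accuracy(model, X, Y, threshold=0.5):
    pred = model(X).ravel()                        # continuous scores, shape (n,)
    pred_labels = (pred >= threshold).astype(int)
    return np.mean(pred_labels == np.asarray(Y).ravel())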
I am using a Seq2Seq project from Google that uses an encoder/decoder. Here are the two classes, encoder and decoder:
#ENCODER
class EncoderNetwork(tf.keras.Model):
def __getstate__(self):
d = self.__dict__.copy()
d.pop('_parents', None)
return d
def __init__(self,input_vocab_size,embedding_dims, rnn_units ):
super().__init__()
self.encoder_embedding = tf.keras.layers.Embedding(input_dim=input_vocab_size,
output_dim=embedding_dims)
self.encoder_rnnlayer = tf.keras.layers.LSTM(rnn_units,return_sequences=True,
return_state=True )
encoder_embedding = self.encoder_embedding
encoder_rnnlayer = self.encoder_rnnlayer
#DECODER
class DecoderNetwork(tf.keras.Model):
def __getstate__(self):
d = self.__dict__.copy()
d.pop('_parents', None)
return d
def __init__(self,output_vocab_size, embedding_dims, rnn_units):
super().__init__()
self.decoder_embedding = tf.keras.layers.Embedding(input_dim=output_vocab_size,
output_dim=embedding_dims)
self.dense_layer = tf.keras.layers.Dense(output_vocab_size)
self.decoder_rnncell = tf.keras.layers.LSTMCell(rnn_units)
# Sampler
self.sampler = tfa.seq2seq.sampler.TrainingSampler()
# Create attention mechanism with memory = None
self.attention_mechanism = self.build_attention_mechanism(dense_units,None,BATCH_SIZE*[Tx])
self.rnn_cell = self.build_rnn_cell(BATCH_SIZE)
self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler= self.sampler,
output_layer=self.dense_layer)
def build_attention_mechanism(self, units,memory, memory_sequence_length):
return tfa.seq2seq.LuongAttention(units, memory = memory,
memory_sequence_length=memory_sequence_length)
#return tfa.seq2seq.BahdanauAttention(units, memory = memory, memory_sequence_length=memory_sequence_length)
# wrap decodernn cell
def build_rnn_cell(self, batch_size ):
rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnncell, self.attention_mechanism,
attention_layer_size=dense_units)
return rnn_cell
def build_decoder_initial_state(self, batch_size, encoder_state,Dtype):
decoder_initial_state = self.rnn_cell.get_initial_state(batch_size = batch_size,
dtype = Dtype)
decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
return decoder_initial_state
I create an instance of EncoderNetwork and DecoderNetwork with my arguments and use the loss_function and train_step defined below to train my model:
def loss_function(y_pred, y):
#shape of y [batch_size, ty]
#shape of y_pred [batch_size, Ty, output_vocab_size]
sparsecategoricalcrossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
reduction='none')
loss = sparsecategoricalcrossentropy(y_true=y, y_pred=y_pred)
mask = tf.logical_not(tf.math.equal(y,0)) #output 0 for y=0 else output 1
mask = tf.cast(mask, dtype=loss.dtype)
loss = mask* loss
loss = tf.reduce_mean(loss)
return loss
def train_step(input_batch, output_batch,encoder_initial_cell_state):
#initialize loss = 0
loss = 0
with tf.GradientTape() as tape:
encoder_emb_inp = encoderNetwork.encoder_embedding(input_batch)
a, a_tx, c_tx = encoderNetwork.encoder_rnnlayer(encoder_emb_inp,
initial_state =encoder_initial_cell_state)
#[last step activations,last memory_state] of encoder passed as input to decoder Network
# Prepare correct Decoder input & output sequence data
decoder_input = output_batch[:,:-1] # ignore <end>
#compare logits with timestepped +1 version of decoder_input
decoder_output = output_batch[:,1:] #ignore <start>
# Decoder Embeddings
decoder_emb_inp = decoderNetwork.decoder_embedding(decoder_input)
#Setting up decoder memory from encoder output and Zero State for AttentionWrapperState
decoderNetwork.attention_mechanism.setup_memory(a)
decoder_initial_state = decoderNetwork.build_decoder_initial_state(BATCH_SIZE,
encoder_state=[a_tx, c_tx],
Dtype=tf.float32)
#BasicDecoderOutput
outputs, _, _ = decoderNetwork.decoder(decoder_emb_inp,initial_state=decoder_initial_state,
sequence_length=BATCH_SIZE*[Ty-1])
logits = outputs.rnn_output
#Calculate loss
loss = loss_function(logits, decoder_output)
#Returns the list of all layer variables / weights.
variables = encoderNetwork.trainable_variables + decoderNetwork.trainable_variables
# differentiate loss wrt variables
gradients = tape.gradient(loss, variables)
#grads_and_vars – List of(gradient, variable) pairs.
grads_and_vars = zip(gradients,variables)
optimizer.apply_gradients(grads_and_vars)
return loss
The training does not use the fit() method; it is done like this:
epochs = 20
for i in range(1, epochs+1):
encoder_initial_cell_state = initialize_initial_state()
total_loss = 0.0
for ( batch , (input_batch, output_batch)) in enumerate(dataset.take(steps_per_epoch)):
batch_loss = train_step(input_batch, output_batch, encoder_initial_cell_state)
total_loss += batch_loss
if (batch+1)%5 == 0:
print("total loss: {} epoch {} batch {} ".format(batch_loss.numpy(), i, batch+1))
The results are fine and the custom predict function works perfectly, but how can I save the model? I tried pickle and Keras's save(), but neither works. Any ideas?
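One approach that generally works for subclassed models trained with a custom loop like this is to save the weights through a tf.train.Checkpoint covering both networks and the optimizer, rather than pickling the Python objects; a minimal sketch, assuming encoderNetwork, decoderNetwork and optimizer are the instances used above:

checkpoint = tf.train.Checkpoint(encoder=encoderNetwork,
                                 decoder=decoderNetwork,
                                 optimizer=optimizer)
manager = tf.train.CheckpointManager(checkpoint, './seq2seq_ckpts', max_to_keep=3)
manager.save()                     # call e.g. once per epoch, after the batches

# Later (or in another process): rebuild EncoderNetwork/DecoderNetwork with the same
# constructor arguments, then restore the weights before running the custom predict function.
checkpoint.restore(manager.latest_checkpoint)

Pickle tends to choke on the TF internals these classes hold, and Keras's save() expects a call()-based model, which neither class defines, so checkpointing the variables is the more reliable route here.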