Related
I am a beginner looking to code an ANN in PyTorch for the task of prediction for a dynamic engineering system of a Free Piston Sterling Engine. The dataset consists of 6 inputs and 3 outputs, as shown below:
Dataset
I have a basic code which I believe should be able to accommodate for this task, however I believe there may be an issue with the labelling of the dataset, and the datatype used. I have tried converting to longtensor datatype but it has not helped.
I receive the following error when changing the output datatype to float32:
"expected scalar type Long but found Float."
and when I put it as int64, I receive:
"Target 85 is out of bounds."
Please take a look, and any advice would be very appreciated. I have included the code below:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
SEED = 4096
torch.manual_seed(SEED)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
file_path = "./Dynamics of Sterling Engine Data(1).csv"
df = pd.read_csv(
file_path,
header=None,
names=[
"Kdp(N/m)",
"Kpp(N/m)",
"Cdp(Ns/m)",
"Cl(Ns/m)",
"mdp(kg)",
"mpp(kg)",
"f(Hz)",
"γ(DP/PP)",
"α(°)",
],
)
n = len(df.index) # 55
shuffle_indices = np.random.permutation(n)
df = df.iloc[shuffle_indices]
x = df.iloc[:, :6].values.astype(np.float32)
y = df.iloc[:, -3].values.astype(np.float32)
mu = x.mean(axis=0)
span = x.max(axis=0) - x.min(axis=0)
def rescale(inputs):
return (inputs - mu) / span
x = rescale(x)
num_train = int(n * 0.82)
num_test = n - num_train
x_train = x[:num_train]
y_train = y[:num_train]
x_test = x[-num_test:]
y_test = y[-num_test:]
class NpDataset(Dataset):
def __init__(self, data, label):
assert len(data) == len(label)
self.data = torch.from_numpy(data)
self.label = torch.from_numpy(label)
def __getitem__(self, index):
return self.data[index], self.label[index]
def __len__(self):
return len(self.label)
train_dataset = NpDataset(x_train, y_train)
test_dataset = NpDataset(x_test, y_test)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False)
device = torch.device("cpu")
print(device)
class SterlingNN(nn.Module):
def __init__(self):
super(SterlingNN, self).__init__()
# 6 input feautures per data point
self.fn1 = nn.Linear(6, 6) # 6 features, 6 nodes in hidden layer
self.fn2 = nn.Linear(6, 3) # 6 nodes in hidden layer, 3 outputs
def forward(self, x):
x = torch.sigmoid(self.fn1(x)) # sigmoid activation function
x = self.fn2(x)
return x
model = SterlingNN()
print(model.to(device))
loss_fn = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(
model.parameters(), lr=0.01, weight_decay=0.01
)
x, y = next(iter(train_dataloader))
x = x[:5].to(device)
score = model(x)
print(score)
def train():
model.train() # model into training mode and iteratate through data loader
for x, y in train_dataloader:
x = x.to(device)
y = y.to(device)
n = x.size(0)
optimiser.zero_grad()
score = model(x)
loss = loss_fn(score, y)
loss.backward()
optimiser.step()
predictions = score.max(1, keepdim=True)[1]
num_correct = predictions.eq(y.view_as(predictions)).sum().item()
acc = num_correct / n
return loss, acc
def evaluate():
model.eval()
with torch.no_grad():
for x, y in test_dataloader:
x = x.to(device)
y = y.to(device)
n = x.size(0)
score = model(x)
loss = loss_fn(score, y)
predictions = score.max(1, keepdim=True)[1]
num_correct = predictions.eq(y.view_as(predictions)).sum().item()
acc = num_correct / n
return loss, acc
max_epochs = 128
for epoch in range(max_epochs):
tr_loss, tr_acc = train()
eva_loss, eva_acc = evaluate()
print(
"[{epoch}/{max_epochs}] Train loss:{tr_loss:.4f} acc:{tr_acc*100:.2f}% - Test loss:{eva_loss:.4f} acc:{eva_acc*100:.2f}%".format()
)
I am working on clinical EHR.
I am currently referring to this blog and github link here.
https://sparklerussell.com/post/using-electronic-health-records-to-predict-future-diagnosis-codes-with-gated-recurrent-units/
https://github.com/sparalic/Electronic-Health-Records-GRUs
I have generated the dataset and processed it as per the instructions in the notebooks present in the repository. I am facing an issue trying to train the model.
using : jupytor notebook (with google colab)
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
import numpy as np
import itertools
import pickle
import sys, random
Load Data :
def load_data(sequences, labels):
dataSize = len(labels)
idx = np.random.permutation(dataSize)
nTest = int(np.ceil(0.15 * dataSize))
nValid = int(np.ceil(0.10 * dataSize))
test_idx = idx[:nTest]
valid_idx = idx[nTest:nTest+nValid]
train_idx = idx[nTest+nValid:]
train_x = sequences[train_idx]
train_y = labels[train_idx]
test_x = sequences[test_idx]
test_y = labels[test_idx]
valid_x = sequences[valid_idx]
valid_y = labels[valid_idx]
train_x = [sorted(seq) for seq in train_x]
train_y = [sorted(seq) for seq in train_y]
valid_x = [sorted(seq) for seq in valid_x]
valid_y = [sorted(seq) for seq in valid_y]
test_x = [sorted(seq) for seq in test_x]
test_y = [sorted(seq) for seq in test_y]
train = (train_x, train_y)
test = (test_x, test_y)
valid = (valid_x, valid_y)
return (train, test, valid)
Padding the input :
def padding(seqs, labels, vocab, n_classes):
lengths = np.array([len(seq) for seq in seqs]) - 1 # remove the last list in each patient's sequences for labels
n_samples = len(lengths)
maxlen = np.max(lengths)
x = torch.zeros(maxlen, n_samples, vocab) # maxlen = number of visits, n_samples = samples
y = torch.zeros(maxlen, n_samples, n_classes)
mask = torch.zeros(maxlen, n_samples)
for idx, (seq,label) in enumerate(zip(seqs,labels)):
for xvec, subseq in zip(x[:,idx,:], seq[:-1]):
xvec[subseq] = 1.
for yvec, subseq in zip(y[:,idx,:], label[1:]):
yvec[subseq] = 1.
mask[:lengths[idx], idx] = 1.
return x, y, lengths, mask
GRU Class :
torch.manual_seed(1)
class EHRNN(nn.Module):
def __init__(self, inputDimSize, hiddenDimSize,embSize, batchSize, numClass):
super(EHRNN, self).__init__()
self.hiddenDimSize = hiddenDimSize
self.inputDimSize = inputDimSize
self.embSize = embSize
self.numClass = numClass
self.batchSize = batchSize
#Initialize random weights
self.W_z = nn.Parameter(torch.randn(self.embSize, self.hiddenDimSize).cuda())
self.W_r = nn.Parameter(torch.randn(self.embSize, self.hiddenDimSize).cuda())
self.W_h = nn.Parameter(torch.randn(self.embSize, self.hiddenDimSize).cuda())
self.U_z = nn.Parameter(torch.randn(self.hiddenDimSize, self.hiddenDimSize).cuda())
self.U_r = nn.Parameter(torch.randn(self.hiddenDimSize, self.hiddenDimSize).cuda())
self.U_h = nn.Parameter(torch.randn(self.hiddenDimSize, self.hiddenDimSize).cuda())
self.b_z = nn.Parameter(torch.zeros(self.hiddenDimSize).cuda())
self.b_r = nn.Parameter(torch.zeros(self.hiddenDimSize).cuda())
self.b_h = nn.Parameter(torch.zeros(self.hiddenDimSize).cuda())
self.params = [self.W_z, self.W_r, self.W_h,
self.U_z, self.U_r, self.U_h,
self.b_z, self.b_r, self.b_h]
def forward(self,emb,h):
z = torch.sigmoid(torch.matmul(emb, self.W_z) + torch.matmul(h, self.U_z) + self.b_z)
r = torch.sigmoid(torch.matmul(emb, self.W_r) + torch.matmul(h, self.U_r) + self.b_r)
h_tilde = torch.tanh(torch.matmul(emb, self.W_h) + torch.matmul(r * h, self.U_h) + self.b_h)
h = z * h + ((1. - z) * h_tilde)
return h
def init_hidden(self):
return Variable(torch.zeros(self.batchSize,self.hiddenDimSize))
Custom Layer for handling two layer GRU :
torch.manual_seed(1)
class build_EHRNN(nn.Module):
def __init__(self, inputDimSize=4894, hiddenDimSize=[200,200], batchSize=100, embSize=200,numClass=4894, dropout=0.5,logEps=1e-8):
super(build_EHRNN, self).__init__()
self.inputDimSize = inputDimSize
self.hiddenDimSize = hiddenDimSize
self.numClass = numClass
self.embSize = embSize
self.batchSize = batchSize
self.dropout = nn.Dropout(p=0.5)
self.logEps = logEps
# Embedding inputs
self.W_emb = nn.Parameter(torch.randn(self.inputDimSize, self.embSize).cuda())
self.b_emb = nn.Parameter(torch.zeros(self.embSize).cuda())
self.W_out = nn.Parameter(torch.randn(self.hiddenDimSize, self.numClass).cuda())
self.b_out = nn.Parameter(torch.zeros(self.numClass).cuda())
self.params = [self.W_emb, self.W_out,
self.b_emb, self.b_out]
def forward(self,x, y, lengths, mask):
self.emb = torch.tanh(torch.matmul(x, self.W_emb) + self.b_emb)
input_values = self.emb
self.outputs = [input_values]
for i, hiddenSize in enumerate([self.hiddenDimSize, self.hiddenDimSize]): # iterate over layers
rnn = EHRNN(self.inputDimSize,hiddenSize,self.embSize,self.batchSize,self.numClass) # calculate hidden states
hidden_state = []
h = self.init_hidden().cuda()
for i,seq in enumerate(input_values): # loop over sequences in each batch
h = rnn(seq, h)
hidden_state.append(h)
hidden_state = self.dropout(torch.stack(hidden_state)) # apply dropout between layers
input_values = hidden_state
y_linear = torch.matmul(hidden_state, self.W_out) + self.b_out # fully connected layer
yhat = F.softmax(y_linear, dim=1) # yhat
yhat = yhat*mask[:,:,None] # apply mask
# Loss calculation
cross_entropy = -(y * torch.log(yhat + self.logEps) + (1. - y) * torch.log(1. - yhat + self.logEps))
last_step = -torch.mean(y[-1] * torch.log(yhat[-1] + self.logEps) + (1. - y[-1]) * torch.log(1. - yhat[-1] + self.logEps))
prediction_loss = torch.sum(torch.sum(cross_entropy, dim=0),dim=1)/ torch.cuda.FloatTensor(lengths)
cost = torch.mean(prediction_loss) + 0.000001 * (self.W_out ** 2).sum() # regularize
return (yhat, hidden_state, cost)
def init_hidden(self):
return torch.zeros(self.batchSize, self.hiddenDimSize) # initial state
Load data :
train, test, valid = load_data(sequences, labels)
Batch Size:
batchSize = 100
n_batches = int(np.ceil(float(len(train[0])) / float(batchSize)))-1
n_batches_valid = int(np.ceil(float(len(valid[0])) / float(batchSize)))-1
Model:
model = build_EHRNN(inputDimSize=4894, hiddenDimSize=200, batchSize=20, embSize=200, numClass=4894, dropout=0.5, logEps=1e-8)
model = model.to(device)
optimizer = torch.optim.Adadelta(model.parameters(), lr = 0.01, rho=0.90)
max_epochs = 5
loss_all = []
iteration = 0
for e in range(max_epochs):
for index in random.sample(range(n_batches), n_batches):
batchX = train[0][:n_batches*batchSize][index*batchSize:(index+1)*batchSize]
batchY = train[1][:n_batches*batchSize][index*batchSize:(index+1)*batchSize]
optimizer.zero_grad()
x, y, lengths, mask = padding(batchX, batchY, 4894, 4894)
if torch.cuda.is_available():
x, y, lenghts, mask = x.cuda(), y.cuda(), lengths, mask.cuda()
outputs, hidden, cost = model(x,y, h, lengths, mask)
if torch.cuda.is_available():
cost.cuda()
cost.backward()
nn.utils.clip_grad_norm_(model.parameters(), 5)
optimizer.step()
loss_all.append(cost.item())
iteration +=1
if iteration % 10 == 0:
# Calculate Accuracy
losses = []
model.eval()
val_loss = []
for index in random.sample(range(n_batches_valid), n_batches_valid):
validX = valid[0][:n_batches_valid*batchSize][index*batchSize:(index+1)*batchSize]
validY = valid[1][:n_batches_valid*batchSize][index*batchSize:(index+1)*batchSize]
x, y, lengths, mask = padding(validX, validY, 4894, 4894)
if torch.cuda.is_available():
x, y, lenghts, mask = x.cuda(), y.cuda(), lenghts, mask.cuda()
outputs, hidden_val, cost_val = model(x,y, h, lengths, mask)
losses.append(cost_val)
model.train()
print("Epoch: {}/{}...".format(e+1, max_epochs),
"Step: {}...".format(iteration),
"Training Loss: {:.4f}...".format(np.mean(loss_all)),
"Val Loss: {:.4f}".format(torch.mean(torch.tensor(losses))))
ERROR :
ValueError Traceback (most recent call last)
<ipython-input-76-9ca4916456a9> in <module>()
8
9 for e in range(max_epochs):
---> 10 for index in random.sample(range(n_batches), n_batches):
11 batchX = train[0][:n_batches*batchSize][index*batchSize:(index+1)*batchSize]
12 batchY = train[1][:n_batches*batchSize][index*batchSize:(index+1)*batchSize]
/usr/lib/python3.7/random.py in sample(self, population, k)
319 n = len(population)
320 if not 0 <= k <= n:
--> 321 raise ValueError("Sample larger than population or is negative")
322 result = [None] * k
323 setsize = 21 # size of a small set minus size of an empty list
ValueError: Sample larger than population or is negative
I tried many things but I couldn't solve the problem.
I'm trying to implement Gradient accumulation on TF2.x. All implementations I've found are either for TF1.x or for the old keras interface. I don't think there is an implementation out there (though I'd be very happy to be proven wrong on this).
Here's what I'm working with:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense
from tqdm import tqdm
import matplotlib.pyplot as plt
class SimpleTrainStepModel(Model):
def train_step(self, data):
# Unpack the data. Its structure depends on your model and
# on what you pass to `fit()`.
if len(data) == 3:
x, y, sample_weight = data
else:
(x, y), sample_weight = data, None
# FIRST GRADIENT
with tf.GradientTape() as tape:
y_pred = self(x, training = True) # Forward pass
loss = self.compiled_loss(y, y_pred, sample_weight = sample_weight, regularization_losses = self.losses)
gradients = tape.gradient(loss, self.trainable_variables)
self.compiled_metrics.update_state(y, y_pred)
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
return {m.name: m.result() for m in self.metrics}
class GradAccumModel(Model):
def fit(self, *args, batch_size = 32, grad_accum = 1, **kwargs):
self.train_function = None
if batch_size % grad_accum != 0:
raise ValueError('Batch size must be divisible by the Gradient accumulation steps, dummy!')
self.grad_accum = grad_accum
self.batch_size = batch_size
return super(GradAccumModel, self).fit(*args,
batch_size = self.batch_size,
#validation_batch_size = validation_batch_size,#self.batch_size//grad_accum if validation_batch_size is None else validation_batch_size,
**kwargs)
def train_step(self, data):
# Unpack the data. Its structure depends on your model and
# on what you pass to `fit()`.
if len(data) == 3:
x, y, sample_weight = data
else:
(x, y), sample_weight = data, None
step = self.batch_size // self.grad_accum
# def _slice_nested(obj, i, j):
# if type(obj) is list:
# return [o[i:j] for o in obj]
# else:
# return obj[i:j]
# FIRST GRADIENT
with tf.GradientTape() as tape:
y_pred = self(x[:step], training = True) # Forward pass
loss = self.compiled_loss(y[:step], y_pred, sample_weight = sample_weight, regularization_losses = self.losses)
gradients = tape.gradient(loss, self.trainable_variables)
self.compiled_metrics.update_state(y[:step], y_pred)
i = tf.constant(step)
# tf.print('TF - HERE!')
def cond(i, *args):
return i < self.batch_size
def body(i, grad):
# tf.print('\tTF - HERE!')
with tf.GradientTape() as tape:
y_pred = self(x[i:i + step], training = True) # Forward pass
loss = self.compiled_loss(y[i:i + step], y_pred, sample_weight = sample_weight, regularization_losses = self.losses)
_grad = tape.gradient(loss, self.trainable_variables)
for g,_g in zip(grad, _grad):
g += _g
self.compiled_metrics.update_state(y[i:i + step], y_pred)
return [i + step, grad]
i, gradients = tf.while_loop(cond, body, [i, gradients], parallel_iterations = 1)
# for g in gradients: # I tried with and without division co calculate the mean
# g *= 1/self.grad_accum #
# Update weights
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
# Update metrics (includes the metric that tracks the loss)
# Return a dict mapping metric names to current value
return {m.name: m.result() for m in self.metrics}
if __name__ == '__main__':
(x_train, y_train), (x_valid, y_valid) = tf.keras.datasets.mnist.load_data()
for MODEL, ga_kwarg, colour in list(zip([Model, SimpleTrainStepModel, GradAccumModel, GradAccumModel],
[{}, {}, {'grad_accum': 1}, {'grad_accum': 6}],
['blue', 'green', 'yellow', 'red'])):
for _ in tqdm(range(10)):
# tf.random.set_seed(0)
x = Input((28, 28))
y = x
y = Flatten()(y)
y = Dense(128, activation = 'sigmoid')(y)
y = Dense(10, activation = 'softmax')(y)
model = MODEL(x, y)
model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(),
optimizer = tf.keras.optimizers.Adam(1e-4),
metrics = ['acc'])
hist = model.fit(x_train, y_train, validation_data = (x_valid, y_valid), verbose = 0, batch_size = 6000, epochs = 100, **ga_kwarg)
plt.plot(hist.history['val_acc'], color = colour, alpha = .25)
plt.title('')
plt.xscale('symlog')
plt.yscale('logit')
plt.show()
I've been able to verify that it does actually save gpu memory. However, the end result is not the same as the normal Model.fit.
As you can see, the first three Model.fits are well clustered and give the same results. But when the the while cycle comes into play the training is quite different.
Anyone have any idea why this is happening?
After a lot more attempts I found the solution, It seems that the main problem was the compound assignments of the gradients which don't work quite as I was expecting. Here is my final solution for anyone who might be interested. It includes the extra stuff for distributed, mixed precision trainings, and nested input/output.
from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer as lso
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.distribute import distribution_strategy_context as ds_context
from tensorflow.python.util import nest
from tensorflow.keras.models import Model as _Model
class Model(_Model):
def fit(self, *args, batch_size: int = 32, grad_accum_steps: int = 1, **kwargs):
"""
Shallow wrapper of Model.fit that captures batch_size and additional kwarg: grad_accum.
Parameters
----------
batch_size : int
same as in Model.fit
grad_accum_steps : int
Number of steps to split batch_size into. The `batch_size` should be divisible by `grad_accum` (defaults to 1).
"""
if grad_accum_steps == 1:
super().fit(*args, batch_size = batch_size, **kwargs)
self.train_function = None
num_workers = ds_context.get_strategy().num_replicas_in_sync
if batch_size % (grad_accum_steps * num_workers) != 0:
raise ValueError(f'Batch size ({batch_size}) must be divisible by the Gradient accumulation steps ({grad_accum_steps}), and the number of replicas ({num_workers}), dummy!')
self._grad_accum_ = grad_accum_steps
self._batch_size_ = batch_size
self._num_workers_ = num_workers
train_step_backup = self.train_step
self.train_step = self._train_step_
out = super(self).fit(*args,
batch_size = self._batch_size_, # TODO maybe consider validation batch size
**kwargs)
del self._grad_accum_
del self._batch_size_
del self._num_workers_
self.train_step = train_step_backup
return out
def _train_step_(self, data):
"""
Custom training step taking into account gradient accumulation for low memory training
"""
if len(data) == 3:
x, y, sample_weight = data
else:
(x, y), sample_weight = data, None
def slice_map(struct, start, stop): # dealing with nasty nested structures
if struct is None:
return None # special case for sample_weight
return nest.map_structure(lambda x: x[start:stop], struct)
# ---------- GRAD ACCUM STUFF ----------------------------------------------------------------------------------
step = self._batch_size_ // self._num_workers_ // self._grad_accum_
x_ = slice_map(x, 0, step)
y_ = slice_map(y, 0, step)
w_ = slice_map(sample_weight, 0, step)
with tf.GradientTape() as tape:
y_pred = self(x_, training = True) # Forward pass
loss = self.compiled_loss(y_, y_pred, sample_weight = w_, regularization_losses = self.losses)
if isinstance(self.optimizer, lso.LossScaleOptimizer):
loss = self.optimizer.get_scaled_loss(loss)
gradients = tape.gradient(loss, self.trainable_variables)
gradients = [gradient * (1./self._grad_accum_) for gradient in gradients]
self.compiled_metrics.update_state(y_, y_pred)
i = tf.constant(step)
def cond(i, *args):
return i < self._batch_size_
def body(i, grad):
x_ = slice_map(x, i, i + step)
y_ = slice_map(y, i, i + step)
w_ = slice_map(sample_weight, i, i + step)
with tf.GradientTape() as tape:
y_pred = self(x_, training = True) # Forward pass
loss = self.compiled_loss(y_, y_pred, sample_weight = w_, regularization_losses = self.losses)
if isinstance(self.optimizer, lso.LossScaleOptimizer):
loss = self.optimizer.get_scaled_loss(loss)
_grad = tape.gradient(loss, self.trainable_variables)
_grad = [_g * (1./self._grad_accum_) for _g in _grad]
grad = [g + _g for g,_g in zip(grad, _grad)]
self.compiled_metrics.update_state(y_, y_pred)
return [i + step, grad]
i, gradients = tf.while_loop(cond, body, [i, gradients], parallel_iterations = 1)
# --------------------------------------------------------------------------------------------------------------
# ---------- STUFF FROM Model._minimize ------------------------------------------------------------------------
aggregate_grads_outside_optimizer = (self.optimizer._HAS_AGGREGATE_GRAD and not isinstance(self.distribute_strategy.extended, parameter_server_strategy.ParameterServerStrategyExtended))
if aggregate_grads_outside_optimizer: # TODO there might be some issues with the scaling, due to the extra accumulation steps
gradients = self.optimizer._aggregate_gradients(zip(gradients, self.trainable_variables))
if isinstance(self.optimizer, lso.LossScaleOptimizer):
gradients = self.optimizer.get_unscaled_gradients(gradients)
gradients = self.optimizer._clip_gradients(gradients)
if self.trainable_variables:
if aggregate_grads_outside_optimizer:
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables), experimental_aggregate_gradients = False)
else:
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
# --------------------------------------------------------------------------------------------------------------
return {m.name: m.result() for m in self.metrics}
am using an Seq2Seq project from Google that use Encoder/Decoder, there is the 2 encoder and decoder class :
#ENCODER
class EncoderNetwork(tf.keras.Model):
def __getstate__(self):
d = self.__dict__.copy()
d.pop('_parents', None)
return d
def __init__(self,input_vocab_size,embedding_dims, rnn_units ):
super().__init__()
self.encoder_embedding = tf.keras.layers.Embedding(input_dim=input_vocab_size,
output_dim=embedding_dims)
self.encoder_rnnlayer = tf.keras.layers.LSTM(rnn_units,return_sequences=True,
return_state=True )
encoder_embedding = self.encoder_embedding
encoder_rnnlayer = self.encoder_rnnlayer
#DECODER
class DecoderNetwork(tf.keras.Model):
def __getstate__(self):
d = self.__dict__.copy()
d.pop('_parents', None)
return d
def __init__(self,output_vocab_size, embedding_dims, rnn_units):
super().__init__()
self.decoder_embedding = tf.keras.layers.Embedding(input_dim=output_vocab_size,
output_dim=embedding_dims)
self.dense_layer = tf.keras.layers.Dense(output_vocab_size)
self.decoder_rnncell = tf.keras.layers.LSTMCell(rnn_units)
# Sampler
self.sampler = tfa.seq2seq.sampler.TrainingSampler()
# Create attention mechanism with memory = None
self.attention_mechanism = self.build_attention_mechanism(dense_units,None,BATCH_SIZE*[Tx])
self.rnn_cell = self.build_rnn_cell(BATCH_SIZE)
self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler= self.sampler,
output_layer=self.dense_layer)
def build_attention_mechanism(self, units,memory, memory_sequence_length):
return tfa.seq2seq.LuongAttention(units, memory = memory,
memory_sequence_length=memory_sequence_length)
#return tfa.seq2seq.BahdanauAttention(units, memory = memory, memory_sequence_length=memory_sequence_length)
# wrap decodernn cell
def build_rnn_cell(self, batch_size ):
rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnncell, self.attention_mechanism,
attention_layer_size=dense_units)
return rnn_cell
def build_decoder_initial_state(self, batch_size, encoder_state,Dtype):
decoder_initial_state = self.rnn_cell.get_initial_state(batch_size = batch_size,
dtype = Dtype)
decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
return decoder_initial_state
i create an instance of EncoderNetwork and DecoderNetwork with my argument and use the loss_function and train_step already defined to train my model
def loss_function(y_pred, y):
#shape of y [batch_size, ty]
#shape of y_pred [batch_size, Ty, output_vocab_size]
sparsecategoricalcrossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
reduction='none')
loss = sparsecategoricalcrossentropy(y_true=y, y_pred=y_pred)
mask = tf.logical_not(tf.math.equal(y,0)) #output 0 for y=0 else output 1
mask = tf.cast(mask, dtype=loss.dtype)
loss = mask* loss
loss = tf.reduce_mean(loss)
return loss
def train_step(input_batch, output_batch,encoder_initial_cell_state):
#initialize loss = 0
loss = 0
with tf.GradientTape() as tape:
encoder_emb_inp = encoderNetwork.encoder_embedding(input_batch)
a, a_tx, c_tx = encoderNetwork.encoder_rnnlayer(encoder_emb_inp,
initial_state =encoder_initial_cell_state)
#[last step activations,last memory_state] of encoder passed as input to decoder Network
# Prepare correct Decoder input & output sequence data
decoder_input = output_batch[:,:-1] # ignore <end>
#compare logits with timestepped +1 version of decoder_input
decoder_output = output_batch[:,1:] #ignore <start>
# Decoder Embeddings
decoder_emb_inp = decoderNetwork.decoder_embedding(decoder_input)
#Setting up decoder memory from encoder output and Zero State for AttentionWrapperState
decoderNetwork.attention_mechanism.setup_memory(a)
decoder_initial_state = decoderNetwork.build_decoder_initial_state(BATCH_SIZE,
encoder_state=[a_tx, c_tx],
Dtype=tf.float32)
#BasicDecoderOutput
outputs, _, _ = decoderNetwork.decoder(decoder_emb_inp,initial_state=decoder_initial_state,
sequence_length=BATCH_SIZE*[Ty-1])
logits = outputs.rnn_output
#Calculate loss
loss = loss_function(logits, decoder_output)
#Returns the list of all layer variables / weights.
variables = encoderNetwork.trainable_variables + decoderNetwork.trainable_variables
# differentiate loss wrt variables
gradients = tape.gradient(loss, variables)
#grads_and_vars – List of(gradient, variable) pairs.
grads_and_vars = zip(gradients,variables)
optimizer.apply_gradients(grads_and_vars)
return loss
the training does not use fit() methode but like this :
epochs = 20
for i in range(1, epochs+1):
encoder_initial_cell_state = initialize_initial_state()
total_loss = 0.0
for ( batch , (input_batch, output_batch)) in enumerate(dataset.take(steps_per_epoch)):
batch_loss = train_step(input_batch, output_batch, encoder_initial_cell_state)
total_loss += batch_loss
if (batch+1)%5 == 0:
print("total loss: {} epoch {} batch {} ".format(batch_loss.numpy(), i, batch+1))
the result are fine and the predict fonction work perfectly (custom predict function), but how can i save the model ? i tried pickel and keras.save() but it doesn't work any idea ?
I am trying to run some code on Deep embedding clustering on mnist with the help of Keras , however, I get the following error
from keras.datasets import mnist
import numpy as np
import keras.backend as K
from keras.engine.topology import Layer, InputSpec
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import SGD
from keras import callbacks
from keras.initializers import VarianceScaling
from sklearn.cluster import KMeans
def autoencoder(dims, act='relu', init='glorot_uniform'):
n_stacks = len(dims) - 1
# input
input_img = Input(shape=(dims[0],), name='input')
x = input_img
# internal layers in encoder
for i in range(n_stacks-1):
x = Dense(dims[i + 1], activation=act, kernel_initializer=init, name='encoder_%d' % i)(x)
# hidden layer
encoded = Dense(dims[-1], kernel_initializer=init, name='encoder_%d' % (n_stacks - 1))(x) # hidden layer, features are extracted from here
x = encoded
# internal layers in decoder
for i in range(n_stacks-1, 0, -1):
x = Dense(dims[i], activation=act, kernel_initializer=init, name='decoder_%d' % i)(x)
# output
x = Dense(dims[0], kernel_initializer=init, name='decoder_0')(x)
decoded = x
return Model(inputs=input_img, outputs=decoded, name='AE'), Model(inputs=input_img, outputs=encoded, name='encoder')
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x = np.concatenate((x_train, x_test))
y = np.concatenate((y_train, y_test))
x = x.reshape((x.shape[0], -1))
x = np.divide(x, 255.)
n_clusters = len(np.unique(y))
kmeans = KMeans(n_clusters=n_clusters, n_init=20, n_jobs=4)
y_pred_kmeans = kmeans.fit_predict(x)
dims = [x.shape[-1], 500, 500, 2000, 10]
init = VarianceScaling(scale=1. / 3., mode='fan_in',
distribution='uniform')
pretrain_optimizer = SGD(lr=1, momentum=0.9)
pretrain_epochs = 300
batch_size = 256
save_dir = './results'
autoencoder, encoder = autoencoder(dims, init=init)
autoencoder.compile(optimizer=pretrain_optimizer, loss='mse')
autoencoder.fit(x, x, batch_size=batch_size, epochs=pretrain_epochs) #, callbacks=cb)
autoencoder.save_weights(save_dir + '/ae_weights.h5')
class ClusteringLayer(Layer):
def __init__(self, n_clusters, weights=None, alpha=1.0, **kwargs):
if 'input_shape' not in kwargs and 'input_dim' in kwargs:
kwargs['input_shape'] = (kwargs.pop('input_dim'),)
super(ClusteringLayer, self).__init__(**kwargs)
self.n_clusters = n_clusters
self.alpha = alpha
self.initial_weights = weights
self.input_spec = InputSpec(ndim=2)
def build(self, input_shape):
assert len(input_shape) == 2
input_dim = input_shape[1]
self.input_spec = InputSpec(dtype=K.floatx(), shape=(None, input_dim))
self.clusters = self.add_weight((self.n_clusters, input_dim), initializer='glorot_uniform', name='clusters')
if self.initial_weights is not None:
self.set_weights(self.initial_weights)
del self.initial_weights
self.built = True
def call(self, inputs, **kwargs):
q = 1.0 / (1.0 + (K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2) / self.alpha))
q **= (self.alpha + 1.0) / 2.0
q = K.transpose(K.transpose(q) / K.sum(q, axis=1)) # Make sure each sample's 10 values add up to 1.
return q
def compute_output_shape(self, input_shape):
assert input_shape and len(input_shape) == 2
return input_shape[0], self.n_clusters
def get_config(self):
config = {'n_clusters': self.n_clusters}
base_config = super(ClusteringLayer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
clustering_layer = ClusteringLayer(n_clusters, name='clustering')(encoder.output)
model = Model(inputs=encoder.input, outputs=clustering_layer)
model.compile(optimizer=SGD(0.01, 0.9), loss='kld')
y_pred_last = np.copy(y_pred_kmeans)
model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_])
# computing an auxiliary target distribution
def target_distribution(q):
weight = q ** 2 / q.sum(0)
return (weight.T / weight.sum(1)).T
loss = 0
index = 0
maxiter = 8000
update_interval = 140
index_array = np.arange(x.shape[0])
tol = 0.001 # tolerance threshold to stop training
for ite in range(int(maxiter)):
if ite % update_interval == 0:
q = model.predict(x, verbose=2 )
p = target_distribution(q) # update the auxiliary target distribution p
# evaluate the clustering performance
y_pred = q.argmax(1)
if y is not None:
acc = np.round(metrics.acc(y, y_pred), 5)
nmi = np.round(metrics.nmi(y, y_pred), 5)
ari = np.round(metrics.ari(y, y_pred), 5)
loss = np.round(loss, 5)
print('Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f' % (ite, acc, nmi, ari), ' ; loss=', loss)
# check stop criterion - model convergence
delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
y_pred_last = np.copy(y_pred)
if ite > 0 and delta_label < tol:
print('delta_label ', delta_label, '< tol ', tol)
print('Reached tolerance threshold. Stopping training.')
break
idx = index_array[index * batch_size: min((index+1) * batch_size, x.shape[0])]
loss = model.train_on_batch(x=x[idx], y=p[idx])
index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0
model.save_weights(save_dir + '/DEC_model_final.h5')
model.load_weights(save_dir + '/DEC_model_final.h5')
the error:
with session.graph.as_default():
AttributeError: 'NoneType' object has no attribute 'graph'
(the problem might be in saving the model but I can't figure out why I am wrong.) my code runs perfectly in jupyter notebook but I can't run it in an editor like pycharm .please help.