I have a model implemented in pytorch that applies a final fully connected layer before running the softmax function.
The architecture is defined to solve a 4-class Speech Emotion Recognition task: given an audio track, it transforms it into its spectrogram and uses it to predict the emotion between happiness, sadness, neutrality and anger.
Unlike the architecture of the paper, it attempts to adapt the implementation of the Compact Convolutional Transformer found on Github at the link https://github.com/SHI-Labs/Compact-Transformers/blob/main/src/cct.py.
To improve the performance of the model I am following some tricks defined in the paper https://arxiv.org/abs/2104.07288.
Like what is described in the paper, however, my model also suffers from a "class collapse" problem: even by balancing the dataset, it tends to predict the anger and sadness classes well and the other two badly.
In the paper to solve this problem they apply a particular weight regularization technique to the fully connected layer, described in chapter 2.4.
Unfortunately, however, I cannot understand how I should modify my fully connected layer in pytorch to implement this type of regularization.
Code of the model:
class CCT(nn.Module):
def __init__(self,
img_size=224,
embedding_dim=768,
n_input_channels=3,
n_conv_layers=1,
kernel_size=7,
stride=2,
padding=3,
pooling_kernel_size=3,
pooling_stride=2,
pooling_padding=1,
dropout=0.,
attention_dropout=0.1,
stochastic_depth=0.1,
num_layers=14,
num_heads=6,
mlp_ratio=4.0,
num_classes=1000,
positional_embedding='learnable',
*args, **kwargs):
super(CCT, self).__init__()
self.tokenizer = Tokenizer(n_input_channels=n_input_channels,
n_output_channels=embedding_dim,
kernel_size=kernel_size,
stride=stride,
padding=padding,
pooling_kernel_size=pooling_kernel_size,
pooling_stride=pooling_stride,
pooling_padding=pooling_padding,
max_pool=True,
activation=nn.ReLU,
n_conv_layers=n_conv_layers,
conv_bias=False)
self.classifier = TransformerClassifier(
sequence_length=self.tokenizer.sequence_length(n_channels=n_input_channels,
height=img_size,
width=img_size),
embedding_dim=embedding_dim,
seq_pool=True,
dropout=dropout,
attention_dropout=attention_dropout,
stochastic_depth=stochastic_depth,
num_layers=num_layers,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
num_classes=num_classes,
positional_embedding=positional_embedding
)
def forward(self, x):
x = self.tokenizer(x)
return self.classifier(x)
class Tokenizer(nn.Module):
def __init__(self,
kernel_size, stride, padding,
pooling_kernel_size=3, pooling_stride=2, pooling_padding=1,
n_conv_layers=1,
n_input_channels=3,
n_output_channels=64,
in_planes=64,
activation=None,
max_pool=True,
conv_bias=False):
super(Tokenizer, self).__init__()
n_filter_list = [n_input_channels] + \
[in_planes for _ in range(n_conv_layers - 1)] + \
[n_output_channels]
self.conv_layers = nn.Sequential(
*[nn.Sequential(
nn.Conv2d(n_filter_list[i], n_filter_list[i + 1],
kernel_size=(kernel_size, kernel_size),
stride=(stride, stride),
padding=(padding, padding), bias=conv_bias),
nn.Identity() if activation is None else activation(),
nn.MaxPool2d(kernel_size=pooling_kernel_size,
stride=pooling_stride,
padding=pooling_padding) if max_pool else nn.Identity()
)
for i in range(n_conv_layers)
])
self.flattener = nn.Flatten(2, 3)
self.apply(self.init_weight)
def sequence_length(self, n_channels=3, height=224, width=224):
return self.forward(torch.zeros((1, n_channels, height, width))).shape[1]
def forward(self, x):
return self.flattener(self.conv_layers(x)).transpose(-2, -1)
#staticmethod
def init_weight(m):
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight)
class TransformerClassifier(Module):
def __init__(self,
seq_pool=True,
embedding_dim=768,
num_layers=12,
num_heads=12,
mlp_ratio=4.0,
num_classes=1000,
dropout=0.1,
attention_dropout=0.1,
stochastic_depth=0.1,
positional_embedding='learnable',
sequence_length=None):
super().__init__()
positional_embedding = positional_embedding if \
positional_embedding in ['sine', 'learnable', 'none'] else 'sine'
dim_feedforward = int(embedding_dim * mlp_ratio)
self.embedding_dim = embedding_dim
self.sequence_length = sequence_length
self.seq_pool = seq_pool
self.num_tokens = 0
assert sequence_length is not None or positional_embedding == 'none', \
f"Positional embedding is set to {positional_embedding} and" \
f" the sequence length was not specified."
if not seq_pool:
sequence_length += 1
self.class_emb = Parameter(torch.zeros(1, 1, self.embedding_dim),
requires_grad=True)
self.num_tokens = 1
else:
self.attention_pool = Linear(self.embedding_dim, 1)
if positional_embedding != 'none':
if positional_embedding == 'learnable':
self.positional_emb = Parameter(torch.zeros(1, sequence_length, embedding_dim),
requires_grad=True)
init.normal_(self.positional_emb, std=0.2)
else:
self.positional_emb = Parameter(self.sinusoidal_embedding(sequence_length, embedding_dim),
requires_grad=False)
else:
self.positional_emb = None
self.dropout = Dropout(p=dropout)
dpr = [x.item() for x in torch.linspace(0, stochastic_depth, num_layers)]
self.blocks = ModuleList([
TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads,
dim_feedforward=dim_feedforward, dropout=dropout,
attention_dropout=attention_dropout, drop_path_rate=dpr[i])
for i in range(num_layers)])
self.norm = LayerNorm(embedding_dim)
self.fc = Linear(embedding_dim, num_classes)
self.apply(self.init_weight)
def forward(self, x):
if self.positional_emb is None and x.size(1) < self.sequence_length:
x = F.pad(x, (0, 0, 0, self.n_channels - x.size(1)), mode='constant', value=0)
if not self.seq_pool:
cls_token = self.class_emb.expand(x.shape[0], -1, -1)
x = torch.cat((cls_token, x), dim=1)
if self.positional_emb is not None:
x += self.positional_emb
x = self.dropout(x)
for blk in self.blocks:
x = blk(x)
x = self.norm(x)
if self.seq_pool:
x = torch.matmul(F.softmax(self.attention_pool(x), dim=1).transpose(-1, -2), x).squeeze(-2)
else:
x = x[:, 0]
x = self.fc(x)
return x
Can someone help me?
As you have not shared any network architecture I will try to give a basic example. I am not sure the regularization of the paper but I will give a simple example that will apply L1 regularization on specific layer (e.g. layer 0)
my_model = nn.Sequential(
nn.Linear(5, 5),
nn.ReLU(),
nn.Linear(5, 2)
)
x = torch.randn(5, 5)
target = torch.ones(5, dtype=torch.long)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-2)
for epoch in range(10):
optimizer.zero_grad()
output = model(x)
loss = criterion(output, target)
# This will be your weight regularization, choose your layer like model[0] and apply normalization that you want on that layer weights
l1_norm = torch.norm(model[0].weight, p=1)
loss += l1_norm
loss.backward()
optimizer.step()
print('Epoch {}, loss {}, norm layer {}'.format(
epoch, loss.item(), l1_norm.item()))
Related
I was reading guide in which an author used model.train() in each epoch because of the DropOut layer (he didn't use Pytorch Lightning). The question is - should i include model.train() in my Pytorch Lightning Module and if so, how do i do it ? (Or it does it automatically ?)
The code is below
class MulticlassClassificationLIGHT(pl.LightningModule):
def __init__(self,class_weights):
super(MulticlassClassificationLIGHT, self).__init__()
self.num_feature=35
self.num_class=36
self.layer_1 = nn.Linear(self.num_feature, 512)
self.layer_2 = nn.Linear(512, 128)
self.layer_3 = nn.Linear(128, 64)
self.layer_out = nn.Linear(64, self.num_class)
self.relu = nn.ReLU()
self.dropout = nn.Dropout(p=0.2)
self.batchnorm1 = nn.BatchNorm1d(512)
self.batchnorm2 = nn.BatchNorm1d(128)
self.batchnorm3 = nn.BatchNorm1d(64)
self.loss = nn.CrossEntropyLoss(weight=class_weights.to(device))
def forward(self, x):
x = self.layer_1(x)
x = self.batchnorm1(x)
x = self.relu(x)
x = self.layer_2(x)
x = self.batchnorm2(x)
x = self.relu(x)
x = self.dropout(x)
x = self.layer_3(x)
x = self.batchnorm3(x)
x = self.relu(x)
x = self.dropout(x)
x = self.layer_out(x)
return x
def training_step(self, batch, batch_idx):
x, y = batch
logits = self.forward(x)
loss = self.loss(logits, y)
self.log("train_loss", loss, prog_bar=True, logger=True)
return loss
And here is an example from guide
for e in tqdm(range(1, EPOCHS+1)):
train_epoch_loss = 0
train_epoch_acc = 0
model.train()
for X_train_batch, y_train_batch in train_loader:
X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
optimizer.zero_grad()
I use Trainer for training. Here is it
trainer = pl.Trainer(
devices="auto",
accelerator="auto",
auto_lr_find=False,
auto_scale_batch_size=True,
fast_dev_run=False,
num_sanity_val_steps=3,
logger=logger,
min_epochs=EPOCHS,
)
And how I use it
trainer.fit(model, data_module_classifier.train_dataloader(),data_module_classifier.val_dataloader() )
There is just no information in the documentation and solutions are only in usual Pytorch, but not in Pytorch Lightning
I am facing a multiclass classification problem related to the activity of some drugs using Pytorch neural net, I have three activity classes (0, 1 and 2), to tackle the problem I adopted the one vs. one approach, thus creating three binary classifiers: 0 vs. 1, 1 vs. 2 and 2 vs. 0. When I train the second classifier (class 1 vs. class 2) I get the following error:
IndexError: Target 2 is out of bounds.
Is there a method to solve it without reassigning labels? Thank you all!
This is my net, is a Graph Isomorphism Network build with Pytorch Geometric:
class GIN1(torch.nn.Module):
def __init__(self, h):
super(GIN1, self).__init__()
dim_h_conv = h
dim_h_fc = dim_h_conv*5
# Convolutional layers
self.conv1 = GINConv(Sequential(Linear(14, dim_h_conv),
BatchNorm1d(dim_h_conv), ReLU(),
Linear(dim_h_conv, dim_h_conv), ReLU()))
self.conv2 = GINConv(Sequential(Linear(dim_h_conv, dim_h_conv),
BatchNorm1d(dim_h_conv), ReLU(),
Linear(dim_h_conv, dim_h_conv), ReLU()))
self.conv3 = GINConv(Sequential(Linear(dim_h_conv, dim_h_conv),
BatchNorm1d(dim_h_conv), ReLU(),
Linear(dim_h_conv, dim_h_conv), ReLU()))
self.conv4 = GINConv(Sequential(Linear(dim_h_conv, dim_h_conv),
BatchNorm1d(dim_h_conv), ReLU(),
Linear(dim_h_conv, dim_h_conv), ReLU()))
self.conv5 = GINConv(Sequential(Linear(dim_h_conv, dim_h_conv),
BatchNorm1d(dim_h_conv), ReLU(),
Linear(dim_h_conv, dim_h_conv), ReLU()))
# Fully connected layers
self.lin1 = Linear(dim_h_fc, dim_h_fc)
self.lin2 = Linear(dim_h_fc, 2)
self.initialize_w()
def forward(self, x, edge_index, batch):
h1 = self.conv1(x, edge_index)
h2 = self.conv2(h1, edge_index)
h3 = self.conv3(h2, edge_index)
h4 = self.conv4(h3, edge_index)
h5 = self.conv5(h4, edge_index)
# Graph level readout
h1 = global_add_pool(h1, batch)
h2 = global_add_pool(h2, batch)
h3 = global_add_pool(h3, batch)
h4 = global_add_pool(h4, batch)
h5 = global_add_pool(h5, batch)
# Concatenate graph embeddings
h = torch.cat((h1, h2, h3, h4, h5), dim=1)
# Classifier
h = self.lin1(h)
h = h.relu()
h = F.dropout(h, p=hp_gin1['p'], training=self.training)
h = self.lin2(h)
h = F.log_softmax(h, dim=1)
return h
def initialize_w(self):
for m in self.modules():
if isinstance(m, Linear):
torch.nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
torch.nn.init.constant_(m.bias, 0)
if isinstance(m, BatchNorm1d):
torch.nn.init.constant_(m.weight, 1)
torch.nn.init.constant_(m.bias, 0)
And this is my training loop:
gin2 = GIN2(h=hp_gin2['h']) #40
optimizer = torch.optim.Adam(gin2.parameters(), lr=hp_gin2['lr'])
criterion = torch.nn.CrossEntropyLoss()
def train(train_loader):
gin2.train()
loss_all = 0
for data in train_loader:
output = gin2(data.x, data.edge_index, data.batch)
loss = criterion(output, data.y)
l2_lambda = hp_gin2['lambda']
l2_norm = sum(p.pow(2.0).sum()
for p in gin2.parameters())
loss = loss + l2_lambda * l2_norm
optimizer.zero_grad()
loss.backward()
optimizer.step()
loss_all += loss.item() * data.num_graphs
return loss_all / len(train_loader.dataset)
def test_loss(loader):
total_loss_val = 0
with torch.no_grad():
for data in loader:
output = gin2(data.x, data.edge_index, data.batch)
batch_loss = criterion(output, data.y)
total_loss_val += batch_loss.item() * data.num_graphs
return total_loss_val / len(loader.dataset)
def test(loader):
gin2.eval()
correct = 0
for data in loader:
output = gin2(data.x, data.edge_index, data.batch)
accuracy = Accuracy(average='macro', num_classes=2)
acc = accuracy(output, data.y)
return acc
OP needed to match the output dimension of their model with the number of label classes (see discussion).
I'm trying to build a custom Transformer model within TensorFlow but I'm stuck. I always receive this error:
OperatorNotAllowedInGraphError: Iterating over a symbolic `tf.Tensor` is not allowed: AutoGraph did convert this function. This might indicate you are trying to use an unsupported feature.
Of course I did some research and I found many solutions, but I'm failing to implement these to my own model since I'm not even sure where exactly this error occurs. I tried to remove all for loops, but the error still remained. I hope someone can help me and point me to the right direction.
I'm using:
python: 3.9
tensorflow: 2.9.1
keras: 2.9.0
import tensorflow as tf
import keras.backend as K # for custom loss function
import tensorflow_probability as tfp
from tensorflow.python.keras.layers import Dense, Dropout, GlobalAveragePooling1D, MultiHeadAttention, Input, Conv1D
from keras.layers import LayerNormalization
import numpy as np
class MultiAttention(tf.keras.Model):
def __init__(self, head_size, num_heads, dropout_attention, dropout1, dropout2, ff_dim):
super(MultiAttention, self).__init__()
# define all layers in init
self.LN1 = LayerNormalization(epsilon=1e-6)
self.MHA = MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout_attention)
self.Dropout1 = Dropout(dropout1)
self.LN2 = LayerNormalization(epsilon=1e-6)
self.CONV1 = Conv1D(filters=ff_dim, kernel_size=1, activation="relu")
self.Dropout2 = Dropout(dropout2)
self.CONV2 = None
def build(self, input_shape):
self.CONV2 = Conv1D(filters=input_shape.shape[-1], kernel_size=1)
def call(self, inputs, training=False):
# Normalization and Attention
x = self.LN1(epsilon=1e-6)(inputs)
x = self.MHA(x, x)
x = self.Dropout1(x)
x_att = x + inputs
# Feed Forward Part
x = self.LN2(x_att)
x = self.Conv1(x)
x = self.Dropout2(x)
x = self.Conv1(x)
return x + x_att
class Transformer(tf.keras.Model): # tf.keras.Model inherits model.fit/predict/evaluate. tf.keras.Layer can't do this
def __init__(self, head_size, num_heads, dropout_attention, dropout1, dropout2, ff_dim,
num_transformer_blocks, mlp_units, mlp_dropout):
super(Transformer, self).__init__()
# define all layers in init
self.head_size = head_size
self.num_heads = num_heads
self.dropout_attention = dropout_attention
self.dropout1 = dropout1
self.dropout2 = dropout2
self.ff_dim = ff_dim
self.num_transformer_blocks = num_transformer_blocks
self.mlp_units = mlp_units
self.GAP = GlobalAveragePooling1D(data_format="channels_first")
self.Dense1 = Dense(mlp_units, activation="relu")
self.Dropout = Dropout(mlp_dropout)
self.Dense2 = Dense(1, activation="softmax")
self.MA = None
def build(self, input_shape):
self.MA = MultiAttention(self.head_size, self.num_heads, self.dropout_attention,
self.dropout1, self.dropout2, self.ff_dim)
# for dim in range(self.mlp_units):
# self.Dense1_list.append(Dense(dim, actiavtion="relu"))
def call(self, input_shape, training=False):
x = Input(input_shape)
for _ in range(self.num_transformer_blocks):
x = self.MA(x)
x = self.GAP(x)
for unit in self.Dense1_list:
x = unit(x)
x = self.Dropout(x)
x = self.Dense1(x)
x = self.Dropout(x)
return self.Dense2(x)
class CustomLoss(tf.keras.losses.Loss):
def __init__(self):
super().__init__()
def call(self, y_true, y_pred):
alpha = K.std(y_pred) / K.std(y_true)
beta = K.sum(y_pred) / K.sum(y_true) # no need to calc mean
r = tfp.stats.correlation(y_true, y_pred, sample_axis=None, event_axis=None)
return K.sqrt(K.square(1 - r) + K.square(1 - alpha) + K.square(1 - beta))
def create_model(head_size, num_heads, dropout_attention, dropout1, dropout2, ff_dim,
num_transformer_blocks, mlp_units, mlp_dropout, lr):
model = Transformer(head_size, num_heads, dropout_attention, dropout1, dropout2, ff_dim,
num_transformer_blocks, mlp_units, mlp_dropout)
model.compile(
loss=CustomLoss(),
optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
)
return model
model = create_model(256,4, 0.10, 0.12, 0.15, 2, 4, 3, 0.10, 0.001)
model.fit(np.zeros((2, 257, 11)),
np.zeros((2, 1)),
epochs=1,
batch_size=512,
)
I was building a self-defined encoder-decoder tf.keras.Model and saved my checkpoint. After closing my Jupyter notebook and open it again trying to restore my encoder-decoder parameters, I surprisingly found that it is not working. I am not sure it's that I understand the usage wrong or something was wrong with my steps of doing it. Here is my code.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
encoder=encoder,
decoder=decoder)
tf.config.experimental_run_functions_eagerly(True)
EPOCHS = 10
TOLERANCE = 0.08
start = time.time()
for epoch in range(1, EPOCHS+1):
epoch_start = time.time()
# train the encoder-decoder model
batch = 0
total_loss = 0
total_accuracy = 0
for inp, targ in dataset.take(STEP_PER_EPOCH):
batch += 1
batch_loss, batch_accuracy = train_step(inp, targ, phoneme_tokenizer)
total_loss += batch_loss
total_accuracy += batch_accuracy
print("Epoch: {}/{} Batch: {} Loss: {:.4f} Accuracy: {:.4f} Time: {:.0f}s".
format(epoch, EPOCHS, batch, batch_loss.numpy(), batch_accuracy.numpy(), time.time()-epoch_start),
end="\r")
if batch % 1000 == 0:
print()
print()
# saving (checkpoint) the model when total loss is less than 0.9
checkpoint.save(file_prefix=checkpoint_prefix)
# validation process
total_val_loss = 0
total_val_acc = 0
for val_inp, val_targ in dataset_val.take(VAL_WAV_SIZE):
val_loss, val_acc = validate_step(val_inp, val_targ, phoneme_tokenizer)
total_val_loss += val_loss
total_val_acc += val_acc
# print out the epoch results
mean_total_acc = total_accuracy / STEP_PER_EPOCH
mean_total_loss = total_loss / STEP_PER_EPOCH
mean_val_acc = total_val_acc / VAL_WAV_SIZE
mean_val_loss = total_val_loss / VAL_WAV_SIZE
print("\n================================")
print("Epoch {}/{}".format(epoch, EPOCHS))
print('Accuracy: {:.4f} Loss: {:.4f} val_acc: {:.4f} val_loss: {:.4f}'.format(
mean_total_acc,
mean_total_loss,
mean_val_acc,
mean_val_loss))
print('Time taken for epoch {}: {:.2f} min'.format(epoch, (time.time() - epoch_start)/60))
print('Total Time taken: {:.2f} min'.format((time.time() - start)/60))
print("================================\n")
if mean_total_loss < TOLERANCE and mean_val_acc > 0.5:
break
After running the code above, it showed no errors and I got my checkpoints files in my directory. I closed my Jupyter notebook and built all the objects(which are my encoder and decoder) without training and typed
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
hoped that the parameters would come back and I can start my prediction but I just got pretty poor outcome anyway, which was against the outcome predicted after training right away. Should I add some more lines for restoring all the parameters or something else?
Below are more details about my encoder decoder structure, and my input shape is (batch size, 99, 13)
class ResnetIdentityBlock(tf.keras.layers.Layer):
def __init__(self, kernel_size, filters):
super(ResnetIdentityBlock, self).__init__()
self.filters1, self.filters2, self.filters3 = filters
self.conv1 = tf.keras.layers.Conv1D(self.filters1, 1, padding='valid')
self.bn1 = tf.keras.layers.BatchNormalization()
self.conv2 = tf.keras.layers.Conv1D(self.filters2, kernel_size, padding='same')
self.bn2 = tf.keras.layers.BatchNormalization()
self.conv3 = tf.keras.layers.Conv1D(self.filters3, 1, padding='valid')
self.bn3 = tf.keras.layers.BatchNormalization()
def call(self, input_tensor, training=False):
x = self.conv1(input_tensor)
x = self.bn1(x, training=training)
x = tf.nn.relu(x)
x = self.conv2(x)
x = self.bn2(x, training=training)
x = tf.nn.relu(x)
x = self.conv3(x)
x = self.bn3(x, training=training)
x += input_tensor
return tf.nn.relu(x)
class Encoder(tf.keras.Model):
'''
Encoder for MFCC transformed wave data
'''
def __init__(self,
lstm_units,
batch_sz,
dropout_rate,
units,
squeeze_time):
'''
Args:
lstm_units: LSTM units number
batch_sz: batch size
dropout_rate: layer dropout ratio
rnn_initial_weight: type of weight initialization
'''
super(Encoder, self).__init__()
self.lstm_units = lstm_units
self.squeeze_time = squeeze_time
# conv1d
self.feat_extract = tf.keras.layers.Dense(units=units, activation="relu")
self.feat_dropout = tf.keras.layers.Dropout(dropout_rate)
# ResNet
self.resnet1 = ResnetIdentityBlock(kernel_size=11, filters=[units, units, units])
units *= squeeze_time
self.resnet2 = ResnetIdentityBlock(kernel_size=7, filters=[units, units, units])
units *= squeeze_time
self.resnet3 = ResnetIdentityBlock(kernel_size=3, filters=[units, units, units])
# Encoder lstm
self.enc_lstm = tf.keras.layers.LSTM(units=lstm_units,
return_sequences=True,
return_state=True,
kernel_initializer="lecun_normal",
activation='tanh',
recurrent_activation='sigmoid',
recurrent_initializer='orthogonal',
dropout=dropout_rate)
def call(self, inputs):
'''
call pyramidal LSTM neural network encoder
Args:
inputs: wave input
'''
x = self.feat_extract(inputs)
x = self.feat_dropout(x)
# ResNet
x = self.resnet1(x)
x = self.reshape_pyramidal(x)
x = self.resnet2(x)
x = self.reshape_pyramidal(x)
x = self.resnet3(x)
# encoder output layer
fw_outputs, fw_state_h, fw_state_c = self.enc_lstm(x)
return fw_outputs, fw_state_h, fw_state_c
def reshape_pyramidal(self, outputs):
'''
After concatenating forward and backward outputs
return the reshaped output
Args:
outputs: outputs from LSTM
squeeze_time: time step one would like to squeeze in pyramidal LSTM
'''
batch_size, time_steps, num_units = outputs.shape
return tf.reshape(outputs, (batch_size, -1, num_units * self.squeeze_time))
class LuongAttention(tf.keras.layers.Layer):
def __init__(self, units):
super(LuongAttention, self).__init__()
self.W1 = tf.keras.layers.Dense(units, activation="relu")
self.W2 = tf.keras.layers.Dense(units, activation="relu")
self.V = tf.keras.layers.Dense(1, activation="relu")
def call(self, query, values):
# query hidden state shape == (batch_size, hidden size)
# query_with_time_axis shape == (batch_size, 1, hidden size)
# values shape == (batch_size, max_len, hidden size)
# we are doing this to broadcast addition along the time axis to calculate the score
query_with_time_axis = tf.expand_dims(query, 1)
# score shape == (batch_size, max_length, 1)
# we get 1 at the last axis because we are applying score to self.V
# the shape of the tensor before applying self.V is (batch_size, max_length, units)
score = self.V(tf.nn.tanh(
self.W1(query_with_time_axis) + self.W2(values)))
# attention_weights shape == (batch_size, max_length, 1)
attention_weights = tf.nn.softmax(score, axis=1)
# context_vector shape after sum == (batch_size, hidden_size)
context_vector = attention_weights * values
context_vector = tf.reduce_sum(context_vector, axis=1)
return context_vector, attention_weights
class Decoder(tf.keras.Model):
'''
Decoder for output phonemes
'''
def __init__(self,
target_sz,
embedding_dim,
lstm_units,
batch_sz,
dropout_rate):
'''
Args:
target_sz: target size, total phoneme size in this case
embedding_dim: embedding dimension
lstm_units: LSTM units number
batch_sz: batch size
dropout_rate: dropout ratio
rnn_initial_weight: type of weight initialization
'''
super(Decoder, self).__init__()
self.batch_sz = batch_sz
self.target_sz = target_sz
self.lstm_units = lstm_units
self.embedding = tf.keras.layers.Embedding(target_sz, embedding_dim)
# attention model
self.attention = LuongAttention(lstm_units)
# decoder rnn
self.lstm1 = tf.keras.layers.LSTM(units=lstm_units,
return_sequences=True,
return_state=True,
kernel_initializer="lecun_normal",
activation='tanh',
recurrent_activation='sigmoid',
recurrent_initializer='orthogonal',
dropout=dropout_rate)
# Fully-connected
self.fc1 = tf.keras.layers.Dense(64, activation="relu")
self.fc1_dropout = tf.keras.layers.Dropout(dropout_rate)
self.fc2 = tf.keras.layers.Dense(target_sz, activation="softmax")
# build layer info dictionary
self.layer_info = dict()
def call(self, inputs, enc_hidden_h, enc_hidden_c, enc_output):
'''
call LSTM decoder
Args:
inputs: target output, following phoneme for wave data input in this case
enc_hidden_h: encoder hidden state h
enc_hidden_c: encoder hidden state c
enc_output: encoder outputs
'''
# x shape after passing through embedding == (batch_size, 1, embedding_dim)
x = self.embedding(inputs)
# enc_output shape == (batch_size, max_length, hidden_size)
context_vector, attention_weights = self.attention(enc_hidden_h, enc_output)
# x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
# passing the concatenated vector to the 2-layer LSTM (Decoder)
x, state_h, state_c = self.lstm1(x)
# dense layer before final predict output dense layer
x = tf.reshape(x, (-1, x.shape[-1]))
x = self.fc1(x)
x = self.fc1_dropout(x)
# output shape == (batch_size, phoneme size)
x = self.fc2(x)
return x, (state_h, state_c), attention_weights
I'm trying to follow this tutrial https://colab.research.google.com/github/tensorflow/examples/blob/master/community/en/transformer_chatbot.ipynb, However, when I tried to save the model in order to load it again without training I got an error mentioned here NotImplementedError: Layers with arguments in `__init__` must override `get_config`
I understood from the answer that I need to make the encoder and decoder as classes and customise it(instead of leaving it as functions like the colab tutrial) so I went back to tensor flow documentation of this model here: https://www.tensorflow.org/tutorials/text/transformer#encoder_layer and tried to edit in it. I made the encoder layer as:
class EncoderLayer(tf.keras.layers.Layer):
def __init__(self, d_model, num_heads, rate=0.1,**kwargs,):
#super(EncoderLayer, self).__init__()
super().__init__(**kwargs)
self.mha = MultiHeadAttention(d_model, num_heads)
self.ffn = point_wise_feed_forward_network(d_model, dff)
self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = tf.keras.layers.Dropout(rate)
self.dropout2 = tf.keras.layers.Dropout(rate)
def get_config(self):
config = super().get_config().copy()
config.update({
#'vocab_size': self.vocab_size,
#'num_layers': self.num_layers,
#'units': self.units,
'd_model': self.d_model,
'num_heads': self.num_heads,
'dropout': self.dropout,
})
return config
def call(self, x, training, mask):
attn_output, _ = self.mha(x, x, x, mask) # (batch_size, input_seq_len, d_model)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(x + attn_output) # (batch_size, input_seq_len, d_model)
ffn_output = self.ffn(out1) # (batch_size, input_seq_len, d_model)
ffn_output = self.dropout2(ffn_output, training=training)
out2 = self.layernorm2(out1 + ffn_output) # (batch_size, input_seq_len, d_model)
return out2
and same for the decoder layer class. Then the same encoder in the documentation of tf
class Encoder(tf.keras.layers.Layer):
def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
maximum_position_encoding, rate=0.1):
super(Encoder, self).__init__()
self.d_model = d_model
self.num_layers = num_layers
self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
self.pos_encoding = positional_encoding(maximum_position_encoding,
self.d_model)
self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
for _ in range(num_layers)]
self.dropout = tf.keras.layers.Dropout(rate)
def call(self, x, training, mask):
seq_len = tf.shape(x)[1]
# adding embedding and position encoding.
x = self.embedding(x) # (batch_size, input_seq_len, d_model)
x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
x += self.pos_encoding[:, :seq_len, :]
x = self.dropout(x, training=training)
for i in range(self.num_layers):
x = self.enc_layers[i](x, training, mask)
return x # (batch_size, input_seq_len, d_model)
the function of the model as:
def transformer(vocab_size,
num_layers,
units,
d_model,
num_heads,
dropout,
name="transformer"):
inputs = tf.keras.Input(shape=(None,), name="inputs")
dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")
enc_padding_mask = tf.keras.layers.Lambda(
create_padding_mask, output_shape=(1, 1, None),
name='enc_padding_mask')(inputs)
# mask the future tokens for decoder inputs at the 1st attention block
look_ahead_mask = tf.keras.layers.Lambda(
create_look_ahead_mask,
output_shape=(1, None, None),
name='look_ahead_mask')(dec_inputs)
# mask the encoder outputs for the 2nd attention block
dec_padding_mask = tf.keras.layers.Lambda(
create_padding_mask, output_shape=(1, 1, None),
name='dec_padding_mask')(inputs)
enc_outputs = Encoder(
num_layers=num_layers, d_model=d_model, num_heads=num_heads,
input_vocab_size=vocab_size,
)(inputs=[inputs, enc_padding_mask])
dec_outputs = Decoder(
num_layers=num_layers, d_model=d_model, num_heads=num_heads,
target_vocab_size=vocab_size,
)(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])
outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)
return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name)
and calling the model:
#the model itself with its paramters:
# Hyper-parameters
NUM_LAYERS = 3
D_MODEL = 256
#D_MODEL=tf.cast(D_MODEL, tf.float32)
NUM_HEADS = 8
UNITS = 512
DROPOUT = 0.1
model = transformer(
vocab_size=VOCAB_SIZE,
num_layers=NUM_LAYERS,
units=UNITS,
d_model=D_MODEL,
num_heads=NUM_HEADS,
dropout=DROPOUT)
However, I got that error:
TypeError: __init__() missing 2 required positional arguments: 'dff' and 'maximum_position_encoding'
I am really confused and I don't understand what dff and maximum position encoding mean in the documentation and when I removed them from the encoder and decoder classes, I got anther error as positional_encoding function takes maximum position as input and also dff is passed as input inside the class. I am not so sure what I should do as I am not sure whether I am following the right steps or not
If you get this error while calling transformer then your problem is with creating the model, not saving it.
Other than that, I see several issues with your get_config:
You defined dropout instead of rate.
The attributes you address (self.d_model etc.) are not defined or assigned at __init__.
It doesn't exist for your Encoder class.