I'm trying to follow this tutorial: https://colab.research.google.com/github/tensorflow/examples/blob/master/community/en/transformer_chatbot.ipynb. However, when I tried to save the model in order to load it again without training, I got the error mentioned here: NotImplementedError: Layers with arguments in `__init__` must override `get_config`
I understood from the answer that I need to make the encoder and decoder classes and customize them (instead of leaving them as functions, as in the Colab tutorial), so I went back to the TensorFlow documentation of this model (https://www.tensorflow.org/tutorials/text/transformer#encoder_layer) and tried to edit it. I made the encoder layer as:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, rate=0.1, **kwargs):
        #super(EncoderLayer, self).__init__()
        super().__init__(**kwargs)
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            #'vocab_size': self.vocab_size,
            #'num_layers': self.num_layers,
            #'units': self.units,
            'd_model': self.d_model,
            'num_heads': self.num_heads,
            'dropout': self.dropout,
        })
        return config

    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
        return out2
and the same for the decoder layer class. Then I kept the same Encoder as in the TensorFlow documentation:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                self.d_model)
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]
        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)
        return x  # (batch_size, input_seq_len, d_model)
The function that builds the model is:
def transformer(vocab_size,
                num_layers,
                units,
                d_model,
                num_heads,
                dropout,
                name="transformer"):
    inputs = tf.keras.Input(shape=(None,), name="inputs")
    dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")

    enc_padding_mask = tf.keras.layers.Lambda(
        create_padding_mask, output_shape=(1, 1, None),
        name='enc_padding_mask')(inputs)
    # mask the future tokens for decoder inputs at the 1st attention block
    look_ahead_mask = tf.keras.layers.Lambda(
        create_look_ahead_mask,
        output_shape=(1, None, None),
        name='look_ahead_mask')(dec_inputs)
    # mask the encoder outputs for the 2nd attention block
    dec_padding_mask = tf.keras.layers.Lambda(
        create_padding_mask, output_shape=(1, 1, None),
        name='dec_padding_mask')(inputs)

    enc_outputs = Encoder(
        num_layers=num_layers, d_model=d_model, num_heads=num_heads,
        input_vocab_size=vocab_size,
    )(inputs=[inputs, enc_padding_mask])

    dec_outputs = Decoder(
        num_layers=num_layers, d_model=d_model, num_heads=num_heads,
        target_vocab_size=vocab_size,
    )(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])

    outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)

    return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name)
and calling the model:
# the model itself with its parameters:
# Hyper-parameters
NUM_LAYERS = 3
D_MODEL = 256
#D_MODEL = tf.cast(D_MODEL, tf.float32)
NUM_HEADS = 8
UNITS = 512
DROPOUT = 0.1

model = transformer(
    vocab_size=VOCAB_SIZE,
    num_layers=NUM_LAYERS,
    units=UNITS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout=DROPOUT)
However, I got this error:
TypeError: __init__() missing 2 required positional arguments: 'dff' and 'maximum_position_encoding'
I am really confused. I don't understand what dff and maximum_position_encoding mean in the documentation, and when I removed them from the encoder and decoder classes I got another error, since the positional_encoding function takes the maximum position as input and dff is also used inside the class. I am not sure what I should do, as I don't know whether I am following the right steps.
If you get this error while calling transformer, then your problem is with creating the model, not saving it.
Other than that, I see several issues with your get_config:
You return a 'dropout' key instead of 'rate'.
The attributes you reference (self.d_model etc.) are never assigned in __init__.
It doesn't exist at all for your Encoder class.
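For what it's worth, dff in the tutorial is the width of the inner layer of the point-wise feed-forward network, and maximum_position_encoding is the largest sequence length the positional-encoding table must cover; both have to be accepted and passed through. A minimal sketch of a constructor/get_config pair that is consistent (assuming MultiHeadAttention and point_wise_feed_forward_network from the tutorial are in scope):

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # store every constructor argument so get_config can return it
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.rate = rate
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'd_model': self.d_model,
            'num_heads': self.num_heads,
            'dff': self.dff,
            'rate': self.rate,
        })
        return config

With every constructor argument stored on self and echoed in get_config, Keras can re-instantiate the layer when loading the saved model.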
My input data and target data are different types of data with different dimensions: the input data is 1D and the target data is 2D. I don't know how to modify the model to fit this situation.
I have built an Encoder/Decoder model with tokenized MIDI data as input, and coordinate data in CSV format as output.
The input dimension is
(num_of_data, sequence_length, data_dimension) = (22, 1000~3000, 1)
The target dimension is
(num_of_data, sequence_length, data_dimension) = (22, 1000~3000, 102)
The Encoder/Decoder model architecture I found:
class Encoder(nn.Module):
    def __init__(self,
                 input_size = 2,
                 embedding_size = 128,
                 hidden_size = 256,
                 n_layers = 4,
                 dropout = 0.5):
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.linear = nn.Linear(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, n_layers,
                           dropout = dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.dropout(F.relu(self.linear(x)))
        output, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self,
                 output_size = 2,
                 embedding_size = 128,
                 hidden_size = 256,
                 n_layers = 4,
                 dropout = 0.5):
        super().__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.embedding = nn.Linear(output_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, n_layers, dropout = dropout)
        self.linear = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, hidden, cell):
        x = x.unsqueeze(0)
        embedded = self.dropout(F.relu(self.embedding(x)))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.linear(output.squeeze(0))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert encoder.hidden_size == decoder.hidden_size, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, x, y, teacher_forcing_ratio = 0.5):
        batch_size = x.shape[1]
        target_len = y.shape[0]
        outputs = torch.zeros(y.shape).to(self.device)
        hidden, cell = self.encoder(x)
        decoder_input = x[-1, :, :]
        for i in range(target_len):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[i] = output
            teacher_forcing = random.random() < teacher_forcing_ratio
            decoder_input = y[i] if teacher_forcing else output
        return outputs
I set the input data and target data length to 900, as they need to have the same sequence_length before I convert them to tensors:
tokenized_data shape: (22, n)
target_data shape: (22, m, 102)
↓
tokenized_data shape: (22, 900)
target_data shape: (22, 900, 102)
input_tensor = torch.Tensor(input_data)
target_tensor = torch.Tensor(target_data)
torch.Size([22, 900])
torch.Size([22, 900, 102])
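For reference, a minimal sketch of one way this trim-or-pad step could be done (fix_length is a hypothetical helper; the question does not show the actual preprocessing code):

import torch

def fix_length(seq, length=900):
    # seq: tensor of shape (seq_len, ...); trim or zero-pad along dim 0
    if seq.shape[0] >= length:
        return seq[:length]
    pad = torch.zeros((length - seq.shape[0], *seq.shape[1:]), dtype=seq.dtype)
    return torch.cat([seq, pad], dim=0)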
The parameters and model are listed below:
source = input_tensor.to(device)
target = target_tensor.to(device)
input_size = 900 # I am not sure if this is correct
output_size = (900,102) # I am not sure if this is correct
print('Input : {} Output : {}'.format(input_size, output_size))
embed_size = 256
hidden_size = 512
num_layers = 3
num_iteration = 100
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
encoder = Encoder(input_size, hidden_size, embed_size, num_layers, ENC_DROPOUT)
decoder = Decoder(output_size, hidden_size, embed_size, num_layers, DEC_DROPOUT)
model = Seq2Seq(encoder, decoder, device).to(device)
I then tried to train the model with:
model = trainModel(model, source, target, num_iteration)
The above results in the error message shown below:
TypeError: empty(): argument 'size' must be tuple of ints, but found element of type tuple at pos 2
I revised output_size = [900,102] to output_size = 900, but I got the error below:
IndexError: too many indices for tensor of dimension 2
I am having problems with the Encoder/Decoder model: it can't deal with the input and target tensors having different dimensions. Any help or advice on how to create an Encoder/Decoder model that takes input and target tensors of different dimensions would be greatly appreciated.
Thank you in advance for your input.
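One hedged reading of the sizes, based only on the Encoder/Decoder signatures quoted above (a sketch, not a verified fix): input_size and output_size are per-timestep feature dimensions, not sequence lengths.

# hedged sketch: sizes as per-timestep feature dimensions
input_size = 1      # each input timestep holds a single token value
output_size = 102   # each target timestep is a 102-dim coordinate vector

# match the positional order of the signatures above:
# Encoder(input_size, embedding_size, hidden_size, n_layers, dropout)
encoder = Encoder(input_size, embed_size, hidden_size, num_layers, ENC_DROPOUT)
decoder = Decoder(output_size, embed_size, hidden_size, num_layers, DEC_DROPOUT)
model = Seq2Seq(encoder, decoder, device).to(device)

# Seq2Seq.forward expects [seq_len, batch, features]
source = input_tensor.unsqueeze(-1).permute(1, 0, 2)  # (900, 22, 1)
target = target_tensor.permute(1, 0, 2)               # (900, 22, 102)

# note: Seq2Seq.forward feeds x[-1] as the first decoder input, but
# Decoder.embedding expects output_size (102) features, so the first
# decoder input would also need to be a 102-dim frame (e.g. zeros)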
I have a model implemented in PyTorch that applies a final fully connected layer before running the softmax function.
The architecture is defined to solve a 4-class Speech Emotion Recognition task: given an audio track, it transforms it into its spectrogram and uses it to predict the emotion among happiness, sadness, neutrality and anger.
Unlike the architecture of the paper, it adapts the implementation of the Compact Convolutional Transformer found on GitHub at https://github.com/SHI-Labs/Compact-Transformers/blob/main/src/cct.py.
To improve the performance of the model I am following some tricks defined in the paper https://arxiv.org/abs/2104.07288.
Like the model described in that paper, however, mine also suffers from a "class collapse" problem: even with a balanced dataset, it tends to predict the anger and sadness classes well and the other two badly.
In the paper they solve this problem by applying a particular weight regularization technique to the fully connected layer, described in chapter 2.4.
Unfortunately, I cannot understand how I should modify my fully connected layer in PyTorch to implement this type of regularization.
Code of the model:
class CCT(nn.Module):
    def __init__(self,
                 img_size=224,
                 embedding_dim=768,
                 n_input_channels=3,
                 n_conv_layers=1,
                 kernel_size=7,
                 stride=2,
                 padding=3,
                 pooling_kernel_size=3,
                 pooling_stride=2,
                 pooling_padding=1,
                 dropout=0.,
                 attention_dropout=0.1,
                 stochastic_depth=0.1,
                 num_layers=14,
                 num_heads=6,
                 mlp_ratio=4.0,
                 num_classes=1000,
                 positional_embedding='learnable',
                 *args, **kwargs):
        super(CCT, self).__init__()

        self.tokenizer = Tokenizer(n_input_channels=n_input_channels,
                                   n_output_channels=embedding_dim,
                                   kernel_size=kernel_size,
                                   stride=stride,
                                   padding=padding,
                                   pooling_kernel_size=pooling_kernel_size,
                                   pooling_stride=pooling_stride,
                                   pooling_padding=pooling_padding,
                                   max_pool=True,
                                   activation=nn.ReLU,
                                   n_conv_layers=n_conv_layers,
                                   conv_bias=False)

        self.classifier = TransformerClassifier(
            sequence_length=self.tokenizer.sequence_length(n_channels=n_input_channels,
                                                           height=img_size,
                                                           width=img_size),
            embedding_dim=embedding_dim,
            seq_pool=True,
            dropout=dropout,
            attention_dropout=attention_dropout,
            stochastic_depth=stochastic_depth,
            num_layers=num_layers,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            num_classes=num_classes,
            positional_embedding=positional_embedding
        )

    def forward(self, x):
        x = self.tokenizer(x)
        return self.classifier(x)
class Tokenizer(nn.Module):
    def __init__(self,
                 kernel_size, stride, padding,
                 pooling_kernel_size=3, pooling_stride=2, pooling_padding=1,
                 n_conv_layers=1,
                 n_input_channels=3,
                 n_output_channels=64,
                 in_planes=64,
                 activation=None,
                 max_pool=True,
                 conv_bias=False):
        super(Tokenizer, self).__init__()

        n_filter_list = [n_input_channels] + \
                        [in_planes for _ in range(n_conv_layers - 1)] + \
                        [n_output_channels]

        self.conv_layers = nn.Sequential(
            *[nn.Sequential(
                nn.Conv2d(n_filter_list[i], n_filter_list[i + 1],
                          kernel_size=(kernel_size, kernel_size),
                          stride=(stride, stride),
                          padding=(padding, padding), bias=conv_bias),
                nn.Identity() if activation is None else activation(),
                nn.MaxPool2d(kernel_size=pooling_kernel_size,
                             stride=pooling_stride,
                             padding=pooling_padding) if max_pool else nn.Identity()
            )
                for i in range(n_conv_layers)
            ])

        self.flattener = nn.Flatten(2, 3)
        self.apply(self.init_weight)

    def sequence_length(self, n_channels=3, height=224, width=224):
        return self.forward(torch.zeros((1, n_channels, height, width))).shape[1]

    def forward(self, x):
        return self.flattener(self.conv_layers(x)).transpose(-2, -1)

    @staticmethod
    def init_weight(m):
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight)
class TransformerClassifier(Module):
    def __init__(self,
                 seq_pool=True,
                 embedding_dim=768,
                 num_layers=12,
                 num_heads=12,
                 mlp_ratio=4.0,
                 num_classes=1000,
                 dropout=0.1,
                 attention_dropout=0.1,
                 stochastic_depth=0.1,
                 positional_embedding='learnable',
                 sequence_length=None):
        super().__init__()
        positional_embedding = positional_embedding if \
            positional_embedding in ['sine', 'learnable', 'none'] else 'sine'
        dim_feedforward = int(embedding_dim * mlp_ratio)
        self.embedding_dim = embedding_dim
        self.sequence_length = sequence_length
        self.seq_pool = seq_pool
        self.num_tokens = 0

        assert sequence_length is not None or positional_embedding == 'none', \
            f"Positional embedding is set to {positional_embedding} and" \
            f" the sequence length was not specified."

        if not seq_pool:
            sequence_length += 1
            self.class_emb = Parameter(torch.zeros(1, 1, self.embedding_dim),
                                       requires_grad=True)
            self.num_tokens = 1
        else:
            self.attention_pool = Linear(self.embedding_dim, 1)

        if positional_embedding != 'none':
            if positional_embedding == 'learnable':
                self.positional_emb = Parameter(torch.zeros(1, sequence_length, embedding_dim),
                                                requires_grad=True)
                init.normal_(self.positional_emb, std=0.2)
            else:
                self.positional_emb = Parameter(self.sinusoidal_embedding(sequence_length, embedding_dim),
                                                requires_grad=False)
        else:
            self.positional_emb = None

        self.dropout = Dropout(p=dropout)
        dpr = [x.item() for x in torch.linspace(0, stochastic_depth, num_layers)]
        self.blocks = ModuleList([
            TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads,
                                    dim_feedforward=dim_feedforward, dropout=dropout,
                                    attention_dropout=attention_dropout, drop_path_rate=dpr[i])
            for i in range(num_layers)])
        self.norm = LayerNorm(embedding_dim)
        self.fc = Linear(embedding_dim, num_classes)
        self.apply(self.init_weight)

    def forward(self, x):
        if self.positional_emb is None and x.size(1) < self.sequence_length:
            x = F.pad(x, (0, 0, 0, self.n_channels - x.size(1)), mode='constant', value=0)

        if not self.seq_pool:
            cls_token = self.class_emb.expand(x.shape[0], -1, -1)
            x = torch.cat((cls_token, x), dim=1)

        if self.positional_emb is not None:
            x += self.positional_emb

        x = self.dropout(x)

        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)

        if self.seq_pool:
            x = torch.matmul(F.softmax(self.attention_pool(x), dim=1).transpose(-1, -2), x).squeeze(-2)
        else:
            x = x[:, 0]

        x = self.fc(x)
        return x
Can someone help me?
As you have not shared any network architecture, I will try to give a basic example. I am not sure about the regularization described in the paper, but here is a simple example that applies L1 regularization to a specific layer (e.g. layer 0):
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(
    nn.Linear(5, 5),
    nn.ReLU(),
    nn.Linear(5, 2)
)

x = torch.randn(5, 5)
target = torch.ones(5, dtype=torch.long)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-2)

for epoch in range(10):
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, target)
    # This will be your weight regularization: choose your layer, e.g. model[0],
    # and apply the norm you want to that layer's weights
    l1_norm = torch.norm(model[0].weight, p=1)
    loss += l1_norm
    loss.backward()
    optimizer.step()
    print('Epoch {}, loss {}, norm layer {}'.format(
        epoch, loss.item(), l1_norm.item()))
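To target only the final fully connected layer of the CCT model from the question, the same pattern would look like this (a hedged sketch: cct, criterion, output and target are assumed to be defined as in the loop above, and the 0.01 weighting is arbitrary):

# hedged sketch: regularize only the final classifier weights of the CCT model
l2_norm = torch.norm(cct.classifier.fc.weight, p=2)
loss = criterion(output, target) + 0.01 * l2_norm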
I want to perform some transformations on the input batch during training. For example, if I have a batch of images of shape (number of samples, width, height, channels), I want to replace the 3rd channel with the difference of the first two channels, then resize the images and finally normalize them.
I tried to define a custom layer:
class CustomLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(CustomLayer, self).__init__()

    def build(self, input_shape):
        pass

    def call(self, input_):
        # Loaded images
        self.img_tr = []
        for image in input_:
            img_input = resize(image, (267,400))  # from skimage import resize
            img_diff = (img_input[:,:,1]/np.max(img_input[:,:,1]))-((img_input[:,:,0]+img_input[:,:,2])/np.max(img_input[:,:,0]+img_input[:,:,2]))
            img_temp = np.zeros((267,400,3))
            img_temp[:,:,0] = img_input[:,:,0]/np.max(img_input[:,:,0])
            img_temp[:,:,1] = img_input[:,:,1]/np.max(img_input[:,:,1])
            img_temp[:,:,2] = img_diff/np.max(img_diff)
            self.img_tr.append(img_temp)
        self.img_tr = np.asarray(self.img_tr)
        return self.img_tr
Then I used:
input_0 = tf.keras.Input(shape = (None,None,3))
clayer = CustomLayer()
input_1 = clayer(input_0)
x = tf.keras.layers.Conv2D(filters = 16, kernel_size = (7,7), activation = tf.keras.activations.relu)(input_1)
x = tf.keras.layers.MaxPool2D(pool_size = (2,2))(x)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(units = 64, activation = tf.keras.activations.relu)(x)
output = tf.keras.layers.Dense(units = 12)(x)
model = tf.keras.Model(inputs = input_0, outputs = output)
model.compile(
    optimizer = tf.keras.optimizers.Adam(),
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
    metrics = tf.keras.metrics.SparseCategoricalAccuracy()
)
model.summary()
I get an error that says:
AttributeError: 'Tensor' object has no attribute 'ndim'
I think the issue is related to the fact that my custom layer expects a 4D numpy array, but the input has this format:
<KerasTensor: shape=(None, None, None, 3) dtype=float32 (created by layer 'input_20')>
How can I resolve the issue? I cannot find a way to convert KerasTensor to a numpy array inside my custom layer.
Edit
I tried to avoid for loops and numpy so I tried:
class CustomLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(CustomLayer, self).__init__()

    def build(self, input_shape):
        pass

    def call(self, input_):
        input_ = tf.Variable(input_)
        img_input = tf.image.resize(input_, (267,400))
        img_diff = (img_input[:,:,:,1])-((img_input[:,:,:,0]+img_input[:,:,:,2]))
        img_input[:,:,:,2] = img_diff
        output_img = tf.image.per_image_standardization(img_input)
        return input_
However, when I use the custom layer in the functional API I get the error:
ValueError: Tensor-typed variable initializers must either be wrapped in an init_scope or callable (e.g., `tf.Variable(lambda : tf.truncated_normal([10, 40]))`) when building functions. Please file a feature request if this restriction inconveniences you.
It seems to have something to do with tf.Variable. Even if I set validate_shape to False, I still get the same error.
Simply removing tf.Variable does the job. Below is the full layer:
class CustomLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(CustomLayer, self).__init__()

    def build(self, input_shape):
        pass

    def call(self, inp):
        img_input = tf.image.resize(inp, (267,400))
        img_diff = (img_input[:,:,:,1])-((img_input[:,:,:,0]+img_input[:,:,:,2]))
        img_diff = tf.expand_dims(img_diff, -1)
        img_input = tf.keras.layers.Concatenate()([img_input[:,:,:,:-1], img_diff])
        output_img = tf.image.per_image_standardization(img_input)
        return output_img
I used tf.keras.layers.Concatenate to replace the last channel of img_input with img_diff.
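As a quick sanity check, a hedged sketch with arbitrary input shapes (not part of the original answer): calling the layer on a random batch shows the expected output shape.

import tensorflow as tf

layer = CustomLayer()
out = layer(tf.random.uniform((2, 300, 450, 3)))
print(out.shape)  # (2, 267, 400, 3)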
I was building a self-defined encoder-decoder tf.keras.Model and saved my checkpoint. After closing my Jupyter notebook and opening it again to restore my encoder-decoder parameters, I was surprised to find that it was not working. I am not sure whether I misunderstand the usage or something is wrong with my steps. Here is my code.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

tf.config.experimental_run_functions_eagerly(True)
EPOCHS = 10
TOLERANCE = 0.08
start = time.time()

for epoch in range(1, EPOCHS+1):
    epoch_start = time.time()

    # train the encoder-decoder model
    batch = 0
    total_loss = 0
    total_accuracy = 0
    for inp, targ in dataset.take(STEP_PER_EPOCH):
        batch += 1
        batch_loss, batch_accuracy = train_step(inp, targ, phoneme_tokenizer)
        total_loss += batch_loss
        total_accuracy += batch_accuracy
        print("Epoch: {}/{} Batch: {} Loss: {:.4f} Accuracy: {:.4f} Time: {:.0f}s".
              format(epoch, EPOCHS, batch, batch_loss.numpy(), batch_accuracy.numpy(), time.time()-epoch_start),
              end="\r")
        if batch % 1000 == 0:
            print()
    print()

    # saving (checkpoint) the model when total loss is less than 0.9
    checkpoint.save(file_prefix=checkpoint_prefix)

    # validation process
    total_val_loss = 0
    total_val_acc = 0
    for val_inp, val_targ in dataset_val.take(VAL_WAV_SIZE):
        val_loss, val_acc = validate_step(val_inp, val_targ, phoneme_tokenizer)
        total_val_loss += val_loss
        total_val_acc += val_acc

    # print out the epoch results
    mean_total_acc = total_accuracy / STEP_PER_EPOCH
    mean_total_loss = total_loss / STEP_PER_EPOCH
    mean_val_acc = total_val_acc / VAL_WAV_SIZE
    mean_val_loss = total_val_loss / VAL_WAV_SIZE
    print("\n================================")
    print("Epoch {}/{}".format(epoch, EPOCHS))
    print('Accuracy: {:.4f} Loss: {:.4f} val_acc: {:.4f} val_loss: {:.4f}'.format(
        mean_total_acc,
        mean_total_loss,
        mean_val_acc,
        mean_val_loss))
    print('Time taken for epoch {}: {:.2f} min'.format(epoch, (time.time() - epoch_start)/60))
    print('Total Time taken: {:.2f} min'.format((time.time() - start)/60))
    print("================================\n")

    if mean_total_loss < TOLERANCE and mean_val_acc > 0.5:
        break
After running the code above, it showed no errors and I got my checkpoint files in my directory. I closed my Jupyter notebook, rebuilt all the objects (my encoder and decoder) without training, and typed
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
hoping that the parameters would come back so I could start my prediction, but I just got a pretty poor outcome, contrary to the outcome predicted right after training. Should I add some more lines for restoring all the parameters, or something else?
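For reference, a minimal restore sketch, assuming the encoder, decoder and optimizer are rebuilt with the same constructor arguments as before saving:

checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)
status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
# raises if the rebuilt objects don't line up with what was saved;
# note that variables are only fully restored once they exist,
# e.g. after calling the models on one batch
status.assert_existing_objects_matched()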
Below are more details about my encoder-decoder structure; my input shape is (batch_size, 99, 13).
class ResnetIdentityBlock(tf.keras.layers.Layer):
    def __init__(self, kernel_size, filters):
        super(ResnetIdentityBlock, self).__init__()
        self.filters1, self.filters2, self.filters3 = filters
        self.conv1 = tf.keras.layers.Conv1D(self.filters1, 1, padding='valid')
        self.bn1 = tf.keras.layers.BatchNormalization()
        self.conv2 = tf.keras.layers.Conv1D(self.filters2, kernel_size, padding='same')
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.conv3 = tf.keras.layers.Conv1D(self.filters3, 1, padding='valid')
        self.bn3 = tf.keras.layers.BatchNormalization()

    def call(self, input_tensor, training=False):
        x = self.conv1(input_tensor)
        x = self.bn1(x, training=training)
        x = tf.nn.relu(x)
        x = self.conv2(x)
        x = self.bn2(x, training=training)
        x = tf.nn.relu(x)
        x = self.conv3(x)
        x = self.bn3(x, training=training)
        x += input_tensor
        return tf.nn.relu(x)
class Encoder(tf.keras.Model):
    '''
    Encoder for MFCC transformed wave data
    '''
    def __init__(self,
                 lstm_units,
                 batch_sz,
                 dropout_rate,
                 units,
                 squeeze_time):
        '''
        Args:
            lstm_units: LSTM units number
            batch_sz: batch size
            dropout_rate: layer dropout ratio
            rnn_initial_weight: type of weight initialization
        '''
        super(Encoder, self).__init__()
        self.lstm_units = lstm_units
        self.squeeze_time = squeeze_time

        # conv1d
        self.feat_extract = tf.keras.layers.Dense(units=units, activation="relu")
        self.feat_dropout = tf.keras.layers.Dropout(dropout_rate)

        # ResNet
        self.resnet1 = ResnetIdentityBlock(kernel_size=11, filters=[units, units, units])
        units *= squeeze_time
        self.resnet2 = ResnetIdentityBlock(kernel_size=7, filters=[units, units, units])
        units *= squeeze_time
        self.resnet3 = ResnetIdentityBlock(kernel_size=3, filters=[units, units, units])

        # Encoder lstm
        self.enc_lstm = tf.keras.layers.LSTM(units=lstm_units,
                                             return_sequences=True,
                                             return_state=True,
                                             kernel_initializer="lecun_normal",
                                             activation='tanh',
                                             recurrent_activation='sigmoid',
                                             recurrent_initializer='orthogonal',
                                             dropout=dropout_rate)

    def call(self, inputs):
        '''
        call pyramidal LSTM neural network encoder
        Args:
            inputs: wave input
        '''
        x = self.feat_extract(inputs)
        x = self.feat_dropout(x)

        # ResNet
        x = self.resnet1(x)
        x = self.reshape_pyramidal(x)
        x = self.resnet2(x)
        x = self.reshape_pyramidal(x)
        x = self.resnet3(x)

        # encoder output layer
        fw_outputs, fw_state_h, fw_state_c = self.enc_lstm(x)
        return fw_outputs, fw_state_h, fw_state_c

    def reshape_pyramidal(self, outputs):
        '''
        After concatenating forward and backward outputs,
        return the reshaped output
        Args:
            outputs: outputs from LSTM
            squeeze_time: time step one would like to squeeze in pyramidal LSTM
        '''
        batch_size, time_steps, num_units = outputs.shape
        return tf.reshape(outputs, (batch_size, -1, num_units * self.squeeze_time))
class LuongAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(LuongAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units, activation="relu")
        self.W2 = tf.keras.layers.Dense(units, activation="relu")
        self.V = tf.keras.layers.Dense(1, activation="relu")

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights
class Decoder(tf.keras.Model):
    '''
    Decoder for output phonemes
    '''
    def __init__(self,
                 target_sz,
                 embedding_dim,
                 lstm_units,
                 batch_sz,
                 dropout_rate):
        '''
        Args:
            target_sz: target size, total phoneme size in this case
            embedding_dim: embedding dimension
            lstm_units: LSTM units number
            batch_sz: batch size
            dropout_rate: dropout ratio
            rnn_initial_weight: type of weight initialization
        '''
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.target_sz = target_sz
        self.lstm_units = lstm_units
        self.embedding = tf.keras.layers.Embedding(target_sz, embedding_dim)

        # attention model
        self.attention = LuongAttention(lstm_units)

        # decoder rnn
        self.lstm1 = tf.keras.layers.LSTM(units=lstm_units,
                                          return_sequences=True,
                                          return_state=True,
                                          kernel_initializer="lecun_normal",
                                          activation='tanh',
                                          recurrent_activation='sigmoid',
                                          recurrent_initializer='orthogonal',
                                          dropout=dropout_rate)

        # Fully-connected
        self.fc1 = tf.keras.layers.Dense(64, activation="relu")
        self.fc1_dropout = tf.keras.layers.Dropout(dropout_rate)
        self.fc2 = tf.keras.layers.Dense(target_sz, activation="softmax")

        # build layer info dictionary
        self.layer_info = dict()

    def call(self, inputs, enc_hidden_h, enc_hidden_c, enc_output):
        '''
        call LSTM decoder
        Args:
            inputs: target output, following phoneme for wave data input in this case
            enc_hidden_h: encoder hidden state h
            enc_hidden_c: encoder hidden state c
            enc_output: encoder outputs
        '''
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(inputs)

        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(enc_hidden_h, enc_output)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the 2-layer LSTM (Decoder)
        x, state_h, state_c = self.lstm1(x)

        # dense layer before final predict output dense layer
        x = tf.reshape(x, (-1, x.shape[-1]))
        x = self.fc1(x)
        x = self.fc1_dropout(x)

        # output shape == (batch_size, phoneme size)
        x = self.fc2(x)
        return x, (state_h, state_c), attention_weights
I have a BucketIterator from torchtext that I feed to a model in PyTorch. An example of how the iterator is constructed:
train_iter, val_iter = BucketIterator.splits((train, val),
                                             batch_size=batch_size,
                                             sort_within_batch=True,
                                             device=device,
                                             shuffle=True,
                                             sort_key=lambda x: (len(x.src), len(x.trg)))
The data is then fed to a model like this, where I use the nn.Embedding layer.
class encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        #src = [src sent len, batch size]
        embedded = self.dropout(self.embedding(src))
        #embedded = [src sent len, batch size, emb dim]
        hidden_enc = []
        outputs, hidden = self.rnn(embedded[0,:,:].unsqueeze(0))
        for i in range(1, len(embedded[:,1,1])):
            outputs, hidden = self.rnn(embedded[i,:,:].unsqueeze(0), hidden)
            hidden_cpu = []
            for k in range(len(hidden)):
                hidden_cpu.append(hidden[k])
                hidden_cpu[k] = hidden[k].cpu()
            hidden_enc.append(tuple(hidden_cpu))

        #outputs, hidden = self.rnn(embedded)
        #outputs = [src sent len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]

        #outputs are always from the top hidden layer
        return hidden, hidden_enc
But what if I wanted the embedding to be one-hot encoded? I work on formal languages, and it would be nice to preserve orthogonality between tokens. It doesn't seem like PyTorch or torchtext has any functionality for doing this.
import numpy as np
import torch

def get_one_hot_torch_tensor(in_tensor):
    """
    Converts a 1d or 2d torch tensor of integer indices to a one-hot encoded tensor.
    """
    n_channels = int(torch.max(in_tensor)) + 1  # number of channels = largest index + 1
    if in_tensor.ndim == 1:
        out_one_hot = torch.zeros((n_channels, in_tensor.shape[0]))
        out_one_hot[in_tensor, torch.arange(in_tensor.shape[0])] = 1
    elif in_tensor.ndim == 2:
        out_one_hot = torch.zeros((n_channels, in_tensor.shape[0], in_tensor.shape[1]))
        index = np.indices((in_tensor.shape[0], in_tensor.shape[1]))  # grid of row/col indices
        x, y = torch.from_numpy(index[0]), torch.from_numpy(index[1])
        out_one_hot[in_tensor, x, y] = 1
    else:
        raise ValueError("Expected a 1d or 2d tensor")
    return out_one_hot
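A quick usage sketch (a hedged example; the token values are made up, and the [seq len, batch] layout matches the convention in the encoder above):

tokens = torch.tensor([[0, 2], [1, 3]])     # [seq len, batch] of token indices
one_hot = get_one_hot_torch_tensor(tokens)  # shape: (4, 2, 2) = (vocab, seq len, batch)
print(one_hot.shape)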