Modifying T5 for sequence labelling - python

I am trying to adapt the T5 model for a sequence-labelling task (NER).
I create my model class by taking the last hidden states of the T5 model and adding a linear layer with 3 out-features (for simple IOB tags).
Here is my model class:
class Seq2SeqTokenCLS(nn.Module):
def __init__(self):
super(Seq2SeqTokenCLS, self).__init__()
self.num_labels = 3
self.base_model = T5ForConditionalGeneration.from_pretrained('t5-small')
# average of n last hidden layers
self.layers = 3
# change beam search or greedy search here
# Suggested parameters from the T5 paper: num_beams = 4 and length penalty alpha = 0.6
self.base_model.config.num_beams = 1 # <-- change to 1 for greedy decoding
self.base_model.config.length_penalty = 0.6 # <-- comment this out for greedy decoding
self.dropout = nn.Dropout(0.5)
self.dense = nn.Linear(in_features=512 * self.layers, out_features=self.num_labels)
def forward(self, input_ids, attn_mask, labels):
hidden_states = self.base_model(
hidden_states =[hidden_states['decoder_hidden_states'][-(n+1)] for n in range(self.layers)], dim=2)
logits = self.dense(self.dropout(hidden_states))
loss = None
loss_fct = nn.CrossEntropyLoss(weight=class_weights)
# Only keep active parts of the loss
if attn_mask is not None:
active_loss = attn_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)
active_labels = torch.where(
active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
loss = loss_fct(active_logits, active_labels)
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
return {'logits':logits,
However, I am confused about how I should do inference with this approach. Should I use the .generate function, as when T5 has a standard LM head? If that is the case, then I don't know how to inherit that function into my new model class...
Or can I use a normal evaluation loop?
E.g. something like this?:
predictions = []
all_labels = []
with torch.no_grad():
for batch in tqdm(test_loader):
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device)
outputs = model(input_ids=input_ids,
for sample, lab in zip(outputs['logits'],labels):
preds = torch.argmax(sample, dim=1)
I would still like to experiment with beam search...


Is there a way to fix param.grad = none in pytorch model?

I am working on the Point Cloud Registration Network (PCRNet) and I have an issue with the training process. For that I wrote a PyTorch model that consists of 5 convolutional layers and 5 fully connected layers. My custom loss output changes with each new initialization of the network, but then for each epoch I obtain the same values for each batch. Therefore no training is happening. I narrowed the error down to the fact that no gradients are being computed.
Here is my network and forward pass
class pcrnetwork(nn.Module):
    """Pose-regression network for a pair of point clouds.

    Both clouds are pushed through a shared stack of five 1x1 ``Conv1d``
    layers, each cloud is max-pooled to a 1024-d global feature, the two
    features are concatenated (2048-d) and five linear layers regress a 6-d
    pose that is handed to ``appply_transformation``.
    """

    def __init__(self,):
        # nn.Module must be initialised before sub-modules are assigned,
        # otherwise the layers below cannot be registered as parameters.
        super().__init__()
        # Shared per-point feature extractor (kernel size 1).
        self.conv1 = nn.Conv1d(3, 64, 1, padding="valid")
        self.conv2 = nn.Conv1d(64, 64, 1, padding="valid")
        self.conv3 = nn.Conv1d(64, 64, 1, padding="valid")
        self.conv4 = nn.Conv1d(64, 128, 1, padding="valid")
        self.conv5 = nn.Conv1d(128, 1024, 1, padding="valid")
        # Regression head: 2048 = two concatenated 1024-d global features.
        self.fc1 = nn.Linear(2048, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 512)
        self.fc4 = nn.Linear(512, 256)
        self.fc5 = nn.Linear(256, 6)
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(64)
        self.bn3 = nn.BatchNorm1d(64)
        self.bn4 = nn.BatchNorm1d(128)
        self.bn6 = nn.BatchNorm1d(1024)
        self.bn7 = nn.BatchNorm1d(512)
        self.bn8 = nn.BatchNorm1d(512)
        self.bn9 = nn.BatchNorm1d(256)

    def forward1(self, input, input1, points):
        """Predict a pose from two clouds and apply it to ``input``.

        Args:
            input: first cloud, channels-first (dim 1 has size 3 to match
                ``conv1``); its points lie along dim 2.
            input1: second cloud with the same layout.
            points: number of points that belong to ``input`` along dim 2 of
                the concatenated tensor; the next ``points`` entries are
                treated as ``input1``.

        Returns:
            Whatever ``appply_transformation`` returns for the transposed
            ``input`` and the predicted 6-d pose.
        """
        # Stack both clouds along the point axis so they share one pass.
        point_cloud = torch.cat((input, input1), dim=2)
        net = Func.relu(self.bn1(self.conv1(point_cloud)))
        net = Func.relu(self.bn2(self.conv2(net)))
        net = Func.relu(self.bn3(self.conv3(net)))
        net = Func.relu(self.bn4(self.conv4(net)))
        net = Func.relu(self.conv5(net))
        net_s = net[:, :, 0:points]
        net_t = net[:, :, points:points * 2]
        # Max over all points of each cloud -> one global feature per cloud.
        pool = Func.max_pool1d(net_s, net_s.size(-1))
        pool2 = Func.max_pool1d(net_t, net_t.size(-1))
        global_feature = torch.cat((pool, pool2), 1)
        global_feature = torch.flatten(global_feature, start_dim=1)
        # fully connected part
        net = Func.relu(self.bn6(self.fc1(global_feature)))
        net = Func.relu(self.bn7(self.fc2(net)))
        net = Func.relu(self.bn8(self.fc3(net)))
        net = Func.relu(self.bn9(self.fc4(net)))
        # No activation on the last layer: a ReLU here would clamp every pose
        # component to >= 0 and yield zero gradient whenever it is inactive.
        pose = self.fc5(net)
        output = appply_transformation(torch.transpose(input, 1, 2), pose)
        return output
my training loop looks like this:
def train1():
losss = []
for epoch in range(1):
total_loss = 0.0
#poses = []
for idx, data in enumerate(train_loader,0):
x = data["source"] # shape= [32,2048,3]
y = data["target"]
x = torch.transpose(x,1,2)
x =
y = torch.transpose(y,1,2)
y =
output = model.forward1(x,y,2048)
y = torch.transpose(y,1,2)
loss = og_chamfer1(y,output)
And finally, here is the code for my loss function. The idea is to let the network predict 6 parameters (3 rotational, 3 translational) that get fed into my apply-transformation function. My actual loss (the Chamfer distance) is then calculated between the transformed source point cloud and the target point cloud.
def dist_vec(source, targ):
    """Return all pairwise squared Euclidean distances between two point sets.

    Uses the expansion ``|s - t|^2 = |s|^2 - 2 s.t + |t|^2`` so the whole
    matrix is produced by one matrix product.

    Args:
        source: 2-D tensor of N points, one point per row.
        targ: 2-D tensor of M points with the same number of columns.

    Returns:
        Tensor of shape (N, M); entry ``[i, j]`` is the squared distance
        between ``source[i]`` and ``targ[j]``.
    """
    cross = torch.matmul(targ, torch.transpose(source, 0, 1))  # (M, N)
    targ_sq = torch.sum(torch.square(targ), 1)  # (M,)
    source_sq = torch.sum(torch.square(source), 1)  # (N,)
    # source_sq broadcasts over rows of the (M, N) product; after the
    # transpose targ_sq broadcasts over rows of the (N, M) result.
    dist_matrix = torch.transpose(source_sq - 2 * cross, 0, 1) + targ_sq
    # Floating-point cancellation can leave tiny negative values for
    # (near-)coincident points; a squared distance is never negative.
    return torch.clamp(dist_matrix, min=0.0)
def og_chamfer1(sourc, targ):  # source =[32,2048,3]
    """Return the batch-averaged Chamfer distance between two cloud batches.

    For every pair ``(sourc[i], targ[i])`` the squared-distance matrix from
    ``dist_vec`` is reduced to the mean nearest-neighbour distance in both
    directions; the two means are added and the per-sample sums are averaged.

    Args:
        sourc: batch of point clouds, indexed along dim 0.
        targ: batch of point clouds paired with ``sourc`` by index.

    Returns:
        0-dim tensor holding the mean loss over the batch (NaN for an empty
        batch, as a mean over zero samples).
    """
    per_sample = []
    for src_cloud, targ_cloud in zip(sourc, targ):
        dist = dist_vec(src_cloud, targ_cloud)
        nearest_over_rows, _ = torch.min(dist, dim=0)
        nearest_over_cols, _ = torch.min(dist, dim=1)
        per_sample.append(torch.mean(nearest_over_rows) + torch.mean(nearest_over_cols))
    if not per_sample:
        return torch.tensor(float("nan"))
    # Stacking keeps the loss on the inputs' device/dtype and avoids in-place
    # writes into a pre-allocated default-device buffer.
    return torch.mean(torch.stack(per_sample))
All of these functions should work; I only post them for reference. I think the cause of param.grad being None lies somewhere in my apply-transformation function:
def rotate_cloud_by_angle_z(input, rotation_angle):
    """Rotate a point cloud about the z axis without leaving the autograd graph.

    The rotation matrix is assembled with ``torch.stack`` from the cosine and
    sine tensors themselves. Going through ``.item()`` and ``torch.tensor``
    would turn them into plain Python numbers and cut the gradient path from
    the output back to ``rotation_angle``.

    Args:
        input: tensor of shape (num_points, 3).
        rotation_angle: single-element tensor (or number) with the angle in
            radians.

    Returns:
        ``input @ R`` with ``R = [[cos, -sin, 0], [sin, cos, 0], [0, 0, 1]]``,
        on the device and in the dtype of ``input``.
    """
    # Follow the cloud's dtype/device; `.to` conversions stay differentiable.
    angle = torch.as_tensor(
        rotation_angle, dtype=input.dtype, device=input.device
    ).reshape(())
    cosval = torch.cos(angle)
    sinval = torch.sin(angle)
    zero = torch.zeros_like(cosval)
    one = torch.ones_like(cosval)
    rotation_matrix = torch.stack(
        [
            torch.stack([cosval, -sinval, zero]),
            torch.stack([sinval, cosval, zero]),
            torch.stack([zero, zero, one]),
        ]
    )
    product = torch.matmul(input, rotation_matrix)
    return product
def appply_transformation(datas, poses):
    """Apply a per-sample rigid transform to a batch of point clouds.

    For sample ``i`` the cloud is rotated about z, y and x by ``poses[i, 5]``,
    ``poses[i, 4]`` and ``poses[i, 3]`` (in that order) and then translated by
    ``poses[i, 0:3]``.

    The pose entries are passed on as slices of ``poses``; re-wrapping them in
    ``torch.tensor(...)`` would create new leaf tensors and stop gradients
    from reaching the network that produced the pose. Results are collected
    in a new tensor, so ``datas`` is not modified.

    NOTE(review): ``rotate_cloud_by_angle_y`` / ``_x`` and ``translation`` are
    defined elsewhere; they must likewise avoid ``.item()`` / ``torch.tensor``
    for gradients to flow through them.

    Args:
        datas: batch of clouds, indexed along dim 0.
        poses: tensor with one 6-element pose per sample.

    Returns:
        Tensor stacking the transformed clouds along dim 0.
    """
    transformed = []
    for cloud, pose in zip(datas, poses):
        cloud = rotate_cloud_by_angle_z(cloud, pose[5])
        cloud = rotate_cloud_by_angle_y(cloud, pose[4])
        cloud = rotate_cloud_by_angle_x(cloud, pose[3])
        cloud = translation(cloud, pose[:3])
        transformed.append(cloud)
    if not transformed:
        # Nothing to transform; hand the (empty) batch back unchanged.
        return datas
    return torch.stack(transformed)
From what I could find, one shouldn't use .item() or re-wrap tensors like x = torch.tensor(x), but I don't know how to change my apply-transformation function so that the gradient calculation works.
If anyone has any tips on that I would be super grateful!

Constant loss and accuracy in pytorch

I am training a model whose output and ground truth should be binary. It is an Inception-based two-stream model. The Inception architecture is used as the encoder, and the decoder is a custom model consisting of conv layers, batch normalization and upsampling, using tanh as the non-linearity. I have also tried ReLU, but still with no result.
Model is initializing at different values but not updating. My model's forward function is:
def forward(self, inp):
    """Run the encoder stages, the skip-connected decoder and the output head.

    Three intermediate activations are kept as skip connections: the output
    of ``conv3d_1a_7x7``, of ``mixed_3b`` and of ``mixed_4d``. They are passed
    to ``tconv6`` together with the final encoder output. The result goes
    through ``self.sigmoid`` and is permuted with ``(0, 1, 3, 4, 2)``.

    Returns:
        A pair holding the same permuted tensor twice.
    """
    def run(tensor, *stage_names):
        # Feed `tensor` through the named sub-modules one after another.
        for stage_name in stage_names:
            tensor = getattr(self, stage_name)(tensor)
        return tensor

    # Preprocessing
    skip1 = run(inp, "conv3d_1a_7x7")
    skip2 = run(
        skip1,
        "maxPool3d_2a_3x3", "dropout",
        "conv3d_2b_1x1", "conv3d_2c_3x3",
        "maxPool3d_3a_3x3", "dropout",
        "mixed_3b",
    )
    skip3 = run(
        skip2,
        "mixed_3c",
        "maxPool3d_4a_3x3", "dropout",
        "mixed_4b", "mixed_4c", "dropout",
        "mixed_4d",
    )
    encoded = run(
        skip3,
        "dropout",
        "mixed_4e", "mixed_4f",
        "maxPool3d_5a_2x2", "dropout",
        "mixed_5b", "mixed_5c", "dropout",
    )
    activated = self.sigmoid(self.tconv6(encoded, skip1, skip2, skip3))
    print("Before permutation", activated.shape)
    permuted = activated.permute(0, 1, 3, 4, 2)
    return permuted, permuted
My train function is:
misc,out_logits[stream] = models[stream](data[stream])
out_softmax = torch.nn.functional.softmax(out_logits[stream], 1).requires_grad_()
val, preds = torch.max(out_logits[stream].data, 1)
preds =, dtype=torch.float)
gt = torch.round(gt)
gt_avg = torch.mean(gt)
gt[gt>gt_avg] = 1
gt[gt<=gt_avg] = 0
out_logits[stream] = out_logits[stream].squeeze(1)
losses[stream] = criterion(preds.cpu(), gt.cpu()).requires_grad_()
if phase == 'train':
running_losses[stream] += losses[stream].item() * data[stream].shape[0]
running_corrects[stream] += torch.sum(val.cpu() ==
correct_t = torch.sum(preds==gt_c).item()
total_t = gt_c.shape[0]*gt_c.shape[1]*gt_c.shape[2]*gt_c.shape[3]
acc_epc = 100*correct_t/total_t
for scheduler in schedulers.values():
My loss and accuracy is always constant shown here
I have tried using different optimizers like SGD, Adam , RMSprop. Furthermore, I have tried tuning the hyperparameters but model is not converging. What am I missing?
You are passing the wrong variable into the loss function if you are using cross-entropy. Change preds to out_logits[stream]; there is also no need for .cpu() or requires_grad_().
losses[stream] = criterion(out_logits[stream], gt)
Also, you performed argmax for preds. It's not differentiable regardless the loss function you used.

Unexpected shape of output from raw_rnn and how to inspect weights in raw_rnn

I have a simple code below for testing a RNN cell by feeding previous output as current input.
I want to do this after training.
When I call
tf.compat.v1.nn.raw_rnn(cell, rnn_loop)
after training I want it to use the weights that were achieved in training using another
tf.compat.v1.nn.raw_rnn(cell, rnn_loop)
Will the weights be the same or will the weights for raw_rnn during testing be initialized from zero? I will not run I want know if I can safely call
tf.compat.v1.nn.raw_rnn(cell, rnn_loop) twice and still be using the same weights.
I also want to know how to inspect the trained weight values? so that I can confirm this.
The shape of rnn_outputs_tensor is (None,64,128) but I am expecting (10,64,128) because there are 10 steps (HORIZON) right?
import tensorflow as tf
state_size = 128
cell = tf.compat.v1.nn.rnn_cell.BasicRNNCell(state_size)
class RnnLoop:
    """``loop_fn`` for ``raw_rnn`` that feeds each output back in as the next input.

    At the first call (``cell_output is None``) the loop starts from an
    all-zero input and the supplied initial state; afterwards the previous
    cell output becomes the next input and the cell state is carried over.
    """

    def __init__(self, initial_state, cell):
        self.initial_state = initial_state
        self.cell = cell

    def __call__(self, time, cell_output, cell_state, loop_state):
        """Return the 5-tuple ``raw_rnn`` expects from its loop function."""
        emit_output = cell_output  # == None for time == 0
        if cell_output is None:  # time == 0
            next_input = tf.fill([BATCH_SIZE, state_size], 0.0)
            next_cell_state = self.initial_state
        else:
            # Without this branch the time-0 values above would be
            # overwritten with `None` before being returned.
            next_input = cell_output
            next_cell_state = cell_state
        elements_finished = (time >= HORIZON)
        next_loop_state = None
        return elements_finished, next_input, next_cell_state, emit_output, next_loop_state
initial_state_tensor = tf.zeros((BATCH_SIZE,state_size),dtype=tf.float32)
rnn_loop = RnnLoop(initial_state=initial_state_tensor, cell=cell)
rnn_outputs_tensor_array, _, _ = tf.compat.v1.nn.raw_rnn(cell, rnn_loop)
rnn_outputs_tensor = rnn_outputs_tensor_array.stack()
var = [v for v in tf.compat.v1.trainable_variables()]

Transformer PyTorch Error- ValueError: too many values to unpack (expected 2)

I am having issues getting my model to run. I am not sure which model to use in the translate_sentence function, I have tried model.transformer, model.encoder_de, etc. It is based off of the Transformer class and the forward() function I believe but I am getting a type error. These are the directions:
As in the forward(self, src, tgt) function of the
TransformerModel class, you need to create the appropriate
mask and encode the source sentence (just once).
You also need to create the appropriate mask and encode the
output sentence for sequential predictions. Unlike the source,
for every iteration, you need to re-encode the previous output and
pass both the source sentence and previous output into the
from torch.nn import Transformer
class TransformerModel(nn.Module):
    """Sequence-to-sequence model built around ``torch.nn.Transformer``.

    Source and target token ids are embedded separately, scaled by
    ``sqrt(ninp)``, given positional encodings, passed through the
    transformer and projected to target-vocabulary scores.
    """

    def __init__(self, ntoken_in, ntoken_out, ninp, nhead, npf_dim, nlayers, src_pad_idx, trg_pad_idx, dropout=0.5):
        super(TransformerModel, self).__init__()
        # --------------- param -----------------
        # ntoken_in: number of rows of the source embedding table (source vocabulary size)
        # ntoken_out: number of rows of the target embedding table and size of the output layer
        # ninp: the number of expected features in the encoder/decoder inputs
        # nhead: the number of multi-head attention heads
        # npf_dim: the dimension of the feedforward layer
        # nlayers: number of encoder layers and of decoder layers
        # src_pad_idx: the token id used for padding in the source language
        # trg_pad_idx: the token id used for padding in the target language
        # ----------------------------------------
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        self.transformer = Transformer(d_model=ninp, nhead=nhead, num_encoder_layers=nlayers, num_decoder_layers=nlayers,
                                       dim_feedforward=npf_dim, dropout=dropout, activation='relu')
        self.encoder_en = nn.Embedding(ntoken_in, ninp)  # tok_embedding for input
        self.encoder_de = nn.Embedding(ntoken_out, ninp)  # tok_embedding for output
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken_out)
        self.src_pad_idx = src_pad_idx
        self.tgt_pad_idx = trg_pad_idx

    def _generate_src_key_mask(self, src):
        """Return the source key-padding mask, transposed to (batch, seq).

        Positions holding ``True`` (padding tokens) are ignored by attention;
        positions holding ``False`` are left unchanged.
        """
        src_mask = (src == self.src_pad_idx)
        return src_mask.T

    def _generate_tgt_mask(self, tgt, sz):
        """Return ``(attn_mask, key_padding_mask)`` for the target sequence.

        ``attn_mask`` is an additive ``(sz, sz)`` float mask with ``-inf``
        above the diagonal, so a position cannot attend to later positions.
        The key-padding mask marks padding tokens and is transposed to
        (batch, seq).
        """
        tgt_key_mask = tgt == self.tgt_pad_idx
        # Additive float mask: 0 on and below the diagonal, -inf above it.
        attn_mask = torch.triu(
            torch.full((sz, sz), float('-inf'), device=tgt.device), diagonal=1
        )
        return attn_mask, tgt_key_mask.T

    def init_weights(self):
        """Re-initialise embedding and output weights uniformly in [-0.1, 0.1].

        NOTE(review): the original statement was truncated to three
        ``(..., initrange)`` tails; the three weight tensors below are the
        assumed targets -- confirm against the original source.
        """
        initrange = 0.1
        nn.init.uniform_(self.encoder_en.weight, -initrange, initrange)
        nn.init.uniform_(self.encoder_de.weight, -initrange, initrange)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, tgt):
        """Return target-vocabulary scores for every target position.

        ``src`` and ``tgt`` hold token ids with the sequence along dim 0
        (the target mask is sized with ``tgt.size(0)``).
        """
        # src
        src_key_mask = self._generate_src_key_mask(src)
        src = self.encoder_en(src) * math.sqrt(self.ninp)  # token ids -> feature space: s*b --> s*b*e
        src = self.pos_encoder(src)  # add the positional features
        # tgt
        tgt_mask, tgt_key_mask = self._generate_tgt_mask(tgt, tgt.size(0))
        tgt = self.encoder_de(tgt) * math.sqrt(self.ninp)
        tgt = self.pos_encoder(tgt)
        output = self.transformer(src, tgt, tgt_mask=tgt_mask,
                                  src_key_padding_mask=src_key_mask,
                                  tgt_key_padding_mask=tgt_key_mask)
        output = self.decoder(output)
        return output
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position features to a sequence-first input.

    A table of shape ``(max_len, 1, d_model)`` is precomputed and stored as
    the non-trainable buffer ``pe``; ``forward`` adds its first
    ``x.size(0)`` rows to ``x`` and applies dropout.
    """

    # The positional encoding as described in the paper
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0).transpose(0, 1)  # -> (max_len, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:len])`` where ``len`` is ``x.size(0)``."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
# Here we intialize our model
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HID_DIM = 256
N_PF_DIM = 512
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
model =TransformerModel(ntoken_in = INPUT_DIM, ntoken_out=OUTPUT_DIM, ninp=HID_DIM,
nhead=N_HEADS, npf_dim=N_PF_DIM, nlayers=N_LAYERS,
src_pad_idx=SRC_PAD_IDX, trg_pad_idx=TRG_PAD_IDX, dropout=DROPOUT).to(device)
def count_parameters(model: nn.Module):
    """Return the total number of elements in ``model``'s trainable parameters.

    A parameter counts as trainable when its ``requires_grad`` flag is set.
    """
    trainable_total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        trainable_total += param.numel()
    return trainable_total
print(f'The model has {count_parameters(model):,} trainable parameters')
def initialize_weights(m):
if hasattr(m, 'weight') and m.weight.dim() > 1:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):
    """Greedily translate one sentence with a ``TransformerModel``.

    The source sentence is embedded and encoded once. The target is then
    grown one token at a time: at every step the tokens produced so far are
    re-embedded and decoded against the cached encoder output, and the
    highest-scoring next token is appended. Decoding stops after ``max_len``
    steps or as soon as the end-of-sentence token is produced.

    ``model.encoder_en`` / ``model.encoder_de`` are embedding layers, not
    recurrent encoders, so they return one tensor and cannot be unpacked into
    ``(hidden, cell)``; the transformer's encoder/decoder are used instead.

    Args:
        sentence: a string (tokenised with spaCy's ``'de'`` model) or an
            already tokenised list of strings.
        src_field: field whose ``vocab.stoi`` maps source tokens to ids.
        trg_field: field providing ``init_token``, ``eos_token`` and the
            target ``vocab`` (``stoi`` / ``itos``).
        model: model exposing ``encoder_en``, ``encoder_de``, ``pos_encoder``,
            ``transformer``, ``decoder``, ``ninp`` and the two mask helpers.
        device: device the index tensors are created on.
        max_len: maximum number of tokens to generate.

    Returns:
        List of predicted target tokens without the leading init token (the
        end-of-sentence token is included when it was produced).
    """
    import math

    # Inference only: switch off dropout.
    model.eval()

    if isinstance(sentence, str):
        nlp = spacy.load('de')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]
    # NOTE(review): the source is not wrapped in init/eos tokens here;
    # confirm that this matches how the training data was prepared.
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    src_tensor = torch.tensor(src_indexes, dtype=torch.long, device=device).unsqueeze(1)

    scale = math.sqrt(model.ninp)
    with torch.no_grad():
        # Encode the source once; the result is reused at every step.
        src_key_mask = model._generate_src_key_mask(src_tensor)
        src_emb = model.pos_encoder(model.encoder_en(src_tensor) * scale)
        memory = model.transformer.encoder(src_emb, src_key_padding_mask=src_key_mask)

    # create a list to hold the output sentence, initialized with an <sos> token
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
    eos_index = trg_field.vocab.stoi[trg_field.eos_token]
    for _ in range(max_len):
        trg_tensor = torch.tensor(trg_indexes, dtype=torch.long, device=device).unsqueeze(1)
        with torch.no_grad():
            tgt_mask, tgt_key_mask = model._generate_tgt_mask(trg_tensor, trg_tensor.size(0))
            tgt_emb = model.pos_encoder(model.encoder_de(trg_tensor) * scale)
            decoded = model.transformer.decoder(
                tgt_emb, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_key_mask
            )
            # Only the last position predicts the next token.
            output = model.decoder(decoded[-1])
        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)
        if pred_token == eos_index:
            break
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    return trg_tokens[1:]
#getting error here
# Translate one training example and print it next to its reference.
example_idx = 18
example = vars(train_data.examples[example_idx])
src = example['src']
trg = example['trg']
print(f'src = {src}')
print(f'trg = {trg}')
# Field order follows the signature (sentence, src_field, trg_field, ...):
# the source sentence has to be indexed with the source vocabulary.
translation = translate_sentence(src, SRC, TRG, model, device)
print(f'predicted trg = {translation}')

Adding layers and bidirectionality to custom LSTM cell in pytorch

I use a very custom LSTM-cell inspired by
I use it to look at intermediate gating values. My question is, how would I expand this class to have an option for adding more layers and for adding bidirectionality? Should it be wrapped in a new class or added in the present one?
class Dim(IntEnum):
    """Symbolic names for the tensor axes passed as ``dim`` arguments."""

    batch, seq = 0, 1
class simpleLSTM(nn.Module):
def __init__(self, input_sz: int, hidden_sz: int):
self.input_size = input_sz
self.hidden_size = hidden_sz
# input gate
self.W_ii = Parameter(torch.Tensor(input_sz, hidden_sz))
self.W_hi = Parameter(torch.Tensor(hidden_sz, hidden_sz))
self.b_i = Parameter(torch.Tensor(hidden_sz))
# forget gate
self.W_if = Parameter(torch.Tensor(input_sz, hidden_sz))
self.W_hf = Parameter(torch.Tensor(hidden_sz, hidden_sz))
self.b_f = Parameter(torch.Tensor(hidden_sz))
# ???
self.W_ig = Parameter(torch.Tensor(input_sz, hidden_sz))
self.W_hg = Parameter(torch.Tensor(hidden_sz, hidden_sz))
self.b_g = Parameter(torch.Tensor(hidden_sz))
# output gate
self.W_io = Parameter(torch.Tensor(input_sz, hidden_sz))
self.W_ho = Parameter(torch.Tensor(hidden_sz, hidden_sz))
self.b_o = Parameter(torch.Tensor(hidden_sz))
self.out = nn.Linear(hidden_sz, len(TRG.vocab))
def init_weights(self):
for p in self.parameters():
if >= 2:
def forward(self, x, init_states=None ):
"""Assumes x is of shape (batch, sequence, feature)"""
seq_sz, bs, = x.size()
hidden_seq = []
prediction = []
if init_states is None:
h_t, c_t = torch.zeros(self.hidden_size).to(x.device), torch.zeros(self.hidden_size).to(x.device)
h_t, c_t = init_states
for t in range(seq_sz): # iterate over the time steps
x_t = x[t, :].float()
i_t = torch.sigmoid(x_t # self.W_ii + h_t # self.W_hi + self.b_i)
f_t = torch.sigmoid(x_t # self.W_if + h_t # self.W_hf + self.b_f)
g_t = torch.tanh(x_t # self.W_ig + h_t # self.W_hg + self.b_g)
o_t = torch.sigmoid(x_t # self.W_io + h_t # self.W_ho + self.b_o)
c_t = f_t * c_t + i_t * g_t
h_t = o_t * torch.tanh(c_t)
pred_t = self.out(h_t.unsqueeze(Dim.batch))
#pred_t = F.softmax(pred_t)
hidden_seq =, dim=Dim.batch)
prediction =, dim=Dim.batch)
# reshape from shape (sequence, batch, feature) to (batch, sequence, feature)
hidden_seq = hidden_seq.transpose(Dim.batch, Dim.seq).contiguous()
prediction = prediction.transpose(Dim.batch, Dim.seq).contiguous()
return prediction, hidden_seq, (h_t, c_t)
I call it and train using the following as an example.
lstm = simpleLSTM(1, 100)
hidden_size = lstm.hidden_size
optimizer = optim.Adam(lstm.parameters())
h_0, c_0 = (torch.zeros(hidden_size, requires_grad=True),
torch.zeros(hidden_size, requires_grad=True))
grads = []
h_t, c_t = h_0, c_0
for epoch in range(N_EPOCHS):
epoch_loss = 0
for i, batch in enumerate(train):
src, src_len = batch.src
trg = batch.trg
trg = trg.view(-1)
predict, output, hidden_states = lstm(src)
predict = predict.t().unsqueeze(1)
predict= predict.view(-1, predict.shape[-1])
loss = criterion(predict,trg)
epoch_loss += loss.item()
The easiest would be to create another module (say Bidirectional) and pass any cell you want to it.
Implementation itself is quite easy to do. Notice that I'm using concat operation for joining bi-directional output, you may want to specify other modes like summation etc.
Please read the comments in the code below, you may have to change it appropriately.
import torch
class Bidirectional(torch.nn.Module):
    """Run a cell on a sequence and on its reverse and concatenate the results.

    The wrapped ``cell`` must be callable as ``cell(x, init_states)`` and
    return ``(prediction, hidden_seq, (h_t, c_t))``. The same cell instance
    (and therefore the same weights) is used for both directions.

    NOTE(review): the outputs of the reversed pass are concatenated as
    returned, i.e. still in reversed time order; flip them back along the
    sequence axis if per-time-step alignment is required.
    """

    def __init__(self, cell):
        # Required before a sub-module can be assigned as an attribute.
        super().__init__()
        self.cell = cell

    def forward(self, x, init_states=None):
        """Return concatenated ``(prediction, hidden_seq, (h_t, c_t))``."""
        prediction, hidden_seq, (h_t, c_t) = self.cell(x, init_states)
        # Assuming sequence is first dimension, otherwise change 0 appropriately
        # Reverses sequences so the LSTM cell acts on the reversed sequence
        backward_prediction, backward_hidden_seq, (backward_ht, backward_ct) = self.cell(
            torch.flip(x, (0,)), init_states
        )
        return (
            # Assuming you transpose so it has (batch, seq, features) dimensionality
            torch.cat((prediction, backward_prediction), 2),
            torch.cat((hidden_seq, backward_hidden_seq), 2),
            # Assuming it has (batch, features) dimensionality
            (
                torch.cat((h_t, backward_ht), 1),
                torch.cat((c_t, backward_ct), 1),
            ),
        )
When it comes to multiple layers, you could do something similar in principle:
import torch
class Multilayer(torch.nn.Module):
    """Stack several cells so each one consumes the previous one's hidden sequence.

    Every cell must be callable as ``cell(inputs, init_states)`` and return
    ``(prediction, hidden_seq, (h_t, c_t))``. The same ``init_states`` is
    handed to every layer.
    """

    def __init__(self, *cells):
        # Required before the ModuleList can be registered on this module.
        super().__init__()
        if not cells:
            raise ValueError("Multilayer needs at least one cell")
        self.cells = torch.nn.ModuleList(cells)

    def forward(self, x, init_states=None):
        """Return the last layer's ``(prediction, hidden_seq, (h_t, c_t))``."""
        inputs = x
        for cell in self.cells:
            prediction, hidden_seq, (h_t, c_t) = cell(inputs, init_states)
            # The hidden sequence of this layer is the input of the next.
            inputs = hidden_seq
        return prediction, hidden_seq, (h_t, c_t)
Please note you have to pass created cell objects into Multilayer e.g.:
# For three layers of LSTM, each needs features to be set up correctly
multilayer_LSTM = Multilayer(LSTM(), LSTM(), LSTM())
You may also pass classes instead of instances into constructor and create those inside Multilayer (so hidden_size matches automatically), but those ideas should get you started.
