RuntimeError: mat1 and mat2 shapes cannot be multiplied when use BiGRU? - python

can someone figure out what cause this error? I am working on NMT and BiGRU to transfer grammatically incorrect Arabic sentence into grammatically correct ones. I'm not sure how BiGRU should work because the original code was GRU only.
The problem started when I converted the GRU to bidirectional.
This is the code:
#Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE,ENC_DROPOUT)
encoder.to(device)
#obtain one sample from the data iterator
it = iter(dataset)
x, y, x_len = next(it)
print("Input: ", x.shape)
print("Output: ", y.shape)
#sort the batch first to be able to use with pac_pack_sequence
xs, ys, lens = sort_batch(x, y, x_len)
enc_output, enc_hidden = encoder(xs.to(device), lens, device)
print("Encoder Output: ", enc_output.shape) # batch_size X max_length X enc_units
print("Encoder Hidden: ", enc_hidden.shape) # batch_size X enc_units (corresponds to the last state)
decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE, DEC_DROPOUT)
decoder = decoder.to(device)
#print(enc_hidden.squeeze(0).shape)
dec_hidden = enc_hidden#.squeeze(0)
dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * BATCH_SIZE)
print("Decoder Input: ", dec_input.shape)
print("--------")
for t in range(1, y.size(1)):
print(dec_input.shape)
print(dec_hidden.shape)
print(enc_output.shape)
# enc_hidden: 1, batch_size, enc_units
# output: max_length, batch_size, enc_units
predictions, dec_hidden, _ = decoder(dec_input.to(device), dec_hidden.to(device), enc_output.to(device))
print("Prediction: ", predictions.shape)
print("Decoder Hidden: ", dec_hidden.shape)
#loss += loss_function(y[:, t].to(device), predictions.to(device))
dec_input = y[:, t].unsqueeze(1)
print(dec_input.shape)
break
And this is the Encoder code:
def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, dropout):
super(Encoder, self).__init__()
self.batch_sz = batch_sz
self.enc_units = enc_units
self.vocab_size = vocab_size
self.embedding_dim = embedding_dim
self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
self.gru = nn.GRU(self.embedding_dim, self.enc_units,dropout = dropout, bidirectional=True)
self.dropout = nn.Dropout(dropout)
def forward(self, x, lens, device):
# x: batch_size, max_length
# x: batch_size, max_length, embedding_dim
x = self.embedding(x).to(device)
# x transformed = max_len X batch_size X embedding_dim
# x = x.permute(1,0,2)
x = pack_padded_sequence(x, lens) # unpad
self.hidden = self.initialize_hidden_state(device)
# output: max_length, batch_size, enc_units
# self.hidden: 1, batch_size, enc_units
output, self.hidden = self.gru(x, self.hidden) # gru returns hidden state of all timesteps as well as hidden state at last timestep
# pad the sequence to the max length in the batch
output, _ = pad_packed_sequence(output)
return output, self.hidden
def initialize_hidden_state(self,device):
return torch.zeros((2, self.batch_sz, self.enc_units)).to(device)```

Related

How to plot Receptive Fields, for a CNN/fashionMNIST?

I created my CNN with PyTorch Lightning, and I am actually looking for plotting the Receptive Fields.
Do you have any suggestions about it?
I look for different solutions here and there, but I actually can't make them synergize with PyTorch Lightning.
Is it possible to visualize the Receptive fields directly inside Tensorboard?
I'll share with you my Dataset:
train_dataset = torchvision.datasets.FashionMNIST('classifier_data', train=True, download=True, transform=transforms.ToTensor())
train, val = train_test_split(train_dataset, test_size = .2)
train_loader = DataLoader(train, batch_size = 32)
val_loader = DataLoader(train, batch_size = 32)
test_dataset = torchvision.datasets.FashionMNIST('classifier_data', train=False, download=True, transform=transforms.ToTensor())
test_loader = DataLoader(test_dataset, batch_size = 32)
and CNN:
def __init__(self, dropout, learn_rate, momentum, weight_decay, optimizer):
#def __init__(self, dropout, learn_rate, weight_decay, optimizer):
super().__init__()
self.conv1 = nn.Conv2d(in_channels = 1, out_channels = 6, kernel_size = 5)
self.conv2 = nn.Conv2d(in_channels = 6, out_channels = 12 , kernel_size = 5)
self.fc1 = nn.Linear(in_features = 12*4*4, out_features = 120)
self.fc2 = nn.Linear(in_features = 120, out_features = 60)
self.out = nn.Linear(in_features = 60, out_features = 10)
self.do = nn.Dropout(dropout) #for overfitting issues
self.loss = nn.CrossEntropyLoss()
self.accuracy = torchmetrics.Accuracy()
self.learn_rate = learn_rate
self.momentum = momentum #with Adam we don't have momentum. To Check best Optimizer with Optune, please comment this line.
self.weight_decay = weight_decay
self.optimizer = optimizer
self.train_loss = []
self.val_loss = []
self.train_acc = []
self.test_acc = []
#plot into tensorboard
log_dir = pathlib.Path.cwd() / "lightning_logs"
self.writer = SummaryWriter(log_dir)
#forward step
#I add each layer to the histogram. It's plotted into tensorboard
def forward(self, x, additional_out=False):
#conv1
x = self.conv1(x)
self.writer.add_histogram("First convolutional layer CNN", x)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size = 2, stride = 2)
#conv2
x = self.conv2(x)
self.writer.add_histogram("Second convolutional layer CNN", x)
x = F.relu(x)
x = F.max_pool2d(x, kernel_size = 2, stride = 2)
#fuly connected 1
x = x.reshape(-1, 12*4*4)
x = self.fc1(x)
self.writer.add_histogram("First linear layer CNN", x)
x = F.relu(x)
x = self.do(x)
#fully connected 2
x=self.fc2(x)
self.writer.add_histogram("Second linear layer CNN", x)
x = F.relu(x)
x = self.do(x)
#output
x = self.out(x)
self.writer.add_histogram("Output layer CNN", x)
return x
#optimizer
def configure_optimizers(self):
#optimizer = self.optimizer(self.parameters(), lr = self.learn_rate, momentum = self.momentum, weight_decay = self.weight_decay)
optimizer = self.optimizer(self.parameters(), lr = self.learn_rate, weight_decay = self.weight_decay)
return optimizer
#training step
def training_step(self, batch, batch_idx):
x, y = batch
b = x.size(0)
x = x.view(b, -1, 28, 28)
logit = self(x)
J = self.loss(logit, y) #loss
#self.train_loss.append(J) #no need to append
acc = self.accuracy(logit, y) #accuracy
#self.train_acc.append(acc) #no need to append
self.log("train_loss_cnn", J.item())
self.log("train_acc_cnn", acc.item())
return {'loss': J}
#Since I used Tensorboard, it don't have to append to loss
def test_step(self, batch, batch_idx):
p, q = batch
b = p.size(0)
p = p.view(b, -1, 28, 28)
logit = self(p)
J = self.loss(logit, q) #loss
acc_test = self.accuracy(logit, q) #accuracy
#self.train_acc.append(acc_test) #no need to append
#self.train_loss.append(J) #no need to append
self.log("test_acc_cnn", acc_test.item())
self.log("test_loss_cnn", J.item())
def validation_step(self, batch, batch_idx=None):
u, v = batch
b = u.size(0)
u = u.view(b, -1, 28, 28)
logit = self(u)
J = self.loss(logit, v) #loss
#self.val_loss.append(J) #no need to append
acc_val = self.accuracy(logit, v) #accuracy
#self.train_acc.append(acc_val) #no need to append
self.log("val_loss_cnn", J.item())
self.log("val_acc_cnn", acc_val.item())
return {"loss": J, "pred": logit, "target": v}
#Once saves from validation step, I take with me the returned elements, and I can plot the Confusion Matrix inside Tensorboard
def validation_epoch_end(self, outputs):
preds = torch.cat([tmp['pred'] for tmp in outputs])
targets = torch.cat([tmp['target'] for tmp in outputs])
conf_mat = confusion_matrix(preds, targets, num_classes=10)
df_cm = pd.DataFrame(conf_mat.numpy(), index = range(10), columns=range(10))
plt.figure(figsize = (10,7))
fig_ = sns.heatmap(df_cm, annot=True, cmap='Spectral').get_figure()
plt.close(fig_)
self.logger.experiment.add_figure("Confusion matrix CNN", fig_, self.current_epoch)

TypeError: forward() missing 1 required positional argument: 'hidden' - LSTM Model

I'm a beginner to the LSTM and PyTorch. I try to create a model for a bursty traffic prediction scenario. This is kind of a overfit model. First, It matches all the next values using the x_data as inputs and targets. You can identify the shape of the x_data by looking at the main. Then try to predict the whole traffic shape using the first 100 seed values. But this model gives me an error when running "outputs, hidden = model(inputs)". please help me to solve this error.
Here is my complete code,
import numpy as np
import torch
import torch,torch.nn as nn
from torch import Tensor
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from scipy.signal import savgol_filter
is_cuda = torch.cuda.is_available()
if is_cuda:
device = torch.device("cuda")
print("GPU is available")
else:
device = torch.device("cpu")
print("GPU not available, CPU used")
class FPredRNN(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, dropout_val = 0.1):
super(FPredRNN, self).__init__()
self.input_size = input_size
self.nh = hidden_size
self.nl = num_layers
self.lstm = nn.LSTM(self.input_size, self.nh, self.nl, dropout = dropout_val)
self.dropout = nn.Dropout(dropout_val)
self.linear = nn.Linear(self.nh, 1)
def forward(self, x, hidden, steps = 1000, eval = False):
predictions = []
batch_size = x.size(0)
if(hidden.size(0) != batch_size):
self.init_hidden(batch_size)
l_out, hidden = self.lstm(x, hidden)
l_out = l_out.contiguous().view(-1, self.nh)
out = self.dropout(l_out)
out = self.linear(out)
#out = out.view(batch_size, -1)
#out = out[:,-1]
if(eval):
eval_input = out[-1:]
for i in range(steps):
lstm_out, hidden = self.lstm(eval_input, hidden)
linear_out = self.linear(lstm_out)
predictions += [linear_out]
eval_input = linear_out
out = torch.stack(predictions).squeeze()
return out, hidden
def init_hidden(self, batch_size):
weight = next(self.parameters()).data
hidden = (weight.new(self.nl, batch_size, self.nh).zero_().to(device), weight.new(self.nl, batch_size, self.nh).zero_().to(device))
#hidden = ((self.nl, batch_size, self.nh).zero_().to(device), (self.nl, batch_size, self.nh).zero_().to(device))
return hidden
if __name__ == "__main__":
x_data = np.empty((1, 2000))
y_data = np.empty((1, 1))
for n in [30000]:
traffic_generator = GenerateTraffic()
bursty_traffic, a_t = traffic_generator.create_bursty_traffic(n_d=n)
detected, attempted = traffic_generator.simulate_bursty_traffic_arrivals(bursty_traffic, backoff_bool= True)
smooth_x = savgol_filter(detected, 97, 2)
x_data[(n//10000)-3] = smooth_x
inputs = x_data[:, :1999]
targets = x_data[:, 1:2000]
inputs = torch.from_numpy(inputs)
targets = torch.from_numpy(targets)
print(inputs.size(1))
print(inputs.size(0))
#print(hidden.size(0))
model = FPredRNN(input_size = inputs.size(1), hidden_size = 1100, num_layers = 2, dropout_val = 0.1)
model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)
# Train Model
n_epochs = 2
for epoch in range(1, n_epochs + 1):
optimizer.zero_grad()
inputs.to(device)
outputs, hidden = model(inputs)
loss = criterion(outputs, targets)
loss.backward()
optimizer.step()
# Test Model
seed_lenght = 100
seed = inputs[:seed_lenght]
outt = model(seed, steps=1000, eval = True)
test_out = torch.cat((seed.squeeze(), outt))
Error code
TypeError Traceback (most recent call last)
<ipython-input-2-0e63f8c64103> in <module>()
101 optimizer.zero_grad()
102 inputs.to(device)
--> 103 outputs, hidden = model(inputs)
104 loss = criterion(outputs, targets)
105 loss.backward()
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
TypeError: forward() missing 1 required positional argument: 'hidden'
The forward method needs another value inputted, hidden. I think what you want to do is:
hidden = model.init_hidden()
outputs, hidden = model(inputs, hidden)
this way the first input for hidden would just be an tensor full of zeros, and the next hidden inputs would be the inputs of the previous letters.
You don't have to provide hidden for recurrent layers.
hidden should only be passed to recurrent layers if element from next batch is a continuation of previous element
It is almost never the case
hidden is implicitly initialized with zeros, documentation here, please read it before using
Given that, your model can be (and probably can be simplified further, what are you trying to achieve with nn.Linear?):
class FPredRNN(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, dropout_val=0.1):
super(FPredRNN, self).__init__()
self.input_size = input_size
self.nh = hidden_size
self.nl = num_layers
self.lstm = nn.LSTM(self.input_size, self.nh, self.nl, dropout=dropout_val)
self.dropout = nn.Dropout(dropout_val)
self.linear = nn.Linear(self.nh, 1)
def forward(self, x, steps=1000):
predictions = []
l_out, hidden = self.lstm(x)
l_out = l_out.contiguous().view(-1, self.nh)
out = self.dropout(l_out)
out = self.linear(out)
# Not sure what is going on here, but eval is an attribute of nn.Module
if self.eval:
eval_input = out[-1:]
for i in range(steps):
lstm_out, hidden = self.lstm(eval_input, hidden)
linear_out = self.linear(lstm_out)
predictions += [linear_out]
eval_input = linear_out
out = torch.stack(predictions).squeeze()
return out
Please read nn.LSTM documentation in-depth, it will help with recurrent layers.

Pytorch LSTM- VAE Sentence Generator: RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

I am trying to make a LSTM VAE as a learning stage for future work with pytorch.
I managed to get it to work on some small tester data but now that I want to run it on my actual data I am continuously getting this error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [10, 40]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
Any help on how to solve this error and generally just make my code more efficient would be great!
Here is the appropriate section of the traceback:
File "<ipython-input-16-7fe0e9e30e5d>", line 190, in <module>
rec_loss, kl_loss = train_batch(x,G_inp,epoch,train=True)
File "<ipython-input-16-7fe0e9e30e5d>", line 166, in train_batch
reconstruction, hidden, kld = model(x, G_inp, None, None)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "<ipython-input-16-7fe0e9e30e5d>", line 93, in forward
mu, logvar, z = self.encoder(x)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "<ipython-input-16-7fe0e9e30e5d>", line 37, in forward
out1, self.hidden = self.lstm(x, self.hidden)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/rnn.py", line 582, in forward
self.dropout, self.training, self.bidirectional, self.batch_first)
(Triggered internally at /pytorch/torch/csrc/autograd/python_anomaly_mode.cpp:104.)
allow_unreachable=True) # allow_unreachable flag
The code is below: (excuse all the .clone(), I read that this could be a solution so I was testing it out everywhere with no help)
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
from keras.preprocessing.text import Tokenizer
from keras import preprocessing
import torch.nn.functional as F
import math
import random
#encoder
class Encoder(nn.Module):
def __init__(self,embedding_dim, vocab_size, n_layers_E, n_hidden_E, dim_z, n_hidden_G, batch_size):
super(Encoder,self).__init__()
self.n_layers_E = n_layers_E
self.n_hidden_E = n_hidden_E
self.batch_size = batch_size
self.dim_z = dim_z
self.lstm = nn.LSTM(input_size = embedding_dim, hidden_size = n_hidden_E, num_layers = n_layers_E, batch_first=True, bidirectional = True)
self.hidden_to_mu = nn.Linear(2*n_hidden_E,dim_z)
self.hidden_to_logvar = nn.Linear(2*n_hidden_G, dim_z)
self.hidden = (torch.zeros(2*n_layers_E, batch_size, n_hidden_E),torch.zeros(2*n_layers_E, batch_size, n_hidden_E))
def forward(self,x):
batch_size, n_seq, n_embed = x.size()
#batch_size, n_seq = x.size()
out1, self.hidden = self.lstm(x, self.hidden)
e_hidden = self.hidden[0].view(batch_size, 2 * self.n_hidden_E).clone()
#e_hidden = torch.cat(list(hidden),dim = 0)
mu = self.hidden_to_mu(e_hidden)
logvar = self.hidden_to_logvar(e_hidden)
epsilon = torch.randn([batch_size, self.dim_z])
z = mu + torch.exp(logvar*0.5)*epsilon
return mu, logvar, z
class Generator(nn.Module):
def __init__(self,n_hidden_G,n_layers_G, embedding_dim, dim_z, vocab_size , batch_size):
super(Generator,self).__init__()
self.n_hidden_G = n_hidden_G
self.n_layers_G = n_layers_G
self.n_z = dim_z
self.batch_size = batch_size
self.LSTM = nn.LSTM(input_size = embedding_dim + dim_z, hidden_size = n_hidden_G, num_layers = n_layers_G, batch_first = True)
self.fc = nn.Linear(n_hidden_G, vocab_size)
self.hidden = (torch.zeros(self.n_layers_G, batch_size, self.n_hidden_G)
,torch.zeros(self.n_layers_G, batch_size, self.n_hidden_G))
def forward(self,x,z, g_hidden = None):
batch_size,n_seq, n_embed = x.size()
#batch_size, n_seq= x.size()
z = torch.cat([z]*n_seq,1).view(batch_size, n_seq, self.n_z)
x = torch.cat([x,z], dim = 2)
if g_hidden is None: #if we are validating
self.hidden = (torch.zeros(self.n_layers_G, batch_size, self.n_hidden_G)
,torch.zeros(self.n_layers_G, batch_size, self.n_hidden_G))
else: #if we are training
self.hidden = g_hidden
output, self.hidden = self.LSTM(x, self.hidden)
output = self.fc(output)
return output, self.hidden
class VAE(nn.Module):
def __init__(self, embedding_dim, vocab_size, n_layers_E, n_hidden_E, dim_z, n_hidden_G, batch_size,n_layers_G ):
super(VAE, self).__init__()
self.embedding = nn.Embedding(vocab_size, embedding_dim)
self.encoder = Encoder(embedding_dim, vocab_size, n_layers_E, n_hidden_E, dim_z, n_hidden_G, batch_size)
self.generator = Generator(n_hidden_G,n_layers_G, embedding_dim, dim_z, vocab_size, batch_size )
self.n_z = dim_z
def forward(self, x, G_inp, z, G_hidden):
if z is None:
batch_size, n_seq = x.size()
x = self.embedding(x)
mu, logvar, z = self.encoder(x)
kld = -0.5*torch.sum(logvar-mu.pow(2)-logvar.exp()+1).mean()
else:
kld = None
G_inp = self.embedding(G_inp)
logit, G_hidden = self.generator(G_inp,z, G_hidden)
return logit, G_hidden, kld
train_df = pd.read_csv("train.csv", header =None)[0:500]
test_df = pd.read_csv("test.csv",header =None)[0:500]
train = train_df.iloc[:,0]
max_words = 2000
max_len = 25
tok = Tokenizer(num_words = max_words)
tok.fit_on_texts(train)
sequences = tok.texts_to_sequences(train)
sequences_matrix = preprocessing.sequence.pad_sequences(sequences, maxlen = max_len)
#tok.sequences_to_texts(sequences)
n_hidden_E = 10
n_layers_E = 1
embedding_dim = 10
vocab_size = max_words
n_hidden_G = 10
n_layers_G = 2
dim_z = 10
train_size = len(train)
batch_size = 100
rec_coef = 7
lr = 0.01
epochs = 100
def create_generator_input(x, train):
G_inp = x[:, 0:max_len-1].clone() #input for generator should exclude last word of sequence
# if train == False:
# return G_inp
# r = np.random.rand(G_inp.size(0), G_inp.size(1))
# #Perform word_dropout according to random values (r) generated for each word
# for i in range(len(G_inp)):
# for j in range(1,G_inp.size(1)):
# if r[i, j] < opt.word_dropout and G_inp[i, j] not in [vocab.stoi[opt.pad_token], vocab.stoi[opt.end_token]]:
# G_inp[i, j] = vocab.stoi[opt.unk_token]
return G_inp
def producebatches(x,batch_size):
k = math.floor(x.shape[0]/batch_size)
total = (k)*batch_size
flatten = x[0:total].flatten()
batches = flatten.reshape((k,batch_size,x.shape[1]))
return batches
batches = producebatches(sequences_matrix, batch_size)
model = VAE(embedding_dim, vocab_size, n_layers_E, n_hidden_E, dim_z, n_hidden_G, batch_size,n_layers_G)
optimizer = torch.optim.Adam(model.parameters(), lr = lr)
criterion = nn.BCELoss(reduction = 'sum')
model.train()
def train_batch(x,G_inp,step,train =True):
reconstruction, hidden, kld = model(x, G_inp, None, None)
reconstruction2 = reconstruction.view(-1, vocab_size).clone() #converting into shape (batch_size*(n_seq-1), n_vocab) to facilitate performing F.cross_entropy()
#y = x[:, 1:x.size(1)].clone() #target for generator should exclude first word of sequence
#y = y.contiguous().view(-1)
G_inp2 = G_inp.contiguous().view(-1) #converting into shape (batch_size*(n_seq-1),1) to facilitate performing F.cross_entropy()
rec_loss = F.cross_entropy(reconstruction2,G_inp2)
kld_coef = (math.tanh((step - 15000)/1000) + 1) / 2
#kld_coef = min(1,step/(200000.0))
loss = rec_coef*rec_loss + kld_coef*kld
if train == True:
torch.autograd.set_detect_anomaly(True) #skip below step if we are performing validation
optimizer.zero_grad()
loss.backward(retain_graph=True)
optimizer.step()
return rec_loss.item(), kld.item()
for epoch in range(epochs):
train_rec_loss = []
train_kl_loss = []
for i in range(batches.shape[0]):
x = torch.tensor(batches[i], dtype = torch.long)
G_inp = create_generator_input(x, train = True)
rec_loss, kl_loss = train_batch(x,G_inp,epoch,train=True)
train_rec_loss.append(rec_loss)
train_kl_loss.append(kl_loss)
train_rec_loss = np.mean(train_rec_loss)
train_kl_loss = np.mean(train_kl_loss)
print("No.", epoch, "T_rec:", '%.2f'%rec_loss, "T_kld:", '%.2f'%kl_loss)
def generate_sentences(n):
model.eval()
sentences = []
for i in range(n):
z = torch.randn([1, dim_z])
hidden = (torch.zeros(n_layers_G,1, n_hidden_G)
,torch.zeros(n_layers_G, 1, n_hidden_G))
G_inp = torch.LongTensor(1,1).fill_(1)
str_ind = []
while len(str_ind)<49:
with torch.autograd.no_grad():
logit, G_hidden, _ = model(None, G_inp, z, hidden)
probs = F.softmax(logit[0],dim=1)
G_inp = torch.multinomial(probs,1)
str_ind.append(G_inp[0][0].item())
sentences.append(str_ind)
return sentences
t = generate_sentences(1)
First, you can re-initialize your hidden layer after each epoch. This will overcome the error that you are facing without any major changes:
for epoch in range(epochs):
train_rec_loss = []
train_kl_loss = []
for i in range(batches.shape[0]):
x = torch.tensor(batches[i], dtype = torch.long)
G_inp = create_generator_input(x, train = True)
rec_loss, kl_loss = train_batch(x,G_inp,epoch,train=True)
train_rec_loss.append(rec_loss)
train_kl_loss.append(kl_loss)
model.hidden = (torch.zeros(n_layers_G, batch_size, n_hidden_G)
,torch.zeros(n_layers_G, batch_size, n_hidden_G))
train_rec_loss = np.mean(train_rec_loss)
train_kl_loss = np.mean(train_kl_loss)
print("No.", epoch, "T_rec:", '%.2f'%rec_loss, "T_kld:", '%.2f'%kl_loss)
Furthermore, you can avoid inplace operations on the activation functions and dropout operations (inplace = False)(I think it is not your case).

How to restore checkpoint in Tensorflow 2.0.0

I was building a self-defined encoder-decoder tf.keras.Model and saved my checkpoint. After closing my Jupyter notebook and open it again trying to restore my encoder-decoder parameters, I surprisingly found that it is not working. I am not sure it's that I understand the usage wrong or something was wrong with my steps of doing it. Here is my code.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
encoder=encoder,
decoder=decoder)
tf.config.experimental_run_functions_eagerly(True)
EPOCHS = 10
TOLERANCE = 0.08
start = time.time()
for epoch in range(1, EPOCHS+1):
epoch_start = time.time()
# train the encoder-decoder model
batch = 0
total_loss = 0
total_accuracy = 0
for inp, targ in dataset.take(STEP_PER_EPOCH):
batch += 1
batch_loss, batch_accuracy = train_step(inp, targ, phoneme_tokenizer)
total_loss += batch_loss
total_accuracy += batch_accuracy
print("Epoch: {}/{} Batch: {} Loss: {:.4f} Accuracy: {:.4f} Time: {:.0f}s".
format(epoch, EPOCHS, batch, batch_loss.numpy(), batch_accuracy.numpy(), time.time()-epoch_start),
end="\r")
if batch % 1000 == 0:
print()
print()
# saving (checkpoint) the model when total loss is less than 0.9
checkpoint.save(file_prefix=checkpoint_prefix)
# validation process
total_val_loss = 0
total_val_acc = 0
for val_inp, val_targ in dataset_val.take(VAL_WAV_SIZE):
val_loss, val_acc = validate_step(val_inp, val_targ, phoneme_tokenizer)
total_val_loss += val_loss
total_val_acc += val_acc
# print out the epoch results
mean_total_acc = total_accuracy / STEP_PER_EPOCH
mean_total_loss = total_loss / STEP_PER_EPOCH
mean_val_acc = total_val_acc / VAL_WAV_SIZE
mean_val_loss = total_val_loss / VAL_WAV_SIZE
print("\n================================")
print("Epoch {}/{}".format(epoch, EPOCHS))
print('Accuracy: {:.4f} Loss: {:.4f} val_acc: {:.4f} val_loss: {:.4f}'.format(
mean_total_acc,
mean_total_loss,
mean_val_acc,
mean_val_loss))
print('Time taken for epoch {}: {:.2f} min'.format(epoch, (time.time() - epoch_start)/60))
print('Total Time taken: {:.2f} min'.format((time.time() - start)/60))
print("================================\n")
if mean_total_loss < TOLERANCE and mean_val_acc > 0.5:
break
After running the code above, it showed no errors and I got my checkpoints files in my directory. I closed my Jupyter notebook and built all the objects(which are my encoder and decoder) without training and typed
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
hoped that the parameters would come back and I can start my prediction but I just got pretty poor outcome anyway, which was against the outcome predicted after training right away. Should I add some more lines for restoring all the parameters or something else?
Below are more details about my encoder decoder structure, and my input shape is (batch size, 99, 13)
class ResnetIdentityBlock(tf.keras.layers.Layer):
def __init__(self, kernel_size, filters):
super(ResnetIdentityBlock, self).__init__()
self.filters1, self.filters2, self.filters3 = filters
self.conv1 = tf.keras.layers.Conv1D(self.filters1, 1, padding='valid')
self.bn1 = tf.keras.layers.BatchNormalization()
self.conv2 = tf.keras.layers.Conv1D(self.filters2, kernel_size, padding='same')
self.bn2 = tf.keras.layers.BatchNormalization()
self.conv3 = tf.keras.layers.Conv1D(self.filters3, 1, padding='valid')
self.bn3 = tf.keras.layers.BatchNormalization()
def call(self, input_tensor, training=False):
x = self.conv1(input_tensor)
x = self.bn1(x, training=training)
x = tf.nn.relu(x)
x = self.conv2(x)
x = self.bn2(x, training=training)
x = tf.nn.relu(x)
x = self.conv3(x)
x = self.bn3(x, training=training)
x += input_tensor
return tf.nn.relu(x)
class Encoder(tf.keras.Model):
'''
Encoder for MFCC transformed wave data
'''
def __init__(self,
lstm_units,
batch_sz,
dropout_rate,
units,
squeeze_time):
'''
Args:
lstm_units: LSTM units number
batch_sz: batch size
dropout_rate: layer dropout ratio
rnn_initial_weight: type of weight initialization
'''
super(Encoder, self).__init__()
self.lstm_units = lstm_units
self.squeeze_time = squeeze_time
# conv1d
self.feat_extract = tf.keras.layers.Dense(units=units, activation="relu")
self.feat_dropout = tf.keras.layers.Dropout(dropout_rate)
# ResNet
self.resnet1 = ResnetIdentityBlock(kernel_size=11, filters=[units, units, units])
units *= squeeze_time
self.resnet2 = ResnetIdentityBlock(kernel_size=7, filters=[units, units, units])
units *= squeeze_time
self.resnet3 = ResnetIdentityBlock(kernel_size=3, filters=[units, units, units])
# Encoder lstm
self.enc_lstm = tf.keras.layers.LSTM(units=lstm_units,
return_sequences=True,
return_state=True,
kernel_initializer="lecun_normal",
activation='tanh',
recurrent_activation='sigmoid',
recurrent_initializer='orthogonal',
dropout=dropout_rate)
def call(self, inputs):
'''
call pyramidal LSTM neural network encoder
Args:
inputs: wave input
'''
x = self.feat_extract(inputs)
x = self.feat_dropout(x)
# ResNet
x = self.resnet1(x)
x = self.reshape_pyramidal(x)
x = self.resnet2(x)
x = self.reshape_pyramidal(x)
x = self.resnet3(x)
# encoder output layer
fw_outputs, fw_state_h, fw_state_c = self.enc_lstm(x)
return fw_outputs, fw_state_h, fw_state_c
def reshape_pyramidal(self, outputs):
'''
After concatenating forward and backward outputs
return the reshaped output
Args:
outputs: outputs from LSTM
squeeze_time: time step one would like to squeeze in pyramidal LSTM
'''
batch_size, time_steps, num_units = outputs.shape
return tf.reshape(outputs, (batch_size, -1, num_units * self.squeeze_time))
class LuongAttention(tf.keras.layers.Layer):
def __init__(self, units):
super(LuongAttention, self).__init__()
self.W1 = tf.keras.layers.Dense(units, activation="relu")
self.W2 = tf.keras.layers.Dense(units, activation="relu")
self.V = tf.keras.layers.Dense(1, activation="relu")
def call(self, query, values):
# query hidden state shape == (batch_size, hidden size)
# query_with_time_axis shape == (batch_size, 1, hidden size)
# values shape == (batch_size, max_len, hidden size)
# we are doing this to broadcast addition along the time axis to calculate the score
query_with_time_axis = tf.expand_dims(query, 1)
# score shape == (batch_size, max_length, 1)
# we get 1 at the last axis because we are applying score to self.V
# the shape of the tensor before applying self.V is (batch_size, max_length, units)
score = self.V(tf.nn.tanh(
self.W1(query_with_time_axis) + self.W2(values)))
# attention_weights shape == (batch_size, max_length, 1)
attention_weights = tf.nn.softmax(score, axis=1)
# context_vector shape after sum == (batch_size, hidden_size)
context_vector = attention_weights * values
context_vector = tf.reduce_sum(context_vector, axis=1)
return context_vector, attention_weights
class Decoder(tf.keras.Model):
'''
Decoder for output phonemes
'''
def __init__(self,
target_sz,
embedding_dim,
lstm_units,
batch_sz,
dropout_rate):
'''
Args:
target_sz: target size, total phoneme size in this case
embedding_dim: embedding dimension
lstm_units: LSTM units number
batch_sz: batch size
dropout_rate: dropout ratio
rnn_initial_weight: type of weight initialization
'''
super(Decoder, self).__init__()
self.batch_sz = batch_sz
self.target_sz = target_sz
self.lstm_units = lstm_units
self.embedding = tf.keras.layers.Embedding(target_sz, embedding_dim)
# attention model
self.attention = LuongAttention(lstm_units)
# decoder rnn
self.lstm1 = tf.keras.layers.LSTM(units=lstm_units,
return_sequences=True,
return_state=True,
kernel_initializer="lecun_normal",
activation='tanh',
recurrent_activation='sigmoid',
recurrent_initializer='orthogonal',
dropout=dropout_rate)
# Fully-connected
self.fc1 = tf.keras.layers.Dense(64, activation="relu")
self.fc1_dropout = tf.keras.layers.Dropout(dropout_rate)
self.fc2 = tf.keras.layers.Dense(target_sz, activation="softmax")
# build layer info dictionary
self.layer_info = dict()
def call(self, inputs, enc_hidden_h, enc_hidden_c, enc_output):
'''
call LSTM decoder
Args:
inputs: target output, following phoneme for wave data input in this case
enc_hidden_h: encoder hidden state h
enc_hidden_c: encoder hidden state c
enc_output: encoder outputs
'''
# x shape after passing through embedding == (batch_size, 1, embedding_dim)
x = self.embedding(inputs)
# enc_output shape == (batch_size, max_length, hidden_size)
context_vector, attention_weights = self.attention(enc_hidden_h, enc_output)
# x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
# passing the concatenated vector to the 2-layer LSTM (Decoder)
x, state_h, state_c = self.lstm1(x)
# dense layer before final predict output dense layer
x = tf.reshape(x, (-1, x.shape[-1]))
x = self.fc1(x)
x = self.fc1_dropout(x)
# output shape == (batch_size, phoneme size)
x = self.fc2(x)
return x, (state_h, state_c), attention_weights

One-hot encoding in pytorch/torchtext

I have a Bucketiterator from torchtext that I feed to a model in pytorch. An example of how the iterator is constructed:
train_iter, val_iter = BucketIterator.splits((train,val),
batch_size=batch_size,
sort_within_batch = True,
device = device,
shuffle=True,
sort_key=lambda x: (len(x.src), len(x.trg)))
The data is then fed to a model like this, where I use the nn.Embedding layer.
class encoder(nn.Module):
def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
super().__init__()
self.input_dim = input_dim
self.emb_dim = emb_dim
self.hid_dim = hid_dim
self.n_layers = n_layers
self.dropout = dropout
self.embedding = nn.Embedding(input_dim, emb_dim)
self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
self.dropout = nn.Dropout(dropout)
def forward(self, src):
#src = [src sent len, batch size]
embedded = self.dropout(self.embedding(src))
#embedded = [src sent len, batch size, emb dim]
hidden_enc = []
outputs, hidden = self.rnn(embedded[0,:,:].unsqueeze(0))
for i in range(1,len(embedded[:,1,1])):
outputs, hidden = self.rnn(embedded[i,:,:].unsqueeze(0),hidden)
hidden_cpu = []
for k in range(len(hidden)):
hidden_cpu.append(hidden[k])
hidden_cpu[k] = hidden[k].cpu()
hidden_enc.append(tuple(hidden_cpu))
#outputs, hidden = self.rnn(embedded)
#outputs = [src sent len, batch size, hid dim * n directions]
#hidden = [n layers * n directions, batch size, hid dim]
#cell = [n layers * n directions, batch size, hid dim]
None
#outputs are always from the top hidden layer
return hidden, hidden_enc
But what if I wanted the embedding to be one-hot encoded? I work on formal languages and it would be nice to preserve orthogonality between tokens. It doesn't seem like pytorch or torchtext has any functionality for doing this.
def get_one_hot_torch_tensor(in_tensor):
"""
Function converts a 1d or 2d torch tensor to one-hot encoded
"""
n_channels = torch.max(in_tensor)+1 # maximum number of channels
if in_tensor.ndim == 2:
out_one_hot = torch.zeros((n_channels, in_tensor.shape[0], in_tensor.shape[1]))
# print(out_one_hot)
index = np.indices((in_tensor.shape[0], in_tensor.shape[1])) # create an array of indices
x, y = index[0], index[1]
print(x, y)
out_one_hot[in_tensor, x, y] = 1
print(out_one_hot)

Categories