PyTorch LSTM with multivariate time series (Many-to-Many) - python

Given 5 features on a time series, we want to predict the following values with an LSTM recurrent neural network in PyTorch. The problem is that the loss starts very low (e.g. 0.04) and increases slightly as training runs (it seems to converge to a slightly higher value, but it never decreases).
Moreover, the dataset is normalized, and we have tried different learning rates, numbers of epochs, batch sizes, etc.
An example of loss during training:
step : 0 loss : 0.0016425768844783306
step : 1 loss : 0.0028163508977741003
step : 2 loss : 0.009786984883248806
This is the class:
class MV_LSTM(torch.nn.Module):
    def __init__(self, n_features, seq_length):
        super(MV_LSTM, self).__init__()
        self.n_features = n_features
        self.seq_len = seq_length
        self.n_hidden = 40  # number of hidden states
        self.n_layers = 1   # number of LSTM layers (stacked)
        self.l_lstm = torch.nn.LSTM(input_size=n_features,
                                    hidden_size=self.n_hidden,
                                    num_layers=self.n_layers,
                                    batch_first=True)
        # according to pytorch docs LSTM output is
        # (batch_size, seq_len, num_directions * hidden_size)
        # when considering batch_first = True
        self.l_linear = torch.nn.Linear(self.n_hidden * self.seq_len, 5)

    def init_hidden(self, batch_size):
        hidden_state = torch.randn(self.n_layers, batch_size, self.n_hidden)
        cell_state = torch.randn(self.n_layers, batch_size, self.n_hidden)
        self.hidden = (hidden_state, cell_state)

    def forward(self, x):
        batch_size, seq_len, _ = x.size()
        lstm_out, self.hidden = self.l_lstm(x, self.hidden)
        x = lstm_out.contiguous().view(batch_size, -1)
        return self.l_linear(x)
This is the main code:
n_features = 5 # this is number of parallel inputs
n_timesteps = 24 # this is number of timesteps
# convert dataset into input/output
X, y = split_sequences(dataset, n_timesteps)
print(X.shape, y.shape)
X
y
# create NN
mv_net = MV_LSTM(n_features,n_timesteps)
criterion = torch.nn.MSELoss() # reduction='sum' created huge loss value
optimizer = torch.optim.Adam(mv_net.parameters(), lr=1e-4)
train_episodes = 50
batch_size = 16
This is the training:
mv_net.train()
for t in range(train_episodes):
    X, y = sklearn.utils.shuffle(X, y)
    for b in range(0, len(X), batch_size):
        inpt = X[b:b+batch_size, :, :]
        target = y[b:b+batch_size, :]
        x_batch = torch.tensor(inpt, dtype=torch.float32)
        y_batch = torch.tensor(target, dtype=torch.float32)
        mv_net.init_hidden(x_batch.size(0))
        output = mv_net(x_batch)
        loss = criterion(output.view(-1, 5), y_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print('step : ', t, 'loss : ', loss.item())
Thank you for your time, and sorry for our inexperience (this is our first RNN).
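No answer is included in this excerpt, but one detail often raised for setups like this (an observation, not a confirmed fix): init_hidden draws a fresh random hidden/cell state for every batch, while zero initialisation is the usual default (and what PyTorch uses when no hidden state is passed), which removes one source of noise in the reported loss. A minimal sketch of that change:
def init_hidden(self, batch_size):
    # zero-initialised states are the common default for LSTMs; random states
    # re-drawn each batch inject extra noise into every forward pass
    hidden_state = torch.zeros(self.n_layers, batch_size, self.n_hidden)
    cell_state = torch.zeros(self.n_layers, batch_size, self.n_hidden)
    self.hidden = (hidden_state, cell_state)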

Related

Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead BUT I'm not using numpy here

I get this error in the training loop for this neural network:
class YourModel(torch.nn.Module):
    def __init__(self):
        super(YourModel, self).__init__()
        self.fc1 = nn.Linear(50, 128)
        self.sigmoid = nn.Sigmoid()
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x1, x2):
        x = torch.cat((x1, x2), dim=1)
        out = self.fc1(x)
        out = self.sigmoid(out)
        out = self.fc2(out)
        return out

model = YourModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.BCELoss()
My dataloader contains 3 datasets: one with 25 features for 8000 documents, another with 25 features for 8000 queries, and a last one with the relation between both (0 or 1). So that's why I'm using a neural network for binary classification. (However, if you know an alternative neural network, I'm open to options.)
My batch_size is 1 right now and here is my training loop:
def train(dataloader, model, loss_fn, optimizer):
    model.train()
    train_loss = 0
    num_batches = len(dataloader)
    all_pred = []
    all_real = []
    for batch, i in enumerate(train_dataloader):  # access each batch
        i_1 = i[0]
        i_2 = i[1]
        y = i[2].float().view(1, 1)  # find relevance
        #y = torch.clamp(y, min=0, max=1)
        #x = np.hstack((i_1, i_2))
        #x = torch.Tensor(x)
        #x = torch.clamp(x, min=0, max=1)
        # Zero the gradients
        optimizer.zero_grad()
        # Forward pass
        y_pred = model(i_1, i_2).float()
        y_pred = torch.clamp(y_pred, min=0, max=1)
        loss = loss_fn(y_pred, y)
        # Backward pass
        loss.backward()
        # Update the parameters
        optimizer.step()
        train_loss += loss.item()  # sum the loss
        all_pred.append(y_pred)
        all_real.append(y)
        if batch > 0 and batch % 1000 == 0:
            print(f"Partial loss: {train_loss/batch}, F1: {f1_score(all_real, all_pred)}")
    train_loss /= num_batches
    print(f"Total loss: {train_loss}")  # print loss of every epoch
    return train_loss
I'm getting this error: "Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead." but as far as I know I'm not calling numpy on any tensors. And if I use the detach method, then I get an error saying that the loss cannot be computed because the tensor of 0 doesn't need grad. So it is pretty much a loop.
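One hedged guess at the cause, since the full traceback isn't shown: sklearn's f1_score converts its inputs to NumPy arrays, and the tensors collected in all_pred / all_real still require grad. Detaching only the copies stored for the metric, while leaving the y_pred used for the loss untouched, would avoid both errors. A sketch of that change inside the loop (assuming f1_score is sklearn.metrics.f1_score):
# keep the graph-connected y_pred for loss.backward(); store detached, rounded
# copies purely for metric computation
all_pred.append(y_pred.detach().cpu().round().view(-1))
all_real.append(y.detach().cpu().view(-1))

# later, when reporting:
f1 = f1_score(torch.cat(all_real).numpy(), torch.cat(all_pred).numpy())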

Mismatching dims in GRU for classification

I'm trying to complete a task and write simple RNN. Here's the class:
class RNNBaseline(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim)  # RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)  # YOUR CODE GOES HERE
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths, hidden=None):
        # text = [sent len, batch size]
        embedded = self.embedding(text)
        # embedded = [sent len, batch size, emb dim]
        # pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        # cell arg for LSTM, remove for GRU
        # packed_output, (hidden, cell) = self.rnn(packed_embedded)
        # unpack sequence
        # output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        # output = [sent len, batch size, hid dim * num directions]
        # output over padding tokens are zero tensors
        # hidden = [num layers * num directions, batch size, hid dim]
        # cell = [num layers * num directions, batch size, hid dim]
        # concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        # and apply dropout
        output, hidden = self.rnn(packed_embedded, hidden)
        # hidden = None # concatenate
        # hidden = [batch size, hid dim * num directions] or [batch_size, hid dim * num directions]
        return self.fc(hidden)
For now I'm not using an LSTM or trying to do a bidirectional RNN, I just want a simple GRU to train without errors. This is the training function:
import numpy as np

min_loss = np.inf
cur_patience = 0

for epoch in range(1, max_epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, ((text, txt_len), label) in pbar:
        #YOUR CODE GOES HERE
        opt.zero_grad()
        input = text.to(device)
        labels = label.to(device)
        output = model(input, txt_len.type(torch.int64).cpu())
        train_loss = loss_func(output, labels)
        train_loss.backward()
        opt.step()
    train_loss /= len(train_iter)

    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, ((text, txt_len), label) in pbar:
        # YOUR CODE GOES HERE
        input = text.to(device)
        labels = label.to(device)
        output = model(input, txt_len.type(torch.int64).cpu())
        val_loss = loss_func(output, labels)
    val_loss /= len(valid_iter)

    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break

    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))

model.load_state_dict(best_model)
And some variables:
vocab_size = len(TEXT.vocab)
emb_dim = 100
hidden_dim = 256
output_dim = 1
n_layers = 2
bidirectional = False
dropout = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
patience=3
opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()
max_epochs = 1
But I get this error:
ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([1, 64, 1]))
... in this line:
---> 18 train_loss = loss_func(output, labels)
What am I doing wrong?
nn.BCEWithLogitsLoss expects both outputs and targets (or in your case labels) to be of size [b,d] where b is the batch size and d is the number of classes (or dimension of whatever you are trying to predict). Currently, your outputs are of size [b,d,1] and your targets are of size [d]. Two fixes are necessary, and both are very simple:
Add a batch dimension to your targets (labels). This is a common error when iterating over a dataset directly, because a dataset generally does not add a batch dimension. Encapsulating your dataset class within a PyTorch DataLoader would handle this for you; if you don't want to do this, simply add an unsqueeze() operation. Note that the unsqueeze approach only works with a batch size of 1, otherwise using a DataLoader is probably a better bet.
Your output has an empty 3rd dimension, which can easily be flattened with a squeeze() operation. Both unsqueeze and squeeze are differentiable so shouldn't present problems for backpropagation.
... code before here
for it, ((text, txt_len), label) in pbar:
    # YOUR CODE GOES HERE
    input = text.to(device)
    labels = label.to(device).unsqueeze(0)  # added unsqueeze operation
    output = model(input, txt_len.type(torch.int64).cpu())
    output = output.squeeze(-1)  # added squeeze on last dim
    val_loss = loss_func(output, labels)
... code after here

RuntimeError: 1D target tensor expected, multi-target not supported Pytorch

I recently shifted to PyTorch from Keras and I am still trying to understand how all of this works. Below is the code I have implemented to classify the MNIST dataset using a simple MLP. Just like I used to do in Keras, I have flattened each 28x28 image into a vector of 784, and I have also created a one-hot representation of my labels.
In the model I was hoping that, given a vector of 784, the model would output a one-hot vector with probabilities, but as soon as my code reaches the loss computation I get the following error:
RuntimeError: 1D target tensor expected, multi-target not supported
Below is my code :
import numpy as np
import matplotlib.pyplot as plt
import torch
import time
from torch import nn, optim
from keras.datasets import mnist
from torch.utils.data import Dataset, DataLoader
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# ----------------------------------------------------
class MnistDataset(Dataset):
    def __init__(self, data_size=0):
        (x, y), (_, _) = mnist.load_data()
        x = [i.flatten() for i in x]
        x = np.array(x, dtype=np.float32)
        if data_size < 0 or data_size > len(y):
            assert ("Data size should be between 0 to number of files in the dataset")
        if data_size == 0:
            data_size = len(y)
        self.data_size = data_size
        # picking 'data_size' random samples
        self.x = x[:data_size]
        self.y = y[:data_size]
        # scaling between 0-1
        self.x = (self.x / 255)
        # Creating one-hot representation of target
        y_encoded = []
        for label in y:
            encoded = np.zeros(10)
            encoded[label] = 1
            y_encoded.append(encoded)
        self.y = np.array(y_encoded)

    def __len__(self):
        return self.data_size

    def __getitem__(self, index):
        x_sample = self.x[index]
        label = self.y[index]
        return x_sample, label
# ----------------------------------------------------
num_train_samples = 10000
num_test_samples = 2000
# Each generator returns a single
# sample & its label on each iteration.
mnist_train = MnistDataset(data_size=num_train_samples)
mnist_test = MnistDataset(data_size=num_test_samples)
# Each generator returns a batch of samples on each iteration.
train_loader = DataLoader(mnist_train, batch_size=128, shuffle=True) # 79 batches
test_loader = DataLoader(mnist_test, batch_size=128, shuffle=True) # 16 batches
# ----------------------------------------------------
# Defining the Model Architecture
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 100)
        self.act1 = nn.ReLU()
        self.fc2 = nn.Linear(100, 50)
        self.act2 = nn.ReLU()
        self.fc3 = nn.Linear(50, 10)
        self.act3 = nn.Sigmoid()

    def forward(self, x):
        x = self.act1(self.fc1(x))
        x = self.act2(self.fc2(x))
        output = self.act3(self.fc3(x))
        return output
# ----------------------------------------------------
model = MLP()
# Defining optimizer and loss function
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
# ----------------------------------------------------
# Training the model
epochs = 10
print("Training Started...")
for epoch in range(epochs):
    for batch_index, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()               # Zero the gradients
        outputs = model(inputs)             # Forward pass
        loss = criterion(outputs, targets)  # Compute the Loss
        loss.backward()                     # Compute the Gradients
        optimizer.step()                    # Update the parameters

    # Evaluating the model
    total = 0
    correct = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(test_loader):
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += predicted.eq(targets.data).cpu().sum()
    print('Epoch : {} Test Acc : {}'.format(epoch, (100. * correct / total)))
print("Training Completed Successfully")
# ----------------------------------------------------
I also read some other posts related to the same problem, and most of them said that for CrossEntropy loss the target has to be a single number, which goes totally over my head. Can someone please explain a solution? Thank you.
For nn.CrossEntropyLoss you don't need a one-hot representation of the label; you just need to pass the prediction logits, whose shape is (batch_size, n_class), and a target vector of shape (batch_size,).
So just pass in the label index vector y instead of the one-hot vector.
Fixed version of your code:
class MnistDataset(Dataset):
    def __init__(self, data_size=0):
        (x, y), (_, _) = mnist.load_data()
        x = [i.flatten() for i in x]
        x = np.array(x, dtype=np.float32)
        if data_size < 0 or data_size > len(y):
            assert ("Data size should be between 0 to number of files in the dataset")
        if data_size == 0:
            data_size = len(y)
        self.data_size = data_size
        # picking 'data_size' random samples
        self.x = x[:data_size]
        self.y = y[:data_size]
        # scaling between 0-1
        self.x = (self.x / 255)
        self.y = y  # <-- keep the integer class labels, no one-hot encoding

    def __len__(self):
        return self.data_size

    def __getitem__(self, index):
        x_sample = self.x[index]
        label = self.y[index]
        return x_sample, label
Take a look at the PyTorch docs for more detail:
https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
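As a quick standalone illustration of the shape contract described above (a sketch, not part of the asker's code): nn.CrossEntropyLoss applies log-softmax internally, so it expects raw logits of shape (batch_size, n_class) and integer class indices of shape (batch_size,); this also means the final Sigmoid in the MLP above is unnecessary.
import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
logits = torch.randn(4, 10)            # (batch_size, n_class) raw scores, no softmax/sigmoid needed
targets = torch.tensor([3, 0, 9, 1])   # (batch_size,) integer class indices, not one-hot
loss = criterion(logits, targets)
print(loss.item())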

Getting F-Score of 0 when combining RoBERTa and BiLSTM

I am trying to stack an LSTM on top of RoBERTa model for binary classification problem
I've tried two configurations:
- Freeze RoBERTa embedding
- Fine-tune embedding
In the freezing case I get around a 57% F-score, which is relatively low compared to regular RoBERTa for sequence classification, which got around 81% on the same data.
In the fine-tuning case I get a 0% F-score and the validation loss isn't converging.
Most probably I am doing something wrong but I can't really spot it.
Here is the model part
class RoBERTaLSTMClassifier(nn.Module):
    def __init__(self, bert_config, num_classes, hidden_size=None, dropout=0.5):
        """
        bert_config: pretrained RoBERTa configuration
        num_classes: the number of classes
        hidden_size: the number of hidden units used by the LSTM layer
        dropout: dropout rate
        """
        super(RoBERTaLSTMClassifier, self).__init__()
        self.num_classes = num_classes
        self.model = RobertaModel(bert_config)
        if hidden_size is None:
            self.hidden_size = bert_config.hidden_size
        else:
            self.hidden_size = hidden_size
        self.lstm = nn.LSTM(bert_config.hidden_size, self.hidden_size, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.hidden_size * 2, 1)
        self.softmax = nn.Softmax()
        ## add sigmoid non linearity for binary classification
        self.sig = nn.Sigmoid()

    def forward(self, input_ids, attention_mask, current_batch_size, hidden):
        """
        all_layers: whether or not to return all encoded_layers
        return: logits in the following format (batch_size, num_classes)
        """
        with torch.no_grad():
            ## freeze embedding from BERT
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # last hidden state is input to the LSTM
        output, (hidden_h, hidden_c) = self.lstm(outputs[0], hidden)
        output_hidden = torch.cat((hidden_h[0], hidden_h[1]), dim=1)  # [B, H*2]
        logits = self.classifier(self.dropout(output_hidden))  # [B, C]
        sig_out = self.sig(logits).view(current_batch_size, -1)
        ## get the last batch output
        sig_out = sig_out[:, -1]  # get last batch of labels
        hidden = (hidden_h, hidden_c)
        return sig_out, hidden

    def init_bilstm_hidden(self, batch_size):
        h0 = torch.zeros(2, batch_size, self.hidden_size).to(device)  # 2 for bidirection
        c0 = torch.zeros(2, batch_size, self.hidden_size).to(device)
        return (h0, c0)
Here is the training loop part:
from sklearn.metrics import f1_score
from tqdm import tqdm, trange
import numpy as np

lr = 0.0001
roberta_conf = RobertaConfig.from_pretrained('roberta-base')
num_classes = 2
hidden_size = 256
LSTMRoBERTaModel = RoBERTaLSTMClassifier(roberta_conf, num_classes=num_classes, hidden_size=hidden_size, dropout=0.5)
criterion = nn.BCELoss()  ## binary cross entropy
optimizer = torch.optim.Adam(LSTMRoBERTaModel.parameters(), lr=lr)
epochs = 5
counter = 0
max_grad_norm = 1.0
nb_tr_examples, nb_tr_steps = 0, 0

for _ in trange(epochs, desc="Epoch"):
    LSTMRoBERTaModel.cuda()
    LSTMRoBERTaModel.train()
    tr_loss = 0
    y_preds = []
    y_true = []
    hidden_init = LSTMRoBERTaModel.init_bilstm_hidden(batch_size=bs)
    h = hidden_init
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        current_batch_size = b_input_ids.size()[0]
        ##
        h = tuple([each.data for each in h])
        ## forward pass
        preds, h = LSTMRoBERTaModel.forward(b_input_ids, b_input_mask, current_batch_size, h)
        loss = criterion(preds.squeeze(), b_labels.float())
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=LSTMRoBERTaModel.parameters(), max_norm=max_grad_norm)
        loss.backward()
        optimizer.step()
        LSTMRoBERTaModel.zero_grad()
    # print train loss per epoch
    print("\nTrain loss: {}".format(tr_loss / nb_tr_steps))

    LSTMRoBERTaModel.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    val_h = LSTMRoBERTaModel.init_bilstm_hidden(bs)
    for batch in dev_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        current_batch_size = b_input_ids.size()[0]
        with torch.no_grad():
            preds, val_h = LSTMRoBERTaModel.forward(b_input_ids, b_input_mask, current_batch_size, val_h)
        loss = criterion(preds.squeeze(), b_labels.float())
        eval_loss += loss
        y_preds.extend(np.round(preds.data.cpu()))
        y_true.extend(b_labels.data.cpu())
        #print(preds[2], b_labels[2])
        #eval_accuracy += f1_score(torch.tensor.numpy(b_labels.float), toch.tensor.numpy(preds))
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss / nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("F1 - Score: {}".format(f1_score(y_true, y_preds)))
    #print("F1- Score: {}".format(eval_accuracy/nb_eval_steps))
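No answer is shown for this one, but one detail worth double-checking (an observation based on the posted loop, not a confirmed fix): clip_grad_norm_ is called before loss.backward(), so it clips the gradients from the previous step (or zeros on the first step) rather than the ones about to be applied. The conventional ordering is backward, then clip, then step:
loss = criterion(preds.squeeze(), b_labels.float())
loss.backward()                                   # populate gradients first
torch.nn.utils.clip_grad_norm_(LSTMRoBERTaModel.parameters(), max_norm=max_grad_norm)  # then clip them
optimizer.step()                                  # then apply the update
LSTMRoBERTaModel.zero_grad()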

PyTorch - How to set Activation Rules of neurons to increase efficiency of Neural Network?

I'm trying to make a back-propagation neural network with PyTorch. I can successfully execute it and test its accuracy, but it doesn't work very efficiently. Now I'm supposed to increase its efficiency by setting different activation rules for neurons, so that neurons that don't contribute to the final output get excluded (pruned) from the computations, thereby improving speed and accuracy.
My code looks like this (extracted snippets) -
# Hyper Parameters
input_size = 20
hidden_size = 50
num_classes =130
num_epochs = 500
batch_size = 5
learning_rate = 0.1
# normalise input data
for column in data:
    # the last column is the target
    if column != data.shape[1] - 1:
        data[column] = data.loc[:, [column]].apply(lambda x: (x - x.mean()) / x.std())
# randomly split data into training set (80%) and testing set (20%)
msk = np.random.rand(len(data)) < 0.8
train_data = data[msk]
test_data = data[~msk]
# define train dataset and a data loader
train_dataset = DataFrameDataset(df=train_data)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Neural Network
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.sigmoid = nn.Sigmoid()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.sigmoid(out)
        out = self.fc2(out)
        return out
net = Net(input_size, hidden_size, num_classes)
# train the model by batch
for epoch in range(num_epochs):
    for step, (batch_x, batch_y) in enumerate(train_loader):
        # convert torch tensor to Variable
        X = Variable(batch_x)
        Y = Variable(batch_y.long())
        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = net(X)
        loss = criterion(outputs, Y)
        all_losses.append(loss.data[0])
        loss.backward()
        optimizer.step()
        if epoch % 50 == 0:
            _, predicted = torch.max(outputs, 1)
            # calculate and print accuracy
            total = predicted.size(0)
            correct = predicted.data.numpy() == Y.data.numpy()
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Accuracy: %.2f %%'
                  % (epoch + 1, num_epochs, step + 1, len(train_data) // batch_size + 1,
                     loss.data[0], 100 * sum(correct) / total))
Can someone tell me how to do that in PyTorch as I'm very new to PyTorch.
I'm not sure if this question is supposed to be on Stack Overflow, but I will give you a hint anyway. You are working with a sigmoid activation function at the moment, the gradient of which vanishes if the input value is too large or too small. A commonly used alternative is the ReLU activation function (rectified linear unit).
ReLU(x) is the identity on the positive domain and 0 on the negative domain; in Python it would be written as follows:
def ReLU(x):
    if x > 0:
        return x
    else:
        return 0
It is readily available in PyTorch as torch.nn.ReLU.
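For example, swapping the activation in the question's Net module would look roughly like this (a sketch of the suggested change; everything else stays the same):
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()          # replaces nn.Sigmoid()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)           # ReLU keeps gradients alive for large positive inputs
        out = self.fc2(out)
        return out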
