I'm trying to complete a task and write simple RNN. Here's the class:
class RNNBaseline(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
bidirectional, dropout, pad_idx):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim) #RNN(embedding_dim, hidden_dim)
self.fc = nn.Linear(hidden_dim, output_dim) # YOUR CODE GOES HERE
self.dropout = nn.Dropout(dropout)
def forward(self, text, text_lengths, hidden = None):
#text = [sent len, batch size]
embedded = self.embedding(text)
#embedded = [sent len, batch size, emb dim]
#pack sequence
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
# cell arg for LSTM, remove for GRU
# packed_output, (hidden, cell) = self.rnn(packed_embedded)
# unpack sequence
# output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
#output = [sent len, batch size, hid dim * num directions]
#output over padding tokens are zero tensors
#hidden = [num layers * num directions, batch size, hid dim]
#cell = [num layers * num directions, batch size, hid dim]
#concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
#and apply dropout
output, hidden = self.rnn(packed_embedded, hidden)
#hidden = None # concatenate
#hidden = [batch size, hid dim * num directions] or [batch_size, hid dim * num directions]
return self.fc(hidden)
For now I'm not using LSTM or trying to do bidirectional RNN, I just want simple GRU to train without errors. This is the training function:
import numpy as np
min_loss = np.inf
cur_patience = 0
for epoch in range(1, max_epochs + 1):
train_loss = 0.0
model.train()
pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
pbar.set_description(f"Epoch {epoch}")
for it, ((text, txt_len), label) in pbar:
#YOUR CODE GOES HERE
opt.zero_grad()
input = text.to(device)
labels = label.to(device)
output = model(input, txt_len.type(torch.int64).cpu())
train_loss = loss_func(output, labels)
train_loss.backward()
opt.step()
train_loss /= len(train_iter)
val_loss = 0.0
model.eval()
pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
pbar.set_description(f"Epoch {epoch}")
for it, ((text, txt_len), label) in pbar:
# YOUR CODE GOES HERE
input = text.to(device)
labels = label.to(device)
output = model(input, txt_len.type(torch.int64).cpu())
val_loss = loss_func(output, labels)
val_loss /= len(valid_iter)
if val_loss < min_loss:
min_loss = val_loss
best_model = model.state_dict()
else:
cur_patience += 1
if cur_patience == patience:
cur_patience = 0
break
print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
model.load_state_dict(best_model)
And some variables:
vocab_size = len(TEXT.vocab)
emb_dim = 100
hidden_dim = 256
output_dim = 1
n_layers = 2
bidirectional = False
dropout = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
patience=3
opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()
max_epochs = 1
But I get this error:
ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([1, 64, 1]))
... in this line:
---> 18 train_loss = loss_func(output, labels)
What am I doing wrong?
nn.BCEWithLogitsLoss expects both outputs and targets (or in your case labels) to be of size [b,d] where b is the batch size and d is the number of classes (or dimension of whatever you are trying to predict). Currently, your outputs are of size [b,d,1] and your targets are of size [d]. Two fixes are necessary, and both are very simple:
Add a batch dimension to your targets (labels). This is a common error when using a dataset that returns data elements because it generally does not add a batch dimension. Encapsulating your dataset class within a pytorch dataloader, but if you don't want to do this simply add an unsqueeze() operation. Note that the unsqueeze operation only works with a batch size of 1, otherwise using dataloader is probably a better bet.
Your output has an empty 3rd dimension, which can easily be flattened with a squeeze() operation. Both unsqueeze and squeeze are differentiable so shouldn't present problems for backpropagation.
... code before here
for it, ((text, txt_len), label) in pbar:
# YOUR CODE GOES HERE
input = text.to(device)
labels = label.to(device).unsqueeze(0) # added unsqueeze operation
output = model(input, txt_len.type(torch.int64).cpu())
output = output.squeeze(-1) # added squeeze on last dim
val_loss = loss_func(output, labels)
... code after here
Related
I get this error in the training loop for this neural network:
class YourModel(torch.nn.Module):
def __init__(self):
super(YourModel, self).__init__()
self.fc1 = nn.Linear(50, 128)
self.sigmoid = nn.Sigmoid()
self.fc2 = nn.Linear(128, 1)
def forward(self, x1, x2):
x = torch.cat((x1, x2), dim=1)
out = self.fc1(x)
out = self.sigmoid(out)
out = self.fc2(out)
return out
model = YourModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.BCELoss()
My dataloader contains 3 datasets, 1 with 25 features for 8000 documents, other with 25 features of 8000 queries and the last one with the relation between both (0 or 1). So that's why I'm using a neural network for binary classification. (However if you know an alternative neural network I'm open to options)
My batch_size is 1 right now and here is my training loop:
def train(dataloader, model, loss_fn, optimizer):
model.train()
train_loss = 0
num_batches = len(dataloader)
all_pred = []
all_real = []
for batch, i in enumerate(train_dataloader): #access to each batch
i_1 = i[0]
i_2 = i[1]
y = i[2].float().view(1, 1) #find relevance
#y = torch.clamp(y, min=0, max=1)
#x = np.hstack((i_1, i_2))
#x = torch.Tensor(x)
#x = torch.clamp(x, min=0, max=1)
# Zero the gradients
optimizer.zero_grad()
# Forward pass
y_pred = model(i_1, i_2).float()
y_pred = torch.clamp(y_pred, min=0, max=1)
loss = loss_fn(y_pred, y)
# Backward pass
loss.backward()
# Update the parameters
optimizer.step()
train_loss += loss.item() #sum the loss
all_pred.append(y_pred)
all_real.append(y)
if batch > 0 and batch%1000 == 0:
print(f"Partial loss: {train_loss/batch}, F1: {f1_score(all_real, all_pred)}")
train_loss /= num_batches
print(f"Total loss: {train_loss}") #print loss of every epoch
return train_loss
I'm getting this error: "Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead." but as far as I know I'm not calling numpy on any tensors. And if I use the detach method then I get the an error saying that the loss can not be computed because the tensor of 0 doesn't need grad. So it is pretty much a loop.
Given 5 features on a time series we want to predict the following values using an LSTM Recurrent Neural Network, using PyTorch. The problem is that the Loss Value starts very low (i.e. 0.04) and it increases a bit as the computation runs (it seems it converge to a slightly higher value, but it never decreases).
Moreover, the dataset is normalized, and we tried different values of learning rate, epochs, batch sizes etc.
An example of loss during training:
step : 0 loss : 0.0016425768844783306
step : 1 loss : 0.0028163508977741003
step : 2 loss : 0.009786984883248806
This is the class:
class MV_LSTM(torch.nn.Module):
def __init__(self,n_features,seq_length):
super(MV_LSTM, self).__init__()
self.n_features = n_features
self.seq_len = seq_length
self.n_hidden = 40 # number of hidden states
self.n_layers = 1 # number of LSTM layers (stacked)
self.l_lstm = torch.nn.LSTM(input_size = n_features,
hidden_size = self.n_hidden,
num_layers = self.n_layers,
batch_first = True)
# according to pytorch docs LSTM output is
# (batch_size,seq_len, num_directions * hidden_size)
# when considering batch_first = True
self.l_linear = torch.nn.Linear(self.n_hidden*self.seq_len, 5)
def init_hidden(self, batch_size):
hidden_state = torch.randn(self.n_layers,batch_size,self.n_hidden)
cell_state = torch.randn(self.n_layers,batch_size,self.n_hidden)
self.hidden = (hidden_state, cell_state)
def forward(self, x):
batch_size, seq_len, _ = x.size()
lstm_out, self.hidden = self.l_lstm(x,self.hidden)
x = lstm_out.contiguous().view(batch_size,-1)
return self.l_linear(x)
This is the main code:
n_features = 5 # this is number of parallel inputs
n_timesteps = 24 # this is number of timesteps
# convert dataset into input/output
X, y = split_sequences(dataset, n_timesteps)
print(X.shape, y.shape)
X
y
# create NN
mv_net = MV_LSTM(n_features,n_timesteps)
criterion = torch.nn.MSELoss() # reduction='sum' created huge loss value
optimizer = torch.optim.Adam(mv_net.parameters(), lr=1e-4)
train_episodes = 50
batch_size = 16
This is the training:
mv_net.train()
for t in range(train_episodes):
X, y = sklearn.utils.shuffle(X, y)
for b in range(0,len(X),batch_size):
inpt = X[b:b+batch_size,:,:]
target = y[b:b+batch_size,:]
x_batch = torch.tensor(inpt,dtype=torch.float32)
y_batch = torch.tensor(target,dtype=torch.float32)
mv_net.init_hidden(x_batch.size(0))
output = mv_net(x_batch)
loss = criterion(output.view(-1,5), y_batch)
loss.backward()
optimizer.step()
optimizer.zero_grad()
print('step : ' , t , 'loss : ' , loss.item())
Thank you for your time, and sorry for our unexperience (this is our first RNN).
I am trying to stack an LSTM on top of RoBERTa model for binary classification problem
I've tried to configurations:
- Freeze RoBERTa embedding
- Fine-tune embedding
In Freezing case I get around 57% F-score , this is relatively low compared to regular RoBERTa for sequence classification which got the same data around 81%
In fine-tune case I get 0% F-score and validation loss isn't converging
Most probably I am doing something wrong but I can't really spot it.
Here is the model part
class RoBERTaLSTMClassifier(nn.Module):
def __init__(self, bert_config, num_classes, hidden_size=None, dropout=0.5):
"""
bert: pretrained bert model
num_classes: the number of num_classes
hidden_size: the number of hiddens which will be used by LSTM layer
dropout: dropout rate
"""
super(RoBERTaLSTMClassifier, self).__init__()
self.num_classes = num_classes
self.model = RobertaModel(bert_config)
if hidden_size is None: self.hidden_size = bert_config.hidden_size
else: self.hidden_size = hidden_size
self.lstm = nn.LSTM(bert_config.hidden_size, self.hidden_size, bidirectional=True,batch_first=True)
self.dropout = nn.Dropout(dropout)
self.classifier = nn.Linear(self.hidden_size * 2, 1)
self.softmax = nn.Softmax()
## add sigmoid non linearity for binary classification
self.sig = nn.Sigmoid()
def forward(self, input_ids, attention_mask, current_batch_size, hidden):
"""
all_layers: whether or not to return all encoded_layers
return: logits in the following format (batch_size, num_classes)
"""
with torch.no_grad():
## freeze embedding from BERT
outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
# last hidden state is input to the LSTM
output, (hidden_h, hidden_c) = self.lstm(outputs[0], hidden)
output_hidden = torch.cat((hidden_h[0], hidden_h[1]), dim=1) #[B, H*2]
logits = self.classifier(self.dropout(output_hidden)) #[B, C]
sig_out = self.sig(logits).view(current_batch_size, -1)
## get the last batch output
sig_out = sig_out[:, -1] # get last batch of labels
hidden = (hidden_h, hidden_c)
return sig_out, hidden
def init_bilstm_hidden(self, batch_size):
h0 = torch.zeros(2, batch_size, self.hidden_size).to(device) # 2 for bidirection
c0 = torch.zeros(2, batch_size, self.hidden_size).to(device)
return (h0, c0)
** Here is the Training Loop Part**
from sklearn.metrics import f1_score
from tqdm import tqdm, trange
import numpy as np
lr=0.0001
roberta_conf = RobertaConfig.from_pretrained('roberta-base')
num_classes = 2
hidden_size = 256
LSTMRoBERTaModel = RoBERTaLSTMClassifier(roberta_conf, num_classes=num_classes,hidden_size= hidden_size,dropout=0.5)
criterion = nn.BCELoss() ## binary cross entropy
optimizer = torch.optim.Adam(LSTMRoBERTaModel.parameters(), lr=lr)
epochs = 5
counter = 0
max_grad_norm = 1.0
nb_tr_examples, nb_tr_steps = 0, 0
for _ in trange(epochs, desc="Epoch"):
LSTMRoBERTaModel.cuda()
LSTMRoBERTaModel.train()
tr_loss = 0
y_preds = []
y_true = []
hidden_init = LSTMRoBERTaModel.init_bilstm_hidden(batch_size=bs)
h = hidden_init
for step, batch in enumerate(train_dataloader):
batch = tuple(t.to(device) for t in batch)
b_input_ids, b_input_mask, b_labels = batch
current_batch_size = b_input_ids.size()[0]
##
h = tuple([each.data for each in h])
## forward pass
preds, h = LSTMRoBERTaModel.forward(b_input_ids, b_input_mask, current_batch_size,h)
loss = criterion(preds.squeeze(),b_labels.float())
# track train loss
tr_loss += loss.item()
nb_tr_examples += b_input_ids.size(0)
nb_tr_steps += 1
# gradient clipping
torch.nn.utils.clip_grad_norm_(parameters=LSTMRoBERTaModel.parameters(), max_norm=max_grad_norm)
loss.backward()
optimizer.step()
LSTMRoBERTaModel.zero_grad()
# print train loss per epoch
print("\nTrain loss: {}".format(tr_loss/nb_tr_steps))
LSTMRoBERTaModel.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
val_h = LSTMRoBERTaModel.init_bilstm_hidden(bs)
for batch in dev_dataloader:
batch = tuple(t.to(device) for t in batch)
b_input_ids, b_input_mask, b_labels = batch
current_batch_size = b_input_ids.size()[0]
with torch.no_grad():
preds, val_h = LSTMRoBERTaModel.forward(b_input_ids, b_input_mask, current_batch_size, val_h)
loss = criterion(preds.squeeze(),b_labels.float())
eval_loss += loss
y_preds.extend(np.round(preds.data.cpu()))
y_true.extend(b_labels.data.cpu())
#print(preds[2], b_labels[2] )
#eval_accuracy += f1_score(torch.tensor.numpy(b_labels.float), toch.tensor.numpy(preds))
nb_eval_examples += b_input_ids.size(0)
nb_eval_steps += 1
eval_loss = eval_loss/nb_eval_steps
print("Validation loss: {}".format(eval_loss))
print("F1 - Score: {}".format(f1_score(y_true,y_preds)))
#print("F1- Score: {}".format(eval_accuracy/nb_eval_steps))
so I am new with machine learning and I got a bonus course at my university where I have to train a lstm model to generate captions. I have read this so far: Blogpost_about_lstms
And used this as reference: some_random_code
So what I want to achieve:
I have an Dataset which is structured like this:
output from an CNN with a Vector on size 2048 that holds some "features" of an image. And 5 Captions describing that image.
Training:
input: CNN vector + Captions
output: Caption (guess)
Validation:
input: CNN vector
output: caption (guess)
So how can I use 2 Inputs (the CNN data and a Caption Sequence) to train to generate new captions only from an CNN input vector!
This is kinda tricky and I cannot grasp the theory in this. And Tensorflow is also quite a thing I have to say.
I have a normal Seq_2_Seq model in place that works. But now I am stuck :/
class Model(object):
def __init__(self, _input, is_training, hidden_size, vocab_size, num_layers,
dropout=config.trainer.dropout, init_scale=config.trainer.init_scale):
self.is_training = is_training
self.input_obj = _input
self.batch_size = _input.batch_size
self.num_steps = _input.num_steps
self.hidden_size = hidden_size
# create the word embeddings
with tf.device("/cpu:0"):
randomized = tf.random_uniform([vocab_size, hidden_size], -init_scale, init_scale)
print("randomized: ", randomized)
embedding = tf.Variable(randomized)
inputs = tf.nn.embedding_lookup(embedding, self.input_obj.input_data)
if is_training and dropout < 1:
inputs = tf.nn.dropout(inputs, dropout)
# set up the state storage / extraction
self.init_state = tf.placeholder(tf.float32, [num_layers, 2, self.batch_size, hidden_size])
state_per_layer_list = tf.unstack(self.init_state, axis=0)
rnn_tuple_state = tuple([tf.contrib.rnn.LSTMStateTuple(state_per_layer_list[idx][0], state_per_layer_list[idx][1])for idx in range(num_layers)])
# create an LSTM cell to be unrolled
print("Hidden size: ", hidden_size)
cell = tf.contrib.rnn.LSTMCell(hidden_size, forget_bias=config.trainer.forget_bias)
# add a dropout wrapper if training
if is_training and dropout < 1:
cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=dropout)
if num_layers > 1:
cell = tf.contrib.rnn.MultiRNNCell([cell for _ in range(num_layers)], state_is_tuple=True)
print("input: ", inputs)
output, self.state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32, initial_state=rnn_tuple_state)
# reshape to (batch_size * num_steps, hidden_size)
output = tf.reshape(output, [-1, hidden_size])
softmax_w = tf.Variable(tf.random_uniform([hidden_size, vocab_size], -init_scale, init_scale))
softmax_b = tf.Variable(tf.random_uniform([vocab_size], -init_scale, init_scale))
logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
# Reshape logits to be a 3-D tensor for sequence loss
logits = tf.reshape(logits, [self.batch_size, self.num_steps, vocab_size])
# Use the contrib sequence loss and average over the batches
loss = tf.contrib.seq2seq.sequence_loss(logits,
self.input_obj.targets,
tf.ones([self.batch_size, self.num_steps], dtype=tf.float32),
average_across_timesteps=False,
average_across_batch=True)
# Update the cost
self.cost = tf.reduce_sum(loss)
# get the prediction accuracy
self.softmax_out = tf.nn.softmax(tf.reshape(logits, [-1, vocab_size]))
self.predict = tf.cast(tf.argmax(self.softmax_out, axis=1), tf.int32)
correct_prediction = tf.equal(self.predict, tf.reshape(self.input_obj.targets, [-1]))
self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
if not is_training:
return
self.learning_rate = tf.Variable(0.01, trainable=False)
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), 5)
optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
self.train_op = optimizer.apply_gradients(zip(grads, tvars),
global_step=tf.contrib.framework.get_or_create_global_step())
self.new_lr = tf.placeholder(tf.float32, shape=[])
self.lr_update = tf.assign(self.learning_rate, self.new_lr)
def assign_lr(self, session, lr_value):
session.run(self.lr_update, feed_dict={self.new_lr: lr_value})
I don't need a solution but some explanation how to move forward would be awesome!!
I'm trying to make a Back Propagation Neural Network with PyTorch. I can successfully execute and test its accuracy, but it doesn't work very efficiently. Now, I'm supposed to increase its efficiency by setting different activation rules for neurons, so that those neurons that don't contribute to the final output get excluded (pruned) from the computations, thereby increasing the time and accuracy.
My code looks like this (extracted snippets) -
# Hyper Parameters
input_size = 20
hidden_size = 50
num_classes =130
num_epochs = 500
batch_size = 5
learning_rate = 0.1
# normalise input data
for column in data:
# the last column is target
if column != data.shape[1] - 1:
data[column] = data.loc[:, [column]].apply(lambda x: (x - x.mean()) / x.std())
# randomly split data into training set (80%) and testing set (20%)
msk = np.random.rand(len(data)) < 0.8
train_data = data[msk]
test_data = data[~msk]
# define train dataset and a data loader
train_dataset = DataFrameDataset(df=train_data)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Neural Network
class Net(nn.Module):
def __init__(self, input_size, hidden_size, num_classes):
super(Net, self).__init__()
self.fc1 = nn.Linear(input_size, hidden_size)
self.sigmoid = nn.Sigmoid()
self.fc2 = nn.Linear(hidden_size, num_classes)
def forward(self, x):
out = self.fc1(x)
out = self.sigmoid(out)
out = self.fc2(out)
return out
net = Net(input_size, hidden_size, num_classes)
# train the model by batch
for epoch in range(num_epochs):
for step, (batch_x, batch_y) in enumerate(train_loader):
# convert torch tensor to Variable
X = Variable(batch_x)
Y = Variable(batch_y.long())
# Forward + Backward + Optimize
optimizer.zero_grad() # zero the gradient buffer
outputs = net(X)
loss = criterion(outputs, Y)
all_losses.append(loss.data[0])
loss.backward()
optimizer.step()
if epoch % 50 == 0:
_, predicted = torch.max(outputs, 1)
# calculate and print accuracy
total = predicted.size(0)
correct = predicted.data.numpy() == Y.data.numpy()
print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Accuracy: %.2f %%' % (epoch + 1, num_epochs, step + 1, len(train_data) // batch_size + 1, loss.data[0], 100 * sum(correct)/total))
Can someone tell me how to do that in PyTorch as I'm very new to PyTorch.
I'm not sure if that question is supposed to be on stackoverflow, but I will give you a hint anyway. You are working with a sigmoid activation function at the moment, the gradient of which vanishes if the input value is too large to small. A commonly used approach is to use the ReLU activation function (stands for rectified linear unit).
ReLU(x) is the identity for the positive domain and 0 for the negative domain, in Python that would be written as follows:
def ReLU(x):
if(x > 0):
return x
else:
return 0
It should be readily available in PyTorch