Problem
I don't understand how to handle the LSTM hidden and cell states when training in mini-batches, since the training data is sent to the network in batches of n sequences, while only one sequence is processed at a time during testing.
Code
Specifically, my network is:
class Pytorch_LSTM(nn.Module):
    def __init__(self, params):
        super(Pytorch_LSTM, self).__init__()
        self.params = params
        self.hidden_layer_size = params['hidden_layer_size']
        # Define layers
        self.lstm = nn.LSTM(input_size=params['in_features'], hidden_size=params['hidden_layer_size'])
        self.linear1 = nn.Linear(params['hidden_layer_size'], params['hidden_layer_size'])
        self.linear2 = nn.Linear(params['hidden_layer_size'], params['out_features'])
        self.hidden_cell = (torch.zeros(1, self.params['batch_size'], self.hidden_layer_size),
                            torch.zeros(1, self.params['batch_size'], self.hidden_layer_size))

    def forward(self, input_seq):
        lstm_out, self.hidden_cell = self.lstm(input_seq.view(self.params['time_window'], -1, self.params['in_features']), self.hidden_cell)
        linear1_out = self.linear1(lstm_out)
        predictions = self.linear2(linear1_out)
        return predictions[-1]
In my train() method:
def train(self, input_sequence, params, test_idx, final, verbose=True):
    ....
    ....
    # Model
    self.model = Pytorch_LSTM(params)

    # Let's train the model
    for epoch in range(epochs):
        for count_1, seq in enumerate(train_data_batch):
            optimizer.zero_grad()
            self.model.hidden_cell = (torch.zeros(1, params['batch_size'], self.model.hidden_layer_size),
                                      torch.zeros(1, params['batch_size'], self.model.hidden_layer_size))
            y_pred = self.model(seq)                 # seq.shape: (n_batches, 25, 4)
            single_loss = mse_loss(y_pred, y_label)  # y_pred.shape, y_label.shape: (batch_size, 4)
This trains the model in mini-batches, I believe correctly.
When I test it, I only have a single sequence at a time instead of a mini-batch. In my test():
for count, seq in enumerate(val_data[j]):
    y_pred = self.model(seq)  # seq.shape: (25, 4)
    single_loss = mse_loss(y_pred, y_label)
This returns the error:
RuntimeError: Expected hidden[0] size (1, 1, 100), got (1, 704, 100)
where n_batches = 704.
How should I handle the hidden_cell?
You are passing the (h_0, c_0) tensors to the LSTM on each call, with shape (1, batch_size, 100). The batch size exists for parallel processing and is arbitrary, but you are hardcoding it every time in

self.hidden_cell = (torch.zeros(1, self.params['batch_size'], self.hidden_layer_size),
                    torch.zeros(1, self.params['batch_size'], self.hidden_layer_size))

This hidden_cell holds the h_0 and c_0 parameters, i.e., the initial values of the hidden and cell states.
Passing a (1, batch_size, 100)-sized tensor is unnecessary, as nn.LSTM defaults to zero tensors of the required size by itself.
Just get rid of self.hidden_cell and pass only input_seq to self.lstm in the forward method. It should work.
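For illustration, a minimal sketch of the forward method with the explicit state removed; when no (h_0, c_0) is given, PyTorch zero-initializes both to match the batch size of the current input:

def forward(self, input_seq):
    # No explicit (h_0, c_0): PyTorch defaults to zeros sized for the current batch
    lstm_out, _ = self.lstm(input_seq.view(self.params['time_window'], -1, self.params['in_features']))
    linear1_out = self.linear1(lstm_out)
    predictions = self.linear2(linear1_out)
    return predictions[-1]

With the defaults, the same model handles a training batch of 704 sequences and a single test sequence without any manual resizing (also drop the hidden-state reset in the training loop).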
Related
I'm trying to complete a task by writing a simple RNN. Here's the class:
class RNNBaseline(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim)  # RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)  # YOUR CODE GOES HERE
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths, hidden=None):
        # text = [sent len, batch size]
        embedded = self.embedding(text)
        # embedded = [sent len, batch size, emb dim]

        # pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)

        # cell arg for LSTM, remove for GRU
        # packed_output, (hidden, cell) = self.rnn(packed_embedded)

        # unpack sequence
        # output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        # output = [sent len, batch size, hid dim * num directions]
        # output over padding tokens are zero tensors
        # hidden = [num layers * num directions, batch size, hid dim]
        # cell = [num layers * num directions, batch size, hid dim]

        # concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        # and apply dropout
        output, hidden = self.rnn(packed_embedded, hidden)
        # hidden = None # concatenate
        # hidden = [batch size, hid dim * num directions]
        return self.fc(hidden)
For now I'm not using an LSTM or trying to do a bidirectional RNN; I just want a simple GRU to train without errors. This is the training function:
import numpy as np

min_loss = np.inf
cur_patience = 0

for epoch in range(1, max_epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, ((text, txt_len), label) in pbar:
        #YOUR CODE GOES HERE
        opt.zero_grad()
        input = text.to(device)
        labels = label.to(device)
        output = model(input, txt_len.type(torch.int64).cpu())
        train_loss = loss_func(output, labels)
        train_loss.backward()
        opt.step()
    train_loss /= len(train_iter)

    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, ((text, txt_len), label) in pbar:
        # YOUR CODE GOES HERE
        input = text.to(device)
        labels = label.to(device)
        output = model(input, txt_len.type(torch.int64).cpu())
        val_loss = loss_func(output, labels)
    val_loss /= len(valid_iter)

    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break

    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))

model.load_state_dict(best_model)
And some variables:
vocab_size = len(TEXT.vocab)
emb_dim = 100
hidden_dim = 256
output_dim = 1
n_layers = 2
bidirectional = False
dropout = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
patience=3
opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()
max_epochs = 1
But I get this error:
ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([1, 64, 1]))
... in this line:
---> 18 train_loss = loss_func(output, labels)
What am I doing wrong?
nn.BCEWithLogitsLoss expects both outputs and targets (or, in your case, labels) to be of size [b, d], where b is the batch size and d is the number of classes (or the dimension of whatever you are trying to predict). Currently, your outputs are of size [b, d, 1] and your targets are of size [d]. Two fixes are necessary, and both are very simple:
1. Add a batch dimension to your targets (labels). This is a common error when using a dataset that returns data elements, because it generally does not add a batch dimension. Encapsulating your dataset class within a PyTorch DataLoader would handle this, but if you don't want to do that, simply add an unsqueeze() operation. Note that the unsqueeze approach only works with a batch size of 1; otherwise, using a DataLoader is probably a better bet.
2. Your output has a singleton 3rd dimension, which can easily be flattened with a squeeze() operation. Both unsqueeze and squeeze are differentiable, so they shouldn't present problems for backpropagation.
... code before here
for it, ((text, txt_len), label) in pbar:
    # YOUR CODE GOES HERE
    input = text.to(device)
    labels = label.to(device).unsqueeze(0)  # added unsqueeze operation
    output = model(input, txt_len.type(torch.int64).cpu())
    output = output.squeeze(-1)  # added squeeze on last dim
    val_loss = loss_func(output, labels)
... code after here
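As a quick sanity check of the shape arithmetic, here is a standalone sketch with dummy tensors using the sizes from the error message:

import torch

output = torch.randn(1, 64, 1)   # model output: [1, batch, 1]
labels = torch.randn(64)         # targets: [batch]
output = output.squeeze(-1)      # -> [1, 64]
labels = labels.unsqueeze(0)     # -> [1, 64]
assert output.shape == labels.shape  # shapes now match, as BCEWithLogitsLoss expects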
Hello, I'm trying to understand the following:
I have created the following neural network model using PyTorch to run a regression task.
class Model(nn.Module):
    def __init__(self, in_features, h1, h2, out_features=0):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(in_features, h1)   # input layer
        self.fc2 = nn.Linear(h1, h2)            # hidden layer
        self.out = nn.Linear(h2, out_features)  # output layer

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.out(x)
        return x
model = Model(in_features=59, h1=64, h2=32, out_features=1)
Then we get to the training, where I run the following code:
epochs = 300
losses = []

for i in range(epochs):
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)
    losses.append(loss.detach().numpy())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
Everything works fine, but through the model's forward() method my y_pred gets the shape [1359, 1] (I guess it should be [1359], since my y_train matches that shape), and I get the following warning:
C:\Users\hp\anaconda3\lib\site-packages\torch\nn\modules\loss.py:528: UserWarning: Using a target size (torch.Size([1359])) that is different to the input size (torch.Size([1359, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input, target, reduction=self.reduction)
This also happens when I try to evaluate my model:
with torch.no_grad():
    y_val = model(X_test)
    loss = criterion(y_val.flatten(), y_test)
    print(loss)
Indeed, you have a shape mismatch: your model outputs a tensor of shape (batch_size, 1), while your target is shaped (batch_size,). You have to explicitly reshape one of the tensors so that both inputs of your criterion have the same shape.
Either by reshaping the prediction y_val itself:
>>> loss = criterion(y_val[:,0], y_test)
Or the target tensor y_test:
>>> loss = criterion(y_val, y_test[:,None])
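The same fix applies inside the training loop from the question, so the warning disappears there as well; a minimal sketch, keeping the question's MSE criterion:

loss = criterion(y_pred, y_train.unsqueeze(1))  # target: (1359,) -> (1359, 1)

Either direction works; the only requirement is that prediction and target end up with identical shapes.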
I am trying to use an RNN to do binary classification. But when my model is training, it gets stuck at loss.backward().
Here is my model:
class RNN2(nn.Module):
    def __init__(self, input_size, hidden_size, output_size=2, num_layers=1):
        super(RNN2, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers)
        self.reg = nn.Linear(hidden_size, output_size)
        # self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x, hidden = self.rnn(x)
        return self.reg(x[:, 2])
rnn = RNN2(13, 10)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

for e in range(10):
    out = rnn(train_X)
    optimizer.zero_grad()
    print(out[0])
    print(out.shape)
    print(train_Y.shape)
    loss = criterion(out, train_Y)
    print(loss)
    loss.backward()
    print("1")
    optimizer.step()
    print("2")
The shape of train_X is 420000 × 3 × 13 and the shape of train_Y is 420000.
So it can print the loss, but it never prints "1". Can anyone tell me why it gets stuck at loss.backward()?
You have to know that in RNNs, computing the backward pass for a sequence of length 420000 is extremely slow; it is not stuck, just slow. If you run your code on a machine with a GPU (or Google Colab) and add the following lines before the for loop, your code finishes executing in less than two minutes.
rnn = rnn.cuda()
train_X = train_X.cuda()
train_Y = train_Y.cuda()
Note that by default, the second input dimension passed to an RNN is treated as the batch size (and the first as the sequence length). Therefore, if 420000 is actually the number of samples rather than the sequence length, pass batch_first=True to the RNN constructor:
self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
This would significantly speed up the process (less than one second in Google Colab). However, if that is not the case, you should try chunking the sequences into smaller parts and increasing the batch size from 3 to a larger value; a rough sketch of such chunking follows.
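For illustration only, a hypothetical sketch of that chunking idea, assuming dim 0 of train_X really is one long time axis and train_Y holds one label per step; chunk_len is an arbitrary choice:

chunk_len = 1000  # hypothetical chunk size
for chunk_x, chunk_y in zip(train_X.split(chunk_len, dim=0),
                            train_Y.split(chunk_len, dim=0)):
    optimizer.zero_grad()
    out = rnn(chunk_x)
    loss = criterion(out, chunk_y)
    loss.backward()  # backward now spans only chunk_len steps
    optimizer.step()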
I currently have my neural network training with batch_size = 1. To run it across multiple GPUs, I need the batch size to be larger than the number of GPUs, so I want batch_size = 16. However, given the way my data is set up, I am not sure how to change that.
The data is read from a CSV file:
raw_data = pd.read_csv("final.csv")
train_data = raw_data[:750]
test_data = raw_data[750:]
Then the data is normalized and turned into tensors:
# normalize features
scaler = MinMaxScaler(feature_range=(-1, 1))
scaled_train = scaler.fit_transform(train_data)
scaled_test = scaler.transform(test_data)
# Turn into PyTorch tensors
train_data_normalized = torch.FloatTensor(scaled_train).view(-1)
test_data_normalized = torch.FloatTensor(scaled_test).view(-1)
Then the data is turned into tuples of tensors in [input list, output] format, e.g.:
(tensor([1, 3, 56, 63, 3]), tensor([34]))
# Convert to tensor tuples
def input_series_sequence(input_data, tw):
    inout_seq = []
    L = len(input_data)
    i = 0
    for index in range(L - tw):
        train_seq = input_data[i:i + tw]
        train_label = input_data[i + tw:i + tw + 1]
        inout_seq.append((train_seq, train_label))
        i = i + tw
    return inout_seq
train_inout_seq = input_series_sequence(train_data_normalized, train_window)
test_input_seq = input_series_sequence(test_data_normalized, train_window)
And then the model is trained like so:
for i in range(epochs):
    for seq, labels in train_inout_seq:
        optimizer.zero_grad()
        model.module.hidden_cell = model.module.init_hidden()
        seq = seq.to(device)
        labels = labels.to(device)
        y_pred = model(seq)
        single_loss = loss_function(y_pred, labels)
        single_loss.backward()
        optimizer.step()
So I want to know how exactly to change the batch_size from 1 to 16. Do I need to use Dataset and DataLoader? And if so, how exactly would it fit in with my current code? Thanks!
Edit: The model is defined like this; I might have to change the forward function?
class LSTM(nn.Module):
    def __init__(self, input_size=1, hidden_layer_size=100, output_size=1):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        self.lstm = nn.LSTM(input_size, hidden_layer_size)
        self.linear = nn.Linear(hidden_layer_size, output_size)
        self.hidden_cell = (torch.zeros(1, 1, self.hidden_layer_size),
                            torch.zeros(1, 1, self.hidden_layer_size))

    def init_hidden(self):
        return (torch.zeros(1, 1, self.hidden_layer_size),
                torch.zeros(1, 1, self.hidden_layer_size))

    def forward(self, input_seq):
        lstm_out, self.hidden_cell = self.lstm(input_seq.view(len(input_seq), 1, -1), self.hidden_cell)
        predictions = self.linear(lstm_out.view(len(input_seq), -1))
        return predictions[-1]
You can do this by wrapping your model in the nn.DataParallel class.
model = nn.DataParallel(model)
Since I don't have access to multiple GPUs and your data right now to test, I'll direct you here
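As for the batching question: since train_inout_seq is just a list of (sequence, label) tensor tuples, a DataLoader's default collate function can batch it directly. A minimal, untested sketch:

from torch.utils.data import DataLoader

loader = DataLoader(train_inout_seq, batch_size=16, shuffle=True,
                    drop_last=True)  # avoid a smaller final batch

for i in range(epochs):
    for seq, labels in loader:  # seq: (16, train_window), labels: (16, 1)
        optimizer.zero_grad()
        seq = seq.to(device)
        labels = labels.to(device)
        y_pred = model(seq)
        single_loss = loss_function(y_pred, labels)
        single_loss.backward()
        optimizer.step()

Note that the model's forward (and the hard-coded (1, 1, hidden_layer_size) states in init_hidden) would also have to accept a batch dimension of 16, e.g. by viewing the input as (train_window, 16, 1) before the LSTM.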
I have an autoencoder set up in Keras. I want to be able to weight the features of the input vector according to a predetermined 'precision' vector. This continuous valued vector has the same length as the input, and each element lies in the range [0, 1], corresponding to the confidence in the corresponding input element, where 1 is completely confident and 0 is no confidence.
I have a precision vector for every example.
I have defined a loss that takes into account this precision vector. Here, reconstructions of low-confidence features are down-weighted.
def MAEpw_wrapper(y_prec):
    def MAEpw(y_true, y_pred):
        return K.mean(K.square(y_prec * (y_pred - y_true)))
    return MAEpw
My issue is that the precision tensor y_prec depends on the batch. I want to be able to update y_prec according to the current batch so that each precision vector is correctly associated with its observation.
I have done the following:
global y_prec
y_prec = K.variable(P[:32])
Here P is a numpy array containing all precision vectors with the indices corresponding to the examples. I initialize y_prec to have the correct shape for a batch size of 32. I then define the following DataGenerator:
class DataGenerator(Sequence):
    def __init__(self, batch_size, y, shuffle=True):
        self.batch_size = batch_size
        self.y = y
        self.shuffle = shuffle
        self.on_epoch_end()

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.y))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index * self.batch_size: (index + 1) * self.batch_size]

        # Set precision vector.
        global y_prec
        new_y_prec = K.variable(P[indexes])
        y_prec = K.update(y_prec, new_y_prec)

        # Get training examples.
        y = self.y[indexes]

        return y, y
Here I am aiming to update y_prec in the same function that generates the batch. This seems to be updating y_prec as expected. I then define my model architecture:
dims = [40, 20, 2]
model2 = Sequential()
model2.add(Dense(dims[0], input_dim=64, activation='relu'))
model2.add(Dense(dims[1], input_dim=dims[0], activation='relu'))
model2.add(Dense(dims[2], input_dim=dims[1], activation='relu', name='bottleneck'))
model2.add(Dense(dims[1], input_dim=dims[2], activation='relu'))
model2.add(Dense(dims[0], input_dim=dims[1], activation='relu'))
model2.add(Dense(64, input_dim=dims[0], activation='linear'))
And finally, I compile and run:
model2.compile(optimizer='adam', loss=MAEpw_wrapper(y_prec))
model2.fit_generator(DataGenerator(32, digits.data), epochs=100)
Where digits.data is a numpy array of observations.
However, this ends up defining separate graphs:
StopIteration: Tensor("Variable:0", shape=(32, 64), dtype=float32_ref) must be from the same graph as Tensor("Variable_4:0", shape=(32, 64), dtype=float32_ref).
I've scoured SO for a solution to my problem but nothing I've found works. Any help on how to do this properly is appreciated.
This autoencoder can be easily implemented using the Keras functional API. This will allow you to have an additional input placeholder, y_prec_input, which will be fed with the "precision" vector. The full source code can be found here.
Data generator
First, let's reimplement your data generator as follows:
class DataGenerator(Sequence):
    def __init__(self, batch_size, y, prec, shuffle=True):
        self.batch_size = batch_size
        self.y = y
        self.shuffle = shuffle
        self.prec = prec
        self.on_epoch_end()

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.y))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __len__(self):
        return int(np.floor(len(self.y) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index * self.batch_size: (index + 1) * self.batch_size]
        y = self.y[indexes]
        y_prec = self.prec[indexes]
        return [y, y_prec], y
Note that I got rid of the global variable. Now, instead, the precision vector P is provided as an input argument (prec), and the generator yields an additional input that will be fed to the precision placeholder y_prec_input (see the model definition).
Model
Finally, your model can be defined and trained as follows:
y_input = Input(shape=(input_dim,))
y_prec_input = Input(shape=(input_dim,))
h_enc = Dense(dims[0], activation='relu')(y_input)
h_enc = Dense(dims[1], activation='relu')(h_enc)
h_enc = Dense(dims[2], activation='relu', name='bottleneck')(h_enc)
h_dec = Dense(dims[1], activation='relu')(h_enc)
h_dec = Dense(input_dim, activation='relu')(h_dec)
model2 = Model(inputs=[y_input, y_prec_input], outputs=h_dec)
model2.compile(optimizer='adam', loss=MAEpw_wrapper(y_prec_input))

# Train model
model2.fit_generator(DataGenerator(32, digits.data, P), epochs=100)
where input_dim = digits.data.shape[1]. Note that the output dimension of the decoder is input_dim, since it must match the input dimension, and y_prec_input has shape (input_dim,), since each precision vector has the same length as the input.
Try to test your code with workers=0 when you call fit_generator; if it works normally, then threading is your problem.
If threading is the cause, try this:
# In the code that executes on the main thread
graph = tf.get_default_graph()

# In code that executes in other threads (e.g. your generator)
with graph.as_default():
    ...
    ...
    new_y_prec = K.variable(P[indexes])
    y_prec = K.update(y_prec, new_y_prec)