class LSTM(nn.Module):
def __init__(self, input_size=1, output_size=1, hidden_size=100, num_layers=16):
super().__init__()
self.hidden_size = hidden_size
self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
self.linear = nn.Linear(hidden_size, output_size)
self.num_layers = num_layers
self.hidden_cell = (torch.zeros(self.num_layers,12 ,self.hidden_size).to(device),
torch.zeros(self.num_layers,12 ,self.hidden_size).to(device))
def forward(self, input_seq):
#lstm_out, self.hidden_cell = self.lstm(input_seq.view(len(input_seq) ,1, -1), self.hidden_cell)
lstm_out, self.hidden_cell = self.lstm(input_seq, self.hidden_cell)
predictions = self.linear(lstm_out[:,-1,:])
return predictions
This is my LSTM model, Input is a 4 dimension vector. Batch size is 16 and time stamp is 12. I want to find 13th vector with using 12 sequence vector. My LSTM block have [16,12,48] output. I did not understand why i have choose the last one:
out[:,-1,:]
From how it looks, your problem is like a text (i.e., sequence) classification problem, with output_size being the number of classes that you want to assign text to. By choosing lstm_out[:,-1,:], you actually intend to predict the label associated with the input text only using the last hidden state of your LSTM network, which totally makes sense. This is what people commonly do for text classification problems. Your linear layer, thereafter, will output logits for each class, and then you can use nn.Softmax() to get the probabilities of those.
The last hidden state of the LSTM network is the propagation of all previous hidden states of the LSTM, meaning that it has the aggregated information of previous input states that it has encoded (let's consider that you're using uni-directional LSTM as in your example). So for classifying an input text, you will have to do the classification based on the overall information of all tokens within the input text (which is encoded in the last hidden state of your LSTM). That is why you feed only the last hidden state to the linear layer that is upon your LSTM network.
Note: If you intended to do sequence-tagging (such as Named-entity recognition), then you would use all the hidden state outputs from your LSTM network. In such tasks, you would actually need information about a specific token within the input.
Related
I'm new to pytorch, I followed a tutorial on sentence generation with RNN and I'm trying to modify it to generate sequences of positions, however I'm having trouble with defining the correct model parameters such as input_size, output_size, hidden_dim, batch_size.
Background:
I have 596 sequences of x,y positions, each looking like [[x1,y1],[x2,y2],...,[xn,yn]]. Each sequence represents the 2D path of a vehicle. I would like to to train a model that, given a starting point (or a partial sequence), could generate one of these sequences.
-I have padded/truncated the sequences so that they all have length 50, meaning each sequence is an array of shape [50,2]
-I then divided this data into input_seq and target_seq:
input_seq: tensor of torch.Size([596, 49, 2]). contains all the 596 sequences, each without its last position.
target_seq: tensor of torch.Size([596, 49, 2]). contains all the 596 sequences, each without its first position.
The model class:
class Model(nn.Module):
def __init__(self, input_size, output_size, hidden_dim, n_layers):
super(Model, self).__init__()
# Defining some parameters
self.hidden_dim = hidden_dim
self.n_layers = n_layers
#Defining the layers
# RNN Layer
self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
# Fully connected layer
self.fc = nn.Linear(hidden_dim, output_size)
def forward(self, x):
batch_size = x.size(0)
# Initializing hidden state for first input using method defined below
hidden = self.init_hidden(batch_size)
# Passing in the input and hidden state into the model and obtaining outputs
out, hidden = self.rnn(x, hidden)
# Reshaping the outputs such that it can be fit into the fully connected layer
out = out.contiguous().view(-1, self.hidden_dim)
out = self.fc(out)
return out, hidden
def init_hidden(self, batch_size):
# This method generates the first hidden state of zeros which we'll use in the forward pass
# We'll send the tensor holding the hidden state to the device we specified earlier as well
hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
return hidden
I instantiate the model with the following parameters:
input_size of 2 (an [x,y] position)
output_size of 2 (an [x,y] position)
hidden_dim of 2 (an [x,y] position) (or should this be 50 as in the length of a full sequence?)
model = Model(input_size=2, output_size=2, hidden_dim=2, n_layers=1)
n_epochs = 100
lr=0.01
# Define Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Training Run
for epoch in range(1, n_epochs + 1):
optimizer.zero_grad() # Clears existing gradients from previous epoch
output, hidden = model(input_seq)
loss = criterion(output, target_seq.view(-1).long())
loss.backward() # Does backpropagation and calculates gradients
optimizer.step() # Updates the weights accordingly
if epoch%10 == 0:
print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
print("Loss: {:.4f}".format(loss.item()))
When I run the training loop, it fails with this error:
ValueError Traceback (most recent call last)
<ipython-input-9-ad1575e0914b> in <module>
3 optimizer.zero_grad() # Clears existing gradients from previous epoch
4 output, hidden = model(input_seq)
----> 5 loss = criterion(output, target_seq.view(-1).long())
6 loss.backward() # Does backpropagation and calculates gradients
7 optimizer.step() # Updates the weights accordingly
...
ValueError: Expected input batch_size (29204) to match target batch_size (58408).
I tried modifying input_size, output_size, hidden_dim and batch_size and reshaping the tensors, but the more I try the more confused I get. Could someone point out what I am doing wrong?
Furthermore, since batch size is defined as x.size(0) in Model.forward(self,x), this means I only have a single batch of size 596 right? What would be the correct way to have multiple smaller batches?
The output has size [batch_size * seq_len, 2] = [29204, 2], and you flatten the target_seq, which has size [batch_size * seq_len * 2] = [58408]. They don't have the same number of dimensions, while having the same number of total elements, therefore the first dimensions are not identical.
Regardless of the dimension mismatch, nn.CrossEntropyLoss is a categorical loss function, which means it would only predict a class from the output. You don't have any classes, but you are trying to predict coordinates, which are continuous values. For this you need to use a regression loss function, such as nn.MSELoss, which calculates the squared error/distance between the predicted and target coordinates.
criterion = nn.MSELoss()
# .flatten() does the same thing as .view(-1) but is more descriptive
loss = criterion(output.flatten(), target_seq.flatten())
The flattening can be avoided as the loss functions as well as the linear layer can operate on multidimensional inputs, which removes the potential risk of getting lost with the flattening and restoring of the dimensions, and the output is more comprehensible to inspect or use later outside of the training. For the linear layer, only the last dimension of the input needs to match the in_features of nn.Linear, which is hidden_dim in your case.
def forward(self, x):
batch_size = x.size(0)
# Initializing hidden state for first input using method defined below
hidden = self.init_hidden(batch_size)
# Passing in the input and hidden state into the model and obtaining outputs
# out size: [batch_size, seq_len, hidden_dim]
out, hidden = self.rnn(x, hidden)
# out size: [batch_size, seq_len, output_size]
out = self.fc(out)
return out, hidden
Now the output of the model has the same size as the target_seq and you can directly call the loss function without flattening:
loss = criterion(output, target_seq)
hidden_dim of 2 (an [x,y] position) (or should this be 50 as in the length of a full sequence?)
The hidden_dim is not a pair of [x, y] and is completely unrelated to both the input_size and output_size. It defines the number of hidden features of the RNN, which is kind of its complexity, and bigger sizes potentially have more room to retain essential information, but also require more computations. There is no perfect hidden size and it largely depends on the use case. You can experiment with different sizes, e.g. 100, 256, etc. and see whether that improves your results.
Furthermore, since batch size is defined as x.size(0) in Model.forward(self,x), this means I only have a single batch of size 596 right? What would be the correct way to have multiple smaller batches?
Yes, you only have a single batch of size 596. If you want to use smaller batches, for example if you cannot fit all of them into a more complex model, you could easily use slices of them, but it would be better to use PyTorch's data utilities: torch.utils.data.TensorDataset to get a dataset, where each sequence of the input has a corresponding target, in combination with torch.utils.data.DataLoader to create batches for you.
from torch.utils.data import DataLoader, TensorDataset
# Match each sequence of the input_seq to the corresponding target_seq.
# e.g. dataset[0] == (input_seq[0], target_seq[0])
dataset = TensorDataset(input_seq, target_seq)
# Randomly shuffle the data and load it in batches of 16
data_loader = DataLoader(dataset, batch_size=16, shuffle=True)
# Process one batch at a time
for input, target in data_loader:
output, hidden = model(input)
loss = criterion(output, target)
I am learning deep learning and am trying to understand the pytorch code given below. I'm struggling to understand how the probability calculation works. Can somehow break it down in lay-man terms. Thanks a ton.
ps = model.forward(images[0,:])
# Hyperparameters for our network
input_size = 784
hidden_sizes = [128, 64]
output_size = 10
# Build a feed-forward network
model = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
nn.ReLU(),
nn.Linear(hidden_sizes[0], hidden_sizes[1]),
nn.ReLU(),
nn.Linear(hidden_sizes[1], output_size),
nn.Softmax(dim=1))
print(model)
# Forward pass through the network and display output
images, labels = next(iter(trainloader))
images.resize_(images.shape[0], 1, 784)
print(images.shape)
ps = model.forward(images[0,:])
I'm a layman so I'll help you with the layman's terms :)
input_size = 784
hidden_sizes = [128, 64]
output_size = 10
These are parameters for the layers in your network. Each neural network consists of layers, and each layer has an input and an output shape.
Specifically input_size deals with the input shape of the first layer. This is the input_size of the entire network. Each sample that is input into the network will be a 1 dimension vector that is length 784 (array that is 784 long).
hidden_size deals with the shapes inside the network. We will cover this a little later.
output_size deals with the output shape of the last layer. This means that our network will output a 1 dimensional vector that is length 10 for each sample.
Now to break up model definition line by line:
model = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
The nn.Sequential part simply defines a network, each argument that is input defines a new layer in that network in that order.
nn.Linear(input_size, hidden_sizes[0]) is an example of such a layer. It is the first layer of our network takes in an input of size input_size, and outputs a vector of size hidden_sizes[0]. The size of the output is considered "hidden" in that it is not the input or the output of the whole network. It "hidden" because it's located inside of the network far from the input and output ends of the network that you interact with when you actually use it.
This is called Linear because it applies a linear transformation by multiplying the input by its weights matrix and adding its bias matrix to the result. (Y = Ax + b, Y = output, x = input, A = weights, b = bias).
nn.ReLU(),
ReLU is an example of an activation function. What this function does is apply some sort of transformation to the output of the last layer (the layer discussed above), and outputs the result of that transformation. In this case the function being used is the ReLU function, which is defined as ReLU(x) = max(x, 0). Activation functions are used in neural networks because they create non-linearities. This allows your model to model non-linear relationships.
nn.Linear(hidden_sizes[0], hidden_sizes[1]),
From what we discussed above, this is a another example of a layer. It takes an input of hidden_sizes[0] (same shape as the output of the last layer) and outputs a 1D vector of length hidden_sizes[1].
nn.ReLU(),
Apples the ReLU function again.
nn.Linear(hidden_sizes[1], output_size)
Same as the above two layers, but our output shape is the output_size this time.
nn.Softmax(dim=1))
Another activation function. This activation function turns the logits outputted by nn.Linear into an actual probability distribution. This lets the model output the probability for each class. At this point our model is built.
# Forward pass through the network and display output
images, labels = next(iter(trainloader))
images.resize_(images.shape[0], 1, 784)
print(images.shape)
These are simply just preprocessing training data and putting it into the correct format
ps = model.forward(images[0,:])
This passes the images through the model (forward pass) and applies the operations previously discussed in layer. You get the resultant output.
Although not new to Machine Learning, I am still relatively new to Neural Networks, more specifically how to implement them (In Keras/Python). Feedforwards and Convolutional architectures are fairly straightforward, but I am having trouble with RNNs.
My X data consists of variable length sequences, each data-point in that sequence having 26 features. My y data, although of variable length, each pair of X and y have the same length, e.g:
X_train[0].shape: (226,26)
y_train[0].shape: (226,)
X_train[1].shape: (314,26)
y_train[1].shape: (314,)
X_train[2].shape: (189,26)
y_train[2].shape: (189,)
And my objective is to classify each item in the sequence into one of 39 categories.
What I can gather thus far from reading example code, is that we do something like the following:
encoder_inputs = Input(shape=(None, 26))
encoder = GRU(256, return_state=True)
encoder_outputs, state_h = encoder(encoder_inputs)
decoder_inputs = Input(shape=(None, 39))
decoder_gru= GRU(256, return_sequences=True)
decoder_outputs, _ = decoder_gru(decoder_inputs, initial_state=state_h)
decoder_dense = Dense(39, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(loss=keras.losses.categorical_crossentropy,
optimizer=keras.optimizers.Adadelta(),
metrics=['accuracy'])
Which makes sense to me, because each of the sequences have different lengths.
So with a for loop that loops over all sequences, we use None in the input shape of the first GRU layer because we are unsure what the sequence length will be, and then return the hidden state state_h of that encoder. With the second GRU layer returning sequences, and the initial state being the state returned from the encoder, we then pass the outputs to a final softmax activation layer.
Obviously something is flawed here because I get:
decoder_outputs, _ = decoder_gru(decoder_inputs, initial_state=state_h)
File "/usr/local/lib/python3.6/dist-
packages/tensorflow/python/framework/ops.py", line 458, in __iter__
"Tensor objects are only iterable when eager execution is "
TypeError: Tensor objects are only iterable when eager execution is
enabled. To iterate over this tensor use tf.map_fn.
This link points to a proposed solution, but I don't understand why you would add encoder states to a tuple for as many layers you have in the network.
I'm really looking for help in being able to successfully write this RNN to do this task, but also understanding. I am very interested in RNNs and want to understand them more in depth so I can apply them to other problems.
As an extra note, each sequence is of shape (sequence_length, 26), but I expand the dimension to be (1, sequence_length, 26) for X and (1, sequence_length) for y, and then pass them in a for loop to be fit, with the decoder_target_data one step ahead of the current input:
for idx in range(X_train.shape[0]):
X_train_s = np.expand_dims(X_train[idx], axis=0)
y_train_s = np.expand_dims(y_train[idx], axis=0)
y_train_s1 = np.expand_dims(y_train[idx+1], axis=0)
encoder_input_data = X_train_s
decoder_input_data = y_train_s
decoder_target_data = y_train_s1
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
epochs=50,
validation_split=0.2)
With other networks I have wrote (FeedForward and CNN), I specify the model by adding layers on top of Keras's Sequential class. Because of the inherent complexity of RNNs I see the general format of using Keras's Input class like above and retrieving hidden states (and cell states for LSTM) etc... to be logical, but I have also seen them built from using Keras's Sequential Class. Although these were many to one type tasks, I would be interested in how you would write it that way too.
The problem is that the decoder_gru layer does not return its state, therefore you should not use _ as the return value for the state (i.e. just remove , _):
decoder_outputs = decoder_gru(decoder_inputs, initial_state=state_h)
Since the input and output lengths are the same and there is a one to one mapping between the elements of input and output, you can alternatively construct the model this way:
inputs = Input(shape=(None, 26))
gru = GRU(64, return_sequences=True)(inputs)
outputs = Dense(39, activation='softmax')(gru)
model = Model(inputs, outputs)
Now you can make this model more complex (i.e. increase its capacity) by stacking multiple GRU layers on top of each other:
inputs = Input(shape=(None, 26))
gru = GRU(256, return_sequences=True)(inputs)
gru = GRU(128, return_sequences=True)(gru)
gru = GRU(64, return_sequences=True)(gru)
outputs = Dense(39, activation='softmax')(gru)
model = Model(inputs, outputs)
Further, instead of using GRU layers, you can use LSTM layers which has more representational capacity (of course this may come at the cost of increasing computational cost). And don't forget that when you increase the capacity of the model you increase the chance of overfitting as well. So you must keep that in mind and consider solutions that prevent overfitting (e.g. adding regularization).
Side note: If you have a GPU available, then you can use CuDNNGRU (or CuDNNLSTM) layer instead, which has been optimized for GPUs so it runs much faster compared to GRU.
I have developed an Encoder(CNN)-Decoder (RNN) network for image captioning in pytorch. The decoder network takes in two inputs- Context feature vector from the Encoder and the word embeddings of the caption for training. The context feature vector is of size = embed_size , which is also the embedding size of each word in the caption. My question here is more concerned with the output of the Class DecoderRNN. Please refer to the code below.
class DecoderRNN(nn.Module):
def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
super(DecoderRNN, self).__init__()
self.embed_size = embed_size
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.num_layers = num_layers
self.linear = nn.Linear(hidden_size, vocab_size)
self.embed = nn.Embedding(vocab_size, embed_size)
self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first = True)
def forward(self, features, captions):
embeddings = self.embed(captions)
embeddings = torch.cat((features.unsqueeze(1), embeddings),1)
hiddens,_ = self.lstm(embeddings)
outputs = self.linear(hiddens)
return outputs
In the forward function , I send in a sequence of (batch_size, caption_length+1, embed_size) (concatenated tensor of context feature vector and the embedded caption) . The output of the sequence should be captions and of the shape (batch_size, caption_length, vocab_size) , but I am still receiving an output of shape (batch_size, caption_length +1, vocab_size) . Can anyone please suggest what should I alter in my forward function so that extra 2nd dimension is not received? Thanks in advance
Since in LSTM (or in any RNN) for each time step (or caption length here), there will be one output, I do not see any problem here. What you need to do is make input size (caption_length) at the second dimension to get the required output. (or people usually add a < END of SENTENCE > tag to target. Hence the target length is caption+1)
I'm working on text summarization, starting with the pointer-generator network described in the paper: https://arxiv.org/pdf/1704.04368.pdf, with a code release at: https://github.com/abisee/pointer-generator
I want to add hierarchical encoding to this network to be able to handle larger input documents for summarization. Right now, input documents are truncated at a length of 400 words because LSTMs have a keeping memory over very long inputs. We want to reweight the attention distribution considering sentence input.
The reweighting I am considering is from the paper: https://arxiv.org/pdf/1602.06023.pdf where the final distribution is
where "P_a^w(j) is the word-level attention weight at jth position of the source document, and s(j) is the ID of the sentence at jth word position, P_a^s(l)is the sentence-level attention weight for the lth sentence in the source, Nd is the number of words in the source document, and P_a(j) is the re-scaled attention at the jth word position"
The Pointer-Generator attention is calculated as
then
softmax(e^t_ j)
So I think I need to make another Pt-gen attention LSTM, feed the outputs through the same attention calculation with softmax as the words, and then at the end reweight as described in equation 1.
I have added a new bidirectionalLSTM under def _add_encoder. I am wondering how I should handle input into the sentence LSTM. The model needs to have some kind of indication of the position of the sentence and the words in it. How should I structure the sentences to be processed by the sentence LSTM?
def _add_encoder(self, encoder_inputs, seq_len):
"""Add a single-layer bidirectional LSTM encoder to the graph.
Args:
encoder_inputs: A tensor of shape [batch_size, <=max_enc_steps, emb_size].
seq_len: Lengths of encoder_inputs (before padding). A tensor of shape [batch_size].
Returns:
encoder_outputs:
A tensor of shape [batch_size, <=max_enc_steps, 2*hidden_dim]. It's 2*hidden_dim because it's the concatenation of the forwards and backwards states.
fw_state, bw_state:
Each are LSTMStateTuples of shape ([batch_size,hidden_dim],[batch_size,hidden_dim])
"""
with tf.variable_scope('encoder'):
cell_fw = tf.contrib.rnn.LSTMCell(self._hps.hidden_dim, initializer=self.rand_unif_init, state_is_tuple=True)
cell_bw = tf.contrib.rnn.LSTMCell(self._hps.hidden_dim, initializer=self.rand_unif_init, state_is_tuple=True)
(encoder_outputs, (fw_st, bw_st)) = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, encoder_inputs, dtype=tf.float32, sequence_length=seq_len, swap_memory=True)
encoder_outputs = tf.concat(axis=2, values=encoder_outputs) # concatenate the forwards and backwards states
sentence_encoder_inputs = ???
sentence_lens = len(sentences_encoder_inputs)
### NEW CODE ###
sentence_cell_fw = tf.contrib.rnn.LSTMCell(self._hps.hidden_dim, initializer=self.rand_unif_init, state_is_tuple=True)
sentence_cell_bw = tf.contrib.rnn.LSTMCell(self._hps.hidden_dim, initializer=self.rand_unif_init, state_is_tuple=True)
(sentence_encoder_outputs, (fw_st, bw_st)) = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, sentence_encoder_inputs, dtype=tf.float32, sequence_length=sentence_lens, swap_memory=True)
sentence_encoder_outputs = tf.concat(axis=2, values=sentence_encoder_outputs) # concatenate the forwards and backwards states
return encoder_outputs, sentence_encoder_outputs, fw_st, bw_st