RNN model (GRU) of word2vec to regression not learning

RNN model (GRU) of word2vec to regression not learning - python

I am converting Keras code into PyTorch because I am more familiar with the latter than the former. However, I found that it is not learning (or only barely).
Below I have provided almost all of my PyTorch code, including the initialisation code so that you can try it out yourself. The only thing you would need to provide yourself, is the word embeddings (I'm sure you can find many word2vec models online). The first input file should be a file with tokenised text, the second input file should be a file with floating-point numbers, one per line. Because I have provided all the code, this question may seem huge and too broad. However, my question is specific enough I think: what is wrong in my model or training loop that causes my model to not or barely improve. (See below for results.)
I have tried to provide many comments where applicable, and I have provided the shape transformations as well so you do not have to run the code to see what is going on. The data prep methods are not important to inspect.
The most important parts are the forward method of the RegressorNet, and the training loop of RegressionNN (admittedly, these names were badly chosen). I think the mistake is there somewhere.
from pathlib import Path
import time
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import gensim
from scipy.stats import pearsonr
from LazyTextDataset import LazyTextDataset
class RegressorNet(nn.Module):
def __init__(self, hidden_dim, embeddings=None, drop_prob=0.0):
super(RegressorNet, self).__init__()
self.hidden_dim = hidden_dim
self.drop_prob = drop_prob
# Load pretrained w2v model, but freeze it: don't retrain it.
self.word_embeddings = nn.Embedding.from_pretrained(embeddings)
self.word_embeddings.weight.requires_grad = False
self.w2v_rnode = nn.GRU(embeddings.size(1), hidden_dim, bidirectional=True, dropout=drop_prob)
self.dropout = nn.Dropout(drop_prob)
self.linear = nn.Linear(hidden_dim * 2, 1)
# LeakyReLU rather than ReLU so that we don't get stuck in a dead nodes
self.lrelu = nn.LeakyReLU()
def forward(self, batch_size, sentence_input):
# shape sizes for:
# * batch_size 128
# * embeddings of dim 146
# * hidden dim of 200
# * sentence length of 20
# sentence_input: torch.Size([128, 20])
# Get word2vec vector representation
embeds = self.word_embeddings(sentence_input)
# embeds: torch.Size([128, 20, 146])
# embeds.view(-1, batch_size, embeds.size(2)): torch.Size([20, 128, 146])
# Input vectors into GRU, only keep track of output
w2v_out, _ = self.w2v_rnode(embeds.view(-1, batch_size, embeds.size(2)))
# w2v_out = torch.Size([20, 128, 400])
# Leaky ReLU it
w2v_out = self.lrelu(w2v_out)
# Dropout some nodes
if self.drop_prob > 0:
w2v_out = self.dropout(w2v_out)
# w2v_out: torch.Size([20, 128, 400
# w2v_out[-1, :, :]: torch.Size([128, 400])
# Only use the last output of a sequence! Supposedly that cell outputs the final information
regression = self.linear(w2v_out[-1, :, :])
regression: torch.Size([128, 1])
return regression
class RegressionRNN:
def __init__(self, train_files=None, test_files=None, dev_files=None):
print('Using torch ' + torch.__version__)
self.datasets, self.dataloaders = RegressionRNN._set_data_loaders(train_files, test_files, dev_files)
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.model = self.w2v_vocab = self.criterion = self.optimizer = self.scheduler = None
#staticmethod
def _set_data_loaders(train_files, test_files, dev_files):
# labels must be the last input file
datasets = {
'train': LazyTextDataset(train_files) if train_files is not None else None,
'test': LazyTextDataset(test_files) if test_files is not None else None,
'valid': LazyTextDataset(dev_files) if dev_files is not None else None
}
dataloaders = {
'train': DataLoader(datasets['train'], batch_size=128, shuffle=True, num_workers=4) if train_files is not None else None,
'test': DataLoader(datasets['test'], batch_size=128, num_workers=4) if test_files is not None else None,
'valid': DataLoader(datasets['valid'], batch_size=128, num_workers=4) if dev_files is not None else None
}
return datasets, dataloaders
#staticmethod
def prepare_lines(data, split_on=None, cast_to=None, min_size=None, pad_str=None, max_size=None, to_numpy=False,
list_internal=False):
""" Converts the string input (line) to an applicable format. """
out = []
for line in data:
line = line.strip()
if split_on:
line = line.split(split_on)
line = list(filter(None, line))
else:
line = [line]
if cast_to is not None:
line = [cast_to(l) for l in line]
if min_size is not None and len(line) < min_size:
# pad line up to a number of tokens
line += (min_size - len(line)) * ['#pad#']
elif max_size and len(line) > max_size:
line = line[:max_size]
if list_internal:
line = [[item] for item in line]
if to_numpy:
line = np.array(line)
out.append(line)
if to_numpy:
out = np.array(out)
return out
def prepare_w2v(self, data):
idxs = []
for seq in data:
tok_idxs = []
for word in seq:
# For every word, get its index in the w2v model.
# If it doesn't exist, use #unk# (available in the model).
try:
tok_idxs.append(self.w2v_vocab[word].index)
except KeyError:
tok_idxs.append(self.w2v_vocab['#unk#'].index)
idxs.append(tok_idxs)
idxs = torch.tensor(idxs, dtype=torch.long)
return idxs
def train(self, epochs=10):
valid_loss_min = np.Inf
train_losses, valid_losses = [], []
for epoch in range(1, epochs + 1):
epoch_start = time.time()
train_loss, train_results = self._train_valid('train')
valid_loss, valid_results = self._train_valid('valid')
# Calculate Pearson correlation between prediction and target
try:
train_pearson = pearsonr(train_results['predictions'], train_results['targets'])
except FloatingPointError:
train_pearson = "Could not calculate Pearsonr"
try:
valid_pearson = pearsonr(valid_results['predictions'], valid_results['targets'])
except FloatingPointError:
valid_pearson = "Could not calculate Pearsonr"
# calculate average losses
train_loss = np.mean(train_loss)
valid_loss = np.mean(valid_loss)
train_losses.append(train_loss)
valid_losses.append(valid_loss)
# print training/validation statistics
print(f'----------\n'
f'Epoch {epoch} - completed in {(time.time() - epoch_start):.0f} seconds\n'
f'Training Loss: {train_loss:.6f}\t Pearson: {train_pearson}\n'
f'Validation loss: {valid_loss:.6f}\t Pearson: {valid_pearson}')
# validation loss has decreased
if valid_loss <= valid_loss_min and train_loss > valid_loss:
print(f'!! Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ...')
valid_loss_min = valid_loss
if train_loss <= valid_loss:
print('!! Training loss is lte validation loss. Might be overfitting!')
# Optimise with scheduler
if self.scheduler is not None:
self.scheduler.step(valid_loss)
print('Done training...')
def _train_valid(self, do):
""" Do training or validating. """
if do not in ('train', 'valid'):
raise ValueError("Use 'train' or 'valid' for 'do'.")
results = {'predictions': np.array([]), 'targets': np.array([])}
losses = np.array([])
self.model = self.model.to(self.device)
if do == 'train':
self.model.train()
torch.set_grad_enabled(True)
else:
self.model.eval()
torch.set_grad_enabled(False)
for batch_idx, data in enumerate(self.dataloaders[do], 1):
# 1. Data prep
sentence = data[0]
target = data[-1]
curr_batch_size = target.size(0)
# Returns list of tokens, possibly padded #pad#
sentence = self.prepare_lines(sentence, split_on=' ', min_size=20, max_size=20)
# Converts tokens into w2v IDs as a Tensor
sent_w2v_idxs = self.prepare_w2v(sentence)
# Converts output to Tensor of floats
target = torch.Tensor(self.prepare_lines(target, cast_to=float))
# Move input to device
sent_w2v_idxs, target = sent_w2v_idxs.to(self.device), target.to(self.device)
# 2. Predictions
pred = self.model(curr_batch_size, sentence_input=sent_w2v_idxs)
loss = self.criterion(pred, target)
# 3. Optimise during training
if do == 'train':
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
# 4. Save results
pred = pred.detach().cpu().numpy()
target = target.cpu().numpy()
results['predictions'] = np.append(results['predictions'], pred, axis=None)
results['targets'] = np.append(results['targets'], target, axis=None)
losses = np.append(losses, float(loss))
torch.set_grad_enabled(True)
return losses, results
if __name__ == '__main__':
HIDDEN_DIM = 200
# Load embeddings from pretrained gensim model
embed_p = Path('path-to.w2v_model').resolve()
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(str(embed_p))
# add a padding token with only zeros
w2v_model.add(['#pad#'], [np.zeros(w2v_model.vectors.shape[1])])
embed_weights = torch.FloatTensor(w2v_model.vectors)
# Text files are used as input. Every line is one datapoint.
# *.tok.low.*: tokenized (space-separated) sentences
# *.cross: one floating point number per line, which we are trying to predict
regr = RegressionRNN(train_files=(r'train.tok.low.en',
r'train.cross'),
dev_files=(r'dev.tok.low.en',
r'dev.cross'),
test_files=(r'test.tok.low.en',
r'test.cross'))
regr.w2v_vocab = w2v_model.vocab
regr.model = RegressorNet(HIDDEN_DIM, embed_weights, drop_prob=0.2)
regr.criterion = nn.MSELoss()
regr.optimizer = optim.Adam(list(regr.model.parameters())[0:], lr=0.001)
regr.scheduler = optim.lr_scheduler.ReduceLROnPlateau(regr.optimizer, 'min', factor=0.1, patience=5, verbose=True)
regr.train(epochs=100)
For the LazyTextDataset, you can refer to the class below.
from torch.utils.data import Dataset
import linecache
class LazyTextDataset(Dataset):
def __init__(self, paths):
# labels are in the last path
self.paths, self.labels_path = paths[:-1], paths[-1]
with open(self.labels_path, encoding='utf-8') as fhin:
lines = 0
for line in fhin:
if line.strip() != '':
lines += 1
self.num_entries = lines
def __getitem__(self, idx):
data = [linecache.getline(p, idx + 1) for p in self.paths]
label = linecache.getline(self.labels_path, idx + 1)
return (*data, label)
def __len__(self):
return self.num_entries
As I wrote before, I am trying to convert a Keras model to PyTorch. The original Keras code does not use an embedding layer, and uses pre-built word2vec vectors per sentence as input. In the model below, there is no embedding layer. The Keras summary looks like this (I don't have access to the base model setup).
Layer (type) Output Shape Param # Connected to
====================================================================================================
bidirectional_1 (Bidirectional) (200, 400) 417600
____________________________________________________________________________________________________
dropout_1 (Dropout) (200, 800) 0 merge_1[0][0]
____________________________________________________________________________________________________
dense_1 (Dense) (200, 1) 801 dropout_1[0][0]
====================================================================================================
The issue is that with identical input, the Keras model works and gets a +0.5 Pearson correlation between predicted and actual labels. The PyTorch model above, though, does not seem to work at all. To give you an idea, here is the loss (mean squared error) and Pearson (correlation coefficient, p-value) after the first epoch:
Epoch 1 - completed in 11 seconds
Training Loss: 1.684495 Pearson: (-0.0006077809280690612, 0.8173368901481127)
Validation loss: 1.708228 Pearson: (0.017794288315261794, 0.4264098054188664)
And after the 100th epoch:
Epoch 100 - completed in 11 seconds
Training Loss: 1.660194 Pearson: (0.0020315421756790806, 0.4400929436716754)
Validation loss: 1.704910 Pearson: (-0.017288118524826892, 0.4396865964324158)
The loss is plotted below (when you look at the Y-axis, you can see the improvements are minimal).
A final indicator that something may be wrong, is that for my 140K lines of input, each epoch only takes 10 seconds on my GTX 1080TI. I feel that his is not much and I would guess that the optimisation is not working/running. I cannot figure out why, though. To issue will probably be in my train loop or the model itself, but I cannot find it.
Again, something must be going wrong because:
- the Keras model does perform well;
- the training speed is 'too fast' for 140K sentences
- almost no improvemnts after training.
What am I missing? The issue is more than likely present in the training loop or in the network structure.

TL;DR: Use permute instead of view when swapping axes, see the end of answer to get an intuition about the difference.
About RegressorNet (neural network model)
No need to freeze embedding layer if you are using from_pretrained. As documentation states, it does not use gradient updates.
This part:
self.w2v_rnode = nn.GRU(embeddings.size(1), hidden_dim, bidirectional=True, dropout=drop_prob)
and especially dropout without providable num_layers is totally pointless (as no dropout can be specified with shallow one layer network).
BUG AND MAIN ISSUE: in your forward function you are using view instead of permute, here:
w2v_out, _ = self.w2v_rnode(embeds.view(-1, batch_size, embeds.size(2)))
See this answer and appropriate documentation for each of those functions and try to use this line instead:
w2v_out, _ = self.w2v_rnode(embeds.permute(1, 0, 2))
You may consider using batch_first=True argument during w2v_rnode creation, you won't have to permute indices that way.
Check documentation of torch.nn.GRU, you are after last step of the sequence, not after all of the sequences you have there, so you should be after:
_, last_hidden = self.w2v_rnode(embeds.permute(1, 0, 2))
but I think this part is fine otherwise.
Data preparation
No offence, but prepare_lines is very unreadable and seems pretty hard to maintain as well, not to say spotting an eventual bug (I suppose it lies in here).
First of all, it seems like you are padding manually. Please don't do it that way, use torch.nn.pad_sequence to work with batches!
In essence, first you encode each word in every sentence as index pointing into embedding (as you seem to do in prepare_w2v), after that you use torch.nn.pad_sequence and torch.nn.pack_padded_sequence or torch.nn.pack_sequence if the lines are already sorted by length.
Proper batching
This part is very important and it seems you are not doing that at all (and likely this is the second error in your implementation).
PyTorch's RNN cells take inputs not as padded tensors, but as torch.nn.PackedSequence objects. This is an efficient object storing indices which specify unpadded length of each sequence.
See more informations on the topic here, here and in many other blog posts throughout the web.
First sequence in batch has to be the longest, and all others have to be provided in the descending length. What follows is:
You have to sort your batch each time by sequences length and sort your targets in an analogous way OR
Sort your batch, push it through the network and unsort it afterwards to match with your targets.
Either is fine, it's your call what seems to be more intuitive for you.
What I like to do is more or less the following, hope it helps:
Create unique indices for each word and map each sentence appropriately (you've already done it).
Create regular torch.utils.data.Dataset object returning single sentence for each geitem, where it is returned as a tuple consisting of features (torch.Tensor) and labels (single value), seems like you're doing it as well.
Create custom collate_fn for use with torch.utils.data.DataLoader, which is responsible for sorting and padding each batch in this scenario (+ it returns lengths of each sentence to be passed into neural network).
Using sorted and padded features and their lengths I'm using torch.nn.pack_sequence inside neural network's forward method (do it after embedding!) to push it through RNN layer.
Depending on the use-case I unpack them using torch.nn.pad_packed_sequence. In your case, you only care about last hidden state, hence you don't have to do that. If you were using all of the hidden outputs (like is the case with, say, attention networks), you would add this part.
When it comes to the third point, here is a sample implementation of collate_fn, you should get the idea:
import torch
def length_sort(features):
# Get length of each sentence in batch
sentences_lengths = torch.tensor(list(map(len, features)))
# Get indices which sort the sentences based on descending length
_, sorter = sentences_lengths.sort(descending=True)
# Pad batch as you have the lengths and sorter saved already
padded_features = torch.nn.utils.rnn.pad_sequence(features, batch_first=True)
return padded_features, sentences_lengths, sorter
def pad_collate_fn(batch):
# DataLoader return batch like that unluckily, check it on your own
features, labels = (
[element[0] for element in batch],
[element[1] for element in batch],
)
padded_features, sentences_lengths, sorter = length_sort(features)
# Sort by length features and labels accordingly
sorted_padded_features, sorted_labels = (
padded_features[sorter],
torch.tensor(labels)[sorter],
)
return sorted_padded_features, sorted_labels, sentences_lengths
Use those as collate_fn in DataLoaders and you should be just about fine (maybe with minor adjustments, so it's essential you understand the idea standing behind it).
Other possible problems and tips
Training loop: great place for a lot of small errors, you may want to minimalize those by using PyTorch Ignite. I am having unbelievably hard time going through your Tensorflow-like-Estimator-like-API-like training loop (e.g. self.model = self.w2v_vocab = self.criterion = self.optimizer = self.scheduler = None this). Please, don't do it this way, separate each task (data creating, data loading, data preparation, model setup, training loop, logging) into it's own respective module. All in all there is a reason why PyTorch/Keras is more readable and sanity-preserving than Tensorflow.
Make the first row of your embedding equal to vector containg zeros: By default, torch.nn.functional.embedding expects the first row to be used for padding. Hence you should start your unique indexing for each word at 1 or specify an argument padding_idx to different value (though I highly discourage this approach, confusing at best).
I hope this answer helps you at least a little bit, if something is unclear post a comment below and I'll try to explain it from a different perspective/more detail.
Some final comments
This code is not reproducible, nor the question's specific. We don't have the data you are using, neither we got your word vectors, random seed is not fixed etc.
PS. One last thing: Check your performance on really small subset of your data (say 96 examples), if it does not converge, it is very likely you indeed have a bug in your code.
About the times: they are probably off (due to not sorting and not padding I suppose), usually Keras and PyTorch's times are quite similar (if I understood this part of your question as intended) for correct and efficient implementations.
Permute vs view vs reshape explanation
This simple example show the differences between permute() and view(). The first one swaps axes, while the second does not change memory layout, just chunks the array into desired shape (if possible).
import torch
a = torch.tensor([[1, 2], [3, 4], [5, 6]])
print(a)
print(a.permute(1, 0))
print(a.view(2, 3))
And the output would be:
tensor([[1, 2],
[3, 4],
[5, 6]])
tensor([[1, 3, 5],
[2, 4, 6]])
tensor([[1, 2, 3],
[4, 5, 6]])
reshape is almost like view, was added for those coming from numpy, so it's easier and more natural for them, but it has one important difference:
view never copies data and work only on contiguous memory (so after permutation like the one above your data may not be contiguous, hence acces to it might be slower)
reshape can copy data if needed, so it would work for non-contiguous arrays as well.

Related

Implementing a many-to-many regression task

Sorry if I present my problem not clearly, English is not my first language
Problem
Short description:
I want to train a model which map input x (with shape of [n_sample, timestamp, feature]) to an output y (with exact same shape). It's like mapping 2 space
Longer version:
I have 2 float ndarrays of shape [n_sample, timestamp, feature], representing MFCC feature of n_sample audio file. These 2 ndarray are 2 speakers' speech of the same corpus, which was aligned by DTW. Lets name these 2 arrays x and y. I want to train a model, which predict y[k] given x[k]. It's like mapping from space x to space y, and the output must be exact same shape as the input
What I've tried
It's time-series problem so I decide to use RNN approach. Here is my code in PyTorch (I put comment along the code. I removed the calculation of average loss for simplicity). Note that I've tried many option for learning rate, the behavior still the same
Class define
class Net(nn.Module):
def __init__(self, in_size, hidden_size, out_size, nb_lstm_layers):
super().__init__()
self.in_size = in_size
self.hidden_size = hidden_size
self.out_size = out_size
self.nb_lstm_layers = nb_lstm_layers
# self.fc1 = nn.Linear()
self.lstm = nn.LSTM(input_size=self.in_size, hidden_size=self.hidden_size, num_layers=self.nb_lstm_layers, batch_first=True, bias=True)
# self.fc = nn.Linear(self.hidden_size, self.out_size)
self.fc1 = nn.Linear(self.hidden_size, 128)
self.fc2 = nn.Linear(128, 128)
self.fc3 = nn.Linear(128, self.out_size)
def forward(self, x, h_state):
out, h_state = self.lstm(x, h_state)
output_fc = []
for frame in out:
output_fc.append(self.fc3(torch.tanh(self.fc1(frame)))) # I added fully connected layer to each frame, to make an output with same shape as input
return torch.stack(output_fc), h_state
def hidden_init(self):
if use_cuda:
h_state = torch.stack([torch.zeros(nb_lstm_layers, batch_size, 20) for _ in range(2)]).cuda()
else:
h_state = torch.stack([torch.zeros(nb_lstm_layers, batch_size, 20) for _ in range(2)])
return h_state
Training step:
net = Net(20, 20, 20, nb_lstm_layers)
optimizer = optim.Adam(net.parameters(), lr=0.0001, weight_decay=0.0001)
criterion = nn.MSELoss()
for epoch in range(nb_epoch):
count = 0
loss_sum = 0
batch_x = None
for i in (range(len(data))):
# data is my entire data, which contain A and B i specify above.
temp_x = torch.tensor(data[i][0])
temp_y = torch.tensor(data[i][1])
for ii in range(0, data[i][0].shape[0] - nb_frame_in_batch*2 + 1): # Create batches
batch_x, batch_y = get_batches(temp_x, temp_y, ii, batch_size, nb_frame_in_batch)
# this will return 2 tensor of shape (batch_size, nb_frame_in_batch, 20),
# with `batch_size` is the number of sample each time I feed to the net,
# nb_frame_in_batch is the number of frame in each sample
optimizer.zero_grad()
h_state = net.hidden_init()
prediction, h_state = net(batch_x.float(), h_state)
loss = criterion(prediction.float(), batch_y.float())
h_state = (h_state[0].detach(), h_state[1].detach())
loss.backward()
optimizer.step()
Problem is, the loss seems not to decrease but fluctuate a lot, without a clear behaviour
Please help me. Any suggestion will be greatly appreciated. If somebody can inspect my code and provide some comment, that would be so kind.
Thanks in advance!

It seems the network learning nothing from your data, hence the loss fluctuation (since weights depends on random initialization only). There are something you can try:
Try to normalize the data (this suggestion is quite broad, but I can't give you more details since I don't have your data, but normalize it to a specific range like [0, 1], or to a mean and std value is worth trying)
One very typical problem of LSTM in pytorch is its input dimension is quite different to other type of neural network. You must feed into your network a tensor with shape (seq_len, batch, input_size). You should go here, LSTM section for better details
One more thing: try to tune your hyperparameters. LSTM is harder to train compare to FC or CNN (to my experience).
Tell me if you have improvement. Debugging a neural network is always hard and full of potential coding mistake

With most ML algorithms it is tough to diagnose without seeing the data. Based on the inconsistency of your loss results this might be an issue with your data pre-processing. Have you tried normalizing the data first? Often times with large fluctuations in results, one of your input neuron values may be skewing your loss function making it unable to find a good direction.
How to normalize a NumPy array to within a certain range?
This is an example for audio normalization but I would also try adjusting the learning rate as it looks high and possibly removing a hidden layer.

May the problem was in the calculation of the loss. Try to sum the losses of each time-step in a sequence and then take the average over the batch. May it helps

Keras Word2Vec implementation

I'm using the implementation found in http://adventuresinmachinelearning.com/word2vec-keras-tutorial/ to learn something about word2Vec. What I am not understanding is why isn't the loss function decreasing?
Iteration 119200, loss=0.7305528521537781
Iteration 119300, loss=0.6254740953445435
Iteration 119400, loss=0.8255964517593384
Iteration 119500, loss=0.7267132997512817
Iteration 119600, loss=0.7213149666786194
Iteration 119700, loss=0.6156617999076843
Iteration 119800, loss=0.11473365128040314
Iteration 119900, loss=0.6617216467857361
The net, from my understanding, is a standard one used in this task:
input_target = Input((1,))
input_context = Input((1,))
embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)
dot_product = Dot(axes=1)([target, context])
dot_product = Reshape((1,))(dot_product)
output = Dense(1, activation='sigmoid')(dot_product)
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop') #adam??
Words come from a vocabulary of size 10000 from http://mattmahoney.net/dc/text8.zip (english text)
What I notice is that some words are somewhat learned in time like the context for numbers and articles is easily guessed, yet the loss is quite stuck around 0.7 from the beginning, and as iterations goes it only fluctuates randomly.
The training part is made like this (which I sense strange since the absence of the standard fit method)
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
for cnt in range(epochs):
idx = np.random.randint(0, len(labels)-1)
arr_1[0,] = word_target[idx]
arr_2[0,] = word_context[idx]
arr_3[0,] = labels[idx]
loss = model.train_on_batch([arr_1, arr_2], arr_3)
if cnt % 100 == 0:
print("Iteration {}, loss={}".format(cnt, loss))
Am i missing something important about these type of net? What is not written is implemented exactly like the link above

I followed the same tutorial and the loss drops after the algorithm went through a sample again. Note that the loss function is calculated only for the current target and context word pair. In the code example from the tutorial one epoch is only one sample, therefore you would need more than the number of target and context words to come to a point where the loss drops.
I implemented the training part with the following line
model.fit([word_target, word_context], labels, epochs=5)
Be warned that this can take a long time depending on how large the corpus is. The train_on_batch function gives you more control in training and you can vary the batch size or select samples you choose at every step of the training.

Seq2Seq in TensorFlow without embeddings

I'm trying to create a very basic multivariate time series auto-encoder.
I want to be able to reconstruct the exact two signals I pass in.
Most of the references I'm looking at are using older versions of APIs or use embeddings.
I'm trying to use the latest higher level APIs, but its not obvious how you cobble them together.
class Seq2SeqConfig():
def __init__(self):
self.sequence_length = 15 # num of time steps
self.hidden_units = 64 # ?
self.num_features = 2
self.batch_size = 10
# Expect input as batch major.
encoder_inputs = tf.placeholder(shape=(None, config.sequence_length, config.num_features), dtype=tf.float32)
decoder_inputs = tf.placeholder(shape=(None, config.sequence_length, config.num_features), dtype=tf.float32))
# Convert inputs to time major
encoder_inputs_tm = tf.transpose(encoder_inputs, [1, 0, 2])
decoder_inputs_tm = tf.transpose(decoder_inputs, [1, 0, 2])
# setup encoder
encoder_cell = tf.contrib.rnn.LSTMCell(config.hidden_units)
encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
cell=encoder_cell,
inputs=encoder_inputs_tm,
time_major=True)
# setup decoder
decoder_cell = tf.contrib.rnn.LSTMCell(config.hidden_units)
# The sequence length is mandatory. Not sure what the expectation is here?
helper = tf.contrib.seq2seq.TrainingHelper(
decoder_inputs_tm,
sequence_length=tf.constant(config.sequence_length, dtype=tf.int32, shape=[config.batch_size]),
time_major=True)
decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, encoder_final_state)
decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True)
# loss calculation
loss_op = tf.reduce_mean(tf.square(decoder_outputs.rnn - decoder_targets_tm)
The loss operation fails because the shapes are different.
decoder_targets is (?, 15, 2) and decoder_outputs.rnn is (?, ?, 64).
Question 1:
Am I missing an operation somewhere where I reshape the decoder output?
I loosely followed this tensorflow tutorial: https://www.tensorflow.org/tutorials/seq2seq
There is a projection_layer operation passed into the basic decoder. Is that the purpose of this?
projection_layer = layers_core.Dense(tgt_vocab_size, use_bias=False)
I don't see a layers_core.Dense() function anywhere. I assume its deprecated or internal.
Question 2:
Which helper does one use for Inference when not using embeddings?
Question 3:
What would the ideal size of the hidden units be?
I assume because we want to reduce the dimensions in the latent space, it needs to be less that the size of the inputs. How does that translate when you have a input with sequence length = 15 and number of features = 2.
Should the number of hidden units be < 15, < 2 or < 15 *2?

Figured out the answer to Question 1
from tensorflow.python.layers.core import Dense
output_layer = Dense(config.num_features)
decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, encoder_final_state, output_layer)
Reference: https://github.com/udacity/deep-learning/blob/master/seq2seq/sequence_to_sequence_implementation.ipynb
Other two questions still stand.

Regarding question 3: I suggest you run several training and validation cycles with different hyperparameters to find what works best for your data and requirements. You can take a look at my implementation here (https://github.com/fritzfitzpatrick/tensorflow-seq2seq-generic-example) where I have built a very simple training & validation loop that stops once the validation loss has not gone down for a number of cycles to prevent overfitting.
Regarding question 2: I am still working on a CustomHelper implementation at the moment, and it looks like it is going somewhere. You can find the full sample code here (https://github.com/fritzfitzpatrick/tensorflow-seq2seq-generic-example/blob/master/tensorflow_custom_helper.ipynb).
batch_size = 5
features_dec_inp = 2 # number of features in target sequence
go_token = 2
end_token = 3
sess = tf.InteractiveSession()
def initialize_fn():
finished = tf.tile([False], [batch_size])
start_inputs = tf.fill([batch_size, features_dec_inp], go_token)
return (finished, start_inputs)
def next_inputs_fn(time, outputs, state, sample_ids):
del time, sample_ids
# finished needs to update after last step.
# one could use conditional logic based on sequence length
# if sequence length is known in advance
finished = tf.tile([False], [batch_size])
# next inputs should be the output of the dense layer
# unless the above finished logic returns [True]
# in which case next inputs can be anything in the right shape
next_inputs = tf.fill([batch_size, features_dec_inp], 0.5)
return (finished, next_inputs, state)
helper = tf.contrib.seq2seq.CustomHelper(
initialize_fn = initialize_fn,
sample_fn = tf.identity,
next_inputs_fn = next_inputs_fn)
print(helper)
Regarding question 1: This is the code that I am using to reduce the dimensionality of my decoder output to the number of features in my target sequence:
train_output_dense = tf.layers.dense(
train_dec_out_logits, # [batch_size x seq_length x num_units]
features_dec_exp_out) # [batch_size x seq_length x num_target_features]

Confused about tensor dimensions and batch sizes in pytorch

So I'm very new to PyTorch and Neural Networks in general, and I'm having some problems creating a Neural Network that classifies names by gender.
I based this off of the PyTorch tutorial for RNNs that classify names by nationality, but I decided not to go with a recurrent approach... Stop me right here if this was the wrong idea!
However, whenever I try to run an input through the network it tells me:
RuntimeError: matrices expected, got 3D, 2D tensors at /py/conda-bld/pytorch_1493681908901/work/torch/lib/TH/generic/THTensorMath.c:1232
I know this has something to do with how PyTorch always expects there to be a batch size or something, and I have my tensor set up that way, but you can probably tell by this point that I have no idea what I'm talking about.
Here's my code:
from future import unicode_literals, print_function, division
from io import open
import glob
import unicodedata
import string
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import random
from torch.autograd import Variable
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
"""------GLOBAL VARIABLES------"""
all_letters = string.ascii_letters + " .,;'"
num_letters = len(all_letters)
all_names = {}
genders = ["Female", "Male"]
"""-------DATA EXTRACTION------"""
def findFiles(path):
return glob.glob(path)
def unicodeToAscii(s):
return ''.join(
c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn'
and c in all_letters
)
# Read a file and split into lines
def readLines(filename):
lines = open(filename, encoding='utf-8').read().strip().split('\n')
return [unicodeToAscii(line) for line in lines]
for file in findFiles("/home/andrew/PyCharm/PycharmProjects/CantStop/data/names/*.txt"):
gender = file.split("/")[-1].split(".")[0]
names = readLines(file)
all_names[gender] = names
"""-----DATA INTERPRETATION-----"""
def nameToTensor(name):
tensor = torch.zeros(len(name), 1, num_letters)
for index, letter in enumerate(name):
tensor[index][0][all_letters.find(letter)] = 1
return tensor
def outputToGender(output):
gender, gender_index = output.data.topk(1)
if gender_index[0][0] == 0:
return "Female"
return "Male"
"""------NETWORK SETUP------"""
class Net(nn.Module):
def __init__(self, input_size, output_size):
super(Net, self).__init__()
#Layer 1
self.Lin1 = nn.Linear(input_size, int(input_size/2))
self.ReLu1 = nn.ReLU()
self.Batch1 = nn.BatchNorm1d(int(input_size/2))
#Layer 2
self.Lin2 = nn.Linear(int(input_size/2), output_size)
self.ReLu2 = nn.ReLU()
self.Batch2 = nn.BatchNorm1d(output_size)
self.softMax = nn.LogSoftmax()
def forward(self, input):
output1 = self.Batch1(self.ReLu1(self.Lin1(input)))
output2 = self.softMax(self.Batch2(self.ReLu2(self.Lin2(output1))))
return output2
NN = Net(num_letters, 2)
"""------TRAINING------"""
def getRandomTrainingEx():
gender = genders[random.randint(0, 1)]
name = all_names[gender][random.randint(0, len(all_names[gender])-1)]
gender_tensor = Variable(torch.LongTensor([genders.index(gender)]))
name_tensor = Variable(nameToTensor(name))
return gender_tensor, name_tensor, gender
def train(input, target):
loss_func = nn.NLLLoss()
optimizer = optim.SGD(NN.parameters(), lr=0.0001, momentum=0.9)
optimizer.zero_grad()
output = NN(input)
loss = loss_func(output, target)
loss.backward()
optimizer.step()
return output, loss
all_losses = []
current_loss = 0
for i in range(100000):
gender_tensor, name_tensor, gender = getRandomTrainingEx()
output, loss = train(name_tensor, gender_tensor)
current_loss += loss
if i%1000 == 0:
print("Guess: %s, Correct: %s, Loss: %s" % (outputToGender(output), gender, loss.data[0]))
if i%100 == 0:
all_losses.append(current_loss/10)
current_loss = 0
# plt.figure()
# plt.plot(all_losses)
# plt.show()
Please help a newbie out!

Debugging your bug out:
Pycharm is a helpful python debugger that let you set breakpoint and views dimension of your tensor.
For easier debug, do not stack forward thing up like that
output1 = self.Batch1(self.ReLu1(self.Lin1(input)))
Instead,
h1 = self.ReLu1(self.Lin1(input))
h2 = self.Batch1(h1)
For the stacktrace, Pytorch also provide Pythonic error stacktrack. I believe that before
RuntimeError: matrices expected, got 3D, 2D tensors at /py/conda-bld/pytorch_1493681908901/work/torch/lib/TH/generic/THTensorMath.c:1232
There are some python error stacktrace that point right into your code. For easier debug, as I said, don't stack forward.
You use Pycharm to create break point before crash point. In debugger watcher Then use Variable(torch.rand(dim1, dim2)) to test out forward pass input, output dimension, and if a dimension is incorrect. Comparing with dimension of input. Call input.size() in debugger watcher.
For example, self.ReLu1(self.Lin1(Variable(torch.rand(10, 20)))).size() . If it show read text (error), then the input dimension is incorrect. Else, it show the size of the output.
Read the docs
In Pytorch Docs, it specify input/output dimension. It also have a example code snip
>>> rnn = nn.RNN(10, 20, 2)
>>> input = Variable(torch.randn(5, 3, 10))
>>> h0 = Variable(torch.randn(2, 3, 20))
>>> output, hn = rnn(input, h0)
You may use the code snip in PyCharm Debugger to explore dimension of input, output of specific layer of your interest (RNN, Linear, BatchNorm1d).

First, regarding your error, as other answers say and also your exception, it is probably because your input parameters are not shaped correctly. You could try debugging to isolate the line that gives the error, and then edit your question with it, so we know for sure what is causing the problem and correct it (without full stack trace it is harder to know what is the problem).
Now, you are trying to implement a Neural Network that classifies names by gender, as you indicated. We can see that this task will require to somehow input a name (which have different sizes) and output a gender (a binary variable: male, female). However, Neural Networks in general are built and trained to classify inputs (vectors) of fixed size of features, like they mention in the pytorch docs:
Parameters: input_size – The number of expected features in the input x
...
Looking at the tutorial you mentioned, they do consider this situation, as in their case the input for the network is a single letter transformed to a "one-hot vector", as they indicate:
To run a step of this network we need to pass an input (in our case, the Tensor for the current letter) and a previous hidden state (which we initialize as zeros at first). We’ll get back the output (probability of each language) and a next hidden state (which we keep for the next step).
And even give an example of it (remember tensors are Variables in pytorch):
input = Variable(letterToTensor('A'))
hidden = Variable(torch.zeros(1, n_hidden))
output, next_hidden = rnn(input, hidden)
Note: That being said, there are some other things you can do to adapt your implementation to variable-sized inputs. Based on my experience and also complemented by this and this other great questions, you could:
Preprocess your data to extract new features and transform it to fixed-size inputs. This is usually the most used approach but requires experience and patience to get good features. Some techniques used are PCA (Principal Component Analysis) and LDA (Latent Dirichlet Allocation)
For example, you could extract from your data features like: the length of the name, the number of letter a's in the name (female names tend to have more a's), the number of letter e's in the name (the same but with male names maybe?), and others... so you can generate new features like [name_length, a_found, e_found, ...]. Then you could follow a regular approach with you new fixed-size vectors. Do note that those features have to be meaningful; these ones I just came up for example (although they could work).
Split your input names into fixed-sized substring (or iterate them with a sliding window), so then you can classify them with a network designed for that size and combine the outputs in an ensemble way to obtain the final classification.

Compute updates in Theano after N number of loss calculations

I've constructed a LSTM recurrent NNet using lasagne that is loosely based on the architecture in this blog post. My input is a text file that has around 1,000,000 sentences and a vocabulary of 2,000 word tokens. Normally, when I construct networks for image recognition my input layer will look something like the following:
l_in = nn.layers.InputLayer((32, 3, 128, 128))
(where the dimensions are batch size, channel, height and width) which is convenient because all the images are the same size so I can process them in batches. Since each instance in my LSTM network has a varying sentence length, I have an input layer that looks like the following:
l_in = nn.layers.InputLayer((None, None, 2000))
As described in above referenced blog post,
Masks:
Because not all sequences in each minibatch will always have the same length, all recurrent layers in
lasagne
accept a separate mask input which has shape
(batch_size, n_time_steps)
, which is populated such that
mask[i, j] = 1
when
j <= (length of sequence i)
and
mask[i, j] = 0
when
j > (length
of sequence i)
.
When no mask is provided, it is assumed that all sequences in the minibatch are of length
n_time_steps.
My question is: Is there a way to process this type of network in mini-batches without using a mask?
Here is a simplified version if my network.
# -*- coding: utf-8 -*-
import theano
import theano.tensor as T
import lasagne as nn
softmax = nn.nonlinearities.softmax
def build_model():
l_in = nn.layers.InputLayer((None, None, 2000))
lstm = nn.layers.LSTMLayer(l_in, 4096, grad_clipping=5)
rs = nn.layers.SliceLayer(lstm, 0, 0)
dense = nn.layers.DenseLayer(rs, num_units=2000, nonlinearity=softmax)
return l_in, dense
model = build_model()
l_in, l_out = model
all_params = nn.layers.get_all_params(l_out)
target_var = T.ivector("target_output")
output = nn.layers.get_output(l_out)
loss = T.nnet.categorical_crossentropy(output, target_var).sum()
updates = nn.updates.adagrad(loss, all_params, 0.005)
train = theano.function([l_in.input_var, target_var], cost, updates=updates)
From there I have generator that spits out (X, y) pairs and I am computing train(X, y) and updating the gradient with each iteration. What I want to do is do an N number of training steps and then update the parameters with the average gradient.
To do this, I tried creating a compute_gradient function:
gradient = theano.grad(loss, all_params)
compute_gradient = theano.function(
[l_in.input_var, target_var],
output=gradient
)
and then looping over several training instances to create a "batch" and collect the gradient calculations to a list:
grads = []
for _ in xrange(1024):
X, y = train_gen.next() # generator for producing training data
grads.append(compute_gradient(X, y))
this produces a list of lists
>>> grads
[[<CudaNdarray at 0x7f83b5ff6d70>,
<CudaNdarray at 0x7f83b5ff69f0>,
<CudaNdarray at 0x7f83b5ff6270>,
<CudaNdarray at 0x7f83b5fc05f0>],
[<CudaNdarray at 0x7f83b5ff66f0>,
<CudaNdarray at 0x7f83b5ff6730>,
<CudaNdarray at 0x7f83b5ff6b70>,
<CudaNdarray at 0x7f83b5ff64f0>] ...
From here I would need to take the mean of the gradient at each layer, and then update the model parameters. This is possible to do in pieces like this does does the gradient calc/parameter update need to happen all in one theano function?
Thanks.

NOTE: this is a solution, but by no means do i have enough experience to verify its best and the code is just a sloppy example
You need 2 theano functions. The first being the grad one you seem to have already judging from the information provided in your question.
So after computing the batched gradients you want to immediately feed them as an input argument back into another theano function dedicated to updating the shared variables. For this you need to specify the expected batch size at the compile time of your neural network. so you could do something like this: (for simplicity i will assume you have a global list variable where all your params are stored)
params #list of params you wish to update
BATCH_SIZE = 1024 #size of the expected training batch
G = [T.matrix() for i in range(BATCH_SIZE) for param in params] #placeholder for grads result flattened so they can be fed into a theano function
updates = [G[i] for i in range(len(params))] #starting with list of param updates from first batch
for i in range(len(params)): #summing the gradients for each individual param
for j in range(1, len(G)/len(params)):
updates[i] += G[i*BATCH_SIZE + j]
for i in range(len(params)): #making a list of tuples for theano.function updates argument
updates[i] = (params[i], updates[i]/BATCH_SIZE)
update = theano.function([G], 0, updates=updates)
Like this theano will be taking the mean of the gradients and updating the params as usual
dont know if you need to flatten the inputs as I did, but probably
EDIT: gathering from how you edited your question it seems important that the batch size can vary in that case you could add 2 theano functions to your existing one:
the first theano function takes a batch of size 2 of your params and returns the sum. you could apply this theano function using python's reduce() and get the sum of the over the whole batch of gradients
the second theano function takes those summed param gradients and a scaler (the batch size) as input and hence is able to update the NN params over the mean of the summed gradients.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

RNN model (GRU) of word2vec to regression not learning - python

Related

Implementing a many-to-many regression task

Keras Word2Vec implementation

Seq2Seq in TensorFlow without embeddings

Confused about tensor dimensions and batch sizes in pytorch

Compute updates in Theano after N number of loss calculations

Categories

Resources