I'm building a multilayered bidirectional RNN using Tensorflow .I'm a bit confused about the implementation though .
I have built two functions that creates multilayered bidirectional RNN the first one works fine , but I'm not sure about the predictions its making, as it is performing as a unidirectional multilayered RNN . below is my implementation :
def encoding_layer_old(rnn_inputs, rnn_size, num_layers, keep_prob,
source_sequence_length, source_vocab_size,
encoding_embedding_size):
"""
Create encoding layer
:param rnn_inputs: Inputs for the RNN
:param rnn_size: RNN Size
:param num_layers: Number of layers
:param keep_prob: Dropout keep probability
:param source_sequence_length: a list of the lengths of each sequence in the batch
:param source_vocab_size: vocabulary size of source data
:param encoding_embedding_size: embedding size of source data
:return: tuple (RNN output, RNN state)
"""
# Encoder embedding
enc_embed = tf.contrib.layers.embed_sequence(rnn_inputs, source_vocab_size, encoding_embedding_size)
def create_cell_fw(rnn_size):
with tf.variable_scope("create_cell_fw"):
lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size,initializer=tf.random_uniform_initializer(-0.1,0.1,seed=2), reuse=False)
drop = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)
return drop
def create_cell_bw(rnn_size):
with tf.variable_scope("create_cell_bw"):
lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size,initializer=tf.random_uniform_initializer(-0.1,0.1,seed=2), reuse=False)
drop = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)
return drop
enc_cell_fw = tf.contrib.rnn.MultiRNNCell([create_cell_fw(rnn_size) for _ in range(num_layers)])
enc_cell_bw = tf.contrib.rnn.MultiRNNCell([create_cell_bw(rnn_size) for _ in range(num_layers)])
((encoder_fw_outputs, encoder_bw_outputs),(encoder_fw_final_state,encoder_bw_final_state)) = tf.nn.bidirectional_dynamic_rnn(enc_cell_fw,enc_cell_bw, enc_embed,
sequence_length=source_sequence_length,dtype=tf.float32)
encoder_outputs = tf.concat([encoder_fw_outputs, encoder_bw_outputs], 2)
print(encoder_outputs)
#encoder_final_state_c=[]#tf.Variable([num_layers] , dtype=tf.int32)
#encoder_final_state_h=[]#tf.Variable([num_layers] , dtype=tf.int32)
encoder_final_state = ()
for x in range((num_layers)):
encoder_final_state_c=tf.concat((encoder_fw_final_state[x].c, encoder_bw_final_state[x].c), 1)#tf.stack(tf.concat((encoder_fw_final_state[x].c, encoder_bw_final_state[x].c), 1))
encoder_final_state_h=tf.concat((encoder_fw_final_state[x].h, encoder_bw_final_state[x].h), 1)# tf.stack(tf.concat((encoder_fw_final_state[x].h, encoder_bw_final_state[x].h), 1))
encoder_final_state =encoder_final_state+ (tf.contrib.rnn.LSTMStateTuple(c=encoder_final_state_c,h=encoder_final_state_h),)
#encoder_final_state = tf.contrib.rnn.LSTMStateTuple(c=encoder_final_state_c,h=encoder_final_state_h)
print('before')
print(encoder_fw_final_state)
return encoder_outputs, encoder_final_state
I have found another implementation here as shown below :
t
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob,
source_sequence_length, source_vocab_size,
encoding_embedding_size):
"""
Create encoding layer
:param rnn_inputs: Inputs for the RNN
:param rnn_size: RNN Size
:param num_layers: Number of layers
:param keep_prob: Dropout keep probability
:param source_sequence_length: a list of the lengths of each sequence in the batch
:param source_vocab_size: vocabulary size of source data
:param encoding_embedding_size: embedding size of source data
:return: tuple (RNN output, RNN state)
"""
# Encoder embedding
enc_embed = tf.contrib.layers.embed_sequence(rnn_inputs, source_vocab_size, encoding_embedding_size)
def create_cell_fw(rnn_size,x):
with tf.variable_scope("create_cell_fw_"+str(x)):
lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size,initializer=tf.random_uniform_initializer(-0.1,0.1,seed=2) , reuse=tf.AUTO_REUSE )
drop = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)
return drop
def create_cell_bw(rnn_size,x):
with tf.variable_scope("create_cell_bw_"+str(x)):
lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size,initializer=tf.random_uniform_initializer(-0.1,0.1,seed=2) ,reuse=tf.AUTO_REUSE )
drop = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=keep_prob)
return drop
enc_cell_fw = [create_cell_fw(rnn_size,x) for x in range(num_layers)]
enc_cell_bw = [create_cell_bw(rnn_size,x) for x in range(num_layers)]
output=enc_embed
for n in range(num_layers):
cell_fw = enc_cell_fw[n]
cell_bw = enc_cell_bw[n]
state_fw = cell_fw.zero_state(batch_size, tf.float32)
state_bw = cell_bw.zero_state(batch_size, tf.float32)
((output_fw, output_bw),(encoder_fw_final_state,encoder_bw_final_state))= tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, output,source_sequence_length,
state_fw, state_bw, dtype=tf.float32)
output = tf.concat([output_fw, output_bw], axis=2)
final_state=tf.concat([encoder_fw_final_state,encoder_bw_final_state], axis=2 )
return output , final_state
the problem with this implementation is that I get a shape error :
Trying to share variable bidirectional_rnn/fw/lstm_cell/kernel, but specified shape (168, 224) and found shape (256, 224).
it appears that other people have faced a similar when creating the RNN cells and the solution is to use the MultiRNNCell to create the layered cell . But if use MultiRNNCell I will not be able to use the second implementation since the multiRNNCell does not support indexing. thus I will not be ale to loop through the list of cells and create multiples RNNs .
I would really appreciate your help to guide me on this .
I'm using tensorflow 1.3
Both codes does seem a little overly complex. Anyway I tried a much simpler version of it and it worked. In your code, try after removing reuse=tf.AUTO_REUSE from create_cell_fw and create_cell_bw. Below is my simpler implementation.
def encoding_layer(input_data, num_layers, rnn_size, sequence_length, keep_prob):
output = input_data
for layer in range(num_layers):
with tf.variable_scope('encoder_{}'.format(layer),reuse=tf.AUTO_REUSE):
cell_fw = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.truncated_normal_initializer(-0.1, 0.1, seed=2))
cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob = keep_prob)
cell_bw = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.truncated_normal_initializer(-0.1, 0.1, seed=2))
cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob = keep_prob)
outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw,
cell_bw,
output,
sequence_length,
dtype=tf.float32)
output = tf.concat(outputs,2)
state = tf.concat(states,2)
return output, state
Related
I am creating a captcha image recognition system. It first extracts the features of the images with ResNet and then uses LSTM to recognize the words and letter in the image. An fc layer is supposed to connect the two. I have not designed a LSTM model before and am very new to machine learning, so I am pretty confused and overwhelmed by this.
I am confused enough that I am not even totally sure what questions I should ask. But here are a couple things that stand out to me:
What is the purpose of embedding the captions if the captcha images are all randomized?
Is the linear fc layer in the first part of the for loop the correct way to connect the CNN feature vectors to the LSTM?
Is this a correct use of the LSTM cell in the LSTM?
And, in general, if there are any suggestions of general directions to look into, that would be really appreciated.
So far, I have:
class LSTM(nn.Module):
def __init__(self, cnn_dim, hidden_size, vocab_size, num_layers=1):
super(LSTM, self).__init__()
self.cnn_dim = cnn_dim #i think this is the input size
self.hidden_size = hidden_size
self.vocab_size = vocab_size #i think this should be the output size
# Building your LSTM cell
self.lstm_cell = nn.LSTMCell(input_size=self.vocab_size, hidden_size=hidden_size)
'''Connect CNN model to LSTM model'''
# output fully connected layer
# CNN does not necessarily need the FCC layers, in this example it is just extracting the features, that gets set to the LSTM which does the actual processing of the features
self.fc_in = nn.Linear(cnn_dim, vocab_size) #this takes the input from the CNN takes the features from the cnn #cnn_dim = 512, hidden_size = 128
self.fc_out = nn.Linear(hidden_size, vocab_size) # this is the looper in the LSTM #I think this is correct?
# embedding layer
self.embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.vocab_size)
# activations
self.softmax = nn.Softmax(dim=1)
def forward(self, features, captions):
#features: extracted features from ResNet
#captions: label of images
batch_size = features.size(0)
cnn_dim = features.size(1)
hidden_state = torch.zeros((batch_size, self.hidden_size)).cuda() # Initialize hidden state with zeros
cell_state = torch.zeros((batch_size, self.hidden_size)).cuda() # Initialize cell state with zeros
outputs = torch.empty((batch_size, captions.size(1), self.vocab_size)).cuda()
captions_embed = self.embed(captions)
'''Design LSTM model for captcha image recognition'''
# Pass the caption word by word for each time step
# It receives an input(x), makes an output(y), and receives this output as an input again recurrently
'''Defined hidden state, cell state, outputs, embedded captions'''
# can be designed to be word by word or character by character
for t in range(captions).size(1):
# for the first time step the input is the feature vector
if t == 0:
# probably have to get the output from the ResNet layer
# use the LSTM cells in here i presume
x = self.fc_in(features)
hidden_state, cell_state = self.lstm_cell(x[t], (hidden_state, cell_state))
x = self.fc_out(hidden_state)
outputs.append(hidden_state)
# for the 2nd+ time steps
else:
hidden_state, cell_state = self.lstm_cell(x[t], (hidden_state, cell_state))
x = self.fc_out(hidden_state)
outputs.append(hidden_state)
# build the output tensor
outputs = torch.stack(outputs,dim=0)
return outputs
nn.Embedding() is usually used to transfer a sparse one-hot vector to a dense vector (e.g. transfer 'a' to [0.1,0.2,...]) for computation practically. I do not understand why you try to embed captions, which looks like ground-truth. If you want to compute loss with that, try nn.CTCLoss().
If you are going to send a string to LSTM, it is recommended to embed characters in the string with nn.Embedding() firstly, which makes them dense and computational-practical. But if the inputs of LSTM is something extracted from CNN (or other modules), it is already dense and computational-practical and not necessary to project them with fc_in from my view.
I often use nn.LSTM() instead of nn.LSTMCell(), for the latter is troublesome.
There are some bugs in your code and I fixed them:
import torch
from torch import nn
class LSTM(nn.Module):
def __init__(self, cnn_dim, hidden_size, vocab_size, num_layers=1):
super(LSTM, self).__init__()
self.cnn_dim = cnn_dim # i think this is the input size
self.hidden_size = hidden_size
self.vocab_size = vocab_size # i think this should be the output size
# Building your LSTM cell
self.lstm_cell = nn.LSTMCell(input_size=self.vocab_size, hidden_size=hidden_size)
'''Connect CNN model to LSTM model'''
# output fully connected layer
# CNN does not necessarily need the FCC layers, in this example it is just extracting the features, that gets set to the LSTM which does the actual processing of the features
self.fc_in = nn.Linear(cnn_dim,
vocab_size) # this takes the input from the CNN takes the features from the cnn #cnn_dim = 512, hidden_size = 128
self.fc_out = nn.Linear(hidden_size,
vocab_size) # this is the looper in the LSTM #I think this is correct?
# embedding layer
self.embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.vocab_size)
# activations
self.softmax = nn.Softmax(dim=1)
def forward(self, features, captions):
# features: extracted features from ResNet
# captions: label of images
batch_size = features.size(0)
cnn_dim = features.size(1)
hidden_state = torch.zeros((batch_size, self.hidden_size)).cuda() # Initialize hidden state with zeros
cell_state = torch.zeros((batch_size, self.hidden_size)).cuda() # Initialize cell state with zeros
# outputs = torch.empty((batch_size, captions.size(1), self.vocab_size)).cuda()
outputs = torch.Tensor([]).cuda()
captions_embed = self.embed(captions)
'''Design LSTM model for captcha image recognition'''
# Pass the caption word by word for each time step
# It receives an input(x), makes an output(y), and receives this output as an input again recurrently
'''Defined hidden state, cell state, outputs, embedded captions'''
# can be designed to be word by word or character by character
# for t in range(captions).size(1):
for t in range(captions.size(1)):
# for the first time step the input is the feature vector
if t == 0:
# probably have to get the output from the ResNet layer
# use the LSTM cells in here i presume
x = self.fc_in(features)
# hidden_state, cell_state = self.lstm_cell(x[t], (hidden_state, cell_state))
hidden_state, cell_state = self.lstm_cell(x, (hidden_state, cell_state))
x = self.fc_out(hidden_state)
# outputs.append(hidden_state)
outputs = torch.cat([outputs, hidden_state])
# for the 2nd+ time steps
else:
# hidden_state, cell_state = self.lstm_cell(x[t], (hidden_state, cell_state))
hidden_state, cell_state = self.lstm_cell(x, (hidden_state, cell_state))
x = self.fc_out(hidden_state)
# outputs.append(hidden_state)
outputs = torch.cat([outputs, hidden_state])
# build the output tensor
# outputs = torch.stack(outputs, dim=0)
return outputs
m = LSTM(16, 32, 10)
m = m.cuda()
features = torch.randn((2, 16))
features = features.cuda()
captions = torch.randn((2, 10))
captions = torch.clip(captions, 0, 9)
captions = captions.long()
captions = captions.cuda()
m(features, captions)
This paper may help you somewhat: https://arxiv.org/abs/1904.01906
Google's Recommendation System course include a section on Retrieval, where it is mentioned that recommendations can be made by checking similarity between user embedding Ψ(X) and movie embedding Vj.
How to get particular user embedding through Ψ(X)? Going through below code (which can be found here), output in create_network() should be Ψ(X), so how would we extract embedding of particular user to create user recommendations?
def build_softmax_model(rated_movies, embedding_cols, hidden_dims):
"""Builds a Softmax model for MovieLens.
Args:
rated_movies: DataFrame of traing examples.
embedding_cols: A dictionary mapping feature names (string) to embedding
column objects. This will be used in tf.feature_column.input_layer() to
create the input layer.
hidden_dims: int list of the dimensions of the hidden layers.
Returns:
A CFModel object.
"""
def create_network(features):
"""Maps input features dictionary to user embeddings.
Args:
features: A dictionary of input string tensors.
Returns:
outputs: A tensor of shape [batch_size, embedding_dim].
"""
# Create a bag-of-words embedding for each sparse feature.
inputs = tf.feature_column.input_layer(features, embedding_cols)
# Hidden layers.
input_dim = inputs.shape[1].value
for i, output_dim in enumerate(hidden_dims):
w = tf.get_variable(
"hidden%d_w_" % i, shape=[input_dim, output_dim],
initializer=tf.truncated_normal_initializer(
stddev=1./np.sqrt(output_dim))) / 10.
outputs = tf.matmul(inputs, w)
input_dim = output_dim
inputs = outputs
return outputs
train_rated_movies, test_rated_movies = split_dataframe(rated_movies)
train_batch = make_batch(train_rated_movies, 200)
test_batch = make_batch(test_rated_movies, 100)
with tf.variable_scope("model", reuse=False):
# Train
train_user_embeddings = create_network(train_batch)
train_labels = select_random(train_batch["label"])
with tf.variable_scope("model", reuse=True):
# Test
test_user_embeddings = create_network(test_batch)
test_labels = select_random(test_batch["label"])
movie_embeddings = tf.get_variable(
"input_layer/movie_id_embedding/embedding_weights")
test_loss = softmax_loss(
test_user_embeddings, movie_embeddings, test_labels)
train_loss = softmax_loss(
train_user_embeddings, movie_embeddings, train_labels)
_, test_precision_at_10 = tf.metrics.precision_at_k(
labels=test_labels,
predictions=tf.matmul(test_user_embeddings, movie_embeddings, transpose_b=True),
k=10)
metrics = (
{"train_loss": train_loss, "test_loss": test_loss},
{"test_precision_at_10": test_precision_at_10}
)
embeddings = {"movie_id": movie_embeddings}
return CFModel(embeddings, train_loss, metrics)
CFModel is a helper class to train model using SGD.
If you pass the feature vector for a single user to the embedding model, its output Ψ(X) will be the query vector for that user, with shape [1, embedding_dim].
I am trying to implement a hierarchical transformer for document classification in Keras/tensorflow, in which:
(1) a word-level transformer produces a representation of each sentence, and attention weights for each word, and,
(2) a sentence-level transformer uses the outputs from (1) to produce a representation of each document, and attention weights for each sentence, and finally,
(3) the document representations produced by (2) are used to classify documents (in the following example, as belonging or not belonging to a given class).
I am attempting to model the classifier on Yang et al.'s approach here (https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf), but replacing the GRU and attention layers with transformers.
I am using Apoorv Nandan's transformer implementation from https://keras.io/examples/nlp/text_classification_with_transformer/.
I have two issues for which I would be grateful for the community's help:
(1) I get an error in the upper (sentence) level model that I can't resolve (details and code below)
(2) I don't know how to extract the word- and sentence-level attention weights, and value advice on how best to do this.
I am new to both Keras and this forum, so apologies for obvious mistakes and thank you in advance for any help.
Here is a reproducible example, indicating where I encounter errors:
First, establish the multi-head attention, transformer, and token/position embedding layers, after Nandan.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
class MultiHeadSelfAttention(layers.Layer):
def __init__(self, embed_dim, num_heads=8):
super(MultiHeadSelfAttention, self).__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
if embed_dim % num_heads != 0:
raise ValueError(
f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
)
self.projection_dim = embed_dim // num_heads
self.query_dense = layers.Dense(embed_dim)
self.key_dense = layers.Dense(embed_dim)
self.value_dense = layers.Dense(embed_dim)
self.combine_heads = layers.Dense(embed_dim)
def attention(self, query, key, value):
score = tf.matmul(query, key, transpose_b=True)
dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
scaled_score = score / tf.math.sqrt(dim_key)
weights = tf.nn.softmax(scaled_score, axis=-1)
output = tf.matmul(weights, value)
return output, weights
def separate_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, inputs):
# x.shape = [batch_size, seq_len, embedding_dim]
batch_size = tf.shape(inputs)[0]
query = self.query_dense(inputs) # (batch_size, seq_len, embed_dim)
key = self.key_dense(inputs) # (batch_size, seq_len, embed_dim)
value = self.value_dense(inputs) # (batch_size, seq_len, embed_dim)
query = self.separate_heads(
query, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
key = self.separate_heads(
key, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
value = self.separate_heads(
value, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
attention, weights = self.attention(query, key, value)
attention = tf.transpose(
attention, perm=[0, 2, 1, 3]
) # (batch_size, seq_len, num_heads, projection_dim)
concat_attention = tf.reshape(
attention, (batch_size, -1, self.embed_dim)
) # (batch_size, seq_len, embed_dim)
output = self.combine_heads(
concat_attention
) # (batch_size, seq_len, embed_dim)
return output
class TransformerBlock(layers.Layer):
def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate, name=None):
super(TransformerBlock, self).__init__(name=name)
self.att = MultiHeadSelfAttention(embed_dim, num_heads)
self.ffn = keras.Sequential(
[layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
)
self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = layers.Dropout(dropout_rate)
self.dropout2 = layers.Dropout(dropout_rate)
def call(self, inputs, training):
attn_output = self.att(inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
class TokenAndPositionEmbedding(layers.Layer):
def __init__(self, maxlen, vocab_size, embed_dim, name=None):
super(TokenAndPositionEmbedding, self).__init__(name=name)
self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
def call(self, x):
maxlen = tf.shape(x)[-1]
positions = tf.range(start=0, limit=maxlen, delta=1)
positions = self.pos_emb(positions)
x = self.token_emb(x)
return x + positions
For the purpose of this example, the data are 10,000 documents, each truncated to 15 sentences, each sentence with a maximum of 60 words, which are already converted to integer tokens 1-1000.
X is a 3-D tensor (10000, 15, 60) containing these tokens. y is a 1-D tensor containing the classes of the documents (1 or 0). For the purpose of this example there is no relation between X and y.
The following produces the example data:
max_docs = 10000
max_sentences = 15
max_words = 60
X = tf.random.uniform(shape=(max_docs, max_sentences, max_words), minval=1, maxval=1000, dtype=tf.dtypes.int32, seed=1)
y = tf.random.uniform(shape=(max_docs,), minval=0, maxval=2, dtype=tf.dtypes.int32, seed=1)
Here I attempt to construct the word level encoder, after https://keras.io/examples/nlp/text_classification_with_transformer/:
# Lower level (produce a representation of each sentence):
embed_dim = 100 # Embedding size for each token
num_heads = 2 # Number of attention heads
ff_dim = 64 # Hidden layer size in feed forward network inside transformer
L1_dense_units = 100 # Size of the sentence-level representations output by the word-level model
dropout_rate = 0.1
vocab_size=1000
word_input = layers.Input(shape=(max_words,), name='word_input')
word_embedding = TokenAndPositionEmbedding(maxlen=max_words, vocab_size=vocab_size,
embed_dim=embed_dim, name='word_embedding')(word_input)
word_transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='word_transformer')(word_embedding)
word_pool = layers.GlobalAveragePooling1D(name='word_pooling')(word_transformer)
word_drop = layers.Dropout(dropout_rate,name='word_drop')(word_pool)
word_dense = layers.Dense(L1_dense_units, activation="relu",name='word_dense')(word_drop)
word_encoder = keras.Model(word_input, word_dense)
word_encoder.summary()
It looks as though this word encoder works as intended to produce a representation of each sentence. Here, run on the 1st document, it produces a tensor of shape (15, 100), containing the vectors representing each of 15 sentences:
word_encoder(X[0]).shape
My problem is in connecting this to the higher (sentence) level model, to produce document representations.
I get error "NotImplementedError" when trying to apply the word encoder to each sentence in a document. I would be grateful for any help in fixing this issue, since the error message is not informative as to the specific problem.
After applying the word encoder to each sentence, the goal is to apply another transformer to produce attention weights for each sentence, and a document-level representation with which to perform classification. I can't determine whether this part of the model will work because of the error above.
Finally, I would like to extract word- and sentence-level attention weights for each document, and would be grateful for advice on how to do so.
Thank you in advance for any insight.
# Upper level (produce a representation of each document):
L2_dense_units = 100
sentence_input = layers.Input(shape=(max_sentences, max_words), name='sentence_input')
# This is the line producing "NotImplementedError":
sentence_encoder = tf.keras.layers.TimeDistributed(word_encoder, name='sentence_encoder')(sentence_input)
sentence_transformer = TransformerBlock(embed_dim=L1_dense_units, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='sentence_transformer')(sentence_encoder)
sentence_dense = layers.TimeDistributed(Dense(int(L2_dense_units)),name='sentence_dense')(sentence_transformer)
sentence_out = layers.Dropout(dropout_rate)(sentence_dense)
preds = layers.Dense(1, activation='sigmoid', name='sentence_output')(sentence_out)
model = keras.Model(sentence_input, preds)
model.summary()
I got NotImplementedError as well while trying to do the same thing as you. The thing is Keras's TimeDistributed layer needs to know its inner custom layer's output shapes. So you should add compute_output_shape method to your custom layers.
In your case MultiHeadSelfAttention, TransformerBlock and TokenAndPositionEmbedding layers should include:
class MultiHeadSelfAttention(layers.Layer):
...
def compute_output_shape(self, input_shape):
# it does not change the shape of its input
return input_shape
class TransformerBlock(layers.Layer):
...
def compute_output_shape(self, input_shape):
# it does not change the shape of its input
return input_shape
class TokenAndPositionEmbedding(layers.Layer):
...
def compute_output_shape(self, input_shape):
# it changes the shape from (batch_size, maxlen) to (batch_size, maxlen, embed_dim)
return input_shape + (self.pos_emb.output_dim,)
After you add these methods you should be able to run your code.
As for your second question, I am not sure but maybe you can return the "weights" variable that is returned from MultiHeadSelfAttention's attention method in call methods of both MultiHeadSelfAttention and TransformerBlock. So that you can access it where you build your model.
The idea behind this is I want to try some old school gradient ascent style visualization with Bert Model.
I want to know the effect of input on a specific layer's specific dimension. Thus, I took the gradient of the output of a specific layer's specific dimension wrt the first word embedding layer's output.
The best thing I can do here is the following:
from transformers import BertTokenizer, BertModel
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True,output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=True)
s = 'I want to sleep'
inputs = tokenizer.encode_plus(s,return_tensors='pt', add_special_tokens=False,is_pretokenized=True)
input_ids = inputs['input_ids']
output = model(input_ids)
hidden_states = output[-2]
X = hidden_states[0] #embedding space, shape: [1,4,768] (batch_size,sentence_length,embedding dimension)
y = hidden_states[3][0][0][0] ##the 0th position and 0th dimension of output of 3rd hidden layer. Dimension should just be [1], a scalar.
torch.autograd.grad(y,X,retain_graph=True, create_graph=True) #I take the gradient of y wrt. Since y is scalar. The dimension of the gradient is just the dimension of X.
This is, however, not good enough. I want the gradient wrt the actual word embedding layer. However, Transformer's embedding contains "position_embedding" and "token_type_embedding". Here's the code for the first layer embedding:
class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, input_ids, token_type_ids=None, position_ids=None):
seq_length = input_ids.size(1)
if position_ids is None:
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)
words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
Ideally, I want the gradient wrt JUST “words_embeddings" Rather than, wrt "words_embeddings + position_embeddings + token_type_embeddings" and follows by layerNorm and dropout.
I think I can do this by modifying changing the model. Is there a way to it without changing the model?
I am building a next-character prediction LSTM for sentences.
I was following the tutorial here https://indico.io/blog/tensorflow-data-inputs-part1-placeholders-protobufs-queues/ on how to make the data input process part of the tensorflow graph, and now I have a stateful LSTM that is fed with symbolic (!) batches generated by tf.contrib.training.batch_sequences_with_states, which are in turn read from TF.SequenceExamples of varying lengths (Char-RNN working on characters in a sentence), as shown in the code below.
The whole input and batching process is therefore part of the compute graph.
The training works, but since the input is symbolic (not a TF.placeholder), I cannot figure out how to feed in my own sentence defined as a string to the LSTM to perform inference (sample from model). Any ideas?
import tensorflow as tf
import numpy as np
from tensorflow.python.util import nest
import SequenceHandler
import DataLoader
# SETTINGS
learning_rate = 0.001
batch_size = 128
num_unroll = 200
num_enqueue_threads = 10
lstm_size = 256
vocab_size = 39
# DATA
key, context, sequences = SequenceHandler.loadSequence("input.tf") # Loads TF.SequenceExample sequence using TF.RecordReader
# MODEL
cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=lstm_size)
initial_states = {"lstm_state_c": tf.zeros(cell.state_size[0], dtype=tf.float32), "lstm_state_h": tf.zeros(cell.state_size[0], dtype=tf.float32)}
batch = tf.contrib.training.batch_sequences_with_states(
input_key=key,
input_sequences=sequences,
input_context=context,
input_length=tf.cast(context["length"], tf.int32),
initial_states=initial_states,
num_unroll=num_unroll,
batch_size=batch_size,
num_threads=num_enqueue_threads,
capacity=batch_size * num_enqueue_threads * 2)
# BATCH INPUT
inputs = batch.sequences["inputs"]
targets = batch.sequences["outputs"]
# Convert input into float one-hot representation
embedding = tf.constant(np.eye(vocab_size), dtype=tf.float32)
inputs = tf.nn.embedding_lookup(embedding, inputs)
# Reshape inputs (and targets respectively) into list of length T (unrolling length), with each element being a Tensor of shape (batch_size, input_dimensionality)
inputs_by_time = tf.split(1, num_unroll, inputs)
inputs_by_time = [tf.squeeze(elem, squeeze_dims=1) for elem in inputs_by_time]
targets_by_time = tf.split(1, num_unroll, targets)
targets_by_time = [tf.squeeze(elem, squeeze_dims=1) for elem in targets_by_time]
targets_by_time_packed = tf.pack(targets_by_time)
# Build RNN
state_name=("lstm_state_c", "lstm_state_h")
state_size = cell.state_size
state_is_tuple = nest.is_sequence(state_size)
state_name_tuple = nest.is_sequence(state_name)
state_name_flat = nest.flatten(state_name)
state_size_flat = nest.flatten(state_size)
initial_state = nest.pack_sequence_as(
structure=state_size,
flat_sequence=[batch.state(s) for s in state_name_flat])
seq_lengths = batch.context["length"]
(outputs, state) = tf.nn.state_saving_rnn(cell, inputs_by_time, state_saver=batch,
sequence_length=seq_lengths, state_name=state_name)
# Create softmax parameters, weights and bias, and apply to RNN outputs at each timestep
with tf.variable_scope('softmax') as sm_vs:
softmax_w = tf.get_variable("softmax_w", [lstm_size, vocab_size])
softmax_b = tf.get_variable("softmax_b", [vocab_size])
logits = [tf.matmul(outputStep, softmax_w) + softmax_b for outputStep in outputs]
logit = tf.pack(logits)
probs = tf.nn.softmax(logit)
with tf.name_scope('loss'):
# Compute mean cross entropy loss for each output.
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logit, targets_by_time_packed)
mean_loss = tf.reduce_mean(loss)
global_step = tf.get_variable('global_step', [],
initializer=tf.constant_initializer(0.0))
learning_rate = tf.constant(learning_rate)
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(mean_loss, tvars),
5.0)
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.apply_gradients(zip(grads, tvars),
global_step=global_step)
# TRAINING LOOP
# Start a prefetcher in the background
sess = tf.Session()
tf.train.start_queue_runners(sess=sess)
init_op = tf.initialize_all_variables()
sess.run(init_op)
# LOGGING
summary_writer = tf.train.SummaryWriter("log", sess.graph)
vocab_index_dict, index_vocab_dict, vocab_size = DataLoader.load_vocab("characters.json", "UTF-8")
while True:
# Step through batches, perform training
trainOps = [mean_loss, state, train_op,
global_step]
res = sess.run(trainOps) # THIS WORKS - LOSS DECLINES
testString = "Hello"
# HOW TO SAMPLE FROM MODEL, GIVEN INPUT testString HERE?
In general, I have trouble understanding how to work with the data input as part of the compute graph, in terms of how to split it for cross-validation etc., and there seem to be no examples in that direction using TFRecords.