I am trying to train mBERT with an LSTM, followed by a fully connected layer, to detect paraphrasing between two sentences. However, there is an error in the dimensions.
This is the SentencePairClassifier class:
import torch
from torch import nn
from transformers import AutoModel

class SentencePairClassifier(nn.Module):
def __init__(self, bert_model="bert-base-multilingual-cased", freeze_bert=False):
super(SentencePairClassifier, self).__init__()
# Instantiating BERT-based model object
self.bert_layer = AutoModel.from_pretrained(bert_model)
# Fix the hidden-state size of the encoder outputs (If you want to add other pre-trained models here, search for the encoder output size)
if bert_model == "bert-base-multilingual-cased": #179M parameters
hidden_size = 768
# Freeze bert layers and only train the classification layer weights
if freeze_bert:
for p in self.bert_layer.parameters():
p.requires_grad = False
#lstm layer
self.LSTM = nn.LSTM(hidden_size,512,batch_first=True,bidirectional=True)
# Classification layer
self.cls_layer = nn.Linear(512*2, 1)
self.dropout = nn.Dropout(p=0.1)
#autocast() # run in mixed precision
def forward(self, input_ids, attn_masks, token_type_ids):
'''
Inputs:
-input_ids : Tensor containing token ids
-attn_masks : Tensor containing attention masks to be used to focus on non-padded values
-token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2
'''
# Feeding the inputs to the BERT-based model to obtain contextualized representations
cont_reps, pooler_output = self.bert_layer(input_ids, attn_masks, token_type_ids)
cont_reps = cont_reps.permute(1, 0, 2)
lstm_output, (last_hidden, last_cell) = self.lstm(cont_reps) ## extract the 1st token's embeddings
hidden = torch.cat((lstm_output[:,-1, :512],lstm_output[:,0, 512:]),dim=-1)
# Feeding to the classifier layer the last layer hidden-state of the [CLS] token further processed by a
# Linear Layer and a Tanh activation. The Linear layer weights were trained from the sentence order prediction (ALBERT) or next sentence prediction (BERT)
# objective during pre-training.
logits = self.cls_layer(hidden.view(-1,512*2))
return logits
This is the CustomDataset class:
import numpy as np
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class CustomDataset(Dataset):
def __init__(self, data, maxlen, with_labels=True, bert_model="bert-base-multilingual-cased"):
self.data = data # pandas dataframe
#Initialize the tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(bert_model)
self.maxlen = maxlen
self.with_labels = with_labels
def __len__(self):
return len(self.data)
def __getitem__(self, index):
# Selecting sentence1 and sentence2 at the specified index in the data frame
sent1 = str(self.data.loc[index, 'sentence1'])
sent2 = str(self.data.loc[index, 'sentence2'])
# Tokenize the pair of sentences to get token ids, attention masks and token type ids
encoded_pair = self.tokenizer(sent1, sent2,
padding='max_length', # Pad to max_length
truncation=True, # Truncate to max_length
max_length=self.maxlen,
return_tensors='pt') # Return torch.Tensor objects
token_ids = np.array(encoded_pair["input_ids"], dtype="int32")
attn_masks = np.array(encoded_pair["attention_mask"], dtype="int32")
token_type_ids = np.array(encoded_pair["token_type_ids"], dtype="int32")
# token_ids = encoded_pair['input_ids'].squeeze(0) # tensor of token ids
# attn_masks = encoded_pair['attention_mask'].squeeze(0) # binary tensor with "0" for padded values and "1" for the other values
# token_type_ids = encoded_pair['token_type_ids'].squeeze(0) # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens
if self.with_labels: # True if the dataset has labels
label = self.data.loc[index, 'label']
return token_ids, attn_masks, token_type_ids, label
else:
return token_ids, attn_masks, token_type_ids
Note: I tried the code without the LSTM and it works, but after adding the LSTM the errors started appearing.
This is the error:
/usr/local/lib/python3.8/dist-packages/transformers/modeling_bert.py in transpose_for_scores(self, x)
237 new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
238 x = x.view(*new_x_shape)
--> 239 return x.permute(0, 2, 1, 3)
240
241 def forward(
RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 5 is not equal to len(dims) = 4
I tried to change the dimensions but it didn't work.
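For reference, a quick way to see where the extra dimension comes from (this is only a diagnostic sketch; the names follow the CustomDataset code above, and the squeeze calls are the ones already present in the commented-out lines):
# The tokenizer is called with return_tensors='pt', so each field already has a
# leading batch dimension of 1; after the DataLoader stacks samples, the batch
# becomes (batch_size, 1, maxlen) instead of (batch_size, maxlen), which is what
# makes BERT's internal tensors 5-dimensional.
print(encoded_pair['input_ids'].shape)                    # torch.Size([1, maxlen])
token_ids = encoded_pair['input_ids'].squeeze(0)          # (maxlen,)
attn_masks = encoded_pair['attention_mask'].squeeze(0)    # (maxlen,)
token_type_ids = encoded_pair['token_type_ids'].squeeze(0)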
The actual code
import torch
from torch import nn
import re
# Define the vocabulary of the input and output sequences
vocab = ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog", ".",
"old", "man", "and", "sea", "A", "story", "as", "time", "cat", "in", "hat",
"To", "be", "or", "not", "to", "be"]
# Define the input and output sizes
input_size = len(vocab)
output_size = len(vocab)
# Define the other parameters of the model
hidden_size = 256
n_layers = 2
dropout = 0.2
batch_size = 64
learning_rate = 0.01
class MyLSTMModel(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, dropout, batch_size):
super().__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.dropout = dropout
self.batch_size = batch_size
# Define the LSTM cell to use in the layer
self.lstm_cell = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout)
# Define the architecture of the using the LSTM cell
self.model = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout)
def forward(self, input_seq, hidden):
# Preprocess the input string to lowercase and remove punctuation marks
input_seq = str(input_seq).lower()
input_seq = re.sub(r'[^\w\s]', '', input_seq)
# Convert the input string to a list of integers, where each integer represents the index of a word in the vocab list
input_seq = [vocab.index(word) for word in input_seq.split() if word in vocab]
# Convert the list of integers to a tensor
input_seq = torch.tensor(input_seq).unsqueeze(0)
# Reshape the input tensor to have three dimensions
input_seq = input_seq.unsqueeze(1)
# Apply the LSTM cell to the input sequence
hidden = self.init_hidden()
output, hidden = self.lstm_cell(input_seq, hidden)
return output, hidden
def loss(self, output, target_seq):
# Convert the target string to a list of integers, where each integer represents the index of a word in the vocab list
target_seq = [vocab.index(word) for word in target_seq.split() if word in vocab]
# Convert the list of integers to a tensor
target_seq = torch.tensor(target_seq)
# Calculate the cross-entropy loss between the output and the target sequence, using a sequence loss function
output = output.view(-1, output_size)
target_seq = target_seq.view(-1)
loss = nn.CrossEntropyLoss()(output, target_seq)
return loss
def init_hidden(self):
# Initialize the hidden state with zeros
return torch.zeros(self.num_layers, self.batch_size, self.hidden_size).unsqueeze(0)
def train(self, train_data):
# Loop through each training example
for input_seq, target_seq in train_data:
# Convert input and target sequences to tensors
input_seq = [vocab.index(word) for word in input_seq.split()]
input_seq = torch.tensor(input_seq)
target_seq = torch.Tensor(target_seq)
# Set initial hidden state to all zeros
hidden = torch.zeros(1, self.hidden_size)
# Forward pass to get the output sequence
output, hidden = self.forward(input_seq, hidden)
# Calculate the loss between the output sequence and the target sequence
loss = self.loss(output, target_seq)
# Backward pass to compute the gradients of the loss with respect to the model's parameters
loss.backward()
# Update the model's parameters using the optimizer object
optimizer = torch.optim.SGD(self.parameters(), lr=learning_rate)
optimizer.step()
# Create an instance of the MyLSTMModel class
model = MyLSTMModel(input_size, hidden_size, n_layers, dropout, batch_size)
# Define some example training data
train_data = [("The quick brown fox jumps over the lazy dog", "A story as old as time"),
("To be or not to be", "That is the question")]
# Train the model on the training data
model.train(train_data)
I run the code as a normal Python script.
My code's goal:
The code defines a MyLSTMModel class which extends the nn.Module class from PyTorch.
This class contains methods for defining the model architecture, initializing the hidden state of the model,
calculating the loss, training the model, and generating text using the trained model.
The MyLSTMModel class uses an LSTM (Long Short-Term Memory) layer to process the input sequence
and generate an output sequence. LSTM is a type of recurrent neural network that is particularly
well-suited for processing sequences of data. It can remember important information from earlier in
the sequence and use that information to make predictions about the future. In this case, the LSTM layer
is used to generate text based on the input sequence.
*But I get this error when I run the code:*
Traceback (most recent call last):
File "D:\path", line 121, in
model.train(train_data)
File "D:\path", line 95, in train
target_seq = torch.Tensor(target_seq)
TypeError: new(): invalid data type 'str'
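For context, torch.Tensor cannot be constructed from a string. A sketch of the conversion (assuming target_seq is one of the raw strings from train_data, and mirroring what the loss method above already does with vocab.index):
# Hypothetical example: map the target sentence to vocabulary indices first,
# then build the tensor from integers rather than from the raw string.
target_seq = "A story as old as time"
target_ids = [vocab.index(word) for word in target_seq.split() if word in vocab]
target_tensor = torch.tensor(target_ids, dtype=torch.long)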
I have seen other people with this error and tried to follow the steps to resolve it, but I continue to receive this error: "RuntimeError: Input and parameter tensors are not at the same device, found input tensor at cpu and parameter tensor at cuda:0"
I run both model.to(device) and input_seq.to(device). The error says it found an input tensor on the CPU, but all input data should be on the GPU after input_seq.to(device). Below is the full code:
import torch
from torch import nn
import numpy as np
text = ['hey how are you','good i am fine','have a nice day']
# Join all the sentences together and extract the unique characters from the combined sentences
chars = set(''.join(text))
# Creating a dictionary that maps integers to the characters
int2char = dict(enumerate(chars))
# Creating another dictionary that maps characters to integers
char2int = {char: ind for ind, char in int2char.items()}
# Finding the length of the longest string in our data
maxlen = len(max(text, key=len))
# Padding
# A simple loop that loops through the list of sentences and adds a ' ' whitespace until the length of
# the sentence matches the length of the longest sentence
for i in range(len(text)):
while len(text[i])<maxlen:
text[i] += ' '
# Creating lists that will hold our input and target sequences
input_seq = []
target_seq = []
for i in range(len(text)):
# Remove last character for input sequence
input_seq.append(text[i][:-1])
# Remove first character for target sequence
target_seq.append(text[i][1:])
print("Input Sequence: {}\nTarget Sequence: {}".format(input_seq[i], target_seq[i]))
for i in range(len(text)):
input_seq[i] = [char2int[character] for character in input_seq[i]]
target_seq[i] = [char2int[character] for character in target_seq[i]]
dict_size = len(char2int)
seq_len = maxlen - 1
batch_size = len(text)
def one_hot_encode(sequence, dict_size, seq_len, batch_size):
# Creating a multi-dimensional array of zeros with the desired output shape
features = np.zeros((batch_size, seq_len, dict_size), dtype=np.float32)
# Replacing the 0 at the relevant character index with a 1 to represent that character
for i in range(batch_size):
for u in range(seq_len):
features[i, u, sequence[i][u]] = 1
return features
# Input shape --> (Batch Size, Sequence Length, One-Hot Encoding Size)
input_seq = one_hot_encode(input_seq, dict_size, seq_len, batch_size)
input_seq = torch.from_numpy(input_seq)
target_seq = torch.Tensor(target_seq)
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()
# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
device = torch.device("cuda")
print("GPU is available")
else:
device = torch.device("cpu")
print("GPU not available, CPU used")
class Model(nn.Module):
def __init__(self, input_size, output_size, hidden_dim, n_layers):
super(Model, self).__init__()
# Defining some parameters
self.hidden_dim = hidden_dim
self.n_layers = n_layers
#Defining the layers
# RNN Layer
self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
# Fully connected layer
self.fc = nn.Linear(hidden_dim, output_size)
def forward(self, x):
batch_size = x.size(0)
# Initializing hidden state for first input using method defined below
hidden = self.init_hidden(batch_size)
# Passing in the input and hidden state into the model and obtaining outputs
out, hidden = self.rnn(x, hidden)
# Reshaping the outputs such that it can be fit into the fully connected layer
out = out.contiguous().view(-1, self.hidden_dim)
out = self.fc(out)
return out, hidden
def init_hidden(self, batch_size):
# This method generates the first hidden state of zeros which we'll use in the forward pass
# We'll send the tensor holding the hidden state to the device we specified earlier as well
hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
return hidden
# Instantiate the model with hyperparameters
model = Model(input_size=dict_size, output_size=dict_size, hidden_dim=12, n_layers=1)
# We'll also set the model to the device that we defined earlier (default is CPU)
model.to(device)
# Define hyperparameters
n_epochs = 100
lr=0.01
# Define Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Training Run
for epoch in range(1, n_epochs + 1):
optimizer.zero_grad() # Clears existing gradients from previous epoch
input_seq.to(device)
target_seq.to(device)
output, hidden = model(input_seq)
loss = criterion(output, target_seq.view(-1).long())
loss.backward() # Does backpropagation and calculates gradients
optimizer.step() # Updates the weights accordingly
if epoch%10 == 0:
print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
print("Loss: {:.4f}".format(loss.item()))
Unlike the to method available on nn.Modules such as your model, the to method on Tensors is not an in-place operation. As stated on the documentation page:
This method [nn.Module.to] modifies the module in-place.
vs for Tensor.to:
[...] the returned tensor is a copy of self with the desired [...] torch.device.
In other words, you need to reassign the tensors in order to effectively send them to the device.
input_seq = input_seq.to(device)
target_seq = target_seq.to(device)
While an nn.Module won't need this treatment:
model.to(device)
To clearly understand what happens here, take this example:
>>> x = torch.zeros(1) # on cpu
>>> y = x.cuda() # y is a copy of x
>>> y.device # placed on cuda device
'cuda:0'
>>> x.device # but x remains on the original device
'cpu'
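Applied to the training loop from the question, only the two .to calls change (everything else stays the same):
for epoch in range(1, n_epochs + 1):
    optimizer.zero_grad()
    input_seq = input_seq.to(device)    # reassign: Tensor.to returns a copy
    target_seq = target_seq.to(device)  # reassign here as well
    output, hidden = model(input_seq)
    loss = criterion(output, target_seq.view(-1).long())
    loss.backward()
    optimizer.step()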
I have a document classification task that classifies documents as good (1) or bad (0), and I use some sentence embeddings for each document to classify the documents accordingly.
What I would like to do is retrieve the attention scores for each document, to obtain the most "relevant" sentences (i.e., those with high attention scores).
I padded each document to the same length (i.e., 1000 sentences per document). So my tensor for 5000 documents looks like this: X = np.ones(shape=(5000, 1000, 200)) (5000 documents, each a sequence of 1000 sentence vectors, with each sentence vector consisting of 200 features).
My network looks like this:
from tensorflow.keras import initializers
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Dense, GRU, Bidirectional, Layer
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import TruePositives, TrueNegatives, FalseNegatives, FalsePositives
no_sentences_per_doc = 1000
sentence_embedding = 200
sequence_input = Input(shape=(no_sentences_per_doc, sentence_embedding))
gru_layer = Bidirectional(GRU(50,
return_sequences=True
))(sequence_input)
sent_dense = Dense(100, activation='relu', name='sent_dense')(gru_layer)
sent_att,sent_coeffs = AttentionLayer(100,return_coefficients=True, name='sent_attention')(sent_dense)
preds = Dense(1, activation='sigmoid',name='output')(sent_att)
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=[TruePositives(name='true_positives'),
TrueNegatives(name='true_negatives'),
FalseNegatives(name='false_negatives'),
FalsePositives(name='false_positives')
])
history = model.fit(X, y, validation_data=(x_val, y_val), epochs=10, batch_size=32)
After training I retrieved the attention scores as follows:
sent_att_weights = Model(inputs=sequence_input,outputs=sent_coeffs)
## load a single sample
## from file with 150 sentences (one sentence per line)
## each sentence consisting of 200 features
x_sample = np.load(x_sample)
## and reshape to (1, 1000, 200)
x_sample = x_sample.reshape(1,1000,200)
output_array = sent_att_weights.predict(x_sample)
However, if I show the top 3 attention scores for the sentences, I also obtain sentence indices that are, for example, [432, 434, 999] for a document that has only 150 sentences (the rest is padded, i.e., just zeros).
Does that make sense, or am I doing something wrong here? (Is there a mistake in my attention layer, or is it due to a low F-score?)
The attention layer I use is the following:
class AttentionLayer(Layer):
"""
https://humboldt-wi.github.io/blog/research/information_systems_1819/group5_han/
"""
def __init__(self,attention_dim=100,return_coefficients=False,**kwargs):
# Initializer
self.supports_masking = True
self.return_coefficients = return_coefficients
self.init = initializers.get('glorot_uniform') # initializes values with uniform distribution
self.attention_dim = attention_dim
super(AttentionLayer, self).__init__(**kwargs)
def build(self, input_shape):
# Builds all weights
# W = Weight matrix, b = bias vector, u = context vector
assert len(input_shape) == 3
self.W = K.variable(self.init((input_shape[-1], self.attention_dim)),name='W')
self.b = K.variable(self.init((self.attention_dim, )),name='b')
self.u = K.variable(self.init((self.attention_dim, 1)),name='u')
self.trainable_weights = [self.W, self.b, self.u]
super(AttentionLayer, self).build(input_shape)
def compute_mask(self, input, input_mask=None):
return None
def call(self, hit, mask=None):
# Here, the actual calculation is done
uit = K.bias_add(K.dot(hit, self.W),self.b)
uit = K.tanh(uit)
ait = K.dot(uit, self.u)
ait = K.squeeze(ait, -1)
ait = K.exp(ait)
if mask is not None:
ait *= K.cast(mask, K.floatx())
ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
ait = K.expand_dims(ait)
weighted_input = hit * ait
if self.return_coefficients:
return [K.sum(weighted_input, axis=1), ait]
else:
return K.sum(weighted_input, axis=1)
def compute_output_shape(self, input_shape):
if self.return_coefficients:
return [(input_shape[0], input_shape[-1]), (input_shape[0], input_shape[-1], 1)]
else:
return input_shape[0], input_shape[-1]
Note that I use Keras with the TensorFlow backend, version 2.1; the attention layer was originally written for Theano, but I use import tensorflow.keras.backend as K.
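One thing worth checking, as a sketch only (it assumes no mask currently reaches the attention layer): the ait *= K.cast(mask, K.floatx()) branch in AttentionLayer.call only fires if Keras propagates a mask, for example from a Masking layer placed before the GRU, so that padded (all-zero) sentence positions are excluded from the softmax:
from tensorflow.keras.layers import Masking
sequence_input = Input(shape=(no_sentences_per_doc, sentence_embedding))
# Hypothetical change: padded sentence vectors are all zeros, so mask_value=0.0
# produces a mask that the GRU propagates to the attention layer.
masked_input = Masking(mask_value=0.0)(sequence_input)
gru_layer = Bidirectional(GRU(50, return_sequences=True))(masked_input)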
I am trying to implement a hierarchical transformer for document classification in Keras/tensorflow, in which:
(1) a word-level transformer produces a representation of each sentence, and attention weights for each word, and,
(2) a sentence-level transformer uses the outputs from (1) to produce a representation of each document, and attention weights for each sentence, and finally,
(3) the document representations produced by (2) are used to classify documents (in the following example, as belonging or not belonging to a given class).
I am attempting to model the classifier on Yang et al.'s approach here (https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf), but replacing the GRU and attention layers with transformers.
I am using Apoorv Nandan's transformer implementation from https://keras.io/examples/nlp/text_classification_with_transformer/.
I have two issues for which I would be grateful for the community's help:
(1) I get an error in the upper (sentence) level model that I can't resolve (details and code below)
(2) I don't know how to extract the word- and sentence-level attention weights, and value advice on how best to do this.
I am new to both Keras and this forum, so apologies for obvious mistakes and thank you in advance for any help.
Here is a reproducible example, indicating where I encounter errors:
First, establish the multi-head attention, transformer, and token/position embedding layers, after Nandan.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
class MultiHeadSelfAttention(layers.Layer):
def __init__(self, embed_dim, num_heads=8):
super(MultiHeadSelfAttention, self).__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
if embed_dim % num_heads != 0:
raise ValueError(
f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
)
self.projection_dim = embed_dim // num_heads
self.query_dense = layers.Dense(embed_dim)
self.key_dense = layers.Dense(embed_dim)
self.value_dense = layers.Dense(embed_dim)
self.combine_heads = layers.Dense(embed_dim)
def attention(self, query, key, value):
score = tf.matmul(query, key, transpose_b=True)
dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
scaled_score = score / tf.math.sqrt(dim_key)
weights = tf.nn.softmax(scaled_score, axis=-1)
output = tf.matmul(weights, value)
return output, weights
def separate_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, inputs):
# x.shape = [batch_size, seq_len, embedding_dim]
batch_size = tf.shape(inputs)[0]
query = self.query_dense(inputs) # (batch_size, seq_len, embed_dim)
key = self.key_dense(inputs) # (batch_size, seq_len, embed_dim)
value = self.value_dense(inputs) # (batch_size, seq_len, embed_dim)
query = self.separate_heads(
query, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
key = self.separate_heads(
key, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
value = self.separate_heads(
value, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
attention, weights = self.attention(query, key, value)
attention = tf.transpose(
attention, perm=[0, 2, 1, 3]
) # (batch_size, seq_len, num_heads, projection_dim)
concat_attention = tf.reshape(
attention, (batch_size, -1, self.embed_dim)
) # (batch_size, seq_len, embed_dim)
output = self.combine_heads(
concat_attention
) # (batch_size, seq_len, embed_dim)
return output
class TransformerBlock(layers.Layer):
def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate, name=None):
super(TransformerBlock, self).__init__(name=name)
self.att = MultiHeadSelfAttention(embed_dim, num_heads)
self.ffn = keras.Sequential(
[layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
)
self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = layers.Dropout(dropout_rate)
self.dropout2 = layers.Dropout(dropout_rate)
def call(self, inputs, training):
attn_output = self.att(inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
class TokenAndPositionEmbedding(layers.Layer):
def __init__(self, maxlen, vocab_size, embed_dim, name=None):
super(TokenAndPositionEmbedding, self).__init__(name=name)
self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
def call(self, x):
maxlen = tf.shape(x)[-1]
positions = tf.range(start=0, limit=maxlen, delta=1)
positions = self.pos_emb(positions)
x = self.token_emb(x)
return x + positions
For the purpose of this example, the data are 10,000 documents, each truncated to 15 sentences, each sentence with a maximum of 60 words, which are already converted to integer tokens 1-1000.
X is a 3-D tensor (10000, 15, 60) containing these tokens. y is a 1-D tensor containing the classes of the documents (1 or 0). For the purpose of this example there is no relation between X and y.
The following produces the example data:
max_docs = 10000
max_sentences = 15
max_words = 60
X = tf.random.uniform(shape=(max_docs, max_sentences, max_words), minval=1, maxval=1000, dtype=tf.dtypes.int32, seed=1)
y = tf.random.uniform(shape=(max_docs,), minval=0, maxval=2, dtype=tf.dtypes.int32, seed=1)
Here I attempt to construct the word level encoder, after https://keras.io/examples/nlp/text_classification_with_transformer/:
# Lower level (produce a representation of each sentence):
embed_dim = 100 # Embedding size for each token
num_heads = 2 # Number of attention heads
ff_dim = 64 # Hidden layer size in feed forward network inside transformer
L1_dense_units = 100 # Size of the sentence-level representations output by the word-level model
dropout_rate = 0.1
vocab_size=1000
word_input = layers.Input(shape=(max_words,), name='word_input')
word_embedding = TokenAndPositionEmbedding(maxlen=max_words, vocab_size=vocab_size,
embed_dim=embed_dim, name='word_embedding')(word_input)
word_transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='word_transformer')(word_embedding)
word_pool = layers.GlobalAveragePooling1D(name='word_pooling')(word_transformer)
word_drop = layers.Dropout(dropout_rate,name='word_drop')(word_pool)
word_dense = layers.Dense(L1_dense_units, activation="relu",name='word_dense')(word_drop)
word_encoder = keras.Model(word_input, word_dense)
word_encoder.summary()
It looks as though this word encoder works as intended to produce a representation of each sentence. Here, run on the 1st document, it produces a tensor of shape (15, 100), containing the vectors representing each of 15 sentences:
word_encoder(X[0]).shape
My problem is in connecting this to the higher (sentence) level model, to produce document representations.
I get error "NotImplementedError" when trying to apply the word encoder to each sentence in a document. I would be grateful for any help in fixing this issue, since the error message is not informative as to the specific problem.
After applying the word encoder to each sentence, the goal is to apply another transformer to produce attention weights for each sentence, and a document-level representation with which to perform classification. I can't determine whether this part of the model will work because of the error above.
Finally, I would like to extract word- and sentence-level attention weights for each document, and would be grateful for advice on how to do so.
Thank you in advance for any insight.
# Upper level (produce a representation of each document):
L2_dense_units = 100
sentence_input = layers.Input(shape=(max_sentences, max_words), name='sentence_input')
# This is the line producing "NotImplementedError":
sentence_encoder = tf.keras.layers.TimeDistributed(word_encoder, name='sentence_encoder')(sentence_input)
sentence_transformer = TransformerBlock(embed_dim=L1_dense_units, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='sentence_transformer')(sentence_encoder)
sentence_dense = layers.TimeDistributed(layers.Dense(int(L2_dense_units)), name='sentence_dense')(sentence_transformer)
sentence_out = layers.Dropout(dropout_rate)(sentence_dense)
preds = layers.Dense(1, activation='sigmoid', name='sentence_output')(sentence_out)
model = keras.Model(sentence_input, preds)
model.summary()
I got NotImplementedError as well while trying to do the same thing as you. The thing is, Keras's TimeDistributed layer needs to know its inner custom layer's output shape, so you should add a compute_output_shape method to your custom layers.
In your case, the MultiHeadSelfAttention, TransformerBlock, and TokenAndPositionEmbedding layers should include:
class MultiHeadSelfAttention(layers.Layer):
...
def compute_output_shape(self, input_shape):
# it does not change the shape of its input
return input_shape
class TransformerBlock(layers.Layer):
...
def compute_output_shape(self, input_shape):
# it does not change the shape of its input
return input_shape
class TokenAndPositionEmbedding(layers.Layer):
...
def compute_output_shape(self, input_shape):
# it changes the shape from (batch_size, maxlen) to (batch_size, maxlen, embed_dim)
return input_shape + (self.pos_emb.output_dim,)
After you add these methods you should be able to run your code.
As for your second question, I am not sure, but maybe you can return the "weights" variable that is returned from MultiHeadSelfAttention's attention method in the call methods of both MultiHeadSelfAttention and TransformerBlock, so that you can access it where you build your model.
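A rough sketch of that idea (untested; the rest of each class stays exactly as in the question):
class MultiHeadSelfAttention(layers.Layer):
    ...
    def call(self, inputs):
        ...
        output = self.combine_heads(concat_attention)  # (batch_size, seq_len, embed_dim)
        return output, weights  # also expose the per-head attention weights
class TransformerBlock(layers.Layer):
    ...
    def call(self, inputs, training):
        attn_output, attn_weights = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output), attn_weights
Note that the word_transformer and sentence_transformer calls in the model would then need to unpack two outputs, e.g. word_transformer, word_attn = TransformerBlock(...)(word_embedding).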
The idea behind this is that I want to try some old-school gradient-ascent-style visualization with a BERT model.
I want to know the effect of the input on a specific dimension of a specific layer. Thus, I took the gradient of a specific dimension of a specific layer's output wrt the first word embedding layer's output.
The best thing I can do here is the following:
import torch
from transformers import BertTokenizer, BertModel
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True,output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
s = 'I want to sleep'
inputs = tokenizer.encode_plus(s,return_tensors='pt', add_special_tokens=False,is_pretokenized=True)
input_ids = inputs['input_ids']
output = model(input_ids)
hidden_states = output[-2]
X = hidden_states[0] #embedding space, shape: [1,4,768] (batch_size,sentence_length,embedding dimension)
y = hidden_states[3][0][0][0] ##the 0th position and 0th dimension of output of 3rd hidden layer. Dimension should just be [1], a scalar.
torch.autograd.grad(y, X, retain_graph=True, create_graph=True) # I take the gradient of y wrt X. Since y is a scalar, the dimension of the gradient is just the dimension of X.
This is, however, not good enough. I want the gradient wrt the actual word embedding layer. However, Transformer's embedding contains "position_embedding" and "token_type_embedding". Here's the code for the first layer embedding:
class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, input_ids, token_type_ids=None, position_ids=None):
seq_length = input_ids.size(1)
if position_ids is None:
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)
words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
Ideally, I want the gradient wrt JUST "words_embeddings", rather than wrt "words_embeddings + position_embeddings + token_type_embeddings" followed by LayerNorm and dropout.
I think I can do this by modifying the model. Is there a way to do it without changing the model?
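One possibility without editing the model source is a forward hook (a sketch only, assuming a standard transformers BertModel where the sub-module is reachable as model.embeddings.word_embeddings): capture the output of the word-embedding lookup, which is exactly words_embeddings before the position/token-type additions, LayerNorm, and dropout, and differentiate with respect to that tensor.
import torch
from transformers import BertTokenizer, BertModel

model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True, output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

captured = {}

def save_word_embeddings(module, inputs, output):
    # output here is words_embeddings only, straight out of nn.Embedding
    captured['words_embeddings'] = output

hook = model.embeddings.word_embeddings.register_forward_hook(save_word_embeddings)

inputs = tokenizer.encode_plus('I want to sleep', return_tensors='pt', add_special_tokens=False)
output = model(inputs['input_ids'])
hidden_states = output[-2]
y = hidden_states[3][0][0][0]  # same scalar target as above

# Gradient of y wrt just the word embeddings, not the summed/normalized embeddings.
grads = torch.autograd.grad(y, captured['words_embeddings'], retain_graph=True)
hook.remove()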