I have trained a classifier and am now trying to load it and run some predictions. I am getting the error shown below.
....
return self._conv_forward(input, self.weight, self.bias)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 439, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
TypeError: conv2d() received an invalid combination of arguments - got (list, Parameter, Parameter, tuple, tuple, tuple, int), but expected one of:
* (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, tuple of ints padding, tuple of ints dilation, int groups)
didn't match because some of the arguments have invalid types: (list, Parameter, Parameter, tuple, tuple, tuple, int)
* (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, str padding, tuple of ints dilation, int groups)
didn't match because some of the arguments have invalid types: (list, Parameter, Parameter, tuple, tuple, tuple, int)
Here is the code
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from PIL import Image
from torchvision import transforms
from torch.utils.data import DataLoader
Transformer - used to encode images
transformer = transforms.Compose([
    transforms.RandomHorizontalFlip(0.5),
    transforms.ToTensor(),
])
Getting a file and converting to Tensor
def get_file_as_tensor(file_path):
    with np.load(file_path) as f:
        melspec_image_array = f['arr_0']
        image = Image.fromarray(melspec_image_array, mode='RGB')
        image_tensor = transformer(image).div_(255).float()
        return image_tensor.clone().detach()
Prediction function that is at the top of the stack trace, because the error occurs when I run model([tensor])
def predict(tensor, model):
    yhat = model([tensor])
    yhat = yhat.clone().detach()
    return yhat
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 3, 1, 1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(out_channels, out_channels, 3, 1, 1),
            nn.ReLU(),
            nn.Dropout(0.5)
        )
        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = F.avg_pool2d(x, 2)
        return x
class Classifier(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.conv = nn.Sequential(
            ConvBlock(in_channels=3, out_channels=64),
            ConvBlock(in_channels=64, out_channels=128),
            ConvBlock(in_channels=128, out_channels=256),
            ConvBlock(in_channels=256, out_channels=512),
        )
        self.fc = nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(512, 128),
            nn.PReLU(),
            # nn.BatchNorm1d(128),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        x = self.conv(x)
        x = torch.mean(x, dim=3)
        x, _ = torch.max(x, dim=2)
        x = self.fc(x)
        return x
PATH = "models/model.pt"
model = Classifier()
model.load_state_dict(torch.load(PATH))
model.eval()
car_file_path = "processed_np/car_file.npz"
car_tensor = get_file_as_tensor(car_file_path)
no_car_file_path = "raw_negative_processed/nocar-1041.npz"
no_car_tensor = get_file_as_tensor(no_car_file_path)
car_prediction = predict(car_tensor, model)
no_car_prediction = predict(no_car_tensor, model)
print("car", car_prediction)
print("no car", no_car_prediction)
The code is self-explanatory, but SO keeps asking for more text.
I would really appreciate some help, as I am new to ML.
def predict(tensor, model):
    yhat = model(tensor.unsqueeze(0))
    yhat = yhat.clone().detach()
    return yhat
You should use this method definition instead of yours.
Why are you applying your model to [tensor], that is, to a Python list containing a single tensor?
You should apply your model to tensor directly: model(tensor).
You might need to add a singleton "batch dimension" to tensor. See this answer for more details.
The error is about the conv2d() function, not the module.
The only thing I can think of here is that your input data is incorrect. Make sure it is a tensor of shape (B, C, H, W).
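As a quick, hedged sanity check (not part of the answers above, just a sketch reusing the names from the question), you can print the tensor's shape and add the batch dimension before calling the model:

tensor = get_file_as_tensor(car_file_path)  # returns a single (C, H, W) tensor
print(tensor.shape)                         # e.g. torch.Size([3, H, W])

if tensor.dim() == 3:                       # no batch dimension yet
    tensor = tensor.unsqueeze(0)            # -> (1, C, H, W)

with torch.no_grad():
    yhat = model(tensor)                    # pass the tensor itself, not [tensor]
print(yhat.shape)                           # (1, num_classes)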
Related
I have this script to classify text using only two classes (1 and 0).
I would like to use BERT together with a CNN, but I am having trouble building the model. Below I describe the steps I take, which hopefully make things clearer. I believe I am making a mistake somewhere, but I cannot find the solution. Can you help me?
Load dataset
data = pd.read_csv('file.csv', delimiter=';', error_bad_lines=False)
sentences = data.text.values
labels = data.frana_si_o_no.values
Preprocess
pre_process_dataset is a function that cleans the data (stopwords, whitespace, etc.).
# clean dataframe's text column
data['text'] = data['text'].apply(pre_process_dataset)
# preview some cleaned tweets
sentences = data.text.values
Tokenization
Here I use BertTokenizer with "bert-base-uncased" to apply the tokenization. I also load AutoModel, which I use in another part of the script.
input_ids = []
attention_masks = []
bert = AutoModel.from_pretrained('bert-base-uncased')
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
for sentence in sentences:
    bert_inp = bert_tokenizer(sentence, max_length=32,
                              padding='max_length', pad_to_max_length=True,
                              truncation=True, return_token_type_ids=False, return_tensors='pt')
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])
input_ids = np.asarray(input_ids)
attention_masks = np.array(attention_masks)
labels = np.array(labels)
Split into train, test, val
Below I also use torch.cat to obtain the tensors.
data = pd.DataFrame(list(zip(input_ids, attention_masks)), columns=['input_ids', 'attention_masks'])
train_text, temp_text, train_labels, temp_labels = train_test_split(data, labels, test_size=0.2)
val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, test_size=0.5)
train_seq = torch.cat(train_text['input_ids'].tolist())
train_mask = torch.cat(train_text['attention_masks'].tolist())
train_y = torch.tensor(train_labels.tolist())
# for validation set
val_seq = torch.cat(val_text['input_ids'].tolist())
val_mask = torch.cat(val_text['attention_masks'].tolist())
val_y = torch.tensor(val_labels.tolist())
# for test set
test_seq = torch.cat(test_text['input_ids'].tolist())
test_mask = torch.cat(test_text['attention_masks'].tolist())
test_y = torch.tensor(test_labels.tolist())
Create Dataloader
batch_size = 32
# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)
# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)
# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)
# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)
# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
BERT_CNN
Below is the model with BERT and the CNN, but I have some problem here: x = torch.cat(all_layers).
class BERT_CNN(nn.Module):
    def __init__(self):
        super(BERT_CNN, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.conv = nn.Conv2d(in_channels=13, out_channels=13, kernel_size=(3, 768), padding=True)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=3, stride=1)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(442, 3)
        self.flat = nn.Flatten()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask, labels=None):
        cls_hs = self.bert(input_ids=sent_id, attention_mask=mask, return_dict=False, output_hidden_states=True)
        x = cls_hs[0]
        x = self.conv(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.flat(x)
        x = self.fc(x)
        return self.softmax(x)
At the end, when I try to run the training, I get this error and I don't understand where my mistake is. Below is the TypeError, which says that there is an invalid combination of arguments for conv2d().
TypeError: conv2d() received an invalid combination of arguments - got (Tensor, Parameter, Parameter, tuple, tuple, tuple, int), but expected one of:
 * (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, tuple of ints padding, tuple of ints dilation, int groups)
      didn't match because some of the arguments have invalid types: (Tensor, !Parameter!, !Parameter!, !tuple!, !tuple!, !tuple!, int)
 * (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, str padding, tuple of ints dilation, int groups)
      didn't match because some of the arguments have invalid types: (Tensor, !Parameter!, !Parameter!, !tuple!, !tuple!, !tuple!, int)
I am trying to call tf.nn.avg_pool2d() inside a function decorated with @tf.function. How do I have to pass the parameters ksize and strides?
Both ksize and strides change during execution, so I store them in a tf.Variable.
import tensorflow as tf
class MyModel(tf.keras.Model):
    def __init__(self):
        super(MyModel, self).__init__()
        self.lod_in = tf.Variable(initial_value=5.0, name='level of detail', trainable=False)
        self.k_size = tf.Variable(initial_value=[1, 1, 1, 1], trainable=False)

    def call(self, inputs, training=None, mask=None):
        x = inputs
        x = self.special_method(x)
        return x

    @tf.function
    def special_method(self, x):
        factor = int(2 ** tf.floor(self.lod_in))
        print(type(factor))
        # Method 1 - TypeError: Expected int for argument 'ksize' not <tf.Tensor 'Cast:0' shape=() dtype=int8>.
        ksize = [1, factor, factor, 1]
        x = tf.nn.avg_pool2d(x, ksize=ksize, strides=ksize, padding='VALID')
        # Method 2 - AttributeError: 'Tensor' object has no attribute 'numpy'
        # self.k_size.assign([1, factor, factor, 1])
        # x = tf.nn.avg_pool2d(x, ksize=self.k_size, strides=self.k_size, padding='VALID')
        return x

    def get_config(self):
        config = super(MyModel, self).get_config()
        return config
model = MyModel()
model.compile()
x = tf.ones(shape=[8, 128, 128, 16])
y = model(x)
Edit
I run my code in graph mode and would like to change self.lod_in during execution after a certain number of steps.
tf.nn.avg_pool2d expects a Python int or a list of ints, not a Tensor. Just use plain Python lists or integers.
From the documentation (emphasis is mine):
Args
input    Tensor of rank N+2, of shape [batch_size] + input_spatial_shape + [num_channels] if data_format does not start with "NC" (default), or [batch_size, num_channels] + input_spatial_shape if data_format starts with "NC". Pooling happens over the spatial dimensions only.
ksize    An int or list of ints that has length 1, N or N+2. The size of the window for each dimension of the input tensor.
strides  An int or list of ints that has length 1, N or N+2. The stride of the sliding window for each dimension of the input tensor.
import tensorflow as tf
import math
class MyModel(tf.keras.Model):
    def __init__(self):
        super(MyModel, self).__init__()
        self.lod_in = 5.0

    def call(self, inputs, training=None, mask=None):
        x = inputs
        x = self.special_method(x)
        return x

    @tf.function
    def special_method(self, x):
        factor = int(2 ** math.floor(self.lod_in))
        ksize = [1, factor, factor, 1]
        x = tf.nn.avg_pool2d(x, ksize=ksize, strides=ksize, padding='VALID')
        return x

    def get_config(self):
        config = super(MyModel, self).get_config()
        return config
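As a quick usage check of this version (a sketch that assumes lod_in stays at 5.0, so the pooling factor is 2**5 = 32), the input from the question pools down to 4x4:

model = MyModel()
x = tf.ones(shape=[8, 128, 128, 16])
y = model(x)
print(y.shape)  # (8, 4, 4, 16): each 128x128 feature map is averaged over 32x32 windows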
A possible solution was suggested here. However, it is only implemented for max_pooling but not for average_pooling.
from tensorflow.python.ops import gen_nn_ops
conv_pooled = gen_nn_ops.max_pool_v2(
    conv,
    ksize=[1, 1, tf.shape(h_conv)[-2], 1],
    strides=[1, 1, 1, 1],
    padding='VALID',
    name="pool")
This is a minimally working/reproducible example:
import torch
import torch.nn as nn
from torchsummary import summary
class Network(nn.Module):
    def __init__(self, channels_img, features_d, num_classes, img_size):
        super(Network, self).__init__()
        self.img_size = img_size
        self.disc = nn.Conv2d(
            in_channels=channels_img + 1,
            out_channels=features_d,
            kernel_size=(4, 4)
        )
        # ConditionalGan:
        self.embed = nn.Embedding(
            num_embeddings=num_classes,
            embedding_dim=img_size * img_size
        )

    def forward(self, x, labels):
        embedding = self.embed(labels).view(labels.shape[0], 1, self.img_size, self.img_size)
        x = torch.cat([x, embedding], dim=1)
        return self.disc(x)
# device:
device = torch.device("cpu")
# hyperparameter:
batch_size = 64
# Initialize model:
model = Network(
channels_img = 1,
features_d = 16,
num_classes = 10,
img_size = 28).to(device)
# Print model summary:
summary(
model,
input_size = [(1, 28, 28), (1, 28, 28)], # MNIST
batch_size = batch_size
)
The error message I get is (for the line with summary(...)):
Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)
I saw in this post that .to(torch.int64) is supposed to help, but I honestly don't know where to write it.
Thank you!
The problem lies here:
self.embed(labels)...
An embedding layer is a kind of mapping between discrete indices and continuous values, as stated here. That is, its inputs should be integers, and it will give you back floats. In your case, for example, you are embedding the class labels of MNIST, which range from 0 to 9, into a continuum (for some reason that I don't know, as I'm not familiar with GANs :)). But in short, that embedding layer gives you a 10 -> 784 transformation, and those 10 numbers should be integers, PyTorch says.
A fancy name for an integer type is "long", so you need to make sure the data type of what goes into self.embed is of that type. There are some ways to do that:
self.embed(labels.long())
or
self.embed(labels.to(torch.long))
or
self.embed(labels.to(torch.int64))
The long datatype is really a 64-bit integer (you may see here), so all of these work.
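For example, applied to the Network from the question, the cast can go directly into forward, right at the embedding lookup (a sketch showing only the modified method, using the .long() variant from above):

    def forward(self, x, labels):
        # cast the labels to int64 before the embedding lookup
        embedding = self.embed(labels.long()).view(labels.shape[0], 1, self.img_size, self.img_size)
        x = torch.cat([x, embedding], dim=1)
        return self.disc(x)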
I'm trying to translate the following Inception code from a tutorial in the Keras functional API (link) to a PyTorch nn.Module:
def conv_module(x, K, kX, kY, stride, chanDim, padding="same"):
    # define a CONV => BN => RELU pattern
    x = Conv2D(K, (kX, kY), strides=stride, padding=padding)(x)
    x = BatchNormalization(axis=chanDim)(x)
    x = Activation("relu")(x)
    # return the block
    return x

def inception_module(x, numK1x1, numK3x3, chanDim):
    # define two CONV modules, then concatenate across the
    # channel dimension
    conv_1x1 = conv_module(x, numK1x1, 1, 1, (1, 1), chanDim)
    conv_3x3 = conv_module(x, numK3x3, 3, 3, (1, 1), chanDim)
    x = concatenate([conv_1x1, conv_3x3], axis=chanDim)
    # return the block
    return x
I'm having trouble translating the Conv2D. If I understand correctly:
There is no in_features in Keras - how should I represent it in PyTorch?
Keras filters is PyTorch out_features
kernel_size, stride and padding are the same (maybe a few options for padding are called differently)
Do I understand this correctly? If so, what should I do with in_features? My code so far:
import torch
import torch.nn as nn
from torch import Tensor

class BasicConv2d(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(in_channels,
                              out_channels,
                              kernel_size=kernel_size,
                              stride=stride)
        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)
        self.relu = nn.ReLU()

    def forward(self, x: Tensor) -> Tensor:
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        return x
class Inception(nn.Module):
    def __init__(
        self,
        in_channels: int,
        num_1x1_filters: int,
        num_3x3_filters: int,
    ) -> None:
        super().__init__()
        # how to fill this further?
        self.conv_1d = BasicConv2d(
            num_1x1_filters,
        )
You're correct for the most part. The in_channels parameter in Conv2d corresponds to the number of output channels from the previous layer. If Conv2d is the first layer, in_channels corresponds to the number of channels in your image: 1 for a grayscale image and 3 for an RGB image.
But I'm not sure how you could concat the two BasicConv2d outputs.
Fixing batch_size at 1, assume that the image size is 256*256 and out_channels for the conv1x1 is 64. This would output a tensor of shape torch.Size([1, 64, 256, 256]). Assuming out_channels of the conv3x3 is 32, that layer would output a tensor of shape torch.Size([1, 32, 254, 254]). We will not be able to concat these two tensors without some trick, such as using padding=1 for the conv3x3 alone, as this would produce an output of shape torch.Size([1, 32, 256, 256]) and therefore allow the concat.
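A quick sketch that reproduces those shapes (using nn.Conv2d directly rather than BasicConv2d, just to illustrate the point):

import torch
import torch.nn as nn

x = torch.randn(1, 3, 256, 256)                      # batch of one 3-channel 256x256 image
conv1x1 = nn.Conv2d(3, 64, kernel_size=1, stride=1)
conv3x3 = nn.Conv2d(3, 32, kernel_size=3, stride=1)
print(conv1x1(x).shape)                              # torch.Size([1, 64, 256, 256])
print(conv3x3(x).shape)                              # torch.Size([1, 32, 254, 254])

# with padding=1 the 3x3 branch keeps its spatial size, so the two outputs can be concatenated
conv3x3_padded = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
out = torch.cat([conv1x1(x), conv3x3_padded(x)], dim=1)
print(out.shape)                                     # torch.Size([1, 96, 256, 256])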
Your implementation of BasicConv2d is fine; here is the code of the Inception module.
class Inception(nn.Module):
    def __init__(
        self,
        in_channels: int,
        num_1x1_filters: int,
        num_3x3_filters: int,
    ) -> None:
        super().__init__()
        self.conv1 = BasicConv2d(in_channels, num_1x1_filters, 1, 1)
        self.conv3 = BasicConv2d(in_channels, num_3x3_filters, 3, 1)

    def forward(self, x):
        conv1_out = self.conv1(x)
        conv3_out = self.conv3(x)
        x = torch.cat([conv1_out, conv3_out], dim=1)
        return x
You need to define two basic conv layers and use them in the forward pass with the same input separately.
As @planet_pluto pointed out, you can't concatenate two feature maps that have different sizes. You can choose a better stride and padding to construct two feature maps of the same size; alternatively, upsample or downsample one of them before you concatenate.
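To make that concrete, here is a minimal sketch of the Inception block with the padding fix applied; it assumes a hypothetical BasicConv2dPadded variant, since the BasicConv2d in the question does not take a padding argument:

class BasicConv2dPadded(nn.Module):
    # hypothetical variant of BasicConv2d with an explicit padding argument
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding=0):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
                              stride=stride, padding=padding)
        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.relu(self.bn(self.conv(x)))


class Inception(nn.Module):
    def __init__(self, in_channels, num_1x1_filters, num_3x3_filters):
        super().__init__()
        self.conv1 = BasicConv2dPadded(in_channels, num_1x1_filters, 1, 1, padding=0)
        self.conv3 = BasicConv2dPadded(in_channels, num_3x3_filters, 3, 1, padding=1)

    def forward(self, x):
        # both branches now keep the input's H and W, so channel-wise concat works
        return torch.cat([self.conv1(x), self.conv3(x)], dim=1)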
I am trying to implement a hierarchical transformer for document classification in Keras/tensorflow, in which:
(1) a word-level transformer produces a representation of each sentence, and attention weights for each word, and,
(2) a sentence-level transformer uses the outputs from (1) to produce a representation of each document, and attention weights for each sentence, and finally,
(3) the document representations produced by (2) are used to classify documents (in the following example, as belonging or not belonging to a given class).
I am attempting to model the classifier on Yang et al.'s approach here (https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf), but replacing the GRU and attention layers with transformers.
I am using Apoorv Nandan's transformer implementation from https://keras.io/examples/nlp/text_classification_with_transformer/.
I have two issues for which I would be grateful for the community's help:
(1) I get an error in the upper (sentence) level model that I can't resolve (details and code below)
(2) I don't know how to extract the word- and sentence-level attention weights, and value advice on how best to do this.
I am new to both Keras and this forum, so apologies for obvious mistakes and thank you in advance for any help.
Here is a reproducible example, indicating where I encounter errors:
First, establish the multi-head attention, transformer, and token/position embedding layers, after Nandan.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate, name=None):
        super(TransformerBlock, self).__init__(name=name)
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim, name=None):
        super(TokenAndPositionEmbedding, self).__init__(name=name)
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
For the purpose of this example, the data are 10,000 documents, each truncated to 15 sentences, each sentence with a maximum of 60 words, which are already converted to integer tokens 1-1000.
X is a 3-D tensor (10000, 15, 60) containing these tokens. y is a 1-D tensor containing the classes of the documents (1 or 0). For the purpose of this example there is no relation between X and y.
The following produces the example data:
max_docs = 10000
max_sentences = 15
max_words = 60
X = tf.random.uniform(shape=(max_docs, max_sentences, max_words), minval=1, maxval=1000, dtype=tf.dtypes.int32, seed=1)
y = tf.random.uniform(shape=(max_docs,), minval=0, maxval=2, dtype=tf.dtypes.int32, seed=1)
Here I attempt to construct the word level encoder, after https://keras.io/examples/nlp/text_classification_with_transformer/:
# Lower level (produce a representation of each sentence):
embed_dim = 100 # Embedding size for each token
num_heads = 2 # Number of attention heads
ff_dim = 64 # Hidden layer size in feed forward network inside transformer
L1_dense_units = 100 # Size of the sentence-level representations output by the word-level model
dropout_rate = 0.1
vocab_size=1000
word_input = layers.Input(shape=(max_words,), name='word_input')
word_embedding = TokenAndPositionEmbedding(maxlen=max_words, vocab_size=vocab_size,
embed_dim=embed_dim, name='word_embedding')(word_input)
word_transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='word_transformer')(word_embedding)
word_pool = layers.GlobalAveragePooling1D(name='word_pooling')(word_transformer)
word_drop = layers.Dropout(dropout_rate,name='word_drop')(word_pool)
word_dense = layers.Dense(L1_dense_units, activation="relu",name='word_dense')(word_drop)
word_encoder = keras.Model(word_input, word_dense)
word_encoder.summary()
It looks as though this word encoder works as intended to produce a representation of each sentence. Here, run on the 1st document, it produces a tensor of shape (15, 100), containing the vectors representing each of 15 sentences:
word_encoder(X[0]).shape
My problem is in connecting this to the higher (sentence) level model, to produce document representations.
I get a "NotImplementedError" when trying to apply the word encoder to each sentence in a document. I would be grateful for any help in fixing this issue, since the error message is not informative as to the specific problem.
After applying the word encoder to each sentence, the goal is to apply another transformer to produce attention weights for each sentence, and a document-level representation with which to perform classification. I can't determine whether this part of the model will work because of the error above.
Finally, I would like to extract word- and sentence-level attention weights for each document, and would be grateful for advice on how to do so.
Thank you in advance for any insight.
# Upper level (produce a representation of each document):
L2_dense_units = 100
sentence_input = layers.Input(shape=(max_sentences, max_words), name='sentence_input')
# This is the line producing "NotImplementedError":
sentence_encoder = tf.keras.layers.TimeDistributed(word_encoder, name='sentence_encoder')(sentence_input)
sentence_transformer = TransformerBlock(embed_dim=L1_dense_units, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='sentence_transformer')(sentence_encoder)
sentence_dense = layers.TimeDistributed(layers.Dense(int(L2_dense_units)), name='sentence_dense')(sentence_transformer)
sentence_out = layers.Dropout(dropout_rate)(sentence_dense)
preds = layers.Dense(1, activation='sigmoid', name='sentence_output')(sentence_out)
model = keras.Model(sentence_input, preds)
model.summary()
I got a NotImplementedError as well while trying to do the same thing as you. The thing is that Keras's TimeDistributed layer needs to know its inner custom layer's output shapes, so you should add a compute_output_shape method to your custom layers.
In your case, the MultiHeadSelfAttention, TransformerBlock and TokenAndPositionEmbedding layers should include:
class MultiHeadSelfAttention(layers.Layer):
    ...
    def compute_output_shape(self, input_shape):
        # it does not change the shape of its input
        return input_shape


class TransformerBlock(layers.Layer):
    ...
    def compute_output_shape(self, input_shape):
        # it does not change the shape of its input
        return input_shape


class TokenAndPositionEmbedding(layers.Layer):
    ...
    def compute_output_shape(self, input_shape):
        # it changes the shape from (batch_size, maxlen) to (batch_size, maxlen, embed_dim)
        return input_shape + (self.pos_emb.output_dim,)
After you add these methods you should be able to run your code.
As for your second question, I am not sure, but maybe you can return the "weights" variable that is returned from MultiHeadSelfAttention's attention method in the call methods of both MultiHeadSelfAttention and TransformerBlock, so that you can access it where you build your model.
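Sketching that idea in the same abbreviated style as the snippet above (only the call methods are shown; anything that consumes these layers would then have to unpack the extra output):

class MultiHeadSelfAttention(layers.Layer):
    ...
    def call(self, inputs):
        ...  # projections and head reshaping exactly as before
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        output = self.combine_heads(concat_attention)
        return output, weights  # expose the per-head attention weights


class TransformerBlock(layers.Layer):
    ...
    def call(self, inputs, training):
        attn_output, weights = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output), weights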