How to use TensorFlow RelativePositionEmbedding layers with batches? - python

I'm trying to incorporate a RelativePositionEmbedding layer into a transformer example. The embedding layer can be found in the build_model method below:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from official.nlp.modeling.layers import position_embedding
def readucr(filename):
    """Load a tab-separated file and split it into features and labels.

    The first column of the file is taken as the label and every remaining
    column as a feature value.

    Args:
        filename: Anything ``np.loadtxt`` accepts (path, URL-like string or
            file object).

    Returns:
        A ``(x, y)`` tuple: ``x`` holds all columns but the first, ``y`` holds
        the first column cast to ``int``.
    """
    table = np.loadtxt(filename, delimiter="\t")
    labels = table[:, 0].astype(int)
    features = table[:, 1:]
    return features, labels
# Base URL under which the FordA train/test TSV files are stored.
root_url = "https://raw.githubusercontent.com/hfawaz/cd-diagram/master/FordA/"
x_train, y_train = readucr(root_url + "FordA_TRAIN.tsv")
x_test, y_test = readucr(root_url + "FordA_TEST.tsv")
# Append a trailing axis of size 1 so each array becomes (samples, timesteps, 1).
x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1))
# Number of distinct label values present in the training labels.
n_classes = len(np.unique(y_train))
# Shuffle the training samples and their labels with one shared permutation.
idx = np.random.permutation(len(x_train))
x_train = x_train[idx]
y_train = y_train[idx]
# Replace the label value -1 with 0 in both label arrays.
y_train[y_train == -1] = 0
y_test[y_test == -1] = 0
# Build model
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one transformer encoder block to ``inputs``.

    The block runs multi-head self-attention (``inputs`` is used as both
    query and value), dropout and layer normalization, then adds ``inputs``
    back as a residual. A feed-forward part made of two kernel-size-1
    ``Conv1D`` layers follows, again normalized and added to the residual.

    Args:
        inputs: Tensor to transform; its last dimension sets the filter count
            of the second convolution so the residual sum is well-formed.
        head_size: Passed to ``MultiHeadAttention`` as ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Filter count of the first feed-forward convolution.
        dropout: Rate used inside the attention layer and by both ``Dropout``
            layers.

    Returns:
        The normalized feed-forward output plus the attention residual.
    """
    # Self-attention sub-block: attention -> dropout -> layer norm -> residual.
    attention = layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )
    attended = attention(inputs, inputs)
    attended = layers.Dropout(dropout)(attended)
    attended = layers.LayerNormalization(epsilon=1e-6)(attended)
    residual = attended + inputs

    # Feed-forward sub-block: expand to ff_dim, project back, norm, residual.
    hidden = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(residual)
    hidden = layers.Dropout(dropout)(hidden)
    projected = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(hidden)
    projected = layers.LayerNormalization(epsilon=1e-6)(projected)
    return projected + residual
def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0,
):
    """Assemble the transformer classifier as a Keras functional model.

    The input goes through a ``RelativePositionEmbedding`` layer, a stack of
    ``transformer_encoder`` blocks, global average pooling and an MLP head
    that ends in a softmax over the module-level ``n_classes``.

    Args:
        input_shape: Per-sample input shape (without the batch axis).
        head_size: ``head_size`` forwarded to every encoder block.
        num_heads: ``num_heads`` forwarded to every encoder block.
        ff_dim: ``ff_dim`` forwarded to every encoder block.
        num_transformer_blocks: Number of encoder blocks to stack.
        mlp_units: Iterable of ``Dense`` layer widths for the MLP head.
        dropout: Dropout rate forwarded to every encoder block.
        mlp_dropout: Dropout rate applied after each MLP ``Dense`` layer.

    Returns:
        The assembled ``keras.Model``.
    """
    inputs = keras.Input(shape=input_shape)  # => shape is (None, 500, 1)
    embedding = position_embedding.RelativePositionEmbedding(hidden_size=500)
    features = embedding(inputs)  # Now (500, 500)
    # Add batch dimension back. But how to accept batch size greater than 1?
    add_batch_axis = layers.Lambda(lambda t: tf.expand_dims(t, axis=0))
    features = add_batch_axis(features)  # Now (1, 500, 500)

    for _block in range(num_transformer_blocks):
        features = transformer_encoder(features, head_size, num_heads, ff_dim, dropout)

    features = layers.GlobalAveragePooling1D(data_format="channels_first")(features)
    for units in mlp_units:
        features = layers.Dense(units, activation="relu")(features)
        features = layers.Dropout(mlp_dropout)(features)

    outputs = layers.Dense(n_classes, activation="softmax")(features)
    return keras.Model(inputs, outputs)
# Per-sample input shape: x_train's shape without its leading (sample) axis.
input_shape = x_train.shape[1:]
# Build the transformer classifier with the hyperparameters below.
model = build_model(
input_shape,
head_size=256,
num_heads=4,
ff_dim=4,
num_transformer_blocks=4,
mlp_units=[128],
mlp_dropout=0.4,
dropout=0.25
)
# Integer labels are used directly, hence the "sparse" loss and metric.
model.compile(
loss="sparse_categorical_crossentropy",
optimizer=keras.optimizers.Adam(learning_rate=1e-4),
metrics=["sparse_categorical_accuracy"]
)
# Early stopping (restoring the best weights) plus TensorBoard logging to ./logs.
callbacks = [
keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
keras.callbacks.TensorBoard(log_dir="./logs")
]
# Train with 20% of the training data held out for validation.
model.fit(
x_train,
y_train,
validation_split=0.2,
epochs=5,
batch_size=64,
callbacks=callbacks
)
# Report loss and metric on the test split.
model.evaluate(x_test, y_test, verbose=1)
The code above blows up because I've specified a batch_size of 64. However, everything works fine when setting batch_size to 1, because the expand_dims operation only adds a batch dimension of size 1, as opposed to an Input layer, which adds None to allow arbitrary batch sizes.
So how can I add "back in" a batch dimension greater than 1? Is there another way I should be using the RelativePositionEmbedding layer so that it does not interfere with batch sizes?
I've tried looking into the Reshape method as well without success.
I thought this question would solve my issue, but this only adds a leading 1 dimension like the Lambda layer I incorporated, rather than None, which I think would resolve the issue.

I do not think you can pass the output of the RelativePositionEmbedding directly to another layer. If you take a look here, the authors are adding the output of this layer to the original input. Your code will work if you change your model like this:
# ....
# Your code
def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0,
):
    """Assemble the transformer classifier with an additive position encoding.

    The output of ``RelativePositionEmbedding`` is added to the model input
    instead of replacing it; the sum is then passed through the encoder
    stack, global average pooling and an MLP head that ends in a softmax over
    the module-level ``n_classes``.

    Args:
        input_shape: Per-sample input shape (without the batch axis).
        head_size: ``head_size`` forwarded to every encoder block.
        num_heads: ``num_heads`` forwarded to every encoder block.
        ff_dim: ``ff_dim`` forwarded to every encoder block.
        num_transformer_blocks: Number of encoder blocks to stack.
        mlp_units: Iterable of ``Dense`` layer widths for the MLP head.
        dropout: Dropout rate forwarded to every encoder block.
        mlp_dropout: Dropout rate applied after each MLP ``Dense`` layer.

    Returns:
        The assembled ``keras.Model``.
    """
    inputs = keras.Input(shape=input_shape)  # => shape is (None, 500, 1)
    embedding = position_embedding.RelativePositionEmbedding(hidden_size=500)
    pos_encoding = embedding(inputs)  # Now (500, 500)
    # Adding to `inputs` broadcasts the encoding over the batch axis, so no
    # explicit expand_dims is needed.
    encoded = inputs + pos_encoding

    for _block in range(num_transformer_blocks):
        encoded = transformer_encoder(encoded, head_size, num_heads, ff_dim, dropout)

    pooled = layers.GlobalAveragePooling1D(data_format="channels_first")(encoded)
    for units in mlp_units:
        pooled = layers.Dense(units, activation="relu")(pooled)
        pooled = layers.Dropout(mlp_dropout)(pooled)

    outputs = layers.Dense(n_classes, activation="softmax")(pooled)
    return keras.Model(inputs, outputs)
# ....
# Your code
45/45 [==============================] - 54s 1s/step - loss: 1.0281 - sparse_categorical_accuracy: 0.5111 - val_loss: 0.7387 - val_sparse_categorical_accuracy: 0.5645
42/42 [==============================] - 8s 187ms/step - loss: 0.7440 - sparse_categorical_accuracy: 0.5424
[0.7440475225448608, 0.5424242615699768]

Related

How to use view in 4-dimensional neural network

I'm new to neural networks, so I hope you will forgive me if this is really basic stuff.
So far I have managed to use view once, in my first very simple project, and now I am trying to understand something more complex.
I found this code on github and I'm trying to understand it.
This is how network is built.
class Net(nn.Module):
    """CNN with two convolutions and two linear layers.

    ``forward`` returns log-probabilities over 10 classes. ``fc1`` expects
    9216 (= 64 * 12 * 12) flattened features per sample.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Map a batch of images to per-class log-probabilities."""
        features = F.relu(self.conv1(x))
        features = F.relu(self.conv2(features))
        features = self.dropout1(F.max_pool2d(features, 2))
        # Collapse everything but the batch axis, then pin the width fc1 expects.
        flat = torch.flatten(features, 1).view(-1, 64 * 12 * 12)
        hidden = self.dropout2(F.relu(self.fc1(flat)))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)
I'm trying to get the images of the digits the network failed to recognize, but I don't really know how to use view here.
# Evaluate `model` on `test_loader`: sum the NLL loss over all batches, count
# correct argmax predictions, then print the average loss and the accuracy.
def test(model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
mistakes = 0
# Take the first batch and reshape its first image to shape (1, 784).
images, labels = next(iter(test_loader))
img = images[0].view(1, 784)
with torch.no_grad():
# NOTE(review): this passes the (1, 784) tensor to the model; the first
# RuntimeError quoted after this snippet reports a 4-dimensional input was expected.
logps = model(img)
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
# reduction='sum' accumulates the per-sample losses of the batch.
test_loss += F.nll_loss(output, target, reduction='sum').item()
# Index of the largest log-probability per sample.
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
# Turn the summed loss into a per-sample average.
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
I couldn't find useful examples, so I would be grateful for any kind of hint.
Every time I modify it I get one of these two errors.
builtins.RuntimeError: Expected 4-dimensional input for 4-dimensional weight [32, 1, 3, 3], but got 2-dimensional input of size [1, 784] instead
builtins.RuntimeError: shape '[1, 784, 3, 3]' is invalid for input of size 784
Thanks in advance for any kind of help.
Go with this:
Your DataLoader already yields the data in 4D, so there is no need to reshape it with .view().
Your CNN expects its input in 4D.
def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print loss and accuracy.

    Batches are moved to ``device`` and fed to the model unchanged. The
    earlier ``images[0].view(1, 784)`` step was dropped: it reshaped the
    image to a 2-D ``(1, 784)`` tensor, which the convolutional model
    rejects.

    Args:
        model: Module returning per-class log-probabilities (its output is
            passed to ``F.nll_loss``).
        device: Device the batches are moved to before the forward pass.
        test_loader: Iterable of ``(data, target)`` batches exposing a
            ``dataset`` attribute with a length.

    Returns:
        None. The average loss and the accuracy are printed.
    """
    model.eval()
    test_loss = 0
    correct = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Sum (not average) per batch so the final division is per sample.
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    n_samples = len(test_loader.dataset)
    test_loss /= n_samples
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, n_samples,
        100. * correct / n_samples))

Empty trainable variable in keras model(keras version = 2.2.4-tf)

I am a beginner in Keras programming. I want to update the model weights manually in Keras so as to get a deep understanding of gradient descent. However, when I tried it, the model either fails to converge or the loss explodes. My steps are listed as follows:
First, I use keras sequential model to fit a quadratic function y = 2*x*x - 7*x + 11
below is the code using the sequential model:
# Baseline: a 1 -> 64 -> 32 -> 1 dense network with ReLU on the two hidden
# layers, compiled with mean-squared-error loss and the Adam optimizer.
model = Sequential()
model.add(Dense(64, input_dim = 1, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.summary()
training loss
fitted curved and original one
Then, I use the following code to update the weight manually:
class MyModel(keras.Model):
    """Three-layer dense network (64 -> 32 -> 1) with a manual forward pass.

    The forward computation lives in ``forward`` and applies ReLU to the
    outputs of the first two layers.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = Dense(64, input_shape=(1, ))
        self.layer2 = Dense(32)
        self.layer3 = Dense(1)

    def forward(self, x):
        """Return the network output for ``x``."""
        hidden = keras.activations.relu(self.layer1(x))
        hidden = keras.activations.relu(self.layer2(hidden))
        return self.layer3(hidden)
def loss_fun(y_pred, y):
    """Return the mean of Keras' mean-squared-error between ``y`` and ``y_pred``."""
    squared_error = keras.losses.mean_squared_error(y, y_pred)
    return keras_backend.mean(squared_error)
def compute_loss(model, x, y, loss_fun=loss_fun):
    """Run ``model.forward`` on ``x`` and score the result against ``y``.

    Returns:
        A ``(loss, predictions)`` tuple, where ``loss`` is
        ``loss_fun(y, predictions)``.
    """
    predictions = model.forward(x)
    return loss_fun(y, predictions), predictions
def compute_gradients(model, x, y, loss_fun=loss_fun):
    """Compute the loss gradients with respect to the model's trainable variables.

    Returns:
        A ``(gradients, loss)`` tuple.
    """
    # Record the forward pass so the tape can differentiate the loss.
    with tf.GradientTape() as tape:
        loss = compute_loss(model, x, y, loss_fun)[0]
    grads = tape.gradient(loss, model.trainable_variables)
    return grads, loss
def apply_gradients(optimizer, gradients, variables):
    """Hand ``(gradient, variable)`` pairs to ``optimizer.apply_gradients``."""
    grads_and_vars = zip(gradients, variables)
    optimizer.apply_gradients(grads_and_vars)
def train_batch(x, y, model, optimizer):
    """Perform one optimization step on a single batch.

    Computes the gradients for ``(x, y)``, applies them to the model's
    trainable variables through ``optimizer`` and returns the batch loss.
    """
    grads, batch_loss = compute_gradients(model, x, y)
    apply_gradients(optimizer, grads, model.trainable_variables)
    return batch_loss
# Train MyModel with the manual train_batch step for a fixed number of epochs.
model2 = MyModel()
epochs = 200
optimizer = keras.optimizers.Adam(learning_rate = 0.01) # reportedly 0.01 is Keras's default learning rate (NOTE(review): unverified - confirm against the Adam documentation)
# Collected loss value of every epoch.
loss = []
# Prepend an axis of size 1 to the inputs and the targets.
x_train = np.expand_dims(x_train, axis = 0)
y_train = np.expand_dims(y_train, axis = 0)
for i in range(epochs):
# One optimization step on the whole training set per epoch.
l = train_batch(x_train, y_train, model2, optimizer)
loss.append(l)
# Print progress every 10th epoch.
if i % 10 == 0:
print(f'current loss = {l}')
while the loss looks like this:
I also try another way to manually update the weights:
# Second attempt: compute gradients and overwrite the weights by hand.
epochs = 200
# NOTE(review): `lr` and `optimizer` are assigned here but not used below;
# the update step uses the literal 0.001.
lr = 0.01
optimizer = keras.optimizers.Adam(learning_rate = 0.01)
loss = []
# Prepend an axis of size 1, then cast inputs and targets to float32.
x_train = np.expand_dims(x_train, axis = 0)
y_train = np.expand_dims(y_train, axis = 0)
x_train = tf.cast(x_train, tf.float32)
y_train = tf.cast(y_train, tf.float32)
# `model5` and `k` are not defined in this snippet - presumably a MyModel
# instance and the Keras backend module; confirm.
for i in range(epochs):
y_pred = model5.forward(x_train)
l = k.mean(keras.losses.mean_squared_error(y_train, y_pred))
gradient = k.gradients(l, model5.trainable_weights)
# NOTE(review): `gradients` is not assigned anywhere in this snippet; the
# line above binds `gradient` (singular).
new_weights = model5.get_weights() - 0.001 * np.array(gradients)
model5.set_weights(new_weights)
# Record and print the loss every 10th iteration only.
if i % 10 == 0:
loss.append(l)
print(f'{i}th loss is: {l}')
In this case, the loss explodes like this:
where is the problem?
I have figured out where the problem is.
When getting the model through the following code:
model = MyModel()
The trainable variables in model are null.
When I try to print them using this:
print(model.trainable_variables)
it outputs
[]
I try to make the weight trainable manually by the following code:
for layers in model.layers:
layers.trainable = True
But it still doesn't work at all.

RuntimeError: The size of tensor a (133) must match the size of tensor b (10) at non-singleton dimension 1

I am training a CNN model. I am facing issue while doing the training iteration for my model. The code is as below:
class Net(nn.Module):
    """CNN with five 3x3 convolutions and a three-layer classifier head.

    Each convolution is followed by ReLU and 2x2 max pooling. The head expects
    512 * 5 * 5 flattened features per sample and produces 133 outputs.
    """

    def __init__(self):
        super().__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(3, 32, 3)
        self.conv2 = nn.Conv2d(32, 64, 3)
        self.conv3 = nn.Conv2d(64, 128, 3)
        self.conv4 = nn.Conv2d(128, 256, 3)
        self.conv5 = nn.Conv2d(256, 512, 3)
        # Pooling layer shared by all convolution stages
        self.pool = nn.MaxPool2d(2, 2)
        # Fully connected layers
        self.fc1 = nn.Linear(512 * 5 * 5, 2048)
        self.fc2 = nn.Linear(2048, 1024)
        self.fc3 = nn.Linear(1024, 133)
        # Dropout used in the classifier head
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return the 133 raw output scores for each sample in ``x``."""
        # Five identical stages: convolution -> ReLU -> max pooling.
        stages = (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5)
        for conv in stages:
            x = self.pool(F.relu(conv(x)))
        # Flatten to the feature width fc1 expects.
        flat = x.view(-1, 512 * 5 * 5)
        # Classifier head with dropout before the first two linear layers.
        hidden = F.relu(self.fc1(self.dropout(flat)))
        hidden = F.relu(self.fc2(self.dropout(hidden)))
        return self.fc3(hidden)
# Loss function and optimizer: MSE loss with Adam over the parameters of `net`.
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.0001)
# Alternative configuration (cross-entropy loss with SGD), currently disabled:
#criterion = nn.CrossEntropyLoss()
#optimizer = optim.SGD(net.parameters(), lr = 0.05)
def train(n_epochs, model, loader, optimizer, criterion, save_path):
    """Train ``model`` for ``n_epochs`` passes over ``loader['train']``.

    The function works on the objects passed in as arguments rather than on
    module-level globals, so it trains whichever model and loaders the caller
    supplies.

    Args:
        n_epochs: Number of passes over the training loader.
        model: Module to train; it is put in training mode every epoch.
        loader: Mapping whose ``'train'`` entry yields ``(data, target)``
            batches.
        optimizer: Optimizer whose gradients are zeroed and stepped per batch.
        criterion: Callable ``criterion(outputs, target)`` returning a loss
            that supports ``backward()``.
        save_path: Accepted for interface compatibility; not used by the
            code shown here.

    Returns:
        None. ``model`` is updated in place.
    """
    for _epoch in range(n_epochs):
        # Training phase
        model.train()
        for data, target in loader['train']:
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
When I use the CrossEntropy loss function and the SGD optimizer, I am able to train the model with no error.
When I use MSE loss function and Adam optimizer, I am facing the following error:
RuntimeError Traceback (most recent call last) <ipython-input-20-2223dd9058dd> in <module>
1 #train the model
2 n_epochs = 2
----> 3 train(n_epochs,net,loaders,optimizer,criterion,'saved_model/dog_model.pt')
<ipython-input-19-a93d145ef9f7> in train(n_epochs, model, loader, optimizer, criterion, save_path)
22
23 #calculate loss
---> 24 loss = criterion(outputs,target)
25
26 #backward prop
RuntimeError: The size of tensor a (133) must match the size of tensor b (10) at non-singleton dimension 1.
Do the selected loss function and optimizer affect the training of the model? Can anyone please help with this?
Well, the error is because the nn.MSELoss() and nn.CrossEntropyLoss() expect different input/target combinations. You cannot simply change the criterion function without changing the inputs and targets appropriately. From the docs:
nn.CrossEntropyLoss:
Input:
(N, C) where C = number of classes, or
(N, C, d_1, d_2, ..., d_K) with K >= 1 in the case of K-dimensional loss.
Target:
(N) where each value is in range [0, C-1] or
(N, d_1, d_2, ..., d_K) with K >= 1 in the case of K-dimensional loss.
nn.MSELoss:
Input:
(N,∗) where ∗ means, any number of additional dimensions.
Target:
(N,∗), same shape as the input
As you can see, in MSELoss the target is expected to have the same shape as the input, while in CrossEntropyLoss the C dimension is dropped from the target. You cannot use MSELoss as a drop-in replacement for CrossEntropyLoss.
The error message clearly suggests that the error occurred at the line
loss = criterion(outputs,target)
where you are trying to compute the mean-squared error between the input and the target.
See this line: criterion = nn.MSELoss().
I think you should modify your code where you are estimating loss between (output, target) pair of inputs,i.e., loss = criterion(outputs,target) to something like below:
loss = criterion(outputs,target.view(1, -1))
Here, you are making target shape same as outputs from model on line
outputs = net(data)
One more thing to notice here is the output of the net model, i.e., outputs will be of shape batch_size X output_channels, where batch_size is the first dimension of the input images: during training you get batches of images, so the shape in the forward method has an additional batch dimension at dim0: [batch_size, channels, height, width], and output_channels is the number of output features/channels from the last linear layer in the net model.
And the target labels will be of shape batch_size, which is 10 in your case; check the batch_size you passed to torch.utils.data.DataLoader(). Therefore, on reshaping it using view(1, -1), it will be converted into shape 1 X batch_size, i.e., 1 X 10.
That's why, you are getting the error:
RuntimeError: input and target shapes do not match: input [10 x 133],
target [1 x 10]
So, a way around is to replace loss = criterion(outputs,target.view(1, -1)) with loss = criterion(outputs,target.view(-1, 1)) and change the output_channels of last linear layer to 1 instead of 133. In this way, both of outputs and target shape will be equal and we can compute MSE value then.
Learn more about pytorch MSE loss function from here.

Pytorch loss function error in the last batch

Assume that I have 77 samples to train my CNN, and my batch size is 10. Then the last batch has a batch size of 7 instead of 10. Somehow when I pass it to the loss function such as nn.MSELoss(), it gives me the error:
RuntimeError: The size of tensor a (10) must match the size of tensor
b (7) at non-singleton dimension 1
So pytorch doesn't support batches with different sizes?
My code in doubt:
import numpy as np
import torch
from torch import nn
import torchvision
import torch.nn.functional as F
import torch.optim as optim
class Net(nn.Module):
    """Small CNN: two conv/pool stages followed by three linear layers.

    ``fc1`` expects 64 flattened features per sample and the network emits
    10 values per sample.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, (5, 4))
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(64, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the 10 raw output values for each sample in ``x``."""
        features = x
        for conv in (self.conv1, self.conv2):
            features = self.pool(F.relu(conv(features)))
        # Flatten channels x height x width into one feature axis per sample.
        dims = features.shape
        flat = features.view(-1, dims[1] * dims[2] * dims[3])
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
model = Net()
batch_size = 10
# Generating Artifical data
# 77 random single-channel 20x20 inputs with one float label in [0, 10) each.
x_train = torch.randn((77,1,20,20))
# Labels have shape (77,), i.e. one scalar per sample and no second axis.
y_train = torch.randint(0,10,size=(77,),dtype=torch.float)
trainset = torch.utils.data.TensorDataset(x_train,y_train)
# 77 samples with batch_size 10: the last batch holds the remaining 7 samples.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0)
# testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=0)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
for epoch in range(20): # loop over the dataset multiple times
running_loss = 0.0
for i, data in enumerate(trainloader, 0):
# get the inputs
inputs, labels = data
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = model(inputs)
# The loss is computed between the model outputs and the labels as loaded.
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i%10==0:
print('epoch{}, step{}, loss: {}'.format(epoch + 1, i + 1, running_loss))
# print("frac post = {}".format(frac_post))
running_loss = 0.0
The problem is not due to the batch size, but to a failure to broadcast properly between the 10 outputs of your CNN and the single label provided in each example.
If you look at the model output and label tensor shapes during the batch where the error is thrown,
print(outputs.shape, labels.shape)
#out: torch.Size([7, 10]) torch.Size([7])
you'll see that the labels are stored in a 1-D tensor of shape [7], with no second dimension. According to the PyTorch broadcasting rules, two tensors are broadcastable only if, when their shapes are aligned from the trailing dimension, each pair of sizes is equal or one of them is 1. In this case, the trailing dimension of the model output (10) is incompatible with that of the label (7).
To fix, either add a dummy dimension to the label (assuming you actually want to broadcast the labels to match your ten network outputs), or define a network with scalar outputs. For example:
y_train = torch.randint(0,10,size=(77,1),dtype=torch.float)
results in
print(outputs.shape, labels.shape)
#out: torch.Size([7, 10]) torch.Size([7,1])
# these are broadcastable

Keras: Many batch sizes fail

I am working on generalizing the inputs to the sample variational autoencoder in the Keras repository, but seem to have made some elementary mistakes. In particular, only certain batch sizes work for the model below:
from keras.layers import Lambda, Input, Dense, Reshape
from keras.models import Model
from keras.losses import mse
from keras import backend as K
import numpy as np
# reparameterization trick
# instead of sampling from Q(z|X), sample epsilon = N(0,I)
# z = z_mean + sqrt(var) * epsilon
def sampling(args):
    """Sample ``z = z_mean + exp(0.5 * z_log_var) * epsilon``.

    Args:
        args: Pair ``(z_mean, z_log_var)`` of tensors.

    Returns:
        The sampled tensor, where ``epsilon`` is drawn from a standard normal
        with one row per row of ``z_mean``.
    """
    z_mean, z_log_var = args
    # Row count is read at run time; the column count comes from the static shape.
    n_rows = K.shape(z_mean)[0]
    n_cols = K.int_shape(z_mean)[1]
    # random_normal defaults to mean 0 and standard deviation 1.
    noise = K.random_normal(shape=(n_rows, n_cols))
    std_dev = K.exp(0.5 * z_log_var)
    return z_mean + std_dev * noise
# network parameters
original_dim = 45
input_shape = (original_dim, )
intermediate_dim = 512
latent_dim = 2
# VAE model = encoder + decoder
# build encoder model
inputs = Input(shape=input_shape, name='encoder_input')
# Reshape to (original_dim,), the same per-sample shape as input_shape.
x = Reshape((original_dim,))(inputs)
x = Dense(intermediate_dim, activation='relu')(x)
# Two heads of width latent_dim, consumed by `sampling` as (z_mean, z_log_var).
z_mean = Dense(latent_dim, name='z_mean')(x)
z_log_var = Dense(latent_dim, name='z_log_var')(x)
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
# build decoder model
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
x = Dense(intermediate_dim, activation='relu')(latent_inputs)
x = Dense(original_dim, activation='sigmoid')(x)
outputs = Reshape(input_shape)(x)
decoder = Model(latent_inputs, outputs, name='decoder')
# instantiate VAE model
# Index 2 selects the sampled z from the encoder's three outputs.
outputs = decoder(encoder(inputs)[2])
vae = Model(inputs, outputs, name='vae_mlp')
# The loss is attached with add_loss, so compile() receives no loss argument.
vae.add_loss(mse(inputs, outputs))
vae.compile(optimizer='adam')
# 1000 random samples with 45 features each.
x_train = np.random.rand(1000, 45)
vae.fit(x_train, epochs=100, batch_size=10) # works, while 23 fails
Can anyone help me understand why some batch sizes fail (e.g. 23)? I'd be grateful for any insights others can offer on this question.
You currently have unequal batch sizes whenever the number of samples is not divisible by batch_size (i.e. len(data) % batch_size != 0). You can solve your problem by changing your code to:
x_train = np.random.rand(1000, 45)
batch_size = 23
# Use the number of samples (first axis). x_train.size would count every
# element (1000 * 45) and make steps_per_epoch 45 times too large.
vae.fit(x_train, epochs=100, steps_per_epoch=x_train.shape[0] // batch_size)
This results in all batches having the same size, here is the documentation of fit with its attributes.

Categories