I built a self-defined encoder-decoder tf.keras.Model and saved checkpoints during training. After closing my Jupyter notebook and opening it again to restore my encoder-decoder parameters, I was surprised to find that it did not work. I am not sure whether I am misunderstanding the usage or one of my steps is wrong. Here is my code.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

tf.config.experimental_run_functions_eagerly(True)
EPOCHS = 10
TOLERANCE = 0.08
start = time.time()
for epoch in range(1, EPOCHS + 1):
    epoch_start = time.time()
    # train the encoder-decoder model
    batch = 0
    total_loss = 0
    total_accuracy = 0
    for inp, targ in dataset.take(STEP_PER_EPOCH):
        batch += 1
        batch_loss, batch_accuracy = train_step(inp, targ, phoneme_tokenizer)
        total_loss += batch_loss
        total_accuracy += batch_accuracy
        print("Epoch: {}/{} Batch: {} Loss: {:.4f} Accuracy: {:.4f} Time: {:.0f}s".
              format(epoch, EPOCHS, batch, batch_loss.numpy(), batch_accuracy.numpy(), time.time() - epoch_start),
              end="\r")
        if batch % 1000 == 0:
            print()
    print()

    # saving (checkpoint) the model when total loss is less than 0.9
    checkpoint.save(file_prefix=checkpoint_prefix)

    # validation process
    total_val_loss = 0
    total_val_acc = 0
    for val_inp, val_targ in dataset_val.take(VAL_WAV_SIZE):
        val_loss, val_acc = validate_step(val_inp, val_targ, phoneme_tokenizer)
        total_val_loss += val_loss
        total_val_acc += val_acc

    # print out the epoch results
    mean_total_acc = total_accuracy / STEP_PER_EPOCH
    mean_total_loss = total_loss / STEP_PER_EPOCH
    mean_val_acc = total_val_acc / VAL_WAV_SIZE
    mean_val_loss = total_val_loss / VAL_WAV_SIZE
    print("\n================================")
    print("Epoch {}/{}".format(epoch, EPOCHS))
    print('Accuracy: {:.4f} Loss: {:.4f} val_acc: {:.4f} val_loss: {:.4f}'.format(
        mean_total_acc,
        mean_total_loss,
        mean_val_acc,
        mean_val_loss))
    print('Time taken for epoch {}: {:.2f} min'.format(epoch, (time.time() - epoch_start) / 60))
    print('Total Time taken: {:.2f} min'.format((time.time() - start) / 60))
    print("================================\n")

    if mean_total_loss < TOLERANCE and mean_val_acc > 0.5:
        break
After running the code above, it showed no errors and I got my checkpoint files in my directory. I closed my Jupyter notebook, rebuilt all the objects (my encoder and decoder) without training, and ran
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
hoping that the parameters would come back so I could start predicting, but the results were quite poor, completely at odds with the predictions I got right after training. Should I add some more lines to restore all the parameters, or is something else wrong?
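For reference, this is how I understand the restore path from the tf.train.Checkpoint docs (a sketch; the constructor arguments stand in for the same values used during training, and the status assertion is only there to surface silent mismatches):

# Rebuild the objects exactly as they were built for training
encoder = Encoder(lstm_units, batch_sz, dropout_rate, units, squeeze_time)
decoder = Decoder(target_sz, embedding_dim, lstm_units, batch_sz, dropout_rate)
optimizer = tf.keras.optimizers.Adam()  # assumption: same optimizer class as in training

checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)
status = checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

# Variables of subclassed models are created lazily on the first call,
# so run one forward pass on a dummy batch before asserting
dummy_batch = tf.zeros((batch_sz, 99, 13))  # my input shape
encoder(dummy_batch)

# Raises if the rebuilt objects do not line up with the checkpoint
status.assert_existing_objects_matched()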
Below are more details about my encoder-decoder structure; my input shape is (batch_size, 99, 13).
class ResnetIdentityBlock(tf.keras.layers.Layer):
    def __init__(self, kernel_size, filters):
        super(ResnetIdentityBlock, self).__init__()
        self.filters1, self.filters2, self.filters3 = filters
        self.conv1 = tf.keras.layers.Conv1D(self.filters1, 1, padding='valid')
        self.bn1 = tf.keras.layers.BatchNormalization()
        self.conv2 = tf.keras.layers.Conv1D(self.filters2, kernel_size, padding='same')
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.conv3 = tf.keras.layers.Conv1D(self.filters3, 1, padding='valid')
        self.bn3 = tf.keras.layers.BatchNormalization()

    def call(self, input_tensor, training=False):
        x = self.conv1(input_tensor)
        x = self.bn1(x, training=training)
        x = tf.nn.relu(x)
        x = self.conv2(x)
        x = self.bn2(x, training=training)
        x = tf.nn.relu(x)
        x = self.conv3(x)
        x = self.bn3(x, training=training)
        x += input_tensor
        return tf.nn.relu(x)
class Encoder(tf.keras.Model):
    '''
    Encoder for MFCC-transformed wave data
    '''
    def __init__(self,
                 lstm_units,
                 batch_sz,
                 dropout_rate,
                 units,
                 squeeze_time):
        '''
        Args:
            lstm_units: number of LSTM units
            batch_sz: batch size
            dropout_rate: layer dropout ratio
            units: width of the feature-extraction Dense layer
            squeeze_time: time steps to squeeze per pyramidal reshape
        '''
        super(Encoder, self).__init__()
        self.lstm_units = lstm_units
        self.squeeze_time = squeeze_time
        # feature extraction (Dense)
        self.feat_extract = tf.keras.layers.Dense(units=units, activation="relu")
        self.feat_dropout = tf.keras.layers.Dropout(dropout_rate)
        # ResNet blocks
        self.resnet1 = ResnetIdentityBlock(kernel_size=11, filters=[units, units, units])
        units *= squeeze_time
        self.resnet2 = ResnetIdentityBlock(kernel_size=7, filters=[units, units, units])
        units *= squeeze_time
        self.resnet3 = ResnetIdentityBlock(kernel_size=3, filters=[units, units, units])
        # encoder LSTM
        self.enc_lstm = tf.keras.layers.LSTM(units=lstm_units,
                                             return_sequences=True,
                                             return_state=True,
                                             kernel_initializer="lecun_normal",
                                             activation='tanh',
                                             recurrent_activation='sigmoid',
                                             recurrent_initializer='orthogonal',
                                             dropout=dropout_rate)

    def call(self, inputs):
        '''
        Run the pyramidal LSTM encoder
        Args:
            inputs: wave input
        '''
        x = self.feat_extract(inputs)
        x = self.feat_dropout(x)
        # ResNet blocks with pyramidal reshapes in between
        x = self.resnet1(x)
        x = self.reshape_pyramidal(x)
        x = self.resnet2(x)
        x = self.reshape_pyramidal(x)
        x = self.resnet3(x)
        # encoder output layer
        fw_outputs, fw_state_h, fw_state_c = self.enc_lstm(x)
        return fw_outputs, fw_state_h, fw_state_c

    def reshape_pyramidal(self, outputs):
        '''
        Squeeze consecutive time steps into the feature dimension
        and return the reshaped output
        Args:
            outputs: outputs from the previous layer
        '''
        batch_size, time_steps, num_units = outputs.shape
        return tf.reshape(outputs, (batch_size, -1, num_units * self.squeeze_time))
class LuongAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(LuongAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units, activation="relu")
        self.W2 = tf.keras.layers.Dense(units, activation="relu")
        self.V = tf.keras.layers.Dense(1, activation="relu")

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)
        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
class Decoder(tf.keras.Model):
    '''
    Decoder for output phonemes
    '''
    def __init__(self,
                 target_sz,
                 embedding_dim,
                 lstm_units,
                 batch_sz,
                 dropout_rate):
        '''
        Args:
            target_sz: target size, the total phoneme count in this case
            embedding_dim: embedding dimension
            lstm_units: number of LSTM units
            batch_sz: batch size
            dropout_rate: dropout ratio
        '''
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.target_sz = target_sz
        self.lstm_units = lstm_units
        self.embedding = tf.keras.layers.Embedding(target_sz, embedding_dim)
        # attention model
        self.attention = LuongAttention(lstm_units)
        # decoder rnn
        self.lstm1 = tf.keras.layers.LSTM(units=lstm_units,
                                          return_sequences=True,
                                          return_state=True,
                                          kernel_initializer="lecun_normal",
                                          activation='tanh',
                                          recurrent_activation='sigmoid',
                                          recurrent_initializer='orthogonal',
                                          dropout=dropout_rate)
        # fully connected layers
        self.fc1 = tf.keras.layers.Dense(64, activation="relu")
        self.fc1_dropout = tf.keras.layers.Dropout(dropout_rate)
        self.fc2 = tf.keras.layers.Dense(target_sz, activation="softmax")
        # build layer info dictionary
        self.layer_info = dict()

    def call(self, inputs, enc_hidden_h, enc_hidden_c, enc_output):
        '''
        Run the LSTM decoder for one step
        Args:
            inputs: target output, the following phoneme for the wave data input in this case
            enc_hidden_h: encoder hidden state h
            enc_hidden_c: encoder hidden state c
            enc_output: encoder outputs
        '''
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(inputs)
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(enc_hidden_h, enc_output)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # passing the concatenated vector to the decoder LSTM
        x, state_h, state_c = self.lstm1(x)
        # dense layer before the final prediction dense layer
        x = tf.reshape(x, (-1, x.shape[-1]))
        x = self.fc1(x)
        x = self.fc1_dropout(x)
        # output shape == (batch_size, phoneme size)
        x = self.fc2(x)
        return x, (state_h, state_c), attention_weights
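This is roughly the greedy decoding loop I use for prediction after restoring (a sketch; start_token_id and max_steps are placeholders for the actual values in my setup):

# Sketch of the prediction loop; start_token_id and max_steps are
# placeholders for the actual values in my setup
enc_output, dec_h, dec_c = encoder(mfcc_batch)          # mfcc_batch: (batch, 99, 13)
dec_input = tf.fill((mfcc_batch.shape[0], 1), start_token_id)
predicted_ids = []
for _ in range(max_steps):
    preds, (dec_h, dec_c), _ = decoder(dec_input, dec_h, dec_c, enc_output)
    next_ids = tf.argmax(preds, axis=-1)                # (batch,)
    predicted_ids.append(next_ids)
    dec_input = tf.expand_dims(next_ids, 1)             # feed back as the next input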
Related
I am building a model to classify news (the AG News dataset). The vocab size is ~33k, with a custom embedding layer. I have run this for 20 epochs, but the loss and accuracy (1.3 and 26% respectively) are almost constant, even at the end of the 20th epoch. Can someone please help me with this? Also, am I feeding the correct input to the fc layer? I am using CrossEntropyLoss as the loss function.
Here is my model class:
class NewsClassifier(nn.Module):
    def __init__(self, vocab_weights = None, rnn_type = 'LSTM', vocab_size = len(vocab.vocab), n_classes = 4, embed_size = 300, rnn_units = 512,
                 n_layers = 2, bi_dir = True, rnn_drop = 0.0, padding_index = vocab['<unk>']):
        super().__init__()
        self.rnn_units = rnn_units
        self.n_classes = n_classes
        self.rnn_type = rnn_type
        if vocab_weights is not None:  # explicit None check: the truth value of a tensor is ambiguous
            self.embedding = nn.Embedding.from_pretrained(torch.as_tensor(vocab_weights))
        else:
            self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx = padding_index)
        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(embed_size, rnn_units, num_layers = n_layers, bidirectional = bi_dir, dropout = rnn_drop)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(embed_size, rnn_units, num_layers = n_layers, bidirectional = bi_dir, dropout = rnn_drop)
        else:
            raise NotImplementedError
        self.fc = nn.Linear(2 * rnn_units if bi_dir else rnn_units, self.n_classes)

    def forward(self, data, lens):
        x_embed = self.embedding(data)  # (padded_lens, batch_size, embed_dim)
        x_packed = pack_padded_sequence(x_embed, lens.cpu(), enforce_sorted = False)  # packing sequences before passing to the RNN unit
        if self.rnn_type == 'LSTM':
            output_packed, (hidden, cell) = self.rnn(x_packed)  # output is packed and cannot be fed to linear layers
        else:
            output_packed, hidden = self.rnn(x_packed)  # for GRU there is only a hidden state
        # though n layers are stacked, the output only covers the top layer
        output_padded, _ = pad_packed_sequence(output_packed)  # padded to be fed to the linear layer (padded_lens, batch_size, hidden_units)
        # picking only the last output --> equivalent to return_sequences=False in Keras
        out_reduced = torch.cat((output_padded[-1, :, : self.rnn_units], output_padded[-1, :, self.rnn_units :]), 1)
        return self.fc(out_reduced)

model = NewsClassifier()
print(f'The total number of trainable parameters are : {sum(p.numel() for p in model.parameters() if p.requires_grad)}')
My training function is:
def train(model, iterator = trainDataloader, optimizer = optimizer, loss_fn = criterion):
    e_loss = e_acc = i = 0
    model.train()
    for inputs, leng, labels in iterator:
        inputs, leng, labels = inputs.to(device), leng.to(device), labels.to(device)
        optimizer.zero_grad()
        preds = model(inputs, leng).squeeze(1)
        loss = loss_fn(preds, labels.long())
        acc = accuracy(preds, labels)
        loss.backward()
        optimizer.step()
        e_loss += loss.item()
        e_acc += acc.item()
        i += 1
    return e_loss/i, e_acc/i

def predict(model, iterator = testDataloader, loss_fn = criterion):
    e_loss = e_acc = i = 0
    model.eval()
    with torch.no_grad():
        for inputs, leng, labels in iterator:
            inputs, leng, labels = inputs.to(device), leng.to(device), labels.to(device)
            preds = model(inputs, leng).squeeze(1)
            loss = loss_fn(preds, labels.long())
            acc = accuracy(preds, labels)
            e_loss += loss.item()
            e_acc += acc.item()
            i += 1
    return e_loss/i, e_acc/i

N_EPOCHS = 20
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model)
    valid_loss, valid_acc = predict(model)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    print(f'Epoch: {epoch+1:02} / {N_EPOCHS} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
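One thing I suspect but have not verified: since the batch is packed with enforce_sorted = False and then re-padded, output_padded[-1] reads the last padded time step, which is all zeros for any sequence shorter than the longest one in the batch. A sketch of what I might use instead inside forward(), taking the top layer's final hidden states for both directions:

# Sketch (untested): replace the output_padded[-1] indexing with the final
# hidden states of the top layer's forward and backward directions.
# hidden: (n_layers * num_directions, batch_size, rnn_units); the last two
# rows are the top layer's forward and backward states at each sequence's
# true last step, so padding does not leak in.
out_reduced = torch.cat((hidden[-2], hidden[-1]), dim=1)  # (batch_size, 2 * rnn_units)
return self.fc(out_reduced)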
I have a model implemented in pytorch that applies a final fully connected layer before running the softmax function.
The architecture is defined to solve a 4-class Speech Emotion Recognition task: given an audio track, it transforms it into its spectrogram and uses it to predict the emotion between happiness, sadness, neutrality and anger.
Rather than following the paper's architecture exactly, it adapts the implementation of the Compact Convolutional Transformer found on GitHub at https://github.com/SHI-Labs/Compact-Transformers/blob/main/src/cct.py.
To improve the performance of the model I am following some tricks described in the paper https://arxiv.org/abs/2104.07288.
As described in that paper, however, my model also suffers from a "class collapse" problem: even with a balanced dataset, it predicts the anger and sadness classes well and the other two badly.
To solve this problem, the paper applies a particular weight-regularization technique to the fully connected layer, described in section 2.4.
Unfortunately, I cannot work out how to modify my fully connected layer in pytorch to implement this type of regularization.
Code of the model:
class CCT(nn.Module):
    def __init__(self,
                 img_size=224,
                 embedding_dim=768,
                 n_input_channels=3,
                 n_conv_layers=1,
                 kernel_size=7,
                 stride=2,
                 padding=3,
                 pooling_kernel_size=3,
                 pooling_stride=2,
                 pooling_padding=1,
                 dropout=0.,
                 attention_dropout=0.1,
                 stochastic_depth=0.1,
                 num_layers=14,
                 num_heads=6,
                 mlp_ratio=4.0,
                 num_classes=1000,
                 positional_embedding='learnable',
                 *args, **kwargs):
        super(CCT, self).__init__()
        self.tokenizer = Tokenizer(n_input_channels=n_input_channels,
                                   n_output_channels=embedding_dim,
                                   kernel_size=kernel_size,
                                   stride=stride,
                                   padding=padding,
                                   pooling_kernel_size=pooling_kernel_size,
                                   pooling_stride=pooling_stride,
                                   pooling_padding=pooling_padding,
                                   max_pool=True,
                                   activation=nn.ReLU,
                                   n_conv_layers=n_conv_layers,
                                   conv_bias=False)
        self.classifier = TransformerClassifier(
            sequence_length=self.tokenizer.sequence_length(n_channels=n_input_channels,
                                                           height=img_size,
                                                           width=img_size),
            embedding_dim=embedding_dim,
            seq_pool=True,
            dropout=dropout,
            attention_dropout=attention_dropout,
            stochastic_depth=stochastic_depth,
            num_layers=num_layers,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            num_classes=num_classes,
            positional_embedding=positional_embedding
        )

    def forward(self, x):
        x = self.tokenizer(x)
        return self.classifier(x)
class Tokenizer(nn.Module):
    def __init__(self,
                 kernel_size, stride, padding,
                 pooling_kernel_size=3, pooling_stride=2, pooling_padding=1,
                 n_conv_layers=1,
                 n_input_channels=3,
                 n_output_channels=64,
                 in_planes=64,
                 activation=None,
                 max_pool=True,
                 conv_bias=False):
        super(Tokenizer, self).__init__()
        n_filter_list = [n_input_channels] + \
                        [in_planes for _ in range(n_conv_layers - 1)] + \
                        [n_output_channels]
        self.conv_layers = nn.Sequential(
            *[nn.Sequential(
                nn.Conv2d(n_filter_list[i], n_filter_list[i + 1],
                          kernel_size=(kernel_size, kernel_size),
                          stride=(stride, stride),
                          padding=(padding, padding), bias=conv_bias),
                nn.Identity() if activation is None else activation(),
                nn.MaxPool2d(kernel_size=pooling_kernel_size,
                             stride=pooling_stride,
                             padding=pooling_padding) if max_pool else nn.Identity()
            )
                for i in range(n_conv_layers)
            ])
        self.flattener = nn.Flatten(2, 3)
        self.apply(self.init_weight)

    def sequence_length(self, n_channels=3, height=224, width=224):
        return self.forward(torch.zeros((1, n_channels, height, width))).shape[1]

    def forward(self, x):
        return self.flattener(self.conv_layers(x)).transpose(-2, -1)

    @staticmethod
    def init_weight(m):
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight)
class TransformerClassifier(Module):
    def __init__(self,
                 seq_pool=True,
                 embedding_dim=768,
                 num_layers=12,
                 num_heads=12,
                 mlp_ratio=4.0,
                 num_classes=1000,
                 dropout=0.1,
                 attention_dropout=0.1,
                 stochastic_depth=0.1,
                 positional_embedding='learnable',
                 sequence_length=None):
        super().__init__()
        positional_embedding = positional_embedding if \
            positional_embedding in ['sine', 'learnable', 'none'] else 'sine'
        dim_feedforward = int(embedding_dim * mlp_ratio)
        self.embedding_dim = embedding_dim
        self.sequence_length = sequence_length
        self.seq_pool = seq_pool
        self.num_tokens = 0
        assert sequence_length is not None or positional_embedding == 'none', \
            f"Positional embedding is set to {positional_embedding} and" \
            f" the sequence length was not specified."
        if not seq_pool:
            sequence_length += 1
            self.class_emb = Parameter(torch.zeros(1, 1, self.embedding_dim),
                                       requires_grad=True)
            self.num_tokens = 1
        else:
            self.attention_pool = Linear(self.embedding_dim, 1)
        if positional_embedding != 'none':
            if positional_embedding == 'learnable':
                self.positional_emb = Parameter(torch.zeros(1, sequence_length, embedding_dim),
                                                requires_grad=True)
                init.normal_(self.positional_emb, std=0.2)
            else:
                self.positional_emb = Parameter(self.sinusoidal_embedding(sequence_length, embedding_dim),
                                                requires_grad=False)
        else:
            self.positional_emb = None
        self.dropout = Dropout(p=dropout)
        dpr = [x.item() for x in torch.linspace(0, stochastic_depth, num_layers)]
        self.blocks = ModuleList([
            TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads,
                                    dim_feedforward=dim_feedforward, dropout=dropout,
                                    attention_dropout=attention_dropout, drop_path_rate=dpr[i])
            for i in range(num_layers)])
        self.norm = LayerNorm(embedding_dim)
        self.fc = Linear(embedding_dim, num_classes)
        self.apply(self.init_weight)

    def forward(self, x):
        if self.positional_emb is None and x.size(1) < self.sequence_length:
            x = F.pad(x, (0, 0, 0, self.n_channels - x.size(1)), mode='constant', value=0)
        if not self.seq_pool:
            cls_token = self.class_emb.expand(x.shape[0], -1, -1)
            x = torch.cat((cls_token, x), dim=1)
        if self.positional_emb is not None:
            x += self.positional_emb
        x = self.dropout(x)
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)
        if self.seq_pool:
            x = torch.matmul(F.softmax(self.attention_pool(x), dim=1).transpose(-1, -2), x).squeeze(-2)
        else:
            x = x[:, 0]
        x = self.fc(x)
        return x
Can someone help me?
As you have not shared your network architecture, I will try to give a basic example. I am not sure about the exact regularization from the paper, but here is a simple example that applies L1 regularization to a specific layer (e.g. layer 0):
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(
    nn.Linear(5, 5),
    nn.ReLU(),
    nn.Linear(5, 2)
)

x = torch.randn(5, 5)
target = torch.ones(5, dtype=torch.long)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-2)

for epoch in range(10):
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, target)
    # This will be your weight regularization: choose your layer, e.g. model[0],
    # and apply the norm you want to that layer's weights
    l1_norm = torch.norm(model[0].weight, p=1)
    loss += l1_norm
    loss.backward()
    optimizer.step()
    print('Epoch {}, loss {}, norm layer {}'.format(
        epoch, loss.item(), l1_norm.item()))
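In practice you would usually scale the penalty with a small coefficient rather than adding the raw norm, so that it does not dominate the task loss (the value below is just illustrative):

lambda_reg = 1e-4  # illustrative strength; tune for your task
loss = criterion(output, target) + lambda_reg * l1_norm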
Can someone figure out what causes this error? I am working on NMT with a BiGRU to transform grammatically incorrect Arabic sentences into grammatically correct ones. I'm not sure how the BiGRU should work, because the original code used a unidirectional GRU only.
The problem started when I converted the GRU to bidirectional.
This is the code:
# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE, ENC_DROPOUT)
encoder.to(device)

# obtain one sample from the data iterator
it = iter(dataset)
x, y, x_len = next(it)
print("Input: ", x.shape)
print("Output: ", y.shape)

# sort the batch first to be able to use with pack_padded_sequence
xs, ys, lens = sort_batch(x, y, x_len)
enc_output, enc_hidden = encoder(xs.to(device), lens, device)
print("Encoder Output: ", enc_output.shape)  # batch_size X max_length X enc_units
print("Encoder Hidden: ", enc_hidden.shape)  # batch_size X enc_units (corresponds to the last state)

decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE, DEC_DROPOUT)
decoder = decoder.to(device)

# print(enc_hidden.squeeze(0).shape)
dec_hidden = enc_hidden  # .squeeze(0)
dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * BATCH_SIZE)
print("Decoder Input: ", dec_input.shape)
print("--------")

for t in range(1, y.size(1)):
    print(dec_input.shape)
    print(dec_hidden.shape)
    print(enc_output.shape)
    # enc_hidden: 1, batch_size, enc_units
    # output: max_length, batch_size, enc_units
    predictions, dec_hidden, _ = decoder(dec_input.to(device), dec_hidden.to(device), enc_output.to(device))
    print("Prediction: ", predictions.shape)
    print("Decoder Hidden: ", dec_hidden.shape)
    # loss += loss_function(y[:, t].to(device), predictions.to(device))
    dec_input = y[:, t].unsqueeze(1)
    print(dec_input.shape)
    break
And this is the Encoder code:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, dropout):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim, self.enc_units, dropout=dropout, bidirectional=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lens, device):
        # x: batch_size, max_length
        # after embedding: batch_size, max_length, embedding_dim
        x = self.embedding(x).to(device)
        # x transformed = max_len X batch_size X embedding_dim
        # x = x.permute(1, 0, 2)
        x = pack_padded_sequence(x, lens)  # pack the padded batch
        self.hidden = self.initialize_hidden_state(device)
        # output: max_length, batch_size, 2 * enc_units
        # self.hidden: 2, batch_size, enc_units (one row per direction)
        output, self.hidden = self.gru(x, self.hidden)  # gru returns outputs for all timesteps as well as the hidden state at the last timestep
        # pad the sequence back to the max length in the batch
        output, _ = pad_packed_sequence(output)
        return output, self.hidden

    def initialize_hidden_state(self, device):
        return torch.zeros((2, self.batch_sz, self.enc_units)).to(device)
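What I suspect is the mismatch: with bidirectional=True the hidden state has shape (2, batch_size, enc_units), one row per direction, while the decoder was written for a single (1, batch_size, enc_units) state. A sketch of the two usual ways to collapse the directions before handing the state to the decoder (I am not sure which one the original tutorial intended):

# enc_hidden: (2, batch_size, enc_units), one row per direction
# Option 1: concatenate the directions (the decoder must then expect 2 * enc_units)
dec_hidden = torch.cat((enc_hidden[0], enc_hidden[1]), dim=1).unsqueeze(0)
# Option 2: sum the directions (keeps enc_units unchanged)
# dec_hidden = (enc_hidden[0] + enc_hidden[1]).unsqueeze(0)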
I made a CNN model for emotion recognition on 5 emotions. I wanted to test it on a single image to get the individual class predictions for each emotion.
Evaluating the model works, but I can't seem to find out how to make a prediction on a single image. How can I do that?
The Model
def conv_block(in_channels, out_channels, pool=False):
    layers = [nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
              nn.BatchNorm2d(out_channels),
              nn.ELU(inplace=True)]
    if pool: layers.append(nn.MaxPool2d(2))
    return nn.Sequential(*layers)

class ResNet(ImageClassificationBase):
    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.conv1 = conv_block(in_channels, 128)
        self.conv2 = conv_block(128, 128, pool=True)
        self.res1 = nn.Sequential(conv_block(128, 128), conv_block(128, 128))
        self.drop1 = nn.Dropout(0.5)
        self.conv3 = conv_block(128, 256)
        self.conv4 = conv_block(256, 256, pool=True)
        self.res2 = nn.Sequential(conv_block(256, 256), conv_block(256, 256))
        self.drop2 = nn.Dropout(0.5)
        self.conv5 = conv_block(256, 512)
        self.conv6 = conv_block(512, 512, pool=True)
        self.res3 = nn.Sequential(conv_block(512, 512), conv_block(512, 512))
        self.drop3 = nn.Dropout(0.5)
        self.classifier = nn.Sequential(nn.MaxPool2d(6),
                                        nn.Flatten(),
                                        nn.Linear(512, num_classes))

    def forward(self, xb):
        out = self.conv1(xb)
        out = self.conv2(out)
        out = self.res1(out) + out
        out = self.drop1(out)
        out = self.conv3(out)
        out = self.conv4(out)
        out = self.res2(out) + out
        out = self.drop2(out)
        out = self.conv5(out)
        out = self.conv6(out)
        out = self.res3(out) + out
        out = self.drop3(out)
        out = self.classifier(out)
        return out
The fit_one_cycle function is called to train the model
@torch.no_grad()
def evaluate(model, val_loader):
    model.eval()
    outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def fit_one_cycle(epochs, max_lr, model, train_loader, val_loader,
                  weight_decay=0, grad_clip=None, opt_func=torch.optim.SGD):
    torch.cuda.empty_cache()
    history = []
    # Set up custom optimizer with weight decay
    optimizer = opt_func(model.parameters(), max_lr, weight_decay=weight_decay)
    # Set up one-cycle learning rate scheduler
    sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr, epochs=epochs,
                                                steps_per_epoch=len(train_loader))
    for epoch in range(epochs):
        # Training phase
        model.train()
        train_losses = []
        lrs = []
        for batch in train_loader:
            loss = model.training_step(batch)
            train_losses.append(loss)
            loss.backward()
            # Gradient clipping
            if grad_clip:
                nn.utils.clip_grad_value_(model.parameters(), grad_clip)
            optimizer.step()
            optimizer.zero_grad()
            # Record & update learning rate
            lrs.append(get_lr(optimizer))
            sched.step()
        # Validation phase
        result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        result['lrs'] = lrs
        model.epoch_end(epoch, result)
        history.append(result)
    return history
This returns the accuracy and loss; I want to change it so that it returns prediction percentages for each class.
def accuracy(outputs, labels):
    _, preds = torch.max(outputs, dim=1)
    return torch.tensor(torch.sum(preds == labels).item() / len(preds))

class ImageClassificationBase(nn.Module):
    def training_step(self, batch):
        images, labels = batch
        out = self(images)
        loss = F.cross_entropy(out, labels)
        return loss

    def validation_step(self, batch):
        images, labels = batch
        out = self(images)
        loss = F.cross_entropy(out, labels)
        acc = accuracy(out, labels)
        return {'val_loss': loss, 'val_acc': acc}

    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}

    def epoch_end(self, epoch, result):
        print("Epoch [{}], last_lr: {:.5f}, train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
            epoch, result['lrs'][-1], result['train_loss'], result['val_loss'], result['val_acc']))
Evaluating the model works, but I can't seem to find how to make a prediction with a single image. How can I do that?
Simply, if you have a single image, make sure to:
- use an additional dimension of 1 at the beginning (the batch dimension)
- use CHW format instead of HWC (or specify that within pytorch; check out how to do that here)
For example:
my_model = CNN(...)
random_image = torch.randn(1, 3, 100, 100) # 3 channels, 100x100 img
BTW, your accuracy could be written a little more simply, like this:

def accuracy(outputs, labels):
    preds = torch.argmax(outputs, dim=1)
    return torch.sum(preds == labels) / len(preds)
Getting class probability
Similar to argmax, you can use softmax, which transforms logits (the unnormalized probabilities output by your network) into probabilities:

def probability(outputs):
    return torch.nn.functional.softmax(outputs, dim=1)
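Putting it together, a minimal end-to-end sketch (your_transform and pil_image are placeholders for whatever preprocessing your training pipeline used):

import torch
import torch.nn.functional as F

model.eval()                           # disable dropout and batchnorm updates
with torch.no_grad():
    img = your_transform(pil_image)    # CHW float tensor, same preprocessing as training
    img = img.unsqueeze(0)             # add the batch dimension -> (1, C, H, W)
    logits = model(img)
    probs = F.softmax(logits, dim=1)   # per-class prediction percentages
    print(probs.squeeze(0) * 100)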
I have time series data that looks something like this:
I am trying to model this with a sequence-to-sequence RNN in pytorch. It trains well and I can see the loss going down. But on testing it gives the same output irrespective of the input.
My Model:
class RNNModel(nn.Module):
    def __init__(self, predictor_size, hidden_size, num_layers, dropout=0.3, output_size=83):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.rnn = nn.GRU(predictor_size, hidden_size, num_layers=num_layers, dropout=dropout)
        self.decoder = nn.Linear(hidden_size, output_size)
        self.init_weights()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

    def init_weights(self):
        initrange = 0.1
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        output, hidden = self.rnn(input, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        return Variable(weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
Train Method:
def train(data_source, lr):
    # turn on training mode that enables dropout
    model.train()
    total_loss = 0
    hidden = model.init_hidden(bs_train)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for batch, i in enumerate(range(0, data_source.size(0) - 1, bptt_size)):
        data, targets = get_batch(data_source, i)
        # Starting each batch, we detach the hidden state from how it was previously produced,
        # so that the model doesn't try to backprop all the way to the start of the dataset;
        # unrolling of the graph will go from the last iteration to the first iteration
        hidden = Variable(hidden.data)
        if cuda.is_available():
            hidden = hidden.cuda()
        optimizer.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()
        # clip_grad_norm to prevent gradient explosion
        torch.nn.utils.clip_grad_norm(model.parameters(), clip)
        optimizer.step()
        total_loss += len(data) * loss.data
    # return accumulated loss for all the iterations
    return total_loss[0] / len(data_source)
Evaluation Method:
def evaluate(data_source):
    # turn on evaluation mode to disable dropout
    model.eval()
    model.train(False)
    total_loss = 0
    hidden = model.init_hidden(bs_valid)
    for i in range(0, data_source.size(0) - 1, bptt_size):
        data, targets = get_batch(data_source, i, evaluation=True)
        if cuda.is_available():
            hidden = hidden.cuda()
        output, hidden = model(data, hidden)
        total_loss += len(data) * criterion(output, targets).data
        hidden = Variable(hidden.data)
    return total_loss[0] / len(data_source)
Training Loop:
best_val_loss = None
best_epoch = 0

def run(epochs, lr):
    val_losses = []
    num_epochs = []
    global best_val_loss
    global best_epoch
    for epoch in range(0, epochs):
        train_loss = train(train_set, lr)
        val_loss = evaluate(test_set)
        num_epochs.append(epoch)
        val_losses.append(val_loss)
        print("Train Loss: ", train_loss, " Validation Loss: ", val_loss)
        if not best_val_loss or val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "./4.model.pth")
            best_epoch = epoch
    return num_epochs, val_losses
Loss with epochs:
Getting the output:
model = RNNModel(predictor_size, hidden_size, num_layers, dropout_pct, output_size)
model.load_state_dict(torch.load("./4.model.pth"))
if cuda.is_available():
    model.cuda()
model.eval()
model.train(False)

hidden = model.init_hidden(1)
inp = torch.Tensor(var[105])
input = Variable(inp.contiguous().view(1, 1, predictor_size), volatile=True)
if cuda.is_available():
    input.data = input.data.cuda()
output, hidden = model(input, hidden)
op = output.squeeze().data.cpu()
print(op)
Here I always get the same output irrespective of the datapoint I give as input. Can somebody please tell me what I am doing wrong?
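For reference, this is the same inference written with current PyTorch idioms, which I also tried (a sketch; torch.no_grad() replaces the deprecated volatile=True / Variable):

# Sketch of the same single-sample inference with current PyTorch idioms
model.eval()
with torch.no_grad():
    hidden = model.init_hidden(1)
    inp = torch.tensor(var[105], dtype=torch.float32).view(1, 1, predictor_size)
    if torch.cuda.is_available():
        inp = inp.cuda()
    output, hidden = model(inp, hidden)
    print(output.squeeze().cpu())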