Howdy!
Recently I built my own library for neural networks.
Without convolutional layers it worked fine. However, now that I have implemented convolutional layers, the network doesn't improve at all compared to the dense version, which is unacceptable for more complex tasks such as pneumonia detection.
For backprop, every layer updates its own values and passes its input gradient to the layer behind it. For forwardprop, every layer simply gives its output as input to the next layer.
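To make that contract concrete, here is a minimal sketch of the interface every layer in my library follows (the Identity layer is purely hypothetical and only illustrates the two methods):

class Identity:
    def forwardProp(self, input):
        # cache the input and hand the output to the next layer
        self.input = input
        return input

    def backwardProp(self, outputDelta, lr):
        # update own parameters here (this layer has none) and return the
        # gradient with respect to this layer's input for the previous layer
        return outputDelta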
In the example below the model is set up for pneumonia detection with 128px * 128px images.
The accuracy always stays under 60%, no matter how long it is trained.
Here is the relevant code:
model = nG.NeuralNetwork([
    nG.Convolutional(1, (128, 128), 3, 8),
    nG.Pooling(2),
    nG.ReLU(),
    nG.Convolutional(8, (63, 63), 3, 16),
    nG.Pooling(2),
    nG.ReLU(),
    nG.Convolutional(16, (31, 31), 3, 16),
    nG.Pooling(2),
    nG.ReLU(),
    nG.Convolutional(16, (15, 15), 3, 32),
    nG.Pooling(2),
    nG.ReLU(),
    nG.Convolutional(32, (7, 7), 3, 64),
    nG.ReLU(),
    nG.Reshape((64, 5, 5), (1600, 1)),
    nG.Dense(1600, 128),
    nG.Tanh(),
    nG.Dense(128, 2),
    nG.Tanh()],
    nG.MSE()
)

model.train(images, labels, epochs=3, lr=0.01)
class NeuralNetwork:
    def __init__(self, layers, loss):
        self.layers = layers
        self.loss = loss

    def forwardProp(self, input):
        output = input
        for layer in self.layers:
            output = layer.forwardProp(output)
        return output

    def backwardProp(self, errorDeriv, lr):
        deltaOutput = errorDeriv
        for layer in reversed(self.layers):
            deltaOutput = layer.backwardProp(deltaOutput, lr)

    def train(self, xTrain, yTrain, epochs=1, lr=1, interimResult=False):
        corrects = 0
        print("Precompiling ... This might take a few seconds", end="\n\n")
        for epoch in range(epochs):
            print(f"{epoch+1}th epoch:")
            round_start = time.time()
            i = -1
            for X, Y in zip(xTrain, yTrain):
                i += 1
                start = time.time()
                output = self.forwardProp(X)
                errorDeriv = self.loss.errorDerivative(output, Y)
                self.backwardProp(errorDeriv, lr)
--
class Convolutional():
    def __init__(self, input_depth, input_size, kernel_size, depth):
        self.input_depth = input_depth
        self.input_size = input_size
        self.kernel_size = kernel_size
        self.depth = depth
        self.kernels = np.random.uniform(-0.5, 0.5, (depth, input_depth, kernel_size, kernel_size))
        self.bias = [np.random.uniform(-0.5, 0.5, (input_size[0] - kernel_size + 1, input_size[1] - kernel_size + 1)) for i in range(depth)]
        self.input = None

    def forwardProp(self, input):
        self.input = input
        output = get_output(input, self.depth, self.input_size, self.kernel_size, self.input_depth, self.kernels, self.bias)
        return output

    def backwardProp(self, output_delta, lr):
        kernels_gradient, input_delta = get_gradients(self.kernels, self.input, self.depth, self.input_depth, output_delta)
        self.kernels -= lr * kernels_gradient
        self.bias -= lr * output_delta
        return input_delta


@numba.njit
def get_gradients(kernels, input, depth, input_depth, output_delta):
    kernels_gradient = np.zeros(kernels.shape)
    input_delta = np.zeros(input.shape)
    for i in range(depth):
        for j in range(input_depth):
            kernels_gradient[i, j] = valid_correlate(input[j], output_delta[i])
            input_delta[j] += full_convolve(output_delta[i], kernels[i, j])
    return kernels_gradient, input_delta


@numba.njit(fastmath=True, nogil=True)
def get_output(input, depth, input_size, kernel_size, input_depth, kernels, bias):
    out = np.zeros((depth, input_size[0] - kernel_size + 1, input_size[0] - kernel_size + 1))
    for k in range(depth):
        for i in range(input_depth):
            out[k] += valid_correlate(input[i], kernels[k][i])
        out[k] += bias[k]
    return out
class Pooling:
    def __init__(self, size):
        self.size = size
        self.input = None

    def forwardProp(self, input):
        self.input = input
        output = []
        for i in range(input.shape[0]):
            output.append(pool(input[i], self.size))
        output = np.asarray(output)
        return output

    def backwardProp(self, output_delta, lr):
        input_delta = anti_pool(output_delta, self.input.shape, self.size, self.input)
        return input_delta


def anti_pool(output_delta, input_shape, size, input):
    input_delta = np.zeros(input_shape)
    for l in range(input_delta.shape[0]):
        for x in range(output_delta.shape[1]):
            for y in range(output_delta.shape[2]):
                area_start = (x * size, y * size)
                area_end = (min((x + 1) * size, input_delta.shape[1]),
                            min((y + 1) * size, input_delta.shape[2]))
                area = (input[l, area_start[0]:area_end[0], area_start[1]:area_end[1]])
                highest_pos = np.unravel_index(area.argmax(), area.shape)
                highest_pos = [x * size + highest_pos[0],
                               y * size + highest_pos[1]]
                input_delta[l, highest_pos[0], highest_pos[1]] = output_delta[l, x, y]
    return input_delta


@numba.njit("float64[:,:](float64[:,:], int32)")
def pool(mat, size):
    def pool_at_position(mat, pos):
        end_pos = (min(mat.shape[0], pos[0] + size),
                   min(mat.shape[1], pos[1] + size))
        area = mat[pos[0]:end_pos[0], pos[1]:end_pos[1]]
        result = np.max(area)
        return result

    output_size = (int(np.ceil(mat.shape[0] / size)), int(np.ceil(mat.shape[1] / size)))
    output = np.zeros(output_size)
    for x in range(output_size[0]):
        for y in range(output_size[1]):
            output[x, y] = pool_at_position(mat, (x * size, y * size))
    return output
class Dense:
    def __init__(self, inputSize, outputSize):
        self.weights = np.random.randn(outputSize, inputSize)
        self.bias = np.random.randn(outputSize, 1)

    def forwardProp(self, input):
        self.input = input
        return np.dot(self.weights, self.input) + self.bias

    def backwardProp(self, output_gradient, lr):
        weights_gradient = np.dot(output_gradient, self.input.T)
        input_gradient = np.dot(self.weights.T, output_gradient)
        self.weights -= lr * weights_gradient
        self.bias -= lr * output_gradient
        return input_gradient
class Tanh:
    def __init__(self):
        self.input = None
        self.output = None

    def forwardProp(self, input):
        self.input = input
        self.output = tanh(input)
        return self.output

    def backwardProp(self, outputDelta, lr):
        inputDelta = 1 - (np.tanh(self.input) ** 2)
        inputDelta *= outputDelta
        return inputDelta


@numba.vectorize
def tanh(x):
    return np.tanh(x)
class ReLU:
    def __init__(self):
        self.input = None
        self.output = None

    def forwardProp(self, input):
        self.input = input
        self.output = np.maximum(input, 0)
        return self.output

    def backwardProp(self, outputDelta, lr):
        inputDelta = np.multiply(outputDelta, np.vectorize(self.anti_relu)(self.input))
        return inputDelta

    def anti_relu(self, x):
        if x < 0:
            return 0
        else:
            return 1
class MSE:
    def __init__(self):
        pass

    def errorFunction(self, output, Y):
        error = (output - Y) ** 2
        return error

    def errorDerivative(self, output, Y):
        error_deriv = 2 * (output - Y)
        return error_deriv
For the functions and classes that I have not included, I'm certain that they work.
I have spent the last couple of days reading over the code and still haven't found the problem.
I would be extremely thankful for any kind of response.
Kind Regards
Eirik
Related
I am using a 2-layer stacked LSTM with a width of 128 in PyTorch. Since I want to use more layers and change the width of the layers, I have written the LSTM class I am using in a generalized form. When I use the general model with width=128 and depth=2, I get exactly the same results in each epoch (with a fixed random state). The only difference is the performance of the two models: while the hardcoded 2-layer LSTM takes ~60s per epoch, the general model takes ~90s. How is this possible?
Hard coded model (60s per epoch):
class Network(nn.Module):
    def __init__(
        self,
        input_size,
        width,
        depth,
        device
    ):
        super(Network, self).__init__()
        self.input_size = input_size
        self.device = device

        self.lstm_1 = nn.LSTMCell(self.input_size, 128)
        self.lstm_2 = nn.LSTMCell(128, 128)
        self.linear_1 = nn.Linear(128, 32)
        self.linear_2 = nn.Linear(32, 2)
        self.dropout_1 = nn.Dropout(0.2)

    def forward(self, data):
        h_t_1 = torch.zeros(data.size(0), 128).to(self.device)
        c_t_1 = torch.zeros(data.size(0), 128).to(self.device)
        h_t_2 = torch.zeros(data.size(0), 128).to(self.device)
        c_t_2 = torch.zeros(data.size(0), 128).to(self.device)

        for time_step in data.split(1, dim=1):
            h_t_1, c_t_1 = self.lstm_1(time_step.view(data.size(0), self.input_size), (h_t_1, c_t_1))
            h_t_2, c_t_2 = self.lstm_2(self.dropout_1(h_t_1), (h_t_2, c_t_2))

        output = self.dropout_1(self.linear_1(self.dropout_1(h_t_2)))
        output = self.linear_2(output)
        mean = output[..., 0][..., None]
        std = torch.clamp(output[..., 1][..., None], min=0.01)
        norm_dist = torch.distributions.Normal(mean, std)
        return norm_dist
General model (90s per epoch):
class Network(nn.Module):
    def __init__(
        self,
        input_size,
        width,
        depth,
        device
    ):
        super(Network, self).__init__()
        self.input_size = input_size
        self.width = width
        self.depth = depth
        self.device = device

        self.lstm_1 = nn.LSTMCell(self.input_size, self.width)
        for i in range(self.depth - 1):
            setattr(self, f'lstm_{i+2}', nn.LSTMCell(self.width, self.width))
        self.linear_1 = nn.Linear(self.width, 32)
        self.linear_2 = nn.Linear(32, 2)
        self.dropout_1 = nn.Dropout(0.2)

    def forward(self, data):
        h_t_1 = torch.zeros(data.size(0), self.width).to(self.device)
        c_t_1 = torch.zeros(data.size(0), self.width).to(self.device)
        for i in range(self.width - 1):
            locals()[f'h_t_{i+2}'] = torch.zeros(data.size(0), self.width).to(self.device)
            locals()[f'c_t_{i+2}'] = torch.zeros(data.size(0), self.width).to(self.device)

        for time_step in data.split(1, dim=1):
            h_t_1, c_t_1 = self.lstm_1(time_step.view(data.size(0), self.input_size), (h_t_1, c_t_1))
            for i in range(self.depth - 1):
                locals()[f'h_t_{i+2}'], locals()[f'c_t_{i+2}'] = getattr(self, f'lstm_{i+2}')(self.dropout_1(locals()[f'h_t_{i+1}']), (locals()[f'h_t_{i+2}'], locals()[f'c_t_{i+2}']))

        output = self.dropout_1(self.linear_1(self.dropout_1(locals()[f'h_t_{self.depth}'])))
        output = self.linear_2(output)
        mean = output[..., 0][..., None]
        std = torch.clamp(output[..., 1][..., None], min=0.01)
        norm_dist = torch.distributions.Normal(mean, std)
        return norm_dist
I am trying to use the vanilla transformer from PyTorch together with PyTorch Lightning. I tried to test the model with a reverse-number task: given [1, 3, 5, 4, 13, 19] it should return [1, 13, 4, 5, 3, 19], with 1 and 19 being the start and end tokens respectively. The full code is below. The code runs without errors, but there seems to be a problem with the backpropagation: the training loss does go down at first, but it doesn't go below 2.8 and the accuracy doesn't go beyond 11%.
It seems that part of the model is able to optimize. I am guessing the weights located in Embeddings and Generator can backpropagate, but the weights located in nn.Transformer cannot? I am really not sure.
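To make the task concrete, this is the mapping I expect (plain Python; make_target is a hypothetical helper and not part of the model):

def make_target(src, start_token=1, end_token=19):
    # keep the start/end tokens in place and reverse everything in between
    return [start_token] + src[1:-1][::-1] + [end_token]

assert make_target([1, 3, 5, 4, 13, 19]) == [1, 13, 4, 5, 3, 19]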
import math
import torch.nn.functional as F
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
class Embeddings(pl.LightningModule):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        a = self.lut(x) * math.sqrt(self.d_model)
        return a
class PositionalEncoding(pl.LightningModule):
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)
class Generator(pl.LightningModule):
    def __init__(self, size):
        super(Generator, self).__init__()
        self.proj = nn.Linear(512, size)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)
class Model(pl.LightningModule):
    def __init__(self, src_embed, tgt_embed, transformer, generator):
        super(Model, self).__init__()
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.transformer = transformer
        self.generator = generator
        self.valLoss = 0
        self.valAcc = 0
        self.automatic_optimization = False
        self.optimizer = None

        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, x, y, tgt_mask=None):
        x = self.src_embed(x)
        y = self.tgt_embed(y)
        return self.generator(self.transformer(x, y, tgt_mask=tgt_mask))

    def training_step(self, batch, batch_idx):
        if self.optimizer is None:
            self.optimizer = self.optimizers()
        batch = Batch(batch[0], batch[1])
        tgt_mask = batch.trg_mask.squeeze(0)
        tgt_mask = (tgt_mask != True)
        output = self(batch.src, batch.trg, tgt_mask)
        criterion = LossCompute(V)
        loss = criterion.forward(output.contiguous().view(-1, output.size(-1)), batch.trg_y.contiguous().view(-1)) / batch.ntokens
        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()
        self.log('train_loss', loss)
        print(loss)

    def validation_step(self, batch, batch_idx):
        batch = Batch(batch[0], batch[1])
        tgt_mask = batch.trg_mask.squeeze(0)
        tgt_mask = (tgt_mask != True)
        output = self(batch.src, batch.trg, tgt_mask)
        criterion = LossCompute(V)
        loss = criterion.forward(output.view(-1, output.size(-1)), batch.trg_y.contiguous().view(-1)) / batch.ntokens
        self.log('val_loss', loss)
        self.valLoss += loss
        if batch_idx % 10 == 0:
            print(loss)
        if batch_idx == 99:
            print(self.valLoss / 100)
            self.valLoss = 0
        return {"x": output, "trg": batch.trg_y, "index": batch_idx}

    def validation_step_end(self, batch):
        output, trg, idx = batch["x"], batch["trg"], batch["index"]
        accuracy = getAccuracy(output, trg)
        self.log("accuracy", accuracy)
        self.valAcc += accuracy
        if idx == 99:
            print(self.valAcc / 100)
            self.valAcc = 0

    def train_dataloader(self):
        data = data_gen(V, 0, 3000)
        return DataLoader(data, batch_size=30, shuffle=False, num_workers=2, pin_memory=True)

    def val_dataloader(self):
        data = data_gen(V, 1, 1000)
        return DataLoader(data, batch_size=10, shuffle=False, num_workers=2, pin_memory=True)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-9)
class LossCompute(pl.LightningModule):
    def __init__(self, size):
        super(LossCompute, self).__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        # x has size (batch_size x length, vocab_size)
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(0)
        true_dist.scatter_(1, target.data.unsqueeze(1).long(), 1)
        self.true_dist = true_dist
        return self.criterion(x, true_dist)
# prepare data
class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, src, trg=None):
        self.src = src
        if trg is not None:
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = \
                self.make_std_mask(self.trg)
            self.ntokens = self.trg_y.size(0) * self.trg_y.size(1)
            print("")

    @staticmethod
    def make_std_mask(tgt):
        "Create a mask to hide padding and future words."
        tgt_mask = subsequent_mask(tgt.size(-1)).type_as(tgt.data)
        return tgt_mask


def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0
def data_gen(V, randomSeed, totalTrainingSample):
    np.random.seed(randomSeed)
    x = torch.from_numpy(np.random.randint(2, V - 2, size=(totalTrainingSample, 10)))
    y = torch.flip(torch.flip(x, [0, 1]), [0])
    x[:, 0] = 1
    y[:, 0] = 1
    x[:, -1] = V - 1
    y[:, -1] = V - 1
    return list(zip(x, y))


def getAccuracy(x, trg):
    totalValAcc = 0
    totalValAccToken = 0
    trg = trg.contiguous().view(-1)
    out = x.view(-1, x.size(-1))       # (batch_size * tgt_length, src_vocab)
    _, index = torch.max(out, dim=-1)  # index (batch_size * tgt_length)
    correct = list((trg == index)).count(True)
    totalValAcc += correct
    totalValAccToken += index.size(0)
    return totalValAcc / totalValAccToken
V = 20
transformer = nn.Transformer(num_encoder_layers=2, num_decoder_layers=2, batch_first=True)
PositionEnc = PositionalEncoding(512, 0.1)
src_emb = Embeddings(512, V)
tgt_emb = Embeddings(512, V)
gen = Generator(V)
if __name__ == '__main__':
    model = Model(nn.Sequential(src_emb, PositionEnc), nn.Sequential(tgt_emb, PositionEnc), transformer, gen)
    earlyStopping = EarlyStopping(monitor='val_loss', patience=3)
    trainer = pl.Trainer(max_epochs=10, callbacks=[earlyStopping])
    trainer.fit(model)
I got an AssertionError from the MultiheadAttention class in PyTorch.
class Attention(nn.Module):
    def __init__(self, d_model: int, max_pos, num_head, dropout: float = 0.1):
        super(Attention, self).__init__()
        self.num_head = num_head
        self.dropout = nn.Dropout(p=dropout).to(device)
        self.embed = torch.nn.Embedding(d_model, max_pos).to(device)
        self.pos = torch.from_numpy(t.positional_encoding(max_pos, d_model)).to(device)
        self.MHA = nn.MultiheadAttention(d_model, num_head, self.dropout).to(device)

    def forward(self, x):
        seq = x.size(dim=1)
        x = self.embed(x)
        x += self.pos[:, :seq]
        x = self.dropout(x)
        attn, _ = self.MHA(x, x, x)
        return attn, _

    def create_encode_mask(self, seq):
        out = t.generate_square_subsequent_mask(seq)
        return out


att = Attention(d_model, max_pos, num_heads, dropout_rate)

for epoch in range(EPOCHS):
    # Training
    print("EPOCH = ", epoch)
    for (batch, (src, trg)) in enumerate(train_data):
        print("BATCH = ", batch)
        src, trg = src.to(device), trg.to(device)
        out = att(src)
I am just trying to see the result of MultiheadAttention, but the error seems to be unavoidable. I also tried passing the embed_dim_to_check value into the MHA call, including the num_head:
attn, _ = self.MHA(x, x, x, self.create_encode_mask(seq), self.num_head)
But the error remains the same.
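For reference, here is a minimal standalone call to nn.MultiheadAttention that runs for me (the values are placeholders; the assumptions are that embed_dim is divisible by num_heads and that the attention mask is passed as the attn_mask keyword argument with shape (seq_len, seq_len)):

import torch
import torch.nn as nn

embed_dim, num_heads, seq_len, batch = 512, 8, 10, 2   # hypothetical values
mha = nn.MultiheadAttention(embed_dim, num_heads, dropout=0.1)
x = torch.randn(seq_len, batch, embed_dim)             # (L, N, E) since batch_first is not set
mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
attn_out, attn_weights = mha(x, x, x, attn_mask=mask)
print(attn_out.shape)                                  # torch.Size([10, 2, 512])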
Thank You
I'm trying to build a custom ConvLSTM layer in Keras using the following code, but it didn't work:
import tensorflow as tf
from tensorflow import keras
from keras.layers import InputSpec, Layer
class Padding2D(Layer):
    def __init__(self, padding=(1, 1), **kwargs):
        self.padding = tuple(padding)
        self.input_spec = [InputSpec(ndim=4)]
        super(Padding2D, self).__init__(**kwargs)

    def compute_output_shape(self, s):
        return (s[0], s[1] + 2*self.padding[0], s[2] + 2*self.padding[1], s[3])

    def call(self, x):
        w_pad, h_pad = self.padding
        return tf.pad(x, [[0,0], [h_pad,h_pad], [w_pad,w_pad], [0,0]])


class ConvLSTM(Layer):
    def __init__(self, out_channels, kernel_size=5, forget_bias=1.0, padding=0):
        super(ConvLSTM, self).__init__()
        self.out_channels = out_channels
        self.forget_bias = forget_bias
        self.states = None

    def call(self, inputs):
        if self.states is None:
            # inputs.shape : [Batch, Height, Width, Channel]
            self.states = (tf.zeros([inputs.shape[0], inputs.shape[1], inputs.shape[2], self.out_channels]),
                           tf.zeros([inputs.shape[0], inputs.shape[1], inputs.shape[2]], self.out_channels))

        c, h = self.states
        if not (len(c.shape) == 4 and len(h.shape) == 4 and len(inputs.shape) == 4):
            raise TypeError("Incorrect shapes")

        inputs_h = tf.concat((inputs, h), axis=3)
        padded_inputs_h = Padding2D(padding=(padding, padding))(inputs_h)
        i_j_f_o = Conv2D(4 * out_channels, kernel_size, strides=1)(padded_inputs_h)

        i = i_j_f_o[:, :, :, :self.out_channels]
        j = i_j_f_o[:, :, :, self.out_channels:2*self.out_channels]
        f = i_j_f_o[:, :, :, 2*self.out_channels:3*self.out_channels]
        o = i_j_f_o[:, :, :, 3*self.out_channels:]
        # i, j, f, o = torch.split(i_j_f_o, self.out_channels, dim=3)

        new_c = c * sigmoid(f + self.forget_bias) + sigmoid(i) * tanh(j)
        new_h = tanh(new_c) * sigmoid(o)
        self.states = (new_c, new_h)
        return new_h


input0 = tf.keras.Input(shape=(2, 2, 1))
x = ConvLSTM(out_channels=1)(input0)
model = tf.keras.Model(input0, x)
print(model(tf.ones((1, 2, 2, 1))))
Error output
----> x = ConvLSTM(out_channels= 1)(input0)
TypeError: in user code:
<ipython-input-1-2e11c0026581>:28 call *
self.states = (tf.zeros([inputs.shape[0], inputs.shape[1], inputs.shape[2], self.out_channels]),
TypeError: Expected int32, got None of type 'NoneType' instead.
I think the error occurs because the model doesn't know the value of the batch-size dimension (inputs.shape[0]) in advance; it is set to None when the model is built (before execution). I need the model to figure out the batch-size dimension by itself at execution time (and ignore it at build time). Can anyone help, please?
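A minimal illustration of the unknown batch dimension (just a sketch, separate from my layer):

import tensorflow as tf

inp = tf.keras.Input(shape=(2, 2, 1))
print(inp.shape)   # (None, 2, 2, 1) -- the batch dimension is None at build time,
                   # which is why tf.zeros([inputs.shape[0], ...]) fails with the TypeError above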
By following the suggestion given by Marc above in the comments, this code solved the problem:
import tensorflow as tf
from tensorflow import keras
from keras.layers import InputSpec, Layer, Conv2D
from tensorflow.keras.activations import sigmoid, tanh
class Padding2D(Layer):
    def __init__(self, padding=(1, 1), **kwargs):
        self.padding = tuple(padding)
        self.input_spec = [InputSpec(ndim=4)]
        super(Padding2D, self).__init__(**kwargs)

    def compute_output_shape(self, s):
        return (s[0], s[1] + 2*self.padding[0], s[2] + 2*self.padding[1], s[3])

    def call(self, x):
        w_pad, h_pad = self.padding
        return tf.pad(x, [[0,0], [h_pad,h_pad], [w_pad,w_pad], [0,0]])


class ConvLSTM(Layer):
    def __init__(self, out_channels, kernel_size=1, forget_bias=1.0, padding=0):
        super(ConvLSTM, self).__init__()
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.forget_bias = forget_bias
        self.padding = padding
        self.states = None

    def call(self, inputs):
        if self.states is None:
            # inputs.shape : [Batch, Height, Width, Channel]
            self.states = (tf.zeros_like(tf.tile(tf.expand_dims(inputs[:, :, :, 0], axis=-1), (1, 1, 1, self.out_channels))),
                           tf.zeros_like(tf.tile(tf.expand_dims(inputs[:, :, :, 0], axis=-1), (1, 1, 1, self.out_channels))))

        c, h = self.states
        if not (len(c.shape) == 4 and len(h.shape) == 4 and len(inputs.shape) == 4):
            raise TypeError("Incorrect shapes")

        inputs_h = tf.concat((inputs, h), axis=3)
        padded_inputs_h = Padding2D(padding=(self.padding, self.padding))(inputs_h)
        i_j_f_o = Conv2D(4 * self.out_channels, self.kernel_size, strides=1)(padded_inputs_h)

        i = i_j_f_o[:, :, :, :self.out_channels]
        j = i_j_f_o[:, :, :, self.out_channels:2*self.out_channels]
        f = i_j_f_o[:, :, :, 2*self.out_channels:3*self.out_channels]
        o = i_j_f_o[:, :, :, 3*self.out_channels:]

        new_c = c * sigmoid(f + self.forget_bias) + sigmoid(i) * tanh(j)
        new_h = tanh(new_c) * sigmoid(o)
        self.states = (new_c, new_h)
        return new_h
I also found another alternative to solve the problem: providing the batch size and input shape when initializing the layer. The code is given below:
import tensorflow as tf
from tensorflow import keras
from keras.layers import InputSpec, Layer, Conv2D
from tensorflow.keras.activations import sigmoid, tanh
class Padding2D(Layer):
    def __init__(self, padding=(1, 1), **kwargs):
        self.padding = tuple(padding)
        self.input_spec = [InputSpec(ndim=4)]
        super(Padding2D, self).__init__(**kwargs)

    def compute_output_shape(self, s):
        return (s[0], s[1] + 2*self.padding[0], s[2] + 2*self.padding[1], s[3])

    def call(self, x):
        w_pad, h_pad = self.padding
        return tf.pad(x, [[0,0], [h_pad,h_pad], [w_pad,w_pad], [0,0]])


class ConvLSTM(Layer):
    def __init__(self, batch_size, input_shape, out_channels, kernel_size=1, forget_bias=1.0, padding=0):
        super(ConvLSTM, self).__init__()
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.forget_bias = forget_bias
        self.shape = input_shape
        self.padding = padding
        self.states = None
        self.batch_size = batch_size

    def build(self, input_shape):
        if self.states is None:
            # input_shape : [Height, Width, Channel]
            self.states = (tf.zeros([self.batch_size] + self.shape[:-1] + [self.out_channels]),
                           tf.zeros([self.batch_size] + self.shape[:-1] + [self.out_channels]))
        super(ConvLSTM, self).build(input_shape)

    def call(self, inputs):
        c, h = self.states
        if not (len(c.shape) == 4 and len(h.shape) == 4 and len(inputs.shape) == 4):
            raise TypeError("Incorrect shapes")

        inputs_h = tf.concat((inputs, h), axis=3)
        padded_inputs_h = Padding2D(padding=(self.padding, self.padding))(inputs_h)
        i_j_f_o = Conv2D(4 * self.out_channels, self.kernel_size, strides=1)(padded_inputs_h)
        i, j, f, o = tf.split(i_j_f_o, num_or_size_splits=4, axis=3)

        new_c = c * sigmoid(f + self.forget_bias) + sigmoid(i) * tanh(j)
        new_h = tanh(new_c) * sigmoid(o)
        self.states = (new_c, new_h)
        return new_h
Yet, even though these implementations solve the question asked in this post, there remains a problem in both of them related to how I update the LSTM cell state (the line self.states = (new_c, new_h), the last line of the ConvLSTM class). Since that problem is different, I opened it in a separate post.
I am trying to get into neural networks, but I'm very new to this and I've never understood the different implementations I found.
So I tried to make a simple implementation of the XOR problem with a multilayer perceptron and backpropagation, following the book by Virginie Mathivet, but my algorithm is not converging. I have tried a lot of things, but the result does not change. Here is my code:
class Neuron:
    def __init__(self, nb_inputs, bias):
        self.nb_inputs = nb_inputs
        self.bias = bias
        self.weights = [random()*2.0-1 for _ in range(nb_inputs+1)]
        self.deltas = [0.0 for _ in range(nb_inputs+1)]
        self.output = None

    def init_deltas(self):
        self.deltas = [0.0 for _ in range(self.nb_inputs+1)]

    def aggregation(self, inputs):
        return sum([self.weights[i] * inputs[i] for i in range(self.nb_inputs)]) + self.bias * self.weights[self.nb_inputs]

    def activation(self, value):
        return 1.0/(1.0+math.exp(-value))

    def compute_output(self, inputs):
        self.output = self.activation(self.aggregation(inputs))
        return self.output


class NeuralNetwork:
    def __init__(self, nb_inputs, nb_hidden, nb_outputs, learning_rate):
        self.nb_inputs = nb_inputs
        self.nb_hidden = nb_hidden
        self.nb_outputs = nb_outputs
        self.learning_rate = learning_rate
        self.output_layer = [Neuron(self.nb_hidden, 1.0) for _ in range(self.nb_outputs)]
        self.hidden_layer = [Neuron(self.nb_inputs, 1.0) for _ in range(self.nb_hidden)]

    def compute_outputs(self, inputs):
        hidden_outputs = [neuron.compute_output(inputs) for neuron in self.hidden_layer]
        outputs = [neuron.compute_output(hidden_outputs) for neuron in self.output_layer]
        return outputs[0]

    def init_deltas(self):
        for neuron in self.output_layer + self.hidden_layer:
            neuron.init_deltas()

    def train(self, data, nb_iterations):
        for _ in range(nb_iterations):
            self.init_deltas()
            for inputs in data:
                si, yi = self.compute_outputs(inputs), data[inputs]
                print si, yi
                # For each output neuron
                for neuron in self.output_layer:
                    # For each weight
                    for i in range(neuron.nb_inputs):
                        neuron.deltas[i] = si * (1 - si) * (yi - si)
                # For each hidden neuron
                for i in range(self.nb_hidden):
                    hidden_neuron = self.hidden_layer[i]
                    # For each weight
                    for k in range(hidden_neuron.nb_inputs):
                        total = 0.0
                        # For each output neuron
                        for output_neuron in self.output_layer:
                            total += output_neuron.deltas[i] * output_neuron.weights[i]
                        hidden_neuron.deltas[k] = hidden_neuron.output * (1 - hidden_neuron.output) * total
                # adjust weights in output_layer
                for neuron in self.output_layer:
                    for i in range(self.nb_hidden):
                        neuron.weights[i] += self.learning_rate * neuron.deltas[i] * neuron.output
                    neuron.weights[self.nb_hidden] += self.learning_rate * neuron.deltas[self.nb_hidden] * neuron.bias
                # adjust weights in hidden_layer
                for neuron in self.hidden_layer:
                    for i in range(self.nb_inputs):
                        neuron.weights[i] += self.learning_rate * neuron.deltas[i] * neuron.output
                    neuron.weights[self.nb_inputs] += self.learning_rate * neuron.deltas[self.nb_inputs] * neuron.bias

    def predict(self, inputs):
        return self.compute_outputs(inputs)
And to test it:
DATA = {
    (0, 0): 0,
    (1, 0): 1,
    (0, 1): 1,
    (1, 1): 0
}

nn = NeuralNetwork(2, 3, 1, 0.2)
nn.train(DATA, 50000)
nn.predict([0,0])
Thanks in advance for your help