I'm trying to implement a Neural Net in python without the use of libraries like Keras or Tensorflow. I still have to test the net, right now I just tried to train it on Iris dataset and check afterwards the correctness of the backpropagation algorithm.
To do so, I wrote the gradient checking procedure, calculating the analytical gradients and comparing them with the gradients from backpropagation.
The point is that, even if the backpropagation algorithm seems correct to me, the difference between the gradients is always high (around 0.8, instead of the classic 1e-7).
Layer class
class Dense(Layer):
def __init__(self, input_shape, name=None, activation='relu', regularization='l2'):
self.name = name
self.is_output = False
self.weights = np.random.uniform(low=0.01, high=0.10, size=input_shape)
self.biases = np.ones((1,input_shape[1]))
if activation == 'sigmoid':
self.activation = Activation_Sigmoid()
else: #activation == 'relu':
self.activation = Activation_ReLU()
self.cost = Categorical_CrossEntropyLoss()
def set_as_output(self, is_output=True):
self.is_output = is_output
def forward(self, inputs, debug=False, epsilon=None):
self.net_input = inputs
if debug:
augmented_parameters = np.zeros(epsilon.shape)
weights_column_vector = np.reshape(self.weights,(-1,1))
biases_column_vector = np.reshape(self.biases,(-1,1))
concatenated_parameters = np.concatenate((weights_column_vector, biases_column_vector))
for i in range(concatenated_parameters.shape[0]):
augmented_parameters[i] = concatenated_parameters[i]
# make the augmented parameter long as theta in order to sum them
# this because epsilon is a standard basis vector
augmented_parameters += epsilon
# rebuild the weights matrix and biases vector to apply forward propagation
weights_end = self.weights.shape[0] * self.weights.shape[1]
biases_end = self.biases.shape[0] * self.biases.shape[1] + weights_end
weights = np.reshape(augmented_parameters[0:weights_end],self.weights.shape)
biases = np.reshape(augmented_parameters[weights_end:biases_end], self.biases.shape)
output = np.dot(inputs, weights) + biases
activated_output = self.activation.forward(output)
return activated_output
self.output = np.dot(inputs, self.weights) + self.biases
self.activated_output = self.activation.forward(self.output)
return self.activated_output
def backward(self, X, y, output, step, l2=0.5): #backpropagation
m = X.shape[0] # number of examples
if self.is_output:
error = self.cost.backward(output, y) #(a_k - y_hat_k)
delta_k = self.activation.backward(self.output)* error
# net input for neuron k is a_j^(l-1)
grad = np.dot(self.net_input.T, delta_k)
#update weights with l2 regularization
self.grad_w = grad + (l2 / m)*self.weights
self.grad_b = np.sum(delta_k * 1,axis=0)
self.weights -= step * self.grad_w
self.biases -= step * self.grad_b
return np.dot(delta_k ,self.weights.T)
else:
delta_j = self.activation.backward(self.output) * output
grad = np.dot(self.net_input.T, delta_j)
self.grad_w = grad + (l2 / m) * self.weights
self.grad_b = np.sum(delta_j * 1, axis=0)
self.weights -= step * self.grad_w
self.biases -= step * self.grad_b
return np.dot(delta_j, self.weights.T)
def get_parameters(self):
return self.weights, self.biases
def get_gradients(self):
return self.grad_w, self.grad_b
Neural Net class
class NeuralNet():
def __init__(self):
self.layers = []
self.layers_output = []
self.cost = None
self.regularization = L2_Regularization()
def add(self,layer):
self.layers.append(layer)
def forward(self, inputs, debug=False, epsilon=None):
input = np.copy(inputs)
for layer in self.layers:
output = layer.forward(input, debug=debug, epsilon=epsilon)
input = output
return input
def backward(self, X, y, output, step):
prev_delta = None
out = output
for layer in self.layers[::-1]:
prev_delta = layer.backward(X, y, out, step)
out = prev_delta
def fit(self, X, y, batch_size=1, epochs=10, step=0.05, shuffle=True):
self.layers[-1].set_as_output()
self.error = []
i = 0.005 * epochs
for epoch in range(epochs):
if shuffle:
X = np.random.permutation(X)
batches = int(np.ceil(X.shape[0]/batch_size))
batches_error = []
for t in range(batches):
batch_X = X[t*batch_size:np.min([X.shape[0],(t+1)*batch_size]),:]
batch_y = y[t*batch_size:np.min([y.shape[0],(t+1)*batch_size]),:]
output = self.forward(batch_X)
cost = self.cost.forward(output,batch_y)
cost += self.regularization.forward(X, self.layers)
batches_error.append(cost)
self.backward(batch_X, batch_y, output, step)
self.error.append(np.mean(batches_error))
if epoch % i == 0:
print('epoch:', epoch, 'error:', np.mean(self.error))
return self
def parameters_to_theta(self):
theta = []
for layer in self.layers:
w, b = layer.get_parameters()
#flatten parameter w
new_vector = np.reshape(w, (-1,1))
theta.append(new_vector)
#flatten parameter b
new_vector = np.reshape(b, (-1,1))
theta.append(new_vector)
return np.vstack(theta)
def gradients_to_theta(self):
theta = []
for layer in self.layers:
grad_w, grad_b = layer.get_gradients()
new_vector = np.reshape(grad_w, (-1,1))
theta.append(new_vector)
new_vector = np.reshape(grad_b, (-1,1))
theta.append(new_vector)
return np.vstack(theta)
def gradient_check(self, X, y, epsilon=1e-7):
theta = self.parameters_to_theta()
dtheta = self.gradients_to_theta()
num_parameters = theta.shape[0]
J_plus = np.zeros((num_parameters, 1))
J_minus = np.zeros((num_parameters, 1))
dtheta_approx = np.zeros((num_parameters, 1))
for i in range(num_parameters):
theta_plus = np.zeros((num_parameters,1))
theta_plus[i] = epsilon
J_plus[i] = self.cost.forward(self.forward(X, debug=True, epsilon=theta_plus),y)
theta_minus = np.zeros((num_parameters,1))
theta_minus[i] = - epsilon
J_minus[i] = self.cost.forward(self.forward(X, debug=True, epsilon=theta_minus),y)
dtheta_approx[i] = (J_plus[i] - J_minus[i])/ (2 * epsilon)
numerator = np.linalg.norm(dtheta - dtheta_approx)
denominator = np.linalg.norm(dtheta_approx) + np.linalg.norm(dtheta)
difference = numerator / denominator
return difference
I'm using ReLU and Sigmoid as activation functions, and Categorical Cross Entropy for the cost
import numpy as np
from scipy.special import expit as sigmoid
class Activation_ReLU:
def forward(self, inputs):
return np.maximum(0, inputs)
def backward(self, inputs):
return np.greater(inputs,0).astype(int)
class Activation_Sigmoid:
def forward(self, inputs):
return sigmoid(inputs)
def backward(self, inputs):
return sigmoid(inputs) * (1 - sigmoid(inputs))
class Categorical_CrossEntropyLoss():
def forward(self, y_pred, y_real):
predictions = np.copy(y_pred)
predictions = np.clip(predictions, 1e-12, 1 - 1e-12) # avoid zero values for log
n = y_real.shape[0]
return - (1 / n) * np.sum(y_real * np.log(y_pred))
def backward(self, y_pred, y_real):
return y_real - y_pred
These are the main classes that define the net. The model that I create to train on Iris dataset is a NN with 1 hidden layer.
# random seed is 1
X, y = load_iris(return_X_y=True)
X = (X - np.mean(X)) / np.std(X) # standardize data to improve network convergence
y = y.reshape((-1,1))
encoder = OneHotEncoder(sparse=False)
y = encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8)
model = NeuralNet()
model.add(Dense((4,10),name='input_layer',activation='relu'))
model.add(Dense((10,10),name='hidden_layer',activation='relu'))
model.add(Dense((10,3),name='output_layer',activation='sigmoid'))
model.fit(X_train,y_train, batch_size=5, epochs=200, step=1e-3)
difference = model.gradient_check(X_train, y_train)
And then, the result of print(difference) is
0.7992920544491866
So there is something wrong with my implementation. What things I have to check to determine the causes of this high difference between gradients?
Related
I am trying to use the vanilla transformer from PyTorch using Pytorch Lightning. I tried to test the model with a reverse number task. So given [1, 3, 5, 4, 13, 19] it returns [1, 13, 4, 5, 3, 19] with 1, 19 being start and end token respectively. The full code is below. The code can run without error but there seems to be a problem with the backpropagation. The training loss does go down at first but it doesn't go beyond 2.8 and the accuracy doesn't go beyond 11%.
It seems that part of the model is able to optimize, I am guessing it is because the weights located in Embeddings and Generator can backpropagate, but weights located in nn.Transformer cannot? I am really not sure.
import math
import torch.nn.functional as F
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
class Embeddings(pl.LightningModule):
def __init__(self, d_model, vocab):
super(Embeddings, self).__init__()
self.lut = nn.Embedding(vocab, d_model)
self.d_model = d_model
def forward(self, x):
a = self.lut(x) * math.sqrt(self.d_model)
return a
class PositionalEncoding(pl.LightningModule):
def __init__(self, d_model, dropout, max_len=5000):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(p=dropout)
# Compute the positional encodings once in log space.
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2) *
-(math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)
self.register_buffer('pe', pe)
def forward(self, x):
x = x + self.pe[:, :x.size(1)]
return self.dropout(x)
class Generator(pl.LightningModule):
def __init__(self, size):
super(Generator, self).__init__()
self.proj = nn.Linear(512, size)
def forward(self, x):
return F.log_softmax(self.proj(x), dim=-1)
class Model(pl.LightningModule):
def __init__(self, src_embed, tgt_embed, transformer, generator):
super(Model, self).__init__()
self.src_embed = src_embed
self.tgt_embed = tgt_embed
self.transformer = transformer
self.generator = generator
self.valLoss = 0
self.valAcc = 0
self.automatic_optimization = False
self.optimizer = None
for p in self.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)
def forward(self, x, y, tgt_mask=None):
x = self.src_embed(x)
y = self.tgt_embed(y)
return self.generator(self.transformer(x, y, tgt_mask=tgt_mask))
def training_step(self, batch, batch_idx):
if self.optimizer is None:
self.optimizer = self.optimizers()
batch = Batch(batch[0], batch[1])
tgt_mask = batch.trg_mask.squeeze(0)
tgt_mask = (tgt_mask != True)
output = self(batch.src, batch.trg, tgt_mask)
criterion = LossCompute(V)
loss = criterion.forward(output.contiguous().view(-1, output.size(-1)), batch.trg_y.contiguous().view(-1)) / batch.ntokens
loss.backward()
self.optimizer.step()
self.optimizer.zero_grad()
self.log('train_loss', loss)
print(loss)
def validation_step(self, batch, batch_idx):
batch = Batch(batch[0], batch[1])
tgt_mask = batch.trg_mask.squeeze(0)
tgt_mask = (tgt_mask != True)
output = self(batch.src, batch.trg, tgt_mask)
criterion = LossCompute(V)
loss = criterion.forward(output.view(-1, output.size(-1)), batch.trg_y.contiguous().view(-1)) / batch.ntokens
self.log('val_loss', loss)
self.valLoss += loss
if batch_idx % 10 == 0:
print(loss)
if batch_idx == 99:
print(self.valLoss/100)
self.valLoss = 0
return {"x": output, "trg": batch.trg_y, "index": batch_idx}
def validation_step_end(self, batch):
output, trg, idx = batch["x"], batch["trg"], batch["index"]
accuracy = getAccuracy(output, trg)
self.log("accuracy", accuracy)
self.valAcc += accuracy
if idx == 99:
print(self.valAcc/100)
self.valAcc = 0
def train_dataloader(self):
data = data_gen(V, 0, 3000)
return DataLoader(data, batch_size=30, shuffle=False, num_workers=2, pin_memory=True)
def val_dataloader(self):
data = data_gen(V, 1, 1000)
return DataLoader(data, batch_size=10, shuffle=False, num_workers=2, pin_memory=True)
def configure_optimizers(self):
return torch.optim.Adam(self.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-9)
class LossCompute(pl.LightningModule):
def __init__(self, size):
super(LossCompute, self).__init__()
self.criterion = nn.KLDivLoss(reduction='sum')
self.size = size
self.true_dist = None
def forward(self, x, target):
# x has size (batch_size x length, vocab_size)
assert x.size(1) == self.size
true_dist = x.data.clone()
true_dist.fill_(0)
true_dist.scatter_(1, target.data.unsqueeze(1).long(), 1)
self.true_dist = true_dist
return self.criterion(x, true_dist)
# prepare data
class Batch:
"Object for holding a batch of data with mask during training."
def __init__(self, src, trg=None):
self.src = src
if trg is not None:
self.trg = trg[:, :-1]
self.trg_y = trg[:, 1:]
self.trg_mask = \
self.make_std_mask(self.trg)
self.ntokens = self.trg_y.size(0) * self.trg_y.size(1)
print("")
#staticmethod
def make_std_mask(tgt):
"Create a mask to hide padding and future words."
tgt_mask = subsequent_mask(tgt.size(-1)).type_as(tgt.data)
return tgt_mask
def subsequent_mask(size):
"Mask out subsequent positions."
attn_shape = (1, size, size)
subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
return torch.from_numpy(subsequent_mask) == 0
def data_gen(V, randomSeed, totalTrainingSample):
np.random.seed(randomSeed)
x = torch.from_numpy(np.random.randint(2, V - 2, size=(totalTrainingSample, 10)))
y = torch.flip(torch.flip(x, [0, 1]), [0])
x[:, 0] = 1
y[:, 0] = 1
x[:, -1] = V - 1
y[:, -1] = V - 1
return list(zip(x, y))
def getAccuracy(x, trg):
totalValAcc = 0
totalValAccToken = 0
trg = trg.contiguous().view(-1)
out = x.view(-1, x.size(-1)) # (batch_size * tgt_length, src_vocab)
_, index = torch.max(out, dim=-1) # index (batch_size * tgt_length)
correct = list((trg == index)).count(True)
totalValAcc += correct
totalValAccToken += index.size(0)
return totalValAcc / totalValAccToken
V = 20
transformer = nn.Transformer(num_encoder_layers=2, num_decoder_layers=2, batch_first=True)
PositionEnc = PositionalEncoding(512, 0.1)
src_emb = Embeddings(512, V)
tgt_emb = Embeddings(512, V)
gen = Generator(V)
if __name__ == '__main__':
model = Model(nn.Sequential(src_emb, PositionEnc), nn.Sequential(tgt_emb, PositionEnc), transformer, gen)
earlyStopping = EarlyStopping(monitor='val_loss', patience=3)
trainer = pl.Trainer(max_epochs=10, callbacks=[earlyStopping])
trainer.fit(model)
this is fragment of my code
def train(self, features, targets):
for X, y in zip(features, targets):
X = X.reshape(1, X.shape[0])
outputs = self.feed_forward(X)
when I try to use the method with data:
train(np.array([gameDataList[n].ball_position, gameDataList[n].wall_position]), np.array(gameDataList[n].upOrDown))
where gameDataList[n].upOrDown is an array e.g. [0.1, 0.9], and gameDataList[n].ball_position and gameDataList[n].wall_position are floats, I get this error.
Full code:
#### Imports ####
import numpy as np
#### Neural Network Class ####
class MLP:
##### Constructor ####
def __init__(self, n_input_nodes, hidden_nodes, n_output_nodes, lr):
## Network ##
self.n_input_nodes = n_input_nodes
self.n_output_nodes = n_output_nodes
self.nodes = hidden_nodes
self.nodes.insert(0, n_input_nodes)
self.nodes.append(n_output_nodes)
## Weights and Biases##
self.weights = []
self.biases = []
for i in range(1, len(self.nodes)):
self.weights.append(np.random.uniform(-1.0, 1.0, (self.nodes[i - 1], self.nodes[i])))
self.biases.append(np.random.uniform(-1.0, 1.0, (1, self.nodes[i])))
## Learning Rate ##
self.lr = lr
## Activation Functions ##
# Linear Activation
self.linear = lambda x: x
self.d_linear = lambda x: np.ones(x.shape)
# Relu Activation
def relu(x):
x[x < 0] = 0
return x
def d_relu(out):
out: x[x > 0] = 1
return out
self.relu = relu
self.d_relu = d_relu
# Sigmoid Activation
self.sigmoid = lambda x: 1 / (1 + np.exp(-x))
self.d_sigmoid = lambda out: out * (1 - out) # assumes out is tanh(x)
# Hyperbolic Tangent Activation
self.tanh = lambda x: np.tanh(x)
self.d_tanh = lambda out: 1 - out ** 2 # assumes out is tanh(x)
def getWeights(self):
return self.weights.copy()
def getBiases(self):
return self.biases.copy()
def setWeights(self, weights):
self.weights = weights.copy()
def setBiases(self, biases):
self.biases = biases.copy()
#### Feed Forward ####
def feed_forward(self, X):
outputs = [X]
logits = np.dot(X, self.weights[0]) + self.biases[0]
for i in range(1, len(self.nodes) - 1):
out = self.sigmoid(logits)
outputs.append(out)
logits = np.dot(out, self.weights[i]) + self.biases[i]
out = self.sigmoid(logits)
outputs.append(out)
return outputs
#### Backpropagation ####
def backpropagation(self, X, y, outputs):
weights_gradients = []
biases_gradients = []
d1 = y - outputs[-1]
d2 = self.d_sigmoid(outputs[-1])
error = d1 * d2
grad = outputs[-2].T * error
weights_gradients.append(grad)
biases_gradients.append(error)
for i in range(len(self.weights) - 2, 1, -1):
d = self.d_sigmoid(outputs[i])
error = np.dot(error, self.weights[i + 1].T) * d
grad = outputs[i - 1].T * error
weights_gradients.append(grad)
biases_gradients.append(error)
return weights_gradients, biases_gradients
#### Training ####
def train(self, features, targets):
# Batch Size for weight update step
batch_size = features.shape[0]
# Delta Weights Variables
delta_weights = [np.zeros(weight.shape) for weight in self.weights]
delta_biases = [np.zeros(bias.shape) for bias in self.biases]
# For every data point, forward pass, backpropogation, store weights change
for X, y in zip(features, targets):
# Forward pass
X = X.reshape(1, X.shape[0])
outputs = self.feed_forward(X)
# Back propogation
weights_gradients, biases_gradients = self.backpropagation(X, y, outputs)
for i in range(len(weights_gradients)):
delta_weights[-(i + 1)] += weights_gradients[i]
delta_biases[-(i + 1)] += biases_gradients[i]
for i in range(len(delta_weights)):
self.weights[i] += (self.lr * delta_weights[i]) / batch_size
self.biases[i] += (self.lr * delta_biases[i]) / batch_size
#### Testing Methods ####
def predict(self, X):
# Gives prediction
return self.feed_forward(X)[-1]
def test(self, features, targets):
predictions = self.predict(features)
n_correct = 0
for i in range(len(predictions)):
prediction = np.argmax(predictions[i])
correct = np.argmax(targets[i])
if prediction == correct:
n_correct += 1
return n_correct / len(targets)
class GameData:
def __init__(self, ball_position, wall_position, upOrDown):
self.wall_position = wall_position
self.ball_position = ball_position
self.upOrDown = upOrDown
I collect data, and train my network, in this way:
gameDataList.append(GameData(ball.trt.ycor(), b.trt.ycor(), [0.1, 0.9]))
mlp = MLP(2, [32, 32], 2, 0.0001)
n = random.randint(0, 999)
mlp.train(np.array([gameDataList[n].ball_position, gameDataList[n].wall_position]), np.array(gameDataList[n].upOrDown))
Problem solved. It was needed to write two square brackets instead of one.
wrong example:
np.array([gameDataList[n].ball_position, gameDataList[n].wall_position])
correct example:
np.array([[gameDataList[n].ball_position, gameDataList[n].wall_position]])
I have CSI data with the following shape (5520, 90, 200) being samples, the stacked antennas and subcarriers and time frame. I am trying to build a variational autoencoder however I keep getting stuck on the training part as it says that the matrixes I am inputting cannot be multiplied.
class Encoder(nn.Module):
def __init__(self, z_dim):
super(Encoder, self).__init__()
x_dim = 90
# Layer 1
self.encoder = nn.Sequential()
self.encoder.add_module('fc1', nn.Linear(x_dim, 360))
self.encoder.add_module('Sigmoid1', nn.Sigmoid())
self.encoder.add_module('BN1', nn.BatchNorm1d(360))
self.encoder.add_module('DO1', nn.Dropout())
# Layer 2
self.encoder.add_module('fc2', nn.Linear(360, 50))
self.encoder.add_module('Sigmoid2', nn.Sigmoid())
self.encoder.add_module('BN2', nn.BatchNorm1d(50))
self.encoder.add_module('DO2', nn.Dropout())
# Latent
self.k_mu = nn.Linear(50, z_dim)
self.k_sigma = nn.Linear(50, z_dim)
def reparameterize(self, mean, logvar):
eps = np.random.normal(shape=mean.shape)
return eps * tf.exp(logvar * .5) + mean
def forward(self, x):
h = self.encoder(x)
k_mu = self.k_mu(h)
k_sigma = self.k_sigma(h)
z_dim = reparameterize(k_mu, log(k_sigma))
return z_dim
class Decoder(nn.Module):
def __init__(self, z_dim):
super(Decoder, self).__init__()
x_dim = 90
#1
self.decoder = nn.Sequential()
self.decoder.add_module('fc3', nn.Linear(z_dim, 50))
self.decoder.add_module('Sigmoid3', nn.Sigmoid())
self.decoder.add_module('BN3', nn.BatchNorm1d(50))
self.decoder.add_module('DO3', nn.Dropout())
#2
self.decoder.add_module('fc4', nn.Linear(50, 360))
self.decoder.add_module('Sigmoid4', nn.Sigmoid())
self.decoder.add_module('BN4', nn.BatchNorm1d(360))
self.decoder.add_module('DO4', nn.Dropout())
self.decoder.add_module('x_recon', nn.Linear(360, x_dim))
def forward(self, z):
x_ = self.decoder(z)
return x_
class VAE(nn.Module):
def __init__(self, z_dim):
super(VAE, self).__init__()
self.encoder = Encoder(z_dim)
self.decoder = Decoder(z_dim)
def sample(self, z_mu, z_log_var):
z_std = torch.exp(z_log_var)
eps = torch.randn_like(z_std)
z = z_mu + z_std*eps
return z
def forward(self, x):
# Encode
z_mu, z_log_var = self.encoder(x)
# Sample
if self.training:
# Sample if we are training
z = self.sample(z_mu, z_log_var)
else:
z = z_mu
# Decode
x_ = self.decoder(z)
return x_, z_mu, z_log_var
vae = VAE(z_dim=10).cuda()
optimizer = optim.Adam(vae.parameters())
epochs = 20 #Number of epochs
# Loop over epochs
for epoch in range(1, epochs + 1):
vae.train()
train_loss = 0
# Loop over batches. Note that the labels are ignored in this case
for batch_idx, (data) in enumerate(train_loader):
data = data.cuda()
optimizer.zero_grad()
data = data.view(data.size(1), -1)
x_recon, z_mu, z_log_var = vae(data)
# Calculate the loss
KL_loss, E_loss = loss_function(x_recon, data, z_mu, z_log_var)
loss = KL_loss + E_loss
loss.backward()
# Accumulate the loss to calculate the average
train_loss += loss.item()
# Step the optimizer
optimizer.step()
if batch_idx % 100 == 0:
print('Epoch: {} [{}/{} ({:.0f}%)]\tKL Loss: {:.6f}\tBC Loss: {:.6f}\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), KL_loss.item() / len(data), E_loss.item() / len(data), loss.item() / len(data)))
print('====> Epoch: {} Average loss: {:.4f}'.format(epoch, train_loss / len(train_loader.dataset)))
And I get the following error: mat1 and mat2 shapes cannot be multiplied (90x12800 and 90x360)
I cannot figure out why I am getting it.
I am training simple variational autoencoder with negative binomial likelihood for decoder. I used python 3.7.1 and tensorflow 2.0.0.
The model was trained well without any problems for tens of epochs, but all weights, loss, and gradients suddenly became NaN during training. I modified the code to find which variables become NaN first (among weights, loss, and gradients) and found that gradients first became nan and this affected other variables.
I have googled similar issues but most of case nan appeared in the loss, which is different from this case. I tried: smaller learning rate, clipping loss... but nothing could resolve the problem.
Here is the model class for autoencoder model:
class Encoder(tf.keras.layers.Layer):
def __init__(self, hidden_dim, latent_dim):
super(Encoder, self).__init__()
self.encoder_fc_1 = tf.keras.layers.Dense(hidden_dim, activation=tf.nn.leaky_relu)
self.encoder_fc_2 = tf.keras.layers.Dense(hidden_dim, activation=tf.nn.leaky_relu)
self.encoder_latent = tf.keras.layers.Dense(latent_dim + latent_dim)
def call(self, input):
h = tf.math.l2_normalize(input, 1)
h = self.encoder_fc_1(h)
h = self.encoder_fc_2(h)
return tf.split(self.encoder_latent(h), num_or_size_splits=2, axis=1)
class Decoder(tf.keras.layers.Layer):
def __init__(self, hidden_dim, vocab_size):
super(Decoder, self).__init__()
self.decoder_fc_1 = tf.keras.layers.Dense(hidden_dim, activation=tf.nn.leaky_relu)
self.decoder_fc_2 = tf.keras.layers.Dense(hidden_dim, activation=tf.nn.leaky_relu)
self.decoder_fc_3 = tf.keras.layers.Dense(vocab_size + vocab_size)
def call(self, z):
h = self.decoder_fc_1(z)
h = self.decoder_fc_2(h)
return tf.split(self.decoder_fc_3(h), num_or_size_splits=2, axis=1)
class NBVAE(tf.keras.Model):
def __init__(self, config):
super(NBVAE, self).__init__()
self.optimizer = tf.keras.optimizers.Adam(config["learning_rate"])
self.encoder = Encoder(config["hidden_dim"], config["latent_dim"])
self.decoder = Decoder(config["hidden_dim"], config["vocab_size"])
def call(self, input):
mean, logvar = self.encoder(input)
z = reparameterize(mean, logvar)
h_r, h_p = self.decoder(z)
return mean, logvar, z, h_r, h_p
def reparameterize(mean, logvar):
eps = tf.random.normal(shape=mean.shape)
return tf.add(tf.multiply(eps, tf.math.exp( tf.math.divide(logvar, 2))), mean)
def log_normal_pdf(sample, mean, logvar, raxis=1):
log2pi = tf.math.log(2. * np.pi)
return tf.reduce_sum(-.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi), axis=raxis)
def compute_logpx_z(input, h_r, h_p):
temp = tf.exp(-tf.multiply(tf.exp(h_r), tf.math.log(tf.exp(h_p) + 1)))
temp_cliped = tf.clip_by_value(temp, 1e-5, 1 - 1e-5)
ll = tf.multiply(input, tf.math.log(1 - temp_cliped)) + tf.multiply(1 - input, tf.math.log(temp_cliped))
#print("logpx_z: {}".format(tf.reduce_sum(ll, axis=-1)))
return tf.reduce_sum(ll, axis=-1), temp
def compute_loss(model, input):
mean, logvar, z, h_r, h_p = model(input)
logpx_z, temp = compute_logpx_z(input, h_r, h_p)
logpz = log_normal_pdf(z, 0., 0.)
logqz_x = log_normal_pdf(z, mean, logvar)
return tf.negative(tf.reduce_mean(logpx_z + logpz - logqz_x)), temp
and here is the code snippet for training the model.
I put some if statements in the middle of the code to check which variable become NaN first.
print("start training...")
num_batches = int(np.ceil(len(training_data) / batch_size))
epoch_loss = []
for epoch in range(epochs):
print("epoch: {}".format(epoch+1))
progbar = tf.keras.utils.Progbar(num_batches)
loss_record = []
for i in range(num_batches):
x_batch = training_data[i*batch_size:(i+1)*batch_size]
x_batch = one_hot(x_batch, depth=len(concept2id))
with tf.GradientTape() as tape:
loss, temp = compute_loss(nbvae, x_batch)
print("step{s} loss: {l}".format(s=i, l=loss.numpy()))
# checking the loss
if np.isnan(loss.numpy()):
print("nan loss is detected")
detect_nan = True
break
loss_record.append(loss.numpy())
gradients = tape.gradient(loss, nbvae.trainable_variables)
#gradients, global_norm = tf.clip_by_global_norm(tape.gradient(loss, nbvae.trainable_variables), 10)
print("checking gradients...")
gradient_nancount = 0
for _, grad in enumerate(gradients):
gradient_nancount += np.sum(np.isnan(grad))
if gradient_nancount != 0:
print("nan is detected in gradients")
print("saving the current gradients and weights...")
save_data(os.path.join(output_path, "error_gradients.pkl"), gradients)
save_data(os.path.join(output_path, "error_tvariables.pkl"), nbvae.trainable_variables)
detect_nan = True
break
nbvae.optimizer.apply_gradients(zip(gradients, nbvae.trainable_variables))
print("checking the updated weights...")
weight_nancount = 0
for _, weight in enumerate(nbvae.weights):
weight_nancount += np.sum(np.isnan(weight))
if weight_nancount != 0:
print("nan is detected in weights")
print("saving the current gradients and weights...")
save_data(os.path.join(output_path, "error_gradients.pkl"), gradients)
save_data(os.path.join(output_path, "error_tvariables.pkl"), nbvae.trainable_variables)
detect_nan = True
break
progbar.add(1)
if detect_nan:
epoch_loss.append(np.nan)
nbvae.save_weights(os.path.join(output_path, "nbvae_error{}".format(epoch+1)))
break
print("average epoch loss: {}".format(np.mean(loss_record)))
epoch_loss.append(np.mean(loss_record))
Anyone knows the way to resolve this problem or possible reasons? Thank you for your time in advance.
I am trying to deal with neural networks, but I'm very new to this. I've never understood the different implementations I found.
So I tried to make a simple implementation of the XOR problem with a MultiLayer Perceptron and backpropagation according to the book of Virginie MATHIVET, but my algorithm is not converging. I tried a lot of things but the result does not change. Here is my code:
class Neuron:
def __init__(self, nb_inputs, bias):
self.nb_inputs = nb_inputs
self.bias = bias
self.weights = [random()*2.0-1 for _ in range(nb_inputs+1)]
self.deltas = [0.0 for _ in range(nb_inputs+1)]
self.output = None
def init_deltas(self):
self.deltas = [0.0 for _ in range(self.nb_inputs+1)]
def aggregation(self, inputs):
return sum([self.weights[i] * inputs[i] for i in range(self.nb_inputs)]) + self.bias * self.weights[self.nb_inputs]
def activation(self, value):
return 1.0/(1.0+math.exp(-value))
def compute_output(self, inputs):
self.output = self.activation(self.aggregation(inputs))
return self.output
class NeuralNetwork:
def __init__(self, nb_inputs, nb_hidden, nb_outputs, learning_rate):
self.nb_inputs = nb_inputs
self.nb_hidden = nb_hidden
self.nb_outputs = nb_outputs
self.learning_rate = learning_rate
self.output_layer = [Neuron(self.nb_hidden, 1.0) for _ in range(self.nb_outputs)]
self.hidden_layer = [Neuron(self.nb_inputs, 1.0) for _ in range(self.nb_hidden)]
def compute_outputs(self, inputs):
hidden_outputs = [neuron.compute_output(inputs) for neuron in self.hidden_layer]
outputs = [neuron.compute_output(hidden_outputs) for neuron in self.output_layer]
return outputs[0]
def init_deltas(self):
for neuron in self.output_layer + self.hidden_layer:
neuron.init_deltas()
def train(self, data, nb_iterations):
for _ in range(nb_iterations):
self.init_deltas()
for inputs in data:
si, yi = self.compute_outputs(inputs), data[inputs]
print si, yi
# For each output neuron
for neuron in self.output_layer:
# For each weight
for i in range(neuron.nb_inputs):
neuron.deltas[i] = si * (1 - si) * (yi - si)
# For each hidden neuron
for i in range(self.nb_hidden):
hidden_neuron = self.hidden_layer[i]
# For each weight
for k in range(hidden_neuron.nb_inputs):
total = 0.0
# For each output neuron
for output_neuron in self.output_layer:
total += output_neuron.deltas[i] * output_neuron.weights[i]
hidden_neuron.deltas[k] = hidden_neuron.output * (1 - hidden_neuron.output) * total
# adjust weights in output_layer
for neuron in self.output_layer:
for i in range(self.nb_hidden):
neuron.weights[i] += self.learning_rate * neuron.deltas[i] * neuron.output
neuron.weights[self.nb_hidden] += self.learning_rate * neuron.deltas[self.nb_hidden] * neuron.bias
# adjust weights in hidden_layer
for neuron in self.hidden_layer:
for i in range(self.nb_inputs):
neuron.weights[i] += self.learning_rate * neuron.deltas[i] * neuron.output
neuron.weights[self.nb_inputs] += self.learning_rate * neuron.deltas[self.nb_inputs] * neuron.bias
def predict(self, inputs):
return self.compute_outputs(inputs)
And to test it:
DATA = {
(0, 0): 0,
(1, 0): 1,
(0, 1): 1,
(1, 1): 0
}
nn = NeuralNetwork(2, 3, 1, 0.2)
nn.train(DATA, 50000)
nn.predict([0,0])
Thanks in advance for your help