I'm studying pipeline model parallelism with TensorFlow 2 and MPI. But I can't figure out how to apply the chain rule when using multiple tf.GradientTape across multiple processes.
Here is the code I'm currently working on:
import tensorflow as tf
from mpi4py import MPI

minibatch_size = 64

class Input(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.flatten = tf.keras.layers.Flatten()
        self.dense = tf.keras.layers.Dense(128, activation='relu')

    def call(self, inputs, **kwargs):
        x = self.flatten(inputs)
        x = self.dense(x)
        return x

class Block(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.dense_1 = tf.keras.layers.Dense(128, activation='relu')
        self.dense_2 = tf.keras.layers.Dense(128, activation='relu')

    def call(self, inputs, **kwargs):
        x = self.dense_1(inputs)
        x = self.dense_2(x)
        return x

class Head(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.dropout = tf.keras.layers.Dropout(0.2)
        self.dense = tf.keras.layers.Dense(10, activation='softmax')

    def call(self, inputs, **kwargs):
        x = self.dropout(inputs)
        x = self.dense(x)
        return x
class Trainer:
    def __init__(self,
                 comm,
                 model: tf.keras.Model,
                 optimizer: tf.keras.optimizers.Optimizer,
                 loss_fn: tf.keras.losses.Loss):
        self._comm = comm
        self._size = comm.Get_size()
        self._rank = comm.Get_rank()
        self._next_rank = self._rank + 1 if self._rank + 1 < self._size else MPI.PROC_NULL
        self._prev_rank = self._rank - 1 if self._rank - 1 >= 0 else MPI.PROC_NULL
        self._model = model
        self._optimizer = optimizer
        self._loss_fn = loss_fn

    def _is_first_node(self) -> bool:
        return self._rank == 0

    def _is_last_node(self) -> bool:
        return self._rank == self._size - 1
    def _forward_pass(self, minibatch):
        assert minibatch_size % self._size == 0
        microbatch_size = minibatch_size // self._size
        microbatches = tf.data.Dataset \
            .from_tensor_slices(minibatch) \
            .batch(microbatch_size)
        predictions = []
        tapes = []
        losses = []
        for microbatch in microbatches:
            x, y = microbatch
            with tf.GradientTape() as tape:
                if self._is_first_node():
                    prediction = self._model(x)
                    self._comm.send(prediction, dest=self._next_rank)
                elif self._is_last_node():
                    recvd = self._comm.recv(source=self._prev_rank)
                    prediction = self._model(recvd)
                    loss = self._loss_fn(y, prediction)
                    losses.append(loss)
                else:
                    recvd = self._comm.recv(source=self._prev_rank)
                    prediction = self._model(recvd)
                    self._comm.send(prediction, dest=self._next_rank)
            predictions.append(prediction)
            tapes.append(tape)
        return predictions, tapes, losses
    def _backward_pass(self, predictions, tapes, losses):
        grads = []
        for i in range(self._size):
            if self._is_first_node():
                errors = self._comm.recv(source=self._next_rank)
                grad = tapes[i].gradient(predictions[i],
                                         self._model.trainable_weights,
                                         output_gradients=errors)
            elif self._is_last_node():
                grad = tapes[i].gradient(losses[i], self._model.trainable_weights)
                self._comm.send(grad, dest=self._prev_rank)
            else:
                errors = self._comm.recv(source=self._next_rank)
                grad = tapes[i].gradient(predictions[i],
                                         self._model.trainable_weights,
                                         output_gradients=errors)
                self._comm.send(grad, dest=self._prev_rank)
            grads.append(grad)
        grads = [tf.reduce_mean(grad, axis=0) for grad in grads]
        self._optimizer.apply_gradients(zip(grads, self._model.trainable_weights))

    def train_minibatch(self, minibatch):
        predictions, tapes, losses = self._forward_pass(minibatch)
        self._backward_pass(predictions, tapes, losses)
def main():
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0
    n_train = len(x_train)
    n_minibatch = n_train // minibatch_size
    x_train = tf.data.Dataset \
        .from_tensor_slices((x_train, y_train)) \
        .batch(minibatch_size, drop_remainder=True) \
        .shuffle(len(x_train))
    optimizer = tf.keras.optimizers.Adam()
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
    if rank == 0:
        model = Input()
    elif rank == size - 1:
        model = Head()
    else:
        model = Block()
    trainer = Trainer(comm, model, optimizer, loss_fn)
    if rank == 0:
        progbar = tf.keras.utils.Progbar(n_minibatch)
    for minibatch in x_train:
        trainer.train_minibatch(minibatch)
        if rank == 0:
            progbar.add(1)

if __name__ == '__main__':
    main()
However, running this code with
mpirun -n 4 python main.py
produces the following error:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Inputs to operation ReluGrad of type ReluGrad must have the same size and shape. Input 0: [128,10] != input 1: [16,128] [Op:ReluGrad]
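For context, my current understanding of how the chain rule can be carried across two tapes in a single process is the minimal sketch below (output_gradients injects the gradient coming from the downstream stage); what I can't see is how to make this work once the stages live in different MPI processes:

import tensorflow as tf

# Two "stages" of a tiny pipeline, both in one process.
stage1 = tf.keras.layers.Dense(4)
stage2 = tf.keras.layers.Dense(1)
x = tf.random.normal((8, 3))
y = tf.random.normal((8, 1))

with tf.GradientTape() as tape1:
    h = stage1(x)                                # stage 1 activations
with tf.GradientTape(persistent=True) as tape2:
    tape2.watch(h)                               # h is a plain tensor, so watch it explicitly
    loss = tf.reduce_mean((stage2(h) - y) ** 2)

grads2 = tape2.gradient(loss, stage2.trainable_weights)   # stage 2 weight gradients
dh = tape2.gradient(loss, h)                               # dL/dh, to be sent "upstream"
grads1 = tape1.gradient(h, stage1.trainable_weights,
                        output_gradients=dh)               # chain rule into stage 1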
Could any expert show me how to do this properly?
Related
I'm new to PyTorch and can't figure out what I'm doing wrong; below is the code:
import numpy as np
import torch
import torch.nn as nn
from sklearn import datasets

x_np, y_np = datasets.make_regression(n_samples=100, n_features=1, noise=20, random_state=0)
x = torch.from_numpy(x_np.astype(np.float32))
y = torch.from_numpy(y_np.astype(np.float32))
y = y.view(y.shape[0], 1)
n_samples, n_features = x.shape

class Regression(nn.Module):
    def __init__(self, inputsize, outputsize, hiddensize):
        super(Regression, self).__init__()
        self.hidden_size = hiddensize
        self.input_size = inputsize
        self.output_size = outputsize
        self.i2h = nn.Linear(self.input_size + self.hidden_size, self.hidden_size)
        self.h2o = nn.Linear(self.input_size + self.hidden_size, self.output_size)

    def forward(self, x):
        hidden = torch.zeros(1, self.hidden_size)
        print(x.shape)
        print(hidden.shape)
        combined = torch.cat((x, hidden), 1)
        hidden = self.i2h(combined)
        output = self.h2o(combined)
        return output

model = Regression(n_features, n_features, 16)
lr = 0.01
loss = nn.MSELoss()
opt = torch.optim.SGD(model.parameters(), lr=lr)

for epoch in range(1000):
    ypred = model(x)
    l = loss(y, ypred)
    l.backward()
    opt.step()
    opt.zero_grad()
    if epoch % 100 == 0:
        [w, b] = model.parameters()
        print(f'epoch {epoch+1}: w = {w[0][0].item():.3f}, loss = {l:.8f}')
While training, I am getting this error
RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 100 but got size 1 for tensor number 1 in the list
i2h maps the self.input_size + self.hidden_size input dimensions to self.hidden_size, so for h2o you have to define a mapping that starts from self.hidden_size dimensions, and you have to update forward accordingly. The hidden state also has to share x's batch dimension before the two are concatenated, which is what the size-mismatch error is about. Here is the complete code:
class Regression(nn.Module):
    def __init__(self, inputsize, outputsize, hiddensize):
        super(Regression, self).__init__()
        self.hidden_size = hiddensize
        self.input_size = inputsize
        self.output_size = outputsize
        self.i2h = nn.Linear(self.input_size + self.hidden_size, self.hidden_size)
        self.h2o = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x):
        # hidden has to share the batch dimension with x so that torch.cat works
        hidden = torch.zeros(x.size(0), self.hidden_size)
        print(x.shape)
        print(hidden.shape)
        combined = torch.cat((x, hidden), 1)
        hidden = self.i2h(combined)
        output = self.h2o(hidden)
        return output
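With the regression data above, a quick sanity check of the shapes (just an illustrative snippet):

model = Regression(n_features, n_features, 16)   # n_features = 1 for the make_regression data
out = model(x)                                   # x has shape (100, 1)
print(out.shape)                                 # torch.Size([100, 1]), matching y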
I am trying to use the vanilla transformer from PyTorch using Pytorch Lightning. I tried to test the model with a reverse number task. So given [1, 3, 5, 4, 13, 19] it returns [1, 13, 4, 5, 3, 19] with 1, 19 being start and end token respectively. The full code is below. The code can run without error but there seems to be a problem with the backpropagation. The training loss does go down at first but it doesn't go beyond 2.8 and the accuracy doesn't go beyond 11%.
It seems that part of the model is able to optimize; I am guessing the weights in Embeddings and Generator can backpropagate, but the weights in nn.Transformer cannot? I am really not sure.
import math
import torch.nn.functional as F
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping

class Embeddings(pl.LightningModule):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        a = self.lut(x) * math.sqrt(self.d_model)
        return a

class PositionalEncoding(pl.LightningModule):
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class Generator(pl.LightningModule):
    def __init__(self, size):
        super(Generator, self).__init__()
        self.proj = nn.Linear(512, size)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)
class Model(pl.LightningModule):
    def __init__(self, src_embed, tgt_embed, transformer, generator):
        super(Model, self).__init__()
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.transformer = transformer
        self.generator = generator
        self.valLoss = 0
        self.valAcc = 0
        self.automatic_optimization = False
        self.optimizer = None
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, x, y, tgt_mask=None):
        x = self.src_embed(x)
        y = self.tgt_embed(y)
        return self.generator(self.transformer(x, y, tgt_mask=tgt_mask))

    def training_step(self, batch, batch_idx):
        if self.optimizer is None:
            self.optimizer = self.optimizers()
        batch = Batch(batch[0], batch[1])
        tgt_mask = batch.trg_mask.squeeze(0)
        tgt_mask = (tgt_mask != True)
        output = self(batch.src, batch.trg, tgt_mask)
        criterion = LossCompute(V)
        loss = criterion.forward(output.contiguous().view(-1, output.size(-1)),
                                 batch.trg_y.contiguous().view(-1)) / batch.ntokens
        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()
        self.log('train_loss', loss)
        print(loss)

    def validation_step(self, batch, batch_idx):
        batch = Batch(batch[0], batch[1])
        tgt_mask = batch.trg_mask.squeeze(0)
        tgt_mask = (tgt_mask != True)
        output = self(batch.src, batch.trg, tgt_mask)
        criterion = LossCompute(V)
        loss = criterion.forward(output.view(-1, output.size(-1)),
                                 batch.trg_y.contiguous().view(-1)) / batch.ntokens
        self.log('val_loss', loss)
        self.valLoss += loss
        if batch_idx % 10 == 0:
            print(loss)
        if batch_idx == 99:
            print(self.valLoss / 100)
            self.valLoss = 0
        return {"x": output, "trg": batch.trg_y, "index": batch_idx}

    def validation_step_end(self, batch):
        output, trg, idx = batch["x"], batch["trg"], batch["index"]
        accuracy = getAccuracy(output, trg)
        self.log("accuracy", accuracy)
        self.valAcc += accuracy
        if idx == 99:
            print(self.valAcc / 100)
            self.valAcc = 0

    def train_dataloader(self):
        data = data_gen(V, 0, 3000)
        return DataLoader(data, batch_size=30, shuffle=False, num_workers=2, pin_memory=True)

    def val_dataloader(self):
        data = data_gen(V, 1, 1000)
        return DataLoader(data, batch_size=10, shuffle=False, num_workers=2, pin_memory=True)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-9)
class LossCompute(pl.LightningModule):
    def __init__(self, size):
        super(LossCompute, self).__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        # x has size (batch_size x length, vocab_size)
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(0)
        true_dist.scatter_(1, target.data.unsqueeze(1).long(), 1)
        self.true_dist = true_dist
        return self.criterion(x, true_dist)

# prepare data
class Batch:
    "Object for holding a batch of data with mask during training."

    def __init__(self, src, trg=None):
        self.src = src
        if trg is not None:
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = \
                self.make_std_mask(self.trg)
            self.ntokens = self.trg_y.size(0) * self.trg_y.size(1)
            print("")

    @staticmethod
    def make_std_mask(tgt):
        "Create a mask to hide padding and future words."
        tgt_mask = subsequent_mask(tgt.size(-1)).type_as(tgt.data)
        return tgt_mask

def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0
def data_gen(V, randomSeed, totalTrainingSample):
    np.random.seed(randomSeed)
    x = torch.from_numpy(np.random.randint(2, V - 2, size=(totalTrainingSample, 10)))
    y = torch.flip(torch.flip(x, [0, 1]), [0])
    x[:, 0] = 1
    y[:, 0] = 1
    x[:, -1] = V - 1
    y[:, -1] = V - 1
    return list(zip(x, y))

def getAccuracy(x, trg):
    totalValAcc = 0
    totalValAccToken = 0
    trg = trg.contiguous().view(-1)
    out = x.view(-1, x.size(-1))        # (batch_size * tgt_length, src_vocab)
    _, index = torch.max(out, dim=-1)   # index (batch_size * tgt_length)
    correct = list((trg == index)).count(True)
    totalValAcc += correct
    totalValAccToken += index.size(0)
    return totalValAcc / totalValAccToken

V = 20
transformer = nn.Transformer(num_encoder_layers=2, num_decoder_layers=2, batch_first=True)
PositionEnc = PositionalEncoding(512, 0.1)
src_emb = Embeddings(512, V)
tgt_emb = Embeddings(512, V)
gen = Generator(V)

if __name__ == '__main__':
    model = Model(nn.Sequential(src_emb, PositionEnc), nn.Sequential(tgt_emb, PositionEnc), transformer, gen)
    earlyStopping = EarlyStopping(monitor='val_loss', patience=3)
    trainer = pl.Trainer(max_epochs=10, callbacks=[earlyStopping])
    trainer.fit(model)
Given a simple multivariate time series problem
import pandas as pd

l = [list(range(1000)), list(range(1000, 2000)), list(range(2000, 3000)),
     list(range(3000, 4000)), list(range(4000, 5000))]
df = pd.DataFrame(l).T
df.columns = ['feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5']
target_sensor = 'feat_5'
We want to predict the value of target_sensor at t + forecast_lead time steps:
forecast_lead = 15
print('\nforecast_lead', forecast_lead)
target = f"{target_sensor}_TARGET{forecast_lead}"
features = list(df.columns.difference([target]))
df[target] = df[target_sensor].shift(-forecast_lead)
df = df.iloc[:-forecast_lead]
The input preparation is based on this torch.Dataset class:
import torch
from torch.utils.data import Dataset, DataLoader

class My_Dataset(Dataset):
    def __init__(self, dataframe, target, features, sequence_length):
        self.features = features                # list of feature column names
        self.target = target                    # str, target column name
        self.sequence_length = sequence_length  # history we want to use
        self.X = torch.tensor(dataframe[features].values).float()  # to tensor
        self.y = torch.tensor(dataframe[target].values).float()    # to tensor

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i):
        if i >= self.sequence_length - 1:
            i_start = i - self.sequence_length + 1
            x = self.X[i_start:(i + 1), :]
        else:
            # pad the start of the series by repeating the first row
            padding = self.X[0].repeat(self.sequence_length - i - 1, 1)
            x = self.X[0:(i + 1), :]
            x = torch.cat((padding, x), 0)
        return x, self.y[i]
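For reference, instantiating it could look like this (sequence_length=30 is a placeholder value of my own; in practice the frame is split into train and test parts first):

train_dataset = My_Dataset(df, target=target, features=features, sequence_length=30)
x0, y0 = train_dataset[0]
print(x0.shape, y0.shape)   # torch.Size([30, 5]) and a scalar target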
with the following DataLoader (batch_size=1 for simplicity):
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False,num_workers=1)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=1)
The Model I am using is this:
import torch.nn as nn

class LSTM_Multivariate_Time_Series_Regression(nn.Module):
    def __init__(self, num_features, hidden_size):
        super().__init__()
        self.num_features = num_features  # this is the number of features
        self.hidden_size = hidden_size
        self.num_layers = 1               # OR MORE THAN 1 HERE
        self.lstm = nn.LSTM(input_size=num_features,
                            hidden_size=hidden_size,
                            batch_first=True,
                            num_layers=self.num_layers)
        self.linear = nn.Linear(in_features=self.hidden_size,
                                out_features=1)
In the forward pass, if self.num_layers = 1:

def forward(self, x):
    lstm_output, (hn, cn) = self.lstm(x)
    out = self.linear(hn[0])  # first dim of hn is num_layers, which is set to 1 above

In the forward pass, before passing through the linear layer, if self.num_layers > 1, I suppose the following options are available (a sketch of option 2 follows after the option 1 code below):
1. Use the last hidden state hn[-1]
2. Use the concatenation of all the layers' hidden states

# Docs, with batch_first=True: lstm_output is a tensor of shape (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
# Docs, with batch_first=True: h_n is a tensor of shape (NUM_LAYERS, BATCH_SIZE, HIDDEN_SIZE) containing the final hidden state for each element in the batch

Option 1:

out = self.linear(hn[-1]).flatten()
return out
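My guess at what option 2 would look like (linear_cat is a hypothetical extra layer, nn.Linear(self.num_layers * self.hidden_size, 1)):

def forward(self, x):
    lstm_output, (hn, cn) = self.lstm(x)
    # hn: (num_layers, batch_size, hidden_size) -> (batch_size, num_layers * hidden_size)
    hn_cat = hn.permute(1, 0, 2).reshape(x.size(0), -1)
    out = self.linear_cat(hn_cat).flatten()
    return out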
Is adding the final linear layer on top of the LSTM this way a correct solution?
I'm trying to implement a neural net in Python without libraries like Keras or TensorFlow. I still have to test the net properly; for now I just trained it on the Iris dataset and checked the correctness of the backpropagation algorithm afterwards.
To do so, I wrote a gradient checking procedure that calculates numerical approximations of the gradients and compares them with the gradients from backpropagation.
The point is that, even though the backpropagation algorithm seems correct to me, the difference between the gradients is always high (around 0.8, instead of the classic 1e-7).
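Concretely, the number I compute is the usual relative difference between the backpropagation gradients and a centered-difference approximation:

\[
\text{difference} = \frac{\lVert d\theta - d\theta_{\text{approx}} \rVert_2}{\lVert d\theta \rVert_2 + \lVert d\theta_{\text{approx}} \rVert_2},
\qquad
d\theta_{\text{approx},i} = \frac{J(\theta + \varepsilon e_i) - J(\theta - \varepsilon e_i)}{2\varepsilon},\ \varepsilon = 10^{-7}
\]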
Layer class
class Dense(Layer):
    def __init__(self, input_shape, name=None, activation='relu', regularization='l2'):
        self.name = name
        self.is_output = False
        self.weights = np.random.uniform(low=0.01, high=0.10, size=input_shape)
        self.biases = np.ones((1, input_shape[1]))
        if activation == 'sigmoid':
            self.activation = Activation_Sigmoid()
        else:  # activation == 'relu':
            self.activation = Activation_ReLU()
        self.cost = Categorical_CrossEntropyLoss()

    def set_as_output(self, is_output=True):
        self.is_output = is_output

    def forward(self, inputs, debug=False, epsilon=None):
        self.net_input = inputs
        if debug:
            augmented_parameters = np.zeros(epsilon.shape)
            weights_column_vector = np.reshape(self.weights, (-1, 1))
            biases_column_vector = np.reshape(self.biases, (-1, 1))
            concatenated_parameters = np.concatenate((weights_column_vector, biases_column_vector))
            for i in range(concatenated_parameters.shape[0]):
                augmented_parameters[i] = concatenated_parameters[i]
            # make the augmented parameters as long as theta in order to sum them,
            # because epsilon is a standard basis vector
            augmented_parameters += epsilon
            # rebuild the weights matrix and biases vector to apply forward propagation
            weights_end = self.weights.shape[0] * self.weights.shape[1]
            biases_end = self.biases.shape[0] * self.biases.shape[1] + weights_end
            weights = np.reshape(augmented_parameters[0:weights_end], self.weights.shape)
            biases = np.reshape(augmented_parameters[weights_end:biases_end], self.biases.shape)
            output = np.dot(inputs, weights) + biases
            activated_output = self.activation.forward(output)
            return activated_output
        self.output = np.dot(inputs, self.weights) + self.biases
        self.activated_output = self.activation.forward(self.output)
        return self.activated_output

    def backward(self, X, y, output, step, l2=0.5):  # backpropagation
        m = X.shape[0]  # number of examples
        if self.is_output:
            error = self.cost.backward(output, y)  # (a_k - y_hat_k)
            delta_k = self.activation.backward(self.output) * error
            # net input for neuron k is a_j^(l-1)
            grad = np.dot(self.net_input.T, delta_k)
            # update weights with l2 regularization
            self.grad_w = grad + (l2 / m) * self.weights
            self.grad_b = np.sum(delta_k * 1, axis=0)
            self.weights -= step * self.grad_w
            self.biases -= step * self.grad_b
            return np.dot(delta_k, self.weights.T)
        else:
            delta_j = self.activation.backward(self.output) * output
            grad = np.dot(self.net_input.T, delta_j)
            self.grad_w = grad + (l2 / m) * self.weights
            self.grad_b = np.sum(delta_j * 1, axis=0)
            self.weights -= step * self.grad_w
            self.biases -= step * self.grad_b
            return np.dot(delta_j, self.weights.T)

    def get_parameters(self):
        return self.weights, self.biases

    def get_gradients(self):
        return self.grad_w, self.grad_b
Neural Net class
class NeuralNet():
    def __init__(self):
        self.layers = []
        self.layers_output = []
        self.cost = None
        self.regularization = L2_Regularization()

    def add(self, layer):
        self.layers.append(layer)

    def forward(self, inputs, debug=False, epsilon=None):
        input = np.copy(inputs)
        for layer in self.layers:
            output = layer.forward(input, debug=debug, epsilon=epsilon)
            input = output
        return input

    def backward(self, X, y, output, step):
        prev_delta = None
        out = output
        for layer in self.layers[::-1]:
            prev_delta = layer.backward(X, y, out, step)
            out = prev_delta

    def fit(self, X, y, batch_size=1, epochs=10, step=0.05, shuffle=True):
        self.layers[-1].set_as_output()
        self.error = []
        i = 0.005 * epochs
        for epoch in range(epochs):
            if shuffle:
                X = np.random.permutation(X)
            batches = int(np.ceil(X.shape[0] / batch_size))
            batches_error = []
            for t in range(batches):
                batch_X = X[t * batch_size:np.min([X.shape[0], (t + 1) * batch_size]), :]
                batch_y = y[t * batch_size:np.min([y.shape[0], (t + 1) * batch_size]), :]
                output = self.forward(batch_X)
                cost = self.cost.forward(output, batch_y)
                cost += self.regularization.forward(X, self.layers)
                batches_error.append(cost)
                self.backward(batch_X, batch_y, output, step)
            self.error.append(np.mean(batches_error))
            if epoch % i == 0:
                print('epoch:', epoch, 'error:', np.mean(self.error))
        return self

    def parameters_to_theta(self):
        theta = []
        for layer in self.layers:
            w, b = layer.get_parameters()
            # flatten parameter w
            new_vector = np.reshape(w, (-1, 1))
            theta.append(new_vector)
            # flatten parameter b
            new_vector = np.reshape(b, (-1, 1))
            theta.append(new_vector)
        return np.vstack(theta)

    def gradients_to_theta(self):
        theta = []
        for layer in self.layers:
            grad_w, grad_b = layer.get_gradients()
            new_vector = np.reshape(grad_w, (-1, 1))
            theta.append(new_vector)
            new_vector = np.reshape(grad_b, (-1, 1))
            theta.append(new_vector)
        return np.vstack(theta)

    def gradient_check(self, X, y, epsilon=1e-7):
        theta = self.parameters_to_theta()
        dtheta = self.gradients_to_theta()
        num_parameters = theta.shape[0]
        J_plus = np.zeros((num_parameters, 1))
        J_minus = np.zeros((num_parameters, 1))
        dtheta_approx = np.zeros((num_parameters, 1))
        for i in range(num_parameters):
            theta_plus = np.zeros((num_parameters, 1))
            theta_plus[i] = epsilon
            J_plus[i] = self.cost.forward(self.forward(X, debug=True, epsilon=theta_plus), y)
            theta_minus = np.zeros((num_parameters, 1))
            theta_minus[i] = -epsilon
            J_minus[i] = self.cost.forward(self.forward(X, debug=True, epsilon=theta_minus), y)
            dtheta_approx[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)
        numerator = np.linalg.norm(dtheta - dtheta_approx)
        denominator = np.linalg.norm(dtheta_approx) + np.linalg.norm(dtheta)
        difference = numerator / denominator
        return difference
I'm using ReLU and Sigmoid as activation functions, and categorical cross-entropy for the cost:
import numpy as np
from scipy.special import expit as sigmoid

class Activation_ReLU:
    def forward(self, inputs):
        return np.maximum(0, inputs)

    def backward(self, inputs):
        return np.greater(inputs, 0).astype(int)

class Activation_Sigmoid:
    def forward(self, inputs):
        return sigmoid(inputs)

    def backward(self, inputs):
        return sigmoid(inputs) * (1 - sigmoid(inputs))

class Categorical_CrossEntropyLoss():
    def forward(self, y_pred, y_real):
        predictions = np.copy(y_pred)
        predictions = np.clip(predictions, 1e-12, 1 - 1e-12)  # avoid zero values for log
        n = y_real.shape[0]
        return -(1 / n) * np.sum(y_real * np.log(y_pred))

    def backward(self, y_pred, y_real):
        return y_real - y_pred
These are the main classes that define the net. The model that I create to train on the Iris dataset is an NN with one hidden layer.
# random seed is 1
from sklearn.datasets import load_iris
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X = (X - np.mean(X)) / np.std(X)  # standardize data to improve network convergence
y = y.reshape((-1, 1))
encoder = OneHotEncoder(sparse=False)
y = encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

model = NeuralNet()
model.add(Dense((4, 10), name='input_layer', activation='relu'))
model.add(Dense((10, 10), name='hidden_layer', activation='relu'))
model.add(Dense((10, 3), name='output_layer', activation='sigmoid'))
model.fit(X_train, y_train, batch_size=5, epochs=200, step=1e-3)

difference = model.gradient_check(X_train, y_train)
And then, the result of print(difference) is
0.7992920544491866
So there is something wrong with my implementation. What should I check to determine the cause of this large difference between the gradients?
I am implementing a neural network for a prediction task. I am currently working with the numpy library and adapting the code to my data.
Below is the current progress of the neural network; I get an error at the end of the code and I don't understand it well.
Can anyone help me, please?
import numpy as np
from sklearn.cross_validation import train_test_split

class LinearLayer:
    def __init__(self, n_input, n_output):
        self.n = n_input
        self.m = n_output
        self.W = (1 / np.sqrt(n_input)) * np.random.rand(n_input + 1, n_output)

    def forward(self, X):
        self.input = np.zeros((X.shape[0], self.n + 1))
        # if only one feature, the input should always be a batch, at least
        if len(X.shape) == 1:  # of one element
            self.input[:-1, :] = X.reshape(-1, self.n)
        else:
            self.input[:, :-1] = X
        self.input[:, -1] = 1
        self.output = self.input.dot(self.W)  # xW + b
        return self.output

    def backward(self, d_out):
        self.gradients = self.W.dot(d_out)[:-1]
        self.dW = np.einsum("ij,ki", self.input, d_out)
        return self.gradients

    def updateWeights(self, lr=0.1):
        self.W = self.W - lr * self.dW

class Sigmoid:
    def __init__(self, n_input):
        self.output = np.zeros(n_input)
        self.gradients = np.zeros(n_input)

    def forward(self, X):
        self.output = 1 / (np.exp(-X) + 1)
        return self.output

    def backward(self, d_out):
        ds = self.output.T * (1 - self.output).T
        self.gradients = ds * d_out
        return self.gradients

print("Training a multilayer perceptron\n")

import pandas as pd

data = pd.read_csv('Data_Balanceada.csv')  # Data (74,11)
X = data.iloc[:, 0:11]
y = data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

h1 = LinearLayer(11, 1)  # stack some layers
s1 = Sigmoid(7)
h2 = LinearLayer(7, 1)
s2 = Sigmoid(1)

def loss(pred, target):
    return np.mean(np.power(pred - target, 2))

predict = lambda x: s2.forward(h2.forward(s1.forward(h1.forward(x))))
backpropagate = lambda d: h1.backward(s1.backward(h2.backward(s2.backward(d))))

lr = 0.005
n = 0  # patience
max_epochs = 1500
valid = loss(predict(X_test), y_test)

for i in range(max_epochs):
    l = 0
    p = predict(X_train)
    backpropagate(p.T - y_train.T)
    h1.updateWeights(lr)
    h2.updateWeights(lr)
    l = loss(p, y_train)
    new_valid = loss(predict(X_test), y_test)
    if new_valid < valid:
        valid = new_valid
        n = 0
    else:
        n += 1
        if n > 50: break
    if i % 50 == 0:
        print("Loss: {0}\t\tValidation: {1}".format(l / 100, valid))
        lr = lr * 0.97

# Validation
print("\nFinal validation loss: {0}. {1} epochs\n".format(loss(predict(X_test), y_test), i + 1))
# print(np.argmax(predict(X_test), axis=1))
# print(np.argmax(y_test, axis=1))
Link to the dataset:
https://mega.nz/#!jM8AQAbB!61NOeJadGXtiKJQsn_tdJ955p5lRD6kQjBlCQTHtt6I
I have this error:
Data must be 1-dimensional