PyTorch: visualize the architecture of a VAE loss function - python

I am new to machine learning in general and to PyTorch, so I apologize if my terminology is incorrect. I am trying to understand the code that was used to train a temporally dependent VAE based on this paper, and I am trying to follow the architecture of the model based on the answers here. The answer using torchviz is not working for me, but torchview is. The issue is that torchview only gives me the architecture included in the forward function (i.e. the PreProcess and LSTM modules in the code), as shown in the image below. There is another function that is used to calculate the loss, and I would like to be able to generate a similar flow chart following the input and output dimensions for that part of the model (DBlock in the code below). Is this possible to visualize?
'''
class DBlock(nn.Module):
    # A basic building block for parameterizing a normal distribution.
    # It corresponds to the D operation in the reference Appendix.
    def __init__(self, input_size, hidden_size, output_size):
        super(DBlock, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(input_size, hidden_size)
        self.fc_mu = nn.Linear(hidden_size, output_size)
        self.fc_logsigma = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        t = torch.tanh(self.fc1(input))
        t = t * torch.sigmoid(self.fc2(input))
        mu = self.fc_mu(t)
        logsigma = self.fc_logsigma(t)
        return mu, logsigma


class PreProcess(nn.Module):
    # The pre-process layer for MNIST images
    def __init__(self, input_size, processed_x_size):
        super(PreProcess, self).__init__()
        self.input_size = input_size
        self.fc1 = nn.Linear(input_size, processed_x_size)
        self.fc2 = nn.Linear(processed_x_size, processed_x_size)

    def forward(self, input):
        t = torch.relu(self.fc1(input))
        t = torch.relu(self.fc2(t))
        return t


class Decoder(nn.Module):
    # The decoder layer converting state to observation.
    # Because the observation is an MNIST image whose elements are values
    # between 0 and 1, the output of this layer is the probability of
    # each element being 1.
    def __init__(self, z_size, hidden_size, x_size):
        super(Decoder, self).__init__()
        self.fc1 = nn.Linear(z_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, x_size)

    def forward(self, z):
        t = torch.tanh(self.fc1(z))
        t = torch.tanh(self.fc2(t))
        p = torch.sigmoid(self.fc3(t))
        return p


class TD_VAE(nn.Module):
    """
    The full TD_VAE model with jumpy prediction.

    First, let's go through some definitions that help in understanding
    what is going on in the following code.

    Belief: As the model is fed a sequence of observations x_t, the model
        updates its belief state b_t through an LSTM network. It is a
        deterministic function of x_t. We call b_t the belief at time t
        instead of the belief state, because we call the latent variable
        z the state.
    State: The latent state variable, z.
    Observation: The observed variable, x. In this case, it represents
        binarized MNIST images.
    """
    def __init__(self, x_size, processed_x_size, b_size, z_size):
        super(TD_VAE, self).__init__()
        self.x_size = x_size
        self.processed_x_size = processed_x_size
        self.b_size = b_size
        self.z_size = z_size

        ## input pre-process layer
        self.process_x = PreProcess(self.x_size, self.processed_x_size)

        ## one-layer LSTM for aggregating belief states
        ## (a single layer is used here; I am not sure how many layers
        ## are used in the original paper)
        self.lstm = nn.LSTM(input_size=self.processed_x_size,
                            hidden_size=self.b_size,
                            batch_first=True)

        ## A two-layer state model is used.
        ## belief to state (b to z)
        ## (this corresponds to the P_B distribution in the reference;
        ## weights are shared across time but not across layers.)
        self.l2_b_to_z = DBlock(b_size, 50, z_size)  # layer 2
        # TODO: input size is to clean, what does this mean?
        self.l1_b_to_z = DBlock(b_size + z_size, 50, z_size)  # layer 1

        ## Given belief and state at time t2, infer the state at time t1
        self.l2_infer_z = DBlock(b_size + 2*z_size, 50, z_size)  # layer 2
        self.l1_infer_z = DBlock(b_size + 2*z_size + z_size, 50, z_size)  # layer 1

        ## Given the state at time t1, model the state at time t2 through the state transition
        self.l2_transition_z = DBlock(2*z_size, 50, z_size)
        self.l1_transition_z = DBlock(2*z_size + z_size, 50, z_size)

        ## state to observation
        self.z_to_x = Decoder(2*z_size, 200, x_size)

    def forward(self, images):
        self.batch_size = images.size()[0]
        self.x = images
        ## pre-process image x
        self.processed_x = self.process_x(self.x)
        ## aggregate the belief b
        # TODO: are h_n and c_n used internally by pytorch?
        self.b, (h_n, c_n) = self.lstm(self.processed_x)

    def calculate_loss(self, t1, t2):
        """
        Calculate the jumpy TD-VAE loss, which corresponds to
        equations (6) and (8) in the reference.
        """
        ## Because the loss is based on variational inference, we need to
        ## draw samples from the variational distribution in order to estimate
        ## the loss function.

        ## sample a state at time t2 (note that the reparameterization trick is used)
        ## z in layer 2
        t2_l2_z_mu, t2_l2_z_logsigma = self.l2_b_to_z(self.b[:, t2, :])
        t2_l2_z_epsilon = torch.randn_like(t2_l2_z_mu)
        t2_l2_z = t2_l2_z_mu + torch.exp(t2_l2_z_logsigma)*t2_l2_z_epsilon

        ## z in layer 1
        t2_l1_z_mu, t2_l1_z_logsigma = self.l1_b_to_z(
            torch.cat((self.b[:, t2, :], t2_l2_z), dim=-1))
        t2_l1_z_epsilon = torch.randn_like(t2_l1_z_mu)
        t2_l1_z = t2_l1_z_mu + torch.exp(t2_l1_z_logsigma)*t2_l1_z_epsilon

        ## concatenate z from layer 1 and layer 2
        t2_z = torch.cat((t2_l1_z, t2_l2_z), dim=-1)

        ## sample a state at time t1
        ## infer the state at time t1 based on the state at time t2
        t1_l2_qs_z_mu, t1_l2_qs_z_logsigma = self.l2_infer_z(
            torch.cat((self.b[:, t1, :], t2_z), dim=-1))
        t1_l2_qs_z_epsilon = torch.randn_like(t1_l2_qs_z_mu)
        t1_l2_qs_z = t1_l2_qs_z_mu + torch.exp(t1_l2_qs_z_logsigma)*t1_l2_qs_z_epsilon

        t1_l1_qs_z_mu, t1_l1_qs_z_logsigma = self.l1_infer_z(
            torch.cat((self.b[:, t1, :], t2_z, t1_l2_qs_z), dim=-1))
        t1_l1_qs_z_epsilon = torch.randn_like(t1_l1_qs_z_mu)
        t1_l1_qs_z = t1_l1_qs_z_mu + torch.exp(t1_l1_qs_z_logsigma)*t1_l1_qs_z_epsilon

        t1_qs_z = torch.cat((t1_l1_qs_z, t1_l2_qs_z), dim=-1)

        #### After sampling the states z from the variational distribution,
        #### we can calculate the loss.

        ## state distribution at time t1 based on the belief at time t1
        t1_l2_pb_z_mu, t1_l2_pb_z_logsigma = self.l2_b_to_z(self.b[:, t1, :])
        t1_l1_pb_z_mu, t1_l1_pb_z_logsigma = self.l1_b_to_z(
            torch.cat((self.b[:, t1, :], t1_l2_qs_z), dim=-1))

        ## state distribution at time t2 based on the state at time t1 and the state transition
        t2_l2_t_z_mu, t2_l2_t_z_logsigma = self.l2_transition_z(t1_qs_z)
        t2_l1_t_z_mu, t2_l1_t_z_logsigma = self.l1_transition_z(
            torch.cat((t1_qs_z, t2_l2_z), dim=-1))

        ## observation distribution at time t2 based on the state at time t2
        t2_x_prob = self.z_to_x(t2_z)

        #### start calculating the loss

        #### KL divergence between the z distribution at time t1 based on the variational
        #### distribution (inference model) and the z distribution at time t1 based on the belief.
        #### This divergence is between two normal distributions and can be
        #### calculated analytically.

        ## KL divergence between t1_l2_pb_z and t1_l2_qs_z
        loss = 0.5*torch.sum(((t1_l2_pb_z_mu - t1_l2_qs_z)/torch.exp(t1_l2_pb_z_logsigma))**2, -1) + \
            torch.sum(t1_l2_pb_z_logsigma, -1) - torch.sum(t1_l2_qs_z_logsigma, -1)

        ## KL divergence between t1_l1_pb_z and t1_l1_qs_z
        loss += 0.5*torch.sum(((t1_l1_pb_z_mu - t1_l1_qs_z)/torch.exp(t1_l1_pb_z_logsigma))**2, -1) + \
            torch.sum(t1_l1_pb_z_logsigma, -1) - torch.sum(t1_l1_qs_z_logsigma, -1)

        #### The following four terms estimate the KL divergence between
        #### the z distribution at time t2 based on the variational distribution
        #### (inference model) and the z distribution at time t2 based on the transition.
        #### In contrast with the above KL divergence for the z distribution at time t1,
        #### this KL divergence cannot be calculated analytically because
        #### the transition distribution depends on z_t1, which is sampled after z_t2.
        #### Therefore, the KL divergence is estimated using samples.

        ## state log probability at time t2 based on the belief
        loss += torch.sum(-0.5*t2_l2_z_epsilon**2 - 0.5*t2_l2_z_epsilon.new_tensor(2*np.pi) - t2_l2_z_logsigma, dim=-1)
        loss += torch.sum(-0.5*t2_l1_z_epsilon**2 - 0.5*t2_l1_z_epsilon.new_tensor(2*np.pi) - t2_l1_z_logsigma, dim=-1)

        ## state log probability at time t2 based on the transition
        loss += torch.sum(0.5*((t2_l2_z - t2_l2_t_z_mu)/torch.exp(t2_l2_t_z_logsigma))**2 + 0.5*t2_l2_z.new_tensor(2*np.pi) + t2_l2_t_z_logsigma, -1)
        loss += torch.sum(0.5*((t2_l1_z - t2_l1_t_z_mu)/torch.exp(t2_l1_t_z_logsigma))**2 + 0.5*t2_l1_z.new_tensor(2*np.pi) + t2_l1_t_z_logsigma, -1)

        ## observation probability at time t2
        loss += -torch.sum(self.x[:, t2, :]*torch.log(t2_x_prob) + (1 - self.x[:, t2, :])*torch.log(1 - t2_x_prob), -1)

        loss = torch.mean(loss)
        return loss
'''
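One approach that may work here (a sketch, not a verified solution): both torchview and torchviz simply trace whatever callable they are given, so you can either draw a DBlock submodule on its own, or wrap the belief LSTM plus calculate_loss in a small nn.Module and trace that wrapper. In the sketch below, model stands for an assumed TD_VAE instance, and t1=4, t2=8 and the sequence length 16 are arbitrary example values; depending on the torchview version you may need to pass device='cpu' to draw_graph so the random sampling inside calculate_loss runs on real tensors.

import torch
from torchview import draw_graph
# from torchviz import make_dot  # alternative

# 1) Visualize a single DBlock directly (its input size is b_size for l2_b_to_z)
dblock_graph = draw_graph(model.l2_b_to_z, input_size=(1, model.b_size))
dblock_graph.visual_graph.render('dblock')

# 2) Wrap forward + calculate_loss so the whole loss path can be traced
class TDVAELoss(torch.nn.Module):
    def __init__(self, tdvae, t1, t2):
        super().__init__()
        self.tdvae, self.t1, self.t2 = tdvae, t1, t2
    def forward(self, images):
        self.tdvae(images)  # runs PreProcess + LSTM and fills self.tdvae.b
        return self.tdvae.calculate_loss(self.t1, self.t2)

loss_graph = draw_graph(TDVAELoss(model, t1=4, t2=8),
                        input_size=(1, 16, model.x_size))
loss_graph.visual_graph.render('tdvae_loss')

# torchviz alternative (with images a real batch tensor):
# make_dot(TDVAELoss(model, 4, 8)(images), params=dict(model.named_parameters()))

The idea in both cases is the same: once the loss is the output of a traced forward pass, the DBlock calls and their input/output dimensions show up in the rendered graph just like PreProcess and the LSTM do.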

Related

Neural network built from scratch in Python to classify digits is stuck at 11.35 percent accuracy. I am using the MNIST dataset

My neural network is stuck at 11.35 percent accuracy and I am unable to trace the error.
(Plot: accuracy stuck at around 11.35 percent.)
I am following this code https://github.com/MLForNerds/DL_Projects/blob/main/mnist_ann.ipynb, which I found in a YouTube video.
Here is my code for the neural network (I have defined Xavier weight initialization in a module called nn):
"""1. 784 neurons in input layer
2. 128 neurons in hidden layer 1
3. 64 neurons in hidden layer 2
4. 10 neurons in output layer"""
def softmax(input):
y = np.exp(input - input.max())
activated = y/ np.sum(y, axis=0)
return activated
def softmax_grad(x):
exps = np.exp(x-x.max())
return exps / np.sum(exps,axis = 0) * (1 - exps /np.sum(exps,axis = 0))
def sigmoid(input):
activated = 1/(1 + np.exp(-input))
return activated
def sigmoid_grad(input):
grad = input*(1-input)
return grad
class DenseNN:
def __init__(self,d0,d1,d2,d3):
self.params = {'w1': nn.Xavier.initialize(d0, d1),
'w2': nn.Xavier.initialize(d1, d2),
'w3': nn.Xavier.initialize(d2, d3)}
def forward(self,a0):
params = self.params
params['a0'] = a0
params['z1'] = np.dot(params['w1'],params['a0'])
params['a1'] = sigmoid(params['z1'])
params['z2'] = np.dot(params['w2'],params['a1'])
params['a2'] = sigmoid(params['z2'])
params['z3'] = np.dot(params['w3'],params['a2'])
params['a3'] = softmax(params['z3'])
return params['a3']
def backprop(self,y_true,y_pred):
params = self.params
w_change = {}
error = softmax_grad(params['z3'])*((y_pred - y_true)/y_true.shape[0])
w_change['w3'] = np.outer(error,params['a2'])
error = np.dot(params['w3'].T,error)*sigmoid_grad(params['a2'])
w_change['w2'] = np.outer(error,params['a1'])
error = np.dot(params['w2'].T,error)*sigmoid_grad(params['a1'])
w_change['w1'] = np.outer(error,params['a0'])
return w_change
def update_weights(self,learning_rate,w_change):
self.params['w1'] -= learning_rate*w_change['w1']
self.params['w2'] -= learning_rate*w_change['w2']
self.params['w3'] -= learning_rate*w_change['w3']
def train(self,epochs,lr):
for epoch in range(epochs):
for i in range(60000):
a0 = np.array([x_train[i]]).T
o = np.array([y_train[i]]).T
y_pred = self.forward(a0)
w_change = self.backprop(o,y_pred)
self.update_weights(lr,w_change)
# print(self.compute_accuracy()*100)
# print(calc_mse(a3, o))
print((self.compute_accuracy())*100)
def compute_accuracy(self):
'''
This function does a forward pass of x, then checks if the indices
of the maximum value in the output equals the indices in the label
y. Then it sums over each prediction and calculates the accuracy.
'''
predictions = []
for i in range(10000):
idx = i
a0 = x_test[idx]
a0 = np.array([a0]).T
#print("acc a1",np.shape(a1))
o = y_test[idx]
o = np.array([o]).T
#print("acc o",np.shape(o))
output = self.forward(a0)
pred = np.argmax(output)
predictions.append(pred == np.argmax(o))
return np.mean(predictions)
Here is the code for loading the data:
#load dataset csv
train_data = pd.read_csv('../Datasets/MNIST/mnist_train.csv')
test_data = pd.read_csv('../Datasets/MNIST/mnist_test.csv')
#train data
x_train = train_data.drop('label',axis=1).to_numpy()
y_train = pd.get_dummies(train_data['label']).values
#test data
x_test = test_data.drop('label',axis=1).to_numpy()
y_test = pd.get_dummies(test_data['label']).values
fac = 0.99 / 255
x_train = np.asfarray(x_train) * fac + 0.01
x_test = np.asfarray(x_test) * fac + 0.01
# train_labels = np.asfarray(train_data[:, :1])
# test_labels = np.asfarray(test_data[:, :1])
#printing dimensions
print(np.shape(x_train)) #(60000,784)
print(np.shape(y_train)) #(60000,10)
print(np.shape(x_test)) #(10000,784)
print(np.shape(y_test)) #(10000,10)
print((x_train))
Kindly help; I am a newbie in machine learning, so any help would be appreciated. I am unable to figure out where I am going wrong. Most of the code is very similar to https://github.com/MLForNerds/DL_Projects/blob/main/mnist_ann.ipynb, but that version manages to get 60 percent accuracy.
EDIT
I found the mistake, thanks to Bartosz Mikulski.
The problem was with how the weights were initialized in my Xavier weight initialization algorithm.
I changed the code for weight initialization to this:
self.params = {
    'w1': np.random.randn(d1, d0) * np.sqrt(1. / d1),
    'w2': np.random.randn(d2, d1) * np.sqrt(1. / d2),
    'w3': np.random.randn(d3, d2) * np.sqrt(1. / d3),
    'b1': np.random.randn(d1, 1) * np.sqrt(1. / d1),
    'b2': np.random.randn(d2, 1) * np.sqrt(1. / d2),
    'b3': np.random.randn(d3, 1) * np.sqrt(1. / d3),
}
Then I got the output shown below.
(Plot: accuracy after changing the weight initialization.)
After adding the bias parameters I got the output shown below.
(Plot: accuracy after changing the weight initialization and adding biases.)
The one problem that I can see is that you are using only weights and no biases. They are very important because they allow your model to shift the decision plane (boundary) in the solution space; if you have only weights, you can only rotate it.
I guess that, basically, this is the best fit you can get without biases. A dense layer is essentially a linear function, w*x + b, and you are missing the b. See the PyTorch documentation for an example: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html#linear.
Also, can you show your Xavier initialization? In your case, even simple normally distributed values would be enough as initialization; no need to rush into more advanced topics.
I would also suggest you start from the smaller problem (for example Iris dataset) and no hidden layers (just a simple linear regression that learns by using gradient descent). Then you can expand it by adding hidden layers, and then by trying harder problems with the code you already have.
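For completeness, a minimal sketch (not the asker's exact final code) of how the forward pass could use the added bias parameters b1, b2 and b3 from the corrected dictionary above; the backprop would then also need matching bias updates, which for a single column-vector sample are just the layer deltas themselves.

def forward(self, a0):
    # Forward pass of DenseNN with bias terms added at every layer: z = W.a + b
    params = self.params
    params['a0'] = a0
    params['z1'] = np.dot(params['w1'], params['a0']) + params['b1']
    params['a1'] = sigmoid(params['z1'])
    params['z2'] = np.dot(params['w2'], params['a1']) + params['b2']
    params['a2'] = sigmoid(params['z2'])
    params['z3'] = np.dot(params['w3'], params['a2']) + params['b3']
    params['a3'] = softmax(params['z3'])
    return params['a3']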

Sequence classification binary model LSTM from scratch

I am writing an LSTM sequence classifier from scratch (no use of an AI library).
I first tried a classical RNN, which I converted from a many-to-many model to a many-to-one model, with a forward propagation that looks like this:
def rnn_forward(inputs, rnnNet):
    fw_cache = []
    hidden_state = np.zeros((rnnNet.d[0], 1))
    for t in range(len(inputs)):
        hidden_state = cm.tanh(np.dot(rnnNet.p['U'], inputs[t]) + np.dot(rnnNet.p['V'], hidden_state) + rnnNet.p['b_h'])
        fw_cache.append(hidden_state.copy())
    outputs = cm.softmax(np.dot(rnnNet.p['W'], hidden_state) + rnnNet.p['b_o'], rnn=True)
    return outputs, fw_cache
I was able to rewrite my parameter dimensions accordingly, and this works as expected.
However, I am struggling to do the same thing with an LSTM network. Below is the forward prop:
def lstm_forward(inputs, lstmNet):
    fw_cache = []
    # lstmNet.d[0] is the hidden_size
    h_prev = np.zeros((lstmNet.d[0], 1))
    C_prev = np.zeros((lstmNet.d[0], 1))
    for x in inputs:
        cache = {'C': C_prev, 'h': h_prev}
        # Concatenate input and hidden state
        cache['z'] = np.row_stack((cache['h'], x))
        # Calculate forget gate
        cache['f'] = cm.sigmoid(np.dot(lstmNet.p['W_f'], cache['z']) + lstmNet.p['b_f'])
        # Calculate input gate
        cache['i'] = cm.sigmoid(np.dot(lstmNet.p['W_i'], cache['z']) + lstmNet.p['b_i'])
        # Calculate candidate
        cache['g'] = cm.tanh(np.dot(lstmNet.p['W_g'], cache['z']) + lstmNet.p['b_g'])
        # Calculate memory state
        C_prev = cache['f'] * cache['C'] + cache['i'] * cache['g']
        # Calculate output gate
        cache['o'] = cm.sigmoid(np.dot(lstmNet.p['W_o'], cache['z']) + lstmNet.p['b_o'])
        # Calculate hidden state
        h_prev = cache['o'] * cm.tanh(cache['C'])
        # Calculate logits
        cache['v'] = np.dot(lstmNet.p['W_v'], h_prev) + lstmNet.p['b_v']
        # Calculate softmax
        fw_cache.append(copy.deepcopy(cache))
    outputs = cm.softmax(cache['v'], rnn=True)
    return outputs, fw_cache
My parameters are:
def init_params(lstmNet):
    hidden_size = lstmNet.d[0]
    vocab_size = lstmNet.d[1]
    z_size = lstmNet.d[2]
    output_size = lstmNet.d[3]
    # Weight matrix (forget gate)
    lstmNet.p['W_f'] = np.random.randn(hidden_size, z_size)
    # Bias for forget gate
    lstmNet.p['b_f'] = np.zeros((hidden_size, 1))
    # Weight matrix (input gate)
    lstmNet.p['W_i'] = np.random.randn(hidden_size, z_size)
    # Bias for input gate
    lstmNet.p['b_i'] = np.zeros((hidden_size, 1))
    # Weight matrix (candidate)
    lstmNet.p['W_g'] = np.random.randn(hidden_size, z_size)
    # Bias for candidate
    lstmNet.p['b_g'] = np.zeros((hidden_size, 1))
    # Weight matrix of the output gate !!! I expect this to change dimensions
    lstmNet.p['W_o'] = np.random.randn(hidden_size, z_size)
    lstmNet.p['b_o'] = np.zeros((hidden_size, 1))
    # Weight matrix relating the hidden state to the output !!! I expect this to change dimensions
    lstmNet.p['W_v'] = np.random.randn(vocab_size, hidden_size)
    lstmNet.p['b_v'] = np.zeros((vocab_size, 1))
Any help in going from this many-to-many LSTM model to a many-to-one model, with output only on the last cell/input, would be much appreciated.
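For reference, one possible many-to-one variant (a sketch only, under the same assumptions about the cm helpers and the lstmNet.p / lstmNet.d layout, not the asker's final solution) mirrors the rnn_forward function above: the recurrence runs over every input, but the logits and softmax are computed just once from the final hidden state, after the loop.

def lstm_forward_many_to_one(inputs, lstmNet):
    # Same recurrence as lstm_forward above, but the readout happens only
    # on the last time step (many-to-one), as in rnn_forward.
    fw_cache = []
    h_prev = np.zeros((lstmNet.d[0], 1))
    C_prev = np.zeros((lstmNet.d[0], 1))
    for x in inputs:
        cache = {'C': C_prev, 'h': h_prev}
        cache['z'] = np.row_stack((cache['h'], x))
        cache['f'] = cm.sigmoid(np.dot(lstmNet.p['W_f'], cache['z']) + lstmNet.p['b_f'])
        cache['i'] = cm.sigmoid(np.dot(lstmNet.p['W_i'], cache['z']) + lstmNet.p['b_i'])
        cache['g'] = cm.tanh(np.dot(lstmNet.p['W_g'], cache['z']) + lstmNet.p['b_g'])
        C_prev = cache['f'] * cache['C'] + cache['i'] * cache['g']
        cache['o'] = cm.sigmoid(np.dot(lstmNet.p['W_o'], cache['z']) + lstmNet.p['b_o'])
        h_prev = cache['o'] * cm.tanh(C_prev)
        fw_cache.append(copy.deepcopy(cache))
    # Single readout: W_v now maps hidden_size to the number of classes
    # (e.g. 2 for a binary classifier), so only its first dimension changes.
    logits = np.dot(lstmNet.p['W_v'], h_prev) + lstmNet.p['b_v']
    outputs = cm.softmax(logits, rnn=True)
    return outputs, fw_cache

During backpropagation the output-layer gradient would then flow only into the last time step, which is the main structural change relative to the many-to-many version.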

No gradients provided for any variable - Custom loss function with random weights depending on the Softmax output

I am having difficulty writing a custom loss function that makes use of random weights generated according to the class/state predicted by the Softmax output. The desired properties are:
The model is a simple feedforward neural network with input dimension 1 and output dimension 6.
The activation function of the output layer is Softmax, which is intended to estimate the actual number of classes or states using Argmax.
Note that the training data only consist of X (there is no Y).
The loss function is defined according to random weights (i.e., from a Weibull distribution) sampled based on the predicted state number for each input sample X.
Below I provide a minimal example for illustration. For simplicity, I only define the loss function based on the random weights for state/class-1. I get: "ValueError: No gradients provided for any variable: ['dense_41/kernel:0', 'dense_41/bias:0', 'dense_42/kernel:0', 'dense_42/bias:0']."
As indicated in the post below, I found out that argmax is not differentiable and that a softargmax function would help (as I implemented in the following code). However, I still get the same error.
Getting around tf.argmax which is not differentiable
import sys
import time
from tqdm import tqdm
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers
from scipy.stats import weibull_min
###############################################################################################
# Generate Dataset
lb = np.array([2.0]) # Left boundary
ub = np.array([100.0]) # Right boundary
# Data Points - uniformly distributed
N_r = 50
X_r = np.linspace(lb, ub, N_r)
###############################################################################################
#Define Model
class DGM:
    # Initialize the class
    def __init__(self, X_r):
        # Normalize training input data
        self.Xmean, self.Xstd = np.mean(X_r), np.std(X_r)
        X_r = (X_r - self.Xmean) / self.Xstd
        self.X_r = X_r
        # Input and output variable dimensions
        self.X_dim = 1; self.Y_dim = 6
        # Define tensors
        self.X_r_tf = tf.convert_to_tensor(X_r, dtype=tf.float32)
        # Learning rate
        self.LEARNING_RATE = 1e-4
        # Feedforward neural network model
        self.modelTest = self.test_model()

    ###############################################
    # Initialize network weights and biases
    def test_model(self):
        input_shape = self.X_dim
        dimensionality = self.Y_dim
        model = tf.keras.Sequential()
        model.add(layers.Input(shape=input_shape))
        model.add(layers.Dense(64, kernel_initializer='glorot_uniform', bias_initializer='zeros'))
        model.add(layers.Activation('tanh'))
        model.add(layers.Dense(dimensionality))
        model.add(layers.Activation('softmax'))
        return model

    ##############################################
    def compute_loss(self):
        # Define optimizer
        gen_opt = tf.keras.optimizers.Adam(lr=self.LEARNING_RATE, beta_1=0.0, beta_2=0.9)
        with tf.GradientTape() as test_tape:
            ###### calculate loss
            generated_u = self.modelTest(self.X_r_tf, training=True)
            # number of data
            n_data = generated_u.shape[0]
            # initialize random weights assuming state-1 at all input samples
            wt1 = np.zeros((n_data, 1), dtype=np.float32)  # initialize weights
            for b in range(n_data):
                wt1[b] = weibull_min.rvs(c=2, loc=0, scale=4, size=1)
            wt1 = tf.reshape(tf.convert_to_tensor(wt1, dtype=tf.float32), shape=(n_data, 1))
            # print('-----------sampling done-----------')
            # determine the actual state using softargmax
            idst = self.softargmax(generated_u)
            idst = tf.reshape(tf.cast(idst, tf.float32), shape=(n_data, 1))
            # index state-1
            id1 = tf.constant(0., dtype=tf.float32)
            # assign weights if predicted state is state-1
            wt1_final = tf.cast(tf.equal(idst, id1), dtype=tf.float32) * wt1
            # final loss
            test_loss = tf.reduce_mean(tf.square(wt1_final))
            # print('-----------test loss calcuated-----------')
        gradients_of_modelTest = test_tape.gradient(test_loss,
                                                    [self.modelTest.trainable_variables])
        gen_opt.apply_gradients(zip(gradients_of_modelTest[0], self.modelTest.trainable_variables))
        return test_loss

    # reference: Getting around tf.argmax which is not differentiable
    # https://stackoverflow.com/questions/46926809/getting-around-tf-argmax-which-is-not-differentiable
    def softargmax(self, x, beta=1e10):
        x = tf.convert_to_tensor(x)
        x_range = tf.range(x.shape.as_list()[-1], dtype=x.dtype)
        return tf.reduce_sum(tf.nn.softmax(x*beta, axis=1) * x_range, axis=-1)

    ##############################################
    def train(self, training_steps=100):
        train_start_time = time.time()
        for step in tqdm(range(training_steps), desc='Training'):
            start = time.time()
            test_loss = self.compute_loss()
            if (step + 1) % 10 == 0:
                elapsed_time = time.time() - train_start_time
                sec_per_step = elapsed_time / step
                mins_left = ((training_steps - step) * sec_per_step)
                tf.print("\nStep # ", step, "/", training_steps,
                         output_stream=sys.stdout)
                tf.print("Current time:", elapsed_time, " time left:",
                         mins_left, output_stream=sys.stdout)
                tf.print("Test Loss: ", test_loss, output_stream=sys.stdout)
###############################################################################################
#Define and train the model
model = DGM(X_r)
model.train(training_steps=100)
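One detail worth noting about the example above (an observation plus an untested sketch, not a confirmed fix): tf.cast(tf.equal(idst, id1), ...) is a hard, non-differentiable selection, so even with softargmax the loss has no gradient path back to generated_u, and the random weights wt1 are constants. A softer alternative is to weight by the model's predicted probability of class 1 directly, for example inside compute_loss:

# Differentiable alternative (sketch): use the predicted probability of class 1
# (column 0 of the softmax output) as a soft indicator instead of tf.equal.
p_state1 = generated_u[:, 0:1]                      # shape (n_data, 1), keeps gradients
wt1_final = p_state1 * wt1                          # soft weighting by class-1 probability
test_loss = tf.reduce_mean(tf.square(wt1_final))    # loss now depends on the model output

With this kind of soft weighting the gradient flows from test_loss through p_state1 into the dense layers, which is what the "No gradients provided" error is complaining about.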

I can't find the bug in this implementation of backpropagation

My data is 4123 rows of inputs and outputs of an XOR gate.
I want to write a neural network with three input-layer neurons (the third one is the bias), a hidden layer, and an output layer.
Here's my implementation:
import numpy as np

class TwoLayerNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        """
        input_size: the number of neurons in the input layer
        hidden_size: the number of neurons in the hidden layer
        output_size: the number of neurons in the output layer
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.params = {}
        self.params['W1'] = 0.01 * np.random.randn(input_size, hidden_size)  # FxH
        self.params['b1'] = np.zeros((hidden_size, 1))  # Hx1
        self.params['W2'] = 0.01 * np.random.randn(hidden_size, output_size)  # HxO
        self.params['b2'] = np.zeros((output_size, 1))  # Ox1
        self.optimal_weights = []
        self.errors = {}

    def train(self, X, y, epochs):
        """
        X: input data matrix, NxF
        y: output vector, Nx1
        returns:
            the optimal set of parameters that best minimize the loss function
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        for iteration in range(epochs):
            forward_to_hidden = X.dot(W1)  # NxH
            activate_hidden = sigmoid(forward_to_hidden)  # NxH
            forward_to_output = activate_hidden.dot(W2)  # NxO
            output = sigmoid(forward_to_output)  # NxO
            self.errors[iteration] = np.mean(0.5 * (y**2 - output**2))
            output_error = y - output  # NxO
            output_layer_delta = output_error * sigmoidPrime(output)  # NxO
            hidden_layer_error = output_layer_delta.dot(W2.T)  # NxO . OxH = NxH
            hidden_layer_delta = hidden_layer_error * sigmoidPrime(activate_hidden)  # NxH
            W1_update = X.T.dot(hidden_layer_delta)  # FxN . NxH = FxH
            W2_update = activate_hidden.T.dot(output_layer_delta)  # HxN . NxO = HxO
            W1 += W1_update
            W2 += W2_update
        self.optimal_weights.append(W1)
        self.optimal_weights.append(W2)

    def predict(self, X):
        W1, W2 = self.optimal_weights[0], self.optimal_weights[1]
        forward = sigmoid(X.dot(W1))  # NxH
        forward = forward.dot(W2)  # NxO
        forward = sigmoid(forward)  # NxO
        return forward

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoidPrime(x):
    return sigmoid(x) * (1 - sigmoid(x))
I realize that's very vanilla, but that's intentional: I want to understand the most basic form of NN architecture first.
Now, my first problem is that my error plot is confusing; the neural network just stops learning.
My second problem is that my weights are blowing up, reaching as far as -10000, which causes overflow because of exp in the sigmoid function.
My third problem is that my output vector only outputs 0.5 instead of 1 or 0.
import pandas as pd
data = pd.read_csv('xor.csv').sample(frac=1)
X = data.iloc[:, [0, 1]] # 1st and 2nd cols are the input
X = np.hstack((X, np.ones((data.shape[0], 1)))) # adding the bias 1's
y = data.iloc[:, 2][:, np.newaxis] # 3rd col is the output
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
nn.train(X_train, y_train, 100)
plt.plot(range(100), [i for i in nn.errors.values()])
plt.show()
The link for the dataset
So, if I read your code correctly, your network is specified correctly, but is missing a few key points in order to learn XOR by backpropagation.
The fun part is, your error specification is weird.
I made it into
self.errors[iteration] = np.mean(0.5 * (y - output)**2)
for visualization.
(Error plot, with the x-axis denoting epoch and the y-axis denoting error.)
So what happens is that the backpropagation hits a plateau and then rapidly blows up the weights. To slow down the blow-up of the weights and allow the network some time to re-evaluate its mistakes, you can add a so-called "learning rate" != 1. This addresses one of the pitfalls.
Another one is shown in the second figure: you hit oscillatory behaviour in the updates and the program never reaches its optimum state. To address this, you can deliberately introduce an imperfection in the form of "momentum".
Additionally, the initial conditions matter for the speed at which you converge, so you need enough epochs to overcome the local plateaux.
Last, but certainly not least, I did find an error with your specification, but all of the above still applies.
In your layer_deltas you do sigmoidPrime(sigmoid(forwards)) which is one call to sigmoid too many.
last_update = np.zeros((X.shape[1], W1.shape[1]))
last_update2 = np.zeros((W1.shape[1], W2.shape[1]))
output_layer_delta = output_error * sigmoidPrime(forward_to_output) # NxO
hidden_layer_delta = hidden_layer_error * sigmoidPrime(forward_to_hidden) # NxH
W1 += 0.001*(W1_update + last_update * 0.5)
W2 += 0.001*(W2_update + last_update2 * 0.5)
# W1 = 0.001*W1_update
# W2 = 0.001*W2_update
last_update = W1_update.copy()
last_update2 = W2_update.copy()
Did the final trick for me. Now please verify and appease this grumbling man who spent the better part of a night and day on figuring it out. ;)
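Putting the answer's fragments together, the body of train could look roughly like this (a sketch only; the learning rate of 0.001 and momentum factor of 0.5 are the answer's example values, not tuned constants):

# Sketch of the corrected loop inside TwoLayerNetwork.train
last_update = np.zeros_like(W1)
last_update2 = np.zeros_like(W2)
for iteration in range(epochs):
    forward_to_hidden = X.dot(W1)                           # NxH
    activate_hidden = sigmoid(forward_to_hidden)            # NxH
    forward_to_output = activate_hidden.dot(W2)             # NxO
    output = sigmoid(forward_to_output)                     # NxO
    self.errors[iteration] = np.mean(0.5 * (y - output)**2)
    output_error = y - output
    # use the pre-activation values, since sigmoidPrime applies sigmoid itself
    output_layer_delta = output_error * sigmoidPrime(forward_to_output)        # NxO
    hidden_layer_error = output_layer_delta.dot(W2.T)                          # NxH
    hidden_layer_delta = hidden_layer_error * sigmoidPrime(forward_to_hidden)  # NxH
    W1_update = X.T.dot(hidden_layer_delta)                 # FxH
    W2_update = activate_hidden.T.dot(output_layer_delta)   # HxO
    # learning rate plus simple momentum on the previous update
    W1 += 0.001 * (W1_update + 0.5 * last_update)
    W2 += 0.001 * (W2_update + 0.5 * last_update2)
    last_update = W1_update.copy()
    last_update2 = W2_update.copy()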

Why doesn't the lasso here provide me with zero coefficients?

I got the idea of implementing my version of deep feature selection from the paper here: http://link.springer.com/chapter/10.1007%2F978-3-319-16706-0_20
The basic idea of deep feature selection, according to this paper, is to add a one-to-one mapping layer before any fully connected hidden layer and then, by adding a regularization term (whether lasso or elastic net), produce zeros in the input-layer weights.
My question is: even though it seems I have implemented the deep feature selection framework well, testing on random data generated with numpy.random.rand(1000, 50) fails to give me any zeros in the input-layer weights. Is this a common thing for lasso-like regularization? Should I adjust the parameters I used for this framework (even more epochs)? Or did I do something wrong in my code?
class DeepFeatureSelectionMLP:
    def __init__(self, X, Y, hidden_dims=[100], epochs=1000,
                 lambda1=0.001, lambda2=1.0, alpha1=0.001, alpha2=0.0, learning_rate=0.1):
        # Initiate the input layer
        # Get the dimension of the input X
        n_sample, n_feat = X.shape
        n_classes = len(np.unique(Y))
        # One hot Y
        one_hot_Y = np.zeros((len(Y), n_classes))
        for i, j in enumerate(Y):
            one_hot_Y[i][j] = 1
        self.epochs = epochs
        Y = one_hot_Y
        # Store up original value
        self.X = X
        self.Y = Y
        # Two variables with undetermined length are created
        self.var_X = tf.placeholder(dtype=tf.float32, shape=[None, n_feat], name='x')
        self.var_Y = tf.placeholder(dtype=tf.float32, shape=[None, n_classes], name='y')
        self.input_layer = One2OneInputLayer(self.var_X)
        self.hidden_layers = []
        layer_input = self.input_layer.output
        # Create hidden layers
        for dim in hidden_dims:
            self.hidden_layers.append(DenseLayer(layer_input, dim))
            layer_input = self.hidden_layers[-1].output
        # Final classification layer, variable Y is passed
        self.softmax_layer = SoftmaxLayer(self.hidden_layers[-1].output, n_classes, self.var_Y)
        n_hidden = len(hidden_dims)
        # regularization terms on coefficients of input layer
        self.L1_input = tf.reduce_sum(tf.abs(self.input_layer.w))
        self.L2_input = tf.nn.l2_loss(self.input_layer.w)
        # regularization terms on weights of hidden layers
        L1s = []
        L2_sqrs = []
        for i in xrange(n_hidden):
            L1s.append(tf.reduce_sum(tf.abs(self.hidden_layers[i].w)))
            L2_sqrs.append(tf.nn.l2_loss(self.hidden_layers[i].w))
        L1s.append(tf.reduce_sum(tf.abs(self.softmax_layer.w)))
        L2_sqrs.append(tf.nn.l2_loss(self.softmax_layer.w))
        self.L1 = tf.add_n(L1s)
        self.L2_sqr = tf.add_n(L2_sqrs)
        # Cost with two regularization terms
        self.cost = self.softmax_layer.cost \
            + lambda1*(1.0-lambda2)*0.5*self.L2_input + lambda1*lambda2*self.L1_input \
            + alpha1*(1.0-alpha2)*0.5*self.L2_sqr + alpha1*alpha2*self.L1
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
        self.y = self.softmax_layer.y

    def train(self, batch_size=100):
        sess = tf.Session()
        sess.run(tf.initialize_all_variables())
        for i in xrange(self.epochs):
            x_batch, y_batch = get_batch(self.X, self.Y, batch_size)
            sess.run(self.optimizer, feed_dict={self.var_X: x_batch, self.var_Y: y_batch})
            if (i + 1) % 50 == 0:
                l = sess.run(self.cost, feed_dict={self.var_X: x_batch, self.var_Y: y_batch})
                print('epoch {0}: global loss = {1}'.format(i, l))
        self.selected_w = sess.run(self.input_layer.w)
        print(self.selected_w)

class One2OneInputLayer(object):
    # One to One Mapping!
    def __init__(self, input):
        """
        The second dimension of the input:
        for each input, each row is a sample
        and each column is a feature; since
        this is a one-to-one mapping, n_in equals
        the number of features.
        """
        n_in = input.get_shape()[1].value
        self.input = input
        # Initiate the weight for the input layer
        w = tf.Variable(tf.zeros([n_in, ]), name='w')
        self.w = w
        self.output = self.w * self.input
        self.params = [w]

class DenseLayer(object):
    # Canonical dense layer
    def __init__(self, input, n_out, activation='sigmoid'):
        """
        The second dimension of the input:
        for each input, each row is a sample
        and each column is a feature.
        n_out defines how many nodes there are in the
        hidden layer.
        """
        n_in = input.get_shape()[1].value
        self.input = input
        # Initiate the weights and biases for this layer
        w = tf.Variable(tf.ones([n_in, n_out]), name='w')
        b = tf.Variable(tf.ones([n_out]), name='b')
        output = tf.add(tf.matmul(input, w), b)
        output = activate(output, activation)
        self.w = w
        self.b = b
        self.output = output
        self.params = [w]

class SoftmaxLayer(object):
    def __init__(self, input, n_out, y):
        """
        The second dimension of the input:
        for each input, each row is a sample
        and each column is a feature.
        n_out defines how many nodes there are in the
        output layer.
        """
        n_in = input.get_shape()[1].value
        self.input = input
        # Initiate the weights and biases for this layer
        w = tf.Variable(tf.random_normal([n_in, n_out]), name='w')
        b = tf.Variable(tf.random_normal([n_out]), name='b')
        pred = tf.add(tf.matmul(input, w), b)
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
        self.y = y
        self.w = w
        self.b = b
        self.cost = cost
        self.params = [w]
Gradient descent algorithms such as Adam do not give exact zeros when using L1 regularization. Instead, something like FTRL or proximal Adagrad can give you exact zeros.
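For illustration, a minimal sketch of what that swap could look like in the code above (TF 1.x style, matching the question's API; FtrlOptimizer and ProximalAdagradOptimizer both accept an l1_regularization_strength argument, in which case you would typically drop the hand-written L1 term from the cost):

# Sketch (assumption: TF 1.x as in the question): let the optimizer apply the
# L1 penalty itself so its proximal update can produce exact zeros.
self.cost = self.softmax_layer.cost  # plain cross-entropy, no manual L1 term
self.optimizer = tf.train.FtrlOptimizer(
    learning_rate=learning_rate,
    l1_regularization_strength=lambda1 * lambda2,
).minimize(self.cost)

# or, alternatively:
# self.optimizer = tf.train.ProximalAdagradOptimizer(
#     learning_rate=learning_rate,
#     l1_regularization_strength=lambda1 * lambda2,
# ).minimize(self.cost)

Note that in this form the optimizer applies the L1 penalty to all trainable variables, not only the input-layer weights, so the behaviour is not identical to the original cost function.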
