LSTM binary sequence classifier from scratch - Python

I am writing an LSTM sequence classifier from scratch (without using any ML library).
I first tried with a classical RNN, which I converted from a many-to-many model to a many-to-one model, with a forward propagation that looks like this:
def rnn_forward(inputs, rnnNet):
    fw_cache = []
    hidden_state = np.zeros((rnnNet.d[0], 1))
    for t in range(len(inputs)):
        hidden_state = cm.tanh(np.dot(rnnNet.p['U'], inputs[t]) + np.dot(rnnNet.p['V'], hidden_state) + rnnNet.p['b_h'])
        fw_cache.append(hidden_state.copy())
    # Many-to-one: the output is computed once, from the last hidden state
    outputs = cm.softmax(np.dot(rnnNet.p['W'], hidden_state) + rnnNet.p['b_o'], rnn=True)
    return outputs, fw_cache
I rewrote my parameter dimensions accordingly and this works as expected.
However, I am struggling to do the same thing for an LSTM network. Below is the forward prop:
def lstm_forward(inputs, lstmNet):
    fw_cache = []
    # lstmNet.d[0] is the hidden_size
    h_prev = np.zeros((lstmNet.d[0], 1))
    C_prev = np.zeros((lstmNet.d[0], 1))
    for x in inputs:
        cache = {'C': C_prev, 'h': h_prev}
        # Concatenate input and hidden state
        cache['z'] = np.row_stack((cache['h'], x))
        # Calculate forget gate
        cache['f'] = cm.sigmoid(np.dot(lstmNet.p['W_f'], cache['z']) + lstmNet.p['b_f'])
        # Calculate input gate
        cache['i'] = cm.sigmoid(np.dot(lstmNet.p['W_i'], cache['z']) + lstmNet.p['b_i'])
        # Calculate candidate
        cache['g'] = cm.tanh(np.dot(lstmNet.p['W_g'], cache['z']) + lstmNet.p['b_g'])
        # Calculate memory state
        C_prev = cache['f'] * cache['C'] + cache['i'] * cache['g']
        # Calculate output gate
        cache['o'] = cm.sigmoid(np.dot(lstmNet.p['W_o'], cache['z']) + lstmNet.p['b_o'])
        # Calculate hidden state from the updated cell state
        # (cache['C'] still holds the previous step's cell state)
        h_prev = cache['o'] * cm.tanh(C_prev)
        # Calculate logits
        cache['v'] = np.dot(lstmNet.p['W_v'], h_prev) + lstmNet.p['b_v']
        fw_cache.append(copy.deepcopy(cache))
    # Calculate softmax from the last step's logits
    outputs = cm.softmax(cache['v'], rnn=True)
    return outputs, fw_cache
My parameters are:
def init_params(lstmNet):
    hidden_size = lstmNet.d[0]
    vocab_size = lstmNet.d[1]
    z_size = lstmNet.d[2]
    output_size = lstmNet.d[3]
    # Weight matrix (forget gate)
    lstmNet.p['W_f'] = np.random.randn(hidden_size, z_size)
    # Bias for forget gate
    lstmNet.p['b_f'] = np.zeros((hidden_size, 1))
    # Weight matrix (input gate)
    lstmNet.p['W_i'] = np.random.randn(hidden_size, z_size)
    # Bias for input gate
    lstmNet.p['b_i'] = np.zeros((hidden_size, 1))
    # Weight matrix (candidate)
    lstmNet.p['W_g'] = np.random.randn(hidden_size, z_size)
    # Bias for candidate
    lstmNet.p['b_g'] = np.zeros((hidden_size, 1))
    # Weight matrix of the output gate !!! I expect this to change dimensions
    lstmNet.p['W_o'] = np.random.randn(hidden_size, z_size)
    lstmNet.p['b_o'] = np.zeros((hidden_size, 1))
    # Weight matrix relating the hidden-state to the output !!! I expect this to change dimensions
    lstmNet.p['W_v'] = np.random.randn(vocab_size, hidden_size)
    lstmNet.p['b_v'] = np.zeros((vocab_size, 1))
Any help in converting this LSTM from a many-to-many model to a many-to-one model, with an output only on the last cell/input, would be much appreciated.
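For reference, here is a minimal sketch of what I currently believe the many-to-one forward pass should look like, assuming my cm helpers and the parameter layout above, and with W_v and b_v reshaped to (output_size, hidden_size) and (output_size, 1) so the classifier head maps the last hidden state to class scores. This is my guess, not verified:

def lstm_forward_many_to_one(inputs, lstmNet):
    fw_cache = []
    h_prev = np.zeros((lstmNet.d[0], 1))
    C_prev = np.zeros((lstmNet.d[0], 1))
    for x in inputs:
        cache = {'C': C_prev, 'h': h_prev}
        cache['z'] = np.row_stack((cache['h'], x))
        cache['f'] = cm.sigmoid(np.dot(lstmNet.p['W_f'], cache['z']) + lstmNet.p['b_f'])
        cache['i'] = cm.sigmoid(np.dot(lstmNet.p['W_i'], cache['z']) + lstmNet.p['b_i'])
        cache['g'] = cm.tanh(np.dot(lstmNet.p['W_g'], cache['z']) + lstmNet.p['b_g'])
        C_prev = cache['f'] * cache['C'] + cache['i'] * cache['g']
        cache['o'] = cm.sigmoid(np.dot(lstmNet.p['W_o'], cache['z']) + lstmNet.p['b_o'])
        h_prev = cache['o'] * cm.tanh(C_prev)
        fw_cache.append(copy.deepcopy(cache))
    # Logits and softmax are computed once, from the last hidden state only,
    # exactly as in the RNN version above
    v = np.dot(lstmNet.p['W_v'], h_prev) + lstmNet.p['b_v']
    outputs = cm.softmax(v, rnn=True)
    return outputs, fw_cache

If this is right, then in backprop the gradient of W_v flows only through the last time step, while the recurrent gradients through h and C still flow through every step.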

Related

PyTorch: visualize the architecture of a loss function in a VAE

I am new to machine learning in general and to PyTorch, so I apologize if my terminology is incorrect. I am trying to understand the code that was used to train a temporally dependent VAE based on this paper. I am trying to follow the architecture of the model based on the answers here. The answer using torchviz is not working for me, but torchview is working. The issue is that it only gives me the architecture included in the forward function (i.e. the PreProcess and LSTM modules in the code), as shown in the image below. I have another function which is used to calculate the loss. I would like to be able to generate a similar flow chart, following the input and output dimensions, for this part of the loss function (DBlock in the code below). Is this possible to visualize?
import numpy as np
import torch
import torch.nn as nn

class DBlock(nn.Module):
    """A basic building block for parameterizing a normal distribution.
    It corresponds to the D operation in the reference Appendix."""
    def __init__(self, input_size, hidden_size, output_size):
        super(DBlock, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(input_size, hidden_size)
        self.fc_mu = nn.Linear(hidden_size, output_size)
        self.fc_logsigma = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        t = torch.tanh(self.fc1(input))
        t = t * torch.sigmoid(self.fc2(input))
        mu = self.fc_mu(t)
        logsigma = self.fc_logsigma(t)
        return mu, logsigma

class PreProcess(nn.Module):
    """The pre-process layer for MNIST images."""
    def __init__(self, input_size, processed_x_size):
        super(PreProcess, self).__init__()
        self.input_size = input_size
        self.fc1 = nn.Linear(input_size, processed_x_size)
        self.fc2 = nn.Linear(processed_x_size, processed_x_size)

    def forward(self, input):
        t = torch.relu(self.fc1(input))
        t = torch.relu(self.fc2(t))
        return t

class Decoder(nn.Module):
    """The decoder layer converting state to observation.
    Because the observation is an MNIST image whose elements are values
    between 0 and 1, the outputs of this layer are the probabilities of
    elements being 1."""
    def __init__(self, z_size, hidden_size, x_size):
        super(Decoder, self).__init__()
        self.fc1 = nn.Linear(z_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, x_size)

    def forward(self, z):
        t = torch.tanh(self.fc1(z))
        t = torch.tanh(self.fc2(t))
        p = torch.sigmoid(self.fc3(t))
        return p

class TD_VAE(nn.Module):
    """The full TD-VAE model with jumpy prediction.

    First, let's go through some definitions which help in understanding
    what is going on in the following code.

    Belief: As the model is fed a sequence of observations, x_t, the
        model updates its belief state, b_t, through an LSTM network.
        It is a deterministic function of x_t. We call b_t the belief
        at time t instead of the belief state, because we call the
        latent state z the state.
    State: The latent state variable, z.
    Observation: The observed variable, x. In this case, it represents
        binarized MNIST images.
    """
    def __init__(self, x_size, processed_x_size, b_size, z_size):
        super(TD_VAE, self).__init__()
        self.x_size = x_size
        self.processed_x_size = processed_x_size
        self.b_size = b_size
        self.z_size = z_size
        ## input pre-process layer
        self.process_x = PreProcess(self.x_size, self.processed_x_size)
        ## one-layer LSTM for aggregating belief states
        ## (one layer is used here and I am not sure how many layers
        ## are used in the original paper)
        self.lstm = nn.LSTM(input_size=self.processed_x_size,
                            hidden_size=self.b_size,
                            batch_first=True)
        ## A two-layer state model is used:
        ## belief to state (b to z)
        ## (this corresponds to the P_B distribution in the reference;
        ## weights are shared across time but not across layers.)
        self.l2_b_to_z = DBlock(b_size, 50, z_size)  # layer 2
        # TODO: input size is to clean, what does this mean?
        self.l1_b_to_z = DBlock(b_size + z_size, 50, z_size)  # layer 1
        ## Given belief and state at time t2, infer the state at time t1
        self.l2_infer_z = DBlock(b_size + 2*z_size, 50, z_size)  # layer 2
        self.l1_infer_z = DBlock(b_size + 2*z_size + z_size, 50, z_size)  # layer 1
        ## Given the state at time t1, model the state at time t2 through the state transition
        self.l2_transition_z = DBlock(2*z_size, 50, z_size)
        self.l1_transition_z = DBlock(2*z_size + z_size, 50, z_size)
        ## state to observation
        self.z_to_x = Decoder(2*z_size, 200, x_size)

    def forward(self, images):
        self.batch_size = images.size()[0]
        self.x = images
        ## pre-process image x
        self.processed_x = self.process_x(self.x)
        ## aggregate the belief b
        # TODO: are h_n and c_n used internally by pytorch?
        self.b, (h_n, c_n) = self.lstm(self.processed_x)

    def calculate_loss(self, t1, t2):
        """Calculate the jumpy TD-VAE loss, which corresponds to
        equations (6) and (8) in the reference."""
        ## Because the loss is based on variational inference, we need to
        ## draw samples from the variational distribution in order to estimate
        ## the loss function.
        ## sample a state at time t2 (note the reparameterization trick is used)
        ## z in layer 2
        t2_l2_z_mu, t2_l2_z_logsigma = self.l2_b_to_z(self.b[:, t2, :])
        t2_l2_z_epsilon = torch.randn_like(t2_l2_z_mu)
        t2_l2_z = t2_l2_z_mu + torch.exp(t2_l2_z_logsigma)*t2_l2_z_epsilon
        ## z in layer 1
        t2_l1_z_mu, t2_l1_z_logsigma = self.l1_b_to_z(
            torch.cat((self.b[:, t2, :], t2_l2_z), dim=-1))
        t2_l1_z_epsilon = torch.randn_like(t2_l1_z_mu)
        t2_l1_z = t2_l1_z_mu + torch.exp(t2_l1_z_logsigma)*t2_l1_z_epsilon
        ## concatenate z from layer 1 and layer 2
        t2_z = torch.cat((t2_l1_z, t2_l2_z), dim=-1)
        ## sample a state at time t1
        ## infer state at time t1 based on states at time t2
        t1_l2_qs_z_mu, t1_l2_qs_z_logsigma = self.l2_infer_z(
            torch.cat((self.b[:, t1, :], t2_z), dim=-1))
        t1_l2_qs_z_epsilon = torch.randn_like(t1_l2_qs_z_mu)
        t1_l2_qs_z = t1_l2_qs_z_mu + torch.exp(t1_l2_qs_z_logsigma)*t1_l2_qs_z_epsilon
        t1_l1_qs_z_mu, t1_l1_qs_z_logsigma = self.l1_infer_z(
            torch.cat((self.b[:, t1, :], t2_z, t1_l2_qs_z), dim=-1))
        t1_l1_qs_z_epsilon = torch.randn_like(t1_l1_qs_z_mu)
        t1_l1_qs_z = t1_l1_qs_z_mu + torch.exp(t1_l1_qs_z_logsigma)*t1_l1_qs_z_epsilon
        t1_qs_z = torch.cat((t1_l1_qs_z, t1_l2_qs_z), dim=-1)
        #### After sampling states z from the variational distribution, we can calculate
        #### the loss.
        ## state distribution at time t1 based on the belief at time t1
        t1_l2_pb_z_mu, t1_l2_pb_z_logsigma = self.l2_b_to_z(self.b[:, t1, :])
        t1_l1_pb_z_mu, t1_l1_pb_z_logsigma = self.l1_b_to_z(
            torch.cat((self.b[:, t1, :], t1_l2_qs_z), dim=-1))
        ## state distribution at time t2 based on states at time t1 and the state transition
        t2_l2_t_z_mu, t2_l2_t_z_logsigma = self.l2_transition_z(t1_qs_z)
        t2_l1_t_z_mu, t2_l1_t_z_logsigma = self.l1_transition_z(
            torch.cat((t1_qs_z, t2_l2_z), dim=-1))
        ## observation distribution at time t2 based on the state at time t2
        t2_x_prob = self.z_to_x(t2_z)
        #### start calculating the loss
        #### KL divergence between the z distribution at time t1 based on the variational
        #### distribution (inference model) and the z distribution at time t1 based on the belief.
        #### This divergence is between two normal distributions and it can be
        #### calculated analytically.
        ## KL divergence between t1_l2_pb_z and t1_l2_qs_z
        loss = 0.5*torch.sum(((t1_l2_pb_z_mu - t1_l2_qs_z)/torch.exp(t1_l2_pb_z_logsigma))**2, -1) + \
            torch.sum(t1_l2_pb_z_logsigma, -1) - torch.sum(t1_l2_qs_z_logsigma, -1)
        ## KL divergence between t1_l1_pb_z and t1_l1_qs_z
        loss += 0.5*torch.sum(((t1_l1_pb_z_mu - t1_l1_qs_z)/torch.exp(t1_l1_pb_z_logsigma))**2, -1) + \
            torch.sum(t1_l1_pb_z_logsigma, -1) - torch.sum(t1_l1_qs_z_logsigma, -1)
        #### The following four terms estimate the KL divergence between
        #### the z distribution at time t2 based on the variational distribution
        #### (inference model) and the z distribution at time t2 based on the transition.
        #### In contrast with the above KL divergence for the z distribution at time t1,
        #### this KL divergence cannot be calculated analytically because
        #### the transition distribution depends on z_t1, which is sampled after z_t2.
        #### Therefore, the KL divergence is estimated using samples.
        ## state log probability at time t2 based on belief
        loss += torch.sum(-0.5*t2_l2_z_epsilon**2 - 0.5*t2_l2_z_epsilon.new_tensor(2*np.pi) - t2_l2_z_logsigma, dim=-1)
        loss += torch.sum(-0.5*t2_l1_z_epsilon**2 - 0.5*t2_l1_z_epsilon.new_tensor(2*np.pi) - t2_l1_z_logsigma, dim=-1)
        ## state log probability at time t2 based on transition
        loss += torch.sum(0.5*((t2_l2_z - t2_l2_t_z_mu)/torch.exp(t2_l2_t_z_logsigma))**2 + 0.5*t2_l2_z.new_tensor(2*np.pi) + t2_l2_t_z_logsigma, -1)
        loss += torch.sum(0.5*((t2_l1_z - t2_l1_t_z_mu)/torch.exp(t2_l1_t_z_logsigma))**2 + 0.5*t2_l1_z.new_tensor(2*np.pi) + t2_l1_t_z_logsigma, -1)
        ## observation prob at time t2
        loss += -torch.sum(self.x[:, t2, :]*torch.log(t2_x_prob) + (1 - self.x[:, t2, :])*torch.log(1 - t2_x_prob), -1)
        loss = torch.mean(loss)
        return loss
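To make the question more concrete, below is a sketch of the kind of wrapper I imagine could expose calculate_loss to torchview: since the tracer only sees what forward executes, a helper module whose forward runs both the belief aggregation and the loss should pull the DBlock calls into the traced graph. The wrapper class, the sizes, and the draw_graph call are my guesses, not part of the original code, and I have not verified this end to end:

from torchview import draw_graph

class TDVAELossGraph(nn.Module):
    """Wrapper whose forward runs both the belief aggregation and the
    loss, so a tracer also sees the DBlock computations."""
    def __init__(self, vae, t1, t2):
        super().__init__()
        self.vae = vae
        self.t1 = t1
        self.t2 = t2

    def forward(self, images):
        self.vae(images)  # fills self.vae.b and self.vae.x
        return self.vae.calculate_loss(self.t1, self.t2)

# Hypothetical sizes: sequences of 20 binarized 28x28 MNIST frames
vae = TD_VAE(x_size=784, processed_x_size=784, b_size=50, z_size=8)
wrapper = TDVAELossGraph(vae, t1=5, t2=10)
graph = draw_graph(wrapper, input_data=torch.rand(1, 20, 784))
graph.visual_graph.render('tdvae_loss')  # writes a Graphviz diagram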

Visualizing the attention map of a multihead attention in ViT

I'm trying to visualize the attention map of my Vision Transformer (ViT) architecture in Keras/TensorFlow. For this I was able to implement the ViT model the following way:
def model():
    input_layer = layers.Input(shape=input_shape)
    #image_patches = create_patches(input_layer)
    #print(input_layer.shape)
    image_patches = Patches(patch_size)(input_layer)
    #print(image_patches.shape)
    encoded_patches = PatchEncoder(num_patch, projection_dim)(image_patches)
    #print(encoded_patches.shape)
    #for i in range(transformer_blocks):
    x1 = layers.LayerNormalization()(encoded_patches)
    x1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=projection_dim, name='MHA_1')(x1, x1)
    x = layers.Add()([x1, encoded_patches])
    x2 = layers.LayerNormalization()(x)
    x2 = mlp_head(x2, transformer_units)
    encoded_patches = layers.Add()([x2, x])
    x = layers.LayerNormalization()(encoded_patches)
    x = layers.Flatten()(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(2)(x)
    model = tf.keras.Model(inputs=input_layer, outputs=x)
    print(model.summary())
    return model
I'm now trying to visualize the attention map based on an input image and my model output. For this I first try to predict the outcome and reshape the weights:
def attention_map(model, image):
    size = model.input_shape[1]
    grid_size = int(np.sqrt(model.layers[4].output_shape[-2] - 1))
    # Prepare the input
    X = preprocess_inputs(cv2.resize(image, (size, size)))  #[np.newaxis, :] # type: ignore
    # Get the attention weights from each transformer.
    outputs = [
        l.output[1] for l in model.layers if isinstance(l, layers.MultiHeadAttention)
    ]
    weights = np.array(
        tf.keras.models.Model(inputs=model.inputs, outputs=outputs).predict(X_test)
    )
    print(weights.shape)
    num_layers = weights.shape[0]
    num_heads = weights.shape[1]
    reshaped = weights.reshape(
        (num_layers, num_heads, grid_size ** 2 + 1, grid_size ** 2 + 1)
    )
    # From Appendix D.6 in the paper ...
    # Average the attention weights across all heads.
    reshaped = reshaped.mean(axis=1)
    # From Section 3 in https://arxiv.org/pdf/2005.00928.pdf ...
    # To account for residual connections, we add an identity matrix to the
    # attention matrix and re-normalize the weights.
    reshaped = reshaped + np.eye(reshaped.shape[1])
    reshaped = reshaped / reshaped.sum(axis=(1, 2))[:, np.newaxis, np.newaxis]
    # Recursively multiply the weight matrices
    v = reshaped[-1]
    for n in range(1, len(reshaped)):
        v = np.matmul(v, reshaped[-1 - n])
    # Attention from the output token to the input space.
    mask = v[0, 1:].reshape(grid_size, grid_size)
    mask = cv2.resize(mask / mask.max(), (image.shape[1], image.shape[0]))[
        ..., np.newaxis
    ]
    return (mask * image).astype("uint8")
However, my problem is that reshaping my weight matrix results in a mismatch. Can someone give me a hint on why this is occurring? A hint based on the output dimension given by
weights = np.array(
    tf.keras.models.Model(inputs=model.inputs, outputs=outputs).predict(X_test)
)
would also help.
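One hedged observation that may be related to the mismatch: Keras' MultiHeadAttention only returns the attention scores when it is called with return_attention_scores=True; otherwise l.output[1] is just a slice of the single output tensor, not a scores tensor. Also, this model has no class token, so the scores have shape (batch, num_heads, num_patches, num_patches), and the reshape to (grid_size ** 2 + 1, grid_size ** 2 + 1) assumes one row/column more than the model produces. A minimal, self-contained sketch (the sizes are made up) of what the layer actually returns:

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# Toy sizes, not the question's real ones
num_patches, projection_dim, num_heads = 64, 32, 4
inputs = layers.Input(shape=(num_patches, projection_dim))
x1 = layers.LayerNormalization()(inputs)
attn_out, attn_scores = layers.MultiHeadAttention(
    num_heads=num_heads, key_dim=projection_dim, name='MHA_1')(
        x1, x1, return_attention_scores=True)
attn_model = tf.keras.Model(inputs=inputs, outputs=attn_scores)
scores = attn_model.predict(np.random.rand(2, num_patches, projection_dim))
print(scores.shape)  # (2, 4, 64, 64): (batch, heads, patches, patches)

With weights collected this way, the array carries a batch axis, so the reshape would need to target (num_layers, batch, num_heads, N, N) with N = num_patches rather than grid_size ** 2 + 1.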

NN Output leans towards latest training examples

I'm trying to make my NN recognize "happy" and "sad" faces, hand-drawn in a 20x20 canvas. I'm using 400 input layer units, 100 hidden layer units, and 2 output units (happy/sad). The activation function for hidden & output layer is sigmoid.
I'm training the neural network progressively by manually drawing a face, selecting if it's happy or sad, and feeding the training sample to the neural network. Also, as I draw, the program shows the NN output dynamically.
The problem is that, as soon as I add a bunch of happy faces in a row, the output for "happy" rises to almost 1. However, if I then start adding sad faces, the output changes to happy=0.0241... sad=0.9988...
Sometimes, both outputs rise to almost one (happy = 0.99 / sad = 0.99).
It seems it should work that way, but in order to test the neural network, I started painting in the upper half of the canvas for HAPPY faces, and in the lower half for SAD faces. The same thing happens (the output leans towards the latest samples), but I added around 50 training examples and the outputs don't seem to react to me painting in the lower/upper half at all.
I don't understand whether I'm building the NN correctly or feeding the data correctly (it's just a 400-element array with either 0.01 (white) or 0.99 (black)).
How can I prevent this behavior?
Neural Network class:
import scipy.special
import numpy

class NeuralNetwork:
    def __init__(self, inputnodes, hiddennodes, outputnodes, learningrate):
        # set number of nodes in each input, hidden, output layer
        self.inodes = inputnodes
        self.hnodes = hiddennodes
        self.onodes = outputnodes
        # learning rate
        self.lr = learningrate
        # activation function
        self.activation_function = lambda x: scipy.special.expit(x)
        # link weight matrices, wih and who
        self.wih = numpy.random.normal(0.0, pow(self.hnodes, -0.5), (self.hnodes, self.inodes))
        self.who = numpy.random.normal(0.0, pow(self.onodes, -0.5), (self.onodes, self.hnodes))

    # train the neural network
    def train(self, inputs_list, targets_list):
        # convert inputs list to 2d array
        inputs = numpy.array(inputs_list, ndmin=2).T
        targets = numpy.array(targets_list, ndmin=2).T
        # calculate signals into hidden layer
        hidden_inputs = numpy.dot(self.wih, inputs)
        # calculate the signals emerging from hidden layer
        hidden_outputs = self.activation_function(hidden_inputs)
        # calculate signals into final output layer
        final_inputs = numpy.dot(self.who, hidden_outputs)
        # calculate signals emerging from final output layer
        final_outputs = self.activation_function(final_inputs)
        # error is the (target - actual)
        output_errors = targets - final_outputs
        # hidden layer error is the output_errors, split by weights, recombined at hidden nodes
        hidden_errors = numpy.dot(self.who.T, output_errors)
        # update the weights for the links between the hidden and output layers
        self.who += self.lr * numpy.dot((output_errors * final_outputs * (1.0 - final_outputs)),
                                        numpy.transpose(hidden_outputs))
        # update the weights for the links between the input and hidden layers
        self.wih += self.lr * numpy.dot((hidden_errors * hidden_outputs * (1.0 - hidden_outputs)),
                                        numpy.transpose(inputs))

    def query(self, inputs_list):
        # convert inputs list to 2d array
        inputs = numpy.array(inputs_list, ndmin=2).T
        # calculate signals into hidden layer
        hidden_inputs = numpy.dot(self.wih, inputs)
        # calculate the signals emerging from the hidden layer
        hidden_outputs = self.activation_function(hidden_inputs)
        # calculate signals into final output layer
        final_inputs = numpy.dot(self.who, hidden_outputs)
        # calculate the signals emerging from final output layer
        final_outputs = self.activation_function(final_inputs)
        return final_outputs
Main code:
from tkinter import *
import numpy
# scipy.special for the sigmoid function expit()
import scipy.special
# library for plotting arrays
import matplotlib.pyplot
from nn import *

root = Tk()
frame = Frame(root)
w = Canvas(frame, width=400, height=400, background="green")
w.pack()
canvasRectangles = []
for i in range(20):
    ls = []
    for k in range(20):
        x = 20*i
        y = 20*k
        ls.append(w.create_rectangle(x, y, x+20, y+20, fill="white"))
    canvasRectangles.append(ls)
label = Label(frame, text='Number pressed: N/A')
label.pack()
hdnLayer1Label = Label(frame, text="Hidden Layer 1", justify=LEFT, wraplength=300)
hdnLayer1Label.pack()
outLayerLabel = Label(frame, text="Output Layer", justify=LEFT, wraplength=300)
outLayerLabel.pack()

def clearCanvas():
    for i in range(20):
        for k in range(20):
            w.itemconfig(canvasRectangles[i][k], fill="white")

def key(event):
    # use == for value comparison ('is' only works by accident of interning)
    if event.char == 'r':
        clearCanvas()
    else:
        if event.char == '1':
            label.config(text="SMILE: Happy")
        if event.char == '2':
            label.config(text="SMILE: Sad")
        global number
        number = event.char

def initNN():
    input_nodes = 400
    hidden_nodes = 100
    output_nodes = 2
    learning_rate = 0.3
    global n
    n = NeuralNetwork(input_nodes, hidden_nodes, output_nodes, learning_rate)

def queryNN(rectangles):
    unscaledInputs = getRectangleValues(rectangles)
    arr = numpy.array(unscaledInputs)
    scaledInputs = scaleInputs(arr)
    res = n.query(scaledInputs)
    return res

def trainNN(rectangles, desiredOuts):
    inputs = getRectangleValues(rectangles)
    arr = numpy.array(inputs)
    # Scale the inputs
    scaledInputs = scaleInputs(arr)
    global n
    n.train(scaledInputs, desiredOuts)

def getRectangleValues(rectangles):
    rectvals = []
    for col in range(20):
        for row in range(20):
            if w.itemcget(rectangles[col][row], "fill") == "black":
                rectvals.append(0)
            else:
                rectvals.append(1)
    return rectvals

def tab(event):
    desiredOuts = [0, 0]
    if number == '1':
        desiredOuts[0] = 1
        print("desiredSmile= HAPPY")
    if number == '2':
        desiredOuts[1] = 1
        print("desiredSmile= SAD")
    print(desiredOuts)
    trainNN(canvasRectangles, desiredOuts)
    clearCanvas()

def draw(event):
    x = event.x
    y = event.y
    if x <= 400 and y <= 400 and x >= 0 and y >= 0:
        squarex = int(translate(x, 0, 400, 0, 20))
        squarey = int(translate(y, 0, 400, 0, 20))
        # both indices must stay inside the 20x20 grid
        if squarex != 20 and squarey != 20:
            w.itemconfig(canvasRectangles[squarex][squarey], fill="black")
            outLayerLabel.config(text="Output: " + repr(queryNN(canvasRectangles)))

def translate(value, leftMin, leftMax, rightMin, rightMax):
    # Figure out how 'wide' each range is
    leftSpan = leftMax - leftMin
    rightSpan = rightMax - rightMin
    # Convert the left range into a 0-1 range (float)
    valueScaled = float(value - leftMin) / float(leftSpan)
    # Convert the 0-1 range into a value in the right range.
    return rightMin + (valueScaled * rightSpan)

def scaleInputs(unscaledins):
    return (numpy.asfarray(unscaledins) / 1.0 * 0.99) + 0.01

initNN()
w.bind("<Button-1>", draw)
w.bind("<B1-Motion>", draw)
frame.bind("1", key)
frame.bind("2", key)
frame.bind("r", key)
frame.bind("<Tab>", tab)
frame.pack()
frame.focus_set()
# root.after(0, doWork)
root.mainloop()
If anyone else has a similar problem, I solved it by reducing the canvas resolution (from 20x20 to 7x7). I guess that such a high resolution needs many more training samples.
Also, I made a training set and iterated through it several times (about 8 epochs did the trick for me).
The solution is simple: if your inputs are more complex, you need much more training.
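To make that fix concrete, here is a minimal sketch of the "collect a training set and iterate over it" approach; the trainingSet structure, the function name, and the shuffling are illustrative, not from the original code:

import random

trainingSet = []  # fill with (scaledInputs, desiredOuts) pairs, e.g. in tab()

def trainEpochs(net, trainingSet, epochs=8):
    # Shuffling each epoch interleaves happy and sad samples, which avoids
    # the drift towards whichever class was drawn last
    for _ in range(epochs):
        random.shuffle(trainingSet)
        for inputs, targets in trainingSet:
            net.train(inputs, targets)

This would be called as trainEpochs(n, trainingSet) once enough samples have been collected, instead of training on each drawing immediately.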

First Neural Network (MLP) from Scratch, Python -- Questions

I understand how a neural network with backpropagation is supposed to work. I know how to use Python's own MLPClassifier and fit functions in sklearn. I am creating my own because I'd like to know the details better. I will first show my code (with comments) and then discuss my problems.
import numpy as np
import scipy as sp
import sklearn as ML

# z: the linear combination of the previous layer
#
# returns the activation for the node
#
def sigmoid(z):
    a = 1 / (1 + np.exp(-z))
    return a

# z: the contribution of a layer
#
# returns the derivative of the sigmoid evaluated at z
#
def sig_grad(z):
    d = (1 - sigmoid(z))*sigmoid(z)
    return d

# input: the data we want to train the network with
# hidden_layers: the number of nodes in the hidden layers
# num_layers: how many hidden layers between the input layer and the output layer
# num_output: how many outputs there are... this becomes relevant when we input many features.
#
# returns the activations determined
# and the linear combinations of previous layer's nodes for each layer
#
def feedforward(input, hidden_layers, num_layers, num_output, thresh, weights):
    # initialize the vector for inputs AND threshold values
    X = np.hstack([thresh[0], input])
    # initialize the activations list
    A = []
    # initialize the linear combos for each layer
    Z = []
    w = list(weights)
    # place ones in the first row of each layer of weights for the threshold
    w[0] = np.vstack([np.ones([1, hidden_layers]), w[0]])
    for i in range(1, num_layers):
        w[i] = np.vstack([np.ones([1, hidden_layers]), weights[i]])
    w[-1] = np.vstack([np.ones([1, num_output]), w[-1]])
    # the first layer of weights is initialized outside this function
    # cycle through the hidden layers
    for i in range(1, num_layers + 1):
        Z.append(np.dot(X, w[i-1]))
        A.append(sigmoid(Z[i-1]))
        X = np.hstack([thresh[i], A[i-1]])
    # find the output/last layer activations
    Z.append(np.dot(X, w[-1]))
    A.append(sigmoid(Z[-1]))
    return A, Z
#
# truth: what we know the output should be
# activations: the activations determined at each node by the sigmoid
#              function in the previous feedforward pass
# combos: the linear combinations at each layer in the prev. ff pass
# num_layers: the number of hidden layers
#
# error: the errors determined at each layer; will be needed for gradient descent
#
def backprop(input, truth, activations, combos, num_layers, weights):
    # initialize an array of errors for each hidden layer and the output layer
    error = [0 for x in range(0, num_layers + 1)]
    # initialize the lists containing the gradients w.r.t. weights and threshold
    derivW = []
    derivb = []
    # set the output layer since its error is computed differently than the others
    error[num_layers] = (activations[num_layers] - truth)*sig_grad(combos[num_layers])
    # find the rate of change for weights and thresh for connections to output
    derivW.append(activations[num_layers-1]*error[num_layers])
    derivb.append(np.sum(error[num_layers]))
    if num_layers > 1:
        # find the errors for each of the hidden layers
        for i in range(num_layers - 1, 0, -1):
            error[i] = np.dot(weights[i+1], error[i+1])*sig_grad(combos[i])
            derivW.append(np.outer(activations[i-1], error[i]))
            derivb.append(np.sum(error[i]))
    #
    # finding the derivative for the weights from the input to the first layer
    # (weights[1] and error[1] connect layer 1 back to the input layer)
    #
    error[0] = np.dot(weights[1], error[1])*sig_grad(combos[0])
    derivW.append(np.outer(input, error[0]))
    derivb.append(np.sum(error[0]))
    return derivW, derivb
#
# weights: our network's weights to update via gradient descent
# thresh: the threshold values to update for our system
# derivb: the derivative of our cost function with respect to b for each layer
# derivW: the derivative of our cost function with respect to W for each layer
# stepsize: the stepsize we want to take, determines how big of a step we take
#
# returns the updated weights and threshold values for our network
def gradDesc(weights, thresh, derivb, derivW, stepsize, num_layers):
    # perform gradient descent
    for j in range(100):
        for i in range(0, num_layers + 1):
            weights[i] = weights[i] - stepsize*derivW[num_layers-i]
            thresh[i] = thresh[i] - stepsize*derivb[num_layers-i]
    return weights, thresh
# input: the data to send through the network
# hidden_layers: the number of nodes in each hidden layer
# num_layers: the number of hidden layers between the input layer and the output layer
# num_output: the number of nodes in the output layer
#
# returns the trained weights and threshold values of the network
#
def nNetwork(input, truth, hidden_layers, num_layers, num_output, maxiter, stepsize):
    # assuming that input is an array where each element is an input/sample
    # we also need to know the size of each sample itself
    m = input.size
    thresh = np.random.randn(num_layers + 1, 1)
    thresh_weights = np.ones([num_layers + 1, 1])
    # initialize the weights as a list because each layer might have
    # a different number of weights
    weights = []
    weights.append(np.random.randn(m, hidden_layers))
    if num_layers > 1:
        for i in range(1, num_layers):
            weights.append(np.random.randn(hidden_layers, hidden_layers))
    weights.append(np.random.randn(hidden_layers, num_output))
    for i in range(maxiter):
        activations, combos = feedforward(input, hidden_layers, num_layers, num_output, thresh, weights)
        derivW, derivb = backprop(input, truth, activations, combos, num_layers, weights)
        weights, thresh = gradDesc(weights, thresh, derivb, derivW, stepsize, num_layers)
    return weights, thresh

def main():
    # a very, very simple neural network
    input = np.array([1, 0, 0])
    truth = 0
    hidden_layers = 3
    num_layers = 2
    num_output = 1
    # train the network
    w, t = nNetwork(input, truth, hidden_layers, num_layers, num_output, maxiter=10, stepsize=0.001)
    # test the network on a new set of arguments
    # activations, combos = feedforward(new_input, hidden_layers=3, num_layers=2, num_output=1, thresh=t, weights=w)

main()
I've tested this code on simple examples where there are n inputs of one dimension and an output of n dimensions (I'm not yet able to work out the bugs when I type import NN.py into the console, but it works when I run it piece by piece in the console). I have a few questions to help me better understand what is going on when I have n inputs of m dimensions each. For example, the digits data in Python (there are 1797 samples and each sample is 64x1 -- an 8x8 image, vectorized).
1) Is each of the 64 pixels considered an input? If so, is the neural net trained one image at a time? This would be an easy fix for me.
2) If the neural net is trained on all images at once, what are suggestions for modifying my code?
3) Obviously the output for an image comes in the form of 0, 1, 2, 3, ..., or 9. But does the output come in the form of a 10x1 vector, where there is a 1 at the digit the image represents and 0's elsewhere? So my prediction vector would have its highest value where the 1 should be, right? (See the sketch after these questions.)
4) And I'm not quite sure how #3 would look if #2 is true.
I apologize for the long note. Thanks for taking a look and helping me understand better!
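As an illustration of the conventions behind questions 1 and 3, here is a sketch using sklearn's digits data; the scaling, the epoch count, and the commented train() placeholder are arbitrary choices, not part of the code above. Each 64-pixel image is one input sample, the target is a 10-element one-hot vector, and the predicted digit is the argmax of the 10 outputs:

import numpy as np
from sklearn.datasets import load_digits

digits = load_digits()
X = digits.data / 16.0   # 1797 samples, each a 64-vector (flattened 8x8 image)
labels = digits.target   # integers 0..9

# One-hot encoding: a 10-vector with a 1 at the digit's index (question 3)
T = np.zeros((len(labels), 10))
T[np.arange(len(labels)), labels] = 1

# Online training convention (question 1): one sample per update,
# looping over the whole set for several epochs
# for epoch in range(10):
#     for x, t in zip(X, T):
#         train(x, t)   # stand-in for a per-sample update like nNetwork's

# Prediction: the index of the largest output is the predicted digit
output = np.random.rand(10)   # stand-in for the network's 10 outputs
print(np.argmax(output))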

Why didn't the lasso here give me zero coefficients?

I got the idea of implementing my version of deep feature selection from this paper: http://link.springer.com/chapter/10.1007%2F978-3-319-16706-0_20
The basic idea of deep feature selection, according to this paper, is to add a one-to-one mapping layer before any fully connected hidden layer, and then to produce zeros in the input-layer weights by adding a regularization term (whether lasso or elastic net).
My question is: even though it seems I have implemented the deep feature selection framework well, testing on random data generated by numpy.random.rand(1000, 50) fails to give me any zeros in the initial weights. Is this a common thing for lasso-like regularization? Should I adjust the parameters I used for this framework (even more epochs)? Or did I do something wrong in my code?
class DeepFeatureSelectionMLP:
    def __init__(self, X, Y, hidden_dims=[100], epochs=1000,
                 lambda1=0.001, lambda2=1.0, alpha1=0.001, alpha2=0.0, learning_rate=0.1):
        # Initiate the input layer
        # Get the dimension of the input X
        n_sample, n_feat = X.shape
        n_classes = len(np.unique(Y))
        # One-hot Y
        one_hot_Y = np.zeros((len(Y), n_classes))
        for i, j in enumerate(Y):
            one_hot_Y[i][j] = 1
        self.epochs = epochs
        Y = one_hot_Y
        # Store the original values
        self.X = X
        self.Y = Y
        # Two placeholders with undetermined batch length are created
        self.var_X = tf.placeholder(dtype=tf.float32, shape=[None, n_feat], name='x')
        self.var_Y = tf.placeholder(dtype=tf.float32, shape=[None, n_classes], name='y')
        self.input_layer = One2OneInputLayer(self.var_X)
        self.hidden_layers = []
        layer_input = self.input_layer.output
        # Create hidden layers
        for dim in hidden_dims:
            self.hidden_layers.append(DenseLayer(layer_input, dim))
            layer_input = self.hidden_layers[-1].output
        # Final classification layer, variable Y is passed
        self.softmax_layer = SoftmaxLayer(self.hidden_layers[-1].output, n_classes, self.var_Y)
        n_hidden = len(hidden_dims)
        # regularization terms on coefficients of the input layer
        self.L1_input = tf.reduce_sum(tf.abs(self.input_layer.w))
        self.L2_input = tf.nn.l2_loss(self.input_layer.w)
        # regularization terms on weights of the hidden layers
        L1s = []
        L2_sqrs = []
        for i in xrange(n_hidden):
            L1s.append(tf.reduce_sum(tf.abs(self.hidden_layers[i].w)))
            L2_sqrs.append(tf.nn.l2_loss(self.hidden_layers[i].w))
        L1s.append(tf.reduce_sum(tf.abs(self.softmax_layer.w)))
        L2_sqrs.append(tf.nn.l2_loss(self.softmax_layer.w))
        self.L1 = tf.add_n(L1s)
        self.L2_sqr = tf.add_n(L2_sqrs)
        # Cost with the two (elastic-net style) regularization terms
        self.cost = self.softmax_layer.cost \
            + lambda1*(1.0-lambda2)*0.5*self.L2_input + lambda1*lambda2*self.L1_input \
            + alpha1*(1.0-alpha2)*0.5*self.L2_sqr + alpha1*alpha2*self.L1
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
        self.y = self.softmax_layer.y

    def train(self, batch_size=100):
        sess = tf.Session()
        sess.run(tf.initialize_all_variables())
        for i in xrange(self.epochs):
            x_batch, y_batch = get_batch(self.X, self.Y, batch_size)
            sess.run(self.optimizer, feed_dict={self.var_X: x_batch, self.var_Y: y_batch})
            if (i + 1) % 50 == 0:
                l = sess.run(self.cost, feed_dict={self.var_X: x_batch, self.var_Y: y_batch})
                print('epoch {0}: global loss = {1}'.format(i, l))
        self.selected_w = sess.run(self.input_layer.w)
        print(self.selected_w)

class One2OneInputLayer(object):
    # One-to-one mapping!
    def __init__(self, input):
        """
        The second dimension of the input is the number of features:
        each row is a sample and each column is a feature. Since this
        is a one-to-one mapping, n_in equals the number of features.
        """
        n_in = input.get_shape()[1].value
        self.input = input
        # Initiate the weight for the input layer
        w = tf.Variable(tf.zeros([n_in, ]), name='w')
        self.w = w
        self.output = self.w * self.input
        self.params = [w]

class DenseLayer(object):
    # Canonical dense layer
    def __init__(self, input, n_out, activation='sigmoid'):
        """
        Each row of the input is a sample and each column is a feature,
        so n_in is the width of the previous layer.
        n_out defines how many nodes there are in this hidden layer.
        """
        n_in = input.get_shape()[1].value
        self.input = input
        # Initiate the weights and biases for this layer
        w = tf.Variable(tf.ones([n_in, n_out]), name='w')
        b = tf.Variable(tf.ones([n_out]), name='b')
        output = tf.add(tf.matmul(input, w), b)
        output = activate(output, activation)
        self.w = w
        self.b = b
        self.output = output
        self.params = [w]

class SoftmaxLayer(object):
    def __init__(self, input, n_out, y):
        """
        Each row of the input is a sample and each column is a feature,
        so n_in is the width of the previous layer.
        n_out is the number of classes.
        """
        n_in = input.get_shape()[1].value
        self.input = input
        # Initiate the weights and biases for this layer
        w = tf.Variable(tf.random_normal([n_in, n_out]), name='w')
        b = tf.Variable(tf.random_normal([n_out]), name='b')
        pred = tf.add(tf.matmul(input, w), b)
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
        self.y = y
        self.w = w
        self.b = b
        self.cost = cost
        self.params = [w]
Gradient descent algorithms such as Adam do not produce exact zeros when using L1 regularization. Instead, an optimizer like FTRL or proximal Adagrad can give you exact zeros.
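A sketch of that swap under the same TF1-era API as the question (the hyperparameter values are illustrative): both optimizers take an l1_regularization_strength argument and apply proximal updates that can drive weights exactly to zero, in which case the manual L1/L2 terms can be left out of the cost:

# Instead of:
#   self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
# let the optimizer apply the regularization proximally:
self.optimizer = tf.train.FtrlOptimizer(
    learning_rate=learning_rate,
    l1_regularization_strength=0.01,   # illustrative value
    l2_regularization_strength=0.0
).minimize(self.softmax_layer.cost)

# or:
self.optimizer = tf.train.ProximalAdagradOptimizer(
    learning_rate=learning_rate,
    l1_regularization_strength=0.01    # illustrative value
).minimize(self.softmax_layer.cost)

Note that these optimizers regularize every trainable variable, not just the input layer's w; to sparsify only the One2OneInputLayer, one could combine two optimizers via minimize()'s var_list argument so that only the input layer's weights get the proximal L1 treatment.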
