How to implement a custom non-trainable PCA layer in Keras - python

I am trying to develop a non-trainable custom PCA layer: starting from a float input of size 256, the output should be a two-dimensional vector obtained by applying PCA inside the network as the last layer. Prior to this layer I need to have one Dense layer. Below is my PCA implementation via TensorFlow and the custom layer:
import numpy as np
import tensorflow as tf

class TF_PCA:
    def __init__(self, X):
        self.X = X
        self.u = None
        self.singular_values = None
        self.sigma = None

    def fit(self):
        # Perform SVD
        self.singular_values, self.u, _ = tf.linalg.svd(self.X)
        # Create sigma matrix
        self.sigma = tf.linalg.diag(self.singular_values)

    def reduce(self, n_dimensions=None, keep_info=None):
        if keep_info:
            # Normalize singular values
            normalized_singular_values = self.singular_values / sum(self.singular_values)
            # Create the aggregated ladder of kept information per dimension
            ladder = np.cumsum(normalized_singular_values)
            # Get the first index which is above the given information threshold
            index = next(idx for idx, value in enumerate(ladder) if value >= keep_info) + 1
            n_dimensions = index
        # Cut out the relevant part from sigma
        sigma = tf.slice(self.sigma, [0, 0], [self.X.shape[1], n_dimensions])
        # PCA
        pca = tf.matmul(self.u, sigma)
        return pca
class CustomPCALayer(tf.keras.layers.Layer):
    def __init__(self, num_outputs):
        super(CustomPCALayer, self).__init__()
        self.num_outputs = num_outputs
        self.total = tf.Variable(initial_value=tf.zeros((num_outputs,)), trainable=False)

    def call(self, inputs):
        tf_pca = TF_PCA(inputs)
        tf_pca.fit()
        pca = tf_pca.reduce(n_dimensions=self.num_outputs)
        return tf.convert_to_tensor(pca, dtype=tf.float32)
Once I implemented this, I also implemented the neural network:
from tensorflow import keras
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout

in_dim = (256,)
out_dim = 2

def build_network(inputs, out_dim):
    x = Dense(256)(inputs)
    x = Dropout(0.2)(x)
    x = CustomPCALayer(out_dim)(x)  # non-trainable layer
    return x

inputs = Input(shape=in_dim)
network = build_network(inputs, out_dim)
model = Model(inputs=inputs, outputs=[network], name="network")

opt = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=opt,
              loss="mse",
              metrics=[tf.keras.metrics.CosineSimilarity()])
As input and output I have something like this:
x = tf.random.uniform((300, 256))
y = tf.random.uniform((300, 2))
If I try to train, it immediately fails with the error NotImplementedError: SVD gradient has not been implemented for input with unknown inner matrix shape.
I would like to understand what is wrong with my architecture so that I can resolve the error and end up with a non-trainable layer that performs PCA.
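For reference, the error comes from tf.linalg.svd: TensorFlow only defines the SVD gradient when the static shape of the decomposed matrix is fully known, and with Input(shape=(256,)) the batch dimension is None. Below is a minimal sketch of a setup that may sidestep this by fixing the batch size; it assumes full-batch training (batch size 300) is acceptable, since reduce() as written also needs at least as many samples per batch as features:

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout

# A fixed batch size makes the matrix fed to tf.linalg.svd have the static
# shape (300, 256) instead of (None, 256), so its gradient is defined.
batch_size = 300  # must be >= 256 here and divide the sample count evenly

inputs = Input(shape=(256,), batch_size=batch_size)
x = Dense(256)(inputs)
x = Dropout(0.2)(x)
outputs = CustomPCALayer(2)(x)  # the custom layer defined above

model = Model(inputs=inputs, outputs=outputs, name="network")
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001),
              loss="mse",
              metrics=[tf.keras.metrics.CosineSimilarity()])

x_train = tf.random.uniform((300, 256))
y_train = tf.random.uniform((300, 2))
model.fit(x_train, y_train, batch_size=batch_size, epochs=2)  # epochs chosen arbitrarily

Whether PCA computed over a single training batch is the behaviour you actually want is a separate question, since the projection then depends on which samples happen to share a batch.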

Related

PyTorch: visualize architecture of loss function in VAE

I am new to machine learning in general and to PyTorch, so I apologize if my terminology is incorrect. I am trying to understand the code that was used to train a temporally dependent VAE based on this paper, and I am trying to follow the architecture of the model based on the answers here. The answer using torchviz is not working for me, but torchview is working. The issue is that it only gives me the architecture included in the forward function (i.e. the PreProcess and LSTM modules in the code), as shown in the image below. I have another function which is used to calculate the loss, and I would like to be able to generate a similar flow chart following the input and output dimensions for this part of the loss function (DBlock in the code below). Is this possible to visualize?
'''
import numpy as np
import torch
from torch import nn

class DBlock(nn.Module):
    # A basic building block for parameterizing a normal distribution.
    # It corresponds to the D operation in the reference Appendix.
    def __init__(self, input_size, hidden_size, output_size):
        super(DBlock, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(input_size, hidden_size)
        self.fc_mu = nn.Linear(hidden_size, output_size)
        self.fc_logsigma = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        t = torch.tanh(self.fc1(input))
        t = t * torch.sigmoid(self.fc2(input))
        mu = self.fc_mu(t)
        logsigma = self.fc_logsigma(t)
        return mu, logsigma
class PreProcess(nn.Module):
    # The pre-process layer for MNIST image
    def __init__(self, input_size, processed_x_size):
        super(PreProcess, self).__init__()
        self.input_size = input_size
        self.fc1 = nn.Linear(input_size, processed_x_size)
        self.fc2 = nn.Linear(processed_x_size, processed_x_size)

    def forward(self, input):
        t = torch.relu(self.fc1(input))
        t = torch.relu(self.fc2(t))
        return t
class Decoder(nn.Module):
    # The decoder layer converting state to observation.
    # Because the observation is a MNIST image whose elements are values
    # between 0 and 1, the outputs of this layer are the probabilities of
    # elements being 1.
    def __init__(self, z_size, hidden_size, x_size):
        super(Decoder, self).__init__()
        self.fc1 = nn.Linear(z_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, x_size)

    def forward(self, z):
        t = torch.tanh(self.fc1(z))
        t = torch.tanh(self.fc2(t))
        p = torch.sigmoid(self.fc3(t))
        return p
class TD_VAE(nn.Module):
    """The full TD_VAE model with jumpy prediction.

    First, let's go through some definitions which help in understanding
    what is going on in the following code.

    Belief: As the model is fed a sequence of observations, x_t, the
        model updates its belief state, b_t, through an LSTM network.
        It is a deterministic function of x_t.
        We call b_t the belief at time t instead of the belief state,
        because we call the hidden state z the state.
    State: The latent state variable, z.
    Observation: The observed variable, x.
        In this case, it represents binarized MNIST images.
    """
    def __init__(self, x_size, processed_x_size, b_size, z_size):
        super(TD_VAE, self).__init__()
        self.x_size = x_size
        self.processed_x_size = processed_x_size
        self.b_size = b_size
        self.z_size = z_size

        ## input pre-process layer
        self.process_x = PreProcess(self.x_size, self.processed_x_size)

        ## one-layer LSTM for aggregating belief states
        ## (one layer is used here; I am not sure how many layers
        ## are used in the original paper)
        self.lstm = nn.LSTM(input_size=self.processed_x_size,
                            hidden_size=self.b_size,
                            batch_first=True)

        ## A two-layer state model is used
        ## belief to state (b to z)
        ## (this corresponds to the P_B distribution in the reference;
        ## weights are shared across time but not across layers.)
        self.l2_b_to_z = DBlock(b_size, 50, z_size)  # layer 2
        # TODO: input size is to clean, what does this mean?
        self.l1_b_to_z = DBlock(b_size + z_size, 50, z_size)  # layer 1

        ## Given belief and state at time t2, infer the state at time t1
        self.l2_infer_z = DBlock(b_size + 2*z_size, 50, z_size)  # layer 2
        self.l1_infer_z = DBlock(b_size + 2*z_size + z_size, 50, z_size)  # layer 1

        ## Given the state at time t1, model the state at time t2 through the state transition
        self.l2_transition_z = DBlock(2*z_size, 50, z_size)
        self.l1_transition_z = DBlock(2*z_size + z_size, 50, z_size)

        ## state to observation
        self.z_to_x = Decoder(2*z_size, 200, x_size)

    def forward(self, images):
        self.batch_size = images.size()[0]
        self.x = images
        ## pre-process image x
        self.processed_x = self.process_x(self.x)
        ## aggregate the belief b
        # TODO: are h_n and c_n used internally by pytorch?
        self.b, (h_n, c_n) = self.lstm(self.processed_x)
    def calculate_loss(self, t1, t2):
        """
        Calculate the jumpy TD-VAE loss, which corresponds to
        equation (6) and equation (8) in the reference.
        """
        ## Because the loss is based on variational inference, we need to
        ## draw samples from the variational distribution in order to estimate
        ## the loss function.

        ## sample a state at time t2 (the reparameterization trick is used)
        ## z in layer 2
        t2_l2_z_mu, t2_l2_z_logsigma = self.l2_b_to_z(self.b[:, t2, :])
        t2_l2_z_epsilon = torch.randn_like(t2_l2_z_mu)
        t2_l2_z = t2_l2_z_mu + torch.exp(t2_l2_z_logsigma)*t2_l2_z_epsilon

        ## z in layer 1
        t2_l1_z_mu, t2_l1_z_logsigma = self.l1_b_to_z(
            torch.cat((self.b[:, t2, :], t2_l2_z), dim=-1))
        t2_l1_z_epsilon = torch.randn_like(t2_l1_z_mu)
        t2_l1_z = t2_l1_z_mu + torch.exp(t2_l1_z_logsigma)*t2_l1_z_epsilon

        ## concatenate z from layer 1 and layer 2
        t2_z = torch.cat((t2_l1_z, t2_l2_z), dim=-1)

        ## sample a state at time t1
        ## infer the state at time t1 based on the states at time t2
        t1_l2_qs_z_mu, t1_l2_qs_z_logsigma = self.l2_infer_z(
            torch.cat((self.b[:, t1, :], t2_z), dim=-1))
        t1_l2_qs_z_epsilon = torch.randn_like(t1_l2_qs_z_mu)
        t1_l2_qs_z = t1_l2_qs_z_mu + torch.exp(t1_l2_qs_z_logsigma)*t1_l2_qs_z_epsilon

        t1_l1_qs_z_mu, t1_l1_qs_z_logsigma = self.l1_infer_z(
            torch.cat((self.b[:, t1, :], t2_z, t1_l2_qs_z), dim=-1))
        t1_l1_qs_z_epsilon = torch.randn_like(t1_l1_qs_z_mu)
        t1_l1_qs_z = t1_l1_qs_z_mu + torch.exp(t1_l1_qs_z_logsigma)*t1_l1_qs_z_epsilon

        t1_qs_z = torch.cat((t1_l1_qs_z, t1_l2_qs_z), dim=-1)

        #### After sampling states z from the variational distribution, we can calculate
        #### the loss.

        ## state distribution at time t1 based on belief at time t1
        t1_l2_pb_z_mu, t1_l2_pb_z_logsigma = self.l2_b_to_z(self.b[:, t1, :])
        t1_l1_pb_z_mu, t1_l1_pb_z_logsigma = self.l1_b_to_z(
            torch.cat((self.b[:, t1, :], t1_l2_qs_z), dim=-1))

        ## state distribution at time t2 based on states at time t1 and the state transition
        t2_l2_t_z_mu, t2_l2_t_z_logsigma = self.l2_transition_z(t1_qs_z)
        t2_l1_t_z_mu, t2_l1_t_z_logsigma = self.l1_transition_z(
            torch.cat((t1_qs_z, t2_l2_z), dim=-1))

        ## observation distribution at time t2 based on the state at time t2
        t2_x_prob = self.z_to_x(t2_z)

        #### start calculating the loss

        #### KL divergence between the z distribution at time t1 based on the variational
        #### distribution (inference model) and the z distribution at time t1 based on belief.
        #### This divergence is between two normal distributions and can be
        #### calculated analytically.

        ## KL divergence between t1_l2_pb_z and t1_l2_qs_z
        loss = 0.5*torch.sum(((t1_l2_pb_z_mu - t1_l2_qs_z)/torch.exp(t1_l2_pb_z_logsigma))**2, -1) + \
            torch.sum(t1_l2_pb_z_logsigma, -1) - torch.sum(t1_l2_qs_z_logsigma, -1)

        ## KL divergence between t1_l1_pb_z and t1_l1_qs_z
        loss += 0.5*torch.sum(((t1_l1_pb_z_mu - t1_l1_qs_z)/torch.exp(t1_l1_pb_z_logsigma))**2, -1) + \
            torch.sum(t1_l1_pb_z_logsigma, -1) - torch.sum(t1_l1_qs_z_logsigma, -1)

        #### The following four terms estimate the KL divergence between
        #### the z distribution at time t2 based on the variational distribution
        #### (inference model) and the z distribution at time t2 based on the transition.
        #### In contrast with the above KL divergence for the z distribution at time t1,
        #### this KL divergence cannot be calculated analytically because
        #### the transition distribution depends on z_t1, which is sampled after z_t2.
        #### Therefore, the KL divergence is estimated using samples.

        ## state log probability at time t2 based on belief
        loss += torch.sum(-0.5*t2_l2_z_epsilon**2 - 0.5*t2_l2_z_epsilon.new_tensor(2*np.pi) - t2_l2_z_logsigma, dim=-1)
        loss += torch.sum(-0.5*t2_l1_z_epsilon**2 - 0.5*t2_l1_z_epsilon.new_tensor(2*np.pi) - t2_l1_z_logsigma, dim=-1)

        ## state log probability at time t2 based on transition
        loss += torch.sum(0.5*((t2_l2_z - t2_l2_t_z_mu)/torch.exp(t2_l2_t_z_logsigma))**2 + 0.5*t2_l2_z.new_tensor(2*np.pi) + t2_l2_t_z_logsigma, -1)
        loss += torch.sum(0.5*((t2_l1_z - t2_l1_t_z_mu)/torch.exp(t2_l1_t_z_logsigma))**2 + 0.5*t2_l1_z.new_tensor(2*np.pi) + t2_l1_t_z_logsigma, -1)

        ## observation prob at time t2
        loss += -torch.sum(self.x[:, t2, :]*torch.log(t2_x_prob) + (1 - self.x[:, t2, :])*torch.log(1 - t2_x_prob), -1)

        loss = torch.mean(loss)
        return loss
'''
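Regarding the visualization question: since DBlock is itself an ordinary nn.Module, one option may be to hand it to torchview on its own, with an input size matching how TD_VAE calls it. A minimal sketch (b_size=50 and z_size=8 are made-up values here; use the ones your model was built with):

from torchview import draw_graph

b_size, z_size = 50, 8  # hypothetical sizes, for illustration only

# DBlock is the class from the code above; l2_b_to_z takes b_t of size b_size
# and returns (mu, logsigma), each of size z_size.
block = DBlock(input_size=b_size, hidden_size=50, output_size=z_size)
graph = draw_graph(block, input_size=(1, b_size), graph_name="DBlock")
graph.visual_graph  # renders the flow chart in a notebook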

Trainable Matrix multiplication Layer

I'm trying to build a (custom) trainable matrix-multiplication layer in TensorFlow, but things aren't working out... More precisely, my model should look like this:
x -> A(x) x
where A(x) is a feed-forward network taking values in the n x n matrices (and thus depending on the input x), and A(x) x is a matrix-vector multiplication.
Here's what I've coded-up:
import tensorflow as tf

class custom_layer(tf.keras.layers.Layer):
    def __init__(self, units=16, input_dim=32):
        super(custom_layer, self).__init__()
        self.units = units

    def build(self, input_shape):
        self.Tw1 = self.add_weight(name='Weights_1',
                                   shape=(input_shape[-1], input_shape[-1]),
                                   initializer='GlorotUniform',
                                   trainable=True)
        self.Tw2 = self.add_weight(name='Weights_2',
                                   shape=(input_shape[-1], (self.units)**2),
                                   initializer='GlorotUniform',
                                   trainable=True)
        self.Tb = self.add_weight(name='biases',
                                  shape=(input_shape[-1],),
                                  initializer='GlorotUniform',  # Previously 'ones'
                                  trainable=True)

    def call(self, input):
        # Build Vector-Valued Feed-Forward Network
        ffNN = tf.matmul(input, self.Tw1) + self.Tb
        ffNN = tf.nn.relu(ffNN)
        ffNN = tf.matmul(ffNN, self.Tw2)
        # Map to Matrix
        ffNN = tf.reshape(ffNN, [self.units, self.units])
        # Multiply Matrix-Valued function with input data
        x_out = tf.matmul(ffNN, input)
        # Return Output
        return x_out
Now I build the model:
input_layer = tf.keras.Input(shape=[2])
output_layer = custom_layer(2)(input_layer)
model = tf.keras.Model(inputs=[input_layer], outputs=[output_layer])

# Compile Model
#----------------#
# Define Optimizer
optimizer_on = tf.keras.optimizers.SGD(learning_rate=10**(-1))
# Compile
model.compile(loss='mse',
              optimizer=optimizer_on,
              metrics=['mse'])

# Fit Model
#----------------#
model.fit(data_x, data_y, epochs=(10**1), verbose=0)
and then I get this error message:
InvalidArgumentError: Input to reshape is a tensor with 128 values, but the requested shape has 4
[[node model_62/reconfiguration_unit_70/Reshape (defined at <ipython-input-176-0b494fa3fc75>:46) ]] [Op:__inference_distributed_function_175181]
Errors may have originated from an input operation.
Input Source operations connected to node model_62/reconfiguration_unit_70/Reshape:
model_62/reconfiguration_unit_70/MatMul_1 (defined at <ipython-input-176-0b494fa3fc75>:41)
Function call stack:
distributed_function
Thoughts:
It seems like something is wrong with the network dimensions, but I can't figure out what is wrong or how to repair it...
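The reshape in call() seems to be the culprit: it flattens the whole batch into a single units x units matrix (with the default batch size of 32 and units=2, ffNN holds 32 * 4 = 128 values, while [self.units, self.units] asks for only 4). A sketch of a batch-aware call(), assuming the intent is one n x n matrix A(x) per sample x:

    def call(self, input):
        # Vector-valued feed-forward network, applied per sample
        ffNN = tf.matmul(input, self.Tw1) + self.Tb
        ffNN = tf.nn.relu(ffNN)
        ffNN = tf.matmul(ffNN, self.Tw2)                      # (batch, units**2)
        # Keep the batch dimension when mapping to a matrix
        A = tf.reshape(ffNN, [-1, self.units, self.units])    # (batch, units, units)
        # Batched matrix-vector product: one A(x) per sample x
        return tf.linalg.matvec(A, input)                     # (batch, units)

This also assumes the input dimension equals units (both are 2 in the example), so that A(x) can multiply x.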

Implementing Neural Network using pure Numpy (Softmax + CrossEntropy)

I am trying a simple implementation of a multi-layer perceptron (MLP) using pure NumPy. My previous implementation, using RMSE and a sigmoid activation at the (single) output, works perfectly with appropriate data. However, when I move to a multi-output system (due to one-hot encoding) with a cross-entropy loss function and softmax activation, it always fails.
I believe I am doing something wrong in my gradient calculation but am unable to figure it out, so I am here for help.
For the current implementation, I use the Iris dataset to test the model.
The Iris data is obtained as follows:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import minmax_scale
def one_hot_encoder(y):
    y_oh = np.zeros((len(y), np.max(y)+1))
    for t in np.unique(y):
        y_oh[y==t, t] = 1
    return y_oh
data = load_iris().data
target = load_iris().target
data_scaled = minmax_scale(data)
target_oh = one_hot_encoder(target)
A neural network class with a single hidden layer is defined as follows:
class NeuralNetwork:
    def __init__(self, x, y):
        self.x = x
        # hidden layer with 16 nodes
        self.weights1 = np.random.rand(self.x.shape[1], 16)
        self.bias1 = np.random.rand(16)
        # output layer with 3 nodes (for 3 outputs - one-hot encoded)
        self.weights2 = np.random.rand(16, 3)
        self.bias2 = np.random.rand(3)
        self.y = y
        self.pred = np.zeros(y.shape)
        self.lr = 0.001

    def feedforward(self):
        self.layer1 = sigmoid(np.dot(self.x, self.weights1) + self.bias1)
        self.layer2 = softmax(np.dot(self.layer1, self.weights2) + self.bias2)
        # print(self.layer2.shape)
        return self.layer2.clip(min=1e-8, max=None)

    def backprop(self):
        dloss = cross_entropy_derivative(self.pred, self.y)  # 2*(self.y - self.pred)
        d_weights2 = np.dot(self.layer1.T, dloss*softmax_derivative(self.pred))
        d_bias2 = np.dot(np.ones([self.x.shape[0]]), dloss*softmax_derivative(self.pred))
        d_weights1 = np.dot(self.x.T, np.dot(dloss*softmax_derivative(self.pred), self.weights2.T)*sigmoid_derivative(self.layer1))
        d_bias1 = np.dot(np.ones([self.x.shape[0]]), np.dot(dloss*softmax_derivative(self.pred), self.weights2.T)*sigmoid_derivative(self.layer1))
        self.weights1 += self.lr*d_weights1
        self.weights2 += self.lr*d_weights2
        self.bias1 += self.lr*d_bias1
        self.bias2 += self.lr*d_bias2

    def train(self, X, y):
        self.x = X
        self.y = y
        self.pred = self.feedforward()
        self.backprop()

    def predict(self, X):
        self.x = X
        self.pred = self.feedforward()
        return self.pred

    def evaluate(self, y, pred):
        self.y = y
        self.pred = pred
        # self.loss = np.sqrt(np.mean((self.pred-self.y)**2))
        self.loss = cross_entropy(self.pred, self.y)
        return self.loss
The activation functions and their derivatives are computed as follows (I feel there is something wrong here):
# Activation functions
def sigmoid(t):
    return 1/(1+np.exp(-t))

# Derivative of sigmoid
def sigmoid_derivative(p):
    return p * (1 - p)

# softmax activation
def softmax(X):
    exps = np.exp(X - np.max(X, axis=1).reshape(-1, 1))
    return exps / np.sum(exps, axis=1)[:, None]

# derivative of softmax
def softmax_derivative(pred):
    return pred * (1 - (1 * pred).sum(axis=1)[:, None])
The cross-entropy loss function and its derivatives are as shown below:
def cross_entropy(X, y):
    X = X.clip(min=1e-8, max=None)
    # print('\n\nCE: ', (np.where(y==1, -np.log(X), 0)).sum(axis=1))
    return (np.where(y==1, -np.log(X), 0)).sum(axis=1)

def cross_entropy_derivative(X, y):
    X = X.clip(min=1e-8, max=None)
    # print('\n\nCED: ', np.where(y==1, -1/X, 0))
    return np.where(y==1, -1/X, 0)
The main function call:
NN = NeuralNetwork(data_scaled, target_oh)
loss = []
for i in range(10000):  # trains the NN 10,000 times
    NN.train(data_scaled, target_oh)
    loss.append(NN.evaluate(NN.y, NN.pred))
y_pred = NN.predict(data_scaled)
The output is approximately constant, always predicting a single class. What am I doing wrong? I would appreciate your help with the code or directions on where to look. Thanks.
Subtract the gradient instead of adding it, and take the derivative with respect to the unactivated output (the logits) rather than the activated output. Read the piece of code I wrote for more info, and watch Sebastian Lague's video about neural networks for help on this topic.
P.S. The video is not in Python, but it explains exactly what three years of college tries to explain.
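To spell out what that answer points at: with softmax combined with cross-entropy, the derivative of the loss with respect to the pre-softmax logits simplifies to pred - y, and gradient descent should subtract the update. A sketch of backprop() rewritten along those lines (same attribute names as the class above, not the original code):

    def backprop(self):
        n = self.x.shape[0]
        # Gradient of cross-entropy w.r.t. the pre-softmax logits: pred - y
        dz2 = (self.pred - self.y) / n                       # (n, 3)
        d_weights2 = np.dot(self.layer1.T, dz2)              # (16, 3)
        d_bias2 = dz2.sum(axis=0)                            # (3,)
        # Backpropagate through the hidden sigmoid layer
        dz1 = np.dot(dz2, self.weights2.T) * sigmoid_derivative(self.layer1)  # (n, 16)
        d_weights1 = np.dot(self.x.T, dz1)                   # (4, 16)
        d_bias1 = dz1.sum(axis=0)                            # (16,)
        # Descend: subtract the gradients
        self.weights1 -= self.lr * d_weights1
        self.weights2 -= self.lr * d_weights2
        self.bias1 -= self.lr * d_bias1
        self.bias2 -= self.lr * d_bias2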

Can't init the weights of my neural network PyTorch

I can't initialize the weights with the function MyNet.apply(init_weights).
These are my functions:
import torch

def init_weights(net):
    if type(net) == torch.nn.Module:
        torch.nn.init.kaiming_uniform_(net.weight)
        net.bias.data.fill_(0.01)  # set all biases to 0.01
My neural net is the following:
class NeuralNet(torch.nn.Module):
    def __init__(self):
        super().__init__()  # Necessary for torch to detect this class as trainable
        # Here define network architecture
        self.layer1 = torch.nn.Linear(28**2, 32).to(device)  # Linear layer with 32 neurons
        self.layer2 = torch.nn.Linear(32, 64).to(device)     # Linear layer with 64 neurons
        self.layer3 = torch.nn.Linear(64, 128).to(device)    # Linear layer with 128 neurons
        self.output = torch.nn.Linear(128, 1).to(device)     # Linear layer with 1 output neuron (binary output)

    def forward(self, x):
        # Here define architecture behavior
        x = torch.sigmoid(self.layer1(x)).to(device)  # x = torch.nn.functional.relu(self.layer1(x))
        x = torch.sigmoid(self.layer2(x)).to(device)
        x = torch.sigmoid(self.layer3(x)).to(device)
        return torch.sigmoid(self.output(x)).to(device)  # Binary output
type(net) prints as Linear, so it never gets inside the if statement, and if I remove the check it produces the following error:
AttributeError: 'NeuralNet' object has no attribute 'weight'
You should initialize only the weights of the linear layers:
def init_weights(net):
    if type(net) == torch.nn.Linear:
        torch.nn.init.kaiming_uniform_(net.weight)
        net.bias.data.fill_(0.01)  # set all biases to 0.01
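Module.apply visits every submodule recursively, including the top-level NeuralNet itself, which has no .weight attribute; that is where the original AttributeError came from. With the check restricted to torch.nn.Linear, the call from the question works unchanged:

model = NeuralNet()
model.apply(init_weights)  # only the Linear layers are initialized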

Why doesn't the lasso here provide me with zero coefficients?

I got the idea of implementing my version of deep feature selection from this paper: http://link.springer.com/chapter/10.1007%2F978-3-319-16706-0_20
The basic idea of deep feature selection, according to the paper, is to add a one-to-one mapping layer before any fully connected hidden layer, and then use a regularization term (lasso or elastic net) to produce zeros in the input-layer weights.
My question is: even though it seems I have implemented the deep feature selection framework well, testing on random data generated by numpy.random.rand(1000, 50) fails to give me any zeros in the input-layer weights. Is this common for lasso-like regularization? Should I adjust the parameters I used for this framework (even more epochs)? Or did I do something wrong in my code?
import numpy as np
import tensorflow as tf

class DeepFeatureSelectionMLP:
    def __init__(self, X, Y, hidden_dims=[100], epochs=1000,
                 lambda1=0.001, lambda2=1.0, alpha1=0.001, alpha2=0.0, learning_rate=0.1):
        # Initiate the input layer
        # Get the dimension of the input X
        n_sample, n_feat = X.shape
        n_classes = len(np.unique(Y))

        # One hot Y
        one_hot_Y = np.zeros((len(Y), n_classes))
        for i, j in enumerate(Y):
            one_hot_Y[i][j] = 1
        self.epochs = epochs
        Y = one_hot_Y

        # Store up original value
        self.X = X
        self.Y = Y

        # Two variables with undetermined length are created
        self.var_X = tf.placeholder(dtype=tf.float32, shape=[None, n_feat], name='x')
        self.var_Y = tf.placeholder(dtype=tf.float32, shape=[None, n_classes], name='y')

        self.input_layer = One2OneInputLayer(self.var_X)
        self.hidden_layers = []
        layer_input = self.input_layer.output

        # Create hidden layers
        for dim in hidden_dims:
            self.hidden_layers.append(DenseLayer(layer_input, dim))
            layer_input = self.hidden_layers[-1].output

        # Final classification layer, variable Y is passed
        self.softmax_layer = SoftmaxLayer(self.hidden_layers[-1].output, n_classes, self.var_Y)

        n_hidden = len(hidden_dims)

        # regularization terms on coefficients of input layer
        self.L1_input = tf.reduce_sum(tf.abs(self.input_layer.w))
        self.L2_input = tf.nn.l2_loss(self.input_layer.w)

        # regularization terms on weights of hidden layers
        L1s = []
        L2_sqrs = []
        for i in xrange(n_hidden):
            L1s.append(tf.reduce_sum(tf.abs(self.hidden_layers[i].w)))
            L2_sqrs.append(tf.nn.l2_loss(self.hidden_layers[i].w))
        L1s.append(tf.reduce_sum(tf.abs(self.softmax_layer.w)))
        L2_sqrs.append(tf.nn.l2_loss(self.softmax_layer.w))

        self.L1 = tf.add_n(L1s)
        self.L2_sqr = tf.add_n(L2_sqrs)

        # Cost with two regularization terms
        self.cost = self.softmax_layer.cost \
            + lambda1*(1.0-lambda2)*0.5*self.L2_input + lambda1*lambda2*self.L1_input \
            + alpha1*(1.0-alpha2)*0.5*self.L2_sqr + alpha1*alpha2*self.L1

        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
        self.y = self.softmax_layer.y

    def train(self, batch_size=100):
        sess = tf.Session()
        sess.run(tf.initialize_all_variables())
        for i in xrange(self.epochs):
            x_batch, y_batch = get_batch(self.X, self.Y, batch_size)
            sess.run(self.optimizer, feed_dict={self.var_X: x_batch, self.var_Y: y_batch})
            if (i + 1) % 50 == 0:
                l = sess.run(self.cost, feed_dict={self.var_X: x_batch, self.var_Y: y_batch})
                print('epoch {0}: global loss = {1}'.format(i, l))
        self.selected_w = sess.run(self.input_layer.w)
        print(self.selected_w)
class One2OneInputLayer(object):
    # One to One Mapping!
    def __init__(self, input):
        """
        The second dimension of the input,
        for each input, each row is a sample
        and each column is a feature, since
        this is one to one mapping, n_in equals
        the number of features
        """
        n_in = input.get_shape()[1].value
        self.input = input
        # Initiate the weight for the input layer
        w = tf.Variable(tf.zeros([n_in,]), name='w')
        self.w = w
        self.output = self.w * self.input
        self.params = [w]
class DenseLayer(object):
    # Canonical dense layer
    def __init__(self, input, n_out, activation='sigmoid'):
        """
        The second dimension of the input,
        for each input, each row is a sample
        and each column is a feature, since
        this is one to one mapping, n_in equals
        the number of features
        n_out defines how many nodes are there in the
        hidden layer
        """
        n_in = input.get_shape()[1].value
        self.input = input
        # Initiate the weight for the input layer
        w = tf.Variable(tf.ones([n_in, n_out]), name='w')
        b = tf.Variable(tf.ones([n_out]), name='b')
        output = tf.add(tf.matmul(input, w), b)
        output = activate(output, activation)
        self.w = w
        self.b = b
        self.output = output
        self.params = [w]
class SoftmaxLayer(object):
    def __init__(self, input, n_out, y):
        """
        The second dimension of the input,
        for each input, each row is a sample
        and each column is a feature, since
        this is one to one mapping, n_in equals
        the number of features
        n_out defines how many nodes are there in the
        hidden layer
        """
        n_in = input.get_shape()[1].value
        self.input = input
        # Initiate the weight and biases for this layer
        w = tf.Variable(tf.random_normal([n_in, n_out]), name='w')
        b = tf.Variable(tf.random_normal([n_out]), name='b')
        pred = tf.add(tf.matmul(input, w), b)
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
        self.y = y
        self.w = w
        self.b = b
        self.cost = cost
        self.params = [w]
Gradient-descent algorithms such as Adam do not give exact zeros when using L1 regularization. Instead, something like FTRL or proximal Adagrad can give you exact zeros.
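For example, with the TF1-style API used above, the optimizer line could be swapped for one of the following (a sketch reusing the same cost and learning_rate; the L1 strength shown is an arbitrary value, and note that an optimizer-level penalty applies to every trainable variable, not only the one-to-one input layer):

# FTRL with an explicit L1 penalty (drives weights to exact zeros)
self.optimizer = tf.train.FtrlOptimizer(
    learning_rate=learning_rate,
    l1_regularization_strength=0.001).minimize(self.cost)

# ... or proximal Adagrad, which also supports an L1 term
self.optimizer = tf.train.ProximalAdagradOptimizer(
    learning_rate=learning_rate,
    l1_regularization_strength=0.001).minimize(self.cost)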
