I need your help. I am trying to modify the Python-based neural network for MNIST data classification developed by M. Nielsen [http://neuralnetworksanddeeplearning.com/index.html].
In particular, I am using the network3.py script, which employs the Theano library.
The last layer in this network is a softmax, but in the future I want to use this program for regression, and therefore I need to modify it by changing the last layer to a sigmoid one.
When I simply change

activation_fn=softmax

to

activation_fn=sigmoid

the program does not work properly.
The important parts of the code are provided below.
# Initialization of the neural network
net = Network([
    ConvPoolLayer(input_shape=(mini_batch_size, 1, 28, 28),
                  filter_shape=(20, 1, 5, 5),
                  poolsize=(2, 2),
                  activation_fn=ReLU),
    ConvPoolLayer(input_shape=(mini_batch_size, 20, 12, 12),
                  filter_shape=(40, 20, 5, 5),
                  poolsize=(2, 2),
                  activation_fn=ReLU),
    FullyConnectedLayer(n_in=40*4*4, n_out=100, activation_fn=ReLU, p_dropout=0.0),
    SoftmaxLayer(n_in=100, n_out=10, activation_fn=softmax, p_dropout=0.0)],
    mini_batch_size)
...
# Softmax layer
class SoftmaxLayer(object):

    def __init__(self, n_in, n_out, activation_fn, p_dropout):
        self.n_in = n_in
        self.n_out = n_out
        self.activation_fn = activation_fn
        self.p_dropout = p_dropout
        # Initialize weights and biases
        self.w = theano.shared(
            np.asarray(np.random.normal(loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)),
                       dtype=theano.config.floatX),
            name='w', borrow=True)
        self.b = theano.shared(
            np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)),
                       dtype=theano.config.floatX),
            name='b', borrow=True)
        self.params = [self.w, self.b]

    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
        self.inpt = inpt.reshape((mini_batch_size, self.n_in))
        self.output = self.activation_fn((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b)
        self.y_out = T.argmax(self.output, axis=1)  # ??? Change
        self.inpt_dropout = dropout_layer(
            inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout)
        self.output_dropout = self.activation_fn(T.dot(self.inpt_dropout, self.w) + self.b)

    # Return the log-likelihood cost
    def cost(self, net):
        return -T.mean(T.log(self.output_dropout)[T.arange(net.y.shape[0]), net.y])

    # Return the accuracy for the mini-batch
    def accuracy(self, y):
        return T.mean(T.eq(y, self.y_out))
I made the following modifications:
1) changed the way targets are represented: before, a target was a single digit (0, 5, 8, or any number corresponding to the picture to be classified); now it is a vector with 10 elements, so 0 is equivalent to [1,0,0,...,0], 5 is equivalent to [0,0,0,0,0,1,0,...,0], etc.
The data-loading code needs correspondingly minor modifications for this new format.
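For reference, a minimal sketch of the conversion I mean (the helper name vectorize_label is my own; Nielsen's mnist_loader has a similar vectorized_result function):

import numpy as np

def vectorize_label(j, n_classes=10):
    # One-hot encode digit j, e.g., 0 -> [1,0,...,0], 5 -> [0,0,0,0,0,1,0,...,0]
    e = np.zeros(n_classes)
    e[j] = 1.0
    return e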
2) changed the layer definition (code below). The main changes are in cost and accuracy.
# Fully connected layer
class FullyConnectedLayer(object):

    def __init__(self, n_in, n_out, activation_fn, p_dropout):
        self.n_in = n_in
        self.n_out = n_out
        self.activation_fn = activation_fn
        self.p_dropout = p_dropout
        # Initialize weights and biases
        self.w = theano.shared(
            np.asarray(np.random.normal(loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)),
                       dtype=theano.config.floatX),
            name='w', borrow=True)
        self.b = theano.shared(
            np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)),
                       dtype=theano.config.floatX),
            name='b', borrow=True)
        self.params = [self.w, self.b]

    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
        self.inpt = inpt.reshape((mini_batch_size, self.n_in))
        self.output = self.activation_fn((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b)
        self.y_out = self.output  # was: T.argmax(self.output, axis=1)  # ??? Change
        self.inpt_dropout = dropout_layer(
            inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout)
        self.output_dropout = self.activation_fn(T.dot(self.inpt_dropout, self.w) + self.b)

    # Return the cross-entropy cost  # ??? Change
    def cost(self, net):
        xent = -net.y*T.log(self.output_dropout) - (1-net.y)*T.log(1-self.output_dropout)
        return T.mean(xent)

    # Accuracy for the mini-batch
    def accuracy(self, y):
        y_pred = T.argmax(self.y_out, axis=1)
        y_targ = T.argmax(y, axis=1)
        return T.mean(T.eq(y_targ, y_pred))
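One thing I suspect, but have not verified: with a sigmoid output, self.output_dropout can saturate to exactly 0 or 1, so T.log in the cost produces NaNs. A clipped variant of the cost would look like this (eps is an arbitrary small constant of my choosing):

    def cost(self, net):
        eps = 1e-7
        out = T.clip(self.output_dropout, eps, 1 - eps)  # keep log() finite
        xent = -net.y*T.log(out) - (1 - net.y)*T.log(1 - out)
        return T.mean(xent)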
I have the following code:
import torch
from torch.nn.utils.stateless import functional_call
import torch.autograd as autograd
import torch.nn as nn

# This is the model
class Encoder(nn.Module):
    def __init__(self, action_dim, z_dim, skill_length):
        super().__init__()
        print(action_dim)
        self.lin1 = nn.Linear(action_dim, action_dim)
        self.lstm = nn.LSTM(input_size=action_dim, hidden_size=z_dim, batch_first=True)
        self.lin2 = nn.Linear(z_dim, z_dim)

    def forward(self, skill):
        a, b, c = skill.shape
        skill = skill.reshape(-1, skill.shape[-1])
        embed = self.lin1(skill)
        embed = embed.reshape(a, b, c)
        mean, _ = self.lstm(embed)
        mean = mean[:, -1, :]
        mean = self.lin2(mean)
        return mean

# This is the initialization function
def pars(model):
    params = {}
    for name, param in model.named_parameters():
        if len(param.shape) == 1:
            init = torch.nn.init.constant_(param, 0)
        else:
            init = torch.nn.init.orthogonal_(param)
        params[name] = nn.Parameter(init)
    return params

# Initializing the model
model = Encoder(4, 2, 5)
x = torch.rand(3, 5, 4)
params = pars(model)

# Running the model with functional_call and calculating the gradient.
samp = functional_call(model, params, x)
grad_f = autograd.grad(torch.mean(samp), params.values(),
                       retain_graph=True, allow_unused=True)
print(grad_f)
# grad_f has gradients for the linear layers, but None for the LSTM layer.

# Running the model without functional_call and calculating the gradient.
samp = model(x)
grad = autograd.grad(torch.mean(samp), model.parameters(), retain_graph=True)
print(grad)
# grad has gradients for all layers, i.e., the linear and LSTM layers.
I know the problem is with the LSTM layer, because when I use a linear layer (nn.Linear) in its place, the gradient is computed for that layer as well. Unfortunately, I do not know how to resolve this problem. I'd appreciate any help.
Edit: I heavily edited the code provided just to simplify the example further. This code can be copied and run.
Update Dec 11, 2022
import pdb  # needed for the breakpoint below

class Encoder(nn.Module):
    def __init__(self, action_dim, z_dim, skill_length):
        super().__init__()
        print(action_dim)
        self.lin1 = nn.Linear(action_dim, action_dim)
        self.lstm = nn.LSTM(input_size=action_dim, hidden_size=z_dim, batch_first=True)
        self.lin2 = nn.Linear(z_dim, z_dim)

    def forward(self, skill):
        a, b, c = skill.shape
        skill = skill.reshape(-1, skill.shape[-1])
        embed = self.lin1(skill)
        embed = embed.reshape(a, b, c)
        mean, _ = self.lstm(embed)
        pdb.set_trace()
        # `params` is the dict created by pars() above
        grad1 = autograd.grad(mean.mean(), params.values(),
                              retain_graph=True, allow_unused=True)
        # This gives a gradient for the self.lin1 layer, and None for the LSTM
        grad2 = autograd.grad(mean.mean(), self.parameters(),
                              retain_graph=True, allow_unused=True)
        # This gives a gradient for the LSTM, but None for the self.lin1 layer
        mean = mean[:, -1, :]
        mean = self.lin2(mean)
        return mean
When I run it the regular way, calling the model directly without functional_call, then autograd.grad(mean.mean(), self.parameters(), allow_unused=True, retain_graph=True) has gradients for both the self.lin1 and LSTM layers.
I don't know if this information is useful, but I'm putting it out there just in case.
Given a simple multivariate time series problem:

import pandas as pd

l = [list(range(1000)), list(range(1000, 2000)), list(range(2000, 3000)),
     list(range(3000, 4000)), list(range(4000, 5000))]
df = pd.DataFrame(l).T
df.columns = ['feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5']
target_sensor = 'feat_5'
If we want to predict the value of target_sensor at t + forecast_lead time steps:
forecast_lead = 15
print('\nforecast_lead', forecast_lead)
target = f"{target_sensor}_TARGET{forecast_lead}"
features = list(df.columns.difference([target]))
df[target] = df[target_sensor].shift(-forecast_lead)
df = df.iloc[:-forecast_lead]
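A quick sanity check of the shifted target (the printed values are what this toy frame gives; the target column becomes float because shift introduces NaNs before the trim):

print(df[[target_sensor, target]].head(3))
#    feat_5  feat_5_TARGET15
# 0    4000           4015.0
# 1    4001           4016.0
# 2    4002           4017.0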
The input preparation is based on this torch.utils.data.Dataset subclass:
import torch
from torch.utils.data import Dataset, DataLoader

class My_Dataset(Dataset):
    def __init__(self, dataframe, target, features, sequence_length):
        self.features = features                # list of feature column names
        self.target = target                    # target column name (str)
        self.sequence_length = sequence_length  # length of the history window
        self.X = torch.tensor(dataframe[features].values).float()  # to tensor
        self.y = torch.tensor(dataframe[target].values).float()    # to tensor

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i):
        if i >= self.sequence_length - 1:
            i_start = i - self.sequence_length + 1
            x = self.X[i_start:(i + 1), :]
        else:
            # Not enough history yet: left-pad by repeating the first row
            padding = self.X[0].repeat(self.sequence_length - i - 1, 1)
            x = self.X[0:(i + 1), :]
            x = torch.cat((padding, x), 0)
        return x, self.y[i]
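For illustration, a quick check of what __getitem__ returns (sequence_length=10 and using the whole df are my own choices here; the real code would use a train split):

dataset = My_Dataset(df, target=target, features=features, sequence_length=10)
x, y = dataset[0]
print(x.shape)  # torch.Size([10, 5]) -- left-padded, since i=0 has no history
print(y)        # tensor(4015.) -- the shifted target at row 0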
With the following DataLoader (batch_size=1 for simplicity):

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False, num_workers=1)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=1)
The model I am using is this:

class LSTM_Multivariate_Time_Series_Regression(nn.Module):
    def __init__(self, num_features, hidden_size):
        super().__init__()
        self.num_features = num_features
        self.hidden_size = hidden_size
        self.num_layers = 1  # or more than 1 here
        self.lstm = nn.LSTM(input_size=num_features,
                            hidden_size=hidden_size,
                            batch_first=True,
                            num_layers=self.num_layers)
        self.linear = nn.Linear(in_features=self.hidden_size,
                                out_features=1)
In the forward pass, if num_layers = 1:

    def forward(self, x):
        lstm_output, (hn, cn) = self.lstm(x)
        out = self.linear(hn[0])  # first dim of hn is num_layers, which is set to 1 above
In the forward pass, before passing through the linear layer, if self.num_layers > 1, I suppose the following options are available (both are sketched in code below):

Use the last hidden state hn[-1]

Use the concatenation of the hidden states of all layers

# Docs, with batch_first=True: lstm_output is a tensor of shape (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE)
# Docs, with batch_first=True: h_n is a tensor of shape (NUM_LAYERS, BATCH_SIZE, HIDDEN_SIZE) containing the final hidden state for each element in the batch

Option 1:

    out = self.linear(hn[-1]).flatten()
    return out
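To make the options concrete, this is how I understand they would look in code (a sketch assuming batch_first=True and a unidirectional LSTM; option 2 is commented out):

    def forward(self, x):
        lstm_output, (hn, cn) = self.lstm(x)
        # Option 1: final hidden state of the last layer, shape (batch, hidden_size);
        # for a unidirectional LSTM this equals lstm_output[:, -1, :]
        out = self.linear(hn[-1]).flatten()
        # Option 2: concatenate the final hidden states of all layers into a
        # (batch, num_layers * hidden_size) tensor; self.linear would then need
        # in_features=self.num_layers * self.hidden_size
        # hn_cat = hn.transpose(0, 1).reshape(x.shape[0], -1)
        # out = self.linear(hn_cat).flatten()
        return out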
Is it correct to attach the final linear layer to the LSTM this way?
I am trying to run the following code (as given in the TensorFlow documentation) to create windows of my data and then flatten the dataset of datasets.
import tensorflow as tf

range_ds = tf.data.Dataset.range(100000)  # assumption: a Dataset like the one in the TF docs example

window_size = 5
windows = range_ds.window(window_size, shift=1)
for sub_ds in windows.take(5):
    print(sub_ds)

flat_windows = windows.flat_map(lambda x: x)
The problem is that flat_windows.cardinality().numpy() returns -2 as the cardinality, which creates problems for me during training. I tried looking for ways to set the cardinality of a dataset but couldn't find anything. I also tried other ways of flattening a dataset of datasets, but again without success.
Edit 1: The problem with training is that the shape is unknown (at the Linear and Dense layers) when I train the subclassed model (given below). The model trains well when I run it eagerly (through tf.config.run_functions_eagerly(True)), but that is slow. Therefore I want the input shapes to be known to the model during training.
Neural Network
import tensorflow as tf

loss_tracker = tf.keras.metrics.Mean(name="loss")  # assumed: the metric is defined at module level

class NeuralNetworkModel(tf.keras.Model):
    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.encoder = Encoder()

    def train_step(self, inputs):
        X = inputs[0]
        Y = inputs[1]

        with tf.GradientTape() as tape:
            enc_X = self.encoder(X)
            enc_Y = self.encoder(Y)
            # loss:
            loss = tf.norm(enc_Y - enc_X, axis=[0, 1], ord='fro')

        # Compute gradients
        trainable_vars = self.encoder.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Compute our own metrics
        loss_tracker.update_state(loss)
        # Return a dict mapping metric names to current value.
        # Note that it will include the loss (tracked in self.metrics).
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker]

    def test_step(self, inputs):
        X = inputs[0]
        Y = inputs[1]

        Psi_X = self.encoder(X)
        Psi_Y = self.encoder(Y)
        # loss:
        loss = tf.norm(Psi_Y - Psi_X, axis=[0, 1], ord='fro')
        # Compute our own metrics
        loss_tracker.update_state(loss)
        # Return a dict mapping metric names to current value.
        # Note that it will include the loss (tracked in self.metrics).
        return {"loss": loss_tracker.result()}


class Encoder(tf.keras.Model):
    def __init__(self):
        super(Encoder, self).__init__(dtype='float64', name='Encoder')
        self.input_layer = DenseLayer(128)
        self.hidden_layer1 = DenseLayer(128)
        self.hidden_layer2 = DenseLayer(64)
        self.hidden_layer3 = DenseLayer(64)
        self.output_layer = LinearLayer(64)

    def call(self, input_data, training=None):
        fx = self.input_layer(input_data)
        fx = self.hidden_layer1(fx)
        fx = self.hidden_layer2(fx)
        fx = self.hidden_layer3(fx)
        return self.output_layer(fx)


class LinearLayer(tf.keras.layers.Layer):
    def __init__(self, units):
        super(LinearLayer, self).__init__(dtype='float64')
        self.units = units

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.w = self.add_weight(shape=(input_dim, self.units),
                                 initializer="random_normal",
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer=tf.zeros_initializer(),
                                 trainable=True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b


class DenseLayer(tf.keras.layers.Layer):
    def __init__(self, units):
        super(DenseLayer, self).__init__(dtype='float64')
        self.units = units

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.w = self.add_weight(shape=(input_dim, self.units),
                                 initializer="random_normal",
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer=tf.zeros_initializer(),
                                 trainable=True)

    def call(self, inputs):
        x = tf.matmul(inputs, self.w) + self.b
        return tf.nn.elu(x)
I was wondering about this as well. It turns out that -2 is tf.data.UNKNOWN_CARDINALITY (https://www.tensorflow.org/api_docs/python/tf/data#UNKNOWN_CARDINALITY), which indicates that TF doesn't know how many elements flat_map returns per item.
I just asked "Windowing a TensorFlow dataset without losing cardinality information?" to see if anyone knows a way to window datasets without losing cardinality.
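A partial workaround (a sketch assuming the window count can be computed by hand, which is easy with drop_remainder=True) is to re-attach the count with tf.data.experimental.assert_cardinality, which fails at runtime if the asserted count turns out to be wrong:

import tensorflow as tf

range_ds = tf.data.Dataset.range(100)
window_size = 5

# drop_remainder=True so every window is full and the count is predictable
windows = range_ds.window(window_size, shift=1, drop_remainder=True)
flat_windows = windows.flat_map(lambda w: w.batch(window_size))  # one tensor per window

n_windows = 100 - window_size + 1  # 96 full windows with shift=1
flat_windows = flat_windows.apply(tf.data.experimental.assert_cardinality(n_windows))
print(flat_windows.cardinality().numpy())  # 96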
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.encoder = nn.Linear(300, 100)
        self.dense1 = nn.Sequential(nn.Linear(100, 10), nn.ReLU())
        self.dense2 = nn.Sequential(nn.Linear(10, 5), nn.ReLU())
        self.dense3 = nn.Sequential(nn.Linear(5, 1))

    def forward(self, x):
        x = self.encoder(x)
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)
        return x
I am working on a regression problem, and I need to use the output of the dense2 layer to calculate the loss. The output of the dense2 layer is 5-dimensional (5x1). I am using PyTorch.

Dataset: suppose I am using 300 features and I need to predict some score (a floating-point value).

Input: 300 features
Output: some floating-point value
In general, your nn.Module can return as many elements as you like, and you don't have to use them all anywhere; there is no mechanism that checks that. PyTorch's philosophy is to build the computational graph on the fly.
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.encoder = nn.Linear(300, 100)
        self.dense1 = nn.Sequential(nn.Linear(100, 10), nn.ReLU())
        self.dense2 = nn.Sequential(nn.Linear(10, 5), nn.ReLU())
        self.dense3 = nn.Sequential(nn.Linear(5, 1))

    def forward(self, x):
        enc_output = self.encoder(x)
        dense1_output = self.dense1(enc_output)
        dense2_output = self.dense2(dense1_output)
        dense3_output = self.dense3(dense2_output)
        return dense3_output, dense2_output
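For example, the second output can then feed into the loss however you need (the loss combination below is purely illustrative, not from the question):

import torch
import torch.nn as nn

model = Model()
x = torch.rand(8, 300)      # batch of 8 samples, 300 features each
target = torch.rand(8, 1)   # the floating-point scores to predict

pred, dense2_output = model(x)
# Main regression loss plus an illustrative penalty on the dense2 activations
loss = nn.functional.mse_loss(pred, target) + 0.1 * dense2_output.pow(2).mean()
loss.backward()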
I am new to TensorFlow.
I want to build a 3-layer neural network.
i) I have declared the weight W inside a tf.function, which throws the following error:
ValueError: tf.function-decorated function tried to create variables on non-first call.
ii) I have also created a dictionary, sample_batch, with keys x (features) and y (labels), which I want to pass to the tf.function and use later, but I don't understand how to do it.
Here is a snippet of my code:
@tf.function
def forward_pass1(batch):
    print(batch)
    W = tf.Variable(tf.initializers.TruncatedNormal(stddev=np.sqrt(2.0 / 784))(shape=(784, 128)), name="W")
    biases = tf.Variable(tf.zeros(shape=(128)), name="b")
    hidden1 = tf.nn.relu(tf.matmul(batch['x'], W) + biases)
    print(hidden1)

sample_batch = federated_train_data[5][-1]
forward_pass1(sample_batch)
Can anyone please help?
In your case: initialize the variables outside the function and pass them in as arguments.

W = tf.Variable(tf.initializers.TruncatedNormal(stddev=np.sqrt(2.0 / 784))(shape=(784, 128)), name="W")
biases = tf.Variable(tf.zeros(shape=(128)), name="b")

Your function should then look like this:

@tf.function
def forward_pass1(batch, W, biases):
    print(batch)
    hidden1 = tf.nn.relu(tf.matmul(batch['x'], W) + biases)
    print(hidden1)

sample_batch = federated_train_data[5][-1]
forward_pass1(sample_batch, W, biases)
The better way, as per the TensorFlow guide, is to wrap the variables in a layer:

import tensorflow as tf
from tensorflow.keras import layers

class Linear(layers.Layer):
    def __init__(self, units=128, input_dim=784):
        super(Linear, self).__init__()
        w_init = tf.random_normal_initializer()
        self.W = tf.Variable(initial_value=w_init(shape=(input_dim, units), dtype='float32'),
                             trainable=True)
        b_init = tf.zeros_initializer()
        self.biases = tf.Variable(initial_value=b_init(shape=(units,), dtype='float32'),
                                  trainable=True)

    def call(self, inputs):
        return tf.nn.relu(tf.matmul(inputs, self.W) + self.biases)

x = tf.ones((2, 2))
linear_layer = Linear(4, 2)  # units=4, input_dim=2
y = linear_layer(x)
print(y)