Does pytorch apply softmax automatically in nn.Linear - python

In pytorch a classification network model is defined as this,
class Net(torch.nn.Module):
    """Two-layer fully connected network that returns raw output scores.

    ``forward`` applies a ReLU after the hidden layer and nothing after the
    output layer, so no softmax is applied by this module.
    """

    def __init__(self, n_feature, n_hidden, n_output):
        super().__init__()
        # input -> hidden projection
        self.hidden = torch.nn.Linear(n_feature, n_hidden)
        # hidden -> output projection
        self.out = torch.nn.Linear(n_hidden, n_output)

    def forward(self, x):
        """Return ``out(relu(hidden(x)))`` without any final normalisation."""
        hidden_activation = F.relu(self.hidden(x))
        return self.out(hidden_activation)
Is softmax applied here? In my understanding, things should be like,
class Net(torch.nn.Module):
    """Two-layer fully connected classifier that ends in an explicit softmax.

    Args:
        n_feature: size of each input sample.
        n_hidden: width of the hidden layer.
        n_output: number of classes (size of the last axis of the result).

    ``forward`` returns values that are non-negative and sum to 1 along the
    last axis. Note that ``torch.nn.CrossEntropyLoss`` expects raw scores,
    so this variant should not be fed into that loss.
    """

    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)  # hidden layer
        # The class is spelled ``ReLU``; ``torch.nn.ReLu`` does not exist.
        self.relu = torch.nn.ReLU(inplace=True)
        self.out = torch.nn.Linear(n_hidden, n_output)  # output layer
        # ``dim`` is the axis to normalise over, not the number of classes.
        # ``Linear`` puts the class scores on the last axis, hence -1.
        self.softmax = torch.nn.Softmax(dim=-1)

    def forward(self, x):
        """Return class probabilities for ``x``."""
        x = self.hidden(x)
        x = self.relu(x)  # activation function for hidden layer
        x = self.out(x)
        x = self.softmax(x)
        return x
I understand that F.relu(self.hidden(x)) in the first block is also applying ReLU, but the first block of code doesn't apply softmax, right?

Latching on to what @jodag was already saying in his comment, and extending it a bit to form a full answer:
No, PyTorch does not automatically apply softmax, and you can at any point apply torch.nn.Softmax() as you want. But, softmax has some issues with numerical stability, which we want to avoid as much as we can. One solution is to use log-softmax, but this tends to be slower than a direct computation.
Especially when we are using Negative Log Likelihood as a loss function (in PyTorch, this is torch.nn.NLLLoss), we can utilize the fact that the derivative of (log-)softmax + NLLLoss is actually mathematically quite nice and simple, which is why it makes sense to combine both into a single function/element. The result is then torch.nn.CrossEntropyLoss, which expects the raw (un-normalized) outputs of the last layer as its input. Again, note that this only applies directly to the last layer of your network; any other computation is not affected by any of this.

Related

Implementing Additive (Bahdanau) Attention mechanism with keras functional API

I was looking to implement attention in my seq2seq model. I know that Keras has built-in attention layers (both Bahdanau and Luong), but I'm trying to fit a custom attention layer. I understand the math behind it, but I'm not able to implement it in the desired way. I found some custom-made additive attention layers, but they only work with GradientTape and not with the Keras functional API.
Here's the custom attention layer -
class Attention(tf.keras.layers.Layer):
    """Additive attention that reduces ``features`` to one context vector.

    ``call`` scores ``features`` against ``hidden``, normalises the scores
    with a softmax over axis 1, and sums the weighted ``features`` over
    that same axis, so axis 1 is absent from the result.
    """

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        # The three Dense sub-layers are created here rather than in __init__.
        self.W1 = tf.keras.layers.Dense(self.units)
        self.W2 = tf.keras.layers.Dense(self.units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        # assumes features is (batch, steps, dim) and hidden is (batch, dim)
        # -- TODO confirm against the caller
        query = tf.expand_dims(hidden, 1)
        energy = tf.nn.tanh(self.W1(features) + self.W2(query))
        attention_weights = tf.nn.softmax(self.V(energy), axis=1)
        return tf.reduce_sum(attention_weights * features, axis=1)

    def get_config(self):
        """Return the base config extended with ``units``."""
        config = super().get_config()
        config['units'] = self.units
        return config
Here feature refers to the encoder output and hidden means the previous decoder hidden state. The keras attention layer takes the decoder and encoder outputs (batch,steps,units) as the inputs and outputs a 3D vector (most probably the context vector) which is concatenated with the decoder output and then fed to the fully connected layer. But in the case of the custom layer, it outputs a 2D vector which doesn't seem to work, unlike the former.

Understanding Keras subclass method in Tensorflow's deep learning pipeline

I am trying to make a model in tensorflow using the keras subclasses method.
Q1) I am correctly calling layers as layers = [] and then using layers.append(GTLayer....) ?
Q2) Calling GTLayer in the __init__ of GTN will run class GTLayer, and it will call self.conv1 (which will return a tensor A from GTConv) and self.conv2 (which will again return a tensor A from GTConv), and then start the call method of GTLayer to get H, W. Am I right?
Q3) What happens to the returned H and W from 'Q2' will it store in layers[] list ? and then when we further call the GTNs call method it will bring up those layer? Am I correct?
Q4) Later, in GTN's call method, I had to implement linear layers, and thus I defined model = tf.keras.models.Sequential() and after that initialised self.linear1 and self.linear2; this way I have implemented subclassing and sequential both! Is that correct?
Q5) I will finally get loss, y, Ws from calling GTN , now if I assign my model = GTN(arguments..) how will I do the training and back-propagation steps? using an optimiser and loss function? will it follow model.compile() and model.fit ? Or can we make it any different in the sub-classing method of keras?
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# GTN: custom Keras layer that builds `num_layers` GTLayer instances in
# __init__ and, in call(), returns a (loss, y, Ws) tuple.
# NOTE(review): several names read below are not assigned anywhere in the
# visible class; each is flagged where it is first used.
class GTN(layers.Layer):
def __init__(self, num_edge, num_channels,num_layers,norm):
super(GTN, self).__init__()
self.num_edge = num_edge
self.num_channels = num_channels
self.num_layers = num_layers
self.is_norm = norm
# NOTE(review): `layers` is a local list here (it also hides the imported
# `layers` module inside this method); it is never stored on self, while
# call() reads self.layers.
layers = []
for i in tf.range(num_layers):
if i == 0:
layers.append(GTLayer(num_edge, num_channels, first=True))
else:
layers.append(GTLayer(num_edge, num_channels, first=False))
model = tf.keras.models.Sequential()
self.loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# NOTE(review): self.w_out and self.num_class are read but not assigned in
# the visible __init__. Also, linear1/linear2 receive the return value of
# model.add(...), presumably None rather than the Dense layer -- confirm.
self.linear1 = model.add(tf.keras.layers.Dense(self.w_out, input_shape=(self.w_out*self.num_channels,), activation=None))
self.linear2 = model.add(tf.keras.layers.Dense(self.num_class, input_shape=(self.w_out,), activation=None))
def gcn_conv(self,X,H):
# Computes transpose(norm(H)) @ (X @ self.weight).
# NOTE(review): self.weight and self.norm are not defined in the visible class.
X = tf.matmul(X, self.weight)
H = self.norm(H, add=True)
return tf.matmul(tf.transpose(H),X)
def call(self, A, X, target_x, target):
A = tf.expand_dims(A, 0)
Ws = []
for i in range(self.num_layers):
# NOTE(review): H is read here before any visible assignment, and
# self.normalization is not defined in the visible class.
H = self.normalization(H)
H, W = self.layers[i](A, H)
Ws.append(W)
for i in range(self.num_channels):
# NOTE(review): .numpy() converts the tensor to a NumPy array; presumably
# this only works eagerly and cuts the gradient path -- confirm.
X_tmp = tf.nn.relu(self.gcn_conv(X,H[i])).numpy()
# NOTE(review): X_ is read before any visible assignment; tf.concat is
# called with `dim=` here -- presumably `axis=` is intended, verify.
X_ = tf.concat((X_,X_tmp), dim=1)
X_ = self.linear1(X_)
X_ = tf.nn.relu(X_).numpy()
y = self.linear2(X_[target_x])
# NOTE(review): the loss is called as (y, target); check the argument order
# against the (y_true, y_pred) convention of Keras losses.
loss = self.loss(y, target)
return loss, y, Ws
# GTLayer: holds two GTConv sub-layers; call() multiplies their outputs and
# also returns the softmax of each sub-layer's `weight` attribute.
class GTLayer(keras.layers.Layer):
# NOTE(review): the `first` argument is accepted but not used in the visible code.
def __init__(self, in_channels, out_channels, first=True):
super(GTLayer, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.conv1 = GTConv(in_channels, out_channels)
self.conv2 = GTConv(in_channels, out_channels)
# NOTE(review): H_ is accepted but not read in the visible body.
def call(self, A, H_=None):
a = self.conv1(A)
b = self.conv2(A)
H = tf.matmul( a, b)
# NOTE(review): relies on GTConv exposing `weight`; .numpy() is applied
# before tf.stop_gradient, presumably requiring eager execution -- confirm.
W = [tf.stop_gradient(tf.nn.softmax(self.conv1.weight, axis=1).numpy()),
tf.stop_gradient(tf.nn.softmax(self.conv2.weight, axis=1).numpy()) ]
return H,W
# GTConv: custom Keras layer whose call() returns tf.add_n over the softmax
# of self.weight, overwriting the incoming argument A.
# NOTE(review): in_channels/out_channels are accepted but unused, and
# self.weight is never created in the visible class -- the input A is not
# used in the computation either; verify the intended formula.
class GTConv(keras.layers.Layer):
def __init__(self, in_channels, out_channels):
super(GTConv, self).__init__()
def call(self, A):
A = tf.add_n(tf.nn.softmax(self.weight))
return A
Q1
No. There are two possibilities here
1 - If you want to access a standard layers property of Keras models:
Only Model has a layers property, a keras.layers.Layer doesn't have this property
You are not supposed to mess with the layers property of a Model, you should just read it
The variable you are creating named layers is not a property of your class because you did not use self.layers.
2 - If you just want a list named layers for personal use in your class:
I recommend you don't use a standard name like this and change it to myLayers or something like that to avoid confusion.
The variable layers you created is not being used anywhere else in your code, you just created it and never used.
Remember that layers = [] just creates a local variable, while self.layers = [] creates a property in your class that can be used in other methods inside your class
Q2
You are not "calling" GTLayer, you are "creating" GTLayer. This means that you are running GTLayer.__init__().
This distinction is important in Keras:
This is "creating" a layer: layer_instance = GTLayer(...), which runs __init__
This is "calling" a layer: layer_instance(input_tensors), which runs __call__ (which will eventually run call as defined by you)
You can do both in the same line as output_tensors = GTLayer(...)(input_tensors)
So, this is happening in GTN.__init__:
You are "creating" two instances of the GTLayer.
This runs GTLayer.__init__() for each instance
This hits the lines self.conv1 = GTConv(in_channels, out_channels) and self.conv2 = GTConv(in_channels, out_channels)
This is also "creating" (not "calling") GTConv.
self.conv1 and self.conv2 are "Layer" instances, not tensors.
Q3
No tensor is produced here because you never "called" any layer in GTN.__init__().
(And this is ok. Usually, you "create" layers inside __init__() and "call" layers inside call.)
Your layers local variable will have "instances of GTLayer".
Q4
You mixed two approaches in a strange way.
You can, of course, use a Sequential model if you want, but it's not necessary, and you're not using it correctly.
If in call you are calling each layer (that is X_ = self.linear1(X_) and y = self.linear2(X_[target_x])), you don't need a Sequential model at all, and you can just have the following in GTN.__init__() (this is the best approach for subclassing):
self.linear1 = tf.keras.layers.Dense(self.w_out, input_shape=(self.w_out*self.num_channels,), activation=None)
self.linear2 = tf.keras.layers.Dense(self.num_class, input_shape=(self.w_out,), activation=None)
But you could have self.submodel = Sequential(...) and then use self.submodel in GTN.call(). But having a Model inside a layer sounds weird and might cause some strange behavior in specific cases. And, of course, the ReLUs should be a part of this submodel.
Q5
I will finally get loss, y, Ws from calling GTN
That loss and weights coming from call is a "very very" strange thing. I never saw this and I don't understand why you're doing it this way. This is not standard use of Keras and only in very specific and otherwise unsolvable cases you'd try something like this. I cannot say it will work.
How will I do the training and back-propagation steps?
You should have implemented a keras.models.Model, not a keras.layers.Layer. Only models have the ability to compile and train.
Usually, you'd not create a loss in call, you'd create a loss in model.compile, unless you're dealing with unconventional losses, like weight or activity regularization, things that really depend on the layer and not on the model's inputs/outputs.
Extra tips
There is no need to create custom layers if you're not going to create custom trainable weights. It's not wrong, of course, but also not necessary. It can help organize your code, or just add extra complication.
You are trying to use weight from your layers, but you never defined any weight anywhere.
I'm pretty sure there is a better way to achieve what you want, but I don't know what you want (and that would be something for another question, I think...)
This might be a good reading for subclassing: https://www.tensorflow.org/guide/keras/custom_layers_and_models?hl=en

Creating custom layer as stack of individual neurons TensorFlow

So, I'm trying to create a custom layer in TensorFlow 2.4.1, using a function for a neuron I defined.
# NOTE: this is not the actual neuron I want to use,
# it's just a simple example.
def neuron(x, W, b):
    """Compute one dense-layer neuron: ``W @ x + b``.

    Args:
        x: input; its first axis must match the last axis of ``W``.
        W: weight row, described in the surrounding text as shape
            ``(1, x.shape[0])``.
        b: bias, described in the surrounding text as shape ``(1, 1)``.

    Returns:
        The matrix product of ``W`` and ``x`` with ``b`` added.
    """
    # The matrix-multiplication operator had been turned into a comment
    # marker (``return W # x + b``), which returned ``W`` unchanged and
    # ignored both the input and the bias.
    return W @ x + b
Where the W and b it gets would be of shape (1, x.shape[0]) and (1, 1) respectively. This means this is like a single neuron in a dense layer. So, I want to create a dense layer by stacking however many of these individual neurons I want.
# Blueprint of a layer made of `n_units` separately parameterised neurons.
# The bodies of build() and call() are placeholders (`...`), not working code.
class Layer(tf.keras.layers.Layer):
def __init__(self, n_units=5):
super(Layer, self).__init__() # handles standard arguments
self.n_units = n_units # Number of neurons to be in the layer
def build(self, input_shape):
# Create weights and biases for all neurons individually
for i in range(self.n_units):
# Create weights and bias for ith neuron
...
def call(self, inputs):
# Compute outputs for all neurons
...
# Concatenate outputs to create layer output
...
# NOTE(review): `output` is not assigned in this sketch; it stands for the
# concatenated result described above.
return output
How can I create a layer as a stack of individual neurons (also in a way it can train)? I have roughly outlined the idea for the layer in the above code, but the answer doesn't need to follow that as a blueprint.
Finally: yes, I'm aware that to create a dense layer you don't need to go about it in such a roundabout way (you just need 1 weight and bias matrix), but in my actual use case, this is necessary. Thanks!
So, person who asked this question here, I have found a way to do it, by dynamically creating variables and operations.
First, let's re-define the neuron to use tensorflow operations:
def neuron(x, W, b):
    """Return ``W`` matrix-multiplied by ``x`` plus ``b``, using TensorFlow ops."""
    projected = tf.matmul(W, x)
    return tf.add(projected, b)
Then, let's create the layer (this uses the blueprint laid out in the question):
class Layer(tf.keras.layers.Layer):
    """Layer assembled from ``n_units`` independently parameterised neurons.

    Each neuron ``i`` owns a ``kernel_{i}`` of shape ``[1, input_shape[0]]``
    and a ``bias_{i}`` of shape ``[1, 1]``, both exposed as attributes of
    the layer. ``call`` applies ``neuron`` once per unit and concatenates
    the results along axis 0.

    Args:
        n_units: number of neurons in the layer.
    """

    def __init__(self, n_units=5):
        super(Layer, self).__init__()
        self.n_units = n_units

    def build(self, input_shape):
        # setattr replaces the former exec() calls: same attribute names,
        # same kernel/bias creation order, but no code built from strings.
        # `name` is passed by keyword so the call does not depend on the
        # positional order of add_weight's parameters.
        for i in range(self.n_units):
            setattr(
                self,
                f"kernel_{i}",
                self.add_weight(name=f"kernel_{i}", shape=[1, int(input_shape[0])]),
            )
            setattr(
                self,
                f"bias_{i}",
                self.add_weight(name=f"bias_{i}", shape=[1, 1]),
            )

    def call(self, inputs):
        # Collect per-neuron outputs in a list instead of creating
        # out_0, out_1, ... through exec() and reading them back with
        # eval(); writes made by exec() to function locals are not
        # guaranteed to be visible to later code.
        outputs = [
            neuron(inputs, getattr(self, f"kernel_{i}"), getattr(self, f"bias_{i}"))
            for i in range(self.n_units)
        ]
        return tf.concat(outputs, axis=0)
As you can see, we're using exec and eval to dynamically create variables and perform operations.
That's it! We can perform a few checks to see if TensorFlow could use this:
# Check to see if it outputs the correct thing
layer = Layer(5) # With 5 neurons, it should return a (5, 6)
print(layer(tf.zeros([10, 6])))
# Check to see if it has the right trainable parameters
print(layer.trainable_variables)
# Check to see if TensorFlow can find the gradients
layer = Layer(5)
x = tf.ones([10, 6])
with tf.GradientTape() as tape:
z = layer(x)
print(f"Parameter: {layer.trainable_variables[2]}")
print(f"Gradient: {tape.gradient(z, layer.trainable_variables[2])}")
This solution works, but it's not very elegant... I wonder if there's a better way to do it, some magical TF method that can map the neuron to create a layer, I'm too inexperienced to know for the moment. So, please answer if you have a (better) answer, I'll be happy to accept it :)

Is keras based on closures in python?

While working with keras and tensorflow, I found the following lines of code confusing.
w_init = tf.random_normal_initializer()
self.w = tf.Variable(initial_value=w_init(shape=(input_dim, units),
dtype='float32'),trainable=True)
Also, I have seen something like:
Dense(64, activation='relu')(x)
Therefore, if Dense(...) will create the object for me, then how can I follow that with (x)?
Likewise for w_init above. How can I say such thing:
tf.random_normal_initializer()(shape=(input_dim, units), dtype='float32'),trainable=True)
Do we have such thing in python "ClassName()" followed by "()" while creating an object such as a layer?
While I was looking into Closures in python, I found that a function can return another function. Hence, is this what really happens in Keras?
Any help is much appreciated!!
These are two totally different ways to define models.
Keras
Keras works with the concept of layers. Each line defines a full layer of your network. What you are referring to in specific is keras' functional API. The concept is to combine layers like this:
inp = Input(shape=(28, 28, 1))
# Conv2D takes the number of filters first and the kernel size second;
# passing only (6, 6) left kernel_size missing. 32 filters mirrors the
# [6, 6, 1, 32] kernel used in the low-level TensorFlow example.
x = Conv2D(32, (6, 6), strides=(1, 1), activation='relu')(inp)
# ... etc ...
x = Flatten()(x)
x = Dense(10, activation='softmax')(x)
model = Model(inputs=[inp], outputs=[x])
This way you've created a full CNN in just a few lines. Note that you never had to manually input the shape of the weight vectors or the operations that are performed. These are inferred automatically by keras.
Now, this just needs to be compiled through model.compile(...) and then you can train it through model.fit(...).
Tensorflow
On the other hand, TensorFlow is a bit more low-level. This means that you have to define the variables and operations by hand. So in order to write a convolution layer followed by a fully-connected layer you'd have to do the following:
# Input placeholders
x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
y = tf.placeholder(tf.float32, shape=(None, 10))
# Convolution layer
W1 = tf.Variable(tf.truncated_normal([6, 6, 1, 32], stddev=0.1))
b1 = tf.Variable(tf.constant(0.1, tf.float32, [32]))
z1 = tf.nn.conv2d(x_2d, W1, strides=[1, 1, 1, 1], padding='SAME') + b1
c1 = tf.nn.relu(z1)
# ... etc ...
# Flatten
flat = tf.reshape(p2, [-1, ...]) # need to calculate the ... by ourselves
# Dense
W3 = tf.Variable(tf.truncated_normal([..., 10], stddev=0.1)) # same size as before
b3 = tf.Variable(tf.constant(0.1, tf.float32, [10]))
fc1 = tf.nn.relu(tf.matmul(flat, W3) + b3)
Two things to note here. There is no explicit definition of a model here and this has to be trained through a tf.Session with a feed_dict feeding the data to the placeholders. If you're interested you'll find several guides online.
Closing notes...
TensorFlow has a much friendlier and easier way to define and train models through eager execution, which will be default in TF 2.0! So the code you posted is in a sense the old way of doing things in tensorflow. It's worth taking a look into TF 2.0, which actually recommends doing things the keras way!
Edit (after comment by OP):
No, a layer is not a closure. A Keras layer is a class that implements a __call__ method, which makes its instances callable. They implemented it as a wrapper around the call method that users typically write.
You can take a look at the implementation here
Basically how this works is:
class MyClass:
    """Stores ``param`` and offers ``call``; instances are not callable."""

    def __init__(self, param):
        self.p = param

    def call(self, x):
        """Print ``x``."""
        print(x)
If you try to write c = MyClass(1)(3), you'll get a TypeError saying that MyClass is not callable. But if you write it like this:
class MyClass:
    """Stores ``param``; defining ``__call__`` makes instances callable."""

    def __init__(self, param):
        self.p = param

    def __call__(self, x):
        """Print ``x`` when the instance is invoked like a function."""
        print(x)
It works now. Essentially keras does it like this:
class MyClass:
    """Callable object whose ``__call__`` delegates to an overridable ``call``.

    Invoking an instance forwards the argument to ``call`` and hands back
    whatever ``call`` returns.
    """

    def __init__(self, param):
        self.p = param

    def call(self, x):
        """Print ``x``; meant to be overridden with the real computation."""
        print(x)

    def __call__(self, x):
        # Return the result so that a value produced by an overriding
        # call() reaches the caller instead of being dropped.
        return self.call(x)
So that when you write your own layer you can implement your own call method and the __call__ method that wraps your one will get inherited from keras' base Layer class.
Just from the syntax, I would say that Dense() returns a function (or more accurately a callable). Similarly w_init is a callable as well.

Pytorch-tutorial: Strange input argument in class definition

I'm reading through some pytorch tutorials. Below is the definition of a residual block. However in the forward method each function handle only takes one argument out while in the __init__ function these functions have different number of input arguments:
# Residual Block
class ResidualBlock(nn.Module):
    """Residual block: two conv + batch-norm stages with a skip connection.

    Args:
        in_channels: channels of the input, passed to the first ``conv3x3``.
        out_channels: channels produced by both convolutions.
        stride: stride passed to the first ``conv3x3``.
        downsample: optional module applied to the input before it is added
            back; ``None`` means the input is added unchanged.
    """

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(out_channels, out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + skip)``."""
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        # Compare against None explicitly: truth-testing a module goes
        # through __len__ for container modules, so the check would depend
        # on the container's length rather than on whether one was given.
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out
Does anyone know how this works?
Is it a standard python class inheritance feature or is this specific to pytorch?
you define the layer in the init function, which means the parameters. In the forward function you only input the data that needs to be processed with the predefined settings from init. The nn.whatever builds a function with the settings you pass to it. Then this function can be used in forward and this function only takes one argument.
You define different layers of your network architecture in the constructor of the class (__init__ function). Essentially, when you create an instance of different layers, you initialize them with your settings parameters.
For example, when you declare the first convolution layer, self.conv1, you give the parameters required to initialize the layer. In the forward function, you just simply call the layers with the input to get the corresponding output. For example, in out = self.conv2(out), you take the output of the previous layer and give it as an input the next self.conv2 layer.
Please note that during initialization you tell each layer what kind/shape of input it will receive. For example, you tell the first convolution layer the number of input and output channels. In the forward method, you just need to pass the input, that's it.

Categories