I'm programming some callable custom modules in PyTorch and I wanted to know if I'm doing it correctly. Here's an example scenario where I want to construct a module that takes a torch.Tensor as input, performs a learnable linear operation and outputs a diagonal covariance matrix to use in a multivariate distribution downstream.
class Exp(nn.Module):
    def forward(self, x):
        return x.exp()

class Diag(nn.Module):
    def forward(self, x):
        return x.diag_embed()

def init_model(input_size, output_size):
    logvar_module = nn.Linear(input_size, output_size)
    diag_covariance_module = nn.Sequential(logvar_module, Exp(), Diag())
    return diag_covariance_module

model = init_model(5, 5)
cov = model(some_input_tensor)
dist = MultivariateNormal(some_mean, cov)
I know that this works, but is it the right design pattern? What is the recommended way to approach modules like these?
This looks like the correct design pattern.
Ideally, you would also write your main network as an nn.Module:
class Model(nn.Sequential):
    def __init__(self, input_size, output_size):
        logvar_module = nn.Linear(input_size, output_size)
        super().__init__(logvar_module, Exp(), Diag())
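For completeness, a minimal usage sketch of this Model (reusing the Exp and Diag modules from the question; the input and mean tensors are placeholders):

import torch
from torch.distributions import MultivariateNormal

model = Model(input_size=5, output_size=5)     # Linear -> Exp -> Diag
some_input_tensor = torch.randn(3, 5)          # hypothetical batch of inputs
some_mean = torch.zeros(3, 5)                  # hypothetical per-sample means
cov = model(some_input_tensor)                 # shape (3, 5, 5), diagonal covariance matrices
dist = MultivariateNormal(some_mean, covariance_matrix=cov)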
I am trying to make a model in TensorFlow using the Keras subclassing method.
Q1) Am I correctly creating the layers by writing layers = [] and then using layers.append(GTLayer(...))?
Q2) Calling GTLayer in the __init__ of GTN will run the GTLayer class; will it call self.conv1 (which will return a tensor A from GTConv) and self.conv2 (which will again return a tensor A from GTConv) and then start the call method of GTLayer to produce H, W? Am I right?
Q3) What happens to the H and W returned in Q2? Will they be stored in the layers list, and then when we later call GTN's call method, will it bring up those layers? Am I correct?
Q4) Later, in GTN's call method, I had to implement linear layers, so I defined model = tf.keras.models.Sequential() and after that initialised self.linear1 and self.linear2. This way I have implemented both subclassing and Sequential! Is that correct?
Q5) I will finally get loss, y, Ws from calling GTN. Now, if I assign model = GTN(arguments...), how will I do the training and back-propagation steps? Using an optimiser and loss function? Will it follow model.compile() and model.fit()? Or can it be done differently with the Keras subclassing method?
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class GTN(layers.Layer):
    def __init__(self, num_edge, num_channels, num_layers, norm):
        super(GTN, self).__init__()
        self.num_edge = num_edge
        self.num_channels = num_channels
        self.num_layers = num_layers
        self.is_norm = norm

        layers = []
        for i in tf.range(num_layers):
            if i == 0:
                layers.append(GTLayer(num_edge, num_channels, first=True))
            else:
                layers.append(GTLayer(num_edge, num_channels, first=False))

        model = tf.keras.models.Sequential()
        self.loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        self.linear1 = model.add(tf.keras.layers.Dense(self.w_out, input_shape=(self.w_out*self.num_channels,), activation=None))
        self.linear2 = model.add(tf.keras.layers.Dense(self.num_class, input_shape=(self.w_out,), activation=None))

    def gcn_conv(self, X, H):
        X = tf.matmul(X, self.weight)
        H = self.norm(H, add=True)
        return tf.matmul(tf.transpose(H), X)

    def call(self, A, X, target_x, target):
        A = tf.expand_dims(A, 0)
        Ws = []
        for i in range(self.num_layers):
            H = self.normalization(H)
            H, W = self.layers[i](A, H)
            Ws.append(W)
        for i in range(self.num_channels):
            X_tmp = tf.nn.relu(self.gcn_conv(X, H[i])).numpy()
            X_ = tf.concat((X_, X_tmp), dim=1)
        X_ = self.linear1(X_)
        X_ = tf.nn.relu(X_).numpy()
        y = self.linear2(X_[target_x])
        loss = self.loss(y, target)
        return loss, y, Ws
class GTLayer(keras.layers.Layer):
    def __init__(self, in_channels, out_channels, first=True):
        super(GTLayer, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.conv1 = GTConv(in_channels, out_channels)
        self.conv2 = GTConv(in_channels, out_channels)

    def call(self, A, H_=None):
        a = self.conv1(A)
        b = self.conv2(A)
        H = tf.matmul(a, b)
        W = [tf.stop_gradient(tf.nn.softmax(self.conv1.weight, axis=1).numpy()),
             tf.stop_gradient(tf.nn.softmax(self.conv2.weight, axis=1).numpy())]
        return H, W

class GTConv(keras.layers.Layer):
    def __init__(self, in_channels, out_channels):
        super(GTConv, self).__init__()

    def call(self, A):
        A = tf.add_n(tf.nn.softmax(self.weight))
        return A
Q1
No. There are two possibilities here
1 - If you want to access a standard layers property of Keras models:
Only Model has a layers property, a keras.layers.Layer doesn't have this property
You are not supposed to mess with the layers property of a Model, you should just read it
The variable you are creating named layers is not a property of your class because you did not use self.layers.
2 - If you just want a list named layers for personal use in your class:
I recommend you don't use a standard name like this and change it to myLayers or something like that to avoid confusion.
The variable layers you created is not being used anywhere else in your code; you just created it and never used it.
Remember that layers = [] just creates a local variable, while self.layers = [] creates a property in your class that can be used in other methods inside your class (see the sketch below).
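A minimal sketch of that difference, assuming the imports above (MyLayer and my_layers are illustrative names, not part of your code):

class MyLayer(tf.keras.layers.Layer):
    def __init__(self, num_layers, **kwargs):
        super().__init__(**kwargs)
        # self.my_layers is an attribute: it survives __init__ and is visible in call()
        # a plain "my_layers = []" would be a local variable, gone once __init__ returns
        self.my_layers = []
        for _ in range(num_layers):
            self.my_layers.append(tf.keras.layers.Dense(8))

    def call(self, x):
        for layer in self.my_layers:
            x = layer(x)
        return x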
Q2
You are not "calling" GTLayer, you are "creating" GTLayer. This means that you are running GTLayer.__init__().
This distinction is important in Keras:
This is "creating" a layer: layer_instance = GTLayer(...), which runs __init__
This is "calling" a layer: layer_instance(input_tensors), which runs __call__ (which will eventually run call as defined by you)
You can do both in the same line as output_tensors = GTLayer(...)(input_tensors)
So, this is happening in GTN.__init__:
You are "creating" two instances of the GTLayer.
This runs GTLayer.__init__() for each instance
This hits the lines self.conv1 = GTConv(in_channels, out_channels) and self.conv2 = GTConv(in_channels, out_channels)
This is also "creating" (not "calling") GTConv.
self.conv1 and self.conv2 are "Layer" instances, not tensors.
Q3
No tensor is produced here because you never "called" any layer in GTN.__init__().
(And this is ok. Usually, you "create" layers inside __init__() and "call" layers inside call.)
Your layers local variable will have "instances of GTLayer".
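To make the create/call distinction concrete, here is a minimal sketch (SmallBlock and the sizes are illustrative, not taken from your code):

class SmallBlock(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        # "creating" a layer: only __init__ runs, no tensors are produced yet
        self.dense = tf.keras.layers.Dense(units)

    def call(self, inputs):
        # "calling" a layer: this is where tensors are actually produced
        return tf.nn.relu(self.dense(inputs))

block = SmallBlock(16)                  # creating: runs __init__
out = block(tf.random.normal((2, 8)))   # calling: runs call and returns a (2, 16) tensor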
Q4
You mixed two approaches in a strange way.
You can, of course, use a Sequential model if you want, but it's not necessary, and you're not using it correctly.
If in call you are calling each layer (that is X_ = self.linear1(X_) and y = self.linear2(X_[target_x])), you don't need a Sequential model at all, and you can just have the following in GTN.__init__() (this is the best approach for subclassing):
self.linear1 = tf.keras.layers.Dense(self.w_out, input_shape=(self.w_out*self.num_channels,), activation=None)
self.linear2 = tf.keras.layers.Dense(self.num_class, input_shape=(self.w_out,), activation=None)
But you could have self.submodel = Sequential(...) and then use self.submodel in GTN.call(). Having a Model inside a layer sounds weird, though, and might cause some strange behavior in specific cases. And, of course, the ReLUs should then be part of this submodel (see the sketch below).
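If you do want the Sequential-submodel variant, a minimal sketch could look like this (Head is an illustrative name; w_out and num_class mirror the attributes used in your code):

class Head(tf.keras.layers.Layer):
    def __init__(self, w_out, num_class, **kwargs):
        super().__init__(**kwargs)
        # the submodel bundles the linear layers and the ReLU together
        self.submodel = tf.keras.Sequential([
            tf.keras.layers.Dense(w_out, activation="relu"),
            tf.keras.layers.Dense(num_class, activation=None),
        ])

    def call(self, x):
        return self.submodel(x)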
Q5
I will finally get loss, y, Ws from calling GTN
That loss and those weights coming from call is a "very very" strange thing. I have never seen this, and I don't understand why you're doing it this way. This is not standard use of Keras, and only in very specific and otherwise unsolvable cases would you try something like this. I cannot say it will work.
How will I do the training and back-propagation steps?
You should have implemented a keras.models.Model, not a keras.layers.Layer. Only models have the ability to compile and train.
Usually, you wouldn't create a loss in call; you'd pass a loss to model.compile, unless you're dealing with unconventional losses, like weight or activity regularization, things that really depend on the layer and not on the model's inputs/outputs.
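For reference, the standard subclassing pattern looks roughly like this (a minimal sketch with made-up layer sizes, not a rewrite of GTN):

class MyModel(tf.keras.Model):
    def __init__(self, num_class, **kwargs):
        super().__init__(**kwargs)
        self.hidden = tf.keras.layers.Dense(32, activation="relu")
        self.out = tf.keras.layers.Dense(num_class)   # raw logits

    def call(self, inputs):
        return self.out(self.hidden(inputs))

model = MyModel(num_class=3)
model.compile(optimizer="adam",
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
# model.fit(x_train, y_train, epochs=10)   # x_train / y_train are your data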
Extra tips
There is no need to create custom layers if you're not going to create custom trainable weights. It's not wrong, of course, but also not necessary. It can help organize your code, or it can just add extra complication.
You are trying to use weight from your layers, but you never defined any weight anywhere.
I'm pretty sure there is a better way to achieve what you want, but I don't know what you want (and that would be something for another question, I think...)
This might be a good reading for subclassing: https://www.tensorflow.org/guide/keras/custom_layers_and_models?hl=en
I am puzzled by the syntax of the functions used in neural networks in pytorch.
Here is an example of how one can define a linear transformation layer: (cf. https://pytorch.org/docs/stable/generated/torch.nn.Linear.html)
m = nn.Linear(20, 30)
input = torch.randn(128, 20)
output = m(input)
print(output.size())
torch.Size([128, 30])
Can someone explain to me where the expression nn.Linear(20, 30)(input) comes from? It disturbs me a bit.
Indeed, one can define a neural network class with such a constructor, for example:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, p=0):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size, bias=True)
        self.fc2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.fc3 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.fc4 = nn.Linear(hidden_size, num_classes, bias=False)
        self.dropout = nn.Dropout(p=p)
and I was trying to write an attribute using numpy function, like:
self.enter_reshape = np.reshape(-1, input_size * input_size)
self.exit_reshape = np.reshape(input_size, num_classes / input_size)
or, using the view function from pytorch:
self.reshape = view(-1, self.num_flat_features())
The closest thing I know of is partial functions and closures, where one can write f(z)(x)(y). I looked into the definition of Linear and saw that Linear is an object, but I don't see where the __call__ magic method is redefined, which I thought would be what gets used when one calls the object.
So basically, can someone explain what is going on with this way of writing things, and also, would it be possible to give the neural network the numpy or view functions as attributes?
torch.nn.Linear inherits from torch.nn.Module (see source code), which in turn defines __call__ method.
You can see source code for torch.nn.Module here. This class allows users to make their own Modules by inheritance (as you did in your example) and is a base for all PyTorch defined modules like nn.Linear (see available methods, documentation here).
Its __call__ essentially calls forward, but also runs registered hooks (and registers them), checks torchscript, etc. (see the source code here, with the relevant line here).
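Stripped of hooks and bookkeeping, the mechanism behind nn.Linear(20, 30)(input) is plain Python __call__; here is a simplified illustration (not PyTorch's actual implementation):

class FakeModule:
    def __init__(self, in_features, out_features):
        self.in_features = in_features
        self.out_features = out_features

    def __call__(self, x):
        # nn.Module.__call__ does extra bookkeeping and then calls forward
        return self.forward(x)

    def forward(self, x):
        return f"pretend linear map {self.in_features} -> {self.out_features} applied to {x!r}"

m = FakeModule(20, 30)   # first parentheses: construct the object
out = m("input")         # second parentheses: invoke __call__, which calls forward
# equivalently: FakeModule(20, 30)("input")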
would it be possible to give to the neural network the numpy or view functions as attributes?
From the example you've given, what you are trying to do is probably a partial function (or a lambda, as in the example below) saved as an attribute (though that is pretty uncommon and I have never seen it, to be honest), like this:
import torch

class MyModule(torch.nn.Module):
    def __init__(self, shape: int = -1):
        super().__init__()  # required
        self.reshape = lambda tensor: torch.reshape(tensor, (shape,))

    def forward(self, tensor):
        return self.reshape(tensor)

module = MyModule()
module(torch.randn(4, 5, 6)).shape  # [120] shape
You shouldn't use numpy with PyTorch unless you really need it and/or there is no sensible PyTorch counterpart (although you can, if you convert the torch.Tensor to numpy). Also, you shouldn't do anything like the code above, as it's really confusing; just save plain attributes (anything like input_shape, hidden_dim, output_size) and use them in forward:
class MyModule(torch.nn.Module):
    def __init__(self, shape: int = -1):
        super().__init__()  # required
        self.shape = shape

    def forward(self, tensor):
        return torch.reshape(tensor, (self.shape,))
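Usage is the same as before (with the default shape=-1, which flattens the tensor):

module = MyModule()
module(torch.randn(4, 5, 6)).shape  # torch.Size([120])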
In PyTorch, a classification network model is defined like this:
class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.out = torch.nn.Linear(n_hidden, n_output)       # output layer

    def forward(self, x):
        x = F.relu(self.hidden(x))   # activation function for hidden layer
        x = self.out(x)
        return x
Is softmax applied here? In my understanding, things should look like this:
class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.relu = torch.nn.ReLU(inplace=True)
        self.out = torch.nn.Linear(n_hidden, n_output)       # output layer
        self.softmax = torch.nn.Softmax(dim=n_output)

    def forward(self, x):
        x = self.hidden(x)   # hidden layer
        x = self.relu(x)     # activation function for hidden layer
        x = self.out(x)
        x = self.softmax(x)
        return x
I understand that F.relu(self.hidden(x)) is also applying ReLU, but the first block of code doesn't apply softmax, right?
Latching on to what @jodag was already saying in his comment, and extending it a bit to form a full answer:
No, PyTorch does not automatically apply softmax, and you can at any point apply torch.nn.Softmax() as you want. But softmax has some issues with numerical stability, which we want to avoid as much as we can. One solution is to use log-softmax, but this tends to be slower than a direct computation.
Especially when we are using negative log likelihood as a loss function (in PyTorch, this is torch.nn.NLLLoss), we can utilize the fact that the derivative of (log-)softmax + NLLLoss is actually mathematically quite nice and simple, which is why it makes sense to combine both into a single function/element. The result is then torch.nn.CrossEntropyLoss. Again, note that this only applies directly to the last layer of your network; any other computation is not affected by any of this.
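A minimal sketch of what that means in practice (the batch size and number of classes are arbitrary placeholders):

import torch
import torch.nn as nn

logits = torch.randn(8, 10, requires_grad=True)   # raw network outputs: 8 samples, 10 classes
targets = torch.randint(0, 10, (8,))

# Option A: feed raw logits to CrossEntropyLoss (log-softmax is folded into the loss)
loss_a = nn.CrossEntropyLoss()(logits, targets)

# Option B: explicit LogSoftmax + NLLLoss, mathematically equivalent
log_probs = nn.LogSoftmax(dim=1)(logits)
loss_b = nn.NLLLoss()(log_probs, targets)

# loss_a and loss_b agree up to floating-point error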
I'm having issues with implementing custom activation functions in Pytorch, such as Swish. How should I go about implementing and using custom activation functions in Pytorch?
There are four possibilities depending on what you are looking for. You will need to ask yourself two questions:
Q1) Will your activation function have learnable parameters?
If yes, you have no choice but to create your activation function as an nn.Module class because you need to store those weights.
If no, you are free to simply create a normal function, or a class, depending on what is convenient for you.
Q2) Can your activation function be expressed as a combination of existing PyTorch functions?
If yes, you can simply write it as a combination of existing PyTorch functions and won't need to create a backward function which defines the gradient.
If no, you will need to write the gradient by hand.
Example 1: SiLU function
The SiLU function f(x) = x * sigmoid(x) does not have any learned weights and can be written entirely with existing PyTorch functions, thus you can simply define it as a function:
def silu(x):
    return x * torch.sigmoid(x)
and then simply use it as you would torch.relu or any other activation function.
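For example, a hypothetical two-layer network using it in forward (assuming nn is torch.nn and silu is defined as above):

class SmallNet(nn.Module):
    def __init__(self, in_features, hidden, out_features):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden)
        self.fc2 = nn.Linear(hidden, out_features)

    def forward(self, x):
        # apply the custom activation just like torch.relu
        return self.fc2(silu(self.fc1(x)))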
Example 2: SiLU with learned slope
In this case you have one learned parameter, the slope, thus you need to make a class of it.
class LearnedSiLU(nn.Module):
    def __init__(self, slope=1):
        super().__init__()
        # wrap the initial value in nn.Parameter so the slope is registered and learned
        self.slope = torch.nn.Parameter(slope * torch.ones(1))

    def forward(self, x):
        return self.slope * x * torch.sigmoid(x)
Example 3: with backward
If you have something for which you need to create your own gradient function, you can look at this example: Pytorch: define custom function
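For orientation, such a custom gradient is written as a torch.autograd.Function subclass; here is a minimal sketch for SiLU with a hand-written backward (purely illustrative, since autograd would handle SiLU automatically):

class SiLUFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        sig = torch.sigmoid(x)
        ctx.save_for_backward(x, sig)
        return x * sig

    @staticmethod
    def backward(ctx, grad_output):
        x, sig = ctx.saved_tensors
        # d/dx [x * sigmoid(x)] = sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x))
        return grad_output * (sig + x * sig * (1 - sig))

silu_custom = SiLUFunction.apply   # use like a function: y = silu_custom(x)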
You can write a customized activation function like below (e.g. weighted Tanh).
class weightedTanh(nn.Module):
    def __init__(self, weights=1):
        super().__init__()
        self.weights = weights

    def forward(self, input):
        ex = torch.exp(2 * self.weights * input)
        return (ex - 1) / (ex + 1)
Don't worry about backpropagation if you use autograd-compatible operations.
I wrote the following SinActivation sub-class of nn.Module to implement the sin activation function.
class SinActivation(torch.nn.Module):
    def __init__(self):
        super(SinActivation, self).__init__()

    def forward(self, x):
        return torch.sin(x)
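It can then be dropped into a model like any other module, for example (the layer sizes here are arbitrary):

model = torch.nn.Sequential(
    torch.nn.Linear(10, 32),
    SinActivation(),
    torch.nn.Linear(32, 1),
)
out = model(torch.randn(4, 10))   # shape: (4, 1)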
How do I implement a custom activation function (an RBF kernel with mean and variance adjusted by gradient descent) in Neupy or Theano for use in Neupy?
(Quick background: gradient descent works with every parameter in the network. I want to make a specialized feature space that contains optimized feature parameters, so Neupy)
I think my problem is in the creation of the parameters, how they are sized, and how they are all connected.
Primary functions of interest.
Activation Function Class
class RBF(layers.ActivationLayer):
    def initialize(self):
        super(RBF, self).initialize()
        self.add_parameter(name='mean', shape=(1,),
                           value=init.Normal(), trainable=True)
        self.add_parameter(name='std_dev', shape=(1,),
                           value=init.Normal(), trainable=True)

    def output(self, input_value):
        return rbf(input_value, self.parameters)
RBF Function
def rbf(input_value, parameters):
    K = _outer_substract(input_value, parameters['mean'])
    return np.exp(- np.linalg.norm(K) / parameters['std_dev'])
Function to shape?
def _outer_substract(x, y):
    return (x - y.T).T
Help will be much appreciated, as this will provide great insight into how to customize neupy networks. The documentation could use some work in some areas, to say the least...
When a layer changes the shape of the input variable, it has to inform the subsequent layers about the change. For this case it must have a customized output_shape property. For example:
import numpy as np
from neupy import layers
from neupy.utils import as_tuple
import theano.tensor as T

class Flatten(layers.BaseLayer):
    """
    Slight modification of the Reshape layer from the neupy library:
    https://github.com/itdxer/neupy/blob/master/neupy/layers/reshape.py
    """
    @property
    def output_shape(self):
        # The number of output features depends on the input shape.
        # When the layer receives input with shape (10, 3, 4)
        # the output will be (10, 12). The first number, 10, defines
        # the number of samples, which you typically don't need to
        # change during propagation.
        n_output_features = np.prod(self.input_shape)
        return (n_output_features,)

    def output(self, input_value):
        n_samples = input_value.shape[0]
        return T.reshape(input_value, as_tuple(n_samples, self.output_shape))
If you run it in a terminal you will see that it works:
>>> network = layers.Input((3, 4)) > Flatten()
>>> predict = network.compile()
>>> predict(np.random.random((10, 3, 4))).shape
(10, 12)
In your example I can see a few issues:
The rbf function doesn't return a theano expression. It should fail during function compilation.
Functions like np.linalg.norm will return a scalar if you don't specify the axis along which you want to calculate the norm.
The following solution should work for you:
import numpy as np
from neupy import layers, init
import theano.tensor as T

def norm(value, axis=None):
    return T.sqrt(T.sum(T.square(value), axis=axis))

class RBF(layers.BaseLayer):
    def initialize(self):
        super(RBF, self).initialize()

        # It's more flexible when the shape of the parameters
        # depends on the input shape
        self.add_parameter(
            name='mean', shape=self.input_shape,
            value=init.Constant(0.), trainable=True)

        self.add_parameter(
            name='std_dev', shape=self.input_shape,
            value=init.Constant(1.), trainable=True)

    def output(self, input_value):
        K = input_value - self.mean
        return T.exp(-norm(K, axis=0) / self.std_dev)

network = layers.Input(1) > RBF()
predict = network.compile()
print(predict(np.random.random((10, 1))))

network = layers.Input(4) > RBF()
predict = network.compile()
print(predict(np.random.random((10, 4))))
Although itdxer answered the question sufficiently, I would like to add the exact solution to this problem.
Creation of Architecture
network = layers.Input(size) > RBF() > layers.Softmax(num_out)
Activation Function
# Elementwise Gaussian (RBF)
def rbf(value, mean, std):
    return T.exp(-.5 * T.sqr(value - mean) / T.sqr(std)) / (std * T.sqrt(2 * np.pi))
RBF Class
class RBF(layers.BaseLayer):
    def initialize(self):
        # Begin by initializing.
        super(RBF, self).initialize()

        # Add parameters to train
        self.add_parameter(name='means', shape=self.input_shape,
                           value=init.Normal(), trainable=True)
        self.add_parameter(name='std_dev', shape=self.input_shape,
                           value=init.Normal(), trainable=True)

    # Define output function for the RBF layer.
    def output(self, input_value):
        return rbf(input_value, self.means, self.std_dev)
Training
If you are interested in training, it is as simple as:
# Set training algorithm
gdnet = algorithms.Momentum(
    network,
    momentum=0.1
)

# Train.
gdnet.train(x, y, max_iter=100)
This compiles with the proper input and target, and the means and standard deviations are updated on an elementwise basis.