How to do fully connected batch norm in PyTorch? - python

torch.nn has classes BatchNorm1d, BatchNorm2d, BatchNorm3d, but it doesn't have a fully connected BatchNorm class? What is the standard way of doing normal Batch Norm in PyTorch?

Ok. I figured it out. BatchNorm1d can also handle Rank-2 tensors, thus it is possible to use BatchNorm1d for the normal fully-connected case.
So for example:
import torch.nn as nn
class Policy(nn.Module):
    """Fully connected network with batch normalisation after each hidden ReLU.

    ``nn.BatchNorm1d`` is applied directly to the rank-2 ``(batch, features)``
    activations of the hidden linear layers.

    Args:
        num_inputs: Number of input features per sample.
        action_space: Number of output units (used as the output width).
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
    """

    def __init__(self, num_inputs, action_space, hidden_size1=256, hidden_size2=128):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space

        self.linear1 = nn.Linear(num_inputs, hidden_size1)
        self.linear2 = nn.Linear(hidden_size1, hidden_size2)
        self.linear3 = nn.Linear(hidden_size2, num_outputs)

        self.bn1 = nn.BatchNorm1d(hidden_size1)
        self.bn2 = nn.BatchNorm1d(hidden_size2)

    def forward(self, inputs):
        """Return the un-normalised outputs for a ``(batch, num_inputs)`` tensor."""
        # Reach ReLU through the already-imported ``nn`` namespace: the
        # snippet never imports ``torch.nn.functional as F``.
        relu = nn.functional.relu
        x = self.bn1(relu(self.linear1(inputs)))
        x = self.bn2(relu(self.linear2(x)))
        return self.linear3(x)

The BatchNorm1d normally comes before the ReLU, and the bias is redundant, so
import torch.nn as nn
class Policy(nn.Module):
    """Fully connected network using the Linear -> BatchNorm1d -> ReLU ordering.

    The hidden linear layers are created without a bias term because the
    batch-norm layer that follows each of them has its own learnable shift.

    Args:
        num_inputs: Number of input features per sample.
        action_space: Number of output units (used as the output width).
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
    """

    def __init__(self, num_inputs, action_space, hidden_size1=256, hidden_size2=128):
        # Zero-argument super(): the original referred to a non-existent
        # ``Policy2`` class here, which raised NameError on construction.
        super().__init__()
        self.action_space = action_space
        num_outputs = action_space

        self.linear1 = nn.Linear(num_inputs, hidden_size1, bias=False)
        self.linear2 = nn.Linear(hidden_size1, hidden_size2, bias=False)
        self.linear3 = nn.Linear(hidden_size2, num_outputs)

        self.bn1 = nn.BatchNorm1d(hidden_size1)
        self.bn2 = nn.BatchNorm1d(hidden_size2)

    def forward(self, inputs):
        """Return the un-normalised outputs for a ``(batch, num_inputs)`` tensor."""
        # Reach ReLU through the already-imported ``nn`` namespace: the
        # snippet never imports ``torch.nn.functional as F``.
        relu = nn.functional.relu
        x = relu(self.bn1(self.linear1(inputs)))
        x = relu(self.bn2(self.linear2(x)))
        return self.linear3(x)

Related

How to add Individual LSTM layers for each task in multi-task learning with Pytorch

for example, I define a model for 2 tasks in multi-task way.
class BertMy(nn.Module):
    """Shared BERT masked-LM backbone feeding two linear task heads.

    The backbone logits are flattened per sample, then batch normalisation
    and dropout are applied separately on the way into each head.

    Args:
        segment_size: Multiplied by the vocabulary size to get the flattened
            feature width (presumably the token count per input - confirm).
        output_size: Output width of each task head.
        dropout: Dropout probability used in front of both heads.
    """

    def __init__(self, segment_size, output_size, dropout):
        super().__init__()
        self.bert = AutoModelForMaskedLM.from_pretrained("cl-tohoku/bert-base-japanese")
        self.bert_vocab_size = 32000

        flat_features = segment_size * self.bert_vocab_size
        self.bn = nn.BatchNorm1d(flat_features)
        self.fc1 = nn.Linear(flat_features, output_size)
        self.fc2 = nn.Linear(flat_features, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``[task1_output, task2_output]`` for the input batch."""
        logits = self.bert(x).logits
        flat = logits.view(logits.shape[0], -1)
        # Each head gets its own bn + dropout pass over the same features.
        task1 = self.fc1(self.dropout(self.bn(flat)))
        task2 = self.fc2(self.dropout(self.bn(flat)))
        return [task1, task2]
I want to add two individual LSTM layers, one for each task, so that the two tasks are predicted independently.
How should I define this in Pytorch?

Using captum with nn.Embedding getting RuntimeError

I am using captum library and getting following error. Here is the complete code to reproduce the error.
RuntimeError: One of the differentiated Tensors appears to not have been used in the graph. Set allow_unused=True if this is the desired behavior.
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from captum.attr import IntegratedGradients
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Model dimensions; predictor.__init__ reads these as module-level globals.
vocab_size = 1024
embedding_dim = 32
seq_len = 128
num_classes = 5
# Stored on the model as self.hidden_dim but not used to size any layer in this version.
hidden_dim = 256
class predictor(nn.Module):
    """Embedding followed by a single linear layer over the flattened sequence.

    All sizes are taken from the module-level globals ``seq_len``,
    ``num_classes``, ``hidden_dim``, ``vocab_size`` and ``embedding_dim``.
    """

    def __init__(self):
        super().__init__()
        self.seq_len = seq_len
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim

        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.linear = nn.Linear(self.seq_len * self.embedding_dim, self.num_classes)

    def forward(self, x):
        """Embed the token ids, flatten each sequence and apply linear + ReLU."""
        embedded = self.embedding(x.long())
        flat = embedded.reshape(-1, self.seq_len * self.embedding_dim)
        return F.relu(self.linear(flat))
class wrapper_predictor(nn.Module):
    """Wrap a model so that its outputs are passed through a softmax.

    Args:
        model: Module whose output is normalised along dimension 1.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        """Return ``softmax(model(x))`` taken over dimension 1."""
        return F.softmax(self.model(x), dim=1)
# seq_len random token ids drawn from [0, vocab_size); the model's forward casts them with x.long().
indexes = torch.Tensor(np.random.randint(0, vocab_size, (seq_len))).to(device)
model = predictor().to(device)
# Softmax wrapper around the model; this is what gets attributed.
wrapper_model = wrapper_predictor(model).to(device)
ig = IntegratedGradients(wrapper_model)
# Per the post, this script reproduces the RuntimeError quoted above - presumably raised by this call, which attributes w.r.t. the raw token ids.
attributions, delta = ig.attribute(inputs=indexes, target=0, n_steps=1, return_convergence_delta=True)
I resolved the issue with LayerIntegratedGradients.
Here is the link to read more to know other possible solutions. https://captum.ai/tutorials/IMDB_TorchText_Interpret
As in the example given in the link, this creates a LayerIntegratedGradients instance from the model's forward function and its embedding layer.
Here is sample code that uses LayerIntegratedGradients with nn.Embedding:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from captum.attr import IntegratedGradients, LayerIntegratedGradients
from torchsummary import summary
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Model dimensions; predictor.__init__ reads these as module-level globals.
vocab_size = 1024
embedding_dim = 1
seq_len = 128
num_classes = 5
hidden_dim = 256
class predictor(nn.Module):
    """Embedding followed by two bias-free linear layers.

    All sizes, and the device the linear layers are created on, are taken
    from the module-level globals ``seq_len``, ``num_classes``,
    ``hidden_dim``, ``vocab_size``, ``embedding_dim`` and ``device``.
    """

    def __init__(self):
        super().__init__()
        self.seq_len = seq_len
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim

        self.embedding = nn.Sequential(
            nn.Embedding(self.vocab_size, self.embedding_dim),
        )
        # NOTE(review): this sets a `weight` attribute on the nn.Sequential
        # wrapper, not on the nn.Embedding inside it - presumably the
        # embedding's own weights are left untouched; confirm the intent.
        self.embedding.weight = torch.randn(
            (self.vocab_size, self.embedding_dim), requires_grad=True
        )

        flat_features = self.seq_len * self.embedding_dim
        self.fc = nn.Sequential(
            nn.Linear(flat_features, self.hidden_dim, device=device, bias=False),
            nn.Linear(self.hidden_dim, self.num_classes, device=device, bias=False),
        )

    def forward(self, x):
        """Embed the token ids, flatten each sequence and apply the linear stack."""
        embedded = self.embedding(x.long())
        flat = embedded.view(-1, self.seq_len * self.embedding_dim)
        return self.fc(flat)
class wrapper_predictor(nn.Module):
    """Wrap a model so that its outputs are passed through a softmax.

    Args:
        model: Module whose output is normalised along dimension 1.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        """Return ``softmax(model(x))`` taken over dimension 1."""
        raw = self.model(x)
        # Keep softmax out of forward if the attribution score is too low.
        return F.softmax(raw, dim=1)
model = predictor().to(device)
# seq_len random token ids drawn from [0, vocab_size); the model's forward casts them with x.long().
indexes = torch.Tensor(np.random.randint(0, vocab_size, (seq_len))).to(device)
input_size = indexes.shape
# NOTE(review): the device is hard-coded to 'cuda' here although `device` above may resolve to the CPU - confirm torchsummary copes with that.
summary(model=model, input_size=input_size, batch_size=-1, device='cuda')
# wrapper_model is built here, but the attribution below is run on `model` itself.
wrapper_model = wrapper_predictor(model).to(device)
# Layer-level attribution targeting model.embedding instead of the raw token ids.
lig = LayerIntegratedGradients(model, model.embedding)
attributions, delta = lig.attribute(inputs=indexes, target=0, n_steps=1, return_convergence_delta=True)

2x nested Tensorflow custom layers results in zero trainable parameters

I am creating a series of custom Tensorflow (version 2.4.1) layers and am running into a problem where the model summary shows zero trainable parameters. Below is a series of examples showing how everything is fine until I add in the last custom layer.
Here are the imports and custom classes:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (BatchNormalization, Conv2D, Input, ReLU,
Layer)
class basic_conv_stack(Layer):
    """Conv2D ('same' padding) -> BatchNormalization -> ReLU.

    Args:
        filters: Number of convolution filters.
        kernel_size: Convolution kernel size.
        strides: Convolution strides.
    """

    def __init__(self, filters, kernel_size, strides):
        super().__init__()
        # Sub-layers are created here and kept as attributes of the layer.
        self.conv1 = Conv2D(filters, kernel_size, strides=strides, padding='same')
        self.bn1 = BatchNormalization()
        self.relu = ReLU()

    def call(self, x):
        """Apply convolution, batch normalisation and ReLU in that order."""
        return self.relu(self.bn1(self.conv1(x)))
class basic_residual(Layer):
    """Two ``basic_conv_stack`` layers applied one after the other.

    Args:
        filters: Number of filters passed to both stacks.
        kernel_size: Kernel size passed to both stacks.
        strides: Strides passed to both stacks.
    """

    def __init__(self, filters, kernel_size, strides):
        super().__init__()
        self.bcs1 = basic_conv_stack(filters, kernel_size, strides)
        self.bcs2 = basic_conv_stack(filters, kernel_size, strides)

    def call(self, x):
        """Run the input through the first stack, then the second."""
        return self.bcs2(self.bcs1(x))
# Version from the question that reports zero trainable parameters:
# __init__ only stores the basic_residual class and its arguments; no
# sub-layer instance is created or kept as an attribute.
class basic_module(Layer):
def __init__(self, filters, kernel_size, strides):
super(basic_module, self).__init__()
# The class object itself, not an instance.
self.res = basic_residual
self.args = (filters, kernel_size, strides)
def call(self, x):
for _ in range(4):
# A new basic_residual is constructed on every pass through the loop.
x = self.res(*self.args)(x)
return x
Now, if I do the following, everything works out ok and I get 300 trainable parameters:
# Functional model wrapping a single basic_conv_stack on a 128x128x3 input.
input_layer = Input((128, 128, 3))
conv = basic_conv_stack(10, 3, 1)(input_layer)
model = Model(input_layer, conv)
# NOTE(review): summary() presumably prints the table itself, so the outer print() would add its return value on an extra line - confirm.
print (model.summary())
Similarly, if I do the following, I get 1,230 trainable parameters:
# Functional model wrapping a single basic_residual on a 128x128x3 input.
input_layer = Input((128, 128, 3))
conv = basic_residual(10, 3, 1)(input_layer)
model = Model(input_layer, conv)
# NOTE(review): summary() presumably prints the table itself, so the outer print() would add its return value on an extra line - confirm.
print (model.summary())
However, if I try the basic_module class, I get zero trainable parameters:
# Functional model wrapping a single basic_module on a 128x128x3 input.
input_layer = Input((128, 128, 3))
conv = basic_module(10, 3, 1)(input_layer)
model = Model(input_layer, conv)
# NOTE(review): summary() presumably prints the table itself, so the outer print() would add its return value on an extra line - confirm.
print (model.summary())
Does anyone know why this is happening?
Edit to add:
I discovered that the layers used in the call must be initialized in the class's init for things to work properly. So if I change the basic module to this:
class basic_module(Layer):
    """Four ``basic_residual`` layers applied in sequence.

    The sub-layers are instantiated once in ``__init__`` and stored on the
    layer, which is the change the post reports as making things work.

    Args:
        filters: Number of filters passed to every residual layer.
        kernel_size: Kernel size passed to every residual layer.
        strides: Strides passed to every residual layer.
    """

    def __init__(self, filters, kernel_size, strides):
        super().__init__()
        self.clayers = [basic_residual(filters, kernel_size, strides) for _ in range(4)]

    def call(self, x):
        """Feed the input through each stored residual layer in order."""
        for residual in self.clayers:
            x = residual(x)
        return x
Everything works fine. I don't know why this is the case, so I'll leave this question open in case someone can answer the why of this question.
You have to initialize the class instances with the required parameters, such as filters, kernel_size and strides, inside the predefined basic_module. Also note that these hyper-parameters determine the properties of the trainable weights.
# A bare class name is just the class object; calling it with the
# hyper-parameters is what produces a layer instance:
# >>> a = basic_module
# >>> a
# __main__.basic_module
# >>> a = basic_module(10, 3, 1)
# >>> a
# <__main__.basic_module at 0x7f6123eed510>
class basic_module(Layer):
def __init__(self, filters, kernel_size, strides):
super(basic_module, self).__init__()
self.res = basic_residual # <--- the class itself is stored here, not an instance built from the arguments
self.args = (filters, kernel_size, strides)
def call(self, x):
for _ in range(4):
# A new basic_residual is constructed on every pass through the loop.
x = self.res(*self.args)(x)
return x

How to define the loss function using the output of intermediate layers?

class Model(nn.Module):
    """Regression network: 300 -> 100 -> 10 -> 5 -> 1.

    ``encoder`` is a plain linear layer, ``dense1`` and ``dense2`` are
    linear + ReLU, and ``dense3`` is the final linear output layer.
    """

    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(300, 100)
        self.dense1 = nn.Sequential(nn.Linear(100, 10), nn.ReLU())
        self.dense2 = nn.Sequential(nn.Linear(10, 5), nn.ReLU())
        self.dense3 = nn.Sequential(nn.Linear(5, 1))

    def forward(self, x):
        """Run the input through all four stages and return the final output."""
        for stage in (self.encoder, self.dense1, self.dense2, self.dense3):
            x = stage(x)
        return x
I am working on a regression problem, and I need to use the output of the dense2 layer to calculate the loss.
output of dense2 layer is 5 dimensional (5x1).
I am using PyTorch.
Dataset: Suppose i am using 300 features and i need to predict some score(a floating value).
Input: 300 Features
Output: Some Floating Value
In general, your nn.Module can return as many elements as you like. Moreover, you don't have to use all of them anywhere - there is no mechanism that checks that. PyTorch's philosophy is to build the computational graph on the fly.
class Model(nn.Module):
    """Regression network (300 -> 100 -> 10 -> 5 -> 1) exposing a hidden output.

    ``forward`` returns both the final prediction and the activations of
    ``dense2`` so that a loss can be computed on either.
    """

    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(300, 100)
        self.dense1 = nn.Sequential(nn.Linear(100, 10), nn.ReLU())
        self.dense2 = nn.Sequential(nn.Linear(10, 5), nn.ReLU())
        self.dense3 = nn.Sequential(nn.Linear(5, 1))

    def forward(self, x):
        """Return ``(dense3_output, dense2_output)`` for the input batch."""
        hidden = self.dense2(self.dense1(self.encoder(x)))
        return self.dense3(hidden), hidden

Understanding the code in pyTorch

I am having problems with understanding the following part of the code from ResNet architecture. The full code is available at https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/02-intermediate/deep_residual_network/main-gpu.py . I am not very familiar with Python.
# Residual Block
class ResidualBlock(nn.Module):
    """Two conv3x3 + batch-norm stages with a skip connection.

    Args:
        in_channels: Channels of the incoming tensor.
        out_channels: Channels produced by both convolutions.
        stride: Stride of the first convolution.
        downsample: Optional module applied to the input before it is added
            back, used when the input and output shapes differ.
    """

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(out_channels, out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Identity shortcut unless a downsample module was supplied.
        shortcut = self.downsample(x) if self.downsample else x
        out += shortcut
        return self.relu(out)
# ResNet Module
class ResNet(nn.Module):
    """Three-stage residual network assembled from a caller-supplied block type.

    Args:
        block: Block class; it is instantiated as
            ``block(in_channels, out_channels, stride, downsample)``.
        layers: Sequence of three ints giving the number of blocks in
            stage 1, stage 2 and stage 3 respectively.
        num_classes: Width of the final linear layer.
    """

    def __init__(self, block, layers, num_classes=10):
        super(ResNet, self).__init__()
        self.in_channels = 16
        self.conv = conv3x3(3, 16)
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        # Each stage takes its own depth entry. The original indexed
        # layers[0], layers[0], layers[1], so layers[2] was never read.
        self.layer1 = self.make_layer(block, 16, layers[0])
        self.layer2 = self.make_layer(block, 32, layers[1], 2)
        self.layer3 = self.make_layer(block, 64, layers[2], 2)
        self.avg_pool = nn.AvgPool2d(8)
        self.fc = nn.Linear(64, num_classes)

    def make_layer(self, block, out_channels, blocks, stride=1):
        """Build one stage of ``blocks`` blocks and update ``self.in_channels``.

        Only the first block may change the stride or channel count; it is
        given a conv3x3 + batch-norm downsample module for its shortcut
        whenever the shapes would otherwise not match.
        """
        downsample = None
        if (stride != 1) or (self.in_channels != out_channels):
            downsample = nn.Sequential(
                conv3x3(self.in_channels, out_channels, stride=stride),
                nn.BatchNorm2d(out_channels))
        layers = [block(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = out_channels
        layers.extend(block(out_channels, out_channels) for _ in range(1, blocks))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the class scores for a batch of images."""
        out = self.relu(self.bn(self.conv(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            out = stage(out)
        out = self.avg_pool(out)
        # Flatten everything except the batch dimension for the classifier.
        out = out.view(out.size(0), -1)
        return self.fc(out)
resnet = ResNet(ResidualBlock, [3, 3, 3])
My main question is why should we pass 'block' every time? In the function
def make_layer(self, block, out_channels, blocks, stride=1):
instead of passing 'block' why cant we create an instance of 'ResidualBlock' and append it with layers as follows?
block = ResidualBlock(self.in_channels, out_channels, stride, downsample)
layers.append(block)
The ResNet module is designed to be generic, so that it can create networks with arbitrary blocks. So, if you do not pass the block which you want to create you'll have to write the name of the block explicitly like below.
# Residual Block
class ResidualBlock(nn.Module):
    """Two conv3x3 + batch-norm stages with a skip connection.

    Args:
        in_channels: Channels of the incoming tensor.
        out_channels: Channels produced by both convolutions.
        stride: Stride of the first convolution.
        downsample: Optional module applied to the input before it is added
            back, used when the input and output shapes differ.
    """

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(out_channels, out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Identity shortcut unless a downsample module was supplied.
        shortcut = self.downsample(x) if self.downsample else x
        out += shortcut
        return self.relu(out)
# ResNet Module
class ResNet(nn.Module):
    """Three-stage residual network hard-wired to ``ResidualBlock``.

    Args:
        layers: Sequence of three ints giving the number of blocks in
            stage 1, stage 2 and stage 3 respectively.
        num_classes: Width of the final linear layer.
    """

    def __init__(self, layers, num_classes=10):
        super(ResNet, self).__init__()
        self.in_channels = 16
        self.conv = conv3x3(3, 16)
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        # Each stage takes its own depth entry. The original indexed
        # layers[0], layers[0], layers[1], so layers[2] was never read.
        self.layer1 = self.make_layer(16, layers[0])
        self.layer2 = self.make_layer(32, layers[1], 2)
        self.layer3 = self.make_layer(64, layers[2], 2)
        self.avg_pool = nn.AvgPool2d(8)
        self.fc = nn.Linear(64, num_classes)

    def make_layer(self, out_channels, blocks, stride=1):
        """Build one stage of ``blocks`` blocks and update ``self.in_channels``.

        Only the first block may change the stride or channel count; it is
        given a conv3x3 + batch-norm downsample module for its shortcut
        whenever the shapes would otherwise not match.
        """
        downsample = None
        if (stride != 1) or (self.in_channels != out_channels):
            downsample = nn.Sequential(
                conv3x3(self.in_channels, out_channels, stride=stride),
                nn.BatchNorm2d(out_channels))
        layers = [ResidualBlock(self.in_channels, out_channels, stride, downsample)]  # Major change here
        self.in_channels = out_channels
        for _ in range(1, blocks):
            layers.append(ResidualBlock(out_channels, out_channels))  # Major change here
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the class scores for a batch of images."""
        out = self.relu(self.bn(self.conv(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            out = stage(out)
        out = self.avg_pool(out)
        # Flatten everything except the batch dimension for the classifier.
        out = out.view(out.size(0), -1)
        return self.fc(out)
resnet = ResNet([3, 3, 3])
This reduces the capability of your ResNet module and binds it with only the ResidualBlock. Now, if you create some other type of block (say ResidualBlock2), you will need to create another Resnet2 module specifically for that. So, it's better to create a generic ResNet module which takes in the block parameter, so that it can be used with different types of blocks.
A trivial python example to clarify
Suppose you want to create a function that can apply a mathematical operation on a list and returns its output. So, you might create something like below
def exp(inp_list):
    """Return a new list holding ``math.exp`` of every number in *inp_list*."""
    return [math.exp(value) for value in inp_list]
def floor(inp_list):
    """Return a new list holding ``math.floor`` of every number in *inp_list*."""
    return [math.floor(value) for value in inp_list]
Here, we are doing an exponent and a floor operation on some input list. But, we can do a better job by defining a generic function to do the same as
def apply_func(fn, inp_list):
    """Return a new list with *fn* applied to every element of *inp_list*."""
    return [fn(item) for item in inp_list]
and now call this apply_func as apply_func(math.exp, inp_list) for exponential and as apply_func(math.floor, inp_list) for floor function. Also this opens up possibility for any kind of operation.
Note: It's not a practical example as you can always use map or list comprehension for achieving the same thing. But, it demonstrates the use clearly.

Categories