Understanding the code in PyTorch - python

I am having problems understanding the following part of the code for the ResNet architecture. The full code is available at https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/02-intermediate/deep_residual_network/main-gpu.py . I am not very familiar with Python.
# Residual Block
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(out_channels, out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out
# ResNet Module
class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=10):
        super(ResNet, self).__init__()
        self.in_channels = 16
        self.conv = conv3x3(3, 16)
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(block, 16, layers[0])
        self.layer2 = self.make_layer(block, 32, layers[1], 2)
        self.layer3 = self.make_layer(block, 64, layers[2], 2)
        self.avg_pool = nn.AvgPool2d(8)
        self.fc = nn.Linear(64, num_classes)

    def make_layer(self, block, out_channels, blocks, stride=1):
        downsample = None
        if (stride != 1) or (self.in_channels != out_channels):
            downsample = nn.Sequential(
                conv3x3(self.in_channels, out_channels, stride=stride),
                nn.BatchNorm2d(out_channels))
        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels
        for i in range(1, blocks):
            layers.append(block(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv(x)
        out = self.bn(out)
        out = self.relu(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.avg_pool(out)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out
resnet = ResNet(ResidualBlock, [3, 3, 3])
My main question is: why should we pass 'block' every time? In the function
def make_layer(self, block, out_channels, blocks, stride=1):
instead of passing 'block', why can't we create an instance of 'ResidualBlock' and append it to layers as follows?
block = ResidualBlock(self.in_channels, out_channels, stride, downsample)
layers.append(block)

The ResNet module is designed to be generic, so that it can build networks from arbitrary block types. If you do not pass in the block you want to use, you have to write the name of the block explicitly, like below.
# Residual Block
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(out_channels, out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out
# ResNet Module
class ResNet(nn.Module):
    def __init__(self, layers, num_classes=10):
        super(ResNet, self).__init__()
        self.in_channels = 16
        self.conv = conv3x3(3, 16)
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(16, layers[0])
        self.layer2 = self.make_layer(32, layers[1], 2)
        self.layer3 = self.make_layer(64, layers[2], 2)
        self.avg_pool = nn.AvgPool2d(8)
        self.fc = nn.Linear(64, num_classes)

    def make_layer(self, out_channels, blocks, stride=1):
        downsample = None
        if (stride != 1) or (self.in_channels != out_channels):
            downsample = nn.Sequential(
                conv3x3(self.in_channels, out_channels, stride=stride),
                nn.BatchNorm2d(out_channels))
        layers = []
        layers.append(ResidualBlock(self.in_channels, out_channels, stride, downsample))  # Major change here
        self.in_channels = out_channels
        for i in range(1, blocks):
            layers.append(ResidualBlock(out_channels, out_channels))  # Major change here
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv(x)
        out = self.bn(out)
        out = self.relu(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.avg_pool(out)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out
resnet = ResNet([3, 3, 3])
This reduces the capability of your ResNet module and ties it to ResidualBlock alone. If you later create some other type of block (say ResidualBlock2), you will need to write another ResNet2 module specifically for it. So it's better to create a generic ResNet module that takes the block as a parameter, so it can be used with different types of blocks.
A trivial Python example to clarify
Suppose you want a function that applies a mathematical operation to a list and returns the result. You might write something like this:
import math

def exp(inp_list):
    out_list = []
    for num in inp_list:
        out_list.append(math.exp(num))
    return out_list

def floor(inp_list):
    out_list = []
    for num in inp_list:
        out_list.append(math.floor(num))
    return out_list
Here we apply an exponential and a floor operation to an input list. We can do a better job by defining one generic function that does the same:
def apply_func(fn, inp_list):
    out_list = []
    for num in inp_list:
        out_list.append(fn(num))
    return out_list
Now call apply_func(math.exp, inp_list) for the exponential and apply_func(math.floor, inp_list) for the floor function. This also opens up the possibility of applying any other operation.
Note: It's not a practical example, since you could always use map or a list comprehension to achieve the same thing, but it demonstrates the idea clearly.
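Back in the ResNet setting, the same idea means that any block class with a compatible constructor can be plugged into the unchanged, generic ResNet. Below is a minimal sketch under that assumption; ResidualBlock2 and its pre-activation layout are purely hypothetical and only for illustration, reusing the conv3x3 helper from the tutorial.

class ResidualBlock2(nn.Module):
    # Hypothetical pre-activation block, for illustration only.
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock2, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv2 = conv3x3(out_channels, out_channels)
        self.downsample = downsample

    def forward(self, x):
        residual = x
        out = self.conv1(self.relu(self.bn1(x)))    # BN -> ReLU -> conv
        out = self.conv2(self.relu(self.bn2(out)))
        if self.downsample:
            residual = self.downsample(x)
        return out + residual

# The same generic ResNet accepts either block type:
resnet_a = ResNet(ResidualBlock, [3, 3, 3])
resnet_b = ResNet(ResidualBlock2, [3, 3, 3])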

Related

Why does GraphRNN pack_padded after linear transform?

I am a beginner in machine learning. I'm reading GraphRNN's code now:
class GRU_plain(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers,
                 has_input=True, has_output=False, output_size=None):
        super(GRU_plain, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.has_input = has_input
        self.has_output = has_output
        if has_input:
            self.input = nn.Linear(input_size, embedding_size)
            self.rnn = nn.GRU(input_size=embedding_size, hidden_size=hidden_size,
                              num_layers=num_layers, batch_first=True)
        else:
            self.rnn = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                              num_layers=num_layers, batch_first=True)
        if has_output:
            self.output = nn.Sequential(
                nn.Linear(hidden_size, embedding_size),
                nn.ReLU(),
                nn.Linear(embedding_size, output_size)
            )
        self.relu = nn.ReLU()
        # initialize
        self.hidden = None  # need initialize before forward run
        for name, param in self.rnn.named_parameters():
            if 'bias' in name:
                nn.init.constant(param, 0.25)
            elif 'weight' in name:
                nn.init.xavier_uniform(param, gain=nn.init.calculate_gain('sigmoid'))
        for m in self.modules():
            if isinstance(m, nn.Linear):
                m.weight.data = init.xavier_uniform(m.weight.data, gain=nn.init.calculate_gain('relu'))

    def init_hidden(self, batch_size):
        return Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)).cuda()

    def forward(self, input_raw, pack=False, input_len=None):
        if self.has_input:
            input = self.input(input_raw)
            input = self.relu(input)
        else:
            input = input_raw
        if pack:
            input = pack_padded_sequence(input, input_len, batch_first=True)
        output_raw, self.hidden = self.rnn(input, self.hidden)
        if pack:
            output_raw = pad_packed_sequence(output_raw, batch_first=True)[0]
        if self.has_output:
            output_raw = self.output(output_raw)
        # return hidden state at each time step
        return output_raw
In forward, why does GraphRNN call pack_padded_sequence after the linear transform by self.input?
Don't the non-padded rows pick up information from the padded zeros?
I suspect it is just convenient for coding and has little effect on the result,
but I can't find another RNN example that does the same thing.
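For reference, here is a minimal sketch (with assumed toy shapes) of what pack_padded_sequence keeps when it is applied after a per-timestep linear layer: the Linear does transform the padded positions, but packing drops them before they ever reach the GRU.

import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence

# Toy batch: 2 sequences with feature size 4, padded to length 3;
# the second sequence has only 2 valid steps.
x = torch.randn(2, 3, 4)
x[1, 2] = 0.0                      # the padded position
lengths = torch.tensor([3, 2])

linear = nn.Linear(4, 8)
h = linear(x)                      # the pad row is transformed too

packed = pack_padded_sequence(h, lengths, batch_first=True)
# Only 3 + 2 = 5 timesteps survive; the transformed pad row is discarded,
# so the GRU never sees it.
print(packed.data.shape)           # torch.Size([5, 8])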

How can I add a reshape layer in nn.Sequential?

So I'm implementing the Generator of a GAN, and I need the architecture shown below:
The problem is that when I try to reshape the output of the Linear layer after BatchNorm and ReLU (labelled Dense in the figure, as they used Tensorflow), it throws the error: TypeError: reshape(): argument 'input' (position 1) must be Tensor, not int
I understand the error but I can't find a solution for it.
Is there any other way to reshape within nn.Sequential instead of calling torch explicitly?
class Generator(nn.Module):
    def __init__(self, z_dim=100, im_chan=1, hidden_dim=64, rdim=9216):
        super(Generator, self).__init__()
        self.z_dim = z_dim
        self.gen = nn.Sequential(
            nn.Linear(z_dim, rdim),
            nn.BatchNorm2d(rdim, momentum=0.9),
            nn.ReLU(inplace=True),
            torch.reshape(rdim, (6, 6, 256)),   # ----> this line throws the TypeError
            self.make_gen_block(rdim, hidden_dim*2),
            self.make_gen_block(hidden_dim*2, hidden_dim),
            self.make_gen_block(hidden_dim, im_chan, final_layer=True),
        )

    def make_gen_block(self, input_channels, output_channels, kernel_size=1, stride=2, final_layer=False):
        if not final_layer:
            return nn.Sequential(
                nn.ConvTranspose2d(input_channels, output_channels, kernel_size, stride),
                nn.BatchNorm2d(output_channels),
                nn.ReLU(inplace=True)
            )
        else:
            return nn.Sequential(
                nn.ConvTranspose2d(input_channels, output_channels, kernel_size, stride),
                nn.Tanh()
            )

    def unsqueeze_noise(self, noise):
        return noise.view(len(noise), self.z_dim, 1, 1)

    def forward(self, noise):
        x = self.unsqueeze_noise(noise)
        return self.gen(x)

def get_noise(n_samples, z_dim, device='cpu'):
    return torch.randn(n_samples, z_dim, device=device)

# Testing the Gen arch
gen = Generator()
num_test = 100

# test the hidden block
test_hidden_noise = get_noise(num_test, gen.z_dim)
test_hidden_block = gen.make_gen_block(6, 6, kernel_size=1, stride=2)
test_uns_noise = gen.unsqueeze_noise(test_hidden_noise)
hidden_output = test_hidden_block(test_uns_noise)
In nn.Sequential, torch.nn.Unflatten() can do the reshape for you.
For nn.Linear, the input shape is (N, *, H_in) and the output shape is (N, *, H_out). Note that the feature dimension is last, so unsqueeze_noise() is not useful here.
Based on the network structure, the arguments passed to make_gen_block are wrong.
I have checked the following code:
import torch
from torch import nn

class Generator(nn.Module):
    def __init__(self, z_dim=100, im_chan=1, hidden_dim=64, rdim=9216):
        super(Generator, self).__init__()
        self.z_dim = z_dim
        self.gen = nn.Sequential(
            nn.Linear(z_dim, rdim),
            nn.BatchNorm1d(rdim, momentum=0.9),  # use BN1d
            nn.ReLU(inplace=True),
            nn.Unflatten(1, (256, 6, 6)),
            self.make_gen_block(256, hidden_dim*2, kernel_size=2),                      # note arguments
            self.make_gen_block(hidden_dim*2, hidden_dim, kernel_size=2),               # note kernel_size
            self.make_gen_block(hidden_dim, im_chan, kernel_size=2, final_layer=True),  # note kernel_size
        )

    def make_gen_block(self, input_channels, output_channels, kernel_size=1, stride=2, final_layer=False):
        if not final_layer:
            return nn.Sequential(
                nn.ConvTranspose2d(input_channels, output_channels, kernel_size, stride),
                nn.BatchNorm2d(output_channels),
                nn.ReLU(inplace=True)
            )
        else:
            return nn.Sequential(
                nn.ConvTranspose2d(input_channels, output_channels, kernel_size, stride),
                nn.Tanh()
            )

    def forward(self, x):
        return self.gen(x)

def get_noise(n_samples, z_dim, device='cpu'):
    return torch.randn(n_samples, z_dim, device=device)

gen = Generator()
num_test = 100
input_noise = get_noise(num_test, gen.z_dim)
output = gen(input_noise)

assert output.shape == (num_test, 1, 48, 48)

Changing the output of a convolutional layer to a tuple of tensors

For processing video frames, I use the squeeze and excitation block for weighting the channels of a convolutional layer.
I want to combine (using torch.stack) the channels (feature maps) of a convolutional layer with the weighted channels produced by the squeeze-and-excitation block mentioned above. But when I call torch.stack(x, weighted_channels), where x holds the convolutional layer's channels, I get the error TypeError: stack(): argument 'tensors' (position 1) must be tuple of Tensors, not Tensor.
class conv(nn.Module):
    def __init__(self, in_channel, out_channel, out_sigmoid=False):
        super(conv, self).__init__()
        self.deconv = self._deconv(in_channel=512, out_channel=256, num_conv=3)
        self.upsample = Upsample(scale_factor=2, mode='bilinear')
        self.SEBlock = SE_Block(c=256)

    def _deconv(self, in_channel, out_channel, num_conv=2, kernel_size=3, stride=1, padding=1):
        layers = []
        layers.append(BasicConv2d(in_channel, out_channel, kernel_size=kernel_size, stride=stride, padding=padding))
        for i in range(1, num_conv):
            layers.append(_SepConv2d(out_channel, out_channel, kernel_size=kernel_size, stride=stride, padding=padding))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.deconv(x)
        x = self.upsample(x)
        stack = torch.stack(x, self.SEBlock(x, c=256))
        return x

class BasicConv2d(nn.Module):
    def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0):
        super(BasicConv2d, self).__init__()
        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
        self.bn = nn.BatchNorm2d(out_planes, eps=1e-3, momentum=0.001, affine=True)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        return x

class _SepConv2d(nn.Module):
    def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0):
        super(_SepConv2d, self).__init__()
        self.conv_s = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, bias=False, groups=in_planes)
        self.bn_s = nn.BatchNorm2d(out_planes)
        self.relu_s = nn.ReLU()
        self.conv_t = nn.Conv2d(out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn_t = nn.BatchNorm2d(out_planes)
        self.relu_t = nn.ReLU()

    def forward(self, x):
        x = self.conv_s(x)
        x = self.bn_s(x)
        x = self.relu_s(x)
        x = self.conv_t(x)
        x = self.bn_t(x)
        x = self.relu_t(x)
        return x

class SE_Block(nn.Module):
    "credits: https://github.com/moskomule/senet.pytorch/blob/master/senet/se_module.py#L4"
    def __init__(self, c, r=16):
        super().__init__()
        self.squeeze = nn.AdaptiveAvgPool2d(1)
        self.excitation = nn.Sequential(
            nn.Linear(c, c // r, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(c // r, c, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        bs, c, _, _ = x.shape
        y = self.squeeze(x).view(bs, c)
        y = self.excitation(y).view(bs, c, 1, 1)
        return x * y.expand_as(x)
I checked both arguments of torch.stack and they have the same size.
See https://pytorch.org/docs/stable/generated/torch.stack.html.
torch.stack(tensors, dim=0, *, out=None) → Tensor
tensors (sequence of Tensors) – sequence of tensors to concatenate
A sequence of tensors can be a tuple such as (tensor1, tensor2, tensor3) or a list such as [tensor1, tensor2, tensor3]. What you did was pass x, which is a single tensor rather than a sequence of tensors, while weighted_channels was interpreted as the dim parameter.
So, as noted in the comments, either
torch.stack((x, weighted_channels)) or torch.stack([x, weighted_channels]) should work.
Keep in mind that this is the same for all functions that take an arbitrary number of tensors and do something with them, e.g. torch.cat and the other stacking functions such as vstack and hstack.
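A quick, self-contained illustration (with assumed toy shapes) of the difference:

import torch

x = torch.randn(2, 256, 8, 8)                   # e.g. the feature maps
weighted_channels = torch.randn(2, 256, 8, 8)   # e.g. the SE-weighted maps

# torch.stack takes a sequence of tensors and creates a new dimension:
stacked = torch.stack((x, weighted_channels))       # shape (2, 2, 256, 8, 8)

# torch.cat also takes a sequence, but joins along an existing dimension:
catted = torch.cat([x, weighted_channels], dim=1)   # shape (2, 512, 8, 8)

print(stacked.shape, catted.shape)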

2x nested Tensorflow custom layers results in zero trainable parameters

I am creating a series of custom Tensorflow (version 2.4.1) layers and am running into a problem where the model summary shows zero trainable parameters. Below is a series of examples showing how everything is fine until I add in the last custom layer.
Here are the imports and custom classes:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (BatchNormalization, Conv2D, Input, ReLU,
                                     Layer)

class basic_conv_stack(Layer):
    def __init__(self, filters, kernel_size, strides):
        super(basic_conv_stack, self).__init__()
        self.conv1 = Conv2D(filters, kernel_size, strides, padding='same')
        self.bn1 = BatchNormalization()
        self.relu = ReLU()

    def call(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        return x

class basic_residual(Layer):
    def __init__(self, filters, kernel_size, strides):
        super(basic_residual, self).__init__()
        self.bcs1 = basic_conv_stack(filters, kernel_size, strides)
        self.bcs2 = basic_conv_stack(filters, kernel_size, strides)

    def call(self, x):
        x = self.bcs1(x)
        x = self.bcs2(x)
        return x

class basic_module(Layer):
    def __init__(self, filters, kernel_size, strides):
        super(basic_module, self).__init__()
        self.res = basic_residual
        self.args = (filters, kernel_size, strides)

    def call(self, x):
        for _ in range(4):
            x = self.res(*self.args)(x)
        return x
Now, if I do the following, everything works out ok and I get 300 trainable parameters:
input_layer = Input((128, 128, 3))
conv = basic_conv_stack(10, 3, 1)(input_layer)
model = Model(input_layer, conv)
print (model.summary())
Similarly, if I do the following, I get 1,230 trainable parameters:
input_layer = Input((128, 128, 3))
conv = basic_residual(10, 3, 1)(input_layer)
model = Model(input_layer, conv)
print (model.summary())
However, if I try the basic_module class, I get zero trainable parameters:
input_layer = Input((128, 128, 3))
conv = basic_module(10, 3, 1)(input_layer)
model = Model(input_layer, conv)
print (model.summary())
Does anyone know why this is happening?
Edit to add:
I discovered that the layers used in call must be initialized in the class's __init__ for things to work properly. So if I change the basic_module to this:

class basic_module(Layer):
    def __init__(self, filters, kernel_size, strides):
        super(basic_module, self).__init__()
        self.clayers = [basic_residual(filters, kernel_size, strides) for _ in range(4)]

    def call(self, x):
        for idx in range(4):
            x = self.clayers[idx](x)
        return x

everything works fine. I don't know why this is the case, so I'll leave this question open in case someone can explain why.
You have to instantiate the class with the required parameters, such as filters, kernel_size, and strides, inside basic_module. Note that these hyper-parameters are what determine the trainable weights.

# >>> a = basic_module
# >>> a
# __main__.basic_module
# >>> a = basic_module(10, 3, 1)
# >>> a
# <__main__.basic_module at 0x7f6123eed510>

class basic_module(Layer):
    def __init__(self, filters, kernel_size, strides):
        super(basic_module, self).__init__()
        self.res = basic_residual  # <--- a class, not an instance
        self.args = (filters, kernel_size, strides)

    def call(self, x):
        for _ in range(4):
            x = self.res(*self.args)(x)
        return x
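As a quick sanity check, here is a minimal sketch assuming the corrected basic_module from the edit above (sub-layers instantiated in __init__ rather than created inside call):

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input

input_layer = Input((128, 128, 3))
conv = basic_module(10, 3, 1)(input_layer)
model = Model(input_layer, conv)
print(model.summary())   # trainable parameters are now non-zero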

How to do fully connected batch norm in PyTorch?

torch.nn has the classes BatchNorm1d, BatchNorm2d, and BatchNorm3d, but it doesn't have a fully connected BatchNorm class. What is the standard way of doing normal batch norm in PyTorch?
OK, I figured it out. BatchNorm1d can also handle rank-2 tensors, so it is possible to use BatchNorm1d for the normal fully connected case.
So for example:
import torch.nn as nn
import torch.nn.functional as F

class Policy(nn.Module):
    def __init__(self, num_inputs, action_space, hidden_size1=256, hidden_size2=128):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space

        self.linear1 = nn.Linear(num_inputs, hidden_size1)
        self.linear2 = nn.Linear(hidden_size1, hidden_size2)
        self.linear3 = nn.Linear(hidden_size2, num_outputs)
        self.bn1 = nn.BatchNorm1d(hidden_size1)
        self.bn2 = nn.BatchNorm1d(hidden_size2)

    def forward(self, inputs):
        x = inputs
        x = self.bn1(F.relu(self.linear1(x)))
        x = self.bn2(F.relu(self.linear2(x)))
        out = self.linear3(x)
        return out
BatchNorm1d normally comes before the ReLU, and the bias of the preceding Linear layer is then redundant, so:
import torch.nn as nn
import torch.nn.functional as F

class Policy(nn.Module):
    def __init__(self, num_inputs, action_space, hidden_size1=256, hidden_size2=128):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space

        self.linear1 = nn.Linear(num_inputs, hidden_size1, bias=False)
        self.linear2 = nn.Linear(hidden_size1, hidden_size2, bias=False)
        self.linear3 = nn.Linear(hidden_size2, num_outputs)
        self.bn1 = nn.BatchNorm1d(hidden_size1)
        self.bn2 = nn.BatchNorm1d(hidden_size2)

    def forward(self, inputs):
        x = inputs
        x = F.relu(self.bn1(self.linear1(x)))
        x = F.relu(self.bn2(self.linear2(x)))
        out = self.linear3(x)
        return out
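A small usage sketch with assumed sizes; note that BatchNorm1d needs a batch of more than one sample in training mode, and that you should call model.eval() before single-sample inference so the running statistics are used:

import torch

model = Policy(num_inputs=4, action_space=2)   # sizes chosen for illustration

batch = torch.randn(32, 4)    # rank-2 input: (batch, features)
out = model(batch)            # BatchNorm1d normalizes over the batch dimension
print(out.shape)              # torch.Size([32, 2])

model.eval()                  # switch to running statistics
single = model(torch.randn(1, 4))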
