Kernel size can't be greater than actual input size - python

I have data with depth = 3 and I want to pass it through 3 convolution layers with 3x3x3 kernels each.
My current code is below. The first input is
[batch_size=10, in_channels=1, depth=3, height=128, width=256]
and I notice that after the first conv3d layer the output is [10, 8, 1, 126, 254]. Obviously it now has depth 1, so it can't be passed through another 3x3x3 layer. How can I achieve this?
class CNet(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.conv1 = nn.Conv3d(1, 8, kernel_size=3, stride=1, padding=0)
        self.conv2 = nn.Conv3d(8, 16, kernel_size=3, stride=1, padding=0)
        self.conv3 = nn.Conv3d(16, 32, kernel_size=3, stride=1, padding=0)
        self.fc1 = nn.Linear(value, 2)

    def forward(self, X):
        X = F.relu(self.conv1(X))
        X = F.relu(self.conv2(X))
        X = F.max_pool2d(X, 2)
        X = self.conv3(X)
        X = F.max_pool2d(X, 2)
        X = self.fc1(X)
        return F.softmax(X, dim=1)

You need to use padding. If you only want to pad the input for the convolutions after the first one, and only in the depth dimension so that it keeps the minimum size of 3 that the next 3x3x3 kernel needs, you would use padding=(1, 0, 0). It's 1 because the same padding is applied to both sides, i.e. (padding, input, padding) along that dimension, and the padding tuple of nn.Conv3d is ordered (depth, height, width).
self.conv2 = nn.Conv3d(8, 16, kernel_size=3, stride=1, padding=(1, 0, 0))
self.conv3 = nn.Conv3d(16, 32, kernel_size=3, stride=1, padding=(1, 0, 0))
However, it is common to use padding=1 for all dimensions when using kernel_size=3, because that keeps the dimensions unchanged. This makes it much easier to build deeper networks, since you don't have to worry about the sizes suddenly getting too small, as already happened to your depth dimension. Also, when no padding is used, the corner elements are only included in a single calculation, whereas all other elements contribute to multiple calculations. It is recommended to use kernel_size=3 and padding=1 for all your convolutions.
self.conv1 = nn.Conv3d(1, 8, kernel_size=3, stride=1, padding=1)
self.conv2 = nn.Conv3d(8, 16, kernel_size=3, stride=1, padding=1)
self.conv3 = nn.Conv3d(16, 32, kernel_size=3, stride=1, padding=1)
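As a quick check (a minimal sketch using the input shape from the question), padding=1 keeps every dimension, including depth, unchanged:

import torch
import torch.nn as nn

# Dummy input with the shape from the question: [batch, channels, depth, height, width]
x = torch.randn(10, 1, 3, 128, 256)

conv1 = nn.Conv3d(1, 8, kernel_size=3, stride=1, padding=1)
conv2 = nn.Conv3d(8, 16, kernel_size=3, stride=1, padding=1)
conv3 = nn.Conv3d(16, 32, kernel_size=3, stride=1, padding=1)

out = conv3(conv2(conv1(x)))
print(out.shape)  # torch.Size([10, 32, 3, 128, 256]) -- depth stays at 3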

Related

Calculating dimensions of fully connected layer?

I am struggling to work out how to calculate the dimensions of the fully connected layer. I am inputting images of size (448x448) with a batch size of 16. Below is the code for my convolutional layers:
class ConvolutionalNet(nn.Module):
    def __init__(self, num_classes=182):
        super().__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer3 = nn.Sequential(
            nn.Conv2d(32, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer4 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer5 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
I want to add a fully connected layer:
self.fc = nn.Linear(?, num_classes)
Would anyone be able to explain the best way to go about calculating this? Also, if I have multiple fully connected layers, e.g. self.fc2 and self.fc3, would the second parameter always equal the number of classes? I am new to coding and finding it hard to wrap my head around this.
The conv layers don't change the width/height of the features since you've set padding equal to (kernel_size - 1) / 2. Max pooling with kernel_size = stride = 2 will decrease the width/height by a factor of 2 (rounded down if the input shape is not even).
Using 448 as the input width/height, the output width/height will be 448 // 2 // 2 // 2 // 2 // 2 = 448 / 32 = 14 (where // is the floor-division operator).
The number of channels is fully determined by the last conv layer, which outputs 64 channels.
Therefore you will have a [B,64,14,14] shaped tensor, so the Linear layer should have in_features = 64*14*14 = 12544.
Note you'll need to flatten the input beforehand, something like:
self.layer6 = nn.Sequential(
    nn.Flatten(),
    nn.Linear(12544, num_classes)
)
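If you would rather not do the arithmetic by hand, a common trick is to push a dummy tensor through the convolutional part once and read off the flattened size. A minimal sketch (the helper name and the wrapping of the five blocks into one nn.Sequential are illustrative, not part of the original code):

import torch
import torch.nn as nn

def infer_fc_in_features(conv_layers, input_shape=(3, 448, 448)):
    """Run a dummy forward pass to find the flattened feature size."""
    with torch.no_grad():
        dummy = torch.zeros(1, *input_shape)
        out = conv_layers(dummy)
    return out.flatten(1).shape[1]

# Hypothetical usage: wrap the five blocks in one nn.Sequential.
# features = nn.Sequential(self.layer1, self.layer2, self.layer3, self.layer4, self.layer5)
# self.fc = nn.Linear(infer_fc_in_features(features), num_classes)  # 64 * 14 * 14 = 12544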

output of conv layer in AlexNet

I am looking at an implementation of AlexNet in PyTorch. According to the formula, output_height = (input_height + padding_top + padding_bottom - kernel_height) / stride_height + 1. So, using the formula with an input of size 224, stride = 4, padding = 1, and kernel size = 11, the output should be of size 54.75. But if you run a summary of the model, you see that the output of this first layer is 54. Does PyTorch clip down the output size? If so, does it consistently clip down (it seems like it)? I would like to understand what is going on behind the scenes, please.
Here is the code that I refer to:
net = nn.Sequential(
    nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(256, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 256, kernel_size=3, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2), nn.Flatten(),
    nn.Linear(6400, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 10))
The output size is a whole number, of course! It's just that your formula is not correct; the expression is output_height = floor((input_height + padding_top + padding_bottom - kernel_height) / stride_height + 1). It wouldn't make any sense otherwise.
Yes, it is clipped down whenever the formula produces a fractional output length.
The output is a matrix, and every element of that matrix corresponds to one full placement of the kernel; there is no such thing as an element covering 0.75 of a placement, so the size has to be either clipped down to 54 or rounded up to 55. Because the kernel has to stop striding once the remaining input no longer fits the kernel size (draw it out to see how odd it would be if the kernel kept striding past the edge), the size is clipped down.
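You can confirm the flooring behaviour directly with the first layer from the code above (a minimal check):

import torch
import torch.nn as nn

conv = nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=1)
x = torch.randn(1, 1, 224, 224)
print(conv(x).shape)  # torch.Size([1, 96, 54, 54]) -- floor((224 + 2 - 11) / 4) + 1 = 54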

LayerNorm inside nn.Sequential in torch

I am trying to use LayerNorm inside nn.Sequential in torch. This is what I am looking for:
import torch.nn as nn

class LayerNormCnn(nn.Module):
    def __init__(self):
        super(LayerNormCnn, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1),
            nn.LayerNorm(),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.LayerNorm(),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.net(x)
        return x
Unfortunately, it doesn't work because LayerNorm requires normalized_shape as input. The code above throws the following exception:
nn.LayerNorm(),
TypeError: __init__() missing 1 required positional argument: 'normalized_shape'
Right now, this is how I have implemented it:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LayerNormCnn(nn.Module):
    def __init__(self, state_shape):
        super(LayerNormCnn, self).__init__()
        self.conv1 = nn.Conv2d(state_shape[0], 32, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1)

        # compute shape by doing a forward pass
        with torch.no_grad():
            fake_input = torch.randn(1, *state_shape)
            out = self.conv1(fake_input)
            bn1_size = out.size()[1:]
            out = self.conv2(out)
            bn2_size = out.size()[1:]

        self.bn1 = nn.LayerNorm(bn1_size)
        self.bn2 = nn.LayerNorm(bn2_size)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        return x


if __name__ == '__main__':
    in_shape = (3, 128, 128)
    batch_size = 32

    model = LayerNormCnn(in_shape)
    x = torch.randn((batch_size,) + in_shape)
    out = model(x)
    print(out.shape)
Is it possible to use LayerNorm inside nn.Sequential?
The original layer normalisation paper advised against using layer normalisation in CNNs, as receptive fields around the boundary of images will have different values than the receptive fields in the actual image content. This issue does not arise with RNNs, which is what layer norm was originally tested on. Are you sure you want to be using LayerNorm? If you're looking to compare a different normalisation technique against BatchNorm, consider GroupNorm. This gets rid of the LayerNorm assumption that all channels in a layer contribute equally to a prediction, which is problematic particularly if the layer is convolutional. Instead, each layer's channels are divided further into groups, which still allows a GN layer to learn different statistics across channels.
Please refer here for related discussion.
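If you still want a per-sample normalisation that drops straight into nn.Sequential without knowing the spatial size up front, nn.GroupNorm only needs the channel count. A minimal sketch (the group count and layer sizes are illustrative; with num_groups=1 it normalises over all channels and spatial positions of each sample, which is close in spirit to LayerNorm):

import torch
import torch.nn as nn

net = nn.Sequential(
    nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1),
    nn.GroupNorm(num_groups=8, num_channels=32),   # only the channel count is needed
    nn.ReLU(),
    nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
    nn.GroupNorm(num_groups=8, num_channels=64),
    nn.ReLU(),
)

x = torch.randn(4, 3, 128, 128)
print(net(x).shape)  # torch.Size([4, 64, 32, 32])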

How can I check the image sizes while CNN?

I'm trying to classify cats and dogs in a CNN with PyTorch.
While building the layers and processing the images, I found that the final feature map size doesn't match the size I calculated.
So I tried to check the feature map size at each step of the CNN by printing its shape, but it doesn't work.
I heard TensorFlow lets you check tensor sizes step by step, but how can I do that here?
What I want is:
def __init__(self):
    super(CNN, self).__init__()

    conv1 = nn.Conv2d(1, 16, 3, 1, 1)
    conv1_1 = nn.Conv2d(16, 16, 3, 1, 1)
    pool1 = nn.MaxPool2d(2)
    conv2 = nn.Conv2d(16, 32, 3, 1, 1)
    conv2_1 = nn.Conv2d(32, 32, 3, 1, 1)
    pool2 = nn.MaxPool2d(2)
    conv3 = nn.Conv2d(32, 64, 3, 1, 1)
    conv3_1 = nn.Conv2d(64, 64, 3, 1, 1)
    conv3_2 = nn.Conv2d(64, 64, 3, 1, 1)
    pool3 = nn.MaxPool2d(2)

    self.conv_module = nn.Sequential(
        conv1,
        nn.ReLU(),
        conv1_1,
        nn.ReLU(),
        pool1,
        # check first result size
        conv2,
        nn.ReLU(),
        conv2_1,
        nn.ReLU(),
        pool2,
        # check second result size
        conv3,
        nn.ReLU(),
        conv3_1,
        nn.ReLU(),
        conv3_2,
        nn.ReLU(),
        pool3,
        # check third result size
        pool4,
        # check fourth result size
        pool5
        # check fifth result size
    )
If there's any other way to check feature size at every step, please give some advice.
Thanks in advance.
To do that you shouldn't use nn.Sequential. Just initialize your layers in __init__() and call them in the forward function. In the forward function you can print the shapes out, for example like this:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(...)
        self.maxpool1 = nn.MaxPool2d()
        self.conv2 = nn.Conv2d(...)
        self.maxpool2 = nn.MaxPool2d()

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.maxpool1(x)
        print(x.size())

        x = self.conv2(x)
        x = F.relu(x)
        x = self.maxpool2(x)
        print(x.size())
Hope that's what you're looking for!
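If you would rather keep nn.Sequential, a third-party package such as torchinfo can print the output shape of every layer from a single call. A minimal sketch, assuming torchinfo is installed and using an illustrative 1x128x128 input that you would replace with your actual image size:

import torch.nn as nn
from torchinfo import summary  # pip install torchinfo

model = nn.Sequential(
    nn.Conv2d(1, 16, 3, 1, 1),
    nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Conv2d(16, 32, 3, 1, 1),
    nn.ReLU(),
    nn.MaxPool2d(2),
)

# Prints a per-layer table with the output shape of every module.
summary(model, input_size=(16, 1, 128, 128))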

How to get rid of checkerboard artifacts

I am using a fully convolutional autoencoder to color black and white images; however, the output has a checkerboard pattern and I want to get rid of it. The checkerboard artifacts I have seen so far have always been far smaller than mine, and the usual way to get rid of them is to replace all unpooling operations with bilinear upsampling (I have been told that).
But I cannot simply replace the unpooling operation, because I work with differently sized images; the unpooling operation is needed, otherwise the output tensor could have a different size than the original.
TLDR:
How can I get rid of these checkerboard artifacts without replacing the unpooling operations?
class AE(nn.Module):
    def __init__(self):
        super(AE, self).__init__()

        self.leaky_reLU = nn.LeakyReLU(0.2)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=1, return_indices=True)
        self.unpool = nn.MaxUnpool2d(kernel_size=2, stride=2, padding=1)
        self.softmax = nn.Softmax2d()

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1)
        self.conv6 = nn.ConvTranspose2d(in_channels=1024, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv7 = nn.ConvTranspose2d(in_channels=512, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv8 = nn.ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv9 = nn.ConvTranspose2d(in_channels=128, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv10 = nn.ConvTranspose2d(in_channels=64, out_channels=2, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        # encoder
        x = self.conv1(x)
        x = self.leaky_reLU(x)
        size1 = x.size()
        x, indices1 = self.pool(x)

        x = self.conv2(x)
        x = self.leaky_reLU(x)
        size2 = x.size()
        x, indices2 = self.pool(x)

        x = self.conv3(x)
        x = self.leaky_reLU(x)
        size3 = x.size()
        x, indices3 = self.pool(x)

        x = self.conv4(x)
        x = self.leaky_reLU(x)
        size4 = x.size()
        x, indices4 = self.pool(x)

        ######################
        x = self.conv5(x)
        x = self.leaky_reLU(x)

        x = self.conv6(x)
        x = self.leaky_reLU(x)
        ######################

        # decoder
        x = self.unpool(x, indices4, output_size=size4)
        x = self.conv7(x)
        x = self.leaky_reLU(x)

        x = self.unpool(x, indices3, output_size=size3)
        x = self.conv8(x)
        x = self.leaky_reLU(x)

        x = self.unpool(x, indices2, output_size=size2)
        x = self.conv9(x)
        x = self.leaky_reLU(x)

        x = self.unpool(x, indices1, output_size=size1)
        x = self.conv10(x)
        x = self.softmax(x)

        return x
Instead of using an upconv layer such as nn.ConvTranspose2d, you can use interpolation in the decoder part to go back to your initial format, such as torch.nn.functional.interpolate. It will prevent you from getting checkerboard artifacts.
If you want learnable weights in the decoder, you should also use a conv layer such as nn.Conv2d after each interpolation.
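A minimal sketch of what such a resize-then-convolve decoder step could look like (the class name, channel counts, and target size below are illustrative, not taken from the question's model):

import torch
import torch.nn as nn
import torch.nn.functional as F

class UpBlock(nn.Module):
    """Upsample by interpolation, then refine with a learnable 3x3 convolution."""
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)

    def forward(self, x, output_size):
        # Passing an explicit output_size lets you match the exact encoder resolution,
        # which handles inputs whose sides are not powers of two.
        x = F.interpolate(x, size=output_size, mode='bilinear', align_corners=False)
        return F.leaky_relu(self.conv(x), 0.2)

x = torch.randn(1, 1024, 8, 8)
up = UpBlock(1024, 512)
print(up(x, output_size=(17, 17)).shape)  # torch.Size([1, 512, 17, 17])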
Skip connections are commonly used in Encoder-Decoder architectures, and they help to produce accurate results by passing appearance information from a shallow layer of the encoder (discriminator) to the corresponding deeper layer of the decoder (generator). UNet is a widely used Encoder-Decoder architecture. LinkNet is also very popular, and it differs from UNet in the way it fuses the appearance information of the encoder layer with the decoder layer. In UNet, incoming features (from the encoder) are concatenated in the corresponding decoder layer. LinkNet, on the other hand, performs addition, which is why LinkNet requires fewer operations in a single forward pass and is significantly faster than UNet.
Each convolution block in your decoder might look like the following:
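A minimal sketch of such a block, assuming a UNet-style fusion where the matching encoder feature map is concatenated before a 3x3 convolution (the class name, channel sizes, and pooling settings are illustrative, not the original answer's exact code):

import torch
import torch.nn as nn
import torch.nn.functional as F

class DecoderBlock(nn.Module):
    """Unpool, concatenate the matching encoder features, then convolve."""
    def __init__(self, in_channels, skip_channels, out_channels):
        super().__init__()
        self.unpool = nn.MaxUnpool2d(kernel_size=2, stride=2, padding=1)
        self.conv = nn.Conv2d(in_channels + skip_channels, out_channels,
                              kernel_size=3, stride=1, padding=1)

    def forward(self, x, indices, output_size, skip):
        x = self.unpool(x, indices, output_size=output_size)
        x = torch.cat((x, skip), dim=1)   # UNet-style fusion (LinkNet would add instead)
        return F.leaky_relu(self.conv(x), 0.2)

# Tiny usage example with matching pool/unpool settings:
pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=1, return_indices=True)
skip = torch.randn(1, 64, 16, 16)          # encoder features before pooling
pooled, idx = pool(skip)                   # what the decoder receives (after more layers, normally)
block = DecoderBlock(in_channels=64, skip_channels=64, out_channels=32)
out = block(pooled, idx, output_size=skip.size(), skip=skip)
print(out.shape)                           # torch.Size([1, 32, 16, 16])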
Additionally, I'm attaching a figure below depicting the architectures of UNet and LinkNet. Hope using skip connections will help.
You have this pattern because of the deconvolution (nn.ConvTranspose2d). The article explains it in detail.
You may try Upsample as an alternative. This will not produce the checkerboard pattern.
It works like this:
import torch
input = torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2)
m = torch.nn.Upsample(scale_factor=2, mode='nearest')
m(input)
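For reference, on the 2x2 input above ([[1, 2], [3, 4]]), nearest-neighbour upsampling with scale_factor=2 simply repeats each value into a 2x2 block, producing a 1x1x4x4 tensor: [[1, 1, 2, 2], [1, 1, 2, 2], [3, 3, 4, 4], [3, 3, 4, 4]].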
However, you will not be able to learn anything with Upsample; it is just a transform.
So it is a trade-off. There are many papers online about how to deal with the checkerboard pattern for different problems.
The idea is to train your network so that the checkerboard pattern is gone.
As Kaushik Roy has specified, Skip-Connections are the way to go!
class AE(nn.Module):
    def __init__(self):
        super(AE, self).__init__()

        self.leaky_reLU = nn.LeakyReLU(0.2)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=1, return_indices=True)
        self.unpool = nn.MaxUnpool2d(kernel_size=2, stride=2, padding=1)
        self.softmax = nn.Softmax2d()

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1)
        self.conv6 = nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv7 = nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv8 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv9 = nn.Conv2d(in_channels=256, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv10 = nn.Conv2d(in_channels=128, out_channels=2, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        # encoder
        x = self.conv1(x)
        out1 = self.leaky_reLU(x)
        x = out1
        size1 = x.size()
        x, indices1 = self.pool(x)

        x = self.conv2(x)
        out2 = self.leaky_reLU(x)
        x = out2
        size2 = x.size()
        x, indices2 = self.pool(x)

        x = self.conv3(x)
        out3 = self.leaky_reLU(x)
        x = out3
        size3 = x.size()
        x, indices3 = self.pool(x)

        x = self.conv4(x)
        out4 = self.leaky_reLU(x)
        x = out4
        size4 = x.size()
        x, indices4 = self.pool(x)

        ######################
        x = self.conv5(x)
        x = self.leaky_reLU(x)

        x = self.conv6(x)
        x = self.leaky_reLU(x)
        ######################

        # decoder
        x = self.unpool(x, indices4, output_size=size4)
        x = self.conv7(torch.cat((x, out4), 1))
        x = self.leaky_reLU(x)

        x = self.unpool(x, indices3, output_size=size3)
        x = self.conv8(torch.cat((x, out3), 1))
        x = self.leaky_reLU(x)

        x = self.unpool(x, indices2, output_size=size2)
        x = self.conv9(torch.cat((x, out2), 1))
        x = self.leaky_reLU(x)

        x = self.unpool(x, indices1, output_size=size1)
        x = self.conv10(torch.cat((x, out1), 1))
        x = self.softmax(x)

        return x
This answer was posted as an edit to the question How to get rid of checkerboard artifacts by the OP Stefan under CC BY-SA 4.0.
