Pytorch Conv1D gives different size to ConvTranspose1d - python

I am trying to build a basic/shallow CNN auto-encoder for 1D time series data in pytorch/pytorch-lightning.
Currently, my encoding block is:
class encodingBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1d_1 = nn.Conv1d(1, 64, kernel_size=32)
        self.relu = nn.ReLU()
        self.batchnorm = nn.BatchNorm1d(64)
        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=2, return_indices=True)
        self.fc = nn.Linear(64, 4)

    def forward(self, x):
        cnn_out1 = self.conv1d_1(x)
        norm_out1 = self.batchnorm(cnn_out1)
        relu_out1 = self.relu(norm_out1)
        maxpool_out, indices = self.maxpool(relu_out1)
        gap_out = torch.mean(maxpool_out, dim=2)
        fc_out = self.relu(self.fc(gap_out))
        return fc_out, indices
And my decoding block is:
class decodingBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.Tconv1d_1 = nn.ConvTranspose1d(64, 1, kernel_size=32, output_padding=1)
        self.relu = nn.ReLU()
        self.batchnorm = nn.BatchNorm1d(1)
        self.maxunpool = nn.MaxUnpool1d(kernel_size=2, stride=2)
        self.upsamp = nn.Upsample(size=59, mode='nearest')
        self.fc = nn.Linear(4, 64)

    def forward(self, x, indices):
        fc_out = self.fc(x)
        relu_out = self.relu(fc_out)
        relu_out = relu_out.unsqueeze(dim=2)
        upsamp_out = self.upsamp(relu_out)
        maxpool_out = self.maxunpool(upsamp_out, indices)
        cnnT_out = self.Tconv1d_1(maxpool_out)
        norm_out = self.batchnorm(cnnT_out)
        relu_out = self.relu(norm_out)
        return relu_out
However, looking at the outputs:
Input size: torch.Size([1, 1, 150])
Conv1D out size: torch.Size([1, 64, 119])
Maxpool out size: torch.Size([1, 64, 59])
Global average pooling out size: torch.Size([1, 64])
Encoder dense out size: torch.Size([1, 4])
...
Decoder input: torch.Size([1, 4])
Decoder dense out size: torch.Size([1, 64])
Unsqueeze out size: torch.Size([1, 64, 1])
Upsample out size: torch.Size([1, 64, 59])
Decoder maxunpool out size: torch.Size([1, 64, 118])
Transpose Conv out size: torch.Size([1, 1, 149])
The outputs of the MaxUnpool1d and ConvTranspose1d layers do not have the expected dimensions.
I have two questions that I was hoping to get some help on:
Why are the dimensions wrong?
Is there a better way to "reverse" the global average pooling than the upsampling procedure I have used?

1. Regarding input and output shapes:
PyTorch's docs give explicit formulas relating the input and output sizes.
For convolution (Conv1d):
L_out = floor((L_in + 2*padding - dilation*(kernel_size - 1) - 1) / stride + 1)
Similarly for pooling (MaxPool1d):
L_out = floor((L_in + 2*padding - dilation*(kernel_size - 1) - 1) / stride + 1)
For transposed convolution (ConvTranspose1d):
L_out = (L_in - 1)*stride - 2*padding + dilation*(kernel_size - 1) + output_padding + 1
And for unpooling (MaxUnpool1d):
L_out = (L_in - 1)*stride - 2*padding + kernel_size
Make sure your padding and output_padding values add up to the proper output shape.
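As a sanity check (my own sketch, not part of the original answer), you can plug the layer parameters from the question into these formulas and see where the off-by-one comes from:

import math

def conv1d_out(l_in, kernel_size, stride=1, padding=0, dilation=1):
    # output length of Conv1d / MaxPool1d
    return math.floor((l_in + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1)

def unpool1d_out(l_in, kernel_size, stride, padding=0):
    # output length of MaxUnpool1d
    return (l_in - 1) * stride - 2 * padding + kernel_size

def convtranspose1d_out(l_in, kernel_size, stride=1, padding=0, output_padding=0, dilation=1):
    # output length of ConvTranspose1d
    return (l_in - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1

print(conv1d_out(150, 32))            # 119 -> encoder Conv1d
print(conv1d_out(119, 2, stride=2))   # 59  -> encoder MaxPool1d (the floor drops the odd element)
print(unpool1d_out(59, 2, stride=2))  # 118 -> MaxUnpool1d cannot know the pre-pool length was odd
print(convtranspose1d_out(119, 32))   # 150 -> what the transposed conv should give you

MaxUnpool1d's forward also accepts an output_size argument, so in the decoder you can pass the length you expect (119 here) to resolve that ambiguity instead of relying on the default formula.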
2. Is there a better way?
Transposed convolution has its faults, as you already noticed. It also tends to produce "checkerboard artifacts".
One solution is to use pixelshuffle: that is, predict for each low-res point twice the number of channels, and then split them into two points with the desired number of features.
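For the 1D case, a rough sketch of this idea might look like the following (PyTorch's built-in nn.PixelShuffle is 2D-only, so this uses a plain reshape; the channel counts are illustrative, not taken from the question):

import torch
from torch import nn

class PixelShuffle1d(nn.Module):
    """Rearrange (N, C*r, L) into (N, C, L*r) by interleaving channel groups."""
    def __init__(self, upscale_factor):
        super().__init__()
        self.r = upscale_factor

    def forward(self, x):
        n, c_r, l = x.shape
        c = c_r // self.r
        x = x.view(n, c, self.r, l)         # split the channels into (C, r) groups
        x = x.permute(0, 1, 3, 2)           # (N, C, L, r)
        return x.reshape(n, c, l * self.r)  # interleave the groups along the length

# predict 2x the channels with an ordinary conv, then shuffle them into 2x the length
conv = nn.Conv1d(64, 64 * 2, kernel_size=3, padding=1)
up = PixelShuffle1d(2)
x = torch.randn(1, 64, 59)
print(up(conv(x)).shape)  # torch.Size([1, 64, 118])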
Alternatively, you can interpolate using a fixed method from the low resolution to the higher one. Apply regular convolutions to the upsampled vectors.
If you choose this path, you might consider using ResizeRight instead of pytorch's interpolate - it has better handling of edge cases.
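A minimal sketch of this second option for the 1D model in the question: interpolate with a fixed method to the target length, then refine with a regular, length-preserving convolution (layer sizes are illustrative):

import torch
from torch import nn
import torch.nn.functional as F

class UpsampleConv1d(nn.Module):
    def __init__(self, in_channels, out_channels, target_length):
        super().__init__()
        self.target_length = target_length
        # kernel_size=3 with padding=1 keeps the length unchanged
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size=3, padding=1)

    def forward(self, x):
        # fixed (non-learned) interpolation to the desired length
        x = F.interpolate(x, size=self.target_length, mode='nearest')
        # learned refinement with an ordinary convolution
        return self.conv(x)

# e.g. go from a [1, 64, 59] bottleneck back to the original [1, 1, 150]
up = UpsampleConv1d(64, 1, target_length=150)
print(up(torch.randn(1, 64, 59)).shape)  # torch.Size([1, 1, 150])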

Related

Shape inference for PyTorch layers

I'd like to be able to write things in PyTorch like "add another Conv2D on top of the last Conv2D, where the output has 128 channels and the input has the right number of channels to match the previous layer." I end up writing code like this:
import torch
from torch import nn

CONV_CHANNELS = [3, 64, 128, 256, 512, 512]
CONV_SIZE = 3
POOL_SIZE = 2

class CNN(torch.nn.Module):
    def __init__(self):
        super().__init__()

        def make_conv_pool(in_channels, out_channels):
            return nn.Sequential(
                nn.Conv2d(in_channels=in_channels,
                          out_channels=out_channels,
                          kernel_size=CONV_SIZE),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=POOL_SIZE, stride=2))

        self.net = nn.Sequential(*[
            make_conv_pool(in_channels, out_channels)
            for in_channels, out_channels
            in zip(CONV_CHANNELS, CONV_CHANNELS[1:])
        ])
Now I want to add a fully connected layer at the end, and I want the input to be the result of "flattening" the output of the last conv layer across the height, width, and channel dimensions. I end up with some awkward calculation like:
def calculate_fc_input_features(side_length):
    for _ in range(len(CONV_CHANNELS)):
        side_length -= (CONV_SIZE - 1)
        side_length = side_length // POOL_SIZE
    return side_length
so that I can write:
CONV_CHANNELS = [3, 64, 128, 256, 512, 512]
CONV_SIZE = 3
POOL_SIZE = 2
IMAGE_SIDE_LENGTH = 256
NUM_CLASSES = 3

class FlatFC(torch.nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.fc = nn.Linear(in_features, out_features)

    def forward(self, x):
        return self.fc(x.view(x.shape[0], -1))

class CNN(torch.nn.Module):
    def __init__(self):
        super().__init__()

        def make_conv_pool(in_channels, out_channels):
            return nn.Sequential(
                nn.Conv2d(in_channels=in_channels,
                          out_channels=out_channels,
                          kernel_size=CONV_SIZE),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=POOL_SIZE, stride=2))

        convs = nn.Sequential(*[
            make_conv_pool(in_channels, out_channels)
            for in_channels, out_channels
            in zip(CONV_CHANNELS, CONV_CHANNELS[1:])
        ])
        fc = FlatFC(in_features=calculate_fc_input_features(IMAGE_SIDE_LENGTH),
                    out_features=NUM_CLASSES)
        self.net = nn.Sequential(convs, fc)
It feels like I'm writing a lot of unnecessary boilerplate here: for each of the conv layers, the number of input channels is completely determined by the output channels of the previous layer; for the fully connected layer, I have to do a calculation that assumes things about the shape of the network rather than asking the layers themselves something like "if your initial input has shape [B, W, H, C], what is your output shape?"
Is there a better way to do this? Does PyTorch provide a more concise way to say "put another Conv2d on top of this network, and figure out the number of input channels for yourself, because there is only one value that works"? If not, are there libraries that fill this role? It feels like this should be a common task, so I'm surprised by how verbose my implementation seems to need to be.
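Not an answer from the original thread, but one direction worth mentioning: recent PyTorch releases (1.8+) include "lazy" modules such as nn.LazyConv2d and nn.LazyLinear that infer their input sizes on the first forward pass, which removes exactly this kind of boilerplate. A rough sketch:

import torch
from torch import nn

CONV_CHANNELS = [64, 128, 256, 512, 512]  # output channels only
CONV_SIZE = 3
POOL_SIZE = 2
NUM_CLASSES = 3

# Lazy layers leave in_channels / in_features as placeholders and
# materialize them the first time data flows through the network.
layers = []
for out_channels in CONV_CHANNELS:
    layers += [nn.LazyConv2d(out_channels, kernel_size=CONV_SIZE),
               nn.ReLU(),
               nn.MaxPool2d(kernel_size=POOL_SIZE, stride=2)]
layers += [nn.Flatten(), nn.LazyLinear(NUM_CLASSES)]
net = nn.Sequential(*layers)

# a single dummy forward pass initializes all the inferred shapes
print(net(torch.zeros(1, 3, 256, 256)).shape)  # torch.Size([1, 3])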

Using tf.image.pyramids during training to create downsampled feature maps

I'm attempting to use tf.image.pyramids.downsample from tensorflow_graphics in every Down (encoding) block of an auto-encoder model, so the pyramid levels can then be sent as skip connections to the Up (decoder) blocks.
class DownConv(Model):
    n = 0

    def __init__(self, kernel_size, filters, initializer, n_lower_levels):
        super(DownConv, self).__init__(name=f"DownConv_{DownConv.n}")
        DownConv.n += 1
        self.pad = tf.constant([[0, 0], [kernel_size // 2, kernel_size // 2], [kernel_size // 2, kernel_size // 2], [0, 0]])
        self.conv = L.Conv2D(filters, kernel_size, strides=2, kernel_initializer=initializer)
        self.pyramid = None
        self.filters = filters
        self.n_lower_levels = n_lower_levels

    def call(self, input_t):
        logger.debug(f"Received {input_t.shape} in {self.name}")
        x = tf.pad(input_t, self.pad, "SYMMETRIC")
        x = self.conv(x)
        p = tf.Variable(x)
        self.pyramid = downsample(p, self.n_lower_levels)
        pyramods = ", ".join([str(p.shape) for p in self.pyramid])
        logger.debug(f"Generated pyramids: {pyramods}")
        return tf.nn.selu(x)
However, logging shows that this doesn't work: only the very first pyramid level (the first step of the downsample) has its channel dimension defined; the rest have None for channels.
self.pyramid[0].shape yields the correct (None, 256, 256, 64), but self.pyramid[1] yields (None, 256, 256, None) during a training step. Note that the batch dimension being None at axis=0 is expected; that is normal TensorFlow behavior in these logs.
Due to this issue, the training step produces an error in my Up blocks, when it tries to concatenate the two feature maps:
ValueError: The channel dimension of the inputs should be defined. The input_shape received is (None, 32, 32, None), where axis -1 (0-based) is the channel dimension, which found to be `None`.
Call arguments received:
• input_t=tf.Tensor(shape=(None, 32, 32, 256), dtype=float32)

Issues with the output size of a Many-to-Many CNN-LSTM in PyTorch

I am trying to build a binary temporal image classifier by combining ResNet18 and an LSTM. However, I have never really used RNNs before and have been struggling on getting the correct output shape.
I am using a batch size of 128 and a sequence size of 32. The images are 80x80 grayscale images.
The current model is:
class CNNLSTM(nn.Module):
    def __init__(self):
        super(CNNLSTM, self).__init__()
        self.resnet = models.resnet18(pretrained=False)
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3)
        self.resnet.fc = nn.Sequential(nn.Linear(in_features=512, out_features=256, bias=True))
        self.lstm = nn.LSTM(input_size=256, hidden_size=256, num_layers=3)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x_3d):
        # x_3d: torch.Size([128, 32, 1, 80, 80])
        hidden = None
        toret = []
        for t in range(x_3d.size(1)):
            x = self.resnet(x_3d[:, t, :, :, :])
            out, hidden = self.lstm(x.unsqueeze(0), hidden)
            x = self.fc1(out[-1, :, :])
            x = F.relu(x)
            x = self.fc2(x)
            print("x shape: ", x.shape)
            toret.append(x)
        return torch.stack(toret)
This returns a tensor of shape torch.Size([32, 128, 1]), which, as I understand it, means that the nth row represents the nth time step of each element in the sequence.
How can I get output of shape 128x1x32 instead?
And is there a better way to do this?
You could permute the dimensions:
a = torch.rand(32, 128, 1)
a = a.permute(1, 2, 0) # these are the indices of the original dimensions
print(a.shape)
>> torch.Size([128, 1, 32])
But you could also set batch_first=True in the LSTM module:
self.lstm = nn.LSTM(input_size=256, hidden_size=256, num_layers=3, batch_first=True)
This will expect that the input to the LSTM has the shape batch-size x seq-len x features and will output a tensor in the same way.
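For the model in the question, a rough sketch (my own adaptation, not part of the original answer) of a batch_first=True forward pass that runs the whole feature sequence through the LSTM in one call and returns the [batch, 1, seq] layout directly:

def forward(self, x_3d):
    # x_3d: [batch, seq, 1, 80, 80]
    feats = torch.stack([self.resnet(x_3d[:, t]) for t in range(x_3d.size(1))], dim=1)
    # feats: [batch, seq, 256] -- the layout batch_first=True expects
    out, _ = self.lstm(feats)            # out: [batch, seq, 256]
    x = self.fc2(F.relu(self.fc1(out)))  # x:   [batch, seq, 1]
    return x.permute(0, 2, 1)            # [batch, 1, seq], e.g. [128, 1, 32]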

Translate Keras functional API to PyTorch nn.Module - Conv2d

I'm trying to translate the following Inception code from tutorial in Keras functional API (link) to PyTorch nn.Module:
def conv_module(x, K, kX, kY, stride, chanDim, padding="same"):
# define a CONV => BN => RELU pattern
x = Conv2D(K, (kX, kY), strides=stride, padding=padding)(x)
x = BatchNormalization(axis=chanDim)(x)
x = Activation("relu")(x)
# return the block
return x
def inception_module(x, numK1x1, numK3x3, chanDim):
# define two CONV modules, then concatenate across the
# channel dimension
conv_1x1 = conv_module(x, numK1x1, 1, 1, (1, 1), chanDim)
conv_3x3 = conv_module(x, numK3x3, 3, 3, (1, 1), chanDim)
x = concatenate([conv_1x1, conv_3x3], axis=chanDim)
# return the block
return x
I'm having trouble translating the Conv2D. If I understand correctly:
There is no in_features in Keras - how should I represent it in PyTorch?
Keras filters is PyTorch out_features
kernel_size, stride and padding are the same (maybe a few options for padding are called differently)
Do I understand this correctly? If so, what should I do with in_features? My code so far:
class BasicConv2d(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(in_channels,
                              out_channels,
                              kernel_size=kernel_size,
                              stride=stride)
        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)
        self.relu = nn.ReLU()

    def forward(self, x: Tensor) -> Tensor:
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        return x

class Inception(nn.Module):
    def __init__(
        self,
        in_channels: int,
        num_1x1_filters: int,
        num_3x3_filters: int,
    ) -> None:
        super().__init__()
        # how to fill this further?
        self.conv_1d = BasicConv2d(
            num_1x1_filters,
        )
You're correct for the most part. The in_channels parameter in Conv2d corresponds to the number of output channels of the previous layer. If Conv2d is the first layer, in_channels corresponds to the number of channels in your image: 1 for a grayscale image and 3 for an RGB image.
But I'm not sure how you could concat the two BasicConv2d outputs.
Fixing the batch size as 1, assume the image size is 256x256 and out_channels for the conv1x1 is 64. That branch outputs a tensor of shape torch.Size([1, 64, 256, 256]). Assuming out_channels of 32 for the conv3x3, that branch outputs a tensor of shape torch.Size([1, 32, 254, 254]). These two tensors cannot be concatenated without a trick, such as using padding=1 for the conv3x3 alone, which produces an output of shape torch.Size([1, 32, 256, 256]) and therefore makes the concatenation possible.
Your implementation of BasicConv2d is fine; here is the code for the Inception module.
class Inception(nn.Module):
    def __init__(
        self,
        in_channels: int,
        num_1x1_filters: int,
        num_3x3_filters: int,
    ) -> None:
        super().__init__()
        self.conv1 = BasicConv2d(in_channels, num_1x1_filters, 1, 1)
        self.conv3 = BasicConv2d(in_channels, num_3x3_filters, 3, 1)

    def forward(self, x):
        conv1_out = self.conv1(x)
        conv3_out = self.conv3(x)
        x = torch.cat([conv1_out, conv3_out], dim=1)  # concatenate along the channel dimension
        return x
You need to define two basic conv layers and apply them to the same input separately in the forward pass.
As @planet_pluto pointed out, you can't concatenate two feature maps that have different sizes. You can choose a stride and padding that produce two feature maps of the same size, or alternatively upsample or downsample one of them before you concatenate.
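A small sketch of that padding trick, written with plain nn.Conv2d layers and illustrative channel counts (the BasicConv2d above does not expose a padding argument):

import torch
from torch import nn

# The 1x1 conv preserves the spatial size by itself; the 3x3 conv needs padding=1.
conv1x1 = nn.Conv2d(3, 64, kernel_size=1, stride=1)
conv3x3 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)

x = torch.randn(1, 3, 256, 256)
out = torch.cat([conv1x1(x), conv3x3(x)], dim=1)  # concatenate along the channel dimension
print(out.shape)  # torch.Size([1, 96, 256, 256])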

Why return self.head(x.view(x.size(0), -1)) in the nn.Module for pyTorch reinforcement learning example

I understand that the pole-balancing example requires 2 outputs. Reinforcement Learning (DQN) Tutorial
Here is the output for self.head
print ('x',self.head)
x = Linear(in_features=512, out_features=2, bias=True)
When I run the epochs below is the outputs:
print (self.head(x.view(x.size(0), -1)))
return self.head(x.view(x.size(0), -1))
tensor([[-0.6945, -0.1930]])
tensor([[-0.0195, -0.1452]])
tensor([[-0.0906, -0.1816]])
tensor([[ 0.0631, -0.9051]])
tensor([[-0.0982, -0.5109]])
...
The size of x is:
x = torch.Size([121, 32, 2, 8])
So I am trying to understand what x.view(x.size(0), -1) is doing?
I understand from the comment in the code that it's returning:
Returns tensor([[left0exp,right0exp]...]).
But how is x, which is torch.Size([121, 32, 2, 8]), reduced to a tensor of size 2?
Is there an alternative way of writing this that makes more sense? What if I had 4 outputs? How would I represent that? Why x.size(0)? Why -1?
So it appears that self.head takes the 4-dimensional x down to 2 outputs. Is that correct?
At the bottom is the class I am referring to:
class DQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input image size, so compute it.
        def conv2d_size_out(size, kernel_size=5, stride=2):
            return (size - (kernel_size - 1) - 1) // stride + 1
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, outputs)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))
x.view(x.size(0), -1) is flattening the tensor; this is because the Linear layer only accepts a vector (1d array). To break it down, x.view() reshapes the tensor to the specified shape (more info). x.size(0) returns the 1st dimension of the tensor (which is the batch size; this should remain constant). The -1 in x.view() is a filler: it stands for the dimension we don't know, so PyTorch calculates it automatically. For example, if x = torch.tensor([1,2,3,4]), to reshape the tensor to a 2x2 you could do x.view(2,2), x.view(2,-1) or x.view(-1,2).
The output shape is not a tensor of size 2, but of size 121x2 (the 121 is the batch size, and the 2 comes from the Linear layer's output). So to change the output size from 2 to 4, you would have to change the outputs argument in the __init__ function to 4.
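As a quick illustration with the shape from the question, the flatten keeps the first (batch) dimension and collapses everything else:

import torch

x = torch.randn(121, 32, 2, 8)
flat = x.view(x.size(0), -1)
print(flat.shape)  # torch.Size([121, 512]) since 32 * 2 * 8 = 512

# self.head = nn.Linear(512, 2) would then map each of the 121 rows to 2 values;
# constructing the DQN with outputs=4 would give 4 values per row instead.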
