Translate Keras functional API to PyTorch nn.Module - Conv2d - python

I'm trying to translate the following Inception code from a tutorial written with the Keras functional API (link) into a PyTorch nn.Module:
def conv_module(x, K, kX, kY, stride, chanDim, padding="same"):
    # define a CONV => BN => RELU pattern
    x = Conv2D(K, (kX, kY), strides=stride, padding=padding)(x)
    x = BatchNormalization(axis=chanDim)(x)
    x = Activation("relu")(x)
    # return the block
    return x

def inception_module(x, numK1x1, numK3x3, chanDim):
    # define two CONV modules, then concatenate across the
    # channel dimension
    conv_1x1 = conv_module(x, numK1x1, 1, 1, (1, 1), chanDim)
    conv_3x3 = conv_module(x, numK3x3, 3, 3, (1, 1), chanDim)
    x = concatenate([conv_1x1, conv_3x3], axis=chanDim)
    # return the block
    return x
I'm having trouble translating the Conv2D. If I understand correctly:
There is no in_features in Keras - how should I represent it in PyTorch?
Keras filters is PyTorch out_features
kernel_size, stride and padding are the same (maybe a few options for padding are called differently)
Do I understand this correctly? If so, what should I do with in_features? My code so far:
class BasicConv2d(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(in_channels,
                              out_channels,
                              kernel_size=kernel_size,
                              stride=stride)
        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)
        self.relu = nn.ReLU()

    def forward(self, x: Tensor) -> Tensor:
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        return x

class Inception(nn.Module):
    def __init__(
        self,
        in_channels: int,
        num_1x1_filters: int,
        num_3x3_filters: int,
    ) -> None:
        super().__init__()
        # how to fill this further?
        self.conv_1d = BasicConv2d(
            num_1x1_filters,
        )

You're correct for the most part. The in_channels parameter of nn.Conv2d corresponds to the number of output channels of the previous layer. If the Conv2d is the first layer, in_channels is the number of channels in your image: 1 for a grayscale image and 3 for an RGB image.
But I'm not sure how you could concatenate the two BasicConv2d outputs.
Fixing batch_size as 1, assume that the image size is 256x256 and out_channels of the 1x1 conv is 64. That layer would output a tensor of shape torch.Size([1, 64, 256, 256]). Assuming out_channels of the 3x3 conv is 32, that layer would output a tensor of shape torch.Size([1, 32, 254, 254]). We cannot concatenate these two tensors without some trick, such as using padding=1 for the 3x3 conv alone; this would produce an output of shape torch.Size([1, 32, 256, 256]), and therefore we would be able to concatenate.
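To make those shapes concrete, here is a quick sketch (the 256x256 input and the 64/32 output channels are just the numbers from the example above):

import torch
import torch.nn as nn

x = torch.randn(1, 3, 256, 256)  # (B, C, H, W) dummy RGB image

conv1x1 = nn.Conv2d(3, 64, kernel_size=1, stride=1)
conv3x3 = nn.Conv2d(3, 32, kernel_size=3, stride=1)                 # no padding
conv3x3_pad = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)  # padding=1 keeps H x W

print(conv1x1(x).shape)      # torch.Size([1, 64, 256, 256])
print(conv3x3(x).shape)      # torch.Size([1, 32, 254, 254]) -> cannot be concatenated with the 1x1 output
print(torch.cat([conv1x1(x), conv3x3_pad(x)], dim=1).shape)  # torch.Size([1, 96, 256, 256])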

Your implementation of BasicConv2d is fine; here is the code for the Inception module.
class Inception(nn.Module):
    def __init__(
        self,
        in_channels: int,
        num_1x1_filters: int,
        num_3x3_filters: int,
    ) -> None:
        super().__init__()
        self.conv1 = BasicConv2d(in_channels, num_1x1_filters, 1, 1)
        self.conv3 = BasicConv2d(in_channels, num_3x3_filters, 3, 1)

    def forward(self, x):
        conv1_out = self.conv1(x)
        conv3_out = self.conv3(x)
        # concatenate along the channel dimension
        x = torch.cat([conv1_out, conv3_out], dim=1)
        return x
You need to define the two basic conv layers and apply them to the same input separately in the forward pass.
As @planet_pluto pointed out, you can't concatenate two feature maps that have different spatial sizes. You can choose a stride and padding that make the two feature maps the same size, or alternatively upsample or downsample one of them before concatenating.
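For completeness, here is a minimal sketch (the class name InceptionSame and the padding values are my own choices for illustration) of the same block with padding picked so that both branches keep the input resolution and the channel concatenation works:

import torch
import torch.nn as nn

class InceptionSame(nn.Module):
    def __init__(self, in_channels, num_1x1_filters, num_3x3_filters):
        super().__init__()
        # the 1x1 conv preserves H x W on its own; the 3x3 conv needs padding=1
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, num_1x1_filters, kernel_size=1, stride=1),
            nn.BatchNorm2d(num_1x1_filters, eps=0.001),
            nn.ReLU(),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels, num_3x3_filters, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(num_3x3_filters, eps=0.001),
            nn.ReLU(),
        )

    def forward(self, x):
        # both branches output (B, C_i, H, W), so concatenating on the channel dim is safe
        return torch.cat([self.conv1(x), self.conv3(x)], dim=1)

x = torch.randn(1, 3, 256, 256)
print(InceptionSame(3, 64, 32)(x).shape)  # torch.Size([1, 96, 256, 256])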

Related

Shape inference for PyTorch layers

I'd like to be able to write things in PyTorch like "add another Conv2D on top of the last Conv2D, where the output has 128 channels and the input has the right number of channels to match the previous layer." I end up writing code like this:
import torch
from torch import nn

CONV_CHANNELS = [3, 64, 128, 256, 512, 512]
CONV_SIZE = 3
POOL_SIZE = 2

class CNN(torch.nn.Module):
    def __init__(self):
        super().__init__()

        def make_conv_pool(in_channels, out_channels):
            return nn.Sequential(
                nn.Conv2d(in_channels=in_channels,
                          out_channels=out_channels,
                          kernel_size=CONV_SIZE),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=POOL_SIZE, stride=2))

        self.net = nn.Sequential(*[
            make_conv_pool(in_channels, out_channels)
            for in_channels, out_channels
            in zip(CONV_CHANNELS, CONV_CHANNELS[1:])
        ])
Now I want to add a fully connected layer at the end, and I want the input to be the result of "flattening" the output of the last conv layer across the height, width, and channel dimensions. I end up with some awkward calculation like:
def calculate_fc_input_features(side_length):
    # one unpadded conv followed by one 2x2 max-pool per block
    for _ in range(len(CONV_CHANNELS) - 1):
        side_length -= (CONV_SIZE - 1)
        side_length = side_length // POOL_SIZE
    # flatten height, width, and channels of the last conv output
    return side_length * side_length * CONV_CHANNELS[-1]
so that I can write:
CONV_CHANNELS = [3, 64, 128, 256, 512, 512]
CONV_SIZE = 3
POOL_SIZE = 2
IMAGE_SIDE_LENGTH = 256
NUM_CLASSES = 3

class FlatFC(torch.nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.fc = nn.Linear(in_features, out_features)

    def forward(self, x):
        return self.fc(x.view(x.shape[0], -1))

class CNN(torch.nn.Module):
    def __init__(self):
        super().__init__()

        def make_conv_pool(in_channels, out_channels):
            return nn.Sequential(
                nn.Conv2d(in_channels=in_channels,
                          out_channels=out_channels,
                          kernel_size=CONV_SIZE),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=POOL_SIZE, stride=2))

        convs = nn.Sequential(*[
            make_conv_pool(in_channels, out_channels)
            for in_channels, out_channels
            in zip(CONV_CHANNELS, CONV_CHANNELS[1:])
        ])
        fc = FlatFC(in_features=calculate_fc_input_features(IMAGE_SIDE_LENGTH),
                    out_features=NUM_CLASSES)
        self.net = nn.Sequential(convs, fc)
It feels like I'm writing a lot of unnecessary boilerplate here: for each of the conv layers, the number of input channels is completely determined by the output channels of the previous layer; for the fully connected layer, I have to do a calculation that assumes things about the shape of the network rather than asking the layers themselves something like "if your initial input has shape [B, W, H, C], what is your output shape?"
Is there a better way to do this? Does PyTorch provide a more concise way to say "put another Conv2D on top of this network, and figure out the number of input channels yourself, because there is only one value that works"? If not, are there libraries that fill this role? This feels like a common task, so I'm surprised by how verbose my implementation needs to be.
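For what it's worth, PyTorch's lazy modules (nn.LazyConv2d, nn.LazyLinear) infer in_channels / in_features from the first forward pass, which removes both the channel bookkeeping and the manual feature-count calculation. A minimal sketch reusing the constants above (the single dummy forward is what materializes the inferred shapes):

import torch
from torch import nn

OUT_CHANNELS = [64, 128, 256, 512, 512]  # output channels only; inputs are inferred
CONV_SIZE = 3
POOL_SIZE = 2
NUM_CLASSES = 3

layers = []
for out_channels in OUT_CHANNELS:
    layers += [
        nn.LazyConv2d(out_channels, kernel_size=CONV_SIZE),  # in_channels inferred
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=POOL_SIZE, stride=2),
    ]
layers += [nn.Flatten(), nn.LazyLinear(NUM_CLASSES)]  # in_features inferred

net = nn.Sequential(*layers)
out = net(torch.randn(1, 3, 256, 256))  # dummy forward pass initializes all lazy shapes
print(out.shape)  # torch.Size([1, 3])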

conv2d getting bad input

I have trained a classifier and am now trying to load it and run some predictions.
I am getting the error shown below:
....
return self._conv_forward(input, self.weight, self.bias)
File "/usr/local/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 439, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
TypeError: conv2d() received an invalid combination of arguments - got (list, Parameter, Parameter, tuple, tuple, tuple, int), but expected one of:
* (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, tuple of ints padding, tuple of ints dilation, int groups)
didn't match because some of the arguments have invalid types: (list, Parameter, Parameter, tuple, tuple, tuple, int)
* (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, str padding, tuple of ints dilation, int groups)
didn't match because some of the arguments have invalid types: (list, Parameter, Parameter, tuple, tuple, tuple, int)
Here is the code
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from PIL import Image
from torchvision.transforms import transforms
from torch.utils.data import DataLoader

# Transformer - used to encode images
transformer = transforms.Compose([
    transforms.RandomHorizontalFlip(0.5),
    transforms.ToTensor(),
])

# Getting a file and converting it to a Tensor
def get_file_as_tensor(file_path):
    with np.load(file_path) as f:
        melspec_image_array = f['arr_0']
        image = Image.fromarray(melspec_image_array, mode='RGB')
        image_tensor = transformer(image).div_(255).float()
        return image_tensor.clone().detach()

# Prediction function that is on top of the stack because the error occurs when I run model([tensor])
def predict(tensor, model):
    yhat = model([tensor])
    yhat = yhat.clone().detach()
    return yhat

class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 3, 1, 1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(out_channels, out_channels, 3, 1, 1),
            nn.ReLU(),
            nn.Dropout(0.5)
        )
        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = F.avg_pool2d(x, 2)
        return x

class Classifier(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.conv = nn.Sequential(
            ConvBlock(in_channels=3, out_channels=64),
            ConvBlock(in_channels=64, out_channels=128),
            ConvBlock(in_channels=128, out_channels=256),
            ConvBlock(in_channels=256, out_channels=512),
        )
        self.fc = nn.Sequential(
            nn.Dropout(0.4),
            nn.Linear(512, 128),
            nn.PReLU(),
            # nn.BatchNorm1d(128),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        x = self.conv(x)
        x = torch.mean(x, dim=3)
        x, _ = torch.max(x, dim=2)
        x = self.fc(x)
        return x

PATH = "models/model.pt"

model = Classifier()
model.load_state_dict(torch.load(PATH))
model.eval()

car_file_path = "processed_np/car_file.npz"
car_tensor = get_file_as_tensor(car_file_path)

no_car_file_path = "raw_negative_processed/nocar-1041.npz"
no_car_tensor = get_file_as_tensor(no_car_file_path)

car_prediction = predict(car_tensor, model)
no_car_prediction = predict(no_car_tensor, model)
print("car", car_prediction)
print("no car", no_car_prediction)
The code is self-explanatory, but SO keeps asking for more text.
I would really appreciate some help as I am new to ML.
def predict(tensor, model):
    yhat = model(tensor.unsqueeze(0))
    yhat = yhat.clone().detach()
    return yhat
You should use this method definition instead of yours.
Why are you applying your model to [tensor], that is, to a Python list containing a single tensor?
You should apply your model to tensor directly: model(tensor).
You might need to add a singleton "batch dimension" to tensor. See this answer for more details.
The error is about the conv2d() function, not the module.
The only thing I can think of here is that your input data is in the wrong form. Make sure it is a tensor of shape (B, C, H, W).
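As a quick sketch of that check (this just restates the fix from the first answer with an explicit dimension guard; predict and model are the names used in the question):

import torch

def predict(tensor, model):
    # a single image comes out of the transformer as (C, H, W);
    # conv2d expects a batched tensor of shape (B, C, H, W)
    if tensor.dim() == 3:
        tensor = tensor.unsqueeze(0)  # -> (1, C, H, W)
    with torch.no_grad():             # inference only, no gradient tracking
        yhat = model(tensor)
    return yhat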

Issues with the output size of a Many-to-Many CNN-LSTM in PyTorch

I am trying to build a binary temporal image classifier by combining ResNet18 and an LSTM. However, I have never really used RNNs before and have been struggling to get the correct output shape.
I am using a batch size of 128 and a sequence size of 32. The images are 80x80 grayscale images.
The current model is:
class CNNLSTM(nn.Module):
    def __init__(self):
        super(CNNLSTM, self).__init__()
        self.resnet = models.resnet18(pretrained=False)
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3)
        self.resnet.fc = nn.Sequential(nn.Linear(in_features=512, out_features=256, bias=True))
        self.lstm = nn.LSTM(input_size=256, hidden_size=256, num_layers=3)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x_3d):
        # x_3d: torch.Size([128, 32, 1, 80, 80])
        hidden = None
        toret = []
        for t in range(x_3d.size(1)):
            x = self.resnet(x_3d[:, t, :, :, :])
            out, hidden = self.lstm(x.unsqueeze(0), hidden)
            x = self.fc1(out[-1, :, :])
            x = F.relu(x)
            x = self.fc2(x)
            print("x shape: ", x.shape)
            toret.append(x)
        return torch.stack(toret)
This returns a tensor of shape torch.Size([32, 128, 1]), which, as I understand it, means that the nth row represents the nth time step of each element in the batch.
How can I get an output of shape 128x1x32 instead?
And is there a better way to do this?
You could permute the dimensions:
a = torch.rand(32, 128, 1)
a = a.permute(1, 2, 0) # these are the indices of the original dimensions
print(a.shape)
>> torch.Size([128, 1, 32])
But you could also set batch_first=True in the LSTM module:
self.lstm = nn.LSTM(input_size=256, hidden_size=256, num_layers=3, batch_first=True)
With batch_first=True, the LSTM expects its input to have the shape batch-size x seq-len x features and returns its output in the same layout.
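To make the reshaping concrete, here is a small sketch with dummy per-step outputs, using the batch size and sequence length from the question (128 and 32); stacking along dim=2 produces the 128x1x32 layout directly:

import torch

# each step of the loop produces a tensor of shape (batch, 1)
steps = [torch.randn(128, 1) for _ in range(32)]

out = torch.stack(steps)           # (seq, batch, 1) -> torch.Size([32, 128, 1])
print(out.permute(1, 2, 0).shape)  # (batch, 1, seq) -> torch.Size([128, 1, 32])

out = torch.stack(steps, dim=2)    # stacks along a new trailing dimension
print(out.shape)                   # torch.Size([128, 1, 32])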

What is the best way to represent spatio-temporal image data in a vanilla LSTM

I'm working on video frame segmentation prediction, and I want to start with a vanilla LSTM as a baseline, even though I know it won't get good results.
In my current approach, the original input frame is in the first channel and the segmentation frame is in the second channel (see the linked data representation); I then flatten the input into a 1-D array. How should I represent my spatio-temporal data so that it works with a vanilla LSTM?
Here is the snippet of the vanilla LSTM that I'm using in PyTorch:
class ImageLSTM(nn.Module):
    def __init__(self, n_inputs: int = 49,
                 n_outputs: int = 4096,
                 n_hidden: int = 256,
                 n_layers: int = 1,
                 bidirectional: bool = False):
        """
        Takes a 1D flattened image.
        """
        super(ImageLSTM, self).__init__()
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_outputs = n_outputs
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(input_size=self.n_inputs,
                            hidden_size=self.n_hidden,
                            batch_first=False,
                            num_layers=self.n_layers,
                            bidirectional=self.bidirectional)
        if self.bidirectional:
            self.FC = nn.Sequential(
                nn.Linear(self.n_hidden * 2, self.n_outputs),
                nn.Dropout(p=0.5),
                nn.Sigmoid()
            )
        else:
            self.FC = nn.Sequential(
                nn.Linear(self.n_hidden, self.n_outputs),
                nn.Dropout(p=0.5),
                nn.Sigmoid()
            )

    def init_hidden(self, x, device=None):  # input 4D tensor: (batch size, channels, width, height)
        # initialize the hidden and cell state to zero
        # vectors: (number of layers, sequence length, number of hidden nodes)
        if self.bidirectional:
            h0 = torch.zeros(2 * self.n_layers, 1, self.n_hidden)
            c0 = torch.zeros(2 * self.n_layers, 1, self.n_hidden)
        else:
            h0 = torch.zeros(self.n_layers, 1, self.n_hidden)
            c0 = torch.zeros(self.n_layers, 1, self.n_hidden)
        if device is not None:
            h0 = h0.to(device)
            c0 = c0.to(device)
        self.hidden = (h0, c0)

    def forward(self, X):  # X: tensor of shape (batch_size, channels, width, height)
        # forward propagate LSTM
        lstm_out, self.hidden = self.lstm(X, self.hidden)  # lstm_out: tensor of shape (seq_length, batch_size, hidden_size)
        out = self.FC(lstm_out[:, -1, :])
        return out
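A minimal usage sketch under this module's own assumptions (batch size 1 and batch_first=False, so the input is shaped (seq_len, 1, n_inputs); the 10-frame clip and the 7x7 frame size are placeholders chosen only to match the default n_inputs=49):

import torch

model = ImageLSTM(n_inputs=49, n_outputs=4096)

frames = torch.randn(10, 7, 7)  # a clip of 10 frames, each 7x7
seq = frames.view(10, 1, 49)    # (seq_len, batch=1, features) for batch_first=False

model.init_hidden(seq)          # sets self.hidden for a batch of 1
out = model(seq)
print(out.shape)                # torch.Size([10, 4096]) with the module as written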

Why return self.head(x.view(x.size(0), -1)) in the nn.Module for pyTorch reinforcement learning example

I understand that the pole-balancing example requires 2 outputs (see the Reinforcement Learning (DQN) Tutorial).
Here is the output for self.head
print ('x',self.head)
x = Linear(in_features=512, out_features=2, bias=True)
When I run the training epochs, these are the outputs:
print (self.head(x.view(x.size(0), -1)))
return self.head(x.view(x.size(0), -1))
tensor([[-0.6945, -0.1930]])
tensor([[-0.0195, -0.1452]])
tensor([[-0.0906, -0.1816]])
tensor([[ 0.0631, -0.9051]])
tensor([[-0.0982, -0.5109]])
...
The size of x is:
x = torch.Size([121, 32, 2, 8])
So I am trying to understand what x.view(x.size(0), -1) is doing.
I understand from the comment in the code that it returns:
Returns tensor([[left0exp,right0exp]...]).
But how does x, which is torch.Size([121, 32, 2, 8]), get reduced to a tensor of size 2?
Is there an alternative way of writing this that makes more sense? What if I had 4 outputs, how would I represent that? Why x.size(0)? Why -1?
So it appears to take self.head with 4 outputs down to 2 outputs. Is that correct?
At the bottom is the class I am referring to:
class DQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input image size, so compute it.
        def conv2d_size_out(size, kernel_size=5, stride=2):
            return (size - (kernel_size - 1) - 1) // stride + 1
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, outputs)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))
x.view(x.size(0), -1) flattens the tensor; this is needed because the Linear layer only accepts a vector (a 1d array per sample). To break it down, x.view() reshapes the tensor to the specified shape (more info). x.size(0) returns the 1st dimension of the tensor (which is the batch size; this should remain constant). The -1 in x.view() is a placeholder for the dimension we don't know, so PyTorch calculates it automatically. For example, if x = torch.tensor([1,2,3,4]), to reshape the tensor to a 2x2 you could do x.view(2,2), x.view(2,-1), or x.view(-1,2).
The output shape is not a tensor of size 2 but of size 121x2 (the 121 is the batch size, and the 2 comes from the Linear layer's output). So to change the output size from 2 to 4, you would have to change the outputs argument in the __init__ function to 4.
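A small sketch of the flattening with the shape from the question; note that 32 * 2 * 8 = 512, which is exactly the in_features of the Linear(in_features=512, out_features=2) head printed above:

import torch
from torch import nn

x = torch.randn(121, 32, 2, 8)  # shape from the question
flat = x.view(x.size(0), -1)    # keep the batch dimension, flatten everything else
print(flat.shape)               # torch.Size([121, 512])  (32 * 2 * 8 = 512)

head = nn.Linear(512, 2)        # matches the printed self.head
print(head(flat).shape)         # torch.Size([121, 2]) -> one [left, right] pair per batch element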
