I have an image input of 340x340 pixels and I want to classify it into 2 classes.
I want to create a convolutional neural network (PyTorch framework), but I have a problem with the input and output sizes of the layers.
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # 3 channels (RGB), kernel=5, but I don't understand why 6
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        # why 16?
        self.conv2 = nn.Conv2d(6, 16, 5)
        # why 107584 = 328*328?
        self.fc1 = nn.Linear(107584, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 2)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # I don't understand this line
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
Is this network correct?
# 3 channels (RGB), kernel=5, but I don't understand why 6
The second parameter of Conv2d is out_channels. In a convolutional layer the number of output channels is a hyperparameter you can set arbitrarily; it is 6 here simply because the author chose 6.
# why 16?
Same as above.
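For instance, here is a quick sketch (using the 340x340 input from your question) showing that any value of out_channels gives a valid layer; the choice only changes how many filters are learned:

import torch
import torch.nn as nn

x = torch.randn(1, 3, 340, 340)            # a dummy RGB input
for out_channels in (6, 16, 100):          # arbitrary choices
    conv = nn.Conv2d(3, out_channels, 5)
    print(out_channels, conv(x).shape)     # -> (1, out_channels, 336, 336)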
# why 107584 = 328*328?
and
# I don't understand this line
Tensor.view() returns a new tensor with the same data as the original tensor but with a different shape.
x = x.view(x.size(0), -1): the -1 means "infer this dimension from the others", so this flattens each sample into a single vector while keeping the batch dimension. Trace the shapes: 340 → conv1 (kernel 5) → 336 → pool → 168 → conv2 (kernel 5) → 164 → pool → 82. After the second pooling the tensor is [batch, 16, 82, 82], so the flattened size is 16 * 82 * 82 = 107584 (which happens to equal 328 * 328).
So self.fc1 = nn.Linear(107584, 120) is in fact the correct value for 340x340 inputs.
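If you don't want to do this arithmetic by hand, a common trick is to push a dummy input through the convolutional part once and read off the flattened size (a sketch, not part of your original code):

import torch
import torch.nn as nn
import torch.nn.functional as F

conv1 = nn.Conv2d(3, 6, 5)
pool = nn.MaxPool2d(2, 2)
conv2 = nn.Conv2d(6, 16, 5)

x = torch.zeros(1, 3, 340, 340)    # dummy 340x340 RGB image
x = pool(F.relu(conv1(x)))         # -> (1, 6, 168, 168)
x = pool(F.relu(conv2(x)))         # -> (1, 16, 82, 82)
print(x.view(1, -1).size(1))       # 107584, the in_features for fc1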
I am trying to run the following program for image classification in PyTorch, and it returns:
Given groups=1, weight of size [6, 3, 5, 5], expected input[192, 1, 256, 256] to have 3 channels, but got 1 channels instead
num_classes = 2

class CNN(nn.Module):
    def __init__(self, input_size, n_features, output_size):
        super(CNN, self).__init__()
        self.n_features = n_features
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=n_features, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=n_features, out_channels=3*n_features, kernel_size=5)
        self.conv3 = nn.Conv2d(in_channels=3*n_features, out_channels=n_features, kernel_size=5)
        self.conv1_bn = nn.BatchNorm2d(n_features)
        self.conv2_bn = nn.BatchNorm2d(3*n_features)
        self.fc1 = nn.Linear(n_features*4*4, 2)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.conv1_bn(x))
        x = F.max_pool2d(x, kernel_size=2)
        x = self.conv2(x)
        x = F.relu(self.conv1_bn(x))
        x = F.max_pool2d(x, kernel_size=2)
        x = x.view(-1, self.n_features*4*4)
        x = self.fc1(x)
        x = F.log_softmax(x, dim=1)
        return x
To get rid of the problem you should change the number of in_channels in self.conv1 from 3 to 1, because the error message shows you are training on images of size (1, 256, 256), i.e. single-channel. Also consider setting the stride of the convolutional layers to something larger than the default 1, or increasing the number of layers.
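A minimal sketch of that fix, assuming your loader really yields single-channel (1, 256, 256) images; n_features = 6 is a hypothetical value for illustration:

import torch
import torch.nn as nn
import torch.nn.functional as F

n_features = 6                     # hypothetical value
conv1 = nn.Conv2d(in_channels=1, out_channels=n_features, kernel_size=5)  # was in_channels=3
conv1_bn = nn.BatchNorm2d(n_features)

x = torch.randn(4, 1, 256, 256)    # matches the shape in the error message
x = F.max_pool2d(F.relu(conv1_bn(conv1(x))), kernel_size=2)
print(x.shape)                     # torch.Size([4, 6, 126, 126])

Unrelated to the error you quoted, note that your forward applies conv1_bn after conv2 as well; since conv2 outputs 3*n_features channels, that second normalization presumably needs to be conv2_bn for the channel counts to match.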
I am trying to build a binary temporal image classifier by combining ResNet18 and an LSTM. However, I have never really used RNNs before and have been struggling to get the correct output shape.
I am using a batch size of 128 and a sequence size of 32. The images are 80x80 grayscale images.
The current model is:
class CNNLSTM(nn.Module):
    def __init__(self):
        super(CNNLSTM, self).__init__()
        self.resnet = models.resnet18(pretrained=False)
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3)
        self.resnet.fc = nn.Sequential(nn.Linear(in_features=512, out_features=256, bias=True))
        self.lstm = nn.LSTM(input_size=256, hidden_size=256, num_layers=3)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x_3d):
        # x_3d: torch.Size([128, 32, 1, 80, 80])
        hidden = None
        toret = []
        for t in range(x_3d.size(1)):
            x = self.resnet(x_3d[:, t, :, :, :])
            out, hidden = self.lstm(x.unsqueeze(0), hidden)
            x = self.fc1(out[-1, :, :])
            x = F.relu(x)
            x = self.fc2(x)
            print("x shape: ", x.shape)
            toret.append(x)
        return torch.stack(toret)
This returns a tensor of shape torch.Size([32, 128, 1]), which, as I understand it, means the first dimension indexes the time steps and the second indexes the elements of the batch.
How can I get output of shape 128x1x32 instead?
And is there a better way to do this?
You could permute the dimensions:
a = torch.rand(32, 128, 1)
a = a.permute(1, 2, 0) # these are the indices of the original dimensions
print(a.shape)
>> torch.Size([128, 1, 32])
But you could also set batch_first=True in the LSTM module:
self.lstm = nn.LSTM(input_size=256, hidden_size=256, num_layers=3, batch_first=True)
This will expect that the input to the LSTM has the shape batch-size x seq-len x features and will output a tensor in the same way.
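A quick sketch of that shape contract, using the sizes from your question (dummy data):

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=256, hidden_size=256, num_layers=3, batch_first=True)
x = torch.randn(128, 32, 256)   # batch-size x seq-len x features
out, (h, c) = lstm(x)
print(out.shape)                # torch.Size([128, 32, 256])
print(h.shape)                  # torch.Size([3, 128, 256]); hidden states keep (layers, batch, features)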
I understand that the pole-balancing example requires 2 outputs (see the Reinforcement Learning (DQN) Tutorial).
Here is the output for self.head
print ('x',self.head)
x = Linear(in_features=512, out_features=2, bias=True)
When I run the epochs, below are the outputs:
print (self.head(x.view(x.size(0), -1)))
return self.head(x.view(x.size(0), -1))
tensor([[-0.6945, -0.1930]])
tensor([[-0.0195, -0.1452]])
tensor([[-0.0906, -0.1816]])
tensor([[ 0.0631, -0.9051]])
tensor([[-0.0982, -0.5109]])
...
The size of x is:
torch.Size([121, 32, 2, 8])
So I am trying to understand what x.view(x.size(0), -1) is doing.
I understand from the comment in the code that it's returning:
Returns tensor([[left0exp,right0exp]...]).
But how does x, which is torch.Size([121, 32, 2, 8]), get reduced to a tensor with only 2 values per row?
Is there an alternative way of writing this that makes more sense? What if I had 4 outputs, how would I represent that? Why x.size(0)? Why -1?
So it appears that the 4-dimensional x is flattened and then self.head maps it to 2 outputs. Is that correct?
At the bottom is the class I am referring to:
class DQN(nn.Module):
    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input image size, so compute it.
        def conv2d_size_out(size, kernel_size=5, stride=2):
            return (size - (kernel_size - 1) - 1) // stride + 1

        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, outputs)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))
x.view(x.size(0), -1) flattens the tensor. This is because a Linear layer expects a 2-D input of shape (batch_size, features). To break it down: x.view() reshapes the tensor to the specified shape (more info). x.size(0) returns the 1st dimension of the tensor (the batch size, which should remain constant), and the -1 in x.view() is a placeholder for the dimension we don't know, so PyTorch calculates it automatically. For example, if x = torch.tensor([1, 2, 3, 4]), to reshape the tensor to 2x2 you could do x.view(2, 2), x.view(2, -1), or x.view(-1, 2).
The output shape is therefore not 2 but (121, 2): the 121 is the batch size, and the 2 comes from the Linear layer's output. Here x.view(x.size(0), -1) turns the [121, 32, 2, 8] tensor into [121, 32*2*8] = [121, 512], and self.head maps those 512 features to 2 outputs. To change the output size from 2 to 4, you would change the outputs argument in the __init__ function to 4.
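Concretely, for the shape in your question (a sketch with random data):

import torch

x = torch.rand(121, 32, 2, 8)
flat = x.view(x.size(0), -1)    # keep the batch dimension, flatten the rest
print(flat.shape)               # torch.Size([121, 512]), since 32 * 2 * 8 = 512

self.head is then a Linear(512, outputs) layer, so the final result has shape (121, outputs).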
On the PyTorch website, they have the following model in their tutorial:
class BasicCNN(nn.Module):
    def __init__(self):
        super(BasicCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = x.permute(0, 3, 1, 2)
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
How many kernels/filters does this model have? Is it two, e.g. conv1 and conv2? How do I easily create many filters by specifying the number of them, for example 100 filters?
Thanks!
Your question is a little ambiguous, but let me try to answer it.
Usually, in a convolutional layer, we set the number of filters as the number of out_channels. But this is not entirely straightforward, so let's discuss it based on the example you provided.
What are the convolutional layer parameters?
model = BasicCNN()
for name, params in model.named_parameters():
    if 'conv' in name:
        print(name, params.size())
Output:
conv1.weight torch.Size([6, 3, 5, 5])
conv1.bias torch.Size([6])
conv2.weight torch.Size([16, 6, 5, 5])
conv2.bias torch.Size([16])
Explanation
Let's consider the conv1 layer in the above model. We can say there are 6 filters of shape 5 x 5, because we have chosen a 2d convolution. Since the number of input channels is 3, each filter spans all 3 channels, so there are in total 6 x 3 = 18 kernels of size 5 x 5 (matching the conv1.weight shape of [6, 3, 5, 5]).
Here, the inputs of this model are 3d, like images: we have images of shape W x H with 3 channels (RGB), so we can feed such 3d tensors to this model.
Now, coming back to your question, "How do I easily create many filters by specifying the number of them? For example 100 filters." If you simply want 100 filters, just set out_channels to 100 in conv1 instead of 6. This is typically what people do in computer vision!
But you can definitely modify the architecture as per your need and identify the best setting.
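For example, a sketch with 100 filters in the first layer; remember that the next layer's in_channels must then match:

import torch
import torch.nn as nn

conv1 = nn.Conv2d(3, 100, 5)    # 100 filters, each spanning all 3 input channels
conv2 = nn.Conv2d(100, 16, 5)   # in_channels must now be 100

print(conv1.weight.size())      # torch.Size([100, 3, 5, 5])
print(sum(p.numel() for p in conv1.parameters()))   # 100*3*5*5 + 100 = 7600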
Let me explain the objective first. Let's say I have 1000 images, each with an associated quality score in the range 0-10. Now, I am trying to perform image quality assessment using a CNN with regression (in PyTorch). I have divided the images into equal-size patches. Now, I have created a CNN network in order to perform linear regression.
Following is the code:
class MultiLabelNN(nn.Module):
    def __init__(self):
        super(MultiLabelNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 5)
        self.fc1 = nn.Linear(3200, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 1)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.pool(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = x.view(-1, 3200)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x
While running this network I am getting the following error:
input and target shapes do not match: input [400 x 1], target [200 x 1]
The target shape is [200 x 1] because I have taken a batch size of 200. I found that if I change "self.fc1 = nn.Linear(3200, 1024)" and "x = x.view(-1, 3200)" from 3200 to 6400, my code runs without any error.
Similarly, it will throw the error input and target shapes do not match: input [100 x 1], target [200 x 1] if I put 12800 instead of 6400.
Now my doubt is that I am not able to understand the reason behind this. If I am giving 200 images as input to my network, why does the input shape change when I alter the parameters going from the convolutional layer to the fully connected layer? I hope I have clearly stated my doubt; if anybody has any question, please ask me. It will be a great help. Thanks in advance.
import torch.nn as nn
import torch.nn.functional as F

class MultiLabelNN(nn.Module):
    def __init__(self):
        super(MultiLabelNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 5)
        self.fc1 = nn.Linear(6400, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 1)

    def forward(self, x):
        # shape of x is (b_s, 1, 32, 32)
        x = self.conv1(x)   # shape of x is now (b_s, 32, 28, 28)
        x = F.relu(x)
        x = self.pool(x)    # shape of x is now (b_s, 32, 14, 14)
        x = self.conv2(x)   # shape of x is now (b_s, 64, 10, 10)
        x = F.relu(x)       # shape is unchanged: (b_s, 64, 10, 10)
        # Each sample now has 64 * 10 * 10 = 6400 features. With
        # x.view(-1, 3200) the batch dimension was silently doubled to
        # (b_s * 2, 3200) -- that is where the [400 x 1] vs [200 x 1]
        # mismatch came from. Flatten to 6400 features instead, and make
        # fc1 = nn.Linear(6400, 1024) to match.
        x = x.view(-1, 6400)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x
I think this should work. Let me know if any errors remain.
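As a quick sanity check with dummy data (assuming 32x32 grayscale patches, as in the shape comments above):

import torch

model = MultiLabelNN()
patches = torch.randn(200, 1, 32, 32)   # a dummy batch of 200 patches
out = model(patches)
print(out.shape)                        # torch.Size([200, 1]), matching the [200 x 1] target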