I am working on sentiment analysis, I want to classify the output into 4 classes. For loss I am using cross-entropy.
The problem is that PyTorch's cross-entropy loss needs an input of shape (batch_size, num_classes), which I am having trouble producing.
I am taking a batch size of 12 and sequence size is 32
import torch.nn as nn
class RNN(nn.Module):
    """Embedding -> LSTM -> two linear layers, scoring every time step.

    Args:
        hidden_dim: Size of the LSTM hidden state.
        input_size: Sequence length; stored on the instance but not used
            by the layers themselves.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output classes.
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
    """

    def __init__(self, hidden_dim=256, input_size=32, num_layers=1, num_classes=4, vocab_size=len(vocab_to_int) + 1, embedding_dim=100):
        super().__init__()
        self.input_size = input_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True lets the LSTM consume the (batch, seq, emb) tensor
        # produced by the embedding directly. Reinterpreting it with
        # .view(seq, batch, emb) does NOT transpose the data; it mixes tokens
        # of different sequences together.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 50)
        self.fc2 = nn.Linear(50, num_classes)

    def forward(self, x, hidden):
        """Score every time step of every sequence.

        Args:
            x: Token ids; assumed to be shaped (batch, seq) -- TODO confirm
                against the data loader.
            hidden: (h_0, c_0) tuple as returned by ``init_hidden``.

        Returns:
            A tuple ``(scores, hidden)``. ``scores`` has shape
            (batch * seq, num_classes) with rows in batch-major order, i.e.
            the same order as ``labels.view(-1)`` for (batch, seq) labels.
        """
        x = self.embedding(x)                         # (batch, seq, emb)
        x, hidden = self.lstm(x, hidden)              # (batch, seq, hidden)
        x = x.contiguous().view(-1, self.hidden_dim)  # (batch * seq, hidden)
        x = self.fc1(x)
        x = self.fc2(x)
        return x, hidden

    def init_hidden(self, batch_size=12):
        """Return zeroed (h_0, c_0) states for ``batch_size`` sequences.

        The states are created with the dtype and device of the model's
        parameters, so they follow the model instead of always being moved
        to CUDA.
        """
        weight = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))
According to the CrossEntropyLoss docs:
input has to be a Tensor of size (C) for unbatched input, (minibatch,C) [for batched input] [...]
The code you provided is only the RNN class and not the data processing and the actual call to CrossEntropyLoss, but the error you stated in the comments makes me think that you didn't reshape the labels tensor to have the same size as the output from the neural network. Therefore, you'd be calculating the loss of a tensor with size (384, 4) against another tensor which I infer is of size (12, 32). Your labels tensor should be of size (384) to match the first dimension of the neural network output.
Also, you don't have to manually reshape your tensors, you can reshape them after the forward() call through the torch.nn.utils.rnn.pack_padded_sequence() function. If you do apply this function to both the output of the neural network and the labels, you will have a tensor of size (384, 4) that PyTorch can handle in the call to CrossEntropyLoss. See the note in the pack_padded_sequence() function docs for more details.
I have a simple convolution network:
import torch.nn as nn
class model(nn.Module):
    """One convolution followed by two fully connected layers.

    Args:
        ks: Side length of the square convolution kernel.
    """

    def __init__(self, ks=1):
        super(model, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=4, out_channels=32, kernel_size=ks, stride=1)
        # An unpadded stride-1 convolution shrinks each spatial side by
        # ks - 1. The input is assumed to be 8x8 (implied by the original
        # 8*8*32 feature count for ks=1) -- TODO confirm.
        side = 8 - ks + 1
        self.fc1 = nn.Linear(side * side * 32, 64)
        self.fc2 = nn.Linear(64, 64)

    def forward(self, x):
        """Map a (batch, 4, 8, 8) input to (batch, 64) outputs."""
        # nn.functional is reachable through the `nn` import, so the block
        # does not depend on a separate `F` alias being in scope.
        x = nn.functional.relu(self.conv1(x))
        x = x.view(x.size(0), -1)
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x


cnn = model(1)
Since the kernel size is 1 and the number of output channels is 32, I assumed that there should be 32*1*1 weights in this layer. But when I ask PyTorch for the shape of the weight tensor with cnn.conv1.weight.shape, it returns torch.Size([32, 4, 1, 1]). Why should the number of input channels matter for the weights of a Conv2d layer?
Am I missing something?
It matters because a 2D convolution over an image runs across all input channels at once, which means the depth of each filter (kernel) must be equal to in_channels (PyTorch sets this for you), so the true size of one kernel is [in_channels, 1, 1]. On the other hand, out_channels is the number of kernels, so the number of weights = number of kernels * size of one kernel = out_channels * (in_channels * kernel_height * kernel_width), which here is 32 * (4 * 1 * 1) = 128. Here is 2D conv with 3D input
I copied the CIFAR10 sample network from PyTorch tutorial and added more layers, including BN. Even after 45 epochs, the network keeps achieving 68% classification accuracy on the test set.
The network consists of:
2 convolutional layers with 3x3 kernels (input size reduces from 32px to 28px)
one max pooling layer (input size reduces from 28px to 14px)
3 convolutional layers with 3x3 kernels (input size reduces from 14px to 8px)
A fully connected network with 3 layers of 256->256->10 neurons
batch normalization is applied on all layers, including the convolutional layers, except for the last FC layer
Relu is applied on all the convolutional layers and all the hidden FC layers
Did I build/use anything improperly?
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Net(nn.Module):
    """CIFAR-10 style CNN: five 3x3 conv+BN layers and a three-layer head.

    Spatial size goes 32 -> 30 -> 28, is halved to 14 by max pooling, then
    shrinks 14 -> 12 -> 10 -> 8 before the 16 * 8 * 8 features are
    flattened for the fully connected layers.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: two unpadded 3x3 convolutions, then 2x2 max pooling.
        self.conv1_1 = nn.Conv2d(3, 16, 3)
        self.bn1_1 = nn.BatchNorm2d(16)
        self.conv1_2 = nn.Conv2d(16, 16, 3)
        self.bn1_2 = nn.BatchNorm2d(16)
        self.pool = nn.MaxPool2d(2, 2)
        # Stage 2: three unpadded 3x3 convolutions.
        self.conv2_1 = nn.Conv2d(16, 16, 3)
        self.bn2_1 = nn.BatchNorm2d(16)
        self.conv2_2 = nn.Conv2d(16, 16, 3)
        self.bn2_2 = nn.BatchNorm2d(16)
        self.conv2_3 = nn.Conv2d(16, 16, 3)
        self.bn2_3 = nn.BatchNorm2d(16)
        # Classifier head; the last layer has no batch norm or activation.
        self.fc1 = nn.Linear(16 * 8 * 8, 256)
        self.bn4 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 256)
        self.bn5 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 10)

    def forward(self, x):
        """Return raw class scores of shape (batch, 10)."""
        stage_one = ((self.conv1_1, self.bn1_1), (self.conv1_2, self.bn1_2))
        for conv, norm in stage_one:
            x = F.relu(norm(conv(x)))
        x = self.pool(x)

        stage_two = (
            (self.conv2_1, self.bn2_1),
            (self.conv2_2, self.bn2_2),
            (self.conv2_3, self.bn2_3),
        )
        for conv, norm in stage_two:
            x = F.relu(norm(conv(x)))

        x = x.view(-1, 16 * 8 * 8)
        for dense, norm in ((self.fc1, self.bn4), (self.fc2, self.bn5)):
            x = F.relu(norm(dense(x)))
        return self.fc3(x)
# Train the CIFAR-10 network with SGD + momentum and cross-entropy loss.
net = Net()
# Fall back to the CPU when no CUDA device is present instead of crashing
# in net.to(device).
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
net.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Scale each RGB channel from [0, 1] to [-1, 1].
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=8,
                                          shuffle=True, num_workers=2)

# Batch norm must use batch statistics while training.
net.train()
for epoch in range(128):  # loop over the dataset multiple times
    for inputs, labels in trainloader:
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
Note: added the "Python" tag so the code gets highlighted
Note: updated the forward method to apply F.relu on the hidden FC layers
Use sigmoid activation for the last layer.
Assume that I have 77 samples to train my CNN, and my batch size is 10. Then the last batch has a batch size of 7 instead of 10. Somehow when I pass it to the loss function such as nn.MSELoss(), it gives me the error:
RuntimeError: The size of tensor a (10) must match the size of tensor
b (7) at non-singleton dimension 1
So pytorch doesn't support batches with different sizes?
My code in doubt:
import numpy as np
import torch
from torch import nn
import torchvision
import torch.nn.functional as F
import torch.optim as optim
class Net(nn.Module):
    """Two conv + max-pool stages followed by a three-layer dense head.

    ``fc1`` expects 64 input features, which is what the convolutional
    stages produce for a 1x20x20 input (16 channels of 2x2).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, (5, 4))
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(64, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return raw outputs of shape (batch, 10)."""
        # Each convolution is followed by ReLU and the shared 2x2 pooling.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))

        # Flatten everything except the batch dimension.
        _, channels, height, width = x.shape
        flat = x.view(-1, channels * height * width)

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
# Train Net on synthetic data with an MSE loss.
model = Net()
batch_size = 10

# Generating artificial data
x_train = torch.randn((77, 1, 20, 20))
# One target value per sample, kept as a column (77, 1) so it lines up with
# the batch axis of the (batch, 10) network output. A flat (77,) target
# fails on the last batch of 7 and, on full batches of 10, is silently
# broadcast along the wrong axis.
# NOTE(review): if these values are class indices, nn.CrossEntropyLoss with
# integer (long) targets of shape (batch,) is the usual choice -- confirm
# the intent.
y_train = torch.randint(0, 10, size=(77, 1), dtype=torch.float)

trainset = torch.utils.data.TensorDataset(x_train, y_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0)

criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

for epoch in range(20):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader, 0):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        # Expand the (batch, 1) targets explicitly so input and target have
        # the same shape and the loss never relies on implicit broadcasting.
        loss = criterion(outputs, labels.expand_as(outputs))
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 10 == 0:
            print('epoch{}, step{}, loss: {}'.format(epoch + 1, i + 1, running_loss))
            running_loss = 0.0
The problem is not due to the batch size, but to a failure to broadcast properly between the 10 outputs of your CNN and the single label provided in each example.
If you look at the model output and label tensor shapes during the batch where the error is thrown,
print(outputs.shape, labels.shape)
#out: torch.Size([7, 10]) torch.Size([7])
you'll see that the labels are stored in a 1-D tensor of shape (7,), with no trailing dimension. According to PyTorch's broadcasting rules, two tensors are broadcastable only if, comparing their sizes starting from the trailing dimension, each pair of dimensions is either equal or one of them is 1. In this case, the trailing dimension of the model output (10) is incompatible with that of the label (7).
To fix, either add a dummy dimension to the label (assuming you actually want to broadcast the labels to match your ten network outputs), or define a network with scalar outputs. For example:
y_train = torch.randint(0,10,size=(77,1),dtype=torch.float)
results in
print(outputs.shape, labels.shape)
#out: torch.Size([7, 10]) torch.Size([7,1])
# these are broadcastable
Previously I built a network that implemented a binary image segmentation -- foreground & background. I did this by having two classifications. Now instead of a binary classification, I want to do a linear regression of each pixel.
Say there is a 3D surface within the image view, I want to segment the exact middle of that surface with a linear value 10. The edge of the surface will be, let's say, 5. Of course all the voxels in between are within the range 5-10. Then, as the voxels move away from the surface the values quickly go down to zero.
With the binary classification I had an image with 1's in the places of the foreground and an image with 1's in the place of the background -- in other words a classification :) Now I want to have just one ground truth image with values like the following...
Via this linear regression example, I assumed I could simply change the cost function to a least square function -- cost = tf.square(y - pred). And of course I would change the ground truth.
However, when I do this, my predictions output NaN. My last layer is a linear sum of matrix weight values multiplied by the final output. I'm guessing this has something to do with it? I can't make it a tf.nn.softmax() function because that would normalize the values between 0 and 1.
So I believe cost = tf.square(y - pred) is the source of the issue. I tried this next... cost = tf.reduce_sum(tf.square(y - pred)) and that didn't work.
So then I tried this (recommended here) cost = tf.reduce_sum(tf.pow(pred - y, 2))/(2 * batch_size) and that didn't work.
Should I be initializing weights differently? Normalize weights?
Full code looks like this:
import tensorflow as tf
import pdb
import numpy as np
from numpy import genfromtxt
from PIL import Image
from tensorflow.python.ops import rnn, rnn_cell
from tensorflow.contrib.learn.python.learn.datasets.scroll import scroll_data
# Parameters
learning_rate = 0.001
# Training stops once step * batch_size reaches this many samples.
training_iters = 1000000
batch_size = 2
# Loss/accuracy are evaluated and a prediction is saved every display_step steps.
display_step = 1
# Network Parameters
n_input_x = 396 # Input image x-dimension
n_input_y = 396 # Input image y-dimension
# One output value per pixel. NOTE(review): presumably the regression target
# described in the question rather than a class count -- confirm.
n_classes = 1
n_steps = 396 # Number of LSTM time steps the decoded image is split into
n_hidden = 128 # LSTM cell size
n_output = n_input_y * n_classes # Width of each per-step output projection
# Dropout, probability to keep units. Not referenced by the visible code.
dropout = 0.75
# tf Graph input
x = tf.placeholder(tf.float32, [None, n_input_x, n_input_y])
# Ground truth, one flattened image per row.
y = tf.placeholder(tf.float32, [None, n_input_x * n_input_y], name="ground_truth")
keep_prob = tf.placeholder(tf.float32) #dropout (keep probability)
# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1):
    """Apply a SAME-padded 2D convolution, add the bias, then ReLU.

    ``strides`` is used for both spatial dimensions.
    """
    stride_spec = [1, strides, strides, 1]
    convolved = tf.nn.conv2d(x, W, strides=stride_spec, padding='SAME')
    return tf.nn.relu(tf.nn.bias_add(convolved, b))
def maxpool2d(x, k=2):
    """Max-pool ``x`` with a k x k window and stride k, using SAME padding."""
    window = [1, k, k, 1]
    return tf.nn.max_pool(x, ksize=window, strides=window, padding='SAME')
def deconv2d(prev_layer, w, b, output_shape, strides):
    """Apply a VALID-padded transposed convolution, add the bias, then ReLU."""
    upsampled = tf.nn.conv2d_transpose(
        prev_layer, w, output_shape=output_shape, strides=strides, padding="VALID")
    return tf.nn.relu(tf.nn.bias_add(upsampled, b))
# Create model
def net(x, cnn_weights, cnn_biases, dropout):
    """Build the conv/deconv + LSTM graph and return the per-step outputs.

    Args:
        x: Input batch; it is reshaped to [-1, 396, 396, 1] on entry.
        cnn_weights: Dict with the filters 'wc1'..'wc3' and 'wdc1'..'wdc3'.
        cnn_biases: Dict with the biases 'bc1'..'bc3' and 'bdc1'..'bdc3'.
        dropout: Keep probability; currently unused because the dropout
            layer is disabled.

    Returns:
        A list of 396 tensors, one per LSTM step, each the step's LSTM
        output projected with the module-level ``lstm_weights[i]`` and
        ``lstm_biases[i]``.
    """
    # Reshape input picture
    x = tf.reshape(x, shape=[-1, 396, 396, 1])

    # Encoder: three convolution + 2x2 max-pooling stages.
    with tf.name_scope("conv1"):
        conv1 = conv2d(x, cnn_weights['wc1'], cnn_biases['bc1'])
        conv1 = maxpool2d(conv1, k=2)
    with tf.name_scope("conv2"):
        conv2 = conv2d(conv1, cnn_weights['wc2'], cnn_biases['bc2'])
        conv2 = maxpool2d(conv2, k=2)
    with tf.name_scope("conv3"):
        conv3 = conv2d(conv2, cnn_weights['wc3'], cnn_biases['bc3'])
        conv3 = maxpool2d(conv3, k=2)

    # The batch size is only known at run time, so read it from the tensor.
    temp_batch_size = tf.shape(x)[0]

    # Decoder: three stride-2 transposed convolutions back to 396 x 396.
    with tf.name_scope("deconv1"):
        output_shape = [temp_batch_size, 99, 99, 64]
        strides = [1, 2, 2, 1]
        deconv = tf.nn.conv2d_transpose(conv3, cnn_weights['wdc1'], output_shape=output_shape, strides=strides, padding="SAME")
        deconv = tf.nn.bias_add(deconv, cnn_biases['bdc1'])
        conv4 = tf.nn.relu(deconv)
    with tf.name_scope("deconv2"):
        output_shape = [temp_batch_size, 198, 198, 32]
        strides = [1, 2, 2, 1]
        conv5 = deconv2d(conv4, cnn_weights['wdc2'], cnn_biases['bdc2'], output_shape, strides)
    with tf.name_scope("deconv3"):
        output_shape = [temp_batch_size, 396, 396, 1]
        # this time don't use ReLu -- since output layer
        conv6 = tf.nn.conv2d_transpose(conv5, cnn_weights['wdc3'], output_shape=output_shape, strides=[1, 2, 2, 1], padding="VALID")
        # Keep the biased tensor: previously the bias_add result was thrown
        # away because the reshape below read the un-biased conv6.
        conv6 = tf.nn.bias_add(conv6, cnn_biases['bdc3'])

    x = tf.reshape(conv6, [-1, n_input_x, n_input_y])

    # Prepare data shape to match `rnn` function requirements
    # Current data input shape: (batch_size, n_steps, n_input)
    # Permuting batch_size and n_steps
    x = tf.transpose(x, [1, 0, 2])
    # Reshaping to (n_steps*batch_size, n_input)
    x = tf.reshape(x, [-1, n_input_x])
    # Split into a list of 'n_steps' tensors, the input form `rnn` expects
    x = tf.split(0, n_steps, x)

    # Define a lstm cell with tensorflow
    # NOTE(review): a ReLU cell activation is unbounded and may contribute
    # to exploding values -- confirm it is intended.
    lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True, activation=tf.nn.relu)
    outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)

    # Linear activation, one projection per LSTM step
    return [tf.matmul(outputs[i], lstm_weights[i]) + lstm_biases[i]
            for i in xrange(396)]
def _he_normal(shape, fan_in):
    """Return normal initial values with stddev sqrt(2 / fan_in).

    Unit-variance initial values make activations grow from layer to layer
    and can drive the loss to NaN; scaling by the fan-in keeps them bounded.
    """
    return tf.random_normal(shape, stddev=(2.0 / fan_in) ** 0.5)


cnn_weights = {
    # 5x5 conv, 1 input, 32 outputs
    'wc1': tf.Variable(_he_normal([5, 5, 1, 32], 5 * 5 * 1)),
    # 5x5 conv, 32 inputs, 64 outputs
    'wc2': tf.Variable(_he_normal([5, 5, 32, 64], 5 * 5 * 32)),
    # 5x5 conv, 64 inputs, 128 outputs
    'wc3': tf.Variable(_he_normal([5, 5, 64, 128], 5 * 5 * 64)),
    # Transposed-conv filters are [height, width, out_channels, in_channels].
    # With a 2x2 kernel and stride 2 the windows do not overlap, so each
    # output pixel sums over in_channels values only.
    'wdc1': tf.Variable(_he_normal([2, 2, 64, 128], 128)),
    'wdc2': tf.Variable(_he_normal([2, 2, 32, 64], 64)),
    'wdc3': tf.Variable(_he_normal([2, 2, 1, 32], 32)),
}
# Biases start at zero so they do not add a random offset to every layer.
cnn_biases = {
    'bc1': tf.Variable(tf.zeros([32])),
    'bc2': tf.Variable(tf.zeros([64])),
    'bc3': tf.Variable(tf.zeros([128])),
    'bdc1': tf.Variable(tf.zeros([64])),
    'bdc2': tf.Variable(tf.zeros([32])),
    'bdc3': tf.Variable(tf.zeros([1])),
}
# One output projection (weights + bias) per LSTM step.
lstm_weights = {}
lstm_biases = {}
for i in xrange(396):
    lstm_weights[i] = tf.Variable(_he_normal([n_hidden, n_output], n_hidden))
    lstm_biases[i] = tf.Variable(tf.zeros([n_output]))
# Construct model
pred = net(x, cnn_weights, cnn_biases, keep_prob)
# Stack the per-step outputs, put the batch axis first and flatten each
# image so `pred` lines up with the flattened ground truth `y`.
pred = tf.pack(pred)
pred = tf.transpose(pred, [1, 0, 2])
pred = tf.reshape(pred, [-1, n_input_x * n_input_y])

with tf.name_scope("opt"):
    # Half the summed squared error, averaged over the batch.
    cost = tf.reduce_sum(tf.pow((pred - y), 2)) / (2 * batch_size)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Evaluate model
with tf.name_scope("acc"):
    # A pixel counts as correct when prediction and ground truth differ by
    # less than 1 (the int32 cast truncates the difference toward zero).
    # The comparison must use `pred`; the scalar `cost` was used here before.
    correct_pred = tf.equal(0, tf.cast(tf.sub(pred, y), tf.int32))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.initialize_all_variables()
saver = tf.train.Saver()
# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    summary = tf.train.SummaryWriter('/tmp/logdir/', sess.graph)  # initialize graph for tensorboard
    step = 1
    # Import data
    data = scroll_data.read_data('/home/kendall/Desktop/')
    # Keep training until reach max iterations
    while step * batch_size < training_iters:
        batch_x, batch_y = data.train.next_batch(batch_size)
        batch_x = batch_x.reshape((batch_size, n_input_x, n_input_y))
        batch_y = batch_y.reshape(batch_size, n_input_x * n_input_y)
        # Run optimization op (backprop)
        sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})
        step = step + 1
        if step % display_step == 0:
            loss, acc = sess.run([cost, accuracy], feed_dict={x: batch_x,
                                                              y: batch_y})
            # Make prediction
            im = Image.open('/home/kendall/Desktop/cropped/temp data0001.tif')
            batch_x = np.array(im)
            batch_x = batch_x.reshape((1, n_input_x, n_input_y))
            batch_x = batch_x.astype(float)
            prediction = sess.run(pred, feed_dict={x: batch_x})
            # Save the raw regression output. The softmax left over from the
            # classification version rescaled the values to sum to 1 and
            # added a new op to the graph on every display step.
            prediction = prediction.reshape((n_input_x, n_input_y))
            print("Step = " + str(step) + " | Accuracy = " + str(acc))
            csv_file = "CNN-LSTM-reg/CNNLSTMreg-step-" + str(step) + "-accuracy-" + str(acc) + ".csv"
            np.savetxt(csv_file, prediction, delimiter=",")
As said in the comments, a good weight initialization is key to the success of a model:
too high: the model will not learn and may produce NaN values
too low: the model will learn very very slowly, because the gradient will be too small (see vanishing gradients)
There are good initializations already provided in TensorFlow here (as a contribution), feel free to use them.