Lasagne/Theano wrong number of dimensions - python

I am getting started with Lasagne and Theano, using a modified version of the MNIST example script (the primary example of Lasagne) to train a very simple XOR network.
import numpy as np
import theano
import theano.tensor as T
import time
import lasagne
X_train = [[[[0, 0], [0, 1], [1, 0], [1, 1]]]] # (1)
y_train = [[[[1, 0], [0, 1], [0, 1], [1, 0]]]]
# [0, 1, 1, 0]
X_train = np.array(X_train).astype(np.uint8)
y_train = np.array(y_train).astype(np.uint8)
print X_train.shape
X_val = X_train
y_val = y_train
X_test = X_train
y_test = y_train
def build_mlp(input_var=None):
    """Build a small MLP for the XOR toy problem.

    The network consists of an input layer of shape (batch, 1, 4, 2), one
    fully-connected hidden layer of 4 units and a fully-connected output
    layer of 2 units. No dropout layer is used.

    Args:
        input_var: optional Theano variable to link to the input layer.

    Returns:
        The output layer; in Lasagne this gives access to the whole network.

    NOTE(review): the closing arguments of the three layer constructors were
    missing from the snippet as posted; they are restored here from the
    Lasagne MNIST example this code was adapted from (rectifier hidden units,
    Glorot-initialised weights, softmax output) -- confirm against the
    original script.
    """
    # Input layer, specifying the expected input shape of the network
    # (unspecified batchsize, 1 channel, 4 rows and 2 columns) and
    # linking it to the given Theano variable `input_var`, if any:
    l_in = lasagne.layers.InputLayer(shape=(None, 1, 4, 2),  # (2)
                                     input_var=input_var)

    # Dropout on the input is disabled for this toy problem:
    # l_in_drop = lasagne.layers.DropoutLayer(l_in, p=0.2)

    # Add a fully-connected layer of 4 units, using the linear rectifier, and
    # initializing weights with Glorot's scheme (which is the default anyway):
    l_hid1 = lasagne.layers.DenseLayer(
        l_in, num_units=4,
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.GlorotUniform())

    # Finally, we'll add the fully-connected output layer, of 2 softmax units:
    l_out = lasagne.layers.DenseLayer(
        l_hid1, num_units=2,
        nonlinearity=lasagne.nonlinearities.softmax)

    # Each layer is linked to its incoming layer(s), so we only need to pass
    # the output layer to give access to a network in Lasagne:
    return l_out
# Prepare Theano variables for inputs and targets
input_var = T.tensor4('inputs')
target_var = T.ivector('targets')
network = build_mlp(input_var)
# Create a loss expression for training, i.e., a scalar objective we want
# to minimize (for our multi-class problem, it is the cross-entropy loss):
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
loss = loss.mean()
# We could add some weight decay as well here, see lasagne.regularization.
# Create update expressions for training, i.e., how to modify the
# parameters at each training step. Here, we'll use Stochastic Gradient
# Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.nesterov_momentum(
loss, params, learning_rate=0.01, momentum=0.9)
# Create a loss expression for validation/testing. The crucial difference
# here is that we do a deterministic forward pass through the network,
# disabling dropout layers.
test_prediction = lasagne.layers.get_output(network, deterministic=True)
test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
test_loss = test_loss.mean()
# As a bonus, also create an expression for the classification accuracy:
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
# Compile a function performing a training step on a mini-batch (by giving
# the updates dictionary) and returning the corresponding training loss:
train_fn = theano.function([input_var, target_var], loss, updates=updates)
# Compile a second function computing the validation loss and accuracy:
val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
# ############################# Batch iterator ###############################
# This is just a simple helper function iterating over training data in
# mini-batches of a particular size, optionally in random order. It assumes
# data is available as numpy arrays. For big datasets, you could load numpy
# arrays as memory-mapped files (np.load(..., mmap_mode='r')), or write your
# own custom data iteration function. For small datasets, you can also copy
# them to GPU at once for slightly improved performance. This would involve
# several changes in the main program, though, and is not demonstrated here.
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    """Yield ``(inputs, targets)`` mini-batches of at most ``batchsize`` items.

    Full batches are yielded first; if the number of samples is not a
    multiple of ``batchsize`` (including the case where there are fewer
    samples than ``batchsize``), the leftover samples are yielded as one
    final, smaller batch. Every sample is therefore yielded exactly once.

    Args:
        inputs: numpy array of samples, indexed along its first axis.
        targets: numpy array of targets with the same length as ``inputs``.
        batchsize: number of samples per full batch.
        shuffle: if true, samples are visited in a random order.

    Yields:
        Tuples ``(inputs[excerpt], targets[excerpt])``.
    """
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]
    # Index one past the last sample that was covered by a full batch; only
    # the samples after it still need to be emitted.
    tail_start = len(inputs) - len(inputs) % batchsize
    if tail_start < len(inputs):
        if shuffle:
            excerpt = indices[tail_start:len(inputs)]
        else:
            excerpt = slice(tail_start, len(inputs))
        yield inputs[excerpt], targets[excerpt]
num_epochs = 4
# Finally, launch the training loop.
print("Starting training...")
# We iterate over epochs:
for epoch in range(num_epochs):
# In each epoch, we do a full pass over the training data:
train_err = 0
train_batches = 0
start_time = time.time()
for batch in iterate_minibatches(X_train, y_train, 4, shuffle=True):
inputs, targets = batch
print inputs.shape, targets.shape, input_var.shape, input_var.ndim, inputs.ndim
train_err += train_fn(inputs, targets) # (3)
train_batches += 1
# And a full pass over the validation data:
val_err = 0
val_acc = 0
val_batches = 0
for batch in iterate_minibatches(X_val, y_val, 4, shuffle=False):
inputs, targets = batch
err, acc = val_fn(inputs, targets)
val_err += err
val_acc += acc
val_batches += 1
# Then we print the results for this epoch:
print("Epoch {} of {} took {:.3f}s".format(
epoch + 1, num_epochs, time.time() - start_time))
print(" training loss:\t\t{:.6f}".format(train_err / train_batches))
print(" validation loss:\t\t{:.6f}".format(val_err / val_batches))
print(" validation accuracy:\t\t{:.2f} %".format(
val_acc / val_batches * 100))
# After training, we compute and print the test error:
test_err = 0
test_acc = 0
test_batches = 0
for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False):
inputs, targets = batch
err, acc = val_fn(inputs, targets)
test_err += err
test_acc += acc
test_batches += 1
print("Final results:")
print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches))
print(" test accuracy:\t\t{:.2f} %".format(
test_acc / test_batches * 100))
# Optionally, you could now dump the network weights to a file like this:
# np.savez('model.npz', lasagne.layers.get_all_param_values(network))
Defined a training set at (1), modified the input to the new dimension at (2) and get an exception at (3):
Traceback (most recent call last):
File "", line 139, in <module>
train_err += train_fn(inputs, targets)
File "/usr/local/lib/python2.7/site-packages/theano/compile/", line 513, in __call__
File "/usr/local/lib/python2.7/site-packages/theano/tensor/", line 169, in filter
TypeError: ('Bad input argument to theano function with name "" at index 1(0-based)', 'Wrong number of dimensions: expected 1, got 4 with shape (1, 1, 4, 2).')
And I have no clue what I did wrong. When I print the dimension (or the output of the program until the exception) I get this
(1, 1, 4, 2)
Starting training...
(1, 1, 4, 2) (1, 1, 4, 2) Shape.0 4 4
These seem to be correct. What am I doing wrong, and how must the arrays be shaped for this to work?

The problem is with the second input, targets. Note that the error message indicated this by saying " index 1(0-based)...", i.e. the second parameter.
target_var is an ivector but you're providing a 4-dimensional tensor for targets. The solution is to alter your y_train dataset so that it is 1-dimensional:
y_train = [0, 1, 1, 0]
This will cause another error because you currently assert that the first dimension of the inputs and targets should match, but changing
assert len(inputs) == len(targets)
to
assert inputs.shape[2] == len(targets)
will solve the second problem and allow the script to run successfully.


Can't backward pass two losses in Classification Transformer Model

For my model I'm using a roberta transformer model and the Trainer from the Huggingface transformer library.
I calculate two losses:
lloss is a cross-entropy loss, and dloss calculates the loss between hierarchy layers.
The total loss is the sum of lloss and dloss. (Based on this)
When calling total_loss.backward(), however, I get the error:
RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed
Any idea why that happens? Can I force it to call backward only once? Here is the loss calculation part:
# Both partial losses are computed from the same predictions and labels and
# then summed into the single loss that is back-propagated.
dloss = calculate_dloss(prediction, labels, 3)
lloss = calculate_lloss(prediction, labels, 3)
total_loss = lloss + dloss
def calculate_lloss(predictions, true_labels, total_level):
    '''Calculates the layer loss.

    Sums the cross-entropy loss between ``predictions[l]`` and
    ``true_labels[l]`` over the levels ``0 .. total_level - 1`` and scales
    the sum by ``self.alpha``.

    NOTE(review): ``self`` is not a parameter of this function; the snippet
    appears to be excerpted from a class -- confirm that ``self`` is in
    scope where this is defined.
    '''
    loss_fct = nn.CrossEntropyLoss()
    lloss = 0
    for level in range(total_level):
        lloss += loss_fct(predictions[level], true_labels[level])
    return self.alpha * lloss
def calculate_dloss(predictions, true_labels, total_level):
    '''Calculate the dependence loss.

    For every pair of adjacent levels ``(l - 1, l)`` the predicted classes
    are compared with the true labels and with the hierarchy check
    ``self.check_hierarchy``; the per-sample penalty
    ``p_loss ** (D_l * l_prev) * p_loss ** (D_l * l_curr) - 1`` is summed and
    the total is scaled by ``self.beta``.

    NOTE(review): ``self`` is not a parameter of this function; the snippet
    appears to be excerpted from a class -- confirm that ``self`` is in
    scope where this is defined.
    NOTE(review): the predictions enter only through ``argmax``, which has no
    gradient, so this term presumably contributes no gradient to the model
    unless ``self.p_loss`` itself requires grad -- confirm this is intended.
    '''
    dloss = 0
    for level in range(1, total_level):
        # Softmax is monotonic, so the arg-max of the raw scores is the
        # arg-max of the softmax probabilities.
        current_lvl_pred = torch.argmax(predictions[level], dim=1)
        prev_lvl_pred = torch.argmax(predictions[level - 1], dim=1)

        D_l = self.check_hierarchy(current_lvl_pred, prev_lvl_pred, level)  # just a boolean tensor

        # 1.0 where the predicted class is wrong, 0.0 where it is right.
        l_prev = (prev_lvl_pred != true_labels[level - 1]).float()
        l_curr = (current_lvl_pred != true_labels[level]).float()

        dloss += torch.sum(
            torch.pow(self.p_loss, D_l * l_prev) * torch.pow(self.p_loss, D_l * l_curr) - 1)
    return self.beta * dloss
There is nothing wrong with having a loss that is the sum of two individual losses, here is a small proof of principle adapted from the docs:
import torch
import numpy
from sklearn.datasets import make_blobs
class Feedforward(torch.nn.Module):
    """Feed-forward net: Linear -> ReLU -> Linear(1 output) -> Sigmoid."""

    def __init__(self, input_size, hidden_size):
        super(Feedforward, self).__init__()
        self.input_size, self.hidden_size = input_size, hidden_size
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Apply fc1, ReLU, fc2 and the sigmoid to ``x``, in that order."""
        activated = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(activated))
def blob_label(y, label, loc):  # assign labels
    """Return a copy of ``y`` in which entries equal to any value in ``loc`` are set to ``label``.

    Comparisons are made against the original ``y``, which is left unmodified.
    """
    relabeled = numpy.copy(y)
    for wanted in loc:
        matches = (y == wanted)
        relabeled[matches] = label
    return relabeled
x_train, y_train = make_blobs(n_samples=40, n_features=2, cluster_std=1.5, shuffle=True)
x_train = torch.FloatTensor(x_train)
y_train = torch.FloatTensor(blob_label(y_train, 0, [0]))
y_train = torch.FloatTensor(blob_label(y_train, 1, [1,2,3]))
x_test, y_test = make_blobs(n_samples=10, n_features=2, cluster_std=1.5, shuffle=True)
x_test = torch.FloatTensor(x_test)
y_test = torch.FloatTensor(blob_label(y_test, 0, [0]))
y_test = torch.FloatTensor(blob_label(y_test, 1, [1,2,3]))
model = Feedforward(2, 10)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)
y_pred = model(x_test)
before_train = criterion(y_pred.squeeze(), y_test)
print('Test loss before training' , before_train.item())
# Train for a fixed number of epochs on the sum of two losses.
num_epochs = 20
for epoch in range(num_epochs):
    optimizer.zero_grad()
    # Forward pass
    y_pred = model(x_train)
    # Compute both losses from the same prediction and add them up
    lossCE = criterion(y_pred.squeeze(), y_train)
    lossSQD = (y_pred.squeeze() - y_train).pow(2).mean()
    loss = lossCE + lossSQD
    print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
    # Backward pass: a single backward() call on the summed loss
    loss.backward()
    optimizer.step()
There must be a genuine second call, direct or indirect, to backward() on some variable that then traverses your graph. It is a bit too much to ask for the complete code here; only you can check this, or at least reduce it to a minimal example (while doing so, you might already find the issue). Apart from that, I would start by checking:
Does it already occur in the first iteration of training? If not: are you reusing any calculation results for the second iteration without a detach?
When you do backward on your losses individually lloss.backward() followed by dloss.backward() (this has the same effect as adding them together first as gradients are accumulated): what happens? This will let you track down for which of the two losses the error occurs.
After backward(), your computation graph is freed, so for a second backward you need to build a new graph by running the inputs through the model again. If you want to backpropagate through the same graph again (for some reason), you need to pass retain_graph=True to backward(). See the retain_graph parameter in the documentation.
P.S. As the summation of Tensors is automatically differentiable, summing the losses would not cause any issue in the backward.

RuntimeError: Tensor for argument #3 'mat2' is on CPU, but expected it to be on GPU (while checking arguments for addmm)

I have a custom-written model using PyTorch. However, when I try to use .to(device) to train on the GPU, I come across this error. The model does run on the CPU, but it is slow. Can anyone give me a little insight into how to solve this?
I moved the inputs to the GPU as well, and I still receive this error. The second picture shows the initial part of the error; the title of this question is the last line of the error output.
Model Train Loop
Error Initial Part
def build_network(dim_val, dim_attn, input_size, dec_seq_len, output_sequence_length, n_decoder_layers, n_encoder_layers, n_heads):
    """Construct and return a ``Transformer``, forwarding all arguments positionally in the order received."""
    # Input Size = 1
    return Transformer(
        dim_val, dim_attn, input_size, dec_seq_len, output_sequence_length,
        n_decoder_layers, n_encoder_layers, n_heads)
def build_optimizer(model, learning_rate):
    """Return a ``torch.optim.Adam`` optimizer over ``model.parameters()`` with the given learning rate."""
    params = model.parameters()
    return torch.optim.Adam(params, learning_rate)
Initialize network
t = build_network(2, 2, 1, 2, 2, 2, 2, 2).to(device)
Initialize Optimizer
optimizer = build_optimizer(t, 0.001)
MAPE Formula
def mape(actual, pred):
    """Return the mean absolute percentage error of ``pred`` relative to ``actual``.

    The result is expressed in percent and rounded to two decimals.
    """
    actual_arr = np.array(actual)
    pred_arr = np.array(pred)
    relative_error = np.abs((actual_arr - pred_arr) / actual_arr)
    return round(np.mean(relative_error) * 100, 2)
for e in range(2):
start_time = time.time()
# Batch Train Loss
batch_losses = []
batch_mape = []
# Batch Test Loss
val_batch_loss = []
val_batch_mape = []
################################################ Train Batch Loop ################################################
for test_x, test_y in train_loader:
X =
Y =
# Zero Grad at start of loop
# Forward pass and calculate loss
net_out = t(X)
## Loss MSE
loss = torch.mean((net_out - Y) ** 2)
# Train MAPE
train_mape = mape(np.abs((Y.detach().numpy())),
# Backwards pass
# Append the Batch Loss
# Append the Batch MAPE
################################################ Validation Batch Loop ################################################
for test_x, test_y in val_loader:
X_Test =
Y_Test =
#test_values = torch.from_numpy(Transformer_Xtest)
val = t(X_Test)
# Val Loss
validation_loss = torch.mean((val - Y_Test) ** 2)
# Append the Batch Loss
# Val MAPE
validation_mape = mape(np.abs((Y_Test.detach().numpy())),
# Append the Batch MAPE
######## Train Epoch Summary
epoch_train_loss = round(np.mean(batch_losses), 5)
epoch_train_mape = round(np.mean(batch_mape), 5)
######## Val Epoch Summary
epoch_val_loss = round(np.mean(val_batch_loss), 5)
epoch_val_mape = round(np.mean(val_batch_mape), 5)
show_print = print("Epoch: ", e+1, "epoch_train_loss:", epoch_train_loss, "epoch_train_mape: ", epoch_train_mape, "epoch_val_loss:", epoch_val_loss, "epoch_val_mape:", epoch_val_mape,"--- Elapsed Time Seconds ---", round((time.time() - start_time), 3) )

Why the predictions are very off (which causes very large losses)?

I have this very simple resnet18 network that I am trying to train from scratch for task of landmark estimation (I have 4 landmarks):
num_classes = 4 * 2 #4 coordinates X and Y flattened --> 4 of 2D keypoints or landmarks
class Network(nn.Module):
    """ResNet-18 whose final fully-connected layer outputs ``num_classes`` values."""

    def __init__(self, num_classes=8):
        # nn.Module must be initialised before any sub-module is assigned;
        # without this call the `self.model = ...` assignment below raises.
        super().__init__()
        self.model_name = 'resnet18'
        self.model = models.resnet18()
        # Replace the classification head with one of the requested width.
        self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)

    def forward(self, x):
        """Cast ``x`` to float and run it through the wrapped model."""
        x = x.float()
        out = self.model(x)
        return out
For the following piece of code:
network = Network()
criterion = nn.MSELoss()
optimizer = optim.Adam(network.parameters(), lr=0.0001)
loss_min = np.inf
num_epochs = 1
start_time = time.time()
for epoch in range(1,num_epochs+1):
loss_train = 0
loss_test = 0
running_loss = 0
print('size of train loader is: ', len(train_loader))
for step in range(1,len(train_loader)+1):
batch = next(iter(train_loader))
images, landmarks = batch['image'], batch['landmarks']
#RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[64, 600, 800, 3] to have 3 channels, but got 600 channels instead
#using permute to fix the above error
images = images.permute(0,3,1,2)
images = images.cuda()
landmarks = landmarks.view(landmarks.size(0),-1).cuda()
##images = torchvision.transforms.Normalize(images) #find the args later
##landmarks = torchvision.transforms.Normalize(landmarks) #find the args later
predictions = network(images)
# clear all the gradients before calculating them
print('predictions are: ', predictions.float())
print('landmarks are: ', landmarks.float())
# find the loss for the current step
loss_train_step = criterion(predictions.float(), landmarks.float())
loss_train_step =
print("loss_train_step before backward: ", loss_train_step)
# calculate the gradients
# update the parameters
print("loss_train_step after backward: ", loss_train_step)
loss_train += loss_train_step.item()
print("loss_train: ", loss_train)
running_loss = loss_train/step
print('step: ', step)
print('running loss: ', running_loss)
print_overwrite(step, len(train_loader), running_loss, 'train')
with torch.no_grad():
for step in range(1,len(test_loader)+1):
batch = next(iter(train_loader))
images, landmarks = batch['image'], batch['landmarks']
images = images.permute(0,3,1,2)
images = images.cuda()
landmarks = landmarks.view(landmarks.size(0),-1).cuda()
predictions = network(images)
# find the loss for the current step
loss_test_step = criterion(predictions, landmarks)
loss_test += loss_test_step.item()
running_loss = loss_test/step
print_overwrite(step, len(test_loader), running_loss, 'Validation')
loss_train /= len(train_loader)
loss_test /= len(test_loader)
print('Epoch: {} Train Loss: {:.4f} Valid Loss: {:.4f}'.format(epoch, loss_train, loss_test))
if loss_test < loss_min:
loss_min = loss_test, '../moth_landmarks.pth')
print("\nMinimum Valid Loss of {:.4f} at epoch {}/{}".format(loss_min, epoch, num_epochs))
print('Model Saved\n')
print('Training Complete')
print("Total Elapsed Time : {} s".format(time.time()-start_time))
I get this NAN as well as very large MSE losses (showing only for one step):
size of train loader is: 90
predictions are: tensor([[-0.0380, -0.1871, 0.0729, -0.3570, -0.2153, 0.3066, 1.1273, -0.0558],
[-0.0316, -0.1876, 0.0317, -0.3613, -0.2333, 0.3023, 1.0940, -0.0665],
[-0.0700, -0.1882, 0.0068, -0.3201, -0.1884, 0.2953, 1.0516, -0.0567],
[-0.0844, -0.2009, 0.0573, -0.3166, -0.2597, 0.3127, 1.0343, -0.0573],
[-0.0486, -0.2333, 0.0535, -0.3245, -0.2310, 0.2818, 1.0590, -0.0716],
[-0.0240, -0.1989, 0.0572, -0.3135, -0.2435, 0.2912, 1.0612, -0.0560],
[-0.0942, -0.2439, 0.0277, -0.3147, -0.2368, 0.2978, 1.0110, -0.0874],
[-0.0356, -0.2285, 0.0064, -0.3179, -0.2432, 0.3083, 1.0300, -0.0756]],
device='cuda:0', grad_fn=<AddmmBackward>)
landmarks are: tensor([[501.9200, 240.1600, 691.0000, 358.0000, 295.0000, 294.0000, 488.6482,
[495.6300, 246.0600, 692.0000, 235.0000, 286.0000, 242.0000, 464.0000,
[488.7100, 240.8900, 613.4007, 218.3425, 281.0000, 220.0000, 415.9966,
[502.5721, 245.4983, 640.0000, 131.0000, 360.0000, 143.0000, 542.9840,
[505.1393, 246.4364, 700.0000, 306.0000, 303.0000, 294.0000, 569.6925,
[501.0900, 244.0100, 724.0000, 251.0000, 302.0000, 276.0000, 504.6415,
[495.9500, 244.2800, 608.0000, 127.0000, 323.0000, 166.0000, 491.0000,
[490.2500, 241.3400, 699.0000, 304.0000, 398.6197, 313.8339, 429.1374,
303.8483]], device='cuda:0')
loss_train_step before backward: tensor(166475.6875, device='cuda:0', grad_fn=<MseLossBackward>)
loss_train_step after backward: tensor(166475.6875, device='cuda:0', grad_fn=<MseLossBackward>)
loss_train: 166475.6875
step: 1
running loss: 166475.6875
The other thing besides Network I am also suspicious of is the transforms:
Here's the transforms I am using:
transformed_dataset = MothLandmarksDataset(csv_file='moth_gt.csv',
##transforms.Normalize(mean = [ 0.485, 0.456, 0.406 ],
## std = [ 0.229, 0.224, 0.225 ])
How can I fix it? Also, with having Normalize I do still get same problem (NAN and very large loss values since the predictions are very off).
How can I basically debug why the predictions are very off?
I think there is a problem with the way you are using the DataLoader. iter(train_loader) creates an iterator from the data loader, and calling next on it gives you the next batch from the dataset. But you are calling next(iter(train_loader)) in each iteration, which creates a new iterator from train_loader every time and returns its first batch. So you end up training (and validating) on the same batch in each iteration. Even if your data loader shuffles the dataset each time, you will only train on some random samples from the dataset (as each time only the first random batch is used) and not on the complete dataset. Try changing your code so that you call iter(train_loader) only once and call next in each iteration:
train_iter = iter(train_loader)
for step in range(1, len(train_loader)+1):
batch = next(train_iter)
# now batch contains (step)th batch (assuming 1-based indexing)
Or even better change your for loop to iterate on the train_loader itself:
for step, batch in enumerate(train_loader):
# now batch contains (step)th batch (assuming 0-based indexing)
Similar change would also be required for validation or evaluation. Let me know if this resolves your problem or helps in any way.

Tensorflow CNN model always predicts same class

I have been trying to develop a CNN model for image classification. I am new to tensorflow and getting help from the following books
TensorFlow For Machine Intelligence by Sam Abrahams
For the past few weeks I have been working to develop a good model but I always get the same prediction. I have tried many different architectures but no luck!
Lately I decided to test my model with CIFAR-10 dataset and using the exact same model as given in the Learning Tensorflow book. But the outcome was same (same class for every image) even after training for 50K steps.
Here is highlight of my model and code.
1.) Downloaded CIFAR-10 image sets, converted them into tfrecord files with labels(labels are string for each category of CIFAR-10 in the tfrecord file) each for training and test set.
2) Reading the images from tfrecord file and generating random shuffle batch of size 100.
3) Converting the label from string to the integer32 type from 0-9 each for given category
4) Pass the training and test batches to the network and getting the output of [batch_size , num_class] size.
5) Train the model using Adam optimizer and softmax cross entropy loss function (Have tried gradient optimizer as well)
7) evaluate the model for test batches before and after the training.
8) Getting the same prediction for entire data set (But different every time I re run the code to try again)
Is there something wrong I am doing here? I would appreciate if someone could help me out with this problem.
Note - My approach of converting images and labels into tfrecord could be unusual but believe me I have come up with this idea from the books I mentioned earlier.
My code for the problem:
import tensorflow as tf
import numpy as np
import _datetime as dt
import PIL
# The glob module allows directory listing
import glob
import random
from itertools import groupby
from collections import defaultdict
H , W = 32 , 32 # Height and weight of the image
C = 3 # Number of channels
sessInt = tf.InteractiveSession()
# Read file and return the batches of the input data
def get_Batches_From_TFrecord(tf_record_filenames_list, batch_size):
# Match and load all the tfrecords found in the specified directory
tf_record_filename_queue = tf.train.string_input_producer(tf_record_filenames_list)
# It may have more than one example in them.
tf_record_reader = tf.TFRecordReader()
tf_image_name, tf_record_serialized =
# The label and image are stored as bytes but could be stored as int64 or float64 values in a
# serialized tf.Example protobuf.
tf_record_features = tf.parse_single_example(tf_record_serialized,
features={'label': tf.FixedLenFeature([], tf.string),
'image': tf.FixedLenFeature([], tf.string), })
# Using tf.uint8 because all of the channel information is between 0-255
tf_record_image = tf.decode_raw(tf_record_features['image'], tf.uint8)
# Reshape the image to look like the input image
tf_record_image = tf.reshape(tf_record_image, [H, W, C])
tf_record_label = tf.cast(tf_record_features['label'], tf.string)
#Check the image and label
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sessInt, coord=coord)
label = tf_record_label.eval().decode()
image = PIL.Image.fromarray(tf_record_image.eval())
# creating a batch to feed the data
min_after_dequeue = 10 * batch_size
capacity = min_after_dequeue + 5 * batch_size
# Shuffle examples while feeding in the queue
image_batch, label_batch = tf.train.shuffle_batch([tf_record_image, tf_record_label], batch_size=batch_size,
capacity=capacity, min_after_dequeue=min_after_dequeue)
# Sequential feed in the examples in the queue (Don't shuffle)
# image_batch, label_batch = tf.train.batch([tf_record_image, tf_record_label], batch_size=batch_size, capacity=capacity)
# Converting the images to a float to match the expected input to convolution2d
float_image_batch = tf.image.convert_image_dtype(image_batch, tf.float32)
string_label_batch = label_batch
return float_image_batch, string_label_batch
#Count the number of images in the tfrecord file
def number_of_records(tfrecord_file_name):
    """Return the number of records in the TFRecord file at ``tfrecord_file_name``."""
    count = 0
    record_iterator = tf.python_io.tf_record_iterator(path=tfrecord_file_name)
    # Each item produced by the iterator is one serialized record.
    for _record in record_iterator:
        count += 1
    return count
def get_num_of_samples(tfrecords_list):
    """Return the sum of ``number_of_records`` over every file in ``tfrecords_list``."""
    return sum(number_of_records(path) for path in tfrecords_list)
# Provide the input tfrecord names in a list
train_filenames = ["./TFRecords/cifar_train.tfrecord"]
test_filename = ["./TFRecords/cifar_test.tfrecord"]
num_train_samples = get_num_of_samples(train_filenames)
num_test_samples = get_num_of_samples(test_filename)
print("Number of Training samples: ", num_train_samples)
print("Number of Test samples: ", num_test_samples)
IMP Note : (Batch_size * Training_Steps) should be at least greater than (2*Number_of_samples) for shuffling of batches
train_batch_size = 100
# Total number of batches for input records
# Note - Num of samples in the tfrecord file can be determined by the tfrecord iterator.
# Batch size for test samples
test_batch_size = 50
train_image_batch, train_label_batch = get_Batches_From_TFrecord(train_filenames, train_batch_size)
test_image_batch, test_label_batch = get_Batches_From_TFrecord(test_filename, test_batch_size)
# Definition of the convolution network which returns a single neuron for each input image in the batch
# Define a placeholder for keep probability in dropout
# (Dropout should only use while training, for testing dropout should be always 1.0)
fc_prob = tf.placeholder(tf.float32)
conv_prob = tf.placeholder(tf.float32)
#Helper function to add learned filters(images) into tensorboard summary - for a random input in the batch
def add_filter_summary(name, filter_tensor):
rand_idx = random.randint(0,filter_tensor.get_shape()[0]-1) #Choose any random number from[0,batch_size)
#dispay_filter = filter_tensor[random.randint(0,filter_tensor.get_shape()[3])]
dispay_filter = filter_tensor[5] #keeping the index fix for consistency in visualization
with tf.name_scope("Filter_Summaries"):
img_summary = tf.summary.image(name, tf.reshape(dispay_filter,[-1 , filter_tensor.get_shape()[1],filter_tensor.get_shape()[1],1] ), max_outputs = 500)
# Helper functions for the network
def weight_initializer(shape):
    """Return a ``tf.Variable`` of ``shape`` initialised from a truncated normal with stddev 0.1."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
def bias_initializer(shape):
    """Return a ``tf.Variable`` of ``shape`` with every entry initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))
def conv2d(input, weights, stride):
    """Convolve ``input`` with ``weights`` using SAME padding.

    ``stride`` is applied to the two middle dimensions of the strides vector.
    """
    strides = [1, stride, stride, 1]
    return tf.nn.conv2d(input, filter=weights, strides=strides, padding="SAME")
def pool_layer(input, window_size=2 , stride=2):
    """Max-pool ``input`` with a square ``window_size`` window, the given ``stride`` and VALID padding."""
    window = [1, window_size, window_size, 1]
    step = [1, stride, stride, 1]
    return tf.nn.max_pool(input, ksize=window, strides=step, padding='VALID')
# This is the actual layer we will use.
# Linear convolution as defined in conv2d, with a bias,
# followed by the ReLU nonlinearity.
def conv_layer(input, filter_shape , stride=1):
    """Convolution (via ``conv2d``) plus bias, followed by ReLU.

    A fresh weight variable of ``filter_shape`` and a bias variable with
    ``filter_shape[3]`` entries are created on every call.
    """
    kernel = weight_initializer(filter_shape)
    bias = bias_initializer([filter_shape[3]])
    pre_activation = conv2d(input, kernel, stride) + bias
    return tf.nn.relu(pre_activation)
# A standard full layer with a bias. Notice that here we didn’t add the ReLU.
# This allows us to use the same layer for the final output,
# where we don’t need the nonlinear part.
def full_layer(input, out_size):
    """Fully-connected layer ``input @ W + b`` without a non-linearity.

    The input width is read from the second dimension of ``input``'s shape;
    new weight and bias variables are created on every call.
    """
    in_size = int(input.get_shape()[1])
    weights = weight_initializer([in_size, out_size])
    bias = bias_initializer([out_size])
    return tf.matmul(input, weights) + bias
## Model fro the book learning tensorflow - for CIFAR data
def conv_network(image_batch, batch_size):
    """Build the three-block convolutional network for a batch of images.

    Each block is a convolution, a max-pool and dropout (keep probability
    ``conv_prob``); the result is flattened to ``batch_size`` rows and passed
    through a ReLU fully-connected layer of 500 units with dropout (keep
    probability ``fc_prob``) and a final fully-connected layer of 10 outputs.

    Returns:
        A tuple ``(final_Output, tf.summary.merge_all())``.
    """
    # Now create the model which returns the output neurons (equal to the number of labels)
    # as a final fully connected layer output, which we can use as input to the softmax classifier
    C1 , C2 , C3 = 30 , 50, 80 # Number of output features for each convolution layer
    F1 = 500 # Number of output neurons for the FC1 layer
    # Add original image to tensorboard summary
    add_filter_summary("Original" , image_batch)
    # First convolution layer with 3x3 filter size and C1 (30) filters
    conv1 = conv_layer(image_batch, filter_shape=[3, 3, C, C1])
    pool1 = pool_layer(conv1, window_size=2)
    pool1 = tf.nn.dropout(pool1, keep_prob=conv_prob)
    add_filter_summary("conv1" , pool1)
    # Second convolution layer with 5x5 filter size and C2 (50) filters
    conv2 = conv_layer(pool1, filter_shape=[5, 5, C1, C2])
    pool2 = pool_layer(conv2, 2)
    pool2 = tf.nn.dropout(pool2, keep_prob=conv_prob)
    add_filter_summary("conv2" , pool2)
    # Third convolution layer with 5x5 filter size and C3 (80) filters
    conv3 = conv_layer(pool2, filter_shape=[5, 5, C2, C3])
    # Since at this point the feature maps are of size 8×8 (following the first two poolings
    # that each reduced the 32×32 pictures by half on each axis).
    # This last pool layer pools each of the feature maps and keeps only the maximal value.
    # The number of feature maps at the third block was set to 80,
    # so at that point (following the max pooling) the representation is reduced to only 80 numbers
    pool3 = pool_layer(conv3, window_size = 8 , stride=8)
    pool3 = tf.nn.dropout(pool3, keep_prob=conv_prob)
    add_filter_summary("conv3" , pool3)
    # Reshape the output to feed to the FC layer
    flatterned_layer = tf.reshape(pool3, [batch_size,
                                          -1]) # -1 tells reshape to fold all remaining dimensions (other than batch_size) into one
    fc1 = tf.nn.relu(full_layer(flatterned_layer, F1))
    full1_drop = tf.nn.dropout(fc1, keep_prob=fc_prob)
    # Fully connected layer 2 (output layer)
    final_Output = full_layer(full1_drop, 10)
    return final_Output, tf.summary.merge_all()
# Now that architecture is created , next step is to create the classification model
# (to predict the output class of the input data)
# Here we have used Logistic regression (Sigmoid function) to predict the output because we have only two classes.
# For multiple class problem - softmax is the best prediction function
# Prepare the inputs to the input
Train_X , img_summary = conv_network(train_image_batch, train_batch_size)
Test_X , _ = conv_network(test_image_batch, test_batch_size)
# Generate 0 based index for labels
Train_Y = tf.to_int32(tf.argmax(
tf.to_int32(tf.stack([tf.equal(train_label_batch, ["airplane"]), tf.equal(train_label_batch, ["automobile"]),
tf.equal(train_label_batch, ["bird"]),tf.equal(train_label_batch, ["cat"]),
tf.equal(train_label_batch, ["deer"]),tf.equal(train_label_batch, ["dog"]),
tf.equal(train_label_batch, ["frog"]),tf.equal(train_label_batch, ["horse"]),
tf.equal(train_label_batch, ["ship"]), tf.equal(train_label_batch, ["truck"]) ])), 0))
Test_Y = tf.to_int32(tf.argmax(
tf.to_int32(tf.stack([tf.equal(test_label_batch, ["airplane"]), tf.equal(test_label_batch, ["automobile"]),
tf.equal(test_label_batch, ["bird"]),tf.equal(test_label_batch, ["cat"]),
tf.equal(test_label_batch, ["deer"]),tf.equal(test_label_batch, ["dog"]),
tf.equal(test_label_batch, ["frog"]),tf.equal(test_label_batch, ["horse"]),
tf.equal(test_label_batch, ["ship"]), tf.equal(test_label_batch, ["truck"]) ])), 0))
# Y = tf.reshape(float_label_batch, X.get_shape())
# compute inference model over data X and return the result
# (using sigmoid function - as this function is the best to predict two class output)
# (For a multiclass problem, softmax is the best prediction function)
def inference(X):
    """Return the softmax of the network output ``X``."""
    probabilities = tf.nn.softmax(X)
    return probabilities
# compute loss over training data X and expected outputs Y
# Cross entropy function is the best suited for loss calculation (Than the squared error function)
# Get the second column of the input to get only the features
def loss(X, Y):
    """Return the mean sparse softmax cross-entropy of logits ``X`` against labels ``Y``.

    ``Y`` holds class indices rather than one-hot vectors, which is what the
    sparse variant of the cross-entropy op expects.
    """
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=Y, logits=X)
    return tf.reduce_mean(per_example)
# train / adjust model parameters according to computed total loss (using gradient descent)
def train(total_loss, learning_rate):
    """Return the op that applies one Adam update minimizing ``total_loss``."""
    optimizer = tf.train.AdamOptimizer(learning_rate)
    return optimizer.minimize(total_loss)
# evaluate the resulting trained model with dropout probability (Ideally 1.0 for testing)
def evaluate(sess, X, Y, dropout_prob, num_batches=250):
    """Print the mean classification accuracy over several batches.

    Args:
        sess: Session used to run the accuracy op.
        X: Network output; the arg-max along axis 1 is taken as the
            predicted class.
        Y: int32 class indices to compare the predictions against.
        dropout_prob: Keep probability fed to both dropout placeholders
            (``conv_prob`` and ``fc_prob``); use 1.0 for testing.
        num_batches: Number of batches to average over. It should satisfy
            (test_batch_size * num_batches) >= (2 * num_of_test_samples).
    """
    # The class with the highest score is the prediction of the network.
    predicted = tf.cast(tf.argmax(X, 1), tf.int32)
    batch_accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), tf.float32))
    feed = {conv_prob: dropout_prob, fc_prob: dropout_prob}
    # Each run evaluates a fresh batch; average the per-batch accuracies.
    total_accuracy = np.mean(
        [sess.run(batch_accuracy, feed_dict=feed) for _ in range(num_batches)])
    print("Accuracy of the model(in %): {:.4f} ".format(100 * total_accuracy))
import time

# Create a saver to write the training checkpoints (at most 10 are kept).
saver = tf.train.Saver(max_to_keep=10)

# Create the TensorBoard summary for the loss function.
with tf.name_scope("summaries"):
    loss_summary = tf.summary.scalar("loss", loss(Train_X, Train_Y))

# Launch the graph in a session, setup boilerplate.
with tf.Session() as sess:
    log_writer = tf.summary.FileWriter('./logs', sess.graph)
    total_loss = loss(Train_X, Train_Y)
    train_op = train(total_loss, 0.001)

    # Initialise all variables after defining all variables.
    sess.run(tf.global_variables_initializer())

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        # Baseline accuracy before any training step.
        evaluate(sess, Test_X, Test_Y, 1.0)

        # Actual training loop ------------------------------------------------
        training_steps = 50000
        print("\nStarting to train model with", str(training_steps), " steps...")
        # NOTE(review): the original timer call was lost from this listing;
        # time.time() is used as a stand-in.
        to1 = time.time()
        for step in range(1, training_steps + 1):
            # Pass the dropout keep probabilities for the training batch.
            sess.run([train_op], feed_dict={fc_prob: 0.5, conv_prob: 0.8})

            # For debugging and learning purposes, log how the loss evolves.
            if step % 100 == 0:
                loss_summaries, img_summaries, Tloss = sess.run(
                    [loss_summary, img_summary, total_loss],
                    feed_dict={fc_prob: 0.5, conv_prob: 0.8})
                log_writer.add_summary(loss_summaries, step)
                log_writer.add_summary(img_summaries, step)
                print("Step:", step, " , loss: ", Tloss)

            # Periodically checkpoint the model and report test accuracy.
            if step % 2000 == 0:
                saver.save(sess, "./Models/BookLT_CIFAR", global_step=step,
                           latest_filename="model_chkpoint")
                evaluate(sess, Test_X, Test_Y, 1.0)

        # Final checkpoint once the loop has finished.
        saver.save(sess, "./Models/BookLT_CIFAR", global_step=step,
                   latest_filename="model_chkpoint")
        to2 = time.time()
        print("\nTotal Trainig time Elapsed: ", str(to2 - to1))

        # Once training is complete, evaluate the model on the test batches.
        # To restore a saved model instead:
        # saver.restore(sess, "./Models/BookLT3_CIFAR-15000")
        print("\nPost Training....")
        # To cover the entire test set, the number of iterations inside
        # evaluate() should satisfy
        # (test_batch_size * num_of_iteration) >= (2 * num_of_test_samples).
        evaluate(sess, Test_X, Test_Y, 1.0)
        evaluate(sess, Test_X, Test_Y, 1.0)
        evaluate(sess, Test_X, Test_Y, 1.0)
    finally:
        # Stop the input queue threads and flush the event file even when
        # training is interrupted.
        coord.request_stop()
        coord.join(threads)
        log_writer.close()
Here is the screenshot of my Pre training result:
Here is the screenshot of the result during training:
Here is the screenshot of the post-training result:
I did not run the code to verify that this is the only issue, but here is one important issue. When classifying, you should use one-hot encoding for your labels. This means that if you have 3 classes, you want your labels to be [1, 0, 0] for class 1, [0, 1, 0] for class 2, and [0, 0, 1] for class 3. Your approach of using 1, 2, and 3 as labels leads to various issues. For example, the network is penalized more for predicting class 1 than for predicting class 2 for an image from class 3. TensorFlow functions like tf.nn.softmax_cross_entropy_with_logits work with such representations.
Here is the basic example of correctly using one_hot labels to compute loss:
Here is how the one_hot label is constructed for mnist digits:

Lasagne/Theano mnist example issue

I'm trying to slightly change the code from GitHub here into a toy example that reads simpler two-dimensional data. My toy data set has the following structure:
x-coordinate, y-coordinate, class
Some example data points are
and their corresponding classes
I'm able to read the data and create my custom mlp. However when I try to run the training part, I get the following error
(5, 2)
Traceback (most recent call last):
File "./", line 78, in <module>
train_err += train_fn(inputs, targets)
File "/usr/local/lib/python2.7/dist-packages/theano/compile/", line 786, in __call__
File "/usr/local/lib/python2.7/dist-packages/theano/tensor/", line 177, in filter
TypeError: ('Bad input argument to theano function with name "./" at index 0(0-based)', 'Wrong number of dimensions: expected 4, got 2 with shape (5, 2).')
This has clearly something to do with the shapes of the arrays I'm passing. But what I can't seem to figure out is why is my case any different from the mnist dataset which is also a two dimensional array of an image.
My entire code is the following.
def build_mlp(input_var=None):
    """Build the MLP and return its output layer.

    The network is an input layer of shape (None, 1, 1, 2) linked to
    ``input_var``, followed by two blocks of 20% dropout plus a 10-unit
    rectifier dense layer, and a final 5-unit softmax dense layer.
    """
    layers = lasagne.layers
    net = layers.InputLayer(shape=(None, 1, 1, 2), input_var=input_var)

    # Two identical hidden blocks: dropout followed by a dense ReLU layer.
    for _ in range(2):
        net = layers.DropoutLayer(net, p=0.2)
        net = layers.DenseLayer(
            net,
            num_units=10,
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.GlorotUniform())

    # Only the output layer is returned; each layer links to its input.
    return layers.DenseLayer(
        net,
        num_units=5,
        nonlinearity=lasagne.nonlinearities.softmax,
        W=lasagne.init.GlorotUniform())
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    """Yield ``(inputs, targets)`` mini-batches of exactly ``batchsize`` rows.

    Args:
        inputs: Array of samples, indexed along its first axis.
        targets: Array of targets with the same length as ``inputs``.
        batchsize: Number of samples per yielded batch.
        shuffle: If True, samples are visited in a random order (drawn from
            ``np.random``); otherwise in their original order.

    Yields:
        Tuples ``(inputs_batch, targets_batch)`` whose rows stay aligned.
        Trailing samples that do not fill a whole batch are dropped.
    """
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            # Index with the permuted positions for this batch.
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            # A plain slice keeps the original order.
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]
# Load the samples (x-coordinate, y-coordinate) and their class labels.
x_data = np.genfromtxt('a.csv', delimiter=',')
y_data = np.genfromtxt('b.csv', delimiter=',')
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.33)

# The input layer is declared with shape (None, 1, 1, 2) and `input_var` is a
# 4-D tensor, so every 2-value sample has to be reshaped to (1, 1, 2);
# passing the raw (N, 2) matrix raises "Wrong number of dimensions".
x_train = x_train.reshape(-1, 1, 1, 2).astype(theano.config.floatX)
x_test = x_test.reshape(-1, 1, 1, 2).astype(theano.config.floatX)
# `target_var` is an int32 vector of class indices.
# NOTE(review): assumes b.csv holds integer class ids in [0, 5) -- confirm.
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

input_var = T.tensor4('inputs')
target_var = T.ivector('targets')
network = build_mlp(input_var)

# Training loss: mean categorical cross-entropy (dropout active).
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
loss = loss.mean()
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.4)

# Evaluation expressions: deterministic=True disables the dropout layers.
test_prediction = lasagne.layers.get_output(network, deterministic=True)
test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                        target_var)
test_loss = test_loss.mean()
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                  dtype=theano.config.floatX)

train_fn = theano.function([input_var, target_var], loss, updates=updates)
val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

num_epochs = 100
for epoch in range(num_epochs):
    # One full pass over the training data.
    train_err = 0
    train_batches = 0
    start_time = time.time()
    for inputs, targets in iterate_minibatches(x_train, y_train, 5, shuffle=True):
        train_err += train_fn(inputs, targets)
        train_batches += 1

    # One full pass over the held-out data, so the reported accuracy is not
    # measured on the samples the network was trained on.
    val_err = 0
    val_acc = 0
    val_batches = 0
    for inputs, targets in iterate_minibatches(x_test, y_test, 5, shuffle=False):
        err, acc = val_fn(inputs, targets)
        val_err += err
        val_acc += acc
        val_batches += 1

    print("Epoch {} of {} took {:.3f}s".format(
        epoch + 1, num_epochs, time.time() - start_time))
    print(" training loss:\t\t{:.6f}".format(train_err / train_batches))
    print(" validation loss:\t\t{:.6f}".format(val_err / val_batches))
    print(" validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100))
Could someone point out to me what I'm doing wrong here, please?
You're declaring input_var as a 4d tensor, but the error message suggests that you're passing a data matrix of size (5,2) as input. Based on the shape of your input layer, this should be (5, 1, 1, 2) (assuming the 5 corresponds to the number of training examples in a minibatch and the 2 corresponds to your x and y coordinates).
