U-net training Error: The size of tensor a (16) must match the size of tensor b (6) at non-singleton dimension 1 - python

I’m trying to train a U-Net model on the LandCoverNet dataset, which is a satellite imagery dataset that contains input images and corresponding land cover type masks.
I have created a custom dataset to get my images and masks:
# Create custom dataset that accepts 4 channels images
from torch.utils.data import Dataset, DataLoader, sampler
from pathlib import Path
from PIL import Image
import matplotlib.pyplot as plt
import os
import numpy as np
import rasterio as rio
from torchvision import transforms, datasets, models
# We have two directories: inputs (one folder per image) and targets
class LandCoverNetDataset(BaseDataset):
CLASSES = ['otherland', 'cropland', 'pastureland', 'bare soil', 'openwater', 'forestland']
def __init__(self, inputs_dir, targets_dir,
classes = None,
augmentation=None ,
preprocessing = False,
self.samples = []
self.pytorch = pytorch
self.augmentation = augmentation
self.preprocessing = preprocessing
# Convert str names to class values on masks
self.class_value = [self.CLASSES.index(cls.lower()) for cls in classes]
# Create dictionary for images and targets
for sub_dir in os.listdir(inputs_dir):
files = {}
files = {
'img_bands' : os.path.join(inputs_dir, sub_dir),
'target' : os.path.join(targets_dir, sub_dir[:13] + "_LC_10m.png")
def __len__(self):
return len(self.samples)
def normalize(self, band):
    """Min-max scale a numpy array so that its values lie between 0 and 1.

    Args:
        band: Array-like of numeric values.

    Returns:
        A floating-point numpy array with the same shape as ``band``.
        Positions where the scaling produces NaN (for example a constant
        band, where max == min gives 0/0) are set to 0. An empty input is
        returned as an empty float array.
    """
    band = np.asarray(band)
    if band.size == 0:
        # min()/max() raise on empty arrays; there is nothing to scale.
        return band.astype('float64')
    band_min, band_max = band.min(), band.max()
    # Silence the 0/0 warnings only for this computation; np.seterr would
    # change numpy's error handling for the whole process.
    with np.errstate(divide='ignore', invalid='ignore'):
        normalized_band = (band - band_min) / (band_max - band_min)
    # Remove any NaN value and substitute zero.
    normalized_band[np.isnan(normalized_band)] = 0
    return normalized_band
def open_as_array(self, idx, include_ndvi = False):
Merge the 4 bands into one image and normalize the bands
# List indivisual bands in each image folder
# Stack them togather
list_bands = []
for img_file in os.listdir(self.samples[idx]['img_bands']):
# Get the ndvi band
if 'NDVI' in img_file:
ndvi_band = os.path.join(self.samples[idx]['img_bands'], img_file)
# Get the rgb bands
band = rio.open(os.path.join(self.samples[idx]['img_bands'], img_file)).read(1)
if self.preprocessing:
# preprocess the bands before stacking them (only rgb)
band = self.normalize(band)
# Stack the bands
raw_rgb = np.stack(list_bands, axis=2).astype('float32')
if include_ndvi:
# Include the NDVI band in the input images
ndvi = np.expand_dims(rio.open(ndvi_band).read(1).astype('float32'), 2)
raw_rgb = np.concatenate([raw_rgb, ndvi], axis=2)
if self.augmentation:
transformed = self.augmentation(image = raw_rgb)
raw_rgb = transformed["image"]
if self.preprocessing:
# transpose to tensor shape
raw_rgb = raw_rgb.transpose((2,0,1)).astype('float32')
return raw_rgb
def open_mask(self, idx):
# Extract certain classes from mask
mask = cv2.imread(self.samples[idx]['target'], 0)
masks = [(mask == v) for v in self.class_value]
mask = np.stack(masks, axis=-1).astype('long')
if self.augmentation:
transformed = self.augmentation(image = mask)
mask = transformed["image"]
if self.preprocessing:
# preprocess the mask
mask = self.normalize(mask)
# transpose to tensor shape
mask = mask.transpose((2, 0, 1)).astype('long')
mask = mask[0, :, :]
return mask
def __getitem__(self, idx):
x = torch.tensor(self.open_as_array(idx, include_ndvi=True), dtype=torch.float)
y = torch.tensor(self.open_mask(idx), dtype=torch.long)
return x, y
def open_as_pil(self, idx):
arr = 256*self.open_as_array(idx)
return Image.fromarray(arr.astype(np.uint8), 'RGB')
def __repr__(self):
s = 'Dataset class with {} files'.format(self.__len__())
return s
The input here is 4 bands.
This is the shape of the first batch for both input/target
torch.Size([16, 4, 224, 224])
torch.Size([16, 224, 224])
I’m using a model from segmentation-models-pytorch library, and here is how I customized it for my case:
ENCODER = 'se_resnext50_32x4d'
ENCODER_WEIGHTS = 'imagenet'
ACTIVATION = 'softmax2d'
DEVICE = 'cuda'
model = smp.FPN(ENCODER, classes=len(CLASSES), activation=ACTIVATION)
# Replace the model.conv1 to accept 4 channels
# first: copy the layer's weights
weight = model.encoder.layer0.conv1.weight.clone()
model.encoder.layer0.conv1 = nn.Conv2d(4, 64,kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
with torch.no_grad():
model.encoder.layer0.conv1.weight[:, :3] = weight
model.encoder.layer0.conv1.weight[:, 3] = model.encoder.layer0.conv1.weight[:, 0]
loss = smp.utils.losses.NLLLoss()
metrics = [
optimizer = torch.optim.SGD([
dict(params=model.parameters(), lr=0.001, weight_decay=1e-8, momentum=0.9),
# create epoch runners
# it is a simple loop of iterating over dataloader`s samples
train_epoch = smp.utils.train.TrainEpoch(
valid_epoch = smp.utils.train.ValidEpoch(
And here is my training loop
# train model for 40 epochs
max_score = 0
for i in range(0, 40):
print('\nEpoch: {}'.format(i))
train_logs = train_epoch.run(train_loader)
valid_logs = valid_epoch.run(valid_loader)
# do something (save model, change lr, etc.)
if max_score < valid_logs['iou_score']:
max_score = valid_logs['iou_score']
torch.save(model, './best_model.pth')
print('Model saved!')
if i == 25:
optimizer.param_groups[0]['lr'] = 1e-5
print('Decrease decoder learning rate to 1e-5!')
At first, the target shape was [16, 6, 224, 224], but I got an error and found a thread saying that it should be [batch_size, height, width].
That’s why I added this line in the Dataset class: mask = mask[0, :, :]
to get rid of the number-of-classes dimension, and this is where things get confusing for me, because the output of my model is torch.Size([10, 6, 224, 224]).
This is the entire error message:
Epoch: 0
train: 0%| | 0/157 [00:00<?, ?it/s]
RuntimeError Traceback (most recent call last)
<ipython-input-215-2ae39e205dee> in <module>()
8 print('\nEpoch: {}'.format(i))
----> 9 train_logs = train_epoch.run(train_loader)
10 valid_logs = valid_epoch.run(valid_loader)
3 frames
/usr/local/lib/python3.6/dist-packages/segmentation_models_pytorch/utils/functional.py in iou(pr, gt, eps, threshold, ignore_channels)
32 pr, gt = _take_channels(pr, gt, ignore_channels=ignore_channels)
---> 34 intersection = torch.sum(gt * pr)
35 union = torch.sum(gt) + torch.sum(pr) - intersection + eps
36 return (intersection + eps) / union
RuntimeError: The size of tensor a (16) must match the size of tensor b (6) at non-singleton dimension 1

OK, I changed the loss function to smp.utils.losses.DiceLoss(), and I was able to start training my model. I also removed mask = mask[0, :, :].
I also had an issue with my normalization. Here is how I did it:
for input (4 bands):
for i in range(raw_rgb.shape[0]):
raw_rgb[i, :, :] = self.normalize(raw_rgb[i, :, :])
And the same for the masks (3 channels)
This was after converting them to tensor.
I would still like to know how to prepare my masks for CrossEntropyLoss.


"Invalid argument: indices[0,0,0,0] = 30 is not in [0, 30)"

InvalidArgumentError: indices[0,0,0,0] = 30 is not in [0, 30)
[[{{node GatherV2}}]] [Op:IteratorGetNext]
I have a custom data loader for a tf.keras based U-Net for semantic segmentation, based on this example. It is written as follows:
def parse_image(img_path: str) -> dict:
# read image
image = tf.io.read_file(img_path)
#image = tfio.experimental.image.decode_tiff(image)
if xf == "png":
image = tf.image.decode_png(image, channels = 3)
image = tf.image.decode_jpeg(image, channels = 3)
image = tf.image.convert_image_dtype(image, tf.uint8)
#image = image[:, :, :-1]
# read mask
mask_path = tf.strings.regex_replace(img_path, "X", "y")
mask_path = tf.strings.regex_replace(mask_path, "X." + xf, "y." + yf)
mask = tf.io.read_file(mask_path)
#mask = tfio.experimental.image.decode_tiff(mask)
mask = tf.image.decode_png(mask, channels = 1)
#mask = mask[:, :, :-1]
mask = tf.where(mask == 255, np.dtype("uint8").type(NoDataValue), mask)
return {"image": image, "segmentation_mask": mask}
train_dataset = tf.data.Dataset.list_files(
dir_tls(myear = year, dset = "X") + "/*." + xf, seed = zeed)
train_dataset = train_dataset.map(parse_image)
val_dataset = tf.data.Dataset.list_files(
dir_tls(myear = year, dset = "X_val") + "/*." + xf, seed = zeed)
val_dataset = val_dataset.map(parse_image)
## data transformations--------------------------------------------------------
def normalise(input_image: tf.Tensor, input_mask: tf.Tensor) -> tuple:
input_image = tf.cast(input_image, tf.float32) / 255.0
return input_image, input_mask
def load_image_train(datapoint: dict) -> tuple:
    """Resize, randomly mirror and normalise one training sample.

    Args:
        datapoint: Dict holding an "image" tensor and a
            "segmentation_mask" tensor.

    Returns:
        Tuple ``(input_image, input_mask)``, both resized to
        ``(imgr, imgc)``. The image is passed through ``normalise``; the
        mask keeps its original label values.
    """
    input_image = tf.image.resize(datapoint["image"], (imgr, imgc))
    # Label masks must be resized with nearest-neighbour sampling: the
    # default bilinear method averages neighbouring class ids and produces
    # values that belong to no class. The cast keeps the float32 dtype that
    # the bilinear resize used to return, so downstream code sees the same
    # element type as before.
    input_mask = tf.cast(
        tf.image.resize(datapoint["segmentation_mask"], (imgr, imgc),
                        method="nearest"),
        tf.float32)

    # Mirror image and mask together so they stay aligned.
    if tf.random.uniform(()) > 0.5:
        input_image = tf.image.flip_left_right(input_image)
        input_mask = tf.image.flip_left_right(input_mask)

    input_image, input_mask = normalise(input_image, input_mask)
    return input_image, input_mask
def load_image_test(datapoint: dict) -> tuple:
    """Resize and normalise one validation sample (no augmentation).

    Args:
        datapoint: Dict holding an "image" tensor and a
            "segmentation_mask" tensor.

    Returns:
        Tuple ``(input_image, input_mask)``, both resized to
        ``(imgr, imgc)``. The image is passed through ``normalise``; the
        mask keeps its original label values.
    """
    input_image = tf.image.resize(datapoint["image"], (imgr, imgc))
    # Nearest-neighbour keeps every mask pixel a valid class id; bilinear
    # interpolation would blend neighbouring labels. The cast preserves the
    # float32 dtype previously returned for the mask.
    input_mask = tf.cast(
        tf.image.resize(datapoint["segmentation_mask"], (imgr, imgc),
                        method="nearest"),
        tf.float32)
    input_image, input_mask = normalise(input_image, input_mask)
    return input_image, input_mask
## create datasets-------------------------------------------------------------
buff_size = 1000
dataset = {"train": train_dataset, "val": val_dataset}
# -- Train Dataset --#
dataset["train"] = dataset["train"]\
.map(load_image_train, num_parallel_calls = tf.data.experimental.AUTOTUNE)
dataset["train"] = dataset["train"].shuffle(buffer_size = buff_size,
seed = zeed)
dataset["train"] = dataset["train"].repeat()
dataset["train"] = dataset["train"].batch(bs)
dataset["train"] = dataset["train"].prefetch(buffer_size = AUTOTUNE)
#-- Validation Dataset --#
dataset["val"] = dataset["val"].map(load_image_test)
dataset["val"] = dataset["val"].repeat()
dataset["val"] = dataset["val"].batch(bs)
dataset["val"] = dataset["val"].prefetch(buffer_size = AUTOTUNE)
Now I wanted to use a weighted version of tf.keras.losses.SparseCategoricalCrossentropy for my model and I found this tutorial, which is rather similar to the example above.
However, they also offered a weighted version of the loss, using:
def add_sample_weights(image, label):
# The weights for each class, with the constraint that:
# sum(class_weights) == 1.0
class_weights = tf.constant([2.0, 2.0, 1.0])
class_weights = class_weights/tf.reduce_sum(class_weights)
# Create an image of `sample_weights` by using the label at each pixel as an
# index into the `class weights` .
sample_weights = tf.gather(class_weights, indices=tf.cast(label, tf.int32))
return image, label, sample_weights
I combined those approaches since the latter tutorial uses previously loaded data, while I want to draw the images from disc (not enough RAM to load all at once).
Resulting in the code from the first example (long code block above) followed by
# Attach a per-pixel weight map to an (image, mask) pair.
# Returns (image, segmentation_mask, sample_weights), where sample_weights is
# looked up from the normalised class weights using the mask values as indices.
def add_sample_weights(image, segmentation_mask):
# inv_weights is defined elsewhere in the script.
class_weights = tf.constant(inv_weights, dtype = tf.float32)
# Rescale so that the class weights sum to 1.
class_weights = class_weights/tf.reduce_sum(class_weights)
# NOTE(review): tf.gather requires every index to be < len(class_weights).
# Any mask value outside that range (presumably a no-data label -- confirm
# against parse_image) raises "indices[...] is not in [0, N)".
sample_weights = tf.gather(class_weights,
indices = tf.cast(segmentation_mask, tf.int32))
return image, segmentation_mask, sample_weights
(inv_weights are my weights, an array of 30 float64 values) and
epochs = 45, steps_per_epoch = np.ceil(N_img/bs),
validation_data = dataset["val"],
validation_steps = np.ceil(N_val/bs),
callbacks = cllbs)
When I run
as in the second example, I get an output that looks reasonable to me (similar to the one in the example):
(TensorSpec(shape=(None, 512, 512, 3), dtype=tf.float32, name=None),
TensorSpec(shape=(None, 512, 512, 1), dtype=tf.float32, name=None),
TensorSpec(shape=(None, 512, 512, 1), dtype=tf.float32, name=None))
However, when I try to fit the model or run something like
a, b, c = dataset["train"].map(add_sample_weights).take(1)
I will receive the error mentioned above.
So far, I have found quite some questions regarding this error (e.g., a, b, c, d), however, they all talk of "embedding layers" and things I am not aware of using.
Where does this error come from and how can I solve it?
Picture tf.gather as a fancy way to do indexing. The error you get is akin to the following example in python:
>>> my_list = [1,2,3]
>>> my_list[3]
IndexError: list index out of range
If you want to use tf.gather, then every value in your indices must be smaller than the size of the dimension of the Tensor you are indexing into.
In your case, in the call tf.gather(class_weights,indices = tf.cast(segmentation_mask, tf.int32)), with class_weights being a Tensor of dimension (30,), the values of segmentation_mask should lie between 0 and 29. As far as I can tell from your data pipeline, segmentation_mask has values ranging between 0 and 255. The fix will be problem dependent.

Get Each Layer Output in Keras Model for a Single Image

I would like to know how to get the output of each layer of a pre-trained CNN Keras model. What I am working on is to get the intermediate outputs of each layer in the model associated with a specific image I am providing to the model. Here is what I did:
model = load_model('model.h5')
img = Image.open('img.jpg')
img_array = np.array (img)
img_array = img_array/255
img_array = img_array.reshape(-1,512,512,1)
pred = model.predict(img_array)
I am just so confused about what to do next to print the output of each layer in such a case!
The outputs from layers could be collected by following the steps below:
from keras import backend as K
model = load_model('model.h5')
inp = model.input # input placeholder
out = [layer.output for layer in model.layers] # all layer outputs
get_outputs = K.function([inp, K.learning_phase()], out)
img = load_img('img.jpg')
x = img_to_array(img)
x = x.reshape((1,) + x.shape)
x /= 255.
layer_outs = get_outputs([x, 1.])
The intermediate representation of the input image img.jpg could be replicated by running the following code snippet:
from tensorflow.keras.preprocessing.image import img_to_array, load_img
model = load_model('model.h5')
# Define a new Model that will take an image as input, and will output
# intermediate representations for all layers except the first layer.
layer_outputs = [layer.output for layer in model.layers[1:]]
visual_model = tf.keras.models.Model(inputs = model.input, outputs = layer_outputs)
# Read your image
img = load_img('img.jpg')
x = img_to_array(img)
x = x.reshape((1,) + x.shape) # add one extra dimension to the front
x /= 255. # rescale by 1/255.
# run your image through the network; make a prediction
feature_maps = visual_model.predict(x)
# Plotting intermediate representations for your image
# Collect the names of each layer except the first one for plotting
layer_names = [layer.name for layer in model.layers[1:]]
# Plotting intermediate representation images layer by layer
for layer_name, feature_map in zip(layer_names, feature_maps):
if len(feature_map.shape) == 4: # skip fully connected layers
# number of features in an individual feature map
n_features = feature_map.shape[-1]
# The feature map is in shape of (1, size, size, n_features)
size = feature_map.shape[1]
# Tile our feature images in matrix `display_grid
display_grid = np.zeros((size, size * n_features))
# Fill out the matrix by looping over all the feature images of your image
for i in range(n_features):
# Postprocess each feature of the layer to make it pleasible to your eyes
x = feature_map[0, :, :, i]
x -= x.mean()
x /= x.std()
x *= 64
x += 128
x = np.clip(x, 0, 255).astype('uint8')
# We'll tile each filter into this big horizontal grid
display_grid[:, i * size : (i + 1) * size] = x
# Display the grid
scale = 20. / n_features
plt.figure(figsize=(scale * n_features, scale))
plt.imshow(display_grid, aspect='auto', cmap='viridis')

How to use flow_from_directory in Keras for multi-class semantic segmentation?

Let's say I have 100 training grayscale images and 100 RGB training masks, each of size 512x512. I was able to one-hot encode the masks using to_categorical in Keras with the below
where maskArr is a 100x512x512x1, and masks_one_hot is 100x512x512x3.
However, to use ImageDataGenerator and flow_from_directory using trainGenerator from https://github.com/zhixuhao/unet/blob/master/data.py, I tried to save the one-hot encoded training images and then read them using trainGenerator. However, I noticed after using imwrite on them and then reading them with imread, they changed from one-hot encoded 512x512x3 to 512x512x3 RGB images. That is, instead of each channel having a value of 0 or 1, they now range from 0-255
As a result, if I do:
myGenerator = trainGeneratorOneHot(20,'data/membrane/train','image','label',data_gen_args,save_to_dir = "data/membrane/train/aug", flag_multi_class = True,
num_class = 3, target_size=(512,512,3))
for i,batch in enumerate(myGenerator):
if(i >= num_batch):
where trainGeneratorOneHot is below:
def trainGeneratorOneHot(batch_size,...class_mode=None, image_class_mode=None):
image_datagen = ImageDataGenerator(**aug_dict)
mask_datagen = ImageDataGenerator(**aug_dict)
image_generator = image_datagen.flow_from_directory(train_path,classes = [image_folder], class_mode = image_class_mode, color_mode = image_color_mode,target_size = target_size, ...)
mask_generator = mask_datagen.flow_from_directory(train_path, classes = [mask_folder], class_mode = class_mode, target_size = target_size,...)
train_generator = zip(image_generator, mask_generator)
for (img,mask) in train_generator:
img,mask = adjustDataOneHot(img,mask)
yield (img,mask)
def adjustDataOneHot(img,mask):
return (img,mask)
Then I get `ValueError: could not broadcast input array from shape (512,512,1) into shape (512,512,3,1)`
How can I fix this?
Was dealing with the same issue a few days ago. I found it essential to make my own data generator class to deal with taking in data from a dataframe, augmenting it, and then one-hot-encoding it before passing it to my model. I was never able to get the Keras ImageDataGenerator to work for semantic segmentation problems with multiple classes.
Below is a data generator class in case it might help you out:
def one_hot_encoder(mask, num_classes = 8):
    """Binarise the first ``num_classes`` channels of a multi-channel mask.

    Args:
        mask: Array of shape (height, width, channels) with at least
            ``num_classes`` channels. A non-zero pixel in channel ``c``
            marks membership of class ``c``.
        num_classes: Number of leading channels to encode.

    Returns:
        A uint8 array with the same shape as ``mask``, holding 1 where the
        corresponding mask pixel is non-zero and 0 elsewhere. Channels
        beyond ``num_classes`` are left as zeros.

    Raises:
        IndexError: If ``mask`` is not 3-dimensional or has fewer than
            ``num_classes`` channels.
    """
    mask = np.asarray(mask)
    if mask.ndim != 3 or mask.shape[2] < num_classes:
        raise IndexError(
            "mask must have shape (H, W, C) with C >= {}; got {}".format(
                num_classes, mask.shape))
    hot_mask = np.zeros(shape = mask.shape, dtype = 'uint8')
    # One vectorised comparison replaces the per-channel loop, and honours
    # num_classes instead of a hard-coded channel count.
    hot_mask[:, :, :num_classes] = mask[:, :, :num_classes] != 0
    return hot_mask
# Image data generator class
class DataGenerator(keras.utils.Sequence):
    """Keras Sequence that builds image / one-hot-mask batches from a dataframe.

    The dataframe is read through its 'Images' and 'Masks' columns, whose
    values are passed to ``io.imread``. The helpers ``one_hot_encoder``,
    ``augs_for_images``, ``augs_for_masks`` and ``resize`` are defined
    elsewhere in the script.
    """

    def __init__(self, dataframe, batch_size, n_classes = 8, augment = False):
        """Store the dataframe and the batching options.

        Args:
            dataframe: Table with 'Images' and 'Masks' columns.
            batch_size: Number of samples per batch.
            n_classes: Number of mask classes to one-hot encode.
            augment: If True use the augmenting pipelines, otherwise only
                resize.
        """
        super().__init__()
        self.dataframe = dataframe
        self.batch_size = batch_size
        self.n_classes = n_classes
        self.augment = augment

    # Steps per epoch
    def __len__(self):
        return len(self.dataframe) // self.batch_size

    # Shuffles and resets the index at the end of training epoch
    def on_epoch_end(self):
        self.dataframe = self.dataframe.sample(frac = 1).reset_index(drop = True)

    # Generates data, feeds to training
    def __getitem__(self, index):
        """Return batch number ``index`` as ``(batch_x, batch_y)`` arrays."""
        processed_images = []
        processed_masks = []

        # Walk over the rows that belong to this batch; reading row `index`
        # for every sample would repeat one image batch_size times.
        start = index * self.batch_size
        for row in range(start, start + self.batch_size):
            # Positional access so a non-default dataframe index still works.
            the_image = io.imread(self.dataframe['Images'].iloc[row])
            the_mask = io.imread(self.dataframe['Masks'].iloc[row]).astype('uint8')
            one_hot_mask = one_hot_encoder(the_mask, self.n_classes)

            if self.augment:
                # Resizing followed by some augmentations
                # NOTE(review): if these pipelines apply random geometric
                # transforms independently, image and mask will be
                # misaligned -- confirm they share their randomness.
                processed_image = augs_for_images(image = the_image) / 255.0
                processed_mask = augs_for_masks(image = one_hot_mask)
            else:
                # Still resizing but no augmentations
                processed_image = resize(image = the_image) / 255.0
                processed_mask = resize(image = one_hot_mask)

            processed_images.append(processed_image)
            processed_masks.append(processed_mask)

        batch_x = np.array(processed_images)
        batch_y = np.array(processed_masks)
        return (batch_x, batch_y)
Also, here's a link to a repo with some semantic segmentation models that might be of interest to you. The notebook itself shows how the author dealt with multi-class semantic segmentation.

Image Generator for 3D volumes in keras with data augmentation

Since the ImageDataGenerator by keras is not suitable for 3D volumes, I started to write my own generator for keras (semantic segmentation, not classification!).
1) If there is anybody out there that has adapted the ImageDataGenerator code to work with 3D volumes, please share it! This guy has done it for videos.
2) According to this tutorial I wrote a custom generator.
import glob
import os
import keras
import numpy as np
import skimage
from imgaug import augmenters as iaa
class DataGenerator(keras.utils.Sequence):
    """Generates batches of 3D volumes and one-hot label volumes for Keras.

    This structure guarantees that the network will only train once on each
    sample per epoch.
    """

    def __init__(self, list_IDs, im_path, label_path, batch_size=4, dim=(128, 128, 128),
                 n_classes=4, shuffle=True, augment=False):
        """Store the configuration and build the first epoch's index order.

        Args:
            list_IDs: File names shared by the image and label folders.
            im_path: Folder containing the image volumes.
            label_path: Folder containing the label volumes.
            batch_size: Number of volumes per batch.
            dim: Shape of a single volume.
            n_classes: Number of label classes for the one-hot encoding.
            shuffle: Reshuffle the sample order after every epoch.
            augment: Apply random mirroring to every generated batch.
        """
        super().__init__()
        self.dim = dim
        self.batch_size = batch_size
        self.list_IDs = list_IDs
        self.im_path = im_path
        self.label_path = label_path
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.augment = augment
        # __getitem__ reads self.indexes, so it must exist before the first
        # batch is requested.
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        # Generate data
        X, y = self.__data_generation(list_IDs_temp)
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def _augment(self, X, Y):
        """Randomly mirror each sample along the volume axes, on the fly.

        The same flips are applied to a volume and its label so they stay
        aligned. Extend this method to add further augmentations.
        """
        for i in range(X.shape[0]):
            for axis in range(len(self.dim)):
                if np.random.rand() < 0.5:
                    # copy(): np.flip returns a view onto the same memory
                    # that is being assigned to.
                    X[i] = np.flip(X[i], axis=axis).copy()
                    Y[i] = np.flip(Y[i], axis=axis).copy()
        return X, Y

    def __data_generation(self, list_IDs_temp):
        """Load the volumes named in ``list_IDs_temp`` into batch arrays.

        Returns:
            ``(X, Y)`` with shapes ``(batch_size, *dim, 1)`` and
            ``(batch_size, *dim, n_classes)``.
        """
        # `import skimage` alone does not expose skimage.io on every version.
        import skimage.io

        X = np.empty([self.batch_size, *self.dim])
        Y = np.empty([self.batch_size, *self.dim, self.n_classes])
        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Use the instance paths rather than same-named module globals.
            img_X = skimage.io.imread(os.path.join(self.im_path, ID))
            X[i,] = img_X
            img_Y = skimage.io.imread(os.path.join(self.label_path, ID))
            Y[i,] = keras.utils.to_categorical(img_Y, num_classes=self.n_classes)
        # Append the trailing channel axis.
        X = X.reshape(self.batch_size, *self.dim, 1)
        if self.augment:
            X, Y = self._augment(X, Y)
        return X, Y
params = {'dim': (128, 128, 128),
'batch_size': 4,
'im_path': "some/path/for/the/images/",
'label_path': "some/path/for/the/label_images",
'n_classes': 4,
'shuffle': True,
'augment': True}
partition = {}
im_path = "some/path/for/the/images/"
label_path = "some/path/for/the/label_images/"
images = glob.glob(os.path.join(im_path, "*.tif"))
images_IDs = [name.split("/")[-1] for name in images]
partition['train'] = images_IDs
training_generator = DataGenerator(partition['train'], **params)
My images have the size (128, 128, 128) and when I load them in I get a 5D tensor of size (batch_size, depth, height, width, number_of_channels), e.g. (4, 128, 128, 128, 1). For the label_images (which have the same dimensions and are single-channel coded (value 1 = label 1, value 2 = label 2, value 3 = label 3 and value 0 = label 4 or background)) I get a binary representation of the labels with the to_categorical() function from keras and end up with a 5D tensor, e.g. (4, 128, 128, 128, 4). The images and label_images have the same name and are located in different folders.
As I only have very few images, I would like to extend the total number of images through image augmentation. How would I do that with this generator? I have successfully tested the imgaug package, but instead of adding images to my set I only transform the existing images (e.g. flip them horizontally)
Edit: I had a misconception about data augmentation. See this article about image augmentation. Images are passed in with random transformations applied on the fly. Now I just have to gather enough data and set the parameters with imgaug. I will update this soon.
I found an implementation of a Keras customDataGenerator for 3D volumes. Here is a GitHub link. The implementation can easily be expanded to include new augmentation techniques. Here is a minimal working example I am using in my project (3D volume semantic segmentation), based on the implementation I shared in the link:
from generator import customImageDataGenerator
def generator(images, groundtruth, batch):
"""Load a batch of augmented images"""
gen = customImageDataGenerator(mirroring=True,
for b in gen.flow(x=images, y=groundtruth, batch_size=batch):
yield (b[0], (b[1]).astype(float))
# images = (123, 48,48,48,1)
# groundtruth = (123, 48,48,48,1)
history = model.fit(
x=generator(images, groundtruth, batchSize),
validation_data=(imagesTest, groundtruthTest),
steps_per_epoch=len(images) / batchSize,

training a multi-output keras model

I have 10,000 images, each of which is labeled with 20 tags. For each image, each tag is either true or false. I'm trying to train a multi-output model to perform all these 20 binary classifications with one network.
The network is a Residual Network. After the flatten layer, the network branches out into 20 branches. Each branch has 2 fully connected layers, each of which are followed by a drop out layer. And finally a dense layer with one node and sigmoid activation in the end.
The labels for each image and the image name are stored in a text file, for both train and validation set. Like this:
1.jpg 1 -1 1 -1 -1 1 -1.........
I wrote my own generator, but I can't get them to work. I keep getting this error:
Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 20 array(s), but instead got the following list of 1 arrays.
Function explanations: get_input function reads an image and resizes it.
get_output prepares the labels for each image. The labels are stored in a list and returned in the end. preprocess_input performs preprocessing and converting images into arrays. train_generator and validation_generator generate batches with size 32 to be fed to the model.
Here's my code:
def get_input(img_name):
path = os.path.join("images", img_name)
img = image.load_img(path, target_size=(224, 224))
return img
def get_output(img_name, file_path, n_outputs=20):
    """Return the labels of one image as a list of single-element arrays.

    Args:
        img_name: Image file name whose stem is a 1-based integer id,
            e.g. "12.jpg" or "0012.jpg".
        file_path: Whitespace-separated label file without a header; the
            first column is the image name and the following columns are
            the labels. Row ``id - 1`` is used for image ``id``.
        n_outputs: Number of labels (model outputs) to return.

    Returns:
        List of ``n_outputs`` float arrays of shape (1,), one per label.

    Raises:
        IndexError: If the row holds fewer than ``n_outputs`` labels.
    """
    # sep=r"\s+" is the supported spelling of the deprecated
    # delim_whitespace=True.
    data = pd.read_csv(file_path, sep=r"\s+", header=None)
    # int() already copes with leading zeros; stripping them first turned
    # an id of "0" into an empty string.
    img_id = int(img_name.split(".")[0])
    # Drop the leading file-name column and keep the label columns.
    labels = data.loc[img_id - 1].values[1:1 + n_outputs]
    if len(labels) < n_outputs:
        raise IndexError(
            "expected {} labels for {!r}, found {}".format(
                n_outputs, img_name, len(labels)))
    # Every label becomes its own (1,) array; previously the arrays were
    # created but never collected, so the result was always empty.
    return [np.array([label], dtype=float) for label in labels]
def preprocess_input(img_name):
img = get_input(img_name)
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
return x
def _batch_generator(file_path, n_samples, batch_size, n_outputs=20):
    """Endlessly yield ``(x_batch, [y_0, ..., y_{n_outputs-1}])`` batches.

    Args:
        file_path: Whitespace-separated label file without a header; the
            first column is the image name, the next columns the labels.
        n_samples: Number of rows of the file to cycle through.
        batch_size: Number of samples per batch.
        n_outputs: Number of model outputs, i.e. label columns.

    Yields:
        Tuple of an image array of shape (batch_size, 224, 224, 3) and a
        list of ``n_outputs`` label arrays of shape (batch_size, 1). A
        model with several outputs needs one target array per output.
    """
    data = pd.read_csv(file_path, sep=r"\s+", header=None)
    while True:
        for i in range(n_samples // batch_size):
            # Size the buffers from batch_size rather than a fixed 32.
            x_batch = np.zeros(shape=(batch_size, 224, 224, 3))
            y_batch = np.zeros(shape=(batch_size, n_outputs))
            for j in range(batch_size):
                row = data.loc[i * batch_size + j].values
                x_batch[j, :, :, :] = preprocess_input(row[0])
                # Labels sit on the same row as the image name, so the
                # label file does not have to be re-read for every image.
                # NOTE(review): the file appears to store labels as 1/-1;
                # sigmoid outputs are usually trained on 0/1 -- confirm
                # whether a remapping is needed.
                y_batch[j] = row[1:1 + n_outputs].astype(float)
            # Split the label matrix into one (batch_size, 1) column per
            # model output.
            yield (x_batch, [y_batch[:, k:k + 1] for k in range(n_outputs)])


def train_generator(batch_size):
    """Yield training batches from "train.txt" (8000 samples) forever."""
    yield from _batch_generator("train.txt", 8000, batch_size)


def val_generator(batch_size):
    """Yield validation batches from "val.txt" (2000 samples) forever."""
    yield from _batch_generator("val.txt", 2000, batch_size)
One quick question. What's the difference between this loop and the one in your answer:
ys = []
for i in range(batch_size):
ys.append(y_batch[i, :])
yield(x_batch, ys)
If your model has 20 outputs then you must provide a list of 20 arrays as target. One way of doing this is to modify the generator (for both training and validation):
ys = []
for i in range(20):
yield(x_batch, ys)
As a side note, you mentioned that you have 20 tags per sample then why have you specified 40 in the input shape?
y_batch = np.zeros(shape=(32, 40))
Further, I don't know about the specific problem you are working on but alternatively you could only have one output of size 20 instead of 20 outputs with size one.
You can test the generator output dimensions initializing the generator and call the function next() to check the dimensions. For example with the train_generator:
train_gen = train_generator(batch_size)
x_batch, y_batch = next(train_gen)
Then check x_batch and y_batch dimensions and datatype
I would make the generator in this way:
def train_generator(batch_size):
file_path = "train.txt"
data = pd.read_csv(file_path, delim_whitespace=True, header=None)
# Initialize empty list
x_batch = []
y_batch = []
while True:
for i in range(math.floor(8000/batch_size)):
for j in range(batch_size):
img_name = data.loc[i * batch_size + j].values
img_name = img_name[0]
x = preprocess_input(img_name)
y = get_output(img_name, file_path)
yield(np.array(x_batch), np.array(y_batch))
