Split queue into train/test set

Split queue into train/test set - python

I set up my pipeline starting with a filename queue as in the following pseudocode:
filename_queue = tf.train.string_input_producer(["file0.pd", "file1.pd"])
pointing to TFRecords containing multiple serialized tf.train.Example images.
Following the tensorflow guide a function which reads one example:
def read_my_file_format(filename_queue):
    """Read one record from the filename queue and return it decoded.

    Pseudocode: ``tf.SomeReader``, ``tf.some_decoder`` and
    ``some_processing`` stand in for a concrete reader, decoder and
    preprocessing step.

    :param filename_queue: queue of file names handed to the reader
    :return: tuple ``(processed_example, label)``
    """
    # The record key returned by the reader is not needed afterwards.
    _, serialized_record = tf.SomeReader().read(filename_queue)
    raw_example, label = tf.some_decoder(serialized_record)
    return some_processing(raw_example), label
which is used for a batch queue:
def input_pipeline(filenames, batch_size):
    """Build shuffled (example, label) batches from a list of files.

    :param filenames: file names fed to the filename queue
    :param batch_size: number of examples per produced batch
    :return: tuple ``(example_batch, label_batch)``
    """
    queue = tf.train.string_input_producer(filenames)
    single_example, single_label = read_my_file_format(queue)
    example_batch, label_batch = tf.train.shuffle_batch(
        [single_example, single_label],
        batch_size=batch_size,
        capacity=100,
        min_after_dequeue=10,
    )
    return example_batch, label_batch
I am looking for a way to split the data randomly into training and test sets. I don't want to save the training and test sets into different files; instead, each image should be randomly assigned to the training or the test set, independently of the file it is read from.
Ideally I would like to split the input pipeline into a training and test queue.
Here is what I normally do in numpy when I have to split a huge dataset
import numpy as np
from numpy.random import choice
from numpy.random import RandomState
queue = range(10)
weights = (.8, .2)  # create 2 partitions with these weights


def sampler(partition, seed=0, probabilities=None):
    """Return a predicate that is True for items drawn into ``partition``.

    Every call of the predicate draws one partition index from a private
    ``RandomState(seed)``. Predicates built with the same seed and the same
    probabilities therefore make the same draw for the n-th item they see,
    which is what keeps the partitions disjoint and exhaustive, as long as
    each predicate is applied to the same items in the same order.

    :param partition: index of the partition this predicate selects
    :param seed: seed of the private random generator
    :param probabilities: partition probabilities; defaults to the
        module-level ``weights`` when omitted
    :return: one-argument callable suitable for ``filter``
    """
    if probabilities is None:
        probabilities = weights
    rng = RandomState(seed)
    partition_ids = np.arange(len(probabilities))
    return lambda x: rng.choice(partition_ids, p=probabilities) == partition


def split(queue, weights):
    """Lazily split ``queue`` into one filtered iterator per weight.

    :param queue: re-iterable collection of items (it is iterated once per
        partition)
    :param weights: probabilities of the partitions; must sum to 1
    :return: list of lazy ``filter`` iterators, one per partition
    """
    # Pass the weights through explicitly so the split honours its own
    # argument instead of whatever the module-level ``weights`` happens to be.
    return [filter(sampler(partition, probabilities=weights), queue)
            for partition in range(len(weights))]


(train, test) = split(queue, weights)
print(list(train))  # [0, 1, 2, 3, 4, 5, 6, 9]
print(list(test))  # [7, 8]

Suggestion, using Tensorflow Dataset API (map(), interleave(), filter()):
import tensorflow as tf
import numpy as np
def _parse_function(example_proto):
    """Parse one serialized TFRecord example.

    :param example_proto: serialized ``tf.train.Example``
    :return: dict with a string "image" feature and an int64 "label" feature
    """
    feature_spec = {
        "image": tf.FixedLenFeature((), tf.string, default_value=""),
        "label": tf.FixedLenFeature((), tf.int64, default_value=0),
    }
    return tf.parse_single_example(example_proto, feature_spec)
def split_train_test(parsed_features, train_rate=0.8, seed=11):
    """Tag a sample as training or testing by a random draw.

    Adds a boolean scalar under the key 'is_train' to ``parsed_features``
    (the dict is modified in place) and returns it.

    :param parsed_features: dict of features of one sample
    :param train_rate: threshold the uniform draw is compared against
    :param seed: seed passed to the random op
    :return: the same dict, with the 'is_train' entry added
    """
    # Snippet by Igor Gadelha Pereira (https://stackoverflow.com/a/49825457/624547)
    draw = tf.random_uniform([1], seed=seed)
    below_rate = draw < train_rate
    # The draw has one element; gather index 0 to get a scalar flag.
    parsed_features['is_train'] = tf.gather(below_rate, 0)
    return parsed_features
def filter_per_split(parsed_features, train=True):
    """Return the predicate value used to keep a sample in one split.

    :param parsed_features: dict holding the boolean 'is_train' entry
    :param train: True to keep training samples, False to keep the others
    :return: the 'is_train' flag, or its negation when ``train`` is False
    """
    flag = parsed_features['is_train']
    if train:
        return flag
    return ~flag
def select_features(parsed_features, keys=("image", "label")):
    """Return the features selected by key, in the order of ``keys``.

    :param parsed_features: dict of features of one sample
    :param keys: iterable of keys to extract (an immutable tuple by default,
        so the default cannot be altered between calls)
    :return: list with one entry per key
    """
    return [parsed_features[key] for key in keys]
weights = (.8, .2)
num_files = 3
file_block_length = 1
files = ["/tmp/file{}.tfrecords".format(i) for i in range(num_files)]
# ... where file{i}.tfrecords contains:
# [{"label": i, "image": "class_{}/img_{}.png".format(i, k)} for k in range(10)]

# Create the dataset of TFRecord file names:
files = tf.data.Dataset.from_tensor_slices(files)

# Interleave all records:
dataset = files.interleave(lambda x: tf.data.TFRecordDataset(x),
                           cycle_length=num_files, block_length=file_block_length)
# ^ dataset containing:
# [rec0#file0, rec0#file1, rec0#file2, rec1#file0, rec1#file1, rec1#file2, ...]

# Parse TFRecord samples:
dataset = dataset.map(_parse_function)

# Randomly classify samples between training or testing:
dataset = dataset.map(lambda x: split_train_test(x, train_rate=weights[0]))

# Split into 2 datasets accordingly:
dataset_train = dataset.filter(lambda x: filter_per_split(x, train=True))
dataset_test = dataset.filter(lambda x: filter_per_split(x, train=False))

# Opt. remove "is_train" key, keeping only the original features:
dataset_train = dataset_train.map(select_features)
dataset_test = dataset_test.map(select_features)

# Use:
iterator_train = dataset_train.make_one_shot_iterator()
iterator_test = dataset_test.make_one_shot_iterator()

with tf.Session() as sess:
    for it, name in zip([iterator_train, iterator_test], ["Training", "Testing"]):
        x = it.get_next()
        count = 0
        print("{} Split:".format(name))
        try:
            while True:
                print(sess.run(x))
                count += 1
        except tf.errors.OutOfRangeError:
            # A one-shot iterator signals exhaustion with OutOfRangeError.
            # Catching only that keeps real failures (and Ctrl-C) from being
            # reported as a normal end of the split.
            # The message text is kept as-is so it matches the sample output.
            print("- End of Split ({} / {}".format(count, num_files * 10))
Output:
Training Split:
(b'class_0/img_0.png', 0)
(b'class_1/img_0.png', 1)
(b'class_2/img_0.png', 2)
(b'class_0/img_1.png', 0)
(b'class_1/img_1.png', 1)
(b'class_1/img_2.png', 1)
(b'class_2/img_2.png', 2)
(b'class_0/img_3.png', 0)
(b'class_1/img_3.png', 1)
(b'class_2/img_3.png', 2)
(b'class_1/img_4.png', 1)
(b'class_2/img_4.png', 2)
(b'class_0/img_5.png', 0)
(b'class_1/img_5.png', 1)
(b'class_2/img_5.png', 2)
(b'class_0/img_6.png', 0)
(b'class_1/img_6.png', 1)
(b'class_2/img_6.png', 2)
(b'class_0/img_7.png', 0)
(b'class_1/img_7.png', 1)
(b'class_2/img_7.png', 2)
(b'class_0/img_8.png', 0)
(b'class_1/img_8.png', 1)
(b'class_2/img_8.png', 2)
(b'class_0/img_9.png', 0)
(b'class_1/img_9.png', 1)
(b'class_2/img_9.png', 2)
- End of Split (27 / 30
Testing Split:
(b'class_2/img_1.png', 2)
(b'class_0/img_2.png', 0)
(b'class_0/img_4.png', 0)
- End of Split (3 / 30

Related

Use generator in TensorFlow/Keras to fit when the model gets 2 inputs

I want to train a model that uses an extra output layer to compute the loss (ArcFace) so the model gets two inputs: the features and the true label: [X, y].
So far I did with the all data loaded at once by the following code:
print("Unzipping DataSet to NumPy arrays")
x_train, y_train = dataset2arrays(train_ds, labels)
x_val, y_val = dataset2arrays(val_ds, val_labels)

# The model has two inputs (features and true labels), so the labels are
# passed both as the second input and as the target.
model.fit(
    x=[x_train, y_train],
    y=y_train,
    batch_size=10,
    # Validation data is given as an (inputs, targets) tuple.
    validation_data=([x_val, y_val], y_val),
    # Model.fit takes `epochs`; it has no `n_epochs` argument.
    epochs=20,
)
Now, this was done with "debugging" data, which is small (< 100 samples).
The real training data is very large (> 300 GB of files) so I can't load all the data at once.
Therefore I need to use a generator. In TensorFlow 2.8 a generator is implemented by inheriting from Keras Sequence class. The following generator is based on the example in https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
from os import path
import numpy as np
from tensorflow.keras.utils import Sequence
from keras.preprocessing.sequence import pad_sequences
from pre_processing import load_data
class DataGenerator(Sequence):
    """Keras ``Sequence`` that builds padded input batches on demand.

    Suitable as a data generator for both training and prediction: batches
    are loaded one at a time from the sample IDs.
    """

    def __init__(self, list_IDs, labels, n_classes, input_path, target_path,
                 to_fit=True, batch_size=20, shuffle=True):
        """Store the configuration and prepare the first epoch's index order.

        :param list_IDs: list of all 'label' ids to use in the generator
        :param labels: labels, addressed by the same positions as list_IDs
        :param n_classes: number of classes (stored only)
        :param input_path: directory joined with each ID to load its input
        :param target_path: target location (stored only)
        :param to_fit: True to return X and y, False to return X only
        :param batch_size: batch size at each iteration
        :param shuffle: True to shuffle label indexes after every epoch
        """
        self.list_IDs = list_IDs
        self.labels = labels
        self.n_classes = n_classes
        self.input_path = input_path
        self.target_path = target_path
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.to_fit = to_fit
        # Builds self.indexes for the first epoch.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch.

        :return: number of batches per epoch (a trailing partial batch is
            not counted)
        """
        n_batches = np.floor(len(self.list_IDs) / self.batch_size)
        return int(n_batches)

    def __getitem__(self, index):
        """Build the batch at position ``index``.

        :param index: index of the batch
        :return: ``([X], y)`` when fitting, ``[X]`` when predicting
        """
        first = index * self.batch_size
        batch_positions = self.indexes[first:first + self.batch_size]

        # Resolve the shuffled positions to sample ids and their labels.
        batch_ids = [self.list_IDs[pos] for pos in batch_positions]
        batch_labels = [self.labels[pos] for pos in batch_positions]

        X = self._generate_X(batch_ids)
        if not self.to_fit:
            return [X]
        y = self._generate_y(batch_labels)
        return [X], y

    def on_epoch_end(self):
        """Rebuild the sample order, shuffled when ``shuffle`` is set."""
        order = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(order)
        self.indexes = order

    def _generate_X(self, list_IDs_temp):
        """Load the inputs of one batch and pad them to a common length.

        :param list_IDs_temp: list of ids to load
        :return: batch of inputs, zero-padded at the end of each sequence
        """
        loaded = [load_data(path.join(self.input_path, ID))
                  for ID in list_IDs_temp]
        return pad_sequences(loaded, value=0, padding='post')

    def _generate_y(self, list_IDs_temp):
        """Return the targets of one batch.

        :param list_IDs_temp: per-sample label values of the batch
        :return: list holding those values unchanged
        """
        # TODO: modify
        return list(list_IDs_temp)
The most important part is:
if self.to_fit:
y = self._generate_y(list_labels_temp)
print(indexes)
# Option 1:
return [X], y
# Option 2
return tuple([[X], [y]])
# Option 3
return tuple(((X), (y)))
# Option 4
Xy = []
for i in range(len(y)):
Xy.append([X[i,:,:], y[i]])
return Xy
# Option 5
Xy = []
for i in range(len(y)):
Xy.append(X[i,:,:])
return tuple((Xy, y))
else:
return [X]
I tried all (or most) of these options as the value returned by the generator.
The new fit is:
# `gen` is a keras.utils.Sequence that already yields complete batches, so
# the batch size is set on the generator itself. Per the Model.fit
# documentation, `batch_size` must not be specified for generator/Sequence
# inputs.
history = model.fit(
    gen,
    callbacks=callbacks,
    epochs=20,
    # validation_data = tuple(validation_data),
    shuffle=True,
    verbose=1,  # display training on the terminal
)
With option 1 I get the following error:
ValueError: Layer "ForTraining" expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, None, None) dtype=int32>]
The other options don't work either (most of them raise the same error as above).
So what am I doing wrong?
So how to make my generator return correctly the tensor needed for training (features X and their labels y on batch-size b)?
The following link may be relevant: https://github.com/pierluigiferrari/ssd_keras/issues/380
Note that I am running TensorFlow 2.8 on Python 3.9.5 on a laptop with Windows 10 and without GPU (the real training on the full dataset will take place on a much stronger machine. This laptop is used only for debugging).
Solution:
The following solves the problem and now the training is running (when I comment out validation monitoring and callbacks):
def __getitem__(self, index):
    """Build the batch at position ``index``.

    :param index: index of the batch
    :return: ``((X, y), y)`` when fitting, so the labels are fed to the
        model both as its second input and as the target; ``[X]`` when
        predicting
    """
    first = index * self.batch_size
    batch_positions = self.indexes[first:first + self.batch_size]

    # Resolve the shuffled positions to sample ids and their labels.
    batch_ids = [self.list_IDs[pos] for pos in batch_positions]
    batch_labels = [self.labels[pos] for pos in batch_positions]

    X = self._generate_X(batch_ids)
    if not self.to_fit:
        # Prediction only
        return [X]

    # Training/Fit case: labels become a column of shape (batch, 1).
    targets = self._generate_y(batch_labels)
    y = np.array(targets).reshape((len(targets), 1))
    return (X, y), y
How do I use the generator for validation data? I created another generator (identical to the train generator) and put it in "validation data" and the training procedure was completed successfully (without throwing an exception). It seems this is the solution to the problem.

The correct modification is:
def __getitem__(self, index):
    """Build the batch at position ``index``.

    :param index: index of the batch
    :return: ``((X, y), y)`` when fitting, so the labels are fed to the
        model both as its second input and as the target; ``[X]`` when
        predicting
    """
    first = index * self.batch_size
    batch_positions = self.indexes[first:first + self.batch_size]

    # Resolve the shuffled positions to sample ids and their labels.
    batch_ids = [self.list_IDs[pos] for pos in batch_positions]
    batch_labels = [self.labels[pos] for pos in batch_positions]

    X = self._generate_X(batch_ids)
    if not self.to_fit:
        # Prediction only
        return [X]

    # Training/Fit case: labels become a column of shape (batch, 1).
    targets = self._generate_y(batch_labels)
    y = np.array(targets).reshape((len(targets), 1))
    return (X, y), y

Selecting according to labels in a TensorFlow generator

I have a very large dataset (VoxCeleb) and each datum has a label (multi-class, can assume that the label is a number between 1 to 5000) and an audio recording. Since it is too large to load entirely in one time, my strategy is to use a generator (in TensorFlow 1, it means using fit_generator instead of fit in the training. I, however, use TensorFlow 2.8 and Keras).
Usually, a generator selects the batch randomly (by shuffling the indices). I want the batch to be selected only semi-randomly, in the following sense:
In Each batch there are n_s total samples.
The batch contains n_c distinct labels/classes (they are chosen randomly).
Each label/class in the batch has n_p samples (utterances).
n_s = n_c * n_p
An epoch is running on all the labels, such that every label is seen at least once.
This a general DataGenerator class I modified:
from os import path
import numpy as np
from keras.utils import Sequence
from keras.preprocessing.sequence import pad_sequences
from pre_processing import load_data # customize function
class DataGenerator(Sequence):
    """Keras ``Sequence`` that builds padded input batches on demand.

    Suitable as a data generator for both training and prediction: batches
    are loaded one at a time from the sample IDs.
    """

    def __init__(self, list_IDs, labels, n_classes, input_path, target_path,
                 to_fit=True, batch_size=n_s, shuffle=True):
        """Store the configuration and prepare the first epoch's index order.

        :param list_IDs: list of all 'label' ids to use in the generator
        :param labels: mapping from an ID to its label
        :param n_classes: number of classes (stored only)
        :param input_path: directory joined with each ID to load its input
        :param target_path: target location, passed on to ``_load_target``
        :param to_fit: True to return X and y, False to return X only
        :param batch_size: batch size at each iteration
        :param shuffle: True to shuffle label indexes after every epoch
        """
        self.list_IDs = list_IDs
        self.labels = labels
        self.n_classes = n_classes
        self.input_path = input_path
        self.target_path = target_path
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.to_fit = to_fit
        # Builds self.indexes for the first epoch.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch.

        :return: number of batches per epoch (a trailing partial batch is
            not counted)
        """
        n_batches = np.floor(len(self.list_IDs) / self.batch_size)
        return int(n_batches)

    def __getitem__(self, index):
        """Build the batch at position ``index``.

        :param index: index of the batch
        :return: ``([X], y)`` when fitting, ``[X]`` when predicting
        """
        first = index * self.batch_size
        batch_positions = self.indexes[first:first + self.batch_size]
        batch_ids = [self.list_IDs[pos] for pos in batch_positions]

        X = self._generate_X(batch_ids)
        if not self.to_fit:
            return [X]
        y = self._generate_y(batch_ids)
        return [X], y

    def on_epoch_end(self):
        """Rebuild the sample order, shuffled when ``shuffle`` is set."""
        order = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(order)
        self.indexes = order

    def _generate_X(self, list_IDs_temp):
        """Load the inputs of one batch and pad them to a common length.

        :param list_IDs_temp: list of ids to load
        :return: batch of inputs, zero-padded at the end of each sequence
        """
        loaded = [self._load_input(self.input_path, ID)
                  for ID in list_IDs_temp]
        return pad_sequences(loaded, value=0, padding='post')

    def _generate_y(self, list_IDs_temp):
        """Look up the targets of one batch.

        :param list_IDs_temp: list of ids whose targets are wanted
        :return: list with one target per id
        """
        # TODO: modify
        return [self._load_target(self.target_path, ID)
                for ID in list_IDs_temp]

    def _load_input(self, input_path, ID):
        """Load the features stored under ``input_path``/``ID``."""
        return load_data(path.join(input_path, ID))

    def _load_target(self, target_path, ID):
        """Return the label stored for ``ID`` (``target_path`` is unused)."""
        return self.labels[ID]
Here I assume input_path is the directory where the audio files are saved, and ID is the file name. The function load_data is a custom function which reads the audio file and extracts some features (it returns a Tensor).
How can I write a generator that selects the n_s samples in each batch according to the specifications above? Just shuffling the indices and choosing randomly won't work here.
Added in Edit:
One approach is to randomly draw a large number of samples, n_big >> n_s, and then filter out samples until we have at least n_c labels with n_p samples each. However, this is not guaranteed to work, so it may be computationally expensive (for each batch, many random subsets may have to be tried).
Added in Edit:
I found something similar, but not what I need. This is a code in PyTorch (not TensorFlow), which creates a data generator for VoxCeleb:
https://github.com/clovaai/voxceleb_trainer/blob/master/DatasetLoader.py

Speaker Recognition TypeError: Input 'filename' of 'ReadFile' Op has type float32 that does not match expected type of string

I am running this code from the tutorial here:
https://keras.io/examples/audio/speaker_recognition_using_cnn/
with a custom dataset, that is divided in 2 datasets as in the tutorial. However, I got this error:
TypeError: Input 'filename' of 'ReadFile' Op has type float32 that does not match expected type of string.
Colab link: https://colab.research.google.com/drive/1bBplixcAu6iCMfQ4njNeByozjet3GkNf#scrollTo=5SGb0kVC3JGW
code
def paths_and_labels_to_dataset(audio_paths, labels):
    """Build a dataset of (decoded audio, label) pairs.

    :param audio_paths: audio file paths, one per sample
    :param labels: labels, in the same order as ``audio_paths``
    :return: dataset zipping the decoded audio with the labels
    """
    audio_ds = tf.data.Dataset.from_tensor_slices(audio_paths).map(
        lambda x: path_to_audio(x)
    )
    label_ds = tf.data.Dataset.from_tensor_slices(labels)
    return tf.data.Dataset.zip((audio_ds, label_ds))
def path_to_audio(path):
    """Read a WAV file and return its decoded audio.

    :param path: path of the audio file
    :return: decoded audio with one channel and SAMPLING_RATE samples
    """
    raw_bytes = tf.io.read_file(path)
    # decode_wav also returns the sample rate, which is not needed here.
    waveform, _ = tf.audio.decode_wav(raw_bytes, 1, SAMPLING_RATE)
    return waveform
def add_noise(audio, noises=None, scale=0.5):
    """Mix a randomly chosen noise sample into every audio sample.

    :param audio: batch of audio samples
    :param noises: collection of noise samples, or None to add no noise
    :param scale: factor applied to the rescaled noise before adding it
    :return: the noisy audio, or ``audio`` unchanged when ``noises`` is None
    """
    if noises is None:
        return audio

    # Pick, for every audio sample in the batch, a random index into the
    # available noise samples.
    batch_size = tf.shape(audio)[0]
    noise_indices = tf.random.uniform(
        (batch_size,), 0, noises.shape[0], dtype=tf.int32
    )
    noise = tf.gather(noises, noise_indices, axis=0)

    # Amplitude proportion between the audio and the noise, repeated along
    # axis 1 so it can be multiplied with the noise.
    audio_peak = tf.math.reduce_max(audio, axis=1)
    noise_peak = tf.math.reduce_max(noise, axis=1)
    prop = audio_peak / noise_peak
    prop = tf.repeat(tf.expand_dims(prop, axis=1), tf.shape(audio)[1], axis=1)

    # Add the rescaled noise to the audio.
    return audio + noise * prop * scale
def audio_to_fft(audio):
    """Return the magnitude of the positive-frequency half of the FFT.

    :param audio: batch of audio whose last axis has size one
    :return: absolute value of the first half of the FFT, with the last
        axis restored
    """
    # tf.signal.fft works on the innermost dimension, so drop the trailing
    # axis first and add it back after the transform.
    squeezed = tf.squeeze(audio, axis=-1)
    as_complex = tf.cast(
        tf.complex(real=squeezed, imag=tf.zeros_like(squeezed)), tf.complex64
    )
    spectrum = tf.expand_dims(tf.signal.fft(as_complex), axis=-1)

    # The first half of the FFT represents the positive frequencies.
    half = squeezed.shape[1] // 2
    return tf.math.abs(spectrum[:, :half, :])
# Get the list of audio file paths along with their corresponding labels
class_names = os.listdir(DATASET_AUDIO_PATH)
print("Our class names: {}".format(class_names,))

audio_paths = []
labels = []
for label, name in enumerate(class_names):
    print("Processing speaker {}".format(name,))
    dir_path = Path(DATASET_AUDIO_PATH) / name
    speaker_sample_paths = [
        os.path.join(dir_path, filepath)
        for filepath in os.listdir(dir_path)
        if filepath.endswith(".wav")
    ]
    audio_paths += speaker_sample_paths
    labels += [label] * len(speaker_sample_paths)

print(
    "Found {} files belonging to {} classes.".format(len(audio_paths), len(class_names))
)

# Shuffle: both generators use the same seed, so paths and labels are
# permuted identically and stay aligned.
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(audio_paths)
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(labels)

# Split into training and validation.
# Slice by an explicit index: `audio_paths[:-num_val_samples]` is empty when
# num_val_samples is 0, which would silently drop every training file.
num_val_samples = int(VALID_SPLIT * len(audio_paths))
num_train_samples = len(audio_paths) - num_val_samples

# An empty list of paths is turned into a float32 tensor by
# from_tensor_slices, and tf.io.read_file then fails with
# "Input 'filename' of 'ReadFile' Op has type float32 that does not match
# expected type of string". Fail early with a readable message instead.
if num_train_samples == 0 or num_val_samples == 0:
    raise ValueError(
        "Empty training or validation split: found {} '.wav' files under {!r} "
        "({} for training, {} for validation). Check the dataset path, the "
        "file extension and VALID_SPLIT.".format(
            len(audio_paths), DATASET_AUDIO_PATH, num_train_samples, num_val_samples
        )
    )

print("Using {} files for training.".format(len(audio_paths) - num_val_samples))
train_audio_paths = audio_paths[:num_train_samples]
train_labels = labels[:num_train_samples]

print("Using {} files for validation.".format(num_val_samples))
valid_audio_paths = audio_paths[num_train_samples:]
valid_labels = labels[num_train_samples:]

# Create 2 datasets, one for training and the other for validation
train_ds = paths_and_labels_to_dataset(train_audio_paths, train_labels)
train_ds = train_ds.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(
    BATCH_SIZE
)

valid_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels)
valid_ds = valid_ds.shuffle(buffer_size=32 * 8, seed=SHUFFLE_SEED).batch(32)

# Add noise to the training set
train_ds = train_ds.map(
    lambda x, y: (add_noise(x, noises, scale=SCALE), y),
    num_parallel_calls=tf.data.AUTOTUNE,
)

# Transform audio wave to the frequency domain using `audio_to_fft`
train_ds = train_ds.map(
    lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.AUTOTUNE
)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

valid_ds = valid_ds.map(
    lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.AUTOTUNE
)
valid_ds = valid_ds.prefetch(tf.data.AUTOTUNE)

How to split the data into training and testing data

Hi, right now I have data-loading code, and I'm not sure how to split the data into training and testing sets. Can anyone suggest how to do it? This is my data-loading code:
def __init__(self, root, specific_folder, img_extension, preprocessing_method=None, crop_size=(96, 112), train=True):
    """
    Dataloader of the LFW dataset.

    Collects the image paths of every person listed in ``people.txt`` with
    at least 20 images and encodes the person names as integer classes.

    root: path to the dataset to be used.
    specific_folder: specific folder inside the same dataset.
    img_extension: extension of the dataset images.
    preprocessing_method: string with the name of the preprocessing method.
    crop_size: retrieval network specific crop size.
    train: accepted but not used by this method.
    """
    self.preprocessing_method = preprocessing_method
    self.crop_size = crop_size
    self.model_align = None
    self.arr = []
    self.imgl_list = []
    self.classes = []
    self.people = []

    # people.txt lists each person's name and number of images, separated
    # by a tab; its first line is skipped.
    with open(os.path.join(root, 'people.txt')) as f:
        entries = f.read().splitlines()[1:]

    for entry in entries:
        fields = entry.split('\t')
        if len(fields) <= 1:
            continue
        name = fields[0]
        n_images = int(fields[1])
        # Keep only the people that have at least 20 images.
        if n_images < 20:
            continue
        for num_img in range(1, n_images + 1):
            file_name = name + '_' + '{:04}'.format(num_img) + '.' + img_extension
            self.imgl_list.append(os.path.join(root, specific_folder, name, file_name))
            self.classes.append(name)
            self.people.append(name)

    # Replace the person names in self.classes by integer class ids.
    self.classes = preprocessing.LabelEncoder().fit_transform(self.classes)
    print(len(self.imgl_list), len(self.classes), len(self.people))
def __getitem__(self, index):
    """Load, preprocess and normalize the image at ``index``.

    :param index: position of the sample in the image list
    :return: tuple of (list with the normalized image and its horizontally
        flipped copy as float tensors, class id, preprocessed image,
        bounding box returned by ``preprocess``, image path, person name)
    """
    img_path = self.imgl_list[index]
    imgl = imageio.imread(img_path)
    cl = self.classes[index]

    # A grayscale image is turned into RGB by repeating it 3 times.
    if len(imgl.shape) == 2:
        imgl = np.stack((imgl, imgl, imgl), 2)

    imgl, bb = preprocess(imgl, self.preprocessing_method, crop_size=self.crop_size,
                          is_processing_dataset=True, return_only_largest_bb=True,
                          execute_default=True)

    # The image and its mirror along axis 1, each normalized and moved to
    # channel-first layout.
    views = (imgl, imgl[:, ::-1, :])
    normalized = [((view - 127.5) / 128.0).transpose(2, 0, 1) for view in views]
    imgs = [torch.from_numpy(arr).float() for arr in normalized]

    return imgs, cl, imgl, bb, img_path, self.people[index]
def __len__(self):
    """Return the number of images in the dataset."""
    n_images = len(self.imgl_list)
    return n_images
I need to split this data into 20% and 80% parts so that I can test my model. I have been stuck on this for almost a week and still have no idea how to do it, so I would really appreciate any help.

In general using PyTorch:
import torch
import numpy as np
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
dataset = yourdatahere  # placeholder: replace with your Dataset instance
batch_size = 16  # change to whatever you'd like it to be
test_split = .2
shuffle_dataset = True
random_seed = 42

# Creating data indices for training and test splits:
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(test_split * dataset_size))
if shuffle_dataset:
    # Seed first so the same split is produced on every run.
    np.random.seed(random_seed)
    np.random.shuffle(indices)
# The first `split` indices form the test set, the rest the training set.
train_indices, test_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
# Both loaders wrap the same dataset; the samplers restrict each one to its
# own subset of indices.
train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                           sampler=train_sampler)
test_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                          sampler=test_sampler)

# Usage Example:
num_epochs = 10
for epoch in range(num_epochs):
    # Train:
    # NOTE: adapt the unpacking to whatever your dataset's __getitem__ returns.
    for batch_index, (faces, labels) in enumerate(train_loader):
        # ...
        pass  # placeholder so the example is syntactically complete
Please note that you should also split your training data into training + validation data. You may use the same logic from above to do so.

training a multi-output keras model

I have 10,000 images, each of which are labeled with 20 tags. For each image, the tag is either true or false. I'm trying to train a multi-output model to perform all these 20 binary classifications with one network.
The network is a Residual Network. After the flatten layer, the network branches out into 20 branches. Each branch has 2 fully connected layers, each of which are followed by a drop out layer. And finally a dense layer with one node and sigmoid activation in the end.
The labels for each image and the image name are stored in a text file, for both train and validation set. Like this:
1.jpg 1 -1 1 -1 -1 1 -1.........
I wrote my own generator, but I can't get them to work. I keep getting this error:
Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 20 array(s), but instead got the following list of 1 arrays.
Function explanations: get_input function reads an image and resizes it.
get_output prepares the labels for each image. The labels are stored in a list and returned in the end. preprocess_input performs preprocessing and converting images into arrays. train_generator and validation_generator generate batches with size 32 to be fed to the model.
Here's my code:
def get_input(img_name):
    """Load an image from the "images" directory, resized to 224x224.

    :param img_name: file name of the image
    :return: the loaded image
    """
    return image.load_img(os.path.join("images", img_name), target_size=(224, 224))
def get_output(img_name, file_path):
    """Return the 20 labels of an image as a list of single-element arrays.

    The label file is whitespace-separated with no header: the image name
    first, then its labels. The row is found from the numeric part of the
    image name: image ``N.jpg`` is read from row ``N - 1``.

    :param img_name: image file name such as "12.jpg" (leading zeros allowed)
    :param file_path: path of the label file
    :return: list of 20 float arrays of shape (1,), one per label
    """
    # sep=r"\s+" is the supported spelling of the deprecated
    # delim_whitespace=True.
    data = pd.read_csv(file_path, sep=r"\s+", header=None)

    # int() accepts leading zeros itself ("007" -> 7), so no stripping is
    # needed (stripping would turn an all-zero id into an empty string).
    img_id = int(img_name.split(".")[0])

    # Drop the first column (the image name) and keep the labels.
    labels = data.loc[img_id - 1].values[1:]
    return [np.array([labels[i]], dtype=np.float64) for i in range(20)]
def preprocess_input(img_name):
    """Load an image and return it as an array with a leading batch axis.

    :param img_name: file name of the image
    :return: image array with an extra axis of size one in front
    """
    pixels = image.img_to_array(get_input(img_name))
    return np.expand_dims(pixels, axis=0)
def train_generator(batch_size):
    """Endlessly yield training batches read from "train.txt".

    Yields ``(x_batch, y_batch)`` with shapes ``(batch_size, 224, 224, 3)``
    and ``(batch_size, 20)``. A model with 20 separate outputs needs the
    targets as a list of 20 arrays, e.g.
    ``[y_batch[:, k] for k in range(20)]``.

    :param batch_size: number of samples per batch
    """
    file_path = "train.txt"
    num_samples = 8000  # NOTE(review): assumes train.txt lists 8000 images
    data = pd.read_csv(file_path, sep=r"\s+", header=None)
    while True:
        for i in range(num_samples // batch_size):
            # Size the buffers by batch_size: a fixed 32 breaks for larger
            # batches and leaves all-zero rows for smaller ones.
            x_batch = np.zeros(shape=(batch_size, 224, 224, 3))
            y_batch = np.zeros(shape=(batch_size, 20))
            for j in range(batch_size):
                img_name = data.loc[i * batch_size + j].values[0]
                x_batch[j, :, :, :] = preprocess_input(img_name)
                # get_output returns 20 arrays of shape (1,); flatten them
                # into one row of 20 labels.
                y_batch[j] = np.ravel(get_output(img_name, file_path))
            yield (x_batch, y_batch)
def val_generator(batch_size):
    """Endlessly yield validation batches read from "val.txt".

    Yields ``(x_batch, y_batch)`` with shapes ``(batch_size, 224, 224, 3)``
    and ``(batch_size, 20)``. A model with 20 separate outputs needs the
    targets as a list of 20 arrays, e.g.
    ``[y_batch[:, k] for k in range(20)]``.

    :param batch_size: number of samples per batch
    """
    file_path = "val.txt"
    num_samples = 2000  # NOTE(review): assumes val.txt lists 2000 images
    data = pd.read_csv(file_path, sep=r"\s+", header=None)
    while True:
        for i in range(num_samples // batch_size):
            # Size the buffers by batch_size: a fixed 32 breaks for larger
            # batches and leaves all-zero rows for smaller ones.
            x_batch = np.zeros(shape=(batch_size, 224, 224, 3))
            y_batch = np.zeros(shape=(batch_size, 20))
            for j in range(batch_size):
                img_name = data.loc[i * batch_size + j].values[0]
                x_batch[j, :, :, :] = preprocess_input(img_name)
                # get_output returns 20 arrays of shape (1,); flatten them
                # into one row of 20 labels.
                y_batch[j] = np.ravel(get_output(img_name, file_path))
            yield (x_batch, y_batch)
Edit:
One quick question. What's the difference between this loop and the one in your answer:
ys = []
for i in range(batch_size):
ys.append(y_batch[i, :])
yield(x_batch, ys)

If your model has 20 outputs then you must provide a list of 20 arrays as target. One way of doing this is to modify the generator (for both training and validation):
ys = []
for i in range(20):
ys.append(y_batch[:,i])
yield(x_batch, ys)
As a side note, you mentioned that you have 20 tags per sample then why have you specified 40 in the input shape?
y_batch = np.zeros(shape=(32, 40))
Further, I don't know about the specific problem you are working on but alternatively you could only have one output of size 20 instead of 20 outputs with size one.

You can test the generator output dimensions initializing the generator and call the function next() to check the dimensions. For example with the train_generator:
train_gen = train_generator(batch_size)
x_batch, y_batch = next(train_gen)
Then check x_batch and y_batch dimensions and datatype
I would make the generator in this way:
def train_generator(batch_size):
    """Endlessly yield training batches read from "train.txt".

    Each batch is collected in Python lists and converted to arrays when it
    is yielded.

    :param batch_size: number of samples per batch
    """
    file_path = "train.txt"
    data = pd.read_csv(file_path, sep=r"\s+", header=None)
    while True:
        for i in range(math.floor(8000/batch_size)):
            # Start every batch with empty lists; otherwise each yielded
            # batch would also contain all samples of the previous batches
            # and grow without bound.
            x_batch = []
            y_batch = []
            for j in range(batch_size):
                img_name = data.loc[i * batch_size + j].values
                img_name = img_name[0]
                x = preprocess_input(img_name)
                y = get_output(img_name, file_path)
                x_batch.append(x)
                y_batch.append(y)
            # NOTE(review): the resulting array shapes depend on what
            # preprocess_input and get_output return per sample; check them
            # with next() on the generator before training.
            yield (np.array(x_batch), np.array(y_batch))

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Split queue into train/test set - python

Related

Use generator in TensorFlow/Keras to fit when the model gets 2 inputs

Selecting according to labels in a TensorFlow generator

Speaker Recognition TypeError: Input 'filename' of 'ReadFile' Op has type float32 that does not match expected type of string

How to split the data into training and testing data

training a multi-output keras model

Categories

Resources