Passing data from custom data generator to model.fit() - python

I am doing end-to-end mapping. As I have to pass two images (input and output), I have created a custom generator. My generator gets the same image at two different resolutions. Right now I can only get 5 images to pass to training, but I want to pass the whole generator so that the model is trained on all my data. As I am new to using generators and yield, I don't know the correct way to pass the whole generator.
import os
import numpy as np
import cv2
class image_gen():
    """Batch generator yielding paired (input, target) image arrays.

    Images are paired by position: the m-th file name found in ``idir`` is
    matched with the m-th file name found in ``odir``. Both listings are
    sorted so that this pairing does not depend on the arbitrary order in
    which ``os.listdir`` returns directory entries.
    """

    def __init__(self, idir, odir, batch_size, shuffle=True):
        """Store the configuration and list both image directories.

        :param idir: directory containing the input images
        :param odir: directory containing the target (output) images
        :param batch_size: number of samples in a batch
        :param shuffle: True to draw a random pair for every sample,
            False to walk through the files in order
        """
        self.batch_index = 0
        self.idir = idir
        self.odir = odir
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Kept for backward compatibility; nothing in this class fills them.
        self.label_list = []
        self.image_list = []
        self.i_list = sorted(os.listdir(self.idir))
        self.o_list = sorted(os.listdir(self.odir))

    def get_images(self):
        """Yield ``(input_batch, target_batch)`` tuples of image arrays.

        Every image is read with ``cv2.imread``, resized to 3200x3200 and
        divided by 255.0. The generator stops once
        ``batch_index * batch_size`` reaches the number of input files.
        The batch counter is stored on the instance and is not reset here,
        so a second call continues where the previous one stopped.

        :return: generator of ``(input_image_array, output_image_array)``
        """
        sample_count = len(self.i_list)  # total number of images available
        while True:
            input_image_batch = []
            output_image_batch = []
            first = self.batch_index * self.batch_size
            for i in range(first, first + self.batch_size):
                if self.shuffle:
                    # `high` is exclusive, so pass sample_count itself;
                    # sample_count - 1 would never select the last image.
                    m = np.random.randint(low=0, high=sample_count,
                                          size=None, dtype=int)
                else:
                    # No shuffle: wrap around so the last batch is filled.
                    m = i % sample_count
                path_to_in_img = os.path.join(self.idir, self.i_list[m])
                path_to_out_img = os.path.join(self.odir, self.o_list[m])
                input_image = cv2.imread(path_to_in_img)
                input_image = cv2.resize(input_image, (3200, 3200))
                output_image = cv2.imread(path_to_out_img)
                output_image = cv2.resize(output_image, (3200, 3200))
                input_image_batch.append(input_image)
                output_image_batch.append(output_image)
            input_image_array = np.array(input_image_batch) / 255.0
            output_image_array = np.array(output_image_batch) / 255.0
            self.batch_index = self.batch_index + 1
            yield (input_image_array, output_image_array)
            # `>=` rather than `>`: when the data divides evenly into
            # batches, `>` would emit one extra batch of repeated images.
            if self.batch_index * self.batch_size >= sample_count:
                break
This is how i get the images
# Settings for the training-image generator.
batch_size = 5
idir = r'D:\\train'
odir = r'D:\\Train\\train'
shuffle = True

# Build the generator object, then pull one single batch out of it.
gen = image_gen(idir, odir, batch_size, shuffle=True)
input_images, output_images = next(gen.get_images())
This is how i train.This way i only train 5 images and not the whole dataset
# input_images/output_images hold the single batch (batch_size=5) pulled from the generator above.
model.fit(input_images,output_images,validation_data = (valin_images,valout_images),batch_size= 5,epochs = 100)
when i try to pass the whole dataset
# gen is an image_gen instance, so gen() calls the object itself; the batches come from gen.get_images().
model.fit(gen(),validation_data = (valin_images,valout_images),batch_size= 5,epochs = 1)
I get an error: "image_gen" object is not callable. How should I pass the generator to model.fit()?

The reason you have this problem is that this error is raised when you try to call image_gen as if it were a function, when in fact it is an instance of a class.
In the first snippet you provided, you accessed in fact the method of the class which is indeed a generator, which yielded some numpy arrays that could be fed as input to the model. The second snippet however fails, because of the error described in the first paragraph.
Two possible solutions for your problem would be the following:
Use a Keras Sequence() generator.
Use a function as a generator (def my_generator(...)).
I personally recommend the first solution, as the Sequence() generator ensures that the network trains on each sample only once per epoch, a property which is not guaranteed by simple function generators.
Solution for Keras Sequence() :
You need to override the Sequence class and then overwrite its methods. A complete example from the TensorFlow official documentation is:
from skimage.io import imread
from skimage.transform import resize
import numpy as np
import math
# Here, `x_set` is list of path to the images
# and `y_set` are the associated classes.
class CIFAR10Sequence(Sequence):
    """Keras ``Sequence`` serving batches of resized images with their labels."""

    def __init__(self, x_set, y_set, batch_size):
        """Keep the image paths, the associated classes and the batch size."""
        self.x = x_set
        self.y = y_set
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(images, labels)`` NumPy arrays.

        Each image is read from its path and resized to 200x200.
        """
        window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        batch_x = self.x[window]
        batch_y = self.y[window]
        images = [resize(imread(file_name), (200, 200))
                  for file_name in batch_x]
        return np.array(images), np.array(batch_y)
You can use the above code as a starting point for your solution. Incidentally, it is likely your network will not train with such huge image dimensions, you could also try to lower them.
A solution for simple generator could be:
def my_generator(path_to_dataset, other_argument):
    """Skeleton of a plain generator function yielding (input, target) pairs."""
    ...
    ...
    yield image_1, image_2


# One generator for training, one for validation.
train_generator = my_generator(path_to_train, argument_1)
val_generator = my_generator(path_to_val, argument_2)

# With plain generators the number of steps per epoch is given explicitly.
model.fit(
    train_generator,
    steps_per_epoch=len(training_samples) // BATCH_SIZE,
    epochs=10,
    validation_data=val_generator,
    validation_steps=len(validation_samples) // BATCH_SIZE,
)

Related

Selecting according to labels in a TensorFlow generator

I have a very large dataset (VoxCeleb) and each datum has a label (multi-class; assume that the label is a number between 1 and 5000) and an audio recording. Since it is too large to load entirely at one time, my strategy is to use a generator (in TensorFlow 1, this means using fit_generator instead of fit for training; I, however, use TensorFlow 2.8 and Keras).
Usually, a generator selects the batch randomly (by shuffling the indices). I want the batch to be selected only semi-randomly in the following sense:
In Each batch there are n_s total samples.
The batch contains n_c distinct labels/classes (they are chosen randomly).
Each label/class in the batch has n_p samples (utterances).
n_s = n_c * n_p
An epoch is running on all the labels, such that every label is seen at least once.
This a general DataGenerator class I modified:
from os import path
import numpy as np
from keras.utils import Sequence
from keras.preprocessing.sequence import pad_sequences
from pre_processing import load_data # customize function
class DataGenerator(Sequence):
    """Keras ``Sequence`` producing padded input batches and their labels.

    Can be used for training (inputs and targets are returned) as well as
    for prediction (inputs only), depending on ``to_fit``.
    """

    def __init__(self, list_IDs, labels, n_classes, input_path, target_path,
                 to_fit=True, batch_size=n_s, shuffle=True):
        """Store the configuration and build the first index order.

        :param list_IDs: list of all ids to use in the generator
        :param labels: mapping from an id to its label
        :param n_classes: number of classes (stored, not used in this class)
        :param input_path: directory the input files are loaded from
        :param target_path: stored and forwarded to ``_load_target``
        :param to_fit: True to return X and y, False to return X only
        :param batch_size: batch size at each iteration
        :param shuffle: True to shuffle the indexes after every epoch
        """
        self.list_IDs = list_IDs
        self.labels = labels
        self.n_classes = n_classes
        self.input_path = input_path
        self.target_path = target_path
        self.to_fit = to_fit
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch."""
        batches = np.floor(len(self.list_IDs) / self.batch_size)
        return int(batches)

    def __getitem__(self, index):
        """Generate one batch of data.

        :param index: index of the batch
        :return: ``[X], y`` when fitting, ``[X]`` when predicting
        """
        start = index * self.batch_size
        stop = (index + 1) * self.batch_size
        # Translate the (possibly shuffled) positions into ids.
        batch_ids = [self.list_IDs[k] for k in self.indexes[start:stop]]
        X = self._generate_X(batch_ids)
        if not self.to_fit:
            return [X]
        y = self._generate_y(batch_ids)
        return [X], y

    def on_epoch_end(self):
        """Rebuild the index array and reshuffle it when requested."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def _generate_X(self, list_IDs_temp):
        """Load the inputs of one batch and pad them to a common length.

        :param list_IDs_temp: list of ids to load
        :return: inputs padded with zeros at the end ('post')
        """
        loaded = [self._load_input(self.input_path, ID)
                  for ID in list_IDs_temp]
        return pad_sequences(loaded, value=0, padding='post')

    def _generate_y(self, list_IDs_temp):
        """Collect the targets of one batch.

        :param list_IDs_temp: list of ids to load
        :return: list with one target per id (not padded)
        """
        # TODO: modify
        return [self._load_target(self.target_path, ID)
                for ID in list_IDs_temp]

    def _load_input(self, input_path, ID):
        """Load the features of ``ID`` from ``input_path`` via ``load_data``."""
        return load_data(path.join(input_path, ID))

    def _load_target(self, target_path, ID):
        """Look up the label of ``ID``; ``target_path`` is not used."""
        return self.labels[ID]
Here I assume input_file is the directory where the audio files are saved, and ID is the file name. The function load_data is a customized function which reads the audio file and extracts some features (returns a Tensor).
How to write a such a generator that selects n_s samples in each batch according to the specifications above? Just shuffling indices and choosing randomly won't work here.
Added in Edit:
One approach is to randomly draw many samples, n_big >> n_s, and then filter out samples until we have at least n_c labels with n_p samples each. However, this is not guaranteed to work, so it may be computationally expensive (for each batch, many random subsets may have to be tried).
Added in Edit:
I found something similar, but not what I need. This is a code in PyTorch (not TensorFlow), which creates a data generator for VoxCeleb:
https://github.com/clovaai/voxceleb_trainer/blob/master/DatasetLoader.py

Keras data generators for image inpainting using autoencoder

I am trying to train an autoencoder for image inpainting where the input images are the corrupted ones, and the output images are the ground truth.
The dataset used is organized as:
/Dataset
/corrupted
img1.jpg
img2.jpg
.
.
/groundTruth
img1.jpg
img2.jpg
.
.
The number of images used is relatively large. How can I feed the data to the model using Keras image data generators? I checked flow_from_directory method but couldn't find a proper class_mode to use (each image in the 'corrupted' folder maps to the one with the same name in 'groundTruth' folder)
If there is no pre-built image data generator that provides the functionality you require, you can create your own custom data generator.
To do so, you must create your new data generator class by subclassing tf.keras.utils.Sequence. You are required to implement the __getitem__ and the __len__ methods in your new class. __len__ must return the number of batches in your dataset, while __getitem__ must return the elements in a single batch as a tuple.
You can read the official docs here. Below is a code example:
from skimage.io import imread
from skimage.transform import resize
import numpy as np
import math
# Here, `x_set` is list of path to the images
# and `y_set` are the associated classes.
class CIFAR10Sequence(Sequence):
    """Keras ``Sequence`` serving batches of resized images with their labels."""

    def __init__(self, x_set, y_set, batch_size):
        """Keep the image paths, the associated classes and the batch size."""
        self.x = x_set
        self.y = y_set
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(images, labels)`` NumPy arrays.

        Each image is read from its path and resized to 200x200.
        """
        window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        batch_x = self.x[window]
        batch_y = self.y[window]
        images = [resize(imread(file_name), (200, 200))
                  for file_name in batch_x]
        return np.array(images), np.array(batch_y)
Hope the answer was helpful!

Custom generator runs out of data even when steps_per_epoch specified

I am training a model using custom generators, but just before finishing the first epoch, the model runs out of data. It gives me the following error:
Your input ran out of data; interrupting training. Make sure that your dataset or generator can generate at least (steps_per_epoch * epochs) batches (in this case, 8740 batches). You may need to use the repeat() function when building your dataset
I have four generators (one for the train data, and another for the train label. Same thing with validation). I then zip train & label together. This is the prototype of my generators. I got the idea from here:
import numpy as np
import nibabel as nib
from tensorflow import keras
import os
def weirddivision(n, d):
    """Divide ``n`` by ``d`` as NumPy arrays; return 0 when ``d`` is falsy."""
    if not d:
        # Guard against division by zero (or a missing divisor).
        return 0
    numerator = np.array(n)
    denominator = np.array(d)
    return numerator / denominator
class ImgDataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that loads array files and serves them in batches.

    Every file is loaded with ``np.load``, zero-padded by
    ``((14, 13), (12, 11))``, given a trailing channel axis and divided by
    its own maximum before being written into a
    ``(batch_size, 224, 256, 1)`` array.
    """

    def __init__(self, file_list, batch_size=8, shuffle=True,
                 data_dir='/home/faruk/Desktop/BrainSeg/Dataset/Train/'):
        """Store the configuration and build the first index order.

        :param file_list: file names, relative to ``data_dir``
        :param batch_size: number of samples per batch
        :param shuffle: True to reshuffle the sample order after every epoch
        :param data_dir: directory the files are loaded from; the default is
            the location that used to be hard-coded in the loader
        """
        self.file_list = file_list
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.data_dir = data_dir
        self.on_epoch_end()

    def __len__(self):
        'Take all batches in each iteration'
        # Only full batches are counted; leftover files are dropped.
        return int(np.floor(len(self.file_list) / self.batch_size))

    def __getitem__(self, index):
        'Get next batch'
        # Positions of this batch in the (possibly shuffled) index array.
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        file_list_temp = [self.file_list[k] for k in indexes]
        return self.__data_generation(file_list_temp)

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.file_list))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, file_list_temp):
        'Generates data containing batch_size samples'
        X = np.empty((self.batch_size, 224, 256, 1))
        for i, ID in enumerate(file_list_temp):
            img = np.load(os.path.join(self.data_dir, ID))
            # Padding adds 27 rows and 23 columns; presumably the stored
            # arrays are 197x233 so the result is 224x256 - TODO confirm.
            img = np.pad(img, pad_width=((14, 13), (12, 11)), mode='constant')
            img = np.expand_dims(img, -1)
            # Scale by the image maximum; weirddivision returns 0 when the
            # maximum is 0.
            img = weirddivision(img, img.max())
            X[i,] = img
        return X
As mentioned, here I create four generators and zip them:
# One generator per stream; image and label batches are paired up with zip().
# NOTE(review): zip() returns a one-shot iterator, so it presumably ends after
# a single pass over the zipped generators - consistent with the
# "ran out of data" error quoted above. TODO confirm.
training_img_generator = ImgDataGenerator(train)
training_label_generator = LabelDataGenerator(train)
train_generator = zip(training_img_generator,training_label_generator)
val_img_generator = ValDataGenerator(val)
val_label_generator = ValLabelDataGenerator(val)
val_generator = zip(val_img_generator,val_label_generator)
Because the generator is generating data dynamically, I thought that maybe it was trying to generate more than what is actually available. Hence, I calculated the steps per epoch as follows and passed it to fit_generator:
batch_size = 8
# Whole batches available per epoch for training and validation.
spe = len(train) // batch_size  # len(train) = 34965
val_spe = len(val) // batch_size  # len(val) = 4347
History = model.fit_generator(
    generator=train_generator,
    validation_data=val_generator,
    epochs=2,
    steps_per_epoch=spe,
    validation_steps=val_spe,
    shuffle=True,
    verbose=1,
)
But still, this is not working. I have tried reducing the number of steps per epoch, and I am able to finish the first epoch, but the error then appears at the beginning of the second epoch. Apparently the generator needs to be repeated infinitely, but I don't know how to achieve this. Can I use an infinite while loop? If yes, where?
Try this:
# NOTE(review): in the question train_generator/val_generator are zip objects;
# .repeat() looks like the tf.data.Dataset method - verify it applies here.
train_generator = train_generator.repeat()
val_generator = val_generator.repeat()
I solved this. I was defining my Generator class as follows:
class ImgDataGenerator(keras.utils.Sequence)
However, my model was not sequential... It was functional. I solved this by creating my own custom generator without inheriting from the keras.utils.sequence.
I hope this is helpful to someone.

Keras predict_generator corrupted images

I am trying to predict several million images with my trained model using a predict_generator in python 3 with keras and tensorflow as backend. The generator and the model predictions work, however, some images in the directory are broken or corrupted and cause the predict_generator to stop and throw an error. Once the image is removed it works again until the next corrupted/broken image gets fed through the function.
Since there are so many images it is not feasible to run a script to open every image and delete the ones that are throwing an error. Is there a way to incorporate a "skip image if broken" argument into the generator or flow from directory function?
Any help is greatly appreciated!
There's no such argument in ImageDataGenerator, nor in the flow_from_directory method, as you can see in the Keras docs for both (here and here). One workaround would be to extend the ImageDataGenerator class and override the flow_from_directory method to check whether the image is corrupted before yielding it from the generator. Here you can find its source code.
Since it happens during prediction, if you skip any image or batch, you need to keep track of which images are skipped, so that you can correctly map the prediction scores to the image file name.
Based on this idea, my DataGenerator is implemented with a valid image index tracker. In particular, focus on the variable valid_index where index of valid images are tracked.
class DataGenerator(keras.utils.Sequence):
    """Prediction ``Sequence`` that skips images which failed to load.

    For every batch, the dataframe index labels of the images that loaded
    correctly are recorded in ``valid_index[batch_number]`` so predictions
    can be mapped back to their rows afterwards.
    """

    def __init__(self, df, batch_size, verbose=False, **kwargs):
        """Store the configuration.

        :param df: dataframe with an ``image_file_names`` column
        :param batch_size: number of rows per batch
        :param verbose: stored on the instance; not read by this class
        :param kwargs: must contain ``valid_index``, a dict-like object that
            receives the valid index labels of every batch
        :raises KeyError: if ``valid_index`` is not passed
        """
        self.verbose = verbose
        self.df = df
        self.batch_size = batch_size
        self.valid_index = kwargs['valid_index']
        self.success_count = self.total_count = 0

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return int(np.ceil(self.df.shape[0] / float(self.batch_size)))

    def __getitem__(self, idx):
        """Load batch ``idx`` and return the preprocessed valid images.

        :return: preprocessed image array, or None when every image of the
            batch failed to load
        """
        print('generator is loading batch ',idx)
        batch_df = self.df.iloc[idx * self.batch_size:(idx + 1) * self.batch_size]
        self.total_count += batch_df.shape[0]
        # Each element is an image array (valid image) or None (corrupted).
        x = load_batch_image_to_arrays(batch_df['image_file_names'])
        # Keep only the valid images together with their index labels.
        tmp = [(u, i) for u, i in zip(x, batch_df.index.values.tolist())
               if u is not None]
        if len(tmp) == 0:
            print('[ERROR] All images loading failed')
            # based on https://github.com/keras-team/keras/blob/master/keras/utils/data_utils.py#L621,
            # Keras will automatically find the next batch if it returns None
            return None
        print('successfully loaded image in {}th batch {}/{}'.format(str(idx), len(tmp), self.batch_size))
        self.success_count += len(tmp)
        x, batch_index = zip(*tmp)
        x = np.stack(x)  # list to np.array
        self.valid_index[idx] = batch_index
        # The np.float alias was removed from NumPy; the builtin float is
        # the same dtype (float64).
        x = resnet50_preprocess(np.array(x, dtype=float))
        return x

    def on_epoch_end(self):
        """Print the load statistics and reset the counters."""
        print('total image count', self.total_count)
        print('successful images count', self.success_count)
        self.success_count = self.total_count = 0  # reset count after one epoch ends.
During prediction.
predictions = model.predict_generator(
    generator=data_gen,
    workers=10,
    use_multiprocessing=False,
    max_queue_size=20,
    verbose=1,
).squeeze()

# Flatten the per-batch index labels in batch order so that they line up
# with the order of the predictions.
indexes = [
    row_label
    for batch_no in sorted(data_gen.valid_index.keys())
    for row_label in data_gen.valid_index[batch_no]
]
result_df = df.loc[indexes]
result_df['score'] = predictions

Keras custom data generator for large hdf5 file which does not fit into memory

I'm trying to use the pretrained InceptionV3 model to classify the food-101 dataset, which contains food images for 101 categories, 1000 per category. So far I've preprocessed this dataset into a single hdf5 file (I assumed this is beneficial compared to loading images on the go when training), which has the following tables inside:
The data split is the standard 70% train, 20% validation, 10% test, so for example the valid_img has a size of 20200*299*299*3. The labels are onehotencoded for Keras, so valid_labels has a size of 20200*101.
This hdf5 file has a size of 27.1 GB, so it will not fit into my memory. (Have 8 GB of it, although effectively only probably 4-5 gigs is usable while running Ubuntu. Also my GPU is GTX 960 with 2 GB of VRAM, and so far it looked like 1.5 GB is available for python when I try to start the training script). I'm using Tensorflow backend.
The first idea I had is to use model.train_on_batch() with a double nested for loop like this:
# Loading InceptionV3, adding my fully connected layers, compiling model...
dataset = h5py.File('/home/uzoltan/PycharmProjects/food-101/food-101_299x299.hdf5', 'r')
epoch = 50
for _ in range(epoch):
    # 1000 images can fit in the memory easily, this could probably be range(10) too
    for chunk in range(100):
        train_rows = slice(chunk * 706, (chunk + 1) * 706)
        val_rows = slice(chunk * 202, (chunk + 1) * 202)
        train_images = dataset["train_img"][train_rows, ...]
        train_labels = dataset["train_labels"][train_rows, ...]
        val_images = dataset["valid_img"][val_rows, ...]
        val_labels = dataset["valid_labels"][val_rows, ...]
        model.train_on_batch(
            x=train_images,
            y=train_labels,
            class_weight=None,
            sample_weight=None,
        )
My problem with this approach is that train_on_batch provides 0 support for validation or batch shuffling, so that the batches are not in the same order every epoch.
So I looked towards model.fit_generator() which has the nice property of providing all the same functionality as fit(), plus with the built in ImageDataGenerator you can do image augmentations (rotations, horizontal flips, etc.) at the same time with the CPU, so that your model can be more robust. My problem here is, that if I understand it correctly, the ImageDataGenerator.flow(x,y) method needs all the samples and labels at once, but my training/validation data wont fit into my RAM.
Here is where I think custom data generators come into the picture, but after looking extensively at some examples I could find on the Keras GitHub/Issues page, I still dont really get how should I implement a custom generator, which would read in batches of data from my hdf5 file. Can someone provide me with a good example or pointers? How could I couple the custom batch generator with the image augmentations? Or maybe is it easier to implement some kind of manual validation and batch shuffling for train_on_batch()? If so, I could use some pointer there too.
For anyone still looking for an answer, I made the following "crude wrapper" around ImageDataGenerator's apply_transform method.
from numpy.random import uniform, randint
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
import numpy as np
class CustomImagesGenerator:
    """Iterator producing randomly transformed batches from an array-like.

    Batches are sliced from ``x`` (for example an HDF5 dataset), every image
    is passed through ``ImageDataGenerator.apply_transform`` with a random
    zoom, shear and optional horizontal flip, and the batch is multiplied
    by ``rescale``.
    """

    def __init__(self, x, zoom_range, shear_range, rescale, horizontal_flip, batch_size):
        """Store the augmentation settings.

        :param x: array-like of images supporting slicing, ``len`` and ``shape``
        :param zoom_range: ``[low, high]`` bounds of the random zoom factor
        :param shear_range: the shear is drawn from ``[-shear_range, shear_range]``
        :param rescale: factor every transformed batch is multiplied by
        :param horizontal_flip: True to flip images at random
        :param batch_size: number of images per batch
        """
        self.x = x
        self.zoom_range = zoom_range
        self.shear_range = shear_range
        self.rescale = rescale
        self.horizontal_flip = horizontal_flip
        self.batch_size = batch_size
        self.__img_gen = ImageDataGenerator()
        self.__batch_index = 0

    def __len__(self):
        """Return the number of full batches as a plain ``int``.

        steps_per_epoch, if unspecified, will use the len(generator) as a
        number of steps. ``len()`` rejects non-integer results, so integer
        division is used instead of ``np.floor`` (which returns a float).
        """
        return int(self.x.shape[0] // self.batch_size)

    def __iter__(self):
        """Return self so the object satisfies the iterator protocol."""
        return self

    def next(self):
        """Alias of ``__next__``."""
        return self.__next__()

    def __next__(self):
        """Return the next transformed batch.

        :raises StopIteration: when fewer than ``batch_size`` images remain
        """
        start = self.__batch_index * self.batch_size
        stop = start + self.batch_size
        self.__batch_index += 1
        if stop > len(self.x):
            raise StopIteration
        transformed = np.array(self.x[start:stop])  # loads from hdf5
        for i in range(len(transformed)):
            # The same zoom is applied on both axes.
            zoom = uniform(self.zoom_range[0], self.zoom_range[1])
            transformations = {
                'zx': zoom,
                'zy': zoom,
                'shear': uniform(-self.shear_range, self.shear_range),
                'flip_horizontal': self.horizontal_flip and bool(randint(0, 2)),
            }
            transformed[i] = self.__img_gen.apply_transform(transformed[i], transformations)
        return transformed * self.rescale
It can be called like so:
import h5py
# Open the dataset read-only; the images stay on disk until they are sliced.
f = h5py.File("my_heavy_dataset_file.hdf5", 'r')
images = f['mydatasets/images']

my_gen = CustomImagesGenerator(
    images,
    zoom_range=[0.8, 1],
    shear_range=6,
    rescale=1./255,
    horizontal_flip=True,
    batch_size=64,
)

model.fit_generator(my_gen)
If I understood you correctly, you want to use the data (which does not fit in the memory) from HDF5 and at the same time use data augmentation on it.
I'm in the same situation as you, and I found this code that maybe can be helpful with some few modifications:
https://gist.github.com/wassname/74f02bc9134897e3fe4e60784f5aaa15
this is my solution for shuffle data per epoch with h5 file.
indices means train or val index list.
def generator(h5path, indices, batchSize=128, is_train=True, aug=None):
    """Yield mean-subtracted ``(images, labels)`` batches from an HDF5 file forever.

    The sample order is reshuffled at the start of every pass. Inside a
    batch the indices are sorted before they are used to index the HDF5
    datasets; images and labels are read with the same sorted indices, so
    they stay aligned.

    :param h5path: path of the HDF5 file holding "images" and "labels"
    :param indices: train or val index list; it is copied, so the caller's
        sequence is not reordered
    :param batchSize: number of samples per batch
    :param is_train: augmentation is applied only when this is True
    :param aug: optional augmenter exposing ``flow(x, y, batch_size)``
    :return: generator of ``(bx, by)`` tuples
    """
    with open("mean.json") as f:
        mean = json.load(f)
    meanV = np.array([mean["R"], mean["G"], mean["B"]])
    order = list(indices)  # private copy, shuffled in place below
    # The context manager closes the file when the generator is closed.
    with h5py.File(h5path, "r") as db:
        while True:
            np.random.shuffle(order)
            for i in range(0, len(order), batchSize):
                batch_indices = sorted(order[i:i + batchSize])
                by = db["labels"][batch_indices, :]
                bx = db["images"][batch_indices, :, :, :]
                # Subtract the per-channel mean read from mean.json.
                bx[:, :, :, 0] -= meanV[0]
                bx[:, :, :, 1] -= meanV[1]
                bx[:, :, :, 2] -= meanV[2]
                if is_train and aug is not None:
                    bx, by = next(aug.flow(bx, by, batchSize))
                yield (bx, by)
h5path = 'all_224.hdf5'
model.fit_generator(
    generator(h5path, train_indices, batchSize=batchSize, is_train=True, aug=aug),
    steps_per_epoch=20000 // batchSize,
    validation_data=generator(h5path, test_indices, is_train=False, batchSize=batchSize),
    validation_steps=2424 // batchSize,
    epochs=args.epoch,
    max_queue_size=100,
    callbacks=[checkpoint, early_stop],
)
You want to write a function which loads images from the HDF5 and then yields (not returns) them as a numpy array. Here is a simple example which uses OpenCV to load images directly from .png/.jpg files in a given directory:
def generate_data(directory, batch_size):
    """Replaces Keras' native ImageDataGenerator.

    Yields batches of images from ``directory`` forever. The file list is
    walked in order and reshuffled every time its end is reached. Every
    image is resized to ``INPUT_SHAPE`` and mapped with ``(pixel - 128) / 128``.

    :param directory: directory containing the image files
    :param batch_size: number of images per yielded batch
    :return: generator of NumPy arrays, one array per batch
    """
    i = 0
    file_list = os.listdir(directory)
    while True:
        image_batch = []
        for _ in range(batch_size):
            if i == len(file_list):
                # End of the list: start over in a new random order.
                i = 0
                random.shuffle(file_list)
            sample = file_list[i]
            i += 1
            # os.listdir returns bare file names, so join them with the
            # directory; indexing the name (sample[0]) would pass only its
            # first character to imread.
            image = cv2.resize(cv2.imread(os.path.join(directory, sample)), INPUT_SHAPE)
            image_batch.append((image.astype(float) - 128) / 128)
        yield np.array(image_batch)
Obviously you will have to modify it to read from the HDF5 instead.
Once you have written your function, the usage is simply:
# os.listdir does not expand "~", so resolve the home directory first.
data_dir = os.path.expanduser('~/my_data')
model.fit_generator(
    generate_data(data_dir, batch_size),
    steps_per_epoch=len(os.listdir(data_dir)) // batch_size)
Again modified to reflect the fact that you are reading from an HDF5 and not a directory.

Categories