I am trying to create a custom data generator and don't know how to integrate the yield statement, combined with an infinite loop, inside the __getitem__ method.
EDIT: After the answer I realized that the code I am using is a Sequence which doesn't need a yield statement.
Currently I am returning multiple images with a return statement:
class DataGenerator(tensorflow.keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of images read from disk.

    Each item is one batch: the images named in ``files`` (read with OpenCV)
    together with the matching entries of ``labels``.
    """

    def __init__(self, files, labels, batch_size=32, shuffle=True, random_state=42):
        """Store the dataset description and build the first index order.

        Args:
            files: Indexable collection of image file paths.
            labels: Indexable collection of labels aligned with ``files``.
            batch_size: Number of samples per batch.
            shuffle: Whether to shuffle the sample order in ``on_epoch_end``.
            random_state: Seed of the random generator used for shuffling.
        """
        self.files = files
        self.labels = labels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.random_state = random_state
        # Dedicated generator, seeded once. Re-seeding the global NumPy RNG in
        # every on_epoch_end() call would produce the *same* order each epoch
        # and would also reset the random state of unrelated code.
        self._rng = np.random.RandomState(random_state)
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.files) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(images, labels)`` lists."""
        # Positions (into files/labels) that belong to this batch.
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        files_batch = [self.files[k] for k in indexes]
        y = [self.labels[k] for k in indexes]
        # Load the images for those positions.
        x = self.__data_generation(files_batch)
        return x, y

    def on_epoch_end(self):
        """Rebuild the index order; reshuffle it when ``shuffle`` is enabled."""
        self.indexes = np.arange(len(self.files))
        if self.shuffle:
            # Reproducible for a given random_state, but different every epoch.
            self._rng.shuffle(self.indexes)

    def __data_generation(self, files):
        """Read every file in ``files`` with ``cv2.imread`` and return the list."""
        imgs = []
        for img_file in files:
            # Flag -1 is passed through to OpenCV unchanged.
            img = cv2.imread(img_file, -1)
            ###############
            # Augment image
            ###############
            imgs.append(img)
        return imgs
In this article I saw that yield is used in an infinite loop. I don't quite understand that syntax. How is the loop escaped?
You are using the Sequence API, which works a bit differently from plain generators. In a generator function, you would use the yield keyword to perform iteration inside a while True: loop, so each time Keras calls the generator, it gets a batch of data, and the generator automatically wraps around at the end of the data.
But in a Sequence, there is an index parameter to the __getitem__ function, so no iteration or yield is required, this is performed by Keras for you. This is made so the sequence can run in parallel using multiprocessing, which is not possible with old generator functions.
So you are doing things the right way, there is no change needed.
Example of generator in Keras:
def datagenerator(images, labels, batchsize, mode="train"):
    """Yield ``(x, y)`` batches forever, wrapping around at the end of the data.

    Args:
        images: Sliceable collection of samples (list, NumPy array, ...).
        labels: Sliceable collection of labels aligned with ``images``.
        batchsize: Number of samples per batch; must be at least 1.
        mode: Unused; kept so existing callers keep working.

    Yields:
        ``(images[start:end], labels[start:end])``. The last batch of a pass
        is shorter when ``len(images)`` is not a multiple of ``batchsize``.

    Raises:
        ValueError: If ``batchsize`` is smaller than 1 or ``images`` is empty.
            Without this check the generator would spin forever without
            yielding (empty data) or yield empty batches forever.
    """
    if batchsize < 1:
        raise ValueError("batchsize must be at least 1")
    # len() instead of truthiness so NumPy arrays are accepted as well.
    if len(images) == 0:
        raise ValueError("images must not be empty")

    while True:
        # One full pass over the data, then start over.
        for start in range(0, len(images), batchsize):
            end = start + batchsize
            # load your images from numpy arrays or read from directory
            yield images[start:end], labels[start:end]
Keras wants you to have the infinite loop running in the generator.
If you want to learn about Python generators, then the link in the comments is actually a good place to start.
Related
I am doing end-to-end mapping. As I have to pass two images (input and output), I have created a custom generator. My generator gets two copies of the same image at different resolutions. Right now I can only get 5 images to pass to training, but I want to pass the whole generator so that the model is trained on all my data. As I am new to using generators and yield, I don't know the correct way to pass the whole generator.
import os
import numpy as np
import cv2
class image_gen():
    """Batch generator for paired input/output images kept in two directories."""

    def __init__(self, idir, odir, batch_size, shuffle=True, target_size=(3200, 3200)):
        """Record the directories and list their files.

        Args:
            idir: Directory containing the input images.
            odir: Directory containing the output (target) images.
            batch_size: Number of samples in a batch.
            shuffle: True to draw random image pairs, False to walk the
                file lists in order.
            target_size: Size passed to ``cv2.resize`` for both images.
        """
        self.batch_index = 0
        self.idir = idir  # directory containing input images
        self.odir = odir  # directory containing output images
        self.batch_size = batch_size  # batch size is number of samples in a batch
        self.shuffle = shuffle  # set to True to shuffle images, False for no shuffle
        self.target_size = target_size
        self.label_list = []  # initialize list to hold sequential list of total labels generated
        self.image_list = []  # initialize list to hold sequential list of total images filenames generated
        # os.listdir returns entries in arbitrary order. Both lists are sorted so
        # that position m names the same pair in each directory.
        # NOTE(review): assumes paired files sort to the same position — confirm.
        self.i_list = sorted(os.listdir(self.idir))
        self.o_list = sorted(os.listdir(self.odir))

    def _read_resized(self, path):
        """Read the image at ``path`` and resize it to ``self.target_size``."""
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals failure by returning None; fail with the
            # offending path instead of an opaque error inside cv2.resize.
            raise IOError("could not read image: {}".format(path))
        return cv2.resize(image, self.target_size)

    def get_images(self):
        """Yield ``(input_batch, output_batch)`` arrays for one pass over the data.

        Both arrays are divided by 255.0. With ``shuffle`` enabled every sample
        is drawn at random (with replacement); otherwise samples are taken in
        list order, wrapping around at the end of the list. The generator
        stops once a full pass has been produced and resets ``batch_index``
        so that a new ``get_images()`` call starts from the first batch.

        Raises:
            ValueError: If the input directory contained no files.
            IOError: If an image cannot be read.
        """
        sample_count = len(self.i_list)  # determine total number of images available
        if sample_count == 0:
            raise ValueError("no images found in {}".format(self.idir))

        while True:
            input_image_batch = []
            output_image_batch = []  # initialize list to hold a batch of target images
            first = self.batch_index * self.batch_size
            for i in range(first, first + self.batch_size):  # iterate for a batch
                if self.shuffle:
                    # randint's upper bound is exclusive, so pass sample_count
                    # itself; otherwise the last image could never be chosen.
                    m = np.random.randint(low=0, high=sample_count)
                else:
                    m = i % sample_count  # cycle over range of available images
                input_image_batch.append(
                    self._read_resized(os.path.join(self.idir, self.i_list[m])))
                output_image_batch.append(
                    self._read_resized(os.path.join(self.odir, self.o_list[m])))

            input_image_array = np.array(input_image_batch) / 255.0
            output_image_array = np.array(output_image_batch) / 255.0
            self.batch_index = self.batch_index + 1
            yield (input_image_array, output_image_array)

            # ">=" rather than ">": when the data divides evenly into batches,
            # ">" would emit one extra, wrapped-around batch.
            if self.batch_index * self.batch_size >= sample_count:
                self.batch_index = 0
                break
This is how i get the images
# Build the paired-image generator and pull a single batch out of it.
batch_size = 5
shuffle = True
idir = r'D:\\train'
odir = r'D:\\Train\\train'
# instantiate an instance of the class
gen = image_gen(idir, odir, batch_size, shuffle=True)
# next() on a fresh generator returns only its first batch
input_images, output_images = next(gen.get_images())
This is how I train. This way I only train on 5 images and not the whole dataset:
# Train on the arrays fetched from the generator (a single batch).
model.fit(
    input_images,
    output_images,
    validation_data=(valin_images, valout_images),
    batch_size=5,
    epochs=100,
)
when i try to pass the whole dataset
# NOTE: `gen` is an image_gen instance, not a function, so `gen()` raises
# TypeError ("'image_gen' object is not callable").
model.fit(
    gen(),
    validation_data=(valin_images, valout_images),
    batch_size=5,
    epochs=1,
)
I get the error "'image_gen' object is not callable". How should I pass the generator to model.fit()?
You have this problem because this error is raised when you try to call an image_gen instance as if it were a function, when in fact it is an object of a class.
In the first snippet you provided, you accessed in fact the method of the class which is indeed a generator, which yielded some numpy arrays that could be fed as input to the model. The second snippet however fails, because of the error described in the first paragraph.
Two possible solutions for your problem would be the following:
Use a Keras Sequence() generator.
Use a function as a generator (def my_generator(...)).
I personally recommend the first solution, as the Sequence() generator ensures that you only train once per each sample during an epoch, property which is not satisfied in case of simple function generators.
Solution for Keras Sequence() :
You need to override the Sequence class and then overwrite its methods. A complete example from the TensorFlow official documentation is:
from skimage.io import imread
from skimage.transform import resize
import numpy as np
import math
# Here, `x_set` is list of path to the images
# and `y_set` are the associated classes.
class CIFAR10Sequence(Sequence):
    """Sequence producing batches of resized images with their classes."""

    def __init__(self, x_set, y_set, batch_size):
        """Keep the image paths, their classes and the batch size."""
        self.x = x_set
        self.y = y_set
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(images, classes)`` arrays."""
        window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        batch_x = self.x[window]
        batch_y = self.y[window]
        # Every image is read from disk and resized to 200x200.
        images = [resize(imread(file_name), (200, 200)) for file_name in batch_x]
        return np.array(images), np.array(batch_y)
You can use the above code as a starting point for your solution. Incidentally, it is likely your network will not train with such huge image dimensions, you could also try to lower them.
A solution for simple generator could be:
def my_generator(path_to_dataset, other_argument):
    """Skeleton of a plain generator function yielding a pair of images."""
    # Placeholder for the loading logic.
    ...
    ...
    yield image_1, image_2
# One generator per split.
train_generator = my_generator(path_to_train, argument_1)
val_generator = my_generator(path_to_val, argument_2)

# Step counts are given explicitly as the number of full batches per split.
model.fit(
    train_generator,
    steps_per_epoch=len(training_samples) // BATCH_SIZE,
    epochs=10,
    validation_data=val_generator,
    validation_steps=len(validation_samples) // BATCH_SIZE,
)
I am training a model using custom generators, but just before finishing the first epoch, the model runs out of data. It gives me the following error:
Your input ran out of data; interrupting training. Make sure that your dataset or generator can generate at least (steps_per_epoch * epochs) batches (in this case, 8740 batches). You may need to use the repeat() function when building your dataset
I have four generators (one for the train data, and another for the train label. Same thing with validation). I then zip train & label together. This is the prototype of my generators. I got the idea from here:
import numpy as np
import nibabel as nib
from tensorflow import keras
import os
def weirddivision(n, d):
    """Divide ``n`` by ``d`` as NumPy arrays; return 0 when ``d`` is falsy."""
    if not d:
        return 0
    numerator = np.array(n)
    denominator = np.array(d)
    return numerator / denominator
class ImgDataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that loads ``.npy`` images and serves them in batches."""

    def __init__(self, file_list, batch_size=8, shuffle=True,
                 data_dir='/home/faruk/Desktop/BrainSeg/Dataset/Train/'):
        """Constructor can be expanded, with batch size, dimension etc.

        Args:
            file_list: File names, relative to ``data_dir``.
            batch_size: Number of samples per batch.
            shuffle: Whether to reshuffle the sample order in ``on_epoch_end``.
            data_dir: Directory the files are loaded from. Defaults to the
                previously hard-coded training directory.
        """
        self.file_list = file_list
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.data_dir = data_dir
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.file_list) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as one array."""
        # Positions (into file_list) that belong to this batch.
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        file_list_temp = [self.file_list[k] for k in indexes]
        return self.__data_generation(file_list_temp)

    def on_epoch_end(self):
        """Rebuild the index order; shuffle it when ``shuffle`` is enabled."""
        self.indexes = np.arange(len(self.file_list))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, file_list_temp):
        """Load, pad and normalise the given files into one ``(n, 224, 256, 1)`` array."""
        # Sized by the files actually requested, so a short batch never
        # contains uninitialised rows left over from np.empty.
        X = np.empty((len(file_list_temp), 224, 256, 1))
        for i, ID in enumerate(file_list_temp):
            img = np.load(os.path.join(self.data_dir, ID))
            # Zero-pad 27 rows and 23 columns in total.
            # NOTE(review): presumably brings 197x233 inputs to 224x256 — confirm.
            img = np.pad(img, pad_width=((14, 13), (12, 11)), mode='constant')
            img = np.expand_dims(img, -1)
            # Scale by the image maximum; weirddivision yields 0 when the maximum is 0.
            X[i] = weirddivision(img, img.max())
        return X
As mentioned, here I create four generators and zip them:
# Training data: one generator for images, one for labels, paired with zip().
# NOTE(review): zip() returns a one-pass iterator, which may be why training
# runs out of data — confirm.
training_img_generator = ImgDataGenerator(train)
training_label_generator = LabelDataGenerator(train)
train_generator = zip(training_img_generator, training_label_generator)

# Validation data, built the same way.
val_img_generator = ValDataGenerator(val)
val_label_generator = ValLabelDataGenerator(val)
val_generator = zip(val_img_generator, val_label_generator)
Because the generator is generating data dynamically, I thought that maybe it was trying to generate more than what is actually available. Hence, I calculated the steps per epoch as follows and passed it to fit_generator:
batch_size = 8
# Number of full batches in each split.
spe = len(train) // batch_size  # len(train) = 34965
val_spe = len(val) // batch_size  # len(val) = 4347
History = model.fit_generator(
    generator=train_generator,
    validation_data=val_generator,
    epochs=2,
    steps_per_epoch=spe,
    validation_steps=val_spe,
    shuffle=True,
    verbose=1,
)
But still, this is not working. I have tried reducing the number of steps per epoch, and I am able to finish the first epoch, but the error then appears at the beginning of the second epoch. Apparently the generator needs to be repeated infinitely, but I don't know how to achieve this. Can I use an infinite while loop? If yes, where?
Try this:
# Rebind both generators to their repeated versions.
# NOTE(review): presumably assumes these objects expose a tf.data-style
# repeat() method — confirm against how the generators were built.
train_generator = train_generator.repeat()
val_generator = val_generator.repeat()
I solved this. I was defining my Generator class as follows:
class ImgDataGenerator(keras.utils.Sequence)
However, my model was not sequential... It was functional. I solved this by creating my own custom generator without inheriting from keras.utils.Sequence.
I hope this is helpful to someone.
I'm trying to train a deep learning model without loading the entire dataset into memory. My main question is, what's the best way of doing this?
It seems like HDF5 is a common method that people accomplish this, and is what I tried first. However when using pytorch's dataloader class, this ran extremely slowly. I created my own iterator which ran faster, however the data is not randomized every batch. I'm trying to understand why the pytorch dataloader is running slowly and if there is something I can do about it.
Below is my code
First I defined a dataset class that takes in a filepath to an HDF5 dataset. My understanding of this code is that it reads from disk whenever getitem is called.
class My_H5Dataset(torch.utils.data.Dataset):
    """Dataset backed by an HDF5 file that is opened lazily.

    The file is not kept open by ``__init__``; it is opened on first access.
    A handle opened in the parent process should not be shared by DataLoader
    worker processes, so each process that reads from the dataset opens its
    own handle the first time it needs one.
    """

    def __init__(self, file_path):
        """Remember ``file_path`` and read the number of samples.

        Args:
            file_path: Path of an HDF5 file containing the datasets
                ``features``, ``labels``, ``index``, ``labels_values`` and
                ``index_values``.
        """
        super(My_H5Dataset, self).__init__()
        self.file_path = file_path
        self._h5_file = None
        # Opened only long enough to read the length, then closed again.
        with h5py.File(file_path, 'r') as h5_file:
            self._length = h5_file['features'].shape[0]

    def _dataset(self, name):
        """Return the HDF5 dataset ``name``, opening the file on first use."""
        if self._h5_file is None:
            self._h5_file = h5py.File(self.file_path, 'r')
        return self._h5_file[name]

    # The attributes of the original class stay available as read-only properties.
    @property
    def features(self):
        return self._dataset('features')

    @property
    def labels(self):
        return self._dataset('labels')

    @property
    def index(self):
        return self._dataset('index')

    @property
    def labels_values(self):
        return self._dataset('labels_values')

    @property
    def index_values(self):
        return self._dataset('index_values')

    def __getstate__(self):
        """Drop the open handle when pickled; the copy reopens the file itself."""
        state = self.__dict__.copy()
        state['_h5_file'] = None
        return state

    def __getitem__(self, index):
        """Read the entries at ``index`` from the file and return them as tensors."""
        return (torch.from_numpy(self.features[index, :]).float(),
                torch.from_numpy(self.labels[index, :]).float(),
                torch.from_numpy(self.index[index, :]).float(),
                torch.from_numpy(np.array(self.labels_values[index])),
                torch.from_numpy(np.array(self.index_values[index])))

    def __len__(self):
        """Return the number of rows of the ``features`` dataset."""
        return self._length
Then I simply pass this into a pytorch dataloader as follows
# Dataset, custom sampler, and the DataLoader that combines them.
train_dataset = My_H5Dataset(hdf5_data_folder_train)
train_ms = MySampler(train_dataset)
trainloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=train_ms,
    num_workers=2,
)
My other method was to manually define an iterator. And this does run much faster. However I have not noticed a speed difference between the data stored in an HDD versus an SSD which makes me worried that there is a bottleneck somewhere that I am missing.
def hdf5_loader_generator(dataset, batch_size, as_tensor=True, n_samples=10000):
    """Yield consecutive batches of ``dataset`` forever, wrapping at ``n_samples``.

    Args:
        dataset: Object whose slice ``dataset[a:b]`` returns the 5-tuple
            ``(ft, sp, index, sp_values, index_values)``.
        batch_size: Number of samples per batch; must be at least 1.
        as_tensor: Unused; kept so existing callers keep working.
        n_samples: Number of leading samples to cycle over; must be at least 1.

    Yields:
        The 5-tuple for each batch. The last batch of a pass is shorter when
        ``n_samples`` is not a multiple of ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` or ``n_samples`` is smaller than 1
            (either would make the loop spin without ever yielding).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1")

    start = 0
    stop = n_samples
    curr_index = start
    while True:
        stop_index = min(curr_index + batch_size, stop)
        ft, sp, index, sp_values, index_values = dataset[curr_index:stop_index]
        # Wrap around for the next pass, but still yield the batch just read.
        # Previously the final batch of every pass was read and then discarded.
        curr_index = stop_index if stop_index < stop else start
        yield ft, sp, index, sp_values, index_values
def hdf5_data_iterator(dataset, batch_size, as_tensor=True, n_samples=10000):
    """Return an endless batch iterator over ``dataset``.

    Thin wrapper around ``hdf5_loader_generator``.

    Args:
        dataset: Sliceable dataset handed to ``hdf5_loader_generator``.
        batch_size: Number of samples per batch.
        as_tensor: Forwarded unchanged.
        n_samples: Number of leading samples to cycle over. Previously this
            could not be set from here, so the generator's default was
            always used.
    """
    # A generator is already its own iterator, so no iter() wrapper is needed.
    return hdf5_loader_generator(dataset, batch_size, as_tensor, n_samples=n_samples)
I timed pytorch's iterator as follows:
# Time the DataLoader: fetch batches and move them to the device until i > 500.
start = time.time()
for i, data in enumerate(trainloader):
    ft, sp, index, sp_values, index_values = data
    ft = ft.to(device)
    sp = sp.to(device)
    index = index.to(device)
    if i > 500:
        break
end = time.time()
print(end - start)
I timed my iterator as follows:
# Time the hand-written iterator with the same loop as the DataLoader timing.
train_dataset = My_H5Dataset(hdf5_data_folder_train)
samples = hdf5_data_iterator(train_dataset, batch_sizes)

start = time.time()
for i, data in enumerate(samples):
    ft, sp, index, sp_values, index_values = data
    ft = ft.to(device)
    sp = sp.to(device)
    index = index.to(device)
    if i > 500:
        break
end = time.time()
print(end - start)
Is there a better approach than what I am currently trying?
Thank you for the help!
i'm using keras with a simple cnn model.
i want to add gaussian noise to images in training. i want to change the noise parameters (mean and sigma) every epoch,based on some function. for example,
in epoch 1 i want to add noise with sigma=1
in epoch 2 i want to add noise with sigma=2
in epoch 3 i want to add noise with sigma=3
# note-mean is always zero
and so on...
An inefficient way to solve it is with a for loop: save and load the model after every epoch and call the augmentation function.
more efficient way will be with a custom callback or generator, which i didn't succeed to do
inefficient way:
total_num_of_epochs = 100


def sigma_function(current_epoch):
    """Return ``current_epoch`` as a fraction of ``total_num_of_epochs``."""
    return current_epoch / total_num_of_epochs
# Inefficient approach: train one epoch at a time, re-noising the data each time.
for i in range(total_num_of_epochs):
    # augment x_train based on sigma_function and current epochs
    # np.random.normal takes `loc`/`scale` (not `mean`/`sigma`). The noise is
    # added to a fresh copy so it does not accumulate in x_train across epochs.
    noise = np.random.normal(loc=0, scale=sigma_function(i), size=x_train.shape)
    x_noisy = x_train + noise
    model.compile(...)
    # load the model from previous loop
    model.fit(x_noisy, y_train, ..., initial_epoch=i, epochs=i + 1)
    # save model
    # load model for next loop
the desired result (i tried with ImageDataGenerator but maybe callback can do):
# Desired approach (sketch, not runnable as written): derive the noise sigma
# from the current epoch as a fraction of total_num_of_epochs.
def sigma_function(current_epoch):
sigma_fun=current_epoch/total_num_of_epochs
return sigma_fun
# NOTE(review): sigma_function takes an epoch number but is passed here as the
# per-image preprocessing hook — presumably this mismatch is the open question; confirm.
datagen=ImageDataGenerator(preprocessing_function=sigma_function)
datagen.fit(x_train)
# Placeholder call: the arguments are what the question asks about.
model.fit_generator(... don't know what to put here)
edit
according to the proposed solution by Daniel Möller,i tried this way and still got an error
# Current noise standard deviation; read by apply_sigma on every call.
sigmaParam = 1


def apply_sigma(x):
    """Return ``x`` plus zero-mean Gaussian noise with std ``sigmaParam``.

    Args:
        x: Image array of any shape.

    Returns:
        Array of the same shape as ``x``.
    """
    # np.random.normal names its parameters `loc` and `scale`; `mean=` raises
    # TypeError. The noise takes x's own shape instead of a fixed (3, 32, 32).
    return x + np.random.normal(loc=0, scale=sigmaParam, size=x.shape)
# The keyword is spelled `preprocessing_function`; the misspelled
# `preprocesing_function` is not an accepted argument.
imgGen = ImageDataGenerator(preprocessing_function=apply_sigma)
# folder that contains only x_train and y_train
generator = imgGen.flow_from_directory('data/train')
from keras.utils import Sequence
class SigmaGenerator(Sequence):
    """Wrap a Keras generator and increase ``sigmaParam`` after every epoch."""

    def __init__(self, keras_generator):
        """Keep a reference to the wrapped generator."""
        self.keras_generator = keras_generator

    def __len__(self):
        """Number of batches of the wrapped generator."""
        return len(self.keras_generator)

    def __getitem__(self, i):
        """Return batch ``i`` of the wrapped generator."""
        return self.keras_generator[i]

    def on_epoch_end(self):
        """Increase the module-level ``sigmaParam`` by 1 and notify the wrapped generator."""
        # Without `global`, the augmented assignment makes sigmaParam a local
        # name and raises UnboundLocalError.
        # NOTE(review): the update is only visible in this process; presumably
        # it will not reach worker processes under multiprocessing — confirm.
        global sigmaParam
        sigmaParam += 1
        self.keras_generator.on_epoch_end()
# Wrap the Keras generator and train from the wrapper.
training_generator = SigmaGenerator(generator)
model.fit_generator(
    training_generator,
    validation_data=(x_test, y_test),
    steps_per_epoch=x_train.shape[0] // batch_size,
    epochs=100,
)
the error i get:
process finished with exit code -1073741819 (0xC0000005)
You can try this:
# Current noise standard deviation; read by applySigma on every call.
sigmaParam = 1


def applySigma(x):
    """Return ``x`` plus zero-mean Gaussian noise with std ``sigmaParam``.

    Args:
        x: Image array of any shape.

    Returns:
        Array of the same shape as ``x``.
    """
    # np.random.normal names its parameters `loc` and `scale`; `mean=` raises TypeError.
    return x + np.random.normal(loc=0, scale=sigmaParam, size=x.shape)
Create the original generator:
# `...` stands for whatever other arguments you need. The keyword is spelled
# `preprocessing_function`, and the function defined for it is `applySigma`.
imgGen = ImageDataGenerator(..., preprocessing_function=applySigma)
generator = imgGen.flow_from_directory(...)
Create a custom generator to wrap the original one, replace its on_epoch_end method to update sigmaParam.
from keras.utils import Sequence
class SigmaGenerator(Sequence):
    """Wrap a Keras generator and increase ``sigmaParam`` after every epoch."""

    def __init__(self, keras_generator):
        """Keep a reference to the wrapped generator."""
        self.keras_generator = keras_generator

    def __len__(self):
        """Number of batches of the wrapped generator."""
        return len(self.keras_generator)

    def __getitem__(self, i):
        """Return batch ``i`` of the wrapped generator."""
        return self.keras_generator[i]

    def on_epoch_end(self):
        """Increase the module-level ``sigmaParam`` by 1 and notify the wrapped generator."""
        # Without `global`, the augmented assignment makes sigmaParam a local
        # name and raises UnboundLocalError.
        # NOTE(review): the update is only visible in this process; presumably
        # it will not reach worker processes under multiprocessing — confirm.
        global sigmaParam
        sigmaParam += 1
        self.keras_generator.on_epoch_end()


# Wrap the original generator; pass this object to training.
training_generator = SigmaGenerator(generator)
I am trying to predict several million images with my trained model using a predict_generator in python 3 with keras and tensorflow as backend. The generator and the model predictions work, however, some images in the directory are broken or corrupted and cause the predict_generator to stop and throw an error. Once the image is removed it works again until the next corrupted/broken image gets fed through the function.
Since there are so many images it is not feasible to run a script to open every image and delete the ones that are throwing an error. Is there a way to incorporate a "skip image if broken" argument into the generator or flow from directory function?
Any help is greatly appreciated!
There's no such argument in ImageDataGenerator, nor in the flow_from_directory method, as you can see in the Keras docs for both (here and here). One workaround would be to extend the ImageDataGenerator class and override the flow_from_directory method to check whether the image is corrupted before yielding it in the generator. Here you can find its source code.
Since it happens during prediction, if you skip any image or batch, you need to keep track of which images are skipped, so that you can correctly map the prediction scores to the image file name.
Based on this idea, my DataGenerator is implemented with a valid image index tracker. In particular, focus on the variable valid_index where index of valid images are tracked.
class DataGenerator(keras.utils.Sequence):
    """Prediction ``Sequence`` that skips unreadable images and records which rows were kept."""

    def __init__(self, df, batch_size, verbose=False, **kwargs):
        """Store the frame to predict on and the bookkeeping containers.

        Args:
            df: DataFrame with an ``image_file_names`` column.
            batch_size: Number of rows per batch.
            verbose: Stored on the instance; not otherwise used by this class.
            **kwargs: ``valid_index`` — dict that receives, per batch number,
                the ``df`` index labels of the images that loaded
                successfully. A new dict is created when it is omitted.
        """
        self.verbose = verbose
        self.df = df
        self.batch_size = batch_size
        # `is None` (not truthiness) so an empty dict supplied by the caller
        # is kept as the very same object.
        valid_index = kwargs.get('valid_index')
        self.valid_index = {} if valid_index is None else valid_index
        self.success_count = self.total_count = 0

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        return int(np.ceil(self.df.shape[0] / float(self.batch_size)))

    def __getitem__(self, idx):
        """Load batch ``idx``; return the preprocessed valid images or ``None``.

        The index labels of the images that loaded are stored in
        ``self.valid_index[idx]`` so predictions can be mapped back to rows.
        """
        print('generator is loading batch ', idx)
        batch_df = self.df.iloc[idx * self.batch_size:(idx + 1) * self.batch_size]
        self.total_count += batch_df.shape[0]
        # return a list whose element is either an image array (when image is valid) or None(when image is corrupted)
        x = load_batch_image_to_arrays(batch_df['image_file_names'])

        # filter out corrupted images
        tmp = [(u, i) for u, i in zip(x, batch_df.index.values.tolist()) if
               u is not None]

        # boundary case: all images failed
        if len(tmp) == 0:
            print('[ERROR] All images loading failed')
            # based on https://github.com/keras-team/keras/blob/master/keras/utils/data_utils.py#L621,
            # Keras will automatically find the next batch if it returns None
            return None

        print('successfully loaded image in {}th batch {}/{}'.format(str(idx), len(tmp), self.batch_size))
        self.success_count += len(tmp)

        x, batch_index = zip(*tmp)
        x = np.stack(x)  # list to np.array
        self.valid_index[idx] = batch_index

        # follow preprocess input function provided by keras
        # np.float was only an alias of the builtin float and has been removed
        # from NumPy; np.float64 is the same dtype.
        x = resnet50_preprocess(np.array(x, dtype=np.float64))
        return x

    def on_epoch_end(self):
        """Print the load statistics and reset both counters."""
        print('total image count', self.total_count)
        print('successful images count', self.success_count)
        self.success_count = self.total_count = 0  # reset count after one epoch ends.
During prediction.
# Run prediction over the generator; squeeze() drops the length-1 axes of the result.
raw_predictions = model.predict_generator(
    generator=data_gen,
    workers=10,
    use_multiprocessing=False,
    max_queue_size=20,
    verbose=1,
)
predictions = raw_predictions.squeeze()
# Flatten the per-batch lists of valid row labels, in ascending batch order.
indexes = [
    row_label
    for batch_id in sorted(data_gen.valid_index)
    for row_label in data_gen.valid_index[batch_id]
]
# Keep only the rows whose image was loaded and attach their scores.
result_df = df.loc[indexes]
result_df['score'] = predictions