I am trying to predict several million images with my trained model using a predict_generator in python 3 with keras and tensorflow as backend. The generator and the model predictions work, however, some images in the directory are broken or corrupted and cause the predict_generator to stop and throw an error. Once the image is removed it works again until the next corrupted/broken image gets fed through the function.
Since there are so many images it is not feasible to run a script to open every image and delete the ones that are throwing an error. Is there a way to incorporate a "skip image if broken" argument into the generator or flow from directory function?
Any help is greatly appreciated!
There's no such argument in ImageDataGenerator and neither in flow_from_directory method as you can see int the Keras docs for both (here and here). One workaround would be to extend the ImageDataGenerator class and overload the flow_from_directory method to check wether the image is corrupted or not before yeld it in the generator. Here you can find it's source code.
Since it happens during prediction, if you skip any image or batch, you need to keep track of which images are skipped, so that you can correctly map the prediction scores to the image file name.
Based on this idea, my DataGenerator is implemented with a valid image index tracker. In particular, focus on the variable valid_index where index of valid images are tracked.
class DataGenerator(keras.utils.Sequence):
def __init__(self, df, batch_size, verbose=False, **kwargs):
self.verbose = verbose
self.df = df
self.batch_size = batch_size
self.valid_index = kwargs['valid_index']
self.success_count = self.total_count = 0
def __len__(self):
return int(np.ceil(self.df.shape[0] / float(self.batch_size)))
def __getitem__(self, idx):
print('generator is loading batch ',idx)
batch_df = self.df.iloc[idx * self.batch_size:(idx + 1) * self.batch_size]
self.total_count += batch_df.shape[0]
# return a list whose element is either an image array (when image is valid) or None(when image is corrupted)
x = load_batch_image_to_arrays(batch_df['image_file_names'])
# filter out corrupted images
tmp = [(u, i) for u, i in zip(x, batch_df.index.values.tolist()) if
u is not None]
# boundary case. # all image failed, return another random batch
if len(tmp) == 0:
print('[ERROR] All images loading failed')
# based on https://github.com/keras-team/keras/blob/master/keras/utils/data_utils.py#L621,
# Keras will automatically find the next batch if it returns None
return None
print('successfully loaded image in {}th batch {}/{}'.format(str(idx), len(tmp), self.batch_size))
self.success_count += len(tmp)
x, batch_index = zip(*tmp)
x = np.stack(x) # list to np.array
self.valid_index[idx] = batch_index
# follow preprocess input function provided by keras
x = resnet50_preprocess(np.array(x, dtype=np.float))
return x
def on_epoch_end(self):
print('total image count', self.total_count)
print('successful images count', self.success_count)
self.success_count = self.total_count = 0 # reset count after one epoch ends.
During prediction.
predictions = model.predict_generator(
generator=data_gen,
workers=10,
use_multiprocessing=False,
max_queue_size=20,
verbose=1
).squeeze()
indexes = []
for i in sorted(data_gen.valid_index.keys()):
indexes.extend(data_gen.valid_index[i])
result_df = df.loc[indexes]
result_df['score'] = predictions
Related
I am doing the end-to-end mapping. As I have to pass two images (input and output), I have created a custom generator. My generator gets two same images with different resolutions. Right now I can only get 5 images to pass to training but I want to pass the whole generator so that all my data gets trained. As I am new to using generators and yield I don't the correct way to pass the whole generator.
import os
import numpy as np
import cv2
class image_gen():
def __init__(self, idir,odir,batch_size, shuffle = True):
self.batch_index=0
self.idir=idir
self.odir=odir# directory containing input images
self.batch_size=batch_size #batch size is number of samples in a batch
self.shuffle=shuffle # set to True to shuffle images, False for no shuffle
self.label_list=[] # initialize list to hold sequential list of total labels generated
self.image_list=[] # initialize list to hold sequential list of total images filenames generated
self.i_list=os.listdir(self.idir)
self.o_list=os.listdir(self.odir)# list of images in directory
def get_images(self): # gets a batch of input images, resizes input image to make target images
while True:
input_image_batch=[]
output_image_batch=[]# initialize list to hold a batch of target images
sample_count=len(self.i_list) # determine total number of images available
for i in range(self.batch_index * self.batch_size, (self.batch_index + 1) * self.batch_size ): #iterate for a batch
j=i % sample_count # cycle j value over range of available images
k=j % self.batch_size # cycle k value over batch size
if self.shuffle: # if shuffle select a random integer between 0 and sample_count-1 to pick as the image=label pair
m=np.random.randint(low=0, high=sample_count-1, size=None, dtype=int)
else:
m=j # no shuffle
#input
path_to_in_img=os.path.join(self.idir,self.i_list[m])
path_to_out_img=os.path.join(self.odir,self.o_list[m])
# define the path to the m th image
input_image=cv2.imread(path_to_in_img)
input_image=cv2.resize( input_image,(3200,3200))#create the target image from the input image
output_image=cv2.imread(path_to_out_img)
output_image=cv2.resize(output_image,(3200,3200))
input_image_batch.append(input_image)
output_image_batch.append(output_image)
input_image_array=np.array(input_image_batch)
input_image_array = input_image_array / 255.0
output_image_array=np.array(output_image_batch)
output_image_array = output_image_array /255.0
self.batch_index= self.batch_index + 1
yield (input_image_array, output_image_array )
if self.batch_index * self.batch_size > sample_count:
break
This is how i get the images
batch_size=5
idir=r'D:\\train'
odir=r'D:\\Train\\train'#
shuffle=True
gen=image_gen(idir,odir,batch_size,shuffle=True) # instantiate an instance of the class
input_images,output_images = next(gen.get_images())
This is how i train.This way i only train 5 images and not the whole dataset
model.fit(input_images,output_images,validation_data = (valin_images,valout_images),batch_size= 5,epochs = 100)
when i try to pass the whole dataset
model.fit(gen(),validation_data = (valin_images,valout_images),batch_size= 5,epochs = 1)
I get a error "image_gen" object is not callable. How should i pass the generator to model.fit()
The reason why you have this problem is because this error is raised when you try to access a image_gen as if it were a function, but in fact it is an object of a class.
In the first snippet you provided, you accessed in fact the method of the class which is indeed a generator, which yielded some numpy arrays that could be fed as input to the model. The second snippet however fails, because of the error described in the first paragraph.
Two possible solutions for your problem would be the following:
Use a Keras Sequence() generator.
Use a function as a generator (def my_generator(...)).
I personally recommend the first solution, as the Sequence() generator ensures that you only train once per each sample during an epoch, property which is not satisfied in case of simple function generators.
Solution for Keras Sequence() :
You need to override the Sequence class and then overwrite its methods. A complete example from the TensorFlow official documentation is:
from skimage.io import imread
from skimage.transform import resize
import numpy as np
import math
# Here, `x_set` is list of path to the images
# and `y_set` are the associated classes.
class CIFAR10Sequence(Sequence):
def __init__(self, x_set, y_set, batch_size):
self.x, self.y = x_set, y_set
self.batch_size = batch_size
def __len__(self):
return math.ceil(len(self.x) / self.batch_size)
def __getitem__(self, idx):
batch_x = self.x[idx * self.batch_size:(idx + 1) *
self.batch_size]
batch_y = self.y[idx * self.batch_size:(idx + 1) *
self.batch_size]
return np.array([
resize(imread(file_name), (200, 200))
for file_name in batch_x]), np.array(batch_y)
You can use the above code as a starting point for your solution. Incidentally, it is likely your network will not train with such huge image dimensions, you could also try to lower them.
A solution for simple generator could be:
def my_generator(path_to_dataset, other_argument):
...
...
yield image_1, image_2
train_generator = my_generator(path_to_train,argument_1)
val_generator = my_generator(path_to_val,argument_2)
model.fit(train_generator,
steps_per_epoch=len(training_samples) // BATCH_SIZE,
epochs=10, validation_data=val_generator,
validation_steps=len(validation_samples) // BATCH_SIZE)
I am training a model using custom generators, but just before finishing the first epoch, the model runs out of data. It gives me the following error:
Your input ran out of data; interrupting training. Make sure that your dataset or generator can generate at least (steps_per_epoch * epochs) batches (in this case, 8740 batches). You may need to use the repeat() function when building your dataset
I have four generators (one for the train data, and another for the train label. Same thing with validation). I then zip train & label together. This is the prototype of my generators. I got the idea from here:
import numpy as np
import nibabel as nib
from tensorflow import keras
import os
def weirddivision(n,d):
return np.array(n)/np.array(d) if d else 0
class ImgDataGenerator(keras.utils.Sequence):
def __init__(self, file_list, batch_size=8, shuffle=True):
"""Constructor can be expanded,
with batch size, dimentation etc.
"""
self.file_list = file_list
self.batch_size = batch_size
self.shuffle = shuffle
self.on_epoch_end()
def __len__(self):
'Take all batches in each iteration'
return int(np.floor(len(self.file_list) / self.batch_size))
def __getitem__(self, index):
'Get next batch'
# Generate indexes of the batch
indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
# single file
file_list_temp = [self.file_list[k] for k in indexes]
# Set of X_train and y_train
X = self.__data_generation(file_list_temp)
return X
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.file_list))
if self.shuffle == True:
np.random.shuffle(self.indexes)
def __data_generation(self, file_list_temp):
'Generates data containing batch_size samples'
train_loc = '/home/faruk/Desktop/BrainSeg/Dataset/Train/'
X = np.empty((self.batch_size,224,256,1))
# Generate data
for i, ID in enumerate(file_list_temp):
x_file_path = os.path.join(train_loc, ID)
img = np.load(x_file_path)
img = np.pad(img, pad_width=((14,13),(12,11)), mode='constant')
img = np.expand_dims(img,-1)
img = weirddivision(img, img.max())
# Store sample
X[i,] = img
return X
As mentioned, here I create four generators and zip them:
training_img_generator = ImgDataGenerator(train)
training_label_generator = LabelDataGenerator(train)
train_generator = zip(training_img_generator,training_label_generator)
val_img_generator = ValDataGenerator(val)
val_label_generator = ValLabelDataGenerator(val)
val_generator = zip(val_img_generator,val_label_generator)
Because the generator is generating data dynamically, I thought that maybe it was trying to generate more than what is actually available. Hence, I calculated the steps per epoch as follows and passed it to fit_generator:
batch_size = 8
spe = len(train)//batch_size # len(train) = 34965
val_spe = len(val)//batch_size # len(val) = 4347
History=model.fit_generator(generator=train_generator, validation_data=val_generator, epochs=2, steps_per_epoch=spe, validation_steps = val_spe, shuffle=True, verbose=1)
But still, this is not working. I have tried reducing the number of steps per epoch, and I am able to finish the first epoch, but the error then appears at the beginning of the second epoch. Apparently the generator needs to be repeated infinitely, but I don't know how to achieve this. Can I use an infinite while loop? If yes, where?
Try this:
train_generator = train_generator.repeat()
val_generator = val_generator.repeat()
I solved this. I was defining my Generator class as follows:
class ImgDataGenerator(keras.utils.Sequence)
However, my model was not sequential... It was functional. I solved this by creating my own custom generator without inheriting from the keras.utils.sequence.
I hope this is helpful to someone.
i'm using keras with a simple cnn model.
i want to add gaussian noise to images in training. i want to change the noise parameters (mean and sigma) every epoch,based on some function. for example,
in epoch 1 i want to add noise with sigma=1
in epoch 2 i want to add noise with sigma=2
in epoch 3 i want to add noise with sigma=3
# note-mean is always zero
and so on...
inefficient way to solve it is with a for loop, save and load the mode after every epoch and call augmentation function.
more efficient way will be with a custom callback or generator, which i didn't succeed to do
inefficient way:
total_num_of_epochs=100
def sigma_function(current_epoch):
sigma_fun=current_epoch/total_num_of_epochs
return sigma_fun
for i in range(total_num_of_epochs):
x_train += np.random.normal(mean=0,sigma=sigma_fun(i),size=x_train shape) # augment x_train based on sigma_function and current epochs
model.compile(...)
model.fit(x_train ,y_train...initial_epoch=i,epochs=i+1) #load the model
# from previous loop
save model
load model for next loop
the desired result (i tried with ImageDataGenerator but maybe callback can do):
def sigma_function(current_epoch):
sigma_fun=current_epoch/total_num_of_epochs
return sigma_fun
datagen=ImageDataGenerator(preprocessing_function=sigma_function)
datagen.fit(x_train)
model.fit_generator(... don't know what to put here)
edit
according to the proposed solution by Daniel Möller,i tried this way and still got an error
sigmaParam = 1
def apply_sigma(x):
return x + np.random.normal(mean=0,scale=sigmaParam,size=(3,32,32))
imgGen = ImageDataGenerator( preprocesing_function=apply_sigma)
generator = imgGen.flow_from_directory('data/train') # folder that contains
# only x_train and y_train
from keras.utils import Sequence
class SigmaGenerator(Sequence):
def __init__(self, keras_generator):
self.keras_generator = keras_generator
def __len__(self):
return len(self.keras_generator)
def __getitem__(self,i):
return self.keras_generator[i]
def on_epoch_end(self):
sigmaParam += 1
self.keras_generator.on_epoch_end()
training_generator = SigmaGenerator(generator)
model.fit_generator(training_generator,validation_data=(x_test,y_test),
steps_per_epoch=x_train.shape[0]//batch_size,epochs=100)
the error i get:
process finished with exit code -1073741819 (0xC0000005)
You can try this:
sigmaParam = 1
def applySigma(x):
return x + np.random.normal(mean=0,scale=sigmaParam,size=x.shape)
Create the original generator:
imgGen = ImageDataGenerator(..., preprocesing_function=apply_sigma)
generator = imgGen.flow_from_directory(....)
Create a custom generator to wrap the original one, replace its on_epoch_end method to update sigmaParam.
from keras.utils import Sequence
class SigmaGenerator(Sequence):
def __init__(self, keras_generator):
self.keras_generator = keras_generator
def __len__(self):
return len(self.keras_generator)
def __getitem__(self,i):
return self.keras_generator[i]
def on_epoch_end(self):
sigmaParam += 1
self.keras_generator.on_epoch_end()
training_generator = SigmaGenerator(generator)
I am trying to create a custom data generator and don't know how integrate the yield function combined with an infinite loop inside the __getitem__ method.
EDIT: After the answer I realized that the code I am using is a Sequence which doesn't need a yield statement.
Currently I am returning multiple images with a return statement:
class DataGenerator(tensorflow.keras.utils.Sequence):
def __init__(self, files, labels, batch_size=32, shuffle=True, random_state=42):
'Initialization'
self.files = files
self.labels = labels
self.batch_size = batch_size
self.shuffle = shuffle
self.random_state = random_state
self.on_epoch_end()
def __len__(self):
return int(np.floor(len(self.files) / self.batch_size))
def __getitem__(self, index):
# Generate indexes of the batch
indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
files_batch = [self.files[k] for k in indexes]
y = [self.labels[k] for k in indexes]
# Generate data
x = self.__data_generation(files_batch)
return x, y
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.files))
if self.shuffle == True:
np.random.seed(self.random_state)
np.random.shuffle(self.indexes)
def __data_generation(self, files):
imgs = []
for img_file in files:
img = cv2.imread(img_file, -1)
###############
# Augment image
###############
imgs.append(img)
return imgs
In this article I saw that yield is used in an infinite loop. I don't quite understand that syntax. How is the loop escaped?
You are using the Sequence API, which works a bit different than plain generators. In a generator function, you would use the yield keyword to perform iteration inside a while True: loop, so each time Keras calls the generator, it gets a batch of data and it automatically wraps around the end of the data.
But in a Sequence, there is an index parameter to the __getitem__ function, so no iteration or yield is required, this is performed by Keras for you. This is made so the sequence can run in parallel using multiprocessing, which is not possible with old generator functions.
So you are doing things the right way, there is no change needed.
Example of generator in Keras:
def datagenerator(images, labels, batchsize, mode="train"):
while True:
start = 0
end = batchsize
while start < len(images):
# load your images from numpy arrays or read from directory
x = images[start:end]
y = labels[start:end]
yield x, y
start += batchsize
end += batchsize
Keras wants you to have the infinite loop running in the generator.
If you want to learn about Python generators, then the link in the comments is actually a good place to start.
I'm trying to use the pretrained InceptionV3 model to classify the food-101 dataset, which containts food images for 101 categories, 1000 per category. I've preprocessed this dataset into a single hdf5 file (I assumed this is beneficial compared to loading images on the go when training) so far, which has the following tables inside:
The data split is the standard 70% train, 20% validation, 10% test, so for example the valid_img has a size of 20200*299*299*3. The labels are onehotencoded for Keras, so valid_labels has a size of 20200*101.
This hdf5 file has a size of 27.1 GB, so it will not fit into my memory. (Have 8 GB of it, although effectively only probably 4-5 gigs is usable while running Ubuntu. Also my GPU is GTX 960 with 2 GB of VRAM, and so far it looked like 1.5 GB is available for python when I try to start the training script). I'm using Tensorflow backend.
The first idea I had is to use model.train_on_batch() with a double nested for loop like this:
#Loading InceptionV3, adding my fully connected layers, compiling model...
dataset = h5py.File('/home/uzoltan/PycharmProjects/food-101/food-101_299x299.hdf5', 'r')
epoch = 50
for i in range(epoch):
for i in range(100): #1000 images can fit in the memory easily, this could probably be range(10) too
train_images = dataset["train_img"][i * 706:(i + 1) * 706, ...]
train_labels = dataset["train_labels"][i * 706:(i + 1) * 706, ...]
val_images = dataset["valid_img"][i * 202:(i + 1) * 202, ...]
val_labels = dataset["valid_labels"][i * 202:(i + 1) * 202, ...]
model.train_on_batch(x=train_images, y=train_labels, class_weight=None,
sample_weight=None, )
My problem with this approach is that train_on_batch provides 0 support for validation or batch shuffling, so that the batches are not in the same order every epoch.
So I looked towards model.fit_generator() which has the nice property of providing all the same functionality as fit(), plus with the built in ImageDataGenerator you can do image augmentations (rotations, horizontal flips, etc.) at the same time with the CPU, so that your model can be more robust. My problem here is, that if I understand it correctly, the ImageDataGenerator.flow(x,y) method needs all the samples and labels at once, but my training/validation data wont fit into my RAM.
Here is where I think custom data generators come into the picture, but after looking extensively at some examples I could find on the Keras GitHub/Issues page, I still dont really get how should I implement a custom generator, which would read in batches of data from my hdf5 file. Can someone provide me with a good example or pointers? How could I couple the custom batch generator with the image augmentations? Or maybe is it easier to implement some kind of manual validation and batch shuffling for train_on_batch()? If so, I could use some pointer there too.
For anyone still looking for an answer, I made the following "crude wrapper" around ImageDataGeneator's apply_transform method.
from numpy.random import uniform, randint
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
import numpy as np
class CustomImagesGenerator:
def __init__(self, x, zoom_range, shear_range, rescale, horizontal_flip, batch_size):
self.x = x
self.zoom_range = zoom_range
self.shear_range = shear_range
self.rescale = rescale
self.horizontal_flip = horizontal_flip
self.batch_size = batch_size
self.__img_gen = ImageDataGenerator()
self.__batch_index = 0
def __len__(self):
# steps_per_epoch, if unspecified, will use the len(generator) as a number of steps.
# hence this
return np.floor(self.x.shape[0]/self.batch_size)
def next(self):
return self.__next__()
def __next__(self):
start = self.__batch_index*self.batch_size
stop = start + self.batch_size
self.__batch_index += 1
if stop > len(self.x):
raise StopIteration
transformed = np.array(self.x[start:stop]) # loads from hdf5
for i in range(len(transformed)):
zoom = uniform(self.zoom_range[0], self.zoom_range[1])
transformations = {
'zx': zoom,
'zy': zoom,
'shear': uniform(-self.shear_range, self.shear_range),
'flip_horizontal': self.horizontal_flip and bool(randint(0,2))
}
transformed[i] = self.__img_gen.apply_transform(transformed[i], transformations)
return transformed * self.rescale
It can be called like so:
import h5py
f = h5py.File("my_heavy_dataset_file.hdf5", 'r')
images = f['mydatasets/images']
my_gen = CustomImagesGenerator(
images,
zoom_range=[0.8, 1],
shear_range=6,
rescale=1./255,
horizontal_flip=True,
batch_size=64
)
model.fit_generator(my_gen)
If I understood you correctly, you want to use the data (which does not fit in the memory) from HDF5 and at the same time use data augmentation on it.
I'm in the same situation as you, and I found this code that maybe can be helpful with some few modifications:
https://gist.github.com/wassname/74f02bc9134897e3fe4e60784f5aaa15
this is my solution for shuffle data per epoch with h5 file.
indices means train or val index list.
def generator(h5path, indices, batchSize=128, is_train=True, aug=None):
db = h5py.File(h5path, "r")
with open("mean.json") as f:
mean = json.load(f)
meanV = np.array([mean["R"], mean["G"], mean["B"]])
while True:
np.random.shuffle(indices)
for i in range(0, len(indices), batchSize):
t0 = time()
batch_indices = indices[i:i+batchSize]
batch_indices.sort()
by = db["labels"][batch_indices,:]
bx = db["images"][batch_indices,:,:,:]
bx[:,:,:,0] -= meanV[0]
bx[:,:,:,1] -= meanV[1]
bx[:,:,:,2] -= meanV[2]
t1=time()
if is_train:
#bx = random_crop(bx, (224,224))
if aug is not None:
bx,by = next(aug.flow(bx,by,batchSize))
yield (bx,by)
h5path='all_224.hdf5'
model.fit_generator(generator(h5path, train_indices, batchSize=batchSize, is_train=True, aug=aug),
steps_per_epoch = 20000//batchSize,
validation_data= generator(h5path, test_indices, is_train=False, batchSize=batchSize),
validation_steps = 2424//batchSize,
epochs=args.epoch,
max_queue_size=100,
callbacks=[checkpoint, early_stop])
You want to write a function which loads images from the HDF5 and then yields (not returns) them as a numpy array. Here is a simple example which uses OpenCV to load images directly from .png/.jpg files in a given directory:
def generate_data(directory, batch_size):
"""Replaces Keras' native ImageDataGenerator."""
i = 0
file_list = os.listdir(directory)
while True:
image_batch = []
for b in range(batch_size):
if i == len(file_list):
i = 0
random.shuffle(file_list)
sample = file_list[i]
i += 1
image = cv2.resize(cv2.imread(sample[0]), INPUT_SHAPE)
image_batch.append((image.astype(float) - 128) / 128)
yield np.array(image_batch)
Obviously you will have to modify it to read from the HDF5 instead.
Once you have written your function, the usage is simply:
model.fit_generator(
generate_data('~/my_data', batch_size),
steps_per_epoch=len(os.listdir('~/my_data')) // batch_size)
Again modified to reflect the fact that you are reading from an HDF5 and not a directory.