I am building a model with multiple inputs as shown in pyimagesearch, however I can't load all images into RAM and I am trying to create a generator that uses flow_from_directory and get from a CSV file all the extra attributes for each image being processed.
Question: How do I get the attributes from the CSV to correspond with the images in each batch from the image generator?
def get_combined_generator(images_dir, csv_dir, split, *args):
    """
    Create a generator yielding image batches together with their CSV rows.

    Arguments:
        images_dir : string
            Path to a directory with subdirectories for each class.
        csv_dir : string
            Path to a directory containing train/val csv files with extra attributes.
        split : string
            Current split being used (train, val or test)
        *args :
            Exactly three values: img_width, img_height, batch_size.

    Yields:
        ([images, rows], labels) where `rows` is the DataFrame selection
        whose entries correspond, in order, to the images of the batch.

    Raises:
        KeyError: if an image of the batch has no row in the CSV file.
    """
    import os  # local import: only needed to derive CSV keys from file names

    img_width, img_height, batch_size = args
    datagen = ImageDataGenerator(
        rescale=1. / 255)
    generator = datagen.flow_from_directory(
        f'{images_dir}/{split}',
        target_size=(img_width, img_height),
        batch_size=batch_size,
        shuffle=True,
        class_mode='categorical')
    df = pd.read_csv(f'{csv_dir}/{split}.csv', index_col='image')

    def my_generator(image_gen, data):
        batches_per_epoch = len(image_gen)
        while True:
            images, labels = image_gen.next()
            # After next(), batch_index already points at the following batch
            # (and wraps to 0 after the last one), so step back by one.
            produced = (image_gen.batch_index - 1) % batches_per_epoch
            start = produced * image_gen.batch_size
            # index_array holds the (shuffled) sample order of the current
            # epoch; slicing the CSV by position would ignore that order.
            sample_ids = image_gen.index_array[start:start + image_gen.batch_size]
            # NOTE(review): assumes the 'image' index column stores the file
            # name without directory and extension - adjust if it differs.
            keys = [
                os.path.splitext(os.path.basename(image_gen.filenames[j]))[0]
                for j in sample_ids
            ]
            row = data.loc[keys]
            yield [images, row], labels

    csv_generator = my_generator(generator, df)
    return csv_generator
I found a solution based on Luke's answer using a custom generator
import random
import pandas as pd
import numpy as np
from glob import glob
from keras.preprocessing import image as krs_image
# Create the arguments for image preprocessing
# (these are ImageDataGenerator constructor options)
data_gen_args = dict(
horizontal_flip=True,
brightness_range=[0.5, 1.5],
shear_range=10,
channel_shift_range=50,
rescale=1. / 255,
)
# Create an empty data generator
# NOTE(review): data_gen_args is not passed to the constructor here.
datagen = ImageDataGenerator()
# Read the image list and csv
# `images_dir`, `split`, `csv_dir` and `csv_data` are not defined in this
# snippet and must exist in the surrounding scope.
image_file_list = glob(f'{images_dir}/{split}/**/*.JPG', recursive=True)
df = pd.read_csv(f'{csv_dir}/{split}.csv', index_col=csv_data[0])
# Shuffle once up front; custom_generator reshuffles after every full pass.
random.shuffle(image_file_list)
def custom_generator(images_list, dataframe, batch_size):
    """Yield ([images, csv_features], one_hot_labels) batches forever.

    Arguments:
        images_list : list of image file paths; reshuffled in place after
            every complete pass.
        dataframe : DataFrame indexed by image name (file name without
            extension) with a 'class' column plus the extra feature columns.
        batch_size : number of samples per yielded batch.

    Raises:
        ValueError: if `images_list` is empty.
        KeyError: if an image has no matching row in `dataframe`.

    Relies on `data_gen_args`, `img_height`, `img_width` and `num_classes`
    from the enclosing scope.
    """
    import os  # local import: the snippet uses os.path but never imports os

    if not images_list:
        raise ValueError('images_list is empty')

    # The augmentation options are constructor arguments. apply_transform()
    # expects already-sampled parameters (e.g. 'flip_horizontal', 'shear'),
    # so passing data_gen_args to it silently applies nothing.
    augmenter = ImageDataGenerator(**data_gen_args)

    i = 0
    while True:
        batch = {'images': [], 'csv': [], 'labels': []}
        for _ in range(batch_size):
            if i == len(images_list):
                i = 0
                random.shuffle(images_list)
            # Read image from list and convert to array
            image_path = images_list[i]
            image_name = os.path.splitext(os.path.basename(image_path))[0]
            image = krs_image.load_img(image_path, target_size=(img_height, img_width))
            image = krs_image.img_to_array(image)
            # Random augmentation first, then rescale/normalisation.
            image = augmenter.random_transform(image)
            image = augmenter.standardize(image)
            # Read data from csv using the name of current image
            csv_row = dataframe.loc[image_name, :]
            label = csv_row['class']
            csv_features = csv_row.drop(labels='class')
            batch['images'].append(image)
            batch['csv'].append(csv_features)
            batch['labels'].append(label)
            i += 1
        batch['images'] = np.array(batch['images'])
        batch['csv'] = np.array(batch['csv'])
        # Convert labels to categorical values
        batch['labels'] = np.eye(num_classes)[batch['labels']]
        yield [batch['images'], batch['csv']], batch['labels']
I would suggest creating a custom generator given this relatively specific case. Something like the following (modified from a similar answer here) should suffice:
import os
import random
import pandas as pd
def generator(image_dir, csv_dir, batch_size):
    """Yield (inputs, one_hot_labels) batches forever for a two-input model.

    Arguments:
        image_dir : directory containing the '.png' image files.
        csv_dir : directory containing one '.csv' file per image, named
            like the image.
        batch_size : number of samples per yielded batch.

    Yields:
        (batch_x, batch_y) where batch_x is a dict with the keys 'images'
        and 'other_feats' and batch_y is a one-hot label array.

    Raises:
        ValueError: if `image_dir` contains no files.

    Relies on `preprocess_image`, `preprocess_feats`, `cv2`, `np` and
    `num_classes` from the enclosing scope.
    """
    i = 0
    image_file_list = os.listdir(image_dir)
    if not image_file_list:
        raise ValueError(f'no files found in {image_dir!r}')
    while True:
        batch_x = {'images': [], 'other_feats': []}  # use a dict for multiple inputs
        batch_y = []
        for _ in range(batch_size):
            if i == len(image_file_list):
                i = 0
                random.shuffle(image_file_list)
            # os.listdir returns bare file names: use the whole name (not its
            # first character) and join it with the directory.
            file_name = image_file_list[i]
            i += 1
            image_file_path = os.path.join(image_dir, file_name)
            csv_file_path = os.path.join(csv_dir, file_name.replace('.png', '.csv'))
            image = preprocess_image(cv2.imread(image_file_path))
            csv_file = pd.read_csv(csv_file_path)
            other_feat = preprocess_feats(csv_file)
            batch_x['images'].append(image)
            batch_x['other_feats'].append(other_feat)
            # NOTE(review): assumes each per-image CSV holds a single row whose
            # 'class' column is the integer class id - confirm with the data.
            batch_y.append(csv_file['class'].iloc[0])
        batch_x['images'] = np.array(batch_x['images'])  # convert each list to array
        batch_x['other_feats'] = np.array(batch_x['other_feats'])
        # One-hot encode the labels collected for this batch.
        yield batch_x, np.eye(num_classes)[batch_y]
Then, you can use Keras's fit_generator() function to train your model.
Obviously, this assumes you have csv files with the same names as your image files, and that you have some custom preprocessing functions for images and csv files.
Related
I have downloaded the MINC dataset for material classification, which consists of 23 categories. However, I am only interested in a subset of the categories (e.g. [wood, foliage, glass, hair]).
Is it possible to get a subset of the data using tf.keras.preprocessing.image_dataset_from_directory?
I have tried tf.keras.preprocessing.image_dataset_from_directory(folder_dir, label_mode="categorical", class_names=["wood", "foliage", "glass", "hair"]) but it give this error The `class_names` passed did not match the names of the subdirectories of the target directory.
Is there a way to get a subset of the directories without deleting or modifying the folders? I know datagen.flow_from_directory is able to do it but keras says that it is deprecated and I should use image_dataset_from_directory.
There are two ways of doing this. The first is to use a generator, but that process is costly. The second is to use tf.data for finer control. You can read about it at this link:
https://www.tensorflow.org/tutorials/load_data/images
But, I will show you a brief demo that how you can load only the folders of your choice.
So, let's start...
#First import some libraries which are needed
import os
import glob
import tensorflow as tf
import matplotlib.pyplot as plt
I am taking only two classes of "Cats" vs "Dogs" you can take more than two classes...
batch_size = 32
img_height = 180
img_width = 180
# Define the data directory where your dataset is placed.
data_dir = 'path/to/your/dataset'  # placeholder: replace with the real path
# Names of the sub-folders (classes) to load; add more entries if needed.
dataset_names = ['cats', 'dogs']
# Glob the images of every selected folder into one flat list
# (works for any number of classes, not just two).
list_files = [
    image_path
    for name in dataset_names
    for image_path in glob.glob(os.path.join(data_dir, name, '*.jpg'))
]
image_count = len(list_files)
if image_count == 0:
    raise FileNotFoundError(f'no .jpg files found under {data_dir!r} for {dataset_names}')
# Pass this list to a tf.data.Dataset
list_files = tf.data.Dataset.from_tensor_slices(list_files)
# Shuffle once (with a fixed order afterwards) BEFORE splitting; the glob
# result is grouped by class, so an unshuffled take() would put a single
# class into the validation set.
list_files = list_files.shuffle(image_count, reshuffle_each_iteration=False)
# Class names used later to turn folder names into integer labels.
class_names = list(dataset_names)
# Define the validation / train split.
val_size = int(image_count * 0.2)
train_ds = list_files.skip(val_size)
val_ds = list_files.take(val_size)
#To get labels
def get_label(file_path):
    """Return the integer class id encoded in the parent folder of `file_path`.

    The id is the position of the folder name inside the module-level
    `class_names` list.
    """
    # Convert the path to a list of path components
    parts = tf.strings.split(file_path, os.path.sep)
    # The second-to-last component is the class directory
    # (.../<class_name>/<file>.jpg).
    one_hot = parts[-2] == class_names
    # Integer encode the label
    return tf.argmax(one_hot)
def decode_img(img):
    """Decode JPEG bytes to a 3-channel image resized to the configured size."""
    # The compressed string becomes a 3D uint8 tensor.
    decoded = tf.io.decode_jpeg(img, channels=3)
    # Bring the image to (img_height, img_width).
    target_size = [img_height, img_width]
    return tf.image.resize(decoded, target_size)
def process_path(file_path):
    """Map an image file path to an (image, label) pair."""
    label = get_label(file_path)
    # Read the raw file contents as a string and decode them.
    raw_bytes = tf.io.read_file(file_path)
    return decode_img(raw_bytes), label
# Use Dataset.map to turn the path datasets into (image, label) datasets.
# `num_parallel_calls` lets several images be loaded/processed in parallel.
train_ds = train_ds.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)


# Configure dataset for performance
def configure_for_performance(ds):
    """Return `ds` cached, shuffled, batched and prefetched (in that order)."""
    return (
        ds.cache()
        .shuffle(buffer_size=1000)
        .batch(batch_size)
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    )


train_ds = configure_for_performance(train_ds)
val_ds = configure_for_performance(val_ds)
# Visualize the data: show the first nine images of one training batch
# in a 3x3 grid, titled with their class names.
image_batch, label_batch = next(iter(train_ds))
plt.figure(figsize=(10, 10))
for i in range(9):
    ax = plt.subplot(3, 3, i + 1)
    pixels = image_batch[i].numpy().astype("uint8")
    plt.imshow(pixels)
    label = label_batch[i]
    plt.title(class_names[label])
    plt.axis("off")
Output:
Link to the COLAB file is:
https://colab.research.google.com/drive/1oUNuGVDWDLqwt_YQ80X-CBRL6kJ_YhUX?usp=sharing
I implemented a dual input model using custom generator as in here: Create a mixed data generator (images,csv) in keras
import random
import pandas as pd
import numpy as np
from glob import glob
from keras.preprocessing import image as krs_image
# Create the arguments for image preprocessing
# (these are ImageDataGenerator constructor options)
data_gen_args = dict(
horizontal_flip=True,
brightness_range=[0.5, 1.5],
shear_range=10,
channel_shift_range=50,
rescale=1. / 255,
)
# Create an empty data generator
# NOTE(review): data_gen_args is not passed to the constructor here.
datagen = ImageDataGenerator()
# Read the image list and csv
# `images_dir`, `split`, `csv_dir` and `csv_data` are not defined in this
# snippet and must exist in the surrounding scope.
image_file_list = glob(f'{images_dir}/{split}/**/*.JPG', recursive=True)
df = pd.read_csv(f'{csv_dir}/{split}.csv', index_col=csv_data[0])
# Shuffle once up front; custom_generator reshuffles after every full pass.
random.shuffle(image_file_list)
def custom_generator(images_list, dataframe, batch_size):
    """Yield ([images, csv_features], one_hot_labels) batches forever.

    Arguments:
        images_list : list of image file paths; reshuffled in place after
            every complete pass.
        dataframe : DataFrame indexed by image name (file name without
            extension) with a 'class' column plus the extra feature columns.
        batch_size : number of samples per yielded batch.

    Raises:
        ValueError: if `images_list` is empty.
        KeyError: if an image has no matching row in `dataframe`.

    Relies on `data_gen_args`, `img_height`, `img_width` and `num_classes`
    from the enclosing scope.
    """
    import os  # local import: the snippet uses os.path but never imports os

    if not images_list:
        raise ValueError('images_list is empty')

    # The augmentation options are constructor arguments. apply_transform()
    # expects already-sampled parameters (e.g. 'flip_horizontal', 'shear'),
    # so passing data_gen_args to it silently applies nothing.
    augmenter = ImageDataGenerator(**data_gen_args)

    i = 0
    while True:
        batch = {'images': [], 'csv': [], 'labels': []}
        for _ in range(batch_size):
            if i == len(images_list):
                i = 0
                random.shuffle(images_list)
            # Read image from list and convert to array
            image_path = images_list[i]
            image_name = os.path.splitext(os.path.basename(image_path))[0]
            image = krs_image.load_img(image_path, target_size=(img_height, img_width))
            image = krs_image.img_to_array(image)
            # Random augmentation first, then rescale/normalisation.
            image = augmenter.random_transform(image)
            image = augmenter.standardize(image)
            # Read data from csv using the name of current image
            csv_row = dataframe.loc[image_name, :]
            label = csv_row['class']
            csv_features = csv_row.drop(labels='class')
            batch['images'].append(image)
            batch['csv'].append(csv_features)
            batch['labels'].append(label)
            i += 1
        batch['images'] = np.array(batch['images'])
        batch['csv'] = np.array(batch['csv'])
        # Convert labels to categorical values
        batch['labels'] = np.eye(num_classes)[batch['labels']]
        yield [batch['images'], batch['csv']], batch['labels']
And fitted the model with:
# Train on the endless custom generators; because they never stop,
# steps_per_epoch / validation_steps bound each epoch.
# NOTE(review): custom_generator's first parameter is named `images_list`;
# confirm that path_train / path_valid are lists of image paths.
history = model.fit(custom_generator(path_train, df_train, batch_size),
steps_per_epoch= train_steps,
epochs=epochs,
verbose=1,
#callbacks=callbacks,
validation_data=custom_generator(path_valid, df_valid, batch_size),
validation_steps=val_steps)
path_train & path_valid are the path to image folders train and validation data. df_train and df_valid are dataframes of metadata.
However, I am not sure how to make predictions on a model fitted with custom generator. Should I do as follows?
# custom_generator never stops yielding, so the prediction pass must be
# bounded with `steps`; without it predict() would not terminate.
# NOTE(review): assumes df_test has exactly one row per test image, so that
# len(df_test) steps with batch_size=1 cover the test set once.
predictions = model.predict(
    custom_generator(test_path, df_test, batch_size=1),
    steps=len(df_test),
    verbose=1)
Any help is greatly appreciated.
I am working on image binarization using UNet and have a dataset of 150 images together with their binarized versions. My idea is to augment the images randomly so that they look different, so I have written a function that applies any of 4-5 kinds of noise, skew, shear and so on to an image. I could have easily used
ImageDataGenerator(preprocess_function=my_aug_function) to augment the images but the problem is that my y target is also an image. Also, I could have used something like:
# Map every element through encode_single_sample (defined elsewhere),
# then batch and prefetch the result.
train_dataset = (
train_dataset.map(
encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE
)
.batch(batch_size)
.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)
But it has 2 problems:
With larger dataset, it'll blow up the memory as data needs to be already in the memory
This is the crucial part that I need to augment the images on the go to make it look like I have a huge dataset.
Another Solution could be saving augmented images to a directory and making them 30-40K and then loading them. It would be silly thing to do.
Now the idea part is that I can use Sequence as the parent class but How can I keep on augmenting and generating new images on the fly with respective Y binarized images?
I have an idea as the below code. Can somebody help me with the augmentation and generation of y images. I have my X_DIR, Y_DIR where image names for binarised and original are same but stored in different directories.
class DataGenerator(tensorflow.keras.utils.Sequence):
    """Keras Sequence yielding (image, target-image) batches, augmented on the fly.

    NOTE(review): assumes `files_path` and `labels_path` are equally long,
    index-aligned sequences of image file paths (input image / binarized
    image) - confirm against the caller.

    Arguments:
        files_path : sequence of input image paths.
        labels_path : sequence of target image paths, aligned with files_path.
        batch_size : samples per batch; a trailing partial batch is dropped.
        shuffle : reshuffle the sample order after every epoch.
        random_state : seed of the private shuffle RNG.
        augment_fn : optional callable `(x, y) -> (x, y)` applied to every
            loaded pair, so geometric changes hit image and target alike.
    """

    def __init__(self, files_path, labels_path, batch_size=32, shuffle=True,
                 random_state=42, augment_fn=None):
        'Initialization'
        super().__init__()
        self.files = files_path
        self.labels = labels_path
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.random_state = random_state
        self.augment_fn = augment_fn
        # Private RNG keeps the shuffle reproducible without touching the
        # global numpy seed.
        self._rng = np.random.RandomState(random_state)
        self.indexes = np.arange(len(self.files))
        self.on_epoch_end()

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        if self.shuffle:
            self._rng.shuffle(self.indexes)

    def __len__(self):
        'Number of full batches per epoch'
        return int(np.floor(len(self.files) / self.batch_size))

    def __getitem__(self, index):
        'Return batch number `index` as an (X, y) tuple'
        first = index * self.batch_size
        batch_ids = self.indexes[first:first + self.batch_size]
        return self.__data_generation(batch_ids)

    def __data_generation(self, batch_ids):
        'Load (and optionally augment) the image pairs for the given sample ids'
        from PIL import Image  # local import: only needed when a batch is built

        inputs, targets = [], []
        for sample_id in batch_ids:
            x = np.asarray(Image.open(self.files[sample_id]))
            y = np.asarray(Image.open(self.labels[sample_id]))
            if self.augment_fn is not None:
                # A fresh random augmentation on every access is what makes
                # the small dataset look larger across epochs.
                x, y = self.augment_fn(x, y)
            inputs.append(x)
            targets.append(y)
        # np.stack requires all images of a batch to share one shape;
        # resize inside augment_fn if the files differ in size.
        return np.stack(inputs), np.stack(targets)
Custom Image Data Generator
load Directory data into dataframe for CustomDataGenerator
def data_to_df(data_dir, subset=None, validation_split=None,
               class_to_idx=None, random_state=42):
    """Collect image paths and labels from a one-folder-per-class directory.

    Arguments:
        data_dir : directory whose sub-directories are the class folders.
        subset : pass "train" to get a (train_df, val_df) split.
        validation_split : fraction of rows put into val_df; required when
            subset == "train".
        class_to_idx : mapping from folder name to integer label; defaults
            to the module-level `name_to_idx`.
        random_state : seed used to shuffle the rows before splitting.

    Returns:
        A DataFrame with "filenames" and "labels" columns, or the tuple
        (train_df, val_df) when subset == "train".

    Raises:
        ValueError: if subset == "train" and validation_split is None.
    """
    import os  # local import: the snippet does not import os itself

    if class_to_idx is None:
        class_to_idx = name_to_idx
    filenames = []
    labels = []
    for dataset in sorted(os.listdir(data_dir)):
        class_dir = os.path.join(data_dir, dataset)
        if not os.path.isdir(class_dir):
            continue  # ignore stray files next to the class folders
        label = class_to_idx[dataset]
        for image in sorted(os.listdir(class_dir)):
            filenames.append(os.path.join(class_dir, image))
            labels.append(label)
    df = pd.DataFrame({"filenames": filenames, "labels": labels})
    if subset == "train":
        if validation_split is None:
            raise ValueError('validation_split is required when subset == "train"')
        # Rows are collected class by class; without shuffling, the first
        # `validation_split` share would contain only the first class(es).
        shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
        split_index = int(len(shuffled) * validation_split)
        train_df = shuffled[split_index:]
        val_df = shuffled[:split_index]
        return train_df, val_df
    return df
# Build the frames for training: 20% of the rows go to val_df.
# `train_dir` must be defined by the surrounding code.
train_df, val_df = data_to_df(train_dir, subset="train", validation_split=0.2)
Custom Data Generator
import tensorflow as tf
from PIL import Image
import numpy as np
class CustomDataGenerator(tf.keras.utils.Sequence):
    ''' Custom DataGenerator to load images from a DataFrame.

    Arguments:
        data_frame = pandas data frame with "filenames" and "labels" columns
        batch_size = number of samples per batch
        img_shape = image shape in (h, w, d) format
        augmentation = apply random data augmentation to every image
        num_classes = number of classes (used for the log line only)
        shuffle = reshuffle the rows after every epoch

    Output (per batch):
        Img: tensor of preprocessed images
        label : tensor of labels
    '''

    def __init__(self, data_frame, batch_size=10, img_shape=None, augmentation=True,
                 num_classes=None, shuffle=True):
        super().__init__()
        # Positional access (iloc) is used below, so start from a clean index.
        self.data_frame = data_frame.reset_index(drop=True)
        self.train_len = len(data_frame)
        self.batch_size = batch_size
        self.img_shape = img_shape
        self.augmentation = augmentation
        self.num_classes = num_classes
        self.shuffle = shuffle
        print(f"Found {self.data_frame.shape[0]} images belonging to {self.num_classes} classes")
        self.on_epoch_end()  # initial shuffle

    def __len__(self):
        ''' return total number of batches '''
        # Must be side-effect free: Keras may call it at any time, so the
        # shuffle lives in on_epoch_end instead. Integer ceiling division.
        return -(-self.train_len // self.batch_size)

    def on_epoch_end(self):
        ''' shuffle data after every epoch '''
        if self.shuffle:
            self.data_frame = self.data_frame.sample(frac=1).reset_index(drop=True)

    def __data_augmentation(self, img):
        ''' function for apply some data augmentation '''
        # random_shift defaults to channels-first axes; the images here are
        # (h, w, d), so the axes have to be given explicitly.
        img = tf.keras.preprocessing.image.random_shift(
            img, 0.2, 0.3, row_axis=0, col_axis=1, channel_axis=2)
        img = tf.image.random_flip_left_right(img)
        img = tf.image.random_flip_up_down(img)
        return img

    def __get_image(self, file_id):
        """ open image with file_id path and apply data augmentation """
        img = Image.open(file_id)
        if self.img_shape is not None:
            if len(self.img_shape) == 3 and self.img_shape[2] == 3:
                img = img.convert("RGB")
            height, width = self.img_shape[:2]
            # A real resampling resize; np.resize only repeats/truncates the
            # flat pixel buffer and scrambles the picture. PIL takes (w, h).
            img = img.resize((width, height))
        img = np.asarray(img)
        if self.augmentation:
            img = self.__data_augmentation(img)
        img = preprocess_input(img)
        return img

    def __get_label(self, label_id):
        """ uncomment the below line to convert label into categorical format """
        #label_id = tf.keras.utils.to_categorical(label_id, num_classes)
        return label_id

    def __getitem__(self, idx):
        first = idx * self.batch_size
        rows = self.data_frame.iloc[first:first + self.batch_size]
        # read the data using the batch lists of paths and labels
        x = [self.__get_image(file_id) for file_id in rows["filenames"]]
        y = [self.__get_label(label_id) for label_id in rows["labels"]]
        return tf.convert_to_tensor(x), tf.convert_to_tensor(y)
You can use libraries like albumentations and imgaug, both are good but I have heard there are issues with random seed with albumentations.
Here's an example of imgaug taken from the documentation here:
# Augmentation pipeline; the augmenters run in random order.
augmenters = [
    iaa.Dropout([0.05, 0.2]),      # drop 5% or 20% of all pixels
    iaa.Sharpen((0.0, 1.0)),       # sharpen the image
    iaa.Affine(rotate=(-45, 45)),  # rotate by -45 to 45 degrees (affects segmaps)
    iaa.ElasticTransformation(alpha=50, sigma=5),  # apply water effect (affects segmaps)
]
seq = iaa.Sequential(augmenters, random_order=True)

# Augment image and segmap together, once per element of input_data.
images_aug = []
segmaps_aug = []
for _ in range(len(input_data)):
    augmented_pair = seq(image=image, segmentation_maps=segmap)
    images_aug_i, segmaps_aug_i = augmented_pair
    images_aug.append(images_aug_i)
    segmaps_aug.append(segmaps_aug_i)
You are going in the right way with the custom generator. In __getitem__, make a batch using batch_x = self.files[index:index+batch_size] and same with batch_y, then augment them using X,y = __data_generation(batch_x, batch_y) which will load images(using any library you like, I prefer opencv), and return the augmented pairs (and any other manipulation).
Your __getitem__ will then return the tuple (X,y)
You can use ImageDataGenerator even if your label is an image.
Here is a simple example of how you can do that:
Code:
# Specifying your data augmentation here for both image and label
# (both generators are created without augmentation options in this example)
image_datagen = tf.keras.preprocessing.image.ImageDataGenerator()
mask_datagen = tf.keras.preprocessing.image.ImageDataGenerator()
# Provide the same seed and keyword arguments to the flow methods
# so both iterators use the same seed.
seed = 1
image_generator = image_datagen.flow_from_directory(
data_dir,
class_mode=None,
seed=seed)
# NOTE(review): this reads the same `data_dir` as image_generator; for real
# image/mask pairs it presumably has to point at the mask directory.
mask_generator = mask_datagen.flow_from_directory(
data_dir,
class_mode=None,
seed=seed)
# Combine the image and label generator.
train_generator = zip(image_generator, mask_generator)
Now, if you iterate over it you will get:
# Inspect only the first (image batch, label batch) pair.
for batch_pair in train_generator:
    image, label = batch_pair
    print(image.shape, label.shape)
    break
Output:
(32, 256, 256, 3) (32, 256, 256, 3)
You can use this train_generator with fit() command.
Code:
# Model.fit accepts generators directly; fit_generator is deprecated.
# train_generator is endless, so steps_per_epoch bounds each epoch.
model.fit(
    train_generator,
    steps_per_epoch=2000,
    epochs=50)
With flow_from_directory your memory won't be cluttered and Imagedatagenerator will take care of the augmentation part.
Hi, I have the data loading code below, but I'm not sure how to split the data into training and testing sets. Can anyone suggest how to do it? This is my data loading code:
def __init__(self, root, specific_folder, img_extension, preprocessing_method=None, crop_size=(96, 112),train = True):
    """
    Dataloader of the LFW dataset.

    root: path to the dataset to be used.
    specific_folder: specific folder inside the same dataset.
    img_extension: extension of the dataset images.
    preprocessing_method: string with the name of the preprocessing method.
    crop_size: retrieval network specific crop size.
    train: accepted but not used by this constructor.
    """
    self.preprocessing_method = preprocessing_method
    self.crop_size = crop_size
    self.imgl_list = []
    self.classes = []
    self.people = []
    self.model_align = None
    self.arr = []

    # people.txt lists "<name>\t<number of images>" per line; the first line
    # is skipped.
    with open(os.path.join(root, 'people.txt')) as f:
        people = f.read().splitlines()[1:]

    # keep only the people that have at least 20 images
    for entry in people:
        fields = entry.split('\t')
        if len(fields) <= 1:
            continue
        name = fields[0]
        image_count = int(fields[1])
        if image_count < 20:
            continue
        for num_img in range(1, image_count + 1):
            file_name = name + '_' + '{:04}'.format(num_img) + '.' + img_extension
            self.imgl_list.append(os.path.join(root, specific_folder, name, file_name))
            self.classes.append(name)
            self.people.append(name)

    # Replace the person names in self.classes by integer class ids.
    le = preprocessing.LabelEncoder()
    self.classes = le.fit_transform(self.classes)
    print(len(self.imgl_list), len(self.classes), len(self.people))
def __getitem__(self, index):
    """Return (tensors, class id, processed image, bounding box, path, person) for `index`.

    `tensors` holds the processed image and its horizontally flipped copy,
    both normalised and transposed with (2, 0, 1).
    """
    image_path = self.imgl_list[index]
    imgl = imageio.imread(image_path)
    cl = self.classes[index]

    # a grayscale image becomes rgb by repeating it 3 times
    if len(imgl.shape) == 2:
        imgl = np.stack([imgl] * 3, 2)

    imgl, bb = preprocess(imgl, self.preprocessing_method, crop_size=self.crop_size,
                          is_processing_dataset=True, return_only_largest_bb=True, execute_default=True)

    # the image together with its reverse, each normalised and transposed
    views = (imgl, imgl[:, ::-1, :])
    imgs = [
        torch.from_numpy(((view - 127.5) / 128.0).transpose(2, 0, 1)).float()
        for view in views
    ]
    return imgs, cl, imgl, bb, image_path, self.people[index]
def __len__(self):
    """Return the number of image paths collected in self.imgl_list."""
    image_total = len(self.imgl_list)
    return image_total
I need to split this data into 20% for testing and 80% for training so that I can test my model. I have been stuck on this for almost a week and still have no idea how to do it, so I would really appreciate any help.
In general using PyTorch:
import torch
import numpy as np
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
dataset = yourdatahere
batch_size = 16  # change to whatever you'd like it to be
test_split = .2
shuffle_dataset = True
random_seed = 42

# Index bookkeeping for the train / test split: the first `split` indices
# (after the optional seeded shuffle) form the test set.
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(test_split * dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
test_indices = indices[:split]
train_indices = indices[split:]

# Samplers restrict each loader to its own subset of the one dataset.
train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)
train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=batch_size, sampler=train_sampler)
test_loader = torch.utils.data.DataLoader(
    dataset, batch_size=batch_size, sampler=test_sampler)
# Usage Example:
num_epochs = 10
for epoch in range(num_epochs):
    # Train:
    for batch_index, (faces, labels) in enumerate(train_loader):
        ...  # placeholder: forward pass, loss, backward pass, optimizer step
Please note that you should also split your training data into training + validation data. You may use the same logic from above to do so.
Is it possible to get the file names that were loaded using flow_from_directory ?
I have :
# Augmenting generator: small rotations, shifts and horizontal flips.
datagen = ImageDataGenerator(
rotation_range=3,
# featurewise_std_normalization=True,
fill_mode='nearest',
width_shift_range=0.2,
height_shift_range=0.2,
horizontal_flip=True
)
# Iterator over the images below <path>/train, resized to 224x224.
# `path` and `batch_size` must be defined by the surrounding code.
train_generator = datagen.flow_from_directory(
path+'/train',
target_size=(224, 224),
batch_size=batch_size,)
I have a custom generator for my multi output model like:
# Placeholder second output, to be replaced by real bounding boxes later.
a = np.arange(8).reshape(2, 4)
# print(a)
print(train_generator.filenames)


def generate():
    """Yield ([x], [a, y]) forever, pairing each image batch with the placeholder `a`."""
    while True:
        images, targets = train_generator.next()
        yield [images], [a, targets]
Note that at the moment I am generating placeholder numbers for a, but for real training I wish to load a JSON file that contains the bounding box coordinates for my images. For that I need the file names of the images returned by the train_generator.next() method. Once I have them, I can load the file, parse the JSON and pass the result instead of a. The ordering of the x variable and of the list of file names must also be the same.
Yes is it possible, at least with version 2.0.4 (don't know about earlier version).
The instance of ImageDataGenerator().flow_from_directory(...) has an attribute with filenames which is a list of all the files in the order the generator yields them and also an attribute batch_index. So you can do it like this:
# `...` stands for the directory and options of your own dataset.
datagen = ImageDataGenerator()
gen = datagen.flow_from_directory(...)
And every iteration on generator you can get the corresponding filenames like this:
for i in gen:
    # batch_index already points at the NEXT batch and wraps to 0 after the
    # last one, so step back by one modulo the number of batches.
    produced = (gen.batch_index - 1) % len(gen)
    idx = produced * gen.batch_size
    # index_array holds the sample order of the current epoch, which differs
    # from the order of gen.filenames when shuffle=True.
    batch_ids = gen.index_array[idx : idx + gen.batch_size]
    print([gen.filenames[j] for j in batch_ids])
This will give you the filenames of the images in the current batch.
You can make a pretty minimal subclass that returns the image, file_path tuple by inheriting the DirectoryIterator:
import numpy as np
from keras.preprocessing.image import ImageDataGenerator, DirectoryIterator
class ImageWithNames(DirectoryIterator):
    """DirectoryIterator whose batches are (images, file paths) tuples."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # so that we only get the images back
        self.class_mode = None
        # numpy copy of the paths, so an index array can select them directly
        self.filenames_np = np.array(self.filepaths)

    def _get_batches_of_transformed_samples(self, index_array):
        images = super()._get_batches_of_transformed_samples(index_array)
        paths = self.filenames_np[index_array]
        return (images, paths)
In the init, I added a attribute that is the numpy version of self.filepaths so that we can easily index into that array to get the paths on each batch generation.
The only other change to the base class is to return a tuple that is the image batch super()._get_batches_of_transformed_samples(index_array) and the file paths self.filenames_np[index_array].
With that, you can make your generator like so:
# The iterator is constructed directly: directory first, then the
# ImageDataGenerator that supplies the image transformations.
imagegen = ImageDataGenerator()
datagen = ImageWithNames('/data/path', imagegen, target_size=(224,224))
And then check with
# Fetch one batch from the iterator.
next(datagen)
At least with version 2.2.4, you can do it like this:
datagen = ImageDataGenerator()
gen = datagen.flow_from_directory(...)
# Print every file name known to the iterator, one per line.
for file in gen.filenames:
    print(file)
or get the file path
# Same listing, using the iterator's `filepaths` attribute instead.
for filepath in gen.filepaths:
    print(filepath)
Here is an example that works with shuffle=True as well. And also properly handles last batch. To make one pass:
datagen = ImageDataGenerator().flow_from_directory(...)
# One pass = all full batches plus one more if samples are left over.
full_batches, leftover = divmod(datagen.samples, datagen.batch_size)
batches_per_epoch = full_batches + (leftover > 0)
for i in range(batches_per_epoch):
    batch = next(datagen)
    # batch_index already points at the next batch, so step back by one.
    current_index = (datagen.batch_index - 1) * datagen.batch_size
    if current_index < 0:
        # batch_index wrapped to 0: the batch just produced was the last one,
        # which holds the leftover samples (or a full batch if none are left).
        tail = datagen.samples % datagen.batch_size
        if tail <= 0:
            tail = datagen.batch_size
        current_index = max(0, datagen.samples - tail)
    index_array = datagen.index_array[current_index:current_index + datagen.batch_size].tolist()
    img_paths = [datagen.filepaths[idx] for idx in index_array]
    # batch[0] - x, batch[1] - y, img_paths - absolute path
the below code might help. Overriding the flow_from_directory
class AugmentingDataGenerator(ImageDataGenerator):
    """ImageDataGenerator whose directory flow also reports the file paths."""

    def flow_from_directory(self, directory, mask_generator=None, *args, **kwargs):
        """Yield (image_batch, file_paths) tuples forever.

        Arguments:
            directory : image directory, forwarded to the parent method.
            mask_generator : accepted for backward compatibility; unused.
                It is optional so that calls passing only `directory` plus
                keyword options work.
            *args, **kwargs : forwarded to the parent flow_from_directory;
                class_mode is always forced to None.

        Yields:
            (images, paths) where `paths` lists the file path of every image
            in the batch, in batch order. Nothing is yielded when the
            directory contains no images.
        """
        generator = super().flow_from_directory(directory, class_mode=None, *args, **kwargs)
        batches_per_epoch = len(generator)
        if batches_per_epoch == 0:
            return
        while True:
            # Get augmented image samples
            images = next(generator)
            # batch_index already points at the next batch (0 after the last
            # one); step back to find the slice that was just produced.
            produced = (generator.batch_index - 1) % batches_per_epoch
            start = produced * generator.batch_size
            batch_ids = generator.index_array[start:start + generator.batch_size]
            yield images, [generator.filepaths[j] for j in batch_ids]
# Create training generator
# (augmentation: rotation, shifts, flips; pixel values rescaled by 1/255)
train_datagen = AugmentingDataGenerator(
rotation_range=10,
width_shift_range=0.1,
height_shift_range=0.1,
rescale=1./255,
horizontal_flip=True
)
# TRAIN_DIRECTORY_PATH and BATCH_SIZE must be defined by the surrounding code.
train_generator = train_datagen.flow_from_directory(
TRAIN_DIRECTORY_PATH,
target_size=(256, 256),
shuffle = False,
batch_size=BATCH_SIZE
)
# Create testing generator
# (rescaling only, no augmentation)
test_datagen = AugmentingDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory(
TEST_DIRECTORY_PATH,
target_size=(256, 256),
shuffle = False, # inorder to get imagepath of the same image
batch_size=BATCH_SIZE
)
And to check your images and file path returned
# Pull one item from the test generator and unpack its two parts.
image,file_path = next(test_generator)
# print(file_path)
# plt.imshow(image)
I needed exactly this and I developed a simple function that works with shuffle=True or shuffle=False.
def get_indices_from_keras_generator(gen, batch_size=None):
    """
    Given a keras data generator, it returns the indices and the filepaths
    corresponding the current batch.

    :param gen: keras generator.
    :param batch_size: size of the last batch generated. Optional: when
        omitted it is derived from the generator, so the helper can be
        called with the generator alone.
    :return: tuple with indices and filenames
    """
    if batch_size is None:
        if gen.batch_index > 0:
            idx_left = (gen.batch_index - 1) * gen.batch_size
        else:
            # batch_index wrapped to 0: the last batch of the epoch was just
            # produced; it holds the remainder (or a full batch if none).
            total = len(gen.index_array)
            last_size = total % gen.batch_size or gen.batch_size
            idx_left = max(0, total - last_size)
        idx_right = idx_left + gen.batch_size
    else:
        idx_left = (gen.batch_index - 1) * batch_size
        # A negative start means "the final batch_size entries".
        idx_right = idx_left + gen.batch_size if idx_left >= 0 else None
    indices = gen.index_array[idx_left:idx_right]
    filenames = [gen.filenames[i] for i in indices]
    return indices, filenames
Then, you would use it as follows:
for x, y in gen:
    # The helper needs the size of the batch that was just produced.
    indices, filenames = get_indices_from_keras_generator(gen, len(x))