How to split the data into training and testing data - python

Hi, right now I have the data loading code below and I'm not sure how to split the data it loads into training and testing sets. Can anyone suggest how to do it? This is my data loading code:
def __init__(self, root, specific_folder, img_extension, preprocessing_method=None, crop_size=(96, 112), train=True):
    """
    Dataloader of the LFW dataset.

    root: path to the dataset to be used.
    specific_folder: specific folder inside the same dataset.
    img_extension: extension of the dataset images.
    preprocessing_method: string with the name of the preprocessing method.
    crop_size: retrieval network specific crop size.
    """
    self.preprocessing_method = preprocessing_method
    self.crop_size = crop_size
    self.imgl_list = []
    self.classes = []
    self.people = []
    self.model_align = None
    self.arr = []

    # read the file with the names and the number of images of each person in the dataset
    with open(os.path.join(root, 'people.txt')) as f:
        people = f.read().splitlines()[1:]

    # keep only the people that have at least 20 images
    for p in people:
        p = p.split('\t')
        if len(p) > 1:
            if int(p[1]) >= 20:
                for num_img in range(1, int(p[1]) + 1):
                    self.imgl_list.append(os.path.join(root, specific_folder, p[0], p[0] + '_' +
                                          '{:04}'.format(num_img) + '.' + img_extension))
                    self.classes.append(p[0])
                    self.people.append(p[0])

    le = preprocessing.LabelEncoder()
    self.classes = le.fit_transform(self.classes)

    print(len(self.imgl_list), len(self.classes), len(self.people))

def __getitem__(self, index):
    imgl = imageio.imread(self.imgl_list[index])
    cl = self.classes[index]

    # if the image is grayscale, convert to RGB by repeating it 3 times
    if len(imgl.shape) == 2:
        imgl = np.stack([imgl] * 3, 2)

    imgl, bb = preprocess(imgl, self.preprocessing_method, crop_size=self.crop_size,
                          is_processing_dataset=True, return_only_largest_bb=True, execute_default=True)

    # append the image together with its horizontal flip
    imglist = [imgl, imgl[:, ::-1, :]]

    # normalization
    for i in range(len(imglist)):
        imglist[i] = (imglist[i] - 127.5) / 128.0
        imglist[i] = imglist[i].transpose(2, 0, 1)
    imgs = [torch.from_numpy(i).float() for i in imglist]

    return imgs, cl, imgl, bb, self.imgl_list[index], self.people[index]

def __len__(self):
    return len(self.imgl_list)
I need to split the data loaded here into 20% test and 80% training data so I can evaluate my model. It's been almost a week now and I still have no idea at all how to do it. I would appreciate it so much if anyone can help.

In general using PyTorch:
import torch
import numpy as np
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler

dataset = yourdatahere

batch_size = 16  # change to whatever you'd like it to be
test_split = .2
shuffle_dataset = True
random_seed = 42

# Creating data indices for the training and test splits:
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(test_split * dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, test_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)

train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                           sampler=train_sampler)
test_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                          sampler=test_sampler)

# Usage example:
num_epochs = 10
for epoch in range(num_epochs):
    # Train:
    for batch_index, (faces, labels) in enumerate(train_loader):
        # ...
Please note that you should also split your training data into training + validation data. You may use the same logic from above to do so.
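For example, here is a minimal sketch of carving a validation set out of the training indices with the same approach (the 10% fraction is an arbitrary choice for illustration):

val_split = .1  # fraction of the full dataset held out for validation (arbitrary)
val_size = int(np.floor(val_split * dataset_size))
val_indices, train_indices = train_indices[:val_size], train_indices[val_size:]

val_sampler = SubsetRandomSampler(val_indices)
val_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                         sampler=val_sampler)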

Related

How to build a Custom Data Generator for Keras/tf.Keras where X images are being augmented and corresponding Y labels are also images

I am working on image binarization using a UNet and have a dataset of 150 images together with their binarized versions. My idea is to randomly augment the images to make them look different, so I have written a function that applies any of 4-5 types of noise, skewness, shearing and so on to an image. I could have easily used
ImageDataGenerator(preprocessing_function=my_aug_function) to augment the images, but the problem is that my y target is also an image. Also, I could have used something like:
train_dataset = (
    train_dataset.map(
        encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)
But it has 2 problems:
With a larger dataset it will blow up the memory, as the data needs to already be in memory.
This is the crucial part: I need to augment the images on the fly to make it look like I have a huge dataset.
Another solution could be saving the augmented images to a directory, making them 30-40K, and then loading them, but that would be a silly thing to do.
Now the idea is that I can use Sequence as the parent class, but how can I keep augmenting and generating new images on the fly, together with their respective binarized y images?
I have an idea, as in the code below. Can somebody help me with the augmentation and generation of the y images? I have my X_DIR and Y_DIR, where the image names for the binarized and original images are the same but stored in different directories.
class DataGenerator(tensorflow.keras.utils.Sequence):
    def __init__(self, files_path, labels_path, batch_size=32, shuffle=True, random_state=42):
        'Initialization'
        self.files = files_path
        self.labels = labels_path
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.random_state = random_state
        self.on_epoch_end()

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        # Shuffle the data here

    def __len__(self):
        return int(np.floor(len(self.files) / self.batch_size))

    def __getitem__(self, index):
        # What do I do here?

    def __data_generation(self, files):
        # I think this is responsible for the augmentation, but I have no idea how to implement it or how it works.
Custom Image Data Generator
Load directory data into a dataframe for the CustomDataGenerator:
def data_to_df(data_dir, subset=None, validation_split=None):
    df = pd.DataFrame()
    filenames = []
    labels = []

    for dataset in os.listdir(data_dir):
        img_list = os.listdir(os.path.join(data_dir, dataset))
        label = name_to_idx[dataset]

        for image in img_list:
            filenames.append(os.path.join(data_dir, dataset, image))
            labels.append(label)

    df["filenames"] = filenames
    df["labels"] = labels

    if subset == "train":
        split_indexes = int(len(df) * validation_split)
        train_df = df[split_indexes:]
        val_df = df[:split_indexes]
        return train_df, val_df

    return df

train_df, val_df = data_to_df(train_dir, subset="train", validation_split=0.2)
Custom Data Generator
import math

import tensorflow as tf
from PIL import Image
import numpy as np
from sklearn.utils import shuffle
# preprocess_input is assumed to come from whichever backbone you use, e.g.:
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

class CustomDataGenerator(tf.keras.utils.Sequence):
    ''' Custom DataGenerator to load img

    Arguments:
        data_frame = pandas data frame in filenames and labels format
        batch_size = divide data in batches
        shuffle = shuffle data before loading
        img_shape = image shape in (h, w, d) format
        augmentation = data augmentation to make model robust to overfitting

    Output:
        Img: numpy array of image
        label : output label for image
    '''

    def __init__(self, data_frame, batch_size=10, img_shape=None, augmentation=True, num_classes=None):
        self.data_frame = data_frame
        self.train_len = len(data_frame)
        self.batch_size = batch_size
        self.img_shape = img_shape
        self.num_classes = num_classes
        print(f"Found {self.data_frame.shape[0]} images belonging to {self.num_classes} classes")

    def __len__(self):
        ''' return total number of batches '''
        self.data_frame = shuffle(self.data_frame)
        return math.ceil(self.train_len / self.batch_size)

    def on_epoch_end(self):
        ''' shuffle data after every epoch '''
        # fix: on_epoch_end isn't working, shuffling in __len__ as an alternative
        pass

    def __data_augmentation(self, img):
        ''' function to apply some data augmentation '''
        img = tf.keras.preprocessing.image.random_shift(img, 0.2, 0.3)
        img = tf.image.random_flip_left_right(img)
        img = tf.image.random_flip_up_down(img)
        return img

    def __get_image(self, file_id):
        """ open image with file_id path and apply data augmentation """
        img = np.asarray(Image.open(file_id))
        img = np.resize(img, self.img_shape)
        img = self.__data_augmentation(img)
        img = preprocess_input(img)
        return img

    def __get_label(self, label_id):
        """ uncomment the below line to convert label into categorical format """
        # label_id = tf.keras.utils.to_categorical(label_id, num_classes)
        return label_id

    def __getitem__(self, idx):
        batch_x = self.data_frame["filenames"][idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.data_frame["labels"][idx * self.batch_size:(idx + 1) * self.batch_size]

        # read your data here using the batch lists, batch_x and batch_y
        x = [self.__get_image(file_id) for file_id in batch_x]
        y = [self.__get_label(label_id) for label_id in batch_y]

        return tf.convert_to_tensor(x), tf.convert_to_tensor(y)
You can use libraries like albumentations and imgaug; both are good, but I have heard there are issues with the random seed with albumentations.
Here's an example from imgaug, taken from the documentation here:
import imgaug.augmenters as iaa

seq = iaa.Sequential([
    iaa.Dropout([0.05, 0.2]),      # drop 5% or 20% of all pixels
    iaa.Sharpen((0.0, 1.0)),       # sharpen the image
    iaa.Affine(rotate=(-45, 45)),  # rotate by -45 to 45 degrees (affects segmaps)
    iaa.ElasticTransformation(alpha=50, sigma=5)  # apply water effect (affects segmaps)
], random_order=True)

# Augment images and segmaps.
images_aug = []
segmaps_aug = []
for _ in range(len(input_data)):
    images_aug_i, segmaps_aug_i = seq(image=image, segmentation_maps=segmap)
    images_aug.append(images_aug_i)
    segmaps_aug.append(segmaps_aug_i)
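For completeness, albumentations handles paired image/mask augmentation in much the same way; here is a minimal sketch (the specific transforms are arbitrary choices):

import albumentations as A

# the same random spatial transform is applied to both the image and the mask
transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.Rotate(limit=45),
])

augmented = transform(image=image, mask=mask)
image_aug, mask_aug = augmented["image"], augmented["mask"]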
You are going the right way with the custom generator. In __getitem__, make a batch using batch_x = self.files[index * batch_size:(index + 1) * batch_size] (the index Keras passes in is a batch index) and the same for batch_y, then augment them using X, y = __data_generation(batch_x, batch_y), which will load the images (using any library you like; I prefer opencv) and return the augmented pairs (plus any other manipulation).
Your __getitem__ will then return the tuple (X, y), as in the sketch below.
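A minimal sketch of those two methods, assuming self.files and self.labels hold matching image and mask paths, and that your my_aug_function is adapted to transform an (image, mask) pair together; the cv2 loading is an illustrative choice:

import cv2
import numpy as np
import tensorflow

class DataGenerator(tensorflow.keras.utils.Sequence):
    # ... __init__, __len__, on_epoch_end as in the question ...

    def __getitem__(self, index):
        # index is the batch index that Keras passes in
        batch_x = self.files[index * self.batch_size:(index + 1) * self.batch_size]
        batch_y = self.labels[index * self.batch_size:(index + 1) * self.batch_size]
        return self.__data_generation(batch_x, batch_y)

    def __data_generation(self, batch_x, batch_y):
        X, y = [], []
        for img_path, mask_path in zip(batch_x, batch_y):
            img = cv2.imread(img_path)
            mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
            # augment the pair together so the mask stays aligned with the image
            img, mask = my_aug_function(img, mask)
            X.append(img)
            y.append(mask)
        return np.array(X), np.array(y)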
You can use ImageDataGenerator even if your label is an image.
Here is a simple example of how you can do that:
Code:
# Specifying your data augmentation here for both image and label
image_datagen = tf.keras.preprocessing.image.ImageDataGenerator()
mask_datagen = tf.keras.preprocessing.image.ImageDataGenerator()

# Provide the same seed and keyword arguments to the flow methods
seed = 1

image_generator = image_datagen.flow_from_directory(
    data_dir,
    class_mode=None,
    seed=seed)

mask_generator = mask_datagen.flow_from_directory(
    data_dir,
    class_mode=None,
    seed=seed)

# Combine the image and label generator.
train_generator = zip(image_generator, mask_generator)
Now, if you iterate over it you will get:
for image, label in train_generator:
    print(image.shape, label.shape)
    break
Output:
(32, 256, 256, 3) (32, 256, 256, 3)
You can use this train_generator with the fit_generator() command (or fit() in newer versions of Keras).
Code:
model.fit_generator(
    train_generator,
    steps_per_epoch=2000,
    epochs=50)
With flow_from_directory your memory won't be cluttered, and ImageDataGenerator will take care of the augmentation part.
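Note that the empty ImageDataGenerator() calls above apply no augmentation. A sketch of enabling it, with identical arguments and seeds for both generators so the random transforms stay in sync between images and masks (the specific parameters are arbitrary):

data_gen_args = dict(rotation_range=10,
                     width_shift_range=0.1,
                     height_shift_range=0.1,
                     horizontal_flip=True)

# identical configs + identical seeds => identical random transforms per batch
image_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**data_gen_args)
mask_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**data_gen_args)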

How to use flow_from_directory in Keras for multi-class semantic segmentation?

Let's say I have 100 grayscale training images and 100 RGB training masks, each of size 512x512. I was able to one-hot encode the masks using to_categorical in Keras as below:

numclasses = 3
masks_one_hot = to_categorical(maskArr, numclasses)

where maskArr is 100x512x512x1, and masks_one_hot is 100x512x512x3.
However, to use ImageDataGenerator and flow_from_directory with trainGenerator from https://github.com/zhixuhao/unet/blob/master/data.py, I tried to save the one-hot encoded training images and then read them using trainGenerator. However, I noticed that after writing them with imwrite and then reading them with imread, they changed from one-hot encoded 512x512x3 arrays to 512x512x3 RGB images. That is, instead of each channel holding a value of 0 or 1, the values now range from 0-255.
As a result, if I do:
myGenerator = trainGeneratorOneHot(20, 'data/membrane/train', 'image', 'label', data_gen_args,
                                   save_to_dir="data/membrane/train/aug", flag_multi_class=True,
                                   num_class=3, target_size=(512, 512, 3))
num_batch = 3
for i, batch in enumerate(myGenerator):
    if(i >= num_batch):
        break
where trainGeneratorOneHot is below:
def trainGeneratorOneHot(batch_size, ..., class_mode=None, image_class_mode=None):
    image_datagen = ImageDataGenerator(**aug_dict)
    mask_datagen = ImageDataGenerator(**aug_dict)

    image_generator = image_datagen.flow_from_directory(train_path, classes=[image_folder],
                                                        class_mode=image_class_mode, color_mode=image_color_mode,
                                                        target_size=target_size, ...)
    mask_generator = mask_datagen.flow_from_directory(train_path, classes=[mask_folder],
                                                      class_mode=class_mode, target_size=target_size, ...)

    train_generator = zip(image_generator, mask_generator)
    for (img, mask) in train_generator:
        img, mask = adjustDataOneHot(img, mask)
        yield (img, mask)

def adjustDataOneHot(img, mask):
    return (img, mask)
Then I get: ValueError: could not broadcast input array from shape (512,512,1) into shape (512,512,3,1)
How can I fix this?
I was dealing with the same issue a few days ago. I found it essential to make my own data generator class to take in data from a dataframe, augment it, and then one-hot encode it before passing it to my model. I was never able to get the Keras ImageDataGenerator to work for semantic segmentation problems with multiple classes.
Below is a data generator class, in case it might help you out:
def one_hot_encoder(mask, num_classes=8):
    hot_mask = np.zeros(shape=mask.shape, dtype='uint8')
    for c in range(num_classes):
        temp = np.zeros(shape=mask.shape[0:2], dtype='uint8')
        temp[mask[:, :, c] != 0] = 1
        hot_mask[:, :, c] = temp
    return hot_mask

# Image data generator class
class DataGenerator(keras.utils.Sequence):
    def __init__(self, dataframe, batch_size, n_classes=8, augment=False):
        self.dataframe = dataframe
        self.batch_size = batch_size
        self.n_classes = n_classes
        self.augment = augment

    # Steps per epoch
    def __len__(self):
        return len(self.dataframe) // self.batch_size

    # Shuffles and resets the index at the end of a training epoch
    def on_epoch_end(self):
        self.dataframe = self.dataframe.reset_index(drop=True)

    # Generates data, feeds it to training
    def __getitem__(self, index):
        processed_images = []
        processed_masks = []

        for i in range(self.batch_size):
            # index is the batch index, so offset into the dataframe per sample
            row = index * self.batch_size + i
            the_image = io.imread(self.dataframe['Images'][row])
            the_mask = io.imread(self.dataframe['Masks'][row]).astype('uint8')
            one_hot_mask = one_hot_encoder(the_mask, self.n_classes)

            if self.augment:
                # Resizing followed by some augmentations
                processed_image = augs_for_images(image=the_image) / 255.0
                processed_mask = augs_for_masks(image=one_hot_mask)
            else:
                # Still resizing, but no augmentations
                processed_image = resize(image=the_image) / 255.0
                processed_mask = resize(image=one_hot_mask)

            processed_images.append(processed_image)
            processed_masks.append(processed_mask)

        batch_x = np.array(processed_images)
        batch_y = np.array(processed_masks)

        return (batch_x, batch_y)
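A minimal usage sketch, assuming a dataframe (train_df is a hypothetical name) with 'Images' and 'Masks' path columns as the class expects; the batch size is arbitrary:

train_gen = DataGenerator(train_df, batch_size=8, n_classes=8, augment=True)
batch_x, batch_y = train_gen[0]  # fetch the first batch via __getitem__
print(batch_x.shape, batch_y.shape)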
Also, here's a link to a repo with some semantic segmentation models that might be of interest to you. The notebook itself shows how the author dealt with multi-class semantic segmentation.

Keras - Input arrays should have the same number of samples as target arrays

I have the code below which runs a Generative Adversarial Network (GAN) on 374 training images of size 32x32.
Why am I having the following error?
ValueError: Input arrays should have the same number of samples as target arrays. Found 7500 input samples and 40 target samples.
which occurs at the following statement:
discriminator_loss = discriminator.train_on_batch(combined_images,labels)
import keras
from keras import layers
import numpy as np
import cv2
import os
from keras.preprocessing import image

latent_dimension = 32
height = 32
width = 32
channels = 3
iterations = 100000
batch_size = 20
real_images = []

# paths to the training and results directories
train_directory = '/training'
results_directory = '/results'

# GAN generator
generator_input = keras.Input(shape=(latent_dimension,))

# transform the input into a 16x16 128-channel feature map
x = layers.Dense(128*16*16)(generator_input)
x = layers.LeakyReLU()(x)
x = layers.Reshape((16,16,128))(x)

x = layers.Conv2D(256,5,padding='same')(x)
x = layers.LeakyReLU()(x)

# upsample to 32x32
x = layers.Conv2DTranspose(256,4,strides=2,padding='same')(x)
x = layers.LeakyReLU()(x)

x = layers.Conv2D(256,5,padding='same')(x)
x = layers.LeakyReLU()(x)
x = layers.Conv2D(256,5,padding='same')(x)
x = layers.LeakyReLU()(x)

# a 32x32 1-channel feature map is generated (i.e. shape of image)
x = layers.Conv2D(channels,7,activation='tanh',padding='same')(x)

# instantiate the generator model, which maps the input of shape (latent dimension) into an image of shape (32,32,1)
generator = keras.models.Model(generator_input,x)
generator.summary()

# GAN discriminator
discriminator_input = layers.Input(shape=(height,width,channels))
x = layers.Conv2D(128,3)(discriminator_input)
x = layers.LeakyReLU()(x)
x = layers.Conv2D(128,4,strides=2)(x)
x = layers.LeakyReLU()(x)
x = layers.Conv2D(128,4,strides=2)(x)
x = layers.LeakyReLU()(x)
x = layers.Conv2D(128,4,strides=2)(x)
x = layers.LeakyReLU()(x)
x = layers.Flatten()(x)

# dropout layer
x = layers.Dropout(0.4)(x)

# classification layer
x = layers.Dense(1,activation='sigmoid')(x)

# instantiate the discriminator model, and turn a (32,32,1) input
# into a binary classification decision (fake or real)
discriminator = keras.models.Model(discriminator_input,x)
discriminator.summary()

discriminator_optimizer = keras.optimizers.RMSprop(
    lr=0.0008,
    clipvalue=1.0,
    decay=1e-8)
discriminator.compile(optimizer=discriminator_optimizer, loss='binary_crossentropy')

# adversarial network
discriminator.trainable = False

gan_input = keras.Input(shape=(latent_dimension,))
gan_output = discriminator(generator(gan_input))
gan = keras.models.Model(gan_input,gan_output)

gan_optimizer = keras.optimizers.RMSprop(
    lr=0.0004,
    clipvalue=1.0,
    decay=1e-8)
gan.compile(optimizer=gan_optimizer,loss='binary_crossentropy')

start = 0

for step in range(iterations):
    # sample random points in the latent space
    random_latent_vectors = np.random.normal(size=(batch_size,latent_dimension))

    # decode the random latent vectors into fake images
    generated_images = generator.predict(random_latent_vectors)

    stop = start + batch_size

    i = start
    for root, dirs, files in os.walk(train_directory):
        for file in files:
            for i in range(stop-start):
                img = cv2.imread(root + '/' + file)
                real_images.append(img)
                i = i+1

    combined_images = np.concatenate([generated_images,real_images])

    # assemble labels and discriminate between real and fake images
    labels = np.concatenate([np.ones((batch_size,1)),np.zeros((batch_size,1))])

    # add random noise to the labels
    labels = labels + 0.05 * np.random.random(labels.shape)

    # train the discriminator
    discriminator_loss = discriminator.train_on_batch(combined_images,labels)

    random_latent_vectors = np.random.normal(size=(batch_size,latent_dimension))

    # assemble labels that classify the images as "real", which is not true
    misleading_targets = np.zeros((batch_size,1))

    # train the generator via the GAN model, where the discriminator weights are frozen
    adversarial_loss = gan.train_on_batch(random_latent_vectors,misleading_targets)

    start = start + batch_size
    if start > len(train_directory)-batch_size:
        start = 0

    # save the model weights
    if step % 100 == 0:
        gan.save_weights('gan.h5')

        print('discriminator loss: ')
        print(discriminator_loss)
        print('adversarial loss: ')
        print(adversarial_loss)

        img = image.array_to_img(generated_images[0] * 255.)
        img.save(os.path.join(results_directory,'generated_melanoma_image' + str(step) + '.png'))

        img = image.array_to_img(real_images[0] * 255.)
        img.save(os.path.join(results_directory,'real_melanoma_image' + str(step) + '.png'))
Thanks.
The following step is causing the problem:

i = start
for root, dirs, files in os.walk(train_directory):
    for file in files:
        for i in range(stop-start):
            img = cv2.imread(root + '/' + file)
            real_images.append(img)
            i = i+1

You are trying to collect 20 samples of real_images, which is done by the inner loop. But the outer loops run once per file, so the inner loop collects 20 samples for each file, which collects 7480 samples in total, where you planned to collect only 20 in total.
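A possible fix, sketched under the question's own start/stop bookkeeping: walk the directory once to build a flat list of file paths, then slice exactly batch_size of them per training step:

# collect all training file paths once, before the training loop
all_files = []
for root, dirs, files in os.walk(train_directory):
    for file in files:
        all_files.append(os.path.join(root, file))

# inside the training loop: read exactly batch_size real images per step
real_images = [cv2.imread(path) for path in all_files[start:stop]]
combined_images = np.concatenate([generated_images, np.array(real_images)])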

training a multi-output keras model

I have 10,000 images, each labeled with 20 tags. For each image, each tag is either true or false. I'm trying to train a multi-output model to perform all 20 of these binary classifications with one network.
The network is a residual network. After the flatten layer, the network branches out into 20 branches. Each branch has 2 fully connected layers, each of which is followed by a dropout layer, and finally a dense layer with one node and sigmoid activation at the end, as sketched below.
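A minimal sketch of that branch structure in the Keras functional API; here flat stands for the output of the flatten layer and model_input for the network input (both hypothetical names), and the layer widths and dropout rate are illustrative guesses:

import keras
from keras import layers

outputs = []
for i in range(20):
    # two fully connected layers, each followed by dropout
    branch = layers.Dense(256, activation='relu')(flat)
    branch = layers.Dropout(0.5)(branch)
    branch = layers.Dense(256, activation='relu')(branch)
    branch = layers.Dropout(0.5)(branch)
    # one-node sigmoid head for this tag's binary decision
    outputs.append(layers.Dense(1, activation='sigmoid', name='tag_%d' % i)(branch))

model = keras.models.Model(inputs=model_input, outputs=outputs)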
The labels for each image and the image name are stored in a text file, for both the train and validation sets, like this:
1.jpg 1 -1 1 -1 -1 1 -1.........
I wrote my own generator, but I can't get it to work. I keep getting this error:
Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 20 array(s), but instead got the following list of 1 arrays.
Function explanations: the get_input function reads an image and resizes it.
get_output prepares the labels for each image; the labels are stored in a list and returned at the end. preprocess_input performs preprocessing and converts images into arrays. train_generator and validation_generator generate batches of size 32 to be fed to the model.
Here's my code:
def get_input(img_name):
    path = os.path.join("images", img_name)
    img = image.load_img(path, target_size=(224, 224))
    return img

def get_output(img_name, file_path):
    data = pd.read_csv(file_path, delim_whitespace=True, header=None)
    img_id = img_name.split(".")[0]
    img_id = img_id.lstrip("0")
    img_id = int(img_id)
    labels = data.loc[img_id - 1].values
    labels = labels[1:]
    labels = list(labels)

    label_arrays = []
    for i in range(20):
        val = np.zeros((1))
        val[0] = labels[i]
        label_arrays.append(val)

    return label_arrays

def preprocess_input(img_name):
    img = get_input(img_name)
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    return x

def train_generator(batch_size):
    file_path = "train.txt"
    data = pd.read_csv(file_path, delim_whitespace=True, header=None)

    while True:
        for i in range(math.floor(8000/batch_size)):
            x_batch = np.zeros(shape=(32, 224, 224, 3))
            y_batch = np.zeros(shape=(32, 20))
            for j in range(batch_size):
                img_name = data.loc[i * batch_size + j].values
                img_name = img_name[0]
                x = preprocess_input(img_name)
                y = get_output(img_name, file_path)
                x_batch[j, :, :, :] = x
                y_batch[j] = y
            yield(x_batch, y_batch)

def val_generator(batch_size):
    file_path = "val.txt"
    data = pd.read_csv(file_path, delim_whitespace=True, header=None)

    while True:
        for i in range(math.floor(2000/batch_size)):
            x_batch = np.zeros(shape=(32, 224, 224, 3))
            y_batch = np.zeros(shape=(32, 20))
            for j in range(batch_size):
                img_name = data.loc[i * batch_size + j].values
                img_name = img_name[0]
                x = preprocess_input(img_name)
                y = get_output(img_name, file_path)
                x_batch[j, :, :, :] = x
                y_batch[j] = y
            yield(x_batch, y_batch)
Edit:
One quick question. What's the difference between this loop and the one in your answer:

ys = []
for i in range(batch_size):
    ys.append(y_batch[i, :])

yield(x_batch, ys)
If your model has 20 outputs then you must provide a list of 20 arrays as the target. One way of doing this is to modify the generator (for both training and validation):

ys = []
for i in range(20):
    ys.append(y_batch[:, i])

yield(x_batch, ys)
As a side note, you mentioned that you have 20 tags per sample, so why have you specified 40 in the target shape?

y_batch = np.zeros(shape=(32, 40))

Further, I don't know the specific problem you are working on, but alternatively you could have just one output of size 20 instead of 20 outputs of size one, as sketched below.
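A minimal sketch of that alternative; base stands for the tensor after the flatten layer and model_input for the network input (both hypothetical names). With sigmoid activations and binary_crossentropy, each of the 20 units still acts as an independent binary classifier:

from keras import layers, models

output = layers.Dense(20, activation='sigmoid')(base)
model = models.Model(inputs=model_input, outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy')
# the target is now a single (batch_size, 20) array instead of a list of 20 arrays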
You can test the generator's output dimensions by initializing the generator and calling next(). For example, with the train_generator:

train_gen = train_generator(batch_size)
x_batch, y_batch = next(train_gen)

Then check the x_batch and y_batch dimensions and datatypes.
I would make the generator in this way:
def train_generator(batch_size):
    file_path = "train.txt"
    data = pd.read_csv(file_path, delim_whitespace=True, header=None)

    while True:
        for i in range(math.floor(8000/batch_size)):
            # initialize empty lists per batch, so samples don't accumulate across yields
            x_batch = []
            y_batch = []
            for j in range(batch_size):
                img_name = data.loc[i * batch_size + j].values
                img_name = img_name[0]
                x = preprocess_input(img_name)
                y = get_output(img_name, file_path)
                x_batch.append(x)
                y_batch.append(y)
            yield(np.array(x_batch), np.array(y_batch))

Split queue into train/test set

I set up my pipeline starting with a filename queue as in the following pseudocode:
filename_queue = tf.train.string_input_producer(["file0.pd", "file1.pd"])
pointing to TFRecords containing multiple serialized tf.train.Example images.
Following the tensorflow guide, here is a function which reads one example:
def read_my_file_format(filename_queue):
    reader = tf.SomeReader()
    key, record_string = reader.read(filename_queue)
    example, label = tf.some_decoder(record_string)
    processed_example = some_processing(example)
    return processed_example, label
which is used for a batch queue:
def input_pipeline(filenames, batch_size):
    filename_queue = tf.train.string_input_producer(filenames)
    example, label = read_my_file_format(filename_queue)
    example_batch, label_batch = tf.train.shuffle_batch(
        [example, label], batch_size=batch_size, capacity=100,
        min_after_dequeue=10)
    return example_batch, label_batch
I am looking for a way to split the data randomly into training and test sets. I don't want to save the training and test sets to different files; instead, the images should be randomly assigned to the training or test set independently of the file they are read from.
Ideally I would like to split the input pipeline into a training and a test queue.
Here is what I normally do in numpy when I have to split a huge dataset:

import numpy as np
from numpy.random import choice
from numpy.random import RandomState

queue = range(10)
weights = (.8, .2)  # create 2 partitions with these weights

def sampler(partition, seed=0):
    rng = RandomState(seed)
    return lambda x: rng.choice(np.arange(len(weights)), p=weights) == partition

def split(queue, weights):
    # filter the queue for each partition
    return [filter(sampler(partition), queue) for partition in range(len(weights))]

(train, test) = split(queue, weights)

print(list(train))  # [0, 1, 2, 3, 4, 5, 6, 9]
print(list(test))   # [7, 8]
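As an aside, for data that fits in memory, scikit-learn's train_test_split does the same job in one call (a sketch; the 0.2 test fraction matches the weights above):

from sklearn.model_selection import train_test_split

train, test = train_test_split(list(range(10)), test_size=0.2, random_state=0)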
Suggestion, using Tensorflow Dataset API (map(), interleave(), filter()):
import tensorflow as tf
import numpy as np

def _parse_function(example_proto):
    """ Parse TFRecord data """
    features = {"image": tf.FixedLenFeature((), tf.string, default_value=""),
                "label": tf.FixedLenFeature((), tf.int64, default_value=0)}
    parsed_features = tf.parse_single_example(example_proto, features)
    return parsed_features

def split_train_test(parsed_features, train_rate=0.8, seed=11):
    """ Randomly classify samples into training or testing split """
    # Snippet by Igor Gadelha Pereira (https://stackoverflow.com/a/49825457/624547)
    parsed_features['is_train'] = tf.gather(tf.random_uniform([1], seed=seed) < train_rate, 0)
    return parsed_features

def filter_per_split(parsed_features, train=True):
    """ Filter samples depending on their split """
    return parsed_features['is_train'] if train else ~parsed_features['is_train']

def select_features(parsed_features, keys=["image", "label"]):
    """ Return array of features selected by key """
    selected_features = [parsed_features[key] for key in keys]
    return selected_features

weights = (.8, .2)
num_files = 3
file_block_length = 1
files = ["/tmp/file{}.tfrecords".format(i) for i in range(num_files)]
# ... where file{i}.tfrecords contains:
# [{"label": i, "image": "class_{}/img_{}.png".format(i, k)} for k in range(10)]

# Create TFRecord file list:
files = tf.data.Dataset.from_tensor_slices(files)

# Interleave all records:
dataset = files.interleave(lambda x: tf.data.TFRecordDataset(x),
                           cycle_length=num_files, block_length=file_block_length)
# ^ dataset containing:
# [rec0#file0, rec0#file1, rec0#file2, rec1#file0, rec1#file1, rec1#file2, ...]

# Parse TFRecord samples:
dataset = dataset.map(_parse_function)

# Randomly classify samples between training or testing:
dataset = dataset.map(lambda x: split_train_test(x, train_rate=weights[0]))

# Split into 2 datasets accordingly:
dataset_train = dataset.filter(lambda x: filter_per_split(x, train=True))
dataset_test = dataset.filter(lambda x: filter_per_split(x, train=False))

# Opt. remove the "is_train" key, keeping only the original features:
dataset_train = dataset_train.map(select_features)
dataset_test = dataset_test.map(select_features)

# Use:
iterator_train = dataset_train.make_one_shot_iterator()
iterator_test = dataset_test.make_one_shot_iterator()

with tf.Session() as sess:
    for it, name in zip([iterator_train, iterator_test], ["Training", "Testing"]):
        x = it.get_next()
        count = 0
        print("{} Split:".format(name))
        try:
            while True:
                print(sess.run(x))
                count += 1
        except tf.errors.OutOfRangeError:
            print("- End of Split ({} / {})".format(count, num_files * 10))
Output:
Training Split:
(b'class_0/img_0.png', 0)
(b'class_1/img_0.png', 1)
(b'class_2/img_0.png', 2)
(b'class_0/img_1.png', 0)
(b'class_1/img_1.png', 1)
(b'class_1/img_2.png', 1)
(b'class_2/img_2.png', 2)
(b'class_0/img_3.png', 0)
(b'class_1/img_3.png', 1)
(b'class_2/img_3.png', 2)
(b'class_1/img_4.png', 1)
(b'class_2/img_4.png', 2)
(b'class_0/img_5.png', 0)
(b'class_1/img_5.png', 1)
(b'class_2/img_5.png', 2)
(b'class_0/img_6.png', 0)
(b'class_1/img_6.png', 1)
(b'class_2/img_6.png', 2)
(b'class_0/img_7.png', 0)
(b'class_1/img_7.png', 1)
(b'class_2/img_7.png', 2)
(b'class_0/img_8.png', 0)
(b'class_1/img_8.png', 1)
(b'class_2/img_8.png', 2)
(b'class_0/img_9.png', 0)
(b'class_1/img_9.png', 1)
(b'class_2/img_9.png', 2)
- End of Split (27 / 30)
Testing Split:
(b'class_2/img_1.png', 2)
(b'class_0/img_2.png', 0)
(b'class_0/img_4.png', 0)
- End of Split (3 / 30)
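If exact split sizes matter more than per-sample coin flips, here is a sketch of an alternative using take()/skip() on a shuffled dataset; dataset_size is assumed known, and reshuffle_each_iteration=False matters so the two splits never overlap across epochs:

# assumes `dataset` holds dataset_size parsed samples, as above
train_size = int(0.8 * dataset_size)
dataset = dataset.shuffle(buffer_size=dataset_size, seed=11,
                          reshuffle_each_iteration=False)
dataset_train = dataset.take(train_size)  # first 80% of the shuffled order
dataset_test = dataset.skip(train_size)   # remaining 20%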
