Is there any way to add some functionality to ImageDataGenerator so that it can take a list of filenames and randomly sample images for each minibatch?
I know I can write a custom class that inherits from ImageDataGenerator, but I still don't know the details of how to do that.
Here is what I have done:
for epoch in range(epochs):
    print("epoch is: %d, total epochs: %d" % ((epoch + 1), int(epochs)))
    print("prepare training batch...")
    train_batch = makebatch(filelist=self.train_files, img_num=img_num, slice_times=slice_times)
    print("prepare validation batch..")
    val_batch = makebatch(filelist=self.val_files, img_num=int(math.ceil(img_num * 0.2)), slice_times=slice_times)

    x_train = train_batch
    y_train = x_train
    x_val = val_batch
    y_val = x_val

    print("generate training data...")
    train_datagen.fit(x_train)
    train_generator = train_datagen.flow(
        x=x_train,
        y=y_train,
        batch_size=16)
    val_datagen.fit(x_val)
    val_generator = val_datagen.flow(
        x=x_val,
        y=y_val,
        batch_size=16)

    print("start training..")
    history = model.fit_generator(
        generator=train_generator,
        steps_per_epoch=None,
        epochs=1,
        verbose=1,
        validation_data=val_generator,
        validation_steps=None,
        callbacks=self.callbacks)
What I really want is to remove the for loop and have the generator randomly sample images for each batch. Can someone help with that?
Here is what I would do.
Suppose I have a list of paths to all images stored in the variables X_train and X_validation, and the labels stored as y_train and y_validation.
First, I would define a Sequence generator (this is from the Keras website):
from skimage.io import imread
from skimage.transform import resize
import numpy as np
from keras.utils import Sequence

# Here, `x_set` is a list of paths to the images
# and `y_set` are the associated classes.

class CIFAR10Sequence(Sequence):

    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]

        return np.array([
            resize(imread(file_name), (200, 200))
            for file_name in batch_x]), np.array(batch_y)
Now, I would define the generators for training and validation as:
Xtrain_gen = CIFAR10Sequence(X_train, y_train, batch_size=512)  # you can choose your batch size
Xvalidation_gen = CIFAR10Sequence(X_validation, y_validation, batch_size=512)
Now, the final step is to train the model:
model.fit_generator(generator=Xtrain_gen, epochs=100, validation_data=Xvalidation_gen, use_multiprocessing=True)
This avoids the for loop, and it is efficient because the CPU fetches data in parallel.
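If the goal is specifically to draw batches from a list of file paths and resample their order every epoch, the same Sequence pattern can shuffle its file list between epochs. A minimal sketch, under the assumption that the target is the input image itself (as in the question's y_train = x_train) and that images are resized to 200x200:

import numpy as np
from skimage.io import imread
from skimage.transform import resize
from keras.utils import Sequence

class RandomFileSequence(Sequence):
    """Draws batches from a list of image paths and reshuffles the list every epoch."""
    def __init__(self, file_list, batch_size, target_size=(200, 200)):
        self.files = list(file_list)
        self.batch_size = batch_size
        self.target_size = target_size
        np.random.shuffle(self.files)

    def __len__(self):
        return int(np.ceil(len(self.files) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_files = self.files[idx * self.batch_size:(idx + 1) * self.batch_size]
        x = np.array([resize(imread(f), self.target_size) for f in batch_files])
        return x, x  # input doubles as target, as in the question

    def on_epoch_end(self):
        np.random.shuffle(self.files)

An instance such as RandomFileSequence(self.train_files, batch_size=16) could then replace the per-epoch makebatch calls, and a single model.fit_generator(...) call over it removes the outer for loop.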
I'm having problems trying to practice logistic regression in PyTorch.
I want to use the CIFAR10 dataset, but I can't get the training loop to run: when the linear layer is executed I receive a NotImplementedError.
I probably have more than one error that I am not seeing because, as I said, I am learning.
Here is my code:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torchvision import datasets, transforms
import torch.nn.functional as F
from tqdm import tqdm
import torch.nn as nn

# IMPORTING DATA
datatest = mnist_train = datasets.CIFAR10(root="./datasets",
                                          train=True,
                                          transform=transforms.ToTensor(),
                                          download=True)

datatrain = datasets.CIFAR10(root="./datasets",
                             train=False,
                             transform=transforms.ToTensor(),
                             download=True)

print(f'Number of CIFAR test examples {len(datatest)}')
print(f'Number of CIFAR train examples {len(datatest)}')

train_loader = torch.utils.data.DataLoader(datatrain, batch_size=100, shuffle=True)
test_loader = torch.utils.data.DataLoader(datatest, batch_size=100, shuffle=False)

data_train_iter = iter(train_loader)
images, labels = data_train_iter.next()

print("Shape of the minibatch of images: {}".format(images.shape))
print("Shape of the minibatch of labels: {}".format(labels.shape))

#n_samples, n_features = images.shape, labels.shape
#print(n_samples, n_features)

# MODEL
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear = nn.Linear(3072, 10)

    def foward(self, x):
        return self.linear(x)

# Initialize model
model = Model()

# Criterion
criterion = nn.CrossEntropyLoss()

# Optimizer
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(),
                            lr=learning_rate)

# Iterate through train set minibatchs
for images, labels in tqdm(train_loader):
    # Zero out the gradients
    optimizer.zero_grad()

    # Forward pass
    x = images.view(-1, 32*32*3)
    y = model(x)

    loss = criterion(y, labels)
    loss.backward()
    optimizer.step()

## Testing
correct = 0
total = len(datatest)

with torch.no_grad():
    # Iterate through test set minibatchs
    for images, labels in tqdm(test_loader):
        # Forward pass
        x = images.view(-1, 32*32*3)
        y = model(x)

        predictions = torch.argmax(y, dim=1)
        correct += torch.sum((predictions == labels).float())

print('Test accuracy: {}'.format(correct/total))
Thanks!
It is due to a spelling error of forward in your Model class: you have written it as foward. nn.Module.__call__ dispatches to forward, and the base class's forward raises NotImplementedError when it has not been overridden, which is exactly the error you see. Please correct the spelling in:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear = nn.Linear(3072, 10)

    def forward(self, x):  # you had written it as `foward`
        return self.linear(x)
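As a quick sanity check (a sketch, not from the original post), the corrected class can be exercised on a dummy batch and now returns logits instead of raising NotImplementedError:

import torch
import torch.nn as nn

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear = nn.Linear(3072, 10)

    def forward(self, x):
        return self.linear(x)

model = Model()
out = model(torch.randn(4, 3072))  # calling the module dispatches to forward()
print(out.shape)                   # torch.Size([4, 10])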
I am slightly losing my mind over a simple task. I want to implement a simple RandomForestClassifier on images using tf.estimator.BoostedTreesClassifier (a gradient boosted tree is good enough, although the difference is clear to me). I'm following the https://www.tensorflow.org/tutorials/estimator/boosted_trees_model_understanding guide. I swapped the
# Use entire batch since this is such a small dataset.
NUM_EXAMPLES = len(y_train)

def make_input_fn(X, y, n_epochs=None, shuffle=True):
    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((X.to_dict(orient='list'), y))
        if shuffle:
            dataset = dataset.shuffle(NUM_EXAMPLES)
        # For training, cycle thru dataset as many times as need (n_epochs=None).
        dataset = (dataset
                   .repeat(n_epochs)
                   .batch(NUM_EXAMPLES))
        return dataset
    return input_fn
with my own function, which looks like this:
# LOADING IMAGES USING TENSORFLOW
labels = ['some', 'fancy', 'labels']
batch_size = 32

datagen = ImageDataGenerator(
    rescale=1. / 255,
    data_format='channels_last',
    validation_split=0.1,
    dtype=tf.float32
)

train_generator = datagen.flow_from_directory(
    './images',
    classes=labels,
    target_size=(128, 128),
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=True,
    subset='training',
    seed=42
)

valid_generator = datagen.flow_from_directory(
    './images',
    classes=labels,
    target_size=(128, 128),
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=False,
    subset='validation',
    seed=42
)

# THE SWAPPED FUNCTION:
NUM_FEATURES = 128 * 128
NUM_EXAMPLES = len(train_generator)

def make_input_fn(gen, n_epochs=None, shuffle=True):
    def input_fn():
        dataset = tf.data.Dataset.from_generator(gen, (tf.float32, tf.int32))
        if shuffle:
            dataset = dataset.shuffle(NUM_EXAMPLES)
        # For training, cycle thru dataset as many times as need (n_epochs=None).
        dataset = (dataset
                   .repeat(n_epochs)
                   .batch(NUM_EXAMPLES))
        return dataset
    return input_fn

def _generator_(tf_gen):
    print(len(tf_gen))
    def arg_free():
        for _ in range(len(tf_gen)):
            X, y = next(iter(tf_gen))
            X = X.reshape((len(X), -1))
            print(X.shape)
            yield X, y
    return arg_free()

_gen = _generator_(train_generator)
print(callable(g_gen))  # returns False. WHY?!
I don't understand why this is not working, and why on earth nobody has ever thought about making a simple enough tutorial (or why I am not able to find it :D). If you are asking yourself why I want to use a random forest and not regular deep learning approaches: the RF is mandated by the supervising authority, and it has to be TF (not e.g. sklearn).
Anyway, any help would be appreciated.
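For reference, a hedged sketch of how such a wrapper is usually handed to tf.data.Dataset.from_generator, which expects a callable that returns an iterator (the names below are illustrative, not a verified fix for the code above):

import tensorflow as tf

def make_arg_free(keras_gen):
    def arg_free():
        for i in range(len(keras_gen)):
            X, y = keras_gen[i]  # Keras directory iterators support indexing
            yield X.reshape((len(X), -1)), y
    # Return the function itself, not its result: from_generator wants a
    # callable, and callable(arg_free) is True while a generator object is not.
    return arg_free

# dataset = tf.data.Dataset.from_generator(
#     make_arg_free(train_generator), (tf.float32, tf.int32))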
I'm using a deep CNN+LSTM network to perform classification on a dataset of 1D signals. I'm using Keras 2.2.4 backed by TensorFlow 1.12.0. Since I have a large dataset and limited resources, I'm using a generator to load the data into memory during the training phase. First, I tried this generator:
def data_generator(batch_size, preproc, type, x, y):
    num_examples = len(x)
    examples = zip(x, y)
    examples = sorted(examples, key=lambda x: x[0].shape[0])
    end = num_examples - batch_size + 1
    batches = [examples[i:i + batch_size] for i in range(0, end, batch_size)]

    random.shuffle(batches)
    while True:
        for batch in batches:
            x, y = zip(*batch)
            yield preproc.process(x, y)
Using the above method, I'm able to launch training with a mini-batch size of up to 30 samples. However, this kind of method does not guarantee that the network trains only once on each sample per epoch. Considering this comment from Keras's website:
Sequence is a safer way to do multiprocessing. This structure
guarantees that the network will only train once on each sample per
epoch which is not the case with generators.
I've tried another way of loading data using the following class:
class Data_Gen(Sequence):

    def __init__(self, batch_size, preproc, type, x_set, y_set):
        self.x, self.y = np.array(x_set), np.array(y_set)
        self.batch_size = batch_size
        self.indices = np.arange(self.x.shape[0])
        np.random.shuffle(self.indices)
        self.type = type
        self.preproc = preproc

    def __len__(self):
        # print(self.type + ' - len : ' + str(int(np.ceil(self.x.shape[0] / self.batch_size))))
        return int(np.ceil(self.x.shape[0] / self.batch_size))

    def __getitem__(self, idx):
        inds = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = self.x[inds]
        batch_y = self.y[inds]
        return self.preproc.process(batch_x, batch_y)

    def on_epoch_end(self):
        np.random.shuffle(self.indices)
I can confirm that using this method the network trains once on each sample per epoch, but this time, when I put more than 7 samples in the mini-batch, I get an out-of-memory error:
OP_REQUIRES failed at random_op.cc: 202: Resource exhausted: OOM when
allocating tensor with shape...............
I can confirm that I'm using the same model architecture, configuration, and machine for this test. I'm wondering why there is a difference between these two ways of loading data.
Please don't hesitate to ask for more details in case needed.
Thanks in advance.
EDITED:
Here is the code I'm using to fit the model:
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    factor=0.1,
    patience=2,
    min_lr=params["learning_rate"])

checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=str(get_filename_for_saving(save_dir)),
    save_best_only=False)

batch_size = params.get("batch_size", 32)

path = './logs/run-{0}'.format(datetime.now().strftime("%b %d %Y %H:%M:%S"))
tensorboard = keras.callbacks.TensorBoard(log_dir=path, histogram_freq=0,
                                          write_graph=True, write_images=False)

if index == 0:
    print(model.summary())
    print("Model memory needed for batchsize {0} : {1} Gb".format(batch_size, get_model_memory_usage(batch_size, model)))

if params.get("generator", False):
    train_gen = load.data_generator(batch_size, preproc, 'Train', *train)
    dev_gen = load.data_generator(batch_size, preproc, 'Dev', *dev)
    valid_metrics = Metrics(dev_gen, len(dev[0]) // batch_size, batch_size)
    model.fit_generator(
        train_gen,
        steps_per_epoch=len(train[0]) / batch_size + 1 if len(train[0]) % batch_size != 0 else len(train[0]) // batch_size,
        epochs=MAX_EPOCHS,
        validation_data=dev_gen,
        validation_steps=len(dev[0]) / batch_size + 1 if len(dev[0]) % batch_size != 0 else len(dev[0]) // batch_size,
        callbacks=[valid_metrics, MyCallback(), checkpointer, reduce_lr, tensorboard])

    # train_gen = load.Data_Gen(batch_size, preproc, 'Train', *train)
    # dev_gen = load.Data_Gen(batch_size, preproc, 'Dev', *dev)
    # model.fit_generator(
    #     train_gen,
    #     epochs=MAX_EPOCHS,
    #     validation_data=dev_gen,
    #     callbacks=[valid_metrics, MyCallback(), checkpointer, reduce_lr, tensorboard])
Those methods are roughly the same. It is correct to subclass Sequence when your dataset doesn't fit in memory, but you shouldn't run any preprocessing in any of the class's methods, because that would be re-executed once per epoch, wasting lots of computing resources.
It is probably also easier to shuffle the samples rather than their indices, like this:
from random import shuffle

import numpy as np
from keras.utils import Sequence

class DataGen(Sequence):
    def __init__(self, batch_size, preproc, type, x_set, y_set):
        self.samples = list(zip(x_set, y_set))
        self.batch_size = batch_size
        shuffle(self.samples)
        self.type = type
        self.preproc = preproc

    def __len__(self):
        return int(np.ceil(len(self.samples) / self.batch_size))

    def __getitem__(self, i):
        batch = self.samples[i * self.batch_size:(i + 1) * self.batch_size]
        return self.preproc.process(*zip(*batch))

    def on_epoch_end(self):
        shuffle(self.samples)
I think it is impossible to say why you run out of memory without knowing more about your data. My guess would be that your preproc function is doing something wrong. You can debug it by running:

for e in DataGen(batch_size, preproc, 'Train', *train):
    print(e)

for e in DataGen(batch_size, preproc, 'Dev', *dev):
    print(e)
You will most likely run out of memory.
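If printing full batches is impractical, a variant that only reports shapes (a sketch, assuming preproc.process returns an (x, y) pair of arrays) makes an unexpectedly large batch easier to spot:

import numpy as np

gen = DataGen(batch_size, preproc, 'Train', *train)
for i in range(len(gen)):
    batch_x, batch_y = gen[i]
    # a batch whose shape is far larger than the others points at the preprocessing
    print(i, np.asarray(batch_x).shape, np.asarray(batch_y).shape)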
I followed this tutorial to create a custom generator for my Keras model. Here is an MWE that shows the issues I'm facing:
import sys, keras
import numpy as np
import tensorflow as tf
import pandas as pd
from keras.models import Model
from keras.layers import Dense, Input
from keras.optimizers import Adam
from keras.losses import binary_crossentropy

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, batch_size, shuffle=False):
        'Initialization'
        self.batch_size = batch_size
        self.list_IDs = list_IDs
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        #print('self.batch_size: ', self.batch_size)
        print('index: ', index)
        sys.exit()

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        print('self.indexes: ', self.indexes)
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples'  # X : (n_samples, *dim, n_channels)
        X1 = np.empty((self.batch_size, 10), dtype=float)
        X2 = np.empty((self.batch_size, 12), dtype=int)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            print('i is: ', i, 'ID is: ', ID)

            # Preprocess this sample (omitted)

            X1[i,] = np.repeat(1, X1.shape[1])
            X2[i,] = np.repeat(2, X2.shape[1])

        Y = X1[:, :-1]
        return X1, X2, Y

if __name__ == '__main__':
    train_ids_to_use = list(np.arange(1, 321))    # 1, 2, ..., 320
    valid_ids_to_use = list(np.arange(321, 481))  # 321, 322, ..., 480

    params = {'batch_size': 32}

    train_generator = DataGenerator(train_ids_to_use, **params)
    valid_generator = DataGenerator(valid_ids_to_use, **params)

    # Build a toy model
    input_1 = Input(shape=(3, 10))
    input_2 = Input(shape=(3, 12))
    y_input = Input(shape=(3, 10))

    concat_1 = keras.layers.concatenate([input_1, input_2])
    concat_2 = keras.layers.concatenate([concat_1, y_input])

    dense_1 = Dense(10, activation='relu')(concat_2)
    output_1 = Dense(10, activation='sigmoid')(dense_1)

    model = Model([input_1, input_2, y_input], output_1)
    print(model.summary())

    # Compile and fit_generator
    model.compile(optimizer=Adam(lr=0.001), loss=binary_crossentropy)
    model.fit_generator(generator=train_generator, validation_data=valid_generator, epochs=2, verbose=2)
I don't want to shuffle my input data. I thought that was being handled, but when I print out index in __getitem__, I get random numbers. I would like consecutive numbers. Notice I'm trying to kill the process using sys.exit inside __getitem__ to see what's going on.
My questions:
Why does index not go consecutively? How can I fix this?
When I run this in the terminal using screen, why doesn't it respond to Ctrl+C?
You can use the shuffle argument of the fit_generator method to generate batches consecutively. From the fit_generator() documentation:
shuffle: Boolean. Whether to shuffle the order of the batches at the beginning of each epoch. Only used with instances of Sequence (keras.utils.Sequence). Has no effect when steps_per_epoch is not None.
Just pass shuffle=False to fit_generator:
model.fit_generator(generator=train_generator, shuffle=False, ...)
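Combined with the rest of the call from your MWE, that looks roughly like:

model.fit_generator(generator=train_generator,
                    validation_data=valid_generator,
                    epochs=2, verbose=2, shuffle=False)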
I am working on a multilabel classification model where I am trying to combine two models, a CNN and a text classifier, into one model using Keras and train them together, like so:
# cnn_model is a vgg16 model
# text_model looks as follows:

### takes the vectorized text as input
text_model = Sequential()
text_model.add(Dense(vec_size, input_shape=(vec_size,), name='aux_input'))

## merging both models
merged = Merge([cnn_model, text_model], mode='concat')

### final_model takes the combined models and adds a softmax classifier to it
final_model = Sequential()
final_model.add(merged)
final_model.add(Dense(n_classes, activation='softmax'))
As such, I am working with an ImageDataGenerator to process the images and the respective labels.
For the images I am using a custom helper function that reads images into the model via paths provided by pandas dataframes - one for training (df_train) and one for validation (df_validation). The dataframes also provide the final labels for the model in the "label_vec" column:
# From https://github.com/keras-team/keras/issues/5152
def flow_from_dataframe(img_data_gen, in_df, path_col, y_col, **dflow_args):
    base_dir = os.path.dirname(in_df[path_col].values[0])
    print('## Ignore next message from keras, values are replaced anyways')
    df_gen = img_data_gen.flow_from_directory(base_dir, class_mode='sparse', **dflow_args)
    df_gen.filenames = in_df[path_col].values
    df_gen.classes = numpy.stack(in_df[y_col].values)
    df_gen.samples = in_df.shape[0]
    df_gen.n = in_df.shape[0]
    df_gen._set_index_array()
    df_gen.directory = ''  # since we have the full path
    print('Reinserting dataframe: {} images'.format(in_df.shape[0]))
    return df_gen

from keras.applications.vgg16 import preprocess_input

train_datagen = keras.preprocessing.image.ImageDataGenerator(preprocessing_function=preprocess_input,
                                                              horizontal_flip=True)
validation_datagen = keras.preprocessing.image.ImageDataGenerator(preprocessing_function=preprocess_input)  # rescale=1./255

train_generator = flow_from_dataframe(train_datagen, df_train,
                                      path_col='filename',
                                      y_col='label_vec',
                                      target_size=(224, 224), batch_size=128, shuffle=False)

validation_generator = flow_from_dataframe(validation_datagen, df_validation,
                                           path_col='filename',
                                           y_col='label_vec',
                                           target_size=(224, 224), batch_size=64, shuffle=False)
Now I am trying to provide my one-hot-encoded text vectors (e.g. [0,0,0,1,0,0]), which are also stored in a pandas dataframe, to the model.
Since my train_generator provides me with the image and label data, I am now looking for a way to combine this generator with a generator that lets me additionally feed in the respective text vector.
You might want to consider writing your own generator (making use of Keras' Sequence object to allow for multiprocessing) instead of modifying the ImageDataGenerator code. From the Keras docs:
from skimage.io import imread
from skimage.transform import resize
import numpy as np
from keras.utils import Sequence

class CIFAR10Sequence(Sequence):

    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]

        return np.array([
            resize(imread(file_name), (200, 200))
            for file_name in batch_x]), np.array(batch_y)
You could have your labels, paths to the images, and paths to the text files in a single pandas dataframe and modify the __getitem__ method from above so that your generator yields all three of them simultaneously: one list of numpy arrays X containing all the inputs, and one numpy array Y containing the outputs.
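A minimal sketch of that modification, assuming the dataframe holds image paths in 'filename' and labels in 'label_vec' (as in your question) plus the vectorized text in a hypothetical 'text_vec' column, with 224x224 images as used by VGG16:

import numpy as np
from skimage.io import imread
from skimage.transform import resize
from keras.utils import Sequence

class ImageTextSequence(Sequence):
    """Yields ([image_batch, text_batch], label_batch) from a single dataframe."""
    def __init__(self, df, batch_size, img_col='filename',
                 text_col='text_vec', label_col='label_vec'):
        self.df = df.reset_index(drop=True)
        self.batch_size = batch_size
        self.img_col, self.text_col, self.label_col = img_col, text_col, label_col

    def __len__(self):
        return int(np.ceil(len(self.df) / float(self.batch_size)))

    def __getitem__(self, idx):
        rows = self.df.iloc[idx * self.batch_size:(idx + 1) * self.batch_size]
        images = np.array([resize(imread(p), (224, 224))
                           for p in rows[self.img_col]])
        texts = np.stack(rows[self.text_col].values)
        labels = np.stack(rows[self.label_col].values)
        # one list of inputs (image tensor + text vectors), one output array
        return [images, texts], labels

An instance built from df_train could then be passed directly to final_model.fit_generator.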