Tensorflow input pipeline with queues workflow - feeding data - python

I am using the Tensorflow input pipeline via queues.
Therefore, I do:
images, volumes = utils.inputs(FLAGS.train_file_path, FLAGS.batch_size, FLAGS.num_epochs)
v_images, v_volumes = utils.inputs(FLAGS.val_file_path, FLAGS.batch_size)
To read the data.
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
with tf.Session() as sess:
# Start input enqueue threads.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
step = 0
while not coord.should_stop():
_, train_loss = sess.run([train_optimizer, loss])
if step % FLAGS.val_interval == 0:
for val_step in range(0, FLAGS.val_iter):
val_loss = sess.run(v_loss)
How can I make sure that I use the images, volumes and volumes for the training and the v_images, v_volumes for validation?
EDIT
logits = utils.inference(images, FLAGS.stacks, True)
v_logits = utils.inference(v_images, FLAGS.stacks, False)
loss = utils.get_loss(logits, volumes, FLAGS.stacks, 'train')
v_loss = utils.get_loss(v_logits, v_volumes, FLAGS.stacks, 'val')
EDIT 2
summary_train_op = tf.summary.merge_all('train')
summary_val_op = tf.summary.merge_all('val')
with tf.Session() as sess:
summary_writer = tf.summary.FileWriter(FLAGS.train_dir + FLAGS.run, sess.graph)
summary_writer_val = tf.summary.FileWriter(FLAGS.train_dir + FLAGS.run + FLAGS.run_val)

One technique is to build graph nodes individually for training and validation.
Consider "net" is your model graph.
logits_train, volumes = net(images,volumes)
logits_val, v_volumes = net(v_images, v_volumes)
loss_train = some_loss(logits_train,volumes)
train_optimizer = some_optimizer(loss_train)
Now, if you do,
1. sess.run([train_optimizer]) #It uses training data (Dequeues train data)
2. sess.run([logits_val]) #It uses validation data (Dequeues validation data )
I hope this helps.
Update:
_,loss_train_value, logits_train_value = sess.run([train_optimizer, loss_train,logits_train])
Update 2
I am doing everything explicitly as shown below. May be there is a better way to do it.
loss_train_s = tf.summary.scalar("loss_train", loss_train)
acc_train_s = tf.summary.scalar("acc_train", acc_train)
loss_val_s = tf.summary.scalar("loss_val", loss_val)
train_summary_op = tf.summary.merge([loss_train_s,acc_train_s])
val_summary_op = tf.summary.merge([loss_val_s])

Related

Tensorflow GPU Out Of Memory during runtime using dynamic_rnn

I'm having trouble training a seq2seq model using Tensorflow on an Nvidia P100 GPU.
Here are the versions I'm using: TensorFlow 1.10.0, Keras 2.2.2, Python 3.6.3, CUDA 9.2.148.1, cuDNN 7.2.1
I currently get an OOM error well in the middle of training (18 minutes).
I've been doing a little digging and tried setting allow_growth = True (flag not set in the code below) but did not manage to see any memory grow, it all gets allocated at the start.
I also tried setting the graph to read only with tf.finalize() but the program still runs, which suggests no nodes are being added or that if haven't placed the function in the correct place in the code.
Since the graph trains and can be saved and all, it doesn't seem to be too large at the start.
Here are some of the hyper parameters I'm using:
batch_size = 10
rnn_size = 1024
src/tgt_vocab_size = 8000
epochs = 20
display_steps = 10
The dataset used are docstrings and the associated function code, so not incredibly long.
One of my original thoughts was that since the size of the sentences is dynamic, one really long one could be too big. But I shuffled the dataset to see if the crash happened at a different time and it's still at 18 minutes with the same parameters.
Here is the code including the graphs and the training/testing loop.
def train(...):
...
...
def source_generator():
for el in train_source_sents:
yield el
def target_generator():
for el in train_target_sents:
yield el
train_graph = tf.Graph()
with train_graph.as_default():
# Dataset and batch preparation
with tf.name_scope("Dataset-prep"):
source_dataset = tf.data.Dataset.from_generator(source_generator,output_types= tf.int32,output_shapes=(tf.TensorShape([None]))) # Converting sentence array to dataset
target_dataset = tf.data.Dataset.from_generator(target_generator,output_types= tf.int32,output_shapes=(tf.TensorShape([None]))) # Converting sentence array to dataset
target_dataset = target_dataset.map(lambda x: tf.concat([x, [target_tok2ID['<EOS>']]], 0)) # Adding <EOS> character to the endo of all the target sentences
target_input_dataset = target_dataset.map(lambda x: tf.concat([[target_tok2ID['<GO>']], x], 0)) # Creating training inputs for the decoder, This requires adding <GO> to the start of the sequence
target_sequence_length = target_dataset.map(lambda x: tf.shape(x)[0]) # Adding the sizes of all the sequences to be paired up with the reset of the dataset
dataset = tf.data.Dataset.zip((source_dataset, target_dataset, target_input_dataset, target_sequence_length)) # create the collection of all the individual datasets
dataset = dataset.shuffle(buffer_size=10000)
with tf.name_scope("Dataset-batching"):
dataset = dataset.repeat(epochs)
pad_id = target_tok2ID['<PAD>']
batched_dataset = dataset.padded_batch(batch_size, padded_shapes=([None], [None],[None], []), padding_values=(pad_id,pad_id,pad_id,pad_id))
batched_dataset = batched_dataset.prefetch(buffer_size=batch_size) # could be removed, perhaps yeilds improvements
iterator = batched_dataset.make_one_shot_iterator()
source_batch, target_batch, target_input_batch, batch_target_sequence_length = iterator.get_next()
with tf.name_scope("Encoding-layer"):
source_vocab_size = len(source_tok2ID)
embed = tf.contrib.layers.embed_sequence(source_batch, vocab_size=source_vocab_size, embed_dim=encoding_embedding_size)
stacked_cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size), keep_prob) for _ in range(num_layers)])
outputs, encoder_state = tf.nn.dynamic_rnn(stacked_cells, embed, dtype=tf.float32)
with tf.name_scope("Decoding-layer"):
target_vocab_size = len(target_tok2ID)
dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, target_input_batch)
cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(num_layers)])
output_layer = tf.layers.Dense(target_vocab_size)
dec_cell = tf.contrib.rnn.DropoutWrapper(cells, output_keep_prob=keep_prob)
helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input, batch_target_sequence_length)
decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, helper, encoder_state, output_layer)
max_target_sequence_length = tf.reduce_max(batch_target_sequence_length, axis=0)
dec_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, impute_finished=True, maximum_iterations=max_target_sequence_length)
dec_outputs = tf.identity(dec_outputs.rnn_output, name='logits') # This step might seem a little misterious, but it is to take the output from the dynamic decoder and pass it to a tensor from (Dodumentation is scarce here)
masks = tf.sequence_mask(batch_target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')
cost = tf.contrib.seq2seq.sequence_loss( dec_outputs, target_batch, masks)
optimizer = tf.train.AdamOptimizer(learning_rate)
gradients = optimizer.compute_gradients(cost)
capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
train_op = optimizer.apply_gradients(capped_gradients)
tf.summary.scalar('cost', cost)
merged = tf.summary.merge_all()
train_saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
train_sess = tf.Session(graph=train_graph)
if(model_path):
print(model_path)
train_saver.restore(train_sess, model_path)
else:
train_sess.run(init_op)
store_path = "./visualisations/"
writer = tf.summary.FileWriter(store_path, train_sess.graph)
step = 0
start_time = datetime.now()
while True:
try:
step += 1
model_cost, _, summary = train_sess.run((cost, train_op, merged))
writer.add_summary(summary,step)
if(step % display_step == 0):
save_path = train_saver.save(train_sess, "./checkpoints/NMT")
print("Model saved in path: %s" % save_path)
test_sess = tf.Session(graph=test_graph)
test_saver.restore(test_sess, save_path)
# print(test_sess.run(foo))
scores = []
while True:
try:
predictions, refrence = test_sess.run([dec_predictions, test_target_batch])
for i in range(len(refrence)):
BLEU_score = nltk.translate.bleu_score.sentence_bleu([np.trim_zeros(refrence[i])], np.trim_zeros(predictions[i]), weights = (1,0))
scores.append(BLEU_score)
print("########################################")
print("ref:", list(map(lambda x: target_ID2tok[x], np.trim_zeros(refrence[i]))))
print("")
print("pred:", list(map(lambda x: target_ID2tok[x], np.trim_zeros(predictions[i]))))
except tf.errors.OutOfRangeError:
print("Exhausted test data")
break
delta_time = datetime.now() - start_time
total_exp_time = delta_time * (total_steps / step)
remaining_time = total_exp_time - delta_time
print("")
print("Test set BLEU Score:", np.mean(scores))
print("Model cost:" ,model_cost)
print("Step {} from {}".format(step, total_steps))
print("Current time:", datetime.now())
print("Total Experiment time (Hours:Minutes:Seconds):", str(total_exp_time))
print("Time Elapled (Hours:Minutes:Seconds):", str(delta_time))
print("Time remaining (Hours:Minutes:Seconds):", str(remaining_time))
print("")
except tf.errors.OutOfRangeError:
print("Model finished training")
break
return save_path
Here is an output of a training run:
command line output
Is there something wrong with the way I'm executing the graph, or am I repeating some step leading to the memory fill up?
Thanks for all your help!

How to plot different summary metrics on the same plot with Tensorboard?

I would like to be able plot the training loss per batch and the average validation loss for the validation set on the same plot in Tensorboard. I ran into this issue when my validation set was too large to fit into memory so required batching and the use of tf.metrics update ops.
This question could apply to any Tensorflow metrics you wanted to appear on the same graph in Tensorboard.
I am able to
plot these two graphs separately (see here)
plot the validation-loss-per-validation-batch on the same graph as the training-loss-per-training-batch (this was OK when the validation set could be a single batch and I could reuse the training summary op train_summ below)
In the example code below, my issue stems from the fact that my validation summary tf.summary.scalar with name=loss gets renamed to loss_1 and thus is moved to a separate graph in Tensorboard. From what I can work out Tensorboard takes "same name" and plots them on the same graph, regardless of what folder they are in. This is frustrating as train_summ (name=loss) is only ever written to the train folder and valid_summ (name=loss) is only ever written to the valid folder - but is still renamed to loss_1.
The example code:
# View graphs with (Linux): $ tensorboard --logdir=/tmp/my_tf_model
import tensorflow as tf
import numpy as np
import os
import tempfile
def train_data_gen():
yield np.random.normal(size=[3]), np.array([0.5, 0.5, 0.5])
def valid_data_gen():
yield np.random.normal(size=[3]), np.array([0.8, 0.8, 0.8])
batch_size = 25
n_training_batches = 4
n_valid_batches = 2
n_epochs = 5
summary_loc = os.path.join(tempfile.gettempdir(), 'my_tf_model')
print("Summaries written to" + summary_loc)
# Dummy data
train_data = tf.data.Dataset.from_generator(train_data_gen, (tf.float32, tf.float32)).repeat().batch(batch_size)
valid_data = tf.data.Dataset.from_generator(valid_data_gen, (tf.float32, tf.float32)).repeat().batch(batch_size)
handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(handle,
train_data.output_types, train_data.output_shapes)
batch_x, batch_y = iterator.get_next()
train_iter = train_data.make_initializable_iterator()
valid_iter = valid_data.make_initializable_iterator()
# Some ops on the data
loss = tf.losses.mean_squared_error(batch_x, batch_y)
valid_loss, valid_loss_update = tf.metrics.mean(loss)
# Write to summaries
train_summ = tf.summary.scalar('loss', loss)
valid_summ = tf.summary.scalar('loss', valid_loss) # <- will be renamed to "loss_1"
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
train_handle, valid_handle = sess.run([train_iter.string_handle(), valid_iter.string_handle()])
sess.run([train_iter.initializer, valid_iter.initializer])
# Summary writers
writer_train = tf.summary.FileWriter(os.path.join(summary_loc, 'train'), sess.graph)
writer_valid = tf.summary.FileWriter(os.path.join(summary_loc, 'valid'), sess.graph)
global_step = 0 # implicit as no actual training
for i in range(n_epochs):
# "Training"
for j in range(n_training_batches):
global_step += 1
summ = sess.run(train_summ, feed_dict={handle: train_handle})
writer_train.add_summary(summary=summ, global_step=global_step)
# "Validation"
sess.run(tf.local_variables_initializer())
for j in range(n_valid_batches):
_, batch_summ = sess.run([valid_loss_update, train_summ], feed_dict={handle: valid_handle})
# The following will plot the batch loss for the validation set on the loss plot with the training data:
# writer_valid.add_summary(summary=batch_summ, global_step=global_step + j + 1)
summ = sess.run(valid_summ)
writer_valid.add_summary(summary=summ, global_step=global_step) # <- I want this on the training loss graph
What I have tried
Separate tf.summary.FileWriter objects (one for training, one for validation), as recommended by this issue and this question (think what I'm after is alluded to in the comment of that question)
The use of tf.summary.merge to merge all my training and validation/test metrics into overall summary ops; does useful book-keeping but doesn't plot what I want on the same graph
Use of the tf.summary.scalar family attribute (loss still gets renamed to loss_1)
(Complete hack solution) Use valid_loss, valid_loss_update = tf.metrics.mean(loss) on the training data and then run tf.local_variables_initializer() every training batch. This does give you the same summary op and thus puts things on the same graph but is surely not how you're meant to do this? It also doesn't generalise to other metrics.
Context
Tensorflow 1.9.0
Tensorboard 1.9.0
Python 3.5.2
The Tensorboard custom_scalar plugin is the way to solve this problem.
Here's the same example again with a custom_scalar to plot the two losses (per training batch + averaged over all validation batches) on the same plot:
# View graphs with (Linux): $ tensorboard --logdir=/tmp/my_tf_model
import os
import tempfile
import tensorflow as tf
import numpy as np
from tensorboard import summary as summary_lib
from tensorboard.plugins.custom_scalar import layout_pb2
def train_data_gen():
yield np.random.normal(size=[3]), np.array([0.5, 0.5, 0.5])
def valid_data_gen():
yield np.random.normal(size=[3]), np.array([0.8, 0.8, 0.8])
batch_size = 25
n_training_batches = 4
n_valid_batches = 2
n_epochs = 5
summary_loc = os.path.join(tempfile.gettempdir(), 'my_tf_model')
print("Summaries written to " + summary_loc)
# Dummy data
train_data = tf.data.Dataset.from_generator(
train_data_gen, (tf.float32, tf.float32)).repeat().batch(batch_size)
valid_data = tf.data.Dataset.from_generator(
valid_data_gen, (tf.float32, tf.float32)).repeat().batch(batch_size)
handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(handle, train_data.output_types,
train_data.output_shapes)
batch_x, batch_y = iterator.get_next()
train_iter = train_data.make_initializable_iterator()
valid_iter = valid_data.make_initializable_iterator()
# Some ops on the data
loss = tf.losses.mean_squared_error(batch_x, batch_y)
valid_loss, valid_loss_update = tf.metrics.mean(loss)
with tf.name_scope('loss'):
train_summ = summary_lib.scalar('training', loss)
valid_summ = summary_lib.scalar('valid', valid_loss)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
train_handle, valid_handle = sess.run([train_iter.string_handle(), valid_iter.string_handle()])
sess.run([train_iter.initializer, valid_iter.initializer])
writer_train = tf.summary.FileWriter(os.path.join(summary_loc, 'train'), sess.graph)
writer_valid = tf.summary.FileWriter(os.path.join(summary_loc, 'valid'), sess.graph)
layout_summary = summary_lib.custom_scalar_pb(
layout_pb2.Layout(category=[
layout_pb2.Category(
title='losses',
chart=[
layout_pb2.Chart(
title='losses',
multiline=layout_pb2.MultilineChartContent(tag=[
'loss/training', 'loss/valid'
]))
])
]))
writer_train.add_summary(layout_summary)
global_step = 0
for i in range(n_epochs):
for j in range(n_training_batches): # "Training"
global_step += 1
summ = sess.run(train_summ, feed_dict={handle: train_handle})
writer_train.add_summary(summary=summ, global_step=global_step)
sess.run(tf.local_variables_initializer())
for j in range(n_valid_batches): # "Validation"
_, batch_summ = sess.run([valid_loss_update, train_summ], feed_dict={handle: valid_handle})
summ = sess.run(valid_summ)
writer_valid.add_summary(summary=summ, global_step=global_step)
Here's the resulting output in Tensorboard.

Restoring a Tensorflow model that uses Iterators

I have a model that's trains my network using an Iterator; following the new Dataset API pipeline model that's now recommended by Google.
I read tfrecord files, feed data to the network, train nicely, and all is going well, I save my model in the end of the training so I can run Inference on it later. A simplified version of the code is as following:
""" Training and saving """
training_dataset = tf.contrib.data.TFRecordDataset(training_record)
training_dataset = training_dataset.map(ds._path_records_parser)
training_dataset = training_dataset.batch(BATCH_SIZE)
with tf.name_scope("iterators"):
training_iterator = Iterator.from_structure(training_dataset.output_types, training_dataset.output_shapes)
next_training_element = training_iterator.get_next()
training_init_op = training_iterator.make_initializer(training_dataset)
def train(num_epochs):
# compute for the number of epochs
for e in range(1, num_epochs+1):
session.run(training_init_op) #initializing iterator here
while True:
try:
images, labels = session.run(next_training_element)
session.run(optimizer, feed_dict={x: images, y_true: labels})
except tf.errors.OutOfRangeError:
saver_name = './saved_models/ucf-model'
print("Finished Training Epoch {}".format(e))
break
""" Restoring """
# restoring the saved model and its variables
session = tf.Session()
saver = tf.train.import_meta_graph(r'saved_models\ucf-model.meta')
saver.restore(session, tf.train.latest_checkpoint('.\saved_models'))
graph = tf.get_default_graph()
# restoring relevant tensors/ops
accuracy = graph.get_tensor_by_name("accuracy/Mean:0") #the tensor that when evaluated returns the mean accuracy of the batch
testing_iterator = graph.get_operation_by_name("iterators/Iterator") #my iterator used in testing.
next_testing_element = graph.get_operation_by_name("iterators/IteratorGetNext") #the GetNext operator for my iterator
# loading my testing set tfrecords
testing_dataset = tf.contrib.data.TFRecordDataset(testing_record_path)
testing_dataset = testing_dataset.map(ds._path_records_parser, num_threads=4, output_buffer_size=BATCH_SIZE*20)
testing_dataset = testing_dataset.batch(BATCH_SIZE)
testing_init_op = testing_iterator.make_initializer(testing_dataset) #to initialize the dataset
with tf.Session() as session:
session.run(testing_init_op)
while True:
try:
images, labels = session.run(next_testing_element)
accuracy = session.run(accuracy, feed_dict={x: test_images, y_true: test_labels}) #error here, x, y_true not defined
except tf.errors.OutOfRangeError:
break
My problem is mainly when I restore the model. How to feed testing data to the network?
When I restore my Iterator using testing_iterator = graph.get_operation_by_name("iterators/Iterator"), next_testing_element = graph.get_operation_by_name("iterators/IteratorGetNext"), I get the following error:
GetNext() failed because the iterator has not been initialized. Ensure that you have run the initializer operation for this iterator before getting the next element.
So I did try to initialize my dataset using: testing_init_op = testing_iterator.make_initializer(testing_dataset)). I got this error: AttributeError: 'Operation' object has no attribute 'make_initializer'
Another issue is, since an iterator is being used, there's no need to use placeholders in the training_model, as an iterator feed data directly to the graph. But this way, how to restore my feed_dict keys in the 3rd to last line, when I feed data to the "accuracy" op?
EDIT: if someone could suggest a way to add placeholders between the Iterator and the network input, then I could try running the graph by evaluating the "accuracy" tensor while feeding data to the placeholders and ignoring the iterator altogether.
When restoring a saved meta graph, you can restore the initialization operation with name and then use it again to initialize the input pipeline for inference.
That is, when creating the graph, you can do
dataset_init_op = iterator.make_initializer(dataset, name='dataset_init')
And then restore this operation by doing:
dataset_init_op = graph.get_operation_by_name('dataset_init')
Here is a self contained code snippet that compares results of a randomly initialized model before and after restoring.
Saving an Iterator
np.random.seed(42)
data = np.random.random([4, 4])
X = tf.placeholder(dtype=tf.float32, shape=[4, 4], name='X')
dataset = tf.data.Dataset.from_tensor_slices(X)
iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes)
dataset_next_op = iterator.get_next()
# name the operation
dataset_init_op = iterator.make_initializer(dataset, name='dataset_init')
w = np.random.random([1, 4])
W = tf.Variable(w, name='W', dtype=tf.float32)
output = tf.multiply(W, dataset_next_op, name='output')
sess = tf.Session()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())
sess.run(dataset_init_op, feed_dict={X:data})
while True:
try:
print(sess.run(output))
except tf.errors.OutOfRangeError:
saver.save(sess, 'tmp/', global_step=1002)
break
And then you can restore the same model for inference as follows:
Restoring saved iterator
np.random.seed(42)
data = np.random.random([4, 4])
tf.reset_default_graph()
sess = tf.Session()
saver = tf.train.import_meta_graph('tmp/-1002.meta')
ckpt = tf.train.get_checkpoint_state(os.path.dirname('tmp/checkpoint'))
saver.restore(sess, ckpt.model_checkpoint_path)
graph = tf.get_default_graph()
# Restore the init operation
dataset_init_op = graph.get_operation_by_name('dataset_init')
X = graph.get_tensor_by_name('X:0')
output = graph.get_tensor_by_name('output:0')
sess.run(dataset_init_op, feed_dict={X:data})
while True:
try:
print(sess.run(output))
except tf.errors.OutOfRangeError:
break
I would suggest to use tf.contrib.data.make_saveable_from_iterator, which has been designed precisely for this purpose. It is much less verbose and does not require you to change existing code, in particular how you define your iterator.
Working example, when we save everything after step 5 has completed. Note how I don't even bother knowing what seed is used.
import tensorflow as tf
iterator = (
tf.data.Dataset.range(100)
.shuffle(10)
.make_one_shot_iterator())
batch = iterator.get_next(name='batch')
saveable_obj = tf.contrib.data.make_saveable_from_iterator(iterator)
tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, saveable_obj)
saver = tf.train.Saver()
with tf.Session() as sess:
tf.global_variables_initializer().run()
for step in range(10):
print('{}: {}'.format(step, sess.run(batch)))
if step == 5:
saver.save(sess, './foo', global_step=step)
# 0: 1
# 1: 6
# 2: 7
# 3: 3
# 4: 8
# 5: 10
# 6: 12
# 7: 14
# 8: 5
# 9: 17
Then later, if we resume from step 6, we get the same output.
import tensorflow as tf
saver = tf.train.import_meta_graph('./foo-5.meta')
with tf.Session() as sess:
saver.restore(sess, './foo-5')
for step in range(6, 10):
print('{}: {}'.format(step, sess.run('batch:0')))
# 6: 12
# 7: 14
# 8: 5
# 9: 17
I couldn't solve the problem related to initializing the iterator, but since I pre-process my dataset using map method, and I apply transformations defined by Python operations wrapped with py_func, which cannot be serialized for storing\restoring, I'll have to initialize my dataset when I want to restore it anyway.
So, the problem that remains is how to feed data to my graph when I restore it. I placed a tf.identity node between the iterator output and my network input. Upon restoring, I feed my data to the identity node. A better solution that I discovered later is using placeholder_with_default(), as described in this answer.
I would suggest having a look at CheckpointInputPipelineHook CheckpointInputPipelineHook, which implements saving iterator state for further training with tf.Estimator.

Adapting Tensorflow CIFAR10 model for own data

I am just starting with tensorflow and I thought a good first step would be to adapt CIFAR10 model for my own use. My database are not images but signals and a whole database has a shape of [16400,3000,1,1] (dimensionwise: number of all samples, height, width and number of channels added on purpose). I am already working on this problem with MatConvNet toolbox, so this question is strictly about tensorflow machnism. The database is a ready numpy tensor of the size above, in the code below is my attempt to prepare the data to be readable for the training script
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
import numpy as np
IMAGE_SIZE = 3000
data = np.load('/home/tensorflow-master/tensorflow/models/image/cifar10 /konsensop/data.npy')
labels = np.load('/home/tensorflow-master/tensorflow/models/image/cifar10/konsensop/labels.npy')
labels = labels-1
labels = labels.astype(int)
data = tf.cast(data,tf.float32)
labels = tf.cast(labels,tf.int64)
NUM_CLASSES = 2
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 10000
NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 6400
def _generate_image_and_label_batch(data_sample, label, in_queue_examples,
batch_size, shuffle):
num_preprocess_threads = 16
if shuffle:
data, label_batch = tf.train.shuffle_batch(
[data_sample, label],
batch_size=batch_size,
num_threads=num_preprocess_threads,
capacity=min_queue_examples + batch_size,
min_after_dequeue=min_queue_examples)
else:
data, label_batch = tf.train.batch(
[data_sample, label],
batch_size=batch_size,
num_threads=num_preprocess_threads,
capacity=min_queue_examples + batch_size)
return data, tf.reshape(label_batch, [batch_size])
def inputs(data,labels, batch_size):
for i in xrange(0, data.shape[0]/batch_size):
data_sample = data[i,:,:,:]
label = labels[i,0]
height = 3000
width = 1
min_fraction_of_examples_in_queue = 0.4
min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN*
min_fraction_of_examples_in_queue)
print('Filling queue with %d data before starting to train' % min_queue_examples)
return _generate_image_and_label_batch(data_sample, label,
min_queue_examples, batch_size,
shuffle=True)
I'm trying to load the data I aleady have and generate batches in a way cifar10 model did, but when running the trainer code I get an error indata,labels = konsensop_input.inputs(data,labels,batch_size) UnboundcocalError: local variable 'data' referenced before assigment
data = konsensop_input.data
labels = konsensop_input.labels
def train():
with tf.Graph().as_default():
global_step = tf.Variable(0, trainable = False)
data, labels = konsensop_input.inputs(data, labels, batch_size)
logits = konsensop_train.inference(data)
# calculate loss
loss = konsensop.loss(logits, labels)
train_op = konsensop.train(loss, global_step)
# create a saver
saver = tf.train.Saver(tf.all_variables()) #saves all variables in a graph
# build the summary operation based on the TF collection of summaries
summary_op = tf.merge_all_summaries()
# build an initialization operation to run below
init = tf.initialize_all_variables()
# start running operations on the graph
sess = tf.Session(config = tf.ConfigProto(log_device_placement=False))
sess.run(init)
# start the queue runners
tf.train.start_queue_runners(sess = sess) #co to i po co to"""
summary_writer = tf.train.SummaryWriter( FLAGS.train_dir, sess.graph)
for step in xrange(FLAGS.max_step):
start_time = time.time()
_, loss_value = sess.run([train_op, loss])
duration = time.time() - start_time
assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
if step % 10 == 0:
num_examples_per_step = FLAGS.batch_size
examples_per_sec = num_examples_per_step / duration
sec_per_batch = float(duration)
format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
print ( format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))
if step % 100 == 0:
summary_str = sess.run(summary_op)
summary_writer.add_summary(summary_str, step)
if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
saver.save(sess, checkpoint_path, global_step = step)
def main(argv=None):
train()
if __name__=='__main__':
tf.app.run()
I would like to figure out how to implement a reasonable data feeding technique here
For the relatively small data set you want to work with, you might consider just loading it into a big numpy array, then iterating over it in mini-batches, which you feed to the computation graph via tf.placeholders and the feed_dict mechanism.
The mini-batch iteration could look something like this (you should probably add random shuffling after each epoch):
def iterate_batches(X, y, batch_size, num_epochs):
N = np.size(X, 0)
batches_per_epoch = N/float(batch_size)
for i in range(num_epochs):
for j in range(batches_per_epoch):
start, stop = j*batch_size, (j+1)*batch_size
yield X[start:stop, :], y[start:stop]
(If you are not familiar with Python's yield mechanism, google for Python generators. There a lots of good introductions on the web.)
Given that you have a mechanism to load the whole data set into a numpy array X_train, y_train, you can then write your training loop like this
train_op = ...
for X, y in iterate_batches(X_train, y_train, you_batch_size, your_num_epochs):
sess.run([train_op], feed_dict={X_tensor: X, y_tensor: y}
Here, X_tensor and y_tensor are tf.placeholders for the data, that you have to specify in your network architecture.

How to *actually* read CSV data in TensorFlow?

I'm relatively new to the world of TensorFlow, and pretty perplexed by how you'd actually read CSV data into a usable example/label tensors in TensorFlow. The example from the TensorFlow tutorial on reading CSV data is pretty fragmented and only gets you part of the way to being able to train on CSV data.
Here's my code that I've pieced together, based off that CSV tutorial:
from __future__ import print_function
import tensorflow as tf
def file_len(fname):
with open(fname) as f:
for i, l in enumerate(f):
pass
return i + 1
filename = "csv_test_data.csv"
# setup text reader
file_length = file_len(filename)
filename_queue = tf.train.string_input_producer([filename])
reader = tf.TextLineReader(skip_header_lines=1)
_, csv_row = reader.read(filename_queue)
# setup CSV decoding
record_defaults = [[0],[0],[0],[0],[0]]
col1,col2,col3,col4,col5 = tf.decode_csv(csv_row, record_defaults=record_defaults)
# turn features back into a tensor
features = tf.stack([col1,col2,col3,col4])
print("loading, " + str(file_length) + " line(s)\n")
with tf.Session() as sess:
tf.initialize_all_variables().run()
# start populating filename queue
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
for i in range(file_length):
# retrieve a single instance
example, label = sess.run([features, col5])
print(example, label)
coord.request_stop()
coord.join(threads)
print("\ndone loading")
And here is an brief example from the CSV file I'm loading - pretty basic data - 4 feature columns, and 1 label column:
0,0,0,0,0
0,15,0,0,0
0,30,0,0,0
0,45,0,0,0
All the code above does is print each example from the CSV file, one by one, which, while nice, is pretty darn useless for training.
What I'm struggling with here is how you'd actually turn those individual examples, loaded one-by-one, into a training dataset. For example, here's a notebook I was working on in the Udacity Deep Learning course. I basically want to take the CSV data I'm loading, and plop it into something like train_dataset and train_labels:
def reformat(dataset, labels):
dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
# Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
I've tried using tf.train.shuffle_batch, like this, but it just inexplicably hangs:
for i in range(file_length):
# retrieve a single instance
example, label = sess.run([features, colRelevant])
example_batch, label_batch = tf.train.shuffle_batch([example, label], batch_size=file_length, capacity=file_length, min_after_dequeue=10000)
print(example, label)
So to sum up, here are my questions:
What am I missing about this process?
It feels like there is some key intuition that I'm missing about how to properly build an input pipeline.
Is there a way to avoid having to know the length of the CSV file?
It feels pretty inelegant to have to know the number of lines you want to process (the for i in range(file_length) line of code above)
Edit:
As soon as Yaroslav pointed out that I was likely mixing up imperative and graph-construction parts here, it started to become clearer. I was able to pull together the following code, which I think is closer to what would typically done when training a model from CSV (excluding any model training code):
from __future__ import print_function
import numpy as np
import tensorflow as tf
import math as math
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('dataset')
args = parser.parse_args()
def file_len(fname):
with open(fname) as f:
for i, l in enumerate(f):
pass
return i + 1
def read_from_csv(filename_queue):
reader = tf.TextLineReader(skip_header_lines=1)
_, csv_row = reader.read(filename_queue)
record_defaults = [[0],[0],[0],[0],[0]]
colHour,colQuarter,colAction,colUser,colLabel = tf.decode_csv(csv_row, record_defaults=record_defaults)
features = tf.stack([colHour,colQuarter,colAction,colUser])
label = tf.stack([colLabel])
return features, label
def input_pipeline(batch_size, num_epochs=None):
filename_queue = tf.train.string_input_producer([args.dataset], num_epochs=num_epochs, shuffle=True)
example, label = read_from_csv(filename_queue)
min_after_dequeue = 10000
capacity = min_after_dequeue + 3 * batch_size
example_batch, label_batch = tf.train.shuffle_batch(
[example, label], batch_size=batch_size, capacity=capacity,
min_after_dequeue=min_after_dequeue)
return example_batch, label_batch
file_length = file_len(args.dataset) - 1
examples, labels = input_pipeline(file_length, 1)
with tf.Session() as sess:
tf.initialize_all_variables().run()
# start populating filename queue
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
try:
while not coord.should_stop():
example_batch, label_batch = sess.run([examples, labels])
print(example_batch)
except tf.errors.OutOfRangeError:
print('Done training, epoch reached')
finally:
coord.request_stop()
coord.join(threads)
I think you are mixing up imperative and graph-construction parts here. The operation tf.train.shuffle_batch creates a new queue node, and a single node can be used to process the entire dataset. So I think you are hanging because you created a bunch of shuffle_batch queues in your for loop and didn't start queue runners for them.
Normal input pipeline usage looks like this:
Add nodes like shuffle_batch to input pipeline
(optional, to prevent unintentional graph modification) finalize graph
--- end of graph construction, beginning of imperative programming --
tf.start_queue_runners
while(True): session.run()
To be more scalable (to avoid Python GIL), you could generate all of your data using TensorFlow pipeline. However, if performance is not critical, you can hook up a numpy array to an input pipeline by using slice_input_producer. Here's an example with some Print nodes to see what's going on (messages in Print go to stdout when node is run)
tf.reset_default_graph()
num_examples = 5
num_features = 2
data = np.reshape(np.arange(num_examples*num_features), (num_examples, num_features))
print data
(data_node,) = tf.slice_input_producer([tf.constant(data)], num_epochs=1, shuffle=False)
data_node_debug = tf.Print(data_node, [data_node], "Dequeueing from data_node ")
data_batch = tf.batch([data_node_debug], batch_size=2)
data_batch_debug = tf.Print(data_batch, [data_batch], "Dequeueing from data_batch ")
sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())
tf.get_default_graph().finalize()
tf.start_queue_runners()
try:
while True:
print sess.run(data_batch_debug)
except tf.errors.OutOfRangeError as e:
print "No more inputs."
You should see something like this
[[0 1]
[2 3]
[4 5]
[6 7]
[8 9]]
[[0 1]
[2 3]]
[[4 5]
[6 7]]
No more inputs.
The "8, 9" numbers didn't fill up the full batch, so they didn't get produced. Also tf.Print are printed to sys.stdout, so they show up in separately in Terminal for me.
PS: a minimal of connecting batch to a manually initialized queue is in github issue 2193
Also, for debugging purposes you might want to set timeout on your session so that your IPython notebook doesn't hang on empty queue dequeues. I use this helper function for my sessions
def create_session():
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.per_process_gpu_memory_fraction=0.3 # don't hog all vRAM
config.operation_timeout_in_ms=60000 # terminate on long hangs
# create interactive session to register a default session
sess = tf.InteractiveSession("", config=config)
return sess
Scalability Notes:
tf.constant inlines copy of your data into the Graph. There's a fundamental limit of 2GB on size of Graph definition so that's an upper limit on size of data
You could get around that limit by using v=tf.Variable and saving the data into there by running v.assign_op with a tf.placeholder on right-hand side and feeding numpy array to the placeholder (feed_dict)
That still creates two copies of data, so to save memory you could make your own version of slice_input_producer which operates on numpy arrays, and uploads rows one at a time using feed_dict
Or you could try this, the code loads the Iris dataset into tensorflow using pandas and numpy and a simple one neuron output is printed in the session. Hope it helps for a basic understanding.... [ I havent added the way of one hot decoding labels].
import tensorflow as tf
import numpy
import pandas as pd
df=pd.read_csv('/home/nagarjun/Desktop/Iris.csv',usecols = [0,1,2,3,4],skiprows = [0],header=None)
d = df.values
l = pd.read_csv('/home/nagarjun/Desktop/Iris.csv',usecols = [5] ,header=None)
labels = l.values
data = numpy.float32(d)
labels = numpy.array(l,'str')
#print data, labels
#tensorflow
x = tf.placeholder(tf.float32,shape=(150,5))
x = data
w = tf.random_normal([100,150],mean=0.0, stddev=1.0, dtype=tf.float32)
y = tf.nn.softmax(tf.matmul(w,x))
with tf.Session() as sess:
print sess.run(y)
You can use latest tf.data API :
dataset = tf.contrib.data.make_csv_dataset(filepath)
iterator = dataset.make_initializable_iterator()
columns = iterator.get_next()
with tf.Session() as sess:
sess.run([iteator.initializer])
If anyone came here searching for a simple way to read absolutely large and sharded CSV files in tf.estimator API then , please see below my code
CSV_COLUMNS = ['ID','text','class']
LABEL_COLUMN = 'class'
DEFAULTS = [['x'],['no'],[0]] #Default values
def read_dataset(filename, mode, batch_size = 512):
def _input_fn(v_test=False):
# def decode_csv(value_column):
# columns = tf.decode_csv(value_column, record_defaults = DEFAULTS)
# features = dict(zip(CSV_COLUMNS, columns))
# label = features.pop(LABEL_COLUMN)
# return add_engineered(features), label
# Create list of files that match pattern
file_list = tf.gfile.Glob(filename)
# Create dataset from file list
#dataset = tf.data.TextLineDataset(file_list).map(decode_csv)
dataset = tf.contrib.data.make_csv_dataset(file_list,
batch_size=batch_size,
column_names=CSV_COLUMNS,
column_defaults=DEFAULTS,
label_name=LABEL_COLUMN)
if mode == tf.estimator.ModeKeys.TRAIN:
num_epochs = None # indefinitely
dataset = dataset.shuffle(buffer_size = 10 * batch_size)
else:
num_epochs = 1 # end-of-input after this
batch_features, batch_labels = dataset.make_one_shot_iterator().get_next()
#Begins - Uncomment for testing only -----------------------------------------------------<
if v_test == True:
with tf.Session() as sess:
print(sess.run(batch_features))
#End - Uncomment for testing only -----------------------------------------------------<
return add_engineered(batch_features), batch_labels
return _input_fn
Example usage in TF.estimator:
train_spec = tf.estimator.TrainSpec(input_fn = read_dataset(
filename = train_file,
mode = tf.estimator.ModeKeys.TRAIN,
batch_size = 128),
max_steps = num_train_steps)
2.0 Compatible Solution: This Answer might be provided by others in the above thread but I will provide additional links which will help the community.
dataset = tf.data.experimental.make_csv_dataset(
file_path,
batch_size=5, # Artificially small to make examples easier to show.
label_name=LABEL_COLUMN,
na_value="?",
num_epochs=1,
ignore_errors=True,
**kwargs)
For more information, please refer this Tensorflow Tutorial.

Categories