I'm having trouble training a seq2seq model using Tensorflow on an Nvidia P100 GPU.
Here are the versions I'm using: TensorFlow 1.10.0, Keras 2.2.2, Python 3.6.3, CUDA 9.2.148.1, cuDNN 7.2.1
I currently get an OOM error well in the middle of training (18 minutes).
I've been doing a little digging and tried setting allow_growth = True (flag not set in the code below) but did not manage to see any memory grow, it all gets allocated at the start.
I also tried setting the graph to read only with tf.finalize() but the program still runs, which suggests no nodes are being added or that if haven't placed the function in the correct place in the code.
Since the graph trains and can be saved and all, it doesn't seem to be too large at the start.
Here are some of the hyper parameters I'm using:
batch_size = 10
rnn_size = 1024
src/tgt_vocab_size = 8000
epochs = 20
display_steps = 10
The dataset used are docstrings and the associated function code, so not incredibly long.
One of my original thoughts was that since the size of the sentences is dynamic, one really long one could be too big. But I shuffled the dataset to see if the crash happened at a different time and it's still at 18 minutes with the same parameters.
Here is the code including the graphs and the training/testing loop.
def train(...):
...
...
def source_generator():
for el in train_source_sents:
yield el
def target_generator():
for el in train_target_sents:
yield el
train_graph = tf.Graph()
with train_graph.as_default():
# Dataset and batch preparation
with tf.name_scope("Dataset-prep"):
source_dataset = tf.data.Dataset.from_generator(source_generator,output_types= tf.int32,output_shapes=(tf.TensorShape([None]))) # Converting sentence array to dataset
target_dataset = tf.data.Dataset.from_generator(target_generator,output_types= tf.int32,output_shapes=(tf.TensorShape([None]))) # Converting sentence array to dataset
target_dataset = target_dataset.map(lambda x: tf.concat([x, [target_tok2ID['<EOS>']]], 0)) # Adding <EOS> character to the endo of all the target sentences
target_input_dataset = target_dataset.map(lambda x: tf.concat([[target_tok2ID['<GO>']], x], 0)) # Creating training inputs for the decoder, This requires adding <GO> to the start of the sequence
target_sequence_length = target_dataset.map(lambda x: tf.shape(x)[0]) # Adding the sizes of all the sequences to be paired up with the reset of the dataset
dataset = tf.data.Dataset.zip((source_dataset, target_dataset, target_input_dataset, target_sequence_length)) # create the collection of all the individual datasets
dataset = dataset.shuffle(buffer_size=10000)
with tf.name_scope("Dataset-batching"):
dataset = dataset.repeat(epochs)
pad_id = target_tok2ID['<PAD>']
batched_dataset = dataset.padded_batch(batch_size, padded_shapes=([None], [None],[None], []), padding_values=(pad_id,pad_id,pad_id,pad_id))
batched_dataset = batched_dataset.prefetch(buffer_size=batch_size) # could be removed, perhaps yeilds improvements
iterator = batched_dataset.make_one_shot_iterator()
source_batch, target_batch, target_input_batch, batch_target_sequence_length = iterator.get_next()
with tf.name_scope("Encoding-layer"):
source_vocab_size = len(source_tok2ID)
embed = tf.contrib.layers.embed_sequence(source_batch, vocab_size=source_vocab_size, embed_dim=encoding_embedding_size)
stacked_cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size), keep_prob) for _ in range(num_layers)])
outputs, encoder_state = tf.nn.dynamic_rnn(stacked_cells, embed, dtype=tf.float32)
with tf.name_scope("Decoding-layer"):
target_vocab_size = len(target_tok2ID)
dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, target_input_batch)
cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(num_layers)])
output_layer = tf.layers.Dense(target_vocab_size)
dec_cell = tf.contrib.rnn.DropoutWrapper(cells, output_keep_prob=keep_prob)
helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input, batch_target_sequence_length)
decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, helper, encoder_state, output_layer)
max_target_sequence_length = tf.reduce_max(batch_target_sequence_length, axis=0)
dec_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, impute_finished=True, maximum_iterations=max_target_sequence_length)
dec_outputs = tf.identity(dec_outputs.rnn_output, name='logits') # This step might seem a little misterious, but it is to take the output from the dynamic decoder and pass it to a tensor from (Dodumentation is scarce here)
masks = tf.sequence_mask(batch_target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')
cost = tf.contrib.seq2seq.sequence_loss( dec_outputs, target_batch, masks)
optimizer = tf.train.AdamOptimizer(learning_rate)
gradients = optimizer.compute_gradients(cost)
capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
train_op = optimizer.apply_gradients(capped_gradients)
tf.summary.scalar('cost', cost)
merged = tf.summary.merge_all()
train_saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
train_sess = tf.Session(graph=train_graph)
if(model_path):
print(model_path)
train_saver.restore(train_sess, model_path)
else:
train_sess.run(init_op)
store_path = "./visualisations/"
writer = tf.summary.FileWriter(store_path, train_sess.graph)
step = 0
start_time = datetime.now()
while True:
try:
step += 1
model_cost, _, summary = train_sess.run((cost, train_op, merged))
writer.add_summary(summary,step)
if(step % display_step == 0):
save_path = train_saver.save(train_sess, "./checkpoints/NMT")
print("Model saved in path: %s" % save_path)
test_sess = tf.Session(graph=test_graph)
test_saver.restore(test_sess, save_path)
# print(test_sess.run(foo))
scores = []
while True:
try:
predictions, refrence = test_sess.run([dec_predictions, test_target_batch])
for i in range(len(refrence)):
BLEU_score = nltk.translate.bleu_score.sentence_bleu([np.trim_zeros(refrence[i])], np.trim_zeros(predictions[i]), weights = (1,0))
scores.append(BLEU_score)
print("########################################")
print("ref:", list(map(lambda x: target_ID2tok[x], np.trim_zeros(refrence[i]))))
print("")
print("pred:", list(map(lambda x: target_ID2tok[x], np.trim_zeros(predictions[i]))))
except tf.errors.OutOfRangeError:
print("Exhausted test data")
break
delta_time = datetime.now() - start_time
total_exp_time = delta_time * (total_steps / step)
remaining_time = total_exp_time - delta_time
print("")
print("Test set BLEU Score:", np.mean(scores))
print("Model cost:" ,model_cost)
print("Step {} from {}".format(step, total_steps))
print("Current time:", datetime.now())
print("Total Experiment time (Hours:Minutes:Seconds):", str(total_exp_time))
print("Time Elapled (Hours:Minutes:Seconds):", str(delta_time))
print("Time remaining (Hours:Minutes:Seconds):", str(remaining_time))
print("")
except tf.errors.OutOfRangeError:
print("Model finished training")
break
return save_path
Here is an output of a training run:
command line output
Is there something wrong with the way I'm executing the graph, or am I repeating some step leading to the memory fill up?
Thanks for all your help!
Related
I am trying to use the tf.Data API to accelerate my code and prevent GPU data starvation but there is one thing that is stopping me for being comfortable with it and it's the ability to use the same batch when calling the training op multiple times.
Suppose I have my dataset set up as
dataset = tf.data.TextLineDataset("textfile.txt")
dataset = dataset.shuffle(dataset_size)
dataset = dataset.padded_batch(batch_size, ...)
dataset = dataset.repeat()
iterator = dataset.make_one_shot_iterator()
x_batch = iterator.get_next()
loss1 = someFunctionOf(x_batch)
loss2 = someOtherFunctionOf(x_batch)
train_op1 = someOptimizerOf(loss1)
train_op2 = someOtherOptimizerOf(loss2)
but now whenever I call train_op1, iterator.get_next() is called and so when calling train_op2, I am training on the next batch.
From this question, I am aware that I can use a combination of flat_map and repeat(n) where n is the number of times I wanna repeat the same batch but this n would depend on the number of train_ops that I call which I have to count manually. Also, I need these two train_ops because they optimize different parts of my graph.
Thank you for your help!
Try the code below. It creates a copy of input and target so hopefully they will not change when you switch optimizer/loss_op. They are persistent between sess.run calls as long as you do not pass is_new:True flag.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
def ds_train(batch_size, num_epochs):
ds = (tf.data.Dataset.from_tensor_slices(([1.0,2.0,3.0,4.0,5.0], [-1,-2,-3,-4,-5]))
.batch(batch_size)
.repeat(num_epochs)
)
return ds
batch_size = 1
input_size = 1
num_epochs = 2
with tf.variable_scope("dataset"):
ds_t = ds_train(batch_size, num_epochs)
with tf.variable_scope("iterator"):
iterator_t = ds_t.make_initializable_iterator()
iterator_handle = tf.placeholder(tf.string, shape=[], name="iterator_handle")
iterator = tf.data.Iterator.from_string_handle(iterator_handle,
iterator_t.output_types,
iterator_t.output_shapes)
def next_item():
next_elem = iterator.get_next(name="next_element")
x, y = tf.cast(next_elem[0], tf.float32), next_elem[1]# tf.cast(next_elem[1], tf.int32)
return x, y
inputs = tf.Variable(tf.zeros(shape=[batch_size,input_size]), dtype=tf.float32, name="inputs", trainable=False, use_resource=True)
target = tf.Variable(tf.zeros(shape=[batch_size], dtype=tf.int32), dtype=tf.int32, name="target", trainable=False,use_resource=True)
is_new = tf.placeholder_with_default(tf.constant(False), shape=[], name="new_item_flag")
def new_data(batch_size, input_size):
# run the data layer to generate a new batch
next_inputs, next_target = next_item()
next_inputs = tf.reshape(next_inputs, shape=[batch_size, input_size])
with tf.control_dependencies([tf.assign(inputs, next_inputs), tf.assign(target, next_target)]):
return tf.identity(inputs), tf.identity(target)
def old_data():
# just forward the existing batch
return inputs, target
next_inputs, next_target = next_item()
inputs, target = tf.cond(is_new, lambda:new_data(batch_size, input_size), old_data)
with tf.Session() as sess:
sess.run([tf.global_variables_initializer(),tf.local_variables_initializer()])
handle_t = sess.run(iterator_t.string_handle())
sess.run(iterator_t.initializer)
while True:
try:
print(sess.run([inputs, target], feed_dict={iterator_handle:handle_t, is_new: False}))
print(sess.run([inputs, target], feed_dict={iterator_handle:handle_t, is_new: False}))
print(sess.run([inputs, target], feed_dict={iterator_handle:handle_t, is_new: True}))
except tf.errors.OutOfRangeError:
print("End of training dataset.")
break
#!/usr/bin/python
#-*- coding:utf-8 -*-
"""
Created on Tue Jan 2 16:31:45 2018
#author: houlinjie
"""
import tensorflow as tf
from tensorflow.contrib.framework.python.ops.variables import get_or_create_global_step
from tensorflow.python.platform import tf_logging as logging
import inception_preprocessing
from inception_resnet_v2 import inception_resnet_v2, inception_resnet_v2_arg_scope
import os
import time
slim = tf.contrib.slim
import sys
import matplotlib.pyplot as plt
import numpy as np
#================ DATASET INFORMATION ======================
#State dataset directory where the tfrecord files are located
dataset_dir = '.'
#设定Gpu使用量
#config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.7
#State where your log file is at. If it doesn't exist, create it.
log_dir = './log'
#State where your checkpoint file is
checkpoint_file = './inception_resnet_v2_2016_08_30.ckpt'
#State the image size you're resizing your images to. We will use the default inception size of 299.
img_height = 600
img_width = 800
#State the number of classes to predict:
num_classes = 6
#State the labels file and read it
labels_file = './labels.txt'
labels = open(labels_file, 'r')
#Create a dictionary to refer each label to their string name
labels_to_name = {}
for line in labels:
label, string_name = line.split(':')
string_name = string_name[:-1] #Remove newline
labels_to_name[int(label)] = string_name
#Create the file pattern of your TFRecord files so that it could be recognized later on
file_pattern = 'estate_%s_*.tfrecord'
#Create a dictionary that will help people understand your dataset better. This is required by the Dataset class later.
items_to_descriptions = {
'image': 'A 3-channel RGB coloured real estate image that is either bathroom, bedroom, floorplan, kitchen, or livingroom, other.',
'label': 'A label that is as such -- 0:bathroom, 1:bedroom, 2:floorplan, 3:kitchen, 4:livingroom, 5:other'
}
#================= TRAINING INFORMATION ==================
#State the number of epochs to train
num_epochs = 10
#State your batch size
batch_size = 4
#Learning rate information and configuration (Up to you to experiment)
initial_learning_rate = 0.0002
learning_rate_decay_factor = 0.7
num_epochs_before_decay = 2
#============== DATASET LOADING ======================
#We now create a function that creates a Dataset class which will give us many TFRecord files to feed in the examples into a queue in parallel.
def get_split(split_name, dataset_dir, file_pattern=file_pattern, file_pattern_for_counting='estate'):
'''
Obtains the split - training or validation - to create a Dataset class for feeding the examples into a queue later on. This function will
set up the decoder and dataset information all into one Dataset class so that you can avoid the brute work later on.
Your file_pattern is very important in locating the files later.
INPUTS:
- split_name(str): 'train' or 'validation'. Used to get the correct data split of tfrecord files
- dataset_dir(str): the dataset directory where the tfrecord files are located
- file_pattern(str): the file name structure of the tfrecord files in order to get the correct data
- file_pattern_for_counting(str): the string name to identify your tfrecord files for counting
OUTPUTS:
- dataset (Dataset): A Dataset class object where we can read its various components for easier batch creation later.
'''
#First check whether the split_name is train or validation
if split_name not in ['train', 'validation']:
raise ValueError('The split_name %s is not recognized. Please input either train or validation as the split_name' % (split_name))
#Create the full path for a general file_pattern to locate the tfrecord_files
file_pattern_path = os.path.join(dataset_dir, file_pattern % (split_name))
#Count the total number of examples in all of these shard
num_samples = 0
file_pattern_for_counting = file_pattern_for_counting + '_' + split_name
tfrecords_to_count = [os.path.join(dataset_dir, file) for file in os.listdir(dataset_dir) if file.startswith(file_pattern_for_counting)]
for tfrecord_file in tfrecords_to_count:
for record in tf.python_io.tf_record_iterator(tfrecord_file):
num_samples += 1
#Create a reader, which must be a TFRecord reader in this case
reader = tf.TFRecordReader
#Create the keys_to_features dictionary for the decoder
keys_to_features = {
'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
'image/format': tf.FixedLenFeature((), tf.string, default_value='jpg'),
'image/class/label': tf.FixedLenFeature(
[], tf.int64, default_value=tf.zeros([], dtype=tf.int64)),
}
#Create the items_to_handlers dictionary for the decoder.
items_to_handlers = {
'image': slim.tfexample_decoder.Image(),
'label': slim.tfexample_decoder.Tensor('image/class/label'),
}
#Start to create the decoder
decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers)
#Create the labels_to_name file
labels_to_name_dict = labels_to_name
#Actually create the dataset
#dataset 对象定义了数据集的文件位置,解码方式等元信息
dataset = slim.dataset.Dataset(
data_sources = file_pattern_path,
decoder = decoder,
reader = reader,
num_readers = 4,
num_samples = num_samples,
num_classes = num_classes,
labels_to_name = labels_to_name_dict,
items_to_descriptions = items_to_descriptions)
return dataset
def load_batch(dataset, batch_size, height=img_height, width=img_width, is_training=True):
'''
Loads a batch for training.
INPUTS:
- dataset(Dataset): a Dataset class object that is created from the get_split function
- batch_size(int): determines how big of a batch to train
- height(int): the height of the image to resize to during preprocessing
- width(int): the width of the image to resize to during preprocessing
- is_training(bool): to determine whether to perform a training or evaluation preprocessing
OUTPUTS:
- images(Tensor): a Tensor of the shape (batch_size, height, width, channels) that contain one batch of images
- labels(Tensor): the batch's labels with the shape (batch_size,) (requires one_hot_encoding).
'''
#First create the data_provider object
data_provider = slim.dataset_data_provider.DatasetDataProvider(
dataset,
common_queue_capacity = 24 + 3 * batch_size,
common_queue_min = 24)
#Obtain the raw image using the get method
raw_image, label = data_provider.get(['image', 'label'])
#Perform the correct preprocessing for this image depending if it is training or evaluating
image = inception_preprocessing.preprocess_image(raw_image, height, width, is_training)
#As for the raw images, we just do a simple reshape to batch it up
raw_image = tf.expand_dims(raw_image, 0)
raw_image = tf.image.resize_nearest_neighbor(raw_image, [height, width])
raw_image = tf.squeeze(raw_image)
#Batch up the image by enqueing the tensors internally in a FIFO queue and dequeueing many elements with tf.train.batch.
images, raw_images, labels = tf.train.batch(
[image, raw_image, label],
batch_size = batch_size,
num_threads = 4,
capacity = 4 * batch_size,
allow_smaller_final_batch = True)
print("images tensor data type:", tf.shape(images))
return images, raw_images, labels
def train():
#Create the log directory here. Must be done here otherwise import will activate this unneededly.
if not os.path.exists(log_dir):
os.mkdir(log_dir)
#======================= TRAINING PROCESS =========================
#Now we start to construct the graph and build our model
#with tf.Graph().as_default() as graph:
tf.logging.set_verbosity(tf.logging.INFO) #Set the verbosity to INFO level
#First create the dataset and load one batch
dataset = get_split('train', dataset_dir, file_pattern=file_pattern)
images, raw_images, labels = load_batch(dataset, batch_size=batch_size)
print('num_samples:', dataset.num_samples)
#Know the number steps to take before decaying the learning rate and batches per epoch
num_batches_per_epoch = int(dataset.num_samples / batch_size)
num_steps_per_epoch = num_batches_per_epoch #Because one step is one batch processed
decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)
x = tf.placeholder(tf.float32, shape=[None, img_height, img_width, 3], name='x')
y_true = tf.placeholder(tf.int32, shape=[None], name='y_true')
#Create the model inference
with slim.arg_scope(inception_resnet_v2_arg_scope()):
logits, end_points = inception_resnet_v2(x, num_classes = dataset.num_classes, is_training = True)
#Define the scopes that you want to exclude for restoration
exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
variables_to_restore = slim.get_variables_to_restore(exclude = exclude)
#Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
one_hot_labels = slim.one_hot_encoding(y_true, dataset.num_classes)
#Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
loss = tf.losses.softmax_cross_entropy(onehot_labels = one_hot_labels, logits = logits)
total_loss = tf.losses.get_total_loss() #obtain the regularization losses as well
#Create the global step for monitoring the learning_rate and training.
global_step = get_or_create_global_step()
#Define your exponentially decaying learning rate
lr = tf.train.exponential_decay(
learning_rate = initial_learning_rate,
global_step = global_step,
decay_steps = decay_steps,
decay_rate = learning_rate_decay_factor,
staircase = True)
#Now we can define the optimizer that takes on the learning rate
optimizer = tf.train.AdamOptimizer(learning_rate = lr)
#Create the train_op.
train_op = slim.learning.create_train_op(total_loss, optimizer)
#State the metrics that you want to predict. We get a predictions that is not one_hot_encoded.
y_pred = tf.nn.softmax(logits, name='y_pred')
predictions = tf.argmax(end_points['Predictions'], 1)
probabilities = end_points['Predictions']
accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(predictions,y_true)
precision, precision_update = tf.contrib.metrics.streaming_precision(predictions, y_true)
recall, recall_update = tf.contrib.metrics.streaming_recall(predictions, y_true)
#tf.group 返回的值是 ‘op’
metrics_op = tf.group(accuracy_update, probabilities, precision_update, recall_update)
#Now finally create all the summaries you need to monitor and group them into one summary op.
tf.summary.scalar('losses/Total_Loss', total_loss)
tf.summary.scalar('accuracy', accuracy)
tf.summary.scalar('learning_rate', lr)
tf.summary.scalar('precision',precision)
tf.summary.scalar('recall',recall)
my_summary_op = tf.summary.merge_all()
#my_summary_op = tf.summary.merge(tf.get_collection(tf.GraphKeys.SUMMARIES))
#Now we need to create a training step function that runs both the train_op, metrics_op and updates the global_step concurrently.
def train_step(sess, train_op, global_step, img, lab):
'''
Simply runs a session for the three arguments provided and gives a logging on the time elapsed for each global step
'''
#Check the time for each sess run
start_time = time.time()
total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op],feed_dict={x: img.eval(session=sess), y_true: lab.eval(session=sess)})
#total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op])
time_elapsed = time.time() - start_time
#Run the logging to print some results
#if global_step_count % 10 == 0:
logging.info('global step %s: loss: %.4f (%.2f sec/step)', global_step_count, total_loss, time_elapsed)
return total_loss, global_step_count
#Now we create a saver function that actually restores the variables from a checkpoint file in a sess
saver = tf.train.Saver(variables_to_restore)
def restore_fn(sess):
return saver.restore(sess, checkpoint_file)
#Define your supervisor for running a managed session. Do not run the summary_op automatically or else it will consume too much memory
#logdir used for save checkpoint and summary
"""
Supervisor的作用
1.自动去checkpoint 加载数据或初始化数据
2.自身有一个Saver, 可以用来保存checkpoint
3.有一个summary_computed 用来保存Summary
所以我们就不需要:
1.手动初始化或从checkpoint 中加载数据
2.不需要创建Saver, 使用sv内部的就可以
3.不需要创建summary writer
"""
sv = tf.train.Supervisor(logdir = log_dir, summary_op = None, init_fn = restore_fn)
#Run the managed session
#会自动去logdir 中去找checkpoint, 如果没有的话,自动执行初始化
with sv.managed_session() as sess:
for step in xrange(num_steps_per_epoch * num_epochs):
#At the start of every epoch, show the vital information
if step % num_batches_per_epoch == 0:
logging.info('Epoch %s/%s', step/num_batches_per_epoch + 1, num_epochs)
learning_rate_value, accuracy_value = sess.run([lr, accuracy])
logging.info('Current Learning Rate: %s', learning_rate_value)
logging.info('Current Streaming Accuracy: %s', accuracy_value)
# optionally, print your logits and predictions for a sanity check that things are going fine.
logits_value, probabilities_value, predictions_value, labels_value = sess.run([logits, probabilities, predictions, labels], feed_dict={x: images.eval(session=sess), y_true: labels.eval(session=sess)})
print 'logits: \n', logits_value
print 'Probabilities: \n', probabilities_value
print 'predictions: \n', predictions_value
print 'Labels:\n:', labels_value
#Log the summaries every 10 step.
if step % 10 == 0:
loss, _ = train_step(sess, train_op, sv.global_step, images, labels)
summaries = sess.run(my_summary_op, feed_dict={x: images.eval(session=sess), y_true: labels.eval(session=sess)})
sv.summary_computed(sess, summaries)
#If not, simply run the training step
else:
loss, _ = train_step(sess, train_op, sv.global_step, images, labels)
#raw_images, labels, predictions = sess.run([raw_images, labels, predictions], feed_dict={x: images.eval(session=sess), y_true: labels.eval(session=sess)})
#We log the final training loss and accuracy
logging.info('Final Loss: %s', loss)
logging.info('Final Accuracy: %s', sess.run(accuracy))
#Once all the training has been done, save the log files and checkpoint model
logging.info('Finished training! Saving model to disk now.')
#saver.save(sess, "./log/estate_model.ckpt")
sv.saver.save(sess, sv.save_path, global_step = sv.global_step)
if __name__ == '__main__':
train()
The code can run but accuracy is very low,I modify the code from batch load dataset to placeholder method, I convert to the return value that load_batch() function from a tensor to numpy.array, use tensor.eval() method to convert a tensor feed_dict={x: img.eval(session=sess), y_true: lab.eval(session=sess)}),I suspect the below code snippet ,but i can't find out the issue
dataset = get_split('train', dataset_dir, file_pattern=file_pattern)
images, raw_images, labels = load_batch(dataset, batch_size=batch_size)
print('num_samples:', dataset.num_samples)
#Know the number steps to take before decaying the learning rate and batches per epoch
num_batches_per_epoch = int(dataset.num_samples / batch_size)
num_steps_per_epoch = num_batches_per_epoch #Because one step is one batch processed
decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)
x = tf.placeholder(tf.float32, shape=[None, img_height, img_width, 3], name='x')
y_true = tf.placeholder(tf.int32, shape=[None], name='y_true')
#Create the model inference
with slim.arg_scope(inception_resnet_v2_arg_scope()):
logits, end_points = inception_resnet_v2(x, num_classes = dataset.num_classes, is_training = True)
#Define the scopes that you want to exclude for restoration
exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
variables_to_restore = slim.get_variables_to_restore(exclude = exclude)
#Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
one_hot_labels = slim.one_hot_encoding(y_true, dataset.num_classes)
#Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
loss = tf.losses.softmax_cross_entropy(onehot_labels = one_hot_labels, logits = logits)
total_loss = tf.losses.get_total_loss() #obtain the regularization losses as well
I am using the Tensorflow input pipeline via queues.
Therefore, I do:
images, volumes = utils.inputs(FLAGS.train_file_path, FLAGS.batch_size, FLAGS.num_epochs)
v_images, v_volumes = utils.inputs(FLAGS.val_file_path, FLAGS.batch_size)
To read the data.
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
with tf.Session() as sess:
# Start input enqueue threads.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
step = 0
while not coord.should_stop():
_, train_loss = sess.run([train_optimizer, loss])
if step % FLAGS.val_interval == 0:
for val_step in range(0, FLAGS.val_iter):
val_loss = sess.run(v_loss)
How can I make sure that I use the images, volumes and volumes for the training and the v_images, v_volumes for validation?
EDIT
logits = utils.inference(images, FLAGS.stacks, True)
v_logits = utils.inference(v_images, FLAGS.stacks, False)
loss = utils.get_loss(logits, volumes, FLAGS.stacks, 'train')
v_loss = utils.get_loss(v_logits, v_volumes, FLAGS.stacks, 'val')
EDIT 2
summary_train_op = tf.summary.merge_all('train')
summary_val_op = tf.summary.merge_all('val')
with tf.Session() as sess:
summary_writer = tf.summary.FileWriter(FLAGS.train_dir + FLAGS.run, sess.graph)
summary_writer_val = tf.summary.FileWriter(FLAGS.train_dir + FLAGS.run + FLAGS.run_val)
One technique is to build graph nodes individually for training and validation.
Consider "net" is your model graph.
logits_train, volumes = net(images,volumes)
logits_val, v_volumes = net(v_images, v_volumes)
loss_train = some_loss(logits_train,volumes)
train_optimizer = some_optimizer(loss_train)
Now, if you do,
1. sess.run([train_optimizer]) #It uses training data (Dequeues train data)
2. sess.run([logits_val]) #It uses validation data (Dequeues validation data )
I hope this helps.
Update:
_,loss_train_value, logits_train_value = sess.run([train_optimizer, loss_train,logits_train])
Update 2
I am doing everything explicitly as shown below. May be there is a better way to do it.
loss_train_s = tf.summary.scalar("loss_train", loss_train)
acc_train_s = tf.summary.scalar("acc_train", acc_train)
loss_val_s = tf.summary.scalar("loss_val", loss_val)
train_summary_op = tf.summary.merge([loss_train_s,acc_train_s])
val_summary_op = tf.summary.merge([loss_val_s])
I`m just finished to write neural net with tensorflow
attached code :
import tensorflow as tensorFlow
import csv
# read data from csv
file = open('stub.csv')
reader = csv.reader(file)
temp = list(reader)
del temp[0]
# change data from string to float (Tensorflow)
# create data & goal lists
data = []
goal = []
for i in range(len(temp)):
data.append(map(float, temp[i]))
goal.append([data[i][6], 0.0])
del data[i][6]
# change lists to tuple
data = tuple(tuple(x) for x in data)
goal = tuple(goal)
# create training data and test data by 70-30
a = int(len(data) * 0.6) # training set 60%
b = int(len(data) * 0.8) # validation & test: each one is 20%
trainData = data[0:a] # 60%
validationData = data[b: len(data)]
testData = data[a: b] # 20%
trainGoal = goal[0:a]
validationGoal = goal[b:len(data)]
testGoal = goal[a: b]
numberOfLayers = 500
nodesLayer = []
# define the numbers of nodes in hidden layers
for i in range(numberOfLayers):
nodesLayer.append(500)
# define our goal class
classes = 2
batchSize = 2000
# x for input, y for output
sizeOfRow = len(data[0])
x = tensorFlow.placeholder(dtype= tensorFlow.float32, shape=[None, sizeOfRow])
y = tensorFlow.placeholder(dtype= tensorFlow.float32, shape=[None, classes])
hiddenLayers = []
layers = []
def neuralNetworkModel(x):
# first step: (input * weights) + bias, linear operation like y = ax + b
# each layer connection to other layer will represent by nodes(i) * nodes(i+1)
for i in range(0,numberOfLayers):
if i == 0:
hiddenLayers.append({"weights": tensorFlow.Variable(tensorFlow.random_normal([sizeOfRow, nodesLayer[i]])),
"biases": tensorFlow.Variable(tensorFlow.random_normal([nodesLayer[i]]))})
elif i > 0 and i < numberOfLayers-1:
hiddenLayers.append({"weights" : tensorFlow.Variable(tensorFlow.random_normal([nodesLayer[i], nodesLayer[i+1]])),
"biases" : tensorFlow.Variable(tensorFlow.random_normal([nodesLayer[i+1]]))})
else:
outputLayer = {"weights": tensorFlow.Variable(tensorFlow.random_normal([nodesLayer[i], classes])),
"biases": tensorFlow.Variable(tensorFlow.random_normal([classes]))}
# create the layers
for i in range(numberOfLayers):
if i == 0:
layers.append(tensorFlow.add(tensorFlow.matmul(x, hiddenLayers[i]["weights"]), hiddenLayers[i]["biases"]))
layers.append(tensorFlow.nn.relu(layers[i])) # pass values to activation function (i.e sigmoid, softmax) and add it to the layer
elif i >0 and i < numberOfLayers-1:
layers.append(tensorFlow.add(tensorFlow.matmul(layers[i-1], hiddenLayers[i]["weights"]), hiddenLayers[i]["biases"]))
layers.append(tensorFlow.nn.relu(layers[i]))
output = tensorFlow.matmul(layers[numberOfLayers-1], outputLayer["weights"]) + outputLayer["biases"]
return output
def neuralNetworkTrain(data, x, y):
prediction = neuralNetworkModel(x)
# using softmax function, normalize values to range(0,1)
cost = tensorFlow.reduce_mean(tensorFlow.nn.softmax_cross_entropy_with_logits(prediction, y))
# minimize the cost function
# using AdamOptimizer algorithm
optimizer = tensorFlow.train.AdadeltaOptimizer().minimize(cost)
epochs = 2 # feed machine forward + backpropagation = epoch
# build sessions and train the model
with tensorFlow.Session() as sess:
sess.run(tensorFlow.initialize_all_variables())
for epoch in range(epochs):
epochLoss = 0
i = 0
for _ in range(int(len(data) / batchSize)):
ex, ey = nextBatch(i) # takes 500 examples
i += 1
feedDict = {x :ex, y:ey }
_, cos = sess.run([optimizer,cost], feed_dict= feedDict) # start session to optimize the cost function
epochLoss += cos
print("Epoch", epoch + 1, "completed out of", epochs, "loss:", epochLoss)
correct = tensorFlow.equal(tensorFlow.argmax(prediction,1), tensorFlow.argmax(y, 1))
accuracy = tensorFlow.reduce_mean(tensorFlow.cast(correct, "float"))
print("Accuracy:", accuracy.eval({ x: trainData, y:trainGoal}))
# takes 500 examples each iteration
def nextBatch(num):
# Return the next `batch_size` examples from this data set.
# case: using our data & batch size
num *= batchSize
if num < (len(data) - batchSize):
return data[num: num+batchSize], goal[num: num +batchSize]
neuralNetworkTrain(trainData, x, y)
each epoch (iteration) I`ve got the value of loss function and all good.
now I want to try it on my validation/test set.
Someone now what should I do exactly?
Thanks
If you want to get predictions on the trained data you can simply put something like:
tf_p = tf.nn.softmax(prediction)
...In your graph, having loaded your test data into x_test. Then evaluate predictions with:
[p] = session.run([tf_p], feed_dict = {
x : x_test,
y : y_test
}
)
at the end of your neuralNetworkTrain method, and you should end up having them in p.
...Or using tf.train.Saver:
Alternatively you could use tf.train.Saver object to save and restore (and optionally persist) your model. In order to do that you create a saver after you initialise all variables:
...
tf.initialize_all_variables().run()
saver = tf.train.Saver()
...
And then save it once you're done training, at the end of your neuralNetworkTrain method:
...
model_path = saver.save(sess)
You then build a new graph for evaluation, and restore the model before running it on your test data:
# Load test dataset into X_test
...
tf_x = tf.constant(X_test)
tf_p = tf.nn.softmax(neuralNetworkModel(tf_x))
with tf.Session() as session:
tf.initialize_all_variables().run()
saver.restore(session, model_path)
p = tf_p.eval()
And, once again, p should contain softmax activations for your test dataset.
(I haven't actually run this code I'm afraid, but it should give you an idea of how to implement it.)
I am just starting with tensorflow and I thought a good first step would be to adapt CIFAR10 model for my own use. My database are not images but signals and a whole database has a shape of [16400,3000,1,1] (dimensionwise: number of all samples, height, width and number of channels added on purpose). I am already working on this problem with MatConvNet toolbox, so this question is strictly about tensorflow machnism. The database is a ready numpy tensor of the size above, in the code below is my attempt to prepare the data to be readable for the training script
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
import numpy as np
IMAGE_SIZE = 3000
data = np.load('/home/tensorflow-master/tensorflow/models/image/cifar10 /konsensop/data.npy')
labels = np.load('/home/tensorflow-master/tensorflow/models/image/cifar10/konsensop/labels.npy')
labels = labels-1
labels = labels.astype(int)
data = tf.cast(data,tf.float32)
labels = tf.cast(labels,tf.int64)
NUM_CLASSES = 2
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 10000
NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 6400
def _generate_image_and_label_batch(data_sample, label, in_queue_examples,
batch_size, shuffle):
num_preprocess_threads = 16
if shuffle:
data, label_batch = tf.train.shuffle_batch(
[data_sample, label],
batch_size=batch_size,
num_threads=num_preprocess_threads,
capacity=min_queue_examples + batch_size,
min_after_dequeue=min_queue_examples)
else:
data, label_batch = tf.train.batch(
[data_sample, label],
batch_size=batch_size,
num_threads=num_preprocess_threads,
capacity=min_queue_examples + batch_size)
return data, tf.reshape(label_batch, [batch_size])
def inputs(data,labels, batch_size):
for i in xrange(0, data.shape[0]/batch_size):
data_sample = data[i,:,:,:]
label = labels[i,0]
height = 3000
width = 1
min_fraction_of_examples_in_queue = 0.4
min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN*
min_fraction_of_examples_in_queue)
print('Filling queue with %d data before starting to train' % min_queue_examples)
return _generate_image_and_label_batch(data_sample, label,
min_queue_examples, batch_size,
shuffle=True)
I'm trying to load the data I aleady have and generate batches in a way cifar10 model did, but when running the trainer code I get an error indata,labels = konsensop_input.inputs(data,labels,batch_size) UnboundcocalError: local variable 'data' referenced before assigment
data = konsensop_input.data
labels = konsensop_input.labels
def train():
with tf.Graph().as_default():
global_step = tf.Variable(0, trainable = False)
data, labels = konsensop_input.inputs(data, labels, batch_size)
logits = konsensop_train.inference(data)
# calculate loss
loss = konsensop.loss(logits, labels)
train_op = konsensop.train(loss, global_step)
# create a saver
saver = tf.train.Saver(tf.all_variables()) #saves all variables in a graph
# build the summary operation based on the TF collection of summaries
summary_op = tf.merge_all_summaries()
# build an initialization operation to run below
init = tf.initialize_all_variables()
# start running operations on the graph
sess = tf.Session(config = tf.ConfigProto(log_device_placement=False))
sess.run(init)
# start the queue runners
tf.train.start_queue_runners(sess = sess) #co to i po co to"""
summary_writer = tf.train.SummaryWriter( FLAGS.train_dir, sess.graph)
for step in xrange(FLAGS.max_step):
start_time = time.time()
_, loss_value = sess.run([train_op, loss])
duration = time.time() - start_time
assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
if step % 10 == 0:
num_examples_per_step = FLAGS.batch_size
examples_per_sec = num_examples_per_step / duration
sec_per_batch = float(duration)
format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
print ( format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))
if step % 100 == 0:
summary_str = sess.run(summary_op)
summary_writer.add_summary(summary_str, step)
if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
saver.save(sess, checkpoint_path, global_step = step)
def main(argv=None):
train()
if __name__=='__main__':
tf.app.run()
I would like to figure out how to implement a reasonable data feeding technique here
For the relatively small data set you want to work with, you might consider just loading it into a big numpy array, then iterating over it in mini-batches, which you feed to the computation graph via tf.placeholders and the feed_dict mechanism.
The mini-batch iteration could look something like this (you should probably add random shuffling after each epoch):
def iterate_batches(X, y, batch_size, num_epochs):
N = np.size(X, 0)
batches_per_epoch = N/float(batch_size)
for i in range(num_epochs):
for j in range(batches_per_epoch):
start, stop = j*batch_size, (j+1)*batch_size
yield X[start:stop, :], y[start:stop]
(If you are not familiar with Python's yield mechanism, google for Python generators. There a lots of good introductions on the web.)
Given that you have a mechanism to load the whole data set into a numpy array X_train, y_train, you can then write your training loop like this
train_op = ...
for X, y in iterate_batches(X_train, y_train, you_batch_size, your_num_epochs):
sess.run([train_op], feed_dict={X_tensor: X, y_tensor: y}
Here, X_tensor and y_tensor are tf.placeholders for the data, that you have to specify in your network architecture.