I have a network that has so far used feed_dict to inject data into a graph. Every few epochs, I evaluated the training and test loss by feeding a batch from either dataset to my graph.
Now, for performance reasons, I decided to use an input pipeline. Take a look at this dummy example:
import tensorflow as tf
import numpy as np
dataset_size = 200
batch_size = 5
dimension = 4
# create some training dataset
dataset = tf.data.Dataset.\
    from_tensor_slices(np.random.normal(2.0, size=(dataset_size, dimension)).
                       astype(np.float32))
dataset = dataset.batch(batch_size) # take batches
iterator = dataset.make_initializable_iterator()
x = tf.cast(iterator.get_next(),tf.float32)
w = tf.Variable(np.random.normal(size=(1,dimension)).astype(np.float32))
loss_func = lambda x,w: tf.reduce_mean(tf.square(x-w)) # notice that the loss function is a mean!
loss = loss_func(x,w) # this is the loss that will be minimized
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # train one epoch
    sess.run(iterator.initializer)
    for i in range(dataset_size//batch_size):
        # the training step will update the weights based on ONE batch of examples each step
        loss1, _ = sess.run([loss, train_op])
        print('train step {:d}. batch loss {:f}.'.format(i, loss1))
    # I want to print the loss from another dataset (test set) here
Printing the loss of the training data is no problem, but how do I do this for another dataset? When using feed_dict, I simply got a batch from said set and fed it as the value for x.
There are several things you can do for that. One simple option is to have two datasets and two iterators and switch between them with tf.cond. However, the more powerful way is to use an iterator that supports switching directly. See the guide on how to create iterators for a description of the various iterator types. For example, with a reinitializable iterator you could do something like this:
import tensorflow as tf
import numpy as np
dataset_size = 200
dataset_test_size = 20
batch_size = 5
dimension = 4
# create some training dataset
dataset = tf.data.Dataset.\
    from_tensor_slices(np.random.normal(2.0, size=(dataset_size, dimension)).
                       astype(np.float32))
dataset = dataset.batch(batch_size) # take batches
# create some test dataset
dataset_test = tf.data.Dataset.\
    from_tensor_slices(np.random.normal(2.0, size=(dataset_test_size, dimension)).
                       astype(np.float32))
dataset_test = dataset_test.batch(batch_size) # take batches
iterator = tf.data.Iterator.from_structure(dataset.output_types,
                                           dataset.output_shapes)
dataset_init_op = iterator.make_initializer(dataset)
dataset_test_init_op = iterator.make_initializer(dataset_test)
x = tf.cast(iterator.get_next(),tf.float32)
w = tf.Variable(np.random.normal(size=(1,dimension)).astype(np.float32))
loss_func = lambda x,w: tf.reduce_mean(tf.square(x-w)) # notice that the loss function is a mean!
loss = loss_func(x,w) # this is the loss that will be minimized
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # train one epoch
    sess.run(dataset_init_op)
    for i in range(dataset_size//batch_size):
        # the training step will update the weights based on ONE batch of examples each step
        loss1, _ = sess.run([loss, train_op])
        print('train step {:d}. batch loss {:f}.'.format(i, loss1))
    # print test loss
    sess.run(dataset_test_init_op)
    for i in range(dataset_test_size//batch_size):
        loss1 = sess.run(loss)
        print('test step {:d}. batch loss {:f}.'.format(i, loss1))
You can do something similar with a feedable iterator, depending on what you find more convenient. I suppose you could even do it with an initializable iterator, for example by making a boolean dataset that you then map to some data with tf.cond, although that would not be a very natural way to do it.
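For reference, here is a minimal sketch of the feedable-iterator variant (the string handle you feed selects which concrete iterator the graph reads from; this reuses the dataset and dataset_test objects from above):

handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(
    handle, dataset.output_types, dataset.output_shapes)
x = tf.cast(iterator.get_next(), tf.float32)
train_iterator = dataset.make_one_shot_iterator()
test_iterator = dataset_test.make_one_shot_iterator()
with tf.Session() as sess:
    train_handle, test_handle = sess.run(
        [train_iterator.string_handle(), test_iterator.string_handle()])
    sess.run(x, feed_dict={handle: train_handle})  # reads from the training set
    sess.run(x, feed_dict={handle: test_handle})   # reads from the test set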
EDIT:
Here is how you can do it with an initializable iterator, actually in a cleaner way than what I initially had in mind, so maybe you will like this more:
import tensorflow as tf
import numpy as np
dataset_size = 200
dataset_test_size = 20
batch_size = 5
dimension = 4
# create data
data = tf.constant(np.random.normal(2.0,size=(dataset_size,dimension)), tf.float32)
data_test = tf.constant(np.random.normal(2.0,size=(dataset_test_size,dimension)), tf.float32)
# choose data
testing = tf.placeholder_with_default(False, ())
current_data = tf.cond(testing, lambda: data_test, lambda: data)
# create dataset
dataset = tf.data.Dataset.from_tensor_slices(current_data)
dataset = dataset.batch(batch_size)
# create iterator
iterator = dataset.make_initializable_iterator()
x = tf.cast(iterator.get_next(),tf.float32)
w = tf.Variable(np.random.normal(size=(1,dimension)).astype(np.float32))
loss_func = lambda x,w: tf.reduce_mean(tf.square(x-w)) # notice that the loss function is a mean!
loss = loss_func(x,w) # this is the loss that will be minimized
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # train one epoch
    sess.run(iterator.initializer)
    for i in range(dataset_size//batch_size):
        # the training step will update the weights based on ONE batch of examples each step
        loss1, _ = sess.run([loss, train_op])
        print('train step {:d}. batch loss {:f}.'.format(i, loss1))
    # print test loss
    sess.run(iterator.initializer, feed_dict={testing: True})
    for i in range(dataset_test_size//batch_size):
        loss1 = sess.run(loss)
        print('test step {:d}. batch loss {:f}.'.format(i, loss1))
I would like to be able to plot the training loss per batch and the average validation loss for the validation set on the same plot in Tensorboard. I ran into this issue when my validation set was too large to fit into memory, so it required batching and the use of tf.metrics update ops.
This question could apply to any Tensorflow metrics you wanted to appear on the same graph in Tensorboard.
I am able to
plot these two graphs separately (see here)
plot the validation-loss-per-validation-batch on the same graph as the training-loss-per-training-batch (this was OK when the validation set could be a single batch and I could reuse the training summary op train_summ below)
In the example code below, my issue stems from the fact that my validation summary tf.summary.scalar with name=loss gets renamed to loss_1 and is thus moved to a separate graph in Tensorboard. From what I can work out, Tensorboard plots summaries with the same name on the same graph, regardless of which folder they are in. This is frustrating, as train_summ (name=loss) is only ever written to the train folder and valid_summ (name=loss) is only ever written to the valid folder - but it still gets renamed to loss_1.
The example code:
# View graphs with (Linux): $ tensorboard --logdir=/tmp/my_tf_model
import tensorflow as tf
import numpy as np
import os
import tempfile
def train_data_gen():
    yield np.random.normal(size=[3]), np.array([0.5, 0.5, 0.5])
def valid_data_gen():
    yield np.random.normal(size=[3]), np.array([0.8, 0.8, 0.8])
batch_size = 25
n_training_batches = 4
n_valid_batches = 2
n_epochs = 5
summary_loc = os.path.join(tempfile.gettempdir(), 'my_tf_model')
print("Summaries written to" + summary_loc)
# Dummy data
train_data = tf.data.Dataset.from_generator(train_data_gen, (tf.float32, tf.float32)).repeat().batch(batch_size)
valid_data = tf.data.Dataset.from_generator(valid_data_gen, (tf.float32, tf.float32)).repeat().batch(batch_size)
handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(handle,
                                               train_data.output_types, train_data.output_shapes)
batch_x, batch_y = iterator.get_next()
train_iter = train_data.make_initializable_iterator()
valid_iter = valid_data.make_initializable_iterator()
# Some ops on the data
loss = tf.losses.mean_squared_error(batch_x, batch_y)
valid_loss, valid_loss_update = tf.metrics.mean(loss)
# Write to summaries
train_summ = tf.summary.scalar('loss', loss)
valid_summ = tf.summary.scalar('loss', valid_loss) # <- will be renamed to "loss_1"
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    train_handle, valid_handle = sess.run([train_iter.string_handle(), valid_iter.string_handle()])
    sess.run([train_iter.initializer, valid_iter.initializer])
    # Summary writers
    writer_train = tf.summary.FileWriter(os.path.join(summary_loc, 'train'), sess.graph)
    writer_valid = tf.summary.FileWriter(os.path.join(summary_loc, 'valid'), sess.graph)
    global_step = 0 # implicit as no actual training
    for i in range(n_epochs):
        # "Training"
        for j in range(n_training_batches):
            global_step += 1
            summ = sess.run(train_summ, feed_dict={handle: train_handle})
            writer_train.add_summary(summary=summ, global_step=global_step)
        # "Validation"
        sess.run(tf.local_variables_initializer())
        for j in range(n_valid_batches):
            _, batch_summ = sess.run([valid_loss_update, train_summ], feed_dict={handle: valid_handle})
            # The following will plot the batch loss for the validation set on the loss plot with the training data:
            # writer_valid.add_summary(summary=batch_summ, global_step=global_step + j + 1)
        summ = sess.run(valid_summ)
        writer_valid.add_summary(summary=summ, global_step=global_step) # <- I want this on the training loss graph
What I have tried
Separate tf.summary.FileWriter objects (one for training, one for validation), as recommended by this issue and this question (I think what I'm after is alluded to in a comment on that question)
The use of tf.summary.merge to merge all my training and validation/test metrics into overall summary ops; does useful book-keeping but doesn't plot what I want on the same graph
Use of the tf.summary.scalar family argument (loss still gets renamed to loss_1)
(Complete hack solution) Use valid_loss, valid_loss_update = tf.metrics.mean(loss) on the training data and then run tf.local_variables_initializer() every training batch. This does give you the same summary op and thus puts things on the same graph but is surely not how you're meant to do this? It also doesn't generalise to other metrics.
Context
Tensorflow 1.9.0
Tensorboard 1.9.0
Python 3.5.2
The Tensorboard custom_scalar plugin is the way to solve this problem.
Here's the same example again with a custom_scalar to plot the two losses (per training batch + averaged over all validation batches) on the same plot:
# View graphs with (Linux): $ tensorboard --logdir=/tmp/my_tf_model
import os
import tempfile
import tensorflow as tf
import numpy as np
from tensorboard import summary as summary_lib
from tensorboard.plugins.custom_scalar import layout_pb2
def train_data_gen():
    yield np.random.normal(size=[3]), np.array([0.5, 0.5, 0.5])
def valid_data_gen():
    yield np.random.normal(size=[3]), np.array([0.8, 0.8, 0.8])
batch_size = 25
n_training_batches = 4
n_valid_batches = 2
n_epochs = 5
summary_loc = os.path.join(tempfile.gettempdir(), 'my_tf_model')
print("Summaries written to " + summary_loc)
# Dummy data
train_data = tf.data.Dataset.from_generator(
    train_data_gen, (tf.float32, tf.float32)).repeat().batch(batch_size)
valid_data = tf.data.Dataset.from_generator(
    valid_data_gen, (tf.float32, tf.float32)).repeat().batch(batch_size)
handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(handle, train_data.output_types,
                                               train_data.output_shapes)
batch_x, batch_y = iterator.get_next()
train_iter = train_data.make_initializable_iterator()
valid_iter = valid_data.make_initializable_iterator()
# Some ops on the data
loss = tf.losses.mean_squared_error(batch_x, batch_y)
valid_loss, valid_loss_update = tf.metrics.mean(loss)
with tf.name_scope('loss'):
    train_summ = summary_lib.scalar('training', loss)
    valid_summ = summary_lib.scalar('valid', valid_loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    train_handle, valid_handle = sess.run([train_iter.string_handle(), valid_iter.string_handle()])
    sess.run([train_iter.initializer, valid_iter.initializer])
    writer_train = tf.summary.FileWriter(os.path.join(summary_loc, 'train'), sess.graph)
    writer_valid = tf.summary.FileWriter(os.path.join(summary_loc, 'valid'), sess.graph)
    layout_summary = summary_lib.custom_scalar_pb(
        layout_pb2.Layout(category=[
            layout_pb2.Category(
                title='losses',
                chart=[
                    layout_pb2.Chart(
                        title='losses',
                        multiline=layout_pb2.MultilineChartContent(tag=[
                            'loss/training', 'loss/valid'
                        ]))
                ])
        ]))
    writer_train.add_summary(layout_summary)
    global_step = 0
    for i in range(n_epochs):
        for j in range(n_training_batches): # "Training"
            global_step += 1
            summ = sess.run(train_summ, feed_dict={handle: train_handle})
            writer_train.add_summary(summary=summ, global_step=global_step)
        sess.run(tf.local_variables_initializer())
        for j in range(n_valid_batches): # "Validation"
            _, batch_summ = sess.run([valid_loss_update, train_summ], feed_dict={handle: valid_handle})
        summ = sess.run(valid_summ)
        writer_valid.add_summary(summary=summ, global_step=global_step)
Here's the resulting output in Tensorboard.
#!/usr/bin/python
#-*- coding:utf-8 -*-
"""
Created on Tue Jan 2 16:31:45 2018
#author: houlinjie
"""
import tensorflow as tf
from tensorflow.contrib.framework.python.ops.variables import get_or_create_global_step
from tensorflow.python.platform import tf_logging as logging
import inception_preprocessing
from inception_resnet_v2 import inception_resnet_v2, inception_resnet_v2_arg_scope
import os
import time
slim = tf.contrib.slim
import sys
import matplotlib.pyplot as plt
import numpy as np
#================ DATASET INFORMATION ======================
#State dataset directory where the tfrecord files are located
dataset_dir = '.'
#Set the fraction of GPU memory to use
#config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.7
#State where your log file is at. If it doesn't exist, create it.
log_dir = './log'
#State where your checkpoint file is
checkpoint_file = './inception_resnet_v2_2016_08_30.ckpt'
#State the image size you're resizing your images to. We will use the default inception size of 299.
img_height = 600
img_width = 800
#State the number of classes to predict:
num_classes = 6
#State the labels file and read it
labels_file = './labels.txt'
labels = open(labels_file, 'r')
#Create a dictionary to refer each label to their string name
labels_to_name = {}
for line in labels:
    label, string_name = line.split(':')
    string_name = string_name[:-1] #Remove newline
    labels_to_name[int(label)] = string_name
#Create the file pattern of your TFRecord files so that it could be recognized later on
file_pattern = 'estate_%s_*.tfrecord'
#Create a dictionary that will help people understand your dataset better. This is required by the Dataset class later.
items_to_descriptions = {
'image': 'A 3-channel RGB coloured real estate image that is either bathroom, bedroom, floorplan, kitchen, or livingroom, other.',
'label': 'A label that is as such -- 0:bathroom, 1:bedroom, 2:floorplan, 3:kitchen, 4:livingroom, 5:other'
}
#================= TRAINING INFORMATION ==================
#State the number of epochs to train
num_epochs = 10
#State your batch size
batch_size = 4
#Learning rate information and configuration (Up to you to experiment)
initial_learning_rate = 0.0002
learning_rate_decay_factor = 0.7
num_epochs_before_decay = 2
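#Note (added for reference): with staircase=True, tf.train.exponential_decay in train() below
#computes lr = initial_learning_rate * learning_rate_decay_factor ** floor(global_step / decay_steps),
#i.e. 0.0002 for the first two epochs, then 0.0002 * 0.7 = 0.00014, and so on.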
#============== DATASET LOADING ======================
#We now create a function that creates a Dataset class which will give us many TFRecord files to feed in the examples into a queue in parallel.
def get_split(split_name, dataset_dir, file_pattern=file_pattern, file_pattern_for_counting='estate'):
    '''
    Obtains the split - training or validation - to create a Dataset class for feeding the examples into a queue later on. This function will
    set up the decoder and dataset information all into one Dataset class so that you can avoid the brute work later on.
    Your file_pattern is very important in locating the files later.
    INPUTS:
    - split_name(str): 'train' or 'validation'. Used to get the correct data split of tfrecord files
    - dataset_dir(str): the dataset directory where the tfrecord files are located
    - file_pattern(str): the file name structure of the tfrecord files in order to get the correct data
    - file_pattern_for_counting(str): the string name to identify your tfrecord files for counting
    OUTPUTS:
    - dataset (Dataset): A Dataset class object where we can read its various components for easier batch creation later.
    '''
    #First check whether the split_name is train or validation
    if split_name not in ['train', 'validation']:
        raise ValueError('The split_name %s is not recognized. Please input either train or validation as the split_name' % (split_name))
    #Create the full path for a general file_pattern to locate the tfrecord_files
    file_pattern_path = os.path.join(dataset_dir, file_pattern % (split_name))
    #Count the total number of examples in all of these shards
    num_samples = 0
    file_pattern_for_counting = file_pattern_for_counting + '_' + split_name
    tfrecords_to_count = [os.path.join(dataset_dir, file) for file in os.listdir(dataset_dir) if file.startswith(file_pattern_for_counting)]
    for tfrecord_file in tfrecords_to_count:
        for record in tf.python_io.tf_record_iterator(tfrecord_file):
            num_samples += 1
    #Create a reader, which must be a TFRecord reader in this case
    reader = tf.TFRecordReader
    #Create the keys_to_features dictionary for the decoder
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string, default_value='jpg'),
        'image/class/label': tf.FixedLenFeature(
            [], tf.int64, default_value=tf.zeros([], dtype=tf.int64)),
    }
    #Create the items_to_handlers dictionary for the decoder.
    items_to_handlers = {
        'image': slim.tfexample_decoder.Image(),
        'label': slim.tfexample_decoder.Tensor('image/class/label'),
    }
    #Start to create the decoder
    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features, items_to_handlers)
    #Create the labels_to_name file
    labels_to_name_dict = labels_to_name
    #Actually create the dataset
    #The dataset object defines metadata such as the file locations of the data set, how to decode it, and so on
    dataset = slim.dataset.Dataset(
        data_sources = file_pattern_path,
        decoder = decoder,
        reader = reader,
        num_readers = 4,
        num_samples = num_samples,
        num_classes = num_classes,
        labels_to_name = labels_to_name_dict,
        items_to_descriptions = items_to_descriptions)
    return dataset
def load_batch(dataset, batch_size, height=img_height, width=img_width, is_training=True):
    '''
    Loads a batch for training.
    INPUTS:
    - dataset(Dataset): a Dataset class object that is created from the get_split function
    - batch_size(int): determines how big of a batch to train
    - height(int): the height of the image to resize to during preprocessing
    - width(int): the width of the image to resize to during preprocessing
    - is_training(bool): to determine whether to perform a training or evaluation preprocessing
    OUTPUTS:
    - images(Tensor): a Tensor of the shape (batch_size, height, width, channels) that contains one batch of images
    - labels(Tensor): the batch's labels with the shape (batch_size,) (requires one_hot_encoding).
    '''
    #First create the data_provider object
    data_provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        common_queue_capacity = 24 + 3 * batch_size,
        common_queue_min = 24)
    #Obtain the raw image using the get method
    raw_image, label = data_provider.get(['image', 'label'])
    #Perform the correct preprocessing for this image depending on whether it is training or evaluating
    image = inception_preprocessing.preprocess_image(raw_image, height, width, is_training)
    #As for the raw images, we just do a simple reshape to batch it up
    raw_image = tf.expand_dims(raw_image, 0)
    raw_image = tf.image.resize_nearest_neighbor(raw_image, [height, width])
    raw_image = tf.squeeze(raw_image)
    #Batch up the image by enqueueing the tensors internally in a FIFO queue and dequeueing many elements with tf.train.batch.
    images, raw_images, labels = tf.train.batch(
        [image, raw_image, label],
        batch_size = batch_size,
        num_threads = 4,
        capacity = 4 * batch_size,
        allow_smaller_final_batch = True)
    print("images tensor data type:", tf.shape(images))
    return images, raw_images, labels
def train():
    #Create the log directory here. Must be done here otherwise import will activate this unneededly.
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    #======================= TRAINING PROCESS =========================
    #Now we start to construct the graph and build our model
    #with tf.Graph().as_default() as graph:
    tf.logging.set_verbosity(tf.logging.INFO) #Set the verbosity to INFO level
    #First create the dataset and load one batch
    dataset = get_split('train', dataset_dir, file_pattern=file_pattern)
    images, raw_images, labels = load_batch(dataset, batch_size=batch_size)
    print('num_samples:', dataset.num_samples)
    #Know the number of steps to take before decaying the learning rate and batches per epoch
    num_batches_per_epoch = int(dataset.num_samples / batch_size)
    num_steps_per_epoch = num_batches_per_epoch #Because one step is one batch processed
    decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)
    x = tf.placeholder(tf.float32, shape=[None, img_height, img_width, 3], name='x')
    y_true = tf.placeholder(tf.int32, shape=[None], name='y_true')
    #Create the model inference
    with slim.arg_scope(inception_resnet_v2_arg_scope()):
        logits, end_points = inception_resnet_v2(x, num_classes = dataset.num_classes, is_training = True)
    #Define the scopes that you want to exclude for restoration
    exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
    variables_to_restore = slim.get_variables_to_restore(exclude = exclude)
    #Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
    one_hot_labels = slim.one_hot_encoding(y_true, dataset.num_classes)
    #Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
    loss = tf.losses.softmax_cross_entropy(onehot_labels = one_hot_labels, logits = logits)
    total_loss = tf.losses.get_total_loss() #obtain the regularization losses as well
    #Create the global step for monitoring the learning_rate and training.
    global_step = get_or_create_global_step()
    #Define your exponentially decaying learning rate
    lr = tf.train.exponential_decay(
        learning_rate = initial_learning_rate,
        global_step = global_step,
        decay_steps = decay_steps,
        decay_rate = learning_rate_decay_factor,
        staircase = True)
    #Now we can define the optimizer that takes on the learning rate
    optimizer = tf.train.AdamOptimizer(learning_rate = lr)
    #Create the train_op.
    train_op = slim.learning.create_train_op(total_loss, optimizer)
    #State the metrics that you want to predict. We get a prediction that is not one_hot_encoded.
    y_pred = tf.nn.softmax(logits, name='y_pred')
    predictions = tf.argmax(end_points['Predictions'], 1)
    probabilities = end_points['Predictions']
    accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(predictions, y_true)
    precision, precision_update = tf.contrib.metrics.streaming_precision(predictions, y_true)
    recall, recall_update = tf.contrib.metrics.streaming_recall(predictions, y_true)
    #tf.group returns an 'op'
    metrics_op = tf.group(accuracy_update, probabilities, precision_update, recall_update)
    #Now finally create all the summaries you need to monitor and group them into one summary op.
    tf.summary.scalar('losses/Total_Loss', total_loss)
    tf.summary.scalar('accuracy', accuracy)
    tf.summary.scalar('learning_rate', lr)
    tf.summary.scalar('precision', precision)
    tf.summary.scalar('recall', recall)
    my_summary_op = tf.summary.merge_all()
    #my_summary_op = tf.summary.merge(tf.get_collection(tf.GraphKeys.SUMMARIES))
    #Now we need to create a training step function that runs both the train_op, metrics_op and updates the global_step concurrently.
    def train_step(sess, train_op, global_step, img, lab):
        '''
        Simply runs a session for the three arguments provided and gives a logging on the time elapsed for each global step
        '''
        #Check the time for each sess run
        start_time = time.time()
        total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op], feed_dict={x: img.eval(session=sess), y_true: lab.eval(session=sess)})
        #total_loss, global_step_count, _ = sess.run([train_op, global_step, metrics_op])
        time_elapsed = time.time() - start_time
        #Run the logging to print some results
        #if global_step_count % 10 == 0:
        logging.info('global step %s: loss: %.4f (%.2f sec/step)', global_step_count, total_loss, time_elapsed)
        return total_loss, global_step_count
    #Now we create a saver function that actually restores the variables from a checkpoint file in a sess
    saver = tf.train.Saver(variables_to_restore)
    def restore_fn(sess):
        return saver.restore(sess, checkpoint_file)
    #Define your supervisor for running a managed session. Do not run the summary_op automatically or else it will consume too much memory
    #logdir is used for saving checkpoints and summaries
    """
    What the Supervisor does:
    1. Automatically loads from a checkpoint, or initializes the variables
    2. Has its own Saver, which can be used to save checkpoints
    3. Has a summary_computed method for saving summaries
    So we do not need to:
    1. Manually initialize variables or load them from a checkpoint
    2. Create a Saver; the one inside sv can be used
    3. Create a summary writer
    """
    sv = tf.train.Supervisor(logdir = log_dir, summary_op = None, init_fn = restore_fn)
    #Run the managed session
    #It automatically looks for a checkpoint in logdir; if there is none, it runs the initialization
    with sv.managed_session() as sess:
        for step in xrange(num_steps_per_epoch * num_epochs):
            #At the start of every epoch, show the vital information
            if step % num_batches_per_epoch == 0:
                logging.info('Epoch %s/%s', step/num_batches_per_epoch + 1, num_epochs)
                learning_rate_value, accuracy_value = sess.run([lr, accuracy])
                logging.info('Current Learning Rate: %s', learning_rate_value)
                logging.info('Current Streaming Accuracy: %s', accuracy_value)
                # optionally, print your logits and predictions for a sanity check that things are going fine.
                logits_value, probabilities_value, predictions_value, labels_value = sess.run([logits, probabilities, predictions, labels], feed_dict={x: images.eval(session=sess), y_true: labels.eval(session=sess)})
                print 'logits: \n', logits_value
                print 'Probabilities: \n', probabilities_value
                print 'predictions: \n', predictions_value
                print 'Labels:\n:', labels_value
            #Log the summaries every 10 steps.
            if step % 10 == 0:
                loss, _ = train_step(sess, train_op, sv.global_step, images, labels)
                summaries = sess.run(my_summary_op, feed_dict={x: images.eval(session=sess), y_true: labels.eval(session=sess)})
                sv.summary_computed(sess, summaries)
            #If not, simply run the training step
            else:
                loss, _ = train_step(sess, train_op, sv.global_step, images, labels)
                #raw_images, labels, predictions = sess.run([raw_images, labels, predictions], feed_dict={x: images.eval(session=sess), y_true: labels.eval(session=sess)})
        #We log the final training loss and accuracy
        logging.info('Final Loss: %s', loss)
        logging.info('Final Accuracy: %s', sess.run(accuracy))
        #Once all the training has been done, save the log files and checkpoint model
        logging.info('Finished training! Saving model to disk now.')
        #saver.save(sess, "./log/estate_model.ckpt")
        sv.saver.save(sess, sv.save_path, global_step = sv.global_step)
if __name__ == '__main__':
    train()
The code runs, but the accuracy is very low. I modified the code from loading batches directly into the graph to the placeholder method: I convert the tensors returned by the load_batch() function to numpy arrays with the tensor.eval() method and feed them in via feed_dict={x: img.eval(session=sess), y_true: lab.eval(session=sess)}. I suspect the code snippet below, but I can't find the issue:
dataset = get_split('train', dataset_dir, file_pattern=file_pattern)
images, raw_images, labels = load_batch(dataset, batch_size=batch_size)
print('num_samples:', dataset.num_samples)
#Know the number steps to take before decaying the learning rate and batches per epoch
num_batches_per_epoch = int(dataset.num_samples / batch_size)
num_steps_per_epoch = num_batches_per_epoch #Because one step is one batch processed
decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)
x = tf.placeholder(tf.float32, shape=[None, img_height, img_width, 3], name='x')
y_true = tf.placeholder(tf.int32, shape=[None], name='y_true')
#Create the model inference
with slim.arg_scope(inception_resnet_v2_arg_scope()):
    logits, end_points = inception_resnet_v2(x, num_classes = dataset.num_classes, is_training = True)
#Define the scopes that you want to exclude for restoration
exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
variables_to_restore = slim.get_variables_to_restore(exclude = exclude)
#Perform one-hot-encoding of the labels (Try one-hot-encoding within the load_batch function!)
one_hot_labels = slim.one_hot_encoding(y_true, dataset.num_classes)
#Performs the equivalent to tf.nn.sparse_softmax_cross_entropy_with_logits but enhanced with checks
loss = tf.losses.softmax_cross_entropy(onehot_labels = one_hot_labels, logits = logits)
total_loss = tf.losses.get_total_loss() #obtain the regularization losses as well
I am just starting with tensorflow and I thought a good first step would be to adapt the CIFAR10 model for my own use. My data are not images but signals, and the whole dataset has a shape of [16400, 3000, 1, 1] (dimension-wise: number of samples, height, width, and number of channels, added on purpose). I am already working on this problem with the MatConvNet toolbox, so this question is strictly about the tensorflow mechanism. The dataset is a ready numpy tensor of the size above; the code below is my attempt to prepare the data to be readable for the training script.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
import numpy as np
IMAGE_SIZE = 3000
data = np.load('/home/tensorflow-master/tensorflow/models/image/cifar10/konsensop/data.npy')
labels = np.load('/home/tensorflow-master/tensorflow/models/image/cifar10/konsensop/labels.npy')
labels = labels-1
labels = labels.astype(int)
data = tf.cast(data,tf.float32)
labels = tf.cast(labels,tf.int64)
NUM_CLASSES = 2
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 10000
NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 6400
def _generate_image_and_label_batch(data_sample, label, min_queue_examples,
                                    batch_size, shuffle):
    num_preprocess_threads = 16
    if shuffle:
        data, label_batch = tf.train.shuffle_batch(
            [data_sample, label],
            batch_size=batch_size,
            num_threads=num_preprocess_threads,
            capacity=min_queue_examples + batch_size,
            min_after_dequeue=min_queue_examples)
    else:
        data, label_batch = tf.train.batch(
            [data_sample, label],
            batch_size=batch_size,
            num_threads=num_preprocess_threads,
            capacity=min_queue_examples + batch_size)
    return data, tf.reshape(label_batch, [batch_size])
def inputs(data, labels, batch_size):
    for i in xrange(0, data.shape[0]/batch_size):
        data_sample = data[i,:,:,:]
        label = labels[i,0]
        height = 3000
        width = 1
        min_fraction_of_examples_in_queue = 0.4
        min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN*
                                 min_fraction_of_examples_in_queue)
    print('Filling queue with %d data before starting to train' % min_queue_examples)
    return _generate_image_and_label_batch(data_sample, label,
                                           min_queue_examples, batch_size,
                                           shuffle=True)
I'm trying to load the data I already have and generate batches the way the cifar10 model did, but when running the trainer code I get an error in data, labels = konsensop_input.inputs(data, labels, batch_size): UnboundLocalError: local variable 'data' referenced before assignment.
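As far as I can tell, the error itself comes from Python's scoping rules: assigning to data inside train() below makes the name local to the function, so reading it on the same line fails. A minimal example that reproduces this kind of error:

data = [1, 2, 3]  # module-level name
def f():
    data = data + [4]  # raises UnboundLocalError: the assignment makes 'data' local
f()

Here is the trainer code: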
data = konsensop_input.data
labels = konsensop_input.labels
def train():
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable = False)
        data, labels = konsensop_input.inputs(data, labels, batch_size)
        logits = konsensop_train.inference(data)
        # calculate loss
        loss = konsensop.loss(logits, labels)
        train_op = konsensop.train(loss, global_step)
        # create a saver
        saver = tf.train.Saver(tf.all_variables()) # saves all variables in a graph
        # build the summary operation based on the TF collection of summaries
        summary_op = tf.merge_all_summaries()
        # build an initialization operation to run below
        init = tf.initialize_all_variables()
        # start running operations on the graph
        sess = tf.Session(config = tf.ConfigProto(log_device_placement=False))
        sess.run(init)
        # start the queue runners (what is this and what is it for?)
        tf.train.start_queue_runners(sess = sess)
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)
        for step in xrange(FLAGS.max_step):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                print(format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch))
            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step = step)
def main(argv=None):
    train()
if __name__=='__main__':
    tf.app.run()
I would like to figure out how to implement a reasonable data feeding technique here
For the relatively small data set you want to work with, you might consider just loading it into a big numpy array, then iterating over it in mini-batches, which you feed to the computation graph via tf.placeholders and the feed_dict mechanism.
The mini-batch iteration could look something like this (you should probably add random shuffling after each epoch):
def iterate_batches(X, y, batch_size, num_epochs):
    N = np.size(X, 0)
    batches_per_epoch = N // batch_size  # integer division, since range() needs an int
    for i in range(num_epochs):
        for j in range(batches_per_epoch):
            start, stop = j*batch_size, (j+1)*batch_size
            yield X[start:stop, :], y[start:stop]
(If you are not familiar with Python's yield mechanism, google for Python generators. There are lots of good introductions on the web.)
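For example, the per-epoch shuffling suggested above could look like this (a sketch; it permutes X and y with the same random order at the start of each epoch):

def iterate_batches_shuffled(X, y, batch_size, num_epochs):
    N = np.size(X, 0)
    batches_per_epoch = N // batch_size
    for i in range(num_epochs):
        perm = np.random.permutation(N)  # fresh random order each epoch
        X_shuffled, y_shuffled = X[perm], y[perm]
        for j in range(batches_per_epoch):
            start, stop = j*batch_size, (j+1)*batch_size
            yield X_shuffled[start:stop, :], y_shuffled[start:stop]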
Given that you have a mechanism to load the whole data set into numpy arrays X_train, y_train, you can then write your training loop like this:
train_op = ...
for X, y in iterate_batches(X_train, y_train, your_batch_size, your_num_epochs):
    sess.run([train_op], feed_dict={X_tensor: X, y_tensor: y})
Here, X_tensor and y_tensor are tf.placeholders for the data that you have to define as part of your network architecture.
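For the signal data described above, a minimal sketch of those placeholders might look like this (the shapes are assumptions based on the [16400, 3000, 1, 1] layout in the question):

X_tensor = tf.placeholder(tf.float32, shape=[None, 3000, 1, 1])  # a mini-batch of signals
y_tensor = tf.placeholder(tf.int64, shape=[None])                # one integer label per signal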
So I have this great bit of code that comes out with approximately a 93% accuracy rate on its predictions. What I'm wondering now is how to take the trained program, make it look at actual test data without the answer on it, and have it fill in the answer regardless of the accuracy. Here's the code I have that predicts with a ~93% accuracy rate.
import tensorflow as tf
import numpy as np
from numpy import genfromtxt
import sklearn
# Convert to one hot
def convertOneHot(data):
    y=np.array([int(i[0]) for i in data])
    y_onehot=[0]*len(y)
    for i,j in enumerate(y):
        y_onehot[i]=[0]*(y.max() + 1)
        y_onehot[i][j]=1
    return (y,y_onehot)
data = genfromtxt('cs-training.csv',delimiter=',') # Training data
test_data = genfromtxt('cs-test.csv',delimiter=',') # Test data
x_train=np.array([ i[1::] for i in data])
y_train,y_train_onehot = convertOneHot(data)
x_test=np.array([ i[1::] for i in test_data])
y_test,y_test_onehot = convertOneHot(test_data)
A=data.shape[1]-1 # Number of features, Note first is y
B=len(y_train_onehot[0])
tf_in = tf.placeholder("float", [None, A]) # Features
tf_weight = tf.Variable(tf.zeros([A,B]))
tf_bias = tf.Variable(tf.zeros([B]))
tf_softmax = tf.nn.softmax(tf.matmul(tf_in,tf_weight) + tf_bias)
# Training via backpropagation
tf_softmax_correct = tf.placeholder("float", [None,B])
tf_cross_entropy = -tf.reduce_sum(tf_softmax_correct*tf.log(tf_softmax))
# Train using tf.train.GradientDescentOptimizer
tf_train_step = tf.train.GradientDescentOptimizer(0.01).minimize(tf_cross_entropy)
# Add accuracy checking nodes
tf_correct_prediction = tf.equal(tf.argmax(tf_softmax,1), tf.argmax(tf_softmax_correct,1))
tf_accuracy = tf.reduce_mean(tf.cast(tf_correct_prediction, "float"))
saver = tf.train.Saver([tf_weight,tf_bias])
# Initialize and run
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
print("...")
# Run the training
for i in range(100):
    sess.run(tf_train_step, feed_dict={tf_in: x_train, tf_softmax_correct: y_train_onehot})
#Print accuracy
result = sess.run(tf_accuracy, feed_dict={tf_in: x_test, tf_softmax_correct: y_test_onehot})
print result
Now I have the actual test set cs-test-actual.csv where the first column is entirely empty and I need to fill it in with a predicted 1 or 0. How do I go about doing that?
The program above doesn't appear to be saving the trained session. I think you want to do this in two steps.
Train and save the session
Restore the saved session, and run test data through it.
Step 1:
#!/usr/bin/env python
import tensorflow as tf
import numpy as np
from numpy import genfromtxt
import sklearn
# Convert to one hot
def convertOneHot(data):
    y=np.array([int(i[0]) for i in data])
    y_onehot=[0]*len(y)
    for i,j in enumerate(y):
        y_onehot[i]=[0]*(y.max() + 1)
        y_onehot[i][j]=1
    return (y,y_onehot)
# Build Example Data is CSV format, but use Iris data
from sklearn import datasets
from sklearn.model_selection import train_test_split
def buildDataFromIris():
    iris = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.33, random_state=42)
    f=open('cs-training.csv','w')
    for i,j in enumerate(X_train):
        k=np.append(np.array(y_train[i]),j)
        f.write(",".join([str(s) for s in k]) + '\n')
    f.close()
    f=open('cs-test.csv','w')
    for i,j in enumerate(X_test):
        k=np.append(np.array(y_test[i]),j)
        f.write(",".join([str(s) for s in k]) + '\n')
    f.close()
# Recreate logging and save dir
# Seems the tensorflow won't always overwrite
import shutil, os, sys
TMPDir='./tensorTMP'
try:
    shutil.rmtree(TMPDir)
except:
    print "Tmp Dir did not exist...that's okay"
os.mkdir(TMPDir, 0755)
# Populate the data
buildDataFromIris()
data = genfromtxt('cs-training.csv',delimiter=',') # Training data
test_data = genfromtxt('cs-test.csv',delimiter=',') # Test data
x_train=np.array([ i[1::] for i in data])
y_train,y_train_onehot = convertOneHot(data)
x_test=np.array([ i[1::] for i in test_data])
y_test,y_test_onehot = convertOneHot(test_data)
A=data.shape[1]-1 # Number of features, Note first is y
B=len(y_train_onehot[0])
tf_in = tf.placeholder("float", [None, A]) # Features
tf_weight = tf.Variable(tf.zeros([A,B]))
tf_bias = tf.Variable(tf.zeros([B]))
tf_softmax = tf.nn.softmax(tf.matmul(tf_in,tf_weight) + tf_bias)
# Training via backpropagation
tf_softmax_correct = tf.placeholder("float", [None,B])
tf_cross_entropy = -tf.reduce_sum(tf_softmax_correct*tf.log(tf_softmax))
# Train using tf.train.GradientDescentOptimizer
tf_train_step = tf.train.GradientDescentOptimizer(0.01).minimize(tf_cross_entropy)
# Add accuracy checking nodes
tf_correct_prediction = tf.equal(tf.argmax(tf_softmax,1), tf.argmax(tf_softmax_correct,1))
tf_accuracy = tf.reduce_mean(tf.cast(tf_correct_prediction, "float"))
saver = tf.train.Saver([tf_weight,tf_bias])
# Initialize and run
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
THRESHOLD = 0.98
saved = False
print("...")
# Run the training
for i in range(100):
    sess.run(tf_train_step, feed_dict={tf_in: x_train, tf_softmax_correct: y_train_onehot})
    result = sess.run(tf_accuracy, feed_dict={tf_in: x_test, tf_softmax_correct: y_test_onehot})
    # If it's well trained on this iteration, save it. We just need one save.
    if result > THRESHOLD and saved == False:
        saved = True
        print "saving result {}".format(result)
        saver.save(sess, TMPDir + "/savedSess")
The only modifications made were generating sample data using Iris and establishing a THRESHOLD, or confidence cutoff, for the accuracy. If the accuracy is over that THRESHOLD, the session is saved. After running step one, the model should be trained and saved.
Step 2:
Restore the saved session, and run the test data through it.
#!/usr/bin/env python
import tensorflow as tf
import numpy as np
from numpy import genfromtxt
import sklearn
# Convert to one hot
def convertOneHot(data):
    y=np.array([int(i[0]) for i in data])
    y_onehot=[0]*len(y)
    for i,j in enumerate(y):
        y_onehot[i]=[0]*(y.max() + 1)
        y_onehot[i][j]=1
    return (y,y_onehot)
data = genfromtxt('cs-training.csv',delimiter=',') # Training data
test_data = genfromtxt('cs-test.csv',delimiter=',') # Test data
x_train=np.array([ i[1::] for i in data])
y_train,y_train_onehot = convertOneHot(data)
x_test=np.array([ i[1::] for i in test_data])
y_test,y_test_onehot = convertOneHot(test_data)
A=data.shape[1]-1 # Number of features, Note first is y
B=len(y_train_onehot[0])
tf_in = tf.placeholder("float", [None, A]) # Features
tf_weight = tf.Variable(tf.zeros([A,B]))
tf_bias = tf.Variable(tf.zeros([B]))
tf_softmax = tf.nn.softmax(tf.matmul(tf_in,tf_weight) + tf_bias)
# Training via backpropagation
tf_softmax_correct = tf.placeholder("float", [None,B])
tf_cross_entropy = -tf.reduce_sum(tf_softmax_correct*tf.log(tf_softmax))
# Train using tf.train.GradientDescentOptimizer
tf_train_step = tf.train.GradientDescentOptimizer(0.01).minimize(tf_cross_entropy)
# Add accuracy checking nodes
tf_correct_prediction = tf.equal(tf.argmax(tf_softmax,1), tf.argmax(tf_softmax_correct,1))
tf_accuracy = tf.reduce_mean(tf.cast(tf_correct_prediction, "float"))
saver = tf.train.Saver([tf_weight,tf_bias])
# Initialize and run
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
TMPDir='./tensorTMP'
saver.restore(sess, TMPDir + '/savedSess')
ans = sess.run(tf_softmax, feed_dict={tf_in: x_test, tf_softmax_correct: y_test_onehot})
print ans
Note, your output will look like the following...
[[ 6.17585704e-02 8.63590300e-01 7.46511072e-02]
[ 9.98804331e-01 1.19561062e-03 3.25832108e-13]
[ 1.52018686e-07 4.49650863e-04 9.99550164e-01]
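Each row holds the softmax probabilities for one test example, one column per class. To fill in the predicted label the question asks about, you could take the argmax of each row (a sketch, assuming ans as above):

predicted = np.argmax(ans, axis=1)  # index of the most probable class per example

You could then write predicted back into the empty first column of cs-test-actual.csv.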