tensorflow using tf.train.string_input_producer - python

I'm using tf.train.string_input_producer to read data from a TFRecord file. I assume it creates a queue and pipeline so that the data is automatically loaded and fed into my model. However, it gets stuck at the first batch and raises this exception:
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value input_producer/limit_epochs/epochs
My TFRecord file was made with tf.train.SequenceExample instead of tf.train.Example, which doesn't have clear documentation in the official guide.
Here is a code snippet to reproduce my problem. (I believe the problem comes from the queue initialization or something similar, because the whole pipeline seems to hang.)
import tensorflow as tf
from config.config import get_config

config = get_config()

filename_queue = tf.train.string_input_producer(['data0.tfrecord', 'data1.tfrecord'], 5, capacity=16384)
reader = tf.TFRecordReader()
(keys, values) = reader.read_up_to(filename_queue, config.batch_size)

context_features = {
    "seq_len": tf.FixedLenFeature([1], dtype=tf.int64),
}
audio_features = {
    "audio": tf.FixedLenSequenceFeature([config.num_features], dtype=tf.float32),
    "label": tf.FixedLenSequenceFeature([config.num_classes], dtype=tf.float32)
}

audio_list = []
label_list = []
len_list = []
for i in range(config.batch_size):
    print(i)
    context, sequence = tf.parse_single_sequence_example(
        serialized=values[i],
        context_features=context_features,
        sequence_features=audio_features
    )
    audio = sequence['audio']
    label = sequence['label']
    # seq_len = context['seq_len'][0]
    seq_len = tf.shape(audio)[0]
    audio_list.append(audio)
    label_list.append(label)
    len_list.append(seq_len)

audio_tensor = tf.stack(audio_list)
label_tensor = tf.stack(label_list)
len_tensor = tf.stack(len_list)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    threads = tf.train.start_queue_runners(sess=sess)
    for i in range(3):
        x, y, z = sess.run([audio_tensor, label_tensor, len_tensor])
        print(z)
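For context, the records were written roughly along these lines (a rough sketch of the writer side based on the parsing spec above, not my exact code; audio_frames and frame_labels are placeholder names for the per-frame data):

writer = tf.python_io.TFRecordWriter('data0.tfrecord')
example = tf.train.SequenceExample(
    context=tf.train.Features(feature={
        "seq_len": tf.train.Feature(int64_list=tf.train.Int64List(value=[len(audio_frames)])),
    }),
    feature_lists=tf.train.FeatureLists(feature_list={
        # one Feature per time step, each holding a fixed-length float vector
        "audio": tf.train.FeatureList(feature=[
            tf.train.Feature(float_list=tf.train.FloatList(value=frame))
            for frame in audio_frames]),
        "label": tf.train.FeatureList(feature=[
            tf.train.Feature(float_list=tf.train.FloatList(value=lab))
            for lab in frame_labels]),
    }))
writer.write(example.SerializeToString())
writer.close()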

Try
init2 = tf.local_variables_initializer()
sess.run(init2)
The variables created inside tf.train.string_input_producer() (such as the num_epochs counter) are local variables. You have to initialize them with the local variables initializer, as shown above.
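A minimal sketch of the combined initialization for the snippet above (only the session part changes; a Coordinator is added so the queue-runner threads can be stopped cleanly):

init_ops = [tf.global_variables_initializer(), tf.local_variables_initializer()]
with tf.Session() as sess:
    # Initialize both global and local variables before starting the queue runners.
    sess.run(init_ops)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for i in range(3):
        x, y, z = sess.run([audio_tensor, label_tensor, len_tensor])
        print(z)
    coord.request_stop()
    coord.join(threads)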
Let me know if this helped.

Related

Tensorflow error with dataset iterator initialization in monitoredtrainingsession

Hi everyone, I need some help.
I'm trying to code ResNet-101 ImageNet classification using TensorFlow without using Estimator. I'm doing this to study deep learning and understand how to use TensorFlow.
My problem is that MonitoredTrainingSession does not initialize my iterator.
I have read some articles about this problem and tried to use a hook to handle it, but it fails and I have no idea why.
After I create the MonitoredTrainingSession, it first initializes the train iterator,
runs until it gets an OutOfRange exception,
and then the validation step is performed.
It seems fine up to that point, but after validation finishes and I try to run the training step again, I get an error related to iterator.get_next().
It says I did not initialize the iterator, but my hook function clearly calls
session.run(self._initializer, feed_dict={filenames: self._filenames})
I'm sure of that because I can see the message below, which I print to check whether it is initialized or not:
iter_val.initializer after_create_session is called 0 times
What am I doing wrong?
The running flow is like below:
run train step: fine (epoch = 0)
run validation step: fine (epoch = 0)
run train step: Error (epoch = 1)
Please ignore Horovod (hvd) in the code, because I am not using it right now.
Here is my code; please help me fix it and let me know what's wrong with it.
class _DatasetInitializerHook(tf.train.SessionRunHook):
    def __init__(self, initializer, filenames=[], name=""):
        self._initializer = initializer
        self._filenames = filenames
        self._name = name
        self._cnt = 0
        self._before_runCnt = 0

    def begin(self):
        pass

    def after_create_session(self, session, coord):
        del coord
        if len(self._filenames) == 0:
            session.run(self._initializer)
        else:
            session.run(self._initializer, feed_dict={filenames: self._filenames})
        print(self._name, "after_create_session is called {} times".format(self._cnt))
        self._cnt += 1


if __name__ == "__main__":
    if len(sys.argv) > 1:
        nlogs = sys.argv[1]
    else:
        nlogs = 0
    hvd.init()

    b_imagenet = False
    if b_imagenet:
        training_filenames = ['/data/tfrecords/imagenet2012_train_shard{}.tfrecord'.format(i) for i in range(129)]
    else:
        training_filenames = ['/data/cifar-10-tfrecords/train_shard{}.tfrecord'.format(i) for i in range(1, 2, 1)]

    filenames = tf.placeholder(tf.string, shape=[None])
    trainData = dataset_input_fn(is_training=True, filename=filenames, nworkers=hvd.size(), workeridx=hvd.rank(),
                                 batch_size=FLAGS.batchSize, prefetch_size=FLAGS.prefetch_buffer_size, repeat=1,
                                 shuffle_buffer_size=FLAGS.shuffle_buffer_size)
    valData = dataset_input_fn(is_training=False, filename=FLAGS.validationfile, nworkers=hvd.size(), workeridx=hvd.rank(),
                               batch_size=1, prefetch_size=FLAGS.prefetch_buffer_size, repeat=1, shuffle_buffer_size=1)

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    for i in tqdm(range(FLAGS.nepoch)):
        shuffle(training_filenames)
        model = model_class(nCls=FLAGS.nClasses, img_width=FLAGS.width, img_height=FLAGS.height,
                            learning_rate=FLAGS.learning_rate, weight_decay=FLAGS.weight_decay)

        iter_train = trainData.make_initializable_iterator()
        train_op = model.build_model(iter_train.get_next(), is_trainig=True, hvd=None)
        train_hooks = [hvd.BroadcastGlobalVariablesHook(0),
                       _DatasetInitializerHook(iter_train.initializer, training_filenames, "iter_train.initializer")]
        with tf.train.MonitoredTrainingSession(checkpoint_dir="./tmp/train_logs", config=config, hooks=train_hooks,
                                               save_checkpoint_secs=30) as sess:
            try:
                while True:
                    opt = sess.run([train_op])
            except tf.errors.OutOfRangeError:
                pass

        iter_val = valData.make_initializable_iterator()
        prediction_result = model.build_model(iter_val.get_next(), is_trainig=False, hvd=None)
        validation_hooks = [hvd.BroadcastGlobalVariablesHook(0),
                            _DatasetInitializerHook(iter_val.initializer, [], "iter_val.initializer")]
        with tf.train.MonitoredTrainingSession(checkpoint_dir="./tmp/train_logs", config=config, hooks=validation_hooks) as sess:
            try:
                while True:
                    result = sess.run([prediction_result])
            except tf.errors.OutOfRangeError:
                pass
This is the error message I got.
tensorflow.python.framework.errors_impl.FailedPreconditionError: GetNext() failed because the iterator has not been initialized. Ensure that you have run the initializer operation for this iterator before getting the next element.
[[node IteratorGetNext (defined at workspace/multi_gpu/main.py:128) ]]
Errors may have originated from an input operation.
Input Source operations connected to node IteratorGetNext:
IteratorV2_2 (defined at workspace/multi_gpu/main.py:126)
Try putting your initializer into a scaffold:
scaffold = tf.train.Scaffold(local_init_op=train_init_operator)
and give it to the MonitoredTrainingSession with:
with tf.train.MonitoredTrainingSession(scaffold=scaffold, ...
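A minimal sketch of how that could slot into the training loop above (this assumes the iterator initializer does not need a feed_dict, so the filenames placeholder would have to be replaced by a constant list of filenames when building the dataset):

iter_train = trainData.make_initializable_iterator()
train_op = model.build_model(iter_train.get_next(), is_trainig=True, hvd=None)
# The scaffold's local_init_op is run while the session is being created,
# so the iterator is initialized before the first call to get_next().
scaffold = tf.train.Scaffold(local_init_op=iter_train.initializer)
with tf.train.MonitoredTrainingSession(checkpoint_dir="./tmp/train_logs",
                                       config=config,
                                       scaffold=scaffold,
                                       save_checkpoint_secs=30) as sess:
    ...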

Running multiple tensorflow sessions subsequently

I'm developing a simple REST controller using gunicorn and flask.
At each REST call, I execute the following code
@app.route('/objects', methods=['GET'])
def get_objects():
    video_title = request.args.get('video_title')
    video_path = "../../video/" + video_title
    cl.logger.info(video_path)
    start = request.args.get('start')
    stop = request.args.get('stop')
    scene = [start, stop]

    frames = images_utils.extract_frames(video_path, scene[0], scene[1], 1)
    cl.logger.info(scene[0] + " " + scene[1])
    objects = list()

    # objects
    model = GenericDetector('../resources/open_images/frozen_inference_graph.pb', '../resources/open_images/labels.txt')
    model.run(frames)
    for result in model.get_boxes_and_labels():
        if result is not None:
            objects.append(result)

    data = {'message': {
        'start_time': scene[0],
        'end_time': scene[1],
        'path': video_path,
        'objects': objects,
    }, 'metadata_type': 'detection'}
    return jsonify({'status': data}), 200
This code runs a tensorflow frozen model as follows:
class GenericDetector(Process):
    def __init__(self, model, labels):
        # Load a (frozen) Tensorflow model into memory.
        self.detection_graph = tf.Graph()
        with self.detection_graph.as_default():
            od_graph_def = tf.GraphDef()
            with tf.gfile.GFile(model, 'rb') as fid:
                serialized_graph = fid.read()
                od_graph_def.ParseFromString(serialized_graph)
                tf.import_graph_def(od_graph_def, name='')
        self.boxes_and_labels = []
        # Loading label map
        with open(labels) as f:
            txt_labels = f.read()
            self.labels = json.loads(txt_labels)

    def run(self, frames):
        tf.reset_default_graph()
        with self.detection_graph.as_default():
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            with tf.Session(graph=self.detection_graph, config=config) as sess:
                image_tensor = self.detection_graph.get_tensor_by_name('image_tensor:0')
                # Each box represents a part of the image where a particular object was detected.
                detection_boxes = self.detection_graph.get_tensor_by_name('detection_boxes:0')
                # Each score represents the level of confidence for each of the objects.
                detection_scores = self.detection_graph.get_tensor_by_name('detection_scores:0')
                detection_classes = self.detection_graph.get_tensor_by_name('detection_classes:0')
                num_detections = self.detection_graph.get_tensor_by_name('num_detections:0')
                i = 0
                for frame in frames:
                    # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
                    image_np_expanded = np.expand_dims(frame, axis=0)
                    # Actual detection.
                    (boxes, scores, classes, num) = sess.run(
                        [detection_boxes, detection_scores, detection_classes, num_detections],
                        feed_dict={image_tensor: image_np_expanded})
                    boxes = np.squeeze(boxes)
                    classes = np.squeeze(classes).astype(np.int32)
                    scores = np.squeeze(scores)
                    for j, box in enumerate(boxes):
                        if all(v == 0 for v in box):
                            continue
                        self.boxes_and_labels.append(
                            {
                                "ymin": str(box[0]),
                                "xmin": str(box[1]),
                                "ymax": str(box[2]),
                                "xmax": str(box[3]),
                                "label": self.labels[str(classes[j])],
                                "score": str(scores[j]),
                                "frame": i
                            })
                    i += 1
            sess.close()

    def get_boxes_and_labels(self):
        return self.boxes_and_labels
Everything seems to work as expected, but once I send a second request to my server, my GPU (a GTX 1050) goes out of memory:
ResourceExhaustedError (see above for traceback): OOM when allocating
tensor of shape [3,3,256,256] and type float
If I try to make a call after that, it works most of the time. Sometimes it will work on subsequent calls too. I tried executing GenericDetector in a separate Process (making GenericDetector inherit from Process), but it did not help. I read that once the process that executes the REST GET is dead, the GPU memory should be freed, so I also tried adding a sleep(30) after the execution of the tensorflow model, with no luck. What am I doing wrong?
The thing is that TensorFlow allocates the memory for the process, not the Session, so closing the session is not enough (even if you set the allow_growth option). From the TensorFlow documentation on GPU memory options:
The first is the allow_growth option, which attempts to allocate only as much GPU memory based on runtime allocations: it starts out allocating very little memory, and as Sessions get run and more GPU memory is needed, we extend the GPU memory region needed by the TensorFlow process. Note that we do not release memory, since that can lead to even worse memory fragmentation.
There is an issue on the TensorFlow GitHub with some solutions; you could, for example, decorate your run method with the RunAsCUDASubprocess decorator proposed in the thread.
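A rough sketch of the subprocess idea (this is not the exact RunAsCUDASubprocess helper from that thread; it only illustrates isolating the TensorFlow session in a short-lived worker process so the GPU memory is returned when that process exits):

import multiprocessing as mp

def _detect(model_path, labels_path, frames, queue):
    # Runs in the child process: all TensorFlow / CUDA allocations
    # belong to that process and are released when it exits.
    detector = GenericDetector(model_path, labels_path)
    detector.run(frames)
    queue.put(detector.get_boxes_and_labels())

def detect_in_subprocess(model_path, labels_path, frames):
    ctx = mp.get_context('spawn')  # fresh interpreter, no inherited CUDA state
    queue = ctx.Queue()
    proc = ctx.Process(target=_detect, args=(model_path, labels_path, frames, queue))
    proc.start()
    results = queue.get()
    proc.join()  # GPU memory is freed once the child has exited
    return results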
This error means that you are trying to fit into the GPU something bigger than the memory you have available. Maybe you can reduce the number of parameters somewhere in your model in order for it to be lighter?

MonitoredTrainingSession writes more than one metagraph event per run

When writing checkpoint files using a tf.train.MonitoredTrainingSession it somehow writes multiple metagraphs. What am I doing wrong?
I stripped it down to the following code:
import tensorflow as tf

global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name="global_step")
train = tf.assign(global_step, global_step + 1)
saver = tf.train.Saver()
hooks = [tf.train.CheckpointSaverHook(checkpoint_dir=output_path + "test1/ckpt/",
                                      save_steps=10,
                                      saver=saver)]

with tf.train.MonitoredTrainingSession(master='',
                                       is_chief=True,
                                       checkpoint_dir=None,
                                       hooks=hooks,
                                       save_checkpoint_secs=None,
                                       save_summaries_steps=None,
                                       save_summaries_secs=None) as mon_sess:
    for i in range(30):
        if mon_sess.should_stop():
            break
        try:
            gs, _ = mon_sess.run([global_step, train])
            print(gs)
        except (tf.errors.OutOfRangeError, tf.errors.CancelledError) as e:
            break
        finally:
            pass
Running this will give duplicate metagraphs, as evidenced by the tensorboard warning:
$ tensorboard --logdir ../train/test1/ --port=6006
WARNING:tensorflow:Found more than one graph event per run, or there
was a metagraph containing a graph_def, as well as one or more graph
events. Overwriting the graph with the newest event. Starting
TensorBoard 54 at local:6006 (Press CTRL+C to quit)
This is in tensorflow 1.2.0 (I cannot upgrade).
Running the same thing without a monitored session gives the right checkpoint output:
global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name="global_step")
train = tf.assign(global_step, global_step + 1)
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op)
    for i in range(30):
        gs, _ = sess.run([global_step, train])
        print(gs)
        if i % 10 == 0:
            saver.save(sess, output_path + '/test2/my-model', global_step=gs)
            print("Saved ckpt")
Results in no tensorboard errors:
$ tensorboard --logdir ../traitest2/ --port=6006
Starting TensorBoard 54 at local:6006 (Press CTRL+C to quit)
I'd like to fix this as I suspect I'm missing something fundamental, and this error may have some connection to other issues I have in distributed mode. I have to restart tensorboard anytime I want to update the data. Moreover, TensorBoard seems to get really slow over time when it puts out many of these warnings.
There is a related question: tensorflow Found more than one graph event per run
In this case the errors were due to multiple runs (with different parameters) written to the same output directory. The case here is about a single run to a clean output directory.
Running the MonitoredTrainingSession version in distributed mode gives the same errors.
Update Oct-12
@Nikhil Kothari suggested using tf.train.MonitoredSession instead of the larger tf.train.MonitoredTrainingSession wrapper, as follows:
import tensorflow as tf

global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name="global_step")
train = tf.assign(global_step, global_step + 1)
saver = tf.train.Saver()
hooks = [tf.train.CheckpointSaverHook(checkpoint_dir=output_path + "test3/ckpt/",
                                      save_steps=10,
                                      saver=saver)]
chiefsession = tf.train.ChiefSessionCreator(scaffold=None,
                                            master='',
                                            config=None,
                                            checkpoint_dir=None,
                                            checkpoint_filename_with_path=None)
with tf.train.MonitoredSession(session_creator=chiefsession,
                               hooks=hooks,
                               stop_grace_period_secs=120) as mon_sess:
    for i in range(30):
        if mon_sess.should_stop():
            break
        try:
            gs, _ = mon_sess.run([global_step, train])
            print(gs)
        except (tf.errors.OutOfRangeError, tf.errors.CancelledError) as e:
            break
        finally:
            pass
Unfortunately this still gives the same tensorboard errors:
$ tensorboard --logdir ../train/test3/ --port=6006
WARNING:tensorflow:Found more than one graph event per run, or there
was a metagraph containing a graph_def, as well as one or more graph
events. Overwriting the graph with the newest event. Starting
TensorBoard 54 at local:6006 (Press CTRL+C to quit)
By the way, each code block is stand-alone; copy-paste it into a Jupyter notebook and you will replicate the problem.
I wonder if this is because every node in your cluster is running the same code, declaring itself as chief, and saving out graphs and checkpoints.
I don't know if the is_chief = True is just illustrative in the post here on Stack Overflow or if that is exactly what you are using... so I'm guessing a bit here.
I personally used MonitoredSession instead of MonitoredTrainingSession and created the list of hooks based on whether the code is running on the master/chief or not. Example: https://github.com/TensorLab/tensorfx/blob/master/src/training/_trainer.py#L94
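A rough sketch of that pattern (the is_chief flag and the way it is derived here are placeholders, not taken from the question's code):

is_chief = (task_index == 0)  # hypothetical: however your cluster identifies the chief
hooks = []
if is_chief:
    # Only the chief gets the checkpoint-saving hook, so workers don't write graphs.
    hooks.append(tf.train.CheckpointSaverHook(checkpoint_dir=output_path + "test1/ckpt/",
                                              save_steps=10,
                                              saver=saver))
session_creator = (tf.train.ChiefSessionCreator() if is_chief
                   else tf.train.WorkerSessionCreator())
with tf.train.MonitoredSession(session_creator=session_creator, hooks=hooks) as mon_sess:
    ...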
You should set the parameter chief_only_hooks in MonitoredTrainingSession; the code is as follows:
hooks = [tf.train.CheckpointSaverHook(checkpoint_dir=output_path + "test1/ckpt/",
                                      save_steps=10,
                                      saver=saver)]
with tf.train.MonitoredTrainingSession(master='',
                                       is_chief=True,
                                       checkpoint_dir=None,
                                       chief_only_hooks=hooks,
                                       save_checkpoint_secs=None,
                                       save_summaries_steps=None,
                                       save_summaries_secs=None) as mon_sess:

Tensorflow hangs with high CPU usage for training a very small forest

I tried the following code in order to train a very small random forest as a test, but for some reason it hangs with high CPU usage. I think it might have to do with the hash table not being initialized, but I am not sure where it should be initialized so that it is accessible both by input_fn and by the call to estimator.evaluate. Any advice?
import csv
import tensorflow as tf
from tensorflow.contrib.cloud.python.ops import bigquery_reader_ops
from tensorflow.contrib.lookup import KeyValueTensorInitializer, HashTable
from tensorflow.contrib.tensor_forest.client.random_forest import TensorForestEstimator
from tensorflow.contrib.tensor_forest.python.tensor_forest import ForestHParams
from tensorflow.python.training.input import string_input_producer

sess = tf.Session()

with open('event_classes.csv', mode='r') as infile:
    reader = csv.reader(infile)
    event_names, event_numbers = list(zip(*[(r[0], int(r[1])) for r in reader]))

def input_fn():
    # Create maps between event names and event numbers
    event_class_map = HashTable(KeyValueTensorInitializer(
        event_names, event_numbers, key_dtype=tf.string, value_dtype=tf.int64), int(0))
    reverse_event_class_map = HashTable(KeyValueTensorInitializer(
        event_numbers, event_names, key_dtype=tf.int64, value_dtype=tf.string), "Unknown")
    # Specify features to read
    features = {"time_{}".format(i): tf.FixedLenFeature([1], tf.string, default_value="") for i in range(4)}
    # Create a Reader.
    reader = bigquery_reader_ops.BigQueryReader(project_id="drivemode-com",
                                                dataset_id="temp_stephane",
                                                table_id="event_history",
                                                timestamp_millis=1497502522,
                                                num_partitions=1,
                                                features=features)
    # Populate a queue with the BigQuery Table partitions.
    queue = string_input_producer(reader.partitions())
    # Read and parse examples.
    row_id, examples_serialized = reader.read_up_to(queue, 100)
    examples = tf.parse_example(examples_serialized, features=features)
    # Process the Tensors example["name"], example["age"], etc...
    for i in range(4):
        col = "time_{}".format(i)
        examples[col] = event_class_map.lookup(examples[col])
    # event_class_map.init.run(session=sess)
    label = examples.pop("time_3")
    return examples, label

hparams = ForestHParams(num_classes=len(event_numbers), num_features=3, max_nodes=3, num_trees=1).fill()
estimator = TensorForestEstimator(hparams)
estimator.fit(input_fn=input_fn, steps=1)
tf.train.start_queue_runners(sess)
print sess.run(estimator.evaluate(input_fn=input_fn))

What is the "RIGHT" structure to save/restore a model in Tensorflow during training/val/test?

I want to write some code in TensorFlow that can train a model, run validation during training, and finally report results on test data for the best model selected via the validation data. I was wondering, is the following structure the right way to do that? [considering variable scopes, parameter sharing, saving/restoring, ...]
MyModel.py
class MyModel(object):

    def build_model(self, reuse):
        with tf.variable_scope("Model", reuse=reuse) as scope:
            self.v1 = tf.get_variable("v1", [1, 2])
            # rest of the code

    def train(self, sess):
        self.build_model(False)
        s1 = tf.train.Saver()
        init_opt = tf.global_variables_initializer()
        sess.run(init_opt)
        # model training
        # ...
        s1.save(sess, "/tmp/model.ckpt")

    def val(self, sess):
        self.build_model(True)
        s2 = tf.train.Saver()
        # do the validation
        s2.save(sess, "/tmp/best_model.ckpt")

    def test(self, sess):
        self.build_model(False)
        s3 = tf.train.Saver()
        s3.restore(sess, "/tmp/best_model.ckpt")
        # rest of the code ...
And I wrote the following functions in two different files:
train.py:
with tf.Session() as sess:
    mtrain = MyModel()
    mval = MyModel()
    for iter_i in range(num_training_iters):
        mtrain.train(sess)
        mval.val(sess)
test.py:
with tf.Session() as sess:
    mtest = MyModel()
    mtest.test(sess)
I looked at Tensorflow tutorials, but none of them have this structure.
Any help would be highly appreciated.
Thanks
