MonitoredTrainingSession save and restore model

MonitoredTrainingSession save and restore model - python

I'm trying to extend the example https://www.tensorflow.org/deploy/distributed outlined here but I'm having trouble saving the model. I'm running this in docker container available at gcr.io/tensorflow/tensorflow:1.5.0-gpu-py3. I started two processes one for 'ps' and one for 'worker' and the ps process is simply this code:
import tensorflow as tf
def main(_):
cluster = tf.train.ClusterSpec({"ps":["localhost:2222"],"worker":["localhost:2223"]})
server = tf.train.Server(cluster, job_name="ps", task_index=0)
server.join()
if __name__ == "__main__":
tf.app.run()
The worker code is the following and is based on the mnist examples and the distributed article above:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
data_dir = "/data"
checkpoint_dir = "/tmp/train_logs"
def main(_):
cluster = tf.train.ClusterSpec({"ps":["localhost:2222"],"worker":["localhost:2223"]})
server = tf.train.Server(cluster, job_name="worker", task_index=0)
mnist = input_data.read_data_sets(data_dir, one_hot=True)
with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:0", cluster=cluster)):
x = tf.placeholder(tf.float32, [None,784], name="x_input")
W = tf.Variable(tf.zeros([784,10]))
b = tf.Variable(tf.zeros([10]))
y = tf.placeholder(tf.float32, [None,10])
model = tf.matmul(x, W) + b
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=model))
global_step = tf.train.get_or_create_global_step()
train_op = tf.train.GradientDescentOptimizer(0.5).minimize(cost, global_step=global_step)
prediction = tf.equal(tf.argmax(model,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))
hooks = [tf.train.StopAtStepHook(last_step=101)]
with tf.train.MonitoredTrainingSession(master=server.target, is_chief=True, checkpoint_dir=checkpoint_dir, hooks=hooks) as sess:
while not sess.should_stop():
batch_xs, batch_ys = mnist.train.next_batch(1000)
sess.run(train_op, feed_dict={x: batch_xs, y: batch_ys})
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
#saver = tf.train.Saver()
saver = tf.train.import_meta_graph(latest_checkpoint+".meta", clear_devices=True)
with tf.Session() as sess:
saver.restore(sess,latest_checkpoint) # "/tmp/train_logs/model.ckpt"
acc = sess.run(accuracy, feed_dict={x: mnist.test.images,y: mnist.test.labels});
print("Test accuracy = "+"{:5f}".format(acc))
if __name__ == "__main__":
tf.app.run()
The examples I've found all seem to end without showing how to use the model. The above code fails on the saver.restore() line with the following error:
InvalidArgumentError (see above for traceback): Cannot assign a device for operation 'save/RestoreV2_2':
Operation was explicitly assigned to /job:ps/task:0/device:CPU:0
but available devices are [ /job:localhost/replica:0/task:0/device:CPU:0, /job:localhost/replica:0/task:0/device:GPU:0 ].
Make sure the device specification refers to a valid device.
Also, as shown above I tried both saver = tf.train.Saver() and saver = tf.train.import_meta_graph(latest_checkpoint+".meta", clear_devices=True) with no success. Same error is shown in either case.
I don't really understand the with tf.device(...): statement. In one iteration I commented out this line (and unindented the statements below it) and the code ran without errors. But I think this is not correct and would like to understand the correct way for this to work.

Related

I don't know how to do this at ubuntu tensorflow-gpu, "E tensorflow / core / util / events_writer.cc: 104]

I need to write a checkpoint on this deep learning problem. But that error is preventing me from writing the file. (E : at tf.summary.FileWriter)
Error :
"tensorflow/core/util/events_writer.cc:104] Write failed because file could not be opened.
i tryed this. Reinstallation, deletion, and authorization (tensorflow-gpu, tensorflow, tensorboard)
with tf.Session() as sess:
dir = pickledir
dic_0 = pat_level_arr(dir, 0, [0, 1])
dic_1 = pat_level_arr(dir, 1, [1, 0])
seed = 42
abc = range(len(dic_0))
abcd = range(len(dic_1))
dic_0_train, dic_0_test, _, _ = train_test_split(
dic_0, abc, test_size=0.244, random_state=seed)
dic_1_train, dic_1_test, _, _ = train_test_split(
dic_1, abcd, test_size=0.35, random_state=seed)
dic_train = np.concatenate((dic_0_train, dic_1_train), axis=0)
dic_test = np.concatenate((dic_0_test, dic_1_test), axis=0)
summaries_dir = './logs_level'
#here is the problem "tensorflow/core/util/events_writer.cc:104] Write failed because file could not be opened.
======================================================================
print("here is start\n")
train_writer = tf.summary.FileWriter(summaries_dir + '/train', sess.graph)
test_writer = tf.summary.FileWriter(summaries_dir + '/test')
print("here is end\n")
======================================================================
init = tf.global_variables_initializer()
sess.run(init)
# For train
try:
saver.restore(sess, './modelckpt/inception.ckpt')
print('Model restored')
epoch_saved = data_saved['var_epoch_saved'].eval()
except tf.errors.NotFoundError:
print('No saved model found')
epoch_saved = 1
except tf.errors.InvalidArgumentError:
print('Model structure has change. Rebuild model')
epoch_saved = 1
E tensorflow/core/util/events_writer.cc:104] Write failed because file could not be opened.
ValueError : the passed save_path is not a valid checkpoint: ./modelckpt/inception.ckpt
tensorflow-gpu version is 1.10.0.
python version is 3.5(i think).
I install tensorboard already.

Basically the error conveys that the checkpoint file is absent and therefore it is not a valid checkpoint.
You need to Save the Model using the below code, before executing saver.restore() method as it loads the file from the disk.
saver = tf.train.Saver()
with tf.Session() as sess:
sess.run(init_op)
# Do some work with the model.
# Save the variables to disk.
save_path = saver.save(sess, "/tmp/model.ckpt")
Please refer Save and Restore explanation and Code for Tensorflow Version 1.x in the below link, https://github.com/tensorflow/docs/blob/master/site/en/r1/guide/saved_model.md

training operation not getting loaded after restoring model Tensorflow

I have tried this:
saver = tf.train.import_meta_graph(tf.train.latest_checkpoint(model_path)+".meta")
sess = tf.Session()
if(tf.train.checkpoint_exists(tf.train.latest_checkpoint(model_path))):
saver.restore(sess, tf.train.latest_checkpoint(model_path))
print(tf.train.latest_checkpoint(model_path) + "Session Loaded for Testing")
graph = tf.get_default_graph()
X = graph.get_tensor_by_name('input:0')
y = graph.get_tensor_by_name('output:0')
loss = tf.reduce_mean(tf.square(outputs - y)) # loss function = mean squared error
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss))
I got the following error:
ValueError: Duplicate node name in graph: 'rnn/multi_rnn_cell/cell_0/layer0/kernel/Adam'
I tried using the name as mentioned in the error:
optimizer = graph.get_operation_by_name("rnn/multi_rnn_cell/cell_0/layer0/kernel/Adam")
Then I received this error:
AttributeError: 'Operation' object has no attribute 'minimize'
Please let me know how I can get access to the training_op from the model?

Tensorflow - can't initialize saved variables unless I recreate the "saver" object. Why?

I'm pretty sure I'm missing something about how tensorflow works because my solution doesn't make any sense.
I'm trying to train a neural network (from scratch, without using Estimators or other abstractions), save it, and load a simplified version of it for inference.
The following code trains but gives me the error: FailedPreconditionError (see above for traceback): Attempting to use uninitialized value hidden0/biases/Variable
[[Node: hidden0/biases/Variable/read = Identity[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](hidden0/biases/Variable)]]. If I add the commented line - if I recreate the saver obect that I'm not going to use nor return - the code works just fine.
Why do I need to create a (useless) saver object in order to restore the saved weights?
import tensorflow as tf
import numpy as np
def add_fc_layer(input_tensor, input_dimensions, output_dimensions, layer_name, activation=None):
with tf.variable_scope(layer_name):
with tf.variable_scope('weights'):
weights = tf.Variable(tf.truncated_normal([input_dimensions, output_dimensions]))
with tf.variable_scope('biases'):
biases = tf.Variable(tf.zeros([output_dimensions]))
with tf.variable_scope('Wx_plus_b'):
preactivate = tf.matmul(input_tensor, weights) + biases
if activation is None:
return preactivate
with tf.variable_scope('activation'):
activations = activation(preactivate)
return activations
def make_network(model_phase):
if model_phase not in {"train", "test"}:
raise ValueError("invalid type")
hidden0_units = 25
hidden1_units = 15
hidden2_units = 10
input_size = 10
output_size = 4
with tf.variable_scope('InputVector'):
inputs = tf.placeholder(shape=[1, input_size], dtype=tf.float32)
hidden0_out = add_fc_layer(inputs, input_size, hidden0_units, "hidden0", activation=tf.nn.sigmoid)
hidden1_out = add_fc_layer(hidden0_out, hidden0_units, hidden1_units, "hidden1", activation=tf.nn.sigmoid)
hidden2_out = add_fc_layer(hidden1_out, hidden1_units, hidden2_units, "hidden2", activation=tf.nn.sigmoid)
out = add_fc_layer(hidden2_out, hidden2_units, output_size, "regression")
if model_phase == "test":
# UNCOMMENTIN THIS LINE MAKES THE SCRIPT WORK
# saver = tf.train.Saver(var_list=tf.trainable_variables())
return inputs, out
saver = tf.train.Saver(var_list=tf.trainable_variables())
with tf.variable_scope('training'):
with tf.variable_scope('groundTruth'):
ground_truth = tf.placeholder(shape=[1, output_size], dtype=tf.float32)
with tf.variable_scope('loss'):
loss = tf.reduce_sum(tf.square(ground_truth - out))
tf.summary.scalar('loss', loss)
with tf.variable_scope('optimizer'):
trainer = tf.train.AdamOptimizer(learning_rate=0.001)
with tf.variable_scope('gradient'):
updateModel = trainer.minimize(loss)
with tf.variable_scope('predict'):
predict = tf.random_shuffle(tf.boolean_mask(out, tf.equal(out, tf.reduce_max(out, axis=None))))[0]
writer = tf.summary.FileWriter('/tmp/test', tf.get_default_graph())
return inputs, out, ground_truth, updateModel, writer, saver
train_graph = tf.Graph()
with tf.Session(graph=train_graph) as sess:
tf.set_random_seed(42)
inputs, out, ground_truth, updateModel, writer, saver = make_network(model_phase='train')
init = tf.initialize_all_variables()
sess.run(init)
print('\nLearning...')
for _ in range(10):
sess.run([updateModel], feed_dict={inputs:np.arange(10)+np.random.random((1,10)), ground_truth:np.arange(4).reshape(1, 4)})
saver.save(sess,'./tensorflowModel.ckpt')
new_graph = tf.Graph()
with tf.Session(graph=new_graph) as sess:
inputs, out = make_network(model_phase='test')
saver = tf.train.import_meta_graph('./tensorflowModel.ckpt.meta')
saver.restore(sess, tf.train.latest_checkpoint('./'))
# evaluation
print('\nEvaluation...')
for _ in range(10):
_ = sess.run(out, feed_dict={inputs:np.arange(10).reshape(1,10)})

I don't know why creating an unused Saver makes the problem go away, but the code betrays a misunderstanding.
When you are restoring, you are creating the model graph twice. First, you call make_network() which creates the computation graph and variables. You then also call import_meta_graph which also creates a graph and variables. You should create a saver with simple saver = tf.train.Saver() instead of saver = tf.train.import_meta_graph('./tensorflowModel.ckpt.meta')

How to return variables from main()?

Brand new to TensorFlow and am trying to modify some examples they give. For instance:
https://github.com/tensorflow/tensorflow/blob/4806cb0646bd21f713722bd97c0d0262c575f7e0/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py
"""Simple MNIST classifier example with JIT XLA and timelines.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import sys
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.python.client import timeline
FLAGS = None
def main(_):
# Import data
mnist = input_data.read_data_sets(FLAGS.data_dir)
# Create the model
x = tf.placeholder(tf.float32, [None, 784])
w = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.matmul(x, w) + b
....
....
....
....
# Test trained model
correct_prediction = tf.equal(tf.argmax(y, 1), y_)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(sess.run(accuracy,
feed_dict={x: mnist.test.images,
y_: mnist.test.labels}))
sess.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--data_dir',
type=str,
default='/tmp/tensorflow/mnist/input_data',
help='Directory for storing input data')
parser.add_argument(
'--xla', type=bool, default=True, help='Turn xla via JIT on')
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
This will print "0.9202" on the commandline. How do I return the value so that I can use it in other functions?
val = tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
print(val)
I get:
[pylint] E1111:Assigning to function call which doesn't return
Also, can't do anything after the function executes. If I try to print("this string") then the program exits before printing that.
EDIT:
Answers so far give the same error:
[pylint] E1111:Assigning to function call which doesn't return
I have looked through a lot of the examples for TF but cannot find an example of how to return the value rather than printing it to the console.

Have a look here How does tf.app.run() work?.
essentially tf.app.run is a wrapper which calls a main with some arguments. Either you change the print statment to assign the result to a variable and return it or within main you call your own funtions to write some where
...
result = (sess.run(accuracy,
feed_dict={x: mnist.test.images,
y_: ...
sess.close()
return result
Maybe the full code of the tutorial makes this clearer:
https://github.com/tensorflow/tensorflow/blob/r1.8/tensorflow/examples/tutorials/layers/cnn_mnist.py

You first gess was right, add val = to your function call. But you should also return a value from your function.
Replace:
print(sess.run(accuracy,
feed_dict={x: mnist.test.images,
y_: mnist.test.labels}))
by:
return sess.run(accuracy,
feed_dict={x: mnist.test.images,
y_: mnist.test.labels})

Running a basic distributed MNIST solver in TensorFlow

I'm trying to learn a model to predict MNIST classes in distributed TensorFlow. I've read the main distributed TensorFlow page, but I don't understand what I run to create a distributed TensorFlow model.
I'm just using a linear classifier for the moment, based on the code here.
How do I run this model? The link I got the code from says that this command should be run in the terminal:
python dist_minst_softmax.py
--ps_hosts=localhost:2222,localhost:2223
--worker_hosts=localhost:2224,localhost:2225
--job_name=worker --task_index=1
If I run this in the terminal, I get the following messages:
2018-04-23 11:02:35.034319: I tensorflow/core/distributed_runtime/master.cc:221] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0
2018-04-23 11:02:35.034375: I tensorflow/core/distributed_runtime/master.cc:221] CreateSession still waiting for response from worker: /job:worker/replica:0/task:0
This message just repeats indefinitely. So how do I start the training process?
For reference, the model is defined as follows:
import argparse
import sys
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
FLAGS = None
def main(_):
ps_hosts = FLAGS.ps_hosts.split(",")
worker_hosts = FLAGS.worker_hosts.split(",")
cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
server = tf.train.Server(cluster,
job_name=FLAGS.job_name,
task_index=FLAGS.task_index)
if FLAGS.job_name == "ps":
server.join()
elif FLAGS.job_name == "worker":
with tf.device(tf.train.replica_device_setter(
worker_device="/job:worker/task:%d" % FLAGS.task_index,
cluster=cluster)):
global_step = tf.contrib.framework.get_or_create_global_step()
with tf.name_scope("input"):
mnist = input_data.read_data_sets("./input_data", one_hot=True)
x = tf.placeholder(tf.float32, [None, 784], name="x-input")
y_ = tf.placeholder(tf.float32, [None, 10], name="y-input")
tf.set_random_seed(1)
with tf.name_scope("weights"):
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
with tf.name_scope("model"):
y = tf.matmul(x, W) + b
with tf.name_scope("cross_entropy"):
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
with tf.name_scope("train"):
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
with tf.name_scope("acc"):
init_op = tf.initialize_all_variables()
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
global_step=global_step,
init_op=init_op)
with sv.prepare_or_wait_for_session(server.target) as sess:
for _ in range(100):
batch_xs, batch_ys = mnist.train.next_batch(100)
sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
# Flags for defining the tf.train.ClusterSpec
parser.add_argument(
"--ps_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs"
)
parser.add_argument(
"--worker_hosts",
type=str,
default="",
help="Comma-separated list of hostname:port pairs"
)
parser.add_argument(
"--job_name",
type=str,
default="",
help="One of 'ps', 'worker'"
)
# Flags for defining the tf.train.Server
parser.add_argument(
"--task_index",
type=int,
default=0,
help="Index of task within the job"
)
FLAGS, unparsed = parser.parse_known_args()
print(FLAGS, unparsed)
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

You should first initialise your ps_server, and then launch your worker. Example with one ps and one worker:
python dist_minst_softmax.py
--ps_hosts=localhost:2222
--worker_hosts=localhost:2223
--job_name=ps --task_index=0
python dist_minst_softmax.py
--ps_hosts=localhost:2222
--worker_hosts=localhost:2223
--job_name=worker --task_index=0
I couldn't run the example code you gave me, since my computer doesn't have BLAS configured, but at least it tried to perform some operations...

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

MonitoredTrainingSession save and restore model - python

Related

I don't know how to do this at ubuntu tensorflow-gpu, "E tensorflow / core / util / events_writer.cc: 104]

training operation not getting loaded after restoring model Tensorflow

Tensorflow - can't initialize saved variables unless I recreate the "saver" object. Why?

How to return variables from main()?

Running a basic distributed MNIST solver in TensorFlow

Categories

Resources