Running a basic distributed MNIST solver in TensorFlow - python

I'm trying to train a model to predict MNIST classes with distributed TensorFlow. I've read the main distributed TensorFlow page, but I don't understand what I need to run to create a distributed TensorFlow model.
I'm just using a linear classifier for the moment, based on the code here.
How do I run this model? The link I got the code from says that this command should be run in the terminal:
python dist_minst_softmax.py \
    --ps_hosts=localhost:2222,localhost:2223 \
    --worker_hosts=localhost:2224,localhost:2225 \
    --job_name=worker --task_index=1
If I run this in the terminal, I get the following messages:
2018-04-23 11:02:35.034319: I tensorflow/core/distributed_runtime/master.cc:221] CreateSession still waiting for response from worker: /job:ps/replica:0/task:0
2018-04-23 11:02:35.034375: I tensorflow/core/distributed_runtime/master.cc:221] CreateSession still waiting for response from worker: /job:worker/replica:0/task:0
This message just repeats indefinitely. So how do I start the training process?
For reference, the model is defined as follows:
import argparse
import sys
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
FLAGS = None
def main(_):
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")

    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)

    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % FLAGS.task_index,
                cluster=cluster)):

            global_step = tf.contrib.framework.get_or_create_global_step()

            with tf.name_scope("input"):
                mnist = input_data.read_data_sets("./input_data", one_hot=True)
                x = tf.placeholder(tf.float32, [None, 784], name="x-input")
                y_ = tf.placeholder(tf.float32, [None, 10], name="y-input")

            tf.set_random_seed(1)
            with tf.name_scope("weights"):
                W = tf.Variable(tf.zeros([784, 10]))
                b = tf.Variable(tf.zeros([10]))

            with tf.name_scope("model"):
                y = tf.matmul(x, W) + b

            with tf.name_scope("cross_entropy"):
                cross_entropy = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

            with tf.name_scope("train"):
                train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

            with tf.name_scope("acc"):
                init_op = tf.initialize_all_variables()
                correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
                accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                                 global_step=global_step,
                                 init_op=init_op)

        with sv.prepare_or_wait_for_session(server.target) as sess:
            for _ in range(100):
                batch_xs, batch_ys = mnist.train.next_batch(100)
                sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
            print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.register("type", "bool", lambda v: v.lower() == "true")
    # Flags for defining the tf.train.ClusterSpec
    parser.add_argument(
        "--ps_hosts",
        type=str,
        default="",
        help="Comma-separated list of hostname:port pairs"
    )
    parser.add_argument(
        "--worker_hosts",
        type=str,
        default="",
        help="Comma-separated list of hostname:port pairs"
    )
    parser.add_argument(
        "--job_name",
        type=str,
        default="",
        help="One of 'ps', 'worker'"
    )
    # Flags for defining the tf.train.Server
    parser.add_argument(
        "--task_index",
        type=int,
        default=0,
        help="Index of task within the job"
    )
    FLAGS, unparsed = parser.parse_known_args()
    print(FLAGS, unparsed)
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

You should first start your ps server, and then launch your worker; run each command in its own terminal. Example with one ps and one worker:
python dist_minst_softmax.py \
    --ps_hosts=localhost:2222 \
    --worker_hosts=localhost:2223 \
    --job_name=ps --task_index=0

python dist_minst_softmax.py \
    --ps_hosts=localhost:2222 \
    --worker_hosts=localhost:2223 \
    --job_name=worker --task_index=0
I couldn't run the example code you gave me, since my computer doesn't have BLAS configured, but at least it tried to perform some operations...
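For completeness: the "CreateSession still waiting for response" messages from the original command just mean the worker blocks until every task listed in --ps_hosts and --worker_hosts is actually running. With the original two-ps/two-worker flags you would therefore need four processes, roughly like this (one terminal each; only --job_name and --task_index change):
python dist_minst_softmax.py --ps_hosts=localhost:2222,localhost:2223 \
    --worker_hosts=localhost:2224,localhost:2225 --job_name=ps --task_index=0
python dist_minst_softmax.py --ps_hosts=localhost:2222,localhost:2223 \
    --worker_hosts=localhost:2224,localhost:2225 --job_name=ps --task_index=1
python dist_minst_softmax.py --ps_hosts=localhost:2222,localhost:2223 \
    --worker_hosts=localhost:2224,localhost:2225 --job_name=worker --task_index=0
python dist_minst_softmax.py --ps_hosts=localhost:2222,localhost:2223 \
    --worker_hosts=localhost:2224,localhost:2225 --job_name=worker --task_index=1
Once the last task is up, the waiting messages should stop and training should begin.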

Related

How to restore two graphs in Tensorflow?

I have an LSTM training in tensorflow and the whole session is saved with saver = tf.train.Saver().
The whole code is shown below.
def LSTM_RNN(_X, _weights, _biases):
# model architecture based on "guillaume-chevalier" and "aymericdamien" under the MIT license.
_X = tf.transpose(_X, [1, 0, 2]) # permute n_steps and batch_size
_X = tf.reshape(_X, [-1, n_input])
# Rectifies Linear Unit activation function used
_X = tf.nn.relu(tf.matmul(_X, _weights['hidden']) + _biases['hidden'])
# Split data because rnn cell needs a list of inputs for the RNN inner loop
_X = tf.split(_X, n_steps, 0)
# Define two stacked LSTM cells (two recurrent layers deep) with tensorflow
lstm_cell_1 = tf.contrib.rnn.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
lstm_cell_2 = tf.contrib.rnn.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
lstm_cells = tf.contrib.rnn.MultiRNNCell([lstm_cell_1, lstm_cell_2], state_is_tuple=True)
outputs, states = tf.contrib.rnn.static_rnn(lstm_cells, _X, dtype=tf.float32)
# A single output is produced, in style of "many to one" classifier, refer to http://karpathy.github.io/2015/05/21/rnn-effectiveness/ for details
lstm_last_output = outputs[-1]
# Linear activation
return tf.matmul(lstm_last_output, _weights['out']) + _biases['out']
# Graph input/output
x = tf.placeholder(tf.float32, [None, n_steps, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])
# Graph weights
weights = {
'hidden': tf.Variable(tf.random_normal([n_input, n_hidden])), # Hidden layer weights
'out': tf.Variable(tf.random_normal([n_hidden, n_classes], mean=1.0))
}
biases = {
'hidden': tf.Variable(tf.random_normal([n_hidden])),
'out': tf.Variable(tf.random_normal([n_classes]))
}
pred = LSTM_RNN(x, weights, biases)
# Loss, optimizer and evaluation
l2 = lambda_loss_amount * sum(
tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables()
) # L2 loss prevents this overkill neural network to overfit the data
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=pred)) + l2 # Softmax loss
if decaying_learning_rate:
learning_rate = tf.train.exponential_decay(init_learning_rate, global_step*batch_size, decay_steps, decay_rate, staircase=True)
#decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps) #exponentially decayed learning rate
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost,global_step=global_step) # Adam Optimizer
while step * batch_size <= training_iters:
#print (sess.run(learning_rate)) #decaying learning rate
#print (sess.run(global_step)) # global number of iterations
if len(unsampled_indices) < batch_size:
unsampled_indices = list(range(0,len(X_train)))
batch_xs, raw_labels, unsampled_indicies = extract_batch_size(X_train, y_train, unsampled_indices, batch_size)
batch_ys = one_hot(raw_labels)
# check that encoded output is same length as num_classes, if not, pad it
if len(batch_ys[0]) < n_classes:
temp_ys = np.zeros((batch_size, n_classes))
temp_ys[:batch_ys.shape[0],:batch_ys.shape[1]] = batch_ys
batch_ys = temp_ys
# Fit training using batch data
_, loss, acc = sess.run(
[optimizer, cost, accuracy],
feed_dict={
x: batch_xs,
y: batch_ys
}
)
train_losses.append(loss)
train_accuracies.append(acc)
# Evaluate network only at some steps for faster training:
if (step*batch_size % display_iter == 0) or (step == 1) or (step * batch_size > training_iters):
# To not spam console, show training accuracy/loss in this "if"
print("Iter #" + str(step*batch_size) + ": Learning rate = " + "{:.6f}".format(sess.run(learning_rate)) + ": Batch Loss = " + "{:.6f}".format(loss) + ", Accuracy = {}".format(acc))
# Evaluation on the test set (no learning made here - just evaluation for diagnosis)
loss, acc = sess.run([cost, accuracy], feed_dict={x: X_test,y: one_hot(y_test)})
test_losses.append(loss)
test_accuracies.append(acc)
print("PERFORMANCE ON TEST SET: " + "Batch Loss = {}".format(loss) + ", Accuracy = {}".format(acc))
step += 1
print("Optimization Finished!")
save_path = saver.save(sess, "ActivityTrainedModels/model.ckpt")
Then I restore the model for deployment. At that point I need to use it together with another model, a human pose estimator, which is loaded with get_graph_path().
I can't load both; I can only load one or the other. If I load both, I get this error:
NotFoundError (see above for traceback): Restoring from checkpoint failed. This is most likely due to a Variable name or other graph key that is missing from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:
Key smoothing/gauss_weight not found in checkpoint
[[node save/RestoreV2 (defined at ActivityDetection.py:219) = RestoreV2[dtypes=[DT_INT32, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save/Const_0_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]
My deployment code is as follows.
n_steps = 32 # 32 timesteps per series
n_input = 36 # num input parameters per timestep
n_hidden = 34 # Hidden layer num of features
n_classes = 3
global_step = tf.Variable(0, trainable=False)
# Graph input/output
x = tf.placeholder(tf.float32, [None, n_steps, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])
# Graph weights
weights = {
'hidden': tf.Variable(tf.random_normal([n_input, n_hidden])), # Hidden layer weights
'out': tf.Variable(tf.random_normal([n_hidden, n_classes], mean=1.0))
}
biases = {
'hidden': tf.Variable(tf.random_normal([n_hidden])),
'out': tf.Variable(tf.random_normal([n_classes]))
}
pred = LSTM_RNN(x, weights, biases)
sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True))
init = tf.global_variables_initializer()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Activity Recognition')
parser.add_argument('--video', type=str, default='../../tf-openpose/TestVideos/2019_01-Feb Recording/C4-13.mp4')
parser.add_argument('--resolution', type=str, default='640x360', help='network input resolution. default=432x368')
parser.add_argument('--model', type=str, default='mobilenet_thin', help='cmu / mobilenet_thin')
parser.add_argument('--resize', type=str, default='0x0',
help='if provided, resize images before they are processed. default=0x0, Recommends : 432x368 or 656x368 or 1312x736 ')
parser.add_argument('--resize-out-ratio', type=float, default=4.0,
help='if provided, resize heatmaps before they are post-processed. default=1.0')
parser.add_argument('--show-process', type=bool, default=False,
help='for debug purpose, if enabled, speed for inference is dropped.')
parser.add_argument('--showBG', type=bool, default=True, help='False to show skeleton only.')
parser.add_argument('--s', type=str, default='00:00', help='start time to crop')
parser.add_argument('--e', type=str, default='00:00', help='end time to crop')
args = parser.parse_args()
w, h = model_wh(args.resize)
if w > 0 and h > 0:
e = TfPoseEstimator(get_graph_path(args.model), target_size=(w, h))
else:
e = TfPoseEstimator(get_graph_path(args.model), target_size=(img_w, img_h))
'''with tf.Session() as sess:
sess.run(init)
saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint('/home/coie/venvp3/HumanActivityRecognition/HumanActivityRecognition/ActivityTrainedModels/'))
print("Model restored.")
all_vars = tf.trainable_variables()
for i in range(len(all_vars)):
name = all_vars[i].name
values = sess.run(name)
print('name', name)
#print('value', values)
print('shape',values.shape)'''
#result = sess.run(pred, feed_dict={x: X_test[24:27]})
#for r in range(len(result)):
#print("predicted activity:", LABELS[result[r].argmax(0)])
If I load the TfPoseEstimator, I can't restore the LSTM model.
How can I solve this problem?
I made a separate class for the LSTM, and the LSTM graph is loaded with its own sess inside that class. The main python script then has another sess with the default graph, and that default graph loads the pose estimator graph.
My LSTM class is defined as
class ActivityRecognition:
#Utility functions for training:
def LSTM_RNN(self,_X, _weights, _biases):
# model architecture based on "guillaume-chevalier" and "aymericdamien" under the MIT license.
_X = tf.transpose(_X, [1, 0, 2]) # permute n_steps and batch_size
_X = tf.reshape(_X, [-1, self.n_input])
# Rectifies Linear Unit activation function used
_X = tf.nn.relu(tf.matmul(_X, _weights['hidden']) + _biases['hidden'])
# Split data because rnn cell needs a list of inputs for the RNN inner loop
_X = tf.split(_X, self.n_steps, 0)
# Define two stacked LSTM cells (two recurrent layers deep) with tensorflow
lstm_cell_1 = tf.contrib.rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0, state_is_tuple=True)
lstm_cell_2 = tf.contrib.rnn.BasicLSTMCell(self.n_hidden, forget_bias=1.0, state_is_tuple=True)
lstm_cells = tf.contrib.rnn.MultiRNNCell([lstm_cell_1, lstm_cell_2], state_is_tuple=True)
outputs, states = tf.contrib.rnn.static_rnn(lstm_cells, _X, dtype=tf.float32)
lstm_last_output = outputs[-1]
return tf.matmul(lstm_last_output, _weights['out']) + _biases['out']
def __init__(self):
self.n_steps = 32 # 32 timesteps per series
self.n_input = 36 # num input parameters per timestep
self.n_hidden = 34 # Hidden layer num of features
self.n_classes = 3
self.global_step = tf.Variable(0, trainable=False)
# Graph input/output
self.x = tf.placeholder(tf.float32, [None, self.n_steps, self.n_input])
self.y = tf.placeholder(tf.float32, [None, self.n_classes])
# Graph weights
self.weights = {
'hidden': tf.Variable(tf.random_normal([self.n_input, self.n_hidden])), # Hidden layer weights
'out': tf.Variable(tf.random_normal([self.n_hidden, self.n_classes], mean=1.0))
}
self.biases = {
'hidden': tf.Variable(tf.random_normal([self.n_hidden])),
'out': tf.Variable(tf.random_normal([self.n_classes]))
}
self.pred = self.LSTM_RNN(self.x, self.weights, self.biases)
self.sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True))
self.init = tf.global_variables_initializer()
with tf.Session() as sess:
self.sess.run(self.init)
saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint('ActivityTrainedModels/'))
print("Model restored.")
#all_vars = tf.trainable_variables()
#for i in range(len(all_vars)):
#name = all_vars[i].name
#values = sess.run(name)
#print('name', name)
#print('value', values)
#print('shape',values.shape)
def inference(self,test):
result = self.sess.run(self.pred, feed_dict={self.x: test})
for r in range(len(result)):
activity=LABELS[result[r].argmax(0)]
return activity
if __name__ == "__main__":
ActivityRecognition()
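One thing to double-check in the class above: __init__ restores the checkpoint inside a temporary with tf.Session() as sess: block, so the weights land in that throwaway session rather than in self.sess, which inference() uses later. A minimal sketch of the usual TF 1.x pattern (my own suggestion, not from the original post) is to build the LSTM inside its own tf.Graph, so its Saver only ever sees the LSTM variables and cannot clash with the pose estimator's smoothing/gauss_weight keys:
class ActivityRecognition:
    def __init__(self, checkpoint_dir='ActivityTrainedModels/'):
        # Keep everything that belongs to the LSTM in a private graph.
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.x = tf.placeholder(tf.float32, [None, 32, 36])
            # ... weights, biases and self.pred = self.LSTM_RNN(self.x, ...) exactly as above ...
            self.saver = tf.train.Saver()  # sees only the LSTM variables
        # Bind the session to the private graph and restore into *this* session.
        self.sess = tf.Session(graph=self.graph)
        self.saver.restore(self.sess, tf.train.latest_checkpoint(checkpoint_dir))

    def inference(self, test):
        return self.sess.run(self.pred, feed_dict={self.x: test})
The TfPoseEstimator can then be created in the main script's default graph as before, and the two savers never see each other's variables.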

How to return variables from main()?

Brand new to TensorFlow and am trying to modify some examples they give. For instance:
https://github.com/tensorflow/tensorflow/blob/4806cb0646bd21f713722bd97c0d0262c575f7e0/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py
"""Simple MNIST classifier example with JIT XLA and timelines.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import sys
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.python.client import timeline
FLAGS = None
def main(_):
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir)

    # Create the model
    x = tf.placeholder(tf.float32, [None, 784])
    w = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, w) + b

    ....
    ....
    ....
    ....

    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), y_)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(sess.run(accuracy,
                   feed_dict={x: mnist.test.images,
                              y_: mnist.test.labels}))
    sess.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--data_dir',
        type=str,
        default='/tmp/tensorflow/mnist/input_data',
        help='Directory for storing input data')
    parser.add_argument(
        '--xla', type=bool, default=True, help='Turn xla via JIT on')
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
This will print "0.9202" on the command line. How do I return the value so that I can use it in other functions?
val = tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
print(val)
I get:
[pylint] E1111:Assigning to function call which doesn't return
Also, I can't do anything after the function executes. If I try to print("this string") afterwards, the program exits before printing it.
EDIT:
Answers so far give the same error:
[pylint] E1111:Assigning to function call which doesn't return
I have looked through a lot of the examples for TF but cannot find an example of how to return the value rather than printing it to the console.
Have a look here: How does tf.app.run() work?
Essentially, tf.app.run is a wrapper which calls main with some arguments. Either you change the print statement so that the result is assigned to a variable and returned, or within main you call your own functions to write the result somewhere:
...
result = (sess.run(accuracy,
feed_dict={x: mnist.test.images,
y_: ...
sess.close()
return result
Maybe the full code of the tutorial makes this clearer:
https://github.com/tensorflow/tensorflow/blob/r1.8/tensorflow/examples/tutorials/layers/cnn_mnist.py
Your first guess was right: add val = to your function call. But you should also return a value from your function.
Replace:
print(sess.run(accuracy,
feed_dict={x: mnist.test.images,
y_: mnist.test.labels}))
by:
return sess.run(accuracy,
feed_dict={x: mnist.test.images,
y_: mnist.test.labels})
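Note that even with the return in place, val = tf.app.run(...) still won't give you the value: in TF 1.x, tf.app.run() passes main's return value straight to sys.exit(), which is also why nothing after the call gets executed. A small sketch (assuming main now returns the accuracy as shown above) is to skip the wrapper and call main directly:
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--data_dir',
        type=str,
        default='/tmp/tensorflow/mnist/input_data',
        help='Directory for storing input data')
    FLAGS, unparsed = parser.parse_known_args()
    val = main([sys.argv[0]] + unparsed)  # call main directly instead of tf.app.run
    print(val)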

Tensorflow - how to import MNIST database

I want to train a model using the MNIST database. I'm working through the Tensorflow tutorial. The suggested way to import the database is mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True), but I need to use something like:
with open('my/directory/train-images-idx3-ubyte.gz', 'rb') as f:
    train_images = extract_images(f)
with open('my/directory/train-labels-idx1-ubyte.gz', 'rb') as f:
    train_labels = extract_labels(f)
...
That raises the question of how to adjust the code to work with my train_images, train_labels, test_images, test_labels:
def main(_):
# Import data
mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
# Create the model
x = tf.placeholder(tf.float32, [None, 784])
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, 10])
# Build the graph for the deep net
y_conv, keep_prob = deepnn(x)
with tf.name_scope('loss'):
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_,
logits=y_conv)
cross_entropy = tf.reduce_mean(cross_entropy)
with tf.name_scope('adam_optimizer'):
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
with tf.name_scope('accuracy'):
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
correct_prediction = tf.cast(correct_prediction, tf.float32)
accuracy = tf.reduce_mean(correct_prediction)
graph_location = tempfile.mkdtemp()
print('Saving graph to: %s' % graph_location)
train_writer = tf.summary.FileWriter(graph_location)
train_writer.add_graph(tf.get_default_graph())
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(20000):
batch = mnist.train.next_batch(50)
if i % 100 == 0:
train_accuracy = accuracy.eval(feed_dict={
x: batch[0], y_: batch[1], keep_prob: 1.0})
print('step %d, training accuracy %g' % (i, train_accuracy))
train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
print('test accuracy %g' % accuracy.eval(feed_dict={
x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', type=str,
default='/tmp/tensorflow/mnist/input_data',
help='Directory for storing input data')
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
A recommended way to import the mnist dataset with TF2 is the following:
from tensorflow.keras.datasets import mnist
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()
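If you want to keep the placeholder/feed_dict training loop from the question, the arrays returned by mnist.load_data() just need to be flattened, scaled, one-hot encoded, and batched by hand. A minimal sketch (the next_batch helper and the exact preprocessing choices are mine, not part of the tutorial):
import numpy as np
from tensorflow.keras.datasets import mnist

(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

# Flatten the 28x28 images to 784-vectors in [0, 1] and one-hot encode the labels.
X_train = X_train.reshape(-1, 784).astype('float32') / 255.0
X_test = X_test.reshape(-1, 784).astype('float32') / 255.0
Y_train = np.eye(10)[Y_train]
Y_test = np.eye(10)[Y_test]

def next_batch(images, labels, batch_size=50):
    # Sample a random batch, mirroring mnist.train.next_batch(batch_size).
    idx = np.random.choice(len(images), batch_size, replace=False)
    return images[idx], labels[idx]
In the training loop, batch = next_batch(X_train, Y_train) can then be fed as x: batch[0], y_: batch[1] exactly as before.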

MonitoredTrainingSession save and restore model

I'm trying to extend the example outlined at https://www.tensorflow.org/deploy/distributed, but I'm having trouble saving the model. I'm running this in the docker container available at gcr.io/tensorflow/tensorflow:1.5.0-gpu-py3. I started two processes, one for 'ps' and one for 'worker'; the ps process is simply this code:
import tensorflow as tf

def main(_):
    cluster = tf.train.ClusterSpec({"ps": ["localhost:2222"], "worker": ["localhost:2223"]})
    server = tf.train.Server(cluster, job_name="ps", task_index=0)
    server.join()

if __name__ == "__main__":
    tf.app.run()
The worker code is the following and is based on the mnist examples and the distributed article above:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

data_dir = "/data"
checkpoint_dir = "/tmp/train_logs"

def main(_):
    cluster = tf.train.ClusterSpec({"ps": ["localhost:2222"], "worker": ["localhost:2223"]})
    server = tf.train.Server(cluster, job_name="worker", task_index=0)
    mnist = input_data.read_data_sets(data_dir, one_hot=True)

    with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:0", cluster=cluster)):
        x = tf.placeholder(tf.float32, [None, 784], name="x_input")
        W = tf.Variable(tf.zeros([784, 10]))
        b = tf.Variable(tf.zeros([10]))
        y = tf.placeholder(tf.float32, [None, 10])
        model = tf.matmul(x, W) + b
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=model))
        global_step = tf.train.get_or_create_global_step()
        train_op = tf.train.GradientDescentOptimizer(0.5).minimize(cost, global_step=global_step)
        prediction = tf.equal(tf.argmax(model, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(prediction, tf.float32))

    hooks = [tf.train.StopAtStepHook(last_step=101)]
    with tf.train.MonitoredTrainingSession(master=server.target, is_chief=True,
                                           checkpoint_dir=checkpoint_dir, hooks=hooks) as sess:
        while not sess.should_stop():
            batch_xs, batch_ys = mnist.train.next_batch(1000)
            sess.run(train_op, feed_dict={x: batch_xs, y: batch_ys})

    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    #saver = tf.train.Saver()
    saver = tf.train.import_meta_graph(latest_checkpoint + ".meta", clear_devices=True)
    with tf.Session() as sess:
        saver.restore(sess, latest_checkpoint)  # "/tmp/train_logs/model.ckpt"
        acc = sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels})
        print("Test accuracy = " + "{:5f}".format(acc))

if __name__ == "__main__":
    tf.app.run()
The examples I've found all seem to end without showing how to use the model. The above code fails on the saver.restore() line with the following error:
InvalidArgumentError (see above for traceback): Cannot assign a device for operation 'save/RestoreV2_2':
Operation was explicitly assigned to /job:ps/task:0/device:CPU:0
but available devices are [ /job:localhost/replica:0/task:0/device:CPU:0, /job:localhost/replica:0/task:0/device:GPU:0 ].
Make sure the device specification refers to a valid device.
Also, as shown above I tried both saver = tf.train.Saver() and saver = tf.train.import_meta_graph(latest_checkpoint+".meta", clear_devices=True) with no success. Same error is shown in either case.
I don't really understand the with tf.device(...): statement. In one iteration I commented out this line (and unindented the statements below it) and the code ran without errors. But I think this is not correct and would like to understand the correct way for this to work.
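For what it's worth, the device error happens because the training graph still carries the /job:ps placements added by replica_device_setter, and the plain tf.Session() at the end has no ps task to map them to. A sketch that usually avoids this (my own suggestion, not from the question) is to run the evaluation in a fresh graph rebuilt from the .meta file with devices cleared; the tensor names below are assumptions, since only x_input was given a name in the worker code:
def evaluate(checkpoint_dir, mnist):
    # Rebuild the graph from the checkpoint's .meta file, dropping device placements.
    tf.reset_default_graph()
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    saver = tf.train.import_meta_graph(latest_checkpoint + ".meta", clear_devices=True)
    with tf.Session() as sess:
        saver.restore(sess, latest_checkpoint)
        g = tf.get_default_graph()
        x = g.get_tensor_by_name("x_input:0")          # named in the worker code above
        y = g.get_tensor_by_name("y_input:0")          # hypothetical: y was not named above
        accuracy = g.get_tensor_by_name("accuracy:0")  # hypothetical: accuracy was not named above
        print(sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels}))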

train two separate tensorflow models at once

I am new to tensorflow. I am trying to train two tensorflow models which are connected by a dot-product output layer. The inputs are two 2048-element float vectors.
When I run the script I always get this error:
Traceback (most recent call last):
File "classifier.py", line 120, in
_, summary = sess.run([optimizer, merged], feed_dict={x1: batch_x1s, x2: batch_x2s})
File "/Users/Joachim/work/tensorflow/virtualenv/tensorflow/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 789, in run
run_metadata_ptr)
File "/Users/Joachim/work/tensorflow/virtualenv/tensorflow/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 968, in _run
np_val = np.asarray(subfeed_val, dtype=subfeed_dtype)
File "/Users/Joachim/work/tensorflow/virtualenv/tensorflow/lib/python3.6/site-packages/numpy/core/numeric.py", line 531, in asarray
return array(a, dtype, copy=False, order=order)
ValueError: setting an array element with a sequence.
Here is my code:
import tensorflow as tf
import sys
import math
import os
import numpy as np
import json
import argparse
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from tensorflow.python.platform import gfile
from progress.bar import Bar
bottleneck_dir = 'bottlenecks'
### LOAD DATA FROM BOTTLENECKS
data_inputs = []
data_labels = []
data_expected_result=[]
bottleneck_list = []
file_glob = os.path.join(bottleneck_dir, '*.txt')
bottleneck_list.extend(gfile.Glob(file_glob))
for bottleneck_file in bottleneck_list:
bottleneck = open(bottleneck_file)
bottleneck_string = bottleneck.read()
bottleneck_values = [float(x) for x in bottleneck_string.split(',')]
imageName=bottleneck_file.split('.')[0]
helper=False
for i in range(len(data_labels)):
if imageName==data_labels[i]:
if 'search' in bottleneck_file:
data_inputs[i][0]=bottleneck_values
else:
data_inputs[i][1]=bottleneck_values
helper=True
if helper!=True:
if 'search' in bottleneck_file:
data_inputs.append([bottleneck_values,[]])
else:
data_inputs.append([[],bottleneck_values])
data_expected_result.append(1);
data_inputs_x1 = [i[0] for i in data_inputs]
data_inputs_x2 = [i[1] for i in data_inputs]
# Setting hyperparameters
learning_rate = 0.01
batch_size = 4
epochs = 1
log_batch_step = 50
n_features = np.size(data_inputs, 1)
tf.reset_default_graph()
graph = tf.get_default_graph()
inputVectorSize=2048
outputVectorSize=2048
x1 = tf.placeholder(tf.float32, [None, inputVectorSize], name='x1')#input layer
x2 = tf.placeholder(tf.float32, [None, inputVectorSize], name='x2')#input layer
dense1 = tf.layers.dense(inputs=x1, units=inputVectorSize, activation=tf.nn.relu)
logits1 = tf.layers.dense(inputs=dense1, units=outputVectorSize, activation=tf.nn.relu)
logits1_normalized=tf.nn.softmax(logits1)
dense2 = tf.layers.dense(inputs=x2, units=inputVectorSize, activation=tf.nn.relu)
logits2 = tf.layers.dense(inputs=dense2, units=outputVectorSize, activation=tf.nn.relu)
logits2_normalized=tf.nn.softmax(logits2)
output = tf.reduce_sum( tf.multiply( logits1_normalized, logits2_normalized), 1, keep_dims=True )
# Defining loss of network
loss = data_expected_result-output
tf.summary.scalar('loss', loss)
# Setting optimiser
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
# Define accuracy
accuracy = loss
tf.summary.scalar('accuracy', accuracy)
# For saving checkpoint after training
saver = tf.train.Saver()
merged = tf.summary.merge_all()
# use in command line: tensorboard --logdir=path/to/log --> to view tensorboard
# Run tensorflow session
with tf.Session() as sess:
init = tf.global_variables_initializer()
sess.run(init)
train_writer = tf.summary.FileWriter('log', sess.graph)
tf.train.write_graph(sess.graph_def, '', 'savedgraph.pbtxt', as_text=False)
# Running the training in batches
batch_count = int(math.ceil(len(data_inputs)/batch_size))
for epoch_i in range(epochs):
batches_pbar = tqdm(range(batch_count), desc='Epoch {:>2}/{}'.format(epoch_i+1, epochs), unit='batches')
# The training cycle
for batch_i in batches_pbar:
# Get a batch of training features and labels
batch_start = batch_i*batch_size
batch_x1s = data_inputs_x1[batch_start:batch_start + batch_size]
batch_x2s = data_inputs_x2[batch_start:batch_start + batch_size]
# Run optimizer
_, summary = sess.run([optimizer, merged], feed_dict={x1: batch_x1s, x2: batch_x2s})
train_writer.add_summary(summary, batch_i)
# Check accuracy against validation data
val_accuracy, val_loss = sess.run([accuracy, loss], feed_dict={x1: data_inputs_x1[0:len(data_inputs-1)], x2: data_inputs_x2[0:len(data_inputs-1)]})
print("After epoch {}, Loss: {}, Accuracy: {}".format(epoch_i+1, val_loss, val_accuracy))
test_accuracy, test_loss = sess.run([accuracy, loss], feed_dict={x1: data_inputs_x1[0:len(data_inputs-1)], x2: data_inputs_x2[0:len(data_inputs-1)]})
print ("TEST LOSS: {}, TEST ACCURACY: {}".format(test_loss, test_accuracy))
g = tf.get_default_graph()
saver.save(sess, 'savedgraph')
Can anyone show me what to do to fix the problem?
You need to feed arrays, not lists. Change the lines where a list is used as the feed_dict input:
batch_x1s = np.asarray(data_inputs_x1[batch_start:batch_start + batch_size])
batch_x2s = np.asarray(data_inputs_x2[batch_start:batch_start + batch_size])
...
test_accuracy, test_loss = sess.run([accuracy, loss], feed_dict=
    {x1: np.asarray(data_inputs_x1[0:len(data_inputs)-1]),
     x2: np.asarray(data_inputs_x2[0:len(data_inputs)-1])})
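Note that np.asarray() only helps if every row actually holds a full 2048-value vector; if some entries are still the empty lists appended when only one half of a pair was seen, the result is a ragged object array and the same ValueError appears. A quick sanity check along these lines (a sketch using the names from the script):
# Flag any bottleneck vectors that do not have the expected length.
for name, column in (('x1', data_inputs_x1), ('x2', data_inputs_x2)):
    bad = [i for i, v in enumerate(column) if len(v) != inputVectorSize]
    if bad:
        print('incomplete %s rows at indices: %s' % (name, bad))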
I found the problem; it was an issue with the input data.
import tensorflow as tf
import sys
import math
import os
import numpy as np
import json
import argparse
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from tensorflow.python.platform import gfile
from progress.bar import Bar
bottleneck_dir = 'bottlenecks'
### LOAD DATA FROM BOTTLENECKS
data_inputs = []
data_labels = []
data_expected_result=[]
bottleneck_list = []
file_glob = os.path.join(bottleneck_dir, '*.txt')
bottleneck_list.extend(gfile.Glob(file_glob))
for bottleneck_file in bottleneck_list:
bottleneck = open(bottleneck_file)
bottleneck_string = bottleneck.read()
bottleneck_values = [float(x) for x in bottleneck_string.split(',')]
imageName=bottleneck_file.split('.')[0]
helper=False
for i in range(len(data_labels)):
if imageName==data_labels[i]:
if 'search' in bottleneck_file:
data_inputs[i][0]=np.asarray(bottleneck_values)
else:
data_inputs[i][1]=np.asarray(bottleneck_values)
helper=True
if helper!=True:
if 'search' in bottleneck_file:
data_inputs.append([bottleneck_values,[]])
else:
data_inputs.append([[],bottleneck_values])
data_expected_result.append(1);
data_labels.append(imageName);
data_inputs_x1 = [i[0] for i in data_inputs]
data_inputs_x2 = [i[1] for i in data_inputs]
for i in range(len(data_inputs_x2)):
print(len(data_inputs_x2[i]))
# Setting hyperparameters
learning_rate = 0.01
batch_size = 4
epochs = 1
log_batch_step = 50
n_features = np.size(data_inputs, 1)
tf.reset_default_graph()
graph = tf.get_default_graph()
inputVectorSize=2048
outputVectorSize=2048
x1 = tf.placeholder(tf.float32, [None, inputVectorSize], name='x1')#input layer
x2 = tf.placeholder(tf.float32, [None, inputVectorSize], name='x2')#input layer
dense1 = tf.layers.dense(inputs=x1, units=inputVectorSize, activation=tf.nn.relu)
logits1 = tf.layers.dense(inputs=dense1, units=outputVectorSize, activation=tf.nn.relu)
logits1_normalized=tf.nn.softmax(logits1)
dense2 = tf.layers.dense(inputs=x2, units=inputVectorSize, activation=tf.nn.relu)
logits2 = tf.layers.dense(inputs=dense2, units=outputVectorSize, activation=tf.nn.relu)
logits2_normalized=tf.nn.softmax(logits2)
output = tf.reduce_sum( tf.multiply( logits1_normalized, logits2_normalized), 1, keep_dims=True )
# Defining loss of network
loss = tf.reduce_sum(tf.subtract(1.0,output));
tf.summary.scalar('loss', loss)
# Setting optimiser
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
# Define accuracy
accuracy = loss
tf.summary.scalar('accuracy', accuracy)
# For saving checkpoint after training
saver = tf.train.Saver()
merged = tf.summary.merge_all()
# use in command line: tensorboard --logdir=path/to/log --> to view tensorboard
# Run tensorflow session
with tf.Session() as sess:
init = tf.global_variables_initializer()
sess.run(init)
train_writer = tf.summary.FileWriter('log', sess.graph)
tf.train.write_graph(sess.graph_def, '', 'savedgraph.pbtxt', as_text=False)
# Running the training in batches
batch_count = int(math.ceil(len(data_inputs)/batch_size))
for epoch_i in range(epochs):
batches_pbar = tqdm(range(batch_count), desc='Epoch {:>2}/{}'.format(epoch_i+1, epochs), unit='batches')
# The training cycle
for batch_i in batches_pbar:
# Get a batch of training features and labels
batch_start = batch_i*batch_size
batch_x1s = np.asarray(data_inputs_x1[batch_start:batch_start + batch_size])
batch_x2s = np.asarray(data_inputs_x2[batch_start:batch_start + batch_size])
# Run optimizer
_, summary = sess.run([optimizer, merged], feed_dict={x1: batch_x1s, x2: batch_x2s})
train_writer.add_summary(summary, batch_i)
# Check accuracy against validation data
val_accuracy, val_loss = sess.run([accuracy, loss], feed_dict={x1: np.asarray(data_inputs_x1), x2: np.asarray(data_inputs_x2)})
print("After epoch {}, Loss: {}, Accuracy: {}".format(epoch_i+1, val_loss, val_accuracy))
test_accuracy, test_loss = sess.run([accuracy, loss], feed_dict={x1: np.asarray(data_inputs_x1), x2: np.asarray(data_inputs_x2)})
print ("TEST LOSS: {}, TEST ACCURACY: {}".format(test_loss, test_accuracy))
g = tf.get_default_graph()
saver.save(sess, 'savedgraph')
