Multi GPU seems not to work on TensorFlow 1.0 - python

I am using TensorFlow 1.0 and I have developed a simple program to measure performance. I have a silly model as follows:
def model(example_batch):
    h1 = tf.layers.dense(inputs=example_batch, units=64, activation=tf.nn.relu)
    h2 = tf.layers.dense(inputs=h1, units=2)
    return h2
and a simple function to run the simulation:
def testPerformanceFromMemory(model, iter=1000, num_cores=2):
    example_batch = tf.placeholder(np.float32, shape=(64, 128))
    for core in range(num_cores):
        with tf.device('/gpu:%d' % core):
            prediction = model(example_batch)
    init_op = tf.global_variables_initializer()
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run(init_op)
    tf.train.start_queue_runners(sess=sess)
    input_array = np.random.random((64, 128))
    for step in range(iter):
        myprediction = sess.run(prediction, feed_dict={example_batch: input_array})
If I run the Python script and then run the nvidia-smi command, I can see that GPU0 is running with a high percentage of usage but GPU1 is at 0% usage.
I read this: https://www.tensorflow.org/tutorials/using_gpu and this: https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py but I don't know why my example doesn't run on multiple GPUs.
PS: If I download the CIFAR-10 example from the TensorFlow repository, it runs in multi-GPU mode.
Edit: As mrry says, I was overwriting prediction, so I post here the corrected version:
def testPerformanceFromMemory(model, iter=1000, num_cores=2):
    example_batch = tf.placeholder(np.float32, shape=(64, 128))
    prediction = []
    for core in range(num_cores):
        with tf.device('/gpu:%d' % core):
            prediction.append([model(example_batch)])
    init_op = tf.global_variables_initializer()
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run(init_op)
    tf.train.start_queue_runners(sess=sess)
    input_array = np.random.random((64, 128))
    for step in range(iter):
        myprediction = sess.run(prediction, feed_dict={example_batch: input_array})

Looking at your program, you are creating several parallel subgraphs (often called "towers") on different GPU devices, but overwriting the prediction tensor in each iteration of the first for loop:
for core in range(num_cores):
    with tf.device('/gpu:%d' % core):
        prediction = model(example_batch)
# ...
for step in range(iter):
    myprediction = sess.run(prediction, feed_dict={example_batch: input_array})
As a result, when you call sess.run(prediction, ...) you will only be running the subgraph that was created in the final iteration of the first for loop, which only runs on one GPU.
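If the goal is an actual speed-up rather than just placing ops on both devices, each tower usually also gets its own slice of the input so the GPUs work on different data. Here is a minimal sketch of that idea, assuming data parallelism with independent per-tower variables; the splitting scheme and scope names are illustrative and not from the question:
import numpy as np
import tensorflow as tf

def model(example_batch):
    h1 = tf.layers.dense(inputs=example_batch, units=64, activation=tf.nn.relu)
    h2 = tf.layers.dense(inputs=h1, units=2)
    return h2

num_cores = 2
example_batch = tf.placeholder(tf.float32, shape=(64, 128))
# Give each GPU its own shard of the batch instead of the full batch.
shards = tf.split(example_batch, num_cores, axis=0)

predictions = []
for core in range(num_cores):
    with tf.device('/gpu:%d' % core):
        with tf.variable_scope('tower_%d' % core):  # separate variables per tower
            predictions.append(model(shards[core]))
# Stitch the per-tower outputs back together into one batch-sized tensor.
prediction = tf.concat(predictions, axis=0)

with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(prediction,
                   feed_dict={example_batch: np.random.random((64, 128))})
Running sess.run(prediction) now executes both tower subgraphs, since the concat depends on the output of each GPU.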

Related

Display loss in a Tensorflow DQN without leaving tf.Session()

I have a DQN all set up and working, but I can't figure out how to display the loss without leaving the TensorFlow session.
I first thought it involved creating a new function or class, but I'm not sure where to put it in the code, or what specifically to put into the function or class.
observations = tf.placeholder(tf.float32, shape=[None, num_stops], name='observations')
actions = tf.placeholder(tf.int32,shape=[None], name='actions')
rewards = tf.placeholder(tf.float32,shape=[None], name='rewards')
# Model
Y = tf.layers.dense(observations, 200, activation=tf.nn.relu)
Ylogits = tf.layers.dense(Y, num_stops)
# sample an action from predicted probabilities
sample_op = tf.random.categorical(logits=Ylogits, num_samples=1)
# loss
cross_entropies = tf.losses.softmax_cross_entropy(onehot_labels=tf.one_hot(actions,num_stops), logits=Ylogits)
loss = tf.reduce_sum(rewards * cross_entropies)
# training operation
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=.99)
train_op = optimizer.minimize(loss)
I then run the network, which works without error.
with tf.Session() as sess:
    '''etc. The network is run'''
    sess.run(train_op, feed_dict={observations: observations_list,
                                  actions: actions_list,
                                  rewards: rewards_list})
I want to have the loss from train_op displayed to the user.
Try this: fetch the loss value in the same sess.run call as the training op (binding it to a new name such as loss_value so the loss tensor itself is not overwritten):
loss_value, _ = sess.run([loss, train_op], feed_dict={observations: observations_list,
                                                      actions: actions_list,
                                                      rewards: rewards_list})
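For periodic reporting, the same pattern can be wrapped in the training loop. A small sketch, assuming the question's graph and session are already set up; the loop variable and print interval are made up for illustration:
for episode in range(num_episodes):  # num_episodes is hypothetical
    loss_value, _ = sess.run([loss, train_op],
                             feed_dict={observations: observations_list,
                                        actions: actions_list,
                                        rewards: rewards_list})
    if episode % 100 == 0:
        print('episode %d: loss = %.4f' % (episode, loss_value))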

Speed of Logistic Regression on MNIST with Tensorflow

I am taking CS 20SI: Tensorflow for Deep Learning Research from Stanford. I have a question regarding the following code:
import time
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
# Step 1: Read in data
# using TF Learn's built in function to load MNIST data to the folder data/mnist
MNIST = input_data.read_data_sets("/data/mnist", one_hot=True)
# Batched logistic regression
learning_rate = 0.01
batch_size = 128
n_epochs = 25
X = tf.placeholder(tf.float32, [batch_size, 784], name = 'image')
Y = tf.placeholder(tf.float32, [batch_size, 10], name = 'label')
#w = tf.Variable(tf.random_normal(shape = [int(shape[1]), int(Y.shape[1])], stddev = 0.01), name='weights')
#b = tf.Variable(tf.zeros(shape = [1, int(Y.shape[1])]), name='bias')
w = tf.Variable(tf.random_normal(shape=[784, 10], stddev=0.01), name="weights")
b = tf.Variable(tf.zeros([1, 10]), name="bias")
logits = tf.matmul(X,w) + b
entropy = tf.nn.softmax_cross_entropy_with_logits( logits=logits, labels=Y)
loss = tf.reduce_mean(entropy) #computes the mean over examples in the batch
optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(loss)
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    n_batches = int(MNIST.train.num_examples/batch_size)
    for i in range(n_epochs):
        start_time = time.time()
        for _ in range(n_batches):
            X_batch, Y_batch = MNIST.train.next_batch(batch_size)
            opt, loss_ = sess.run([optimizer, loss], feed_dict = {X: X_batch, Y: Y_batch})
        end_time = time.time()
        print('Epoch %d took %f'%(i, end_time - start_time))
In this code, logistic regression is performed on the MNIST dataset. The author states:
Running on my Mac, the batch version of the model with batch size 128
runs in 0.5 second
However, when I run it, each epoch takes around 2 seconds, giving a total execution time of around a minute. Is it reasonable that this example takes that long? Currently I have a Ryzen 1700 without OC (3.0 GHz) and a GTX 1080 GPU without OC.
I tried this code on a GTX Titan X (Maxwell) and got around 0.5 seconds per epoch. I would expect that a GTX 1080 should be able to get similar results.
Try using the latest TensorFlow and CUDA/cuDNN versions. Make sure there are no limiting environment variables set (which GPUs are visible, how much memory TensorFlow can use, etc.). You can try running a micro-benchmark to see that you can achieve the stated FLOPS of your card, e.g. Testing GPU with tensorflow matrix multiplication
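For reference, a matmul micro-benchmark along the lines of the linked answer could look like this. It is a rough sketch, assuming TF 1.x graph mode; achieved TFLOPS will also depend on dtype, device placement, and overheads:
import time
import tensorflow as tf

N = 8192       # matrix size; roughly 2*N^3 FLOPs per matmul
iters = 10

a = tf.random_normal([N, N])
b = tf.random_normal([N, N])
c = tf.reduce_sum(tf.matmul(a, b))  # reduce to a scalar to avoid copying the full result to host

with tf.Session() as sess:
    sess.run(c)  # warm-up run (memory allocation, autotuning)
    start = time.time()
    for _ in range(iters):
        sess.run(c)
    elapsed = time.time() - start
    print('%.2f TFLOPS' % (2.0 * N ** 3 * iters / elapsed / 1e12))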

tensorflow: initialization of variables inside function

Newbie to TensorFlow. I'm trying to write a simple net with the following code:
import tensorflow as tf
import tensorflow.contrib as tfc
import tensorflow.contrib.layers as tfcl

def generator_deconv(z, kernel):
    with tf.variable_scope("generator", reuse=True):
        weights = tf.get_variable("weights")
        biases = tf.get_variable("biases")
        result = tf.matmul(z, weights)
        result = tf.add(result, biases)
        result = tf.reshape(result, tf.stack([tf.shape(result)[0], 13, 4, 1]))
        result = tf.nn.conv2d_transpose(result, kernel,
                                        output_shape=[tf.shape(result)[0], 25, 8, 1],
                                        strides=[1, 2, 2, 1],
                                        padding="SAME")
        result = tf.nn.conv2d_transpose(result, kernel,
                                        output_shape=[tf.shape(result)[0], 50, 15, 1],
                                        strides=[1, 2, 2, 1],
                                        padding="SAME")
        result = tf.nn.conv2d_transpose(result, kernel,
                                        output_shape=[tf.shape(result)[0], 100, 30, 1],
                                        strides=[1, 2, 2, 1],
                                        padding="SAME")
        return result

kernel = tf.constant(1.0, shape=[4, 4, 1, 1])
protype = tf.constant(1.0, shape=[3, 4])
init = tf.global_variables_initializer()
config = tf.ConfigProto()
config.gpu_options.allocator_type = 'BFC'
config.gpu_options.allow_growth = True
with tf.variable_scope("generator"):
    t1 = tf.get_variable("weights", shape=[4, 52])
    t2 = tf.get_variable("biases", shape=[52])
test = generator_deconv(protype, kernel)
with tf.Session(config=config) as sess:
    sess.run(init)
    sess.run(tf.shape(t1))
    sess.run(tf.shape(t2))
    sess.run(tf.shape(test))
but got error:
tensorflow.python.framework.errors_impl.FailedPreconditionError:
Attempting to use uninitialized value generator/weights
for the last line
sess.run(tf.shape(test))
I checked the official TensorFlow API docs but still don't know what's wrong with the code.
================================UPDATE==========================
I found two ways to fix it:
1. If I replace
sess.run(init)
with
sess.run(tf.global_variables_initializer())
then the whole code works.
OR
2. If I run
init = tf.global_variables_initializer()
with tf.Session(config=config) as sess:
    sess.run(init)
    sess.run(tf.shape(t1))
    sess.run(tf.shape(t2))
    sess.run(tf.shape(test))
again, it also works.
BUT I don't understand why.
I removed some parts of the code for you:
init = tf.global_variables_initializer()
with tf.variable_scope("generator"):
    t1 = tf.get_variable("weights", shape=[4, 52])
    t2 = tf.get_variable("biases", shape=[52])
with tf.Session(config=config) as sess:
    sess.run(init)
    sess.run(tf.shape(t1))
You add variables to your graph after saving the result of calling global_variables_initializer(), so the saved init op does not know about them. In your fix you call this function AFTER you have added all the variables you want to initialize to your graph, and thus everything gets initialized.
Hope this helps!
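A minimal sketch of that ordering issue (the variable names here are made up for illustration):
import tensorflow as tf

v1 = tf.get_variable("v1", shape=[2])
early_init = tf.global_variables_initializer()   # only knows about v1

v2 = tf.get_variable("v2", shape=[3])
late_init = tf.global_variables_initializer()    # knows about v1 and v2

with tf.Session() as sess:
    sess.run(early_init)
    # sess.run(v2)  # would fail with FailedPreconditionError: v2 is uninitialized
    sess.run(late_init)
    print(sess.run(v2))   # works now that v2 has been initialized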

TensorFlow Distributed training using tf.Supervisor stalls when running init_op

I'm trying to get some simple distributed training code working using TensorFlow r0.12.1 based on the example on this page from the documentation. However, I'm having issues with my distributed sessions never returning.
My cluster specification is as follows, and I am running both the ps and worker processes on the same server:
cluster = tf.train.ClusterSpec({
    'ps': ['localhost:2222'],
    'worker': ['localhost:2223', 'localhost:2224']
})
Using this, my ps process is as follows:
server = tf.train.Server(cluster, job_name="ps")
server.join()
And my two workers attempt to train a simple network using MNIST, and are launched using CUDA_VISIBLE_DEVICES=0 or CUDA_VISIBLE_DEVICES=1:
TASK_INDEX = <0 or 1>
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
server = tf.train.Server(cluster, job_name="worker", task_index=TASK_INDEX)
with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % TASK_INDEX,
        cluster=cluster)):
    x = tf.placeholder(tf.float32, shape=[None, 784])
    y_ = tf.placeholder(tf.float32, shape=[None, 10])
    W = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, W) + b
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))
    global_step = tf.Variable(0)
    train_op = tf.train.AdagradOptimizer(0.01).minimize(
        cross_entropy, global_step=global_step)
    summary_op = tf.summary.merge_all()
    init_op = tf.global_variables_initializer()
sv = tf.train.Supervisor(is_chief=(TASK_INDEX == 0),
                         init_op=init_op,
                         summary_op=summary_op,
                         global_step=global_step)
with sv.managed_session(server.target) as sess:
    step = 0
    batch_sz = 50
    iters = 55000 / batch_sz
    while not sv.should_stop() and step < iters:
        _, step = sess.run([train_op, global_step],
                           feed_dict={x: mnist.train.images[step*batch_sz:(step+1)*batch_sz],
                                      y_: mnist.train.labels[step*batch_sz:(step+1)*batch_sz]})
# Ask for all the services to stop.
sv.stop()
Unfortunately, my code appears to freeze on the call to with sv.managed_session(server.target). Further diagnosis shows that the troublesome line is in fact the call to session_manager.wait_for_session, which is initiated from the chief worker, and within this, line 235 of tensorflow/python/training/session_manager.py:
sess.run(init_op, feed_dict=init_feed_dict)
So it seems like the whole process is stalling when attempting to run the initialization operation, but it does not produce any error messages. Regular training (not via tf.Supervisor) works fine on the same system, so it seems unlikely to be an installation or driver issue.
Is there any reason why the process should stall when running init_op? Is there something I am missing about how to properly use tf.Supervisor?

duplicate a tensorflow graph

What is the best way of duplicating a TensorFlow graph and keeping it up to date?
Ideally I want to put the duplicated graph on another device (e.g. from GPU to CPU) and then update the copy from time to time.
Short answer: You probably want checkpoint files (permalink).
Long answer:
Let's be clear about the setup here. I'll assume that you have two devices, A and B, and you are training on A and running inference on B.
Periodically, you'd like to update the parameters on the device running inference with new parameters found during training on the other.
The tutorial linked above is a good place to start. It shows you how tf.train.Saver objects work, and you shouldn't need anything more complicated here.
Here is an example:
import tensorflow as tf

def build_net(graph, device):
    with graph.as_default():
        with graph.device(device):
            # Input placeholders
            inputs = tf.placeholder(tf.float32, [None, 784])
            labels = tf.placeholder(tf.float32, [None, 10])
            # Initialization
            w0 = tf.get_variable('w0', shape=[784, 256], initializer=tf.contrib.layers.xavier_initializer())
            w1 = tf.get_variable('w1', shape=[256, 256], initializer=tf.contrib.layers.xavier_initializer())
            w2 = tf.get_variable('w2', shape=[256, 10], initializer=tf.contrib.layers.xavier_initializer())
            b0 = tf.Variable(tf.zeros([256]))
            b1 = tf.Variable(tf.zeros([256]))
            b2 = tf.Variable(tf.zeros([10]))
            # Inference network
            h1 = tf.nn.relu(tf.matmul(inputs, w0) + b0)
            h2 = tf.nn.relu(tf.matmul(h1, w1) + b1)
            output = tf.nn.softmax(tf.matmul(h2, w2) + b2)
            # Training network
            cross_entropy = tf.reduce_mean(-tf.reduce_sum(labels * tf.log(output), reduction_indices=[1]))
            optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
            # Your checkpoint function
            saver = tf.train.Saver()
            return tf.initialize_all_variables(), inputs, labels, output, optimizer, saver
The code for the training program:
def programA_main():
    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    # Build training network on device A
    graphA = tf.Graph()
    init, inputs, labels, _, training_net, saver = build_net(graphA, '/cpu:0')
    with tf.Session(graph=graphA) as sess:
        sess.run(init)
        for step in xrange(1, 10000):
            batch = mnist.train.next_batch(50)
            sess.run(training_net, feed_dict={inputs: batch[0], labels: batch[1]})
            if step % 100 == 0:
                saver.save(sess, '/tmp/graph.checkpoint')
                print 'saved checkpoint'
...and code for an inference program:
def programB_main():
    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    # Build inference network on device B
    graphB = tf.Graph()
    init, inputs, _, inference_net, _, saver = build_net(graphB, '/cpu:0')
    with tf.Session(graph=graphB) as sess:
        batch = mnist.test.next_batch(50)
        saver.restore(sess, '/tmp/graph.checkpoint')
        print 'loaded checkpoint'
        out = sess.run(inference_net, feed_dict={inputs: batch[0]})
        print out[0]
        import time; time.sleep(2)
        saver.restore(sess, '/tmp/graph.checkpoint')
        print 'loaded checkpoint'
        out = sess.run(inference_net, feed_dict={inputs: batch[0]})
        print out[1]
If you fire up the training program and then the inference program, you'll see the inference program produces two different outputs (from the same input batch). This is a result of it picking up the parameters that the training program has checkpointed.
Now, this program obviously isn't your end point. We don't do any real synchronization, and you'll have to decide what "periodic" means with respect to checkpointing. But this should give you an idea of how to sync parameters from one network to another.
One final warning: this does not mean that the two networks are necessarily deterministic. There are known non-deterministic elements in TensorFlow (e.g., this), so be wary if you need exactly the same answer. But this is the hard truth about running on multiple devices.
Good luck!
I'll try to go with a pretty simplified answer, to see if the general approach is what OP is describing:
I'd implement it via the tf.train.Saver object.
Suppose you have your weights in variables W1, W2, and b1:
mysaver = tf.train.Saver({'w1': W1, 'w2': W2, 'b1': b1})
In the training loop you can add, every n iterations:
mysaver.save(session_var, 'model1', global_step=step)
And then in the loading instance, when needed, you run:
mysaver.restore(other_session_object, 'model1')
Hope this is similar to the solution you are asking for.
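Tying those fragments together, a self-contained sketch of the save/restore flow might look like this; the shapes, step count, and checkpoint path are placeholders, not from the question:
import tensorflow as tf

W1 = tf.get_variable('W1', shape=[4, 8])
W2 = tf.get_variable('W2', shape=[8, 2])
b1 = tf.get_variable('b1', shape=[8])
mysaver = tf.train.Saver({'w1': W1, 'w2': W2, 'b1': b1})

# "Training" side: save a checkpoint every so often.
with tf.Session() as train_sess:
    train_sess.run(tf.global_variables_initializer())
    # ... training steps would go here ...
    ckpt_path = mysaver.save(train_sess, './model1', global_step=0)

# "Inference" side: restore the saved values into another session.
with tf.Session() as infer_sess:
    mysaver.restore(infer_sess, ckpt_path)  # no initializer needed for restored variables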
Simply do the round trip tf.Graph > tf.GraphDef > tf.Graph:
import tensorflow as tf

def copy_graph(graph: tf.Graph) -> tf.Graph:
    with tf.Graph().as_default() as copied_graph:
        graph_def = graph.as_graph_def(add_shapes=True)
        tf.graph_util.import_graph_def(graph_def)
    return copied_graph
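A quick usage sketch of the helper above (the example graph is made up; imported ops get the default 'import/' name prefix):
source_graph = tf.Graph()
with source_graph.as_default():
    a = tf.constant([1.0, 2.0], name='a')
    b = tf.multiply(a, 2.0, name='b')

copied = copy_graph(source_graph)
print([op.name for op in copied.get_operations()])  # e.g. ['import/a', 'import/b']
Note that this copies the graph structure only; variable values are not carried over by a GraphDef round trip and would still need to be transferred separately, e.g. via a Saver as in the other answer.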
