How do you pass a custom gradient into a gradient optimization function in TensorFlow?
I have illustrated what I am trying to do with a simple example (trying to minimize z = 2x^2 + y^2 + 2).
I have been looking at:
https://www.tensorflow.org/api_docs/python/tf/train/Optimizer
This works if you just pass in optimizer = tf.train.GradientDescentOptimizer(0.55) and train = optimizer.minimize(z).
This code works:
import tensorflow as tf
x = tf.Variable(11, name='x', dtype=tf.float32)
y = tf.Variable(11, name='y', dtype=tf.float32)
const = tf.constant(2.0, dtype=tf.float32)
z = 2*x**2 + y**2 + const
optimizer = tf.train.GradientDescentOptimizer(0.55)
train = optimizer.minimize(z)
init = tf.global_variables_initializer()
def optimize():
with tf.Session() as session:
session.run(init)
print("starting at", "x:", session.run(x), "y:", session.run(y), "z:", session.run(z))
for step in range(10):
session.run(train)
print("step", step, "x:", session.run(x), "y:", session.run(y), "z:", session.run(z))
optimize()
But I want to specify the gradient myself instead of letting TensorFlow compute it.
That is, I am trying to do this:
def function_to_minimize(x,y, const):
# z = 2x^2 + y^2 + constant
z = 2*x**2 + y**2 + const
return z
def calc_grad(x,y):
# z = 2x^2 + y^2 + constant
dz_dx = 4*x
dz_dy = 2*y
return [(dz_dx, x), (dz_dy, y)]
x = tf.Variable(3, name='x', dtype=tf.float32)
y = tf.Variable(3, name='y', dtype=tf.float32)
const = tf.constant(2.0, dtype=tf.float32)
z = function_to_minimize(x,y, const)
grad = calc_grad(x,y)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
print(sess.run(z))
print(sess.run(grad))
optimizer = tf.train.GradientDescentOptimizer(0.5)
grads_and_vars = calc_grad(x,y)
optimizer.apply_gradients(grads_and_vars)
# minimize() takes care of both computing the gradients and applying them to the variables.
#If you want to process the gradients before applying them you can instead use the optimizer in three steps:
# 1. Compute the gradients with compute_gradients().
# 2. Process the gradients as you wish.
# 3. Apply the processed gradients with apply_gradients()
How do you do this properly?
apply_gradients returns an operation that you can use to apply the gradients. In other words, you just do train = optimizer.apply_gradients(grads_and_vars) and the rest will work as in the first snippet. I.e.:
optimizer = tf.train.GradientDescentOptimizer(0.55)
grads_and_vars = calc_grad(x,y)
train = optimizer.apply_gradients(grads_and_vars)
init = tf.global_variables_initializer()
def optimize():
with tf.Session() as session:
session.run(init)
print("starting at", "x:", session.run(x), "y:", session.run(y), "z:", session.run(z))
for step in range(10):
session.run(train)
print("step", step, "x:", session.run(x), "y:", session.run(y), "z:", session.run(z))
optimize()
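For the three-step workflow described in the comments above (compute, process, apply), a minimal sketch looks like this; the clipping step is only an illustrative example of "processing" the gradients:
optimizer = tf.train.GradientDescentOptimizer(0.55)
# 1. compute the symbolic gradients of z with respect to the variables
grads_and_vars = optimizer.compute_gradients(z, var_list=[x, y])
# 2. process them however you like (here: clip each gradient to [-10, 10])
processed = [(tf.clip_by_value(g, -10., 10.), v) for g, v in grads_and_vars]
# 3. build the training op that applies the processed gradients
train = optimizer.apply_gradients(processed)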
Related
I have the following scenario:
y = tf.placeholder(tf.float32, [None, 1],name="output")
layers = [tf.contrib.rnn.BasicRNNCell(num_units=n_neurons,activation=tf.nn.leaky_relu, name="layer"+str(layer))
for layer in range(2)]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(layers)
rnn_outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)
stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, 100])
stacked_outputs = tf.layers.dense(stacked_rnn_outputs, 1)
outputs = tf.reshape(stacked_outputs, [-1, 2, 1])
outputs = tf.identity(outputs[:,1,:], name="prediction")
loss = Custom_loss(y,outputs)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss,name="training_op")
The custom loss function I tried is:
def Custom_loss(y,outputs):
hold_loss = []
for exp,pred in zip(y,outputs):
if exp >= pred:
result = tf.pow(pred * 0.5,2) - exp
hold_loss.append(result)
else:
hold_loss.append(tf.subtract(pred, exp))
return tf.reduce_mean(hold_loss)
Now when I am trying to implement this I am getting the following error:
TypeError: Tensor objects are only iterable when eager execution is enabled. To iterate over this tensor use tf.map_fn.
I have tried implementing tf.map_fn(), but I encounter the same error. I have referred to the following question:
How to explain the result of tf.map_fn?
Kindly help me get through this issue. How can I iterate over the tensor, and what is the best way to implement this custom loss function?
def Custom_loss(y,outputs):
mask = tf.greater_equal(y, outputs)
a = tf.pow(tf.boolean_mask(outputs, mask)*0.5, 2) - tf.boolean_mask(y, mask)
inv_mask = tf.logical_not(mask)
b = tf.boolean_mask(outputs, inv_mask)- tf.boolean_mask(y, inv_mask)
return tf.reduce_mean(tf.concat([a, b], axis=-1))
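The idea is that a Python-level if/else cannot be traced once y and outputs are graph tensors, so the branch is expressed as two boolean masks: the first expression is applied where y >= outputs, the second where it is not, and the two results are concatenated before taking the mean.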
Test case
def Custom_loss_np(y,outputs):
hold_loss = []
for exp,pred in zip(y,outputs):
if exp >= pred:
result = pow(pred * 0.5,2) - exp
hold_loss.append(result)
else:
hold_loss.append(pred-exp)
return np.mean(hold_loss)
np_x = np.random.randn(100)
np_y = np.random.randn(100)
x = tf.constant(np_x)
y = tf.constant(np_y)
with tf.Session() as sess:
assert sess.run(Custom_loss(x, y)) == Custom_loss_np(np_x, np_y)
Use the tf.math namespace if you are on the latest version of TensorFlow.
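For example, the same masks could be built with the tf.math aliases (a sketch):
mask = tf.math.greater_equal(y, outputs)
inv_mask = tf.math.logical_not(mask)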
Example using the custom loss to train a simple linear regression model
X = tf.placeholder(tf.float32,[None,1])
y = tf.placeholder(tf.float32,[None,1])
w = tf.Variable(tf.ones([1,1]))
b = tf.Variable(tf.ones([1,1]))
y_ = tf.matmul(X, w)+b
loss = Custom_loss(y, y_) #tf.reduce_mean(tf.square(y_ - y))
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
training_op = optimizer.minimize(loss,name="training_op")
#dummy data for linear regression
x_data = np.random.randn(100,1)
y_labels = 1.5*x_data + 2.5 + np.random.randn(100,1)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
for i in range(5000):
_, loss_ = sess.run([training_op,loss], feed_dict={X:x_data, y:y_labels})
if (i+1)%1000 == 0 :
print (loss_)
print (sess.run([w, b]))
The logic for calculating the loss is something the OP came up with.
So my question is: how do I normalize the variables after the gradient descent update in the _apply_dense() method of the optimizer class? This is what I currently have:
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
var_update = state_ops.assign_sub(var, lr_t*grad)
normalize = var.assign(tf.norm(var))
return control_flow_ops.group(*[var_update,normalize])
My current code seems to just normalize the original variables without applying the gradient descent update. I know this is due to the normalize step above, which simply reassigns the original variables in normalized form. How do I correct this so that the gradient descent step is applied first and the normalization is then done on the result?
This could be implemented as follows:
lr = 0.01
with tf.name_scope('optimizer'):
vars_ = tf.trainable_variables()
grads = tf.gradients(loss_tensor, vars_)
assign_ops = [tf.assign(v, (v - lr*g)) for g, v in zip(grads, vars_)]
with tf.control_dependencies(assign_ops):
vars_norms = [tf.sqrt(2*tf.nn.l2_loss(v)) for v in vars_]
# next line prevents division by zero
vars_norms = [tf.clip_by_value(n, 0.00001, np.inf) for n in vars_norms]
update_ops = [tf.assign(v, v/n) for v, n in zip(vars_, vars_norms)]
update_op = tf.group(update_ops)
Note that I've added tf.clip_by_value() to prevent division by zero.
Here's a full usage example:
import tensorflow as tf
import numpy as np
x = tf.placeholder(tf.float32, shape=(None, 2))
y = tf.placeholder(tf.int32, shape=(None))
logits = tf.layers.dense(x, 2)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=y, logits=logits)
loss_tensor = tf.reduce_mean(xentropy)
lr = 0.01
with tf.name_scope('optimizer'):
vars_ = tf.trainable_variables()
grads = tf.gradients(loss_tensor, vars_)
assign_ops = [tf.assign(v, (v - lr*g)) for g, v in zip(grads, vars_)]
with tf.control_dependencies(assign_ops):
vars_norms = [tf.sqrt(2*tf.nn.l2_loss(v)) for v in vars_]
# next line prevents division by zero
vars_norms = [tf.clip_by_value(n, 0.00001, np.inf) for n in vars_norms]
update_ops = [tf.assign(v, v/n) for v, n in zip(vars_, vars_norms)]
update_op = tf.group(update_ops)
# dummy data for illustration
x_train = np.random.normal(size=(10, 2))
x_train = np.vstack([x_train, 2*np.random.normal(size=(10, 2))])
y_train = [0 for _ in range(10)] + [1 for _ in range(10)]
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for epoch in range(10):
loss, _ = sess.run([loss_tensor, update_op], feed_dict={x:x_train, y:y_train})
print(loss)
# 0.7111398
# 0.7172677
# 0.71517026
# 0.713101
# 0.71105987
# 0.7090467
# 0.70706147
# 0.7051038
# 0.7031738
# 0.7012712
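If you would rather keep the logic inside the custom optimizer's _apply_dense(), a control dependency gives the same ordering. This is only a sketch; it assumes "normalize" means dividing the variable by its L2 norm, and that ops here is tensorflow.python.framework.ops:
def _apply_dense(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    # first apply the plain gradient descent update
    var_update = state_ops.assign_sub(var, lr_t * grad)
    # the control dependency forces the normalization to run after the update,
    # so it reads and rescales the already-updated value
    with ops.control_dependencies([var_update]):
        normalize = var.assign(var / tf.norm(var))
    return control_flow_ops.group(var_update, normalize)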
I have a scope which I named 'Pred/Accuracy' that I can't seem to find in TensorBoard. I will include my entire code a little later, but specifically, in the definition of my cost function I have:
def compute_cost(z, Y, parameters, l2_reg=False):
with tf.name_scope('cost'):
logits = tf.transpose(z)
labels = tf.transpose(Y)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = logits,
labels = labels))
if l2_reg == True:
reg = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
cost = cost + tf.reduce_sum(reg)
with tf.name_scope('Pred/Accuracy'):
prediction=tf.argmax(z)
correct_prediction = tf.equal(tf.argmax(z), tf.argmax(Y))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
return cost, prediction, accuracy
But in TensorBoard I can't see it, even if I click on the cost block:
Below is basically my entire code, excluding imports and data pre-processing:
# Create X and Y placeholders
def create_xy_placeholder(n_x, n_y):
X = tf.placeholder(tf.float32, shape = [n_x, None], name = 'X')
Y = tf.placeholder(tf.float32, shape = [n_y, None], name = 'Y')
return X, Y
# initialize parameters hidden layers
def initialize_parameters(n_x, scale, hidden_units):
hidden_units= [n_x] + hidden_units
parameters = {}
regularizer = tf.contrib.layers.l2_regularizer(scale)
for i in range(0, len(hidden_units[1:])):
with tf.variable_scope('hidden_parameters_'+str(i+1)):
w = tf.get_variable("W"+str(i+1), [hidden_units[i+1], hidden_units[i]],
initializer=tf.contrib.layers.xavier_initializer(),
regularizer=regularizer)
b = tf.get_variable("b"+str(i+1), [hidden_units[i+1], 1],
initializer = tf.constant_initializer(0.1))
parameters.update({"W"+str(i+1): w})
parameters.update({"b"+str(i+1): b})
return parameters
# forward progression with batch norm and dropout
def forward_propagation(X, parameters, batch_norm=False, keep_prob=1):
a_new = X
for i in range(0, int(len(parameters)/2)-1):
with tf.name_scope('forward_pass_'+str(i+1)):
w = parameters['W'+str(i+1)]
b = parameters['b'+str(i+1)]
z = tf.matmul(w, a_new) + b
if batch_norm == True:
z = tf.layers.batch_normalization(z, momentum=0.99, axis=0)
a = tf.nn.relu(z)
if keep_prob < 1:
a = tf.nn.dropout(a, keep_prob)
a_new = a
tf.summary.histogram('act_'+str(i+1), a_new)
# calculating final Z before input into cost as logit
with tf.name_scope('forward_pass_'+str(int(len(parameters)/2))):
w = parameters['W'+str(int(len(parameters)/2))]
b = parameters['b'+str(int(len(parameters)/2))]
z = tf.matmul(w, a_new) + b
if batch_norm == True:
z = tf.layers.batch_normalization(z, momentum=0.99, axis=0)
return z
# compute cost with option for l2 regularization
def compute_cost(z, Y, parameters, l2_reg=False):
with tf.name_scope('cost'):
logits = tf.transpose(z)
labels = tf.transpose(Y)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = logits,
labels = labels))
if l2_reg == True:
reg = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
cost = cost + tf.reduce_sum(reg)
with tf.name_scope('Pred/Accuracy'):
prediction=tf.argmax(z)
correct_prediction = tf.equal(tf.argmax(z), tf.argmax(Y))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
return cost, prediction, accuracy
# defining the model (need to add keep_prob for dropout)
def model(X_train, Y_train, X_test, Y_test,
hidden_units=[30, 50, 50, 30, 4], # hidden units/layers
learning_rate = 0.0001, # Learning rate
num_epochs = 2000, minibatch_size = 30, # minibatch/ number epochs
keep_prob=0.5, # dropout
batch_norm=True, # batch normalization
l2_reg=True, scale = 0.01, # L2 regularization/scale is lambda
print_cost = True):
ops.reset_default_graph() # to be able to rerun the model without overwriting tf variables
tf.set_random_seed(1) # to keep consistent results
seed = 3 # to keep consistent results
(n_x, m) = X_train.shape # (n_x: input size, m : number of examples in the train set)
n_y = Y_train.shape[0] # n_y : output size
costs = [] # To keep track of the cost
logs_path = '/tmp/tensorflow_logs/example/'
# Create Placeholders of shape (n_x, n_y)
X, Y = create_xy_placeholder(n_x, n_y)
# Initialize parameters
parameters = initialize_parameters(n_x, scale, hidden_units)
# Forward propagation: Build the forward propagation in the tensorflow graph
z = forward_propagation(X, parameters, batch_norm=batch_norm, keep_prob=keep_prob)
# Cost function: Add cost function to tensorflow graph
cost, prediction, accuracy = compute_cost(z, Y, parameters, l2_reg)
# Backpropagation: Define the tensorflow optimizer. Use an AdamOptimizer.
with tf.name_scope('optimizer'):
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate)
# Op to calculate every variable gradient
grads = tf.gradients(cost, tf.trainable_variables())
grads = list(zip(grads, tf.trainable_variables()))
# Op to update all variables according to their gradient
apply_grads = optimizer.apply_gradients(grads_and_vars = grads)
# Initialize all the variables
init = tf.global_variables_initializer()
# to view in tensorboard
tf.summary.scalar('loss', cost)
tf.summary.scalar('accuracy', accuracy)
# Create summaries to visualize weights
for var in tf.trainable_variables():
tf.summary.histogram(var.name, var)
# Summarize all gradients
for grad, var in grads:
tf.summary.histogram(var.name + '/gradient', grad)
merged_summary_op = tf.summary.merge_all()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# Start the session to compute the tensorflow graph
with tf.Session(config=config) as sess:
# Run the initialization
sess.run(init)
# define writer
summary_writer = tf.summary.FileWriter(logs_path,
graph=tf.get_default_graph())
# Do the training loop
for epoch in range(num_epochs):
epoch_cost = 0. # Defines a cost related to an epoch
num_minibatches = int(m / minibatch_size) # number of minibatches of size minibatch_size in the train set
seed = seed + 1
minibatches = random_mini_batches(X_train, Y_train, minibatch_size, seed)
count = 0
for minibatch in minibatches:
# Select a minibatch
(minibatch_X, minibatch_Y) = minibatch
# IMPORTANT: The line that runs the graph on a minibatch.
# Run the session to execute the "optimizer" and the "cost"; the feed_dict should contain a minibatch for (X,Y).
_ , minibatch_cost, summary = sess.run([apply_grads, cost,
merged_summary_op],
feed_dict = {X: minibatch_X, Y: minibatch_Y})
epoch_cost += minibatch_cost / num_minibatches
# Write logs at every iteration
summary_writer.add_summary(summary, epoch * num_minibatches + count)
count += 1
# Print the cost every epoch
if print_cost == True and epoch % 100 == 0:
print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
prediction1=tf.argmax(z)
# print('Z5: ', Z5.eval(feed_dict={X: minibatch_X, Y: minibatch_Y}))
print('prediction: ', prediction1.eval(feed_dict={X: minibatch_X,
Y: minibatch_Y}))
correct1=tf.argmax(Y)
# print('Y: ', Y.eval(feed_dict={X: minibatch_X,
# Y: minibatch_Y}))
print('correct: ', correct1.eval(feed_dict={X: minibatch_X,
Y: minibatch_Y}))
if print_cost == True and epoch % 5 == 0:
costs.append(epoch_cost)
# plot the cost
plt.plot(np.squeeze(costs))
plt.ylabel('cost')
plt.xlabel('iterations (per tens)')
plt.title("Learning rate =" + str(learning_rate))
plt.show()
# lets save the parameters in a variable
parameters = sess.run(parameters)
print ("Parameters have been trained!")
# Calculate the correct predictions
correct_prediction = tf.equal(tf.argmax(z), tf.argmax(Y))
# Calculate accuracy on the test set
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
print ("Train Accuracy:", accuracy.eval({X: X_train, Y: Y_train}))
print ("Test Accuracy:", accuracy.eval({X: X_test, Y: Y_test}))
print("Run the command line:\n" \
"--> tensorboard --logdir=/tmp/tensorflow_logs " \
"\nThen open http://0.0.0.0:6006/ into your web browser")
return parameters
# run model on test data
parameters = model(x_train, y_train, x_test, y_test, keep_prob=1)
TensorFlow scopes are hierarchical: you can have a scope within another scope within another scope, and so on. The name "Pred/Accuracy" means exactly that: you have a top-level "Pred" scope and an "Accuracy" scope nested inside it (this is because the slash has a special meaning in naming).
TensorBoard shows the top-level ones by default: "Pred" (at the top), "batch_normalization", etc. You can expand them to see what's inside by double-clicking. Inside "Pred" you should find "Accuracy".
If you like, just name your scope differently, e.g. "Pred_Accuracy", and the full name will appear in TensorBoard.
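A minimal illustration of the difference (the constants here are just placeholders):
with tf.name_scope('Pred/Accuracy'):
    a = tf.constant(1.0, name='acc')   # node name: Pred/Accuracy/acc -> nested under "Pred"
with tf.name_scope('Pred_Accuracy'):
    b = tf.constant(1.0, name='acc')   # node name: Pred_Accuracy/acc -> its own top-level box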
I want to check if I can solve this problem with TensorFlow instead of pymc3. The experimental idea is that I am going to define a probabilistic system that contains a switchpoint. I can use sampling as a method of inference, but I started wondering why I couldn't just do this with gradient descent instead.
I decided to do the gradient search in TensorFlow, but it seems like TensorFlow is having a hard time performing a gradient search when tf.where is involved.
You can find the code below.
import tensorflow as tf
import numpy as np
x1 = np.random.randn(50)+1
x2 = np.random.randn(50)*2 + 5
x_all = np.hstack([x1, x2])
len_x = len(x_all)
time_all = np.arange(1, len_x + 1)
mu1 = tf.Variable(0, name="mu1", dtype=tf.float32)
mu2 = tf.Variable(5, name = "mu2", dtype=tf.float32)
sigma1 = tf.Variable(2, name = "sigma1", dtype=tf.float32)
sigma2 = tf.Variable(2, name = "sigma2", dtype=tf.float32)
tau = tf.Variable(10, name = "tau", dtype=tf.float32)
mu = tf.where(time_all < tau,
tf.ones(shape=(len_x,), dtype=tf.float32) * mu1,
tf.ones(shape=(len_x,), dtype=tf.float32) * mu2)
sigma = tf.where(time_all < tau,
tf.ones(shape=(len_x,), dtype=tf.float32) * sigma1,
tf.ones(shape=(len_x,), dtype=tf.float32) * sigma2)
likelihood_arr = tf.log(tf.sqrt(1/(2*np.pi*tf.pow(sigma, 2)))) - tf.pow(x_all - mu, 2)/(2*tf.pow(sigma, 2))
total_likelihood = tf.reduce_sum(likelihood_arr, name="total_likelihood")
optimizer = tf.train.RMSPropOptimizer(0.01)
opt_task = optimizer.minimize(-total_likelihood)
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
print("these variables should be trainable: {}".format([_.name for _ in tf.trainable_variables()]))
for step in range(10000):
_lik, _ = sess.run([total_likelihood, opt_task])
if step % 1000 == 0:
variables = {_.name:_.eval() for _ in [mu1, mu2, sigma1, sigma2, tau]}
print("step: {}, values: {}".format(str(step).zfill(4), variables))
You'll notice that the tau parameter does not change even though TensorFlow seems to be aware of the variable and its gradient. Any clue on what is going wrong? Is this something that can be calculated in TensorFlow, or do I need a different pattern?
tau is only used in the condition argument to tf.where: tf.where(time_all < tau, ...), which is a boolean tensor. Since calculating gradients only makes sense for continuous values, the gradient of the output with respect to tau will be zero.
Even ignoring tf.where, you used tau in the expression time_all < tau, which is constant almost everywhere, so it has a gradient of zero.
Due to the gradient of zero, there is no way to learn tau with gradient descent methods.
Depending on your problem, maybe instead of a hard switch between two values, you can use a weighted sum p*val1 + (1-p)*val2, where p depends on tau in a continuous manner.
The accepted solution is the correct answer, but it doesn't contain the code that solves my problem. The following snippet does:
import tensorflow as tf
import numpy as np
import os
import uuid
TENSORBOARD_PATH = "/tmp/tensorboard-switchpoint"
# tensorboard --logdir=/tmp/tensorboard-switchpoint
x1 = np.random.randn(35)-1
x2 = np.random.randn(35)*2 + 5
x_all = np.hstack([x1, x2])
len_x = len(x_all)
time_all = np.arange(1, len_x + 1)
mu1 = tf.Variable(0, name="mu1", dtype=tf.float32)
mu2 = tf.Variable(0, name = "mu2", dtype=tf.float32)
sigma1 = tf.Variable(2, name = "sigma1", dtype=tf.float32)
sigma2 = tf.Variable(2, name = "sigma2", dtype=tf.float32)
tau = tf.Variable(15, name = "tau", dtype=tf.float32)
switch = 1./(1+tf.exp(tf.pow(time_all - tau, 1)))
mu = switch*mu1 + (1-switch)*mu2
sigma = switch*sigma1 + (1-switch)*sigma2
likelihood_arr = tf.log(tf.sqrt(1/(2*np.pi*tf.pow(sigma, 2)))) - tf.pow(x_all - mu, 2)/(2*tf.pow(sigma, 2))
total_likelihood = tf.reduce_sum(likelihood_arr, name="total_likelihood")
optimizer = tf.train.AdamOptimizer()
opt_task = optimizer.minimize(-total_likelihood)
init = tf.global_variables_initializer()
tf.summary.scalar("mu1", mu1)
tf.summary.scalar("mu2", mu2)
tf.summary.scalar("sigma1", sigma1)
tf.summary.scalar("sigma2", sigma2)
tf.summary.scalar("tau", tau)
tf.summary.scalar("likelihood", total_likelihood)
merged_summary_op = tf.summary.merge_all()
with tf.Session() as sess:
sess.run(init)
print("these variables should be trainable: {}".format([_.name for _ in tf.trainable_variables()]))
uniq_id = os.path.join(TENSORBOARD_PATH, "switchpoint-" + uuid.uuid1().__str__()[:4])
summary_writer = tf.summary.FileWriter(uniq_id, graph=tf.get_default_graph())
for step in range(40000):
lik, opt, summary = sess.run([total_likelihood, opt_task, merged_summary_op])
if step % 100 == 0:
variables = {_.name:_.eval() for _ in [total_likelihood]}
summary_writer.add_summary(summary, step)
print("i{}: {}".format(str(step).zfill(5), variables))
rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
state = rnn_cell.zero_state(batch_size, tf.float32)
init = tf.global_variables_initializer()
sess = tf.Session()
for i in range(len(x_data)):
x = process_x(x_data[i])[:std_size]
y = word[i][:std_size]
x_split = tf.split(0, time_step_size, x)
outputs, state = tf.nn.rnn(rnn_cell, x_split, state)
prediction = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])
real = tf.reshape(y, [-1])
ratio = tf.ones([time_step_size * batch_size])
loss = tf.nn.seq2seq.sequence_loss_by_example([prediction], [real], [ratio])
cost = tf.reduce_mean(loss)/batch_size
train = tf.train.AdamOptimizer(0.01).minimize(cost)
tf.global_variables_initializer().run(session=sess)
step = 0
print state
while step < 1000:
sess.run(train)
step+=1
result = sess.run(tf.arg_max(prediction, 1))
print result, [t for t in result] == y
tf.get_variable_scope().reuse_variables()
If the source code is like the above, are rnn_cell and state re-initialized at every step of the for loop?
If I want to carry the state across training cases, then I have to reuse it. So rnn_cell and state should be initialized only once at the beginning, not after that.
I can't see how this code is supposed to work.
I think the problem is that you have to separate the computational-graph construction from the session-running part. What you are doing now is not how TensorFlow usually works. Maybe try this:
rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
state = rnn_cell.zero_state(batch_size, tf.float32)
x_split = tf.split(0, time_step_size, x)
outputs, state = tf.nn.rnn(rnn_cell, x_split, state)
prediction = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])
real = tf.reshape(y, [-1])
ratio = tf.ones([time_step_size * batch_size])
loss = tf.nn.seq2seq.sequence_loss_by_example([prediction], [real], [ratio])
cost = tf.reduce_mean(loss)/batch_size
train = tf.train.AdamOptimizer(0.01).minimize(cost)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
for i in range(len(x_data)):
x = process_x(x_data[i])[:std_size]
y = word[i][:std_size]
step = 0
while step < 1000:
sess.run(train, feed_dict={x_split:x, real:y})
step+=1
result = sess.run(tf.arg_max(prediction, 1))
print result, [t for t in result] == y
Your code may have some design problems, but the point is to separate your graph design from your training.
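To make the feed work, the inputs would typically be placeholders defined once at graph-construction time and fed with NumPy data at each step. A minimal sketch in the same pre-1.0 API style as the question; the placeholder names and shapes (including input_dim, the per-step feature size produced by process_x) are assumptions:
x_in = tf.placeholder(tf.float32, [time_step_size, input_dim], name="x_in")  # one input sequence
y_in = tf.placeholder(tf.int64, [time_step_size], name="y_in")               # its target labels
x_split = tf.split(0, time_step_size, x_in)
outputs, state = tf.nn.rnn(rnn_cell, x_split, state)
# ... build prediction, loss, cost and train exactly as above, using y_in instead of y ...
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(len(x_data)):
        sess.run(train, feed_dict={x_in: process_x(x_data[i])[:std_size],
                                   y_in: word[i][:std_size]})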