Normalizing variables after running apply gradients all within the optimizer class

Normalizing variables after running apply gradients all within the optimizer class - python

So my question is how do I normalize the variables after I do gradient descent in the _apply_dense() method of the optimizer class. This is what I currently have.
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
var_update = state_ops.assign_sub(var, lr_t*grad)
normalize = var.assign(tf.norm(var))
return control_flow_ops.group(*[var_update,normalize])
My current code seems to just normalize the original variables without applying the gradient descent update. I know that this is due to the normalize step I have above which is just reassigning the original variables but normalized. How do I correct this so that the gradient descent step is applied and then the normalization is done on the result.

This could be implemented as following:
lr = 0.01
with tf.name_scope('optimizer'):
vars_ = tf.trainable_variables()
grads = tf.gradients(loss_tensor, vars_)
assign_ops = [tf.assign(v, (v - lr*g)) for g, v in zip(grads, vars_)]
with tf.control_dependencies(assign_ops):
vars_norms = [tf.sqrt(2*tf.nn.l2_loss(v)) for v in vars_]
# next line prevents division by zero
vars_norms = [tf.clip_by_value(n, 0.00001, np.inf) for n in vars_norms]
update_ops = [tf.assign(v, v/n) for v, n in zip(vars_, vars_norms)]
update_op = tf.group(update_ops)
Note that if I've added tf.clip_by_value() to prevent division by zero.
Here's a full usage example:
import tensorflow as tf
import numpy as np
x = tf.placeholder(tf.float32, shape=(None, 2))
y = tf.placeholder(tf.int32, shape=(None))
logits = tf.layers.dense(x, 2)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=y, logits=logits)
loss_tensor = tf.reduce_mean(xentropy)
lr = 0.01
with tf.name_scope('optimizer'):
vars_ = tf.trainable_variables()
grads = tf.gradients(loss_tensor, vars_)
assign_ops = [tf.assign(v, (v - lr*g)) for g, v in zip(grads, vars_)]
with tf.control_dependencies(assign_ops):
vars_norms = [tf.sqrt(2*tf.nn.l2_loss(v)) for v in vars_]
# next line prevents division by zero
vars_norms = [tf.clip_by_value(n, 0.00001, np.inf) for n in vars_norms]
update_ops = [tf.assign(v, v/n) for v, n in zip(vars_, vars_norms)]
update_op = tf.group(update_ops)
# dummy data for illustration
x_train = np.random.normal(size=(10, 2))
x_train = np.vstack([x_train, 2*np.random.normal(size=(10, 2))])
y_train = [0 for _ in range(10)] + [1 for _ in range(10)]
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for epoch in range(10):
loss, _ = sess.run([loss_tensor, update_op], feed_dict={x:x_train, y:y_train})
print(loss)
# 0.7111398
# 0.7172677
# 0.71517026
# 0.713101
# 0.71105987
# 0.7090467
# 0.70706147
# 0.7051038
# 0.7031738
# 0.7012712

Related

Pricing american options using deep learning, put instead of max-call

So I'm trying to learn to optimally stop options in a Black-Scholes setting along the lines of the article: "Solving high-dimensional optimal stopping problems using deep learning" by Sebastian Becker, Patrick Cheridito, Arnulf Jentzen, and Timo Welti.
The framework used to price options is the following:
import tensorflow as tf
from tensorflow.python.training.moving_averages import assign_moving_average
def neural_net(x, neurons, is_training, dtype=tf.float32, decay=0.9):
def batch_normalization(y):
shape = y.get_shape().as_list()
y = tf.reshape(y, [-1, shape[1] * shape[2]])
#variables for batch normalization
beta = tf.compat.v1.get_variable(
name='beta', shape=[shape[1] * shape[2]],
dtype=dtype, initializer=tf.zeros_initializer())
gamma = tf.compat.v1.get_variable(
name='gamma', shape=[shape[1] * shape[2]],
dtype=dtype, initializer=tf.ones_initializer())
mv_mean = tf.compat.v1.get_variable(
'mv_mean', [shape[1]*shape[2]],
dtype = dtype, initializer=tf.zeros_initializer(),
trainable = False)
mv_var = tf.compat.v1.get_variable(
'mv_var', [shape[1]*shape[2]],
dtype = dtype, initializer =tf.ones_initializer(),
trainable = False)
mean,variance = tf.nn.moments(y, [0], name = 'moments')
tf.compat.v1.add_to_collection(
tf.compat.v1.GraphKeys.UPDATE_OPS,
assign_moving_average(mv_mean, mean, decay,
zero_debias=True))
tf.compat.v1.add_to_collection(
tf.compat.v1.GraphKeys.UPDATE_OPS,
assign_moving_average(mv_var, variance, decay,
zero_debias=False))
mean, variance = tf.cond(is_training, lambda: (mean, variance),
lambda: (mv_mean, mv_var))
y = tf.nn.batch_normalization(y, mean, variance, beta, gamma, 1e-6)
return tf.reshape(y, [-1, shape[1], shape[2]])
def fc_layer(y, out_size, activation, is_single):
shape = y.get_shape().as_list()
w = tf.compat.v1.get_variable(
name='weights',
shape=[shape[2], shape[1], out_size],
dtype=dtype,
initializer=tf.initializers.glorot_uniform())
y = tf.transpose(tf.matmul(tf.transpose(y, [2, 0, 1]), w),
[1, 2, 0])
if is_single:
b = tf.compat.v1.get_variable(
name='bias',
shape=[out_size, shape[2]],
dtype = dtype,
initializer=tf.zeros_initializer())
return activation(y + b)
return activation(batch_normalization(y))
x = batch_normalization(x)
for i in range(len(neurons)):
with tf.compat.v1.variable_scope('layer_' + str(i)):
x = fc_layer(x, neurons[i],
tf.nn.relu if i < len(neurons) - 1
else tf.nn.sigmoid, False)
return x
#then Deep optimal stopping
def deep_optimal_stopping(x, t, n, g, neurons, batch_size, train_steps,
mc_runs, lr_boundaries, lr_values, beta1=0.9,
beta2=0.999, epsilon=1e-8, decay=0.9):
is_training = tf.compat.v1.placeholder(tf.bool, []) # a variable used to distinguish between training and Monte Carlo simulation, used for batch noralization
p = g(t, x) # we evaluate the payoff for the whole batch at every point in time
nets = neural_net(tf.concat([x[:, :, :-1], p[:, :, :-1]], axis=1),
neurons, is_training, decay=decay)
u_list = [nets[:, :, 0]]
u_sum = u_list[-1]
for k in range(1, n - 1): #range(start, stop)
u_list.append(nets[:, :, k] * (1. - u_sum)) # we build a neural network to approximate the stopping decision at time n*T/N
u_sum += u_list[-1]
#last iteration?
u_list.append(1. - u_sum)
u_stack = tf.concat(u_list, axis=1)
p = tf.squeeze(p, axis=1) #removes dimension of size 1
loss = tf.reduce_mean(tf.reduce_sum(-u_stack * p, axis=1)) #loss function
idx = tf.argmax(tf.cast(tf.cumsum(u_stack, axis=1) + u_stack >= 1,
dtype=tf.uint8), #idx for index?, argmax takes index for largest value
axis=1, output_type=tf.int32)
stopped_payoffs = tf.reduce_mean(
tf.gather_nd(p, tf.stack([tf.range(0, batch_size, dtype=tf.int32),
idx], axis=1))) # this is the approximation of the price for one batch, we will calculate the mean over MC-runs of those numbers
global_step = tf.Variable(0) # a variable used to apply the learning rate schedule, without it the optimizer would not know at which training step we are
learning_rate = tf.compat.v1.train.piecewise_constant(global_step,
lr_boundaries,
lr_values) # this gives us a piecewise constant learning rate, according to the schedule
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate,
beta1=beta1,
beta2=beta2,# define the optimizer, we use Adam with our learning rate schedule and a small tweak of one of its parameters
epsilon=epsilon)
update_ops = tf.compat.v1.get_collection(
tf.compat.v1.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(loss, global_step=global_step)
with tf.compat.v1.Session() as sess:
sess.run(tf.compat.v1.global_variables_initializer())
for _ in range(train_steps):
sess.run(train_op, feed_dict={is_training: True})
px_mean = 0. # value that will hold the price
for _ in range(mc_runs): # loop over the number of MC runs
px_mean += sess.run(stopped_payoffs,
feed_dict={is_training: False})# we stop training, this is used for the batch normalization, from now on we will use the sampled moving averages
return px_mean / mc_runs
Now we define the various variables and simulate paths of a stock as X. Then we run use deep_optimal_stopping function to price the option, defined in the following code
import tensorflow as tf
import numpy as np
import time
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()
T, N, K = 3., 9, 100.
r, delta, beta = 0.05, 0.1, 0.2
batch_size = 800#8192
lr_values = [0.05, 0.005, 0.0005]
mc_runs = 50#500
def g(s, x):
return tf.exp(-r * s) \
* tf.maximum(tf.reduce_max(x, axis=1, keepdims=True) - K, 0.)
_file = open('example_4_4_1_1.csv', 'w')
_file.write('dim, run, mean, time\n')
for d in [2, 3, 5, 10, 20, 30, 50, 100, 200, 500]:
for s_0 in [40.]:#[90., 100., 110.]:
for run in range(5):
tf.compat.v1.reset_default_graph()
t0 = time.time()
neurons = [d + 50, d + 50, 1]
train_steps = 1500 + d
lr_boundaries = [int(500 + d / 5), int(1500 + 3 * d / 5)]
W = tf.cumsum(tf.compat.v1.random_normal(
shape=[batch_size, d, N],
stddev=np.sqrt(T / N)), axis=2)
t = tf.constant(np.linspace(start=T / N, stop=T, num=N,
endpoint=True, dtype=np.float32))
#X = tf.exp((r - delta - beta ** 2 / 2.) * t + beta * W) * s_0
px_mean = deep_optimal_stopping(
W, t, N, g, neurons, batch_size,
train_steps, mc_runs,
lr_boundaries, lr_values, epsilon=0.1)
t1 = time.time()
print("")
_file.write('%i, %i, %f, %f\n' % (d, run, px_mean, t1 - t0))
_file.close()
So here the option is a bermudan max-call defined by the payoff function g(s,x). My understanding would be, if I wanted the price of an American put, I instead changed the payoff function g to be:
def g(s, x):
return tf.exp(-r * s) * tf.maximum(K-x, 0.)
and otherwise changing nothing. But instead of getting a price of 5.31 as reported in their article, I get 4.02.
Can someone explain where I'm going wrong with my understanding of the problem?

Best way to wrap an optimizer in `CrossShardOptimizer`

Let's say I have this code:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import mesh_tensorflow as mtf
import tensorflow.compat.v1 as tf
def get_optimizer(mesh, loss, params, variable_dtype, inp_var_grads=None):
"""Creates and returns an optimizer training op."""
global_step = tf.train.get_or_create_global_step()
learning_rate = tf.constant(value=params["lr"], shape=[], dtype=variable_dtype.slice_dtype)
clip_value = mtf.constant(mesh, params["gradient_clipping"], dtype=variable_dtype.slice_dtype)
if inp_var_grads is None:
var_grads = mtf.gradients([loss], [v.outputs[0] for v in mesh.graph.trainable_variables])
else:
var_grads = inp_var_grads
# Cast to full precision
var_grads_fp = [mtf.cast(v, variable_dtype.slice_dtype) for v in var_grads]
# decrease LR to final lr (lr*0.1) by this step - defaults to train_steps
end_step = params.get("lr_decay_end", params["train_steps"])
if params["lr_decay"] == "linear":
learning_rate = tf.train.polynomial_decay(
learning_rate,
global_step,
end_step,
end_learning_rate=params["lr"]*0.1, # Decrease to 10% of initial LR according to GPT-3 paper
power=1.0,
cycle=False)
elif params["lr_decay"] == "cosine":
learning_rate = tf.train.cosine_decay(
learning_rate,
global_step,
end_step,
alpha=0.1 # Alpha is min lr value as a fraction of init lr.
)
if params["warmup_steps"] > 0:
global_steps_int = tf.cast(global_step, tf.int32)
warmup_steps_int = tf.constant(params["warmup_steps"], dtype=tf.int32)
dtype = variable_dtype.slice_dtype
global_steps_float = tf.cast(global_steps_int, dtype)
warmup_steps_float = tf.cast(warmup_steps_int, dtype)
warmup_percent_done = global_steps_float / warmup_steps_float
warmup_learning_rate = learning_rate * warmup_percent_done
is_warmup = tf.cast(global_steps_int < warmup_steps_int, dtype)
learning_rate = ((1.0 - is_warmup) * learning_rate +
is_warmup * warmup_learning_rate)
learning_rate = mtf.import_fully_replicated(mesh, learning_rate, mtf.Shape([]), name="learning_rate")
mtf.scalar_summary("lr", learning_rate)
if params["opt_name"].lower() == "adam":
optimizer = AdamWeightDecayOptimizer(
learning_rate=learning_rate,
weight_decay_rate=params["weight_decay"],
beta_1=params["beta1"],
beta_2=params["beta2"],
epsilon=params["epsilon"],
exclude_from_weight_decay=["norm", "bias"],
variable_dtype=variable_dtype
)
else:
optimizer = mtf.optimize.AdafactorOptimizer(
learning_rate=params["lr"],
decay_rate=params["weight_decay"],
beta1=params["beta1"],
epsilon1=params["ada_epsilon1"],
epsilon2=params["ada_epsilon2"]
)
if params["use_tpu"]:
optimizer = tf.tpu.CrossShardOptimizer(optimizer)
if params["gradient_clipping"] is not None:
(var_grads_fp, _) = clip_by_global_norm(var_grads_fp, clip_norm=clip_value)
update_ops = optimizer.apply_grads(var_grads_fp, mesh.graph.trainable_variables)
return learning_rate, update_ops, var_grads_fp
class AdamWeightDecayOptimizer(mtf.optimize.Optimizer):
"""A basic Adam optimizer that includes "correct" L2 weight decay."""
def __init__(self,
learning_rate,
weight_decay_rate=0.0,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=None,
variable_dtype=None):
"""Constructs a AdamWeightDecayOptimizer."""
self.learning_rate = learning_rate
self.weight_decay_rate = weight_decay_rate
self.beta_1 = beta_1
self.beta_2 = beta_2
self.epsilon = epsilon
self.exclude_from_weight_decay = exclude_from_weight_decay
self.variable_dtype = variable_dtype
def apply_grad(self, grad, var):
"""See base class."""
if grad is None:
tf.logging.warning("Gradient is None for variable %s" % var.name)
return []
grad = mtf.to_float(grad)
assignments = []
m = mtf.get_variable(
var.mesh, var.name + "/adam_m", var.shape,
initializer=tf.zeros_initializer(),
# master_dtype=self.variable_dtype.master_dtype,
# slice_dtype=self.variable_dtype.slice_dtype,
# activation_dtype=self.variable_dtype.activation_dtype,
trainable=False)
v = mtf.get_variable(
var.mesh, var.name + "/adam_v", var.shape,
initializer=tf.zeros_initializer(),
# master_dtype=self.variable_dtype.master_dtype,
# slice_dtype=self.variable_dtype.slice_dtype,
# activation_dtype=self.variable_dtype.activation_dtype,
trainable=False)
# Standard Adam update.
next_m = self.beta_1 * m + (1.0 - self.beta_1) * grad
next_v = self.beta_2 * v + (1.0 - self.beta_2) * mtf.square(grad)
update = next_m / (mtf.sqrt(next_v) + self.epsilon)
# Just adding the square of the weights to the loss function is *not*
# the correct way of using L2 regularization/weight decay with Adam,
# since that will interact with the m and v parameters in strange ways.
#
# Instead we want to decay the weights in a manner that doesn't interact
# with the m/v parameters. This is equivalent to adding the square
# of the weights to the loss with plain (non-momentum) SGD.
if self._do_use_weight_decay(var.name):
update += mtf.to_float(var.value) * self.weight_decay_rate
update_with_lr = self.learning_rate * update
var_update = mtf.assign_sub(var, update_with_lr)
assignments.extend(
[var_update,
mtf.assign(m, next_m),
mtf.assign(v, next_v)])
return assignments
This code when run results in the following error:
TypeError: CrossShardOptimizer only works with tf.training.Optimizer and not Optimizer_v2. If you are using TPUStrategy, OptimizerV2 will sum gradients across replicas.If you are using TPUEstimator, you may instead sum your gradients with: grads = [tf.compat.v1.tpu.cross_replica_sum(g) for g in grads]. If you want to average your gradients, rescale your loss with: loss /= global_batch_size
So I wonder what's the best way to deal with it? Is there another CrossShardOptimizer wrapper compatible with v2 optimizers? Should I re-write mesh-tensorflow optimizers? Perhaps there is a submodule of tensorflow that already has implements optimizers that are ready to be run on TPUs?

Wrapping an optimizer with CrossShardOptimizer is necessary for porting an Estimator model to a TPUEstimator model, as this handles averaging gradients across TPU shards.
With Mesh, this is a bit different since it takes a SIMD (single instruction, multiple device) philosophy for the TPU implementation. Because of this, you won't see any MTF implementation that uses CrossShardOptimizer, but in actuality a mtf.optimize.Optimizer is supported on TPUs. It just requires SIMD changes rather than optimizer level changes.
In case you haven't seen this yet, here is a Mesh TF example running on MNIST that should help.

Tensorflow Type error when trying to iterate the Tensors in loop

I have the following scenario:
y = tf.placeholder(tf.float32, [None, 1],name="output")
layers = [tf.contrib.rnn.BasicRNNCell(num_units=n_neurons,activation=tf.nn.leaky_relu, name="layer"+str(layer))
for layer in range(2)]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(layers)
rnn_outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)
stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, 100])
stacked_outputs = tf.layers.dense(stacked_rnn_outputs, 1)
outputs = tf.reshape(stacked_outputs, [-1, 2, 1])
outputs = tf.identity(outputs[:,1,:], name="prediction")
loss = Custom_loss(y,outputs)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss,name="training_op")
The custom loss function I tried is:
def Custom_loss(y,outputs):
hold_loss = []
for exp,pred in zip(y,outputs):
if exp >= pred:
result = tf.pow(pred * 0.5,2) - exp
hold_loss.append(result)
else:
hold_loss.append(tf.subtract(pred-exp))
return tf.reduce_mean(hold_loss)
Now when I am trying to implement this I am getting the following error:
TypeError: Tensor objects are only iterable when eager execution is enabled. To iterate over this tensor use tf.map_fn.
I have tried implementing the tf.map_fn() but there is the same error I encounter. I have used the following question:
How to explain the result of tf.map_fn?
Kindly, help me get through this issue? How I can iterate the tensor? What way is best for the custom loss function implementation?

def Custom_loss(y,outputs):
mask = tf.greater_equal(y, outputs)
a = tf.pow(tf.boolean_mask(outputs, mask)*0.5, 2) - tf.boolean_mask(y, mask)
inv_mask = tf.logical_not(mask)
b = tf.boolean_mask(outputs, inv_mask)- tf.boolean_mask(y, inv_mask)
return tf.reduce_mean(tf.concat([a, b], axis=-1))
Test case
def Custom_loss_np(y,outputs):
hold_loss = []
for exp,pred in zip(y,outputs):
if exp >= pred:
result = pow(pred * 0.5,2) - exp
hold_loss.append(result)
else:
hold_loss.append(pred-exp)
return np.mean(hold_loss)
np_x = np.random.randn(100)
np_y = np.random.randn(100)
x = tf.constant(np_x)
y = tf.constant(np_y)
with tf.Session() as sess:
assert sess.run(Custom_loss(x, y)) == Custom_loss_np(np_x, np_y)
Use tf.math if you are in latest versoin of tensorflow.
Example using the custom loss to train a simple linear regression model
X = tf.placeholder(tf.float32,[None,1])
y = tf.placeholder(tf.float32,[None,1])
w = tf.Variable(tf.ones([1,1]))
b = tf.Variable(tf.ones([1,1]))
y_ = tf.matmul(X, w)+b
loss = Custom_loss(y, y_) #tf.reduce_mean(tf.square(y_ - y))
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
training_op = optimizer.minimize(loss,name="training_op")
#dummy data for linear regression
x_data = np.random.randn(100,1)
y_labels = 1.5*x_data + 2.5 + np.random.randn(100,1)
init = tf.global_variables_initializer()
sess.run(init)
sess = tf.Session()
sess.run(init)
for i in range(5000):
_, loss_ = sess.run([training_op,loss], feed_dict={X:x_data, y:y_labels})
if (i+1)%1000 == 0 :
print (loss_)
print (sess.run([w, b]))
The logic for calculating the loss is something OP have come up with.

LSTM won't overfit training data

I have been trying to use an LSTM for regression in TensorFlow, but it doesn't fit the data. I have successfully fit the same data in Keras (with the same size network). My code for trying to overfit a sine wave is below:
import tensorflow as tf
import numpy as np
yt = np.cos(np.linspace(0, 2*np.pi, 256))
xt = np.array([yt[i-50:i] for i in range(50, len(yt))])[...,None]
yt = yt[-xt.shape[0]:]
g = tf.Graph()
with g.as_default():
x = tf.constant(xt, dtype=tf.float32)
y = tf.constant(yt, dtype=tf.float32)
lstm = tf.nn.rnn_cell.BasicLSTMCell(32)
outputs, state = tf.nn.dynamic_rnn(lstm, x, dtype=tf.float32)
pred = tf.layers.dense(outputs[:,-1], 1)
loss = tf.reduce_mean(tf.square(pred-y))
train_op = tf.train.AdamOptimizer().minimize(loss)
init = tf.global_variables_initializer()
sess = tf.InteractiveSession(graph=g)
sess.run(init)
for i in range(200):
_, l = sess.run([train_op, loss])
print(l)
This results in a MSE of 0.436067 (while Keras got to 0.0022 after 50 epochs), and the predictions range from -0.1860 to -0.1798. What am I doing wrong here?
Edit:
When I change my loss function to the following, the model fits properly:
def pinball(y_true, y_pred):
tau = np.arange(1,100).reshape(1,-1)/100
pin = tf.reduce_mean(tf.maximum(y_true[:,None] - y_pred, 0) * tau +
tf.maximum(y_pred - y_true[:,None], 0) * (1 - tau))
return pin
I also change the assignments of pred and loss to
pred = tf.layers.dense(outputs[:,-1], 99)
loss = pinball(y, pred)
This results in a decrease of loss from 0.3 to 0.003 as it trains, and seems to properly fit the data.

Looks like a shape/broadcasting issue. Here's a working version:
import tensorflow as tf
import numpy as np
yt = np.cos(np.linspace(0, 2*np.pi, 256))
xt = np.array([yt[i-50:i] for i in range(50, len(yt))])
yt = yt[-xt.shape[0]:]
g = tf.Graph()
with g.as_default():
x = tf.constant(xt, dtype=tf.float32)
y = tf.constant(yt, dtype=tf.float32)
lstm = tf.nn.rnn_cell.BasicLSTMCell(32)
outputs, state = tf.nn.dynamic_rnn(lstm, x[None, ...], dtype=tf.float32)
pred = tf.squeeze(tf.layers.dense(outputs, 1), axis=[0, 2])
loss = tf.reduce_mean(tf.square(pred-y))
train_op = tf.train.AdamOptimizer().minimize(loss)
init = tf.global_variables_initializer()
sess = tf.InteractiveSession(graph=g)
sess.run(init)
for i in range(200):
_, l = sess.run([train_op, loss])
print(l)
x gets a batch dimension of 1 before going into dynamic_rnn, since with time_major=False the first dimension is expected to be a batch dimension. It's important that the last dimension of the output of tf.layers.dense get squeezed off so that it doesn't broadcast with y (TensorShape([256, 1]) and TensorShape([256]) broadcast to TensorShape([256, 256])). With those fixes it converges:
5.78507e-05

You are not passing-on the state from one call of dynamic_rnn to next. That's the problem for sure.
Also, why take only last item of the output through the dense layer and onward?

tf.where causes optimiser to fail in tensorflow

I want to check if I can solve this problem with tensorflow instead of pymc3. The experimental idea is that I am going to define a probibalistic system that contains a switchpoint. I can use sampling as a method of inference but I started wondering why I couldn't just do this with a gradient descent instead.
I decided to do the gradient search in tensorflow but it seems like tensorflow is having a hard time performing a gradient search when tf.where is involved.
You can find the code below.
import tensorflow as tf
import numpy as np
x1 = np.random.randn(50)+1
x2 = np.random.randn(50)*2 + 5
x_all = np.hstack([x1, x2])
len_x = len(x_all)
time_all = np.arange(1, len_x + 1)
mu1 = tf.Variable(0, name="mu1", dtype=tf.float32)
mu2 = tf.Variable(5, name = "mu2", dtype=tf.float32)
sigma1 = tf.Variable(2, name = "sigma1", dtype=tf.float32)
sigma2 = tf.Variable(2, name = "sigma2", dtype=tf.float32)
tau = tf.Variable(10, name = "tau", dtype=tf.float32)
mu = tf.where(time_all < tau,
tf.ones(shape=(len_x,), dtype=tf.float32) * mu1,
tf.ones(shape=(len_x,), dtype=tf.float32) * mu2)
sigma = tf.where(time_all < tau,
tf.ones(shape=(len_x,), dtype=tf.float32) * sigma1,
tf.ones(shape=(len_x,), dtype=tf.float32) * sigma2)
likelihood_arr = tf.log(tf.sqrt(1/(2*np.pi*tf.pow(sigma, 2)))) -tf.pow(x_all - mu, 2)/(2*tf.pow(sigma, 2))
total_likelihood = tf.reduce_sum(likelihood_arr, name="total_likelihood")
optimizer = tf.train.RMSPropOptimizer(0.01)
opt_task = optimizer.minimize(-total_likelihood)
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
print("these variables should be trainable: {}".format([_.name for _ in tf.trainable_variables()]))
for step in range(10000):
_lik, _ = sess.run([total_likelihood, opt_task])
if step % 1000 == 0:
variables = {_.name:_.eval() for _ in [mu1, mu2, sigma1, sigma2, tau]}
print("step: {}, values: {}".format(str(step).zfill(4), variables))
You'll notice that the tau parameter does not change even though tensorflow seems to be aware of the variable and it's gradient. Any clue on what is going wrong? Is this something that can be calculated in tensorflow or do I need a different pattern?

tau is only used in the condition argument to where: (tf.where(time_all < tau, ...) , which is a boolean tensor. Since calculating gradients only makes sense for continuous values, the gradient of the output with respect to tau will be zero.
Even ignoring tf.where, you used tau in the expression time_all < tau, which is constant almost everywhere, so has a gradient of zero.
Due to the gradient of zero, there is no way to learn tau with gradient descent methods.
Depending on your problem, maybe instead of a hard switch between two values, you can use a weighted sum instead p*val1 + (1-p)*val2, where p depends on tau in a continuous manner.

The assigned solution is the correct answer, but doesn't contain the code solution to my problem. The following snippet does;
import tensorflow as tf
import numpy as np
import os
import uuid
TENSORBOARD_PATH = "/tmp/tensorboard-switchpoint"
# tensorboard --logdir=/tmp/tensorboard-switchpoint
x1 = np.random.randn(35)-1
x2 = np.random.randn(35)*2 + 5
x_all = np.hstack([x1, x2])
len_x = len(x_all)
time_all = np.arange(1, len_x + 1)
mu1 = tf.Variable(0, name="mu1", dtype=tf.float32)
mu2 = tf.Variable(0, name = "mu2", dtype=tf.float32)
sigma1 = tf.Variable(2, name = "sigma1", dtype=tf.float32)
sigma2 = tf.Variable(2, name = "sigma2", dtype=tf.float32)
tau = tf.Variable(15, name = "tau", dtype=tf.float32)
switch = 1./(1+tf.exp(tf.pow(time_all - tau, 1)))
mu = switch*mu1 + (1-switch)*mu2
sigma = switch*sigma1 + (1-switch)*sigma2
likelihood_arr = tf.log(tf.sqrt(1/(2*np.pi*tf.pow(sigma, 2)))) - tf.pow(x_all - mu, 2)/(2*tf.pow(sigma, 2))
total_likelihood = tf.reduce_sum(likelihood_arr, name="total_likelihood")
optimizer = tf.train.AdamOptimizer()
opt_task = optimizer.minimize(-total_likelihood)
init = tf.global_variables_initializer()
tf.summary.scalar("mu1", mu1)
tf.summary.scalar("mu2", mu2)
tf.summary.scalar("sigma1", sigma1)
tf.summary.scalar("sigma2", sigma2)
tf.summary.scalar("tau", tau)
tf.summary.scalar("likelihood", total_likelihood)
merged_summary_op = tf.summary.merge_all()
with tf.Session() as sess:
sess.run(init)
print("these variables should be trainable: {}".format([_.name for _ in tf.trainable_variables()]))
uniq_id = os.path.join(TENSORBOARD_PATH, "switchpoint-" + uuid.uuid1().__str__()[:4])
summary_writer = tf.summary.FileWriter(uniq_id, graph=tf.get_default_graph())
for step in range(40000):
lik, opt, summary = sess.run([total_likelihood, opt_task, merged_summary_op])
if step % 100 == 0:
variables = {_.name:_.eval() for _ in [total_likelihood]}
summary_writer.add_summary(summary, step)
print("i{}: {}".format(str(step).zfill(5), variables))

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Normalizing variables after running apply gradients all within the optimizer class - python

Related

Pricing american options using deep learning, put instead of max-call

Best way to wrap an optimizer in `CrossShardOptimizer`

Tensorflow Type error when trying to iterate the Tensors in loop

LSTM won't overfit training data

tf.where causes optimiser to fail in tensorflow

Categories

Resources