I want to check whether I can solve this problem with TensorFlow instead of PyMC3. The experimental idea is to define a probabilistic system that contains a switchpoint. I can use sampling as a method of inference, but I started wondering why I couldn't just do this with gradient descent instead.
I decided to do the gradient search in tensorflow but it seems like tensorflow is having a hard time performing a gradient search when tf.where is involved.
You can find the code below.
import tensorflow as tf
import numpy as np
# Synthetic series with one switchpoint: 50 draws centred on 1 (std 1)
# followed by 50 draws centred on 5 (std 2).
x1 = np.random.randn(50) + 1
x2 = np.random.randn(50) * 2 + 5
x_all = np.hstack([x1, x2])
len_x = len(x_all)
time_all = np.arange(1, len_x + 1)

# Parameters of the two regimes and the switchpoint position.
mu1 = tf.Variable(0, name="mu1", dtype=tf.float32)
mu2 = tf.Variable(5, name="mu2", dtype=tf.float32)
sigma1 = tf.Variable(2, name="sigma1", dtype=tf.float32)
sigma2 = tf.Variable(2, name="sigma2", dtype=tf.float32)
tau = tf.Variable(10, name="tau", dtype=tf.float32)

# Hard switch: tau is only used inside the boolean condition, so the
# likelihood is piecewise constant in tau and the optimizer never moves it.
# This is the behaviour the question is about; it is kept on purpose.
mu = tf.where(time_all < tau,
              tf.ones(shape=(len_x,), dtype=tf.float32) * mu1,
              tf.ones(shape=(len_x,), dtype=tf.float32) * mu2)
sigma = tf.where(time_all < tau,
                 tf.ones(shape=(len_x,), dtype=tf.float32) * sigma1,
                 tf.ones(shape=(len_x,), dtype=tf.float32) * sigma2)

# Element-wise Gaussian log-density, summed over the series.
likelihood_arr = tf.log(tf.sqrt(1/(2*np.pi*tf.pow(sigma, 2)))) - tf.pow(x_all - mu, 2)/(2*tf.pow(sigma, 2))
total_likelihood = tf.reduce_sum(likelihood_arr, name="total_likelihood")

# Maximise the log-likelihood by minimising its negative.
optimizer = tf.train.RMSPropOptimizer(0.01)
opt_task = optimizer.minimize(-total_likelihood)
init = tf.global_variables_initializer()

with tf.Session() as sess:
    # Variables must be initialised before any op that reads them is run.
    sess.run(init)
    print("these variables should be trainable: {}".format([v.name for v in tf.trainable_variables()]))
    for step in range(10000):
        _lik, _ = sess.run([total_likelihood, opt_task])
        if step % 1000 == 0:
            variables = {v.name: sess.run(v) for v in [mu1, mu2, sigma1, sigma2, tau]}
            print("step: {}, values: {}".format(str(step).zfill(4), variables))
You'll notice that the tau parameter does not change even though TensorFlow seems to be aware of the variable and its gradient. Any clue on what is going wrong? Is this something that can be calculated in TensorFlow, or do I need a different pattern?

tau is only used in the condition argument to tf.where (tf.where(time_all < tau, ...)), which is a boolean tensor. Since calculating gradients only makes sense for continuous values, the gradient of the output with respect to tau will be zero.
Even ignoring tf.where, you used tau in the expression time_all < tau, which is constant almost everywhere, so has a gradient of zero.
Due to the gradient of zero, there is no way to learn tau with gradient descent methods.
Depending on your problem, instead of a hard switch between two values you can use a weighted sum, p*val1 + (1-p)*val2, where p depends on tau in a continuous manner (for example, a sigmoid of the distance between the time index and tau).

The accepted answer is correct, but it doesn't contain the code that solves my problem. The following snippet does:
import tensorflow as tf
import numpy as np
import os
import uuid
TENSORBOARD_PATH = "/tmp/tensorboard-switchpoint"
# tensorboard --logdir=/tmp/tensorboard-switchpoint

# Synthetic series with one switchpoint: 35 draws centred on -1 (std 1)
# followed by 35 draws centred on 5 (std 2).
x1 = np.random.randn(35) - 1
x2 = np.random.randn(35) * 2 + 5
x_all = np.hstack([x1, x2])
len_x = len(x_all)
time_all = np.arange(1, len_x + 1)

mu1 = tf.Variable(0, name="mu1", dtype=tf.float32)
mu2 = tf.Variable(0, name="mu2", dtype=tf.float32)
sigma1 = tf.Variable(2, name="sigma1", dtype=tf.float32)
sigma2 = tf.Variable(2, name="sigma2", dtype=tf.float32)
tau = tf.Variable(15, name="tau", dtype=tf.float32)

# Soft switch: a logistic weight that is close to 1 well before tau and close
# to 0 well after it. Unlike a boolean mask it is differentiable in tau.
switch = 1./(1+tf.exp(tf.pow(time_all - tau, 1)))
mu = switch*mu1 + (1-switch)*mu2
sigma = switch*sigma1 + (1-switch)*sigma2

# Element-wise Gaussian log-density, summed over the series.
likelihood_arr = tf.log(tf.sqrt(1/(2*np.pi*tf.pow(sigma, 2)))) - tf.pow(x_all - mu, 2)/(2*tf.pow(sigma, 2))
total_likelihood = tf.reduce_sum(likelihood_arr, name="total_likelihood")

optimizer = tf.train.AdamOptimizer()
opt_task = optimizer.minimize(-total_likelihood)
init = tf.global_variables_initializer()

# Scalars tracked in TensorBoard.
tf.summary.scalar("mu1", mu1)
tf.summary.scalar("mu2", mu2)
tf.summary.scalar("sigma1", sigma1)
tf.summary.scalar("sigma2", sigma2)
tf.summary.scalar("tau", tau)
tf.summary.scalar("likelihood", total_likelihood)
merged_summary_op = tf.summary.merge_all()

with tf.Session() as sess:
    # Variables must be initialised before any op that reads them is run.
    sess.run(init)
    print("these variables should be trainable: {}".format([v.name for v in tf.trainable_variables()]))
    # A short random suffix keeps separate runs apart in the log directory.
    uniq_id = os.path.join(TENSORBOARD_PATH, "switchpoint-" + str(uuid.uuid1())[:4])
    summary_writer = tf.summary.FileWriter(uniq_id, graph=tf.get_default_graph())
    for step in range(40000):
        lik, opt, summary = sess.run([total_likelihood, opt_task, merged_summary_op])
        if step % 100 == 0:
            variables = {v.name: sess.run(v) for v in [total_likelihood]}
            summary_writer.add_summary(summary, step)
            print("i{}: {}".format(str(step).zfill(5), variables))
    # Flush pending events to disk.
    summary_writer.close()


Pricing american options using deep learning, put instead of max-call

So I'm trying to learn to optimally stop options in a Black-Scholes setting along the lines of the article: "Solving high-dimensional optimal stopping problems using deep learning" by Sebastian Becker, Patrick Cheridito, Arnulf Jentzen, and Timo Welti.
The framework used to price options is the following:
import tensorflow as tf
from tensorflow.python.training.moving_averages import assign_moving_average
def neural_net(x, neurons, is_training, dtype=tf.float32, decay=0.9):
    """Build a stack of batch-normalised fully connected layers.

    NOTE(review): in the copy under review the moving-average updates and the
    ``get_variable`` calls inside ``fc_layer`` were truncated. They are
    restored here in their most plausible form; confirm the variable names
    and the ``zero_debias`` flags against the reference implementation.

    Args:
        x: rank-3 tensor whose last two static dimensions are known; they are
            used to size the created variables.
        neurons: output width of each layer, in order.
        is_training: scalar boolean tensor. When true the batch moments are
            used for normalisation, otherwise the stored moving averages.
        dtype: dtype of every created variable.
        decay: decay rate of the moving averages.

    Returns:
        The output of the last layer. Hidden layers use ReLU and the last
        layer uses a sigmoid.
    """

    def batch_normalization(y):
        # Flatten the last two axes so each (row, column) pair gets its own
        # scale, offset and running statistics.
        shape = y.get_shape().as_list()
        flat = shape[1] * shape[2]
        y = tf.reshape(y, [-1, flat])
        # variables for batch normalization
        beta = tf.compat.v1.get_variable(
            name='beta', shape=[flat],
            dtype=dtype, initializer=tf.zeros_initializer())
        gamma = tf.compat.v1.get_variable(
            name='gamma', shape=[flat],
            dtype=dtype, initializer=tf.ones_initializer())
        mv_mean = tf.compat.v1.get_variable(
            'mv_mean', [flat],
            dtype=dtype, initializer=tf.zeros_initializer(),
            trainable=False)
        mv_var = tf.compat.v1.get_variable(
            'mv_var', [flat],
            dtype=dtype, initializer=tf.ones_initializer(),
            trainable=False)
        mean, variance = tf.nn.moments(y, [0], name='moments')
        # Register the running-statistics updates so the training op can
        # depend on them through the UPDATE_OPS collection.
        # NOTE(review): zero_debias values restored from memory -- confirm.
        tf.compat.v1.add_to_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS,
            assign_moving_average(mv_mean, mean, decay,
                                  zero_debias=True))
        tf.compat.v1.add_to_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS,
            assign_moving_average(mv_var, variance, decay,
                                  zero_debias=False))
        mean, variance = tf.cond(is_training, lambda: (mean, variance),
                                 lambda: (mv_mean, mv_var))
        y = tf.nn.batch_normalization(y, mean, variance, beta, gamma, 1e-6)
        return tf.reshape(y, [-1, shape[1], shape[2]])

    def fc_layer(y, out_size, activation, is_single):
        shape = y.get_shape().as_list()
        # One weight matrix per index of the last axis. No initializer is
        # passed, so get_variable falls back to its default.
        # NOTE(review): the variable name and the absence of an explicit
        # initializer are reconstructions -- confirm.
        w = tf.compat.v1.get_variable(
            name='weights',
            shape=[shape[2], shape[1], out_size],
            dtype=dtype)
        # (batch, in, k) -> (k, batch, in) @ (k, in, out) -> (batch, out, k)
        y = tf.transpose(tf.matmul(tf.transpose(y, [2, 0, 1]), w),
                         [1, 2, 0])
        if is_single:
            # NOTE(review): bias name and initializer are reconstructions.
            b = tf.compat.v1.get_variable(
                name='bias',
                shape=[out_size, shape[2]],
                dtype=dtype,
                initializer=tf.zeros_initializer())
            return activation(y + b)
        return activation(batch_normalization(y))

    x = batch_normalization(x)
    last = len(neurons) - 1
    for i, width in enumerate(neurons):
        # A scope per layer keeps the fixed variable names from colliding.
        with tf.compat.v1.variable_scope('layer_' + str(i)):
            x = fc_layer(x, width,
                         tf.nn.relu if i < last else tf.nn.sigmoid,
                         False)
    return x
#then Deep optimal stopping
def deep_optimal_stopping(x, t, n, g, neurons, batch_size, train_steps,
                          mc_runs, lr_boundaries, lr_values, beta1=0.9,
                          beta2=0.999, epsilon=1e-8, decay=0.9):
    """Train stopping-decision networks and return a Monte Carlo price estimate.

    NOTE(review): the learning-rate schedule, the optimizer construction, the
    UPDATE_OPS lookup and every ``sess.run`` call were truncated in the copy
    under review. They are restored here from the otherwise unused parameters
    (``lr_boundaries``, ``beta1``, ``epsilon``) and the surrounding comments;
    confirm against the reference implementation.

    Args:
        x: tensor of simulated paths, indexed as ``x[:, :, k]`` over time.
        t: tensor of time points passed to ``g``.
        n: number of time points.
        g: payoff function called as ``g(t, x)``.
        neurons: layer widths forwarded to ``neural_net``.
        batch_size: number of paths per batch; used to index the payoffs.
        train_steps: number of training iterations.
        mc_runs: number of evaluation batches averaged into the result.
        lr_boundaries: step boundaries of the piecewise constant schedule.
        lr_values: learning rate used on each piece of the schedule.
        beta1, beta2, epsilon: Adam hyperparameters.
        decay: moving-average decay forwarded to ``neural_net``.

    Returns:
        The mean over ``mc_runs`` batches of the average stopped payoff.
    """
    # Distinguishes training from Monte Carlo evaluation for batch norm.
    is_training = tf.compat.v1.placeholder(tf.bool, [])
    # Payoff for the whole batch at every point in time.
    p = g(t, x)
    # The last time point is left out of the network input: the remaining
    # probability mass is assigned to it below.
    nets = neural_net(tf.concat([x[:, :, :-1], p[:, :, :-1]], axis=1),
                      neurons, is_training, decay=decay)

    # u_list[k] is the weight put on stopping at time k; it is the network
    # output scaled by the mass not yet used at earlier times.
    u_list = [nets[:, :, 0]]
    u_sum = u_list[-1]
    for k in range(1, n - 1):
        u_list.append(nets[:, :, k] * (1. - u_sum))
        u_sum += u_list[-1]
    # Whatever mass is left goes to the final time point.
    u_list.append(1. - u_sum)
    u_stack = tf.concat(u_list, axis=1)
    p = tf.squeeze(p, axis=1)  # drop the size-1 axis
    # Maximise the weighted payoff by minimising its negative.
    loss = tf.reduce_mean(tf.reduce_sum(-u_stack * p, axis=1))

    # Index of the first time at which cumsum(u) + u reaches 1.
    idx = tf.argmax(tf.cast(tf.cumsum(u_stack, axis=1) + u_stack >= 1,
                            dtype=tf.uint8),
                    axis=1, output_type=tf.int32)
    # Price approximation for one batch; averaged over mc_runs below.
    stopped_payoffs = tf.reduce_mean(
        tf.gather_nd(p, tf.stack([tf.range(0, batch_size, dtype=tf.int32),
                                  idx], axis=1)))

    # Step counter that drives the learning-rate schedule.
    global_step = tf.Variable(0)
    learning_rate = tf.compat.v1.train.piecewise_constant(global_step,
                                                          lr_boundaries,
                                                          lr_values)
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate,
                                                 beta1=beta1,
                                                 beta2=beta2,
                                                 epsilon=epsilon)
    # Run the batch-norm moving-average updates with every training step.
    update_ops = tf.compat.v1.get_collection(
        tf.compat.v1.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(loss, global_step=global_step)

    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        for _ in range(train_steps):
            sess.run(train_op, feed_dict={is_training: True})
        px_mean = 0.  # accumulates the price over the Monte Carlo runs
        for _ in range(mc_runs):
            # Evaluation uses the stored moving averages, not batch moments.
            px_mean += sess.run(stopped_payoffs,
                                feed_dict={is_training: False})
    return px_mean / mc_runs
Now we define the various variables and simulate paths of a stock as X. Then we use the deep_optimal_stopping function to price the option, as defined in the following code:
import tensorflow as tf
import numpy as np
import time
from tensorflow.python.framework.ops import disable_eager_execution
# placeholder/Session based code requires graph mode when running on TF 2.x.
disable_eager_execution()

# Maturity, number of exercise dates and strike.
T, N, K = 3., 9, 100.
r, delta, beta = 0.05, 0.1, 0.2
batch_size = 800  # 8192
lr_values = [0.05, 0.005, 0.0005]
mc_runs = 50  # 500


def g(s, x):
    """Return the discounted max-call payoff at times ``s`` for paths ``x``.

    The maximum is taken over axis 1 of ``x`` and that axis is kept with
    size 1, so the result still has three axes.
    """
    return tf.exp(-r * s) \
        * tf.maximum(tf.reduce_max(x, axis=1, keepdims=True) - K, 0.)


# The context manager closes the CSV file even if a run fails part-way.
with open('example_4_4_1_1.csv', 'w') as _file:
    _file.write('dim, run, mean, time\n')
    for d in [2, 3, 5, 10, 20, 30, 50, 100, 200, 500]:
        for s_0 in [40.]:  # [90., 100., 110.]
            for run in range(5):
                # Each run creates variables with fixed names through
                # get_variable; start from an empty graph so that a second
                # run does not collide with the first.
                tf.compat.v1.reset_default_graph()
                t0 = time.time()
                neurons = [d + 50, d + 50, 1]
                train_steps = 1500 + d
                lr_boundaries = [int(500 + d / 5), int(1500 + 3 * d / 5)]
                W = tf.cumsum(tf.compat.v1.random_normal(
                    shape=[batch_size, d, N],
                    stddev=np.sqrt(T / N)), axis=2)
                t = tf.constant(np.linspace(start=T / N, stop=T, num=N,
                                            endpoint=True, dtype=np.float32))
                # NOTE(review): the price process below is commented out and
                # the raw Brownian paths W are passed to the solver instead,
                # so s_0, delta and beta are currently unused. Confirm that
                # this is intended.
                #X = tf.exp((r - delta - beta ** 2 / 2.) * t + beta * W) * s_0
                px_mean = deep_optimal_stopping(
                    W, t, N, g, neurons, batch_size,
                    train_steps, mc_runs,
                    lr_boundaries, lr_values, epsilon=0.1)
                t1 = time.time()
                _file.write('%i, %i, %f, %f\n' % (d, run, px_mean, t1 - t0))
So here the option is a Bermudan max-call defined by the payoff function g(s, x). My understanding is that, if I wanted the price of an American put, I would instead change the payoff function g to be:
def g(s, x):
    """Return the discounted put payoff ``exp(-r*s) * max(K - x, 0)``."""
    discount = tf.exp(-r * s)
    intrinsic = tf.maximum(K - x, 0.)
    return discount * intrinsic
and otherwise changing nothing. But instead of getting a price of 5.31 as reported in their article, I get 4.02.
Can someone explain where I'm going wrong with my understanding of the problem?

Problem reproducing the predicted covariance of a gaussian process using gpytorch with same hyperparameters

I need to build a function that gives the a posteriori covariance of a Gaussian process. The idea is to train a GP using GPyTorch, then take the learned hyperparameters and pass them into my own kernel function (for several reasons I can't use GPyTorch directly).
Now the problem is that I can't reproduce the prediction. Here is the code I wrote. I have been working on it the whole day but I can't find the problem. Do you know what I am doing wrong?
from gpytorch.mlls import ExactMarginalLogLikelihood
import numpy as np
import gpytorch
import torch
# 50 training inputs: an even grid on [0, 0.95] with uniform jitter of up to 0.05.
train_x1 = torch.linspace(0, 0.95, 50) + 0.05 * torch.rand(50)
# Targets: one period of a sine wave plus Gaussian noise with std 0.2.
train_y1 = torch.sin(train_x1 * (2 * np.pi)) + 0.2 * torch.randn_like(train_x1)
# Number of training points; used below to size the covariance matrices.
n_datapoints = train_x1.shape[0]
def kernel_rbf(x1, x2, c, l):
    """Evaluate a scaled RBF kernel, ``c * exp(-|x1 - x2|^2 / (2 * l^2))``.

    Args:
        x1, x2: NumPy scalars, Python numbers or arrays. Zero-dimensional
            inputs are promoted to shape ``(1, 1)``; arrays are used as given.
        c: output scale that multiplies the exponential.
        l: lengthscale.

    Returns:
        ``c * exp(-(x1 - x2).T @ (x1 - x2) / (2 * l**2))``. For scalar inputs
        this is a ``(1, 1)`` array; for two 1-D vectors it is a scalar.
    """
    # `shape is ()` compared identity with a tuple literal, which is only
    # true by interpreter accident. Test the number of dimensions instead.
    # asarray also lets plain Python numbers through.
    x1 = np.asarray(x1)
    x2 = np.asarray(x2)
    if x1.ndim == 0:
        x1 = np.atleast_2d(x1)
    if x2.ndim == 0:
        x2 = np.atleast_2d(x2)
    diff = x1 - x2
    return c * np.exp(-np.matmul(diff.T, diff) / (2 * l ** 2))
class ExactGPModel(gpytorch.models.ExactGP):
    """Exact GP with a constant mean and a scaled RBF kernel.

    NOTE(review): the ``ScaleKernel(...)`` call was truncated in the copy
    under review. It is restored as a scaled RBF kernel that uses the two
    priors defined just above it, which matches the later reads of
    ``covar_module.outputscale`` and ``covar_module.base_kernel.lengthscale``.
    Confirm the base kernel and the prior wiring.
    """

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        lengthscale_prior = gpytorch.priors.GammaPrior(3.0, 6.0)
        outputscale_prior = gpytorch.priors.GammaPrior(2.0, 0.15)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RBFKernel(lengthscale_prior=lengthscale_prior),
            outputscale_prior=outputscale_prior,
        )

    def forward(self, x):
        """Return the prior distribution of the GP evaluated at ``x``."""
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = ExactGPModel(train_x1, train_y1, likelihood)

# Find optimal model hyperparameters
model.train()
likelihood.train()
mll = ExactMarginalLogLikelihood(likelihood, model)
# Use the Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # Includes GaussianLikelihood parameters
training_iterations = 50
for i in range(training_iterations):
    # Without zero_grad/backward/step the hyperparameters never change.
    optimizer.zero_grad()
    output = model(*model.train_inputs)
    loss = -mll(output, model.train_targets)
    loss.backward()
    print('Iter %d/%d - Loss: %.3f' % (i + 1, training_iterations, loss.item()))
    optimizer.step()

# Get the learned hyperparameters
outputscale = model.covar_module.outputscale.item()
lengthscale = model.covar_module.base_kernel.lengthscale.item()
noise = likelihood.noise_covar.noise.item()
train_x1 = train_x1.numpy()
train_y1 = train_y1.numpy()

# Get covariance train points
K = np.zeros((n_datapoints, n_datapoints))
for i in range(n_datapoints):
    for j in range(n_datapoints):
        # kernel_rbf returns a (1, 1) array for scalar inputs.
        K[i, j] = np.asarray(
            kernel_rbf(train_x1[i], train_x1[j], outputscale, lengthscale)).item()
# Add noise. The likelihood's noise is already a variance, so it is not squared.
K += noise * np.eye(n_datapoints)

# Get covariance train-test points
x_test = torch.rand(1, 1)
Ks = np.zeros((n_datapoints, 1))
for i in range(n_datapoints):
    Ks[i, 0] = np.asarray(
        kernel_rbf(train_x1[i], x_test.numpy(), outputscale, lengthscale)).item()

# Get variance test points. likelihood(model(x)) predicts the noisy
# observation, so the observation noise is part of the test variance too.
Kss = kernel_rbf(x_test.numpy(), x_test.numpy(), outputscale, lengthscale) + noise
L = np.linalg.cholesky(K)
v = np.linalg.solve(L, Ks)
var = Kss - np.matmul(v.T, v)

# Switch to posterior (prediction) mode before evaluating at new inputs.
model.eval()
likelihood.eval()
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    y_preds = likelihood(model(x_test))
print(f"Predicted variance with gpytorch:{y_preds.variance.item()}")
print(f"Predicted variance with my kernel:{var}")
I found the errors:
The noise is not squared: the likelihood's noise is already a variance, so it is K += noise * np.eye(n_datapoints) and not K += noise**2 * np.eye(n_datapoints).
I forgot to add the noise term to $K_{**}$, i.e. Kss += noise.

In a neural network, how does a gradient get calculated by matrix multiplication? Why?

This is not a question for a specific problem I am trying to solve. I am just trying to understand why a gradient is calculated by multiplying the layers (matrices) in a mostly backward fashion. I also didn't know subtracting y from the prediction could also give you something called a gradient.
grad_y_pred = 2.0 * (y_pred - y)
grad_w2 = h_relu.T.dot(grad_y_pred)
I don't know what I thought Pytorch was doing finding the gradients. I figured it was some kind of algorithm that did the power rule and followed other derivative rules somehow.
import numpy as np
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)                  # (N, H) pre-activations
    h_relu = np.maximum(h, 0)      # ReLU
    y_pred = h_relu.dot(w2)        # (N, D_out)

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss.
    # Each line is the chain rule applied to one step of the forward pass;
    # the matrix products sum the per-sample contributions over the batch.
    grad_y_pred = 2.0 * (y_pred - y)        # d loss / d y_pred
    grad_w2 = h_relu.T.dot(grad_y_pred)     # (H, D_out)
    grad_h_relu = grad_y_pred.dot(w2.T)     # (N, H)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0                       # ReLU passes no gradient where h < 0
    grad_w1 = x.T.dot(grad_h)               # (D_in, H)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

TensorFlow simple example help - custom gradient

How do you pass a custom gradient into a gradient optimization function in TensorFlow?
I have illustrated what I am trying to do, with a simple example (trying to minimize z = 2x^2 + y^2 + 2).
I have been looking at:
The problem seems to work if you pass in optimizer = tf.train.GradientDescentOptimizer(0.55) and train = optimizer.minimize(z)
This code works:
import tensorflow as tf
x = tf.Variable(11, name='x', dtype=tf.float32)
# The second variable was also created with name='x'; give it its own name.
y = tf.Variable(11, name='y', dtype=tf.float32)
const = tf.constant(2.0, dtype=tf.float32)
z = x**2 + y**2 + const

# minimize() both computes the gradients of z and applies them.
optimizer = tf.train.GradientDescentOptimizer(0.55)
train = optimizer.minimize(z)
init = tf.global_variables_initializer()


def optimize():
    """Run ten gradient-descent steps on ``z`` and print x, y and z each time."""
    with tf.Session() as session:
        # Variables must be initialised before they can be read.
        session.run(init)
        print("starting at", "x:", session.run(x), "y:", session.run(y), "z:", session.run(z))
        for step in range(10):
            session.run(train)
            print("step", step, "x:", session.run(x), "y:", session.run(y), "z:", session.run(z))
But I want to specify the gradient in the problem.
aka I am trying to do this:
def function_to_minimize(x,y, const):
    """Return the objective ``2*x^2 + y^2 + const``."""
    quadratic_x = 2*x**2
    quadratic_y = y**2
    return quadratic_x + quadratic_y + const
def calc_grad(x,y):
    """Return ``(gradient, variable)`` pairs for ``z = 2*x^2 + y^2 + const``.

    The analytic partial derivatives are ``dz/dx = 4*x`` and ``dz/dy = 2*y``.
    """
    partials = (4*x, 2*y)
    return list(zip(partials, (x, y)))
# Variables to optimise, both starting at 3, and the constant offset.
x = tf.Variable(3, name='x', dtype=tf.float32)
y = tf.Variable(3, name='y', dtype=tf.float32)
const = tf.constant(2.0, dtype=tf.float32)
# Objective tensor built from the variables.
z = function_to_minimize(x,y, const)
# NOTE(review): `grad` holds the same pairs as `grads_and_vars` below and is
# not used anywhere in this snippet.
grad = calc_grad(x,y)
init = tf.global_variables_initializer()
# NOTE(review): this session is created but never run or closed in the snippet.
sess = tf.Session()
optimizer = tf.train.GradientDescentOptimizer(0.5)
# Hand-written (gradient, variable) pairs from calc_grad.
grads_and_vars = calc_grad(x,y)
# minimize() takes care of both computing the gradients and applying them to the variables.
#If you want to process the gradients before applying them you can instead use the optimizer in three steps:
# 1. Compute the gradients with compute_gradients().
# 2. Process the gradients as you wish.
# 3. Apply the processed gradients with apply_gradients()
How do you do this properly?
apply_gradients returns an operation that you can use to apply the gradients. In other words, you just do train = optimizer.apply_gradients(grads_and_vars) and the rest will work as in the first snippet. I.e.:
optimizer = tf.train.GradientDescentOptimizer(0.55)
# Hand-written (gradient, variable) pairs replace the automatic gradients.
grads_and_vars = calc_grad(x,y)
# apply_gradients returns the op that performs one update step.
train = optimizer.apply_gradients(grads_and_vars)
init = tf.global_variables_initializer()


def optimize():
    """Run ten update steps with the custom gradients and print x, y and z."""
    with tf.Session() as session:
        # Variables must be initialised before they can be read.
        session.run(init)
        print("starting at", "x:", session.run(x), "y:", session.run(y), "z:", session.run(z))
        for step in range(10):
            session.run(train)
            print("step", step, "x:", session.run(x), "y:", session.run(y), "z:", session.run(z))

Normalizing variables after running apply gradients all within the optimizer class

So my question is how do I normalize the variables after I do gradient descent in the _apply_dense() method of the optimizer class. This is what I currently have.
def _apply_dense(self, grad, var):
    """Apply a gradient-descent step to ``var`` and rescale it to unit L2 norm.

    Args:
        grad: gradient tensor for ``var``.
        var: the variable to update.

    Returns:
        The assign op that writes the normalised value back to ``var``.
        Running it also runs the gradient step it is computed from.
    """
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    # assign_sub yields the updated value. Building the normalisation from
    # that output, not from `var`, forces the step to run first.
    stepped = state_ops.assign_sub(var, lr_t * grad)
    # The original assigned the norm itself to the variable; divide by it
    # instead. The lower bound prevents a division by zero.
    norm = tf.maximum(tf.norm(stepped), 1e-5)
    return state_ops.assign(var, stepped / norm)
My current code seems to just normalize the original variables without applying the gradient descent update. I know that this is due to the normalize step above, which just reassigns the original variables, normalized. How do I correct this so that the gradient descent step is applied first and the normalization is then done on the result?
This could be implemented as following:
lr = 0.01
with tf.name_scope('optimizer'):
    vars_ = tf.trainable_variables()
    grads = tf.gradients(loss_tensor, vars_)
    # Plain gradient-descent step for every trainable variable.
    assign_ops = [tf.assign(v, (v - lr*g)) for g, v in zip(grads, vars_)]
    # Ops created in this context only run after the gradient step.
    with tf.control_dependencies(assign_ops):
        # sqrt(2 * l2_loss(v)) is the L2 norm of v.
        vars_norms = [tf.sqrt(2*tf.nn.l2_loss(v)) for v in vars_]
        # next line prevents division by zero
        vars_norms = [tf.clip_by_value(n, 0.00001, np.inf) for n in vars_norms]
        update_ops = [tf.assign(v, v/n) for v, n in zip(vars_, vars_norms)]
    # One op that runs every normalisation, and with it every gradient step.
    update_op = tf.group(*update_ops)
Note that I've added tf.clip_by_value() to prevent division by zero.
Here's a full usage example:
import tensorflow as tf
import numpy as np
# Two-feature inputs and integer class labels.
x = tf.placeholder(tf.float32, shape=(None, 2))
y = tf.placeholder(tf.int32, shape=(None))
logits = tf.layers.dense(x, 2)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=y, logits=logits)
loss_tensor = tf.reduce_mean(xentropy)

lr = 0.01
with tf.name_scope('optimizer'):
    vars_ = tf.trainable_variables()
    grads = tf.gradients(loss_tensor, vars_)
    # Plain gradient-descent step for every trainable variable.
    assign_ops = [tf.assign(v, (v - lr*g)) for g, v in zip(grads, vars_)]
    # Ops created in this context only run after the gradient step.
    with tf.control_dependencies(assign_ops):
        # sqrt(2 * l2_loss(v)) is the L2 norm of v.
        vars_norms = [tf.sqrt(2*tf.nn.l2_loss(v)) for v in vars_]
        # next line prevents division by zero
        vars_norms = [tf.clip_by_value(n, 0.00001, np.inf) for n in vars_norms]
        update_ops = [tf.assign(v, v/n) for v, n in zip(vars_, vars_norms)]
    # One op that runs every normalisation, and with it every gradient step.
    update_op = tf.group(*update_ops)

# dummy data for illustration
x_train = np.random.normal(size=(10, 2))
x_train = np.vstack([x_train, 2*np.random.normal(size=(10, 2))])
y_train = [0 for _ in range(10)] + [1 for _ in range(10)]

with tf.Session() as sess:
    # Variables must be initialised before the first update.
    sess.run(tf.global_variables_initializer())
    for epoch in range(10):
        loss, _ = sess.run([loss_tensor, update_op], feed_dict={x:x_train, y:y_train})
        print(loss)
# Sample output from one run:
# 0.7111398
# 0.7172677
# 0.71517026
# 0.713101
# 0.71105987
# 0.7090467
# 0.70706147
# 0.7051038
# 0.7031738
# 0.7012712
