So I'm trying to learn to optimally stop options in a Black-Scholes setting along the lines of the article: "Solving high-dimensional optimal stopping problems using deep learning" by Sebastian Becker, Patrick Cheridito, Arnulf Jentzen, and Timo Welti.
The framework used to price options is the following:
import tensorflow as tf
from tensorflow.python.training.moving_averages import assign_moving_average
def neural_net(x, neurons, is_training, dtype=tf.float32, decay=0.9):
def batch_normalization(y):
shape = y.get_shape().as_list()
y = tf.reshape(y, [-1, shape[1] * shape[2]])
#variables for batch normalization
beta = tf.compat.v1.get_variable(
name='beta', shape=[shape[1] * shape[2]],
dtype=dtype, initializer=tf.zeros_initializer())
gamma = tf.compat.v1.get_variable(
name='gamma', shape=[shape[1] * shape[2]],
dtype=dtype, initializer=tf.ones_initializer())
mv_mean = tf.compat.v1.get_variable(
'mv_mean', [shape[1]*shape[2]],
dtype = dtype, initializer=tf.zeros_initializer(),
trainable = False)
mv_var = tf.compat.v1.get_variable(
'mv_var', [shape[1]*shape[2]],
dtype = dtype, initializer =tf.ones_initializer(),
trainable = False)
mean,variance = tf.nn.moments(y, [0], name = 'moments')
tf.compat.v1.add_to_collection(
tf.compat.v1.GraphKeys.UPDATE_OPS,
assign_moving_average(mv_mean, mean, decay,
zero_debias=True))
tf.compat.v1.add_to_collection(
tf.compat.v1.GraphKeys.UPDATE_OPS,
assign_moving_average(mv_var, variance, decay,
zero_debias=False))
mean, variance = tf.cond(is_training, lambda: (mean, variance),
lambda: (mv_mean, mv_var))
y = tf.nn.batch_normalization(y, mean, variance, beta, gamma, 1e-6)
return tf.reshape(y, [-1, shape[1], shape[2]])
def fc_layer(y, out_size, activation, is_single):
shape = y.get_shape().as_list()
w = tf.compat.v1.get_variable(
name='weights',
shape=[shape[2], shape[1], out_size],
dtype=dtype,
initializer=tf.initializers.glorot_uniform())
y = tf.transpose(tf.matmul(tf.transpose(y, [2, 0, 1]), w),
[1, 2, 0])
if is_single:
b = tf.compat.v1.get_variable(
name='bias',
shape=[out_size, shape[2]],
dtype = dtype,
initializer=tf.zeros_initializer())
return activation(y + b)
return activation(batch_normalization(y))
x = batch_normalization(x)
for i in range(len(neurons)):
with tf.compat.v1.variable_scope('layer_' + str(i)):
x = fc_layer(x, neurons[i],
tf.nn.relu if i < len(neurons) - 1
else tf.nn.sigmoid, False)
return x
#then Deep optimal stopping
def deep_optimal_stopping(x, t, n, g, neurons, batch_size, train_steps,
mc_runs, lr_boundaries, lr_values, beta1=0.9,
beta2=0.999, epsilon=1e-8, decay=0.9):
is_training = tf.compat.v1.placeholder(tf.bool, []) # a variable used to distinguish between training and Monte Carlo simulation, used for batch noralization
p = g(t, x) # we evaluate the payoff for the whole batch at every point in time
nets = neural_net(tf.concat([x[:, :, :-1], p[:, :, :-1]], axis=1),
neurons, is_training, decay=decay)
u_list = [nets[:, :, 0]]
u_sum = u_list[-1]
for k in range(1, n - 1): #range(start, stop)
u_list.append(nets[:, :, k] * (1. - u_sum)) # we build a neural network to approximate the stopping decision at time n*T/N
u_sum += u_list[-1]
#last iteration?
u_list.append(1. - u_sum)
u_stack = tf.concat(u_list, axis=1)
p = tf.squeeze(p, axis=1) #removes dimension of size 1
loss = tf.reduce_mean(tf.reduce_sum(-u_stack * p, axis=1)) #loss function
idx = tf.argmax(tf.cast(tf.cumsum(u_stack, axis=1) + u_stack >= 1,
dtype=tf.uint8), #idx for index?, argmax takes index for largest value
axis=1, output_type=tf.int32)
stopped_payoffs = tf.reduce_mean(
tf.gather_nd(p, tf.stack([tf.range(0, batch_size, dtype=tf.int32),
idx], axis=1))) # this is the approximation of the price for one batch, we will calculate the mean over MC-runs of those numbers
global_step = tf.Variable(0) # a variable used to apply the learning rate schedule, without it the optimizer would not know at which training step we are
learning_rate = tf.compat.v1.train.piecewise_constant(global_step,
lr_boundaries,
lr_values) # this gives us a piecewise constant learning rate, according to the schedule
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate,
beta1=beta1,
beta2=beta2,# define the optimizer, we use Adam with our learning rate schedule and a small tweak of one of its parameters
epsilon=epsilon)
update_ops = tf.compat.v1.get_collection(
tf.compat.v1.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(loss, global_step=global_step)
with tf.compat.v1.Session() as sess:
sess.run(tf.compat.v1.global_variables_initializer())
for _ in range(train_steps):
sess.run(train_op, feed_dict={is_training: True})
px_mean = 0. # value that will hold the price
for _ in range(mc_runs): # loop over the number of MC runs
px_mean += sess.run(stopped_payoffs,
feed_dict={is_training: False})# we stop training, this is used for the batch normalization, from now on we will use the sampled moving averages
return px_mean / mc_runs
Now we define the various variables and simulate paths of a stock as X. Then we run use deep_optimal_stopping function to price the option, defined in the following code
import tensorflow as tf
import numpy as np
import time
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()
T, N, K = 3., 9, 100.
r, delta, beta = 0.05, 0.1, 0.2
batch_size = 800#8192
lr_values = [0.05, 0.005, 0.0005]
mc_runs = 50#500
def g(s, x):
return tf.exp(-r * s) \
* tf.maximum(tf.reduce_max(x, axis=1, keepdims=True) - K, 0.)
_file = open('example_4_4_1_1.csv', 'w')
_file.write('dim, run, mean, time\n')
for d in [2, 3, 5, 10, 20, 30, 50, 100, 200, 500]:
for s_0 in [40.]:#[90., 100., 110.]:
for run in range(5):
tf.compat.v1.reset_default_graph()
t0 = time.time()
neurons = [d + 50, d + 50, 1]
train_steps = 1500 + d
lr_boundaries = [int(500 + d / 5), int(1500 + 3 * d / 5)]
W = tf.cumsum(tf.compat.v1.random_normal(
shape=[batch_size, d, N],
stddev=np.sqrt(T / N)), axis=2)
t = tf.constant(np.linspace(start=T / N, stop=T, num=N,
endpoint=True, dtype=np.float32))
#X = tf.exp((r - delta - beta ** 2 / 2.) * t + beta * W) * s_0
px_mean = deep_optimal_stopping(
W, t, N, g, neurons, batch_size,
train_steps, mc_runs,
lr_boundaries, lr_values, epsilon=0.1)
t1 = time.time()
print("")
_file.write('%i, %i, %f, %f\n' % (d, run, px_mean, t1 - t0))
_file.close()
So here the option is a bermudan max-call defined by the payoff function g(s,x). My understanding would be, if I wanted the price of an American put, I instead changed the payoff function g to be:
def g(s, x):
return tf.exp(-r * s) * tf.maximum(K-x, 0.)
and otherwise changing nothing. But instead of getting a price of 5.31 as reported in their article, I get 4.02.
Can someone explain where I'm going wrong with my understanding of the problem?
I decided to write my lossfunction Structural Similarity Loss according to the article
https://arxiv.org/pdf/1910.08711.pdf
Testing different models for segmentation and different losses for them I have a problem that sometimes there is numerical instability and my self-written Segnet model gives out NaN during training, due to which loss also becomes NaN. While on other losses (bce, dice loss, focal loss) everything is stable. After printing out the variables in detail, I found out that the loss value before the y_pred=NaN arrives is adequate, so my assumption is that the loss gradients are counted incorrectly, but it's not clear how to fix it.
def ssl_loss (y_real, y_pred, window_size=11, eps = 0.01):
beta = 0.1
Lambda = 0.5
#input size(B, C, H, W)
#C = 1, because we compare monochrome segmentations
y_real, y_pred = y_real.to(device).squeeze(), y_pred.to(device).squeeze()
bce_matrix = (y_pred - y_real * y_pred + torch.log(1 + torch.exp(-y_pred)))
y_pred = torch.sigmoid(y_pred)
blurer = T.GaussianBlur(kernel_size=(11, 11), sigma=(1.5, 1.5))
mu_y = blurer(y_real)
sigma_y = blurer((y_real - mu_y) ** 2)
mu_p = blurer(y_pred)
sigma_p = blurer((y_pred - mu_p) ** 2)
errors = torch.abs((y_real - mu_y + eps) / (torch.sqrt(sigma_y) + eps) - (y_pred - mu_p + eps) / (torch.sqrt(sigma_p) + eps)).squeeze()
f_n_c = (errors > beta * errors.max()).int()
M = f_n_c.sum(dim=(1, 2)).unsqueeze(1).unsqueeze(2)
ssl_matrix = (errors * f_n_c * bce_matrix / M)
loss = Lambda * bce_matrix.mean() + (1 - Lambda) * ssl_matrix.mean()
return loss
And here's meaningful part of my train function
for epoch in range(epochs):
avg_loss = 0
model.train()
for X_batch, Y_batch in data_tr:
X_batch = X_batch.to(device)
Y_batch = Y_batch.to(device)
opt.zero_grad()
Y_pred = model(X_batch)
loss = loss_fn(Y_batch, Y_pred)
loss.backward()
opt.step()
avg_loss += loss / len(data_tr)
scheduler.step()
I'm using this custom loss function for ccc
def ccc(y_true, y_pred):
ccc = ((ccc_v(y_true, y_pred) + ccc_a(y_true, y_pred)) / 2)
return 1 - ccc
def ccc_v(y_true, y_pred):
x = y_true[:,0]
y = y_pred[:,0]
x_mean = K.mean(x, axis=0)
y_mean = K.mean(y, axis=0)
covar = K.mean( (x - x_mean) * (y - y_mean) )
x_var = K.var(x)
y_var = K.var(y)
ccc = (2.0 * covar) / (x_var + y_var + (x_mean + y_mean)**2)
return ccc
def ccc_a(y_true, y_pred):
x = y_true[:,1]
y = y_pred[:,1]
x_mean = K.mean(x, axis=0)
y_mean = K.mean(y, axis=0)
covar = K.mean( (x - x_mean) * (y - y_mean) )
x_var = K.var(x)
y_var = K.var(y)
ccc = (2.0 * covar) / (x_var + y_var + (x_mean + y_mean)**2)
return ccc
Currently the loss function ccc returns a scalar. The loss function is split into 2 different functions (ccc_v and ccc_a) because I use them as metrics as well.
I've read from Keras doc and this question that a custom loss function should return a list of losses, one for each sample.
First question: my model trains even if the loss function returns a scalar. Is it that bad? How is training different if I use a loss function whose output is a scalar instead of a list of scalars?
Second question: how can I rewrite my loss function to return a list of losses? I know I should avoid means and sums but in my case I think it's not possible because there's not a global mean but different ones, one a the numerator for the covariance and a couple at the denominator for the variances.
if your using tensorflow there are automatic apis for calculating loss
tf.keras.losses.mse()
tf.keras.losses.mae()
tf.keras.losses.Huber()
# Define the loss function
def loss_function(w1, b1, w2, b2, features = borrower_features, targets = default):
predictions = model(w1, b1, w2, b2)
# Pass targets and predictions to the cross entropy loss
return keras.losses.binary_crossentropy(targets, predictions)
#if your using categorical_crossentropy than return the losses for it.
#convert your image into a single np.array for input
#build your SoftMax model
# Define a sequential model
model=keras.Sequential()
# Define a hidden layer
model.add(keras.layers.Dense(16, activation='relu', input_shape=(784,)))
# Define the output layer
model.add(keras.layers.Dense(4,activation='softmax'))
# Compile the model
model.compile('SGD', loss='categorical_crossentropy',metrics=['accuracy'])
# Complete the fitting operation
train_data=train_data.reshape((50,784))
# Fit the model
model.fit(train_data, train_labels, validation_split=0.2, epochs=3)
# Reshape test data
test_data = test_data.reshape(10, 784)
# Evaluate the model
model.evaluate(test_data, test_labels)
I'm try to make a simple linear model to predict parameters of formula.
y = 3*x1 + x2 - 2*x3
Unfortunately, there are some problem when i try to compute loss.
def answer(x):
return 3 * x[:,0] + x[:,1] - 2 * x[:,2]
def loss_f(x):
y = answer(x)
y_hat = model(x)
loss = ((y - y_hat).pow(2)).sum() / x.size(0)
return loss
When i set batch_size = 3, the size of each result is different
x = torch.randn(3,3)
answer(x)
tensor([ 2.0201, -3.8354, 2.0059])
model(x)
tensor([[ 0.2085],
[-0.0670],
[-1.3635]], grad_fn=<ThAddmmBackward>)
answer(x.data).size()
torch.Size([3])
model(x.data).size()
torch.Size([3, 1])
I think the broadcast applied automatically.
loss = ((y - y_hat).pow(2)).sum() / x.size(0)
How can i make same size of two tensors? Thanks
This is my code
import torch
import torch.nn as nn
import torch.optim as optim
class model(nn.Module):
def __init__(self, input_size, output_size):
super(model, self).__init__()
self.linear = nn.Linear(input_size, output_size)
def forward(self, x):
y = self.linear(x)
return y
model = model(3,1)
optimizer = optim.SGD(model.parameters(), lr = 0.001, momentum=0.1)
print('Parameters : ')
for p in model.parameters():
print(p)
print('')
print('Optimizer : ')
print(optimizer)
def generate_data(batch_size):
x = torch.randn(batch_size, 3)
return x
def answer(x):
return 3 * x[:,0] + x[:,1] - 2 * x[:,2]
def loss_f(x):
y = answer(x)
y_hat = model(x)
loss = ((y - y_hat).pow(2)).sum() / x.size(0)
return loss
x = torch.randn(3,3)
print(x)
x = torch.FloatTensor(x)
batch_size = 3
epoch_n = 1000
iter_n = 100
for epoch in range(epoch_n):
avg_loss = 0
for i in range(iter_n):
x = torch.randn(batch_size, 3)
optimizer.zero_grad()
loss = loss_f(x.data)
loss.backward()
optimizer.step()
avg_loss += loss
avg_loss = avg_loss / iter_n
x_valid = torch.FloatTensor([[1,2,3]])
y_valid = answer(x_valid)
model.eval()
y_hat = model(x_valid)
model.train()
print(avg_loss, y_valid.data[0], y_hat.data[0])
if avg_loss < 0.001:
break
You can use Tensor.view
https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
So something like
answer(x.data).view(-1, 1)
should do the trick.
I've been recently trying to implement a model, which can be described as following: Given an input matrix and a set of targets, let the model learn, simultaneously, the matrix representation, as well as the targets via a custom loss function.
The architecture (simplified):
input_matrix = Input(shape=(i_shape,))
layer1 = Dense(100)(input_matrix)
output = Dense(3)(layer1)
autoencoder_mid = Dense(100)(input_matrix)
autoencoder_output = Dense(i_shape)(autoencoder_mid)
My idea of a loss function:
def customLoss(true_matrix,pred_matrix):
def combined_loss(y_true,y_pred):
return K.abs(y_true-y_pred)
a = K.mean( K.square(y_pred - y_true) * K.exp(-K.log(1.7) * (K.log(1. + K.exp((y_true - 3)/5 )))),axis=-1 )
b = K.mean( K.square(pred_matrix - true_matrix) * K.exp(-K.log(1.7) * (K.log(1. + K.exp((true_matrix - 3)/5 )))),axis=-1)
return a+b
return combined_loss
I compile the model as:
net = Model(input_matrix, [output,autoencoder_output])
net = net.compile(optimizer='adam', loss=customLoss(true_matrix=X,pred_matrix=autoencoder_output))
Where I try to fit the network with a standard:
net.fit(X,
target,
epochs=10,
batch_size=10)
The error I get is:
ValueError: Tensor conversion requested dtype float32 for Tensor with dtype float64: 'Tensor("loss/dense_4_loss/Log_3:0", shape=(389, 3890), dtype=float64, device=/device:GPU:0)'
My question is, is there any other way of doing this? If so, could you please point me towards a possible solution. Thank you very much.
You can try this:
def customLoss(true_matrix):
def combined_loss(y_true,y_pred):
y_pred, pred_matrix = y_pred
...
return combined_loss
net = Model(input_matrix, [output,autoencoder_output])
net.compile(optimizer='adam', loss=customLoss(X))
As the original y_pred will be a touple with (output,autoencoder_output).
Concerning the double return, the function will only return the first one, so I'd remove one of the two return lines or combine the two outputs such as:
alpha = 0.5
beta = 0.5
...
loss1, loss2 = K.abs(y_true-y_pred), a+b
return alpha*loss1 + beta*loss2
Changing alpha and beta upon convenience.
Thus, the whole thing could be:
def customLoss(true_matrix, alpha = 0.5, beta = 0.5):
def combined_loss(y_true,y_pred):
y_pred, pred_matrix = y_pred
a = K.mean( K.square(y_pred - y_true) * K.exp(-K.log(1.7) * (K.log(1. + K.exp((y_true - 3)/5 )))),axis=-1 )
b = K.mean( K.square(pred_matrix - true_matrix) * K.exp(-K.log(1.7) * (K.log(1. + K.exp((true_matrix - 3)/5 )))),axis=-1)
loss1, loss2 = K.abs(y_true-y_pred), a+b
return alpha*loss1 + beta*loss2
return combined_loss
net = Model(input_matrix, [output,autoencoder_output])
net.compile(optimizer='adam', loss=customLoss(X))