So I'm trying to learn how to optimally stop options in a Black-Scholes setting, along the lines of the article "Solving high-dimensional optimal stopping problems using deep learning" by Sebastian Becker, Patrick Cheridito, Arnulf Jentzen, and Timo Welti.
The framework used to price options is the following:
import tensorflow as tf
from tensorflow.python.training.moving_averages import assign_moving_average
def neural_net(x, neurons, is_training, dtype=tf.float32, decay=0.9):
def batch_normalization(y):
shape = y.get_shape().as_list()
y = tf.reshape(y, [-1, shape[1] * shape[2]])
#variables for batch normalization
beta = tf.compat.v1.get_variable(
name='beta', shape=[shape[1] * shape[2]],
dtype=dtype, initializer=tf.zeros_initializer())
gamma = tf.compat.v1.get_variable(
name='gamma', shape=[shape[1] * shape[2]],
dtype=dtype, initializer=tf.ones_initializer())
        mv_mean = tf.compat.v1.get_variable(
            'mv_mean', [shape[1] * shape[2]],
            dtype=dtype, initializer=tf.zeros_initializer(),
            trainable=False)
        mv_var = tf.compat.v1.get_variable(
            'mv_var', [shape[1] * shape[2]],
            dtype=dtype, initializer=tf.ones_initializer(),
            trainable=False)
        mean, variance = tf.nn.moments(y, [0], name='moments')
tf.compat.v1.add_to_collection(
tf.compat.v1.GraphKeys.UPDATE_OPS,
assign_moving_average(mv_mean, mean, decay,
zero_debias=True))
tf.compat.v1.add_to_collection(
tf.compat.v1.GraphKeys.UPDATE_OPS,
assign_moving_average(mv_var, variance, decay,
zero_debias=False))
mean, variance = tf.cond(is_training, lambda: (mean, variance),
lambda: (mv_mean, mv_var))
y = tf.nn.batch_normalization(y, mean, variance, beta, gamma, 1e-6)
return tf.reshape(y, [-1, shape[1], shape[2]])
def fc_layer(y, out_size, activation, is_single):
shape = y.get_shape().as_list()
w = tf.compat.v1.get_variable(
name='weights',
shape=[shape[2], shape[1], out_size],
dtype=dtype,
initializer=tf.initializers.glorot_uniform())
y = tf.transpose(tf.matmul(tf.transpose(y, [2, 0, 1]), w),
[1, 2, 0])
if is_single:
b = tf.compat.v1.get_variable(
name='bias',
shape=[out_size, shape[2]],
dtype = dtype,
initializer=tf.zeros_initializer())
return activation(y + b)
return activation(batch_normalization(y))
x = batch_normalization(x)
for i in range(len(neurons)):
with tf.compat.v1.variable_scope('layer_' + str(i)):
x = fc_layer(x, neurons[i],
tf.nn.relu if i < len(neurons) - 1
else tf.nn.sigmoid, False)
return x
# then the deep optimal stopping routine
def deep_optimal_stopping(x, t, n, g, neurons, batch_size, train_steps,
mc_runs, lr_boundaries, lr_values, beta1=0.9,
beta2=0.999, epsilon=1e-8, decay=0.9):
    is_training = tf.compat.v1.placeholder(tf.bool, [])  # placeholder distinguishing training from Monte Carlo simulation, needed for batch normalization
p = g(t, x) # we evaluate the payoff for the whole batch at every point in time
nets = neural_net(tf.concat([x[:, :, :-1], p[:, :, :-1]], axis=1),
neurons, is_training, decay=decay)
u_list = [nets[:, :, 0]]
u_sum = u_list[-1]
    for k in range(1, n - 1):  # loop over the intermediate exercise dates
        # the network output approximates the stopping decision at the k-th date;
        # u_k is the probability of stopping at k, given we have not stopped earlier
        u_list.append(nets[:, :, k] * (1. - u_sum))
        u_sum += u_list[-1]
    # at the last date we must stop if we have not stopped before
    u_list.append(1. - u_sum)
u_stack = tf.concat(u_list, axis=1)
p = tf.squeeze(p, axis=1) #removes dimension of size 1
loss = tf.reduce_mean(tf.reduce_sum(-u_stack * p, axis=1)) #loss function
    # argmax of the boolean mask gives the first date at which the cumulative
    # stopping decision reaches 1, i.e. the index of the stopping time
    idx = tf.argmax(tf.cast(tf.cumsum(u_stack, axis=1) + u_stack >= 1,
                            dtype=tf.uint8),
                    axis=1, output_type=tf.int32)
    stopped_payoffs = tf.reduce_mean(
        tf.gather_nd(p, tf.stack([tf.range(0, batch_size, dtype=tf.int32),
                                  idx], axis=1)))  # approximation of the price for one batch; we average these over the MC runs
    global_step = tf.Variable(0)  # counts training steps so the learning-rate schedule knows where we are
learning_rate = tf.compat.v1.train.piecewise_constant(global_step,
lr_boundaries,
lr_values) # this gives us a piecewise constant learning rate, according to the schedule
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate,
beta1=beta1,
beta2=beta2,# define the optimizer, we use Adam with our learning rate schedule and a small tweak of one of its parameters
epsilon=epsilon)
update_ops = tf.compat.v1.get_collection(
tf.compat.v1.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(loss, global_step=global_step)
with tf.compat.v1.Session() as sess:
sess.run(tf.compat.v1.global_variables_initializer())
for _ in range(train_steps):
sess.run(train_op, feed_dict={is_training: True})
px_mean = 0. # value that will hold the price
for _ in range(mc_runs): # loop over the number of MC runs
            px_mean += sess.run(stopped_payoffs,
                                feed_dict={is_training: False})  # training is over; batch normalization now uses the accumulated moving averages
return px_mean / mc_runs
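As a quick sanity check of the stopping logic above, here is a tiny self-contained NumPy example (my own addition, with made-up decision values): each u_k is the soft decision f_k times the probability of not having stopped earlier, the u_k are nonnegative and sum to 1, and the argmax rule recovers the stopping index.
import numpy as np
# hypothetical soft stopping decisions f_k for N = 4 exercise dates;
# the last decision is forced to 1 (stop at maturity if not stopped before)
f = np.array([0.2, 0.7, 0.1, 1.0])
u, remaining = [], 1.0
for fk in f:
    u.append(fk * remaining)  # u_k = f_k * (1 - sum of earlier u_j)
    remaining -= u[-1]
u = np.array(u)
print(u, u.sum())  # nonnegative weights that sum to 1
idx = np.argmax(np.cumsum(u) + u >= 1)  # same stopping-index rule as the TF code
print(idx)  # prints 1 for these numbers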
Now we define the various variables and simulate the paths of a stock as X. Then we use the deep_optimal_stopping function to price the option, as in the following code:
import tensorflow as tf
import numpy as np
import time
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()
T, N, K = 3., 9, 100.
r, delta, beta = 0.05, 0.1, 0.2
batch_size = 800#8192
lr_values = [0.05, 0.005, 0.0005]
mc_runs = 50#500
def g(s, x):
return tf.exp(-r * s) \
* tf.maximum(tf.reduce_max(x, axis=1, keepdims=True) - K, 0.)
_file = open('example_4_4_1_1.csv', 'w')
_file.write('dim, run, mean, time\n')
for d in [2, 3, 5, 10, 20, 30, 50, 100, 200, 500]:
for s_0 in [40.]:#[90., 100., 110.]:
for run in range(5):
tf.compat.v1.reset_default_graph()
t0 = time.time()
neurons = [d + 50, d + 50, 1]
train_steps = 1500 + d
lr_boundaries = [int(500 + d / 5), int(1500 + 3 * d / 5)]
W = tf.cumsum(tf.compat.v1.random_normal(
shape=[batch_size, d, N],
stddev=np.sqrt(T / N)), axis=2)
t = tf.constant(np.linspace(start=T / N, stop=T, num=N,
endpoint=True, dtype=np.float32))
#X = tf.exp((r - delta - beta ** 2 / 2.) * t + beta * W) * s_0
px_mean = deep_optimal_stopping(
W, t, N, g, neurons, batch_size,
train_steps, mc_runs,
lr_boundaries, lr_values, epsilon=0.1)
t1 = time.time()
print("")
_file.write('%i, %i, %f, %f\n' % (d, run, px_mean, t1 - t0))
_file.close()
So here the option is a Bermudan max-call defined by the payoff function g(s, x). My understanding is that if I wanted the price of an American put, I would instead change the payoff function g to:
def g(s, x):
return tf.exp(-r * s) * tf.maximum(K-x, 0.)
and change nothing else. But instead of the price of 5.31 reported in their article, I get 4.02.
Can someone explain where I'm going wrong with my understanding of the problem?
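In case it helps, this is the full setup I would have expected for the put, under my own assumptions (d = 1, delta = 0, and passing the geometric Brownian motion paths X from the commented-out line rather than the raw Brownian paths W); I am not certain this matches the article's example:
d = 1
delta = 0.
X = tf.exp((r - delta - beta ** 2 / 2.) * t + beta * W) * s_0  # GBM paths
def g(s, x):
    return tf.exp(-r * s) * tf.maximum(K - x, 0.)  # put payoff
px_mean = deep_optimal_stopping(X, t, N, g, neurons, batch_size,
                                train_steps, mc_runs,
                                lr_boundaries, lr_values, epsilon=0.1)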
Related
I have a neural network with one hidden layer. When I compute the cost with the cross-entropy function using this code
def neural_network_model(X, Y, hidden_unit, num_iterations = 1000):
np.random.seed(3)
input_unit = define_structure(X, Y)[0]
output_unit = define_structure(X, Y)[2]
parameters = parameters_initialization(input_unit, hidden_unit, output_unit)
W1 = parameters['W1']
b1 = parameters['b1']
W2 = parameters['W2']
b2 = parameters['b2']
for i in range(0, num_iterations):
A2, cache = forward_propagation(X, parameters)
cost = cross_entropy_cost(A2, Y, parameters)
grads = backward_propagation(parameters, cache, X, Y)
parameters = gradient_descent(parameters, grads)
if i % 5 == 0:
print ("Cost after iteration %i: %f" %(i, cost))
return parameters
parameters = neural_network_model(X_train, y_train, 4, num_iterations=1000)
I obtain NaN as the result, along with the warning
divide by zero encountered in log
logprobs = np.multiply(np.log(A2), Y) + np.multiply((1-Y), np.log(1 - A2))
The functions cross_entropy_cost and backward_propagation are the following:
def cross_entropy_cost(A2, Y, parameters):
# number of training example
m = Y.shape[1]
# Compute the cross-entropy cost
logprobs = np.multiply(np.log(A2), Y) + np.multiply((1-Y), np.log(1 - A2))
cost = - np.sum(logprobs) / m
cost = float(np.squeeze(cost))
return cost
def backward_propagation(parameters, cache, X, Y):
#number of training example
m = X.shape[1]
W1 = parameters['W1']
W2 = parameters['W2']
A1 = cache['A1']
A2 = cache['A2']
dZ2 = A2-Y
dW2 = (1/m) * np.dot(dZ2, A1.T)
db2 = (1/m) * np.sum(dZ2, axis=1, keepdims=True)
dZ1 = np.multiply(np.dot(W2.T, dZ2), 1 - np.power(A1, 2))
dW1 = (1/m) * np.dot(dZ1, X.T)
db1 = (1/m)*np.sum(dZ1, axis=1, keepdims=True)
grads = {"dW1": dW1, "db1": db1, "dW2": dW2,"db2": db2}
return grads
How can I solve the problem of the division by zero?
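A common remedy (my assumption, not something from the code above) is to clip the predictions away from 0 and 1 before taking the logs, so log(0) never occurs; a minimal sketch:
def cross_entropy_cost(A2, Y, parameters, eps=1e-12):
    m = Y.shape[1]
    A2 = np.clip(A2, eps, 1 - eps)  # keep activations strictly inside (0, 1)
    logprobs = np.multiply(np.log(A2), Y) + np.multiply(1 - Y, np.log(1 - A2))
    return float(np.squeeze(-np.sum(logprobs) / m))
Note that clipping only treats the symptom; if A2 saturates at exactly 0 or 1, the learning rate or weight initialization may also be worth checking.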
Currently I'm taking Andrew Ng's course on Coursera called "Machine Learning". In exercise 5, we built a model that can predict digits, trained on the MNIST dataset. I completed this task successfully in Matlab, but I wanted to migrate that code to Python, just to see how different things are and maybe continue to play around with the model.
I managed to implement the cost function and the backpropagation algorithm correctly. I know that because I compared the metrics with my working Matlab model and it produces the same numbers.
Now, because in the course we train the model using fmincg, I tried to do the same using SciPy's fmin_cg function.
My problem is that the optimization takes extremely small steps and fails to converge.
Here is my code for the network:
import numpy as np
import utils
import scipy.optimize as op
class Network:
def __init__(self, layers):
self.layers = layers
self.weights = self.generate_params()
# Function for generating theta multidimensional matrix
def generate_params(self):
theta = []
epsilon = 0.12
for i in range(len(self.layers) - 1):
current_layer_units = self.layers[i]
next_layer_units = self.layers[i + 1]
theta_i = np.multiply(
np.random.rand(next_layer_units, current_layer_units + 1),
2 * epsilon - epsilon
)
# Appending the params to the theta matrix
theta.append(theta_i)
return theta
# Function to append bias row/column to matrix X
def append_bias(self, X, d):
m = X.shape[0]
n = 1 if len(X.shape) == 1 else X.shape[1]
if (d == 'column'):
ones = np.ones((m, n + 1))
ones[:, 1:] = X.reshape((m, n))
elif (d == 'row'):
ones = np.ones((m + 1, n))
ones[1:, :] = X.reshape((m, n))
return ones
# Function for computing the gradient for 1 training example
def back_prop(self, y, feed, theta):
activations = feed["activations"]
weighted_layers = feed["weighted_layers"]
delta_output = activations[-1] - y.reshape(len(y), 1)
current_delta = delta_output
# Initializing gradients
gradients = []
for i, theta_i in enumerate(theta):
gradients.append(np.zeros(theta_i.shape))
        # Performing delta calculations.
# Here, we continue to propagate the delta values backwards
# until we arrive to the second layer.
for i in reversed(range(len(theta))):
theta_i = theta[i]
if (i > 0):
i_weighted_inputs = self.append_bias(weighted_layers[i - 1], 'row')
t_theta_i = np.transpose(theta_i)
delta_i = np.multiply(np.dot(t_theta_i, current_delta), utils.sigmoidGradient(i_weighted_inputs))
delta_i = delta_i[1:]
gradients[i] = current_delta * np.transpose(activations[i])
# Setting current delta for the next layer
current_delta = delta_i
else:
gradients[i] = current_delta * np.transpose(activations[i])
return gradients
# Function for computing the cost and the derivatives
def compute_cost(self, theta, X, y, r12n = 0):
m = len(X)
num_labels = self.layers[-1]
costs = np.zeros(m)
# Initializing gradients
gradients = []
for i, theta_i in enumerate(theta):
gradients.append(np.zeros(theta_i.shape))
# Iterating over the training set
for i in range(m):
inputs = X[i]
observed = utils.create_output_vector(y[i], num_labels)
feed = self.feed_forward(inputs)
predicted = feed["activations"][-1]
total_cost = 0
for k, o in enumerate(observed):
if (o == 1):
total_cost += np.log(predicted[k])
else:
total_cost += np.log(1 - predicted[k])
cost = -1 * total_cost
# Storing the cost for the i-th training example
costs[i] = cost
# Calculating the gradient for this training example
# using back propagation algorithm
gradients_i = self.back_prop(observed, feed, theta)
for i, gradient in enumerate(gradients_i):
gradients[i] += gradient
# Calculating the avg regularization term for the cost
sum_of_theta = 0
for i, theta_i in enumerate(theta):
squared_theta = np.power(theta_i[:, 1:], 2)
sum_of_theta += np.sum(squared_theta)
r12n_avg = r12n * sum_of_theta / (2 * m)
total_cost = np.sum(costs) / m + r12n_avg
# Applying regularization terms to the gradients
for i, theta_i in enumerate(theta):
lambda_i = np.copy(theta_i)
lambda_i[:, 0] = 0
lambda_i = np.multiply((r12n / m), lambda_i)
# Adding the r12n matrix to the gradient
gradients[i] = gradients[i] / m + lambda_i
return total_cost, gradients
# Function for training the neural network using conjugate gradient algorithm
def train_cg(self, X, y, r12n = 0, iterations = 50):
weights = self.weights
def Cost(theta, X, y):
theta = utils.roll_theta(theta, self.layers)
cost, _ = self.compute_cost(theta, X, y, r12n)
            print(cost)
return cost
def Gradient(theta, X, y):
theta = utils.roll_theta(theta, self.layers)
_, gradient = self.compute_cost(theta, X, y, r12n)
return utils.unroll_theta(gradient)
unrolled_theta = utils.unroll_theta(weights)
result = op.fmin_cg(f = Cost,
x0 = unrolled_theta,
args=(X, y),
fprime=Gradient,
maxiter = iterations)
self.weights = utils.roll_theta(result, self.layers)
# Function for feeding forward the network
def feed_forward(self, X):
# Useful variables
activations = []
weighted_layers = []
weights = self.weights
currentActivations = self.append_bias(X, 'row')
activations.append(currentActivations)
for i in range(len(self.layers) - 1):
layer_weights = weights[i]
weighted_inputs = np.dot(layer_weights, currentActivations)
# Storing the weighted inputs
weighted_layers.append(weighted_inputs)
activation_nodes = []
# If the next layer is not the output layer, we'd like to add a bias unit to it
# (Excluding the input and the output layer)
if (i < len(self.layers) - 2):
activation_nodes = self.append_bias(utils.sigmoid(weighted_inputs), 'row')
else:
activation_nodes = utils.sigmoid(weighted_inputs)
# Appending the layer of nodes to the activations array
activations.append(activation_nodes)
currentActivations = activation_nodes
data = {
"activations": activations,
"weighted_layers": weighted_layers
}
return data
def predict(self, X):
data = self.feed_forward(X)
output = data["activations"][-1]
# Finding the max index in the output layer
return np.argmax(output, axis=0)
Here is the invocation of the code:
import numpy as np
from network import Network
# %% Load data
X = np.genfromtxt('data/mnist_data.csv', delimiter=',')
y = np.genfromtxt('data/mnist_outputs.csv', delimiter=',').astype(int)
# %% Create network
num_labels = 10
input_layer = 400
hidden_layer = 25
output_layer = num_labels
layers = [input_layer, hidden_layer, output_layer]
# Create a new neural network
network = Network(layers)
# %% Train the network and save the weights
network.train_cg(X, y, r12n = 1, iterations = 20)
This is what the code emits after each iteration:
15.441233231650283
15.441116436313076
15.441192262452514
15.44122384651483
15.441231216030646
15.441232804294314
15.441233141284435
15.44123321255294
15.441233227614855
As you can see, the changes to the cost are very small.
I checked the shapes of the vectors and the gradient and they both seem fine, just like in my Matlab implementation. I'm not sure what I'm doing wrong here.
If you guys could help me, that'd be great :)
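One way to narrow this down (my own debugging suggestion, not part of the original code) is a finite-difference gradient check: compare the analytic Gradient against a numerical approximation at the starting point before handing it to fmin_cg.
import numpy as np
def check_gradient(cost_fn, grad_fn, theta, n_checks=10, h=1e-4):
    analytic = grad_fn(theta)
    for i in np.random.choice(len(theta), n_checks, replace=False):
        t_plus, t_minus = theta.copy(), theta.copy()
        t_plus[i] += h
        t_minus[i] -= h
        numeric = (cost_fn(t_plus) - cost_fn(t_minus)) / (2 * h)
        print(i, analytic[i], numeric)  # the two values should nearly agree
# e.g. inside train_cg:
# check_gradient(lambda t: Cost(t, X, y), lambda t: Gradient(t, X, y), unrolled_theta)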
I was working from old source code that uses SciPy's L-BFGS-B algorithm, but I'm getting inaccurate results with it.
I got the following instructions:
In general, if you're having issues with L-BFGS-B, you also might want to try projected Adam or some other similar algorithm. There is really nothing special about L-BFGS-B for solving that problem, it just was convenient at that time for me. You can start off from the optim.Adam code and after each step, clamp x to be within your bounds.
I was wondering how to change the following code to use Adam instead of L-BFGS-B. I am unfamiliar with SciPy. Please help, because I'm a beginner.
def get_sharpness(data_loader, model, criterion, epsilon, manifolds=0):
# extract current x0
x0 = None
for p in model.parameters():
if x0 is None:
x0 = p.data.view(-1)
else:
x0 = torch.cat((x0, p.data.view(-1)))
x0 = x0.cpu().numpy()
# get current f_x
f_x0, _ = get_minus_cross_entropy(x0, data_loader, model, criterion)
f_x0 = -f_x0
logging.info('min loss f_x0 = {loss:.4f}'.format(loss=f_x0))
# find the minimum
if 0==manifolds:
x_min = np.reshape(x0 - epsilon * (np.abs(x0) + 1), (x0.shape[0], 1))
x_max = np.reshape(x0 + epsilon * (np.abs(x0) + 1), (x0.shape[0], 1))
bounds = np.concatenate([x_min, x_max], 1)
func = lambda x: get_minus_cross_entropy(x, data_loader, model, criterion, training=True)
init_guess = x0
else:
warnings.warn("Small manifolds may not be able to explore the space.")
assert(manifolds<=x0.shape[0])
#transformer = rp.GaussianRandomProjection(n_components=manifolds)
#transformer.fit(np.random.rand(manifolds, x0.shape[0]))
#A_plus = transformer.components_
#A = np.linalg.pinv(A_plus)
A_plus = np.random.rand(manifolds, x0.shape[0])*2.-1.
# normalize each column to unit length
A_plus_norm = np.linalg.norm(A_plus, axis=1)
A_plus = A_plus / np.reshape(A_plus_norm, (manifolds,1))
A = np.linalg.pinv(A_plus)
abs_bound = epsilon * (np.abs(np.dot(A_plus, x0))+1)
abs_bound = np.reshape(abs_bound, (abs_bound.shape[0], 1))
bounds = np.concatenate([-abs_bound, abs_bound], 1)
def func(y):
floss, fg = get_minus_cross_entropy(x0 + np.dot(A, y), data_loader, model, criterion, training=True)
return floss, np.dot(np.transpose(A), fg)
#func = lambda y: get_minus_cross_entropy(x0+np.dot(A, y), data_loader, model, criterion, training=True)
init_guess = np.zeros(manifolds)
#rand_selections = (np.random.rand(bounds.shape[0])+1e-6)*0.99
#init_guess = np.multiply(1.-rand_selections, bounds[:,0])+np.multiply(rand_selections, bounds[:,1])
minimum_x, f_x, d = sciopt.fmin_l_bfgs_b(func, init_guess, maxiter=10, bounds=list(bounds), disp=1, iprint=101)
And get_minus_cross_entropy is defined as
def get_minus_cross_entropy(x, data_loader, model, criterion, training=False):
if type(x).__module__ == np.__name__:
x = torch.from_numpy(x).float()
x = x.cuda()
# switch to evaluate mode
model.eval()
# fill vector x of parameters to model
x_start = 0
for p in model.parameters():
psize = p.data.size()
peltnum = 1
for s in psize:
peltnum *= s
x_part = x[x_start:x_start+peltnum]
p.data = x_part.view(psize)
x_start += peltnum
result, grads = forward(data_loader, model, criterion, 0,
training=training, optimizer=None)
#print ('get_minus_cross_entropy {}!'.format(-result['loss']))
return (-result['loss'], None if grads is None else grads.cpu().numpy().astype(np.float64))
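Following the quoted advice, here is a minimal sketch of "projected Adam" in PyTorch (my own sketch, assuming func returns a (loss, gradient) pair of NumPy arrays like the func above, and that bounds is the (n, 2) array built in get_sharpness; clamping with tensor bounds needs a reasonably recent PyTorch):
import numpy as np
import torch
def projected_adam(func, x0, bounds, steps=10, lr=1e-3):
    x = torch.tensor(x0, dtype=torch.float64, requires_grad=True)
    lo = torch.tensor(bounds[:, 0], dtype=torch.float64)
    hi = torch.tensor(bounds[:, 1], dtype=torch.float64)
    opt = torch.optim.Adam([x], lr=lr)
    for _ in range(steps):
        loss, grad = func(x.detach().numpy())
        opt.zero_grad()
        x.grad = torch.tensor(grad, dtype=torch.float64)  # inject the externally computed gradient
        opt.step()
        with torch.no_grad():
            x.clamp_(min=lo, max=hi)  # project back into the box after each step
    return x.detach().numpy(), loss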
I want to check if I can solve this problem with tensorflow instead of pymc3. The experimental idea is that I am going to define a probabilistic system that contains a switchpoint. I can use sampling as a method of inference, but I started wondering why I couldn't just do this with gradient descent instead.
I decided to do the gradient search in tensorflow, but it seems like tensorflow has a hard time computing gradients when tf.where is involved.
You can find the code below.
import tensorflow as tf
import numpy as np
x1 = np.random.randn(50)+1
x2 = np.random.randn(50)*2 + 5
x_all = np.hstack([x1, x2])
len_x = len(x_all)
time_all = np.arange(1, len_x + 1)
mu1 = tf.Variable(0, name="mu1", dtype=tf.float32)
mu2 = tf.Variable(5, name = "mu2", dtype=tf.float32)
sigma1 = tf.Variable(2, name = "sigma1", dtype=tf.float32)
sigma2 = tf.Variable(2, name = "sigma2", dtype=tf.float32)
tau = tf.Variable(10, name = "tau", dtype=tf.float32)
mu = tf.where(time_all < tau,
tf.ones(shape=(len_x,), dtype=tf.float32) * mu1,
tf.ones(shape=(len_x,), dtype=tf.float32) * mu2)
sigma = tf.where(time_all < tau,
tf.ones(shape=(len_x,), dtype=tf.float32) * sigma1,
tf.ones(shape=(len_x,), dtype=tf.float32) * sigma2)
likelihood_arr = tf.log(tf.sqrt(1/(2*np.pi*tf.pow(sigma, 2)))) -tf.pow(x_all - mu, 2)/(2*tf.pow(sigma, 2))
total_likelihood = tf.reduce_sum(likelihood_arr, name="total_likelihood")
optimizer = tf.train.RMSPropOptimizer(0.01)
opt_task = optimizer.minimize(-total_likelihood)
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
print("these variables should be trainable: {}".format([_.name for _ in tf.trainable_variables()]))
for step in range(10000):
_lik, _ = sess.run([total_likelihood, opt_task])
if step % 1000 == 0:
variables = {_.name:_.eval() for _ in [mu1, mu2, sigma1, sigma2, tau]}
print("step: {}, values: {}".format(str(step).zfill(4), variables))
You'll notice that the tau parameter does not change even though tensorflow seems to be aware of the variable and its gradient. Any clue as to what is going wrong? Is this something that can be calculated in tensorflow, or do I need a different pattern?
tau is only used in the condition argument of tf.where(time_all < tau, ...), which is a boolean tensor. Since gradients only make sense for continuous values, the gradient of the output with respect to tau will be zero.
Even ignoring tf.where, you used tau in the expression time_all < tau, which is constant almost everywhere and so has a gradient of zero.
Because the gradient is zero, there is no way to learn tau with gradient descent methods.
Depending on your problem, instead of a hard switch between two values you could use a weighted sum p*val1 + (1-p)*val2, where p depends on tau in a continuous manner.
The accepted answer is correct, but it doesn't contain the code that solves my problem. The following snippet does:
import tensorflow as tf
import numpy as np
import os
import uuid
TENSORBOARD_PATH = "/tmp/tensorboard-switchpoint"
# tensorboard --logdir=/tmp/tensorboard-switchpoint
x1 = np.random.randn(35)-1
x2 = np.random.randn(35)*2 + 5
x_all = np.hstack([x1, x2])
len_x = len(x_all)
time_all = np.arange(1, len_x + 1)
mu1 = tf.Variable(0, name="mu1", dtype=tf.float32)
mu2 = tf.Variable(0, name = "mu2", dtype=tf.float32)
sigma1 = tf.Variable(2, name = "sigma1", dtype=tf.float32)
sigma2 = tf.Variable(2, name = "sigma2", dtype=tf.float32)
tau = tf.Variable(15, name = "tau", dtype=tf.float32)
switch = 1./(1+tf.exp(tf.pow(time_all - tau, 1)))
mu = switch*mu1 + (1-switch)*mu2
sigma = switch*sigma1 + (1-switch)*sigma2
likelihood_arr = tf.log(tf.sqrt(1/(2*np.pi*tf.pow(sigma, 2)))) - tf.pow(x_all - mu, 2)/(2*tf.pow(sigma, 2))
total_likelihood = tf.reduce_sum(likelihood_arr, name="total_likelihood")
optimizer = tf.train.AdamOptimizer()
opt_task = optimizer.minimize(-total_likelihood)
init = tf.global_variables_initializer()
tf.summary.scalar("mu1", mu1)
tf.summary.scalar("mu2", mu2)
tf.summary.scalar("sigma1", sigma1)
tf.summary.scalar("sigma2", sigma2)
tf.summary.scalar("tau", tau)
tf.summary.scalar("likelihood", total_likelihood)
merged_summary_op = tf.summary.merge_all()
with tf.Session() as sess:
sess.run(init)
print("these variables should be trainable: {}".format([_.name for _ in tf.trainable_variables()]))
uniq_id = os.path.join(TENSORBOARD_PATH, "switchpoint-" + uuid.uuid1().__str__()[:4])
summary_writer = tf.summary.FileWriter(uniq_id, graph=tf.get_default_graph())
for step in range(40000):
lik, opt, summary = sess.run([total_likelihood, opt_task, merged_summary_op])
if step % 100 == 0:
variables = {_.name:_.eval() for _ in [total_likelihood]}
summary_writer.add_summary(summary, step)
print("i{}: {}".format(str(step).zfill(5), variables))
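For the record, the key change relative to my original attempt is the switch line: instead of a hard tf.where, switch = 1/(1 + exp(time_all - tau)) moves smoothly from 1 to 0 around tau, so mu and sigma interpolate between the two regimes and tau receives a non-zero gradient, exactly as the answer suggested.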
I implemented bias units for my neural network with gradient descent, but I'm not 100% sure if I've implemented them the right way. I'd be glad if you could quickly look through my code. Only the parts with
if bias:
are important.
And my second question:
Shouldn't the derivative of the softmax function be 1 - x, because x is the output of the softmax function?
I tried my net with 1 - x, but its performance was worse.
Any help is appreciated.
Thanks in advance.
import numpy as np
import pickle
import time
import math
class FeedForwardNetwork():
def __init__(self, input_dim, hidden_dim, output_dim, dropout=False, dropout_prop=0.5, bias=False):
np.random.seed(1)
self.input_layer = np.array([])
self.hidden_layer = np.array([])
self.output_layer = np.array([])
self.hidden_dim = hidden_dim
self.dropout = dropout
self.dropout_prop = dropout_prop
self.bias = bias
r_input_hidden = math.sqrt(6 / (input_dim + hidden_dim))
r_hidden_output = math.sqrt(6 / (hidden_dim + output_dim))
#self.weights_input_hidden = np.random.uniform(low=-r_input_hidden, high=r_input_hidden, size=(input_dim, hidden_dim))
#self.weights_hidden_output = np.random.uniform(low=-r_hidden_output, high=r_hidden_output, size=(hidden_dim, output_dim))
self.weights_input_hidden = np.random.uniform(low=-0.01, high=0.01, size=(input_dim, hidden_dim))
self.weights_hidden_output = np.random.uniform(low=-0.01, high=0.01, size=(hidden_dim, output_dim))
self.validation_data = np.array([])
self.validation_data_solution = np.array([])
self.velocities_input_hidden = np.zeros(self.weights_input_hidden.shape)
self.velocities_hidden_output = np.zeros(self.weights_hidden_output.shape)
if bias:
self.weights_bias_hidden = np.random.uniform(low=-0.01, high=0.01, size=((1, hidden_dim)))
self.weights_bias_output = np.random.uniform(low=-0.01, high=0.01, size=((1, output_dim)))
self.velocities_bias_hidden = np.zeros(self.weights_bias_hidden.shape)
self.velocities_bias_output = np.zeros(self.weights_bias_output.shape)
def _tanh(self, x, deriv=False):
        # The derivative is 1 - np.tanh(x)**2; because x is already the output of tanh(x), 1 - x*x is the correct derivative.
if not deriv:
return np.tanh(x)
return 1-x*x
def _softmax(self, x, deriv=False):
if not deriv:
return np.exp(x) / np.sum(np.exp(x), axis=0)
return 1 - np.exp(x) / np.sum(np.exp(x), axis=0)
def set_training_data(self, training_data_input, training_data_target, validation_data_input=None, validation_data_target=None):
"""Splits the data up into training and validation data with a ratio of 0.85/0.15 if no validation data is given.
Sets the data for training."""
if len(training_data_input) != len(training_data_target):
raise ValueError(
'Number of training examples and'
' training targets does not match!'
)
if (validation_data_input is None) and (validation_data_target is None):
len_training_data = int((len(training_data_input)/100*85//1))
self.input_layer = training_data_input[:len_training_data]
self.output_layer = training_data_target[:len_training_data]
self.validation_data = training_data_input[len_training_data:]
self.validation_data_solution = training_data_target[len_training_data:]
else:
self.input_layer = training_data_input
self.output_layer = training_data_target
self.validation_data = validation_data_input
self.validation_data_solution = validation_data_target
def save(self, filename):
"""Saves the weights into a pickle file."""
with open(filename, "wb") as network_file:
pickle.dump(self.weights_input_hidden, network_file)
pickle.dump(self.weights_hidden_output, network_file)
def load(self, filename):
"""Loads network weights from a pickle file."""
with open(filename, "rb") as network_file:
weights_input_hidden = pickle.load(network_file)
weights_hidden_output = pickle.load(network_file)
if (
len(weights_input_hidden) != len(self.weights_input_hidden)
or len(weights_hidden_output) != len(self.weights_hidden_output)
):
raise ValueError(
'File contains weights that does not'
' match the current networks size!'
)
self.weights_input_hidden = weights_input_hidden
self.weights_hidden_output = weights_hidden_output
def measure_error(self, input_data, output_data):
return 1/2 * np.sum((output_data - self.forward_propagate(input_data))**2)
#return np.sum(np.nan_to_num(-output_data*np.log(self.forward_propagate(input_data))-(1-output_data)*np.log(1-self.forward_propagate(input_data))))
    def forward_propagate(self, input_data, dropout=False):
        """Propagates the input data from the input neurons through to the output neurons and returns the output layer.
        If dropout is True, some of the neurons are randomly turned off."""
input_layer = input_data
self.hidden_layer = self._tanh(np.dot(input_layer, self.weights_input_hidden))
if self.bias:
self.hidden_layer += self.weights_bias_hidden
if dropout:
self.hidden_layer *= np.random.binomial([np.ones((len(input_data),self.hidden_dim))],1-self.dropout_prop)[0] * (1.0/(1-self.dropout_prop))
if self.bias:
return self._softmax((np.dot(self.hidden_layer, self.weights_hidden_output) + self.weights_bias_output).T).T
else:
return self._softmax(np.dot(self.hidden_layer, self.weights_hidden_output).T).T
#return self._softmax(output_layer.T).T
    def back_propagate(self, input_data, output_data, alpha, beta, momentum):
        """Calculates the difference between the target output and the actual output and adjusts the weights to fit the target better.
        The parameter alpha is the learning rate.
        Beta is the weight-decay parameter, which penalizes large weights."""
sample_count = len(input_data)
output_layer = self.forward_propagate(input_data, dropout=self.dropout)
output_layer_error = output_layer - output_data
output_layer_delta = output_layer_error * self._softmax(output_layer, deriv=True)
print("Error: ", np.mean(np.abs(output_layer_error)))
#How much did each hidden neuron contribute to the output error?
        # Multiplies the delta term by the weights
hidden_layer_error = output_layer_delta.dot(self.weights_hidden_output.T)
#If the prediction is good, the second term will be small and the change will be small
#Ex: target: 1 -> Slope will be 1 so the second term will be big
hidden_layer_delta = hidden_layer_error * self._tanh(self.hidden_layer, deriv=True)
        # Both lines return a matrix; a row holds all weights connected to one neuron.
#E.g. [1, 2, 3] -> Weights to Neuron A
# [4, 5, 6] -> Weights to Neuron B
hidden_weights_gradient = input_data.T.dot(hidden_layer_delta)/sample_count
output_weights_gradient = self.hidden_layer.T.dot(output_layer_delta)/sample_count
velocities_input_hidden = self.velocities_input_hidden
velocities_hidden_output = self.velocities_hidden_output
self.velocities_input_hidden = velocities_input_hidden * momentum - alpha * hidden_weights_gradient
self.velocities_hidden_output = velocities_hidden_output * momentum - alpha * output_weights_gradient
#Includes momentum term and weight decay; The weight decay parameter is beta
#Weight decay penalizes large weights to prevent overfitting
        # parentheses keep the weight-decay term inside the update expression
        self.weights_input_hidden += (-velocities_input_hidden * momentum + (1 + momentum) * self.velocities_input_hidden
                                      - alpha * beta * self.weights_input_hidden / sample_count)
        self.weights_hidden_output += (-velocities_hidden_output * momentum + (1 + momentum) * self.velocities_hidden_output
                                       - alpha * beta * self.weights_hidden_output / sample_count)
if self.bias:
velocities_bias_hidden = self.velocities_bias_hidden
velocities_bias_output = self.velocities_bias_output
hidden_layer_delta = np.sum(hidden_layer_delta, axis=0)
output_layer_delta = np.sum(output_layer_delta, axis=0)
self.velocities_bias_hidden = velocities_bias_hidden * momentum - alpha * hidden_layer_delta
self.velocities_bias_output = velocities_bias_output * momentum - alpha * output_layer_delta
            self.weights_bias_hidden += (-velocities_bias_hidden * momentum + (1 + momentum) * self.velocities_bias_hidden
                                         - alpha * beta * self.weights_bias_hidden / sample_count)
            self.weights_bias_output += (-velocities_bias_output * momentum + (1 + momentum) * self.velocities_bias_output
                                         - alpha * beta * self.weights_bias_output / sample_count)
    def batch_train(self, epochs, alpha, beta, momentum, patience=10):
        """Trains the network in batch mode, which means the weights are updated after showing all training examples.
        alpha is the learning rate and patience is the number of epochs that the validation error is allowed to increase before aborting.
        Beta is the weight-decay parameter, which penalizes large weights."""
#The weight decay parameter is beta
validation_error = self.measure_error(self.validation_data, self.validation_data_solution)
for epoch in range(epochs):
self.back_propagate(self.input_layer, self.output_layer, alpha, beta, momentum)
validation_error_new = self.measure_error(self.validation_data, self.validation_data_solution)
if validation_error_new < validation_error:
validation_error = validation_error_new
else:
patience -= 1
if patience == 0:
print("Abort Training. Overfitting has started! Epoch: {0}. Error: {1}".format(epoch, validation_error_new))
return
print("Epoch: {0}, Validation Error: {1}".format(epoch, validation_error))
self.save("Network_Mnist.net")
    def mini_batch_train(self, batch_size, epochs, alpha, beta, momentum, patience=10):
        """Trains the network in mini-batch mode, which means the weights are updated after showing only a subset of the training examples.
        alpha is the learning rate and patience is the number of epochs that the validation error is allowed to increase before aborting."""
validation_error = self.measure_error(self.validation_data, self.validation_data_solution)
sample_count = len(self.input_layer)
epoch_counter = 0
for epoch in range(0, epochs*batch_size, batch_size):
epoch_counter += 1
self.back_propagate(self.input_layer[epoch%sample_count:(epoch%sample_count)+batch_size],
self.output_layer[epoch%sample_count:(epoch%sample_count)+batch_size], alpha, beta, momentum)
validation_error_new = self.measure_error(self.validation_data, self.validation_data_solution)
if validation_error_new < validation_error:
validation_error = validation_error_new
patience = 20
else:
patience -= 1
if patience == 0:
print("Abort Training. Overfitting has started! Epoch: {0}. Error: {1}".format(epoch_counter, validation_error_new))
return
print("Epoch: {0}, Validation Error: {1}".format(epoch_counter, validation_error))
self.save("Network_Mnist.net")
if __name__ == "__main__":
#If the first row is a one the first output neuron should be on the second off
x = np.array([ [0, 0, 1, 1, 0],
[0, 1, 1, 1, 1],
[1, 0, 1, 1, 1],
[1, 1, 1, 1, 0],
[0, 1, 1, 1, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 0, 1, 0, 0] ])
y = np.array([ [0, 1],
[0, 1],
[1, 0],
[1, 0],
[0, 1],
[1, 0],
[1, 0],
[1, 0] ])
#x = np.array([ [0, 0, 1, 1] ])
#y = np.array([[0]]).T
a = FeedForwardNetwork(input_dim=5, hidden_dim=200, output_dim=2, bias=False)
a.set_training_data(x, y)
start = time.time()
a.batch_train(epochs=2000, alpha=0.05, beta=0.0001, momentum=0.99, patience=20)
print(time.time()-start)
In relation to the derivatives...
If you are using the tanh activation function, the derivative is y' = 1 - y^2. The tanh is commonly used because it is zero-centered.
If you are using the logistic function, the derivative is y' = y(1 - y). The softmax has a similar derivative.
The nice thing is that all of these can be expressed as functions of their own output, so you need to have a look at the def _softmax(self, x, deriv=False) function and define it in a similar way to def _tanh(self, x, deriv=False).
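Following that suggestion, a minimal sketch of _softmax written in the same style as _tanh (the max-subtraction is my own added numerical-stability tweak, and the deriv branch treats x as the softmax output, keeping only the diagonal of the Jacobian):
def _softmax(self, x, deriv=False):
    if not deriv:
        e = np.exp(x - np.max(x, axis=0))  # shift for numerical stability
        return e / np.sum(e, axis=0)
    # as with _tanh, x is assumed to already be the softmax output
    return x * (1 - x)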