I am trying to use the Deep Galerkin Method (DGM) to solve high-dimensional PDEs and I am facing a problem. For illustrative purposes, I am posting a simple optimization problem below. The feed-forward network successfully recovers the optimal function, but the DGM network fails to do so. Any help is highly appreciated.
import logging, os
os.system('clear')
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# CLASS DEFINITIONS FOR NEURAL NETWORKS USED IN DEEP GALERKIN METHOD
# NOTE: the code below relies on the TF 1.x graph API (tf.Session, tf.placeholder,
# tf.contrib), so it assumes a TF 1.x installation.
#%% import needed packages
import tensorflow as tf
#%% LSTM-like layer used in DGM (see Figure 5.3 and set of equations on p. 45) - modification of Keras layer class
class LSTMLayer(tf.keras.layers.Layer):

    # constructor/initializer function (automatically called when a new instance of the class is created)
    def __init__(self, output_dim, input_dim, trans1="tanh", trans2="tanh"):
        '''
        Args:
            input_dim (int):      dimensionality of input data
            output_dim (int):     number of outputs for LSTM layers
            trans1, trans2 (str): activation functions used inside the layer;
                                  one of: "tanh" (default), "relu" or "sigmoid"
        Returns: customized Keras layer object used as intermediate layers in DGM
        '''
        # create an instance of a Layer object (call the initializer of the superclass of LSTMLayer)
        super(LSTMLayer, self).__init__()

        # add properties for layer including activation functions used inside the layer
        self.output_dim = output_dim
        self.input_dim = input_dim

        if trans1 == "tanh":
            self.trans1 = tf.nn.tanh
        elif trans1 == "relu":
            self.trans1 = tf.nn.relu
        elif trans1 == "sigmoid":
            self.trans1 = tf.nn.sigmoid

        if trans2 == "tanh":
            self.trans2 = tf.nn.tanh
        elif trans2 == "relu":
            self.trans2 = tf.nn.relu
        elif trans2 == "sigmoid":
            self.trans2 = tf.nn.sigmoid  # was tf.nn.relu: copy-paste bug in the sigmoid branch

        ### define LSTM layer parameters (use Xavier initialization)
        # U matrices (weights applied to the original inputs x)
        self.Uz = self.add_variable("Uz", shape=[self.input_dim, self.output_dim],
                                    initializer=tf.contrib.layers.xavier_initializer())
        self.Ug = self.add_variable("Ug", shape=[self.input_dim, self.output_dim],
                                    initializer=tf.contrib.layers.xavier_initializer())
        self.Ur = self.add_variable("Ur", shape=[self.input_dim, self.output_dim],
                                    initializer=tf.contrib.layers.xavier_initializer())
        self.Uh = self.add_variable("Uh", shape=[self.input_dim, self.output_dim],
                                    initializer=tf.contrib.layers.xavier_initializer())

        # W matrices (weights applied to the output of the previous layer)
        self.Wz = self.add_variable("Wz", shape=[self.output_dim, self.output_dim],
                                    initializer=tf.contrib.layers.xavier_initializer())
        self.Wg = self.add_variable("Wg", shape=[self.output_dim, self.output_dim],
                                    initializer=tf.contrib.layers.xavier_initializer())
        self.Wr = self.add_variable("Wr", shape=[self.output_dim, self.output_dim],
                                    initializer=tf.contrib.layers.xavier_initializer())
        self.Wh = self.add_variable("Wh", shape=[self.output_dim, self.output_dim],
                                    initializer=tf.contrib.layers.xavier_initializer())

        # bias vectors
        self.bz = self.add_variable("bz", shape=[1, self.output_dim])
        self.bg = self.add_variable("bg", shape=[1, self.output_dim])
        self.br = self.add_variable("br", shape=[1, self.output_dim])
        self.bh = self.add_variable("bh", shape=[1, self.output_dim])

    # main function to be called
    def call(self, S, X):
        '''Compute the output of an LSTMLayer for given inputs S, X.
        Args:
            S: output of the previous layer
            X: data input
        Returns: layer output S_new, fed to the next layer
        '''
        # compute components of LSTM layer output (note H uses a separate activation function)
        Z = self.trans1(tf.add(tf.add(tf.matmul(X, self.Uz), tf.matmul(S, self.Wz)), self.bz))
        G = self.trans1(tf.add(tf.add(tf.matmul(X, self.Ug), tf.matmul(S, self.Wg)), self.bg))
        R = self.trans1(tf.add(tf.add(tf.matmul(X, self.Ur), tf.matmul(S, self.Wr)), self.br))
        H = self.trans2(tf.add(tf.add(tf.matmul(X, self.Uh), tf.matmul(tf.multiply(S, R), self.Wh)), self.bh))

        # compute LSTM layer output: S_new = (1 - G) * H + Z * S
        S_new = tf.add(tf.multiply(tf.subtract(tf.ones_like(G), G), H), tf.multiply(Z, S))
        return S_new
#%% Fully connected (dense) layer - modification of Keras layer class
class DenseLayer(tf.keras.layers.Layer):

    # constructor/initializer function (automatically called when a new instance of the class is created)
    def __init__(self, output_dim, input_dim, transformation=None):
        '''
        Args:
            input_dim:      dimensionality of input data
            output_dim:     number of outputs for dense layer
            transformation: activation function used inside the layer; using
                            None is equivalent to the identity map
        Returns: customized Keras (fully connected) layer object
        '''
        # create an instance of a Layer object (call the initializer of the superclass of DenseLayer)
        super(DenseLayer, self).__init__()
        self.output_dim = output_dim
        self.input_dim = input_dim

        ### define dense layer parameters (use Xavier initialization)
        # W matrix (weights applied to the output of the previous layer)
        self.W = self.add_variable("W", shape=[self.input_dim, self.output_dim],
                                   initializer=tf.contrib.layers.xavier_initializer())
        # bias vector
        self.b = self.add_variable("b", shape=[1, self.output_dim])

        if transformation:
            if transformation == "tanh":
                self.transformation = tf.tanh
            elif transformation == "relu":
                self.transformation = tf.nn.relu
        else:
            self.transformation = transformation

    # main function to be called
    def call(self, X):
        '''Compute the output of a dense layer for a given input X
        Args:
            X: input to layer
        '''
        # compute dense layer output
        S = tf.add(tf.matmul(X, self.W), self.b)
        if self.transformation:
            S = self.transformation(S)
        return S
#%% Neural network architecture used in DGM - modification of Keras Model class
class DGMNet(tf.keras.Model):

    # constructor/initializer function (automatically called when a new instance of the class is created)
    def __init__(self, layer_width, n_layers, input_dim, final_trans=None):
        '''
        Args:
            layer_width: number of neurons in each layer
            n_layers:    number of intermediate LSTM layers
            input_dim:   spatial dimension of input data (EXCLUDES time dimension)
            final_trans: transformation used in final layer
        Returns: customized Keras model object representing DGM neural network
        '''
        # create an instance of a Model object (call the initializer of the superclass of DGMNet)
        super(DGMNet, self).__init__()

        # define initial layer as fully connected
        # NOTE: this problem is stationary, so there is no extra time input and
        # the input dimensionality is just input_dim
        self.initial_layer = DenseLayer(layer_width, input_dim, transformation="tanh")

        # define intermediate LSTM layers
        self.n_layers = n_layers
        self.LSTMLayerList = []
        for _ in range(self.n_layers):
            self.LSTMLayerList.append(LSTMLayer(layer_width, input_dim))

        # define final layer as fully connected with a single output (function value)
        self.final_layer = DenseLayer(1, layer_width, transformation=final_trans)

    # main function to be called
    def call(self, x):
        '''
        Args:
            x: sampled space inputs
        Run the DGM model and obtain the fitted function value at the inputs x
        '''
        # define input vector (only space here; no time dimension in this problem)
        X = tf.concat([x], 1)

        # call initial layer
        S = self.initial_layer.call(X)

        # call intermediate LSTM layers
        for i in range(self.n_layers):
            S = self.LSTMLayerList[i].call(S, X)

        # call final dense layer
        result = self.final_layer.call(S)
        return result
#%% main class
class check():

    def __init__(self, v, x, layers, learning_rate, adam_iter, params):
        self.params = params
        self.v = v
        self.x = x
        self.learning_rate = learning_rate
        self.adam_iter = adam_iter
        self.lb = np.array([self.x[0][0]])
        self.ub = np.array([self.x[-1][0]])

        self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True))

        self.x_tf = tf.placeholder(tf.float32, shape=[None, self.x.shape[1]])
        self.v_tf = tf.placeholder(tf.float32, shape=[None, self.v.shape[1]])
        self.x_u_tf = tf.placeholder(tf.float32, shape=[None, self.x.shape[1]])
        self.v_u_tf = tf.placeholder(tf.float32, shape=[None, self.v.shape[1]])

        # weights/biases for the feed-forward alternative to the DGM networks
        self.weights_v, self.biases_v = self.initialize_nn(layers)
        self.weights_i, self.biases_i = self.initialize_nn(layers)

        with tf.variable_scope("control", reuse=True):
            self.i_pred = self.net_i(self.x_tf)
        with tf.variable_scope("value", reuse=True):
            self.v_pred = self.net_v(self.x_tf)

        self.error_i = self.policy_error(self.x_tf)

        # sup-norm fit of the value function, plus the squared first-order condition for i
        self.loss_v = tf.math.reduce_max(tf.abs(self.v_pred - self.v_tf))
        self.loss = tf.math.reduce_max(tf.abs(self.v_pred - self.v_tf)) + tf.reduce_mean(tf.square(self.error_i))

        self.optimizer_Adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.train_op_Adam = self.optimizer_Adam.minimize(self.loss)

        self.optimizer_Adam_v = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.train_op_Adam_v = self.optimizer_Adam_v.minimize(self.loss_v)  # was self.optimizer_Adam: use the dedicated optimizer here

        init = tf.global_variables_initializer()
        self.sess.run(init)

    def policy_error(self, x):
        i = self.net_i(x)
        v_ = self.net_v(x + i)
        l = v_ - i * x**2
        # first-order condition: dl/di should be zero at the optimal policy
        error_i = tf.gradients(l, i)[0]
        return error_i
    def initialize_nn(self, layers):
        weights = []
        biases = []
        num_layers = len(layers)
        for l in range(num_layers - 1):
            W = self.xavier_init(size=[layers[l], layers[l+1]])
            b = tf.Variable(tf.zeros([1, layers[l+1]], dtype=tf.float32), dtype=tf.float32)
            weights.append(W)
            biases.append(b)
        return weights, biases

    def xavier_init(self, size):
        in_dim = size[0]
        out_dim = size[1]
        xavier_stddev = np.sqrt(2 / (in_dim + out_dim))
        try:
            val = tf.Variable(tf.random.truncated_normal([in_dim, out_dim], stddev=xavier_stddev), dtype=tf.float32)
        except:
            val = tf.Variable(tf.truncated_normal([in_dim, out_dim], stddev=xavier_stddev), dtype=tf.float32)
        return val

    def neural_net(self, X, weights, biases):
        num_layers = len(weights) + 1
        # rescale inputs to [-1, 1]
        H = 2.0 * (X - self.lb) / (self.ub - self.lb) - 1
        #H=X
        for l in range(num_layers - 2):
            W = weights[l]
            b = biases[l]
            H = tf.tanh(tf.add(tf.matmul(H, W), b))
        W = weights[-1]
        b = biases[-1]
        Y = tf.add(tf.matmul(H, W), b)
        return Y
    def net_v(self, eta):
        if self.params['DGM'] == True:
            model_v = DGMNet(self.params['neurons_per_layer'], self.params['num_layers'], 1)
            v_u = model_v(eta)
        else:
            X = tf.concat([eta], 1)
            v_u = self.neural_net(X, self.weights_v, self.biases_v)
        return v_u

    def net_i(self, eta):
        if self.params['DGM'] == True:
            model_i = DGMNet(self.params['neurons_per_layer'], self.params['num_layers'], 1)
            i_u = model_i(eta)
        else:
            X = tf.concat([eta], 1)
            i_u = self.neural_net(X, self.weights_i, self.biases_i)
        return i_u

    def callback(self, loss):
        print('Loss: ', loss)
    def train(self):
        start_time = time.time()

        # first pass: fit the value function alone
        if True:  # set this to True if you want Adam to run
            tf_dict = {self.v_tf: self.v, self.x_tf: self.x}
            for it in range(self.adam_iter):
                self.sess.run(self.train_op_Adam_v, tf_dict)
                # print progress
                if it % 1000 == 0:
                    elapsed = time.time() - start_time
                    loss_value = self.sess.run(self.loss_v, tf_dict)
                    print('It: %d, Loss: %.3e, Time: %.2f' %
                          (it, loss_value, elapsed))
                    start_time = time.time()
            start_time = time.time()

        # second pass: fit the value function and the policy jointly
        if True:  # set this to True if you want Adam to run
            tf_dict = {self.v_tf: self.v, self.x_tf: self.x}
            for it in range(self.adam_iter):
                self.sess.run(self.train_op_Adam, tf_dict)
                # print progress
                if it % 1000 == 0:
                    elapsed = time.time() - start_time
                    loss_value = self.sess.run(self.loss, tf_dict)
                    print('It: %d, Loss: %.3e, Time: %.2f' %
                          (it, loss_value, elapsed))
                    start_time = time.time()
            start_time = time.time()

    def predict(self, X_star):
        i_star = self.sess.run(self.i_pred, {self.x_tf: X_star[:, 0:1]})
        v_star = self.sess.run(self.v_pred, {self.x_tf: X_star[:, 0:1]})
        error = self.sess.run(self.error_i, {self.x_tf: X_star[:, 0:1]})
        tf.reset_default_graph()
        return i_star, v_star, error
#%%
if __name__ == "__main__":
    params = {'DGM': True, 'neurons_per_layer': 50, 'num_layers': 4}

    x = np.linspace(-1, 1, 100).reshape(-1, 1).astype(np.float32)
    v = (10 - x**2).reshape(-1, 1).astype(np.float32)

    # architecture for the feed-forward network
    layers = [1, 10, 1]
    learning_rate = 0.001
    adam_iter = 5000

    run = check(v, x, layers, learning_rate, adam_iter, params)
    run.train()
    i_star, v_star, error = run.predict(x)
The problem is to find the optimal function i that maximizes the function v = 10 - (x+i)^2 - i*x^2, where x is the state variable; that is, the optimal function i will depend on x. If I set 'DGM' to False in the parameter dictionary and run the code, I get the right solution (in this case the functions are coded as feed-forward neural networks), where the correct analytical solution is i_star = 0.5*(-2x - x^2). If I set 'DGM' to True, the solution is incorrect. I tried different numbers of layers and neurons per layer, but DGM always gives an incorrect solution.
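For reference, the analytical solution follows from the first-order condition dv/di = -2(x+i) - x^2 = 0, which gives i*(x) = -x - 0.5*x^2 = 0.5*(-2x - x^2). A quick numerical check of this (my own sketch, not part of the original script):

import numpy as np

x = np.linspace(-1, 1, 5)
i_star = 0.5 * (-2 * x - x**2)      # analytical optimizer
dv_di = -2 * (x + i_star) - x**2    # first-order condition at i_star
print(np.allclose(dv_di, 0.0))      # True: i_star sets dv/di to zero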
Am I doing something wrong? Many thanks.
I am trying to run the following code (as given in the TensorFlow documentation) to create windows of my data and then flatten the resulting dataset of datasets.
import tensorflow as tf

# range_ds is the example dataset from the tf.data guide this snippet follows
range_ds = tf.data.Dataset.range(100000)

window_size = 5

windows = range_ds.window(window_size, shift=1)
for sub_ds in windows.take(5):
    print(sub_ds)

flat_windows = windows.flat_map(lambda x: x)
The problem is that flat_windows.cardinality().numpy() returns a cardinality of -2, which is creating problems for me during training. I tried looking for ways to set the cardinality of a dataset but couldn't find anything. I also tried other ways of flattening a dataset of datasets, but again with no success.
Edit-1: The problem with training is that the shapes are unknown (at the Linear and Dense layers) when I train a subclassed model (given below). The model trains well when I run it eagerly (through tf.config.run_functions_eagerly(True)), but that is slow. Therefore I want the input shapes to be known for model training.
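For reference, the related pattern from the tf.data guide batches each window into one fixed-size tensor instead of flattening it element by element, which keeps the per-example shape static for downstream layers (a sketch reusing window_size from above; batched_windows is my name):

# each window becomes one tensor of shape (window_size,);
# drop_remainder=True discards short trailing windows so the shape stays fixed
batched_windows = windows.flat_map(
    lambda sub: sub.batch(window_size, drop_remainder=True))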
Neural Network
import tensorflow as tf

# assumed to be defined as in the Keras custom train_step guide
loss_tracker = tf.keras.metrics.Mean(name="loss")

class NeuralNetworkModel(tf.keras.Model):
    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.encoder = Encoder()

    def train_step(self, inputs):
        X = inputs[0]
        Y = inputs[1]

        with tf.GradientTape() as tape:
            enc_X = self.encoder(X)
            enc_Y = self.encoder(Y)
            # loss:
            loss = tf.norm(enc_Y - enc_X, axis=[0, 1], ord='fro')

        # Compute gradients
        trainable_vars = self.encoder.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Compute our own metrics
        loss_tracker.update_state(loss)

        # Return a dict mapping metric names to current value.
        # Note that it will include the loss (tracked in self.metrics).
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker]

    def test_step(self, inputs):
        X = inputs[0]
        Y = inputs[1]

        Psi_X = self.encoder(X)
        Psi_Y = self.encoder(Y)
        # loss:
        loss = tf.norm(Psi_Y - Psi_X, axis=[0, 1], ord='fro')

        # Compute our own metrics
        loss_tracker.update_state(loss)

        # Return a dict mapping metric names to current value.
        # Note that it will include the loss (tracked in self.metrics).
        return {"loss": loss_tracker.result()}
class Encoder(tf.keras.Model):
    def __init__(self):
        super(Encoder, self).__init__(dtype='float64', name='Encoder')
        self.input_layer = DenseLayer(128)
        self.hidden_layer1 = DenseLayer(128)
        self.hidden_layer2 = DenseLayer(64)
        self.hidden_layer3 = DenseLayer(64)
        self.output_layer = LinearLayer(64)

    def call(self, input_data, training):
        fx = self.input_layer(input_data)
        fx = self.hidden_layer1(fx)
        fx = self.hidden_layer2(fx)
        fx = self.hidden_layer3(fx)
        return self.output_layer(fx)
class LinearLayer(tf.keras.layers.Layer):
    def __init__(self, units):
        super(LinearLayer, self).__init__(dtype='float64')
        self.units = units

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.w = self.add_weight(shape=(input_dim, self.units),
                                 initializer="random_normal",
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer=tf.zeros_initializer(),
                                 trainable=True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b

class DenseLayer(tf.keras.layers.Layer):
    def __init__(self, units):
        super(DenseLayer, self).__init__(dtype='float64')
        self.units = units

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.w = self.add_weight(shape=(input_dim, self.units),
                                 initializer="random_normal",
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer=tf.zeros_initializer(),
                                 trainable=True)

    def call(self, inputs):
        x = tf.matmul(inputs, self.w) + self.b
        return tf.nn.elu(x)
I was wondering about this as well. It turns out that -2 is tf.data.UNKNOWN_CARDINALITY (https://www.tensorflow.org/api_docs/python/tf/data#UNKNOWN_CARDINALITY), which indicates that TF doesn't know how many elements the flat_map returns per item.
I just asked "Windowing a TensorFlow dataset without losing cardinality information?" to see if anyone knows a way to window datasets without losing cardinality.
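If you can compute the element count yourself, you can re-attach it with tf.data.experimental.assert_cardinality (available in TF 2.2+). A sketch assuming the batched-window variant sketched under Edit-1 above, where range(100000) with window_size=5 and shift=1 yields 100000 - 5 + 1 complete windows (the assertion fails at iteration time if the count is wrong):

n_windows = 100000 - window_size + 1  # complete windows only
ds = batched_windows.apply(
    tf.data.experimental.assert_cardinality(n_windows))
print(ds.cardinality().numpy())  # 99996 instead of -2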
I am trying to implement StyleGAN with TensorFlow version 2 and I have no idea how to implement an equalized learning rate. I tried to scale the gradients this way:
def equalize_in_list(datalist):
    for i in range(len(datalist)):
        if isinstance(datalist[i], list):  # was `datalist[i] is list`, which is never True
            equalize_in_list(datalist[i])
        else:
            datalist[i] = datalist[i] * np.sqrt(2) / np.prod(datalist[i].shape)
    return datalist

gen_grad = equalize_in_list(gen_grad)
disc_grad = equalize_in_list(disc_grad)
But it doesn't work correctly.
You can just create a custom layer.
# imports assumed by this snippet (tf.keras flavor)
import numpy as np
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense
from tensorflow.keras.initializers import RandomNormal as normal

class DenseEQ(Dense):
    """
    Standard dense layer but includes learning rate equalization
    at runtime as per Karras et al. 2017.

    Inherits Dense layer and overrides the call method.
    """
    def __init__(self, **kwargs):
        if 'kernel_initializer' in kwargs:
            raise Exception("Cannot override kernel_initializer")
        super().__init__(kernel_initializer=normal(0, 1), **kwargs)

    def build(self, input_shape):
        super().build(input_shape)
        # The number of inputs
        n = np.product([int(val) for val in input_shape[1:]])
        # He initialisation constant
        self.c = np.sqrt(2 / n)

    def call(self, inputs):
        output = K.dot(inputs, self.kernel * self.c)  # scale kernel
        if self.use_bias:
            output = K.bias_add(output, self.bias, data_format='channels_last')
        if self.activation is not None:
            output = self.activation(output)
        return output
And then create a model as you normally would. (But you'll have to specify its arguments explicitly, e.g. units=16; positional arguments will not work.)
from tensorflow.keras.layers import Input, LeakyReLU
from tensorflow.keras.models import Model

model_in = Input(shape=(12,))                       # was Input(shape(12,))
x = DenseEQ(name="whatever_1", units=16)(model_in)
x = LeakyReLU(0.2)(x)
x = DenseEQ(name="whatever_2", units=1)(x)          # was applied to model_in, skipping the first layer
model_out = LeakyReLU(0.2)(x)
model = Model(model_in, model_out)
You can do the same thing for a convolution.
from tensorflow.keras.layers import Conv2D

class Conv2DEQ(Conv2D):
    """
    Standard Conv2D layer but includes learning rate equalization
    at runtime as per Karras et al. 2017.

    Inherits Conv2D layer and overrides the call method, following
    https://github.com/keras-team/keras/blob/master/keras/layers/convolutional.py
    """
    def __init__(self, **kwargs):
        if 'kernel_initializer' in kwargs:
            raise Exception("Cannot override kernel_initializer")
        super().__init__(kernel_initializer=normal(0, 1), **kwargs)

    def build(self, input_shape):
        super().build(input_shape)
        # The number of inputs
        n = np.product([int(val) for val in input_shape[1:]])
        # He initialisation constant
        self.c = np.sqrt(2 / n)

    def call(self, inputs):
        if self.rank == 2:
            outputs = K.conv2d(
                inputs,
                self.kernel * self.c,  # scale kernel
                strides=self.strides,
                padding=self.padding,
                data_format=self.data_format,
                dilation_rate=self.dilation_rate)

        if self.use_bias:
            outputs = K.bias_add(
                outputs,
                self.bias,
                data_format=self.data_format)

        if self.activation is not None:
            return self.activation(outputs)
        return outputs
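Usage mirrors the dense case; a short sketch with illustrative shapes and names (not from the original answer):

conv_in = Input(shape=(32, 32, 3))
y = Conv2DEQ(name="conv_eq_1", filters=16, kernel_size=3, padding='same')(conv_in)
y = LeakyReLU(0.2)(y)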
I'm working on a NeuralNetwork class using the back-propagation algorithm from Artificial Intelligence: A Modern Approach. I stepped through a run of the train function with a debugger and everything seems to be working properly, but the error isn't going down when I run it. Can anybody spot what I'm doing wrong?
import math, random
import numpy as np

CLOSE = 0.2

class Perceptron:
    '''A single perceptron using sigmoid activation'''

    def __init__(self, inputs):
        '''Set up the perceptron with the given number of inputs'''
        self.weights = np.empty(inputs)
        for i in range(inputs):
            self.weights[i] = random.random()
        self.bias = random.random()

    def getOutput(self, inputs):
        '''Calculates, stores, and returns the output'''
        assert len(inputs) == len(self.weights)
        inj = np.sum(inputs * self.weights) + self.bias  # Sum inputs
        g = 1.0 / (1.0 + math.exp(-inj))                 # Sigmoid activation
        self.aj = g
        return g

    def adjust(self, delta):
        '''Adjusts the weights and bias'''
        self.bias += self.aj * delta
        for i in range(len(self.weights)):
            self.weights[i] += self.aj * delta
class Layer:
    '''Creates a single layer in a single feed-forward neural network'''

    def __init__(self, width, inputSize, prevLayer=False):
        '''Create a new layer'''
        self.prevLayer = prevLayer
        self.nextLayer = False
        self.nodes = []
        for _ in range(width):
            self.nodes.append(Perceptron(inputSize))

    def setNext(self, nextLayer):
        '''Set the next layer in the network'''
        self.nextLayer = nextLayer

    def getOutput(self, inputs):
        '''Get an array of the output of the network'''
        output = np.empty(len(self.nodes))
        for i in range(len(self.nodes)):
            output[i] = self.nodes[i].getOutput(inputs)
        if isinstance(self.nextLayer, Layer):
            # If this isn't the output layer, recurse to the next layer down
            return self.nextLayer.getOutput(output)
        return output

    def backProp(self, deltas):
        '''Back-propagate error through all the layers'''
        if isinstance(self.prevLayer, Layer):
            # If this isn't the input layer, calculate deltas for the next layer up
            crossprod = np.empty((len(deltas), len(self.nodes[0].weights)))
            for j in range(len(deltas)):
                crossprod[j][:] = self.nodes[j].weights * deltas[j]
            nextDeltas = crossprod.sum(axis=0)
            for i in range(len(nextDeltas)):
                # multiply by g'
                nextDeltas[i] *= self.prevLayer.nodes[i].aj * (1.0 - self.prevLayer.nodes[i].aj)
            # Recurse upwards
            self.prevLayer.backProp(nextDeltas)
        # Adjust the weights of neurons in this layer
        for i in range(len(self.nodes)):
            self.nodes[i].adjust(deltas[i])
class NeuralNetwork:
    def __init__(self, layerSizes=np.array(0), filename=""):
        '''Creates a neural network with the given layer sizes.'''
        prev = False
        inputLayer = False
        for i in range(len(layerSizes) - 1):
            inputSize = layerSizes[i]
            outputSize = layerSizes[i+1]
            layer = Layer(outputSize, inputSize, prev)
            if isinstance(prev, Layer):
                prev.setNext(layer)
            if not isinstance(inputLayer, Layer):
                inputLayer = layer
            prev = layer
        self.inputLayer = inputLayer
        self.outputLayer = prev

    def train(self, inputs, outputs):
        '''Train the network on the given sample'''
        pred = self.inputLayer.getOutput(inputs)
        # calculate error of output layer
        error = outputs - pred
        deltas = error * pred * (1.0 - pred)
        # back-propagate the error
        self.outputLayer.backProp(deltas)
        # return error
        return np.max(abs(error))
    def test(self, inputs, outputs):
        '''Test the network on the given sample'''
        pred = self.inputLayer.getOutput(inputs)
        correct = True
        for i in range(len(pred)):
            if abs(pred[i] - outputs[i]) > CLOSE:
                correct = False
        return correct
You can try any of these measures :
Shuffle your data well.
Use a smaller learning rate like 0.001 (see the sketch after this list).
Use ReLU instead of sigmoid.
Initialize your biases as 1 and not random.
Use softmax at output layer if you are using ReLU.
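To make the learning-rate point concrete: the adjust method in the question applies raw deltas with no step size, and scales every weight by the node's own output self.aj instead of by the activation feeding that weight. A sketch of the standard delta-rule update (my reading of the AIMA formulation, not the original code; the caller would then need to pass the layer's inputs down through backProp):

ALPHA = 0.001  # learning rate, as suggested above

class Perceptron:
    # ... __init__ and getOutput as in the question ...

    def adjust(self, inputs, delta):
        '''Delta rule: each weight moves by alpha * (its input) * delta.'''
        self.bias += ALPHA * delta
        for i in range(len(self.weights)):
            self.weights[i] += ALPHA * inputs[i] * delta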
The episodic memory module from the DMN has a unique state-update mechanism based on attention:
Equation 8 describes how the gate g (the attention map) mixes a GRU update of the facts with the previous state, i.e. h_t = g_t * GRU(c_t, h_{t-1}) + (1 - g_t) * h_{t-1} (reconstructed here from the implementation below). My current implementation uses:
import tensorflow as tf
from tensorflow.keras import layers as L  # assumed alias for the Keras layers module

class EpisodicMemory(L.Wrapper):
    """Episodic memory from DMN."""

    def __init__(self, units, **kwargs):
        self.grucell = L.GRUCell(units, name=kwargs['name'] + '_gru')  # Internal cell
        super().__init__(self.grucell, **kwargs)

    def build(self, input_shape):
        """Build the layer."""
        _, _, ctx_shape = input_shape
        self.grucell.build((ctx_shape[0],) + ctx_shape[2:])
        super().build(input_shape)

    def call(self, inputs):
        """Compute new state episode."""
        init_state, atts, cs = inputs
        # GRU pass over the facts, according to the attention mask.
        while_valid_index = lambda state, index: index < tf.shape(cs)[1]
        retain = 1 - atts
        update_state = (lambda state, index: (atts[:, index, :] * self.grucell.call(cs[:, index, :], [state])[0]
                                              + retain[:, index, :] * state))
        # Loop over context
        final_state, _ = tf.while_loop(while_valid_index,
                                       (lambda state, index: (update_state(state, index), index + 1)),
                                       loop_vars=[init_state, 0])
        return final_state

    def compute_output_shape(self, input_shape):
        """Collapse time dimension."""
        return input_shape[0]
and used as:
# ...
sim_vec = concat([s_s_c, s_m_c, ctx_state, embedded_sents, repeated_q])
sim_vec = att_dense1(sim_vec) # (?, context_size, dim)
sim_vec = att_dense2(sim_vec) # (?, context_size, 1)
episodic_mem = EpisodicMemory(dim, name='episodic_mem')
state = episodic_mem([state, sim_vec, embedded_sents])
# ...
Is there a way to implement this in Keras without binding to TensorFlow functions? Even though we can pass constants to a custom RNN cell (say, the attention map), we don't know the current index of the loop inside the call function.
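For what it's worth, one pattern that avoids both tf.while_loop and the need to know the loop index is to feed the per-step attention through the cell's input itself: concatenate g_t onto each fact c_t, split them inside a custom cell, and let Keras's RNN wrapper drive the recurrence. A minimal sketch under those assumptions (GatedGRUCell and all names here are illustrative, not from the question):

import tensorflow as tf
from tensorflow.keras import layers as L

class GatedGRUCell(L.Layer):
    """GRU cell whose output mixes the GRU update with the previous state
    using a gate carried in the last input feature:
    h_new = g * GRU(c, h_prev) + (1 - g) * h_prev."""

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.state_size = units          # required by L.RNN
        self.grucell = L.GRUCell(units)

    def call(self, inputs, states):
        c, g = inputs[:, :-1], inputs[:, -1:]   # split fact vector and gate
        h_prev = states[0]
        h_gru, _ = self.grucell(c, [h_prev])
        h_new = g * h_gru + (1 - g) * h_prev
        return h_new, [h_new]

# usage sketch:
#   atts: (batch, time, 1) attention, cs: (batch, time, dim) facts
# gated_input = L.Concatenate(axis=-1)([cs, atts])
# state = L.RNN(GatedGRUCell(dim))(gated_input, initial_state=[init_state])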