Episodic Memory from Dynamic Memory Networks - python

The episodic memory module from DMN has a unique state update mechanism that is based on the attention:
Equation 8 describes the gate g, the attention map, is used to compute the next state between a GRU and the previous state. My current implementation uses:
class EpisodicMemory(L.Wrapper):
"""Episodic memory from DMN."""
def __init__(self, units, **kwargs):
self.grucell = L.GRUCell(units, name=kwargs['name']+'_gru') # Internal cell
super().__init__(self.grucell, **kwargs)
def build(self, input_shape):
"""Build the layer."""
_, _, ctx_shape = input_shape
self.grucell.build((ctx_shape[0],) + ctx_shape[2:])
super().build(input_shape)
def call(self, inputs):
"""Compute new state episode."""
init_state, atts, cs = inputs
# GRU pass over the facts, according to the attention mask.
while_valid_index = lambda state, index: index < tf.shape(cs)[1]
retain = 1 - atts
update_state = (lambda state, index: (atts[:,index,:] * self.grucell.call(cs[:,index,:], [state])[0] + retain[:,index,:] * state))
# Loop over context
final_state, _ = tf.while_loop(while_valid_index,
(lambda state, index: (update_state(state, index), index+1)),
loop_vars = [init_state, 0])
return final_state
def compute_output_shape(self, input_shape):
"""Collapse time dimension."""
return input_shape[0]
and used as:
# ...
sim_vec = concat([s_s_c, s_m_c, ctx_state, embedded_sents, repeated_q])
sim_vec = att_dense1(sim_vec) # (?, context_size, dim)
sim_vec = att_dense2(sim_vec) # (?, context_size, 1)
episodic_mem = EpisodicMemory(dim, name='episodic_mem')
state = episodic_mem([state, sim_vec, embedded_sents])
# ...
Is there a way to implement this in Keras without binding to Tensorflow functions? Even though we can pass constants to a custom RNN cell, say the attention map, we don't know the current index of the loop inside call function.

Related

Deep Galerkin Method for optimization

I am trying to use Deep Galerkin Method (DGM) to solve high dimensional PDEs and I face a problem. For illustrative purposes, I am posting a simple optimization problem below. The feed-forward network successfully recovers the optimal funciton, but DGM network fails to do so. Any help is highly appreciated.
import logging, os
os.system('clear')
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# CLASS DEFINITIONS FOR NEURAL NETWORKS USED IN DEEP GALERKIN METHOD
#%% import needed packages
import tensorflow as tf
#%% LSTM-like layer used in DGM (see Figure 5.3 and set of equations on p. 45) - modification of Keras layer class
class LSTMLayer(tf.keras.layers.Layer):
# constructor/initializer function (automatically called when new instance of class is created)
def __init__(self, output_dim, input_dim, trans1 = "tanh", trans2 = "tanh"):
'''
Args:
input_dim (int): dimensionality of input data
output_dim (int): number of outputs for LSTM layers
trans1, trans2 (str): activation functions used inside the layer;
one of: "tanh" (default), "relu" or "sigmoid"
Returns: customized Keras layer object used as intermediate layers in DGM
'''
# create an instance of a Layer object (call initialize function of superclass of LSTMLayer)
super(LSTMLayer, self).__init__()
# add properties for layer including activation functions used inside the layer
self.output_dim = output_dim
self.input_dim = input_dim
if trans1 == "tanh":
self.trans1 = tf.nn.tanh
elif trans1 == "relu":
self.trans1 = tf.nn.relu
elif trans1 == "sigmoid":
self.trans1 = tf.nn.sigmoid
if trans2 == "tanh":
self.trans2 = tf.nn.tanh
elif trans2 == "relu":
self.trans2 = tf.nn.relu
elif trans2 == "sigmoid":
self.trans2 = tf.nn.relu
### define LSTM layer parameters (use Xavier initialization)
# u vectors (weighting vectors for inputs original inputs x)
self.Uz = self.add_variable("Uz", shape=[self.input_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
self.Ug = self.add_variable("Ug", shape=[self.input_dim ,self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
self.Ur = self.add_variable("Ur", shape=[self.input_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
self.Uh = self.add_variable("Uh", shape=[self.input_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
# w vectors (weighting vectors for output of previous layer)
self.Wz = self.add_variable("Wz", shape=[self.output_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
self.Wg = self.add_variable("Wg", shape=[self.output_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
self.Wr = self.add_variable("Wr", shape=[self.output_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
self.Wh = self.add_variable("Wh", shape=[self.output_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
# bias vectors
self.bz = self.add_variable("bz", shape=[1, self.output_dim])
self.bg = self.add_variable("bg", shape=[1, self.output_dim])
self.br = self.add_variable("br", shape=[1, self.output_dim])
self.bh = self.add_variable("bh", shape=[1, self.output_dim])
# main function to be called
def call(self, S, X):
'''Compute output of a LSTMLayer for a given inputs S,X .
Args:
S: output of previous layer
X: data input
Returns: customized Keras layer object used as intermediate layers in DGM
'''
# compute components of LSTM layer output (note H uses a separate activation function)
Z = self.trans1(tf.add(tf.add(tf.matmul(X,self.Uz), tf.matmul(S,self.Wz)), self.bz))
G = self.trans1(tf.add(tf.add(tf.matmul(X,self.Ug), tf.matmul(S, self.Wg)), self.bg))
R = self.trans1(tf.add(tf.add(tf.matmul(X,self.Ur), tf.matmul(S, self.Wr)), self.br))
H = self.trans2(tf.add(tf.add(tf.matmul(X,self.Uh), tf.matmul(tf.multiply(S, R), self.Wh)), self.bh))
# compute LSTM layer output
S_new = tf.add(tf.multiply(tf.subtract(tf.ones_like(G), G), H), tf.multiply(Z,S))
return S_new
#%% Fully connected (dense) layer - modification of Keras layer class
class DenseLayer(tf.keras.layers.Layer):
# constructor/initializer function (automatically called when new instance of class is created)
def __init__(self, output_dim, input_dim, transformation=None):
'''
Args:
input_dim: dimensionality of input data
output_dim: number of outputs for dense layer
transformation: activation function used inside the layer; using
None is equivalent to the identity map
Returns: customized Keras (fully connected) layer object
'''
# create an instance of a Layer object (call initialize function of superclass of DenseLayer)
super(DenseLayer,self).__init__()
self.output_dim = output_dim
self.input_dim = input_dim
### define dense layer parameters (use Xavier initialization)
# w vectors (weighting vectors for output of previous layer)
self.W = self.add_variable("W", shape=[self.input_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
# bias vectors
self.b = self.add_variable("b", shape=[1, self.output_dim])
if transformation:
if transformation == "tanh":
self.transformation = tf.tanh
elif transformation == "relu":
self.transformation = tf.nn.relu
else:
self.transformation = transformation
# main function to be called
def call(self,X):
'''Compute output of a dense layer for a given input X
Args:
X: input to layer
'''
# compute dense layer output
S = tf.add(tf.matmul(X, self.W), self.b)
if self.transformation:
S = self.transformation(S)
return S
#%% Neural network architecture used in DGM - modification of Keras Model class
class DGMNet(tf.keras.Model):
# constructor/initializer function (automatically called when new instance of class is created)
def __init__(self, layer_width, n_layers, input_dim, final_trans=None):
'''
Args:
layer_width:
n_layers: number of intermediate LSTM layers
input_dim: spaital dimension of input data (EXCLUDES time dimension)
final_trans: transformation used in final layer
Returns: customized Keras model object representing DGM neural network
'''
# create an instance of a Model object (call initialize function of superclass of DGMNet)
super(DGMNet,self).__init__()
# define initial layer as fully connected
# NOTE: to account for time inputs we use input_dim+1 as the input dimensionality
self.initial_layer = DenseLayer(layer_width, input_dim, transformation = "tanh")
# define intermediate LSTM layers
self.n_layers = n_layers
self.LSTMLayerList = []
for _ in range(self.n_layers):
self.LSTMLayerList.append(LSTMLayer(layer_width, input_dim))
# define final layer as fully connected with a single output (function value)
self.final_layer = DenseLayer(1, layer_width, transformation = final_trans)
# main function to be called
def call(self,x):
'''
Args:
t: sampled time inputs
x: sampled space inputs
Run the DGM model and obtain fitted function value at the inputs (t,x)
'''
# define input vector as time-space pairs
X = tf.concat([x],1)
# call initial layer
S = self.initial_layer.call(X)
# call intermediate LSTM layers
for i in range(self.n_layers):
S = self.LSTMLayerList[i].call(S,X)
# call final LSTM layers
result = self.final_layer.call(S)
return result
#%% main class
class check():
def __init__(self,v,x,layers,learning_rate,adam_iter,params):
self.params=params
self.v=v
self.x=x
self.learning_rate = learning_rate
self.adam_iter = adam_iter
self.lb = np.array([self.x[0][0]])
self.ub = np.array([self.x[-1][0]])
self.sess = tf.Session(config = tf.ConfigProto(allow_soft_placement = True, log_device_placement = True))
self.x_tf = tf.placeholder(tf.float32, shape=[None,self.x.shape[1]])
self.v_tf = tf.placeholder(tf.float32, shape=[None,self.v.shape[1]])
self.x_u_tf = tf.placeholder(tf.float32, shape=[None,self.x.shape[1]])
self.v_u_tf = tf.placeholder(tf.float32, shape=[None,self.v.shape[1]])
self.weights_v,self.biases_v = self.initialize_nn(layers)
self.weights_i,self.biases_i = self.initialize_nn(layers)
with tf.variable_scope("control",reuse=True):
self.i_pred = self.net_i(self.x_tf)
with tf.variable_scope("value",reuse=True):
self.v_pred = self.net_v(self.x_tf)
self.error_i = self.policy_error(self.x_tf)
self.loss_v = tf.math.reduce_max(tf.abs(self.v_pred-self.v_tf))
self.loss = tf.math.reduce_max(tf.abs(self.v_pred-self.v_tf)) + tf.reduce_mean(tf.square(self.error_i))
self.optimizer_Adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
self.train_op_Adam = self.optimizer_Adam.minimize(self.loss)
self.optimizer_Adam_v = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
self.train_op_Adam_v = self.optimizer_Adam.minimize(self.loss_v)
init = tf.global_variables_initializer()
self.sess.run(init)
def policy_error(self,x):
i=self.net_i(x)
v_ = self.net_v(x+i)
l = v_ - i*x**2
error_i = tf.gradients(l,i)[0]
return error_i
def initialize_nn(self,layers):
weights = []
biases = []
num_layers = len(layers)
for l in range(num_layers-1):
W = self.xavier_init(size = [layers[l],layers[l+1]])
b = tf.Variable(tf.zeros([1,layers[l+1]], dtype=tf.float32), dtype = tf.float32)
weights.append(W)
biases.append(b)
return weights,biases
def xavier_init(self,size):
in_dim = size[0]
out_dim = size[1]
xavier_stddev = np.sqrt(2/(in_dim + out_dim))
try:
val = tf.Variable(tf.random.truncated_normal([in_dim,out_dim], stddev = xavier_stddev), dtype = tf.float32)
except:
val = tf.Variable(tf.truncated_normal([in_dim,out_dim], stddev = xavier_stddev), dtype = tf.float32)
return val
def neural_net(self,X,weights,biases):
num_layers = len(weights) +1
H = 2.0*(X - self.lb)/(self.ub - self.lb) -1
#H=X
for l in range(num_layers-2):
W = weights[l]
b = biases[l]
H = tf.tanh(tf.add(tf.matmul(H,W),b))
W = weights[-1]
b = biases[-1]
Y = tf.add(tf.matmul(H,W),b)
return Y
def net_v(self,eta):
if self.params['DGM']==True:
model_v = DGMNet(self.params['neurons_per_layer'],self.params['num_layers'],1)
v_u = model_v(eta)
else:
X = tf.concat([eta],1)
v_u = self.neural_net(X,self.weights_v,self.biases_v)
return v_u
def net_i(self,eta):
if self.params['DGM']==True:
model_i = DGMNet(self.params['neurons_per_layer'],self.params['num_layers'],1)
i_u = model_i(eta)
else:
X = tf.concat([eta],1)
i_u = self.neural_net(X,self.weights_i,self.biases_i)
return i_u
def callback(self,loss):
print('Loss: ',loss)
def train(self):
#K.clear_session()
start_time = time.time()
if True: #set this to true if you want adam to run
tf_dict = {self.v_tf:self.v, self.x_tf:self.x}
for it in range(self.adam_iter):
self.sess.run(self.train_op_Adam_v, tf_dict)
# Print
if it % 1000 == 0:
elapsed = time.time() - start_time
loss_value = self.sess.run(self.loss_v, tf_dict)
print('It: %d, Loss: %.3e, Time: %.2f' %
(it, loss_value, elapsed))
start_time = time.time()
start_time = time.time()
if True: #set this to true if you want adam to run
tf_dict = {self.v_tf:self.v, self.x_tf:self.x}
for it in range(self.adam_iter):
self.sess.run(self.train_op_Adam, tf_dict)
# Print
if it % 1000 == 0:
elapsed = time.time() - start_time
loss_value = self.sess.run(self.loss, tf_dict)
print('It: %d, Loss: %.3e, Time: %.2f' %
(it, loss_value, elapsed))
start_time = time.time()
start_time = time.time()
def predict(self,X_star):
i_star = self.sess.run(self.i_pred,{self.x_tf: X_star[:,0:1]})
v_star = self.sess.run(self.v_pred,{self.x_tf: X_star[:,0:1]})
error = self.sess.run(self.error_i,{self.x_tf: X_star[:,0:1]})
tf.reset_default_graph()
return i_star,v_star,error
#%%
if __name__=="__main__":
params={'DGM':True,'neurons_per_layer':50,'num_layers':4}
x=np.linspace(-1,1,100).reshape(-1,1).astype(np.float32)
v=(10 - x**2).reshape(-1,1).astype(np.float32)
#architecture for feed-forward network
layers = [1, 10,1]
learning_rate = 0.001
adam_iter = 5000
run = check(v,x,layers,learning_rate,adam_iter,params)
run.train()
i_star,v_star,error=run.predict(x)
The problem is to find the optimal function i that maximizes the function v=10-(x+i^2)-ix^2, where x is the state variable. That is, the optimal function i will depend on x. If I set 'DGM' as False in the parameter dictionary and run the code, I get the right solution (in this case the functions are coded as feed-forward neural network), where the correct analytical solution is i_star = 0.5*(-2x-x^2). If I set 'DGM' as False, the solution is incorrect. I tried with different number of layers and number of neurons per each layer, but DGM always gives incorrect solution.
Am I doing something wrong? Many thanks.

RuntimeError: tensors must be 2-D

I was running a simple MLP network with customized learning algorithms. It worked fine on the training set, but I got this error when I entered additional code to check the test accuracy. How can I fix it?
Test Accuracy code
epochs = 1
for epcoh in range(epochs):
model_bp.eval()
model_fa.eval()
test_loss_bp = 0
correct_bp = 0
test_loss_fa = 0
correct_fa = 0
with torch.no_grad():
for idx_batch, (inputs, targets) in enumerate(test_loader):
output_bp = model_bp(inputs)
output_fa = model_fa(inputs)
# sum up batch loss
test_loss_bp += loss_crossentropy(output_bp, targets).item()
test_loss_bp += loss_crossentropy(output_fa, targets).item()
# get the index of the max log-probability
## predict_bp = outputs_bp.argmax(dim=1, keepdim=True)
predict_bp = torch.max(output_bp.data,1)[1]
correct_bp += predict_bp.eq(targets.view_as(predict_bp)).sum().item()
predict_fa = torch.max(output_fa.data,1)[1]
correct_fa += predict_fa.eq(targets.view_as(predict_fa)).sum().item()
print('Test set: BP Average loss: {:.4f}, Accuracy: {}/{} ({:.4f}%)\n'.format(test_loss_bp, correct_bp, len(test_loader.dataset),
100. * correct_bp / len(test_loader.dataset)))
print('Test set: FA Average loss: {:.4f}, Accuracy: {}/{} ({:.4f}%)\n'.format(test_loss_fa, correct_fa, len(test_loader.dataset),
100. * correct_fa / len(test_loader.dataset)))
Error
I'm curious about the meaning of 'RuntimeError: tensors must be 2-D'. We would appreciate it if you could tell us why it happened and where you made the mistake.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-9-9b8b6f683e59> in <module>
16 #targets = targets.to(device)
17
---> 18 output_bp = model_bp(inputs)
19 output_fa = model_fa(inputs)
20 # sum up batch loss
~\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
c:\Users\bclab\Desktop\feedback-alignment-pytorch-master\lib\linear.py in forward(self, inputs)
102 """
103 # first layer
--> 104 linear1 = F.relu(self.linear[0](inputs))
105
106 linear2 = self.linear[1](linear1)
~\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
c:\Users\bclab\Desktop\feedback-alignment-pytorch-master\lib\linear.py in forward(self, input)
69 def forward(self, input):
70 # See the autograd section for explanation of what happens here.
---> 71 return LinearFunction.apply(input, self.weight, self.bias)
72
73
c:\Users\bclab\Desktop\feedback-alignment-pytorch-master\lib\linear.py in forward(ctx, input, weight, bias)
11 def forward(ctx, input, weight, bias=None):
12 ctx.save_for_backward(input, weight, bias)
---> 13 output = input.mm(weight.t())
14 if bias is not None:
15 output += bias.unsqueeze(0).expand_as(output)
RuntimeError: tensors must be 2-D
This is my model. And fa_linear, linear : customize network
# load feedforward dfa model
model_fa = fa_linear.LinearFANetwork(in_features=784, num_layers=2, num_hidden_list=[1000, 10]).to(device)
# load reference linear model
model_bp = linear.LinearNetwork(in_features=784, num_layers=2, num_hidden_list=[1000, 10]).to(device)
# optimizers
optimizer_fa = torch.optim.SGD(model_fa.parameters(),
lr=1e-4, momentum=0.9, weight_decay=0.001, nesterov=True)
optimizer_bp = torch.optim.SGD(model_bp.parameters(),
lr=1e-4, momentum=0.9, weight_decay=0.001, nesterov=True)
loss_crossentropy = torch.nn.CrossEntropyLoss()
# make log file
results_path = 'bp_vs_fa_'
logger_train = open(results_path + 'train_log2.txt', 'w')
linear
from torch.autograd import Function
from torch import nn
import torch
import torch.nn.functional as F
# Inherit from Function
class LinearFunction(Function):
# Note that both forward and backward are #staticmethods
#staticmethod
# bias is an optional argument
def forward(ctx, input, weight, bias=None):
ctx.save_for_backward(input, weight, bias)
output = input.mm(weight.t())
if bias is not None:
output += bias.unsqueeze(0).expand_as(output)
return output
# This function has only a single output, so it gets only one gradient
#staticmethod
def backward(ctx, grad_output):
# This is a pattern that is very convenient - at the top of backward
# unpack saved_tensors and initialize all gradients w.r.t. inputs to
# None. Thanks to the fact that additional trailing Nones are
# ignored, the return statement is simple even when the function has
# optional inputs.
input, weight, bias = ctx.saved_variables
grad_input = grad_weight = grad_bias = None
# These needs_input_grad checks are optional and there only to
# improve efficiency. If you want to make your code simpler, you can
# skip them. Returning gradients for inputs that don't require it is
# not an error.
if ctx.needs_input_grad[0]:
grad_input = grad_output.mm(weight)
if ctx.needs_input_grad[1]:
grad_weight = grad_output.t().mm(input)
if bias is not None and ctx.needs_input_grad[2]:
grad_bias = grad_output.sum(0).squeeze(0)
return grad_input, grad_weight, grad_bias
class Linear(nn.Module):
def __init__(self, input_features, output_features, bias=True):
super(Linear, self).__init__()
self.input_features = input_features
self.output_features = output_features
# nn.Parameter is a special kind of Variable, that will get
# automatically registered as Module's parameter once it's assigned
# as an attribute. Parameters and buffers need to be registered, or
# they won't appear in .parameters() (doesn't apply to buffers), and
# won't be converted when e.g. .cuda() is called. You can use
# .register_buffer() to register buffers.
# nn.Parameters can never be volatile and, different than Variables,
# they require gradients by default.
self.weight = nn.Parameter(torch.Tensor(output_features, input_features))
if bias:
self.bias = nn.Parameter(torch.Tensor(output_features))
else:
# You should always register all possible parameters, but the
# optional ones can be None if you want.
self.register_parameter('bias', None)
# weight initialization
torch.nn.init.kaiming_uniform(self.weight)
torch.nn.init.constant(self.bias, 1)
def forward(self, input):
# See the autograd section for explanation of what happens here.
return LinearFunction.apply(input, self.weight, self.bias)
class LinearNetwork(nn.Module):
def __init__(self, in_features, num_layers, num_hidden_list):
"""
:param in_features: dimension of input features (784 for MNIST)
:param num_layers: number of layers for feed-forward net
:param num_hidden_list: list of integers indicating hidden nodes of each layer
"""
super(LinearNetwork, self).__init__()
self.in_features = in_features
self.num_layers = num_layers
self.num_hidden_list = num_hidden_list
# create list of linear layers
# first hidden layer
self.linear = [Linear(self.in_features, self.num_hidden_list[0])]
# append additional hidden layers to list
for idx in range(self.num_layers - 1):
self.linear.append(Linear(self.num_hidden_list[idx], self.num_hidden_list[idx+1]))
# create ModuleList to make list of layers work
self.linear = nn.ModuleList(self.linear)
def forward(self, inputs):
"""
forward pass, which is same for conventional feed-forward net
:param inputs: inputs with shape [batch_size, in_features]
:return: logit outputs from the network
"""
# first layer
linear1 = F.relu(self.linear[0](inputs))
linear2 = self.linear[1](linear1)
return linear2
fa_linear
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch import autograd
from torch.autograd import Variable
class LinearFANetwork(nn.Module):
"""
Linear feed-forward networks with feedback alignment learning
Does NOT perform non-linear activation after each layer
"""
def __init__(self, in_features, num_layers, num_hidden_list):
"""
:param in_features: dimension of input features (784 for MNIST)
:param num_layers: number of layers for feed-forward net
:param num_hidden_list: list of integers indicating hidden nodes of each layer
"""
super(LinearFANetwork, self).__init__()
self.in_features = in_features
self.num_layers = num_layers
self.num_hidden_list = num_hidden_list
# create list of linear layers
# first hidden layer
self.linear = [LinearFAModule(self.in_features, self.num_hidden_list[0])]
# append additional hidden layers to list
for idx in range(self.num_layers - 1):
self.linear.append(LinearFAModule(self.num_hidden_list[idx], self.num_hidden_list[idx+1]))
# create ModuleList to make list of layers work
self.linear = nn.ModuleList(self.linear)
def forward(self, inputs):
"""
forward pass, which is same for conventional feed-forward net
:param inputs: inputs with shape [batch_size, in_features]
:return: logit outputs from the network
"""
# first layer
linear1 = self.linear[0](inputs)
# second layer
linear2 = self.linear[1](linear1)
return linear2
class LinearFAFunction(autograd.Function):
#staticmethod
# same as reference linear function, but with additional fa tensor for backward
def forward(context, input, weight, weight_fa, bias=None):
context.save_for_backward(input, weight, weight_fa, bias)
output = input.mm(weight.t())
if bias is not None:
output += bias.unsqueeze(0).expand_as(output)
return output
#staticmethod
def backward(context, grad_output):
input, weight, weight_fa, bias = context.saved_variables
grad_input = grad_weight = grad_weight_fa = grad_bias = None
if context.needs_input_grad[0]:
# all of the logic of FA resides in this one line
# calculate the gradient of input with fixed fa tensor, rather than the "correct" model weight
grad_input = grad_output.mm(weight_fa)
if context.needs_input_grad[1]:
# grad for weight with FA'ed grad_output from downstream layer
# it is same with original linear function
grad_weight = grad_output.t().mm(input)
if bias is not None and context.needs_input_grad[3]:
grad_bias = grad_output.sum(0).squeeze(0)
return grad_input, grad_weight, grad_weight_fa, grad_bias
class LinearFAModule(nn.Module):
def __init__(self, input_features, output_features, bias=True):
super(LinearFAModule, self).__init__()
self.input_features = input_features
self.output_features = output_features
# weight and bias for forward pass
# weight has transposed form; more efficient (so i heard) (transposed at forward pass)
self.weight = nn.Parameter(torch.Tensor(output_features, input_features))
if bias:
self.bias = nn.Parameter(torch.Tensor(output_features))
else:
self.register_parameter('bias', None)
# fixed random weight and bias for FA backward pass
# does not need gradient
self.weight_fa = nn.Parameter(Variable(torch.FloatTensor(output_features, input_features), requires_grad=False))
# weight initialization
torch.nn.init.kaiming_uniform(self.weight)
torch.nn.init.kaiming_uniform(self.weight_fa)
torch.nn.init.constant(self.bias, 1)
def forward(self, input):
return LinearFAFunction.apply(input, self.weight, self.weight_fa, self.bias)
You just need to flatten your input before passing it to your model. Something like this:
# ...
# from [batch_size, 1, 28, 28] <- 4-D
# to [batch_size, 1x28x28] <- 2-D, as expected
flat_inputs = torch.flatten(inputs)
output_bp = model_bp(flat_inputs)
output_fa = model_fa(flat_inputs)
# ...

Tensorflow 2.0: flat_map() to flatten Dataset of Dataset returns cardinality -2

I am trying to run the following code (as given in Tensorflow documentation) to create windows of my data and then flatten the dataset of datasets.
window_size = 5
windows = range_ds.window(window_size, shift=1)
for sub_ds in windows.take(5):
print(sub_ds)
flat_windows = windows.flat_map(lambda x: x)
The problem is that flat_windows.cardinality().numpy() returns cardinality to be -2 which is creating problem for me during training. I tried looking for ways to set_cardinality of a dataset but couldn't find anything. I also tried other ways of flattening a dataset of datasets, but again no success.
Edit-1: The problem with the training is that the shape is unknown (at Linear and Dense layers) when I am training a subclass model (given below). The model trains well when I train the model eagerly (through tf.config.run_functions_eagerly(True)) but that is slow. Therefore I want the input data to be known for the model training.
Neural Network
class NeuralNetworkModel(tf.keras.Model):
def __init__(self):
super(NeuralNetworkModel, self).__init__()
self.encoder = Encoder()
def train_step(self, inputs):
X = inputs[0]
Y = inputs[1]
with tf.GradientTape() as tape:
enc_X = self.encoder(X)
enc_Y = self.encoder(Y)
# loss:
loss = tf.norm(enc_Y - enc_X, axis = [0, 1], ord = 'fro')
# Compute gradients
trainable_vars = self.encoder.trainable_variables
gradients = tape.gradient(loss, trainable_vars)
# Update weights
self.optimizer.apply_gradients(zip(gradients, trainable_vars))
# Compute our own metrics
loss_tracker.update_state(loss)
# Return a dict mapping metric names to current value.
# Note that it will include the loss (tracked in self.metrics).
return {"loss": loss_tracker.result()}
#property
def metrics(self):
# We list our `Metric` objects here so that `reset_states()` can be
# called automatically at the start of each epoch
# or at the start of `evaluate()`.
# If you don't implement this property, you have to call
# `reset_states()` yourself at the time of your choosing.
return [loss_tracker]
def test_step(self, inputs):
X = inputs[0]
Y = inputs[1]
Psi_X = self.encoder(X)
Psi_Y = self.encoder(Y)
# loss:
loss = tf.norm(Psi_Y - Psi_X, axis = [0, 1], ord = 'fro')
# Compute our own metrics
loss_tracker.update_state(loss)
# Return a dict mapping metric names to current value.
# Note that it will include the loss (tracked in self.metrics).
return {"loss": loss_tracker.result()}
class Encoder(tf.keras.Model):
def __init__(self):
super(Encoder, self).__init__(dtype = 'float64', name = 'Encoder')
self.input_layer = DenseLayer(128)
self.hidden_layer1 = DenseLayer(128)
self.hidden_layer2 = DenseLayer(64)
self.hidden_layer3 = DenseLayer(64)
self.output_layer = LinearLayer(64)
def call(self, input_data, training):
fx = self.input_layer(input_data)
fx = self.hidden_layer1(fx)
fx = self.hidden_layer2(fx)
fx = self.hidden_layer3(fx)
return self.output_layer(fx)
class LinearLayer(tf.keras.layers.Layer):
def __init__(self, units):
super(LinearLayer, self).__init__(dtype = 'float64')
self.units = units
def build(self, input_shape):
input_dim = input_shape[-1]
self.w = self.add_weight(shape = (input_dim, self.units),
initializer = "random_normal",
trainable = True)
self.b = self.add_weight(shape = (self.units,),
initializer = tf.zeros_initializer(),
trainable = True)
def call(self, inputs):
return tf.matmul(inputs, self.w) + self.b
class DenseLayer(tf.keras.layers.Layer):
def __init__(self, units):
super(DenseLayer, self).__init__(dtype = 'float64')
self.units = units
def build(self, input_shape):
input_dim = input_shape[-1]
self.w = self.add_weight(shape = (input_dim, self.units),
initializer = "random_normal",
trainable = True)
self.b = self.add_weight(shape = (self.units,),
initializer = tf.zeros_initializer(),
trainable = True)
def call(self, inputs):
x = tf.matmul(inputs, self.w) + self.b
return tf.nn.elu(x)
I was wondering about this as well. Turns out that -2 is tf.data.UNKNOWN_CARDINALITY (https://www.tensorflow.org/api_docs/python/tf/data#UNKNOWN_CARDINALITY), which represents that TF doesn't know how many elements the flat_map returns per item.
I just asked Windowing a TensorFlow dataset without losing cardinality information? to see if anyone knows a way to window datasets without losing cardinality.

Applying Gradient to LSTM layers using GradientTape raises "No gradients provided for any variable" error

I'm currently setting up a Deep Deterministic Policy Gradient agent to interact with a crypto trading environment. The code works when I'm using Dense layers for the function approximator, but when I switch to LSTM or GRUs this error pops up when I call the learn method of Agent:
No gradients provided for any variable: ['actor_network_4/lstm/kernel:0', 'actor_network_4/lstm/recurrent_kernel:0', 'actor_network_4/lstm/bias:0', 'actor_network_4/lstm_1/kernel:0', 'actor_network_4/lstm_1/recurrent_kernel:0', 'actor_network_4/lstm_1/bias:0', 'actor_network_4/dense_8/kernel:0', 'actor_network_4/dense_8/bias:0'].
I'm using GradientTape to record the gradients and optimizer.apply_gradients to update the actor and critic networks. Please find the code snippets below:
#RNN version
class CriticNetwork(keras.Model):
def __init__(self, n_actions,name='critic', chkpt_dir='ddpg'):
super(CriticNetwork, self).__init__()
self.n_actions = n_actions
self.model_name = name
self.checkpoint_dir = chkpt_dir
self.checkpoint_file = os.path.join(self.checkpoint_dir, self.model_name+'_ddpg.h5')
self.lstm1 = LSTM(128,return_sequences=True,unroll=False)
self.lstm2 = LSTM(128)
self.q = Dense(1, activation=None)
def call(self, state, action):
action_value = tf.concat([state, action], axis=1)
action_value = np.reshape(action_value,(32,1,44))
action_value = self.lstm1(action_value)
action_value = self.lstm2(action_value)
q = self.q(action_value)
return q
class ActorNetwork(keras.Model):
def __init__(self,n_actions=1, name='actor',chkpt_dir='ddpg'):
super(ActorNetwork, self).__init__()
self.n_actions = n_actions
self.model_name = name
self.checkpoint_dir = chkpt_dir
self.checkpoint_file = os.path.join(self.checkpoint_dir,
self.model_name+'_ddpg.h5')
self.lstm1 = LSTM(128,return_sequences=True,unroll=False)
self.lstm2 = LSTM(128)
self.mu = Dense(self.n_actions, activation='tanh')
def call(self, state):
state = np.reshape(state,(32,1,43))
prob = self.lstm1(state)
prob = self.lstm2(prob)
mu = self.mu(prob)
return mu
class Agent:
def __init__(self, alpha=0.001, beta=0.002, input_dims=[33], env=None,
gamma=0.99, n_actions=1, max_size=3000000, tau=0.005,
fc1=128, fc2=128, fc3=64, batch_size=32):
self.gamma = gamma
self.tau = tau
self.memory = ReplayBuffer(max_size, input_dims, n_actions)
self.batch_size = batch_size
self.n_actions = n_actions
self.max_action = 1
self.min_action = -1
self.actor = ActorNetwork(n_actions=n_actions, name='actor')
self.critic = CriticNetwork(n_actions=n_actions, name='critic')
self.target_actor = ActorNetwork(n_actions=n_actions, name='target_actor')
self.target_critic = CriticNetwork(n_actions=n_actions, name='target_critic')
self.actor.compile(optimizer=Adam(learning_rate=alpha))
self.critic.compile(optimizer=Adam(learning_rate=beta))
self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
self.target_critic.compile(optimizer=Adam(learning_rate=beta))
self.update_network_parameters(tau=1)
def update_network_parameters(self, tau=None):
if tau is None:
tau = self.tau
weights = []
targets = self.target_actor.weights
for i, weight in enumerate(self.actor.weights):
weights.append(weight * tau + targets[i]*(1-tau))
self.target_actor.set_weights(weights)
weights = []
targets = self.target_critic.weights
for i, weight in enumerate(self.critic.weights):
weights.append(weight * tau + targets[i]*(1-tau))
self.target_critic.set_weights(weights)
def remember(self, state, action, reward, new_state, done):
self.memory.store_transition(state, action, reward, new_state, done)
def save_models(self):
print('... saving models ...')
self.actor.save_weights(self.actor.checkpoint_file)
self.target_actor.save_weights(self.target_actor.checkpoint_file)
self.critic.save_weights(self.critic.checkpoint_file)
self.target_critic.save_weights(self.target_critic.checkpoint_file)
def load_models(self):
print('... loading models ...')
self.actor.load_weights(self.actor.checkpoint_file)
self.target_actor.load_weights(self.target_actor.checkpoint_file)
self.critic.load_weights(self.critic.checkpoint_file)
self.target_critic.load_weights(self.target_critic.checkpoint_file)
def choose_action(self, observation, evaluate=False):
state = tf.convert_to_tensor([observation], dtype=tf.float32)
actions = self.actor(state)
if not evaluate:
actions += tf.random.normal(shape=[self.n_actions],
mean=0.0, stddev=0.05)
actions = tf.clip_by_value(actions, self.min_action, self.max_action)
return actions
def learn(self):
if self.memory.mem_cntr < self.batch_size:
return
state, action, reward, new_state, done = \
self.memory.sample_buffer(self.batch_size)
states = tf.convert_to_tensor(state, dtype=tf.float32)
states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
actions = tf.convert_to_tensor(action, dtype=tf.float32)
with tf.GradientTape() as tape:
target_actions = self.target_actor(states_)
critic_value_ = tf.squeeze(self.target_critic(
states_, target_actions), 1)
critic_value = tf.squeeze(self.critic(states, actions), 1)
target = reward + self.gamma*critic_value_*(1-done)
critic_loss = keras.losses.MSE(target, critic_value)
critic_network_gradient = tape.gradient(critic_loss, self.critic.trainable_variables)
self.critic.optimizer.apply_gradients(zip(critic_network_gradient, self.critic.trainable_variables))
with tf.GradientTape() as tape:
new_policy_actions = self.actor(states)
actor_loss = -self.critic(states, new_policy_actions)
actor_loss = tf.math.reduce_mean(actor_loss)
actor_network_gradient = tape.gradient(actor_loss, self.actor.trainable_variables)
self.actor.optimizer.apply_gradients(zip(actor_network_gradient, self.actor.trainable_variables))
self.update_network_parameters()
Any advice or help in terms of how to record and apply the gradient for LSTMs would help.
Actually, the problem is that you are using Numpy operations to define the computation logic (hence, the gradients cannot flow from that point onward); however, this should be done entirely using TF Ops or Keras layers. Specifically, in call method of CriticNetwork as well as ActorNetwork, instead of using np.reshape you should either use tf.expand_dims (if you only want to add a new axis of size one to the tensor), or tf.reshape, or tf.keras.layers.Reshape layer (for more involved reshaping). For example, using tf.expand_dims:
class CriticNetwork(keras.Model):
#...
def call(self, state, action):
action_value = tf.concat([state, action], axis=1)
action_value = tf.expand_dims(action_value, axis=1)
or using tf.reshape:
class CriticNetwork(keras.Model):
#...
def call(self, state, action):
action_value = tf.concat([state, action], axis=1)
action_value = tf.reshape(action_value, (-1, 1, 44)) # Use `-1` for the first axis so that any batch size would be supported
or using Reshape layer:
class CriticNetwork(keras.Model):
def __init__(self, n_actions,name='critic', chkpt_dir='ddpg'):
# ...
self.reshape = tf.keras.layers.Reshape((1, 44))
def call(self, state, action):
action_value = tf.concat([state, action], axis=1)
action_value = self.reshape(action_value)
And you need to do the same thing for the ActorNetwork.
Side note: I am not sure if this is just a demo code or not, but note that using RNN layers on a sequence of length one (i.e. having only one timestep) may not prove to be very beneficial.

How may I do equalized learning rate with tensorflow 2?

I am trying to implement StyleGAN with TensorFlow version 2 and I have no idea how to do an equalized learning rate. I tried to scale gradients this way:
def equalize_in_list(datalist):
for i in range(len(datalist)):
if (datalist[i] is list):
equalize_in_list(datalist[i])
else:
datalist[i] = datalist[i] * np.sqrt(2)/np.prod(datalist[i].shape)
return datalist
gen_grad = equalize_in_list(gen_grad)
disc_grad = equalize_in_list(disc_grad)
But it doesn't work correctly.
You can just create a custom layer.
class DenseEQ(Dense):
"""
Standard dense layer but includes learning rate equilization
at runtime as per Karras et al. 2017.
Inherits Dense layer and overides the call method.
"""
def __init__(self, **kwargs):
if 'kernel_initializer' in kwargs:
raise Exception("Cannot override kernel_initializer")
super().__init__(kernel_initializer=normal(0,1), **kwargs)
def build(self, input_shape):
super().build(input_shape)
# The number of inputs
n = np.product([int(val) for val in input_shape[1:]])
# He initialisation constant
self.c = np.sqrt(2/n)
def call(self, inputs):
output = K.dot(inputs, self.kernel*self.c) # scale kernel
if self.use_bias:
output = K.bias_add(output, self.bias, data_format='channels_last')
if self.activation is not None:
output = self.activation(output)
return output
And then create a model as you normally would... (But you'll have to specify its arguments explicitly e.g. : units=x. Positionaly arguments will not work)
model_in = Input(shape(12,))
x = DenseEq(name="whatever_1", units=16)(model_in)
x = LeakyRelu(0.2)(x)
x = DenseEq(name="whatever_2", units=1)(model_in)
model_out = LeakyRelu(0.2)(x)
model = Model(model_in, model_out)
You can do the same thing for a convolution.
class Conv2DEQ(Conv2D):
"""
Standard Conv2D layer but includes learning rate equilization
at runtime as per Karras et al. 2017.
Inherits Conv2D layer and overrides the call method, following
https://github.com/keras-team/keras/blob/master/keras/layers/convolutional.py
"""
def __init__(self, **kwargs):
if 'kernel_initializer' in kwargs:
raise Exception("Cannot override kernel_initializer")
super().__init__(kernel_initializer=normal(0,1), **kwargs)
def build(self, input_shape):
super().build(input_shape)
# The number of inputs
n = np.product([int(val) for val in input_shape[1:]])
# He initialisation constant
self.c = np.sqrt(2/n)
def call(self, inputs):
if self.rank == 2:
outputs = K.conv2d(
inputs,
self.kernel*self.c, # scale kernel
strides=self.strides,
padding=self.padding,
data_format=self.data_format,
dilation_rate=self.dilation_rate)
if self.use_bias:
outputs = K.bias_add(
outputs,
self.bias,
data_format=self.data_format)
if self.activation is not None:
return self.activation(outputs)
return outputs

Categories