I am trying to implement normalizing flows embedded in a Keras model. In all the examples I can find, such as the MAF documentation, the bijectors that constitute the normalizing flow are wrapped in a TransformedDistribution, which is then exposed directly for training etc.
I am trying to embed this TransformedDistribution in a Keras Model, to match the architecture of other models I have which inherit from keras.Model.
Unfortunately, all my attempts so far (see the code below) fail to transfer the trainable variables inside the transformed distribution to the Keras Model.
I have tried making the bijector inherit from tf.keras.layers.Layer, which did not change anything.
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions
tfb = tfp.bijectors
class Flow(tfb.Bijector, tf.Module):
"""
tf.Module to register trainable_variables
"""
def __init__(self, d, init_sigma=0.1, **kwargs):
super(Flow, self).__init__(
dtype=tf.float32,
forward_min_event_ndims=0,
inverse_min_event_ndims=0,
**kwargs
)
# Shape of the flow goes from Rd to Rd
self.d = d
# Weights/Variables initializer
self.init_sigma = init_sigma
w_init = tf.random_normal_initializer(stddev=self.init_sigma)
# Variables
self.u = tf.Variable(
w_init(shape=[1, self.d], dtype=tf.float32),
dtype=tf.float32,
name='u',
trainable=True,
)
def _forward(self, x):
return x
def _inverse(self, y):
return y
class Flows(tf.keras.Model):
def __init__(self, d=2, shape=(100, 2), n_flows=10, ):
super(Flows, self).__init__()
# Parameters
self.d = d
self.shape = shape
self.n_flows = n_flows
# Base distribution - MF = Multivariate normal diag
base_distribution = tfd.MultivariateNormalDiag(
loc=tf.zeros(shape=shape, dtype=tf.float32)
)
# Flows as chain of bijector
flows = []
for n in range(n_flows):
flows.append(Flow(self.d, name=f"flow_{n + 1}"))
bijector = tfb.Chain(list(reversed(flows)))
self.flow = tfd.TransformedDistribution(
distribution=base_distribution,
bijector=bijector
)
def call(self, *inputs):
return self.flow.bijector.forward(*inputs)
def log_prob(self, *inputs):
return self.flow.log_prob(*inputs)
def sample(self, num):
return self.flow.sample(num)
q = Flows()
# Call to instantiate variables
q(tf.zeros(q.shape))
# Prints no trainable params
print(q.summary())
# Prints expected trainable params
print(q.flow.trainable_variables)
Any idea if this is even possible? Thanks!
I bumped into this issue as well. It seems to be caused by incompatibility between TFP and TF 2.0 (a couple of relevant issues: https://github.com/tensorflow/probability/issues/355 and https://github.com/tensorflow/probability/issues/946).
As a workaround, you need to add the (trainable) variables of your transformed distribution / bijector as an attribute to your Keras Model:
class Flows(tf.keras.Model):
def __init__(self, d=2, shape=(100, 2), n_flows=10, ):
super(Flows, self).__init__()
# Parameters
self.d = d
self.shape = shape
self.n_flows = n_flows
# Base distribution - MF = Multivariate normal diag
base_distribution = tfd.MultivariateNormalDiag(
loc=tf.zeros(shape=shape, dtype=tf.float32)
)
# Flows as chain of bijector
flows = []
for n in range(n_flows):
flows.append(Flow(self.d, name=f"flow_{n + 1}"))
bijector = tfb.Chain(list(reversed(flows)))
self.flow = tfd.TransformedDistribution(
distribution=base_distribution,
bijector=bijector
)
# issue: https://github.com/tensorflow/probability/issues/355, https://github.com/tensorflow/probability/issues/946
# need to add bijector's trainable variables as an attribute (name does not matter)
# otherwise this layer has zero trainable variables
self._variables = self.flow.variables # https://github.com/tensorflow/probability/issues/355
def call(self, *inputs):
return self.flow.bijector.forward(*inputs)
def log_prob(self, *inputs):
return self.flow.log_prob(*inputs)
def sample(self, num):
return self.flow.sample(num)
After adding this, your model should have trainable variables and weights to optimize.
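With the variables now tracked, the check from the original question should succeed; a minimal sketch of verifying it:
q = Flows()
# Call once to build the model
q(tf.zeros(q.shape))
# Both should now report the flows' parameters
print(len(q.trainable_variables))  # expect 10: one `u` variable per Flow in the chain
q.summary()                        # the trainable-parameter count should no longer be zero
From there, the usual pattern is to minimize -q.log_prob(data) with tf.GradientTape and a Keras optimizer, once the bijectors' _forward/_inverse actually use their variables (the Flow above is an identity stub, so its u does not yet enter the computation).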
I have the following code:
import torch
from torch.nn.utils.stateless import functional_call
import torch.autograd as autograd
import torch.nn as nn
# This is the model
class Encoder(nn.Module):
def __init__(self, action_dim, z_dim, skill_length):
super().__init__()
print(action_dim)
self.lin1 = nn.Linear(action_dim, action_dim)
self.lstm = nn.LSTM(input_size=action_dim, hidden_size=z_dim, batch_first=True)
self.lin2 = nn.Linear(z_dim, z_dim)
def forward(self, skill):
a, b, c = skill.shape
skill = skill.reshape(-1, skill.shape[-1])
embed = self.lin1(skill)
embed = embed.reshape(a, b, c)
mean, _ = self.lstm(embed)
mean = mean[:, -1, :]
mean = self.lin2(mean)
return mean
# This is the initialization function
def pars(model):
params = {}
for name, param in model.named_parameters():
if len(param.shape) == 1:
init = torch.nn.init.constant_(param, 0)
else:
init = torch.nn.init.orthogonal_(param)
params[name] = nn.Parameter(init)
return params
# Initializating the model
model = Encoder(4, 2, 5)
x = torch.rand(3, 5, 4)
params = pars(model)
# Running the model with functional_call and calculating gradient.
samp = functional_call(model, params, x)
grad_f = autograd.grad(torch.mean(samp), params.values(),
retain_graph=True, allow_unused=True)
print(grad_f)
# grad_f has gradient for the linear layer, but None for the LSTM layer.
# Running the model without functional_call and calculating gradient.
samp = model(x)
grad = autograd.grad(torch.mean(samp), model.parameters(), retain_graph=True)
print(grad)
# grad has gradient for all layers, e.g., linears and lstm.
I know the problem is with the LSTM layer, because when I use an nn.Linear layer in its place, the gradient is computed for it as well as the other linear layers. Unfortunately, I do not know how to resolve this problem. I'd appreciate any help.
Edit: I heavily edited the code provided to further simplify the example. The code above can be copied and run.
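As a point of comparison, a minimal sketch with a hypothetical LinearEncoder (the LSTM swapped for an nn.Linear) shows functional_call returning gradients for every parameter:
import torch
import torch.nn as nn
import torch.autograd as autograd
from torch.nn.utils.stateless import functional_call

class LinearEncoder(nn.Module):
    # Same data flow as Encoder, but the LSTM is replaced by a Linear layer
    def __init__(self, action_dim, z_dim):
        super().__init__()
        self.lin1 = nn.Linear(action_dim, action_dim)
        self.lin_mid = nn.Linear(action_dim, z_dim)  # stand-in for the LSTM
        self.lin2 = nn.Linear(z_dim, z_dim)

    def forward(self, skill):
        embed = self.lin1(skill)
        mean = self.lin_mid(embed)[:, -1, :]  # keep only the last "time step"
        return self.lin2(mean)

model = LinearEncoder(4, 2)
x = torch.rand(3, 5, 4)
params = {name: nn.Parameter(p.clone()) for name, p in model.named_parameters()}
samp = functional_call(model, params, x)
grad_f = autograd.grad(torch.mean(samp), params.values(), allow_unused=True)
print([g is not None for g in grad_f])  # all True: every layer receives a gradient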
Update Dec 11, 2022
class Encoder(nn.Module):
def __init__(self, action_dim, z_dim, skill_length):
super().__init__()
print(action_dim)
self.lin1 = nn.Linear(action_dim, action_dim)
self.lstm = nn.LSTM(input_size=action_dim, hidden_size=z_dim, batch_first=True)
self.lin2 = nn.Linear(z_dim, z_dim)
def forward(self, skill):
a, b, c = skill.shape
skill = skill.reshape(-1, skill.shape[-1])
embed = self.lin1(skill)
embed = embed.reshape(a, b, c)
mean, _ = self.lstm(embed)
import pdb; pdb.set_trace()  # drop into the debugger to run the checks below
grad1 = autograd.grad(mean.mean(), params.values(),
retain_graph=True, allow_unused=True)
# This gives gradient for the self.lin1 layer, and None for the LSTM
grad2 = autograd.grad(mean.mean(), self.parameters(),
retain_graph=True, allow_unused=True)
# This gives gradient the LSTM, but None for the self.lin1 layer
mean = mean[:, -1, :]
mean = self.lin2(mean)
return mean
When I run it the regular way, without functional_call, and call the model directly, then autograd.grad(mean.mean(), self.parameters(), allow_unused=True, retain_graph=True) returns gradients for both the self.lin1 and the LSTM layers.
I don't know if this information is useful, but I am putting it out there just in case.
I am trying to use the Deep Galerkin Method (DGM) to solve high-dimensional PDEs and I am facing a problem. For illustrative purposes, I am posting a simple optimization problem below. The feed-forward network successfully recovers the optimal function, but the DGM network fails to do so. Any help is highly appreciated.
import logging, os
os.system('clear')
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# CLASS DEFINITIONS FOR NEURAL NETWORKS USED IN DEEP GALERKIN METHOD
#%% import needed packages
import tensorflow as tf
#%% LSTM-like layer used in DGM (see Figure 5.3 and set of equations on p. 45) - modification of Keras layer class
class LSTMLayer(tf.keras.layers.Layer):
# constructor/initializer function (automatically called when new instance of class is created)
def __init__(self, output_dim, input_dim, trans1 = "tanh", trans2 = "tanh"):
'''
Args:
input_dim (int): dimensionality of input data
output_dim (int): number of outputs for LSTM layers
trans1, trans2 (str): activation functions used inside the layer;
one of: "tanh" (default), "relu" or "sigmoid"
Returns: customized Keras layer object used as intermediate layers in DGM
'''
# create an instance of a Layer object (call initialize function of superclass of LSTMLayer)
super(LSTMLayer, self).__init__()
# add properties for layer including activation functions used inside the layer
self.output_dim = output_dim
self.input_dim = input_dim
if trans1 == "tanh":
self.trans1 = tf.nn.tanh
elif trans1 == "relu":
self.trans1 = tf.nn.relu
elif trans1 == "sigmoid":
self.trans1 = tf.nn.sigmoid
if trans2 == "tanh":
self.trans2 = tf.nn.tanh
elif trans2 == "relu":
self.trans2 = tf.nn.relu
elif trans2 == "sigmoid":
self.trans2 = tf.nn.sigmoid
### define LSTM layer parameters (use Xavier initialization)
# u vectors (weighting vectors for the original inputs x)
self.Uz = self.add_variable("Uz", shape=[self.input_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
self.Ug = self.add_variable("Ug", shape=[self.input_dim ,self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
self.Ur = self.add_variable("Ur", shape=[self.input_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
self.Uh = self.add_variable("Uh", shape=[self.input_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
# w vectors (weighting vectors for output of previous layer)
self.Wz = self.add_variable("Wz", shape=[self.output_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
self.Wg = self.add_variable("Wg", shape=[self.output_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
self.Wr = self.add_variable("Wr", shape=[self.output_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
self.Wh = self.add_variable("Wh", shape=[self.output_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
# bias vectors
self.bz = self.add_variable("bz", shape=[1, self.output_dim])
self.bg = self.add_variable("bg", shape=[1, self.output_dim])
self.br = self.add_variable("br", shape=[1, self.output_dim])
self.bh = self.add_variable("bh", shape=[1, self.output_dim])
# main function to be called
def call(self, S, X):
'''Compute output of a LSTMLayer for a given inputs S,X .
Args:
S: output of previous layer
X: data input
Returns: customized Keras layer object used as intermediate layers in DGM
'''
# compute components of LSTM layer output (note H uses a separate activation function)
Z = self.trans1(tf.add(tf.add(tf.matmul(X,self.Uz), tf.matmul(S,self.Wz)), self.bz))
G = self.trans1(tf.add(tf.add(tf.matmul(X,self.Ug), tf.matmul(S, self.Wg)), self.bg))
R = self.trans1(tf.add(tf.add(tf.matmul(X,self.Ur), tf.matmul(S, self.Wr)), self.br))
H = self.trans2(tf.add(tf.add(tf.matmul(X,self.Uh), tf.matmul(tf.multiply(S, R), self.Wh)), self.bh))
# compute LSTM layer output
S_new = tf.add(tf.multiply(tf.subtract(tf.ones_like(G), G), H), tf.multiply(Z,S))
return S_new
#%% Fully connected (dense) layer - modification of Keras layer class
class DenseLayer(tf.keras.layers.Layer):
# constructor/initializer function (automatically called when new instance of class is created)
def __init__(self, output_dim, input_dim, transformation=None):
'''
Args:
input_dim: dimensionality of input data
output_dim: number of outputs for dense layer
transformation: activation function used inside the layer; using
None is equivalent to the identity map
Returns: customized Keras (fully connected) layer object
'''
# create an instance of a Layer object (call initialize function of superclass of DenseLayer)
super(DenseLayer,self).__init__()
self.output_dim = output_dim
self.input_dim = input_dim
### define dense layer parameters (use Xavier initialization)
# w vectors (weighting vectors for output of previous layer)
self.W = self.add_variable("W", shape=[self.input_dim, self.output_dim],
initializer = tf.contrib.layers.xavier_initializer())
# bias vectors
self.b = self.add_variable("b", shape=[1, self.output_dim])
if transformation:
if transformation == "tanh":
self.transformation = tf.tanh
elif transformation == "relu":
self.transformation = tf.nn.relu
else:
self.transformation = transformation
# main function to be called
def call(self,X):
'''Compute output of a dense layer for a given input X
Args:
X: input to layer
'''
# compute dense layer output
S = tf.add(tf.matmul(X, self.W), self.b)
if self.transformation:
S = self.transformation(S)
return S
#%% Neural network architecture used in DGM - modification of Keras Model class
class DGMNet(tf.keras.Model):
# constructor/initializer function (automatically called when new instance of class is created)
def __init__(self, layer_width, n_layers, input_dim, final_trans=None):
'''
Args:
layer_width: number of neurons per intermediate layer
n_layers: number of intermediate LSTM layers
input_dim: spatial dimension of input data (EXCLUDES time dimension)
final_trans: transformation used in final layer
Returns: customized Keras model object representing DGM neural network
'''
# create an instance of a Model object (call initialize function of superclass of DGMNet)
super(DGMNet,self).__init__()
# define initial layer as fully connected
# NOTE: to account for time inputs we use input_dim+1 as the input dimensionality
self.initial_layer = DenseLayer(layer_width, input_dim, transformation = "tanh")
# define intermediate LSTM layers
self.n_layers = n_layers
self.LSTMLayerList = []
for _ in range(self.n_layers):
self.LSTMLayerList.append(LSTMLayer(layer_width, input_dim))
# define final layer as fully connected with a single output (function value)
self.final_layer = DenseLayer(1, layer_width, transformation = final_trans)
# main function to be called
def call(self,x):
'''
Args:
t: sampled time inputs
x: sampled space inputs
Run the DGM model and obtain fitted function value at the inputs (t,x)
'''
# define input vector as time-space pairs
X = tf.concat([x],1)
# call initial layer
S = self.initial_layer.call(X)
# call intermediate LSTM layers
for i in range(self.n_layers):
S = self.LSTMLayerList[i].call(S,X)
# call final dense layer
result = self.final_layer.call(S)
return result
#%% main class
class check():
def __init__(self,v,x,layers,learning_rate,adam_iter,params):
self.params=params
self.v=v
self.x=x
self.learning_rate = learning_rate
self.adam_iter = adam_iter
self.lb = np.array([self.x[0][0]])
self.ub = np.array([self.x[-1][0]])
self.sess = tf.Session(config = tf.ConfigProto(allow_soft_placement = True, log_device_placement = True))
self.x_tf = tf.placeholder(tf.float32, shape=[None,self.x.shape[1]])
self.v_tf = tf.placeholder(tf.float32, shape=[None,self.v.shape[1]])
self.x_u_tf = tf.placeholder(tf.float32, shape=[None,self.x.shape[1]])
self.v_u_tf = tf.placeholder(tf.float32, shape=[None,self.v.shape[1]])
self.weights_v,self.biases_v = self.initialize_nn(layers)
self.weights_i,self.biases_i = self.initialize_nn(layers)
with tf.variable_scope("control",reuse=True):
self.i_pred = self.net_i(self.x_tf)
with tf.variable_scope("value",reuse=True):
self.v_pred = self.net_v(self.x_tf)
self.error_i = self.policy_error(self.x_tf)
self.loss_v = tf.math.reduce_max(tf.abs(self.v_pred-self.v_tf))
self.loss = tf.math.reduce_max(tf.abs(self.v_pred-self.v_tf)) + tf.reduce_mean(tf.square(self.error_i))
self.optimizer_Adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
self.train_op_Adam = self.optimizer_Adam.minimize(self.loss)
self.optimizer_Adam_v = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
self.train_op_Adam_v = self.optimizer_Adam.minimize(self.loss_v)
init = tf.global_variables_initializer()
self.sess.run(init)
def policy_error(self,x):
i=self.net_i(x)
v_ = self.net_v(x+i)
l = v_ - i*x**2
error_i = tf.gradients(l,i)[0]
return error_i
def initialize_nn(self,layers):
weights = []
biases = []
num_layers = len(layers)
for l in range(num_layers-1):
W = self.xavier_init(size = [layers[l],layers[l+1]])
b = tf.Variable(tf.zeros([1,layers[l+1]], dtype=tf.float32), dtype = tf.float32)
weights.append(W)
biases.append(b)
return weights,biases
def xavier_init(self,size):
in_dim = size[0]
out_dim = size[1]
xavier_stddev = np.sqrt(2/(in_dim + out_dim))
try:
val = tf.Variable(tf.random.truncated_normal([in_dim,out_dim], stddev = xavier_stddev), dtype = tf.float32)
except:
val = tf.Variable(tf.truncated_normal([in_dim,out_dim], stddev = xavier_stddev), dtype = tf.float32)
return val
def neural_net(self,X,weights,biases):
num_layers = len(weights) +1
H = 2.0*(X - self.lb)/(self.ub - self.lb) -1
#H=X
for l in range(num_layers-2):
W = weights[l]
b = biases[l]
H = tf.tanh(tf.add(tf.matmul(H,W),b))
W = weights[-1]
b = biases[-1]
Y = tf.add(tf.matmul(H,W),b)
return Y
def net_v(self,eta):
if self.params['DGM']==True:
model_v = DGMNet(self.params['neurons_per_layer'],self.params['num_layers'],1)
v_u = model_v(eta)
else:
X = tf.concat([eta],1)
v_u = self.neural_net(X,self.weights_v,self.biases_v)
return v_u
def net_i(self,eta):
if self.params['DGM']==True:
model_i = DGMNet(self.params['neurons_per_layer'],self.params['num_layers'],1)
i_u = model_i(eta)
else:
X = tf.concat([eta],1)
i_u = self.neural_net(X,self.weights_i,self.biases_i)
return i_u
def callback(self,loss):
print('Loss: ',loss)
def train(self):
#K.clear_session()
start_time = time.time()
if True: #set this to true if you want adam to run
tf_dict = {self.v_tf:self.v, self.x_tf:self.x}
for it in range(self.adam_iter):
self.sess.run(self.train_op_Adam_v, tf_dict)
# Print
if it % 1000 == 0:
elapsed = time.time() - start_time
loss_value = self.sess.run(self.loss_v, tf_dict)
print('It: %d, Loss: %.3e, Time: %.2f' %
(it, loss_value, elapsed))
start_time = time.time()
start_time = time.time()
if True: #set this to true if you want adam to run
tf_dict = {self.v_tf:self.v, self.x_tf:self.x}
for it in range(self.adam_iter):
self.sess.run(self.train_op_Adam, tf_dict)
# Print
if it % 1000 == 0:
elapsed = time.time() - start_time
loss_value = self.sess.run(self.loss, tf_dict)
print('It: %d, Loss: %.3e, Time: %.2f' %
(it, loss_value, elapsed))
start_time = time.time()
start_time = time.time()
def predict(self,X_star):
i_star = self.sess.run(self.i_pred,{self.x_tf: X_star[:,0:1]})
v_star = self.sess.run(self.v_pred,{self.x_tf: X_star[:,0:1]})
error = self.sess.run(self.error_i,{self.x_tf: X_star[:,0:1]})
tf.reset_default_graph()
return i_star,v_star,error
#%%
if __name__=="__main__":
params={'DGM':True,'neurons_per_layer':50,'num_layers':4}
x=np.linspace(-1,1,100).reshape(-1,1).astype(np.float32)
v=(10 - x**2).reshape(-1,1).astype(np.float32)
#architecture for feed-forward network
layers = [1, 10,1]
learning_rate = 0.001
adam_iter = 5000
run = check(v,x,layers,learning_rate,adam_iter,params)
run.train()
i_star,v_star,error=run.predict(x)
The problem is to find the optimal function i that maximizes the objective v(x+i) - i*x^2 = 10 - (x+i)^2 - i*x^2, where v(x) = 10 - x^2 and x is the state variable; the optimal i therefore depends on x. If I set 'DGM' to False in the parameter dictionary and run the code (the functions are then coded as feed-forward neural networks), I get the right solution; the correct analytical solution is i_star = 0.5*(-2x - x^2). If I set 'DGM' to True, the solution is incorrect. I tried different numbers of layers and neurons per layer, but DGM always gives an incorrect solution. A quick numerical check of the analytical optimum is sketched below.
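For reference, the first-order condition d/di [10 - (x+i)^2 - i*x^2] = -2(x+i) - x^2 = 0 gives i = -x - x^2/2, i.e. i_star = 0.5*(-2x - x^2); a minimal check, independent of the networks above:
import numpy as np

x = np.linspace(-1, 1, 5)
i_star = 0.5 * (-2 * x - x**2)  # candidate optimum from the first-order condition

def objective(i, x):
    # v(x + i) - i * x**2 with v(y) = 10 - y**2
    return 10 - (x + i)**2 - i * x**2

# perturbing i_star in either direction should never increase the objective
eps = 1e-3
print(np.all(objective(i_star, x) >= objective(i_star + eps, x)))  # True
print(np.all(objective(i_star, x) >= objective(i_star - eps, x)))  # True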
Am I doing something wrong? Many thanks.
I am performing reinforcement learning and need to train an actor and a critic neural network over a custom environment. I have the following code for my networks and RL agent:
import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras.losses as kls
class critic(tf.keras.Model):
## Critic NN
def __init__(self):
super().__init__()
self.d1 = tf.keras.layers.Dense(64,activation='relu')
self.v = tf.keras.layers.Dense(1, activation = None)
def call(self, input_data):
x = self.d1(input_data)
v = self.v(x)
return v
class actor(tf.keras.Model):
## Actor NN
def __init__(self):
super().__init__()
self.d1 = tf.keras.layers.Dense(64,activation='relu')
self.a = tf.keras.layers.Dense(4,activation='softmax')
def call(self, input_data):
x = self.d1(input_data)
a = self.a(x)
return a
class rlalgorithm:
## RL Agent that trains the above NNs based on data from environment
def __init__(self, actions, learning_rate=0.1):
## Display name for graphing performance
self.display_name="A2C"
## Root Mean Square Optimizer for minimizing A2C losses
self.a_opt = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
self.c_opt = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
## Initialize models
self.actor = actor()
self.critic = critic()
## Define training constants and variables
learn_rate = tf.constant(learning_rate, dtype=tf.float32)
self.reward = tf.Variable(initial_value=0, dtype=tf.float32)
self.state = tf.Variable(initial_value=tf.zeros(shape=(1, 4)), dtype=tf.float32, shape=(1, 4))
self.next_state = tf.Variable(initial_value=tf.zeros(shape=(1, 4)), dtype=tf.float32, shape=(1, 4))
self.action = tf.Variable(initial_value=0, dtype=tf.float32)
## The graph that produces the advantage
advantage = self.reward + learn_rate*self.critic(self.next_state) - self.critic(self.state)
## Graph that produces losses
dist = tfp.distributions.Categorical(probs=self.actor(self.state), dtype=tf.float32)
self.actor_loss = dist.log_prob(self.action)*advantage
self.critic_loss = advantage**2
def choose_action(self, state):
## Agent chooses action to proceed to next state
prob = self.actor(tf.convert_to_tensor([state]))
dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
action = dist.sample()
return int(action.numpy()[0])
def learn(self, s, a, r, s_):
## Based on chosen action, learn from result
## Assign training variables for this state-action outcome
self.reward = self.reward.assign(r)
self.state = self.state.assign(tf.convert_to_tensor([s]))
self.next_state = self.next_state.assign(tf.convert_to_tensor([s_]))
self.action = self.action.assign(a)
## Generate the loss gradient for actor
with tf.GradientTape() as tape:
actor_grad = tape.gradient(self.actor_loss, self.actor.trainable_variables)
self.a_opt.apply_gradients(zip(actor_grad, self.actor.trainable_variables))
## Generate the loss gradient for critic
with tf.GradientTape() as tape:
critic_grad = tape.gradient(self.critic_loss, self.critic.trainable_variables)
self.c_opt.apply_gradients(zip(critic_grad, self.critic.trainable_variables))
## Environment uses this, not relevant to learning
return s_, self.choose_action(s_)
I am getting the following error:
ValueError: No gradients provided for any variable: ['actor/dense/kernel:0', 'actor/dense/bias:0', 'actor/dense_1/kernel:0', 'actor/dense_1/bias:0'].
I have seen this question asked multiple times, but none of the previous solutions seem to work for my case. Unfortunately, I cannot provide the environment this agent runs on, but the error is contained entirely within the file above.
I have read through the docs and tried a similar implementation using the optimizer's minimize function, with the same results.
I suspect the issue is related to how the TensorFlow graphs are defined, but I am unsure what exactly the problem is.
Any and all help is appreciated.
I solved this on my own.
What I didn't understand was the proper usage of tf.GradientTape: the operations that compute the loss need to be performed inside the with block, so that the tape records them and the gradients can be found.
Here is the updated learn function, for anybody else's reference:
def learn(self, s, a, r, s_):
## Based on chosen action, learn from result
## Assign training variables for this state-action outcome
self.reward = self.reward.assign(r)
self.state = self.state.assign(tf.convert_to_tensor([s]))
self.next_state = self.next_state.assign(tf.convert_to_tensor([s_]))
self.action = self.action.assign(a)
## Generate the loss gradient for critic
with tf.GradientTape() as tape:
advantage = self.reward + self.learn_rate*self.critic(self.next_state) - self.critic(self.state)  # assumes __init__ now stores self.learn_rate = tf.constant(learning_rate, dtype=tf.float32)
critic_loss = advantage**2
critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
self.c_opt.apply_gradients(zip(critic_grad, self.critic.trainable_variables))
## Generate the loss gradient for actor
with tf.GradientTape() as tape:
dist = tfp.distributions.Categorical(probs=self.actor(self.state), dtype=tf.float32)
actor_loss = dist.log_prob(self.action)*advantage
actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
self.a_opt.apply_gradients(zip(actor_grad, self.actor.trainable_variables))
## Environment uses this, not relevant to learning
return s_, self.choose_action(s_)
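The underlying rule is that only operations executed while the tape is recording are differentiable through it; a minimal standalone sketch of the difference:
x = tf.Variable(3.0)

# forward pass recorded by the tape: the gradient is available
with tf.GradientTape() as tape:
    y = x * x
print(tape.gradient(y, x))  # tf.Tensor(6.0, ...)

# forward pass computed outside the tape: nothing was recorded, so the
# gradient is None, which an optimizer reports as "No gradients provided"
y_outside = x * x
with tf.GradientTape() as tape:
    pass
print(tape.gradient(y_outside, x))  # None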
I am trying to run the following code (as given in the TensorFlow documentation) to create windows of my data and then flatten the dataset of datasets.
window_size = 5
windows = range_ds.window(window_size, shift=1)
for sub_ds in windows.take(5):
print(sub_ds)
flat_windows = windows.flat_map(lambda x: x)
The problem is that flat_windows.cardinality().numpy() returns a cardinality of -2, which is creating problems for me during training. I tried looking for ways to set the cardinality of a dataset but couldn't find anything. I also tried other ways of flattening a dataset of datasets, but again with no success.
Edit-1: The problem during training is that the shape is unknown (at the Linear and Dense layers) when I train the subclassed model given below. The model trains well when I run it eagerly (through tf.config.run_functions_eagerly(True)), but that is slow. Therefore I want the shape of the input data to be known for model training.
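One common workaround for the unknown shapes (a sketch, assuming only complete windows are wanted) is to batch each sub-dataset before flattening, so every element becomes a tensor with a static shape:
window_size = 5
# drop_remainder=True keeps only complete windows
windows = range_ds.window(window_size, shift=1, drop_remainder=True)
# each sub-dataset collapses into a single tensor of static shape (window_size,)
batched_windows = windows.flat_map(lambda w: w.batch(window_size, drop_remainder=True))
for w in batched_windows.take(3):
    print(w.shape)  # (5,)
Note that this only makes the shapes static; the cardinality after flat_map is still unknown.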
Neural Network
class NeuralNetworkModel(tf.keras.Model):
def __init__(self):
super(NeuralNetworkModel, self).__init__()
self.encoder = Encoder()
def train_step(self, inputs):
X = inputs[0]
Y = inputs[1]
with tf.GradientTape() as tape:
enc_X = self.encoder(X)
enc_Y = self.encoder(Y)
# loss:
loss = tf.norm(enc_Y - enc_X, axis = [0, 1], ord = 'fro')
# Compute gradients
trainable_vars = self.encoder.trainable_variables
gradients = tape.gradient(loss, trainable_vars)
# Update weights
self.optimizer.apply_gradients(zip(gradients, trainable_vars))
# Compute our own metrics
loss_tracker.update_state(loss)
# Return a dict mapping metric names to current value.
# Note that it will include the loss (tracked in self.metrics).
return {"loss": loss_tracker.result()}
@property
def metrics(self):
# We list our `Metric` objects here so that `reset_states()` can be
# called automatically at the start of each epoch
# or at the start of `evaluate()`.
# If you don't implement this property, you have to call
# `reset_states()` yourself at the time of your choosing.
return [loss_tracker]
def test_step(self, inputs):
X = inputs[0]
Y = inputs[1]
Psi_X = self.encoder(X)
Psi_Y = self.encoder(Y)
# loss:
loss = tf.norm(Psi_Y - Psi_X, axis = [0, 1], ord = 'fro')
# Compute our own metrics
loss_tracker.update_state(loss)
# Return a dict mapping metric names to current value.
# Note that it will include the loss (tracked in self.metrics).
return {"loss": loss_tracker.result()}
class Encoder(tf.keras.Model):
def __init__(self):
super(Encoder, self).__init__(dtype = 'float64', name = 'Encoder')
self.input_layer = DenseLayer(128)
self.hidden_layer1 = DenseLayer(128)
self.hidden_layer2 = DenseLayer(64)
self.hidden_layer3 = DenseLayer(64)
self.output_layer = LinearLayer(64)
def call(self, input_data, training):
fx = self.input_layer(input_data)
fx = self.hidden_layer1(fx)
fx = self.hidden_layer2(fx)
fx = self.hidden_layer3(fx)
return self.output_layer(fx)
class LinearLayer(tf.keras.layers.Layer):
def __init__(self, units):
super(LinearLayer, self).__init__(dtype = 'float64')
self.units = units
def build(self, input_shape):
input_dim = input_shape[-1]
self.w = self.add_weight(shape = (input_dim, self.units),
initializer = "random_normal",
trainable = True)
self.b = self.add_weight(shape = (self.units,),
initializer = tf.zeros_initializer(),
trainable = True)
def call(self, inputs):
return tf.matmul(inputs, self.w) + self.b
class DenseLayer(tf.keras.layers.Layer):
def __init__(self, units):
super(DenseLayer, self).__init__(dtype = 'float64')
self.units = units
def build(self, input_shape):
input_dim = input_shape[-1]
self.w = self.add_weight(shape = (input_dim, self.units),
initializer = "random_normal",
trainable = True)
self.b = self.add_weight(shape = (self.units,),
initializer = tf.zeros_initializer(),
trainable = True)
def call(self, inputs):
x = tf.matmul(inputs, self.w) + self.b
return tf.nn.elu(x)
I was wondering about this as well. It turns out that -2 is tf.data.UNKNOWN_CARDINALITY (https://www.tensorflow.org/api_docs/python/tf/data#UNKNOWN_CARDINALITY), which indicates that TF doesn't know how many elements flat_map returns per input item.
I just asked "Windowing a TensorFlow dataset without losing cardinality information?" to see if anyone knows a way to window datasets without losing cardinality.
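In the meantime, if you can work out the element count yourself, one workaround (a sketch; known_count is a placeholder you would compute for your own pipeline) is to reattach it with tf.data.experimental.assert_cardinality:
known_count = 42  # replace with the number of elements the flattened pipeline actually yields
flat_windows = flat_windows.apply(
    tf.data.experimental.assert_cardinality(known_count))
print(flat_windows.cardinality().numpy())  # now reports known_count
# note: if the asserted count is wrong, iterating the dataset raises an error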
I am trying to follow an implementation of an attention decoder
from keras.layers.recurrent import Recurrent
...
class AttentionDecoder(Recurrent):
...
#######################################################
# The functionality of init, build and call is clear
#######################################################
def __init__(self, units, output_dim,
activation='tanh',
...
def build(self, input_shape):
...
def call(self, x):
self.x_seq = x
...
return super(AttentionDecoder, self).call(x)
##################################################################
# What is the purpose of 'get_initial_state' and 'step' functions
# Do these functions override the Recurrent base class functions?
##################################################################
def get_initial_state(self, inputs):
# apply the matrix on the first time step to get the initial s0.
s0 = activations.tanh(K.dot(inputs[:, 0], self.W_s))
# from keras.layers.recurrent to initialize a vector of (batchsize,
# output_dim)
y0 = K.zeros_like(inputs) # (samples, timesteps, input_dims)
y0 = K.sum(y0, axis=(1, 2)) # (samples, )
y0 = K.expand_dims(y0) # (samples, 1)
y0 = K.tile(y0, [1, self.output_dim])
return [y0, s0]
def step(self, x, states):
ytm, stm = states
...
return yt, [yt, st]
The AttentionDecoder class inherits from Recurrent, an abstract base class for recurrent layers (keras.layers.recurrent.Recurrent, documented here).
How do the get_initial_state and step functions work within the class (who calls them, when, etc.)? If these functions are related to the base class Recurrent, where can I find the relevant documentation?
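From my reading of the Keras 2.x source, both are driven by the base class: Recurrent.call obtains the starting states from get_initial_state when no initial_state is passed, and the backend RNN loop (K.rnn) then calls step once per time step, threading the returned states forward. A simplified sketch of that pattern (an illustration, not the actual Keras code):
class SimplifiedRecurrent:
    def call(self, inputs, initial_state=None):
        # 1. build the starting states if the caller did not supply them
        states = initial_state if initial_state is not None else self.get_initial_state(inputs)
        # 2. loop over the time axis (Keras does this via K.rnn), feeding each
        #    time step and the running states to `step`
        outputs = []
        for t in range(inputs.shape[1]):
            output, states = self.step(inputs[:, t], states)
            outputs.append(output)
        # 3. return the last output or the full sequence, depending on return_sequences
        return outputs[-1]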