I've created a rather complex seq2seq type model (based on "A Neural Transducer"), and in the latest version of Tensorflow, the following code returns the error:
Cannot use 'transducer_training/while/rnn/strided_slice' as input to 'gradients/transducer_training/while/rnn/while/Select_1_grad/Select/f_acc' because 'transducer_training/while/rnn/strided_slice' is in a while loop
The code worked before; it only stopped working with the latest version:
numpy (1.14.0)
protobuf (3.5.1)
tensorflow (1.5.0)
tensorflow-gpu (1.3.0)
tensorflow-tensorboard (1.5.1)
Ubuntu version 16.04.3 LTS (Xenial Xerus)
Code (To get the error just copy, paste and run it):
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple
from tensorflow.python.layers import core as layers_core
# NOTE: Time major
# ---------------- Constants Manager ----------------------------
class ConstantsManager(object):
    """Container for the hyper-parameters and vocabulary used by the model.

    Two special symbols, ``'E_SYMBOL'`` and ``'GO_SYMBOL'``, are appended to
    the vocabulary; their indices are stored in ``E_SYMBOL`` and ``GO_SYMBOL``
    and ``vocab_size`` counts them.
    """

    def __init__(self, input_dimensions, input_embedding_size, inputs_embedded, encoder_hidden_units,
                 transducer_hidden_units, vocab_ids, input_block_size, beam_width):
        """Store the settings and extend the vocabulary with the special symbols.

        :param vocab_ids: iterable of vocabulary entries; it is copied, so the
            caller's object is left untouched.
        :raises AssertionError: if the encoder and transducer sizes differ.
        """
        assert transducer_hidden_units == encoder_hidden_units, \
            'Encoder and transducer have to have the same amount of hidden units'
        self.input_dimensions = input_dimensions
        # Work on a copy: appending to the caller's list would add the special
        # symbols again every time the same list is reused.
        self.vocab_ids = list(vocab_ids)
        self.E_SYMBOL = len(self.vocab_ids)
        self.vocab_ids.append('E_SYMBOL')
        self.GO_SYMBOL = len(self.vocab_ids)
        self.vocab_ids.append('GO_SYMBOL')
        self.vocab_size = len(self.vocab_ids)
        self.input_embedding_size = input_embedding_size
        self.inputs_embedded = inputs_embedded
        self.encoder_hidden_units = encoder_hidden_units
        self.transducer_hidden_units = transducer_hidden_units
        self.input_block_size = input_block_size
        self.beam_width = beam_width
        self.batch_size = 1  # Cannot be increased, see paper
        self.log_prob_init_value = 0
# ----------------- Model ---------------------------------------
class Model(object):
    """Block-wise transducer graph (encoder + attention decoder) and its training step.

    Creating an instance builds the whole graph: ``build_full_transducer``
    creates the placeholders, the while loop over input blocks and the named
    outputs; ``build_training_step`` adds the loss and the optimizer.
    NOTE: tensors are time major.
    """

    def __init__(self, cons_manager):
        """Build the graph from the settings held by ``cons_manager``."""
        self.var_list = []
        self.cons_manager = cons_manager
        (self.max_blocks, self.inputs_full_raw, self.transducer_list_outputs, self.start_block,
         self.encoder_hidden_init, self.trans_hidden_init, self.logits, self.encoder_hidden_state_new,
         self.transducer_hidden_state_new, self.train_saver) = self.build_full_transducer()
        self.targets, self.train_op, self.loss = self.build_training_step()

    def build_full_transducer(self):
        """Create placeholders, the per-block while loop and the output tensors.

        Returns the tuple (max_blocks, inputs_full_raw, transducer_list_outputs,
        start_block, encoder_hidden_init, trans_hidden_init, logits,
        encoder_hidden_state_new, transducer_hidden_state_new, train_saver).
        """
        cfg = self.cons_manager
        with tf.variable_scope('transducer_training'):
            embeddings = tf.Variable(
                tf.random_uniform([cfg.vocab_size, cfg.input_embedding_size], -1.0, 1.0),
                dtype=tf.float32,
                name='embedding')

            # ---- Inputs ----
            # Total amount of blocks to go through.
            max_blocks = tf.placeholder(dtype=tf.int32, name='max_blocks')
            input_type = tf.float32 if cfg.inputs_embedded is True else tf.int32
            # Shape [max_time, 1, input_dims].
            inputs_full_raw = tf.placeholder(
                shape=(None, cfg.batch_size, cfg.input_dimensions),
                dtype=input_type,
                name='inputs_full_raw')
            # Amount to output per block.
            transducer_list_outputs = tf.placeholder(
                shape=(None,), dtype=tf.int32, name='transducer_list_outputs')
            # Where to start the input.
            start_block = tf.placeholder(dtype=tf.int32, name='transducer_start_block')
            encoder_hidden_init = tf.placeholder(
                shape=(2, 1, cfg.encoder_hidden_units), dtype=tf.float32, name='encoder_hidden_init')
            trans_hidden_init = tf.placeholder(
                shape=(2, 1, cfg.transducer_hidden_units), dtype=tf.float32, name='trans_hidden_init')

            # Temporary constants, maybe changed during inference.
            end_symbol = tf.get_variable(
                name='end_symbol',
                initializer=tf.constant_initializer(cfg.vocab_size),
                shape=(), dtype=tf.int32)

            # Group the raw inputs by block so one block can be picked by index.
            inputs_full = tf.reshape(
                inputs_full_raw,
                shape=[-1, cfg.input_block_size, cfg.batch_size, cfg.input_dimensions])

            # ---- Outputs ----
            outputs_ta = tf.TensorArray(dtype=tf.float32, size=max_blocks)
            init_state = (start_block, outputs_ta, encoder_hidden_init, trans_hidden_init)

            # Initiate cells, NOTE: if there is a future error, put these back inside the body function
            encoder_cell = tf.contrib.rnn.LSTMCell(num_units=cfg.encoder_hidden_units)
            transducer_cell = tf.contrib.rnn.LSTMCell(cfg.transducer_hidden_units)

            def cond(block_idx, logits_ta, enc_state_in, trans_state_in):
                # Run until max_blocks blocks have been processed from start_block on.
                return block_idx < start_block + max_blocks

            def body(block_idx, logits_ta, enc_state_in, trans_state_in):
                # ------------------------- ENCODER -------------------------
                block_inputs = inputs_full[block_idx]
                block_lengths = [tf.shape(block_inputs)[0]]
                if cfg.inputs_embedded is True:
                    block_embedded = block_inputs
                else:
                    block_inputs = tf.reshape(block_inputs, shape=[-1, cfg.batch_size])
                    block_embedded = tf.nn.embedding_lookup(embeddings, block_inputs)

                # Unpack the single state tensor into an LSTM (c, h) tuple.
                enc_c, enc_h = tf.split(enc_state_in, num_or_size_splits=2, axis=0)
                enc_c = tf.reshape(enc_c, shape=[-1, cfg.encoder_hidden_units])
                enc_h = tf.reshape(enc_h, shape=[-1, cfg.encoder_hidden_units])
                enc_state_tuple = LSTMStateTuple(enc_c, enc_h)

                # enc_outputs: [max_time, batch_size, num_units]
                enc_outputs, enc_state = tf.nn.dynamic_rnn(
                    encoder_cell, block_embedded,
                    sequence_length=block_lengths, time_major=True,
                    dtype=tf.float32, initial_state=enc_state_tuple)

                # Pack the new state back into one tensor so it can be fed in again.
                enc_state_out = tf.reshape(
                    tf.concat([enc_state.c, enc_state.h], axis=0),
                    shape=[2, -1, cfg.encoder_hidden_units])

                # ------------------------ TRANSDUCER ------------------------
                # Save/load the state as one tensor, use encoder state as init if this is the first block
                trans_state = tf.cond(block_idx > 0, lambda: trans_state_in, lambda: enc_state_out)
                n_outputs = transducer_list_outputs[block_idx - start_block]

                helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    embedding=embeddings,
                    start_tokens=tf.tile([cfg.GO_SYMBOL], [cfg.batch_size]),  # TODO: check if this looks good
                    end_token=end_symbol)  # vocab size, so that it doesn't prematurely end the decoding

                # attention_states: [batch_size, max_time, num_units]
                attention_states = tf.transpose(enc_outputs, [1, 0, 2])
                attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                    cfg.encoder_hidden_units, attention_states)
                decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                    transducer_cell,
                    attention_mechanism,
                    attention_layer_size=cfg.transducer_hidden_units)
                projection_layer = layers_core.Dense(cfg.vocab_size, use_bias=False)

                # Unpack the transducer state tensor into an LSTM (c, h) tuple.
                trans_c, trans_h = tf.split(trans_state, num_or_size_splits=2, axis=0)
                trans_c = tf.reshape(trans_c, shape=[-1, cfg.transducer_hidden_units])
                trans_h = tf.reshape(trans_h, shape=[-1, cfg.transducer_hidden_units])
                trans_state_tuple = LSTMStateTuple(trans_c, trans_h)

                decoder = tf.contrib.seq2seq.BasicDecoder(
                    decoder_cell, helper,
                    decoder_cell.zero_state(1, tf.float32).clone(cell_state=trans_state_tuple),
                    output_layer=projection_layer)
                decoded, trans_state_new, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder,
                    output_time_major=True,
                    maximum_iterations=n_outputs)
                block_logits = decoded.rnn_output  # [max_time, batch_size, vocab_size]
                decoder_prediction = decoded.sample_id  # For debugging

                # Pack the new transducer state back into one tensor.
                trans_state_out = tf.reshape(
                    tf.concat([trans_state_new[0].c, trans_state_new[0].h], axis=0),
                    shape=[2, -1, cfg.transducer_hidden_units])

                # Record this block's logits.
                logits_ta = logits_ta.write(block_idx - start_block, block_logits)
                return block_idx + 1, logits_ta, enc_state_out, trans_state_out

            _, outputs_final, encoder_hidden_state_new, transducer_hidden_state_new = tf.while_loop(
                cond, body, init_state, parallel_iterations=1)

            # Concatenate all blocks: [max_output_time, batch_size, vocab].
            logits = tf.reshape(outputs_final.concat(), shape=(-1, 1, cfg.vocab_size))

            # Named handles for loading the model later on.
            logits = tf.identity(logits, name='logits')
            encoder_hidden_state_new = tf.identity(
                encoder_hidden_state_new, name='encoder_hidden_state_new')
            transducer_hidden_state_new = tf.identity(
                transducer_hidden_state_new, name='transducer_hidden_state_new')

            train_saver = tf.train.Saver()  # For now save everything

            return (max_blocks, inputs_full_raw, transducer_list_outputs, start_block,
                    encoder_hidden_init, trans_hidden_init, logits, encoder_hidden_state_new,
                    transducer_hidden_state_new, train_saver)

    def build_training_step(self):
        """Add the targets placeholder, the cross-entropy loss and the Adam train op.

        Returns the tuple (targets, train_op, loss).
        """
        targets = tf.placeholder(shape=(None,), dtype=tf.int32, name='targets')
        one_hot = tf.one_hot(targets, depth=self.cons_manager.vocab_size, dtype=tf.float32)
        # Pass-through Print ops that log the targets and the predictions.
        one_hot = tf.Print(one_hot, [targets], message='Targets: ', summarize=10)
        one_hot = tf.Print(one_hot, [tf.argmax(self.logits, axis=2)], message='Argmax: ', summarize=10)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=one_hot, logits=self.logits))
        train_op = tf.train.AdamOptimizer().minimize(loss)
        return targets, train_op, loss
# Minimal configuration used to reproduce the error: integer (non-embedded)
# inputs, three vocabulary ids and 100 hidden units for both networks.
constants_manager = ConstantsManager(input_dimensions=1, input_embedding_size=11, inputs_embedded=False,
encoder_hidden_units=100, transducer_hidden_units=100, vocab_ids=[0, 1, 2],
input_block_size=1, beam_width=5)
# Constructing the model builds the whole graph, including the training step.
model = Model(cons_manager=constants_manager)
I encountered a similar problem recently when I put a dynamic_rnn inside a scan (i.e. a while loop). It seems that the bug was introduced only in TensorFlow 1.5. You can try downgrading your TensorFlow version to 1.4 or upgrading to 1.6. Both should work.
In this particular case, the error seems to be raised incorrectly (see github issue in comments). In general, however, such errors mean the following:
The usage pattern that the error message is complaining about was always illegal. Earlier versions of TensorFlow just did not have good checks for it.
The core of the problem is that in TensorFlow's execution model, you cannot use a tensor that you create inside a while loop, outside of it. For a simple illustration of this, take a look at this test case.
You can just disable the check by immediately returning from here, but your computation graph will be malformed, which can lead to undefined behavior.
The correct fix is to add all the tensors that you want to access outside of the while loop (outside of cond and body functions) to the loop_vars and use them as returned from the tf.while_loop.
Related
I am getting the following error:
tensorflow.python.framework.errors_impl.OperatorNotAllowedInGraphError: Using a symbolic `tf.Tensor` as a Python `bool` is not allowed in Graph execution. Use Eager execution or decorate this function with @tf.function.
The error occurs while I'm trying to fit a DDPG agent over a custom environment.
Here is the CustomEnv()
class CustomEnv(Env):
    """Environment whose state is a model-accuracy value between 0 and 100.

    An episode lasts 20 steps. The reward is the state itself when it is
    above 60, otherwise ten times the (negative) shortfall below 60.
    """

    def __init__(self):
        """Define the action/observation spaces and the initial state."""
        print("Test_3 : Init")

        def unit_box(low, high):
            # Every continuous action component is a 1x1 Box.
            return Box(low=low, high=high, shape=(1, 1))

        """NOTE: Bool array element definition for Box action space needs to be determined !!!!"""
        self.action_space = Tuple((
            unit_box(4, 20),
            unit_box(0, 1),
            MultiBinary(1),
            MultiBinary(1),
            unit_box(4, 20),
            unit_box(0, 1),
            MultiBinary(1),
            MultiBinary(1),
            unit_box(0, 100),
            unit_box(0, 100),
        ))
        """Accuracy array"""
        self.observation_space = Box(low=np.asarray([0]), high=np.asarray([100]))
        """Initial Space"""
        self.state = return_Acc(directory=source_dir, input_array=self.action_space.sample())
        self.episode_length = 20
        print(f"Action Space sample = {self.action_space.sample()}")
        print("Test_3 : End Init")

    def step(self, action):
        """Advance one step and return (state, reward, done, info)."""
        print(f"Model Action Space Output = {action}")
        print("Test_2 : Step")
        # Random stand-in for return_Acc(directory=source_dir, input_array=action).
        accuracy_of_model = random.randint(0, 100)
        self.state = accuracy_of_model
        self.episode_length -= 1

        print(f"self.state = {self.state}, accuracy_of_model = {accuracy_of_model}")
        # Reward accuracies above 60, penalise the shortfall otherwise.
        reward = self.state if self.state > 60 else -(60 - self.state) * 10
        done = self.episode_length <= 0

        # Placeholder for extra step information.
        info = {}
        print("Test_2 : End Step")
        return self.state, reward, done, info

    def reset(self):
        """Reset the state to 50 and the episode length to 20; return the state."""
        print("Test_1 : Reset")
        self.state = 50
        print(f"Self state = {self.state}")
        self.episode_length = 20
        print("Test_1 : End Reset")
        return self.state
The return_Acc function runs a Random Decision Forest model and returns its accuracy to the DDPG model, which uses it to determine the next step's parameters. Finally, my DDPG model is given below:
# Observation shape, and the number of scalar values in one sampled action.
states = env.observation_space.shape
actions = np.asarray(env.action_space.sample()).size
print(f"states = {states}, actions = {actions}")
def model_creation(states, actions):
    """Return a three-layer dense network (32 -> 24 -> ``actions`` units).

    ``states`` is used as the input shape of the first layer; the output
    layer has a linear activation.
    """
    dense = tf.keras.layers.Dense
    stack = (
        dense(32, activation='relu', input_shape=states),
        dense(24, activation='relu'),
        dense(actions, activation='linear'),
    )
    model = tf.keras.models.Sequential()
    for layer in stack:
        model.add(layer)
    model.build()
    return model
# Build the network that is later passed to the agent as its actor.
model = model_creation(states, actions)
model.summary()
def build_agent(model, actions, critic):
    """Create a DDPG agent from an actor network and a critic network.

    :param model: network used as the actor.
    :param actions: number of action values (``nb_actions``).
    :param critic: critic network.
    :return: the constructed ``DDPGAgent``.

    NOTE: ``critic_action_input`` is read from the module-level
    ``action_input`` tensor, which therefore has to be defined before this
    function is called.
    """
    # The agent is only given a replay memory; the Boltzmann policy that used
    # to be created here was never passed to it.
    memory = SequentialMemory(limit=50000, window_length=1)
    return DDPGAgent(nb_actions=actions, actor=model, memory=memory, critic=critic,
                     critic_action_input=action_input)
# Critic network: takes the action and the observation as two inputs.
action_input = Input(shape=(actions,), name='action_input')
# A leading axis of size 1 is put in front of the observation shape.
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
# Three hidden layers of 32 units with ReLU activations.
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
# Single linear output unit.
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())
# NOTE: despite its name, `dqn` holds the DDPG agent returned by build_agent.
dqn = build_agent(model, actions, critic)
dqn.compile(tf.keras.optimizers.Adam(learning_rate=1e-3), metrics=['mae'])
# Train for 200 steps, then evaluate over 500 episodes.
dqn.fit(env, nb_steps=200, visualize=False, verbose=1)
results = dqn.test(env, nb_episodes=500, visualize=False)
print(f"episode_reward = {np.mean(results.history['episode_reward'])}")
I tried most of the solutions that I found here like
tf.compat.v1.enable_eager_execution()
and combinations of this with other functions (such as enable_v2_behaviour()), but I wasn't able to make it work. If I don't run the RDF model inside DDPG, no problem occurs. If it's possible, how can I connect the RDF model's accuracy output to self.state as an input?
keras-rl2 1.0.5
tensorflow-macos 2.10.0
And I'm using M1 based mac if that's matter.
For anyone interested in the solution: I came up with a slower but at least working one. It's actually simpler than expected. Just insert a command which runs the model script from the terminal and writes its output to a text file, then read that text file from the RL agent script, and again write the action space values to a text file which can then be read by the model to create the observation.
I have a simple code below for testing a RNN cell by feeding previous output as current input.
I want to do this after training.
When I call
tf.compat.v1.nn.raw_rnn(cell, rnn_loop)
after training I want it to use the weights that were achieved in training using another
tf.compat.v1.nn.raw_rnn(cell, rnn_loop)
Will the weights be the same or will the weights for raw_rnn during testing be initialized from zero? I will not run sess.run(tf.initialize_all_variables). I want know if I can safely call
tf.compat.v1.nn.raw_rnn(cell, rnn_loop) twice and still be using the same weights.
I also want to know how to inspect the trained weight values? so that I can confirm this.
The shape of rnn_outputs_tensor is (None,64,128) but I am expecting (10,64,128) because there are 10 steps (HORIZON) right?
print(rnn_outputs_tensor.shape)
import tensorflow as tf
# Graph mode is used for everything below.
tf.compat.v1.disable_eager_execution()
# Number of units of the RNN cell; also used as the input width by RnnLoop.
state_size = 128
BATCH_SIZE = 64
# Time-step threshold at which RnnLoop reports the sequences as finished.
HORIZON = 10
cell = tf.compat.v1.nn.rnn_cell.BasicRNNCell(state_size)
class RnnLoop:
    """Loop function for ``raw_rnn`` that feeds each output back as the next input."""

    def __init__(self, initial_state, cell):
        """Remember the initial cell state and the cell."""
        self.initial_state = initial_state
        self.cell = cell

    def __call__(self, time, cell_output, cell_state, loop_state):
        """Return (elements_finished, next_input, next_cell_state, emit_output, next_loop_state)."""
        if cell_output is None:
            # time == 0: start from an all-zero input and the stored initial state.
            next_input = tf.fill([BATCH_SIZE, state_size], 0.0)
            next_cell_state = self.initial_state
        else:
            # Later steps: the previous output becomes the next input.
            next_input, next_cell_state = cell_output, cell_state
        elements_finished = (time >= HORIZON)
        # The emitted output is the cell output itself (None at time == 0);
        # no loop state is carried.
        return elements_finished, next_input, next_cell_state, cell_output, None
# All-zero initial state of shape (BATCH_SIZE, state_size).
initial_state_tensor = tf.zeros((BATCH_SIZE,state_size),dtype=tf.float32)
rnn_loop = RnnLoop(initial_state=initial_state_tensor, cell=cell)
# Only the emitted outputs (a TensorArray) are kept from raw_rnn's result.
rnn_outputs_tensor_array, _, _ = tf.compat.v1.nn.raw_rnn(cell, rnn_loop)
rnn_outputs_tensor = rnn_outputs_tensor_array.stack()
print(rnn_outputs_tensor.shape)
# List the trainable variables that exist in the graph at this point.
var = [v for v in tf.compat.v1.trainable_variables()]
print(var)
I use a very custom LSTM-cell inspired by http://mlexplained.com/2019/02/15/building-an-lstm-from-scratch-in-pytorch-lstms-in-depth-part-1/.
I use it to look at intermediate gating values. My question is, how would I expand this class to have an option for adding more layers and for adding bidirectionality? Should it be wrapped in a new class or added in the present one?
class Dim(IntEnum):
    """Named axis indices: ``batch`` is axis 0 and ``seq`` is axis 1."""

    batch, seq = 0, 1
class simpleLSTM(nn.Module):
    """Hand-written LSTM with explicit gate weights, followed by a linear output layer.

    The gates are computed step by step in ``forward`` so that their
    intermediate values can be inspected.
    """

    def __init__(self, input_sz: int, hidden_sz: int):
        """Create the gate parameters and the output projection.

        NOTE: the output layer size is ``len(TRG.vocab)``; ``TRG`` must be
        defined at module level before the class is instantiated.
        """
        super().__init__()
        self.input_size = input_sz
        self.hidden_size = hidden_sz
        # input gate
        self.W_ii = Parameter(torch.Tensor(input_sz, hidden_sz))
        self.W_hi = Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_i = Parameter(torch.Tensor(hidden_sz))
        # forget gate
        self.W_if = Parameter(torch.Tensor(input_sz, hidden_sz))
        self.W_hf = Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_f = Parameter(torch.Tensor(hidden_sz))
        # cell candidate (the tanh term that is scaled by the input gate)
        self.W_ig = Parameter(torch.Tensor(input_sz, hidden_sz))
        self.W_hg = Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_g = Parameter(torch.Tensor(hidden_sz))
        # output gate
        self.W_io = Parameter(torch.Tensor(input_sz, hidden_sz))
        self.W_ho = Parameter(torch.Tensor(hidden_sz, hidden_sz))
        self.b_o = Parameter(torch.Tensor(hidden_sz))
        self.init_weights()
        # Created after init_weights(), so this layer keeps nn.Linear's own
        # default initialisation.
        self.out = nn.Linear(hidden_sz, len(TRG.vocab))

    def init_weights(self):
        """Xavier-initialise weight matrices and zero the bias vectors."""
        for p in self.parameters():
            if p.data.ndimension() >= 2:
                nn.init.xavier_uniform_(p.data)
            else:
                nn.init.zeros_(p.data)

    def forward(self, x, init_states=None):
        """Run the LSTM over ``x`` one time step at a time.

        ``x`` must be a 2-D tensor whose first dimension is the time step:
        its size is unpacked as ``(seq_sz, bs)`` and ``x[t, :]`` is used as the
        input of step ``t``.

        :param init_states: optional ``(h_0, c_0)`` tuple; zeros of size
            ``hidden_size`` are used when omitted.
        :return: ``(prediction, hidden_seq, (h_t, c_t))`` where the first two
            are the per-step outputs concatenated and then transposed on
            their first two axes, and the last is the final state.
        """
        seq_sz, bs, = x.size()
        hidden_seq = []
        prediction = []
        if init_states is None:
            h_t, c_t = torch.zeros(self.hidden_size).to(x.device), torch.zeros(self.hidden_size).to(x.device)
        else:
            h_t, c_t = init_states
        for t in range(seq_sz):  # iterate over the time steps
            x_t = x[t, :].float()
            # Gate pre-activations are matrix products (``@``) of the input
            # and of the previous hidden state with the gate weights.
            i_t = torch.sigmoid(x_t @ self.W_ii + h_t @ self.W_hi + self.b_i)
            f_t = torch.sigmoid(x_t @ self.W_if + h_t @ self.W_hf + self.b_f)
            g_t = torch.tanh(x_t @ self.W_ig + h_t @ self.W_hg + self.b_g)
            o_t = torch.sigmoid(x_t @ self.W_io + h_t @ self.W_ho + self.b_o)
            c_t = f_t * c_t + i_t * g_t
            h_t = o_t * torch.tanh(c_t)
            hidden_seq.append(h_t.unsqueeze(Dim.batch))
            # Unnormalised scores; no softmax is applied here.
            pred_t = self.out(h_t.unsqueeze(Dim.batch))
            prediction.append(pred_t)
        hidden_seq = torch.cat(hidden_seq, dim=Dim.batch)
        prediction = torch.cat(prediction, dim=Dim.batch)
        # Swap the first two axes of the concatenated results.
        hidden_seq = hidden_seq.transpose(Dim.batch, Dim.seq).contiguous()
        prediction = prediction.transpose(Dim.batch, Dim.seq).contiguous()
        return prediction, hidden_seq, (h_t, c_t)
I call it and train using the following as an example.
# Model with one input feature and 100 hidden units, trained with Adam.
lstm = simpleLSTM(1, 100)
hidden_size = lstm.hidden_size
optimizer = optim.Adam(lstm.parameters())

# Initial states and gradient log; the loop below does not pass them to the model.
h_0 = torch.zeros(hidden_size, requires_grad=True)
c_0 = torch.zeros(hidden_size, requires_grad=True)
grads = []
h_t, c_t = h_0, c_0

N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    epoch_loss = 0
    for i, batch in enumerate(train):
        optimizer.zero_grad()
        src, src_len = batch.src
        # Flatten the targets to one dimension.
        trg = batch.trg.view(-1)
        predict, output, hidden_states = lstm(src)
        # Bring the predictions to two dimensions, keeping the last axis.
        predict = predict.t().unsqueeze(1)
        predict = predict.view(-1, predict.shape[-1])
        loss = criterion(predict, trg)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Summed loss of the epoch.
    print(epoch_loss)
The easiest would be to create another module (say Bidirectional) and pass any cell you want to it.
Implementation itself is quite easy to do. Notice that I'm using concat operation for joining bi-directional output, you may want to specify other modes like summation etc.
Please read the comments in the code below, you may have to change it appropriately.
import torch
class Bidirectional(torch.nn.Module):
    """Run a recurrent cell over a sequence and over its reverse, then join the results.

    The wrapped ``cell`` must return ``(prediction, hidden_seq, (h_t, c_t))``.
    The same cell instance is used for both directions, and the results are
    joined by concatenation.
    """

    def __init__(self, cell):
        super().__init__()
        self.cell = cell

    def forward(self, x, init_states=None):
        """Return ``(prediction, hidden_seq, h_t, c_t)`` with both directions concatenated."""
        prediction, hidden_seq, (h_t, c_t) = self.cell(x, init_states)
        # Assuming sequence is first dimension, otherwise change 0 appropriately.
        # Reverses sequences so the LSTM cell acts on the reversed sequence.
        backward_prediction, backward_hidden_seq, (backward_h_t, backward_c_t) = self.cell(
            torch.flip(x, (0,)), init_states
        )
        return (
            # Assuming you transpose so it has (batch, seq, features) dimensionality
            torch.cat((prediction, backward_prediction), 2),
            torch.cat((hidden_seq, backward_hidden_seq), 2),
            # Assuming it has (batch, features) dimensionality
            torch.cat((h_t, backward_h_t), 1),
            torch.cat((c_t, backward_c_t), 1),
        )
When it comes to multiple layers, you could do something similar in principle:
import torch
class Multilayer(torch.nn.Module):
    """Stack several recurrent cells: each cell consumes the previous cell's ``hidden_seq``.

    Every cell must return ``(prediction, hidden_seq, (h_t, c_t))``.
    """

    def __init__(self, *cells):
        super().__init__()
        self.cells = torch.nn.ModuleList(cells)

    def forward(self, x, init_states=None):
        """Return the ``(prediction, hidden_seq, (h_t, c_t))`` of the last cell.

        The same ``init_states`` is passed to every cell. Implemented as
        ``forward`` (rather than ``__call__``) so that calling the module goes
        through ``torch.nn.Module.__call__`` and registered hooks still run.
        """
        inputs = x
        for cell in self.cells:
            prediction, hidden_seq, (h_t, c_t) = cell(inputs, init_states)
            inputs = hidden_seq
        return prediction, hidden_seq, (h_t, c_t)
Please note you have to pass created cell objects into Multilayer e.g.:
# For three layers of LSTM, each needs features to be set up correctly
# NOTE(review): `LSTM` presumably stands for the user's own cell class,
# constructed with matching sizes — confirm before use.
multilayer_LSTM = Multilayer(LSTM(), LSTM(), LSTM())
You may also pass classes instead of instances into constructor and create those inside Multilayer (so hidden_size matches automatically), but those ideas should get you started.
I am trying to implement a multidimensional LSTM in TensorFlow. I am using a TensorArray to remember previous states, and a somewhat complicated way to get the states of the two neighbours (above and to the left). tf.cond requires both possible branches to exist and to have the same number of inputs, which is why I added one more cell.zero_state at (last index + 1) of the states. I then use a function to get the correct indexes into the states. When I try to use an optimizer in order to minimize a cost, I get this error:
InvalidArgumentError (see above for traceback): TensorArray
MultiDimentionalLSTMCell-l1-multi-l1/state_ta_262#gradients: Could not
read from TensorArray index 809 because it has not yet been written
to.
Can someone tell how to fix it?
Ps: without optimizer it works!
class MultiDimentionalLSTMCell(tf.nn.rnn_cell.RNNCell):
    """LSTM cell that combines two previous states, each with its own forget gate.

    Note that state_is_tuple is always True.
    """

    def __init__(self, num_units, forget_bias=1.0, activation=tf.nn.tanh):
        self._num_units = num_units
        self._forget_bias = forget_bias
        self._activation = activation

    @property
    def state_size(self):
        """Sizes of the (c, h) state tuple."""
        return tf.nn.rnn_cell.LSTMStateTuple(self._num_units, self._num_units)

    @property
    def output_size(self):
        """Size of the cell output."""
        return self._num_units

    def __call__(self, inputs, state, scope=None):
        """Long short-term memory cell (LSTM).

        :param inputs: (batch, n)
        :param state: ``(c1, c2, h1, h2)``, the cell states and hidden units
            of the two previous cells
        :return: ``(new_h, new_state)`` where ``new_state`` is an
            ``LSTMStateTuple(new_c, new_h)``
        """
        with tf.variable_scope(scope or type(self).__name__):
            c1,c2,h1,h2 = state
            # change bias argument to False since LN will add bias via shift
            concat = tf.nn.rnn_cell._linear([inputs, h1, h2], 5 * self._num_units, False)
            # One slice per gate: input, candidate, two forget gates, output.
            i, j, f1, f2, o = tf.split(1, 5, concat)
            new_c = (c1 * tf.nn.sigmoid(f1 + self._forget_bias) +
                     c2 * tf.nn.sigmoid(f2 + self._forget_bias) + tf.nn.sigmoid(i) *
                     self._activation(j))
            new_h = self._activation(new_c) * tf.nn.sigmoid(o)
            new_state = tf.nn.rnn_cell.LSTMStateTuple(new_c, new_h)
            return new_h, new_state
def multiDimentionalRNN_whileLoop(rnn_size,input_data,sh,dims=None,scopeN="layer1"):
    """Implements naive multidimensional recurrent neural networks.

    :param rnn_size: the hidden units
    :param input_data: the data to process of shape [batch,h,w,channels]
    :param sh: [height,width] of the windows
    :param dims: dimensions to reverse the input data, e.g.
        dims=[False,True,True,False] => True means reverse that dimension
    :param scopeN: the scope
    :return: tensor of shape [batch,h/sh[0],w/sh[1],rnn_size], the output of the lstm
    """
    with tf.variable_scope("MultiDimentionalLSTMCell-"+scopeN):
        cell = MultiDimentionalLSTMCell(rnn_size)
        shape = input_data.get_shape().as_list()
        # Zero-pad height and width up to a multiple of the window size.
        if shape[1]%sh[0] != 0:
            offset = tf.zeros([shape[0], sh[0]-(shape[1]%sh[0]), shape[2], shape[3]])
            input_data = tf.concat(1,[input_data,offset])
            shape = input_data.get_shape().as_list()
        if shape[2]%sh[1] != 0:
            offset = tf.zeros([shape[0], shape[1], sh[1]-(shape[2]%sh[1]), shape[3]])
            input_data = tf.concat(2,[input_data,offset])
            shape = input_data.get_shape().as_list()
        h,w = int(shape[1]/sh[0]),int(shape[2]/sh[1])
        features = sh[1]*sh[0]*shape[3]
        batch_size = shape[0]
        x = tf.reshape(input_data, [batch_size,h,w, features])
        if dims is not None:
            x = tf.reverse(x, dims)
        # Flatten the grid into h*w steps, one [batch, features] slice each.
        x = tf.transpose(x, [1,2,0,3])
        x = tf.reshape(x, [-1, features])
        x = tf.split(0, h*w, x)
        sequence_length = tf.ones(shape=(batch_size,), dtype=tf.int32)*shape[0]
        inputs_ta = tf.TensorArray(dtype=tf.float32, size=h*w,name='input_ta')
        inputs_ta = inputs_ta.unpack(x)
        states_ta = tf.TensorArray(dtype=tf.float32, size=h*w+1,name='state_ta',clear_after_read=False)
        outputs_ta = tf.TensorArray(dtype=tf.float32, size=h*w,name='output_ta')
        # Slot h*w holds an all-zero state, read when a neighbour does not exist.
        states_ta = states_ta.write(h*w, tf.nn.rnn_cell.LSTMStateTuple(tf.zeros([batch_size,rnn_size], tf.float32),
                                                                       tf.zeros([batch_size,rnn_size], tf.float32)))

        def getindex1(t,w):
            # Index of the cell one row above, or the zero-state slot.
            return tf.cond(tf.less_equal(tf.constant(w),t),
                           lambda:t-tf.constant(w),
                           lambda:tf.constant(h*w))

        def getindex2(t,w):
            # Index of the cell to the left, or the zero-state slot.
            return tf.cond(tf.less(tf.constant(0),tf.mod(t,tf.constant(w))),
                           lambda:t-tf.constant(1),
                           lambda:tf.constant(h*w))

        time = tf.constant(0)

        def body(time, outputs_ta, states_ta):
            constant_val = tf.constant(0)
            stateUp = tf.cond(tf.less_equal(tf.constant(w),time),
                              lambda: states_ta.read(getindex1(time,w)),
                              lambda: states_ta.read(h*w))
            stateLast = tf.cond(tf.less(constant_val,tf.mod(time,tf.constant(w))),
                                lambda: states_ta.read(getindex2(time,w)),
                                lambda: states_ta.read(h*w))
            # Order expected by the cell: (c1, c2, h1, h2).
            currentState = stateUp[0],stateLast[0],stateUp[1],stateLast[1]
            out , state = cell(inputs_ta.read(time),currentState)
            outputs_ta = outputs_ta.write(time,out)
            states_ta = states_ta.write(time,state)
            return time + 1, outputs_ta, states_ta

        def condition(time,outputs_ta,states_ta):
            return tf.less(time , tf.constant(h*w))

        # The body reads entries of states_ta written by earlier iterations,
        # so the iterations must run one at a time; with parallel iterations a
        # read can happen before the corresponding write.
        result , outputs_ta, states_ta = tf.while_loop(condition, body, [time,outputs_ta,states_ta],
                                                       parallel_iterations=1)
        outputs = outputs_ta.pack()
        states = states_ta.pack()
        y = tf.reshape(outputs, [h,w,batch_size,rnn_size])
        y = tf.transpose(y, [2,0,1,3])
        if dims is not None:
            # Undo the reversal applied to the input.
            y = tf.reverse(y, dims)
        return y
def tanAndSum(rnn_size,input_data,scope):
    """Run the multidimensional RNN in all four scan directions and combine them.

    The four outputs are stacked, averaged over the direction axis and passed
    through tanh.
    """
    outs = []
    for i in range(2):
        for j in range(2):
            # Reverse the height axis when i != 0 and the width axis when j != 0.
            dims = [False, i != 0, j != 0, False]
            layer_scope = scope + "-multi-l{0}".format(i*2+j)
            outs.append(multiDimentionalRNN_whileLoop(rnn_size, input_data, [2,2],
                                                      dims, layer_scope))
    stacked = tf.pack(outs, axis=0)
    return tf.nn.tanh(tf.reduce_mean(stacked, 0))
# Build the two-layer network, its cost and the optimizer in a dedicated graph.
graph = tf.Graph()
with graph.as_default():
    input_data = tf.placeholder(tf.float32, [20,36,90,1])
    sh = [2,2]
    out1 = tanAndSum(20, input_data, 'l1')
    out = tanAndSum(25, out1, 'l2')
    cost = tf.reduce_mean(out)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

# Run one optimisation step on an all-ones batch and print the output shape and cost.
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    ou, k, _ = session.run(
        [out, cost, optimizer],
        {input_data: np.ones([20,36,90,1], dtype=np.float32)})
    print(ou.shape)
    print(k)
You should add parameter parallel_iterations=1 to your while loop call.
Such as:
result, outputs_ta, states_ta = tf.while_loop(
condition, body, [time,outputs_ta,states_ta], parallel_iterations=1)
This is required because inside body you perform read and write operations on the same tensor array (states_ta). In the case of parallel loop execution (parallel_iterations > 1), some thread may try to read from the TensorArray an entry that has not yet been written by another one.
I've tested your code snippet with parallel_iterations=1 on TensorFlow 0.12.1 and it works as expected.
When trying to merge all my summaries, I get an error saying that the inputs of Merge/MergeSummary come from different frames. So, first of all: what is a frame? Could you please point me somewhere in the TF documentation about such stuff? Of course, I googled a bit but could find almost nothing. How can I fix this issue? Below is the code to reproduce the error. Thanks in advance.
import numpy as np
import tensorflow as tf
# Start from an empty default graph with a fixed graph-level random seed.
tf.reset_default_graph()
tf.set_random_seed(23)
# Sizes used for the placeholders and the random input data below.
BATCH = 2
LENGTH = 4
SIZE = 5
ATT_SIZE = 3
NUM_QUERIES = 2
def linear(inputs, output_size, use_bias=True, activation_fn=None):
    """Project the last axis of ``inputs`` to ``output_size`` units.

    Inputs of rank above 2 are flattened to 2-D for the matmul and reshaped
    back afterwards. Variables "kernel" (and "bias" when ``use_bias``) are
    fetched in the current variable scope. ``activation_fn``, if given, is
    applied to the result.
    """
    static_in = inputs.get_shape().as_list()
    input_size = static_in[-1]
    static_out = static_in[:-1] + [output_size]
    needs_flatten = len(static_out) > 2

    if needs_flatten:
        # Dynamic output shape: the input's shape with the last entry replaced.
        dynamic_out = tf.unstack(tf.shape(inputs))
        dynamic_out[-1] = output_size
        dynamic_out = tf.stack(dynamic_out)
        inputs = tf.reshape(inputs, [-1, input_size])

    kernel = tf.get_variable("kernel", [input_size, output_size])
    output = tf.matmul(inputs, kernel)
    if use_bias:
        output = output + tf.get_variable('bias', [output_size])

    if needs_flatten:
        output = tf.reshape(output, dynamic_out)
        output.set_shape(static_out)  # pylint: disable=I0011,E1101

    return output if activation_fn is None else activation_fn(output)
class Attention(object):
    """Attention mechanism implementation."""

    def __init__(self, attention_states, attention_size):
        """Store the states to attend over and the size of the attention space."""
        self._states = attention_states
        self._attention_size = attention_size
        # Batch and length are read dynamically; the feature size statically.
        self._batch = tf.shape(self._states)[0]
        self._length = tf.shape(self._states)[1]
        self._size = self._states.get_shape()[2].value
        self._features = None

    def _init_features(self):
        """Compute the state features once with a 1x1 convolution."""
        states_4d = tf.reshape(
            self._states, [self._batch, self._length, 1, self._size])
        conv_kernel = tf.get_variable(
            "kernel", [1, 1, self._size, self._attention_size])
        self._features = tf.nn.conv2d(states_4d, conv_kernel, [1, 1, 1, 1], "SAME")

    def get_weights(self, query, scope=None):
        """Return the attention weights for the given query.

        The first call creates the variables and the state features; later
        calls reuse them. A histogram summary of the activations is created
        on every call.
        """
        with tf.variable_scope(scope or "Attention"):
            if self._features is not None:
                tf.get_variable_scope().reuse_variables()
            else:
                self._init_features()
            vect = tf.get_variable("Vector", [self._attention_size])
            with tf.variable_scope("Query"):
                query_features = linear(query, self._attention_size, False)
            # Reshape so the query features broadcast against the state features.
            query_features = tf.reshape(
                query_features, [-1, 1, 1, self._attention_size])
            scores = vect * tf.tanh(self._features + query_features)
            scores = tf.reduce_sum(scores, [2, 3])
            with tf.name_scope('summaries'):
                tf.summary.histogram('histogram', scores)
            return tf.nn.softmax(scores)
states = tf.placeholder(tf.float32, shape=[BATCH, None, SIZE])  # unknown length
queries = tf.placeholder(tf.float32, shape=[NUM_QUERIES, BATCH, ATT_SIZE])
attention = Attention(states, ATT_SIZE)
func = lambda x: attention.get_weights(x, "Softmax")
# NOTE: tf.map_fn builds a while loop, so the histogram summary created in
# get_weights lives inside that loop; this is what makes merge_all() below
# fail.
weights = tf.map_fn(func, queries)

# One histogram per trainable variable.
for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
    name = var.name.replace(':', '_')
    tf.summary.histogram(name, var)
summary_op = tf.summary.merge_all()

states_np = np.random.rand(BATCH, LENGTH, SIZE)
queries_np = np.random.rand(NUM_QUERIES, BATCH, ATT_SIZE)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    weights_np, summary_str = sess.run([weights, summary_op], {states: states_np, queries: queries_np})
    # Function-call form works on both Python 2 and Python 3.
    print(weights_np)
The error message was indeed not user friendly. It has been updated to
ValueError: Cannot use 'map/while/summaries/histogram' as input to 'Merge/MergeSummary' because 'map/while/summaries/histogram' is in a while loop. See info log for more details.
As the new message says, the problem is that you cannot produce summaries from inside of the while loop. The frame that the original message referred to is the "execution frame" of the while loop - all the state for each iteration of the while loop is kept in a frame.
In this case, the while_loop is created by tf.map_fn and the summary inside it is tf.summary.histogram('histogram', activations).
There are a couple of ways to deal with this. You can take the summary out of the get_weights, have the get_weights return activations as well, create the summary using the newly returned activations from tf.map_fn call.
Another approach, if NUM_QUERIES is constant and small, can be to statically unroll the loop instead of using tf.map_fn. Here is the code to do this:
# TOP PART OF THE CODE IS THE SAME
states = tf.placeholder(tf.float32, shape=[BATCH, None, SIZE])  # unknown length
queries = tf.placeholder(tf.float32, shape=[NUM_QUERIES, BATCH, ATT_SIZE])
attention = Attention(states, ATT_SIZE)
func = lambda x: attention.get_weights(x, "Softmax")
# NEW CODE BEGIN
# Statically unroll the loop: one get_weights call per query, so no while
# loop is created and the summaries stay outside of any loop.
split_queries = tf.split(queries, NUM_QUERIES)
weights = []
for query in split_queries:
    weights.append(func(query))
# NEW CODE END
for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
    name = var.name.replace(':', '_')
    tf.summary.histogram(name, var)
summary_op = tf.summary.merge_all()
states_np = np.random.rand(BATCH, LENGTH, SIZE)
queries_np = np.random.rand(NUM_QUERIES, BATCH, ATT_SIZE)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # NEW CODE BEGIN
    # Fetch every per-query weight tensor plus the merged summary in one run.
    results = sess.run(weights + [summary_op], {states: states_np, queries: queries_np})
    weights_np, summary_str = results[:-1], results[-1]
    # NEW CODE END
    # Function-call form works on both Python 2 and Python 3.
    print(weights_np)