NaN loss in tensorflow LSTM model - python

The following network code, which should be your classic simple LSTM language model, starts outputting NaN loss after a while... on my training set it takes a couple of hours, and I couldn't easily replicate it on smaller datasets. But it always happens in serious training.
tf.nn.sparse_softmax_cross_entropy_with_logits should be numerically stable, so it can't be the cause... but other than that, I don't see any other node in the graph that could cause an issue. What could be the problem?
class MyLM():
    def __init__(self, batch_size, embedding_size, hidden_size, vocab_size):
        self.x = tf.placeholder(tf.int32, [batch_size, None])  # [batch_size, seq_len]
        self.lengths = tf.placeholder(tf.int32, [batch_size])  # [batch_size]
        # remove padding. [batch_size * seq_len] -> [batch_size * sum(lengths)]
        mask = tf.sequence_mask(self.lengths)  # [batch_size, seq_len]
        mask = tf.cast(mask, tf.int32)  # [batch_size, seq_len]
        mask = tf.reshape(mask, [-1])  # [batch_size * seq_len]
        # remove padding + last token. [batch_size * seq_len] -> [batch_size * sum(lengths-1)]
        mask_m1 = tf.cast(tf.sequence_mask(self.lengths - 1, maxlen=tf.reduce_max(self.lengths)), tf.int32)  # [batch_size, seq_len]
        mask_m1 = tf.reshape(mask_m1, [-1])  # [batch_size * seq_len]
        # remove padding + first token. [batch_size * seq_len] -> [batch_size * sum(lengths-1)]
        m1_mask = tf.cast(tf.sequence_mask(self.lengths - 1), tf.int32)  # [batch_size, seq_len-1]
        m1_mask = tf.concat([tf.cast(tf.zeros([batch_size, 1]), tf.int32), m1_mask], axis=1)  # [batch_size, seq_len]
        m1_mask = tf.reshape(m1_mask, [-1])  # [batch_size * seq_len]
        embedding = tf.get_variable("TokenEmbedding", shape=[vocab_size, embedding_size])
        x_embed = tf.nn.embedding_lookup(embedding, self.x)  # [batch_size, seq_len, embedding_size]
        lstm = tf.nn.rnn_cell.LSTMCell(hidden_size, use_peepholes=True)
        # outputs shape: [batch_size, seq_len, hidden_size]
        outputs, final_state = tf.nn.dynamic_rnn(lstm, x_embed, dtype=tf.float32,
                                                 sequence_length=self.lengths)
        outputs = tf.reshape(outputs, [-1, hidden_size])  # [batch_size * seq_len, hidden_size]
        w = tf.get_variable("w_out", shape=[hidden_size, vocab_size])
        b = tf.get_variable("b_out", shape=[vocab_size])
        logits_padded = tf.matmul(outputs, w) + b  # [batch_size * seq_len, vocab_size]
        self.logits = tf.dynamic_partition(logits_padded, mask_m1, 2)[1]  # [batch_size * sum(lengths-1), vocab_size]
        predict = tf.argmax(logits_padded, axis=1)  # [batch_size * seq_len]
        self.predict = tf.dynamic_partition(predict, mask, 2)[1]  # [batch_size * sum(lengths)]
        flat_y = tf.dynamic_partition(tf.reshape(self.x, [-1]), m1_mask, 2)[1]  # [batch_size * sum(lengths-1)]
        self.cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=flat_y)
        self.cost = tf.reduce_mean(self.cross_entropy)
        self.train_step = tf.train.AdamOptimizer(learning_rate=0.01).minimize(self.cost)

Check the columns that are fed to the model. In my case, one column contained NaN values; after removing the NaNs, it worked.
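For instance, a quick sanity check before training (a minimal sketch, assuming your features are available as a NumPy array; the file name here is hypothetical):

import numpy as np

# stand-in for however you load your training features
X_train = np.load("train_features.npy")  # hypothetical path

# True if any value is NaN; argwhere lists the offending (row, column) indices
print("contains NaN:", np.isnan(X_train).any())
print("locations:", np.argwhere(np.isnan(X_train)))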

It may be a case of exploding gradients: gradients can explode during backpropagation through an LSTM, resulting in numeric overflow. A common technique for dealing with exploding gradients is gradient clipping.
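For example, the last line of the model above could be replaced with something like the following (a sketch; the threshold of 5.0 is an arbitrary but common choice):

optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
# compute the gradients explicitly instead of calling minimize() directly
gradients, variables = zip(*optimizer.compute_gradients(self.cost))
# jointly rescale all gradients so that their global norm is at most 5.0
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
self.train_step = optimizer.apply_gradients(zip(gradients, variables))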

Related

Tensorflow Shape and Concat Error: Dimension 1 in both shapes must be equal, but are 4 and 5

I am getting this error when concatenating the pooled outputs after connecting two CNN layers in Python. How can I correct the error and normalize the values?
import tensorflow as tf
import numpy as np
from tensorflow.python.ops import gen_array_ops

class TextCNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.
    """
    def __init__(
            self, sequence_length, num_classes, vocab_size,
            embedding_size, filter_sizes, num_filters, fc_hidden_size, l2_reg_lambda=0.0):
        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        fc_hidden_size = 1024
        self.is_training = tf.placeholder(tf.bool, name="is_training")
        initializer = tf.random_normal_initializer(stddev=0.1)
        self.initializer = initializer
        self.is_training_flag = True
        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)

        def flatten_reshape(variable):
            dim = 1
            for d in variable.get_shape()[1:].as_list():
                dim *= d
            return tf.reshape(variable, shape=[-1, dim])

        def _highway_layer(input_, size, num_layers=1, bias=-2.0, f=tf.nn.relu):
            """
            Highway Network (cf. http://arxiv.org/abs/1505.00387).
            t = sigmoid(Wy + b)
            z = t * g(Wy + b) + (1 - t) * y
            where g is nonlinearity, t is transform gate, and (1 - t) is carry gate.
            """
            for idx in range(num_layers):
                g = f(_linear(input_, size, scope=("highway_lin_{0}".format(idx))))
                t = tf.sigmoid(_linear(input_, size, scope=("highway_gate_{0}".format(idx))) + bias)
                output = t * g + (1. - t) * input_
                input_ = output
            return output

        def _linear(input_, output_size, scope="SimpleLinear"):
            """
            Linear map: output[k] = sum_i(Matrix[k, i] * args[i]) + Bias[k]
            Args:
                input_: a tensor or a list of 2D, batch x n, Tensors.
                output_size: int, second dimension of W[i].
                scope: VariableScope for the created subgraph; defaults to "SimpleLinear".
            Returns:
                A 2D Tensor with shape [batch x output_size] equal to
                sum_i(args[i] * W[i]), where W[i]s are newly created matrices.
            Raises:
                ValueError: if some of the arguments has unspecified or wrong shape.
            """
            shape = input_.get_shape().as_list()
            if len(shape) != 2:
                raise ValueError("Linear is expecting 2D arguments: {0}".format(str(shape)))
            if not shape[1]:
                raise ValueError("Linear expects shape[1] of arguments: {0}".format(str(shape)))
            input_size = shape[1]
            # Now the computation.
            with tf.variable_scope(scope):
                W = tf.get_variable("W", [input_size, output_size], dtype=input_.dtype)
                b = tf.get_variable("b", [output_size], dtype=input_.dtype)
                return tf.nn.xw_plus_b(input_, W, b)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self.W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                h = tf.nn.sigmoid(tf.nn.bias_add(conv, b), name="sigmoid")
                # h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Maxpooling over the outputs
                h2 = tf.reshape(h, [-1, sequence_length, num_filters, 1])  # shape: [batch_size, sequence_length, num_filters, 1]
                print(h2)
                # self.initializer = tf.random_normal_initializer(stddev=0.1)
                filter2 = tf.get_variable("filter2-%s" % filter_size, [filter_size, num_filters, 1, num_filters], initializer=self.initializer)
                conv2 = tf.nn.conv2d(h2, filter2, strides=[1, 1, 1, 1], padding="SAME", name="conv2")  # shape: [batch_size, sequence_length-filter_size*2+2, 1, num_filters]
                b2 = tf.get_variable("b2-%s" % filter_size, [num_filters])  # ADD 2017-06-09
                h3 = tf.nn.sigmoid(tf.nn.bias_add(conv2, b2), name="sigmoid")
                pooled = tf.nn.max_pool(
                    h3,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                # x_reshaped = tf.reshape(pooled, [-1, 3])
                s = flatten_reshape(pooled)
                pooled_outputs.append(s)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
        print("zzz")
        num_filters_total = num_filters * len(filter_sizes)
        self.pool = tf.concat(pooled_outputs, axis=3)
        self.pool_flat = tf.reshape(self.pool, shape=[-1, num_filters_total])

        # Fully Connected Layer
        with tf.name_scope("fc"):
            W = tf.Variable(tf.truncated_normal(shape=[num_filters_total, fc_hidden_size],
                                                stddev=0.1, dtype=tf.float32), name="W")
            b = tf.Variable(tf.constant(value=0.1, shape=[fc_hidden_size], dtype=tf.float32), name="b")
            self.fc = tf.nn.xw_plus_b(self.pool_flat, W, b)
            # Batch Normalization Layer
            self.fc_bn = tf.layers.batch_normalization(self.fc, training=self.is_training)
            # Apply nonlinearity
            self.fc_out = tf.nn.relu(self.fc_bn, name="relu")

        # Highway Layer
        with tf.name_scope("highway"):
            self.highway = _highway_layer(self.fc_out, self.fc_out.get_shape()[1], num_layers=1, bias=0)

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
ValueError: Dimension 1 in both shapes must be equal, but are 4 and 5. Shapes are [?,4,50] and [?,5,50]. for 'concat' (op: 'ConcatV2') with input shapes: [?,3,50,50], [?,4,50,50], [?,5,50,50], [] and with computed input tensors: input[3]
ValueError: Shape must be at least rank 4 but is rank 2 for 'concat' (op: 'ConcatV2') with input shapes: [?,7500], [?,10000], [?,12500], []
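For reference, both errors follow from tf.concat's shape rule: all inputs must have the same rank and must agree on every dimension except the concatenation axis. A minimal sketch reproducing both failures, plus a working variant:

import tensorflow as tf

a = tf.placeholder(tf.float32, [None, 3, 50, 50])
b = tf.placeholder(tf.float32, [None, 4, 50, 50])
# fails like the first error: axis=3 requires dims 0-2 to match, but dim 1 is 3 vs 4
# c = tf.concat([a, b], axis=3)

x = tf.placeholder(tf.float32, [None, 7500])
y = tf.placeholder(tf.float32, [None, 10000])
# fails like the second error: rank-2 tensors have no axis 3
# z = tf.concat([x, y], axis=3)
# works: rank-2 tensors are joined along the feature axis instead
z = tf.concat([x, y], axis=1)  # shape [None, 17500]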

GRU with the same configuration but built in two different ways produces two different outputs in tensorflow

I would like to do some sequence prediction in tensorflow using GRU, so I have created the same model in 2 different ways, as follows:
In model 1 I have 2 GRUs, one after the other; that is, new_state1, the final hidden state of the first GRU, acts as the initial state of the second GRU. Therefore, the model outputs new_state1 and new_state2 consecutively. Note that this is not a 2-layer model, but only 1 layer. From the code below, I divided the input and the output into 2 parts, where GRU1 takes the first part and the second GRU takes the second part.
Also, the random_seed is set and fixed for both models so that the results are comparable.
Model 1
import tensorflow as tf
import numpy as np

cell_size = 32
seq_length = 1000
time_steps1 = 500
time_steps2 = seq_length - time_steps1

x_t = np.arange(1, seq_length + 1)
x_t_plus_1 = np.arange(2, seq_length + 2)

tf.set_random_seed(123)
m_dtype = tf.float32

input_1 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps1, 1], name="input_1")
input_2 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps2, 1], name="input_2")
labels1 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps1, 1], name="labels_1")
labels2 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps2, 1], name="labels_2")
labels = tf.concat([labels1, labels2], axis=1, name="labels")
initial_state = tf.placeholder(shape=[None, cell_size], dtype=m_dtype, name="initial_state")

def model(input_feat1, input_feat2):
    with tf.variable_scope("GRU"):
        cell1 = tf.nn.rnn_cell.GRUCell(cell_size)
        cell2 = tf.nn.rnn_cell.GRUCell(cell_size)

        with tf.variable_scope("First50"):
            # output1: shape=[1, time_steps1, 32]
            output1, new_state1 = tf.nn.dynamic_rnn(cell1, input_feat1, dtype=m_dtype, initial_state=initial_state)

        with tf.variable_scope("Second50"):
            # output2: shape=[1, time_steps2, 32]
            output2, new_state2 = tf.nn.dynamic_rnn(cell2, input_feat2, dtype=m_dtype, initial_state=new_state1)

        with tf.variable_scope("output"):
            # output shape: [1, time_steps1 + time_steps2, 32] => [1, 1000, 32]
            output = tf.concat([output1, output2], axis=1)
            output = tf.reshape(output, shape=[-1, cell_size])
            output = tf.layers.dense(output, units=1)
            output = tf.reshape(output, shape=[1, time_steps1 + time_steps2, 1])

        with tf.variable_scope("outputs_1_2_reshaped"):
            output1 = tf.slice(input_=output, begin=[0, 0, 0], size=[-1, time_steps1, -1])
            output2 = tf.slice(input_=output, begin=[0, time_steps1, 0], size=[-1, time_steps2, 1])

            print(output.get_shape().as_list(), "1")
            print(output1.get_shape().as_list(), "2")
            print(output2.get_shape().as_list(), "3")

            return output, output1, output2, initial_state, new_state1, new_state2

output, output1, output2, initial_state, new_state1, new_state2 = model(input_1, input_2)
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    to_run_list = [new_state1, new_state2]
    in1 = np.reshape(x_t[:time_steps1], newshape=(1, time_steps1, 1))
    in2 = np.reshape(x_t[time_steps1:], newshape=(1, time_steps2, 1))
    l1 = np.reshape(x_t_plus_1[:time_steps1], newshape=(1, time_steps1, 1))
    l2 = np.reshape(x_t_plus_1[time_steps1:], newshape=(1, time_steps2, 1))
    i_s = np.zeros([1, cell_size])
    new_s1, new_s2 = sess.run(to_run_list, feed_dict={input_1: in1,
                                                      input_2: in2,
                                                      labels1: l1,
                                                      labels2: l2,
                                                      initial_state: i_s})
    print(np.shape(new_s1), np.shape(new_s2))
    print(np.mean(new_s1), np.mean(new_s2))
    print(np.sum(new_s1), np.sum(new_s2))
In model 2, instead of having 2 different GRUs, I created one, and I divided the input and labels into 2 different parts as well, and I used a for loop to iterate over my input dataset. Then the final state is taken and fed back into the same model as the initial state.
Note that both model1 and model2 have the very first initial state of zeros.
Model 2
import tensorflow as tf
import numpy as np

cell_size = 32
seq_length = 1000
time_steps = 500

x_t = np.arange(1, seq_length + 1)
x_t_plus_1 = np.arange(2, seq_length + 2)

tf.set_random_seed(123)
m_dtype = tf.float32

inputs = tf.placeholder(dtype=m_dtype, shape=[None, time_steps, 1], name="inputs")
labels = tf.placeholder(dtype=m_dtype, shape=[None, time_steps, 1], name="labels")
initial_state = tf.placeholder(shape=[None, cell_size], dtype=m_dtype, name="initial_state")
grads_initial_state = tf.placeholder(dtype=m_dtype, shape=[None, cell_size], name="prev_grads")
this_is_last_batch = tf.placeholder(dtype=tf.bool, name="this_is_last_batch")

def model(input_feat):
    with tf.variable_scope("GRU"):
        cell = tf.nn.rnn_cell.GRUCell(cell_size)

        with tf.variable_scope("cell"):
            # output1: shape=[1, time_steps, 32]
            output, new_state = tf.nn.dynamic_rnn(cell, input_feat, dtype=m_dtype, initial_state=initial_state)

        with tf.variable_scope("output"):
            output = tf.reshape(output, shape=[-1, cell_size])
            output = tf.layers.dense(output, units=1)
            output = tf.reshape(output, shape=[1, time_steps, 1])

            print(output.get_shape().as_list(), "1")

            return output, new_state

output, new_state = model(inputs)
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    # 1000 // 500 = 2
    num_iterations = seq_length // time_steps
    print("num_iterations:", num_iterations)
    final_states = []
    # grads_wrt_initial_state is not defined in this excerpt, so the list is
    # left commented out here (it is never used in the loop below anyway)
    # to_run_list = [grads_wrt_initial_state, new_state]
    for i in range(num_iterations):
        current_xt = x_t[i * time_steps: (i + 1) * time_steps]
        current_xt_plus_1 = x_t_plus_1[i * time_steps: (i + 1) * time_steps]
        in1 = np.reshape(current_xt, newshape=(1, time_steps, 1))
        l1 = np.reshape(current_xt_plus_1, newshape=(1, time_steps, 1))
        i_s = np.zeros([1, cell_size])
        if i == 0:
            new_s = sess.run(new_state, feed_dict={inputs: in1,
                                                   labels: l1,
                                                   initial_state: i_s})
            final_states.append(new_s)
            print("---->", np.mean(final_states[-1]), np.sum(final_states[-1]), i)
        else:
            new_s = sess.run(new_state, feed_dict={inputs: in1,
                                                   labels: l1,
                                                   initial_state: final_states[-1]})
            final_states.append(new_s)
            print("---->", np.mean(final_states[-1]), np.sum(final_states[-1]), i)
Finally, after printing out the statistics of new_state1 and new_state2 in model 1, they were different from the per-iteration new_state in model 2.
I would like to know how to fix this problem and why it is happening.
Edit:
I have figured out that the weight values of the GRU in the two files are different.
Now how can I reproduce the same results in the 2 different files, even after setting the random seed?
Any help is much appreciated!!!
So, to reproduce the same results in different files, tf.set_random_seed() is not enough. I figured out that we also need to set the seed for the initializers of the GRU cells, as well as for the initializers of the weights in the dense layer at the output (this is at least according to my model); so the definition of the cell is now:
cell1 = tf.nn.rnn_cell.GRUCell(cell_size, kernel_initializer=tf.glorot_normal_initializer(seed=123, dtype=m_dtype))
And for the dense layer:
output = tf.layers.dense(output, units=1, kernel_initializer=tf.glorot_uniform_initializer(seed=123, dtype=m_dtype))
Note that any other initializer could be used, as long as we set the seed and the dtype for it.
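Putting the two together, a minimal sketch of a model definition where every trainable variable gets a seeded initializer (the seed 123 simply mirrors the code above):

def seeded_model(input_feat, initial_state, cell_size=32, seed=123):
    # seed the GRU kernel initializer so both files build identical weights
    cell = tf.nn.rnn_cell.GRUCell(
        cell_size,
        kernel_initializer=tf.glorot_normal_initializer(seed=seed, dtype=tf.float32))
    output, new_state = tf.nn.dynamic_rnn(cell, input_feat, dtype=tf.float32,
                                          initial_state=initial_state)
    output = tf.reshape(output, shape=[-1, cell_size])
    # seed the dense layer's kernel initializer as well
    output = tf.layers.dense(
        output, units=1,
        kernel_initializer=tf.glorot_uniform_initializer(seed=seed, dtype=tf.float32))
    return output, new_state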

Tensorflow Python LSTM Understanding multiple Input

I am new to machine learning, and I got a bonus course at my university where I have to train an LSTM model to generate captions. I have read this so far: Blogpost_about_lstms
And used this as reference: some_random_code
So what I want to achieve:
I have a dataset which is structured like this:
the output of a CNN, a vector of size 2048 that holds some "features" of an image, and 5 captions describing that image.
Training:
input: CNN vector + Captions
output: Caption (guess)
Validation:
input: CNN vector
output: caption (guess)
So how can I use 2 inputs (the CNN data and a caption sequence) in training, so that new captions can be generated from a CNN input vector alone?
This is kinda tricky, and I cannot grasp the theory behind it. And Tensorflow is also quite a thing, I have to say.
I have a normal Seq2Seq model in place that works. But now I am stuck :/
class Model(object):
    def __init__(self, _input, is_training, hidden_size, vocab_size, num_layers,
                 dropout=config.trainer.dropout, init_scale=config.trainer.init_scale):
        self.is_training = is_training
        self.input_obj = _input
        self.batch_size = _input.batch_size
        self.num_steps = _input.num_steps
        self.hidden_size = hidden_size

        # create the word embeddings
        with tf.device("/cpu:0"):
            randomized = tf.random_uniform([vocab_size, hidden_size], -init_scale, init_scale)
            print("randomized: ", randomized)
            embedding = tf.Variable(randomized)
            inputs = tf.nn.embedding_lookup(embedding, self.input_obj.input_data)
        if is_training and dropout < 1:
            inputs = tf.nn.dropout(inputs, dropout)

        # set up the state storage / extraction
        self.init_state = tf.placeholder(tf.float32, [num_layers, 2, self.batch_size, hidden_size])
        state_per_layer_list = tf.unstack(self.init_state, axis=0)
        rnn_tuple_state = tuple(
            [tf.contrib.rnn.LSTMStateTuple(state_per_layer_list[idx][0], state_per_layer_list[idx][1])
             for idx in range(num_layers)])

        # create an LSTM cell to be unrolled
        print("Hidden size: ", hidden_size)
        cell = tf.contrib.rnn.LSTMCell(hidden_size, forget_bias=config.trainer.forget_bias)
        # add a dropout wrapper if training
        if is_training and dropout < 1:
            cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=dropout)
        if num_layers > 1:
            cell = tf.contrib.rnn.MultiRNNCell([cell for _ in range(num_layers)], state_is_tuple=True)

        print("input: ", inputs)
        output, self.state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32, initial_state=rnn_tuple_state)
        # reshape to (batch_size * num_steps, hidden_size)
        output = tf.reshape(output, [-1, hidden_size])

        softmax_w = tf.Variable(tf.random_uniform([hidden_size, vocab_size], -init_scale, init_scale))
        softmax_b = tf.Variable(tf.random_uniform([vocab_size], -init_scale, init_scale))
        logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
        # Reshape logits to be a 3-D tensor for sequence loss
        logits = tf.reshape(logits, [self.batch_size, self.num_steps, vocab_size])

        # Use the contrib sequence loss and average over the batches
        loss = tf.contrib.seq2seq.sequence_loss(logits,
                                                self.input_obj.targets,
                                                tf.ones([self.batch_size, self.num_steps], dtype=tf.float32),
                                                average_across_timesteps=False,
                                                average_across_batch=True)
        # Update the cost
        self.cost = tf.reduce_sum(loss)

        # get the prediction accuracy
        self.softmax_out = tf.nn.softmax(tf.reshape(logits, [-1, vocab_size]))
        self.predict = tf.cast(tf.argmax(self.softmax_out, axis=1), tf.int32)
        correct_prediction = tf.equal(self.predict, tf.reshape(self.input_obj.targets, [-1]))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        if not is_training:
            return

        self.learning_rate = tf.Variable(0.01, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), 5)
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                  global_step=tf.contrib.framework.get_or_create_global_step())

        self.new_lr = tf.placeholder(tf.float32, shape=[])
        self.lr_update = tf.assign(self.learning_rate, self.new_lr)

    def assign_lr(self, session, lr_value):
        session.run(self.lr_update, feed_dict={self.new_lr: lr_value})
I don't need a solution, but some explanation of how to move forward would be awesome!

LSTM cells after convolution

I need to implement an LSTM layer after two convolutional layers. Here is my code after the first convolution:
convo_2 = convolutional_layer(convo_1_pooling, shape=[5, 5, 32, 64])
convo_2_pooling = max_pool_2by2(convo_2)
convo_2_flat = tf.reshape(convo_2_pooling, shape=[-1, 64 * 50 * 25])
cell = rnn.LSTMCell(num_units=100, activation=tf.nn.relu)
cell = rnn.OutputProjectionWrapper(cell, output_size=7)
conv_to_rnn = int(convo_2_flat.get_shape()[1])
outputs, states = tf.nn.dynamic_rnn(cell, convo_2_flat, dtype=tf.float32)
I get this error on the last line:
ValueError: Shape (?, 50, 64) must have rank 2
I have to indicate the time steps in the convo_2_flat variable, right? How? I really don't know how to do that.
EDIT:
After this reshape:
convo_2_flat = tf.reshape(convo_2_flat, shape=[-1, N_TIME_STEPS, INPUT_SIZE])
where
N_TIME_STEPS = 25
INPUT_SIZE = int(64 * 50 * 25 / N_TIME_STEPS)
I got this error: InvalidArgumentError (see above for traceback): logits and labels must be same size: logits_size=[5000,7] labels_size=[50,7] on this line:
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=outputs))
It seems to me that the batch size has changed after the last reshape.
EDIT 2:
Is the code below wrong?
convo_2_shape = convo_2_pooling.get_shape().as_list()
shape_convo_flat = convo_2_shape[1] * convo_2_shape[2] * convo_2_shape[3]
N_TIME_STEPS = convo_2_shape[1]
INPUT_SIZE = tf.cast(shape_convo_flat / N_TIME_STEPS, tf.int32)
convo_2_out = tf.reshape(convo_2_pooling, shape=[-1, shape_convo_flat])
convo_2_out = tf.reshape(convo_2_out, shape=[-1, N_TIME_STEPS, INPUT_SIZE])
I set N_TIME_STEPS that way because otherwise I'll have a float INPUT_SIZE and tf will throw an error.
According to the Tensorflow documentation (https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn), the input should have the following shape (I use the default here): [BATCH_SIZE, N_TIME_STEPS, INPUT_SIZE]. Therefore, you can reshape convo_2_flat as follows,
# get the shape of the output of max pooling
shape = convo_2_pooling.get_shape().as_list()
# flatten accordingly
convo_2_flat = tf.reshape(convo_2_pooling, [-1, shape[1] * shape[2] * shape[3]])
# Here shape[1] * shape[2] * shape[3] = N_TIME_STEPS * INPUT_SIZE
# reshape according to the dynamic_rnn input
convo_2_flat = tf.reshape(convo_2_flat, shape=[-1, N_TIME_STEPS, INPUT_SIZE])
outputs, states = tf.nn.dynamic_rnn(cell, convo_2_flat, dtype=tf.float32)
# get the output of the last time step
val = tf.transpose(outputs, [1, 0, 2])
lstm_last_output = val[-1]
OUTPUT_SIZE = 7  # since you have defined cell = rnn.OutputProjectionWrapper(cell, output_size=7)
W = {
    'output': tf.Variable(tf.random_normal([OUTPUT_SIZE, N_CLASSES]))
}
biases = {
    'output': tf.Variable(tf.random_normal([N_CLASSES]))
}
# Dense Layer
pred_Y = tf.matmul(lstm_last_output, W['output']) + biases['output']
# Softmax Layer (for predictions only)
pred_softmax = tf.nn.softmax(pred_Y)
# softmax_cross_entropy_with_logits expects unnormalized logits,
# so feed it pred_Y, not the already-softmaxed pred_softmax
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=pred_Y))
Note on the outputs:
According to the documentation, the output of dynamic_rnn has the shape [BATCH_SIZE, N_TIME_STEPS, OUTPUT_SIZE], i.e., you have an output for every time step. In the code above, I only take the output of the last time step. Alternatively, you can think about a different architecture for the rnn output, as described here (How do we use LSTM to classify sequences?).
Hope this helps.

Tensorflow Convolutional Network that returns an image (no logits)

I have undertaken a project in which I must use a convolutional network that outputs an image instead of logit class predictions. For this purpose I've adapted the CNN code I downloaded from https://github.com/aymericdamien/TensorFlow-Examples
My input data are 64x64 images read from a binary file. The binary file is comprised of records of two 64x64 images in sequence. I need to minimize a cost function which is the difference between the second image and the 64x64 output of the network.
This is the module I've written to read the input data:
import tensorflow as tf

# various initialization variables
BATCH_SIZE = 128
N_FEATURES = 9

# This function accepts a tensor of size [batch_size, 2, record_size]
# and segments it into two tensors of size [batch_size, record] along the second dimension
# IMPORTANT: to be executed within an active session
def segment_batch(batch_p, batch_size, n_input):
    batch_xs = tf.slice(batch_p, [0, 0, 0], [batch_size, 1, n_input])  # optical data tensor
    batch_ys = tf.slice(batch_p, [0, 1, 0], [batch_size, 1, n_input])  # GT data tensor
    optical = tf.reshape([batch_xs], [batch_size, n_input])
    gt = tf.reshape([batch_ys], [batch_size, n_input])
    return [optical, gt]

def batch_generator(filenames, record_size, batch_size):
    """ filenames is the list of files you want to read from.
    record_bytes: The size of a record in bytes
    batch_size: The size of a data batch (examples/batch)
    """
    filename_queue = tf.train.string_input_producer(filenames)
    reader = tf.FixedLengthRecordReader(record_bytes=2 * record_size)  # record size is double the value given (optical + ground truth images)
    _, value = reader.read(filename_queue)
    # read in the data (UINT8)
    content = tf.decode_raw(value, out_type=tf.uint8)
    # The bytes read represent the image, which we reshape
    # from [depth * height * width] to [depth, height, width].
    # read optical data slice
    depth_major = tf.reshape(
        tf.strided_slice(content, [0],
                         [record_size]),
        [1, 64, 64])
    # read GT (ground truth) data slice
    depth_major1 = tf.reshape(
        tf.strided_slice(content, [record_size],
                         [2 * record_size]),
        [1, 64, 64])
    # Optical data
    # Convert from [depth, height, width] to [height, width, depth].
    uint8image = tf.transpose(depth_major, [1, 2, 0])
    uint8image = tf.reshape(uint8image, [record_size])  # reshape into a single-dimensional vector
    uint8image = tf.cast(uint8image, tf.float32)  # cast into a float32
    uint8image = uint8image / 255  # normalize
    # Ground Truth data
    # Convert from [depth, height, width] to [height, width, depth].
    gt_image = tf.transpose(depth_major1, [1, 2, 0])
    gt_image = tf.reshape(gt_image, [record_size])  # reshape into a single-dimensional vector
    gt_image = tf.cast(gt_image, tf.float32)  # cast into a float32
    gt_image = gt_image / 255  # normalize
    # stack them into a single features tensor
    features = tf.stack([uint8image, gt_image])
    # minimum number of elements in the queue after a dequeue, used to ensure
    # that the samples are sufficiently mixed
    # I think 10 times the BATCH_SIZE is sufficient
    min_after_dequeue = 10 * batch_size
    # the maximum number of elements in the queue
    capacity = 20 * batch_size
    # shuffle the data to generate BATCH_SIZE sample pairs
    data_batch = tf.train.shuffle_batch([features], batch_size=batch_size,
                                        capacity=capacity, min_after_dequeue=min_after_dequeue)
    return data_batch
This is the main code of my implementation:
from __future__ import print_function

# Various initialization variables
DATA_PATH_OPTICAL_TRAIN = 'data/building_ground_truth_for_training.bin'
DATA_PATH_EVAL = 'data/building_ground_truth_for_eval.bin'

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import time
# custom imports
import data_reader2

# Parameters
learning_rate = 0.001
training_iters = 200000
batch_size = 128
epochs = 10
display_step = 10
rows = 64
cols = 64

# Network Parameters
n_input = 4096  # optical image data (img shape: 64*64)
n_classes = 4096  # output is an image of same resolution as initial image
dropout = 0.75  # Dropout, probability to keep units

# input data parameters
record_size = 64**2
total_bytes_of_optical_binary_file = 893329408  # total size of binary file containing training data ([64x64 optical] [64x64 GT])

# create the data batches (queue)
# Accepts two parameters. The tensor containing the binary files and the size of a record
data_batch = data_reader2.batch_generator([DATA_PATH_OPTICAL_TRAIN], record_size, batch_size)  # train set
data_batch_eval = data_reader2.batch_generator([DATA_PATH_EVAL], record_size, batch_size)  # eval set

##############################################################
######################### FUNCTIONS ##########################
##############################################################

# extract optical array from list
# A helper function. Data returned from segment_batch is a list which contains two arrays.
# The first array contains the optical data while the second contains the ground truth data
def extract_optical_from_list(full_batch):
    optical = full_batch[0]  # extract array from list
    return optical

# extract ground truth array from list
# A helper function. Data returned from segment_batch is a list which contains two arrays.
# The first array contains the optical data while the second contains the ground truth data
def extract_gt_from_list(full_batch):
    gt = full_batch[1]  # extract array from list
    return gt

# This function accepts a tensor of size [batch_size, 2, record_size]
# and segments it into two tensors of size [batch_size, record] along the second dimension
# IMPORTANT: to be executed within an active session
def segment_batch(batch_p):
    batch_xs = tf.slice(batch_p, [0, 0, 0], [batch_size, 1, n_input])  # optical data tensor
    batch_ys = tf.slice(batch_p, [0, 1, 0], [batch_size, 1, n_input])  # GT data tensor
    optical = tf.reshape([batch_xs], [batch_size, n_input])
    gt = tf.reshape([batch_ys], [batch_size, n_input])
    return [optical, gt]

# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1):
    # Conv2D wrapper, with bias and relu activation
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
    x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)

def maxpool2d(x, k=2):
    # MaxPool2D wrapper
    return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                          padding='SAME')

# Create model
def conv_net(x, weights, biases, dropout):
    # Reshape input picture into 64x64 subimages [rows, rows, cols, channels]
    x1 = tf.reshape(x, shape=[-1, rows, cols, 1])  # this is the 4-dimensional input that tf.nn.conv2d expects
    # Convolution Layer
    conv1 = conv2d(x1, weights['wc1'], biases['bc1'])
    # Max Pooling (down-sampling)
    conv1 = maxpool2d(conv1, k=2)
    # Convolution Layer
    conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
    # Max Pooling (down-sampling)
    conv2 = maxpool2d(conv2, k=2)
    # Fully connected layer
    # Reshape conv2 output to fit fully connected layer input
    fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
    fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
    fc1 = tf.nn.relu(fc1)
    # Apply Dropout
    # fc1 = tf.nn.dropout(fc1, dropout)
    # Output image (edge), prediction
    out = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
    # Add print operation
    out = tf.Print(out, [out], message="This is out: ")
    return [out, x]

# Store layers weight & bias
weights = {
    # 5x5 conv, 1 input, 32 outputs
    'wc1': tf.Variable(tf.random_normal([5, 5, 1, 32])),
    # 5x5 conv, 32 inputs, 64 outputs
    'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
    # fully connected, 16*16*64 inputs, 1024 outputs
    'wd1': tf.Variable(tf.random_normal([16*16*64, 1024])),
    # 1024 inputs, n_classes outputs (one per output pixel)
    'out': tf.Variable(tf.random_normal([1024, n_classes]))
}
biases = {
    'bc1': tf.Variable(tf.random_normal([32])),
    'bc2': tf.Variable(tf.random_normal([64])),
    'bd1': tf.Variable(tf.random_normal([1024])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

####################################################################
##################### PLACEHOLDERS #################################
####################################################################

# tf Graph input (only pictures)
X = tf.placeholder_with_default(extract_optical_from_list(segment_batch(data_batch)), [batch_size, n_input])

####################################################################
##################### END OF PLACEHOLDERS ##########################
####################################################################

# tf Graph input
keep_prob = tf.Variable(dropout)  # dropout (keep probability)

# Construct model
pred = conv_net(extract_optical_from_list(X), weights, biases, keep_prob)  # x[0] is the optical data
y_true = extract_gt_from_list(extract_gt_from_list(X))  # y_true is the ground truth data

# Define loss and optimizer
cost = tf.reduce_mean(tf.pow(y_true - pred[0], 2))
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(cost)

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    step = 1
    # Keep training until reach max iterations
    while step * batch_size < training_iters:
        print("Optimizing")
        sess.run(optimizer)
        print("Iter " + str(step * batch_size))
        step += 1
    print("Optimization Finished!")
After a lot of tweaking with the shapes of the tensors, I managed to fix the syntax errors. Unfortunately, it just hangs the moment it starts executing the optimization part of the graph. Since I have no way to debug this (I found very scarce info on using the Tensorflow debugger), I'm really at a loss as to what has gone wrong! If someone with more experience with Tensorflow can point out what is wrong with this code, it would help me a lot.
Thanks in advance.
You need to start the queue runners so that the optimizer can get data from the queue:
....
coord = tf.train.Coordinator()
with tf.Session() as sess:
    sess.run(init)
    tf.train.start_queue_runners(sess=sess, coord=coord)
    ....
# also use tf.nn.sparse_softmax_cross_entropy_with_logits for cost
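Filled in, the training loop from the question might look like this (a sketch; request_stop/join is the standard way to shut the input threads down cleanly):

coord = tf.train.Coordinator()
with tf.Session() as sess:
    sess.run(init)
    # start the input-pipeline threads that feed tf.train.shuffle_batch
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        step = 1
        while step * batch_size < training_iters and not coord.should_stop():
            _, c = sess.run([optimizer, cost])
            print("Iter " + str(step * batch_size) + ", cost " + str(c))
            step += 1
        print("Optimization Finished!")
    finally:
        # ask the input threads to stop and wait for them to shut down
        coord.request_stop()
        coord.join(threads)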
