TensorFlow: How to use Adam optimizer properly

TensorFlow: How to use Adam optimizer properly - python

Somebody have already asked a similar question, but the solution, which is given there, does not work for me.
I am trying to use Adam optimizer in TensorFlow. Here is a part of my code about it:
adamOptimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9,
beta2=0.999, epsilon=1e-08, use_locking=False, name='Adam')
print('Optimizer was created!')
# Create a variable to track the global step.
global_step = tf.Variable(0, name='global_step', trainable=False)
# Initialize variables
vars_to_init = ae.get_variables_to_init(n)
vars_to_init.append(global_step)
vars_to_init.append
sess.run(tf.variables_initializer(vars_to_init))
# Create an optimizer
train_op = adamOptimizer.minimize(loss, global_step=global_step)
The following error is raised after train_op is used for the first time:
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value pretrain_1/beta2_power
[[Node: pretrain_1/beta2_power/read = IdentityT=DT_FLOAT, _class=["loc:#autoencoder_variables/weights1"], _device="/job:localhost/replica:0/task:0/cpu:0"]]
If I try to add a line
vars_to_init.append(beta2_power)
I am getting the following error:
NameError: global name 'beta2_power' is not defined
If I follow an advice for the similar question and replace sess.run(tf.variables_initializer(vars_to_init)) by sess.run(tf.initialize_all_variables()), I am getting the following error after running this line:
FailedPreconditionError: Attempting to use uninitialized value autoencoder_variables/biases1
[[Node: autoencoder_variables/biases1/read = IdentityT=DT_FLOAT, _class=["loc:#autoencoder_variables/biases1"], _device="/job:localhost/replica:0/task:0/cpu:0"]]
I didn't have any problems when I was using Gradient Descent optimizer...
What am I doing wrong? What is the proper way to use this optimizer?
More details about the class to clarify autoencoder_variables:
class AutoEncoder(object):
_weights_str = "weights{0}"
_biases_str = "biases{0}"
def __init__(self, shape, sess):
self.__shape = shape
self.__num_hidden_layers = len(self.__shape) - 2
self.__variables = {}
self.__sess = sess
self._setup_variables()
#property
def shape(self):
return self.__shape
#property
def num_hidden_layers(self):
return self.__num_hidden_layers
#property
def session(self):
return self.__sess
def __getitem__(self, item):
return self.__variables[item]
def __setitem__(self, key, value):
self.__variables[key] = value
def _setup_variables(self):
with tf.name_scope("autoencoder_variables"):
for i in xrange(self.__num_hidden_layers + 1):
# Train weights
name_w = self._weights_str.format(i + 1)
w_shape = (self.__shape[i], self.__shape[i + 1])
a = tf.mul(4.0, tf.sqrt(6.0 / (w_shape[0] + w_shape[1])))
w_init = tf.random_uniform(w_shape, -1 * a, a)
self[name_w] = tf.Variable(w_init,
name=name_w,
trainable=True)
# Train biases
name_b = self._biases_str.format(i + 1)
b_shape = (self.__shape[i + 1],)
b_init = tf.zeros(b_shape)
self[name_b] = tf.Variable(b_init, trainable=True, name=name_b)
if i <= self.__num_hidden_layers:
# Hidden layer fixed weights (after pretraining before fine tuning)
self[name_w + "_fixed"] = tf.Variable(tf.identity(self[name_w]),
name=name_w + "_fixed",
trainable=False)
# Hidden layer fixed biases
self[name_b + "_fixed"] = tf.Variable(tf.identity(self[name_b]),
name=name_b + "_fixed",
trainable=False)
# Pretraining output training biases
name_b_out = self._biases_str.format(i + 1) + "_out"
b_shape = (self.__shape[i],)
b_init = tf.zeros(b_shape)
self[name_b_out] = tf.Variable(b_init,
trainable=True,
name=name_b_out)
def _w(self, n, suffix=""):
return self[self._weights_str.format(n) + suffix]
def _b(self, n, suffix=""):
return self[self._biases_str.format(n) + suffix]
def get_variables_to_init(self, n):
assert n > 0
assert n <= self.__num_hidden_layers + 1
vars_to_init = [self._w(n), self._b(n)]
if n <= self.__num_hidden_layers:
vars_to_init.append(self._b(n, "_out"))
if 1 < n <= self.__num_hidden_layers+1:
# Fixed matrices for learning of deeper layers
vars_to_init.append(self._w(n - 1, "_fixed"))
vars_to_init.append(self._b(n - 1, "_fixed"))
return vars_to_init

The problem was that I was using one variable's values in order to initialize other variable (it raised an error of using uninitialized variables during initialization).
Instead of using another variable during initialization,
self[name_b + "_fixed"] = tf.Variable(tf.identity(self[name_b]),
name=name_b + "_fixed",
trainable=False)
I initialize it randomly:
self[name_b + "_fixed"] = tf.Variable(init_b,
name=name_b + "_fixed",
trainable=False)
And assigned it to another variable once it was trained:
ae[name_w + "_fixed"] = tf.identity(ae[name_w])

Related

Keras - ValueError: Could not interpret loss function identifier

I am trying to build the autoencoder structure detailed in this IEEE article. The autoencoder uses a separable loss function where it is required that I create a custom loss function for the "cluster loss" term of the separable loss function as a function of the average output of the encoder. I create my own layer called RffConnected that calculates the cluster loss and utilizes the add_loss method. Otherwise, this RffConnected layer should act as just a normal deep layer.
Here are my relevant code snippets:
import matplotlib.pyplot as plot
from mpl_toolkits.axes_grid1 import ImageGrid
import numpy as np
import math
from matplotlib.figure import Figure
import tensorflow as tf
import keras
from keras import layers
import random
import time
from os import listdir
#loads data from a text file
def loadData(basePath, samplesPerFile, sampleRate):
real = []
imag = []
fileOrder = []
for file in listdir(basePath):
if((file != "READ_ME") and ((file != "READ_ME.txt"))):
fid = open(basePath + "\\" + file, "r")
fileOrder.append(file)
t = 0
sampleEvery = samplesPerFile / sampleRate
temp1 = []
temp2 = []
times = []
for line in fid.readlines():
times.append(t)
samples = line.split("\t")
temp1.append(float(samples[0]))
temp2.append(float(samples[1]))
t = t + sampleEvery
real.append(temp1)
imag.append(temp2)
fid.close()
real = np.array(real)
imag = np.array(imag)
return real, imag, times, fileOrder
#####################################################################################################
#Breaks up and randomizes data
def breakUpData(real, imag, times, numPartitions, basePath):
if(len(real) % numPartitions != 0):
raise ValueError("Error: The length of the dataset must be divisible by the number of partitions.")
newReal = []
newImag = []
newTimes = []
fileOrder = listdir(basePath)
dataFiles = []
interval = int(len(real[0]) / numPartitions)
for i in range(0, interval):
newTimes.append(times[i])
for i in range(0, len(real)):
tempI = []
tempQ = []
for j in range(0, len(real[0])):
tempI.append(real[i, j])
tempQ.append(imag[i, j])
if((j + 1) % interval == 0):
newReal.append(tempI)
newImag.append(tempQ)
#fileName = fileOrder[i][0: fileOrder[i].find("_") + 3]
dataFiles.append(fileOrder[i])
tempI = []
tempQ = []
#randomizes the broken up dataset and the file list
for i in range(0, len(newReal)):
r = random.randint(0, len(newReal) - 1)
tempReal = newReal[i]
tempImag = newImag[i]
newReal[i] = newReal[r]
newImag[i] = newImag[r]
newReal[r] = tempReal
newImag[r] = tempImag
tempFile = dataFiles[i]
dataFiles[i] = dataFiles[r]
dataFiles[r] = tempFile
#return np.array(newReal), np.array(newImag), newTimes, dataFiles
return newReal, newImag, newTimes, dataFiles
#####################################################################################################
#custom loss layer for the RffAe-S that calculates the clustering loss term
class RffConnected(layers.Layer):
def __init__(self, output_dim, batchSize, beta, alpha):
super(RffConnected, self).__init__()
# self.total = tf.Variable(initial_value=tf.zeros((input_dim,)), trainable=False)
#array = np.zeros(output_dim)
self.iters = 0.0
self.beta = beta
self.alpha = alpha
self.batchSize = batchSize
self.output_dim = output_dim
self.sum = tf.zeros(output_dim, tf.float64)
self.moving_average = tf.zeros(output_dim, tf.float64)
self.clusterloss = tf.zeros(output_dim, tf.float64)
self.sum = tf.cast(self.sum, tf.float32)
self.moving_average = tf.cast(self.moving_average, tf.float32)
self.clusterloss = tf.cast(self.clusterloss, tf.float32)
# self.sum = keras.Input(shape=(self.output_dim,))
# self.moving_average = keras.Input(shape=(self.output_dim,))
# self.clusterloss = keras.Input(shape=(self.output_dim,))
def build(self, input_shape):
self.kernel = self.add_weight(name = 'kernel', \
shape = (int(input_shape[-1]), self.output_dim), \
initializer = 'normal', trainable = True)
#self.kernel = tf.cast(self.kernel, tf.float64)
super(RffConnected, self).build(int(input_shape[-1]))
def call(self, inputs):
#keeps track of training epochs
self.iters = self.iters + 1
#inputs = tf.cast(inputs, tf.float64)
#where this custom layer acts as a normal layer- the loss then uses this
#calc = keras.backend.dot(inputs, self.kernel)
calc = tf.matmul(inputs, self.kernel)
#cumulative sum of deep encoded features
#self.sum = state_ops.assign(self.sum, tf.reshape(tf.math.add(self.sum, calc), tf.shape(self.sum)))
#self.sum = tf.ops.state_ops.assign(self.sum, tf.math.add(self.sum, calc))
#self.sum.assign_add(calc)
self.sum = tf.math.add(self.sum, calc)
#calculate the moving average and loss if we have already trained one batch
if(self.iters >= self.batchSize):
self.moving_average = tf.math.divide(self.sum, self.iters)
self.clusterloss = tf.math.exp(\
tf.math.multiply(-1 * self.beta, tf.math.reduce_sum(tf.math.square(tf.math.subtract(inputs, self.moving_average)))))
#self.add_loss(tf.math.multiply(self.clusterloss, self.alpha))
self.add_loss(self.clusterloss.numpy() * self.alpha)
return calc
#####################################################################################################
def customloss(y_true, y_pred):
loss = tf.square(y_true - y_pred)
print(loss)
return loss
#####################################################################################################
realTraining = np.array(real[0:2200])
realTesting = np.array(real[2200:-1])
imagTraining = np.array(imag[0:2200])
imagTesting = np.array(imag[2200:-1])
numInputs = len(realTraining[0])
i_sig = keras.Input(shape=(numInputs,))
q_sig = keras.Input(shape=(numInputs,))
iRff = tf.keras.layers.experimental.RandomFourierFeatures(numInputs, \
kernel_initializer='gaussian', scale=9.0)(i_sig)
rff1 = keras.Model(inputs=i_sig, outputs=iRff)
qRff = tf.keras.layers.experimental.RandomFourierFeatures(numInputs, \
kernel_initializer='gaussian', scale=9.0)(q_sig)
rff2 = keras.Model(inputs=q_sig, outputs=qRff)
combined = layers.Concatenate()([iRff, qRff])
combineRff = tf.keras.layers.experimental.RandomFourierFeatures(4 * numInputs, \
kernel_initializer='gaussian', scale=10.0)(combined)
preprocess = keras.Model(inputs=[iRff, qRff], outputs=combineRff)
#print(realTraining[0:5])
preprocessedTraining = preprocess.predict([realTraining, imagTraining])
preprocessedTesting = preprocess.predict([realTesting, imagTesting])
################## Entering Encoder ######################
encoderIn = keras.Input(shape=(4*numInputs,))
#connected1 = layers.Dense(100, activation="sigmoid")(encoderIn)
clusterLossLayer = RffConnected(100, 30, 1.00, 100.00)(encoderIn)
#clusterLossLayer = myRffConnected(256)(connected1)
encoder = keras.Model(inputs=encoderIn, outputs=clusterLossLayer)
################## Entering Decoder ######################
connected2 = layers.Dense(125, activation="sigmoid")(clusterLossLayer)
relu1 = layers.ReLU()(connected2)
dropout = layers.Dropout(0.2)(relu1)
reshape1 = layers.Reshape((25, 5, 1))(dropout)
bn1 = layers.BatchNormalization()(reshape1)
trans1 = layers.Conv2DTranspose(1, (4, 2))(bn1)
ups1 = layers.UpSampling2D(size=(2, 1))(trans1)
relu2 = layers.ReLU()(ups1)
bn2 = layers.BatchNormalization()(relu2)
trans2 = layers.Conv2DTranspose(1, (4, 2))(bn2)
ups2 = layers.UpSampling2D(size=(2, 1))(trans2)
relu3 = layers.ReLU()(ups2)
bn3 = layers.BatchNormalization()(relu3)
trans3 = layers.Conv2DTranspose(1, (5, 2))(bn3)
ups3 = layers.UpSampling2D(size=(2, 1))(trans3)
relu4 = layers.ReLU()(ups3)
bn4 = layers.BatchNormalization()(relu4)
trans4 = layers.Conv2DTranspose(1, (7, 1))(bn4)
reshape2 = layers.Reshape((4*numInputs, 1, 1))(trans4)
autoencoder = keras.Model(inputs=encoderIn, outputs=reshape2)
encoded_input = keras.Input(shape=(None, 100))
decoder_layer = autoencoder.layers[-1]
#autoencoder.summary()
autoencoder.compile(optimizer='adam', loss=[autoencoder.losses[-1], customloss], metrics=['accuracy', 'accuracy'])
autoencoder.fit(preprocessedTraining, preprocessedTraining, epochs=100, batch_size=20, shuffle=True, validation_data=(preprocessedTesting, preprocessedTesting))
It seems like it runs for two training epochs then it gives me an error. I end up getting this error when I run it:
ValueError: Could not interpret loss function identifier: Tensor("rff_connected_137/Const:0", shape=(100,), dtype=float32)
I've already spent a considerable amount of time debugging this thing, although if you spot any more errors I would appreciate a heads-up. Thank you in advance.

According to the documentation of the keras Keras Model Training-Loss, the 'loss' attribute can take the value of float tensor (except for the sparse loss functions returning integer arrays) with a specific shape.
If it is necessary to combine two loss functions, it would be better to perform mathematical calculations within your custom loss function to return an output of float tensor. This reference might be a help Keras CustomLoss definition.

Adding layers and bidirectionality to custom LSTM cell in pytorch

I use a very custom LSTM-cell inspired by http://mlexplained.com/2019/02/15/building-an-lstm-from-scratch-in-pytorch-lstms-in-depth-part-1/.
I use it to look at intermediate gating values. My question is, how would I expand this class to have an option for adding more layers and for adding bidirectionality? Should it be wrapped in a new class or added in the present one?
class Dim(IntEnum):
batch = 0
seq = 1
class simpleLSTM(nn.Module):
def __init__(self, input_sz: int, hidden_sz: int):
super().__init__()
self.input_size = input_sz
self.hidden_size = hidden_sz
# input gate
self.W_ii = Parameter(torch.Tensor(input_sz, hidden_sz))
self.W_hi = Parameter(torch.Tensor(hidden_sz, hidden_sz))
self.b_i = Parameter(torch.Tensor(hidden_sz))
# forget gate
self.W_if = Parameter(torch.Tensor(input_sz, hidden_sz))
self.W_hf = Parameter(torch.Tensor(hidden_sz, hidden_sz))
self.b_f = Parameter(torch.Tensor(hidden_sz))
# ???
self.W_ig = Parameter(torch.Tensor(input_sz, hidden_sz))
self.W_hg = Parameter(torch.Tensor(hidden_sz, hidden_sz))
self.b_g = Parameter(torch.Tensor(hidden_sz))
# output gate
self.W_io = Parameter(torch.Tensor(input_sz, hidden_sz))
self.W_ho = Parameter(torch.Tensor(hidden_sz, hidden_sz))
self.b_o = Parameter(torch.Tensor(hidden_sz))
self.init_weights()
self.out = nn.Linear(hidden_sz, len(TRG.vocab))
def init_weights(self):
for p in self.parameters():
if p.data.ndimension() >= 2:
nn.init.xavier_uniform_(p.data)
else:
nn.init.zeros_(p.data)
def forward(self, x, init_states=None ):
"""Assumes x is of shape (batch, sequence, feature)"""
seq_sz, bs, = x.size()
hidden_seq = []
prediction = []
if init_states is None:
h_t, c_t = torch.zeros(self.hidden_size).to(x.device), torch.zeros(self.hidden_size).to(x.device)
else:
h_t, c_t = init_states
for t in range(seq_sz): # iterate over the time steps
x_t = x[t, :].float()
#LOOK HERE!!!
i_t = torch.sigmoid(x_t # self.W_ii + h_t # self.W_hi + self.b_i)
f_t = torch.sigmoid(x_t # self.W_if + h_t # self.W_hf + self.b_f)
g_t = torch.tanh(x_t # self.W_ig + h_t # self.W_hg + self.b_g)
o_t = torch.sigmoid(x_t # self.W_io + h_t # self.W_ho + self.b_o)
c_t = f_t * c_t + i_t * g_t
h_t = o_t * torch.tanh(c_t)
hidden_seq.append(h_t.unsqueeze(Dim.batch))
pred_t = self.out(h_t.unsqueeze(Dim.batch))
#pred_t = F.softmax(pred_t)
prediction.append(pred_t)
hidden_seq = torch.cat(hidden_seq, dim=Dim.batch)
prediction = torch.cat(prediction, dim=Dim.batch)
# reshape from shape (sequence, batch, feature) to (batch, sequence, feature)
hidden_seq = hidden_seq.transpose(Dim.batch, Dim.seq).contiguous()
prediction = prediction.transpose(Dim.batch, Dim.seq).contiguous()
return prediction, hidden_seq, (h_t, c_t)
I call it and train using the following as an example.
lstm = simpleLSTM(1, 100)
hidden_size = lstm.hidden_size
optimizer = optim.Adam(lstm.parameters())
h_0, c_0 = (torch.zeros(hidden_size, requires_grad=True),
torch.zeros(hidden_size, requires_grad=True))
grads = []
h_t, c_t = h_0, c_0
N_EPOCHS = 10
for epoch in range(N_EPOCHS):
epoch_loss = 0
for i, batch in enumerate(train):
optimizer.zero_grad()
src, src_len = batch.src
trg = batch.trg
trg = trg.view(-1)
predict, output, hidden_states = lstm(src)
predict = predict.t().unsqueeze(1)
predict= predict.view(-1, predict.shape[-1])
loss = criterion(predict,trg)
loss.backward()
optimizer.step()
epoch_loss += loss.item()
print(epoch_loss)

The easiest would be to create another module (say Bidirectional) and pass any cell you want to it.
Implementation itself is quite easy to do. Notice that I'm using concat operation for joining bi-directional output, you may want to specify other modes like summation etc.
Please read the comments in the code below, you may have to change it appropriately.
import torch
class Bidirectional(torch.nn.Module):
def __init__(self, cell):
super().__init__()
self.cell = cell
def __call__(self, x, init_states=None):
prediction, hidden_seq, (h_t, c_t) = self.cell(x, init_states)
backward_prediction, backward_hidden_seq, (
backward_h_t,
backward_c_t,
# Assuming sequence is first dimension, otherwise change 0 appropriately
# Reverses sequences so the LSTM cell acts on the reversed sequence
) = self.cell(torch.flip(x, (0,)), init_states)
return (
# Assuming you transpose so it has (batch, seq, features) dimensionality
torch.cat((prediction, backward_prediction), 2),
torch.cat((hidden_seq, backward_hidden_seq), 2),
# Assuming it has (batch, features) dimensionality
torch.cat((h_t, backward_ht), 1),
torch.cat((c_t, backward_ct), 1),
)
When it comes to multiple layers you could do something similiar in principle:
import torch
class Multilayer(torch.nn.Module):
def __init__(self, *cells):
super().__init__()
self.cells = torch.nn.ModuleList(cells)
def __call__(self, x, init_states=None):
inputs = x
for cell in self.cells:
prediction, hidden_seq, (h_t, c_t) = cell(inputs, init_states)
inputs = hidden_seq
return prediction, hidden_seq, (h_t, c_t)
Please note you have to pass created cell objects into Multilayer e.g.:
# For three layers of LSTM, each needs features to be set up correctly
multilayer_LSTM = Multilayer(LSTM(), LSTM(), LSTM())
You may also pass classes instead of instances into constructor and create those inside Multilayer (so hidden_size matches automatically), but those ideas should get you started.

Gradients error using TensorArray Tensorflow

i am trying to implement multidimentional lstm in tensorflow, I am using TensorArray to remember previous states, i am using a complicated way to get two neigbours state(above and from left). tf.cond want that both posible condition to exist and to have the same number of inputs. this is why i added one more cell.zero_state to the (last index +1) of the states. then i using a function to get the correct indexes to the states. when i am trying to use an optimizer in order to minimize a cost, i getting that error:
InvalidArgumentError (see above for traceback): TensorArray
MultiDimentionalLSTMCell-l1-multi-l1/state_ta_262#gradients: Could not
read from TensorArray index 809 because it has not yet been written
to.
Can someone tell how to fix it?
Ps: without optimizer it works!
class MultiDimentionalLSTMCell(tf.nn.rnn_cell.RNNCell):
"""
Note that state_is_tuple is always True.
"""
def __init__(self, num_units, forget_bias=1.0, activation=tf.nn.tanh):
self._num_units = num_units
self._forget_bias = forget_bias
self._activation = activation
#property
def state_size(self):
return tf.nn.rnn_cell.LSTMStateTuple(self._num_units, self._num_units)
#property
def output_size(self):
return self._num_units
def __call__(self, inputs, state, scope=None):
"""Long short-term memory cell (LSTM).
#param: imputs (batch,n)
#param state: the states and hidden unit of the two cells
"""
with tf.variable_scope(scope or type(self).__name__):
c1,c2,h1,h2 = state
# change bias argument to False since LN will add bias via shift
concat = tf.nn.rnn_cell._linear([inputs, h1, h2], 5 * self._num_units, False)
i, j, f1, f2, o = tf.split(1, 5, concat)
new_c = (c1 * tf.nn.sigmoid(f1 + self._forget_bias) +
c2 * tf.nn.sigmoid(f2 + self._forget_bias) + tf.nn.sigmoid(i) *
self._activation(j))
new_h = self._activation(new_c) * tf.nn.sigmoid(o)
new_state = tf.nn.rnn_cell.LSTMStateTuple(new_c, new_h)
return new_h, new_state
def multiDimentionalRNN_whileLoop(rnn_size,input_data,sh,dims=None,scopeN="layer1"):
"""Implements naive multidimentional recurent neural networks
#param rnn_size: the hidden units
#param input_data: the data to process of shape [batch,h,w,chanels]
#param sh: [heigth,width] of the windows
#param dims: dimentions to reverse the input data,eg.
dims=[False,True,True,False] => true means reverse dimention
#param scopeN : the scope
returns [batch,h/sh[0],w/sh[1],chanels*sh[0]*sh[1]] the output of the lstm
"""
with tf.variable_scope("MultiDimentionalLSTMCell-"+scopeN):
cell = MultiDimentionalLSTMCell(rnn_size)
shape = input_data.get_shape().as_list()
if shape[1]%sh[0] != 0:
offset = tf.zeros([shape[0], sh[0]-(shape[1]%sh[0]), shape[2], shape[3]])
input_data = tf.concat(1,[input_data,offset])
shape = input_data.get_shape().as_list()
if shape[2]%sh[1] != 0:
offset = tf.zeros([shape[0], shape[1], sh[1]-(shape[2]%sh[1]), shape[3]])
input_data = tf.concat(2,[input_data,offset])
shape = input_data.get_shape().as_list()
h,w = int(shape[1]/sh[0]),int(shape[2]/sh[1])
features = sh[1]*sh[0]*shape[3]
batch_size = shape[0]
x = tf.reshape(input_data, [batch_size,h,w, features])
if dims is not None:
x = tf.reverse(x, dims)
x = tf.transpose(x, [1,2,0,3])
x = tf.reshape(x, [-1, features])
x = tf.split(0, h*w, x)
sequence_length = tf.ones(shape=(batch_size,), dtype=tf.int32)*shape[0]
inputs_ta = tf.TensorArray(dtype=tf.float32, size=h*w,name='input_ta')
inputs_ta = inputs_ta.unpack(x)
states_ta = tf.TensorArray(dtype=tf.float32, size=h*w+1,name='state_ta',clear_after_read=False)
outputs_ta = tf.TensorArray(dtype=tf.float32, size=h*w,name='output_ta')
states_ta = states_ta.write(h*w, tf.nn.rnn_cell.LSTMStateTuple(tf.zeros([batch_size,rnn_size], tf.float32),
tf.zeros([batch_size,rnn_size], tf.float32)))
def getindex1(t,w):
return tf.cond(tf.less_equal(tf.constant(w),t),
lambda:t-tf.constant(w),
lambda:tf.constant(h*w))
def getindex2(t,w):
return tf.cond(tf.less(tf.constant(0),tf.mod(t,tf.constant(w))),
lambda:t-tf.constant(1),
lambda:tf.constant(h*w))
time = tf.constant(0)
def body(time, outputs_ta, states_ta):
constant_val = tf.constant(0)
stateUp = tf.cond(tf.less_equal(tf.constant(w),time),
lambda: states_ta.read(getindex1(time,w)),
lambda: states_ta.read(h*w))
stateLast = tf.cond(tf.less(constant_val,tf.mod(time,tf.constant(w))),
lambda: states_ta.read(getindex2(time,w)),
lambda: states_ta.read(h*w))
currentState = stateUp[0],stateLast[0],stateUp[1],stateLast[1]
out , state = cell(inputs_ta.read(time),currentState)
outputs_ta = outputs_ta.write(time,out)
states_ta = states_ta.write(time,state)
return time + 1, outputs_ta, states_ta
def condition(time,outputs_ta,states_ta):
return tf.less(time , tf.constant(h*w))
result , outputs_ta, states_ta = tf.while_loop(condition, body, [time,outputs_ta,states_ta])
outputs = outputs_ta.pack()
states = states_ta.pack()
y = tf.reshape(outputs, [h,w,batch_size,rnn_size])
y = tf.transpose(y, [2,0,1,3])
if dims is not None:
y = tf.reverse(y, dims)
return y
def tanAndSum(rnn_size,input_data,scope):
outs = []
for i in range(2):
for j in range(2):
dims = [False]*4
if i!=0:
dims[1] = True
if j!=0:
dims[2] = True
outputs = multiDimentionalRNN_whileLoop(rnn_size,input_data,[2,2],
dims,scope+"-multi-l{0}".format(i*2+j))
outs.append(outputs)
outs = tf.pack(outs, axis=0)
mean = tf.reduce_mean(outs, 0)
return tf.nn.tanh(mean)
graph = tf.Graph()
with graph.as_default():
input_data = tf.placeholder(tf.float32, [20,36,90,1])
#input_data = tf.ones([20,36,90,1],dtype=tf.float32)
sh = [2,2]
out1 = tanAndSum(20,input_data,'l1')
out = tanAndSum(25,out1,'l2')
cost = tf.reduce_mean(out)
optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)
#out = multiDimentionalRNN_raw_rnn(2,input_data,sh,dims=[False,True,True,False],scopeN="layer1")
#cell = MultiDimentionalLSTMCell(10)
#out = cell.zero_state(2, tf.float32).c
with tf.Session(graph=graph) as session:
tf.global_variables_initializer().run()
ou,k,_ = session.run([out,cost,optimizer],{input_data:np.ones([20,36,90,1],dtype=np.float32)})
print(ou.shape)
print(k)

You should add parameter parallel_iterations=1 to your while loop call.
Such as:
result, outputs_ta, states_ta = tf.while_loop(
condition, body, [time,outputs_ta,states_ta], parallel_iterations=1)
This is required because inside body you perform read and write operations on the same tensor array (states_ta). And in case of parallel loop execution(parallel_iterations > 1) some thread may try to read info from tensorArray, that was not written to it by another one.
I've test your code snippet with parallel_iterations=1 on tensorflow 0.12.1 and it works as expected.

The node 'Merge/MergeSummary' has inputs from different frames: what does it mean?

trying to merge all my summaries, I have an error saying that the inputs of Merge/MergeSummary comes from different frames. So, first of all: what is a frame? Could you please point me somewhere in the TF documentation about such stuff? -- of course, I googled a bit but could find almost nothing. How can I fix this issue? Below the code to reproduce the error. Thanks in advance.
import numpy as np
import tensorflow as tf
tf.reset_default_graph()
tf.set_random_seed(23)
BATCH = 2
LENGTH = 4
SIZE = 5
ATT_SIZE = 3
NUM_QUERIES = 2
def linear(inputs, output_size, use_bias=True, activation_fn=None):
"""Linear projection."""
input_shape = inputs.get_shape().as_list()
input_size = input_shape[-1]
output_shape = input_shape[:-1] + [output_size]
if len(output_shape) > 2:
output_shape_tensor = tf.unstack(tf.shape(inputs))
output_shape_tensor[-1] = output_size
output_shape_tensor = tf.stack(output_shape_tensor)
inputs = tf.reshape(inputs, [-1, input_size])
kernel = tf.get_variable("kernel", [input_size, output_size])
output = tf.matmul(inputs, kernel)
if use_bias:
output = output + tf.get_variable('bias', [output_size])
if len(output_shape) > 2:
output = tf.reshape(output, output_shape_tensor)
output.set_shape(output_shape) # pylint: disable=I0011,E1101
if activation_fn is not None:
return activation_fn(output)
return output
class Attention(object):
"""Attention mechanism implementation."""
def __init__(self, attention_states, attention_size):
"""Initializes a new instance of the Attention class."""
self._states = attention_states
self._attention_size = attention_size
self._batch = tf.shape(self._states)[0]
self._length = tf.shape(self._states)[1]
self._size = self._states.get_shape()[2].value
self._features = None
def _init_features(self):
states = tf.reshape(
self._states, [self._batch, self._length, 1, self._size])
weights = tf.get_variable(
"kernel", [1, 1, self._size, self._attention_size])
self._features = tf.nn.conv2d(states, weights, [1, 1, 1, 1], "SAME")
def get_weights(self, query, scope=None):
"""Reurns the attention weights for the given query."""
with tf.variable_scope(scope or "Attention"):
if self._features is None:
self._init_features()
else:
tf.get_variable_scope().reuse_variables()
vect = tf.get_variable("Vector", [self._attention_size])
with tf.variable_scope("Query"):
query_features = linear(query, self._attention_size, False)
query_features = tf.reshape(
query_features, [-1, 1, 1, self._attention_size])
activations = vect * tf.tanh(self._features + query_features)
activations = tf.reduce_sum(activations, [2, 3])
with tf.name_scope('summaries'):
tf.summary.histogram('histogram', activations)
return tf.nn.softmax(activations)
states = tf.placeholder(tf.float32, shape=[BATCH, None, SIZE]) # unknown length
queries = tf.placeholder(tf.float32, shape=[NUM_QUERIES, BATCH, ATT_SIZE])
attention = Attention(states, ATT_SIZE)
func = lambda x: attention.get_weights(x, "Softmax")
weights = tf.map_fn(func, queries)
for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
name = var.name.replace(':', '_')
tf.summary.histogram(name, var)
summary_op = tf.summary.merge_all()
states_np = np.random.rand(BATCH, LENGTH, SIZE)
queries_np = np.random.rand(NUM_QUERIES, BATCH, ATT_SIZE)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
weights_np, summary_str = sess.run([weights, summary_op], {states: states_np, queries: queries_np})
print weights_np

The error message was indeed not user friendly. It has been updated to
ValueError: Cannot use 'map/while/summaries/histogram' as input to 'Merge/MergeSummary' because 'map/while/summaries/histogram' is in a while loop. See info log for more details.
As the new message says, the problem is that you cannot produce summaries from inside of the while loop. The frame that the original message referred to is the "execution frame" of the while loop - all the state for each iteration of the while loop is kept in a frame.
In this case, the while_loop is created by tf.map_fn and the summary inside it is tf.summary.histogram('histogram', activations).
There are a couple of ways to deal with this. You can take the summary out of the get_weights, have the get_weights return activations as well, create the summary using the newly returned activations from tf.map_fn call.
Another approach, if NUM_QUERIES is constant and small, can be to statically unroll the loop instead of using tf.map_fn. Here is the code to do this:
# TOP PART OF THE CODE IS THE SAME
states = tf.placeholder(tf.float32, shape=[BATCH, None, SIZE]) # unknown length
queries = tf.placeholder(tf.float32, shape=[NUM_QUERIES, BATCH, ATT_SIZE])
attention = Attention(states, ATT_SIZE)
func = lambda x: attention.get_weights(x, "Softmax")
# NEW CODE BEGIN
split_queries = tf.split(queries, NUM_QUERIES)
weights = []
for query in split_queries:
weights.append(func(query))
# NEW CODE END
for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
name = var.name.replace(':', '_')
tf.summary.histogram(name, var)
summary_op = tf.summary.merge_all()
states_np = np.random.rand(BATCH, LENGTH, SIZE)
queries_np = np.random.rand(NUM_QUERIES, BATCH, ATT_SIZE)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
# NEW CODE BEGIN
results = sess.run(weights + [summary_op], {states: states_np, queries: queries_np})
weights_np, summary_str = results[:-1], results[-1]
# NEW CODE END
print weights_np

Tensorflow evaluate: Aborted (core dumped)

tl;dr: I input a word to my model, and am supposed to get a list of similar words and their associated measures of similarity back. I get an error: Aborted (core dumped).
My goal is to determine which words are similar to an input word, based on their feature vectors. I have model already trained. I load it and call two functions:
def main(argv=None):
model = NVDM(args)
sess_saver = tf.train.Saver()
sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)
loaded = load_for_similar(sess, sess_saver) #my function
wm = word_match(sess, loaded[0], loaded[1], "bottle", loaded[2], loaded[3], topN=5)
My problem is that I can't print out the words which are similar and the associated similarity measure. I tried (in main):
sess.run(wm)
wm[0].eval(session=sess)
print(wm)
All of which gave me the error:
F tensorflow/core/kernels/strided_slice_op.cc:316] Check failed: tmp.CopyFrom(input.Slice(begin[0], end[0]), final_shape)
Aborted (core dumped)
This tells me I'm not running the session properly. What am I doing wrong?
Details on the functions, just in case:
The function 'load_for_similar' restores the weights and bias of the decoder in my model (a variational autoencoder), and normalizes them. It also reverses the order of the keys and values in my vocabulary dictionary for later use:
def load_for_similar(sess, saver_obj):
saver_obj.restore(sess, "./CA_checkpoints/saved_model.ckpt")
vocab_file = '/path/to/vocab.pkl'
t1 = loader_object(vocab_file)
v1 = t1.get_vocab()
v1_rev = {k:v for v, k in v1.iteritems()}
decoder_mat = tf.get_collection(tf.GraphKeys.VARIABLES, scope='decoder')[0]
decoder_bias = tf.get_collection(tf.GraphKeys.VARIABLES, scope='decoder')[1]
return (find_norm(decoder_mat), find_norm(decoder_bias), v1, v1_rev)
To find similar words, I pass the normalized weight matrix and bias in to an new function, along with the feature vector of my word (vec):
def find_similar(sess, Weights, vec, bias):
dists = tf.add(tf.reduce_sum(tf.mul(Weights, vec)), bias)
best = argsort(sess, dists, reverse=True)
dist_sort = tf.nn.top_k(dists, k=dists.get_shape().as_list()[0], sorted=True).values
return dist_sort, best
Finally, I want to match the words that are closest to my supplied word, "bottle":
def word_match(sess, norm_mat , norm_bias, word_ , vocab, vocab_inverse , topN = 10):
idx = vocab[word_]
similarity_meas , indexes = find_similar(sess, norm_mat , norm_mat[idx], norm_bias)
words = tf.gather(vocab_inverse.keys(), indexes[:topN])
return (words, similarity_meas[:topN])
EDIT: in response to mrry's comment, here is the model (I hope this is what you wanted?). This code depends on utils.py, a separate utilities file. I will include that as well. Please note that this code is heavily based on Yishu Miao's and Sarath Nair's.
class NVDM(object):
""" Neural Variational Document Model -- BOW VAE.
"""
def __init__(self,
vocab_size=15000, #was 2000
n_hidden=500,
n_topic=50,
n_sample=1,
learning_rate=1e-5,
batch_size=100, #was 64
non_linearity=tf.nn.tanh):
self.vocab_size = vocab_size
self.n_hidden = n_hidden
self.n_topic = n_topic
self.n_sample = n_sample
self.non_linearity = non_linearity
self.learning_rate = learning_rate/batch_size #CA
self.batch_size = batch_size
self.x = tf.placeholder(tf.float32, [None, vocab_size], name='input')
self.mask = tf.placeholder(tf.float32, [None], name='mask') # mask paddings
# encoder
with tf.variable_scope('encoder'):
self.enc_vec = utils.mlp(self.x, [self.n_hidden, self.n_hidden])
self.mean = utils.linear(self.enc_vec, self.n_topic, scope='mean')
self.logsigm = utils.linear(self.enc_vec,
self.n_topic,
bias_start_zero=True,
matrix_start_zero=False,
scope='logsigm')
self.kld = -0.5 * tf.reduce_sum(1 - tf.square(self.mean) + 2 * self.logsigm - tf.exp(2 * self.logsigm), 1)
self.kld = self.mask*self.kld # mask paddings
with tf.variable_scope('decoder'):
if self.n_sample ==1: # single sample
p1 = tf.cast(tf.reduce_sum(self.mask), tf.int32) #needed for random normal generation
eps = tf.random_normal((p1, self.n_topic), 0, 1)
doc_vec = tf.mul(tf.exp(self.logsigm), eps) + self.mean
logits = tf.nn.log_softmax(utils.linear(doc_vec, self.vocab_size, scope='projection'))
self.recons_loss = -tf.reduce_sum(tf.mul(logits, self.x), 1)
# multiple samples
else:
eps = tf.random_normal((self.n_sample*batch_size, self.n_topic), 0, 1)
eps_list = tf.split(0, self.n_sample, eps)
recons_loss_list = []
for i in xrange(self.n_sample):
if i > 0: tf.get_variable_scope().reuse_variables()
curr_eps = eps_list[i]
doc_vec = tf.mul(tf.exp(self.logsigm), curr_eps) + self.mean
logits = tf.nn.log_softmax(utils.linear(doc_vec, self.vocab_size, scope='projection'))
recons_loss_list.append(-tf.reduce_sum(tf.mul(logits, self.x), 1))
self.recons_loss = tf.add_n(recons_loss_list) / self.n_sample
self.objective = self.recons_loss + self.kld
optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
fullvars = tf.trainable_variables()
enc_vars = utils.variable_parser(fullvars, 'encoder')
dec_vars = utils.variable_parser(fullvars, 'decoder')
enc_grads = tf.gradients(self.objective, enc_vars)
dec_grads = tf.gradients(self.objective, dec_vars)
self.optim_enc = optimizer.apply_gradients(zip(enc_grads, enc_vars))
self.optim_dec = optimizer.apply_gradients(zip(dec_grads, dec_vars))
def minibatch_bow(it1, Instance1, n_samples, batch_size, used_ints = set()):
available = set(np.arange(n_samples)) - used_ints #
if len(available) < batch_size:
indices = np.array(list(available))
else:
indices = np.random.choice(tuple(available), batch_size, replace=False)
used = used_ints
mb = itemgetter(*indices)(it1)
batch_xs = Instance1._bag_of_words(mb, vocab_size=15000)
batch_flattened = np.ravel(batch_xs)
index_positions = np.where(batch_flattened > 0)[0]
return (batch_xs, index_positions, set(indices)) #batch_xs[0] is the bag of words; batch_xs[1] is the 0/1 word used/not;
def train(sess, model, train_file, vocab_file, saver_obj, training_epochs, alternate_epochs, batch_size):
Instance1 = testchunk_Nov23.testLoader(train_file, vocab_file)
data_set = Instance1.get_batch(batch_size) #get all minibatches of size 100
n_samples = Instance1.num_reviews()
train_batches = list(data_set) #this is an itertools.chain object
it1_train = list(itertools.chain(*train_batches)) #length is 732,356. This is all the reviews.atch_size
if len(it1_train) % batch_size != 0:
total_batch = int(len(it1_train)/batch_size) + 1
else:
total_batch = int(len(it1_train)/batch_size)
trainfilesave = "train_ELBO_and_perplexity_Dec1.txt"
#Training
train_time = time.time()
for epoch in range(training_epochs):
for switch in xrange(0, 2):
if switch == 0:
optim = model.optim_dec
print_mode = 'updating decoder'
else:
optim = model.optim_enc
print_mode = 'updating encoder'
with open(trainfilesave, 'w') as f:
for i in xrange(alternate_epochs):
loss_sum = 0.0
kld_sum = 0.0
word_count = 0
used_indices = set()
for idx_batch in range(total_batch): #train_batches:
mb = minibatch_bow(it1_train, Instance1, n_samples, batch_size, used_ints=used_indices)
print('minibatch', idx_batch)
used_indices.update(mb[2])
num_mb = np.ones(mb[0][0].shape[0])
input_feed = {model.x.name: mb[0][0], model.mask: num_mb}
_, (loss, kld) = sess.run((optim,[model.objective, model.kld]) , input_feed)
loss_sum += np.sum(loss)
And the utils.py file:
def linear(inputs,
output_size,
no_bias=False,
bias_start_zero=False,
matrix_start_zero=False,
scope=None):
"""Define a linear connection."""
with tf.variable_scope(scope or 'Linear'):
if matrix_start_zero:
matrix_initializer = tf.constant_initializer(0)
else:
matrix_initializer = None
if bias_start_zero:
bias_initializer = tf.constant_initializer(0)
else:
bias_initializer = None
input_size = inputs.get_shape()[1].value
matrix = tf.get_variable('Matrix', [input_size, output_size],
initializer=matrix_initializer)
bias_term = tf.get_variable('Bias', [output_size],
initializer=bias_initializer)
output = tf.matmul(inputs, matrix)
if not no_bias:
output = output + bias_term
return output
def mlp(inputs,
mlp_hidden=[],
mlp_nonlinearity=tf.nn.tanh,
scope=None):
"""Define an MLP."""
with tf.variable_scope(scope or 'Linear'):
mlp_layer = len(mlp_hidden)
res = inputs
for l in xrange(mlp_layer):
res = mlp_nonlinearity(linear(res, mlp_hidden[l], scope='l'+str(l)))
return res

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

TensorFlow: How to use Adam optimizer properly - python

Related

Keras - ValueError: Could not interpret loss function identifier

Adding layers and bidirectionality to custom LSTM cell in pytorch

Gradients error using TensorArray Tensorflow

The node 'Merge/MergeSummary' has inputs from different frames: what does it mean?

Tensorflow evaluate: Aborted (core dumped)

Categories

Resources