I want to use the tf.data library to speed up training, but my code raises an error, shown below.
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py", line 727, in fit
use_multiprocessing=use_multiprocessing)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_arrays.py", line 675, in fit
steps_name='steps_per_epoch')
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_arrays.py", line 169, in model_iteration
ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_arrays.py", line 535, in _prepare_feed_values
extract_tensors_from_dataset=True)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py", line 2471, in _standardize_user_data
exception_prefix='input')
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_utils.py", line 517, in standardize_input_data
standardize_single_array(x, shape) for (x, shape) in zip(data, shapes)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_utils.py", line 517, in <listcomp>
standardize_single_array(x, shape) for (x, shape) in zip(data, shapes)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_utils.py", line 442, in standardize_single_array
if (x.shape is not None and len(x.shape) == 1 and
File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/framework/tensor_shape.py", line 827, in __len__
raise ValueError("Cannot take the length of shape with unknown rank.")
ValueError: Cannot take the length of shape with unknown rank.
I don't know why this happens or how to fix it.
My dataset code is below.
def _py_parse_line(line):
    line = line.decode('utf-8')
    parsed_line = line.split("\t")
    label = int(parsed_line[0])
    rawSentence = str(parsed_line[1])
    morphemePOS = str(parsed_line[2])
    NE_LIST = str(parsed_line[3])
    preprocessedDatum = FTV.featureToVectorLexMorpDictNElist(
        [rawSentence], [morphemePOS], [NE_LIST]
    )
    syllable_feature = preprocessedDatum[0][0]
    sdiDict_feature = preprocessedDatum[1][0]
    morphemePos_feature = preprocessedDatum[2][0]
    neDict_feature = preprocessedDatum[3][0]
    return syllable_feature, sdiDict_feature, morphemePos_feature, neDict_feature, label
def _decode_tsv(line):
    type_list = [tf.int32, tf.int8, tf.int32, tf.int8, tf.int64]
    data_list = tf.py_func(_py_parse_line, [line], type_list)
    label = data_list[4]
    features = {"lexical_input": data_list[0],
                "dictInfo_input": data_list[1],
                "morphemePos_input": data_list[2],
                "ne_input": data_list[3]}
    d = features, label
    return d
dataset = tf.data.TextLineDataset("/root/Workspace/ias_sdi_trainer/generatedModel/sdi_nugu.dtg.wholeData.txt")
dataset = dataset.shuffle(4096)
dataset = dataset.map(_decode_tsv)
dataset = dataset.batch(batchSize)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
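For context on the error itself: tf.py_func outputs carry no static shape, so Keras fails at len(x.shape) when it standardizes the inputs. A minimal sketch of restoring shape information inside the map function (the ranks and lengths below are assumptions; substitute the real sizes your features have):

def _decode_tsv(line):
    type_list = [tf.int32, tf.int8, tf.int32, tf.int8, tf.int64]
    data_list = tf.py_func(_py_parse_line, [line], type_list)
    # py_func outputs have unknown rank; give each tensor its shape back.
    # [None] (a rank-1 vector of unknown length) is a hypothetical
    # placeholder - ideally set the exact feature lengths here.
    for feature_tensor in data_list[:4]:
        feature_tensor.set_shape([None])
    data_list[4].set_shape([])  # scalar label
    features = {"lexical_input": data_list[0],
                "dictInfo_input": data_list[1],
                "morphemePos_input": data_list[2],
                "ne_input": data_list[3]}
    return features, data_list[4]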
I inspected the tensors with make_one_shot_iterator() and a tf.Session, and the data was what I expected.
I also found this question, but it could not help me: Cannot take the length of Shape with unknown rank
Is there a good way to use tf.data with Keras?
I want to feed data from text lines through tf.data.
Added
My TensorFlow version is 1.14.
I used the dataset in fit, as below.
sdi_model.fit(dataset, epochs=MAX_EPOC)
I also tried using the dataset with a generator. The code is below.
dataFetcher = dataset.make_one_shot_iterator()
def gen(dataFetcher):
    with tf.Session() as sess:
        while True:
            next_elem = dataFetcher.get_next()
            x_batch, y_batch = sess.run(next_elem)
            yield x_batch, y_batch
sdi_model.fit_generator(gen(dataFetcher), steps_per_epoch=100, epochs=MAX_EPOC)
With the generator, I received the error message "sess is empty graph".
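One hedged guess about the generator attempt: get_next() is called inside the loop, which adds a new op to the graph on every iteration. The usual TF 1.x pattern builds the next-element op once, outside the loop, and reuses it:

dataFetcher = dataset.make_one_shot_iterator()
next_elem = dataFetcher.get_next()  # build the op once, not per iteration

def gen():
    with tf.Session() as sess:
        while True:
            x_batch, y_batch = sess.run(next_elem)
            yield x_batch, y_batch

sdi_model.fit_generator(gen(), steps_per_epoch=100, epochs=MAX_EPOC)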
Related
I'm trying to run training in a multi-GPU environment.
Here's the model code:
net_1 = nn.Sequential(nn.Conv2d(2, 12, 5),
                      nn.MaxPool2d(2),
                      snn.Leaky(beta=beta, spike_grad=spike_grad, init_hidden=True),
                      nn.Conv2d(12, 32, 5),
                      nn.MaxPool2d(2),
                      snn.Leaky(beta=beta, spike_grad=spike_grad, init_hidden=True),
                      nn.Flatten(),
                      nn.Linear(32*5*5, 10),
                      snn.Leaky(beta=beta, spike_grad=spike_grad, init_hidden=True, output=True)
                      )
net_1.cuda()
net = nn.DataParallel(net_1)
snn.Leaky is a module used to implement an SNN structure in combination with torch.nn, which makes the network work as a kind of RNN. Links here (https://snntorch.readthedocs.io/en/latest/readme.html).
The input shape looks like this: (timestep, batchsize, 2, 32, 32).
Training code
def forward_pass(net, data):
    spk_rec = []
    utils.reset(net)  # resets hidden states for all LIF neurons in net
    for step in range(data.size(1)):  # data.size(1) = number of time steps
        datas = data[:, step, :, :, :].cuda()
        net = net.to(device)
        spk_out, mem_out = net(datas)
        spk_rec.append(spk_out)
    return torch.stack(spk_rec)
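As an aside, net = net.to(device) inside the per-timestep loop repeats work that only needs to happen once; a leaner sketch of the same pass (same names as above) would be:

def forward_pass(net, data):
    spk_rec = []
    utils.reset(net)      # resets hidden states for all LIF neurons in net
    net = net.to(device)  # move the model once, not once per time step
    for step in range(data.size(1)):  # dim 1 indexes the time steps
        spk_out, mem_out = net(data[:, step].to(device))
        spk_rec.append(spk_out)
    return torch.stack(spk_rec)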
optimizer = torch.optim.Adam(net.parameters(), lr=2e-2, betas=(0.9, 0.999))
loss_fn = SF.mse_count_loss(correct_rate=0.8, incorrect_rate=0.2)
num_epochs = 5
num_iters = 50
loss_hist = []
acc_hist = []
t_spk_rec_sum = []
start = time.time()
net.train()
# training loop
for epoch in range(num_epochs):
    for i, (data, targets) in enumerate(iter(trainloader)):
        data = data.to(device)
        targets = targets.to(device)
        spk_rec = forward_pass(net, data)
        loss_val = loss_fn(spk_rec, targets)
        # Gradient calculation + weight update
        optimizer.zero_grad()
        loss_val.backward()
        optimizer.step()
        # Store loss history for future plotting
        loss_hist.append(loss_val.item())
        print("time :", time.time() - start, "sec")
        print(f"Epoch {epoch}, Iteration {i} \nTrain Loss: {loss_val.item():.2f}")
        acc = SF.accuracy_rate(spk_rec, targets)
        acc_hist.append(acc)
        print(f"Train Accuracy: {acc * 100:.2f}%\n")
And I got this error
Traceback (most recent call last):
  File "/home/hubo1024/PycharmProjects/snntorch/multi_gpu_train.py", line 87, in <module>
    spk_rec = forward_pass(net, data)
  File "/home/hubo1024/PycharmProjects/snntorch/multi_gpu_train.py", line 63, in forward_pass
    spk_out, mem_out = net(datas)
  File "/home/hubo1024/anaconda3/envs/spyketorchproject/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/hubo1024/anaconda3/envs/spyketorchproject/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 168, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/hubo1024/anaconda3/envs/spyketorchproject/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 178, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/hubo1024/anaconda3/envs/spyketorchproject/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
    output.reraise()
  File "/home/hubo1024/anaconda3/envs/spyketorchproject/lib/python3.10/site-packages/torch/_utils.py", line 461, in reraise
    raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/hubo1024/anaconda3/envs/spyketorchproject/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/hubo1024/anaconda3/envs/spyketorchproject/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/hubo1024/anaconda3/envs/spyketorchproject/lib/python3.10/site-packages/torch/nn/modules/container.py", line 139, in forward
    input = module(input)
  File "/home/hubo1024/anaconda3/envs/spyketorchproject/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/hubo1024/anaconda3/envs/spyketorchproject/lib/python3.10/site-packages/snntorch/_neurons/leaky.py", line 162, in forward
    self.mem = self.state_fn(input_)
  File "/home/hubo1024/anaconda3/envs/spyketorchproject/lib/python3.10/site-packages/snntorch/_neurons/leaky.py", line 201, in _build_state_function_hidden
    self._base_state_function_hidden(input_) - self.reset * self.threshold
  File "/home/hubo1024/anaconda3/envs/spyketorchproject/lib/python3.10/site-packages/snntorch/_neurons/leaky.py", line 195, in _base_state_function_hidden
    base_fn = self.beta.clamp(0, 1) * self.mem + input_
  File "/home/hubo1024/anaconda3/envs/spyketorchproject/lib/python3.10/site-packages/torch/_tensor.py", line 1121, in __torch_function__
    ret = func(*args, **kwargs)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
Process finished with exit code 1
Line 87 is
spk_rec = forward_pass(net, data)
from the training loop, and line 63 is
spk_out, mem_out = net(datas)
from the forward-pass function.
I checked and made sure that there's no part where a tensor is explicitly placed on the CPU, and the code works well when I run it on a single GPU.
I'm currently using torch.utils.data.DataLoader to build the training batch loader, and I'm thinking this might be the main source of the problem.
Should I use a different data loader for multi-GPU training?
And if so, where can I find a reference for this? I searched a bit, but the information I found was somewhat old.
This was a bug in the Leaky neuron that kept resetting its device when using DataParallel. It has been fixed in the current version of snnTorch on GitHub and addressed in this issue: https://github.com/jeshraghian/snntorch/issues/154
We're working on fixing up the other neurons now.
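Until a release containing the patch reaches PyPI, one option is to install snnTorch straight from the GitHub repository, e.g. with pip install --upgrade git+https://github.com/jeshraghian/snntorch (check the linked issue for the current status of the fix).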
Due to the size of my data, I have my training, validation and test data fed via a generator as below:
def sequence_generator(data_type):
    data_type = data_type.decode('ascii')
    sequence_folder = f"{model_attributes['sequences_path']}/{data_type}"
    for numpy_file in random.sample(os.listdir(sequence_folder), len(os.listdir(sequence_folder))):
        if numpy_file.endswith(".npz"):
            with np.load(f"{sequence_folder}/{numpy_file}") as numpy_data:
                sequences = numpy_data['x']
                labels = numpy_data['y']
                labels = tf.one_hot(labels, 3)
                for X, y in random.sample(list(zip(sequences, labels)), len(sequences)):
                    yield X, y
train_dataset = tf.data.Dataset.from_generator(generator=sequence_generator, args=['train'], output_types=(tf.float32, tf.float32))  # output_shapes=((train_shape), (3,))
train_dataset = train_dataset.cache().batch(BATCH).prefetch(tf.data.AUTOTUNE)
history = model.fit(train_dataset, epochs=EPOCHS, validation_data=validation_dataset, callbacks=[tensorboard,checkpoint,earlystop,lr_callback])
From my previous question's accepted answer, I want to be able to apply weighting to 2 of the 3 classes. The accepted answer's code is:
sample_weight = np.ones(shape=(len(train_y),))
sample_weight[(train_y[:, 1] == 1) | (train_y[:, 2] == 1)] = 1.5
model = get_model()
model.fit(
    train_x,
    train_y,
    sample_weight=sample_weight,
    ....,
    ...,
    ..,
)
But given that my training data sits behind the generator I am using, I am scratching my head over how best to apply the weighting with minimal overhead.
EDIT
I added the following code as recommended
class_weight = {0: 1.,
                1: 2.,
                2: 2.}
history = model.fit(train_dataset, epochs=EPOCHS, validation_data=validation_dataset, class_weight=class_weight, callbacks=[tensorboard, checkpoint, earlystop, lr_callback])
but I get this error
File "Train.py", line 1993, in build_and_train_model
history = model.fit(train_dataset, epochs=EPOCHS, validation_data=validation_dataset,class_weight=class_weight, callbacks=[tensorboard,checkpoint,earlystop,lr_callback])
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1147, in fit
steps_per_execution=self._steps_per_execution)
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 1364, in get_data_handler
return DataHandler(*args, **kwargs)
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 1175, in __init__
class_weight, distribute)
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 1186, in _configure_dataset_and_inferred_steps
dataset = dataset.map(_make_class_weight_map_fn(class_weight))
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 1925, in map
return MapDataset(self, map_func, preserve_cardinality=True)
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 4487, in __init__
use_legacy_function=use_legacy_function)
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 3712, in __init__
self._function = fn_factory()
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\eager\function.py", line 3135, in get_concrete_function
*args, **kwargs)
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\eager\function.py", line 3100, in _get_concrete_function_garbage_collected
graph_function, _ = self._maybe_define_function(args, kwargs)
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\eager\function.py", line 3444, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\eager\function.py", line 3289, in _create_graph_function
capture_by_value=self._capture_by_value),
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\framework\func_graph.py", line 999, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 3687, in wrapped_fn
ret = wrapper_helper(*args)
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 3617, in wrapper_helper
ret = autograph.tf_convert(self._func, ag_ctx)(*nested_args)
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 692, in wrapper
return converted_call(f, args, kwargs, options=options)
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 382, in converted_call
return _call_unconverted(f, args, kwargs, options)
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 463, in _call_unconverted
return f(*args, **kwargs)
File "Location\\AppData\Local\Programs\Python\Python36\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 1400, in _class_weights_map_fn
if y.shape.rank > 2:
TypeError: '>' not supported between instances of 'NoneType' and 'int
I'm not sure what is being referred to as NoneType; from the research I did, I get the inclination that it has something to do with the one-hotting, maybe?
This looks like a great moment to use class_weight instead of sample_weight:
Optional dictionary mapping class indices (integers) to a weight (float) value, used for weighting the loss function (during training only). This can be useful to tell the model to "pay more attention" to samples from an under-represented class.
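For what it's worth, the NoneType in the traceback is most likely y's rank: Dataset.from_generator was called without output_shapes, so y.shape.rank is None when Keras builds its class-weight map function. A sketch of pinning the shapes (the per-sequence shape here is a hypothetical placeholder; use your real one):

train_dataset = tf.data.Dataset.from_generator(
    generator=sequence_generator,
    args=['train'],
    output_types=(tf.float32, tf.float32),
    # hypothetical: variable-length sequences of some feature width,
    # plus a 3-way one-hot label
    output_shapes=(tf.TensorShape([None, None]), tf.TensorShape([3])))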
I ended up doing this by modifying the generator:
train_dataset = tf.data.Dataset.from_generator(generator=sequence_generator, args=['train'], output_types=(tf.float32, tf.float32, tf.float32))  # output_shapes=((train_shape), (3,))
def sequence_generator(data_type):
    data_type = data_type.decode('ascii')
    sequence_folder = f"{model_attributes['sequences_path']}/{data_type}"
    for numpy_file in random.sample(os.listdir(sequence_folder), len(os.listdir(sequence_folder))):
        if numpy_file.endswith(".npz"):
            with np.load(f"{sequence_folder}/{numpy_file}") as numpy_data:
                sequences = numpy_data['x']
                labels = numpy_data['y']
                class_weight = {0: 1, 1: 2, 2: 2}
                labels = tf.one_hot(labels, 3)
                sample_weights = np.ones(shape=(len(labels),))
                sample_weights[(labels[:, 1] == 1) | (labels[:, 2] == 1)] = 2
                for X, y, sample_weight in random.sample(list(zip(sequences, labels, sample_weights)), len(sequences)):
                    yield X, y, sample_weight
The class_weight approach did not seem to work. This article seemed to shed some light on it.
I use TensorFlow 1.12 with eager execution. When I call
model.fit(train, steps_per_epoch=int(np.ceil(num_train_samples / BATCH_SIZE)), epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, validation_data=val, validation_steps=int(np.ceil(num_val_samples / BATCH_SIZE)))
I get the following error:
ValueError: Incompatible type conversion requested to type 'uint8' for variable of type 'float32'
As far as I know, I am not converting from uint8 to float32 anywhere, at least not explicitly.
My datasets are generated as follows:
train = tf.data.Dataset.from_generator(generator=train_sample_fetcher, output_types=(tf.uint8, tf.float32))
train = train.repeat()
train = train.batch(BATCH_SIZE)
train = train.shuffle(10)
val = tf.data.Dataset.from_generator(generator=val_sample_fetcher, output_types=(tf.uint8, tf.float32))
employing the following generator functions:
def train_sample_fetcher():
    return sample_fetcher()

def val_sample_fetcher():
    return sample_fetcher(is_validations=True)

def sample_fetcher(is_validations=False):
    sample_names = [filename[:-4] for filename in os.listdir(DIR_DATASET + "ndarrays/")]
    if not is_validations: sample_names = sample_names[:int(len(sample_names) * TRAIN_VAL_SPLIT)]
    else: sample_names = sample_names[int(len(sample_names) * TRAIN_VAL_SPLIT):]
    for sample_name in sample_names:
        rgb = tf.image.decode_jpeg(tf.read_file(DIR_DATASET + sample_name + ".jpg"))
        rgb = tf.image.resize_images(rgb, (HEIGHT, WIDTH))
        #d = tf.image.decode_jpeg(tf.read_file(DIR_DATASET + "depth/" + sample_name + ".jpg"))
        #d = tf.image.resize_images(d, (HEIGHT, WIDTH))
        #rgbd = tf.concat([rgb,d], axis=2)
        onehots = tf.convert_to_tensor(np.load(DIR_DATASET + "ndarrays/" + sample_name + ".npy"), dtype=tf.float32)
        yield rgb, onehots
---------------------------------------------------------------------------------
For reference, the full stacktrace:
Traceback (most recent call last):
  File "tensorflow/python/ops/gen_nn_ops.py", line 976, in conv2d
    "data_format", data_format, "dilations", dilations)
tensorflow.python.eager.core._FallbackException: Expecting int64_t value for attr strides, got numpy.int32
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "dir/to/my_script.py", line 100, in <module>
    history = model.fit(train, steps_per_epoch=int(np.ceil(num_train_samples / BATCH_SIZE)), epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, validation_data=val, validation_steps=int(np.ceil(num_val_samples / BATCH_SIZE)))
  File "/tensorflow/python/keras/engine/training.py", line 1614, in fit
    validation_steps=validation_steps)
  File "/tensorflow/python/keras/engine/training_eager.py", line 705, in fit_loop
    batch_size=batch_size)
  File "/tensorflow/python/keras/engine/training_eager.py", line 251, in iterator_fit_loop
    model, x, y, sample_weights=sample_weights, training=True)
  File "/tensorflow/python/keras/engine/training_eager.py", line 511, in _process_single_batch
    training=training)
  File "/tensorflow/python/keras/engine/training_eager.py", line 90, in _model_loss
    outs, masks = model._call_and_compute_mask(inputs, **kwargs)
  File "/tensorflow/python/keras/engine/network.py", line 856, in _call_and_compute_mask
    mask=masks)
  File "/tensorflow/python/keras/engine/network.py", line 1029, in _run_internal_graph
    computed_tensor, **kwargs)
  File "/tensorflow/python/keras/engine/network.py", line 856, in _call_and_compute_mask
    mask=masks)
  File "/tensorflow/python/keras/engine/network.py", line 1031, in _run_internal_graph
    output_tensors = layer.call(computed_tensor, **kwargs)
  File "/tensorflow/python/keras/layers/convolutional.py", line 194, in call
    outputs = self._convolution_op(inputs, self.kernel)
  File "/tensorflow/python/ops/nn_ops.py", line 868, in __call__
    return self.conv_op(inp, filter)
  File "/tensorflow/python/ops/nn_ops.py", line 520, in __call__
    return self.call(inp, filter)
  File "/tensorflow/python/ops/nn_ops.py", line 204, in __call__
    name=self.name)
  File "/tensorflow/python/ops/gen_nn_ops.py", line 982, in conv2d
    name=name, ctx=_ctx)
  File "/tensorflow/python/ops/gen_nn_ops.py", line 1015, in conv2d_eager_fallback
    _attr_T, _inputs_T = _execute.args_to_matching_eager([input, filter], _ctx)
  File "/tensorflow/python/eager/execute.py", line 195, in args_to_matching_eager
    ret = [internal_convert_to_tensor(t, dtype, ctx=ctx) for t in l]
  File "/tensorflow/python/eager/execute.py", line 195, in <listcomp>
    ret = [internal_convert_to_tensor(t, dtype, ctx=ctx) for t in l]
  File "/tensorflow/python/framework/ops.py", line 1146, in internal_convert_to_tensor
    ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
  File "/tensorflow/python/ops/variables.py", line 828, in _TensorConversionFunction
    "of type '%s'" % (dtype.name, v.dtype.name))
ValueError: Incompatible type conversion requested to type 'uint8' for variable of type 'float32'
The first error thrown is about NumPy ndarrays, but I convert those to TensorFlow tensors right after importing them. I checked for any np.int32 types but was not able to find any. Any suggestions are greatly appreciated!
The output of tf.image.resize_images is a tensor of type float, so the rgb tensor returned from sample_fetcher() is a float tensor. However, when calling Dataset.from_generator(), you specify the output type of the first generated element as tf.uint8 (i.e. output_types=(tf.uint8, tf.float32)). A conversion is therefore requested that cannot be performed. Change it to tf.float32 (i.e. output_types=(tf.float32, tf.float32)) for both the training and validation generators and the problem will be fixed.
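In code, the suggested change is only the dtype declaration; everything else stays as in the question:

# tf.image.resize_images returns float32, so declare float32 for the images too
train = tf.data.Dataset.from_generator(generator=train_sample_fetcher, output_types=(tf.float32, tf.float32))
val = tf.data.Dataset.from_generator(generator=val_sample_fetcher, output_types=(tf.float32, tf.float32))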
Thanks for looking into this question!
I am trying to train an LSTM network which predicts the next 5-day stock prices based on the past 30-day stock prices. I have trained the model on 265 samples. The variables are defined as follows:
# Variables
x = tf.placeholder("float", [265, 30])
y = tf.placeholder("float", [265, 5])
weights = {
    'out': tf.Variable(tf.random_normal([n_hidden, y_size]))
}
biases = {
    'out': tf.Variable(tf.random_normal([y_size]))
}
and the model looks like this:
# Define RNN architecture
def RNN(x, weights, biases):
    x_size = 30
    x = tf.reshape(x, [-1, x_size])
    x = tf.split(x, x_size, 1)
    rnn_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(n_hidden), rnn.BasicLSTMCell(n_hidden)])
    outputs, states = rnn.static_rnn(rnn_cell, x, dtype=tf.float32)
    return tf.matmul(outputs[-1], weights['out']) + biases['out']
Then I attempt to use the trained model to predict, as follows:
y_pred = RNN(x_input, trained_weights, trained_biases)
in which x_input has dimension (1 x 30). It gave me a list of errors which I could not understand:
ValueError: Variable rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:
  File "C:\Program Files\Python35\lib\site-packages\tensorflow\python\framework\ops.py", line 1654, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
  File "C:\Program Files\Python35\lib\site-packages\tensorflow\python\framework\ops.py", line 3290, in create_op
    op_def=op_def)
  File "C:\Program Files\Python35\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
Traceback (most recent call last):
  File "C:\Users\teh.khoonkheng\Desktop\Others\Personal working folder\14. Projects\1. Oracle\Python\RNN_stock_01.py", line 135, in <module>
    y_test = RNN(x_test, trained_weights, trained_biases)
  File "C:\Users\teh.khoonkheng\Desktop\Others\Personal working folder\14. Projects\1. Oracle\Python\RNN_stock_01.py", line 81, in RNN
    outputs, states = rnn.static_rnn(rnn_cell, x, dtype = tf.float32)
  File "C:\Program Files\Python35\lib\site-packages\tensorflow\python\ops\rnn.py", line 1330, in static_rnn
    (output, state) = call_cell()
  File "C:\Program Files\Python35\lib\site-packages\tensorflow\python\ops\rnn.py", line 1317, in <lambda>
    call_cell = lambda: cell(input_, state)
  File "C:\Program Files\Python35\lib\site-packages\tensorflow\python\ops\rnn_cell_impl.py", line 191, in __call__
    return super(RNNCell, self).__call__(inputs, state)
  File "C:\Program Files\Python35\lib\site-packages\tensorflow\python\layers\base.py", line 714, in __call__
    outputs = self.call(inputs, *args, **kwargs)
  File "C:\Program Files\Python35\lib\site-packages\tensorflow\python\ops\rnn_cell_impl.py", line 1242, in call
    cur_inp, new_state = cell(cur_inp, cur_state)
I wonder if I have misunderstood how static_rnn works. Have I set up the model incorrectly? And how should I use the trained RNN to make predictions?
Thanks for your help!
As the error says, you need to set reuse (e.g. reuse=tf.AUTO_REUSE) so that the learned variables can be reused later for prediction. Do this:
rnn_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(n_hidden, reuse=tf.AUTO_REUSE), rnn.BasicLSTMCell(n_hidden, reuse=tf.AUTO_REUSE)])
Also, this model looks wrong because, while you're using the trained weights and biases, you're not using the trained LSTM cells. For new inputs, you're defining new LSTM cells.
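One hedged way to make that explicit is to build the whole network inside a single variable scope with reuse=tf.AUTO_REUSE, so the training and prediction calls share the same cell variables (the scope name "rnn_model" is an arbitrary choice):

def RNN(x, weights, biases):
    # AUTO_REUSE: create variables on the first call, reuse them afterwards
    with tf.variable_scope("rnn_model", reuse=tf.AUTO_REUSE):
        x_size = 30
        x = tf.reshape(x, [-1, x_size])
        x = tf.split(x, x_size, 1)
        rnn_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(n_hidden),
                                     rnn.BasicLSTMCell(n_hidden)])
        outputs, states = rnn.static_rnn(rnn_cell, x, dtype=tf.float32)
        return tf.matmul(outputs[-1], weights['out']) + biases['out']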
I am trying to set up a simple convolutional network in TensorFlow. I'll try to keep the amount of code to a minimum. Here's the class:
class ConvNet(object):

    def __init__(self, input, labels, dataset):
        self.input = input
        self.true_labels = labels
        #'dataset' is an instance of a class that
        #I am using to read the training images
        self.data = dataset
        self.build()
        self._optimize = None
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def build(self):
        self.conv_layers = []
        # create 3 conv layers using tf.nn.conv2d
        # and append them to self.conv_layers
        # create flattening layer using tf.nn.reshape
        self.fc_layers = []
        # create 2 fully connected layers using tf.matmul
        # and append them to self.fc_layers

    @property
    def optimize(self):
        """Return the optimize operation."""
        if not self._optimize:
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.fc_layers[-1],
                                                                    labels=self.true_labels)
            cost = tf.reduce_mean(cross_entropy)
            self._optimize = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cost)
        return self._optimize

    def train(self, num_epochs=1, batch_size=16):
        epochs_done = 0
        while not epochs_done == num_epochs:
            # get next batch from training data
            x_batch, y_true_batch, _, _ = self.data.train.next_batch(batch_size)
            feed_dict_train = {self.input: x_batch, self.true_labels: y_true_batch}
            #even if I initialize variables here, the same error occurs
            #self.sess.run(tf.global_variables_initializer())
            self.sess.run(self.optimize, feed_dict=feed_dict_train)
            if self.data.train.epochs_done > epochs_done:
                #print stuff...
                epochs_done += 1
        self.sess.close()
I was trying to run the optimize operation to train the network on a simple dataset (e.g. images of cats vs dogs). Here is the main:
if __name__ == "__main__":
classes = ['dogs', 'cats']
num_classes = len(classes)
batch_size = 16
img_size = 128
num_channels = 3
#read training data and resize images to 128 x 128 pixels
data = dataset.read_train_sets(img_size, classes)
x = tf.placeholder(tf.float32, shape = [None, img_size, img_size, num_channels], name='x')
y_true = tf.placeholder(tf.float32, shape = [None, num_classes], name = 'y_true')
net = ConvNet(x, y_true, data)
net.train()
However, I keep getting the following error (it's actually much longer than this; I can post it if necessary):
Caused by op 'beta1_power/read', defined at:
  File "<string>", line 1, in <module>
  File "/usr/lib/python3.5/idlelib/run.py", line 124, in main
    ret = method(*args, **kwargs)
  File "/usr/lib/python3.5/idlelib/run.py", line 351, in runcode
    exec(code, self.locals)
  File "/home/gian/face_recognition/model.py", line 366, in <module>
    net.train()
  File "/home/gian/face_recognition/model.py", line 309, in train
    self.sess.run(self.optimize, feed_dict=feed_dict_train)
  File "/home/gian/face_recognition/model.py", line 269, in optimize
    self._optimize = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cost)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 409, in minimize
    name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 552, in apply_gradients
    self._create_slots([_get_variable_for(v) for v in var_list])
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/adam.py", line 124, in _create_slots
    colocate_with=first_var)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 665, in _create_non_slot_variable
    v = variable_scope.variable(initial_value, name=name, trainable=False)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 2157, in variable
    use_resource=use_resource)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 2147, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variable_scope.py", line 2130, in default_variable_creator
    constraint=constraint)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variables.py", line 235, in __init__
    constraint=constraint)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/variables.py", line 391, in _init_from_args
    self._snapshot = array_ops.identity(self._variable, name="read")
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/array_ops.py", line 142, in identity
    return gen_array_ops.identity(input, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 3053, in identity
    "Identity", input=input, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 3290, in create_op
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1654, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value beta1_power
  [[Node: beta1_power/read = Identity[T=DT_FLOAT, _class=["loc:@Variable"], _device="/job:localhost/replica:0/task:0/device:GPU:0"](beta1_power)]]
Can somebody help me figure out where the problem is? Thanks a lot.
This error generally occurs when you haven't initialized the optimizer's variables. Just access self.optimize before you initialize all the global variables, so that the Adam slot variables already exist when the initializer runs.
Your code should look like this:
def __init__(self, input, labels, dataset):
    self.input = input
    self.true_labels = labels
    #'dataset' is an instance of a class that
    #I am using to read the training images
    self.data = dataset
    self.build()
    self._optimize = None
    self.sess = tf.Session()
    self.optimize  # property access builds the optimizer op and its variables
    self.sess.run(tf.global_variables_initializer())
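An alternative sketch (hedged, same class as above) keeps the lazy property but runs the initializer as soon as the Adam op, and with it beta1_power, is created:

@property
def optimize(self):
    """Return the optimize operation, building it on first access."""
    if not self._optimize:
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.fc_layers[-1], labels=self.true_labels)
        cost = tf.reduce_mean(cross_entropy)
        self._optimize = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cost)
        # Adam just created beta1_power/beta2_power and per-variable slots,
        # so (re-)run the initializer now that they exist.
        self.sess.run(tf.global_variables_initializer())
    return self._optimize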