I was trying to adapt an NN architecture I saw in a blog post here: https://sorenbouma.github.io/blog/oneshot/
The only thing I was trying to change about it was the input, instead of (105,105,1) greyscale I was hoping to use (100,100,3) RGB. I thus used Keras to define the architecture as in the blog post but with different input:
def W_init(shape,name=None):
"""Initialize weights as in paper"""
values = rng.normal(loc=0,scale=1e-2,size=shape)
return K.variable(values,name=name)
#//TODO: figure out how to initialize layer biases in keras.
def b_init(shape,name=None):
"""Initialize bias as in paper"""
values=rng.normal(loc=0.5,scale=1e-2,size=shape)
return K.variable(values,name=name)
input_shape = (100, 100, 3)
left_input = Input(input_shape)
right_input = Input(input_shape)
#build convnet to use in each siamese 'leg'
convnet = Sequential()
convnet.add(Conv2D(64,(10,10),activation='relu',input_shape=input_shape,
kernel_initializer=W_init,kernel_regularizer=l2(2e-4)))
convnet.add(MaxPooling2D())
convnet.add(Conv2D(128,(7,7),activation='relu',
kernel_regularizer=l2(2e-4),kernel_initializer=W_init,bias_initializer=b_init))
convnet.add(MaxPooling2D())
convnet.add(Conv2D(128,(4,4),activation='relu',kernel_initializer=W_init,kernel_regularizer=l2(2e-4),bias_initializer=b_init))
convnet.add(MaxPooling2D())
convnet.add(Conv2D(256,(4,4),activation='relu',kernel_initializer=W_init,kernel_regularizer=l2(2e-4),bias_initializer=b_init))
convnet.add(Flatten())
convnet.add(Dense(4096,activation="sigmoid",kernel_regularizer=l2(1e-3),kernel_initializer=W_init,bias_initializer=b_init))
#encode each of the two inputs into a vector with the convnet
encoded_l = convnet(left_input)
encoded_r = convnet(right_input)
#merge two encoded inputs with the l1 distance between them
L1_distance = lambda x: K.abs(x[0]-x[1])
both = merge([encoded_l,encoded_r], mode = L1_distance, output_shape=lambda x: x[0])
prediction = Dense(1,activation='sigmoid',bias_initializer=b_init)(both)
siamese_net = Model(input=[left_input,right_input],output=prediction)
#optimizer = SGD(0.0004,momentum=0.6,nesterov=True,decay=0.0003)
optimizer = Adam(0.00006)
#//TODO: get layerwise learning rates and momentum annealing scheme described in paperworking
siamese_net.compile(loss="binary_crossentropy",optimizer=optimizer)
siamese_net.count_params()
I then train the network on my data as in the paper:
#Training loop
evaluate_every = 500
loss_every=50
batch_size = 20
N_way = 20
n_val = 250
#siamese_net.load_weights("/home/soren/keras-oneshot/weights")
max_epochs = 100
for i in range(0,max_epochs):
(inputs,targets)=loader.get_batch(batch_size)
loss=siamese_net.train_on_batch(inputs,targets)
if i % evaluate_every == 0:
val_acc = loader.test_oneshot(siamese_net,N_way,n_val,verbose=True)
if val_acc >= best:
print("saving")
siamese_net.save('/home/soren/keras-oneshot/weights')
best=val_acc
if i % loss_every == 0:
print("iteration {}, training loss: {:.2f},".format(i,loss))
But I get
FailedPreconditionError: Attempting to use uninitialized value conv2d_1/Variable
[[Node: conv2d_1/Variable/read = Identity[T=DT_FLOAT, _class=["loc:#conv2d_1/Variable"], _device="/job:localhost/replica:0/task:0/cpu:0"](conv2d_1/Variable)]]
Here is full error output:
---------------------------------------------------------------------------
FailedPreconditionError Traceback (most recent call last)
/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1138 try:
-> 1139 return fn(*args)
1140 except errors.OpError as e:
/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
1120 feed_dict, fetch_list, target_list,
-> 1121 status, run_metadata)
1122
/usr/lib/python3.4/contextlib.py in __exit__(self, type, value, traceback)
65 try:
---> 66 next(self.gen)
67 except StopIteration:
/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/errors_impl.py in raise_exception_on_not_ok_status()
465 compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 466 pywrap_tensorflow.TF_GetCode(status))
467 finally:
FailedPreconditionError: Attempting to use uninitialized value conv2d_1/Variable
[[Node: conv2d_1/Variable/read = Identity[T=DT_FLOAT, _class=["loc:#conv2d_1/Variable"], _device="/job:localhost/replica:0/task:0/cpu:0"](conv2d_1/Variable)]]
During handling of the above exception, another exception occurred:
FailedPreconditionError Traceback (most recent call last)
<ipython-input-15-06f79f757a6e> in <module>()
9 for i in range(0,max_epochs):
10 (inputs,targets)=loader.get_batch(batch_size)
---> 11 loss=siamese_net.train_on_batch(inputs,targets)
12 if i % evaluate_every == 0:
13 val_acc = loader.test_oneshot(siamese_net,N_way,n_val,verbose=True)
/usr/local/lib/python3.4/dist-packages/keras/engine/training.py in train_on_batch(self, x, y, sample_weight, class_weight)
1563 ins = x + y + sample_weights
1564 self._make_train_function()
-> 1565 outputs = self.train_function(ins)
1566 if len(outputs) == 1:
1567 return outputs[0]
/usr/local/lib/python3.4/dist-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs)
2263 value = (indices, sparse_coo.data, sparse_coo.shape)
2264 feed_dict[tensor] = value
-> 2265 session = get_session()
2266 updated = session.run(self.outputs + [self.updates_op],
2267 feed_dict=feed_dict,
/usr/local/lib/python3.4/dist-packages/keras/backend/tensorflow_backend.py in get_session()
166 if not _MANUAL_VAR_INIT:
167 with session.graph.as_default():
--> 168 _initialize_variables()
169 return session
170
/usr/local/lib/python3.4/dist-packages/keras/backend/tensorflow_backend.py in _initialize_variables()
339 if uninitialized_variables:
340 sess = get_session()
--> 341 sess.run(tf.variables_initializer(uninitialized_variables))
342
343
/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
787 try:
788 result = self._run(None, fetches, feed_dict, options_ptr,
--> 789 run_metadata_ptr)
790 if run_metadata:
791 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
995 if final_fetches or final_targets:
996 results = self._do_run(handle, final_targets, final_fetches,
--> 997 feed_dict_string, options, run_metadata)
998 else:
999 results = []
/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1130 if handle is None:
1131 return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1132 target_list, options, run_metadata)
1133 else:
1134 return self._do_call(_prun_fn, self._session, handle, feed_dict,
/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1150 except KeyError:
1151 pass
-> 1152 raise type(e)(node_def, op, message)
1153
1154 def _extend_graph(self):
FailedPreconditionError: Attempting to use uninitialized value conv2d_1/Variable
[[Node: conv2d_1/Variable/read = Identity[T=DT_FLOAT, _class=["loc:#conv2d_1/Variable"], _device="/job:localhost/replica:0/task:0/cpu:0"](conv2d_1/Variable)]]
Caused by op 'conv2d_1/Variable/read', defined at:
File "/usr/lib/python3.4/runpy.py", line 170, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.4/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.4/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python3.4/dist-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/usr/local/lib/python3.4/dist-packages/ipykernel/kernelapp.py", line 477, in start
ioloop.IOLoop.instance().start()
File "/usr/local/lib/python3.4/dist-packages/zmq/eventloop/ioloop.py", line 177, in start
super(ZMQIOLoop, self).start()
File "/usr/local/lib/python3.4/dist-packages/tornado/ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "/usr/local/lib/python3.4/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
self._handle_recv()
File "/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/usr/local/lib/python3.4/dist-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
handler(stream, idents, msg)
File "/usr/local/lib/python3.4/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/usr/local/lib/python3.4/dist-packages/ipykernel/ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python3.4/dist-packages/ipykernel/zmqshell.py", line 533, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py", line 2802, in run_ast_nodes
if self.run_code(code, result):
File "/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py", line 2862, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-2-51595f796dab>", line 17, in <module>
kernel_initializer=W_init,kernel_regularizer=l2(2e-4)))
File "/usr/local/lib/python3.4/dist-packages/keras/models.py", line 436, in add
layer(x)
File "/usr/local/lib/python3.4/dist-packages/keras/engine/topology.py", line 569, in __call__
self.build(input_shapes[0])
File "/usr/local/lib/python3.4/dist-packages/keras/layers/convolutional.py", line 134, in build
constraint=self.kernel_constraint)
File "/usr/local/lib/python3.4/dist-packages/keras/legacy/interfaces.py", line 87, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/keras/engine/topology.py", line 391, in add_weight
weight = K.variable(initializer(shape), dtype=dtype, name=name)
File "<ipython-input-2-51595f796dab>", line 4, in W_init
return K.variable(values,name=name)
File "/usr/local/lib/python3.4/dist-packages/keras/backend/tensorflow_backend.py", line 321, in variable
v = tf.Variable(value, dtype=_convert_string_dtype(dtype), name=name)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/ops/variables.py", line 200, in __init__
expected_shape=expected_shape)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/ops/variables.py", line 319, in _init_from_args
self._snapshot = array_ops.identity(self._variable, name="read")
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 1303, in identity
result = _op_def_lib.apply_op("Identity", input=input, name=name)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/ops.py", line 2506, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/ops.py", line 1269, in __init__
self._traceback = _extract_stack()
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value conv2d_1/Variable
[[Node: conv2d_1/Variable/read = Identity[T=DT_FLOAT, _class=["loc:#conv2d_1/Variable"], _device="/job:localhost/replica:0/task:0/cpu:0"](conv2d_1/Variable)]]
Googlign about the error did not really make things clear. I saw some posts about the error coming up when using Tensorflow, but nothing regarding this erro and Keras, so I am a little bit confused about what is going on.
I experienced the same error a few days ago.
The cause of the error is weight initialization.
you try to change the follow code.
(before)
convnet.add(Conv2D(64,(10,10),activation='relu',input_shape=input_shape,
kernel_initializer=W_init,kernel_regularizer=l2(2e-4)))
(after)
convnet.add(Conv2D(64,(10,10),activation='relu',input_shape=input_shape,
kernel_initializer=keras.initializers.RandomNormal(mean=0.0,
stddev=1e-2, seed=None),kernel_regularizer=l2(2e-4)))
Please change from all W_init and b_init to keras.initializers.RandomNormal(...).
Related
I have a pandas dataframe in which one column contains text body of an Email, I am trying to encode it using this tutorial. I have managed to encode the sentences, by
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
embed = hub.Module(module_url)
tf.logging.set_verbosity(tf.logging.ERROR)
messages = df['EmailBody'].tolist()[:50] #Why 50 explained below
with tf.Session() as session:
session.run([tf.global_variables_initializer(), tf.tables_initializer()])
message_embeddings = session.run(embed(messages))
Now if I increase the size from here, it starts leaking out memory, I also tried running it in batches by
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
embed = hub.Module(module_url)
tf.logging.set_verbosity(tf.logging.ERROR)
messages = df_RF_final['Preprocessed_EmailBody'].tolist()
message_embeddings = []
with tf.Session() as session:
session.run([tf.global_variables_initializer(), tf.tables_initializer()])
for i in range(int(len(messages)/100)):
message_embeddings.append(session.run(embed(messages[i*100:(1+1)*200])))
Which gave the error available at the bottom, I am looking for an implementation where rather than having to pass the list I can pass a generator function, if it is not possible to use a generator function, then gelp me fix the second approach.
Error stack
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1333 try:
-> 1334 return fn(*args)
1335 except errors.OpError as e:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1318 return self._call_tf_sessionrun(
-> 1319 options, feed_dict, fetch_list, target_list, run_metadata)
1320
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1406 self._session, options, feed_dict, fetch_list, target_list,
-> 1407 run_metadata)
1408
InvalidArgumentError: Requires start <= limit when delta > 0: 0/-2147483648
[[{{node module_3_apply_default_4/Encoder_en/Transformer/SequenceMask/Range}}]]
During handling of the above exception, another exception occurred:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-13-d0c75bde4d87> in <module>()
7 session.run([tf.global_variables_initializer(), tf.tables_initializer()])
8 for i in range(int(len(messages)/100)):
----> 9 message_embeddings.append(session.run(embed(messages[i*100:(1+1)*200])))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
927 try:
928 result = self._run(None, fetches, feed_dict, options_ptr,
--> 929 run_metadata_ptr)
930 if run_metadata:
931 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1150 if final_fetches or final_targets or (handle and feed_dict_tensor):
1151 results = self._do_run(handle, final_targets, final_fetches,
-> 1152 feed_dict_tensor, options, run_metadata)
1153 else:
1154 results = []
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1326 if handle is None:
1327 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1328 run_metadata)
1329 else:
1330 return self._do_call(_prun_fn, handle, feeds, fetches)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1346 pass
1347 message = error_interpolation.interpolate(message, self._graph)
-> 1348 raise type(e)(node_def, op, message)
1349
1350 def _extend_graph(self):
InvalidArgumentError: Requires start <= limit when delta > 0: 0/-2147483648
[[node module_3_apply_default_4/Encoder_en/Transformer/SequenceMask/Range (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_hub/native_module.py:514) ]]
Caused by op 'module_3_apply_default_4/Encoder_en/Transformer/SequenceMask/Range', defined at:
File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py", line 477, in start
ioloop.IOLoop.instance().start()
File "/usr/local/lib/python3.6/dist-packages/tornado/ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
self._handle_recv()
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
self._run_callback(callback, msg)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
callback(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
handler(stream, idents, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/zmqshell.py", line 533, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2718, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2822, in run_ast_nodes
if self.run_code(code, result):
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-13-d0c75bde4d87>", line 9, in <module>
message_embeddings.append(session.run(embed(messages[i*100:(1+1)*200])))
File "/usr/local/lib/python3.6/dist-packages/tensorflow_hub/module.py", line 247, in __call__
name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_hub/native_module.py", line 514, in create_apply_graph
import_scope=relative_scope_name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1435, in import_meta_graph
meta_graph_or_file, clear_devices, import_scope, **kwargs)[0]
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py", line 1457, in _import_meta_graph_with_return_elements
**kwargs))
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/meta_graph.py", line 806, in import_scoped_meta_graph_with_return_elements
return_elements=return_elements)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/importer.py", line 442, in import_graph_def
_ProcessNewOps(graph)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/importer.py", line 235, in _ProcessNewOps
for new_op in graph._add_new_tf_operations(compute_devices=False): # pylint: disable=protected-access
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3433, in _add_new_tf_operations
for c_op in c_api_util.new_tf_operations(self)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3433, in <listcomp>
for c_op in c_api_util.new_tf_operations(self)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3325, in _create_op_from_tf_operation
ret = Operation(c_op, self)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 1801, in __init__
self._traceback = tf_stack.extract_stack()
InvalidArgumentError (see above for traceback): Requires start <= limit when delta > 0: 0/-2147483648
[[node module_3_apply_default_4/Encoder_en/Transformer/SequenceMask/Range (defined at /usr/local/lib/python3.6/dist-packages/tensorflow_hub/native_module.py:514) ]]
I had a similar issue and is very similar to "Strongly increasing memory consumption when using ELMo from Tensorflow-Hub". I got a great answer from arnoegw here.
In short, since you are using tf.session you are using the TF.v1 programing model. This means you first build the dataflow and then run it repeatedly i.e. feeding it inputs and fetching outputs from the graph. But you keep adding the new application of hub.Module to the graph. Instead of doing it once and using the graph.
The correct way to implement this according to the answer is:
tf.logging.set_verbosity(tf.logging.ERROR)
messages = df_RF_final['Preprocessed_EmailBody'].tolist()
message_embeddings = []
with hub.eval_function_for_module("https://tfhub.dev/google/universal-sentence-encoder-large/3") as embed:
for i in range(int(len(messages)/100)):
message_embeddings.append(embed(messages[i*100:(1+1)*200])
when I try to use the tf.matmul function on the GPU I get the following error:
InternalError: Blas xGEMMBatched launch failed
If the N value in the function calc() is set to a value less than 15 it works.
I am running tensorflow 1.8.0 and Cuda V9.1.85. There is only one Python process working on the GPU and there are no other open sessions. Also I have plenty GPU memory to spare (see attached image).
Changing the CUDA_VISIBLE_DEVICES values does not show any effect. Changing the ConfigProto() settings does not help either. Also the use of tf.matmul does not solve the problem.
This is the code I am running:
import tensorflow as tf
import numpy as np
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
tf.Session(config=config).close()
def calc():
N = 15 # works for N <= 14
a = 16
b = 8
X = np.random.rand(N, 11520, b, 1).astype(np.float32)
print(X.nbytes*1e-6, "MB")
W = np.random.rand(N, 11520, a, b).astype(np.float32)
print(W.nbytes*1e-6, "MB")
X_ = tf.constant(X, name="X-constant", dtype=tf.float32)
W_ = tf.constant(W, name="W-constant", dtype=tf.float32)
# return tf.matmul(W_, X_, name="mymatmul")
return W_ # X_
tf.reset_default_graph()
a = calc()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
b = sess.run(a)
sess.close()
print(b.shape)
This is the output I get:
5.529599999999999 MB
88.47359999999999 MB
---------------------------------------------------------------------------
InternalError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1321 try:
-> 1322 return fn(*args)
1323 except errors.OpError as e:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1306 return self._call_tf_sessionrun(
-> 1307 options, feed_dict, fetch_list, target_list, run_metadata)
1308
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1408 self._session, options, feed_dict, fetch_list, target_list,
-> 1409 run_metadata)
1410 else:
InternalError: Blas xGEMMBatched launch failed : a.shape=[172800,16,8], b.shape=[172800,8,1], m=16, n=1, k=8, batch_size=172800
[[Node: matmul = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](W-constant, X-constant)]]
During handling of the above exception, another exception occurred:
InternalError Traceback (most recent call last)
<ipython-input-5-013153235a1a> in <module>()
3 sess = tf.Session()
4 sess.run(tf.global_variables_initializer())
----> 5 b = sess.run(a)
6 sess.close()
7 print(b.shape)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
898 try:
899 result = self._run(None, fetches, feed_dict, options_ptr,
--> 900 run_metadata_ptr)
901 if run_metadata:
902 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1133 if final_fetches or final_targets or (handle and feed_dict_tensor):
1134 results = self._do_run(handle, final_targets, final_fetches,
-> 1135 feed_dict_tensor, options, run_metadata)
1136 else:
1137 results = []
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1314 if handle is None:
1315 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1316 run_metadata)
1317 else:
1318 return self._do_call(_prun_fn, handle, feeds, fetches)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1333 except KeyError:
1334 pass
-> 1335 raise type(e)(node_def, op, message)
1336
1337 def _extend_graph(self):
InternalError: Blas xGEMMBatched launch failed : a.shape=[172800,16,8], b.shape=[172800,8,1], m=16, n=1, k=8, batch_size=172800
[[Node: matmul = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](W-constant, X-constant)]]
Caused by op 'matmul', defined at:
File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py", line 486, in start
self.io_loop.start()
File "/usr/local/lib/python3.6/dist-packages/tornado/platform/asyncio.py", line 127, in start
self.asyncio_loop.run_forever()
File "/usr/lib/python3.6/asyncio/base_events.py", line 422, in run_forever
self._run_once()
File "/usr/lib/python3.6/asyncio/base_events.py", line 1432, in _run_once
handle._run()
File "/usr/lib/python3.6/asyncio/events.py", line 145, in _run
self._callback(*self._args)
File "/usr/local/lib/python3.6/dist-packages/tornado/platform/asyncio.py", line 117, in _handle_events
handler_func(fileobj, events)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 276, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
self._handle_recv()
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
self._run_callback(callback, msg)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
callback(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 276, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
handler(stream, idents, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/ipkernel.py", line 208, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/zmqshell.py", line 537, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
raw_cell, store_history, silent, shell_futures)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
if self.run_code(code, result):
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2963, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-5-013153235a1a>", line 2, in <module>
a = calc()
File "<ipython-input-4-bf0e6012e9e2>", line 13, in calc
return W_ # X_
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py", line 847, in binary_op_wrapper
return func(x, y, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py", line 1976, in matmul
a, b, adj_x=adjoint_a, adj_y=adjoint_b, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 1236, in batch_mat_mul
"BatchMatMul", x=x, y=y, adj_x=adj_x, adj_y=adj_y, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3414, in create_op
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 1740, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InternalError (see above for traceback): Blas xGEMMBatched launch failed : a.shape=[172800,16,8], b.shape=[172800,8,1], m=16, n=1, k=8, batch_size=172800
[[Node: matmul = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](W-constant, X-constant)]]
Here is a workaround replacing tf.matmul with tf.einsum. However, your code works on my machine with NVIDIA 840M (2004 MiB RAM), cudnn 7.0.5.15 and cuda 9.0.176 (maybe downgrading helps?).
import tensorflow as tf
import numpy as np
sess = tf.Session()
N = 20
M = 11520
a = 16
b = 8
W = np.random.rand(N, M, a, b).astype(np.float32)
X = np.random.rand(N, M, b, 1).astype(np.float32)
# tf.einsum does not support numpy arrays, so wrap W and X in tf.constants
W2 = tf.constant(W)
X2 = tf.constant(X)
# tf.einsum does not support "..." as seen later in np.einsum
WX = tf.einsum("uvik,uvkj->uvij", W2, X2)
# same as:
#WX = tf.matmul(W2, X2)
# calculate W#X using tf.einsum
result1 = sess.run(WX)
# calculate W#X using np.einsum
result2 = np.einsum("...ik,...kj->...ij", W, X)
# calculate W#X by hand (just for illustrative purpose, too slow for practical use)
result3 = np.zeros((N, M, a, 1), dtype=np.float32)
for i in range(a):
for j in range(1):
for k in range(b):
result3[..., i, j] += W[..., i, k] * X[..., k, j]
# ensure that everything is correct
assert(np.allclose(result1, result2))
assert(np.allclose(result1, result3))
print("everything ok")
sess.close()
After a restart and without touching anything else, I open up the Jupyter Notebook and attempt to run the cells that get the GPU to start training.
But in my Terminal I get this message and in the Notebook I get the long error below. (I'm using Ubuntu 16.04, Keras with Tensorflow backend).
87] Found device 1 with properties:
name: GeForce GTX 1080 Ti
major: 6 minor: 1 memoryClockRate (GHz) 1.582
pciBusID 0000:25:00.0
Total memory: 10.91GiB
Free memory: 396.44MiB
Code of the notebook here https://github.com/fastai/courses/blob/master/deeplearning1/nbs/lesson1.ipynb
In cell [5] I've lowered the batch size to 10 and then tried 5. I also set no_of_epochs=5. In addition to restarts I've also tried looking for any command that gets the system to drop any processes the GPU might be using, but it doesn't appear to be using any.
cell [7] is the cell that gives all the errors below when it is run.
This is the full error under the cell [7] that is trying to utilize the GPU.
---------------------------------------------------------------------------
ResourceExhaustedError Traceback (most recent call last)
<ipython-input-7-2b6861506a11> in <module>()
----> 1 vgg = Vgg16()
2 # Grab a few images at a time for training and validation.
3 # NB: They must be in subdirectories named based on their category
4 batches = vgg.get_batches(path+'train', batch_size=batch_size)
5 val_batches = vgg.get_batches(path+'valid', batch_size=batch_size*2)
/home/eagle/fastai/courses-master/deeplearning1/nbs/vgg16.pyc in __init__(self)
45 def __init__(self):
46 self.FILE_PATH = 'http://files.fast.ai/models/'
---> 47 self.create()
48 self.get_classes()
49
/home/eagle/fastai/courses-master/deeplearning1/nbs/vgg16.pyc in create(self)
137
138 fname = 'vgg16.h5'
--> 139 model.load_weights(get_file(fname, self.FILE_PATH+fname, cache_subdir='models'))
140
141
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/engine/topology.pyc in load_weights(self, filepath, by_name)
2706 self.load_weights_from_hdf5_group_by_name(f)
2707 else:
-> 2708 self.load_weights_from_hdf5_group(f)
2709
2710 if hasattr(f, 'close'):
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/engine/topology.pyc in load_weights_from_hdf5_group(self, f)
2792 weight_values[0] = w
2793 weight_value_tuples += zip(symbolic_weights, weight_values)
-> 2794 K.batch_set_value(weight_value_tuples)
2795
2796 def load_weights_from_hdf5_group_by_name(self, f):
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/backend/tensorflow_backend.pyc in batch_set_value(tuples)
1879 assign_ops.append(assign_op)
1880 feed_dict[assign_placeholder] = value
-> 1881 get_session().run(assign_ops, feed_dict=feed_dict)
1882
1883
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/backend/tensorflow_backend.pyc in get_session()
123 session = _SESSION
124 if not _MANUAL_VAR_INIT:
--> 125 _initialize_variables()
126 return session
127
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/backend/tensorflow_backend.pyc in _initialize_variables()
280 sess = get_session()
281 if hasattr(tf, 'variables_initializer'):
--> 282 sess.run(tf.variables_initializer(uninitialized_variables))
283 else:
284 sess.run(tf.initialize_variables(uninitialized_variables))
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in run(self, fetches, feed_dict, options, run_metadata)
776 try:
777 result = self._run(None, fetches, feed_dict, options_ptr,
--> 778 run_metadata_ptr)
779 if run_metadata:
780 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in _run(self, handle, fetches, feed_dict, options, run_metadata)
980 if final_fetches or final_targets:
981 results = self._do_run(handle, final_targets, final_fetches,
--> 982 feed_dict_string, options, run_metadata)
983 else:
984 results = []
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1030 if handle is None:
1031 return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1032 target_list, options, run_metadata)
1033 else:
1034 return self._do_call(_prun_fn, self._session, handle, feed_dict,
/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in _do_call(self, fn, *args)
1050 except KeyError:
1051 pass
-> 1052 raise type(e)(node_def, op, message)
1053
1054 def _extend_graph(self):
ResourceExhaustedError: OOM when allocating tensor with shape[25088,4096]
[[Node: random_uniform_13/RandomUniform = RandomUniform[T=DT_INT32, dtype=DT_FLOAT, seed=87654321, seed2=755436606, _device="/job:localhost/replica:0/task:0/gpu:0"](random_uniform_13/shape)]]
Caused by op u'random_uniform_13/RandomUniform', defined at:
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
app.launch_new_instance()
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 478, in start
self.io_loop.start()
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 177, in start
super(ZMQIOLoop, self).start()
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tornado/ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
self._handle_recv()
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 281, in dispatcher
return self.dispatch_shell(stream, msg)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 232, in dispatch_shell
handler(stream, idents, msg)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 397, in execute_request
user_expressions, allow_stdin)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2718, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2822, in run_ast_nodes
if self.run_code(code, result):
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2882, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-7-2b6861506a11>", line 1, in <module>
vgg = Vgg16()
File "vgg16.py", line 47, in __init__
self.create()
File "vgg16.py", line 134, in create
self.FCBlock()
File "vgg16.py", line 113, in FCBlock
model.add(Dense(4096, activation='relu'))
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/models.py", line 332, in add
output_tensor = layer(self.outputs[0])
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/engine/topology.py", line 546, in __call__
self.build(input_shapes[0])
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/layers/core.py", line 798, in build
constraint=self.W_constraint)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/engine/topology.py", line 418, in add_weight
weight = initializer(shape, name=name)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/initializations.py", line 66, in glorot_uniform
return uniform(shape, s, name=name)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/initializations.py", line 33, in uniform
return K.random_uniform_variable(shape, -scale, scale, name=name)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/Keras-1.2.2-py2.7.egg/keras/backend/tensorflow_backend.py", line 634, in random_uniform_variable
low, high, dtype=tf_dtype, seed=seed)(shape)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/ops/init_ops.py", line 189, in __call__
dtype, seed=self.seed)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/ops/random_ops.py", line 236, in random_uniform
shape, dtype, seed=seed1, seed2=seed2)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/ops/gen_random_ops.py", line 263, in _random_uniform
seed=seed, seed2=seed2, name=name)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
op_def=op_def)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/home/eagle/anaconda3/envs/les1/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
self._traceback = _extract_stack()
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[25088,4096]
[[Node: random_uniform_13/RandomUniform = RandomUniform[T=DT_INT32, dtype=DT_FLOAT, seed=87654321, seed2=755436606, _device="/job:localhost/replica:0/task:0/gpu:0"](random_uniform_13/shape)]]
After
nvidia-smi
The last line will show the process with a 'pid' number
Enter with the 'pid' number last
with the following command (your four digit number at the end will be your own)
sudo kill -9 3096
I'm trying to train an autoencoder with mse loss function with TensorFlow r1.2, but I keep getting a FailedPreconditionError which states that one of the variables related to computing the mse is uninitialized (see full stack trace printout below). I'm running this in Jupyter notebook and I'm using Python 3.
I trimmed down my code to a minimal example as follows
import tensorflow as tf
import numpy as np
from functools import partial
# specify network
def reset_graph(seed=0):
tf.reset_default_graph()
tf.set_random_seed(seed)
np.random.seed(seed)
reset_graph()
n_inputs = 100
n_hidden = 6
n_outputs = n_inputs
learning_rate = 0.001
l2_reg = 0.001
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
he_init = tf.contrib.layers.variance_scaling_initializer()
l2_regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
my_dense_layer = partial(tf.layers.dense,
activation=tf.nn.elu,
kernel_initializer=he_init,
kernel_regularizer=l2_regularizer)
hidden1 = my_dense_layer(X, n_hidden1)
outputs = my_dense_layer(hidden1, n_outputs, activation=None)
reconstruction_loss = tf.reduce_mean(tf.metrics.mean_squared_error(X, outputs))
reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
loss = tf.add_n([reconstruction_loss] + reg_losses)
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
# generate 1000 random examples
sample_X = np.random.rand(1000, 100)
# train network
n_epochs = 10
batch_size = 50
with tf.Session() as sess:
sess.run(init) # init.run()
for epoch in range(n_epochs):
n_batches = sample_X.shape[0] // batch_size
for iteration in range(n_batches):
start_idx = iteration*batch_size
if iteration == n_batches-1:
end_idx = sample_X.shape[0]
else:
end_idx = start_idx + batch_size
sys.stdout.flush()
X_batch = sample_X[start_idx:end_idx]
sess.run(training_op, feed_dict={X: X_batch})
loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
print(round(loss_train, 5))
When I replace the line that defines reconstruction_loss to not use tf.metrics, as follows
reconstruction_loss = tf.reduce_mean(tf.square(tf.norm(outputs - X)))
I don't get the exception.
I've checked several similar SO questions, but none of them has solved my problem. For example, one possible cause, suggested in an answer at FailedPreconditionError: Attempting to use uninitialized in Tensorflow, is failing to initialize all the variables in the TF graph, but my script initializes all TF variables with init = tf.global_variables_initializer() and then sess.run(init). Another possible cause is that the Adam optimizer creates its own variables, which need to be initialized after specifying the optimizer (see Tensorflow: Using Adam optimizer). However, my script defines the variable initializer after the optimizer, as suggested in the accepted answer to that question, so that also can't be my problem.
Can anyone spot anything wrong with my script or suggest things to try to suss out the cause of this error?
Below is the stack trace from the error.
---------------------------------------------------------------------------
FailedPreconditionError Traceback (most recent call last)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1138 try:
-> 1139 return fn(*args)
1140 except errors.OpError as e:
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
1120 feed_dict, fetch_list, target_list,
-> 1121 status, run_metadata)
1122
~\AppData\Local\Continuum\Anaconda3\lib\contextlib.py in __exit__(self, type, value, traceback)
88 try:
---> 89 next(self.gen)
90 except StopIteration:
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\errors_impl.py in raise_exception_on_not_ok_status()
465 compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 466 pywrap_tensorflow.TF_GetCode(status))
467 finally:
FailedPreconditionError: Attempting to use uninitialized value mean_squared_error/total
[[Node: mean_squared_error/total/read = Identity[T=DT_FLOAT, _class=["loc:#mean_squared_error/total"], _device="/job:localhost/replica:0/task:0/cpu:0"](mean_squared_error/total)]]
During handling of the above exception, another exception occurred:
FailedPreconditionError Traceback (most recent call last)
<ipython-input-55-aac61c488ed8> in <module>()
64 sess.run(training_op, feed_dict={X: X_batch})
65
---> 66 loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
67 print(round(loss_train, 5))
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py in eval(self, feed_dict, session)
604
605 """
--> 606 return _eval_using_default_session(self, feed_dict, self.graph, session)
607
608
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py in _eval_using_default_session(tensors, feed_dict, graph, session)
3926 "the tensor's graph is different from the session's "
3927 "graph.")
-> 3928 return session.run(tensors, feed_dict)
3929
3930
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
787 try:
788 result = self._run(None, fetches, feed_dict, options_ptr,
--> 789 run_metadata_ptr)
790 if run_metadata:
791 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
995 if final_fetches or final_targets:
996 results = self._do_run(handle, final_targets, final_fetches,
--> 997 feed_dict_string, options, run_metadata)
998 else:
999 results = []
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1130 if handle is None:
1131 return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1132 target_list, options, run_metadata)
1133 else:
1134 return self._do_call(_prun_fn, self._session, handle, feed_dict,
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1150 except KeyError:
1151 pass
-> 1152 raise type(e)(node_def, op, message)
1153
1154 def _extend_graph(self):
FailedPreconditionError: Attempting to use uninitialized value mean_squared_error/total
[[Node: mean_squared_error/total/read = Identity[T=DT_FLOAT, _class=["loc:#mean_squared_error/total"], _device="/job:localhost/replica:0/task:0/cpu:0"](mean_squared_error/total)]]
Caused by op 'mean_squared_error/total/read', defined at:
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\__main__.py", line 3, in <module>
app.launch_new_instance()
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
app.start()
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 474, in start
ioloop.IOLoop.instance().start()
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
super(ZMQIOLoop, self).start()
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tornado\ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
self._handle_recv()
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 276, in dispatcher
return self.dispatch_shell(stream, msg)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 228, in dispatch_shell
handler(stream, idents, msg)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 390, in execute_request
user_expressions, allow_stdin)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 501, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2698, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2802, in run_ast_nodes
if self.run_code(code, result):
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-55-aac61c488ed8>", line 32, in <module>
reconstruction_loss = tf.reduce_mean(tf.metrics.mean_squared_error(X, outputs))
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\metrics_impl.py", line 1054, in mean_squared_error
updates_collections, name or 'mean_squared_error')
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\metrics_impl.py", line 331, in mean
total = _create_local('total', shape=[])
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\metrics_impl.py", line 196, in _create_local
validate_shape=validate_shape)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 1679, in variable
caching_device=caching_device, name=name, dtype=dtype)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\variables.py", line 200, in __init__
expected_shape=expected_shape)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\variables.py", line 319, in _init_from_args
self._snapshot = array_ops.identity(self._variable, name="read")
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 1303, in identity
result = _op_def_lib.apply_op("Identity", input=input, name=name)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op
op_def=op_def)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 2506, in create_op
original_op=self._default_original_op, op_def=op_def)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1269, in __init__
self._traceback = _extract_stack()
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value mean_squared_error/total
[[Node: mean_squared_error/total/read = Identity[T=DT_FLOAT, _class=["loc:#mean_squared_error/total"], _device="/job:localhost/replica:0/task:0/cpu:0"](mean_squared_error/total)]]
Looks like you're doing everything right with initialization, so I suspect your error is that you're using tf.metrics.mean_squared_error incorrectly.
The metrics package of classes allows you to compute a value, but also accumulate that value over multiple calls to sess.run. Note the return value of tf.metrics.mean_square_error in the docs:
https://www.tensorflow.org/api_docs/python/tf/metrics/mean_squared_error
You get back both mean_square_error, as you appear to expect, and an update_op. The purpose of the update_op is that you ask tensorflow to compute the update_op and it accumulates the mean square error. Each time you call mean_square_error you get the accumulated value. When you want to reset the value you would run sess.run(tf.local_variables_initializer()) (note local and not global to clear "local" variables as the metrics package defines them).
I don't think the metrics package was intended to be used the way you're using it. I think your intention was to compute the mse only based on the current batch as your loss and not accumulate the value over multiple calls. I'm not even sure how differentiation would work with respect to an accumulated value like this.
So I think the answer to your question is: don't use the metrics package this way. Use metrics for reporting, and for accumulating results over multiple iterations of a test dataset, for example, not for generating a loss function.
I think what you mean to use is tf.losses.mean_squared_error
say I have a tensorflow shape:
y_ = tf.placeholder(tf.float32,[None,19],name='Labels')
My thinking here is to get each time 19 a vector of 19 elements and add it(pluging) it to y_
and list inputlabel with 57 as length:
I want to feed the line this list into y_
sess.run(train_step,feed_dict={x:xdata,y_:np.reshape(inputlabel,(3,19))})
this feeding isn't working and I really don't get how may I solve itl. here is the error message that I get:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1021 try:
-> 1022 return fn(*args)
1023 except errors.OpError as e:
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\client\session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
1003 feed_dict, fetch_list, target_list,
-> 1004 status, run_metadata)
1005
c:\users\engine\appdata\local\programs\python\python35\lib\contextlib.py in __exit__(self, type, value, traceback)
65 try:
---> 66 next(self.gen)
67 except StopIteration:
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\framework\errors_impl.py in raise_exception_on_not_ok_status()
465 compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 466 pywrap_tensorflow.TF_GetCode(status))
467 finally:
InvalidArgumentError: You must feed a value for placeholder tensor 'Labels' with dtype float
[[Node: Labels = Placeholder[dtype=DT_FLOAT, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
During handling of the above exception, another exception occurred:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-7-1da2dc43ca63> in <module>()
1 for j in range(len(batch_xs)-1):
----> 2 print(sess.run(train_step,feed_dict={x:batch_xs[j],y_:np.reshape(batch_ys[j],(3,numberOFClasses))}))
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
765 try:
766 result = self._run(None, fetches, feed_dict, options_ptr,
--> 767 run_metadata_ptr)
768 if run_metadata:
769 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
963 if final_fetches or final_targets:
964 results = self._do_run(handle, final_targets, final_fetches,
--> 965 feed_dict_string, options, run_metadata)
966 else:
967 results = []
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1013 if handle is None:
1014 return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1015 target_list, options, run_metadata)
1016 else:
1017 return self._do_call(_prun_fn, self._session, handle, feed_dict,
c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1033 except KeyError:
1034 pass
-> 1035 raise type(e)(node_def, op, message)
1036
1037 def _extend_graph(self):
InvalidArgumentError: You must feed a value for placeholder tensor 'Labels' with dtype float
[[Node: Labels = Placeholder[dtype=DT_FLOAT, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Caused by op 'Labels', defined at:
File "c:\users\engine\appdata\local\programs\python\python35\lib\runpy.py", line 184, in _run_module_as_main
"__main__", mod_spec)
File "c:\users\engine\appdata\local\programs\python\python35\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\ipykernel\__main__.py", line 3, in <module>
app.launch_new_instance()
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
app.start()
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\ipykernel\kernelapp.py", line 474, in start
ioloop.IOLoop.instance().start()
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
super(ZMQIOLoop, self).start()
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tornado\ioloop.py", line 887, in start
handler_func(fd_obj, events)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tornado\stack_context.py", line 275, in null_wrapper
return fn(*args, **kwargs)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
self._handle_recv()
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tornado\stack_context.py", line 275, in null_wrapper
return fn(*args, **kwargs)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\ipykernel\kernelbase.py", line 276, in dispatcher
return self.dispatch_shell(stream, msg)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\ipykernel\kernelbase.py", line 228, in dispatch_shell
handler(stream, idents, msg)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\ipykernel\kernelbase.py", line 390, in execute_request
user_expressions, allow_stdin)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\ipykernel\zmqshell.py", line 501, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\IPython\core\interactiveshell.py", line 2717, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\IPython\core\interactiveshell.py", line 2821, in run_ast_nodes
if self.run_code(code, result):
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-3-49d5bdb3e7ad>", line 6, in <module>
y_ = tf.placeholder(tf.float32,[None],name='Labels')
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\ops\array_ops.py", line 1502, in placeholder
name=name)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 2149, in _placeholder
name=name)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 763, in apply_op
op_def=op_def)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\framework\ops.py", line 2327, in create_op
original_op=self._default_original_op, op_def=op_def)
File "c:\users\engine\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\framework\ops.py", line 1226, in __init__
self._traceback = _extract_stack()
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'Labels' with dtype float
[[Node: Labels = Placeholder[dtype=DT_FLOAT, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
**Update **
Inputlabel is declared as followed:
..............
inputlabel =[]
..................
for i in batch(Training_Data,batchSize):
inputlabel.append(i)
def batch(iterable, n=1):
l = len(iterable)
for ndx in range(0, l, n):
yield iterable[ndx:min(ndx + n, l)]
No that the type issue is solve I get an other weird stuff :
InvalidArgumentError (see above for traceback): Incompatible shapes: [3,19] vs. [57,19]
[[Node: gradients/mul_grad/BroadcastGradientArgs = BroadcastGradientArgs[T=DT_INT32, _device="/job:localhost/replica:0/task:0/cpu:0"](gradients/mul_grad/Shape, gradients/mul_grad/Shape_1)]]
This works:
import numpy as np
import tensorflow as tf
y_ = tf.placeholder(tf.float32,[None,19],name='Labels')
sess = tf.InteractiveSession()
labels = np.zeros(57, dtype=np.float32)
sess.run(y_, feed_dict = {y_: np.reshape(labels, (3,19))})
Could it be that your inputlabel is of the wrong type?
You just need to convert the list of (nSamples) integers coding the 1-hot label to an array of shape (nSamples, 19) containing only 0s and 1s when you import your data,
eg (1, 8, 2) -> [[1, 0, 0, ...], [0, 0, 1, 0, 0, ...], [0, 1, 0, 0, ...]]
You could do it like that:
label_1_hot_coding = row[-1]
array_of_bits = np.zeros(numberOFClasses)
n = 1
for i,_in enumerate(array_of_bits):
if n== label_1_hot_coding:
array_of_bits[i] = 1
Training_Labels.append(array_of_bits)
Your labels are now naturally of shape (nSamples, numberOfClasses), and the batches of shape (batchSize, numberOfClasses), which is what the rest of your program wants.