Related
I am using TensorFlow on the Handwritten A-Z dataset in a Kaggle Kernel.
I've used two conv layers, each followed by a maxpool layer, then reshaped the result into full_1 (shape (-1, 7*7*64)), fed that into a fully connected layer full_2 (to which I applied dropout), and connected it to a layer named last of shape (None, 26), whose output represents the 26 letters of English.
CONV -> MAXPOOL -> CONV -> MAXPOOL -> reshape (named full_1) -> FULLY_CONNECTED (full_2) -> OUTPUT (last)
On earlier runs the training process gave numeric accuracy values, but later it started giving NaNs for some unknown reason.
Also, the accuracy never increased much throughout training and stayed very low, which makes me wonder whether I have applied the convolutional network correctly, since the network should only learn better and give higher accuracy as batches of data are processed during training. Is the low accuracy due to having too few layers and too simple a model?
I am also doubtful about the tf.nn.softmax_cross_entropy_with_logits(labels=output, logits=last) statement in my code, because the relu function has already been applied to the last variable, which denotes the output layer of my conv net and is used above as logits.
The error says:
FailedPreconditionError: Attempting to use uninitialized value W_4
The code is:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import copy
import warnings
warnings.filterwarnings('ignore')
#dataset=pd.read_csv('/Users/ajay/Documents/IpyNote/A_Z Handwritten Data.csv')
dataset=pd.read_csv('../input/handwritten_data_785.csv')
#print(dataset.head(3))
#print(dataset.info())
dataset['0'].unique()
dataset=dataset.astype('float32')
X=copy.deepcopy(dataset)
X.head(1)
Y=X.loc[:,'0']
#print(Y.head(3))
Y=Y.astype('int64')
s=pd.get_dummies(Y)
list(s)
Y=s
Y=Y.astype('float32')
Y.head(2)
X.drop('0',axis=1,inplace=True)
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.25,stratify=Y)
input=tf.placeholder(dtype=tf.float32,shape=(None,28*28))
output=tf.placeholder(dtype=tf.float32,shape=(None,26))
W1=tf.Variable(tf.truncated_normal(shape=(5,5,1,32)),name='W')#28,28,32
b1=tf.Variable(tf.truncated_normal(shape=(1,32)),name='b')#14,14,32
W2=tf.Variable(tf.truncated_normal(shape=(5,5,32,64)),name='W')#14,14,64
b2=tf.Variable(tf.truncated_normal(shape=(1,64)),name='b')#7,7,64
W3=tf.Variable(tf.truncated_normal(shape=(7*7*64,1024)),name='W')
b3=tf.Variable(tf.truncated_normal(shape=(1,1024)),name='b')
W4=tf.Variable(tf.truncated_normal(shape=(1024,26)),name='W')
b4=tf.Variable(tf.truncated_normal(shape=(1,26)),name='b')
def conv(input, W, b):
    return tf.nn.relu(tf.nn.conv2d(input=input, filter=W, strides=(1,1,1,1), padding='SAME') + b)
def maxpool(x):
    return tf.nn.max_pool(value=x, ksize=(1,2,2,1), strides=(1,2,2,1), padding='SAME')
def full_connected(x, W, b):
    return tf.nn.relu(tf.matmul(x, W) + b)
p=tf.reshape(input,[-1,28,28,1])
conv_1=conv(p,W1,b1)
print('conv_1.shape',conv_1.shape)
maxpool_1=maxpool(conv_1)
print('maxpool_1.shape',maxpool_1.shape)
conv_2=conv(maxpool_1,W2,b2)
print('conv_2.shape',conv_2.shape)
maxpool_2=maxpool(conv_2)
print('maxpool_2.shape',maxpool_2.shape)
full_1=tf.reshape(maxpool_2,[-1,7*7*64])
full_2=full_connected(full_1,W3,b3)#full_1->full_2
print('full_2.shape',full_2.shape)
keep_prob=tf.placeholder(tf.float32)
full_2_dropout=tf.nn.dropout(full_2,keep_prob)
last=full_connected(full_2_dropout,W4,b4)
last = tf.clip_by_value(last, 1e-10, 0.9999999)
print('last.shape',last.shape)
loss=tf.nn.softmax_cross_entropy_with_logits(labels=output,logits=last)#loss=tf.nn.softmax(logits=last)
train_step=tf.train.AdamOptimizer(0.005).minimize(loss)
accuracy=tf.reduce_mean(tf.cast(tf.equal(tf.argmax(output,1), tf.argmax(last,1) ) , tf.float32))
init=tf.global_variables_initializer()
with tf.Session() as sess:
    epoch = 1
    n_iterations = 10
    sess.run(init)
    for i in range(n_iterations):
        j = i*50
        k = i*50+50
        print('j=', j, 'k=', k)
        x = X_train.iloc[i*50:j, :]
        y = Y_train.iloc[i*50:j, :]
        #sess.run(accuracy,feed_dict={input:X_train,output:Y_train,keep_prob:1.0})
        print('Train_accuracy : ', sess.run(accuracy, feed_dict={input: x, output: y, keep_prob: 1.0}))
        sess.run(train_step, feed_dict={input: x, output: y, keep_prob: 1.0})
with tf.Session() as sess:
    n_iterations = 20
    for i in range(n_iterations):
        j = i*50
        k = i*50+50
        print('j=', j, 'k=', k)
        x = X_test.iloc[i*50:j, :]
        y = Y_test.iloc[i*50:j, :]
        print('Test_accuracy : ', sess.run(accuracy, feed_dict={input: x, output: y, keep_prob: 1.0}))
The output and the error look something like this:
conv_1.shape (?, 28, 28, 32)
maxpool_1.shape (?, 14, 14, 32)
conv_2.shape (?, 14, 14, 64)
maxpool_2.shape (?, 7, 7, 64)
full_2.shape (?, 1024)
last.shape (?, 26)
j= 0 k= 50
Train_accuracy : nan
j= 50 k= 100
Train_accuracy : nan
j= 100 k= 150
Train_accuracy : nan
j= 150 k= 200
Train_accuracy : nan
j= 200 k= 250
Train_accuracy : nan
j= 250 k= 300
Train_accuracy : nan
j= 300 k= 350
Train_accuracy : nan
j= 350 k= 400
Train_accuracy : nan
j= 400 k= 450
Train_accuracy : nan
j= 450 k= 500
Train_accuracy : nan
j= 0 k= 50
---------------------------------------------------------------------------
FailedPreconditionError Traceback (most recent call last)
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1329 try:
-> 1330 return fn(*args)
1331 except errors.OpError as e:
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1314 return self._call_tf_sessionrun(
-> 1315 options, feed_dict, fetch_list, target_list, run_metadata)
1316
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1422 self._session, options, feed_dict, fetch_list, target_list,
-> 1423 status, run_metadata)
1424
/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py in __exit__(self, type_arg, value_arg, traceback_arg)
515 compat.as_text(c_api.TF_Message(self.status.status)),
--> 516 c_api.TF_GetCode(self.status.status))
517 # Delete the underlying status object from memory otherwise it stays alive
FailedPreconditionError: Attempting to use uninitialized value W_4
[[Node: W_4/read = Identity[T=DT_FLOAT, _class=["loc:@W_4"], _device="/job:localhost/replica:0/task:0/device:GPU:0"](W_4)]]
During handling of the above exception, another exception occurred:
FailedPreconditionError Traceback (most recent call last)
<ipython-input-2-496ec024fd3b> in <module>()
114 x = X_test.iloc[i*50:j,:]
115 y = Y_test.iloc[i*50:j,:]
--> 116 print('Test_accuracy : ',sess.run(accuracy, feed_dict={input: x, output: y,keep_prob:1.0}))
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
906 try:
907 result = self._run(None, fetches, feed_dict, options_ptr,
--> 908 run_metadata_ptr)
909 if run_metadata:
910 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1141 if final_fetches or final_targets or (handle and feed_dict_tensor):
1142 results = self._do_run(handle, final_targets, final_fetches,
-> 1143 feed_dict_tensor, options, run_metadata)
1144 else:
1145 results = []
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1322 if handle is None:
1323 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1324 run_metadata)
1325 else:
1326 return self._do_call(_prun_fn, handle, feeds, fetches)
/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1341 except KeyError:
1342 pass
-> 1343 raise type(e)(node_def, op, message)
1344
1345 def _extend_graph(self):
FailedPreconditionError: Attempting to use uninitialized value W_4
[[Node: W_4/read = Identity[T=DT_FLOAT, _class=["loc:@W_4"], _device="/job:localhost/replica:0/task:0/device:GPU:0"](W_4)]]
Caused by op 'W_4/read', defined at:
File "/opt/conda/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/opt/conda/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/opt/conda/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/opt/conda/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
ioloop.IOLoop.instance().start()
File "/opt/conda/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
super(ZMQIOLoop, self).start()
File "/opt/conda/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "/opt/conda/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/opt/conda/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
self._handle_recv()
File "/opt/conda/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "/opt/conda/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "/opt/conda/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/opt/conda/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/opt/conda/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
handler(stream, idents, msg)
File "/opt/conda/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/opt/conda/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/opt/conda/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2802, in run_ast_nodes
if self.run_code(code, result):
File "/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-2-496ec024fd3b>", line 42, in <module>
W1=tf.Variable(tf.truncated_normal(shape=(5,5,1,32)),name='W')#28,28,32
File "/opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 235, in __init__
constraint=constraint)
File "/opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 397, in _init_from_args
self._snapshot = array_ops.identity(self._variable, name="read")
File "/opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 142, in identity
return gen_array_ops.identity(input, name=name)
File "/opt/conda/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 3052, in identity
"Identity", input=input, name=name)
File "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3306, in create_op
op_def=op_def)
File "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1669, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value W_4
[[Node: W_4/read = Identity[T=DT_FLOAT, _class=["loc:@W_4"], _device="/job:localhost/replica:0/task:0/device:GPU:0"](W_4)]]
Reason for the accuracy giving NaNs: you have split the training data into X_train and X_test, which disturbed the indices, so the train dataset became quite random with respect to its indices. When you feed X_train batch-wise, the indices [0:50] do not exist during training, and you end up feeding nothing to your model.
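For illustration, here is a minimal sketch (my own, not from the original post) showing that tf.reduce_mean over an empty batch evaluates to NaN, which matches the accuracy printout above:
import numpy as np
import tensorflow as tf
x = tf.placeholder(tf.float32, shape=(None, 26))
mean = tf.reduce_mean(x)  # mean over zero elements is 0/0 = NaN
with tf.Session() as sess:
    empty_batch = np.empty((0, 26), dtype=np.float32)  # what an empty slice feeds in
    print(sess.run(mean, feed_dict={x: empty_batch}))  # prints nan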
Before training the model, do this:
X_train = X_train.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)
This will reset your indices, and drop=True will prevent the original indices from becoming another column in the transformed DataFrame. Note that reset_index returns a new DataFrame by default, so assign the result back (or pass inplace=True).
As far as the weights and biases are concerned, DO NOT use another session for testing the model, because all your trained variables will be lost in that session, and hence the error Attempting to use uninitialized value W_4 will occur.
You can also save your variables with tf.train.Saver for the sake of convenience.
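If you do want separate training and testing runs, here is a hedged sketch of that save/restore flow (the checkpoint path is a placeholder):
saver = tf.train.Saver()
# training session
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # ... run train_step over your batches ...
    saver.save(sess, './model.ckpt')  # hypothetical path
# testing session: restore instead of re-initializing
with tf.Session() as sess:
    saver.restore(sess, './model.ckpt')
    # ... run the accuracy op over your test batches ...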
Also, regarding your logits part: the logits argument of tf.nn.softmax_cross_entropy_with_logits should be the raw, unactivated output of the final layer, so the relu and the tf.clip_by_value applied to last should be removed before it is used as logits.
You can call sess.run(tf.global_variables_initializer()) to initialize the variables. See this StackOverflow answer for more information about the initializer.
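A minimal sketch of that initialization pattern (a generic graph of my own, not the asker's code): create the init op after every variable exists, including the slot variables the Adam optimizer adds, and run it in the same session that later evaluates the graph:
import tensorflow as tf
W = tf.Variable(tf.truncated_normal(shape=(4, 2)), name='W')
loss = tf.reduce_sum(tf.square(W))
train_step = tf.train.AdamOptimizer(0.01).minimize(loss)  # Adam adds its own slot variables
init = tf.global_variables_initializer()  # created after all variables exist
with tf.Session() as sess:
    sess.run(init)  # must run before any op that reads a variable
    sess.run(train_step)
    print(sess.run(loss))  # same session, so the variables stay initialized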
When I try to use the tf.matmul function (via the @ operator) on the GPU I get the following error:
InternalError: Blas xGEMMBatched launch failed
If N in the function calc() is set to a value less than 15, it works.
I am running tensorflow 1.8.0 and Cuda V9.1.85. There is only one Python process working on the GPU and there are no other open sessions. Also, I have plenty of GPU memory to spare (see attached image).
Changing the CUDA_VISIBLE_DEVICES value does not show any effect. Changing the ConfigProto() settings does not help either. Using tf.matmul instead of the @ operator also does not solve the problem.
This is the code I am running:
import tensorflow as tf
import numpy as np
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
tf.Session(config=config).close()
def calc():
    N = 15  # works for N <= 14
    a = 16
    b = 8
    X = np.random.rand(N, 11520, b, 1).astype(np.float32)
    print(X.nbytes*1e-6, "MB")
    W = np.random.rand(N, 11520, a, b).astype(np.float32)
    print(W.nbytes*1e-6, "MB")
    X_ = tf.constant(X, name="X-constant", dtype=tf.float32)
    W_ = tf.constant(W, name="W-constant", dtype=tf.float32)
    # return tf.matmul(W_, X_, name="mymatmul")
    return W_ @ X_
tf.reset_default_graph()
a = calc()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
b = sess.run(a)
sess.close()
print(b.shape)
This is the output I get:
5.529599999999999 MB
88.47359999999999 MB
---------------------------------------------------------------------------
InternalError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1321 try:
-> 1322 return fn(*args)
1323 except errors.OpError as e:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1306 return self._call_tf_sessionrun(
-> 1307 options, feed_dict, fetch_list, target_list, run_metadata)
1308
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1408 self._session, options, feed_dict, fetch_list, target_list,
-> 1409 run_metadata)
1410 else:
InternalError: Blas xGEMMBatched launch failed : a.shape=[172800,16,8], b.shape=[172800,8,1], m=16, n=1, k=8, batch_size=172800
[[Node: matmul = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](W-constant, X-constant)]]
During handling of the above exception, another exception occurred:
InternalError Traceback (most recent call last)
<ipython-input-5-013153235a1a> in <module>()
3 sess = tf.Session()
4 sess.run(tf.global_variables_initializer())
----> 5 b = sess.run(a)
6 sess.close()
7 print(b.shape)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
898 try:
899 result = self._run(None, fetches, feed_dict, options_ptr,
--> 900 run_metadata_ptr)
901 if run_metadata:
902 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1133 if final_fetches or final_targets or (handle and feed_dict_tensor):
1134 results = self._do_run(handle, final_targets, final_fetches,
-> 1135 feed_dict_tensor, options, run_metadata)
1136 else:
1137 results = []
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1314 if handle is None:
1315 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1316 run_metadata)
1317 else:
1318 return self._do_call(_prun_fn, handle, feeds, fetches)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1333 except KeyError:
1334 pass
-> 1335 raise type(e)(node_def, op, message)
1336
1337 def _extend_graph(self):
InternalError: Blas xGEMMBatched launch failed : a.shape=[172800,16,8], b.shape=[172800,8,1], m=16, n=1, k=8, batch_size=172800
[[Node: matmul = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](W-constant, X-constant)]]
Caused by op 'matmul', defined at:
File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py", line 486, in start
self.io_loop.start()
File "/usr/local/lib/python3.6/dist-packages/tornado/platform/asyncio.py", line 127, in start
self.asyncio_loop.run_forever()
File "/usr/lib/python3.6/asyncio/base_events.py", line 422, in run_forever
self._run_once()
File "/usr/lib/python3.6/asyncio/base_events.py", line 1432, in _run_once
handle._run()
File "/usr/lib/python3.6/asyncio/events.py", line 145, in _run
self._callback(*self._args)
File "/usr/local/lib/python3.6/dist-packages/tornado/platform/asyncio.py", line 117, in _handle_events
handler_func(fileobj, events)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 276, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
self._handle_recv()
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
self._run_callback(callback, msg)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
callback(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 276, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
handler(stream, idents, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/ipkernel.py", line 208, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/zmqshell.py", line 537, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
raw_cell, store_history, silent, shell_futures)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
if self.run_code(code, result):
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2963, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-5-013153235a1a>", line 2, in <module>
a = calc()
File "<ipython-input-4-bf0e6012e9e2>", line 13, in calc
return W_ @ X_
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py", line 847, in binary_op_wrapper
return func(x, y, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py", line 1976, in matmul
a, b, adj_x=adjoint_a, adj_y=adjoint_b, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 1236, in batch_mat_mul
"BatchMatMul", x=x, y=y, adj_x=adj_x, adj_y=adj_y, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3414, in create_op
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 1740, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InternalError (see above for traceback): Blas xGEMMBatched launch failed : a.shape=[172800,16,8], b.shape=[172800,8,1], m=16, n=1, k=8, batch_size=172800
[[Node: matmul = BatchMatMul[T=DT_FLOAT, adj_x=false, adj_y=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](W-constant, X-constant)]]
Here is a workaround replacing tf.matmul with tf.einsum. However, your code works on my machine with an NVIDIA 840M (2004 MiB RAM), cudnn 7.0.5.15 and cuda 9.0.176 (maybe downgrading helps?).
import tensorflow as tf
import numpy as np
sess = tf.Session()
N = 20
M = 11520
a = 16
b = 8
W = np.random.rand(N, M, a, b).astype(np.float32)
X = np.random.rand(N, M, b, 1).astype(np.float32)
# tf.einsum does not support numpy arrays, so wrap W and X in tf.constants
W2 = tf.constant(W)
X2 = tf.constant(X)
# tf.einsum does not support "..." as seen later in np.einsum
WX = tf.einsum("uvik,uvkj->uvij", W2, X2)
# same as:
#WX = tf.matmul(W2, X2)
# calculate W@X using tf.einsum
result1 = sess.run(WX)
# calculate W@X using np.einsum
result2 = np.einsum("...ik,...kj->...ij", W, X)
# calculate W@X by hand (just for illustrative purpose, too slow for practical use)
result3 = np.zeros((N, M, a, 1), dtype=np.float32)
for i in range(a):
    for j in range(1):
        for k in range(b):
            result3[..., i, j] += W[..., i, k] * X[..., k, j]
# ensure that everything is correct
assert(np.allclose(result1, result2))
assert(np.allclose(result1, result3))
print("everything ok")
sess.close()
I've tried to create a custom object detector on a Paperspace cloud desktop, then tested it in a Jupyter Notebook, and it works.
Now I've uploaded the whole models-master folder and downloaded it onto my local machine.
I ran it using Jupyter Notebook and it now gives an InvalidArgumentError. I've tried re-exporting the inference graph on my local machine using the same checkpoint that was trained on the cloud, but it is still not working.
---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1322     try:
-> 1323       return fn(*args)
   1324     except errors.OpError as e:
/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1301         feed_dict, fetch_list, target_list,
-> 1302         status, run_metadata)
   1303
/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/errors_impl.py in __exit__(self, type_arg, value_arg, traceback_arg)
    472           compat.as_text(c_api.TF_Message(self.status.status)),
--> 473           c_api.TF_GetCode(self.status.status))
    474     # Delete the underlying status object from memory otherwise it stays alive
InvalidArgumentError: NodeDef mentions attr 'T' not in Op<name=Where; signature=input:bool -> index:int64>; NodeDef: Postprocessor/BatchMultiClassNonMaxSuppression/map/while/MultiClassNonMaxSuppression/FilterGreaterThan/Where = Where[T=DT_BOOL, _device="/job:localhost/replica:0/task:0/device:GPU:0"]. (Check whether your GraphDef-interpreting binary is up to date with your GraphDef-generating binary.)
[[Node: Postprocessor/BatchMultiClassNonMaxSuppression/map/while/MultiClassNonMaxSuppression/FilterGreaterThan/Where = Where[T=DT_BOOL, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]]
During handling of the above exception, another exception occurred:
InvalidArgumentError                      Traceback (most recent call last)
<ipython-input> in <module>()
     20 (boxes, scores, classes, num) = sess.run(
     21     [detection_boxes, detection_scores, detection_classes, num_detections],
---> 22     feed_dict={image_tensor: image_np_expanded})
     23 # Visualization of the results of a detection.
     24 vis_util.visualize_boxes_and_labels_on_image_array(
/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    887     try:
    888       result = self._run(None, fetches, feed_dict, options_ptr,
--> 889                          run_metadata_ptr)
    890       if run_metadata:
    891         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1118     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1119       results = self._do_run(handle, final_targets, final_fetches,
-> 1120                              feed_dict_tensor, options, run_metadata)
   1121     else:
   1122       results = []
/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1315     if handle is None:
   1316       return self._do_call(_run_fn, self._session, feeds, fetches, targets,
-> 1317                            options, run_metadata)
   1318     else:
   1319       return self._do_call(_prun_fn, self._session, handle, feeds, fetches)
/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1334     except KeyError:
   1335       pass
-> 1336     raise type(e)(node_def, op, message)
   1337
   1338   def _extend_graph(self):
InvalidArgumentError: NodeDef mentions attr 'T' not in Op<name=Where; signature=input:bool -> index:int64>; NodeDef: Postprocessor/BatchMultiClassNonMaxSuppression/map/while/MultiClassNonMaxSuppression/FilterGreaterThan/Where = Where[T=DT_BOOL, _device="/job:localhost/replica:0/task:0/device:GPU:0"]. (Check whether your GraphDef-interpreting binary is up to date with your GraphDef-generating binary.)
[[Node: Postprocessor/BatchMultiClassNonMaxSuppression/map/while/MultiClassNonMaxSuppression/FilterGreaterThan/Where = Where[T=DT_BOOL, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]]
Caused by op 'Postprocessor/BatchMultiClassNonMaxSuppression/map/while/MultiClassNonMaxSuppression/FilterGreaterThan/Where', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  (... ipykernel / tornado / zmq / IPython frames elided ...)
  File "<ipython-input>", line 7, in <module>
    tf.import_graph_def(od_graph_def, name='')
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/importer.py", line 313, in import_graph_def
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
InvalidArgumentError (see above for traceback): NodeDef mentions attr 'T' not in Op<name=Where; signature=input:bool -> index:int64>; NodeDef: Postprocessor/BatchMultiClassNonMaxSuppression/map/while/MultiClassNonMaxSuppression/FilterGreaterThan/Where = Where[T=DT_BOOL, _device="/job:localhost/replica:0/task:0/device:GPU:0"]. (Check whether your GraphDef-interpreting binary is up to date with your GraphDef-generating binary.)
[[Node: Postprocessor/BatchMultiClassNonMaxSuppression/map/while/MultiClassNonMaxSuppression/FilterGreaterThan/Where = Where[T=DT_BOOL, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]]
Are the cloud and local machines running the same Python/TensorFlow versions? Sometimes checkpoints produced by a specific TensorFlow version are not backward compatible due to internal variable renaming.
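One quick way to compare the two environments is a small check of my own (run it in each notebook and compare the output):
import sys
import tensorflow as tf
print("Python     :", sys.version.split()[0])
print("TensorFlow :", tf.__version__)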
I'm trying to train an autoencoder with an MSE loss function in TensorFlow r1.2, but I keep getting a FailedPreconditionError stating that one of the variables involved in computing the MSE is uninitialized (see the full stack trace printout below). I'm running this in a Jupyter notebook with Python 3.
I trimmed my code down to a minimal example, as follows:
import sys
import tensorflow as tf
import numpy as np
from functools import partial
# specify network
def reset_graph(seed=0):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)
reset_graph()
n_inputs = 100
n_hidden = 6
n_outputs = n_inputs
learning_rate = 0.001
l2_reg = 0.001
X = tf.placeholder(tf.float32, shape=[None, n_inputs])
he_init = tf.contrib.layers.variance_scaling_initializer()
l2_regularizer = tf.contrib.layers.l2_regularizer(l2_reg)
my_dense_layer = partial(tf.layers.dense,
                         activation=tf.nn.elu,
                         kernel_initializer=he_init,
                         kernel_regularizer=l2_regularizer)
hidden1 = my_dense_layer(X, n_hidden)
outputs = my_dense_layer(hidden1, n_outputs, activation=None)
reconstruction_loss = tf.reduce_mean(tf.metrics.mean_squared_error(X, outputs))
reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
loss = tf.add_n([reconstruction_loss] + reg_losses)
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
init = tf.global_variables_initializer()
# generate 1000 random examples
sample_X = np.random.rand(1000, 100)
# train network
n_epochs = 10
batch_size = 50
with tf.Session() as sess:
    sess.run(init)  # init.run()
    for epoch in range(n_epochs):
        n_batches = sample_X.shape[0] // batch_size
        for iteration in range(n_batches):
            start_idx = iteration*batch_size
            if iteration == n_batches-1:
                end_idx = sample_X.shape[0]
            else:
                end_idx = start_idx + batch_size
            sys.stdout.flush()
            X_batch = sample_X[start_idx:end_idx]
            sess.run(training_op, feed_dict={X: X_batch})
            loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
            print(round(loss_train, 5))
When I replace the line that defines reconstruction_loss so that it does not use tf.metrics, as follows:
reconstruction_loss = tf.reduce_mean(tf.square(tf.norm(outputs - X)))
I don't get the exception.
I've checked several similar SO questions, but none of them has solved my problem. For example, one possible cause, suggested in an answer at FailedPreconditionError: Attempting to use uninitialized in Tensorflow, is failing to initialize all the variables in the TF graph, but my script initializes all TF variables with init = tf.global_variables_initializer() and then sess.run(init). Another possible cause is that the Adam optimizer creates its own variables, which need to be initialized after specifying the optimizer (see Tensorflow: Using Adam optimizer). However, my script defines the variable initializer after the optimizer, as suggested in the accepted answer to that question, so that also can't be my problem.
Can anyone spot anything wrong with my script or suggest things to try to suss out the cause of this error?
Below is the stack trace from the error.
---------------------------------------------------------------------------
FailedPreconditionError Traceback (most recent call last)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1138 try:
-> 1139 return fn(*args)
1140 except errors.OpError as e:
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
1120 feed_dict, fetch_list, target_list,
-> 1121 status, run_metadata)
1122
~\AppData\Local\Continuum\Anaconda3\lib\contextlib.py in __exit__(self, type, value, traceback)
88 try:
---> 89 next(self.gen)
90 except StopIteration:
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\errors_impl.py in raise_exception_on_not_ok_status()
465 compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 466 pywrap_tensorflow.TF_GetCode(status))
467 finally:
FailedPreconditionError: Attempting to use uninitialized value mean_squared_error/total
[[Node: mean_squared_error/total/read = Identity[T=DT_FLOAT, _class=["loc:@mean_squared_error/total"], _device="/job:localhost/replica:0/task:0/cpu:0"](mean_squared_error/total)]]
During handling of the above exception, another exception occurred:
FailedPreconditionError Traceback (most recent call last)
<ipython-input-55-aac61c488ed8> in <module>()
64 sess.run(training_op, feed_dict={X: X_batch})
65
---> 66 loss_train = reconstruction_loss.eval(feed_dict={X: X_batch})
67 print(round(loss_train, 5))
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py in eval(self, feed_dict, session)
604
605 """
--> 606 return _eval_using_default_session(self, feed_dict, self.graph, session)
607
608
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py in _eval_using_default_session(tensors, feed_dict, graph, session)
3926 "the tensor's graph is different from the session's "
3927 "graph.")
-> 3928 return session.run(tensors, feed_dict)
3929
3930
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
787 try:
788 result = self._run(None, fetches, feed_dict, options_ptr,
--> 789 run_metadata_ptr)
790 if run_metadata:
791 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
995 if final_fetches or final_targets:
996 results = self._do_run(handle, final_targets, final_fetches,
--> 997 feed_dict_string, options, run_metadata)
998 else:
999 results = []
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1130 if handle is None:
1131 return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1132 target_list, options, run_metadata)
1133 else:
1134 return self._do_call(_prun_fn, self._session, handle, feed_dict,
~\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
1150 except KeyError:
1151 pass
-> 1152 raise type(e)(node_def, op, message)
1153
1154 def _extend_graph(self):
FailedPreconditionError: Attempting to use uninitialized value mean_squared_error/total
[[Node: mean_squared_error/total/read = Identity[T=DT_FLOAT, _class=["loc:@mean_squared_error/total"], _device="/job:localhost/replica:0/task:0/cpu:0"](mean_squared_error/total)]]
Caused by op 'mean_squared_error/total/read', defined at:
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\runpy.py", line 85, in _run_code
exec(code, run_globals)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\__main__.py", line 3, in <module>
app.launch_new_instance()
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
app.start()
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 474, in start
ioloop.IOLoop.instance().start()
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
super(ZMQIOLoop, self).start()
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tornado\ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
self._handle_recv()
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 276, in dispatcher
return self.dispatch_shell(stream, msg)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 228, in dispatch_shell
handler(stream, idents, msg)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 390, in execute_request
user_expressions, allow_stdin)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 501, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2698, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2802, in run_ast_nodes
if self.run_code(code, result):
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-55-aac61c488ed8>", line 32, in <module>
reconstruction_loss = tf.reduce_mean(tf.metrics.mean_squared_error(X, outputs))
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\metrics_impl.py", line 1054, in mean_squared_error
updates_collections, name or 'mean_squared_error')
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\metrics_impl.py", line 331, in mean
total = _create_local('total', shape=[])
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\metrics_impl.py", line 196, in _create_local
validate_shape=validate_shape)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 1679, in variable
caching_device=caching_device, name=name, dtype=dtype)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\variables.py", line 200, in __init__
expected_shape=expected_shape)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\variables.py", line 319, in _init_from_args
self._snapshot = array_ops.identity(self._variable, name="read")
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 1303, in identity
result = _op_def_lib.apply_op("Identity", input=input, name=name)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op
op_def=op_def)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 2506, in create_op
original_op=self._default_original_op, op_def=op_def)
File "C:\Users\user\AppData\Local\Continuum\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1269, in __init__
self._traceback = _extract_stack()
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value mean_squared_error/total
[[Node: mean_squared_error/total/read = Identity[T=DT_FLOAT, _class=["loc:@mean_squared_error/total"], _device="/job:localhost/replica:0/task:0/cpu:0"](mean_squared_error/total)]]
Looks like you're doing everything right with initialization, so I suspect your error is that you're using tf.metrics.mean_squared_error incorrectly.
The metrics package allows you to compute a value, but also to accumulate that value over multiple calls to sess.run. Note the return value of tf.metrics.mean_squared_error in the docs:
https://www.tensorflow.org/api_docs/python/tf/metrics/mean_squared_error
You get back both mean_squared_error, as you appear to expect, and an update_op. The purpose of the update_op is that you ask TensorFlow to compute it, and it accumulates the mean squared error; each time you evaluate mean_squared_error you get the accumulated value. When you want to reset the value, you run sess.run(tf.local_variables_initializer()) (note local, not global, to clear the "local" variables as the metrics package defines them).
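A minimal sketch of that intended usage (my own illustration, not the asker's code): the second return value is the op that actually accumulates, and the running totals live in local variables:
import numpy as np
import tensorflow as tf
labels = tf.placeholder(tf.float32, shape=[None])
preds = tf.placeholder(tf.float32, shape=[None])
# returns (value_op, update_op); update_op accumulates the running total and count
mse, mse_update = tf.metrics.mean_squared_error(labels, preds)
with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())  # metrics state lives in local variables
    for _ in range(3):
        sess.run(mse_update, feed_dict={labels: np.ones(5), preds: np.zeros(5)})
    print(sess.run(mse))  # accumulated MSE over all three batches
    sess.run(tf.local_variables_initializer())  # resets the accumulators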
I don't think the metrics package was intended to be used the way you're using it. I think your intention was to compute the MSE based only on the current batch as your loss, not to accumulate the value over multiple calls. I'm not even sure how differentiation would work with respect to an accumulated value like this.
So I think the answer to your question is: don't use the metrics package this way. Use metrics for reporting, and for accumulating results over multiple iterations of a test dataset, for example, not for generating a loss function.
I think what you mean to use is tf.losses.mean_squared_error.
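For example, a sketch of the loss definition using tf.losses instead (assuming the rest of your script stays unchanged):
# a differentiable per-batch loss, with no hidden accumulator variables
reconstruction_loss = tf.losses.mean_squared_error(labels=X, predictions=outputs)
reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
loss = tf.add_n([reconstruction_loss] + reg_losses)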
I was trying to adapt an NN architecture I saw in a blog post here: https://sorenbouma.github.io/blog/oneshot/
The only thing I was trying to change was the input: instead of (105,105,1) greyscale I was hoping to use (100,100,3) RGB. I thus used Keras to define the architecture as in the blog post, but with a different input:
# assumed imports, not shown in the original post: the blog uses numpy.random as rng and the Keras backend as K
import numpy as np
from keras import backend as K
rng = np.random

def W_init(shape, name=None):
    """Initialize weights as in paper"""
    values = rng.normal(loc=0, scale=1e-2, size=shape)
    return K.variable(values, name=name)

#//TODO: figure out how to initialize layer biases in keras.
def b_init(shape, name=None):
    """Initialize bias as in paper"""
    values = rng.normal(loc=0.5, scale=1e-2, size=shape)
    return K.variable(values, name=name)
input_shape = (100, 100, 3)
left_input = Input(input_shape)
right_input = Input(input_shape)
#build convnet to use in each siamese 'leg'
convnet = Sequential()
convnet.add(Conv2D(64,(10,10),activation='relu',input_shape=input_shape,
kernel_initializer=W_init,kernel_regularizer=l2(2e-4)))
convnet.add(MaxPooling2D())
convnet.add(Conv2D(128,(7,7),activation='relu',
kernel_regularizer=l2(2e-4),kernel_initializer=W_init,bias_initializer=b_init))
convnet.add(MaxPooling2D())
convnet.add(Conv2D(128,(4,4),activation='relu',kernel_initializer=W_init,kernel_regularizer=l2(2e-4),bias_initializer=b_init))
convnet.add(MaxPooling2D())
convnet.add(Conv2D(256,(4,4),activation='relu',kernel_initializer=W_init,kernel_regularizer=l2(2e-4),bias_initializer=b_init))
convnet.add(Flatten())
convnet.add(Dense(4096,activation="sigmoid",kernel_regularizer=l2(1e-3),kernel_initializer=W_init,bias_initializer=b_init))
#encode each of the two inputs into a vector with the convnet
encoded_l = convnet(left_input)
encoded_r = convnet(right_input)
#merge two encoded inputs with the l1 distance between them
L1_distance = lambda x: K.abs(x[0]-x[1])
both = merge([encoded_l,encoded_r], mode = L1_distance, output_shape=lambda x: x[0])
prediction = Dense(1,activation='sigmoid',bias_initializer=b_init)(both)
siamese_net = Model(input=[left_input,right_input],output=prediction)
#optimizer = SGD(0.0004,momentum=0.6,nesterov=True,decay=0.0003)
optimizer = Adam(0.00006)
#//TODO: get layerwise learning rates and momentum annealing scheme described in paperworking
siamese_net.compile(loss="binary_crossentropy",optimizer=optimizer)
siamese_net.count_params()
I then train the network on my data as in the paper:
#Training loop
evaluate_every = 500
loss_every=50
batch_size = 20
N_way = 20
n_val = 250
#siamese_net.load_weights("/home/soren/keras-oneshot/weights")
max_epochs = 100
for i in range(0, max_epochs):
    (inputs, targets) = loader.get_batch(batch_size)
    loss = siamese_net.train_on_batch(inputs, targets)
    if i % evaluate_every == 0:
        val_acc = loader.test_oneshot(siamese_net, N_way, n_val, verbose=True)
        if val_acc >= best:
            print("saving")
            siamese_net.save('/home/soren/keras-oneshot/weights')
            best = val_acc
    if i % loss_every == 0:
        print("iteration {}, training loss: {:.2f},".format(i, loss))
But I get
FailedPreconditionError: Attempting to use uninitialized value conv2d_1/Variable
[[Node: conv2d_1/Variable/read = Identity[T=DT_FLOAT, _class=["loc:@conv2d_1/Variable"], _device="/job:localhost/replica:0/task:0/cpu:0"](conv2d_1/Variable)]]
Here is the full error output:
---------------------------------------------------------------------------
FailedPreconditionError Traceback (most recent call last)
/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1138 try:
-> 1139 return fn(*args)
1140 except errors.OpError as e:
/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
1120 feed_dict, fetch_list, target_list,
-> 1121 status, run_metadata)
1122
/usr/lib/python3.4/contextlib.py in __exit__(self, type, value, traceback)
65 try:
---> 66 next(self.gen)
67 except StopIteration:
/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/errors_impl.py in raise_exception_on_not_ok_status()
465 compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 466 pywrap_tensorflow.TF_GetCode(status))
467 finally:
FailedPreconditionError: Attempting to use uninitialized value conv2d_1/Variable
[[Node: conv2d_1/Variable/read = Identity[T=DT_FLOAT, _class=["loc:@conv2d_1/Variable"], _device="/job:localhost/replica:0/task:0/cpu:0"](conv2d_1/Variable)]]
During handling of the above exception, another exception occurred:
FailedPreconditionError Traceback (most recent call last)
<ipython-input-15-06f79f757a6e> in <module>()
9 for i in range(0,max_epochs):
10 (inputs,targets)=loader.get_batch(batch_size)
---> 11 loss=siamese_net.train_on_batch(inputs,targets)
12 if i % evaluate_every == 0:
13 val_acc = loader.test_oneshot(siamese_net,N_way,n_val,verbose=True)
/usr/local/lib/python3.4/dist-packages/keras/engine/training.py in train_on_batch(self, x, y, sample_weight, class_weight)
1563 ins = x + y + sample_weights
1564 self._make_train_function()
-> 1565 outputs = self.train_function(ins)
1566 if len(outputs) == 1:
1567 return outputs[0]
/usr/local/lib/python3.4/dist-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs)
2263 value = (indices, sparse_coo.data, sparse_coo.shape)
2264 feed_dict[tensor] = value
-> 2265 session = get_session()
2266 updated = session.run(self.outputs + [self.updates_op],
2267 feed_dict=feed_dict,
/usr/local/lib/python3.4/dist-packages/keras/backend/tensorflow_backend.py in get_session()
166 if not _MANUAL_VAR_INIT:
167 with session.graph.as_default():
--> 168 _initialize_variables()
169 return session
170
/usr/local/lib/python3.4/dist-packages/keras/backend/tensorflow_backend.py in _initialize_variables()
339 if uninitialized_variables:
340 sess = get_session()
--> 341 sess.run(tf.variables_initializer(uninitialized_variables))
342
343
/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
787 try:
788 result = self._run(None, fetches, feed_dict, options_ptr,
--> 789 run_metadata_ptr)
790 if run_metadata:
791 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
995 if final_fetches or final_targets:
996 results = self._do_run(handle, final_targets, final_fetches,
--> 997 feed_dict_string, options, run_metadata)
998 else:
999 results = []
/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1130 if handle is None:
1131 return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1132 target_list, options, run_metadata)
1133 else:
1134 return self._do_call(_prun_fn, self._session, handle, feed_dict,
/usr/local/lib/python3.4/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1150 except KeyError:
1151 pass
-> 1152 raise type(e)(node_def, op, message)
1153
1154 def _extend_graph(self):
FailedPreconditionError: Attempting to use uninitialized value conv2d_1/Variable
[[Node: conv2d_1/Variable/read = Identity[T=DT_FLOAT, _class=["loc:@conv2d_1/Variable"], _device="/job:localhost/replica:0/task:0/cpu:0"](conv2d_1/Variable)]]
Caused by op 'conv2d_1/Variable/read', defined at:
File "/usr/lib/python3.4/runpy.py", line 170, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.4/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.4/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python3.4/dist-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/usr/local/lib/python3.4/dist-packages/ipykernel/kernelapp.py", line 477, in start
ioloop.IOLoop.instance().start()
File "/usr/local/lib/python3.4/dist-packages/zmq/eventloop/ioloop.py", line 177, in start
super(ZMQIOLoop, self).start()
File "/usr/local/lib/python3.4/dist-packages/tornado/ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "/usr/local/lib/python3.4/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
self._handle_recv()
File "/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/usr/local/lib/python3.4/dist-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
handler(stream, idents, msg)
File "/usr/local/lib/python3.4/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/usr/local/lib/python3.4/dist-packages/ipykernel/ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python3.4/dist-packages/ipykernel/zmqshell.py", line 533, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py", line 2802, in run_ast_nodes
if self.run_code(code, result):
File "/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py", line 2862, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-2-51595f796dab>", line 17, in <module>
kernel_initializer=W_init,kernel_regularizer=l2(2e-4)))
File "/usr/local/lib/python3.4/dist-packages/keras/models.py", line 436, in add
layer(x)
File "/usr/local/lib/python3.4/dist-packages/keras/engine/topology.py", line 569, in __call__
self.build(input_shapes[0])
File "/usr/local/lib/python3.4/dist-packages/keras/layers/convolutional.py", line 134, in build
constraint=self.kernel_constraint)
File "/usr/local/lib/python3.4/dist-packages/keras/legacy/interfaces.py", line 87, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.4/dist-packages/keras/engine/topology.py", line 391, in add_weight
weight = K.variable(initializer(shape), dtype=dtype, name=name)
File "<ipython-input-2-51595f796dab>", line 4, in W_init
return K.variable(values,name=name)
File "/usr/local/lib/python3.4/dist-packages/keras/backend/tensorflow_backend.py", line 321, in variable
v = tf.Variable(value, dtype=_convert_string_dtype(dtype), name=name)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/ops/variables.py", line 200, in __init__
expected_shape=expected_shape)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/ops/variables.py", line 319, in _init_from_args
self._snapshot = array_ops.identity(self._variable, name="read")
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 1303, in identity
result = _op_def_lib.apply_op("Identity", input=input, name=name)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/ops.py", line 2506, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python3.4/dist-packages/tensorflow/python/framework/ops.py", line 1269, in __init__
self._traceback = _extract_stack()
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value conv2d_1/Variable
[[Node: conv2d_1/Variable/read = Identity[T=DT_FLOAT, _class=["loc:@conv2d_1/Variable"], _device="/job:localhost/replica:0/task:0/cpu:0"](conv2d_1/Variable)]]
Googling the error did not really make things clear. I saw some posts about the error coming up when using TensorFlow, but nothing regarding this error and Keras, so I am a little confused about what is going on.
I experienced the same error a few days ago.
The cause of the error is the weight initialization.
Try changing the following code.
(before)
convnet.add(Conv2D(64,(10,10),activation='relu',input_shape=input_shape,
                   kernel_initializer=W_init,kernel_regularizer=l2(2e-4)))
(after)
convnet.add(Conv2D(64,(10,10),activation='relu',input_shape=input_shape,
                   kernel_initializer=keras.initializers.RandomNormal(mean=0.0,
                   stddev=1e-2, seed=None),kernel_regularizer=l2(2e-4)))
Please change all of the W_init and b_init initializers to keras.initializers.RandomNormal(...).
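A hedged sketch of the full substitution (my own adaptation of the layer definitions above, reusing the question's Conv2D, l2, and convnet; the mean/stddev values mirror W_init and b_init):
from keras import initializers
# normal(0, 1e-2) replaces W_init; normal(0.5, 1e-2) replaces b_init
w_init = initializers.RandomNormal(mean=0.0, stddev=1e-2, seed=None)
bias_init = initializers.RandomNormal(mean=0.5, stddev=1e-2, seed=None)
convnet.add(Conv2D(128, (7, 7), activation='relu',
                   kernel_initializer=w_init,
                   bias_initializer=bias_init,
                   kernel_regularizer=l2(2e-4)))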