TensorFlow always returns the same result - python

I wrote code with TensorFlow to train and test a convolutional neural network.
I use my own JPEG images as input.
The image files are loaded in batch form by my own code, input_batch.py.
After they are loaded, the convolution is run by convpool.py.
But now my results on the test data always come back with the same values.
In training as well, the convolution results for some batch data come back the same.
I also looked at this issue, Tensorflow predicts always the same result,
but the solutions from that issue failed to work in my code.
My result always looks like this:
step 0, training accuracy 0.2
result: [[ 5.76441448e-18 1.00000000e+00]
[ 5.76441448e-18 1.00000000e+00]
[ 5.76441448e-18 1.00000000e+00]
[ 5.76441448e-18 1.00000000e+00]
[ 5.76441448e-18 1.00000000e+00]
[ 5.76441448e-18 1.00000000e+00]
[ 5.76434913e-18 1.00000000e+00]
[ 5.85150709e-18 1.00000000e+00]
[ 2.83430459e-17 1.00000000e+00]
[ 0.00000000e+00 1.00000000e+00]]
test result:[[ 0. 1.]]actual result:[ 1. 0.]
test result:[[ 0. 1.]]actual result:[ 1. 0.]
test result:[[ 0. 1.]]actual result:[ 0. 1.]
test result:[[ 0. 1.]]actual result:[ 0. 1.]
test result:[[ 0. 1.]]actual result:[ 0. 1.]
test result:[[ 0. 1.]]actual result:[ 1. 0.]
test result:[[ 0. 1.]]actual result:[ 1. 0.]
test result:[[ 0. 1.]]actual result:[ 0. 1.]
test result:[[ 0. 1.]]actual result:[ 1. 0.]
test result:[[ 0. 1.]]actual result:[ 1. 0.]
and here is my code:
import tensorflow as tf
import input_batch
import input
import convpool
import matplotlib.pyplot as plt
import numpy as np
FLAGS = tf.app.flags.FLAGS
sess = tf.Session()
x_image = tf.placeholder("float", shape=[None,FLAGS.width,FLAGS.height,FLAGS.depth])
y_ = tf.placeholder("float", shape=[None,FLAGS.num_class])
# x_image=tf.reshape(x,[-1,FLAGS.width,FLAGS.height,FLAGS.depth])
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def spp_layer(x, n_bin, output_depth):
    (a, b, c, d) = x.get_shape()
    h = int(b + (n_bin - 1)) / n_bin
    w = int(c + (n_bin - 1)) / n_bin
    return tf.reshape(tf.nn.max_pool(x, ksize=[1, h, w, 1], strides=[1, h, w, 1], padding='SAME'),
                      [-1, n_bin * n_bin, output_depth])
W_conv1 = weight_variable([11, 11, 3 , 96])
b_conv1 = bias_variable([96])
W_conv2 = weight_variable([5, 5, 96, 256])
b_conv2 = bias_variable([256])
W_fc1 = weight_variable([14*14* 256, 4096])
b_fc1 = bias_variable([4096])
W_fc2 = weight_variable([4096, 2])
b_fc2 = bias_variable([2])
keep_prob = tf.placeholder("float")
y_conv_train = convpool.train(x_image,W_conv1,b_conv1,W_conv2,b_conv2,W_fc1,b_fc1,W_fc2,b_fc2,keep_prob)
cross_entropy = -tf.reduce_mean(y_*tf.log(tf.clip_by_value(y_conv_train,1e-10,1.0)))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv_train,1),tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction,"float"))
sess = tf.Session()
sess.run(tf.initialize_all_variables())
for i in range(50):
    batch = input_batch.get_data_jpeg(sess, 'train', 10)
    if i % 1 == 0:
        train_accuracy = sess.run(accuracy, feed_dict={x_image: batch[0], y_: batch[1], keep_prob: 1.0})
        train_result = sess.run(y_conv_train, feed_dict={x_image: batch[0], y_: batch[1], keep_prob: 1.0})
        # print('result : ', sess.run(W_fc2))
        print("step %d, training accuracy %g" % (i, train_accuracy))
        print('result:', train_result)
    sess.run(train_step, feed_dict={x_image: batch[0], y_: batch[1], keep_prob: 0.5})
# ############################test###################################
for i in range(10):
    input.initialization()
    testinput = input.get_data_jpeg(sess, 'eval')
    test_img = testinput.x_data
    (i_x, i_y, i_z) = testinput.x_size
    testimg = tf.reshape(test_img, [-1, i_x, i_y, i_z])
    testresult = convpool.train(testimg, W_conv1, b_conv1, W_conv2, b_conv2, W_fc1, b_fc1, W_fc2, b_fc2, 1.0)
    result = sess.run(testresult)
    print("test result:" + str(result) + "actual result:" + str(testinput.y_data))
#convpool.py
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
def train(input, W_conv1, b_conv1, W_conv2, b_conv2, W_fc1, b_fc1, W_fc2, b_fc2, keep_prob):
    h_conv1 = tf.nn.relu(tf.nn.conv2d(input, W_conv1, strides=[1, 4, 4, 1], padding='SAME') + b_conv1)
    h_pool1 = tf.nn.max_pool(h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    h_conv2 = tf.nn.relu(tf.nn.conv2d(h_pool1, W_conv2, strides=[1, 1, 1, 1], padding='SAME') + b_conv2)
    h_pool2 = tf.nn.max_pool(h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    # print(h_pool2)
    h_pool2_flat = tf.reshape(h_pool2, [-1, 14 * 14 * 256])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
    return y_conv
#input_batch.py
import os
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import random
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('data_dir', 'C://Users/sh/PycharmProjects/test2/data',
                           """Directory where to write event logs """
                           """and checkpoint.""")
FLAGS.width = 224
FLAGS.height = 224
FLAGS.depth = 3
FLAGS.num_class = 2
batch_index = 0
filenames = []
FLAGS.imsize = FLAGS.height * FLAGS.width * FLAGS.depth
def get_filenames(data_set):
    global filenames
    labels = []
    with open(FLAGS.data_dir + '/labels.txt') as f:
        for line in f:
            inner_list = [elt.strip() for elt in line.split(',')]
            labels += inner_list
    for i, label in enumerate(labels):
        list = os.listdir(FLAGS.data_dir + '/' + data_set + '/' + label)
        for filename in list:
            filenames.append([label + '/' + filename, i])
    random.shuffle(filenames)
def get_data_jpeg(sess, data_set, batch_size):
    global batch_index, filenames
    if len(filenames) == 0:
        get_filenames(data_set)
    max = len(filenames)
    begin = batch_index
    end = batch_index + batch_size
    if end >= max:
        end = max
        batch_index = 0
    x_data = np.array([])
    y_data = np.zeros((batch_size, FLAGS.num_class))
    index = 0
    for i in range(begin, end):
        with tf.gfile.FastGFile(FLAGS.data_dir + '/' + data_set + '/' + filenames[i][0], 'rb') as f:
            image_data = f.read()
        decode_image = tf.image.decode_jpeg(image_data, channels=FLAGS.depth)
        resized_image = tf.image.resize_images(decode_image, [FLAGS.height, FLAGS.width], method=1)
        image = sess.run(resized_image)
        x_data = np.append(x_data, np.asarray(image.data, dtype='float32')) / 255
        y_data[index][filenames[i][1]] = 1
        index += 1
    batch_index += batch_size
    try:
        x_data = x_data.reshape(batch_size, FLAGS.height, FLAGS.width, FLAGS.depth)
    except:
        return None, None
    return x_data, y_data

Related

How to mask inputs with variable size in transformer model when the batches needs to be masked differently?

I'm making a transformer using tensorflow.keras and having issues understanding how the attention_mask works for a MultiHeadAttention layer.
My input is 3-dimensional data. For example, let's assume my whole dataset has 10 elements, each one with length no more than 4:
# whole data
[
    # first item
    [
        [     1,      2,      3],
        [     1,      2,      3],
        [np.nan, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
    ],
    # second item
    [
        [1, 2, 3],
        [5, 8, 2],
        [3, 7, 8],
        [4, 6, 2],
    ],
    ...  # 8 more items
]
So, my mask looks like:
# assume this is a numpy array
mask = [
    [
        [1, 1, 1],
        [1, 1, 1],
        [0, 0, 0],
        [0, 0, 0],
    ],
    [
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
    ],
    ...
]
So the shape of the mask so far is [10, 4, 3]. Let's say I use batch_size = 5. Now, according to the documentation, the attention_mask shape should be [B, T, S] (batch_size, query_size, key_size). In this example's case, should it be [5, 4, 4]?
Question
If the mask is calculated only once, which 5 items should I give as the mask? This sounds counterintuitive to me. How should I build the mask?
According to this answer, head_size should also be taken into account, so they also do:
mask = mask[:, tf.newaxis, tf.newaxis, :]
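For intuition, a quick shape check of that expansion (a hypothetical all-ones padding mask; note that tf.newaxis is just an alias for None, so the indexing also works on NumPy arrays):
import numpy as np
import tensorflow as tf

mask = np.ones((5, 4))                     # (batch_size, seq_len)
mask = mask[:, tf.newaxis, tf.newaxis, :]  # -> (5, 1, 1, 4)
print(mask.shape)                          # broadcasts over heads and query positions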
What I've tested
The only time I manage to run the transformer successfully using the attention_mask is when I do:
mask = np.ones((batch_size, data.shape[1], data.shape[2]))
mask = mask[:, tf.newaxis, tf.newaxis, :]
Obviously that mask makes no sense, because it is all ones, but it was just to test if it had the correct shape.
The model
I'm using practically the same code from the keras example transformer for time series classification
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0.0, mask=None):
    # Normalization and Attention
    x = layers.LayerNormalization(epsilon=1e-6)(inputs)
    x = layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(x, x, attention_mask=mask)
    x = layers.Dropout(dropout)(x)
    res = x + inputs
    # Feed Forward Part
    x = layers.LayerNormalization(epsilon=1e-6)(res)
    x = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
    return x + res
def build_model(
    n_classes,
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0.0,
    mlp_dropout=0.0,
    input_mask=None,
) -> keras.Model:
    inputs = keras.Input(shape=input_shape)
    x = inputs
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout, input_mask)
    # 1D pooling, as in the Keras timeseries example (2D pooling expects 4-D inputs)
    x = layers.GlobalAveragePooling1D(data_format="channels_first")(x)
    for dim in mlp_units:
        x = layers.Dense(dim, activation="relu")(x)
        x = layers.Dropout(mlp_dropout)(x)
    outputs = layers.Dense(n_classes, activation="softmax")(x)
    return keras.Model(inputs, outputs)
First, a simpler example to understand the MultiHeadAttention mask.
# Crude self-attention implementation
query = tf.constant([[1], [2], [3], [4]], dtype=tf.float32)  # Shape([4, 1])
scores = tf.matmul(query, query, transpose_b=True)           # Shape([4, 4])
# unnormalized, pre-softmax scores
The above are the attention scores for the given query. attention_mask is used when you want to prevent attention to certain positions in this score matrix, so the mask's dimensions must match the attention scores' dimensions.
Let's say we decide that each token in the above example should attend only to itself and to the next token; then we can define the mask as:
mask = tf.constant([[     1.,      1., -np.inf, -np.inf],
                    [-np.inf,      1.,      1., -np.inf],
                    [-np.inf, -np.inf,      1.,      1.],
                    [-np.inf, -np.inf, -np.inf,      1.]])

# apply the mask to the scores (multiplying works here because all raw scores
# are positive; Keras instead treats the mask as boolean and adds a large
# negative value to the masked positions)
scores = scores * mask

# softmax
scores = tf.nn.softmax(scores)

# scores (0 indicates no attention)
[[0.26894143, 0.73105854, 0.        , 0.        ],
 [0.        , 0.11920292, 0.880797  , 0.        ],
 [0.        , 0.        , 0.04742587, 0.95257413],
 [0.        , 0.        , 0.        , 1.        ]]

# score-weighted queries
value = tf.matmul(scores, query)

# value is a weighted average of the current and next token of [[1], [2], [3], [4]]
[[1.7310585],  # weighted average of [1] and [2] (current and next)
 [2.8807971],
 [3.9525743],
 [4.       ]]
Can there be a different mask for each item in the batch?
Yes. One use case I can think of is when different samples in the same batch have different amounts of padding, so each mask can be set to ignore that sample's padding.
Your specific case: the mask has to be (batch_size, 4, 4). The mask can be the same for each item in the batch.
batch_size = 5
query = keras.Input(shape=(4, 3))
mask_tensor = keras.Input(shape=(4, 4))

# keras layer
mha = keras.layers.MultiHeadAttention(num_heads=1, key_dim=3)
output = mha(query=query, value=query, attention_mask=mask_tensor, return_attention_scores=True)

# create a model
model = keras.Model([query, mask_tensor], output)

# random query and mask; note the mask entries need to be 1 (attention) or 0 (no attention)
queries = tf.random.normal(shape=(batch_size, 4, 3))
mask_data = tf.random.uniform(maxval=2, shape=(batch_size, 4, 4), dtype=tf.int32)

# calling the model
values, attn_weights = model.predict([queries, mask_data])

# attn_weights.shape
(5, 1, 4, 4)
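To connect this to the NaN-padded data from the question, here is a hedged sketch of deriving such a (batch, 4, 4) padding mask, assuming an array data of shape (batch, 4, 3) where padded timesteps are all NaN:
import numpy as np

# True where a timestep holds real values, False where it is NaN padding
valid = ~np.isnan(data).any(axis=-1)  # (batch, 4)
# a query position may attend to a key position only if both are valid
mask_data = (valid[:, :, None] & valid[:, None, :]).astype("int32")  # (batch, 4, 4)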
After a little research and seeing several transformer model examples, this is what solved the problem for me:
1) Create a custom TransformerBlock layer that supports masking.
2) Add a mask parameter to the call method of the TransformerBlock and reshape it there.
3) Add a Masking layer before the TransformerBlock.
Code:
class TransformerBlock(layers.Layer):
    def __init__(self, head_size, num_heads, ff_dim, ff_dim2, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=head_size)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)
        self.conv1 = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")
        self.conv2 = layers.Conv1D(filters=ff_dim2, kernel_size=1)
        self.supports_masking = True

    def call(self, inputs, training, mask=None):
        padding_mask = None
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        out_norm1 = self.layernorm1(inputs, training=training)
        out_att = self.att(
            out_norm1, out_norm1, training=training, attention_mask=padding_mask
        )
        out_drop1 = self.dropout1(out_att, training=training)
        res = out_drop1 + inputs
        out_norm2 = self.layernorm2(res, training=training)
        out_conv1 = self.conv1(out_norm2, training=training)
        out_drop2 = self.dropout2(out_conv1, training=training)
        out_conv2 = self.conv2(out_drop2, training=training)
        return out_conv2 + res
def build_model(
    n_classes,
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0.0,
    mlp_dropout=0.0,
    mask=None,
) -> keras.Model:
    inputs = keras.Input(shape=input_shape)
    _x = inputs
    if mask is not None:
        _x = layers.Masking(mask_value=mask)(_x)
    for _ in range(num_transformer_blocks):
        _x = TransformerBlock(
            head_size,
            num_heads,
            ff_dim,
            inputs.shape[-1],
            dropout,
        )(_x)
    # 1D pooling, as in the Keras timeseries example (2D pooling expects 4-D inputs)
    _x = layers.GlobalAveragePooling1D(data_format="channels_first")(_x)
    for dim in mlp_units:
        _x = layers.Dense(dim, activation="relu")(_x)
        _x = layers.Dropout(mlp_dropout)(_x)
    outputs = layers.Dense(n_classes, activation="softmax")(_x)
    return keras.Model(inputs, outputs)
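For completeness, a hedged usage sketch with hypothetical sizes. One caveat: layers.Masking matches mask_value by equality, and NaN != NaN, so NaN padding should first be replaced by a concrete sentinel value:
import numpy as np
from tensorflow import keras

PAD = 0.0  # sentinel used in place of the NaN padding
data = np.random.rand(10, 4, 3).astype("float32")
data[0, 2:, :] = PAD  # pad the tail of the first item

model = build_model(
    n_classes=3,
    input_shape=data.shape[1:],  # (4, 3)
    head_size=64,
    num_heads=2,
    ff_dim=4,
    num_transformer_blocks=2,
    mlp_units=[32],
    mask=PAD,  # value the Masking layer treats as padding
)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")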

Tensorflow: predict the class of the output

I have tried the example with Keras, but not with an LSTM. My model uses an LSTM in TensorFlow, and I want to predict the output in the form of classes, as the Keras model does with predict_classes.
The Tensorflow model I am trying is something like this:
seq_len=10
n_steps = seq_len-1
n_inputs = x_train.shape[2]
n_neurons = 50
n_outputs = y_train.shape[1]
n_layers = 2
learning_rate = 0.0001
batch_size =100
n_epochs = 1000
train_set_size = x_train.shape[0]
test_set_size = x_test.shape[0]
tf.reset_default_graph()
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_outputs])
layers = [tf.contrib.rnn.LSTMCell(num_units=n_neurons,activation=tf.nn.sigmoid, use_peepholes = True) for layer in range(n_layers)]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(layers)
rnn_outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)
stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons])
stacked_outputs = tf.layers.dense(stacked_rnn_outputs, n_outputs)
outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])
outputs = outputs[:,n_steps-1,:]
loss = tf.reduce_mean(tf.square(outputs - y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
I am encoding the labels with sklearn's LabelEncoder as:
encoder_train = LabelEncoder()
encoder_train.fit(y_train)
encoded_Y_train = encoder_train.transform(y_train)
y_train = np_utils.to_categorical(encoded_Y_train)
The data is thus converted to a binary (one-hot style) matrix.
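For illustration, a small hedged example (hypothetical labels) of what that encoding pipeline produces:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

labels = ['ball', 'bat', 'ball']
encoder = LabelEncoder()
encoder.fit(labels)
encoded = encoder.transform(labels)         # [0, 1, 0]
one_hot = np_utils.to_categorical(encoded)  # [[1., 0.], [0., 1.], [1., 0.]]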
When I tried to predict the output I got the following:
actual==> [[0. 0. 1.]
[1. 0. 0.]
[1. 0. 0.]
[0. 0. 1.]
[1. 0. 0.]
[1. 0. 0.]
[1. 0. 0.]
[0. 1. 0.]
[0. 1. 0.]]
predicted==> [[0.3112209 0.3690182 0.31357136]
[0.31085992 0.36959863 0.31448898]
[0.31073445 0.3703295 0.31469804]
[0.31177694 0.37011752 0.3145326 ]
[0.31220382 0.3692756 0.31515726]
[0.31232828 0.36947766 0.3149037 ]
[0.31190437 0.36756667 0.31323162]
[0.31339088 0.36542615 0.310322 ]
[0.31598282 0.36328828 0.30711085]]
What I was expecting is the label, based on the encoding done, as the Keras model gives with predict_classes. See the following:
predictions = model.predict_classes(X_test, verbose=True)
print("REAL VALUES:",reverse_category(Y_test,axis=1))
print("PRED VALUES:",predictions)
print("REAL COLORS:")
print(encoder.inverse_transform(reverse_category(Y_test,axis=1)))
print("PREDICTED COLORS:")
print(encoder.inverse_transform(predictions))
The output is something like the following:
REAL VALUES: [1 1 1 ... 1 2 1]
PRED VALUES: [2 1 1 ... 1 2 2]
REAL COLORS:
['ball' 'ball' 'ball' ... 'ball' 'bat' 'ball']
PREDICTED COLORS:
['bat' 'ball' 'ball' ... 'ball' 'bat' 'bat']
Kindly let me know what I can do in the TensorFlow model to get the result with respect to the encoding done.
I am using TensorFlow 1.12.0 and Windows 10.
You are trying to map the predicted class probabilities back to class labels. Each row in the list of output predictions contains the three predicted class probabilities. Use np.argmax to obtain the one with the highest predicted probability in order to map to the predicted class label:
import numpy as np
predictions = [[0.3112209 , 0.3690182 , 0.31357136],
               [0.31085992, 0.36959863, 0.31448898],
               [0.31073445, 0.3703295 , 0.31469804],
               [0.31177694, 0.37011752, 0.3145326 ],
               [0.31220382, 0.3692756 , 0.31515726],
               [0.31232828, 0.36947766, 0.3149037 ],
               [0.31190437, 0.36756667, 0.31323162],
               [0.31339088, 0.36542615, 0.310322  ],
               [0.31598282, 0.36328828, 0.30711085]]
np.argmax(predictions, axis=1)
Gives:
array([1, 1, 1, 1, 1, 1, 1, 1, 1])
In this case, class 1 is predicted 9 times.
As noted in the comments: this is exactly what Keras does under the hood, as you'll see in the source code.
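If you prefer to do this inside the graph, a minimal TF 1.x sketch (assuming the question's outputs tensor and X placeholder, the fitted encoder_train, and an open session):
predicted_class = tf.argmax(outputs, axis=1)            # index of the max probability
pred_idx = sess.run(predicted_class, feed_dict={X: x_test})
print(encoder_train.inverse_transform(pred_idx))        # back to the original labels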

Zero gradients for repetitive convolutions in tensorflow?

When I apply the same convolution repeatedly to an input, the gradients become zero for more than one repetition. What could be going wrong here?
W = tf.Variable(tf.zeros([3, 3, 1, 1]))
output = input_image  # a 4D tensor [batch_size, 16, 16, 1]
for _ in range(4):
    output = tf.nn.conv2d(
        output,
        W,
        [1, 2, 2, 1],
        padding="SAME"
    )
preds = tf.reshape(output, shape=[batch_size])
loss = tf.reduce_mean(preds, labels)

# this gradient is zero when the number of repeated layers > 1??
tf_gradient = tf.concat(0, tf.gradients(loss, W))
gradient = session.run(tf_gradient)

print(gradient.reshape(3**2))
# prints [ 0.  0.  0.  0.  0.  0.  0.  0.  0.]
With W initialized to all zeros, every intermediate feature map is zero, and by the product rule the gradient of a chain of n > 1 convolutions with respect to W keeps n - 1 factors of W in every term, so it is identically zero. Use a random initialization for W instead. Something like W = tf.get_variable(name="W", shape=[3, 3, 1, 1], dtype=tf.float32, initializer=tf.glorot_uniform_initializer())
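A quick self-contained check (TF 1.x, hypothetical random input) that a nonzero initialization yields nonzero gradients through the stacked convolutions:
import numpy as np
import tensorflow as tf

x = tf.constant(np.random.rand(1, 16, 16, 1), dtype=tf.float32)
W = tf.get_variable("W", shape=[3, 3, 1, 1],
                    initializer=tf.glorot_uniform_initializer())
out = x
for _ in range(4):
    out = tf.nn.conv2d(out, W, strides=[1, 2, 2, 1], padding="SAME")
loss = tf.reduce_mean(out)
grad = tf.gradients(loss, W)[0]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(grad).reshape(9))  # no longer all zeros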

TensorFlow: Linear Regression with multiple inputs returns NaNs

This is my first attempt at TensorFlow: I am building a linear regression model with multiple inputs.
The problem is that the result is always NaN, and I suspect it is because I am a complete noob at matrix operations using numpy and tensorflow (Matlab background, hehe).
Here is the code:
import numpy as np
import tensorflow as tf
N_INP = 2
N_OUT = 1
# Model params
w = tf.Variable(tf.zeros([1, N_INP]), name='w')
b = tf.Variable(tf.zeros([1, N_INP]), name='b')
# Model input and output
x = tf.placeholder(tf.float32, [None, N_INP], name='x')
y = tf.placeholder(tf.float32, [None, N_OUT], name='y')
linear_model = tf.reduce_sum(x * w + b, axis=1, name='out')
# Loss as sum(error^2)
loss = tf.reduce_sum(tf.square(linear_model - y), name='loss')
# Create optimizer
optimizer = tf.train.GradientDescentOptimizer(0.01)
train = optimizer.minimize(loss, name='train')
# Define training data
w_real = np.array([-1, 4])
b_real = np.array([1, -5])
x_train = np.array([[1, 2, 3, 4], [0, 0.5, 1, 1.5]]).T
y_train = np.sum(x_train * w_real + b_real, 1)[np.newaxis].T
print('Real X:\n', x_train)
print('Real Y:\n', y_train)
# Create session and init parameters
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# Training loop
train_data = {x: x_train, y: y_train}
for i in range(1000):
    sess.run(train, train_data)
# Eval solution
w_est, b_est, curr_loss, y_pred = sess.run([w, b, loss, linear_model], train_data)
print("w: %s b: %s loss: %s" % (w_est, b_est, curr_loss))
print("y_pred: %s" % (y_pred,))
And here is the output:
Real X:
[[ 1. 0. ]
[ 2. 0.5]
[ 3. 1. ]
[ 4. 1.5]]
Real Y:
[[-5.]
[-4.]
[-3.]
[-2.]]
w: [[ nan nan]] b: [[ nan nan]] loss: nan
y_pred: [ nan nan nan nan]
You need to add keep_dims=True inside your definition of linear_model. That is,
linear_model = tf.reduce_sum(x * w + b, axis=1, name='out', keep_dims=True)
The reason is that otherwise the result is "flattened" to shape [N], so subtracting y (shape [N, 1]) broadcasts to an [N, N] matrix instead of computing elementwise differences, and the loss and its gradients blow up.
For example, if
'x' is [[1, 2, 3],
        [4, 5, 6]]
then tf.reduce_sum(x, axis=1) is [6, 15], while
tf.reduce_sum(x, axis=1, keep_dims=True) is [[6], [15]].
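A quick NumPy check of that broadcasting pitfall (hypothetical values matching the y_train above):
import numpy as np

pred_flat = np.array([-5., -4., -3., -2.])  # shape (4,): the flattened prediction
y = np.array([[-5.], [-4.], [-3.], [-2.]])  # shape (4, 1): the targets
print((pred_flat - y).shape)                # (4, 4): 16 pairwise differences instead of 4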

Tensor Flow softmax Regression Always Predicts 1

I have the following code based on the MNIST example. It is modified in two ways:
1) I'm not using a one-hot vector, so I simply use tf.equal(y, y_)
2) My results are binary: either 0 or 1
import tensorflow as tf
import numpy as np
# get the data
train_data, train_results = get_data(2000, 2014)
test_data, test_results = get_data(2014, 2015)
# setup a session
sess = tf.Session()
x_len = len(train_data[0])
y_len = len(train_results[0])
# make placeholders for inputs and outputs
x = tf.placeholder(tf.float32, shape=[None, x_len])
y_ = tf.placeholder(tf.float32, shape=[None, y_len])
# create the weights and bias
W = tf.Variable(tf.zeros([x_len, 1]))
b = tf.Variable(tf.zeros([1]))
# initialize everything
sess.run(tf.initialize_all_variables())
# create the "equation" for y in terms of x
y_prime = tf.matmul(x, W) + b
y = tf.nn.softmax(y_prime)
# construct the error function
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(y_prime, y_)
# setup the training algorithm
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
# train the thing
for i in range(1000):
    rand_rows = np.random.choice(train_data.shape[0], 100, replace=False)
    _, w_out, b_out, ce_out = sess.run([train_step, W, b, cross_entropy],
                                       feed_dict={x: train_data[rand_rows, :], y_: train_results[rand_rows, :]})
    print("%d: %s %s %s" % (i, str(w_out), str(b_out), str(ce_out)))
# compute how many times it was correct
correct_prediction = tf.equal(y, y_)
# find the accuracy of the predictions
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
print(sess.run(accuracy, feed_dict={x: test_data, y_: test_results}))
for i in range(0, len(test_data)):
    res = sess.run(y, {x: [test_data[i]]})
    print("RES: " + str(res) + " ACT: " + str(test_results[i]))
The accuracy is always 0.5 (because my test data has about as many 1s as 0s). The values of W and b always seem to increase, probably because cross_entropy is always a vector of all zeros.
When I try to use this model for prediction, the predictions are always 1:
RES: [[ 1.]] ACT: [ 0.]
RES: [[ 1.]] ACT: [ 1.]
RES: [[ 1.]] ACT: [ 0.]
RES: [[ 1.]] ACT: [ 1.]
RES: [[ 1.]] ACT: [ 0.]
RES: [[ 1.]] ACT: [ 1.]
RES: [[ 1.]] ACT: [ 0.]
RES: [[ 1.]] ACT: [ 0.]
RES: [[ 1.]] ACT: [ 1.]
RES: [[ 1.]] ACT: [ 0.]
RES: [[ 1.]] ACT: [ 1.]
What am I doing wrong here?
You seem to be predicting a single scalar, rather than a vector. The softmax op produces a vector-valued prediction for each example; this vector must always sum to 1, so when it contains only one element, that element must always be 1. If you want to use a softmax for this problem, you could use [1, 0] as the output target where you are currently using [0], and [0, 1] where you are currently using [1]. Another option is to keep using just one number, but change the output layer to a sigmoid instead of a softmax, and change the cost function to the sigmoid-based cost function as well.
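A hedged sketch of the sigmoid option against the question's graph (TF 1.x; assumes the y_prime and y_ defined above, with a single output column):
y = tf.nn.sigmoid(y_prime)  # independent probability for the binary label
cross_entropy = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(logits=y_prime, labels=y_))
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
correct_prediction = tf.equal(tf.round(y), y_)  # threshold at 0.5
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))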
