Tensorflow batch normalization

Tensorflow batch normalization - python

Below are the code i am using as a learning programming in Tensorflow.
from __future__ import print_function
from datetime import datetime
import time, os
import tensorflow as tf
# Import data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
# Parameters
learning_rate = 0.001
training_epoch = 5
batch_size = 128
display_step = 10
model_path = "./output/model.ckpt"
logs_path = './logs'
directory = os.path.dirname(model_path)
if not os.path.exists(directory):
os.makedirs(directory)
directory = os.path.dirname(logs_path)
if not os.path.exists(directory):
os.makedirs(directory)
# Network Parameters
n_input = 784 # data input
n_classes = 10 # classes
dropout = 0.5 # Dropout, probability to keep units
l2_regularization_strength = 0.0005 #l2 regularization strength
# tf Graph input
x = tf.placeholder(tf.float32, [None, n_input], name='InputData')
y = tf.placeholder(tf.float32, [None, n_classes], name='LabelData')
keep_prob = tf.placeholder(tf.float32) #dropout (keep probability)
mode = tf.placeholder(tf.int32);
# Create some wrappers for simplicity
def conv2d(x, kernel_shape, strides=1, mode=0):
# Conv2D wrapper, with batch normalization and relu activation
weights = tf.get_variable('weights', kernel_shape, initializer=tf.contrib.layers.xavier_initializer())
x = tf.nn.conv2d(x, weights, strides=[1, strides, strides, 1], padding='SAME')
pop_mean = tf.get_variable('bn_pop_mean', [x.get_shape()[-1]], initializer=tf.constant_initializer(0), trainable=False)
pop_var = tf.get_variable('bn_pop_var', [x.get_shape()[-1]], initializer=tf.constant_initializer(1), trainable=False)
scale = tf.get_variable('bn_scale', [x.get_shape()[-1]], initializer=tf.constant_initializer(1))
beta = tf.get_variable('bn_beta', [x.get_shape()[-1]], initializer=tf.constant_initializer(0))
epsilon = 1e-3
decay = 0.999
if mode == 0:
batch_mean, batch_var = tf.nn.moments(x,[0, 1, 2])
train_mean = tf.assign(pop_mean, pop_mean * decay + batch_mean * (1 - decay))
train_var = tf.assign(pop_var, pop_var * decay + batch_var * (1 - decay))
with tf.control_dependencies([train_mean, train_var]):
bn = tf.nn.batch_normalization(x, batch_mean, batch_var, beta, scale, epsilon, name='bn')
else:
bn = tf.nn.batch_normalization(x, pop_mean, pop_var, beta, scale, epsilon, name='bn')
return tf.nn.relu(bn, name = 'relu')
def maxpool2d(x, k=2):
# MaxPool2D wrapper
return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1], padding='SAME', name='maxpool')
# Create model
def conv_net(x, dropout, mode):
# Reshape input picture
x = tf.reshape(x, shape=[-1, 28, 28, 1])
with tf.variable_scope("conv1"):
# Convolution Layer
conv1 = conv2d(x, [5, 5, 1, 32], mode=mode)
# Max Pooling (down-sampling)
conv1 = maxpool2d(conv1, k=2)
with tf.variable_scope("conv2"):
# Convolution Layer
conv2 = conv2d(conv1, [5, 5, 32, 64], mode=mode)
# Max Pooling (down-sampling)
conv2 = maxpool2d(conv2, k=2)
with tf.variable_scope("fc1"):
# Fully connected layer
# Reshape conv2 output to fit fully connected layer input
weights = tf.get_variable("weights", [7*7*64, 1024], initializer=tf.contrib.layers.xavier_initializer())
biases = tf.get_variable("biases", [1024], initializer=tf.constant_initializer(0.0))
fc1 = tf.reshape(conv2, [-1, weights.get_shape().as_list()[0]])
fc1 = tf.add(tf.matmul(fc1, weights), biases)
fc1 = tf.nn.relu(fc1, name = 'relu')
# Apply Dropout
fc1 = tf.nn.dropout(fc1, dropout, name='dropout')
with tf.variable_scope("output"):
# Output, class prediction
weights = tf.get_variable("weights", [1024, n_classes], initializer=tf.contrib.layers.xavier_initializer())
biases = tf.get_variable("biases", [n_classes], initializer=tf.constant_initializer(0.0))
out = tf.add(tf.matmul(fc1, weights), biases)
return out
with tf.name_scope('Model'):
# Construct model
pred = conv_net(x, keep_prob, mode)
with tf.name_scope('Loss'):
# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
vars = tf.trainable_variables()
l2_regularization = tf.add_n([tf.nn.l2_loss(v) for v in vars if any(x in v.name for x in ['weights', 'biases'])])
for v in vars:
if any(x in v.name for x in ['weights', 'biases']):
print(v.name + '-included!')
else:
print(v.name)
cost += l2_regularization_strength*l2_regularization
with tf.name_scope('Optimizer'):
# Define optimizer
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
# Op to calculate every variable gradient
grads = tf.gradients(cost, tf.trainable_variables())
grads = list(zip(grads, tf.trainable_variables()))
# Op to update all variables according to their gradient
apply_grads = optimizer.apply_gradients(grads_and_vars=grads)
with tf.name_scope('Accuracy'):
# Evaluate model
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Initializing the variables
init = tf.initialize_all_variables()
# Create a summary to monitor cost tensor
tf.scalar_summary('cost', cost)
# Create a summary to monitor l2_regularization tensor
tf.scalar_summary('l2_regularization', l2_regularization)
# Create a summary to monitor accuracy tensor
tf.scalar_summary('accuracy', accuracy)
# Create summaries to visualize weights
for var in tf.trainable_variables():
tf.histogram_summary(var.name, var)
for var in tf.all_variables():
if 'bn_pop' in var.name:
tf.histogram_summary(var.name, var)
# Summarize all gradients
for grad, var in grads:
tf.histogram_summary(var.name + '/gradient', grad)
# Merge all summaries into a single op
merged_summary_op = tf.merge_all_summaries()
# 'Saver' op to save and restore all the variables
saver = tf.train.Saver()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
step = 1
# op to write logs to Tensorboard
summary_writer = tf.train.SummaryWriter(logs_path, graph=tf.get_default_graph())
# Keep training until reach max epoch
while step * batch_size < training_epoch * mnist.train.num_examples:
start_time = time.time()
# Get barch
batch_x, batch_y = mnist.train.next_batch(batch_size)
# Run optimization op (backprop)
sess.run(apply_grads, feed_dict={x: batch_x, y: batch_y, keep_prob: dropout, mode: 0})
duration = time.time() - start_time
if step % display_step == 0:
# Calculate batch loss and accuracy
loss, acc, summary = sess.run([cost, accuracy, merged_summary_op], feed_dict={x: batch_x,
y: batch_y,
keep_prob: 1.,
mode: 1})
# Write logs at every iteration
summary_writer.add_summary(summary, step)
# Calculate number sample per sec
samples_per_sec = batch_size / duration
format_str = ('%s: Iter %d, Epoch %d, (%.1f examples/sec; %.3f sec/batch), Minibatch Loss = %.5f , Training Accuracy=%.5f')
print (format_str % (datetime.now(), step*batch_size, int(step*batch_size/mnist.train.num_examples) + 1, samples_per_sec, float(duration), loss, acc))
step += 1
print("Optimization Finished!")
# Calculate accuracy for 256 mnist test images
print("Testing Accuracy:", \
sess.run(accuracy, feed_dict={x: mnist.test.images[:5000],
y: mnist.test.labels[:5000],
keep_prob: 1.,
mode: 2}))
# Save model weights to disk
save_path = saver.save(sess, model_path)
print("Model saved in file: %s" % save_path)
When i open the tensorboard and look at the histogram and distribution sesstion, the 'bn_pop_mean' and 'bn_pop_var' in 'conv1' and 'conv2' are not updateing (they are constant at the initialised value).
Although after the training i achieved around 97% accuracy, i don't know if it the batch normalization is in effect.

In your conv_net function, you didn't set the "reuse" parameter for the tf.variable_scope(). The default setting for "reuse" is "None". Every time conv2d function is called, "bn_pop_mean" and "bn_pop_var" are re-initalized.

if mode == 0:
batch_mean, batch_var = tf.nn.moments(x,[0, 1, 2])
train_mean = tf.assign(pop_mean, pop_mean * decay + batch_mean * (1 - decay))
train_var = tf.assign(pop_var, pop_var * decay + batch_var * (1 - decay))
with tf.control_dependencies([train_mean, train_var]):
bn = tf.nn.batch_normalization(x, batch_mean, batch_var, beta, scale, epsilon, name='bn')
else:
bn = tf.nn.batch_normalization(x, pop_mean, pop_var, beta, scale, epsilon, name='bn')
It seems that the if prediction here always evaluate to be False. I guess what you want to do is using mode via feed_dict to control your batch normalization. So you should use tf.cond in TensorFlow instead of if in Python.

Related

tensorflow not training (only biases change)

I want to train a convolution network to output a number 0-100. But very quickly the model stops updating the weights, and only the biases in the Fully connected layers are changed. And I am unable to understand why.
Image of weights:
I've played around with different number of layers and so on, but I always run in to the same problem of only the FC biases changing.
This is the current code Im testing. Ive stripped away thing like dropout and such. Over fitting is not a concern at this moment. In fact, I would like to try and over fit the data just so I can see that my model learns anything
from __future__ import print_function
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
###################################################################################
############################# Read Data ###########################################
with tf.name_scope("READ_DATA"):
def read_my_file_format(filename_queue):
reader = tf.WholeFileReader()
key, record_string = reader.read(filename_queue)
split_res = tf.string_split([key],'_')
key = split_res.values[5]
example = tf.image.decode_png(record_string)
example = tf.image.rgb_to_grayscale(example, name=None)
processed_example = resize_img(example)
processed_example = reshape_img(processed_example)
return processed_example, key
def resize_img(imgg):
return tf.image.resize_images(imgg,[102,525])
def reshape_img(imgg):
return tf.reshape(imgg,shape=[102,525,1])
def input_pipeline( bsize=30, num_epochs=None):
filename_queue = tf.train.string_input_producer(
tf.train.match_filenames_once("./png_imgs/*.png"), num_epochs=num_epochs, shuffle=True)
example, label = read_my_file_format(filename_queue)
min_after_dequeue = bsize
capacity = min_after_dequeue + 3 * 8
example_batch, label_batch = tf.train.shuffle_batch(
[example, label], batch_size=bsize, capacity=capacity,
min_after_dequeue=min_after_dequeue)
return example_batch, label_batch
imb_batch1,label_batch1 = input_pipeline()
single_img, single_lbl = input_pipeline(bsize=1)
############################# Read Data ###########################################
###################################################################################
# Parameters
#learning_rate = 0.0001
training_iters = 200000
batch_size = 30
# Network Parameters
n_input = 600*300*3
n_classes = 1 # MNIST total classes (0-9 digits)
dropout = 0.75 # Dropout, probability to keep units
# tf Graph input
x = tf.placeholder(tf.float32, [None, 102,525,1])
y = tf.placeholder(tf.float32, [None, 1])
keep_prob = tf.placeholder(tf.float32) #dropout (keep probability)
learning_rate = tf.placeholder(tf.float32)
# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1):
# Conv2D wrapper, with bias and relu activation
x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
x = tf.nn.bias_add(x, b)
return tf.nn.relu(x)
def maxpool2d(x, k=2):
# MaxPool2D wrapper
return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
padding='SAME')
# Create model
def conv_net(x, dropout):
# Convolution Layer
with tf.variable_scope('conv1') as scope:
w = tf.get_variable('weights',[5,5,1,32], initializer=tf.contrib.layers.xavier_initializer())
b = tf.get_variable('biases',[32],initializer=tf.random_normal_initializer())
conv1 = conv2d(x,w,b)
tf.summary.histogram('weights',w)
tf.summary.histogram('biases',b)
with tf.variable_scope('conv2') as scope:
w = tf.get_variable('weights',[5,5,32,32], initializer=tf.contrib.layers.xavier_initializer())
b = tf.get_variable('biases',[32],initializer=tf.random_normal_initializer())
conv2 = conv2d(conv1,w,b)
tf.summary.histogram('weights',w)
tf.summary.histogram('biases',b)
with tf.name_scope("Maxpool"):
conv2 = maxpool2d(conv2,k=2)
with tf.variable_scope('FC1') as scope:
w = tf.get_variable('weights',[32*263*51,64], initializer=tf.contrib.layers.xavier_initializer())
b = tf.get_variable('biases',[64],initializer=tf.random_normal_initializer())
FC1 = tf.reshape(conv2,[-1,w.get_shape().as_list()[0]])
FC1 = tf.add(tf.matmul(FC1,w),b)
tf.summary.histogram('weights',w)
tf.summary.histogram('biases',b)
with tf.variable_scope('FC2') as scope:
w = tf.get_variable('weights',[64,1], initializer=tf.contrib.layers.xavier_initializer())
b = tf.get_variable('biases',[1],initializer=tf.random_normal_initializer())
FC2 = tf.add(tf.matmul(FC1,w),b)
tf.summary.histogram('weights',w)
tf.summary.histogram('biases',b)
return FC2
# Construct model
pred = conv_net(x, keep_prob)
def cost():
with tf.name_scope("Cost"):
diff = tf.abs(tf.subtract(y,pred))
cost=tf.reduce_mean(diff)
print(cost)
tf.summary.histogram('Label',y)
tf.summary.histogram('predicted',pred)
tf.summary.scalar('cost',cost)
return cost
with tf.name_scope("Optimizer"):
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost())
# optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost())
# Initializing the variables
saver = tf.train.Saver()
init = tf.global_variables_initializer()
merged = tf.summary.merge_all()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
writer = tf.summary.FileWriter("/tmp/tensorboard/log01")
writer.add_graph(sess.graph)
step = 1
l_rate= 0.1
# Keep training until reach max iterations
while step * batch_size < training_iters:
print("step: ",step)
batch_x, batch_y = sess.run([imb_batch1,label_batch1])
batch_y = batch_y.reshape(-1,1)
if step % 100 == 0 :
l_rate = l_rate/5
if l_rate < 0.000001 :
l_rate= 0.000001
if step > 20:
_,sumry = sess.run([optimizer,merged], feed_dict={x: batch_x, y: batch_y,
keep_prob: dropout, learning_rate: l_rate})
writer.add_summary(sumry,step)
else :
sess.run(optimizer, feed_dict={x: batch_x, y: batch_y,
keep_prob: dropout, learning_rate: l_rate})
step += 1
print("Training Done!")
coord.request_stop()
coord.join(threads)
Is there a silly mistake somewhere in the code causing this?

You don't have a nonlinearity in your first fully-connected layer, so it adds no value compared to having just one fully-connected layer.

I have built a CNN for detecting human faces . From the very first epoch I am getting higher accuracy. What might be the reason for it?

The training data set consists of face images taken from youtube faces database labelled as one and non face images are taken from 256Object categories 25k images are chosen for both positive and negative data. so totally 50k for training and another 10k images is taken from yooutube faces and 256 object categories which are not repeated.
The problem is I am getting 99% accuracy after just 12k iterations in first epoch itself and I am printing cost value also it is also starts from very high value like 596014.000 like that. When it is tested against the other face images it performs very badly.
cost vs epoch graph
import tensorflow as tf
import read_data
from sklearn import metrics
import numpy as np
import os
import graph_plotter as gp
# Parameters
learning_rate = 0.001
epochs = 30
batch_size = 100
display_step = 5
# tf Graph input
input_data = tf.placeholder(tf.float32, [None, 27, 31, 3])
output_data = tf.placeholder(tf.float32, [None, 1])
keep_prob = tf.placeholder(tf.float32) #dropout (keep probability)
# Getting train and test data
train_data, train_label , test_data, test_label = read_data.getData()
def conv2d(x, w, bias, k=1):
x = tf.nn.conv2d(x, w, strides=[1, k, k, 1], padding='SAME')
x = tf.nn.bias_add(x, bias)
return tf.nn.relu(x)
# Performs max pooling on the convolution layer output
def maxpool2d(x, k=2):
return tf.nn.max_pool(x,
ksize=[1, k, k, 1], strides=[1, k, k, 1],
padding='SAME')
# Weights generated randomly according to layer
weights = {
# Conv 4*4 , 1 input , 32 outputs
'wc1': tf.Variable(tf.random_normal([4, 4, 3, 32])),
# Conv 3*3 , 32 inputs , 32 outputs
'wc2': tf.Variable(tf.random_normal([3, 3, 32, 64])),
# Conv 5*6 , 64 input , 128 outputs
'wc3': tf.Variable(tf.random_normal([5, 6, 64, 128])),
# Conv 1*1 , 128 inputs , 256 outputs
'wc4': tf.Variable(tf.random_normal([1, 1, 128, 256])),
# Conv 1*1 , 256 inputs , 256 outputs
'wc5': tf.Variable(tf.random_normal([1, 1, 256, 512])),
# Output Layer 7*8*256 inputs and 1 output ( face or non-face )
'out': tf.Variable(tf.random_normal([7*8*512, 1]))
}
biases = {
'bc1': tf.Variable(tf.random_normal([32])),
'bc2': tf.Variable(tf.random_normal([64])),
'bc3': tf.Variable(tf.random_normal([128])),
'bc4': tf.Variable(tf.random_normal([256])),
'bc5': tf.Variable(tf.random_normal([512])),
'out': tf.Variable(tf.random_normal([1]))
}
def model(x, weight, bias, dropout):
# Layer 1
conv1 = conv2d(x, weight['wc1'], bias['bc1'])
conv1 = maxpool2d(conv1, k=2)
# Layer 2
conv2 = conv2d(conv1, weight['wc2'], bias['bc2'])
conv2 = maxpool2d(conv2, k=2)
# Layer 3
conv3 = conv2d(conv2, weight['wc3'], bias['bc3'])
# Layer 4
conv4 = conv2d(conv3, weight['wc4'], bias['bc4'])
# Layer 5
conv5 = conv2d(conv4, weight['wc5'], bias['bc5'])
#conv5 = tf.nn.dropout(conv5, dropout)
# Flattening data
intermediate = tf.reshape(conv5, shape=[-1, 7*8*512])
# Output Layer
output = tf.add(tf.matmul(intermediate, weight['out']), bias['out'])
return output
pred = model(input_data, weights, biases, keep_prob)
l2_loss = 0.001*(
tf.nn.l2_loss(weights.get('wc4')) +
tf.nn.l2_loss(weights.get('wc5')) +
tf.nn.l2_loss(weights.get('out')))
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
pred, output_data)) + l2_loss
tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
correct_pred = tf.equal(
tf.greater(sigmoid_output, 0.5), tf.greater(output_data, 0.5))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
y_p = tf.cast(tf.greater(sigmoid_output, 0.5), tf.int32)
saver = tf.train.Saver()
tf.add_to_collection('y_p', y_p)
tf.add_to_collection('pred', pred)
tf.add_to_collection('x', input_data)
tf.add_to_collection('y', output_data)
init = tf.global_variables_initializer()
with tf.device("/gpu:0"):
with tf.Session() as sess:
sess.run(init)
train_data_minibatches = [train_data[k:k + batch_size]
for k in range(0, len(train_data), batch_size)]
train_label_minibatches = [train_label[k:k + batch_size]
for k in range(0, len(train_label), batch_size)]
step = 0
batch_count = 0
avg_cost_list = []
avg_accuracy_list = []
for epoch in range(epochs):
print('Epoch '+epoch.__str__())
cost_list = []
accuracy_list = []
for batch_x, batch_y in zip(
train_data_minibatches, train_label_minibatches):
batch_count += 1
sess.run(optimizer, feed_dict={
input_data: batch_x, output_data: batch_y,
keep_prob: 0.75})
# if epoch % display_step == 0:
loss, acc, output = sess.run([cost, accuracy, sig],
feed_dict={input_data: batch_x, output_data: batch_y, keep_prob: 0.75})
cost_list.append(loss)
accuracy_list.append(acc)
print("Iter " + str(step * batch_size) +" Loss "+ "{:.5f}".format(loss)+ ", Training Accuracy= " +
"{:.5f}".format(acc))
step += 1
average_cost = sum(cost_list) / len(cost_list)
average_acc = sum(accuracy_list) / len(accuracy_list)
avg_cost_list.append(average_cost)
avg_accuracy_list.append(average_acc)
if epoch % display_step == 0:
test_acc, y_pred = sess.run([accuracy, y_p], feed_dict={input_data: test_data,
output_data: test_label,
keep_prob: 0.75})
print(metrics.confusion_matrix(test_label, y_pred))
print("Testing Accuracy : " + "{:.5f}".format(test_acc))
print("Optimization finished !!")
# Saving cost Vs epoch graph, and accuracy Vs epoch graphs.
gp.cost_vs_epoch(avg_cost_list)
gp.accuracy_vs_epoch(avg_accuracy_list)
save_path = saver.save(sess=sess, save_path=save_path, write_meta_graph=True)

The line
correct_pred = tf.equal(tf.greater(sigmoid_output, 0.5),
tf.greater(output_data, 0.5))
is likely to be wrong. Your code seems not to have sigmoid_output, but only pred.
If that is not the problem, I would look into train_data. How many "True" labels > 0.5 do you have? How many labels which evaluate to false do you have?

same batch but different batch size generate different result

For example, we have 64*100 input data which will be send to the tensor flow graph, and it will generate 64*(n_hidden nodes) output before putting into softmax or whatever loss function. We put 1*100 into the same graph, the result should be the the first row of previous output, but results are not. I use the tensor flow example on Mnist to test the comparison.
'''
A Multilayer Perceptron implementation example using TensorFlow library.
This example is using the MNIST database of handwritten digits
(http://yann.lecun.com/exdb/mnist/)
Author: Aymeric Damien
Project: https://github.com/aymericdamien/TensorFlow-Examples/
'''
from __future__ import print_function
import numpy as np
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
import tensorflow as tf
# Parameters
learning_rate = 0.001
training_epochs = 15
batch_size = 100
display_step = 1
# Network Parameters
n_hidden_1 = 256 # 1st layer number of features
n_hidden_2 = 256 # 2nd layer number of features
n_input = 784 # MNIST data input (img shape: 28*28)
n_classes = 10 # MNIST total classes (0-9 digits)
# tf Graph input
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])
# Create model
def multilayer_perceptron(x, weights, biases):
# Hidden layer with RELU activation
layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
layer_1 = tf.nn.relu(layer_1)
# Hidden layer with RELU activation
layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
layer_2 = tf.nn.relu(layer_2)
# Output layer with linear activation
out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
return out_layer
# Store layers weight & bias
weights = {
'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1]), name ='layer1'),
'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2]), name = 'layer2'),
'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]), name = 'layer3')
}
biases = {
'b1': tf.Variable(tf.random_normal([n_hidden_1]), name = 'layer1_b'),
'b2': tf.Variable(tf.random_normal([n_hidden_2]), name = 'layer2_b'),
'out': tf.Variable(tf.random_normal([n_classes]), name = 'layer3_b')
}
# Construct model
pred = multilayer_perceptron(x, weights, biases)
# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
#optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
var = tf.all_variables()
trainer = tf.train.AdamOptimizer(learning_rate=learning_rate)
grads = trainer.compute_gradients(cost, var)
update = trainer.apply_gradients(grads)
# Initializing the variables
init = tf.initialize_all_variables()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
# Training cycle
for epoch in range(training_epochs):
avg_cost = 0.
total_batch = int(mnist.train.num_examples/batch_size)
# Loop over all batches
for i in range(total_batch):
batch_x, batch_y = mnist.train.next_batch(batch_size)
# Run optimization op (backprop) and cost op (to get loss value)
#_, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
#c, v,grad, Pred, bi = sess.run([cost, var,grads, pred, biases], feed_dict={x: batch_x, y: batch_y})
Pred_2 = sess.run(pred, feed_dict={x: batch_x, y: batch_y})
Pred_1 = sess.run(pred , feed_dict={x: batch_x[0:1,:], y: batch_y[0:1]})
print(Pred_2[0] == Pred_1)
# Compute average loss
avg_cost += c / total_batch
# Display logs per epoch step
if epoch % display_step == 0:
print("Epoch:", '%04d' % (epoch+1), "cost=", \
"{:.9f}".format(avg_cost))
# print(len(v))
# g1 = np.array(grad[0])
# g2 = np.array(grad[1])
# g3 = np.array(grad[2])
# g4 = np.array(grad[3])
# g5 = np.array(grad[4])
# g6 = np.array(grad[5])
# print(g1.shape)
# print(g2.shape)
# print(g3.shape)
# print(g4.shape)
# print(g5.shape)
# print(g6.shape)
# print(g6[0,:])
# print(g6[1,:])
# print(bi['out'])
#print(type(updating))
print("Optimization Finished!")
# Test model
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
print("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))
print(Pred_2[0] == Pred_1) Should be the same, but they are not. It is strange.

The gradient descent path should be different if your weight and bias initialization is random and the gradients each time are different, it might take a path towards a different minimum.

Tensorflow accuracy at .99 but predictions awful

Maybe I'm making predictions wrong?
Here's the project... I have a greyscale input image that I am trying to segment. The segmentation is a simple binary classification (think of foreground vs background). So the ground truth (y) is a matrix of 0's and 1's -- so there's 2 classifications. Oh and the input image is a square, so I just use one variable called n_input
My accuracy essentially converges to 0.99 but when I make a prediction I get all zero's. EDIT --> there is a single 1 in each output matrices, both in the same place...
Here's my session code(everything else is working)...
with tf.Session() as sess:
sess.run(init)
summary = tf.train.SummaryWriter('/tmp/logdir/', sess.graph_def)
step = 1
from tensorflow.contrib.learn.python.learn.datasets.scroll import scroll_data
data = scroll_data.read_data('/home/kendall/Desktop/')
# Keep training until reach max iterations
flag = 0
# while flag == 0:
while step * batch_size < training_iters:
batch_y, batch_x = data.train.next_batch(batch_size)
# pdb.set_trace()
# batch_x = batch_x.reshape((batch_size, n_input))
batch_x = batch_x.reshape((batch_size, n_input, n_input))
batch_y = batch_y.reshape((batch_size, n_input, n_input))
batch_y = convert_to_2_channel(batch_y, batch_size)
# batch_y = batch_y.reshape((batch_size, n_output, n_classes))
batch_y = batch_y.reshape((batch_size, 200, 200, n_classes))
sess.run(optimizer, feed_dict={x: batch_x, y: batch_y,
keep_prob: dropout})
if step % display_step == 0:
flag = 1
# Calculate batch loss and accuracy
loss, acc = sess.run([cost, accuracy], feed_dict={x: batch_x,
y: batch_y,
keep_prob: 1.})
print "Iter " + str(step*batch_size) + ", Minibatch Loss= " + \
"{:.6f}".format(loss) + ", Training Accuracy= " + \
"{:.5f}".format(acc)
step += 1
print "Optimization Finished!"
save_path = "model.ckpt"
saver.save(sess, save_path)
im = Image.open('/home/kendall/Desktop/HA900_frames/frame0635.tif')
batch_x = np.array(im)
pdb.set_trace()
batch_x = batch_x.reshape((1, n_input, n_input))
batch_x = batch_x.astype(float)
# pdb.set_trace()
prediction = sess.run(pred, feed_dict={x: batch_x, keep_prob: 1.})
print prediction
arr1 = np.empty((n_input,n_input))
arr2 = np.empty((n_input,n_input))
for i in xrange(n_input):
for j in xrange(n_input):
for k in xrange(2):
if k == 0:
arr1[i][j] = prediction[0][i][j][k]
else:
arr2[i][j] = prediction[0][i][j][k]
# prediction = np.asarray(prediction)
# prediction = np.reshape(prediction, (200,200))
# np.savetxt("prediction.csv", prediction, delimiter=",")
np.savetxt("prediction1.csv", arr1, delimiter=",")
np.savetxt("prediction2.csv", arr2, delimiter=",")
Since there are two classifications, that end part (with the couple of loops) is just to partition the prediction into two 2x2 matrices.
I saved the prediction arrays to a CSV file, and like I said, they were all zeros.
I have also confirmed all data is correct (dimensions and values).
Why would the training converge, but predictions are awful?
If you want to look at all the code, here it is...
import tensorflow as tf
import pdb
import numpy as np
from numpy import genfromtxt
from PIL import Image
# Import MINST data
# from tensorflow.examples.tutorials.mnist import input_data
# mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
# Parameters
learning_rate = 0.001
training_iters = 20000
batch_size = 128
display_step = 1
# Network Parameters
n_input = 200 # MNIST data input (img shape: 28*28)
n_output = 40000 # MNIST total classes (0-9 digits)
n_classes = 2
#n_input = 200
dropout = 0.75 # Dropout, probability to keep units
# tf Graph input
x = tf.placeholder(tf.float32, [None, n_input, n_input])
y = tf.placeholder(tf.float32, [None, n_input, n_input, n_classes])
keep_prob = tf.placeholder(tf.float32) #dropout (keep probability)
# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1):
# Conv2D wrapper, with bias and relu activation
x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
x = tf.nn.bias_add(x, b)
return tf.nn.relu(x)
def maxpool2d(x, k=2):
# MaxPool2D wrapper
return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
padding='SAME')
# Create model
def conv_net(x, weights, biases, dropout):
# Reshape input picture
x = tf.reshape(x, shape=[-1, n_input, n_input, 1])
# Convolution Layer
conv1 = conv2d(x, weights['wc1'], biases['bc1'])
# Max Pooling (down-sampling)
conv1 = maxpool2d(conv1, k=2)
conv1 = tf.nn.local_response_normalization(conv1)
# Convolution Layer
conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
# Max Pooling (down-sampling)
conv2 = tf.nn.local_response_normalization(conv2)
conv2 = maxpool2d(conv2, k=2)
# Convolution Layer
conv3 = conv2d(conv2, weights['wc3'], biases['bc3'])
# Max Pooling (down-sampling)
conv3 = tf.nn.local_response_normalization(conv3)
conv3 = maxpool2d(conv3, k=2)
# pdb.set_trace()
# Fully connected layer
# Reshape conv2 output to fit fully connected layer input
fc1 = tf.reshape(conv3, [-1, weights['wd1'].get_shape().as_list()[0]])
fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
fc1 = tf.nn.relu(fc1)
# Apply Dropout
fc1 = tf.nn.dropout(fc1, dropout)
output = []
for i in xrange(2):
output.append(tf.nn.softmax(tf.add(tf.matmul(fc1, weights['out']), biases['out'])))
return output
# return tf.nn.softmax(tf.add(tf.matmul(fc1, weights['out']), biases['out']))
# Store layers weight & bias
weights = {
# 5x5 conv, 1 input, 32 outputs
'wc1': tf.Variable(tf.random_normal([5, 5, 1, 32])),
# 5x5 conv, 32 inputs, 64 outputs
'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
# 5x5 conv, 32 inputs, 64 outputs
'wc3': tf.Variable(tf.random_normal([5, 5, 64, 128])),
# fully connected, 7*7*64 inputs, 1024 outputs
'wd1': tf.Variable(tf.random_normal([25*25*128, 1024])),
# 1024 inputs, 10 outputs (class prediction)
'out': tf.Variable(tf.random_normal([1024, n_output]))
}
biases = {
'bc1': tf.Variable(tf.random_normal([32])),
'bc2': tf.Variable(tf.random_normal([64])),
'bc3': tf.Variable(tf.random_normal([128])),
'bd1': tf.Variable(tf.random_normal([1024])),
'out': tf.Variable(tf.random_normal([n_output]))
}
# Construct model
pred = conv_net(x, weights, biases, keep_prob)
# pdb.set_trace()
pred = tf.pack(tf.transpose(pred,[1,2,0]))
pred = tf.reshape(pred, [-1,n_input,n_input,n_classes])
# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(pred, y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Evaluate model
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Initializing the variables
init = tf.initialize_all_variables()
saver = tf.train.Saver()
def convert_to_2_channel(x, batch_size):
#assume input has dimension (batch_size,x,y)
#output will have dimension (batch_size,x,y,2)
output = np.empty((batch_size, 200, 200, 2))
temp_arr1 = np.empty((batch_size, 200, 200))
temp_arr2 = np.empty((batch_size, 200, 200))
for i in xrange(batch_size):
for j in xrange(200):
for k in xrange(200):
if x[i][j][k] == 1:
temp_arr1[i][j][k] = 1
temp_arr2[i][j][k] = 0
else:
temp_arr1[i][j][k] = 0
temp_arr2[i][j][k] = 1
for i in xrange(batch_size):
for j in xrange(200):
for k in xrange(200):
for l in xrange(2):
if l == 0:
output[i][j][k][l] = temp_arr1[i][j][k]
else:
output[i][j][k][l] = temp_arr2[i][j][k]
return output
# Launch the graph
with tf.Session() as sess:
sess.run(init)
summary = tf.train.SummaryWriter('/tmp/logdir/', sess.graph_def)
step = 1
from tensorflow.contrib.learn.python.learn.datasets.scroll import scroll_data
data = scroll_data.read_data('/home/kendall/Desktop/')
# Keep training until reach max iterations
flag = 0
# while flag == 0:
while step * batch_size < training_iters:
batch_y, batch_x = data.train.next_batch(batch_size)
# pdb.set_trace()
# batch_x = batch_x.reshape((batch_size, n_input))
batch_x = batch_x.reshape((batch_size, n_input, n_input))
batch_y = batch_y.reshape((batch_size, n_input, n_input))
batch_y = convert_to_2_channel(batch_y, batch_size)
# batch_y = batch_y.reshape((batch_size, n_output, n_classes))
batch_y = batch_y.reshape((batch_size, 200, 200, n_classes))
sess.run(optimizer, feed_dict={x: batch_x, y: batch_y,
keep_prob: dropout})
if step % display_step == 0:
flag = 1
# Calculate batch loss and accuracy
loss, acc = sess.run([cost, accuracy], feed_dict={x: batch_x,
y: batch_y,
keep_prob: 1.})
print "Iter " + str(step*batch_size) + ", Minibatch Loss= " + \
"{:.6f}".format(loss) + ", Training Accuracy= " + \
"{:.5f}".format(acc)
step += 1
print "Optimization Finished!"
save_path = "model.ckpt"
saver.save(sess, save_path)
im = Image.open('/home/kendall/Desktop/HA900_frames/frame0635.tif')
batch_x = np.array(im)
pdb.set_trace()
batch_x = batch_x.reshape((1, n_input, n_input))
batch_x = batch_x.astype(float)
# pdb.set_trace()
prediction = sess.run(pred, feed_dict={x: batch_x, keep_prob: 1.})
print prediction
arr1 = np.empty((n_input,n_input))
arr2 = np.empty((n_input,n_input))
for i in xrange(n_input):
for j in xrange(n_input):
for k in xrange(2):
if k == 0:
arr1[i][j] = prediction[0][i][j][k]
else:
arr2[i][j] = prediction[0][i][j][k]
# prediction = np.asarray(prediction)
# prediction = np.reshape(prediction, (200,200))
# np.savetxt("prediction.csv", prediction, delimiter=",")
np.savetxt("prediction1.csv", arr1, delimiter=",")
np.savetxt("prediction2.csv", arr2, delimiter=",")
# Calculate accuracy for 256 mnist test images
print "Testing Accuracy:", \
sess.run(accuracy, feed_dict={x: data.test.images[:256],
y: data.test.labels[:256],
keep_prob: 1.})

Errors in the code
There are multiple errors in your code:
you shouldn't call tf.nn.sigmoid_cross_entropy_with_logits with the output of a softmax layer, but with the unscaled logits:
WARNING: This op expects unscaled logits, since it performs a softmax on logits internally for efficiency. Do not call this op with the output of softmax, as it will produce incorrect results.
in fact since you have 2 classes, you should use a loss with softmax, using tf.nn.softmax_cross_entropy_with_logits
When using tf.argmax(pred, 1), you only apply argmax over axis 1, which is the height of the output image. You should use tf.argmax(pred, 3) on the last axis (of size 2).
This might explain why you get 0.99 accuracy
On the output image, it will take the argmax over the height of the image, which is by default 0 (as all values are equal for each channel)
Wrong model
The biggest drawback is that your model in general will be very hard to optimize.
You have a softmax over 40,000 classes, which is huge.
You do not take advantage at all of the fact that you want to output an image (the prediction foreground / background).
for instance prediction 2,345 is highly correlated with prediction 2,346 and prediction 2,545 but you don't take that into account
I recommend reading a bit about semantic segmentation first:
this paper: Fully Convolutional Networks for Semantic Segmentation
these slides from CS231n (Stanford): especially the part about upsampling and deconvolution
Recommendations
If you want to work with TensorFlow, you will need to start small. First try a very simple network with maybe 1 hidden layer.
You need to plot all the shapes of your tensors to make sure they correspond to what you thought. For instance, if you had plotted tf.argmax(y, 1), you would have realized the shape is [batch_size, 200, 2] instead of the expected [batch_size, 200, 200].
TensorBoard is your friend, you should try to plot the input image here, as well as your predictions to see what they look like.
Try small, with a very small dataset of 10 images and see if you can overfit it and predict almost the exact response.
To conclude, I am not sure of all my suggestions but they are worth trying, and I hope this will help you on the path to success !

Tensorflow reshaping a tensor

I'm trying to use tf.nn.sparse_softmax_cross_entropy_with_logits and I have followed the answer by user Olivier Moindrot [here][1] but I'm getting a dimension error
I'm building a segmentation network, so the input image is 200x200 and the output image is 200x200. The classification is binary, so foreground and background.
After I build the CNN pred = conv_net(x, weights, biases, keep_prob)
pred looks like this <tf.Tensor 'Add_1:0' shape=(?, 40000) dtype=float32>
The CNN has a couple of conv layers followed by a fully connected layer. The fully connected layer is 40000 because it is 200x200 flattened.
According to the above link, I reshape pred like so...
(side note: I also tried packing tf.pack() two pred's -- like above -- together, but I thought that was wrong)
pred = tf.reshape(pred, [-1, 200, 200, 2])
...so that there are 2 classifications. Continuing the above link...
temp_pred = tf.reshape(pred, [-1,2])
temp_y = tf.reshape(y, [-1])
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(temp_pred, temp_y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
I have the following placeholders and batch data...
x = tf.placeholder(tf.float32, [None, 200, 200])
y = tf.placeholder(tf.int64, [None, 200, 200])
(Pdb) batch_x.shape
(10, 200, 200)
(Pdb) batch_y.shape
(10, 200, 200)
When I run a training session, I get the following dimension error:
tensorflow.python.framework.errors.InvalidArgumentError: logits first
dimension must match labels size. logits shape=[3200000,2] labels
shape=[400000]
My full code looks like this:
import tensorflow as tf
import pdb
import numpy as np
# Import MINST data
# from tensorflow.examples.tutorials.mnist import input_data
# mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
# Parameters
learning_rate = 0.001
training_iters = 200000
batch_size = 10
display_step = 1
# Network Parameters
n_input = 200 # MNIST data input (img shape: 28*28)
n_classes = 2 # MNIST total classes (0-9 digits)
n_output = 40000
#n_input = 200
dropout = 0.75 # Dropout, probability to keep units
# tf Graph input
x = tf.placeholder(tf.float32, [None, n_input, n_input])
y = tf.placeholder(tf.int64, [None, n_input, n_input])
keep_prob = tf.placeholder(tf.float32) #dropout (keep probability)
# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1):
# Conv2D wrapper, with bias and relu activation
x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
x = tf.nn.bias_add(x, b)
return tf.nn.relu(x)
def maxpool2d(x, k=2):
# MaxPool2D wrapper
return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
padding='SAME')
# Create model
def conv_net(x, weights, biases, dropout):
# Reshape input picture
x = tf.reshape(x, shape=[-1, 200, 200, 1])
# Convolution Layer
conv1 = conv2d(x, weights['wc1'], biases['bc1'])
# Max Pooling (down-sampling)
# conv1 = tf.nn.local_response_normalization(conv1)
# conv1 = maxpool2d(conv1, k=2)
# Convolution Layer
conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
# Max Pooling (down-sampling)
# conv2 = tf.nn.local_response_normalization(conv2)
# conv2 = maxpool2d(conv2, k=2)
# Convolution Layer
conv3 = conv2d(conv2, weights['wc3'], biases['bc3'])
# # Max Pooling (down-sampling)
# conv3 = tf.nn.local_response_normalization(conv3)
# conv3 = maxpool2d(conv3, k=2)
# return conv3
# Fully connected layer
# Reshape conv2 output to fit fully connected layer input
fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
fc1 = tf.nn.relu(fc1)
# Apply Dropout
fc1 = tf.nn.dropout(fc1, dropout)
return tf.add(tf.matmul(fc1, weights['out']), biases['out'])
# Output, class prediction
# output = []
# for i in xrange(2):
# # output.append(tf.nn.softmax(tf.add(tf.matmul(fc1, weights['out']), biases['out'])))
# output.append((tf.add(tf.matmul(fc1, weights['out']), biases['out'])))
#
# return output
# Store layers weight & bias
weights = {
# 5x5 conv, 1 input, 32 outputs
'wc1': tf.Variable(tf.random_normal([5, 5, 1, 32])),
# 5x5 conv, 32 inputs, 64 outputs
'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
# 5x5 conv, 32 inputs, 64 outputs
'wc3': tf.Variable(tf.random_normal([5, 5, 64, 128])),
# fully connected, 7*7*64 inputs, 1024 outputs
'wd1': tf.Variable(tf.random_normal([50*50*64, 1024])),
# 1024 inputs, 10 outputs (class prediction)
'out': tf.Variable(tf.random_normal([1024, n_output]))
}
biases = {
'bc1': tf.Variable(tf.random_normal([32])),
'bc2': tf.Variable(tf.random_normal([64])),
'bc3': tf.Variable(tf.random_normal([128])),
'bd1': tf.Variable(tf.random_normal([1024])),
'out': tf.Variable(tf.random_normal([n_output]))
}
# Construct model
pred = conv_net(x, weights, biases, keep_prob)
pdb.set_trace()
# pred = tf.pack(tf.transpose(pred,[1,2,0]))
pred = tf.reshape(pred, [-1, n_input, n_input, 2])
temp_pred = tf.reshape(pred, [-1,2])
temp_y = tf.reshape(y, [-1])
# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(temp_pred, temp_y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Evaluate model
# correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
temp_pred2 = tf.reshape(pred, [-1,n_input,n_input])
correct_pred = tf.equal(tf.cast(y,tf.float32),tf.sub(temp_pred2,tf.cast(y,tf.float32)))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Initializing the variables
init = tf.initialize_all_variables()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
summ = tf.train.SummaryWriter('/tmp/logdir/', sess.graph_def)
step = 1
from tensorflow.contrib.learn.python.learn.datasets.scroll import scroll_data
data = scroll_data.read_data('/home/kendall/Desktop/')
# Keep training until reach max iterations
while step * batch_size < training_iters:
batch_x, batch_y = data.train.next_batch(batch_size)
# Run optimization op (backprop)
batch_x = batch_x.reshape((batch_size, n_input, n_input))
batch_y = batch_y.reshape((batch_size, n_input, n_input))
batch_y = np.int64(batch_y)
# y = tf.reshape(y, [-1,n_input,n_input])
pdb.set_trace()
sess.run(optimizer, feed_dict={x: batch_x, y: batch_y, keep_prob: dropout})
if step % display_step == 0:
# Calculate batch loss and accuracy
pdb.set_trace()
loss, acc = sess.run([cost, accuracy], feed_dict={x: batch_x, y: batch_y, keep_prob: 1.})
print "Iter " + str(step*batch_size) + ", Minibatch Loss= " + \
"{:.6f}".format(loss) + ", Training Accuracy= " + \
"{:.5f}".format(acc)
step += 1
print "Optimization Finished!"
# Calculate accuracy for 256 mnist test images
print "Testing Accuracy:", \
sess.run(accuracy, feed_dict={x: data.test.images[:256],
y: data.test.labels[:256],
keep_prob: 1.})
[1]: http://stackoverflow.com/questions/35317029/how-to-implement-pixel-wise-classification-for-scene-labeling-in-tensorflow/37294185?noredirect=1#comment63253577_37294185

Let's forget about softmax and use a simpler tf.nn.sigmoid_cross_entropy_with_logits here:
with sigmoid, you only need one prediction per pixel
if pred[pixel] > 0.5, you predict 1
if pred[pixel] < 0.5, you predict 0
the shape of prediction and target should then be [batch_size, 40000]
pred = conv_net(x, weights, biases, keep_prob) # shape [batch_size, 40000]
flattened_y = tf.reshape(y, [-1, 40000]) # shape [batch_size, 40000]
loss = tf.nn.sigmoid_cross_entropy_with_logits(pred, flattened_y)

Using sparse softmax is going to be of help only after the last layer you want to resize the image to the original size (200*200).In this case using reshape as you have would ensure that the the code would be error free.
But in your case you don't have to use sparse softmax. To see why check the dimensions of "pred".

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.