yolov5 outputs' sizes are wrong - python

I am trying to use yolov5n6 pretrained model on local but it seems there are some errors.
I expect the detection size to be torch.Size(3,4) because I sent same image to the model three time in batch. I printed the result and copy-paste next to the each print function.
What is the possible reason to get unexpected outputs?
import torch
import cv2
X = cv2.imread("/path/to/image/normalized_breast.PNG")
X = cv2.resize(X,(1024,1024))
print(X.shape) # (1024, 1024, 3)
model = torch.hub.load('ultralytics/yolov5', 'yolov5n6', pretrained=True, classes=1)
checkpoint_ = torch.load('/path/to/pretrained/model/yolov5_breast.pt')['model']
model.load_state_dict(checkpoint_.state_dict())
img = torch.from_numpy(X/255).permute(2,0,1).float().unsqueeze(0)
batch = torch.concatenate([img,img,img]).to("cuda")
print(batch.size()) # torch.Size([3, 3, 1024, 1024])
with torch.no_grad():
model.eval()
detections = model(batch)
print(len(detections), # 2
detections[0].size(), # torch.Size([3, 65280, 6])
detections[1][0].size()) # torch.Size([3, 3, 128, 128, 6])
I also tried following method but it didn't solve the problem too.
import torch
import cv2
from models.experimental import attempt_load
from models.yolo import Model
from utils.general import intersect_dicts,LOGGER
X = cv2.imread("image/path/yoloV5/normalized_breast.PNG")
X = cv2.resize(X,(1024,1024))
print(X.shape)
weights = "/pretrained/model/path/yolov5_breast.pt"
device = "cuda"
cfg = "/yaml/file/path/yolov5/models/hub/yolov5n6.yaml"
nc =1
resume = False
ckpt = torch.load(weights, map_location='cpu') # load checkpoint to CPU to avoid CUDA memory leak
model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=3).to(device) # create
exclude = ['anchor'] if cfg and not resume else [] # exclude keys
csd = ckpt['model'].float().state_dict() # checkpoint state_dict as FP32
csd = intersect_dicts(csd, model.state_dict(), exclude=exclude) # intersect
model.load_state_dict(csd, strict=False) # load
img = torch.from_numpy(X/255).permute(2,0,1).float().unsqueeze(0)
batch = torch.concatenate([img,img,img]).to("cuda")
print(batch.size())
with torch.no_grad():
model.eval()
detections = model(batch)
print(len(detections),
detections[0].size(),
detections[1][0].size())

Related

a mismatch between the current graph and the graph

I am trying to train encoder decoder model with multispectral images having 9 channels but the code that i am running is downloading pretrained resnet101 weights which is trained on 3 channel images.
Input Given by me:
net_input = tf.placeholder(tf.float32,shape=[None,None,None,9])
net_output = tf.placeholder(tf.float32,shape=[None,None,None,num_classes])
code for getting pretrained weights for Resnet101:
if args.model == "ResNet101" or args.model == "ALL":
subprocess.check_output(['wget','http://download.tensorflow.org/models/resnet_v2_101_2017_04_14.tar.gz', "-P", "models"])
try:
subprocess.check_output(['tar', '-xvf', 'models/resnet_v2_101_2017_04_14.tar.gz', "-C", "models"])
subprocess.check_output(['rm', 'models/resnet_v2_101_2017_04_14.tar.gz'])
except Exception as e:
print(e)
pass
error that i am getting is:
error:
Invalid argument: Assign requires shapes of both tensors to match. lhs shape= [7,7,9,64] rhs
shape= [7,7,3,64]
what can be the solution here?
If you do not want to change the channels of input from 9 to 3, you need to change ResNet architecture input and second layer from 3 to 9 channels and add the final layers for inference. Notice, you will have to train it again.
Here is a full code, as an example, you just have to change channels to 9:
import matplotlib.pyplot as plt
import numpy as np
import os
import tensorflow as tf
_URL = 'https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip'
path_to_zip = tf.keras.utils.get_file('cats_and_dogs.zip', origin=_URL, extract=True)
PATH = os.path.join(os.path.dirname(path_to_zip), 'cats_and_dogs_filtered')
train_dir = os.path.join(PATH, 'train')
validation_dir = os.path.join(PATH, 'validation')
BATCH_SIZE = 32
IMG_SIZE = (160, 160)
train_dataset = tf.keras.utils.image_dataset_from_directory(train_dir,
shuffle=True,
batch_size=BATCH_SIZE,
image_size=IMG_SIZE)
validation_dataset = tf.keras.utils.image_dataset_from_directory(validation_dir,
shuffle=True,
batch_size=BATCH_SIZE,
image_size=IMG_SIZE)
class_names = train_dataset.class_names
val_batches = tf.data.experimental.cardinality(validation_dataset)
test_dataset = validation_dataset.take(val_batches // 5)
validation_dataset = validation_dataset.skip(val_batches // 5)
AUTOTUNE = tf.data.AUTOTUNE
train_dataset = train_dataset.prefetch(buffer_size=AUTOTUNE)
validation_dataset = validation_dataset.prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.prefetch(buffer_size=AUTOTUNE)
preprocess_input = tf.keras.applications.ResNet50.preprocess_input
# Create the base model from the pre-trained model ResNet50
############### HERE YOU CHANGE TO 9 CHANNELS ###############
channels=3
IMG_SHAPE = IMG_SIZE + (channels,)
base_model = tf.keras.applications.ResNet50(input_shape=IMG_SHAPE,
include_top=True,
weights='imagenet')
image_batch, label_batch = next(iter(train_dataset))
feature_batch = base_model(image_batch)
print(feature_batch.shape)
base_model.summary()
## HERE IS WHERE THE MAGIC HAPPENS
base_model_config = base_model.get_config()
#### HERE THE CHANNELS WILL BE ALTERED TO 9 ####
base_model_config['layers'][0]["config"]["batch_input_shape"]=(None, 160, 160, channels)
base_model_config['layers'][1]["config"]["padding"]=((channels,channels), (channels,channels))
#######################################################
model=tf.keras.Model().from_config(base_model_config)
model.summary()
### HERE YOU ADD THE FINAL LAYERS FOR INFERENCE
inputs = tf.keras.Input(shape=(160, 160, channels))
x = base_model(inputs, training=False)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
x = tf.keras.layers.Dropout(0.2)(x)
outputs = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(inputs, outputs)
base_learning_rate = 0.0001
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=base_learning_rate),
loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
metrics=['accuracy'])
model.fit(x=train_dataset, epochs=2)
If there is any shape mismatch in the middle of the way, you know how to change it: base_model_config['layers'][x]....=........
There are a few resources about this idea, such as a blog post about transferring a ResNet on RGB data to multi-channel images here, and a relevant Colab Notebook. Below is a working example based on those resources:
import numpy as np
import tensorflow as tf
def tile_kernels(kernel, out_channels, batch_dim=-2):
mean_1d = np.mean(kernel, axis=batch_dim).reshape(kernel[:, :, -1:, :].shape)
tiled = np.tile(mean_1d, (out_channels, 1))
return tiled
def reshape_model_input(model_orig, custom_model, input_channels):
conf = custom_model.get_config()
layer_to_modify = conf["layers"][2]["config"]["name"]
layer_names = [conf['layers'][x]['name'] for x in range(len(conf['layers']))]
for layer in model_orig.layers:
if layer.name in layer_names:
if layer.get_weights() != []:
target_layer = custom_model.get_layer(layer.name)
if layer.name == layer_to_modify:
kernels, biases = layer.get_weights()
kernels_extra_channels = np.concatenate((kernels,
tile_kernels(kernels, input_channels - 3)),
axis=-2)
target_layer.set_weights([kernels_extra_channels, biases])
else:
target_layer.set_weights(layer.get_weights())
if __name__ == "__main__":
from tensorflow.keras.applications import ResNet50V2
resnet50 = ResNet50V2(weights='imagenet', include_top=False) # load resnet50 here - can be done differently
config = resnet50.get_config()
img_height = ...
img_width = ...
input_channels = 7
config["layers"][0]["config"]["batch_input_shape"] = (None, img_height, img_width, input_channels) # change the batch input shape to handle the different channel dimensions
custom_resnet = tf.keras.models.Model.from_config(config)
reshape_model_input(resnet50, custom_resnet, input_channels) # modify the custom model by reference
custom_resnet(np.zeros((1, img_width, img_height, input_channels))) # just verifying that predicting with the new shape works in the custom model
This process just iterates over each layer in the original model and sets the corresponding weights in the custom model. To produce the additional n 3 x 3 channels (in your case, n = 4, as you want 7 total channels) for the input, the mean is taken across the 3 RGB dimensions then replicated (as can be seen in the tile_kernels function). Another aggregation function could be used, such as the max, min, median, etc. If you don't want any of the weights from the original model (as in, not pretraining but just require the architecture), just modifying the original model's configuration and creating a new model from it will create a randomly initialized model:
resnet50 = ...
config = resnet50.get_config()
img_height = ...
img_width = ...
input_channels = ...
config["layers"][0]["config"]["batch_input_shape"] = (None, img_height, img_width, input_channels)
custom_resnet = tf.keras.models.Model.from_config(config)

TensorFlow + Keras multi gpu model with inference

I am trying to do image classification using Keras's Xception model modeled after this code. However I want to use multiple GPU's to do batch parallel image classification using this function. I believe it is possible and I have the original code working without multi GPU support however I can not get multi_gpu_model function to work as I would expect. I am following this example for the multi GPU example. This is my code (it is the backend of a Flask app), it instantiates the model, makes a prediction on an example ndarray when the class is created, and then expects a base 64 encoded image in the classify function:
import os
from keras.preprocessing import image as preprocess_image
from keras.applications import Xception
from keras.applications.inception_v3 import preprocess_input, decode_predictions
from keras.utils import multi_gpu_model
import numpy as np
import tensorflow as tf
import PIL.Image
from numpy import array
class ModelManager:
def __init__(self, model_path):
self.model_name = 'ImageNet'
self.model_version = '1.0'
self.batch_size = 32
height = 224
width = 224
num_classes = 1000
# self.model = tf.keras.models.load_model(os.path.join(model_path, 'ImageNetXception.h5'))
with tf.device('/cpu:0'):
model = Xception(weights=None,
input_shape=(height, width, 3),
classes=num_classes, include_top=True)
# Replicates the model on 8 GPUs.
# This assumes that your machine has 8 available GPUs.
self.parallel_model = multi_gpu_model(model, gpus=8)
self.parallel_model.compile(loss='categorical_crossentropy',
optimizer='rmsprop')
print("Loaded Xception model.")
x = np.empty((1, 224, 224, 3))
self.parallel_model.predict(x, batch_size=self.batch_size)
self.graph = tf.get_default_graph()
self.graph.finalize()
def classify(self, ids, images):
results = []
all_images = np.empty((0, 224, 224, 3))
# all_images = []
for image_id, image in zip(ids, images):
# This does the same as keras.preprocessing.image.load_img
image = image.convert('RGB')
image = image.resize((224, 224), PIL.Image.NEAREST)
x = preprocess_image.img_to_array(image)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)
all_images = np.append(all_images, x, axis=0)
# all_images.append(x)
# a = array(all_images)
# print(type(a))
# print(a[0])
with self.graph.as_default():
preds = self.parallel_model.predict(all_images, batch_size=288)
#print(type(preds))
top3 = decode_predictions(preds, top=3)[0]
print(top3)
output = [((t[1],) + t[2:]) for t in top3]
predictions = [
{'label': label, 'probability': probability * 100.0}
for label, probability in output
]
results.append({
'id': 1,
'predictions': predictions
})
print(len(results))
return results
The parts I am not sure about is what to pass the predict function. Currently I am creating an ndarray of the images I want classified after they are preprocessed and then passing that to the predict function. The function returns but the preds variable doesn't hold what I expect. I tried to loop through the preds object but decode_predictions errors when I pass a single item but responds with one prediction when I pass the whole preds ndarray. In the example code they don't use the decode_predictions function so I'm not sure how to use it with the response from parallel_model.predict. Any help or resources is appreciated, thanks.
the following site illustrates how to do that correctly link

Tensorflow adam optimizer in a .pb model

I have developed a TensorFlow code that use Adam optimizer, then saved the graph and export the .pb model and correctly loaded it, my problem is when i feed it with new input image i don't get the same result compared to the result given by this code:
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img, array_to_img
import cv2
import tensorflow as tf
import numpy
import numpy as np
def get_image2(imgSrc):
img = load_img(imgSrc, True) # this is a PIL image
x = img_to_array(img) # this is a Numpy array with shape (3, 150, 150
#x = x.reshape((1,) + x.shape)
x = x.astype(float)
x *= 1./255.
#x = cv2.resize(x,(512,512))
return x
def sobel2(image):
# Shape = height x width.
#image = tf.placeholder(tf.float32, shape=[None, None])
# Shape = 1 x height x width x 1.
image_resized = image#tf.expand_dims(image, 0)
Gx = tf.nn.conv2d(image_resized, sobel_x_filter, strides=[1, 1, 1, 1], padding='SAME')
Gy = tf.nn.conv2d(image_resized, sobel_y_filter,strides=[1, 1, 1, 1], padding='SAME')
#grad = tf.sqrt(tf.add(tf.pow(Gx,2),tf.pow(Gy,2)))
#grad = tf.pow(Gx,2) + tf.pow(Gy,2)
#grad = tf.truediv(grad,3.)
#grad = tf.reshape(grad, img_shape)
return Gx, Gy
image = get_image2('1.jpg')
img_shape = image.shape
print img_shape
img_h, img_w,_= img_shape
sobel_x = tf.constant([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], tf.float32)
sobel_x_filter = tf.reshape(sobel_x, [3, 3, 1, 1])
sobel_y_filter = tf.transpose(sobel_x_filter, [1, 0, 2, 3])
input_img = tf.placeholder(tf.float32, shape=[1,img_shape[0],img_shape[1],img_shape[2]], name="input_img")
#input_img = tf.placeholder(tf.float32, [1, 512, 512, 1], name="input_img")
gain = tf.Variable(tf.constant(1, dtype=tf.float32, shape=[1,img_shape[0],img_shape[1],img_shape[2]]), name="gain")
offset = tf.Variable(tf.constant(0, dtype=tf.float32, shape=[1,img_shape[0],img_shape[1],img_shape[2]]), name="offset")
enhanced_img = tf.add(tf.multiply(input_img, gain), offset, name = "enahnced")
#----------------------------------------------------------
# COST
#----------------------------------------------------------
input_img_deriv_x, input_img_deriv_y = sobel2(input_img)
enhanced_img_deriv_x, enhanced_img_deriv_y = sobel2(enhanced_img)
white_img = tf.constant(1, dtype=tf.float32, shape=[1,img_shape[0],img_shape[1],img_shape[2]])
image_pixels_count = img_h * img_w
white_cost = tf.reduce_sum(tf.pow(enhanced_img - white_img, 2))
sobel_cost = tf.reduce_sum(tf.pow(enhanced_img_deriv_x - input_img_deriv_x, 2) +
tf.pow(enhanced_img_deriv_y - input_img_deriv_y,2))
cost = tf.add(white_cost, tf.multiply(0.2, sobel_cost), name = "cost") # + tf.reduce_sum(gain - 1) + tf.reduce_sum(offset)
#----------------------------------------------------------
# TRAIN
#----------------------------------------------------------
# Parameters
learning_rate = 0.0001
training_epochs = 100
display_step = 5
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
saver = tf.train.Saver()
image = image.reshape([-1,img_shape[0],img_shape[1],img_shape[2]])
#print image.shape
#print image.shape
feed = {input_img: image }
# Start training
with tf.Session() as sess:
#Run the initializer
print(sess.run(init))
# Fit all training data
for epoch in range(training_epochs):
sess.run([optimizer, cost], feed_dict = feed)
print(tf.reduce_sum(offset.eval()).eval())
if (epoch+1) % display_step == 0:
gen_img = sess.run(enhanced_img, feed_dict = feed)
gen_img = np.squeeze(gen_img, axis=0)
print(gen_img.shape)
gen_img *= 255
cv2.imwrite("result/output_2_{0}.png".format(epoch), gen_img)
I noticed that when i save the graph the optimizer state is also saved, so when i load the model and feed it with new image he will produce a false result since he will use the saved value related to the image i have used when i saved it.
How i can make the model run the optimizer for new images without using the saved parameters from previous input.
Do you understand how the optimizer actually works ?
The goal of the optimizer is to update the model weights with respect to the gradient. Concerning Adam it has two inner variables which are updated during training, it's part of the adam algorithm. So this behavior is perfectly normal. If you want to "reset" adam variables, it's perfectly doable, however I highly doubt that it's what you want to do... Very rare situations require you to do this. Btw. if you reset adam state, you will break the whole logic of the optimizer.
If you try to evaluate a new image at inference time, the optimizer should not be run, and thus your model output should not be impacted by Adam or any other optimizer.
If you try to continue the training from a preivously saved checkpoint, I would recommend that you keep the Adam state if the dataset is the same (not a transfer learning approach), and thus you should not reset adam's variables.
Btw. if you really want to reset adam, this is how you will do it:
optimizer_reset_op = tf.variables_initializer(optimizer.variables())
sess.run(optimizer_reset_op)

Bug with my TensorFlow app, I can't seem to find it

I'm very new to TensorFlow. I've absorbed what I can from the tutorials but I am stuck making my own thing. I learn best by doing, not reading, so I'm hoping to having something that works that I can tinker with to understand TensorFlow better.
I'm getting this error: "The shape of labels (received (32, 32)) should equal the shape of logits except for the last dimension (received (1, 32, 32))." I've fiddled with the code to no avail.
The program is supposed to read an image and spit out a "mask" image, which will later be used to cut out the background. a white pixel means "keep this" and a black pixel means "don't keep this".
pm.main is a file that contains a couple functions for loading the images, and one for creating the model. I tried to extract this stuff from the train.py ( for training ) and eval.py ( for running the thing ) to keep my code cleaner.
"pm.main":
import tensorflow as tf
import matplotlib.image as mpimg
from PIL import Image as img
import numpy as np
import os
def rgb2gray(rgb):
#return rgb
gray = np.dot(rgb[...,:3], [0.3, 0.3, 0.3]).astype('int32')
return gray
def load_images(dataDirectory):
images = []
for i in range(1, 2):
filePrefix = f'{i:03}'
fileName = '%s.png' % filePrefix
maskFileName = '%sm.png' % filePrefix
image = mpimg.imread(os.path.join(dataDirectory, fileName))
maskImg = img.open(os.path.join(dataDirectory, maskFileName))
maskImg.thumbnail((32, 32), img.ANTIALIAS)
mask = np.array(maskImg)
fixedMask = rgb2gray(mask)
images.append([image, fixedMask])
return images
class x():
pass
def cnn_model_fn(input):
input_layer = tf.reshape(input, [-1, 256, 256, 3])
def addNewLayer(prevLayer):
conv = tf.layers.conv2d(inputs=prevLayer, filters=32, kernel_size=[3,3], padding='same', activation=tf.nn.relu)
pool = tf.layers.max_pooling2d(inputs=conv, pool_size=[2, 2], strides=2)
return pool
layer1 = addNewLayer(input_layer)
layer2 = addNewLayer(layer1)
layer3 = addNewLayer(layer2)
# layer3 = 64 x 64 x 32
#return layer3
#flat = tf.reshape(layer3, [-1, 32 * 32 * 32])
#dense = tf.layers.dense(inputs=flat, units=64 * 64, activation=tf.nn.relu)
#dense = tf.layers.dense(inputs=flat, units=64, activation=tf.nn.relu)
dense = tf.layers.dense(inputs=layer3, units=1, activation=tf.nn.relu)
result = x()
result.logits = dense
#result.logits = flat
#flat = tf.reshape(layer3, [-1, 32, 32, 1])
#result.logits = flat
#print('layer3: %s' % layer3.shape)
print('logits: %s' % result.logits.shape)
return result
and "train.py":
import tensorflow as tf
from pm import main as pm
from PIL import Image as img
import matplotlib.pyplot as pyplot
import numpy as np
import os
currentDirectory = os.path.dirname(os.path.realpath(__file__))
dataDirectory = os.path.join(currentDirectory, 'data')
images = pm.load_images(dataDirectory)
imagePlaceholder = tf.placeholder(tf.int32, shape=[256, 256, 3])
#maskPlaceholder = tf.placeholder(tf.int32, shape=[1, 256, 256, 1])
maskPlaceholder = tf.placeholder(tf.int32, shape=[32, 32, 1])
pair = images[0]
image = pair[0] # inputs
mask = pair[1] # labels
model = pm.cnn_model_fn(image)
logits = model.logits
print(mask.shape)
print(logits.shape)
saver = tf.train.Saver()
sess = tf.Session()
sess.run(tf.local_variables_initializer())
sess.run(tf.global_variables_initializer())
#saver.restore(sess, 'network.ckpt')
loss_op = tf.losses.sparse_softmax_cross_entropy(labels=mask, logits=logits)
result = sess.run(loss_op, feed_dict={ imagePlaceholder: image, maskPlaceholder: logits })
print(result)
resultData = result.reshape(64, 64).astype('uint8') * 255
print(resultData)
imageData = img.fromarray(resultData)
imageData.save('output.png')
saver.save(sess, 'network.ckpt')
Any insight would be greatly appreciated.
You did not provide an error stack trace, but here is my best guess.
Most probably the error is coming from the cross entropy loss: tf.losses.sparse_softmax_cross_entropy. If you look at the spec you will see that this function expects labels to be 1 rank less than logits and all dimensions expect last must be the same (so that it knows which label corresponds to which set of logits).
In your case, labels seem to have shape of (1, 32, 32). Assuming you just want to get rid of 1, you can do labels = tf.squeeze(labels)

TensorFlow model gets zero loss

import tensorflow as tf
import numpy as np
import os
import re
import PIL
def read_image_label_list(img_directory, folder_name):
# Input:
# -Name of folder (test\\\\train)
# Output:
# -List of names of files in folder
# -Label associated with each file
cat_label = 1
dog_label = 0
filenames = []
labels = []
dir_list = os.listdir(os.path.join(img_directory, folder_name)) # List of all image names in 'folder_name' folder
# Loop through all images in directory
for i, d in enumerate(dir_list):
if re.search("train", folder_name):
if re.search("cat", d): # If image filename contains 'Cat', then true
labels.append(cat_label)
else:
labels.append(dog_label)
filenames.append(os.path.join(img_dir, folder_name, d))
return filenames, labels
# Define convolutional layer
def conv_layer(input, channels_in, channels_out):
w_1 = tf.get_variable("weight_conv", [5,5, channels_in, channels_out], initializer=tf.contrib.layers.xavier_initializer())
b_1 = tf.get_variable("bias_conv", [channels_out], initializer=tf.zeros_initializer())
conv = tf.nn.conv2d(input, w_1, strides=[1,1,1,1], padding="SAME")
activation = tf.nn.relu(conv + b_1)
return activation
# Define fully connected layer
def fc_layer(input, channels_in, channels_out):
w_2 = tf.get_variable("weight_fc", [channels_in, channels_out], initializer=tf.contrib.layers.xavier_initializer())
b_2 = tf.get_variable("bias_fc", [channels_out], initializer=tf.zeros_initializer())
activation = tf.nn.relu(tf.matmul(input, w_2) + b_2)
return activation
# Define parse function to make input data to decode image into
def _parse_function(img_path, label):
img_file = tf.read_file(img_path)
img_decoded = tf.image.decode_image(img_file, channels=3)
img_decoded.set_shape([None,None,3])
img_decoded = tf.image.resize_images(img_decoded, (28, 28), method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
img_decoded = tf.image.per_image_standardization(img_decoded)
img_decoded = tf.cast(img_decoded, dty=tf.float32)
label = tf.one_hot(label, 1)
return img_decoded, label
tf.reset_default_graph()
# Define parameterspe
EPOCHS = 10
BATCH_SIZE_training = 64
learning_rate = 0.001
img_dir = 'C:/Users/tharu/PycharmProjects/cat_vs_dog/data'
batch_size = 128
# Define data
features, labels = read_image_label_list(img_dir, "train")
# Define dataset
dataset = tf.data.Dataset.from_tensor_slices((features, labels)) # Takes slices in 0th dimension
dataset = dataset.map(_parse_function)
dataset = dataset.batch(batch_size)
iterator = dataset.make_initializable_iterator()
# Get next batch of data from iterator
x, y = iterator.get_next()
# Create the network (use different variable scopes for reuse of variables)
with tf.variable_scope("conv1"):
conv_1 = conv_layer(x, 3, 32)
pool_1 = tf.nn.max_pool(conv_1, ksize=[1,2,2,1], strides=[1,2,2,1], padding="SAME")
with tf.variable_scope("conv2"):
conv_2 = conv_layer(pool_1, 32, 64)
pool_2 = tf.nn.max_pool(conv_2, ksize=[1,2,2,1], strides=[1,2,2,1], padding="SAME")
flattened = tf.contrib.layers.flatten(pool_2)
with tf.variable_scope("fc1"):
fc_1 = fc_layer(flattened, 7*7*64, 1024)
with tf.variable_scope("fc2"):
logits = fc_layer(fc_1, 1024, 1)
# Define loss function
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf.cast(y, dtype=tf.int32)))
# Define optimizer
train = tf.train.AdamOptimizer(learning_rate).minimize(loss)
with tf.Session() as sess:
# Initiliaze all the variables
sess.run(tf.global_variables_initializer())
# Train the network
for i in range(EPOCHS):
# Initialize iterator so that it starts at beginning of training set for each epoch
sess.run(iterator.initializer)
print("EPOCH", i)
while True:
try:
_, epoch_loss = sess.run([train, loss])
except tf.errors.OutOfRangeError: # Error given when out of data
if i % 2 == 0:
# [train_accuaracy] = sess.run([accuracy])
# print("Step ", i, "training accuracy = %{}".format(train_accuaracy))
print(epoch_loss)
break
I've spent a few hours trying to figure out systematically why I've been getting 0 loss when I run this model.
Features = list of file locations for each image (e.g. ['\data\train\cat.0.jpg', /data\train\cat.1.jpg])
Labels = [Batch_size, 1] one_hot vector
Initially I thought it was because there was something wrong with my data. But I've viewed the data after being resized and the images seems fine.
Then I tried a few different loss functions because I thought maybe I'm misunderstanding what the the tensorflow function softmax_cross_entropy does, but that didn't fix anything.
I've tried running just the 'logits' section to see what the output is. This is just a small sample and the numbers seem fine to me:
[[0.06388957]
[0. ]
[0.16969752]
[0.24913025]
[0.09961276]]
Surely then the softmax_cross_entropy function should be able to compute this loss given that the corresponding labels are 0 or 1? I'm not sure if I'm missing something. Any help would be greatly appreciated.
As documented:
logits and labels must have the same shape, e.g. [batch_size, num_classes] and the same dtype (either float16, float32, or float64).
Since you mentioned your label is "[Batch_size, 1] one_hot vector", I would assume both your logits and labels are [Batch_size, 1] shape. This will certainly lead to zero loss. Conceptually speaking, you have only 1 class (num_classes=1) and your cannot be wrong (loss=0).
So at least for you labels, you should transform it: tf.one_hot(indices=labels, depth=num_classes). Your prediction logits should also have a shape [batch_size, num_classes] output.
Alternatively, you can use sparse_softmax_cross_entropy_with_logits, where:
A common use case is to have logits of shape [batch_size, num_classes] and labels of shape [batch_size]. But higher dimensions are supported.

Categories