I use tf.data to fetch images, labels, and edges for training on the GPU, but I find that the Dataset API does not load all of the data.
My code:
def get_dataset(filenames, shuffle_buffer, repeat_times, batch_size):
dataset = tf.data.TFRecordDataset([filenames])
dataset = dataset.map(tfrecord_preprocess)
if repeat_times is None:
dataset = dataset.repeat()
else:
dataset = dataset.repeat(repeat_times)
dataset = dataset.shuffle(shuffle_buffer).batch(batch_size)
return dataset
def tfrecord_preprocess(example):
feature = {"image": tf.FixedLenFeature((), tf.string, default_value=""),
"label": tf.FixedLenFeature((), tf.string, default_value=""),
"edge": tf.FixedLenFeature((), tf.string, default_value="")}
parsed_feature = tf.parse_single_example(example, feature)
image = tf.decode_raw(parsed_feature["image"], out_type=tf.uint8)
label = tf.decode_raw(parsed_feature["label"], out_type=tf.uint8)
edge = tf.decode_raw(parsed_feature["edge"], out_type=tf.uint8)
image = tf.cast(tf.reshape(image, shape=[1, 128, 128]), tf.float32)
label = tf.cast(tf.reshape(label, shape=[1, 128, 128]), tf.float32)
edge = tf.cast(tf.reshape(edge, shape=[128, 128]), tf.float32)
return image, label, edge
I wrote a simple script to test the API:
dataset = get_dataset(filenames, shuffle_buffer, repeat_times, batchsize)
#shuffle=1000, repeat_times=2, batchsize=13
iterator = dataset.make_one_shot_iterator()
images, labels, edges = iterator.get_next()
count = 0
with tf.Session() as sess:
for _ in xrange(40):
try:
edges_value = sess.run(edges)
count = count+len(edges_value)
print count
except tf.errors.OutOfRangeError:
break
There are 260 examples, so after repeat(2) and batching by 13 there should be 260 * 2 / 13 = 40 batches, and indeed this test works.
However, when I use similar code for training, the total number of examples consumed is less than 260 — only 140, according to the variable count. Does anyone know how to solve this problem? Please help.
I use tensorflow-gpu 1.4.
My training code is:
shuffle_buffer = params["shuffle_buffer"] #1000
repeat_times = params["repeat_times"] #1
batch_size = params["batch_size"] #26
num_classes = params["num_classes"] #2
dataset = model.get_dataset(filenames, shuffle_buffer, repeat_times, batch_size)
iterator = dataset.make_one_shot_iterator()
with tf.device('/gpu:1'):
global_step = tf.train.get_or_create_global_step()
learning_rate = tf.train.exponential_decay(params["learning_rate"],
global_step, 100, 0.99)
optimizer = tf.train.AdamOptimizer(learning_rate)
images, labels, edges = iterator.get_next()
_, probs = model.interence(features=images, training=True)
loss, reg = model.get_loss(probs, labels, edges, num_classes)
_, acc_mean, _ = model.get_acc(probs, labels)
train_op = optimizer.minimize(loss, global_step=global_step)
variables_average = tf.train.ExponentialMovingAverage(0.99, global_step)
var_list = tf.trainable_variables(scope='.*(kernel|bias)')
variables_average_op = variables_average.apply(var_list)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_all_op = tf.group(train_op, variables_average_op)
tf.summary.scalar("loss", loss)
tf.summary.scalar("reg", reg)
tf.summary.scalar("acc_mean", acc_mean)
merged = tf.summary.merge_all()
saver = tf.train.Saver(max_to_keep=5)
config = tf.ConfigProto(log_device_placement=True,
allow_soft_placement=True)
config.gpu_options.allow_growth = True
count = 0
with tf.Session(config=config) as sess:
tf.global_variables_initializer().run()
writer = tf.summary.FileWriter('./train', sess.graph)
for _ in xrange(10):
try:
edges_value = sess.run(edges)
count = count+len(edges_value)
_, step, summary = sess.run([train_all_op, global_step, merged])
writer.add_summary(summary, step)
if step % 5 == 0:
loss_value = sess.run(loss)
print loss_value
acc_mean_value = sess.run(acc_mean)
print acc_mean_value
saver.save(sess, params["save_dir"], step)
except tf.errors.OutOfRangeError:
print "end of data"
break
print count
print "the final step is %d" % step
loss_value = sess.run(loss)
print loss_value
acc_mean_value = sess.run(acc_mean)
print acc_mean_value
saver.save(sess, params["save_dir"], step)
writer.close()
Finally I got this in the terminal:
end of data
130
the final step is 5
(To test the code I set repeat_times to 1.)
But when I run the test code:
def test():
dataset = get_dataset("train_output.tfrecords", 1000, 1, 26)
iterator = dataset.make_one_shot_iterator()
images, labels, edges = iterator.get_next()
count = 0
with tf.Session() as sess:
for i in xrange(10):
try:
images_value, labels_value, edges_value = sess.run([images, labels, edges])
count = count+len(edges_value)
except tf.errors.OutOfRangeError:
print "end of data"
print count
print i
test()
The terminal shows:
260
9
The problem is that every sess.run call that fetches something computed from iterator.get_next() pulls a fresh batch from the dataset. In your training loop, the separate sess.run(edges) consumes one batch, sess.run([train_all_op, global_step, merged]) consumes another, and the sess.run(loss) and sess.run(acc_mean) calls each consume one more — but only the first of these is added to your counter. With repeat_times = 1 and batch_size = 26 there are only 260 / 26 = 10 batches, so the data is exhausted after 5 training steps with count = 5 * 26 = 130, which is exactly what you see.
To count the examples actually used for training, keep a counter inside the with tf.device('/gpu:1') block and update it as part of the training step. You can even graph it in TensorBoard with a tf.summary.scalar, similar to what you do with loss.
Declare an edges_count variable and an update op:
edges_count = tf.Variable(0, name='edges_count', trainable=False, dtype=tf.int32)
images, labels, edges = iterator.get_next()
edges_count_update_op = tf.assign_add(edges_count, tf.shape(edges)[0])
Then add edges_count_update_op to your train_op group.
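Putting it together, a minimal sketch of the counting part (train_op, variables_average_op, merged, loss, acc_mean, global_step and config are the ones from your training code; keep train_all_op inside your tf.control_dependencies(update_ops) block as before):
images, labels, edges = iterator.get_next()

edges_count = tf.Variable(0, name='edges_count', trainable=False, dtype=tf.int32)
# tf.shape(edges)[0] is the batch size of the current batch; Python's len()
# is not defined for a symbolic tensor.
edges_count_update_op = tf.assign_add(edges_count, tf.shape(edges)[0])
tf.summary.scalar("edges_count", edges_count)

# group the counter update with the existing training ops so it runs every step
train_all_op = tf.group(train_op, variables_average_op, edges_count_update_op)

with tf.Session(config=config) as sess:
    tf.global_variables_initializer().run()
    while True:
        try:
            # fetch everything in one sess.run, so exactly one batch is
            # consumed per training step
            _, step, summary, loss_value, acc_value = sess.run(
                [train_all_op, global_step, merged, loss, acc_mean])
        except tf.errors.OutOfRangeError:
            print "end of data"
            break
    # edges_count is an ordinary variable, so reading it does not pull a batch
    print sess.run(edges_count)
With repeat_times = 1 this should now report 260 instead of 130.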
The scalar values are not being recorded, as shown in the two attached TensorBoard screenshots.
I don't know whether the problem is in my training code or in my TensorBoard code.
It's my first time using TensorBoard, so I don't know if I wrote the logging code correctly.
If the scalar values are still not recorded even though the TensorBoard code is written correctly, I would like to look at the training part of the code instead.
So I would appreciate it if you could check just the code that records the scalar values and displays them in TensorBoard.
Part of the main function of <run.py>:
def main():
args = parser.parse_args()
assert args.n_views == 2, "Only two view training is supported. Please use --n-views 2."
# check if gpu training is available
if not args.disable_cuda and torch.cuda.is_available():
args.device = torch.device('cuda')
cudnn.deterministic = True
cudnn.benchmark = True
else:
args.device = torch.device('cpu')
args.gpu_index = -1
dataset = ContrastiveLearningDataset(args.data)
train_dataset = dataset.__getitem__(args.dataset_name, args.n_views, 1)
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=args.batch_size, shuffle=True,
num_workers=args.workers, pin_memory=True, drop_last=True)
#f(*)
model = ResNetSimCLR(base_model=args.arch, out_dim=args.out_dim)
optimizer = torch.optim.Adam(model.parameters(), args.lr, weight_decay=args.weight_decay)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_loader), eta_min=0,
last_epoch=-1)
with torch.cuda.device(args.gpu_index):
simclr = SimCLR(model=model, optimizer=optimizer, scheduler=scheduler, args=args)
simclr.train(train_loader)
simclr.writer.flush()
<simclr.py>, which defines the train function called in main():
class SimCLR(object):
def __init__(self, *args, **kwargs):
self.args = kwargs['args']
self.model = kwargs['model'].to(self.args.device)
self.optimizer = kwargs['optimizer']
self.scheduler = kwargs['scheduler']
self.writer = SummaryWriter()
logging.basicConfig(filename=os.path.join(self.writer.log_dir, 'training.log'), level=logging.DEBUG)
self.criterion = torch.nn.CrossEntropyLoss().to(self.args.device)
def info_nce_loss(self, features):
labels = torch.cat([torch.arange(self.args.batch_size) for i in range(self.args.n_views)], dim=0)
labels = (labels.unsqueeze(0) == labels.unsqueeze(1)).float()
labels = labels.to(self.args.device)
features = F.normalize(features, dim=1)
similarity_matrix = torch.matmul(features, features.T)
mask = torch.eye(labels.shape[0], dtype=torch.bool).to(self.args.device)
labels = labels[~mask].view(labels.shape[0], -1)
similarity_matrix = similarity_matrix[~mask].view(similarity_matrix.shape[0], -1)
positives = similarity_matrix[labels.bool()].view(labels.shape[0], -1)
negatives = similarity_matrix[~labels.bool()].view(similarity_matrix.shape[0], -1)
logits = torch.cat([positives, negatives], dim=1)
labels = torch.zeros(logits.shape[0], dtype=torch.long).to(self.args.device)
logits = logits / self.args.temperature
return logits, labels
def train(self, train_loader):
scaler = GradScaler(enabled=self.args.fp16_precision)
# save config file
save_config_file(self.writer.log_dir, self.args)
n_iter = 0 #global optimization step
logging.info(f"Start SimCLR training for {self.args.epochs} epochs.")
logging.info(f"Training with gpu: {self.args.disable_cuda}.")
for epoch_counter in range(self.args.epochs):
for images, _ in tqdm(train_loader):
images = torch.cat(images, dim=0)
images = images.to(self.args.device)
with autocast(enabled=self.args.fp16_precision):
features = self.model(images)
logits, labels = self.info_nce_loss(features)
loss = self.criterion(logits, labels)
self.optimizer.zero_grad()
scaler.scale(loss).backward()
scaler.step(self.optimizer)
scaler.update()
if n_iter % self.args.log_every_n_steps == 0:
top1, top5 = accuracy(logits, labels, topk=(1, 5))
self.writer.add_scalar('loss', loss, global_step=n_iter)
self.writer.add_scalar('acc/top1', top1[0], global_step=n_iter)
self.writer.add_scalar('acc/top5', top5[0], global_step=n_iter)
self.writer.add_scalar('learning_rate', self.scheduler.get_lr()[0], global_step=n_iter)
n_iter += 1
# warmup for the first 10 epochs
if epoch_counter >= 1:
try:
self.scheduler.step()
except ZeroDivisionError:
print("ZeroDivision")
logging.debug(f"Epoch: {epoch_counter}\tLoss: {loss}\tTop1 accuracy: {top1[0]}")
logging.info("Training has finished.")
# save model checkpoints
checkpoint_name = 'checkpoint_{:04d}.pth.tar'.format(self.args.epochs)
save_checkpoint({
'epoch': self.args.epochs,
'arch': self.args.arch,
'state_dict': self.model.state_dict(),
'optimizer': self.optimizer.state_dict(),
}, is_best=False, filename=os.path.join(self.writer.log_dir, checkpoint_name))
logging.info(f"Model checkpoint and metadata has been saved at {self.writer.log_dir}.")
As a PyTorch newbie (coming from TensorFlow), I am unsure how to implement early stopping. My research has led me to discover that PyTorch does not have a native way to do this. I have also discovered torchsample, but I am unable to install it in my conda environment for whatever reason. Is there a simple way to apply early stopping without it? Here is my current setup:
class RegressionDataset(Dataset):
def __init__(self, X_data, y_data):
self.X_data = X_data
self.y_data = y_data
def __getitem__(self, index):
return self.X_data[index], self.y_data[index]
def __len__(self):
return len(self.X_data)
train_dataset = RegressionDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
val_dataset = RegressionDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).float())
test_dataset = RegressionDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())
# Model Params
EPOCHS = 100
BATCH_SIZE = 1000
LEARNING_RATE = 0.001
NUM_FEATURES = np.shape(X_test)[1]
# Initialize Dataloader
train_loader = DataLoader(dataset = train_dataset, batch_size=BATCH_SIZE, shuffle = True)
val_loader = DataLoader(dataset = val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(dataset = test_dataset, batch_size=BATCH_SIZE)
# Define Neural Network Architecture
class MultipleRegression(nn.Module):
def __init__(self, num_features):
super(MultipleRegression, self).__init__()
# Define architecture
self.layer_1 = nn.Linear(num_features, 16)
self.layer_2 = nn.Linear(16, 32)
self.layer_3 = nn.Linear(32, 25)
self.layer_4 = nn.Linear(25, 20)
self.layer_5 = nn.Linear(20, 16)
self.layer_out = nn.Linear(16, 1)
self.relu = nn.ReLU() # ReLU applied to all layers
# Initialize weights and biases
nn.init.xavier_uniform_(self.layer_1.weight)
nn.init.zeros_(self.layer_1.bias)
nn.init.xavier_uniform_(self.layer_2.weight)
nn.init.zeros_(self.layer_2.bias)
nn.init.xavier_uniform_(self.layer_3.weight)
nn.init.zeros_(self.layer_3.bias)
nn.init.xavier_uniform_(self.layer_4.weight)
nn.init.zeros_(self.layer_4.bias)
nn.init.xavier_uniform_(self.layer_5.weight)
nn.init.zeros_(self.layer_5.bias)
nn.init.xavier_uniform_(self.layer_out.weight)
nn.init.zeros_(self.layer_out.bias)
def forward(self, inputs):
x = self.relu(self.layer_1(inputs))
x = self.relu(self.layer_2(x))
x = self.relu(self.layer_3(x))
x = self.relu(self.layer_4(x))
x = self.relu(self.layer_5(x))
x = self.layer_out(x)
return(x)
def predict(self, test_inputs):
x = self.relu(self.layer_1(test_inputs))
x = self.relu(self.layer_2(x))
x = self.relu(self.layer_3(x))
x = self.relu(self.layer_4(x))
x = self.relu(self.layer_5(x))
x = self.layer_out(x)
return(x)
# Check for GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model = MultipleRegression(NUM_FEATURES)
model.to(device)
print(model)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)
# define dictionary to store loss/epochs for training and validation
loss_stats = {
"train": [],
"val": []
}
# begin training
print("Begin Training")
for e in tqdm(range(1, EPOCHS+1)):
# Training
train_epoch_loss = 0
model.train()
for X_train_batch, y_train_batch in train_loader:
X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
optimizer.zero_grad()
y_train_pred = model(X_train_batch)
train_loss = criterion(y_train_pred, y_train_batch.unsqueeze(1))
train_loss.backward()
optimizer.step()
train_epoch_loss += train_loss.item()
# validation
with torch.no_grad():
val_epoch_loss = 0
model.eval()
for X_val_batch, y_val_batch in val_loader:
X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
y_val_pred = model(X_val_batch)
val_loss = criterion(y_val_pred, y_val_batch.unsqueeze(1))
val_epoch_loss += val_loss.item()
loss_stats["train"].append(train_epoch_loss/len(train_loader))
loss_stats["val"].append(val_epoch_loss/len(val_loader))
print(f"Epoch {e}: \ Train loss: {train_epoch_loss/len(train_loader):.5f} \ Val loss: {val_epoch_loss/len(val_loader):.5f}")
# Visualize loss and accuracy
train_val_loss_df = pd.DataFrame.from_dict(loss_stats).reset_index().melt(id_vars=["index"]).rename(columns = {"index":"epochs"})
plt.figure()
sns.lineplot(data = train_val_loss_df, x = "epochs", y = "value", hue = "variable").set_title("Train-Val Loss/Epoch")
# Test model
y_pred_list = []
with torch.no_grad():
model.eval()
for X_batch, _ in test_loader:
X_batch = X_batch.to(device)
y_test_pred = model(X_batch)
y_pred_list.append(y_test_pred.cpu().numpy())
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]
y_pred_list = [item for sublist in y_pred_list for item in sublist]
y_pred_list = np.array(y_pred_list)
mse = mean_squared_error(y_test, y_pred_list)
r_square = r2_score(y_test, y_pred_list)
print("Mean Squared Error :", mse)
print("R^2 :", r_square)
A basic way to do this is to keep track of the best validation loss obtained so far.
You can have a variable best_loss = float('inf') initialized before your loop over epochs (or you could track the best loss per epoch, etc.).
After each validation pass, do:
if val_loss < best_loss:
    best_loss = val_loss
    # At this point also save a snapshot of the current model
    torch.save(model, 'my_model_best_loss.pth')
Then, if best_loss does not improve significantly after some number of epochs (a "patience" threshold), or if val_loss keeps getting worse, break out of the loop and stop training there.
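A minimal sketch of what this could look like, wired into your existing loop (the EarlyStopping class, patience, and min_delta are illustrative names, not from any library):
class EarlyStopping:
    """Stop training when the validation loss has not improved for `patience` epochs."""
    def __init__(self, patience=10, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float("inf")
        self.counter = 0

    def step(self, val_loss):
        # Returns True when training should stop.
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            return False
        self.counter += 1
        return self.counter >= self.patience
Used inside the epoch loop from the question (val_epoch_loss and val_loader as defined above):
early_stopping = EarlyStopping(patience=10)
for e in tqdm(range(1, EPOCHS + 1)):
    # ... training and validation passes as before ...
    if early_stopping.step(val_epoch_loss / len(val_loader)):
        print(f"Early stopping at epoch {e} (best val loss {early_stopping.best_loss:.5f})")
        break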
For implementing algorithms like early stopping (and your training loop in general) you may find it easier to give PyTorch Lightning a try (no affiliation, but it's much easier than trying to roll everything by hand).
I've been working on implementing this model for months and finally got it to work. Now I want to calculate metrics for it, such as F-score, recall, and precision. The examples I looked at do this by splitting the data, which I have not been able to implement. Can I calculate these metrics in the same way model loss and total loss are calculated here?
from nets import model
from utils.data_provider import data_provider
FLAGS = tf.app.flags.FLAGS
gpus = list(range(len(FLAGS.gpu_list.split(','))))
logger.setLevel(cfg.debug)
def tower_loss(images, seg_maps_gt, training_masks, reuse_variables=None):
# Build inference graph
with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables):
seg_maps_pred = model.model(images, is_training=True)
model_loss = model.loss(seg_maps_gt, seg_maps_pred, training_masks)
total_loss = tf.add_n([model_loss] + tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
# add summary
if reuse_variables is None:
tf.summary.image('input', images)
tf.summary.image('seg_map_0_gt', seg_maps_gt[:, :, :, 0:1] * 255)
tf.summary.image('seg_map_0_pred', seg_maps_pred[:, :, :, 0:1] * 255)
tf.summary.image('training_masks', training_masks)
tf.summary.scalar('model_loss', model_loss)
tf.summary.scalar('total_loss', total_loss)
return total_loss, model_loss
def average_gradients(tower_grads):
average_grads = []
for grad_and_vars in zip(*tower_grads):
grads = []
for g, _ in grad_and_vars:
expanded_g = tf.expand_dims(g, 0)
grads.append(expanded_g)
grad = tf.concat(grads, 0)
grad = tf.reduce_mean(grad, 0)
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
return average_grads
def main(argv=None):
import os
os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_list
if not tf.gfile.Exists(FLAGS.checkpoint_path):
tf.gfile.MkDir(FLAGS.checkpoint_path)
else:
if not FLAGS.restore:
tf.gfile.DeleteRecursively(FLAGS.checkpoint_path)
tf.gfile.MkDir(FLAGS.checkpoint_path)
input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images')
input_seg_maps = tf.placeholder(tf.float32, shape=[None, None, None, 6], name='input_score_maps')
input_training_masks = tf.placeholder(tf.float32, shape=[None, None, None, 1], name='input_training_masks')
global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
learning_rate = tf.train.exponential_decay(FLAGS.learning_rate, global_step, decay_steps=10000, decay_rate=0.94, staircase=True)
# add summary
tf.summary.scalar('learning_rate', learning_rate)
opt = tf.train.AdamOptimizer(learning_rate)
# split
input_images_split = tf.split(input_images, len(gpus))
input_seg_maps_split = tf.split(input_seg_maps, len(gpus))
input_training_masks_split = tf.split(input_training_masks, len(gpus))
tower_grads = []
reuse_variables = None
for i, gpu_id in enumerate(gpus):
with tf.device('/gpu:%d' % gpu_id):
with tf.name_scope('model_%d' % gpu_id) as scope:
iis = input_images_split[i]
isegs = input_seg_maps_split[i]
itms = input_training_masks_split[i]
total_loss, model_loss = tower_loss(iis, isegs, itms, reuse_variables)
batch_norm_updates_op = tf.group(*tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope))
reuse_variables = True
grads = opt.compute_gradients(total_loss)
tower_grads.append(grads)
grads = average_gradients(tower_grads)
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
summary_op = tf.summary.merge_all()
# save moving average
variable_averages = tf.train.ExponentialMovingAverage(
FLAGS.moving_average_decay, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables())
# batch norm updates
with tf.control_dependencies([variables_averages_op, apply_gradient_op, batch_norm_updates_op]):
train_op = tf.no_op(name='train_op')
saver = tf.train.Saver(tf.global_variables())
summary_writer = tf.summary.FileWriter(FLAGS.checkpoint_path, tf.get_default_graph())
init = tf.global_variables_initializer()
if FLAGS.pretrained_model_path is not None:
variable_restore_op = slim.assign_from_checkpoint_fn(FLAGS.pretrained_model_path, slim.get_trainable_variables(),
ignore_missing_vars=True)
gpu_options=tf.GPUOptions(allow_growth=True)
#gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.75)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)) as sess:
if FLAGS.restore:
logger.info('continue training from previous checkpoint')
ckpt = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
logger.debug(ckpt)
saver.restore(sess, ckpt)
else:
sess.run(init)
if FLAGS.pretrained_model_path is not None:
variable_restore_op(sess)
data_generator = data_provider.get_batch(num_workers=FLAGS.num_readers,
input_size=FLAGS.input_size,
batch_size=FLAGS.batch_size_per_gpu * len(gpus))
start = time.time()
for step in range(FLAGS.max_steps):
data = next(data_generator)
ml, tl, _ = sess.run([model_loss, total_loss, train_op], feed_dict={input_images: data[0],
input_seg_maps: data[2],
input_training_masks: data[3]})
if np.isnan(tl):
logger.error('Loss diverged, stop training')
break
if step % 10 == 0:
avg_time_per_step = (time.time() - start)/10
avg_examples_per_second = (10 * FLAGS.batch_size_per_gpu * len(gpus))/(time.time() - start)
start = time.time()
logger.info('Step {:06d}, model loss {:.4f}, total loss {:.4f}, {:.2f} seconds/step, {:.2f} examples/second'.format(
step, ml, tl, avg_time_per_step, avg_examples_per_second))
if step % FLAGS.save_checkpoint_steps == 0:
saver.save(sess, os.path.join(FLAGS.checkpoint_path, 'model.ckpt'), global_step=global_step)
if step % FLAGS.save_summary_steps == 0:
_, tl, summary_str = sess.run([train_op, total_loss, summary_op], feed_dict={input_images: data[0],
input_seg_maps: data[2],
input_training_masks: data[3]})
summary_writer.add_summary(summary_str, global_step=step)
Sorry for posting such a large chunk of code, but I really have no idea how to calculate the metrics from all of this.
Scikit-learn has a great API for calculating precision, recall and F1 score for each class:
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
Y_pred = sess.run(nn_output, feed_dict={input_images: input_data[0], input_seg_maps: input_data[2], input_training_masks: input_data[3]})  # feed according to your model
ClassificationReport = sklearn.metrics.classification_report(Y_true, Y_pred, output_dict=True)
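Since this model produces per-pixel segmentation maps rather than one class per example, a minimal sketch of how you might turn them into something classification_report accepts (pixel_metrics and the 0.5 threshold are assumptions, and it assumes seg_maps_pred / seg_maps_gt hold per-pixel scores in [0, 1]):
import numpy as np
from sklearn.metrics import classification_report, precision_recall_fscore_support

def pixel_metrics(pred_maps, gt_maps, threshold=0.5):
    # pred_maps: model output for a batch, e.g. sess.run(seg_maps_pred, feed_dict=...)
    # gt_maps:   the corresponding ground-truth maps fed as input_seg_maps
    y_pred = (np.asarray(pred_maps) >= threshold).astype(np.int32).ravel()
    y_true = (np.asarray(gt_maps) >= threshold).astype(np.int32).ravel()
    print(classification_report(y_true, y_pred, digits=4))
    # precision, recall, F1 and support for the positive class
    return precision_recall_fscore_support(y_true, y_pred, average='binary')
This gives pixel-level precision, recall, and F1; if you need detection metrics at the box level instead, that requires a separate geometry-based evaluation.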
I'm new to TensorFlow and machine learning (and to Python as well).
As a first step toward an image recognition program, I hit a wall of confusion about preparing the data for feeding. Can someone please help me with this?
I looked into this tutorial, but the data preparation part is not explained:
MNIST softmax for beginners
I don't expect to get a whole finished program out of this question; instead, I would love to hear how TensorFlow works with feed_dict. My current mental picture is: "It works like a for loop: go through imageHolder, take the 2352 bytes of one image and put them into the training op, which makes a prediction based on the current model, compares it with the data from labelHolder at the same index, and then corrects the model." So I expected to be able to feed in one set of 2352 bytes (another image of the same size) and get a prediction. I will also put the code here, in case my idea is correct and the error comes from a bad implementation.
To be specific: I have a dataset of 5 classes with 3670 images in total.
When loading the data into the feed_dict for training, I converted every image to 28x28 pixels with 3 channels. This gives a tensor of shape (3670, 2352) for the image holder in the feed_dict. After that, I prepared a tensor of shape (3670,) for the label holder in the feed_dict.
The training code looks like this:
for step in xrange(FLAGS.max_steps):
feed_dict = {
imageHolder: imageTrain,
labelHolder: labelTrain,
}
_, loss_rate = sess.run([train_op, loss_op], feed_dict=feed_dict)
Then I have my code to predict a new image with the model above:
testing_dataset = do_get_file_list(FLAGS.guess_dir)
x = tf.placeholder(tf.float32, shape=(IMAGE_PIXELS))
for data in testing_dataset:
image = Image.open(data)
image = image.resize((IMAGE_SIZE, IMAGE_SIZE))
image = np.array(image).reshape(IMAGE_PIXELS)
prediction = session.run(tf.argmax(logits, 1), feed_dict={x: image})
But the problem is that the predict line always raises an error of "Cannot feed value of shape..." no matter what shape my testing data is — (2352,), (1, 2352) — (it asks for a (3670, 2352) shape, but no way).
These are some flags I have used:
IMAGE_SIZE = 28
CHANNELS = 3
IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZE * CHANNELS
The training op and loss computation:
def do_get_op_compute_loss(logits, labels):
labels = tf.to_int64(labels)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits, name='xentropy')
loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
return loss
def do_get_op_training(loss_op, training_rate):
optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate)
global_step = tf.Variable(0, name='global_step', trainable=False)
train_op = optimizer.minimize(loss_op, global_step=global_step)
return train_op
Variables
imageHolder = tf.placeholder(tf.float32, [data_count, IMAGE_PIXELS])
labelHolder = tf.placeholder(tf.int32, [data_count])
The complete program:
import os
import math
import tensorflow as tf
from PIL import Image
import numpy as np
from six.moves import xrange
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('max_steps', 200, 'Number of steps to run trainer.')
flags.DEFINE_integer('hidden1', 128, 'Number of units in hidden layer 1.')
flags.DEFINE_integer('hidden2', 32, 'Number of units in hidden layer 2.')
flags.DEFINE_integer('batch_size', 4, 'Batch size. '
'Must divide evenly into the dataset sizes.')
flags.DEFINE_string('train_dir', 'data', 'Directory to put the training data.')
flags.DEFINE_string('save_file', '.\\data\\model.ckpt', 'Directory to put the training data.')
flags.DEFINE_string('guess_dir', 'work', 'Directory to put the testing data.')
#flags.DEFINE_boolean('fake_data', False, 'If true, uses fake data '
# 'for unit testing.')
IMAGE_SIZE = 28
CHANNELS = 3
IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZE * CHANNELS
def do_inference(images, hidden1_units, hidden2_units, class_count):
#HIDDEN LAYER 1
with tf.name_scope('hidden1'):
weights = tf.Variable(
tf.truncated_normal([IMAGE_PIXELS, hidden1_units], stddev=1.0 / math.sqrt(float(IMAGE_PIXELS))),
name='weights')
biases = tf.Variable(tf.zeros([hidden1_units]), name='biases')
hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases)
#HIDDEN LAYER 2
with tf.name_scope('hidden2'):
weights = tf.Variable(
tf.truncated_normal([hidden1_units, hidden2_units], stddev=1.0 / math.sqrt(float(hidden1_units))),
name='weights')
biases = tf.Variable(tf.zeros([hidden2_units]), name='biases')
hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
#LINEAR
with tf.name_scope('softmax_linear'):
weights = tf.Variable(
tf.truncated_normal([hidden2_units, class_count], stddev=1.0 / math.sqrt(float(hidden2_units))),
name='weights')
biases = tf.Variable(tf.zeros([class_count]), name='biases')
logits = tf.matmul(hidden2, weights) + biases
return logits
def do_get_op_compute_loss(logits, labels):
labels = tf.to_int64(labels)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits, name='xentropy')
loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
return loss
def do_get_op_training(loss_op, training_rate):
optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate)
global_step = tf.Variable(0, name='global_step', trainable=False)
train_op = optimizer.minimize(loss_op, global_step=global_step)
return train_op
def do_get_op_evaluate(logits, labels):
correct = tf.nn.in_top_k(logits, labels, 1)
return tf.reduce_sum(tf.cast(correct, tf.int32))
def do_evaluate(session, eval_correct_op, imageset_holder, labelset_holder, train_images, train_labels):
true_count = 0
num_examples = FLAGS.batch_size * FLAGS.batch_size
for step in xrange(FLAGS.batch_size):
feed_dict = {imageset_holder: train_images, labelset_holder: train_labels,}
true_count += session.run(eval_correct_op, feed_dict=feed_dict)
precision = true_count / num_examples
# print(' Num examples: %d Num correct: %d Precision # 1: %0.04f' %
# (num_examples, true_count, precision))
def do_init_param(data_count, class_count):
# Generate placeholder
imageHolder = tf.placeholder(tf.float32, shape=(data_count, IMAGE_PIXELS))
labelHolder = tf.placeholder(tf.int32, shape=(data_count))
# Build a graph for prediction from inference model
logits = do_inference(imageHolder, FLAGS.hidden1, FLAGS.hidden2, class_count)
# Add loss calculating op
loss_op = do_get_op_compute_loss(logits, labelHolder)
# Add training op
train_op = do_get_op_training(loss_op, FLAGS.learning_rate)
# Add evaluate correction op
evaluate_op = do_get_op_evaluate(logits, labelHolder)
# Create session for op operating
sess = tf.Session()
# Init param
init = tf.initialize_all_variables()
sess.run(init)
return sess, train_op, loss_op, evaluate_op, imageHolder, labelHolder, logits
def do_get_class_list():
return [{'name': name, 'path': os.path.join(FLAGS.train_dir, name)} for name in os.listdir(FLAGS.train_dir)
if os.path.isdir(os.path.join(FLAGS.train_dir, name))]
def do_get_file_list(folderName):
return [os.path.join(folderName, name) for name in os.listdir(folderName)
if (os.path.isdir(os.path.join(folderName, name)) == False)]
def do_init_data_list():
file_list = []
for classItem in do_get_class_list():
for dataItem in do_get_file_list(classItem['path']):
file_list.append({'name': classItem['name'], 'path': dataItem})
# Renew data feeding dictionary
imageTrainList, labelTrainList = do_seperate_data(file_list)
imageTrain = []
for imagePath in imageTrainList:
image = Image.open(imagePath)
image = image.resize((IMAGE_SIZE, IMAGE_SIZE))
imageTrain.append(np.array(image))
imageCount = len(imageTrain)
imageTrain = np.array(imageTrain)
imageTrain = imageTrain.reshape(imageCount, IMAGE_PIXELS)
id_list, id_map = do_generate_id_label(labelTrainList)
labelTrain = np.array(id_list)
return imageTrain, labelTrain, id_map
def do_init():
imageTrain, labelTrain, id_map = do_init_data_list()
sess, train_op, loss_op, evaluate_op, imageHolder, labelHolder, logits = do_init_param(len(imageTrain), len(id_map))
return sess, train_op, loss_op, evaluate_op, imageHolder, labelHolder, imageTrain, labelTrain, id_map, logits
def do_seperate_data(data):
images = [item['path'] for item in data]
labels = [item['name'] for item in data]
return images, labels
def do_generate_id_label(label_list):
trimmed_label_list = list(set(label_list))
id_map = {trimmed_label_list.index(label): label for label in trimmed_label_list}
reversed_id_map = {label: trimmed_label_list.index(label) for label in trimmed_label_list}
id_list = [reversed_id_map.get(item) for item in label_list]
return id_list, id_map
def do_training(sess, train_op, loss_op, evaluate_op, imageHolder, labelHolder, imageTrain, labelTrain):
# Training state checkpoint saver
saver = tf.train.Saver()
# feed_dict = {
# imageHolder: imageTrain,
# labelHolder: labelTrain,
# }
for step in xrange(FLAGS.max_steps):
feed_dict = {
imageHolder: imageTrain,
labelHolder: labelTrain,
}
_, loss_rate = sess.run([train_op, loss_op], feed_dict=feed_dict)
if step % 100 == 0:
print('Step {0}: loss = {1}'.format(step, loss_rate))
if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
saver.save(sess, FLAGS.save_file, global_step=step)
print('Evaluate training data')
do_evaluate(sess, evaluate_op, imageHolder, labelHolder, imageTrain, labelTrain)
def do_predict(session, logits):
# xentropy
testing_dataset = do_get_file_list(FLAGS.guess_dir)
x = tf.placeholder(tf.float32, shape=(IMAGE_PIXELS))
print('Perform predict')
print('==================================================================================')
# TEMPORARY CODE
for data in testing_dataset:
image = Image.open(data)
image = image.resize((IMAGE_SIZE, IMAGE_SIZE))
image = np.array(image).reshape(IMAGE_PIXELS)
print(image.shape)
prediction = session.run(logits, {x: image})
print('{0}: {1}'.format(data, prediction))
def main(_):
# TF notice default graph
with tf.Graph().as_default():
sess, train_op, loss_op, evaluate_op, imageHolder, labelHolder, imageTrain, labelTrain, id_map, logits = do_init()
print("done init")
do_training(sess, train_op, loss_op, evaluate_op, imageHolder, labelHolder, imageTrain, labelTrain)
print("done training")
do_predict(sess, logits)
# NO IDEA
if __name__ == '__main__':
tf.app.run()
It's important to understand the error. You say:
But the problem is that the predict line always raises an error of "Cannot
feed value of shape..." no matter what shape my testing data is —
(2352,), (1, 2352) — (it asks for a (3670, 2352) shape, but no way).
Oh yes way, my friend, yes way. It says there is a problem with your shape, and you need to inspect that. It asks for 3670 — why?
Because your model accepts inputs with shape (data_count, IMAGE_PIXELS), which you declare below:
def do_init_param(data_count, class_count):
# Generate placeholder
imageHolder = tf.placeholder(tf.float32, shape=(data_count, IMAGE_PIXELS))
labelHolder = tf.placeholder(tf.int32, shape=(data_count))
This function is called here:
sess, train_op, loss_op, evaluate_op, imageHolder, labelHolder, logits = do_init_param(len(imageTrain), len(id_map))
len(imageTrain) is the length of your dataset, probably 3670 images.
Then you have your prediction function:
def do_predict(session, logits):
# xentropy
testing_dataset = do_get_file_list(FLAGS.guess_dir)
x = tf.placeholder(tf.float32, shape=(IMAGE_PIXELS))
...
prediction = session.run(logits, {x: image})
Note that x here is useless: session.run(logits, ...) does not depend on x at all, it still depends on the original imageHolder placeholder. You are feeding the image you want to predict to a model that does not expect that shape; it expects the original placeholder shape of (3670, 2352), because that is what you declared.
The solution is to declare the placeholder your model is actually built on with a non-specific first dimension, such as:
imageHolder = tf.placeholder(tf.float32, shape=(None, IMAGE_PIXELS))
and feed that placeholder at prediction time. You can then predict a single image or multiple images (a mini-batch), but the input must always have shape [number_images, IMAGE_PIXELS].
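For example, a minimal sketch of predicting a single image once imageHolder is declared with shape (None, IMAGE_PIXELS) and is fed directly (instead of the unused x):
image = Image.open(data)
image = image.resize((IMAGE_SIZE, IMAGE_SIZE))
# reshape to a batch of one: (1, IMAGE_PIXELS)
image = np.array(image).reshape(1, IMAGE_PIXELS)
prediction = session.run(tf.argmax(logits, 1), feed_dict={imageHolder: image})
print(prediction)  # one predicted class id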
Makes sense?
I am very new to TensorFlow. I am doing image classification using my own training database.
However, after training on my own dataset, I have no idea how to classify an input image.
Here is my code for preparing my own dataset:
filenames = ['01.jpg', '02.jpg', '03.jpg', '04.jpg']
label = [0,1,1,1]
filename_queue = tf.train.string_input_producer(filenames)
reader = tf.WholeFileReader()
filename, content = reader.read(filename_queue)
image = tf.image.decode_jpeg(content, channels=3)
image = tf.cast(image, tf.float32)
resized_image = tf.image.resize_images(image, 224, 224)
image_batch , label_batch= tf.train.batch([resized_image,label], batch_size=8, num_threads = 3, capacity=5000)
Is this the correct code for preparing the training data?
Afterwards, I try to classify the input images with the following code:
test = ['test.jpg', 'test2.jpg']
test_queue=tf.train.string_input_producer(test)
reader = tf.WholeFileReader()
testname, test_content = reader.read(test_queue)
test = tf.image.decode_jpeg(test_content, channels=3)
test = tf.cast(test, tf.float32)
resized_image = tf.image.resize_images(test, 224,224)
with tf.Session() as sess:
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
res = sess.run(resized_image)
coord.request_stop()
coord.join(threads)
However, it does not return the predicted labels for the input images.
I am looking for someone to show me how to classify images using my own dataset.
Thank you.
Maybe you could try this after you have installed the PIL Python library:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import math
import numpy
import numpy as np
import random
from PIL import Image
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
# Basic model parameters as external flags.
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('max_steps', 2000, 'Number of steps to run trainer.')
flags.DEFINE_integer('hidden1', 128, 'Number of units in hidden layer 1.')
flags.DEFINE_integer('hidden2', 32, 'Number of units in hidden layer 2.')
flags.DEFINE_integer('batch_size', 4, 'Batch size. '
'Must divide evenly into the dataset sizes.')
flags.DEFINE_string('train_dir', 'data', 'Directory to put the training data.')
flags.DEFINE_boolean('fake_data', False, 'If true, uses fake data '
'for unit testing.')
NUM_CLASSES = 2
IMAGE_SIZE = 28
CHANNELS = 3
IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZE * CHANNELS
def inference(images, hidden1_units, hidden2_units):
# Hidden 1
with tf.name_scope('hidden1'):
weights = tf.Variable(
tf.truncated_normal([IMAGE_PIXELS, hidden1_units],
stddev=1.0 / math.sqrt(float(IMAGE_PIXELS))),
name='weights')
biases = tf.Variable(tf.zeros([hidden1_units]),
name='biases')
hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases)
# Hidden 2
with tf.name_scope('hidden2'):
weights = tf.Variable(
tf.truncated_normal([hidden1_units, hidden2_units],
stddev=1.0 / math.sqrt(float(hidden1_units))),
name='weights')
biases = tf.Variable(tf.zeros([hidden2_units]),
name='biases')
hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
# Linear
with tf.name_scope('softmax_linear'):
weights = tf.Variable(
tf.truncated_normal([hidden2_units, NUM_CLASSES],
stddev=1.0 / math.sqrt(float(hidden2_units))),
name='weights')
biases = tf.Variable(tf.zeros([NUM_CLASSES]),
name='biases')
logits = tf.matmul(hidden2, weights) + biases
return logits
def cal_loss(logits, labels):
labels = tf.to_int64(labels)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=labels, logits=logits, name='xentropy')
loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
return loss
def training(loss, learning_rate):
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
global_step = tf.Variable(0, name='global_step', trainable=False)
train_op = optimizer.minimize(loss, global_step=global_step)
return train_op
def evaluation(logits, labels):
correct = tf.nn.in_top_k(logits, labels, 1)
return tf.reduce_sum(tf.cast(correct, tf.int32))
def placeholder_inputs(batch_size):
images_placeholder = tf.placeholder(tf.float32, shape=(batch_size,IMAGE_PIXELS))
labels_placeholder = tf.placeholder(tf.int32, shape=(batch_size))
return images_placeholder, labels_placeholder
def fill_feed_dict(images_feed,labels_feed, images_pl, labels_pl):
feed_dict = {
images_pl: images_feed,
labels_pl: labels_feed,
}
return feed_dict
def do_eval(sess,
eval_correct,
images_placeholder,
labels_placeholder,
data_set):
# And run one epoch of eval.
true_count = 0 # Counts the number of correct predictions.
steps_per_epoch = 4 // FLAGS.batch_size
num_examples = steps_per_epoch * FLAGS.batch_size
for step in xrange(steps_per_epoch):
feed_dict = fill_feed_dict(train_images,train_labels,
images_placeholder,
labels_placeholder)
true_count += sess.run(eval_correct, feed_dict=feed_dict)
precision = true_count / num_examples
print(' Num examples: %d Num correct: %d Precision # 1: %0.04f' %
(num_examples, true_count, precision))
# Get the sets of images and labels for training, validation, and
train_images = []
for filename in ['01.jpg', '02.jpg', '03.jpg', '04.jpg']:
image = Image.open(filename)
image = image.resize((IMAGE_SIZE,IMAGE_SIZE))
train_images.append(np.array(image))
train_images = np.array(train_images)
train_images = train_images.reshape(4,IMAGE_PIXELS)
label = [0,1,1,1]
train_labels = np.array(label)
def run_training():
# Tell TensorFlow that the model will be built into the default Graph.
with tf.Graph().as_default():
# Generate placeholders for the images and labels.
images_placeholder, labels_placeholder = placeholder_inputs(4)
# Build a Graph that computes predictions from the inference model.
logits = inference(images_placeholder,
FLAGS.hidden1,
FLAGS.hidden2)
# Add to the Graph the Ops for loss calculation.
loss = cal_loss(logits, labels_placeholder)
# Add to the Graph the Ops that calculate and apply gradients.
train_op = training(loss, FLAGS.learning_rate)
# Add the Op to compare the logits to the labels during evaluation.
eval_correct = evaluation(logits, labels_placeholder)
# Create a saver for writing training checkpoints.
saver = tf.train.Saver()
# Create a session for running Ops on the Graph.
sess = tf.Session()
# Run the Op to initialize the variables.
init = tf.initialize_all_variables()
sess.run(init)
# And then after everything is built, start the training loop.
for step in xrange(FLAGS.max_steps):
start_time = time.time()
feed_dict = fill_feed_dict(train_images,train_labels,
images_placeholder,
labels_placeholder)
_, loss_value = sess.run([train_op, loss],
feed_dict=feed_dict)
duration = time.time() - start_time
if step % 100 == 0:
# Print status to stdout.
print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
saver.save(sess, FLAGS.train_dir, global_step=step)
print('Training Data Eval:')
do_eval(sess,
eval_correct,
images_placeholder,
labels_placeholder,
train_images)
def main(_):
run_training()
if __name__ == '__main__':
tf.app.run()
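The code above only trains and evaluates on the four training images. To actually classify new images with it, one option (a minimal sketch, not part of the original answer — classify and image_paths are made-up names, and it assumes images_placeholder is declared with shape (None, IMAGE_PIXELS) instead of a fixed batch size of 4, with sess, logits and images_placeholder kept around after training) is to evaluate the logits and take the argmax:
def classify(sess, logits, images_placeholder, image_paths):
    # preprocess exactly like the training data: resize and flatten
    batch = []
    for path in image_paths:
        img = Image.open(path).resize((IMAGE_SIZE, IMAGE_SIZE))
        batch.append(np.array(img).reshape(IMAGE_PIXELS))
    batch = np.array(batch, dtype=np.float32)
    # logits has one score per class; argmax picks the predicted class id
    return sess.run(tf.argmax(logits, 1), feed_dict={images_placeholder: batch})
This returns an array with one predicted class id per input image.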