Tensorflow not predicting accurate enough results - python

I have some fundamental questions about the algorithms I picked for my Tensorflow project. I fed in around 1 million sets of training data and still couldn't get accurate enough prediction results.
The code I am using is based on an old Tensorflow example (https://github.com/tensorflow/tensorflow/blob/r1.3/tensorflow/examples/tutorials/estimators/abalone.py). The goal of this example is to predict the age of an abalone based on the training features provided.
My goal is very similar. The only difference is that I have more labels (6) than features (4). Since the predictions after training are way off, I am starting to doubt whether this project is feasible at all.
I am pretty new to machine learning and Tensorflow, so I am not sure whether I have picked the proper methods for this project. I'd like to know whether there are ways to improve my current code and make the predictions more accurate, such as adding more layers, using different optimization methods, etc.
Here is the code:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
LEARNING_RATE = 0.001
def model_fn(features, labels, mode, params):
    """Model function for Estimator."""
    # Connect the first hidden layer to the input features with relu
    first_hidden_layer = tf.layers.dense(features["x"], 10, activation=tf.nn.relu)
    # Connect the second hidden layer to first hidden layer with relu
    second_hidden_layer = tf.layers.dense(
        first_hidden_layer, 10, activation=tf.nn.relu)
    # Connect the output layer to second hidden layer (no activation fn)
    output_layer = tf.layers.dense(second_hidden_layer, 6)
    # Reshape output layer to a [batch_size, 6] tensor to return predictions
    predictions = tf.reshape(output_layer, [-1, 6])
    # Provide an estimator spec for `ModeKeys.PREDICT`.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"ages": predictions})
    # Calculate loss using mean squared error
    loss = tf.losses.mean_squared_error(labels, predictions)
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=params["learning_rate"])
    train_op = optimizer.minimize(
        loss=loss, global_step=tf.train.get_global_step())
    # Calculate root mean squared error as additional eval metric
    eval_metric_ops = {
        "rmse": tf.metrics.root_mean_squared_error(
            tf.cast(labels, tf.float64), predictions)
    }
    # Provide an estimator spec for `ModeKeys.EVAL` and `ModeKeys.TRAIN` modes.
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metric_ops)

def main(unused_argv):
    train_file = "training_data_mc1000.csv"
    test_file = "test_data_mc1000.csv"
    train_features_interim = pd.read_csv(train_file, usecols=['vgs', 'vbs', 'vds', 'current'])
    train_features_numpy = np.asarray(train_features_interim, dtype=np.float64)
    train_labels_interim = pd.read_csv(train_file, usecols=['plo_tox', 'plo_dxl', 'plo_dxw', 'parl1', 'parl2', 'random_fn'])
    train_labels_numpy = np.asarray(train_labels_interim, dtype=np.float64)
    # Set model params
    model_params = {"learning_rate": LEARNING_RATE}
    # Instantiate Estimator
    nn = tf.estimator.Estimator(model_fn=model_fn, params=model_params)
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_features_numpy},
        y=train_labels_numpy,
        num_epochs=None,
        shuffle=True)
    # Train
    nn.train(input_fn=train_input_fn, max_steps=1048576)
    test_features_interim = pd.read_csv(test_file, usecols=['vgs', 'vbs', 'vds', 'current'])
    test_features_numpy = np.asarray(test_features_interim, dtype=np.float64)
    test_labels_interim = pd.read_csv(test_file, usecols=['plo_tox', 'plo_dxl', 'plo_dxw', 'parl1', 'parl2', 'random_fn'])
    test_labels_numpy = np.asarray(test_labels_interim, dtype=np.float64)
    # Score accuracy
    test_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": test_features_numpy},
        y=test_labels_numpy,
        num_epochs=1,
        shuffle=False)
    ev = nn.evaluate(input_fn=test_input_fn)
    print("Loss: %s" % ev["loss"])
    print("Root Mean Squared Error: %s" % ev["rmse"])
    prediction_file = "Tensorflow_prediction_data.csv"
    predict_features_interim = pd.read_csv(prediction_file, usecols=['vgs', 'vbs', 'vds', 'current'])
    predict_features_numpy = np.asarray(predict_features_interim, dtype=np.float64)
    # Print out predictions
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": predict_features_numpy},
        num_epochs=1,
        shuffle=False)
    predictions = nn.predict(input_fn=predict_input_fn)
    for i, p in enumerate(predictions):
        print("Prediction %s: %s" % (i + 1, p["ages"]))

if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    parser = argparse.ArgumentParser()
    parser.register("type", "bool", lambda v: v.lower() == "true")
    parser.add_argument(
        "--train_data", type=str, default="", help="Path to the training data.")
    parser.add_argument(
        "--test_data", type=str, default="", help="Path to the test data.")
    parser.add_argument(
        "--predict_data",
        type=str,
        default="",
        help="Path to the prediction data.")
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
A portion of the training and testing data looks like this:
The last four columns are the features and the first six columns are the labels. Again, you can see that I have more labels than features. My goal is to train a model so that, when I feed in new sets of features, it can predict the labels accurately enough.
The following part is added to clarify my data sets. Thanks to the first commenters for reminding me to add this.
The relation between my features and labels is: every 30 (vgs) x 10 (vbs) x 10 (vds) block corresponds to one set of labels. Basically it is like a 3-D array, with the first three features acting as coordinates and the last feature (current) as the value stored in each cell. That's why the labels in the portion I showed are all the same.
Another question: I expect the loss to get smaller and smaller as training progresses, but it does not. I think this is another reason why the output is not accurate, since the loss-minimization part isn't working. I don't really know why, though.
Thanks for taking the time to look at this; I'd love to have a discussion below.

From what I can see in your code, you are not normalizing your features. Try normalizing them, for example, to have zero mean and unit standard deviation. Since your features are on completely different scales, this normalization might help.
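For example, a minimal sketch of per-feature standardization using the variable names from the question's code (the same training-set mean and standard deviation should be reused for the test and prediction data):
# Compute statistics on the training features only.
feature_mean = train_features_numpy.mean(axis=0)
feature_std = train_features_numpy.std(axis=0) + 1e-8  # avoid division by zero
# Standardize every split with the training-set statistics.
train_features_numpy = (train_features_numpy - feature_mean) / feature_std
test_features_numpy = (test_features_numpy - feature_mean) / feature_std
predict_features_numpy = (predict_features_numpy - feature_mean) / feature_std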
It would also be helpful to see other labels. The ones in the provided picture are all the same.

Related

Unable to predict with tf.keras sequential model

I am a beginner in machine learning and have created a sequential model using tf.keras. I am confused about how to use my_model to get a prediction for a single instance. The model uses 4 feature columns and tries to determine the label "diff". I have posted the model code for context. The following code is from the Google machine learning crash course, and my model compiles successfully. I am confused about how to use the generated model to predict a value. My prediction code is posted under the model code.
feature_columns = []
homePAG = tf.feature_column.numeric_column("homePAG")
feature_columns.append(homePAG)
awayPAG = tf.feature_column.numeric_column("awayPAG")
feature_columns.append(awayPAG)
homePPG = tf.feature_column.numeric_column("homePPG")
feature_columns.append(homePPG)
awayPPG = tf.feature_column.numeric_column("awayPPG")
feature_columns.append(awayPPG)
fp_feature_layer = layers.DenseFeatures(feature_columns)
def create_model(my_learning_rate, feature_layer):
"""Create and compile a simple linear regression model."""
# Most simple tf.keras models are sequential.
model = tf.keras.models.Sequential()
# Add the layer containing the feature columns to the model.
model.add(feature_layer)
# Add one linear layer to the model to yield a simple linear regressor.
model.add(tf.keras.layers.Dense(units=1, input_shape=(1,)))
# Construct the layers into a model that TensorFlow can execute.
model.compile(optimizer=tf.keras.optimizers.RMSprop(lr=my_learning_rate),
loss="mean_squared_error",
metrics=[tf.keras.metrics.RootMeanSquaredError()])
return model
def train_model(model, dataset, epochs, batch_size, label_name):
"""Feed a dataset into the model in order to train it."""
features = {name:np.array(value) for name, value in dataset.items()}
label = np.array(features.pop(label_name))
history = model.fit(x=features, y=label, batch_size=batch_size,
epochs=epochs, shuffle=True)
# The list of epochs is stored separately from the rest of history.
epochs = history.epoch
# Isolate the mean absolute error for each epoch.
hist = pd.DataFrame(history.history)
rmse = hist["root_mean_squared_error"]
return epochs, rmse
def plot_the_loss_curve(epochs, rmse):
"""Plot a curve of loss vs. epoch."""
plt.figure()
plt.xlabel("Epoch")
plt.ylabel("Root Mean Squared Error")
plt.plot(epochs, rmse, label="Loss")
plt.legend()
plt.ylim([rmse.min()*0.94, rmse.max()* 1.05])
plt.show()
print("Defined the create_model, train_model, and plot_the_loss_curve functions.")
# The following variables are the hyperparameters.
learning_rate = 0.01
epochs = 10
batch_size = 75
label_name = 'diff'
# Create and compile the model's topography.
my_model = create_model(learning_rate, fp_feature_layer)
# Train the model on the training set.
epochs, rmse = train_model(my_model, train_df, epochs, batch_size, label_name)
plot_the_loss_curve(epochs, rmse)
print("\n: Evaluate the new model against the test set:")
test_features = {name:np.array(value) for name, value in test_df.items()}
test_label = np.array(test_features.pop(label_name))
#test_label=tf.convert_to_tensor(test_label)
my_model.evaluate(x=test_features, y=test_label, batch_size=batch_size)
For this code I receive an error:
x={'homePAG':np.int64(7), 'awayPAG':np.int64(7), 'homePPG':np.int64(7), 'awayPPG':np.int64(7)}
my_model.predict(x)
ValueError: Failed to find data adapter that can handle input: (<class
'dict'> containing {"<class 'str'>"} keys and {"<class
'numpy.int64'>"} values), <class 'NoneType'>
For this code I receive an error:
z=np.array([[10,10,10,10]])
my_model.predict(z)
ValueError: ('We expected a dictionary here. Instead we got: ', <tf.Tensor 'IteratorGetNext:0' shape=(None, 4) dtype=int64>)
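For what it's worth, a sketch (not from the original post) of the input shape predict usually expects with a DenseFeatures-based model: a dict mapping each feature name to an array-like batch rather than bare scalars:
import numpy as np
# Hypothetical single-instance batch; each value is a length-1 array, not a scalar.
x = {'homePAG': np.array([7]),
     'awayPAG': np.array([7]),
     'homePPG': np.array([7]),
     'awayPPG': np.array([7])}
prediction = my_model.predict(x)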

How can I pass Input/Output images to Tensorboard using Keras model.fit() method to train a model?

I recently switched from Tensorflow 1.14 and the Estimator API to Tensorflow 2.0 and the Keras API. I am working on an image segmentation problem, so the inputs/outputs/labels are all images. When I used Estimator, things were pretty straightforward. In model_fn, where the arguments were (features, labels, mode, params), I could just pick the features and labels, do the necessary processing, and then pass them to tf.summary.image(), and everything worked like a charm. Now, using the Keras API, although it provides greater ease of use, it makes it hard to do simple handling of the data during training, which becomes even harder when it is used with the dataset API. Example:
Tensorflow 1.14/Estimator:
def model_fn(features, labels, mode, params):
loss, train_op, = None, None
eval_metric_ops, training_hooks, evaluation_hooks = None, None, None
output = model(input=features)
predictions = tf.argmax(output, axis=-1)
predictions_dict = {'predicted': predictions}
dice_score = tf.contrib.metrics.f1_score(labels=label, predictions=predictions[:, :, :, 1])
if mode in (estimator.ModeKeys.TRAIN, estimator.ModeKeys.EVAL):
global_step = tf.train.get_or_create_global_step()
learning_rate = tf.train.exponential_decay(params['lr'], global_step=global_step,
decay_steps=params['decay_steps'],
decay_rate=params['decay_rate'], staircase=False)
loss = loss_fn(outputs=predictions, labels=labels)
summary.image('Input_Image', features)
summary.image('Label', tf.expand_dims(tf.cast(label, dtype=tf.float32), axis=-1))
summary.image('Prediction', tf.expand_dims(tf.cast(predictions, dtype=tf.float32), axis=-1))
if mode == estimator.ModeKeys.TRAIN:
with tf.name_scope('Metrics'):
summary.scalar('Dice_Coefficient', dice_score[1])
summary.scalar('Learning_Rate', learning_rate)
summary.merge_all()
train_logs_hook = tf.estimator.LoggingTensorHook({'Dice_Coefficient': dice_score[1]}, every_n_iter=params['train_log_every_n_steps'])
training_hooks = [train_logs_hook]
train_op = Adam(learning_rate=learning_rate, epsilon=params['epsilon']).minimize(loss=loss, global_step=global_step)
if mode == estimator.ModeKeys.EVAL:
eval_metric_ops = {'Metrics/Dice_Coefficient': dice_score}
eval_summary_hook = tf.estimator.SummarySaverHook(output_dir=params['eval_metrics_path'],
summary_op=summary.merge_all(),
save_steps=params['eval_steps_per_summary_save'])
evaluation_hooks = [eval_summary_hook]
return estimator.EstimatorSpec(mode,
predictions=predictions_dict,
loss=loss,
train_op=train_op,
eval_metric_ops=eval_metric_ops,
training_hooks=training_hooks,
evaluation_hooks=evaluation_hooks)
Using Keras with Tensorflow 2.0, AFAIK I can't get this kind of access to the input/output tensors during training or evaluation (note that even though the estimator does not get the image summaries during evaluation, you can still preview the results by using a tf.estimator.SummarySaverHook). Below is my failed attempt:
def train_data(params): # Similar is the eval_data
def standardization_summaries(image, label, step, writer):
# Some processing to images
with writer.as_default():
tf.summary.image('Input_dataset', image, step=step, max_outputs=1)
tf.summary.image('label_dataset', label, step=step, max_outputs=1)
return image, label
data_set = tf.data.Dataset.from_generator(generator=lambda: data_generator(params),
output_types=(tf.float32, tf.int64),
output_shapes=(tf.TensorShape([None, None]), tf.TensorShape([None, None])))
data_set = data_set.map(lambda x, y: standardization_summaries(image=x, label=y, step=params['global_step'], writer=params['writer']))
data_set = data_set.batch(params['batch_size'])
data_set = data_set.prefetch(buffer_size=-1)
return data_set
model = tf.keras.models.load_model(saved_model)
summary_writer = tf.summary.create_file_writer(save_model_path)
step = tf.Variable(0, trainable=False, dtype=tf.int64)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=save_model_path, histogram_freq=1, write_graph=True,
write_images=False)
early_stop = tf.keras.callbacks.EarlyStopping(patience=args.early_stop)
callbacks = [tensorboard, early_stop]
params = {'batch_size': args.batch_size,
'global_step': step,
'writer': summary_writer}
model.fit(x=train_data(params), epochs=args.epochs, initial_epoch=args.initial_epoch,
validation_data=val_data(params), steps_per_epoch=2, callbacks=callbacks)
Getting the input images from the dataset API came from here, but this just logs tons of images whenever the dataset fetches data from the generator. Also, since the step variable is constant and never changes (I can't figure out how to make it advance), everything ends up under step 0, and I can't think of any viable way to connect these outputs with the predicted output, even if I found a way to print them.
So, the question is: am I still missing something about how the Keras API and Tensorboard work together for image summaries? Is there a way to save image summaries, say, every half epoch during training and once at the end of evaluation, or should I just let the model train, get the outputs through model.predict() at the end of training, and then inspect whether something went wrong (which is not efficient)?
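One pattern that can cover this (a sketch, not from the original post; it assumes a fixed sample batch of images is available and collapses predictions to a single-channel mask as in the Estimator version) is a custom Keras callback that writes tf.summary.image once per epoch and is passed to model.fit() through the callbacks list:
import tensorflow as tf

class ImageSummaryCallback(tf.keras.callbacks.Callback):
    """Writes input/prediction image summaries at the end of every epoch."""

    def __init__(self, log_dir, sample_images):
        super().__init__()
        self.writer = tf.summary.create_file_writer(log_dir)
        self.sample_images = sample_images  # fixed [N, H, W, C] float batch

    def on_epoch_end(self, epoch, logs=None):
        logits = self.model.predict(self.sample_images)
        # Collapse class scores to a single-channel mask, as in the Estimator code.
        mask = tf.cast(tf.argmax(logits, axis=-1), tf.float32)[..., tf.newaxis]
        with self.writer.as_default():
            tf.summary.image('Input_Image', self.sample_images, step=epoch, max_outputs=1)
            tf.summary.image('Prediction', mask, step=epoch, max_outputs=1)
            self.writer.flush()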

How to use the checkpoint and related files created by Tensorflow?

A newbie tensorflow question here. I am doing a project using the Google QuickDraw dataset. I used the code given by Google to train a CNN model on the data:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import ast
import functools
import sys
import tensorflow as tf
tf.logging.set_verbosity('INFO')
def get_num_classes():
with open(FLAGS.classes_path) as label_class:
classes = label_class.readlines()
classes = [x.strip() for x in classes]
return len(classes)
def get_input_fn(mode, tfrecord_pattern, batch_size):
"""Creates an input_fn that stores all the data in memory.
Args:
mode: one of tf.contrib.learn.ModeKeys.{TRAIN, INFER, EVAL}
tfrecord_pattern: path to a TF record file created using create_dataset.py.
batch_size: the batch size to output.
Returns:
A valid input_fn for the model estimator.
"""
def _parse_tfexample_fn(example_proto, mode):
"""Parse a single record which is expected to be a tensorflow.Example."""
feature_to_type = {
"drawing": tf.VarLenFeature(dtype=tf.float32),
"shape": tf.FixedLenFeature([2], dtype=tf.int64)
}
if mode != tf.estimator.ModeKeys.PREDICT:
# The labels won't be available at inference time, so don't add them
# to the list of feature_columns to be read.
feature_to_type["class_index"] = tf.FixedLenFeature([1], dtype=tf.int64)
parsed_features = tf.parse_single_example(example_proto, feature_to_type)
labels = None
if mode != tf.estimator.ModeKeys.PREDICT:
labels = parsed_features["class_index"]
parsed_features["drawing"] = tf.sparse_tensor_to_dense(parsed_features["drawing"])
return parsed_features, labels
def _input_fn():
"""Estimator `input_fn`.
Returns:
A tuple of:
- Dictionary of string feature name to `Tensor`.
- `Tensor` of target labels.
"""
dataset = tf.data.TFRecordDataset.list_files(tfrecord_pattern)
if mode == tf.estimator.ModeKeys.TRAIN:
dataset = dataset.shuffle(buffer_size=10)
dataset = dataset.repeat()
# Preprocesses 10 files concurrently and interleaves records from each file.
dataset = dataset.interleave(
tf.data.TFRecordDataset,
cycle_length=10,
block_length=1)
dataset = dataset.map(
functools.partial(_parse_tfexample_fn, mode=mode),
num_parallel_calls=10)
dataset = dataset.prefetch(10000)
if mode == tf.estimator.ModeKeys.TRAIN:
dataset = dataset.shuffle(buffer_size=1000000)
# Our inputs are variable length, so pad them.
dataset = dataset.padded_batch(
batch_size, padded_shapes=dataset.output_shapes)
features, labels = dataset.make_one_shot_iterator().get_next()
return features, labels
return _input_fn
def model_fn(features, labels, mode, params):
"""Model function for RNN classifier.
This function sets up a neural network which applies convolutional layers (as
configured with params.num_conv and params.conv_len) to the input.
The output of the convolutional layers is given to LSTM layers (as configured
with params.num_layers and params.num_nodes).
The final state of the all LSTM layers are concatenated and fed to a fully
connected layer to obtain the final classification scores.
Args:
features: dictionary with keys: inks, lengths.
labels: one hot encoded classes
mode: one of tf.estimator.ModeKeys.{TRAIN, INFER, EVAL}
params: a parameter dictionary with the following keys: num_layers,
num_nodes, batch_size, num_conv, conv_len, num_classes, learning_rate.
Returns:
ModelFnOps for Estimator API.
"""
def _get_input_tensors(features, labels):
"""Converts the input dict into inks, lengths, and labels tensors."""
# features[ink] is a sparse tensor that is [8, batch_maxlen, 3]
# inks will be a dense tensor of [8, maxlen, 3]
# shapes is [batchsize, 2]
shapes = features["shape"]
# lengths will be [batch_size]
lengths = tf.squeeze(
tf.slice(shapes, begin=[0, 0], size=[params.batch_size, 1]))
inks = tf.reshape(features["drawing"], [params.batch_size, -1, 3])
if labels is not None:
labels = tf.squeeze(labels)
return inks, lengths, labels
def _add_conv_layers(inks, lengths):
"""Adds convolution layers."""
convolved = inks
for i in range(len(params.num_conv)):
convolved_input = convolved
if params.batch_norm:
convolved_input = tf.layers.batch_normalization(
convolved_input,
training=(mode == tf.estimator.ModeKeys.TRAIN))
# Add dropout layer if enabled and not first convolution layer.
if i > 0 and params.dropout:
convolved_input = tf.layers.dropout(
convolved_input,
rate=params.dropout,
training=(mode == tf.estimator.ModeKeys.TRAIN))
convolved = tf.layers.conv1d(
convolved_input,
filters=params.num_conv[i],
kernel_size=params.conv_len[i],
activation=None,
strides=1,
padding="same",
name="conv1d_%d" % i)
return convolved, lengths
def _add_regular_rnn_layers(convolved, lengths):
"""Adds RNN layers."""
if params.cell_type == "lstm":
cell = tf.nn.rnn_cell.BasicLSTMCell
elif params.cell_type == "block_lstm":
cell = tf.contrib.rnn.LSTMBlockCell
cells_fw = [cell(params.num_nodes) for _ in range(params.num_layers)]
cells_bw = [cell(params.num_nodes) for _ in range(params.num_layers)]
if params.dropout > 0.0:
cells_fw = [tf.contrib.rnn.DropoutWrapper(cell) for cell in cells_fw]
cells_bw = [tf.contrib.rnn.DropoutWrapper(cell) for cell in cells_bw]
outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
cells_fw=cells_fw,
cells_bw=cells_bw,
inputs=convolved,
sequence_length=lengths,
dtype=tf.float32,
scope="rnn_classification")
return outputs
def _add_cudnn_rnn_layers(convolved):
"""Adds CUDNN LSTM layers."""
# Convolutions output [B, L, Ch], while CudnnLSTM is time-major.
convolved = tf.transpose(convolved, [1, 0, 2])
lstm = tf.contrib.cudnn_rnn.CudnnLSTM(
num_layers=params.num_layers,
num_units=params.num_nodes,
dropout=params.dropout if mode == tf.estimator.ModeKeys.TRAIN else 0.0,
direction="bidirectional")
outputs, _ = lstm(convolved)
# Convert back from time-major outputs to batch-major outputs.
outputs = tf.transpose(outputs, [1, 0, 2])
return outputs
def _add_rnn_layers(convolved, lengths):
"""Adds recurrent neural network layers depending on the cell type."""
if params.cell_type != "cudnn_lstm":
outputs = _add_regular_rnn_layers(convolved, lengths)
else:
outputs = _add_cudnn_rnn_layers(convolved)
# outputs is [batch_size, L, N] where L is the maximal sequence length and N
# the number of nodes in the last layer.
mask = tf.tile(
tf.expand_dims(tf.sequence_mask(lengths, tf.shape(outputs)[1]), 2),
[1, 1, tf.shape(outputs)[2]])
zero_outside = tf.where(mask, outputs, tf.zeros_like(outputs))
outputs = tf.reduce_sum(zero_outside, axis=1)
return outputs
def _add_fc_layers(final_state):
"""Adds a fully connected layer."""
return tf.layers.dense(final_state, params.num_classes)
# Build the model.
inks, lengths, labels = _get_input_tensors(features, labels)
convolved, lengths = _add_conv_layers(inks, lengths)
final_state = _add_rnn_layers(convolved, lengths)
logits = _add_fc_layers(final_state)
# Add the loss.
cross_entropy = tf.reduce_mean(
tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=labels, logits=logits))
# Add the optimizer.
train_op = tf.contrib.layers.optimize_loss(
loss=cross_entropy,
global_step=tf.train.get_global_step(),
learning_rate=params.learning_rate,
optimizer="Adam",
# some gradient clipping stabilizes training in the beginning.
clip_gradients=params.gradient_clipping_norm,
summaries=["learning_rate", "loss", "gradients", "gradient_norm"])
# Compute current predictions.
predictions = tf.argmax(logits, axis=1)
return tf.estimator.EstimatorSpec(
mode=mode,
predictions={"logits": logits, "predictions": predictions},
loss=cross_entropy,
train_op=train_op,
eval_metric_ops={"accuracy": tf.metrics.accuracy(labels, predictions)})
def create_estimator_and_specs(run_config):
"""Creates an Experiment configuration based on the estimator and input fn."""
model_params = tf.contrib.training.HParams(
num_layers=FLAGS.num_layers,
num_nodes=FLAGS.num_nodes,
batch_size=FLAGS.batch_size,
num_conv=ast.literal_eval(FLAGS.num_conv),
conv_len=ast.literal_eval(FLAGS.conv_len),
num_classes=get_num_classes(),
learning_rate=FLAGS.learning_rate,
gradient_clipping_norm=FLAGS.gradient_clipping_norm,
cell_type=FLAGS.cell_type,
batch_norm=FLAGS.batch_norm,
dropout=FLAGS.dropout)
estimator = tf.estimator.Estimator(
model_fn=model_fn,
config=run_config,
params=model_params)
train_spec = tf.estimator.TrainSpec(input_fn=get_input_fn(
mode=tf.estimator.ModeKeys.TRAIN,
tfrecord_pattern=FLAGS.training_data,
batch_size=FLAGS.batch_size), max_steps=FLAGS.steps)
eval_spec = tf.estimator.EvalSpec(input_fn=get_input_fn(
mode=tf.estimator.ModeKeys.EVAL,
tfrecord_pattern=FLAGS.eval_data,
batch_size=FLAGS.batch_size))
return estimator, train_spec, eval_spec
def main(unused_args):
estimator, train_spec, eval_spec = create_estimator_and_specs(
run_config=tf.estimator.RunConfig(
model_dir=FLAGS.model_dir,
save_checkpoints_secs=300,
save_summary_steps=100))
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--training_data",
type=str,
default="/Train",
help="Path to training data (tf.Example in TFRecord format)")
parser.add_argument(
"--eval_data",
type=str,
default="/Eval",
help="Path to evaluation data (tf.Example in TFRecord format)")
parser.add_argument(
"--classes_file",
type=str,
default="",
help="Path to a file with the classes - one class per line")
parser.add_argument(
"--num_layers",
type=int,
default=3,
help="Number of recurrent neural network layers.")
parser.add_argument(
"--num_nodes",
type=int,
default=128,
help="Number of node per recurrent network layer.")
parser.add_argument(
"--num_conv",
type=str,
default="[48, 64, 96]",
help="Number of conv layers along with number of filters per layer.")
parser.add_argument(
"--conv_len",
type=str,
default="[5, 5, 3]",
help="Length of the convolution filters.")
parser.add_argument(
"--cell_type",
type=str,
default="lstm",
help="Cell type used for rnn layers: cudnn_lstm, lstm or block_lstm.")
parser.add_argument(
"--batch_norm",
type="bool",
default="False",
help="Whether to enable batch normalization or not.")
parser.add_argument(
"--learning_rate",
type=float,
default=0.0001,
help="Learning rate used for training.")
parser.add_argument(
"--gradient_clipping_norm",
type=float,
default=9.0,
help="Gradient clipping norm used during training.")
parser.add_argument(
"--dropout",
type=float,
default=0.3,
help="Dropout used for convolutions and bidi lstm layers.")
parser.add_argument(
"--steps",
type=int,
default=100000,
help="Number of training steps.")
parser.add_argument(
"--batch_size",
type=int,
default=8,
help="Batch size to use for training/evaluation.")
parser.add_argument(
"--model_dir",
type=str,
default="A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints",
help="Path for storing the model checkpoints.")
parser.add_argument(
"--self_test",
type="bool",
default="False",
help="Whether to enable batch normalization or not.")
parser.add_argument(
"--classes_path",
type=str,
default="A:\Code\Machine Learning\Software Engineering project\Quick Draw\quickdraw-dataset-master\categories.txt",
help="Path of the text file which contains name of classes"
)
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
I have the data in TFRecord format. After running the code, it creates an eval folder with events.out.* files (named after my system), some more events.out files in the main folder, a graph.pbtxt file, model.ckpt-0.data-0000-of-0001, model.ckpt.index, model.ckpt.meta, and a file named checkpoint. What are all these files, and how do I use them to make predictions on test data?
Also, why is there just one model.ckpt.data file when the data consists of 9 training and 9 eval files?
Another question: when I ran the code, it finished execution quite quickly, while most deep learning models take a long time to run. There are 3,450,000 examples for training and 345,000 for eval, and everything was done in about a minute. I am new to TensorFlow, so please keep that in mind; I don't know much about it yet.
Update: After adding the line tf.logging.set_verbosity('INFO') to the code, I am getting the following output:
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints\model.ckpt-0
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints\model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-03-23-22:28:00
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints\model.ckpt-0
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-03-23-22:28:02
INFO:tensorflow:Saving dict for global step 0: accuracy = 0.0, global_step = 0, loss = 0.0
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 0: A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints\model.ckpt-0
INFO:tensorflow:Loss for final step: None.
Why are the accuracy and loss both 0.0? I think there is some problem here, but I don't know exactly what. I suspect the training is not happening on all the examples.
The most obvious reason is that your model doesn't receive any training data. As far as I can tell, your get_input_fn contains unnecessary functionality and probably does not behave as you need it to.
In particular, it checks if mode == tf.estimator.ModeKeys.TRAIN twice:
if mode == tf.estimator.ModeKeys.TRAIN:
dataset = dataset.shuffle(buffer_size=10)
dataset = dataset.repeat()
# Preprocesses 10 files concurrently and interleaves records from each file.
dataset = dataset.interleave(
tf.data.TFRecordDataset,
cycle_length=10,
block_length=1)
dataset = dataset.map(
functools.partial(_parse_tfexample_fn, mode=mode),
num_parallel_calls=10)
dataset = dataset.prefetch(10000)
if mode == tf.estimator.ModeKeys.TRAIN:
dataset = dataset.shuffle(buffer_size=1000000)
And it returns an iterator, which shouldn't be used with the Estimator API:
features, labels = dataset.make_one_shot_iterator().get_next()
return features, labels
To debug, try a simple input pipeline first and add functionality step by step:
dataset = tf.data.TFRecordDataset.list_files(tfrecord_pattern)
dataset = dataset.shuffle(buffer_size=10)
dataset = dataset.repeat()
dataset = dataset.interleave(tf.data.TFRecordDataset, cycle_length=10, block_length=1)
dataset = dataset.map(_parse_tfexample_fn, num_parallel_calls=10)
dataset = dataset.prefetch(1)
dataset = dataset.padded_batch(batch_size, padded_shapes=dataset.output_shapes)
return dataset
And try using estimator.train at first
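A minimal sketch of that first debugging call, reusing the question's own get_input_fn and flags (the step count is just an illustrative value):
estimator.train(
    input_fn=get_input_fn(
        mode=tf.estimator.ModeKeys.TRAIN,
        tfrecord_pattern=FLAGS.training_data,
        batch_size=FLAGS.batch_size),
    steps=100)  # a handful of steps is enough to confirm that data is flowing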
P.S. This is not a ready-to-use function; you should adapt it to your input data.

NaN with softmax cross entropy in simple model with dummy inputs

I was simplifying my model in order to see where the NaN error occurs and narrowed it down to my loss function:
import tensorflow as tf
from tensorflow.python import debug as tf_debug
def train_input_fn():
    pass

def model_fn(features, labels, mode, params):
    classes = 225
    enc = tf.ones((1,20,1024), dtype=tf.float16)
    labels = tf.ones((1,20), dtype=tf.int32)
    logits = tf.layers.dense(enc, classes)
    loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)) / 20
    train_op = tf.train.AdamOptimizer(learning_rate=0.00001, beta1=0.9, beta2=0.999).minimize(loss)
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

if __name__ == '__main__':
    model_directory = path/to/logdir
    hooks = [tf_debug.LocalCLIDebugHook(ui_type="readline")]
    classifier = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=model_directory,
        params={},
    )
    classifier.train(input_fn=lambda: train_input_fn(), hooks=hooks)
After the third or fourth 'run' with the tensorflow debugger on a fresh model directory, I get 'NaN loss during training.'. I have already tried setting the learning rate very low, but nothing changed. I'm using tensorflow-gpu 1.8.
I tried the code you provided and was getting NaN right from the first step, so I checked the official documentation:
logits: Unscaled log probabilities of shape [d_0, d_1, ..., d_{r-1}, num_classes] and dtype float32 or float64.
Changed enc = tf.ones((1,20,1024), dtype=tf.float16) to enc = tf.ones((1,20,1024), dtype=tf.float32) and it worked!
Using tf.float16 for the Adam optimizer's variables makes it necessary to use a higher epsilon value for numerical stability. When I add epsilon=1e-04 (the default is 1e-08) to the Adam optimizer, it works for me.
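Applied to the model_fn above, that change is only the optimizer line (a sketch):
train_op = tf.train.AdamOptimizer(
    learning_rate=0.00001, beta1=0.9, beta2=0.999,
    epsilon=1e-04).minimize(loss)  # larger epsilon for float16 numerical stability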

Python - features should be a dictionary of `Tensor`s with high level tf APIs

I want to train, evaluate the accuracy and eventually predict with my model. This is my first time using high level APIs such as tf.estimator.
I'm getting a value error from estimator.train(train_input_fn):
'ValueError: features should be a dictionary of `Tensor's. Given type: '
I'm not sure what is going on here. My model is taking 3 inputs and producing a binary output from one neuron.
Before this error I was getting an error about the requested shape not being equal to the actual shape, or something along those lines. I fixed it by reducing batchSize to 1 instead of 100, but I'm sure this isn't going to work well when it comes to training.
Any ideas? Here's my code:
import tensorflow as tf
import numpy as np
import sys
sys.path.insert(0, '/Users/blairburns/Documents/DeepLearning/BackgroundColourPredictor/Dataset/Testing/')
sys.path.insert(0, '/Users/blairburns/Documents/DeepLearning/BackgroundColourPredictor/Dataset/Training/')
#other files
from TestDataNormaliser import *
from TrainDataNormaliser import *
learning_rate = 0.01
trainingIteration = 15
batchSize = 1
displayStep = 2
#Layers using tf.layers
def get_logits(features):
l1 = tf.layers.dense(features, 3, activation=tf.nn.relu)
l2 = tf.layers.dense(l1, 4, activation=tf.nn.relu)
l3 = tf.layers.dense(l2, 1, activation=None)
a = l3
return a
#cost function
def get_loss(a, labels):
#cross_entropy = tf.reduce_mean(-tf.reduce_sum(y * tf.log(a)))
return tf.nn.sigmoid_cross_entropy_with_logits(logits=a, labels=labels)
#cross_entropy = tf.reduce_mean((l3 - y)**2)
#cross_entropy = -tf.reduce_sum(y*tf.log(a))-tf.reduce_sum((1-y)*tf.log(1-a))
#optimizer
def get_train_op(loss):
learning_rate = 1e-3
optimizer = tf.train.RMSPropOptimizer(learning_rate)
return optimizer.minimize(loss, global_step=tf.train.get_global_step())
#training
####
def get_inputs(feature_data, label_data, batch_size, n_epochs=None, shuffle=True):
dataset = tf.data.Dataset.from_tensor_slices(
(feature_data, label_data))
dataset = dataset.repeat(n_epochs)
if shuffle:
dataset = dataset.shuffle(len(feature_data))
dataset = dataset.batch(batch_size)
features, labels = dataset.make_one_shot_iterator().get_next()
return features, labels
def model_fn(features, labels, mode):
a = get_logits(features)
loss = get_loss(a, labels)
train_op = get_train_op(loss)
predictions = tf.greater(a, 0)
accuracy = tf.metrics.accuracy(labels, predictions)
return tf.estimator.EstimatorSpec(
mode=mode,
loss=loss,
train_op=train_op,
eval_metric_ops={'Accuracy': accuracy},
predictions=predictions
)
def train_input_fn():
return get_inputs(
trainArrayValues,
trainArrayLabels,
batchSize
)
def eval_input_fn():
return get_inputs(
testArrayValues,
testArrayLabels,
batchSize,
n_epochs=1,
shuffle=False
)
model_dir = './savedModel'
estimator = tf.estimator.LinearRegressor(feature_columns=[model_fn, model_dir])
#estimator.train(train_input_fn, max_steps=1)
estimator.train(train_input_fn)
estimator.evaluate(eval_input_fn)
Your problem is this line:
estimator = tf.estimator.LinearRegressor(feature_columns=[model_fn, model_dir])
You need to set the feature_columns argument to an array of feature columns. A feature column tells the estimator about the data you're feeding it.
It looks like all your input data is numeric, so I'd call tf.feature_column.numeric_column to create your feature column(s). The documentation is here. For example, the following code creates a numeric feature column containing x-coordinates:
xcol = tf.feature_column.numeric_column('x')
If all your estimator needs are x-coordinates, then you could create the estimator with the following code:
estimator = tf.estimator.LinearRegressor(feature_columns=[xcol])
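A sketch of how this might look for the three inputs in the question, assuming trainArrayValues is a 2-D NumPy array; the column names 'f1', 'f2', 'f3' are hypothetical and should match whatever your data actually uses, and the input_fn must then yield the features as a dict keyed by those same names:
feature_cols = [tf.feature_column.numeric_column(name) for name in ('f1', 'f2', 'f3')]
estimator = tf.estimator.LinearRegressor(feature_columns=feature_cols, model_dir=model_dir)

def train_input_fn():
    # Features keyed by column name; labels returned separately.
    features = {'f1': trainArrayValues[:, 0],
                'f2': trainArrayValues[:, 1],
                'f3': trainArrayValues[:, 2]}
    dataset = tf.data.Dataset.from_tensor_slices((features, trainArrayLabels))
    return dataset.shuffle(len(trainArrayLabels)).batch(batchSize)

estimator.train(train_input_fn)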
