A newbie tensorflow question here. I am doing a project using Google QuickDraw dataset. I used the code given by Google to train a CNN model using the data.:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import ast
import functools
import sys
import tensorflow as tf
tf.logging.set_verbosity('INFO')
def get_num_classes():
with open(FLAGS.classes_path) as label_class:
classes = label_class.readlines()
classes = [x.strip() for x in classes]
return len(classes)
def get_input_fn(mode, tfrecord_pattern, batch_size):
"""Creates an input_fn that stores all the data in memory.
Args:
mode: one of tf.contrib.learn.ModeKeys.{TRAIN, INFER, EVAL}
tfrecord_pattern: path to a TF record file created using create_dataset.py.
batch_size: the batch size to output.
Returns:
A valid input_fn for the model estimator.
"""
def _parse_tfexample_fn(example_proto, mode):
"""Parse a single record which is expected to be a tensorflow.Example."""
feature_to_type = {
"drawing": tf.VarLenFeature(dtype=tf.float32),
"shape": tf.FixedLenFeature([2], dtype=tf.int64)
}
if mode != tf.estimator.ModeKeys.PREDICT:
# The labels won't be available at inference time, so don't add them
# to the list of feature_columns to be read.
feature_to_type["class_index"] = tf.FixedLenFeature([1], dtype=tf.int64)
parsed_features = tf.parse_single_example(example_proto, feature_to_type)
labels = None
if mode != tf.estimator.ModeKeys.PREDICT:
labels = parsed_features["class_index"]
parsed_features["drawing"] = tf.sparse_tensor_to_dense(parsed_features["drawing"])
return parsed_features, labels
def _input_fn():
"""Estimator `input_fn`.
Returns:
A tuple of:
- Dictionary of string feature name to `Tensor`.
- `Tensor` of target labels.
"""
dataset = tf.data.TFRecordDataset.list_files(tfrecord_pattern)
if mode == tf.estimator.ModeKeys.TRAIN:
dataset = dataset.shuffle(buffer_size=10)
dataset = dataset.repeat()
# Preprocesses 10 files concurrently and interleaves records from each file.
dataset = dataset.interleave(
tf.data.TFRecordDataset,
cycle_length=10,
block_length=1)
dataset = dataset.map(
functools.partial(_parse_tfexample_fn, mode=mode),
num_parallel_calls=10)
dataset = dataset.prefetch(10000)
if mode == tf.estimator.ModeKeys.TRAIN:
dataset = dataset.shuffle(buffer_size=1000000)
# Our inputs are variable length, so pad them.
dataset = dataset.padded_batch(
batch_size, padded_shapes=dataset.output_shapes)
features, labels = dataset.make_one_shot_iterator().get_next()
return features, labels
return _input_fn
def model_fn(features, labels, mode, params):
"""Model function for RNN classifier.
This function sets up a neural network which applies convolutional layers (as
configured with params.num_conv and params.conv_len) to the input.
The output of the convolutional layers is given to LSTM layers (as configured
with params.num_layers and params.num_nodes).
The final state of the all LSTM layers are concatenated and fed to a fully
connected layer to obtain the final classification scores.
Args:
features: dictionary with keys: inks, lengths.
labels: one hot encoded classes
mode: one of tf.estimator.ModeKeys.{TRAIN, INFER, EVAL}
params: a parameter dictionary with the following keys: num_layers,
num_nodes, batch_size, num_conv, conv_len, num_classes, learning_rate.
Returns:
ModelFnOps for Estimator API.
"""
def _get_input_tensors(features, labels):
"""Converts the input dict into inks, lengths, and labels tensors."""
# features[ink] is a sparse tensor that is [8, batch_maxlen, 3]
# inks will be a dense tensor of [8, maxlen, 3]
# shapes is [batchsize, 2]
shapes = features["shape"]
# lengths will be [batch_size]
lengths = tf.squeeze(
tf.slice(shapes, begin=[0, 0], size=[params.batch_size, 1]))
inks = tf.reshape(features["drawing"], [params.batch_size, -1, 3])
if labels is not None:
labels = tf.squeeze(labels)
return inks, lengths, labels
def _add_conv_layers(inks, lengths):
"""Adds convolution layers."""
convolved = inks
for i in range(len(params.num_conv)):
convolved_input = convolved
if params.batch_norm:
convolved_input = tf.layers.batch_normalization(
convolved_input,
training=(mode == tf.estimator.ModeKeys.TRAIN))
# Add dropout layer if enabled and not first convolution layer.
if i > 0 and params.dropout:
convolved_input = tf.layers.dropout(
convolved_input,
rate=params.dropout,
training=(mode == tf.estimator.ModeKeys.TRAIN))
convolved = tf.layers.conv1d(
convolved_input,
filters=params.num_conv[i],
kernel_size=params.conv_len[i],
activation=None,
strides=1,
padding="same",
name="conv1d_%d" % i)
return convolved, lengths
def _add_regular_rnn_layers(convolved, lengths):
"""Adds RNN layers."""
if params.cell_type == "lstm":
cell = tf.nn.rnn_cell.BasicLSTMCell
elif params.cell_type == "block_lstm":
cell = tf.contrib.rnn.LSTMBlockCell
cells_fw = [cell(params.num_nodes) for _ in range(params.num_layers)]
cells_bw = [cell(params.num_nodes) for _ in range(params.num_layers)]
if params.dropout > 0.0:
cells_fw = [tf.contrib.rnn.DropoutWrapper(cell) for cell in cells_fw]
cells_bw = [tf.contrib.rnn.DropoutWrapper(cell) for cell in cells_bw]
outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
cells_fw=cells_fw,
cells_bw=cells_bw,
inputs=convolved,
sequence_length=lengths,
dtype=tf.float32,
scope="rnn_classification")
return outputs
def _add_cudnn_rnn_layers(convolved):
"""Adds CUDNN LSTM layers."""
# Convolutions output [B, L, Ch], while CudnnLSTM is time-major.
convolved = tf.transpose(convolved, [1, 0, 2])
lstm = tf.contrib.cudnn_rnn.CudnnLSTM(
num_layers=params.num_layers,
num_units=params.num_nodes,
dropout=params.dropout if mode == tf.estimator.ModeKeys.TRAIN else 0.0,
direction="bidirectional")
outputs, _ = lstm(convolved)
# Convert back from time-major outputs to batch-major outputs.
outputs = tf.transpose(outputs, [1, 0, 2])
return outputs
def _add_rnn_layers(convolved, lengths):
"""Adds recurrent neural network layers depending on the cell type."""
if params.cell_type != "cudnn_lstm":
outputs = _add_regular_rnn_layers(convolved, lengths)
else:
outputs = _add_cudnn_rnn_layers(convolved)
# outputs is [batch_size, L, N] where L is the maximal sequence length and N
# the number of nodes in the last layer.
mask = tf.tile(
tf.expand_dims(tf.sequence_mask(lengths, tf.shape(outputs)[1]), 2),
[1, 1, tf.shape(outputs)[2]])
zero_outside = tf.where(mask, outputs, tf.zeros_like(outputs))
outputs = tf.reduce_sum(zero_outside, axis=1)
return outputs
def _add_fc_layers(final_state):
"""Adds a fully connected layer."""
return tf.layers.dense(final_state, params.num_classes)
# Build the model.
inks, lengths, labels = _get_input_tensors(features, labels)
convolved, lengths = _add_conv_layers(inks, lengths)
final_state = _add_rnn_layers(convolved, lengths)
logits = _add_fc_layers(final_state)
# Add the loss.
cross_entropy = tf.reduce_mean(
tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=labels, logits=logits))
# Add the optimizer.
train_op = tf.contrib.layers.optimize_loss(
loss=cross_entropy,
global_step=tf.train.get_global_step(),
learning_rate=params.learning_rate,
optimizer="Adam",
# some gradient clipping stabilizes training in the beginning.
clip_gradients=params.gradient_clipping_norm,
summaries=["learning_rate", "loss", "gradients", "gradient_norm"])
# Compute current predictions.
predictions = tf.argmax(logits, axis=1)
return tf.estimator.EstimatorSpec(
mode=mode,
predictions={"logits": logits, "predictions": predictions},
loss=cross_entropy,
train_op=train_op,
eval_metric_ops={"accuracy": tf.metrics.accuracy(labels, predictions)})
def create_estimator_and_specs(run_config):
"""Creates an Experiment configuration based on the estimator and input fn."""
model_params = tf.contrib.training.HParams(
num_layers=FLAGS.num_layers,
num_nodes=FLAGS.num_nodes,
batch_size=FLAGS.batch_size,
num_conv=ast.literal_eval(FLAGS.num_conv),
conv_len=ast.literal_eval(FLAGS.conv_len),
num_classes=get_num_classes(),
learning_rate=FLAGS.learning_rate,
gradient_clipping_norm=FLAGS.gradient_clipping_norm,
cell_type=FLAGS.cell_type,
batch_norm=FLAGS.batch_norm,
dropout=FLAGS.dropout)
estimator = tf.estimator.Estimator(
model_fn=model_fn,
config=run_config,
params=model_params)
train_spec = tf.estimator.TrainSpec(input_fn=get_input_fn(
mode=tf.estimator.ModeKeys.TRAIN,
tfrecord_pattern=FLAGS.training_data,
batch_size=FLAGS.batch_size), max_steps=FLAGS.steps)
eval_spec = tf.estimator.EvalSpec(input_fn=get_input_fn(
mode=tf.estimator.ModeKeys.EVAL,
tfrecord_pattern=FLAGS.eval_data,
batch_size=FLAGS.batch_size))
return estimator, train_spec, eval_spec
def main(unused_args):
estimator, train_spec, eval_spec = create_estimator_and_specs(
run_config=tf.estimator.RunConfig(
model_dir=FLAGS.model_dir,
save_checkpoints_secs=300,
save_summary_steps=100))
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--training_data",
type=str,
default="/Train",
help="Path to training data (tf.Example in TFRecord format)")
parser.add_argument(
"--eval_data",
type=str,
default="/Eval",
help="Path to evaluation data (tf.Example in TFRecord format)")
parser.add_argument(
"--classes_file",
type=str,
default="",
help="Path to a file with the classes - one class per line")
parser.add_argument(
"--num_layers",
type=int,
default=3,
help="Number of recurrent neural network layers.")
parser.add_argument(
"--num_nodes",
type=int,
default=128,
help="Number of node per recurrent network layer.")
parser.add_argument(
"--num_conv",
type=str,
default="[48, 64, 96]",
help="Number of conv layers along with number of filters per layer.")
parser.add_argument(
"--conv_len",
type=str,
default="[5, 5, 3]",
help="Length of the convolution filters.")
parser.add_argument(
"--cell_type",
type=str,
default="lstm",
help="Cell type used for rnn layers: cudnn_lstm, lstm or block_lstm.")
parser.add_argument(
"--batch_norm",
type="bool",
default="False",
help="Whether to enable batch normalization or not.")
parser.add_argument(
"--learning_rate",
type=float,
default=0.0001,
help="Learning rate used for training.")
parser.add_argument(
"--gradient_clipping_norm",
type=float,
default=9.0,
help="Gradient clipping norm used during training.")
parser.add_argument(
"--dropout",
type=float,
default=0.3,
help="Dropout used for convolutions and bidi lstm layers.")
parser.add_argument(
"--steps",
type=int,
default=100000,
help="Number of training steps.")
parser.add_argument(
"--batch_size",
type=int,
default=8,
help="Batch size to use for training/evaluation.")
parser.add_argument(
"--model_dir",
type=str,
default="A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints",
help="Path for storing the model checkpoints.")
parser.add_argument(
"--self_test",
type="bool",
default="False",
help="Whether to enable batch normalization or not.")
parser.add_argument(
"--classes_path",
type=str,
default="A:\Code\Machine Learning\Software Engineering project\Quick Draw\quickdraw-dataset-master\categories.txt",
help="Path of the text file which contains name of classes"
)
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
I have data in TFRecords format. And after running the code, it creates an eval folder with events.out.my system name files, some more events.out files in the main folder, a graph.pbtxt file, model.ckpt-0.data-0000-of-0001, model.ckpt.index, model.ckpt.meta and a file named checkpoint. What are all these files and how to use them for prediction of testing data?
Also, why there just one model.ckpt.data when the data consists of 9 training and 9 eval files.
Another question: When I ran the code, it finished execution quite quickly, while most deep learning models take long time to execute. I mean there are 3450000 examples for training and 345000 for eval and everything was done in like a minute. I am new to TensorFlow, so please keep that in mind. I don't know much things about it.
Update: After adding the line tf.logging.set_verbosity('INFO') to the code, I am getting the following output:
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints\model.ckpt-0
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints\model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-03-23-22:28:00
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints\model.ckpt-0
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-03-23-22:28:02
INFO:tensorflow:Saving dict for global step 0: accuracy = 0.0, global_step = 0, loss = 0.0
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 0: A:\Code\Machine Learning\Software Engineering project\Quick Draw\Model Checkpoints\model.ckpt-0
INFO:tensorflow:Loss for final step: None.
Why is the accuracy 0.0 and loss 0.0. I think there is some problem with this, but don't know exacttly what. he training is not happening on all examples I think.
Most obvious reason, your model doesn't receive any training data. As far as I could understand, your get_input_fn contains unnecessary functionality and probably behaves not as you need it to.
In particular, using if mode==train twice
if mode == tf.estimator.ModeKeys.TRAIN:
dataset = dataset.shuffle(buffer_size=10)
dataset = dataset.repeat()
# Preprocesses 10 files concurrently and interleaves records from each file.
dataset = dataset.interleave(
tf.data.TFRecordDataset,
cycle_length=10,
block_length=1)
dataset = dataset.map(
functools.partial(_parse_tfexample_fn, mode=mode),
num_parallel_calls=10)
dataset = dataset.prefetch(10000)
if mode == tf.estimator.ModeKeys.TRAIN:
dataset = dataset.shuffle(buffer_size=1000000)
And returning iterator, which shoudn't be used with Estimator API
features, labels = dataset.make_one_shot_iterator().get_next()
return features, labels
In order to debug, try simple input pipeline first and add functionality step by step
dataset = tf.data.TFRecordDataset.list_files(tfrecord_pattern)
dataset = dataset.shuffle(buffer_size=10)
dataset = dataset.repeat()
dataset = dataset.interleave(tf.data.TFRecordDataset, cycle_length=10, block_length=1)
dataset = dataset.map(_parse_tfexample_fn, num_parallel_calls=10)
dataset = dataset.prefetch(1)
dataset = dataset.padded_batch(batch_size, padded_shapes=dataset.output_shapes)
return dataset
And try using estimator.train at first
P.s. This is not a ready to use function, you should adapt it according to your input data.
Related
I recently switched from Tensorflow 1.14 and Estimaror API to Tensorflow 2.0 and keras API.I am working on an image segmentation problem so the inputs/outputs/labels are all images. When I used Estimator, things where pretty straight forward. In model_fn where the arguments were (features, labels, mode, params) I could just pick the features and labels, do the necessary processing and then pass it in tf.summary.image() and everything worked like a charm. Now, using the keras API, although it provides greater ease of use, it makes hard to do simple handling on data during training, which becomes even harder when it is used with dataset API.Example:
Tensorflow 1.14/Estimator:
def model_fn(features, labels, mode, params):
loss, train_op, = None, None
eval_metric_ops, training_hooks, evaluation_hooks = None, None, None
output = model(input=features)
predictions = tf.argmax(output, axis=-1)
predictions_dict = {'predicted': predictions}
dice_score = tf.contrib.metrics.f1_score(labels=label, predictions=predictions[:, :, :, 1])
if mode in (estimator.ModeKeys.TRAIN, estimator.ModeKeys.EVAL):
global_step = tf.train.get_or_create_global_step()
learning_rate = tf.train.exponential_decay(params['lr'], global_step=global_step,
decay_steps=params['decay_steps'],
decay_rate=params['decay_rate'], staircase=False)
loss = loss_fn(outputs=predictions, labels=labels)
summary.image('Input_Image', features)
summary.image('Label', tf.expand_dims(tf.cast(label, dtype=tf.float32), axis=-1))
summary.image('Prediction', tf.expand_dims(tf.cast(predictions, dtype=tf.float32), axis=-1))
if mode == estimator.ModeKeys.TRAIN:
with tf.name_scope('Metrics'):
summary.scalar('Dice_Coefficient', dice_score[1])
summary.scalar('Learning_Rate', learning_rate)
summary.merge_all()
train_logs_hook = tf.estimator.LoggingTensorHook({'Dice_Coefficient': dice_score[1]},every_n_iter=params['train_log_every_n_steps']) every_n_iter=params['train_log_every_n_steps'])
training_hooks = [train_logs_hook]
train_op = Adam(learning_rate=learning_rate, epsilon=params['epsilon']).minimize(loss=loss, global_step=global_step)
if mode == estimator.ModeKeys.EVAL:
eval_metric_ops = {'Metrics/Dice_Coefficient': dice_score}
eval_summary_hook = tf.estimator.SummarySaverHook(output_dir=params['eval_metrics_path'],
summary_op=summary.merge_all(),
save_steps=params['eval_steps_per_summary_save'])
evaluation_hooks = [eval_summary_hook]
return estimator.EstimatorSpec(mode,
predictions=predictions_dict,
loss=loss,
train_op=train_op,
eval_metric_ops=eval_metric_ops,
training_hooks=training_hooks,
evaluation_hooks=evaluation_hooks)
Using Keras with Tensorflow 2.0 AFAIK, I can't have this kind of access to the Input/Output tensors during training or evaluation (notice than even though during evaluation estimator dont get the image summaries, you can still have access to preview the results by using a tf.estimator.SummarySaverHook). Below is my falied attempt:
def train_data(params): # Similar is the eval_data
def standardization_summaries(image, label, step, writer):
# Some processing to images
with writer.as_default():
tf.summary.image('Input_dataset', image, step=step, max_outputs=1)
tf.summary.image('label_dataset', label, step=step, max_outputs=1)
return image, label
data_set = tf.data.Dataset.from_generator(generator=lambda: data_generator(params),
output_types=(tf.float32, tf.int64),
output_shapes=(tf.TensorShape([None, None]), tf.TensorShape([None, None])))
data_set = data_set.map(lambda x, y: standardization_summaries(image=x, label=y, step=params['global_step'], writer=params['writer']))
data_set = data_set.batch(params['batch_size'])
data_set = data_set.prefetch(buffer_size=-1)
return data_set
model = tf.keras.models.load_model(saved_model)
summary_writer = tf.summary.create_file_writer(save_model_path)
step = tf.Variable(0, trainable=False, dtype=tf.int64)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=save_model_path, histogram_freq=1, write_graph=True,
write_images=False)
early_stop = tf.keras.callbacks.EarlyStopping(patience=args.early_stop)
callbacks = [tensorboard, early_stop]
params = {'batch_size': args.batch_size,
'global_step': step,
'writer': summary_writer}
model.fit(x=train_data(params), epochs=args.epochs, initial_epoch=args.initial_epoch,
validation_data=val_data(params), steps_per_epoch=2, callbacks=callbacks)
Getting the input images from the dataset API came from here but this just gets tons of images whenever the dataset fetches data from the generator. Also, with the step variable being constant and not changing (I can't figure out how to make it walk) everything is just under the step 0 and I can't think any viable way to connect these outputs with the predicted output, given that I would find a way to print them.
So, the question is: Is there anything that I am still missing with Keras API and Tensorboard synergies on image summaries. Is there a way to save image summaries lets say, for every half epoch in training and once at the end of evaluation or should I just let the model be trained and get the training outputs through model.predict() at the end of training an then inspect if something goes wrong(which is not efficient)?
I have some fundamental questions about the algorithms I picked in my Tensorflow project. I fed in around 1 million sets of training data and still couldn't get the accurate enough prediction results.
The code I am using is based on an old Tensorflow example (https://github.com/tensorflow/tensorflow/blob/r1.3/tensorflow/examples/tutorials/estimators/abalone.py). The goal of this example is to predict the age of an abalone based on the training features provided.
My purpose is very similar. The only difference is that I have more labels(6) compared to my features(4). Since the predictions after training are way off, the concern of the feasibility of this project is starting to raise some concerns.
I am pretty new to Machine Learning and Tensorflow so I am not very sure if I have picked the proper methods for this project. I'd like to know if there are some ways to improve my current code to possibly improve the accuracy of the predictions, like more layers, different optimization methods, etc.
Here is the code:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
LEARNING_RATE = 0.001
def model_fn(features, labels, mode, params):
"""Model function for Estimator."""
first_hidden_layer = tf.layers.dense(features["x"], 10, activation=tf.nn.relu)
# Connect the second hidden layer to first hidden layer with relu
second_hidden_layer = tf.layers.dense(
first_hidden_layer, 10, activation=tf.nn.relu)
# Connect the output layer to second hidden layer (no activation fn)
output_layer = tf.layers.dense(second_hidden_layer, 6)
# Reshape output layer to 1-dim Tensor to return predictions
predictions = tf.reshape(output_layer, [-1,6])
# Provide an estimator spec for `ModeKeys.PREDICT`.
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(
mode=mode,
predictions={"ages": predictions})
# Calculate loss using mean squared error
loss = tf.losses.mean_squared_error(labels, predictions)
optimizer = tf.train.GradientDescentOptimizer(
learning_rate=params["learning_rate"])
train_op = optimizer.minimize(
loss=loss, global_step=tf.train.get_global_step())
# Calculate root mean squared error as additional eval metric
eval_metric_ops = {
"rmse": tf.metrics.root_mean_squared_error(
tf.cast(labels, tf.float64), predictions)
}
# Provide an estimator spec for `ModeKeys.EVAL` and `ModeKeys.TRAIN` modes.
return tf.estimator.EstimatorSpec(
mode=mode,
loss=loss,
train_op=train_op,
eval_metric_ops=eval_metric_ops)
def main(unused_argv):
train_file = "training_data_mc1000.csv"
test_file = "test_data_mc1000.csv"
train_features_interim = pd.read_csv(train_file, usecols=['vgs', 'vbs', 'vds', 'current'])
train_features_numpy = np.asarray(train_features_interim, dtype=np.float64)
train_labels_interim = pd.read_csv(train_file, usecols=['plo_tox', 'plo_dxl', 'plo_dxw', 'parl1', 'parl2', 'random_fn'])
train_labels_numpy = np.asarray(train_labels_interim, dtype=np.float64)
# Set model params
model_params = {"learning_rate": LEARNING_RATE}
# Instantiate Estimator
nn = tf.estimator.Estimator(model_fn=model_fn, params=model_params)
train_input_fn = tf.estimator.inputs.numpy_input_fn(
x={"x": train_features_numpy},
y=train_labels_numpy,
num_epochs=None,
shuffle=True)
# Train
nn.train(input_fn=train_input_fn, max_steps=1048576)
test_features_interim = pd.read_csv(test_file, usecols=['vgs', 'vbs', 'vds', 'current'])
test_features_numpy = np.asarray(test_features_interim, dtype=np.float64)
test_labels_interim = pd.read_csv(test_file, usecols=['plo_tox', 'plo_dxl', 'plo_dxw', 'parl1', 'parl2', 'random_fn'])
test_labels_numpy = np.asarray(test_labels_interim, dtype=np.float64)
# Score accuracy
test_input_fn = tf.estimator.inputs.numpy_input_fn(
x={"x": test_features_numpy},
y=test_labels_numpy,
num_epochs=1,
shuffle=False)
ev = nn.evaluate(input_fn=test_input_fn)
print("Loss: %s" % ev["loss"])
print("Root Mean Squared Error: %s" % ev["rmse"])
prediction_file = "Tensorflow_prediction_data.csv"
predict_features_interim = pd.read_csv(prediction_file, usecols=['vgs', 'vbs', 'vds', 'current'])
predict_features_numpy = np.asarray(predict_features_interim, dtype=np.float64)
# Print out predictions
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
x= {"x": predict_features_numpy},
num_epochs=1,
shuffle=False)
predictions = nn.predict(input_fn=predict_input_fn)
for i, p in enumerate(predictions):
print("Prediction %s: %s" % (i + 1, p["ages"]))
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
parser = argparse.ArgumentParser()
parser.register("type", "bool", lambda v: v.lower() == "true")
parser.add_argument(
"--train_data", type=str, default="", help="Path to the training data.")
parser.add_argument(
"--test_data", type=str, default="", help="Path to the test data.")
parser.add_argument(
"--predict_data",
type=str,
default="",
help="Path to the prediction data.")
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
And portion of the training and testing data looks like
The last four columns are the features and the first six columns are the labels. Again, you can see that I am having more labels than features. My goal is to train a model that when I feed in new sets of features, it can predict accurate enough labels.
The following part is added for clarification of my data sets. Thanks for the first ones that commented my question that reminds me to put this up as well.
The relation between my features and labels are : every 30(vgs)X10(vbs)X10(vds) corresponds to 1 set of labels. Basically it is like a 3-D array with the first three features acting like coordinates and the last feature(current) as the value that is stored within each cells. That's why the labels from the portions I showed are all the same.
Another question now is that I am expecting the loss should be getting smaller and smaller as the training progresses, but it is not. I think this should be another reason why the output is not accurate cuz the minimizing loss part isn't working. I don't really know why, though.
Thanks for taking time looking at this and I'd love to have a discussion down below.
From what I can see in your code, you are not normalizing your features. Try normalizing them for example to have mean zero and std=1. Since your features are in a completely different range, this normalization might help.
It would also be helpful to see other labels. The ones in the provided picture are all the same.
I am trying to practice on how to use estimators in tensorflow by using the abalone dataset. However, even though I tried different numbers of training steps , batch size, activation functions, learning rates and lays of networks the evaluation accuray is still around 20%. I tried the same structure with iris dataset and the accuracy is over 90%. I am wondering if the abalone dataset is not supposed to have high accuracy results? Or it should be trained using a different model other than DNN? My code is attached. Thank you very much!
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import tensorflow as tf
import Aba_data
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=1000, type=int, help='batch size')
parser.add_argument('--train_steps', default=100000, type=int,
help='number of training steps')
def my_model(features, labels, mode, params):
"""DNN with three hidden layers, and dropout of 0.1 probability."""
# Create three fully connected layers each layer having a dropout
# probability of 0.1.
net = tf.feature_column.input_layer(features, params['feature_columns'])
for units in params['hidden_units']:
net = tf.layers.dense(net, units=units, activation=tf.sigmoid)
# Compute logits (1 per class).
logits = tf.layers.dense(net, params['n_classes'], activation=None)
# Compute predictions.
predicted_classes = tf.argmax(logits, 1)
if mode == tf.estimator.ModeKeys.PREDICT:
predictions = {
'class_ids': predicted_classes[:, tf.newaxis],
'probabilities': tf.nn.softmax(logits),
'logits': logits,
}
return tf.estimator.EstimatorSpec(mode, predictions=predictions)
# Compute loss.
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
# Compute evaluation metrics.
accuracy = tf.metrics.accuracy(labels=labels,
predictions=predicted_classes,
name='acc_op')
metrics = {'accuracy': accuracy}
tf.summary.scalar('accuracy', accuracy[1])
if mode == tf.estimator.ModeKeys.EVAL:
return tf.estimator.EstimatorSpec(
mode, loss=loss, eval_metric_ops=metrics)
# Create training op.
assert mode == tf.estimator.ModeKeys.TRAIN
optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
def main(argv):
args = parser.parse_args(argv[1:])
# Fetch the data
(train_x, train_y), (test_x, test_y) = Aba_data.load_data(0.7)
# Feature columns describe how to use the input.
my_feature_columns = []
for key in train_x.keys():
if key == 'Sex':
my_feature_columns.append(tf.feature_column.indicator_column(
tf.feature_column.categorical_column_with_vocabulary_list(
key=key, vocabulary_list=["M", "F", "I"])))
else:
my_feature_columns.append(tf.feature_column.numeric_column(key=key))
# Build 2 hidden layer DNN with 10, 10 units respectively.
classifier = tf.estimator.Estimator(
model_fn=my_model,
params={
'feature_columns': my_feature_columns,
# Two hidden layers of 10 nodes each.
'hidden_units': [10, 10],
# The model must choose between 3 classes.
'n_classes': 30,
})
# Train the Model.
classifier.train(
input_fn=lambda:Aba_data.train_input_fn(train_x, train_y, args.batch_size),
steps=args.train_steps)
# Evaluate the model.
eval_result = classifier.evaluate(
input_fn=lambda:Aba_data.eval_input_fn(test_x, test_y, args.batch_size))
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
# Generate predictions from the model
# expected = ['Setosa', 'Versicolor', 'Virginica']
# predict_x = {
# 'SepalLength': [5.1, 5.9, 6.9],
# 'SepalWidth': [3.3, 3.0, 3.1],
# 'PetalLength': [1.7, 4.2, 5.4],
# 'PetalWidth': [0.5, 1.5, 2.1],
# }
#
# predictions = classifier.predict(
# input_fn=lambda:iris_data.eval_input_fn(predict_x,
# labels=None,
# batch_size=args.batch_size))
#
# for pred_dict, expec in zip(predictions, expected):
# template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"')
#
# class_id = pred_dict['class_ids'][0]
# probability = pred_dict['probabilities'][class_id]
#
# print(template.format(iris_data.SPECIES[class_id],
# 100 * probability, expec))
if __name__ == '__main__':
tf.logging.set_verbosity(tf.logging.INFO)
tf.app.run(main)
I have been trying to develop a CNN model for image classification. I am new to tensorflow and getting help from the following books
Learning.TensorFlow.A.Guide.to.Building.Deep.Learning.Systems
TensorFlow For Machine Intelligence by Sam Abrahams
For the past few weeks I have been working to develop a good model but I always get the same prediction. I have tried many different architectures but no luck!
Lately I decided to test my model with CIFAR-10 dataset and using the exact same model as given in the Learning Tensorflow book. But the outcome was same (same class for every image) even after training for 50K steps.
Here is highlight of my model and code.
1.) Downloaded CIFAR-10 image sets, converted them into tfrecord files with labels(labels are string for each category of CIFAR-10 in the tfrecord file) each for training and test set.
2) Reading the images from tfrecord file and generating random shuffle batch of size 100.
3) Converting the label from string to the integer32 type from 0-9 each for given category
4) Pass the training and test batches to the network and getting the output of [batch_size , num_class] size.
5) Train the model using Adam optimizer and softmax cross entropy loss function (Have tried gradient optimizer as well)
7) evaluate the model for test batches before and after the training.
8) Getting the same prediction for entire data set (But different every time I re run the code to try again)
Is there something wrong I am doing here? I would appreciate if someone could help me out with this problem.
Note - My approach of converting images and labels into tfrecord could be unusual but believe me I have come up with this idea from the books I mentioned earlier.
My code for the problem:
import tensorflow as tf
import numpy as np
import _datetime as dt
import PIL
# The glob module allows directory listing
import glob
import random
from itertools import groupby
from collections import defaultdict
H , W = 32 , 32 # Height and weight of the image
C = 3 # Number of channels
sessInt = tf.InteractiveSession()
# Read file and return the batches of the input data
def get_Batches_From_TFrecord(tf_record_filenames_list, batch_size):
# Match and load all the tfrecords found in the specified directory
tf_record_filename_queue = tf.train.string_input_producer(tf_record_filenames_list)
# It may have more than one example in them.
tf_record_reader = tf.TFRecordReader()
tf_image_name, tf_record_serialized = tf_record_reader.read(tf_record_filename_queue)
# The label and image are stored as bytes but could be stored as int64 or float64 values in a
# serialized tf.Example protobuf.
tf_record_features = tf.parse_single_example(tf_record_serialized,
features={'label': tf.FixedLenFeature([], tf.string),
'image': tf.FixedLenFeature([], tf.string), })
# Using tf.uint8 because all of the channel information is between 0-255
tf_record_image = tf.decode_raw(tf_record_features['image'], tf.uint8)
try:
# Reshape the image to look like the input image
tf_record_image = tf.reshape(tf_record_image, [H, W, C])
except:
print(tf_image_name)
tf_record_label = tf.cast(tf_record_features['label'], tf.string)
'''
#Check the image and label
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sessInt, coord=coord)
label = tf_record_label.eval().decode()
print(label)
image = PIL.Image.fromarray(tf_record_image.eval())
image.show()
coord.request_stop()
coord.join(threads)
'''
# creating a batch to feed the data
min_after_dequeue = 10 * batch_size
capacity = min_after_dequeue + 5 * batch_size
# Shuffle examples while feeding in the queue
image_batch, label_batch = tf.train.shuffle_batch([tf_record_image, tf_record_label], batch_size=batch_size,
capacity=capacity, min_after_dequeue=min_after_dequeue)
# Sequential feed in the examples in the queue (Don't shuffle)
# image_batch, label_batch = tf.train.batch([tf_record_image, tf_record_label], batch_size=batch_size, capacity=capacity)
# Converting the images to a float to match the expected input to convolution2d
float_image_batch = tf.image.convert_image_dtype(image_batch, tf.float32)
string_label_batch = label_batch
return float_image_batch, string_label_batch
#Count the number of images in the tfrecord file
def number_of_records(tfrecord_file_name):
count = 0
record_iterator = tf.python_io.tf_record_iterator(path = tfrecord_file_name)
for record in record_iterator:
count+=1
return count
def get_num_of_samples(tfrecords_list):
total_samples = 0
for tfrecord in tfrecords_list:
total_samples += number_of_records(tfrecord)
return total_samples
# Provide the input tfrecord names in a list
train_filenames = ["./TFRecords/cifar_train.tfrecord"]
test_filename = ["./TFRecords/cifar_test.tfrecord"]
num_train_samples = get_num_of_samples(train_filenames)
num_test_samples = get_num_of_samples(test_filename)
print("Number of Training samples: ", num_train_samples)
print("Number of Test samples: ", num_test_samples)
'''
IMP Note : (Batch_size * Training_Steps) should be at least greater than (2*Number_of_samples) for shuffling of batches
'''
train_batch_size = 100
# Total number of batches for input records
# Note - Num of samples in the tfrecord file can be determined by the tfrecord iterator.
# Batch size for test samples
test_batch_size = 50
train_image_batch, train_label_batch = get_Batches_From_TFrecord(train_filenames, train_batch_size)
test_image_batch, test_label_batch = get_Batches_From_TFrecord(test_filename, test_batch_size)
# Definition of the convolution network which returns a single neuron for each input image in the batch
# Define a placeholder for keep probability in dropout
# (Dropout should only use while training, for testing dropout should be always 1.0)
fc_prob = tf.placeholder(tf.float32)
conv_prob = tf.placeholder(tf.float32)
#Helper function to add learned filters(images) into tensorboard summary - for a random input in the batch
def add_filter_summary(name, filter_tensor):
rand_idx = random.randint(0,filter_tensor.get_shape()[0]-1) #Choose any random number from[0,batch_size)
#dispay_filter = filter_tensor[random.randint(0,filter_tensor.get_shape()[3])]
dispay_filter = filter_tensor[5] #keeping the index fix for consistency in visualization
with tf.name_scope("Filter_Summaries"):
img_summary = tf.summary.image(name, tf.reshape(dispay_filter,[-1 , filter_tensor.get_shape()[1],filter_tensor.get_shape()[1],1] ), max_outputs = 500)
# Helper functions for the network
def weight_initializer(shape):
weights = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(weights)
def bias_initializer(shape):
biases = tf.constant(0.1, shape=shape)
return tf.Variable(biases)
def conv2d(input, weights, stride):
return tf.nn.conv2d(input, filter=weights, strides=[1, stride, stride, 1], padding="SAME")
def pool_layer(input, window_size=2 , stride=2):
return tf.nn.max_pool(input, ksize=[1, window_size, window_size, 1], strides=[1, stride, stride, 1], padding='VALID')
# This is the actual layer we will use.
# Linear convolution as defined in conv2d, with a bias,
# followed by the ReLU nonlinearity.
def conv_layer(input, filter_shape , stride=1):
W = weight_initializer(filter_shape)
b = bias_initializer([filter_shape[3]])
return tf.nn.relu(conv2d(input, W, stride) + b)
# A standard full layer with a bias. Notice that here we didn’t add the ReLU.
# This allows us to use the same layer for the final output,
# where we don’t need the nonlinear part.
def full_layer(input, out_size):
in_size = int(input.get_shape()[1])
W = weight_initializer([in_size, out_size])
b = bias_initializer([out_size])
return tf.matmul(input, W) + b
## Model fro the book learning tensorflow - for CIFAR data
def conv_network(image_batch, batch_size):
# Now create the model which returns the output neurons (eequals to the number of labels)
# as a final fully connecetd layer output. Which we can use as input to the softmax classifier
C1 , C2 , C3 = 30 , 50, 80 # Number of output features for each convolution layer
F1 = 500 # Number of output neuron for FC1 layer
#Add original image to tensorboard summary
add_filter_summary("Original" , image_batch)
# First convolutaion layer with 5x5 filter size and 32 filters
conv1 = conv_layer(image_batch, filter_shape=[3, 3, C, C1])
pool1 = pool_layer(conv1, window_size=2)
pool1 = tf.nn.dropout(pool1, keep_prob=conv_prob)
add_filter_summary("conv1" , pool1)
# Second convolutaion layer with 5x5 filter_size and 64 filters
conv2 = conv_layer(pool1, filter_shape=[5, 5, C1, C2])
pool2 = pool_layer(conv2, 2)
pool2 = tf.nn.dropout(pool2, keep_prob=conv_prob)
add_filter_summary("conv2" , pool2)
# Third convolution layer
conv3 = conv_layer(pool2, filter_shape=[5, 5, C2, C3])
# Since at this point the feature maps are of size 8×8 (following the first two poolings
# that each reduced the 32×32 pictures by half on each axis).
# This last pool layer pools each of the feature maps and keeps only the maximal value.
# The number of feature maps at the third block was set to 80,
# so at that point (following the max pooling) the representation is reduced to only 80 numbers
pool3 = pool_layer(conv3, window_size = 8 , stride=8)
pool3 = tf.nn.dropout(pool3, keep_prob=conv_prob)
add_filter_summary("conv3" , pool3)
# Reshape the output to feed to the FC layer
flatterned_layer = tf.reshape(pool3, [batch_size,
-1]) # -1 is to specify to use all the dimensions remaining in the input (other than batch_size).reshape(input , )
fc1 = tf.nn.relu(full_layer(flatterned_layer, F1))
full1_drop = tf.nn.dropout(fc1, keep_prob=fc_prob)
# Fully connected layer 2 (output layer)
final_Output = full_layer(full1_drop, 10)
return final_Output, tf.summary.merge_all()
# Now that architecture is created , next step is to create the classification model
# (to predict the output class of the input data)
# Here we have used Logistic regression (Sigmoid function) to predict the output because we have only rwo class.
# For multiple class problem - softmax is the best prediction function
# Prepare the inputs to the input
Train_X , img_summary = conv_network(train_image_batch, train_batch_size)
Test_X , _ = conv_network(test_image_batch, test_batch_size)
# Generate 0 based index for labels
Train_Y = tf.to_int32(tf.argmax(
tf.to_int32(tf.stack([tf.equal(train_label_batch, ["airplane"]), tf.equal(train_label_batch, ["automobile"]),
tf.equal(train_label_batch, ["bird"]),tf.equal(train_label_batch, ["cat"]),
tf.equal(train_label_batch, ["deer"]),tf.equal(train_label_batch, ["dog"]),
tf.equal(train_label_batch, ["frog"]),tf.equal(train_label_batch, ["horse"]),
tf.equal(train_label_batch, ["ship"]), tf.equal(train_label_batch, ["truck"]) ])), 0))
Test_Y = tf.to_int32(tf.argmax(
tf.to_int32(tf.stack([tf.equal(test_label_batch, ["airplane"]), tf.equal(test_label_batch, ["automobile"]),
tf.equal(test_label_batch, ["bird"]),tf.equal(test_label_batch, ["cat"]),
tf.equal(test_label_batch, ["deer"]),tf.equal(test_label_batch, ["dog"]),
tf.equal(test_label_batch, ["frog"]),tf.equal(test_label_batch, ["horse"]),
tf.equal(test_label_batch, ["ship"]), tf.equal(test_label_batch, ["truck"]) ])), 0))
# Y = tf.reshape(float_label_batch, X.get_shape())
# compute inference model over data X and return the result
# (using sigmoid function - as this function is the best to predict two class output)
# (For multiclass problem - Softmax is the bset prediction function)
def inference(X):
return tf.nn.softmax(X)
# compute loss over training data X and expected outputs Y
# Cross entropy function is the best suited for loss calculation (Than the squared error function)
# Get the second column of the input to get only the features
def loss(X, Y):
return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=X, labels=Y))
# train / adjust model parameters according to computed total loss (using gradient descent)
def train(total_loss, learning_rate):
return tf.train.AdamOptimizer(learning_rate).minimize(total_loss)
# evaluate the resulting trained model with dropout probability (Ideally 1.0 for testing)
def evaluate(sess, X, Y, dropout_prob):
# predicted = tf.cast(inference(X) > 0.5 , tf.float32)
#print("\nNetwork output:")
#print(sess.run(inference(X) , feed_dict={conv_prob:1.0 , fc_prob:1.0}))
# Inference contains the predicted probability of each class for each input image.
# The class having higher probability is the prediction of the network. y_pred_cls = tf.argmax(y_pred, dimension=1)
predicted = tf.cast(tf.argmax(X, 1), tf.int32)
#print("\npredicted labels:")
#print(sess.run(predicted , feed_dict={conv_prob:1.0 , fc_prob:1.0}))
#print("\nTrue Labels:")
#print(sess.run(Y , feed_dict={conv_prob:1.0 , fc_prob:1.0}))
batch_accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), tf.float32))
# calculate the mean of the accuracies of the each batch (iteration)
# No. of iteration Iteration should cover the (test_batch_size * num_of_iteration ) >= (2* num_of_test_samples ) condition
total_accuracy = np.mean([sess.run(batch_accuracy, feed_dict={conv_prob:1.0 , fc_prob:1.0}) for i in range(250)])
print("Accuracy of the model(in %): {:.4f} ".format(100 * total_accuracy))
# create a saver class to save the training checkpoints
saver = tf.train.Saver(max_to_keep=10)
# Create tensorboard sumamry for loss function
with tf.name_scope("summaries"):
loss_summary = tf.summary.scalar("loss", loss(Train_X, Train_Y))
#merged = tf.summary.merge_all()
# Launch the graph in a session, setup boilerplate
with tf.Session() as sess:
log_writer = tf.summary.FileWriter('./logs', sess.graph)
total_loss = loss(Train_X, Train_Y)
train_op = train(total_loss, 0.001)
#Initialise all variables after defining all variables
tf.global_variables_initializer().run()
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
print(sess.run(Train_Y))
print(sess.run(Test_Y))
evaluate(sess, Test_X, Test_Y,1.0)
# actual training loop------------------------------------------------------
training_steps = 50000
print("\nStarting to train model with", str(training_steps), " steps...")
to1 = dt.datetime.now()
for step in range(1, training_steps + 1):
# print(sess.run(train_label_batch))
sess.run([train_op], feed_dict={fc_prob: 0.5 , conv_prob:0.8}) # Pass the dropout value for training batch to the placeholder
# for debugging and learning purposes, see how the loss gets decremented thru training steps
if step % 100 == 0:
# print("\n")
# print(sess.run(train_label_batch))
loss_summaries, img_summaries , Tloss = sess.run([loss_summary, img_summary, total_loss],
feed_dict={fc_prob: 0.5 , conv_prob:0.8}) # evaluate total loss to add it in summary object
log_writer.add_summary(loss_summaries, step) # add summary for each step
log_writer.add_summary(img_summaries, step)
print("Step:", step, " , loss: ", Tloss)
if step%2000 == 0:
saver.save(sess, "./Models/BookLT_CIFAR", global_step=step, latest_filename="model_chkpoint")
print("\n")
evaluate(sess, Test_X, Test_Y,1.0)
saver.save(sess, "./Models/BookLT_CIFAR", global_step=step, latest_filename="model_chkpoint")
to2 = dt.datetime.now()
print("\nTotal Trainig time Elapsed: ", str(to2 - to1))
# once the training is complete, evaluate the model with test (validation set)-------------------------------------------
# Restore the model file and perform the testing
#saver.restore(sess, "./Models/BookLT3_CIFAR-15000")
print("\nPost Training....")
# Performs Evaluation of model on batches of test samples
# In order to evaluate entire test set , number of iteration should be chosen such that ,
# (test_batch_size * num_of_iteration ) >= (2* num_of_test_samples )
evaluate(sess, Test_X, Test_Y,1.0) # Evaluate multiple batch of test data set (randomly chosen by shuffle train batch queue)
evaluate(sess, Test_X, Test_Y,1.0)
evaluate(sess, Test_X, Test_Y,1.0)
coord.request_stop()
coord.join(threads)
sess.close()
Here is the screenshot of my Pre training result:
Here is the screenshot of the result during training:
Hereis the screenshot of the Post training result
I did not run the code to verify that this is the only issue, but here is one important issue. When classifying, you should use one-hot encoding for your labels. Meaning that if you have 3 classes, you want your labels to be [1, 0, 0] for class 1, [0, 1, 0] for class 2, [0, 0, 1] for class 3. Your approach of using 1, 2, and 3 as labels leads to various issues. For examples, the network is penalized more for predicting class 1 versus predicting class 2 for an image from class 3. TensorFlow functions like tf.nn.softmax_cross_entropy_with_logits work with such representations.
Here is the basic example of correctly using one_hot labels to compute loss: https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/examples/tutorials/mnist/mnist_softmax.py
Here is how the one_hot label is constructed for mnist digits:
https://github.com/tensorflow/tensorflow/blob/438604fc885208ee05f9eef2d0f2c630e1360a83/tensorflow/contrib/learn/python/learn/datasets/mnist.py#L69
I have trained a simple long short-term memory (lstm) model in lasagne following the recipie here:https://github.com/Lasagne/Recipes/blob/master/examples/lstm_text_generation.py
Here is the architecture:
l_in = lasagne.layers.InputLayer(shape=(None, None, vocab_size))
# We now build the LSTM layer which takes l_in as the input layer
# We clip the gradients at GRAD_CLIP to prevent the problem of exploding gradients.
l_forward_1 = lasagne.layers.LSTMLayer(
l_in, N_HIDDEN, grad_clipping=GRAD_CLIP,
nonlinearity=lasagne.nonlinearities.tanh)
l_forward_2 = lasagne.layers.LSTMLayer(
l_forward_1, N_HIDDEN, grad_clipping=GRAD_CLIP,
nonlinearity=lasagne.nonlinearities.tanh)
# The l_forward layer creates an output of dimension (batch_size, SEQ_LENGTH, N_HIDDEN)
# Since we are only interested in the final prediction, we isolate that quantity and feed it to the next layer.
# The output of the sliced layer will then be of size (batch_size, N_HIDDEN)
l_forward_slice = lasagne.layers.SliceLayer(l_forward_2, -1, 1)
# The sliced output is then passed through the softmax nonlinearity to create probability distribution of the prediction
# The output of this stage is (batch_size, vocab_size)
l_out = lasagne.layers.DenseLayer(l_forward_slice, num_units=vocab_size, W = lasagne.init.Normal(), nonlinearity=lasagne.nonlinearities.softmax)
# Theano tensor for the targets
target_values = T.ivector('target_output')
# lasagne.layers.get_output produces a variable for the output of the net
network_output = lasagne.layers.get_output(l_out)
# The loss function is calculated as the mean of the (categorical) cross-entropy between the prediction and target.
cost = T.nnet.categorical_crossentropy(network_output,target_values).mean()
# Retrieve all parameters from the network
all_params = lasagne.layers.get_all_params(l_out)
# Compute AdaGrad updates for training
print("Computing updates ...")
updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)
# Theano functions for training and computing cost
print("Compiling functions ...")
train = theano.function([l_in.input_var, target_values], cost, updates=updates, allow_input_downcast=True)
compute_cost = theano.function([l_in.input_var, target_values], cost, allow_input_downcast=True)
# In order to generate text from the network, we need the probability distribution of the next character given
# the state of the network and the input (a seed).
# In order to produce the probability distribution of the prediction, we compile a function called probs.
probs = theano.function([l_in.input_var],network_output,allow_input_downcast=True)
and the model is trained via:
for it in xrange(data_size * num_epochs / BATCH_SIZE):
try_it_out() # Generate text using the p^th character as the start.
avg_cost = 0;
for _ in range(PRINT_FREQ):
x,y = gen_data(p)
#print(p)
p += SEQ_LENGTH + BATCH_SIZE - 1
if(p+BATCH_SIZE+SEQ_LENGTH >= data_size):
print('Carriage Return')
p = 0;
avg_cost += train(x, y)
print("Epoch {} average loss = {}".format(it*1.0*PRINT_FREQ/data_size*BATCH_SIZE, avg_cost / PRINT_FREQ))
How can I save the model so I do not need to train it again? With scikit I generally just pickle the model object. However I am unclear on the analogous process with Theano / lasagne.
You can save the weights with numpy:
np.savez('model.npz', *lasagne.layers.get_all_param_values(network_output))
And load them again later on like this:
with np.load('model.npz') as f:
param_values = [f['arr_%d' % i] for i in range(len(f.files))]
lasagne.layers.set_all_param_values(network_output, param_values)
Source: https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py
As for the model definition itself: One option is certainly to keep the code and regenerate the network, before setting the pretrained weights.
You can save the model parameters and the model by Pickle
import cPickle as pickle
import os
#save the network and its parameters as a dictionary
netInfo = {'network': network, 'params': lasagne.layers.get_all_param_values(network)}
Net_FileName = 'LSTM.pkl'
# save the dictionary as a .pkl file
pickle.dump(netInfo, open(os.path.join(/path/to/a/folder/, Net_FileName), 'wb'),protocol=pickle.HIGHEST_PROTOCOL)
After saving your model, it can be retrieved by pickle.load:
net = pickle.load(open(os.path.join(/path/to/a/folder/,Net_FileName),'rb'))
all_params = net['params']
lasagne.layers.set_all_param_values(net['network'], all_params)
I've had success using dill in combination with the numpy.savez function:
import dill as pickle
...
np.savez('model.npz', *lasagne.layers.get_all_param_values(network))
with open('model.dpkl','wb') as p_output:
pickle.dump(network, p_output)
To import the pickled model:
with open('model.dpkl', 'rb') as p_input:
network = pickle.load(p_input)
with np.load('model.npz') as f:
param_values = [f['arr_%d' % i] for i in range(len(f.files))]
lasagne.layers.set_all_param_values(network, param_values)