I'm having some trouble using the Estimator API with constant_initializer. Originally, I was trying to load model weights from a .npy file, but the evaluation loss didn't seem to move at all.
I've made a smaller example that seems to have the same problem. When I replace the constant_initializer with any other random initializer, it seems to work. Can anybody explain what is going on?
Here is the main part of the code:
# Big thanks to https://medium.com/onfido-tech/higher-level-apis-in-tensorflow-67bfb602e6c0
import os
import tensorflow as tf
from tensorflow.contrib.learn import ModeKeys
from tensorflow.contrib.learn import learn_runner
from fcn import fcn32_vgg
from fcn import loss as fcn_loss
import voc_dataset
from voc_to_tfrecord import load_voc_dataset
from test_model import SimpleNet, WeightInitializerHook
# Flag registry; parsed values are read back as FLAGS.<name>.
FLAGS = tf.app.flags.FLAGS
# NOTE(review): weights_dir is declared here but not read in the visible code.
tf.app.flags.DEFINE_string(
flag_name='weights_dir', default_value='...',
docstring='Top-level directory where the input data will be looked for.')
# Passed to RunConfig.replace(model_dir=...) in run_experiment.
tf.app.flags.DEFINE_string(
flag_name='model_dir', default_value='...',
docstring='Output directory for model and training stats.')
# Joined with the train/val tfrecord file names in experiment_fn.
tf.app.flags.DEFINE_string(
flag_name='data_dir', default_value='...',
docstring='Directory containing the "voc_segmentation_{train|val}.tfrecord" files.')
def run_experiment(argv=None):
    """Assemble hyper-parameters and the run config, then start train-and-evaluate.

    Args:
        argv: Not used by the body; presumably accepted so the function can
            be handed to ``tf.app.run`` as ``main``.
    """
    # Hyper-parameters forwarded to experiment_fn / model_fn.
    hparams = tf.contrib.training.HParams(
        learning_rate=0.002,
        n_classes=22,
        train_steps=100,
        eval_steps=1,
        min_eval_frequency=10,
        eval_delay_secs=0,
    )

    # Output directory first, then a fixed graph-level random seed.
    base_config = tf.contrib.learn.RunConfig()
    config_with_dir = base_config.replace(model_dir=FLAGS.model_dir)
    seeded_config = config_with_dir.replace(tf_random_seed=42)

    learn_runner.run(
        experiment_fn=experiment_fn,
        run_config=seeded_config,
        schedule="train_and_evaluate",
        hparams=hparams,
    )
def experiment_fn(run_config, params):
    """Build the Experiment: estimator, train/eval input pipelines and hooks.

    Args:
        run_config: RunConfig supplied by learn_runner.
        params: HParams; reads min_eval_frequency, train_steps, eval_steps
            and eval_delay_secs.

    Returns:
        A ``tf.contrib.learn.Experiment`` ready to be run.
    """
    # Checkpoint at the evaluation cadence.
    checkpoint_config = run_config.replace(
        save_checkpoints_steps=params.min_eval_frequency)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        params=params,
        config=checkpoint_config,
    )

    def build_inputs(file_name, scope, repeat):
        # Each call yields an (input_fn, hook) pair.
        return voc_dataset.get_inputs(
            batch_size=64,
            tfrecords_path=os.path.join(FLAGS.data_dir, file_name),
            name_scope=scope,
            shuffle_and_repeat=repeat)

    train_input_fn, train_input_hook = build_inputs(
        'voc_segmentation_train.tfrecords', 'train_data', True)
    eval_input_fn, eval_input_hook = build_inputs(
        'voc_segmentation_val.tfrecords', 'eval_data', False)

    return tf.contrib.learn.Experiment(
        estimator=estimator,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=params.train_steps,
        eval_steps=params.eval_steps,
        min_eval_frequency=params.min_eval_frequency,
        train_monitors=[train_input_hook],
        eval_hooks=[eval_input_hook],
        eval_delay_secs=params.eval_delay_secs,
    )
def model_fn(features, labels, mode, params):
    """Estimator model function wrapping SimpleNet.

    Args:
        features: Input image batch fed to the network and to the image summary.
        labels: Label batch; ``None`` when the estimator is used for inference.
        mode: One of the ``ModeKeys`` values.
        params: HParams; reads ``n_classes`` (and ``learning_rate`` through
            ``get_train_op_fn``).

    Returns:
        A ``tf.estimator.EstimatorSpec`` with predictions under ``'result'``.
    """
    is_training = mode == ModeKeys.TRAIN

    net = SimpleNet()
    net.build(features, is_training=is_training)
    logits = net.logits
    predictions = net.predictions

    loss = None
    train_op = None
    if mode != ModeKeys.INFER:
        loss = fcn_loss.loss(logits, labels, params.n_classes)
    if mode == ModeKeys.TRAIN:
        train_op = get_train_op_fn(loss, params)

    # Class indices are divided by the class count so the image values stay below 1.
    suffix = str(is_training)
    tf.summary.image('INPUT' + suffix, features, max_outputs=64)
    tf.summary.image(
        'OUTPUT' + suffix,
        tf.expand_dims(tf.argmax(predictions, -1) / params.n_classes, -1),
        max_outputs=64)
    # No labels are available in inference mode, so skip the label summary there.
    if labels is not None:
        tf.summary.image(
            'LABELS' + suffix,
            tf.expand_dims(tf.argmax(labels, -1) / params.n_classes, -1),
            max_outputs=64)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={'result': predictions},
        loss=loss,
        train_op=train_op,
    )
def get_train_op_fn(loss, params):
    """Return the Adam training op for ``loss``.

    Args:
        loss: Scalar loss tensor to minimise.
        params: HParams; reads ``learning_rate``.
    """
    optimize_kwargs = dict(
        loss=loss,
        global_step=tf.train.get_global_step(),
        optimizer=tf.train.AdamOptimizer,
        learning_rate=params.learning_rate,
        name='optimize_loss',
        summaries=['loss'],
    )
    return tf.contrib.layers.optimize_loss(**optimize_kwargs)
# Run script ##############################################
if __name__ == "__main__":
    # run_experiment is the entry point defined in this file.
    tf.app.run(
        main=run_experiment
    )
and here is the architecture:
class SimpleNet:
    """Five stacked 5x5 'same' convolutions ending in 22-channel logits."""

    def __init__(self, vgg16_npy_path=None):
        # vgg16_npy_path is accepted but not used.
        pass

    def build(self, rgb, is_training=False, debug=False):
        """Create the layers on ``rgb`` and set conv_1..conv_4, logits, predictions.

        Args:
            rgb: Input tensor passed to the first convolution.
            is_training: When true every kernel uses a constant 0.1
                initializer; otherwise the layer default is used.
            debug: Not used by the body.
        """
        k_init = tf.constant_initializer(0.1) if is_training else None

        def conv(inputs, filters, layer_name, activation=tf.nn.elu):
            return tf.layers.conv2d(
                inputs, filters, (5, 5), activation=activation, padding='same',
                name=layer_name, kernel_initializer=k_init)

        self.conv_1 = conv(rgb, 5, 'conv1')
        self.conv_2 = conv(self.conv_1, 10, 'conv2')
        self.conv_3 = conv(self.conv_2, 15, 'conv3')
        self.conv_4 = conv(self.conv_3, 20, 'conv4')
        # Final layer is linear; softmax is applied separately below.
        self.logits = conv(self.conv_4, 22, 'logits', activation=None)
        with tf.name_scope('softmax'):
            self.predictions = tf.nn.softmax(self.logits)
If I set the is_training flag to False, then the evaluation loss seems to drop. Otherwise, it is completely flat. Any ideas on why this might be?
Related
I'm trying to use the autoencoder shown in the code below as a tool for dimensionality reduction,
and I was wondering how I can "extract" the hidden layer and use it for that purpose.
My original dataset underwent standard scaling.
Here I define a Dictionary to centralize the values
# Central place for the training hyper-parameters used below.
CONFIG = {
# Batch size for both data loaders.
'BATCH_SIZE' : 1024,
# Learning rate passed to Adam.
'LR' : 1e-4,
# Weight decay passed to Adam.
'WD' : 1e-8,
# Number of passes over the training loader.
'EPOCHS': 50
}
Here I convert the values of my train and test dataframes into tensors
# Float tensors built from the underlying values of the `test` and `train` objects.
t_test = torch.FloatTensor(test.values)
t_train = torch.FloatTensor(train.values)
Here I create data loaders
# Batched, shuffled loader over the test tensor.
loader_test = torch.utils.data.DataLoader(dataset = t_test,
batch_size = CONFIG['BATCH_SIZE'],
shuffle = True)
# Batched, shuffled loader over the training tensor.
loader_train = torch.utils.data.DataLoader(dataset = t_train,
batch_size = CONFIG['BATCH_SIZE'],
shuffle = True)
Here I create the class AutoEncoder (AE)
class AE(torch.nn.Module):
    """Fully connected autoencoder: 31 -> 16 -> 8 -> 4 -> 8 -> 16 -> 31."""

    def __init__(self):
        super().__init__()
        nn = torch.nn
        # Encoder compresses 31 features down to a 4-dimensional code.
        self.encoder = nn.Sequential(
            nn.Linear(31, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
        )
        # Decoder mirrors the encoder back up to 31 features.
        self.decoder = nn.Sequential(
            nn.Linear(4, 8),
            nn.ReLU(),
            nn.Linear(8, 16),
            nn.ReLU(),
            nn.Linear(16, 31),
        )

    def forward(self, x):
        """Return the reconstruction of ``x``."""
        return self.decoder(self.encoder(x))
Here I define the model, the loss function and the optimizer
model = AE()
# Mean squared error between the reconstruction and the input.
loss_function = torch.nn.MSELoss()
# Adam over all model parameters, configured from CONFIG.
optimizer = torch.optim.Adam(model.parameters(),
lr = CONFIG['LR'],
weight_decay = CONFIG['WD'])
Here I compute the algorithm
epochs = CONFIG['EPOCHS']
dict_list = []
# Training mode only needs to be set once, not on every batch.
model.train()
for epoch in range(epochs):
    for ix, batch in enumerate(loader_train):
        # The autoencoder is trained to reproduce its own input.
        reconstructed = model(batch)
        loss = loss_function(reconstructed, batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # .item() yields a plain Python float regardless of the tensor's device.
        dict_list.append({
            'Epoch': epoch,
            'Batch_N': ix,
            'Batch_L': batch.shape[0],
            'loss': loss.item(),
        })
# One row per processed batch.
df_learning_o = pd.DataFrame(dict_list)
You can simply return not just the decoded output, but also the encoded embedding layer, like this:
class AE(torch.nn.Module):
    """Autoencoder (31 -> 4 -> 31) whose forward pass also exposes the code."""

    def __init__(self):
        super().__init__()
        nn = torch.nn
        # 31 input features are compressed to a 4-dimensional embedding.
        self.encoder = nn.Sequential(
            nn.Linear(31, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
        )
        # The embedding is expanded back to 31 features.
        self.decoder = nn.Sequential(
            nn.Linear(4, 8),
            nn.ReLU(),
            nn.Linear(8, 16),
            nn.ReLU(),
            nn.Linear(16, 31),
        )

    def forward(self, x):
        """Return ``(encoded, decoded)`` for the batch ``x``."""
        code = self.encoder(x)
        reconstruction = self.decoder(code)
        return code, reconstruction
When you pass something to your model (in the train loop for example), you would have to change it to the following:
# forward() returns a pair: the 4-dimensional code first, then the reconstruction.
encoded, reconstructed = model(batch)
Now you can do whatever you'd like with the encoded embedding, i.e. the dimensionally reduced input.
I made a Tensorflow pipeline for loading numpy arrays (video data shape (40,160,160,3)). However, it stops working after loading the first x batches.
The problem is solved when removing num_parallel_calls=AUTOTUNE. However, if I do this, the training becomes significantly slower (ETA/epoch ~30 min -> ETA/epoch ~ 4 hours) . Is there a way to load the numpy arrays in parallel (or apply num_parallel_calls=AUTOTUNE) without any problems?
def get_label(file_path):
    """Return the second-to-last component of ``file_path``.

    The path is split on the OS path separator with ``tf.strings.split``.
    """
    import os
    components = tf.strings.split(file_path, os.path.sep)
    return components[-2]
def process_video(file_path):
    """Load one video array and derive its numeric label from the path.

    This function is wrapped in ``tf.numpy_function``, so it sticks to plain
    Python/NumPy. Calling TensorFlow ops from inside the wrapped function is a
    likely cause of the stall reported with ``num_parallel_calls`` — confirm
    on the real pipeline.

    Args:
        file_path: Path to a ``.npy`` file, as ``bytes`` or ``str``. The name
            of the containing directory must parse as a number.

    Returns:
        ``(video, label)``: the array scaled by 1/255 as float32, and the
        parent directory name converted to a float32 scalar.
    """
    import os

    path = os.fsdecode(file_path)
    label = os.path.basename(os.path.dirname(path))
    # NOTE(review): allow_pickle=True unpickles object arrays; only safe for trusted files.
    video = np.load(path, allow_pickle=True)
    return np.float32(video / 255), np.float32(label)
def set_shape(video, label):
    """Attach static shapes to both tensors and return them.

    ``video`` is declared as (40, 160, 160, 3) and ``label`` as a scalar.
    """
    video_shape = (40, 160, 160, 3)
    scalar_shape = []
    video.set_shape(video_shape)
    label.set_shape(scalar_shape)
    return video, label
## Data pipeline
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Dataset of file paths matching path/train/<dir>/<file>, listed in shuffled order.
train_ds = tf.data.Dataset.list_files("path/train/*/*",shuffle=True)
# process_video is plain Python, so it is wrapped in tf.numpy_function;
# both of its outputs are declared as float32.
train_ds = train_ds.map(lambda item: tf.numpy_function(
process_video, [item], (tf.float32, tf.float32)) ,num_parallel_calls=AUTOTUNE)
# Static shapes are attached in a separate map step.
train_ds = train_ds.map(set_shape)
train_ds = train_ds.batch(8)
train_ds = train_ds.prefetch(AUTOTUNE)
## Model
def create_LRCN_model():
    """Build the LRCN model: per-frame conv stages, then an LSTM and a sigmoid unit.

    Prints the model summary as a side effect.

    Returns:
        The constructed (uncompiled) ``Sequential`` model.
    """
    input_shape = (40, 160, 160, 3)
    # (conv filters, pooling window, dropout after pooling?) for each stage;
    # the last stage has no dropout.
    stages = [
        (64, (4, 4), True),
        (64, (4, 4), True),
        (64, (2, 2), True),
        (32, (2, 2), False),
    ]

    model = Sequential()
    for stage_index, (filters, pool_size, with_dropout) in enumerate(stages):
        conv = Conv2D(filters, (3, 3), padding='same', activation='relu')
        if stage_index == 0:
            # Only the first layer declares the input shape.
            model.add(TimeDistributed(conv, input_shape=input_shape))
        else:
            model.add(TimeDistributed(conv))
        model.add(TimeDistributed(MaxPooling2D(pool_size)))
        if with_dropout:
            model.add(TimeDistributed(Dropout(0.25)))

    model.add(TimeDistributed(Flatten()))
    model.add(LSTM(32))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()
    return model
LRCN_model = create_LRCN_model()
# Stop after 15 epochs without a lower val_loss and restore the best weights seen.
early_stopping_callback = EarlyStopping(monitor = 'val_loss', patience = 15, mode = 'min', restore_best_weights = True)
LRCN_model.compile(loss='binary_crossentropy', optimizer = 'Adam', metrics = ["accuracy"])
# NOTE(review): val_ds is not defined in the visible snippet.
LRCN_model_training_history = LRCN_model.fit(train_ds, validation_data= val_ds, epochs = 70, callbacks = [early_stopping_callback]) #class_weight= class_weights,
I have a data loader pipeline for video data. Although I specify the output of the pipeline, I still get the following error when calling model.fit: "ValueError: as_list() is not defined on an unknown TensorShape". I searched for the error, and most people say it occurs because tf.numpy_function returns a tensor whose shape is unknown to the TensorFlow pipeline. Specifying the shape after that function should solve the problem. However, it does not.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Get the list of numpy files in the directory.
train_ds = tf.data.Dataset.list_files("dir")
# Load each numpy file (a video with shape 40,160,160,3), look up the
# corresponding label, and output both the video and the label.
def get_label(file_path):
    """Return the second-to-last component of ``file_path``.

    The path is split on the OS path separator with ``tf.strings.split``.
    """
    components = tf.strings.split(file_path, os.path.sep)
    return components[-2]
def process_image(file_path):
    """Load one video array and derive its numeric label from the path.

    This function is wrapped in ``tf.numpy_function``, which expects NumPy
    values back, so it returns a float32 ndarray rather than a tf tensor and
    avoids TensorFlow ops in its body.

    Args:
        file_path: Path to a ``.npy`` file, as ``bytes`` or ``str``. The name
            of the containing directory must parse as a number.

    Returns:
        ``(video, label)``: the array scaled by 1/255 as float32, and the
        parent directory name converted to a float32 scalar.
    """
    import os

    path = os.fsdecode(file_path)
    label = os.path.basename(os.path.dirname(path))
    # NOTE(review): allow_pickle=True unpickles object arrays; only safe for trusted files.
    video = np.load(path, allow_pickle=True)
    return np.float32(video / 255), np.float32(label)
# process_image is plain Python, so it is wrapped in tf.numpy_function;
# both outputs are declared as float32.
train_ds = train_ds.map(lambda item: tf.numpy_function(
process_image, [item], (tf.float32, tf.float32)),num_parallel_calls = AUTOTUNE )
# Give both outputs of the numpy_function step a static shape.
def set_shape(video, label):
    """Return ``video`` and ``label`` with fully defined static shapes.

    Tensors coming out of ``tf.numpy_function`` carry no static shape. The
    video is reshaped to (40, 160, 160, 3) and the label to a scalar, so that
    downstream code never sees an unknown shape.
    """
    video = tf.reshape(video, (40, 160, 160, 3))
    # The label needs a known shape too, not just the video.
    label = tf.reshape(label, [])
    return video, label
train_ds = train_ds.map(set_shape)
# Batching
train_ds = train_ds.batch(batch_size =5)
# Optimization: prefetch upcoming elements
train_ds = train_ds.prefetch(AUTOTUNE)
# NOTE(review): the dataset returned by take(1) is not assigned, so train_ds is unchanged.
train_ds.take(1)
Although the rest of the code seems fine (it does work when I manually input data), I will paste it in case it is not.
def create_LRCN_model():
    """Construct the LRCN model and print its summary.

    Four per-frame convolution stages (applied through ``TimeDistributed``)
    are followed by an LSTM and a single sigmoid output unit.

    Returns:
        model: The constructed (uncompiled) ``Sequential`` model.
    """
    input_shape = (40, 160, 160, 3)
    # (conv filters, pooling window, dropout after pooling?) for each stage;
    # the last stage has no dropout.
    stages = [
        (128, (4, 4), True),
        (256, (4, 4), True),
        (128, (2, 2), True),
        (64, (2, 2), False),
    ]

    model = Sequential()
    for stage_index, (filters, pool_size, with_dropout) in enumerate(stages):
        conv = Conv2D(filters, (3, 3), padding='same', activation='relu')
        if stage_index == 0:
            # Only the first layer declares the input shape.
            model.add(TimeDistributed(conv, input_shape=input_shape))
        else:
            model.add(TimeDistributed(conv))
        model.add(TimeDistributed(MaxPooling2D(pool_size)))
        if with_dropout:
            model.add(TimeDistributed(Dropout(0.25)))

    model.add(TimeDistributed(Flatten()))
    model.add(LSTM(32))
    model.add(Dense(1, activation='sigmoid'))

    # Display the model's summary before handing it back.
    model.summary()
    return model
LRCN_model = create_LRCN_model()
# Stop after 15 epochs without a lower val_loss and restore the best weights seen.
early_stopping_callback = EarlyStopping(monitor = 'val_loss', patience = 15, mode = 'min', restore_best_weights = True)
LRCN_model.compile(loss='binary_crossentropy', optimizer = 'Adam', metrics = ["accuracy"])
# NOTE(review): val_ds is not defined in the visible snippet.
LRCN_model_training_history = LRCN_model.fit(train_ds, validation_data= val_ds, epochs = 70, callbacks = [early_stopping_callback])
Okay, I found another solution. I do not know exactly why it works, but simply mapping the following function over the dataset does the job.
def set_shape(video, label):
    """Attach static shapes to both tensors and return them.

    ``video`` is declared as (40, 160, 160, 3) and ``label`` as a scalar.
    """
    video_shape = (40, 160, 160, 3)
    scalar_shape = []
    video.set_shape(video_shape)
    label.set_shape(scalar_shape)
    return video, label
Got it! You just need to change "accuracy" to "binary_accuracy" in model compile. It worked for me with your code and some dummy video and label input data.
I built a GAN that predicts an output of shape (40,40,6) from two inputs of shapes [(40,40,4),(20,20,6)].
The model is actually working and already delivers results but I "only" get a GPU utilization between 60 and 70% (displayed by nvidia-smi).
My question is whether that is intrinsic to such a model, since it has to do work in between the calls to train_on_batch, or whether there is a way to speed this process up.
A minimalist working example on random data would look like:
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import UpSampling3D
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import Add
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Lambda
from tensorflow.keras.optimizers import Adam
# Turn on memory growth for every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs,
        # so the same setting is applied to each device.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(
            len(gpus), "Physical GPUs,",
            len(logical_gpus), "Logical GPUs",
        )
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized;
        # report the error and continue.
        print(e)
# =============================================================================
# define the model
# =============================================================================
def resBlock(X_in, num_of_features, kernel_size, scale):
    """Residual block: conv -> ReLU -> conv, scaled by ``scale`` and added to the input.

    Args:
        X_in: Input tensor; also the skip branch of the final addition.
        num_of_features: Filter count of both convolutions.
        kernel_size: Kernel size of both convolutions.
        scale: Constant multiplier applied to the residual branch.

    Returns:
        The tensor ``X_in + scale * residual``.
    """
    residual = Conv2D(num_of_features, kernel_size,
                      kernel_initializer='he_uniform', padding='same')(X_in)
    residual = Activation('relu')(residual)
    residual = Conv2D(num_of_features, kernel_size,
                      kernel_initializer='he_uniform', padding='same')(residual)
    # Scale the residual branch before the skip connection.
    residual = Lambda(lambda x: x * scale)(residual)
    return Add()([X_in, residual])
class Generator(object):
    """Factory for the residual generator network."""

    def __init__(self, noise_shape):
        # noise_shape is stored but not read by generator().
        self.noise_shape = noise_shape
        self.num_of_features = 128
        self.kernel_size = (3, 3)
        self.scale = 0.1
        self.padding = 8
        self.hp = int(self.padding / 2)  # half padding

    def generator(self):
        """Build and return the generator ``Model`` with inputs [input_A, input_B]."""
        size_a = 32 + self.padding
        size_b = 16 + self.hp

        # Input B is upsampled by 2 along both spatial axes.
        # NOTE(review): UpSampling3D is applied to an input declared with three
        # non-batch dimensions — confirm this is accepted by the TF version in use.
        inputs_channels_A = Input((size_a, size_a, 4), name='input_A')
        inputs_channels_B = Input((size_b, size_b, 6), name='input_B')
        inputs_channels_B_upsampled = UpSampling3D(size=(2, 2, 1))(inputs_channels_B)

        # Concatenate both inputs along the channel axis.
        merged_input = concatenate(
            [inputs_channels_A, inputs_channels_B_upsampled], axis=3)

        # First convolution.
        features = Conv2D(
            self.num_of_features,
            self.kernel_size,
            activation='relu',
            padding='same',
            kernel_initializer='he_normal',
        )(merged_input)

        # Six residual blocks.
        for _ in range(6):
            features = resBlock(
                features, self.num_of_features, self.kernel_size, self.scale)

        # Last convolution reduces the feature maps to 6 channels.
        features = Conv2D(
            6, (3, 3), kernel_initializer='he_uniform', padding='same')(features)

        # Final skip connection from the upsampled input B.
        output = Add()([features, inputs_channels_B_upsampled])

        return Model(
            inputs=[inputs_channels_A, inputs_channels_B], outputs=output)
def discriminator_block(model, filters, kernel_size, strides):
    """Apply Conv2D -> BatchNormalization -> LeakyReLU to ``model`` and return the result.

    Args:
        model: Input tensor.
        filters: Convolution filter count.
        kernel_size: Convolution kernel size.
        strides: Convolution strides.
    """
    conv_out = Conv2D(filters=filters, kernel_size=kernel_size,
                      strides=strides, padding="same")(model)
    normalized = BatchNormalization(momentum=0.5)(conv_out)
    return LeakyReLU(alpha=0.2)(normalized)
class Discriminator(object):
    """Factory for the convolutional discriminator network."""

    def __init__(self, image_shape):
        self.image_shape = image_shape

    def discriminator(self):
        """Build and return the discriminator ``Model`` (single sigmoid output)."""
        dis_input = Input(shape=(self.image_shape))

        # Stem: plain convolution without batch normalisation.
        net = Conv2D(filters=64, kernel_size=3, strides=1, padding="same")(dis_input)
        net = LeakyReLU(alpha=0.2)(net)

        # (filters, strides) for each conv/batch-norm/LeakyReLU block.
        block_specs = [
            (64, 2),
            (128, 1), (128, 2),
            (256, 1), (256, 2),
            (512, 1), (512, 2),
        ]
        for filters, strides in block_specs:
            net = discriminator_block(net, filters, 3, strides)

        # Dense head ending in a sigmoid unit.
        net = Flatten()(net)
        net = Dense(1024)(net)
        net = LeakyReLU(alpha=0.2)(net)
        net = Dense(1)(net)
        net = Activation('sigmoid')(net)

        return Model(inputs=dis_input, outputs=net)
def get_gan_network(discriminator, shape_list_AB, generator, optimizer, loss):
    """Chain generator and discriminator into one compiled model.

    Args:
        discriminator: Discriminator model; its ``trainable`` flag is set to
            False here.
        shape_list_AB: Shapes of the two generator inputs, in order [A, B].
        generator: Generator model taking ``[A, B]``.
        optimizer: Optimizer used to compile the combined model.
        loss: Loss for the generated-image output.

    Returns:
        A compiled ``Model`` with outputs ``[generated, discriminator_score]``.
    """
    discriminator.trainable = False

    shape_a, shape_b = shape_list_AB[0], shape_list_AB[1]
    gan_input_A = Input(shape=shape_a)
    gan_input_B = Input(shape=shape_b)

    generated = generator([gan_input_A, gan_input_B])
    score = discriminator(generated)

    gan = Model(inputs=[gan_input_A, gan_input_B], outputs=[generated, score])
    # The content loss carries weight 1, the adversarial loss 1e-3.
    gan.compile(
        loss=[loss, "binary_crossentropy"],
        loss_weights=[1., 1e-3],
        optimizer=optimizer)
    return gan
def get_optimizer():
    """Return an Adam optimizer with learning rate 1e-4.

    ``learning_rate`` is used instead of the deprecated ``lr`` keyword.
    """
    return Adam(learning_rate=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
# =============================================================================
# choose some parameters and compile the model
# =============================================================================
batch_size = 128
# Input and output shapes, without the batch dimension.
shape_input_A = (40,40,4)
shape_input_B = (20,20,6)
shape_output = (40,40,6)
generator = Generator(shape_input_B).generator() # todo shape
discriminator = Discriminator(shape_output).discriminator() # todo shape
# The same Adam instance is passed to all three compile() calls below.
optimizer = get_optimizer()
generator.compile(loss="mse", optimizer=optimizer)
discriminator.compile(loss="binary_crossentropy", optimizer=optimizer)
gan = get_gan_network(discriminator, [shape_input_A,shape_input_B], generator, optimizer, "mse")
# =============================================================================
# training
# =============================================================================
def get_random_data(mod):
    """Return one batch of uniform random data sized by the global ``batch_size``.

    Args:
        mod: ``0`` selects the network inputs; any other value selects the
            network output.

    Returns:
        For ``mod == 0`` a list ``[A, B]`` with shapes (batch, 40, 40, 4) and
        (batch, 20, 20, 6); otherwise one array of shape (batch, 40, 40, 6).
    """
    if mod != 0:
        # Network output.
        return np.random.rand(batch_size, 40, 40, 6)
    # Network inputs, drawn in order A then B.
    input_a = np.random.rand(batch_size, 40, 40, 4)
    input_b = np.random.rand(batch_size, 20, 20, 6)
    return [input_a, input_b]
# Initialize empty arrays that are refilled in place on every iteration.
# The builtin int replaces np.int, which newer NumPy releases no longer provide.
rand_nums = np.empty(batch_size, dtype=int)
image_batch_lr = np.empty((batch_size,) + shape_input_B)
image_batch_hr = np.empty((batch_size,) + shape_output)
generated_images_sr = np.empty_like(image_batch_hr)
real_data_Y = np.empty(batch_size)
fake_data_Y = np.empty(batch_size)

for e in range(1, 10):
    print("epoch:", e)
    for batchindex in range(200):
        generated_images_sr[:] = generator.predict(get_random_data(0))

        # Soft labels: real targets in (0.8, 1.0], fake targets in [0.0, 0.2).
        real_data_Y[:] = np.ones(batch_size) - np.random.random_sample(batch_size)*0.2
        fake_data_Y[:] = np.random.random_sample(batch_size)*0.2

        # Discriminator step on one real and one generated batch.
        discriminator.trainable = True
        d_loss_real = discriminator.train_on_batch(get_random_data(1), real_data_Y)
        d_loss_fake = discriminator.train_on_batch(generated_images_sr, fake_data_Y)
        discriminator_loss = 0.5 * np.add(d_loss_fake, d_loss_real)

        # Generator step through the combined model.
        gan_Y = np.ones(batch_size) - np.random.random_sample(batch_size)*0.2
        discriminator.trainable = False
        gan_loss = gan.train_on_batch(get_random_data(0), [get_random_data(1), gan_Y])

    # Losses of the last batch, reported once per epoch.
    print("discriminator_loss : %f" % discriminator_loss)
    print("gan_loss :", gan_loss)
I run this code on my GTX2080 within a docker container tensorflow/tensorflow:2.0.0-gpu-py3.
Training a GAN implies some overhead that will not be executed on the GPU. In your case, obtaining real_data_Y and fake_data_Y, executing get_random_data() and computing the loss will result in GPU idle time.
You can try profiling your program with python -mcProfile -o performance.prof xxx.py and see if there are bottlenecks that can be improved, but 60 to 70% already seems not too bad.
When I start the training on my tf.estimator.Estimator object,
Tensorflow automatically creates a CheckpointSaverHook whilst printing
INFO:tensorflow:Create CheckpointSaverHook.
This automatically created SaverHook will save my model at the very start and the end of the training.
What I want though is to create a checkpoint every n training steps. For this I created my own saving hook and passed it to my estimator when training.
# Custom hook meant to write a checkpoint every 100 steps.
# NOTE(review): the hook is constructed here, before model.train() is called.
saver_hook = tf.train.CheckpointSaverHook(
checkpoint_dir = model_dir,
save_steps = 100
)
model.train(input_fn,steps=1500,hooks=[saver_hook])
This works in theory but my own CheckpointSaverHook will just save *.meta files, while the automatically created one saves *.meta, *.index and *.data-XXXXX-of-XXXXX files.
How can I configure my own SaverHook to do that as well?
EDIT:
Added my whole network definition
network.py
import pickle
import random
import numpy as np
import tensorflow as tf
LEARNING_RATE = 0.002
class TFDotNet:
    """Convenience wrapper around a ``tf.estimator.Estimator`` built from ``model_fn``."""

    def __init__(self, model_dir):
        """Create the estimator.

        Args:
            model_dir: Directory for checkpoints, summaries and exports.
        """
        self.model_dir = model_dir
        # The checkpoint and summary cadence is set through RunConfig, so the
        # Estimator creates its own saver hooks inside the training graph.
        # Hooks built here would be created before that graph exists.
        run_config = tf.estimator.RunConfig(
            save_checkpoints_steps=100,
            save_summary_steps=50,
        )
        self.model = tf.estimator.Estimator(
            model_fn=model_fn, model_dir=model_dir, config=run_config)

    def train(self, x_train, y_train, steps=1500, batch_size=128):
        """Train the neural network.

        Args:
            x_train: Training inputs, fed under the feature key ``'x'``.
            y_train: Training labels.
            steps: Number of training steps.
            batch_size: Mini-batch size.
        """
        tf.logging.set_verbosity(tf.logging.INFO)
        # num_epochs=None repeats the data indefinitely; `steps` bounds training.
        input_fn = tf.estimator.inputs.numpy_input_fn(
            x={'x': x_train}, y=y_train, batch_size=batch_size,
            num_epochs=None, shuffle=True,
        )
        self.model.train(input_fn, steps=steps)

    def predict(self, x_predict):
        """Predict some inputs and return the predictions as a list."""
        input_fn = tf.estimator.inputs.numpy_input_fn(
            x={'x': x_predict}, y=None, batch_size=1, shuffle=False
        )
        return list(self.model.predict(input_fn))

    def evaluate(self, x_test, y_test):
        """Evaluate the network on the test set and return the estimator's result."""
        input_fn = tf.estimator.inputs.numpy_input_fn(
            x={'x': x_test}, y=y_test, batch_size=1, shuffle=False
        )
        return self.model.evaluate(input_fn)

    def load_dataset(self, dataset_path):
        """Load a dataset from a pickled data file.

        NOTE(review): ``pickle.load`` can execute arbitrary code; only use it
        on trusted files.
        """
        with open(dataset_path, 'rb') as f:
            return pickle.load(f)

    def split_dataset(self, dataset, ratio, random_state=42):
        """Shuffle a loaded dataset and split it into training and test arrays.

        Args:
            dataset: Sequence of ``(x, y)`` pairs; left unmodified.
            ratio: Fraction of the samples that goes into the test set.
            random_state: Seed for the shuffle.

        Returns:
            ``(x_train, y_train, x_test, y_test)`` as float32 arrays; the
            label arrays have shape ``(n, 1)``.
        """
        # Shuffle a copy with a private RNG, so that neither the caller's
        # list nor the global random state is touched.
        shuffled = list(dataset)
        random.Random(random_state).shuffle(shuffled)

        length = int(ratio * len(shuffled))
        test_data = shuffled[:length]
        training_data = shuffled[length:]

        def to_arrays(samples):
            # Stack the inputs as columns, then transpose to one row per sample.
            xs = np.hstack([x for (x, y) in samples]).transpose().astype('float32')
            ys = np.asarray([y for (x, y) in samples]).reshape(-1, 1).astype('float32')
            return xs, ys

        x_train, y_train = to_arrays(training_data)
        x_test, y_test = to_arrays(test_data)
        return x_train, y_train, x_test, y_test

    def export(self):
        """Export the conv net as a SavedModel under ``model_dir``."""
        def serving_input_receiver_fn():
            # The outer dimension (None) allows us to batch up inputs for
            # efficiency. However, it also means that if we want a prediction
            # for a single instance, we'll need to wrap it in an outer list.
            inputs = {"x": tf.placeholder(shape=[None, 900], dtype=tf.float32)}
            return tf.estimator.export.ServingInputReceiver(inputs, inputs)

        self.model.export_savedmodel(
            export_dir_base=self.model_dir,
            serving_input_receiver_fn=serving_input_receiver_fn)
def cnn_layout(features, reuse, is_training):
    """Build the CNN under the ``'cnn'`` variable scope and return its logits.

    Args:
        features: Dict whose ``'x'`` entry holds the flat input batch.
        reuse: Passed to ``tf.variable_scope`` to share variables between builds.
        is_training: Enables dropout when true.

    Returns:
        Logits tensor with a single output unit per example.
    """
    with tf.variable_scope('cnn', reuse=reuse):
        # Reshape the flat input to [batch, height, width, channel].
        images = tf.reshape(features['x'], shape=[-1, 30, 30, 1])

        # Conv block 1: 32 filters, kernel 5, then 2x2 max-pool with stride 2.
        net = tf.layers.conv2d(images, 32, 5, activation=tf.nn.relu, name='conv1')
        net = tf.layers.max_pooling2d(net, 2, 2, name='pool1')

        # Conv block 2: 64 filters, kernel 3, then 2x2 max-pool with stride 2.
        net = tf.layers.conv2d(net, 64, 3, activation=tf.nn.relu, name='conv2')
        net = tf.layers.max_pooling2d(net, 2, 2, name='pool2')

        # Dense head: flatten, 1024-unit layer, dropout (rate 0.75), one logit.
        net = tf.contrib.layers.flatten(net)
        net = tf.layers.dense(net, 1024, name='fc1')
        net = tf.layers.dropout(net, rate=0.75, training=is_training, name='dropout')
        return tf.layers.dense(net, 1, name='output_logits')
def model_fn(features, labels, mode):
    """Estimator model function for the CNN.

    Args:
        features: Dict with the input batch under ``'x'``.
        labels: Label batch; unused in PREDICT mode.
        mode: One of ``tf.estimator.ModeKeys``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the given mode.
    """
    # Two views of the same weights: one with dropout enabled, one without.
    logits_train = cnn_layout(features=features, reuse=False, is_training=True)
    logits_test = cnn_layout(features=features, reuse=True, is_training=False)

    # Predictions always come from the dropout-free network.
    probabilites = tf.sigmoid(logits_test, name='probabilities')
    predictions = tf.round(probabilites, name='predictions')
    export_outputs = tf.estimator.export.PredictOutput(
        outputs={'predictions': predictions, 'probabilities': probabilites})

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode, predictions=predictions,
            export_outputs={'outputs': export_outputs})

    # Dropout must only affect the training loss; evaluation uses the
    # dropout-free logits so the reported loss matches the predictions.
    if mode == tf.estimator.ModeKeys.TRAIN:
        loss_logits = logits_train
    else:
        loss_logits = logits_test
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(logits=loss_logits, labels=labels),
        name='loss')
    optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE, name='optimizer')
    train = optimizer.minimize(
        loss, global_step=tf.train.get_global_step(), name='train')

    # Accuracy for evaluation.
    accuracy = tf.metrics.accuracy(
        labels=labels, predictions=predictions, name='accuracy')

    # Summary for TensorBoard.
    tf.summary.scalar('loss', loss)

    # Spec used for both training and evaluation.
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train,
        eval_metric_ops={'accuracy': accuracy}
    )
training.py
from network import TFDotNet
from time import time
# Settings
training_steps = 10000
mini_batch_size = 128
model_dir = 'neuralnet_data/02_networks/network01'
dataset_path = 'neuralnet_data/01_datasets/dataset.data'
# Initialise the network wrapper
dotnet = TFDotNet(model_dir=model_dir)
# Load the pickled dataset
print('loading dataset ...')
dataset = dotnet.load_dataset(dataset_path)
# Split the dataset; ratio 0.1 is the test fraction
x_train, y_train, x_test, y_test = dotnet.split_dataset(dataset,0.1)
# Train the network and report the elapsed wall-clock time
print('starting training ...')
t0 = time()
dotnet.train(x_train,y_train,steps=training_steps,batch_size=mini_batch_size)
print('Training took {}s'.format(time()-t0))
The problem here is that, when no Saver is specified (either directly or by the scaffold), CheckpointSaverHook will create a new Saver in its constructor. If the __init__ is not run in the same Graph as your model, then it won't find any variables so nothing will be saved (https://github.com/tensorflow/tensorflow/issues/13265).
Assuming you are using the tf.estimator framework, then the Graph you want simply does not exist yet before the call to train.
You should be able to work around that by creating the saver inside your model_fn, and pass it as a hook to the EstimatorSpec.
Here is my code. It works fine; the complete code is on my GitHub.
# Wall-clock start, used to compute train_time at the end.
start_time = datetime.datetime.now()
# Checkpoint hook: save to the training directory every 100 steps.
saver_hook = tf.train.CheckpointSaverHook(
checkpoint_dir=FLAGS.train_dir,
save_steps=100,
)
# NOTE(review): sess_config is not defined in the visible snippet.
config = tf.estimator.RunConfig()
config = config.replace(session_config=sess_config)
per_example_hook = ExamplesPerSecondHook(FLAGS.train_batch_size, every_n_steps=100)
hooks = [per_example_hook,saver_hook]
classifier = tf.estimator.Estimator(
model_fn=model_fn_cnn,
model_dir= FLAGS.train_dir,
config=config,
)
# input_fn is bound to the "training" subset before being handed to train().
classifier.train(input_fn=functools.partial(input_fn,subset="training"),
steps=FLAGS.train_steps,
hooks=hooks
)
train_time = datetime.datetime.now() - start_time