I have a bunch of models floating around, I clone them, cross-validate them, do hyperparameter selection and what have you. As such, my keras global session can get quite mucked up. The solution per various threads is to call .clear_session(). However, this will throw away any models that I want to keep. One option is to train all of my models in a multiprocessing thread. However, it would be convenient to just instantiate a new session for each model as one might do with Tensorflow:
def score_model(**hyperparameters):
    with tf.Graph().as_default():
        my_model = build_model(**hyperparameters)
        with tf.Session() as sess:
            my_model.train(X, y)
            score = my_model.score()
    # now it's all gone, I have the score, so I don't need the model anymore
    # the rest of my_model should get garbage collected, hooray!
    return score
Can I do this sort of thing with keras?
UPDATE
The sess.as_default() method is crashing my kernel. My memory does not seem to be running low, and it gives no error whatsoever. In the following loop I can't even make it to i=2 before crashing.
from sklearn.datasets import load_iris
import numpy as np
import sklearn
import sklearn.preprocessing
import keras
import keras.wrappers.scikit_learn
import tensorflow as tf
import keras.models
import os
def sessioned(f):
    def sessioned_f(self, *args, **kwargs):
        if not hasattr(self, "sess"):
            self.sess = tf.Session()
        with self.sess.as_default():
            result = f(self, *args, **kwargs)
        return result
    return sessioned_f
class LogisticRegression(keras.wrappers.scikit_learn.KerasClassifier):
    def __init__(self, n_epochs=100, **kwargs):
        self.n_epochs = n_epochs
        super().__init__(**kwargs)

    @sessioned
    def fit(self, X, y, **kwargs):
        # get the shape of X and one-hot encode y
        self.input_shape = X.shape[-1]
        self.label_encoder = sklearn.preprocessing.LabelEncoder()
        self.label_encoder.fit(y)
        self.output_shape = len(self.label_encoder.classes_)
        label_encoded = self.label_encoder.transform(y).reshape((-1, 1))
        y_onehot = sklearn.preprocessing.OneHotEncoder().fit_transform(label_encoded).toarray()
        super().fit(X, y_onehot, epochs=self.n_epochs, verbose=1, **kwargs)
        return self

    @sessioned
    def predict_proba(self, X):
        return super().predict_proba(X)

    def check_params(self, params):
        # skip parameter validation
        pass

    @sessioned
    def __call__(self):  # the build_fn thing
        # create model
        model = keras.models.Sequential()
        model.add(keras.layers.Dense(self.output_shape, input_dim=self.input_shape,
                                     kernel_initializer="normal", activation="softmax"))
        # Compile model
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        return model
data = load_iris()

i = 0
while True:
    print(i)
    graph = tf.Graph()
    with graph.as_default():
        model = LogisticRegression()
        model.fit(data.data, data.target)
        model.sess.close()
        del model
    i += 1
    del graph
You can use Keras exactly as you described, except instead of running Tensorflow code inside the with statements you run the Keras code.
To set the session you would use
with sess.as_default():
Here is a link with more information:
https://blog.keras.io/keras-as-a-simplified-interface-to-tensorflow-tutorial.html
I have also found it helpful to look at the source code inside keras.backend. If you look at get_session() you can see that Keras first looks to see if there is a tensorflow default session. Otherwise it uses the session set to Keras using set_session(). Finally if no session has been set then it creates one.
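Putting this together with the pseudo-code in the question, a minimal sketch of the per-model graph/session pattern with (TF1-style) Keras might look like the following; build_model, X and y are assumed to exist as in the question:
import tensorflow as tf
from keras import backend as K

def score_model(**hyperparameters):
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session(graph=graph)
        K.set_session(sess)                        # point Keras at this session
        my_model = build_model(**hyperparameters)  # build_model as in the question
        my_model.train(X, y)
        score = my_model.score()
        sess.close()
    # the graph, session and model go out of scope and can be garbage collected
    return score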
Why is it recommended to save the state dicts and load them instead of saving stuff with dill for example and then just getting the usable objects immediately?
I think I've done that without many issues and it saves the user code.
But instead we are recommended to do something like:
def _load_model_and_optimizer_from_checkpoint(args: Namespace, training: bool = True) -> Namespace:
    """
    based from: https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html
    """
    import torch
    from torch import optim
    import torch.nn as nn

    # model = Net()
    args.model = nn.Linear(5, 1)  # placeholder dims for illustration
    # optimizer = optim.SGD(args.model.parameters(), lr=0.001, momentum=0.9)
    optimizer = optim.Adam(args.model.parameters(), lr=0.001)
    # scheduler...

    checkpoint = torch.load(args.PATH)
    args.model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    args.epoch_num = checkpoint['epoch_num']
    args.loss = checkpoint['loss']

    args.model.train() if training else args.model.eval()
    return args
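For completeness, the save side of this recommended pattern looks roughly like the following (a sketch following the linked PyTorch recipe; the variable names are illustrative):
import torch

# store only state dicts plus a few plain values, as in the recipe
torch.save({
    'epoch_num': epoch_num,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}, PATH)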
For example I've saved:
def save_for_meta_learning(args: Namespace, ckpt_filename: str = 'ckpt.pt'):
    if is_lead_worker(args.rank):
        import dill
        args.logger.save_current_plots_and_stats()

        # - ckpt
        assert uutils.xor(args.training_mode == 'epochs', args.training_mode == 'iterations')
        f: nn.Module = get_model_from_ddp(args.base_model)
        # pickle vs torch.save https://discuss.pytorch.org/t/advantages-disadvantages-of-using-pickle-module-to-save-models-vs-torch-save/79016
        args_pickable: Namespace = uutils.make_args_pickable(args)

        torch.save({'training_mode': args.training_mode,  # assert uutils.xor(args.training_mode == 'epochs', args.training_mode == 'iterations')
                    'it': args.it,
                    'epoch_num': args.epoch_num,
                    'args': args_pickable,  # some versions of this might not have args!
                    'meta_learner': args.meta_learner,
                    'meta_learner_str': str(args.meta_learner),  # added later, to make it easier to check what optimizer was used
                    'f': f,
                    'f_state_dict': f.state_dict(),  # added later, to make it easier to check what optimizer was used
                    'f_str': str(f),  # added later, to make it easier to check what optimizer was used
                    # 'f_modules': f._modules,
                    # 'f_modules_str': str(f._modules),
                    'outer_opt': args.outer_opt,  # added later, to make it easier to check what optimizer was used
                    'outer_opt_state_dict': args.outer_opt.state_dict(),  # added later, to make it easier to check what optimizer was used
                    'outer_opt_str': str(args.outer_opt)  # added later, to make it easier to check what optimizer was used
                    },
                   pickle_module=dill,
                   f=args.log_root / ckpt_filename)
then loaded:
def get_model_opt_meta_learner_to_resume_checkpoint_resnets_rfs(args: Namespace,
                                                                path2ckpt: str,
                                                                filename: str,
                                                                device: Optional[torch.device] = None
                                                                ) -> tuple[nn.Module, optim.Optimizer, MetaLearner]:
    """
    Get the model, optimizer, meta_learner to resume training from checkpoint.

    Examples:
        - see: _resume_from_checkpoint_meta_learning_for_resnets_rfs_test
    """
    import uutils
    path2ckpt: Path = Path(path2ckpt).expanduser() if isinstance(path2ckpt, str) else path2ckpt.expanduser()
    ckpt: dict = torch.load(path2ckpt / filename, map_location=torch.device('cpu'))

    # args_ckpt: Namespace = ckpt['args']
    training_mode = ckpt.get('training_mode')
    if training_mode is not None:
        assert uutils.xor(training_mode == 'epochs', training_mode == 'iterations')
        if training_mode == 'epochs':
            args.epoch_num = ckpt['epoch_num']
        else:
            args.it = ckpt['it']

    # - get meta-learner
    meta_learner: MetaLearner = ckpt['meta_learner']
    # - get model
    model: nn.Module = meta_learner.base_model
    # - get outer-opt
    outer_opt_str = ckpt.get('outer_opt_str')
    if outer_opt_str is not None:
        # use the string to create the optimizer, load the state dict, etc.
        outer_opt: optim.Optimizer = get_optimizer(outer_opt_str)
        outer_opt_state_dict: dict = ckpt['outer_opt_state_dict']
        outer_opt.load_state_dict(outer_opt_state_dict)
    else:
        # this is not ideal, but since Adam keeps an exponential moving average for its adaptive learning rate,
        # hopefully this doesn't hurt the checkpoint too much
        outer_opt: optim.Optimizer = optim.Adam(model.parameters(), lr=args.outer_lr)

    # - device setup
    if device is not None:
        # if torch.cuda.is_available():
        #     meta_learner.base_model = meta_learner.base_model.cuda()
        meta_learner.base_model = meta_learner.base_model.to(device)
    return model, outer_opt, meta_learner
without issues.
Related:
Save and load model optimizer state
pytorch save and load model
Save and load a Pytorch model
save and load unserialized pytorch pretrained model
https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html
Why is it not recommended to save the optimizer, model etc as pickable/dillable objs in PyTorch but instead get the state dicts and load them?
https://discuss.pytorch.org/t/why-is-it-not-recommended-to-save-the-optimizer-model-etc-as-pickable-dillable-objs-in-pytorch-but-instead-get-the-state-dicts-and-load-them/137933
I start 2 processes because I only have 2 GPUs, but it gives me Exception: process 0 terminated with signal SIGSEGV. The code does work with multiple CPUs (or at least no error is thrown), and it also works with a single GPU. It only fails when world_size > 1 and multiple CUDA devices/GPUs are present.
My error message is this:
(automl-meta-learning) miranda9~/ML4Coq $ python playground/multiprocessing_playground/ddp_hello_world.py
world_size=2
Traceback (most recent call last):
File "playground/multiprocessing_playground/ddp_hello_world.py", line 49, in <module>
main()
File "playground/multiprocessing_playground/ddp_hello_world.py", line 43, in main
mp.spawn(example,
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
while not context.join():
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 105, in join
raise Exception(
Exception: process 0 terminated with signal SIGSEGV
This is the code that gives the error:
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP


def example(rank, world_size):
    # create default process group
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    # create local model
    model = nn.Linear(10, 10).to(rank)
    # construct DDP model
    ddp_model = DDP(model, device_ids=[rank])
    # define loss function and optimizer
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    # forward pass
    outputs = ddp_model(torch.randn(20, 10).to(rank))
    labels = torch.randn(20, 10).to(rank)
    # backward pass
    loss_fn(outputs, labels).backward()
    # update parameters
    optimizer.step()


def main():
    # world_size = 2
    world_size = torch.cuda.device_count()
    mp.spawn(example,
             args=(world_size,),
             nprocs=world_size,
             join=True)


if __name__ == "__main__":
    main()
    print('Done\n\a')
[Optional] Larger self-contained example (gives same error)
Note, however, that this slightly more complete example (only missing a distributed dataloader) gives me the same issue:
"""
Based on: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html
Correctness of code: https://stackoverflow.com/questions/66226135/how-to-parallelize-a-training-loop-ever-samples-of-a-batch-when-cpu-is-only-avai
Note: as opposed to the multiprocessing (torch.multiprocessing) package, processes can use
different communication backends and are not restricted to being executed on the same machine.
"""
import time
from typing import Tuple
import torch
from torch import nn, optim
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import os
num_epochs = 5
batch_size = 8
Din, Dout = 10, 5
data_x = torch.randn(batch_size, Din)
data_y = torch.randn(batch_size, Dout)
data = [(i*data_x, i*data_y) for i in range(num_epochs)]
class PerDeviceModel(nn.Module):
"""
Toy example for a model ran in parallel but not distributed accross gpus
(only processes with their own gpu or hardware)
"""
def __init__(self):
super().__init__()
self.net1 = nn.Linear(Din, Din)
self.relu = nn.ReLU()
self.net2 = nn.Linear(Din, Dout)
def forward(self, x):
return self.net2(self.relu(self.net1(x)))
def setup_process(rank, world_size, backend='gloo'):
"""
Initialize the distributed environment (for each process).
gloo: is a collective communications library (https://github.com/facebookincubator/gloo). My understanding is that
it's a library/API for process to communicate/coordinate with each other/master. It's a backend library.
"""
# set up the master's ip address so this child process can coordinate
# os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '12355'
# - use NCCL if you are using gpus: https://pytorch.org/tutorials/intermediate/dist_tuto.html#communication-backends
if torch.cuda.is_available():
backend = 'nccl'
# Initializes the default distributed process group, and this will also initialize the distributed package.
dist.init_process_group(backend, rank=rank, world_size=world_size)
def cleanup():
""" Destroy a given process group, and deinitialize the distributed package """
dist.destroy_process_group()
def get_batch(batch: Tuple[torch.Tensor, torch.Tensor], rank):
x, y = batch
if torch.cuda.is_available():
x, y = x.to(rank), y.to(rank)
else:
x, y = x.share_memory_(), y.share_memory_()
return x, y
def get_ddp_model(model: nn.Module, rank):
"""
Moves the underlying storage to shared memory.
This is a no-op if the underlying storage is already in shared memory
and for CUDA tensors. Tensors in shared memory cannot be resized.
:return:
TODO: does this have to be done outside or inside the process? my guess is that it doesn't matter because
1) if its on gpu once it's on the right proc it moves it to cpu with id rank via mdl.to(rank)
2) if it's on cpu then mdl.share_memory() or data.share_memory() is a no op if it's already in shared memory o.w.
"""
# if gpu avail do the standard of creating a model and moving the model to the GPU with id rank
if torch.cuda.is_available():
# create model and move it to GPU with id rank
model = model.to(rank)
ddp_model = DDP(model, device_ids=[rank])
else:
# if we want multiple cpu just make sure the model is shared properly accross the cpus with shared_memory()
# note that op is a no op if it's already in shared_memory
model = model.share_memory()
ddp_model = DDP(model) # I think removing the devices ids should be fine...?
return ddp_model
# return OneDeviceModel().to(rank) if torch.cuda.is_available() else OneDeviceModel().share_memory()
def run_parallel_training_loop(rank, world_size):
"""
Distributed function to be implemented later.
This is the function that is actually ran in each distributed process.
Note: as DDP broadcasts model states from rank 0 process to all other processes in the DDP constructor,
you don’t need to worry about different DDP processes start from different model parameter initial values.
"""
setup_process(rank, world_size)
print()
print(f"Start running DDP with model parallel example on rank: {rank}.")
print(f'current process: {mp.current_process()}')
print(f'pid: {os.getpid()}')
# get ddp model
model = PerDeviceModel()
ddp_model = get_ddp_model(model, rank)
# do training
for batch_idx, batch in enumerate(data):
x, y = get_batch(batch, rank)
loss_fn = nn.MSELoss()
optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
optimizer.zero_grad()
outputs = ddp_model(x)
# Gradient synchronization communications take place during the backward pass and overlap with the backward computation.
loss_fn(outputs, y).backward() # When the backward() returns, param.grad already contains the synchronized gradient tensor.
optimizer.step() # TODO how does the optimizer know to do the gradient step only once?
print()
print(f"Start running DDP with model parallel example on rank: {rank}.")
print(f'current process: {mp.current_process()}')
print(f'pid: {os.getpid()}')
# Destroy a given process group, and deinitialize the distributed package
cleanup()
def main():
print()
print('running main()')
print(f'current process: {mp.current_process()}')
print(f'pid: {os.getpid()}')
# args
if torch.cuda.is_available():
world_size = torch.cuda.device_count()
else:
world_size = mp.cpu_count()
print(f'world_size={world_size}')
mp.spawn(run_parallel_training_loop, args=(world_size,), nprocs=world_size)
if __name__ == "__main__":
print('starting __main__')
start = time.time()
main()
print(f'execution length = {time.time() - start}')
print('Done!\a\n')
cross posted: https://discuss.pytorch.org/t/why-is-mp-spawn-spawning-4-processes-when-i-only-want-2/112299
I ran your "(minimal) code example" without any change and without any error on a server with 4 GPUs (Python version: 3.6.9, PyTorch version: 1.5.0+cu101).
Does the problem still exist when you run the minimal code example?
If so, and if you are on a linux machine, could you please run the following code instead, and tell me what output you get:
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP


def get_visible_gpus():
    ns = os.popen('nvidia-smi')
    lines_ns = ns.readlines()
    # print(lines_ns)
    for _i, _line in enumerate(lines_ns):
        if _line.find('|=') >= 0:
            break
    line_gpus = lines_ns[_i:]
    for _i, _line in enumerate(line_gpus):
        if _line.find('Processes') >= 0:
            break
    line_gpus = line_gpus[:_i - 3]
    # print(line_gpus)
    idx_gpu_lines = []
    for _i, _line in enumerate(line_gpus):
        if _line.find('+') >= 0:
            idx_gpu_lines.append(_i + 1)
    idx_gpus = []
    for _line_gpu in idx_gpu_lines:
        idx_gpus.append(int(line_gpus[_line_gpu].split()[1]))
    # print(idx_gpus)
    return idx_gpus


def example(rank, world_size):
    print('rank:{}'.format(rank))
    # create default process group
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    # create local model
    model = nn.Linear(10, 10).to(rank)
    # construct DDP model
    ddp_model = DDP(model, device_ids=[rank])
    # define loss function and optimizer
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    # forward pass
    outputs = ddp_model(torch.randn(20, 10).to(rank))
    labels = torch.randn(20, 10).to(rank)
    # backward pass
    loss_fn(outputs, labels).backward()
    # update parameters
    optimizer.step()


def main():
    # world_size = 2
    world_size = torch.cuda.device_count()
    print('world_size:{}'.format(world_size))
    print('get_visible_gpus():{}'.format(get_visible_gpus()))
    mp.spawn(example,
             args=(world_size,),
             nprocs=world_size,
             join=True)


if __name__ == "__main__":
    print(torch.__version__)
    main()
    print('Done\n\a')
In my case, I simply get:
1.5.0+cu101
world_size:4
get_visible_gpus():[0, 1, 2, 3]
rank:1
rank:3
rank:0
rank:2
Done
get_visible_gpus() simply parses the text output of the nvidia-smi shell command to get the IDs of the GPUs that CUDA can see.
NB: Please excuse me, I would have commented instead of "answering" (as I am not directly solving your problem, but asking for more details), but my reputation is not good enough T.T
Solution: increase shm-size
docker run -it \
--shm-size=64g
Reason:
If you run inside a Docker container, it's probably because the shm-size of the container is not large enough. By default, Docker containers are allocated 64 MB of shared memory. This shared memory is not a memory limit, but a /dev/shm temporary file storage filesystem that uses RAM to store files. It is used for IPC.
To check the shm size, enter the container and use df to view it.
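For example, a quick check from inside the container (a sketch assuming a typical Linux image where df is available):
df -h /dev/shm
If the Size column still shows 64M, the default allocation is in effect; raising it with --shm-size (or, alternatively, starting the container with --ipc=host) should remove the bottleneck.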
I use TensorFlow for regression using the following function:
import tensorflow as tf

def ff(*args, **kwargs):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=[inp_train.shape[-1], ]))
    for i in range(n_layer):
        model.add(tf.keras.layers.Dense(n_unit, activation=act))
    model.add(tf.keras.layers.Dense(out_train.shape[1]))
    model.compile(optimizer=opt, loss='mae')
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=100)
    check_point = tf.keras.callbacks.ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)
    model.fit(inp_train, out_train, epochs=n_epoch, batch_size=s_batch, validation_data=(inp_val, out_val), callbacks=[early_stop, check_point], verbose=0)
    best_model = tf.keras.models.load_model('best_model.h5')
    return model, best_model
As you can see, I save the best model via the check_point callback and use it later for prediction. The problem is that this way I have to save the best model to disk first and then load it from disk. If I want to do a couple of runs in parallel, it does not work, since each run creates a file with the same name.
So, how can I keep the best model in a variable without having to save it to disk?
NOTE: I fixed a bug and this is untested.
I had to do this for myself and thought I would share:
Callback:
class SaveBestModel(tf.keras.callbacks.Callback):
    def __init__(self, save_best_metric='val_loss', this_max=False):
        self.save_best_metric = save_best_metric
        self.max = this_max
        if this_max:
            self.best = float('-inf')
        else:
            self.best = float('inf')

    def on_epoch_end(self, epoch, logs=None):
        metric_value = logs[self.save_best_metric]
        if self.max:
            if metric_value > self.best:
                self.best = metric_value
                self.best_weights = self.model.get_weights()
        else:
            if metric_value < self.best:
                self.best = metric_value
                self.best_weights = self.model.get_weights()
usage:
save_best_model = SaveBestModel()
model.fit(data, callbacks=[save_best_model])

# set best weights
model.set_weights(save_best_model.best_weights)
Here is a basic example of creating a callback that saves the model at callback time to an external list. It has to be a list (or another type that allows in-place modification via a method). The base tf.keras.callbacks.Callback class is extended with an additional argument, the list, in the callback's __init__ method. This example shows that it works: when the callback fires on_train_end, it appends the current model to the list.
import tensorflow as tf
from tensorflow.python.keras.models import Model

# define a custom callback
class MyCustomCallback(tf.keras.callbacks.Callback):
    def __init__(self, external_list):
        self.list_obj = external_list

    def on_train_end(self, logs=None):
        self.list_obj.append(self.model)

# test the idea works
model_save_list = []
my_callback = MyCustomCallback(model_save_list)

model1 = Model()
my_callback.set_model(model1)
my_callback.on_train_end()
print(model_save_list)
Run this and you will see the internal model gets added to your list object:
[<tensorflow.python.keras.engine.training.Model object at 0x10d230b50>]
Modify your training by adding your new callback to the callbacks like so:
model.fit(inp_train, out_train, epochs=n_epoch, batch_size=s_batch, validation_data=(inp_val, out_val), callbacks=[early_stop, my_callback], verbose=0)
I'm training a neural network in Keras and logging the results via CSVLogger from keras.callbacks. Because my model.fit() call is enclosed inside a for loop over epochs (an adaptation for mini-batch training), I'm trying to write a wrapper class that blends the data-wrangling functionality of pandas with CSVLogger model logging. My goal is to achieve this using object-oriented methodology by writing a class in my utils.py that:
(before training) initializes a pandas data frame where the results for all epochs will be stored
(before training) initializes the CSV file given a path before fitting the model using the CSVLogger() method
(after each epoch) converts the CSV logger to a pandas data frame
(after each epoch) adds a column to the data frame that logs the current epoch
(after each epoch) appends the epoch-specific data frame to the all-epochs data frame
(after training) saves the all-epochs data frame to file
Here is my utils.py
import os
from keras.callbacks import CSVLogger
import pandas as pd

class logger(object):
    def __init__(self, data):
        self.log = None
        self.data = data  # a pandas data frame

    def load(self, in_path):
        self.log = CSVLogger(os.path.expanduser(in_path), append=True, separator=',')  # ideally I wouldn't have to write directly to a temp file

    def log_epoch(self, epoch):
        # convert to pandas dataframe
        self.log = pd.DataFrame(self.log)
        # rename epoch to iter
        self.log.rename(columns={'epoch': 'iter'})
        # add current epoch value to dataframe here
        self.log['epoch'] = [epoch for index in range(len(self.data))]

    def append(self):
        self.data = self.data.append(self.log, ignore_index=False)

    def save(self, out_path):
        self.data.to_csv(os.path.expanduser(out_path), index=False)
Here is my train.py
from utils import logger

log = logger.data(pd.DataFrame())

for epoch in range(len(num_epochs)):
    log_ep = logger.load(path_1)
    for X, y in data:
        model.fit(X, y, epochs=1, batch_size=batch_size, verbose=0, callbacks=[log_ep])
    log_ep = logger.log_ep(epoch)
    log = logger.append(log, log_ep)

logger.save(path_2)
When I run train.py I get this error: AttributeError: type object 'logger' has no attribute 'data'
How do I need to modify the class code to run the script correctly?
Do you use a debugger?
I would check the following:
pd.DataFrame() - check what value you get, as I didn't see the declaration.
In:
def __init__(self, data):
    self.log = None
    self.data = data  # a pandas data frame
I would change it to:
def __init__(self, data_in):
    self.log = None
    self.data = data_in  # a pandas data frame
Sometimes it causes issues.
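For reference, a minimal sketch of declaring the DataFrame and constructing the logger as an instance before calling its methods (an assumption about the intended usage; path_1 is the path from the question):
import pandas as pd
from utils import logger

log = logger(pd.DataFrame())  # pass the declared DataFrame to an instance
log.load(path_1)              # then call methods on the instance, not on the class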
I have a model trained on a single machine without using Estimator and I'm looking to serve the final trained model on Google cloud AI platform (ML engine). I exported the frozen graph as a SavedModel using SavedModelBuilder and deployed it on the AI platform. It works fine for small input images but for it to be able to accept large input images for online prediction, I need to change it to accept b64 encoded strings ({'image_bytes': {'b64': base64.b64encode(jpeg_data).decode()}}) which are converted to the required tensor by a serving_input_fn if using Estimators.
What options do I have if I am not using an Estimator? If I have a frozen graph or SavedModel being created from SavedModelBuilder, is there a way to have something similar to an estimator's serving_input_fn when exporting/ saving?
Here's the code I'm using for exporting:
import tensorflow as tf
from tensorflow.python.saved_model import signature_constants
from tensorflow.python.saved_model import tag_constants

export_dir = 'serving_model/'
graph_pb = 'model.pb'

builder = tf.saved_model.builder.SavedModelBuilder(export_dir)

with tf.gfile.GFile(graph_pb, "rb") as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

sigs = {}

with tf.Session(graph=tf.Graph()) as sess:
    # name="" is important to ensure we don't get spurious prefixing
    tf.import_graph_def(graph_def, name="")
    g = tf.get_default_graph()
    inp = g.get_tensor_by_name("image_bytes:0")
    out_f1 = g.get_tensor_by_name("feature_1:0")
    out_f2 = g.get_tensor_by_name("feature_2:0")

    sigs[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = \
        tf.saved_model.signature_def_utils.predict_signature_def(
            {"image_bytes": inp}, {"f1": out_f1, "f2": out_f2})

    builder.add_meta_graph_and_variables(sess,
                                         [tag_constants.SERVING],
                                         strip_default_attrs=True,
                                         signature_def_map=sigs)

builder.save()
Use a @tf.function to specify a serving signature. Here's an example that calls Keras:
class ExportModel(tf.keras.Model):
    def __init__(self, model):
        super().__init__()
        self.model = model

    @tf.function(input_signature=[
        tf.TensorSpec([None, ], dtype='int32', name='a'),
        tf.TensorSpec([None, ], dtype='int32', name='b')
    ])
    def serving_fn(self, a, b):
        return {
            'pred': self.model({'a': a, 'b': b})  # , steps=1)
        }

    def save(self, export_path):
        sigs = {
            'serving_default': self.serving_fn
        }
        tf.keras.backend.set_learning_phase(0)  # inference only
        tf.saved_model.save(self, export_path, signatures=sigs)

sm = ExportModel(model)
sm.save(EXPORT_PATH)
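For reference, a quick way to sanity-check the exported signature (a sketch; EXPORT_PATH and the input names a and b follow the example above):
import tensorflow as tf

loaded = tf.saved_model.load(EXPORT_PATH)
infer = loaded.signatures['serving_default']
print(infer(a=tf.constant([1], dtype=tf.int32),
            b=tf.constant([2], dtype=tf.int32)))  # returns a dict containing 'pred'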
First, load your already exported SavedModel with
import tensorflow as tf
loaded_model = tf.saved_model.load(MODEL_DIR)
Then, wrap it with a new Keras model that takes base64 input
class Base64WrapperModel(tf.keras.Model):
    def __init__(self, model):
        super(Base64WrapperModel, self).__init__()
        self.inner_model = model

    @tf.function
    def call(self, base64_input):
        str_input = tf.io.decode_base64(base64_input)
        return self.inner_model(str_input)

wrapper_model = Base64WrapperModel(loaded_model)
Finally, save your wrapped model with the Keras API:
wrapper_model.save(EXPORT_DIR)
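If you want the exported signature to advertise an explicit string input, one possible variant is to pass a concrete function when saving (a sketch; the input name 'image_bytes' is an assumption for illustration):
tf.saved_model.save(
    wrapper_model, EXPORT_DIR,
    signatures=wrapper_model.call.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string, name='image_bytes')))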