I'm developing a simple REST controller using gunicorn and Flask.
On each REST call, I execute the following code:
@app.route('/objects', methods=['GET'])
def get_objects():
    video_title = request.args.get('video_title')
    video_path = "../../video/" + video_title
    cl.logger.info(video_path)
    start = request.args.get('start')
    stop = request.args.get('stop')
    scene = [start, stop]
    frames = images_utils.extract_frames(video_path, scene[0], scene[1], 1)
    cl.logger.info(scene[0] + " " + scene[1])
    objects = list()
    # detect objects
    model = GenericDetector('../resources/open_images/frozen_inference_graph.pb', '../resources/open_images/labels.txt')
    model.run(frames)
    for result in model.get_boxes_and_labels():
        if result is not None:
            objects.append(result)
    data = {'message': {
        'start_time': scene[0],
        'end_time': scene[1],
        'path': video_path,
        'objects': objects,
    }, 'metadata_type': 'detection'}
    return jsonify({'status': data}), 200
This code runs a TensorFlow frozen model as follows:
class GenericDetector(Process):

    def __init__(self, model, labels):
        # Load a (frozen) TensorFlow model into memory.
        self.detection_graph = tf.Graph()
        with self.detection_graph.as_default():
            od_graph_def = tf.GraphDef()
            with tf.gfile.GFile(model, 'rb') as fid:
                serialized_graph = fid.read()
                od_graph_def.ParseFromString(serialized_graph)
                tf.import_graph_def(od_graph_def, name='')
        self.boxes_and_labels = []

        # Load the label map
        with open(labels) as f:
            txt_labels = f.read()
            self.labels = json.loads(txt_labels)

    def run(self, frames):
        tf.reset_default_graph()
        with self.detection_graph.as_default():
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            with tf.Session(graph=self.detection_graph, config=config) as sess:
                image_tensor = self.detection_graph.get_tensor_by_name('image_tensor:0')
                # Each box represents a part of the image where a particular object was detected.
                detection_boxes = self.detection_graph.get_tensor_by_name('detection_boxes:0')
                # Each score represents the level of confidence for each of the objects.
                detection_scores = self.detection_graph.get_tensor_by_name('detection_scores:0')
                detection_classes = self.detection_graph.get_tensor_by_name('detection_classes:0')
                num_detections = self.detection_graph.get_tensor_by_name('num_detections:0')

                i = 0
                for frame in frames:
                    # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
                    image_np_expanded = np.expand_dims(frame, axis=0)
                    # Actual detection.
                    (boxes, scores, classes, num) = sess.run(
                        [detection_boxes, detection_scores, detection_classes, num_detections],
                        feed_dict={image_tensor: image_np_expanded})

                    boxes = np.squeeze(boxes)
                    classes = np.squeeze(classes).astype(np.int32)
                    scores = np.squeeze(scores)

                    for j, box in enumerate(boxes):
                        if all(v == 0 for v in box):
                            continue
                        self.boxes_and_labels.append(
                            {
                                "ymin": str(box[0]),
                                "xmin": str(box[1]),
                                "ymax": str(box[2]),
                                "xmax": str(box[3]),
                                "label": self.labels[str(classes[j])],
                                "score": str(scores[j]),
                                "frame": i
                            })
                    i += 1
                sess.close()

    def get_boxes_and_labels(self):
        return self.boxes_and_labels
Everything seems to work as expected, but once I send a second request to my server, my GPU (a GTX 1050) runs out of memory:
ResourceExhaustedError (see above for traceback): OOM when allocating
tensor of shape [3,3,256,256] and type float
If I try to make a call after that, it works most of the time; sometimes it will work on subsequent calls too. I tried executing the GenericDetector in a separate Process (making GenericDetector inherit from Process), but it did not help. I read that once the process that executes the REST GET is dead, the GPU memory should be freed, so I also tried adding a sleep(30) after the execution of the TensorFlow model, with no luck. What am I doing wrong?
The thing is that TensorFlow allocates GPU memory to the process, not to the Session, so closing the session is not enough (even if you set the allow_growth option).
The first is the allow_growth option, which attempts to allocate only as much GPU memory based on runtime allocations: it starts out allocating very little memory, and as Sessions get run and more GPU memory is needed, we extend the GPU memory region needed by the TensorFlow process. Note that we do not release memory, since that can lead to even worse memory fragmentation.
There is an issue on the TensorFlow GitHub with some solutions; for example, you could decorate your run method with the RunAsCUDASubprocess helper proposed in that thread.
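For illustration, here is a minimal sketch of that subprocess idea (not the actual RunAsCUDASubprocess decorator from the thread): run the detection in a short-lived child process so that all of its GPU memory is released when the child exits. GenericDetector and the model paths come from the question; the helper names are mine.

import multiprocessing as mp

def _detect_in_subprocess(frames, queue):
    # All TensorFlow / CUDA work happens inside this child process,
    # so its GPU allocations are released when the process exits.
    model = GenericDetector('../resources/open_images/frozen_inference_graph.pb',
                            '../resources/open_images/labels.txt')
    model.run(frames)
    queue.put(model.get_boxes_and_labels())

def detect_objects(frames):
    # 'spawn' gives the child a clean interpreter with no inherited CUDA state.
    ctx = mp.get_context('spawn')
    queue = ctx.Queue()
    worker = ctx.Process(target=_detect_in_subprocess, args=(frames, queue))
    worker.start()
    results = queue.get()  # read before join() so a large result cannot block the child
    worker.join()
    return results

The Flask handler would then call objects = detect_objects(frames) instead of constructing GenericDetector directly.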
This error means that you are trying to fit into the GPU something bigger than the memory you have available. Maybe you can reduce the number of parameters somewhere in your model in order for it to be lighter?
Related
I start 2 processes because I only have 2 GPUs, but then it gives me Exception: process 0 terminated with signal SIGSEGV. This code does work with multiple CPUs (or at least no error is thrown), and it also works with a single GPU. It fails only when world_size > 1 and multiple CUDA GPUs are present.
My error message is this:
(automl-meta-learning) miranda9~/ML4Coq $ python playground/multiprocessing_playground/ddp_hello_world.py
world_size=2
Traceback (most recent call last):
File "playground/multiprocessing_playground/ddp_hello_world.py", line 49, in <module>
main()
File "playground/multiprocessing_playground/ddp_hello_world.py", line 43, in main
mp.spawn(example,
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
while not context.join():
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 105, in join
raise Exception(
Exception: process 0 terminated with signal SIGSEGV
This is the code that gives the error:
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP

def example(rank, world_size):
    # create default process group
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    # create local model
    model = nn.Linear(10, 10).to(rank)
    # construct DDP model
    ddp_model = DDP(model, device_ids=[rank])
    # define loss function and optimizer
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    # forward pass
    outputs = ddp_model(torch.randn(20, 10).to(rank))
    labels = torch.randn(20, 10).to(rank)
    # backward pass
    loss_fn(outputs, labels).backward()
    # update parameters
    optimizer.step()

def main():
    # world_size = 2
    world_size = torch.cuda.device_count()
    mp.spawn(example,
             args=(world_size,),
             nprocs=world_size,
             join=True)

if __name__ == "__main__":
    main()
    print('Done\n\a')
[Optional] Larger self-contained example (gives same error)
Note, however, that this slightly more complete example (only missing a distributed dataloader) gives me the same issue:
"""
Based on: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html
Correctness of code: https://stackoverflow.com/questions/66226135/how-to-parallelize-a-training-loop-ever-samples-of-a-batch-when-cpu-is-only-avai
Note: as opposed to the multiprocessing (torch.multiprocessing) package, processes can use
different communication backends and are not restricted to being executed on the same machine.
"""
import time
from typing import Tuple
import torch
from torch import nn, optim
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import os
num_epochs = 5
batch_size = 8
Din, Dout = 10, 5
data_x = torch.randn(batch_size, Din)
data_y = torch.randn(batch_size, Dout)
data = [(i*data_x, i*data_y) for i in range(num_epochs)]
class PerDeviceModel(nn.Module):
"""
Toy example for a model ran in parallel but not distributed accross gpus
(only processes with their own gpu or hardware)
"""
def __init__(self):
super().__init__()
self.net1 = nn.Linear(Din, Din)
self.relu = nn.ReLU()
self.net2 = nn.Linear(Din, Dout)
def forward(self, x):
return self.net2(self.relu(self.net1(x)))
def setup_process(rank, world_size, backend='gloo'):
"""
Initialize the distributed environment (for each process).
gloo: is a collective communications library (https://github.com/facebookincubator/gloo). My understanding is that
it's a library/API for process to communicate/coordinate with each other/master. It's a backend library.
"""
# set up the master's ip address so this child process can coordinate
# os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '12355'
# - use NCCL if you are using gpus: https://pytorch.org/tutorials/intermediate/dist_tuto.html#communication-backends
if torch.cuda.is_available():
backend = 'nccl'
# Initializes the default distributed process group, and this will also initialize the distributed package.
dist.init_process_group(backend, rank=rank, world_size=world_size)
def cleanup():
""" Destroy a given process group, and deinitialize the distributed package """
dist.destroy_process_group()
def get_batch(batch: Tuple[torch.Tensor, torch.Tensor], rank):
x, y = batch
if torch.cuda.is_available():
x, y = x.to(rank), y.to(rank)
else:
x, y = x.share_memory_(), y.share_memory_()
return x, y
def get_ddp_model(model: nn.Module, rank):
"""
Moves the underlying storage to shared memory.
This is a no-op if the underlying storage is already in shared memory
and for CUDA tensors. Tensors in shared memory cannot be resized.
:return:
TODO: does this have to be done outside or inside the process? my guess is that it doesn't matter because
1) if its on gpu once it's on the right proc it moves it to cpu with id rank via mdl.to(rank)
2) if it's on cpu then mdl.share_memory() or data.share_memory() is a no op if it's already in shared memory o.w.
"""
# if gpu avail do the standard of creating a model and moving the model to the GPU with id rank
if torch.cuda.is_available():
# create model and move it to GPU with id rank
model = model.to(rank)
ddp_model = DDP(model, device_ids=[rank])
else:
# if we want multiple cpu just make sure the model is shared properly accross the cpus with shared_memory()
# note that op is a no op if it's already in shared_memory
model = model.share_memory()
ddp_model = DDP(model) # I think removing the devices ids should be fine...?
return ddp_model
# return OneDeviceModel().to(rank) if torch.cuda.is_available() else OneDeviceModel().share_memory()
def run_parallel_training_loop(rank, world_size):
"""
Distributed function to be implemented later.
This is the function that is actually ran in each distributed process.
Note: as DDP broadcasts model states from rank 0 process to all other processes in the DDP constructor,
you don’t need to worry about different DDP processes start from different model parameter initial values.
"""
setup_process(rank, world_size)
print()
print(f"Start running DDP with model parallel example on rank: {rank}.")
print(f'current process: {mp.current_process()}')
print(f'pid: {os.getpid()}')
# get ddp model
model = PerDeviceModel()
ddp_model = get_ddp_model(model, rank)
# do training
for batch_idx, batch in enumerate(data):
x, y = get_batch(batch, rank)
loss_fn = nn.MSELoss()
optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
optimizer.zero_grad()
outputs = ddp_model(x)
# Gradient synchronization communications take place during the backward pass and overlap with the backward computation.
loss_fn(outputs, y).backward() # When the backward() returns, param.grad already contains the synchronized gradient tensor.
optimizer.step() # TODO how does the optimizer know to do the gradient step only once?
print()
print(f"Start running DDP with model parallel example on rank: {rank}.")
print(f'current process: {mp.current_process()}')
print(f'pid: {os.getpid()}')
# Destroy a given process group, and deinitialize the distributed package
cleanup()
def main():
print()
print('running main()')
print(f'current process: {mp.current_process()}')
print(f'pid: {os.getpid()}')
# args
if torch.cuda.is_available():
world_size = torch.cuda.device_count()
else:
world_size = mp.cpu_count()
print(f'world_size={world_size}')
mp.spawn(run_parallel_training_loop, args=(world_size,), nprocs=world_size)
if __name__ == "__main__":
print('starting __main__')
start = time.time()
main()
print(f'execution length = {time.time() - start}')
print('Done!\a\n')
cross posted: https://discuss.pytorch.org/t/why-is-mp-spawn-spawning-4-processes-when-i-only-want-2/112299
I ran your "(minimal) code example" without any change and without any error on a server with 4 GPUs (Python version: 3.6.9, PyTorch version: 1.5.0+cu101).
Does the problem still exist when you run the minimal code example?
If so, and if you are on a Linux machine, could you please run the following code instead and tell me what output you get:
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP

def get_visible_gpus():
    ns = os.popen('nvidia-smi')
    lines_ns = ns.readlines()
    # print(lines_ns)
    for _i, _line in enumerate(lines_ns):
        if _line.find('|=') >= 0:
            break
    line_gpus = lines_ns[_i:]
    for _i, _line in enumerate(line_gpus):
        if _line.find('Processes') >= 0:
            break
    line_gpus = line_gpus[:_i-3]
    # print(line_gpus)
    idx_gpu_lines = []
    for _i, _line in enumerate(line_gpus):
        if _line.find('+') >= 0:
            idx_gpu_lines.append(_i+1)
    idx_gpus = []
    for _line_gpu in idx_gpu_lines:
        idx_gpus.append(int(line_gpus[_line_gpu].split()[1]))
    # print(idx_gpus)
    return idx_gpus

def example(rank, world_size):
    print('rank:{}'.format(rank))
    # create default process group
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    # create local model
    model = nn.Linear(10, 10).to(rank)
    # construct DDP model
    ddp_model = DDP(model, device_ids=[rank])
    # define loss function and optimizer
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    # forward pass
    outputs = ddp_model(torch.randn(20, 10).to(rank))
    labels = torch.randn(20, 10).to(rank)
    # backward pass
    loss_fn(outputs, labels).backward()
    # update parameters
    optimizer.step()

def main():
    # world_size = 2
    world_size = torch.cuda.device_count()
    print('world_size:{}'.format(world_size))
    print('get_visible_gpus():{}'.format(get_visible_gpus()))
    mp.spawn(example,
             args=(world_size,),
             nprocs=world_size,
             join=True)

if __name__ == "__main__":
    print(torch.__version__)
    main()
    print('Done\n\a')
In my case, I simply get:
1.5.0+cu101
world_size:4
get_visible_gpus():[0, 1, 2, 3]
rank:1
rank:3
rank:0
rank:2
Done
get_visible_gpus() simply parses the text output of the nvidia-smi shell command to get the IDs of the GPUs that CUDA can see.
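For comparison, here is a hedged alternative that gets roughly the same information from the torch API itself instead of parsing nvidia-smi output (it only lists the devices CUDA exposes to the current process):

import torch

def get_visible_gpus_torch():
    # Device indices and names visible to this process; unlike raw nvidia-smi
    # output, this respects CUDA_VISIBLE_DEVICES.
    return [(i, torch.cuda.get_device_name(i)) for i in range(torch.cuda.device_count())]

print(get_visible_gpus_torch())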
NB: Please excuse me, I would have commented instead of "answering" (as I am not directly solving your problem, but asking for more details), but my reputation is not good enough. T.T
Solution: increase shm-size
docker run -it \
--shm-size=64g
Reason:
If you run in a Docker container, it's probably because the shm-size of Docker is not large enough. By default, Docker containers are allocated 64 MB of shared memory. This shared memory is not a memory limit, but a /dev/shm temporary file storage filesystem that uses RAM to store files. It is used for IPC.
To check the shm size, enter the container and use df (for example, df -h /dev/shm) to view it.
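If you prefer to check from inside the Python process itself (my addition, not part of the original answer), here is a minimal sketch using only the standard library:

import shutil

# /dev/shm is the shared-memory filesystem whose size --shm-size controls;
# torch.multiprocessing and DataLoader workers use it for IPC.
total, used, free = shutil.disk_usage('/dev/shm')
print('/dev/shm: total {:.1f} GiB, free {:.1f} GiB'.format(total / 2**30, free / 2**30))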
Hi everyone, I need some help.
I am trying to code ResNet-101 ImageNet classification using TensorFlow, without using Estimator. I am doing it to study deep learning and understand how to use TensorFlow.
My problem is that MonitoredTrainingSession does not initialize my iterator.
I have read some articles about the problem and tried to use a hook to handle it, but it fails and I have no idea why.
After I create the MonitoredTrainingSession, it first initializes the train iterator and gets an OutOfRange exception; then the validation step is performed.
It seems fine so far, but after finishing the validation and trying to run the training step again, I get an error related to iterator.get_next().
It says I did not initialize the iterator, but my hook clearly calls
session.run(self._initializer, feed_dict={filenames: self._filenames})
I'm sure of this because I can see the message below, which I print to check whether it is initialized or not.
iter_val.initializer after_create_session is called 0 times
What am I doing wrong?
The running flow is like below:
run train step: fine (epoch = 0)
run validation step: fine (epoch = 0)
run train step: error (epoch = 1)
Please ignore Horovod (hvd) in the code, as I am not using it right now.
Here is my code; please help me fix it and let me know what's wrong.
class _DatasetInitializerHook(tf.train.SessionRunHook):
    def __init__(self, initializer, filenames=[], name=""):
        self._initializer = initializer
        self._filenames = filenames
        self._name = name
        self._cnt = 0
        self._before_runCnt = 0

    def begin(self):
        pass

    def after_create_session(self, session, coord):
        del coord
        if len(self._filenames) == 0:
            session.run(self._initializer)
        else:
            session.run(self._initializer, feed_dict={filenames: self._filenames})
        print(self._name, "after_create_session is called {} times".format(self._cnt))
        self._cnt += 1

if __name__ == "__main__":
    if len(sys.argv) > 1:
        nlogs = sys.argv[1]
    else:
        nlogs = 0
    hvd.init()

    b_imagenet = False
    if b_imagenet:
        training_filenames = ['/data/tfrecords/imagenet2012_train_shard{}.tfrecord'.format(i) for i in range(129)]
    else:
        training_filenames = ['/data/cifar-10-tfrecords/train_shard{}.tfrecord'.format(i) for i in range(1, 2, 1)]

    filenames = tf.placeholder(tf.string, shape=[None])
    trainData = dataset_input_fn(is_training=True, filename=filenames, nworkers=hvd.size(), workeridx=hvd.rank(),
                                 batch_size=FLAGS.batchSize, prefetch_size=FLAGS.prefetch_buffer_size, repeat=1,
                                 shuffle_buffer_size=FLAGS.shuffle_buffer_size)
    valData = dataset_input_fn(is_training=False, filename=FLAGS.validationfile, nworkers=hvd.size(), workeridx=hvd.rank(),
                               batch_size=1, prefetch_size=FLAGS.prefetch_buffer_size, repeat=1, shuffle_buffer_size=1)

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    for i in tqdm(range(FLAGS.nepoch)):
        shuffle(training_filenames)
        model = model_class(nCls=FLAGS.nClasses, img_width=FLAGS.width, img_height=FLAGS.height,
                            learning_rate=FLAGS.learning_rate, weight_decay=FLAGS.weight_decay)

        iter_train = trainData.make_initializable_iterator()
        train_op = model.build_model(iter_train.get_next(), is_trainig=True, hvd=None)
        train_hooks = [hvd.BroadcastGlobalVariablesHook(0),
                       _DatasetInitializerHook(iter_train.initializer, training_filenames, "iter_train.initializer")]
        with tf.train.MonitoredTrainingSession(checkpoint_dir="./tmp/train_logs", config=config, hooks=train_hooks,
                                               save_checkpoint_secs=30) as sess:
            try:
                while True:
                    opt = sess.run([train_op])
            except tf.errors.OutOfRangeError:
                pass

        iter_val = valData.make_initializable_iterator()
        prediction_result = model.build_model(iter_val.get_next(), is_trainig=False, hvd=None)
        validation_hooks = [hvd.BroadcastGlobalVariablesHook(0),
                            _DatasetInitializerHook(iter_val.initializer, [], "iter_val.initializer")]
        with tf.train.MonitoredTrainingSession(checkpoint_dir="./tmp/train_logs", config=config, hooks=validation_hooks) as sess:
            try:
                while True:
                    result = sess.run([prediction_result])
            except tf.errors.OutOfRangeError:
                pass
This is the error message I got.
tensorflow.python.framework.errors_impl.FailedPreconditionError: GetNext() failed because the iterator has not been initialized. Ensure that you have run the initializer operation for this iterator before getting the next element.
[[node IteratorGetNext (defined at workspace/multi_gpu/main.py:128) ]]
Errors may have originated from an input operation.
Input Source operations connected to node IteratorGetNext:
IteratorV2_2 (defined at workspace/multi_gpu/main.py:126)
Try putting your initializer into a scaffold:
scaffold = tf.train.Scaffold(local_init_op=train_init_operator)
and give it to the MonitoredTrainingSession with:
with tf.train.MonitoredTrainingSession(scaffold=scaffold, ...
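For concreteness, here is a minimal sketch of how that could look for the training part of the code above, reusing the question's dataset_input_fn, model, FLAGS, and config. I bake the filenames directly into the dataset so the initializer needs no feed_dict, and I group the local-variable initializer back in because passing local_init_op replaces the default one; treat this as a sketch, not a drop-in replacement.

trainData = dataset_input_fn(is_training=True, filename=training_filenames,
                             nworkers=hvd.size(), workeridx=hvd.rank(),
                             batch_size=FLAGS.batchSize,
                             prefetch_size=FLAGS.prefetch_buffer_size, repeat=1,
                             shuffle_buffer_size=FLAGS.shuffle_buffer_size)
iter_train = trainData.make_initializable_iterator()
train_op = model.build_model(iter_train.get_next(), is_trainig=True, hvd=None)

# The iterator initializer runs as part of the session's local init step,
# so no _DatasetInitializerHook is needed.
scaffold = tf.train.Scaffold(local_init_op=tf.group(tf.local_variables_initializer(),
                                                    iter_train.initializer))
with tf.train.MonitoredTrainingSession(checkpoint_dir="./tmp/train_logs",
                                       scaffold=scaffold,
                                       config=config,
                                       save_checkpoint_secs=30) as sess:
    try:
        while True:
            sess.run([train_op])
    except tf.errors.OutOfRangeError:
        pass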
The situation:
I've already created several models, each trained over several days, that are ready to move from local testing to a serving environment.
The models were saved using the function:
def save_graph_to_file(sess, graph, graph_file_name):
    """Saves a graph to file, creating a valid quantized one if necessary."""
    output_graph_def = graph_util.convert_variables_to_constants(sess, graph.as_graph_def(), [final_tensor_name])
    with gfile.FastGFile(graph_file_name, 'wb') as f:
        f.write(output_graph_def.SerializeToString())
Now, when attempting to deploy to a serving environment (SageMaker, using the correct directory structure and file naming convention), the system returns:
2019-06-04 22:38:53.794056: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:54] Reading meta graph with tags { serve }
2019-06-04 22:38:53.798096: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:259] SavedModel load for tags { serve }; Status: fail. Took 83297 microseconds.
2019-06-04 22:38:53.798132: E tensorflow_serving/util/retrier.cc:37] Loading servable: {name: model version: 1} failed: Not found: Could not find meta graph def matching supplied tags: { serve }. To inspect available tag-sets in the SavedModel, please use the SavedModel CLI: `saved_model_cli`
All I have are the *.pb files and their label text files. These work fine across multiple computers in local environments.
def load_graph(model_file):
    """
    Code from v1.6.0 of Tensorflow's label_image.py example
    """
    graph = tf.Graph()
    graph_def = tf.GraphDef()

    with open(model_file, "rb") as f:
        graph_def.ParseFromString(f.read())
    with graph.as_default():
        tf.import_graph_def(graph_def)
    return graph

inputLayer = "Mul"
outputLayer = "final_result"
inputName = "import/" + inputLayer
outputName = "import/" + outputLayer

graph = load_graph(modelPath)
inputOperation = graph.get_operation_by_name(inputName)
outputOperation = graph.get_operation_by_name(outputName)

with tf.Session(graph=graph) as sess:
    # ... make a tensor t
    results = sess.run(outputOperation.outputs[0], {
        inputOperation.outputs[0]: t
    })
    # lovely functional results here
All I want to do is take these existing files, add the required "serve" tag, and re-save them, but everything I find seems to be about doing this from scratch.
I tried to use the builder to append a graph to a model like so:
# Load the graph
graph = load_graph(modelPath)

import shutil
if os.path.exists(exportDir):
    shutil.rmtree(exportDir)

# Add the serving metagraph tag
builder = tf.saved_model.builder.SavedModelBuilder(exportDir)
from tensorflow.saved_model import tag_constants
with tf.Session(graph=graph) as sess:
    builder.add_meta_graph_and_variables(sess, [tag_constants.SERVING, tag_constants.GPU], strip_default_attrs=True)
builder.save()
print("Built a SavedModel")
but got the same error.
Finally solved it. This contains some S3-specific code and instance shell calls (the ! commands), but you should pretty much be able to slice that out to run this.
#!python3
"""
Assumes we've defined:
- A directory for our working files to live in, CONTAINER_DIR
- an arbitrary integer VERSION_INT
- We have established local and S3 paths for our model and their labels as variables, particularly `modelLabel` and `modelPath`
"""
# Create a versioned path for the models to live in
# See https://stackoverflow.com/a/54014480/1877527
exportDir = os.path.join(CONTAINER_DIR, VERSION_INT)
if os.path.exists(exportDir):
    shutil.rmtree(exportDir)
os.mkdir(exportDir)

import tensorflow as tf

def load_graph(model_file, returnElements=None):
    """
    Code from v1.6.0 of Tensorflow's label_image.py example
    """
    graph = tf.Graph()
    graph_def = tf.GraphDef()

    with open(model_file, "rb") as f:
        graph_def.ParseFromString(f.read())
    returns = None
    with graph.as_default():
        returns = tf.import_graph_def(graph_def, return_elements=returnElements)
    if returnElements is None:
        return graph
    return graph, returns

# Add the serving metagraph tag
# We need the inputLayerName; in Inception we're feeding the resized tensor
# corresponding to resized_input_tensor_name
# May be able to get away with auto-determining this if not using Inception,
# but for Inception this is the 11th layer
inputLayerName = "Mul:0"

# Load the graph
if inputLayerName is None:
    graph = load_graph(modelPath)
    inputTensor = None
else:
    graph, returns = load_graph(modelPath, returnElements=[inputLayerName])
    inputTensor = returns[0]

with tf.Session(graph=graph) as sess:
    # Read the layers
    try:
        from tensorflow.compat.v1.saved_model import simple_save
    except (ModuleNotFoundError, ImportError):
        from tensorflow.saved_model import simple_save

    with graph.as_default():
        layers = [n.name for n in graph.as_graph_def().node]
        outName = layers.pop() + ":0"
        if inputLayerName is None:
            inputLayerName = layers.pop(0) + ":0"

        print("Checking outlayer", outName)
        outLayer = tf.get_default_graph().get_tensor_by_name(outName)

        if inputTensor is None:
            print("Checking inlayer", inputLayerName)
            inputTensor = tf.get_default_graph().get_tensor_by_name(inputLayerName)

        inputs = {
            inputLayerName: inputTensor
        }
        outputs = {
            outName: outLayer
        }
        simple_save(sess, exportDir, inputs, outputs)
        print("Built a SavedModel")

# Put the model label into the artifact dir
modelLabelDest = os.path.join(exportDir, "saved_model.txt")
!cp {modelLabel} {modelLabelDest}

# Prep for serving
import datetime as dt
modelArtifact = f"livemodel_{dt.datetime.now().timestamp()}.tar.gz"

# Copy the version directory here to package
!cp -R {exportDir} ./

# gziptar it
!tar -czvf {modelArtifact} {VERSION_INT}

# Shove it back to S3 for serving
!aws s3 cp {modelArtifact} {bucketPath}

shutil.rmtree(VERSION_INT)  # Cleanup
shutil.rmtree(exportDir)  # Cleanup
This model is then deployable as a SageMaker endpoint (or in any other TensorFlow Serving environment).
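As a quick sanity check before packaging (my addition, not part of the original workflow), you can load the export back with the serve tag to confirm that the meta graph SageMaker was complaining about is now present:

import tensorflow as tf
from tensorflow.saved_model import tag_constants

# Reload the freshly written SavedModel with the "serve" tag; if this succeeds,
# TF Serving / SageMaker should find the meta graph as well.
with tf.Session(graph=tf.Graph()) as sess:
    meta_graph = tf.saved_model.loader.load(sess, [tag_constants.SERVING], exportDir)
    print(list(meta_graph.signature_def.keys()))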
I have been fighting with TensorRT (TensorRT 4 for Python right now) for several weeks. I worked through a lot of problems to get TensorRT running. The example code from NVIDIA works well for me:
TensorRT MNIST example
Now, I have created my own network in TensorFlow (a very simple one) for upscaling images, let's say (in HWC) 320x240x3 into 640x480x3. The usual way of creating a frozen graph and running an inferencer based purely on TensorFlow gave me the expected results, but not when using TensorRT.
I have a strange feeling that I did something wrong when feeding the images into GPU memory (this is probably an issue with PyCUDA and/or TensorRT).
The worst-case scenario would be that TensorRT destroys my network through the optimization process.
I hope someone has even a small idea to save my life.
This is my TensorFlow model (I just wrapped the functions):
net = conv2d(input,
             64,
             k_size=3,
             activation=tf.nn.relu,
             name='conv1')
net = deconv2d(net,
               3,
               k_size=5,
               activation=tf.tanh,
               stride=self.params.resize_factor,
               scale=self.params.resize_factor,
               name='deconv')
This is the important snippet of my inferencer:
import tensorrt as trt
import uff
from tensorrt.parsers import uffparser
import pycuda.driver as cuda
import numpy as np
...

def _init_infer(self, uff_model):
    g_logger = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)
    parser = uffparser.create_uff_parser()
    parser.register_input(self.input_node, (self.channels, self.height, self.width), 0)
    parser.register_output(self.output_node)
    self.engine = trt.utils.uff_to_trt_engine(g_logger, uff_model, parser, self.max_batch_size,
                                              self.max_workspace_size)
    parser.destroy()
    self.runtime = trt.infer.create_infer_runtime(g_logger)
    self.context = self.engine.create_execution_context()
    self.output = np.empty(self.output_size, dtype=self.dtype)

    # create CUDA stream
    self.stream = cuda.Stream()

    # allocate device memory
    self.d_input = cuda.mem_alloc(self.channels * self.max_batch_size * self.width *
                                  self.height * self.output.dtype.itemsize)
    self.d_output = cuda.mem_alloc(self.output_size * self.output.dtype.itemsize)
    self.bindings = [int(self.d_input), int(self.d_output)]

def infer(self, input_batch, batch_size=1):
    # transfer input data to device
    cuda.memcpy_htod_async(self.d_input, input_batch, self.stream)
    # execute model
    self.context.enqueue(batch_size, self.bindings, self.stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(self.output, self.d_output, self.stream)
    # synchronize threads
    self.stream.synchronize()
    return self.output
And the executable snippet:
...
# create trt inferencer
trt_inferencer = TensorRTInferencer(params=params)
img = [misc.imread('./test_images/lion.png')]
img[0] = normalize(img[0])
img = img[0]
# inferencing method
result = trt_inferencer.infer(img)
result = inormalize(result, dtype=np.uint8)
result = result.reshape(1, params.height * 2, params.width * 2, 3)
...
And the weird result by comparison :(
[Image: upscaled lion (TensorRT, TensorFlow, original)]
I got it now, finally. The problem was the wrong dimensions and ordering of the input images and of the output. For everyone who runs into the same problem, this is the adapted executable snippet, dependent on my initialization:
...
# create trt inferencer
trt_inferencer = TensorRTInferencer(params=params)
img = [misc.imread('./test_images/lion.png')]
img[0] = normalize(img[0])
img = img[0]
img = np.transpose(img, (2, 0, 1))
img = img.ravel()
# inferencing method
result = trt_inferencer.infer(img)
result = inormalize(result, dtype=np.uint8)
result = np.reshape(result, newshape=[3, params.height * 2, params.width * 2])
result = np.transpose(result, (1, 2, 0))
...
I'm using tf.train.string_input_producer to read data from a TFRecord file. I assume it creates a queue and a pipeline, and the data will be loaded and fed into my model automatically. However, it gets stuck at the first batch and shows this exception:
FailedPreconditionError (see above for traceback): Attempting to use uninitialized value input_producer/limit_epochs/epochs
My TFRecord was made with tf.train.SequenceExample instead of tf.train.Example, which doesn't have clear documentation in the official guide.
Here is a code snippet to reproduce my problem. (I believe my problem comes from the queue initialization or something similar, because it seems the whole pipeline hangs.)
from config.config import get_config

init = tf.global_variables_initializer()
config = get_config()
filename_queue = tf.train.string_input_producer(['data0.tfrecord,data1.tfrecord'], 5, capacity=16384)
reader = tf.TFRecordReader()
(keys, values) = reader.read_up_to(filename_queue, config.batch_size)

context_features = {
    "seq_len": tf.FixedLenFeature([1], dtype=tf.int64),
}
audio_features = {
    "audio": tf.FixedLenSequenceFeature([config.num_features], dtype=tf.float32),
    "label": tf.FixedLenSequenceFeature([config.num_classes], dtype=tf.float32)
}

audio_list = []
label_list = []
len_list = []
for i in range(config.batch_size):
    print(i)
    context, sequence = tf.parse_single_sequence_example(
        serialized=values[i],
        context_features=context_features,
        sequence_features=audio_features
    )
    audio = sequence['audio']
    label = sequence['label']
    # seq_len = context['seq_len'][0]
    seq_len = tf.shape(audio)[0]
    audio_list.append(audio)
    label_list.append(label)
    len_list.append(seq_len)

audio_tensor = tf.stack(audio_list)
label_tenor = tf.stack(label_list)
len_tensor = tf.stack(len_list)

with tf.Session() as sess:
    sess.run(init)
    threads = tf.train.start_queue_runners(sess=sess)
    for i in range(3):
        x, y, z = sess.run([audio_tensor, label_tenor, len_tensor])
        print(z)
Try
init2 = tf.local_variables_initializer()
sess.run(init2)
Variables (num_epochs or capacity) inside tf.train.string_input_producer() are local variables. You have to initialize them with the local variable initializer, as shown above.
Let me know if this helped.
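Put in the context of the session block from the question, here is a minimal sketch of where the extra initializer goes (the Coordinator is optional, but it is the usual companion of start_queue_runners):

with tf.Session() as sess:
    # Initialize both global and local variables; string_input_producer's
    # epoch counter lives in the local variable collection.
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        for i in range(3):
            x, y, z = sess.run([audio_tensor, label_tenor, len_tensor])
            print(z)
    finally:
        coord.request_stop()
        coord.join(threads)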