Problem retraining PPO1 model and using Tensorflow with SB2 - python

I am new to stable baselines and RL. What I am trying to do is:
load my previously trained model from my computer and then re-train it from the point where its last training ended. For that, I am loading my previously saved model inside policy_fn() and passing policy_fn as a parameter to the pposgd_simple.learn() method. It raises the error "ValueError: At least two variables have the same name: pi/obfilter/count".
Also, I am unsure whether it resumes training from the previous ending point or starts from the very beginning (when it trains correctly in a different setting). Can anyone please point me to a way to verify this? One option may be printing the model parameters, but I am unsure of it.
I am also trying to use Tensorboard to monitor my training. But when I run the training, the program says “tensorboard_log=logger_path, TypeError: learn() got an unexpected keyword argument 'tensorboard_log'”. My stable baselines version is 2.10.2. I am attaching my entire training code below.
def make_env(seed=None):
    """Build the monitored training environment for this MPI worker.

    Seeds the global RNGs with ``seed + 1000 * rank`` (or leaves them unseeded
    when ``seed`` is None), wraps a fresh ``Env`` in a ``Monitor`` that logs to
    the module-level ``logger_path``, and returns the wrapped environment.
    """
    reward_scale = 1.0
    worker_rank = MPI.COMM_WORLD.Get_rank()
    if seed is None:
        worker_seed = None
    else:
        worker_seed = seed + 1000 * worker_rank
    set_global_seeds(worker_seed)

    env = Monitor(Env(), logger_path, allow_early_resets=True)
    # NOTE(review): the environment is seeded with the raw ``seed``, not the
    # per-rank seed computed above -- confirm this is intended.
    env.seed(seed)
    if reward_scale == 1.0:
        return env
    # Only reached if reward_scale is changed from its default above.
    from baselines.common.retro_wrappers import RewardScaler
    return RewardScaler(env, reward_scale)
def train(num_timesteps, path=None):
    """Train a PPO1 MLP policy, optionally resuming from a TensorFlow checkpoint.

    Args:
        num_timesteps: total environment steps, passed to ``learn`` as ``max_timesteps``.
        path: checkpoint prefix to restore before the first iteration, or None
            to start from freshly initialised weights.

    Returns:
        The policy object returned by ``pposgd_simple.learn``.
    """
    from baselines.ppo1 import mlp_policy, pposgd_simple
    sess = U.make_session(num_cpu=1)
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        # learn() builds more than one policy through this factory, so it must
        # only create the network under `name` and return it. Creating Savers
        # or importing a meta graph here re-creates existing variables, which
        # is what "At least two variables have the same name" reports.
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=3)

    def restore_checkpoint(local_vars, global_vars):
        # Baselines' learn() calls `callback(locals(), globals())` at the start
        # of each iteration, i.e. after it has initialised all variables, so
        # restoring on the first call replaces the fresh values with the saved
        # ones. NOTE(review): relies on learn() exposing `iters_so_far`.
        if path is not None and local_vars["iters_so_far"] == 0:
            print("Tried to restore from ", path)
            tf.train.Saver().restore(tf.get_default_session(), path)

    env = make_env()
    # `tensorboard_log` is not a keyword of this learn(); passing it raised
    # "TypeError: learn() got an unexpected keyword argument 'tensorboard_log'".
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=1024,
                             clip_param=0.2, entcoeff=0.0,
                             optim_epochs=10,
                             optim_stepsize=5e-5,
                             optim_batchsize=64,
                             gamma=0.99,
                             lam=0.95,
                             schedule='linear',
                             callback=restore_checkpoint,
                             )
    env.env.plotSave()
    # Save every variable so the next run can resume from this checkpoint.
    saver = tf.train.Saver(tf.global_variables())
    saver.save(sess, '/models/model1')
    return pi
def main():
    """Configure the logger, then run training that restores from the saved checkpoint."""
    logger.configure()
    train(num_timesteps=409600, path="/models/model1")
if __name__ == '__main__':
    rank = MPI.COMM_WORLD.Get_rank()
    # Per-rank log directory read by make_env(); None when the logger has no directory.
    if logger.get_dir() is None:
        logger_path = None
    else:
        logger_path = os.path.join(logger.get_dir(), str(rank))
    main()

Related

Why tensorflow is required to be non-deterministic in tf.timestamp()?

I am trying to measure the time required for the model forward pass. I've encountered a post mentioning the disadvantage of using python time modules for doing so.
Although the post relies on torch and uses torch.cuda.Event(enable_timing=True) to determine the current time, I've found maybe a similar function with tensorflow tf.timestamp().
However, using this function with os.environ['TF_DETERMINISTIC_OPS'] = '1', leads to the following error:
tensorflow.python.framework.errors_impl.FailedPreconditionError: Timestamp cannot be called when determinism is enabled [Op:Timestamp]
I am interested in knowing the reason tf.timestamp() requires the model to be not deterministic. Any ideas ?
Code Idea:
os.environ['TF_DETERMINISTIC_OPS'] = '1'
#tf.function()
def forward_pass(model, x):
    """Call ``model`` on ``x`` with ``training=False`` and return its output."""
    return model(x, training=False)
def inspect_time(model, model_in, runs):
    """Average the duration of ``runs`` forward passes, measured with two clocks.

    Returns:
        ``(cpu_avg, gpu_avg)``: the per-run average taken from ``time.time()``
        and the per-run average taken from ``tf.timestamp()``.
    """
    cpu_begin = time.time()
    gpu_begin = tf.timestamp()
    for _ in range(runs):
        forward_pass(model, model_in)
    # Same read order as the start: the tf.timestamp() span is closed first.
    gpu_avg = (tf.timestamp() - gpu_begin) / runs
    cpu_avg = (time.time() - cpu_begin) / runs
    return cpu_avg, gpu_avg
# Script entry: build the model, fetch a single batch, and time 100 forward passes.
# NOTE(review): indentation was lost in this listing; which statements belong
# inside the `with tf.device(...)` block must be confirmed against the original.
if __name__ == '__main__':
model = make_model()
with tf.device(logical_gpus[0]):
# Take one (features, labels) element from the dataset; the labels are ignored.
x_batch, _ = train_dataset.take(1).get_single_element()
x_batch = tnp.copy(x_batch)
# Fail early unless the copied batch reports a GPU:0 device.
assert x_batch.device.endswith("GPU:0")
time_cpu, time_gpu = inspect_time(model, x_batch, 100)

Weights and Biases error: The wandb backend process has shutdown

running the colab linked below, I get the following error:
"The wandb backend process has shutdown"
I see nothing suspicious in the way the colab uses wandb and I couldn't find anyone with the same problem. Any help is greatly appreciated. I am using the latest version of wandb in colab.
This is where I set up wandb:
# Log in to Weights & Biases only when the WANDB flag is set.
if WANDB:
wandb.login()
and this is the part where I get the error:
# Set up the wandb run group if we're using it.
if WANDB:
    experiment_name = os.environ.get("EXPERIMENT_NAME")
    group = experiment_name if experiment_name != "none" else wandb.util.generate_id()

# Per-fold monitored score and the accumulated out-of-fold predictions.
cv_scores = []
oof_data_frame = pd.DataFrame()

for fold in range(1, config.folds + 1):
    print(f"Fold {fold}/{config.folds}", end="\n"*2)

    # One output directory per fold, holding the model, its config and checkpoints.
    fold_directory = os.path.join(config.output_directory, f"fold_{fold}")
    make_directory(fold_directory)
    model_path = os.path.join(fold_directory, "model.pth")
    model_config_path = os.path.join(fold_directory, "model_config.json")
    checkpoints_directory = os.path.join(fold_directory, "checkpoints/")
    make_directory(checkpoints_directory)

    # Data collators are objects that will form a batch by using a list of dataset elements as input.
    collator = Collator(tokenizer=tokenizer, max_length=config.max_length)

    # Train on every row that is NOT in the current fold.
    train_fold = train[~train["fold"].isin([fold])]
    train_dataset = Dataset(texts=train_fold["anchor"].values,
                            pair_texts=train_fold["target"].values,
                            contexts=train_fold["title"].values,
                            targets=train_fold["score"].values,
                            max_length=config.max_length,
                            sep=tokenizer.sep_token,
                            tokenizer=tokenizer)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=config.batch_size,
                              num_workers=config.num_workers,
                              pin_memory=config.pin_memory,
                              collate_fn=collator,
                              shuffle=True,
                              drop_last=False)
    print(f"Train samples: {len(train_dataset)}")

    # Validate on the rows of the current fold. Copy the slice so that adding
    # the "prediction" column below does not write into a view of `train`.
    validation_fold = train[train["fold"].isin([fold])].copy()
    validation_dataset = Dataset(texts=validation_fold["anchor"].values,
                                 pair_texts=validation_fold["target"].values,
                                 contexts=validation_fold["title"].values,
                                 targets=validation_fold["score"].values,
                                 max_length=config.max_length,
                                 sep=tokenizer.sep_token,
                                 tokenizer=tokenizer)
    # shuffle=False: the validation outputs are assigned positionally to
    # `validation_fold` further down, so the loader must keep the row order.
    validation_loader = DataLoader(dataset=validation_dataset,
                                   batch_size=config.batch_size*2,
                                   num_workers=config.num_workers,
                                   pin_memory=config.pin_memory,
                                   collate_fn=collator,
                                   shuffle=False,
                                   drop_last=False)
    print(f"Validation samples: {len(validation_dataset)}")

    model = Model(**config.model)
    if not os.path.exists(model_config_path):
        model.config.to_json_file(model_config_path)

    model_parameters = model.parameters()
    optimizer = get_optimizer(**config.optimizer, model_parameters=model_parameters)

    training_steps = len(train_loader) * config.epochs
    if "scheduler" in config:
        # The scheduler needs the total step count, which is only known once the loader exists.
        config.scheduler.parameters.num_training_steps = training_steps
        config.scheduler.parameters.num_warmup_steps = training_steps * config.get("warmup", 0)
        scheduler = get_scheduler(**config.scheduler, optimizer=optimizer, from_transformers=True)
    else:
        scheduler = None

    model_checkpoint = ModelCheckpoint(mode="min",
                                       delta=config.delta,
                                       directory=checkpoints_directory,
                                       overwriting=True,
                                       filename_format="checkpoint.pth",
                                       num_candidates=1)

    # One wandb run per fold, opened here and closed right after training.
    if WANDB:
        wandb.init()
        #wandb.init(group=group, name=f"fold_{fold}", config=config)

    (train_loss, train_metrics), (validation_loss, validation_metrics, validation_outputs) = training_loop(
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        scheduling_after=config.scheduling_after,
        train_loader=train_loader,
        validation_loader=validation_loader,
        epochs=config.epochs,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        gradient_scaling=config.gradient_scaling,
        gradient_norm=config.gradient_norm,
        validation_steps=config.validation_steps,
        amp=config.amp,
        debug=config.debug,
        verbose=config.verbose,
        device=config.device,
        recalculate_metrics_at_end=True,
        return_validation_outputs=True,
        logger="tqdm")

    if WANDB:
        wandb.finish()

    if config.save_model:
        model_state = model.state_dict()
        torch.save(model_state, model_path)
        print(f"Model's path: {model_path}")

    # Attach this fold's predictions and append them to the out-of-fold frame.
    validation_fold["prediction"] = validation_outputs.to("cpu").numpy()
    oof_data_frame = pd.concat([oof_data_frame, validation_fold])

    cv_monitor_value = validation_loss if config.cv_monitor_value == "loss" else validation_metrics[config.cv_monitor_value]
    cv_scores.append(cv_monitor_value)

    # Drop the per-fold objects and release cached GPU memory before the next fold.
    del model, optimizer, validation_outputs, train_fold, validation_fold
    torch.cuda.empty_cache()
    gc.collect()
    print(end="\n"*6)
TL;DR: Check that the generated id is unique in the wandb project space you are using.
Explanation
You can check the exact reason this happened in the log files under the wandb folder and specific run id. Had the same issue with Error communicating with wandb process and The wandb backend process has shutdown.
My problem was that I was assigning the run id to a specific instance which already existed, and rerunning the whole search space, but the run id has to be unique. Using name in init is generally a safer bet if you don't intend to continue the previous run (which is possible if you indicate so in the init method).
You can try running Wandb in offline mode, to see if this can help, and later on doing wandb sync.
Solution which worked for me is run !wandb login --relogin.

Why is it not recommended to save the optimizer, model etc as pickable/dillable objs in PyTorch but instead get the state dicts and load them?

Why is it recommended to save the state dicts and load them instead of saving stuff with dill for example and then just getting the usable objects immediately?
I think I've done that without many issues and it saves users code.
But instead we are recommended to do something like:
def _load_model_and_optimizer_from_checkpoint(args: Namespace, training: bool = True) -> Namespace:
"""
based from: https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html
"""
import torch
from torch import optim
import torch.nn as nn
# model = Net()
args.model = nn.Linear()
# optimizer = optim.SGD(args.model.parameters(), lr=0.001, momentum=0.9)
optimizer = optim.Adam(args.model.parameters(), lr=0.001)
# scheduler...
checkpoint = torch.load(args.PATH)
args.model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
args.epoch_num = checkpoint['epoch_num']
args.loss = checkpoint['loss']
args.model.train() if training else args.model.eval()
For example I've saved:
def save_for_meta_learning(args: Namespace, ckpt_filename: str = 'ckpt.pt'):
    """Write a meta-learning checkpoint to ``args.log_root / ckpt_filename``.

    Does nothing unless ``is_lead_worker(args.rank)`` is true. The checkpoint
    stores whole objects (model, meta-learner, outer optimizer) pickled with
    dill, together with their state dicts and string descriptions.
    """
    if not is_lead_worker(args.rank):
        return
    import dill

    args.logger.save_current_plots_and_stats()

    # - ckpt
    # Exactly one of the two training modes must be active.
    assert uutils.xor(args.training_mode == 'epochs', args.training_mode == 'iterations')
    base_model: nn.Module = get_model_from_ddp(args.base_model)
    # pickle vs torch.save https://discuss.pytorch.org/t/advantages-disadvantages-of-using-pickle-module-to-save-models-vs-torch-save/79016
    picklable_args: Namespace = uutils.make_args_pickable(args)
    payload = {
        'training_mode': args.training_mode,
        'it': args.it,
        'epoch_num': args.epoch_num,
        'args': picklable_args,  # some versions of this might not have args!
        'meta_learner': args.meta_learner,
        'meta_learner_str': str(args.meta_learner),  # readable description of the meta-learner
        'f': base_model,
        'f_state_dict': base_model.state_dict(),  # weights only, loadable without unpickling the object
        'f_str': str(base_model),  # readable description of the architecture
        # 'f_modules': f._modules,
        # 'f_modules_str': str(f._modules),
        'outer_opt': args.outer_opt,
        'outer_opt_state_dict': args.outer_opt.state_dict(),
        'outer_opt_str': str(args.outer_opt),  # makes it easy to check what optimizer was used
    }
    torch.save(payload, f=args.log_root / ckpt_filename, pickle_module=dill)
then loaded:
def get_model_opt_meta_learner_to_resume_checkpoint_resnets_rfs(args: Namespace,
                                                                path2ckpt: str,
                                                                filename: str,
                                                                device: Optional[torch.device] = None
                                                                ) -> tuple[nn.Module, optim.Optimizer, MetaLearner]:
    """
    Load the model, outer optimizer and meta-learner needed to resume training.

    The checkpoint at ``path2ckpt / filename`` is loaded onto the CPU. When it
    records a training mode, the matching counter (``args.epoch_num`` or
    ``args.it``) is restored onto ``args``.

    Examples:
        - see: _resume_from_checkpoint_meta_learning_for_resnets_rfs_test
    """
    import uutils

    ckpt_dir: Path = Path(path2ckpt).expanduser()
    ckpt: dict = torch.load(ckpt_dir / filename, map_location=torch.device('cpu'))

    # args_ckpt: Namespace = ckpt['args']
    mode = ckpt.get('training_mode')
    if mode is not None:
        assert uutils.xor(mode == 'epochs', mode == 'iterations')
        if mode == 'epochs':
            args.epoch_num = ckpt['epoch_num']
        else:
            args.it = ckpt['it']

    # - get meta-learner, and the model it wraps
    meta_learner: MetaLearner = ckpt['meta_learner']
    model: nn.Module = meta_learner.base_model

    # - get outer-opt
    opt_description = ckpt.get('outer_opt_str')
    if opt_description is None:
        # Checkpoints without a stored optimizer description fall back to a fresh
        # Adam. This is not ideal, since Adam's moving averages are lost.
        outer_opt: optim.Optimizer = optim.Adam(model.parameters(), lr=args.outer_lr)
    else:
        # Recreate the optimizer from its description, then load its saved state.
        outer_opt: optim.Optimizer = get_optimizer(opt_description)
        outer_opt.load_state_dict(ckpt['outer_opt_state_dict'])

    # - device setup
    if device is not None:
        meta_learner.base_model = meta_learner.base_model.to(device)
    return model, outer_opt, meta_learner
without issues.
Related:
Save and load model optimizer state
pytorch save and load model
Save and load a Pytorch model
save and load unserialized pytorch pretrained model
https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html
Why is it not recommended to save the optimizer, model etc as pickable/dillable objs in PyTorch but instead get the state dicts and load them?
https://discuss.pytorch.org/t/why-is-it-not-recommended-to-save-the-optimizer-model-etc-as-pickable-dillable-objs-in-pytorch-but-instead-get-the-state-dicts-and-load-them/137933

How does one fix a `Exception: process 0 terminated with signal SIGSEGV` error and if the single gpu code works fine?

I start 2 processes because I only have 2 GPUs, but then it gives me an Exception: process 0 terminated with signal SIGSEGV. This code does work with multiple CPUs (or at least no error is thrown). Also, it works with a single GPU. Besides that, it fails when world_size > 0 and multiple cuda/gpus are present.
My error message is this:
(automl-meta-learning) miranda9~/ML4Coq $ python playground/multiprocessing_playground/ddp_hello_world.py
world_size=2
Traceback (most recent call last):
File "playground/multiprocessing_playground/ddp_hello_world.py", line 49, in <module>
main()
File "playground/multiprocessing_playground/ddp_hello_world.py", line 43, in main
mp.spawn(example,
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
while not context.join():
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 105, in join
raise Exception(
Exception: process 0 terminated with signal SIGSEGV
This is the code that gives the error:
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
def example(rank, world_size):
    """Worker body: join the process group and run one SGD step of a DDP-wrapped linear layer.

    ``rank`` is used both as the process rank and as the device index.
    """
    # Rendezvous settings read when the default process group is created.
    os.environ.update(MASTER_ADDR='localhost', MASTER_PORT='8888')
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    device = rank
    # Local model, moved to this worker's device and wrapped for DDP.
    net = nn.Linear(10, 10).to(device)
    ddp_model = DDP(net, device_ids=[device])

    criterion = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    # Forward pass on random inputs, compared against random labels.
    inputs = torch.randn(20, 10).to(device)
    outputs = ddp_model(inputs)
    labels = torch.randn(20, 10).to(device)

    # Backward pass, then parameter update.
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
def main():
    """Spawn one ``example`` worker per visible CUDA device and wait for them all."""
    # world_size = 2
    n_workers = torch.cuda.device_count()
    mp.spawn(example, args=(n_workers,), nprocs=n_workers, join=True)


if __name__ == "__main__":
    main()
    print('Done\n\a')
[Optional] Larger self-contained example (gives same error)
Note however, that this slightly more complete example (only missing a distributed dataloader) also gives me the same issue:
"""
Based on: https://pytorch.org/tutorials/intermediate/ddp_tutorial.html
Correctness of code: https://stackoverflow.com/questions/66226135/how-to-parallelize-a-training-loop-ever-samples-of-a-batch-when-cpu-is-only-avai
Note: as opposed to the multiprocessing (torch.multiprocessing) package, processes can use
different communication backends and are not restricted to being executed on the same machine.
"""
import time
from typing import Tuple
import torch
from torch import nn, optim
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import os
num_epochs = 5
batch_size = 8
Din, Dout = 10, 5
data_x = torch.randn(batch_size, Din)
data_y = torch.randn(batch_size, Dout)
data = [(i*data_x, i*data_y) for i in range(num_epochs)]
class PerDeviceModel(nn.Module):
    """
    Toy two-layer network: Linear(Din, Din) -> ReLU -> Linear(Din, Dout).

    Each process holds its own full copy; the model itself is not split
    across devices.
    """

    def __init__(self):
        super().__init__()
        self.net1 = nn.Linear(Din, Din)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(Din, Dout)

    def forward(self, x):
        """Map a (..., Din) input to a (..., Dout) output."""
        hidden = self.net1(x)
        activated = self.relu(hidden)
        return self.net2(activated)
def setup_process(rank, world_size, backend='gloo'):
    """
    Initialize the distributed environment for the calling process.

    Sets the master address/port environment variables and creates the default
    process group. ``backend`` is used only when CUDA is unavailable; with
    CUDA present the group is always created with 'nccl'.
    """
    # Where this process looks for the master to coordinate with.
    # os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # - use NCCL if you are using gpus: https://pytorch.org/tutorials/intermediate/dist_tuto.html#communication-backends
    chosen_backend = 'nccl' if torch.cuda.is_available() else backend

    # Creates the default process group, which also initializes the distributed package.
    dist.init_process_group(chosen_backend, rank=rank, world_size=world_size)
def cleanup():
    """Tear down the process group created by ``dist.init_process_group``."""
    dist.destroy_process_group()
def get_batch(batch: Tuple[torch.Tensor, torch.Tensor], rank):
    """Return the (x, y) pair of ``batch`` prepared for the worker ``rank``.

    With CUDA available both tensors are moved to device ``rank``; otherwise
    they are placed in shared memory.
    """
    inputs, targets = batch
    if not torch.cuda.is_available():
        return inputs.share_memory_(), targets.share_memory_()
    return inputs.to(rank), targets.to(rank)
def get_ddp_model(model: nn.Module, rank):
    """
    Wrap ``model`` in DistributedDataParallel for the worker ``rank``.

    With CUDA available the model is first moved to device ``rank`` and DDP is
    pinned to that device. Otherwise the model's storage is moved to shared
    memory and DDP is built without device ids.

    TODO: does this have to be done outside or inside the process?

    :return: the DDP-wrapped model.
    """
    if not torch.cuda.is_available():
        # CPU path: share the model across processes (no-op if it is already shared).
        return DDP(model.share_memory())  # I think removing the devices ids should be fine...?
    # GPU path: the standard move-to-device-then-wrap.
    return DDP(model.to(rank), device_ids=[rank])
def run_parallel_training_loop(rank, world_size):
    """
    Train the toy model on ``data`` inside one distributed worker process.

    This is the function that is actually run in each spawned process: it joins
    the process group, wraps a fresh ``PerDeviceModel`` in DDP, takes one SGD
    step per batch, and always destroys the process group on the way out.

    Note: as DDP broadcasts model states from rank 0 process to all other processes in the DDP constructor,
    you don't need to worry about different DDP processes start from different model parameter initial values.

    :param rank: index of this process, also used as its device index.
    :param world_size: total number of worker processes.
    """
    setup_process(rank, world_size)
    try:
        print()
        print(f"Start running DDP with model parallel example on rank: {rank}.")
        print(f'current process: {mp.current_process()}')
        print(f'pid: {os.getpid()}')

        # get ddp model
        model = PerDeviceModel()
        ddp_model = get_ddp_model(model, rank)

        # Build the loss and optimizer once; they do not change between batches.
        loss_fn = nn.MSELoss()
        optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

        # do training
        for batch in data:
            x, y = get_batch(batch, rank)
            optimizer.zero_grad()
            outputs = ddp_model(x)
            # Gradient synchronization communications take place during the backward pass and overlap with the backward computation.
            loss_fn(outputs, y).backward()  # When the backward() returns, param.grad already contains the synchronized gradient tensor.
            optimizer.step()  # TODO how does the optimizer know to do the gradient step only once?

        print()
        # The closing message used to repeat the "Start running" text.
        print(f"Finished running DDP with model parallel example on rank: {rank}.")
        print(f'current process: {mp.current_process()}')
        print(f'pid: {os.getpid()}')
    finally:
        # Destroy the process group even if training raised.
        cleanup()
def main():
    """Spawn one training process per CUDA device, or per CPU when CUDA is unavailable."""
    print()
    print('running main()')
    print(f'current process: {mp.current_process()}')
    print(f'pid: {os.getpid()}')
    # args
    world_size = torch.cuda.device_count() if torch.cuda.is_available() else mp.cpu_count()
    print(f'world_size={world_size}')
    mp.spawn(run_parallel_training_loop, args=(world_size,), nprocs=world_size)


if __name__ == "__main__":
    print('starting __main__')
    start = time.time()
    main()
    elapsed = time.time() - start
    print(f'execution length = {elapsed}')
    print('Done!\a\n')
cross posted: https://discuss.pytorch.org/t/why-is-mp-spawn-spawning-4-processes-when-i-only-want-2/112299
I ran your "(minimal) code example" without any change and any error on a server with 4 GPUs (python version: 3.6.9, and pytorch version: 1.5.0+cu101 ).
Does the problem still exist when you run the minimal code example?
If so, and if you are on a linux machine, could you please run the following code instead, and tell me what output you get:
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
def get_visible_gpus():
    """Return the GPU indices listed by ``nvidia-smi`` as a list of ints.

    The indices are scraped from the device table of the plain-text
    ``nvidia-smi`` output. An empty list is returned when the command prints
    nothing (for example when it is not installed).
    """
    # The context manager closes the pipe; it was previously left open.
    with os.popen('nvidia-smi') as ns:
        lines_ns = ns.readlines()
    if not lines_ns:
        # Without this guard the loop variable below would be unbound.
        return []
    # print(lines_ns)
    # Skip everything above the first '|=' rule line.
    for _i, _line in enumerate(lines_ns):
        if _line.find('|=') >= 0:
            break
    line_gpus = lines_ns[_i:]
    # Cut at the 'Processes' section, also dropping the three lines before it.
    for _i, _line in enumerate(line_gpus):
        if _line.find('Processes') >= 0:
            break
    line_gpus = line_gpus[:_i-3]
    # print(line_gpus)
    # A device entry is the line after each line containing '+'; its second
    # whitespace-separated field is parsed as the GPU index.
    idx_gpu_lines = [_i + 1 for _i, _line in enumerate(line_gpus) if _line.find('+') >= 0]
    idx_gpus = [int(line_gpus[_line_gpu].split()[1]) for _line_gpu in idx_gpu_lines]
    # print(idx_gpus)
    return idx_gpus
def example(rank, world_size):
    """Worker body: report the rank, join the process group and run one SGD step under DDP.

    ``rank`` is used both as the process rank and as the device index.
    """
    print(f'rank:{rank}')
    # Rendezvous settings read when the default process group is created.
    os.environ.update(MASTER_ADDR='localhost', MASTER_PORT='8888')
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    device = rank
    # Local model, moved to this worker's device and wrapped for DDP.
    net = nn.Linear(10, 10).to(device)
    ddp_model = DDP(net, device_ids=[device])

    criterion = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    # Forward pass on random inputs, compared against random labels.
    inputs = torch.randn(20, 10).to(device)
    outputs = ddp_model(inputs)
    labels = torch.randn(20, 10).to(device)

    # Backward pass, then parameter update.
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
def main():
    """Print the device count and the parsed GPU ids, then spawn one ``example`` worker per device."""
    # world_size = 2
    n_workers = torch.cuda.device_count()
    print(f'world_size:{n_workers}')
    print(f'get_visible_gpus():{get_visible_gpus()}')
    mp.spawn(example, args=(n_workers,), nprocs=n_workers, join=True)


if __name__ == "__main__":
    print(torch.__version__)
    main()
    print('Done\n\a')
In my case, I simply get:
1.5.0+cu101
world_size:4
get_visible_gpus():[0, 1, 2, 3]
rank:1
rank:3
rank:0
rank:2
Done
get_visible_gpus() is simply text parsing an nvidia-smi shell cmd to get the ids of the gpus that cuda can see.
NB: Please excuse me, I would have commented instead of "answering" -as I am not directly solving your problem, but asking for more details- but my reputation is not good enough T.T
Solution: increase shm-size
docker run -it \
--shm-size=64g
Reason:
If you run in a Docker container, it's probably because the shm_size of the container is not large enough. By default, Docker containers are allocated 64 MB of shared memory. This shared memory is not a memory limit, but a /dev/shm temporary file storage file system that uses RAM to store files. This is used for IPC.
To check the shm-size: after entering the container, you can use df to view the size of /dev/shm.

Tensorflow error with dataset iterator initialization in monitoredtrainingsession

Hi everyone, I need some help.
I am trying to code ResNet-101 ImageNet classification using TensorFlow without using Estimator. I am doing it to study deep learning and to understand how to use TensorFlow.
My problem is that MonitoredTrainingSession does not initialize my iterator.
I have read some articles about the problem and tried to use a hook to handle it, but it fails and I have no idea why.
After I create the MonitoredTrainingSession, it first initializes train_iterator
and I get an OutOfRange exception,
then the validation step is performed.
It seems fine up to that point, but after validation finishes and the training step runs again, I get an error related to iterator.get_next().
It says I did not initialize the iterator, but my hook function clearly calls
session.run(self._initializer, feed_dict={filenames: self._filenames})
I'm sure because I can see the message below, which I print to check whether it is initialized or not.
iter_val.initializer after_create_session is called 0 times
What am I doing wrong?
running flow are like below
run train step fine (epoch =0)
run validation step fine (epoch =0)
run train step Error(epoch =1)
Please ignore horovod(hvd()) in the code cause I am not using it right now.
Here is my code so please help me to fix it and let me know what's wrong with my code.
class _DatasetInitializerHook(tf.train.SessionRunHook):
    """Session hook that runs a dataset iterator's initializer once a session has been created."""

    def __init__(self, initializer, filenames=None, name=""):
        """
        Args:
            initializer: the iterator initializer op to run.
            filenames: list fed to the module-level ``filenames`` placeholder
                when running the initializer; None or empty means the
                initializer is run without a feed.
            name: label printed in the log line.
        """
        self._initializer = initializer
        # A None default avoids sharing one mutable list between instances.
        self._filenames = [] if filenames is None else filenames
        self._name = name
        self._cnt = 0
        self._before_runCnt = 0

    def begin(self):
        pass

    def after_create_session(self, session, coord):
        """Run the initializer in the new session and log how many times this ran before."""
        del coord
        if not self._filenames:
            session.run(self._initializer)
        else:
            # `filenames` here is the module-level placeholder, not the constructor argument.
            session.run(self._initializer, feed_dict={filenames: self._filenames})
        # The count is printed before it is incremented, so the first call reports 0.
        print(self._name, "after_create_session is called {} times".format(self._cnt))
        self._cnt += 1
if __name__ == "__main__":
    # Optional first CLI argument; 0 when it is absent.
    nlogs = sys.argv[1] if len(sys.argv) > 1 else 0
    hvd.init()

    # Choose the shard list: ImageNet (129 shards) or the single CIFAR-10 shard.
    b_imagenet = False
    if b_imagenet:
        shard_pattern, shard_ids = '/data/tfrecords/imagenet2012_train_shard{}.tfrecord', range(129)
    else:
        shard_pattern, shard_ids = '/data/cifar-10-tfrecords/train_shard{}.tfrecord', range(1, 2, 1)
    training_filenames = [shard_pattern.format(i) for i in shard_ids]

    # The training file list is fed through this placeholder when the iterator is initialized.
    filenames = tf.placeholder(tf.string, shape=[None])
    trainData = dataset_input_fn(is_training=True, filename=filenames, nworkers=hvd.size(), workeridx=hvd.rank(),
                                 batch_size=FLAGS.batchSize, prefetch_size=FLAGS.prefetch_buffer_size, repeat=1,
                                 shuffle_buffer_size=FLAGS.shuffle_buffer_size)
    valData = dataset_input_fn(is_training=False, filename=FLAGS.validationfile, nworkers=hvd.size(),
                               workeridx=hvd.rank(), batch_size=1, prefetch_size=FLAGS.prefetch_buffer_size,
                               repeat=1, shuffle_buffer_size=1)

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    for i in tqdm(range(FLAGS.nepoch)):
        shuffle(training_filenames)
        # A new model, iterator and training op are built on every epoch.
        model = model_class(nCls=FLAGS.nClasses, img_width=FLAGS.width, img_height=FLAGS.height,
                            learning_rate=FLAGS.learning_rate, weight_decay=FLAGS.weight_decay)

        # --- training pass: run until the dataset is exhausted ---
        iter_train = trainData.make_initializable_iterator()
        train_op = model.build_model(iter_train.get_next(), is_trainig=True, hvd=None)
        train_init_hook = _DatasetInitializerHook(iter_train.initializer, training_filenames,
                                                  "iter_train.initializer")
        train_hooks = [hvd.BroadcastGlobalVariablesHook(0), train_init_hook]
        with tf.train.MonitoredTrainingSession(checkpoint_dir="./tmp/train_logs", config=config,
                                               hooks=train_hooks, save_checkpoint_secs=30) as sess:
            try:
                while True:
                    opt = sess.run([train_op])
            except tf.errors.OutOfRangeError:
                # End of the training data for this epoch.
                pass

        # --- validation pass: same pattern with the validation iterator ---
        iter_val = valData.make_initializable_iterator()
        prediction_result = model.build_model(iter_val.get_next(), is_trainig=False, hvd=None)
        val_init_hook = _DatasetInitializerHook(iter_val.initializer, [], "iter_val.initializer")
        validation_hooks = [hvd.BroadcastGlobalVariablesHook(0), val_init_hook]
        with tf.train.MonitoredTrainingSession(checkpoint_dir="./tmp/train_logs", config=config,
                                               hooks=validation_hooks) as sess:
            try:
                while True:
                    result = sess.run([prediction_result])
            except tf.errors.OutOfRangeError:
                # End of the validation data.
                pass
This is the error message I got.
tensorflow.python.framework.errors_impl.FailedPreconditionError: GetNext() failed because the iterator has not been initialized. Ensure that you have run the initializer operation for this iterator before getting the next element.
[[node IteratorGetNext (defined at workspace/multi_gpu/main.py:128) ]]
Errors may have originated from an input operation.
Input Source operations connected to node IteratorGetNext:
IteratorV2_2 (defined at workspace/multi_gpu/main.py:126)
Try putting your initializer into a scaffold:
scaffold = tf.train.Scaffold(local_init_op=train_init_operator)
and give it to the monitoredTrainingSession with:
with tf.train.MonitoredTrainingSession(scaffold=scaffold, ...

Categories