I am trying to train a PyTorch model using SageMaker in local mode, but whenever I call estimator.fit the code hangs indefinitely and I have to interrupt the notebook kernel. This happens both on my local machine and in SageMaker Studio. When I use EC2, however, the training runs normally.
Here is the call to the estimator, and the stack trace once I interrupt the kernel:
import sagemaker
from sagemaker.pytorch import PyTorch
bucket = "bucket-name"
role = sagemaker.get_execution_role()
training_input_path = f"s3://{bucket}/dataset/path"
sagemaker_session = sagemaker.LocalSession()
sagemaker_session.config = {"local": {"local_code": True}}
output_path = "file://."
estimator = PyTorch(
    entry_point="train.py",
    source_dir="src",
    hyperparameters={"max-epochs": 1},
    framework_version="1.8",
    py_version="py3",
    instance_count=1,
    instance_type="local",
    role=role,
    output_path=output_path,
    sagemaker_session=sagemaker_session,
)
estimator.fit({"training": training_input_path})
Stack trace:
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-9-35cdd6021288> in <module>
----> 1 estimator.fit({"training": training_input_path})
/opt/conda/lib/python3.7/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config)
678 self._prepare_for_training(job_name=job_name)
679
--> 680 self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_config)
681 self.jobs.append(self.latest_training_job)
682 if wait:
/opt/conda/lib/python3.7/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs, experiment_config)
1450 """
1451 train_args = cls._get_train_args(estimator, inputs, experiment_config)
-> 1452 estimator.sagemaker_session.train(**train_args)
1453
1454 return cls(estimator.sagemaker_session, estimator._current_job_name)
/opt/conda/lib/python3.7/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image_uri, algorithm_arn, encrypt_inter_container_traffic, use_spot_instances, checkpoint_s3_uri, checkpoint_local_path, experiment_config, debugger_rule_configs, debugger_hook_config, tensorboard_output_config, enable_sagemaker_metrics, profiler_rule_configs, profiler_config, environment, retry_strategy)
572 LOGGER.info("Creating training-job with name: %s", job_name)
573 LOGGER.debug("train request: %s", json.dumps(train_request, indent=4))
--> 574 self.sagemaker_client.create_training_job(**train_request)
575
576 def _get_train_request( # noqa: C901
/opt/conda/lib/python3.7/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs)
184 hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {}
185 logger.info("Starting training job")
--> 186 training_job.start(InputDataConfig, OutputDataConfig, hyperparameters, TrainingJobName)
187
188 LocalSagemakerClient._training_jobs[TrainingJobName] = training_job
/opt/conda/lib/python3.7/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, job_name)
219
220 self.model_artifacts = self.container.train(
--> 221 input_data_config, output_data_config, hyperparameters, job_name
222 )
223 self.end_time = datetime.datetime.now()
/opt/conda/lib/python3.7/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name)
200 data_dir = self._create_tmp_folder()
201 volumes = self._prepare_training_volumes(
--> 202 data_dir, input_data_config, output_data_config, hyperparameters
203 )
204 # If local, source directory needs to be updated to mounted /opt/ml/code path
/opt/conda/lib/python3.7/site-packages/sagemaker/local/image.py in _prepare_training_volumes(self, data_dir, input_data_config, output_data_config, hyperparameters)
487 os.mkdir(channel_dir)
488
--> 489 data_source = sagemaker.local.data.get_data_source_instance(uri, self.sagemaker_session)
490 volumes.append(_Volume(data_source.get_root_dir(), channel=channel_name))
491
/opt/conda/lib/python3.7/site-packages/sagemaker/local/data.py in get_data_source_instance(data_source, sagemaker_session)
52 return LocalFileDataSource(parsed_uri.netloc + parsed_uri.path)
53 if parsed_uri.scheme == "s3":
---> 54 return S3DataSource(parsed_uri.netloc, parsed_uri.path, sagemaker_session)
55 raise ValueError(
56 "data_source must be either file or s3. parsed_uri.scheme: {}".format(parsed_uri.scheme)
/opt/conda/lib/python3.7/site-packages/sagemaker/local/data.py in __init__(self, bucket, prefix, sagemaker_session)
183 working_dir = "/private{}".format(working_dir)
184
--> 185 sagemaker.utils.download_folder(bucket, prefix, working_dir, sagemaker_session)
186 self.files = LocalFileDataSource(working_dir)
187
/opt/conda/lib/python3.7/site-packages/sagemaker/utils.py in download_folder(bucket_name, prefix, target, sagemaker_session)
286 raise
287
--> 288 _download_files_under_prefix(bucket_name, prefix, target, s3)
289
290
/opt/conda/lib/python3.7/site-packages/sagemaker/utils.py in _download_files_under_prefix(bucket_name, prefix, target, s3)
314 if exc.errno != errno.EEXIST:
315 raise
--> 316 obj.download_file(file_path)
317
318
/opt/conda/lib/python3.7/site-packages/boto3/s3/inject.py in object_download_file(self, Filename, ExtraArgs, Callback, Config)
313 return self.meta.client.download_file(
314 Bucket=self.bucket_name, Key=self.key, Filename=Filename,
--> 315 ExtraArgs=ExtraArgs, Callback=Callback, Config=Config)
316
317
/opt/conda/lib/python3.7/site-packages/boto3/s3/inject.py in download_file(self, Bucket, Key, Filename, ExtraArgs, Callback, Config)
171 return transfer.download_file(
172 bucket=Bucket, key=Key, filename=Filename,
--> 173 extra_args=ExtraArgs, callback=Callback)
174
175
/opt/conda/lib/python3.7/site-packages/boto3/s3/transfer.py in download_file(self, bucket, key, filename, extra_args, callback)
305 bucket, key, filename, extra_args, subscribers)
306 try:
--> 307 future.result()
308 # This is for backwards compatibility where when retries are
309 # exceeded we need to throw the same error from boto3 instead of
/opt/conda/lib/python3.7/site-packages/s3transfer/futures.py in result(self)
107 except KeyboardInterrupt as e:
108 self.cancel()
--> 109 raise e
110
111 def cancel(self):
/opt/conda/lib/python3.7/site-packages/s3transfer/futures.py in result(self)
104 # however if a KeyboardInterrupt is raised we want want to exit
105 # out of this and propogate the exception.
--> 106 return self._coordinator.result()
107 except KeyboardInterrupt as e:
108 self.cancel()
/opt/conda/lib/python3.7/site-packages/s3transfer/futures.py in result(self)
258 # possible value integer value, which is on the scale of billions of
259 # years...
--> 260 self._done_event.wait(MAXINT)
261
262 # Once done waiting, raise an exception if present or return the
/opt/conda/lib/python3.7/threading.py in wait(self, timeout)
550 signaled = self._flag
551 if not signaled:
--> 552 signaled = self._cond.wait(timeout)
553 return signaled
554
/opt/conda/lib/python3.7/threading.py in wait(self, timeout)
294 try: # restore state no matter what (e.g., KeyboardInterrupt)
295 if timeout is None:
--> 296 waiter.acquire()
297 gotit = True
298 else:
KeyboardInterrupt:
SageMaker Studio does not natively support local mode. Studio Apps are themselves Docker containers, so they would need privileged access in order to build and run Docker containers.
As an alternative, you can create a remote Docker host on an EC2 instance and set up Docker on your Studio App. There is quite a bit of networking and package installation involved, but this gives you full Docker functionality. Additionally, as of version 2.80.0 of the SageMaker Python SDK, local mode is supported when using a remote Docker host.
The sdocker SageMaker Studio Docker CLI extension (see this repo) can simplify deploying the above solution in two simple steps (it only works for Studio Domains in VPCOnly mode), and it has an easy-to-follow example here.
UPDATE:
There is now a UI extension (see repo) which can make the experience much smoother and easier to manage.
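For illustration, here is a minimal sketch of how the remote Docker host ends up being used (not the exact sdocker setup; the address below is hypothetical). SageMaker local mode shells out to docker/docker-compose, which honour the DOCKER_HOST environment variable:
import os
# Hypothetical endpoint of the remote Docker host created on EC2; once set,
# local mode's docker and docker-compose calls are sent to that host instead
# of the (unprivileged) Studio App container.
os.environ["DOCKER_HOST"] = "tcp://10.0.0.12:2375"
# The estimator from the question can then be created with instance_type="local"
# and estimator.fit() should run the training container on the remote host.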
Trying to load a model from a past run in MLflow, in JupyterLab, never finishes. After waiting for hours, interrupting the kernel throws the state below.
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
Input In [21], in <cell line: 2>()
1 logged_model = 'runs:/7f6932baef144fa69847ba11ef66f8e6/model/'
----> 2 loaded_model = mlflow.tensorflow.load_model(logged_model)
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/mlflow/tensorflow/__init__.py:397, in load_model(model_uri, dst_path)
360 def load_model(model_uri, dst_path=None):
361 """
362 Load an MLflow model that contains the TensorFlow flavor from the specified path.
363
(...)
395 for _, output_signature in signature_definition.outputs.items()]
396 """
--> 397 local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path)
398 flavor_conf = _get_flavor_configuration(local_model_path, FLAVOR_NAME)
399 _add_code_from_conf_to_system_path(local_model_path, flavor_conf)
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/mlflow/tracking/artifact_utils.py:95, in _download_artifact_from_uri(artifact_uri, output_path)
92 parsed_uri = parsed_uri._replace(path=posixpath.dirname(parsed_uri.path))
93 root_uri = prefix + urllib.parse.urlunparse(parsed_uri)
---> 95 return get_artifact_repository(artifact_uri=root_uri).download_artifacts(
96 artifact_path=artifact_path, dst_path=output_path
97 )
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/mlflow/store/artifact/runs_artifact_repo.py:125, in RunsArtifactRepository.download_artifacts(self, artifact_path, dst_path)
110 def download_artifacts(self, artifact_path, dst_path=None):
111 """
112 Download an artifact file or directory to a local directory if applicable, and return a
113 local path for it.
(...)
123 :return: Absolute path of the local filesystem location containing the desired artifacts.
124 """
--> 125 return self.repo.download_artifacts(artifact_path, dst_path)
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/mlflow/store/artifact/artifact_repo.py:242, in ArtifactRepository.download_artifacts(self, artifact_path, dst_path)
240 # Check if the artifacts points to a directory
241 if self._is_directory(artifact_path):
--> 242 dst_local_path, inflight_downloads = async_download_artifact_dir(
243 src_artifact_dir_path=artifact_path, dst_local_dir_path=dst_path
244 )
245 else:
246 inflight_downloads = async_download_artifact(
247 src_artifact_path=artifact_path, dst_local_dir_path=dst_path
248 )
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/mlflow/store/artifact/artifact_repo.py:208, in ArtifactRepository.download_artifacts.<locals>.async_download_artifact_dir(src_artifact_dir_path, dst_local_dir_path)
206 for file_info in dir_content:
207 if file_info.is_dir:
--> 208 inflight_downloads += async_download_artifact_dir(
209 src_artifact_dir_path=file_info.path,
210 dst_local_dir_path=dst_local_dir_path,
211 )[2]
212 else:
213 inflight_downloads += async_download_artifact(
214 src_artifact_path=file_info.path,
215 dst_local_dir_path=dst_local_dir_path,
216 )
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/mlflow/store/artifact/artifact_repo.py:199, in ArtifactRepository.download_artifacts.<locals>.async_download_artifact_dir(src_artifact_dir_path, dst_local_dir_path)
195 local_dir = os.path.join(dst_local_dir_path, src_artifact_dir_path)
196 inflight_downloads = []
197 dir_content = [ # prevent infinite loop, sometimes the dir is recursively included
198 file_info
--> 199 for file_info in self.list_artifacts(src_artifact_dir_path)
200 if file_info.path != "." and file_info.path != src_artifact_dir_path
201 ]
202 if not dir_content: # empty dir
203 if not os.path.exists(local_dir):
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/mlflow/store/artifact/sftp_artifact_repo.py:94, in SFTPArtifactRepository.list_artifacts(self, path)
92 artifact_dir = self.path
93 list_dir = posixpath.join(artifact_dir, path) if path else artifact_dir
---> 94 if not self.sftp.isdir(list_dir):
95 return []
96 artifact_files = self.sftp.listdir(list_dir)
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/pysftp/__init__.py:652, in Connection.isdir(self, remotepath)
650 self._sftp_connect()
651 try:
--> 652 result = S_ISDIR(self._sftp.stat(remotepath).st_mode)
653 except IOError: # no such file
654 result = False
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/paramiko/sftp_client.py:493, in SFTPClient.stat(self, path)
491 path = self._adjust_cwd(path)
492 self._log(DEBUG, "stat({!r})".format(path))
--> 493 t, msg = self._request(CMD_STAT, path)
494 if t != CMD_ATTRS:
495 raise SFTPError("Expected attributes")
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/paramiko/sftp_client.py:822, in SFTPClient._request(self, t, *arg)
820 def _request(self, t, *arg):
821 num = self._async_request(type(None), t, *arg)
--> 822 return self._read_response(num)
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/paramiko/sftp_client.py:852, in SFTPClient._read_response(self, waitfor)
850 while True:
851 try:
--> 852 t, data = self._read_packet()
853 except EOFError as e:
854 raise SSHException("Server connection dropped: {}".format(e))
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/paramiko/sftp.py:201, in BaseSFTP._read_packet(self)
200 def _read_packet(self):
--> 201 x = self._read_all(4)
202 # most sftp servers won't accept packets larger than about 32k, so
203 # anything with the high byte set (> 16MB) is just garbage.
204 if byte_ord(x[0]):
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/paramiko/sftp.py:185, in BaseSFTP._read_all(self, n)
183 break
184 else:
--> 185 x = self.sock.recv(n)
187 if len(x) == 0:
188 raise EOFError()
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/paramiko/channel.py:699, in Channel.recv(self, nbytes)
686 """
687 Receive data from the channel. The return value is a string
688 representing the data received. The maximum amount of data to be
(...)
696 if no data is ready before the timeout set by `settimeout`.
697 """
698 try:
--> 699 out = self.in_buffer.read(nbytes, self.timeout)
700 except PipeTimeout:
701 raise socket.timeout()
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/paramiko/buffered_pipe.py:160, in BufferedPipe.read(self, nbytes, timeout)
158 while (len(self._buffer) == 0) and not self._closed:
159 then = time.time()
--> 160 self._cv.wait(timeout)
161 if timeout is not None:
162 timeout -= time.time() - then
File ~/.conda/envs/tensorflow/lib/python3.8/threading.py:302, in Condition.wait(self, timeout)
300 try: # restore state no matter what (e.g., KeyboardInterrupt)
301 if timeout is None:
--> 302 waiter.acquire()
303 gotit = True
304 else:
KeyboardInterrupt:
The MLflow tracking server works properly for all other operations. I am able to log params, metrics and artifacts, but I am not able to load a model or retrieve any of the artifacts.
Update:
Looks like a bug as per https://github.com/mlflow/mlflow/issues/5656.
Please upgrade mlflow; there is a known issue with version 1.26.0 (assuming that is the version you are using):
pip install mlflow==1.27.0
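A quick sanity check that the notebook environment actually picks up the upgraded version (a minimal sketch):
import mlflow
print(mlflow.__version__)  # should report 1.27.0 or later after the upgrade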
I am new to the world of parallelization and encountered a very odd bug while running, on several cores, a function that loads the same .npy file.
My code is of the form:
import os
from pathlib import Path
from joblib import Parallel, delayed
import multiprocessing
import numpy as np

num_cores = multiprocessing.cpu_count()

mydir = 'path/of/your/choice'
myfile = 'myArray.npy'
mydir = Path(mydir)
myfile = mydir / myfile
os.chdir(mydir)
myarray = np.zeros((12345))
np.save(myfile, myarray)

def foo(myfile, x):
    # function loading myArray and working with it
    arr = np.load(myfile)
    return arr + x

if __name__ == '__main__':
    foo_results = Parallel(n_jobs=num_cores, backend="threading")(
        delayed(foo)(myfile, i) for i in range(10))
In my case, this script would run fine about 40% of the way, then return
--> 17 arr=np.load(mydir/'myArray.npy')
ValueError: cannot reshape array of size 0 into shape (12345,)
What blows my mind is that if I enter %pdb debug mode and actually try to run arr=np.load(mydir/'myArray.npy'), this works! So I assume that the issue stems from all the parallel processes running foo trying to load the same numpy array at the same time (as in debug mode, all the processes are paused and only the code that I execute actually runs).
This very minimal example actually works, presumably because the function is so simple that joblib handles it gracefully, but my real code is too long and complicated to post here. First of all, has anyone encountered a similar issue in the past? If no one manages to identify my issue, I will post my whole script.
Thanks for your help!
-------------------- EDIT ------------------
Given that there doesn't seem to be an easy answer with the toy code that I posted, here are the full error logs. I played around with the backends following @psarka's recommendation, and for some reason the following error arises with the default loky backend (again, no problem running the code in a non-parallel manner):
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in ccg_stack(dp, U_src, U_trg, cbin, cwin, normalize, all_to_all, name, sav, again, periods)
541
542 ccg_results=Parallel(n_jobs=num_cores)(\
--> 543 delayed(ccg)(*ccg_inputs[i]) for i in tqdm(range(len(ccg_inputs)), desc=f'Computing ccgs over {num_cores} cores'))
544 for ((i1, u1, i2, u2), CCG) in zip(ccg_ids,ccg_results):
545 if i1==i2:
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~/miniconda3/envs/npyx/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
426 raise CancelledError()
427 elif self._state == FINISHED:
--> 428 return self.__get_result()
429
430 self._condition.wait(timeout)
~/miniconda3/envs/npyx/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
ValueError: Cannot load file containing pickled data when allow_pickle=False
but this arises with the threading backend (the one originally used in my question), which is more informative - again, it is possible to actually run train = np.load(Path(dprm,fn)) in debug mode:
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in ccg_stack(dp, U_src, U_trg, cbin, cwin, normalize, all_to_all, name, sav, again, periods)
541
542 ccg_results=Parallel(n_jobs=num_cores, backend='threading')(\
--> 543 delayed(ccg)(*ccg_inputs[i]) for i in tqdm(range(len(ccg_inputs)), desc=f'Computing ccgs over {num_cores} cores'))
544 for ((i1, u1, i2, u2), CCG) in zip(ccg_ids,ccg_results):
545 if i1==i2:
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~/miniconda3/envs/npyx/lib/python3.7/multiprocessing/pool.py in get(self, timeout)
655 return self._value
656 else:
--> 657 raise self._value
658
659 def _set(self, i, obj):
~/miniconda3/envs/npyx/lib/python3.7/multiprocessing/pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
119 job, i, func, args, kwds = task
120 try:
--> 121 result = (True, func(*args, **kwds))
122 except Exception as e:
123 if wrap_exception and func is not _helper_reraises_exception:
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/_parallel_backends.py in __call__(self, *args, **kwargs)
593 def __call__(self, *args, **kwargs):
594 try:
--> 595 return self.func(*args, **kwargs)
596 except KeyboardInterrupt as e:
597 # We capture the KeyboardInterrupt and reraise it as
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in __call__(self)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in <listcomp>(.0)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in ccg(dp, U, bin_size, win_size, fs, normalize, ret, sav, verbose, periods, again, trains)
258 if verbose: print("File {} not found in routines memory.".format(fn))
259 crosscorrelograms = crosscorrelate_cyrille(dp, bin_size, win_size, sortedU, fs, True,
--> 260 periods=periods, verbose=verbose, trains=trains)
261 crosscorrelograms = np.asarray(crosscorrelograms, dtype='float64')
262 if crosscorrelograms.shape[0]<len(U): # no spikes were found in this period
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in crosscorrelate_cyrille(dp, bin_size, win_size, U, fs, symmetrize, periods, verbose, trains)
88 U=list(U)
89
---> 90 spike_times, spike_clusters = make_phy_like_spikeClustersTimes(dp, U, periods=periods, verbose=verbose, trains=trains)
91
92 return crosscorr_cyrille(spike_times, spike_clusters, win_size, bin_size, fs, symmetrize)
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in make_phy_like_spikeClustersTimes(dp, U, periods, verbose, trains)
46 for iu, u in enumerate(U):
47 # Even lists of strings can be dealt with as integers by being replaced by their indices
---> 48 trains_dic[iu]=trn(dp, u, sav=True, periods=periods, verbose=verbose) # trains in samples
49 else:
50 assert len(trains)>1
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/spk_t.py in trn(dp, unit, sav, verbose, periods, again, enforced_rp)
106 if op.exists(Path(dprm,fn)) and not again:
107 if verbose: print("File {} found in routines memory.".format(fn))
--> 108 train = np.load(Path(dprm,fn))
109
110 # if not, compute it
~/miniconda3/envs/npyx/lib/python3.7/site-packages/numpy-1.21.0rc2-py3.7-linux-x86_64.egg/numpy/lib/npyio.py in load(file, mmap_mode, allow_pickle, fix_imports, encoding)
443 # Try a pickle
444 if not allow_pickle:
--> 445 raise ValueError("Cannot load file containing pickled data "
446 "when allow_pickle=False")
447 try:
ValueError: Cannot load file containing pickled data when allow_pickle=False
The original error ValueError: cannot reshape array of size 0 into shape (12345,) doesn't show up anymore for some reason.
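For what it's worth, here is a minimal sketch of one way to sidestep concurrent reads of the same .npy file (not the NeuroPyxels fix itself): load the array once in the parent process and pass the in-memory object to the workers, so no task touches the file.
import numpy as np
from joblib import Parallel, delayed

def foo(arr, x):
    # operate on the already-loaded array; no task calls np.load itself
    return arr + x

if __name__ == '__main__':
    arr = np.load('myArray.npy')  # single read, done before any parallel work starts
    results = Parallel(n_jobs=4, backend="threading")(
        delayed(foo)(arr, i) for i in range(10))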
I partitioned a large dataset into a sequence of parquet files using pandas.DataFrame.to_parquet and saved them to S3. I then read these into Dask on a cluster using dask.dataframe.read_parquet:
import dask.dataframe as dd
df = dd.read_parquet(
's3://aleksey-emr-dask/data/2019-taxi-dataset/',
storage_options={'key': 'secret', 'secret': 'secret'},
engine='fastparquet'
)
pandas uses snappy compression by default. fastparquet is able to work with this compression as long as you install the python-snappy and snappy packages. Since I am running on AWS EMR and using Dask's EMR example bootstrap script, I installed these packages from conda-forge using the --bootstrap-actions flag and the --conda-packages optional argument:
python3 -m pip list | grep snappy
python-snappy 0.5.4
This is enough to make dd.read_parquet succeed. However, certain operations fail with KeyError: 'snappy'. For example, this fails:
passenger_counts = df.trip_distance.value_counts().compute()
I know this is not an error with the cluster configuration because other operations, like this one, succeed:
vendors = df.VendorID.value_counts().compute()
> 2.0 53516733
> 1.0 30368157
> 4.0 267080
> Name: VendorID, dtype: int64
Which leads to my question: does Dask not support snappy compression, even though its IO engine (fastparquet in this case) does?
Here is the full body of the error message:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<timed exec> in <module>
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/dask/base.py in compute(self, **kwargs)
165 dask.base.compute
166 """
--> 167 (result,) = compute(self, traverse=False, **kwargs)
168 return result
169
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/dask/base.py in compute(*args, **kwargs)
445 postcomputes.append(x.__dask_postcompute__())
446
--> 447 results = schedule(dsk, keys, **kwargs)
448 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
449
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2686 should_rejoin = False
2687 try:
-> 2688 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
2689 finally:
2690 for f in futures.values():
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
1986 direct=direct,
1987 local_worker=local_worker,
-> 1988 asynchronous=asynchronous,
1989 )
1990
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
831 else:
832 return sync(
--> 833 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
834 )
835
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
337 if error[0]:
338 typ, exc, tb = error[0]
--> 339 raise exc.with_traceback(tb)
340 else:
341 return result[0]
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/utils.py in f()
321 if callback_timeout is not None:
322 future = asyncio.wait_for(future, callback_timeout)
--> 323 result[0] = yield future
324 except Exception as exc:
325 error[0] = sys.exc_info()
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/tornado/gen.py in run(self)
733
734 try:
--> 735 value = future.result()
736 except Exception:
737 exc_info = sys.exc_info()
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1874 else:
1875 self._gather_future = future
-> 1876 response = await future
1877
1878 if response["status"] == "error":
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in _gather_remote(self, direct, local_worker)
1925
1926 else: # ask scheduler to gather data for us
-> 1927 response = await retry_operation(self.scheduler.gather, keys=keys)
1928
1929 return response
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/utils_comm.py in retry_operation(coro, operation, *args, **kwargs)
388 delay_min=retry_delay_min,
389 delay_max=retry_delay_max,
--> 390 operation=operation,
391 )
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/utils_comm.py in retry(coro, count, delay_min, delay_max, jitter_fraction, retry_on_exceptions, operation)
368 delay *= 1 + random.random() * jitter_fraction
369 await asyncio.sleep(delay)
--> 370 return await coro()
371
372
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/core.py in send_recv_from_rpc(**kwargs)
859 name, comm.name = comm.name, "ConnectionPool." + key
860 try:
--> 861 result = await send_recv(comm=comm, op=key, **kwargs)
862 finally:
863 self.pool.reuse(self.addr, comm)
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/core.py in send_recv(comm, reply, serializers, deserializers, **kwargs)
642 await comm.write(msg, serializers=serializers, on_error="raise")
643 if reply:
--> 644 response = await comm.read(deserializers=deserializers)
645 else:
646 response = None
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/comm/tcp.py in read(self, deserializers)
204 deserialize=self.deserialize,
205 deserializers=deserializers,
--> 206 allow_offload=self.allow_offload,
207 )
208 except EOFError:
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/comm/utils.py in from_frames(frames, deserialize, deserializers, allow_offload)
85 res = await offload(_from_frames)
86 else:
---> 87 res = _from_frames()
88
89 return res
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/comm/utils.py in _from_frames()
64 try:
65 return protocol.loads(
---> 66 frames, deserialize=deserialize, deserializers=deserializers
67 )
68 except EOFError:
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/protocol/core.py in loads(frames, deserialize, deserializers)
126 if deserialize or key in bytestrings:
127 if "compression" in head:
--> 128 fs = decompress(head, fs)
129 fs = merge_frames(head, fs)
130 value = _deserialize(head, fs, deserializers=deserializers)
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/protocol/compression.py in decompress(header, frames)
214 return [
215 compressions[c]["decompress"](frame)
--> 216 for c, frame in zip(header["compression"], frames)
217 ]
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/protocol/compression.py in <listcomp>(.0)
214 return [
215 compressions[c]["decompress"](frame)
--> 216 for c, frame in zip(header["compression"], frames)
217 ]
KeyError: 'snappy'
You need to have snappy and python-snappy installed in the client environment as well, so that the client can use the codec to turn the bytes it receives from the workers back into data.
I'm accessing the cluster from a local Jupyter notebook on my machine via SSH port forwarding, and did not have these packages installed locally. Installing them in my local env:
$ conda install -c conda-forge snappy python-snappy
resolved the issue.
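As a follow-up, a minimal sketch (with a hypothetical scheduler address) of how to spot this kind of environment mismatch up front with distributed's built-in version check:
from distributed import Client

client = Client("tcp://scheduler-address:8786")  # hypothetical scheduler address
# Compares the versions of the packages distributed tracks on the client,
# scheduler and workers; a mismatch such as a library missing on the client
# is reported here.
client.get_versions(check=True)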
Problem:
I am trying to set up a model in SageMaker; however, it fails when it comes to downloading the data.
Does anyone know what I am doing wrong?
What I did so far:
In order to avoid any mistakes on my side I decided to use the AWS tutorial:
tensorflow_iris_dnn_classifier_using_estimators
And I made only two changes:
I copied the dataset to my own S3 bucket. --> I tested whether I could access / show the data and it worked.
I edited the path to point to the new folder.
This is the AWS source code:
https://github.com/awslabs/amazon-sagemaker-examples/tree/master/sagemaker-python-sdk/tensorflow_iris_dnn_classifier_using_estimators
%%time
import boto3
# use the region-specific sample data bucket
region = boto3.Session().region_name
#train_data_location = 's3://sagemaker-sample-data-{}/tensorflow/iris'.format(region)
train_data_location = 's3://my-s3-bucket'
iris_estimator.fit(train_data_location)
And this is the error I get:
/home/ec2-user/anaconda3/envs/tensorflow_p27/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
2115 magic_arg_s = self.var_expand(line, stack_depth)
2116 with self.builtin_trap:
-> 2117 result = fn(magic_arg_s, cell)
2118 return result
2119
<decorator-gen-60> in time(self, line, cell, local_ns)
/home/ec2-user/anaconda3/envs/tensorflow_p27/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
186 # but it's overkill for just that one bit of state.
187 def magic_deco(arg):
--> 188 call = lambda f, *a, **k: f(*a, **k)
189
190 if callable(arg):
/home/ec2-user/anaconda3/envs/tensorflow_p27/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
1191 else:
1192 st = clock2()
-> 1193 exec(code, glob, local_ns)
1194 end = clock2()
1195 out = None
<timed exec> in <module>()
/home/ec2-user/anaconda3/envs/tensorflow_p27/lib/python2.7/site-packages/sagemaker/tensorflow/estimator.pyc in fit(self, inputs, wait, logs, job_name, run_tensorboard_locally)
314 tensorboard.join()
315 else:
--> 316 fit_super()
317
318 #classmethod
/home/ec2-user/anaconda3/envs/tensorflow_p27/lib/python2.7/site-packages/sagemaker/tensorflow/estimator.pyc in fit_super()
293
294 def fit_super():
--> 295 super(TensorFlow, self).fit(inputs, wait, logs, job_name)
296
297 if run_tensorboard_locally and wait is False:
/home/ec2-user/anaconda3/envs/tensorflow_p27/lib/python2.7/site-packages/sagemaker/estimator.pyc in fit(self, inputs, wait, logs, job_name)
232 self.latest_training_job = _TrainingJob.start_new(self, inputs)
233 if wait:
--> 234 self.latest_training_job.wait(logs=logs)
235
236 def _compilation_job_name(self):
/home/ec2-user/anaconda3/envs/tensorflow_p27/lib/python2.7/site-packages/sagemaker/estimator.pyc in wait(self, logs)
571 def wait(self, logs=True):
572 if logs:
--> 573 self.sagemaker_session.logs_for_job(self.job_name, wait=True)
574 else:
575 self.sagemaker_session.wait_for_job(self.job_name)
/home/ec2-user/anaconda3/envs/tensorflow_p27/lib/python2.7/site-packages/sagemaker/session.pyc in logs_for_job(self, job_name, wait, poll)
1126
1127 if wait:
-> 1128 self._check_job_status(job_name, description, 'TrainingJobStatus')
1129 if dot:
1130 print()
/home/ec2-user/anaconda3/envs/tensorflow_p27/lib/python2.7/site-packages/sagemaker/session.pyc in _check_job_status(self, job, desc, status_key_name)
826 reason = desc.get('FailureReason', '(No reason provided)')
827 job_type = status_key_name.replace('JobStatus', ' job')
--> 828 raise ValueError('Error for {} {}: {} Reason: {}'.format(job_type, job, status, reason))
829
830 def wait_for_endpoint(self, endpoint, poll=5):
ValueError: Error for Training job sagemaker-tensorflow-2019-01-03-16-32-16-435: Failed Reason: ClientError: Data download failed:S3 key: s3://my-s3-bucket//sagemaker-tensorflow-2019-01-03-14-02-39-959/source/sourcedir.tar.gz has an illegal char sub-sequence '//' in it
The script expects 'bucket' to be bucket = Session().default_bucket() or your own bucket. Have you tried setting bucket equal to your personal bucket?
It looks like the full error message you received there was:
ClientError: Data download failed:S3 key: s3://my-s3-bucket//sagemaker-tensorflow-2019-01-03-14-02-39-959/source/sourcedir.tar.gz has an illegal char sub-sequence '//' in it
Does the problem persist even after fixing the key?
I had something similar. I had to change it to just the name of the output, with nothing preceding it, or it would give me that double '//' error. So just use 'my-s3-bucket'.
No, make sure it's just your output name, not the bucket name too. Mine was 'vanias bucket/results'; I changed it to just 'results' and it worked. Good luck!
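For reference, a minimal sketch of building the S3 URI without the stray slash (the prefix is hypothetical; iris_estimator is the estimator from the question):
import sagemaker

bucket = sagemaker.Session().default_bucket()  # or 'my-s3-bucket', with no trailing slash
train_data_location = 's3://{}/iris-data'.format(bucket)  # hypothetical prefix, single '/'
iris_estimator.fit(train_data_location)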
I have a notebook on Google Colab that fails with the following error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
93 exception = e
---> 94 raise e
95 finally: cb_handler.on_train_end(exception)
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
83 xb, yb = cb_handler.on_batch_begin(xb, yb)
---> 84 loss = loss_batch(model, xb, yb, loss_func, opt, cb_handler)
85 if cb_handler.on_batch_end(loss): break
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in loss_batch(model, xb, yb, loss_func, opt, cb_handler)
24 if opt is not None:
---> 25 loss = cb_handler.on_backward_begin(loss)
26 loss.backward()
/usr/local/lib/python3.6/dist-packages/fastai/callback.py in on_backward_begin(self, loss)
223 for cb in self.callbacks:
--> 224 a = cb.on_backward_begin(**self.state_dict)
225 if a is not None: self.state_dict['last_loss'] = a
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in on_backward_begin(self, smooth_loss, **kwargs)
266 if self.pbar is not None and hasattr(self.pbar,'child'):
--> 267 self.pbar.child.comment = f'{smooth_loss:.4f}'
268
/usr/local/lib/python3.6/dist-packages/torch/tensor.py in __format__(self, format_spec)
377 if self.dim() == 0:
--> 378 return self.item().__format__(format_spec)
379 return object.__format__(self, format_spec)
RuntimeError: CUDA error: device-side assert triggered
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
<ipython-input-33-dd390b1c8108> in <module>()
----> 1 lr_find(learn)
2 learn.recorder.plot()
/usr/local/lib/python3.6/dist-packages/fastai/train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, **kwargs)
26 cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
27 a = int(np.ceil(num_it/len(learn.data.train_dl)))
---> 28 learn.fit(a, start_lr, callbacks=[cb], **kwargs)
29
30 def to_fp16(learn:Learner, loss_scale:float=512., flat_master:bool=False)->Learner:
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
160 callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
161 fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
--> 162 callbacks=self.callbacks+callbacks)
163
164 def create_opt(self, lr:Floats, wd:Floats=0.)->None:
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
93 exception = e
94 raise e
---> 95 finally: cb_handler.on_train_end(exception)
96
97 loss_func_name2activ = {'cross_entropy_loss': partial(F.softmax, dim=1), 'nll_loss': torch.exp, 'poisson_nll_loss': torch.exp,
/usr/local/lib/python3.6/dist-packages/fastai/callback.py in on_train_end(self, exception)
254 def on_train_end(self, exception:Union[bool,Exception])->None:
255 "Handle end of training, `exception` is an `Exception` or False if no exceptions during training."
--> 256 self('train_end', exception=exception)
257
258 class AverageMetric(Callback):
/usr/local/lib/python3.6/dist-packages/fastai/callback.py in __call__(self, cb_name, call_mets, **kwargs)
185 "Call through to all of the `CallbakHandler` functions."
186 if call_mets: [getattr(met, f'on_{cb_name}')(**self.state_dict, **kwargs) for met in self.metrics]
--> 187 return [getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs) for cb in self.callbacks]
188
189 def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:
/usr/local/lib/python3.6/dist-packages/fastai/callback.py in <listcomp>(.0)
185 "Call through to all of the `CallbakHandler` functions."
186 if call_mets: [getattr(met, f'on_{cb_name}')(**self.state_dict, **kwargs) for met in self.metrics]
--> 187 return [getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs) for cb in self.callbacks]
188
189 def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:
/usr/local/lib/python3.6/dist-packages/fastai/callbacks/lr_finder.py in on_train_end(self, **kwargs)
45 # restore the valid_dl we turned of on `__init__`
46 self.data.valid_dl = self.valid_dl
---> 47 self.learn.load('tmp')
48 if hasattr(self.learn.model, 'reset'): self.learn.model.reset()
49 print('LR Finder complete, type {learner_name}.recorder.plot() to see the graph.')
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in load(self, name, device)
202 "Load model `name` from `self.model_dir` using `device`, defaulting to `self.data.device`."
203 if device is None: device = self.data.device
--> 204 self.model.load_state_dict(torch.load(self.path/self.model_dir/f'{name}.pth', map_location=device))
205 return self
206
/usr/local/lib/python3.6/dist-packages/torch/serialization.py in load(f, map_location, pickle_module)
356 f = open(f, 'rb')
357 try:
--> 358 return _load(f, map_location, pickle_module)
359 finally:
360 if new_fd:
/usr/local/lib/python3.6/dist-packages/torch/serialization.py in _load(f, map_location, pickle_module)
527 unpickler = pickle_module.Unpickler(f)
528 unpickler.persistent_load = persistent_load
--> 529 result = unpickler.load()
530
531 deserialized_storage_keys = pickle_module.load(f)
/usr/local/lib/python3.6/dist-packages/torch/serialization.py in persistent_load(saved_id)
493 if root_key not in deserialized_objects:
494 deserialized_objects[root_key] = restore_location(
--> 495 data_type(size), location)
496 storage = deserialized_objects[root_key]
497 if view_metadata is not None:
/usr/local/lib/python3.6/dist-packages/torch/serialization.py in restore_location(storage, location)
376 elif isinstance(map_location, torch.device):
377 def restore_location(storage, location):
--> 378 return default_restore_location(storage, str(map_location))
379 else:
380 def restore_location(storage, location):
/usr/local/lib/python3.6/dist-packages/torch/serialization.py in default_restore_location(storage, location)
102 def default_restore_location(storage, location):
103 for _, _, fn in _package_registry:
--> 104 result = fn(storage, location)
105 if result is not None:
106 return result
/usr/local/lib/python3.6/dist-packages/torch/serialization.py in _cuda_deserialize(obj, location)
84 'to an existing device.'.format(
85 device, torch.cuda.device_count()))
---> 86 return obj.cuda(device)
87
88
/usr/local/lib/python3.6/dist-packages/torch/_utils.py in _cuda(self, device, non_blocking, **kwargs)
74 else:
75 new_type = getattr(torch.cuda, self.__class__.__name__)
---> 76 return new_type(self.size()).copy_(self, non_blocking)
77
78
RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorCopy.cpp:20
There is no information about the real cause, so I tried to get a meaningful stack trace by forcing CUDA to launch kernels synchronously (as suggested here) using a cell like this:
!export CUDA_LAUNCH_BLOCKING=1
But this does not seem to work; I still get the same error.
Is there another way that works with Google Colab?
Make sure that your target values run from zero to number of classes - 1. For example, if you have 100 classes, your targets should range from 0 to 99.
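A minimal sketch of a sanity check (the tensors below are hypothetical) that catches out-of-range targets before they trigger the device-side assert:
import torch

num_classes = 100                      # hypothetical number of classes
targets = torch.tensor([0, 5, 99])     # hypothetical batch of labels
assert 0 <= targets.min().item() and targets.max().item() < num_classes, \
    "targets must lie in [0, num_classes - 1]"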
!export FOO=blah is usually not useful to run in a notebook because ! means run the following command in a sub-shell, so the effect of the statement is gone by the time the ! returns.
You might have more success by storing your python code in a file and then executing that file in a subshell:
In one cell:
%%writefile foo.py
[...your code...]
In the next cell:
!export CUDA_LAUNCH_BLOCKING=1; python3 foo.py
(or s/python3/python2/ if you're writing py2)
Switch Hardware Accelerator Type to "None" under Runtime -> Change Runtime Type. This should give you a more meaningful error message.
The proper way to set environment variables in Google Colab is to use os:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
Using the os library will allow you to set whatever environment variables you need. Setting CUDA_LAUNCH_BLOCKING this way enables proper CUDA tracebacks in Google Colab.