Mlflow "load_model" goes in deadlock - python

Loading a model from a past run in MLflow, inside JupyterLab, never finishes. After waiting for hours, interrupting the cell produces the traceback below.
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
Input In [21], in <cell line: 2>()
1 logged_model = 'runs:/7f6932baef144fa69847ba11ef66f8e6/model/'
----> 2 loaded_model = mlflow.tensorflow.load_model(logged_model)
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/mlflow/tensorflow/__init__.py:397, in load_model(model_uri, dst_path)
360 def load_model(model_uri, dst_path=None):
361 """
362 Load an MLflow model that contains the TensorFlow flavor from the specified path.
363
(...)
395 for _, output_signature in signature_definition.outputs.items()]
396 """
--> 397 local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path)
398 flavor_conf = _get_flavor_configuration(local_model_path, FLAVOR_NAME)
399 _add_code_from_conf_to_system_path(local_model_path, flavor_conf)
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/mlflow/tracking/artifact_utils.py:95, in _download_artifact_from_uri(artifact_uri, output_path)
92 parsed_uri = parsed_uri._replace(path=posixpath.dirname(parsed_uri.path))
93 root_uri = prefix + urllib.parse.urlunparse(parsed_uri)
---> 95 return get_artifact_repository(artifact_uri=root_uri).download_artifacts(
96 artifact_path=artifact_path, dst_path=output_path
97 )
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/mlflow/store/artifact/runs_artifact_repo.py:125, in RunsArtifactRepository.download_artifacts(self, artifact_path, dst_path)
110 def download_artifacts(self, artifact_path, dst_path=None):
111 """
112 Download an artifact file or directory to a local directory if applicable, and return a
113 local path for it.
(...)
123 :return: Absolute path of the local filesystem location containing the desired artifacts.
124 """
--> 125 return self.repo.download_artifacts(artifact_path, dst_path)
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/mlflow/store/artifact/artifact_repo.py:242, in ArtifactRepository.download_artifacts(self, artifact_path, dst_path)
240 # Check if the artifacts points to a directory
241 if self._is_directory(artifact_path):
--> 242 dst_local_path, inflight_downloads = async_download_artifact_dir(
243 src_artifact_dir_path=artifact_path, dst_local_dir_path=dst_path
244 )
245 else:
246 inflight_downloads = async_download_artifact(
247 src_artifact_path=artifact_path, dst_local_dir_path=dst_path
248 )
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/mlflow/store/artifact/artifact_repo.py:208, in ArtifactRepository.download_artifacts.<locals>.async_download_artifact_dir(src_artifact_dir_path, dst_local_dir_path)
206 for file_info in dir_content:
207 if file_info.is_dir:
--> 208 inflight_downloads += async_download_artifact_dir(
209 src_artifact_dir_path=file_info.path,
210 dst_local_dir_path=dst_local_dir_path,
211 )[2]
212 else:
213 inflight_downloads += async_download_artifact(
214 src_artifact_path=file_info.path,
215 dst_local_dir_path=dst_local_dir_path,
216 )
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/mlflow/store/artifact/artifact_repo.py:199, in ArtifactRepository.download_artifacts.<locals>.async_download_artifact_dir(src_artifact_dir_path, dst_local_dir_path)
195 local_dir = os.path.join(dst_local_dir_path, src_artifact_dir_path)
196 inflight_downloads = []
197 dir_content = [ # prevent infinite loop, sometimes the dir is recursively included
198 file_info
--> 199 for file_info in self.list_artifacts(src_artifact_dir_path)
200 if file_info.path != "." and file_info.path != src_artifact_dir_path
201 ]
202 if not dir_content: # empty dir
203 if not os.path.exists(local_dir):
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/mlflow/store/artifact/sftp_artifact_repo.py:94, in SFTPArtifactRepository.list_artifacts(self, path)
92 artifact_dir = self.path
93 list_dir = posixpath.join(artifact_dir, path) if path else artifact_dir
---> 94 if not self.sftp.isdir(list_dir):
95 return []
96 artifact_files = self.sftp.listdir(list_dir)
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/pysftp/__init__.py:652, in Connection.isdir(self, remotepath)
650 self._sftp_connect()
651 try:
--> 652 result = S_ISDIR(self._sftp.stat(remotepath).st_mode)
653 except IOError: # no such file
654 result = False
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/paramiko/sftp_client.py:493, in SFTPClient.stat(self, path)
491 path = self._adjust_cwd(path)
492 self._log(DEBUG, "stat({!r})".format(path))
--> 493 t, msg = self._request(CMD_STAT, path)
494 if t != CMD_ATTRS:
495 raise SFTPError("Expected attributes")
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/paramiko/sftp_client.py:822, in SFTPClient._request(self, t, *arg)
820 def _request(self, t, *arg):
821 num = self._async_request(type(None), t, *arg)
--> 822 return self._read_response(num)
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/paramiko/sftp_client.py:852, in SFTPClient._read_response(self, waitfor)
850 while True:
851 try:
--> 852 t, data = self._read_packet()
853 except EOFError as e:
854 raise SSHException("Server connection dropped: {}".format(e))
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/paramiko/sftp.py:201, in BaseSFTP._read_packet(self)
200 def _read_packet(self):
--> 201 x = self._read_all(4)
202 # most sftp servers won't accept packets larger than about 32k, so
203 # anything with the high byte set (> 16MB) is just garbage.
204 if byte_ord(x[0]):
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/paramiko/sftp.py:185, in BaseSFTP._read_all(self, n)
183 break
184 else:
--> 185 x = self.sock.recv(n)
187 if len(x) == 0:
188 raise EOFError()
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/paramiko/channel.py:699, in Channel.recv(self, nbytes)
686 """
687 Receive data from the channel. The return value is a string
688 representing the data received. The maximum amount of data to be
(...)
696 if no data is ready before the timeout set by `settimeout`.
697 """
698 try:
--> 699 out = self.in_buffer.read(nbytes, self.timeout)
700 except PipeTimeout:
701 raise socket.timeout()
File ~/.conda/envs/tensorflow/lib/python3.8/site-packages/paramiko/buffered_pipe.py:160, in BufferedPipe.read(self, nbytes, timeout)
158 while (len(self._buffer) == 0) and not self._closed:
159 then = time.time()
--> 160 self._cv.wait(timeout)
161 if timeout is not None:
162 timeout -= time.time() - then
File ~/.conda/envs/tensorflow/lib/python3.8/threading.py:302, in Condition.wait(self, timeout)
300 try: # restore state no matter what (e.g., KeyboardInterrupt)
301 if timeout is None:
--> 302 waiter.acquire()
303 gotit = True
304 else:
KeyboardInterrupt:
The MLflow tracking server works properly for all other operations: I am able to log params, metrics and artifacts. But I am not able to load a model or retrieve any of the artifacts.
Update:
Looks like a bug as per https://github.com/mlflow/mlflow/issues/5656.

Please upgrade MLflow; there is a known issue with version 1.26.0:
pip install mlflow==1.27.0
This assumes you are currently on the affected version (1.26.0).
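For reference, a minimal sketch of the fix, assuming the tracking URI and SFTP artifact store are already configured in the environment (the run ID below is the one from the question):
# In the environment that loads the model, upgrade the client first:
#   pip install mlflow==1.27.0
import mlflow

# Reuse whatever tracking configuration your logging code already uses
# (e.g. the MLFLOW_TRACKING_URI environment variable or mlflow.set_tracking_uri).
logged_model = 'runs:/7f6932baef144fa69847ba11ef66f8e6/model'
loaded_model = mlflow.tensorflow.load_model(logged_model)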

Related

FSSpec Error Handling in Python - Timeout Error

I am trying to get TerraClimate data from Microsoft Planetary Computer and I am facing a timeout error. Is there a possibility of increasing the timeout? Please find the code and the error below. I am using fsspec and xarray to download spatial data from the MS Planetary Computer portal.
import fsspec
import xarray as xr
store = fsspec.get_mapper(asset.href)
data = xr.open_zarr(store, **asset.extra_fields["xarray:open_kwargs"])
clipped_data = data.sel(time=slice('2015-01-01','2019-12-31'),lon=slice(min_lon,max_lon),lat=slice(max_lat,min_lat))
parsed_data = clipped_data[['tmax', 'tmin', 'ppt', 'soil']]
lat_list = parsed_data['lat'].values.tolist()
lon_list = parsed_data['lon'].values.tolist()
filename = "Soil_Moisture_sample.csv"
for (i, j) in zip(lat_list, lon_list):
    parsed_data[["soil","tmax","tmin","ppt"]].sel(lon=i, lat=j, method="nearest").to_dataframe().to_csv(filename, mode='a', index=False, header=False)
I am getting the following error
TimeoutError Traceback (most recent call last)
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:53, in _runner(event, coro, result, timeout)
52 try:
---> 53 result[0] = await coro
54 except Exception as ex:
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:423, in AsyncFileSystem._cat(self, path, recursive, on_error, batch_size, **kwargs)
422 if ex:
--> 423 raise ex
424 if (
425 len(paths) > 1
426 or isinstance(path, list)
427 or paths[0] != self._strip_protocol(path)
428 ):
File ~\Anaconda3\envs\satellite\lib\asyncio\tasks.py:455, in wait_for(fut, timeout, loop)
454 if timeout is None:
--> 455 return await fut
457 if timeout <= 0:
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\implementations\http.py:221, in HTTPFileSystem._cat_file(self, url, start, end, **kwargs)
220 async with session.get(url, **kw) as r:
--> 221 out = await r.read()
222 self._raise_not_found_for_status(r, url)
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\client_reqrep.py:1036, in ClientResponse.read(self)
1035 try:
-> 1036 self._body = await self.content.read()
1037 for trace in self._traces:
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\streams.py:375, in StreamReader.read(self, n)
374 while True:
--> 375 block = await self.readany()
376 if not block:
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\streams.py:397, in StreamReader.readany(self)
396 while not self._buffer and not self._eof:
--> 397 await self._wait("readany")
399 return self._read_nowait(-1)
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\streams.py:304, in StreamReader._wait(self, func_name)
303 with self._timer:
--> 304 await waiter
305 else:
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\helpers.py:721, in TimerContext.__exit__(self, exc_type, exc_val, exc_tb)
720 if exc_type is asyncio.CancelledError and self._cancelled:
--> 721 raise asyncio.TimeoutError from None
722 return None
TimeoutError:
The above exception was the direct cause of the following exception:
FSTimeoutError Traceback (most recent call last)
Input In [62], in <cell line: 3>()
1 # Flood Region Point - Thiruvanthpuram
2 filename = "Soil_Moisture_sample.csv"
----> 3 parsed_data[["soil","tmax","tmin","ppt"]].sel(lon=8.520833, lat=76.4375, method="nearest").to_dataframe().to_csv(filename,mode='a',index=False, header=False)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\dataset.py:5898, in Dataset.to_dataframe(self, dim_order)
5870 """Convert this dataset into a pandas.DataFrame.
5871
5872 Non-index variables in this dataset form the columns of the
(...)
5893
5894 """
5896 ordered_dims = self._normalize_dim_order(dim_order=dim_order)
-> 5898 return self._to_dataframe(ordered_dims=ordered_dims)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\dataset.py:5862, in Dataset._to_dataframe(self, ordered_dims)
5860 def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
5861 columns = [k for k in self.variables if k not in self.dims]
-> 5862 data = [
5863 self._variables[k].set_dims(ordered_dims).values.reshape(-1)
5864 for k in columns
5865 ]
5866 index = self.coords.to_index([*ordered_dims])
5867 return pd.DataFrame(dict(zip(columns, data)), index=index)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\dataset.py:5863, in <listcomp>(.0)
5860 def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
5861 columns = [k for k in self.variables if k not in self.dims]
5862 data = [
-> 5863 self._variables[k].set_dims(ordered_dims).values.reshape(-1)
5864 for k in columns
5865 ]
5866 index = self.coords.to_index([*ordered_dims])
5867 return pd.DataFrame(dict(zip(columns, data)), index=index)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\variable.py:527, in Variable.values(self)
524 @property
525 def values(self):
526 """The variable's data as a numpy.ndarray"""
--> 527 return _as_array_or_item(self._data)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\variable.py:267, in _as_array_or_item(data)
253 def _as_array_or_item(data):
254 """Return the given values as a numpy array, or as an individual item if
255 it's a 0d datetime64 or timedelta64 array.
256
(...)
265 TODO: remove this (replace with np.asarray) once these issues are fixed
266 """
--> 267 data = np.asarray(data)
268 if data.ndim == 0:
269 if data.dtype.kind == "M":
File ~\AppData\Roaming\Python\Python38\site-packages\dask\array\core.py:1696, in Array.__array__(self, dtype, **kwargs)
1695 def __array__(self, dtype=None, **kwargs):
-> 1696 x = self.compute()
1697 if dtype and x.dtype != dtype:
1698 x = x.astype(dtype)
File ~\AppData\Roaming\Python\Python38\site-packages\dask\base.py:315, in DaskMethodsMixin.compute(self, **kwargs)
291 def compute(self, **kwargs):
292 """Compute this dask collection
293
294 This turns a lazy Dask collection into its in-memory equivalent.
(...)
313 dask.base.compute
314 """
--> 315 (result,) = compute(self, traverse=False, **kwargs)
316 return result
File ~\AppData\Roaming\Python\Python38\site-packages\dask\base.py:600, in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
597 keys.append(x.__dask_keys__())
598 postcomputes.append(x.__dask_postcompute__())
--> 600 results = schedule(dsk, keys, **kwargs)
601 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
File ~\AppData\Roaming\Python\Python38\site-packages\dask\threaded.py:89, in get(dsk, keys, cache, num_workers, pool, **kwargs)
86 elif isinstance(pool, multiprocessing.pool.Pool):
87 pool = MultiprocessingPoolExecutor(pool)
---> 89 results = get_async(
90 pool.submit,
91 pool._max_workers,
92 dsk,
93 keys,
94 cache=cache,
95 get_id=_thread_get_id,
96 pack_exception=pack_exception,
97 **kwargs,
98 )
100 # Cleanup pools associated to dead threads
101 with pools_lock:
File ~\AppData\Roaming\Python\Python38\site-packages\dask\local.py:511, in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
509 _execute_task(task, data) # Re-execute locally
510 else:
--> 511 raise_exception(exc, tb)
512 res, worker_id = loads(res_info)
513 state["cache"][key] = res
File ~\AppData\Roaming\Python\Python38\site-packages\dask\local.py:319, in reraise(exc, tb)
317 if exc.__traceback__ is not tb:
318 raise exc.with_traceback(tb)
--> 319 raise exc
File ~\AppData\Roaming\Python\Python38\site-packages\dask\local.py:224, in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
222 try:
223 task, data = loads(task_info)
--> 224 result = _execute_task(task, data)
225 id = get_id()
226 result = dumps((result, id))
File ~\AppData\Roaming\Python\Python38\site-packages\dask\core.py:119, in _execute_task(arg, cache, dsk)
115 func, args = arg[0], arg[1:]
116 # Note: Don't assign the subtask results to a variable. numpy detects
117 # temporaries by their reference count and can execute certain
118 # operations in-place.
--> 119 return func(*(_execute_task(a, cache) for a in args))
120 elif not ishashable(arg):
121 return arg
File ~\AppData\Roaming\Python\Python38\site-packages\dask\array\core.py:128, in getter(a, b, asarray, lock)
123 # Below we special-case `np.matrix` to force a conversion to
124 # `np.ndarray` and preserve original Dask behavior for `getter`,
125 # as for all purposes `np.matrix` is array-like and thus
126 # `is_arraylike` evaluates to `True` in that case.
127 if asarray and (not is_arraylike(c) or isinstance(c, np.matrix)):
--> 128 c = np.asarray(c)
129 finally:
130 if lock:
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\indexing.py:459, in ImplicitToExplicitIndexingAdapter.__array__(self, dtype)
458 def __array__(self, dtype=None):
--> 459 return np.asarray(self.array, dtype=dtype)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\indexing.py:623, in CopyOnWriteArray.__array__(self, dtype)
622 def __array__(self, dtype=None):
--> 623 return np.asarray(self.array, dtype=dtype)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\indexing.py:524, in LazilyIndexedArray.__array__(self, dtype)
522 def __array__(self, dtype=None):
523 array = as_indexable(self.array)
--> 524 return np.asarray(array[self.key], dtype=None)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\backends\zarr.py:76, in ZarrArrayWrapper.__getitem__(self, key)
74 array = self.get_array()
75 if isinstance(key, indexing.BasicIndexer):
---> 76 return array[key.tuple]
77 elif isinstance(key, indexing.VectorizedIndexer):
78 return array.vindex[
79 indexing._arrayize_vectorized_indexer(key, self.shape).tuple
80 ]
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:788, in Array.__getitem__(self, selection)
786 result = self.vindex[selection]
787 else:
--> 788 result = self.get_basic_selection(pure_selection, fields=fields)
789 return result
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:914, in Array.get_basic_selection(self, selection, out, fields)
911 return self._get_basic_selection_zd(selection=selection, out=out,
912 fields=fields)
913 else:
--> 914 return self._get_basic_selection_nd(selection=selection, out=out,
915 fields=fields)
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:957, in Array._get_basic_selection_nd(self, selection, out, fields)
951 def _get_basic_selection_nd(self, selection, out=None, fields=None):
952 # implementation of basic selection for array with at least one dimension
953
954 # setup indexer
955 indexer = BasicIndexer(selection, self)
--> 957 return self._get_selection(indexer=indexer, out=out, fields=fields)
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:1247, in Array._get_selection(self, indexer, out, fields)
1241 if not hasattr(self.chunk_store, "getitems") or \
1242 any(map(lambda x: x == 0, self.shape)):
1243 # sequentially get one key at a time from storage
1244 for chunk_coords, chunk_selection, out_selection in indexer:
1245
1246 # load chunk selection into output array
-> 1247 self._chunk_getitem(chunk_coords, chunk_selection, out, out_selection,
1248 drop_axes=indexer.drop_axes, fields=fields)
1249 else:
1250 # allow storage to get multiple items at once
1251 lchunk_coords, lchunk_selection, lout_selection = zip(*indexer)
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:1939, in Array._chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop_axes, fields)
1935 ckey = self._chunk_key(chunk_coords)
1937 try:
1938 # obtain compressed data for chunk
-> 1939 cdata = self.chunk_store[ckey]
1941 except KeyError:
1942 # chunk not initialized
1943 if self._fill_value is not None:
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\storage.py:717, in KVStore.__getitem__(self, key)
716 def __getitem__(self, key):
--> 717 return self._mutable_mapping[key]
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\mapping.py:137, in FSMap.__getitem__(self, key, default)
135 k = self._key_to_str(key)
136 try:
--> 137 result = self.fs.cat(k)
138 except self.missing_exceptions:
139 if default is not None:
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:111, in sync_wrapper.<locals>.wrapper(*args, **kwargs)
108 @functools.wraps(func)
109 def wrapper(*args, **kwargs):
110 self = obj or args[0]
--> 111 return sync(self.loop, func, *args, **kwargs)
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:94, in sync(loop, func, timeout, *args, **kwargs)
91 return_result = result[0]
92 if isinstance(return_result, asyncio.TimeoutError):
93 # suppress asyncio.TimeoutError, raise FSTimeoutError
---> 94 raise FSTimeoutError from return_result
95 elif isinstance(return_result, BaseException):
96 raise return_result
FSTimeoutError:
In the line:
store = fsspec.get_mapper(asset.href)
You can pass extra arguments to the fsspec backend, in this case HTTP; see fsspec.implementations.http.HTTPFileSystem. Here, client_kwargs get passed to aiohttp.ClientSession and include an optional timeout argument. Your call may look something like:
from aiohttp import ClientTimeout
store = fsspec.get_mapper(asset.href, client_kwargs={"timeout": ClientTimeout(total=5000, connect=1000)})
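Putting that together with the snippet from the question, a minimal sketch (the timeout values are placeholders to tune for your connection, and asset comes from the Planetary Computer query as in the original code):
import fsspec
import xarray as xr
from aiohttp import ClientTimeout

# client_kwargs are forwarded to aiohttp.ClientSession; total/connect are in seconds.
store = fsspec.get_mapper(
    asset.href,
    client_kwargs={"timeout": ClientTimeout(total=5000, connect=1000)},
)
data = xr.open_zarr(store, **asset.extra_fields["xarray:open_kwargs"])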

estimator.fit hangs on SageMaker in local mode

I am trying to train a PyTorch model using SageMaker in local mode, but whenever I call estimator.fit the code hangs indefinitely and I have to interrupt the notebook kernel. This happens both on my local machine and in SageMaker Studio. When I use EC2, however, the training runs normally.
Here is the call to the estimator, and the stack trace once I interrupt the kernel:
import sagemaker
from sagemaker.pytorch import PyTorch
bucket = "bucket-name"
role = sagemaker.get_execution_role()
training_input_path = f"s3://{bucket}/dataset/path"
sagemaker_session = sagemaker.LocalSession()
sagemaker_session.config = {"local": {"local_code": True}}
output_path = "file://."
estimator = PyTorch(
    entry_point="train.py",
    source_dir="src",
    hyperparameters={"max-epochs": 1},
    framework_version="1.8",
    py_version="py3",
    instance_count=1,
    instance_type="local",
    role=role,
    output_path=output_path,
    sagemaker_session=sagemaker_session,
)
estimator.fit({"training": training_input_path})
Stack trace:
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-9-35cdd6021288> in <module>
----> 1 estimator.fit({"training": training_input_path})
/opt/conda/lib/python3.7/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config)
678 self._prepare_for_training(job_name=job_name)
679
--> 680 self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_config)
681 self.jobs.append(self.latest_training_job)
682 if wait:
/opt/conda/lib/python3.7/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs, experiment_config)
1450 """
1451 train_args = cls._get_train_args(estimator, inputs, experiment_config)
-> 1452 estimator.sagemaker_session.train(**train_args)
1453
1454 return cls(estimator.sagemaker_session, estimator._current_job_name)
/opt/conda/lib/python3.7/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image_uri, algorithm_arn, encrypt_inter_container_traffic, use_spot_instances, checkpoint_s3_uri, checkpoint_local_path, experiment_config, debugger_rule_configs, debugger_hook_config, tensorboard_output_config, enable_sagemaker_metrics, profiler_rule_configs, profiler_config, environment, retry_strategy)
572 LOGGER.info("Creating training-job with name: %s", job_name)
573 LOGGER.debug("train request: %s", json.dumps(train_request, indent=4))
--> 574 self.sagemaker_client.create_training_job(**train_request)
575
576 def _get_train_request( # noqa: C901
/opt/conda/lib/python3.7/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs)
184 hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {}
185 logger.info("Starting training job")
--> 186 training_job.start(InputDataConfig, OutputDataConfig, hyperparameters, TrainingJobName)
187
188 LocalSagemakerClient._training_jobs[TrainingJobName] = training_job
/opt/conda/lib/python3.7/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, job_name)
219
220 self.model_artifacts = self.container.train(
--> 221 input_data_config, output_data_config, hyperparameters, job_name
222 )
223 self.end_time = datetime.datetime.now()
/opt/conda/lib/python3.7/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name)
200 data_dir = self._create_tmp_folder()
201 volumes = self._prepare_training_volumes(
--> 202 data_dir, input_data_config, output_data_config, hyperparameters
203 )
204 # If local, source directory needs to be updated to mounted /opt/ml/code path
/opt/conda/lib/python3.7/site-packages/sagemaker/local/image.py in _prepare_training_volumes(self, data_dir, input_data_config, output_data_config, hyperparameters)
487 os.mkdir(channel_dir)
488
--> 489 data_source = sagemaker.local.data.get_data_source_instance(uri, self.sagemaker_session)
490 volumes.append(_Volume(data_source.get_root_dir(), channel=channel_name))
491
/opt/conda/lib/python3.7/site-packages/sagemaker/local/data.py in get_data_source_instance(data_source, sagemaker_session)
52 return LocalFileDataSource(parsed_uri.netloc + parsed_uri.path)
53 if parsed_uri.scheme == "s3":
---> 54 return S3DataSource(parsed_uri.netloc, parsed_uri.path, sagemaker_session)
55 raise ValueError(
56 "data_source must be either file or s3. parsed_uri.scheme: {}".format(parsed_uri.scheme)
/opt/conda/lib/python3.7/site-packages/sagemaker/local/data.py in __init__(self, bucket, prefix, sagemaker_session)
183 working_dir = "/private{}".format(working_dir)
184
--> 185 sagemaker.utils.download_folder(bucket, prefix, working_dir, sagemaker_session)
186 self.files = LocalFileDataSource(working_dir)
187
/opt/conda/lib/python3.7/site-packages/sagemaker/utils.py in download_folder(bucket_name, prefix, target, sagemaker_session)
286 raise
287
--> 288 _download_files_under_prefix(bucket_name, prefix, target, s3)
289
290
/opt/conda/lib/python3.7/site-packages/sagemaker/utils.py in _download_files_under_prefix(bucket_name, prefix, target, s3)
314 if exc.errno != errno.EEXIST:
315 raise
--> 316 obj.download_file(file_path)
317
318
/opt/conda/lib/python3.7/site-packages/boto3/s3/inject.py in object_download_file(self, Filename, ExtraArgs, Callback, Config)
313 return self.meta.client.download_file(
314 Bucket=self.bucket_name, Key=self.key, Filename=Filename,
--> 315 ExtraArgs=ExtraArgs, Callback=Callback, Config=Config)
316
317
/opt/conda/lib/python3.7/site-packages/boto3/s3/inject.py in download_file(self, Bucket, Key, Filename, ExtraArgs, Callback, Config)
171 return transfer.download_file(
172 bucket=Bucket, key=Key, filename=Filename,
--> 173 extra_args=ExtraArgs, callback=Callback)
174
175
/opt/conda/lib/python3.7/site-packages/boto3/s3/transfer.py in download_file(self, bucket, key, filename, extra_args, callback)
305 bucket, key, filename, extra_args, subscribers)
306 try:
--> 307 future.result()
308 # This is for backwards compatibility where when retries are
309 # exceeded we need to throw the same error from boto3 instead of
/opt/conda/lib/python3.7/site-packages/s3transfer/futures.py in result(self)
107 except KeyboardInterrupt as e:
108 self.cancel()
--> 109 raise e
110
111 def cancel(self):
/opt/conda/lib/python3.7/site-packages/s3transfer/futures.py in result(self)
104 # however if a KeyboardInterrupt is raised we want want to exit
105 # out of this and propogate the exception.
--> 106 return self._coordinator.result()
107 except KeyboardInterrupt as e:
108 self.cancel()
/opt/conda/lib/python3.7/site-packages/s3transfer/futures.py in result(self)
258 # possible value integer value, which is on the scale of billions of
259 # years...
--> 260 self._done_event.wait(MAXINT)
261
262 # Once done waiting, raise an exception if present or return the
/opt/conda/lib/python3.7/threading.py in wait(self, timeout)
550 signaled = self._flag
551 if not signaled:
--> 552 signaled = self._cond.wait(timeout)
553 return signaled
554
/opt/conda/lib/python3.7/threading.py in wait(self, timeout)
294 try: # restore state no matter what (e.g., KeyboardInterrupt)
295 if timeout is None:
--> 296 waiter.acquire()
297 gotit = True
298 else:
KeyboardInterrupt:
SageMaker Studio does not natively support local mode. Studio Apps are themselves Docker containers, and therefore they would require privileged access to be able to build and run Docker containers.
As an alternative solution, you can create a remote Docker host on an EC2 instance and set up Docker on your Studio App. There is quite a bit of networking and package installation involved, but this will enable you to use full Docker functionality. Additionally, as of version 2.80.0 of the SageMaker Python SDK, local mode is supported when you are using a remote Docker host.
sdocker, the SageMaker Studio Docker CLI extension (see this repo), can simplify deploying the above solution in two simple steps (it only works for a Studio Domain in VPCOnly mode), and it has an easy-to-follow example here.
UPDATE:
There is now a UI extension (see repo) which can make the experience much smoother and easier to manage.
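As a rough sketch of what the remote-Docker-host route looks like once the networking is in place (the host address is a placeholder, and relying on the standard DOCKER_HOST environment variable is an assumption here; the sdocker extension automates this wiring for you):
import os
import sagemaker
from sagemaker.pytorch import PyTorch

# Assumption: an EC2 instance runs a Docker daemon reachable from Studio,
# e.g. over an SSH tunnel or TCP; the Docker CLI and docker-compose honour DOCKER_HOST.
os.environ["DOCKER_HOST"] = "tcp://10.0.0.12:1111"  # placeholder address and port

session = sagemaker.LocalSession()
session.config = {"local": {"local_code": True}}

estimator = PyTorch(
    entry_point="train.py",
    source_dir="src",
    framework_version="1.8",
    py_version="py3",
    instance_count=1,
    instance_type="local",  # local mode, executed on the remote Docker host
    role=sagemaker.get_execution_role(),
    sagemaker_session=session,
)
estimator.fit({"training": "s3://bucket-name/dataset/path"})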

(Memory) Problem when accessing Dask type arrays

I need to load some meteorological data to analyse several months, but the data is stored in files that each cover only one day, so I need to access many files at once.
I am following some pre-given instructions that told me to set up a memory-limited Dask client on my computer.
from datetime import datetime, timedelta
import dask.array as da
from dask.distributed import Client, LocalCluster
import xarray
try:
    client
except NameError:
    client = Client(n_workers=1, threads_per_worker=4, memory_limit='2GB')
else:
    print("Client already exists")
After this, I create an array of dates that goes from 1st June to 1st October, which is needed in "files" to build the links to the meteorological data.
dates=[datetime(2019,6,1) + timedelta(days=i) for i in range(3*30)]
files= [date.strftime('http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/%Y/%m/wrf_arw_det_history_d03_%Y%m%d_0000.nc4') for date in dates]
My issue starts when I try to open all that data with:
multi = xarray.open_mfdataset(files, preprocess= lambda a : a.isel(time=slice(0,24)))
It raises the error:
KeyError Traceback (most recent call last)
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in _acquire_with_cache_info(self, needs_lock)
197 try:
--> 198 file = self._cache[self._key]
199 except KeyError:
~\Nueva carpeta\lib\site-packages\xarray\backends\lru_cache.py in __getitem__(self, key)
52 with self._lock:
---> 53 value = self._cache[key]
54 self._cache.move_to_end(key)
KeyError: [<class 'netCDF4._netCDF4.Dataset'>, ('http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/2019/06/wrf_arw_det_history_d03_20190626_0000.nc4',), 'r', (('clobber', True), ('diskless', False), ('format', 'NETCDF4'), ('persist', False))]
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
<ipython-input-19-c3d0f4a8cc26> in <module>
----> 1 multi = xarray.open_mfdataset(files, preprocess= lambda a : a.isel(time=slice(0,24)))
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in open_mfdataset(paths, chunks, concat_dim, compat, preprocess, engine, lock, data_vars, coords, combine, autoclose, parallel, join, attrs_file, **kwargs)
916 getattr_ = getattr
917
--> 918 datasets = [open_(p, **open_kwargs) for p in paths]
919 file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
920 if preprocess is not None:
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in <listcomp>(.0)
916 getattr_ = getattr
917
--> 918 datasets = [open_(p, **open_kwargs) for p in paths]
919 file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
920 if preprocess is not None:
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in open_dataset(filename_or_obj, group, decode_cf, mask_and_scale, decode_times, autoclose, concat_characters, decode_coords, engine, chunks, lock, cache, drop_variables, backend_kwargs, use_cftime, decode_timedelta)
507 if engine == "netcdf4":
508 store = backends.NetCDF4DataStore.open(
--> 509 filename_or_obj, group=group, lock=lock, **backend_kwargs
510 )
511 elif engine == "scipy":
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in open(cls, filename, mode, format, group, clobber, diskless, persist, lock, lock_maker, autoclose)
356 netCDF4.Dataset, filename, mode=mode, kwargs=kwargs
357 )
--> 358 return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)
359
360 def _acquire(self, needs_lock=True):
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in __init__(self, manager, group, mode, lock, autoclose)
312 self._group = group
313 self._mode = mode
--> 314 self.format = self.ds.data_model
315 self._filename = self.ds.filepath()
316 self.is_remote = is_remote_uri(self._filename)
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in ds(self)
365 @property
366 def ds(self):
--> 367 return self._acquire()
368
369 def open_store_variable(self, name, var):
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in _acquire(self, needs_lock)
359
360 def _acquire(self, needs_lock=True):
--> 361 with self._manager.acquire_context(needs_lock) as root:
362 ds = _nc4_require_group(root, self._group, self._mode)
363 return ds
~\Nueva carpeta\lib\contextlib.py in __enter__(self)
110 del self.args, self.kwds, self.func
111 try:
--> 112 return next(self.gen)
113 except StopIteration:
114 raise RuntimeError("generator didn't yield") from None
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in acquire_context(self, needs_lock)
184 def acquire_context(self, needs_lock=True):
185 """Context manager for acquiring a file."""
--> 186 file, cached = self._acquire_with_cache_info(needs_lock)
187 try:
188 yield file
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in _acquire_with_cache_info(self, needs_lock)
202 kwargs = kwargs.copy()
203 kwargs["mode"] = self._mode
--> 204 file = self._opener(*self._args, **kwargs)
205 if self._mode == "w":
206 # ensure file doesn't get overriden when opened again
netCDF4\_netCDF4.pyx in netCDF4._netCDF4.Dataset.__init__()
netCDF4\_netCDF4.pyx in netCDF4._netCDF4._ensure_nc_success()
OSError: [Errno -37] NetCDF: Write to read only: b'http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/2019/06/wrf_arw_det_history_d03_20190626_0000.nc4'
Does anyone know why this error occurs?

How can I load sklearn data in Jupyter Python 3?

Hey, I have a very short question. I need to load data for my machine learning course, but it does not work for me and I have no idea why. I'm using Jupyter with Python 3.
My Code:
from sklearn.datasets import fetch_covtype
forest = fetch_covtype()
For my friend it works fine under the same conditions. I already tried to update scikit-learn with pip install -U scikit-learn, but it did not solve the problem. I hope somebody can help me.
It creates the following error:
UnboundLocalError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/covtype.py in fetch_covtype(data_home, download_if_missing, random_state, shuffle, return_X_y)
126 try:
--> 127 X, y
128 except NameError:
UnboundLocalError: local variable 'X' referenced before assignment
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-9-fb303a92b6ca> in <module>
----> 1 forest =fetch_covtype()
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/covtype.py in fetch_covtype(data_home, download_if_missing, random_state, shuffle, return_X_y)
127 X, y
128 except NameError:
--> 129 X, y = _refresh_cache([samples_path, targets_path], 9)
130 # TODO: Revert to the following two lines in v0.23
131 # X = joblib.load(samples_path)
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/base.py in _refresh_cache(files, compress)
928 msg = "sklearn.externals.joblib is deprecated in 0.21"
929 with warnings.catch_warnings(record=True) as warns:
--> 930 data = tuple([joblib.load(f) for f in files])
931
932 refresh_needed = any([str(x.message).startswith(msg) for x in warns])
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/base.py in <listcomp>(.0)
928 msg = "sklearn.externals.joblib is deprecated in 0.21"
929 with warnings.catch_warnings(record=True) as warns:
--> 930 data = tuple([joblib.load(f) for f in files])
931
932 refresh_needed = any([str(x.message).startswith(msg) for x in warns])
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in load(filename, mmap_mode)
603 return load_compatibility(fobj)
604
--> 605 obj = _unpickle(fobj, filename, mmap_mode)
606
607 return obj
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in _unpickle(fobj, filename, mmap_mode)
527 obj = None
528 try:
--> 529 obj = unpickler.load()
530 if unpickler.compat_mode:
531 warnings.warn("The file '%s' has been generated with a "
/opt/conda/lib/python3.7/pickle.py in load(self)
1083 raise EOFError
1084 assert isinstance(key, bytes_types)
-> 1085 dispatch[key[0]](self)
1086 except _Stop as stopinst:
1087 return stopinst.value
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in load_build(self)
353 if isinstance(array_wrapper, NDArrayWrapper):
354 self.compat_mode = True
--> 355 self.stack.append(array_wrapper.read(self))
356
357 # Be careful to register our new method.
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in read(self, unpickler)
196 array = self.read_mmap(unpickler)
197 else:
--> 198 array = self.read_array(unpickler)
199
200 # Manage array subclass case
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in read_array(self, unpickler)
147 read_size = int(read_count * self.dtype.itemsize)
148 data = _read_bytes(unpickler.file_handle,
--> 149 read_size, "array data")
150 array[i:i + read_count] = \
151 unpickler.np.frombuffer(data, dtype=self.dtype,
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle_utils.py in _read_bytes(fp, size, error_template)
241 if len(data) != size:
242 msg = "EOF: reading %s, expected %d bytes got %d"
--> 243 raise ValueError(msg % (error_template, size, len(data)))
244 else:
245 return data
ValueError: EOF: reading array data, expected 262144 bytes got 209661

python sequence item 0: expected str instance, NoneType found

I'm trying to convert a local .xml file to .ttl and I used this code:
import rdflib
g=rdflib.Graph()
g.parse("C:\\Users\\Username\\Desktop\\public-12T23.xml",format='xml')
print (g.serialize(format='turtle'))
and I got this error
TypeError Traceback (most recent call last)
<ipython-input-3-d52cc014f61e> in <module>()
2 g=rdflib.Graph()
3
----> 4 g.parse("C:\\Users\\Saeid\\Desktop\\public-12T23.xml", format='xml')
5 print (g.serialize(format='turtle'))
C:\Program Files\Anaconda3\lib\site-packages\rdflib\graph.py in parse(self, source, publicID, format, location, file, data, **args)
1041 parser = plugin.get(format, Parser)()
1042 try:
-> 1043 parser.parse(source, self, **args)
1044 finally:
1045 if source.auto_close:
C:\Program Files\Anaconda3\lib\site-packages\rdflib\plugins\parsers\rdfxml.py in parse(self, source, sink, **args)
576 # content_handler.reset()
577 # self._parser.reset()
--> 578 self._parser.parse(source)
C:\Program Files\Anaconda3\lib\xml\sax\expatreader.py in parse(self, source)
108 self.reset()
109 self._cont_handler.setDocumentLocator(ExpatLocator(self))
--> 110 xmlreader.IncrementalParser.parse(self, source)
111
112 def prepareParser(self, source):
C:\Program Files\Anaconda3\lib\xml\sax\xmlreader.py in parse(self, source)
123 buffer = file.read(self._bufsize)
124 while buffer:
--> 125 self.feed(buffer)
126 buffer = file.read(self._bufsize)
127 self.close()
C:\Program Files\Anaconda3\lib\xml\sax\expatreader.py in feed(self, data, isFinal)
208 # document. When feeding chunks, they are not normally final -
209 # except when invoked from close.
--> 210 self._parser.Parse(data, isFinal)
211 except expat.error as e:
212 exc = SAXParseException(expat.ErrorString(e.code), e, self)
..\Modules\pyexpat.c in EndElement()
C:\Program Files\Anaconda3\lib\xml\sax\expatreader.py in end_element_ns(self, name)
368 pair = tuple(pair)
369
--> 370 self._cont_handler.endElementNS(pair, None)
371
372 # this is not used (call directly to ContentHandler)
C:\Program Files\Anaconda3\lib\site-packages\rdflib\plugins\parsers\rdfxml.py in endElementNS(self, name, qname)
158
159 def endElementNS(self, name, qname):
--> 160 self.current.end(name, qname)
161 self.stack.pop()
162
C:\Program Files\Anaconda3\lib\site-packages\rdflib\plugins\parsers\rdfxml.py >in node_element_end(self, name, qname)
330 if self.parent.object and self.current != self.stack[2]:
331
--> 332 self.error("Repeat node-elements inside property elements: %s"%"".join(name))
333
334 self.parent.object = self.current.subject
TypeError: sequence item 0: expected str instance, NoneType found
I knew it would not be as easy as I thought, but I got the same error when I used the script here.
