I have a notebook on Google Colab that fails with the following error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
93 exception = e
---> 94 raise e
95 finally: cb_handler.on_train_end(exception)
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
83 xb, yb = cb_handler.on_batch_begin(xb, yb)
---> 84 loss = loss_batch(model, xb, yb, loss_func, opt, cb_handler)
85 if cb_handler.on_batch_end(loss): break
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in loss_batch(model, xb, yb, loss_func, opt, cb_handler)
24 if opt is not None:
---> 25 loss = cb_handler.on_backward_begin(loss)
26 loss.backward()
/usr/local/lib/python3.6/dist-packages/fastai/callback.py in on_backward_begin(self, loss)
223 for cb in self.callbacks:
--> 224 a = cb.on_backward_begin(**self.state_dict)
225 if a is not None: self.state_dict['last_loss'] = a
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in on_backward_begin(self, smooth_loss, **kwargs)
266 if self.pbar is not None and hasattr(self.pbar,'child'):
--> 267 self.pbar.child.comment = f'{smooth_loss:.4f}'
268
/usr/local/lib/python3.6/dist-packages/torch/tensor.py in __format__(self, format_spec)
377 if self.dim() == 0:
--> 378 return self.item().__format__(format_spec)
379 return object.__format__(self, format_spec)
RuntimeError: CUDA error: device-side assert triggered
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
<ipython-input-33-dd390b1c8108> in <module>()
----> 1 lr_find(learn)
2 learn.recorder.plot()
/usr/local/lib/python3.6/dist-packages/fastai/train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, **kwargs)
26 cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
27 a = int(np.ceil(num_it/len(learn.data.train_dl)))
---> 28 learn.fit(a, start_lr, callbacks=[cb], **kwargs)
29
30 def to_fp16(learn:Learner, loss_scale:float=512., flat_master:bool=False)->Learner:
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
160 callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
161 fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
--> 162 callbacks=self.callbacks+callbacks)
163
164 def create_opt(self, lr:Floats, wd:Floats=0.)->None:
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
93 exception = e
94 raise e
---> 95 finally: cb_handler.on_train_end(exception)
96
97 loss_func_name2activ = {'cross_entropy_loss': partial(F.softmax, dim=1), 'nll_loss': torch.exp, 'poisson_nll_loss': torch.exp,
/usr/local/lib/python3.6/dist-packages/fastai/callback.py in on_train_end(self, exception)
254 def on_train_end(self, exception:Union[bool,Exception])->None:
255 "Handle end of training, `exception` is an `Exception` or False if no exceptions during training."
--> 256 self('train_end', exception=exception)
257
258 class AverageMetric(Callback):
/usr/local/lib/python3.6/dist-packages/fastai/callback.py in __call__(self, cb_name, call_mets, **kwargs)
185 "Call through to all of the `CallbakHandler` functions."
186 if call_mets: [getattr(met, f'on_{cb_name}')(**self.state_dict, **kwargs) for met in self.metrics]
--> 187 return [getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs) for cb in self.callbacks]
188
189 def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:
/usr/local/lib/python3.6/dist-packages/fastai/callback.py in <listcomp>(.0)
185 "Call through to all of the `CallbakHandler` functions."
186 if call_mets: [getattr(met, f'on_{cb_name}')(**self.state_dict, **kwargs) for met in self.metrics]
--> 187 return [getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs) for cb in self.callbacks]
188
189 def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:
/usr/local/lib/python3.6/dist-packages/fastai/callbacks/lr_finder.py in on_train_end(self, **kwargs)
45 # restore the valid_dl we turned of on `__init__`
46 self.data.valid_dl = self.valid_dl
---> 47 self.learn.load('tmp')
48 if hasattr(self.learn.model, 'reset'): self.learn.model.reset()
49 print('LR Finder complete, type {learner_name}.recorder.plot() to see the graph.')
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in load(self, name, device)
202 "Load model `name` from `self.model_dir` using `device`, defaulting to `self.data.device`."
203 if device is None: device = self.data.device
--> 204 self.model.load_state_dict(torch.load(self.path/self.model_dir/f'{name}.pth', map_location=device))
205 return self
206
/usr/local/lib/python3.6/dist-packages/torch/serialization.py in load(f, map_location, pickle_module)
356 f = open(f, 'rb')
357 try:
--> 358 return _load(f, map_location, pickle_module)
359 finally:
360 if new_fd:
/usr/local/lib/python3.6/dist-packages/torch/serialization.py in _load(f, map_location, pickle_module)
527 unpickler = pickle_module.Unpickler(f)
528 unpickler.persistent_load = persistent_load
--> 529 result = unpickler.load()
530
531 deserialized_storage_keys = pickle_module.load(f)
/usr/local/lib/python3.6/dist-packages/torch/serialization.py in persistent_load(saved_id)
493 if root_key not in deserialized_objects:
494 deserialized_objects[root_key] = restore_location(
--> 495 data_type(size), location)
496 storage = deserialized_objects[root_key]
497 if view_metadata is not None:
/usr/local/lib/python3.6/dist-packages/torch/serialization.py in restore_location(storage, location)
376 elif isinstance(map_location, torch.device):
377 def restore_location(storage, location):
--> 378 return default_restore_location(storage, str(map_location))
379 else:
380 def restore_location(storage, location):
/usr/local/lib/python3.6/dist-packages/torch/serialization.py in default_restore_location(storage, location)
102 def default_restore_location(storage, location):
103 for _, _, fn in _package_registry:
--> 104 result = fn(storage, location)
105 if result is not None:
106 return result
/usr/local/lib/python3.6/dist-packages/torch/serialization.py in _cuda_deserialize(obj, location)
84 'to an existing device.'.format(
85 device, torch.cuda.device_count()))
---> 86 return obj.cuda(device)
87
88
/usr/local/lib/python3.6/dist-packages/torch/_utils.py in _cuda(self, device, non_blocking, **kwargs)
74 else:
75 new_type = getattr(torch.cuda, self.__class__.__name__)
---> 76 return new_type(self.size()).copy_(self, non_blocking)
77
78
RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorCopy.cpp:20
There is no information about the real cause, so I tried to get the actual stack trace by forcing CUDA to run synchronously (as suggested here), using a cell like this:
!export CUDA_LAUNCH_BLOCKING=1
But this does not seem to work; I still get the same error.
Is there another way that works with Google Colab?
Be sure that your target values run from zero to number of classes - 1. For example, if you have 100 classes, your targets should be in the range 0 to 99.
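A minimal sketch (framework-agnostic; labels here is a hypothetical array of raw target values) of remapping arbitrary labels into that 0 to num_classes - 1 range:
import numpy as np

labels = np.array([3, 7, 7, 42, 3])                    # hypothetical raw labels
classes = np.unique(labels)                            # sorted unique classes
class_to_idx = {c: i for i, c in enumerate(classes)}   # raw label -> 0..num_classes-1
targets = np.array([class_to_idx[c] for c in labels])
print(targets)  # [0 1 1 2 0]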
!export FOO=blah is usually not useful to run in a notebook because ! means run the following command in a sub-shell, so the effect of the statement is gone by the time the ! returns.
You might have more success by storing your python code in a file and then executing that file in a subshell:
In one cell:
%%writefile foo.py
[...your code...]
In the next cell:
!export CUDA_LAUNCH_BLOCKING=1; python3 foo.py
(or s/python3/python2/ if you're writing py2)
Switch Hardware Accelerator Type to "None" under Runtime -> Change Runtime Type. This should give you a more meaningful error message.
The proper way to set environment variables in Google Colab is to use os:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
Using the os library will allow you to set whatever environment variables you need. Setting CUDA_LAUNCH_BLOCKING this way enables proper CUDA tracebacks in Google Colab.
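For example, a minimal sketch that sets it at the very top of the notebook, before importing torch or fastai, so the variable is already in place before any CUDA work happens:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch  # import only after the variable has been set
print(os.environ.get('CUDA_LAUNCH_BLOCKING'))  # sanity check, prints "1"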
Related
I installed Auto-PyTorch for Time Series Forecasting as described in the AutoML GitHub repository (https://github.com/automl/Auto-PyTorch) on Ubuntu (22.04 LTS).
To verify that the installation works correctly, I ran the following example:
https://github.com/automl/Auto-PyTorch/blob/master/examples/20_basics/example_time_series_forecasting.py
The following error occurred while running the code:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Input In [2], in <cell line: 56>()
51 api = TimeSeriesForecastingTask()
53 ############################################################################
54 # Search for an ensemble of machine learning algorithms
55 # =====================================================
---> 56 api.search(
57 X_train=X_train,
58 y_train=copy.deepcopy(y_train),
59 X_test=X_test,
60 optimize_metric='mean_MASE_forecasting',
61 n_prediction_steps=forecasting_horizon,
62 memory_limit=16 * 1024, # Currently, forecasting models use much more memories
63 freq=freq,
64 start_times=start_times,
65 func_eval_time_limit_secs=28,
66 total_walltime_limit=60,
67 min_num_test_instances=1000, # proxy validation sets. This only works for the tasks with more than 1000 series
68 known_future_features=known_future_features,
69 )
72 from autoPyTorch.datasets.time_series_dataset import TimeSeriesSequence
74 test_sets = []
File ~/anaconda3/envs/autopytorch-ts/lib/python3.8/site-packages/autoPyTorch/api/time_series_forecasting.py:476, in TimeSeriesForecastingTask.search(self, optimize_metric, X_train, y_train, X_test, y_test, n_prediction_steps, freq, start_times, series_idx, dataset_name, budget_type, min_budget, max_budget, total_walltime_limit, func_eval_time_limit_secs, enable_traditional_pipeline, memory_limit, smac_scenario_args, get_smac_object_callback, all_supported_metrics, precision, disable_file_output, load_models, portfolio_selection, suggested_init_models, custom_init_setting_path, min_num_test_instances, dataset_compression, **forecasting_dataset_kwargs)
465 self._metrics_kwargs = {
466 "sp": self.dataset.seasonality,
467 "n_prediction_steps": n_prediction_steps,
468 }
470 forecasting_kwargs = dict(
471 suggested_init_models=suggested_init_models,
472 custom_init_setting_path=custom_init_setting_path,
473 min_num_test_instances=min_num_test_instances,
474 )
--> 476 return self._search(
477 dataset=self.dataset,
478 optimize_metric=optimize_metric,
479 budget_type=budget_type,
480 min_budget=min_budget,
481 max_budget=max_budget,
482 total_walltime_limit=total_walltime_limit,
483 func_eval_time_limit_secs=func_eval_time_limit_secs,
484 enable_traditional_pipeline=enable_traditional_pipeline,
485 memory_limit=memory_limit,
486 smac_scenario_args=smac_scenario_args,
487 get_smac_object_callback=get_smac_object_callback,
488 all_supported_metrics=all_supported_metrics,
489 precision=precision,
490 disable_file_output=disable_file_output,
491 load_models=load_models,
492 portfolio_selection=portfolio_selection,
493 **forecasting_kwargs, # type: ignore[arg-type]
494 )
File ~/anaconda3/envs/autopytorch-ts/lib/python3.8/site-packages/autoPyTorch/api/base_task.py:1257, in BaseTask._search(self, optimize_metric, dataset, budget_type, min_budget, max_budget, total_walltime_limit, func_eval_time_limit_secs, enable_traditional_pipeline, memory_limit, smac_scenario_args, get_smac_object_callback, tae_func, all_supported_metrics, precision, disable_file_output, load_models, portfolio_selection, dask_client, **kwargs)
1255 self._logger.warning(" Not starting SMAC because there is no time left")
1256 else:
-> 1257 _proc_smac = AutoMLSMBO(
1258 config_space=self.search_space,
1259 dataset_name=str(dataset.dataset_name),
1260 backend=self._backend,
1261 total_walltime_limit=total_walltime_limit,
1262 func_eval_time_limit_secs=func_eval_time_limit_secs,
1263 dask_client=self._dask_client,
1264 memory_limit=self._memory_limit,
1265 n_jobs=self.n_jobs,
1266 watcher=self._stopwatch,
1267 metric=self._metric,
1268 seed=self.seed,
1269 include=self.include_components,
1270 exclude=self.exclude_components,
1271 disable_file_output=self._disable_file_output,
1272 all_supported_metrics=self._all_supported_metrics,
1273 smac_scenario_args=smac_scenario_args,
1274 get_smac_object_callback=get_smac_object_callback,
1275 pipeline_config=self.pipeline_options,
1276 min_budget=min_budget,
1277 max_budget=max_budget,
1278 ensemble_callback=proc_ensemble,
1279 logger_port=self._logger_port,
1280 # We do not increase the num_run here, this is something
1281 # smac does internally
1282 start_num_run=self._backend.get_next_num_run(peek=True),
1283 search_space_updates=self.search_space_updates,
1284 portfolio_selection=portfolio_selection,
1285 pynisher_context=self._multiprocessing_context,
1286 task_type=self.task_type,
1287 **kwargs,
1288 )
1289 try:
1290 run_history, self._results_manager.trajectory, budget_type = \
1291 _proc_smac.run_smbo(func=tae_func)
File ~/anaconda3/envs/autopytorch-ts/lib/python3.8/site-packages/autoPyTorch/optimizer/smbo.py:279, in AutoMLSMBO.__init__(self, config_space, dataset_name, backend, total_walltime_limit, func_eval_time_limit_secs, memory_limit, metric, watcher, n_jobs, dask_client, pipeline_config, start_num_run, seed, resampling_strategy, resampling_strategy_args, include, exclude, disable_file_output, smac_scenario_args, get_smac_object_callback, all_supported_metrics, ensemble_callback, logger_port, search_space_updates, portfolio_selection, pynisher_context, min_budget, max_budget, task_type, **kwargs)
276 initial_configurations = []
278 if STRING_TO_TASK_TYPES.get(self.task_type, -1) == TIMESERIES_FORECASTING:
--> 279 initial_configurations = self.get_init_configs_for_forecasting(config_space, kwargs)
280 # proxy-validation sets
281 self.min_num_test_instances: Optional[int] = kwargs.get('min_num_test_instances', # type:ignore[assignment]
282 None)
File ~/anaconda3/envs/autopytorch-ts/lib/python3.8/site-packages/autoPyTorch/optimizer/smbo.py:440, in AutoMLSMBO.get_init_configs_for_forecasting(self, config_space, kwargs)
438 datamanager: BaseDataset = self.backend.load_datamanager()
439 dataset_properties = datamanager.get_dataset_properties([])
--> 440 initial_configurations = read_forecasting_init_configurations(
441 config_space=config_space,
442 suggested_init_models=suggested_init_models,
443 custom_init_setting_path=custom_init_setting_path,
444 dataset_properties=dataset_properties
445 )
446 return initial_configurations
447 return []
File ~/anaconda3/envs/autopytorch-ts/lib/python3.8/site-packages/autoPyTorch/optimizer/utils.py:48, in read_forecasting_init_configurations(config_space, suggested_init_models, custom_init_setting_path, dataset_properties)
45 features_have_missing_values = dataset_properties.get('features_have_missing_values', False)
47 if suggested_init_models or suggested_init_models is None:
---> 48 with open(forecasting_init_path, 'r') as f:
49 forecasting_init_dict: Dict[str, Any] = json.load(f)
50 cfg_trainer: Dict = forecasting_init_dict['trainer']
FileNotFoundError: [Errno 2] No such file or directory: '/home/focal/anaconda3/envs/autopytorch-ts/lib/python3.8/site-packages/autoPyTorch/optimizer/../configs/forecasting_init_cfgs.json'
Unfortunately, I do not understand why this error is raised, because every package was installed correctly (all requirements already satisfied).
I would be glad for any hints on how to solve this issue!
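For what it's worth, a quick check I can run (my own sketch, with the path taken from the traceback) to see whether the config file was actually installed with the package:
import os
import autoPyTorch

# Path of the file the traceback complains about, relative to the installed package.
pkg_dir = os.path.dirname(autoPyTorch.__file__)
cfg_path = os.path.join(pkg_dir, "configs", "forecasting_init_cfgs.json")
print(cfg_path, "exists:", os.path.exists(cfg_path))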
I've been studying time series forecasting, and I'm trying to learn how to use gluon-ts with Python.
Here is the source code of gluon-ts:
https://github.com/awslabs/gluon-ts/
I tried to use the LSTNetEstimator module, but it raises the error below. I also found that almost all the gluon-ts discussions are about DeepAR.
Can anyone help?
GluonTSDataError: Input for field "target" does not have the requireddimension (field: target, ndim observed: 1, expected ndim: 2)
I think it has something to do with my custom dataset, which is a dataframe of shape [320000, 3] with columns ['time', 'Power_Cdp', 'Power_Dp'].
Here is the training set:
from gluonts.dataset.common import ListDataset
from gluonts.model.lstnet import LSTNetEstimator
from gluonts.mx.trainer import Trainer
training_data = ListDataset(
    [{"start": LSTNet_df.index[0], "target": LSTNet_df['Power_Cdp'][:-10000]}],
    freq="15min")
estimator = LSTNetEstimator(freq="15min", prediction_length=24*4, context_length=24*4,
                            num_series=48*4, skip_size=72*4, ar_window=24*4, channels=32,
                            trainer=Trainer(epochs=10))
predictor = estimator.train(training_data=training_data)  # Error
The full report is below.
GluonTSDataError Traceback (most recent call last)
<ipython-input-311-ad48fdc20df5> in <module>
7 num_series=48*4, skip_size=72*4, ar_window=24*4, channels=32,
8 trainer=Trainer(epochs=10))
----> 9 predictor = estimator.train(training_data=training_data)
~/.local/lib/python3.8/site-packages/gluonts/mx/model/estimator.py in train(self, training_data, validation_data, num_workers, num_prefetch, shuffle_buffer_length, cache_data, **kwargs)
192 **kwargs,
193 ) -> Predictor:
--> 194 return self.train_model(
195 training_data=training_data,
196 validation_data=validation_data,
~/.local/lib/python3.8/site-packages/gluonts/mx/model/estimator.py in train_model(self, training_data, validation_data, num_workers, num_prefetch, shuffle_buffer_length, cache_data)
145 transformed_training_data = transformation.apply(training_data)
146
--> 147 training_data_loader = self.create_training_data_loader(
148 transformed_training_data
149 if not cache_data
~/.local/lib/python3.8/site-packages/gluonts/model/lstnet/_estimator.py in create_training_data_loader(self, data, **kwargs)
216 ) -> DataLoader:
217 input_names = get_hybrid_forward_input_names(LSTNetTrain)
--> 218 with env._let(max_idle_transforms=maybe_len(data) or 0):
219 instance_splitter = self._create_instance_splitter("training")
220 return TrainDataLoader(
~/.local/lib/python3.8/site-packages/gluonts/itertools.py in maybe_len(obj)
21 def maybe_len(obj) -> Optional[int]:
22 try:
---> 23 return len(obj)
24 except (NotImplementedError, AttributeError):
25 return None
~/.local/lib/python3.8/site-packages/gluonts/transform/_base.py in __len__(self)
99 # NOTE this is unsafe when transformations are run with is_train = True
100 # since some transformations may not be deterministic (instance splitter)
--> 101 return sum(1 for _ in self)
102
103 def __iter__(self) -> Iterator[DataEntry]:
~/.local/lib/python3.8/site-packages/gluonts/transform/_base.py in <genexpr>(.0)
99 # NOTE this is unsafe when transformations are run with is_train = True
100 # since some transformations may not be deterministic (instance splitter)
--> 101 return sum(1 for _ in self)
102
103 def __iter__(self) -> Iterator[DataEntry]:
~/.local/lib/python3.8/site-packages/gluonts/transform/_base.py in __iter__(self)
102
103 def __iter__(self) -> Iterator[DataEntry]:
--> 104 yield from self.transformation(
105 self.base_dataset, is_train=self.is_train
106 )
~/.local/lib/python3.8/site-packages/gluonts/transform/_base.py in __call__(self, data_it, is_train)
122 self, data_it: Iterable[DataEntry], is_train: bool
123 ) -> Iterator:
--> 124 for data_entry in data_it:
125 try:
126 yield self.map_transform(data_entry.copy(), is_train)
~/.local/lib/python3.8/site-packages/gluonts/transform/_base.py in __call__(self, data_it, is_train)
126 yield self.map_transform(data_entry.copy(), is_train)
127 except Exception as e:
--> 128 raise e
129
130 @abc.abstractmethod
~/.local/lib/python3.8/site-packages/gluonts/transform/_base.py in __call__(self, data_it, is_train)
124 for data_entry in data_it:
125 try:
--> 126 yield self.map_transform(data_entry.copy(), is_train)
127 except Exception as e:
128 raise e
~/.local/lib/python3.8/site-packages/gluonts/transform/_base.py in map_transform(self, data, is_train)
139
140 def map_transform(self, data: DataEntry, is_train: bool) -> DataEntry:
--> 141 return self.transform(data)
142
143 @abc.abstractmethod
~/.local/lib/python3.8/site-packages/gluonts/transform/convert.py in transform(self, data)
127 value = np.asarray(data[self.field], dtype=self.dtype)
128
--> 129 assert_data_error(
130 value.ndim == self.expected_ndim,
131 'Input for field "{self.field}" does not have the required'
~/.local/lib/python3.8/site-packages/gluonts/exceptions.py in assert_data_error(condition, message, *args, **kwargs)
114 exception message.
115 """
--> 116 assert_gluonts(GluonTSDataError, condition, message, *args, **kwargs)
~/.local/lib/python3.8/site-packages/gluonts/exceptions.py in assert_gluonts(exception_class, condition, message, *args, **kwargs)
93 """
94 if not condition:
---> 95 raise exception_class(message.format(*args, **kwargs))
96
97
GluonTSDataError: Input for field "target" does not have the requireddimension (field: target, ndim observed: 1, expected ndim: 2)
@darth baba I tried again recently, and this time I used DeepVAR.
Here is what I got.
I reset "target": ..., stepped into my code, and found
gluonts.exceptions.GluonTSDataError: Array 'target' has bad shape - expected 1 dimensions, got 2.
This is because I set "target": train_df[['Power_Cdp', 'Power_Active_Fan']], which was reported as an error at ./gluonts/dataset/common.py +385.
I found self.req_ndim != value.ndim, which suggested that I had passed a target with the wrong shape, so I reset the input to "target": train_df.index[:] and that problem was solved.
But then it reported another error:
gluonts.exceptions.GluonTSDataError: Input for field "target" does not have the requireddimension (field: target, ndim observed: 1, expected ndim: 2)
To be honest, this one confused me a lot, because the gluonts code seems quite complicated.
I checked again and found that it raises the error at ./gluonts/transform/convert.py +129.
It seems that expected_ndim does not equal value.ndim in this case.
However, after overriding expected_ndim manually, training eventually ran.
I have no idea whether this modification is right.
Although avg_epoch_loss is decreasing, the final forecast is not as good as expected.
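For reference, here is a sketch of what I now believe the intended 2-D (multivariate) target looks like for LSTNet. The one_dim_target=False flag is my assumption about how ListDataset accepts a 2-D array; I have not fully verified this end to end:
import numpy as np
from gluonts.dataset.common import ListDataset

# Stack the two series as rows, giving a target of shape (num_series, T).
target_2d = np.stack([
    LSTNet_df['Power_Cdp'].values[:-10000],
    LSTNet_df['Power_Dp'].values[:-10000],
])

training_data = ListDataset(
    [{"start": LSTNet_df.index[0], "target": target_2d}],
    freq="15min",
    one_dim_target=False,  # assumption: allow a 2-D target
)
If that is right, num_series in LSTNetEstimator should presumably match the number of rows in this array (2 here) rather than 48*4.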
AttributeError Traceback (most recent call last)
~\Anaconda\main.py in <module>
176
177 for epoch in range(0, 200):
--> 178 train(epoch)
179 schedular.step()
180 acc = test(epoch)
~\Anaconda\main.py in train(epoch)
118 correct = 0
119 total = 0
--> 120 for batch_idx, (inputs, targets) in enumerate(trainloader, 0):
121 stime = time.time()
122 inputs, targets = inputs.to(device), targets.to(device)
~\.conda\envs\pytorch1\lib\site-packages\torch\utils\data\dataloader.py in __iter__(self)
277 return _SingleProcessDataLoaderIter(self)
278 else:
--> 279 return _MultiProcessingDataLoaderIter(self)
280
281 @property
~\.conda\envs\pytorch1\lib\site-packages\torch\utils\data\dataloader.py in __init__(self, loader)
717 # before it starts, and __del__ tries to join but will get:
718 # AssertionError: can only join a started process.
--> 719 w.start()
720 self._index_queues.append(index_queue)
721 self._workers.append(w)
~\.conda\envs\pytorch1\lib\multiprocessing\process.py in start(self)
110 'daemonic processes are not allowed to have children'
111 _cleanup()
--> 112 self._popen = self._Popen(self)
113 self._sentinel = self._popen.sentinel
114 # Avoid a refcycle if the target function holds an indirect
~\.conda\envs\pytorch1\lib\multiprocessing\context.py in _Popen(process_obj)
221 @staticmethod
222 def _Popen(process_obj):
--> 223 return _default_context.get_context().Process._Popen(process_obj)
224
225 class DefaultContext(BaseContext):
~\.conda\envs\pytorch1\lib\multiprocessing\context.py in _Popen(process_obj)
320 def _Popen(process_obj):
321 from .popen_spawn_win32 import Popen
--> 322 return Popen(process_obj)
323
324 class SpawnContext(BaseContext):
~\.conda\envs\pytorch1\lib\multiprocessing\popen_spawn_win32.py in __init__(self, process_obj)
44
45 def __init__(self, process_obj):
---> 46 prep_data = spawn.get_preparation_data(process_obj._name)
47
48 # read end of pipe will be "stolen" by the child process
~\.conda\envs\pytorch1\lib\multiprocessing\spawn.py in get_preparation_data(name)
170 # or through direct execution (or to leave it alone entirely)
171 main_module = sys.modules['__main__']
--> 172 main_mod_name = getattr(main_module.__spec__, "name", None)
173 if main_mod_name is not None:
174 d['init_main_from_name'] = main_mod_name
AttributeError: module '__main__' has no attribute '__spec__'
Can anybody help me with this?
I saw multiple answers but none of them helped.
I am trying to implement the PyTorch CIFAR-10 image classification tutorial in Anaconda.
The same code works fine on Google Colab, so why is it not working on my machine?
https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
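For reference, this is the entry-point guard that spawn-based multiprocessing on Windows expects, sketched with the names from my traceback (train, test and schedular are defined in main.py); setting num_workers=0 on the DataLoader is another workaround I have seen mentioned:
if __name__ == '__main__':
    # run the training loop only in the main process, not in the spawned workers
    for epoch in range(0, 200):
        train(epoch)
        schedular.step()
        acc = test(epoch)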
I need to load some meteorological data to analyze several months, but the data is stored in files that each cover only one day, so I need to access many files at once.
I am following some instructions I was given, which told me to create a memory partition on my computer.
from datetime import datetime, timedelta
import dask.array as da
from dask.distributed import Client, LocalCluster
import xarray
try:
    client
except NameError:
    client = Client(n_workers=1, threads_per_worker=4, memory_limit='2GB')
else:
    print("Client already exists")
After this, I create an array of dates that goes from 1 June to 1 October, which is needed in "files" to build the links to the meteorological data.
dates=[datetime(2019,6,1) + timedelta(days=i) for i in range(3*30)]
files= [date.strftime('http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/%Y/%m/wrf_arw_det_history_d03_%Y%m%d_0000.nc4') for date in dates]
My issue starts when I try to open all that data with
multi = xarray.open_mfdataset(files, preprocess= lambda a : a.isel(time=slice(0,24)))
It raises the error:
KeyError Traceback (most recent call last)
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in _acquire_with_cache_info(self, needs_lock)
197 try:
--> 198 file = self._cache[self._key]
199 except KeyError:
~\Nueva carpeta\lib\site-packages\xarray\backends\lru_cache.py in __getitem__(self, key)
52 with self._lock:
---> 53 value = self._cache[key]
54 self._cache.move_to_end(key)
KeyError: [<class 'netCDF4._netCDF4.Dataset'>, ('http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/2019/06/wrf_arw_det_history_d03_20190626_0000.nc4',), 'r', (('clobber', True), ('diskless', False), ('format', 'NETCDF4'), ('persist', False))]
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
<ipython-input-19-c3d0f4a8cc26> in <module>
----> 1 multi = xarray.open_mfdataset(files, preprocess= lambda a : a.isel(time=slice(0,24)))
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in open_mfdataset(paths, chunks, concat_dim, compat, preprocess, engine, lock, data_vars, coords, combine, autoclose, parallel, join, attrs_file, **kwargs)
916 getattr_ = getattr
917
--> 918 datasets = [open_(p, **open_kwargs) for p in paths]
919 file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
920 if preprocess is not None:
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in <listcomp>(.0)
916 getattr_ = getattr
917
--> 918 datasets = [open_(p, **open_kwargs) for p in paths]
919 file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
920 if preprocess is not None:
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in open_dataset(filename_or_obj, group, decode_cf, mask_and_scale, decode_times, autoclose, concat_characters, decode_coords, engine, chunks, lock, cache, drop_variables, backend_kwargs, use_cftime, decode_timedelta)
507 if engine == "netcdf4":
508 store = backends.NetCDF4DataStore.open(
--> 509 filename_or_obj, group=group, lock=lock, **backend_kwargs
510 )
511 elif engine == "scipy":
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in open(cls, filename, mode, format, group, clobber, diskless, persist, lock, lock_maker, autoclose)
356 netCDF4.Dataset, filename, mode=mode, kwargs=kwargs
357 )
--> 358 return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)
359
360 def _acquire(self, needs_lock=True):
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in __init__(self, manager, group, mode, lock, autoclose)
312 self._group = group
313 self._mode = mode
--> 314 self.format = self.ds.data_model
315 self._filename = self.ds.filepath()
316 self.is_remote = is_remote_uri(self._filename)
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in ds(self)
365 @property
366 def ds(self):
--> 367 return self._acquire()
368
369 def open_store_variable(self, name, var):
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in _acquire(self, needs_lock)
359
360 def _acquire(self, needs_lock=True):
--> 361 with self._manager.acquire_context(needs_lock) as root:
362 ds = _nc4_require_group(root, self._group, self._mode)
363 return ds
~\Nueva carpeta\lib\contextlib.py in __enter__(self)
110 del self.args, self.kwds, self.func
111 try:
--> 112 return next(self.gen)
113 except StopIteration:
114 raise RuntimeError("generator didn't yield") from None
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in acquire_context(self, needs_lock)
184 def acquire_context(self, needs_lock=True):
185 """Context manager for acquiring a file."""
--> 186 file, cached = self._acquire_with_cache_info(needs_lock)
187 try:
188 yield file
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in _acquire_with_cache_info(self, needs_lock)
202 kwargs = kwargs.copy()
203 kwargs["mode"] = self._mode
--> 204 file = self._opener(*self._args, **kwargs)
205 if self._mode == "w":
206 # ensure file doesn't get overriden when opened again
netCDF4\_netCDF4.pyx in netCDF4._netCDF4.Dataset.__init__()
netCDF4\_netCDF4.pyx in netCDF4._netCDF4._ensure_nc_success()
OSError: [Errno -37] NetCDF: Write to read only: b'http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/2019/06/wrf_arw_det_history_d03_20190626_0000.nc4'
Does anyone know why this error occurs?
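One thing I can try to narrow it down (my own debugging sketch, not a fix) is opening a single remote file directly, without open_mfdataset, to check whether the OPeNDAP access itself works:
import xarray

# First URL from the list built above; if this also fails, the problem is the remote
# access itself rather than open_mfdataset.
single = xarray.open_dataset(files[0])
print(single)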
I'm trying to reproduce the coal mining example with a deterministic function for the switchpoint instead of using theano's switch function. Code:
%matplotlib inline
import matplotlib.pyplot as plt
import pymc3
import numpy as np
import theano.tensor as t
import theano
data = np.hstack((np.random.poisson(15,1000),np.random.poisson(2,100)))
plt.plot(data)
@theano.compile.ops.as_op(itypes=[t.lscalar, t.dscalar, t.dscalar], otypes=[t.dvector])
def rate1(sw, mu1, mu2):
    n = len(data)
    out = np.empty(n)
    out[:sw] = mu1
    out[sw:] = mu2
    return out
with pymc3.Model() as dis:
    switchpoint = pymc3.DiscreteUniform('switchpoint', lower=0, upper=len(data)-1)
    mu1 = pymc3.Exponential('mu1', lam=1.)
    mu2 = pymc3.Exponential('mu2', lam=1.)
    disasters = pymc3.Poisson('disasters', mu=rate1, observed=data)
But this code raises an error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
c:\program files\git\theano\theano\tensor\type.py in dtype_specs(self)
266 'complex64': (complex, 'theano_complex64', 'NPY_COMPLEX64')
--> 267 }[self.dtype]
268 except KeyError:
KeyError: 'object'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
c:\program files\git\theano\theano\tensor\basic.py in constant_or_value(x, rtype, name, ndim, dtype)
407 rval = rtype(
--> 408 TensorType(dtype=x_.dtype, broadcastable=bcastable),
409 x_.copy(),
c:\program files\git\theano\theano\tensor\type.py in __init__(self, dtype, broadcastable, name, sparse_grad)
49 self.broadcastable = tuple(bool(b) for b in broadcastable)
---> 50 self.dtype_specs() # error checking is done there
51 self.name = name
c:\program files\git\theano\theano\tensor\type.py in dtype_specs(self)
269 raise TypeError("Unsupported dtype for %s: %s"
--> 270 % (self.__class__.__name__, self.dtype))
271
TypeError: Unsupported dtype for TensorType: object
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
c:\program files\git\theano\theano\tensor\basic.py in as_tensor_variable(x, name, ndim)
201 try:
--> 202 return constant(x, name=name, ndim=ndim)
203 except TypeError:
c:\program files\git\theano\theano\tensor\basic.py in constant(x, name, ndim, dtype)
421 ret = constant_or_value(x, rtype=TensorConstant, name=name, ndim=ndim,
--> 422 dtype=dtype)
423
c:\program files\git\theano\theano\tensor\basic.py in constant_or_value(x, rtype, name, ndim, dtype)
416 except Exception:
--> 417 raise TypeError("Could not convert %s to TensorType" % x, type(x))
418
TypeError: ('Could not convert FromFunctionOp{rate1} to TensorType',
)
During handling of the above exception, another exception occurred:
AsTensorError Traceback (most recent call last)
in ()
14 mu2 = pymc3.Exponential('mu2',lam=1.)
15 #rate1 = pymc3.switch(switchpoint >= np.arange(len(data)), mu1,mu2)
---> 16 disasters=pymc3.Poisson('disasters', mu=rate1, observed = data)
C:\Users\User\Anaconda3\lib\site-packages\pymc3\distributions\distribution.py in __new__(cls, name, *args, **kwargs)
19 if isinstance(name, str):
20 data = kwargs.pop('observed', None)
---> 21 dist = cls.dist(*args, **kwargs)
22 return model.Var(name, dist, data)
23 elif name is None:
C:\Users\User\Anaconda3\lib\site-packages\pymc3\distributions\distribution.py in dist(cls, *args, **kwargs)
32 def dist(cls, *args, **kwargs):
33 dist = object.__new__(cls)
---> 34 dist.__init__(*args, **kwargs)
35 return dist
36
C:\Users\User\Anaconda3\lib\site-packages\pymc3\distributions\discrete.py in __init__(self, mu, *args, **kwargs)
185 super(Poisson, self).__init__(*args, **kwargs)
186 self.mu = mu
--> 187 self.mode = floor(mu).astype('int32')
188
189 def random(self, point=None, size=None, repeat=None):
c:\program files\git\theano\theano\gof\op.py in __call__(self, *inputs, **kwargs)
598 """
599 return_list = kwargs.pop('return_list', False)
--> 600 node = self.make_node(*inputs, **kwargs)
601
602 if config.compute_test_value != 'off':
c:\program files\git\theano\theano\tensor\elemwise.py in make_node(self, *inputs)
540 using DimShuffle.
541 """
--> 542 inputs = list(map(as_tensor_variable, inputs))
543 shadow = self.scalar_op.make_node(
544 *[get_scalar_type(dtype=i.type.dtype).make_variable()
c:\program files\git\theano\theano\tensor\basic.py in as_tensor_variable(x, name, ndim)
206 except Exception:
207 str_x = repr(x)
--> 208 raise AsTensorError("Cannot convert %s to TensorType" % str_x, type(x))
209
210 # this has a different name, because _as_tensor_variable is the
AsTensorError: ('Cannot convert FromFunctionOp{rate1} to TensorType',
)
How do I handle this?
The second thing: when I'm using the pymc3.switch function like this:
with pymc3.Model() as dis:
    switchpoint = pymc3.DiscreteUniform('switchpoint', lower=0, upper=len(data)-1)
    mu1 = pymc3.Exponential('mu1', lam=1.)
    mu2 = pymc3.Exponential('mu2', lam=1.)
    rate1 = pymc3.switch(switchpoint >= np.arange(len(data)), mu1, mu2)
    disasters = pymc3.Poisson('disasters', mu=rate1, observed=data)
And then try to sample:
with dis:
    step1 = pymc3.NUTS([mu1, mu2])
    step2 = pymc3.Metropolis([switchpoint])
    trace = pymc3.sample(10000, step=[step1, step2])
I get an error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
c:\program files\git\theano\theano\compile\function_module.py in __call__(self, *args, **kwargs)
858 try:
--> 859 outputs = self.fn()
860 except Exception:
TypeError: expected type_num 9 (NPY_INT64) got 7
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-4-3247d908f897> in <module>()
2 step1 = pymc3.NUTS([mu1, mu2])
3 step2 = pymc3.Metropolis([switchpoint])
----> 4 trace = pymc3.sample(10000, step = [step1,step2])
C:\Users\User\Anaconda3\lib\site-packages\pymc3\sampling.py in sample(draws, step, start, trace, chain, njobs, tune, progressbar, model, random_seed)
153 sample_args = [draws, step, start, trace, chain,
154 tune, progressbar, model, random_seed]
--> 155 return sample_func(*sample_args)
156
157
C:\Users\User\Anaconda3\lib\site-packages\pymc3\sampling.py in _sample(draws, step, start, trace, chain, tune, progressbar, model, random_seed)
162 progress = progress_bar(draws)
163 try:
--> 164 for i, strace in enumerate(sampling):
165 if progressbar:
166 progress.update(i)
C:\Users\User\Anaconda3\lib\site-packages\pymc3\sampling.py in _iter_sample(draws, step, start, trace, chain, tune, model, random_seed)
244 if i == tune:
245 step = stop_tuning(step)
--> 246 point = step.step(point)
247 strace.record(point)
248 yield strace
C:\Users\User\Anaconda3\lib\site-packages\pymc3\step_methods\compound.py in step(self, point)
11 def step(self, point):
12 for method in self.methods:
---> 13 point = method.step(point)
14 return point
C:\Users\User\Anaconda3\lib\site-packages\pymc3\step_methods\arraystep.py in step(self, point)
116 bij = DictToArrayBijection(self.ordering, point)
117
--> 118 apoint = self.astep(bij.map(point))
119 return bij.rmap(apoint)
120
C:\Users\User\Anaconda3\lib\site-packages\pymc3\step_methods\metropolis.py in astep(self, q0)
123
124
--> 125 q_new = metrop_select(self.delta_logp(q,q0), q, q0)
126
127 if q_new is q:
c:\program files\git\theano\theano\compile\function_module.py in __call__(self, *args, **kwargs)
869 node=self.fn.nodes[self.fn.position_of_error],
870 thunk=thunk,
--> 871 storage_map=getattr(self.fn, 'storage_map', None))
872 else:
873 # old-style linkers raise their own exceptions
c:\program files\git\theano\theano\gof\link.py in raise_with_op(node, thunk, exc_info, storage_map)
312 # extra long error message in that case.
313 pass
--> 314 reraise(exc_type, exc_value, exc_trace)
315
316
C:\Users\User\Anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
656 value = tp()
657 if value.__traceback__ is not tb:
--> 658 raise value.with_traceback(tb)
659 raise value
660
c:\program files\git\theano\theano\compile\function_module.py in __call__(self, *args, **kwargs)
857 t0_fn = time.time()
858 try:
--> 859 outputs = self.fn()
860 except Exception:
861 if hasattr(self.fn, 'position_of_error'):
TypeError: expected type_num 9 (NPY_INT64) got 7
Apply node that caused the error: Elemwise{Composite{Switch(GE(i0, i1), i2, i3)}}(InplaceDimShuffle{x}.0, TensorConstant{[ 0 1..1098 1099]}, InplaceDimShuffle{x}.0, InplaceDimShuffle{x}.0)
Toposort index: 11
Inputs types: [TensorType(int64, (True,)), TensorType(int32, vector), TensorType(float64, (True,)), TensorType(float64, (True,))]
Inputs shapes: [(1,), (1100,), (1,), (1,)]
Inputs strides: [(4,), (4,), (8,), (8,)]
Inputs values: [array([549]), 'not shown', array([ 1.07762995]), array([ 1.01502801])]
Outputs clients: [[Elemwise{eq,no_inplace}(Elemwise{Composite{Switch(GE(i0, i1), i2, i3)}}.0, TensorConstant{(1,) of 0}), Elemwise{Composite{Switch(GE(i0, i1), ((Switch(i2, i3, (i4 * log(i0))) - i5) - i0), i3)}}[(0, 0)](Elemwise{Composite{Switch(GE(i0, i1), i2, i3)}}.0, TensorConstant{(1,) of 0}, InplaceDimShuffle{x}.0, TensorConstant{(1,) of -inf}, TensorConstant{[ 13. 13... 0. 1.]}, TensorConstant{[ 22.55216... ]})]]
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
Being a simple analyst, do I need to learn all this Theano machinery to be able to work on my statistical problems? Is a new MCMC sampler with gradient support the only thing that should motivate me to switch from pymc2 to pymc3?
For your first question, it looks like you're trying to pass a theano function as a variable. You need to call the function with the other variables as arguments, which will then return a theano variable. Try changing your line to
disasters=pymc3.Poisson('disasters', mu=rate1(switchpoint, mu1, mu2), observed = data)
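In case it helps, here is an untested sketch of the whole first model with the decorated function wired in, assuming the @theano.compile.ops.as_op decorator from your question is applied (that is what lets rate1 be called on theano variables):
import numpy as np
import pymc3
import theano
import theano.tensor as t

data = np.hstack((np.random.poisson(15, 1000), np.random.poisson(2, 100)))

@theano.compile.ops.as_op(itypes=[t.lscalar, t.dscalar, t.dscalar], otypes=[t.dvector])
def rate1(sw, mu1, mu2):
    out = np.empty(len(data))
    out[:sw] = mu1
    out[sw:] = mu2
    return out

with pymc3.Model() as dis:
    switchpoint = pymc3.DiscreteUniform('switchpoint', lower=0, upper=len(data) - 1)
    mu1 = pymc3.Exponential('mu1', lam=1.)
    mu2 = pymc3.Exponential('mu2', lam=1.)
    # calling rate1 returns a theano variable, which Poisson can accept as mu
    disasters = pymc3.Poisson('disasters', mu=rate1(switchpoint, mu1, mu2), observed=data)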
I couldn't reproduce the error in your second part; the sampling worked just fine for me.