I am running some simple code for KMeans:
# Imports implied by the snippet (df and num_col are defined earlier in the notebook)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Scaling the data set before clustering
scaler = StandardScaler()
subset = df[num_col].copy()
subset_scaled = scaler.fit_transform(subset)
subset_scaled_df = pd.DataFrame(subset_scaled, columns=subset.columns)

clusters = range(1, 9)
meanDistortions = []

for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(subset_scaled_df)
    prediction = model.predict(subset_scaled_df)

    # Average distance of each point to its closest cluster centre
    distortion = (
        sum(
            np.min(cdist(subset_scaled_df, model.cluster_centers_, "euclidean"), axis=1)
        )
        / subset_scaled_df.shape[0]
    )

    meanDistortions.append(distortion)
    print("Number of Clusters:", k, "\tAverage Distortion:", distortion)

plt.plot(clusters, meanDistortions, "bx-")
plt.xlabel("k")
plt.ylabel("Average Distortion")
plt.title("Selecting k with the Elbow Method", fontsize=20)
Running into the following error:
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-173-4b988580ff32> in <module>
11 for k in clusters:
12 model = KMeans(n_clusters=k)
---> 13 model.fit(subset_scaled_df)
14 prediction = model.predict(subset_scaled_df)
15 distortion = (
/usr/local/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py in fit(self, X, y, sample_weight)
1006 if self._algorithm == "full":
1007 kmeans_single = _kmeans_single_lloyd
-> 1008 self._check_mkl_vcomp(X, X.shape[0])
1009 else:
1010 kmeans_single = _kmeans_single_elkan
/usr/local/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py in _check_mkl_vcomp(self, X, n_samples)
872 active_threads = int(np.ceil(n_samples / CHUNK_SIZE))
873 if active_threads < self._n_threads:
--> 874 modules = threadpool_info()
875 has_vcomp = "vcomp" in [module["prefix"] for module in modules]
876 has_mkl = ("mkl", "intel") in [
/usr/local/lib/python3.9/site-packages/threadpoolctl.py in threadpool_info()
122 In addition, each module may contain internal_api specific entries.
123 """
--> 124 return _ThreadpoolInfo(user_api=_ALL_USER_APIS).todicts()
125
126
/usr/local/lib/python3.9/site-packages/threadpoolctl.py in __init__(self, user_api, prefixes, modules)
338
339 self.modules = []
--> 340 self._load_modules()
341 self._warn_if_incompatible_openmp()
342 else:
/usr/local/lib/python3.9/site-packages/threadpoolctl.py in _load_modules(self)
369 """Loop through loaded libraries and store supported ones"""
370 if sys.platform == "darwin":
--> 371 self._find_modules_with_dyld()
372 elif sys.platform == "win32":
373 self._find_modules_with_enum_process_module_ex()
/usr/local/lib/python3.9/site-packages/threadpoolctl.py in _find_modules_with_dyld(self)
426
427 # Store the module if it is supported and selected
--> 428 self._make_module_from_path(filepath)
429
430 def _find_modules_with_enum_process_module_ex(self):
/usr/local/lib/python3.9/site-packages/threadpoolctl.py in _make_module_from_path(self, filepath)
513 if prefix in self.prefixes or user_api in self.user_api:
514 module_class = globals()[module_class]
--> 515 module = module_class(filepath, prefix, user_api, internal_api)
516 self.modules.append(module)
517
/usr/local/lib/python3.9/site-packages/threadpoolctl.py in __init__(self, filepath, prefix, user_api, internal_api)
603 self.user_api = user_api
604 self.internal_api = internal_api
--> 605 self._dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
606 self.version = self.get_version()
607 self.num_threads = self.get_num_threads()
/usr/local/Cellar/python@3.9/3.9.1_4/Frameworks/Python.framework/Versions/3.9/lib/python3.9/ctypes/__init__.py in __init__(self, name, mode, handle, use_errno, use_last_error, winmode)
372
373 if handle is None:
--> 374 self._handle = _dlopen(self._name, mode)
375 else:
376 self._handle = handle
OSError: image not already loaded
However, if I replace the above code with the following, it works fine:
clusters = range(1, 9)
meanDistortions = []
for k in clusters:
    model = KMeans(n_clusters=8)
Instead of passing "k" into KMeans(n_clusters=), if I pass a hard-coded integer it works fine. I am not able to understand what is going wrong; any pointers would be greatly appreciated.
Thanks!
Try updating your scikit-learn package:
pip install -U scikit-learn
I assume it throws the error for k=1 and works for k > 1, which would also explain your working modification. So you could use range(2, 9) as a quick fix. I'm observing the same in my most recent sklearn environment (0.24.2).
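For reference, a minimal sketch of that quick fix, reusing the variables and loop body from the question (everything else stays the same):

clusters = range(2, 9)  # skip k=1, which appears to trigger the error
meanDistortions = []

for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(subset_scaled_df)
    distortion = (
        sum(np.min(cdist(subset_scaled_df, model.cluster_centers_, "euclidean"), axis=1))
        / subset_scaled_df.shape[0]
    )
    meanDistortions.append(distortion)
    print("Number of Clusters:", k, "\tAverage Distortion:", distortion)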
I am trying to execute the code below, but I keep getting an assertion error:
import pandas as pd  # import implied by the snippet

from sktime.forecasting.base import ForecastingHorizon

fh = ForecastingHorizon(
    pd.PeriodIndex(pd.date_range("2023-01-01", periods=6, freq="M")), is_relative=False
)
cutoff = pd.Period("2022-12-01", freq="M")
fh.to_relative(cutoff)

from sktime.forecasting.naive import NaiveForecaster

forecaster = NaiveForecaster(strategy="last", sp=12)
forecaster.fit(water_data)
water_data.info()
y_pred = forecaster.predict(fh)
The error stack looks like this:
AssertionError Traceback (most recent call last)
Cell In[111], line 1
----> 1 y_pred = forecaster.predict(fh)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_base.py:404, in BaseForecaster.predict(self, fh, X)
402 # we call the ordinary _predict if no looping/vectorization needed
403 if not self._is_vectorized:
--> 404 y_pred = self._predict(fh=fh, X=X_inner)
405 else:
406 # otherwise we call the vectorized version of predict
407 y_pred = self._vectorize("predict", X=X_inner, fh=fh)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\naive.py:336, in NaiveForecaster._predict(self, fh, X)
326 def _predict(self, fh=None, X=None):
327 """Forecast time series at future horizon.
328
329 Parameters
(...)
334 Exogenous time series
335 """
--> 336 y_pred = super(NaiveForecaster, self)._predict(fh=fh, X=X)
338 # test_predict_time_index_in_sample_full[ForecastingPipeline-0-int-int-True]
339 # causes a pd.DataFrame to appear as y_pred, which upsets the next lines
340 # reasons are unclear, this is coming from the _BaseWindowForecaster
341 # todo: investigate this
342 if isinstance(y_pred, pd.DataFrame):
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_sktime.py:30, in _BaseWindowForecaster._predict(self, fh, X)
27 kwargs = {"X": X}
29 # all values are out-of-sample
---> 30 if fh.is_all_out_of_sample(self.cutoff):
31 return self._predict_fixed_cutoff(
32 fh.to_out_of_sample(self.cutoff), **kwargs
33 )
35 # all values are in-sample
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_fh.py:612, in ForecastingHorizon.is_all_out_of_sample(self, cutoff)
597 def is_all_out_of_sample(self, cutoff=None) -> bool:
598 """Whether the forecasting horizon is purely out-of-sample for given cutoff.
599
600 Parameters
(...)
610 cutoff.
611 """
--> 612 return sum(self._is_out_of_sample(cutoff)) == len(self)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_fh.py:595, in ForecastingHorizon._is_out_of_sample(self, cutoff)
593 def _is_out_of_sample(self, cutoff=None) -> np.ndarray:
594 """Get index location of out-of-sample values."""
--> 595 return np.logical_not(self._is_in_sample(cutoff))
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_fh.py:574, in ForecastingHorizon._is_in_sample(self, cutoff)
572 def _is_in_sample(self, cutoff=None) -> np.ndarray:
573 """Get index location of in-sample values."""
--> 574 relative = self.to_relative(cutoff).to_pandas()
575 null = 0 if is_integer_index(relative) else pd.Timedelta(0)
576 return relative <= null
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_fh.py:461, in ForecastingHorizon.to_relative(self, cutoff)
446 """Return forecasting horizon values relative to a cutoff.
447
448 Parameters
(...)
458 Relative representation of forecasting horizon.
459 """
460 cutoff = self._coerce_cutoff_to_index_element(cutoff)
--> 461 return _to_relative(fh=self, cutoff=cutoff)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_fh.py:682, in _to_relative(fh, cutoff)
680 else:
681 absolute = fh.to_pandas()
--> 682 _check_cutoff(cutoff, absolute)
684 if isinstance(absolute, pd.DatetimeIndex):
685 # coerce to pd.Period for reliable arithmetics and computations of
686 # time deltas
687 absolute = _coerce_to_period(absolute, freq=fh.freq)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_fh.py:776, in _check_cutoff(cutoff, index)
773 raise ValueError("`cutoff` must be given, but found none.")
775 if isinstance(index, pd.PeriodIndex):
--> 776 assert isinstance(cutoff, pd.Period)
777 assert index.freqstr == cutoff.freqstr
779 if isinstance(index, pd.DatetimeIndex):
AssertionError:
water_data has 49 data points with a datetime index and floating point values.
What could be the issue? I am following the tutorial on the official website. Could the problem be the data I am passing to the model?
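For anyone trying to reproduce this, here is a hedged sketch of a synthetic stand-in for water_data matching that description; the start date, monthly frequency, and values are assumptions, not the real data:

import numpy as np
import pandas as pd

# Hypothetical stand-in: 49 observations, datetime index, floating point values
idx = pd.date_range("2019-01-01", periods=49, freq="M")
water_data = pd.Series(np.random.default_rng(0).random(49), index=idx, name="water")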
I am doing a simple PCA analysis with some satellite data. All the land points are removed, and the mean and standard deviation are close to 0 and 1. However, I run into an error with the following code:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler_sst = scaler.fit(sss_data)
import joblib
joblib.dump(scaler_sst, './scaler_sst.pkl', compress=9)
scaler_sst = joblib.load('./scaler_sst.pkl')
X = scaler_sst.transform(sss_data)
print(X.mean())
print(X.std())
#X.shape
5.7725416769826885e-15
0.9999999999999993
from sklearn.decomposition import pca
skpca=pca.PCA()
skpca.fit(X)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/IPython/core/formatters.py in __call__(self, obj, include, exclude)
968
969 if method is not None:
--> 970 return method(include=include, exclude=exclude)
971 return None
972 else:
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/sklearn/base.py in _repr_mimebundle_(self, **kwargs)
461 def _repr_mimebundle_(self, **kwargs):
462 """Mime bundle used by jupyter kernels to display estimator"""
--> 463 output = {"text/plain": repr(self)}
464 if get_config()["display"] == 'diagram':
465 output["text/html"] = estimator_html_repr(self)
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/sklearn/base.py in __repr__(self, N_CHAR_MAX)
273
274 # use ellipsis for sequences with a lot of elements
--> 275 pp = _EstimatorPrettyPrinter(
276 compact=True, indent=1, indent_at_name=True,
277 n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW)
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/sklearn/utils/_pprint.py in __init__(self, indent, width, depth, stream, compact, indent_at_name, n_max_elements_to_show)
162 if self._indent_at_name:
163 self._indent_per_level = 1 # ignore indent param
--> 164 self._changed_only = get_config()['print_changed_only']
165 # Max number of elements in a list, dict, tuple until we start using
166 # ellipsis. This also affects the number of arguments of an estimators
KeyError: 'print_changed_only'
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/IPython/core/formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/IPython/lib/pretty.py in pretty(self, obj)
392 if cls is not object \
393 and callable(cls.__dict__.get('__repr__')):
--> 394 return _repr_pprint(obj, self, cycle)
395
396 return _default_pprint(obj, self, cycle)
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
698 """A pprint that just redirects to the normal repr function."""
699 # Find newlines and replace them with p.break_()
--> 700 output = repr(obj)
701 lines = output.splitlines()
702 with p.group():
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/sklearn/base.py in __repr__(self, N_CHAR_MAX)
273
274 # use ellipsis for sequences with a lot of elements
--> 275 pp = _EstimatorPrettyPrinter(
276 compact=True, indent=1, indent_at_name=True,
277 n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW)
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/sklearn/utils/_pprint.py in __init__(self, indent, width, depth, stream, compact, indent_at_name, n_max_elements_to_show)
162 if self._indent_at_name:
163 self._indent_per_level = 1 # ignore indent param
--> 164 self._changed_only = get_config()['print_changed_only']
165 # Max number of elements in a list, dict, tuple until we start using
166 # ellipsis. This also affects the number of arguments of an estimators
KeyError: 'print_changed_only'
The error occurs at skpca.fit(X). I reinstalled the sklearn and scikit packages. I have used PCA with sklearn before and this has never happened.
I don't know the answer, but maybe this is a bug in sklearn. Try:
import sklearn
sklearn.get_config()
In my case it returns a dict:
{'assume_finite': False, 'working_memory': 1024, 'print_changed_only': False}
The error indicates that the print_changed_only key does not exist for you.
My sklearn version is '0.21.2' on Python 3.6. Maybe it helps to downgrade your sklearn version?
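Before downgrading, it may also be worth checking which scikit-learn installation is actually being imported, in case an old copy in the environment is shadowing the reinstalled one. A small diagnostic sketch, not a guaranteed fix:

import sklearn

print(sklearn.__version__)   # the version the interpreter actually sees
print(sklearn.__file__)      # where it was loaded from
print(sklearn.get_config())  # recent versions include the 'print_changed_only' key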
I need to load some meteorological data to analyze several months, but the data is stored in files that each cover only one day, so I need to access many files at once.
I am following some instructions I was given, which told me to create a memory partition on my computer.
from datetime import datetime, timedelta
import dask.array as da
from dask.distributed import Client, LocalCluster
import xarray
try:
    client
except NameError:
    client = Client(n_workers=1, threads_per_worker=4, memory_limit='2GB')
else:
    print("Client already exists")
After this, I create an array of dates that goes from 1st June to 1st October, which is needed in "files" to build the links to the meteorological data.
dates=[datetime(2019,6,1) + timedelta(days=i) for i in range(3*30)]
files= [date.strftime('http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/%Y/%m/wrf_arw_det_history_d03_%Y%m%d_0000.nc4') for date in dates]
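As a quick sanity check (an illustrative snippet, not part of the original code), the generated links can be inspected before opening them:

print(len(files))   # one URL per day in the range
print(files[0])     # first generated URL
print(files[-1])    # last generated URL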
My issue starts when I try to open all that data with:
multi = xarray.open_mfdataset(files, preprocess= lambda a : a.isel(time=slice(0,24)))
It raises the error:
KeyError Traceback (most recent call last)
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in _acquire_with_cache_info(self, needs_lock)
197 try:
--> 198 file = self._cache[self._key]
199 except KeyError:
~\Nueva carpeta\lib\site-packages\xarray\backends\lru_cache.py in __getitem__(self, key)
52 with self._lock:
---> 53 value = self._cache[key]
54 self._cache.move_to_end(key)
KeyError: [<class 'netCDF4._netCDF4.Dataset'>, ('http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/2019/06/wrf_arw_det_history_d03_20190626_0000.nc4',), 'r', (('clobber', True), ('diskless', False), ('format', 'NETCDF4'), ('persist', False))]
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
<ipython-input-19-c3d0f4a8cc26> in <module>
----> 1 multi = xarray.open_mfdataset(files, preprocess= lambda a : a.isel(time=slice(0,24)))
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in open_mfdataset(paths, chunks, concat_dim, compat, preprocess, engine, lock, data_vars, coords, combine, autoclose, parallel, join, attrs_file, **kwargs)
916 getattr_ = getattr
917
--> 918 datasets = [open_(p, **open_kwargs) for p in paths]
919 file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
920 if preprocess is not None:
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in <listcomp>(.0)
916 getattr_ = getattr
917
--> 918 datasets = [open_(p, **open_kwargs) for p in paths]
919 file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
920 if preprocess is not None:
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in open_dataset(filename_or_obj, group, decode_cf, mask_and_scale, decode_times, autoclose, concat_characters, decode_coords, engine, chunks, lock, cache, drop_variables, backend_kwargs, use_cftime, decode_timedelta)
507 if engine == "netcdf4":
508 store = backends.NetCDF4DataStore.open(
--> 509 filename_or_obj, group=group, lock=lock, **backend_kwargs
510 )
511 elif engine == "scipy":
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in open(cls, filename, mode, format, group, clobber, diskless, persist, lock, lock_maker, autoclose)
356 netCDF4.Dataset, filename, mode=mode, kwargs=kwargs
357 )
--> 358 return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)
359
360 def _acquire(self, needs_lock=True):
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in __init__(self, manager, group, mode, lock, autoclose)
312 self._group = group
313 self._mode = mode
--> 314 self.format = self.ds.data_model
315 self._filename = self.ds.filepath()
316 self.is_remote = is_remote_uri(self._filename)
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in ds(self)
365 @property
366 def ds(self):
--> 367 return self._acquire()
368
369 def open_store_variable(self, name, var):
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in _acquire(self, needs_lock)
359
360 def _acquire(self, needs_lock=True):
--> 361 with self._manager.acquire_context(needs_lock) as root:
362 ds = _nc4_require_group(root, self._group, self._mode)
363 return ds
~\Nueva carpeta\lib\contextlib.py in __enter__(self)
110 del self.args, self.kwds, self.func
111 try:
--> 112 return next(self.gen)
113 except StopIteration:
114 raise RuntimeError("generator didn't yield") from None
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in acquire_context(self, needs_lock)
184 def acquire_context(self, needs_lock=True):
185 """Context manager for acquiring a file."""
--> 186 file, cached = self._acquire_with_cache_info(needs_lock)
187 try:
188 yield file
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in _acquire_with_cache_info(self, needs_lock)
202 kwargs = kwargs.copy()
203 kwargs["mode"] = self._mode
--> 204 file = self._opener(*self._args, **kwargs)
205 if self._mode == "w":
206 # ensure file doesn't get overriden when opened again
netCDF4\_netCDF4.pyx in netCDF4._netCDF4.Dataset.__init__()
netCDF4\_netCDF4.pyx in netCDF4._netCDF4._ensure_nc_success()
OSError: [Errno -37] NetCDF: Write to read only: b'http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/2019/06/wrf_arw_det_history_d03_20190626_0000.nc4'
Does anyone know why this error occurs?
I am trying to load a ".unf" file in a Jupyter environment using the HyperSpy library, but I get this error.
import hyperspy.api as hs
data = hs.load("/path/to/file/PRC.unf")
This is the error:
ValueError Traceback (most recent call last)
<ipython-input-7-b0117f505d01> in <module>
----> 1 data = hs.load("/home/vahid/PythonProjects/UNFfiles/PRC.unf")
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io.py in load(filenames, signal_type, stack, stack_axis, new_axis_name, lazy, convert_units, **kwds)
279 objects = [load_single_file(filename, lazy=lazy,
280 **kwds)
--> 281 for filename in filenames]
282
283 if len(objects) == 1:
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io.py in <listcomp>(.0)
279 objects = [load_single_file(filename, lazy=lazy,
280 **kwds)
--> 281 for filename in filenames]
282
283 if len(objects) == 1:
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io.py in load_single_file(filename, **kwds)
316 else:
317 reader = io_plugins[i]
--> 318 return load_with_reader(filename=filename, reader=reader, **kwds)
319
320
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io.py in load_with_reader(filename, reader, signal_type, convert_units, **kwds)
323 lazy = kwds.get('lazy', False)
324 file_data_list = reader.file_reader(filename,
--> 325 **kwds)
326 objects = []
327
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io_plugins/semper_unf.py in file_reader(filename, **kwds)
703 def file_reader(filename, **kwds):
704 lazy = kwds.get('lazy', False)
--> 705 semper = SemperFormat.load_from_unf(filename, lazy=lazy)
706 semper.log_info()
707 return [semper.to_signal(lazy=lazy)._to_dictionary()]
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io_plugins/semper_unf.py in load_from_unf(cls, filename, lazy)
386 :rec_length //
387 2],
--> 388 count=1)
389 metadata.update(sarray2dict(header))
390 assert np.frombuffer(f.read(4), dtype=np.int32)[0] == rec_length, \
ValueError: field 'IFORM' occurs more than once
I am not sure what the error is about. Apparently, "IFORM" is some sort of dictionary key in this type of data structure. I would appreciate it if anyone could help me address this problem.
Hey, I have a very short question. I need to load data for my machine learning course, but it does not work for me and I have no idea why. I'm using Jupyter with Python 3.
My Code:
from sklearn.datasets import fetch_covtype
forest = fetch_covtype()
For my friend it works fine under the same conditions. I already tried to update sklearn with pip install -U scikit-learn, but it did not solve the problem. I hope somebody can help me.
It creates the following error:
UnboundLocalError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/covtype.py in fetch_covtype(data_home, download_if_missing, random_state, shuffle, return_X_y)
126 try:
--> 127 X, y
128 except NameError:
UnboundLocalError: local variable 'X' referenced before assignment
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-9-fb303a92b6ca> in <module>
----> 1 forest =fetch_covtype()
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/covtype.py in fetch_covtype(data_home, download_if_missing, random_state, shuffle, return_X_y)
127 X, y
128 except NameError:
--> 129 X, y = _refresh_cache([samples_path, targets_path], 9)
130 # TODO: Revert to the following two lines in v0.23
131 # X = joblib.load(samples_path)
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/base.py in _refresh_cache(files, compress)
928 msg = "sklearn.externals.joblib is deprecated in 0.21"
929 with warnings.catch_warnings(record=True) as warns:
--> 930 data = tuple([joblib.load(f) for f in files])
931
932 refresh_needed = any([str(x.message).startswith(msg) for x in warns])
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/base.py in <listcomp>(.0)
928 msg = "sklearn.externals.joblib is deprecated in 0.21"
929 with warnings.catch_warnings(record=True) as warns:
--> 930 data = tuple([joblib.load(f) for f in files])
931
932 refresh_needed = any([str(x.message).startswith(msg) for x in warns])
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in load(filename, mmap_mode)
603 return load_compatibility(fobj)
604
--> 605 obj = _unpickle(fobj, filename, mmap_mode)
606
607 return obj
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in _unpickle(fobj, filename, mmap_mode)
527 obj = None
528 try:
--> 529 obj = unpickler.load()
530 if unpickler.compat_mode:
531 warnings.warn("The file '%s' has been generated with a "
/opt/conda/lib/python3.7/pickle.py in load(self)
1083 raise EOFError
1084 assert isinstance(key, bytes_types)
-> 1085 dispatch[key[0]](self)
1086 except _Stop as stopinst:
1087 return stopinst.value
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in load_build(self)
353 if isinstance(array_wrapper, NDArrayWrapper):
354 self.compat_mode = True
--> 355 self.stack.append(array_wrapper.read(self))
356
357 # Be careful to register our new method.
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in read(self, unpickler)
196 array = self.read_mmap(unpickler)
197 else:
--> 198 array = self.read_array(unpickler)
199
200 # Manage array subclass case
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in read_array(self, unpickler)
147 read_size = int(read_count * self.dtype.itemsize)
148 data = _read_bytes(unpickler.file_handle,
--> 149 read_size, "array data")
150 array[i:i + read_count] = \
151 unpickler.np.frombuffer(data, dtype=self.dtype,
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle_utils.py in _read_bytes(fp, size, error_template)
241 if len(data) != size:
242 msg = "EOF: reading %s, expected %d bytes got %d"
--> 243 raise ValueError(msg % (error_template, size, len(data)))
244 else:
245 return data
ValueError: EOF: reading array data, expected 262144 bytes got 209661