Operations on a Dask DataFrame fail when using snappy compression

Operations on a Dask DataFrame fail when using snappy compression - python

I partitioned a large dataset into a sequence of parquet files using pandas.DataFrame.to_parquet and saved them to S3. I then read these into Dask on a cluster using dask.read_parquet:
import dask.dataframe as dd
df = dd.read_parquet(
's3://aleksey-emr-dask/data/2019-taxi-dataset/',
storage_options={'key': 'secret', 'secret': 'secret'},
engine='fastparquet'
)
pandas uses snappy compression by default. fastparquet is able to work with this compression so long as you install the python-snappy and snappy packages. Since I am running on AWS EMR, and using Dask's EMR example bootstrap script, I have installed these packages from conda-forge using the --botstrap-actions flag and the --conda-packages optional argument:
python3 -m pip list | grep snappy
python-snappy 0.5.4
This is enough to make dd.read_parquet succeed. However, certain operations fail with KeyError: snappy. For example, this fails:
passenger_counts = df.trip_distance.value_counts().compute()
I know this is not an error with the cluster configuration because other operations, like this one, succeed:
vendors = df.VendorID.value_counts().compute()
> 2.0 53516733
> 1.0 30368157
> 4.0 267080
> Name: VendorID, dtype: int64
Which leads to my question. Does Dask not support snappy compression, even if its IO engine (fastparquet in this case) does?
Here is the full body of the error message:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<timed exec> in <module>
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/dask/base.py in compute(self, **kwargs)
165 dask.base.compute
166 """
--> 167 (result,) = compute(self, traverse=False, **kwargs)
168 return result
169
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/dask/base.py in compute(*args, **kwargs)
445 postcomputes.append(x.__dask_postcompute__())
446
--> 447 results = schedule(dsk, keys, **kwargs)
448 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
449
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2686 should_rejoin = False
2687 try:
-> 2688 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
2689 finally:
2690 for f in futures.values():
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
1986 direct=direct,
1987 local_worker=local_worker,
-> 1988 asynchronous=asynchronous,
1989 )
1990
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
831 else:
832 return sync(
--> 833 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
834 )
835
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
337 if error[0]:
338 typ, exc, tb = error[0]
--> 339 raise exc.with_traceback(tb)
340 else:
341 return result[0]
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/utils.py in f()
321 if callback_timeout is not None:
322 future = asyncio.wait_for(future, callback_timeout)
--> 323 result[0] = yield future
324 except Exception as exc:
325 error[0] = sys.exc_info()
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/tornado/gen.py in run(self)
733
734 try:
--> 735 value = future.result()
736 except Exception:
737 exc_info = sys.exc_info()
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1874 else:
1875 self._gather_future = future
-> 1876 response = await future
1877
1878 if response["status"] == "error":
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in _gather_remote(self, direct, local_worker)
1925
1926 else: # ask scheduler to gather data for us
-> 1927 response = await retry_operation(self.scheduler.gather, keys=keys)
1928
1929 return response
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/utils_comm.py in retry_operation(coro, operation, *args, **kwargs)
388 delay_min=retry_delay_min,
389 delay_max=retry_delay_max,
--> 390 operation=operation,
391 )
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/utils_comm.py in retry(coro, count, delay_min, delay_max, jitter_fraction, retry_on_exceptions, operation)
368 delay *= 1 + random.random() * jitter_fraction
369 await asyncio.sleep(delay)
--> 370 return await coro()
371
372
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/core.py in send_recv_from_rpc(**kwargs)
859 name, comm.name = comm.name, "ConnectionPool." + key
860 try:
--> 861 result = await send_recv(comm=comm, op=key, **kwargs)
862 finally:
863 self.pool.reuse(self.addr, comm)
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/core.py in send_recv(comm, reply, serializers, deserializers, **kwargs)
642 await comm.write(msg, serializers=serializers, on_error="raise")
643 if reply:
--> 644 response = await comm.read(deserializers=deserializers)
645 else:
646 response = None
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/comm/tcp.py in read(self, deserializers)
204 deserialize=self.deserialize,
205 deserializers=deserializers,
--> 206 allow_offload=self.allow_offload,
207 )
208 except EOFError:
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/comm/utils.py in from_frames(frames, deserialize, deserializers, allow_offload)
85 res = await offload(_from_frames)
86 else:
---> 87 res = _from_frames()
88
89 return res
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/comm/utils.py in _from_frames()
64 try:
65 return protocol.loads(
---> 66 frames, deserialize=deserialize, deserializers=deserializers
67 )
68 except EOFError:
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/protocol/core.py in loads(frames, deserialize, deserializers)
126 if deserialize or key in bytestrings:
127 if "compression" in head:
--> 128 fs = decompress(head, fs)
129 fs = merge_frames(head, fs)
130 value = _deserialize(head, fs, deserializers=deserializers)
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/protocol/compression.py in decompress(header, frames)
214 return [
215 compressions[c]["decompress"](frame)
--> 216 for c, frame in zip(header["compression"], frames)
217 ]
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/protocol/compression.py in <listcomp>(.0)
214 return [
215 compressions[c]["decompress"](frame)
--> 216 for c, frame in zip(header["compression"], frames)
217 ]
KeyError: 'snappy'

You need to have snappy and python-snappy installed in the client environment as well, so that the worker can use the codec to turn source bytes into data.
I'm accessing the cluster from a local Jupyter notebook on my machine via SSH port forwarding, and did not have these packages installed locally. Installing them in my local env:
$ conda install -c conda-forge snappy python-snappy
Resolved the issue.

Related

FSSpec Error Handling in Python - Timeout Error

I am trying to get Terraclimate Data from Microsoft Planetary and facing time out error. Is there a possiblity of increasing the timeout time ? Please find the code below and the error I am facing. I am using fsspec and xarray for downloading spatial data from MS Planetary portal.
import fsspec
import xarray as xr
store = fsspec.get_mapper(asset.href)
data = xr.open_zarr(store, **asset.extra_fields["xarray:open_kwargs"])
clipped_data = data.sel(time=slice('2015-01-01','2019-12-31'),lon=slice(min_lon,max_lon),lat=slice(max_lat,min_lat))
parsed_data = clipped_data[['tmax', 'tmin', 'ppt', 'soil']]
lat_list = parsed_data['lat'].values.tolist()
lon_list = parsed_data['lon'].values.tolist()
filename = "Soil_Moisture_sample.csv"
for(i,j) in zip(lat_list,lon_list):
parsed_data[["soil","tmax","tmin","ppt"]].sel(lon=i, lat=j, method="nearest").to_dataframe().to_csv(filename,mode='a',index=False, header=False)
I am getting the following error
TimeoutError Traceback (most recent call last)
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:53, in _runner(event, coro, result, timeout)
52 try:
---> 53 result[0] = await coro
54 except Exception as ex:
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:423, in AsyncFileSystem._cat(self, path, recursive, on_error, batch_size, **kwargs)
422 if ex:
--> 423 raise ex
424 if (
425 len(paths) > 1
426 or isinstance(path, list)
427 or paths[0] != self._strip_protocol(path)
428 ):
File ~\Anaconda3\envs\satellite\lib\asyncio\tasks.py:455, in wait_for(fut, timeout, loop)
454 if timeout is None:
--> 455 return await fut
457 if timeout <= 0:
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\implementations\http.py:221, in HTTPFileSystem._cat_file(self, url, start, end, **kwargs)
220 async with session.get(url, **kw) as r:
--> 221 out = await r.read()
222 self._raise_not_found_for_status(r, url)
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\client_reqrep.py:1036, in ClientResponse.read(self)
1035 try:
-> 1036 self._body = await self.content.read()
1037 for trace in self._traces:
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\streams.py:375, in StreamReader.read(self, n)
374 while True:
--> 375 block = await self.readany()
376 if not block:
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\streams.py:397, in StreamReader.readany(self)
396 while not self._buffer and not self._eof:
--> 397 await self._wait("readany")
399 return self._read_nowait(-1)
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\streams.py:304, in StreamReader._wait(self, func_name)
303 with self._timer:
--> 304 await waiter
305 else:
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\helpers.py:721, in TimerContext.__exit__(self, exc_type, exc_val, exc_tb)
720 if exc_type is asyncio.CancelledError and self._cancelled:
--> 721 raise asyncio.TimeoutError from None
722 return None
TimeoutError:
The above exception was the direct cause of the following exception:
FSTimeoutError Traceback (most recent call last)
Input In [62], in <cell line: 3>()
1 # Flood Region Point - Thiruvanthpuram
2 filename = "Soil_Moisture_sample.csv"
----> 3 parsed_data[["soil","tmax","tmin","ppt"]].sel(lon=8.520833, lat=76.4375, method="nearest").to_dataframe().to_csv(filename,mode='a',index=False, header=False)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\dataset.py:5898, in Dataset.to_dataframe(self, dim_order)
5870 """Convert this dataset into a pandas.DataFrame.
5871
5872 Non-index variables in this dataset form the columns of the
(...)
5893
5894 """
5896 ordered_dims = self._normalize_dim_order(dim_order=dim_order)
-> 5898 return self._to_dataframe(ordered_dims=ordered_dims)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\dataset.py:5862, in Dataset._to_dataframe(self, ordered_dims)
5860 def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
5861 columns = [k for k in self.variables if k not in self.dims]
-> 5862 data = [
5863 self._variables[k].set_dims(ordered_dims).values.reshape(-1)
5864 for k in columns
5865 ]
5866 index = self.coords.to_index([*ordered_dims])
5867 return pd.DataFrame(dict(zip(columns, data)), index=index)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\dataset.py:5863, in <listcomp>(.0)
5860 def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
5861 columns = [k for k in self.variables if k not in self.dims]
5862 data = [
-> 5863 self._variables[k].set_dims(ordered_dims).values.reshape(-1)
5864 for k in columns
5865 ]
5866 index = self.coords.to_index([*ordered_dims])
5867 return pd.DataFrame(dict(zip(columns, data)), index=index)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\variable.py:527, in Variable.values(self)
524 #property
525 def values(self):
526 """The variable's data as a numpy.ndarray"""
--> 527 return _as_array_or_item(self._data)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\variable.py:267, in _as_array_or_item(data)
253 def _as_array_or_item(data):
254 """Return the given values as a numpy array, or as an individual item if
255 it's a 0d datetime64 or timedelta64 array.
256
(...)
265 TODO: remove this (replace with np.asarray) once these issues are fixed
266 """
--> 267 data = np.asarray(data)
268 if data.ndim == 0:
269 if data.dtype.kind == "M":
File ~\AppData\Roaming\Python\Python38\site-packages\dask\array\core.py:1696, in Array.__array__(self, dtype, **kwargs)
1695 def __array__(self, dtype=None, **kwargs):
-> 1696 x = self.compute()
1697 if dtype and x.dtype != dtype:
1698 x = x.astype(dtype)
File ~\AppData\Roaming\Python\Python38\site-packages\dask\base.py:315, in DaskMethodsMixin.compute(self, **kwargs)
291 def compute(self, **kwargs):
292 """Compute this dask collection
293
294 This turns a lazy Dask collection into its in-memory equivalent.
(...)
313 dask.base.compute
314 """
--> 315 (result,) = compute(self, traverse=False, **kwargs)
316 return result
File ~\AppData\Roaming\Python\Python38\site-packages\dask\base.py:600, in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
597 keys.append(x.__dask_keys__())
598 postcomputes.append(x.__dask_postcompute__())
--> 600 results = schedule(dsk, keys, **kwargs)
601 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
File ~\AppData\Roaming\Python\Python38\site-packages\dask\threaded.py:89, in get(dsk, keys, cache, num_workers, pool, **kwargs)
86 elif isinstance(pool, multiprocessing.pool.Pool):
87 pool = MultiprocessingPoolExecutor(pool)
---> 89 results = get_async(
90 pool.submit,
91 pool._max_workers,
92 dsk,
93 keys,
94 cache=cache,
95 get_id=_thread_get_id,
96 pack_exception=pack_exception,
97 **kwargs,
98 )
100 # Cleanup pools associated to dead threads
101 with pools_lock:
File ~\AppData\Roaming\Python\Python38\site-packages\dask\local.py:511, in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
509 _execute_task(task, data) # Re-execute locally
510 else:
--> 511 raise_exception(exc, tb)
512 res, worker_id = loads(res_info)
513 state["cache"][key] = res
File ~\AppData\Roaming\Python\Python38\site-packages\dask\local.py:319, in reraise(exc, tb)
317 if exc.__traceback__ is not tb:
318 raise exc.with_traceback(tb)
--> 319 raise exc
File ~\AppData\Roaming\Python\Python38\site-packages\dask\local.py:224, in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
222 try:
223 task, data = loads(task_info)
--> 224 result = _execute_task(task, data)
225 id = get_id()
226 result = dumps((result, id))
File ~\AppData\Roaming\Python\Python38\site-packages\dask\core.py:119, in _execute_task(arg, cache, dsk)
115 func, args = arg[0], arg[1:]
116 # Note: Don't assign the subtask results to a variable. numpy detects
117 # temporaries by their reference count and can execute certain
118 # operations in-place.
--> 119 return func(*(_execute_task(a, cache) for a in args))
120 elif not ishashable(arg):
121 return arg
File ~\AppData\Roaming\Python\Python38\site-packages\dask\array\core.py:128, in getter(a, b, asarray, lock)
123 # Below we special-case `np.matrix` to force a conversion to
124 # `np.ndarray` and preserve original Dask behavior for `getter`,
125 # as for all purposes `np.matrix` is array-like and thus
126 # `is_arraylike` evaluates to `True` in that case.
127 if asarray and (not is_arraylike(c) or isinstance(c, np.matrix)):
--> 128 c = np.asarray(c)
129 finally:
130 if lock:
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\indexing.py:459, in ImplicitToExplicitIndexingAdapter.__array__(self, dtype)
458 def __array__(self, dtype=None):
--> 459 return np.asarray(self.array, dtype=dtype)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\indexing.py:623, in CopyOnWriteArray.__array__(self, dtype)
622 def __array__(self, dtype=None):
--> 623 return np.asarray(self.array, dtype=dtype)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\indexing.py:524, in LazilyIndexedArray.__array__(self, dtype)
522 def __array__(self, dtype=None):
523 array = as_indexable(self.array)
--> 524 return np.asarray(array[self.key], dtype=None)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\backends\zarr.py:76, in ZarrArrayWrapper.__getitem__(self, key)
74 array = self.get_array()
75 if isinstance(key, indexing.BasicIndexer):
---> 76 return array[key.tuple]
77 elif isinstance(key, indexing.VectorizedIndexer):
78 return array.vindex[
79 indexing._arrayize_vectorized_indexer(key, self.shape).tuple
80 ]
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:788, in Array.__getitem__(self, selection)
786 result = self.vindex[selection]
787 else:
--> 788 result = self.get_basic_selection(pure_selection, fields=fields)
789 return result
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:914, in Array.get_basic_selection(self, selection, out, fields)
911 return self._get_basic_selection_zd(selection=selection, out=out,
912 fields=fields)
913 else:
--> 914 return self._get_basic_selection_nd(selection=selection, out=out,
915 fields=fields)
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:957, in Array._get_basic_selection_nd(self, selection, out, fields)
951 def _get_basic_selection_nd(self, selection, out=None, fields=None):
952 # implementation of basic selection for array with at least one dimension
953
954 # setup indexer
955 indexer = BasicIndexer(selection, self)
--> 957 return self._get_selection(indexer=indexer, out=out, fields=fields)
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:1247, in Array._get_selection(self, indexer, out, fields)
1241 if not hasattr(self.chunk_store, "getitems") or \
1242 any(map(lambda x: x == 0, self.shape)):
1243 # sequentially get one key at a time from storage
1244 for chunk_coords, chunk_selection, out_selection in indexer:
1245
1246 # load chunk selection into output array
-> 1247 self._chunk_getitem(chunk_coords, chunk_selection, out, out_selection,
1248 drop_axes=indexer.drop_axes, fields=fields)
1249 else:
1250 # allow storage to get multiple items at once
1251 lchunk_coords, lchunk_selection, lout_selection = zip(*indexer)
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:1939, in Array._chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop_axes, fields)
1935 ckey = self._chunk_key(chunk_coords)
1937 try:
1938 # obtain compressed data for chunk
-> 1939 cdata = self.chunk_store[ckey]
1941 except KeyError:
1942 # chunk not initialized
1943 if self._fill_value is not None:
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\storage.py:717, in KVStore.__getitem__(self, key)
716 def __getitem__(self, key):
--> 717 return self._mutable_mapping[key]
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\mapping.py:137, in FSMap.__getitem__(self, key, default)
135 k = self._key_to_str(key)
136 try:
--> 137 result = self.fs.cat(k)
138 except self.missing_exceptions:
139 if default is not None:
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:111, in sync_wrapper.<locals>.wrapper(*args, **kwargs)
108 #functools.wraps(func)
109 def wrapper(*args, **kwargs):
110 self = obj or args[0]
--> 111 return sync(self.loop, func, *args, **kwargs)
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:94, in sync(loop, func, timeout, *args, **kwargs)
91 return_result = result[0]
92 if isinstance(return_result, asyncio.TimeoutError):
93 # suppress asyncio.TimeoutError, raise FSTimeoutError
---> 94 raise FSTimeoutError from return_result
95 elif isinstance(return_result, BaseException):
96 raise return_result
FSTimeoutError:

In the line:
store = fsspec.get_mapper(asset.href)
You can pass extra arguments to the fsspec backend, in this case HTTP, see fsspec.implementations.http.HTTPFileSystem. In this case, client_kwargs get passed to aiohttp.ClientSession, and include an optional timeout argument. Your call may look something like
from aiohttp import ClientTimeout
store = get_mapper(asset.href, client_kwargs={"timeout": ClientTimeout(total=5000, connect=1000)})

Scattering data to dask cluster workers: unknown address scheme 'gateway'

I am following the code found on the accepted answer to this SO question (the "Chunk then scatter" part) and I get a strange error while trying to scatter a pandas.DataFrame to the workers.
I am working in jupyter notebook if that matters.
I am not sure what this error means, it's quite cryptic, so any help would be greatly appreciated.
from dask_gateway import Gateway
import dask.dataframe as dd
import dask
gateway = Gateway()
options = gateway.cluster_options()
cluster = gateway.new_cluster(cluster_options=options)
cluster.scale(10)
client = cluster.get_client()
X_train = ... # build pandas.DataFrame
x = dd.from_pandas(X_train, npartitions=10)
x = x.persist(get=dask.threaded.get) # chunk locally
futures = client.scatter(dict(x.dask)) # scatter chunks
x.dask = x
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
/tmp/ipykernel_567/3586545525.py in <module>
1 x = dd.from_pandas(X_train, npartitions=10)
2 x = x.persist(get=dask.threaded.get) # chunk locally
----> 3 futures = client.scatter(dict(x.dask)) # scatter chunks
4 x.dask = x
/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/client.py in scatter(self, data, workers, broadcast, direct, hash, timeout, asynchronous)
2182 else:
2183 local_worker = None
-> 2184 return self.sync(
2185 self._scatter,
2186 data,
/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
866 return future
867 else:
--> 868 return sync(
869 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
870 )
/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
330 if error[0]:
331 typ, exc, tb = error[0]
--> 332 raise exc.with_traceback(tb)
333 else:
334 return result[0]
/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/utils.py in f()
313 if callback_timeout is not None:
314 future = asyncio.wait_for(future, callback_timeout)
--> 315 result[0] = yield future
316 except Exception:
317 error[0] = sys.exc_info()
/srv/conda/envs/notebook/lib/python3.9/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()
/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/client.py in _scatter(self, data, workers, broadcast, direct, local_worker, timeout, hash)
2004 isinstance(k, (bytes, str)) for k in data
2005 ):
-> 2006 d = await self._scatter(keymap(stringify, data), workers, broadcast)
2007 return {k: d[stringify(k)] for k in data}
2008
/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/client.py in _scatter(self, data, workers, broadcast, direct, local_worker, timeout, hash)
2073 )
2074 else:
-> 2075 await self.scheduler.scatter(
2076 data=data2,
2077 workers=workers,
/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/core.py in send_recv_from_rpc(**kwargs)
893 name, comm.name = comm.name, "ConnectionPool." + key
894 try:
--> 895 result = await send_recv(comm=comm, op=key, **kwargs)
896 finally:
897 self.pool.reuse(self.addr, comm)
/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/core.py in send_recv(comm, reply, serializers, deserializers, **kwargs)
686 if comm.deserialize:
687 typ, exc, tb = clean_exception(**response)
--> 688 raise exc.with_traceback(tb)
689 else:
690 raise Exception(response["exception_text"])
/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/core.py in handle_comm()
528 result = asyncio.ensure_future(result)
529 self._ongoing_coroutines.add(result)
--> 530 result = await result
531 except (CommClosedError, CancelledError):
532 if self.status in (Status.running, Status.paused):
/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/scheduler.py in scatter()
5795 assert isinstance(data, dict)
5796
-> 5797 keys, who_has, nbytes = await scatter_to_workers(
5798 nthreads, data, rpc=self.rpc, report=False
5799 )
/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/utils_comm.py in scatter_to_workers()
143 rpcs = {addr: rpc(addr) for addr in d}
144 try:
--> 145 out = await All(
146 [
147 rpcs[address].update_data(
/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/utils.py in All()
214 while not tasks.done():
215 try:
--> 216 result = await tasks.next()
217 except Exception:
218
/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/core.py in send_recv_from_rpc()
893 name, comm.name = comm.name, "ConnectionPool." + key
894 try:
--> 895 result = await send_recv(comm=comm, op=key, **kwargs)
896 finally:
897 self.pool.reuse(self.addr, comm)
/srv/conda/envs/notebook/lib/python3.9/site-packages/distributed/core.py in send_recv()
688 raise exc.with_traceback(tb)
689 else:
--> 690 raise Exception(response["exception_text"])
691 return response
692
Exception: ValueError("unknown address scheme 'gateway' (known schemes: ['inproc', 'tcp', 'tls', 'ucx', 'ws', 'wss'])")

dd.from_pandas() does this "partitioning-then-scattering" internally, so you don't have to do it manually anymore. You can directly use the Dask DataFrame API on x, and the compute should automatically work on your cluster. :)
The answer you've linked is from 5 years ago, which is now outdated because Dask has matured a lot since. For instance, x.dask now refers to a "high level graph" (recently added feature) instead of a low-level graph. Dask Gateway uses its own URL scheme, and I'm guessing it's not able to interface with this older Dask syntax properly.
Also, note that mixing schedulers (as done in that answer) isn't recommended anymore.

Failure to parallelize code trying to load the same numpy array with joblib

I am new to the world of parallelization, and encountered a very odd bug as I was trying to run a function trying to load the same npy file running on several cores.
My code is of the form:
import os
from pathlib import Path
from joblib import Parallel, delayed
import multiprocessing
num_cores = multiprocessing.cpu_count()
mydir = 'path/of/your/choice'
myfile = 'myArray.npy'
mydir=Path(mydir)
myfile=mydir/myfile
os.chdir(mydir)
myarray = np.zeros((12345))
np.save(myfile, myarray)
def foo(myfile, x):
# function loading a myArray and working with it
arr=np.load(myfile)
return arr+x
if __name__=='__main__':
foo_results = Parallel(n_jobs=num_cores, backend="threading")(\
delayed(foo)(myfile,i) for i in range(10))
In my case, this script would run fine about 40% of the way, then return
--> 17 arr=np.load(mydir/'myArray.npy')
ValueError: cannot reshape array of size 0 into shape (12345,)
What blows my mind is that if I enter %pdb debug mode and actually try to run arr=np.load(mydir/'myArray.npy'), this works! So I assume that the issue stems from all the parallel processes running foo trying to load the same numpy array at the same time (as in debug mode, all the processes are paused and only the code that I execute actually runs).
This very minimal example actually works, presumably because the function is very simple and joblib handles this gracefully, but my code would be too long and complicated to be posted here - first of all, has anyone encountered a similar issue in the past? If no one manages to identify my issue, I will post my whole script.
Thanks for your help!
-------------------- EDIT ------------------
Given that there doesn't seem to be an easy answer with the toy code that I posted, here are the full error logs. I played around with the backends following #psarka recommendation and for some reason, the following error arises with the default loky backend (again, no problem to run the code in a non-parallel manner):
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in ccg_stack(dp, U_src, U_trg, cbin, cwin, normalize, all_to_all, name, sav, again, periods)
541
542 ccg_results=Parallel(n_jobs=num_cores)(\
--> 543 delayed(ccg)(*ccg_inputs[i]) for i in tqdm(range(len(ccg_inputs)), desc=f'Computing ccgs over {num_cores} cores'))
544 for ((i1, u1, i2, u2), CCG) in zip(ccg_ids,ccg_results):
545 if i1==i2:
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~/miniconda3/envs/npyx/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
426 raise CancelledError()
427 elif self._state == FINISHED:
--> 428 return self.__get_result()
429
430 self._condition.wait(timeout)
~/miniconda3/envs/npyx/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
ValueError: Cannot load file containing pickled data when allow_pickle=False
but this arises with the threading backend, which is more informative (which was originally used in my question) - again, it is possible to actually run train = np.load(Path(dprm,fn)) in debug mode:
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in ccg_stack(dp, U_src, U_trg, cbin, cwin, normalize, all_to_all, name, sav, again, periods)
541
542 ccg_results=Parallel(n_jobs=num_cores, backend='threading')(\
--> 543 delayed(ccg)(*ccg_inputs[i]) for i in tqdm(range(len(ccg_inputs)), desc=f'Computing ccgs over {num_cores} cores'))
544 for ((i1, u1, i2, u2), CCG) in zip(ccg_ids,ccg_results):
545 if i1==i2:
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~/miniconda3/envs/npyx/lib/python3.7/multiprocessing/pool.py in get(self, timeout)
655 return self._value
656 else:
--> 657 raise self._value
658
659 def _set(self, i, obj):
~/miniconda3/envs/npyx/lib/python3.7/multiprocessing/pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
119 job, i, func, args, kwds = task
120 try:
--> 121 result = (True, func(*args, **kwds))
122 except Exception as e:
123 if wrap_exception and func is not _helper_reraises_exception:
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/_parallel_backends.py in __call__(self, *args, **kwargs)
593 def __call__(self, *args, **kwargs):
594 try:
--> 595 return self.func(*args, **kwargs)
596 except KeyboardInterrupt as e:
597 # We capture the KeyboardInterrupt and reraise it as
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in __call__(self)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in <listcomp>(.0)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in ccg(dp, U, bin_size, win_size, fs, normalize, ret, sav, verbose, periods, again, trains)
258 if verbose: print("File {} not found in routines memory.".format(fn))
259 crosscorrelograms = crosscorrelate_cyrille(dp, bin_size, win_size, sortedU, fs, True,
--> 260 periods=periods, verbose=verbose, trains=trains)
261 crosscorrelograms = np.asarray(crosscorrelograms, dtype='float64')
262 if crosscorrelograms.shape[0]<len(U): # no spikes were found in this period
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in crosscorrelate_cyrille(dp, bin_size, win_size, U, fs, symmetrize, periods, verbose, trains)
88 U=list(U)
89
---> 90 spike_times, spike_clusters = make_phy_like_spikeClustersTimes(dp, U, periods=periods, verbose=verbose, trains=trains)
91
92 return crosscorr_cyrille(spike_times, spike_clusters, win_size, bin_size, fs, symmetrize)
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in make_phy_like_spikeClustersTimes(dp, U, periods, verbose, trains)
46 for iu, u in enumerate(U):
47 # Even lists of strings can be dealt with as integers by being replaced by their indices
---> 48 trains_dic[iu]=trn(dp, u, sav=True, periods=periods, verbose=verbose) # trains in samples
49 else:
50 assert len(trains)>1
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/spk_t.py in trn(dp, unit, sav, verbose, periods, again, enforced_rp)
106 if op.exists(Path(dprm,fn)) and not again:
107 if verbose: print("File {} found in routines memory.".format(fn))
--> 108 train = np.load(Path(dprm,fn))
109
110 # if not, compute it
~/miniconda3/envs/npyx/lib/python3.7/site-packages/numpy-1.21.0rc2-py3.7-linux-x86_64.egg/numpy/lib/npyio.py in load(file, mmap_mode, allow_pickle, fix_imports, encoding)
443 # Try a pickle
444 if not allow_pickle:
--> 445 raise ValueError("Cannot load file containing pickled data "
446 "when allow_pickle=False")
447 try:
ValueError: Cannot load file containing pickled data when allow_pickle=False
The original error ValueError: cannot reshape array of size 0 into shape (12345,) doesn't show up anymore for some reason.

Got 504 Deadline Exceeded in Jupiter Notebook (Python) with Big query

I am trying to get a the result of a Google Bigquery query in a pandas dataframe (in Jupiter notebook).
But everytime I try to run the query I get a DeadlineExceeded: 504 Deadline Exceeded.
This happens not only for queries in my own BQ project but also for other projects.
I have tried a lot of option to run the query like in here: https://cloud.google.com/bigquery/docs/bigquery-storage-python-pandas
Anyone have a idea how to fix this?
Query:
%load_ext google.cloud.bigquery
%%bigquery tax_forms --use_bqstorage_api
SELECT * FROM `bigquery-public-data.irs_990.irs_990_2012`
---------------------------------------------------------------------------
_MultiThreadedRendezvous Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\grpc_helpers.py in error_remapped_callable(*args, **kwargs)
149 prefetch_first = getattr(callable_, "_prefetch_first_result_", True)
--> 150 return _StreamingResponseIterator(result, prefetch_first_result=prefetch_first)
151 except grpc.RpcError as exc:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\grpc_helpers.py in __init__(self, wrapped, prefetch_first_result)
72 if prefetch_first_result:
---> 73 self._stored_first_result = six.next(self._wrapped)
74 except TypeError:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\grpc\_channel.py in __next__(self)
415 def __next__(self):
--> 416 return self._next()
417
~\AppData\Local\Continuum\anaconda3\lib\site-packages\grpc\_channel.py in _next(self)
705 elif self._state.code is not None:
--> 706 raise self
707
_MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
status = StatusCode.DEADLINE_EXCEEDED
details = "Deadline Exceeded"
debug_error_string = "{"created":"#1597838569.388000000","description":"Error received from peer ipv4:172.217.168.202:443","file":"src/core/lib/surface/call.cc","file_line":1062,"grpc_message":"Deadline Exceeded","grpc_status":4}"
>
The above exception was the direct cause of the following exception:
DeadlineExceeded Traceback (most recent call last)
<ipython-input-2-4fdaec7219df> in <module>
----> 1 get_ipython().run_cell_magic('bigquery', 'tax_forms --use_bqstorage_api', 'SELECT * FROM `bigquery-public-data.irs_990.irs_990_2012`\n')
~\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2357 with self.builtin_trap:
2358 args = (magic_arg_s, cell)
-> 2359 result = fn(*args, **kwargs)
2360 return result
2361
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\magics.py in _cell_magic(line, query)
589 )
590 else:
--> 591 result = query_job.to_dataframe(bqstorage_client=bqstorage_client)
592
593 if args.destination_var:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\job.py in to_dataframe(self, bqstorage_client, dtypes, progress_bar_type, create_bqstorage_client, date_as_object)
3381 progress_bar_type=progress_bar_type,
3382 create_bqstorage_client=create_bqstorage_client,
-> 3383 date_as_object=date_as_object,
3384 )
3385
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\table.py in to_dataframe(self, bqstorage_client, dtypes, progress_bar_type, create_bqstorage_client, date_as_object)
1726 progress_bar_type=progress_bar_type,
1727 bqstorage_client=bqstorage_client,
-> 1728 create_bqstorage_client=create_bqstorage_client,
1729 )
1730
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\table.py in to_arrow(self, progress_bar_type, bqstorage_client, create_bqstorage_client)
1544 record_batches = []
1545 for record_batch in self._to_arrow_iterable(
-> 1546 bqstorage_client=bqstorage_client
1547 ):
1548 record_batches.append(record_batch)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\table.py in _to_page_iterable(self, bqstorage_download, tabledata_list_download, bqstorage_client)
1433 ):
1434 if bqstorage_client is not None:
-> 1435 for item in bqstorage_download():
1436 yield item
1437 return
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\_pandas_helpers.py in _download_table_bqstorage(project_id, table, bqstorage_client, preserve_order, selected_fields, page_to_item)
723 # Call result() on any finished threads to raise any
724 # exceptions encountered.
--> 725 future.result()
726
727 try:
~\AppData\Local\Continuum\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
426 raise CancelledError()
427 elif self._state == FINISHED:
--> 428 return self.__get_result()
429
430 self._condition.wait(timeout)
~\AppData\Local\Continuum\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
~\AppData\Local\Continuum\anaconda3\lib\concurrent\futures\thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\_pandas_helpers.py in _download_table_bqstorage_stream(download_state, bqstorage_client, session, stream, worker_queue, page_to_item)
591 rowstream = bqstorage_client.read_rows(position).rows(session)
592 else:
--> 593 rowstream = bqstorage_client.read_rows(stream.name).rows(session)
594
595 for page in rowstream.pages:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery_storage_v1\client.py in read_rows(self, name, offset, retry, timeout, metadata)
120 retry=retry,
121 timeout=timeout,
--> 122 metadata=metadata,
123 )
124 return reader.ReadRowsStream(
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery_storage_v1\gapic\big_query_read_client.py in read_rows(self, read_stream, offset, retry, timeout, metadata)
370
371 return self._inner_api_calls["read_rows"](
--> 372 request, retry=retry, timeout=timeout, metadata=metadata
373 )
374
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\gapic_v1\method.py in __call__(self, *args, **kwargs)
143 kwargs["metadata"] = metadata
144
--> 145 return wrapped_func(*args, **kwargs)
146
147
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\retry.py in retry_wrapped_func(*args, **kwargs)
284 sleep_generator,
285 self._deadline,
--> 286 on_error=on_error,
287 )
288
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\retry.py in retry_target(target, predicate, sleep_generator, deadline, on_error)
182 for sleep in sleep_generator:
183 try:
--> 184 return target()
185
186 # pylint: disable=broad-except
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\timeout.py in func_with_timeout(*args, **kwargs)
212 """Wrapped function that adds timeout."""
213 kwargs["timeout"] = next(timeouts)
--> 214 return func(*args, **kwargs)
215
216 return func_with_timeout
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\grpc_helpers.py in error_remapped_callable(*args, **kwargs)
150 return _StreamingResponseIterator(result, prefetch_first_result=prefetch_first)
151 except grpc.RpcError as exc:
--> 152 six.raise_from(exceptions.from_grpc_error(exc), exc)
153
154 return error_remapped_callable
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in raise_from(value, from_value)
DeadlineExceeded: 504 Deadline Exceeded
Let me know if you need to know more. Thanks in advance.
Rutger

It turned out to be a conflict between a Conda package and a pip packages.
I resolved it by reinstall all the packages.

KilledWorker exception using Dask on 1 machine and a large dataset?

so my problem is that I have tried to read a large file (almost 12GB, on my 16gb RAM PC), but every time I try to do some operations on the Dask dataframe, something goes wrong and an exception such as KilledWorker exception happens...
I am not using any clusters, and I have tried to use pandas, but the RAM would go up to 100%, so I don't think I have a choice other than Dask at the moment.
Please check out a snippet of my code:
from dask.distributed import Client, progress
client = Client()
client
ddf = dd.read_csv('C:\\Users\\user\\Desktop\\bigfile1.csv', encoding="latin-1", dtype="str")
mylist_a =['a', 'b', 'c', 'd', 'e','f','g','h','i']
daskfile1 = ddf.loc[:, ~ddf.columns.isin(mylist_a)].compute()
Here are the errors:
distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting
distributed.nanny - WARNING - Worker process 10292 was killed by signal 15
distributed.scheduler - ERROR - Workers don't have promised key: ['tcp://127.0.0.1:6585'], ('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 76)
distributed.client - WARNING - Couldn't gather 200 keys, rescheduling {"('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 76)": ('tcp://127.0.0.1:6585',), "('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 161)": ('tcp://127.0.0.1:6628',), "('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 51)": ('tcp://127.0.0.1:6568',), "('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 156)": ('tcp://127.0.0.1:6568',), "('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 58)": ('tcp://127.0.0.1:6585',), "('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 176)": ('tcp://127.0.0.1:6628',),
distributed.scheduler - ERROR - Workers don't have promised key: ['tcp://127.0.0.1:6715'], ('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 66)
And here is the Exception:
KilledWorker Traceback (most recent call last)
<ipython-input-7-38194b0211e6> in <module>
3 # df = ddf.compute()
4
----> 5 daskfile1 = ddf.loc[:, ~ddf.columns.isin(mylist_a)].compute()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
396 keys = [x.__dask_keys__() for x in collections]
397 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 398 results = schedule(dsk, keys, **kwargs)
399 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
400
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2330 try:
2331 results = self.gather(packed, asynchronous=asynchronous,
-> 2332 direct=direct)
2333 finally:
2334 for f in futures.values():
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1654 return self.sync(self._gather, futures, errors=errors,
1655 direct=direct, local_worker=local_worker,
-> 1656 asynchronous=asynchronous)
1657
1658 #gen.coroutine
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in sync(self, func, *args, **kwargs)
674 return future
675 else:
--> 676 return sync(self.loop, func, *args, **kwargs)
677
678 def __repr__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in sync(loop, func, *args, **kwargs)
275 e.wait(10)
276 if error[0]:
--> 277 six.reraise(*error[0])
278 else:
279 return result[0]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in f()
260 if timeout is not None:
261 future = gen.with_timeout(timedelta(seconds=timeout), future)
--> 262 result[0] = yield future
263 except Exception as exc:
264 error[0] = sys.exc_info()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
727
728 try:
--> 729 value = future.result()
730 except Exception:
731 exc_info = sys.exc_info()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
734 if exc_info is not None:
735 try:
--> 736 yielded = self.gen.throw(*exc_info) # type: ignore
737 finally:
738 # Break up a reference to itself
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in _gather(self, futures, errors, direct, local_worker)
1495 six.reraise(type(exception),
1496 exception,
-> 1497 traceback)
1498 if errors == 'skip':
1499 bad_keys.add(key)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
KilledWorker: ("('from-delayed-pandas_read_text-read-block-try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 36)", 'tcp://127.0.0.1:6717')
I just want to do this relatively simple operation on a large data size
(please note that I only showed a snippet of the errors, since there are a lot)
Any type of help would be appreciated

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Operations on a Dask DataFrame fail when using snappy compression - python

Related

FSSpec Error Handling in Python - Timeout Error

Scattering data to dask cluster workers: unknown address scheme 'gateway'

Failure to parallelize code trying to load the same numpy array with joblib

Got 504 Deadline Exceeded in Jupiter Notebook (Python) with Big query

KilledWorker exception using Dask on 1 machine and a large dataset?

Categories

Resources