I am trying to get a the result of a Google Bigquery query in a pandas dataframe (in Jupiter notebook).
But everytime I try to run the query I get a DeadlineExceeded: 504 Deadline Exceeded.
This happens not only for queries in my own BQ project but also for other projects.
I have tried a lot of option to run the query like in here: https://cloud.google.com/bigquery/docs/bigquery-storage-python-pandas
Anyone have a idea how to fix this?
Query:
%load_ext google.cloud.bigquery
%%bigquery tax_forms --use_bqstorage_api
SELECT * FROM `bigquery-public-data.irs_990.irs_990_2012`
---------------------------------------------------------------------------
_MultiThreadedRendezvous Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\grpc_helpers.py in error_remapped_callable(*args, **kwargs)
149 prefetch_first = getattr(callable_, "_prefetch_first_result_", True)
--> 150 return _StreamingResponseIterator(result, prefetch_first_result=prefetch_first)
151 except grpc.RpcError as exc:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\grpc_helpers.py in __init__(self, wrapped, prefetch_first_result)
72 if prefetch_first_result:
---> 73 self._stored_first_result = six.next(self._wrapped)
74 except TypeError:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\grpc\_channel.py in __next__(self)
415 def __next__(self):
--> 416 return self._next()
417
~\AppData\Local\Continuum\anaconda3\lib\site-packages\grpc\_channel.py in _next(self)
705 elif self._state.code is not None:
--> 706 raise self
707
_MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
status = StatusCode.DEADLINE_EXCEEDED
details = "Deadline Exceeded"
debug_error_string = "{"created":"#1597838569.388000000","description":"Error received from peer ipv4:172.217.168.202:443","file":"src/core/lib/surface/call.cc","file_line":1062,"grpc_message":"Deadline Exceeded","grpc_status":4}"
>
The above exception was the direct cause of the following exception:
DeadlineExceeded Traceback (most recent call last)
<ipython-input-2-4fdaec7219df> in <module>
----> 1 get_ipython().run_cell_magic('bigquery', 'tax_forms --use_bqstorage_api', 'SELECT * FROM `bigquery-public-data.irs_990.irs_990_2012`\n')
~\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2357 with self.builtin_trap:
2358 args = (magic_arg_s, cell)
-> 2359 result = fn(*args, **kwargs)
2360 return result
2361
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\magics.py in _cell_magic(line, query)
589 )
590 else:
--> 591 result = query_job.to_dataframe(bqstorage_client=bqstorage_client)
592
593 if args.destination_var:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\job.py in to_dataframe(self, bqstorage_client, dtypes, progress_bar_type, create_bqstorage_client, date_as_object)
3381 progress_bar_type=progress_bar_type,
3382 create_bqstorage_client=create_bqstorage_client,
-> 3383 date_as_object=date_as_object,
3384 )
3385
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\table.py in to_dataframe(self, bqstorage_client, dtypes, progress_bar_type, create_bqstorage_client, date_as_object)
1726 progress_bar_type=progress_bar_type,
1727 bqstorage_client=bqstorage_client,
-> 1728 create_bqstorage_client=create_bqstorage_client,
1729 )
1730
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\table.py in to_arrow(self, progress_bar_type, bqstorage_client, create_bqstorage_client)
1544 record_batches = []
1545 for record_batch in self._to_arrow_iterable(
-> 1546 bqstorage_client=bqstorage_client
1547 ):
1548 record_batches.append(record_batch)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\table.py in _to_page_iterable(self, bqstorage_download, tabledata_list_download, bqstorage_client)
1433 ):
1434 if bqstorage_client is not None:
-> 1435 for item in bqstorage_download():
1436 yield item
1437 return
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\_pandas_helpers.py in _download_table_bqstorage(project_id, table, bqstorage_client, preserve_order, selected_fields, page_to_item)
723 # Call result() on any finished threads to raise any
724 # exceptions encountered.
--> 725 future.result()
726
727 try:
~\AppData\Local\Continuum\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
426 raise CancelledError()
427 elif self._state == FINISHED:
--> 428 return self.__get_result()
429
430 self._condition.wait(timeout)
~\AppData\Local\Continuum\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
~\AppData\Local\Continuum\anaconda3\lib\concurrent\futures\thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery\_pandas_helpers.py in _download_table_bqstorage_stream(download_state, bqstorage_client, session, stream, worker_queue, page_to_item)
591 rowstream = bqstorage_client.read_rows(position).rows(session)
592 else:
--> 593 rowstream = bqstorage_client.read_rows(stream.name).rows(session)
594
595 for page in rowstream.pages:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery_storage_v1\client.py in read_rows(self, name, offset, retry, timeout, metadata)
120 retry=retry,
121 timeout=timeout,
--> 122 metadata=metadata,
123 )
124 return reader.ReadRowsStream(
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\cloud\bigquery_storage_v1\gapic\big_query_read_client.py in read_rows(self, read_stream, offset, retry, timeout, metadata)
370
371 return self._inner_api_calls["read_rows"](
--> 372 request, retry=retry, timeout=timeout, metadata=metadata
373 )
374
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\gapic_v1\method.py in __call__(self, *args, **kwargs)
143 kwargs["metadata"] = metadata
144
--> 145 return wrapped_func(*args, **kwargs)
146
147
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\retry.py in retry_wrapped_func(*args, **kwargs)
284 sleep_generator,
285 self._deadline,
--> 286 on_error=on_error,
287 )
288
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\retry.py in retry_target(target, predicate, sleep_generator, deadline, on_error)
182 for sleep in sleep_generator:
183 try:
--> 184 return target()
185
186 # pylint: disable=broad-except
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\timeout.py in func_with_timeout(*args, **kwargs)
212 """Wrapped function that adds timeout."""
213 kwargs["timeout"] = next(timeouts)
--> 214 return func(*args, **kwargs)
215
216 return func_with_timeout
~\AppData\Local\Continuum\anaconda3\lib\site-packages\google\api_core\grpc_helpers.py in error_remapped_callable(*args, **kwargs)
150 return _StreamingResponseIterator(result, prefetch_first_result=prefetch_first)
151 except grpc.RpcError as exc:
--> 152 six.raise_from(exceptions.from_grpc_error(exc), exc)
153
154 return error_remapped_callable
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in raise_from(value, from_value)
DeadlineExceeded: 504 Deadline Exceeded
Let me know if you need to know more. Thanks in advance.
Rutger
It turned out to be a conflict between a Conda package and a pip packages.
I resolved it by reinstall all the packages.
Related
I am trying to get Terraclimate Data from Microsoft Planetary and facing time out error. Is there a possiblity of increasing the timeout time ? Please find the code below and the error I am facing. I am using fsspec and xarray for downloading spatial data from MS Planetary portal.
import fsspec
import xarray as xr
store = fsspec.get_mapper(asset.href)
data = xr.open_zarr(store, **asset.extra_fields["xarray:open_kwargs"])
clipped_data = data.sel(time=slice('2015-01-01','2019-12-31'),lon=slice(min_lon,max_lon),lat=slice(max_lat,min_lat))
parsed_data = clipped_data[['tmax', 'tmin', 'ppt', 'soil']]
lat_list = parsed_data['lat'].values.tolist()
lon_list = parsed_data['lon'].values.tolist()
filename = "Soil_Moisture_sample.csv"
for(i,j) in zip(lat_list,lon_list):
parsed_data[["soil","tmax","tmin","ppt"]].sel(lon=i, lat=j, method="nearest").to_dataframe().to_csv(filename,mode='a',index=False, header=False)
I am getting the following error
TimeoutError Traceback (most recent call last)
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:53, in _runner(event, coro, result, timeout)
52 try:
---> 53 result[0] = await coro
54 except Exception as ex:
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:423, in AsyncFileSystem._cat(self, path, recursive, on_error, batch_size, **kwargs)
422 if ex:
--> 423 raise ex
424 if (
425 len(paths) > 1
426 or isinstance(path, list)
427 or paths[0] != self._strip_protocol(path)
428 ):
File ~\Anaconda3\envs\satellite\lib\asyncio\tasks.py:455, in wait_for(fut, timeout, loop)
454 if timeout is None:
--> 455 return await fut
457 if timeout <= 0:
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\implementations\http.py:221, in HTTPFileSystem._cat_file(self, url, start, end, **kwargs)
220 async with session.get(url, **kw) as r:
--> 221 out = await r.read()
222 self._raise_not_found_for_status(r, url)
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\client_reqrep.py:1036, in ClientResponse.read(self)
1035 try:
-> 1036 self._body = await self.content.read()
1037 for trace in self._traces:
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\streams.py:375, in StreamReader.read(self, n)
374 while True:
--> 375 block = await self.readany()
376 if not block:
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\streams.py:397, in StreamReader.readany(self)
396 while not self._buffer and not self._eof:
--> 397 await self._wait("readany")
399 return self._read_nowait(-1)
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\streams.py:304, in StreamReader._wait(self, func_name)
303 with self._timer:
--> 304 await waiter
305 else:
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\helpers.py:721, in TimerContext.__exit__(self, exc_type, exc_val, exc_tb)
720 if exc_type is asyncio.CancelledError and self._cancelled:
--> 721 raise asyncio.TimeoutError from None
722 return None
TimeoutError:
The above exception was the direct cause of the following exception:
FSTimeoutError Traceback (most recent call last)
Input In [62], in <cell line: 3>()
1 # Flood Region Point - Thiruvanthpuram
2 filename = "Soil_Moisture_sample.csv"
----> 3 parsed_data[["soil","tmax","tmin","ppt"]].sel(lon=8.520833, lat=76.4375, method="nearest").to_dataframe().to_csv(filename,mode='a',index=False, header=False)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\dataset.py:5898, in Dataset.to_dataframe(self, dim_order)
5870 """Convert this dataset into a pandas.DataFrame.
5871
5872 Non-index variables in this dataset form the columns of the
(...)
5893
5894 """
5896 ordered_dims = self._normalize_dim_order(dim_order=dim_order)
-> 5898 return self._to_dataframe(ordered_dims=ordered_dims)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\dataset.py:5862, in Dataset._to_dataframe(self, ordered_dims)
5860 def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
5861 columns = [k for k in self.variables if k not in self.dims]
-> 5862 data = [
5863 self._variables[k].set_dims(ordered_dims).values.reshape(-1)
5864 for k in columns
5865 ]
5866 index = self.coords.to_index([*ordered_dims])
5867 return pd.DataFrame(dict(zip(columns, data)), index=index)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\dataset.py:5863, in <listcomp>(.0)
5860 def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
5861 columns = [k for k in self.variables if k not in self.dims]
5862 data = [
-> 5863 self._variables[k].set_dims(ordered_dims).values.reshape(-1)
5864 for k in columns
5865 ]
5866 index = self.coords.to_index([*ordered_dims])
5867 return pd.DataFrame(dict(zip(columns, data)), index=index)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\variable.py:527, in Variable.values(self)
524 #property
525 def values(self):
526 """The variable's data as a numpy.ndarray"""
--> 527 return _as_array_or_item(self._data)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\variable.py:267, in _as_array_or_item(data)
253 def _as_array_or_item(data):
254 """Return the given values as a numpy array, or as an individual item if
255 it's a 0d datetime64 or timedelta64 array.
256
(...)
265 TODO: remove this (replace with np.asarray) once these issues are fixed
266 """
--> 267 data = np.asarray(data)
268 if data.ndim == 0:
269 if data.dtype.kind == "M":
File ~\AppData\Roaming\Python\Python38\site-packages\dask\array\core.py:1696, in Array.__array__(self, dtype, **kwargs)
1695 def __array__(self, dtype=None, **kwargs):
-> 1696 x = self.compute()
1697 if dtype and x.dtype != dtype:
1698 x = x.astype(dtype)
File ~\AppData\Roaming\Python\Python38\site-packages\dask\base.py:315, in DaskMethodsMixin.compute(self, **kwargs)
291 def compute(self, **kwargs):
292 """Compute this dask collection
293
294 This turns a lazy Dask collection into its in-memory equivalent.
(...)
313 dask.base.compute
314 """
--> 315 (result,) = compute(self, traverse=False, **kwargs)
316 return result
File ~\AppData\Roaming\Python\Python38\site-packages\dask\base.py:600, in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
597 keys.append(x.__dask_keys__())
598 postcomputes.append(x.__dask_postcompute__())
--> 600 results = schedule(dsk, keys, **kwargs)
601 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
File ~\AppData\Roaming\Python\Python38\site-packages\dask\threaded.py:89, in get(dsk, keys, cache, num_workers, pool, **kwargs)
86 elif isinstance(pool, multiprocessing.pool.Pool):
87 pool = MultiprocessingPoolExecutor(pool)
---> 89 results = get_async(
90 pool.submit,
91 pool._max_workers,
92 dsk,
93 keys,
94 cache=cache,
95 get_id=_thread_get_id,
96 pack_exception=pack_exception,
97 **kwargs,
98 )
100 # Cleanup pools associated to dead threads
101 with pools_lock:
File ~\AppData\Roaming\Python\Python38\site-packages\dask\local.py:511, in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
509 _execute_task(task, data) # Re-execute locally
510 else:
--> 511 raise_exception(exc, tb)
512 res, worker_id = loads(res_info)
513 state["cache"][key] = res
File ~\AppData\Roaming\Python\Python38\site-packages\dask\local.py:319, in reraise(exc, tb)
317 if exc.__traceback__ is not tb:
318 raise exc.with_traceback(tb)
--> 319 raise exc
File ~\AppData\Roaming\Python\Python38\site-packages\dask\local.py:224, in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
222 try:
223 task, data = loads(task_info)
--> 224 result = _execute_task(task, data)
225 id = get_id()
226 result = dumps((result, id))
File ~\AppData\Roaming\Python\Python38\site-packages\dask\core.py:119, in _execute_task(arg, cache, dsk)
115 func, args = arg[0], arg[1:]
116 # Note: Don't assign the subtask results to a variable. numpy detects
117 # temporaries by their reference count and can execute certain
118 # operations in-place.
--> 119 return func(*(_execute_task(a, cache) for a in args))
120 elif not ishashable(arg):
121 return arg
File ~\AppData\Roaming\Python\Python38\site-packages\dask\array\core.py:128, in getter(a, b, asarray, lock)
123 # Below we special-case `np.matrix` to force a conversion to
124 # `np.ndarray` and preserve original Dask behavior for `getter`,
125 # as for all purposes `np.matrix` is array-like and thus
126 # `is_arraylike` evaluates to `True` in that case.
127 if asarray and (not is_arraylike(c) or isinstance(c, np.matrix)):
--> 128 c = np.asarray(c)
129 finally:
130 if lock:
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\indexing.py:459, in ImplicitToExplicitIndexingAdapter.__array__(self, dtype)
458 def __array__(self, dtype=None):
--> 459 return np.asarray(self.array, dtype=dtype)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\indexing.py:623, in CopyOnWriteArray.__array__(self, dtype)
622 def __array__(self, dtype=None):
--> 623 return np.asarray(self.array, dtype=dtype)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\indexing.py:524, in LazilyIndexedArray.__array__(self, dtype)
522 def __array__(self, dtype=None):
523 array = as_indexable(self.array)
--> 524 return np.asarray(array[self.key], dtype=None)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\backends\zarr.py:76, in ZarrArrayWrapper.__getitem__(self, key)
74 array = self.get_array()
75 if isinstance(key, indexing.BasicIndexer):
---> 76 return array[key.tuple]
77 elif isinstance(key, indexing.VectorizedIndexer):
78 return array.vindex[
79 indexing._arrayize_vectorized_indexer(key, self.shape).tuple
80 ]
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:788, in Array.__getitem__(self, selection)
786 result = self.vindex[selection]
787 else:
--> 788 result = self.get_basic_selection(pure_selection, fields=fields)
789 return result
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:914, in Array.get_basic_selection(self, selection, out, fields)
911 return self._get_basic_selection_zd(selection=selection, out=out,
912 fields=fields)
913 else:
--> 914 return self._get_basic_selection_nd(selection=selection, out=out,
915 fields=fields)
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:957, in Array._get_basic_selection_nd(self, selection, out, fields)
951 def _get_basic_selection_nd(self, selection, out=None, fields=None):
952 # implementation of basic selection for array with at least one dimension
953
954 # setup indexer
955 indexer = BasicIndexer(selection, self)
--> 957 return self._get_selection(indexer=indexer, out=out, fields=fields)
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:1247, in Array._get_selection(self, indexer, out, fields)
1241 if not hasattr(self.chunk_store, "getitems") or \
1242 any(map(lambda x: x == 0, self.shape)):
1243 # sequentially get one key at a time from storage
1244 for chunk_coords, chunk_selection, out_selection in indexer:
1245
1246 # load chunk selection into output array
-> 1247 self._chunk_getitem(chunk_coords, chunk_selection, out, out_selection,
1248 drop_axes=indexer.drop_axes, fields=fields)
1249 else:
1250 # allow storage to get multiple items at once
1251 lchunk_coords, lchunk_selection, lout_selection = zip(*indexer)
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:1939, in Array._chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop_axes, fields)
1935 ckey = self._chunk_key(chunk_coords)
1937 try:
1938 # obtain compressed data for chunk
-> 1939 cdata = self.chunk_store[ckey]
1941 except KeyError:
1942 # chunk not initialized
1943 if self._fill_value is not None:
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\storage.py:717, in KVStore.__getitem__(self, key)
716 def __getitem__(self, key):
--> 717 return self._mutable_mapping[key]
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\mapping.py:137, in FSMap.__getitem__(self, key, default)
135 k = self._key_to_str(key)
136 try:
--> 137 result = self.fs.cat(k)
138 except self.missing_exceptions:
139 if default is not None:
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:111, in sync_wrapper.<locals>.wrapper(*args, **kwargs)
108 #functools.wraps(func)
109 def wrapper(*args, **kwargs):
110 self = obj or args[0]
--> 111 return sync(self.loop, func, *args, **kwargs)
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:94, in sync(loop, func, timeout, *args, **kwargs)
91 return_result = result[0]
92 if isinstance(return_result, asyncio.TimeoutError):
93 # suppress asyncio.TimeoutError, raise FSTimeoutError
---> 94 raise FSTimeoutError from return_result
95 elif isinstance(return_result, BaseException):
96 raise return_result
FSTimeoutError:
In the line:
store = fsspec.get_mapper(asset.href)
You can pass extra arguments to the fsspec backend, in this case HTTP, see fsspec.implementations.http.HTTPFileSystem. In this case, client_kwargs get passed to aiohttp.ClientSession, and include an optional timeout argument. Your call may look something like
from aiohttp import ClientTimeout
store = get_mapper(asset.href, client_kwargs={"timeout": ClientTimeout(total=5000, connect=1000)})
I partitioned a large dataset into a sequence of parquet files using pandas.DataFrame.to_parquet and saved them to S3. I then read these into Dask on a cluster using dask.read_parquet:
import dask.dataframe as dd
df = dd.read_parquet(
's3://aleksey-emr-dask/data/2019-taxi-dataset/',
storage_options={'key': 'secret', 'secret': 'secret'},
engine='fastparquet'
)
pandas uses snappy compression by default. fastparquet is able to work with this compression so long as you install the python-snappy and snappy packages. Since I am running on AWS EMR, and using Dask's EMR example bootstrap script, I have installed these packages from conda-forge using the --botstrap-actions flag and the --conda-packages optional argument:
python3 -m pip list | grep snappy
python-snappy 0.5.4
This is enough to make dd.read_parquet succeed. However, certain operations fail with KeyError: snappy. For example, this fails:
passenger_counts = df.trip_distance.value_counts().compute()
I know this is not an error with the cluster configuration because other operations, like this one, succeed:
vendors = df.VendorID.value_counts().compute()
> 2.0 53516733
> 1.0 30368157
> 4.0 267080
> Name: VendorID, dtype: int64
Which leads to my question. Does Dask not support snappy compression, even if its IO engine (fastparquet in this case) does?
Here is the full body of the error message:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<timed exec> in <module>
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/dask/base.py in compute(self, **kwargs)
165 dask.base.compute
166 """
--> 167 (result,) = compute(self, traverse=False, **kwargs)
168 return result
169
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/dask/base.py in compute(*args, **kwargs)
445 postcomputes.append(x.__dask_postcompute__())
446
--> 447 results = schedule(dsk, keys, **kwargs)
448 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
449
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2686 should_rejoin = False
2687 try:
-> 2688 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
2689 finally:
2690 for f in futures.values():
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
1986 direct=direct,
1987 local_worker=local_worker,
-> 1988 asynchronous=asynchronous,
1989 )
1990
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
831 else:
832 return sync(
--> 833 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
834 )
835
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
337 if error[0]:
338 typ, exc, tb = error[0]
--> 339 raise exc.with_traceback(tb)
340 else:
341 return result[0]
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/utils.py in f()
321 if callback_timeout is not None:
322 future = asyncio.wait_for(future, callback_timeout)
--> 323 result[0] = yield future
324 except Exception as exc:
325 error[0] = sys.exc_info()
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/tornado/gen.py in run(self)
733
734 try:
--> 735 value = future.result()
736 except Exception:
737 exc_info = sys.exc_info()
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1874 else:
1875 self._gather_future = future
-> 1876 response = await future
1877
1878 if response["status"] == "error":
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/client.py in _gather_remote(self, direct, local_worker)
1925
1926 else: # ask scheduler to gather data for us
-> 1927 response = await retry_operation(self.scheduler.gather, keys=keys)
1928
1929 return response
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/utils_comm.py in retry_operation(coro, operation, *args, **kwargs)
388 delay_min=retry_delay_min,
389 delay_max=retry_delay_max,
--> 390 operation=operation,
391 )
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/utils_comm.py in retry(coro, count, delay_min, delay_max, jitter_fraction, retry_on_exceptions, operation)
368 delay *= 1 + random.random() * jitter_fraction
369 await asyncio.sleep(delay)
--> 370 return await coro()
371
372
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/core.py in send_recv_from_rpc(**kwargs)
859 name, comm.name = comm.name, "ConnectionPool." + key
860 try:
--> 861 result = await send_recv(comm=comm, op=key, **kwargs)
862 finally:
863 self.pool.reuse(self.addr, comm)
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/core.py in send_recv(comm, reply, serializers, deserializers, **kwargs)
642 await comm.write(msg, serializers=serializers, on_error="raise")
643 if reply:
--> 644 response = await comm.read(deserializers=deserializers)
645 else:
646 response = None
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/comm/tcp.py in read(self, deserializers)
204 deserialize=self.deserialize,
205 deserializers=deserializers,
--> 206 allow_offload=self.allow_offload,
207 )
208 except EOFError:
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/comm/utils.py in from_frames(frames, deserialize, deserializers, allow_offload)
85 res = await offload(_from_frames)
86 else:
---> 87 res = _from_frames()
88
89 return res
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/comm/utils.py in _from_frames()
64 try:
65 return protocol.loads(
---> 66 frames, deserialize=deserialize, deserializers=deserializers
67 )
68 except EOFError:
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/protocol/core.py in loads(frames, deserialize, deserializers)
126 if deserialize or key in bytestrings:
127 if "compression" in head:
--> 128 fs = decompress(head, fs)
129 fs = merge_frames(head, fs)
130 value = _deserialize(head, fs, deserializers=deserializers)
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/protocol/compression.py in decompress(header, frames)
214 return [
215 compressions[c]["decompress"](frame)
--> 216 for c, frame in zip(header["compression"], frames)
217 ]
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/distributed/protocol/compression.py in <listcomp>(.0)
214 return [
215 compressions[c]["decompress"](frame)
--> 216 for c, frame in zip(header["compression"], frames)
217 ]
KeyError: 'snappy'
You need to have snappy and python-snappy installed in the client environment as well, so that the worker can use the codec to turn source bytes into data.
I'm accessing the cluster from a local Jupyter notebook on my machine via SSH port forwarding, and did not have these packages installed locally. Installing them in my local env:
$ conda install -c conda-forge snappy python-snappy
Resolved the issue.
so my problem is that I have tried to read a large file (almost 12GB, on my 16gb RAM PC), but every time I try to do some operations on the Dask dataframe, something goes wrong and an exception such as KilledWorker exception happens...
I am not using any clusters, and I have tried to use pandas, but the RAM would go up to 100%, so I don't think I have a choice other than Dask at the moment.
Please check out a snippet of my code:
from dask.distributed import Client, progress
client = Client()
client
ddf = dd.read_csv('C:\\Users\\user\\Desktop\\bigfile1.csv', encoding="latin-1", dtype="str")
mylist_a =['a', 'b', 'c', 'd', 'e','f','g','h','i']
daskfile1 = ddf.loc[:, ~ddf.columns.isin(mylist_a)].compute()
Here are the errors:
distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting
distributed.nanny - WARNING - Worker process 10292 was killed by signal 15
distributed.scheduler - ERROR - Workers don't have promised key: ['tcp://127.0.0.1:6585'], ('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 76)
distributed.client - WARNING - Couldn't gather 200 keys, rescheduling {"('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 76)": ('tcp://127.0.0.1:6585',), "('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 161)": ('tcp://127.0.0.1:6628',), "('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 51)": ('tcp://127.0.0.1:6568',), "('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 156)": ('tcp://127.0.0.1:6568',), "('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 58)": ('tcp://127.0.0.1:6585',), "('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 176)": ('tcp://127.0.0.1:6628',),
distributed.scheduler - ERROR - Workers don't have promised key: ['tcp://127.0.0.1:6715'], ('try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 66)
And here is the Exception:
KilledWorker Traceback (most recent call last)
<ipython-input-7-38194b0211e6> in <module>
3 # df = ddf.compute()
4
----> 5 daskfile1 = ddf.loc[:, ~ddf.columns.isin(mylist_a)].compute()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
396 keys = [x.__dask_keys__() for x in collections]
397 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 398 results = schedule(dsk, keys, **kwargs)
399 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
400
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2330 try:
2331 results = self.gather(packed, asynchronous=asynchronous,
-> 2332 direct=direct)
2333 finally:
2334 for f in futures.values():
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1654 return self.sync(self._gather, futures, errors=errors,
1655 direct=direct, local_worker=local_worker,
-> 1656 asynchronous=asynchronous)
1657
1658 #gen.coroutine
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in sync(self, func, *args, **kwargs)
674 return future
675 else:
--> 676 return sync(self.loop, func, *args, **kwargs)
677
678 def __repr__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in sync(loop, func, *args, **kwargs)
275 e.wait(10)
276 if error[0]:
--> 277 six.reraise(*error[0])
278 else:
279 return result[0]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in f()
260 if timeout is not None:
261 future = gen.with_timeout(timedelta(seconds=timeout), future)
--> 262 result[0] = yield future
263 except Exception as exc:
264 error[0] = sys.exc_info()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
727
728 try:
--> 729 value = future.result()
730 except Exception:
731 exc_info = sys.exc_info()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
734 if exc_info is not None:
735 try:
--> 736 yielded = self.gen.throw(*exc_info) # type: ignore
737 finally:
738 # Break up a reference to itself
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in _gather(self, futures, errors, direct, local_worker)
1495 six.reraise(type(exception),
1496 exception,
-> 1497 traceback)
1498 if errors == 'skip':
1499 bad_keys.add(key)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
KilledWorker: ("('from-delayed-pandas_read_text-read-block-try_loc-ebb1f07a0b9e9ef0362fea6ff6407461', 36)", 'tcp://127.0.0.1:6717')
I just want to do this relatively simple operation on a large data size
(please note that I only showed a snippet of the errors, since there are a lot)
Any type of help would be appreciated
I'm Trying to convert a column into category in order to perform a pivot_table operation.
I've tried the following:
user_item_df = user_item.pivot_table(index='msno',
columns='song_id',
values='interacted',
aggfunc='mean')
And I got this:
ValueError Traceback (most recent call last)
<ipython-input-76-a870ece1f3e8> in <module>
2 columns='song_id',
3 values='interacted',
----> 4 aggfunc='mean')
~/anaconda3/lib/python3.6/site-packages/dask/dataframe/core.py in pivot_table(self, index, columns, values, aggfunc)
3123 from .reshape import pivot_table
3124 return pivot_table(self, index=index, columns=columns, values=values,
-> 3125 aggfunc=aggfunc)
3126
3127 def to_records(self, index=False):
~/anaconda3/lib/python3.6/site-packages/dask/dataframe/reshape.py in pivot_table(df, index, columns, values, aggfunc)
190 raise ValueError("'columns' must be the name of an existing column")
191 if not is_categorical_dtype(df[columns]):
--> 192 raise ValueError("'columns' must be category dtype")
193 if not has_known_categories(df[columns]):
194 raise ValueError("'columns' must have known categories. Please use "
ValueError: 'columns' must be category dtype
So I've tried to convert the column:
user_item.song_id = user_item.song_id.astype('category')
But I got this when calling pivot_table:
ValueError Traceback (most recent call last)
<ipython-input-78-a870ece1f3e8> in <module>
2 columns='song_id',
3 values='interacted',
----> 4 aggfunc='mean')
~/anaconda3/lib/python3.6/site-packages/dask/dataframe/core.py in pivot_table(self, index, columns, values, aggfunc)
3123 from .reshape import pivot_table
3124 return pivot_table(self, index=index, columns=columns, values=values,
-> 3125 aggfunc=aggfunc)
3126
3127 def to_records(self, index=False):
~/anaconda3/lib/python3.6/site-packages/dask/dataframe/reshape.py in pivot_table(df, index, columns, values, aggfunc)
192 raise ValueError("'columns' must be category dtype")
193 if not has_known_categories(df[columns]):
--> 194 raise ValueError("'columns' must have known categories. Please use "
195 "`df[columns].cat.as_known()` beforehand to ensure "
196 "known categories")
ValueError: 'columns' must have known categories. Please use `df[columns].cat.as_known()` beforehand to ensure known categories
Then I tried:
user_item.song_id = user_item.song_id.astype('category').cat.as_known()
And I immediately got:
KeyError Traceback (most recent call last)
<timed exec> in <module>
~/anaconda3/lib/python3.6/site-packages/dask/dataframe/categorical.py in as_known(self, **kwargs)
187 if self.known:
188 return self._series
--> 189 categories = self._property_map('categories').unique().compute(**kwargs)
190 return self.set_categories(categories.values)
191
~/anaconda3/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
~/anaconda3/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
395 keys = [x.__dask_keys__() for x in collections]
396 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 397 results = schedule(dsk, keys, **kwargs)
398 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
399
~/anaconda3/lib/python3.6/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2336 try:
2337 results = self.gather(packed, asynchronous=asynchronous,
-> 2338 direct=direct)
2339 finally:
2340 for f in futures.values():
~/anaconda3/lib/python3.6/site-packages/distributed/client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1660 return self.sync(self._gather, futures, errors=errors,
1661 direct=direct, local_worker=local_worker,
-> 1662 asynchronous=asynchronous)
1663
1664 #gen.coroutine
~/anaconda3/lib/python3.6/site-packages/distributed/client.py in sync(self, func, *args, **kwargs)
674 return future
675 else:
--> 676 return sync(self.loop, func, *args, **kwargs)
677
678 def __repr__(self):
~/anaconda3/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
275 e.wait(10)
276 if error[0]:
--> 277 six.reraise(*error[0])
278 else:
279 return result[0]
~/anaconda3/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
684 if value.__traceback__ is not tb:
685 raise value.with_traceback(tb)
--> 686 raise value
687
688 else:
~/anaconda3/lib/python3.6/site-packages/distributed/utils.py in f()
260 if timeout is not None:
261 future = gen.with_timeout(timedelta(seconds=timeout), future)
--> 262 result[0] = yield future
263 except Exception as exc:
264 error[0] = sys.exc_info()
~/anaconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
~/anaconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
~/anaconda3/lib/python3.6/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1501 six.reraise(type(exception),
1502 exception,
-> 1503 traceback)
1504 if errors == 'skip':
1505 bad_keys.add(key)
~/anaconda3/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
683 value = tp()
684 if value.__traceback__ is not tb:
--> 685 raise value.with_traceback(tb)
686 raise value
687
/home/pi/env/lib/python3.5/site-packages/dask/dataframe/core.py in apply_and_enforce()
KeyError: '_func'
And the output of my workers is:
Exception: KeyError('_func',)
distributed.worker - WARNING - Compute Failed
Function: execute_task
args: ((<function apply at 0x764b3c90>, <function unique at 0x6ef24a50>, [(<function apply_and_enforce at 0x6eeede88>, <function Accessor._delegate_property at 0x6ef28198>, [(<function apply_and_enforce at 0x6eeede88>, <methodcaller: astype>, [(<built-in function getitem>, (<function apply at 0x764b3c90>, <function partial_by_order at 0x762ebd20>, [ msno ... interacted
0 vDi/nHqBu7wb+DtI2Ix4TupWQatUEFR41mDC0c8Voh8= ... 1
1 3IGfhB6dtaYxEGm20yFtRxN7KoFZjzGJbXPSjsjW5cM= ... 1
2 4QugsKXr1pJXSBj6CbSYCF6O7QY2/MHGICUU16p3fig= ... 1
3 i4g6DQpmkTuRCS6/osUsQ8GSBJM8261is4Q04NDGRPk= ... 1
4 TTNNMisplhps4y5gdQ6rsv0++TIKOOIIZLz05W97vFU= ... 1
5 sDR8kS+t73zE9QM8D03Zw3mVrsRXc0Nih/WRl02sfZI= ... 1
6 yiGYGWyGrCYHlMOtPv65urw9RfdH43PNGzu8TRaO+m8= ... 1
7 7lXXPZLRbAPWE5ILi2BFQVEhYzPz9cwNvuzIVCuHfZY= ... 1
8 4clHF4wjaFgY6+nQWoXm1EEAvB
kwargs: {}
Exception: KeyError('_func',)
If anyone Know how to fix this issue it would helps me a lot.
Solved by putting the same version of dask-distributed dask-core across all the workers, scheduler and client.
I'm following this https://www.mongodb.com/blog/post/getting-started-with-python-and-mongodb introductory tutorial. I can connect to the cluster fine with mongo shell, but not with pymongo (Python: 3.6.1, Pymongo 3.4.0). Pymongo works okay with a local mongodb. What is the problem? Below is the exception I get:
----------------------------------------------------------------------
-----
KeyError Traceback (most recent call
last)
<ipython-input-22-1c9d47341338> in <module>()
----> 1 server_status_result = db.command('serverStatus')
2 pprint(server_status_result)
/home/tim/.virtualenvs/main/lib/python3.6/site-
packages/pymongo/database.py in command(self, command, value, check,
allowable_errors, read_preference, codec_options, **kwargs)
489 """
490 client = self.__client
--> 491 with client._socket_for_reads(read_preference) as
(sock_info, slave_ok):
492 return self._command(sock_info, command, slave_ok,
value,
493 check, allowable_errors,
read_preference,
/usr/lib/python3.6/contextlib.py in __enter__(self)
80 def __enter__(self):
81 try:
---> 82 return next(self.gen)
83 except StopIteration:
84 raise RuntimeError("generator didn't yield") from None
/home/tim/.virtualenvs/main/lib/python3.6/site-
packages/pymongo/mongo_client.py in _socket_for_reads(self,
read_preference)
857 topology = self._get_topology()
858 single = topology.description.topology_type ==
TOPOLOGY_TYPE.Single
--> 859 with self._get_socket(read_preference) as sock_info:
860 slave_ok = (single and not sock_info.is_mongos) or (
861 preference != ReadPreference.PRIMARY)
/usr/lib/python3.6/contextlib.py in __enter__(self)
80 def __enter__(self):
81 try:
---> 82 return next(self.gen)
83 except StopIteration:
84 raise RuntimeError("generator didn't yield") from None
/home/tim/.virtualenvs/main/lib/python3.6/site-
packages/pymongo/mongo_client.py in _get_socket(self, selector)
823 server = self._get_topology().select_server(selector)
824 try:
--> 825 with server.get_socket(self.__all_credentials) as
sock_info:
826 yield sock_info
827 except NetworkTimeout:
/usr/lib/python3.6/contextlib.py in __enter__(self)
80 def __enter__(self):
81 try:
---> 82 return next(self.gen)
83 except StopIteration:
84 raise RuntimeError("generator didn't yield") from None
/home/tim/.virtualenvs/main/lib/python3.6/site-
packages/pymongo/server.py in get_socket(self, all_credentials,
checkout)
166 #contextlib.contextmanager
167 def get_socket(self, all_credentials, checkout=False):
--> 168 with self.pool.get_socket(all_credentials, checkout)
as sock_info:
169 yield sock_info
170
/usr/lib/python3.6/contextlib.py in __enter__(self)
80 def __enter__(self):
81 try:
---> 82 return next(self.gen)
83 except StopIteration:
84 raise RuntimeError("generator didn't yield") from None
/home/tim/.virtualenvs/main/lib/python3.6/site-
packages/pymongo/pool.py in get_socket(self, all_credentials,
checkout)
790 sock_info = self._get_socket_no_auth()
791 try:
--> 792 sock_info.check_auth(all_credentials)
793 yield sock_info
794 except:
/home/tim/.virtualenvs/main/lib/python3.6/site-
packages/pymongo/pool.py in check_auth(self, all_credentials)
510
511 for credentials in cached - authset:
--> 512 auth.authenticate(credentials, self)
513 self.authset.add(credentials)
514
/home/tim/.virtualenvs/main/lib/python3.6/site-
packages/pymongo/auth.py in authenticate(credentials, sock_info)
468 mechanism = credentials.mechanism
469 auth_func = _AUTH_MAP.get(mechanism)
--> 470 auth_func(credentials, sock_info)
471
472
/home/tim/.virtualenvs/main/lib/python3.6/site-
packages/pymongo/auth.py in _authenticate_default(credentials,
sock_info)
448 def _authenticate_default(credentials, sock_info):
449 if sock_info.max_wire_version >= 3:
--> 450 return _authenticate_scram_sha1(credentials,
sock_info)
451 else:
452 return _authenticate_mongo_cr(credentials, sock_info)
/home/tim/.virtualenvs/main/lib/python3.6/site-
packages/pymongo/auth.py in _authenticate_scram_sha1(credentials,
sock_info)
227 ('conversationId', res['conversationId']),
228 ('payload', Binary(client_final))])
--> 229 res = sock_info.command(source, cmd)
230
231 parsed = _parse_scram_response(res['payload'])
/home/tim/.virtualenvs/main/lib/python3.6/site-
packages/pymongo/pool.py in command(self, dbname, spec, slave_ok,
read_preference, codec_options, check, allowable_errors, check_keys,
read_concern, write_concern, parse_write_concern_error, collation)
422 # Catch socket.error, KeyboardInterrupt, etc. and close
ourselves.
423 except BaseException as error:
--> 424 self._raise_connection_failure(error)
425
426 def send_message(self, message, max_doc_size):
/home/tim/.virtualenvs/main/lib/python3.6/site-
packages/pymongo/pool.py in _raise_connection_failure(self, error)
550 _raise_connection_failure(self.address, error)
551 else:
--> 552 raise error
553
554 def __eq__(self, other):
/home/tim/.virtualenvs/main/lib/python3.6/site-
packages/pymongo/pool.py in command(self, dbname, spec, slave_ok,
read_preference, codec_options, check, allowable_errors, check_keys,
read_concern, write_concern, parse_write_concern_error, collation)
417 read_concern,
418
parse_write_concern_error=parse_write_concern_error,
--> 419 collation=collation)
420 except OperationFailure:
421 raise
/home/tim/.virtualenvs/main/lib/python3.6/site-p
ackages/pymongo/network.py in command(sock, dbname, spec, slave_ok,
is_mongos, read_preference, codec_options, check, allowable_errors,
address, check_keys, listeners, max_bson_size, read_concern,
parse_write_concern_error, collation)
108 response = receive_message(sock, 1, request_id)
109 unpacked = helpers._unpack_response(
--> 110 response, codec_options=codec_options)
111
112 response_doc = unpacked['data'][0]
/home/tim/.virtualenvs/main/lib/python3.6/site-
packages/pymongo/helpers.py in _unpack_response(response, cursor_id,
codec_options)
126 # Fake the ok field if it doesn't exist.
127 error_object.setdefault("ok", 0)
--> 128 if error_object["$err"].startswith("not master"):
129 raise NotMasterError(error_object["$err"],
error_object)
130 elif error_object.get("code") == 50:
KeyError: '$err'
I believe this is an Atlas bug, I've reported it to the team. The bug is, if you fail to log in to Atlas because your username or password are incorrect, it replies in a way that makes PyMongo throw a KeyError instead of the proper OperationFailure("auth failed").
PyMongo does work with Atlas, however, if you properly format your connection string with your username and password. Make sure your username and password are URL-quoted. Substitute your username and password into this Python code:
from urllib import quote_plus
print(quote_plus('MY USERNAME'))
print(quote_plus('MY PASSWORD'))
Take the output and put it into the connection string Atlas gave you, e.g. if your username is jesse#example.com and your password is "foo:bar", put that in the first part of the string, and get the rest of the string from the Atlas control panel for your account:
mongodb://jesse%40example.com:foo%3Abar/#cluster0-shard-00-00-abc.mongodb.net:27017,cluster0-shard-00-01-abc.mongodb.net:27017,cluster0-shard-00-02-abc.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin
Note how "jesse#example.com" has become "jesse%40example.com", and "foo:bar" has become "foo%3Abar".