How can I properly serialize MetPy units (based on pint) to work with Dask distributed? As far as I understand, Dask distributed automatically pickles data for ease of transfer, but it fails to pickle the MetPy units, which are needed for the computation. Error produced: TypeError: cannot pickle 'weakref' object. MWE below.
import metpy.calc as mpcalc
from metpy.units import units
from dask.distributed import Client, LocalCluster
def calculate_dewpoint(vapor_pressure):
    dewpoint = mpcalc.dewpoint(vapor_pressure * units('hPa'))
    return dewpoint
cluster = LocalCluster()
client = Client(cluster)
## works
vapor_pressure = 5
dp = calculate_dewpoint(vapor_pressure)
print(dp)
## doesn't work
vapor_pressure = 5
dp_future = client.submit(calculate_dewpoint, vapor_pressure)
dp = dp_future.result()
EDIT: Added full traceback.
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/distributed/worker.py in dumps_function(func)
4271 with _cache_lock:
-> 4272 result = cache_dumps[func]
4273 except KeyError:
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/distributed/utils.py in __getitem__(self, key)
1362 def __getitem__(self, key):
-> 1363 value = super().__getitem__(key)
1364 self.data.move_to_end(key)
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/collections/__init__.py in __getitem__(self, key)
1009 return self.__class__.__missing__(self, key)
-> 1010 raise KeyError(key)
1011 def __setitem__(self, key, item): self.data[key] = item
KeyError: <function calculate_dewpoint at 0x2ad5e010f0d0>
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
52 buffers.clear()
---> 53 result = cloudpickle.dumps(x, **dump_kwargs)
54 elif not _always_use_pickle_for(x) and b"__main__" in result:
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
72 )
---> 73 cp.dump(obj)
74 return file.getvalue()
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dump(self, obj)
601 try:
--> 602 return Pickler.dump(self, obj)
603 except RuntimeError as e:
TypeError: cannot pickle 'weakref' object
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
/glade/scratch/cbecker/ipykernel_272346/952144406.py in <module>
20 ## doesn't work
21 vapor_pressure = 5
---> 22 dp_future = client.submit(calculate_dewpoint, vapor_pressure)
23 dp = dp_future.result()
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/distributed/client.py in submit(self, func, key, workers, resources, retries, priority, fifo_timeout, allow_other_workers, actor, actors, pure, *args, **kwargs)
1577 dsk = {skey: (func,) + tuple(args)}
1578
-> 1579 futures = self._graph_to_futures(
1580 dsk,
1581 [skey],
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/distributed/client.py in _graph_to_futures(self, dsk, keys, workers, allow_other_workers, priority, user_priority, resources, retries, fifo_timeout, actors)
2628 # Pack the high level graph before sending it to the scheduler
2629 keyset = set(keys)
-> 2630 dsk = dsk.__dask_distributed_pack__(self, keyset, annotations)
2631
2632 # Create futures before sending graph (helps avoid contention)
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/dask/highlevelgraph.py in __dask_distributed_pack__(self, client, client_keys, annotations)
1074 "__module__": layer.__module__,
1075 "__name__": type(layer).__name__,
-> 1076 "state": layer.__dask_distributed_pack__(
1077 self.get_all_external_keys(),
1078 self.key_dependencies,
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/dask/highlevelgraph.py in __dask_distributed_pack__(self, all_hlg_keys, known_key_dependencies, client, client_keys)
432 for k, v in dsk.items()
433 }
--> 434 dsk = toolz.valmap(dumps_task, dsk)
435 return {"dsk": dsk, "dependencies": dependencies}
436
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/cytoolz/dicttoolz.pyx in cytoolz.dicttoolz.valmap()
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/cytoolz/dicttoolz.pyx in cytoolz.dicttoolz.valmap()
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/distributed/worker.py in dumps_task(task)
4308 return d
4309 elif not any(map(_maybe_complex, task[1:])):
-> 4310 return {"function": dumps_function(task[0]), "args": warn_dumps(task[1:])}
4311 return to_serialize(task)
4312
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/distributed/worker.py in dumps_function(func)
4272 result = cache_dumps[func]
4273 except KeyError:
-> 4274 result = pickle.dumps(func, protocol=4)
4275 if len(result) < 100000:
4276 with _cache_lock:
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
58 try:
59 buffers.clear()
---> 60 result = cloudpickle.dumps(x, **dump_kwargs)
61 except Exception as e:
62 logger.info("Failed to serialize %s. Exception: %s", x, e)
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
71 file, protocol=protocol, buffer_callback=buffer_callback
72 )
---> 73 cp.dump(obj)
74 return file.getvalue()
75
/glade/work/cbecker/miniconda3/envs/risk/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dump(self, obj)
600 def dump(self, obj):
601 try:
--> 602 return Pickler.dump(self, obj)
603 except RuntimeError as e:
604 if "recursion" in e.args[0]:
TypeError: cannot pickle 'weakref' object
So there's an issue where (I think) Dask is trying to serialize the unit registry, or the units themselves, in order to transfer them between processes. To work around this, try moving the import of units inside the function (though this might cause some other problems):
def calculate_dewpoint(vapor_pressure):
    from metpy.units import units
    return mpcalc.dewpoint(vapor_pressure * units('hPa'))
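A minimal usage sketch with the cluster from the question; the second variant is an assumption, only relevant if shipping the resulting pint Quantity back to the client ever causes similar trouble:

# Submit the reworked function; only its bytecode (plus a by-name reference to
# mpcalc) gets pickled now, not the module-level pint unit registry.
dp_future = client.submit(calculate_dewpoint, 5)
print(dp_future.result())

# Hypothetical variant that returns a plain float instead of a pint Quantity:
def calculate_dewpoint_magnitude(vapor_pressure):
    import metpy.calc as mpcalc
    from metpy.units import units
    return mpcalc.dewpoint(vapor_pressure * units('hPa')).magnitude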
Related
I'm trying to do multiprocessing using Dask. I have a function that has to run for 10,000 files and generates files as output. The function takes a file from an S3 bucket as input and works with another file from S3 with a similar date and time. I'm doing everything in JupyterLab.
So here's my function:
import gzip
import zipfile

import numpy as np
import pandas as pd
import s3fs
import xarray as xr

# NOTE: `s3` (the boto3 resource) and `BUCKET` used at the end are defined elsewhere in the notebook.
def get_temp(file, name):
    d = [name[0:4], name[4:6], name[6:8], name[9:11], name[11:13]]
    f_zip = gzip.decompress(file)
    yr = d[0]
    mo = d[1]
    da = d[2]
    hr = d[3]
    mn = d[4]
    fs = s3fs.S3FileSystem(anon=True)
    period = pd.Period(str(yr)+str('-')+str(mo)+str('-')+str(da), freq='D')
    # period.dayofyear
    dy = period.dayofyear
    cc = [7,8,9,10,11,12,13,14,15,16]  # look at the IR channels only for now
    dat = xr.open_dataset(f_zip)
    dd = dat[['recNum','trackLat','trackLon', 'temp']]
    dd = dd.to_dataframe()
    dd = dd.dropna()
    dd['num'] = np.arange(len(dd))
    l = dd.where((dd.trackLat>-50.0) & (dd.trackLat<50.0) & (dd.trackLon>-110.0) & (dd.trackLon<10.0))
    l = l.dropna()
    l.reset_index()
    dy = "{0:0=3d}".format(dy)
    # opening GOES data from S3
    F = xr.open_dataset(fs.open(fs.glob('s3://noaa-goes16/ABI-L1b-RadF/'+str(yr)+'/'+str(dy)+'/'+str(hr)+'/'+'OR_ABI-L1b-RadF-M3C07'+'*')[int(mn)//15]))
    # converting lat/lon to ABI scan angles
    req = F['goes_imager_projection'].semi_major_axis
    oneovf = F['goes_imager_projection'].inverse_flattening
    rpol = F['goes_imager_projection'].semi_minor_axis
    e = 0.0818191910435
    sat_h = F['goes_imager_projection'].perspective_point_height
    H = req + sat_h
    gc = np.deg2rad(F['goes_imager_projection'].longitude_of_projection_origin)
    phi = np.deg2rad(l.trackLat.values)
    gam = np.deg2rad(l.trackLon.values)
    phic = np.arctan((rpol**2/req**2)*np.tan(phi))
    rc = rpol/np.sqrt((1-e**2*np.cos(phic)**2))
    sx = H - rc*np.cos(phic)*np.cos(gam-gc)
    sy = -rc*np.cos(phic)*np.sin(gam-gc)
    sz = rc*np.sin(phic)
    yy = np.arctan(sz/sx)
    xx = np.arcsin(-sy/(np.sqrt(sx**2+sy**2+sz**2)))
    for i in range(len(xx)):
        for c in range(len(cc)):  # was `range(len(ch)` -- missing parenthesis and wrong name
            ch = "{0:0=2d}".format(cc[c])
            F1 = xr.open_dataset(fs.open(fs.glob('s3://noaa-goes16/ABI-L1b-RadF/'+str(yr)+'/'+str(dy)+'/'+str(hr)+'/'+'OR_ABI-L1b-RadF-M3C'+ch+'*')[0]))
            F2 = xr.open_dataset(fs.open(fs.glob('s3://noaa-goes16/ABI-L1b-RadF/'+str(yr)+'/'+str(dy)+'/'+str("{0:0=2d}".format(hr))+'/'+'OR_ABI-L1b-RadF-M3C'+ch+'*')[-1]))
            G1 = F1.where((F1.x >= (xx[i]-0.005)) & (F1.x <= (xx[i]+0.005)) & (F1.y >= (yy[i]-0.005)) & (F1.y <= (yy[i]+0.005)), drop=True)
            G2 = F2.where((F2.x >= (xx[i]-0.005)) & (F2.x <= (xx[i]+0.005)) & (F2.y >= (yy[i]-0.005)) & (F2.y <= (yy[i]+0.005)), drop=True)
            G = xr.concat([G1, G2], dim='time')
            G = G.assign_coords(channel=(ch))
            if c == 0:
                T = G
            else:
                T = xr.concat([T, G], dim='channel')
        T = T.assign_coords(temp=(str(l['temp'][i])))
        print(l.iloc[i]['num'])
        path = name+'_'+str(int(l.iloc[i]['num']))+'.nc'
        T.to_netcdf(path)
        # zipping the file
        with zipfile.ZipFile(name+'_'+str(int(l.iloc[i]['num']))+'.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zf:
            zf.write(path, arcname=str(name+'_'+str(int(l.iloc[i]['num']))+'.nc'))
        # storing it to S3
        s3.Bucket(BUCKET).upload_file(path[:-3]+'.zip', "Output/" + path[:-3]+'.zip')
Here's how I'm reading the data from S3:
import boto3

s3 = boto3.resource('s3')
s3client = boto3.client(
    's3',
    region_name='us-east-1'
)
bucketname = s3.Bucket('temp')
filedata = []
keys = []
names = []
for my_bucket_object in bucketname.objects.all():
    keys.append(my_bucket_object.key)
for i in range(1, 21):
    fileobj = s3client.get_object(
        Bucket='temp',
        Key=(keys[i]))
    filedata.append(fileobj['Body'].read())
    names.append(keys[i][10:-3])
Initially, I'm just trying to run 20 files for testing purposes.
Here's how I'm creating the Dask delayed tasks and computing them:
temp_files = []
for i in range(20):
    s3_ds = dask.delayed(get_temp)(filedata[i], names[i])
    temp_files.append(s3_ds)
temp_files = dask.compute(*temp_files)
Here's the full error log:
distributed.protocol.pickle - INFO - Failed to serialize <function get_temp at 0x7f20a9cb8550>. Exception: cannot pickle '_thread.lock' object
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/worker.py in dumps_function(func)
3319 with _cache_lock:
-> 3320 result = cache_dumps[func]
3321 except KeyError:
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/utils.py in __getitem__(self, key)
1572 def __getitem__(self, key):
-> 1573 value = super().__getitem__(key)
1574 self.data.move_to_end(key)
/srv/conda/envs/notebook/lib/python3.8/collections/__init__.py in __getitem__(self, key)
1009 return self.__class__.__missing__(self, key)
-> 1010 raise KeyError(key)
1011 def __setitem__(self, key, item): self.data[key] = item
KeyError: <function get_temp at 0x7f20a9cb8550>
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
52 buffers.clear()
---> 53 result = cloudpickle.dumps(x, **dump_kwargs)
54 elif not _always_use_pickle_for(x) and b"__main__" in result:
/srv/conda/envs/notebook/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
72 )
---> 73 cp.dump(obj)
74 return file.getvalue()
/srv/conda/envs/notebook/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dump(self, obj)
562 try:
--> 563 return Pickler.dump(self, obj)
564 except RuntimeError as e:
TypeError: cannot pickle '_thread.lock' object
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-77-fa46004f5919> in <module>
----> 1 temp_files = dask.compute(*temp_files)
/srv/conda/envs/notebook/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
450 postcomputes.append(x.__dask_postcompute__())
451
--> 452 results = schedule(dsk, keys, **kwargs)
453 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
454
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2703 Client.compute: Compute asynchronous collections
2704 """
-> 2705 futures = self._graph_to_futures(
2706 dsk,
2707 keys=set(flatten([keys])),
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/client.py in _graph_to_futures(self, dsk, keys, restrictions, loose_restrictions, priority, user_priority, resources, retries, fifo_timeout, actors)
2639 {
2640 "op": "update-graph",
-> 2641 "tasks": valmap(dumps_task, dsk),
2642 "dependencies": dependencies,
2643 "keys": list(map(tokey, keys)),
/srv/conda/envs/notebook/lib/python3.8/site-packages/cytoolz/dicttoolz.pyx in cytoolz.dicttoolz.valmap()
/srv/conda/envs/notebook/lib/python3.8/site-packages/cytoolz/dicttoolz.pyx in cytoolz.dicttoolz.valmap()
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/worker.py in dumps_task(task)
3356 return d
3357 elif not any(map(_maybe_complex, task[1:])):
-> 3358 return {"function": dumps_function(task[0]), "args": warn_dumps(task[1:])}
3359 return to_serialize(task)
3360
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/worker.py in dumps_function(func)
3320 result = cache_dumps[func]
3321 except KeyError:
-> 3322 result = pickle.dumps(func, protocol=4)
3323 if len(result) < 100000:
3324 with _cache_lock:
/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
58 try:
59 buffers.clear()
---> 60 result = cloudpickle.dumps(x, **dump_kwargs)
61 except Exception as e:
62 logger.info("Failed to serialize %s. Exception: %s", x, e)
/srv/conda/envs/notebook/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
71 file, protocol=protocol, buffer_callback=buffer_callback
72 )
---> 73 cp.dump(obj)
74 return file.getvalue()
75
/srv/conda/envs/notebook/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dump(self, obj)
561 def dump(self, obj):
562 try:
--> 563 return Pickler.dump(self, obj)
564 except RuntimeError as e:
565 if "recursion" in e.args[0]:
TypeError: cannot pickle '_thread.lock' object
Can someone help me here and tell me what I'm doing wrong? And apart from Dask, is there any other way to do this kind of parallel processing?
So I figured out that the error is only thrown when I upload the file to the S3 bucket; otherwise it works fine. But if I don't save the files to S3, I can't figure out where they end up. When I run the code with Dask it saves the files somewhere I'm not able to find. I'm running my code in JupyterLab and nothing shows up in any directory.
I have taken some time to parse your code.
In the large function, you use s3fs to interact with your cloud storage, and this works well with xarray.
However, in your main code, you use boto3 to list and open S3 files. These files retain a reference to the client object, which maintains a connection pool. That is the thing that cannot be pickled.
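To see that failure in isolation (an illustration of the claim, not code from the post; the exact exception text varies with the boto3/botocore version):

import pickle
import boto3

s3 = boto3.resource('s3')   # holds a botocore client and its connection pool
pickle.dumps(s3)            # raises TypeError, e.g. "cannot pickle '_thread.lock' object"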
s3fs is designed to work with Dask, and ensures the picklability of the filesystem instances and OpenFile objects. Since you already use it in one part, I would recommend using s3fs throughout (but I am, of course, biased, since I am the main author).
Alternatively, you could pass just the file names (as strings), and not open anything until within the worker function. This would be "best practice" - you should load data in worker tasks, rather than loading in the client and passing the data.
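A rough sketch of that pattern, assuming the same 'temp' bucket as in the question; get_temp_from_key, its key handling, and the elided body are placeholders for the original processing:

import dask
import gzip
import s3fs

def get_temp_from_key(key):
    # Open the object only inside the task, on the worker, via s3fs, so no
    # boto3 client (and its connection pool) gets captured in the task graph.
    fs = s3fs.S3FileSystem()
    f_zip = gzip.decompress(fs.cat(key))
    name = key.split('/')[-1][:-3]   # adapt to your key naming (the question used keys[i][10:-3])
    # ... the rest of the processing from get_temp, unchanged ...
    # For the final upload, fs.put(zip_path, 'temp/Output/' + zip_path) keeps
    # everything on s3fs as well.

fs = s3fs.S3FileSystem()
keys = fs.ls('temp')[1:21]   # the same 20 test objects as before
tasks = [dask.delayed(get_temp_from_key)(k) for k in keys]
results = dask.compute(*tasks)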
I am trying to use dask to parallelise some code. The function that I parallelise has 3 arguments, but only one of these changes as the loop progresses. This is what I have so far:
import dask
import numpy as np
from dask import distributed
from dask_jobqueue import SLURMCluster

# Set up client
cluster = SLURMCluster(cores=1, memory='40 GB',
                       queue='brc', interface='em1',
                       log_directory='./dask_logs')
cluster.scale(jobs=2)
client = distributed.Client(cluster)

# Function to be parallelised
def nT_loop(i, P, inv_DiagCe):
    x = P[:, i] * np.squeeze(-inv_DiagCe)
    return x

P = np.random.rand(64620, 64620)
inv_DiagCe = np.random.rand(64620)

# Run loop
res1 = []
for i in range(2):
    res = dask.delayed(nT_loop)(i, P, inv_DiagCe)
    res1.append(res)

# Compute results
res1 = dask.compute(*res1)
When I run this, however, it gives the following error:
~/miniconda3/envs/python38/lib/python3.8/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
48 buffers.clear()
---> 49 result = pickle.dumps(x, **dump_kwargs)
50 if len(result) < 1000:
MemoryError:
During handling of the above exception, another exception occurred:
MemoryError Traceback (most recent call last)
~/wang_model/estimation.py in
208 #P[:,i] = P[:,i]* np.squeeze(-inv_DiagCe) #bsxfun(#times, P(:,i), -inv_DiagCe');
209
---> 210 res1 = dask.compute(*res1)
211 print(datetime.now().strftime("%H:%M:%S"))
~/miniconda3/envs/python38/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
450 postcomputes.append(x.__dask_postcompute__())
451
--> 452 results = schedule(dsk, keys, **kwargs)
453 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
454
~/miniconda3/envs/python38/lib/python3.8/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2703 Client.compute: Compute asynchronous collections
2704 """
-> 2705 futures = self._graph_to_futures(
2706 dsk,
2707 keys=set(flatten([keys])),
~/miniconda3/envs/python38/lib/python3.8/site-packages/distributed/client.py in _graph_to_futures(self, dsk, keys, restrictions, loose_restrictions, priority, user_priority, resources, retries, fifo_timeout, actors)
2639 {
2640 "op": "update-graph",
-> 2641 "tasks": valmap(dumps_task, dsk),
2642 "dependencies": dependencies,
2643 "keys": list(map(tokey, keys)),
~/miniconda3/envs/python38/lib/python3.8/site-packages/cytoolz/dicttoolz.pyx in cytoolz.dicttoolz.valmap()
~/miniconda3/envs/python38/lib/python3.8/site-packages/cytoolz/dicttoolz.pyx in cytoolz.dicttoolz.valmap()
~/miniconda3/envs/python38/lib/python3.8/site-packages/distributed/worker.py in dumps_task(task)
3356 return d
3357 elif not any(map(_maybe_complex, task[1:])):
-> 3358 return {"function": dumps_function(task[0]), "args": warn_dumps(task[1:])}
3359 return to_serialize(task)
3360
~/miniconda3/envs/python38/lib/python3.8/site-packages/distributed/worker.py in warn_dumps(obj, dumps, limit)
3365 def warn_dumps(obj, dumps=pickle.dumps, limit=1e6):
3366 """ Dump an object to bytes, warn if those bytes are large """
-> 3367 b = dumps(obj, protocol=4)
3368 if not _warn_dumps_warned[0] and len(b) > limit:
3369 _warn_dumps_warned[0] = True
~/miniconda3/envs/python38/lib/python3.8/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
58 try:
59 buffers.clear()
---> 60 result = cloudpickle.dumps(x, **dump_kwargs)
61 except Exception as e:
62 logger.info("Failed to serialize %s. Exception: %s", x, e)
~/miniconda3/envs/python38/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
71 file, protocol=protocol, buffer_callback=buffer_callback
72 )
---> 73 cp.dump(obj)
74 return file.getvalue()
75
~/miniconda3/envs/python38/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dump(self, obj)
561 def dump(self, obj):
562 try:
--> 563 return Pickler.dump(self, obj)
564 except RuntimeError as e:
565 if "recursion" in e.args[0]:
MemoryError:
I think this may be related to the large size of 'P'. Does anyone have any advice?
Thanks
Here
P = np.random.rand(64620, 64620)
you produce a massive array (~33 GB) in memory, and then make copies of it to send to the workers. Summed over the whole loop, your function also returns an equally big amount of data.
You should at the very least use client.scatter to do this step alone, rather than include the array in the graph.
But actually, dask has a perfectly good interface for this, dask.array, designed to handle large arrays chunk-wise without breaking memory. I suggest you use it instead of your delayed-function approach.
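A minimal sketch of both suggestions, reusing the client, nT_loop, P and inv_DiagCe from the question (chunk sizes and names here are illustrative, not prescriptive):

import dask
import dask.array as da

# Option 1: scatter the big array once so it is not embedded in the task graph.
# Each worker still needs enough memory for its copy of P (~33 GB here).
P_future = client.scatter(P, broadcast=True)
res1 = [dask.delayed(nT_loop)(i, P_future, inv_DiagCe) for i in range(2)]
res1 = dask.compute(*res1)

# Option 2: never materialise P as a single NumPy array at all; build it as a
# chunked dask array and express the computation with array operations.
P_da = da.random.random((64620, 64620), chunks=(64620, 1000))
result = P_da * (-inv_DiagCe)[:, None]   # column i matches nT_loop(i, P, inv_DiagCe)
# result is still lazy; compute or store it in pieces rather than pulling the
# whole ~33 GB back to the client with a single .compute().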
I need to load some meteorological data to analyse several months, but the data is stored in files that each cover only one day, so I need to access many files at once.
I am following some instructions I was given, which told me to create a local Dask cluster with a memory limit on my computer.
from datetime import datetime, timedelta
import dask.array as da
from dask.distributed import Client, LocalCluster
import xarray
try:
    client
except NameError:
    client = Client(n_workers=1, threads_per_worker=4, memory_limit='2GB')
else:
    print("Client already exists")
After this, I create an array of dates that goes from 1st June to 1st October; it is needed in "files" to build the links to the meteorological data.
dates=[datetime(2019,6,1) + timedelta(days=i) for i in range(3*30)]
files= [date.strftime('http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/%Y/%m/wrf_arw_det_history_d03_%Y%m%d_0000.nc4') for date in dates]
My issue starts when I try to open all that data with
multi = xarray.open_mfdataset(files, preprocess= lambda a : a.isel(time=slice(0,24)))
It raises the error:
KeyError Traceback (most recent call last)
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in _acquire_with_cache_info(self, needs_lock)
197 try:
--> 198 file = self._cache[self._key]
199 except KeyError:
~\Nueva carpeta\lib\site-packages\xarray\backends\lru_cache.py in __getitem__(self, key)
52 with self._lock:
---> 53 value = self._cache[key]
54 self._cache.move_to_end(key)
KeyError: [<class 'netCDF4._netCDF4.Dataset'>, ('http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/2019/06/wrf_arw_det_history_d03_20190626_0000.nc4',), 'r', (('clobber', True), ('diskless', False), ('format', 'NETCDF4'), ('persist', False))]
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
<ipython-input-19-c3d0f4a8cc26> in <module>
----> 1 multi = xarray.open_mfdataset(files, preprocess= lambda a : a.isel(time=slice(0,24)))
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in open_mfdataset(paths, chunks, concat_dim, compat, preprocess, engine, lock, data_vars, coords, combine, autoclose, parallel, join, attrs_file, **kwargs)
916 getattr_ = getattr
917
--> 918 datasets = [open_(p, **open_kwargs) for p in paths]
919 file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
920 if preprocess is not None:
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in <listcomp>(.0)
916 getattr_ = getattr
917
--> 918 datasets = [open_(p, **open_kwargs) for p in paths]
919 file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
920 if preprocess is not None:
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in open_dataset(filename_or_obj, group, decode_cf, mask_and_scale, decode_times, autoclose, concat_characters, decode_coords, engine, chunks, lock, cache, drop_variables, backend_kwargs, use_cftime, decode_timedelta)
507 if engine == "netcdf4":
508 store = backends.NetCDF4DataStore.open(
--> 509 filename_or_obj, group=group, lock=lock, **backend_kwargs
510 )
511 elif engine == "scipy":
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in open(cls, filename, mode, format, group, clobber, diskless, persist, lock, lock_maker, autoclose)
356 netCDF4.Dataset, filename, mode=mode, kwargs=kwargs
357 )
--> 358 return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)
359
360 def _acquire(self, needs_lock=True):
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in __init__(self, manager, group, mode, lock, autoclose)
312 self._group = group
313 self._mode = mode
--> 314 self.format = self.ds.data_model
315 self._filename = self.ds.filepath()
316 self.is_remote = is_remote_uri(self._filename)
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in ds(self)
365 @property
366 def ds(self):
--> 367 return self._acquire()
368
369 def open_store_variable(self, name, var):
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in _acquire(self, needs_lock)
359
360 def _acquire(self, needs_lock=True):
--> 361 with self._manager.acquire_context(needs_lock) as root:
362 ds = _nc4_require_group(root, self._group, self._mode)
363 return ds
~\Nueva carpeta\lib\contextlib.py in __enter__(self)
110 del self.args, self.kwds, self.func
111 try:
--> 112 return next(self.gen)
113 except StopIteration:
114 raise RuntimeError("generator didn't yield") from None
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in acquire_context(self, needs_lock)
184 def acquire_context(self, needs_lock=True):
185 """Context manager for acquiring a file."""
--> 186 file, cached = self._acquire_with_cache_info(needs_lock)
187 try:
188 yield file
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in _acquire_with_cache_info(self, needs_lock)
202 kwargs = kwargs.copy()
203 kwargs["mode"] = self._mode
--> 204 file = self._opener(*self._args, **kwargs)
205 if self._mode == "w":
206 # ensure file doesn't get overriden when opened again
netCDF4\_netCDF4.pyx in netCDF4._netCDF4.Dataset.__init__()
netCDF4\_netCDF4.pyx in netCDF4._netCDF4._ensure_nc_success()
OSError: [Errno -37] NetCDF: Write to read only: b'http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/2019/06/wrf_arw_det_history_d03_20190626_0000.nc4'
Does anyone know why this error occurs?
I am getting an error while trying to access data from a tf.data.Dataset object.
The dataset object is built from a generator. Any help will be appreciated.
I'm using TensorFlow 2 and trying to run the example from https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator
import itertools
import tensorflow as tf

def gen():
    for i in itertools.count(1):
        yield (i, [1] * i)

dataset = tf.data.Dataset.from_generator(
    gen,
    (tf.int64, tf.int64),
    (tf.TensorShape([]), tf.TensorShape([None])))
list(dataset.take(3).as_numpy_iterator())
The error is:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/iterator_ops.py in _next_internal(self)
662 # Fast path for the case `self._structure` is not a nested structure.
--> 663 return self._element_spec._from_compatible_tensor_list(ret) # pylint: disable=protected-access
664 except AttributeError:
AttributeError: 'tuple' object has no attribute '_from_compatible_tensor_list'
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/eager/context.py in execution_mode(mode)
1896 ctx.executor = executor_new
-> 1897 yield
1898 finally:
...
NotFoundError Traceback (most recent call last)
<ipython-input-25-ac0e933e02b3> in <module>
----> 1 list(dataset.take(3).as_numpy_iterator())
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/dataset_ops.py in __next__(self)
3640
3641 def __next__(self):
-> 3642 return self.next()
3643
3644
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/dataset_ops.py in next(self)
3637
3638 def next(self):
-> 3639 return nest.map_structure(lambda x: x.numpy(), next(self._iterator))
3640
3641 def __next__(self):
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/iterator_ops.py in __next__(self)
628
629 def __next__(self): # For Python 3 compatibility
--> 630 return self.next()
631
632 def _next_internal(self):
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/iterator_ops.py in next(self)
672 """Returns a nested structure of `Tensor`s containing the next element."""
673 try:
--> 674 return self._next_internal()
675 except errors.OutOfRangeError:
676 raise StopIteration
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/iterator_ops.py in _next_internal(self)
663 return self._element_spec._from_compatible_tensor_list(ret) # pylint: disable=protected-access
664 except AttributeError:
--> 665 return structure.from_compatible_tensor_list(self._element_spec, ret)
666
667 @property
/opt/conda/lib/python3.6/contextlib.py in __exit__(self, type, value, traceback)
97 value = type()
98 try:
---> 99 self.gen.throw(type, value, traceback)
100 except StopIteration as exc:
101 # Suppress StopIteration *unless* it's the same exception that
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/eager/context.py in execution_mode(mode)
1898 finally:
1899 ctx.executor = executor_old
-> 1900 executor_new.wait()
1901
1902
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/eager/executor.py in wait(self)
65 def wait(self):
66 """Waits for ops dispatched in this executor to finish."""
---> 67 pywrap_tensorflow.TFE_ExecutorWaitForAllPendingNodes(self._handle)
68
69 def clear_error(self):
NotFoundError: No registered 'PyFunc' OpKernel for 'CPU' devices compatible with node {{node PyFunc}}
. Registered: <no registered kernels>
I was able to execute your code with TF 2.2.0 in Google Colab.
For the benefit of the community, I am posting the successful run with its output here:
import tensorflow as tf
print(tf.__version__)
import itertools

def gen():
    for i in itertools.count(1):
        yield (i, [1] * i)

dataset = tf.data.Dataset.from_generator(
    gen,
    (tf.int64, tf.int64),
    (tf.TensorShape([]), tf.TensorShape([None])))

list(dataset.take(3).as_numpy_iterator())
Output:
2.2.0
[(1, array([1])), (2, array([1, 1])), (3, array([1, 1, 1]))]
I'm trying to reproduce the coal mining example with a deterministic function for the switchpoint instead of using Theano's switch function. Code:
%matplotlib inline
import matplotlib.pyplot as plt
import pymc3
import numpy as np
import theano.tensor as t
import theano
data = np.hstack((np.random.poisson(15,1000),np.random.poisson(2,100)))
plt.plot(data)
@theano.compile.ops.as_op(itypes=[t.lscalar, t.dscalar, t.dscalar], otypes=[t.dvector])
def rate1(sw, mu1, mu2):
    n = len(data)
    out = np.empty(n)
    out[:sw] = mu1
    out[sw:] = mu2
    return out

with pymc3.Model() as dis:
    switchpoint = pymc3.DiscreteUniform('switchpoint', lower=0, upper=len(data)-1)
    mu1 = pymc3.Exponential('mu1', lam=1.)
    mu2 = pymc3.Exponential('mu2', lam=1.)
    disasters = pymc3.Poisson('disasters', mu=rate1, observed=data)
But this code raises an error:
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
c:\program files\git\theano\theano\tensor\type.py in dtype_specs(self)
    266                 'complex64': (complex, 'theano_complex64', 'NPY_COMPLEX64')
--> 267                 }[self.dtype]
    268             except KeyError:
KeyError: 'object'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
c:\program files\git\theano\theano\tensor\basic.py in constant_or_value(x, rtype, name, ndim, dtype)
    407             rval = rtype(
--> 408                 TensorType(dtype=x_.dtype, broadcastable=bcastable),
    409                 x_.copy(),
c:\program files\git\theano\theano\tensor\type.py in __init__(self, dtype, broadcastable, name, sparse_grad)
     49         self.broadcastable = tuple(bool(b) for b in broadcastable)
---> 50         self.dtype_specs()  # error checking is done there
     51         self.name = name
c:\program files\git\theano\theano\tensor\type.py in dtype_specs(self)
    269             raise TypeError("Unsupported dtype for %s: %s"
--> 270                             % (self.__class__.__name__, self.dtype))
    271
TypeError: Unsupported dtype for TensorType: object

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
c:\program files\git\theano\theano\tensor\basic.py in as_tensor_variable(x, name, ndim)
    201     try:
--> 202         return constant(x, name=name, ndim=ndim)
    203     except TypeError:
c:\program files\git\theano\theano\tensor\basic.py in constant(x, name, ndim, dtype)
    421     ret = constant_or_value(x, rtype=TensorConstant, name=name, ndim=ndim,
--> 422                             dtype=dtype)
    423
c:\program files\git\theano\theano\tensor\basic.py in constant_or_value(x, rtype, name, ndim, dtype)
    416     except Exception:
--> 417         raise TypeError("Could not convert %s to TensorType" % x, type(x))
    418
TypeError: ('Could not convert FromFunctionOp{rate1} to TensorType', )

During handling of the above exception, another exception occurred:

AsTensorError                             Traceback (most recent call last)
in ()
     14     mu2 = pymc3.Exponential('mu2',lam=1.)
     15     #rate1 = pymc3.switch(switchpoint >= np.arange(len(data)), mu1,mu2)
---> 16     disasters=pymc3.Poisson('disasters', mu=rate1, observed = data)
C:\Users\User\Anaconda3\lib\site-packages\pymc3\distributions\distribution.py in __new__(cls, name, *args, **kwargs)
     19         if isinstance(name, str):
     20             data = kwargs.pop('observed', None)
---> 21             dist = cls.dist(*args, **kwargs)
     22             return model.Var(name, dist, data)
     23         elif name is None:
C:\Users\User\Anaconda3\lib\site-packages\pymc3\distributions\distribution.py in dist(cls, *args, **kwargs)
     32     def dist(cls, *args, **kwargs):
     33         dist = object.__new__(cls)
---> 34         dist.__init__(*args, **kwargs)
     35         return dist
     36
C:\Users\User\Anaconda3\lib\site-packages\pymc3\distributions\discrete.py in __init__(self, mu, *args, **kwargs)
    185         super(Poisson, self).__init__(*args, **kwargs)
    186         self.mu = mu
--> 187         self.mode = floor(mu).astype('int32')
    188
    189     def random(self, point=None, size=None, repeat=None):
c:\program files\git\theano\theano\gof\op.py in __call__(self, *inputs, **kwargs)
    598         """
    599         return_list = kwargs.pop('return_list', False)
--> 600         node = self.make_node(*inputs, **kwargs)
    601
    602         if config.compute_test_value != 'off':
c:\program files\git\theano\theano\tensor\elemwise.py in make_node(self, *inputs)
    540         using DimShuffle.
    541         """
--> 542         inputs = list(map(as_tensor_variable, inputs))
    543         shadow = self.scalar_op.make_node(
    544             *[get_scalar_type(dtype=i.type.dtype).make_variable()
c:\program files\git\theano\theano\tensor\basic.py in as_tensor_variable(x, name, ndim)
    206     except Exception:
    207         str_x = repr(x)
--> 208         raise AsTensorError("Cannot convert %s to TensorType" % str_x, type(x))
    209
    210 # this has a different name, because _as_tensor_variable is the
AsTensorError: ('Cannot convert FromFunctionOp{rate1} to TensorType', )
How do I handle this?
The second thing: when I'm using the pymc3.switch function like this:
with pymc3.Model() as dis:
    switchpoint = pymc3.DiscreteUniform('switchpoint', lower=0, upper=len(data)-1)
    mu1 = pymc3.Exponential('mu1', lam=1.)
    mu2 = pymc3.Exponential('mu2', lam=1.)
    rate1 = pymc3.switch(switchpoint >= np.arange(len(data)), mu1, mu2)
    disasters = pymc3.Poisson('disasters', mu=rate1, observed=data)
And then I try to sample:
with dis:
    step1 = pymc3.NUTS([mu1, mu2])
    step2 = pymc3.Metropolis([switchpoint])
    trace = pymc3.sample(10000, step=[step1, step2])
I get an error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
c:\program files\git\theano\theano\compile\function_module.py in __call__(self, *args, **kwargs)
858 try:
--> 859 outputs = self.fn()
860 except Exception:
TypeError: expected type_num 9 (NPY_INT64) got 7
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-4-3247d908f897> in <module>()
2 step1 = pymc3.NUTS([mu1, mu2])
3 step2 = pymc3.Metropolis([switchpoint])
----> 4 trace = pymc3.sample(10000, step = [step1,step2])
C:\Users\User\Anaconda3\lib\site-packages\pymc3\sampling.py in sample(draws, step, start, trace, chain, njobs, tune, progressbar, model, random_seed)
153 sample_args = [draws, step, start, trace, chain,
154 tune, progressbar, model, random_seed]
--> 155 return sample_func(*sample_args)
156
157
C:\Users\User\Anaconda3\lib\site-packages\pymc3\sampling.py in _sample(draws, step, start, trace, chain, tune, progressbar, model, random_seed)
162 progress = progress_bar(draws)
163 try:
--> 164 for i, strace in enumerate(sampling):
165 if progressbar:
166 progress.update(i)
C:\Users\User\Anaconda3\lib\site-packages\pymc3\sampling.py in _iter_sample(draws, step, start, trace, chain, tune, model, random_seed)
244 if i == tune:
245 step = stop_tuning(step)
--> 246 point = step.step(point)
247 strace.record(point)
248 yield strace
C:\Users\User\Anaconda3\lib\site-packages\pymc3\step_methods\compound.py in step(self, point)
11 def step(self, point):
12 for method in self.methods:
---> 13 point = method.step(point)
14 return point
C:\Users\User\Anaconda3\lib\site-packages\pymc3\step_methods\arraystep.py in step(self, point)
116 bij = DictToArrayBijection(self.ordering, point)
117
--> 118 apoint = self.astep(bij.map(point))
119 return bij.rmap(apoint)
120
C:\Users\User\Anaconda3\lib\site-packages\pymc3\step_methods\metropolis.py in astep(self, q0)
123
124
--> 125 q_new = metrop_select(self.delta_logp(q,q0), q, q0)
126
127 if q_new is q:
c:\program files\git\theano\theano\compile\function_module.py in __call__(self, *args, **kwargs)
869 node=self.fn.nodes[self.fn.position_of_error],
870 thunk=thunk,
--> 871 storage_map=getattr(self.fn, 'storage_map', None))
872 else:
873 # old-style linkers raise their own exceptions
c:\program files\git\theano\theano\gof\link.py in raise_with_op(node, thunk, exc_info, storage_map)
312 # extra long error message in that case.
313 pass
--> 314 reraise(exc_type, exc_value, exc_trace)
315
316
C:\Users\User\Anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
656 value = tp()
657 if value.__traceback__ is not tb:
--> 658 raise value.with_traceback(tb)
659 raise value
660
c:\program files\git\theano\theano\compile\function_module.py in __call__(self, *args, **kwargs)
857 t0_fn = time.time()
858 try:
--> 859 outputs = self.fn()
860 except Exception:
861 if hasattr(self.fn, 'position_of_error'):
TypeError: expected type_num 9 (NPY_INT64) got 7
Apply node that caused the error: Elemwise{Composite{Switch(GE(i0, i1), i2, i3)}}(InplaceDimShuffle{x}.0, TensorConstant{[ 0 1..1098 1099]}, InplaceDimShuffle{x}.0, InplaceDimShuffle{x}.0)
Toposort index: 11
Inputs types: [TensorType(int64, (True,)), TensorType(int32, vector), TensorType(float64, (True,)), TensorType(float64, (True,))]
Inputs shapes: [(1,), (1100,), (1,), (1,)]
Inputs strides: [(4,), (4,), (8,), (8,)]
Inputs values: [array([549]), 'not shown', array([ 1.07762995]), array([ 1.01502801])]
Outputs clients: [[Elemwise{eq,no_inplace}(Elemwise{Composite{Switch(GE(i0, i1), i2, i3)}}.0, TensorConstant{(1,) of 0}), Elemwise{Composite{Switch(GE(i0, i1), ((Switch(i2, i3, (i4 * log(i0))) - i5) - i0), i3)}}[(0, 0)](Elemwise{Composite{Switch(GE(i0, i1), i2, i3)}}.0, TensorConstant{(1,) of 0}, InplaceDimShuffle{x}.0, TensorConstant{(1,) of -inf}, TensorConstant{[ 13. 13... 0. 1.]}, TensorConstant{[ 22.55216... ]})]]
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
Being a simple analyst, do I need to learn all this stuff about Theano to be able to work on my statistical problems? Is a new MCMC sampler with gradient support the only thing that should motivate me to switch from PyMC2 to PyMC3?
For your first question, it looks like you're trying to pass a theano function as a variable. You need to call the function with the other variables as arguments, which will then return a theano variable. Try changing your line to
disasters=pymc3.Poisson('disasters', mu=rate1(switchpoint, mu1, mu2), observed = data)
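Put together, the model block from the question would then look like this (same priors and data, with rate1 being the as_op-decorated function):

with pymc3.Model() as dis:
    switchpoint = pymc3.DiscreteUniform('switchpoint', lower=0, upper=len(data)-1)
    mu1 = pymc3.Exponential('mu1', lam=1.)
    mu2 = pymc3.Exponential('mu2', lam=1.)
    # calling the decorated rate1 returns a theano variable, which Poisson accepts as mu
    disasters = pymc3.Poisson('disasters', mu=rate1(switchpoint, mu1, mu2), observed=data)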
I couldn't reproduce the error in your second part; the sampling worked just fine for me.