I am trying to load a ".unf" file in a Jupyter environment using the HyperSpy library, but I get an error.
import hyperspy.api as hs
data = hs.load("/path/to/file/PRC.unf")
This is the error:
ValueError Traceback (most recent call last)
<ipython-input-7-b0117f505d01> in <module>
----> 1 data = hs.load("/home/vahid/PythonProjects/UNFfiles/PRC.unf")
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io.py in load(filenames, signal_type, stack, stack_axis, new_axis_name, lazy, convert_units, **kwds)
279 objects = [load_single_file(filename, lazy=lazy,
280 **kwds)
--> 281 for filename in filenames]
282
283 if len(objects) == 1:
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io.py in <listcomp>(.0)
279 objects = [load_single_file(filename, lazy=lazy,
280 **kwds)
--> 281 for filename in filenames]
282
283 if len(objects) == 1:
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io.py in load_single_file(filename, **kwds)
316 else:
317 reader = io_plugins[i]
--> 318 return load_with_reader(filename=filename, reader=reader, **kwds)
319
320
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io.py in load_with_reader(filename, reader, signal_type, convert_units, **kwds)
323 lazy = kwds.get('lazy', False)
324 file_data_list = reader.file_reader(filename,
--> 325 **kwds)
326 objects = []
327
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io_plugins/semper_unf.py in file_reader(filename, **kwds)
703 def file_reader(filename, **kwds):
704 lazy = kwds.get('lazy', False)
--> 705 semper = SemperFormat.load_from_unf(filename, lazy=lazy)
706 semper.log_info()
707 return [semper.to_signal(lazy=lazy)._to_dictionary()]
~/PythonProjects/UNFfiles/venv/lib/python3.7/site-packages/hyperspy/io_plugins/semper_unf.py in load_from_unf(cls, filename, lazy)
386 :rec_length //
387 2],
--> 388 count=1)
389 metadata.update(sarray2dict(header))
390 assert np.frombuffer(f.read(4), dtype=np.int32)[0] == rec_length, \
ValueError: field 'IFORM' occurs more than once
I am not sure what this error is about. Apparently, "IFORM" is some sort of dictionary key in this type of data structure. I would appreciate it if anyone could help me address this problem.
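In case it helps with the diagnosis, here is a minimal sketch I can run (the path is the same placeholder as above); it only dumps the first bytes of the file and the leading 4-byte record length that the SEMPER reader in the traceback checks:
import numpy as np
with open("/path/to/file/PRC.unf", "rb") as f:
    raw = f.read(64)
print(raw[:16].hex())  # raw start of the file
rec_length = np.frombuffer(raw[:4], dtype=np.int32)[0]  # same int32 record-length check as in the traceback
print("leading record length:", rec_length)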
Related
I am trying to save a pandas dataframe to a feather file using the pandas method .to_feather(), as shown below:
df.to_feather("D:\{}.feather".format(parms['table_or_view_name']))
But I am getting an error pointing to the pyarrow library. Please note that I have already upgraded pyarrow to the most recent build (7.0.0), yet I am still facing the same issue:
ArrowInvalid Traceback (most recent call last)
<ipython-input-26-5459a546a0cb> in <module>
----> 1 df.to_feather("D:\{}.feather".format(parms['table_or_view_name']))
~\AppData\Roaming\Python\Python38\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
205 else:
206 kwargs[new_arg_name] = new_arg_value
--> 207 return func(*args, **kwargs)
208
209 return cast(F, wrapper)
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\frame.py in to_feather(self, path, **kwargs)
2517 from pandas.io.feather_format import to_feather
2518
-> 2519 to_feather(self, path, **kwargs)
2520
2521 @doc(
~\AppData\Roaming\Python\Python38\site-packages\pandas\io\feather_format.py in to_feather(df, path, storage_options, **kwargs)
85 path, "wb", storage_options=storage_options, is_text=False
86 ) as handles:
---> 87 feather.write_feather(df, handles.handle, **kwargs)
88
89
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\feather.py in write_feather(df, dest, compression, compression_level, chunksize, version)
152
153 if _pandas_api.is_data_frame(df):
--> 154 table = Table.from_pandas(df, preserve_index=False)
155
156 if version == 1:
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\table.pxi in pyarrow.lib.Table.from_pandas()
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in dataframe_to_arrays(df, schema, preserve_index, nthreads, columns, safe)
592
593 if nthreads == 1:
--> 594 arrays = [convert_column(c, f)
595 for c, f in zip(columns_to_convert, convert_fields)]
596 else:
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in <listcomp>(.0)
592
593 if nthreads == 1:
--> 594 arrays = [convert_column(c, f)
595 for c, f in zip(columns_to_convert, convert_fields)]
596 else:
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in convert_column(col, field)
579 e.args += ("Conversion failed for column {!s} with type {!s}"
580 .format(col.name, col.dtype),)
--> 581 raise e
582 if not field_nullable and result.null_count > 0:
583 raise ValueError("Field {} was non-nullable but pandas column "
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in convert_column(col, field)
573
574 try:
--> 575 result = pa.array(col, type=type_, from_pandas=True, safe=safe)
576 except (pa.ArrowInvalid,
577 pa.ArrowNotImplementedError,
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\array.pxi in pyarrow.lib.array()
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\array.pxi in pyarrow.lib._ndarray_to_array()
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\error.pxi in pyarrow.lib.check_status()
ArrowInvalid: ('cannot mix list and non-list, non-null values', 'Conversion failed for column Resources with type object')
Note:
This error only occurs when I save the dataframe with .to_feather(); saving the same dataframe with .to_json() works fine.
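For what it is worth, here is a minimal sketch of what I believe is happening (the data below is made up): an object column that mixes lists with plain scalars serialises fine to JSON but cannot be converted to a single Arrow type, and serialising the mixed values first looks like a possible workaround:
import json
import pandas as pd
df_demo = pd.DataFrame({"Resources": [["cpu", "ram"], "disk", None]})
df_demo.to_json("demo.json")  # works: JSON happily stores mixed types
# df_demo.to_feather("demo.feather")  # would raise the same ArrowInvalid error
df_demo["Resources"] = df_demo["Resources"].apply(
    lambda v: json.dumps(v) if isinstance(v, (list, dict)) else v
)
df_demo.to_feather("demo.feather")  # succeeds once the column holds only strings/None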
Please suggest a workaround for this error. Thanks in advance.
I need to load some meteorological data to analyze several months, but the data is stored in files that each cover only one day, so I need to access many files at once.
I am following some pre-given instructions that told me to set up a local Dask client with a memory limit on my computer.
from datetime import datetime, timedelta
import dask.array as da
from dask.distributed import Client, LocalCluster
import xarray
try:
    client
except NameError:
    client = Client(n_workers=1, threads_per_worker=4, memory_limit='2GB')
else:
    print("Client already exists")
After this, I create a list dates covering roughly three months starting on 1 June 2019; it is needed in files to build the links to the meteorological data.
dates=[datetime(2019,6,1) + timedelta(days=i) for i in range(3*30)]
files= [date.strftime('http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/%Y/%m/wrf_arw_det_history_d03_%Y%m%d_0000.nc4') for date in dates]
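For reference, the first entry of files resolves to the link for 1 June 2019:
print(files[0])
# http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/2019/06/wrf_arw_det_history_d03_20190601_0000.nc4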
My issue starts when I try to open all that data at once:
multi = xarray.open_mfdataset(files, preprocess= lambda a : a.isel(time=slice(0,24)))
It raises the error:
KeyError Traceback (most recent call last)
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in _acquire_with_cache_info(self, needs_lock)
197 try:
--> 198 file = self._cache[self._key]
199 except KeyError:
~\Nueva carpeta\lib\site-packages\xarray\backends\lru_cache.py in __getitem__(self, key)
52 with self._lock:
---> 53 value = self._cache[key]
54 self._cache.move_to_end(key)
KeyError: [<class 'netCDF4._netCDF4.Dataset'>, ('http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/2019/06/wrf_arw_det_history_d03_20190626_0000.nc4',), 'r', (('clobber', True), ('diskless', False), ('format', 'NETCDF4'), ('persist', False))]
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
<ipython-input-19-c3d0f4a8cc26> in <module>
----> 1 multi = xarray.open_mfdataset(files, preprocess= lambda a : a.isel(time=slice(0,24)))
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in open_mfdataset(paths, chunks, concat_dim, compat, preprocess, engine, lock, data_vars, coords, combine, autoclose, parallel, join, attrs_file, **kwargs)
916 getattr_ = getattr
917
--> 918 datasets = [open_(p, **open_kwargs) for p in paths]
919 file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
920 if preprocess is not None:
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in <listcomp>(.0)
916 getattr_ = getattr
917
--> 918 datasets = [open_(p, **open_kwargs) for p in paths]
919 file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
920 if preprocess is not None:
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in open_dataset(filename_or_obj, group, decode_cf, mask_and_scale, decode_times, autoclose, concat_characters, decode_coords, engine, chunks, lock, cache, drop_variables, backend_kwargs, use_cftime, decode_timedelta)
507 if engine == "netcdf4":
508 store = backends.NetCDF4DataStore.open(
--> 509 filename_or_obj, group=group, lock=lock, **backend_kwargs
510 )
511 elif engine == "scipy":
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in open(cls, filename, mode, format, group, clobber, diskless, persist, lock, lock_maker, autoclose)
356 netCDF4.Dataset, filename, mode=mode, kwargs=kwargs
357 )
--> 358 return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)
359
360 def _acquire(self, needs_lock=True):
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in __init__(self, manager, group, mode, lock, autoclose)
312 self._group = group
313 self._mode = mode
--> 314 self.format = self.ds.data_model
315 self._filename = self.ds.filepath()
316 self.is_remote = is_remote_uri(self._filename)
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in ds(self)
365 @property
366 def ds(self):
--> 367 return self._acquire()
368
369 def open_store_variable(self, name, var):
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in _acquire(self, needs_lock)
359
360 def _acquire(self, needs_lock=True):
--> 361 with self._manager.acquire_context(needs_lock) as root:
362 ds = _nc4_require_group(root, self._group, self._mode)
363 return ds
~\Nueva carpeta\lib\contextlib.py in __enter__(self)
110 del self.args, self.kwds, self.func
111 try:
--> 112 return next(self.gen)
113 except StopIteration:
114 raise RuntimeError("generator didn't yield") from None
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in acquire_context(self, needs_lock)
184 def acquire_context(self, needs_lock=True):
185 """Context manager for acquiring a file."""
--> 186 file, cached = self._acquire_with_cache_info(needs_lock)
187 try:
188 yield file
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in _acquire_with_cache_info(self, needs_lock)
202 kwargs = kwargs.copy()
203 kwargs["mode"] = self._mode
--> 204 file = self._opener(*self._args, **kwargs)
205 if self._mode == "w":
206 # ensure file doesn't get overriden when opened again
netCDF4\_netCDF4.pyx in netCDF4._netCDF4.Dataset.__init__()
netCDF4\_netCDF4.pyx in netCDF4._netCDF4._ensure_nc_success()
OSError: [Errno -37] NetCDF: Write to read only: b'http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/2019/06/wrf_arw_det_history_d03_20190626_0000.nc4'
Does anyone know why this error occurs?
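To help narrow this down, here is a minimal check I intend to run; it reuses the files list from above and the specific day that appears in the error message, to see whether a single URL opens on its own outside open_mfdataset:
import xarray
url = files[25]  # 2019-06-26, the file named in the OSError
single = xarray.open_dataset(url)  # goes through the same netCDF4 backend
print(single)
single.close()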
Hey, I have a very short question. I need to load data for my machine learning course, but it does not work for me and I have no idea why. I'm using Jupyter with Python 3.
My Code:
from sklearn.datasets import fetch_covtype
forest = fetch_covtype()
For my friend it works fine under the same conditions. I have already tried updating scikit-learn with pip install -U scikit-learn, but that did not solve the problem. I hope somebody can help me.
It creates the following error:
UnboundLocalError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/covtype.py in fetch_covtype(data_home, download_if_missing, random_state, shuffle, return_X_y)
126 try:
--> 127 X, y
128 except NameError:
UnboundLocalError: local variable 'X' referenced before assignment
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-9-fb303a92b6ca> in <module>
----> 1 forest =fetch_covtype()
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/covtype.py in fetch_covtype(data_home, download_if_missing, random_state, shuffle, return_X_y)
127 X, y
128 except NameError:
--> 129 X, y = _refresh_cache([samples_path, targets_path], 9)
130 # TODO: Revert to the following two lines in v0.23
131 # X = joblib.load(samples_path)
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/base.py in _refresh_cache(files, compress)
928 msg = "sklearn.externals.joblib is deprecated in 0.21"
929 with warnings.catch_warnings(record=True) as warns:
--> 930 data = tuple([joblib.load(f) for f in files])
931
932 refresh_needed = any([str(x.message).startswith(msg) for x in warns])
/opt/conda/lib/python3.7/site-packages/sklearn/datasets/base.py in <listcomp>(.0)
928 msg = "sklearn.externals.joblib is deprecated in 0.21"
929 with warnings.catch_warnings(record=True) as warns:
--> 930 data = tuple([joblib.load(f) for f in files])
931
932 refresh_needed = any([str(x.message).startswith(msg) for x in warns])
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in load(filename, mmap_mode)
603 return load_compatibility(fobj)
604
--> 605 obj = _unpickle(fobj, filename, mmap_mode)
606
607 return obj
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in _unpickle(fobj, filename, mmap_mode)
527 obj = None
528 try:
--> 529 obj = unpickler.load()
530 if unpickler.compat_mode:
531 warnings.warn("The file '%s' has been generated with a "
/opt/conda/lib/python3.7/pickle.py in load(self)
1083 raise EOFError
1084 assert isinstance(key, bytes_types)
-> 1085 dispatch[key[0]](self)
1086 except _Stop as stopinst:
1087 return stopinst.value
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in load_build(self)
353 if isinstance(array_wrapper, NDArrayWrapper):
354 self.compat_mode = True
--> 355 self.stack.append(array_wrapper.read(self))
356
357 # Be careful to register our new method.
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in read(self, unpickler)
196 array = self.read_mmap(unpickler)
197 else:
--> 198 array = self.read_array(unpickler)
199
200 # Manage array subclass case
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle.py in read_array(self, unpickler)
147 read_size = int(read_count * self.dtype.itemsize)
148 data = _read_bytes(unpickler.file_handle,
--> 149 read_size, "array data")
150 array[i:i + read_count] = \
151 unpickler.np.frombuffer(data, dtype=self.dtype,
/opt/conda/lib/python3.7/site-packages/joblib/numpy_pickle_utils.py in _read_bytes(fp, size, error_template)
241 if len(data) != size:
242 msg = "EOF: reading %s, expected %d bytes got %d"
--> 243 raise ValueError(msg % (error_template, size, len(data)))
244 else:
245 return data
ValueError: EOF: reading array data, expected 262144 bytes got 209661
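The last line suggests the cached download is truncated, so here is a sketch of what I plan to try: delete the cache and download the dataset again. (The "covertype" subfolder name is my assumption about where scikit-learn keeps it; get_data_home() at least reports the parent directory.)
import os
import shutil
from sklearn.datasets import get_data_home, fetch_covtype
cache_dir = os.path.join(get_data_home(), "covertype")  # assumed cache location
if os.path.isdir(cache_dir):
    shutil.rmtree(cache_dir)  # drop the corrupted/partial cache
forest = fetch_covtype(download_if_missing=True)  # fetch a fresh copy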
I am trying to read a CSV file using PySpark with this code:
tr_df = spark.read.csv("/data/file.csv",
                       header=True, inferSchema=True)
tr_df.head(5)
But I get this error:
ValueError Traceback (most recent call last)
<ipython-input-53-03432bbf269d> in <module>
----> 1 tr_df.head(5)
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/dataframe.py
in head(self, n)
1250 rs = self.head(1)
1251 return rs[0] if rs else None
-> 1252 return self.take(n)
1253
1254 @ignore_unicode_prefix
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/dataframe.py
in take(self, num)
569 [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
570 """
--> 571 return self.limit(num).collect()
572
573 @since(1.3)
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/dataframe.py
in collect(self)
532 with SCCallSiteSync(self._sc) as css:
533 sock_info = self._jdf.collectToPython()
--> 534 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
535
536 @ignore_unicode_prefix
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/serializers.py
in load_stream(self, stream)
145 while True:
146 try:
--> 147 yield self._read_with_length(stream)
148 except EOFError:
149 return
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/serializers.py
in _read_with_length(self, stream)
170 if len(obj) < length:
171 raise EOFError
--> 172 return self.loads(obj)
173
174 def dumps(self, obj):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/serializers.py
in loads(self, obj, encoding)
578 if sys.version >= '3':
579 def loads(self, obj, encoding="bytes"):
--> 580 return pickle.loads(obj, encoding=encoding)
581 else:
582 def loads(self, obj, encoding=None):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in _parse_datatype_json_string(json_string)
867 >>> check_datatype(complex_maptype)
868 """
--> 869 return _parse_datatype_json_value(json.loads(json_string))
870
871
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in _parse_datatype_json_value(json_value)
884 tpe = json_value["type"]
885 if tpe in _all_complex_types:
--> 886 return _all_complex_types[tpe].fromJson(json_value)
887 elif tpe == 'udt':
888 return UserDefinedType.fromJson(json_value)
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in fromJson(cls, json)
575 @classmethod
576 def fromJson(cls, json):
--> 577 return StructType([StructField.fromJson(f) for f in json["fields"]])
578
579 def fieldNames(self):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in <listcomp>(.0)
575 @classmethod
576 def fromJson(cls, json):
--> 577 return StructType([StructField.fromJson(f) for f in json["fields"]])
578
579 def fieldNames(self):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in fromJson(cls, json)
432 def fromJson(cls, json):
433 return StructField(json["name"],
--> 434 _parse_datatype_json_value(json["type"]),
435 json["nullable"],
436 json["metadata"])
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in _parse_datatype_json_value(json_value)
880 return DecimalType(int(m.group(1)), int(m.group(2)))
881 else:
--> 882 raise ValueError("Could not parse datatype: %s" % json_value)
883 else:
884 tpe = json_value["type"]
ValueError: Could not parse datatype: decimal(17,-24)
Can anyone help me resolve this problem, please? Thanks.
It seems there is a problem with the datatype of one of your columns, hence the error. Remove the inferSchema=True option while reading. After reading the data, analyze the datatypes, make any corrections if needed, and then apply your own schema.
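For example, a rough sketch of applying an explicit schema (the column names and types below are placeholders; replace them with the real layout of /data/file.csv):
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
schema = StructType([
    StructField("id", StringType(), True),
    StructField("amount", DoubleType(), True),
    # ... one StructField per column in the file
])
tr_df = spark.read.csv("/data/file.csv", header=True, schema=schema)
tr_df.head(5)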
I loaded a CSV file using the following code:
from pyspark import SparkContext
from pyspark.sql import *
sc = SparkContext(master='local[1]')
sq = SQLContext(sc)  # sq needs a definition; SQLContext is provided by pyspark.sql
df = sq.read.csv(file_path, header='true', inferSchema='true')
But when I tried to convert this Spark dataframe to a pandas dataframe using the following code
pdf = df.toPandas()
I got the following error:
OSError Traceback (most recent call last)
<ipython-input-27-cf3578af3a8d> in <module>()
----> 1 a = df.toPandas()
D:\softwares\anaconda\lib\site-packages\pyspark\sql\dataframe.py in toPandas(self)
1964 raise RuntimeError("%s\n%s" % (_exception_message(e), msg))
1965 else:
-> 1966 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
1967
1968 dtype = {}
D:\softwares\anaconda\lib\site-packages\pyspark\sql\dataframe.py in collect(self)
465 with SCCallSiteSync(self._sc) as css:
466 port = self._jdf.collectToPython()
--> 467 return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
468
469 @ignore_unicode_prefix
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in load_stream(self, stream)
143 while True:
144 try:
--> 145 yield self._read_with_length(stream)
146 except EOFError:
147 return
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in _read_with_length(self, stream)
168 if len(obj) < length:
169 raise EOFError
--> 170 return self.loads(obj)
171
172 def dumps(self, obj):
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in loads(self, obj, encoding)
557 if sys.version >= '3':
558 def loads(self, obj, encoding="bytes"):
--> 559 return pickle.loads(obj, encoding=encoding)
560 else:
561 def loads(self, obj, encoding=None):
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in <lambda>(*a)
1426 # This is used to unpickle a Row from JVM
1427 def _create_row_inbound_converter(dataType):
-> 1428 return lambda *a: dataType.fromInternal(a)
1429
1430
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, obj)
628 # Only calling fromInternal function for fields that need conversion
629 values = [f.fromInternal(v) if c else v
--> 630 for f, v, c in zip(self.fields, obj, self._needConversion)]
631 else:
632 values = obj
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in <listcomp>(.0)
628 # Only calling fromInternal function for fields that need conversion
629 values = [f.fromInternal(v) if c else v
--> 630 for f, v, c in zip(self.fields, obj, self._needConversion)]
631 else:
632 values = obj
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, obj)
440
441 def fromInternal(self, obj):
--> 442 return self.dataType.fromInternal(obj)
443
444 def typeName(self):
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, ts)
198 if ts is not None:
199 # using int to avoid precision loss in float
--> 200 return datetime.datetime.fromtimestamp(ts // 1000000).replace(microsecond=ts % 1000000)
201
202
OSError: [Errno 22] Invalid argument
Can anyone help me solve this error?
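One workaround sketch I am considering, assuming the failure comes from a timestamp column (the traceback ends inside datetime.fromtimestamp; the column name my_ts_col below is a placeholder): format the timestamps as strings on the Spark side so toPandas() no longer has to convert them:
from pyspark.sql.functions import col, date_format
pdf = (df
       .withColumn("my_ts_col", date_format(col("my_ts_col"), "yyyy-MM-dd HH:mm:ss"))
       .toPandas())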