Writing to Parquet using pandas - Python

Trying to export and convert my data to a parquet file. The data is SBA data from Kaggle that we've transformed a bit. I'm trying to convert it to Parquet to load onto an HDFS server.
Data link
https://www.kaggle.com/mirbektoktogaraev/should-this-loan-be-approved-or-denied
Trying to use the code:
sba.to_parquet('sba.parquet.gzip', compression = 'gzip', partition_cols= 'State')
but get the error:
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
<ipython-input-39-377ee6551e44> in <module>
----> 1 sba.to_parquet('sba.parquet.gzip', compression = 'gzip', partition_cols= 'State')
/opt/conda/lib/python3.8/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
197 else:
198 kwargs[new_arg_name] = new_arg_value
--> 199 return func(*args, **kwargs)
200
201 return cast(F, wrapper)
/opt/conda/lib/python3.8/site-packages/pandas/core/frame.py in to_parquet(self, path, engine, compression, index, partition_cols, storage_options, **kwargs)
2453 from pandas.io.parquet import to_parquet
2454
-> 2455 return to_parquet(
2456 self,
2457 path,
/opt/conda/lib/python3.8/site-packages/pandas/io/parquet.py in to_parquet(df, path, engine, compression, index, storage_options, partition_cols, **kwargs)
388 path_or_buf: FilePathOrBuffer = io.BytesIO() if path is None else path
389
--> 390 impl.write(
391 df,
392 path_or_buf,
/opt/conda/lib/python3.8/site-packages/pandas/io/parquet.py in write(self, df, path, compression, index, storage_options, partition_cols, **kwargs)
150 from_pandas_kwargs["preserve_index"] = index
151
--> 152 table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
153
154 path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
/opt/conda/lib/python3.8/site-packages/pyarrow/table.pxi in pyarrow.lib.Table.from_pandas()
/opt/conda/lib/python3.8/site-packages/pyarrow/pandas_compat.py in dataframe_to_arrays(df, schema, preserve_index, nthreads, columns, safe)
600 for i, maybe_fut in enumerate(arrays):
601 if isinstance(maybe_fut, futures.Future):
--> 602 arrays[i] = maybe_fut.result()
603
604 types = [x.type for x in arrays]
/opt/conda/lib/python3.8/concurrent/futures/_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433
434 self._condition.wait(timeout)
/opt/conda/lib/python3.8/concurrent/futures/_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
/opt/conda/lib/python3.8/concurrent/futures/thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
/opt/conda/lib/python3.8/site-packages/pyarrow/pandas_compat.py in convert_column(col, field)
572 e.args += ("Conversion failed for column {!s} with type {!s}"
573 .format(col.name, col.dtype),)
--> 574 raise e
575 if not field_nullable and result.null_count > 0:
576 raise ValueError("Field {} was non-nullable but pandas column "
/opt/conda/lib/python3.8/site-packages/pyarrow/pandas_compat.py in convert_column(col, field)
566
567 try:
--> 568 result = pa.array(col, type=type_, from_pandas=True, safe=safe)
569 except (pa.ArrowInvalid,
570 pa.ArrowNotImplementedError,
/opt/conda/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.array()
/opt/conda/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib._ndarray_to_array()
/opt/conda/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
ArrowInvalid: ('Could not convert 2004 with type str: tried to convert to int', 'Conversion failed for column ApprovalFY with type object')
Any help would be amazing.

@Micah Kornfield is correct. Here is a more specific answer.
If you look at your data, more specifically between rows 688127 and 688128, you find the following:
df.loc[688127,'ApprovalFY']
2004
vs
df.loc[688128,'ApprovalFY']
'2004'
This kind of inconsistency in the data causes issues when writing a Parquet file. I am not an expert on the Parquet format, but as I understand it, Parquet files record the type of each column in order to store the data more efficiently. Therefore, if you have two different types of data in the same column, you will receive this error. A lot of people run into this kind of issue when they save their data to CSV and then read the CSV back and concatenate it with new data they get from an API, etc.
Every time you save your data in CSV format it is converted to text, and when you read it back a value can change from 2004 to '2004'.
Back to the original question: it is a good idea to perform some data type checking before saving your data as Parquet.
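For example, a minimal sketch (assuming the DataFrame is named sba and the mixed column is ApprovalFY, as in the question) that coerces that column to a single dtype and flags any other object columns that still mix Python types before writing:
import pandas as pd

# Force ApprovalFY to a single dtype so pyarrow sees a uniform column
# (use .astype(str) instead if you prefer to keep the years as text;
# values that are not numeric become <NA> with errors='coerce').
sba['ApprovalFY'] = pd.to_numeric(sba['ApprovalFY'], errors='coerce').astype('Int64')

# Flag any remaining object columns that still mix Python types
for col in sba.columns[sba.dtypes == object]:
    if sba[col].map(type).nunique() > 1:
        print(f"Column {col} mixes more than one Python type")

sba.to_parquet('sba.parquet.gzip', compression='gzip', partition_cols=['State'])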

Related

Trying to download a dataset; the code doesn't work in a Jupyter notebook but it does work in PyCharm

I'm trying to download the MNIST dataset from OpenML, using the openml library.
I tried using Jupyter notebooks because I don't want to download the same dataset every time.
The problem is, after running the following code, I get an error:
from openml.datasets import get_dataset
mnist = get_dataset(554)
x, y, p, q = mnist.get_data(
    dataset_format="dataframe", target=mnist.default_target_attribute
)
I'm pasting the whole error message I get; the problem occurs when I try assigning the result of .get_data() to x, y, p and q.
The environment I'm running this in is called Oceanic.
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
File ~\anaconda3\envs\Oceanic\lib\site-packages\openml\datasets\dataset.py:491, in OpenMLDataset._cache_compressed_file_from_file(self, data_file)
490 try:
--> 491 data = pd.read_parquet(data_file)
492 except Exception as e:
File ~\anaconda3\envs\Oceanic\lib\site-packages\pandas\io\parquet.py:493, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, **kwargs)
491 impl = get_engine(engine)
--> 493 return impl.read(
494 path,
495 columns=columns,
496 storage_options=storage_options,
497 use_nullable_dtypes=use_nullable_dtypes,
498 **kwargs,
499 )
File ~\anaconda3\envs\Oceanic\lib\site-packages\pandas\io\parquet.py:240, in PyArrowImpl.read(self, path, columns, use_nullable_dtypes, storage_options, **kwargs)
239 try:
--> 240 result = self.api.parquet.read_table(
241 path_or_handle, columns=columns, **kwargs
242 ).to_pandas(**to_pandas_kwargs)
243 if manager == "array":
File ~\anaconda3\envs\Oceanic\lib\site-packages\pyarrow\parquet.py:1731, in read_table(source, columns, use_threads, metadata, use_pandas_metadata, memory_map, read_dictionary, filesystem, filters, buffer_size, partitioning, use_legacy_dataset, ignore_prefixes)
1727 dataset = ParquetFile(
1728 source, metadata=metadata, read_dictionary=read_dictionary,
1729 memory_map=memory_map, buffer_size=buffer_size)
-> 1731 return dataset.read(columns=columns, use_threads=use_threads,
1732 use_pandas_metadata=use_pandas_metadata)
1734 if ignore_prefixes is not None:
File ~\anaconda3\envs\Oceanic\lib\site-packages\pyarrow\parquet.py:1608, in _ParquetDatasetV2.read(self, columns, use_threads, use_pandas_metadata)
1606 use_threads = False
-> 1608 table = self._dataset.to_table(
1609 columns=columns, filter=self._filter_expression,
1610 use_threads=use_threads
1611 )
1613 # if use_pandas_metadata, restore the pandas metadata (which gets
1614 # lost if doing a specific `columns` selection in to_table)
File ~\anaconda3\envs\Oceanic\lib\site-packages\pyarrow\_dataset.pyx:458, in pyarrow._dataset.Dataset.to_table()
File ~\anaconda3\envs\Oceanic\lib\site-packages\pyarrow\_dataset.pyx:2889, in pyarrow._dataset.Scanner.to_table()
File ~\anaconda3\envs\Oceanic\lib\site-packages\pyarrow\error.pxi:141, in pyarrow.lib.pyarrow_internal_check_status()
File ~\anaconda3\envs\Oceanic\lib\site-packages\pyarrow\error.pxi:112, in pyarrow.lib.check_status()
OSError: NotImplemented: Support for codec 'snappy' not built
The above exception was the direct cause of the following exception:
Exception Traceback (most recent call last)
Input In [10], in <cell line: 1>()
----> 1 x, y, p, q = mnist.get_data(
2 dataset_format="dataframe", target=mnist.default_target_attribute
3 )
File ~\anaconda3\envs\Oceanic\lib\site-packages\openml\datasets\dataset.py:698, in OpenMLDataset.get_data(self, target, include_row_id, include_ignore_attribute, dataset_format)
658 def get_data(
659 self,
660 target: Optional[Union[List[str], str]] = None,
(...)
668 List[str],
669 ]:
670 """ Returns dataset content as dataframes or sparse matrices.
671
672 Parameters
(...)
696 List of attribute names.
697 """
--> 698 data, categorical, attribute_names = self._load_data()
700 to_exclude = []
701 if not include_row_id and self.row_id_attribute is not None:
File ~\anaconda3\envs\Oceanic\lib\site-packages\openml\datasets\dataset.py:531, in OpenMLDataset._load_data(self)
528 self._download_data()
530 file_to_load = self.data_file if self.parquet_file is None else self.parquet_file
--> 531 return self._cache_compressed_file_from_file(file_to_load)
533 # helper variable to help identify where errors occur
534 fpath = self.data_feather_file if self.cache_format == "feather" else self.data_pickle_file
File ~\anaconda3\envs\Oceanic\lib\site-packages\openml\datasets\dataset.py:493, in OpenMLDataset._cache_compressed_file_from_file(self, data_file)
491 data = pd.read_parquet(data_file)
492 except Exception as e:
--> 493 raise Exception(f"File: {data_file}") from e
495 categorical = [data[c].dtype.name == "category" for c in data.columns]
496 attribute_names = list(data.columns)
Exception: File: C:\Users\Irving\.openml\org\openml\www\datasets\554\dataset.pq
Now, I've written the same code in PyCharm and it works just fine; I managed to correctly assign the dataframes and display them. I've got no idea why this isn't working, and I would like to know why because I would prefer to work with Jupyter notebooks.
Any help is appreciated, thanks in advance.
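In case it helps to narrow this down: the OSError comes from pyarrow reporting that its snappy codec was not compiled in, so one thing worth checking (a sketch, not a confirmed fix) is whether the pyarrow build inside the Oceanic environment actually ships snappy support, and reinstalling pyarrow there if it does not:
import pyarrow as pa

print(pa.__version__)
# Codec.is_available exists in recent pyarrow releases; False here would
# match the "Support for codec 'snappy' not built" OSError above.
print(pa.Codec.is_available('snappy'))

# If it prints False, reinstalling pyarrow into the same conda environment
# is a common remedy, e.g. from a terminal:
#   conda install -n Oceanic -c conda-forge pyarrow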

ValueError: Invalid file path or buffer object type

I've been using the mplsoccer and statsbombpy libraries for a while now with success.
Recently, I tried to use them again with this code (not fully reproducible, since the data is behind a paid API).
!pip install mplsoccer
import pandas as pd
import requests
from mplsoccer.statsbomb import read_event
username = creds['user']
password = creds['passwd']
auth = requests.auth.HTTPBasicAuth(username, password)
URL = 'https://data.statsbombservices.com/api/v5/events/18241'
response = requests.get(URL, auth = auth)
df_dict = read_event(response)
and I'm now getting the ValueError about an invalid file path or buffer type. I contacted the owner of mplsoccer and asked him about it, and he said it wasn't a reproducible error for him, but it looks like my pandas is having trouble reading the response.
response returns exactly what it should; it just fails with the error below:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-64-9be43ec4f519> in <module>
6 URL = 'https://data.statsbombservices.com/api/v5/events/7430'
7 response = requests.get(URL, auth=auth)
----> 8 df_dict = read_event(response)
E:\py\lib\site-packages\mplsoccer\statsbomb.py in read_event(path_or_buf, related_event_df, shot_freeze_frame_df, tactics_lineup_df, warn)
120 match_id = int(path_or_buf.url.split('/')[-1].split('.')[0])
121 else:
--> 122 df = pd.read_json(path_or_buf, encoding='utf-8')
123 match_id = int(os.path.basename(path_or_buf)[:-5])
124
E:\py\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
205 else:
206 kwargs[new_arg_name] = new_arg_value
--> 207 return func(*args, **kwargs)
208
209 return cast(F, wrapper)
E:\py\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
E:\py\lib\site-packages\pandas\io\json\_json.py in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, encoding_errors, lines, chunksize, compression, nrows, storage_options)
588 convert_axes = True
589
--> 590 json_reader = JsonReader(
591 path_or_buf,
592 orient=orient,
E:\py\lib\site-packages\pandas\io\json\_json.py in __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines, chunksize, compression, nrows, storage_options, encoding_errors)
673 raise ValueError("nrows can only be passed if lines=True")
674
--> 675 data = self._get_data_from_filepath(filepath_or_buffer)
676 self.data = self._preprocess_data(data)
677
E:\py\lib\site-packages\pandas\io\json\_json.py in _get_data_from_filepath(self, filepath_or_buffer)
710 or file_exists(filepath_or_buffer)
711 ):
--> 712 self.handles = get_handle(
713 filepath_or_buffer,
714 "r",
E:\py\lib\site-packages\pandas\io\common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
606
607 # open URLs
--> 608 ioargs = _get_filepath_or_buffer(
609 path_or_buf,
610 encoding=encoding,
E:\py\lib\site-packages\pandas\io\common.py in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
393 if not is_file_like(filepath_or_buffer):
394 msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
--> 395 raise ValueError(msg)
396
397 return IOArgs(
ValueError: Invalid file path or buffer object type: <class 'requests_cache.models.response.CachedResponse'>
Hoping someone can help me see exactly where pandas is struggling and what I can do to fix it. Thanks.
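One possible explanation (an assumption based on the last line of the traceback): requests_cache appears to be patching requests in this environment, so requests.get() returns a CachedResponse rather than a plain requests.Response, which mplsoccer's type check doesn't recognise and pandas can't treat as a file. A sketch of a workaround, assuming requests_cache was installed globally with install_cache() somewhere in the session, is to bypass the cache for this call:
import requests
import requests_cache
from mplsoccer.statsbomb import read_event

# Bypass the cache for this one request so requests.get() returns a plain
# requests.Response, which read_event knows how to handle.
# URL and auth are the variables defined in the snippet above.
with requests_cache.disabled():
    response = requests.get(URL, auth=auth)

df_dict = read_event(response)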

Error while saving a pandas DataFrame to a feather file using the to_feather() function

I am trying to save a pandas DataFrame to a feather file using the pandas .to_feather() function, as shown below:
df.to_feather("D:\{}.feather".format(parms['table_or_view_name']))
But I am getting an error pointing to the pyarrow library. Please note that I have already upgraded pyarrow to the most recent build (7.0.0), but I am still facing the same issue:
ArrowInvalid Traceback (most recent call last)
<ipython-input-26-5459a546a0cb> in <module>
----> 1 df.to_feather("D:\{}.feather".format(parms['table_or_view_name']))
~\AppData\Roaming\Python\Python38\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
205 else:
206 kwargs[new_arg_name] = new_arg_value
--> 207 return func(*args, **kwargs)
208
209 return cast(F, wrapper)
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\frame.py in to_feather(self, path, **kwargs)
2517 from pandas.io.feather_format import to_feather
2518
-> 2519 to_feather(self, path, **kwargs)
2520
2521 #doc(
~\AppData\Roaming\Python\Python38\site-packages\pandas\io\feather_format.py in to_feather(df, path, storage_options, **kwargs)
85 path, "wb", storage_options=storage_options, is_text=False
86 ) as handles:
---> 87 feather.write_feather(df, handles.handle, **kwargs)
88
89
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\feather.py in write_feather(df, dest, compression, compression_level, chunksize, version)
152
153 if _pandas_api.is_data_frame(df):
--> 154 table = Table.from_pandas(df, preserve_index=False)
155
156 if version == 1:
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\table.pxi in pyarrow.lib.Table.from_pandas()
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in dataframe_to_arrays(df, schema, preserve_index, nthreads, columns, safe)
592
593 if nthreads == 1:
--> 594 arrays = [convert_column(c, f)
595 for c, f in zip(columns_to_convert, convert_fields)]
596 else:
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in <listcomp>(.0)
592
593 if nthreads == 1:
--> 594 arrays = [convert_column(c, f)
595 for c, f in zip(columns_to_convert, convert_fields)]
596 else:
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in convert_column(col, field)
579 e.args += ("Conversion failed for column {!s} with type {!s}"
580 .format(col.name, col.dtype),)
--> 581 raise e
582 if not field_nullable and result.null_count > 0:
583 raise ValueError("Field {} was non-nullable but pandas column "
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in convert_column(col, field)
573
574 try:
--> 575 result = pa.array(col, type=type_, from_pandas=True, safe=safe)
576 except (pa.ArrowInvalid,
577 pa.ArrowNotImplementedError,
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\array.pxi in pyarrow.lib.array()
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\array.pxi in pyarrow.lib._ndarray_to_array()
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\error.pxi in pyarrow.lib.check_status()
ArrowInvalid: ('cannot mix list and non-list, non-null values', 'Conversion failed for column Resources with type object')
Note:
This error only occurs when I try to save the DataFrame with .to_feather(); when I saved it with .to_json() it worked.
Please suggest if there is any workaround for this error, thanks in advance.
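The ArrowInvalid message suggests the Resources column holds lists in some rows and scalar values in others, which Arrow cannot store under a single column type. A minimal sketch of one workaround (assuming the column's contents can reasonably be serialised to text; the column name Resources is taken from the error message and df/parms from the snippet above):
import json
import pandas as pd

def to_json_text(value):
    # Serialise lists/dicts to JSON strings and turn other values into text,
    # so every non-null cell in the column ends up with the same type.
    if isinstance(value, (list, dict)):
        return json.dumps(value)
    return value if pd.isna(value) else str(value)

df['Resources'] = df['Resources'].apply(to_json_text)
df.to_feather("D:\\{}.feather".format(parms['table_or_view_name']))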

(Memory) Problem when accessing Dask type arrays

I need to load some meteorological data to analyze several months, but the data is stored in files that each cover only one day, so I need to access many files at once.
I am following some pre-given instructions that told me to set up a Dask client with a memory limit on my computer.
from datetime import datetime, timedelta
import dask.array as da
from dask.distributed import Client, LocalCluster
import xarray
try:
    client
except NameError:
    client = Client(n_workers=1, threads_per_worker=4, memory_limit='2GB')
else:
    print("Client already exists")
After this, I create an array dates that goes from 1st June to 1st October, which is needed in files to build the links to the meteorological data.
dates=[datetime(2019,6,1) + timedelta(days=i) for i in range(3*30)]
files= [date.strftime('http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/%Y/%m/wrf_arw_det_history_d03_%Y%m%d_0000.nc4') for date in dates]
My issue starts when I try to open all that data with:
multi = xarray.open_mfdataset(files, preprocess= lambda a : a.isel(time=slice(0,24)))
It raises the error:
KeyError Traceback (most recent call last)
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in _acquire_with_cache_info(self, needs_lock)
197 try:
--> 198 file = self._cache[self._key]
199 except KeyError:
~\Nueva carpeta\lib\site-packages\xarray\backends\lru_cache.py in __getitem__(self, key)
52 with self._lock:
---> 53 value = self._cache[key]
54 self._cache.move_to_end(key)
KeyError: [<class 'netCDF4._netCDF4.Dataset'>, ('http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/2019/06/wrf_arw_det_history_d03_20190626_0000.nc4',), 'r', (('clobber', True), ('diskless', False), ('format', 'NETCDF4'), ('persist', False))]
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
<ipython-input-19-c3d0f4a8cc26> in <module>
----> 1 multi = xarray.open_mfdataset(files, preprocess= lambda a : a.isel(time=slice(0,24)))
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in open_mfdataset(paths, chunks, concat_dim, compat, preprocess, engine, lock, data_vars, coords, combine, autoclose, parallel, join, attrs_file, **kwargs)
916 getattr_ = getattr
917
--> 918 datasets = [open_(p, **open_kwargs) for p in paths]
919 file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
920 if preprocess is not None:
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in <listcomp>(.0)
916 getattr_ = getattr
917
--> 918 datasets = [open_(p, **open_kwargs) for p in paths]
919 file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
920 if preprocess is not None:
~\Nueva carpeta\lib\site-packages\xarray\backends\api.py in open_dataset(filename_or_obj, group, decode_cf, mask_and_scale, decode_times, autoclose, concat_characters, decode_coords, engine, chunks, lock, cache, drop_variables, backend_kwargs, use_cftime, decode_timedelta)
507 if engine == "netcdf4":
508 store = backends.NetCDF4DataStore.open(
--> 509 filename_or_obj, group=group, lock=lock, **backend_kwargs
510 )
511 elif engine == "scipy":
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in open(cls, filename, mode, format, group, clobber, diskless, persist, lock, lock_maker, autoclose)
356 netCDF4.Dataset, filename, mode=mode, kwargs=kwargs
357 )
--> 358 return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)
359
360 def _acquire(self, needs_lock=True):
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in __init__(self, manager, group, mode, lock, autoclose)
312 self._group = group
313 self._mode = mode
--> 314 self.format = self.ds.data_model
315 self._filename = self.ds.filepath()
316 self.is_remote = is_remote_uri(self._filename)
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in ds(self)
365 #property
366 def ds(self):
--> 367 return self._acquire()
368
369 def open_store_variable(self, name, var):
~\Nueva carpeta\lib\site-packages\xarray\backends\netCDF4_.py in _acquire(self, needs_lock)
359
360 def _acquire(self, needs_lock=True):
--> 361 with self._manager.acquire_context(needs_lock) as root:
362 ds = _nc4_require_group(root, self._group, self._mode)
363 return ds
~\Nueva carpeta\lib\contextlib.py in __enter__(self)
110 del self.args, self.kwds, self.func
111 try:
--> 112 return next(self.gen)
113 except StopIteration:
114 raise RuntimeError("generator didn't yield") from None
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in acquire_context(self, needs_lock)
184 def acquire_context(self, needs_lock=True):
185 """Context manager for acquiring a file."""
--> 186 file, cached = self._acquire_with_cache_info(needs_lock)
187 try:
188 yield file
~\Nueva carpeta\lib\site-packages\xarray\backends\file_manager.py in _acquire_with_cache_info(self, needs_lock)
202 kwargs = kwargs.copy()
203 kwargs["mode"] = self._mode
--> 204 file = self._opener(*self._args, **kwargs)
205 if self._mode == "w":
206 # ensure file doesn't get overriden when opened again
netCDF4\_netCDF4.pyx in netCDF4._netCDF4.Dataset.__init__()
netCDF4\_netCDF4.pyx in netCDF4._netCDF4._ensure_nc_success()
OSError: [Errno -37] NetCDF: Write to read only: b'http://mandeo.meteogalicia.es/thredds/dodsC/modelos/WRF_HIST/d03/2019/06/wrf_arw_det_history_d03_20190626_0000.nc4'
Does anyone know why this error occurs?
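Not a definitive answer, but the failing URL in the traceback points at one specific day (20190626), so a diagnostic sketch (reusing the files list from the question) is to open each remote dataset individually, collect the URLs the THREDDS server actually serves, and feed only those to open_mfdataset:
import xarray

good, bad = [], []
for url in files:
    try:
        # Open each remote dataset on its own just to check it is reachable
        with xarray.open_dataset(url):
            good.append(url)
    except OSError:
        bad.append(url)

print(f"{len(bad)} URLs could not be opened:", bad[:3])
multi = xarray.open_mfdataset(good, preprocess=lambda a: a.isel(time=slice(0, 24)))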

ValueError: Mixing dicts with non-Series may lead to ambiguous ordering

I want to read a JSON file into my Jupyter notebook as a pandas DataFrame.
macOS 10.12, Python 3.7, pandas 0.24.2
my dataset: https://open.fda.gov/apis/drug/label/download/
Similar question with the same error message (I have tried the solution from there, but it gives me the same error): Read JSON to pandas dataframe - ValueError: Mixing dicts with non-Series may lead to ambiguous ordering
import json
import pandas as pd
data = json.load(open('drug-label-0001-of-0008.json'))
df = pd.DataFrame(data)
As this answer says, I am not doing a double conversion: Pandas vs JSON library to read a JSON file in Python
His code just works; mine gets an error:
import pandas as pd
pd_example = pd.read_json('some_json_file.json')
My code is similar but I get the following error:
import pandas as pd
df = pd.read_json('drug-label-0008-of-0008.json')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-23-77b3c3e486fc> in <module>
----> 1 df = pd.read_json('drug-label-0008-of-0008.json')
~/anaconda3/lib/python3.7/site-packages/pandas/io/json/json.py in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines, chunksize, compression)
425 return json_reader
426
--> 427 result = json_reader.read()
428 if should_close:
429 try:
~/anaconda3/lib/python3.7/site-packages/pandas/io/json/json.py in read(self)
535 )
536 else:
--> 537 obj = self._get_object_parser(self.data)
538 self.close()
539 return obj
~/anaconda3/lib/python3.7/site-packages/pandas/io/json/json.py in _get_object_parser(self, json)
554 obj = None
555 if typ == 'frame':
--> 556 obj = FrameParser(json, **kwargs).parse()
557
558 if typ == 'series' or obj is None:
~/anaconda3/lib/python3.7/site-packages/pandas/io/json/json.py in parse(self)
650
651 else:
--> 652 self._parse_no_numpy()
653
654 if self.obj is None:
~/anaconda3/lib/python3.7/site-packages/pandas/io/json/json.py in _parse_no_numpy(self)
869 if orient == "columns":
870 self.obj = DataFrame(
--> 871 loads(json, precise_float=self.precise_float), dtype=None)
872 elif orient == "split":
873 decoded = {str(k): v for k, v in compat.iteritems(
~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
390 dtype=dtype, copy=copy)
391 elif isinstance(data, dict):
--> 392 mgr = init_dict(data, index, columns, dtype=dtype)
393 elif isinstance(data, ma.MaskedArray):
394 import numpy.ma.mrecords as mrecords
~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py in init_dict(data, index, columns, dtype)
210 arrays = [data[k] for k in keys]
211
--> 212 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
213
214
~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype)
49 # figure out the index, if necessary
50 if index is None:
---> 51 index = extract_index(arrays)
52 else:
53 index = ensure_index(index)
~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py in extract_index(data)
318
319 if have_dicts:
--> 320 raise ValueError('Mixing dicts with non-Series may lead to '
321 'ambiguous ordering.')
322
ValueError: Mixing dicts with non-Series may lead to ambiguous ordering.
You can just use Python's built-in JSON handling capabilities:
import json
with open("drug-label-0008-of-0008.json", "r") as read_file:
    data = json.load(read_file)
"When you have a single JSON structure inside a json file, use read_json because it loads the JSON directly into a DataFrame. With json.loads, you've to load it into a python dictionary/list, and then into a DataFrame - an unnecessary two step process. Pandas vs JSON library to read a JSON file in Python "
