OSError: invalid argument while converting a Spark SQL dataframe to a pandas dataframe in PySpark - Python

I loaded a CSV file using the following code:
from pyspark import SparkContext
from pyspark.sql import *
sc = SparkContext(master='local[1]')
sq = SQLContext(sc)  # assumed; the original snippet uses sq without defining it
df = sq.read.csv(file_path, header='true', inferSchema='true')
But when I tried to convert this Spark dataframe I have to a pandas dataframe using the following code
pdf = df.toPandas()
I got the following error:
OSError Traceback (most recent call last)
<ipython-input-27-cf3578af3a8d> in <module>()
----> 1 a = df.toPandas()
D:\softwares\anaconda\lib\site-packages\pyspark\sql\dataframe.py in toPandas(self)
1964 raise RuntimeError("%s\n%s" % (_exception_message(e), msg))
1965 else:
-> 1966 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
1967
1968 dtype = {}
D:\softwares\anaconda\lib\site-packages\pyspark\sql\dataframe.py in collect(self)
465 with SCCallSiteSync(self._sc) as css:
466 port = self._jdf.collectToPython()
--> 467 return list(_load_from_socket(port, BatchedSerializer(PickleSerializer())))
468
469 #ignore_unicode_prefix
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in load_stream(self, stream)
143 while True:
144 try:
--> 145 yield self._read_with_length(stream)
146 except EOFError:
147 return
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in _read_with_length(self, stream)
168 if len(obj) < length:
169 raise EOFError
--> 170 return self.loads(obj)
171
172 def dumps(self, obj):
D:\softwares\anaconda\lib\site-packages\pyspark\serializers.py in loads(self, obj, encoding)
557 if sys.version >= '3':
558 def loads(self, obj, encoding="bytes"):
--> 559 return pickle.loads(obj, encoding=encoding)
560 else:
561 def loads(self, obj, encoding=None):
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in <lambda>(*a)
1426 # This is used to unpickle a Row from JVM
1427 def _create_row_inbound_converter(dataType):
-> 1428 return lambda *a: dataType.fromInternal(a)
1429
1430
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, obj)
628 # Only calling fromInternal function for fields that need conversion
629 values = [f.fromInternal(v) if c else v
--> 630 for f, v, c in zip(self.fields, obj, self._needConversion)]
631 else:
632 values = obj
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in <listcomp>(.0)
628 # Only calling fromInternal function for fields that need conversion
629 values = [f.fromInternal(v) if c else v
--> 630 for f, v, c in zip(self.fields, obj, self._needConversion)]
631 else:
632 values = obj
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, obj)
440
441 def fromInternal(self, obj):
--> 442 return self.dataType.fromInternal(obj)
443
444 def typeName(self):
D:\softwares\anaconda\lib\site-packages\pyspark\sql\types.py in fromInternal(self, ts)
198 if ts is not None:
199 # using int to avoid precision loss in float
--> 200 return datetime.datetime.fromtimestamp(ts // 1000000).replace(microsecond=ts % 1000000)
201
202
OSError: [Errno 22] Invalid argument
Can anyone help me solve this error?
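Judging from the last frame of the traceback, datetime.datetime.fromtimestamp is failing while converting a timestamp column; on Windows this typically raises OSError: [Errno 22] for values it cannot represent (for example pre-1970 dates). A minimal diagnostic sketch, assuming df is the dataframe from above (the column names depend on your CSV):
from pyspark.sql import functions as F

# Find the timestamp columns that inferSchema produced and inspect their ranges;
# extreme or pre-1970 values are a common trigger for fromtimestamp() failing on Windows.
ts_cols = [f.name for f in df.schema.fields if f.dataType.typeName() == 'timestamp']
df.select([F.min(c).alias(c + '_min') for c in ts_cols] +
          [F.max(c).alias(c + '_max') for c in ts_cols]).show()

# Possible workaround: cast timestamps to strings before collecting to pandas.
pdf = df.select([F.col(c).cast('string') if c in ts_cols else F.col(c)
                 for c in df.columns]).toPandas()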

Related

FSSpec Error Handling in Python - Timeout Error

I am trying to get Terraclimate data from Microsoft Planetary and am facing a timeout error. Is there a possibility of increasing the timeout? Please find below the code and the error I am facing. I am using fsspec and xarray for downloading spatial data from the MS Planetary portal.
import fsspec
import xarray as xr
store = fsspec.get_mapper(asset.href)
data = xr.open_zarr(store, **asset.extra_fields["xarray:open_kwargs"])
clipped_data = data.sel(time=slice('2015-01-01','2019-12-31'),lon=slice(min_lon,max_lon),lat=slice(max_lat,min_lat))
parsed_data = clipped_data[['tmax', 'tmin', 'ppt', 'soil']]
lat_list = parsed_data['lat'].values.tolist()
lon_list = parsed_data['lon'].values.tolist()
filename = "Soil_Moisture_sample.csv"
for (i, j) in zip(lat_list, lon_list):
    parsed_data[["soil","tmax","tmin","ppt"]].sel(lon=i, lat=j, method="nearest").to_dataframe().to_csv(filename, mode='a', index=False, header=False)
I am getting the following error
TimeoutError Traceback (most recent call last)
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:53, in _runner(event, coro, result, timeout)
52 try:
---> 53 result[0] = await coro
54 except Exception as ex:
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:423, in AsyncFileSystem._cat(self, path, recursive, on_error, batch_size, **kwargs)
422 if ex:
--> 423 raise ex
424 if (
425 len(paths) > 1
426 or isinstance(path, list)
427 or paths[0] != self._strip_protocol(path)
428 ):
File ~\Anaconda3\envs\satellite\lib\asyncio\tasks.py:455, in wait_for(fut, timeout, loop)
454 if timeout is None:
--> 455 return await fut
457 if timeout <= 0:
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\implementations\http.py:221, in HTTPFileSystem._cat_file(self, url, start, end, **kwargs)
220 async with session.get(url, **kw) as r:
--> 221 out = await r.read()
222 self._raise_not_found_for_status(r, url)
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\client_reqrep.py:1036, in ClientResponse.read(self)
1035 try:
-> 1036 self._body = await self.content.read()
1037 for trace in self._traces:
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\streams.py:375, in StreamReader.read(self, n)
374 while True:
--> 375 block = await self.readany()
376 if not block:
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\streams.py:397, in StreamReader.readany(self)
396 while not self._buffer and not self._eof:
--> 397 await self._wait("readany")
399 return self._read_nowait(-1)
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\streams.py:304, in StreamReader._wait(self, func_name)
303 with self._timer:
--> 304 await waiter
305 else:
File ~\Anaconda3\envs\satellite\lib\site-packages\aiohttp\helpers.py:721, in TimerContext.__exit__(self, exc_type, exc_val, exc_tb)
720 if exc_type is asyncio.CancelledError and self._cancelled:
--> 721 raise asyncio.TimeoutError from None
722 return None
TimeoutError:
The above exception was the direct cause of the following exception:
FSTimeoutError Traceback (most recent call last)
Input In [62], in <cell line: 3>()
1 # Flood Region Point - Thiruvanthpuram
2 filename = "Soil_Moisture_sample.csv"
----> 3 parsed_data[["soil","tmax","tmin","ppt"]].sel(lon=8.520833, lat=76.4375, method="nearest").to_dataframe().to_csv(filename,mode='a',index=False, header=False)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\dataset.py:5898, in Dataset.to_dataframe(self, dim_order)
5870 """Convert this dataset into a pandas.DataFrame.
5871
5872 Non-index variables in this dataset form the columns of the
(...)
5893
5894 """
5896 ordered_dims = self._normalize_dim_order(dim_order=dim_order)
-> 5898 return self._to_dataframe(ordered_dims=ordered_dims)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\dataset.py:5862, in Dataset._to_dataframe(self, ordered_dims)
5860 def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
5861 columns = [k for k in self.variables if k not in self.dims]
-> 5862 data = [
5863 self._variables[k].set_dims(ordered_dims).values.reshape(-1)
5864 for k in columns
5865 ]
5866 index = self.coords.to_index([*ordered_dims])
5867 return pd.DataFrame(dict(zip(columns, data)), index=index)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\dataset.py:5863, in <listcomp>(.0)
5860 def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
5861 columns = [k for k in self.variables if k not in self.dims]
5862 data = [
-> 5863 self._variables[k].set_dims(ordered_dims).values.reshape(-1)
5864 for k in columns
5865 ]
5866 index = self.coords.to_index([*ordered_dims])
5867 return pd.DataFrame(dict(zip(columns, data)), index=index)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\variable.py:527, in Variable.values(self)
524 #property
525 def values(self):
526 """The variable's data as a numpy.ndarray"""
--> 527 return _as_array_or_item(self._data)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\variable.py:267, in _as_array_or_item(data)
253 def _as_array_or_item(data):
254 """Return the given values as a numpy array, or as an individual item if
255 it's a 0d datetime64 or timedelta64 array.
256
(...)
265 TODO: remove this (replace with np.asarray) once these issues are fixed
266 """
--> 267 data = np.asarray(data)
268 if data.ndim == 0:
269 if data.dtype.kind == "M":
File ~\AppData\Roaming\Python\Python38\site-packages\dask\array\core.py:1696, in Array.__array__(self, dtype, **kwargs)
1695 def __array__(self, dtype=None, **kwargs):
-> 1696 x = self.compute()
1697 if dtype and x.dtype != dtype:
1698 x = x.astype(dtype)
File ~\AppData\Roaming\Python\Python38\site-packages\dask\base.py:315, in DaskMethodsMixin.compute(self, **kwargs)
291 def compute(self, **kwargs):
292 """Compute this dask collection
293
294 This turns a lazy Dask collection into its in-memory equivalent.
(...)
313 dask.base.compute
314 """
--> 315 (result,) = compute(self, traverse=False, **kwargs)
316 return result
File ~\AppData\Roaming\Python\Python38\site-packages\dask\base.py:600, in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
597 keys.append(x.__dask_keys__())
598 postcomputes.append(x.__dask_postcompute__())
--> 600 results = schedule(dsk, keys, **kwargs)
601 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
File ~\AppData\Roaming\Python\Python38\site-packages\dask\threaded.py:89, in get(dsk, keys, cache, num_workers, pool, **kwargs)
86 elif isinstance(pool, multiprocessing.pool.Pool):
87 pool = MultiprocessingPoolExecutor(pool)
---> 89 results = get_async(
90 pool.submit,
91 pool._max_workers,
92 dsk,
93 keys,
94 cache=cache,
95 get_id=_thread_get_id,
96 pack_exception=pack_exception,
97 **kwargs,
98 )
100 # Cleanup pools associated to dead threads
101 with pools_lock:
File ~\AppData\Roaming\Python\Python38\site-packages\dask\local.py:511, in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
509 _execute_task(task, data) # Re-execute locally
510 else:
--> 511 raise_exception(exc, tb)
512 res, worker_id = loads(res_info)
513 state["cache"][key] = res
File ~\AppData\Roaming\Python\Python38\site-packages\dask\local.py:319, in reraise(exc, tb)
317 if exc.__traceback__ is not tb:
318 raise exc.with_traceback(tb)
--> 319 raise exc
File ~\AppData\Roaming\Python\Python38\site-packages\dask\local.py:224, in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
222 try:
223 task, data = loads(task_info)
--> 224 result = _execute_task(task, data)
225 id = get_id()
226 result = dumps((result, id))
File ~\AppData\Roaming\Python\Python38\site-packages\dask\core.py:119, in _execute_task(arg, cache, dsk)
115 func, args = arg[0], arg[1:]
116 # Note: Don't assign the subtask results to a variable. numpy detects
117 # temporaries by their reference count and can execute certain
118 # operations in-place.
--> 119 return func(*(_execute_task(a, cache) for a in args))
120 elif not ishashable(arg):
121 return arg
File ~\AppData\Roaming\Python\Python38\site-packages\dask\array\core.py:128, in getter(a, b, asarray, lock)
123 # Below we special-case `np.matrix` to force a conversion to
124 # `np.ndarray` and preserve original Dask behavior for `getter`,
125 # as for all purposes `np.matrix` is array-like and thus
126 # `is_arraylike` evaluates to `True` in that case.
127 if asarray and (not is_arraylike(c) or isinstance(c, np.matrix)):
--> 128 c = np.asarray(c)
129 finally:
130 if lock:
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\indexing.py:459, in ImplicitToExplicitIndexingAdapter.__array__(self, dtype)
458 def __array__(self, dtype=None):
--> 459 return np.asarray(self.array, dtype=dtype)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\indexing.py:623, in CopyOnWriteArray.__array__(self, dtype)
622 def __array__(self, dtype=None):
--> 623 return np.asarray(self.array, dtype=dtype)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\core\indexing.py:524, in LazilyIndexedArray.__array__(self, dtype)
522 def __array__(self, dtype=None):
523 array = as_indexable(self.array)
--> 524 return np.asarray(array[self.key], dtype=None)
File ~\Anaconda3\envs\satellite\lib\site-packages\xarray\backends\zarr.py:76, in ZarrArrayWrapper.__getitem__(self, key)
74 array = self.get_array()
75 if isinstance(key, indexing.BasicIndexer):
---> 76 return array[key.tuple]
77 elif isinstance(key, indexing.VectorizedIndexer):
78 return array.vindex[
79 indexing._arrayize_vectorized_indexer(key, self.shape).tuple
80 ]
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:788, in Array.__getitem__(self, selection)
786 result = self.vindex[selection]
787 else:
--> 788 result = self.get_basic_selection(pure_selection, fields=fields)
789 return result
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:914, in Array.get_basic_selection(self, selection, out, fields)
911 return self._get_basic_selection_zd(selection=selection, out=out,
912 fields=fields)
913 else:
--> 914 return self._get_basic_selection_nd(selection=selection, out=out,
915 fields=fields)
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:957, in Array._get_basic_selection_nd(self, selection, out, fields)
951 def _get_basic_selection_nd(self, selection, out=None, fields=None):
952 # implementation of basic selection for array with at least one dimension
953
954 # setup indexer
955 indexer = BasicIndexer(selection, self)
--> 957 return self._get_selection(indexer=indexer, out=out, fields=fields)
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:1247, in Array._get_selection(self, indexer, out, fields)
1241 if not hasattr(self.chunk_store, "getitems") or \
1242 any(map(lambda x: x == 0, self.shape)):
1243 # sequentially get one key at a time from storage
1244 for chunk_coords, chunk_selection, out_selection in indexer:
1245
1246 # load chunk selection into output array
-> 1247 self._chunk_getitem(chunk_coords, chunk_selection, out, out_selection,
1248 drop_axes=indexer.drop_axes, fields=fields)
1249 else:
1250 # allow storage to get multiple items at once
1251 lchunk_coords, lchunk_selection, lout_selection = zip(*indexer)
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\core.py:1939, in Array._chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop_axes, fields)
1935 ckey = self._chunk_key(chunk_coords)
1937 try:
1938 # obtain compressed data for chunk
-> 1939 cdata = self.chunk_store[ckey]
1941 except KeyError:
1942 # chunk not initialized
1943 if self._fill_value is not None:
File ~\Anaconda3\envs\satellite\lib\site-packages\zarr\storage.py:717, in KVStore.__getitem__(self, key)
716 def __getitem__(self, key):
--> 717 return self._mutable_mapping[key]
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\mapping.py:137, in FSMap.__getitem__(self, key, default)
135 k = self._key_to_str(key)
136 try:
--> 137 result = self.fs.cat(k)
138 except self.missing_exceptions:
139 if default is not None:
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:111, in sync_wrapper.<locals>.wrapper(*args, **kwargs)
108 #functools.wraps(func)
109 def wrapper(*args, **kwargs):
110 self = obj or args[0]
--> 111 return sync(self.loop, func, *args, **kwargs)
File ~\Anaconda3\envs\satellite\lib\site-packages\fsspec\asyn.py:94, in sync(loop, func, timeout, *args, **kwargs)
91 return_result = result[0]
92 if isinstance(return_result, asyncio.TimeoutError):
93 # suppress asyncio.TimeoutError, raise FSTimeoutError
---> 94 raise FSTimeoutError from return_result
95 elif isinstance(return_result, BaseException):
96 raise return_result
FSTimeoutError:
In the line:
store = fsspec.get_mapper(asset.href)
You can pass extra arguments to the fsspec backend, in this case HTTP; see fsspec.implementations.http.HTTPFileSystem. The client_kwargs get passed to aiohttp.ClientSession and include an optional timeout argument. Your call may look something like
from aiohttp import ClientTimeout
store = fsspec.get_mapper(asset.href, client_kwargs={"timeout": ClientTimeout(total=5000, connect=1000)})
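Putting that together with the loading code from the question, a minimal sketch (the timeout values here are arbitrary and can be tuned):
import fsspec
import xarray as xr
from aiohttp import ClientTimeout

# Give aiohttp a generous total/connect timeout before opening the Zarr store.
store = fsspec.get_mapper(
    asset.href,
    client_kwargs={"timeout": ClientTimeout(total=5000, connect=1000)},
)
data = xr.open_zarr(store, **asset.extra_fields["xarray:open_kwargs"])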

Error while saving a pandas DataFrame to a feather file, using to_feather() function

I am trying to save a pandas dataframe to a feather file using the pandas function .to_feather(), as shown below:
df.to_feather("D:\{}.feather".format(parms['table_or_view_name']))
But I am getting an error pointing to the pyarrow library. Please note that I have already upgraded pyarrow to the most recent build (7.0.0); despite that, I am still facing the same issue:
ArrowInvalid Traceback (most recent call last)
<ipython-input-26-5459a546a0cb> in <module>
----> 1 df.to_feather("D:\{}.feather".format(parms['table_or_view_name']))
~\AppData\Roaming\Python\Python38\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
205 else:
206 kwargs[new_arg_name] = new_arg_value
--> 207 return func(*args, **kwargs)
208
209 return cast(F, wrapper)
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\frame.py in to_feather(self, path, **kwargs)
2517 from pandas.io.feather_format import to_feather
2518
-> 2519 to_feather(self, path, **kwargs)
2520
2521 #doc(
~\AppData\Roaming\Python\Python38\site-packages\pandas\io\feather_format.py in to_feather(df, path, storage_options, **kwargs)
85 path, "wb", storage_options=storage_options, is_text=False
86 ) as handles:
---> 87 feather.write_feather(df, handles.handle, **kwargs)
88
89
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\feather.py in write_feather(df, dest, compression, compression_level, chunksize, version)
152
153 if _pandas_api.is_data_frame(df):
--> 154 table = Table.from_pandas(df, preserve_index=False)
155
156 if version == 1:
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\table.pxi in pyarrow.lib.Table.from_pandas()
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in dataframe_to_arrays(df, schema, preserve_index, nthreads, columns, safe)
592
593 if nthreads == 1:
--> 594 arrays = [convert_column(c, f)
595 for c, f in zip(columns_to_convert, convert_fields)]
596 else:
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in <listcomp>(.0)
592
593 if nthreads == 1:
--> 594 arrays = [convert_column(c, f)
595 for c, f in zip(columns_to_convert, convert_fields)]
596 else:
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in convert_column(col, field)
579 e.args += ("Conversion failed for column {!s} with type {!s}"
580 .format(col.name, col.dtype),)
--> 581 raise e
582 if not field_nullable and result.null_count > 0:
583 raise ValueError("Field {} was non-nullable but pandas column "
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in convert_column(col, field)
573
574 try:
--> 575 result = pa.array(col, type=type_, from_pandas=True, safe=safe)
576 except (pa.ArrowInvalid,
577 pa.ArrowNotImplementedError,
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\array.pxi in pyarrow.lib.array()
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\array.pxi in pyarrow.lib._ndarray_to_array()
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\error.pxi in pyarrow.lib.check_status()
ArrowInvalid: ('cannot mix list and non-list, non-null values', 'Conversion failed for column Resources with type object')
Note:
This error only occurs when I try to save the file with to_feather(); when I tried to save the dataframe with .to_json(), it worked.
Please suggest if there is any workaround for this error, thanks in advance.
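The error message says that the Resources column mixes list values with non-list, non-null values, which Arrow cannot store in a single column. One possible workaround (a sketch under that assumption, not a confirmed fix) is to make the column homogeneous before writing, for example by serializing every non-null value to a JSON string:
import json
import pandas as pd

# Hypothetical reproduction: "Resources" mixes a list, a scalar and a null.
df = pd.DataFrame({"Resources": [["a", "b"], "c", None]})

# Serialize every non-null value so the whole column is strings (plus nulls).
df["Resources"] = df["Resources"].map(lambda v: None if v is None else json.dumps(v))

df.to_feather("resources.feather")  # now succeeds, since the column holds a single type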

Python/Numba: Trouble creating custom type using Numba Extension API

I am trying to create a custom Numba type. I am having issues boxing and unboxing Numba NumPy arrays to and from native NumPy arrays.
I have searched online for similar issues and followed the documentation example to the best of my ability (https://numba.pydata.org/numba-doc/latest/extending/interval-example.html).
I have also tried to interpret https://github.com/numba/numba/blob/master/numba/targets/boxing.py, but it is quite difficult, so I think I might be making some small mistake.
Below is my current attempt at including a NumPy array in my custom type.
import numpy as np
import numba as nb  # assumed; the code below uses nb.types and nb.targets but the original snippet omitted this import
from numba import types, cgutils
from numba.extending import typeof_impl, type_callable, models
from numba.extending import register_model, make_attribute_wrapper, overload_attribute
from numba.extending import lower_builtin, unbox, NativeValue, box

class BMatrix(object):
    """
    An empty wrapper for a Binary Matrix
    """
    def __init__(self, m, n, row_index):  # , col_index):
        self.m = m
        self.n = n
        self.row_index = row_index
        # self.col_i = col_index

    def __repr__(self):
        return 'BMatrix(%d, %d)' % (self.m, self.n)

    @property
    def shape(self):
        return (self.m, self.n)

class BMatrixType(types.Type):
    def __init__(self):
        super(BMatrixType, self).__init__(name='BMatrix')

bmatrix_type = BMatrixType()

@typeof_impl.register(BMatrix)
def typeof_index(val, c):
    return bmatrix_type

@type_callable(BMatrix)
def type_bmatrix(context):
    def typer(m, n, row_index):
        if (isinstance(m, types.Integer)
                and isinstance(n, types.Integer)
                and isinstance(row_index, nb.types.Array)):
                # and isinstance(col_index, nb.types.Array)):
            return bmatrix_type
    return typer

@register_model(BMatrixType)
class BMatrixModel(models.StructModel):
    def __init__(self, dmm, fe_type):
        members = [
            ('m', types.int64),
            ('n', types.int64),
            ('row_index', types.Array(types.int64, 1, 'C'))
        ]
        models.StructModel.__init__(self, dmm, fe_type, members)

make_attribute_wrapper(BMatrixType, 'm', 'm')
make_attribute_wrapper(BMatrixType, 'n', 'n')
make_attribute_wrapper(BMatrixType, 'row_index', 'row_index')

@overload_attribute(BMatrixType, "shape")
def get_shape(bmatrix):
    def getter(bmatrix):
        return (bmatrix.m, bmatrix.n)
    return getter

@lower_builtin(BMatrix, types.Integer, types.Integer, types.Array)  # nb.types.Array, nb.types.Array
def impl_bmatrix(context, builder, sig, args):
    typ = sig.return_type
    m, n, row_index = args
    bmatrix = cgutils.create_struct_proxy(typ)(context, builder)
    bmatrix.m = m
    bmatrix.n = n
    bmatrix.row_index = row_index
    return bmatrix._getvalue()

@unbox(BMatrixType)
def unbox_bmatrix(typ, obj, c):
    """
    Convert a BMatrixType object to a native interval structure.
    """
    m_obj = c.pyapi.object_getattr_string(obj, "m")
    n_obj = c.pyapi.object_getattr_string(obj, "n")
    row_index_obj = c.pyapi.object_getattr_string(obj, "row_index")
    BMatrix = cgutils.create_struct_proxy(typ)(c.context, c.builder)
    BMatrix.m = c.pyapi.long_as_longlong(m_obj)
    BMatrix.n = c.pyapi.long_as_longlong(n_obj)
    BMatrix.row_index = nb.targets.boxing.unbox_array(types.Array(types.int64, 1, 'C'),
                                                      row_index_obj, c)
    c.pyapi.decref(m_obj)
    c.pyapi.decref(n_obj)
    c.pyapi.decref(row_index_obj)
    is_error = cgutils.is_not_null(c.builder, c.pyapi.err_occurred())
    return NativeValue(BMatrix._getvalue(), is_error=is_error)

@box(BMatrixType)
def box_bmatrix(typ, val, c):
    """
    Convert a native bmatrix structure to a BMatrix object.
    """
    Bmatrix = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)
    m_obj = c.pyapi.long_from_longlong(Bmatrix.m)
    n_obj = c.pyapi.long_from_longlong(Bmatrix.n)
    row_index_obj = nb.targets.boxing.box_array(types.Array(types.int64, 1, 'C'),
                                                Bmatrix.row_index, c)
    class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Bmatrix))
    res = c.pyapi.call_function_objargs(class_obj, (m_obj, n_obj))
    c.pyapi.decref(m_obj)
    c.pyapi.decref(n_obj)
    c.pyapi.decref(row_index_obj)
    c.pyapi.decref(class_obj)
    return res
Test cases (the error tracebacks are absolutely massive for test_2 and test_3):
@nb.jit(nopython=True)
def test_1():  # Runs
    x = BMatrix(10, 10, np.array([10, 10, 10]))

def test_2():  # Errors
    x = BMatrix(10, 10, np.array([10, 10, 10]))

    @nb.jit(nopython=True)
    def _test_2(y):
        return y

    return _test_2(x)

@nb.jit(nopython=True)
def test_3():  # Errors
    return BMatrix(10, 10, np.array([10, 10, 10]))

@nb.jit(nopython=True)
def test_4():
    return BMatrix(10, 10, np.array([10, 10, 10])).row_index
These are the errors I get when I run the test cases:
test_1() #Runs
test_2()
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-52-0f6d1bdba40b> in <module>
----> 1 test_2()
<ipython-input-51-60141c9792c1> in test_2()
9 return y
10
---> 11 return _test_2(x)
12 #nb.jit(nopython=True)
13 def test_3():
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in _compile_for_args(self, *args, **kws)
368 e.patch_message(''.join(e.args) + help_msg)
369 # ignore the FULL_TRACEBACKS config, this needs reporting!
--> 370 raise e
371
372 def inspect_llvm(self, signature=None):
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in _compile_for_args(self, *args, **kws)
325 argtypes.append(self.typeof_pyval(a))
326 try:
--> 327 return self.compile(tuple(argtypes))
328 except errors.TypingError as e:
329 # Intercept typing error that may be due to an argument
//anaconda3/lib/python3.7/site-packages/numba/compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
30 def _acquire_compile_lock(*args, **kwargs):
31 with self:
---> 32 return func(*args, **kwargs)
33 return _acquire_compile_lock
34
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in compile(self, sig)
657
658 self._cache_misses[sig] += 1
--> 659 cres = self._compiler.compile(args, return_type)
660 self.add_overload(cres)
661 self._cache.save_overload(sig, cres)
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in compile(self, args, return_type)
81 args=args, return_type=return_type,
82 flags=flags, locals=self.locals,
---> 83 pipeline_class=self.pipeline_class)
84 # Check typing error if object mode is used
85 if cres.typing_error is not None and not flags.enable_pyobject:
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in compile_extra(typingctx, targetctx, func, args, return_type, flags, locals, library, pipeline_class)
953 pipeline = pipeline_class(typingctx, targetctx, library,
954 args, return_type, flags, locals)
--> 955 return pipeline.compile_extra(func)
956
957
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in compile_extra(self, func)
375 self.lifted = ()
376 self.lifted_from = None
--> 377 return self._compile_bytecode()
378
379 def compile_ir(self, func_ir, lifted=(), lifted_from=None):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in _compile_bytecode(self)
884 """
885 assert self.func_ir is None
--> 886 return self._compile_core()
887
888 def _compile_ir(self):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in _compile_core(self)
871 self.define_pipelines(pm)
872 pm.finalize()
--> 873 res = pm.run(self.status)
874 if res is not None:
875 # Early pipeline completion
//anaconda3/lib/python3.7/site-packages/numba/compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
30 def _acquire_compile_lock(*args, **kwargs):
31 with self:
---> 32 return func(*args, **kwargs)
33 return _acquire_compile_lock
34
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in run(self, status)
252 # No more fallback pipelines?
253 if is_final_pipeline:
--> 254 raise patched_exception
255 # Go to next fallback pipeline
256 else:
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in run(self, status)
243 try:
244 event("-- %s" % stage_name)
--> 245 stage()
246 except _EarlyPipelineCompletion as e:
247 return e.result
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in stage_nopython_backend(self)
745 """
746 lowerfn = self.backend_nopython_mode
--> 747 self._backend(lowerfn, objectmode=False)
748
749 def stage_compile_interp_mode(self):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in _backend(self, lowerfn, objectmode)
685 self.library.enable_object_caching()
686
--> 687 lowered = lowerfn()
688 signature = typing.signature(self.return_type, *self.args)
689 self.cr = compile_result(
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in backend_nopython_mode(self)
672 self.calltypes,
673 self.flags,
--> 674 self.metadata)
675
676 def _backend(self, lowerfn, objectmode):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in native_lowering_stage(targetctx, library, interp, typemap, restype, calltypes, flags, metadata)
1124 lower.lower()
1125 if not flags.no_cpython_wrapper:
-> 1126 lower.create_cpython_wrapper(flags.release_gil)
1127 env = lower.env
1128 call_helper = lower.call_helper
//anaconda3/lib/python3.7/site-packages/numba/lowering.py in create_cpython_wrapper(self, release_gil)
269 self.context.create_cpython_wrapper(self.library, self.fndesc,
270 self.env, self.call_helper,
--> 271 release_gil=release_gil)
272
273 def setup_function(self, fndesc):
//anaconda3/lib/python3.7/site-packages/numba/targets/cpu.py in create_cpython_wrapper(self, library, fndesc, env, call_helper, release_gil)
155 fndesc, env, call_helper=call_helper,
156 release_gil=release_gil)
--> 157 builder.build()
158 library.add_ir_module(wrapper_module)
159
//anaconda3/lib/python3.7/site-packages/numba/callwrapper.py in build(self)
120
121 api = self.context.get_python_api(builder)
--> 122 self.build_wrapper(api, builder, closure, args, kws)
123
124 return wrapper, api
//anaconda3/lib/python3.7/site-packages/numba/callwrapper.py in build_wrapper(self, api, builder, closure, args, kws)
153 innerargs.append(None)
154 else:
--> 155 val = cleanup_manager.add_arg(builder.load(obj), ty)
156 innerargs.append(val)
157
//anaconda3/lib/python3.7/site-packages/numba/callwrapper.py in add_arg(self, obj, ty)
30 """
31 # Unbox argument
---> 32 native = self.api.to_native_value(ty, obj)
33
34 # If an error occurred, go to the cleanup block for the previous argument.
//anaconda3/lib/python3.7/site-packages/numba/pythonapi.py in to_native_value(self, typ, obj)
1423 impl = _unboxers.lookup(typ.__class__, unbox_unsupported)
1424 c = _UnboxContext(self.context, self.builder, self)
-> 1425 return impl(typ, obj, c)
1426
1427 def from_native_return(self, typ, val, env_manager):
<ipython-input-45-d8ac5afde794> in unbox_bmatrix(typ, obj, c)
85 BMatrix.n = c.pyapi.long_as_longlong(n_obj)
86 BMatrix.row_index = nb.targets.boxing.unbox_array(types.Array(types.int64, 1, 'C'),
---> 87 row_index_obj, c)
88 c.pyapi.decref(m_obj)
89 c.pyapi.decref(n_obj)
//anaconda3/lib/python3.7/site-packages/numba/cgutils.py in __setattr__(self, field, value)
162 if field.startswith('_'):
163 return super(_StructProxy, self).__setattr__(field, value)
--> 164 self[self._datamodel.get_field_position(field)] = value
165
166 def __getitem__(self, index):
//anaconda3/lib/python3.7/site-packages/numba/cgutils.py in __setitem__(self, index, value)
177 ptr = self._get_ptr_by_index(index)
178 value = self._cast_member_from_value(index, value)
--> 179 if value.type != ptr.type.pointee:
180 if (is_pointer(value.type) and is_pointer(ptr.type.pointee)
181 and value.type.pointee == ptr.type.pointee.pointee):
AttributeError: Failed in nopython mode pipeline (step: nopython mode backend)
'NativeValue' object has no attribute 'type'
test_3()
KeyError Traceback (most recent call last)
//anaconda3/lib/python3.7/site-packages/numba/pythonapi.py in serialize_object(self, obj)
1403 try:
-> 1404 gv = self.module.__serialized[obj]
1405 except KeyError:
KeyError: <numba.cgutils.ValueStructProxy_BMatrix object at 0x11e693f28>
During handling of the above exception, another exception occurred:
PicklingError Traceback (most recent call last)
<ipython-input-53-8d78c7c0acee> in <module>
----> 1 test_3()
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in _compile_for_args(self, *args, **kws)
368 e.patch_message(''.join(e.args) + help_msg)
369 # ignore the FULL_TRACEBACKS config, this needs reporting!
--> 370 raise e
371
372 def inspect_llvm(self, signature=None):
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in _compile_for_args(self, *args, **kws)
325 argtypes.append(self.typeof_pyval(a))
326 try:
--> 327 return self.compile(tuple(argtypes))
328 except errors.TypingError as e:
329 # Intercept typing error that may be due to an argument
//anaconda3/lib/python3.7/site-packages/numba/compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
30 def _acquire_compile_lock(*args, **kwargs):
31 with self:
---> 32 return func(*args, **kwargs)
33 return _acquire_compile_lock
34
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in compile(self, sig)
657
658 self._cache_misses[sig] += 1
--> 659 cres = self._compiler.compile(args, return_type)
660 self.add_overload(cres)
661 self._cache.save_overload(sig, cres)
//anaconda3/lib/python3.7/site-packages/numba/dispatcher.py in compile(self, args, return_type)
81 args=args, return_type=return_type,
82 flags=flags, locals=self.locals,
---> 83 pipeline_class=self.pipeline_class)
84 # Check typing error if object mode is used
85 if cres.typing_error is not None and not flags.enable_pyobject:
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in compile_extra(typingctx, targetctx, func, args, return_type, flags, locals, library, pipeline_class)
953 pipeline = pipeline_class(typingctx, targetctx, library,
954 args, return_type, flags, locals)
--> 955 return pipeline.compile_extra(func)
956
957
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in compile_extra(self, func)
375 self.lifted = ()
376 self.lifted_from = None
--> 377 return self._compile_bytecode()
378
379 def compile_ir(self, func_ir, lifted=(), lifted_from=None):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in _compile_bytecode(self)
884 """
885 assert self.func_ir is None
--> 886 return self._compile_core()
887
888 def _compile_ir(self):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in _compile_core(self)
871 self.define_pipelines(pm)
872 pm.finalize()
--> 873 res = pm.run(self.status)
874 if res is not None:
875 # Early pipeline completion
//anaconda3/lib/python3.7/site-packages/numba/compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
30 def _acquire_compile_lock(*args, **kwargs):
31 with self:
---> 32 return func(*args, **kwargs)
33 return _acquire_compile_lock
34
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in run(self, status)
252 # No more fallback pipelines?
253 if is_final_pipeline:
--> 254 raise patched_exception
255 # Go to next fallback pipeline
256 else:
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in run(self, status)
243 try:
244 event("-- %s" % stage_name)
--> 245 stage()
246 except _EarlyPipelineCompletion as e:
247 return e.result
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in stage_nopython_backend(self)
745 """
746 lowerfn = self.backend_nopython_mode
--> 747 self._backend(lowerfn, objectmode=False)
748
749 def stage_compile_interp_mode(self):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in _backend(self, lowerfn, objectmode)
685 self.library.enable_object_caching()
686
--> 687 lowered = lowerfn()
688 signature = typing.signature(self.return_type, *self.args)
689 self.cr = compile_result(
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in backend_nopython_mode(self)
672 self.calltypes,
673 self.flags,
--> 674 self.metadata)
675
676 def _backend(self, lowerfn, objectmode):
//anaconda3/lib/python3.7/site-packages/numba/compiler.py in native_lowering_stage(targetctx, library, interp, typemap, restype, calltypes, flags, metadata)
1124 lower.lower()
1125 if not flags.no_cpython_wrapper:
-> 1126 lower.create_cpython_wrapper(flags.release_gil)
1127 env = lower.env
1128 call_helper = lower.call_helper
//anaconda3/lib/python3.7/site-packages/numba/lowering.py in create_cpython_wrapper(self, release_gil)
269 self.context.create_cpython_wrapper(self.library, self.fndesc,
270 self.env, self.call_helper,
--> 271 release_gil=release_gil)
272
273 def setup_function(self, fndesc):
//anaconda3/lib/python3.7/site-packages/numba/targets/cpu.py in create_cpython_wrapper(self, library, fndesc, env, call_helper, release_gil)
155 fndesc, env, call_helper=call_helper,
156 release_gil=release_gil)
--> 157 builder.build()
158 library.add_ir_module(wrapper_module)
159
//anaconda3/lib/python3.7/site-packages/numba/callwrapper.py in build(self)
120
121 api = self.context.get_python_api(builder)
--> 122 self.build_wrapper(api, builder, closure, args, kws)
123
124 return wrapper, api
//anaconda3/lib/python3.7/site-packages/numba/callwrapper.py in build_wrapper(self, api, builder, closure, args, kws)
174
175 retty = self._simplified_return_type()
--> 176 obj = api.from_native_return(retty, retval, env_manager)
177 builder.ret(obj)
178
//anaconda3/lib/python3.7/site-packages/numba/pythonapi.py in from_native_return(self, typ, val, env_manager)
1429 "prevented the return of " \
1430 "optional value"
-> 1431 out = self.from_native_value(typ, val, env_manager)
1432 return out
1433
//anaconda3/lib/python3.7/site-packages/numba/pythonapi.py in from_native_value(self, typ, val, env_manager)
1443
1444 c = _BoxContext(self.context, self.builder, self, env_manager)
-> 1445 return impl(typ, val, c)
1446
1447 def reflect_native_value(self, typ, val, env_manager=None):
<ipython-input-45-d8ac5afde794> in box_bmatrix(typ, val, c)
104 Bmatrix.row_index, c)
105
--> 106 class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Bmatrix))
107 res = c.pyapi.call_function_objargs(class_obj, (m_obj, n_obj))
108 c.pyapi.decref(m_obj)
//anaconda3/lib/python3.7/site-packages/numba/pythonapi.py in serialize_object(self, obj)
1404 gv = self.module.__serialized[obj]
1405 except KeyError:
-> 1406 struct = self.serialize_uncached(obj)
1407 name = ".const.picklebuf.%s" % (id(obj) if config.DIFF_IR == 0 else "DIFF_IR")
1408 gv = self.context.insert_unique_const(self.module, name, struct)
//anaconda3/lib/python3.7/site-packages/numba/pythonapi.py in serialize_uncached(self, obj)
1383 """
1384 # First make the array constant
-> 1385 data = pickle.dumps(obj, protocol=-1)
1386 assert len(data) < 2**31
1387 name = ".const.pickledata.%s" % (id(obj) if config.DIFF_IR == 0 else "DIFF_IR")
PicklingError: Failed in nopython mode pipeline (step: nopython mode backend)
Can't pickle <class 'numba.cgutils.ValueStructProxy_BMatrix'>: attribute lookup ValueStructProxy_BMatrix on numba.cgutils failed
test_4() #Runs Wrong
array([-2387225703656530210, -2387225703656530210, -2387225703656530210])
unbox_array returns a NativeValue. Inside the NativeValue is the actual value, which is what you want to assign to row_index. So just add ".value" to the end of the following line to extract the value from the NativeValue:
BMatrix.row_index = nb.targets.boxing.unbox_array(types.Array(types.int64, 1, 'C'), row_index_obj, c)
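Applied to the unbox_bmatrix function above, that assignment becomes:
BMatrix.row_index = nb.targets.boxing.unbox_array(types.Array(types.int64, 1, 'C'),
                                                  row_index_obj, c).value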

Impossible to read a CSV file with PySpark

I am trying to read a CSV file using PySpark with this code:
tr_df = spark.read.csv("/data/file.csv",
                       header=True, inferSchema=True)
tr_df.head(5)
But I get this error:
ValueError Traceback (most recent call last)
<ipython-input-53-03432bbf269d> in <module>
----> 1 tr_df.head(5)
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/dataframe.py
in head(self, n)
1250 rs = self.head(1)
1251 return rs[0] if rs else None
-> 1252 return self.take(n)
1253
1254 #ignore_unicode_prefix
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/dataframe.py
in take(self, num)
569 [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
570 """
--> 571 return self.limit(num).collect()
572
573 #since(1.3)
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/dataframe.py
in collect(self)
532 with SCCallSiteSync(self._sc) as css:
533 sock_info = self._jdf.collectToPython()
--> 534 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
535
536 #ignore_unicode_prefix
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/serializers.py
in load_stream(self, stream)
145 while True:
146 try:
--> 147 yield self._read_with_length(stream)
148 except EOFError:
149 return
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/serializers.py
in _read_with_length(self, stream)
170 if len(obj) < length:
171 raise EOFError
--> 172 return self.loads(obj)
173
174 def dumps(self, obj):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/serializers.py
in loads(self, obj, encoding)
578 if sys.version >= '3':
579 def loads(self, obj, encoding="bytes"):
--> 580 return pickle.loads(obj, encoding=encoding)
581 else:
582 def loads(self, obj, encoding=None):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in _parse_datatype_json_string(json_string)
867 >>> check_datatype(complex_maptype)
868 """
--> 869 return _parse_datatype_json_value(json.loads(json_string))
870
871
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in _parse_datatype_json_value(json_value)
884 tpe = json_value["type"]
885 if tpe in _all_complex_types:
--> 886 return _all_complex_types[tpe].fromJson(json_value)
887 elif tpe == 'udt':
888 return UserDefinedType.fromJson(json_value)
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in fromJson(cls, json)
575 #classmethod
576 def fromJson(cls, json):
--> 577 return StructType([StructField.fromJson(f) for f in json["fields"]])
578
579 def fieldNames(self):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in (.0)
575 #classmethod
576 def fromJson(cls, json):
--> 577 return StructType([StructField.fromJson(f) for f in json["fields"]])
578
579 def fieldNames(self):
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in fromJson(cls, json)
432 def fromJson(cls, json):
433 return StructField(json["name"],
--> 434 _parse_datatype_json_value(json["type"]),
435 json["nullable"],
436 json["metadata"])
~/anaconda3/envs/naboo-env/lib/python3.6/site-packages/pyspark/sql/types.py
in _parse_datatype_json_value(json_value)
880 return DecimalType(int(m.group(1)), int(m.group(2)))
881 else:
--> 882 raise ValueError("Could not parse datatype: %s" % json_value)
883 else:
884 tpe = json_value["type"]
ValueError: Could not parse datatype: decimal(17,-24)
Can anyone help me to resolve this problem please?
Thanks
It seems there is a problem with the datatype in one of your columns (the inferred type decimal(17,-24) in the error is not a parseable type), hence it's throwing the error. Remove the inferSchema=True option while reading. After reading the data, try to analyze the datatypes, make any corrections if needed, and then apply your own schema.
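A sketch of that approach (the column names and types below are placeholders for whatever the file actually contains):
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType

# 1) Read without inferSchema: every column comes back as a string, so nothing fails.
raw_df = spark.read.csv("/data/file.csv", header=True)
raw_df.printSchema()
raw_df.show(5)

# 2) Once the real types are known, re-read with an explicit schema.
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("amount", DoubleType(), True),
    StructField("label", StringType(), True),
    # ... one StructField per column in the file ...
])
tr_df = spark.read.csv("/data/file.csv", header=True, schema=schema)
tr_df.head(5)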

How to print a pandas dataframe containing some Russian language

I am working on the following type of data.
itemid category subcategory title
1 10000010 Транспорт Автомобили с пробегом Toyota Sera, 1991
2 10000025 Услуги Предложения услуг Монтаж кровли
3 10000094 Личные вещи Одежда, обувь, аксессуары Костюм Steilmann
4 10000101 Транспорт Автомобили с пробегом Ford Focus, 2011
5 10000132 Транспорт Запчасти и аксессуары Турбина 3.0 Bar
6 10000152 Транспорт Автомобили с пробегом ВАЗ 2115 Samara, 2005
Now I run the following commands
import pandas as pd
trainingData = pd.read_table("train.tsv",nrows=10, header=0,encoding='utf-8')
trainingData['itemid'].head()
0 10000010
1 10000025
2 10000094
3 10000101
4 10000132
Name: itemid
Everything is good up to this point, but when I do something like
trainingData[['itemid','category']].head()
I get this error:
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
/home/vikram/Documents/Avito/ in ()
----> 1 trainingData[['itemid','category']].head()
/usr/lib/python2.7/dist-packages/IPython/core/displayhook.pyc in __call__(self, result)
236 self.start_displayhook()
237 self.write_output_prompt()
--> 238 format_dict = self.compute_format_data(result)
239 self.write_format_data(format_dict)
240 self.update_user_ns(result)
/usr/lib/python2.7/dist-packages/IPython/core/displayhook.pyc in compute_format_data(self, result)
148 MIME type representation of the object.
149 """
--> 150 return self.shell.display_formatter.format(result)
151
152 def write_format_data(self, format_dict):
/usr/lib/python2.7/dist-packages/IPython/core/formatters.pyc in format(self, obj, include, exclude)
124 continue
125 try:
--> 126 data = formatter(obj)
127 except:
128 # FIXME: log the exception
/usr/lib/python2.7/dist-packages/IPython/core/formatters.pyc in __call__(self, obj)
445 type_pprinters=self.type_printers,
446 deferred_pprinters=self.deferred_printers)
--> 447 printer.pretty(obj)
448 printer.flush()
449 return stream.getvalue()
/usr/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in pretty(self, obj)
352 if callable(obj_class._repr_pretty_):
353 return obj_class._repr_pretty_(obj, self, cycle)
--> 354 return _default_pprint(obj, self, cycle)
355 finally:
356 self.end_group()
/usr/lib/python2.7/dist-packages/IPython/lib/pretty.pyc in _default_pprint(obj, p, cycle)
472 if getattr(klass, '__repr__', None) not in _baseclass_reprs:
473 # A user-provided repr.
--> 474 p.text(repr(obj))
475 return
476 p.begin_group(1, ' 456 self.to_string(buf=buf)
457 value = buf.getvalue()
458 if max([len(l) for l in value.split('\n')]) > terminal_width:
/usr/lib/pymodules/python2.7/pandas/core/frame.pyc in to_string(self, buf, columns, col_space, colSpace, header, index, na_rep, formatters, float_format, sparsify, nanRep, index_names, justify, force_unicode)
1024 index_names=index_names,
1025 header=header, index=index)
-> 1026 formatter.to_string(force_unicode=force_unicode)
1027
1028 if buf is None:
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in to_string(self, force_unicode)
176 for i, c in enumerate(self.columns):
177 if self.header:
--> 178 fmt_values = self._format_col(c)
179 cheader = str_columns[i]
180 max_len = max(max(len(x) for x in fmt_values),
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in _format_col(self, col)
217 float_format=self.float_format,
218 na_rep=self.na_rep,
--> 219 space=self.col_space)
220
221 def to_html(self):
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in format_array(values, formatter, float_format, na_rep, digits, space, justify)
424 justify=justify)
425
--> 426 return fmt_obj.get_result()
427
428
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in get_result(self)
471 fmt_values.append(float_format(v))
472 else:
--> 473 fmt_values.append(' %s' % _format(v))
474
475 return _make_fixed_width(fmt_values, self.justify)
/usr/lib/pymodules/python2.7/pandas/core/format.pyc in _format(x)
457 else:
458 # object dtype
--> 459 return '%s' % formatter(x)
460
461 vals = self.values
/usr/lib/pymodules/python2.7/pandas/core/common.pyc in _stringify(col)
503 def _stringify(col):
504 # unicode workaround
--> 505 return unicode(col)
506
507 def _maybe_make_list(obj):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
Please help me "display" the data properly.
I had the same issue caused by IPython, which could not display non-ASCII text returned by the Pandas head() function. It turned out that the default encoding for Python was set to 'ascii' on my machine. You can check this with
import sys
sys.getdefaultencoding()
The solution was to re-set the default encoding to UTF-8:
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
After this, IPython displayed Pandas data frames with non-ASCII characters correctly.
Note that the reload call is necessary to make the setdefaultencoding function available. Without it you'll get the error:
AttributeError: 'module' object has no attribute 'setdefaultencoding'
