pandas: FloatingPointError with np.seterr(all='raise') and missing data

pandas: FloatingPointError with np.seterr(all='raise') and missing data - python

I'm getting a FloatingPointError when I want to look at data involving missing data.
import numpy as np
import pandas as pd
np.seterr(all='raise')
s = pd.Series([np.nan,np.nan,np.nan],index=[1,2,3]); print(s); print(s.head())
I'm on the newest version of pandas, installed via
conda install -f pandas
after pkill python and conda remove pandas.
Here's the trace back:
Out[4]: ---------------------------------------------------------------------------
FloatingPointError Traceback (most recent call last)
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/IPython/core/formatters.pyc in __call__(self, obj)
695 type_pprinters=self.type_printers,
696 deferred_pprinters=self.deferred_printers)
--> 697 printer.pretty(obj)
698 printer.flush()
699 return stream.getvalue()
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/IPython/lib/pretty.pyc in pretty(self, obj)
381 if callable(meth):
382 return meth(obj, self, cycle)
--> 383 return _default_pprint(obj, self, cycle)
384 finally:
385 self.end_group()
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/IPython/lib/pretty.pyc in _default_pprint(obj, p, cycle)
501 if _safe_getattr(klass, '__repr__', None) not in _baseclass_reprs:
502 # A user-provided repr. Find newlines and replace them with p.break_()
--> 503 _repr_pprint(obj, p, cycle)
504 return
505 p.begin_group(1, '<')
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/IPython/lib/pretty.pyc in _repr_pprint(obj, p, cycle)
683 """A pprint that just redirects to the normal repr function."""
684 # Find newlines and replace them with p.break_()
--> 685 output = repr(obj)
686 for idx,output_line in enumerate(output.splitlines()):
687 if idx:
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/base.pyc in __repr__(self)
61 Yields Bytestring in Py2, Unicode String in py3.
62 """
---> 63 return str(self)
64
65
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/base.pyc in __str__(self)
41 if compat.PY3:
42 return self.__unicode__()
---> 43 return self.__bytes__()
44
45 def __bytes__(self):
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/base.pyc in __bytes__(self)
53
54 encoding = get_option("display.encoding")
---> 55 return self.__unicode__().encode(encoding, 'replace')
56
57 def __repr__(self):
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/series.pyc in __unicode__(self)
954
955 self.to_string(buf=buf, name=self.name, dtype=self.dtype,
--> 956 max_rows=max_rows)
957 result = buf.getvalue()
958
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/series.pyc in to_string(self, buf, na_rep, float_format, header, length, dtype, name, max_rows)
992 the_repr = self._get_repr(float_format=float_format, na_rep=na_rep,
993 header=header, length=length, dtype=dtype,
--> 994 name=name, max_rows=max_rows)
995
996 # catch contract violations
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/series.pyc in _get_repr(self, name, header, length, dtype, na_rep, float_format, max_rows)
1022 float_format=float_format,
1023 max_rows=max_rows)
-> 1024 result = formatter.to_string()
1025
1026 # TODO: following check prob. not neces.
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/format.pyc in to_string(self)
223
224 fmt_index, have_header = self._get_formatted_index()
--> 225 fmt_values = self._get_formatted_values()
226
227 maxlen = max(self.adj.len(x) for x in fmt_index) # max index len
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/format.pyc in _get_formatted_values(self)
213 return format_array(self.tr_series._values, None,
214 float_format=self.float_format,
--> 215 na_rep=self.na_rep)
216
217 def to_string(self):
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/format.pyc in format_array(values, formatter, float_format, na_rep, digits, space, justify)
1974 justify=justify)
1975
-> 1976 return fmt_obj.get_result()
1977
1978
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/format.pyc in get_result(self)
1990
1991 def get_result(self):
-> 1992 fmt_values = self._format_strings()
1993 return _make_fixed_width(fmt_values, self.justify)
1994
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/format.pyc in _format_strings(self)
2085
2086 # this is pretty arbitrary for now
-> 2087 has_large_values = (abs_vals > 1e8).any()
2088 has_small_values = ((abs_vals < 10 ** (-self.digits)) &
2089 (abs_vals > 0)).any()
FloatingPointError: invalid value encountered in greater

Whenever you import pandas, all numpy floating-point error modes are set to 'ignore'. This is currently undocumented behavior.
This is done in pandas/compat/numpy_compat.py
# TODO: HACK for NumPy 1.5.1 to suppress warnings
# is this necessary?
try:
np.seterr(all='ignore')
except Exception: # pragma: no cover
pass
Here's how that plays out
In [1]: import numpy as np
In [2]: np.geterr()
Out[2]: {'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}
In [3]: import pandas as pd
In [4]: np.geterr()
Out[4]: {'divide': 'ignore', 'invalid': 'ignore', 'over': 'ignore', 'under': 'ignore'}
In [5]: s = pd.Series([np.nan,np.nan,np.nan],index=[1,2,3]); print(s); print(s.head())
1 NaN
2 NaN
3 NaN
dtype: float64
1 NaN
2 NaN
3 NaN
dtype: float64
In [6]: np.seterr(invalid='raise')
Out[6]: {'divide': 'ignore', 'invalid': 'ignore', 'over': 'ignore', 'under': 'ignore'}
In [7]: s = pd.Series([np.nan,np.nan,np.nan],index=[1,2,3]); print(s); print(s.head())
FloatingPointError: invalid value encountered in greater
The "solution" is hence to not call np.seterr(invalid='raise') whenever you use pandas (especially when working with missing data).

Related

Why pd.read_csv get wrong value when using dtype = 'Int64'?

import pandas as pd
When there are no NA values, the result is correct.
!cat id1
1471341653427101696 1458379213265436885
pd.read_csv('id1',sep ='\t',header=None, na_values=['\\N'],dtype = 'Int64')
0
1
1471341653427101696
1458379213265436885
!cat id2
1471870967209926656 \N
1471341653427101696 1458379213265436885
1471458498691866624 1458379213265436889
When using Int64 and there are NA values, pandas returns the wrong numbers:
df = pd.read_csv('id2',sep ='\t',header=None, na_values=['\\N'],dtype = 'Int64')
df
0
1
1471870967209926656
<NA>
1471341653427101696
1458379213265436672
1471458498691866624
1458379213265436672
When the column is read as str, the values are correct:
df = pd.read_csv('id2',sep ='\t',header=None, na_values=['\\N'],dtype = 'str')
df
0
1
1471870967209926656
NaN
1471341653427101696
1458379213265436885
1471458498691866624
1458379213265436889
df[1]
0 NaN
1 1458379213265436885
2 1458379213265436889
Name: 1, dtype: object
df.loc[[0],1].astype('Int64')
0 <NA>
Name: 1, dtype: Int64
df.loc[[1],1].astype('Int64')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_1971828/2578832362.py in <module>
----> 1 df.loc[[1],1].astype('Int64')
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
5813 else:
5814 # else, only a single dtype is given
-> 5815 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
5816 return self._constructor(new_data).__finalize__(self, method="astype")
5817
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
416
417 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
--> 418 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
419
420 def convert(
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
325 applied = b.apply(f, **kwargs)
326 else:
--> 327 applied = getattr(b, f)(**kwargs)
328 except (TypeError, NotImplementedError):
329 if not ignore_failures:
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
589 values = self.values
590
--> 591 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
592
593 new_values = maybe_coerce_values(new_values)
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_array_safe(values, dtype, copy, errors)
1307
1308 try:
-> 1309 new_values = astype_array(values, dtype, copy=copy)
1310 except (ValueError, TypeError):
1311 # e.g. astype_nansafe can fail on object-dtype of strings
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_array(values, dtype, copy)
1255
1256 else:
-> 1257 values = astype_nansafe(values, dtype, copy=copy)
1258
1259 # in pandas we don't store numpy str dtypes, so convert to object
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
1103 # dispatch on extension dtype if needed
1104 if isinstance(dtype, ExtensionDtype):
-> 1105 return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)
1106
1107 elif not isinstance(dtype, np.dtype): # pragma: no cover
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/arrays/integer.py in _from_sequence(cls, scalars, dtype, copy)
321 cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
322 ) -> IntegerArray:
--> 323 values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy)
324 return IntegerArray(values, mask)
325
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/arrays/integer.py in coerce_to_array(values, dtype, mask, copy)
196 "mixed-integer-float",
197 ]:
--> 198 raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")
199
200 elif is_bool_dtype(values) and is_integer_dtype(dtype):
TypeError: object cannot be converted to an IntegerDtype

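This is an old issue for pandas: github.com/pandas-dev/pandas/issues/30268. When the column contains NA values, the numbers are rounded as if they had passed through float64, which cannot represent integers this large exactly (hence 1458379213265436885 becoming 1458379213265436672).
So the only workaround is to read the column as str, remove the NA values, and then convert to int.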

Pandas pivot_table Assertion error: `result` has not been initialized

df:
avg
count
date
val
prop
unit
distance
d-atmp
d-clouds
d-dewpoint
0.0786107
12
2014-10-03 00:00:00
22
atmp
(Deg C)
24829.6
24829.6
nan
nan
0.0786107
12
2014-10-03 00:00:00
0
clouds
(oktas)
22000.6
nan
22000.6
nan
0.0786107
12
2014-10-03 00:00:00
32
dewpoint
(Deg C)
21344.1
nan
nan
21344.1
0.0684246
6
2014-10-04 00:00:00
21.5
atmp
(Deg C)
26345.1
26345.1
nan
nan
cols = ['avg', 'date', 'count', 'd-atmp', 'd-cloud', 'd-dewpoint']
d = pd.pivot_table(x, index=cols, columns=['prop', 'unit'], values='val', aggfunc=max)
Ideal result:
date
countObs
avg
d-atmp
atmp (Deg C)
d-clouds
clouds (oktas)
d-dewpoint
dewpoint (Deg C)
2014-10-03 00:00:00
12
0.0786107
24829.6
22
22000.6
0
21344.1
32
2014-10-04 00:00:00
6
0.0684246
26345.1
21.5
nan
nan
nan
nan
Error
--------------------------------------------------------------------------- NotImplementedError Traceback (most recent call last) ~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in array_func(values) 1067 try:
-> 1068 result = self.grouper._cython_operation( 1069 "aggregate", values, how, axis=data.ndim - 1, min_count=min_count
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_cython_operation(self, kind, values, how, axis, min_count, **kwargs)
998 ngroups = self.ngroups
--> 999 return cy_op.cython_operation( 1000 values=values,
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in cython_operation(self, values, axis, min_count, comp_ids, ngroups,
**kwargs)
659
--> 660 return self._cython_op_ndim_compat(
661 values,
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_cython_op_ndim_compat(self, values, min_count, ngroups, comp_ids, mask, **kwargs)
515
--> 516 return self._call_cython_op(
517 values,
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_call_cython_op(self, values, min_count, ngroups, comp_ids, mask, **kwargs)
561 out_shape = self._get_output_shape(ngroups, values)
--> 562 func, values = self.get_cython_func_and_vals(values, is_numeric)
563 out_dtype = self.get_out_dtype(values.dtype)
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in get_cython_func_and_vals(self, values, is_numeric)
204
--> 205 func = self._get_cython_function(kind, how, values.dtype, is_numeric)
206
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_get_cython_function(cls, kind, how, dtype, is_numeric)
169 # raise NotImplementedError here rather than TypeError later
--> 170 raise NotImplementedError(
171 f"function is not implemented for this dtype: "
NotImplementedError: function is not implemented for this dtype: [how->mean,dtype->object]
During handling of the above exception, another exception occurred:
AssertionError Traceback (most recent call last) <ipython-input-119-b64b487d2810> in <module>
5 # o
6 # cols += []
----> 7 d = pd.pivot_table(x, index=cols, columns=['osmcObsProperty', 'unit'], values='val') #, aggfunc=max #np.mean or max appear similar , dropna=False
8
9 d.reset_index(inplace=True)
~/.local/lib/python3.9/site-packages/pandas/core/reshape/pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)
93 return table.__finalize__(data, method="pivot_table")
94
---> 95 table = __internal_pivot_table(
96 data,
97 values,
~/.local/lib/python3.9/site-packages/pandas/core/reshape/pivot.py in
__internal_pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)
163
164 grouped = data.groupby(keys, observed=observed, sort=sort)
--> 165 agged = grouped.agg(aggfunc)
166 if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
167 agged = agged.dropna(how="all")
~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
977
978 op = GroupByApply(self, func, args, kwargs)
--> 979 result = op.agg()
980 if not is_dict_like(func) and result is not None:
981 return result
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in agg(self)
156
157 if isinstance(arg, str):
--> 158 return self.apply_str()
159
160 if is_dict_like(arg):
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in apply_str(self)
505 elif self.axis != 0:
506 raise ValueError(f"Operation {f} does not support axis=1")
--> 507 return self._try_aggregate_string_function(obj, f, *self.args, **self.kwargs)
508
509 def apply_multiple(self) -> FrameOrSeriesUnion:
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in
_try_aggregate_string_function(self, obj, arg, *args, **kwargs)
575 if f is not None:
576 if callable(f):
--> 577 return f(*args, **kwargs)
578
579 # people may try to aggregate on a non-callable attribute
~/.local/lib/python3.9/site-packages/pandas/core/groupby/groupby.py in mean(self, numeric_only) 1685 numeric_only = self._resolve_numeric_only(numeric_only) 1686
-> 1687 result = self._cython_agg_general( 1688 "mean", 1689 alt=lambda x: Series(x).mean(numeric_only=numeric_only),
~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in
_cython_agg_general(self, how, alt, numeric_only, min_count) 1080 # TypeError -> we may have an exception in trying to aggregate 1081 # continue and exclude the block
-> 1082 new_mgr = data.grouped_reduce(array_func, ignore_failures=True) 1083 1084 if len(new_mgr) < len(data):
~/.local/lib/python3.9/site-packages/pandas/core/internals/managers.py in grouped_reduce(self, func, ignore_failures) 1233 for sb in blk._split(): 1234 try:
-> 1235 applied = sb.apply(func) 1236 except (TypeError, NotImplementedError): 1237 if not ignore_failures:
~/.local/lib/python3.9/site-packages/pandas/core/internals/blocks.py in apply(self, func, **kwargs)
379 """
380 with np.errstate(all="ignore"):
--> 381 result = func(self.values, **kwargs)
382
383 return self._split_op_result(result)
~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in array_func(values) 1074 # try to python agg 1075
# TODO: shouldn't min_count matter?
-> 1076 result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt) 1077 1078 return result
~/.local/lib/python3.9/site-packages/pandas/core/groupby/groupby.py in
_agg_py_fallback(self, values, ndim, alt) 1396 # should always be preserved by the implemented aggregations 1397 # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
-> 1398 res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True) 1399 1400 if isinstance(values, Categorical):
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in agg_series(self, obj, func, preserve_dtype) 1047 1048 else:
-> 1049 result = self._aggregate_series_fast(obj, func) 1050 1051 npvalues = lib.maybe_convert_objects(result, try_float=False)
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_aggregate_series_fast(self, obj, func) 1072 ids = ids.take(indexer) 1073 sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups)
-> 1074 result, _ = sgrouper.get_result() 1075 return result 1076
~/.local/lib/python3.9/site-packages/pandas/_libs/reduction.pyx in pandas._libs.reduction.SeriesGrouper.get_result()
AssertionError: `result` has not been initialized.

IIUC, you can use groupby_agg:
out = df.groupby('date', as_index=False).agg(max)
Output:
date
avg
count
val
prop
unit
distance
d-atmp
d-clouds
d-dewpoint
2014-10-03 00:00:00
0.0786107
12
32
dewpoint
(oktas)
24829.6
24829.6
22000.6
21344.1
2014-10-04 00:00:00
0.0684246
6
21.5
atmp
(Deg C)
26345.1
26345.1
nan
nan

You could pivot; then use groupby + max:
cols = ['avg', 'date', 'count', 'd-atmp', 'd-clouds', 'd-dewpoint']
tmp = df.pivot(index=cols, columns=['prop', 'unit'], values='val')
tmp.columns = tmp.columns.map(' '.join)
out = tmp.reset_index().groupby('date', as_index=False).max()\
[['date', 'count', 'avg', 'd-atmp', 'atmp (Deg C)', 'd-clouds',
'clouds (oktas)', 'd-dewpoint', 'dewpoint (Deg C)']]
Output:
date count avg d-atmp atmp (Deg C) d-clouds clouds (oktas) d-dewpoint dewpoint (Deg C)
0 2014-10-03 00:00:00 12 0.078611 24829.6 22.0 22000.6 0.0 21344.1 32.0
1 2014-10-04 00:00:00 6 0.068425 26345.1 21.5 NaN NaN NaN NaN

TypeError: unsupported format string when printing Pandas dataframe with incorrectly shaped index

Found this not-so-helpful traceback when incorrectly constructing a data frame with an index. My question is, is this a bug that I should report to Pandas as an issue or feature request or am I doing something wrong?
What I wanted to do:
index = pd.Index(np.array([0, 1]))
df = pd.DataFrame({'A': [0, 1], 'B': [1.1, 1.2]},
index=index)
print(df)
A B
0 0 1.1
1 1 1.2
(works, no problem)
What I actually did (note dimension of index array data):
index = pd.Index(np.array([[0], [1]]))
df = pd.DataFrame({'A': [0, 1], 'B': [1.1, 1.2]},
index=index)
print(df)
Traceback message (very long):
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-95-af090c2ae470> in <module>
2 df = pd.DataFrame({'A': [0, 1], 'B': [1.1, 1.2]},
3 index=index)
----> 4 print(df)
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/frame.py in __repr__(self)
653 max_cols=max_cols,
654 line_width=width,
--> 655 show_dimensions=show_dimensions,
656 )
657
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/frame.py in to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, min_rows, max_cols, show_dimensions, decimal, line_width)
774 line_width=line_width,
775 )
--> 776 formatter.to_string()
777
778 if buf is None:
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in to_string(self)
686 else:
687
--> 688 strcols = self._to_str_columns()
689 if self.line_width is None: # no need to wrap around just print
690 # the whole frame
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in _to_str_columns(self)
586 # may include levels names also
587
--> 588 str_index = self._get_formatted_index(frame)
589
590 if not is_list_like(self.header) and not self.header:
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in _get_formatted_index(self, frame)
919 )
920 else:
--> 921 fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)]
922
923 fmt_index = [
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/indexes/base.py in format(self, name, formatter, **kwargs)
1106 return header + list(self.map(formatter))
1107
-> 1108 return self._format_with_header(header, **kwargs)
1109
1110 def _format_with_header(self, header, na_rep="NaN", **kwargs):
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/indexes/base.py in _format_with_header(self, header, na_rep, **kwargs)
1130
1131 else:
-> 1132 result = _trim_front(format_array(values, None, justify="left"))
1133 return header + result
1134
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space)
1031 )
1032
-> 1033 return fmt_obj.get_result()
1034
1035
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in get_result(self)
1062
1063 def get_result(self):
-> 1064 fmt_values = self._format_strings()
1065 return _make_fixed_width(fmt_values, self.justify)
1066
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in _format_strings(self)
1293 def _format_strings(self):
1294 formatter = self.formatter or (lambda x: "{x: d}".format(x=x))
-> 1295 fmt_values = [formatter(x) for x in self.values]
1296 return fmt_values
1297
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in <listcomp>(.0)
1293 def _format_strings(self):
1294 formatter = self.formatter or (lambda x: "{x: d}".format(x=x))
-> 1295 fmt_values = [formatter(x) for x in self.values]
1296 return fmt_values
1297
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in <lambda>(x)
1292 class IntArrayFormatter(GenericArrayFormatter):
1293 def _format_strings(self):
-> 1294 formatter = self.formatter or (lambda x: "{x: d}".format(x=x))
1295 fmt_values = [formatter(x) for x in self.values]
1296 return fmt_values
TypeError: unsupported format string passed to numpy.ndarray.__format__
Note that the dataframe was constructed (it just can't be printed):
In [14]: df.shape
Out[14]: (2, 2)
In [15]: df.index
Out[15]: Int64Index([[0], [1]], dtype='int64')
In [16]: df.values
Out[16]:
array([[0. , 1.1],
[1. , 1.2]])
In [18]: df.columns
Out[18]: Index(['A', 'B'], dtype='object')
In [19]: df.index[0]
Out[19]: array([0])
In [20]: df.index.dtype
Out[20]: dtype('int64')
Also note that if you make the same mistake with the data arguments...
index = pd.Index(np.array([0, 1]))
df = pd.DataFrame({'A': np.array([[0], [1]]), 'B': np.array([[1.1], [1.2]])},
index=index)
print(df)
...you get a nice, informative error message:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-96-218c77e99705> in <module>
1 index = pd.Index(np.array([0, 1]))
2 df = pd.DataFrame({'A': np.array([[0], [1]]), 'B': np.array([[1.1], [1.2]])},
----> 3 index=index)
4 print(df)
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
409 )
410 elif isinstance(data, dict):
--> 411 mgr = init_dict(data, index, columns, dtype=dtype)
412 elif isinstance(data, ma.MaskedArray):
413 import numpy.ma.mrecords as mrecords
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/internals/construction.py in init_dict(data, index, columns, dtype)
255 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
256 ]
--> 257 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
258
259
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype)
80
81 # don't force copy because getting jammed in an ndarray anyway
---> 82 arrays = _homogenize(arrays, index, dtype)
83
84 # from BlockManager perspective
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/internals/construction.py in _homogenize(data, index, dtype)
321 val = lib.fast_multiget(val, oindex.values, default=np.nan)
322 val = sanitize_array(
--> 323 val, index, dtype=dtype, copy=False, raise_cast_failure=False
324 )
325
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/internals/construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
727 elif subarr.ndim > 1:
728 if isinstance(data, np.ndarray):
--> 729 raise Exception("Data must be 1-dimensional")
730 else:
731 subarr = com.asarray_tuplesafe(data, dtype=dtype)
Exception: Data must be 1-dimensional
Version info (pd.__version__ , np.__version__)
('0.25.3', '1.17.4')
('0.24.2', '1.16.2')
(I don't like to raise issues until I'm sure it's something worth considering).

I raised this issue on the numpy GitHub and was advised it is an issue with Pandas. However, I also checked with the latest version of Pandas (1.0.0) and it seems to have been fixed:
>>> index = pd.Index(np.array([[0], [1]]))
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-5-ce554b72776c> in <module>
----> 1 index = pd.Index(np.array([[0], [1]]))
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, tupleize_cols, **kwargs)
388 # maybe coerce to a sub-class
389 if is_signed_integer_dtype(data.dtype):
--> 390 return Int64Index(data, copy=copy, dtype=dtype, name=name)
391 elif is_unsigned_integer_dtype(data.dtype):
392 return UInt64Index(data, copy=copy, dtype=dtype, name=name)
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/indexes/numeric.py in __new__(cls, data, dtype, copy, name)
76 if subarr.ndim > 1:
77 # GH#13601, GH#20285, GH#27125
---> 78 raise ValueError("Index data must be 1-dimensional")
79
80 name = maybe_extract_name(name, data, cls)
ValueError: Index data must be 1-dimensional

Pandas, Postgres, and Daylight Savings Time (DST)

I am using read_sql_query from Pandas version 0.22.0 to pull time series data from a local PostgreSQL database. If I do not parse the date columns, then I get the following data frame:
dataid localminute use
0 1642 2012-05-11 19:00:00-05:00 0.827
1 1642 2012-05-11 19:01:00-05:00 0.830
2 1642 2012-05-11 19:02:00-05:00 0.833
3 1642 2012-05-11 19:03:00-05:00 0.835
4 1642 2012-05-11 19:04:00-05:00 0.837
The localminute column has dtype=object and contains a mix of objects such as
datetime.datetime(2012, 5, 11, 19, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None))
datetime.datetime(2012, 12, 9, 2, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=-360, name=None))
note that these objects have different tzinfo.
If I try to pass parse_dates=["localminute"] when using pd.read_sql_query I get the following error and traceback.
--------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
302 try:
--> 303 values, tz = tslib.datetime_to_datetime64(arg)
304 return DatetimeIndex._simple_new(values, name=name, tz=tz)
pandas/_libs/tslib.pyx in pandas._libs.tslib.datetime_to_datetime64()
ValueError: Array must be all same time zone
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-8-0d092e8e183b> in <module>()
6 end_time = pd.Timestamp("2015-06-01", tz="US/Central")
7
----> 8 usage_df = pecanpy.read_electricity_egauge_minutes_query(local_con, "public", "all", dataid, start_time, end_time)
c:\users\pughdr\research\pecanpy\pecanpy\api.py in read_electricity_egauge_minutes_query(con, schema, columns, dataid, start_time, end_time, tz)
32 query = template.format(**kwargs)
33 #parse_dates= {"localminute": {}}
---> 34 df = pd.read_sql_query(query, con=con, parse_dates=["localminute"])
35
36 # if the time period of interest contains observations only from within the
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in read_sql_query(sql, con, index_col, coerce_float, params, parse_dates, chunksize)
330 return pandas_sql.read_query(
331 sql, index_col=index_col, params=params, coerce_float=coerce_float,
--> 332 parse_dates=parse_dates, chunksize=chunksize)
333
334
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in read_query(self, sql, index_col, coerce_float, parse_dates, params, chunksize)
1102 frame = _wrap_result(data, columns, index_col=index_col,
1103 coerce_float=coerce_float,
-> 1104 parse_dates=parse_dates)
1105 return frame
1106
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in _wrap_result(data, columns, index_col, coerce_float, parse_dates)
157 coerce_float=coerce_float)
158
--> 159 _parse_date_columns(frame, parse_dates)
160
161 if index_col is not None:
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in _parse_date_columns(data_frame, parse_dates)
138 except TypeError:
139 fmt = None
--> 140 data_frame[col_name] = _handle_date_column(df_col, format=fmt)
141
142 # we want to coerce datetime64_tz dtypes for now
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in _handle_date_column(col, utc, format)
117 .astype('datetime64[ns, UTC]'))
118 else:
--> 119 return to_datetime(col, errors='coerce', format=format, utc=utc)
120
121
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\core\tools\datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin)
371 elif isinstance(arg, ABCSeries):
372 from pandas import Series
--> 373 values = _convert_listlike(arg._values, True, format)
374 result = Series(values, index=arg.index, name=arg.name)
375 elif isinstance(arg, (ABCDataFrame, MutableMapping)):
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
304 return DatetimeIndex._simple_new(values, name=name, tz=tz)
305 except (ValueError, TypeError):
--> 306 raise e
307
308 if arg is None:
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
292 dayfirst=dayfirst,
293 yearfirst=yearfirst,
--> 294 require_iso8601=require_iso8601
295 )
296
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime()
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime()
ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True
The ValueError is being thrown because the timestamps have different time zones due to Daylight Savings Time (DST). I don't understand the source of the second error.
How can I use read_sql_query to load a DataFrame with a localminute column that is time zone aware with a "US/Central" time zone?

Pandas FloatingPoint Error

I'm getting a floating point error on a simple time series in pandas. I'm trying to do shift operations... but this also happens with the window functions like rolling_mean.
EDIT: For some more info... I tried to actually build this from source yesterday prior to the error. I'm not sure if the error would've occurred prior to the build attempt, as I'd never messed around w/ these functions.
EDIT2: I thought I'd fixed this, but when I run this inside python it works, but when it's in ipython I get the error.
EDIT3: Numpy 1.7.0, iPython 0.13, pandas 0.7.3
In [35]: ts = Series(np.arange(12), index=DateRange('1/1/2000', periods=12, freq='T'))
In [36]: ts.shift(0)
Out[36]:
2000-01-03 0
2000-01-04 1
2000-01-05 2
2000-01-06 3
2000-01-07 4
2000-01-10 5
2000-01-11 6
2000-01-12 7
2000-01-13 8
2000-01-14 9
2000-01-17 10
2000-01-18 11
In [37]: ts.shift(1)
Out[37]: ---------------------------------------------------------------------------
FloatingPointError Traceback (most recent call last)
/Users/trenthauck/Repository/work/SQS/analysis/campaign/tv2/data/<ipython-input-37-2b7cec97d440> in <module>()
----> 1 ts.shift(1)
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/displayhook.pyc in __call__(self, result)
236 self.start_displayhook()
237 self.write_output_prompt()
--> 238 format_dict = self.compute_format_data(result)
239 self.write_format_data(format_dict)
240 self.update_user_ns(result)
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/displayhook.pyc in compute_format_data(self, result)
148 MIME type representation of the object.
149 """
--> 150 return self.shell.display_formatter.format(result)
151
152 def write_format_data(self, format_dict):
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/formatters.pyc in format(self, obj, include, exclude)
124 continue
125 try:
--> 126 data = formatter(obj)
127 except:
128 # FIXME: log the exception
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/formatters.pyc in __call__(self, obj)
445 type_pprinters=self.type_printers,
446 deferred_pprinters=self.deferred_printers)
--> 447 printer.pretty(obj)
448 printer.flush()
449 return stream.getvalue()
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/lib/pretty.pyc in pretty(self, obj)
353 if callable(obj_class._repr_pretty_):
354 return obj_class._repr_pretty_(obj, self, cycle)
--> 355 return _default_pprint(obj, self, cycle)
356 finally:
357 self.end_group()
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/lib/pretty.pyc in _default_pprint(obj, p, cycle)
473 if getattr(klass, '__repr__', None) not in _baseclass_reprs:
474 # A user-provided repr.
--> 475 p.text(repr(obj))
476 return
477 p.begin_group(1, '<')
/Library/Python/2.7/site-packages/pandas/core/series.pyc in __repr__(self)
696 result = self._get_repr(print_header=True,
697 length=len(self) > 50,
--> 698 name=True)
699 else:
700 result = '%s' % ndarray.__repr__(self)
/Library/Python/2.7/site-packages/pandas/core/series.pyc in _get_repr(self, name, print_header, length, na_rep, float_format)
756 length=length, na_rep=na_rep,
757 float_format=float_format)
--> 758 return formatter.to_string()
759
760 def __str__(self):
/Library/Python/2.7/site-packages/pandas/core/format.pyc in to_string(self)
99
100 fmt_index, have_header = self._get_formatted_index()
--> 101 fmt_values = self._get_formatted_values()
102
103 maxlen = max(len(x) for x in fmt_index)
/Library/Python/2.7/site-packages/pandas/core/format.pyc in _get_formatted_values(self)
90 return format_array(self.series.values, None,
91 float_format=self.float_format,
---> 92 na_rep=self.na_rep)
93
94 def to_string(self):
/Library/Python/2.7/site-packages/pandas/core/format.pyc in format_array(values, formatter, float_format, na_rep, digits, space, justify)
431 justify=justify)
432
--> 433 return fmt_obj.get_result()
434
435
/Library/Python/2.7/site-packages/pandas/core/format.pyc in get_result(self)
528
529 # this is pretty arbitrary for now
--> 530 has_large_values = (np.abs(self.values) > 1e8).any()
531
532 if too_long and has_large_values:
FloatingPointError: invalid value encountered in absolute
In [38]: ts.shift(-1)
Out[38]: ---------------------------------------------------------------------------
FloatingPointError Traceback (most recent call last)
/Users/myusername/Repository/work/SQS/analysis/campaign/tv2/data/<ipython-input-38-314ec815a7c5> in <module>()
----> 1 ts.shift(-1)
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/displayhook.pyc in __call__(self, result)
236 self.start_displayhook()
237 self.write_output_prompt()
--> 238 format_dict = self.compute_format_data(result)
239 self.write_format_data(format_dict)
240 self.update_user_ns(result)
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/displayhook.pyc in compute_format_data(self, result)
148 MIME type representation of the object.
149 """
--> 150 return self.shell.display_formatter.format(result)
151
152 def write_format_data(self, format_dict):
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/formatters.pyc in format(self, obj, include, exclude)
124 continue
125 try:
--> 126 data = formatter(obj)
127 except:
128 # FIXME: log the exception
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/formatters.pyc in __call__(self, obj)
445 type_pprinters=self.type_printers,
446 deferred_pprinters=self.deferred_printers)
--> 447 printer.pretty(obj)
448 printer.flush()
449 return stream.getvalue()
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/lib/pretty.pyc in pretty(self, obj)
353 if callable(obj_class._repr_pretty_):
354 return obj_class._repr_pretty_(obj, self, cycle)
--> 355 return _default_pprint(obj, self, cycle)
356 finally:
357 self.end_group()
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/lib/pretty.pyc in _default_pprint(obj, p, cycle)
473 if getattr(klass, '__repr__', None) not in _baseclass_reprs:
474 # A user-provided repr.
--> 475 p.text(repr(obj))
476 return
477 p.begin_group(1, '<')
/Library/Python/2.7/site-packages/pandas/core/series.pyc in __repr__(self)
696 result = self._get_repr(print_header=True,
697 length=len(self) > 50,
--> 698 name=True)
699 else:
700 result = '%s' % ndarray.__repr__(self)
/Library/Python/2.7/site-packages/pandas/core/series.pyc in _get_repr(self, name, print_header, length, na_rep, float_format)
756 length=length, na_rep=na_rep,
757 float_format=float_format)
--> 758 return formatter.to_string()
759
760 def __str__(self):
/Library/Python/2.7/site-packages/pandas/core/format.pyc in to_string(self)
99
100 fmt_index, have_header = self._get_formatted_index()
--> 101 fmt_values = self._get_formatted_values()
102
103 maxlen = max(len(x) for x in fmt_index)
/Library/Python/2.7/site-packages/pandas/core/format.pyc in _get_formatted_values(self)
90 return format_array(self.series.values, None,
91 float_format=self.float_format,
---> 92 na_rep=self.na_rep)
93
94 def to_string(self):
/Library/Python/2.7/site-packages/pandas/core/format.pyc in format_array(values, formatter, float_format, na_rep, digits, space, justify)
431 justify=justify)
432
--> 433 return fmt_obj.get_result()
434
435
/Library/Python/2.7/site-packages/pandas/core/format.pyc in get_result(self)
528
529 # this is pretty arbitrary for now
--> 530 has_large_values = (np.abs(self.values) > 1e8).any()
531
532 if too_long and has_large_values:
FloatingPointError: invalid value encountered in absolute

I would add this as a comment, but I don't have the privilege to do that yet :)
It works for me in Python and IPython 0.12; IPython 0.13 is still in development (see http://ipython.org/), and, since the errors you're getting seem to involve formatting in the IPython 0.13 egg, I suspect that might be the cause. Try with IPython 0.12 instead; if it works, file a bug report with IPython and then probably stick with 0.12 until 0.13 is (more) stable.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

pandas: FloatingPointError with np.seterr(all='raise') and missing data - python

Related

Why pd.read_csv get wrong value when using dtype = 'Int64'?

Pandas pivot_table Assertion error: `result` has not been initialized

TypeError: unsupported format string when printing Pandas dataframe with incorrectly shaped index

Pandas, Postgres, and Daylight Savings Time (DST)

Pandas FloatingPoint Error

Categories

Resources