I'm getting a FloatingPointError when I try to display data that contains missing values.
import numpy as np
import pandas as pd
np.seterr(all='raise')
s = pd.Series([np.nan,np.nan,np.nan],index=[1,2,3]); print(s); print(s.head())
I'm on the newest version of pandas, installed via
conda install -f pandas
after pkill python and conda remove pandas.
Here's the trace back:
Out[4]: ---------------------------------------------------------------------------
FloatingPointError Traceback (most recent call last)
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/IPython/core/formatters.pyc in __call__(self, obj)
695 type_pprinters=self.type_printers,
696 deferred_pprinters=self.deferred_printers)
--> 697 printer.pretty(obj)
698 printer.flush()
699 return stream.getvalue()
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/IPython/lib/pretty.pyc in pretty(self, obj)
381 if callable(meth):
382 return meth(obj, self, cycle)
--> 383 return _default_pprint(obj, self, cycle)
384 finally:
385 self.end_group()
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/IPython/lib/pretty.pyc in _default_pprint(obj, p, cycle)
501 if _safe_getattr(klass, '__repr__', None) not in _baseclass_reprs:
502 # A user-provided repr. Find newlines and replace them with p.break_()
--> 503 _repr_pprint(obj, p, cycle)
504 return
505 p.begin_group(1, '<')
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/IPython/lib/pretty.pyc in _repr_pprint(obj, p, cycle)
683 """A pprint that just redirects to the normal repr function."""
684 # Find newlines and replace them with p.break_()
--> 685 output = repr(obj)
686 for idx,output_line in enumerate(output.splitlines()):
687 if idx:
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/base.pyc in __repr__(self)
61 Yields Bytestring in Py2, Unicode String in py3.
62 """
---> 63 return str(self)
64
65
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/base.pyc in __str__(self)
41 if compat.PY3:
42 return self.__unicode__()
---> 43 return self.__bytes__()
44
45 def __bytes__(self):
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/base.pyc in __bytes__(self)
53
54 encoding = get_option("display.encoding")
---> 55 return self.__unicode__().encode(encoding, 'replace')
56
57 def __repr__(self):
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/series.pyc in __unicode__(self)
954
955 self.to_string(buf=buf, name=self.name, dtype=self.dtype,
--> 956 max_rows=max_rows)
957 result = buf.getvalue()
958
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/series.pyc in to_string(self, buf, na_rep, float_format, header, length, dtype, name, max_rows)
992 the_repr = self._get_repr(float_format=float_format, na_rep=na_rep,
993 header=header, length=length, dtype=dtype,
--> 994 name=name, max_rows=max_rows)
995
996 # catch contract violations
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/series.pyc in _get_repr(self, name, header, length, dtype, na_rep, float_format, max_rows)
1022 float_format=float_format,
1023 max_rows=max_rows)
-> 1024 result = formatter.to_string()
1025
1026 # TODO: following check prob. not neces.
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/format.pyc in to_string(self)
223
224 fmt_index, have_header = self._get_formatted_index()
--> 225 fmt_values = self._get_formatted_values()
226
227 maxlen = max(self.adj.len(x) for x in fmt_index) # max index len
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/format.pyc in _get_formatted_values(self)
213 return format_array(self.tr_series._values, None,
214 float_format=self.float_format,
--> 215 na_rep=self.na_rep)
216
217 def to_string(self):
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/format.pyc in format_array(values, formatter, float_format, na_rep, digits, space, justify)
1974 justify=justify)
1975
-> 1976 return fmt_obj.get_result()
1977
1978
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/format.pyc in get_result(self)
1990
1991 def get_result(self):
-> 1992 fmt_values = self._format_strings()
1993 return _make_fixed_width(fmt_values, self.justify)
1994
/home/xxx/.conda/envs/myenv2/lib/python2.7/site-packages/pandas/core/format.pyc in _format_strings(self)
2085
2086 # this is pretty arbitrary for now
-> 2087 has_large_values = (abs_vals > 1e8).any()
2088 has_small_values = ((abs_vals < 10 ** (-self.digits)) &
2089 (abs_vals > 0)).any()
FloatingPointError: invalid value encountered in greater
Whenever you import pandas, all NumPy floating-point error settings are set to 'ignore'. This is currently undocumented behavior.
This is done in pandas/compat/numpy_compat.py
# TODO: HACK for NumPy 1.5.1 to suppress warnings
# is this necessary?
try:
np.seterr(all='ignore')
except Exception: # pragma: no cover
pass
Here's how that plays out
In [1]: import numpy as np
In [2]: np.geterr()
Out[2]: {'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}
In [3]: import pandas as pd
In [4]: np.geterr()
Out[4]: {'divide': 'ignore', 'invalid': 'ignore', 'over': 'ignore', 'under': 'ignore'}
In [5]: s = pd.Series([np.nan,np.nan,np.nan],index=[1,2,3]); print(s); print(s.head())
1 NaN
2 NaN
3 NaN
dtype: float64
1 NaN
2 NaN
3 NaN
dtype: float64
In [6]: np.seterr(invalid='raise')
Out[6]: {'divide': 'ignore', 'invalid': 'ignore', 'over': 'ignore', 'under': 'ignore'}
In [7]: s = pd.Series([np.nan,np.nan,np.nan],index=[1,2,3]); print(s); print(s.head())
FloatingPointError: invalid value encountered in greater
The "solution" is hence to not call np.seterr(invalid='raise') whenever you use pandas (especially when working with missing data).
Related
import pandas as pd
When there are no NA values, the result is correct.
!cat id1
1471341653427101696 1458379213265436885
pd.read_csv('id1',sep ='\t',header=None, na_values=['\\N'],dtype = 'Int64')
0
1
1471341653427101696
1458379213265436885
!cat id2
1471870967209926656 \N
1471341653427101696 1458379213265436885
1471458498691866624 1458379213265436889
When using dtype 'Int64' and there are NA values, pandas returns wrong numbers (the trailing digits of the large integers are lost):
df = pd.read_csv('id2',sep ='\t',header=None, na_values=['\\N'],dtype = 'Int64')
df
0
1
1471870967209926656
<NA>
1471341653427101696
1458379213265436672
1471458498691866624
1458379213265436672
When the data is read as str, the values are correct:
df = pd.read_csv('id2',sep ='\t',header=None, na_values=['\\N'],dtype = 'str')
df
0
1
1471870967209926656
NaN
1471341653427101696
1458379213265436885
1471458498691866624
1458379213265436889
df[1]
0 NaN
1 1458379213265436885
2 1458379213265436889
Name: 1, dtype: object
df.loc[[0],1].astype('Int64')
0 <NA>
Name: 1, dtype: Int64
df.loc[[1],1].astype('Int64')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_1971828/2578832362.py in <module>
----> 1 df.loc[[1],1].astype('Int64')
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
5813 else:
5814 # else, only a single dtype is given
-> 5815 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
5816 return self._constructor(new_data).__finalize__(self, method="astype")
5817
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
416
417 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
--> 418 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
419
420 def convert(
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
325 applied = b.apply(f, **kwargs)
326 else:
--> 327 applied = getattr(b, f)(**kwargs)
328 except (TypeError, NotImplementedError):
329 if not ignore_failures:
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
589 values = self.values
590
--> 591 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
592
593 new_values = maybe_coerce_values(new_values)
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_array_safe(values, dtype, copy, errors)
1307
1308 try:
-> 1309 new_values = astype_array(values, dtype, copy=copy)
1310 except (ValueError, TypeError):
1311 # e.g. astype_nansafe can fail on object-dtype of strings
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_array(values, dtype, copy)
1255
1256 else:
-> 1257 values = astype_nansafe(values, dtype, copy=copy)
1258
1259 # in pandas we don't store numpy str dtypes, so convert to object
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
1103 # dispatch on extension dtype if needed
1104 if isinstance(dtype, ExtensionDtype):
-> 1105 return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)
1106
1107 elif not isinstance(dtype, np.dtype): # pragma: no cover
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/arrays/integer.py in _from_sequence(cls, scalars, dtype, copy)
321 cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
322 ) -> IntegerArray:
--> 323 values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy)
324 return IntegerArray(values, mask)
325
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/arrays/integer.py in coerce_to_array(values, dtype, mask, copy)
196 "mixed-integer-float",
197 ]:
--> 198 raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")
199
200 elif is_bool_dtype(values) and is_integer_dtype(dtype):
TypeError: object cannot be converted to an IntegerDtype
This is an old issue for pandas: github.com/pandas-dev/pandas/issues/30268.
So the only workaround is to read the column as str, remove the NA values, and then convert to int.
df:
avg
count
date
val
prop
unit
distance
d-atmp
d-clouds
d-dewpoint
0.0786107
12
2014-10-03 00:00:00
22
atmp
(Deg C)
24829.6
24829.6
nan
nan
0.0786107
12
2014-10-03 00:00:00
0
clouds
(oktas)
22000.6
nan
22000.6
nan
0.0786107
12
2014-10-03 00:00:00
32
dewpoint
(Deg C)
21344.1
nan
nan
21344.1
0.0684246
6
2014-10-04 00:00:00
21.5
atmp
(Deg C)
26345.1
26345.1
nan
nan
cols = ['avg', 'date', 'count', 'd-atmp', 'd-cloud', 'd-dewpoint']
d = pd.pivot_table(x, index=cols, columns=['prop', 'unit'], values='val', aggfunc=max)
Ideal result:
date
countObs
avg
d-atmp
atmp (Deg C)
d-clouds
clouds (oktas)
d-dewpoint
dewpoint (Deg C)
2014-10-03 00:00:00
12
0.0786107
24829.6
22
22000.6
0
21344.1
32
2014-10-04 00:00:00
6
0.0684246
26345.1
21.5
nan
nan
nan
nan
Error
--------------------------------------------------------------------------- NotImplementedError Traceback (most recent call last) ~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in array_func(values) 1067 try:
-> 1068 result = self.grouper._cython_operation( 1069 "aggregate", values, how, axis=data.ndim - 1, min_count=min_count
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_cython_operation(self, kind, values, how, axis, min_count, **kwargs)
998 ngroups = self.ngroups
--> 999 return cy_op.cython_operation( 1000 values=values,
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in cython_operation(self, values, axis, min_count, comp_ids, ngroups,
**kwargs)
659
--> 660 return self._cython_op_ndim_compat(
661 values,
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_cython_op_ndim_compat(self, values, min_count, ngroups, comp_ids, mask, **kwargs)
515
--> 516 return self._call_cython_op(
517 values,
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_call_cython_op(self, values, min_count, ngroups, comp_ids, mask, **kwargs)
561 out_shape = self._get_output_shape(ngroups, values)
--> 562 func, values = self.get_cython_func_and_vals(values, is_numeric)
563 out_dtype = self.get_out_dtype(values.dtype)
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in get_cython_func_and_vals(self, values, is_numeric)
204
--> 205 func = self._get_cython_function(kind, how, values.dtype, is_numeric)
206
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_get_cython_function(cls, kind, how, dtype, is_numeric)
169 # raise NotImplementedError here rather than TypeError later
--> 170 raise NotImplementedError(
171 f"function is not implemented for this dtype: "
NotImplementedError: function is not implemented for this dtype: [how->mean,dtype->object]
During handling of the above exception, another exception occurred:
AssertionError Traceback (most recent call last) <ipython-input-119-b64b487d2810> in <module>
5 # o
6 # cols += []
----> 7 d = pd.pivot_table(x, index=cols, columns=['osmcObsProperty', 'unit'], values='val') #, aggfunc=max #np.mean or max appear similar , dropna=False
8
9 d.reset_index(inplace=True)
~/.local/lib/python3.9/site-packages/pandas/core/reshape/pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)
93 return table.__finalize__(data, method="pivot_table")
94
---> 95 table = __internal_pivot_table(
96 data,
97 values,
~/.local/lib/python3.9/site-packages/pandas/core/reshape/pivot.py in
__internal_pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)
163
164 grouped = data.groupby(keys, observed=observed, sort=sort)
--> 165 agged = grouped.agg(aggfunc)
166 if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
167 agged = agged.dropna(how="all")
~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
977
978 op = GroupByApply(self, func, args, kwargs)
--> 979 result = op.agg()
980 if not is_dict_like(func) and result is not None:
981 return result
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in agg(self)
156
157 if isinstance(arg, str):
--> 158 return self.apply_str()
159
160 if is_dict_like(arg):
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in apply_str(self)
505 elif self.axis != 0:
506 raise ValueError(f"Operation {f} does not support axis=1")
--> 507 return self._try_aggregate_string_function(obj, f, *self.args, **self.kwargs)
508
509 def apply_multiple(self) -> FrameOrSeriesUnion:
~/.local/lib/python3.9/site-packages/pandas/core/apply.py in
_try_aggregate_string_function(self, obj, arg, *args, **kwargs)
575 if f is not None:
576 if callable(f):
--> 577 return f(*args, **kwargs)
578
579 # people may try to aggregate on a non-callable attribute
~/.local/lib/python3.9/site-packages/pandas/core/groupby/groupby.py in mean(self, numeric_only) 1685 numeric_only = self._resolve_numeric_only(numeric_only) 1686
-> 1687 result = self._cython_agg_general( 1688 "mean", 1689 alt=lambda x: Series(x).mean(numeric_only=numeric_only),
~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in
_cython_agg_general(self, how, alt, numeric_only, min_count) 1080 # TypeError -> we may have an exception in trying to aggregate 1081 # continue and exclude the block
-> 1082 new_mgr = data.grouped_reduce(array_func, ignore_failures=True) 1083 1084 if len(new_mgr) < len(data):
~/.local/lib/python3.9/site-packages/pandas/core/internals/managers.py in grouped_reduce(self, func, ignore_failures) 1233 for sb in blk._split(): 1234 try:
-> 1235 applied = sb.apply(func) 1236 except (TypeError, NotImplementedError): 1237 if not ignore_failures:
~/.local/lib/python3.9/site-packages/pandas/core/internals/blocks.py in apply(self, func, **kwargs)
379 """
380 with np.errstate(all="ignore"):
--> 381 result = func(self.values, **kwargs)
382
383 return self._split_op_result(result)
~/.local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in array_func(values) 1074 # try to python agg 1075
# TODO: shouldn't min_count matter?
-> 1076 result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt) 1077 1078 return result
~/.local/lib/python3.9/site-packages/pandas/core/groupby/groupby.py in
_agg_py_fallback(self, values, ndim, alt) 1396 # should always be preserved by the implemented aggregations 1397 # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
-> 1398 res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True) 1399 1400 if isinstance(values, Categorical):
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in agg_series(self, obj, func, preserve_dtype) 1047 1048 else:
-> 1049 result = self._aggregate_series_fast(obj, func) 1050 1051 npvalues = lib.maybe_convert_objects(result, try_float=False)
~/.local/lib/python3.9/site-packages/pandas/core/groupby/ops.py in
_aggregate_series_fast(self, obj, func) 1072 ids = ids.take(indexer) 1073 sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups)
-> 1074 result, _ = sgrouper.get_result() 1075 return result 1076
~/.local/lib/python3.9/site-packages/pandas/_libs/reduction.pyx in pandas._libs.reduction.SeriesGrouper.get_result()
AssertionError: `result` has not been initialized.
If I understand correctly, you can use groupby + agg:
out = df.groupby('date', as_index=False).agg(max)
Output:
date
avg
count
val
prop
unit
distance
d-atmp
d-clouds
d-dewpoint
2014-10-03 00:00:00
0.0786107
12
32
dewpoint
(oktas)
24829.6
24829.6
22000.6
21344.1
2014-10-04 00:00:00
0.0684246
6
21.5
atmp
(Deg C)
26345.1
26345.1
nan
nan
You could pivot; then use groupby + max:
cols = ['avg', 'date', 'count', 'd-atmp', 'd-clouds', 'd-dewpoint']
tmp = df.pivot(index=cols, columns=['prop', 'unit'], values='val')
tmp.columns = tmp.columns.map(' '.join)
out = tmp.reset_index().groupby('date', as_index=False).max()\
[['date', 'count', 'avg', 'd-atmp', 'atmp (Deg C)', 'd-clouds',
'clouds (oktas)', 'd-dewpoint', 'dewpoint (Deg C)']]
Output:
date count avg d-atmp atmp (Deg C) d-clouds clouds (oktas) d-dewpoint dewpoint (Deg C)
0 2014-10-03 00:00:00 12 0.078611 24829.6 22.0 22000.6 0.0 21344.1 32.0
1 2014-10-04 00:00:00 6 0.068425 26345.1 21.5 NaN NaN NaN NaN
Found this not-so-helpful traceback when incorrectly constructing a data frame with an index. My question is, is this a bug that I should report to Pandas as an issue or feature request or am I doing something wrong?
What I wanted to do:
index = pd.Index(np.array([0, 1]))
df = pd.DataFrame({'A': [0, 1], 'B': [1.1, 1.2]},
index=index)
print(df)
A B
0 0 1.1
1 1 1.2
(works, no problem)
What I actually did (note dimension of index array data):
index = pd.Index(np.array([[0], [1]]))
df = pd.DataFrame({'A': [0, 1], 'B': [1.1, 1.2]},
index=index)
print(df)
Traceback message (very long):
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-95-af090c2ae470> in <module>
2 df = pd.DataFrame({'A': [0, 1], 'B': [1.1, 1.2]},
3 index=index)
----> 4 print(df)
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/frame.py in __repr__(self)
653 max_cols=max_cols,
654 line_width=width,
--> 655 show_dimensions=show_dimensions,
656 )
657
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/frame.py in to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, min_rows, max_cols, show_dimensions, decimal, line_width)
774 line_width=line_width,
775 )
--> 776 formatter.to_string()
777
778 if buf is None:
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in to_string(self)
686 else:
687
--> 688 strcols = self._to_str_columns()
689 if self.line_width is None: # no need to wrap around just print
690 # the whole frame
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in _to_str_columns(self)
586 # may include levels names also
587
--> 588 str_index = self._get_formatted_index(frame)
589
590 if not is_list_like(self.header) and not self.header:
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in _get_formatted_index(self, frame)
919 )
920 else:
--> 921 fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)]
922
923 fmt_index = [
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/indexes/base.py in format(self, name, formatter, **kwargs)
1106 return header + list(self.map(formatter))
1107
-> 1108 return self._format_with_header(header, **kwargs)
1109
1110 def _format_with_header(self, header, na_rep="NaN", **kwargs):
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/indexes/base.py in _format_with_header(self, header, na_rep, **kwargs)
1130
1131 else:
-> 1132 result = _trim_front(format_array(values, None, justify="left"))
1133 return header + result
1134
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space)
1031 )
1032
-> 1033 return fmt_obj.get_result()
1034
1035
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in get_result(self)
1062
1063 def get_result(self):
-> 1064 fmt_values = self._format_strings()
1065 return _make_fixed_width(fmt_values, self.justify)
1066
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in _format_strings(self)
1293 def _format_strings(self):
1294 formatter = self.formatter or (lambda x: "{x: d}".format(x=x))
-> 1295 fmt_values = [formatter(x) for x in self.values]
1296 return fmt_values
1297
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in <listcomp>(.0)
1293 def _format_strings(self):
1294 formatter = self.formatter or (lambda x: "{x: d}".format(x=x))
-> 1295 fmt_values = [formatter(x) for x in self.values]
1296 return fmt_values
1297
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in <lambda>(x)
1292 class IntArrayFormatter(GenericArrayFormatter):
1293 def _format_strings(self):
-> 1294 formatter = self.formatter or (lambda x: "{x: d}".format(x=x))
1295 fmt_values = [formatter(x) for x in self.values]
1296 return fmt_values
TypeError: unsupported format string passed to numpy.ndarray.__format__
Note that the dataframe was constructed (it just can't be printed):
In [14]: df.shape
Out[14]: (2, 2)
In [15]: df.index
Out[15]: Int64Index([[0], [1]], dtype='int64')
In [16]: df.values
Out[16]:
array([[0. , 1.1],
[1. , 1.2]])
In [18]: df.columns
Out[18]: Index(['A', 'B'], dtype='object')
In [19]: df.index[0]
Out[19]: array([0])
In [20]: df.index.dtype
Out[20]: dtype('int64')
Also note that if you make the same mistake with the data arguments...
index = pd.Index(np.array([0, 1]))
df = pd.DataFrame({'A': np.array([[0], [1]]), 'B': np.array([[1.1], [1.2]])},
index=index)
print(df)
...you get a nice, informative error message:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-96-218c77e99705> in <module>
1 index = pd.Index(np.array([0, 1]))
2 df = pd.DataFrame({'A': np.array([[0], [1]]), 'B': np.array([[1.1], [1.2]])},
----> 3 index=index)
4 print(df)
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
409 )
410 elif isinstance(data, dict):
--> 411 mgr = init_dict(data, index, columns, dtype=dtype)
412 elif isinstance(data, ma.MaskedArray):
413 import numpy.ma.mrecords as mrecords
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/internals/construction.py in init_dict(data, index, columns, dtype)
255 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
256 ]
--> 257 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
258
259
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype)
80
81 # don't force copy because getting jammed in an ndarray anyway
---> 82 arrays = _homogenize(arrays, index, dtype)
83
84 # from BlockManager perspective
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/internals/construction.py in _homogenize(data, index, dtype)
321 val = lib.fast_multiget(val, oindex.values, default=np.nan)
322 val = sanitize_array(
--> 323 val, index, dtype=dtype, copy=False, raise_cast_failure=False
324 )
325
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/internals/construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
727 elif subarr.ndim > 1:
728 if isinstance(data, np.ndarray):
--> 729 raise Exception("Data must be 1-dimensional")
730 else:
731 subarr = com.asarray_tuplesafe(data, dtype=dtype)
Exception: Data must be 1-dimensional
Version info (pd.__version__ , np.__version__)
('0.25.3', '1.17.4')
('0.24.2', '1.16.2')
(I don't like to raise issues until I'm sure it's something worth considering).
I raised this issue on the numpy GitHub and was advised it is an issue with Pandas. However, I also checked with the latest version of Pandas (1.0.0) and it seems to have been fixed:
>>> index = pd.Index(np.array([[0], [1]]))
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-5-ce554b72776c> in <module>
----> 1 index = pd.Index(np.array([[0], [1]]))
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, tupleize_cols, **kwargs)
388 # maybe coerce to a sub-class
389 if is_signed_integer_dtype(data.dtype):
--> 390 return Int64Index(data, copy=copy, dtype=dtype, name=name)
391 elif is_unsigned_integer_dtype(data.dtype):
392 return UInt64Index(data, copy=copy, dtype=dtype, name=name)
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/indexes/numeric.py in __new__(cls, data, dtype, copy, name)
76 if subarr.ndim > 1:
77 # GH#13601, GH#20285, GH#27125
---> 78 raise ValueError("Index data must be 1-dimensional")
79
80 name = maybe_extract_name(name, data, cls)
ValueError: Index data must be 1-dimensional
I am using read_sql_query from pandas version 0.22.0 to pull time series data from a local PostgreSQL database. If I do not parse the date columns, then I get the following data frame:
dataid localminute use
0 1642 2012-05-11 19:00:00-05:00 0.827
1 1642 2012-05-11 19:01:00-05:00 0.830
2 1642 2012-05-11 19:02:00-05:00 0.833
3 1642 2012-05-11 19:03:00-05:00 0.835
4 1642 2012-05-11 19:04:00-05:00 0.837
The localminute column has dtype=object and contains a mix of objects such as
datetime.datetime(2012, 5, 11, 19, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None))
datetime.datetime(2012, 12, 9, 2, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=-360, name=None))
note that these objects have different tzinfo.
If I try to pass parse_dates=["localminute"] when using pd.read_sql_query I get the following error and traceback.
--------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
302 try:
--> 303 values, tz = tslib.datetime_to_datetime64(arg)
304 return DatetimeIndex._simple_new(values, name=name, tz=tz)
pandas/_libs/tslib.pyx in pandas._libs.tslib.datetime_to_datetime64()
ValueError: Array must be all same time zone
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-8-0d092e8e183b> in <module>()
6 end_time = pd.Timestamp("2015-06-01", tz="US/Central")
7
----> 8 usage_df = pecanpy.read_electricity_egauge_minutes_query(local_con, "public", "all", dataid, start_time, end_time)
c:\users\pughdr\research\pecanpy\pecanpy\api.py in read_electricity_egauge_minutes_query(con, schema, columns, dataid, start_time, end_time, tz)
32 query = template.format(**kwargs)
33 #parse_dates= {"localminute": {}}
---> 34 df = pd.read_sql_query(query, con=con, parse_dates=["localminute"])
35
36 # if the time period of interest contains observations only from within the
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in read_sql_query(sql, con, index_col, coerce_float, params, parse_dates, chunksize)
330 return pandas_sql.read_query(
331 sql, index_col=index_col, params=params, coerce_float=coerce_float,
--> 332 parse_dates=parse_dates, chunksize=chunksize)
333
334
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in read_query(self, sql, index_col, coerce_float, parse_dates, params, chunksize)
1102 frame = _wrap_result(data, columns, index_col=index_col,
1103 coerce_float=coerce_float,
-> 1104 parse_dates=parse_dates)
1105 return frame
1106
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in _wrap_result(data, columns, index_col, coerce_float, parse_dates)
157 coerce_float=coerce_float)
158
--> 159 _parse_date_columns(frame, parse_dates)
160
161 if index_col is not None:
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in _parse_date_columns(data_frame, parse_dates)
138 except TypeError:
139 fmt = None
--> 140 data_frame[col_name] = _handle_date_column(df_col, format=fmt)
141
142 # we want to coerce datetime64_tz dtypes for now
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\io\sql.py in _handle_date_column(col, utc, format)
117 .astype('datetime64[ns, UTC]'))
118 else:
--> 119 return to_datetime(col, errors='coerce', format=format, utc=utc)
120
121
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\core\tools\datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin)
371 elif isinstance(arg, ABCSeries):
372 from pandas import Series
--> 373 values = _convert_listlike(arg._values, True, format)
374 result = Series(values, index=arg.index, name=arg.name)
375 elif isinstance(arg, (ABCDataFrame, MutableMapping)):
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
304 return DatetimeIndex._simple_new(values, name=name, tz=tz)
305 except (ValueError, TypeError):
--> 306 raise e
307
308 if arg is None:
~\AppData\Local\Continuum\Anaconda3\envs\pecanpy-dev\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike(arg, box, format, name, tz)
292 dayfirst=dayfirst,
293 yearfirst=yearfirst,
--> 294 require_iso8601=require_iso8601
295 )
296
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime()
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime()
ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True
The first ValueError is being thrown because the timestamps have different time zone offsets due to Daylight Saving Time (DST). I don't understand the source of the second error.
How can I use read_sql_query to load a DataFrame with a localminute column that is time zone aware with a "US/Central" time zone?
I'm getting a floating point error on a simple time series in pandas. I'm trying to do shift operations... but this also happens with the window functions like rolling_mean.
EDIT: For some more info... I tried to actually build this from source yesterday prior to the error. I'm not sure if the error would've occurred prior to the build attempt, as I'd never messed around w/ these functions.
EDIT2: I thought I'd fixed this: when I run this inside plain python it works, but when I run it in ipython I get the error.
EDIT3: Numpy 1.7.0, iPython 0.13, pandas 0.7.3
In [35]: ts = Series(np.arange(12), index=DateRange('1/1/2000', periods=12, freq='T'))
In [36]: ts.shift(0)
Out[36]:
2000-01-03 0
2000-01-04 1
2000-01-05 2
2000-01-06 3
2000-01-07 4
2000-01-10 5
2000-01-11 6
2000-01-12 7
2000-01-13 8
2000-01-14 9
2000-01-17 10
2000-01-18 11
In [37]: ts.shift(1)
Out[37]: ---------------------------------------------------------------------------
FloatingPointError Traceback (most recent call last)
/Users/trenthauck/Repository/work/SQS/analysis/campaign/tv2/data/<ipython-input-37-2b7cec97d440> in <module>()
----> 1 ts.shift(1)
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/displayhook.pyc in __call__(self, result)
236 self.start_displayhook()
237 self.write_output_prompt()
--> 238 format_dict = self.compute_format_data(result)
239 self.write_format_data(format_dict)
240 self.update_user_ns(result)
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/displayhook.pyc in compute_format_data(self, result)
148 MIME type representation of the object.
149 """
--> 150 return self.shell.display_formatter.format(result)
151
152 def write_format_data(self, format_dict):
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/formatters.pyc in format(self, obj, include, exclude)
124 continue
125 try:
--> 126 data = formatter(obj)
127 except:
128 # FIXME: log the exception
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/formatters.pyc in __call__(self, obj)
445 type_pprinters=self.type_printers,
446 deferred_pprinters=self.deferred_printers)
--> 447 printer.pretty(obj)
448 printer.flush()
449 return stream.getvalue()
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/lib/pretty.pyc in pretty(self, obj)
353 if callable(obj_class._repr_pretty_):
354 return obj_class._repr_pretty_(obj, self, cycle)
--> 355 return _default_pprint(obj, self, cycle)
356 finally:
357 self.end_group()
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/lib/pretty.pyc in _default_pprint(obj, p, cycle)
473 if getattr(klass, '__repr__', None) not in _baseclass_reprs:
474 # A user-provided repr.
--> 475 p.text(repr(obj))
476 return
477 p.begin_group(1, '<')
/Library/Python/2.7/site-packages/pandas/core/series.pyc in __repr__(self)
696 result = self._get_repr(print_header=True,
697 length=len(self) > 50,
--> 698 name=True)
699 else:
700 result = '%s' % ndarray.__repr__(self)
/Library/Python/2.7/site-packages/pandas/core/series.pyc in _get_repr(self, name, print_header, length, na_rep, float_format)
756 length=length, na_rep=na_rep,
757 float_format=float_format)
--> 758 return formatter.to_string()
759
760 def __str__(self):
/Library/Python/2.7/site-packages/pandas/core/format.pyc in to_string(self)
99
100 fmt_index, have_header = self._get_formatted_index()
--> 101 fmt_values = self._get_formatted_values()
102
103 maxlen = max(len(x) for x in fmt_index)
/Library/Python/2.7/site-packages/pandas/core/format.pyc in _get_formatted_values(self)
90 return format_array(self.series.values, None,
91 float_format=self.float_format,
---> 92 na_rep=self.na_rep)
93
94 def to_string(self):
/Library/Python/2.7/site-packages/pandas/core/format.pyc in format_array(values, formatter, float_format, na_rep, digits, space, justify)
431 justify=justify)
432
--> 433 return fmt_obj.get_result()
434
435
/Library/Python/2.7/site-packages/pandas/core/format.pyc in get_result(self)
528
529 # this is pretty arbitrary for now
--> 530 has_large_values = (np.abs(self.values) > 1e8).any()
531
532 if too_long and has_large_values:
FloatingPointError: invalid value encountered in absolute
In [38]: ts.shift(-1)
Out[38]: ---------------------------------------------------------------------------
FloatingPointError Traceback (most recent call last)
/Users/myusername/Repository/work/SQS/analysis/campaign/tv2/data/<ipython-input-38-314ec815a7c5> in <module>()
----> 1 ts.shift(-1)
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/displayhook.pyc in __call__(self, result)
236 self.start_displayhook()
237 self.write_output_prompt()
--> 238 format_dict = self.compute_format_data(result)
239 self.write_format_data(format_dict)
240 self.update_user_ns(result)
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/displayhook.pyc in compute_format_data(self, result)
148 MIME type representation of the object.
149 """
--> 150 return self.shell.display_formatter.format(result)
151
152 def write_format_data(self, format_dict):
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/formatters.pyc in format(self, obj, include, exclude)
124 continue
125 try:
--> 126 data = formatter(obj)
127 except:
128 # FIXME: log the exception
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/core/formatters.pyc in __call__(self, obj)
445 type_pprinters=self.type_printers,
446 deferred_pprinters=self.deferred_printers)
--> 447 printer.pretty(obj)
448 printer.flush()
449 return stream.getvalue()
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/lib/pretty.pyc in pretty(self, obj)
353 if callable(obj_class._repr_pretty_):
354 return obj_class._repr_pretty_(obj, self, cycle)
--> 355 return _default_pprint(obj, self, cycle)
356 finally:
357 self.end_group()
/Library/Python/2.7/site-packages/ipython-0.13.dev-py2.7.egg/IPython/lib/pretty.pyc in _default_pprint(obj, p, cycle)
473 if getattr(klass, '__repr__', None) not in _baseclass_reprs:
474 # A user-provided repr.
--> 475 p.text(repr(obj))
476 return
477 p.begin_group(1, '<')
/Library/Python/2.7/site-packages/pandas/core/series.pyc in __repr__(self)
696 result = self._get_repr(print_header=True,
697 length=len(self) > 50,
--> 698 name=True)
699 else:
700 result = '%s' % ndarray.__repr__(self)
/Library/Python/2.7/site-packages/pandas/core/series.pyc in _get_repr(self, name, print_header, length, na_rep, float_format)
756 length=length, na_rep=na_rep,
757 float_format=float_format)
--> 758 return formatter.to_string()
759
760 def __str__(self):
/Library/Python/2.7/site-packages/pandas/core/format.pyc in to_string(self)
99
100 fmt_index, have_header = self._get_formatted_index()
--> 101 fmt_values = self._get_formatted_values()
102
103 maxlen = max(len(x) for x in fmt_index)
/Library/Python/2.7/site-packages/pandas/core/format.pyc in _get_formatted_values(self)
90 return format_array(self.series.values, None,
91 float_format=self.float_format,
---> 92 na_rep=self.na_rep)
93
94 def to_string(self):
/Library/Python/2.7/site-packages/pandas/core/format.pyc in format_array(values, formatter, float_format, na_rep, digits, space, justify)
431 justify=justify)
432
--> 433 return fmt_obj.get_result()
434
435
/Library/Python/2.7/site-packages/pandas/core/format.pyc in get_result(self)
528
529 # this is pretty arbitrary for now
--> 530 has_large_values = (np.abs(self.values) > 1e8).any()
531
532 if too_long and has_large_values:
FloatingPointError: invalid value encountered in absolute
I would add this as a comment, but I don't have the privilege to do that yet :)
It works for me in Python and IPython 0.12. IPython 0.13 is still in development (see http://ipython.org/ ), and since the errors you're getting seem to involve formatting in the IPython 0.13 egg, I suspect that might be the cause. Try IPython 0.12 instead; if it works, file a bug report with IPython and then probably stick with 0.12 until 0.13 is (more) stable.